Add Scraper object

This commit is contained in:
Alexander Sheiko 2020-12-12 23:33:57 +02:00
parent edad8f6393
commit 6bf65cd482
9 changed files with 158 additions and 80 deletions

View file

@ -26,7 +26,8 @@ import (
)
func main() {
for tweet := range twitterscraper.GetTweets(context.Background(), "Twitter", 50) {
scraper := twitterscraper.New()
for tweet := range scraper.GetTweets(context.Background(), "Twitter", 50) {
if tweet.Error != nil {
panic(tweet.Error)
}
@ -51,7 +52,8 @@ import (
)
func main() {
for tweet := range twitterscraper.SearchTweets(context.Background(),
scraper := twitterscraper.New()
for tweet := range scraper.SearchTweets(context.Background(),
"twitter scraper data -filter:retweets", 50) {
if tweet.Error != nil {
panic(tweet.Error)
@ -76,7 +78,8 @@ import (
)
func main() {
profile, err := twitterscraper.GetProfile("Twitter")
scraper := twitterscraper.New()
profile, err := scraper.GetProfile("Twitter")
if err != nil {
panic(err)
}
@ -95,7 +98,8 @@ import (
)
func main() {
trends, err := twitterscraper.GetTrends()
scraper := twitterscraper.New()
trends, err := scraper.GetTrends()
if err != nil {
panic(err)
}
@ -106,11 +110,36 @@ func main() {
### Use http proxy
```golang
twitterscraper.SetProxy("http://localhost:3128")
err := scraper.SetProxy("http://localhost:3128")
if err != nil {
panic(err)
}
```
### Load timeline with tweet replies
```golang
twitterscraper.IncludeReplies = true
scraper.WithReplies(true)
```
### Default Scraper (Ad hoc)
In simple cases, you can use the package-level default scraper without creating your own `Scraper` instance:
```golang
import twitterscraper "github.com/n0madic/twitter-scraper"
// for tweets
twitterscraper.GetTweets(context.Background(), "Twitter", 50)
// for tweets with replies
twitterscraper.WithReplies(true).GetTweets(context.Background(), "Twitter", 50)
// for search
twitterscraper.SearchTweets(context.Background(), "twitter", 50)
// for profile
twitterscraper.GetProfile("Twitter")
// for trends
twitterscraper.GetTrends()
```

29
api.go
View file

@ -18,23 +18,22 @@ type user struct {
} `json:"data"`
}
var (
guestToken string
cacheIDs sync.Map
)
// Global cache for user IDs
var cacheIDs sync.Map
func requestAPI(req *http.Request, target interface{}) error {
if guestToken == "" {
err := GetGuestToken()
// RequestAPI gets JSON from the frontend API and decodes it into target.
func (s *Scraper) RequestAPI(req *http.Request, target interface{}) error {
if s.guestToken == "" {
err := s.GetGuestToken()
if err != nil {
return err
}
}
req.Header.Set("Authorization", "Bearer "+bearerToken)
req.Header.Set("X-Guest-Token", guestToken)
req.Header.Set("X-Guest-Token", s.guestToken)
resp, err := newHTTPClient().Do(req)
resp, err := s.client.Do(req)
if err != nil {
return err
}
@ -43,15 +42,15 @@ func requestAPI(req *http.Request, target interface{}) error {
return json.NewDecoder(resp.Body).Decode(target)
}
// GetGuestToken from API
func GetGuestToken() error {
// GetGuestToken requests a guest token from the Twitter API and stores it on the Scraper.
func (s *Scraper) GetGuestToken() error {
req, err := http.NewRequest("POST", "https://api.twitter.com/1.1/guest/activate.json", nil)
if err != nil {
return err
}
req.Header.Set("Authorization", "Bearer "+bearerToken)
resp, err := newHTTPClient().Do(req)
resp, err := s.client.Do(req)
if err != nil {
return err
}
@ -70,7 +69,7 @@ func GetGuestToken() error {
return err
}
var ok bool
if guestToken, ok = jsn["guest_token"].(string); !ok {
if s.guestToken, ok = jsn["guest_token"].(string); !ok {
return fmt.Errorf("guest_token not found")
}
@ -78,7 +77,7 @@ func GetGuestToken() error {
}
// GetUserIDByScreenName returns the user ID for screenName, checking the global ID cache before querying the API.
func GetUserIDByScreenName(screenName string) (string, error) {
func (s *Scraper) GetUserIDByScreenName(screenName string) (string, error) {
id, ok := cacheIDs.Load(screenName)
if ok {
return id.(string), nil
@ -90,7 +89,7 @@ func GetUserIDByScreenName(screenName string) (string, error) {
return "", err
}
err = requestAPI(req, &jsn)
err = s.RequestAPI(req, &jsn)
if err != nil {
return "", err
}

View file

@ -5,16 +5,18 @@ import (
)
func TestGetGuestToken(t *testing.T) {
if err := GetGuestToken(); err != nil {
scraper := New()
if err := scraper.GetGuestToken(); err != nil {
t.Errorf("getGuestToken() error = %v", err)
}
if guestToken == "" {
if scraper.guestToken == "" {
t.Error("Expected non-empty guestToken")
}
}
func TestGetUserIDByScreenName(t *testing.T) {
userID, err := GetUserIDByScreenName("Twitter")
scraper := New()
userID, err := scraper.GetUserIDByScreenName("Twitter")
if err != nil {
t.Errorf("getUserByScreenName() error = %v", err)
}

View file

@ -30,13 +30,13 @@ type Profile struct {
}
// GetProfile returns the parsed profile of the given user.
func GetProfile(username string) (Profile, error) {
userID, err := GetUserIDByScreenName(username)
func (s *Scraper) GetProfile(username string) (Profile, error) {
userID, err := s.GetUserIDByScreenName(username)
if err != nil {
return Profile{}, err
}
req, err := newRequest("GET", "https://twitter.com/i/api/2/timeline/profile/"+userID+".json")
req, err := s.newRequest("GET", "https://twitter.com/i/api/2/timeline/profile/"+userID+".json")
if err != nil {
return Profile{}, err
}
@ -47,7 +47,7 @@ func GetProfile(username string) (Profile, error) {
req.URL.RawQuery = q.Encode()
var timeline timeline
err = requestAPI(req, &timeline)
err = s.RequestAPI(req, &timeline)
if err != nil {
return Profile{}, err
}
@ -89,3 +89,8 @@ func GetProfile(username string) (Profile, error) {
return profile, nil
}
// GetProfile is the package-level convenience wrapper: it calls GetProfile on
// the shared default Scraper and returns its result unchanged.
func GetProfile(username string) (Profile, error) {
return defaultScraper.GetProfile(username)
}

66
scraper.go Normal file
View file

@ -0,0 +1,66 @@
package twitterscraper
import (
"errors"
"net"
"net/http"
"net/url"
"strings"
"time"
)
// defaultClientTimeout is the overall per-request timeout of a Scraper's HTTP
// client; SetProxy also uses it as the dial timeout for proxy connections.
const defaultClientTimeout = 10 * time.Second

// Scraper holds the per-instance state used by the scraping methods: the HTTP
// client, the guest token and the "include replies" flag.
type Scraper struct {
	// client performs every HTTP request issued by this Scraper.
	client *http.Client
	// guestToken is sent as the X-Guest-Token header; it is fetched on the
	// first API request while it is still empty.
	guestToken string
	// includeReplies is sent as the include_tweet_replies query parameter.
	includeReplies bool
}

// defaultScraper backs the package-level wrapper functions (GetTweets,
// SearchTweets, GetProfile, GetTrends, WithReplies and SetProxy). It is
// initialized here rather than in an init function so that it is ready before
// any init function of the package runs.
var defaultScraper = New()

// New creates a Scraper whose HTTP client uses no proxy and gives up on a
// request after defaultClientTimeout.
func New() Scraper {
	return Scraper{
		client: &http.Client{Timeout: defaultClientTimeout},
	}
}

// WithReplies enables or disables loading the timeline with tweet replies.
// It returns the receiver so that calls can be chained.
func (s *Scraper) WithReplies(b bool) *Scraper {
	s.includeReplies = b
	return s
}

// WithReplies sets the replies flag on the default Scraper and returns it.
// The setting persists for later package-level calls.
func WithReplies(b bool) *Scraper {
	return defaultScraper.WithReplies(b)
}

// SetProxy routes all further requests of s through the HTTP proxy given in
// the format `http://HOST:PORT`.
//
// It returns an error, and leaves the current client untouched, when proxy
// does not start with "http://", cannot be parsed as a URL, or has no host.
func (s *Scraper) SetProxy(proxy string) error {
	if !strings.HasPrefix(proxy, "http://") {
		return errors.New("only support http protocol")
	}
	urlproxy, err := url.Parse(proxy)
	if err != nil {
		return err
	}
	// "http://" alone parses successfully but can never be dialed; reject it
	// here instead of failing on the first request.
	if urlproxy.Host == "" {
		return errors.New("proxy host is empty")
	}
	s.client = &http.Client{
		// Keep the overall request timeout that New configures; without it a
		// proxy that accepts the connection but never answers would block the
		// caller forever.
		Timeout: defaultClientTimeout,
		Transport: &http.Transport{
			Proxy: http.ProxyURL(urlproxy),
			DialContext: (&net.Dialer{
				Timeout: defaultClientTimeout,
			}).DialContext,
		},
	}
	return nil
}

// SetProxy configures the HTTP proxy of the default Scraper.
func SetProxy(proxy string) error {
	return defaultScraper.SetProxy(proxy)
}

View file

@ -7,18 +7,23 @@ import (
)
// SearchTweets returns a receive-only channel of results for the given search
// query. It delegates to getTimeline, passing s.FetchSearchTweets as the
// function used to fetch tweets.
func (s *Scraper) SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *Result {
return getTimeline(ctx, query, maxTweetsNbr, s.FetchSearchTweets)
}
// SearchTweets wrapper for default Scraper
func SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *Result {
return getTimeline(ctx, query, maxTweetsNbr, FetchSearchTweets)
return defaultScraper.SearchTweets(ctx, query, maxTweetsNbr)
}
// FetchSearchTweets gets tweets for a given search query, via the Twitter frontend API
func FetchSearchTweets(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) {
func (s *Scraper) FetchSearchTweets(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) {
query = url.PathEscape(query)
if maxTweetsNbr > 200 {
maxTweetsNbr = 200
}
req, err := newRequest("GET", "https://twitter.com/i/api/2/search/adaptive.json")
req, err := s.newRequest("GET", "https://twitter.com/i/api/2/search/adaptive.json")
if err != nil {
return nil, "", err
}
@ -35,7 +40,7 @@ func FetchSearchTweets(query string, maxTweetsNbr int, cursor string) ([]*Tweet,
req.URL.RawQuery = q.Encode()
var timeline timeline
err = requestAPI(req, &timeline)
err = s.RequestAPI(req, &timeline)
if err != nil {
return nil, "", err
}

View file

@ -1,8 +1,8 @@
package twitterscraper
// GetTrends returns a list of trends.
func GetTrends() ([]string, error) {
req, err := newRequest("GET", "https://twitter.com/i/api/2/guide.json")
func (s *Scraper) GetTrends() ([]string, error) {
req, err := s.newRequest("GET", "https://twitter.com/i/api/2/guide.json")
if err != nil {
return nil, err
}
@ -15,7 +15,7 @@ func GetTrends() ([]string, error) {
req.URL.RawQuery = q.Encode()
var jsn timeline
err = requestAPI(req, &jsn)
err = s.RequestAPI(req, &jsn)
if err != nil {
return nil, err
}
@ -27,3 +27,8 @@ func GetTrends() ([]string, error) {
return trends, nil
}
// GetTrends is the package-level convenience wrapper: it calls GetTrends on
// the shared default Scraper and returns its result unchanged.
func GetTrends() ([]string, error) {
return defaultScraper.GetTrends()
}

View file

@ -6,22 +6,27 @@ import (
)
// GetTweets returns a receive-only channel of results for the given user.
// It delegates to getTimeline, passing s.FetchTweets as the function used to
// fetch tweets.
func (s *Scraper) GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result {
return getTimeline(ctx, user, maxTweetsNbr, s.FetchTweets)
}
// GetTweets wrapper for default Scraper
func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result {
return getTimeline(ctx, user, maxTweetsNbr, FetchTweets)
return defaultScraper.GetTweets(ctx, user, maxTweetsNbr)
}
// FetchTweets gets tweets for a given user, via the Twitter frontend API.
func FetchTweets(user string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) {
func (s *Scraper) FetchTweets(user string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) {
if maxTweetsNbr > 200 {
maxTweetsNbr = 200
}
userID, err := GetUserIDByScreenName(user)
userID, err := s.GetUserIDByScreenName(user)
if err != nil {
return nil, "", err
}
req, err := newRequest("GET", "https://api.twitter.com/2/timeline/profile/"+userID+".json")
req, err := s.newRequest("GET", "https://api.twitter.com/2/timeline/profile/"+userID+".json")
if err != nil {
return nil, "", err
}
@ -35,7 +40,7 @@ func FetchTweets(user string, maxTweetsNbr int, cursor string) ([]*Tweet, string
req.URL.RawQuery = q.Encode()
var timeline timeline
err = requestAPI(req, &timeline)
err = s.RequestAPI(req, &timeline)
if err != nil {
return nil, "", err
}

42
util.go
View file

@ -2,52 +2,14 @@ package twitterscraper
import (
"context"
"errors"
"fmt"
"net"
"net/http"
"net/url"
"strconv"
"strings"
"time"
)
var (
// IncludeReplies enable tweet reply
IncludeReplies bool
// HTTPProxy Public variable for Http proxy
HTTPProxy *url.URL
)
// SetProxy set http proxy format `http://HOST:PORT`
func SetProxy(proxy string) error {
if !strings.HasPrefix(proxy, "http://") {
return errors.New("only support http protocol")
}
urlproxy, err := url.Parse(proxy)
if err != nil {
return err
}
HTTPProxy = urlproxy
return nil
}
func newHTTPClient() *http.Client {
client := &http.Client{Timeout: 10 * time.Second}
if HTTPProxy != nil {
client = &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(HTTPProxy),
DialContext: (&net.Dialer{
Timeout: 10 * time.Second,
}).DialContext,
},
}
}
return client
}
func newRequest(method string, url string) (*http.Request, error) {
func (s *Scraper) newRequest(method string, url string) (*http.Request, error) {
req, err := http.NewRequest(method, url, nil)
if err != nil {
return nil, err
@ -75,7 +37,7 @@ func newRequest(method string, url string) (*http.Request, error) {
q.Add("include_ext_media_availability", "true")
q.Add("send_error_codes", "true")
q.Add("simple_quoted_tweet", "true")
q.Add("include_tweet_replies", strconv.FormatBool(IncludeReplies))
q.Add("include_tweet_replies", strconv.FormatBool(s.includeReplies))
q.Add("ext", "mediaStats,highlightedLabel")
req.URL.RawQuery = q.Encode()