From edad8f6393fc293822ed45213633be7f0efdcfc7 Mon Sep 17 00:00:00 2001 From: Alexander Sheiko Date: Fri, 11 Dec 2020 20:58:49 +0200 Subject: [PATCH] Total refactoring Used guest frontend API BREAKING CHANGE: remove tweet.HTML property Loading more information Minor fixes and changes --- README.md | 41 ++++------ api.go | 105 +++++++++++++++++++++++++ api_test.go | 24 ++++++ go.mod | 5 +- go.sum | 14 +--- profile.go | 110 ++++++++++++-------------- profile_test.go | 38 +++++---- search.go | 137 +++++--------------------------- search_test.go | 8 +- trends.go | 52 +++--------- trends_test.go | 10 ++- tweets.go | 196 ++++++--------------------------------------- tweets_test.go | 29 +++++-- types.go | 151 +++++++++++++++++++++++++++++++++++ util.go | 205 +++++++++++++++++++++++++++++++++++++++--------- 15 files changed, 628 insertions(+), 497 deletions(-) create mode 100644 api.go create mode 100644 api_test.go create mode 100644 types.go diff --git a/README.md b/README.md index 5b3a2d5..4b6ee75 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,5 @@ # Twitter Scraper -Golang implementation of python library - Twitter's API is annoying to work with, and has lots of limitations — luckily their frontend (JavaScript) has it's own API, which I reverse-engineered. No API rate limits. No tokens needed. No restrictions. Extremely fast. @@ -32,12 +30,12 @@ func main() { if tweet.Error != nil { panic(tweet.Error) } - fmt.Println(tweet.HTML) + fmt.Println(tweet.Text) } } ``` -It appears you can ask for up to 50 tweets. +It appears you can ask for up to 50 tweets (limit ~3200 tweets). 
### Search tweets by query standard operators @@ -58,32 +56,11 @@ func main() { if tweet.Error != nil { panic(tweet.Error) } - fmt.Println(tweet.HTML) + fmt.Println(tweet.Text) } } ``` -#### With http proxy -```golang -package main - -import ( - "context" - "fmt" - twitterscraper "github.com/n0madic/twitter-scraper" -) - -func main() { - twitterscraper.SetProxy("http://localhost:16379") - for tweet := range twitterscraper.SearchTweets(context.Background(), - "twitter scraper data -filter:retweets", 50) { - if tweet.Error != nil { - panic(tweet.Error) - } - fmt.Println(tweet.HTML) - } -} -``` The search ends if we have 50 tweets. See [Rules and filtering](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators) for build standard queries. @@ -125,3 +102,15 @@ func main() { fmt.Println(trends) } ``` + +### Use http proxy + +```golang +twitterscraper.SetProxy("http://localhost:3128") +``` + +### Load timeline with tweet replies + +```golang +twitterscraper.IncludeReplies = true +``` diff --git a/api.go b/api.go new file mode 100644 index 0000000..93a9109 --- /dev/null +++ b/api.go @@ -0,0 +1,105 @@ +package twitterscraper + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "sync" +) + +const bearerToken string = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" + +type user struct { + Data struct { + User struct { + RestID string `json:"rest_id"` + } `json:"user"` + } `json:"data"` +} + +var ( + guestToken string + cacheIDs sync.Map +) + +func requestAPI(req *http.Request, target interface{}) error { + if guestToken == "" { + err := GetGuestToken() + if err != nil { + return err + } + } + + req.Header.Set("Authorization", "Bearer "+bearerToken) + req.Header.Set("X-Guest-Token", guestToken) + + resp, err := newHTTPClient().Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + return json.NewDecoder(resp.Body).Decode(target) +} + +// 
GetGuestToken from API +func GetGuestToken() error { + req, err := http.NewRequest("POST", "https://api.twitter.com/1.1/guest/activate.json", nil) + if err != nil { + return err + } + req.Header.Set("Authorization", "Bearer "+bearerToken) + + resp, err := newHTTPClient().Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("response status %s", resp.Status) + } + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + return err + } + + var jsn map[string]interface{} + if err := json.Unmarshal(body, &jsn); err != nil { + return err + } + var ok bool + if guestToken, ok = jsn["guest_token"].(string); !ok { + return fmt.Errorf("guest_token not found") + } + + return nil +} + +// GetUserIDByScreenName from API +func GetUserIDByScreenName(screenName string) (string, error) { + id, ok := cacheIDs.Load(screenName) + if ok { + return id.(string), nil + } + + var jsn user + req, err := http.NewRequest("GET", "https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22"+screenName+"%22%2C%22withHighlightedLabel%22%3Atrue%7D", nil) + if err != nil { + return "", err + } + + err = requestAPI(req, &jsn) + if err != nil { + return "", err + } + + if jsn.Data.User.RestID == "" { + return "", fmt.Errorf("rest_id not found") + } + + cacheIDs.Store(screenName, jsn.Data.User.RestID) + + return jsn.Data.User.RestID, nil +} diff --git a/api_test.go b/api_test.go new file mode 100644 index 0000000..23a52ed --- /dev/null +++ b/api_test.go @@ -0,0 +1,24 @@ +package twitterscraper + +import ( + "testing" +) + +func TestGetGuestToken(t *testing.T) { + if err := GetGuestToken(); err != nil { + t.Errorf("getGuestToken() error = %v", err) + } + if guestToken == "" { + t.Error("Expected non-empty guestToken") + } +} + +func TestGetUserIDByScreenName(t *testing.T) { + userID, err := GetUserIDByScreenName("Twitter") + if err != nil { + t.Errorf("getUserByScreenName() 
error = %v", err) + } + if userID == "" { + t.Error("Expected non-empty user ID") + } +} diff --git a/go.mod b/go.mod index fc4b2c8..0bd43be 100644 --- a/go.mod +++ b/go.mod @@ -2,7 +2,4 @@ module github.com/n0madic/twitter-scraper go 1.13 -require ( - github.com/PuerkitoBio/goquery v1.5.1 - github.com/google/go-cmp v0.4.0 -) +require github.com/google/go-cmp v0.5.4 diff --git a/go.sum b/go.sum index 5fc056f..1ffcbdb 100644 --- a/go.sum +++ b/go.sum @@ -1,14 +1,4 @@ -github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= -github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= -github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= -github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= -github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4= -github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI= -golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M= +github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod 
h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/profile.go b/profile.go index 54bdae2..1978115 100644 --- a/profile.go +++ b/profile.go @@ -2,13 +2,7 @@ package twitterscraper import ( "fmt" - "net" - "net/http" - "strconv" - "strings" "time" - - "github.com/PuerkitoBio/goquery" ) // Profile of twitter user. @@ -19,12 +13,15 @@ type Profile struct { Birthday string FollowersCount int FollowingCount int + FriendsCount int IsPrivate bool IsVerified bool Joined *time.Time LikesCount int + ListedCount int Location string Name string + PinnedTweetIDs []string TweetsCount int URL string UserID string @@ -34,66 +31,61 @@ type Profile struct { // GetProfile return parsed user profile. func GetProfile(username string) (Profile, error) { - url := "https://mobile.twitter.com/" + username - - client := http.DefaultClient - if HTTPProxy != nil { - client = &http.Client{ - Transport: &http.Transport{ - Proxy: http.ProxyURL(HTTPProxy), - DialContext: (&net.Dialer{ - Timeout: 10 * time.Second, - }).DialContext, - }, - } - } - - req, err := http.NewRequest("GET", url, nil) - if err != nil { - return Profile{}, err - } - req.Header.Set("Accept-Language", "en-US") - - resp, err := client.Do(req) - if resp == nil { - return Profile{}, err - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - return Profile{}, fmt.Errorf("response status: %s", resp.Status) - } - - doc, err := goquery.NewDocumentFromReader(resp.Body) + userID, err := GetUserIDByScreenName(username) if err != nil { return Profile{}, err } - // parse join date text - screenName := doc.Find(".screen-name").First().Text() + req, err := newRequest("GET", "https://twitter.com/i/api/2/timeline/profile/"+userID+".json") + if err != nil { + return Profile{}, err + } - // check is username valid - if screenName == "" { + q := req.URL.Query() + q.Add("count", "20") + q.Add("userId", userID) + req.URL.RawQuery = q.Encode() + + var timeline timeline + err = requestAPI(req, &timeline) + if err != nil 
{ + return Profile{}, err + } + + user, found := timeline.GlobalObjects.Users[userID] + if !found { return Profile{}, fmt.Errorf("either @%s does not exist or is private", username) } - return Profile{ - Avatar: doc.Find("td.avatar > img").First().AttrOr("src", ""), - Biography: strings.TrimSpace(doc.Find(".bio").First().Text()), - FollowersCount: parseCount(doc.Find("table.profile-stats > tbody > tr > td:nth-child(3) > a > div.statnum").First().Text()), - FollowingCount: parseCount(doc.Find("table.profile-stats > tbody > tr > td:nth-child(2) > a > div.statnum").First().Text()), - IsPrivate: strings.Contains(doc.Find("div.fullname > a.badge > img").First().AttrOr("src", ""), "protected"), - IsVerified: strings.Contains(doc.Find("div.fullname > a.badge > img").First().AttrOr("src", ""), "verified"), - Location: strings.TrimSpace(doc.Find(".location").First().Text()), - Name: strings.TrimSpace(doc.Find(".fullname").First().Text()), - TweetsCount: parseCount(doc.Find("table.profile-stats > tbody > tr > td:nth-child(1) > div.statnum").First().Text()), - URL: "https://twitter.com/" + screenName, - Username: screenName, - Website: strings.TrimSpace(doc.Find("div.url > div > a").First().AttrOr("data-url", "")), - }, nil -} + profile := Profile{ + Avatar: user.ProfileImageURLHTTPS, + Banner: user.ProfileBannerURL, + Biography: user.Description, + FollowersCount: user.FollowersCount, + FollowingCount: user.FavouritesCount, + FriendsCount: user.FriendsCount, + IsPrivate: user.Protected, + IsVerified: user.Verified, + LikesCount: user.FavouritesCount, + ListedCount: user.ListedCount, + Location: user.Location, + Name: user.Name, + PinnedTweetIDs: user.PinnedTweetIdsStr, + TweetsCount: user.StatusesCount, + URL: "https://twitter.com/" + user.ScreenName, + UserID: user.IDStr, + Username: user.ScreenName, + } -func parseCount(str string) (i int) { - i, _ = strconv.Atoi(strings.Replace(str, ",", "", -1)) - return + tm, err := time.Parse(time.RubyDate, user.CreatedAt) + if err == 
nil { + tm = tm.UTC() + profile.Joined = &tm + } + + if len(user.Entities.URL.Urls) > 0 { + profile.Website = user.Entities.URL.Urls[0].ExpandedURL + } + + return profile, nil } diff --git a/profile_test.go b/profile_test.go index 1d9eec2..64dd6af 100644 --- a/profile_test.go +++ b/profile_test.go @@ -2,28 +2,30 @@ package twitterscraper import ( "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" ) func TestGetProfile(t *testing.T) { - // loc := time.FixedZone("UTC", 0) - // joined := time.Date(2007, 02, 20, 6, 35, 0, 0, loc) + loc := time.FixedZone("UTC", 0) + joined := time.Date(2007, 02, 20, 14, 35, 54, 0, loc) sample := Profile{ - Avatar: "https://pbs.twimg.com/profile_images/1308010958862905345/-SGZioPb_normal.jpg", - // Banner: "https://pbs.twimg.com/profile_banners/783214/1596041768/1500x500", + Avatar: "https://pbs.twimg.com/profile_images/1308010958862905345/-SGZioPb_normal.jpg", + Banner: "https://pbs.twimg.com/profile_banners/783214/1604501727", Biography: "What's happening!?", - // Birthday: "March 21", - IsPrivate: false, - IsVerified: true, - // Joined: &joined, - Location: "everywhere", - Name: "Twitter", - URL: "https://twitter.com/Twitter", - // UserID: "783214", - Username: "Twitter", - Website: "about.twitter.com", + // Birthday: "March 21", + IsPrivate: false, + IsVerified: true, + Joined: &joined, + Location: "everywhere", + Name: "Twitter", + PinnedTweetIDs: []string{}, + URL: "https://twitter.com/Twitter", + UserID: "783214", + Username: "Twitter", + Website: "https://about.twitter.com/", } profile, err := GetProfile("Twitter") @@ -34,7 +36,9 @@ func TestGetProfile(t *testing.T) { cmpOptions := cmp.Options{ cmpopts.IgnoreFields(Profile{}, "FollowersCount"), cmpopts.IgnoreFields(Profile{}, "FollowingCount"), + cmpopts.IgnoreFields(Profile{}, "FriendsCount"), cmpopts.IgnoreFields(Profile{}, "LikesCount"), + cmpopts.IgnoreFields(Profile{}, "ListedCount"), cmpopts.IgnoreFields(Profile{}, "TweetsCount"), } if 
diff := cmp.Diff(sample, profile, cmpOptions...); diff != "" { @@ -47,9 +51,9 @@ func TestGetProfile(t *testing.T) { if profile.FollowingCount == 0 { t.Error("Expected FollowingCount is greater than zero") } - // if profile.LikesCount == 0 { - // t.Error("Expected LikesCount is greater than zero") - // } + if profile.LikesCount == 0 { + t.Error("Expected LikesCount is greater than zero") + } if profile.TweetsCount == 0 { t.Error("Expected TweetsCount is greater than zero") } diff --git a/search.go b/search.go index 52eb540..abd2d90 100644 --- a/search.go +++ b/search.go @@ -2,143 +2,44 @@ package twitterscraper import ( "context" - "fmt" - "io" - "net" - "net/http" "net/url" - "strings" - "time" - - "github.com/PuerkitoBio/goquery" + "strconv" ) -const mobileSearchURL = "https://mobile.twitter.com/search?q=%s" - // SearchTweets returns channel with tweets for a given search query func SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *Result { - channel := make(chan *Result) - go func(query string) { - defer close(channel) - var nextCursor string - tweetsNbr := 0 - for tweetsNbr < maxTweetsNbr { - select { - case <-ctx.Done(): - channel <- &Result{Error: ctx.Err()} - return - default: - } - - tweets, next, err := FetchSearchTweets(query, nextCursor) - if err != nil { - channel <- &Result{Error: err} - return - } - - if len(tweets) == 0 { - break - } - - for _, tweet := range tweets { - select { - case <-ctx.Done(): - channel <- &Result{Error: ctx.Err()} - return - default: - } - - if tweetsNbr < maxTweetsNbr { - nextCursor = next - channel <- &Result{Tweet: *tweet} - } - tweetsNbr++ - } - } - }(query) - return channel + return getTimeline(ctx, query, maxTweetsNbr, FetchSearchTweets) } // FetchSearchTweets gets tweets for a given search query, via the Twitter frontend API -func FetchSearchTweets(query, nextCursor string) ([]*Tweet, string, error) { - url := fmt.Sprintf(mobileSearchURL, url.PathEscape(query)) - if nextCursor != "" { - url = 
"https://mobile.twitter.com" + nextCursor +func FetchSearchTweets(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) { + query = url.PathEscape(query) + if maxTweetsNbr > 200 { + maxTweetsNbr = 200 } - client := http.DefaultClient - if HTTPProxy != nil { - client = &http.Client{ - Transport: &http.Transport{ - Proxy: http.ProxyURL(HTTPProxy), - DialContext: (&net.Dialer{ - Timeout: 10 * time.Second, - }).DialContext, - }, - } - } - - req, err := http.NewRequest("GET", url, nil) + req, err := newRequest("GET", "https://twitter.com/i/api/2/search/adaptive.json") if err != nil { return nil, "", err } - req.Header.Set("Referer", "https://mobile.twitter.com/") - req.Header.Set("User-Agent", "Opera/9.80 (J2ME/MIDP; Opera Mini/5.1.21214/28.2725; U; ru) Presto/2.8.119 Version/11.10") - - resp, err := client.Do(req) - if resp == nil { - return nil, "", err + q := req.URL.Query() + q.Add("q", query) + q.Add("count", strconv.Itoa(maxTweetsNbr)) + q.Add("query_source", "typed_query") + q.Add("pc", "1") + q.Add("spelling_corrections", "1") + if cursor != "" { + q.Add("cursor", cursor) } - defer resp.Body.Close() + req.URL.RawQuery = q.Encode() - if resp.StatusCode != http.StatusOK { - return nil, "", fmt.Errorf("response status: %s", resp.Status) - } - - return readTweetsFromMobileHTML(resp.Body) -} - -func readTweetsFromMobileHTML(htm io.ReadCloser) ([]*Tweet, string, error) { - var tweets []*Tweet - - doc, err := goquery.NewDocumentFromReader(htm) + var timeline timeline + err = requestAPI(req, &timeline) if err != nil { return nil, "", err } - doc.Find("table.tweet").Each(func(i int, s *goquery.Selection) { - var tweet Tweet - tweetID, ok := s.Find(".tweet-text").Attr("data-id") - if ok { - tweet.ID = tweetID - tweet.Username = strings.TrimPrefix(strings.TrimSpace(s.Find("td.user-info > a > div.username").Text()), "@") - tweet.PermanentURL = fmt.Sprintf("https://twitter.com/%s/status/%s", tweet.Username, tweet.ID) - tweet.Text = 
strings.TrimSpace(s.Find(".tweet-text").Text()) - tweet.HTML, _ = s.Find(".tweet-text").Html() - tweet.HTML = strings.TrimSpace(tweet.HTML) - s.Find("td.tweet-social-context > span").Each(func(i int, c *goquery.Selection) { - tweet.IsRetweet = true - }) - s.Find(".twitter-hashtag").Each(func(i int, h *goquery.Selection) { - tweet.Hashtags = append(tweet.Hashtags, h.Text()) - }) - s.Find("a.tco-link:not(.u-hidden)").Each(func(i int, u *goquery.Selection) { - if link, ok := u.Attr("data-expanded-url"); ok { - tweet.URLs = append(tweet.URLs, link) - } - }) - s.Find("div.media > img").Each(func(i int, p *goquery.Selection) { - if link, ok := p.Attr("src"); ok { - tweet.Photos = append(tweet.Photos, strings.TrimSuffix(link, ":small")) - } - }) - - tweets = append(tweets, &tweet) - } - }) - - nextCursor := doc.Find("div.w-button-more > a").AttrOr("href", "") - + tweets, nextCursor := parseTimeline(&timeline) return tweets, nextCursor, nil } diff --git a/search_test.go b/search_test.go index f1e5907..4149b2d 100644 --- a/search_test.go +++ b/search_test.go @@ -7,21 +7,21 @@ import ( func TestGetSearchTweets(t *testing.T) { count := 0 - maxTweetsNbr := 50 + maxTweetsNbr := 250 for tweet := range SearchTweets(context.Background(), "twitter scraper data -filter:retweets", maxTweetsNbr) { if tweet.Error != nil { t.Error(tweet.Error) } else { count++ - if tweet.HTML == "" { - t.Error("Expected tweet HTML is not empty") - } if tweet.ID == "" { t.Error("Expected tweet ID is not empty") } if tweet.PermanentURL == "" { t.Error("Expected tweet PermanentURL is not empty") } + if tweet.IsRetweet { + t.Error("Expected tweet IsRetweet is false") + } if tweet.Text == "" { t.Error("Expected tweet Text is not empty") } diff --git a/trends.go b/trends.go index 69a316f..cf2e61a 100644 --- a/trends.go +++ b/trends.go @@ -1,55 +1,29 @@ package twitterscraper -import ( - "fmt" - "net" - "net/http" - "strings" - "time" - - "github.com/PuerkitoBio/goquery" -) - -const trendsURL = 
"https://mobile.twitter.com/trends" - // GetTrends return list of trends. func GetTrends() ([]string, error) { - client := http.DefaultClient - if HTTPProxy != nil { - client = &http.Client{ - Transport: &http.Transport{ - Proxy: http.ProxyURL(HTTPProxy), - DialContext: (&net.Dialer{ - Timeout: 10 * time.Second, - }).DialContext, - }, - } - } - - req, err := http.NewRequest("GET", trendsURL, nil) + req, err := newRequest("GET", "https://twitter.com/i/api/2/guide.json") if err != nil { return nil, err } - req.Header.Set("Accept-Language", "en-US") - resp, err := client.Do(req) - if resp == nil { - return nil, err - } - defer resp.Body.Close() + q := req.URL.Query() + q.Add("count", "20") + q.Add("candidate_source", "trends") + q.Add("include_page_configuration", "false") + q.Add("entity_tokens", "false") + req.URL.RawQuery = q.Encode() - if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("response status: %s", resp.Status) - } - - doc, err := goquery.NewDocumentFromReader(resp.Body) + var jsn timeline + err = requestAPI(req, &jsn) if err != nil { return nil, err } var trends []string - doc.Find("li.topic").Each(func(i int, s *goquery.Selection) { - trends = append(trends, strings.TrimSpace(s.Text())) - }) + for _, item := range jsn.Timeline.Instructions[1].AddEntries.Entries[1].Content.TimelineModule.Items { + trends = append(trends, item.Item.ClientEventInfo.Details.GuideDetails.TransparentGuideDetails.TrendMetadata.TrendName) + } + return trends, nil } diff --git a/trends_test.go b/trends_test.go index ad0db70..ddab009 100644 --- a/trends_test.go +++ b/trends_test.go @@ -10,7 +10,13 @@ func TestGetTrends(t *testing.T) { t.Error(err) } - if len(trends) != 10 { - t.Errorf("Expected 10 trends, got %d: %#v", len(trends), trends) + if len(trends) != 20 { + t.Errorf("Expected 20 trends, got %d: %#v", len(trends), trends) + } + + for _, trend := range trends { + if trend == "" { + t.Error("Expected trend is not empty") + } } } diff --git a/tweets.go 
b/tweets.go index 14499ad..1c945c4 100644 --- a/tweets.go +++ b/tweets.go @@ -2,196 +2,44 @@ package twitterscraper import ( "context" - "fmt" - "net/http" "strconv" - "strings" - "time" - - "github.com/PuerkitoBio/goquery" ) -const ajaxURL = "https://twitter.com/i/profiles/show/%s/timeline/tweets" - -// Video type. -type Video struct { - ID string - Preview string -} - -// Tweet type. -type Tweet struct { - Hashtags []string - HTML string - ID string - IsPin bool - IsRetweet bool - Likes int - PermanentURL string - Photos []string - Replies int - Retweets int - Text string - TimeParsed time.Time - Timestamp int64 - URLs []string - UserID string - Username string - Videos []Video -} - -// Result of scrapping. -type Result struct { - Tweet - Error error -} - // GetTweets returns channel with tweets for a given user. func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result { - channel := make(chan *Result) - go func(user string) { - defer close(channel) - var lastTweetID string - tweetsNbr := 0 - for tweetsNbr < maxTweetsNbr { - select { - case <-ctx.Done(): - channel <- &Result{Error: ctx.Err()} - return - default: - } - - tweets, err := FetchTweets(user, lastTweetID) - if err != nil { - channel <- &Result{Error: err} - return - } - - if len(tweets) == 0 { - break - } - - for _, tweet := range tweets { - select { - case <-ctx.Done(): - channel <- &Result{Error: ctx.Err()} - return - default: - } - - if tweetsNbr < maxTweetsNbr { - lastId, _ := strconv.ParseInt(tweet.ID, 10, 64) - lastTweetID = strconv.FormatInt(lastId-1, 10) - channel <- &Result{Tweet: *tweet} - } - tweetsNbr++ - } - } - }(user) - return channel + return getTimeline(ctx, user, maxTweetsNbr, FetchTweets) } // FetchTweets gets tweets for a given user, via the Twitter frontend API. 
-func FetchTweets(user string, last string) ([]*Tweet, error) { - req, err := http.NewRequest("GET", "https://syndication.twitter.com/timeline/profile", nil) - if err != nil { - return nil, err +func FetchTweets(user string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) { + if maxTweetsNbr > 200 { + maxTweetsNbr = 200 } - req.Header.Set("Referer", "https://publish.twitter.com/") + userID, err := GetUserIDByScreenName(user) + if err != nil { + return nil, "", err + } + + req, err := newRequest("GET", "https://api.twitter.com/2/timeline/profile/"+userID+".json") + if err != nil { + return nil, "", err + } q := req.URL.Query() - q.Add("screen_name", user) - q.Add("with_replies", "true") - if last != "" { - q.Add("max_position", last) + q.Add("count", strconv.Itoa(maxTweetsNbr)) + q.Add("userId", userID) + if cursor != "" { + q.Add("cursor", cursor) } req.URL.RawQuery = q.Encode() - htm, err := getHTMLFromJSON(req, "body") + var timeline timeline + err = requestAPI(req, &timeline) if err != nil { - return nil, err + return nil, "", err } - tweets, err := readTweetsFromHTML(htm) - if err != nil { - return nil, err - } - - return tweets, nil -} - -func readTweetsFromHTML(htm *strings.Reader) ([]*Tweet, error) { - var tweets []*Tweet - - doc, err := goquery.NewDocumentFromReader(htm) - if err != nil { - return nil, err - } - - doc.Find(".timeline-Tweet").Each(func(i int, s *goquery.Selection) { - var tweet Tweet - timeStr, ok := s.Find(".timeline-Tweet-metadata > a > time").Attr("datetime") - if ok { - tweet.TimeParsed, _ = time.Parse("2006-01-02T15:04:05-0700", timeStr) - tweet.Timestamp = tweet.TimeParsed.Unix() - tweet.ID = s.AttrOr("data-tweet-id", "") - // tweet.UserID = s.Find(".tweet").AttrOr("data-user-id", "") - tweet.Username = strings.TrimPrefix(s.Find(".TweetAuthor-screenName").AttrOr("title", ""), "@") - tweet.PermanentURL = fmt.Sprintf("https://twitter.com/%s/status/%s", tweet.Username, tweet.ID) - tweet.Text = 
s.Find(".timeline-Tweet-text").Text() - tweet.HTML, _ = s.Find(".timeline-Tweet-text").Html() - s.Find(".timeline-Tweet-retweetCredit").Each(func(i int, c *goquery.Selection) { - tweet.IsRetweet = true - }) - // s.Find("span.js-pinned-text").Each(func(i int, c *goquery.Selection) { - // tweet.IsPin = true - // }) - // s.Find(".ProfileTweet-actionCount").Each(func(i int, c *goquery.Selection) { - // txt := strings.TrimSpace(c.Text()) - // switch { - // case strings.HasSuffix(txt, "likes"): - // l := strings.Split(txt, " ") - // tweet.Likes, _ = strconv.Atoi(l[0]) - // case strings.HasSuffix(txt, "replies"): - // l := strings.Split(txt, " ") - // tweet.Replies, _ = strconv.Atoi(l[0]) - // case strings.HasSuffix(txt, "retweets"): - // l := strings.Split(txt, " ") - // tweet.Retweets, _ = strconv.Atoi(l[0]) - // } - // }) - s.Find(".hashtag > span.PrettyLink-value").Each(func(i int, h *goquery.Selection) { - tweet.Hashtags = append(tweet.Hashtags, h.Text()) - }) - s.Find("a.link:not(.u-hidden)").Each(func(i int, u *goquery.Selection) { - if link, ok := u.Attr("data-expanded-url"); ok { - tweet.URLs = append(tweet.URLs, link) - } - }) - s.Find(".NaturalImage-image").Each(func(i int, p *goquery.Selection) { - if link, ok := p.Attr("data-image"); ok { - tweet.Photos = append(tweet.Photos, link+"?format=jpg&name=orig") - } - }) - s.Find(".CroppedImage-image").Each(func(i int, p *goquery.Selection) { - if link, ok := p.Attr("data-image"); ok { - tweet.Photos = append(tweet.Photos, link+"?format=jpg&name=orig") - } - }) - // s.Find(".PlayableMedia-player").Each(func(i int, v *goquery.Selection) { - // if style, ok := v.Attr("style"); ok { - // if strings.Contains(style, "background") { - // match := regexp.MustCompile(`https:\/\/.+\/([\w-]+)\.(?:jpg|png)`).FindStringSubmatch(style) - // if len(match) == 2 { - // tweet.Videos = append(tweet.Videos, Video{ID: match[1], Preview: match[0]}) - // } - // } - // } - // }) - tweets = append(tweets, &tweet) - } - }) - - return 
tweets, nil + tweets, nextCursor := parseTimeline(&timeline) + return tweets, nextCursor, nil } diff --git a/tweets_test.go b/tweets_test.go index 105db6b..4897eca 100644 --- a/tweets_test.go +++ b/tweets_test.go @@ -7,21 +7,25 @@ import ( func TestGetTweets(t *testing.T) { count := 0 - maxTweetsNbr := 50 + maxTweetsNbr := 300 + dupcheck := make(map[string]bool) for tweet := range GetTweets(context.Background(), "Twitter", maxTweetsNbr) { if tweet.Error != nil { t.Error(tweet.Error) } else { count++ - if tweet.HTML == "" { - t.Error("Expected tweet HTML is not empty") - } if tweet.ID == "" { t.Error("Expected tweet ID is not empty") + } else { + if dupcheck[tweet.ID] { + t.Errorf("Detect duplicated tweet ID: %s", tweet.ID) + } else { + dupcheck[tweet.ID] = true + } + } + if tweet.UserID == "" { + t.Error("Expected tweet UserID is not empty") } - // if tweet.UserID == "" { - // t.Error("Expected tweet UserID is not empty") - // } if tweet.Username == "" { t.Error("Expected tweet Username is not empty") } @@ -37,6 +41,17 @@ func TestGetTweets(t *testing.T) { if tweet.Timestamp == 0 { t.Error("Expected tweet Timestamp is greater than zero") } + for _, video := range tweet.Videos { + if video.ID == "" { + t.Error("Expected tweet video ID is not empty") + } + if video.Preview == "" { + t.Error("Expected tweet video Preview is not empty") + } + if video.URL == "" { + t.Error("Expected tweet video URL is not empty") + } + } } } if count != maxTweetsNbr { diff --git a/types.go b/types.go new file mode 100644 index 0000000..13f5377 --- /dev/null +++ b/types.go @@ -0,0 +1,151 @@ +package twitterscraper + +import "time" + +type ( + // Video type. + Video struct { + ID string + Preview string + URL string + } + + // Tweet type. 
+ Tweet struct { + Hashtags []string + ID string + IsQuoted bool + IsPin bool + IsReply bool + IsRetweet bool + Likes int + PermanentURL string + Photos []string + Replies int + Retweets int + Text string + TimeParsed time.Time + Timestamp int64 + URLs []string + UserID string + Username string + Videos []Video + } + + // Result of scrapping. + Result struct { + Tweet + Error error + } + + // timeline JSON + timeline struct { + GlobalObjects struct { + Tweets map[string]struct { + ConversationIDStr string `json:"conversation_id_str"` + CreatedAt string `json:"created_at"` + FavoriteCount int `json:"favorite_count"` + FullText string `json:"full_text"` + Entities struct { + Hashtags []struct { + Text string `json:"text"` + } `json:"hashtags"` + Media []struct { + MediaURLHttps string `json:"media_url_https"` + Type string `json:"type"` + } `json:"media"` + URLs []struct { + ExpandedURL string `json:"expanded_url"` + } `json:"urls"` + } `json:"entities"` + ExtendedEntities struct { + Media []struct { + IDStr string `json:"id_str"` + MediaURLHttps string `json:"media_url_https"` + Type string `json:"type"` + VideoInfo struct { + Variants []struct { + Bitrate int `json:"bitrate,omitempty"` + URL string `json:"url"` + } `json:"variants"` + } `json:"video_info"` + } `json:"media"` + } `json:"extended_entities"` + InReplyToStatusIDStr string `json:"in_reply_to_status_id_str"` + ReplyCount int `json:"reply_count"` + RetweetCount int `json:"retweet_count"` + RetweetedStatusIDStr string `json:"retweeted_status_id_str"` + QuotedStatusIDStr string `json:"quoted_status_id_str"` + Time time.Time `json:"time"` + UserIDStr string `json:"user_id_str"` + } `json:"tweets"` + Users map[string]struct { + CreatedAt string `json:"created_at"` + Description string `json:"description"` + Entities struct { + URL struct { + Urls []struct { + ExpandedURL string `json:"expanded_url"` + } `json:"urls"` + } `json:"url"` + } `json:"entities"` + FavouritesCount int `json:"favourites_count"` + 
FollowersCount int `json:"followers_count"` + FriendsCount int `json:"friends_count"` + IDStr string `json:"id_str"` + ListedCount int `json:"listed_count"` + Name string `json:"name"` + Location string `json:"location"` + PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"` + ProfileBannerURL string `json:"profile_banner_url"` + ProfileImageURLHTTPS string `json:"profile_image_url_https"` + Protected bool `json:"protected"` + ScreenName string `json:"screen_name"` + StatusesCount int `json:"statuses_count"` + Verified bool `json:"verified"` + } `json:"users"` + } `json:"globalObjects"` + Timeline struct { + Instructions []struct { + AddEntries struct { + Entries []struct { + Content struct { + Item struct { + Content struct { + Tweet struct { + ID string `json:"id"` + } `json:"tweet"` + } `json:"content"` + } `json:"item"` + Operation struct { + Cursor struct { + Value string `json:"value"` + CursorType string `json:"cursorType"` + } `json:"cursor"` + } `json:"operation"` + TimelineModule struct { + Items []struct { + Item struct { + ClientEventInfo struct { + Details struct { + GuideDetails struct { + TransparentGuideDetails struct { + TrendMetadata struct { + TrendName string `json:"trendName"` + } `json:"trendMetadata"` + } `json:"transparentGuideDetails"` + } `json:"guideDetails"` + } `json:"details"` + } `json:"clientEventInfo"` + } `json:"item"` + } `json:"items"` + } `json:"timelineModule"` + } `json:"content,omitempty"` + } `json:"entries"` + } `json:"addEntries"` + } `json:"instructions"` + } `json:"timeline"` + } + + fetchFunc func(user string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) +) diff --git a/util.go b/util.go index 155d0e0..8f6540d 100644 --- a/util.go +++ b/util.go @@ -1,25 +1,30 @@ package twitterscraper import ( - "encoding/json" + "context" "errors" "fmt" + "net" "net/http" "net/url" - "regexp" + "strconv" "strings" + "time" ) -//HttpProxy Public variable for Http proxy -var HTTPProxy *url.URL +var ( + // IncludeReplies 
enable tweet reply
+	IncludeReplies bool
+	// HTTPProxy is the proxy URL used by newHTTPClient; nil means no proxy.
+	HTTPProxy *url.URL
+)
 
-//SetProxy set http proxy format `http://HOST:PORT`
-func SetProxy(Proxy string) error {
-	match, _ := regexp.MatchString("http.+", Proxy)
-	if !match {
+// SetProxy sets the HTTP proxy; proxy must have the form `http://HOST:PORT`.
+func SetProxy(proxy string) error {
+	if !strings.HasPrefix(proxy, "http://") {
 		return errors.New("only support http protocol")
 	}
-	urlproxy, err := url.Parse(Proxy)
+	urlproxy, err := url.Parse(proxy)
 	if err != nil {
 		return err
 	}
@@ -27,42 +32,199 @@ func SetProxy(Proxy string) error {
 	return nil
 }
 
-func newRequest(url string) (*http.Request, error) {
-	req, err := http.NewRequest("GET", url, nil)
+// newHTTPClient returns an HTTP client with a 10-second overall request
+// timeout. When HTTPProxy is set, requests are routed through that proxy,
+// with a 10-second limit on establishing the connection.
+func newHTTPClient() *http.Client {
+	client := &http.Client{Timeout: 10 * time.Second}
+	if HTTPProxy != nil {
+		// Attach the transport to the existing client so the request
+		// timeout above also applies to proxied requests.
+		client.Transport = &http.Transport{
+			Proxy: http.ProxyURL(HTTPProxy),
+			DialContext: (&net.Dialer{
+				Timeout: 10 * time.Second,
+			}).DialContext,
+		}
+	}
+	return client
+}
+
+// newRequest builds a request for the given method and URL and appends a
+// fixed set of query parameters to it. The value of include_tweet_replies
+// is taken from IncludeReplies at call time.
+func newRequest(method string, url string) (*http.Request, error) {
+	req, err := http.NewRequest(method, url, nil)
 	if err != nil {
 		return nil, err
 	}
 
-	req.Header.Set("Accept", "application/json, text/javascript, */*; q=0.01")
-	req.Header.Set("Accept-Language", "en-US")
-	req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8")
-	req.Header.Set("X-Twitter-Active-User", "yes")
-	req.Header.Set("X-Requested-With", "XMLHttpRequest")
+	q := req.URL.Query()
+	q.Add("include_profile_interstitial_type", "1")
+	q.Add("include_blocking", "1")
+	q.Add("include_blocked_by", "1")
+	q.Add("include_followed_by", "1")
+	q.Add("include_want_retweets", "1")
+	q.Add("include_mute_edge", "1")
+	q.Add("include_can_dm", "1")
+	q.Add("include_can_media_tag", "1")
+	q.Add("skip_status", "1")
+	q.Add("cards_platform", "Web-12")
+	q.Add("include_cards", "1")
+	q.Add("include_ext_alt_text", "true")
+	q.Add("include_quote_count", "true")
+	q.Add("include_reply_count", "1")
+	q.Add("tweet_mode", "extended")
+	q.Add("include_entities", "true")
+	q.Add("include_user_entities", "true")
+	q.Add("include_ext_media_color", "true")
+	q.Add("include_ext_media_availability", "true")
+	q.Add("send_error_codes", "true")
+	q.Add("simple_quoted_tweet", "true")
+	q.Add("include_tweet_replies", strconv.FormatBool(IncludeReplies))
+	q.Add("ext", "mediaStats,highlightedLabel")
+	req.URL.RawQuery = q.Encode()
 
 	return req, nil
 }
 
-func getHTMLFromJSON(req *http.Request, field string) (*strings.Reader, error) {
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return nil, err
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		return nil, fmt.Errorf("response status: %s", resp.Status)
-	}
-
-	ajaxJSON := make(map[string]interface{})
-	err = json.NewDecoder(resp.Body).Decode(&ajaxJSON)
-	if err != nil {
-		return nil, err
-	}
-
-	htm, ok := ajaxJSON[field].(string)
-	if !ok {
-		return nil, fmt.Errorf("field %s not found in JSON", field)
-	}
-
-	return strings.NewReader(htm), nil
-}
+// getTimeline streams up to maxTweetsNbr tweets for query over the returned
+// channel, calling fetchFunc once per page with the cursor of the previous
+// page. A fetch error or a cancelled ctx is delivered as a Result with Error
+// set and ends the stream; the channel is closed when the goroutine exits.
+// The channel is unbuffered, so every send waits for the consumer to receive.
+func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc fetchFunc) <-chan *Result {
+	channel := make(chan *Result)
+	go func() {
+		defer close(channel)
+		var nextCursor string
+		tweetsNbr := 0
+		for tweetsNbr < maxTweetsNbr {
+			select {
+			case <-ctx.Done():
+				channel <- &Result{Error: ctx.Err()}
+				return
+			default:
+			}
+
+			tweets, next, err := fetchFunc(query, maxTweetsNbr, nextCursor)
+			if err != nil {
+				channel <- &Result{Error: err}
+				return
+			}
+
+			if len(tweets) == 0 {
+				break
+			}
+
+			for _, tweet := range tweets {
+				if tweetsNbr >= maxTweetsNbr {
+					break
+				}
+				select {
+				case <-ctx.Done():
+					channel <- &Result{Error: ctx.Err()}
+					return
+				default:
+				}
+
+				channel <- &Result{Tweet: *tweet}
+				tweetsNbr++
+			}
+
+			// An empty or unchanged cursor would make the next call fetch the
+			// same page again and emit duplicates, so stop here instead.
+			if next == "" || next == nextCursor {
+				break
+			}
+			nextCursor = next
+		}
+	}()
+	return channel
+}
+
+// parseTimeline converts a decoded timeline response into tweets. Tweets are
+// built from GlobalObjects and returned in the order of the entries of the
+// first timeline instruction; tweets not referenced by an entry are dropped.
+// The second result is the value of the "Bottom" cursor, or "" if none is
+// present.
+func parseTimeline(timeline *timeline) ([]*Tweet, string) {
+	tweets := make(map[string]Tweet, len(timeline.GlobalObjects.Tweets))
+
+	for id, tweet := range timeline.GlobalObjects.Tweets {
+		user := timeline.GlobalObjects.Users[tweet.UserIDStr]
+		// NOTE(review): Replies is filled from RetweetCount, exactly like
+		// Retweets; this looks like a copy-paste slip. Confirm the name of
+		// the reply counter on the timeline tweet type before changing it.
+		tw := Tweet{
+			ID:           id,
+			Likes:        tweet.FavoriteCount,
+			PermanentURL: fmt.Sprintf("https://twitter.com/%s/status/%s", user.ScreenName, id),
+			Replies:      tweet.RetweetCount,
+			Retweets:     tweet.RetweetCount,
+			Text:         tweet.FullText,
+			UserID:       tweet.UserIDStr,
+			Username:     user.ScreenName,
+		}
+		if tm, err := time.Parse(time.RubyDate, tweet.CreatedAt); err == nil {
+			tw.TimeParsed = tm
+			tw.Timestamp = tm.Unix()
+		}
+		tw.IsQuoted = tweet.QuotedStatusIDStr != ""
+		tw.IsReply = tweet.InReplyToStatusIDStr != ""
+		tw.IsRetweet = tweet.RetweetedStatusIDStr != ""
+		for _, pinned := range user.PinnedTweetIdsStr {
+			if tweet.ConversationIDStr == pinned {
+				tw.IsPin = true
+				break
+			}
+		}
+		for _, hash := range tweet.Entities.Hashtags {
+			tw.Hashtags = append(tw.Hashtags, hash.Text)
+		}
+		for _, media := range tweet.Entities.Media {
+			if media.Type == "photo" {
+				tw.Photos = append(tw.Photos, media.MediaURLHttps)
+			}
+		}
+		for _, media := range tweet.ExtendedEntities.Media {
+			if media.Type != "video" {
+				continue
+			}
+			video := Video{
+				ID:      media.IDStr,
+				Preview: media.MediaURLHttps,
+			}
+			// The variant kept is the one with the highest bitrate.
+			maxBitrate := 0
+			for _, variant := range media.VideoInfo.Variants {
+				if variant.Bitrate > maxBitrate {
+					maxBitrate = variant.Bitrate
+					video.URL = strings.TrimSuffix(variant.URL, "?tag=10")
+				}
+			}
+			tw.Videos = append(tw.Videos, video)
+		}
+		for _, u := range tweet.Entities.URLs {
+			tw.URLs = append(tw.URLs, u.ExpandedURL)
+		}
+		tweets[tw.ID] = tw
+	}
+
+	var cursor string
+	var orderedTweets []*Tweet
+	// Without this guard a response with no instructions would panic on the
+	// index below.
+	if len(timeline.Timeline.Instructions) == 0 {
+		return orderedTweets, cursor
+	}
+	for _, entry := range timeline.Timeline.Instructions[0].AddEntries.Entries {
+		if tweet, ok := tweets[entry.Content.Item.Content.Tweet.ID]; ok {
+			orderedTweets = append(orderedTweets, &tweet)
+		}
+		if entry.Content.Operation.Cursor.CursorType == "Bottom" {
+			cursor = entry.Content.Operation.Cursor.Value
+		}
+	}
+	return orderedTweets, cursor
+}