From 391e443058c60ac4fac55c840387f342959c6b30 Mon Sep 17 00:00:00 2001
From: Alexander Sheiko
Date: Mon, 3 Jul 2023 14:38:46 +0300
Subject: [PATCH] Use search GraphQL API

Close #109 #116
---
 README.md      |   2 +-
 search.go      | 147 +++++++++++++++++++++++++++++-----
 search_test.go |  27 +++++--
 timeline_v2.go | 189 ++-----------------------------------------------
 util.go        | 184 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 345 insertions(+), 204 deletions(-)

diff --git a/README.md b/README.md
index 7c7cdb0..252199b 100644
--- a/README.md
+++ b/README.md
@@ -155,7 +155,7 @@ import (
 
 func main() {
 	scraper := twitterscraper.New()
-	err := scraper.LoginOpenAccount()
+	err := scraper.Login(username, password)
 	if err != nil {
 		panic(err)
 	}
diff --git a/search.go b/search.go
index b0fa45b..b4254f9 100644
--- a/search.go
+++ b/search.go
@@ -3,10 +3,95 @@ package twitterscraper
 import (
 	"context"
 	"errors"
+	"net/url"
 	"strconv"
 )
 
-const searchURL = "https://api.twitter.com/2/search/adaptive.json"
+const searchURL = "https://twitter.com/i/api/graphql/nK1dw4oV3k4w5TdtcAdSww/SearchTimeline"
+
+// searchTimeline is the decoding target for responses fetched from searchURL.
+// Each instruction carries either a list of entries or a single replaced entry.
+type searchTimeline struct {
+	Data struct {
+		SearchByRawQuery struct {
+			SearchTimeline struct {
+				Timeline struct {
+					Instructions []struct {
+						Type    string  `json:"type"`
+						Entries []entry `json:"entries"`
+						Entry   entry   `json:"entry,omitempty"`
+					} `json:"instructions"`
+				} `json:"timeline"`
+			} `json:"search_timeline"`
+		} `json:"search_by_raw_query"`
+	} `json:"data"`
+}
+
+// parseTweets returns the tweets found in the timeline together with the
+// "Bottom" cursor used to request the next page ("" when none is present).
+func (timeline *searchTimeline) parseTweets() ([]*Tweet, string) {
+	tweets := make([]*Tweet, 0)
+	cursor := ""
+	for _, instruction := range timeline.Data.SearchByRawQuery.SearchTimeline.Timeline.Instructions {
+		if instruction.Type == "TimelineAddEntries" || instruction.Type == "TimelineReplaceEntry" {
+			if instruction.Entry.Content.CursorType == "Bottom" {
+				cursor = instruction.Entry.Content.Value
+				continue
+			}
+			for i := range instruction.Entries {
+				// Address the slice element instead of a range copy:
+				// parseLegacyTweet keeps a pointer into the tweet it is given
+				// (tw.Place), and before Go 1.22 the range variable is shared by
+				// all iterations, so every Place would alias the last entry.
+				content := &instruction.Entries[i].Content
+				if content.ItemContent.TweetDisplayType == "Tweet" {
+					res := &content.ItemContent.TweetResults.Result
+					if tweet := parseLegacyTweet(&res.Core.UserResults.Result.Legacy, &res.Legacy); tweet != nil {
+						if tweet.Views == 0 && res.Views.Count != "" {
+							// Error ignored on purpose: a non-numeric count leaves Views at 0.
+							tweet.Views, _ = strconv.Atoi(res.Views.Count)
+						}
+						tweets = append(tweets, tweet)
+					}
+				} else if content.CursorType == "Bottom" {
+					cursor = content.Value
+				}
+			}
+		}
+	}
+	return tweets, cursor
+}
+
+// parseUsers returns the profiles found in the timeline together with the
+// "Bottom" cursor used to request the next page ("" when none is present).
+func (timeline *searchTimeline) parseUsers() ([]*Profile, string) {
+	profiles := make([]*Profile, 0)
+	cursor := ""
+	for _, instruction := range timeline.Data.SearchByRawQuery.SearchTimeline.Timeline.Instructions {
+		if instruction.Type == "TimelineAddEntries" || instruction.Type == "TimelineReplaceEntry" {
+			if instruction.Entry.Content.CursorType == "Bottom" {
+				cursor = instruction.Entry.Content.Value
+				continue
+			}
+			for i := range instruction.Entries {
+				content := &instruction.Entries[i].Content
+				if content.ItemContent.UserDisplayType == "User" {
+					user := &content.ItemContent.UserResults.Result
+					// Entries whose parsed profile has no name are skipped.
+					if profile := parseProfile(user.Legacy); profile.Name != "" {
+						if profile.UserID == "" {
+							profile.UserID = user.RestID
+						}
+						profiles = append(profiles, &profile)
+					}
+				} else if content.CursorType == "Bottom" {
+					cursor = content.Value
+				}
+			}
+		}
+	}
+	return profiles, cursor
+}
 
 // SearchTweets returns channel with tweets for a given search query
 func (s *Scraper) SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *TweetResult {
@@ -19,7 +104,7 @@ func (s *Scraper) SearchProfiles(ctx context.Context, query string, maxProfilesN
 }
 
 // getSearchTimeline gets results for a given search query, via the Twitter frontend API
-func (s *Scraper) getSearchTimeline(query string, maxNbr int, cursor string) (*timelineV1, error) {
+func (s *Scraper) 
getSearchTimeline(query string, maxNbr int, cursor string) (*searchTimeline, error) { if !s.isLogged { return nil, errors.New("scraper is not logged in for search") } @@ -33,31 +102,61 @@ func (s *Scraper) getSearchTimeline(query string, maxNbr int, cursor string) (*t return nil, err } - q := req.URL.Query() - q.Add("q", query) - q.Add("count", strconv.Itoa(maxNbr)) - q.Add("query_source", "typed_query") - q.Add("pc", "1") - q.Add("requestContext", "launch") - q.Add("spelling_corrections", "1") - q.Add("include_ext_edit_control", "true") + variables := map[string]interface{}{ + "rawQuery": query, + "count": maxNbr, + "querySource": "typed_query", + "product": "Top", + } + + features := map[string]interface{}{ + "rweb_lists_timeline_redesign_enabled": true, + "responsive_web_graphql_exclude_directive_enabled": true, + "verified_phone_label_enabled": false, + "creator_subscriptions_tweet_preview_api_enabled": true, + "responsive_web_graphql_timeline_navigation_enabled": true, + "responsive_web_graphql_skip_user_profile_image_extensions_enabled": false, + "tweetypie_unmention_optimization_enabled": true, + "responsive_web_edit_tweet_api_enabled": true, + "graphql_is_translatable_rweb_tweet_is_translatable_enabled": true, + "view_counts_everywhere_api_enabled": true, + "longform_notetweets_consumption_enabled": true, + "responsive_web_twitter_article_tweet_consumption_enabled": false, + "tweet_awards_web_tipping_enabled": false, + "freedom_of_speech_not_reach_fetch_enabled": true, + "standardized_nudges_misinfo": true, + "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": true, + "longform_notetweets_rich_text_read_enabled": true, + "longform_notetweets_inline_media_enabled": true, + "responsive_web_media_download_video_enabled": false, + "responsive_web_enhance_cards_enabled": false, + } + + fieldToggles := map[string]interface{}{ + "withArticleRichContentState": false, + } + if cursor != "" { - q.Add("cursor", cursor) + variables["cursor"] = 
cursor } switch s.searchMode { case SearchLatest: - q.Add("tweet_search_mode", "live") + variables["product"] = "Latest" case SearchPhotos: - q.Add("result_filter", "image") + variables["product"] = "Photos" case SearchVideos: - q.Add("result_filter", "video") + variables["product"] = "Videos" case SearchUsers: - q.Add("result_filter", "user") + variables["product"] = "People" } + q := url.Values{} + q.Set("variables", mapToJSONString(variables)) + q.Set("features", mapToJSONString(features)) + q.Set("fieldToggles", mapToJSONString(fieldToggles)) req.URL.RawQuery = q.Encode() - var timeline timelineV1 + var timeline searchTimeline err = s.RequestAPI(req, &timeline) if err != nil { return nil, err diff --git a/search_test.go b/search_test.go index fcd03b4..84050b1 100644 --- a/search_test.go +++ b/search_test.go @@ -2,18 +2,23 @@ package twitterscraper_test import ( "context" + "os" "testing" twitterscraper "github.com/n0madic/twitter-scraper" ) -var searchScraper = twitterscraper.New() - func TestFetchSearchCursor(t *testing.T) { - err := searchScraper.LoginOpenAccount() + if os.Getenv("SKIP_AUTH_TEST") != "" { + t.Skip("Skipping test due to environment variable") + } + searchScraper := twitterscraper.New() + err := searchScraper.Login(username, password) if err != nil { t.Fatal(err) } + defer searchScraper.Logout() + maxTweetsNbr := 150 tweetsNbr := 0 nextCursor := "" @@ -31,13 +36,19 @@ func TestFetchSearchCursor(t *testing.T) { } func TestGetSearchProfiles(t *testing.T) { + if os.Getenv("SKIP_AUTH_TEST") != "" { + t.Skip("Skipping test due to environment variable") + } count := 0 maxProfilesNbr := 150 dupcheck := make(map[string]bool) - err := searchScraper.LoginOpenAccount() + searchScraper := twitterscraper.New() + err := searchScraper.Login(username, password) if err != nil { t.Fatal(err) } + defer searchScraper.Logout() + searchScraper.SetSearchMode(twitterscraper.SearchUsers) for profile := range searchScraper.SearchProfiles(context.Background(), "Twitter", 
maxProfilesNbr) { if profile.Error != nil { @@ -61,13 +72,19 @@ func TestGetSearchProfiles(t *testing.T) { } } func TestGetSearchTweets(t *testing.T) { + if os.Getenv("SKIP_AUTH_TEST") != "" { + t.Skip("Skipping test due to environment variable") + } count := 0 maxTweetsNbr := 150 dupcheck := make(map[string]bool) - err := searchScraper.LoginOpenAccount() + searchScraper := twitterscraper.New() + err := searchScraper.Login(username, password) if err != nil { t.Fatal(err) } + defer searchScraper.Logout() + searchScraper.SetSearchMode(twitterscraper.SearchLatest) for tweet := range searchScraper.SearchTweets(context.Background(), "twitter", maxTweetsNbr) { if tweet.Error != nil { diff --git a/timeline_v2.go b/timeline_v2.go index 020177e..b00db8e 100644 --- a/timeline_v2.go +++ b/timeline_v2.go @@ -1,10 +1,7 @@ package twitterscraper import ( - "fmt" "strconv" - "strings" - "time" ) type result struct { @@ -66,6 +63,13 @@ type entry struct { TweetResults struct { Result result `json:"result"` } `json:"tweet_results"` + UserDisplayType string `json:"userDisplayType"` + UserResults struct { + Result struct { + RestID string `json:"rest_id"` + Legacy legacyUser `json:"legacy"` + } `json:"result"` + } `json:"user_results"` } `json:"itemContent"` } `json:"content"` } @@ -166,182 +170,3 @@ func (conversation *threadedConversation) parse() []*Tweet { } return tweets } - -func parseLegacyTweet(user *legacyUser, tweet *legacyTweet) *Tweet { - username := user.ScreenName - name := user.Name - tweetID := tweet.IDStr - tw := &Tweet{ - ConversationID: tweet.ConversationIDStr, - ID: tweetID, - Likes: tweet.FavoriteCount, - Name: name, - PermanentURL: fmt.Sprintf("https://twitter.com/%s/status/%s", username, tweetID), - Replies: tweet.ReplyCount, - Retweets: tweet.RetweetCount, - Text: tweet.FullText, - UserID: tweet.UserIDStr, - Username: username, - } - - tm, err := time.Parse(time.RubyDate, tweet.CreatedAt) - if err == nil { - tw.TimeParsed = tm - tw.Timestamp = tm.Unix() - } - 
- if tweet.Place.ID != "" { - tw.Place = &tweet.Place - } - - if tweet.QuotedStatusIDStr != "" { - tw.IsQuoted = true - tw.QuotedStatusID = tweet.QuotedStatusIDStr - } - if tweet.InReplyToStatusIDStr != "" { - tw.IsReply = true - tw.InReplyToStatusID = tweet.InReplyToStatusIDStr - } - if tweet.RetweetedStatusIDStr != "" || tweet.RetweetedStatusResult.Result != nil { - tw.IsRetweet = true - tw.RetweetedStatusID = tweet.RetweetedStatusIDStr - if tweet.RetweetedStatusResult.Result != nil { - tw.RetweetedStatus = parseLegacyTweet(&tweet.RetweetedStatusResult.Result.Core.UserResults.Result.Legacy, &tweet.RetweetedStatusResult.Result.Legacy) - tw.RetweetedStatusID = tw.RetweetedStatus.ID - } - } - - if tweet.Views.Count != "" { - views, viewsErr := strconv.Atoi(tweet.Views.Count) - if viewsErr != nil { - views = 0 - } - tw.Views = views - } - - for _, pinned := range user.PinnedTweetIdsStr { - if tweet.IDStr == pinned { - tw.IsPin = true - break - } - } - - for _, hash := range tweet.Entities.Hashtags { - tw.Hashtags = append(tw.Hashtags, hash.Text) - } - - for _, mention := range tweet.Entities.UserMentions { - tw.Mentions = append(tw.Mentions, Mention{ - ID: mention.IDStr, - Username: mention.ScreenName, - Name: mention.Name, - }) - } - - for _, media := range tweet.ExtendedEntities.Media { - if media.Type == "photo" { - photo := Photo{ - ID: media.IDStr, - URL: media.MediaURLHttps, - } - - tw.Photos = append(tw.Photos, photo) - } else if media.Type == "video" { - video := Video{ - ID: media.IDStr, - Preview: media.MediaURLHttps, - } - - maxBitrate := 0 - for _, variant := range media.VideoInfo.Variants { - if variant.Bitrate > maxBitrate { - video.URL = strings.TrimSuffix(variant.URL, "?tag=10") - maxBitrate = variant.Bitrate - } - } - - tw.Videos = append(tw.Videos, video) - } else if media.Type == "animated_gif" { - gif := GIF{ - ID: media.IDStr, - Preview: media.MediaURLHttps, - } - - // Twitter's API doesn't provide bitrate for GIFs, (it's always set to zero). 
- // Therefore we check for `>=` instead of `>` in the loop below. - // Also, GIFs have just a single variant today. Just in case that changes in the future, - // and there will be multiple variants, we'll pick the one with the highest bitrate, - // if other one will have a non-zero bitrate. - maxBitrate := 0 - for _, variant := range media.VideoInfo.Variants { - if variant.Bitrate >= maxBitrate { - gif.URL = variant.URL - maxBitrate = variant.Bitrate - } - } - - tw.GIFs = append(tw.GIFs, gif) - } - - if !tw.SensitiveContent { - sensitive := media.ExtSensitiveMediaWarning - tw.SensitiveContent = sensitive.AdultContent || sensitive.GraphicViolence || sensitive.Other - } - } - - for _, url := range tweet.Entities.URLs { - tw.URLs = append(tw.URLs, url.ExpandedURL) - } - - tw.HTML = tweet.FullText - tw.HTML = reHashtag.ReplaceAllStringFunc(tw.HTML, func(hashtag string) string { - return fmt.Sprintf(`%s`, - strings.TrimPrefix(hashtag, "#"), - hashtag, - ) - }) - tw.HTML = reUsername.ReplaceAllStringFunc(tw.HTML, func(username string) string { - return fmt.Sprintf(`%s`, - strings.TrimPrefix(username, "@"), - username, - ) - }) - var foundedMedia []string - tw.HTML = reTwitterURL.ReplaceAllStringFunc(tw.HTML, func(tco string) string { - for _, entity := range tweet.Entities.URLs { - if tco == entity.URL { - return fmt.Sprintf(`%s`, entity.ExpandedURL, tco) - } - } - for _, entity := range tweet.ExtendedEntities.Media { - if tco == entity.URL { - foundedMedia = append(foundedMedia, entity.MediaURLHttps) - return fmt.Sprintf(`
`, tco, entity.MediaURLHttps) - } - } - return tco - }) - for _, photo := range tw.Photos { - url := photo.URL - if stringInSlice(url, foundedMedia) { - continue - } - tw.HTML += fmt.Sprintf(`
`, url) - } - for _, video := range tw.Videos { - url := video.Preview - if stringInSlice(url, foundedMedia) { - continue - } - tw.HTML += fmt.Sprintf(`
`, url) - } - for _, gif := range tw.GIFs { - url := gif.Preview - if stringInSlice(url, foundedMedia) { - continue - } - tw.HTML += fmt.Sprintf(`
`, url)
-	}
-	tw.HTML = strings.Replace(tw.HTML, "\n", "<br>", -1)
-	return tw
-}
diff --git a/util.go b/util.go
index 42d0987..9bf6d55 100644
--- a/util.go
+++ b/util.go
@@ -3,10 +3,12 @@ package twitterscraper
 import (
 	"context"
 	"encoding/json"
+	"fmt"
 	"net/http"
 	"net/url"
 	"regexp"
 	"strconv"
+	"strings"
 	"time"
 )
 
@@ -150,6 +152,200 @@ func getTweetTimeline(ctx context.Context, query string, maxTweetsNbr int, fetch
 	return channel
 }
 
+// parseLegacyTweet converts the legacy user and tweet payloads of a timeline
+// result into a Tweet. It returns nil when the payload has no tweet ID, so
+// callers must check the result before using it.
+func parseLegacyTweet(user *legacyUser, tweet *legacyTweet) *Tweet {
+	tweetID := tweet.IDStr
+	if tweetID == "" {
+		return nil
+	}
+	username := user.ScreenName
+	name := user.Name
+	tw := &Tweet{
+		ConversationID: tweet.ConversationIDStr,
+		ID:             tweetID,
+		Likes:          tweet.FavoriteCount,
+		Name:           name,
+		PermanentURL:   fmt.Sprintf("https://twitter.com/%s/status/%s", username, tweetID),
+		Replies:        tweet.ReplyCount,
+		Retweets:       tweet.RetweetCount,
+		Text:           tweet.FullText,
+		UserID:         tweet.UserIDStr,
+		Username:       username,
+	}
+
+	// CreatedAt is parsed as a Ruby-style date; on failure the time fields stay zero.
+	tm, err := time.Parse(time.RubyDate, tweet.CreatedAt)
+	if err == nil {
+		tw.TimeParsed = tm
+		tw.Timestamp = tm.Unix()
+	}
+
+	if tweet.Place.ID != "" {
+		tw.Place = &tweet.Place
+	}
+
+	if tweet.QuotedStatusIDStr != "" {
+		tw.IsQuoted = true
+		tw.QuotedStatusID = tweet.QuotedStatusIDStr
+	}
+	if tweet.InReplyToStatusIDStr != "" {
+		tw.IsReply = true
+		tw.InReplyToStatusID = tweet.InReplyToStatusIDStr
+	}
+	if tweet.RetweetedStatusIDStr != "" || tweet.RetweetedStatusResult.Result != nil {
+		tw.IsRetweet = true
+		tw.RetweetedStatusID = tweet.RetweetedStatusIDStr
+		if retweeted := tweet.RetweetedStatusResult.Result; retweeted != nil {
+			// The nested result may lack an ID, in which case parseLegacyTweet
+			// returns nil; keep the ID taken from the legacy field above rather
+			// than dereferencing a nil tweet.
+			if status := parseLegacyTweet(&retweeted.Core.UserResults.Result.Legacy, &retweeted.Legacy); status != nil {
+				tw.RetweetedStatus = status
+				tw.RetweetedStatusID = status.ID
+			}
+		}
+	}
+
+	if tweet.Views.Count != "" {
+		views, viewsErr := strconv.Atoi(tweet.Views.Count)
+		if viewsErr != nil {
+			views = 0
+		}
+		tw.Views = views
+	}
+
+	for _, pinned := range user.PinnedTweetIdsStr {
+		if tweet.IDStr == pinned {
+			tw.IsPin = true
+			break
+		}
+	}
+
+	for _, hash := range tweet.Entities.Hashtags {
+		tw.Hashtags = append(tw.Hashtags, hash.Text)
+	}
+
+	for _, mention := range tweet.Entities.UserMentions {
+		tw.Mentions = append(tw.Mentions, Mention{
+			ID:       mention.IDStr,
+			Username: mention.ScreenName,
+			Name:     mention.Name,
+		})
+	}
+
+	for _, media := range tweet.ExtendedEntities.Media {
+		if media.Type == "photo" {
+			photo := Photo{
+				ID:  media.IDStr,
+				URL: media.MediaURLHttps,
+			}
+
+			tw.Photos = append(tw.Photos, photo)
+		} else if media.Type == "video" {
+			video := Video{
+				ID:      media.IDStr,
+				Preview: media.MediaURLHttps,
+			}
+
+			// Keep the variant with the highest bitrate.
+			maxBitrate := 0
+			for _, variant := range media.VideoInfo.Variants {
+				if variant.Bitrate > maxBitrate {
+					video.URL = strings.TrimSuffix(variant.URL, "?tag=10")
+					maxBitrate = variant.Bitrate
+				}
+			}
+
+			tw.Videos = append(tw.Videos, video)
+		} else if media.Type == "animated_gif" {
+			gif := GIF{
+				ID:      media.IDStr,
+				Preview: media.MediaURLHttps,
+			}
+
+			// Twitter's API doesn't provide bitrate for GIFs, (it's always set to zero).
+			// Therefore we check for `>=` instead of `>` in the loop below.
+			// Also, GIFs have just a single variant today. Just in case that changes in the future,
+			// and there will be multiple variants, we'll pick the one with the highest bitrate,
+			// if other one will have a non-zero bitrate.
+			maxBitrate := 0
+			for _, variant := range media.VideoInfo.Variants {
+				if variant.Bitrate >= maxBitrate {
+					gif.URL = variant.URL
+					maxBitrate = variant.Bitrate
+				}
+			}
+
+			tw.GIFs = append(tw.GIFs, gif)
+		}
+
+		if !tw.SensitiveContent {
+			sensitive := media.ExtSensitiveMediaWarning
+			tw.SensitiveContent = sensitive.AdultContent || sensitive.GraphicViolence || sensitive.Other
+		}
+	}
+
+	for _, url := range tweet.Entities.URLs {
+		tw.URLs = append(tw.URLs, url.ExpandedURL)
+	}
+
+	tw.HTML = tweet.FullText
+	tw.HTML = reHashtag.ReplaceAllStringFunc(tw.HTML, func(hashtag string) string {
+		return fmt.Sprintf(`<a href="https://twitter.com/hashtag/%s">%s</a>`,
+			strings.TrimPrefix(hashtag, "#"),
+			hashtag,
+		)
+	})
+	tw.HTML = reUsername.ReplaceAllStringFunc(tw.HTML, func(username string) string {
+		return fmt.Sprintf(`<a href="https://twitter.com/%s">%s</a>`,
+			strings.TrimPrefix(username, "@"),
+			username,
+		)
+	})
+	// Media URLs already embedded while replacing t.co links below; the
+	// photo/video/GIF loops skip them so they are not appended a second time.
+	var foundedMedia []string
+	tw.HTML = reTwitterURL.ReplaceAllStringFunc(tw.HTML, func(tco string) string {
+		for _, entity := range tweet.Entities.URLs {
+			if tco == entity.URL {
+				return fmt.Sprintf(`<a href="%s">%s</a>`, entity.ExpandedURL, tco)
+			}
+		}
+		for _, entity := range tweet.ExtendedEntities.Media {
+			if tco == entity.URL {
+				foundedMedia = append(foundedMedia, entity.MediaURLHttps)
+				return fmt.Sprintf(`<br><a href="%s"><img src="%s"/></a>`, tco, entity.MediaURLHttps)
+			}
+		}
+		return tco
+	})
+	for _, photo := range tw.Photos {
+		url := photo.URL
+		if stringInSlice(url, foundedMedia) {
+			continue
+		}
+		tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, url)
+	}
+	for _, video := range tw.Videos {
+		url := video.Preview
+		if stringInSlice(url, foundedMedia) {
+			continue
+		}
+		tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, url)
+	}
+	for _, gif := range tw.GIFs {
+		url := gif.Preview
+		if stringInSlice(url, foundedMedia) {
+			continue
+		}
+		tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, url)
+	}
+	tw.HTML = strings.Replace(tw.HTML, "\n", "<br>", -1)
+	return tw
+}
+
 func parseProfile(user legacyUser) Profile {
 	profile := Profile{
 		Avatar: user.ProfileImageURLHTTPS,