Use search GraphQL API

Close #109 #116
2023-07-03 14:38:46 +03:00 · 2023-07-03 14:38:46 +03:00 · 391e443058
commit 391e443058
parent 07ad3789ef
5 changed files with 329 additions and 204 deletions
--- a/README.md
+++ b/README.md
@ -155,7 +155,7 @@ import (
 func main() {
    scraper := twitterscraper.New()
-    err := scraper.LoginOpenAccount()
+    err := scraper.Login(username, password)
    if err != nil {
        panic(err)
    }
--- a/search.go
+++ b/search.go
@ -3,10 +3,79 @@ package twitterscraper
 import (
 	"context"
 	"errors"
 	"net/url"
 	"strconv"
 )
-const searchURL = "https://api.twitter.com/2/search/adaptive.json"
+const searchURL = "https://twitter.com/i/api/graphql/nK1dw4oV3k4w5TdtcAdSww/SearchTimeline"
 // searchTimeline is the decoding target for the SearchTimeline GraphQL
 // response. Its nesting follows the JSON path
 // data.search_by_raw_query.search_timeline.timeline.instructions.
 type searchTimeline struct {
 	Data struct {
 		SearchByRawQuery struct {
 			SearchTimeline struct {
 				Timeline struct {
 					// Instructions is the list walked by parseTweets and parseUsers.
 					Instructions []struct {
 						// Type is the instruction kind; the parsers only act on
 						// "TimelineAddEntries" and "TimelineReplaceEntry".
 						Type string `json:"type"`
 						// Entries holds the items (tweets, users, cursors) of the instruction.
 						Entries []entry `json:"entries"`
 						// Entry is the single entry of the instruction; the parsers
 						// read the "Bottom" cursor from it when present.
 						Entry entry `json:"entry,omitempty"`
 					} `json:"instructions"`
 				} `json:"timeline"`
 			} `json:"search_timeline"`
 		} `json:"search_by_raw_query"`
 	} `json:"data"`
 }
 // parseTweets walks the timeline instructions and returns the tweets found in
 // them together with the "Bottom" cursor used to request the next page. The
 // cursor is empty when the response carries none, and the returned slice is
 // never nil.
 //
 // Only "TimelineAddEntries" and "TimelineReplaceEntry" instructions are
 // inspected. An instruction whose single Entry is a Bottom cursor contributes
 // only that cursor; its Entries list is not scanned.
 func (timeline *searchTimeline) parseTweets() ([]*Tweet, string) {
 	tweets := make([]*Tweet, 0)
 	cursor := ""
 	instructions := timeline.Data.SearchByRawQuery.SearchTimeline.Timeline.Instructions
 	for i := range instructions {
 		instruction := &instructions[i]
 		if instruction.Type != "TimelineAddEntries" && instruction.Type != "TimelineReplaceEntry" {
 			continue
 		}
 		if instruction.Entry.Content.CursorType == "Bottom" {
 			cursor = instruction.Entry.Content.Value
 			continue
 		}
 		for j := range instruction.Entries {
 			// Take the element's address rather than a range copy so the
 			// pointers passed to parseLegacyTweet refer to stable storage
 			// (a range variable is reused across iterations before Go 1.22)
 			// and the large entry struct is not copied on every iteration.
 			item := &instruction.Entries[j]
 			if item.Content.ItemContent.TweetDisplayType != "Tweet" {
 				if item.Content.CursorType == "Bottom" {
 					cursor = item.Content.Value
 				}
 				continue
 			}
 			res := &item.Content.ItemContent.TweetResults.Result
 			tweet := parseLegacyTweet(&res.Core.UserResults.Result.Legacy, &res.Legacy)
 			if tweet == nil {
 				continue
 			}
 			// Fall back to the view count on the result wrapper when the
 			// legacy payload did not provide one. A count that does not
 			// parse leaves Views at zero.
 			if tweet.Views == 0 && res.Views.Count != "" {
 				if views, err := strconv.Atoi(res.Views.Count); err == nil {
 					tweet.Views = views
 				}
 			}
 			tweets = append(tweets, tweet)
 		}
 	}
 	return tweets, cursor
 }
 // parseUsers collects the user profiles contained in the timeline
 // instructions and returns them with the "Bottom" pagination cursor (empty
 // when none is present). The returned slice is never nil.
 //
 // Only "TimelineAddEntries" and "TimelineReplaceEntry" instructions are
 // considered; an instruction whose single Entry is a Bottom cursor yields only
 // that cursor. Profiles with an empty Name are dropped, and a missing UserID
 // is filled from the result's rest_id.
 func (timeline *searchTimeline) parseUsers() ([]*Profile, string) {
 	found := make([]*Profile, 0)
 	bottom := ""
 	steps := timeline.Data.SearchByRawQuery.SearchTimeline.Timeline.Instructions
 	for i := range steps {
 		step := &steps[i]
 		if step.Type != "TimelineAddEntries" && step.Type != "TimelineReplaceEntry" {
 			continue
 		}
 		if step.Entry.Content.CursorType == "Bottom" {
 			bottom = step.Entry.Content.Value
 			continue
 		}
 		for j := range step.Entries {
 			content := &step.Entries[j].Content
 			switch {
 			case content.ItemContent.UserDisplayType == "User":
 				user := &content.ItemContent.UserResults.Result
 				// profile is declared per iteration, so &profile is unique.
 				profile := parseProfile(user.Legacy)
 				if profile.Name == "" {
 					continue
 				}
 				if profile.UserID == "" {
 					profile.UserID = user.RestID
 				}
 				found = append(found, &profile)
 			case content.CursorType == "Bottom":
 				bottom = content.Value
 			}
 		}
 	}
 	return found, bottom
 }
 // SearchTweets returns channel with tweets for a given search query
 func (s *Scraper) SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *TweetResult {
@ -19,7 +88,7 @@ func (s *Scraper) SearchProfiles(ctx context.Context, query string, maxProfilesN
 }
 // getSearchTimeline gets results for a given search query, via the Twitter frontend API
-func (s *Scraper) getSearchTimeline(query string, maxNbr int, cursor string) (*timelineV1, error) {
+func (s *Scraper) getSearchTimeline(query string, maxNbr int, cursor string) (*searchTimeline, error) {
 	if !s.isLogged {
 		return nil, errors.New("scraper is not logged in for search")
 	}
@ -33,31 +102,61 @@ func (s *Scraper) getSearchTimeline(query string, maxNbr int, cursor string) (*t
 		return nil, err
 	}
-	q := req.URL.Query()
+	variables := map[string]interface{}{
-	q.Add("q", query)
+		"rawQuery":    query,
-	q.Add("count", strconv.Itoa(maxNbr))
+		"count":       maxNbr,
-	q.Add("query_source", "typed_query")
+		"querySource": "typed_query",
-	q.Add("pc", "1")
+		"product":     "Top",
-	q.Add("requestContext", "launch")
+	}
-	q.Add("spelling_corrections", "1")
+
-	q.Add("include_ext_edit_control", "true")
+	features := map[string]interface{}{
 		"rweb_lists_timeline_redesign_enabled":                                    true,
 		"responsive_web_graphql_exclude_directive_enabled":                        true,
 		"verified_phone_label_enabled":                                            false,
 		"creator_subscriptions_tweet_preview_api_enabled":                         true,
 		"responsive_web_graphql_timeline_navigation_enabled":                      true,
 		"responsive_web_graphql_skip_user_profile_image_extensions_enabled":       false,
 		"tweetypie_unmention_optimization_enabled":                                true,
 		"responsive_web_edit_tweet_api_enabled":                                   true,
 		"graphql_is_translatable_rweb_tweet_is_translatable_enabled":              true,
 		"view_counts_everywhere_api_enabled":                                      true,
 		"longform_notetweets_consumption_enabled":                                 true,
 		"responsive_web_twitter_article_tweet_consumption_enabled":                false,
 		"tweet_awards_web_tipping_enabled":                                        false,
 		"freedom_of_speech_not_reach_fetch_enabled":                               true,
 		"standardized_nudges_misinfo":                                             true,
 		"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": true,
 		"longform_notetweets_rich_text_read_enabled":                              true,
 		"longform_notetweets_inline_media_enabled":                                true,
 		"responsive_web_media_download_video_enabled":                             false,
 		"responsive_web_enhance_cards_enabled":                                    false,
 	}
 	fieldToggles := map[string]interface{}{
 		"withArticleRichContentState": false,
 	}
 	if cursor != "" {
-		q.Add("cursor", cursor)
+		variables["cursor"] = cursor
 	}
 	switch s.searchMode {
 	case SearchLatest:
-		q.Add("tweet_search_mode", "live")
+		variables["product"] = "Latest"
 	case SearchPhotos:
-		q.Add("result_filter", "image")
+		variables["product"] = "Photos"
 	case SearchVideos:
-		q.Add("result_filter", "video")
+		variables["product"] = "Videos"
 	case SearchUsers:
-		q.Add("result_filter", "user")
+		variables["product"] = "People"
 	}
 	q := url.Values{}
 	q.Set("variables", mapToJSONString(variables))
 	q.Set("features", mapToJSONString(features))
 	q.Set("fieldToggles", mapToJSONString(fieldToggles))
 	req.URL.RawQuery = q.Encode()
-	var timeline timelineV1
+	var timeline searchTimeline
 	err = s.RequestAPI(req, &timeline)
 	if err != nil {
 		return nil, err
--- a/search_test.go
+++ b/search_test.go
@ -2,18 +2,23 @@ package twitterscraper_test
 import (
 	"context"
 	"os"
 	"testing"
 	twitterscraper "github.com/n0madic/twitter-scraper"
 )
 var searchScraper = twitterscraper.New()
 func TestFetchSearchCursor(t *testing.T) {
-	err := searchScraper.LoginOpenAccount()
+	if os.Getenv("SKIP_AUTH_TEST") != "" {
 		t.Skip("Skipping test due to environment variable")
 	}
 	searchScraper := twitterscraper.New()
 	err := searchScraper.Login(username, password)
 	if err != nil {
 		t.Fatal(err)
 	}
 	defer searchScraper.Logout()
 	maxTweetsNbr := 150
 	tweetsNbr := 0
 	nextCursor := ""
@ -31,13 +36,19 @@ func TestFetchSearchCursor(t *testing.T) {
 }
 func TestGetSearchProfiles(t *testing.T) {
 	if os.Getenv("SKIP_AUTH_TEST") != "" {
 		t.Skip("Skipping test due to environment variable")
 	}
 	count := 0
 	maxProfilesNbr := 150
 	dupcheck := make(map[string]bool)
-	err := searchScraper.LoginOpenAccount()
+	searchScraper := twitterscraper.New()
 	err := searchScraper.Login(username, password)
 	if err != nil {
 		t.Fatal(err)
 	}
 	defer searchScraper.Logout()
 	searchScraper.SetSearchMode(twitterscraper.SearchUsers)
 	for profile := range searchScraper.SearchProfiles(context.Background(), "Twitter", maxProfilesNbr) {
 		if profile.Error != nil {
@ -61,13 +72,19 @@ func TestGetSearchProfiles(t *testing.T) {
 	}
 }
 func TestGetSearchTweets(t *testing.T) {
 	if os.Getenv("SKIP_AUTH_TEST") != "" {
 		t.Skip("Skipping test due to environment variable")
 	}
 	count := 0
 	maxTweetsNbr := 150
 	dupcheck := make(map[string]bool)
-	err := searchScraper.LoginOpenAccount()
+	searchScraper := twitterscraper.New()
 	err := searchScraper.Login(username, password)
 	if err != nil {
 		t.Fatal(err)
 	}
 	defer searchScraper.Logout()
 	searchScraper.SetSearchMode(twitterscraper.SearchLatest)
 	for tweet := range searchScraper.SearchTweets(context.Background(), "twitter", maxTweetsNbr) {
 		if tweet.Error != nil {
--- a/timeline_v2.go
+++ b/timeline_v2.go
@ -1,10 +1,7 @@
 package twitterscraper
 import (
 	"fmt"
 	"strconv"
 	"strings"
 	"time"
 )
 type result struct {
@ -66,6 +63,13 @@ type entry struct {
 			TweetResults     struct {
 				Result result `json:"result"`
 			} `json:"tweet_results"`
 			UserDisplayType string `json:"userDisplayType"`
 			UserResults     struct {
 				Result struct {
 					RestID string     `json:"rest_id"`
 					Legacy legacyUser `json:"legacy"`
 				} `json:"result"`
 			} `json:"user_results"`
 		} `json:"itemContent"`
 	} `json:"content"`
 }
@ -166,182 +170,3 @@ func (conversation *threadedConversation) parse() []*Tweet {
 	}
 	return tweets
 }
 // parseLegacyTweet builds a Tweet from a legacy user payload and a legacy
 // tweet payload.
 //
 // It copies the scalar fields, parses the creation time, derives the
 // quote/reply/retweet/pin flags, collects hashtags, mentions, media and URLs,
 // and renders an HTML version of the text. This version always returns a
 // non-nil *Tweet, even when tweet.IDStr is empty. Both arguments must be
 // non-nil; they are dereferenced without a check.
 func parseLegacyTweet(user *legacyUser, tweet *legacyTweet) *Tweet {
 	username := user.ScreenName
 	name := user.Name
 	tweetID := tweet.IDStr
 	tw := &Tweet{
 		ConversationID: tweet.ConversationIDStr,
 		ID:             tweetID,
 		Likes:          tweet.FavoriteCount,
 		Name:           name,
 		PermanentURL:   fmt.Sprintf("https://twitter.com/%s/status/%s", username, tweetID),
 		Replies:        tweet.ReplyCount,
 		Retweets:       tweet.RetweetCount,
 		Text:           tweet.FullText,
 		UserID:         tweet.UserIDStr,
 		Username:       username,
 	}
 	// CreatedAt is expected in time.RubyDate layout; when it does not parse,
 	// TimeParsed and Timestamp keep their zero values.
 	tm, err := time.Parse(time.RubyDate, tweet.CreatedAt)
 	if err == nil {
 		tw.TimeParsed = tm
 		tw.Timestamp = tm.Unix()
 	}
 	// NOTE: tw.Place points at the caller's tweet.Place; no copy is made, so
 	// the caller must not reuse that storage while the Tweet is alive.
 	if tweet.Place.ID != "" {
 		tw.Place = &tweet.Place
 	}
 	if tweet.QuotedStatusIDStr != "" {
 		tw.IsQuoted = true
 		tw.QuotedStatusID = tweet.QuotedStatusIDStr
 	}
 	if tweet.InReplyToStatusIDStr != "" {
 		tw.IsReply = true
 		tw.InReplyToStatusID = tweet.InReplyToStatusIDStr
 	}
 	// A retweet is recognised either by its ID string or by an embedded result;
 	// when the result is embedded it is parsed recursively and its ID wins.
 	if tweet.RetweetedStatusIDStr != "" || tweet.RetweetedStatusResult.Result != nil {
 		tw.IsRetweet = true
 		tw.RetweetedStatusID = tweet.RetweetedStatusIDStr
 		if tweet.RetweetedStatusResult.Result != nil {
 			tw.RetweetedStatus = parseLegacyTweet(&tweet.RetweetedStatusResult.Result.Core.UserResults.Result.Legacy, &tweet.RetweetedStatusResult.Result.Legacy)
 			tw.RetweetedStatusID = tw.RetweetedStatus.ID
 		}
 	}
 	// A view count that is not a valid integer is stored as zero.
 	if tweet.Views.Count != "" {
 		views, viewsErr := strconv.Atoi(tweet.Views.Count)
 		if viewsErr != nil {
 			views = 0
 		}
 		tw.Views = views
 	}
 	// The tweet is pinned when its ID appears in the user's pinned IDs.
 	for _, pinned := range user.PinnedTweetIdsStr {
 		if tweet.IDStr == pinned {
 			tw.IsPin = true
 			break
 		}
 	}
 	for _, hash := range tweet.Entities.Hashtags {
 		tw.Hashtags = append(tw.Hashtags, hash.Text)
 	}
 	for _, mention := range tweet.Entities.UserMentions {
 		tw.Mentions = append(tw.Mentions, Mention{
 			ID:       mention.IDStr,
 			Username: mention.ScreenName,
 			Name:     mention.Name,
 		})
 	}
 	// Sort media into photos, videos and GIFs by their declared type.
 	for _, media := range tweet.ExtendedEntities.Media {
 		if media.Type == "photo" {
 			photo := Photo{
 				ID:  media.IDStr,
 				URL: media.MediaURLHttps,
 			}
 			tw.Photos = append(tw.Photos, photo)
 		} else if media.Type == "video" {
 			video := Video{
 				ID:      media.IDStr,
 				Preview: media.MediaURLHttps,
 			}
 			// Pick the variant with the highest bitrate. The comparison is
 			// strict, so variants with a zero bitrate are never selected and
 			// video.URL stays empty if no variant has a positive bitrate.
 			maxBitrate := 0
 			for _, variant := range media.VideoInfo.Variants {
 				if variant.Bitrate > maxBitrate {
 					video.URL = strings.TrimSuffix(variant.URL, "?tag=10")
 					maxBitrate = variant.Bitrate
 				}
 			}
 			tw.Videos = append(tw.Videos, video)
 		} else if media.Type == "animated_gif" {
 			gif := GIF{
 				ID:      media.IDStr,
 				Preview: media.MediaURLHttps,
 			}
 			// Twitter's API doesn't provide bitrate for GIFs, (it's always set to zero).
 			// Therefore we check for `>=` instead of `>` in the loop below.
 			// Also, GIFs have just a single variant today. Just in case that changes in the future,
 			// and there will be multiple variants, we'll pick the one with the highest bitrate,
 			// if other one will have a non-zero bitrate.
 			maxBitrate := 0
 			for _, variant := range media.VideoInfo.Variants {
 				if variant.Bitrate >= maxBitrate {
 					gif.URL = variant.URL
 					maxBitrate = variant.Bitrate
 				}
 			}
 			tw.GIFs = append(tw.GIFs, gif)
 		}
 		// The tweet is sensitive as soon as any one media item carries a warning.
 		if !tw.SensitiveContent {
 			sensitive := media.ExtSensitiveMediaWarning
 			tw.SensitiveContent = sensitive.AdultContent || sensitive.GraphicViolence || sensitive.Other
 		}
 	}
 	for _, url := range tweet.Entities.URLs {
 		tw.URLs = append(tw.URLs, url.ExpandedURL)
 	}
 	// Build the HTML rendering: start from the raw text, then wrap every
 	// reHashtag and reUsername match in a link to twitter.com.
 	tw.HTML = tweet.FullText
 	tw.HTML = reHashtag.ReplaceAllStringFunc(tw.HTML, func(hashtag string) string {
 		return fmt.Sprintf(`<a href="https://twitter.com/hashtag/%s">%s</a>`,
 			strings.TrimPrefix(hashtag, "#"),
 			hashtag,
 		)
 	})
 	tw.HTML = reUsername.ReplaceAllStringFunc(tw.HTML, func(username string) string {
 		return fmt.Sprintf(`<a href="https://twitter.com/%s">%s</a>`,
 			strings.TrimPrefix(username, "@"),
 			username,
 		)
 	})
 	// Replace each reTwitterURL match: a URL entity becomes a link to its
 	// expanded URL, a media entity becomes an inline image, and anything else
 	// is left unchanged. foundedMedia records media already inlined so it is
 	// not appended a second time below.
 	var foundedMedia []string
 	tw.HTML = reTwitterURL.ReplaceAllStringFunc(tw.HTML, func(tco string) string {
 		for _, entity := range tweet.Entities.URLs {
 			if tco == entity.URL {
 				return fmt.Sprintf(`<a href="%s">%s</a>`, entity.ExpandedURL, tco)
 			}
 		}
 		for _, entity := range tweet.ExtendedEntities.Media {
 			if tco == entity.URL {
 				foundedMedia = append(foundedMedia, entity.MediaURLHttps)
 				return fmt.Sprintf(`<br><a href="%s"><img src="%s"/></a>`, tco, entity.MediaURLHttps)
 			}
 		}
 		return tco
 	})
 	// Append an <img> for every photo, video preview and GIF preview that was
 	// not already inlined above.
 	for _, photo := range tw.Photos {
 		url := photo.URL
 		if stringInSlice(url, foundedMedia) {
 			continue
 		}
 		tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, url)
 	}
 	for _, video := range tw.Videos {
 		url := video.Preview
 		if stringInSlice(url, foundedMedia) {
 			continue
 		}
 		tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, url)
 	}
 	for _, gif := range tw.GIFs {
 		url := gif.Preview
 		if stringInSlice(url, foundedMedia) {
 			continue
 		}
 		tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, url)
 	}
 	// Newlines in the text become HTML line breaks.
 	tw.HTML = strings.Replace(tw.HTML, "\n", "<br>", -1)
 	return tw
 }
--- a/util.go
+++ b/util.go
@ -3,10 +3,12 @@ package twitterscraper
 import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"net/http"
 	"net/url"
 	"regexp"
 	"strconv"
 	"strings"
 	"time"
 )
@ -150,6 +152,188 @@ func getTweetTimeline(ctx context.Context, query string, maxTweetsNbr int, fetch
 	return channel
 }
 // parseLegacyTweet builds a Tweet from a legacy user payload and a legacy
 // tweet payload.
 //
 // It returns nil when either argument is nil or when the tweet has no ID,
 // which is how callers detect entries that do not carry a usable tweet.
 // Otherwise it copies the scalar fields, parses the creation time, derives
 // the quote/reply/retweet/pin flags, collects hashtags, mentions, media and
 // URLs, and renders an HTML version of the text.
 func parseLegacyTweet(user *legacyUser, tweet *legacyTweet) *Tweet {
 	if user == nil || tweet == nil {
 		return nil
 	}
 	tweetID := tweet.IDStr
 	if tweetID == "" {
 		return nil
 	}
 	username := user.ScreenName
 	name := user.Name
 	tw := &Tweet{
 		ConversationID: tweet.ConversationIDStr,
 		ID:             tweetID,
 		Likes:          tweet.FavoriteCount,
 		Name:           name,
 		PermanentURL:   fmt.Sprintf("https://twitter.com/%s/status/%s", username, tweetID),
 		Replies:        tweet.ReplyCount,
 		Retweets:       tweet.RetweetCount,
 		Text:           tweet.FullText,
 		UserID:         tweet.UserIDStr,
 		Username:       username,
 	}
 	// CreatedAt is expected in time.RubyDate layout; when it does not parse,
 	// TimeParsed and Timestamp keep their zero values.
 	tm, err := time.Parse(time.RubyDate, tweet.CreatedAt)
 	if err == nil {
 		tw.TimeParsed = tm
 		tw.Timestamp = tm.Unix()
 	}
 	if tweet.Place.ID != "" {
 		// Copy the place so the returned Tweet does not alias the caller's
 		// legacyTweet, which may live in a loop variable that gets reused.
 		place := tweet.Place
 		tw.Place = &place
 	}
 	if tweet.QuotedStatusIDStr != "" {
 		tw.IsQuoted = true
 		tw.QuotedStatusID = tweet.QuotedStatusIDStr
 	}
 	if tweet.InReplyToStatusIDStr != "" {
 		tw.IsReply = true
 		tw.InReplyToStatusID = tweet.InReplyToStatusIDStr
 	}
 	// A retweet is recognised either by its ID string or by an embedded result.
 	if tweet.RetweetedStatusIDStr != "" || tweet.RetweetedStatusResult.Result != nil {
 		tw.IsRetweet = true
 		tw.RetweetedStatusID = tweet.RetweetedStatusIDStr
 		if retweeted := tweet.RetweetedStatusResult.Result; retweeted != nil {
 			// The recursive call returns nil when the embedded tweet has no
 			// ID; in that case keep the ID string set above instead of
 			// dereferencing a nil *Tweet.
 			if status := parseLegacyTweet(&retweeted.Core.UserResults.Result.Legacy, &retweeted.Legacy); status != nil {
 				tw.RetweetedStatus = status
 				tw.RetweetedStatusID = status.ID
 			}
 		}
 	}
 	// A view count that is not a valid integer is stored as zero.
 	if tweet.Views.Count != "" {
 		views, viewsErr := strconv.Atoi(tweet.Views.Count)
 		if viewsErr != nil {
 			views = 0
 		}
 		tw.Views = views
 	}
 	// The tweet is pinned when its ID appears in the user's pinned IDs.
 	for _, pinned := range user.PinnedTweetIdsStr {
 		if tweet.IDStr == pinned {
 			tw.IsPin = true
 			break
 		}
 	}
 	for _, hash := range tweet.Entities.Hashtags {
 		tw.Hashtags = append(tw.Hashtags, hash.Text)
 	}
 	for _, mention := range tweet.Entities.UserMentions {
 		tw.Mentions = append(tw.Mentions, Mention{
 			ID:       mention.IDStr,
 			Username: mention.ScreenName,
 			Name:     mention.Name,
 		})
 	}
 	// Sort media into photos, videos and GIFs by their declared type.
 	for _, media := range tweet.ExtendedEntities.Media {
 		switch media.Type {
 		case "photo":
 			tw.Photos = append(tw.Photos, Photo{
 				ID:  media.IDStr,
 				URL: media.MediaURLHttps,
 			})
 		case "video":
 			video := Video{
 				ID:      media.IDStr,
 				Preview: media.MediaURLHttps,
 			}
 			// Pick the variant with the highest bitrate. The comparison is
 			// strict, so variants with a zero bitrate are never selected.
 			maxBitrate := 0
 			for _, variant := range media.VideoInfo.Variants {
 				if variant.Bitrate > maxBitrate {
 					video.URL = strings.TrimSuffix(variant.URL, "?tag=10")
 					maxBitrate = variant.Bitrate
 				}
 			}
 			tw.Videos = append(tw.Videos, video)
 		case "animated_gif":
 			gif := GIF{
 				ID:      media.IDStr,
 				Preview: media.MediaURLHttps,
 			}
 			// Twitter's API doesn't provide bitrate for GIFs, (it's always set to zero).
 			// Therefore we check for `>=` instead of `>` in the loop below.
 			// Also, GIFs have just a single variant today. Just in case that changes in the future,
 			// and there will be multiple variants, we'll pick the one with the highest bitrate,
 			// if other one will have a non-zero bitrate.
 			maxBitrate := 0
 			for _, variant := range media.VideoInfo.Variants {
 				if variant.Bitrate >= maxBitrate {
 					gif.URL = variant.URL
 					maxBitrate = variant.Bitrate
 				}
 			}
 			tw.GIFs = append(tw.GIFs, gif)
 		}
 		// The tweet is sensitive as soon as any one media item carries a warning.
 		if !tw.SensitiveContent {
 			sensitive := media.ExtSensitiveMediaWarning
 			tw.SensitiveContent = sensitive.AdultContent || sensitive.GraphicViolence || sensitive.Other
 		}
 	}
 	for _, link := range tweet.Entities.URLs {
 		tw.URLs = append(tw.URLs, link.ExpandedURL)
 	}
 	// Build the HTML rendering: start from the raw text, then wrap every
 	// reHashtag and reUsername match in a link to twitter.com.
 	tw.HTML = tweet.FullText
 	tw.HTML = reHashtag.ReplaceAllStringFunc(tw.HTML, func(hashtag string) string {
 		return fmt.Sprintf(`<a href="https://twitter.com/hashtag/%s">%s</a>`,
 			strings.TrimPrefix(hashtag, "#"),
 			hashtag,
 		)
 	})
 	tw.HTML = reUsername.ReplaceAllStringFunc(tw.HTML, func(username string) string {
 		return fmt.Sprintf(`<a href="https://twitter.com/%s">%s</a>`,
 			strings.TrimPrefix(username, "@"),
 			username,
 		)
 	})
 	// Replace each reTwitterURL match: a URL entity becomes a link to its
 	// expanded URL, a media entity becomes an inline image, and anything else
 	// is left unchanged. foundedMedia records media already inlined so it is
 	// not appended a second time below.
 	var foundedMedia []string
 	tw.HTML = reTwitterURL.ReplaceAllStringFunc(tw.HTML, func(tco string) string {
 		for _, entity := range tweet.Entities.URLs {
 			if tco == entity.URL {
 				return fmt.Sprintf(`<a href="%s">%s</a>`, entity.ExpandedURL, tco)
 			}
 		}
 		for _, entity := range tweet.ExtendedEntities.Media {
 			if tco == entity.URL {
 				foundedMedia = append(foundedMedia, entity.MediaURLHttps)
 				return fmt.Sprintf(`<br><a href="%s"><img src="%s"/></a>`, tco, entity.MediaURLHttps)
 			}
 		}
 		return tco
 	})
 	// Append an <img> for every photo, video preview and GIF preview that was
 	// not already inlined above.
 	for _, photo := range tw.Photos {
 		src := photo.URL
 		if stringInSlice(src, foundedMedia) {
 			continue
 		}
 		tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, src)
 	}
 	for _, video := range tw.Videos {
 		src := video.Preview
 		if stringInSlice(src, foundedMedia) {
 			continue
 		}
 		tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, src)
 	}
 	for _, gif := range tw.GIFs {
 		src := gif.Preview
 		if stringInSlice(src, foundedMedia) {
 			continue
 		}
 		tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, src)
 	}
 	// Newlines in the text become HTML line breaks.
 	tw.HTML = strings.Replace(tw.HTML, "\n", "<br>", -1)
 	return tw
 }
 func parseProfile(user legacyUser) Profile {
 	profile := Profile{
 		Avatar:         user.ProfileImageURLHTTPS,