Use search GraphQL API

Close #109 #116
This commit is contained in:
Alexander Sheiko 2023-07-03 14:38:46 +03:00
parent 07ad3789ef
commit 391e443058
5 changed files with 329 additions and 204 deletions

View file

@ -155,7 +155,7 @@ import (
func main() { func main() {
scraper := twitterscraper.New() scraper := twitterscraper.New()
err := scraper.LoginOpenAccount() err := scraper.Login(username, password)
if err != nil { if err != nil {
panic(err) panic(err)
} }

131
search.go
View file

@ -3,10 +3,79 @@ package twitterscraper
import ( import (
"context" "context"
"errors" "errors"
"net/url"
"strconv" "strconv"
) )
const searchURL = "https://api.twitter.com/2/search/adaptive.json" const searchURL = "https://twitter.com/i/api/graphql/nK1dw4oV3k4w5TdtcAdSww/SearchTimeline"
// searchTimeline mirrors the JSON body of the SearchTimeline GraphQL response
// that getSearchTimeline decodes via RequestAPI. The instructions live at
// data.search_by_raw_query.search_timeline.timeline.instructions.
type searchTimeline struct {
Data struct {
SearchByRawQuery struct {
SearchTimeline struct {
Timeline struct {
// Instructions is the ordered list of timeline operations; the parsers
// only act on the "TimelineAddEntries" and "TimelineReplaceEntry" types.
Instructions []struct {
// Type is the instruction kind, e.g. "TimelineAddEntries".
Type string `json:"type"`
// Entries carries the result items (tweets or users) and cursor entries.
Entries []entry `json:"entries"`
// Entry is a single entry; the parsers read it to pick up a replaced
// "Bottom" cursor.
Entry entry `json:"entry,omitempty"`
} `json:"instructions"`
} `json:"timeline"`
} `json:"search_timeline"`
} `json:"search_by_raw_query"`
} `json:"data"`
}
// parseTweets extracts the tweets and the pagination cursor from a search
// timeline response.
//
// Only "TimelineAddEntries" and "TimelineReplaceEntry" instructions are
// inspected. An instruction whose single Entry is a "Bottom" cursor only
// updates the cursor; otherwise every entry displayed as "Tweet" is converted
// with parseLegacyTweet (entries it rejects by returning nil are skipped) and
// "Bottom" cursor entries update the cursor.
//
// It returns a non-nil (possibly empty) slice of tweets and the last "Bottom"
// cursor value seen, or "" when there is none.
func (timeline *searchTimeline) parseTweets() ([]*Tweet, string) {
	tweets := make([]*Tweet, 0)
	cursor := ""
	instructions := timeline.Data.SearchByRawQuery.SearchTimeline.Timeline.Instructions
	for i := range instructions {
		instruction := &instructions[i]
		if instruction.Type != "TimelineAddEntries" && instruction.Type != "TimelineReplaceEntry" {
			continue
		}
		if instruction.Entry.Content.CursorType == "Bottom" {
			cursor = instruction.Entry.Content.Value
			continue
		}
		for j := range instruction.Entries {
			// Address the slice element directly instead of a range copy:
			// parseLegacyTweet receives pointers into the entry and may keep
			// one (the tweet's Place). With a shared range variable (Go
			// versions before 1.22) every tweet would end up pointing at the
			// same, repeatedly overwritten, memory.
			content := &instruction.Entries[j].Content
			if content.ItemContent.TweetDisplayType != "Tweet" {
				if content.CursorType == "Bottom" {
					cursor = content.Value
				}
				continue
			}
			res := &content.ItemContent.TweetResults.Result
			tweet := parseLegacyTweet(&res.Core.UserResults.Result.Legacy, &res.Legacy)
			if tweet == nil {
				continue
			}
			// Fall back to the result-level view counter when the legacy
			// payload did not provide one; an unparsable count leaves it at 0.
			if tweet.Views == 0 && res.Views.Count != "" {
				if views, err := strconv.Atoi(res.Views.Count); err == nil {
					tweet.Views = views
				}
			}
			tweets = append(tweets, tweet)
		}
	}
	return tweets, cursor
}
// parseUsers collects the user profiles and the pagination cursor from a
// search timeline response.
//
// Only "TimelineAddEntries" and "TimelineReplaceEntry" instructions are
// considered. Entries displayed as "User" are converted with parseProfile and
// kept when the resulting profile has a name; a missing UserID is filled from
// the result's rest_id. "Bottom" cursor entries update the returned cursor.
func (timeline *searchTimeline) parseUsers() ([]*Profile, string) {
	var nextCursor string
	found := make([]*Profile, 0)
	for _, instr := range timeline.Data.SearchByRawQuery.SearchTimeline.Timeline.Instructions {
		switch instr.Type {
		case "TimelineAddEntries", "TimelineReplaceEntry":
			// handled below
		default:
			continue
		}
		// A replaced bottom cursor arrives as the instruction's single entry.
		if instr.Entry.Content.CursorType == "Bottom" {
			nextCursor = instr.Entry.Content.Value
			continue
		}
		for k := range instr.Entries {
			content := &instr.Entries[k].Content
			if content.ItemContent.UserDisplayType != "User" {
				if content.CursorType == "Bottom" {
					nextCursor = content.Value
				}
				continue
			}
			user := &content.ItemContent.UserResults.Result
			profile := parseProfile(user.Legacy)
			if profile.Name == "" {
				continue
			}
			if profile.UserID == "" {
				profile.UserID = user.RestID
			}
			found = append(found, &profile)
		}
	}
	return found, nextCursor
}
// SearchTweets returns channel with tweets for a given search query // SearchTweets returns channel with tweets for a given search query
func (s *Scraper) SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *TweetResult { func (s *Scraper) SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *TweetResult {
@ -19,7 +88,7 @@ func (s *Scraper) SearchProfiles(ctx context.Context, query string, maxProfilesN
} }
// getSearchTimeline gets results for a given search query, via the Twitter frontend API // getSearchTimeline gets results for a given search query, via the Twitter frontend API
func (s *Scraper) getSearchTimeline(query string, maxNbr int, cursor string) (*timelineV1, error) { func (s *Scraper) getSearchTimeline(query string, maxNbr int, cursor string) (*searchTimeline, error) {
if !s.isLogged { if !s.isLogged {
return nil, errors.New("scraper is not logged in for search") return nil, errors.New("scraper is not logged in for search")
} }
@ -33,31 +102,61 @@ func (s *Scraper) getSearchTimeline(query string, maxNbr int, cursor string) (*t
return nil, err return nil, err
} }
q := req.URL.Query() variables := map[string]interface{}{
q.Add("q", query) "rawQuery": query,
q.Add("count", strconv.Itoa(maxNbr)) "count": maxNbr,
q.Add("query_source", "typed_query") "querySource": "typed_query",
q.Add("pc", "1") "product": "Top",
q.Add("requestContext", "launch") }
q.Add("spelling_corrections", "1")
q.Add("include_ext_edit_control", "true") features := map[string]interface{}{
"rweb_lists_timeline_redesign_enabled": true,
"responsive_web_graphql_exclude_directive_enabled": true,
"verified_phone_label_enabled": false,
"creator_subscriptions_tweet_preview_api_enabled": true,
"responsive_web_graphql_timeline_navigation_enabled": true,
"responsive_web_graphql_skip_user_profile_image_extensions_enabled": false,
"tweetypie_unmention_optimization_enabled": true,
"responsive_web_edit_tweet_api_enabled": true,
"graphql_is_translatable_rweb_tweet_is_translatable_enabled": true,
"view_counts_everywhere_api_enabled": true,
"longform_notetweets_consumption_enabled": true,
"responsive_web_twitter_article_tweet_consumption_enabled": false,
"tweet_awards_web_tipping_enabled": false,
"freedom_of_speech_not_reach_fetch_enabled": true,
"standardized_nudges_misinfo": true,
"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": true,
"longform_notetweets_rich_text_read_enabled": true,
"longform_notetweets_inline_media_enabled": true,
"responsive_web_media_download_video_enabled": false,
"responsive_web_enhance_cards_enabled": false,
}
fieldToggles := map[string]interface{}{
"withArticleRichContentState": false,
}
if cursor != "" { if cursor != "" {
q.Add("cursor", cursor) variables["cursor"] = cursor
} }
switch s.searchMode { switch s.searchMode {
case SearchLatest: case SearchLatest:
q.Add("tweet_search_mode", "live") variables["product"] = "Latest"
case SearchPhotos: case SearchPhotos:
q.Add("result_filter", "image") variables["product"] = "Photos"
case SearchVideos: case SearchVideos:
q.Add("result_filter", "video") variables["product"] = "Videos"
case SearchUsers: case SearchUsers:
q.Add("result_filter", "user") variables["product"] = "People"
} }
q := url.Values{}
q.Set("variables", mapToJSONString(variables))
q.Set("features", mapToJSONString(features))
q.Set("fieldToggles", mapToJSONString(fieldToggles))
req.URL.RawQuery = q.Encode() req.URL.RawQuery = q.Encode()
var timeline timelineV1 var timeline searchTimeline
err = s.RequestAPI(req, &timeline) err = s.RequestAPI(req, &timeline)
if err != nil { if err != nil {
return nil, err return nil, err

View file

@ -2,18 +2,23 @@ package twitterscraper_test
import ( import (
"context" "context"
"os"
"testing" "testing"
twitterscraper "github.com/n0madic/twitter-scraper" twitterscraper "github.com/n0madic/twitter-scraper"
) )
var searchScraper = twitterscraper.New()
func TestFetchSearchCursor(t *testing.T) { func TestFetchSearchCursor(t *testing.T) {
err := searchScraper.LoginOpenAccount() if os.Getenv("SKIP_AUTH_TEST") != "" {
t.Skip("Skipping test due to environment variable")
}
searchScraper := twitterscraper.New()
err := searchScraper.Login(username, password)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
defer searchScraper.Logout()
maxTweetsNbr := 150 maxTweetsNbr := 150
tweetsNbr := 0 tweetsNbr := 0
nextCursor := "" nextCursor := ""
@ -31,13 +36,19 @@ func TestFetchSearchCursor(t *testing.T) {
} }
func TestGetSearchProfiles(t *testing.T) { func TestGetSearchProfiles(t *testing.T) {
if os.Getenv("SKIP_AUTH_TEST") != "" {
t.Skip("Skipping test due to environment variable")
}
count := 0 count := 0
maxProfilesNbr := 150 maxProfilesNbr := 150
dupcheck := make(map[string]bool) dupcheck := make(map[string]bool)
err := searchScraper.LoginOpenAccount() searchScraper := twitterscraper.New()
err := searchScraper.Login(username, password)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
defer searchScraper.Logout()
searchScraper.SetSearchMode(twitterscraper.SearchUsers) searchScraper.SetSearchMode(twitterscraper.SearchUsers)
for profile := range searchScraper.SearchProfiles(context.Background(), "Twitter", maxProfilesNbr) { for profile := range searchScraper.SearchProfiles(context.Background(), "Twitter", maxProfilesNbr) {
if profile.Error != nil { if profile.Error != nil {
@ -61,13 +72,19 @@ func TestGetSearchProfiles(t *testing.T) {
} }
} }
func TestGetSearchTweets(t *testing.T) { func TestGetSearchTweets(t *testing.T) {
if os.Getenv("SKIP_AUTH_TEST") != "" {
t.Skip("Skipping test due to environment variable")
}
count := 0 count := 0
maxTweetsNbr := 150 maxTweetsNbr := 150
dupcheck := make(map[string]bool) dupcheck := make(map[string]bool)
err := searchScraper.LoginOpenAccount() searchScraper := twitterscraper.New()
err := searchScraper.Login(username, password)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
defer searchScraper.Logout()
searchScraper.SetSearchMode(twitterscraper.SearchLatest) searchScraper.SetSearchMode(twitterscraper.SearchLatest)
for tweet := range searchScraper.SearchTweets(context.Background(), "twitter", maxTweetsNbr) { for tweet := range searchScraper.SearchTweets(context.Background(), "twitter", maxTweetsNbr) {
if tweet.Error != nil { if tweet.Error != nil {

View file

@ -1,10 +1,7 @@
package twitterscraper package twitterscraper
import ( import (
"fmt"
"strconv" "strconv"
"strings"
"time"
) )
type result struct { type result struct {
@ -66,6 +63,13 @@ type entry struct {
TweetResults struct { TweetResults struct {
Result result `json:"result"` Result result `json:"result"`
} `json:"tweet_results"` } `json:"tweet_results"`
UserDisplayType string `json:"userDisplayType"`
UserResults struct {
Result struct {
RestID string `json:"rest_id"`
Legacy legacyUser `json:"legacy"`
} `json:"result"`
} `json:"user_results"`
} `json:"itemContent"` } `json:"itemContent"`
} `json:"content"` } `json:"content"`
} }
@ -166,182 +170,3 @@ func (conversation *threadedConversation) parse() []*Tweet {
} }
return tweets return tweets
} }
// parseLegacyTweet builds a Tweet from the "legacy" user and tweet payloads.
//
// It copies the basic counters and identifiers, parses the creation time,
// flags quotes/replies/retweets/pins, collects hashtags, mentions, media and
// URLs, and finally renders an HTML version of the tweet text. This version
// always returns a non-nil *Tweet, even when tweet.IDStr is empty.
func parseLegacyTweet(user *legacyUser, tweet *legacyTweet) *Tweet {
username := user.ScreenName
name := user.Name
tweetID := tweet.IDStr
tw := &Tweet{
ConversationID: tweet.ConversationIDStr,
ID: tweetID,
Likes: tweet.FavoriteCount,
Name: name,
PermanentURL: fmt.Sprintf("https://twitter.com/%s/status/%s", username, tweetID),
Replies: tweet.ReplyCount,
Retweets: tweet.RetweetCount,
Text: tweet.FullText,
UserID: tweet.UserIDStr,
Username: username,
}
// CreatedAt is parsed with the time.RubyDate layout; on failure the time
// fields keep their zero values.
tm, err := time.Parse(time.RubyDate, tweet.CreatedAt)
if err == nil {
tw.TimeParsed = tm
tw.Timestamp = tm.Unix()
}
// NOTE(review): tw.Place points into the caller-supplied tweet value, so it
// stays valid only as long as the caller does not overwrite that memory.
if tweet.Place.ID != "" {
tw.Place = &tweet.Place
}
if tweet.QuotedStatusIDStr != "" {
tw.IsQuoted = true
tw.QuotedStatusID = tweet.QuotedStatusIDStr
}
if tweet.InReplyToStatusIDStr != "" {
tw.IsReply = true
tw.InReplyToStatusID = tweet.InReplyToStatusIDStr
}
// A retweet is recognised either by the retweeted status ID or by an embedded
// retweeted result; the embedded result is parsed recursively and its ID wins.
if tweet.RetweetedStatusIDStr != "" || tweet.RetweetedStatusResult.Result != nil {
tw.IsRetweet = true
tw.RetweetedStatusID = tweet.RetweetedStatusIDStr
if tweet.RetweetedStatusResult.Result != nil {
tw.RetweetedStatus = parseLegacyTweet(&tweet.RetweetedStatusResult.Result.Core.UserResults.Result.Legacy, &tweet.RetweetedStatusResult.Result.Legacy)
tw.RetweetedStatusID = tw.RetweetedStatus.ID
}
}
// An unparsable view count is reported as 0.
if tweet.Views.Count != "" {
views, viewsErr := strconv.Atoi(tweet.Views.Count)
if viewsErr != nil {
views = 0
}
tw.Views = views
}
// The tweet is pinned when its ID is listed among the user's pinned tweet IDs.
for _, pinned := range user.PinnedTweetIdsStr {
if tweet.IDStr == pinned {
tw.IsPin = true
break
}
}
for _, hash := range tweet.Entities.Hashtags {
tw.Hashtags = append(tw.Hashtags, hash.Text)
}
for _, mention := range tweet.Entities.UserMentions {
tw.Mentions = append(tw.Mentions, Mention{
ID: mention.IDStr,
Username: mention.ScreenName,
Name: mention.Name,
})
}
// Sort media into photos, videos and GIFs, and mark the tweet as sensitive
// as soon as any media item carries a sensitivity warning.
for _, media := range tweet.ExtendedEntities.Media {
if media.Type == "photo" {
photo := Photo{
ID: media.IDStr,
URL: media.MediaURLHttps,
}
tw.Photos = append(tw.Photos, photo)
} else if media.Type == "video" {
video := Video{
ID: media.IDStr,
Preview: media.MediaURLHttps,
}
// Pick the variant with the highest non-zero bitrate and drop a trailing
// "?tag=10" from its URL.
maxBitrate := 0
for _, variant := range media.VideoInfo.Variants {
if variant.Bitrate > maxBitrate {
video.URL = strings.TrimSuffix(variant.URL, "?tag=10")
maxBitrate = variant.Bitrate
}
}
tw.Videos = append(tw.Videos, video)
} else if media.Type == "animated_gif" {
gif := GIF{
ID: media.IDStr,
Preview: media.MediaURLHttps,
}
// Twitter's API doesn't provide bitrate for GIFs, (it's always set to zero).
// Therefore we check for `>=` instead of `>` in the loop below.
// Also, GIFs have just a single variant today. Just in case that changes in the future,
// and there will be multiple variants, we'll pick the one with the highest bitrate,
// if other one will have a non-zero bitrate.
maxBitrate := 0
for _, variant := range media.VideoInfo.Variants {
if variant.Bitrate >= maxBitrate {
gif.URL = variant.URL
maxBitrate = variant.Bitrate
}
}
tw.GIFs = append(tw.GIFs, gif)
}
if !tw.SensitiveContent {
sensitive := media.ExtSensitiveMediaWarning
tw.SensitiveContent = sensitive.AdultContent || sensitive.GraphicViolence || sensitive.Other
}
}
for _, url := range tweet.Entities.URLs {
tw.URLs = append(tw.URLs, url.ExpandedURL)
}
// Build the HTML rendering: link hashtags, then usernames, then rewrite the
// URLs matched by reTwitterURL. The order matters because each pass rewrites
// the output of the previous one.
tw.HTML = tweet.FullText
tw.HTML = reHashtag.ReplaceAllStringFunc(tw.HTML, func(hashtag string) string {
return fmt.Sprintf(`<a href="https://twitter.com/hashtag/%s">%s</a>`,
strings.TrimPrefix(hashtag, "#"),
hashtag,
)
})
tw.HTML = reUsername.ReplaceAllStringFunc(tw.HTML, func(username string) string {
return fmt.Sprintf(`<a href="https://twitter.com/%s">%s</a>`,
strings.TrimPrefix(username, "@"),
username,
)
})
// foundedMedia records media already embedded inline so it is not appended
// a second time below.
var foundedMedia []string
tw.HTML = reTwitterURL.ReplaceAllStringFunc(tw.HTML, func(tco string) string {
for _, entity := range tweet.Entities.URLs {
if tco == entity.URL {
return fmt.Sprintf(`<a href="%s">%s</a>`, entity.ExpandedURL, tco)
}
}
for _, entity := range tweet.ExtendedEntities.Media {
if tco == entity.URL {
foundedMedia = append(foundedMedia, entity.MediaURLHttps)
return fmt.Sprintf(`<br><a href="%s"><img src="%s"/></a>`, tco, entity.MediaURLHttps)
}
}
return tco
})
// Append an <img> for every photo, video preview and GIF preview that was not
// already embedded inline.
for _, photo := range tw.Photos {
url := photo.URL
if stringInSlice(url, foundedMedia) {
continue
}
tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, url)
}
for _, video := range tw.Videos {
url := video.Preview
if stringInSlice(url, foundedMedia) {
continue
}
tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, url)
}
for _, gif := range tw.GIFs {
url := gif.Preview
if stringInSlice(url, foundedMedia) {
continue
}
tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, url)
}
// Newlines in the text become <br> tags.
tw.HTML = strings.Replace(tw.HTML, "\n", "<br>", -1)
return tw
}

184
util.go
View file

@ -3,10 +3,12 @@ package twitterscraper
import ( import (
"context" "context"
"encoding/json" "encoding/json"
"fmt"
"net/http" "net/http"
"net/url" "net/url"
"regexp" "regexp"
"strconv" "strconv"
"strings"
"time" "time"
) )
@ -150,6 +152,188 @@ func getTweetTimeline(ctx context.Context, query string, maxTweetsNbr int, fetch
return channel return channel
} }
// parseLegacyTweet builds a Tweet from the "legacy" user and tweet payloads.
//
// It returns nil when the tweet has no ID. Otherwise it copies the basic
// counters and identifiers, parses the creation time, flags
// quotes/replies/retweets/pins, collects hashtags, mentions, media and URLs,
// and renders an HTML version of the tweet text.
//
// The returned Tweet does not keep pointers into user or tweet, so callers
// may reuse or overwrite those values afterwards.
func parseLegacyTweet(user *legacyUser, tweet *legacyTweet) *Tweet {
	tweetID := tweet.IDStr
	if tweetID == "" {
		return nil
	}
	username := user.ScreenName
	tw := &Tweet{
		ConversationID: tweet.ConversationIDStr,
		ID:             tweetID,
		Likes:          tweet.FavoriteCount,
		Name:           user.Name,
		PermanentURL:   fmt.Sprintf("https://twitter.com/%s/status/%s", username, tweetID),
		Replies:        tweet.ReplyCount,
		Retweets:       tweet.RetweetCount,
		Text:           tweet.FullText,
		UserID:         tweet.UserIDStr,
		Username:       username,
	}

	// CreatedAt uses the time.RubyDate layout; on failure the time fields
	// keep their zero values.
	if tm, err := time.Parse(time.RubyDate, tweet.CreatedAt); err == nil {
		tw.TimeParsed = tm
		tw.Timestamp = tm.Unix()
	}

	if tweet.Place.ID != "" {
		// Copy the place instead of pointing into the caller's value: callers
		// pass pointers into reused loop variables, and aliasing that memory
		// would let a later iteration overwrite this tweet's place.
		place := tweet.Place
		tw.Place = &place
	}

	if tweet.QuotedStatusIDStr != "" {
		tw.IsQuoted = true
		tw.QuotedStatusID = tweet.QuotedStatusIDStr
	}
	if tweet.InReplyToStatusIDStr != "" {
		tw.IsReply = true
		tw.InReplyToStatusID = tweet.InReplyToStatusIDStr
	}

	// A retweet is recognised either by the retweeted status ID or by an
	// embedded retweeted result.
	if tweet.RetweetedStatusIDStr != "" || tweet.RetweetedStatusResult.Result != nil {
		tw.IsRetweet = true
		tw.RetweetedStatusID = tweet.RetweetedStatusIDStr
		if retweeted := tweet.RetweetedStatusResult.Result; retweeted != nil {
			// The recursive parse returns nil when the embedded tweet has no
			// ID; keep the ID from RetweetedStatusIDStr in that case instead
			// of dereferencing a nil tweet.
			if rt := parseLegacyTweet(&retweeted.Core.UserResults.Result.Legacy, &retweeted.Legacy); rt != nil {
				tw.RetweetedStatus = rt
				tw.RetweetedStatusID = rt.ID
			}
		}
	}

	// An unparsable view count is reported as 0.
	if tweet.Views.Count != "" {
		views, viewsErr := strconv.Atoi(tweet.Views.Count)
		if viewsErr != nil {
			views = 0
		}
		tw.Views = views
	}

	// The tweet is pinned when its ID is among the user's pinned tweet IDs.
	for _, pinned := range user.PinnedTweetIdsStr {
		if tweet.IDStr == pinned {
			tw.IsPin = true
			break
		}
	}

	for _, hash := range tweet.Entities.Hashtags {
		tw.Hashtags = append(tw.Hashtags, hash.Text)
	}
	for _, mention := range tweet.Entities.UserMentions {
		tw.Mentions = append(tw.Mentions, Mention{
			ID:       mention.IDStr,
			Username: mention.ScreenName,
			Name:     mention.Name,
		})
	}

	for _, media := range tweet.ExtendedEntities.Media {
		switch media.Type {
		case "photo":
			tw.Photos = append(tw.Photos, Photo{
				ID:  media.IDStr,
				URL: media.MediaURLHttps,
			})
		case "video":
			video := Video{
				ID:      media.IDStr,
				Preview: media.MediaURLHttps,
			}
			// Pick the variant with the highest non-zero bitrate and drop a
			// trailing "?tag=10" from its URL.
			maxBitrate := 0
			for _, variant := range media.VideoInfo.Variants {
				if variant.Bitrate > maxBitrate {
					video.URL = strings.TrimSuffix(variant.URL, "?tag=10")
					maxBitrate = variant.Bitrate
				}
			}
			tw.Videos = append(tw.Videos, video)
		case "animated_gif":
			gif := GIF{
				ID:      media.IDStr,
				Preview: media.MediaURLHttps,
			}
			// Twitter's API doesn't provide bitrate for GIFs, (it's always set to zero).
			// Therefore we check for `>=` instead of `>` in the loop below.
			// Also, GIFs have just a single variant today. Just in case that changes in the future,
			// and there will be multiple variants, we'll pick the one with the highest bitrate,
			// if other one will have a non-zero bitrate.
			maxBitrate := 0
			for _, variant := range media.VideoInfo.Variants {
				if variant.Bitrate >= maxBitrate {
					gif.URL = variant.URL
					maxBitrate = variant.Bitrate
				}
			}
			tw.GIFs = append(tw.GIFs, gif)
		}

		// One flagged media item is enough to mark the whole tweet.
		if !tw.SensitiveContent {
			sensitive := media.ExtSensitiveMediaWarning
			tw.SensitiveContent = sensitive.AdultContent || sensitive.GraphicViolence || sensitive.Other
		}
	}

	for _, link := range tweet.Entities.URLs {
		tw.URLs = append(tw.URLs, link.ExpandedURL)
	}

	// Build the HTML rendering: link hashtags, then usernames, then rewrite
	// the URLs matched by reTwitterURL. The order matters because each pass
	// rewrites the output of the previous one.
	tw.HTML = tweet.FullText
	tw.HTML = reHashtag.ReplaceAllStringFunc(tw.HTML, func(hashtag string) string {
		return fmt.Sprintf(`<a href="https://twitter.com/hashtag/%s">%s</a>`,
			strings.TrimPrefix(hashtag, "#"),
			hashtag,
		)
	})
	tw.HTML = reUsername.ReplaceAllStringFunc(tw.HTML, func(username string) string {
		return fmt.Sprintf(`<a href="https://twitter.com/%s">%s</a>`,
			strings.TrimPrefix(username, "@"),
			username,
		)
	})

	// foundedMedia records media already embedded inline so it is not
	// appended a second time below.
	var foundedMedia []string
	tw.HTML = reTwitterURL.ReplaceAllStringFunc(tw.HTML, func(tco string) string {
		for _, entity := range tweet.Entities.URLs {
			if tco == entity.URL {
				return fmt.Sprintf(`<a href="%s">%s</a>`, entity.ExpandedURL, tco)
			}
		}
		for _, entity := range tweet.ExtendedEntities.Media {
			if tco == entity.URL {
				foundedMedia = append(foundedMedia, entity.MediaURLHttps)
				return fmt.Sprintf(`<br><a href="%s"><img src="%s"/></a>`, tco, entity.MediaURLHttps)
			}
		}
		return tco
	})

	// Append an <img> for every photo, video preview and GIF preview that
	// was not already embedded inline.
	for _, photo := range tw.Photos {
		src := photo.URL
		if stringInSlice(src, foundedMedia) {
			continue
		}
		tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, src)
	}
	for _, video := range tw.Videos {
		src := video.Preview
		if stringInSlice(src, foundedMedia) {
			continue
		}
		tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, src)
	}
	for _, gif := range tw.GIFs {
		src := gif.Preview
		if stringInSlice(src, foundedMedia) {
			continue
		}
		tw.HTML += fmt.Sprintf(`<br><img src="%s"/>`, src)
	}

	// Newlines in the text become <br> tags.
	tw.HTML = strings.ReplaceAll(tw.HTML, "\n", "<br>")
	return tw
}
func parseProfile(user legacyUser) Profile { func parseProfile(user legacyUser) Profile {
profile := Profile{ profile := Profile{
Avatar: user.ProfileImageURLHTTPS, Avatar: user.ProfileImageURLHTTPS,