diff --git a/README.md b/README.md index 70c8c8b..bf65e5f 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,7 @@ Options: * `twitterscraper.SearchLatest` - live mode * `twitterscraper.SearchPhotos` - image mode * `twitterscraper.SearchVideos` - video mode +* `twitterscraper.SearchUsers` - user mode ### Get profile @@ -124,6 +125,28 @@ func main() { } ``` +### Search profiles by query + +```golang +package main + +import ( + "context" + "fmt" + twitterscraper "github.com/n0madic/twitter-scraper" +) + +func main() { + scraper := twitterscraper.New().SetSearchMode(twitterscraper.SearchUsers) + for profile := range scraper.SearchProfiles(context.Background(), "Twitter", 50) { + if profile.Error != nil { + panic(profile.Error) + } + fmt.Println(profile.Name) + } +} +``` + ### Get trends ```golang diff --git a/api.go b/api.go index 3a67082..1628919 100644 --- a/api.go +++ b/api.go @@ -11,42 +11,6 @@ import ( const bearerToken string = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" -type user struct { - Data struct { - User struct { - RestID string `json:"rest_id"` - Legacy struct { - CreatedAt string `json:"created_at"` - Description string `json:"description"` - Entities struct { - URL struct { - Urls []struct { - ExpandedURL string `json:"expanded_url"` - } `json:"urls"` - } `json:"url"` - } `json:"entities"` - FavouritesCount int `json:"favourites_count"` - FollowersCount int `json:"followers_count"` - FriendsCount int `json:"friends_count"` - IDStr string `json:"id_str"` - ListedCount int `json:"listed_count"` - Name string `json:"name"` - Location string `json:"location"` - PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"` - ProfileBannerURL string `json:"profile_banner_url"` - ProfileImageURLHTTPS string `json:"profile_image_url_https"` - Protected bool `json:"protected"` - ScreenName string `json:"screen_name"` - StatusesCount int `json:"statuses_count"` - Verified bool `json:"verified"` - } 
`json:"legacy"` - } `json:"user"` - } `json:"data"` - Errors []struct { - Message string `json:"message"` - } `json:"errors"` -} - // Global cache for user IDs var cacheIDs sync.Map @@ -70,7 +34,8 @@ func (s *Scraper) RequestAPI(req *http.Request, target interface{}) error { // private profiles return forbidden, but also data if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusForbidden { - return fmt.Errorf("response status %s", resp.Status) + content, _ := ioutil.ReadAll(resp.Body) + return fmt.Errorf("response status %s: %s", resp.Status, content) } if resp.Header.Get("X-Rate-Limit-Remaining") == "0" { @@ -95,7 +60,8 @@ func (s *Scraper) GetGuestToken() error { defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - return fmt.Errorf("response status %s", resp.Status) + content, _ := ioutil.ReadAll(resp.Body) + return fmt.Errorf("response status %s: %s", resp.Status, content) } body, err := ioutil.ReadAll(resp.Body) if err != nil { diff --git a/profile.go b/profile.go index 36b7878..98562f2 100644 --- a/profile.go +++ b/profile.go @@ -30,6 +30,18 @@ type Profile struct { Website string } +type user struct { + Data struct { + User struct { + RestID string `json:"rest_id"` + Legacy legacyUser `json:"legacy"` + } `json:"user"` + } `json:"data"` + Errors []struct { + Message string `json:"message"` + } `json:"errors"` +} + // GetProfile return parsed user profile. 
func (s *Scraper) GetProfile(username string) (Profile, error) { var jsn user @@ -50,44 +62,13 @@ func (s *Scraper) GetProfile(username string) (Profile, error) { if jsn.Data.User.RestID == "" { return Profile{}, fmt.Errorf("rest_id not found") } + jsn.Data.User.Legacy.IDStr = jsn.Data.User.RestID if jsn.Data.User.Legacy.ScreenName == "" { return Profile{}, fmt.Errorf("either @%s does not exist or is private", username) } - user := jsn.Data.User.Legacy - - profile := Profile{ - Avatar: user.ProfileImageURLHTTPS, - Banner: user.ProfileBannerURL, - Biography: user.Description, - FollowersCount: user.FollowersCount, - FollowingCount: user.FavouritesCount, - FriendsCount: user.FriendsCount, - IsPrivate: user.Protected, - IsVerified: user.Verified, - LikesCount: user.FavouritesCount, - ListedCount: user.ListedCount, - Location: user.Location, - Name: user.Name, - PinnedTweetIDs: user.PinnedTweetIdsStr, - TweetsCount: user.StatusesCount, - URL: "https://twitter.com/" + user.ScreenName, - UserID: jsn.Data.User.RestID, - Username: user.ScreenName, - } - - tm, err := time.Parse(time.RubyDate, user.CreatedAt) - if err == nil { - tm = tm.UTC() - profile.Joined = &tm - } - - if len(user.Entities.URL.Urls) > 0 { - profile.Website = user.Entities.URL.Urls[0].ExpandedURL - } - - return profile, nil + return parseProfile(jsn.Data.User.Legacy), nil } // GetProfile wrapper for default scraper diff --git a/scraper.go b/scraper.go index 91484a9..98e1743 100644 --- a/scraper.go +++ b/scraper.go @@ -31,6 +31,8 @@ const ( SearchPhotos // SearchVideos - video mode SearchVideos + // SearchUsers - user mode + SearchUsers ) var defaultScraper *Scraper diff --git a/search.go b/search.go index 0df959a..fdd3162 100644 --- a/search.go +++ b/search.go @@ -7,30 +7,40 @@ import ( ) // SearchTweets returns channel with tweets for a given search query -func (s *Scraper) SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *Result { - return getTimeline(ctx, query, maxTweetsNbr, 
s.FetchSearchTweets) +func (s *Scraper) SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *TweetResult { + return getTweetTimeline(ctx, query, maxTweetsNbr, s.FetchSearchTweets) } // SearchTweets wrapper for default Scraper -func SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *Result { +func SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *TweetResult { return defaultScraper.SearchTweets(ctx, query, maxTweetsNbr) } -// FetchSearchTweets gets tweets for a given search query, via the Twitter frontend API -func (s *Scraper) FetchSearchTweets(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) { +// SearchProfiles returns channel with profiles for a given search query +func (s *Scraper) SearchProfiles(ctx context.Context, query string, maxProfilesNbr int) <-chan *ProfileResult { + return getUserTimeline(ctx, query, maxProfilesNbr, s.FetchSearchProfiles) +} + +// SearchProfiles wrapper for default Scraper +func SearchProfiles(ctx context.Context, query string, maxProfilesNbr int) <-chan *ProfileResult { + return defaultScraper.SearchProfiles(ctx, query, maxProfilesNbr) +} + +// getSearchTimeline gets results for a given search query, via the Twitter frontend API +func (s *Scraper) getSearchTimeline(query string, maxNbr int, cursor string) (*timeline, error) { query = url.PathEscape(query) - if maxTweetsNbr > 100 { - maxTweetsNbr = 100 + if maxNbr > 50 { + maxNbr = 50 } req, err := s.newRequest("GET", "https://twitter.com/i/api/2/search/adaptive.json") if err != nil { - return nil, "", err + return nil, err } q := req.URL.Query() q.Add("q", query) - q.Add("count", strconv.Itoa(maxTweetsNbr)) + q.Add("count", strconv.Itoa(maxNbr)) q.Add("query_source", "typed_query") q.Add("pc", "1") q.Add("spelling_corrections", "1") @@ -44,16 +54,36 @@ func (s *Scraper) FetchSearchTweets(query string, maxTweetsNbr int, cursor strin q.Add("result_filter", "image") case SearchVideos: 
q.Add("result_filter", "video") + case SearchUsers: + q.Add("result_filter", "user") } req.URL.RawQuery = q.Encode() var timeline timeline err = s.RequestAPI(req, &timeline) + if err != nil { + return nil, err + } + return &timeline, nil +} + +// FetchSearchTweets gets tweets for a given search query, via the Twitter frontend API +func (s *Scraper) FetchSearchTweets(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) { + timeline, err := s.getSearchTimeline(query, maxTweetsNbr, cursor) if err != nil { return nil, "", err } - - tweets, nextCursor := parseTimeline(&timeline) + tweets, nextCursor := parseTimeline(timeline) return tweets, nextCursor, nil } + +// FetchSearchProfiles gets users for a given search query, via the Twitter frontend API +func (s *Scraper) FetchSearchProfiles(query string, maxProfilesNbr int, cursor string) ([]*Profile, string, error) { + timeline, err := s.getSearchTimeline(query, maxProfilesNbr, cursor) + if err != nil { + return nil, "", err + } + users, nextCursor := parseUsers(timeline) + return users, nextCursor, nil +} diff --git a/search_test.go b/search_test.go index 11aaf89..8d771ec 100644 --- a/search_test.go +++ b/search_test.go @@ -16,16 +16,42 @@ func TestFetchSearchCursor(t *testing.T) { t.Fatal(err) } if cursor == "" { - t.Fatal("Expected search cursor is not empty") + t.Fatal("Expected search cursor is empty") } tweetsNbr += len(tweets) nextCursor = cursor } } +func TestGetSearchProfiles(t *testing.T) { + count := 0 + maxProfilesNbr := 150 + dupcheck := make(map[string]bool) + scraper := New().SetSearchMode(SearchUsers) + for profile := range scraper.SearchProfiles(context.Background(), "Twitter", maxProfilesNbr) { + if profile.Error != nil { + t.Error(profile.Error) + } else { + count++ + if profile.UserID == "" { + t.Error("Expected UserID is empty") + } else { + if dupcheck[profile.UserID] { + t.Errorf("Detect duplicated UserID: %s", profile.UserID) + } else { + dupcheck[profile.UserID] = true + } + } + 
} + } + + if count != maxProfilesNbr { + t.Errorf("Expected profiles count=%v, got: %v", maxProfilesNbr, count) + } +} func TestGetSearchTweets(t *testing.T) { count := 0 - maxTweetsNbr := 250 + maxTweetsNbr := 150 dupcheck := make(map[string]bool) for tweet := range SearchTweets(context.Background(), "twitter -filter:retweets", maxTweetsNbr) { if tweet.Error != nil { @@ -33,7 +59,7 @@ func TestGetSearchTweets(t *testing.T) { } else { count++ if tweet.ID == "" { - t.Error("Expected tweet ID is not empty") + t.Error("Expected tweet ID is empty") } else { if dupcheck[tweet.ID] { t.Errorf("Detect duplicated tweet ID: %s", tweet.ID) @@ -42,13 +68,13 @@ func TestGetSearchTweets(t *testing.T) { } } if tweet.PermanentURL == "" { - t.Error("Expected tweet PermanentURL is not empty") + t.Error("Expected tweet PermanentURL is empty") } if tweet.IsRetweet { t.Error("Expected tweet IsRetweet is false") } if tweet.Text == "" { - t.Error("Expected tweet Text is not empty") + t.Error("Expected tweet Text is empty") } } } diff --git a/tweets.go b/tweets.go index 5aa1812..9c6df0f 100644 --- a/tweets.go +++ b/tweets.go @@ -7,12 +7,12 @@ import ( ) // GetTweets returns channel with tweets for a given user. -func (s *Scraper) GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result { - return getTimeline(ctx, user, maxTweetsNbr, s.FetchTweets) +func (s *Scraper) GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *TweetResult { + return getTweetTimeline(ctx, user, maxTweetsNbr, s.FetchTweets) } // GetTweets wrapper for default Scraper -func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result { +func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *TweetResult { return defaultScraper.GetTweets(ctx, user, maxTweetsNbr) } diff --git a/types.go b/types.go index 1520f87..3b2b2e3 100644 --- a/types.go +++ b/types.go @@ -43,12 +43,44 @@ type ( Videos []Video } - // Result of scrapping. 
- Result struct { + // ProfileResult of scrapping. + ProfileResult struct { + Profile + Error error + } + + // TweetResult of scrapping. + TweetResult struct { Tweet Error error } + legacyUser struct { + CreatedAt string `json:"created_at"` + Description string `json:"description"` + Entities struct { + URL struct { + Urls []struct { + ExpandedURL string `json:"expanded_url"` + } `json:"urls"` + } `json:"url"` + } `json:"entities"` + FavouritesCount int `json:"favourites_count"` + FollowersCount int `json:"followers_count"` + FriendsCount int `json:"friends_count"` + IDStr string `json:"id_str"` + ListedCount int `json:"listed_count"` + Name string `json:"name"` + Location string `json:"location"` + PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"` + ProfileBannerURL string `json:"profile_banner_url"` + ProfileImageURLHTTPS string `json:"profile_image_url_https"` + Protected bool `json:"protected"` + ScreenName string `json:"screen_name"` + StatusesCount int `json:"statuses_count"` + Verified bool `json:"verified"` + } + // timeline JSON timeline struct { GlobalObjects struct { @@ -128,6 +160,9 @@ type ( Tweet struct { ID string `json:"id"` } `json:"tweet"` + User struct { + ID string `json:"id"` + } `json:"user"` } `json:"content"` } `json:"item"` Operation struct { @@ -185,5 +220,6 @@ type ( } `json:"timeline"` } - fetchFunc func(user string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) + fetchProfileFunc func(query string, maxProfilesNbr int, cursor string) ([]*Profile, string, error) + fetchTweetFunc func(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) ) diff --git a/util.go b/util.go index 5187dea..70a99d0 100644 --- a/util.go +++ b/util.go @@ -51,23 +51,68 @@ func (s *Scraper) newRequest(method string, url string) (*http.Request, error) { return req, nil } -func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc fetchFunc) <-chan *Result { - channel := make(chan *Result) - go func(user 
string) { +func getUserTimeline(ctx context.Context, query string, maxProfilesNbr int, fetchFunc fetchProfileFunc) <-chan *ProfileResult { + channel := make(chan *ProfileResult) + go func(query string) { + defer close(channel) + var nextCursor string + profilesNbr := 0 + for profilesNbr < maxProfilesNbr { + select { + case <-ctx.Done(): + channel <- &ProfileResult{Error: ctx.Err()} + return + default: + } + + profiles, next, err := fetchFunc(query, maxProfilesNbr, nextCursor) + if err != nil { + channel <- &ProfileResult{Error: err} + return + } + + if len(profiles) == 0 { + break + } + + for _, profile := range profiles { + select { + case <-ctx.Done(): + channel <- &ProfileResult{Error: ctx.Err()} + return + default: + } + + if profilesNbr < maxProfilesNbr { + nextCursor = next + channel <- &ProfileResult{Profile: *profile} + } else { + break + } + profilesNbr++ + } + } + }(query) + return channel +} + +func getTweetTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc fetchTweetFunc) <-chan *TweetResult { + channel := make(chan *TweetResult) + go func(query string) { defer close(channel) var nextCursor string tweetsNbr := 0 for tweetsNbr < maxTweetsNbr { select { case <-ctx.Done(): - channel <- &Result{Error: ctx.Err()} + channel <- &TweetResult{Error: ctx.Err()} return default: } tweets, next, err := fetchFunc(query, maxTweetsNbr, nextCursor) if err != nil { - channel <- &Result{Error: err} + channel <- &TweetResult{Error: err} return } @@ -78,7 +123,7 @@ func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc for _, tweet := range tweets { select { case <-ctx.Done(): - channel <- &Result{Error: ctx.Err()} + channel <- &TweetResult{Error: ctx.Err()} return default: } @@ -88,7 +133,7 @@ func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc continue } nextCursor = next - channel <- &Result{Tweet: *tweet} + channel <- &TweetResult{Tweet: *tweet} } else { break } @@ -99,6 +144,40 @@ func 
getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc return channel } +func parseProfile(user legacyUser) Profile { + profile := Profile{ + Avatar: user.ProfileImageURLHTTPS, + Banner: user.ProfileBannerURL, + Biography: user.Description, + FollowersCount: user.FollowersCount, + FollowingCount: user.FavouritesCount, + FriendsCount: user.FriendsCount, + IsPrivate: user.Protected, + IsVerified: user.Verified, + LikesCount: user.FavouritesCount, + ListedCount: user.ListedCount, + Location: user.Location, + Name: user.Name, + PinnedTweetIDs: user.PinnedTweetIdsStr, + TweetsCount: user.StatusesCount, + URL: "https://twitter.com/" + user.ScreenName, + UserID: user.IDStr, + Username: user.ScreenName, + } + + tm, err := time.Parse(time.RubyDate, user.CreatedAt) + if err == nil { + tm = tm.UTC() + profile.Joined = &tm + } + + if len(user.Entities.URL.Urls) > 0 { + profile.Website = user.Entities.URL.Urls[0].ExpandedURL + } + + return profile +} + func parseTimeline(timeline *timeline) ([]*Tweet, string) { tweets := make(map[string]Tweet) @@ -234,3 +313,28 @@ func parseTimeline(timeline *timeline) ([]*Tweet, string) { } return orderedTweets, cursor } + +func parseUsers(timeline *timeline) ([]*Profile, string) { + users := make(map[string]Profile) + + for id, user := range timeline.GlobalObjects.Users { + users[id] = parseProfile(user) + } + + var cursor string + var orderedProfiles []*Profile + for _, instruction := range timeline.Timeline.Instructions { + for _, entry := range instruction.AddEntries.Entries { + if profile, ok := users[entry.Content.Item.Content.User.ID]; ok { + orderedProfiles = append(orderedProfiles, &profile) + } + if entry.Content.Operation.Cursor.CursorType == "Bottom" { + cursor = entry.Content.Operation.Cursor.Value + } + } + if instruction.ReplaceEntry.Entry.Content.Operation.Cursor.CursorType == "Bottom" { + cursor = instruction.ReplaceEntry.Entry.Content.Operation.Cursor.Value + } + } + return orderedProfiles, cursor +}