Add SearchProfiles

Minor changes and fixes
This commit is contained in:
Alexander Sheiko 2021-04-22 21:38:49 +03:00
parent f3597d0db6
commit 5032ecd29d
9 changed files with 268 additions and 100 deletions

View file

@ -103,6 +103,7 @@ Options:
* `twitterscraper.SearchLatest` - live mode * `twitterscraper.SearchLatest` - live mode
* `twitterscraper.SearchPhotos` - image mode * `twitterscraper.SearchPhotos` - image mode
* `twitterscraper.SearchVideos` - video mode * `twitterscraper.SearchVideos` - video mode
* `twitterscraper.SearchUsers` - user mode
### Get profile ### Get profile
@ -124,6 +125,28 @@ func main() {
} }
``` ```
### Search profiles by query
```golang
package main
import (
"context"
"fmt"
twitterscraper "github.com/n0madic/twitter-scraper"
)
// main searches for up to 50 profiles matching "Twitter" and prints the
// name of each one, panicking on the first error delivered by the channel.
func main() {
	// SearchUsers is the search mode constant; the method that returns the
	// channel of profile results is SearchProfiles.
	scraper := twitterscraper.New().SetSearchMode(twitterscraper.SearchUsers)
	for profile := range scraper.SearchProfiles(context.Background(), "Twitter", 50) {
		if profile.Error != nil {
			panic(profile.Error)
		}
		fmt.Println(profile.Name)
	}
}
```
### Get trends ### Get trends
```golang ```golang

42
api.go
View file

@ -11,42 +11,6 @@ import (
const bearerToken string = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA" const bearerToken string = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
// user is the JSON envelope decoded by GetProfile: the user object found
// under data.user plus any error messages carried in the response body.
type user struct {
Data struct {
User struct {
// RestID is the user identifier taken from the "rest_id" key.
RestID string `json:"rest_id"`
// Legacy holds the profile attributes found under the "legacy" key.
Legacy struct {
CreatedAt string `json:"created_at"`
Description string `json:"description"`
// Entities.URL.Urls lists the expanded URLs attached to the profile.
Entities struct {
URL struct {
Urls []struct {
ExpandedURL string `json:"expanded_url"`
} `json:"urls"`
} `json:"url"`
} `json:"entities"`
FavouritesCount int `json:"favourites_count"`
FollowersCount int `json:"followers_count"`
FriendsCount int `json:"friends_count"`
IDStr string `json:"id_str"`
ListedCount int `json:"listed_count"`
Name string `json:"name"`
Location string `json:"location"`
PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"`
ProfileBannerURL string `json:"profile_banner_url"`
ProfileImageURLHTTPS string `json:"profile_image_url_https"`
Protected bool `json:"protected"`
ScreenName string `json:"screen_name"`
StatusesCount int `json:"statuses_count"`
Verified bool `json:"verified"`
} `json:"legacy"`
} `json:"user"`
} `json:"data"`
// Errors lists the error messages included in the response, if any.
Errors []struct {
Message string `json:"message"`
} `json:"errors"`
}
// Global cache for user IDs // Global cache for user IDs
var cacheIDs sync.Map var cacheIDs sync.Map
@ -70,7 +34,8 @@ func (s *Scraper) RequestAPI(req *http.Request, target interface{}) error {
// private profiles return forbidden, but also data // private profiles return forbidden, but also data
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusForbidden { if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusForbidden {
return fmt.Errorf("response status %s", resp.Status) content, _ := ioutil.ReadAll(resp.Body)
return fmt.Errorf("response status %s: %s", resp.Status, content)
} }
if resp.Header.Get("X-Rate-Limit-Remaining") == "0" { if resp.Header.Get("X-Rate-Limit-Remaining") == "0" {
@ -95,7 +60,8 @@ func (s *Scraper) GetGuestToken() error {
defer resp.Body.Close() defer resp.Body.Close()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
return fmt.Errorf("response status %s", resp.Status) content, _ := ioutil.ReadAll(resp.Body)
return fmt.Errorf("response status %s: %s", resp.Status, content)
} }
body, err := ioutil.ReadAll(resp.Body) body, err := ioutil.ReadAll(resp.Body)
if err != nil { if err != nil {

View file

@ -30,6 +30,18 @@ type Profile struct {
Website string Website string
} }
// user is the JSON envelope decoded by GetProfile: the user object found
// under data.user plus any error messages carried in the response body.
type user struct {
Data struct {
User struct {
// RestID is the user identifier; GetProfile rejects responses where it is empty.
RestID string `json:"rest_id"`
// Legacy holds the profile attributes that parseProfile converts to a Profile.
Legacy legacyUser `json:"legacy"`
} `json:"user"`
} `json:"data"`
// Errors lists the error messages included in the response, if any.
Errors []struct {
Message string `json:"message"`
} `json:"errors"`
}
// GetProfile return parsed user profile. // GetProfile return parsed user profile.
func (s *Scraper) GetProfile(username string) (Profile, error) { func (s *Scraper) GetProfile(username string) (Profile, error) {
var jsn user var jsn user
@ -50,44 +62,13 @@ func (s *Scraper) GetProfile(username string) (Profile, error) {
if jsn.Data.User.RestID == "" { if jsn.Data.User.RestID == "" {
return Profile{}, fmt.Errorf("rest_id not found") return Profile{}, fmt.Errorf("rest_id not found")
} }
jsn.Data.User.Legacy.IDStr = jsn.Data.User.RestID
if jsn.Data.User.Legacy.ScreenName == "" { if jsn.Data.User.Legacy.ScreenName == "" {
return Profile{}, fmt.Errorf("either @%s does not exist or is private", username) return Profile{}, fmt.Errorf("either @%s does not exist or is private", username)
} }
user := jsn.Data.User.Legacy return parseProfile(jsn.Data.User.Legacy), nil
profile := Profile{
Avatar: user.ProfileImageURLHTTPS,
Banner: user.ProfileBannerURL,
Biography: user.Description,
FollowersCount: user.FollowersCount,
FollowingCount: user.FavouritesCount,
FriendsCount: user.FriendsCount,
IsPrivate: user.Protected,
IsVerified: user.Verified,
LikesCount: user.FavouritesCount,
ListedCount: user.ListedCount,
Location: user.Location,
Name: user.Name,
PinnedTweetIDs: user.PinnedTweetIdsStr,
TweetsCount: user.StatusesCount,
URL: "https://twitter.com/" + user.ScreenName,
UserID: jsn.Data.User.RestID,
Username: user.ScreenName,
}
tm, err := time.Parse(time.RubyDate, user.CreatedAt)
if err == nil {
tm = tm.UTC()
profile.Joined = &tm
}
if len(user.Entities.URL.Urls) > 0 {
profile.Website = user.Entities.URL.Urls[0].ExpandedURL
}
return profile, nil
} }
// GetProfile wrapper for default scraper // GetProfile wrapper for default scraper

View file

@ -31,6 +31,8 @@ const (
SearchPhotos SearchPhotos
// SearchVideos - video mode // SearchVideos - video mode
SearchVideos SearchVideos
// SearchUsers - user mode
SearchUsers
) )
var defaultScraper *Scraper var defaultScraper *Scraper

View file

@ -7,30 +7,40 @@ import (
) )
// SearchTweets returns channel with tweets for a given search query // SearchTweets returns channel with tweets for a given search query
func (s *Scraper) SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *Result { func (s *Scraper) SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *TweetResult {
return getTimeline(ctx, query, maxTweetsNbr, s.FetchSearchTweets) return getTweetTimeline(ctx, query, maxTweetsNbr, s.FetchSearchTweets)
} }
// SearchTweets wrapper for default Scraper // SearchTweets wrapper for default Scraper
func SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *Result { func SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *TweetResult {
return defaultScraper.SearchTweets(ctx, query, maxTweetsNbr) return defaultScraper.SearchTweets(ctx, query, maxTweetsNbr)
} }
// FetchSearchTweets gets tweets for a given search query, via the Twitter frontend API // SearchProfiles returns channel with profiles for a given search query
func (s *Scraper) FetchSearchTweets(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) { func (s *Scraper) SearchProfiles(ctx context.Context, query string, maxProfilesNbr int) <-chan *ProfileResult {
return getUserTimeline(ctx, query, maxProfilesNbr, s.FetchSearchProfiles)
}
// SearchProfiles forwards the query to the SearchProfiles method of the
// package-level default Scraper and returns its result channel.
func SearchProfiles(ctx context.Context, query string, maxProfilesNbr int) <-chan *ProfileResult {
	results := defaultScraper.SearchProfiles(ctx, query, maxProfilesNbr)
	return results
}
// getSearchTimeline gets results for a given search query, via the Twitter frontend API
func (s *Scraper) getSearchTimeline(query string, maxNbr int, cursor string) (*timeline, error) {
query = url.PathEscape(query) query = url.PathEscape(query)
if maxTweetsNbr > 100 { if maxNbr > 50 {
maxTweetsNbr = 100 maxNbr = 50
} }
req, err := s.newRequest("GET", "https://twitter.com/i/api/2/search/adaptive.json") req, err := s.newRequest("GET", "https://twitter.com/i/api/2/search/adaptive.json")
if err != nil { if err != nil {
return nil, "", err return nil, err
} }
q := req.URL.Query() q := req.URL.Query()
q.Add("q", query) q.Add("q", query)
q.Add("count", strconv.Itoa(maxTweetsNbr)) q.Add("count", strconv.Itoa(maxNbr))
q.Add("query_source", "typed_query") q.Add("query_source", "typed_query")
q.Add("pc", "1") q.Add("pc", "1")
q.Add("spelling_corrections", "1") q.Add("spelling_corrections", "1")
@ -44,16 +54,36 @@ func (s *Scraper) FetchSearchTweets(query string, maxTweetsNbr int, cursor strin
q.Add("result_filter", "image") q.Add("result_filter", "image")
case SearchVideos: case SearchVideos:
q.Add("result_filter", "video") q.Add("result_filter", "video")
case SearchUsers:
q.Add("result_filter", "user")
} }
req.URL.RawQuery = q.Encode() req.URL.RawQuery = q.Encode()
var timeline timeline var timeline timeline
err = s.RequestAPI(req, &timeline) err = s.RequestAPI(req, &timeline)
if err != nil {
return nil, err
}
return &timeline, nil
}
// FetchSearchTweets gets tweets for a given search query, via the Twitter frontend API
func (s *Scraper) FetchSearchTweets(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) {
timeline, err := s.getSearchTimeline(query, maxTweetsNbr, cursor)
if err != nil { if err != nil {
return nil, "", err return nil, "", err
} }
tweets, nextCursor := parseTimeline(timeline)
tweets, nextCursor := parseTimeline(&timeline)
return tweets, nextCursor, nil return tweets, nextCursor, nil
} }
// FetchSearchProfiles requests one page of search results through
// getSearchTimeline and returns the profiles and next cursor that parseUsers
// extracts from it. On a request error it returns a nil slice, an empty
// cursor and that error.
func (s *Scraper) FetchSearchProfiles(query string, maxProfilesNbr int, cursor string) ([]*Profile, string, error) {
	page, err := s.getSearchTimeline(query, maxProfilesNbr, cursor)
	if err != nil {
		return nil, "", err
	}

	profiles, next := parseUsers(page)
	return profiles, next, nil
}

View file

@ -16,16 +16,42 @@ func TestFetchSearchCursor(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
if cursor == "" { if cursor == "" {
t.Fatal("Expected search cursor is not empty") t.Fatal("Expected search cursor is empty")
} }
tweetsNbr += len(tweets) tweetsNbr += len(tweets)
nextCursor = cursor nextCursor = cursor
} }
} }
// TestGetSearchProfiles streams user-mode search results for "Twitter" and
// checks that every delivered profile has a non-empty UserID that was not
// seen before, and that exactly maxProfilesNbr profiles arrive.
func TestGetSearchProfiles(t *testing.T) {
	maxProfilesNbr := 150
	received := 0
	seen := make(map[string]bool)

	searcher := New().SetSearchMode(SearchUsers)
	for result := range searcher.SearchProfiles(context.Background(), "Twitter", maxProfilesNbr) {
		if result.Error != nil {
			t.Error(result.Error)
			continue
		}

		// Only error-free results count towards the expected total.
		received++
		switch id := result.UserID; {
		case id == "":
			t.Error("Expected UserID is empty")
		case seen[id]:
			t.Errorf("Detect duplicated UserID: %s", id)
		default:
			seen[id] = true
		}
	}

	if received != maxProfilesNbr {
		t.Errorf("Expected profiles count=%v, got: %v", maxProfilesNbr, received)
	}
}
func TestGetSearchTweets(t *testing.T) { func TestGetSearchTweets(t *testing.T) {
count := 0 count := 0
maxTweetsNbr := 250 maxTweetsNbr := 150
dupcheck := make(map[string]bool) dupcheck := make(map[string]bool)
for tweet := range SearchTweets(context.Background(), "twitter -filter:retweets", maxTweetsNbr) { for tweet := range SearchTweets(context.Background(), "twitter -filter:retweets", maxTweetsNbr) {
if tweet.Error != nil { if tweet.Error != nil {
@ -33,7 +59,7 @@ func TestGetSearchTweets(t *testing.T) {
} else { } else {
count++ count++
if tweet.ID == "" { if tweet.ID == "" {
t.Error("Expected tweet ID is not empty") t.Error("Expected tweet ID is empty")
} else { } else {
if dupcheck[tweet.ID] { if dupcheck[tweet.ID] {
t.Errorf("Detect duplicated tweet ID: %s", tweet.ID) t.Errorf("Detect duplicated tweet ID: %s", tweet.ID)
@ -42,13 +68,13 @@ func TestGetSearchTweets(t *testing.T) {
} }
} }
if tweet.PermanentURL == "" { if tweet.PermanentURL == "" {
t.Error("Expected tweet PermanentURL is not empty") t.Error("Expected tweet PermanentURL is empty")
} }
if tweet.IsRetweet { if tweet.IsRetweet {
t.Error("Expected tweet IsRetweet is false") t.Error("Expected tweet IsRetweet is false")
} }
if tweet.Text == "" { if tweet.Text == "" {
t.Error("Expected tweet Text is not empty") t.Error("Expected tweet Text is empty")
} }
} }
} }

View file

@ -7,12 +7,12 @@ import (
) )
// GetTweets returns channel with tweets for a given user. // GetTweets returns channel with tweets for a given user.
func (s *Scraper) GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result { func (s *Scraper) GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *TweetResult {
return getTimeline(ctx, user, maxTweetsNbr, s.FetchTweets) return getTweetTimeline(ctx, user, maxTweetsNbr, s.FetchTweets)
} }
// GetTweets wrapper for default Scraper // GetTweets wrapper for default Scraper
func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result { func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *TweetResult {
return defaultScraper.GetTweets(ctx, user, maxTweetsNbr) return defaultScraper.GetTweets(ctx, user, maxTweetsNbr)
} }

View file

@ -43,12 +43,44 @@ type (
Videos []Video Videos []Video
} }
// Result of scrapping. // ProfileResult of scrapping.
Result struct { ProfileResult struct {
Profile
Error error
}
// TweetResult of scrapping.
TweetResult struct {
Tweet Tweet
Error error Error error
} }
// legacyUser holds the profile attributes decoded from a user's JSON
// object; parseProfile converts it into a Profile.
legacyUser struct {
// CreatedAt is parsed by parseProfile using the time.RubyDate layout.
CreatedAt string `json:"created_at"`
Description string `json:"description"`
// Entities.URL.Urls lists expanded URLs; parseProfile uses the first as Website.
Entities struct {
URL struct {
Urls []struct {
ExpandedURL string `json:"expanded_url"`
} `json:"urls"`
} `json:"url"`
} `json:"entities"`
FavouritesCount int `json:"favourites_count"`
FollowersCount int `json:"followers_count"`
FriendsCount int `json:"friends_count"`
// IDStr is copied into Profile.UserID by parseProfile.
IDStr string `json:"id_str"`
ListedCount int `json:"listed_count"`
Name string `json:"name"`
Location string `json:"location"`
PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"`
ProfileBannerURL string `json:"profile_banner_url"`
ProfileImageURLHTTPS string `json:"profile_image_url_https"`
Protected bool `json:"protected"`
ScreenName string `json:"screen_name"`
StatusesCount int `json:"statuses_count"`
Verified bool `json:"verified"`
}
// timeline JSON // timeline JSON
timeline struct { timeline struct {
GlobalObjects struct { GlobalObjects struct {
@ -128,6 +160,9 @@ type (
Tweet struct { Tweet struct {
ID string `json:"id"` ID string `json:"id"`
} `json:"tweet"` } `json:"tweet"`
User struct {
ID string `json:"id"`
} `json:"user"`
} `json:"content"` } `json:"content"`
} `json:"item"` } `json:"item"`
Operation struct { Operation struct {
@ -185,5 +220,6 @@ type (
} `json:"timeline"` } `json:"timeline"`
} }
fetchFunc func(user string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) fetchProfileFunc func(query string, maxProfilesNbr int, cursor string) ([]*Profile, string, error)
fetchTweetFunc func(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error)
) )

118
util.go
View file

@ -51,23 +51,68 @@ func (s *Scraper) newRequest(method string, url string) (*http.Request, error) {
return req, nil return req, nil
} }
// getUserTimeline starts a goroutine that pages through fetchFunc results and
// streams each profile on the returned unbuffered channel, which is closed
// when the goroutine exits. It stops once maxProfilesNbr profiles were sent,
// when a page comes back empty, when fetchFunc fails (the error is sent as a
// ProfileResult), or when ctx is done (ctx.Err() is sent as a ProfileResult).
// NOTE(review): every send is a plain blocking send on an unbuffered channel,
// so a consumer that stops receiving looks like it would leave this goroutine
// blocked — confirm callers always drain the channel.
func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc fetchFunc) <-chan *Result { func getUserTimeline(ctx context.Context, query string, maxProfilesNbr int, fetchFunc fetchProfileFunc) <-chan *ProfileResult {
channel := make(chan *Result) channel := make(chan *ProfileResult)
go func(user string) { go func(query string) {
defer close(channel)
var nextCursor string
profilesNbr := 0
for profilesNbr < maxProfilesNbr {
// Non-blocking cancellation check before requesting the next page.
select {
case <-ctx.Done():
channel <- &ProfileResult{Error: ctx.Err()}
return
default:
}
profiles, next, err := fetchFunc(query, maxProfilesNbr, nextCursor)
if err != nil {
channel <- &ProfileResult{Error: err}
return
}
// An empty page ends the outer paging loop.
if len(profiles) == 0 {
break
}
for _, profile := range profiles {
// Non-blocking cancellation check before each delivery.
select {
case <-ctx.Done():
channel <- &ProfileResult{Error: ctx.Err()}
return
default:
}
if profilesNbr < maxProfilesNbr {
// The cursor for the next page is recorded whenever a profile is delivered.
nextCursor = next
channel <- &ProfileResult{Profile: *profile}
} else {
break
}
profilesNbr++
}
}
}(query)
return channel
}
func getTweetTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc fetchTweetFunc) <-chan *TweetResult {
channel := make(chan *TweetResult)
go func(query string) {
defer close(channel) defer close(channel)
var nextCursor string var nextCursor string
tweetsNbr := 0 tweetsNbr := 0
for tweetsNbr < maxTweetsNbr { for tweetsNbr < maxTweetsNbr {
select { select {
case <-ctx.Done(): case <-ctx.Done():
channel <- &Result{Error: ctx.Err()} channel <- &TweetResult{Error: ctx.Err()}
return return
default: default:
} }
tweets, next, err := fetchFunc(query, maxTweetsNbr, nextCursor) tweets, next, err := fetchFunc(query, maxTweetsNbr, nextCursor)
if err != nil { if err != nil {
channel <- &Result{Error: err} channel <- &TweetResult{Error: err}
return return
} }
@ -78,7 +123,7 @@ func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc
for _, tweet := range tweets { for _, tweet := range tweets {
select { select {
case <-ctx.Done(): case <-ctx.Done():
channel <- &Result{Error: ctx.Err()} channel <- &TweetResult{Error: ctx.Err()}
return return
default: default:
} }
@ -88,7 +133,7 @@ func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc
continue continue
} }
nextCursor = next nextCursor = next
channel <- &Result{Tweet: *tweet} channel <- &TweetResult{Tweet: *tweet}
} else { } else {
break break
} }
@ -99,6 +144,40 @@ func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc
return channel return channel
} }
// parseProfile converts the raw attributes of a user into a Profile.
//
// FollowingCount is taken from friends_count (the accounts this user
// follows), while favourites_count feeds LikesCount only. Joined stays nil
// when CreatedAt does not parse with the time.RubyDate layout; otherwise it
// is set to the parsed time converted to UTC. Website is the first expanded
// URL of the profile's URL entity, or empty when there is none.
func parseProfile(user legacyUser) Profile {
	profile := Profile{
		Avatar:         user.ProfileImageURLHTTPS,
		Banner:         user.ProfileBannerURL,
		Biography:      user.Description,
		FollowersCount: user.FollowersCount,
		// Previously this was filled from FavouritesCount, duplicating LikesCount.
		FollowingCount: user.FriendsCount,
		FriendsCount:   user.FriendsCount,
		IsPrivate:      user.Protected,
		IsVerified:     user.Verified,
		LikesCount:     user.FavouritesCount,
		ListedCount:    user.ListedCount,
		Location:       user.Location,
		Name:           user.Name,
		PinnedTweetIDs: user.PinnedTweetIdsStr,
		TweetsCount:    user.StatusesCount,
		URL:            "https://twitter.com/" + user.ScreenName,
		UserID:         user.IDStr,
		Username:       user.ScreenName,
	}

	// An unparsable creation date is not an error; Joined is simply left unset.
	if tm, err := time.Parse(time.RubyDate, user.CreatedAt); err == nil {
		tm = tm.UTC()
		profile.Joined = &tm
	}

	if urls := user.Entities.URL.Urls; len(urls) > 0 {
		profile.Website = urls[0].ExpandedURL
	}

	return profile
}
func parseTimeline(timeline *timeline) ([]*Tweet, string) { func parseTimeline(timeline *timeline) ([]*Tweet, string) {
tweets := make(map[string]Tweet) tweets := make(map[string]Tweet)
@ -234,3 +313,28 @@ func parseTimeline(timeline *timeline) ([]*Tweet, string) {
} }
return orderedTweets, cursor return orderedTweets, cursor
} }
// parseUsers extracts profiles from a search timeline. Every user in the
// timeline's global objects is converted with parseProfile; the result lists
// them in the order the timeline entries reference them by user ID. The
// second return value is the value of the last cursor of type "Bottom"
// found in the instructions, or an empty string when there is none.
func parseUsers(timeline *timeline) ([]*Profile, string) {
	byID := make(map[string]Profile, len(timeline.GlobalObjects.Users))
	for userID, raw := range timeline.GlobalObjects.Users {
		byID[userID] = parseProfile(raw)
	}

	var (
		ordered      []*Profile
		bottomCursor string
	)
	for _, instr := range timeline.Timeline.Instructions {
		for _, entry := range instr.AddEntries.Entries {
			// found is a fresh copy per entry, so taking its address is safe.
			if found, ok := byID[entry.Content.Item.Content.User.ID]; ok {
				ordered = append(ordered, &found)
			}
			if entry.Content.Operation.Cursor.CursorType == "Bottom" {
				bottomCursor = entry.Content.Operation.Cursor.Value
			}
		}

		// A replaced entry is checked after the added ones, so it wins when both carry a cursor.
		if instr.ReplaceEntry.Entry.Content.Operation.Cursor.CursorType == "Bottom" {
			bottomCursor = instr.ReplaceEntry.Entry.Content.Operation.Cursor.Value
		}
	}
	return ordered, bottomCursor
}