Add SearchProfiles

Minor changes and fixes
This commit is contained in:
Alexander Sheiko 2021-04-22 21:38:49 +03:00
parent f3597d0db6
commit 5032ecd29d
9 changed files with 268 additions and 100 deletions

View file

@ -103,6 +103,7 @@ Options:
* `twitterscraper.SearchLatest` - live mode
* `twitterscraper.SearchPhotos` - image mode
* `twitterscraper.SearchVideos` - video mode
* `twitterscraper.SearchUsers` - user mode
### Get profile
@ -124,6 +125,28 @@ func main() {
}
```
### Search profiles by query
```golang
package main
import (
"context"
"fmt"
twitterscraper "github.com/n0madic/twitter-scraper"
)
func main() {
scraper := twitterscraper.New().SetSearchMode(twitterscraper.SearchUsers)
for profile := range scraper.SearchProfiles(context.Background(), "Twitter", 50) {
if profile.Error != nil {
panic(profile.Error)
}
fmt.Println(profile.Name)
}
}
```
### Get trends
```golang

42
api.go
View file

@ -11,42 +11,6 @@ import (
const bearerToken string = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
type user struct {
Data struct {
User struct {
RestID string `json:"rest_id"`
Legacy struct {
CreatedAt string `json:"created_at"`
Description string `json:"description"`
Entities struct {
URL struct {
Urls []struct {
ExpandedURL string `json:"expanded_url"`
} `json:"urls"`
} `json:"url"`
} `json:"entities"`
FavouritesCount int `json:"favourites_count"`
FollowersCount int `json:"followers_count"`
FriendsCount int `json:"friends_count"`
IDStr string `json:"id_str"`
ListedCount int `json:"listed_count"`
Name string `json:"name"`
Location string `json:"location"`
PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"`
ProfileBannerURL string `json:"profile_banner_url"`
ProfileImageURLHTTPS string `json:"profile_image_url_https"`
Protected bool `json:"protected"`
ScreenName string `json:"screen_name"`
StatusesCount int `json:"statuses_count"`
Verified bool `json:"verified"`
} `json:"legacy"`
} `json:"user"`
} `json:"data"`
Errors []struct {
Message string `json:"message"`
} `json:"errors"`
}
// Global cache for user IDs
var cacheIDs sync.Map
@ -70,7 +34,8 @@ func (s *Scraper) RequestAPI(req *http.Request, target interface{}) error {
// private profiles return forbidden, but also data
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusForbidden {
return fmt.Errorf("response status %s", resp.Status)
content, _ := ioutil.ReadAll(resp.Body)
return fmt.Errorf("response status %s: %s", resp.Status, content)
}
if resp.Header.Get("X-Rate-Limit-Remaining") == "0" {
@ -95,7 +60,8 @@ func (s *Scraper) GetGuestToken() error {
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("response status %s", resp.Status)
content, _ := ioutil.ReadAll(resp.Body)
return fmt.Errorf("response status %s: %s", resp.Status, content)
}
body, err := ioutil.ReadAll(resp.Body)
if err != nil {

View file

@ -30,6 +30,18 @@ type Profile struct {
Website string
}
// user mirrors the JSON envelope decoded by GetProfile: the user's rest_id and
// legacy attributes under data.user, plus any messages under "errors".
type user struct {
Data struct {
User struct {
// RestID is checked by GetProfile; an empty value is reported as "rest_id not found".
RestID string `json:"rest_id"`
// Legacy holds the attributes that parseProfile converts into a Profile.
Legacy legacyUser `json:"legacy"`
} `json:"user"`
} `json:"data"`
// Errors lists the entries found under the top-level "errors" key.
Errors []struct {
Message string `json:"message"`
} `json:"errors"`
}
// GetProfile return parsed user profile.
func (s *Scraper) GetProfile(username string) (Profile, error) {
var jsn user
@ -50,44 +62,13 @@ func (s *Scraper) GetProfile(username string) (Profile, error) {
if jsn.Data.User.RestID == "" {
return Profile{}, fmt.Errorf("rest_id not found")
}
jsn.Data.User.Legacy.IDStr = jsn.Data.User.RestID
if jsn.Data.User.Legacy.ScreenName == "" {
return Profile{}, fmt.Errorf("either @%s does not exist or is private", username)
}
user := jsn.Data.User.Legacy
profile := Profile{
Avatar: user.ProfileImageURLHTTPS,
Banner: user.ProfileBannerURL,
Biography: user.Description,
FollowersCount: user.FollowersCount,
FollowingCount: user.FavouritesCount,
FriendsCount: user.FriendsCount,
IsPrivate: user.Protected,
IsVerified: user.Verified,
LikesCount: user.FavouritesCount,
ListedCount: user.ListedCount,
Location: user.Location,
Name: user.Name,
PinnedTweetIDs: user.PinnedTweetIdsStr,
TweetsCount: user.StatusesCount,
URL: "https://twitter.com/" + user.ScreenName,
UserID: jsn.Data.User.RestID,
Username: user.ScreenName,
}
tm, err := time.Parse(time.RubyDate, user.CreatedAt)
if err == nil {
tm = tm.UTC()
profile.Joined = &tm
}
if len(user.Entities.URL.Urls) > 0 {
profile.Website = user.Entities.URL.Urls[0].ExpandedURL
}
return profile, nil
return parseProfile(jsn.Data.User.Legacy), nil
}
// GetProfile wrapper for default scraper

View file

@ -31,6 +31,8 @@ const (
SearchPhotos
// SearchVideos - video mode
SearchVideos
// SearchUsers - user mode
SearchUsers
)
var defaultScraper *Scraper

View file

@ -7,30 +7,40 @@ import (
)
// SearchTweets returns channel with tweets for a given search query
func (s *Scraper) SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *Result {
return getTimeline(ctx, query, maxTweetsNbr, s.FetchSearchTweets)
func (s *Scraper) SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *TweetResult {
return getTweetTimeline(ctx, query, maxTweetsNbr, s.FetchSearchTweets)
}
// SearchTweets wrapper for default Scraper
func SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *Result {
func SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *TweetResult {
return defaultScraper.SearchTweets(ctx, query, maxTweetsNbr)
}
// FetchSearchTweets gets tweets for a given search query, via the Twitter frontend API
func (s *Scraper) FetchSearchTweets(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) {
// SearchProfiles streams profiles matching query over the returned channel.
// Pages are fetched with s.FetchSearchProfiles and at most maxProfilesNbr
// profiles are delivered; the stream is driven by getUserTimeline.
func (s *Scraper) SearchProfiles(ctx context.Context, query string, maxProfilesNbr int) <-chan *ProfileResult {
	fetchPage := s.FetchSearchProfiles
	results := getUserTimeline(ctx, query, maxProfilesNbr, fetchPage)
	return results
}
// SearchProfiles is the package-level convenience form of
// (*Scraper).SearchProfiles; it delegates to the default Scraper.
func SearchProfiles(ctx context.Context, query string, maxProfilesNbr int) <-chan *ProfileResult {
	results := defaultScraper.SearchProfiles(ctx, query, maxProfilesNbr)
	return results
}
// getSearchTimeline gets results for a given search query, via the Twitter frontend API
func (s *Scraper) getSearchTimeline(query string, maxNbr int, cursor string) (*timeline, error) {
query = url.PathEscape(query)
if maxTweetsNbr > 100 {
maxTweetsNbr = 100
if maxNbr > 50 {
maxNbr = 50
}
req, err := s.newRequest("GET", "https://twitter.com/i/api/2/search/adaptive.json")
if err != nil {
return nil, "", err
return nil, err
}
q := req.URL.Query()
q.Add("q", query)
q.Add("count", strconv.Itoa(maxTweetsNbr))
q.Add("count", strconv.Itoa(maxNbr))
q.Add("query_source", "typed_query")
q.Add("pc", "1")
q.Add("spelling_corrections", "1")
@ -44,16 +54,36 @@ func (s *Scraper) FetchSearchTweets(query string, maxTweetsNbr int, cursor strin
q.Add("result_filter", "image")
case SearchVideos:
q.Add("result_filter", "video")
case SearchUsers:
q.Add("result_filter", "user")
}
req.URL.RawQuery = q.Encode()
var timeline timeline
err = s.RequestAPI(req, &timeline)
if err != nil {
return nil, err
}
return &timeline, nil
}
// FetchSearchTweets gets tweets for a given search query, via the Twitter frontend API
func (s *Scraper) FetchSearchTweets(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) {
timeline, err := s.getSearchTimeline(query, maxTweetsNbr, cursor)
if err != nil {
return nil, "", err
}
tweets, nextCursor := parseTimeline(&timeline)
tweets, nextCursor := parseTimeline(timeline)
return tweets, nextCursor, nil
}
// FetchSearchProfiles fetches a single page of user search results for query
// via the Twitter frontend API.
//
// It returns the profiles extracted by parseUsers, the cursor string parseUsers
// found for the following page, and any error from getSearchTimeline.
func (s *Scraper) FetchSearchProfiles(query string, maxProfilesNbr int, cursor string) ([]*Profile, string, error) {
	page, err := s.getSearchTimeline(query, maxProfilesNbr, cursor)
	if err != nil {
		return nil, "", err
	}

	profiles, next := parseUsers(page)
	return profiles, next, nil
}

View file

@ -16,16 +16,42 @@ func TestFetchSearchCursor(t *testing.T) {
t.Fatal(err)
}
if cursor == "" {
t.Fatal("Expected search cursor is not empty")
t.Fatal("Expected search cursor is empty")
}
tweetsNbr += len(tweets)
nextCursor = cursor
}
}
// TestGetSearchProfiles streams user-search results for "Twitter" and verifies
// that every result carries a unique, non-empty UserID and that exactly
// maxProfilesNbr profiles arrive without error.
func TestGetSearchProfiles(t *testing.T) {
	const maxProfilesNbr = 150

	seen := make(map[string]bool)
	received := 0

	scraper := New().SetSearchMode(SearchUsers)
	for result := range scraper.SearchProfiles(context.Background(), "Twitter", maxProfilesNbr) {
		if result.Error != nil {
			t.Error(result.Error)
			continue
		}

		received++
		switch {
		case result.UserID == "":
			t.Error("Expected UserID is empty")
		case seen[result.UserID]:
			t.Errorf("Detect duplicated UserID: %s", result.UserID)
		default:
			seen[result.UserID] = true
		}
	}

	if received != maxProfilesNbr {
		t.Errorf("Expected profiles count=%v, got: %v", maxProfilesNbr, received)
	}
}
func TestGetSearchTweets(t *testing.T) {
count := 0
maxTweetsNbr := 250
maxTweetsNbr := 150
dupcheck := make(map[string]bool)
for tweet := range SearchTweets(context.Background(), "twitter -filter:retweets", maxTweetsNbr) {
if tweet.Error != nil {
@ -33,7 +59,7 @@ func TestGetSearchTweets(t *testing.T) {
} else {
count++
if tweet.ID == "" {
t.Error("Expected tweet ID is not empty")
t.Error("Expected tweet ID is empty")
} else {
if dupcheck[tweet.ID] {
t.Errorf("Detect duplicated tweet ID: %s", tweet.ID)
@ -42,13 +68,13 @@ func TestGetSearchTweets(t *testing.T) {
}
}
if tweet.PermanentURL == "" {
t.Error("Expected tweet PermanentURL is not empty")
t.Error("Expected tweet PermanentURL is empty")
}
if tweet.IsRetweet {
t.Error("Expected tweet IsRetweet is false")
}
if tweet.Text == "" {
t.Error("Expected tweet Text is not empty")
t.Error("Expected tweet Text is empty")
}
}
}

View file

@ -7,12 +7,12 @@ import (
)
// GetTweets returns channel with tweets for a given user.
func (s *Scraper) GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result {
return getTimeline(ctx, user, maxTweetsNbr, s.FetchTweets)
func (s *Scraper) GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *TweetResult {
return getTweetTimeline(ctx, user, maxTweetsNbr, s.FetchTweets)
}
// GetTweets wrapper for default Scraper
func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result {
func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *TweetResult {
return defaultScraper.GetTweets(ctx, user, maxTweetsNbr)
}

View file

@ -43,12 +43,44 @@ type (
Videos []Video
}
// Result of scrapping.
Result struct {
// ProfileResult is one item delivered on a profile channel: either a scraped
// Profile, or a non-nil Error (getUserTimeline stops streaming after sending one).
ProfileResult struct {
Profile
Error error
}
// TweetResult is one item delivered on a tweet channel: either a scraped Tweet
// or a non-nil Error.
TweetResult struct {
Tweet
Error error
}
// legacyUser is the JSON shape of a user's "legacy" attributes. parseProfile
// converts a value of this type into the public Profile.
legacyUser struct {
// CreatedAt is parsed by parseProfile using the time.RubyDate layout.
CreatedAt string `json:"created_at"`
// Description becomes Profile.Biography.
Description string `json:"description"`
// Entities.URL.Urls: parseProfile takes the first ExpandedURL, if any, as Profile.Website.
Entities struct {
URL struct {
Urls []struct {
ExpandedURL string `json:"expanded_url"`
} `json:"urls"`
} `json:"url"`
} `json:"entities"`
// FavouritesCount is used by parseProfile for Profile.LikesCount.
FavouritesCount int `json:"favourites_count"`
FollowersCount int `json:"followers_count"`
// FriendsCount becomes Profile.FriendsCount.
FriendsCount int `json:"friends_count"`
// IDStr becomes Profile.UserID; GetProfile overwrites it with the rest_id before parsing.
IDStr string `json:"id_str"`
ListedCount int `json:"listed_count"`
Name string `json:"name"`
Location string `json:"location"`
PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"`
// ProfileBannerURL and ProfileImageURLHTTPS become Profile.Banner and Profile.Avatar.
ProfileBannerURL string `json:"profile_banner_url"`
ProfileImageURLHTTPS string `json:"profile_image_url_https"`
// Protected becomes Profile.IsPrivate.
Protected bool `json:"protected"`
// ScreenName becomes Profile.Username and the tail of Profile.URL.
ScreenName string `json:"screen_name"`
// StatusesCount becomes Profile.TweetsCount.
StatusesCount int `json:"statuses_count"`
Verified bool `json:"verified"`
}
// timeline JSON
timeline struct {
GlobalObjects struct {
@ -128,6 +160,9 @@ type (
Tweet struct {
ID string `json:"id"`
} `json:"tweet"`
User struct {
ID string `json:"id"`
} `json:"user"`
} `json:"content"`
} `json:"item"`
Operation struct {
@ -185,5 +220,6 @@ type (
} `json:"timeline"`
}
fetchFunc func(user string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error)
// fetchProfileFunc fetches one page of profiles for query starting at cursor and
// returns the profiles, a cursor string and an error. Scraper.FetchSearchProfiles
// has this shape and is what SearchProfiles passes to getUserTimeline.
fetchProfileFunc func(query string, maxProfilesNbr int, cursor string) ([]*Profile, string, error)
// fetchTweetFunc is the tweet counterpart: it fetches one page of tweets for
// query starting at cursor and returns the tweets, a cursor string and an error.
fetchTweetFunc func(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error)
)

118
util.go
View file

@ -51,23 +51,68 @@ func (s *Scraper) newRequest(method string, url string) (*http.Request, error) {
return req, nil
}
func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc fetchFunc) <-chan *Result {
channel := make(chan *Result)
go func(user string) {
// getUserTimeline starts a goroutine that pages through fetchFunc results for
// query and delivers up to maxProfilesNbr profiles on the returned channel.
//
// The channel is closed when the goroutine finishes: after the limit is
// reached, after fetchFunc returns an empty page, after fetchFunc fails (the
// error is sent as the final item), or after ctx is found cancelled (ctx.Err()
// is sent as the final item). Cancellation is checked before every fetch and
// before every profile is sent.
func getUserTimeline(ctx context.Context, query string, maxProfilesNbr int, fetchFunc fetchProfileFunc) <-chan *ProfileResult {
	out := make(chan *ProfileResult)

	go func() {
		defer close(out)

		// cancelled reports ctx cancellation to the consumer and tells the
		// caller whether it has to stop.
		cancelled := func() bool {
			select {
			case <-ctx.Done():
				out <- &ProfileResult{Error: ctx.Err()}
				return true
			default:
				return false
			}
		}

		cursor := ""
		for sent := 0; sent < maxProfilesNbr; {
			if cancelled() {
				return
			}

			page, next, err := fetchFunc(query, maxProfilesNbr, cursor)
			if err != nil {
				out <- &ProfileResult{Error: err}
				return
			}
			if len(page) == 0 {
				return
			}

			for _, item := range page {
				if cancelled() {
					return
				}
				if sent >= maxProfilesNbr {
					break
				}
				// The cursor only advances once something from this page is delivered.
				cursor = next
				out <- &ProfileResult{Profile: *item}
				sent++
			}
		}
	}()

	return out
}
func getTweetTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc fetchTweetFunc) <-chan *TweetResult {
channel := make(chan *TweetResult)
go func(query string) {
defer close(channel)
var nextCursor string
tweetsNbr := 0
for tweetsNbr < maxTweetsNbr {
select {
case <-ctx.Done():
channel <- &Result{Error: ctx.Err()}
channel <- &TweetResult{Error: ctx.Err()}
return
default:
}
tweets, next, err := fetchFunc(query, maxTweetsNbr, nextCursor)
if err != nil {
channel <- &Result{Error: err}
channel <- &TweetResult{Error: err}
return
}
@ -78,7 +123,7 @@ func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc
for _, tweet := range tweets {
select {
case <-ctx.Done():
channel <- &Result{Error: ctx.Err()}
channel <- &TweetResult{Error: ctx.Err()}
return
default:
}
@ -88,7 +133,7 @@ func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc
continue
}
nextCursor = next
channel <- &Result{Tweet: *tweet}
channel <- &TweetResult{Tweet: *tweet}
} else {
break
}
@ -99,6 +144,40 @@ func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc
return channel
}
// parseProfile converts the legacy user attributes returned by the Twitter
// API into the public Profile representation.
//
// Joined is set only when CreatedAt parses with the time.RubyDate layout, and
// is normalised to UTC. Website is the first expanded URL entity, when one is
// present. URL is always "https://twitter.com/" followed by the screen name.
func parseProfile(user legacyUser) Profile {
	profile := Profile{
		Avatar:         user.ProfileImageURLHTTPS,
		Banner:         user.ProfileBannerURL,
		Biography:      user.Description,
		FollowersCount: user.FollowersCount,
		// friends_count is the number of accounts this user follows, so it
		// feeds FollowingCount as well as FriendsCount. favourites_count is
		// the number of likes and belongs only in LikesCount.
		FollowingCount: user.FriendsCount,
		FriendsCount:   user.FriendsCount,
		IsPrivate:      user.Protected,
		IsVerified:     user.Verified,
		LikesCount:     user.FavouritesCount,
		ListedCount:    user.ListedCount,
		Location:       user.Location,
		Name:           user.Name,
		PinnedTweetIDs: user.PinnedTweetIdsStr,
		TweetsCount:    user.StatusesCount,
		URL:            "https://twitter.com/" + user.ScreenName,
		UserID:         user.IDStr,
		Username:       user.ScreenName,
	}

	// An unparsable creation date is not an error: Joined simply stays unset.
	if tm, err := time.Parse(time.RubyDate, user.CreatedAt); err == nil {
		tm = tm.UTC()
		profile.Joined = &tm
	}

	if urls := user.Entities.URL.Urls; len(urls) > 0 {
		profile.Website = urls[0].ExpandedURL
	}

	return profile
}
func parseTimeline(timeline *timeline) ([]*Tweet, string) {
tweets := make(map[string]Tweet)
@ -234,3 +313,28 @@ func parseTimeline(timeline *timeline) ([]*Tweet, string) {
}
return orderedTweets, cursor
}
// parseUsers extracts the profiles referenced by a search timeline, in entry
// order, together with the "Bottom" cursor.
//
// Every user in GlobalObjects.Users is converted with parseProfile; entries
// whose user ID is not in that set are skipped. The returned cursor is the
// value of the last "Bottom" cursor seen, whether it arrived as an added entry
// or as a replaced entry; it is empty when none is present.
func parseUsers(timeline *timeline) ([]*Profile, string) {
	byID := make(map[string]Profile)
	for userID, raw := range timeline.GlobalObjects.Users {
		byID[userID] = parseProfile(raw)
	}

	var (
		ordered []*Profile
		bottom  string
	)
	for _, instr := range timeline.Timeline.Instructions {
		for _, added := range instr.AddEntries.Entries {
			userID := added.Content.Item.Content.User.ID
			if found, known := byID[userID]; known {
				ordered = append(ordered, &found)
			}

			added := added.Content.Operation.Cursor
			if added.CursorType == "Bottom" {
				bottom = added.Value
			}
		}

		replaced := instr.ReplaceEntry.Entry.Content.Operation.Cursor
		if replaced.CursorType == "Bottom" {
			bottom = replaced.Value
		}
	}

	return ordered, bottom
}