Total refactoring

Used guest frontend API
BREAKING CHANGE: remove tweet.HTML property
Loading more information
Minor fixes and changes
This commit is contained in:
Alexander Sheiko 2020-12-11 20:58:49 +02:00
parent 1c582e142e
commit edad8f6393
15 changed files with 628 additions and 497 deletions

View file

@ -1,7 +1,5 @@
# Twitter Scraper # Twitter Scraper
Golang implementation of python library <https://github.com/bisguzar/twitter-scraper>
Twitter's API is annoying to work with, and has lots of limitations — Twitter's API is annoying to work with, and has lots of limitations —
luckily their frontend (JavaScript) has its own API, which I reverse-engineered. luckily their frontend (JavaScript) has its own API, which I reverse-engineered.
No API rate limits. No tokens needed. No restrictions. Extremely fast. No API rate limits. No tokens needed. No restrictions. Extremely fast.
@ -32,12 +30,12 @@ func main() {
if tweet.Error != nil { if tweet.Error != nil {
panic(tweet.Error) panic(tweet.Error)
} }
fmt.Println(tweet.HTML) fmt.Println(tweet.Text)
} }
} }
``` ```
It appears you can ask for up to 50 tweets. It appears you can ask for up to 50 tweets (limit ~3200 tweets).
### Search tweets by query standard operators ### Search tweets by query standard operators
@ -58,32 +56,11 @@ func main() {
if tweet.Error != nil { if tweet.Error != nil {
panic(tweet.Error) panic(tweet.Error)
} }
fmt.Println(tweet.HTML) fmt.Println(tweet.Text)
} }
} }
``` ```
#### With http proxy
```golang
package main
import (
"context"
"fmt"
twitterscraper "github.com/n0madic/twitter-scraper"
)
func main() {
twitterscraper.SetProxy("http://localhost:16379")
for tweet := range twitterscraper.SearchTweets(context.Background(),
"twitter scraper data -filter:retweets", 50) {
if tweet.Error != nil {
panic(tweet.Error)
}
fmt.Println(tweet.HTML)
}
}
```
The search ends if we have 50 tweets. The search ends if we have 50 tweets.
See [Rules and filtering](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators) for building standard queries. See [Rules and filtering](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators) for building standard queries.
@ -125,3 +102,15 @@ func main() {
fmt.Println(trends) fmt.Println(trends)
} }
``` ```
### Use http proxy
```golang
twitterscraper.SetProxy("http://localhost:3128")
```
### Load timeline with tweet replies
```golang
twitterscraper.IncludeReplies = true
```

105
api.go Normal file
View file

@ -0,0 +1,105 @@
package twitterscraper
import (
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"sync"
)
// bearerToken is the fixed application token sent in the "Authorization: Bearer"
// header of every request made from this file (guest-token activation and API calls).
// NOTE(review): presumably the token embedded in Twitter's own web client — confirm.
const bearerToken string = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
// user is the decoding target for the UserByScreenName GraphQL response.
// Only the path data.user.rest_id is declared.
type user struct {
Data struct {
User struct {
// RestID is the value GetUserIDByScreenName returns as the user ID.
RestID string `json:"rest_id"`
} `json:"user"`
} `json:"data"`
}
var (
// guestToken is the value sent in the X-Guest-Token header. It stays empty
// until GetGuestToken has stored a token.
guestToken string
// cacheIDs maps a screen name to its resolved user ID (both stored as strings)
// so GetUserIDByScreenName can skip the request on repeated lookups.
cacheIDs sync.Map
)
// requestAPI sends req with the bearer and guest-token headers set and decodes
// the JSON response body into target.
//
// When no guest token is stored yet, one is fetched first via GetGuestToken.
// target must be a pointer accepted by json.Decoder.Decode.
//
// It returns an error when the guest token cannot be obtained, the request
// fails, the server answers with a status other than 200 OK, or the body cannot
// be decoded into target.
func requestAPI(req *http.Request, target interface{}) error {
	if guestToken == "" {
		if err := GetGuestToken(); err != nil {
			return err
		}
	}

	req.Header.Set("Authorization", "Bearer "+bearerToken)
	req.Header.Set("X-Guest-Token", guestToken)

	resp, err := newHTTPClient().Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// An error response can still carry a JSON body that decodes without error,
	// leaving target empty. Report the status instead of handing callers
	// silently empty data. The message format matches GetGuestToken.
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("response status %s", resp.Status)
	}

	return json.NewDecoder(resp.Body).Decode(target)
}
// GetGuestToken requests a guest token from the activation endpoint and stores
// it in the package-level guestToken.
//
// It returns an error when the request cannot be built or sent, the response
// status is not 200 OK, the body cannot be read or parsed as a JSON object, or
// the object has no string "guest_token" entry.
func GetGuestToken() error {
	activation, err := http.NewRequest("POST", "https://api.twitter.com/1.1/guest/activate.json", nil)
	if err != nil {
		return err
	}
	activation.Header.Set("Authorization", "Bearer "+bearerToken)

	resp, err := newHTTPClient().Do(activation)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("response status %s", resp.Status)
	}

	raw, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}

	payload := map[string]interface{}{}
	if err = json.Unmarshal(raw, &payload); err != nil {
		return err
	}

	// The stored token is overwritten even when the entry is missing or not a
	// string; in that case it becomes empty.
	token, isString := payload["guest_token"].(string)
	guestToken = token
	if !isString {
		return fmt.Errorf("guest_token not found")
	}
	return nil
}
// GetUserIDByScreenName resolves a screen name to the user's ID (the rest_id
// field of the UserByScreenName GraphQL response).
//
// Successful lookups are stored in cacheIDs under the screen name exactly as
// given, so a repeated call with the same string returns without a request.
//
// It returns an error when the request cannot be built or sent (see requestAPI)
// or when the response carries no rest_id.
func GetUserIDByScreenName(screenName string) (string, error) {
	if id, ok := cacheIDs.Load(screenName); ok {
		return id.(string), nil
	}

	// Build the GraphQL variables with the JSON encoder rather than by string
	// concatenation, so quotes or backslashes in screenName cannot break the
	// JSON document.
	variables, err := json.Marshal(struct {
		ScreenName           string `json:"screen_name"`
		WithHighlightedLabel bool   `json:"withHighlightedLabel"`
	}{ScreenName: screenName, WithHighlightedLabel: true})
	if err != nil {
		return "", err
	}

	req, err := http.NewRequest("GET", "https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName", nil)
	if err != nil {
		return "", err
	}

	// Let url.Values do the percent-encoding so characters such as '&', '#',
	// '%' or spaces in screenName cannot truncate or corrupt the query string.
	q := req.URL.Query()
	q.Set("variables", string(variables))
	req.URL.RawQuery = q.Encode()

	var jsn user
	if err := requestAPI(req, &jsn); err != nil {
		return "", err
	}

	if jsn.Data.User.RestID == "" {
		return "", fmt.Errorf("rest_id not found")
	}

	cacheIDs.Store(screenName, jsn.Data.User.RestID)
	return jsn.Data.User.RestID, nil
}

24
api_test.go Normal file
View file

@ -0,0 +1,24 @@
package twitterscraper
import (
"testing"
)
// TestGetGuestToken checks that GetGuestToken succeeds and leaves a non-empty
// guestToken behind. GetGuestToken performs a real HTTP request, so this test
// needs network access.
func TestGetGuestToken(t *testing.T) {
	if err := GetGuestToken(); err != nil {
		// Stop here: the token check below would only repeat the same failure.
		t.Fatalf("GetGuestToken() error = %v", err)
	}
	if guestToken == "" {
		t.Error("Expected non-empty guestToken")
	}
}
// TestGetUserIDByScreenName checks that the "Twitter" screen name resolves to a
// non-empty user ID. The lookup goes through requestAPI, so this test needs
// network access.
func TestGetUserIDByScreenName(t *testing.T) {
	userID, err := GetUserIDByScreenName("Twitter")
	if err != nil {
		// Stop here: userID is meaningless once the lookup has failed.
		t.Fatalf("GetUserIDByScreenName() error = %v", err)
	}
	if userID == "" {
		t.Error("Expected non-empty user ID")
	}
}

5
go.mod
View file

@ -2,7 +2,4 @@ module github.com/n0madic/twitter-scraper
go 1.13 go 1.13
require ( require github.com/google/go-cmp v0.5.4
github.com/PuerkitoBio/goquery v1.5.1
github.com/google/go-cmp v0.4.0
)

14
go.sum
View file

@ -1,14 +1,4 @@
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

View file

@ -2,13 +2,7 @@ package twitterscraper
import ( import (
"fmt" "fmt"
"net"
"net/http"
"strconv"
"strings"
"time" "time"
"github.com/PuerkitoBio/goquery"
) )
// Profile of twitter user. // Profile of twitter user.
@ -19,12 +13,15 @@ type Profile struct {
Birthday string Birthday string
FollowersCount int FollowersCount int
FollowingCount int FollowingCount int
FriendsCount int
IsPrivate bool IsPrivate bool
IsVerified bool IsVerified bool
Joined *time.Time Joined *time.Time
LikesCount int LikesCount int
ListedCount int
Location string Location string
Name string Name string
PinnedTweetIDs []string
TweetsCount int TweetsCount int
URL string URL string
UserID string UserID string
@ -34,66 +31,61 @@ type Profile struct {
// GetProfile return parsed user profile. // GetProfile return parsed user profile.
func GetProfile(username string) (Profile, error) { func GetProfile(username string) (Profile, error) {
url := "https://mobile.twitter.com/" + username userID, err := GetUserIDByScreenName(username)
client := http.DefaultClient
if HTTPProxy != nil {
client = &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(HTTPProxy),
DialContext: (&net.Dialer{
Timeout: 10 * time.Second,
}).DialContext,
},
}
}
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return Profile{}, err
}
req.Header.Set("Accept-Language", "en-US")
resp, err := client.Do(req)
if resp == nil {
return Profile{}, err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return Profile{}, fmt.Errorf("response status: %s", resp.Status)
}
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil { if err != nil {
return Profile{}, err return Profile{}, err
} }
// parse join date text req, err := newRequest("GET", "https://twitter.com/i/api/2/timeline/profile/"+userID+".json")
screenName := doc.Find(".screen-name").First().Text() if err != nil {
return Profile{}, err
}
// check is username valid q := req.URL.Query()
if screenName == "" { q.Add("count", "20")
q.Add("userId", userID)
req.URL.RawQuery = q.Encode()
var timeline timeline
err = requestAPI(req, &timeline)
if err != nil {
return Profile{}, err
}
user, found := timeline.GlobalObjects.Users[userID]
if !found {
return Profile{}, fmt.Errorf("either @%s does not exist or is private", username) return Profile{}, fmt.Errorf("either @%s does not exist or is private", username)
} }
return Profile{ profile := Profile{
Avatar: doc.Find("td.avatar > img").First().AttrOr("src", ""), Avatar: user.ProfileImageURLHTTPS,
Biography: strings.TrimSpace(doc.Find(".bio").First().Text()), Banner: user.ProfileBannerURL,
FollowersCount: parseCount(doc.Find("table.profile-stats > tbody > tr > td:nth-child(3) > a > div.statnum").First().Text()), Biography: user.Description,
FollowingCount: parseCount(doc.Find("table.profile-stats > tbody > tr > td:nth-child(2) > a > div.statnum").First().Text()), FollowersCount: user.FollowersCount,
IsPrivate: strings.Contains(doc.Find("div.fullname > a.badge > img").First().AttrOr("src", ""), "protected"), FollowingCount: user.FavouritesCount,
IsVerified: strings.Contains(doc.Find("div.fullname > a.badge > img").First().AttrOr("src", ""), "verified"), FriendsCount: user.FriendsCount,
Location: strings.TrimSpace(doc.Find(".location").First().Text()), IsPrivate: user.Protected,
Name: strings.TrimSpace(doc.Find(".fullname").First().Text()), IsVerified: user.Verified,
TweetsCount: parseCount(doc.Find("table.profile-stats > tbody > tr > td:nth-child(1) > div.statnum").First().Text()), LikesCount: user.FavouritesCount,
URL: "https://twitter.com/" + screenName, ListedCount: user.ListedCount,
Username: screenName, Location: user.Location,
Website: strings.TrimSpace(doc.Find("div.url > div > a").First().AttrOr("data-url", "")), Name: user.Name,
}, nil PinnedTweetIDs: user.PinnedTweetIdsStr,
} TweetsCount: user.StatusesCount,
URL: "https://twitter.com/" + user.ScreenName,
UserID: user.IDStr,
Username: user.ScreenName,
}
func parseCount(str string) (i int) { tm, err := time.Parse(time.RubyDate, user.CreatedAt)
i, _ = strconv.Atoi(strings.Replace(str, ",", "", -1)) if err == nil {
return tm = tm.UTC()
profile.Joined = &tm
}
if len(user.Entities.URL.Urls) > 0 {
profile.Website = user.Entities.URL.Urls[0].ExpandedURL
}
return profile, nil
} }

View file

@ -2,28 +2,30 @@ package twitterscraper
import ( import (
"testing" "testing"
"time"
"github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts" "github.com/google/go-cmp/cmp/cmpopts"
) )
func TestGetProfile(t *testing.T) { func TestGetProfile(t *testing.T) {
// loc := time.FixedZone("UTC", 0) loc := time.FixedZone("UTC", 0)
// joined := time.Date(2007, 02, 20, 6, 35, 0, 0, loc) joined := time.Date(2007, 02, 20, 14, 35, 54, 0, loc)
sample := Profile{ sample := Profile{
Avatar: "https://pbs.twimg.com/profile_images/1308010958862905345/-SGZioPb_normal.jpg", Avatar: "https://pbs.twimg.com/profile_images/1308010958862905345/-SGZioPb_normal.jpg",
// Banner: "https://pbs.twimg.com/profile_banners/783214/1596041768/1500x500", Banner: "https://pbs.twimg.com/profile_banners/783214/1604501727",
Biography: "What's happening!?", Biography: "What's happening!?",
// Birthday: "March 21", // Birthday: "March 21",
IsPrivate: false, IsPrivate: false,
IsVerified: true, IsVerified: true,
// Joined: &joined, Joined: &joined,
Location: "everywhere", Location: "everywhere",
Name: "Twitter", Name: "Twitter",
URL: "https://twitter.com/Twitter", PinnedTweetIDs: []string{},
// UserID: "783214", URL: "https://twitter.com/Twitter",
Username: "Twitter", UserID: "783214",
Website: "about.twitter.com", Username: "Twitter",
Website: "https://about.twitter.com/",
} }
profile, err := GetProfile("Twitter") profile, err := GetProfile("Twitter")
@ -34,7 +36,9 @@ func TestGetProfile(t *testing.T) {
cmpOptions := cmp.Options{ cmpOptions := cmp.Options{
cmpopts.IgnoreFields(Profile{}, "FollowersCount"), cmpopts.IgnoreFields(Profile{}, "FollowersCount"),
cmpopts.IgnoreFields(Profile{}, "FollowingCount"), cmpopts.IgnoreFields(Profile{}, "FollowingCount"),
cmpopts.IgnoreFields(Profile{}, "FriendsCount"),
cmpopts.IgnoreFields(Profile{}, "LikesCount"), cmpopts.IgnoreFields(Profile{}, "LikesCount"),
cmpopts.IgnoreFields(Profile{}, "ListedCount"),
cmpopts.IgnoreFields(Profile{}, "TweetsCount"), cmpopts.IgnoreFields(Profile{}, "TweetsCount"),
} }
if diff := cmp.Diff(sample, profile, cmpOptions...); diff != "" { if diff := cmp.Diff(sample, profile, cmpOptions...); diff != "" {
@ -47,9 +51,9 @@ func TestGetProfile(t *testing.T) {
if profile.FollowingCount == 0 { if profile.FollowingCount == 0 {
t.Error("Expected FollowingCount is greater than zero") t.Error("Expected FollowingCount is greater than zero")
} }
// if profile.LikesCount == 0 { if profile.LikesCount == 0 {
// t.Error("Expected LikesCount is greater than zero") t.Error("Expected LikesCount is greater than zero")
// } }
if profile.TweetsCount == 0 { if profile.TweetsCount == 0 {
t.Error("Expected TweetsCount is greater than zero") t.Error("Expected TweetsCount is greater than zero")
} }

137
search.go
View file

@ -2,143 +2,44 @@ package twitterscraper
import ( import (
"context" "context"
"fmt"
"io"
"net"
"net/http"
"net/url" "net/url"
"strings" "strconv"
"time"
"github.com/PuerkitoBio/goquery"
) )
const mobileSearchURL = "https://mobile.twitter.com/search?q=%s"
// SearchTweets returns channel with tweets for a given search query // SearchTweets returns channel with tweets for a given search query
func SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *Result { func SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *Result {
channel := make(chan *Result) return getTimeline(ctx, query, maxTweetsNbr, FetchSearchTweets)
go func(query string) {
defer close(channel)
var nextCursor string
tweetsNbr := 0
for tweetsNbr < maxTweetsNbr {
select {
case <-ctx.Done():
channel <- &Result{Error: ctx.Err()}
return
default:
}
tweets, next, err := FetchSearchTweets(query, nextCursor)
if err != nil {
channel <- &Result{Error: err}
return
}
if len(tweets) == 0 {
break
}
for _, tweet := range tweets {
select {
case <-ctx.Done():
channel <- &Result{Error: ctx.Err()}
return
default:
}
if tweetsNbr < maxTweetsNbr {
nextCursor = next
channel <- &Result{Tweet: *tweet}
}
tweetsNbr++
}
}
}(query)
return channel
} }
// FetchSearchTweets gets tweets for a given search query, via the Twitter frontend API // FetchSearchTweets gets tweets for a given search query, via the Twitter frontend API
func FetchSearchTweets(query, nextCursor string) ([]*Tweet, string, error) { func FetchSearchTweets(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) {
url := fmt.Sprintf(mobileSearchURL, url.PathEscape(query)) query = url.PathEscape(query)
if nextCursor != "" { if maxTweetsNbr > 200 {
url = "https://mobile.twitter.com" + nextCursor maxTweetsNbr = 200
} }
client := http.DefaultClient req, err := newRequest("GET", "https://twitter.com/i/api/2/search/adaptive.json")
if HTTPProxy != nil {
client = &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(HTTPProxy),
DialContext: (&net.Dialer{
Timeout: 10 * time.Second,
}).DialContext,
},
}
}
req, err := http.NewRequest("GET", url, nil)
if err != nil { if err != nil {
return nil, "", err return nil, "", err
} }
req.Header.Set("Referer", "https://mobile.twitter.com/") q := req.URL.Query()
req.Header.Set("User-Agent", "Opera/9.80 (J2ME/MIDP; Opera Mini/5.1.21214/28.2725; U; ru) Presto/2.8.119 Version/11.10") q.Add("q", query)
q.Add("count", strconv.Itoa(maxTweetsNbr))
resp, err := client.Do(req) q.Add("query_source", "typed_query")
if resp == nil { q.Add("pc", "1")
return nil, "", err q.Add("spelling_corrections", "1")
if cursor != "" {
q.Add("cursor", cursor)
} }
defer resp.Body.Close() req.URL.RawQuery = q.Encode()
if resp.StatusCode != http.StatusOK { var timeline timeline
return nil, "", fmt.Errorf("response status: %s", resp.Status) err = requestAPI(req, &timeline)
}
return readTweetsFromMobileHTML(resp.Body)
}
func readTweetsFromMobileHTML(htm io.ReadCloser) ([]*Tweet, string, error) {
var tweets []*Tweet
doc, err := goquery.NewDocumentFromReader(htm)
if err != nil { if err != nil {
return nil, "", err return nil, "", err
} }
doc.Find("table.tweet").Each(func(i int, s *goquery.Selection) { tweets, nextCursor := parseTimeline(&timeline)
var tweet Tweet
tweetID, ok := s.Find(".tweet-text").Attr("data-id")
if ok {
tweet.ID = tweetID
tweet.Username = strings.TrimPrefix(strings.TrimSpace(s.Find("td.user-info > a > div.username").Text()), "@")
tweet.PermanentURL = fmt.Sprintf("https://twitter.com/%s/status/%s", tweet.Username, tweet.ID)
tweet.Text = strings.TrimSpace(s.Find(".tweet-text").Text())
tweet.HTML, _ = s.Find(".tweet-text").Html()
tweet.HTML = strings.TrimSpace(tweet.HTML)
s.Find("td.tweet-social-context > span").Each(func(i int, c *goquery.Selection) {
tweet.IsRetweet = true
})
s.Find(".twitter-hashtag").Each(func(i int, h *goquery.Selection) {
tweet.Hashtags = append(tweet.Hashtags, h.Text())
})
s.Find("a.tco-link:not(.u-hidden)").Each(func(i int, u *goquery.Selection) {
if link, ok := u.Attr("data-expanded-url"); ok {
tweet.URLs = append(tweet.URLs, link)
}
})
s.Find("div.media > img").Each(func(i int, p *goquery.Selection) {
if link, ok := p.Attr("src"); ok {
tweet.Photos = append(tweet.Photos, strings.TrimSuffix(link, ":small"))
}
})
tweets = append(tweets, &tweet)
}
})
nextCursor := doc.Find("div.w-button-more > a").AttrOr("href", "")
return tweets, nextCursor, nil return tweets, nextCursor, nil
} }

View file

@ -7,21 +7,21 @@ import (
func TestGetSearchTweets(t *testing.T) { func TestGetSearchTweets(t *testing.T) {
count := 0 count := 0
maxTweetsNbr := 50 maxTweetsNbr := 250
for tweet := range SearchTweets(context.Background(), "twitter scraper data -filter:retweets", maxTweetsNbr) { for tweet := range SearchTweets(context.Background(), "twitter scraper data -filter:retweets", maxTweetsNbr) {
if tweet.Error != nil { if tweet.Error != nil {
t.Error(tweet.Error) t.Error(tweet.Error)
} else { } else {
count++ count++
if tweet.HTML == "" {
t.Error("Expected tweet HTML is not empty")
}
if tweet.ID == "" { if tweet.ID == "" {
t.Error("Expected tweet ID is not empty") t.Error("Expected tweet ID is not empty")
} }
if tweet.PermanentURL == "" { if tweet.PermanentURL == "" {
t.Error("Expected tweet PermanentURL is not empty") t.Error("Expected tweet PermanentURL is not empty")
} }
if tweet.IsRetweet {
t.Error("Expected tweet IsRetweet is false")
}
if tweet.Text == "" { if tweet.Text == "" {
t.Error("Expected tweet Text is not empty") t.Error("Expected tweet Text is not empty")
} }

View file

@ -1,55 +1,29 @@
package twitterscraper package twitterscraper
import (
"fmt"
"net"
"net/http"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
const trendsURL = "https://mobile.twitter.com/trends"
// GetTrends return list of trends. // GetTrends return list of trends.
func GetTrends() ([]string, error) { func GetTrends() ([]string, error) {
client := http.DefaultClient req, err := newRequest("GET", "https://twitter.com/i/api/2/guide.json")
if HTTPProxy != nil {
client = &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(HTTPProxy),
DialContext: (&net.Dialer{
Timeout: 10 * time.Second,
}).DialContext,
},
}
}
req, err := http.NewRequest("GET", trendsURL, nil)
if err != nil { if err != nil {
return nil, err return nil, err
} }
req.Header.Set("Accept-Language", "en-US")
resp, err := client.Do(req) q := req.URL.Query()
if resp == nil { q.Add("count", "20")
return nil, err q.Add("candidate_source", "trends")
} q.Add("include_page_configuration", "false")
defer resp.Body.Close() q.Add("entity_tokens", "false")
req.URL.RawQuery = q.Encode()
if resp.StatusCode != http.StatusOK { var jsn timeline
return nil, fmt.Errorf("response status: %s", resp.Status) err = requestAPI(req, &jsn)
}
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil { if err != nil {
return nil, err return nil, err
} }
var trends []string var trends []string
doc.Find("li.topic").Each(func(i int, s *goquery.Selection) { for _, item := range jsn.Timeline.Instructions[1].AddEntries.Entries[1].Content.TimelineModule.Items {
trends = append(trends, strings.TrimSpace(s.Text())) trends = append(trends, item.Item.ClientEventInfo.Details.GuideDetails.TransparentGuideDetails.TrendMetadata.TrendName)
}) }
return trends, nil return trends, nil
} }

View file

@ -10,7 +10,13 @@ func TestGetTrends(t *testing.T) {
t.Error(err) t.Error(err)
} }
if len(trends) != 10 { if len(trends) != 20 {
t.Errorf("Expected 10 trends, got %d: %#v", len(trends), trends) t.Errorf("Expected 20 trends, got %d: %#v", len(trends), trends)
}
for _, trend := range trends {
if trend == "" {
t.Error("Expected trend is not empty")
}
} }
} }

196
tweets.go
View file

@ -2,196 +2,44 @@ package twitterscraper
import ( import (
"context" "context"
"fmt"
"net/http"
"strconv" "strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
) )
const ajaxURL = "https://twitter.com/i/profiles/show/%s/timeline/tweets"
// Video type.
type Video struct {
ID string
Preview string
}
// Tweet type.
type Tweet struct {
Hashtags []string
HTML string
ID string
IsPin bool
IsRetweet bool
Likes int
PermanentURL string
Photos []string
Replies int
Retweets int
Text string
TimeParsed time.Time
Timestamp int64
URLs []string
UserID string
Username string
Videos []Video
}
// Result of scrapping.
type Result struct {
Tweet
Error error
}
// GetTweets returns channel with tweets for a given user. // GetTweets returns channel with tweets for a given user.
func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result { func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result {
channel := make(chan *Result) return getTimeline(ctx, user, maxTweetsNbr, FetchTweets)
go func(user string) {
defer close(channel)
var lastTweetID string
tweetsNbr := 0
for tweetsNbr < maxTweetsNbr {
select {
case <-ctx.Done():
channel <- &Result{Error: ctx.Err()}
return
default:
}
tweets, err := FetchTweets(user, lastTweetID)
if err != nil {
channel <- &Result{Error: err}
return
}
if len(tweets) == 0 {
break
}
for _, tweet := range tweets {
select {
case <-ctx.Done():
channel <- &Result{Error: ctx.Err()}
return
default:
}
if tweetsNbr < maxTweetsNbr {
lastId, _ := strconv.ParseInt(tweet.ID, 10, 64)
lastTweetID = strconv.FormatInt(lastId-1, 10)
channel <- &Result{Tweet: *tweet}
}
tweetsNbr++
}
}
}(user)
return channel
} }
// FetchTweets gets tweets for a given user, via the Twitter frontend API. // FetchTweets gets tweets for a given user, via the Twitter frontend API.
func FetchTweets(user string, last string) ([]*Tweet, error) { func FetchTweets(user string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) {
req, err := http.NewRequest("GET", "https://syndication.twitter.com/timeline/profile", nil) if maxTweetsNbr > 200 {
if err != nil { maxTweetsNbr = 200
return nil, err
} }
req.Header.Set("Referer", "https://publish.twitter.com/") userID, err := GetUserIDByScreenName(user)
if err != nil {
return nil, "", err
}
req, err := newRequest("GET", "https://api.twitter.com/2/timeline/profile/"+userID+".json")
if err != nil {
return nil, "", err
}
q := req.URL.Query() q := req.URL.Query()
q.Add("screen_name", user) q.Add("count", strconv.Itoa(maxTweetsNbr))
q.Add("with_replies", "true") q.Add("userId", userID)
if last != "" { if cursor != "" {
q.Add("max_position", last) q.Add("cursor", cursor)
} }
req.URL.RawQuery = q.Encode() req.URL.RawQuery = q.Encode()
htm, err := getHTMLFromJSON(req, "body") var timeline timeline
err = requestAPI(req, &timeline)
if err != nil { if err != nil {
return nil, err return nil, "", err
} }
tweets, err := readTweetsFromHTML(htm) tweets, nextCursor := parseTimeline(&timeline)
if err != nil { return tweets, nextCursor, nil
return nil, err
}
return tweets, nil
}
func readTweetsFromHTML(htm *strings.Reader) ([]*Tweet, error) {
var tweets []*Tweet
doc, err := goquery.NewDocumentFromReader(htm)
if err != nil {
return nil, err
}
doc.Find(".timeline-Tweet").Each(func(i int, s *goquery.Selection) {
var tweet Tweet
timeStr, ok := s.Find(".timeline-Tweet-metadata > a > time").Attr("datetime")
if ok {
tweet.TimeParsed, _ = time.Parse("2006-01-02T15:04:05-0700", timeStr)
tweet.Timestamp = tweet.TimeParsed.Unix()
tweet.ID = s.AttrOr("data-tweet-id", "")
// tweet.UserID = s.Find(".tweet").AttrOr("data-user-id", "")
tweet.Username = strings.TrimPrefix(s.Find(".TweetAuthor-screenName").AttrOr("title", ""), "@")
tweet.PermanentURL = fmt.Sprintf("https://twitter.com/%s/status/%s", tweet.Username, tweet.ID)
tweet.Text = s.Find(".timeline-Tweet-text").Text()
tweet.HTML, _ = s.Find(".timeline-Tweet-text").Html()
s.Find(".timeline-Tweet-retweetCredit").Each(func(i int, c *goquery.Selection) {
tweet.IsRetweet = true
})
// s.Find("span.js-pinned-text").Each(func(i int, c *goquery.Selection) {
// tweet.IsPin = true
// })
// s.Find(".ProfileTweet-actionCount").Each(func(i int, c *goquery.Selection) {
// txt := strings.TrimSpace(c.Text())
// switch {
// case strings.HasSuffix(txt, "likes"):
// l := strings.Split(txt, " ")
// tweet.Likes, _ = strconv.Atoi(l[0])
// case strings.HasSuffix(txt, "replies"):
// l := strings.Split(txt, " ")
// tweet.Replies, _ = strconv.Atoi(l[0])
// case strings.HasSuffix(txt, "retweets"):
// l := strings.Split(txt, " ")
// tweet.Retweets, _ = strconv.Atoi(l[0])
// }
// })
s.Find(".hashtag > span.PrettyLink-value").Each(func(i int, h *goquery.Selection) {
tweet.Hashtags = append(tweet.Hashtags, h.Text())
})
s.Find("a.link:not(.u-hidden)").Each(func(i int, u *goquery.Selection) {
if link, ok := u.Attr("data-expanded-url"); ok {
tweet.URLs = append(tweet.URLs, link)
}
})
s.Find(".NaturalImage-image").Each(func(i int, p *goquery.Selection) {
if link, ok := p.Attr("data-image"); ok {
tweet.Photos = append(tweet.Photos, link+"?format=jpg&name=orig")
}
})
s.Find(".CroppedImage-image").Each(func(i int, p *goquery.Selection) {
if link, ok := p.Attr("data-image"); ok {
tweet.Photos = append(tweet.Photos, link+"?format=jpg&name=orig")
}
})
// s.Find(".PlayableMedia-player").Each(func(i int, v *goquery.Selection) {
// if style, ok := v.Attr("style"); ok {
// if strings.Contains(style, "background") {
// match := regexp.MustCompile(`https:\/\/.+\/([\w-]+)\.(?:jpg|png)`).FindStringSubmatch(style)
// if len(match) == 2 {
// tweet.Videos = append(tweet.Videos, Video{ID: match[1], Preview: match[0]})
// }
// }
// }
// })
tweets = append(tweets, &tweet)
}
})
return tweets, nil
} }

View file

@ -7,21 +7,25 @@ import (
func TestGetTweets(t *testing.T) { func TestGetTweets(t *testing.T) {
count := 0 count := 0
maxTweetsNbr := 50 maxTweetsNbr := 300
dupcheck := make(map[string]bool)
for tweet := range GetTweets(context.Background(), "Twitter", maxTweetsNbr) { for tweet := range GetTweets(context.Background(), "Twitter", maxTweetsNbr) {
if tweet.Error != nil { if tweet.Error != nil {
t.Error(tweet.Error) t.Error(tweet.Error)
} else { } else {
count++ count++
if tweet.HTML == "" {
t.Error("Expected tweet HTML is not empty")
}
if tweet.ID == "" { if tweet.ID == "" {
t.Error("Expected tweet ID is not empty") t.Error("Expected tweet ID is not empty")
} else {
if dupcheck[tweet.ID] {
t.Errorf("Detect duplicated tweet ID: %s", tweet.ID)
} else {
dupcheck[tweet.ID] = true
}
}
if tweet.UserID == "" {
t.Error("Expected tweet UserID is not empty")
} }
// if tweet.UserID == "" {
// t.Error("Expected tweet UserID is not empty")
// }
if tweet.Username == "" { if tweet.Username == "" {
t.Error("Expected tweet Username is not empty") t.Error("Expected tweet Username is not empty")
} }
@ -37,6 +41,17 @@ func TestGetTweets(t *testing.T) {
if tweet.Timestamp == 0 { if tweet.Timestamp == 0 {
t.Error("Expected tweet Timestamp is greater than zero") t.Error("Expected tweet Timestamp is greater than zero")
} }
for _, video := range tweet.Videos {
if video.ID == "" {
t.Error("Expected tweet video ID is not empty")
}
if video.Preview == "" {
t.Error("Expected tweet video Preview is not empty")
}
if video.URL == "" {
t.Error("Expected tweet video URL is not empty")
}
}
} }
} }
if count != maxTweetsNbr { if count != maxTweetsNbr {

151
types.go Normal file
View file

@ -0,0 +1,151 @@
package twitterscraper
import "time"
type (
// Video describes one video attached to a Tweet (see Tweet.Videos).
Video struct {
// ID identifies the video.
// NOTE(review): presumably the media id_str from the API — confirm.
ID string
// Preview is presumably the URL of a still image for the video — confirm.
Preview string
// URL is presumably the address of the video stream or file — confirm.
URL string
}
// Tweet is the public, flattened representation of a single tweet as delivered
// to callers through Result.
// NOTE(review): the code that fills these fields is not in view; the field
// notes below are inferred from names and should be confirmed against it.
Tweet struct {
Hashtags []string
ID string
// IsQuoted, IsPin, IsReply and IsRetweet presumably flag a quote tweet, a
// pinned tweet, a reply and a retweet respectively — confirm.
IsQuoted bool
IsPin bool
IsReply bool
IsRetweet bool
Likes int
PermanentURL string
Photos []string
Replies int
Retweets int
Text string
// TimeParsed and Timestamp presumably hold the same creation instant, as a
// time.Time and as a Unix timestamp — confirm.
TimeParsed time.Time
Timestamp int64
URLs []string
UserID string
Username string
Videos []Video
}
// Result is one item produced by scraping: either a tweet or an error.
// Tweet is embedded, so its fields are reachable directly on the Result.
// Callers are expected to check Error before using the tweet fields.
Result struct {
Tweet
Error error
}
// timeline is the decoding target for the JSON returned by the timeline-style
// endpoints used in this package: the profile timeline, adaptive search and the
// guide (trends) endpoint.
// NOTE(review): one struct serves all three responses, so a given response
// presumably fills only part of it — confirm.
timeline struct {
// GlobalObjects holds the tweet and user objects of the response.
GlobalObjects struct {
// Tweets is presumably keyed by tweet ID — confirm.
Tweets map[string]struct {
ConversationIDStr string `json:"conversation_id_str"`
CreatedAt string `json:"created_at"`
FavoriteCount int `json:"favorite_count"`
FullText string `json:"full_text"`
Entities struct {
Hashtags []struct {
Text string `json:"text"`
} `json:"hashtags"`
Media []struct {
MediaURLHttps string `json:"media_url_https"`
Type string `json:"type"`
} `json:"media"`
URLs []struct {
ExpandedURL string `json:"expanded_url"`
} `json:"urls"`
} `json:"entities"`
ExtendedEntities struct {
Media []struct {
IDStr string `json:"id_str"`
MediaURLHttps string `json:"media_url_https"`
Type string `json:"type"`
VideoInfo struct {
Variants []struct {
Bitrate int `json:"bitrate,omitempty"`
URL string `json:"url"`
} `json:"variants"`
} `json:"video_info"`
} `json:"media"`
} `json:"extended_entities"`
InReplyToStatusIDStr string `json:"in_reply_to_status_id_str"`
ReplyCount int `json:"reply_count"`
RetweetCount int `json:"retweet_count"`
RetweetedStatusIDStr string `json:"retweeted_status_id_str"`
QuotedStatusIDStr string `json:"quoted_status_id_str"`
// NOTE(review): presumably filled in after decoding (e.g. parsed from
// CreatedAt) rather than read from a "time" key in the response — confirm.
Time time.Time `json:"time"`
UserIDStr string `json:"user_id_str"`
} `json:"tweets"`
// Users is keyed by user ID; GetProfile looks its user up by that key.
Users map[string]struct {
// CreatedAt is parsed by GetProfile with the time.RubyDate layout.
CreatedAt string `json:"created_at"`
Description string `json:"description"`
Entities struct {
URL struct {
Urls []struct {
ExpandedURL string `json:"expanded_url"`
} `json:"urls"`
} `json:"url"`
} `json:"entities"`
FavouritesCount int `json:"favourites_count"`
FollowersCount int `json:"followers_count"`
FriendsCount int `json:"friends_count"`
IDStr string `json:"id_str"`
ListedCount int `json:"listed_count"`
Name string `json:"name"`
Location string `json:"location"`
PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"`
ProfileBannerURL string `json:"profile_banner_url"`
ProfileImageURLHTTPS string `json:"profile_image_url_https"`
Protected bool `json:"protected"`
ScreenName string `json:"screen_name"`
StatusesCount int `json:"statuses_count"`
Verified bool `json:"verified"`
} `json:"users"`
} `json:"globalObjects"`
// Timeline holds the instruction list of the response.
Timeline struct {
Instructions []struct {
AddEntries struct {
Entries []struct {
Content struct {
// Item references a tweet by ID.
// NOTE(review): presumably a key into GlobalObjects.Tweets — confirm.
Item struct {
Content struct {
Tweet struct {
ID string `json:"id"`
} `json:"tweet"`
} `json:"content"`
} `json:"item"`
// Operation carries a cursor value and its type.
// NOTE(review): presumably used for pagination — confirm.
Operation struct {
Cursor struct {
Value string `json:"value"`
CursorType string `json:"cursorType"`
} `json:"cursor"`
} `json:"operation"`
// TimelineModule is the path GetTrends walks to collect trend names.
TimelineModule struct {
Items []struct {
Item struct {
ClientEventInfo struct {
Details struct {
GuideDetails struct {
TransparentGuideDetails struct {
TrendMetadata struct {
TrendName string `json:"trendName"`
} `json:"trendMetadata"`
} `json:"transparentGuideDetails"`
} `json:"guideDetails"`
} `json:"details"`
} `json:"clientEventInfo"`
} `json:"item"`
} `json:"items"`
} `json:"timelineModule"`
} `json:"content,omitempty"`
} `json:"entries"`
} `json:"addEntries"`
} `json:"instructions"`
} `json:"timeline"`
}
// fetchFunc is the signature shared by FetchTweets and FetchSearchTweets, which
// GetTweets and SearchTweets pass to getTimeline. The first argument is the
// subject of the fetch (a user name for FetchTweets, a search query for
// FetchSearchTweets); cursor is empty for the first page. It returns the
// fetched tweets, a cursor string and an error.
fetchFunc func(user string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error)
)

205
util.go
View file

@ -1,25 +1,30 @@
package twitterscraper package twitterscraper
import ( import (
"encoding/json" "context"
"errors" "errors"
"fmt" "fmt"
"net"
"net/http" "net/http"
"net/url" "net/url"
"regexp" "strconv"
"strings" "strings"
"time"
) )
//HttpProxy Public variable for Http proxy var (
var HTTPProxy *url.URL // IncludeReplies enable tweet reply
IncludeReplies bool
// HTTPProxy Public variable for Http proxy
HTTPProxy *url.URL
)
//SetProxy set http proxy format `http://HOST:PORT` // SetProxy set http proxy format `http://HOST:PORT`
func SetProxy(Proxy string) error { func SetProxy(proxy string) error {
match, _ := regexp.MatchString("http.+", Proxy) if !strings.HasPrefix(proxy, "http://") {
if !match {
return errors.New("only support http protocol") return errors.New("only support http protocol")
} }
urlproxy, err := url.Parse(Proxy) urlproxy, err := url.Parse(proxy)
if err != nil { if err != nil {
return err return err
} }
@ -27,42 +32,172 @@ func SetProxy(Proxy string) error {
return nil return nil
} }
func newRequest(url string) (*http.Request, error) { func newHTTPClient() *http.Client {
req, err := http.NewRequest("GET", url, nil) client := &http.Client{Timeout: 10 * time.Second}
if HTTPProxy != nil {
client = &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(HTTPProxy),
DialContext: (&net.Dialer{
Timeout: 10 * time.Second,
}).DialContext,
},
}
}
return client
}
func newRequest(method string, url string) (*http.Request, error) {
req, err := http.NewRequest(method, url, nil)
if err != nil { if err != nil {
return nil, err return nil, err
} }
req.Header.Set("Accept", "application/json, text/javascript, */*; q=0.01") q := req.URL.Query()
req.Header.Set("Accept-Language", "en-US") q.Add("include_profile_interstitial_type", "1")
req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8") q.Add("include_blocking", "1")
req.Header.Set("X-Twitter-Active-User", "yes") q.Add("include_blocked_by", "1")
req.Header.Set("X-Requested-With", "XMLHttpRequest") q.Add("include_followed_by", "1")
q.Add("include_want_retweets", "1")
q.Add("include_mute_edge", "1")
q.Add("include_can_dm", "1")
q.Add("include_can_media_tag", "1")
q.Add("skip_status", "1")
q.Add("cards_platform", "Web-12")
q.Add("include_cards", "1")
q.Add("include_ext_alt_text", "true")
q.Add("include_quote_count", "true")
q.Add("include_reply_count", "1")
q.Add("tweet_mode", "extended")
q.Add("include_entities", "true")
q.Add("include_user_entities", "true")
q.Add("include_ext_media_color", "true")
q.Add("include_ext_media_availability", "true")
q.Add("send_error_codes", "true")
q.Add("simple_quoted_tweet", "true")
q.Add("include_tweet_replies", strconv.FormatBool(IncludeReplies))
q.Add("ext", "mediaStats,highlightedLabel")
req.URL.RawQuery = q.Encode()
return req, nil return req, nil
} }
func getHTMLFromJSON(req *http.Request, field string) (*strings.Reader, error) { func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc fetchFunc) <-chan *Result {
resp, err := http.DefaultClient.Do(req) channel := make(chan *Result)
if err != nil { go func(user string) {
return nil, err defer close(channel)
} var nextCursor string
defer resp.Body.Close() tweetsNbr := 0
for tweetsNbr < maxTweetsNbr {
select {
case <-ctx.Done():
channel <- &Result{Error: ctx.Err()}
return
default:
}
if resp.StatusCode != http.StatusOK { tweets, next, err := fetchFunc(query, maxTweetsNbr, nextCursor)
return nil, fmt.Errorf("response status: %s", resp.Status) if err != nil {
} channel <- &Result{Error: err}
return
}
ajaxJSON := make(map[string]interface{}) if len(tweets) == 0 {
err = json.NewDecoder(resp.Body).Decode(&ajaxJSON) break
if err != nil { }
return nil, err
}
htm, ok := ajaxJSON[field].(string) for _, tweet := range tweets {
if !ok { select {
return nil, fmt.Errorf("field %s not found in JSON", field) case <-ctx.Done():
} channel <- &Result{Error: ctx.Err()}
return
default:
}
return strings.NewReader(htm), nil if tweetsNbr < maxTweetsNbr {
nextCursor = next
channel <- &Result{Tweet: *tweet}
}
tweetsNbr++
}
}
}(query)
return channel
}
func parseTimeline(timeline *timeline) ([]*Tweet, string) {
tweets := make(map[string]Tweet)
for id, tweet := range timeline.GlobalObjects.Tweets {
username := timeline.GlobalObjects.Users[tweet.UserIDStr].ScreenName
tw := Tweet{
ID: id,
Likes: tweet.FavoriteCount,
PermanentURL: fmt.Sprintf("https://twitter.com/%s/status/%s", username, id),
Replies: tweet.RetweetCount,
Retweets: tweet.RetweetCount,
Text: tweet.FullText,
UserID: tweet.UserIDStr,
Username: username,
}
tm, err := time.Parse(time.RubyDate, tweet.CreatedAt)
if err == nil {
tw.TimeParsed = tm
tw.Timestamp = tm.Unix()
}
if tweet.QuotedStatusIDStr != "" {
tw.IsQuoted = true
}
if tweet.InReplyToStatusIDStr != "" {
tw.IsReply = true
}
if tweet.RetweetedStatusIDStr != "" {
tw.IsRetweet = true
}
for _, pinned := range timeline.GlobalObjects.Users[tweet.UserIDStr].PinnedTweetIdsStr {
if tweet.ConversationIDStr == pinned {
tw.IsPin = true
break
}
}
for _, hash := range tweet.Entities.Hashtags {
tw.Hashtags = append(tw.Hashtags, hash.Text)
}
for _, media := range tweet.Entities.Media {
if media.Type == "photo" {
tw.Photos = append(tw.Photos, media.MediaURLHttps)
}
}
for _, media := range tweet.ExtendedEntities.Media {
if media.Type == "video" {
video := Video{
ID: media.IDStr,
Preview: media.MediaURLHttps,
}
maxBitrate := 0
for _, variant := range media.VideoInfo.Variants {
if variant.Bitrate > maxBitrate {
video.URL = strings.TrimSuffix(variant.URL, "?tag=10")
}
}
tw.Videos = append(tw.Videos, video)
}
}
for _, url := range tweet.Entities.URLs {
tw.URLs = append(tw.URLs, url.ExpandedURL)
}
tweets[tw.ID] = tw
}
var cursor string
var orderedTweets []*Tweet
for _, entry := range timeline.Timeline.Instructions[0].AddEntries.Entries {
if tweet, ok := tweets[entry.Content.Item.Content.Tweet.ID]; ok {
orderedTweets = append(orderedTweets, &tweet)
}
if entry.Content.Operation.Cursor.CursorType == "Bottom" {
cursor = entry.Content.Operation.Cursor.Value
}
}
return orderedTweets, cursor
} }