Total refactoring
Used guest frontend API. BREAKING CHANGE: remove tweet.HTML property. Loading more information. Minor fixes and changes.
This commit is contained in:
parent
1c582e142e
commit
edad8f6393
15 changed files with 628 additions and 497 deletions
41
README.md
41
README.md
|
|
@ -1,7 +1,5 @@
|
|||
# Twitter Scraper
|
||||
|
||||
Golang implementation of python library <https://github.com/bisguzar/twitter-scraper>
|
||||
|
||||
Twitter's API is annoying to work with, and has lots of limitations —
|
||||
luckily their frontend (JavaScript) has its own API, which I reverse-engineered.
|
||||
No API rate limits. No tokens needed. No restrictions. Extremely fast.
|
||||
|
|
@ -32,12 +30,12 @@ func main() {
|
|||
if tweet.Error != nil {
|
||||
panic(tweet.Error)
|
||||
}
|
||||
fmt.Println(tweet.HTML)
|
||||
fmt.Println(tweet.Text)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
It appears you can ask for up to 50 tweets.
|
||||
It appears you can ask for up to 50 tweets (limit ~3200 tweets).
|
||||
|
||||
### Search tweets by query standard operators
|
||||
|
||||
|
|
@ -58,32 +56,11 @@ func main() {
|
|||
if tweet.Error != nil {
|
||||
panic(tweet.Error)
|
||||
}
|
||||
fmt.Println(tweet.HTML)
|
||||
fmt.Println(tweet.Text)
|
||||
}
|
||||
}
|
||||
```
|
||||
#### With http proxy
|
||||
|
||||
```golang
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
twitterscraper "github.com/n0madic/twitter-scraper"
|
||||
)
|
||||
|
||||
func main() {
|
||||
twitterscraper.SetProxy("http://localhost:16379")
|
||||
for tweet := range twitterscraper.SearchTweets(context.Background(),
|
||||
"twitter scraper data -filter:retweets", 50) {
|
||||
if tweet.Error != nil {
|
||||
panic(tweet.Error)
|
||||
}
|
||||
fmt.Println(tweet.HTML)
|
||||
}
|
||||
}
|
||||
```
|
||||
The search ends if we have 50 tweets.
|
||||
|
||||
See [Rules and filtering](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators) for building standard queries.
|
||||
|
|
@ -125,3 +102,15 @@ func main() {
|
|||
fmt.Println(trends)
|
||||
}
|
||||
```
|
||||
|
||||
### Use http proxy
|
||||
|
||||
```golang
|
||||
twitterscraper.SetProxy("http://localhost:3128")
|
||||
```
|
||||
|
||||
### Load timeline with tweet replies
|
||||
|
||||
```golang
|
||||
twitterscraper.IncludeReplies = true
|
||||
```
|
||||
|
|
|
|||
105
api.go
Normal file
105
api.go
Normal file
|
|
@ -0,0 +1,105 @@
|
|||
package twitterscraper
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// bearerToken is the fixed bearer credential that requestAPI and GetGuestToken
// send in the Authorization header of their requests.
const bearerToken string = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
|
||||
|
||||
// user is the decoding target for the UserByScreenName response. Only the
// path data.user.rest_id is mapped; every other field of the payload is
// ignored by the decoder.
type user struct {
	Data struct {
		User struct {
			// RestID is read by GetUserIDByScreenName and returned as the user ID.
			RestID string `json:"rest_id"`
		} `json:"user"`
	} `json:"data"`
}
|
||||
|
||||
// guestToken holds the token stored by GetGuestToken; requestAPI sends it in
// the X-Guest-Token header and requests one first whenever it is empty.
var guestToken string

// cacheIDs maps a screen name to the user ID previously resolved for it by
// GetUserIDByScreenName.
var cacheIDs sync.Map
|
||||
|
||||
func requestAPI(req *http.Request, target interface{}) error {
|
||||
if guestToken == "" {
|
||||
err := GetGuestToken()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
req.Header.Set("Authorization", "Bearer "+bearerToken)
|
||||
req.Header.Set("X-Guest-Token", guestToken)
|
||||
|
||||
resp, err := newHTTPClient().Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
return json.NewDecoder(resp.Body).Decode(target)
|
||||
}
|
||||
|
||||
// GetGuestToken from API
|
||||
func GetGuestToken() error {
|
||||
req, err := http.NewRequest("POST", "https://api.twitter.com/1.1/guest/activate.json", nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("Authorization", "Bearer "+bearerToken)
|
||||
|
||||
resp, err := newHTTPClient().Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return fmt.Errorf("response status %s", resp.Status)
|
||||
}
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var jsn map[string]interface{}
|
||||
if err := json.Unmarshal(body, &jsn); err != nil {
|
||||
return err
|
||||
}
|
||||
var ok bool
|
||||
if guestToken, ok = jsn["guest_token"].(string); !ok {
|
||||
return fmt.Errorf("guest_token not found")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetUserIDByScreenName from API
|
||||
func GetUserIDByScreenName(screenName string) (string, error) {
|
||||
id, ok := cacheIDs.Load(screenName)
|
||||
if ok {
|
||||
return id.(string), nil
|
||||
}
|
||||
|
||||
var jsn user
|
||||
req, err := http.NewRequest("GET", "https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22"+screenName+"%22%2C%22withHighlightedLabel%22%3Atrue%7D", nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
err = requestAPI(req, &jsn)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if jsn.Data.User.RestID == "" {
|
||||
return "", fmt.Errorf("rest_id not found")
|
||||
}
|
||||
|
||||
cacheIDs.Store(screenName, jsn.Data.User.RestID)
|
||||
|
||||
return jsn.Data.User.RestID, nil
|
||||
}
|
||||
24
api_test.go
Normal file
24
api_test.go
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
package twitterscraper
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestGetGuestToken(t *testing.T) {
|
||||
if err := GetGuestToken(); err != nil {
|
||||
t.Errorf("getGuestToken() error = %v", err)
|
||||
}
|
||||
if guestToken == "" {
|
||||
t.Error("Expected non-empty guestToken")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetUserIDByScreenName(t *testing.T) {
|
||||
userID, err := GetUserIDByScreenName("Twitter")
|
||||
if err != nil {
|
||||
t.Errorf("getUserByScreenName() error = %v", err)
|
||||
}
|
||||
if userID == "" {
|
||||
t.Error("Expected non-empty user ID")
|
||||
}
|
||||
}
|
||||
5
go.mod
5
go.mod
|
|
@ -2,7 +2,4 @@ module github.com/n0madic/twitter-scraper
|
|||
|
||||
go 1.13
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.5.1
|
||||
github.com/google/go-cmp v0.4.0
|
||||
)
|
||||
require github.com/google/go-cmp v0.5.4
|
||||
|
|
|
|||
14
go.sum
14
go.sum
|
|
@ -1,14 +1,4 @@
|
|||
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
|
||||
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
|
||||
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
|
||||
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||
github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4=
|
||||
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI=
|
||||
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M=
|
||||
github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
|
|
|
|||
108
profile.go
108
profile.go
|
|
@ -2,13 +2,7 @@ package twitterscraper
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// Profile of twitter user.
|
||||
|
|
@ -19,12 +13,15 @@ type Profile struct {
|
|||
Birthday string
|
||||
FollowersCount int
|
||||
FollowingCount int
|
||||
FriendsCount int
|
||||
IsPrivate bool
|
||||
IsVerified bool
|
||||
Joined *time.Time
|
||||
LikesCount int
|
||||
ListedCount int
|
||||
Location string
|
||||
Name string
|
||||
PinnedTweetIDs []string
|
||||
TweetsCount int
|
||||
URL string
|
||||
UserID string
|
||||
|
|
@ -34,66 +31,61 @@ type Profile struct {
|
|||
|
||||
// GetProfile returns the parsed user profile.
|
||||
func GetProfile(username string) (Profile, error) {
|
||||
url := "https://mobile.twitter.com/" + username
|
||||
|
||||
client := http.DefaultClient
|
||||
if HTTPProxy != nil {
|
||||
client = &http.Client{
|
||||
Transport: &http.Transport{
|
||||
Proxy: http.ProxyURL(HTTPProxy),
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: 10 * time.Second,
|
||||
}).DialContext,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
if err != nil {
|
||||
return Profile{}, err
|
||||
}
|
||||
req.Header.Set("Accept-Language", "en-US")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if resp == nil {
|
||||
return Profile{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return Profile{}, fmt.Errorf("response status: %s", resp.Status)
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||||
userID, err := GetUserIDByScreenName(username)
|
||||
if err != nil {
|
||||
return Profile{}, err
|
||||
}
|
||||
|
||||
// parse join date text
|
||||
screenName := doc.Find(".screen-name").First().Text()
|
||||
req, err := newRequest("GET", "https://twitter.com/i/api/2/timeline/profile/"+userID+".json")
|
||||
if err != nil {
|
||||
return Profile{}, err
|
||||
}
|
||||
|
||||
// check is username valid
|
||||
if screenName == "" {
|
||||
q := req.URL.Query()
|
||||
q.Add("count", "20")
|
||||
q.Add("userId", userID)
|
||||
req.URL.RawQuery = q.Encode()
|
||||
|
||||
var timeline timeline
|
||||
err = requestAPI(req, &timeline)
|
||||
if err != nil {
|
||||
return Profile{}, err
|
||||
}
|
||||
|
||||
user, found := timeline.GlobalObjects.Users[userID]
|
||||
if !found {
|
||||
return Profile{}, fmt.Errorf("either @%s does not exist or is private", username)
|
||||
}
|
||||
|
||||
return Profile{
|
||||
Avatar: doc.Find("td.avatar > img").First().AttrOr("src", ""),
|
||||
Biography: strings.TrimSpace(doc.Find(".bio").First().Text()),
|
||||
FollowersCount: parseCount(doc.Find("table.profile-stats > tbody > tr > td:nth-child(3) > a > div.statnum").First().Text()),
|
||||
FollowingCount: parseCount(doc.Find("table.profile-stats > tbody > tr > td:nth-child(2) > a > div.statnum").First().Text()),
|
||||
IsPrivate: strings.Contains(doc.Find("div.fullname > a.badge > img").First().AttrOr("src", ""), "protected"),
|
||||
IsVerified: strings.Contains(doc.Find("div.fullname > a.badge > img").First().AttrOr("src", ""), "verified"),
|
||||
Location: strings.TrimSpace(doc.Find(".location").First().Text()),
|
||||
Name: strings.TrimSpace(doc.Find(".fullname").First().Text()),
|
||||
TweetsCount: parseCount(doc.Find("table.profile-stats > tbody > tr > td:nth-child(1) > div.statnum").First().Text()),
|
||||
URL: "https://twitter.com/" + screenName,
|
||||
Username: screenName,
|
||||
Website: strings.TrimSpace(doc.Find("div.url > div > a").First().AttrOr("data-url", "")),
|
||||
}, nil
|
||||
profile := Profile{
|
||||
Avatar: user.ProfileImageURLHTTPS,
|
||||
Banner: user.ProfileBannerURL,
|
||||
Biography: user.Description,
|
||||
FollowersCount: user.FollowersCount,
|
||||
FollowingCount: user.FavouritesCount,
|
||||
FriendsCount: user.FriendsCount,
|
||||
IsPrivate: user.Protected,
|
||||
IsVerified: user.Verified,
|
||||
LikesCount: user.FavouritesCount,
|
||||
ListedCount: user.ListedCount,
|
||||
Location: user.Location,
|
||||
Name: user.Name,
|
||||
PinnedTweetIDs: user.PinnedTweetIdsStr,
|
||||
TweetsCount: user.StatusesCount,
|
||||
URL: "https://twitter.com/" + user.ScreenName,
|
||||
UserID: user.IDStr,
|
||||
Username: user.ScreenName,
|
||||
}
|
||||
|
||||
func parseCount(str string) (i int) {
|
||||
i, _ = strconv.Atoi(strings.Replace(str, ",", "", -1))
|
||||
return
|
||||
tm, err := time.Parse(time.RubyDate, user.CreatedAt)
|
||||
if err == nil {
|
||||
tm = tm.UTC()
|
||||
profile.Joined = &tm
|
||||
}
|
||||
|
||||
if len(user.Entities.URL.Urls) > 0 {
|
||||
profile.Website = user.Entities.URL.Urls[0].ExpandedURL
|
||||
}
|
||||
|
||||
return profile, nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,28 +2,30 @@ package twitterscraper
|
|||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/google/go-cmp/cmp/cmpopts"
|
||||
)
|
||||
|
||||
func TestGetProfile(t *testing.T) {
|
||||
// loc := time.FixedZone("UTC", 0)
|
||||
// joined := time.Date(2007, 02, 20, 6, 35, 0, 0, loc)
|
||||
loc := time.FixedZone("UTC", 0)
|
||||
joined := time.Date(2007, 02, 20, 14, 35, 54, 0, loc)
|
||||
sample := Profile{
|
||||
Avatar: "https://pbs.twimg.com/profile_images/1308010958862905345/-SGZioPb_normal.jpg",
|
||||
// Banner: "https://pbs.twimg.com/profile_banners/783214/1596041768/1500x500",
|
||||
Banner: "https://pbs.twimg.com/profile_banners/783214/1604501727",
|
||||
Biography: "What's happening!?",
|
||||
// Birthday: "March 21",
|
||||
IsPrivate: false,
|
||||
IsVerified: true,
|
||||
// Joined: &joined,
|
||||
Joined: &joined,
|
||||
Location: "everywhere",
|
||||
Name: "Twitter",
|
||||
PinnedTweetIDs: []string{},
|
||||
URL: "https://twitter.com/Twitter",
|
||||
// UserID: "783214",
|
||||
UserID: "783214",
|
||||
Username: "Twitter",
|
||||
Website: "about.twitter.com",
|
||||
Website: "https://about.twitter.com/",
|
||||
}
|
||||
|
||||
profile, err := GetProfile("Twitter")
|
||||
|
|
@ -34,7 +36,9 @@ func TestGetProfile(t *testing.T) {
|
|||
cmpOptions := cmp.Options{
|
||||
cmpopts.IgnoreFields(Profile{}, "FollowersCount"),
|
||||
cmpopts.IgnoreFields(Profile{}, "FollowingCount"),
|
||||
cmpopts.IgnoreFields(Profile{}, "FriendsCount"),
|
||||
cmpopts.IgnoreFields(Profile{}, "LikesCount"),
|
||||
cmpopts.IgnoreFields(Profile{}, "ListedCount"),
|
||||
cmpopts.IgnoreFields(Profile{}, "TweetsCount"),
|
||||
}
|
||||
if diff := cmp.Diff(sample, profile, cmpOptions...); diff != "" {
|
||||
|
|
@ -47,9 +51,9 @@ func TestGetProfile(t *testing.T) {
|
|||
if profile.FollowingCount == 0 {
|
||||
t.Error("Expected FollowingCount is greater than zero")
|
||||
}
|
||||
// if profile.LikesCount == 0 {
|
||||
// t.Error("Expected LikesCount is greater than zero")
|
||||
// }
|
||||
if profile.LikesCount == 0 {
|
||||
t.Error("Expected LikesCount is greater than zero")
|
||||
}
|
||||
if profile.TweetsCount == 0 {
|
||||
t.Error("Expected TweetsCount is greater than zero")
|
||||
}
|
||||
|
|
|
|||
137
search.go
137
search.go
|
|
@ -2,143 +2,44 @@ package twitterscraper
|
|||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
const mobileSearchURL = "https://mobile.twitter.com/search?q=%s"
|
||||
|
||||
// SearchTweets returns channel with tweets for a given search query
|
||||
func SearchTweets(ctx context.Context, query string, maxTweetsNbr int) <-chan *Result {
|
||||
channel := make(chan *Result)
|
||||
go func(query string) {
|
||||
defer close(channel)
|
||||
var nextCursor string
|
||||
tweetsNbr := 0
|
||||
for tweetsNbr < maxTweetsNbr {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
channel <- &Result{Error: ctx.Err()}
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
tweets, next, err := FetchSearchTweets(query, nextCursor)
|
||||
if err != nil {
|
||||
channel <- &Result{Error: err}
|
||||
return
|
||||
}
|
||||
|
||||
if len(tweets) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
for _, tweet := range tweets {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
channel <- &Result{Error: ctx.Err()}
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
if tweetsNbr < maxTweetsNbr {
|
||||
nextCursor = next
|
||||
channel <- &Result{Tweet: *tweet}
|
||||
}
|
||||
tweetsNbr++
|
||||
}
|
||||
}
|
||||
}(query)
|
||||
return channel
|
||||
return getTimeline(ctx, query, maxTweetsNbr, FetchSearchTweets)
|
||||
}
|
||||
|
||||
// FetchSearchTweets gets tweets for a given search query, via the Twitter frontend API
|
||||
func FetchSearchTweets(query, nextCursor string) ([]*Tweet, string, error) {
|
||||
url := fmt.Sprintf(mobileSearchURL, url.PathEscape(query))
|
||||
if nextCursor != "" {
|
||||
url = "https://mobile.twitter.com" + nextCursor
|
||||
func FetchSearchTweets(query string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) {
|
||||
query = url.PathEscape(query)
|
||||
if maxTweetsNbr > 200 {
|
||||
maxTweetsNbr = 200
|
||||
}
|
||||
|
||||
client := http.DefaultClient
|
||||
if HTTPProxy != nil {
|
||||
client = &http.Client{
|
||||
Transport: &http.Transport{
|
||||
Proxy: http.ProxyURL(HTTPProxy),
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: 10 * time.Second,
|
||||
}).DialContext,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
req, err := newRequest("GET", "https://twitter.com/i/api/2/search/adaptive.json")
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
|
||||
req.Header.Set("Referer", "https://mobile.twitter.com/")
|
||||
req.Header.Set("User-Agent", "Opera/9.80 (J2ME/MIDP; Opera Mini/5.1.21214/28.2725; U; ru) Presto/2.8.119 Version/11.10")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if resp == nil {
|
||||
return nil, "", err
|
||||
q := req.URL.Query()
|
||||
q.Add("q", query)
|
||||
q.Add("count", strconv.Itoa(maxTweetsNbr))
|
||||
q.Add("query_source", "typed_query")
|
||||
q.Add("pc", "1")
|
||||
q.Add("spelling_corrections", "1")
|
||||
if cursor != "" {
|
||||
q.Add("cursor", cursor)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
req.URL.RawQuery = q.Encode()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, "", fmt.Errorf("response status: %s", resp.Status)
|
||||
}
|
||||
|
||||
return readTweetsFromMobileHTML(resp.Body)
|
||||
}
|
||||
|
||||
func readTweetsFromMobileHTML(htm io.ReadCloser) ([]*Tweet, string, error) {
|
||||
var tweets []*Tweet
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(htm)
|
||||
var timeline timeline
|
||||
err = requestAPI(req, &timeline)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
|
||||
doc.Find("table.tweet").Each(func(i int, s *goquery.Selection) {
|
||||
var tweet Tweet
|
||||
tweetID, ok := s.Find(".tweet-text").Attr("data-id")
|
||||
if ok {
|
||||
tweet.ID = tweetID
|
||||
tweet.Username = strings.TrimPrefix(strings.TrimSpace(s.Find("td.user-info > a > div.username").Text()), "@")
|
||||
tweet.PermanentURL = fmt.Sprintf("https://twitter.com/%s/status/%s", tweet.Username, tweet.ID)
|
||||
tweet.Text = strings.TrimSpace(s.Find(".tweet-text").Text())
|
||||
tweet.HTML, _ = s.Find(".tweet-text").Html()
|
||||
tweet.HTML = strings.TrimSpace(tweet.HTML)
|
||||
s.Find("td.tweet-social-context > span").Each(func(i int, c *goquery.Selection) {
|
||||
tweet.IsRetweet = true
|
||||
})
|
||||
s.Find(".twitter-hashtag").Each(func(i int, h *goquery.Selection) {
|
||||
tweet.Hashtags = append(tweet.Hashtags, h.Text())
|
||||
})
|
||||
s.Find("a.tco-link:not(.u-hidden)").Each(func(i int, u *goquery.Selection) {
|
||||
if link, ok := u.Attr("data-expanded-url"); ok {
|
||||
tweet.URLs = append(tweet.URLs, link)
|
||||
}
|
||||
})
|
||||
s.Find("div.media > img").Each(func(i int, p *goquery.Selection) {
|
||||
if link, ok := p.Attr("src"); ok {
|
||||
tweet.Photos = append(tweet.Photos, strings.TrimSuffix(link, ":small"))
|
||||
}
|
||||
})
|
||||
|
||||
tweets = append(tweets, &tweet)
|
||||
}
|
||||
})
|
||||
|
||||
nextCursor := doc.Find("div.w-button-more > a").AttrOr("href", "")
|
||||
|
||||
tweets, nextCursor := parseTimeline(&timeline)
|
||||
return tweets, nextCursor, nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,21 +7,21 @@ import (
|
|||
|
||||
func TestGetSearchTweets(t *testing.T) {
|
||||
count := 0
|
||||
maxTweetsNbr := 50
|
||||
maxTweetsNbr := 250
|
||||
for tweet := range SearchTweets(context.Background(), "twitter scraper data -filter:retweets", maxTweetsNbr) {
|
||||
if tweet.Error != nil {
|
||||
t.Error(tweet.Error)
|
||||
} else {
|
||||
count++
|
||||
if tweet.HTML == "" {
|
||||
t.Error("Expected tweet HTML is not empty")
|
||||
}
|
||||
if tweet.ID == "" {
|
||||
t.Error("Expected tweet ID is not empty")
|
||||
}
|
||||
if tweet.PermanentURL == "" {
|
||||
t.Error("Expected tweet PermanentURL is not empty")
|
||||
}
|
||||
if tweet.IsRetweet {
|
||||
t.Error("Expected tweet IsRetweet is false")
|
||||
}
|
||||
if tweet.Text == "" {
|
||||
t.Error("Expected tweet Text is not empty")
|
||||
}
|
||||
|
|
|
|||
52
trends.go
52
trends.go
|
|
@ -1,55 +1,29 @@
|
|||
package twitterscraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
const trendsURL = "https://mobile.twitter.com/trends"
|
||||
|
||||
// GetTrends returns a list of trends.
|
||||
func GetTrends() ([]string, error) {
|
||||
client := http.DefaultClient
|
||||
if HTTPProxy != nil {
|
||||
client = &http.Client{
|
||||
Transport: &http.Transport{
|
||||
Proxy: http.ProxyURL(HTTPProxy),
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: 10 * time.Second,
|
||||
}).DialContext,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("GET", trendsURL, nil)
|
||||
req, err := newRequest("GET", "https://twitter.com/i/api/2/guide.json")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("Accept-Language", "en-US")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if resp == nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
q := req.URL.Query()
|
||||
q.Add("count", "20")
|
||||
q.Add("candidate_source", "trends")
|
||||
q.Add("include_page_configuration", "false")
|
||||
q.Add("entity_tokens", "false")
|
||||
req.URL.RawQuery = q.Encode()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("response status: %s", resp.Status)
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||||
var jsn timeline
|
||||
err = requestAPI(req, &jsn)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var trends []string
|
||||
doc.Find("li.topic").Each(func(i int, s *goquery.Selection) {
|
||||
trends = append(trends, strings.TrimSpace(s.Text()))
|
||||
})
|
||||
for _, item := range jsn.Timeline.Instructions[1].AddEntries.Entries[1].Content.TimelineModule.Items {
|
||||
trends = append(trends, item.Item.ClientEventInfo.Details.GuideDetails.TransparentGuideDetails.TrendMetadata.TrendName)
|
||||
}
|
||||
|
||||
return trends, nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,7 +10,13 @@ func TestGetTrends(t *testing.T) {
|
|||
t.Error(err)
|
||||
}
|
||||
|
||||
if len(trends) != 10 {
|
||||
t.Errorf("Expected 10 trends, got %d: %#v", len(trends), trends)
|
||||
if len(trends) != 20 {
|
||||
t.Errorf("Expected 20 trends, got %d: %#v", len(trends), trends)
|
||||
}
|
||||
|
||||
for _, trend := range trends {
|
||||
if trend == "" {
|
||||
t.Error("Expected trend is not empty")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
196
tweets.go
196
tweets.go
|
|
@ -2,196 +2,44 @@ package twitterscraper
|
|||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
const ajaxURL = "https://twitter.com/i/profiles/show/%s/timeline/tweets"
|
||||
|
||||
// Video type.
|
||||
type Video struct {
|
||||
ID string
|
||||
Preview string
|
||||
}
|
||||
|
||||
// Tweet type.
|
||||
type Tweet struct {
|
||||
Hashtags []string
|
||||
HTML string
|
||||
ID string
|
||||
IsPin bool
|
||||
IsRetweet bool
|
||||
Likes int
|
||||
PermanentURL string
|
||||
Photos []string
|
||||
Replies int
|
||||
Retweets int
|
||||
Text string
|
||||
TimeParsed time.Time
|
||||
Timestamp int64
|
||||
URLs []string
|
||||
UserID string
|
||||
Username string
|
||||
Videos []Video
|
||||
}
|
||||
|
||||
// Result of scrapping.
|
||||
type Result struct {
|
||||
Tweet
|
||||
Error error
|
||||
}
|
||||
|
||||
// GetTweets returns channel with tweets for a given user.
|
||||
func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result {
|
||||
channel := make(chan *Result)
|
||||
go func(user string) {
|
||||
defer close(channel)
|
||||
var lastTweetID string
|
||||
tweetsNbr := 0
|
||||
for tweetsNbr < maxTweetsNbr {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
channel <- &Result{Error: ctx.Err()}
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
tweets, err := FetchTweets(user, lastTweetID)
|
||||
if err != nil {
|
||||
channel <- &Result{Error: err}
|
||||
return
|
||||
}
|
||||
|
||||
if len(tweets) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
for _, tweet := range tweets {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
channel <- &Result{Error: ctx.Err()}
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
if tweetsNbr < maxTweetsNbr {
|
||||
lastId, _ := strconv.ParseInt(tweet.ID, 10, 64)
|
||||
lastTweetID = strconv.FormatInt(lastId-1, 10)
|
||||
channel <- &Result{Tweet: *tweet}
|
||||
}
|
||||
tweetsNbr++
|
||||
}
|
||||
}
|
||||
}(user)
|
||||
return channel
|
||||
return getTimeline(ctx, user, maxTweetsNbr, FetchTweets)
|
||||
}
|
||||
|
||||
// FetchTweets gets tweets for a given user, via the Twitter frontend API.
|
||||
func FetchTweets(user string, last string) ([]*Tweet, error) {
|
||||
req, err := http.NewRequest("GET", "https://syndication.twitter.com/timeline/profile", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
func FetchTweets(user string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error) {
|
||||
if maxTweetsNbr > 200 {
|
||||
maxTweetsNbr = 200
|
||||
}
|
||||
|
||||
req.Header.Set("Referer", "https://publish.twitter.com/")
|
||||
userID, err := GetUserIDByScreenName(user)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
|
||||
req, err := newRequest("GET", "https://api.twitter.com/2/timeline/profile/"+userID+".json")
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
|
||||
q := req.URL.Query()
|
||||
q.Add("screen_name", user)
|
||||
q.Add("with_replies", "true")
|
||||
if last != "" {
|
||||
q.Add("max_position", last)
|
||||
q.Add("count", strconv.Itoa(maxTweetsNbr))
|
||||
q.Add("userId", userID)
|
||||
if cursor != "" {
|
||||
q.Add("cursor", cursor)
|
||||
}
|
||||
req.URL.RawQuery = q.Encode()
|
||||
|
||||
htm, err := getHTMLFromJSON(req, "body")
|
||||
var timeline timeline
|
||||
err = requestAPI(req, &timeline)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, "", err
|
||||
}
|
||||
|
||||
tweets, err := readTweetsFromHTML(htm)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return tweets, nil
|
||||
}
|
||||
|
||||
func readTweetsFromHTML(htm *strings.Reader) ([]*Tweet, error) {
|
||||
var tweets []*Tweet
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(htm)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
doc.Find(".timeline-Tweet").Each(func(i int, s *goquery.Selection) {
|
||||
var tweet Tweet
|
||||
timeStr, ok := s.Find(".timeline-Tweet-metadata > a > time").Attr("datetime")
|
||||
if ok {
|
||||
tweet.TimeParsed, _ = time.Parse("2006-01-02T15:04:05-0700", timeStr)
|
||||
tweet.Timestamp = tweet.TimeParsed.Unix()
|
||||
tweet.ID = s.AttrOr("data-tweet-id", "")
|
||||
// tweet.UserID = s.Find(".tweet").AttrOr("data-user-id", "")
|
||||
tweet.Username = strings.TrimPrefix(s.Find(".TweetAuthor-screenName").AttrOr("title", ""), "@")
|
||||
tweet.PermanentURL = fmt.Sprintf("https://twitter.com/%s/status/%s", tweet.Username, tweet.ID)
|
||||
tweet.Text = s.Find(".timeline-Tweet-text").Text()
|
||||
tweet.HTML, _ = s.Find(".timeline-Tweet-text").Html()
|
||||
s.Find(".timeline-Tweet-retweetCredit").Each(func(i int, c *goquery.Selection) {
|
||||
tweet.IsRetweet = true
|
||||
})
|
||||
// s.Find("span.js-pinned-text").Each(func(i int, c *goquery.Selection) {
|
||||
// tweet.IsPin = true
|
||||
// })
|
||||
// s.Find(".ProfileTweet-actionCount").Each(func(i int, c *goquery.Selection) {
|
||||
// txt := strings.TrimSpace(c.Text())
|
||||
// switch {
|
||||
// case strings.HasSuffix(txt, "likes"):
|
||||
// l := strings.Split(txt, " ")
|
||||
// tweet.Likes, _ = strconv.Atoi(l[0])
|
||||
// case strings.HasSuffix(txt, "replies"):
|
||||
// l := strings.Split(txt, " ")
|
||||
// tweet.Replies, _ = strconv.Atoi(l[0])
|
||||
// case strings.HasSuffix(txt, "retweets"):
|
||||
// l := strings.Split(txt, " ")
|
||||
// tweet.Retweets, _ = strconv.Atoi(l[0])
|
||||
// }
|
||||
// })
|
||||
s.Find(".hashtag > span.PrettyLink-value").Each(func(i int, h *goquery.Selection) {
|
||||
tweet.Hashtags = append(tweet.Hashtags, h.Text())
|
||||
})
|
||||
s.Find("a.link:not(.u-hidden)").Each(func(i int, u *goquery.Selection) {
|
||||
if link, ok := u.Attr("data-expanded-url"); ok {
|
||||
tweet.URLs = append(tweet.URLs, link)
|
||||
}
|
||||
})
|
||||
s.Find(".NaturalImage-image").Each(func(i int, p *goquery.Selection) {
|
||||
if link, ok := p.Attr("data-image"); ok {
|
||||
tweet.Photos = append(tweet.Photos, link+"?format=jpg&name=orig")
|
||||
}
|
||||
})
|
||||
s.Find(".CroppedImage-image").Each(func(i int, p *goquery.Selection) {
|
||||
if link, ok := p.Attr("data-image"); ok {
|
||||
tweet.Photos = append(tweet.Photos, link+"?format=jpg&name=orig")
|
||||
}
|
||||
})
|
||||
// s.Find(".PlayableMedia-player").Each(func(i int, v *goquery.Selection) {
|
||||
// if style, ok := v.Attr("style"); ok {
|
||||
// if strings.Contains(style, "background") {
|
||||
// match := regexp.MustCompile(`https:\/\/.+\/([\w-]+)\.(?:jpg|png)`).FindStringSubmatch(style)
|
||||
// if len(match) == 2 {
|
||||
// tweet.Videos = append(tweet.Videos, Video{ID: match[1], Preview: match[0]})
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// })
|
||||
tweets = append(tweets, &tweet)
|
||||
}
|
||||
})
|
||||
|
||||
return tweets, nil
|
||||
tweets, nextCursor := parseTimeline(&timeline)
|
||||
return tweets, nextCursor, nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,21 +7,25 @@ import (
|
|||
|
||||
func TestGetTweets(t *testing.T) {
|
||||
count := 0
|
||||
maxTweetsNbr := 50
|
||||
maxTweetsNbr := 300
|
||||
dupcheck := make(map[string]bool)
|
||||
for tweet := range GetTweets(context.Background(), "Twitter", maxTweetsNbr) {
|
||||
if tweet.Error != nil {
|
||||
t.Error(tweet.Error)
|
||||
} else {
|
||||
count++
|
||||
if tweet.HTML == "" {
|
||||
t.Error("Expected tweet HTML is not empty")
|
||||
}
|
||||
if tweet.ID == "" {
|
||||
t.Error("Expected tweet ID is not empty")
|
||||
} else {
|
||||
if dupcheck[tweet.ID] {
|
||||
t.Errorf("Detect duplicated tweet ID: %s", tweet.ID)
|
||||
} else {
|
||||
dupcheck[tweet.ID] = true
|
||||
}
|
||||
}
|
||||
if tweet.UserID == "" {
|
||||
t.Error("Expected tweet UserID is not empty")
|
||||
}
|
||||
// if tweet.UserID == "" {
|
||||
// t.Error("Expected tweet UserID is not empty")
|
||||
// }
|
||||
if tweet.Username == "" {
|
||||
t.Error("Expected tweet Username is not empty")
|
||||
}
|
||||
|
|
@ -37,6 +41,17 @@ func TestGetTweets(t *testing.T) {
|
|||
if tweet.Timestamp == 0 {
|
||||
t.Error("Expected tweet Timestamp is greater than zero")
|
||||
}
|
||||
for _, video := range tweet.Videos {
|
||||
if video.ID == "" {
|
||||
t.Error("Expected tweet video ID is not empty")
|
||||
}
|
||||
if video.Preview == "" {
|
||||
t.Error("Expected tweet video Preview is not empty")
|
||||
}
|
||||
if video.URL == "" {
|
||||
t.Error("Expected tweet video URL is not empty")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if count != maxTweetsNbr {
|
||||
|
|
|
|||
151
types.go
Normal file
151
types.go
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
package twitterscraper
|
||||
|
||||
import "time"
|
||||
|
||||
type (
	// Video describes a video attached to a tweet.
	Video struct {
		// ID is the media identifier (id_str of the media entry).
		ID string
		// Preview is the HTTPS URL of the media preview image.
		Preview string
		// URL is the address of the selected video variant.
		URL string
	}

	// Tweet is the public, flattened representation of a single tweet.
	Tweet struct {
		// Hashtags holds the text of every hashtag entity.
		Hashtags []string
		// ID is the tweet identifier.
		ID string
		// IsQuoted is true when the tweet references a quoted status.
		IsQuoted bool
		// IsPin is true when the tweet matches one of its author's pinned tweet IDs.
		IsPin bool
		// IsReply is true when the tweet is a reply to another status.
		IsReply bool
		// IsRetweet is true when the tweet references a retweeted status.
		IsRetweet bool
		// Likes is the favorite counter.
		Likes int
		// PermanentURL has the form https://twitter.com/<username>/status/<id>.
		PermanentURL string
		// Photos holds the HTTPS URLs of media entries of type "photo".
		Photos []string
		// Replies is the reply counter.
		Replies int
		// Retweets is the retweet counter.
		Retweets int
		// Text is the full text of the tweet.
		Text string
		// TimeParsed is the creation time; it stays zero when the source value
		// cannot be parsed.
		TimeParsed time.Time
		// Timestamp is TimeParsed expressed as Unix seconds.
		Timestamp int64
		// URLs holds the expanded form of every URL entity.
		URLs []string
		// UserID is the identifier of the author.
		UserID string
		// Username is the screen name of the author.
		Username string
		// Videos lists the media entries of type "video".
		Videos []Video
	}

	// Result is one item of a scraping stream: either a tweet or the error
	// that ended the stream.
	Result struct {
		Tweet
		Error error
	}

	// timeline mirrors the JSON document returned by the timeline endpoints.
	// Only the fields declared here are decoded.
	timeline struct {
		// GlobalObjects holds the tweet and user objects, each map keyed by
		// the object's ID.
		GlobalObjects struct {
			Tweets map[string]struct {
				ConversationIDStr string `json:"conversation_id_str"`
				CreatedAt string `json:"created_at"`
				FavoriteCount int `json:"favorite_count"`
				FullText string `json:"full_text"`
				Entities struct {
					Hashtags []struct {
						Text string `json:"text"`
					} `json:"hashtags"`
					Media []struct {
						MediaURLHttps string `json:"media_url_https"`
						Type string `json:"type"`
					} `json:"media"`
					URLs []struct {
						ExpandedURL string `json:"expanded_url"`
					} `json:"urls"`
				} `json:"entities"`
				ExtendedEntities struct {
					Media []struct {
						IDStr string `json:"id_str"`
						MediaURLHttps string `json:"media_url_https"`
						Type string `json:"type"`
						VideoInfo struct {
							Variants []struct {
								Bitrate int `json:"bitrate,omitempty"`
								URL string `json:"url"`
							} `json:"variants"`
						} `json:"video_info"`
					} `json:"media"`
				} `json:"extended_entities"`
				InReplyToStatusIDStr string `json:"in_reply_to_status_id_str"`
				ReplyCount int `json:"reply_count"`
				RetweetCount int `json:"retweet_count"`
				RetweetedStatusIDStr string `json:"retweeted_status_id_str"`
				QuotedStatusIDStr string `json:"quoted_status_id_str"`
				Time time.Time `json:"time"`
				UserIDStr string `json:"user_id_str"`
			} `json:"tweets"`
			Users map[string]struct {
				CreatedAt string `json:"created_at"`
				Description string `json:"description"`
				Entities struct {
					URL struct {
						Urls []struct {
							ExpandedURL string `json:"expanded_url"`
						} `json:"urls"`
					} `json:"url"`
				} `json:"entities"`
				FavouritesCount int `json:"favourites_count"`
				FollowersCount int `json:"followers_count"`
				FriendsCount int `json:"friends_count"`
				IDStr string `json:"id_str"`
				ListedCount int `json:"listed_count"`
				Name string `json:"name"`
				Location string `json:"location"`
				PinnedTweetIdsStr []string `json:"pinned_tweet_ids_str"`
				ProfileBannerURL string `json:"profile_banner_url"`
				ProfileImageURLHTTPS string `json:"profile_image_url_https"`
				Protected bool `json:"protected"`
				ScreenName string `json:"screen_name"`
				StatusesCount int `json:"statuses_count"`
				Verified bool `json:"verified"`
			} `json:"users"`
		} `json:"globalObjects"`
		// Timeline carries the instructions whose entries give the display
		// order of the tweets and the paging cursors.
		Timeline struct {
			Instructions []struct {
				AddEntries struct {
					Entries []struct {
						Content struct {
							// Item references a tweet from GlobalObjects by ID.
							Item struct {
								Content struct {
									Tweet struct {
										ID string `json:"id"`
									} `json:"tweet"`
								} `json:"content"`
							} `json:"item"`
							// Operation carries a paging cursor and its type.
							Operation struct {
								Cursor struct {
									Value string `json:"value"`
									CursorType string `json:"cursorType"`
								} `json:"cursor"`
							} `json:"operation"`
							TimelineModule struct {
								Items []struct {
									Item struct {
										ClientEventInfo struct {
											Details struct {
												GuideDetails struct {
													TransparentGuideDetails struct {
														TrendMetadata struct {
															TrendName string `json:"trendName"`
														} `json:"trendMetadata"`
													} `json:"transparentGuideDetails"`
												} `json:"guideDetails"`
											} `json:"details"`
										} `json:"clientEventInfo"`
									} `json:"item"`
								} `json:"items"`
							} `json:"timelineModule"`
						} `json:"content,omitempty"`
					} `json:"entries"`
				} `json:"addEntries"`
			} `json:"instructions"`
		} `json:"timeline"`
	}

	// fetchFunc fetches one page of tweets. It receives the user or query
	// string, the overall tweet limit and the cursor of the page to load
	// ("" for the first page), and returns the tweets of that page, the
	// cursor of the following page and any error.
	fetchFunc func(user string, maxTweetsNbr int, cursor string) ([]*Tweet, string, error)
)
|
||||
197
util.go
197
util.go
|
|
@ -1,25 +1,30 @@
|
|||
package twitterscraper
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
//HttpProxy Public variable for Http proxy
|
||||
var HTTPProxy *url.URL
|
||||
var (
	// IncludeReplies is sent as the include_tweet_replies query parameter on
	// every request built by newRequest; set it to true to ask for replies.
	IncludeReplies bool
	// HTTPProxy is the proxy URL that newHTTPClient routes requests through;
	// nil means no proxy is used.
	HTTPProxy *url.URL
)
|
||||
|
||||
// SetProxy set http proxy format `http://HOST:PORT`
|
||||
func SetProxy(Proxy string) error {
|
||||
match, _ := regexp.MatchString("http.+", Proxy)
|
||||
if !match {
|
||||
func SetProxy(proxy string) error {
|
||||
if !strings.HasPrefix(proxy, "http://") {
|
||||
return errors.New("only support http protocol")
|
||||
}
|
||||
urlproxy, err := url.Parse(Proxy)
|
||||
urlproxy, err := url.Parse(proxy)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
@ -27,42 +32,172 @@ func SetProxy(Proxy string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func newRequest(url string) (*http.Request, error) {
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
func newHTTPClient() *http.Client {
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
if HTTPProxy != nil {
|
||||
client = &http.Client{
|
||||
Transport: &http.Transport{
|
||||
Proxy: http.ProxyURL(HTTPProxy),
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: 10 * time.Second,
|
||||
}).DialContext,
|
||||
},
|
||||
}
|
||||
}
|
||||
return client
|
||||
}
|
||||
|
||||
func newRequest(method string, url string) (*http.Request, error) {
|
||||
req, err := http.NewRequest(method, url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req.Header.Set("Accept", "application/json, text/javascript, */*; q=0.01")
|
||||
req.Header.Set("Accept-Language", "en-US")
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8")
|
||||
req.Header.Set("X-Twitter-Active-User", "yes")
|
||||
req.Header.Set("X-Requested-With", "XMLHttpRequest")
|
||||
q := req.URL.Query()
|
||||
q.Add("include_profile_interstitial_type", "1")
|
||||
q.Add("include_blocking", "1")
|
||||
q.Add("include_blocked_by", "1")
|
||||
q.Add("include_followed_by", "1")
|
||||
q.Add("include_want_retweets", "1")
|
||||
q.Add("include_mute_edge", "1")
|
||||
q.Add("include_can_dm", "1")
|
||||
q.Add("include_can_media_tag", "1")
|
||||
q.Add("skip_status", "1")
|
||||
q.Add("cards_platform", "Web-12")
|
||||
q.Add("include_cards", "1")
|
||||
q.Add("include_ext_alt_text", "true")
|
||||
q.Add("include_quote_count", "true")
|
||||
q.Add("include_reply_count", "1")
|
||||
q.Add("tweet_mode", "extended")
|
||||
q.Add("include_entities", "true")
|
||||
q.Add("include_user_entities", "true")
|
||||
q.Add("include_ext_media_color", "true")
|
||||
q.Add("include_ext_media_availability", "true")
|
||||
q.Add("send_error_codes", "true")
|
||||
q.Add("simple_quoted_tweet", "true")
|
||||
q.Add("include_tweet_replies", strconv.FormatBool(IncludeReplies))
|
||||
q.Add("ext", "mediaStats,highlightedLabel")
|
||||
req.URL.RawQuery = q.Encode()
|
||||
|
||||
return req, nil
|
||||
}
|
||||
|
||||
func getHTMLFromJSON(req *http.Request, field string) (*strings.Reader, error) {
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
func getTimeline(ctx context.Context, query string, maxTweetsNbr int, fetchFunc fetchFunc) <-chan *Result {
|
||||
channel := make(chan *Result)
|
||||
go func(user string) {
|
||||
defer close(channel)
|
||||
var nextCursor string
|
||||
tweetsNbr := 0
|
||||
for tweetsNbr < maxTweetsNbr {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
channel <- &Result{Error: ctx.Err()}
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
tweets, next, err := fetchFunc(query, maxTweetsNbr, nextCursor)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("response status: %s", resp.Status)
|
||||
channel <- &Result{Error: err}
|
||||
return
|
||||
}
|
||||
|
||||
ajaxJSON := make(map[string]interface{})
|
||||
err = json.NewDecoder(resp.Body).Decode(&ajaxJSON)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
if len(tweets) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
htm, ok := ajaxJSON[field].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("field %s not found in JSON", field)
|
||||
for _, tweet := range tweets {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
channel <- &Result{Error: ctx.Err()}
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
return strings.NewReader(htm), nil
|
||||
if tweetsNbr < maxTweetsNbr {
|
||||
nextCursor = next
|
||||
channel <- &Result{Tweet: *tweet}
|
||||
}
|
||||
tweetsNbr++
|
||||
}
|
||||
}
|
||||
}(query)
|
||||
return channel
|
||||
}
|
||||
|
||||
func parseTimeline(timeline *timeline) ([]*Tweet, string) {
|
||||
tweets := make(map[string]Tweet)
|
||||
|
||||
for id, tweet := range timeline.GlobalObjects.Tweets {
|
||||
username := timeline.GlobalObjects.Users[tweet.UserIDStr].ScreenName
|
||||
tw := Tweet{
|
||||
ID: id,
|
||||
Likes: tweet.FavoriteCount,
|
||||
PermanentURL: fmt.Sprintf("https://twitter.com/%s/status/%s", username, id),
|
||||
Replies: tweet.RetweetCount,
|
||||
Retweets: tweet.RetweetCount,
|
||||
Text: tweet.FullText,
|
||||
UserID: tweet.UserIDStr,
|
||||
Username: username,
|
||||
}
|
||||
tm, err := time.Parse(time.RubyDate, tweet.CreatedAt)
|
||||
if err == nil {
|
||||
tw.TimeParsed = tm
|
||||
tw.Timestamp = tm.Unix()
|
||||
}
|
||||
if tweet.QuotedStatusIDStr != "" {
|
||||
tw.IsQuoted = true
|
||||
}
|
||||
if tweet.InReplyToStatusIDStr != "" {
|
||||
tw.IsReply = true
|
||||
}
|
||||
if tweet.RetweetedStatusIDStr != "" {
|
||||
tw.IsRetweet = true
|
||||
}
|
||||
for _, pinned := range timeline.GlobalObjects.Users[tweet.UserIDStr].PinnedTweetIdsStr {
|
||||
if tweet.ConversationIDStr == pinned {
|
||||
tw.IsPin = true
|
||||
break
|
||||
}
|
||||
}
|
||||
for _, hash := range tweet.Entities.Hashtags {
|
||||
tw.Hashtags = append(tw.Hashtags, hash.Text)
|
||||
}
|
||||
for _, media := range tweet.Entities.Media {
|
||||
if media.Type == "photo" {
|
||||
tw.Photos = append(tw.Photos, media.MediaURLHttps)
|
||||
}
|
||||
}
|
||||
for _, media := range tweet.ExtendedEntities.Media {
|
||||
if media.Type == "video" {
|
||||
video := Video{
|
||||
ID: media.IDStr,
|
||||
Preview: media.MediaURLHttps,
|
||||
}
|
||||
maxBitrate := 0
|
||||
for _, variant := range media.VideoInfo.Variants {
|
||||
if variant.Bitrate > maxBitrate {
|
||||
video.URL = strings.TrimSuffix(variant.URL, "?tag=10")
|
||||
}
|
||||
}
|
||||
tw.Videos = append(tw.Videos, video)
|
||||
}
|
||||
}
|
||||
for _, url := range tweet.Entities.URLs {
|
||||
tw.URLs = append(tw.URLs, url.ExpandedURL)
|
||||
}
|
||||
tweets[tw.ID] = tw
|
||||
}
|
||||
|
||||
var cursor string
|
||||
var orderedTweets []*Tweet
|
||||
for _, entry := range timeline.Timeline.Instructions[0].AddEntries.Entries {
|
||||
if tweet, ok := tweets[entry.Content.Item.Content.Tweet.ID]; ok {
|
||||
orderedTweets = append(orderedTweets, &tweet)
|
||||
}
|
||||
if entry.Content.Operation.Cursor.CursorType == "Bottom" {
|
||||
cursor = entry.Content.Operation.Cursor.Value
|
||||
}
|
||||
}
|
||||
return orderedTweets, cursor
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue