From 37bb62a82a6bce76863b750a03cae4c51d7398ff Mon Sep 17 00:00:00 2001 From: xisco Date: Wed, 13 May 2020 17:35:44 +0200 Subject: [PATCH 1/4] create readTweetsFromHtml func for recycle code --- search.go | 1 + tweets.go | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 search.go diff --git a/search.go b/search.go new file mode 100644 index 0000000..df409dd --- /dev/null +++ b/search.go @@ -0,0 +1 @@ +package twitterscraper diff --git a/tweets.go b/tweets.go index 407b31b..124ee36 100644 --- a/tweets.go +++ b/tweets.go @@ -68,7 +68,6 @@ func GetTweets(user string, pages int) <-chan *Result { // FetchTweets gets tweets for a given user, via the Twitter frontend API func FetchTweets(user string, last string) ([]*Tweet, error) { - var tweets []*Tweet req, err := http.NewRequest("GET", fmt.Sprintf(ajaxURL, user), nil) if err != nil { @@ -96,6 +95,17 @@ func FetchTweets(user string, last string) ([]*Tweet, error) { return nil, err } + tweets, err := readTweetsFromHTML(htm) + if err != nil { + return nil, err + } + + return tweets, nil +} + +func readTweetsFromHTML (htm *strings.Reader) ([]*Tweet, error) { + var tweets []*Tweet + doc, err := goquery.NewDocumentFromReader(htm) if err != nil { return nil, err @@ -108,6 +118,7 @@ func FetchTweets(user string, last string) ([]*Tweet, error) { tweet.Timestamp, _ = strconv.ParseInt(timeStr, 10, 64) tweet.TimeParsed = time.Unix(tweet.Timestamp, 0) tweet.ID = s.AttrOr("data-item-id", "") + user, _ := s.Find(".tweet").Attr("data-screen-name") tweet.PermanentURL = fmt.Sprintf("https://twitter.com/%s/status/%s", user, tweet.ID) tweet.Text = s.Find(".tweet-text").Text() tweet.HTML, _ = s.Find(".tweet-text").Html() From 2abfb27a7b02d77493ef790e4538ecfabed515fe Mon Sep 17 00:00:00 2001 From: xisco Date: Thu, 14 May 2020 14:59:33 +0200 Subject: [PATCH 2/4] add scrap tweets for any search query feature --- .gitignore | 1 + README.md | 29 +++++++++++++++++- search.go | 82 
++++++++++++++++++++++++++++++++++++++++++++++++++ search_test.go | 36 ++++++++++++++++++++++ 4 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 search_test.go diff --git a/.gitignore b/.gitignore index f8c1096..0a035ec 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *.htm? *.json +.idea/ diff --git a/README.md b/README.md index 34d0f70..8c29aa1 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ You can use this library to get the text of any user's Tweets trivially. ## Usage -### Get tweets +### Get user tweets ```golang package main @@ -32,6 +32,33 @@ func main() { It appears you can ask for up to 25 pages of tweets reliably (~486 tweets). +### Get query search tweets + +Tweets containing “twitter” and “scraper” and “data“, filtering out retweets: + +```golang +package main + +import ( + "fmt" + twitterscraper "github.com/n0madic/twitter-scraper" +) + +func main() { + for tweet := range twitterscraper.GetSearchTweets("twitter scraper data -filter:retweets", 50) { + if tweet.Error != nil { + panic(tweet.Error) + } + fmt.Println(tweet.HTML) + } +} +``` + +The search ends if we have 50 tweets. + +See Twitter's standard search operators documentation for how to build queries. 
+ + ### Get profile ```golang diff --git a/search.go b/search.go index df409dd..2fded71 100644 --- a/search.go +++ b/search.go @@ -1 +1,83 @@ package twitterscraper + +import ( + "fmt" + "net/http" + "net/url" + "strconv" +) + +const ajaxSearchURL = "https://twitter.com/i/search/timeline?q=%s" + +// GetTweets returns channel with tweets for a given search query +func GetSearchTweets(query string, maxTweetsNbr int) <-chan *Result { + channel := make(chan *Result) + go func(query string) { + defer close(channel) + var maxId string + tweetsNbr := 0 + for tweetsNbr < maxTweetsNbr { + tweets, err := FetchSearchTweets(query, maxId) + if err != nil { + channel <- &Result{Error: err} + return + } + + if len(tweets) == 0 { + break + } + + for _, tweet := range tweets { + if tweetsNbr < maxTweetsNbr { + lastId, _ := strconv.ParseInt(tweet.ID, 10, 64) + maxId = strconv.FormatInt(lastId - 1, 10) + channel <- &Result{Tweet: *tweet} + } + tweetsNbr++ + } + } + }(query) + return channel +} + +// FetchTweets gets tweets for a given search query, via the Twitter frontend API +func FetchSearchTweets(query, maxId string) ([]*Tweet, error) { + if maxId != "" { + query = query + " max_id:" + maxId + } + + req, err := http.NewRequest( + "GET", + fmt.Sprintf(ajaxSearchURL, url.PathEscape(query)), + nil, + ) + if err != nil { + return nil, err + } + req.Header.Set("Referer", "https://twitter.com/search/timeline") + req.Header.Set("Accept", "application/json, text/javascript, */*; q=0.01") + req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8") + req.Header.Set("Accept-Language", "en-US") + req.Header.Set("X-Twitter-Active-User", "yes") + req.Header.Set("X-Requested-With", "XMLHttpRequest") + + q := req.URL.Query() + q.Add("f", "tweets") + q.Add("include_available_features", "1") + q.Add("include_entities", "1") + q.Add("include_new_items_bar", "true") + + req.URL.RawQuery = q.Encode() + + htm, 
err := getHTMLFromJSON(req, "items_html") + if err != nil { + return nil, err + } + + tweets, err := readTweetsFromHTML(htm) + if err != nil { + return nil, err + } + + return tweets, nil +} diff --git a/search_test.go b/search_test.go new file mode 100644 index 0000000..bb0a771 --- /dev/null +++ b/search_test.go @@ -0,0 +1,36 @@ +package twitterscraper + +import "testing" + +func TestGetSearchTweets(t *testing.T) { + count := 0 + for tweet := range GetSearchTweets("twitter scraper data -filter:retweets", 50) { + if tweet.Error != nil { + t.Error(tweet.Error) + } else { + count++ + if tweet.HTML == "" { + t.Error("Expected tweet HTML is not empty") + } + if tweet.ID == "" { + t.Error("Expected tweet ID is not empty") + } + if tweet.PermanentURL == "" { + t.Error("Expected tweet PermanentURL is not empty") + } + if tweet.Text == "" { + t.Error("Expected tweet Text is not empty") + } + if tweet.TimeParsed.IsZero() { + t.Error("Expected tweet TimeParsed is not zero") + } + if tweet.Timestamp == 0 { + t.Error("Expected tweet Timestamp is greater than zero") + } + } + } + + if count == 0 { + t.Error("Expected tweets count is greater than zero") + } +} From 9497edf1994854abb18b50f342ef1dc3e0349c17 Mon Sep 17 00:00:00 2001 From: xisco Date: Fri, 15 May 2020 17:52:06 +0200 Subject: [PATCH 3/4] add check for maxTweetsNbr and rename GetSearchTweets fun to Search Tweets --- search.go | 6 +++--- search_test.go | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/search.go b/search.go index 2fded71..28aa6ea 100644 --- a/search.go +++ b/search.go @@ -9,8 +9,8 @@ import ( const ajaxSearchURL = "https://twitter.com/i/search/timeline?q=%s" -// GetTweets returns channel with tweets for a given search query -func GetSearchTweets(query string, maxTweetsNbr int) <-chan *Result { +// SearchTweets returns channel with tweets for a given search query +func SearchTweets(query string, maxTweetsNbr int) <-chan *Result { channel := make(chan *Result) go func(query string) 
{ defer close(channel) @@ -40,7 +40,7 @@ func GetSearchTweets(query string, maxTweetsNbr int) <-chan *Result { return channel } -// FetchTweets gets tweets for a given search query, via the Twitter frontend API +// FetchSearchTweets gets tweets for a given search query, via the Twitter frontend API func FetchSearchTweets(query, maxId string) ([]*Tweet, error) { if maxId != "" { query = query + " max_id:" + maxId diff --git a/search_test.go b/search_test.go index bb0a771..23e8872 100644 --- a/search_test.go +++ b/search_test.go @@ -4,7 +4,8 @@ import "testing" func TestGetSearchTweets(t *testing.T) { count := 0 - for tweet := range GetSearchTweets("twitter scraper data -filter:retweets", 50) { + maxTweetsNbr := 50 + for tweet := range SearchTweets("twitter scraper data -filter:retweets", maxTweetsNbr) { if tweet.Error != nil { t.Error(tweet.Error) } else { @@ -30,7 +31,7 @@ func TestGetSearchTweets(t *testing.T) { } } - if count == 0 { - t.Error("Expected tweets count is greater than zero") + if count != maxTweetsNbr { + t.Errorf("Expected tweets count=%v, got: %v", maxTweetsNbr, count) } } From 929df76331c076b7792dfc0c2325768571f4b68b Mon Sep 17 00:00:00 2001 From: xisco Date: Fri, 15 May 2020 17:56:57 +0200 Subject: [PATCH 4/4] update SearchTweets func on README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8c29aa1..e7c98ea 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ func main() { It appears you can ask for up to 25 pages of tweets reliably (~486 tweets). -### Get query search tweets +### Search tweets by query standard operators Tweets containing “twitter” and “scraper” and “data“, filtering out retweets: @@ -45,7 +45,7 @@ import ( ) func main() { - for tweet := range twitterscraper.GetSearchTweets("twitter scraper data -filter:retweets", 50) { + for tweet := range twitterscraper.SearchTweets("twitter scraper data -filter:retweets", 50) { if tweet.Error != nil { panic(tweet.Error) }