add scrap tweets for any search query feature

This commit is contained in:
xisco 2020-05-14 14:59:33 +02:00
parent 37bb62a82a
commit 2abfb27a7b
4 changed files with 147 additions and 1 deletions

1
.gitignore vendored
View file

@ -1,2 +1,3 @@
*.htm?
*.json
.idea/

View file

@ -10,7 +10,7 @@ You can use this library to get the text of any user's Tweets trivially.
## Usage
### Get tweets
### Get user tweets
```golang
package main
@ -32,6 +32,33 @@ func main() {
It appears you can ask for up to 25 pages of tweets reliably (~486 tweets).
### Get query search tweets
Tweets containing “twitter” and “scraper” and “data”, filtering out retweets:
```golang
package main
import (
"fmt"
twitterscraper "github.com/n0madic/twitter-scraper"
)
// main prints the HTML of each tweet found for the example search query,
// asking for at most 50 tweets and aborting on the first reported error.
func main() {
	results := twitterscraper.GetSearchTweets("twitter scraper data -filter:retweets", 50)
	for result := range results {
		if err := result.Error; err != nil {
			panic(err)
		}
		fmt.Println(result.HTML)
	}
}
```
The search stops once 50 tweets have been returned, or earlier if no more results are found.
See <https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators> for how to build standard search queries.
### Get profile
```golang

View file

@ -1 +1,83 @@
package twitterscraper
import (
"fmt"
"net/http"
"net/url"
"strconv"
)
// ajaxSearchURL is the format string for the search timeline endpoint; the %s
// verb is replaced with the escaped search query.
const ajaxSearchURL = "https://twitter.com/i/search/timeline?q=%s"
// GetSearchTweets returns a channel that streams the tweets matching the given
// search query. Results are fetched with FetchSearchTweets, request after
// request, until maxTweetsNbr tweets have been sent or a request returns no
// tweets; the channel is then closed.
//
// A failed request, or a tweet ID that cannot be parsed as a base-10 integer
// (needed to compute the cursor for the next request), is reported as a Result
// with Error set, after which the channel is closed. The channel is
// unbuffered, so the sending goroutine blocks until each Result is received.
func GetSearchTweets(query string, maxTweetsNbr int) <-chan *Result {
	channel := make(chan *Result)
	go func(query string) {
		defer close(channel)
		var maxID string
		tweetsNbr := 0
		for tweetsNbr < maxTweetsNbr {
			tweets, err := FetchSearchTweets(query, maxID)
			if err != nil {
				channel <- &Result{Error: err}
				return
			}
			if len(tweets) == 0 {
				break
			}

			var lastTweetID string
			for _, tweet := range tweets {
				if tweetsNbr >= maxTweetsNbr {
					break
				}
				channel <- &Result{Tweet: *tweet}
				tweetsNbr++
				lastTweetID = tweet.ID
			}
			if tweetsNbr >= maxTweetsNbr {
				return
			}

			// The next request must only return tweets older than the last
			// one sent, so its cursor is that tweet's ID minus one. An ID that
			// does not parse would otherwise silently become "-1" and restart
			// the search from a bogus position.
			lastID, err := strconv.ParseInt(lastTweetID, 10, 64)
			if err != nil {
				channel <- &Result{Error: err}
				return
			}
			maxID = strconv.FormatInt(lastID-1, 10)
		}
	}(query)
	return channel
}
// FetchSearchTweets gets tweets for a given search query, via the Twitter
// frontend API. It performs a single GET request against ajaxSearchURL.
//
// When maxId is non-empty, " max_id:<maxId>" is appended to the query before
// it is sent. The "items_html" field of the JSON response is extracted with
// getHTMLFromJSON and parsed into tweets with readTweetsFromHTML; an error
// from building the request or from either helper is returned unchanged with
// a nil slice.
func FetchSearchTweets(query, maxId string) ([]*Tweet, error) {
	if maxId != "" {
		query = query + " max_id:" + maxId
	}

	// The query is substituted into the URL's query string, so it has to be
	// query-escaped. url.PathEscape leaves '&', '+' and '=' untouched, which
	// splits or alters the search terms once the URL is re-parsed below.
	req, err := http.NewRequest(
		http.MethodGet,
		fmt.Sprintf(ajaxSearchURL, url.QueryEscape(query)),
		nil,
	)
	if err != nil {
		return nil, err
	}

	req.Header.Set("Referer", "https://twitter.com/search/timeline")
	req.Header.Set("Accept", "application/json, text/javascript, */*; q=0.01")
	req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8")
	req.Header.Set("Accept-Language", "en-US")
	req.Header.Set("X-Twitter-Active-User", "yes")
	req.Header.Set("X-Requested-With", "XMLHttpRequest")

	// Add the fixed parameters next to the "q" parameter already in the URL,
	// then re-encode the whole query string.
	q := req.URL.Query()
	q.Add("f", "tweets")
	q.Add("include_available_features", "1")
	q.Add("include_entities", "1")
	q.Add("include_new_items_bar", "true")
	req.URL.RawQuery = q.Encode()

	htm, err := getHTMLFromJSON(req, "items_html")
	if err != nil {
		return nil, err
	}

	tweets, err := readTweetsFromHTML(htm)
	if err != nil {
		return nil, err
	}
	return tweets, nil
}

36
search_test.go Normal file
View file

@ -0,0 +1,36 @@
package twitterscraper
import "testing"
// TestGetSearchTweets runs a search through GetSearchTweets and verifies that
// every tweet received has its main fields populated, and that at least one
// tweet was received. Results carrying an error are reported and not counted.
func TestGetSearchTweets(t *testing.T) {
	received := 0
	for result := range GetSearchTweets("twitter scraper data -filter:retweets", 50) {
		if result.Error != nil {
			t.Error(result.Error)
			continue
		}
		received++

		// Each entry pairs a "field is missing" condition with the message
		// to report when it holds.
		checks := []struct {
			missing bool
			message string
		}{
			{result.HTML == "", "Expected tweet HTML is not empty"},
			{result.ID == "", "Expected tweet ID is not empty"},
			{result.PermanentURL == "", "Expected tweet PermanentURL is not empty"},
			{result.Text == "", "Expected tweet Text is not empty"},
			{result.TimeParsed.IsZero(), "Expected tweet TimeParsed is not zero"},
			{result.Timestamp == 0, "Expected tweet Timestamp is greater than zero"},
		}
		for _, check := range checks {
			if check.missing {
				t.Error(check.message)
			}
		}
	}

	if received == 0 {
		t.Error("Expected tweets count is greater than zero")
	}
}