2018-11-29 17:33:44 +02:00
|
|
|
package twitterscraper
|
|
|
|
|
|
|
|
|
|
import (
|
2020-06-12 21:31:08 +08:00
|
|
|
"context"
|
2018-11-29 17:33:44 +02:00
|
|
|
"fmt"
|
2020-09-19 00:57:03 +03:00
|
|
|
"net/http"
|
2018-11-29 17:33:44 +02:00
|
|
|
"strconv"
|
|
|
|
|
"strings"
|
|
|
|
|
"time"
|
|
|
|
|
|
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// ajaxURL is a format string for a twitter.com profile timeline URL; the
// single %s verb sits in the profile path segment (presumably the user's
// screen name — confirm against the caller).
// NOTE(review): nothing in the code visible here references this constant
// (FetchTweets requests the syndication endpoint instead) — confirm it is
// still used elsewhere before relying on it.
const ajaxURL = "https://twitter.com/i/profiles/show/%s/timeline/tweets"
|
|
|
|
|
|
2020-05-14 21:52:55 +03:00
|
|
|
// Video describes a video attached to a tweet.
type Video struct {
	// ID identifies the video.
	// NOTE(review): presumably derived from the preview image's file name
	// (without extension) — confirm against the code that fills it.
	ID string
	// Preview locates the video's preview image (presumably an absolute
	// URL — confirm).
	Preview string
}
|
|
|
|
|
|
2020-05-14 21:52:55 +03:00
|
|
|
// Tweet type.
|
2018-11-29 17:33:44 +02:00
|
|
|
type Tweet struct {
|
2019-09-13 22:38:05 +03:00
|
|
|
Hashtags []string
|
|
|
|
|
HTML string
|
2018-11-29 17:33:44 +02:00
|
|
|
ID string
|
2020-03-04 01:53:13 +03:00
|
|
|
IsPin bool
|
2019-09-13 22:38:05 +03:00
|
|
|
IsRetweet bool
|
|
|
|
|
Likes int
|
2018-11-29 17:33:44 +02:00
|
|
|
PermanentURL string
|
2019-09-13 22:38:05 +03:00
|
|
|
Photos []string
|
2018-11-29 17:33:44 +02:00
|
|
|
Replies int
|
|
|
|
|
Retweets int
|
2019-09-13 22:38:05 +03:00
|
|
|
Text string
|
|
|
|
|
TimeParsed time.Time
|
|
|
|
|
Timestamp int64
|
2018-11-29 17:33:44 +02:00
|
|
|
URLs []string
|
2020-06-15 16:38:51 +03:00
|
|
|
UserID string
|
|
|
|
|
Username string
|
2018-11-29 17:33:44 +02:00
|
|
|
Videos []Video
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-14 21:52:55 +03:00
|
|
|
// Result of scrapping.
|
2019-09-15 10:56:58 +03:00
|
|
|
type Result struct {
|
|
|
|
|
Tweet
|
|
|
|
|
Error error
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-14 21:52:55 +03:00
|
|
|
// GetTweets returns channel with tweets for a given user.
|
2020-06-15 16:16:08 +03:00
|
|
|
func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result {
|
2020-02-14 16:21:20 +02:00
|
|
|
channel := make(chan *Result)
|
2018-11-29 17:33:44 +02:00
|
|
|
go func(user string) {
|
2019-09-15 10:56:58 +03:00
|
|
|
defer close(channel)
|
2018-11-29 17:33:44 +02:00
|
|
|
var lastTweetID string
|
2020-06-15 16:16:08 +03:00
|
|
|
tweetsNbr := 0
|
|
|
|
|
for tweetsNbr < maxTweetsNbr {
|
2020-06-12 21:31:08 +08:00
|
|
|
select {
|
|
|
|
|
case <-ctx.Done():
|
|
|
|
|
channel <- &Result{Error: ctx.Err()}
|
|
|
|
|
return
|
|
|
|
|
default:
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-19 00:57:03 +03:00
|
|
|
tweets, err := FetchTweets(user, lastTweetID)
|
2018-11-29 17:33:44 +02:00
|
|
|
if err != nil {
|
2019-09-15 10:56:58 +03:00
|
|
|
channel <- &Result{Error: err}
|
|
|
|
|
return
|
2018-11-29 17:33:44 +02:00
|
|
|
}
|
2020-06-15 16:16:08 +03:00
|
|
|
|
|
|
|
|
if len(tweets) == 0 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-29 17:33:44 +02:00
|
|
|
for _, tweet := range tweets {
|
2020-06-12 21:31:08 +08:00
|
|
|
select {
|
|
|
|
|
case <-ctx.Done():
|
|
|
|
|
channel <- &Result{Error: ctx.Err()}
|
|
|
|
|
return
|
|
|
|
|
default:
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-15 16:16:08 +03:00
|
|
|
if tweetsNbr < maxTweetsNbr {
|
2020-08-13 17:41:24 +03:00
|
|
|
lastId, _ := strconv.ParseInt(tweet.ID, 10, 64)
|
|
|
|
|
lastTweetID = strconv.FormatInt(lastId-1, 10)
|
2020-06-15 16:16:08 +03:00
|
|
|
channel <- &Result{Tweet: *tweet}
|
|
|
|
|
}
|
|
|
|
|
tweetsNbr++
|
2018-11-29 17:33:44 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}(user)
|
|
|
|
|
return channel
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-14 21:52:55 +03:00
|
|
|
// FetchTweets gets tweets for a given user, via the Twitter frontend API.
|
2018-11-29 17:33:44 +02:00
|
|
|
func FetchTweets(user string, last string) ([]*Tweet, error) {
|
2020-09-19 00:57:03 +03:00
|
|
|
req, err := http.NewRequest("GET", "https://syndication.twitter.com/timeline/profile", nil)
|
2018-11-29 17:33:44 +02:00
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-19 00:57:03 +03:00
|
|
|
req.Header.Set("Referer", "https://publish.twitter.com/")
|
2018-11-29 17:33:44 +02:00
|
|
|
|
|
|
|
|
q := req.URL.Query()
|
2020-09-19 00:57:03 +03:00
|
|
|
q.Add("screen_name", user)
|
|
|
|
|
q.Add("with_replies", "true")
|
2018-11-29 17:33:44 +02:00
|
|
|
if last != "" {
|
|
|
|
|
q.Add("max_position", last)
|
|
|
|
|
}
|
|
|
|
|
req.URL.RawQuery = q.Encode()
|
|
|
|
|
|
2020-09-19 00:57:03 +03:00
|
|
|
htm, err := getHTMLFromJSON(req, "body")
|
2018-11-29 17:33:44 +02:00
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-13 17:35:44 +02:00
|
|
|
tweets, err := readTweetsFromHTML(htm)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return tweets, nil
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-12 21:31:08 +08:00
|
|
|
func readTweetsFromHTML(htm *strings.Reader) ([]*Tweet, error) {
|
2020-05-13 17:35:44 +02:00
|
|
|
var tweets []*Tweet
|
|
|
|
|
|
2020-02-12 10:45:19 +02:00
|
|
|
doc, err := goquery.NewDocumentFromReader(htm)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
2018-11-29 17:33:44 +02:00
|
|
|
|
2020-09-19 00:57:03 +03:00
|
|
|
doc.Find(".timeline-Tweet").Each(func(i int, s *goquery.Selection) {
|
2020-02-12 10:45:19 +02:00
|
|
|
var tweet Tweet
|
2020-09-19 00:57:03 +03:00
|
|
|
timeStr, ok := s.Find(".timeline-Tweet-metadata > a > time").Attr("datetime")
|
2020-02-12 10:45:19 +02:00
|
|
|
if ok {
|
2020-09-19 00:57:03 +03:00
|
|
|
tweet.TimeParsed, _ = time.Parse("2006-01-02T15:04:05-0700", timeStr)
|
|
|
|
|
tweet.Timestamp = tweet.TimeParsed.Unix()
|
|
|
|
|
tweet.ID = s.AttrOr("data-tweet-id", "")
|
|
|
|
|
//tweet.UserID = s.Find(".tweet").AttrOr("data-user-id", "")
|
|
|
|
|
tweet.Username = strings.TrimPrefix(s.Find(".TweetAuthor-screenName").AttrOr("title", ""), "@")
|
2020-06-15 16:38:51 +03:00
|
|
|
tweet.PermanentURL = fmt.Sprintf("https://twitter.com/%s/status/%s", tweet.Username, tweet.ID)
|
2020-09-19 00:57:03 +03:00
|
|
|
tweet.Text = s.Find(".timeline-Tweet-text").Text()
|
|
|
|
|
tweet.HTML, _ = s.Find(".timeline-Tweet-text").Html()
|
|
|
|
|
s.Find(".timeline-Tweet-retweetCredit").Each(func(i int, c *goquery.Selection) {
|
2020-02-12 10:45:19 +02:00
|
|
|
tweet.IsRetweet = true
|
|
|
|
|
})
|
2020-09-19 00:57:03 +03:00
|
|
|
// s.Find("span.js-pinned-text").Each(func(i int, c *goquery.Selection) {
|
|
|
|
|
// tweet.IsPin = true
|
|
|
|
|
// })
|
|
|
|
|
// s.Find(".ProfileTweet-actionCount").Each(func(i int, c *goquery.Selection) {
|
|
|
|
|
// txt := strings.TrimSpace(c.Text())
|
|
|
|
|
// switch {
|
|
|
|
|
// case strings.HasSuffix(txt, "likes"):
|
|
|
|
|
// l := strings.Split(txt, " ")
|
|
|
|
|
// tweet.Likes, _ = strconv.Atoi(l[0])
|
|
|
|
|
// case strings.HasSuffix(txt, "replies"):
|
|
|
|
|
// l := strings.Split(txt, " ")
|
|
|
|
|
// tweet.Replies, _ = strconv.Atoi(l[0])
|
|
|
|
|
// case strings.HasSuffix(txt, "retweets"):
|
|
|
|
|
// l := strings.Split(txt, " ")
|
|
|
|
|
// tweet.Retweets, _ = strconv.Atoi(l[0])
|
|
|
|
|
// }
|
|
|
|
|
// })
|
|
|
|
|
s.Find(".hashtag > span.PrettyLink-value").Each(func(i int, h *goquery.Selection) {
|
2020-02-12 10:45:19 +02:00
|
|
|
tweet.Hashtags = append(tweet.Hashtags, h.Text())
|
|
|
|
|
})
|
|
|
|
|
s.Find("a.twitter-timeline-link:not(.u-hidden)").Each(func(i int, u *goquery.Selection) {
|
|
|
|
|
if link, ok := u.Attr("data-expanded-url"); ok {
|
|
|
|
|
tweet.URLs = append(tweet.URLs, link)
|
|
|
|
|
}
|
|
|
|
|
})
|
2020-09-19 00:57:03 +03:00
|
|
|
s.Find(".NaturalImage-image").Each(func(i int, p *goquery.Selection) {
|
|
|
|
|
if link, ok := p.Attr("data-image"); ok {
|
2020-02-12 10:45:19 +02:00
|
|
|
tweet.Photos = append(tweet.Photos, link)
|
|
|
|
|
}
|
|
|
|
|
})
|
2020-09-19 00:57:03 +03:00
|
|
|
// s.Find(".PlayableMedia-player").Each(func(i int, v *goquery.Selection) {
|
|
|
|
|
// if style, ok := v.Attr("style"); ok {
|
|
|
|
|
// if strings.Contains(style, "background") {
|
|
|
|
|
// match := regexp.MustCompile(`https:\/\/.+\/([\w-]+)\.(?:jpg|png)`).FindStringSubmatch(style)
|
|
|
|
|
// if len(match) == 2 {
|
|
|
|
|
// tweet.Videos = append(tweet.Videos, Video{ID: match[1], Preview: match[0]})
|
|
|
|
|
// }
|
|
|
|
|
// }
|
|
|
|
|
// }
|
|
|
|
|
// })
|
2020-02-12 10:45:19 +02:00
|
|
|
tweets = append(tweets, &tweet)
|
|
|
|
|
}
|
|
|
|
|
})
|
2018-11-29 17:33:44 +02:00
|
|
|
|
|
|
|
|
return tweets, nil
|
|
|
|
|
}
|