twitter-scrapper/tweets.go

193 lines
4.9 KiB
Go
Raw Normal View History

2018-11-29 17:33:44 +02:00
package twitterscraper
import (
"context"
2018-11-29 17:33:44 +02:00
"fmt"
2020-09-19 00:57:03 +03:00
"net/http"
2018-11-29 17:33:44 +02:00
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
// ajaxURL is a format string for a profile timeline URL; its single %s verb
// presumably takes the profile's screen name.
// NOTE(review): no code visible in this part of the file references it —
// confirm whether it is still needed.
const ajaxURL = "https://twitter.com/i/profiles/show/%s/timeline/tweets"
// Video describes a video attached to a tweet.
2018-11-29 17:33:44 +02:00
type Video struct {
// ID identifies the video.
// NOTE(review): presumably derived from the media file name — confirm
// against the code that fills it.
ID string
// Preview presumably holds the URL of the video's preview image — TODO confirm.
Preview string
}
// Tweet type.
2018-11-29 17:33:44 +02:00
type Tweet struct {
2019-09-13 22:38:05 +03:00
Hashtags []string
HTML string
2018-11-29 17:33:44 +02:00
ID string
2020-03-04 01:53:13 +03:00
IsPin bool
2019-09-13 22:38:05 +03:00
IsRetweet bool
Likes int
2018-11-29 17:33:44 +02:00
PermanentURL string
2019-09-13 22:38:05 +03:00
Photos []string
2018-11-29 17:33:44 +02:00
Replies int
Retweets int
2019-09-13 22:38:05 +03:00
Text string
TimeParsed time.Time
Timestamp int64
2018-11-29 17:33:44 +02:00
URLs []string
2020-06-15 16:38:51 +03:00
UserID string
Username string
2018-11-29 17:33:44 +02:00
Videos []Video
}
// Result is a single item delivered by GetTweets: either a scraped Tweet or
// the Error that ended the scrape.
2019-09-15 10:56:58 +03:00
type Result struct {
// Tweet is embedded, so its fields are promoted onto Result. GetTweets
// fills it for every successfully scraped tweet.
Tweet
// Error is non-nil when scraping stopped early; GetTweets sets it to the
// fetch error or to the context's error and then closes the channel.
Error error
}
// GetTweets returns channel with tweets for a given user.
func GetTweets(ctx context.Context, user string, maxTweetsNbr int) <-chan *Result {
2020-02-14 16:21:20 +02:00
channel := make(chan *Result)
2018-11-29 17:33:44 +02:00
go func(user string) {
2019-09-15 10:56:58 +03:00
defer close(channel)
2018-11-29 17:33:44 +02:00
var lastTweetID string
tweetsNbr := 0
for tweetsNbr < maxTweetsNbr {
select {
case <-ctx.Done():
channel <- &Result{Error: ctx.Err()}
return
default:
}
2020-09-19 00:57:03 +03:00
tweets, err := FetchTweets(user, lastTweetID)
2018-11-29 17:33:44 +02:00
if err != nil {
2019-09-15 10:56:58 +03:00
channel <- &Result{Error: err}
return
2018-11-29 17:33:44 +02:00
}
if len(tweets) == 0 {
break
}
2018-11-29 17:33:44 +02:00
for _, tweet := range tweets {
select {
case <-ctx.Done():
channel <- &Result{Error: ctx.Err()}
return
default:
}
if tweetsNbr < maxTweetsNbr {
2020-08-13 17:41:24 +03:00
lastId, _ := strconv.ParseInt(tweet.ID, 10, 64)
lastTweetID = strconv.FormatInt(lastId-1, 10)
channel <- &Result{Tweet: *tweet}
}
tweetsNbr++
2018-11-29 17:33:44 +02:00
}
}
}(user)
return channel
}
// FetchTweets gets tweets for a given user, via the Twitter frontend API.
2018-11-29 17:33:44 +02:00
func FetchTweets(user string, last string) ([]*Tweet, error) {
2020-09-19 00:57:03 +03:00
req, err := http.NewRequest("GET", "https://syndication.twitter.com/timeline/profile", nil)
2018-11-29 17:33:44 +02:00
if err != nil {
return nil, err
}
2020-09-19 00:57:03 +03:00
req.Header.Set("Referer", "https://publish.twitter.com/")
2018-11-29 17:33:44 +02:00
q := req.URL.Query()
2020-09-19 00:57:03 +03:00
q.Add("screen_name", user)
q.Add("with_replies", "true")
2018-11-29 17:33:44 +02:00
if last != "" {
q.Add("max_position", last)
}
req.URL.RawQuery = q.Encode()
2020-09-19 00:57:03 +03:00
htm, err := getHTMLFromJSON(req, "body")
2018-11-29 17:33:44 +02:00
if err != nil {
return nil, err
}
tweets, err := readTweetsFromHTML(htm)
if err != nil {
return nil, err
}
return tweets, nil
}
func readTweetsFromHTML(htm *strings.Reader) ([]*Tweet, error) {
var tweets []*Tweet
2020-02-12 10:45:19 +02:00
doc, err := goquery.NewDocumentFromReader(htm)
if err != nil {
return nil, err
}
2018-11-29 17:33:44 +02:00
2020-09-19 00:57:03 +03:00
doc.Find(".timeline-Tweet").Each(func(i int, s *goquery.Selection) {
2020-02-12 10:45:19 +02:00
var tweet Tweet
2020-09-19 00:57:03 +03:00
timeStr, ok := s.Find(".timeline-Tweet-metadata > a > time").Attr("datetime")
2020-02-12 10:45:19 +02:00
if ok {
2020-09-19 00:57:03 +03:00
tweet.TimeParsed, _ = time.Parse("2006-01-02T15:04:05-0700", timeStr)
tweet.Timestamp = tweet.TimeParsed.Unix()
tweet.ID = s.AttrOr("data-tweet-id", "")
//tweet.UserID = s.Find(".tweet").AttrOr("data-user-id", "")
tweet.Username = strings.TrimPrefix(s.Find(".TweetAuthor-screenName").AttrOr("title", ""), "@")
2020-06-15 16:38:51 +03:00
tweet.PermanentURL = fmt.Sprintf("https://twitter.com/%s/status/%s", tweet.Username, tweet.ID)
2020-09-19 00:57:03 +03:00
tweet.Text = s.Find(".timeline-Tweet-text").Text()
tweet.HTML, _ = s.Find(".timeline-Tweet-text").Html()
s.Find(".timeline-Tweet-retweetCredit").Each(func(i int, c *goquery.Selection) {
2020-02-12 10:45:19 +02:00
tweet.IsRetweet = true
})
2020-09-19 00:57:03 +03:00
// s.Find("span.js-pinned-text").Each(func(i int, c *goquery.Selection) {
// tweet.IsPin = true
// })
// s.Find(".ProfileTweet-actionCount").Each(func(i int, c *goquery.Selection) {
// txt := strings.TrimSpace(c.Text())
// switch {
// case strings.HasSuffix(txt, "likes"):
// l := strings.Split(txt, " ")
// tweet.Likes, _ = strconv.Atoi(l[0])
// case strings.HasSuffix(txt, "replies"):
// l := strings.Split(txt, " ")
// tweet.Replies, _ = strconv.Atoi(l[0])
// case strings.HasSuffix(txt, "retweets"):
// l := strings.Split(txt, " ")
// tweet.Retweets, _ = strconv.Atoi(l[0])
// }
// })
s.Find(".hashtag > span.PrettyLink-value").Each(func(i int, h *goquery.Selection) {
2020-02-12 10:45:19 +02:00
tweet.Hashtags = append(tweet.Hashtags, h.Text())
})
s.Find("a.twitter-timeline-link:not(.u-hidden)").Each(func(i int, u *goquery.Selection) {
if link, ok := u.Attr("data-expanded-url"); ok {
tweet.URLs = append(tweet.URLs, link)
}
})
2020-09-19 00:57:03 +03:00
s.Find(".NaturalImage-image").Each(func(i int, p *goquery.Selection) {
if link, ok := p.Attr("data-image"); ok {
2020-02-12 10:45:19 +02:00
tweet.Photos = append(tweet.Photos, link)
}
})
2020-09-19 00:57:03 +03:00
// s.Find(".PlayableMedia-player").Each(func(i int, v *goquery.Selection) {
// if style, ok := v.Attr("style"); ok {
// if strings.Contains(style, "background") {
// match := regexp.MustCompile(`https:\/\/.+\/([\w-]+)\.(?:jpg|png)`).FindStringSubmatch(style)
// if len(match) == 2 {
// tweet.Videos = append(tweet.Videos, Video{ID: match[1], Preview: match[0]})
// }
// }
// }
// })
2020-02-12 10:45:19 +02:00
tweets = append(tweets, &tweet)
}
})
2018-11-29 17:33:44 +02:00
return tweets, nil
}