twitter-scrapper/scraper.go

135 lines
3 KiB
Go
Raw Normal View History

2020-12-12 23:33:57 +02:00
package twitterscraper
import (
2020-12-13 00:04:34 +02:00
"crypto/tls"
2020-12-12 23:33:57 +02:00
"errors"
"net"
"net/http"
"net/http/cookiejar"
2020-12-12 23:33:57 +02:00
"net/url"
"strings"
"sync"
2020-12-12 23:33:57 +02:00
"time"
2021-12-07 10:18:01 +02:00
"golang.org/x/net/proxy"
2020-12-12 23:33:57 +02:00
)
// Scraper object
type Scraper struct {
2022-04-18 12:42:39 +03:00
bearerToken string
2020-12-12 23:33:57 +02:00
client *http.Client
delay int64
2020-12-12 23:33:57 +02:00
guestToken string
2021-01-05 14:21:08 +02:00
guestCreatedAt time.Time
2020-12-12 23:33:57 +02:00
includeReplies bool
isLogged bool
2020-12-23 19:53:48 +02:00
searchMode SearchMode
wg sync.WaitGroup
2020-12-12 23:33:57 +02:00
}
2020-12-23 19:53:48 +02:00
// SearchMode selects which kind of search results are requested.
type SearchMode int

// Available search modes. SearchTop is the zero value of SearchMode and is
// therefore the mode of a Scraper that never had SetSearchMode called on it.
const (
	// SearchTop is the default mode.
	SearchTop SearchMode = iota
	// SearchLatest is the live mode.
	SearchLatest
	// SearchPhotos is the image mode.
	SearchPhotos
	// SearchVideos is the video mode.
	SearchVideos
	// SearchUsers is the user mode.
	SearchUsers
)
2021-09-09 11:15:53 +08:00
// DefaultClientTimeout is the timeout New assigns to the Scraper's HTTP client.
const DefaultClientTimeout = 10 * time.Second
2020-12-12 23:33:57 +02:00
// New creates a Scraper object
2020-12-12 23:45:14 +02:00
func New() *Scraper {
jar, _ := cookiejar.New(nil)
2020-12-12 23:45:14 +02:00
return &Scraper{
2022-04-18 12:42:39 +03:00
bearerToken: bearerToken,
client: &http.Client{
Jar: jar,
Timeout: DefaultClientTimeout,
},
2020-12-12 23:33:57 +02:00
}
}
2022-04-18 12:42:39 +03:00
// setBearerToken replaces the bearer token and clears the stored guest token,
// so IsGuestToken reports false afterwards.
func (s *Scraper) setBearerToken(token string) {
	s.bearerToken, s.guestToken = token, ""
}
2021-12-07 10:18:01 +02:00
// IsGuestToken reports whether the scraper currently holds a non-empty guest token.
func (s *Scraper) IsGuestToken() bool {
	return len(s.guestToken) > 0
}
2020-12-23 19:53:48 +02:00
// SetSearchMode stores the search mode to use and returns the same Scraper so
// that calls can be chained.
func (s *Scraper) SetSearchMode(mode SearchMode) *Scraper {
	s.searchMode = mode
	return s
}
// WithDelay sets the delay between API requests, expressed in seconds, and
// returns the same Scraper so that calls can be chained.
func (s *Scraper) WithDelay(seconds int64) *Scraper {
	s.delay = seconds
	return s
}
2020-12-12 23:33:57 +02:00
// WithReplies enables (true) or disables (false) loading timelines together
// with tweet replies, and returns the same Scraper so that calls can be chained.
func (s *Scraper) WithReplies(b bool) *Scraper {
	s.includeReplies = b
	return s
}
2021-09-09 11:15:53 +08:00
// client timeout
func (s *Scraper) WithClientTimeout(timeout time.Duration) *Scraper {
2022-01-11 14:47:17 +02:00
s.client.Timeout = timeout
2021-09-09 11:15:53 +08:00
return s
2020-12-12 23:33:57 +02:00
}
2021-09-09 11:15:53 +08:00
// SetProxy
// set http proxy in the format `http://HOST:PORT`
// set socket proxy in the format `socks5://HOST:PORT`
func (s *Scraper) SetProxy(proxyAddr string) error {
if strings.HasPrefix(proxyAddr, "http") {
urlproxy, err := url.Parse(proxyAddr)
if err != nil {
return err
}
2023-05-10 08:50:24 +03:00
s.client.Transport = &http.Transport{
Proxy: http.ProxyURL(urlproxy),
TLSNextProto: make(map[string]func(authority string, c *tls.Conn) http.RoundTripper),
DialContext: (&net.Dialer{
Timeout: s.client.Timeout,
}).DialContext,
2021-09-09 11:15:53 +08:00
}
return nil
2021-09-08 16:30:37 +08:00
}
2021-09-09 11:15:53 +08:00
if strings.HasPrefix(proxyAddr, "socks5") {
baseDialer := &net.Dialer{
2022-01-11 14:47:17 +02:00
Timeout: s.client.Timeout,
KeepAlive: s.client.Timeout,
2021-09-09 11:15:53 +08:00
}
socksHostPort := strings.ReplaceAll(proxyAddr, "socks5://", "")
dialSocksProxy, err := proxy.SOCKS5("tcp", socksHostPort, nil, baseDialer)
2021-09-08 16:30:37 +08:00
if err != nil {
2021-09-09 11:15:53 +08:00
return errors.New("error creating socks5 proxy :" + err.Error())
2021-09-08 16:30:37 +08:00
}
if contextDialer, ok := dialSocksProxy.(proxy.ContextDialer); ok {
dialContext := contextDialer.DialContext
2023-05-10 08:50:24 +03:00
s.client.Transport = &http.Transport{
DialContext: dialContext,
2021-09-08 16:30:37 +08:00
}
} else {
2021-09-09 11:15:53 +08:00
return errors.New("failed type assertion to DialContext")
2021-09-08 16:02:53 +08:00
}
2021-09-09 11:15:53 +08:00
return nil
2021-09-08 16:02:53 +08:00
}
2021-09-09 11:15:53 +08:00
return errors.New("only support http(s) or socks5 protocol")
2021-09-08 16:02:53 +08:00
}