From 31c9e5da5a159e33701d3b58b8d17975a80cf426 Mon Sep 17 00:00:00 2001 From: "one.cat" Date: Wed, 8 Sep 2021 16:02:53 +0800 Subject: [PATCH 1/4] add cookie and x-csrf-token --- api.go | 6 ++++++ go.mod | 5 ++++- go.sum | 7 +++++++ scraper.go | 39 +++++++++++++++++++++++++++++++++++++++ tweets.go | 2 ++ 5 files changed, 58 insertions(+), 1 deletion(-) diff --git a/api.go b/api.go index 590dd64..79cfc33 100644 --- a/api.go +++ b/api.go @@ -33,6 +33,12 @@ func (s *Scraper) RequestAPI(req *http.Request, target interface{}) error { req.Header.Set("Authorization", "Bearer "+bearerToken) req.Header.Set("X-Guest-Token", s.guestToken) + // use cookie + if len(s.Cookie) > 0 && len(s.XCsrfToken) > 0 { + req.Header.Set("Cookie", s.Cookie) + req.Header.Set("x-csrf-token", s.XCsrfToken) + } + resp, err := s.client.Do(req) if err != nil { return err diff --git a/go.mod b/go.mod index 0bd43be..72cb1e8 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,7 @@ module github.com/n0madic/twitter-scraper go 1.13 -require github.com/google/go-cmp v0.5.4 +require ( + github.com/google/go-cmp v0.5.4 + golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d +) diff --git a/go.sum b/go.sum index 1ffcbdb..534c5d3 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,11 @@ github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d h1:20cMwl2fHAzkJMEA+8J4JgqBQcQGzbisXo31MIeenXI= +golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.6/go.mod 
h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/scraper.go b/scraper.go index 5a7c8b3..548e270 100644 --- a/scraper.go +++ b/scraper.go @@ -3,6 +3,9 @@ package twitterscraper import ( "crypto/tls" "errors" + "fmt" + "golang.org/x/net/proxy" + "log" "net" "net/http" "net/url" @@ -20,6 +23,9 @@ type Scraper struct { includeReplies bool searchMode SearchMode wg sync.WaitGroup + + Cookie string + XCsrfToken string } // SearchMode type @@ -80,6 +86,18 @@ func WithReplies(b bool) *Scraper { return defaultScraper.WithReplies(b) } +// cookie +func (s *Scraper) WithCookie(cookie string) *Scraper { + s.Cookie = cookie + return s +} + +// x csrf token +func (s *Scraper) WithXCsrfToken(xcsrfToken string) *Scraper { + s.XCsrfToken = xcsrfToken + return s +} + // SetProxy set http proxy in the format `http://HOST:PORT` func (s *Scraper) SetProxy(proxy string) error { if !strings.HasPrefix(proxy, "http") { @@ -101,6 +119,27 @@ func (s *Scraper) SetProxy(proxy string) error { return nil } +// SetProxy set socks5 proxy in the format `HOST:PORT` +func (s *Scraper) SetSocks5Proxy(socks5 string) error { + log.Println(socks5) + if dialer, err := proxy.SOCKS5("tcp", socks5, nil, proxy.Direct); err != nil { + return errors.New(fmt.Sprintf("can't connect to the socks5 proxy: %s, err: %s", socks5, err.Error())) + } else { + s.client = &http.Client{ + Transport: &http.Transport{ + Dial: dialer.Dial, + // TLSNextProto: make(map[string]func(authority string, c *tls.Conn) http.RoundTripper), + DialContext: (&net.Dialer{ + Timeout: 10 * time.Second, + }).DialContext, + }, + } + + log.Println(s.client) + } + return nil +} + // SetProxy wrapper for default Scraper 
func SetProxy(proxy string) error { return defaultScraper.SetProxy(proxy) diff --git a/tweets.go b/tweets.go index 4bf6d93..1b017e9 100644 --- a/tweets.go +++ b/tweets.go @@ -3,6 +3,7 @@ package twitterscraper import ( "context" "fmt" + "log" "strconv" ) @@ -41,6 +42,7 @@ func (s *Scraper) FetchTweets(user string, maxTweetsNbr int, cursor string) ([]* req.URL.RawQuery = q.Encode() var timeline timeline + log.Println(req, timeline, "getuser tww") err = s.RequestAPI(req, &timeline) if err != nil { return nil, "", err From dea52191dd70b92d386cb32b00e6c2940b64458f Mon Sep 17 00:00:00 2001 From: vanilla Date: Wed, 8 Sep 2021 16:30:37 +0800 Subject: [PATCH 2/4] fix socket5 --- scraper.go | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/scraper.go b/scraper.go index 548e270..377541d 100644 --- a/scraper.go +++ b/scraper.go @@ -3,9 +3,7 @@ package twitterscraper import ( "crypto/tls" "errors" - "fmt" "golang.org/x/net/proxy" - "log" "net" "net/http" "net/url" @@ -121,21 +119,31 @@ func (s *Scraper) SetProxy(proxy string) error { // SetProxy set socks5 proxy in the format `HOST:PORT` func (s *Scraper) SetSocks5Proxy(socks5 string) error { - log.Println(socks5) - if dialer, err := proxy.SOCKS5("tcp", socks5, nil, proxy.Direct); err != nil { - return errors.New(fmt.Sprintf("can't connect to the socks5 proxy: %s, err: %s", socks5, err.Error())) + baseDialer := &net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 30 * time.Second, + } + if socks5 != "" { + dialSocksProxy, err := proxy.SOCKS5("tcp", socks5, nil, baseDialer) + if err != nil { + return errors.New("Error creating SOCKS5 proxy") + } + if contextDialer, ok := dialSocksProxy.(proxy.ContextDialer); ok { + dialContext := contextDialer.DialContext + s.client = &http.Client{ + Transport: &http.Transport{ + DialContext: dialContext, + }, + } + } else { + return errors.New("Failed type assertion to DialContext") + } } else { s.client = &http.Client{ Transport: 
&http.Transport{ - Dial: dialer.Dial, - // TLSNextProto: make(map[string]func(authority string, c *tls.Conn) http.RoundTripper), - DialContext: (&net.Dialer{ - Timeout: 10 * time.Second, - }).DialContext, + DialContext: (baseDialer).DialContext, }, } - - log.Println(s.client) } return nil } From d3057f34fbfe805c878efe2980ede3e8294d53f1 Mon Sep 17 00:00:00 2001 From: vanilla Date: Wed, 8 Sep 2021 16:33:53 +0800 Subject: [PATCH 3/4] remove log --- tweets.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/tweets.go b/tweets.go index 1b017e9..4bf6d93 100644 --- a/tweets.go +++ b/tweets.go @@ -3,7 +3,6 @@ package twitterscraper import ( "context" "fmt" - "log" "strconv" ) @@ -42,7 +41,6 @@ func (s *Scraper) FetchTweets(user string, maxTweetsNbr int, cursor string) ([]* req.URL.RawQuery = q.Encode() var timeline timeline - log.Println(req, timeline, "getuser tww") err = s.RequestAPI(req, &timeline) if err != nil { return nil, "", err From 7e61608f798624527d05bb2ca3fb1c4cf8fc45e5 Mon Sep 17 00:00:00 2001 From: vanilla Date: Thu, 9 Sep 2021 11:15:53 +0800 Subject: [PATCH 4/4] make a couple of changes pr:43 --- README.md | 21 ++++++++++++- api.go | 6 ++-- scraper.go | 86 ++++++++++++++++++++++++++++-------------------------- 3 files changed, 68 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index b4b0b9b..5d4e389 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,12 @@ import ( func main() { scraper := twitterscraper.New() + + // Cookie and xCsrfToken are optional + // Some users' tweets are protected, so you must log in and follow them first + scraper.WithCookie("twitter cookie after login") + scraper.WithXCsrfToken("twitter X-Csrf-Token after login") + for tweet := range scraper.GetTweets(context.Background(), "Twitter", 50) { if tweet.Error != nil { panic(tweet.Error) @@ -167,7 +173,11 @@ func main() { } ``` -### Use http proxy +### Use Proxy + +Both http and socks5 proxies are supported + +#### with http ```golang err := scraper.SetProxy("http://localhost:3128") 
@@ -176,6 +186,15 @@ if err != nil { } ``` +#### with socks5 + +```golang +err := scraper.SetProxy("socks5://localhost:3128") +if err != nil { + panic(err) +} +``` + ### Delay requests Add delay between API requests (in seconds) diff --git a/api.go b/api.go index 79cfc33..2f4d657 100644 --- a/api.go +++ b/api.go @@ -34,9 +34,9 @@ func (s *Scraper) RequestAPI(req *http.Request, target interface{}) error { req.Header.Set("X-Guest-Token", s.guestToken) // use cookie - if len(s.Cookie) > 0 && len(s.XCsrfToken) > 0 { - req.Header.Set("Cookie", s.Cookie) - req.Header.Set("x-csrf-token", s.XCsrfToken) + if len(s.cookie) > 0 && len(s.xCsrfToken) > 0 { + req.Header.Set("Cookie", s.cookie) + req.Header.Set("x-csrf-token", s.xCsrfToken) } resp, err := s.client.Do(req) diff --git a/scraper.go b/scraper.go index 377541d..a5a5d4a 100644 --- a/scraper.go +++ b/scraper.go @@ -15,6 +15,7 @@ import ( // Scraper object type Scraper struct { client *http.Client + clientTimeout time.Duration delay int64 guestToken string guestCreatedAt time.Time @@ -22,8 +23,8 @@ type Scraper struct { searchMode SearchMode wg sync.WaitGroup - Cookie string - XCsrfToken string + cookie string + xCsrfToken string } // SearchMode type @@ -42,12 +43,16 @@ const ( SearchUsers ) +// default http client timeout +const DefaultClientTimeout = 10 * time.Second + var defaultScraper *Scraper // New creates a Scraper object func New() *Scraper { return &Scraper{ - client: &http.Client{Timeout: 10 * time.Second}, + client: &http.Client{Timeout: DefaultClientTimeout}, + clientTimeout: DefaultClientTimeout, } } @@ -86,47 +91,51 @@ func WithReplies(b bool) *Scraper { // cookie func (s *Scraper) WithCookie(cookie string) *Scraper { - s.Cookie = cookie + s.cookie = cookie return s } // x csrf token func (s *Scraper) WithXCsrfToken(xcsrfToken string) *Scraper { - s.XCsrfToken = xcsrfToken + s.xCsrfToken = xcsrfToken return s } -// SetProxy set http proxy in the format `http://HOST:PORT` -func (s *Scraper) SetProxy(proxy 
string) error { - if !strings.HasPrefix(proxy, "http") { - return errors.New("only support http(s) protocol") - } - urlproxy, err := url.Parse(proxy) - if err != nil { - return err - } - s.client = &http.Client{ - Transport: &http.Transport{ - Proxy: http.ProxyURL(urlproxy), - TLSNextProto: make(map[string]func(authority string, c *tls.Conn) http.RoundTripper), - DialContext: (&net.Dialer{ - Timeout: 10 * time.Second, - }).DialContext, - }, - } - return nil +// WithClientTimeout sets the dial timeout used by clients that SetProxy creates +func (s *Scraper) WithClientTimeout(timeout time.Duration) *Scraper { + s.clientTimeout = timeout + return s } -// SetProxy set socks5 proxy in the format `HOST:PORT` -func (s *Scraper) SetSocks5Proxy(socks5 string) error { - baseDialer := &net.Dialer{ - Timeout: 30 * time.Second, - KeepAlive: 30 * time.Second, - } - if socks5 != "" { - dialSocksProxy, err := proxy.SOCKS5("tcp", socks5, nil, baseDialer) +// SetProxy replaces the scraper's http client with one that uses the given proxy. +// An http proxy is given in the format `http://HOST:PORT`. +// A socks5 proxy is given in the format `socks5://HOST:PORT`. +func (s *Scraper) SetProxy(proxyAddr string) error { + if strings.HasPrefix(proxyAddr, "http") { + urlproxy, err := url.Parse(proxyAddr) if err != nil { - return errors.New("Error creating SOCKS5 proxy") + return err + } + s.client = &http.Client{ + Transport: &http.Transport{ + Proxy: http.ProxyURL(urlproxy), + TLSNextProto: make(map[string]func(authority string, c *tls.Conn) http.RoundTripper), + DialContext: (&net.Dialer{ + Timeout: s.clientTimeout, + }).DialContext, + }, + } + return nil + } + if strings.HasPrefix(proxyAddr, "socks5") { + baseDialer := &net.Dialer{ + Timeout: s.clientTimeout, + KeepAlive: s.clientTimeout, + } + socksHostPort := strings.ReplaceAll(proxyAddr, "socks5://", "") + dialSocksProxy, err := proxy.SOCKS5("tcp", socksHostPort, nil, baseDialer) + if err != nil { + return errors.New("error creating socks5 proxy :" + err.Error()) } if contextDialer, ok := dialSocksProxy.(proxy.ContextDialer); ok { dialContext := contextDialer.DialContext @@ -136,16 
+145,11 @@ func (s *Scraper) SetSocks5Proxy(socks5 string) error { }, } } else { - return errors.New("Failed type assertion to DialContext") - } - } else { - s.client = &http.Client{ - Transport: &http.Transport{ - DialContext: (baseDialer).DialContext, - }, + return errors.New("failed type assertion to DialContext") } + return nil } - return nil + return errors.New("only support http(s) or socks5 protocol") } // SetProxy wrapper for default Scraper