// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package colly implements an HTTP scraping framework.
package colly

import (
	"bytes"
	"context"
	"crypto/rand"
	"encoding/json"
	"errors"
	"fmt"
	"hash/fnv"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/antchfx/htmlquery"
	"github.com/antchfx/xmlquery"
	"github.com/gocolly/colly/v2/debug"
	"github.com/gocolly/colly/v2/storage"
	"github.com/kennygrant/sanitize"
	"github.com/temoto/robotstxt"
	"google.golang.org/appengine/urlfetch"
)

// A CollectorOption sets an option on a Collector.
type CollectorOption func(*Collector)

// Collector provides the scraper instance for a scraping job
type Collector struct {
	// UserAgent is the User-Agent string used by HTTP requests
	UserAgent string
	// MaxDepth limits the recursion depth of visited URLs.
	// Set it to 0 for infinite recursion (default).
	MaxDepth int
	// AllowedDomains is a domain whitelist.
	// Leave it blank to allow any domains to be visited
	AllowedDomains []string
	// DisallowedDomains is a domain blacklist.
	DisallowedDomains []string
	// DisallowedURLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches a URL, the
	// request will be stopped. DisallowedURLFilters will
	// be evaluated before URLFilters.
	// Leave it blank to allow any URLs to be visited.
	DisallowedURLFilters []*regexp.Regexp
	// URLFilters is a list of regular expressions which restricts
	// visiting URLs. If none of the rules matches a URL, the
	// request will be stopped. DisallowedURLFilters will
	// be evaluated before URLFilters.
	// Leave it blank to allow any URLs to be visited.
	URLFilters []*regexp.Regexp

	// AllowURLRevisit allows multiple downloads of the same URL
	AllowURLRevisit bool
	// MaxBodySize is the limit of the retrieved response body in bytes.
	// 0 means unlimited.
	// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
	MaxBodySize int
	// CacheDir specifies a location where GET requests are cached as files.
	// When it's not defined, caching is disabled.
	CacheDir string
	// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
	// the target host's robots.txt file.  See http://www.robotstxt.org/ for more
	// information.
	IgnoreRobotsTxt bool
	// Async turns on asynchronous network communication. Use Collector.Wait() to
	// be sure all requests have been finished.
	Async bool
	// ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
	// By default, Colly parses only successful HTTP responses. Set ParseHTTPErrorResponse
	// to true to enable it.
	ParseHTTPErrorResponse bool
	// ID is the unique identifier of a collector
	ID uint32
	// DetectCharset can enable character encoding detection for non-utf8 response bodies
	// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
	DetectCharset bool
	// RedirectHandler allows control on how a redirect will be managed
	// use c.SetRedirectHandler to set this value
	redirectHandler func(req *http.Request, via []*http.Request) error
	// CheckHead performs a HEAD request before every GET to pre-validate the response
	CheckHead bool
	// TraceHTTP enables capturing and reporting request performance for crawler tuning.
	// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
	TraceHTTP                bool
	store                    storage.Storage
	debugger                 debug.Debugger
	robotsMap                map[string]*robotstxt.RobotsData
	htmlCallbacks            []*htmlCallbackContainer
	xmlCallbacks             []*xmlCallbackContainer
	requestCallbacks         []RequestCallback
	responseCallbacks        []ResponseCallback
	responseHeadersCallbacks []ResponseHeadersCallback
	errorCallbacks           []ErrorCallback
	scrapedCallbacks         []ScrapedCallback
	requestCount             uint32
	responseCount            uint32
	backend                  *httpBackend
	wg                       *sync.WaitGroup
	lock                     *sync.RWMutex
}

// RequestCallback is a type alias for OnRequest callback functions
type RequestCallback func(*Request)

// ResponseHeadersCallback is a type alias for OnResponseHeaders callback functions
type ResponseHeadersCallback func(*Response)

// ResponseCallback is a type alias for OnResponse callback functions
type ResponseCallback func(*Response)

// HTMLCallback is a type alias for OnHTML callback functions
type HTMLCallback func(*HTMLElement)

// XMLCallback is a type alias for OnXML callback functions
type XMLCallback func(*XMLElement)

// ErrorCallback is a type alias for OnError callback functions
type ErrorCallback func(*Response, error)

// ScrapedCallback is a type alias for OnScraped callback functions
type ScrapedCallback func(*Response)

// ProxyFunc is a type alias for proxy setter functions.
type ProxyFunc func(*http.Request) (*url.URL, error)

type htmlCallbackContainer struct {
	Selector string
	Function HTMLCallback
}

type xmlCallbackContainer struct {
	Query    string
	Function XMLCallback
}

type cookieJarSerializer struct {
	store storage.Storage
	lock  *sync.RWMutex
}

var collectorCounter uint32

// The key type is unexported to prevent collisions with context keys defined in
// other packages.
type key int

// ProxyURLKey is the context key for the request proxy address.
const ProxyURLKey key = iota

var (
	// ErrForbiddenDomain is the error thrown if visiting
	// a domain which is not allowed in AllowedDomains
	ErrForbiddenDomain = errors.New("Forbidden domain")
	// ErrMissingURL is the error type for missing URL errors
	ErrMissingURL = errors.New("Missing URL")
	// ErrMaxDepth is the error type for exceeding max depth
	ErrMaxDepth = errors.New("Max depth limit reached")
	// ErrForbiddenURL is the error thrown if visiting
	// a URL which is forbidden by one of the DisallowedURLFilters
	ErrForbiddenURL = errors.New("ForbiddenURL")

	// ErrNoURLFiltersMatch is the error thrown if visiting
	// a URL which is not matched by any of the URLFilters
	ErrNoURLFiltersMatch = errors.New("No URLFilters match")
	// ErrAlreadyVisited is the error type for already visited URLs
	ErrAlreadyVisited = errors.New("URL already visited")
	// ErrRobotsTxtBlocked is the error type for robots.txt errors
	ErrRobotsTxtBlocked = errors.New("URL blocked by robots.txt")
	// ErrNoCookieJar is the error type for missing cookie jar
	ErrNoCookieJar = errors.New("Cookie jar is not available")
	// ErrNoPattern is the error type for LimitRules without patterns
	ErrNoPattern = errors.New("No pattern defined in LimitRule")
	// ErrEmptyProxyURL is the error type for empty Proxy URL list
	ErrEmptyProxyURL = errors.New("Proxy URL list is empty")
	// ErrAbortedAfterHeaders is the error returned when OnResponseHeaders aborts the transfer.
	ErrAbortedAfterHeaders = errors.New("Aborted after receiving response headers")
	// ErrQueueFull is the error returned when the queue is full
	ErrQueueFull = errors.New("Queue MaxSize reached")
)

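// Collector options can also be supplied through environment variables using
// the COLLY_ prefix handled by parseSettingsFromEnv, for example
// COLLY_MAX_DEPTH=3 or COLLY_USER_AGENT=mybot (illustrative values).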
var envMap = map[string]func(*Collector, string){
	"ALLOWED_DOMAINS": func(c *Collector, val string) {
		c.AllowedDomains = strings.Split(val, ",")
	},
	"CACHE_DIR": func(c *Collector, val string) {
		c.CacheDir = val
	},
	"DETECT_CHARSET": func(c *Collector, val string) {
		c.DetectCharset = isYesString(val)
	},
	"DISABLE_COOKIES": func(c *Collector, _ string) {
		c.backend.Client.Jar = nil
	},
	"DISALLOWED_DOMAINS": func(c *Collector, val string) {
		c.DisallowedDomains = strings.Split(val, ",")
	},
	"IGNORE_ROBOTSTXT": func(c *Collector, val string) {
		c.IgnoreRobotsTxt = isYesString(val)
	},
	"FOLLOW_REDIRECTS": func(c *Collector, val string) {
		if !isYesString(val) {
			c.redirectHandler = func(req *http.Request, via []*http.Request) error {
				return http.ErrUseLastResponse
			}
		}
	},
	"MAX_BODY_SIZE": func(c *Collector, val string) {
		size, err := strconv.Atoi(val)
		if err == nil {
			c.MaxBodySize = size
		}
	},
	"MAX_DEPTH": func(c *Collector, val string) {
		maxDepth, err := strconv.Atoi(val)
		if err == nil {
			c.MaxDepth = maxDepth
		}
	},
	"PARSE_HTTP_ERROR_RESPONSE": func(c *Collector, val string) {
		c.ParseHTTPErrorResponse = isYesString(val)
	},
	"TRACE_HTTP": func(c *Collector, val string) {
		c.TraceHTTP = isYesString(val)
	},
	"USER_AGENT": func(c *Collector, val string) {
		c.UserAgent = val
	},
}

// NewCollector creates a new Collector instance with default configuration
func NewCollector(options ...CollectorOption) *Collector {
	c := &Collector{}
	c.Init()

	for _, f := range options {
		f(c)
	}

	c.parseSettingsFromEnv()

	return c
}
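
// A minimal usage sketch (illustrative, not part of the API):
//
//	c := NewCollector(
//		AllowedDomains("example.com"),
//		MaxDepth(2),
//	)
//	c.OnHTML("a[href]", func(e *HTMLElement) {
//		e.Request.Visit(e.Attr("href"))
//	})
//	c.Visit("https://example.com/")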

// UserAgent sets the user agent used by the Collector.
func UserAgent(ua string) CollectorOption {
	return func(c *Collector) {
		c.UserAgent = ua
	}
}

// MaxDepth limits the recursion depth of visited URLs.
func MaxDepth(depth int) CollectorOption {
	return func(c *Collector) {
		c.MaxDepth = depth
	}
}

// AllowedDomains sets the domain whitelist used by the Collector.
func AllowedDomains(domains ...string) CollectorOption {
	return func(c *Collector) {
		c.AllowedDomains = domains
	}
}

// ParseHTTPErrorResponse allows parsing responses with HTTP errors
func ParseHTTPErrorResponse() CollectorOption {
	return func(c *Collector) {
		c.ParseHTTPErrorResponse = true
	}
}

// DisallowedDomains sets the domain blacklist used by the Collector.
func DisallowedDomains(domains ...string) CollectorOption {
	return func(c *Collector) {
		c.DisallowedDomains = domains
	}
}

// DisallowedURLFilters sets the list of regular expressions which restricts
// visiting URLs. If any of the rules matches a URL, the request will be stopped.
func DisallowedURLFilters(filters ...*regexp.Regexp) CollectorOption {
	return func(c *Collector) {
		c.DisallowedURLFilters = filters
	}
}

// URLFilters sets the list of regular expressions which restricts
// visiting URLs. If any of the rules matches a URL, the request won't be stopped.
func URLFilters(filters ...*regexp.Regexp) CollectorOption {
	return func(c *Collector) {
		c.URLFilters = filters
	}
}

// AllowURLRevisit instructs the Collector to allow multiple downloads of the same URL
func AllowURLRevisit() CollectorOption {
	return func(c *Collector) {
		c.AllowURLRevisit = true
	}
}

// MaxBodySize sets the limit of the retrieved response body in bytes.
func MaxBodySize(sizeInBytes int) CollectorOption {
	return func(c *Collector) {
		c.MaxBodySize = sizeInBytes
	}
}

// CacheDir specifies the location where GET requests are cached as files.
func CacheDir(path string) CollectorOption {
	return func(c *Collector) {
		c.CacheDir = path
	}
}

// IgnoreRobotsTxt instructs the Collector to ignore any restrictions
// set by the target host's robots.txt file.
func IgnoreRobotsTxt() CollectorOption {
	return func(c *Collector) {
		c.IgnoreRobotsTxt = true
	}
}

// TraceHTTP instructs the Collector to collect and report request trace data
// on the Response.Trace.
func TraceHTTP() CollectorOption {
	return func(c *Collector) {
		c.TraceHTTP = true
	}
}

// ID sets the unique identifier of the Collector.
func ID(id uint32) CollectorOption {
	return func(c *Collector) {
		c.ID = id
	}
}

// Async turns on asynchronous network requests. The optional boolean
// argument is currently ignored; calling Async always enables it.
func Async(a ...bool) CollectorOption {
	return func(c *Collector) {
		c.Async = true
	}
}

// DetectCharset enables character encoding detection for non-utf8 response bodies
// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
func DetectCharset() CollectorOption {
	return func(c *Collector) {
		c.DetectCharset = true
	}
}

// Debugger sets the debugger used by the Collector.
func Debugger(d debug.Debugger) CollectorOption {
	return func(c *Collector) {
		d.Init()
		c.debugger = d
	}
}

// CheckHead performs a HEAD request before every GET to pre-validate the response
func CheckHead() CollectorOption {
	return func(c *Collector) {
		c.CheckHead = true
	}
}

// Init initializes the Collector's private variables and sets default
// configuration for the Collector
func (c *Collector) Init() {
	c.UserAgent = "colly - https://github.com/gocolly/colly/v2"
	c.MaxDepth = 0
	c.store = &storage.InMemoryStorage{}
	c.store.Init()
	c.MaxBodySize = 10 * 1024 * 1024
	c.backend = &httpBackend{}
	jar, _ := cookiejar.New(nil)
	c.backend.Init(jar)
	c.backend.Client.CheckRedirect = c.checkRedirectFunc()
	c.wg = &sync.WaitGroup{}
	c.lock = &sync.RWMutex{}
	c.robotsMap = make(map[string]*robotstxt.RobotsData)
	c.IgnoreRobotsTxt = true
	c.ID = atomic.AddUint32(&collectorCounter, 1)
	c.TraceHTTP = false
}

// Appengine will replace the Collector's backend http.Client
// with an http.Client that is provided by appengine/urlfetch.
// This function should be used when the scraper is run on
// Google App Engine. Example:
//   func startScraper(w http.ResponseWriter, r *http.Request) {
//     ctx := appengine.NewContext(r)
//     c := colly.NewCollector()
//     c.Appengine(ctx)
//      ...
//     c.Visit("https://google.ca")
//   }
func (c *Collector) Appengine(ctx context.Context) {
	client := urlfetch.Client(ctx)
	client.Jar = c.backend.Client.Jar
	client.CheckRedirect = c.backend.Client.CheckRedirect
	client.Timeout = c.backend.Client.Timeout

	c.backend.Client = client
}

// Visit starts Collector's collecting job by creating a
// request to the URL specified in parameter.
// Visit also calls the previously provided callbacks
func (c *Collector) Visit(URL string) error {
	if c.CheckHead {
		if check := c.scrape(URL, "HEAD", 1, nil, nil, nil, true); check != nil {
			return check
		}
	}
	return c.scrape(URL, "GET", 1, nil, nil, nil, true)
}

// HasVisited checks if the provided URL has been visited
func (c *Collector) HasVisited(URL string) (bool, error) {
	return c.checkHasVisited(URL, nil)
}

// HasPosted checks if the provided URL and requestData combination has been visited.
// This method is useful to avoid re-visiting the same URL with the same POST body.
func (c *Collector) HasPosted(URL string, requestData map[string]string) (bool, error) {
	return c.checkHasVisited(URL, requestData)
}

// Head starts a collector job by creating a HEAD request.
func (c *Collector) Head(URL string) error {
	return c.scrape(URL, "HEAD", 1, nil, nil, nil, false)
}

// Post starts a collector job by creating a POST request.
// Post also calls the previously provided callbacks
func (c *Collector) Post(URL string, requestData map[string]string) error {
	return c.scrape(URL, "POST", 1, createFormReader(requestData), nil, nil, true)
}
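
// For example (illustrative values):
//
//	c.Post("https://example.com/login", map[string]string{"user": "admin", "password": "secret"})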

// PostRaw starts a collector job by creating a POST request with raw binary data.
// PostRaw also calls the previously provided callbacks
func (c *Collector) PostRaw(URL string, requestData []byte) error {
	return c.scrape(URL, "POST", 1, bytes.NewReader(requestData), nil, nil, true)
}

// PostMultipart starts a collector job by creating a Multipart POST request
// with raw binary data.  PostMultipart also calls the previously provided callbacks
func (c *Collector) PostMultipart(URL string, requestData map[string][]byte) error {
	boundary := randomBoundary()
	hdr := http.Header{}
	hdr.Set("Content-Type", "multipart/form-data; boundary="+boundary)
	hdr.Set("User-Agent", c.UserAgent)
	return c.scrape(URL, "POST", 1, createMultipartReader(boundary, requestData), nil, hdr, true)
}

// Request starts a collector job by creating a custom HTTP request
// where method, context, headers and request data can be specified.
// Set requestData, ctx, hdr parameters to nil if you don't want to use them.
// Valid methods:
//   - "GET"
//   - "HEAD"
//   - "POST"
//   - "PUT"
//   - "DELETE"
//   - "PATCH"
//   - "OPTIONS"
func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error {
	return c.scrape(URL, method, 1, requestData, ctx, hdr, true)
}
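
// An illustrative sketch of a custom request with extra headers:
//
//	hdr := http.Header{}
//	hdr.Set("Authorization", "Bearer <token>")
//	c.Request("PUT", "https://example.com/api/item", strings.NewReader(`{"name":"x"}`), nil, hdr)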

// SetDebugger attaches a debugger to the collector
func (c *Collector) SetDebugger(d debug.Debugger) {
	d.Init()
	c.debugger = d
}

// UnmarshalRequest creates a Request from serialized data
func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) {
	req := &serializableRequest{}
	err := json.Unmarshal(r, req)
	if err != nil {
		return nil, err
	}

	u, err := url.Parse(req.URL)
	if err != nil {
		return nil, err
	}

	ctx := NewContext()
	for k, v := range req.Ctx {
		ctx.Put(k, v)
	}

	return &Request{
		Method:    req.Method,
		URL:       u,
		Depth:     req.Depth,
		Body:      bytes.NewReader(req.Body),
		Ctx:       ctx,
		ID:        atomic.AddUint32(&c.requestCount, 1),
		Headers:   &req.Headers,
		collector: c,
	}, nil
}

func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error {
	parsedURL, err := url.Parse(u)
	if err != nil {
		return err
	}
	if err := c.requestCheck(u, parsedURL, method, requestData, depth, checkRevisit); err != nil {
		return err
	}

	if hdr == nil {
		hdr = http.Header{"User-Agent": []string{c.UserAgent}}
	}
	rc, ok := requestData.(io.ReadCloser)
	if !ok && requestData != nil {
		rc = ioutil.NopCloser(requestData)
	}
	// The Go HTTP API ignores "Host" in the headers, preferring the client
	// to use the Host field on Request.
	host := parsedURL.Host
	if hostHeader := hdr.Get("Host"); hostHeader != "" {
		host = hostHeader
	}
	req := &http.Request{
		Method:     method,
		URL:        parsedURL,
		Proto:      "HTTP/1.1",
		ProtoMajor: 1,
		ProtoMinor: 1,
		Header:     hdr,
		Body:       rc,
		Host:       host,
	}
	setRequestBody(req, requestData)
	u = parsedURL.String()
	c.wg.Add(1)
	if c.Async {
		go c.fetch(u, method, depth, requestData, ctx, hdr, req)
		return nil
	}
	return c.fetch(u, method, depth, requestData, ctx, hdr, req)
}

func setRequestBody(req *http.Request, body io.Reader) {
	if body != nil {
		switch v := body.(type) {
		case *bytes.Buffer:
			req.ContentLength = int64(v.Len())
			buf := v.Bytes()
			req.GetBody = func() (io.ReadCloser, error) {
				r := bytes.NewReader(buf)
				return ioutil.NopCloser(r), nil
			}
		case *bytes.Reader:
			req.ContentLength = int64(v.Len())
			snapshot := *v
			req.GetBody = func() (io.ReadCloser, error) {
				r := snapshot
				return ioutil.NopCloser(&r), nil
			}
		case *strings.Reader:
			req.ContentLength = int64(v.Len())
			snapshot := *v
			req.GetBody = func() (io.ReadCloser, error) {
				r := snapshot
				return ioutil.NopCloser(&r), nil
			}
		}
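		// As in net/http.NewRequest, a zero-length body is represented as
		// http.NoBody so redirects and retries need not re-read an empty reader.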
		if req.GetBody != nil && req.ContentLength == 0 {
			req.Body = http.NoBody
			req.GetBody = func() (io.ReadCloser, error) { return http.NoBody, nil }
		}
	}
}

func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, req *http.Request) error {
	defer c.wg.Done()
	if ctx == nil {
		ctx = NewContext()
	}
	request := &Request{
		URL:       req.URL,
		Headers:   &req.Header,
		Ctx:       ctx,
		Depth:     depth,
		Method:    method,
		Body:      requestData,
		collector: c,
		ID:        atomic.AddUint32(&c.requestCount, 1),
	}

	c.handleOnRequest(request)

	if request.abort {
		return nil
	}

	if method == "POST" && req.Header.Get("Content-Type") == "" {
		req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	}

	if req.Header.Get("Accept") == "" {
		req.Header.Set("Accept", "*/*")
	}

	var hTrace *HTTPTrace
	if c.TraceHTTP {
		hTrace = &HTTPTrace{}
		req = hTrace.WithTrace(req)
	}
	checkHeadersFunc := func(statusCode int, headers http.Header) bool {
		c.handleOnResponseHeaders(&Response{Ctx: ctx, Request: request, StatusCode: statusCode, Headers: &headers})
		return !request.abort
	}

	origURL := req.URL
	response, err := c.backend.Cache(req, c.MaxBodySize, checkHeadersFunc, c.CacheDir)
	if proxyURL, ok := req.Context().Value(ProxyURLKey).(string); ok {
		request.ProxyURL = proxyURL
	}
	if err := c.handleOnError(response, err, request, ctx); err != nil {
		return err
	}
	if req.URL != origURL {
		request.URL = req.URL
		request.Headers = &req.Header
	}
	atomic.AddUint32(&c.responseCount, 1)
	response.Ctx = ctx
	response.Request = request
	response.Trace = hTrace

	err = response.fixCharset(c.DetectCharset, request.ResponseCharacterEncoding)
	if err != nil {
		return err
	}

	c.handleOnResponse(response)

	err = c.handleOnHTML(response)
	if err != nil {
		c.handleOnError(response, err, request, ctx)
	}

	err = c.handleOnXML(response)
	if err != nil {
		c.handleOnError(response, err, request, ctx)
	}

	c.handleOnScraped(response)

	return err
}

func (c *Collector) requestCheck(u string, parsedURL *url.URL, method string, requestData io.Reader, depth int, checkRevisit bool) error {
	if u == "" {
		return ErrMissingURL
	}
	if c.MaxDepth > 0 && c.MaxDepth < depth {
		return ErrMaxDepth
	}
	if len(c.DisallowedURLFilters) > 0 {
		if isMatchingFilter(c.DisallowedURLFilters, []byte(u)) {
			return ErrForbiddenURL
		}
	}
	if len(c.URLFilters) > 0 {
		if !isMatchingFilter(c.URLFilters, []byte(u)) {
			return ErrNoURLFiltersMatch
		}
	}
	if !c.isDomainAllowed(parsedURL.Hostname()) {
		return ErrForbiddenDomain
	}
	if method != "HEAD" && !c.IgnoreRobotsTxt {
		if err := c.checkRobots(parsedURL); err != nil {
			return err
		}
	}
	if checkRevisit && !c.AllowURLRevisit {
		h := fnv.New64a()
		h.Write([]byte(u))

		var uHash uint64
		if method == "GET" {
			uHash = h.Sum64()
		} else if requestData != nil {
			h.Write(streamToByte(requestData))
			uHash = h.Sum64()
		} else {
			return nil
		}

		visited, err := c.store.IsVisited(uHash)
		if err != nil {
			return err
		}
		if visited {
			return ErrAlreadyVisited
		}
		return c.store.Visited(uHash)
	}
	return nil
}

func (c *Collector) isDomainAllowed(domain string) bool {
	for _, d2 := range c.DisallowedDomains {
		if d2 == domain {
			return false
		}
	}
	if len(c.AllowedDomains) == 0 {
		return true
	}
	for _, d2 := range c.AllowedDomains {
		if d2 == domain {
			return true
		}
	}
	return false
}

func (c *Collector) checkRobots(u *url.URL) error {
	c.lock.RLock()
	robot, ok := c.robotsMap[u.Host]
	c.lock.RUnlock()

	if !ok {
		// no robots file cached
		resp, err := c.backend.Client.Get(u.Scheme + "://" + u.Host + "/robots.txt")
		if err != nil {
			return err
		}
		defer resp.Body.Close()

		robot, err = robotstxt.FromResponse(resp)
		if err != nil {
			return err
		}
		c.lock.Lock()
		c.robotsMap[u.Host] = robot
		c.lock.Unlock()
	}

	uaGroup := robot.FindGroup(c.UserAgent)
	if uaGroup == nil {
		return nil
	}

	eu := u.EscapedPath()
	if u.RawQuery != "" {
		eu += "?" + u.Query().Encode()
	}
	if !uaGroup.Test(eu) {
		return ErrRobotsTxtBlocked
	}
	return nil
}

// String is the text representation of the collector.
// It contains useful debug information about the collector's internals
func (c *Collector) String() string {
	return fmt.Sprintf(
		"Requests made: %d (%d responses) | Callbacks: OnRequest: %d, OnHTML: %d, OnResponse: %d, OnError: %d",
		c.requestCount,
		c.responseCount,
		len(c.requestCallbacks),
		len(c.htmlCallbacks),
		len(c.responseCallbacks),
		len(c.errorCallbacks),
	)
}

// Wait returns when the collector jobs are finished
func (c *Collector) Wait() {
	c.wg.Wait()
}

// OnRequest registers a function. Function will be executed on every
// request made by the Collector
func (c *Collector) OnRequest(f RequestCallback) {
	c.lock.Lock()
	if c.requestCallbacks == nil {
		c.requestCallbacks = make([]RequestCallback, 0, 4)
	}
	c.requestCallbacks = append(c.requestCallbacks, f)
	c.lock.Unlock()
}

// OnResponseHeaders registers a function. Function will be executed on every response
// when the headers and status are already received, but the body is not yet read.
//
// Like in OnRequest, you can call Request.Abort to abort the transfer. This might be
// useful if, for example, you're following all hyperlinks, but want to avoid
// downloading files.
//
// Be aware that using this will prevent HTTP/1.1 connection reuse, as
// the only way to abort a download is to immediately close the connection.
// HTTP/2 doesn't suffer from this problem, as it's possible to close
// a specific stream inside the connection.
func (c *Collector) OnResponseHeaders(f ResponseHeadersCallback) {
	c.lock.Lock()
	c.responseHeadersCallbacks = append(c.responseHeadersCallbacks, f)
	c.lock.Unlock()
}
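
// An illustrative sketch that aborts binary downloads early:
//
//	c.OnResponseHeaders(func(r *Response) {
//		if strings.Contains(r.Headers.Get("Content-Type"), "application/octet-stream") {
//			r.Request.Abort()
//		}
//	})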

// OnResponse registers a function. Function will be executed on every response
func (c *Collector) OnResponse(f ResponseCallback) {
	c.lock.Lock()
	if c.responseCallbacks == nil {
		c.responseCallbacks = make([]ResponseCallback, 0, 4)
	}
	c.responseCallbacks = append(c.responseCallbacks, f)
	c.lock.Unlock()
}

// OnHTML registers a function. Function will be executed on every HTML
// element matched by the GoQuery Selector parameter.
// GoQuery Selector is a selector used by https://github.com/PuerkitoBio/goquery
func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback) {
	c.lock.Lock()
	if c.htmlCallbacks == nil {
		c.htmlCallbacks = make([]*htmlCallbackContainer, 0, 4)
	}
	c.htmlCallbacks = append(c.htmlCallbacks, &htmlCallbackContainer{
		Selector: goquerySelector,
		Function: f,
	})
	c.lock.Unlock()
}
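
// For example (illustrative selector):
//
//	c.OnHTML("div.article h1", func(e *HTMLElement) {
//		fmt.Println("title:", e.Text)
//	})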

// OnXML registers a function. Function will be executed on every XML
// element matched by the xpath Query parameter.
// xpath Query is used by https://github.com/antchfx/xmlquery
func (c *Collector) OnXML(xpathQuery string, f XMLCallback) {
	c.lock.Lock()
	if c.xmlCallbacks == nil {
		c.xmlCallbacks = make([]*xmlCallbackContainer, 0, 4)
	}
	c.xmlCallbacks = append(c.xmlCallbacks, &xmlCallbackContainer{
		Query:    xpathQuery,
		Function: f,
	})
	c.lock.Unlock()
}
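
// For example (illustrative query):
//
//	c.OnXML("//title", func(e *XMLElement) {
//		fmt.Println("title:", e.Text)
//	})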

// OnHTMLDetach deregisters a function. The function will not be executed after it is detached.
func (c *Collector) OnHTMLDetach(goquerySelector string) {
	c.lock.Lock()
	deleteIdx := -1
	for i, cc := range c.htmlCallbacks {
		if cc.Selector == goquerySelector {
			deleteIdx = i
			break
		}
	}
	if deleteIdx != -1 {
		c.htmlCallbacks = append(c.htmlCallbacks[:deleteIdx], c.htmlCallbacks[deleteIdx+1:]...)
	}
	c.lock.Unlock()
}

// OnXMLDetach deregisters a function. The function will not be executed after it is detached.
func (c *Collector) OnXMLDetach(xpathQuery string) {
	c.lock.Lock()
	deleteIdx := -1
	for i, cc := range c.xmlCallbacks {
		if cc.Query == xpathQuery {
			deleteIdx = i
			break
		}
	}
	if deleteIdx != -1 {
		c.xmlCallbacks = append(c.xmlCallbacks[:deleteIdx], c.xmlCallbacks[deleteIdx+1:]...)
	}
	c.lock.Unlock()
}

// OnError registers a function. Function will be executed if an error
// occurs during the HTTP request.
func (c *Collector) OnError(f ErrorCallback) {
	c.lock.Lock()
	if c.errorCallbacks == nil {
		c.errorCallbacks = make([]ErrorCallback, 0, 4)
	}
	c.errorCallbacks = append(c.errorCallbacks, f)
	c.lock.Unlock()
}

// OnScraped registers a function. Function will be executed after
// OnHTML, as a final part of the scraping.
func (c *Collector) OnScraped(f ScrapedCallback) {
	c.lock.Lock()
	if c.scrapedCallbacks == nil {
		c.scrapedCallbacks = make([]ScrapedCallback, 0, 4)
	}
	c.scrapedCallbacks = append(c.scrapedCallbacks, f)
	c.lock.Unlock()
}

// SetClient will override the previously set http.Client
func (c *Collector) SetClient(client *http.Client) {
	c.backend.Client = client
}

// WithTransport allows you to set a custom http.RoundTripper (transport)
func (c *Collector) WithTransport(transport http.RoundTripper) {
	c.backend.Client.Transport = transport
}

// DisableCookies turns off cookie handling
func (c *Collector) DisableCookies() {
	c.backend.Client.Jar = nil
}

// SetCookieJar overrides the previously set cookie jar
func (c *Collector) SetCookieJar(j http.CookieJar) {
	c.backend.Client.Jar = j
}

// SetRequestTimeout overrides the default timeout (10 seconds) for this collector
func (c *Collector) SetRequestTimeout(timeout time.Duration) {
	c.backend.Client.Timeout = timeout
}

// SetStorage overrides the default in-memory storage.
// Storage stores scraping related data like cookies and visited URLs.
func (c *Collector) SetStorage(s storage.Storage) error {
	if err := s.Init(); err != nil {
		return err
	}
	c.store = s
	c.backend.Client.Jar = createJar(s)
	return nil
}

// SetProxy sets a proxy for the collector. This method overrides the previously
// used transport if it is not an *http.Transport.
// The proxy type is determined by the URL scheme. "http"
// and "socks5" are supported. If the scheme is empty,
// "http" is assumed.
func (c *Collector) SetProxy(proxyURL string) error {
	proxyParsed, err := url.Parse(proxyURL)
	if err != nil {
		return err
	}

	c.SetProxyFunc(http.ProxyURL(proxyParsed))

	return nil
}

// SetProxyFunc sets a custom proxy setter/switcher function.
// See built-in ProxyFuncs for more details.
// This method overrides the previously used transport
// if it is not an *http.Transport.
// The proxy type is determined by the URL scheme. "http"
// and "socks5" are supported. If the scheme is empty,
// "http" is assumed.
func (c *Collector) SetProxyFunc(p ProxyFunc) {
	t, ok := c.backend.Client.Transport.(*http.Transport)
	if c.backend.Client.Transport != nil && ok {
		t.Proxy = p
	} else {
		c.backend.Client.Transport = &http.Transport{
			Proxy: p,
		}
	}
}
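
// A rotating proxy can be plugged in here; assuming the companion
// gocolly/colly/v2/proxy package, an illustrative sketch:
//
//	rp, err := proxy.RoundRobinProxySwitcher("socks5://127.0.0.1:1337", "http://127.0.0.1:8080")
//	if err == nil {
//		c.SetProxyFunc(rp)
//	}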

func createEvent(eventType string, requestID, collectorID uint32, kvargs map[string]string) *debug.Event {
	return &debug.Event{
		CollectorID: collectorID,
		RequestID:   requestID,
		Type:        eventType,
		Values:      kvargs,
	}
}

func (c *Collector) handleOnRequest(r *Request) {
	if c.debugger != nil {
		c.debugger.Event(createEvent("request", r.ID, c.ID, map[string]string{
			"url": r.URL.String(),
		}))
	}
	for _, f := range c.requestCallbacks {
		f(r)
	}
}

func (c *Collector) handleOnResponse(r *Response) {
	if c.debugger != nil {
		c.debugger.Event(createEvent("response", r.Request.ID, c.ID, map[string]string{
			"url":    r.Request.URL.String(),
			"status": http.StatusText(r.StatusCode),
		}))
	}
	for _, f := range c.responseCallbacks {
		f(r)
	}
}

func (c *Collector) handleOnResponseHeaders(r *Response) {
	if c.debugger != nil {
		c.debugger.Event(createEvent("responseHeaders", r.Request.ID, c.ID, map[string]string{
			"url":    r.Request.URL.String(),
			"status": http.StatusText(r.StatusCode),
		}))
	}
	for _, f := range c.responseHeadersCallbacks {
		f(r)
	}
}

func (c *Collector) handleOnHTML(resp *Response) error {
	if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
		return nil
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
	if err != nil {
		return err
	}
	if href, found := doc.Find("base[href]").Attr("href"); found {
		resp.Request.baseURL, _ = resp.Request.URL.Parse(href)
	}
	for _, cc := range c.htmlCallbacks {
		i := 0
		doc.Find(cc.Selector).Each(func(_ int, s *goquery.Selection) {
			for _, n := range s.Nodes {
				e := NewHTMLElementFromSelectionNode(resp, s, n, i)
				i++
				if c.debugger != nil {
					c.debugger.Event(createEvent("html", resp.Request.ID, c.ID, map[string]string{
						"selector": cc.Selector,
						"url":      resp.Request.URL.String(),
					}))
				}
				cc.Function(e)
			}
		})
	}
	return nil
}

func (c *Collector) handleOnXML(resp *Response) error {
	if len(c.xmlCallbacks) == 0 {
		return nil
	}
	contentType := strings.ToLower(resp.Headers.Get("Content-Type"))
	isXMLFile := strings.HasSuffix(strings.ToLower(resp.Request.URL.Path), ".xml") || strings.HasSuffix(strings.ToLower(resp.Request.URL.Path), ".xml.gz")
	if !strings.Contains(contentType, "html") && (!strings.Contains(contentType, "xml") && !isXMLFile) {
		return nil
	}

	if strings.Contains(contentType, "html") {
		doc, err := htmlquery.Parse(bytes.NewBuffer(resp.Body))
		if err != nil {
			return err
		}
		if e := htmlquery.FindOne(doc, "//base"); e != nil {
			for _, a := range e.Attr {
				if a.Key == "href" {
					resp.Request.baseURL, _ = resp.Request.URL.Parse(a.Val)
					break
				}
			}
		}

		for _, cc := range c.xmlCallbacks {
			for _, n := range htmlquery.Find(doc, cc.Query) {
				e := NewXMLElementFromHTMLNode(resp, n)
				if c.debugger != nil {
					c.debugger.Event(createEvent("xml", resp.Request.ID, c.ID, map[string]string{
						"selector": cc.Query,
						"url":      resp.Request.URL.String(),
					}))
				}
				cc.Function(e)
			}
		}
	} else if strings.Contains(contentType, "xml") || isXMLFile {
		doc, err := xmlquery.Parse(bytes.NewBuffer(resp.Body))
		if err != nil {
			return err
		}

		for _, cc := range c.xmlCallbacks {
			xmlquery.FindEach(doc, cc.Query, func(i int, n *xmlquery.Node) {
				e := NewXMLElementFromXMLNode(resp, n)
				if c.debugger != nil {
					c.debugger.Event(createEvent("xml", resp.Request.ID, c.ID, map[string]string{
						"selector": cc.Query,
						"url":      resp.Request.URL.String(),
					}))
				}
				cc.Function(e)
			})
		}
	}
	return nil
}

func (c *Collector) handleOnError(response *Response, err error, request *Request, ctx *Context) error {
	if err == nil && (c.ParseHTTPErrorResponse || response.StatusCode < 203) {
		return nil
	}
	if err == nil && response.StatusCode >= 203 {
		err = errors.New(http.StatusText(response.StatusCode))
	}
	if response == nil {
		response = &Response{
			Request: request,
			Ctx:     ctx,
		}
	}
	if c.debugger != nil {
		c.debugger.Event(createEvent("error", request.ID, c.ID, map[string]string{
			"url":    request.URL.String(),
			"status": http.StatusText(response.StatusCode),
		}))
	}
	if response.Request == nil {
		response.Request = request
	}
	if response.Ctx == nil {
		response.Ctx = request.Ctx
	}
	for _, f := range c.errorCallbacks {
		f(response, err)
	}
	return err
}

func (c *Collector) handleOnScraped(r *Response) {
	if c.debugger != nil {
		c.debugger.Event(createEvent("scraped", r.Request.ID, c.ID, map[string]string{
			"url": r.Request.URL.String(),
		}))
	}
	for _, f := range c.scrapedCallbacks {
		f(r)
	}
}

// Limit adds a new LimitRule to the collector
func (c *Collector) Limit(rule *LimitRule) error {
	return c.backend.Limit(rule)
}
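
// For example (illustrative values):
//
//	c.Limit(&LimitRule{
//		DomainGlob:  "*example.*",
//		Parallelism: 2,
//		RandomDelay: 5 * time.Second,
//	})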

// Limits adds new LimitRules to the collector
func (c *Collector) Limits(rules []*LimitRule) error {
	return c.backend.Limits(rules)
}

// SetRedirectHandler sets the function that is invoked to decide whether the
// Collector follows an HTTP redirect (see http.Client.CheckRedirect).
func (c *Collector) SetRedirectHandler(f func(req *http.Request, via []*http.Request) error) {
	c.redirectHandler = f
	c.backend.Client.CheckRedirect = c.checkRedirectFunc()
}
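
// For example, redirects can be suppressed entirely (illustrative):
//
//	c.SetRedirectHandler(func(req *http.Request, via []*http.Request) error {
//		return http.ErrUseLastResponse
//	})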

// SetCookies handles the receipt of the cookies in a reply for the given URL
func (c *Collector) SetCookies(URL string, cookies []*http.Cookie) error {
	if c.backend.Client.Jar == nil {
		return ErrNoCookieJar
	}
	u, err := url.Parse(URL)
	if err != nil {
		return err
	}
	c.backend.Client.Jar.SetCookies(u, cookies)
	return nil
}

// Cookies returns the cookies to send in a request for the given URL.
func (c *Collector) Cookies(URL string) []*http.Cookie {
	if c.backend.Client.Jar == nil {
		return nil
	}
	u, err := url.Parse(URL)
	if err != nil {
		return nil
	}
	return c.backend.Client.Jar.Cookies(u)
}

// Clone creates an exact copy of a Collector without callbacks.
// HTTP backend, robots.txt cache and cookie jar are shared
// between collectors.
func (c *Collector) Clone() *Collector {
	return &Collector{
		AllowedDomains:         c.AllowedDomains,
		AllowURLRevisit:        c.AllowURLRevisit,
		CacheDir:               c.CacheDir,
		DetectCharset:          c.DetectCharset,
		DisallowedDomains:      c.DisallowedDomains,
		ID:                     atomic.AddUint32(&collectorCounter, 1),
		IgnoreRobotsTxt:        c.IgnoreRobotsTxt,
		MaxBodySize:            c.MaxBodySize,
		MaxDepth:               c.MaxDepth,
		DisallowedURLFilters:   c.DisallowedURLFilters,
		URLFilters:             c.URLFilters,
		CheckHead:              c.CheckHead,
		ParseHTTPErrorResponse: c.ParseHTTPErrorResponse,
		UserAgent:              c.UserAgent,
		TraceHTTP:              c.TraceHTTP,
		store:                  c.store,
		backend:                c.backend,
		debugger:               c.debugger,
		Async:                  c.Async,
		redirectHandler:        c.redirectHandler,
		errorCallbacks:         make([]ErrorCallback, 0, 8),
		htmlCallbacks:          make([]*htmlCallbackContainer, 0, 8),
		xmlCallbacks:           make([]*xmlCallbackContainer, 0, 8),
		scrapedCallbacks:       make([]ScrapedCallback, 0, 8),
		lock:                   c.lock,
		requestCallbacks:       make([]RequestCallback, 0, 8),
		responseCallbacks:      make([]ResponseCallback, 0, 8),
		robotsMap:              c.robotsMap,
		wg:                     &sync.WaitGroup{},
	}
}
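
// A common pattern is to clone a collector for a second scraping phase that
// shares the backend but registers its own callbacks (illustrative):
//
//	detailCollector := c.Clone()
//	detailCollector.OnHTML("h1", func(e *HTMLElement) { /* ... */ })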

func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Request) error {
	return func(req *http.Request, via []*http.Request) error {
		if !c.isDomainAllowed(req.URL.Hostname()) {
			return fmt.Errorf("Not following redirect to %s because it's not in AllowedDomains", req.URL.Host)
		}

		if c.redirectHandler != nil {
			return c.redirectHandler(req, via)
		}

		// Honor Go's default maximum of 10 redirects
		if len(via) >= 10 {
			return http.ErrUseLastResponse
		}

		lastRequest := via[len(via)-1]

		// If the domain has changed, remove the Authorization header if it exists
		if req.URL.Host != lastRequest.URL.Host {
			req.Header.Del("Authorization")
		}

		return nil
	}
}

func (c *Collector) parseSettingsFromEnv() {
	for _, e := range os.Environ() {
		if !strings.HasPrefix(e, "COLLY_") {
			continue
		}
		pair := strings.SplitN(e[6:], "=", 2)
		if f, ok := envMap[pair[0]]; ok {
			f(c, pair[1])
		} else {
			log.Println("Unknown environment variable:", pair[0])
		}
	}
}

func (c *Collector) checkHasVisited(URL string, requestData map[string]string) (bool, error) {
	h := fnv.New64a()
	h.Write([]byte(URL))

	if requestData != nil {
		h.Write(streamToByte(createFormReader(requestData)))
	}

	return c.store.IsVisited(h.Sum64())
}

// SanitizeFileName replaces dangerous characters in a string
// so the return value can be used as a safe file name.
func SanitizeFileName(fileName string) string {
	ext := filepath.Ext(fileName)
	cleanExt := sanitize.BaseName(ext)
	if cleanExt == "" {
		cleanExt = ".unknown"
	}
	return strings.Replace(fmt.Sprintf(
		"%s.%s",
		sanitize.BaseName(fileName[:len(fileName)-len(ext)]),
		cleanExt[1:],
	), "-", "_", -1)
}

func createFormReader(data map[string]string) io.Reader {
	form := url.Values{}
	for k, v := range data {
		form.Add(k, v)
	}
	return strings.NewReader(form.Encode())
}

func createMultipartReader(boundary string, data map[string][]byte) io.Reader {
	dashBoundary := "--" + boundary

	body := []byte{}
	buffer := bytes.NewBuffer(body)

	buffer.WriteString("Content-type: multipart/form-data; boundary=" + boundary + "\n\n")
	for contentType, content := range data {
		buffer.WriteString(dashBoundary + "\n")
		buffer.WriteString("Content-Disposition: form-data; name=" + contentType + "\n")
		buffer.WriteString(fmt.Sprintf("Content-Length: %d \n\n", len(content)))
		buffer.Write(content)
		buffer.WriteString("\n")
	}
	buffer.WriteString(dashBoundary + "--\n\n")
	return buffer
}

// randomBoundary was borrowed from
// github.com/golang/go/mime/multipart/writer.go#randomBoundary
func randomBoundary() string {
	var buf [30]byte
	_, err := io.ReadFull(rand.Reader, buf[:])
	if err != nil {
		panic(err)
	}
	return fmt.Sprintf("%x", buf[:])
}

func isYesString(s string) bool {
	switch strings.ToLower(s) {
	case "1", "yes", "true", "y":
		return true
	}
	return false
}

func createJar(s storage.Storage) http.CookieJar {
	return &cookieJarSerializer{store: s, lock: &sync.RWMutex{}}
}

func (j *cookieJarSerializer) SetCookies(u *url.URL, cookies []*http.Cookie) {
	j.lock.Lock()
	defer j.lock.Unlock()
	cookieStr := j.store.Cookies(u)

	// Merge existing cookies, new cookies have precedence.
	cnew := make([]*http.Cookie, len(cookies))
	copy(cnew, cookies)
	existing := storage.UnstringifyCookies(cookieStr)
	for _, c := range existing {
		if !storage.ContainsCookie(cnew, c.Name) {
			cnew = append(cnew, c)
		}
	}
	j.store.SetCookies(u, storage.StringifyCookies(cnew))
}

func (j *cookieJarSerializer) Cookies(u *url.URL) []*http.Cookie {
	cookies := storage.UnstringifyCookies(j.store.Cookies(u))
	// Filter.
	now := time.Now()
	cnew := make([]*http.Cookie, 0, len(cookies))
	for _, c := range cookies {
		// Drop expired cookies.
		if c.RawExpires != "" && c.Expires.Before(now) {
			continue
		}
		// Drop secure cookies if not over https.
		if c.Secure && u.Scheme != "https" {
			continue
		}
		cnew = append(cnew, c)
	}
	return cnew
}

func isMatchingFilter(fs []*regexp.Regexp, d []byte) bool {
	for _, r := range fs {
		if r.Match(d) {
			return true
		}
	}
	return false
}

func streamToByte(r io.Reader) []byte {
	buf := new(bytes.Buffer)
	buf.ReadFrom(r)

	if strReader, k := r.(*strings.Reader); k {
		strReader.Seek(0, 0)
	} else if bReader, kb := r.(*bytes.Reader); kb {
		bReader.Seek(0, 0)
	}

	return buf.Bytes()
}