1// Copyright 2018 Adam Tauber
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//      http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
// Package colly implements an HTTP scraping framework
16package colly
17
18import (
19	"bytes"
20	"context"
21	"crypto/rand"
22	"encoding/json"
23	"errors"
24	"fmt"
25	"hash/fnv"
26	"io"
27	"io/ioutil"
28	"log"
29	"net/http"
30	"net/http/cookiejar"
31	"net/url"
32	"os"
33	"path/filepath"
34	"regexp"
35	"strconv"
36	"strings"
37	"sync"
38	"sync/atomic"
39	"time"
40
41	"google.golang.org/appengine/urlfetch"
42
43	"github.com/PuerkitoBio/goquery"
44	"github.com/antchfx/htmlquery"
45	"github.com/antchfx/xmlquery"
46	"github.com/kennygrant/sanitize"
47	"github.com/temoto/robotstxt"
48
49	"github.com/gocolly/colly/debug"
50	"github.com/gocolly/colly/storage"
51)
52
53// Collector provides the scraper instance for a scraping job
54type Collector struct {
55	// UserAgent is the User-Agent string used by HTTP requests
56	UserAgent string
57	// MaxDepth limits the recursion depth of visited URLs.
58	// Set it to 0 for infinite recursion (default).
59	MaxDepth int
60	// AllowedDomains is a domain whitelist.
61	// Leave it blank to allow any domains to be visited
62	AllowedDomains []string
63	// DisallowedDomains is a domain blacklist.
64	DisallowedDomains []string
	// DisallowedURLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches a URL the
	// request will be stopped. DisallowedURLFilters will
	// be evaluated before URLFilters.
	// Leave it blank to allow any URLs to be visited.
	DisallowedURLFilters []*regexp.Regexp
	// URLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches a URL the
	// request won't be stopped. DisallowedURLFilters will
	// be evaluated before URLFilters.
	// Leave it blank to allow any URLs to be visited.
	URLFilters []*regexp.Regexp
78
79	// AllowURLRevisit allows multiple downloads of the same URL
80	AllowURLRevisit bool
81	// MaxBodySize is the limit of the retrieved response body in bytes.
82	// 0 means unlimited.
83	// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
84	MaxBodySize int
85	// CacheDir specifies a location where GET requests are cached as files.
86	// When it's not defined, caching is disabled.
87	CacheDir string
88	// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
89	// the target host's robots.txt file.  See http://www.robotstxt.org/ for more
90	// information.
91	IgnoreRobotsTxt bool
	// Async turns on asynchronous network communication. Use Collector.Wait() to
	// be sure all requests have finished.
	Async bool
	// ParseHTTPErrorResponse allows parsing HTTP responses with non-2xx status codes.
	// By default, Colly parses only successful HTTP responses. Set ParseHTTPErrorResponse
	// to true to enable it.
98	ParseHTTPErrorResponse bool
99	// ID is the unique identifier of a collector
100	ID uint32
101	// DetectCharset can enable character encoding detection for non-utf8 response bodies
102	// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
103	DetectCharset bool
104	// RedirectHandler allows control on how a redirect will be managed
105	RedirectHandler func(req *http.Request, via []*http.Request) error
106	// CheckHead performs a HEAD request before every GET to pre-validate the response
107	CheckHead         bool
108	store             storage.Storage
109	debugger          debug.Debugger
110	robotsMap         map[string]*robotstxt.RobotsData
111	htmlCallbacks     []*htmlCallbackContainer
112	xmlCallbacks      []*xmlCallbackContainer
113	requestCallbacks  []RequestCallback
114	responseCallbacks []ResponseCallback
115	errorCallbacks    []ErrorCallback
116	scrapedCallbacks  []ScrapedCallback
117	requestCount      uint32
118	responseCount     uint32
119	backend           *httpBackend
120	wg                *sync.WaitGroup
121	lock              *sync.RWMutex
122}
123
124// RequestCallback is a type alias for OnRequest callback functions
125type RequestCallback func(*Request)
126
127// ResponseCallback is a type alias for OnResponse callback functions
128type ResponseCallback func(*Response)
129
130// HTMLCallback is a type alias for OnHTML callback functions
131type HTMLCallback func(*HTMLElement)
132
133// XMLCallback is a type alias for OnXML callback functions
134type XMLCallback func(*XMLElement)
135
136// ErrorCallback is a type alias for OnError callback functions
137type ErrorCallback func(*Response, error)
138
139// ScrapedCallback is a type alias for OnScraped callback functions
140type ScrapedCallback func(*Response)
141
142// ProxyFunc is a type alias for proxy setter functions.
143type ProxyFunc func(*http.Request) (*url.URL, error)
144
145type htmlCallbackContainer struct {
146	Selector string
147	Function HTMLCallback
148}
149
150type xmlCallbackContainer struct {
151	Query    string
152	Function XMLCallback
153}
154
155type cookieJarSerializer struct {
156	store storage.Storage
157	lock  *sync.RWMutex
158}
159
160var collectorCounter uint32
161
162// The key type is unexported to prevent collisions with context keys defined in
163// other packages.
164type key int
165
166// ProxyURLKey is the context key for the request proxy address.
167const ProxyURLKey key = iota
168
169var (
170	// ErrForbiddenDomain is the error thrown if visiting
171	// a domain which is not allowed in AllowedDomains
172	ErrForbiddenDomain = errors.New("Forbidden domain")
173	// ErrMissingURL is the error type for missing URL errors
174	ErrMissingURL = errors.New("Missing URL")
175	// ErrMaxDepth is the error type for exceeding max depth
176	ErrMaxDepth = errors.New("Max depth limit reached")
	// ErrForbiddenURL is the error thrown if visiting
	// a URL which is matched by one of the DisallowedURLFilters
	ErrForbiddenURL = errors.New("Forbidden URL")
	// ErrNoURLFiltersMatch is the error thrown if visiting
	// a URL which does not match any of the URLFilters
	ErrNoURLFiltersMatch = errors.New("No URLFilters match")
184	// ErrAlreadyVisited is the error type for already visited URLs
185	ErrAlreadyVisited = errors.New("URL already visited")
186	// ErrRobotsTxtBlocked is the error type for robots.txt errors
187	ErrRobotsTxtBlocked = errors.New("URL blocked by robots.txt")
188	// ErrNoCookieJar is the error type for missing cookie jar
189	ErrNoCookieJar = errors.New("Cookie jar is not available")
190	// ErrNoPattern is the error type for LimitRules without patterns
191	ErrNoPattern = errors.New("No pattern defined in LimitRule")
192)
193
194var envMap = map[string]func(*Collector, string){
195	"ALLOWED_DOMAINS": func(c *Collector, val string) {
196		c.AllowedDomains = strings.Split(val, ",")
197	},
198	"CACHE_DIR": func(c *Collector, val string) {
199		c.CacheDir = val
200	},
201	"DETECT_CHARSET": func(c *Collector, val string) {
202		c.DetectCharset = isYesString(val)
203	},
204	"DISABLE_COOKIES": func(c *Collector, _ string) {
205		c.backend.Client.Jar = nil
206	},
207	"DISALLOWED_DOMAINS": func(c *Collector, val string) {
208		c.DisallowedDomains = strings.Split(val, ",")
209	},
210	"IGNORE_ROBOTSTXT": func(c *Collector, val string) {
211		c.IgnoreRobotsTxt = isYesString(val)
212	},
213	"FOLLOW_REDIRECTS": func(c *Collector, val string) {
214		if !isYesString(val) {
215			c.RedirectHandler = func(req *http.Request, via []*http.Request) error {
216				return http.ErrUseLastResponse
217			}
218		}
219	},
220	"MAX_BODY_SIZE": func(c *Collector, val string) {
221		size, err := strconv.Atoi(val)
222		if err == nil {
223			c.MaxBodySize = size
224		}
225	},
226	"MAX_DEPTH": func(c *Collector, val string) {
227		maxDepth, err := strconv.Atoi(val)
		if err == nil {
229			c.MaxDepth = maxDepth
230		}
231	},
232	"PARSE_HTTP_ERROR_RESPONSE": func(c *Collector, val string) {
233		c.ParseHTTPErrorResponse = isYesString(val)
234	},
235	"USER_AGENT": func(c *Collector, val string) {
236		c.UserAgent = val
237	},
238}
239
240// NewCollector creates a new Collector instance with default configuration
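//
// A minimal usage sketch; the option values below are placeholders, not defaults:
//
//   c := colly.NewCollector(
//     colly.UserAgent("my-crawler/1.0"),
//     colly.MaxDepth(2),
//     colly.Async(true),
//   )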
241func NewCollector(options ...func(*Collector)) *Collector {
242	c := &Collector{}
243	c.Init()
244
245	for _, f := range options {
246		f(c)
247	}
248
249	c.parseSettingsFromEnv()
250
251	return c
252}
253
254// UserAgent sets the user agent used by the Collector.
255func UserAgent(ua string) func(*Collector) {
256	return func(c *Collector) {
257		c.UserAgent = ua
258	}
259}
260
261// MaxDepth limits the recursion depth of visited URLs.
262func MaxDepth(depth int) func(*Collector) {
263	return func(c *Collector) {
264		c.MaxDepth = depth
265	}
266}
267
268// AllowedDomains sets the domain whitelist used by the Collector.
269func AllowedDomains(domains ...string) func(*Collector) {
270	return func(c *Collector) {
271		c.AllowedDomains = domains
272	}
273}
274
275// ParseHTTPErrorResponse allows parsing responses with HTTP errors
276func ParseHTTPErrorResponse() func(*Collector) {
277	return func(c *Collector) {
278		c.ParseHTTPErrorResponse = true
279	}
280}
281
282// DisallowedDomains sets the domain blacklist used by the Collector.
283func DisallowedDomains(domains ...string) func(*Collector) {
284	return func(c *Collector) {
285		c.DisallowedDomains = domains
286	}
287}
288
// DisallowedURLFilters sets the list of regular expressions which restricts
// visiting URLs. If any of the rules matches a URL the request will be stopped.
291func DisallowedURLFilters(filters ...*regexp.Regexp) func(*Collector) {
292	return func(c *Collector) {
293		c.DisallowedURLFilters = filters
294	}
295}
296
// URLFilters sets the list of regular expressions which restricts
// visiting URLs. If any of the rules matches a URL the request won't be stopped.
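//
// A sketch restricting visits to a single documentation path; the pattern is a placeholder:
//
//   c := colly.NewCollector(
//     colly.URLFilters(
//       regexp.MustCompile(`https://example\.com/docs/.*`),
//     ),
//   )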
299func URLFilters(filters ...*regexp.Regexp) func(*Collector) {
300	return func(c *Collector) {
301		c.URLFilters = filters
302	}
303}
304
305// AllowURLRevisit instructs the Collector to allow multiple downloads of the same URL
306func AllowURLRevisit() func(*Collector) {
307	return func(c *Collector) {
308		c.AllowURLRevisit = true
309	}
310}
311
312// MaxBodySize sets the limit of the retrieved response body in bytes.
313func MaxBodySize(sizeInBytes int) func(*Collector) {
314	return func(c *Collector) {
315		c.MaxBodySize = sizeInBytes
316	}
317}
318
319// CacheDir specifies the location where GET requests are cached as files.
320func CacheDir(path string) func(*Collector) {
321	return func(c *Collector) {
322		c.CacheDir = path
323	}
324}
325
326// IgnoreRobotsTxt instructs the Collector to ignore any restrictions
327// set by the target host's robots.txt file.
328func IgnoreRobotsTxt() func(*Collector) {
329	return func(c *Collector) {
330		c.IgnoreRobotsTxt = true
331	}
332}
333
334// ID sets the unique identifier of the Collector.
335func ID(id uint32) func(*Collector) {
336	return func(c *Collector) {
337		c.ID = id
338	}
339}
340
341// Async turns on asynchronous network requests.
342func Async(a bool) func(*Collector) {
343	return func(c *Collector) {
344		c.Async = a
345	}
346}
347
348// DetectCharset enables character encoding detection for non-utf8 response bodies
349// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
350func DetectCharset() func(*Collector) {
351	return func(c *Collector) {
352		c.DetectCharset = true
353	}
354}
355
356// Debugger sets the debugger used by the Collector.
357func Debugger(d debug.Debugger) func(*Collector) {
358	return func(c *Collector) {
359		d.Init()
360		c.debugger = d
361	}
362}
363
364// Init initializes the Collector's private variables and sets default
365// configuration for the Collector
366func (c *Collector) Init() {
367	c.UserAgent = "colly - https://github.com/gocolly/colly"
368	c.MaxDepth = 0
369	c.store = &storage.InMemoryStorage{}
370	c.store.Init()
371	c.MaxBodySize = 10 * 1024 * 1024
372	c.backend = &httpBackend{}
373	jar, _ := cookiejar.New(nil)
374	c.backend.Init(jar)
375	c.backend.Client.CheckRedirect = c.checkRedirectFunc()
376	c.wg = &sync.WaitGroup{}
377	c.lock = &sync.RWMutex{}
378	c.robotsMap = make(map[string]*robotstxt.RobotsData)
379	c.IgnoreRobotsTxt = true
380	c.ID = atomic.AddUint32(&collectorCounter, 1)
381}
382
// Appengine replaces the Collector's backend http.Client with an
// http.Client provided by appengine/urlfetch.
// This function should be used when the scraper runs on
// Google App Engine. Example:
//   func startScraper(w http.ResponseWriter, r *http.Request) {
//     ctx := appengine.NewContext(r)
//     c := colly.NewCollector()
//     c.Appengine(ctx)
//     ...
//     c.Visit("https://google.ca")
//   }
394func (c *Collector) Appengine(ctx context.Context) {
395	client := urlfetch.Client(ctx)
396	client.Jar = c.backend.Client.Jar
397	client.CheckRedirect = c.backend.Client.CheckRedirect
398	client.Timeout = c.backend.Client.Timeout
399
400	c.backend.Client = client
401}
402
// Visit starts the Collector's collecting job by creating a
// request to the URL specified in the parameter.
// Visit also calls the previously provided callbacks.
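//
// A short sketch of a typical call; the URL and logging are placeholders:
//
//   c.OnResponse(func(r *colly.Response) {
//     log.Println("visited", r.Request.URL)
//   })
//   if err := c.Visit("https://example.com/"); err != nil {
//     log.Println(err)
//   }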
406func (c *Collector) Visit(URL string) error {
407	if c.CheckHead {
408		if check := c.scrape(URL, "HEAD", 1, nil, nil, nil, true); check != nil {
409			return check
410		}
411	}
412	return c.scrape(URL, "GET", 1, nil, nil, nil, true)
413}
414
415// Head starts a collector job by creating a HEAD request.
416func (c *Collector) Head(URL string) error {
417	return c.scrape(URL, "HEAD", 1, nil, nil, nil, false)
418}
419
420// Post starts a collector job by creating a POST request.
421// Post also calls the previously provided callbacks
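//
// A minimal sketch; the URL and form values are placeholders:
//
//   err := c.Post("https://example.com/login", map[string]string{
//     "username": "admin",
//     "password": "secret",
//   })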
422func (c *Collector) Post(URL string, requestData map[string]string) error {
423	return c.scrape(URL, "POST", 1, createFormReader(requestData), nil, nil, true)
424}
425
// PostRaw starts a collector job by creating a POST request with raw binary data.
// PostRaw also calls the previously provided callbacks
428func (c *Collector) PostRaw(URL string, requestData []byte) error {
429	return c.scrape(URL, "POST", 1, bytes.NewReader(requestData), nil, nil, true)
430}
431
432// PostMultipart starts a collector job by creating a Multipart POST request
433// with raw binary data.  PostMultipart also calls the previously provided callbacks
434func (c *Collector) PostMultipart(URL string, requestData map[string][]byte) error {
435	boundary := randomBoundary()
436	hdr := http.Header{}
437	hdr.Set("Content-Type", "multipart/form-data; boundary="+boundary)
438	hdr.Set("User-Agent", c.UserAgent)
439	return c.scrape(URL, "POST", 1, createMultipartReader(boundary, requestData), nil, hdr, true)
440}
441
442// Request starts a collector job by creating a custom HTTP request
443// where method, context, headers and request data can be specified.
444// Set requestData, ctx, hdr parameters to nil if you don't want to use them.
445// Valid methods:
446//   - "GET"
447//   - "HEAD"
448//   - "POST"
449//   - "PUT"
450//   - "DELETE"
451//   - "PATCH"
452//   - "OPTIONS"
453func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error {
454	return c.scrape(URL, method, 1, requestData, ctx, hdr, true)
455}
456
457// SetDebugger attaches a debugger to the collector
458func (c *Collector) SetDebugger(d debug.Debugger) {
459	d.Init()
460	c.debugger = d
461}
462
463// UnmarshalRequest creates a Request from serialized data
464func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) {
465	req := &serializableRequest{}
466	err := json.Unmarshal(r, req)
467	if err != nil {
468		return nil, err
469	}
470
471	u, err := url.Parse(req.URL)
472	if err != nil {
473		return nil, err
474	}
475
476	ctx := NewContext()
477	for k, v := range req.Ctx {
478		ctx.Put(k, v)
479	}
480
481	return &Request{
482		Method:    req.Method,
483		URL:       u,
484		Body:      bytes.NewReader(req.Body),
485		Ctx:       ctx,
486		ID:        atomic.AddUint32(&c.requestCount, 1),
487		Headers:   &req.Headers,
488		collector: c,
489	}, nil
490}
491
492func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error {
493	if err := c.requestCheck(u, method, depth, checkRevisit); err != nil {
494		return err
495	}
496	parsedURL, err := url.Parse(u)
497	if err != nil {
498		return err
499	}
500	if parsedURL.Scheme == "" {
501		parsedURL.Scheme = "http"
502	}
503	if !c.isDomainAllowed(parsedURL.Host) {
504		return ErrForbiddenDomain
505	}
506	if method != "HEAD" && !c.IgnoreRobotsTxt {
507		if err = c.checkRobots(parsedURL); err != nil {
508			return err
509		}
510	}
511	if hdr == nil {
512		hdr = http.Header{"User-Agent": []string{c.UserAgent}}
513	}
514	rc, ok := requestData.(io.ReadCloser)
515	if !ok && requestData != nil {
516		rc = ioutil.NopCloser(requestData)
517	}
518	// The Go HTTP API ignores "Host" in the headers, preferring the client
519	// to use the Host field on Request.
520	host := parsedURL.Host
521	if hostHeader := hdr.Get("Host"); hostHeader != "" {
522		host = hostHeader
523	}
524	req := &http.Request{
525		Method:     method,
526		URL:        parsedURL,
527		Proto:      "HTTP/1.1",
528		ProtoMajor: 1,
529		ProtoMinor: 1,
530		Header:     hdr,
531		Body:       rc,
532		Host:       host,
533	}
534	setRequestBody(req, requestData)
535	u = parsedURL.String()
536	c.wg.Add(1)
537	if c.Async {
538		go c.fetch(u, method, depth, requestData, ctx, hdr, req)
539		return nil
540	}
541	return c.fetch(u, method, depth, requestData, ctx, hdr, req)
542}
543
544func setRequestBody(req *http.Request, body io.Reader) {
545	if body != nil {
546		switch v := body.(type) {
547		case *bytes.Buffer:
548			req.ContentLength = int64(v.Len())
549			buf := v.Bytes()
550			req.GetBody = func() (io.ReadCloser, error) {
551				r := bytes.NewReader(buf)
552				return ioutil.NopCloser(r), nil
553			}
554		case *bytes.Reader:
555			req.ContentLength = int64(v.Len())
556			snapshot := *v
557			req.GetBody = func() (io.ReadCloser, error) {
558				r := snapshot
559				return ioutil.NopCloser(&r), nil
560			}
561		case *strings.Reader:
562			req.ContentLength = int64(v.Len())
563			snapshot := *v
564			req.GetBody = func() (io.ReadCloser, error) {
565				r := snapshot
566				return ioutil.NopCloser(&r), nil
567			}
568		}
569		if req.GetBody != nil && req.ContentLength == 0 {
570			req.Body = http.NoBody
571			req.GetBody = func() (io.ReadCloser, error) { return http.NoBody, nil }
572		}
573	}
574}
575
576func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, req *http.Request) error {
577	defer c.wg.Done()
578	if ctx == nil {
579		ctx = NewContext()
580	}
581	request := &Request{
582		URL:       req.URL,
583		Headers:   &req.Header,
584		Ctx:       ctx,
585		Depth:     depth,
586		Method:    method,
587		Body:      requestData,
588		collector: c,
589		ID:        atomic.AddUint32(&c.requestCount, 1),
590	}
591
592	c.handleOnRequest(request)
593
594	if request.abort {
595		return nil
596	}
597
598	if method == "POST" && req.Header.Get("Content-Type") == "" {
599		req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
600	}
601
602	if req.Header.Get("Accept") == "" {
603		req.Header.Set("Accept", "*/*")
604	}
605
606	origURL := req.URL
607	response, err := c.backend.Cache(req, c.MaxBodySize, c.CacheDir)
608	if proxyURL, ok := req.Context().Value(ProxyURLKey).(string); ok {
609		request.ProxyURL = proxyURL
610	}
611	if err := c.handleOnError(response, err, request, ctx); err != nil {
612		return err
613	}
614	if req.URL != origURL {
615		request.URL = req.URL
616		request.Headers = &req.Header
617	}
618	atomic.AddUint32(&c.responseCount, 1)
619	response.Ctx = ctx
620	response.Request = request
621
622	err = response.fixCharset(c.DetectCharset, request.ResponseCharacterEncoding)
623	if err != nil {
624		return err
625	}
626
627	c.handleOnResponse(response)
628
629	err = c.handleOnHTML(response)
630	if err != nil {
631		c.handleOnError(response, err, request, ctx)
632	}
633
634	err = c.handleOnXML(response)
635	if err != nil {
636		c.handleOnError(response, err, request, ctx)
637	}
638
639	c.handleOnScraped(response)
640
641	return err
642}
643
644func (c *Collector) requestCheck(u, method string, depth int, checkRevisit bool) error {
645	if u == "" {
646		return ErrMissingURL
647	}
648	if c.MaxDepth > 0 && c.MaxDepth < depth {
649		return ErrMaxDepth
650	}
651	if len(c.DisallowedURLFilters) > 0 {
652		if isMatchingFilter(c.DisallowedURLFilters, []byte(u)) {
653			return ErrForbiddenURL
654		}
655	}
656	if len(c.URLFilters) > 0 {
657		if !isMatchingFilter(c.URLFilters, []byte(u)) {
658			return ErrNoURLFiltersMatch
659		}
660	}
661	if checkRevisit && !c.AllowURLRevisit && method == "GET" {
662		h := fnv.New64a()
663		h.Write([]byte(u))
664		uHash := h.Sum64()
665		visited, err := c.store.IsVisited(uHash)
666		if err != nil {
667			return err
668		}
669		if visited {
670			return ErrAlreadyVisited
671		}
672		return c.store.Visited(uHash)
673	}
674	return nil
675}
676
677func (c *Collector) isDomainAllowed(domain string) bool {
678	for _, d2 := range c.DisallowedDomains {
679		if d2 == domain {
680			return false
681		}
682	}
	if len(c.AllowedDomains) == 0 {
684		return true
685	}
686	for _, d2 := range c.AllowedDomains {
687		if d2 == domain {
688			return true
689		}
690	}
691	return false
692}
693
694func (c *Collector) checkRobots(u *url.URL) error {
695	c.lock.RLock()
696	robot, ok := c.robotsMap[u.Host]
697	c.lock.RUnlock()
698
699	if !ok {
700		// no robots file cached
701		resp, err := c.backend.Client.Get(u.Scheme + "://" + u.Host + "/robots.txt")
702		if err != nil {
703			return err
704		}
705		robot, err = robotstxt.FromResponse(resp)
706		if err != nil {
707			return err
708		}
709		c.lock.Lock()
710		c.robotsMap[u.Host] = robot
711		c.lock.Unlock()
712	}
713
714	uaGroup := robot.FindGroup(c.UserAgent)
715	if uaGroup == nil {
716		return nil
717	}
718
719	if !uaGroup.Test(u.EscapedPath()) {
720		return ErrRobotsTxtBlocked
721	}
722	return nil
723}
724
725// String is the text representation of the collector.
726// It contains useful debug information about the collector's internals
727func (c *Collector) String() string {
728	return fmt.Sprintf(
729		"Requests made: %d (%d responses) | Callbacks: OnRequest: %d, OnHTML: %d, OnResponse: %d, OnError: %d",
730		c.requestCount,
731		c.responseCount,
732		len(c.requestCallbacks),
733		len(c.htmlCallbacks),
734		len(c.responseCallbacks),
735		len(c.errorCallbacks),
736	)
737}
738
739// Wait returns when the collector jobs are finished
740func (c *Collector) Wait() {
741	c.wg.Wait()
742}
743
// OnRequest registers a function. The function will be executed on every
// request made by the Collector.
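//
// A minimal sketch of a request callback; the header value is a placeholder:
//
//   c.OnRequest(func(r *colly.Request) {
//     r.Headers.Set("X-Requested-With", "colly")
//   })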
746func (c *Collector) OnRequest(f RequestCallback) {
747	c.lock.Lock()
748	if c.requestCallbacks == nil {
749		c.requestCallbacks = make([]RequestCallback, 0, 4)
750	}
751	c.requestCallbacks = append(c.requestCallbacks, f)
752	c.lock.Unlock()
753}
754
// OnResponse registers a function. The function will be executed on every response.
756func (c *Collector) OnResponse(f ResponseCallback) {
757	c.lock.Lock()
758	if c.responseCallbacks == nil {
759		c.responseCallbacks = make([]ResponseCallback, 0, 4)
760	}
761	c.responseCallbacks = append(c.responseCallbacks, f)
762	c.lock.Unlock()
763}
764
// OnHTML registers a function. The function will be executed on every HTML
// element matched by the GoQuery Selector parameter.
767// GoQuery Selector is a selector used by https://github.com/PuerkitoBio/goquery
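//
// A typical link-following sketch; the selector assumes standard anchor markup:
//
//   c.OnHTML("a[href]", func(e *colly.HTMLElement) {
//     e.Request.Visit(e.Attr("href"))
//   })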
768func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback) {
769	c.lock.Lock()
770	if c.htmlCallbacks == nil {
771		c.htmlCallbacks = make([]*htmlCallbackContainer, 0, 4)
772	}
773	c.htmlCallbacks = append(c.htmlCallbacks, &htmlCallbackContainer{
774		Selector: goquerySelector,
775		Function: f,
776	})
777	c.lock.Unlock()
778}
779
// OnXML registers a function. The function will be executed on every XML
// element matched by the xpath Query parameter.
782// xpath Query is used by https://github.com/antchfx/xmlquery
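//
// A sketch using an XPath query; the query assumes an RSS-like document:
//
//   c.OnXML("//item/link", func(e *colly.XMLElement) {
//     e.Request.Visit(e.Text)
//   })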
783func (c *Collector) OnXML(xpathQuery string, f XMLCallback) {
784	c.lock.Lock()
785	if c.xmlCallbacks == nil {
786		c.xmlCallbacks = make([]*xmlCallbackContainer, 0, 4)
787	}
788	c.xmlCallbacks = append(c.xmlCallbacks, &xmlCallbackContainer{
789		Query:    xpathQuery,
790		Function: f,
791	})
792	c.lock.Unlock()
793}
794
// OnHTMLDetach deregisters a function. The function will not be executed after it is detached.
796func (c *Collector) OnHTMLDetach(goquerySelector string) {
797	c.lock.Lock()
798	deleteIdx := -1
799	for i, cc := range c.htmlCallbacks {
800		if cc.Selector == goquerySelector {
801			deleteIdx = i
802			break
803		}
804	}
805	if deleteIdx != -1 {
806		c.htmlCallbacks = append(c.htmlCallbacks[:deleteIdx], c.htmlCallbacks[deleteIdx+1:]...)
807	}
808	c.lock.Unlock()
809}
810
// OnXMLDetach deregisters a function. The function will not be executed after it is detached.
812func (c *Collector) OnXMLDetach(xpathQuery string) {
813	c.lock.Lock()
814	deleteIdx := -1
815	for i, cc := range c.xmlCallbacks {
816		if cc.Query == xpathQuery {
817			deleteIdx = i
818			break
819		}
820	}
821	if deleteIdx != -1 {
822		c.xmlCallbacks = append(c.xmlCallbacks[:deleteIdx], c.xmlCallbacks[deleteIdx+1:]...)
823	}
824	c.lock.Unlock()
825}
826
// OnError registers a function. The function will be executed if an error
828// occurs during the HTTP request.
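//
// A minimal sketch of an error callback:
//
//   c.OnError(func(r *colly.Response, err error) {
//     log.Println("request to", r.Request.URL, "failed:", err)
//   })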
829func (c *Collector) OnError(f ErrorCallback) {
830	c.lock.Lock()
831	if c.errorCallbacks == nil {
832		c.errorCallbacks = make([]ErrorCallback, 0, 4)
833	}
834	c.errorCallbacks = append(c.errorCallbacks, f)
835	c.lock.Unlock()
836}
837
// OnScraped registers a function. The function will be executed after
839// OnHTML, as a final part of the scraping.
840func (c *Collector) OnScraped(f ScrapedCallback) {
841	c.lock.Lock()
842	if c.scrapedCallbacks == nil {
843		c.scrapedCallbacks = make([]ScrapedCallback, 0, 4)
844	}
845	c.scrapedCallbacks = append(c.scrapedCallbacks, f)
846	c.lock.Unlock()
847}
848
849// WithTransport allows you to set a custom http.RoundTripper (transport)
850func (c *Collector) WithTransport(transport http.RoundTripper) {
851	c.backend.Client.Transport = transport
852}
853
854// DisableCookies turns off cookie handling
855func (c *Collector) DisableCookies() {
856	c.backend.Client.Jar = nil
857}
858
859// SetCookieJar overrides the previously set cookie jar
860func (c *Collector) SetCookieJar(j *cookiejar.Jar) {
861	c.backend.Client.Jar = j
862}
863
864// SetRequestTimeout overrides the default timeout (10 seconds) for this collector
865func (c *Collector) SetRequestTimeout(timeout time.Duration) {
866	c.backend.Client.Timeout = timeout
867}
868
869// SetStorage overrides the default in-memory storage.
// Storage stores scraping-related data like cookies and visited URLs
871func (c *Collector) SetStorage(s storage.Storage) error {
872	if err := s.Init(); err != nil {
873		return err
874	}
875	c.store = s
876	c.backend.Client.Jar = createJar(s)
877	return nil
878}
879
880// SetProxy sets a proxy for the collector. This method overrides the previously
// used http.Transport if the type of the transport is not *http.Transport.
882// The proxy type is determined by the URL scheme. "http"
883// and "socks5" are supported. If the scheme is empty,
884// "http" is assumed.
885func (c *Collector) SetProxy(proxyURL string) error {
886	proxyParsed, err := url.Parse(proxyURL)
887	if err != nil {
888		return err
889	}
890
891	c.SetProxyFunc(http.ProxyURL(proxyParsed))
892
893	return nil
894}
895
896// SetProxyFunc sets a custom proxy setter/switcher function.
897// See built-in ProxyFuncs for more details.
898// This method overrides the previously used http.Transport
// if the type of the transport is not *http.Transport.
900// The proxy type is determined by the URL scheme. "http"
901// and "socks5" are supported. If the scheme is empty,
902// "http" is assumed.
903func (c *Collector) SetProxyFunc(p ProxyFunc) {
904	t, ok := c.backend.Client.Transport.(*http.Transport)
905	if c.backend.Client.Transport != nil && ok {
906		t.Proxy = p
907	} else {
908		c.backend.Client.Transport = &http.Transport{
909			Proxy: p,
910		}
911	}
912}
913
914func createEvent(eventType string, requestID, collectorID uint32, kvargs map[string]string) *debug.Event {
915	return &debug.Event{
916		CollectorID: collectorID,
917		RequestID:   requestID,
918		Type:        eventType,
919		Values:      kvargs,
920	}
921}
922
923func (c *Collector) handleOnRequest(r *Request) {
924	if c.debugger != nil {
925		c.debugger.Event(createEvent("request", r.ID, c.ID, map[string]string{
926			"url": r.URL.String(),
927		}))
928	}
929	for _, f := range c.requestCallbacks {
930		f(r)
931	}
932}
933
934func (c *Collector) handleOnResponse(r *Response) {
935	if c.debugger != nil {
936		c.debugger.Event(createEvent("response", r.Request.ID, c.ID, map[string]string{
937			"url":    r.Request.URL.String(),
938			"status": http.StatusText(r.StatusCode),
939		}))
940	}
941	for _, f := range c.responseCallbacks {
942		f(r)
943	}
944}
945
946func (c *Collector) handleOnHTML(resp *Response) error {
947	if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
948		return nil
949	}
950	doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
951	if err != nil {
952		return err
953	}
954	if href, found := doc.Find("base[href]").Attr("href"); found {
955		resp.Request.baseURL, _ = url.Parse(href)
956	}
957	for _, cc := range c.htmlCallbacks {
958		i := 0
959		doc.Find(cc.Selector).Each(func(_ int, s *goquery.Selection) {
960			for _, n := range s.Nodes {
961				e := NewHTMLElementFromSelectionNode(resp, s, n, i)
962				i++
963				if c.debugger != nil {
964					c.debugger.Event(createEvent("html", resp.Request.ID, c.ID, map[string]string{
965						"selector": cc.Selector,
966						"url":      resp.Request.URL.String(),
967					}))
968				}
969				cc.Function(e)
970			}
971		})
972	}
973	return nil
974}
975
976func (c *Collector) handleOnXML(resp *Response) error {
977	if len(c.xmlCallbacks) == 0 {
978		return nil
979	}
980	contentType := strings.ToLower(resp.Headers.Get("Content-Type"))
981	if !strings.Contains(contentType, "html") && !strings.Contains(contentType, "xml") {
982		return nil
983	}
984
985	if strings.Contains(contentType, "html") {
986		doc, err := htmlquery.Parse(bytes.NewBuffer(resp.Body))
987		if err != nil {
988			return err
989		}
990		if e := htmlquery.FindOne(doc, "//base"); e != nil {
991			for _, a := range e.Attr {
992				if a.Key == "href" {
993					resp.Request.baseURL, _ = url.Parse(a.Val)
994					break
995				}
996			}
997		}
998
999		for _, cc := range c.xmlCallbacks {
1000			for _, n := range htmlquery.Find(doc, cc.Query) {
1001				e := NewXMLElementFromHTMLNode(resp, n)
1002				if c.debugger != nil {
1003					c.debugger.Event(createEvent("xml", resp.Request.ID, c.ID, map[string]string{
1004						"selector": cc.Query,
1005						"url":      resp.Request.URL.String(),
1006					}))
1007				}
1008				cc.Function(e)
1009			}
1010		}
1011	} else if strings.Contains(contentType, "xml") {
1012		doc, err := xmlquery.Parse(bytes.NewBuffer(resp.Body))
1013		if err != nil {
1014			return err
1015		}
1016
1017		for _, cc := range c.xmlCallbacks {
1018			xmlquery.FindEach(doc, cc.Query, func(i int, n *xmlquery.Node) {
1019				e := NewXMLElementFromXMLNode(resp, n)
1020				if c.debugger != nil {
1021					c.debugger.Event(createEvent("xml", resp.Request.ID, c.ID, map[string]string{
1022						"selector": cc.Query,
1023						"url":      resp.Request.URL.String(),
1024					}))
1025				}
1026				cc.Function(e)
1027			})
1028		}
1029	}
1030	return nil
1031}
1032
1033func (c *Collector) handleOnError(response *Response, err error, request *Request, ctx *Context) error {
1034	if err == nil && (c.ParseHTTPErrorResponse || response.StatusCode < 203) {
1035		return nil
1036	}
1037	if err == nil && response.StatusCode >= 203 {
1038		err = errors.New(http.StatusText(response.StatusCode))
1039	}
1040	if response == nil {
1041		response = &Response{
1042			Request: request,
1043			Ctx:     ctx,
1044		}
1045	}
1046	if c.debugger != nil {
1047		c.debugger.Event(createEvent("error", request.ID, c.ID, map[string]string{
1048			"url":    request.URL.String(),
1049			"status": http.StatusText(response.StatusCode),
1050		}))
1051	}
1052	if response.Request == nil {
1053		response.Request = request
1054	}
1055	if response.Ctx == nil {
1056		response.Ctx = request.Ctx
1057	}
1058	for _, f := range c.errorCallbacks {
1059		f(response, err)
1060	}
1061	return err
1062}
1063
1064func (c *Collector) handleOnScraped(r *Response) {
1065	if c.debugger != nil {
1066		c.debugger.Event(createEvent("scraped", r.Request.ID, c.ID, map[string]string{
1067			"url": r.Request.URL.String(),
1068		}))
1069	}
1070	for _, f := range c.scrapedCallbacks {
1071		f(r)
1072	}
1073}
1074
1075// Limit adds a new LimitRule to the collector
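//
// A sketch of a rate-limiting rule; the glob, parallelism and delay are placeholders:
//
//   c.Limit(&colly.LimitRule{
//     DomainGlob:  "*example.*",
//     Parallelism: 2,
//     Delay:       time.Second,
//   })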
1076func (c *Collector) Limit(rule *LimitRule) error {
1077	return c.backend.Limit(rule)
1078}
1079
1080// Limits adds new LimitRules to the collector
1081func (c *Collector) Limits(rules []*LimitRule) error {
1082	return c.backend.Limits(rules)
1083}
1084
1085// SetCookies handles the receipt of the cookies in a reply for the given URL
1086func (c *Collector) SetCookies(URL string, cookies []*http.Cookie) error {
1087	if c.backend.Client.Jar == nil {
1088		return ErrNoCookieJar
1089	}
1090	u, err := url.Parse(URL)
1091	if err != nil {
1092		return err
1093	}
1094	c.backend.Client.Jar.SetCookies(u, cookies)
1095	return nil
1096}
1097
1098// Cookies returns the cookies to send in a request for the given URL.
1099func (c *Collector) Cookies(URL string) []*http.Cookie {
1100	if c.backend.Client.Jar == nil {
1101		return nil
1102	}
1103	u, err := url.Parse(URL)
1104	if err != nil {
1105		return nil
1106	}
1107	return c.backend.Client.Jar.Cookies(u)
1108}
1109
1110// Clone creates an exact copy of a Collector without callbacks.
1111// HTTP backend, robots.txt cache and cookie jar are shared
1112// between collectors.
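//
// A sketch of using a clone for a second set of callbacks; the selector is a placeholder:
//
//   detailCollector := c.Clone()
//   detailCollector.OnHTML("h1", func(e *colly.HTMLElement) {
//     log.Println("title:", e.Text)
//   })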
1113func (c *Collector) Clone() *Collector {
1114	return &Collector{
1115		AllowedDomains:         c.AllowedDomains,
1116		AllowURLRevisit:        c.AllowURLRevisit,
1117		CacheDir:               c.CacheDir,
1118		DetectCharset:          c.DetectCharset,
1119		DisallowedDomains:      c.DisallowedDomains,
1120		ID:                     atomic.AddUint32(&collectorCounter, 1),
1121		IgnoreRobotsTxt:        c.IgnoreRobotsTxt,
1122		MaxBodySize:            c.MaxBodySize,
1123		MaxDepth:               c.MaxDepth,
1124		DisallowedURLFilters:   c.DisallowedURLFilters,
1125		URLFilters:             c.URLFilters,
1126		ParseHTTPErrorResponse: c.ParseHTTPErrorResponse,
1127		UserAgent:              c.UserAgent,
1128		store:                  c.store,
1129		backend:                c.backend,
1130		debugger:               c.debugger,
1131		Async:                  c.Async,
1132		RedirectHandler:        c.RedirectHandler,
1133		errorCallbacks:         make([]ErrorCallback, 0, 8),
1134		htmlCallbacks:          make([]*htmlCallbackContainer, 0, 8),
1135		xmlCallbacks:           make([]*xmlCallbackContainer, 0, 8),
1136		scrapedCallbacks:       make([]ScrapedCallback, 0, 8),
1137		lock:                   c.lock,
1138		requestCallbacks:       make([]RequestCallback, 0, 8),
1139		responseCallbacks:      make([]ResponseCallback, 0, 8),
1140		robotsMap:              c.robotsMap,
1141		wg:                     &sync.WaitGroup{},
1142	}
1143}
1144
1145func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Request) error {
1146	return func(req *http.Request, via []*http.Request) error {
1147		if !c.isDomainAllowed(req.URL.Host) {
1148			return fmt.Errorf("Not following redirect to %s because its not in AllowedDomains", req.URL.Host)
1149		}
1150
1151		if c.RedirectHandler != nil {
1152			return c.RedirectHandler(req, via)
1153		}
1154
		// Honor Go's default maximum of 10 redirects
1156		if len(via) >= 10 {
1157			return http.ErrUseLastResponse
1158		}
1159
1160		lastRequest := via[len(via)-1]
1161
		// Copy the headers from the last request
1163		for hName, hValues := range lastRequest.Header {
1164			for _, hValue := range hValues {
1165				req.Header.Set(hName, hValue)
1166			}
1167		}
1168
		// If the domain has changed, remove the Authorization header if it exists
1170		if req.URL.Host != lastRequest.URL.Host {
1171			req.Header.Del("Authorization")
1172		}
1173
1174		return nil
1175	}
1176}
1177
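// parseSettingsFromEnv applies configuration from environment variables prefixed
// with "COLLY_". A sketch of the mechanism, using os.Setenv for illustration only;
// the values are placeholders:
//
//   os.Setenv("COLLY_USER_AGENT", "my-crawler/1.0")
//   os.Setenv("COLLY_MAX_DEPTH", "3")
//   c := colly.NewCollector() // picks up both settings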
1178func (c *Collector) parseSettingsFromEnv() {
1179	for _, e := range os.Environ() {
1180		if !strings.HasPrefix(e, "COLLY_") {
1181			continue
1182		}
1183		pair := strings.SplitN(e[6:], "=", 2)
1184		if f, ok := envMap[pair[0]]; ok {
1185			f(c, pair[1])
1186		} else {
1187			log.Println("Unknown environment variable:", pair[0])
1188		}
1189	}
1190}
1191
1192// SanitizeFileName replaces dangerous characters in a string
1193// so the return value can be used as a safe file name.
1194func SanitizeFileName(fileName string) string {
1195	ext := filepath.Ext(fileName)
1196	cleanExt := sanitize.BaseName(ext)
1197	if cleanExt == "" {
1198		cleanExt = ".unknown"
1199	}
1200	return strings.Replace(fmt.Sprintf(
1201		"%s.%s",
1202		sanitize.BaseName(fileName[:len(fileName)-len(ext)]),
1203		cleanExt[1:],
1204	), "-", "_", -1)
1205}
1206
1207func createFormReader(data map[string]string) io.Reader {
1208	form := url.Values{}
1209	for k, v := range data {
1210		form.Add(k, v)
1211	}
1212	return strings.NewReader(form.Encode())
1213}
1214
1215func createMultipartReader(boundary string, data map[string][]byte) io.Reader {
1216	dashBoundary := "--" + boundary
1217
1218	body := []byte{}
1219	buffer := bytes.NewBuffer(body)
1220
1221	buffer.WriteString("Content-type: multipart/form-data; boundary=" + boundary + "\n\n")
1222	for contentType, content := range data {
1223		buffer.WriteString(dashBoundary + "\n")
1224		buffer.WriteString("Content-Disposition: form-data; name=" + contentType + "\n")
1225		buffer.WriteString(fmt.Sprintf("Content-Length: %d \n\n", len(content)))
1226		buffer.Write(content)
1227		buffer.WriteString("\n")
1228	}
1229	buffer.WriteString(dashBoundary + "--\n\n")
1230	return buffer
1231}
1232
1233// randomBoundary was borrowed from
1234// github.com/golang/go/mime/multipart/writer.go#randomBoundary
1235func randomBoundary() string {
1236	var buf [30]byte
1237	_, err := io.ReadFull(rand.Reader, buf[:])
1238	if err != nil {
1239		panic(err)
1240	}
1241	return fmt.Sprintf("%x", buf[:])
1242}
1243
1244func isYesString(s string) bool {
1245	switch strings.ToLower(s) {
1246	case "1", "yes", "true", "y":
1247		return true
1248	}
1249	return false
1250}
1251
1252func createJar(s storage.Storage) http.CookieJar {
1253	return &cookieJarSerializer{store: s, lock: &sync.RWMutex{}}
1254}
1255
1256func (j *cookieJarSerializer) SetCookies(u *url.URL, cookies []*http.Cookie) {
1257	j.lock.Lock()
1258	defer j.lock.Unlock()
1259	cookieStr := j.store.Cookies(u)
1260
	// Merge existing cookies; new cookies take precedence.
1262	cnew := make([]*http.Cookie, len(cookies))
1263	copy(cnew, cookies)
1264	existing := storage.UnstringifyCookies(cookieStr)
1265	for _, c := range existing {
1266		if !storage.ContainsCookie(cnew, c.Name) {
1267			cnew = append(cnew, c)
1268		}
1269	}
1270	j.store.SetCookies(u, storage.StringifyCookies(cnew))
1271}
1272
1273func (j *cookieJarSerializer) Cookies(u *url.URL) []*http.Cookie {
1274	cookies := storage.UnstringifyCookies(j.store.Cookies(u))
1275	// Filter.
1276	now := time.Now()
1277	cnew := make([]*http.Cookie, 0, len(cookies))
1278	for _, c := range cookies {
1279		// Drop expired cookies.
1280		if c.RawExpires != "" && c.Expires.Before(now) {
1281			continue
1282		}
1283		// Drop secure cookies if not over https.
1284		if c.Secure && u.Scheme != "https" {
1285			continue
1286		}
1287		cnew = append(cnew, c)
1288	}
1289	return cnew
1290}
1291
1292func isMatchingFilter(fs []*regexp.Regexp, d []byte) bool {
1293	for _, r := range fs {
1294		if r.Match(d) {
1295			return true
1296		}
1297	}
1298	return false
1299}
1300