1// Copyright 2018 Adam Tauber 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package colly implements a HTTP scraping framework 16package colly 17 18import ( 19 "bytes" 20 "context" 21 "crypto/rand" 22 "encoding/json" 23 "errors" 24 "fmt" 25 "hash/fnv" 26 "io" 27 "io/ioutil" 28 "log" 29 "net/http" 30 "net/http/cookiejar" 31 "net/url" 32 "os" 33 "path/filepath" 34 "regexp" 35 "strconv" 36 "strings" 37 "sync" 38 "sync/atomic" 39 "time" 40 41 "google.golang.org/appengine/urlfetch" 42 43 "github.com/PuerkitoBio/goquery" 44 "github.com/antchfx/htmlquery" 45 "github.com/antchfx/xmlquery" 46 "github.com/kennygrant/sanitize" 47 "github.com/temoto/robotstxt" 48 49 "github.com/gocolly/colly/debug" 50 "github.com/gocolly/colly/storage" 51) 52 53// Collector provides the scraper instance for a scraping job 54type Collector struct { 55 // UserAgent is the User-Agent string used by HTTP requests 56 UserAgent string 57 // MaxDepth limits the recursion depth of visited URLs. 58 // Set it to 0 for infinite recursion (default). 59 MaxDepth int 60 // AllowedDomains is a domain whitelist. 61 // Leave it blank to allow any domains to be visited 62 AllowedDomains []string 63 // DisallowedDomains is a domain blacklist. 64 DisallowedDomains []string 65 // DisallowedURLFilters is a list of regular expressions which restricts 66 // visiting URLs. If any of the rules matches to a URL the 67 // request will be stopped. DisallowedURLFilters will 68 // be evaluated before URLFilters 69 // Leave it blank to allow any URLs to be visited 70 DisallowedURLFilters []*regexp.Regexp 71 // URLFilters is a list of regular expressions which restricts 72 // visiting URLs. If any of the rules matches to a URL the 73 // request won't be stopped. DisallowedURLFilters will 74 // be evaluated before URLFilters 75 76 // Leave it blank to allow any URLs to be visited 77 URLFilters []*regexp.Regexp 78 79 // AllowURLRevisit allows multiple downloads of the same URL 80 AllowURLRevisit bool 81 // MaxBodySize is the limit of the retrieved response body in bytes. 82 // 0 means unlimited. 83 // The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes). 84 MaxBodySize int 85 // CacheDir specifies a location where GET requests are cached as files. 86 // When it's not defined, caching is disabled. 87 CacheDir string 88 // IgnoreRobotsTxt allows the Collector to ignore any restrictions set by 89 // the target host's robots.txt file. See http://www.robotstxt.org/ for more 90 // information. 91 IgnoreRobotsTxt bool 92 // Async turns on asynchronous network communication. Use Collector.Wait() to 93 // be sure all requests have been finished. 94 Async bool 95 // ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes. 96 // By default, Colly parses only successful HTTP responses. Set ParseHTTPErrorResponse 97 // to true to enable it. 98 ParseHTTPErrorResponse bool 99 // ID is the unique identifier of a collector 100 ID uint32 101 // DetectCharset can enable character encoding detection for non-utf8 response bodies 102 // without explicit charset declaration. This feature uses https://github.com/saintfish/chardet 103 DetectCharset bool 104 // RedirectHandler allows control on how a redirect will be managed 105 RedirectHandler func(req *http.Request, via []*http.Request) error 106 // CheckHead performs a HEAD request before every GET to pre-validate the response 107 CheckHead bool 108 store storage.Storage 109 debugger debug.Debugger 110 robotsMap map[string]*robotstxt.RobotsData 111 htmlCallbacks []*htmlCallbackContainer 112 xmlCallbacks []*xmlCallbackContainer 113 requestCallbacks []RequestCallback 114 responseCallbacks []ResponseCallback 115 errorCallbacks []ErrorCallback 116 scrapedCallbacks []ScrapedCallback 117 requestCount uint32 118 responseCount uint32 119 backend *httpBackend 120 wg *sync.WaitGroup 121 lock *sync.RWMutex 122} 123 124// RequestCallback is a type alias for OnRequest callback functions 125type RequestCallback func(*Request) 126 127// ResponseCallback is a type alias for OnResponse callback functions 128type ResponseCallback func(*Response) 129 130// HTMLCallback is a type alias for OnHTML callback functions 131type HTMLCallback func(*HTMLElement) 132 133// XMLCallback is a type alias for OnXML callback functions 134type XMLCallback func(*XMLElement) 135 136// ErrorCallback is a type alias for OnError callback functions 137type ErrorCallback func(*Response, error) 138 139// ScrapedCallback is a type alias for OnScraped callback functions 140type ScrapedCallback func(*Response) 141 142// ProxyFunc is a type alias for proxy setter functions. 143type ProxyFunc func(*http.Request) (*url.URL, error) 144 145type htmlCallbackContainer struct { 146 Selector string 147 Function HTMLCallback 148} 149 150type xmlCallbackContainer struct { 151 Query string 152 Function XMLCallback 153} 154 155type cookieJarSerializer struct { 156 store storage.Storage 157 lock *sync.RWMutex 158} 159 160var collectorCounter uint32 161 162// The key type is unexported to prevent collisions with context keys defined in 163// other packages. 164type key int 165 166// ProxyURLKey is the context key for the request proxy address. 167const ProxyURLKey key = iota 168 169var ( 170 // ErrForbiddenDomain is the error thrown if visiting 171 // a domain which is not allowed in AllowedDomains 172 ErrForbiddenDomain = errors.New("Forbidden domain") 173 // ErrMissingURL is the error type for missing URL errors 174 ErrMissingURL = errors.New("Missing URL") 175 // ErrMaxDepth is the error type for exceeding max depth 176 ErrMaxDepth = errors.New("Max depth limit reached") 177 // ErrForbiddenURL is the error thrown if visiting 178 // a URL which is not allowed by URLFilters 179 ErrForbiddenURL = errors.New("ForbiddenURL") 180 181 // ErrNoURLFiltersMatch is the error thrown if visiting 182 // a URL which is not allowed by URLFilters 183 ErrNoURLFiltersMatch = errors.New("No URLFilters match") 184 // ErrAlreadyVisited is the error type for already visited URLs 185 ErrAlreadyVisited = errors.New("URL already visited") 186 // ErrRobotsTxtBlocked is the error type for robots.txt errors 187 ErrRobotsTxtBlocked = errors.New("URL blocked by robots.txt") 188 // ErrNoCookieJar is the error type for missing cookie jar 189 ErrNoCookieJar = errors.New("Cookie jar is not available") 190 // ErrNoPattern is the error type for LimitRules without patterns 191 ErrNoPattern = errors.New("No pattern defined in LimitRule") 192) 193 194var envMap = map[string]func(*Collector, string){ 195 "ALLOWED_DOMAINS": func(c *Collector, val string) { 196 c.AllowedDomains = strings.Split(val, ",") 197 }, 198 "CACHE_DIR": func(c *Collector, val string) { 199 c.CacheDir = val 200 }, 201 "DETECT_CHARSET": func(c *Collector, val string) { 202 c.DetectCharset = isYesString(val) 203 }, 204 "DISABLE_COOKIES": func(c *Collector, _ string) { 205 c.backend.Client.Jar = nil 206 }, 207 "DISALLOWED_DOMAINS": func(c *Collector, val string) { 208 c.DisallowedDomains = strings.Split(val, ",") 209 }, 210 "IGNORE_ROBOTSTXT": func(c *Collector, val string) { 211 c.IgnoreRobotsTxt = isYesString(val) 212 }, 213 "FOLLOW_REDIRECTS": func(c *Collector, val string) { 214 if !isYesString(val) { 215 c.RedirectHandler = func(req *http.Request, via []*http.Request) error { 216 return http.ErrUseLastResponse 217 } 218 } 219 }, 220 "MAX_BODY_SIZE": func(c *Collector, val string) { 221 size, err := strconv.Atoi(val) 222 if err == nil { 223 c.MaxBodySize = size 224 } 225 }, 226 "MAX_DEPTH": func(c *Collector, val string) { 227 maxDepth, err := strconv.Atoi(val) 228 if err != nil { 229 c.MaxDepth = maxDepth 230 } 231 }, 232 "PARSE_HTTP_ERROR_RESPONSE": func(c *Collector, val string) { 233 c.ParseHTTPErrorResponse = isYesString(val) 234 }, 235 "USER_AGENT": func(c *Collector, val string) { 236 c.UserAgent = val 237 }, 238} 239 240// NewCollector creates a new Collector instance with default configuration 241func NewCollector(options ...func(*Collector)) *Collector { 242 c := &Collector{} 243 c.Init() 244 245 for _, f := range options { 246 f(c) 247 } 248 249 c.parseSettingsFromEnv() 250 251 return c 252} 253 254// UserAgent sets the user agent used by the Collector. 255func UserAgent(ua string) func(*Collector) { 256 return func(c *Collector) { 257 c.UserAgent = ua 258 } 259} 260 261// MaxDepth limits the recursion depth of visited URLs. 262func MaxDepth(depth int) func(*Collector) { 263 return func(c *Collector) { 264 c.MaxDepth = depth 265 } 266} 267 268// AllowedDomains sets the domain whitelist used by the Collector. 269func AllowedDomains(domains ...string) func(*Collector) { 270 return func(c *Collector) { 271 c.AllowedDomains = domains 272 } 273} 274 275// ParseHTTPErrorResponse allows parsing responses with HTTP errors 276func ParseHTTPErrorResponse() func(*Collector) { 277 return func(c *Collector) { 278 c.ParseHTTPErrorResponse = true 279 } 280} 281 282// DisallowedDomains sets the domain blacklist used by the Collector. 283func DisallowedDomains(domains ...string) func(*Collector) { 284 return func(c *Collector) { 285 c.DisallowedDomains = domains 286 } 287} 288 289// DisallowedURLFilters sets the list of regular expressions which restricts 290// visiting URLs. If any of the rules matches to a URL the request will be stopped. 291func DisallowedURLFilters(filters ...*regexp.Regexp) func(*Collector) { 292 return func(c *Collector) { 293 c.DisallowedURLFilters = filters 294 } 295} 296 297// URLFilters sets the list of regular expressions which restricts 298// visiting URLs. If any of the rules matches to a URL the request won't be stopped. 299func URLFilters(filters ...*regexp.Regexp) func(*Collector) { 300 return func(c *Collector) { 301 c.URLFilters = filters 302 } 303} 304 305// AllowURLRevisit instructs the Collector to allow multiple downloads of the same URL 306func AllowURLRevisit() func(*Collector) { 307 return func(c *Collector) { 308 c.AllowURLRevisit = true 309 } 310} 311 312// MaxBodySize sets the limit of the retrieved response body in bytes. 313func MaxBodySize(sizeInBytes int) func(*Collector) { 314 return func(c *Collector) { 315 c.MaxBodySize = sizeInBytes 316 } 317} 318 319// CacheDir specifies the location where GET requests are cached as files. 320func CacheDir(path string) func(*Collector) { 321 return func(c *Collector) { 322 c.CacheDir = path 323 } 324} 325 326// IgnoreRobotsTxt instructs the Collector to ignore any restrictions 327// set by the target host's robots.txt file. 328func IgnoreRobotsTxt() func(*Collector) { 329 return func(c *Collector) { 330 c.IgnoreRobotsTxt = true 331 } 332} 333 334// ID sets the unique identifier of the Collector. 335func ID(id uint32) func(*Collector) { 336 return func(c *Collector) { 337 c.ID = id 338 } 339} 340 341// Async turns on asynchronous network requests. 342func Async(a bool) func(*Collector) { 343 return func(c *Collector) { 344 c.Async = a 345 } 346} 347 348// DetectCharset enables character encoding detection for non-utf8 response bodies 349// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet 350func DetectCharset() func(*Collector) { 351 return func(c *Collector) { 352 c.DetectCharset = true 353 } 354} 355 356// Debugger sets the debugger used by the Collector. 357func Debugger(d debug.Debugger) func(*Collector) { 358 return func(c *Collector) { 359 d.Init() 360 c.debugger = d 361 } 362} 363 364// Init initializes the Collector's private variables and sets default 365// configuration for the Collector 366func (c *Collector) Init() { 367 c.UserAgent = "colly - https://github.com/gocolly/colly" 368 c.MaxDepth = 0 369 c.store = &storage.InMemoryStorage{} 370 c.store.Init() 371 c.MaxBodySize = 10 * 1024 * 1024 372 c.backend = &httpBackend{} 373 jar, _ := cookiejar.New(nil) 374 c.backend.Init(jar) 375 c.backend.Client.CheckRedirect = c.checkRedirectFunc() 376 c.wg = &sync.WaitGroup{} 377 c.lock = &sync.RWMutex{} 378 c.robotsMap = make(map[string]*robotstxt.RobotsData) 379 c.IgnoreRobotsTxt = true 380 c.ID = atomic.AddUint32(&collectorCounter, 1) 381} 382 383// Appengine will replace the Collector's backend http.Client 384// With an Http.Client that is provided by appengine/urlfetch 385// This function should be used when the scraper is run on 386// Google App Engine. Example: 387// func startScraper(w http.ResponseWriter, r *http.Request) { 388// ctx := appengine.NewContext(r) 389// c := colly.NewCollector() 390// c.Appengine(ctx) 391// ... 392// c.Visit("https://google.ca") 393// } 394func (c *Collector) Appengine(ctx context.Context) { 395 client := urlfetch.Client(ctx) 396 client.Jar = c.backend.Client.Jar 397 client.CheckRedirect = c.backend.Client.CheckRedirect 398 client.Timeout = c.backend.Client.Timeout 399 400 c.backend.Client = client 401} 402 403// Visit starts Collector's collecting job by creating a 404// request to the URL specified in parameter. 405// Visit also calls the previously provided callbacks 406func (c *Collector) Visit(URL string) error { 407 if c.CheckHead { 408 if check := c.scrape(URL, "HEAD", 1, nil, nil, nil, true); check != nil { 409 return check 410 } 411 } 412 return c.scrape(URL, "GET", 1, nil, nil, nil, true) 413} 414 415// Head starts a collector job by creating a HEAD request. 416func (c *Collector) Head(URL string) error { 417 return c.scrape(URL, "HEAD", 1, nil, nil, nil, false) 418} 419 420// Post starts a collector job by creating a POST request. 421// Post also calls the previously provided callbacks 422func (c *Collector) Post(URL string, requestData map[string]string) error { 423 return c.scrape(URL, "POST", 1, createFormReader(requestData), nil, nil, true) 424} 425 426// PostRaw starts a collector job by creating a POST request with raw binary data. 427// Post also calls the previously provided callbacks 428func (c *Collector) PostRaw(URL string, requestData []byte) error { 429 return c.scrape(URL, "POST", 1, bytes.NewReader(requestData), nil, nil, true) 430} 431 432// PostMultipart starts a collector job by creating a Multipart POST request 433// with raw binary data. PostMultipart also calls the previously provided callbacks 434func (c *Collector) PostMultipart(URL string, requestData map[string][]byte) error { 435 boundary := randomBoundary() 436 hdr := http.Header{} 437 hdr.Set("Content-Type", "multipart/form-data; boundary="+boundary) 438 hdr.Set("User-Agent", c.UserAgent) 439 return c.scrape(URL, "POST", 1, createMultipartReader(boundary, requestData), nil, hdr, true) 440} 441 442// Request starts a collector job by creating a custom HTTP request 443// where method, context, headers and request data can be specified. 444// Set requestData, ctx, hdr parameters to nil if you don't want to use them. 445// Valid methods: 446// - "GET" 447// - "HEAD" 448// - "POST" 449// - "PUT" 450// - "DELETE" 451// - "PATCH" 452// - "OPTIONS" 453func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error { 454 return c.scrape(URL, method, 1, requestData, ctx, hdr, true) 455} 456 457// SetDebugger attaches a debugger to the collector 458func (c *Collector) SetDebugger(d debug.Debugger) { 459 d.Init() 460 c.debugger = d 461} 462 463// UnmarshalRequest creates a Request from serialized data 464func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) { 465 req := &serializableRequest{} 466 err := json.Unmarshal(r, req) 467 if err != nil { 468 return nil, err 469 } 470 471 u, err := url.Parse(req.URL) 472 if err != nil { 473 return nil, err 474 } 475 476 ctx := NewContext() 477 for k, v := range req.Ctx { 478 ctx.Put(k, v) 479 } 480 481 return &Request{ 482 Method: req.Method, 483 URL: u, 484 Body: bytes.NewReader(req.Body), 485 Ctx: ctx, 486 ID: atomic.AddUint32(&c.requestCount, 1), 487 Headers: &req.Headers, 488 collector: c, 489 }, nil 490} 491 492func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error { 493 if err := c.requestCheck(u, method, depth, checkRevisit); err != nil { 494 return err 495 } 496 parsedURL, err := url.Parse(u) 497 if err != nil { 498 return err 499 } 500 if parsedURL.Scheme == "" { 501 parsedURL.Scheme = "http" 502 } 503 if !c.isDomainAllowed(parsedURL.Host) { 504 return ErrForbiddenDomain 505 } 506 if method != "HEAD" && !c.IgnoreRobotsTxt { 507 if err = c.checkRobots(parsedURL); err != nil { 508 return err 509 } 510 } 511 if hdr == nil { 512 hdr = http.Header{"User-Agent": []string{c.UserAgent}} 513 } 514 rc, ok := requestData.(io.ReadCloser) 515 if !ok && requestData != nil { 516 rc = ioutil.NopCloser(requestData) 517 } 518 // The Go HTTP API ignores "Host" in the headers, preferring the client 519 // to use the Host field on Request. 520 host := parsedURL.Host 521 if hostHeader := hdr.Get("Host"); hostHeader != "" { 522 host = hostHeader 523 } 524 req := &http.Request{ 525 Method: method, 526 URL: parsedURL, 527 Proto: "HTTP/1.1", 528 ProtoMajor: 1, 529 ProtoMinor: 1, 530 Header: hdr, 531 Body: rc, 532 Host: host, 533 } 534 setRequestBody(req, requestData) 535 u = parsedURL.String() 536 c.wg.Add(1) 537 if c.Async { 538 go c.fetch(u, method, depth, requestData, ctx, hdr, req) 539 return nil 540 } 541 return c.fetch(u, method, depth, requestData, ctx, hdr, req) 542} 543 544func setRequestBody(req *http.Request, body io.Reader) { 545 if body != nil { 546 switch v := body.(type) { 547 case *bytes.Buffer: 548 req.ContentLength = int64(v.Len()) 549 buf := v.Bytes() 550 req.GetBody = func() (io.ReadCloser, error) { 551 r := bytes.NewReader(buf) 552 return ioutil.NopCloser(r), nil 553 } 554 case *bytes.Reader: 555 req.ContentLength = int64(v.Len()) 556 snapshot := *v 557 req.GetBody = func() (io.ReadCloser, error) { 558 r := snapshot 559 return ioutil.NopCloser(&r), nil 560 } 561 case *strings.Reader: 562 req.ContentLength = int64(v.Len()) 563 snapshot := *v 564 req.GetBody = func() (io.ReadCloser, error) { 565 r := snapshot 566 return ioutil.NopCloser(&r), nil 567 } 568 } 569 if req.GetBody != nil && req.ContentLength == 0 { 570 req.Body = http.NoBody 571 req.GetBody = func() (io.ReadCloser, error) { return http.NoBody, nil } 572 } 573 } 574} 575 576func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, req *http.Request) error { 577 defer c.wg.Done() 578 if ctx == nil { 579 ctx = NewContext() 580 } 581 request := &Request{ 582 URL: req.URL, 583 Headers: &req.Header, 584 Ctx: ctx, 585 Depth: depth, 586 Method: method, 587 Body: requestData, 588 collector: c, 589 ID: atomic.AddUint32(&c.requestCount, 1), 590 } 591 592 c.handleOnRequest(request) 593 594 if request.abort { 595 return nil 596 } 597 598 if method == "POST" && req.Header.Get("Content-Type") == "" { 599 req.Header.Add("Content-Type", "application/x-www-form-urlencoded") 600 } 601 602 if req.Header.Get("Accept") == "" { 603 req.Header.Set("Accept", "*/*") 604 } 605 606 origURL := req.URL 607 response, err := c.backend.Cache(req, c.MaxBodySize, c.CacheDir) 608 if proxyURL, ok := req.Context().Value(ProxyURLKey).(string); ok { 609 request.ProxyURL = proxyURL 610 } 611 if err := c.handleOnError(response, err, request, ctx); err != nil { 612 return err 613 } 614 if req.URL != origURL { 615 request.URL = req.URL 616 request.Headers = &req.Header 617 } 618 atomic.AddUint32(&c.responseCount, 1) 619 response.Ctx = ctx 620 response.Request = request 621 622 err = response.fixCharset(c.DetectCharset, request.ResponseCharacterEncoding) 623 if err != nil { 624 return err 625 } 626 627 c.handleOnResponse(response) 628 629 err = c.handleOnHTML(response) 630 if err != nil { 631 c.handleOnError(response, err, request, ctx) 632 } 633 634 err = c.handleOnXML(response) 635 if err != nil { 636 c.handleOnError(response, err, request, ctx) 637 } 638 639 c.handleOnScraped(response) 640 641 return err 642} 643 644func (c *Collector) requestCheck(u, method string, depth int, checkRevisit bool) error { 645 if u == "" { 646 return ErrMissingURL 647 } 648 if c.MaxDepth > 0 && c.MaxDepth < depth { 649 return ErrMaxDepth 650 } 651 if len(c.DisallowedURLFilters) > 0 { 652 if isMatchingFilter(c.DisallowedURLFilters, []byte(u)) { 653 return ErrForbiddenURL 654 } 655 } 656 if len(c.URLFilters) > 0 { 657 if !isMatchingFilter(c.URLFilters, []byte(u)) { 658 return ErrNoURLFiltersMatch 659 } 660 } 661 if checkRevisit && !c.AllowURLRevisit && method == "GET" { 662 h := fnv.New64a() 663 h.Write([]byte(u)) 664 uHash := h.Sum64() 665 visited, err := c.store.IsVisited(uHash) 666 if err != nil { 667 return err 668 } 669 if visited { 670 return ErrAlreadyVisited 671 } 672 return c.store.Visited(uHash) 673 } 674 return nil 675} 676 677func (c *Collector) isDomainAllowed(domain string) bool { 678 for _, d2 := range c.DisallowedDomains { 679 if d2 == domain { 680 return false 681 } 682 } 683 if c.AllowedDomains == nil || len(c.AllowedDomains) == 0 { 684 return true 685 } 686 for _, d2 := range c.AllowedDomains { 687 if d2 == domain { 688 return true 689 } 690 } 691 return false 692} 693 694func (c *Collector) checkRobots(u *url.URL) error { 695 c.lock.RLock() 696 robot, ok := c.robotsMap[u.Host] 697 c.lock.RUnlock() 698 699 if !ok { 700 // no robots file cached 701 resp, err := c.backend.Client.Get(u.Scheme + "://" + u.Host + "/robots.txt") 702 if err != nil { 703 return err 704 } 705 robot, err = robotstxt.FromResponse(resp) 706 if err != nil { 707 return err 708 } 709 c.lock.Lock() 710 c.robotsMap[u.Host] = robot 711 c.lock.Unlock() 712 } 713 714 uaGroup := robot.FindGroup(c.UserAgent) 715 if uaGroup == nil { 716 return nil 717 } 718 719 if !uaGroup.Test(u.EscapedPath()) { 720 return ErrRobotsTxtBlocked 721 } 722 return nil 723} 724 725// String is the text representation of the collector. 726// It contains useful debug information about the collector's internals 727func (c *Collector) String() string { 728 return fmt.Sprintf( 729 "Requests made: %d (%d responses) | Callbacks: OnRequest: %d, OnHTML: %d, OnResponse: %d, OnError: %d", 730 c.requestCount, 731 c.responseCount, 732 len(c.requestCallbacks), 733 len(c.htmlCallbacks), 734 len(c.responseCallbacks), 735 len(c.errorCallbacks), 736 ) 737} 738 739// Wait returns when the collector jobs are finished 740func (c *Collector) Wait() { 741 c.wg.Wait() 742} 743 744// OnRequest registers a function. Function will be executed on every 745// request made by the Collector 746func (c *Collector) OnRequest(f RequestCallback) { 747 c.lock.Lock() 748 if c.requestCallbacks == nil { 749 c.requestCallbacks = make([]RequestCallback, 0, 4) 750 } 751 c.requestCallbacks = append(c.requestCallbacks, f) 752 c.lock.Unlock() 753} 754 755// OnResponse registers a function. Function will be executed on every response 756func (c *Collector) OnResponse(f ResponseCallback) { 757 c.lock.Lock() 758 if c.responseCallbacks == nil { 759 c.responseCallbacks = make([]ResponseCallback, 0, 4) 760 } 761 c.responseCallbacks = append(c.responseCallbacks, f) 762 c.lock.Unlock() 763} 764 765// OnHTML registers a function. Function will be executed on every HTML 766// element matched by the GoQuery Selector parameter. 767// GoQuery Selector is a selector used by https://github.com/PuerkitoBio/goquery 768func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback) { 769 c.lock.Lock() 770 if c.htmlCallbacks == nil { 771 c.htmlCallbacks = make([]*htmlCallbackContainer, 0, 4) 772 } 773 c.htmlCallbacks = append(c.htmlCallbacks, &htmlCallbackContainer{ 774 Selector: goquerySelector, 775 Function: f, 776 }) 777 c.lock.Unlock() 778} 779 780// OnXML registers a function. Function will be executed on every XML 781// element matched by the xpath Query parameter. 782// xpath Query is used by https://github.com/antchfx/xmlquery 783func (c *Collector) OnXML(xpathQuery string, f XMLCallback) { 784 c.lock.Lock() 785 if c.xmlCallbacks == nil { 786 c.xmlCallbacks = make([]*xmlCallbackContainer, 0, 4) 787 } 788 c.xmlCallbacks = append(c.xmlCallbacks, &xmlCallbackContainer{ 789 Query: xpathQuery, 790 Function: f, 791 }) 792 c.lock.Unlock() 793} 794 795// OnHTMLDetach deregister a function. Function will not be execute after detached 796func (c *Collector) OnHTMLDetach(goquerySelector string) { 797 c.lock.Lock() 798 deleteIdx := -1 799 for i, cc := range c.htmlCallbacks { 800 if cc.Selector == goquerySelector { 801 deleteIdx = i 802 break 803 } 804 } 805 if deleteIdx != -1 { 806 c.htmlCallbacks = append(c.htmlCallbacks[:deleteIdx], c.htmlCallbacks[deleteIdx+1:]...) 807 } 808 c.lock.Unlock() 809} 810 811// OnXMLDetach deregister a function. Function will not be execute after detached 812func (c *Collector) OnXMLDetach(xpathQuery string) { 813 c.lock.Lock() 814 deleteIdx := -1 815 for i, cc := range c.xmlCallbacks { 816 if cc.Query == xpathQuery { 817 deleteIdx = i 818 break 819 } 820 } 821 if deleteIdx != -1 { 822 c.xmlCallbacks = append(c.xmlCallbacks[:deleteIdx], c.xmlCallbacks[deleteIdx+1:]...) 823 } 824 c.lock.Unlock() 825} 826 827// OnError registers a function. Function will be executed if an error 828// occurs during the HTTP request. 829func (c *Collector) OnError(f ErrorCallback) { 830 c.lock.Lock() 831 if c.errorCallbacks == nil { 832 c.errorCallbacks = make([]ErrorCallback, 0, 4) 833 } 834 c.errorCallbacks = append(c.errorCallbacks, f) 835 c.lock.Unlock() 836} 837 838// OnScraped registers a function. Function will be executed after 839// OnHTML, as a final part of the scraping. 840func (c *Collector) OnScraped(f ScrapedCallback) { 841 c.lock.Lock() 842 if c.scrapedCallbacks == nil { 843 c.scrapedCallbacks = make([]ScrapedCallback, 0, 4) 844 } 845 c.scrapedCallbacks = append(c.scrapedCallbacks, f) 846 c.lock.Unlock() 847} 848 849// WithTransport allows you to set a custom http.RoundTripper (transport) 850func (c *Collector) WithTransport(transport http.RoundTripper) { 851 c.backend.Client.Transport = transport 852} 853 854// DisableCookies turns off cookie handling 855func (c *Collector) DisableCookies() { 856 c.backend.Client.Jar = nil 857} 858 859// SetCookieJar overrides the previously set cookie jar 860func (c *Collector) SetCookieJar(j *cookiejar.Jar) { 861 c.backend.Client.Jar = j 862} 863 864// SetRequestTimeout overrides the default timeout (10 seconds) for this collector 865func (c *Collector) SetRequestTimeout(timeout time.Duration) { 866 c.backend.Client.Timeout = timeout 867} 868 869// SetStorage overrides the default in-memory storage. 870// Storage stores scraping related data like cookies and visited urls 871func (c *Collector) SetStorage(s storage.Storage) error { 872 if err := s.Init(); err != nil { 873 return err 874 } 875 c.store = s 876 c.backend.Client.Jar = createJar(s) 877 return nil 878} 879 880// SetProxy sets a proxy for the collector. This method overrides the previously 881// used http.Transport if the type of the transport is not http.RoundTripper. 882// The proxy type is determined by the URL scheme. "http" 883// and "socks5" are supported. If the scheme is empty, 884// "http" is assumed. 885func (c *Collector) SetProxy(proxyURL string) error { 886 proxyParsed, err := url.Parse(proxyURL) 887 if err != nil { 888 return err 889 } 890 891 c.SetProxyFunc(http.ProxyURL(proxyParsed)) 892 893 return nil 894} 895 896// SetProxyFunc sets a custom proxy setter/switcher function. 897// See built-in ProxyFuncs for more details. 898// This method overrides the previously used http.Transport 899// if the type of the transport is not http.RoundTripper. 900// The proxy type is determined by the URL scheme. "http" 901// and "socks5" are supported. If the scheme is empty, 902// "http" is assumed. 903func (c *Collector) SetProxyFunc(p ProxyFunc) { 904 t, ok := c.backend.Client.Transport.(*http.Transport) 905 if c.backend.Client.Transport != nil && ok { 906 t.Proxy = p 907 } else { 908 c.backend.Client.Transport = &http.Transport{ 909 Proxy: p, 910 } 911 } 912} 913 914func createEvent(eventType string, requestID, collectorID uint32, kvargs map[string]string) *debug.Event { 915 return &debug.Event{ 916 CollectorID: collectorID, 917 RequestID: requestID, 918 Type: eventType, 919 Values: kvargs, 920 } 921} 922 923func (c *Collector) handleOnRequest(r *Request) { 924 if c.debugger != nil { 925 c.debugger.Event(createEvent("request", r.ID, c.ID, map[string]string{ 926 "url": r.URL.String(), 927 })) 928 } 929 for _, f := range c.requestCallbacks { 930 f(r) 931 } 932} 933 934func (c *Collector) handleOnResponse(r *Response) { 935 if c.debugger != nil { 936 c.debugger.Event(createEvent("response", r.Request.ID, c.ID, map[string]string{ 937 "url": r.Request.URL.String(), 938 "status": http.StatusText(r.StatusCode), 939 })) 940 } 941 for _, f := range c.responseCallbacks { 942 f(r) 943 } 944} 945 946func (c *Collector) handleOnHTML(resp *Response) error { 947 if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") { 948 return nil 949 } 950 doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body)) 951 if err != nil { 952 return err 953 } 954 if href, found := doc.Find("base[href]").Attr("href"); found { 955 resp.Request.baseURL, _ = url.Parse(href) 956 } 957 for _, cc := range c.htmlCallbacks { 958 i := 0 959 doc.Find(cc.Selector).Each(func(_ int, s *goquery.Selection) { 960 for _, n := range s.Nodes { 961 e := NewHTMLElementFromSelectionNode(resp, s, n, i) 962 i++ 963 if c.debugger != nil { 964 c.debugger.Event(createEvent("html", resp.Request.ID, c.ID, map[string]string{ 965 "selector": cc.Selector, 966 "url": resp.Request.URL.String(), 967 })) 968 } 969 cc.Function(e) 970 } 971 }) 972 } 973 return nil 974} 975 976func (c *Collector) handleOnXML(resp *Response) error { 977 if len(c.xmlCallbacks) == 0 { 978 return nil 979 } 980 contentType := strings.ToLower(resp.Headers.Get("Content-Type")) 981 if !strings.Contains(contentType, "html") && !strings.Contains(contentType, "xml") { 982 return nil 983 } 984 985 if strings.Contains(contentType, "html") { 986 doc, err := htmlquery.Parse(bytes.NewBuffer(resp.Body)) 987 if err != nil { 988 return err 989 } 990 if e := htmlquery.FindOne(doc, "//base"); e != nil { 991 for _, a := range e.Attr { 992 if a.Key == "href" { 993 resp.Request.baseURL, _ = url.Parse(a.Val) 994 break 995 } 996 } 997 } 998 999 for _, cc := range c.xmlCallbacks { 1000 for _, n := range htmlquery.Find(doc, cc.Query) { 1001 e := NewXMLElementFromHTMLNode(resp, n) 1002 if c.debugger != nil { 1003 c.debugger.Event(createEvent("xml", resp.Request.ID, c.ID, map[string]string{ 1004 "selector": cc.Query, 1005 "url": resp.Request.URL.String(), 1006 })) 1007 } 1008 cc.Function(e) 1009 } 1010 } 1011 } else if strings.Contains(contentType, "xml") { 1012 doc, err := xmlquery.Parse(bytes.NewBuffer(resp.Body)) 1013 if err != nil { 1014 return err 1015 } 1016 1017 for _, cc := range c.xmlCallbacks { 1018 xmlquery.FindEach(doc, cc.Query, func(i int, n *xmlquery.Node) { 1019 e := NewXMLElementFromXMLNode(resp, n) 1020 if c.debugger != nil { 1021 c.debugger.Event(createEvent("xml", resp.Request.ID, c.ID, map[string]string{ 1022 "selector": cc.Query, 1023 "url": resp.Request.URL.String(), 1024 })) 1025 } 1026 cc.Function(e) 1027 }) 1028 } 1029 } 1030 return nil 1031} 1032 1033func (c *Collector) handleOnError(response *Response, err error, request *Request, ctx *Context) error { 1034 if err == nil && (c.ParseHTTPErrorResponse || response.StatusCode < 203) { 1035 return nil 1036 } 1037 if err == nil && response.StatusCode >= 203 { 1038 err = errors.New(http.StatusText(response.StatusCode)) 1039 } 1040 if response == nil { 1041 response = &Response{ 1042 Request: request, 1043 Ctx: ctx, 1044 } 1045 } 1046 if c.debugger != nil { 1047 c.debugger.Event(createEvent("error", request.ID, c.ID, map[string]string{ 1048 "url": request.URL.String(), 1049 "status": http.StatusText(response.StatusCode), 1050 })) 1051 } 1052 if response.Request == nil { 1053 response.Request = request 1054 } 1055 if response.Ctx == nil { 1056 response.Ctx = request.Ctx 1057 } 1058 for _, f := range c.errorCallbacks { 1059 f(response, err) 1060 } 1061 return err 1062} 1063 1064func (c *Collector) handleOnScraped(r *Response) { 1065 if c.debugger != nil { 1066 c.debugger.Event(createEvent("scraped", r.Request.ID, c.ID, map[string]string{ 1067 "url": r.Request.URL.String(), 1068 })) 1069 } 1070 for _, f := range c.scrapedCallbacks { 1071 f(r) 1072 } 1073} 1074 1075// Limit adds a new LimitRule to the collector 1076func (c *Collector) Limit(rule *LimitRule) error { 1077 return c.backend.Limit(rule) 1078} 1079 1080// Limits adds new LimitRules to the collector 1081func (c *Collector) Limits(rules []*LimitRule) error { 1082 return c.backend.Limits(rules) 1083} 1084 1085// SetCookies handles the receipt of the cookies in a reply for the given URL 1086func (c *Collector) SetCookies(URL string, cookies []*http.Cookie) error { 1087 if c.backend.Client.Jar == nil { 1088 return ErrNoCookieJar 1089 } 1090 u, err := url.Parse(URL) 1091 if err != nil { 1092 return err 1093 } 1094 c.backend.Client.Jar.SetCookies(u, cookies) 1095 return nil 1096} 1097 1098// Cookies returns the cookies to send in a request for the given URL. 1099func (c *Collector) Cookies(URL string) []*http.Cookie { 1100 if c.backend.Client.Jar == nil { 1101 return nil 1102 } 1103 u, err := url.Parse(URL) 1104 if err != nil { 1105 return nil 1106 } 1107 return c.backend.Client.Jar.Cookies(u) 1108} 1109 1110// Clone creates an exact copy of a Collector without callbacks. 1111// HTTP backend, robots.txt cache and cookie jar are shared 1112// between collectors. 1113func (c *Collector) Clone() *Collector { 1114 return &Collector{ 1115 AllowedDomains: c.AllowedDomains, 1116 AllowURLRevisit: c.AllowURLRevisit, 1117 CacheDir: c.CacheDir, 1118 DetectCharset: c.DetectCharset, 1119 DisallowedDomains: c.DisallowedDomains, 1120 ID: atomic.AddUint32(&collectorCounter, 1), 1121 IgnoreRobotsTxt: c.IgnoreRobotsTxt, 1122 MaxBodySize: c.MaxBodySize, 1123 MaxDepth: c.MaxDepth, 1124 DisallowedURLFilters: c.DisallowedURLFilters, 1125 URLFilters: c.URLFilters, 1126 ParseHTTPErrorResponse: c.ParseHTTPErrorResponse, 1127 UserAgent: c.UserAgent, 1128 store: c.store, 1129 backend: c.backend, 1130 debugger: c.debugger, 1131 Async: c.Async, 1132 RedirectHandler: c.RedirectHandler, 1133 errorCallbacks: make([]ErrorCallback, 0, 8), 1134 htmlCallbacks: make([]*htmlCallbackContainer, 0, 8), 1135 xmlCallbacks: make([]*xmlCallbackContainer, 0, 8), 1136 scrapedCallbacks: make([]ScrapedCallback, 0, 8), 1137 lock: c.lock, 1138 requestCallbacks: make([]RequestCallback, 0, 8), 1139 responseCallbacks: make([]ResponseCallback, 0, 8), 1140 robotsMap: c.robotsMap, 1141 wg: &sync.WaitGroup{}, 1142 } 1143} 1144 1145func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Request) error { 1146 return func(req *http.Request, via []*http.Request) error { 1147 if !c.isDomainAllowed(req.URL.Host) { 1148 return fmt.Errorf("Not following redirect to %s because its not in AllowedDomains", req.URL.Host) 1149 } 1150 1151 if c.RedirectHandler != nil { 1152 return c.RedirectHandler(req, via) 1153 } 1154 1155 // Honor golangs default of maximum of 10 redirects 1156 if len(via) >= 10 { 1157 return http.ErrUseLastResponse 1158 } 1159 1160 lastRequest := via[len(via)-1] 1161 1162 // Copy the headers from last request 1163 for hName, hValues := range lastRequest.Header { 1164 for _, hValue := range hValues { 1165 req.Header.Set(hName, hValue) 1166 } 1167 } 1168 1169 // If domain has changed, remove the Authorization-header if it exists 1170 if req.URL.Host != lastRequest.URL.Host { 1171 req.Header.Del("Authorization") 1172 } 1173 1174 return nil 1175 } 1176} 1177 1178func (c *Collector) parseSettingsFromEnv() { 1179 for _, e := range os.Environ() { 1180 if !strings.HasPrefix(e, "COLLY_") { 1181 continue 1182 } 1183 pair := strings.SplitN(e[6:], "=", 2) 1184 if f, ok := envMap[pair[0]]; ok { 1185 f(c, pair[1]) 1186 } else { 1187 log.Println("Unknown environment variable:", pair[0]) 1188 } 1189 } 1190} 1191 1192// SanitizeFileName replaces dangerous characters in a string 1193// so the return value can be used as a safe file name. 1194func SanitizeFileName(fileName string) string { 1195 ext := filepath.Ext(fileName) 1196 cleanExt := sanitize.BaseName(ext) 1197 if cleanExt == "" { 1198 cleanExt = ".unknown" 1199 } 1200 return strings.Replace(fmt.Sprintf( 1201 "%s.%s", 1202 sanitize.BaseName(fileName[:len(fileName)-len(ext)]), 1203 cleanExt[1:], 1204 ), "-", "_", -1) 1205} 1206 1207func createFormReader(data map[string]string) io.Reader { 1208 form := url.Values{} 1209 for k, v := range data { 1210 form.Add(k, v) 1211 } 1212 return strings.NewReader(form.Encode()) 1213} 1214 1215func createMultipartReader(boundary string, data map[string][]byte) io.Reader { 1216 dashBoundary := "--" + boundary 1217 1218 body := []byte{} 1219 buffer := bytes.NewBuffer(body) 1220 1221 buffer.WriteString("Content-type: multipart/form-data; boundary=" + boundary + "\n\n") 1222 for contentType, content := range data { 1223 buffer.WriteString(dashBoundary + "\n") 1224 buffer.WriteString("Content-Disposition: form-data; name=" + contentType + "\n") 1225 buffer.WriteString(fmt.Sprintf("Content-Length: %d \n\n", len(content))) 1226 buffer.Write(content) 1227 buffer.WriteString("\n") 1228 } 1229 buffer.WriteString(dashBoundary + "--\n\n") 1230 return buffer 1231} 1232 1233// randomBoundary was borrowed from 1234// github.com/golang/go/mime/multipart/writer.go#randomBoundary 1235func randomBoundary() string { 1236 var buf [30]byte 1237 _, err := io.ReadFull(rand.Reader, buf[:]) 1238 if err != nil { 1239 panic(err) 1240 } 1241 return fmt.Sprintf("%x", buf[:]) 1242} 1243 1244func isYesString(s string) bool { 1245 switch strings.ToLower(s) { 1246 case "1", "yes", "true", "y": 1247 return true 1248 } 1249 return false 1250} 1251 1252func createJar(s storage.Storage) http.CookieJar { 1253 return &cookieJarSerializer{store: s, lock: &sync.RWMutex{}} 1254} 1255 1256func (j *cookieJarSerializer) SetCookies(u *url.URL, cookies []*http.Cookie) { 1257 j.lock.Lock() 1258 defer j.lock.Unlock() 1259 cookieStr := j.store.Cookies(u) 1260 1261 // Merge existing cookies, new cookies have precedence. 1262 cnew := make([]*http.Cookie, len(cookies)) 1263 copy(cnew, cookies) 1264 existing := storage.UnstringifyCookies(cookieStr) 1265 for _, c := range existing { 1266 if !storage.ContainsCookie(cnew, c.Name) { 1267 cnew = append(cnew, c) 1268 } 1269 } 1270 j.store.SetCookies(u, storage.StringifyCookies(cnew)) 1271} 1272 1273func (j *cookieJarSerializer) Cookies(u *url.URL) []*http.Cookie { 1274 cookies := storage.UnstringifyCookies(j.store.Cookies(u)) 1275 // Filter. 1276 now := time.Now() 1277 cnew := make([]*http.Cookie, 0, len(cookies)) 1278 for _, c := range cookies { 1279 // Drop expired cookies. 1280 if c.RawExpires != "" && c.Expires.Before(now) { 1281 continue 1282 } 1283 // Drop secure cookies if not over https. 1284 if c.Secure && u.Scheme != "https" { 1285 continue 1286 } 1287 cnew = append(cnew, c) 1288 } 1289 return cnew 1290} 1291 1292func isMatchingFilter(fs []*regexp.Regexp, d []byte) bool { 1293 for _, r := range fs { 1294 if r.Match(d) { 1295 return true 1296 } 1297 } 1298 return false 1299} 1300