// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package colly implements an HTTP scraping framework
package colly

import (
	"bytes"
	"context"
	"crypto/rand"
	"encoding/json"
	"errors"
	"fmt"
	"hash/fnv"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/antchfx/htmlquery"
	"github.com/antchfx/xmlquery"
	"github.com/gocolly/colly/v2/debug"
	"github.com/gocolly/colly/v2/storage"
	"github.com/kennygrant/sanitize"
	"github.com/temoto/robotstxt"
	"google.golang.org/appengine/urlfetch"
)

// A CollectorOption sets an option on a Collector.
type CollectorOption func(*Collector)

// Collector provides the scraper instance for a scraping job
type Collector struct {
	// UserAgent is the User-Agent string used by HTTP requests
	UserAgent string
	// MaxDepth limits the recursion depth of visited URLs.
	// Set it to 0 for infinite recursion (default).
	MaxDepth int
	// AllowedDomains is a domain whitelist.
	// Leave it blank to allow any domains to be visited
	AllowedDomains []string
	// DisallowedDomains is a domain blacklist.
	DisallowedDomains []string
	// DisallowedURLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches a URL the
	// request will be stopped. DisallowedURLFilters will
	// be evaluated before URLFilters.
	// Leave it blank to allow any URLs to be visited
	DisallowedURLFilters []*regexp.Regexp
	// URLFilters is a list of regular expressions which restricts
	// visiting URLs. If any of the rules matches a URL the
	// request won't be stopped. DisallowedURLFilters will
	// be evaluated before URLFilters.
	// Leave it blank to allow any URLs to be visited
	URLFilters []*regexp.Regexp

	// AllowURLRevisit allows multiple downloads of the same URL
	AllowURLRevisit bool
	// MaxBodySize is the limit of the retrieved response body in bytes.
	// 0 means unlimited.
	// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
	MaxBodySize int
	// CacheDir specifies a location where GET requests are cached as files.
	// When it's not defined, caching is disabled.
	CacheDir string
	// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
	// the target host's robots.txt file. See http://www.robotstxt.org/ for more
	// information.
	IgnoreRobotsTxt bool
	// Async turns on asynchronous network communication. Use Collector.Wait() to
	// be sure all requests have been finished.
	Async bool
	// ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
	// By default, Colly parses only successful HTTP responses. Set ParseHTTPErrorResponse
	// to true to enable it.
	ParseHTTPErrorResponse bool
	// ID is the unique identifier of a collector
	ID uint32
	// DetectCharset can enable character encoding detection for non-utf8 response bodies
	// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
	DetectCharset bool
	// RedirectHandler allows control on how a redirect will be managed
	// use c.SetRedirectHandler to set this value
	redirectHandler func(req *http.Request, via []*http.Request) error
	// CheckHead performs a HEAD request before every GET to pre-validate the response
	CheckHead bool
	// TraceHTTP enables capturing and reporting request performance for crawler tuning.
	// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
	TraceHTTP bool

	store                    storage.Storage
	debugger                 debug.Debugger
	robotsMap                map[string]*robotstxt.RobotsData
	htmlCallbacks            []*htmlCallbackContainer
	xmlCallbacks             []*xmlCallbackContainer
	requestCallbacks         []RequestCallback
	responseCallbacks        []ResponseCallback
	responseHeadersCallbacks []ResponseHeadersCallback
	errorCallbacks           []ErrorCallback
	scrapedCallbacks         []ScrapedCallback
	requestCount             uint32
	responseCount            uint32
	backend                  *httpBackend
	wg                       *sync.WaitGroup
	lock                     *sync.RWMutex
}

// RequestCallback is a type alias for OnRequest callback functions
type RequestCallback func(*Request)

// ResponseHeadersCallback is a type alias for OnResponseHeaders callback functions
type ResponseHeadersCallback func(*Response)

// ResponseCallback is a type alias for OnResponse callback functions
type ResponseCallback func(*Response)

// HTMLCallback is a type alias for OnHTML callback functions
type HTMLCallback func(*HTMLElement)

// XMLCallback is a type alias for OnXML callback functions
type XMLCallback func(*XMLElement)

// ErrorCallback is a type alias for OnError callback functions
type ErrorCallback func(*Response, error)

// ScrapedCallback is a type alias for OnScraped callback functions
type ScrapedCallback func(*Response)

// ProxyFunc is a type alias for proxy setter functions.
type ProxyFunc func(*http.Request) (*url.URL, error)

type htmlCallbackContainer struct {
	Selector string
	Function HTMLCallback
}

type xmlCallbackContainer struct {
	Query    string
	Function XMLCallback
}

type cookieJarSerializer struct {
	store storage.Storage
	lock  *sync.RWMutex
}

var collectorCounter uint32

// The key type is unexported to prevent collisions with context keys defined in
// other packages.
type key int

// ProxyURLKey is the context key for the request proxy address.
const ProxyURLKey key = iota

var (
	// ErrForbiddenDomain is the error thrown if visiting
	// a domain which is not allowed in AllowedDomains
	ErrForbiddenDomain = errors.New("Forbidden domain")
	// ErrMissingURL is the error type for missing URL errors
	ErrMissingURL = errors.New("Missing URL")
	// ErrMaxDepth is the error type for exceeding max depth
	ErrMaxDepth = errors.New("Max depth limit reached")
	// ErrForbiddenURL is the error thrown if visiting
	// a URL which matches one of the DisallowedURLFilters
	ErrForbiddenURL = errors.New("ForbiddenURL")

	// ErrNoURLFiltersMatch is the error thrown if visiting
	// a URL which is not matched by any of the URLFilters
	ErrNoURLFiltersMatch = errors.New("No URLFilters match")
	// ErrAlreadyVisited is the error type for already visited URLs
	ErrAlreadyVisited = errors.New("URL already visited")
	// ErrRobotsTxtBlocked is the error type for robots.txt errors
	ErrRobotsTxtBlocked = errors.New("URL blocked by robots.txt")
	// ErrNoCookieJar is the error type for missing cookie jar
	ErrNoCookieJar = errors.New("Cookie jar is not available")
	// ErrNoPattern is the error type for LimitRules without patterns
	ErrNoPattern = errors.New("No pattern defined in LimitRule")
	// ErrEmptyProxyURL is the error type for empty Proxy URL list
	ErrEmptyProxyURL = errors.New("Proxy URL list is empty")
	// ErrAbortedAfterHeaders is the error returned when OnResponseHeaders aborts the transfer.
	ErrAbortedAfterHeaders = errors.New("Aborted after receiving response headers")
	// ErrQueueFull is the error returned when the queue is full
	ErrQueueFull = errors.New("Queue MaxSize reached")
)

var envMap = map[string]func(*Collector, string){
	"ALLOWED_DOMAINS": func(c *Collector, val string) {
		c.AllowedDomains = strings.Split(val, ",")
	},
	"CACHE_DIR": func(c *Collector, val string) {
		c.CacheDir = val
	},
	"DETECT_CHARSET": func(c *Collector, val string) {
		c.DetectCharset = isYesString(val)
	},
	"DISABLE_COOKIES": func(c *Collector, _ string) {
		c.backend.Client.Jar = nil
	},
	"DISALLOWED_DOMAINS": func(c *Collector, val string) {
		c.DisallowedDomains = strings.Split(val, ",")
	},
	"IGNORE_ROBOTSTXT": func(c *Collector, val string) {
		c.IgnoreRobotsTxt = isYesString(val)
	},
	"FOLLOW_REDIRECTS": func(c *Collector, val string) {
		if !isYesString(val) {
			c.redirectHandler = func(req *http.Request, via []*http.Request) error {
				return http.ErrUseLastResponse
			}
		}
	},
	"MAX_BODY_SIZE": func(c *Collector, val string) {
		size, err := strconv.Atoi(val)
		if err == nil {
			c.MaxBodySize = size
		}
	},
	"MAX_DEPTH": func(c *Collector, val string) {
		maxDepth, err := strconv.Atoi(val)
		if err == nil {
			c.MaxDepth = maxDepth
		}
	},
	"PARSE_HTTP_ERROR_RESPONSE": func(c *Collector, val string) {
		c.ParseHTTPErrorResponse = isYesString(val)
	},
	"TRACE_HTTP": func(c *Collector, val string) {
		c.TraceHTTP = isYesString(val)
	},
	"USER_AGENT": func(c *Collector, val string) {
		c.UserAgent = val
	},
}

// NewCollector creates a new Collector instance with default configuration
func NewCollector(options ...CollectorOption) *Collector {
	c := &Collector{}
	c.Init()

	for _, f := range options {
		f(c)
	}

	c.parseSettingsFromEnv()

	return c
}
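
// A minimal, hedged usage sketch for NewCollector and the functional options
// defined below; the URL and the CSS selector are placeholders chosen for
// illustration only, not part of the library:
//
//  c := colly.NewCollector(
//    colly.UserAgent("my-crawler/1.0"),
//    colly.MaxDepth(2),
//  )
//  c.OnHTML("a[href]", func(e *colly.HTMLElement) {
//    e.Request.Visit(e.Attr("href"))
//  })
//  c.Visit("https://example.com/")
//  c.Wait()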

// UserAgent sets the user agent used by the Collector.
func UserAgent(ua string) CollectorOption {
	return func(c *Collector) {
		c.UserAgent = ua
	}
}

// MaxDepth limits the recursion depth of visited URLs.
func MaxDepth(depth int) CollectorOption {
	return func(c *Collector) {
		c.MaxDepth = depth
	}
}

// AllowedDomains sets the domain whitelist used by the Collector.
func AllowedDomains(domains ...string) CollectorOption {
	return func(c *Collector) {
		c.AllowedDomains = domains
	}
}

// ParseHTTPErrorResponse allows parsing responses with HTTP errors
func ParseHTTPErrorResponse() CollectorOption {
	return func(c *Collector) {
		c.ParseHTTPErrorResponse = true
	}
}

// DisallowedDomains sets the domain blacklist used by the Collector.
func DisallowedDomains(domains ...string) CollectorOption {
	return func(c *Collector) {
		c.DisallowedDomains = domains
	}
}

// DisallowedURLFilters sets the list of regular expressions which restricts
// visiting URLs. If any of the rules matches a URL the request will be stopped.
func DisallowedURLFilters(filters ...*regexp.Regexp) CollectorOption {
	return func(c *Collector) {
		c.DisallowedURLFilters = filters
	}
}

// URLFilters sets the list of regular expressions which restricts
// visiting URLs. If any of the rules matches a URL the request won't be stopped.
func URLFilters(filters ...*regexp.Regexp) CollectorOption {
	return func(c *Collector) {
		c.URLFilters = filters
	}
}

// AllowURLRevisit instructs the Collector to allow multiple downloads of the same URL
func AllowURLRevisit() CollectorOption {
	return func(c *Collector) {
		c.AllowURLRevisit = true
	}
}

// MaxBodySize sets the limit of the retrieved response body in bytes.
func MaxBodySize(sizeInBytes int) CollectorOption {
	return func(c *Collector) {
		c.MaxBodySize = sizeInBytes
	}
}

// CacheDir specifies the location where GET requests are cached as files.
func CacheDir(path string) CollectorOption {
	return func(c *Collector) {
		c.CacheDir = path
	}
}

// IgnoreRobotsTxt instructs the Collector to ignore any restrictions
// set by the target host's robots.txt file.
func IgnoreRobotsTxt() CollectorOption {
	return func(c *Collector) {
		c.IgnoreRobotsTxt = true
	}
}

// TraceHTTP instructs the Collector to collect and report request trace data
// on the Response.Trace.
func TraceHTTP() CollectorOption {
	return func(c *Collector) {
		c.TraceHTTP = true
	}
}

// ID sets the unique identifier of the Collector.
func ID(id uint32) CollectorOption {
	return func(c *Collector) {
		c.ID = id
	}
}

// Async turns on asynchronous network requests.
// Note: the optional boolean argument is currently ignored; calling Async
// with any value enables asynchronous mode.
func Async(a ...bool) CollectorOption {
	return func(c *Collector) {
		c.Async = true
	}
}

// DetectCharset enables character encoding detection for non-utf8 response bodies
// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
func DetectCharset() CollectorOption {
	return func(c *Collector) {
		c.DetectCharset = true
	}
}
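
// A hedged configuration sketch combining the filter options above; the
// domain and the regular expressions are placeholders chosen for
// illustration only:
//
//  c := colly.NewCollector(
//    colly.AllowedDomains("example.com"),
//    colly.URLFilters(regexp.MustCompile(`^https://example\.com/docs/`)),
//    colly.DisallowedURLFilters(regexp.MustCompile(`\.(png|jpe?g|gif)$`)),
//    colly.MaxBodySize(5*1024*1024),
//  )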

// Debugger sets the debugger used by the Collector.
func Debugger(d debug.Debugger) CollectorOption {
	return func(c *Collector) {
		d.Init()
		c.debugger = d
	}
}

// CheckHead performs a HEAD request before every GET to pre-validate the response
func CheckHead() CollectorOption {
	return func(c *Collector) {
		c.CheckHead = true
	}
}

// Init initializes the Collector's private variables and sets default
// configuration for the Collector
func (c *Collector) Init() {
	c.UserAgent = "colly - https://github.com/gocolly/colly/v2"
	c.MaxDepth = 0
	c.store = &storage.InMemoryStorage{}
	c.store.Init()
	c.MaxBodySize = 10 * 1024 * 1024
	c.backend = &httpBackend{}
	jar, _ := cookiejar.New(nil)
	c.backend.Init(jar)
	c.backend.Client.CheckRedirect = c.checkRedirectFunc()
	c.wg = &sync.WaitGroup{}
	c.lock = &sync.RWMutex{}
	c.robotsMap = make(map[string]*robotstxt.RobotsData)
	c.IgnoreRobotsTxt = true
	c.ID = atomic.AddUint32(&collectorCounter, 1)
	c.TraceHTTP = false
}

// Appengine will replace the Collector's backend http.Client
// with an http.Client provided by appengine/urlfetch.
// This function should be used when the scraper is run on
// Google App Engine. Example:
//
//  func startScraper(w http.ResponseWriter, r *http.Request) {
//    ctx := appengine.NewContext(r)
//    c := colly.NewCollector()
//    c.Appengine(ctx)
//    ...
//    c.Visit("https://google.ca")
//  }
func (c *Collector) Appengine(ctx context.Context) {
	client := urlfetch.Client(ctx)
	client.Jar = c.backend.Client.Jar
	client.CheckRedirect = c.backend.Client.CheckRedirect
	client.Timeout = c.backend.Client.Timeout

	c.backend.Client = client
}

// Visit starts Collector's collecting job by creating a
// request to the URL specified in the parameter.
// Visit also calls the previously provided callbacks
func (c *Collector) Visit(URL string) error {
	if c.CheckHead {
		if check := c.scrape(URL, "HEAD", 1, nil, nil, nil, true); check != nil {
			return check
		}
	}
	return c.scrape(URL, "GET", 1, nil, nil, nil, true)
}

// HasVisited checks if the provided URL has been visited
func (c *Collector) HasVisited(URL string) (bool, error) {
	return c.checkHasVisited(URL, nil)
}

// HasPosted checks if the provided URL and requestData have been visited.
// This method is useful to avoid re-visiting the same URL with the same POST body.
func (c *Collector) HasPosted(URL string, requestData map[string]string) (bool, error) {
	return c.checkHasVisited(URL, requestData)
}

// Head starts a collector job by creating a HEAD request.
func (c *Collector) Head(URL string) error {
	return c.scrape(URL, "HEAD", 1, nil, nil, nil, false)
}

// Post starts a collector job by creating a POST request.
// Post also calls the previously provided callbacks
func (c *Collector) Post(URL string, requestData map[string]string) error {
	return c.scrape(URL, "POST", 1, createFormReader(requestData), nil, nil, true)
}

// PostRaw starts a collector job by creating a POST request with raw binary data.
// PostRaw also calls the previously provided callbacks
func (c *Collector) PostRaw(URL string, requestData []byte) error {
	return c.scrape(URL, "POST", 1, bytes.NewReader(requestData), nil, nil, true)
}
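
// A hedged sketch of a form POST on a Collector c using the method above;
// the URL and field names are placeholders, not part of the library:
//
//  err := c.Post("https://example.com/login", map[string]string{
//    "username": "jdoe",
//    "password": "secret",
//  })
//  if err != nil {
//    log.Println("post failed:", err)
//  }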

// PostMultipart starts a collector job by creating a Multipart POST request
// with raw binary data. PostMultipart also calls the previously provided callbacks
func (c *Collector) PostMultipart(URL string, requestData map[string][]byte) error {
	boundary := randomBoundary()
	hdr := http.Header{}
	hdr.Set("Content-Type", "multipart/form-data; boundary="+boundary)
	hdr.Set("User-Agent", c.UserAgent)
	return c.scrape(URL, "POST", 1, createMultipartReader(boundary, requestData), nil, hdr, true)
}

// Request starts a collector job by creating a custom HTTP request
// where method, context, headers and request data can be specified.
// Set requestData, ctx, hdr parameters to nil if you don't want to use them.
// Valid methods:
//   - "GET"
//   - "HEAD"
//   - "POST"
//   - "PUT"
//   - "DELETE"
//   - "PATCH"
//   - "OPTIONS"
func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error {
	return c.scrape(URL, method, 1, requestData, ctx, hdr, true)
}

// SetDebugger attaches a debugger to the collector
func (c *Collector) SetDebugger(d debug.Debugger) {
	d.Init()
	c.debugger = d
}

// UnmarshalRequest creates a Request from serialized data
func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) {
	req := &serializableRequest{}
	err := json.Unmarshal(r, req)
	if err != nil {
		return nil, err
	}

	u, err := url.Parse(req.URL)
	if err != nil {
		return nil, err
	}

	ctx := NewContext()
	for k, v := range req.Ctx {
		ctx.Put(k, v)
	}

	return &Request{
		Method:    req.Method,
		URL:       u,
		Depth:     req.Depth,
		Body:      bytes.NewReader(req.Body),
		Ctx:       ctx,
		ID:        atomic.AddUint32(&c.requestCount, 1),
		Headers:   &req.Headers,
		collector: c,
	}, nil
}

func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error {
	parsedURL, err := url.Parse(u)
	if err != nil {
		return err
	}
	if err := c.requestCheck(u, parsedURL, method, requestData, depth, checkRevisit); err != nil {
		return err
	}

	if hdr == nil {
		hdr = http.Header{"User-Agent": []string{c.UserAgent}}
	}
	rc, ok := requestData.(io.ReadCloser)
	if !ok && requestData != nil {
		rc = ioutil.NopCloser(requestData)
	}
	// The Go HTTP API ignores "Host" in the headers, preferring the client
	// to use the Host field on Request.
	host := parsedURL.Host
	if hostHeader := hdr.Get("Host"); hostHeader != "" {
		host = hostHeader
	}
	req := &http.Request{
		Method:     method,
		URL:        parsedURL,
		Proto:      "HTTP/1.1",
		ProtoMajor: 1,
		ProtoMinor: 1,
		Header:     hdr,
		Body:       rc,
		Host:       host,
	}
	setRequestBody(req, requestData)
	u = parsedURL.String()
	c.wg.Add(1)
	if c.Async {
		go c.fetch(u, method, depth, requestData, ctx, hdr, req)
		return nil
	}
	return c.fetch(u, method, depth, requestData, ctx, hdr, req)
}

func setRequestBody(req *http.Request, body io.Reader) {
	if body != nil {
		switch v := body.(type) {
		case *bytes.Buffer:
			req.ContentLength = int64(v.Len())
			buf := v.Bytes()
			req.GetBody = func() (io.ReadCloser, error) {
				r := bytes.NewReader(buf)
				return ioutil.NopCloser(r), nil
			}
		case *bytes.Reader:
			req.ContentLength = int64(v.Len())
			snapshot := *v
			req.GetBody = func() (io.ReadCloser, error) {
				r := snapshot
				return ioutil.NopCloser(&r), nil
			}
		case *strings.Reader:
			req.ContentLength = int64(v.Len())
			snapshot := *v
			req.GetBody = func() (io.ReadCloser, error) {
				r := snapshot
				return ioutil.NopCloser(&r), nil
			}
		}
		if req.GetBody != nil && req.ContentLength == 0 {
			req.Body = http.NoBody
			req.GetBody = func() (io.ReadCloser, error) { return http.NoBody, nil }
		}
	}
}

func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, req *http.Request) error {
	defer c.wg.Done()
	if ctx == nil {
		ctx = NewContext()
	}
	request := &Request{
		URL:       req.URL,
		Headers:   &req.Header,
		Ctx:       ctx,
		Depth:     depth,
		Method:    method,
		Body:      requestData,
		collector: c,
		ID:        atomic.AddUint32(&c.requestCount, 1),
	}

	c.handleOnRequest(request)

	if request.abort {
		return nil
	}

	if method == "POST" && req.Header.Get("Content-Type") == "" {
		req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	}

	if req.Header.Get("Accept") == "" {
		req.Header.Set("Accept", "*/*")
	}

	var hTrace *HTTPTrace
	if c.TraceHTTP {
		hTrace = &HTTPTrace{}
		req = hTrace.WithTrace(req)
	}
	checkHeadersFunc := func(statusCode int, headers http.Header) bool {
		c.handleOnResponseHeaders(&Response{Ctx: ctx, Request: request, StatusCode: statusCode, Headers: &headers})
		return !request.abort
	}

	origURL := req.URL
	response, err := c.backend.Cache(req, c.MaxBodySize, checkHeadersFunc, c.CacheDir)
	if proxyURL, ok := req.Context().Value(ProxyURLKey).(string); ok {
		request.ProxyURL = proxyURL
	}
	if err := c.handleOnError(response, err, request, ctx); err != nil {
		return err
	}
	if req.URL != origURL {
		request.URL = req.URL
		request.Headers = &req.Header
	}
	atomic.AddUint32(&c.responseCount, 1)
	response.Ctx = ctx
	response.Request = request
	response.Trace = hTrace

	err = response.fixCharset(c.DetectCharset, request.ResponseCharacterEncoding)
	if err != nil {
		return err
	}

	c.handleOnResponse(response)

	err = c.handleOnHTML(response)
	if err != nil {
		c.handleOnError(response, err, request, ctx)
	}

	err = c.handleOnXML(response)
	if err != nil {
		c.handleOnError(response, err, request, ctx)
	}

	c.handleOnScraped(response)

	return err
}
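
// A hedged sketch of reading the trace populated above; it assumes the
// collector was created with the TraceHTTP option and only logs the
// Response.Trace value as a whole, without relying on its field names:
//
//  c := colly.NewCollector(colly.TraceHTTP())
//  c.OnResponse(func(r *colly.Response) {
//    if r.Trace != nil {
//      log.Printf("trace for %s: %+v", r.Request.URL, *r.Trace)
//    }
//  })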

func (c *Collector) requestCheck(u string, parsedURL *url.URL, method string, requestData io.Reader, depth int, checkRevisit bool) error {
	if u == "" {
		return ErrMissingURL
	}
	if c.MaxDepth > 0 && c.MaxDepth < depth {
		return ErrMaxDepth
	}
	if len(c.DisallowedURLFilters) > 0 {
		if isMatchingFilter(c.DisallowedURLFilters, []byte(u)) {
			return ErrForbiddenURL
		}
	}
	if len(c.URLFilters) > 0 {
		if !isMatchingFilter(c.URLFilters, []byte(u)) {
			return ErrNoURLFiltersMatch
		}
	}
	if !c.isDomainAllowed(parsedURL.Hostname()) {
		return ErrForbiddenDomain
	}
	if method != "HEAD" && !c.IgnoreRobotsTxt {
		if err := c.checkRobots(parsedURL); err != nil {
			return err
		}
	}
	if checkRevisit && !c.AllowURLRevisit {
		h := fnv.New64a()
		h.Write([]byte(u))

		var uHash uint64
		if method == "GET" {
			uHash = h.Sum64()
		} else if requestData != nil {
			h.Write(streamToByte(requestData))
			uHash = h.Sum64()
		} else {
			return nil
		}

		visited, err := c.store.IsVisited(uHash)
		if err != nil {
			return err
		}
		if visited {
			return ErrAlreadyVisited
		}
		return c.store.Visited(uHash)
	}
	return nil
}

func (c *Collector) isDomainAllowed(domain string) bool {
	for _, d2 := range c.DisallowedDomains {
		if d2 == domain {
			return false
		}
	}
	if len(c.AllowedDomains) == 0 {
		return true
	}
	for _, d2 := range c.AllowedDomains {
		if d2 == domain {
			return true
		}
	}
	return false
}

func (c *Collector) checkRobots(u *url.URL) error {
	c.lock.RLock()
	robot, ok := c.robotsMap[u.Host]
	c.lock.RUnlock()

	if !ok {
		// no robots file cached
		resp, err := c.backend.Client.Get(u.Scheme + "://" + u.Host + "/robots.txt")
		if err != nil {
			return err
		}
		defer resp.Body.Close()

		robot, err = robotstxt.FromResponse(resp)
		if err != nil {
			return err
		}
		c.lock.Lock()
		c.robotsMap[u.Host] = robot
		c.lock.Unlock()
	}

	uaGroup := robot.FindGroup(c.UserAgent)
	if uaGroup == nil {
		return nil
	}

	eu := u.EscapedPath()
	if u.RawQuery != "" {
		eu += "?" + u.Query().Encode()
	}
	if !uaGroup.Test(eu) {
		return ErrRobotsTxtBlocked
	}
	return nil
}

// String is the text representation of the collector.
// It contains useful debug information about the collector's internals
func (c *Collector) String() string {
	return fmt.Sprintf(
		"Requests made: %d (%d responses) | Callbacks: OnRequest: %d, OnHTML: %d, OnResponse: %d, OnError: %d",
		c.requestCount,
		c.responseCount,
		len(c.requestCallbacks),
		len(c.htmlCallbacks),
		len(c.responseCallbacks),
		len(c.errorCallbacks),
	)
}

// Wait returns when the collector jobs are finished
func (c *Collector) Wait() {
	c.wg.Wait()
}

// OnRequest registers a function. Function will be executed on every
// request made by the Collector
func (c *Collector) OnRequest(f RequestCallback) {
	c.lock.Lock()
	if c.requestCallbacks == nil {
		c.requestCallbacks = make([]RequestCallback, 0, 4)
	}
	c.requestCallbacks = append(c.requestCallbacks, f)
	c.lock.Unlock()
}

// OnResponseHeaders registers a function. Function will be executed on every response
// when headers and status are already received, but body is not yet read.
//
// Like in OnRequest, you can call Request.Abort to abort the transfer. This might be
// useful if, for example, you're following all hyperlinks, but want to avoid
// downloading files.
//
// Be aware that using this will prevent HTTP/1.1 connection reuse, as
// the only way to abort a download is to immediately close the connection.
// HTTP/2 doesn't suffer from this problem, as it's possible to close
// a specific stream inside the connection.
func (c *Collector) OnResponseHeaders(f ResponseHeadersCallback) {
	c.lock.Lock()
	c.responseHeadersCallbacks = append(c.responseHeadersCallbacks, f)
	c.lock.Unlock()
}

// OnResponse registers a function. Function will be executed on every response
func (c *Collector) OnResponse(f ResponseCallback) {
	c.lock.Lock()
	if c.responseCallbacks == nil {
		c.responseCallbacks = make([]ResponseCallback, 0, 4)
	}
	c.responseCallbacks = append(c.responseCallbacks, f)
	c.lock.Unlock()
}

// OnHTML registers a function. Function will be executed on every HTML
// element matched by the GoQuery Selector parameter.
// GoQuery Selector is a selector used by https://github.com/PuerkitoBio/goquery
func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback) {
	c.lock.Lock()
	if c.htmlCallbacks == nil {
		c.htmlCallbacks = make([]*htmlCallbackContainer, 0, 4)
	}
	c.htmlCallbacks = append(c.htmlCallbacks, &htmlCallbackContainer{
		Selector: goquerySelector,
		Function: f,
	})
	c.lock.Unlock()
}

// OnXML registers a function. Function will be executed on every XML
// element matched by the xpath Query parameter.
// xpath Query is used by https://github.com/antchfx/xmlquery
func (c *Collector) OnXML(xpathQuery string, f XMLCallback) {
	c.lock.Lock()
	if c.xmlCallbacks == nil {
		c.xmlCallbacks = make([]*xmlCallbackContainer, 0, 4)
	}
	c.xmlCallbacks = append(c.xmlCallbacks, &xmlCallbackContainer{
		Query:    xpathQuery,
		Function: f,
	})
	c.lock.Unlock()
}

// OnHTMLDetach deregisters a function. Detached functions will no longer be executed.
func (c *Collector) OnHTMLDetach(goquerySelector string) {
	c.lock.Lock()
	deleteIdx := -1
	for i, cc := range c.htmlCallbacks {
		if cc.Selector == goquerySelector {
			deleteIdx = i
			break
		}
	}
	if deleteIdx != -1 {
		c.htmlCallbacks = append(c.htmlCallbacks[:deleteIdx], c.htmlCallbacks[deleteIdx+1:]...)
	}
	c.lock.Unlock()
}

// OnXMLDetach deregisters a function. Detached functions will no longer be executed.
func (c *Collector) OnXMLDetach(xpathQuery string) {
	c.lock.Lock()
	deleteIdx := -1
	for i, cc := range c.xmlCallbacks {
		if cc.Query == xpathQuery {
			deleteIdx = i
			break
		}
	}
	if deleteIdx != -1 {
		c.xmlCallbacks = append(c.xmlCallbacks[:deleteIdx], c.xmlCallbacks[deleteIdx+1:]...)
	}
	c.lock.Unlock()
}

// OnError registers a function. Function will be executed if an error
// occurs during the HTTP request.
func (c *Collector) OnError(f ErrorCallback) {
	c.lock.Lock()
	if c.errorCallbacks == nil {
		c.errorCallbacks = make([]ErrorCallback, 0, 4)
	}
	c.errorCallbacks = append(c.errorCallbacks, f)
	c.lock.Unlock()
}
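
// A hedged error-handling sketch on a Collector c using the callback above;
// the handler body is illustrative only:
//
//  c.OnError(func(r *colly.Response, err error) {
//    log.Printf("request to %s failed with status %d: %v", r.Request.URL, r.StatusCode, err)
//  })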

// OnScraped registers a function. Function will be executed after
// OnHTML, as a final part of the scraping.
func (c *Collector) OnScraped(f ScrapedCallback) {
	c.lock.Lock()
	if c.scrapedCallbacks == nil {
		c.scrapedCallbacks = make([]ScrapedCallback, 0, 4)
	}
	c.scrapedCallbacks = append(c.scrapedCallbacks, f)
	c.lock.Unlock()
}

// SetClient will override the previously set http.Client
func (c *Collector) SetClient(client *http.Client) {
	c.backend.Client = client
}

// WithTransport allows you to set a custom http.RoundTripper (transport)
func (c *Collector) WithTransport(transport http.RoundTripper) {
	c.backend.Client.Transport = transport
}

// DisableCookies turns off cookie handling
func (c *Collector) DisableCookies() {
	c.backend.Client.Jar = nil
}

// SetCookieJar overrides the previously set cookie jar
func (c *Collector) SetCookieJar(j http.CookieJar) {
	c.backend.Client.Jar = j
}

// SetRequestTimeout overrides the default timeout (10 seconds) for this collector
func (c *Collector) SetRequestTimeout(timeout time.Duration) {
	c.backend.Client.Timeout = timeout
}

// SetStorage overrides the default in-memory storage.
// Storage stores scraping related data like cookies and visited URLs
func (c *Collector) SetStorage(s storage.Storage) error {
	if err := s.Init(); err != nil {
		return err
	}
	c.store = s
	c.backend.Client.Jar = createJar(s)
	return nil
}

// SetProxy sets a proxy for the collector. This method overrides the previously
// used transport if it is not an *http.Transport.
// The proxy type is determined by the URL scheme. "http"
// and "socks5" are supported. If the scheme is empty,
// "http" is assumed.
func (c *Collector) SetProxy(proxyURL string) error {
	proxyParsed, err := url.Parse(proxyURL)
	if err != nil {
		return err
	}

	c.SetProxyFunc(http.ProxyURL(proxyParsed))

	return nil
}
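
// A hedged proxy sketch on a Collector c for the setter above; the address is
// a placeholder. For rotating between several proxies, a ProxyFunc (such as
// one built with the proxy subpackage's RoundRobinProxySwitcher) can be
// passed to SetProxyFunc instead:
//
//  if err := c.SetProxy("socks5://127.0.0.1:1080"); err != nil {
//    log.Fatal(err)
//  }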

// SetProxyFunc sets a custom proxy setter/switcher function.
// See built-in ProxyFuncs for more details.
// This method overrides the previously used transport
// if it is not an *http.Transport.
// The proxy type is determined by the URL scheme. "http"
// and "socks5" are supported. If the scheme is empty,
// "http" is assumed.
func (c *Collector) SetProxyFunc(p ProxyFunc) {
	t, ok := c.backend.Client.Transport.(*http.Transport)
	if c.backend.Client.Transport != nil && ok {
		t.Proxy = p
	} else {
		c.backend.Client.Transport = &http.Transport{
			Proxy: p,
		}
	}
}

func createEvent(eventType string, requestID, collectorID uint32, kvargs map[string]string) *debug.Event {
	return &debug.Event{
		CollectorID: collectorID,
		RequestID:   requestID,
		Type:        eventType,
		Values:      kvargs,
	}
}

func (c *Collector) handleOnRequest(r *Request) {
	if c.debugger != nil {
		c.debugger.Event(createEvent("request", r.ID, c.ID, map[string]string{
			"url": r.URL.String(),
		}))
	}
	for _, f := range c.requestCallbacks {
		f(r)
	}
}

func (c *Collector) handleOnResponse(r *Response) {
	if c.debugger != nil {
		c.debugger.Event(createEvent("response", r.Request.ID, c.ID, map[string]string{
			"url":    r.Request.URL.String(),
			"status": http.StatusText(r.StatusCode),
		}))
	}
	for _, f := range c.responseCallbacks {
		f(r)
	}
}

func (c *Collector) handleOnResponseHeaders(r *Response) {
	if c.debugger != nil {
		c.debugger.Event(createEvent("responseHeaders", r.Request.ID, c.ID, map[string]string{
			"url":    r.Request.URL.String(),
			"status": http.StatusText(r.StatusCode),
		}))
	}
	for _, f := range c.responseHeadersCallbacks {
		f(r)
	}
}

func (c *Collector) handleOnHTML(resp *Response) error {
	if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
		return nil
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
	if err != nil {
		return err
	}
	if href, found := doc.Find("base[href]").Attr("href"); found {
		resp.Request.baseURL, _ = resp.Request.URL.Parse(href)
	}
	for _, cc := range c.htmlCallbacks {
		i := 0
		doc.Find(cc.Selector).Each(func(_ int, s *goquery.Selection) {
			for _, n := range s.Nodes {
				e := NewHTMLElementFromSelectionNode(resp, s, n, i)
				i++
				if c.debugger != nil {
					c.debugger.Event(createEvent("html", resp.Request.ID, c.ID, map[string]string{
						"selector": cc.Selector,
						"url":      resp.Request.URL.String(),
					}))
				}
				cc.Function(e)
			}
		})
	}
	return nil
}

func (c *Collector) handleOnXML(resp *Response) error {
	if len(c.xmlCallbacks) == 0 {
		return nil
	}
	contentType := strings.ToLower(resp.Headers.Get("Content-Type"))
	isXMLFile := strings.HasSuffix(strings.ToLower(resp.Request.URL.Path), ".xml") || strings.HasSuffix(strings.ToLower(resp.Request.URL.Path), ".xml.gz")
	if !strings.Contains(contentType, "html") && (!strings.Contains(contentType, "xml") && !isXMLFile) {
		return nil
	}

	if strings.Contains(contentType, "html") {
		doc, err := htmlquery.Parse(bytes.NewBuffer(resp.Body))
		if err != nil {
			return err
		}
		if e := htmlquery.FindOne(doc, "//base"); e != nil {
			for _, a := range e.Attr {
				if a.Key == "href" {
					resp.Request.baseURL, _ = resp.Request.URL.Parse(a.Val)
					break
				}
			}
		}

		for _, cc := range c.xmlCallbacks {
			for _, n := range htmlquery.Find(doc, cc.Query) {
				e := NewXMLElementFromHTMLNode(resp, n)
				if c.debugger != nil {
					c.debugger.Event(createEvent("xml", resp.Request.ID, c.ID, map[string]string{
"selector": cc.Query, 1111 "url": resp.Request.URL.String(), 1112 })) 1113 } 1114 cc.Function(e) 1115 } 1116 } 1117 } else if strings.Contains(contentType, "xml") || isXMLFile { 1118 doc, err := xmlquery.Parse(bytes.NewBuffer(resp.Body)) 1119 if err != nil { 1120 return err 1121 } 1122 1123 for _, cc := range c.xmlCallbacks { 1124 xmlquery.FindEach(doc, cc.Query, func(i int, n *xmlquery.Node) { 1125 e := NewXMLElementFromXMLNode(resp, n) 1126 if c.debugger != nil { 1127 c.debugger.Event(createEvent("xml", resp.Request.ID, c.ID, map[string]string{ 1128 "selector": cc.Query, 1129 "url": resp.Request.URL.String(), 1130 })) 1131 } 1132 cc.Function(e) 1133 }) 1134 } 1135 } 1136 return nil 1137} 1138 1139func (c *Collector) handleOnError(response *Response, err error, request *Request, ctx *Context) error { 1140 if err == nil && (c.ParseHTTPErrorResponse || response.StatusCode < 203) { 1141 return nil 1142 } 1143 if err == nil && response.StatusCode >= 203 { 1144 err = errors.New(http.StatusText(response.StatusCode)) 1145 } 1146 if response == nil { 1147 response = &Response{ 1148 Request: request, 1149 Ctx: ctx, 1150 } 1151 } 1152 if c.debugger != nil { 1153 c.debugger.Event(createEvent("error", request.ID, c.ID, map[string]string{ 1154 "url": request.URL.String(), 1155 "status": http.StatusText(response.StatusCode), 1156 })) 1157 } 1158 if response.Request == nil { 1159 response.Request = request 1160 } 1161 if response.Ctx == nil { 1162 response.Ctx = request.Ctx 1163 } 1164 for _, f := range c.errorCallbacks { 1165 f(response, err) 1166 } 1167 return err 1168} 1169 1170func (c *Collector) handleOnScraped(r *Response) { 1171 if c.debugger != nil { 1172 c.debugger.Event(createEvent("scraped", r.Request.ID, c.ID, map[string]string{ 1173 "url": r.Request.URL.String(), 1174 })) 1175 } 1176 for _, f := range c.scrapedCallbacks { 1177 f(r) 1178 } 1179} 1180 1181// Limit adds a new LimitRule to the collector 1182func (c *Collector) Limit(rule *LimitRule) error { 1183 return c.backend.Limit(rule) 1184} 1185 1186// Limits adds new LimitRules to the collector 1187func (c *Collector) Limits(rules []*LimitRule) error { 1188 return c.backend.Limits(rules) 1189} 1190 1191// SetRedirectHandler instructs the Collector to allow multiple downloads of the same URL 1192func (c *Collector) SetRedirectHandler(f func(req *http.Request, via []*http.Request) error) { 1193 c.redirectHandler = f 1194 c.backend.Client.CheckRedirect = c.checkRedirectFunc() 1195} 1196 1197// SetCookies handles the receipt of the cookies in a reply for the given URL 1198func (c *Collector) SetCookies(URL string, cookies []*http.Cookie) error { 1199 if c.backend.Client.Jar == nil { 1200 return ErrNoCookieJar 1201 } 1202 u, err := url.Parse(URL) 1203 if err != nil { 1204 return err 1205 } 1206 c.backend.Client.Jar.SetCookies(u, cookies) 1207 return nil 1208} 1209 1210// Cookies returns the cookies to send in a request for the given URL. 1211func (c *Collector) Cookies(URL string) []*http.Cookie { 1212 if c.backend.Client.Jar == nil { 1213 return nil 1214 } 1215 u, err := url.Parse(URL) 1216 if err != nil { 1217 return nil 1218 } 1219 return c.backend.Client.Jar.Cookies(u) 1220} 1221 1222// Clone creates an exact copy of a Collector without callbacks. 1223// HTTP backend, robots.txt cache and cookie jar are shared 1224// between collectors. 

// Clone creates an exact copy of a Collector without callbacks.
// HTTP backend, robots.txt cache and cookie jar are shared
// between collectors.
func (c *Collector) Clone() *Collector {
	return &Collector{
		AllowedDomains:         c.AllowedDomains,
		AllowURLRevisit:        c.AllowURLRevisit,
		CacheDir:               c.CacheDir,
		DetectCharset:          c.DetectCharset,
		DisallowedDomains:      c.DisallowedDomains,
		ID:                     atomic.AddUint32(&collectorCounter, 1),
		IgnoreRobotsTxt:        c.IgnoreRobotsTxt,
		MaxBodySize:            c.MaxBodySize,
		MaxDepth:               c.MaxDepth,
		DisallowedURLFilters:   c.DisallowedURLFilters,
		URLFilters:             c.URLFilters,
		CheckHead:              c.CheckHead,
		ParseHTTPErrorResponse: c.ParseHTTPErrorResponse,
		UserAgent:              c.UserAgent,
		TraceHTTP:              c.TraceHTTP,
		store:                  c.store,
		backend:                c.backend,
		debugger:               c.debugger,
		Async:                  c.Async,
		redirectHandler:        c.redirectHandler,
		errorCallbacks:         make([]ErrorCallback, 0, 8),
		htmlCallbacks:          make([]*htmlCallbackContainer, 0, 8),
		xmlCallbacks:           make([]*xmlCallbackContainer, 0, 8),
		scrapedCallbacks:       make([]ScrapedCallback, 0, 8),
		lock:                   c.lock,
		requestCallbacks:       make([]RequestCallback, 0, 8),
		responseCallbacks:      make([]ResponseCallback, 0, 8),
		robotsMap:              c.robotsMap,
		wg:                     &sync.WaitGroup{},
	}
}

func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Request) error {
	return func(req *http.Request, via []*http.Request) error {
		if !c.isDomainAllowed(req.URL.Hostname()) {
			return fmt.Errorf("Not following redirect to %s because it's not in AllowedDomains", req.URL.Host)
		}

		if c.redirectHandler != nil {
			return c.redirectHandler(req, via)
		}

		// Honor Go's default maximum of 10 redirects
		if len(via) >= 10 {
			return http.ErrUseLastResponse
		}

		lastRequest := via[len(via)-1]

		// If the domain has changed, remove the Authorization header if it exists
		if req.URL.Host != lastRequest.URL.Host {
			req.Header.Del("Authorization")
		}

		return nil
	}
}

func (c *Collector) parseSettingsFromEnv() {
	for _, e := range os.Environ() {
		if !strings.HasPrefix(e, "COLLY_") {
			continue
		}
		pair := strings.SplitN(e[6:], "=", 2)
		if f, ok := envMap[pair[0]]; ok {
			f(c, pair[1])
		} else {
			log.Println("Unknown environment variable:", pair[0])
		}
	}
}

func (c *Collector) checkHasVisited(URL string, requestData map[string]string) (bool, error) {
	h := fnv.New64a()
	h.Write([]byte(URL))

	if requestData != nil {
		h.Write(streamToByte(createFormReader(requestData)))
	}

	return c.store.IsVisited(h.Sum64())
}
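
// A hedged configuration sketch: the settings handled by envMap can also be
// supplied through COLLY_-prefixed environment variables, which
// parseSettingsFromEnv above reads when the collector is created (values are
// illustrative):
//
//  COLLY_ALLOWED_DOMAINS=example.com,example.org
//  COLLY_MAX_DEPTH=2
//  COLLY_USER_AGENT=my-crawler/1.0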

// SanitizeFileName replaces dangerous characters in a string
// so the return value can be used as a safe file name.
func SanitizeFileName(fileName string) string {
	ext := filepath.Ext(fileName)
	cleanExt := sanitize.BaseName(ext)
	if cleanExt == "" {
		cleanExt = ".unknown"
	}
	return strings.Replace(fmt.Sprintf(
		"%s.%s",
		sanitize.BaseName(fileName[:len(fileName)-len(ext)]),
		cleanExt[1:],
	), "-", "_", -1)
}

func createFormReader(data map[string]string) io.Reader {
	form := url.Values{}
	for k, v := range data {
		form.Add(k, v)
	}
	return strings.NewReader(form.Encode())
}

func createMultipartReader(boundary string, data map[string][]byte) io.Reader {
	dashBoundary := "--" + boundary

	body := []byte{}
	buffer := bytes.NewBuffer(body)

	buffer.WriteString("Content-type: multipart/form-data; boundary=" + boundary + "\n\n")
	for contentType, content := range data {
		buffer.WriteString(dashBoundary + "\n")
		buffer.WriteString("Content-Disposition: form-data; name=" + contentType + "\n")
		buffer.WriteString(fmt.Sprintf("Content-Length: %d \n\n", len(content)))
		buffer.Write(content)
		buffer.WriteString("\n")
	}
	buffer.WriteString(dashBoundary + "--\n\n")
	return buffer
}

// randomBoundary was borrowed from
// github.com/golang/go/mime/multipart/writer.go#randomBoundary
func randomBoundary() string {
	var buf [30]byte
	_, err := io.ReadFull(rand.Reader, buf[:])
	if err != nil {
		panic(err)
	}
	return fmt.Sprintf("%x", buf[:])
}

func isYesString(s string) bool {
	switch strings.ToLower(s) {
	case "1", "yes", "true", "y":
		return true
	}
	return false
}

func createJar(s storage.Storage) http.CookieJar {
	return &cookieJarSerializer{store: s, lock: &sync.RWMutex{}}
}

func (j *cookieJarSerializer) SetCookies(u *url.URL, cookies []*http.Cookie) {
	j.lock.Lock()
	defer j.lock.Unlock()
	cookieStr := j.store.Cookies(u)

	// Merge existing cookies, new cookies have precedence.
	cnew := make([]*http.Cookie, len(cookies))
	copy(cnew, cookies)
	existing := storage.UnstringifyCookies(cookieStr)
	for _, c := range existing {
		if !storage.ContainsCookie(cnew, c.Name) {
			cnew = append(cnew, c)
		}
	}
	j.store.SetCookies(u, storage.StringifyCookies(cnew))
}

func (j *cookieJarSerializer) Cookies(u *url.URL) []*http.Cookie {
	cookies := storage.UnstringifyCookies(j.store.Cookies(u))
	// Filter.
	now := time.Now()
	cnew := make([]*http.Cookie, 0, len(cookies))
	for _, c := range cookies {
		// Drop expired cookies.
		if c.RawExpires != "" && c.Expires.Before(now) {
			continue
		}
		// Drop secure cookies if not over https.
		if c.Secure && u.Scheme != "https" {
			continue
		}
		cnew = append(cnew, c)
	}
	return cnew
}

func isMatchingFilter(fs []*regexp.Regexp, d []byte) bool {
	for _, r := range fs {
		if r.Match(d) {
			return true
		}
	}
	return false
}

func streamToByte(r io.Reader) []byte {
	buf := new(bytes.Buffer)
	buf.ReadFrom(r)

	if strReader, k := r.(*strings.Reader); k {
		strReader.Seek(0, 0)
	} else if bReader, kb := r.(*bytes.Reader); kb {
		bReader.Seek(0, 0)
	}

	return buf.Bytes()
}