"""
Base class for Scrapy spiders.

See documentation in docs/topics/spiders.rst
"""
import logging
import warnings
from typing import Optional

from scrapy import signals
from scrapy.http import Request
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import url_is_from_spider
from scrapy.utils.deprecate import method_is_overridden


class Spider(object_ref):
    """Common base class that every Scrapy spider is expected to subclass.

    A spider needs a ``name`` (given as a class attribute or passed to the
    constructor) and may list its initial URLs in ``start_urls``.
    """

    # Spider identifier; also used as the name of the spider's logger.
    name: Optional[str] = None
    # Optional settings applied with 'spider' priority by update_settings().
    custom_settings: Optional[dict] = None

    def __init__(self, name=None, **kwargs):
        """Initialise the spider.

        An explicit ``name`` overrides the class-level one. When none is
        passed, a truthy ``name`` attribute must already exist, otherwise
        ``ValueError`` is raised. Every extra keyword argument becomes an
        instance attribute, and ``start_urls`` defaults to an empty list
        when nothing else defined it.
        """
        if name is None:
            if not getattr(self, 'name', None):
                raise ValueError(f"{type(self).__name__} must have a name")
        else:
            self.name = name
        self.__dict__.update(kwargs)
        if not hasattr(self, 'start_urls'):
            self.start_urls = []

    @property
    def logger(self):
        """Return a ``LoggerAdapter`` around the logger named after the
        spider, carrying the spider itself under the ``'spider'`` extra key.
        """
        return logging.LoggerAdapter(
            logging.getLogger(self.name),
            {'spider': self},
        )

    def log(self, message, level=logging.DEBUG, **kw):
        """Emit ``message`` at ``level`` through the spider's logger.

        This is only a thin convenience wrapper: ``self.logger`` can be
        used directly (e.g. ``self.logger.info('msg')``), as can any other
        Python logger.
        """
        self.logger.log(level, message, **kw)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build a spider from ``args``/``kwargs`` and bind it to ``crawler``."""
        instance = cls(*args, **kwargs)
        instance._set_crawler(crawler)
        return instance

    def _set_crawler(self, crawler):
        """Store the crawler and its settings on the spider and connect
        ``close`` to the ``spider_closed`` signal.
        """
        self.crawler = crawler
        self.settings = crawler.settings
        crawler.signals.connect(self.close, signals.spider_closed)

    def start_requests(self):
        """Yield one request per entry of ``start_urls``.

        Raises ``AttributeError`` when ``start_urls`` is empty but a
        ``start_url`` attribute exists (a likely typo). If a subclass
        overrides the deprecated ``make_requests_from_url``, a warning is
        issued and that method builds the requests; otherwise plain
        ``Request`` objects with ``dont_filter=True`` are produced.
        """
        spider_cls = self.__class__
        if not self.start_urls and hasattr(self, 'start_url'):
            raise AttributeError(
                "Crawling could not start: 'start_urls' not found "
                "or empty (but found 'start_url' attribute instead, "
                "did you miss an 's'?)")

        # Regular path: nothing overrides the deprecated hook.
        if not method_is_overridden(spider_cls, Spider, 'make_requests_from_url'):
            for start_url in self.start_urls:
                yield Request(start_url, dont_filter=True)
            return

        # Legacy path: honour the override, but tell the user about it.
        warnings.warn(
            "Spider.make_requests_from_url method is deprecated; it "
            "won't be called in future Scrapy releases. Please "
            "override Spider.start_requests method instead "
            f"(see {spider_cls.__module__}.{spider_cls.__name__}).",
        )
        for start_url in self.start_urls:
            yield self.make_requests_from_url(start_url)

    def make_requests_from_url(self, url):
        """Deprecated: warn, then return an unfiltered ``Request`` for ``url``."""
        warnings.warn(
            "Spider.make_requests_from_url method is deprecated: "
            "it will be removed and not be called by the default "
            "Spider.start_requests method in future Scrapy releases. "
            "Please override Spider.start_requests method instead."
        )
        return Request(url, dont_filter=True)

    def _parse(self, response, **kwargs):
        """Internal entry point that simply forwards to ``parse``."""
        return self.parse(response, **kwargs)

    def parse(self, response, **kwargs):
        """Response callback placeholder; the base version always raises
        ``NotImplementedError``.
        """
        owner = self.__class__.__name__
        raise NotImplementedError(f'{owner}.parse callback is not defined')

    @classmethod
    def update_settings(cls, settings):
        """Apply ``custom_settings`` (or an empty dict when unset) to
        ``settings`` with ``'spider'`` priority.
        """
        overrides = cls.custom_settings or {}
        settings.setdict(overrides, priority='spider')

    @classmethod
    def handles_request(cls, request):
        """Return what ``url_is_from_spider`` reports for the request URL
        and this spider class.
        """
        request_url = request.url
        return url_is_from_spider(request_url, cls)

    @staticmethod
    def close(spider, reason):
        """Call ``spider.closed(reason)`` when that attribute is callable and
        return its result; otherwise return ``None``.
        """
        hook = getattr(spider, 'closed', None)
        if not callable(hook):
            return None
        return hook(reason)

    def __str__(self):
        """Return ``<ClassName 'name' at 0xADDRESS>``."""
        cls_name = type(self).__name__
        address = id(self)
        return f"<{cls_name} {self.name!r} at 0x{address:0x}>"

    __repr__ = __str__


# Top-level imports: make the bundled spider classes available from this
# package.
# NOTE(review): these presumably sit below the Spider definition because the
# submodules import Spider from this package - confirm before moving them.
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.spiders.feed import XMLFeedSpider, CSVFeedSpider
from scrapy.spiders.sitemap import SitemapSpider