1""" 2This is a middleware to respect robots.txt policies. To activate it you must 3enable this middleware and enable the ROBOTSTXT_OBEY setting. 4 5""" 6 7import logging 8 9from twisted.internet.defer import Deferred, maybeDeferred 10from scrapy.exceptions import NotConfigured, IgnoreRequest 11from scrapy.http import Request 12from scrapy.utils.httpobj import urlparse_cached 13from scrapy.utils.log import failure_to_exc_info 14from scrapy.utils.misc import load_object 15 16logger = logging.getLogger(__name__) 17 18 19class RobotsTxtMiddleware: 20 DOWNLOAD_PRIORITY = 1000 21 22 def __init__(self, crawler): 23 if not crawler.settings.getbool('ROBOTSTXT_OBEY'): 24 raise NotConfigured 25 self._default_useragent = crawler.settings.get('USER_AGENT', 'Scrapy') 26 self._robotstxt_useragent = crawler.settings.get('ROBOTSTXT_USER_AGENT', None) 27 self.crawler = crawler 28 self._parsers = {} 29 self._parserimpl = load_object(crawler.settings.get('ROBOTSTXT_PARSER')) 30 31 # check if parser dependencies are met, this should throw an error otherwise. 32 self._parserimpl.from_crawler(self.crawler, b'') 33 34 @classmethod 35 def from_crawler(cls, crawler): 36 return cls(crawler) 37 38 def process_request(self, request, spider): 39 if request.meta.get('dont_obey_robotstxt'): 40 return 41 d = maybeDeferred(self.robot_parser, request, spider) 42 d.addCallback(self.process_request_2, request, spider) 43 return d 44 45 def process_request_2(self, rp, request, spider): 46 if rp is None: 47 return 48 49 useragent = self._robotstxt_useragent 50 if not useragent: 51 useragent = request.headers.get(b'User-Agent', self._default_useragent) 52 if not rp.allowed(request.url, useragent): 53 logger.debug("Forbidden by robots.txt: %(request)s", 54 {'request': request}, extra={'spider': spider}) 55 self.crawler.stats.inc_value('robotstxt/forbidden') 56 raise IgnoreRequest("Forbidden by robots.txt") 57 58 def robot_parser(self, request, spider): 59 url = urlparse_cached(request) 60 netloc = url.netloc 61 62 if netloc not in self._parsers: 63 self._parsers[netloc] = Deferred() 64 robotsurl = f"{url.scheme}://{url.netloc}/robots.txt" 65 robotsreq = Request( 66 robotsurl, 67 priority=self.DOWNLOAD_PRIORITY, 68 meta={'dont_obey_robotstxt': True} 69 ) 70 dfd = self.crawler.engine.download(robotsreq, spider) 71 dfd.addCallback(self._parse_robots, netloc, spider) 72 dfd.addErrback(self._logerror, robotsreq, spider) 73 dfd.addErrback(self._robots_error, netloc) 74 self.crawler.stats.inc_value('robotstxt/request_count') 75 76 if isinstance(self._parsers[netloc], Deferred): 77 d = Deferred() 78 79 def cb(result): 80 d.callback(result) 81 return result 82 self._parsers[netloc].addCallback(cb) 83 return d 84 else: 85 return self._parsers[netloc] 86 87 def _logerror(self, failure, request, spider): 88 if failure.type is not IgnoreRequest: 89 logger.error("Error downloading %(request)s: %(f_exception)s", 90 {'request': request, 'f_exception': failure.value}, 91 exc_info=failure_to_exc_info(failure), 92 extra={'spider': spider}) 93 return failure 94 95 def _parse_robots(self, response, netloc, spider): 96 self.crawler.stats.inc_value('robotstxt/response_count') 97 self.crawler.stats.inc_value(f'robotstxt/response_status_count/{response.status}') 98 rp = self._parserimpl.from_crawler(self.crawler, response.body) 99 rp_dfd = self._parsers[netloc] 100 self._parsers[netloc] = rp 101 rp_dfd.callback(rp) 102 103 def _robots_error(self, failure, netloc): 104 if failure.type is not IgnoreRequest: 105 key = 
f'robotstxt/exception_count/{failure.type}' 106 self.crawler.stats.inc_value(key) 107 rp_dfd = self._parsers[netloc] 108 self._parsers[netloc] = None 109 rp_dfd.callback(None) 110
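

# --- Illustrative configuration (a sketch, not part of this module) ---
# The module docstring says to enable this middleware and the ROBOTSTXT_OBEY
# setting; the lines below show what that could look like in a project's
# settings.py. The parser path and user agent values are examples only, not
# requirements of this module.
#
#   ROBOTSTXT_OBEY = True
#   # Optional: dotted path of the parser implementation loaded via load_object().
#   ROBOTSTXT_PARSER = "scrapy.robotstxt.ProtegoRobotParser"
#   # Optional: user agent string matched against robots.txt rules. If unset,
#   # the request's User-Agent header is used, falling back to USER_AGENT.
#   ROBOTSTXT_USER_AGENT = "MyCrawler (+https://example.com)"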