"""
This is a middleware to respect robots.txt policies. To activate it, enable
this middleware and set the ROBOTSTXT_OBEY setting to True.
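
For example, in a project's settings.py:

    ROBOTSTXT_OBEY = True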

"""

import logging

from twisted.internet.defer import Deferred, maybeDeferred
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import load_object

logger = logging.getLogger(__name__)


class RobotsTxtMiddleware:
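    # Priority given to robots.txt requests so they are downloaded before the
    # requests that are waiting on them.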
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured
        self._default_useragent = crawler.settings.get('USER_AGENT', 'Scrapy')
        self._robotstxt_useragent = crawler.settings.get('ROBOTSTXT_USER_AGENT', None)
        self.crawler = crawler
        self._parsers = {}
        self._parserimpl = load_object(crawler.settings.get('ROBOTSTXT_PARSER'))

        # Check that the parser implementation's dependencies are met;
        # this should raise an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b'')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
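        # Requests flagged with dont_obey_robotstxt (e.g. robots.txt downloads
        # themselves) bypass the check; everything else waits for the parser.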
        if request.meta.get('dont_obey_robotstxt'):
            return
        d = maybeDeferred(self.robot_parser, request, spider)
        d.addCallback(self.process_request_2, request, spider)
        return d

    def process_request_2(self, rp, request, spider):
        if rp is None:
            return

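        # Match against ROBOTSTXT_USER_AGENT when it is set, otherwise use the
        # request's own User-Agent header (falling back to USER_AGENT).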
        useragent = self._robotstxt_useragent
        if not useragent:
            useragent = request.headers.get(b'User-Agent', self._default_useragent)
        if not rp.allowed(request.url, useragent):
            logger.debug("Forbidden by robots.txt: %(request)s",
                         {'request': request}, extra={'spider': spider})
            self.crawler.stats.inc_value('robotstxt/forbidden')
            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc

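        # self._parsers maps each netloc to a Deferred while its robots.txt is
        # being downloaded, and to the parser object (or None on failure) once
        # the download has finished.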
        if netloc not in self._parsers:
            self._parsers[netloc] = Deferred()
            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={'dont_obey_robotstxt': True}
            )
            dfd = self.crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots, netloc, spider)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value('robotstxt/request_count')

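        # Still downloading: chain onto the pending Deferred so the caller
        # waits for the parser; otherwise return the cached parser (or None).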
        if isinstance(self._parsers[netloc], Deferred):
            d = Deferred()

            def cb(result):
                d.callback(result)
                return result
            self._parsers[netloc].addCallback(cb)
            return d
        else:
            return self._parsers[netloc]

    def _logerror(self, failure, request, spider):
        if failure.type is not IgnoreRequest:
            logger.error("Error downloading %(request)s: %(f_exception)s",
                         {'request': request, 'f_exception': failure.value},
                         exc_info=failure_to_exc_info(failure),
                         extra={'spider': spider})
        return failure

    def _parse_robots(self, response, netloc, spider):
        self.crawler.stats.inc_value('robotstxt/response_count')
        self.crawler.stats.inc_value(f'robotstxt/response_status_count/{response.status}')
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
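        # Swap the pending Deferred for the parser and fire it so requests
        # waiting on this netloc can proceed.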
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)

    def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = f'robotstxt/exception_count/{failure.type}'
            self.crawler.stats.inc_value(key)
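        # Store None so later requests to this netloc are allowed when
        # robots.txt could not be downloaded.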
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None)