1"""
2Url Length Spider Middleware
3
4See documentation in docs/topics/spider-middleware.rst
5"""
6
7import logging
8
9from scrapy.http import Request
10from scrapy.exceptions import NotConfigured
11
12logger = logging.getLogger(__name__)
13
14
class UrlLengthMiddleware:
    """Spider middleware that drops requests whose URL exceeds a length limit.

    The limit is the ``maxlength`` given to the constructor; requests with a
    URL strictly longer than it are removed from the spider output.
    """

    def __init__(self, maxlength):
        # Longest URL (in characters) that is still let through.
        self.maxlength = maxlength

    @classmethod
    def from_settings(cls, settings):
        """Create the middleware from the ``URLLENGTH_LIMIT`` setting.

        Raises ``NotConfigured`` when the setting evaluates to a falsy
        value (for instance ``0``).
        """
        limit = settings.getint('URLLENGTH_LIMIT')
        if limit:
            return cls(limit)
        raise NotConfigured

    def process_spider_output(self, response, result, spider):
        """Lazily yield the entries of ``result`` that pass the length check.

        Entries that are not ``Request`` instances always pass. A falsy
        ``result`` (such as ``None``) is treated as an empty iterable. Each
        dropped request is logged at INFO level and counted in the
        ``urllength/request_ignored_count`` stat.
        """
        def _too_long(entry):
            # Only Request objects carry a URL to measure.
            return isinstance(entry, Request) and len(entry.url) > self.maxlength

        def _keep(entry):
            if not _too_long(entry):
                return True
            logger.info(
                "Ignoring link (url length > %(maxlength)d): %(url)s ",
                {'maxlength': self.maxlength, 'url': entry.url},
                extra={'spider': spider}
            )
            spider.crawler.stats.inc_value('urllength/request_ignored_count', spider=spider)
            return False

        return (entry for entry in result or () if _keep(entry))
41