"""
Url Length Spider Middleware

See documentation in docs/topics/spider-middleware.rst
"""

import logging

from scrapy.http import Request
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class UrlLengthMiddleware:
    """Spider middleware that drops requests whose URL is longer than a limit."""

    def __init__(self, maxlength):
        """Store the maximum URL length (in characters) allowed through."""
        self.maxlength = maxlength

    @classmethod
    def from_settings(cls, settings):
        """Create the middleware from the ``URLLENGTH_LIMIT`` setting.

        Raises ``NotConfigured`` when the setting reads as a falsy value
        (for example ``0``).
        """
        limit = settings.getint('URLLENGTH_LIMIT')
        if limit:
            return cls(limit)
        raise NotConfigured

    def process_spider_output(self, response, result, spider):
        """Lazily yield the entries of ``result`` that pass the length check.

        Entries that are not ``Request`` instances are always passed through.
        A ``Request`` whose URL is longer than ``self.maxlength`` is skipped;
        each skip is logged at INFO level and counted in the crawler stats
        under ``urllength/request_ignored_count``. A falsy ``result`` is
        treated as an empty iterable.
        """
        def _is_allowed(entry):
            # Only Request objects carry a URL to measure.
            if not isinstance(entry, Request):
                return True
            if len(entry.url) <= self.maxlength:
                return True

            logger.info(
                "Ignoring link (url length > %(maxlength)d): %(url)s ",
                {'maxlength': self.maxlength, 'url': entry.url},
                extra={'spider': spider},
            )
            spider.crawler.stats.inc_value(
                'urllength/request_ignored_count',
                spider=spider,
            )
            return False

        return (entry for entry in result or () if _is_allowed(entry))