from time import time
from urllib.parse import urlparse, urlunparse, urldefrag

from twisted.web.http import HTTPClient
from twisted.internet import defer, reactor
from twisted.internet.protocol import ClientFactory

from scrapy.http import Headers
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes, to_unicode
from scrapy.responsetypes import responsetypes


def _parsed_url_args(parsed):
    """Break a urlparse() result into the (scheme, netloc, host, port, path)
    tuple the Twisted client code expects: everything as bytes, except the
    port, which is an int.

    The input is assumed to come from Request.url, which was passed through
    safe_url_string and is therefore ascii-only.
    """
    relative = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    scheme = to_bytes(parsed.scheme, encoding="ascii")
    # Fall back to the default port for the scheme when none is given.
    port = parsed.port
    if port is None:
        port = 443 if scheme == b'https' else 80
    return (
        scheme,
        to_bytes(parsed.netloc, encoding="ascii"),
        to_bytes(parsed.hostname, encoding="ascii"),
        port,
        to_bytes(relative, encoding="ascii"),
    )


def _parse(url):
    """ Return tuple of (scheme, netloc, host, port, path),
    all in bytes except for port which is int.
    Assume url is from Request.url, which was passed via safe_url_string
    and is ascii-only.
    """
    return _parsed_url_args(urlparse(url.strip()))


class ScrapyHTTPPageGetter(HTTPClient):
    """HTTP/1.0 protocol: sends the request described by its factory and
    reports status line, headers and body back to that factory."""

    delimiter = b'\n'

    def connectionMade(self):
        # Bucket for response headers as they are received.
        self.headers = Headers()

        # Request line first, then headers, then the optional body.
        self.sendCommand(self.factory.method, self.factory.path)
        for name, values in self.factory.headers.items():
            for val in values:
                self.sendHeader(name, val)
        self.endHeaders()
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        # Drop trailing whitespace (e.g. a stray '\r') before the standard
        # HTTPClient line handling.
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        # Remember why the connection dropped so handleResponse can report
        # it for truncated bodies.
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == b'HEAD':
            # A HEAD response carries no body by definition.
            self.factory.page(b'')
        elif self.length is not None and self.length > 0:
            # Fewer bytes arrived than Content-Length announced: treat the
            # response as a failure, using the connection-loss reason.
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()

        # transport cleanup needed for HTTPS connections
        if self.factory.url.startswith(b'https'):
            self.transport.stopProducing()

        self.factory.noPage(
            defer.TimeoutError(f"Getting {self.factory.url} took longer "
                               f"than {self.factory.timeout} seconds."))


# This class used to inherit from Twisted’s
# twisted.web.client.HTTPClientFactory. When that class was deprecated in
# Twisted (https://github.com/twisted/twisted/pull/643), we merged its
# non-overridden code into this class.
class ScrapyHTTPClientFactory(ClientFactory):
    """ClientFactory that adapts a Scrapy Request to the legacy HTTP/1.0
    downloader protocol and exposes the eventual Response (or failure)
    through ``self.deferred``."""

    protocol = ScrapyHTTPPageGetter

    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        self._url = urldefrag(request.url)[0]
        # converting to bytes to comply to Twisted interface
        self.url = to_bytes(self._url, encoding='ascii')
        self.method = to_bytes(request.method, encoding='ascii')
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        # A per-request download_timeout overrides the constructor default.
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own redirect handling, the
        # _waitForDisconnect callback is not needed; this also avoids an
        # AttributeError when clientConnectionFailed is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # Host header is derived from the (possibly proxied) target netloc.
        self.headers.setdefault('Host', self.netloc)

        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")
        elif self.method == b'POST':
            # Content-Length must be specified in POST method even with no body
            self.headers['Content-Length'] = 0

    def __repr__(self):
        return f"<{self.__class__.__name__}: {self.url}>"

    def _build_response(self, body, request):
        """Assemble the recorded status/headers plus *body* into a Response."""
        request.meta['download_latency'] = self.headers_time - self.start_time
        headers = Headers(self.response_headers)
        respcls = responsetypes.from_args(headers=headers, url=self._url)
        return respcls(
            url=self._url,
            status=int(self.status),
            headers=headers,
            body=body,
            protocol=to_unicode(self.version),
        )

    def _set_connection_attributes(self, request):
        # Connection endpoint defaults to the request URL itself...
        (self.scheme, self.netloc, self.host,
         self.port, self.path) = _parsed_url_args(urlparse_cached(request))
        proxy = request.meta.get('proxy')
        if proxy:
            # ...but with a proxy configured, connect to the proxy and send
            # the absolute request URL as the path.
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def _cancelTimeout(self, result, timeoutCall):
        # Passthrough callback: cancel the pending timeout, keep the result.
        if timeoutCall.active():
            timeoutCall.cancel()
        return result

    def buildProtocol(self, addr):
        proto = ClientFactory.buildProtocol(self, addr)
        proto.followRedirect = self.followRedirect
        proto.afterFoundGet = self.afterFoundGet
        if self.timeout:
            timeoutCall = reactor.callLater(self.timeout, proto.timeout)
            self.deferred.addBoth(self._cancelTimeout, timeoutCall)
        return proto

    def gotHeaders(self, headers):
        # headers_time feeds the download_latency computation.
        self.headers_time = time()
        self.response_headers = headers

    def gotStatus(self, version, status, message):
        """
        Set the status of the request on us.
        @param version: The HTTP version.
        @type version: L{bytes}
        @param status: The HTTP status code, an integer represented as a
        bytestring.
        @type status: L{bytes}
        @param message: The HTTP status message.
        @type message: L{bytes}
        """
        self.version = version
        self.status = status
        self.message = message

    def page(self, page):
        # Only the first result (success or failure) is delivered.
        if not self.waiting:
            return
        self.waiting = 0
        self.deferred.callback(page)

    def noPage(self, reason):
        # Only the first result (success or failure) is delivered.
        if not self.waiting:
            return
        self.waiting = 0
        self.deferred.errback(reason)

    def clientConnectionFailed(self, _, reason):
        """
        When a connection attempt fails, the request cannot be issued.  If no
        result has yet been provided to the result Deferred, provide the
        connection failure reason as an error result.
        """
        if self.waiting:
            self.waiting = 0
            # If the connection attempt failed, there is nothing more to
            # disconnect, so just fire that Deferred now.
            self._disconnectedDeferred.callback(None)
            self.deferred.errback(reason)