1from time import time
2from urllib.parse import urlparse, urlunparse, urldefrag
3
4from twisted.web.http import HTTPClient
5from twisted.internet import defer, reactor
6from twisted.internet.protocol import ClientFactory
7
8from scrapy.http import Headers
9from scrapy.utils.httpobj import urlparse_cached
10from scrapy.utils.python import to_bytes, to_unicode
11from scrapy.responsetypes import responsetypes
12
13
def _parsed_url_args(parsed):
    """Return ``(scheme, netloc, host, port, path)`` for a parsed URL.

    All elements are bytes except ``port``, which is an int. ``parsed`` is
    assumed to be the urlparse() result of Request.url, which already went
    through safe_url_string and is therefore ascii-only. A missing port
    defaults to 443 for https and 80 otherwise.
    """
    relative = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    scheme = to_bytes(parsed.scheme, encoding="ascii")
    netloc = to_bytes(parsed.netloc, encoding="ascii")
    host = to_bytes(parsed.hostname, encoding="ascii")
    path = to_bytes(relative, encoding="ascii")
    if parsed.port is not None:
        port = parsed.port
    else:
        port = 443 if scheme == b'https' else 80
    return scheme, netloc, host, port, path
26
27
def _parse(url):
    """Split *url* into a ``(scheme, netloc, host, port, path)`` tuple.

    Every element is bytes except ``port``, which is an int. The url is
    assumed to come from Request.url, i.e. it was passed through
    safe_url_string and is ascii-only.
    """
    return _parsed_url_args(urlparse(url.strip()))
37
38
class ScrapyHTTPPageGetter(HTTPClient):
    """Protocol that writes the request held by its factory to the
    transport and relays status/headers/body events back to the factory.
    """

    # Tolerate bare-LF line endings; lineReceived strips the trailing CR.
    delimiter = b'\n'

    def connectionMade(self):
        """Send the complete request (command, headers, body) on connect."""
        self.headers = Headers()  # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        # Strip trailing whitespace (notably the CR of a CRLF pair) so the
        # parent parser sees clean status/header lines either way.
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        # appendlist keeps repeated headers (e.g. Set-Cookie) as a list.
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        # Remember why the connection went away: handleResponse() may need
        # it to errback a truncated response.
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == b'HEAD':
            # HEAD responses carry no body by definition.
            self.factory.page(b'')
        elif self.length is not None and self.length > 0:
            # Fewer body bytes arrived than the announced Content-Length:
            # report the connection-loss reason instead of a partial page.
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        """Called by the factory's delayed call when the download timed out."""
        self.transport.loseConnection()

        # transport cleanup needed for HTTPS connections
        if self.factory.url.startswith(b'https'):
            self.transport.stopProducing()

        self.factory.noPage(
            defer.TimeoutError(f"Getting {self.factory.url} took longer "
                               f"than {self.factory.timeout} seconds."))
93
94
95# This class used to inherit from Twisted’s
96# twisted.web.client.HTTPClientFactory. When that class was deprecated in
97# Twisted (https://github.com/twisted/twisted/pull/643), we merged its
# non-overridden code into this class.
class ScrapyHTTPClientFactory(ClientFactory):
    """Client factory performing a single download for a scrapy Request.

    The result is delivered through ``self.deferred``, which fires with
    the built Response object, or errbacks with the failure reason.
    """

    protocol = ScrapyHTTPPageGetter

    # Truthy until the result deferred has fired (see page()/noPage()).
    waiting = 1
    noisy = False
    # Redirects are handled by Scrapy itself, not by the protocol.
    followRedirect = False
    afterFoundGet = False

    def _build_response(self, body, request):
        """Build the Response for *body*, recording download latency."""
        request.meta['download_latency'] = self.headers_time - self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        # Pick the most specific Response subclass from headers + url.
        respcls = responsetypes.from_args(headers=headers, url=self._url)
        return respcls(url=self._url, status=status, headers=headers, body=body, protocol=to_unicode(self.version))

    def _set_connection_attributes(self, request):
        """Derive scheme/netloc/host/port/path from the request.

        When a proxy is configured, connect to the proxy's host/port
        instead and send the absolute request URL as the path.
        """
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        proxy = request.meta.get('proxy')
        if proxy:
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def __init__(self, request, timeout=180):
        # The fragment is never sent to the server.
        self._url = urldefrag(request.url)[0]
        # converting to bytes to comply to Twisted interface
        self.url = to_bytes(self._url, encoding='ascii')
        self.method = to_bytes(request.method, encoding='ascii')
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        # Per-request download_timeout (meta) overrides the default.
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own logic to handle redirects, there is
        # no need to add the callback _waitForDisconnect.
        # Specifically this avoids the AttributeError exception when
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")
        # Content-Length must be specified in POST method even with no body
        elif self.method == b'POST':
            self.headers['Content-Length'] = 0

    def __repr__(self):
        return f"<{self.__class__.__name__}: {self.url}>"

    def _cancelTimeout(self, result, timeoutCall):
        # Passthrough callback: cancel the pending timeout once the
        # request finished (either way) and forward the result unchanged.
        if timeoutCall.active():
            timeoutCall.cancel()
        return result

    def buildProtocol(self, addr):
        p = ClientFactory.buildProtocol(self, addr)
        p.followRedirect = self.followRedirect
        p.afterFoundGet = self.afterFoundGet
        if self.timeout:
            # Arm the download timeout; it is cancelled as soon as the
            # result deferred fires (callback or errback).
            timeoutCall = reactor.callLater(self.timeout, p.timeout)
            self.deferred.addBoth(self._cancelTimeout, timeoutCall)
        return p

    def gotHeaders(self, headers):
        # headers_time marks the end of server response time; used for the
        # download_latency computed in _build_response.
        self.headers_time = time()
        self.response_headers = headers

    def gotStatus(self, version, status, message):
        """
        Set the status of the request on us.
        @param version: The HTTP version.
        @type version: L{bytes}
        @param status: The HTTP status code, an integer represented as a
            bytestring.
        @type status: L{bytes}
        @param message: The HTTP status message.
        @type message: L{bytes}
        """
        self.version, self.status, self.message = version, status, message

    def page(self, page):
        """Fire the result deferred with the response body (at most once)."""
        if self.waiting:
            self.waiting = 0
            self.deferred.callback(page)

    def noPage(self, reason):
        """Errback the result deferred with *reason* (at most once)."""
        if self.waiting:
            self.waiting = 0
            self.deferred.errback(reason)

    def clientConnectionFailed(self, _, reason):
        """
        When a connection attempt fails, the request cannot be issued.  If no
        result has yet been provided to the result Deferred, provide the
        connection failure reason as an error result.
        """
        if self.waiting:
            self.waiting = 0
            # If the connection attempt failed, there is nothing more to
            # disconnect, so just fire that Deferred now.
            self._disconnectedDeferred.callback(None)
            self.deferred.errback(reason)
213