1"""URL opener.
2
3Copyright 2004-2006 John J Lee <jjl@pobox.com>
4
5This code is free software; you can redistribute it and/or modify it
6under the terms of the BSD or ZPL 2.1 licenses (see the file
7LICENSE included with the distribution).
8
9"""
10
11from __future__ import absolute_import
12
13import bisect
14import os
15import tempfile
16import threading
17
18from . import _response
19from . import _rfc3986
20from . import _sockettimeout
21from . import _urllib2_fork
22from ._request import Request
23from ._util import isstringlike
24from .polyglot import HTTPError, URLError, iteritems, is_class
25
26
# Keep a reference to the builtin open(): retrieve() shadows the name with
# its ``open`` keyword parameter, whose default is this alias.
open_file = open
28
29
class ContentTooShortError(URLError):
    """Raised when fewer bytes were read than Content-Length promised.

    The ``(filename, headers)`` pair that retrieve() would otherwise
    have returned is kept on the ``result`` attribute.
    """

    def __init__(self, reason, result):
        super(ContentTooShortError, self).__init__(reason)
        self.result = result
35
36
def set_request_attr(req, name, value, default):
    """Set attribute *name* on *req*, falling back to *default*.

    If *value* differs from *default* (by identity), it is assigned
    unconditionally.  Otherwise the attribute is only initialised to
    *default* when the request does not already carry it, so an existing
    value is preserved.

    The original implementation probed with getattr() and could assign
    twice (default, then value); this version performs a single
    assignment with the same final state.
    """
    if value is not default:
        setattr(req, name, value)
    elif not hasattr(req, name):
        setattr(req, name, default)
44
45
class OpenerDirector(_urllib2_fork.OpenerDirector):
    """Opener that indexes its handlers by URL scheme and handler method.

    Extends the forked urllib2 OpenerDirector with lazily rebuilt lookup
    tables (see _maybe_reindex_handlers), "any_request"/"any_response"
    processors that run for every scheme, and a retrieve() method whose
    temporary files are deleted by close().
    """

    def __init__(self):
        _urllib2_fork.OpenerDirector.__init__(self)
        # really none of these are (sanely) public -- the lack of initial
        # underscore on some is just due to following urllib2
        # (all four mappings below are replaced wholesale -- some by
        # sets -- when _maybe_reindex_handlers() rebuilds the index)
        self.process_response = {}
        self.process_request = {}
        self._any_request = {}
        self._any_response = {}
        self._handler_index_valid = True
        # paths of temp files created by retrieve(); unlinked in close()
        self._tempfiles = []

    def add_handler(self, handler):
        """Register *handler* (a BaseHandler) and mark the index stale."""
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        if handler in self.handlers:
            return
        # XXX why does self.handlers need to be sorted?
        bisect.insort(self.handlers, handler)
        handler.add_parent(self)
        self._handler_index_valid = False

    def _maybe_reindex_handlers(self):
        """Rebuild the scheme -> handlers lookup tables when stale.

        Handler capabilities are discovered purely by method name:

        * ``<scheme>_open``         -> handle_open
        * ``<scheme>_request``      -> process_request
        * ``<scheme>_response``     -> process_response
        * ``<scheme>_error_<kind>`` -> handle_error (kind is stored as an
          int when it parses as one, e.g. http_error_404)
        * ``any_request`` / ``any_response`` -> run for every scheme

        Handlers providing none of these methods are removed from
        self.handlers entirely.
        """
        if self._handler_index_valid:
            return

        handle_error = {}
        handle_open = {}
        process_request = {}
        process_response = {}
        any_request = set()
        any_response = set()
        unwanted = []

        for handler in self.handlers:
            added = False
            for meth in dir(handler):
                if meth in ["redirect_request", "do_open", "proxy_open"]:
                    # oops, coincidental match
                    continue

                if meth == "any_request":
                    any_request.add(handler)
                    added = True
                    continue
                elif meth == "any_response":
                    any_response.add(handler)
                    added = True
                    continue

                # split "<scheme>_<condition>" at the first underscore
                ii = meth.find("_")
                scheme = meth[:ii]
                condition = meth[ii + 1:]

                if condition.startswith("error"):
                    # "<scheme>_error_<kind>": kind follows the second "_"
                    jj = meth[ii + 1:].find("_") + ii + 1
                    kind = meth[jj + 1:]
                    try:
                        kind = int(kind)
                    except ValueError:
                        pass
                    lookup = handle_error.setdefault(scheme, {})
                elif condition == "open":
                    kind = scheme
                    lookup = handle_open
                elif condition == "request":
                    kind = scheme
                    lookup = process_request
                elif condition == "response":
                    kind = scheme
                    lookup = process_response
                else:
                    continue

                lookup.setdefault(kind, set()).add(handler)
                added = True

            if not added:
                unwanted.append(handler)

        for handler in unwanted:
            self.handlers.remove(handler)

        # sort indexed methods
        # XXX could be cleaned up
        # NOTE(review): this first loop is a no-op -- it stores each set
        # back unchanged.  open() sorts these sets per request anyway,
        # so presumably that is why they are left unsorted here; confirm.
        for lookup in [process_request, process_response]:
            for scheme, handlers in iteritems(lookup):
                lookup[scheme] = handlers
        for scheme, lookup in iteritems(handle_error):
            for code, handlers in iteritems(lookup):
                handlers = list(handlers)
                handlers.sort()
                lookup[code] = handlers
        for scheme, handlers in iteritems(handle_open):
            handlers = list(handlers)
            handlers.sort()
            handle_open[scheme] = handlers

        # cache the indexes
        self.handle_error = handle_error
        self.handle_open = handle_open
        self.process_request = process_request
        self.process_response = process_response
        self._any_request = any_request
        self._any_response = any_response

    def _request(self, url_or_req, data, visit,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        """Coerce *url_or_req* (URL string or Request) into a Request."""
        if isstringlike(url_or_req):
            req = Request(url_or_req, data, visit=visit, timeout=timeout)
        else:
            # already a mechanize.Request instance
            req = url_or_req
            if data is not None:
                req.add_data(data)
            # XXX yuck
            set_request_attr(req, "visit", visit, None)
            set_request_attr(req, "timeout", timeout,
                             _sockettimeout._GLOBAL_DEFAULT_TIMEOUT)
        return req

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl*, running request and response processors.

        "any_request"/"<scheme>_request" processors run (in sorted
        handler order) before the fetch; "any_response"/
        "<scheme>_response" processors run after it.
        """
        req = self._request(fullurl, data, None, timeout)
        req_scheme = req.get_type()

        self._maybe_reindex_handlers()

        # pre-process request
        # XXX should we allow a Processor to change the URL scheme
        #   of the request?
        request_processors = set(self.process_request.get(req_scheme, []))
        request_processors.update(self._any_request)
        request_processors = list(request_processors)
        request_processors.sort()
        for processor in request_processors:
            for meth_name in ["any_request", req_scheme + "_request"]:
                meth = getattr(processor, meth_name, None)
                if meth:
                    req = meth(req)

        # In Python >= 2.4, .open() supports processors already, so we must
        # call ._open() instead.
        urlopen = _urllib2_fork.OpenerDirector._open
        response = urlopen(self, req, data)

        # post-process response
        response_processors = set(self.process_response.get(req_scheme, []))
        response_processors.update(self._any_response)
        response_processors = list(response_processors)
        response_processors.sort()
        for processor in response_processors:
            for meth_name in ["any_response", req_scheme + "_response"]:
                meth = getattr(processor, meth_name, None)
                if meth:
                    response = meth(req, response)

        return response

    def error(self, proto, *args):
        """Dispatch an error to the registered <scheme>_error_* handlers."""
        if proto in ['http', 'https']:
            # XXX http[s] protocols are special-cased
            # https is not different than http
            dict = self.handle_error['http']  # NOTE: shadows builtin dict
            # args[2] is the status code used to build "http_error_<code>"
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        # no handler consumed the error: fall back to http_error_default
        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

    # chunk size used when copying the response body in retrieve()
    BLOCK_SIZE = 1024 * 8

    def retrieve(self, fullurl, filename=None, reporthook=None, data=None,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT,
                 open=open_file):
        """Returns (filename, headers).

        For remote objects, the default filename will refer to a temporary
        file.  Temporary files are removed when the OpenerDirector.close()
        method is called.

        For file: URLs, at present the returned filename is None.  This may
        change in future.

        If the actual number of bytes read is less than indicated by the
        Content-Length header, raises ContentTooShortError (a URLError
        subclass).  The exception's .result attribute contains the (filename,
        headers) that would have been returned.

        The *open* argument is called as open(filename, 'wb') to create
        the destination file.
        """
        req = self._request(fullurl, data, False, timeout)
        scheme = req.get_type()
        fp = self.open(req)
        try:
            headers = fp.info()
            if filename is None and scheme == 'file':
                # XXX req.get_selector() seems broken here, return None,
                #   pending sanity :-/
                return None, headers
                # return urllib.url2pathname(req.get_selector()), headers
            if filename:
                tfp = open(filename, 'wb')
            else:
                # no filename given: write to a temp file whose suffix is
                # taken from the URL path, and remember it for close()
                path = _rfc3986.urlsplit(req.get_full_url())[2]
                suffix = os.path.splitext(path)[1]
                fd, filename = tempfile.mkstemp(suffix)
                self._tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                bs = self.BLOCK_SIZE
                size = -1
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["content-length"])
                    reporthook(blocknum, bs, size)
                # copy the body block by block, reporting progress
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        # (size stays -1 unless a reporthook was supplied AND the header was
        # present, so this check is effectively reporthook-only)
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: "
                "got only %i out of %i bytes" % (read, size),
                result
            )

        return result

    def close(self):
        """Close handlers and delete temp files created by retrieve()."""
        _urllib2_fork.OpenerDirector.close(self)

        # make it very obvious this object is no longer supposed to be used
        self.open = self.error = self.retrieve = self.add_handler = None

        if self._tempfiles:
            for filename in self._tempfiles:
                try:
                    os.unlink(filename)
                except OSError:
                    # best-effort cleanup: the file may already be gone
                    pass
            del self._tempfiles[:]
315
316
def wrapped_open(urlopen, process_response_object, fullurl, data=None,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    """Call *urlopen* and funnel the outcome through *process_response_object*.

    An HTTPError that carries a response body (``fp`` is set) is treated
    as a response: it is post-processed like any other response and then
    re-raised.  An HTTPError without a body propagates untouched.
    """
    error = None
    try:
        response = urlopen(fullurl, data, timeout)
    except HTTPError as exc:
        if exc.fp is None:  # not a response
            raise
        error = exc
        response = exc

    if response is not None:
        response = process_response_object(response)

    if error is not None:
        raise response
    return response
334
335
class ResponseProcessingOpener(OpenerDirector):
    """OpenerDirector that passes every response -- including HTTPError
    responses that carry a body -- through process_response_object()."""

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        def base_open(url, body=None,
                      deadline=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
            # delegate the actual fetch to the parent class
            return OpenerDirector.open(self, url, body, deadline)

        return wrapped_open(base_open, self.process_response_object,
                            fullurl, data, timeout)

    def process_response_object(self, response):
        """Hook for subclasses; the default is the identity."""
        return response
348
349
class SeekableResponseOpener(ResponseProcessingOpener):
    """ResponseProcessingOpener whose responses support .seek()."""

    def process_response_object(self, response):
        """Return *response* wrapped so that it is seekable."""
        seekable = _response.seek_wrapped_response(response)
        return seekable
354
355
class OpenerFactory:
    """This class's interface is quite likely to change."""

    # default handlers and processors installed by build_opener()
    default_classes = [
        # handlers
        _urllib2_fork.ProxyHandler,
        _urllib2_fork.UnknownHandler,
        _urllib2_fork.HTTPHandler,
        _urllib2_fork.HTTPDefaultErrorHandler,
        _urllib2_fork.HTTPRedirectHandler,
        _urllib2_fork.FTPHandler,
        _urllib2_fork.FileHandler,
        # processors
        _urllib2_fork.HTTPCookieProcessor,
        _urllib2_fork.HTTPErrorProcessor,
        _urllib2_fork.HTTPSHandler,
    ]
    handlers = []
    replacement_handlers = []

    def __init__(self, klass=OpenerDirector):
        # opener class instantiated by build_opener()
        self.klass = klass

    def build_opener(self, *handlers):
        """Create an opener object from a list of handlers and processors.

        The opener will use several default handlers and processors,
        including support for HTTP and FTP.

        If any of the handlers passed as arguments are subclasses of the
        default handlers, the default handlers will not be used.
        """
        opener = self.klass()

        def displaces(candidate, default_cls):
            # A caller-supplied handler displaces a default when it is a
            # subclass (or an instance of a subclass) of that default.
            if is_class(candidate):
                return issubclass(candidate, default_cls)
            return isinstance(candidate, default_cls)

        kept = [cls for cls in self.default_classes
                if not any(displaces(check, cls) for check in handlers)]

        for cls in kept:
            opener.add_handler(cls())
        for handler in handlers:
            if is_class(handler):
                handler = handler()
            opener.add_handler(handler)

        return opener
410
411
# Module-level convenience factory, mirroring urllib2.build_opener.
build_opener = OpenerFactory().build_opener

# Per-thread opener storage used by urlopen()/urlretrieve().  Note that
# ``opener`` is pre-assigned only in the importing thread; other threads
# start without the attribute (see get_thread_local_opener).
thread_local = threading.local()
thread_local.opener = None
416
417
def get_thread_local_opener():
    """Return the opener for the current thread, creating it on demand.

    NOTE(review): ``thread_local.opener`` is pre-assigned only in the
    thread that imported this module, so in any other thread the lookup
    raises AttributeError and the single shared instance cached on the
    function object is returned instead -- i.e. non-importing threads
    appear to share one opener rather than each getting their own.
    Confirm this is the intended behaviour.  install_opener() relies on
    both storage locations.
    """
    try:
        ans = thread_local.opener
    except AttributeError:
        # threading module is broken, use a single global instance
        ans = getattr(get_thread_local_opener, 'ans', None)
        if ans is None:
            ans = get_thread_local_opener.ans = build_opener()
    if ans is None:
        # first call in the importing thread: build and cache its opener
        ans = thread_local.opener = build_opener()
    return ans
429
430
def urlopen(url, data=None, timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    """Open *url* using the calling thread's shared opener."""
    opener = get_thread_local_opener()
    return opener.open(url, data, timeout)
433
434
def urlretrieve(url, filename=None, reporthook=None, data=None,
                timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    """Retrieve *url* via the calling thread's shared opener.

    See OpenerDirector.retrieve for the (filename, headers) contract.
    """
    opener = get_thread_local_opener()
    return opener.retrieve(url, filename, reporthook, data, timeout)
439
440
def install_opener(opener):
    """Install *opener* as the default used by urlopen()/urlretrieve().

    Stores it both as the module-wide fallback (the attribute on
    get_thread_local_opener) and, best-effort, as the current thread's
    thread-local opener.
    """
    get_thread_local_opener.ans = opener
    try:
        thread_local.opener = opener
    except AttributeError:
        # thread-local storage unavailable; the fallback above suffices
        pass
447