# -*- coding: utf-8 -*-

"""
requests.utils
~~~~~~~~~~~~~~

This module provides utility functions that are used within Requests
and that are also useful for external consumption.

"""

import cgi
import codecs
import collections
import io
import os
import platform
import re
import sys
import socket
import struct
import warnings

from . import __version__
from . import certs
from .compat import parse_http_list as _parse_list_header
from .compat import (quote, urlparse, bytes, str, OrderedDict, unquote, is_py2,
                     builtin_str, getproxies, proxy_bypass, urlunparse,
                     basestring)
from .cookies import RequestsCookieJar, cookiejar_from_dict
from .structures import CaseInsensitiveDict
from .exceptions import InvalidURL, FileModeWarning

_hush_pyflakes = (RequestsCookieJar,)

NETRC_FILES = ('.netrc', '_netrc')

DEFAULT_CA_BUNDLE_PATH = certs.where()

def dict_to_sequence(d):
    """Returns an iterable of (key, value) pairs for a dict-like object,
    or the object unchanged if it has no ``items`` method."""

    if hasattr(d, 'items'):
        d = d.items()

    return d
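
# Illustrative:
#
#     >>> list(dict_to_sequence({'a': 1}))
#     [('a', 1)]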


def super_len(o):
    """Returns the number of bytes left to read from an object: its total
    length, minus the current position for objects that support ``tell()``."""
    total_length = 0
    current_position = 0

    if hasattr(o, '__len__'):
        total_length = len(o)

    elif hasattr(o, 'len'):
        total_length = o.len

    elif hasattr(o, 'getvalue'):
        # e.g. BytesIO, cStringIO.StringIO
        total_length = len(o.getvalue())

    elif hasattr(o, 'fileno'):
        try:
            fileno = o.fileno()
        except io.UnsupportedOperation:
            pass
        else:
            total_length = os.fstat(fileno).st_size

            # Having used fstat to determine the file length, we need to
            # confirm that this file was opened up in binary mode.
            if 'b' not in o.mode:
                warnings.warn((
                    "Requests has determined the content-length for this "
                    "request using the binary size of the file: however, the "
                    "file has been opened in text mode (i.e. without the 'b' "
                    "flag in the mode). This may lead to an incorrect "
                    "content-length. In Requests 3.0, support will be removed "
                    "for files in text mode."),
                    FileModeWarning
                )

    if hasattr(o, 'tell'):
        current_position = o.tell()

    return max(0, total_length - current_position)
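
# A minimal sketch of how super_len treats a partially-read stream
# (illustrative only; BytesIO stands in for any seekable body):
#
#     >>> import io
#     >>> buf = io.BytesIO(b'hello')
#     >>> _ = buf.read(2)   # advance the position by two bytes
#     >>> super_len(buf)    # 5 bytes total, minus the 2 already read
#     3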


def get_netrc_auth(url, raise_errors=False):
    """Returns the Requests tuple auth for a given url from netrc."""

    try:
        from netrc import netrc, NetrcParseError

        netrc_path = None

        for f in NETRC_FILES:
            try:
                loc = os.path.expanduser('~/{0}'.format(f))
            except KeyError:
                # os.path.expanduser can fail when $HOME is undefined and
                # getpwuid fails. See http://bugs.python.org/issue20164 &
                # https://github.com/kennethreitz/requests/issues/1846
                return

            if os.path.exists(loc):
                netrc_path = loc
                break

        # Abort early if there isn't one.
        if netrc_path is None:
            return

        ri = urlparse(url)

        # Strip port numbers from netloc. This `if...decode` dance is used
        # for Python 3.2, which doesn't support unicode literals.
        splitstr = b':'
        if isinstance(url, str):
            splitstr = splitstr.decode('ascii')
        host = ri.netloc.split(splitstr)[0]

        try:
            _netrc = netrc(netrc_path).authenticators(host)
            if _netrc:
                # Return with login / password
                login_i = (0 if _netrc[0] else 1)
                return (_netrc[login_i], _netrc[2])
        except (NetrcParseError, IOError):
            # If there was a parsing error or a permissions issue reading
            # the file, skip netrc auth unless explicitly asked to raise.
            if raise_errors:
                raise

    # AppEngine hackiness.
    except (ImportError, AttributeError):
        pass
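
# Sketch of the expected behaviour, assuming a hypothetical ~/.netrc entry:
#
#     machine example.com
#         login user
#         password s3cret
#
#     >>> get_netrc_auth('http://example.com/resource')
#     ('user', 's3cret')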


def guess_filename(obj):
    """Tries to guess the filename of the given object."""
    name = getattr(obj, 'name', None)
    if (name and isinstance(name, basestring) and name[0] != '<' and
            name[-1] != '>'):
        return os.path.basename(name)
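
# Illustrative: file objects expose the path they were opened from via
# ``name``, which guess_filename reduces to a basename; pseudo-names such
# as '<stdin>' are rejected:
#
#     >>> class FakeFile(object):
#     ...     name = '/tmp/upload/report.pdf'
#     >>> guess_filename(FakeFile())
#     'report.pdf'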


def from_key_val_list(value):
    """Take an object and test to see if it can be represented as a
    dictionary. If it can be, return an OrderedDict, e.g.,

    ::

        >>> from_key_val_list([('key', 'val')])
        OrderedDict([('key', 'val')])
        >>> from_key_val_list('string')
        ValueError: cannot encode objects that are not 2-tuples
        >>> from_key_val_list({'key': 'val'})
        OrderedDict([('key', 'val')])
    """
    if value is None:
        return None

    if isinstance(value, (str, bytes, bool, int)):
        raise ValueError('cannot encode objects that are not 2-tuples')

    return OrderedDict(value)


def to_key_val_list(value):
    """Take an object and test to see if it can be represented as a
    dictionary. If it can be, return a list of tuples, e.g.,

    ::

        >>> to_key_val_list([('key', 'val')])
        [('key', 'val')]
        >>> to_key_val_list({'key': 'val'})
        [('key', 'val')]
        >>> to_key_val_list('string')
        ValueError: cannot encode objects that are not 2-tuples
    """
    if value is None:
        return None

    if isinstance(value, (str, bytes, bool, int)):
        raise ValueError('cannot encode objects that are not 2-tuples')

    if isinstance(value, collections.Mapping):
        value = value.items()

    return list(value)


# From mitsuhiko/werkzeug (used with permission).
def parse_list_header(value):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Quotes are removed automatically after parsing.

    It basically works like :func:`parse_set_header` just that items
    may appear multiple times and case sensitivity is preserved.

    The return value is a standard :class:`list`:

    >>> parse_list_header('token, "quoted value"')
    ['token', 'quoted value']

    To create a header from the :class:`list` again, use the
    :func:`dump_header` function.

    :param value: a string with a list header.
    :return: :class:`list`
    """
    result = []
    for item in _parse_list_header(value):
        if item[:1] == item[-1:] == '"':
            item = unquote_header_value(item[1:-1])
        result.append(item)
    return result


# From mitsuhiko/werkzeug (used with permission).
def parse_dict_header(value):
    """Parse lists of key, value pairs as described by RFC 2068 Section 2 and
    convert them into a python dict:

    >>> d = parse_dict_header('foo="is a fish", bar="as well"')
    >>> type(d) is dict
    True
    >>> sorted(d.items())
    [('bar', 'as well'), ('foo', 'is a fish')]

    If there is no value for a key it will be `None`:

    >>> parse_dict_header('key_without_value')
    {'key_without_value': None}

    To create a header from the :class:`dict` again, use the
    :func:`dump_header` function.

    :param value: a string with a dict header.
    :return: :class:`dict`
    """
    result = {}
    for item in _parse_list_header(value):
        if '=' not in item:
            result[item] = None
            continue
        name, value = item.split('=', 1)
        if value[:1] == value[-1:] == '"':
            value = unquote_header_value(value[1:-1])
        result[name] = value
    return result


# From mitsuhiko/werkzeug (used with permission).
def unquote_header_value(value, is_filename=False):
    r"""Unquotes a header value.  (Reversal of :func:`quote_header_value`).
    This does not use the real unquoting but what browsers are actually
    using for quoting.

    :param value: the header value to unquote.
    """
    if value and value[0] == value[-1] == '"':
        # this is not the real unquoting, but fixing this so that the
        # RFC is met will result in bugs with internet explorer and
        # probably some other browsers as well.  IE for example is
        # uploading files with "C:\foo\bar.txt" as filename
        value = value[1:-1]

        # if this is a filename and the starting characters look like
        # a UNC path, then just return the value without quotes.  Using the
        # replace sequence below on a UNC path has the effect of turning
        # the leading double slash into a single slash and then
        # _fix_ie_filename() doesn't work correctly.  See #458.
        if not is_filename or value[:2] != '\\\\':
            return value.replace('\\\\', '\\').replace('\\"', '"')
    return value
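
# Quick sketch of the browser-style unquoting above; note that a quoted
# UNC filename keeps its leading backslashes:
#
#     >>> unquote_header_value('"a \\"b\\" c"')
#     'a "b" c'
#     >>> unquote_header_value('"\\\\server\\share\\file.txt"', is_filename=True)
#     '\\\\server\\share\\file.txt'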


def dict_from_cookiejar(cj):
    """Returns a key/value dictionary from a CookieJar.

    :param cj: CookieJar object to extract cookies from.
    """

    cookie_dict = {}

    for cookie in cj:
        cookie_dict[cookie.name] = cookie.value

    return cookie_dict


def add_dict_to_cookiejar(cj, cookie_dict):
    """Inserts cookies from a key/value dictionary into a CookieJar and
    returns the updated CookieJar.

    :param cj: CookieJar to insert cookies into.
    :param cookie_dict: Dict of key/values to insert into CookieJar.
    """

    cj2 = cookiejar_from_dict(cookie_dict)
    cj.update(cj2)
    return cj
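
# Round-trip sketch for the two cookie helpers above (cookiejar_from_dict
# is imported at the top of this module):
#
#     >>> jar = cookiejar_from_dict({'session': 'abc123'})
#     >>> dict_from_cookiejar(jar)
#     {'session': 'abc123'}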


def get_encodings_from_content(content):
    """Returns encodings from given content string.

    :param content: bytestring to extract encodings from.
    """
    warnings.warn((
        'In requests 3.0, get_encodings_from_content will be removed. For '
        'more information, please see the discussion on issue #2266. (This'
        ' warning should only appear once.)'),
        DeprecationWarning)

    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

    return (charset_re.findall(content) +
            pragma_re.findall(content) +
            xml_re.findall(content))


def get_encoding_from_headers(headers):
    """Returns encodings from given HTTP Header Dict.

    :param headers: dictionary to extract encoding from.
    """

    content_type = headers.get('content-type')

    if not content_type:
        return None

    content_type, params = cgi.parse_header(content_type)

    if 'charset' in params:
        return params['charset'].strip("'\"")

    if 'text' in content_type:
        return 'ISO-8859-1'
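
# Sketch of the header-based charset detection above:
#
#     >>> get_encoding_from_headers({'content-type': 'text/html; charset=UTF-8'})
#     'UTF-8'
#     >>> get_encoding_from_headers({'content-type': 'text/plain'})
#     'ISO-8859-1'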


def stream_decode_response_unicode(iterator, r):
    """Stream decodes an iterator."""

    if r.encoding is None:
        for item in iterator:
            yield item
        return

    decoder = codecs.getincrementaldecoder(r.encoding)(errors='replace')
    for chunk in iterator:
        rv = decoder.decode(chunk)
        if rv:
            yield rv
    rv = decoder.decode(b'', final=True)
    if rv:
        yield rv


def iter_slices(string, slice_length):
    """Iterate over slices of a string."""
    pos = 0
    while pos < len(string):
        yield string[pos:pos + slice_length]
        pos += slice_length
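
# Illustrative:
#
#     >>> list(iter_slices('abcdefg', 3))
#     ['abc', 'def', 'g']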


def get_unicode_from_response(r):
    """Returns the requested content back in unicode.

    :param r: Response object to get unicode content from.

    Tries:

    1. charset from content-type
    2. fall back and replace all unicode characters

    """
    warnings.warn((
        'In requests 3.0, get_unicode_from_response will be removed. For '
        'more information, please see the discussion on issue #2266. (This'
        ' warning should only appear once.)'),
        DeprecationWarning)

    tried_encodings = []

    # Try charset from content-type
    encoding = get_encoding_from_headers(r.headers)

    if encoding:
        try:
            return str(r.content, encoding)
        except UnicodeError:
            tried_encodings.append(encoding)

    # Fall back:
    try:
        return str(r.content, encoding, errors='replace')
    except TypeError:
        return r.content


# The unreserved URI characters (RFC 3986)
UNRESERVED_SET = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    + "0123456789-._~")


def unquote_unreserved(uri):
    """Un-escape any percent-escape sequences in a URI that are unreserved
    characters. This leaves all reserved, illegal and non-ASCII bytes encoded.
    """
    parts = uri.split('%')
    for i in range(1, len(parts)):
        h = parts[i][0:2]
        if len(h) == 2 and h.isalnum():
            try:
                c = chr(int(h, 16))
            except ValueError:
                raise InvalidURL("Invalid percent-escape sequence: '%s'" % h)

            if c in UNRESERVED_SET:
                parts[i] = c + parts[i][2:]
            else:
                parts[i] = '%' + parts[i]
        else:
            parts[i] = '%' + parts[i]
    return ''.join(parts)
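
# Sketch: %41 ('A') is unreserved and gets decoded, while %2F ('/') is
# reserved and stays escaped:
#
#     >>> unquote_unreserved('http://example.com/%41%2F')
#     'http://example.com/A%2F'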


def requote_uri(uri):
    """Re-quote the given URI.

    This function passes the given URI through an unquote/quote cycle to
    ensure that it is fully and consistently quoted.
    """
    safe_with_percent = "!#$%&'()*+,/:;=?@[]~"
    safe_without_percent = "!#$&'()*+,/:;=?@[]~"
    try:
        # Unquote only the unreserved characters
        # Then quote only illegal characters (do not quote reserved,
        # unreserved, or '%')
        return quote(unquote_unreserved(uri), safe=safe_with_percent)
    except InvalidURL:
        # We couldn't unquote the given URI, so let's try quoting it, but
        # there may be unquoted '%'s in the URI. We need to make sure they're
        # properly quoted so they do not cause issues elsewhere.
        return quote(uri, safe=safe_without_percent)
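
# Illustrative: an unquoted space gets escaped, while an already-escaped
# URI survives the unquote/quote cycle unchanged:
#
#     >>> requote_uri('http://example.com/a b')
#     'http://example.com/a%20b'
#     >>> requote_uri('http://example.com/a%20b')
#     'http://example.com/a%20b'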


def address_in_network(ip, net):
    """
    This function allows you to check if an IP belongs to a network subnet
    Example: returns True if ip = 192.168.1.1 and net = 192.168.1.0/24
             returns False if ip = 192.168.1.1 and net = 192.168.100.0/24
    """
    ipaddr = struct.unpack('=L', socket.inet_aton(ip))[0]
    netaddr, bits = net.split('/')
    netmask = struct.unpack('=L', socket.inet_aton(dotted_netmask(int(bits))))[0]
    network = struct.unpack('=L', socket.inet_aton(netaddr))[0] & netmask
    return (ipaddr & netmask) == (network & netmask)


def dotted_netmask(mask):
    """
    Converts mask from /xx format to xxx.xxx.xxx.xxx
    Example: if mask is 24 function returns 255.255.255.0
    """
    bits = 0xffffffff ^ ((1 << (32 - mask)) - 1)
    return socket.inet_ntoa(struct.pack('>I', bits))
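
# Quick sketch tying the two helpers together:
#
#     >>> dotted_netmask(24)
#     '255.255.255.0'
#     >>> address_in_network('192.168.1.1', '192.168.1.0/24')
#     True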


def is_ipv4_address(string_ip):
    """Returns True if the given string parses as an IPv4 address."""
    try:
        socket.inet_aton(string_ip)
    except socket.error:
        return False
    return True


def is_valid_cidr(string_network):
    """Very simple check of the cidr format in no_proxy variable"""
    if string_network.count('/') == 1:
        try:
            mask = int(string_network.split('/')[1])
        except ValueError:
            return False

        if mask < 1 or mask > 32:
            return False

        try:
            socket.inet_aton(string_network.split('/')[0])
        except socket.error:
            return False
    else:
        return False
    return True
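
# Illustrative:
#
#     >>> is_valid_cidr('192.168.1.0/24')
#     True
#     >>> is_valid_cidr('192.168.1.0')    # no mask component
#     False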


def should_bypass_proxies(url):
    """
    Returns whether we should bypass proxies or not.
    """
    get_proxy = lambda k: os.environ.get(k) or os.environ.get(k.upper())

    # First check whether no_proxy is defined. If it is, check that the URL
    # we're getting isn't in the no_proxy list.
    no_proxy = get_proxy('no_proxy')
    netloc = urlparse(url).netloc

    if no_proxy:
        # We need to check whether we match here. We need to see if we match
        # the end of the netloc, both with and without the port.
        no_proxy = (
            host for host in no_proxy.replace(' ', '').split(',') if host
        )

        ip = netloc.split(':')[0]
        if is_ipv4_address(ip):
            for proxy_ip in no_proxy:
                if is_valid_cidr(proxy_ip):
                    if address_in_network(ip, proxy_ip):
                        return True
        else:
            for host in no_proxy:
                if netloc.endswith(host) or netloc.split(':')[0].endswith(host):
                    # The URL does match something in no_proxy, so we don't want
                    # to apply the proxies on this URL.
                    return True

    # If the system proxy settings indicate that this URL should be bypassed,
    # don't proxy.
    # The proxy_bypass function is incredibly buggy on OS X in early versions
    # of Python 2.6, so allow this call to fail. Only catch the specific
    # exceptions we've seen, though: this call failing in other ways can reveal
    # legitimate problems.
    try:
        bypass = proxy_bypass(netloc)
    except (TypeError, socket.gaierror):
        bypass = False

    if bypass:
        return True

    return False
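
# Sketch of the no_proxy handling above, assuming a hypothetical
# environment (CIDR entries match IPs, suffix entries match hostnames):
#
#     >>> import os
#     >>> os.environ['no_proxy'] = 'localhost,127.0.0.0/8,.internal.example'
#     >>> should_bypass_proxies('http://127.0.0.1:8080/')
#     True
#     >>> should_bypass_proxies('http://service.internal.example/')
#     True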


def get_environ_proxies(url):
    """Return a dict of environment proxies."""
    if should_bypass_proxies(url):
        return {}
    else:
        return getproxies()


def select_proxy(url, proxies):
    """Select a proxy for the url, if applicable.

    :param url: The url being requested
    :param proxies: A dictionary of schemes or schemes and hosts to proxy URLs
    """
    proxies = proxies or {}
    urlparts = urlparse(url)
    proxy = proxies.get(urlparts.scheme + '://' + urlparts.hostname)
    if proxy is None:
        proxy = proxies.get(urlparts.scheme)
    return proxy
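
# Sketch: a scheme+host mapping wins over a bare scheme (hypothetical
# proxy URLs):
#
#     >>> proxies = {'http': 'http://proxy:3128',
#     ...            'http://special.example': 'http://other-proxy:3128'}
#     >>> select_proxy('http://special.example/path', proxies)
#     'http://other-proxy:3128'
#     >>> select_proxy('http://plain.example/path', proxies)
#     'http://proxy:3128'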


def default_user_agent(name="python-requests"):
    """Return a string representing the default user agent."""
    return '%s/%s' % (name, __version__)


def default_headers():
    """Return the default headers sent with every Requests request."""
    return CaseInsensitiveDict({
        'User-Agent': default_user_agent(),
        'Accept-Encoding': ', '.join(('gzip', 'deflate')),
        'Accept': '*/*',
        'Connection': 'keep-alive',
    })


def parse_header_links(value):
    """Return a list of dicts parsed from a Link header.

    i.e. Link: <http:/.../front.jpeg>; rel=front; type="image/jpeg",<http://.../back.jpeg>; rel=back;type="image/jpeg"

    """

    links = []

    replace_chars = " '\""

    for val in re.split(", *<", value):
        try:
            url, params = val.split(";", 1)
        except ValueError:
            url, params = val, ''

        link = {}

        link["url"] = url.strip("<> '\"")

        for param in params.split(";"):
            try:
                key, value = param.split("=")
            except ValueError:
                break

            link[key.strip(replace_chars)] = value.strip(replace_chars)

        links.append(link)

    return links
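
# Illustrative (dict key order may vary):
#
#     >>> parse_header_links('<http://example.com/page2>; rel="next"')
#     [{'url': 'http://example.com/page2', 'rel': 'next'}]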


# Null bytes; no need to recreate these on each call to guess_json_utf
_null = '\x00'.encode('ascii')  # encoding to ASCII for Python 3
_null2 = _null * 2
_null3 = _null * 3


def guess_json_utf(data):
    """Guess the UTF encoding of a JSON bytestring from its BOM, or from
    the pattern of null bytes in its first four bytes."""
    # JSON always starts with two ASCII characters, so detection is as
    # easy as counting the nulls and from their location and count
    # determine the encoding. Also detect a BOM, if present.
    sample = data[:4]
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'     # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'     # BOM included
    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:   # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None
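
# Sketch of the null-pattern detection (JSON text is ASCII in its first
# two characters, so the null layout pins down the encoding):
#
#     >>> guess_json_utf(b'{"a": 1}')
#     'utf-8'
#     >>> guess_json_utf('{"a": 1}'.encode('utf-16'))
#     'utf-16'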


def prepend_scheme_if_needed(url, new_scheme):
    '''Given a URL that may or may not have a scheme, prepend the given scheme.
    Does not replace a present scheme with the one provided as an argument.'''
    scheme, netloc, path, params, query, fragment = urlparse(url, new_scheme)

    # urlparse is a finicky beast, and sometimes decides that there isn't a
    # netloc present. Assume that it's being over-cautious, and switch netloc
    # and path if urlparse decided there was no netloc.
    if not netloc:
        netloc, path = path, netloc

    return urlunparse((scheme, netloc, path, params, query, fragment))
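
# Illustrative:
#
#     >>> prepend_scheme_if_needed('example.com/path', 'http')
#     'http://example.com/path'
#     >>> prepend_scheme_if_needed('https://example.com/', 'http')
#     'https://example.com/'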


def get_auth_from_url(url):
    """Given a url with authentication components, extract them into a tuple
    of (username, password)."""
    parsed = urlparse(url)

    try:
        auth = (unquote(parsed.username), unquote(parsed.password))
    except (AttributeError, TypeError):
        auth = ('', '')

    return auth
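
# Illustrative (percent-escapes in credentials are unquoted):
#
#     >>> get_auth_from_url('http://user:p%40ss@example.com/')
#     ('user', 'p@ss')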


def to_native_string(string, encoding='ascii'):
    """
    Given a string object, regardless of type, returns a representation of that
    string in the native string type, encoding and decoding where necessary.
    This assumes ASCII unless told otherwise.
    """
    out = None

    if isinstance(string, builtin_str):
        out = string
    else:
        if is_py2:
            out = string.encode(encoding)
        else:
            out = string.decode(encoding)

    return out


def urldefragauth(url):
    """
    Given a url, remove the fragment and the authentication part
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)

    # see func:`prepend_scheme_if_needed`
    if not netloc:
        netloc, path = path, netloc

    netloc = netloc.rsplit('@', 1)[-1]

    return urlunparse((scheme, netloc, path, params, query, ''))
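
# Illustrative: both the userinfo and the fragment are dropped:
#
#     >>> urldefragauth('http://user:pass@example.com/path#frag')
#     'http://example.com/path'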