# -*- coding: utf-8 -*-

"""
requests.utils
~~~~~~~~~~~~~~

This module provides utility functions that are used within Requests
that are also useful for external consumption.

"""

import cgi
import codecs
import collections
import io
import os
import platform
import re
import sys
import socket
import struct
import warnings

from . import __version__
from . import certs
from .compat import parse_http_list as _parse_list_header
from .compat import (quote, urlparse, bytes, str, OrderedDict, unquote, is_py2,
                     builtin_str, getproxies, proxy_bypass, urlunparse,
                     basestring)
from .cookies import RequestsCookieJar, cookiejar_from_dict
from .structures import CaseInsensitiveDict
from .exceptions import InvalidURL, FileModeWarning

# Kept so the re-export of RequestsCookieJar is not flagged as unused.
_hush_pyflakes = (RequestsCookieJar,)

# Candidate netrc file names searched in the user's home directory.
NETRC_FILES = ('.netrc', '_netrc')

DEFAULT_CA_BUNDLE_PATH = certs.where()


def dict_to_sequence(d):
    """Returns an internal sequence dictionary update.

    If *d* has an ``items`` method the item view is returned; otherwise *d*
    is assumed to already be a sequence of pairs and is returned unchanged.
    """

    if hasattr(d, 'items'):
        d = d.items()

    return d


def super_len(o):
    """Best-effort count of the bytes remaining to be read from *o*.

    Supports plain sequences (``__len__``), urllib3-style objects with a
    ``len`` attribute, in-memory buffers (``getvalue``) and real file
    objects (``fileno``/``fstat``).  If the object also supports ``tell``,
    the bytes already consumed are subtracted; the result is never negative.
    """
    total_length = 0
    current_position = 0

    if hasattr(o, '__len__'):
        total_length = len(o)

    elif hasattr(o, 'len'):
        total_length = o.len

    elif hasattr(o, 'getvalue'):
        # e.g. BytesIO, cStringIO.StringIO
        total_length = len(o.getvalue())

    elif hasattr(o, 'fileno'):
        try:
            fileno = o.fileno()
        except io.UnsupportedOperation:
            pass
        else:
            total_length = os.fstat(fileno).st_size

            # Having used fstat to determine the file length, we need to
            # confirm that this file was opened up in binary mode.  Some
            # objects expose fileno() without a `mode` attribute (e.g.
            # sockets), so default to 'b' rather than raising AttributeError.
            if 'b' not in getattr(o, 'mode', 'b'):
                warnings.warn((
                    "Requests has determined the content-length for this "
                    "request using the binary size of the file: however, the "
                    "file has been opened in text mode (i.e. without the 'b' "
                    "flag in the mode). This may lead to an incorrect "
                    "content-length. In Requests 3.0, support will be removed "
                    "for files in text mode."),
                    FileModeWarning
                )

    if hasattr(o, 'tell'):
        current_position = o.tell()

    return max(0, total_length - current_position)
def get_netrc_auth(url, raise_errors=False):
    """Look up credentials for *url*'s host in the user's netrc file.

    Returns a ``(login, password)`` tuple when an entry is found, otherwise
    ``None``.  Parse and permission errors are swallowed unless
    *raise_errors* is true.
    """

    try:
        from netrc import netrc, NetrcParseError

        netrc_path = None

        for candidate in NETRC_FILES:
            try:
                location = os.path.expanduser('~/{0}'.format(candidate))
            except KeyError:
                # os.path.expanduser can fail when $HOME is undefined and
                # getpwuid fails. See http://bugs.python.org/issue20164 &
                # https://github.com/kennethreitz/requests/issues/1846
                return

            if os.path.exists(location):
                netrc_path = location
                break

        # No netrc file exists: nothing to look up.
        if netrc_path is None:
            return

        parsed = urlparse(url)

        # Strip any port number from the netloc.  The bytes-then-decode
        # dance keeps this working on Python 3.2, which has no unicode
        # literals.
        separator = b':'
        if isinstance(url, str):
            separator = separator.decode('ascii')
        host = parsed.netloc.split(separator)[0]

        try:
            entry = netrc(netrc_path).authenticators(host)
            if entry:
                # Prefer the login field; fall back to the account field.
                login_index = 0 if entry[0] else 1
                return (entry[login_index], entry[2])
        except (NetrcParseError, IOError):
            # On a parse error or a permissions problem reading the file,
            # skip netrc auth unless the caller explicitly wants the error.
            if raise_errors:
                raise

    # AppEngine hackiness: `netrc` (or the os facilities it needs) may be
    # unavailable there.
    except (ImportError, AttributeError):
        pass
def guess_filename(obj):
    """Tries to guess the filename of the given object.

    Uses the object's ``name`` attribute (as file objects have) but ignores
    pseudo-names such as ``<stdin>``.
    """
    name = getattr(obj, 'name', None)
    if (name and isinstance(name, basestring) and name[0] != '<' and
            name[-1] != '>'):
        return os.path.basename(name)


def from_key_val_list(value):
    """Take an object and test to see if it can be represented as a
    dictionary. Unless it can not be represented as such, return an
    OrderedDict, e.g.,

    ::

        >>> from_key_val_list([('key', 'val')])
        OrderedDict([('key', 'val')])
        >>> from_key_val_list('string')
        ValueError: need more than 1 value to unpack
        >>> from_key_val_list({'key': 'val'})
        OrderedDict([('key', 'val')])
    """
    if value is None:
        return None

    if isinstance(value, (str, bytes, bool, int)):
        raise ValueError('cannot encode objects that are not 2-tuples')

    return OrderedDict(value)


def to_key_val_list(value):
    """Take an object and test to see if it can be represented as a
    dictionary. If it can be, return a list of tuples, e.g.,

    ::

        >>> to_key_val_list([('key', 'val')])
        [('key', 'val')]
        >>> to_key_val_list({'key': 'val'})
        [('key', 'val')]
        >>> to_key_val_list('string')
        ValueError: cannot encode objects that are not 2-tuples.
    """
    if value is None:
        return None

    if isinstance(value, (str, bytes, bool, int)):
        raise ValueError('cannot encode objects that are not 2-tuples')

    # `collections.Mapping` was deprecated in 3.3 and removed in Python
    # 3.10; import the ABC from `collections.abc` and fall back for
    # Python 2, where `collections.abc` does not exist.
    try:
        from collections.abc import Mapping
    except ImportError:  # Python 2
        from collections import Mapping

    if isinstance(value, Mapping):
        value = value.items()

    return list(value)


# From mitsuhiko/werkzeug (used with permission).
def parse_list_header(value):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings. A quoted-string could
    contain a comma. A non-quoted string could have quotes in the
    middle. Quotes are removed automatically after parsing.

    It basically works like :func:`parse_set_header` just that items
    may appear multiple times and case sensitivity is preserved.

    The return value is a standard :class:`list`:

    >>> parse_list_header('token, "quoted value"')
    ['token', 'quoted value']

    To create a header from the :class:`list` again, use the
    :func:`dump_header` function.

    :param value: a string with a list header.
    :return: :class:`list`
    """
    result = []
    for item in _parse_list_header(value):
        # Strip surrounding quotes and undo backslash-escaping.
        if item[:1] == item[-1:] == '"':
            item = unquote_header_value(item[1:-1])
        result.append(item)
    return result
# From mitsuhiko/werkzeug (used with permission).
def parse_dict_header(value):
    """Parse lists of key, value pairs as described by RFC 2068 Section 2 and
    convert them into a python dict:

    >>> d = parse_dict_header('foo="is a fish", bar="as well"')
    >>> type(d) is dict
    True
    >>> sorted(d.items())
    [('bar', 'as well'), ('foo', 'is a fish')]

    If there is no value for a key it will be `None`:

    >>> parse_dict_header('key_without_value')
    {'key_without_value': None}

    To create a header from the :class:`dict` again, use the
    :func:`dump_header` function.

    :param value: a string with a dict header.
    :return: :class:`dict`
    """
    parsed = {}
    for entry in _parse_list_header(value):
        # A bare token (no '=') maps to None.
        key, separator, item = entry.partition('=')
        if not separator:
            parsed[entry] = None
            continue
        if item[:1] == item[-1:] == '"':
            item = unquote_header_value(item[1:-1])
        parsed[key] = item
    return parsed


# From mitsuhiko/werkzeug (used with permission).
def unquote_header_value(value, is_filename=False):
    r"""Unquotes a header value. (Reversal of :func:`quote_header_value`).
    This does not use the real unquoting but what browsers are actually
    using for quoting.

    :param value: the header value to unquote.
    """
    # Unquoted values pass through untouched.
    if not (value and value[0] == value[-1] == '"'):
        return value

    # This is not the real unquoting, but fixing this so that the
    # RFC is met will result in bugs with internet explorer and
    # probably some other browsers as well. IE for example is
    # uploading files with "C:\foo\bar.txt" as filename.
    value = value[1:-1]

    # If this is a filename and the starting characters look like a UNC
    # path, return the value without unescaping: the replace() calls below
    # would collapse the leading double backslash and break
    # _fix_ie_filename(). See #458.
    if is_filename and value[:2] == '\\\\':
        return value
    return value.replace('\\\\', '\\').replace('\\"', '"')
def dict_from_cookiejar(cj):
    """Returns a key/value dictionary from a CookieJar.

    :param cj: CookieJar object to extract cookies from.
    """
    return dict((cookie.name, cookie.value) for cookie in cj)


def add_dict_to_cookiejar(cj, cookie_dict):
    """Returns a CookieJar from a key/value dictionary.

    :param cj: CookieJar to insert cookies into.
    :param cookie_dict: Dict of key/values to insert into CookieJar.
    """
    cj.update(cookiejar_from_dict(cookie_dict))
    return cj


def get_encodings_from_content(content):
    """Returns encodings from given content string.

    :param content: bytestring to extract encodings from.
    """
    warnings.warn((
        'In requests 3.0, get_encodings_from_content will be removed. For '
        'more information, please see the discussion on issue #2266. (This'
        ' warning should only appear once.)'),
        DeprecationWarning)

    # Look in <meta charset>, <meta http-equiv content> pragmas and the XML
    # declaration, in that order.
    finders = (
        re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I),
        re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I),
        re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]'),
    )

    found = []
    for finder in finders:
        found.extend(finder.findall(content))
    return found
def _parse_content_type_header(header):
    """Return (content_type, params_dict) for a Content-Type style header.

    Replacement for ``cgi.parse_header``: the ``cgi`` module was deprecated
    by PEP 594 and removed in Python 3.13.  Like ``cgi.parse_header``,
    parameter names are lowercased and bare tokens without ``=`` are
    ignored; surrounding whitespace and quotes are stripped from values.
    """
    tokens = header.split(';')
    content_type, params = tokens[0].strip(), tokens[1:]
    params_dict = {}
    items_to_strip = "\"' "

    for param in params:
        param = param.strip()
        index_of_equals = param.find('=')
        if index_of_equals == -1:
            # cgi.parse_header also ignored parameters without '='.
            continue
        key = param[:index_of_equals].strip(items_to_strip)
        value = param[index_of_equals + 1:].strip(items_to_strip)
        if key:
            params_dict[key.lower()] = value
    return content_type, params_dict


def get_encoding_from_headers(headers):
    """Returns encodings from given HTTP Header Dict.

    :param headers: dictionary to extract encoding from.
    """

    content_type = headers.get('content-type')

    if not content_type:
        return None

    # Previously used cgi.parse_header, which is gone in Python 3.13.
    content_type, params = _parse_content_type_header(content_type)

    if 'charset' in params:
        return params['charset'].strip("'\"")

    if 'text' in content_type:
        # RFC 2616 default charset for text/* media types.
        return 'ISO-8859-1'


def stream_decode_response_unicode(iterator, r):
    """Stream decodes a iterator.

    If the response has no declared encoding the byte chunks are passed
    through unchanged; otherwise they are decoded incrementally with
    ``errors='replace'``.
    """

    if r.encoding is None:
        for item in iterator:
            yield item
        return

    decoder = codecs.getincrementaldecoder(r.encoding)(errors='replace')
    for chunk in iterator:
        rv = decoder.decode(chunk)
        if rv:
            yield rv
    # Flush any bytes buffered by a partial multi-byte sequence.
    rv = decoder.decode(b'', final=True)
    if rv:
        yield rv


def iter_slices(string, slice_length):
    """Iterate over slices of a string."""
    pos = 0
    while pos < len(string):
        yield string[pos:pos + slice_length]
        pos += slice_length


def get_unicode_from_response(r):
    """Returns the requested content back in unicode.

    :param r: Response object to get unicode content from.

    Tried:

    1. charset from content-type
    2. fall back and replace all unicode characters

    """
    warnings.warn((
        'In requests 3.0, get_unicode_from_response will be removed. For '
        'more information, please see the discussion on issue #2266. (This'
        ' warning should only appear once.)'),
        DeprecationWarning)

    tried_encodings = []

    # Try charset from content-type
    encoding = get_encoding_from_headers(r.headers)

    if encoding:
        try:
            return str(r.content, encoding)
        except UnicodeError:
            tried_encodings.append(encoding)

    # Fall back: decode with replacement; if encoding is None the str()
    # call raises TypeError and the raw bytes are returned instead.
    try:
        return str(r.content, encoding, errors='replace')
    except TypeError:
        return r.content
# The unreserved URI characters (RFC 3986)
UNRESERVED_SET = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    + "0123456789-._~")


def unquote_unreserved(uri):
    """Un-escape any percent-escape sequences in a URI that are unreserved
    characters. This leaves all reserved, illegal and non-ASCII bytes encoded.
    """
    pieces = uri.split('%')
    for index, piece in enumerate(pieces):
        if index == 0:
            # Text before the first '%' needs no processing.
            continue
        hex_pair = piece[0:2]
        if len(hex_pair) != 2 or not hex_pair.isalnum():
            # Not a full escape sequence: restore the '%' and move on.
            pieces[index] = '%' + piece
            continue
        try:
            decoded = chr(int(hex_pair, 16))
        except ValueError:
            raise InvalidURL("Invalid percent-escape sequence: '%s'" % hex_pair)
        if decoded in UNRESERVED_SET:
            pieces[index] = decoded + piece[2:]
        else:
            pieces[index] = '%' + piece
    return ''.join(pieces)


def requote_uri(uri):
    """Re-quote the given URI.

    This function passes the given URI through an unquote/quote cycle to
    ensure that it is fully and consistently quoted.
    """
    retain_percent = "!#$%&'()*+,/:;=?@[]~"
    drop_percent = "!#$&'()*+,/:;=?@[]~"
    try:
        # Decode unreserved escapes first, then re-quote only the illegal
        # characters (reserved characters, unreserved characters and '%'
        # are left alone).
        return quote(unquote_unreserved(uri), safe=retain_percent)
    except InvalidURL:
        # The URI contains a malformed percent-escape, so it cannot be
        # unquoted.  Quote it as-is, escaping any bare '%' so the result
        # is still well-formed.
        return quote(uri, safe=drop_percent)
def address_in_network(ip, net):
    """Check whether IP address *ip* belongs to the CIDR subnet *net*.

    Example: returns True for ip='192.168.1.1', net='192.168.1.0/24'
    and False for ip='192.168.1.1', net='192.168.100.0/24'.
    """
    def as_long(dotted):
        # Host-byte-order unsigned long for a dotted-quad string.
        return struct.unpack('=L', socket.inet_aton(dotted))[0]

    net_address, prefix_bits = net.split('/')
    mask = as_long(dotted_netmask(int(prefix_bits)))
    network = as_long(net_address) & mask
    return (as_long(ip) & mask) == (network & mask)


def dotted_netmask(mask):
    """Convert a /xx prefix length to a dotted-quad netmask string.

    Example: dotted_netmask(24) == '255.255.255.0'.
    """
    host_bits = 32 - mask
    netmask_value = (0xffffffff >> host_bits) << host_bits
    return socket.inet_ntoa(struct.pack('>I', netmask_value))


def is_ipv4_address(string_ip):
    """Return True if *string_ip* parses as a dotted IPv4 address."""
    try:
        socket.inet_aton(string_ip)
        return True
    except socket.error:
        return False


def is_valid_cidr(string_network):
    """Very simple check of the cidr format in no_proxy variable"""
    if string_network.count('/') != 1:
        return False

    address, _, prefix = string_network.partition('/')

    try:
        mask = int(prefix)
    except ValueError:
        return False

    if not 1 <= mask <= 32:
        return False

    try:
        socket.inet_aton(address)
    except socket.error:
        return False

    return True


def should_bypass_proxies(url):
    """
    Returns whether we should bypass proxies or not.
    """
    def get_proxy(key):
        # Environment variables may be spelled lower- or upper-case.
        return os.environ.get(key) or os.environ.get(key.upper())

    # First check whether no_proxy is defined. If it is, check that the URL
    # we're getting isn't in the no_proxy list.
    no_proxy = get_proxy('no_proxy')
    netloc = urlparse(url).netloc

    if no_proxy:
        # We need to check whether we match here. We need to see if we match
        # the end of the netloc, both with and without the port.
        hosts = [h for h in no_proxy.replace(' ', '').split(',') if h]

        host_only = netloc.split(':')[0]
        if is_ipv4_address(host_only):
            # Compare against CIDR entries when the target is a plain IP.
            for proxy_ip in hosts:
                if is_valid_cidr(proxy_ip) and address_in_network(host_only, proxy_ip):
                    return True
        else:
            for host in hosts:
                if netloc.endswith(host) or host_only.endswith(host):
                    # The URL does match something in no_proxy, so we don't
                    # want to apply the proxies on this URL.
                    return True

    # If the system proxy settings indicate that this URL should be bypassed,
    # don't proxy.
    # The proxy_bypass function is incredibly buggy on OS X in early versions
    # of Python 2.6, so allow this call to fail. Only catch the specific
    # exceptions we've seen, though: this call failing in other ways can reveal
    # legitimate problems.
    try:
        bypass = proxy_bypass(netloc)
    except (TypeError, socket.gaierror):
        bypass = False

    return bool(bypass)
def get_environ_proxies(url):
    """Return a dict of environment proxies.

    Returns an empty dict when the environment's no_proxy / bypass settings
    exclude *url*; otherwise the scheme->proxy mapping from the environment.
    """
    if should_bypass_proxies(url):
        return {}
    else:
        return getproxies()


def select_proxy(url, proxies):
    """Select a proxy for the url, if applicable.

    :param url: The url being for the request
    :param proxies: A dictionary of schemes or schemes and hosts to proxy URLs

    A 'scheme://hostname' key takes precedence over a bare 'scheme' key.
    """
    proxies = proxies or {}
    urlparts = urlparse(url)

    if urlparts.hostname is None:
        # URLs such as 'file:///path' have no host, so only a scheme-wide
        # proxy can apply.  (Concatenating None here used to raise
        # TypeError.)
        return proxies.get(urlparts.scheme)

    proxy = proxies.get(urlparts.scheme + '://' + urlparts.hostname)
    if proxy is None:
        proxy = proxies.get(urlparts.scheme)
    return proxy
def default_user_agent(name="python-requests"):
    """Return a string representing the default user agent."""
    return '%s/%s' % (name, __version__)


def default_headers():
    """Return the default set of request headers as a CaseInsensitiveDict."""
    return CaseInsensitiveDict({
        'User-Agent': default_user_agent(),
        'Accept-Encoding': ', '.join(('gzip', 'deflate')),
        'Accept': '*/*',
        'Connection': 'keep-alive',
    })


def parse_header_links(value):
    """Return a dict of parsed link headers proxies.

    i.e. Link: <http:/.../front.jpeg>; rel=front; type="image/jpeg",<http://.../back.jpeg>; rel=back;type="image/jpeg"

    """

    links = []

    replace_chars = " '\""

    for val in re.split(", *<", value):
        try:
            url, params = val.split(";", 1)
        except ValueError:
            # No parameters on this link.
            url, params = val, ''

        link = {}

        link["url"] = url.strip("<> '\"")

        for param in params.split(";"):
            try:
                key, value = param.split("=")
            except ValueError:
                # Malformed parameter: stop processing this link's params.
                break

            link[key.strip(replace_chars)] = value.strip(replace_chars)

        links.append(link)

    return links


# Null bytes; no need to recreate these on each call to guess_json_utf
_null = '\x00'.encode('ascii')  # encoding to ASCII for Python 3
_null2 = _null * 2
_null3 = _null * 3


def guess_json_utf(data):
    """Guess the Unicode encoding of a JSON bytestring, or None if unknown.

    JSON always starts with two ASCII characters, so detection is as easy
    as counting the null bytes and, from their location and count,
    determining the encoding. A BOM, if present, is also detected.
    """
    sample = data[:4]
    # Bug fix: this previously tested codecs.BOM32_BE, which is a legacy
    # alias for the *UTF-16* big-endian BOM, so a UTF-32-BE BOM was never
    # detected here.
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'     # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'     # BOM included
    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:   # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None
def prepend_scheme_if_needed(url, new_scheme):
    '''Given a URL that may or may not have a scheme, prepend the given scheme.
    Does not replace a present scheme with the one provided as an argument.'''
    parsed = urlparse(url, new_scheme)
    netloc, path = parsed.netloc, parsed.path

    # urlparse is a finicky beast, and sometimes decides that there isn't a
    # netloc present. Assume that it's being over-cautious, and switch netloc
    # and path if urlparse decided there was no netloc.
    if not netloc:
        netloc, path = path, netloc

    return urlunparse((parsed.scheme, netloc, path, parsed.params,
                       parsed.query, parsed.fragment))
def get_auth_from_url(url):
    """Given a url with authentication components, extract them into a tuple of
    username,password."""
    parsed = urlparse(url)

    try:
        # unquote raises TypeError (or AttributeError) when the URL has no
        # userinfo and username/password are None.
        return (unquote(parsed.username), unquote(parsed.password))
    except (AttributeError, TypeError):
        return ('', '')


def to_native_string(string, encoding='ascii'):
    """
    Given a string object, regardless of type, returns a representation of that
    string in the native string type, encoding and decoding where necessary.
    This assumes ASCII unless told otherwise.
    """
    if isinstance(string, builtin_str):
        return string
    if is_py2:
        return string.encode(encoding)
    return string.decode(encoding)


def urldefragauth(url):
    """
    Given a url remove the fragment and the authentication part
    """
    parts = urlparse(url)
    netloc, path = parts.netloc, parts.path

    # see func:`prepend_scheme_if_needed`
    if not netloc:
        netloc, path = path, netloc

    # Drop everything up to (and including) the last '@' in the authority.
    netloc = netloc.rsplit('@', 1)[-1]

    # The fragment slot is deliberately replaced with ''.
    return urlunparse((parts.scheme, netloc, path, parts.params,
                       parts.query, ''))