1# -*- coding: utf-8 -*- 2 3""" 4requests.utils 5~~~~~~~~~~~~~~ 6 7This module provides utility functions that are used within Requests 8that are also useful for external consumption. 9""" 10 11import codecs 12import contextlib 13import io 14import os 15import re 16import socket 17import struct 18import sys 19import tempfile 20import warnings 21import zipfile 22from collections import OrderedDict 23 24from .__version__ import __version__ 25from . import certs 26# to_native_string is unused here, but imported here for backwards compatibility 27from ._internal_utils import to_native_string 28from .compat import parse_http_list as _parse_list_header 29from .compat import ( 30 quote, urlparse, bytes, str, unquote, getproxies, 31 proxy_bypass, urlunparse, basestring, integer_types, is_py3, 32 proxy_bypass_environment, getproxies_environment, Mapping) 33from .cookies import cookiejar_from_dict 34from .structures import CaseInsensitiveDict 35from .exceptions import ( 36 InvalidURL, InvalidHeader, FileModeWarning, UnrewindableBodyError) 37 38NETRC_FILES = ('.netrc', '_netrc') 39 40DEFAULT_CA_BUNDLE_PATH = certs.where() 41 42DEFAULT_PORTS = {'http': 80, 'https': 443} 43 44 45if sys.platform == 'win32': 46 # provide a proxy_bypass version on Windows without DNS lookups 47 48 def proxy_bypass_registry(host): 49 try: 50 if is_py3: 51 import winreg 52 else: 53 import _winreg as winreg 54 except ImportError: 55 return False 56 57 try: 58 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, 59 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 60 # ProxyEnable could be REG_SZ or REG_DWORD, normalizing it 61 proxyEnable = int(winreg.QueryValueEx(internetSettings, 62 'ProxyEnable')[0]) 63 # ProxyOverride is almost always a string 64 proxyOverride = winreg.QueryValueEx(internetSettings, 65 'ProxyOverride')[0] 66 except OSError: 67 return False 68 if not proxyEnable or not proxyOverride: 69 return False 70 71 # make a check value list from the registry entry: replace the 72 # '<local>' string by the localhost entry and the corresponding 73 # canonical entry. 74 proxyOverride = proxyOverride.split(';') 75 # now check if we match one of the registry values. 76 for test in proxyOverride: 77 if test == '<local>': 78 if '.' not in host: 79 return True 80 test = test.replace(".", r"\.") # mask dots 81 test = test.replace("*", r".*") # change glob sequence 82 test = test.replace("?", r".") # change glob char 83 if re.match(test, host, re.I): 84 return True 85 return False 86 87 def proxy_bypass(host): # noqa 88 """Return True, if the host should be bypassed. 89 90 Checks proxy settings gathered from the environment, if specified, 91 or the registry. 92 """ 93 if getproxies_environment(): 94 return proxy_bypass_environment(host) 95 else: 96 return proxy_bypass_registry(host) 97 98 99def dict_to_sequence(d): 100 """Returns an internal sequence dictionary update.""" 101 102 if hasattr(d, 'items'): 103 d = d.items() 104 105 return d 106 107 108def super_len(o): 109 total_length = None 110 current_position = 0 111 112 if hasattr(o, '__len__'): 113 total_length = len(o) 114 115 elif hasattr(o, 'len'): 116 total_length = o.len 117 118 elif hasattr(o, 'fileno'): 119 try: 120 fileno = o.fileno() 121 except io.UnsupportedOperation: 122 pass 123 else: 124 total_length = os.fstat(fileno).st_size 125 126 # Having used fstat to determine the file length, we need to 127 # confirm that this file was opened up in binary mode. 128 if 'b' not in o.mode: 129 warnings.warn(( 130 "Requests has determined the content-length for this " 131 "request using the binary size of the file: however, the " 132 "file has been opened in text mode (i.e. without the 'b' " 133 "flag in the mode). This may lead to an incorrect " 134 "content-length. In Requests 3.0, support will be removed " 135 "for files in text mode."), 136 FileModeWarning 137 ) 138 139 if hasattr(o, 'tell'): 140 try: 141 current_position = o.tell() 142 except (OSError, IOError): 143 # This can happen in some weird situations, such as when the file 144 # is actually a special file descriptor like stdin. In this 145 # instance, we don't know what the length is, so set it to zero and 146 # let requests chunk it instead. 147 if total_length is not None: 148 current_position = total_length 149 else: 150 if hasattr(o, 'seek') and total_length is None: 151 # StringIO and BytesIO have seek but no useable fileno 152 try: 153 # seek to end of file 154 o.seek(0, 2) 155 total_length = o.tell() 156 157 # seek back to current position to support 158 # partially read file-like objects 159 o.seek(current_position or 0) 160 except (OSError, IOError): 161 total_length = 0 162 163 if total_length is None: 164 total_length = 0 165 166 return max(0, total_length - current_position) 167 168 169def get_netrc_auth(url, raise_errors=False): 170 """Returns the Requests tuple auth for a given url from netrc.""" 171 172 netrc_file = os.environ.get('NETRC') 173 if netrc_file is not None: 174 netrc_locations = (netrc_file,) 175 else: 176 netrc_locations = ('~/{}'.format(f) for f in NETRC_FILES) 177 178 try: 179 from netrc import netrc, NetrcParseError 180 181 netrc_path = None 182 183 for f in netrc_locations: 184 try: 185 loc = os.path.expanduser(f) 186 except KeyError: 187 # os.path.expanduser can fail when $HOME is undefined and 188 # getpwuid fails. See https://bugs.python.org/issue20164 & 189 # https://github.com/psf/requests/issues/1846 190 return 191 192 if os.path.exists(loc): 193 netrc_path = loc 194 break 195 196 # Abort early if there isn't one. 197 if netrc_path is None: 198 return 199 200 ri = urlparse(url) 201 202 # Strip port numbers from netloc. This weird `if...encode`` dance is 203 # used for Python 3.2, which doesn't support unicode literals. 204 splitstr = b':' 205 if isinstance(url, str): 206 splitstr = splitstr.decode('ascii') 207 host = ri.netloc.split(splitstr)[0] 208 209 try: 210 _netrc = netrc(netrc_path).authenticators(host) 211 if _netrc: 212 # Return with login / password 213 login_i = (0 if _netrc[0] else 1) 214 return (_netrc[login_i], _netrc[2]) 215 except (NetrcParseError, IOError): 216 # If there was a parsing error or a permissions issue reading the file, 217 # we'll just skip netrc auth unless explicitly asked to raise errors. 218 if raise_errors: 219 raise 220 221 # App Engine hackiness. 222 except (ImportError, AttributeError): 223 pass 224 225 226def guess_filename(obj): 227 """Tries to guess the filename of the given object.""" 228 name = getattr(obj, 'name', None) 229 if (name and isinstance(name, basestring) and name[0] != '<' and 230 name[-1] != '>'): 231 return os.path.basename(name) 232 233 234def extract_zipped_paths(path): 235 """Replace nonexistent paths that look like they refer to a member of a zip 236 archive with the location of an extracted copy of the target, or else 237 just return the provided path unchanged. 238 """ 239 if os.path.exists(path): 240 # this is already a valid path, no need to do anything further 241 return path 242 243 # find the first valid part of the provided path and treat that as a zip archive 244 # assume the rest of the path is the name of a member in the archive 245 archive, member = os.path.split(path) 246 while archive and not os.path.exists(archive): 247 archive, prefix = os.path.split(archive) 248 member = '/'.join([prefix, member]) 249 250 if not zipfile.is_zipfile(archive): 251 return path 252 253 zip_file = zipfile.ZipFile(archive) 254 if member not in zip_file.namelist(): 255 return path 256 257 # we have a valid zip archive and a valid member of that archive 258 tmp = tempfile.gettempdir() 259 extracted_path = os.path.join(tmp, *member.split('/')) 260 if not os.path.exists(extracted_path): 261 extracted_path = zip_file.extract(member, path=tmp) 262 263 return extracted_path 264 265 266def from_key_val_list(value): 267 """Take an object and test to see if it can be represented as a 268 dictionary. Unless it can not be represented as such, return an 269 OrderedDict, e.g., 270 271 :: 272 273 >>> from_key_val_list([('key', 'val')]) 274 OrderedDict([('key', 'val')]) 275 >>> from_key_val_list('string') 276 Traceback (most recent call last): 277 ... 278 ValueError: cannot encode objects that are not 2-tuples 279 >>> from_key_val_list({'key': 'val'}) 280 OrderedDict([('key', 'val')]) 281 282 :rtype: OrderedDict 283 """ 284 if value is None: 285 return None 286 287 if isinstance(value, (str, bytes, bool, int)): 288 raise ValueError('cannot encode objects that are not 2-tuples') 289 290 return OrderedDict(value) 291 292 293def to_key_val_list(value): 294 """Take an object and test to see if it can be represented as a 295 dictionary. If it can be, return a list of tuples, e.g., 296 297 :: 298 299 >>> to_key_val_list([('key', 'val')]) 300 [('key', 'val')] 301 >>> to_key_val_list({'key': 'val'}) 302 [('key', 'val')] 303 >>> to_key_val_list('string') 304 Traceback (most recent call last): 305 ... 306 ValueError: cannot encode objects that are not 2-tuples 307 308 :rtype: list 309 """ 310 if value is None: 311 return None 312 313 if isinstance(value, (str, bytes, bool, int)): 314 raise ValueError('cannot encode objects that are not 2-tuples') 315 316 if isinstance(value, Mapping): 317 value = value.items() 318 319 return list(value) 320 321 322# From mitsuhiko/werkzeug (used with permission). 323def parse_list_header(value): 324 """Parse lists as described by RFC 2068 Section 2. 325 326 In particular, parse comma-separated lists where the elements of 327 the list may include quoted-strings. A quoted-string could 328 contain a comma. A non-quoted string could have quotes in the 329 middle. Quotes are removed automatically after parsing. 330 331 It basically works like :func:`parse_set_header` just that items 332 may appear multiple times and case sensitivity is preserved. 333 334 The return value is a standard :class:`list`: 335 336 >>> parse_list_header('token, "quoted value"') 337 ['token', 'quoted value'] 338 339 To create a header from the :class:`list` again, use the 340 :func:`dump_header` function. 341 342 :param value: a string with a list header. 343 :return: :class:`list` 344 :rtype: list 345 """ 346 result = [] 347 for item in _parse_list_header(value): 348 if item[:1] == item[-1:] == '"': 349 item = unquote_header_value(item[1:-1]) 350 result.append(item) 351 return result 352 353 354# From mitsuhiko/werkzeug (used with permission). 355def parse_dict_header(value): 356 """Parse lists of key, value pairs as described by RFC 2068 Section 2 and 357 convert them into a python dict: 358 359 >>> d = parse_dict_header('foo="is a fish", bar="as well"') 360 >>> type(d) is dict 361 True 362 >>> sorted(d.items()) 363 [('bar', 'as well'), ('foo', 'is a fish')] 364 365 If there is no value for a key it will be `None`: 366 367 >>> parse_dict_header('key_without_value') 368 {'key_without_value': None} 369 370 To create a header from the :class:`dict` again, use the 371 :func:`dump_header` function. 372 373 :param value: a string with a dict header. 374 :return: :class:`dict` 375 :rtype: dict 376 """ 377 result = {} 378 for item in _parse_list_header(value): 379 if '=' not in item: 380 result[item] = None 381 continue 382 name, value = item.split('=', 1) 383 if value[:1] == value[-1:] == '"': 384 value = unquote_header_value(value[1:-1]) 385 result[name] = value 386 return result 387 388 389# From mitsuhiko/werkzeug (used with permission). 390def unquote_header_value(value, is_filename=False): 391 r"""Unquotes a header value. (Reversal of :func:`quote_header_value`). 392 This does not use the real unquoting but what browsers are actually 393 using for quoting. 394 395 :param value: the header value to unquote. 396 :rtype: str 397 """ 398 if value and value[0] == value[-1] == '"': 399 # this is not the real unquoting, but fixing this so that the 400 # RFC is met will result in bugs with internet explorer and 401 # probably some other browsers as well. IE for example is 402 # uploading files with "C:\foo\bar.txt" as filename 403 value = value[1:-1] 404 405 # if this is a filename and the starting characters look like 406 # a UNC path, then just return the value without quotes. Using the 407 # replace sequence below on a UNC path has the effect of turning 408 # the leading double slash into a single slash and then 409 # _fix_ie_filename() doesn't work correctly. See #458. 410 if not is_filename or value[:2] != '\\\\': 411 return value.replace('\\\\', '\\').replace('\\"', '"') 412 return value 413 414 415def dict_from_cookiejar(cj): 416 """Returns a key/value dictionary from a CookieJar. 417 418 :param cj: CookieJar object to extract cookies from. 419 :rtype: dict 420 """ 421 422 cookie_dict = {} 423 424 for cookie in cj: 425 cookie_dict[cookie.name] = cookie.value 426 427 return cookie_dict 428 429 430def add_dict_to_cookiejar(cj, cookie_dict): 431 """Returns a CookieJar from a key/value dictionary. 432 433 :param cj: CookieJar to insert cookies into. 434 :param cookie_dict: Dict of key/values to insert into CookieJar. 435 :rtype: CookieJar 436 """ 437 438 return cookiejar_from_dict(cookie_dict, cj) 439 440 441def get_encodings_from_content(content): 442 """Returns encodings from given content string. 443 444 :param content: bytestring to extract encodings from. 445 """ 446 warnings.warn(( 447 'In requests 3.0, get_encodings_from_content will be removed. For ' 448 'more information, please see the discussion on issue #2266. (This' 449 ' warning should only appear once.)'), 450 DeprecationWarning) 451 452 charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I) 453 pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I) 454 xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]') 455 456 return (charset_re.findall(content) + 457 pragma_re.findall(content) + 458 xml_re.findall(content)) 459 460 461def _parse_content_type_header(header): 462 """Returns content type and parameters from given header 463 464 :param header: string 465 :return: tuple containing content type and dictionary of 466 parameters 467 """ 468 469 tokens = header.split(';') 470 content_type, params = tokens[0].strip(), tokens[1:] 471 params_dict = {} 472 items_to_strip = "\"' " 473 474 for param in params: 475 param = param.strip() 476 if param: 477 key, value = param, True 478 index_of_equals = param.find("=") 479 if index_of_equals != -1: 480 key = param[:index_of_equals].strip(items_to_strip) 481 value = param[index_of_equals + 1:].strip(items_to_strip) 482 params_dict[key.lower()] = value 483 return content_type, params_dict 484 485 486def get_encoding_from_headers(headers): 487 """Returns encodings from given HTTP Header Dict. 488 489 :param headers: dictionary to extract encoding from. 490 :rtype: str 491 """ 492 493 content_type = headers.get('content-type') 494 495 if not content_type: 496 return None 497 498 content_type, params = _parse_content_type_header(content_type) 499 500 if 'charset' in params: 501 return params['charset'].strip("'\"") 502 503 if 'text' in content_type: 504 return 'ISO-8859-1' 505 506 if 'application/json' in content_type: 507 # Assume UTF-8 based on RFC 4627: https://www.ietf.org/rfc/rfc4627.txt since the charset was unset 508 return 'utf-8' 509 510 511def stream_decode_response_unicode(iterator, r): 512 """Stream decodes a iterator.""" 513 514 if r.encoding is None: 515 for item in iterator: 516 yield item 517 return 518 519 decoder = codecs.getincrementaldecoder(r.encoding)(errors='replace') 520 for chunk in iterator: 521 rv = decoder.decode(chunk) 522 if rv: 523 yield rv 524 rv = decoder.decode(b'', final=True) 525 if rv: 526 yield rv 527 528 529def iter_slices(string, slice_length): 530 """Iterate over slices of a string.""" 531 pos = 0 532 if slice_length is None or slice_length <= 0: 533 slice_length = len(string) 534 while pos < len(string): 535 yield string[pos:pos + slice_length] 536 pos += slice_length 537 538 539def get_unicode_from_response(r): 540 """Returns the requested content back in unicode. 541 542 :param r: Response object to get unicode content from. 543 544 Tried: 545 546 1. charset from content-type 547 2. fall back and replace all unicode characters 548 549 :rtype: str 550 """ 551 warnings.warn(( 552 'In requests 3.0, get_unicode_from_response will be removed. For ' 553 'more information, please see the discussion on issue #2266. (This' 554 ' warning should only appear once.)'), 555 DeprecationWarning) 556 557 tried_encodings = [] 558 559 # Try charset from content-type 560 encoding = get_encoding_from_headers(r.headers) 561 562 if encoding: 563 try: 564 return str(r.content, encoding) 565 except UnicodeError: 566 tried_encodings.append(encoding) 567 568 # Fall back: 569 try: 570 return str(r.content, encoding, errors='replace') 571 except TypeError: 572 return r.content 573 574 575# The unreserved URI characters (RFC 3986) 576UNRESERVED_SET = frozenset( 577 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + "0123456789-._~") 578 579 580def unquote_unreserved(uri): 581 """Un-escape any percent-escape sequences in a URI that are unreserved 582 characters. This leaves all reserved, illegal and non-ASCII bytes encoded. 583 584 :rtype: str 585 """ 586 parts = uri.split('%') 587 for i in range(1, len(parts)): 588 h = parts[i][0:2] 589 if len(h) == 2 and h.isalnum(): 590 try: 591 c = chr(int(h, 16)) 592 except ValueError: 593 raise InvalidURL("Invalid percent-escape sequence: '%s'" % h) 594 595 if c in UNRESERVED_SET: 596 parts[i] = c + parts[i][2:] 597 else: 598 parts[i] = '%' + parts[i] 599 else: 600 parts[i] = '%' + parts[i] 601 return ''.join(parts) 602 603 604def requote_uri(uri): 605 """Re-quote the given URI. 606 607 This function passes the given URI through an unquote/quote cycle to 608 ensure that it is fully and consistently quoted. 609 610 :rtype: str 611 """ 612 safe_with_percent = "!#$%&'()*+,/:;=?@[]~" 613 safe_without_percent = "!#$&'()*+,/:;=?@[]~" 614 try: 615 # Unquote only the unreserved characters 616 # Then quote only illegal characters (do not quote reserved, 617 # unreserved, or '%') 618 return quote(unquote_unreserved(uri), safe=safe_with_percent) 619 except InvalidURL: 620 # We couldn't unquote the given URI, so let's try quoting it, but 621 # there may be unquoted '%'s in the URI. We need to make sure they're 622 # properly quoted so they do not cause issues elsewhere. 623 return quote(uri, safe=safe_without_percent) 624 625 626def address_in_network(ip, net): 627 """This function allows you to check if an IP belongs to a network subnet 628 629 Example: returns True if ip = 192.168.1.1 and net = 192.168.1.0/24 630 returns False if ip = 192.168.1.1 and net = 192.168.100.0/24 631 632 :rtype: bool 633 """ 634 ipaddr = struct.unpack('=L', socket.inet_aton(ip))[0] 635 netaddr, bits = net.split('/') 636 netmask = struct.unpack('=L', socket.inet_aton(dotted_netmask(int(bits))))[0] 637 network = struct.unpack('=L', socket.inet_aton(netaddr))[0] & netmask 638 return (ipaddr & netmask) == (network & netmask) 639 640 641def dotted_netmask(mask): 642 """Converts mask from /xx format to xxx.xxx.xxx.xxx 643 644 Example: if mask is 24 function returns 255.255.255.0 645 646 :rtype: str 647 """ 648 bits = 0xffffffff ^ (1 << 32 - mask) - 1 649 return socket.inet_ntoa(struct.pack('>I', bits)) 650 651 652def is_ipv4_address(string_ip): 653 """ 654 :rtype: bool 655 """ 656 try: 657 socket.inet_aton(string_ip) 658 except socket.error: 659 return False 660 return True 661 662 663def is_valid_cidr(string_network): 664 """ 665 Very simple check of the cidr format in no_proxy variable. 666 667 :rtype: bool 668 """ 669 if string_network.count('/') == 1: 670 try: 671 mask = int(string_network.split('/')[1]) 672 except ValueError: 673 return False 674 675 if mask < 1 or mask > 32: 676 return False 677 678 try: 679 socket.inet_aton(string_network.split('/')[0]) 680 except socket.error: 681 return False 682 else: 683 return False 684 return True 685 686 687@contextlib.contextmanager 688def set_environ(env_name, value): 689 """Set the environment variable 'env_name' to 'value' 690 691 Save previous value, yield, and then restore the previous value stored in 692 the environment variable 'env_name'. 693 694 If 'value' is None, do nothing""" 695 value_changed = value is not None 696 if value_changed: 697 old_value = os.environ.get(env_name) 698 os.environ[env_name] = value 699 try: 700 yield 701 finally: 702 if value_changed: 703 if old_value is None: 704 del os.environ[env_name] 705 else: 706 os.environ[env_name] = old_value 707 708 709def should_bypass_proxies(url, no_proxy): 710 """ 711 Returns whether we should bypass proxies or not. 712 713 :rtype: bool 714 """ 715 # Prioritize lowercase environment variables over uppercase 716 # to keep a consistent behaviour with other http projects (curl, wget). 717 get_proxy = lambda k: os.environ.get(k) or os.environ.get(k.upper()) 718 719 # First check whether no_proxy is defined. If it is, check that the URL 720 # we're getting isn't in the no_proxy list. 721 no_proxy_arg = no_proxy 722 if no_proxy is None: 723 no_proxy = get_proxy('no_proxy') 724 parsed = urlparse(url) 725 726 if parsed.hostname is None: 727 # URLs don't always have hostnames, e.g. file:/// urls. 728 return True 729 730 if no_proxy: 731 # We need to check whether we match here. We need to see if we match 732 # the end of the hostname, both with and without the port. 733 no_proxy = ( 734 host for host in no_proxy.replace(' ', '').split(',') if host 735 ) 736 737 if is_ipv4_address(parsed.hostname): 738 for proxy_ip in no_proxy: 739 if is_valid_cidr(proxy_ip): 740 if address_in_network(parsed.hostname, proxy_ip): 741 return True 742 elif parsed.hostname == proxy_ip: 743 # If no_proxy ip was defined in plain IP notation instead of cidr notation & 744 # matches the IP of the index 745 return True 746 else: 747 host_with_port = parsed.hostname 748 if parsed.port: 749 host_with_port += ':{}'.format(parsed.port) 750 751 for host in no_proxy: 752 if parsed.hostname.endswith(host) or host_with_port.endswith(host): 753 # The URL does match something in no_proxy, so we don't want 754 # to apply the proxies on this URL. 755 return True 756 757 with set_environ('no_proxy', no_proxy_arg): 758 # parsed.hostname can be `None` in cases such as a file URI. 759 try: 760 bypass = proxy_bypass(parsed.hostname) 761 except (TypeError, socket.gaierror): 762 bypass = False 763 764 if bypass: 765 return True 766 767 return False 768 769 770def get_environ_proxies(url, no_proxy=None): 771 """ 772 Return a dict of environment proxies. 773 774 :rtype: dict 775 """ 776 if should_bypass_proxies(url, no_proxy=no_proxy): 777 return {} 778 else: 779 return getproxies() 780 781 782def select_proxy(url, proxies): 783 """Select a proxy for the url, if applicable. 784 785 :param url: The url being for the request 786 :param proxies: A dictionary of schemes or schemes and hosts to proxy URLs 787 """ 788 proxies = proxies or {} 789 urlparts = urlparse(url) 790 if urlparts.hostname is None: 791 return proxies.get(urlparts.scheme, proxies.get('all')) 792 793 proxy_keys = [ 794 urlparts.scheme + '://' + urlparts.hostname, 795 urlparts.scheme, 796 'all://' + urlparts.hostname, 797 'all', 798 ] 799 proxy = None 800 for proxy_key in proxy_keys: 801 if proxy_key in proxies: 802 proxy = proxies[proxy_key] 803 break 804 805 return proxy 806 807 808def default_user_agent(name="python-requests"): 809 """ 810 Return a string representing the default user agent. 811 812 :rtype: str 813 """ 814 return '%s/%s' % (name, __version__) 815 816 817def default_headers(): 818 """ 819 :rtype: requests.structures.CaseInsensitiveDict 820 """ 821 return CaseInsensitiveDict({ 822 'User-Agent': default_user_agent(), 823 'Accept-Encoding': ', '.join(('gzip', 'deflate')), 824 'Accept': '*/*', 825 'Connection': 'keep-alive', 826 }) 827 828 829def parse_header_links(value): 830 """Return a list of parsed link headers proxies. 831 832 i.e. Link: <http:/.../front.jpeg>; rel=front; type="image/jpeg",<http://.../back.jpeg>; rel=back;type="image/jpeg" 833 834 :rtype: list 835 """ 836 837 links = [] 838 839 replace_chars = ' \'"' 840 841 value = value.strip(replace_chars) 842 if not value: 843 return links 844 845 for val in re.split(', *<', value): 846 try: 847 url, params = val.split(';', 1) 848 except ValueError: 849 url, params = val, '' 850 851 link = {'url': url.strip('<> \'"')} 852 853 for param in params.split(';'): 854 try: 855 key, value = param.split('=') 856 except ValueError: 857 break 858 859 link[key.strip(replace_chars)] = value.strip(replace_chars) 860 861 links.append(link) 862 863 return links 864 865 866# Null bytes; no need to recreate these on each call to guess_json_utf 867_null = '\x00'.encode('ascii') # encoding to ASCII for Python 3 868_null2 = _null * 2 869_null3 = _null * 3 870 871 872def guess_json_utf(data): 873 """ 874 :rtype: str 875 """ 876 # JSON always starts with two ASCII characters, so detection is as 877 # easy as counting the nulls and from their location and count 878 # determine the encoding. Also detect a BOM, if present. 879 sample = data[:4] 880 if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): 881 return 'utf-32' # BOM included 882 if sample[:3] == codecs.BOM_UTF8: 883 return 'utf-8-sig' # BOM included, MS style (discouraged) 884 if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): 885 return 'utf-16' # BOM included 886 nullcount = sample.count(_null) 887 if nullcount == 0: 888 return 'utf-8' 889 if nullcount == 2: 890 if sample[::2] == _null2: # 1st and 3rd are null 891 return 'utf-16-be' 892 if sample[1::2] == _null2: # 2nd and 4th are null 893 return 'utf-16-le' 894 # Did not detect 2 valid UTF-16 ascii-range characters 895 if nullcount == 3: 896 if sample[:3] == _null3: 897 return 'utf-32-be' 898 if sample[1:] == _null3: 899 return 'utf-32-le' 900 # Did not detect a valid UTF-32 ascii-range character 901 return None 902 903 904def prepend_scheme_if_needed(url, new_scheme): 905 """Given a URL that may or may not have a scheme, prepend the given scheme. 906 Does not replace a present scheme with the one provided as an argument. 907 908 :rtype: str 909 """ 910 scheme, netloc, path, params, query, fragment = urlparse(url, new_scheme) 911 912 # urlparse is a finicky beast, and sometimes decides that there isn't a 913 # netloc present. Assume that it's being over-cautious, and switch netloc 914 # and path if urlparse decided there was no netloc. 915 if not netloc: 916 netloc, path = path, netloc 917 918 return urlunparse((scheme, netloc, path, params, query, fragment)) 919 920 921def get_auth_from_url(url): 922 """Given a url with authentication components, extract them into a tuple of 923 username,password. 924 925 :rtype: (str,str) 926 """ 927 parsed = urlparse(url) 928 929 try: 930 auth = (unquote(parsed.username), unquote(parsed.password)) 931 except (AttributeError, TypeError): 932 auth = ('', '') 933 934 return auth 935 936 937# Moved outside of function to avoid recompile every call 938_CLEAN_HEADER_REGEX_BYTE = re.compile(b'^\\S[^\\r\\n]*$|^$') 939_CLEAN_HEADER_REGEX_STR = re.compile(r'^\S[^\r\n]*$|^$') 940 941 942def check_header_validity(header): 943 """Verifies that header value is a string which doesn't contain 944 leading whitespace or return characters. This prevents unintended 945 header injection. 946 947 :param header: tuple, in the format (name, value). 948 """ 949 name, value = header 950 951 if isinstance(value, bytes): 952 pat = _CLEAN_HEADER_REGEX_BYTE 953 else: 954 pat = _CLEAN_HEADER_REGEX_STR 955 try: 956 if not pat.match(value): 957 raise InvalidHeader("Invalid return character or leading space in header: %s" % name) 958 except TypeError: 959 raise InvalidHeader("Value for header {%s: %s} must be of type str or " 960 "bytes, not %s" % (name, value, type(value))) 961 962 963def urldefragauth(url): 964 """ 965 Given a url remove the fragment and the authentication part. 966 967 :rtype: str 968 """ 969 scheme, netloc, path, params, query, fragment = urlparse(url) 970 971 # see func:`prepend_scheme_if_needed` 972 if not netloc: 973 netloc, path = path, netloc 974 975 netloc = netloc.rsplit('@', 1)[-1] 976 977 return urlunparse((scheme, netloc, path, params, query, '')) 978 979 980def rewind_body(prepared_request): 981 """Move file pointer back to its recorded starting position 982 so it can be read again on redirect. 983 """ 984 body_seek = getattr(prepared_request.body, 'seek', None) 985 if body_seek is not None and isinstance(prepared_request._body_position, integer_types): 986 try: 987 body_seek(prepared_request._body_position) 988 except (IOError, OSError): 989 raise UnrewindableBodyError("An error occurred when rewinding request " 990 "body for redirect.") 991 else: 992 raise UnrewindableBodyError("Unable to rewind request body for redirect.") 993