# -*- coding: utf-8 -*-

"""
requests.utils
~~~~~~~~~~~~~~

This module provides utility functions that are used within Requests
that are also useful for external consumption.
"""

import codecs
import contextlib
import io
import os
import re
import socket
import struct
import sys
import tempfile
import warnings
import zipfile

from .__version__ import __version__
from . import certs
# to_native_string is unused here, but imported here for backwards compatibility
from ._internal_utils import to_native_string
from .compat import parse_http_list as _parse_list_header
from .compat import (
    quote, urlparse, bytes, str, OrderedDict, unquote, getproxies,
    proxy_bypass, urlunparse, basestring, integer_types, is_py3,
    proxy_bypass_environment, getproxies_environment, Mapping)
from .cookies import cookiejar_from_dict
from .structures import CaseInsensitiveDict
from .exceptions import (
    InvalidURL, InvalidHeader, FileModeWarning, UnrewindableBodyError)

# Candidate netrc file names, probed in order under the user's home directory.
NETRC_FILES = ('.netrc', '_netrc')

DEFAULT_CA_BUNDLE_PATH = certs.where()

DEFAULT_PORTS = {'http': 80, 'https': 443}


if sys.platform == 'win32':
    # provide a proxy_bypass version on Windows without DNS lookups

    def proxy_bypass_registry(host):
        """Decide from the Windows registry proxy settings whether *host*
        should bypass the proxy.

        Reads ``ProxyEnable``/``ProxyOverride`` from the current user's
        Internet Settings key and matches *host* against the (glob-style)
        override entries. Returns ``False`` on any registry access failure.

        :rtype: bool
        """
        try:
            if is_py3:
                import winreg
            else:
                import _winreg as winreg
        except ImportError:
            return False

        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            # ProxyEnable could be REG_SZ or REG_DWORD, normalizing it
            proxyEnable = int(winreg.QueryValueEx(internetSettings,
                                                  'ProxyEnable')[0])
            # ProxyOverride is almost always a string
            proxyOverride = winreg.QueryValueEx(internetSettings,
                                                'ProxyOverride')[0]
        except OSError:
            return False
        if not proxyEnable or not proxyOverride:
            return False

        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        # Each entry is a glob pattern; translate it to a regex before matching.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' means "bypass for plain hostnames" (no dots).
                if '.' not in host:
                    return True
            test = test.replace(".", r"\.")  # mask dots
            test = test.replace("*", r".*")  # change glob sequence
            test = test.replace("?", r".")   # change glob char
            if re.match(test, host, re.I):
                return True
        return False

    def proxy_bypass(host):  # noqa
        """Return True, if the host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)


def dict_to_sequence(d):
    """Returns an internal sequence dictionary update.

    If *d* has an ``items`` method (i.e. is mapping-like) return its items
    view; otherwise return *d* unchanged.
    """

    if hasattr(d, 'items'):
        d = d.items()

    return d


def super_len(o):
    """Best-effort number of *unread* bytes in *o*.

    Works for anything with ``__len__``, a ``len`` attribute, a real
    ``fileno()`` (via ``os.fstat``), or ``seek``/``tell`` (StringIO/BytesIO).
    The current read position, when discoverable via ``tell()``, is
    subtracted so partially-read file-like objects report only what is left.

    :rtype: int
    """
    total_length = None
    current_position = 0

    if hasattr(o, '__len__'):
        total_length = len(o)

    elif hasattr(o, 'len'):
        total_length = o.len

    elif hasattr(o, 'fileno'):
        try:
            fileno = o.fileno()
        except io.UnsupportedOperation:
            # e.g. BytesIO exposes fileno() but cannot provide one.
            pass
        else:
            total_length = os.fstat(fileno).st_size

            # Having used fstat to determine the file length, we need to
            # confirm that this file was opened up in binary mode.
            if 'b' not in o.mode:
                warnings.warn((
                    "Requests has determined the content-length for this "
                    "request using the binary size of the file: however, the "
                    "file has been opened in text mode (i.e. without the 'b' "
                    "flag in the mode). This may lead to an incorrect "
                    "content-length. In Requests 3.0, support will be removed "
                    "for files in text mode."),
                    FileModeWarning
                )

    if hasattr(o, 'tell'):
        try:
            current_position = o.tell()
        except (OSError, IOError):
            # This can happen in some weird situations, such as when the file
            # is actually a special file descriptor like stdin. In this
            # instance, we don't know what the length is, so set it to zero and
            # let requests chunk it instead.
            if total_length is not None:
                current_position = total_length
        else:
            if hasattr(o, 'seek') and total_length is None:
                # StringIO and BytesIO have seek but no useable fileno
                try:
                    # seek to end of file
                    o.seek(0, 2)
                    total_length = o.tell()

                    # seek back to current position to support
                    # partially read file-like objects
                    o.seek(current_position or 0)
                except (OSError, IOError):
                    total_length = 0

    if total_length is None:
        total_length = 0

    return max(0, total_length - current_position)


def get_netrc_auth(url, raise_errors=False):
    """Returns the Requests tuple auth for a given url from netrc.

    :param url: URL whose host is looked up in the user's netrc file.
    :param raise_errors: if True, re-raise netrc parse/permission errors
        instead of silently skipping netrc auth.
    :rtype: tuple or None
    """

    try:
        from netrc import netrc, NetrcParseError

        netrc_path = None

        for f in NETRC_FILES:
            try:
                loc = os.path.expanduser('~/{}'.format(f))
            except KeyError:
                # os.path.expanduser can fail when $HOME is undefined and
                # getpwuid fails. See https://bugs.python.org/issue20164 &
                # https://github.com/requests/requests/issues/1846
                return

            if os.path.exists(loc):
                netrc_path = loc
                break

        # Abort early if there isn't one.
        if netrc_path is None:
            return

        ri = urlparse(url)

        # Strip port numbers from netloc. This weird `if...encode`` dance is
        # used for Python 3.2, which doesn't support unicode literals.
        splitstr = b':'
        if isinstance(url, str):
            splitstr = splitstr.decode('ascii')
        host = ri.netloc.split(splitstr)[0]

        try:
            _netrc = netrc(netrc_path).authenticators(host)
            if _netrc:
                # Return with login / password
                # (netrc entries may carry auth in the login or account slot).
                login_i = (0 if _netrc[0] else 1)
                return (_netrc[login_i], _netrc[2])
        except (NetrcParseError, IOError):
            # If there was a parsing error or a permissions issue reading the file,
            # we'll just skip netrc auth unless explicitly asked to raise errors.
            if raise_errors:
                raise

    # AppEngine hackiness.
    except (ImportError, AttributeError):
        pass


def guess_filename(obj):
    """Tries to guess the filename of the given object.

    Uses the ``name`` attribute if it is a string and does not look like a
    special pseudo-name such as ``<stdin>``.
    """
    name = getattr(obj, 'name', None)
    if (name and isinstance(name, basestring) and name[0] != '<' and
            name[-1] != '>'):
        return os.path.basename(name)


def extract_zipped_paths(path):
    """Replace nonexistent paths that look like they refer to a member of a zip
    archive with the location of an extracted copy of the target, or else
    just return the provided path unchanged.

    :rtype: str
    """
    if os.path.exists(path):
        # this is already a valid path, no need to do anything further
        return path

    # find the first valid part of the provided path and treat that as a zip archive
    # assume the rest of the path is the name of a member in the archive
    archive, member = os.path.split(path)
    while archive and not os.path.exists(archive):
        archive, prefix = os.path.split(archive)
        member = '/'.join([prefix, member])

    if not zipfile.is_zipfile(archive):
        return path

    zip_file = zipfile.ZipFile(archive)
    if member not in zip_file.namelist():
        return path

    # we have a valid zip archive and a valid member of that archive
    tmp = tempfile.gettempdir()
    extracted_path = os.path.join(tmp, *member.split('/'))
    if not os.path.exists(extracted_path):
        extracted_path = zip_file.extract(member, path=tmp)

    return extracted_path


def from_key_val_list(value):
    """Take an object and test to see if it can be represented as a
    dictionary. Unless it can not be represented as such, return an
    OrderedDict, e.g.,

    ::

        >>> from_key_val_list([('key', 'val')])
        OrderedDict([('key', 'val')])
        >>> from_key_val_list('string')
        ValueError: cannot encode objects that are not 2-tuples
        >>> from_key_val_list({'key': 'val'})
        OrderedDict([('key', 'val')])

    :rtype: OrderedDict
    """
    if value is None:
        return None

    # Scalars cannot be interpreted as key/value pairs.
    if isinstance(value, (str, bytes, bool, int)):
        raise ValueError('cannot encode objects that are not 2-tuples')

    return OrderedDict(value)


def to_key_val_list(value):
    """Take an object and test to see if it can be represented as a
    dictionary. If it can be, return a list of tuples, e.g.,

    ::

        >>> to_key_val_list([('key', 'val')])
        [('key', 'val')]
        >>> to_key_val_list({'key': 'val'})
        [('key', 'val')]
        >>> to_key_val_list('string')
        ValueError: cannot encode objects that are not 2-tuples.

    :rtype: list
    """
    if value is None:
        return None

    # Scalars cannot be interpreted as key/value pairs.
    if isinstance(value, (str, bytes, bool, int)):
        raise ValueError('cannot encode objects that are not 2-tuples')

    if isinstance(value, Mapping):
        value = value.items()

    return list(value)


# From mitsuhiko/werkzeug (used with permission).
def parse_list_header(value):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings. A quoted-string could
    contain a comma. A non-quoted string could have quotes in the
    middle. Quotes are removed automatically after parsing.

    It basically works like :func:`parse_set_header` just that items
    may appear multiple times and case sensitivity is preserved.

    The return value is a standard :class:`list`:

    >>> parse_list_header('token, "quoted value"')
    ['token', 'quoted value']

    To create a header from the :class:`list` again, use the
    :func:`dump_header` function.

    :param value: a string with a list header.
    :return: :class:`list`
    :rtype: list
    """
    result = []
    for item in _parse_list_header(value):
        if item[:1] == item[-1:] == '"':
            item = unquote_header_value(item[1:-1])
        result.append(item)
    return result


# From mitsuhiko/werkzeug (used with permission).
def parse_dict_header(value):
    """Parse lists of key, value pairs as described by RFC 2068 Section 2 and
    convert them into a python dict:

    >>> d = parse_dict_header('foo="is a fish", bar="as well"')
    >>> type(d) is dict
    True
    >>> sorted(d.items())
    [('bar', 'as well'), ('foo', 'is a fish')]

    If there is no value for a key it will be `None`:

    >>> parse_dict_header('key_without_value')
    {'key_without_value': None}

    To create a header from the :class:`dict` again, use the
    :func:`dump_header` function.

    :param value: a string with a dict header.
    :return: :class:`dict`
    :rtype: dict
    """
    result = {}
    for item in _parse_list_header(value):
        if '=' not in item:
            result[item] = None
            continue
        name, value = item.split('=', 1)
        if value[:1] == value[-1:] == '"':
            value = unquote_header_value(value[1:-1])
        result[name] = value
    return result


# From mitsuhiko/werkzeug (used with permission).
def unquote_header_value(value, is_filename=False):
    r"""Unquotes a header value. (Reversal of :func:`quote_header_value`).
    This does not use the real unquoting but what browsers are actually
    using for quoting.

    :param value: the header value to unquote.
    :rtype: str
    """
    if value and value[0] == value[-1] == '"':
        # this is not the real unquoting, but fixing this so that the
        # RFC is met will result in bugs with internet explorer and
        # probably some other browsers as well. IE for example is
        # uploading files with "C:\foo\bar.txt" as filename
        value = value[1:-1]

        # if this is a filename and the starting characters look like
        # a UNC path, then just return the value without quotes. Using the
        # replace sequence below on a UNC path has the effect of turning
        # the leading double slash into a single slash and then
        # _fix_ie_filename() doesn't work correctly. See #458.
        if not is_filename or value[:2] != '\\\\':
            return value.replace('\\\\', '\\').replace('\\"', '"')
    return value


def dict_from_cookiejar(cj):
    """Returns a key/value dictionary from a CookieJar.

    :param cj: CookieJar object to extract cookies from.
    :rtype: dict
    """

    cookie_dict = {}

    for cookie in cj:
        cookie_dict[cookie.name] = cookie.value

    return cookie_dict


def add_dict_to_cookiejar(cj, cookie_dict):
    """Returns a CookieJar from a key/value dictionary.

    :param cj: CookieJar to insert cookies into.
    :param cookie_dict: Dict of key/values to insert into CookieJar.
    :rtype: CookieJar
    """

    return cookiejar_from_dict(cookie_dict, cj)


def get_encodings_from_content(content):
    """Returns encodings from given content string.

    Scans HTML ``<meta>`` charset/pragma declarations and the XML prolog.

    :param content: bytestring to extract encodings from.
    :rtype: list
    """
    warnings.warn((
        'In requests 3.0, get_encodings_from_content will be removed. For '
        'more information, please see the discussion on issue #2266. (This'
        ' warning should only appear once.)'),
        DeprecationWarning)

    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

    return (charset_re.findall(content) +
            pragma_re.findall(content) +
            xml_re.findall(content))


def _parse_content_type_header(header):
    """Returns content type and parameters from given header

    :param header: string
    :return: tuple containing content type and dictionary of
         parameters
    """

    tokens = header.split(';')
    content_type, params = tokens[0].strip(), tokens[1:]
    params_dict = {}
    items_to_strip = "\"' "

    for param in params:
        param = param.strip()
        if param:
            # Bare tokens (no '=') are recorded with the value True.
            key, value = param, True
            index_of_equals = param.find("=")
            if index_of_equals != -1:
                key = param[:index_of_equals].strip(items_to_strip)
                value = param[index_of_equals + 1:].strip(items_to_strip)
            params_dict[key.lower()] = value
    return content_type, params_dict


def get_encoding_from_headers(headers):
    """Returns encodings from given HTTP Header Dict.

    :param headers: dictionary to extract encoding from.
    :rtype: str
    """

    content_type = headers.get('content-type')

    if not content_type:
        return None

    content_type, params = _parse_content_type_header(content_type)

    if 'charset' in params:
        return params['charset'].strip("'\"")

    if 'text' in content_type:
        # RFC 2616 default charset for text/* when none is declared.
        return 'ISO-8859-1'


def stream_decode_response_unicode(iterator, r):
    """Stream decodes a iterator.

    Yields the chunks unchanged when the response declares no encoding;
    otherwise incrementally decodes each chunk, flushing any trailing
    partial multi-byte sequence at the end.
    """

    if r.encoding is None:
        for item in iterator:
            yield item
        return

    decoder = codecs.getincrementaldecoder(r.encoding)(errors='replace')
    for chunk in iterator:
        rv = decoder.decode(chunk)
        if rv:
            yield rv
    # Flush whatever remains buffered in the decoder.
    rv = decoder.decode(b'', final=True)
    if rv:
        yield rv


def iter_slices(string, slice_length):
    """Iterate over slices of a string.

    A non-positive or ``None`` *slice_length* yields the whole string at once.
    """
    pos = 0
    if slice_length is None or slice_length <= 0:
        slice_length = len(string)
    while pos < len(string):
        yield string[pos:pos + slice_length]
        pos += slice_length


def get_unicode_from_response(r):
    """Returns the requested content back in unicode.

    :param r: Response object to get unicode content from.

    Tried:

    1. charset from content-type
    2. fall back and replace all unicode characters

    :rtype: str
    """
    warnings.warn((
        'In requests 3.0, get_unicode_from_response will be removed. For '
        'more information, please see the discussion on issue #2266. (This'
        ' warning should only appear once.)'),
        DeprecationWarning)

    tried_encodings = []

    # Try charset from content-type
    encoding = get_encoding_from_headers(r.headers)

    if encoding:
        try:
            return str(r.content, encoding)
        except UnicodeError:
            tried_encodings.append(encoding)

    # Fall back:
    try:
        return str(r.content, encoding, errors='replace')
    except TypeError:
        # encoding is None here, so str() raises TypeError.
        return r.content


# The unreserved URI characters (RFC 3986)
UNRESERVED_SET = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + "0123456789-._~")


def unquote_unreserved(uri):
    """Un-escape any percent-escape sequences in a URI that are unreserved
    characters. This leaves all reserved, illegal and non-ASCII bytes encoded.

    :rtype: str
    """
    parts = uri.split('%')
    for i in range(1, len(parts)):
        # Each part (except the first) starts right after a '%'; its first
        # two characters should be the hex escape digits.
        h = parts[i][0:2]
        if len(h) == 2 and h.isalnum():
            try:
                c = chr(int(h, 16))
            except ValueError:
                raise InvalidURL("Invalid percent-escape sequence: '%s'" % h)

            if c in UNRESERVED_SET:
                parts[i] = c + parts[i][2:]
            else:
                parts[i] = '%' + parts[i]
        else:
            parts[i] = '%' + parts[i]
    return ''.join(parts)


def requote_uri(uri):
    """Re-quote the given URI.

    This function passes the given URI through an unquote/quote cycle to
    ensure that it is fully and consistently quoted.

    :rtype: str
    """
    safe_with_percent = "!#$%&'()*+,/:;=?@[]~"
    safe_without_percent = "!#$&'()*+,/:;=?@[]~"
    try:
        # Unquote only the unreserved characters
        # Then quote only illegal characters (do not quote reserved,
        # unreserved, or '%')
        return quote(unquote_unreserved(uri), safe=safe_with_percent)
    except InvalidURL:
        # We couldn't unquote the given URI, so let's try quoting it, but
        # there may be unquoted '%'s in the URI. We need to make sure they're
        # properly quoted so they do not cause issues elsewhere.
        return quote(uri, safe=safe_without_percent)


def address_in_network(ip, net):
    """This function allows you to check if an IP belongs to a network subnet

    Example: returns True if ip = 192.168.1.1 and net = 192.168.1.0/24
             returns False if ip = 192.168.1.1 and net = 192.168.100.0/24

    :rtype: bool
    """
    ipaddr = struct.unpack('=L', socket.inet_aton(ip))[0]
    netaddr, bits = net.split('/')
    netmask = struct.unpack('=L', socket.inet_aton(dotted_netmask(int(bits))))[0]
    network = struct.unpack('=L', socket.inet_aton(netaddr))[0] & netmask
    return (ipaddr & netmask) == (network & netmask)


def dotted_netmask(mask):
    """Converts mask from /xx format to xxx.xxx.xxx.xxx

    Example: if mask is 24 function returns 255.255.255.0

    :rtype: str
    """
    # XOR the low (32 - mask) bits away from an all-ones word.
    bits = 0xffffffff ^ (1 << 32 - mask) - 1
    return socket.inet_ntoa(struct.pack('>I', bits))


def is_ipv4_address(string_ip):
    """
    :rtype: bool
    """
    try:
        socket.inet_aton(string_ip)
    except socket.error:
        return False
    return True


def is_valid_cidr(string_network):
    """
    Very simple check of the cidr format in no_proxy variable.

    :rtype: bool
    """
    if string_network.count('/') == 1:
        try:
            mask = int(string_network.split('/')[1])
        except ValueError:
            return False

        if mask < 1 or mask > 32:
            return False

        try:
            socket.inet_aton(string_network.split('/')[0])
        except socket.error:
            return False
    else:
        return False
    return True


@contextlib.contextmanager
def set_environ(env_name, value):
    """Set the environment variable 'env_name' to 'value'

    Save previous value, yield, and then restore the previous value stored in
    the environment variable 'env_name'.

    If 'value' is None, do nothing"""
    value_changed = value is not None
    if value_changed:
        old_value = os.environ.get(env_name)
        os.environ[env_name] = value
    try:
        yield
    finally:
        if value_changed:
            if old_value is None:
                del os.environ[env_name]
            else:
                os.environ[env_name] = old_value


def should_bypass_proxies(url, no_proxy):
    """
    Returns whether we should bypass proxies or not.

    :param url: URL being requested.
    :param no_proxy: explicit no_proxy list; falls back to the environment
        variable when None.
    :rtype: bool
    """
    # Prioritize lowercase environment variables over uppercase
    # to keep a consistent behaviour with other http projects (curl, wget).
    get_proxy = lambda k: os.environ.get(k) or os.environ.get(k.upper())

    # First check whether no_proxy is defined. If it is, check that the URL
    # we're getting isn't in the no_proxy list.
    no_proxy_arg = no_proxy
    if no_proxy is None:
        no_proxy = get_proxy('no_proxy')
    parsed = urlparse(url)

    if parsed.hostname is None:
        # URLs don't always have hostnames, e.g. file:/// urls.
        return True

    if no_proxy:
        # We need to check whether we match here. We need to see if we match
        # the end of the hostname, both with and without the port.
        no_proxy = (
            host for host in no_proxy.replace(' ', '').split(',') if host
        )

        if is_ipv4_address(parsed.hostname):
            for proxy_ip in no_proxy:
                if is_valid_cidr(proxy_ip):
                    if address_in_network(parsed.hostname, proxy_ip):
                        return True
                elif parsed.hostname == proxy_ip:
                    # If no_proxy ip was defined in plain IP notation instead of cidr notation &
                    # matches the IP of the index
                    return True
        else:
            host_with_port = parsed.hostname
            if parsed.port:
                host_with_port += ':{}'.format(parsed.port)

            for host in no_proxy:
                if parsed.hostname.endswith(host) or host_with_port.endswith(host):
                    # The URL does match something in no_proxy, so we don't want
                    # to apply the proxies on this URL.
                    return True

    # Temporarily export no_proxy so platform-level proxy_bypass sees it.
    with set_environ('no_proxy', no_proxy_arg):
        # parsed.hostname can be `None` in cases such as a file URI.
        try:
            bypass = proxy_bypass(parsed.hostname)
        except (TypeError, socket.gaierror):
            bypass = False

    if bypass:
        return True

    return False


def get_environ_proxies(url, no_proxy=None):
    """
    Return a dict of environment proxies.

    :rtype: dict
    """
    if should_bypass_proxies(url, no_proxy=no_proxy):
        return {}
    else:
        return getproxies()


def select_proxy(url, proxies):
    """Select a proxy for the url, if applicable.

    :param url: The url being for the request
    :param proxies: A dictionary of schemes or schemes and hosts to proxy URLs
    """
    proxies = proxies or {}
    urlparts = urlparse(url)
    if urlparts.hostname is None:
        return proxies.get(urlparts.scheme, proxies.get('all'))

    # Most-specific key first: scheme://host, then scheme, then all://host, then all.
    proxy_keys = [
        urlparts.scheme + '://' + urlparts.hostname,
        urlparts.scheme,
        'all://' + urlparts.hostname,
        'all',
    ]
    proxy = None
    for proxy_key in proxy_keys:
        if proxy_key in proxies:
            proxy = proxies[proxy_key]
            break

    return proxy


def default_user_agent(name="python-requests"):
    """
    Return a string representing the default user agent.

    :rtype: str
    """
    return '%s/%s' % (name, __version__)


def default_headers():
    """
    :rtype: requests.structures.CaseInsensitiveDict
    """
    return CaseInsensitiveDict({
        'User-Agent': default_user_agent(),
        'Accept-Encoding': ', '.join(('gzip', 'deflate')),
        'Accept': '*/*',
        'Connection': 'keep-alive',
    })


def parse_header_links(value):
    """Return a list of parsed link headers proxies.

    i.e. Link: <http:/.../front.jpeg>; rel=front; type="image/jpeg",<http://.../back.jpeg>; rel=back;type="image/jpeg"

    :rtype: list
    """

    links = []

    replace_chars = ' \'"'

    value = value.strip(replace_chars)
    if not value:
        return links

    # Links are comma-separated; each starts with a <url> part.
    for val in re.split(', *<', value):
        try:
            url, params = val.split(';', 1)
        except ValueError:
            url, params = val, ''

        link = {'url': url.strip('<> \'"')}

        for param in params.split(';'):
            try:
                key, value = param.split('=')
            except ValueError:
                break

            link[key.strip(replace_chars)] = value.strip(replace_chars)

        links.append(link)

    return links


# Null bytes; no need to recreate these on each call to guess_json_utf
_null = '\x00'.encode('ascii')  # encoding to ASCII for Python 3
_null2 = _null * 2
_null3 = _null * 3


def guess_json_utf(data):
    """Guess the UTF flavour of a JSON byte string from its BOM/null pattern.

    :rtype: str
    """
    # JSON always starts with two ASCII characters, so detection is as
    # easy as counting the nulls and from their location and count
    # determine the encoding. Also detect a BOM, if present.
    sample = data[:4]
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'     # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'     # BOM included
    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:   # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None


def prepend_scheme_if_needed(url, new_scheme):
    """Given a URL that may or may not have a scheme, prepend the given scheme.
    Does not replace a present scheme with the one provided as an argument.

    :rtype: str
    """
    scheme, netloc, path, params, query, fragment = urlparse(url, new_scheme)

    # urlparse is a finicky beast, and sometimes decides that there isn't a
    # netloc present. Assume that it's being over-cautious, and switch netloc
    # and path if urlparse decided there was no netloc.
    if not netloc:
        netloc, path = path, netloc

    return urlunparse((scheme, netloc, path, params, query, fragment))


def get_auth_from_url(url):
    """Given a url with authentication components, extract them into a tuple of
    username,password.

    :rtype: (str,str)
    """
    parsed = urlparse(url)

    try:
        auth = (unquote(parsed.username), unquote(parsed.password))
    except (AttributeError, TypeError):
        # username/password may be None (unquote raises TypeError on None).
        auth = ('', '')

    return auth


# Moved outside of function to avoid recompile every call
_CLEAN_HEADER_REGEX_BYTE = re.compile(b'^\\S[^\\r\\n]*$|^$')
_CLEAN_HEADER_REGEX_STR = re.compile(r'^\S[^\r\n]*$|^$')


def check_header_validity(header):
    """Verifies that header value is a string which doesn't contain
    leading whitespace or return characters. This prevents unintended
    header injection.

    :param header: tuple, in the format (name, value).
    :raises InvalidHeader: if the value fails validation or is not str/bytes.
    """
    name, value = header

    if isinstance(value, bytes):
        pat = _CLEAN_HEADER_REGEX_BYTE
    else:
        pat = _CLEAN_HEADER_REGEX_STR
    try:
        if not pat.match(value):
            raise InvalidHeader("Invalid return character or leading space in header: %s" % name)
    except TypeError:
        raise InvalidHeader("Value for header {%s: %s} must be of type str or "
                            "bytes, not %s" % (name, value, type(value)))


def urldefragauth(url):
    """
    Given a url remove the fragment and the authentication part.

    :rtype: str
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)

    # see func:`prepend_scheme_if_needed`
    if not netloc:
        netloc, path = path, netloc

    # Drop any user:pass@ prefix from the network location.
    netloc = netloc.rsplit('@', 1)[-1]

    return urlunparse((scheme, netloc, path, params, query, ''))


def rewind_body(prepared_request):
    """Move file pointer back to its recorded starting position
    so it can be read again on redirect.

    :raises UnrewindableBodyError: if the body has no ``seek`` method, the
        recorded position is not an integer, or seeking fails.
    """
    body_seek = getattr(prepared_request.body, 'seek', None)
    if body_seek is not None and isinstance(prepared_request._body_position, integer_types):
        try:
            body_seek(prepared_request._body_position)
        except (IOError, OSError):
            raise UnrewindableBodyError("An error occurred when rewinding request "
                                        "body for redirect.")
    else:
        raise UnrewindableBodyError("Unable to rewind request body for redirect.")