# coding: utf-8
# Modified Work: Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
# This software is dual-licensed to you under the Universal Permissive License (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl or Apache License 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose either license.
# Copyright 2018 Kenneth Reitz

# -*- coding: utf-8 -*-

"""
requests.utils
~~~~~~~~~~~~~~

This module provides utility functions that are used within Requests
that are also useful for external consumption.
"""

import codecs
import contextlib
import io
import os
import re
import socket
import struct
import sys
import tempfile
import warnings
import zipfile
from collections import OrderedDict

from .__version__ import __version__
from . import certs
# to_native_string is unused here, but imported here for backwards compatibility
from ._internal_utils import to_native_string
from .compat import parse_http_list as _parse_list_header
from .compat import (
    quote, urlparse, bytes, str, unquote, getproxies,
    proxy_bypass, urlunparse, basestring, integer_types, is_py3,
    proxy_bypass_environment, getproxies_environment, Mapping)
from .cookies import cookiejar_from_dict
from .structures import CaseInsensitiveDict
from .exceptions import (
    InvalidURL, InvalidHeader, FileModeWarning, UnrewindableBodyError)

NETRC_FILES = ('.netrc', '_netrc')

DEFAULT_CA_BUNDLE_PATH = certs.where()

DEFAULT_PORTS = {'http': 80, 'https': 443}


if sys.platform == 'win32':
    # provide a proxy_bypass version on Windows without DNS lookups

    def proxy_bypass_registry(host):
        try:
            if is_py3:
                import winreg
            else:
                import _winreg as winreg
        except ImportError:
            return False

        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            # ProxyEnable could be REG_SZ or REG_DWORD, normalizing it
            proxyEnable = int(winreg.QueryValueEx(internetSettings,
                                                  'ProxyEnable')[0])
            # ProxyOverride is almost always a string
            proxyOverride = winreg.QueryValueEx(internetSettings,
                                                'ProxyOverride')[0]
        except OSError:
            return False
        if not proxyEnable or not proxyOverride:
            return False

        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                if '.' not in host:
                    return True
            test = test.replace(".", r"\.")  # mask dots
            test = test.replace("*", r".*")  # change glob sequence
            test = test.replace("?", r".")   # change glob char
            if re.match(test, host, re.I):
                return True
        return False

    def proxy_bypass(host):  # noqa
        """Return True, if the host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)


def dict_to_sequence(d):
    """Returns an internal sequence dictionary update."""

    if hasattr(d, 'items'):
        d = d.items()

    return d


def super_len(o):
    total_length = None
    current_position = 0

    if hasattr(o, '__len__'):
        total_length = len(o)

    elif hasattr(o, 'len'):
        total_length = o.len

    elif hasattr(o, 'fileno'):
        try:
            fileno = o.fileno()
        except io.UnsupportedOperation:
            pass
        else:
            total_length = os.fstat(fileno).st_size

            # Having used fstat to determine the file length, we need to
            # confirm that this file was opened up in binary mode.
            if 'b' not in o.mode:
                warnings.warn((
                    "Requests has determined the content-length for this "
                    "request using the binary size of the file: however, the "
                    "file has been opened in text mode (i.e. without the 'b' "
                    "flag in the mode). This may lead to an incorrect "
                    "content-length. In Requests 3.0, support will be removed "
                    "for files in text mode."),
                    FileModeWarning
                )

    if hasattr(o, 'tell'):
        try:
            current_position = o.tell()
        except (OSError, IOError):
            # This can happen in some weird situations, such as when the file
            # is actually a special file descriptor like stdin. In this
            # instance, we don't know what the length is, so set it to zero and
            # let requests chunk it instead.
            if total_length is not None:
                current_position = total_length
        else:
            if hasattr(o, 'seek') and total_length is None:
                # StringIO and BytesIO have seek but no useable fileno
                try:
                    # seek to end of file
                    o.seek(0, 2)
                    total_length = o.tell()

                    # seek back to current position to support
                    # partially read file-like objects
                    o.seek(current_position or 0)
                except (OSError, IOError):
                    total_length = 0

    if total_length is None:
        total_length = 0

    return max(0, total_length - current_position)
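

# Illustrative usage of super_len (an editor's sketch, not part of upstream
# requests); shows the seek-based fallback used for in-memory streams:
#
#     >>> from io import BytesIO
#     >>> buf = BytesIO(b'hello')
#     >>> super_len(buf)
#     5
#     >>> _ = buf.read(2)
#     >>> super_len(buf)  # length remaining from the current position
#     3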
97 """ 98 if getproxies_environment(): 99 return proxy_bypass_environment(host) 100 else: 101 return proxy_bypass_registry(host) 102 103 104def dict_to_sequence(d): 105 """Returns an internal sequence dictionary update.""" 106 107 if hasattr(d, 'items'): 108 d = d.items() 109 110 return d 111 112 113def super_len(o): 114 total_length = None 115 current_position = 0 116 117 if hasattr(o, '__len__'): 118 total_length = len(o) 119 120 elif hasattr(o, 'len'): 121 total_length = o.len 122 123 elif hasattr(o, 'fileno'): 124 try: 125 fileno = o.fileno() 126 except io.UnsupportedOperation: 127 pass 128 else: 129 total_length = os.fstat(fileno).st_size 130 131 # Having used fstat to determine the file length, we need to 132 # confirm that this file was opened up in binary mode. 133 if 'b' not in o.mode: 134 warnings.warn(( 135 "Requests has determined the content-length for this " 136 "request using the binary size of the file: however, the " 137 "file has been opened in text mode (i.e. without the 'b' " 138 "flag in the mode). This may lead to an incorrect " 139 "content-length. In Requests 3.0, support will be removed " 140 "for files in text mode."), 141 FileModeWarning 142 ) 143 144 if hasattr(o, 'tell'): 145 try: 146 current_position = o.tell() 147 except (OSError, IOError): 148 # This can happen in some weird situations, such as when the file 149 # is actually a special file descriptor like stdin. In this 150 # instance, we don't know what the length is, so set it to zero and 151 # let requests chunk it instead. 152 if total_length is not None: 153 current_position = total_length 154 else: 155 if hasattr(o, 'seek') and total_length is None: 156 # StringIO and BytesIO have seek but no useable fileno 157 try: 158 # seek to end of file 159 o.seek(0, 2) 160 total_length = o.tell() 161 162 # seek back to current position to support 163 # partially read file-like objects 164 o.seek(current_position or 0) 165 except (OSError, IOError): 166 total_length = 0 167 168 if total_length is None: 169 total_length = 0 170 171 return max(0, total_length - current_position) 172 173 174def get_netrc_auth(url, raise_errors=False): 175 """Returns the Requests tuple auth for a given url from netrc.""" 176 177 netrc_file = os.environ.get('NETRC') 178 if netrc_file is not None: 179 netrc_locations = (netrc_file,) 180 else: 181 netrc_locations = ('~/{}'.format(f) for f in NETRC_FILES) 182 183 try: 184 from netrc import netrc, NetrcParseError 185 186 netrc_path = None 187 188 for f in netrc_locations: 189 try: 190 loc = os.path.expanduser(f) 191 except KeyError: 192 # os.path.expanduser can fail when $HOME is undefined and 193 # getpwuid fails. See https://bugs.python.org/issue20164 & 194 # https://github.com/psf/requests/issues/1846 195 return 196 197 if os.path.exists(loc): 198 netrc_path = loc 199 break 200 201 # Abort early if there isn't one. 202 if netrc_path is None: 203 return 204 205 ri = urlparse(url) 206 207 # Strip port numbers from netloc. This weird `if...encode`` dance is 208 # used for Python 3.2, which doesn't support unicode literals. 


def guess_filename(obj):
    """Tries to guess the filename of the given object."""
    name = getattr(obj, 'name', None)
    if (name and isinstance(name, basestring) and name[0] != '<' and
            name[-1] != '>'):
        return os.path.basename(name)


def extract_zipped_paths(path):
    """Replace nonexistent paths that look like they refer to a member of a zip
    archive with the location of an extracted copy of the target, or else
    just return the provided path unchanged.
    """
    if os.path.exists(path):
        # this is already a valid path, no need to do anything further
        return path

    # find the first valid part of the provided path and treat that as a zip archive
    # assume the rest of the path is the name of a member in the archive
    archive, member = os.path.split(path)
    while archive and not os.path.exists(archive):
        archive, prefix = os.path.split(archive)
        member = '/'.join([prefix, member])

    if not zipfile.is_zipfile(archive):
        return path

    zip_file = zipfile.ZipFile(archive)
    if member not in zip_file.namelist():
        return path

    # we have a valid zip archive and a valid member of that archive
    tmp = tempfile.gettempdir()
    extracted_path = os.path.join(tmp, *member.split('/'))
    if not os.path.exists(extracted_path):
        extracted_path = zip_file.extract(member, path=tmp)

    return extracted_path
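

# Illustrative behaviour of extract_zipped_paths (an editor's sketch, not part
# of upstream requests; the paths are hypothetical): for a path such as
# '/opt/certs.zip/bundle/ca.pem', where '/opt/certs.zip' exists and contains
# the member 'bundle/ca.pem', the member is extracted under
# tempfile.gettempdir() and that extracted location is returned; paths that
# already exist, or that do not point into a zip archive, are returned unchanged.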


def from_key_val_list(value):
    """Take an object and test to see if it can be represented as a
    dictionary. If it can be, return an OrderedDict, e.g.,

    ::

        >>> from_key_val_list([('key', 'val')])
        OrderedDict([('key', 'val')])
        >>> from_key_val_list('string')
        Traceback (most recent call last):
        ...
        ValueError: cannot encode objects that are not 2-tuples
        >>> from_key_val_list({'key': 'val'})
        OrderedDict([('key', 'val')])

    :rtype: OrderedDict
    """
    if value is None:
        return None

    if isinstance(value, (str, bytes, bool, int)):
        raise ValueError('cannot encode objects that are not 2-tuples')

    return OrderedDict(value)


def to_key_val_list(value):
    """Take an object and test to see if it can be represented as a
    dictionary. If it can be, return a list of tuples, e.g.,

    ::

        >>> to_key_val_list([('key', 'val')])
        [('key', 'val')]
        >>> to_key_val_list({'key': 'val'})
        [('key', 'val')]
        >>> to_key_val_list('string')
        Traceback (most recent call last):
        ...
        ValueError: cannot encode objects that are not 2-tuples

    :rtype: list
    """
    if value is None:
        return None

    if isinstance(value, (str, bytes, bool, int)):
        raise ValueError('cannot encode objects that are not 2-tuples')

    if isinstance(value, Mapping):
        value = value.items()

    return list(value)


# From mitsuhiko/werkzeug (used with permission).
def parse_list_header(value):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings. A quoted-string could
    contain a comma. A non-quoted string could have quotes in the
    middle. Quotes are removed automatically after parsing.

    It basically works like :func:`parse_set_header` just that items
    may appear multiple times and case sensitivity is preserved.

    The return value is a standard :class:`list`:

    >>> parse_list_header('token, "quoted value"')
    ['token', 'quoted value']

    To create a header from the :class:`list` again, use the
    :func:`dump_header` function.

    :param value: a string with a list header.
    :return: :class:`list`
    :rtype: list
    """
    result = []
    for item in _parse_list_header(value):
        if item[:1] == item[-1:] == '"':
            item = unquote_header_value(item[1:-1])
        result.append(item)
    return result


# From mitsuhiko/werkzeug (used with permission).
def parse_dict_header(value):
    """Parse lists of key, value pairs as described by RFC 2068 Section 2 and
    convert them into a python dict:

    >>> d = parse_dict_header('foo="is a fish", bar="as well"')
    >>> type(d) is dict
    True
    >>> sorted(d.items())
    [('bar', 'as well'), ('foo', 'is a fish')]

    If there is no value for a key it will be `None`:

    >>> parse_dict_header('key_without_value')
    {'key_without_value': None}

    To create a header from the :class:`dict` again, use the
    :func:`dump_header` function.

    :param value: a string with a dict header.
    :return: :class:`dict`
    :rtype: dict
    """
    result = {}
    for item in _parse_list_header(value):
        if '=' not in item:
            result[item] = None
            continue
        name, value = item.split('=', 1)
        if value[:1] == value[-1:] == '"':
            value = unquote_header_value(value[1:-1])
        result[name] = value
    return result


# From mitsuhiko/werkzeug (used with permission).
def unquote_header_value(value, is_filename=False):
    r"""Unquotes a header value. (Reversal of :func:`quote_header_value`).
    This does not use the real unquoting but what browsers are actually
    using for quoting.

    :param value: the header value to unquote.
    :rtype: str
    """
    if value and value[0] == value[-1] == '"':
        # this is not the real unquoting, but fixing this so that the
        # RFC is met will result in bugs with internet explorer and
        # probably some other browsers as well. IE for example is
        # uploading files with "C:\foo\bar.txt" as filename
        value = value[1:-1]

        # if this is a filename and the starting characters look like
        # a UNC path, then just return the value without quotes. Using the
        # replace sequence below on a UNC path has the effect of turning
        # the leading double slash into a single slash and then
        # _fix_ie_filename() doesn't work correctly. See #458.
        if not is_filename or value[:2] != '\\\\':
            return value.replace('\\\\', '\\').replace('\\"', '"')
    return value
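

# Illustrative examples for unquote_header_value (an editor's sketch, not part
# of upstream requests):
#
#     >>> unquote_header_value('"requests.txt"')
#     'requests.txt'
#     >>> unquote_header_value('"\\\\share\\path"', is_filename=True)  # UNC path: only quotes stripped
#     '\\\\share\\path'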


def dict_from_cookiejar(cj):
    """Returns a key/value dictionary from a CookieJar.

    :param cj: CookieJar object to extract cookies from.
    :rtype: dict
    """

    cookie_dict = {}

    for cookie in cj:
        cookie_dict[cookie.name] = cookie.value

    return cookie_dict


def add_dict_to_cookiejar(cj, cookie_dict):
    """Returns a CookieJar from a key/value dictionary.

    :param cj: CookieJar to insert cookies into.
    :param cookie_dict: Dict of key/values to insert into CookieJar.
    :rtype: CookieJar
    """

    return cookiejar_from_dict(cookie_dict, cj)


def get_encodings_from_content(content):
    """Returns encodings from given content string.

    :param content: bytestring to extract encodings from.
    """
    warnings.warn((
        'In requests 3.0, get_encodings_from_content will be removed. For '
        'more information, please see the discussion on issue #2266. (This'
        ' warning should only appear once.)'),
        DeprecationWarning)

    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

    return (charset_re.findall(content) +
            pragma_re.findall(content) +
            xml_re.findall(content))


def _parse_content_type_header(header):
    """Returns content type and parameters from given header

    :param header: string
    :return: tuple containing content type and dictionary of
        parameters
    """

    tokens = header.split(';')
    content_type, params = tokens[0].strip(), tokens[1:]
    params_dict = {}
    items_to_strip = "\"' "

    for param in params:
        param = param.strip()
        if param:
            key, value = param, True
            index_of_equals = param.find("=")
            if index_of_equals != -1:
                key = param[:index_of_equals].strip(items_to_strip)
                value = param[index_of_equals + 1:].strip(items_to_strip)
            params_dict[key.lower()] = value
    return content_type, params_dict


def get_encoding_from_headers(headers):
    """Returns encodings from given HTTP Header Dict.

    :param headers: dictionary to extract encoding from.
    :rtype: str
    """

    content_type = headers.get('content-type')

    if not content_type:
        return None

    content_type, params = _parse_content_type_header(content_type)

    if 'charset' in params:
        return params['charset'].strip("'\"")

    if 'text' in content_type:
        return 'ISO-8859-1'

    if 'application/json' in content_type:
        # Assume UTF-8 based on RFC 4627: https://www.ietf.org/rfc/rfc4627.txt since the charset was unset
        return 'utf-8'
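

# Illustrative examples for get_encoding_from_headers (an editor's sketch, not
# part of upstream requests):
#
#     >>> get_encoding_from_headers({'content-type': 'text/html; charset=UTF-8'})
#     'UTF-8'
#     >>> get_encoding_from_headers({'content-type': 'application/json'})
#     'utf-8'
#     >>> get_encoding_from_headers({'content-type': 'text/plain'})
#     'ISO-8859-1'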


def stream_decode_response_unicode(iterator, r):
    """Stream decodes an iterator."""

    if r.encoding is None:
        for item in iterator:
            yield item
        return

    decoder = codecs.getincrementaldecoder(r.encoding)(errors='replace')
    for chunk in iterator:
        rv = decoder.decode(chunk)
        if rv:
            yield rv
    rv = decoder.decode(b'', final=True)
    if rv:
        yield rv


def iter_slices(string, slice_length):
    """Iterate over slices of a string."""
    pos = 0
    if slice_length is None or slice_length <= 0:
        slice_length = len(string)
    while pos < len(string):
        yield string[pos:pos + slice_length]
        pos += slice_length


def get_unicode_from_response(r):
    """Returns the requested content back in unicode.

    :param r: Response object to get unicode content from.

    Tried:

    1. charset from content-type
    2. fall back and replace all unicode characters

    :rtype: str
    """
    warnings.warn((
        'In requests 3.0, get_unicode_from_response will be removed. For '
        'more information, please see the discussion on issue #2266. (This'
        ' warning should only appear once.)'),
        DeprecationWarning)

    tried_encodings = []

    # Try charset from content-type
    encoding = get_encoding_from_headers(r.headers)

    if encoding:
        try:
            return str(r.content, encoding)
        except UnicodeError:
            tried_encodings.append(encoding)

    # Fall back:
    try:
        return str(r.content, encoding, errors='replace')
    except TypeError:
        return r.content


# The unreserved URI characters (RFC 3986)
UNRESERVED_SET = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + "0123456789-._~")


def unquote_unreserved(uri):
    """Un-escape any percent-escape sequences in a URI that are unreserved
    characters. This leaves all reserved, illegal and non-ASCII bytes encoded.

    :rtype: str
    """
    parts = uri.split('%')
    for i in range(1, len(parts)):
        h = parts[i][0:2]
        if len(h) == 2 and h.isalnum():
            try:
                c = chr(int(h, 16))
            except ValueError:
                raise InvalidURL("Invalid percent-escape sequence: '%s'" % h)

            if c in UNRESERVED_SET:
                parts[i] = c + parts[i][2:]
            else:
                parts[i] = '%' + parts[i]
        else:
            parts[i] = '%' + parts[i]
    return ''.join(parts)


def requote_uri(uri):
    """Re-quote the given URI.

    This function passes the given URI through an unquote/quote cycle to
    ensure that it is fully and consistently quoted.

    :rtype: str
    """
    safe_with_percent = "!#$%&'()*+,/:;=?@[]~"
    safe_without_percent = "!#$&'()*+,/:;=?@[]~"
    try:
        # Unquote only the unreserved characters
        # Then quote only illegal characters (do not quote reserved,
        # unreserved, or '%')
        return quote(unquote_unreserved(uri), safe=safe_with_percent)
    except InvalidURL:
        # We couldn't unquote the given URI, so let's try quoting it, but
        # there may be unquoted '%'s in the URI. We need to make sure they're
        # properly quoted so they do not cause issues elsewhere.
        return quote(uri, safe=safe_without_percent)
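

# Illustrative examples for unquote_unreserved / requote_uri (an editor's
# sketch, not part of upstream requests):
#
#     >>> unquote_unreserved('http://example.com/%7Euser/%2Fdocs')
#     'http://example.com/~user/%2Fdocs'
#     >>> requote_uri('http://example.com/a path?q=1 2')
#     'http://example.com/a%20path?q=1%202'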


def address_in_network(ip, net):
    """This function allows you to check if an IP belongs to a network subnet

    Example: returns True if ip = 192.168.1.1 and net = 192.168.1.0/24
             returns False if ip = 192.168.1.1 and net = 192.168.100.0/24

    :rtype: bool
    """
    ipaddr = struct.unpack('=L', socket.inet_aton(ip))[0]
    netaddr, bits = net.split('/')
    netmask = struct.unpack('=L', socket.inet_aton(dotted_netmask(int(bits))))[0]
    network = struct.unpack('=L', socket.inet_aton(netaddr))[0] & netmask
    return (ipaddr & netmask) == (network & netmask)


def dotted_netmask(mask):
    """Converts mask from /xx format to xxx.xxx.xxx.xxx

    Example: if mask is 24 function returns 255.255.255.0

    :rtype: str
    """
    bits = 0xffffffff ^ (1 << 32 - mask) - 1
    return socket.inet_ntoa(struct.pack('>I', bits))


def is_ipv4_address(string_ip):
    """
    :rtype: bool
    """
    try:
        socket.inet_aton(string_ip)
    except socket.error:
        return False
    return True


def is_valid_cidr(string_network):
    """
    Very simple check of the cidr format in no_proxy variable.

    :rtype: bool
    """
    if string_network.count('/') == 1:
        try:
            mask = int(string_network.split('/')[1])
        except ValueError:
            return False

        if mask < 1 or mask > 32:
            return False

        try:
            socket.inet_aton(string_network.split('/')[0])
        except socket.error:
            return False
    else:
        return False
    return True
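

# Illustrative examples for the network helpers above (an editor's sketch,
# mirroring the docstrings; not part of upstream requests):
#
#     >>> dotted_netmask(24)
#     '255.255.255.0'
#     >>> address_in_network('192.168.1.1', '192.168.1.0/24')
#     True
#     >>> is_valid_cidr('192.168.1.0/24'), is_valid_cidr('192.168.1.0')
#     (True, False)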


@contextlib.contextmanager
def set_environ(env_name, value):
    """Set the environment variable 'env_name' to 'value'

    Save previous value, yield, and then restore the previous value stored in
    the environment variable 'env_name'.

    If 'value' is None, do nothing"""
    value_changed = value is not None
    if value_changed:
        old_value = os.environ.get(env_name)
        os.environ[env_name] = value
    try:
        yield
    finally:
        if value_changed:
            if old_value is None:
                del os.environ[env_name]
            else:
                os.environ[env_name] = old_value


def should_bypass_proxies(url, no_proxy):
    """
    Returns whether we should bypass proxies or not.

    :rtype: bool
    """
    # Prioritize lowercase environment variables over uppercase
    # to keep a consistent behaviour with other http projects (curl, wget).
    get_proxy = lambda k: os.environ.get(k) or os.environ.get(k.upper())

    # First check whether no_proxy is defined. If it is, check that the URL
    # we're getting isn't in the no_proxy list.
    no_proxy_arg = no_proxy
    if no_proxy is None:
        no_proxy = get_proxy('no_proxy')
    parsed = urlparse(url)

    if parsed.hostname is None:
        # URLs don't always have hostnames, e.g. file:/// urls.
        return True

    if no_proxy:
        # We need to check whether we match here. We need to see if we match
        # the end of the hostname, both with and without the port.
        no_proxy = (
            host for host in no_proxy.replace(' ', '').split(',') if host
        )

        if is_ipv4_address(parsed.hostname):
            for proxy_ip in no_proxy:
                if is_valid_cidr(proxy_ip):
                    if address_in_network(parsed.hostname, proxy_ip):
                        return True
                elif parsed.hostname == proxy_ip:
                    # If no_proxy ip was defined in plain IP notation instead of cidr notation &
                    # matches the IP of the index
                    return True
        else:
            host_with_port = parsed.hostname
            if parsed.port:
                host_with_port += ':{}'.format(parsed.port)

            for host in no_proxy:
                if parsed.hostname.endswith(host) or host_with_port.endswith(host):
                    # The URL does match something in no_proxy, so we don't want
                    # to apply the proxies on this URL.
                    return True

    with set_environ('no_proxy', no_proxy_arg):
        # parsed.hostname can be `None` in cases such as a file URI.
        try:
            bypass = proxy_bypass(parsed.hostname)
        except (TypeError, socket.gaierror):
            bypass = False

        if bypass:
            return True

    return False


def get_environ_proxies(url, no_proxy=None):
    """
    Return a dict of environment proxies.

    :rtype: dict
    """
    if should_bypass_proxies(url, no_proxy=no_proxy):
        return {}
    else:
        return getproxies()


def select_proxy(url, proxies):
    """Select a proxy for the url, if applicable.

    :param url: The url being used for this request
    :param proxies: A dictionary of schemes or schemes and hosts to proxy URLs
    """
    proxies = proxies or {}
    urlparts = urlparse(url)
    if urlparts.hostname is None:
        return proxies.get(urlparts.scheme, proxies.get('all'))

    proxy_keys = [
        urlparts.scheme + '://' + urlparts.hostname,
        urlparts.scheme,
        'all://' + urlparts.hostname,
        'all',
    ]
    proxy = None
    for proxy_key in proxy_keys:
        if proxy_key in proxies:
            proxy = proxies[proxy_key]
            break

    return proxy
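

# Illustrative example for select_proxy (an editor's sketch, not part of
# upstream requests); the proxy URLs and hosts below are hypothetical:
#
#     >>> proxies = {'http': 'http://proxy:3128',
#     ...            'https://example.com': 'http://secure-proxy:8443'}
#     >>> select_proxy('https://example.com/path', proxies)
#     'http://secure-proxy:8443'
#     >>> select_proxy('http://other.org/', proxies)
#     'http://proxy:3128'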


def default_user_agent(name="python-requests"):
    """
    Return a string representing the default user agent.

    :rtype: str
    """
    return '%s/%s' % (name, __version__)


def default_headers():
    """
    :rtype: requests.structures.CaseInsensitiveDict
    """
    return CaseInsensitiveDict({
        'User-Agent': default_user_agent(),
        'Accept-Encoding': ', '.join(('gzip', 'deflate')),
        'Accept': '*/*',
        'Connection': 'keep-alive',
    })


def parse_header_links(value):
    """Return a list of parsed link headers.

    i.e. Link: <http:/.../front.jpeg>; rel=front; type="image/jpeg",<http://.../back.jpeg>; rel=back;type="image/jpeg"

    :rtype: list
    """

    links = []

    replace_chars = ' \'"'

    value = value.strip(replace_chars)
    if not value:
        return links

    for val in re.split(', *<', value):
        try:
            url, params = val.split(';', 1)
        except ValueError:
            url, params = val, ''

        link = {'url': url.strip('<> \'"')}

        for param in params.split(';'):
            try:
                key, value = param.split('=')
            except ValueError:
                break

            link[key.strip(replace_chars)] = value.strip(replace_chars)

        links.append(link)

    return links


# Null bytes; no need to recreate these on each call to guess_json_utf
_null = '\x00'.encode('ascii')  # encoding to ASCII for Python 3
_null2 = _null * 2
_null3 = _null * 3


def guess_json_utf(data):
    """
    :rtype: str
    """
    # JSON always starts with two ASCII characters, so detection is as
    # easy as counting the nulls and from their location and count
    # determine the encoding. Also detect a BOM, if present.
    sample = data[:4]
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'     # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'     # BOM included
    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:   # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None
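

# Illustrative examples for guess_json_utf (an editor's sketch, not part of
# upstream requests):
#
#     >>> guess_json_utf(b'{"key": "value"}')
#     'utf-8'
#     >>> guess_json_utf('{"key": "value"}'.encode('utf-16-le'))
#     'utf-16-le'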
953 """ 954 name, value = header 955 956 if isinstance(value, bytes): 957 pat = _CLEAN_HEADER_REGEX_BYTE 958 else: 959 pat = _CLEAN_HEADER_REGEX_STR 960 try: 961 if not pat.match(value): 962 raise InvalidHeader("Invalid return character or leading space in header: %s" % name) 963 except TypeError: 964 raise InvalidHeader("Value for header {%s: %s} must be of type str or " 965 "bytes, not %s" % (name, value, type(value))) 966 967 968def urldefragauth(url): 969 """ 970 Given a url remove the fragment and the authentication part. 971 972 :rtype: str 973 """ 974 scheme, netloc, path, params, query, fragment = urlparse(url) 975 976 # see func:`prepend_scheme_if_needed` 977 if not netloc: 978 netloc, path = path, netloc 979 980 netloc = netloc.rsplit('@', 1)[-1] 981 982 return urlunparse((scheme, netloc, path, params, query, '')) 983 984 985def rewind_body(prepared_request): 986 """Move file pointer back to its recorded starting position 987 so it can be read again on redirect. 988 """ 989 body_seek = getattr(prepared_request.body, 'seek', None) 990 if body_seek is not None and isinstance(prepared_request._body_position, integer_types): 991 try: 992 body_seek(prepared_request._body_position) 993 except (IOError, OSError): 994 raise UnrewindableBodyError("An error occurred when rewinding request " 995 "body for redirect.") 996 else: 997 raise UnrewindableBodyError("Unable to rewind request body for redirect.") 998