1from __future__ import absolute_import 2 3import re 4from collections import namedtuple 5 6from ..exceptions import LocationParseError 7from ..packages import six 8 9url_attrs = ["scheme", "auth", "host", "port", "path", "query", "fragment"] 10 11# We only want to normalize urls with an HTTP(S) scheme. 12# urllib3 infers URLs without a scheme (None) to be http. 13NORMALIZABLE_SCHEMES = ("http", "https", None) 14 15# Almost all of these patterns were derived from the 16# 'rfc3986' module: https://github.com/python-hyper/rfc3986 17PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}") 18SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)") 19URI_RE = re.compile( 20 r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?" 21 r"(?://([^\\/?#]*))?" 22 r"([^?#]*)" 23 r"(?:\?([^#]*))?" 24 r"(?:#(.*))?$", 25 re.UNICODE | re.DOTALL, 26) 27 28IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}" 29HEX_PAT = "[0-9A-Fa-f]{1,4}" 30LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=HEX_PAT, ipv4=IPV4_PAT) 31_subs = {"hex": HEX_PAT, "ls32": LS32_PAT} 32_variations = [ 33 # 6( h16 ":" ) ls32 34 "(?:%(hex)s:){6}%(ls32)s", 35 # "::" 5( h16 ":" ) ls32 36 "::(?:%(hex)s:){5}%(ls32)s", 37 # [ h16 ] "::" 4( h16 ":" ) ls32 38 "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s", 39 # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 40 "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s", 41 # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 42 "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s", 43 # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 44 "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s", 45 # [ *4( h16 ":" ) h16 ] "::" ls32 46 "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s", 47 # [ *5( h16 ":" ) h16 ] "::" h16 48 "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s", 49 # [ *6( h16 ":" ) h16 ] "::" 50 "(?:(?:%(hex)s:){0,6}%(hex)s)?::", 51] 52 53UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._!\-~" 54IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")" 55ZONE_ID_PAT = "(?:%25|%)(?:[" + UNRESERVED_PAT + 
"]|%[a-fA-F0-9]{2})+" 56IPV6_ADDRZ_PAT = r"\[" + IPV6_PAT + r"(?:" + ZONE_ID_PAT + r")?\]" 57REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*" 58TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$") 59 60IPV4_RE = re.compile("^" + IPV4_PAT + "$") 61IPV6_RE = re.compile("^" + IPV6_PAT + "$") 62IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT + "$") 63BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT[2:-2] + "$") 64ZONE_ID_RE = re.compile("(" + ZONE_ID_PAT + r")\]$") 65 66SUBAUTHORITY_PAT = (u"^(?:(.*)@)?(%s|%s|%s)(?::([0-9]{0,5}))?$") % ( 67 REG_NAME_PAT, 68 IPV4_PAT, 69 IPV6_ADDRZ_PAT, 70) 71SUBAUTHORITY_RE = re.compile(SUBAUTHORITY_PAT, re.UNICODE | re.DOTALL) 72 73UNRESERVED_CHARS = set( 74 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~" 75) 76SUB_DELIM_CHARS = set("!$&'()*+,;=") 77USERINFO_CHARS = UNRESERVED_CHARS | SUB_DELIM_CHARS | {":"} 78PATH_CHARS = USERINFO_CHARS | {"@", "/"} 79QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS | {"?"} 80 81 82class Url(namedtuple("Url", url_attrs)): 83 """ 84 Data structure for representing an HTTP URL. Used as a return value for 85 :func:`parse_url`. Both the scheme and host are normalized as they are 86 both case-insensitive according to RFC 3986. 87 """ 88 89 __slots__ = () 90 91 def __new__( 92 cls, 93 scheme=None, 94 auth=None, 95 host=None, 96 port=None, 97 path=None, 98 query=None, 99 fragment=None, 100 ): 101 if path and not path.startswith("/"): 102 path = "/" + path 103 if scheme is not None: 104 scheme = scheme.lower() 105 return super(Url, cls).__new__( 106 cls, scheme, auth, host, port, path, query, fragment 107 ) 108 109 @property 110 def hostname(self): 111 """For backwards-compatibility with urlparse. We're nice like that.""" 112 return self.host 113 114 @property 115 def request_uri(self): 116 """Absolute path including the query string.""" 117 uri = self.path or "/" 118 119 if self.query is not None: 120 uri += "?" 
+ self.query 121 122 return uri 123 124 @property 125 def netloc(self): 126 """Network location including host and port""" 127 if self.port: 128 return "%s:%d" % (self.host, self.port) 129 return self.host 130 131 @property 132 def url(self): 133 """ 134 Convert self into a url 135 136 This function should more or less round-trip with :func:`.parse_url`. The 137 returned url may not be exactly the same as the url inputted to 138 :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls 139 with a blank port will have : removed). 140 141 Example: :: 142 143 >>> U = parse_url('http://google.com/mail/') 144 >>> U.url 145 'http://google.com/mail/' 146 >>> Url('http', 'username:password', 'host.com', 80, 147 ... '/path', 'query', 'fragment').url 148 'http://username:password@host.com:80/path?query#fragment' 149 """ 150 scheme, auth, host, port, path, query, fragment = self 151 url = u"" 152 153 # We use "is not None" we want things to happen with empty strings (or 0 port) 154 if scheme is not None: 155 url += scheme + u"://" 156 if auth is not None: 157 url += auth + u"@" 158 if host is not None: 159 url += host 160 if port is not None: 161 url += u":" + str(port) 162 if path is not None: 163 url += path 164 if query is not None: 165 url += u"?" + query 166 if fragment is not None: 167 url += u"#" + fragment 168 169 return url 170 171 def __str__(self): 172 return self.url 173 174 175def split_first(s, delims): 176 """ 177 .. deprecated:: 1.25 178 179 Given a string and an iterable of delimiters, split on the first found 180 delimiter. Return two split parts and the matched delimiter. 181 182 If not found, then the first part is the full input string. 183 184 Example:: 185 186 >>> split_first('foo/bar?baz', '?/=') 187 ('foo', 'bar?baz', '/') 188 >>> split_first('foo/bar?baz', '123') 189 ('foo/bar?baz', '', None) 190 191 Scales linearly with number of delims. Not ideal for large number of delims. 
192 """ 193 min_idx = None 194 min_delim = None 195 for d in delims: 196 idx = s.find(d) 197 if idx < 0: 198 continue 199 200 if min_idx is None or idx < min_idx: 201 min_idx = idx 202 min_delim = d 203 204 if min_idx is None or min_idx < 0: 205 return s, "", None 206 207 return s[:min_idx], s[min_idx + 1 :], min_delim 208 209 210def _encode_invalid_chars(component, allowed_chars, encoding="utf-8"): 211 """Percent-encodes a URI component without reapplying 212 onto an already percent-encoded component. 213 """ 214 if component is None: 215 return component 216 217 component = six.ensure_text(component) 218 219 # Normalize existing percent-encoded bytes. 220 # Try to see if the component we're encoding is already percent-encoded 221 # so we can skip all '%' characters but still encode all others. 222 component, percent_encodings = PERCENT_RE.subn( 223 lambda match: match.group(0).upper(), component 224 ) 225 226 uri_bytes = component.encode("utf-8", "surrogatepass") 227 is_percent_encoded = percent_encodings == uri_bytes.count(b"%") 228 encoded_component = bytearray() 229 230 for i in range(0, len(uri_bytes)): 231 # Will return a single character bytestring on both Python 2 & 3 232 byte = uri_bytes[i : i + 1] 233 byte_ord = ord(byte) 234 if (is_percent_encoded and byte == b"%") or ( 235 byte_ord < 128 and byte.decode() in allowed_chars 236 ): 237 encoded_component += byte 238 continue 239 encoded_component.extend(b"%" + (hex(byte_ord)[2:].encode().zfill(2).upper())) 240 241 return encoded_component.decode(encoding) 242 243 244def _remove_path_dot_segments(path): 245 # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code 246 segments = path.split("/") # Turn the path into a list of segments 247 output = [] # Initialize the variable to use to store output 248 249 for segment in segments: 250 # '.' 
is the current directory, so ignore it, it is superfluous 251 if segment == ".": 252 continue 253 # Anything other than '..', should be appended to the output 254 elif segment != "..": 255 output.append(segment) 256 # In this case segment == '..', if we can, we should pop the last 257 # element 258 elif output: 259 output.pop() 260 261 # If the path starts with '/' and the output is empty or the first string 262 # is non-empty 263 if path.startswith("/") and (not output or output[0]): 264 output.insert(0, "") 265 266 # If the path starts with '/.' or '/..' ensure we add one more empty 267 # string to add a trailing '/' 268 if path.endswith(("/.", "/..")): 269 output.append("") 270 271 return "/".join(output) 272 273 274def _normalize_host(host, scheme): 275 if host: 276 if isinstance(host, six.binary_type): 277 host = six.ensure_str(host) 278 279 if scheme in NORMALIZABLE_SCHEMES: 280 is_ipv6 = IPV6_ADDRZ_RE.match(host) 281 if is_ipv6: 282 match = ZONE_ID_RE.search(host) 283 if match: 284 start, end = match.span(1) 285 zone_id = host[start:end] 286 287 if zone_id.startswith("%25") and zone_id != "%25": 288 zone_id = zone_id[3:] 289 else: 290 zone_id = zone_id[1:] 291 zone_id = "%" + _encode_invalid_chars(zone_id, UNRESERVED_CHARS) 292 return host[:start].lower() + zone_id + host[end:] 293 else: 294 return host.lower() 295 elif not IPV4_RE.match(host): 296 return six.ensure_str( 297 b".".join([_idna_encode(label) for label in host.split(".")]) 298 ) 299 return host 300 301 302def _idna_encode(name): 303 if name and any([ord(x) > 128 for x in name]): 304 try: 305 from pip._vendor import idna 306 except ImportError: 307 six.raise_from( 308 LocationParseError("Unable to parse URL without the 'idna' module"), 309 None, 310 ) 311 try: 312 return idna.encode(name.lower(), strict=True, std3_rules=True) 313 except idna.IDNAError: 314 six.raise_from( 315 LocationParseError(u"Name '%s' is not a valid IDNA label" % name), None 316 ) 317 return name.lower().encode("ascii") 318 


def _encode_target(target):
    """Percent-encodes a request target so that there are no invalid characters

    ``target`` is expected to start with '/'; it is split by TARGET_RE into a
    path and an optional query, and any fragment is dropped.

    :raises LocationParseError: If ``target`` does not match TARGET_RE.
    """
    match = TARGET_RE.match(target)
    if not match:
        # Fail with a parse error rather than an AttributeError on None.
        raise LocationParseError("%r is not a valid request URI" % (target,))

    path, query = match.groups()
    target = _encode_invalid_chars(path, PATH_CHARS)
    query = _encode_invalid_chars(query, QUERY_CHARS)
    if query is not None:
        target += "?" + query
    return target


def parse_url(url):
    """
    Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
    performed to parse incomplete urls. Fields not provided will be None.
    This parser is RFC 3986 compliant.

    The parser logic and helper functions are based heavily on
    work done in the ``rfc3986`` module.

    :param str url: URL to parse into a :class:`.Url` namedtuple.
    :raises LocationParseError: If the url cannot be parsed or its port is
        outside the range 0-65535.

    Partly backwards-compatible with :mod:`urlparse`.

    Example::

        >>> parse_url('http://google.com/mail/')
        Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
        >>> parse_url('google.com:80')
        Url(scheme=None, host='google.com', port=80, path=None, ...)
        >>> parse_url('/foo?bar')
        Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
    """
    if not url:
        # Empty
        return Url()

    source_url = url
    # Without a scheme or leading slash, treat the input as an authority.
    if not SCHEME_RE.search(url):
        url = "//" + url

    try:
        # A failed match returns None; the resulting AttributeError is
        # converted to LocationParseError by the handler below.
        scheme, authority, path, query, fragment = URI_RE.match(url).groups()
        normalize_uri = scheme is None or scheme.lower() in NORMALIZABLE_SCHEMES

        if scheme:
            scheme = scheme.lower()

        if authority:
            auth, host, port = SUBAUTHORITY_RE.match(authority).groups()
            if auth and normalize_uri:
                auth = _encode_invalid_chars(auth, USERINFO_CHARS)
            if port == "":
                port = None
        else:
            auth, host, port = None, None, None

        if port is not None:
            port = int(port)
            if not (0 <= port <= 65535):
                # Report the URL exactly as the caller supplied it, without
                # the "//" that may have been prepended above.
                raise LocationParseError(source_url)

        host = _normalize_host(host, scheme)

        if normalize_uri and path:
            path = _remove_path_dot_segments(path)
            path = _encode_invalid_chars(path, PATH_CHARS)
        if normalize_uri and query:
            query = _encode_invalid_chars(query, QUERY_CHARS)
        if normalize_uri and fragment:
            fragment = _encode_invalid_chars(fragment, FRAGMENT_CHARS)

    except (ValueError, AttributeError):
        return six.raise_from(LocationParseError(source_url), None)

    # For the sake of backwards compatibility we put empty
    # string values for path if there are any defined values
    # beyond the path in the URL.
    # TODO: Remove this when we break backwards compatibility.
    if not path:
        if query is not None or fragment is not None:
            path = ""
        else:
            path = None

    # Ensure that each part of the URL is a `str` for
    # backwards compatibility.
    if isinstance(url, six.text_type):
        ensure_func = six.ensure_text
    else:
        ensure_func = six.ensure_str

    def ensure_type(x):
        return x if x is None else ensure_func(x)

    return Url(
        scheme=ensure_type(scheme),
        auth=ensure_type(auth),
        host=ensure_type(host),
        port=port,
        path=ensure_type(path),
        query=ensure_type(query),
        fragment=ensure_type(fragment),
    )


def get_host(url):
    """
    Deprecated. Use :func:`parse_url` instead.

    Returns a ``(scheme, hostname, port)`` tuple, with the scheme defaulting
    to "http" when the url has none.
    """
    p = parse_url(url)
    return p.scheme or "http", p.hostname, p.port