1from __future__ import absolute_import
2
3import re
4from collections import namedtuple
5
6from ..exceptions import LocationParseError
7from ..packages import six
8
# Field names of the ``Url`` namedtuple defined below, in positional order.
url_attrs = ["scheme", "auth", "host", "port", "path", "query", "fragment"]

# We only want to normalize urls with an HTTP(S) scheme.
# urllib3 infers URLs without a scheme (None) to be http, which is why
# ``None`` is a member of this tuple.
NORMALIZABLE_SCHEMES = ("http", "https", None)
14
# Almost all of these patterns were derived from the
# 'rfc3986' module: https://github.com/python-hyper/rfc3986

# A single percent-encoded octet such as "%2F" or "%2f".
PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}")
# Matches input that starts with either "<scheme>:" or "/". parse_url()
# prepends "//" to any input that does NOT match, so that the remainder is
# read as an authority. The scheme character class here has no "." (unlike
# the scheme group of URI_RE), so "google.com:80" does not match.
# NOTE(review): the missing "." looks deliberate, so that "host.name:port"
# is not mistaken for "scheme:path" -- confirm before "fixing" it.
SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)")
# Splits a URI reference into five groups, in order:
#   1. scheme    (optional, without the trailing ":")
#   2. authority (optional, the text after "//" up to the next "\", "/",
#                 "?" or "#")
#   3. path      (always present, may be the empty string)
#   4. query     (optional, without the leading "?")
#   5. fragment  (optional, without the leading "#")
URI_RE = re.compile(
    r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?"
    r"(?://([^\\/?#]*))?"
    r"([^?#]*)"
    r"(?:\?([^#]*))?"
    r"(?:#(.*))?$",
    re.UNICODE | re.DOTALL,
)
27
# Four dot-separated groups of one to three decimal digits. The numeric
# value of each group is not range-checked by this pattern.
IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"
# One IPv6 "h16" group: one to four hexadecimal digits.
HEX_PAT = "[0-9A-Fa-f]{1,4}"
# The "ls32" tail of an IPv6 address: either two h16 groups or an IPv4 address.
LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=HEX_PAT, ipv4=IPV4_PAT)
# Substitution values for the %-style templates in ``_variations``.
_subs = {"hex": HEX_PAT, "ls32": LS32_PAT}
# One template per alternative of the IPv6 address grammar; the comment above
# each entry shows the grammar rule it implements.
_variations = [
    #                            6( h16 ":" ) ls32
    "(?:%(hex)s:){6}%(ls32)s",
    #                       "::" 5( h16 ":" ) ls32
    "::(?:%(hex)s:){5}%(ls32)s",
    # [               h16 ] "::" 4( h16 ":" ) ls32
    "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s",
    # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
    "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s",
    # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
    "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s",
    # [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
    "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
    # [ *4( h16 ":" ) h16 ] "::"              ls32
    "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
    # [ *5( h16 ":" ) h16 ] "::"              h16
    "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s",
    # [ *6( h16 ":" ) h16 ] "::"
    "(?:(?:%(hex)s:){0,6}%(hex)s)?::",
]

# Characters meant to be placed inside a regex character class (see
# ZONE_ID_PAT). NOTE(review): this list contains "!" whereas the
# UNRESERVED_CHARS set further down does not -- confirm that is intended.
UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._!\-~"
# Alternation of every expanded template from ``_variations``.
IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")"
# A zone identifier: "%25" or a bare "%", followed by one or more characters
# that are either in UNRESERVED_PAT or a percent-encoded octet.
ZONE_ID_PAT = "(?:%25|%)(?:[" + UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+"
# A bracketed IPv6 address with an optional zone identifier: "[" addr [zone] "]".
IPV6_ADDRZ_PAT = r"\[" + IPV6_PAT + r"(?:" + ZONE_ID_PAT + r")?\]"
# A registered name: zero or more characters that are either a percent-encoded
# octet or anything other than "[", "]", "%", ":", "/", "?" and "#".
REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*"
# A request target: group 1 is the path (must start with "/"), group 2 is the
# optional query; a trailing fragment is matched but not captured.
TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$")

# Anchored, whole-string versions of the patterns above.
IPV4_RE = re.compile("^" + IPV4_PAT + "$")
IPV6_RE = re.compile("^" + IPV6_PAT + "$")
IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT + "$")
# The [2:-2] slice drops the leading "\[" and trailing "\]" of IPV6_ADDRZ_PAT,
# giving the same pattern without the surrounding brackets.
BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT[2:-2] + "$")
# Captures (group 1) a zone identifier sitting right before a final "]".
ZONE_ID_RE = re.compile("(" + ZONE_ID_PAT + r")\]$")
65
# Splits an authority component into three groups:
#   1. userinfo (optional): the text before an "@"; the match is greedy
#   2. host: a registered name, an IPv4 address or a bracketed IPv6 address
#   3. port (optional): zero to five decimal digits after a ":" -- an empty
#      string is possible and is mapped to None by parse_url()
SUBAUTHORITY_PAT = (u"^(?:(.*)@)?(%s|%s|%s)(?::([0-9]{0,5}))?$") % (
    REG_NAME_PAT,
    IPV4_PAT,
    IPV6_ADDRZ_PAT,
)
SUBAUTHORITY_RE = re.compile(SUBAUTHORITY_PAT, re.UNICODE | re.DOTALL)
72
# Characters that _encode_invalid_chars() may leave untouched, per component.
# Each set is built on top of the previous one.

# ASCII letters, digits and the four marks ".", "_", "-", "~".
UNRESERVED_CHARS = set(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~"
)
SUB_DELIM_CHARS = set("!$&'()*+,;=")
# userinfo: unreserved + sub-delims + ":"
USERINFO_CHARS = UNRESERVED_CHARS.union(SUB_DELIM_CHARS, ":")
# path: everything allowed in userinfo plus "@" and "/"
PATH_CHARS = USERINFO_CHARS.union("@", "/")
# query and fragment share one set: everything allowed in a path plus "?"
QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS.union("?")
80
81
class Url(namedtuple("Url", url_attrs)):
    """
    Data structure for representing an HTTP URL. Used as a return value for
    :func:`parse_url`.

    The constructor lower-cases the scheme (it is case-insensitive according
    to RFC 3986) and makes sure a non-empty path starts with ``/``. All other
    fields are stored exactly as given; every field defaults to ``None``.
    """

    __slots__ = ()

    def __new__(
        cls,
        scheme=None,
        auth=None,
        host=None,
        port=None,
        path=None,
        query=None,
        fragment=None,
    ):
        # An empty or missing path is kept as-is; only a non-empty relative
        # path gets the leading slash.
        if path and not path.startswith("/"):
            path = "/" + path
        if scheme is not None:
            scheme = scheme.lower()
        return super(Url, cls).__new__(
            cls, scheme, auth, host, port, path, query, fragment
        )

    @property
    def hostname(self):
        """For backwards-compatibility with urlparse. We're nice like that."""
        return self.host

    @property
    def request_uri(self):
        """Absolute path including the query string.

        A missing or empty path is rendered as ``/``. The query is appended
        whenever it is not ``None``, so an empty query yields a trailing ``?``.
        """
        uri = self.path or "/"

        if self.query is not None:
            uri += "?" + self.query

        return uri

    @property
    def netloc(self):
        """Network location including host and port.

        Returns ``host:port`` when a port is set and just the host otherwise.
        """
        # Compare against None rather than relying on truthiness so that an
        # explicit port 0 is kept, consistent with the ``url`` property.
        if self.port is not None:
            return "%s:%d" % (self.host, self.port)
        return self.host

    @property
    def url(self):
        """
        Convert self into a url

        This function should more or less round-trip with :func:`.parse_url`. The
        returned url may not be exactly the same as the url inputted to
        :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
        with a blank port will have : removed).

        Example: ::

            >>> U = parse_url('http://google.com/mail/')
            >>> U.url
            'http://google.com/mail/'
            >>> Url('http', 'username:password', 'host.com', 80,
            ... '/path', 'query', 'fragment').url
            'http://username:password@host.com:80/path?query#fragment'
        """
        scheme, auth, host, port, path, query, fragment = self
        url = u""

        # We test with "is not None" because components that are present but
        # empty (an empty string, or port 0) must still be emitted.
        if scheme is not None:
            url += scheme + u"://"
        if auth is not None:
            url += auth + u"@"
        if host is not None:
            url += host
        if port is not None:
            url += u":" + str(port)
        if path is not None:
            url += path
        if query is not None:
            url += u"?" + query
        if fragment is not None:
            url += u"#" + fragment

        return url

    def __str__(self):
        return self.url
173
174
def split_first(s, delims):
    """
    .. deprecated:: 1.25

    Split ``s`` at whichever of the given delimiters occurs earliest.

    Returns a 3-tuple ``(before, after, delimiter)``. When none of the
    delimiters occurs in ``s`` the result is ``(s, '', None)``. If several
    delimiters are found at the same position, the one listed first in
    ``delims`` is reported.

    Example::

        >>> split_first('foo/bar?baz', '?/=')
        ('foo', 'bar?baz', '/')
        >>> split_first('foo/bar?baz', '123')
        ('foo/bar?baz', '', None)

    Scales linearly with number of delims. Not ideal for large number of delims.
    """
    # Pair every delimiter with the position of its first occurrence and
    # keep only the ones that were actually found.
    located = [(s.find(delim), delim) for delim in delims]
    hits = [(pos, delim) for pos, delim in located if pos >= 0]

    if not hits:
        return s, "", None

    # min() returns the first minimal element, so ties go to the delimiter
    # that appears earliest in ``delims``.
    cut, found = min(hits, key=lambda hit: hit[0])

    # Exactly one character is skipped after the cut position.
    return s[:cut], s[cut + 1 :], found
208
209
def _encode_invalid_chars(component, allowed_chars, encoding="utf-8"):
    """Percent-encode every byte of a URI component that is not allowed.

    Existing percent-escapes are upper-cased. If every "%" in the component
    belongs to such an escape, the component is treated as already
    percent-encoded and its "%" signs are passed through; otherwise each "%"
    is encoded like any other disallowed byte. ``None`` is returned unchanged.

    :param component: The component to encode, or ``None``.
    :param allowed_chars: Container of ASCII characters that stay as they are.
    :param encoding: Codec used to decode the encoded result back to text.
    """
    if component is None:
        return component

    component = six.ensure_text(component)

    def _upper_escape(match):
        return match.group(0).upper()

    # Normalize the case of existing escapes and count them at the same time.
    component, escape_count = PERCENT_RE.subn(_upper_escape, component)

    raw = component.encode("utf-8", "surrogatepass")
    # Only when every "%" belongs to a well-formed escape do we leave "%" alone.
    keep_percent = escape_count == raw.count(b"%")
    percent_code = ord(b"%")
    encoded = bytearray()

    # Iterating a bytearray yields integers on both Python 2 and 3.
    for code in bytearray(raw):
        is_allowed = code < 128 and chr(code) in allowed_chars
        if is_allowed or (keep_percent and code == percent_code):
            encoded.append(code)
        else:
            encoded.extend(b"%" + ("%02X" % code).encode())

    return encoded.decode(encoding)
242
243
244def _remove_path_dot_segments(path):
245    # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
246    segments = path.split("/")  # Turn the path into a list of segments
247    output = []  # Initialize the variable to use to store output
248
249    for segment in segments:
250        # '.' is the current directory, so ignore it, it is superfluous
251        if segment == ".":
252            continue
253        # Anything other than '..', should be appended to the output
254        elif segment != "..":
255            output.append(segment)
256        # In this case segment == '..', if we can, we should pop the last
257        # element
258        elif output:
259            output.pop()
260
261    # If the path starts with '/' and the output is empty or the first string
262    # is non-empty
263    if path.startswith("/") and (not output or output[0]):
264        output.insert(0, "")
265
266    # If the path starts with '/.' or '/..' ensure we add one more empty
267    # string to add a trailing '/'
268    if path.endswith(("/.", "/..")):
269        output.append("")
270
271    return "/".join(output)
272
273
def _normalize_host(host, scheme):
    """Normalize ``host`` when ``scheme`` is one of NORMALIZABLE_SCHEMES.

    * A falsy host is returned unchanged.
    * A bytes host is first converted with ``six.ensure_str``.
    * For other schemes the (converted) host is returned without changes.
    * A bracketed IPv6 address is lower-cased; a zone identifier, if present,
      is re-emitted with a single "%" prefix and percent-encoded.
    * An IPv4 address is returned as-is.
    * Anything else is treated as a dotted name and every label goes through
      ``_idna_encode``.
    """
    if not host:
        return host

    if isinstance(host, six.binary_type):
        host = six.ensure_str(host)

    if scheme not in NORMALIZABLE_SCHEMES:
        return host

    if IPV6_ADDRZ_RE.match(host):
        zone_match = ZONE_ID_RE.search(host)
        if not zone_match:
            return host.lower()

        start, end = zone_match.span(1)
        zone_id = host[start:end]

        # Drop the zone delimiter: a leading "%25" (unless that is the whole
        # zone text), otherwise the single leading "%".
        if zone_id.startswith("%25") and zone_id != "%25":
            prefix_len = 3
        else:
            prefix_len = 1
        zone_id = "%" + _encode_invalid_chars(zone_id[prefix_len:], UNRESERVED_CHARS)

        # Only the address part before the zone identifier is lower-cased.
        return host[:start].lower() + zone_id + host[end:]

    if IPV4_RE.match(host):
        return host

    labels = [_idna_encode(label) for label in host.split(".")]
    return six.ensure_str(b".".join(labels))
300
301
302def _idna_encode(name):
303    if name and any([ord(x) > 128 for x in name]):
304        try:
305            from pip._vendor import idna
306        except ImportError:
307            six.raise_from(
308                LocationParseError("Unable to parse URL without the 'idna' module"),
309                None,
310            )
311        try:
312            return idna.encode(name.lower(), strict=True, std3_rules=True)
313        except idna.IDNAError:
314            six.raise_from(
315                LocationParseError(u"Name '%s' is not a valid IDNA label" % name), None
316            )
317    return name.lower().encode("ascii")
318
319
def _encode_target(target):
    """Percent-encode a request target so it contains no invalid characters.

    The target is split with TARGET_RE into path and optional query; each
    part is encoded with its own set of allowed characters and the two are
    joined again with "?". A fragment, if any, is not part of the result.
    """
    raw_path, raw_query = TARGET_RE.match(target).groups()

    encoded_path = _encode_invalid_chars(raw_path, PATH_CHARS)
    encoded_query = _encode_invalid_chars(raw_query, QUERY_CHARS)

    if encoded_query is None:
        return encoded_path
    return encoded_path + "?" + encoded_query
328
329
def parse_url(url):
    """
    Parse ``url`` into a :class:`.Url` namedtuple. Incomplete urls are parsed
    on a best-effort basis and every component that is absent is ``None``.
    This parser is RFC 3986 compliant.

    The parser logic and helper functions are based heavily on
    work done in the ``rfc3986`` module.

    :param str url: URL to parse into a :class:`.Url` namedtuple.
    :raises LocationParseError: If the url cannot be parsed, for instance
        because the authority is malformed or the port is out of range.

    Partly backwards-compatible with :mod:`urlparse`.

    Example::

        >>> parse_url('http://google.com/mail/')
        Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
        >>> parse_url('google.com:80')
        Url(scheme=None, host='google.com', port=80, path=None, ...)
        >>> parse_url('/foo?bar')
        Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
    """
    if not url:
        # Nothing to parse: every field stays None.
        return Url()

    source_url = url
    # Input that starts with neither "<scheme>:" nor "/" is read as an
    # authority, which needs the "//" marker in front of it.
    if SCHEME_RE.search(url) is None:
        url = "//" + url

    try:
        scheme, authority, path, query, fragment = URI_RE.match(url).groups()

        if scheme:
            scheme = scheme.lower()
        # NORMALIZABLE_SCHEMES contains None, so a missing scheme qualifies too.
        normalize_uri = scheme in NORMALIZABLE_SCHEMES

        auth = host = port = None
        if authority:
            auth, host, port = SUBAUTHORITY_RE.match(authority).groups()
            if auth and normalize_uri:
                auth = _encode_invalid_chars(auth, USERINFO_CHARS)
            # "host:" with nothing after the colon means "no port".
            if port == "":
                port = None

        if port is not None:
            port = int(port)
            if port < 0 or port > 65535:
                raise LocationParseError(url)

        host = _normalize_host(host, scheme)

        if normalize_uri:
            if path:
                path = _remove_path_dot_segments(path)
                path = _encode_invalid_chars(path, PATH_CHARS)
            if query:
                query = _encode_invalid_chars(query, QUERY_CHARS)
            if fragment:
                fragment = _encode_invalid_chars(fragment, FRAGMENT_CHARS)

    except (ValueError, AttributeError):
        # Report the url exactly as the caller passed it in.
        return six.raise_from(LocationParseError(source_url), None)

    # For the sake of backwards compatibility an absent path becomes the
    # empty string when a query or fragment is defined, and None otherwise.
    # TODO: Remove this when we break backwards compatibility.
    if not path:
        has_later_component = query is not None or fragment is not None
        path = "" if has_later_component else None

    # Ensure that each part of the URL is a `str` for backwards
    # compatibility: text input gives text parts, anything else gives the
    # native ``str`` type.
    if isinstance(url, six.text_type):
        coerce = six.ensure_text
    else:
        coerce = six.ensure_str

    def _coerce_optional(value):
        if value is None:
            return value
        return coerce(value)

    return Url(
        scheme=_coerce_optional(scheme),
        auth=_coerce_optional(auth),
        host=_coerce_optional(host),
        port=port,
        path=_coerce_optional(path),
        query=_coerce_optional(query),
        fragment=_coerce_optional(fragment),
    )
423
424
def get_host(url):
    """
    Deprecated. Use :func:`parse_url` instead.

    Returns a ``(scheme, hostname, port)`` tuple for ``url``; the scheme falls
    back to ``"http"`` when the parsed url has none.
    """
    parsed = parse_url(url)
    scheme = parsed.scheme or "http"
    return scheme, parsed.hostname, parsed.port
431