1"""HTTP library functions.
2
3This module contains functions for building an HTTP application
4framework: any one, not just one whose name starts with "Ch". ;) If you
5reference any modules from some popular framework inside *this* module,
6FuManChu will personally hang you up by your thumbs and submit you
7to a public caning.
8"""
9
10import functools
11import email.utils
12import re
13import builtins
14from binascii import b2a_base64
15from cgi import parse_header
16from email.header import decode_header
17from http.server import BaseHTTPRequestHandler
18from urllib.parse import unquote_plus
19
20import jaraco.collections
21
22import cherrypy
23from cherrypy._cpcompat import ntob, ntou
24
# Private copy of the standard library's status table, so the overrides
# below do not leak into http.server. Each entry maps a status code to a
# (reason-phrase, long message) pair.
response_codes = dict(BaseHTTPRequestHandler.responses)

# Replace the stock wording of two server-error entries.
# From https://github.com/cherrypy/cherrypy/issues/361
response_codes.update({
    500: (
        'Internal Server Error',
        'The server encountered an unexpected condition '
        'which prevented it from fulfilling the request.',
    ),
    503: (
        'Service Unavailable',
        'The server is currently unable to handle the '
        'request due to a temporary overloading or '
        'maintenance of the server.',
    ),
})


# email.utils.formatdate with usegmt=True pre-bound, so the rendered date
# string always ends in "GMT".
HTTPDate = functools.partial(
    email.utils.formatdate,
    usegmt=True,
)
38
39
def urljoin(*atoms):
    r"""Join the non-empty path \*atoms into one URL path.

    Blank atoms are dropped, so a SCRIPT_NAME and a PATH_INFO combine
    back into the original URL even when either of them is empty. Runs
    of slashes in the result are collapsed to a single slash.
    """
    joined = '/'.join(filter(None, atoms))
    # Squeeze every run of two or more slashes down to exactly one.
    joined = re.sub('/{2,}', '/', joined)
    # Nothing left means the root: hand back "/" rather than "".
    return joined if joined else '/'
51
52
def urljoin_bytes(*atoms):
    """Join the non-empty bytes path `*atoms` into one URL path.

    Blank atoms are dropped, so a SCRIPT_NAME and a PATH_INFO combine
    back into the original URL even when either of them is empty. Runs
    of slashes in the result are collapsed to a single slash.
    """
    joined = b'/'.join(filter(None, atoms))
    # Squeeze every run of two or more slashes down to exactly one.
    joined = re.sub(b'/{2,}', b'/', joined)
    # Nothing left means the root: hand back b"/" rather than b"".
    return joined if joined else b'/'
64
65
def protocol_from_http(protocol_str):
    """Parse an 'HTTP/x.y' string into the tuple of ints ``(x, y)``.

    Only the single characters at index 5 (major) and index 7 (minor)
    are read, so each version component must be exactly one digit.
    """
    major_at, minor_at = 5, 7
    return tuple(int(protocol_str[pos]) for pos in (major_at, minor_at))
69
70
def get_ranges(headervalue, content_length):
    """Return a list of (start, stop) indices from a Range header, or None.

    Each (start, stop) tuple will be composed of two ints, which are suitable
    for use in a slicing operation. That is, the header "Range: bytes=3-6",
    if applied against a Python string, is requesting resource[3:7]. This
    function will return the list [(3, 7)].

    If this function returns an empty list, you should return HTTP 416.

    ``None`` is returned when ``headervalue`` is empty or syntactically
    invalid (no ``=``, a range spec without ``-``, non-numeric positions,
    or a last position smaller than the first). The caller should then
    behave as if no Range header had been sent. The range unit in front
    of the ``=`` is not inspected.

    >>> get_ranges('bytes=3-6', 8)
    [(3, 7)]
    >>> get_ranges('bytes=-2', 8)
    [(6, 8)]
    >>> get_ranges('bytes=oops', 8) is None
    True
    """
    if not headervalue:
        return None

    _unit, equals, byteranges = headervalue.partition('=')
    if not equals:
        # No "unit=" prefix at all: syntactically invalid, so ignore it.
        return None

    result = []
    for brange in byteranges.split(','):
        start, dash, stop = (x.strip() for x in brange.partition('-'))
        if not dash:
            # A range spec must contain a "-"; ignore the whole header.
            return None

        try:
            first = int(start) if start else None
            last = int(stop) if stop else None
        except ValueError:
            # From rfc 2616 sec 14.16:
            # "If the server ignores a byte-range-spec because it
            # is syntactically invalid, the server SHOULD treat
            # the request as if the invalid Range header field
            # did not exist. (Normally, this means return a 200
            # response containing the full entity)."
            return None

        if first is not None:
            if last is None:
                # Open-ended range ("N-"): runs to the final byte.
                last = content_length - 1
            if first >= content_length:
                # From rfc 2616 sec 14.16:
                # "If the server receives a request (other than one
                # including an If-Range request-header field) with an
                # unsatisfiable Range request-header field (that is,
                # all of whose byte-range-spec values have a first-byte-pos
                # value greater than the current length of the selected
                # resource), it SHOULD return a response code of 416
                # (Requested range not satisfiable)."
                continue
            if last < first:
                # Syntactically invalid; see the rfc quote above.
                return None
            # Convert the inclusive last-byte-pos to an exclusive stop.
            result.append((first, last + 1))
        else:
            if last is None or last < 0:
                # "-" alone, or a signed suffix such as "--5": invalid.
                return None
            # Negative subscript (last N bytes)
            #
            # RFC 2616 Section 14.35.1:
            #   If the entity is shorter than the specified suffix-length,
            #   the entire entity-body is used.
            if last > content_length:
                result.append((0, content_length))
            else:
                result.append((content_length - last, content_length))

    return result
127
128
class HeaderElement(object):

    """An element (with parameters) from an HTTP header's element list.

    ``value`` holds the element's initial token and ``params`` is a dict
    of its ``key=val`` parameters. Instances order by ``value`` alone.
    """

    def __init__(self, value, params=None):
        self.value = value
        if params is None:
            # Fresh dict per instance, never a shared default.
            params = {}
        self.params = params

    def __cmp__(self, other):
        """Three-way compare by value; return -1, 0 or 1.

        Python 3 has no ``cmp`` builtin and never calls ``__cmp__``
        itself, so the comparison is spelled out for explicit callers.
        """
        return (self.value > other.value) - (self.value < other.value)

    def __lt__(self, other):
        """Order elements by their value (used for sorting)."""
        return self.value < other.value

    def __str__(self):
        """Render as 'value;k1=v1;k2=v2', parameters in dict order."""
        p = [';%s=%s' % (k, v) for k, v in self.params.items()]
        return str('%s%s' % (self.value, ''.join(p)))

    def __bytes__(self):
        """Return the ``str()`` rendering passed through ``ntob``."""
        return ntob(self.__str__())

    def __unicode__(self):
        """Return the ``str()`` rendering passed through ``ntou``."""
        return ntou(self.__str__())

    @staticmethod
    def parse(elementstr):
        """Transform 'token;key=val' to ('token', {'key': 'val'})."""
        initial_value, params = parse_header(elementstr)
        return initial_value, params

    @classmethod
    def from_str(cls, elementstr):
        """Construct an instance from a string of the form 'token;key=val'."""
        ival, params = cls.parse(elementstr)
        return cls(ival, params)
166
167
# Matches the ";q=" marker (optional spaces allowed) that introduces the
# quality value of an Accept-style header element.
q_separator = re.compile(r'; *q *=')


class AcceptElement(HeaderElement):

    """An element (with parameters) from an Accept* header's element list.

    AcceptElement objects are comparable; the more-preferred object will be
    "less than" the less-preferred object. They are also therefore sortable;
    if you sort a list of AcceptElement objects, they will be listed in
    priority order; the most preferred value will be first. Yes, it should
    have been the other way around, but it's too late to fix now.
    """

    @classmethod
    def from_str(cls, elementstr):
        """Construct an instance from 'media-range[;params][;q=value[;ext]]'.

        When a q parameter is present it is stored in ``params['q']`` as a
        HeaderElement, so that any extensions following it are kept.
        """
        qvalue = None
        # The first "q" parameter (if any) separates the initial
        # media-range parameter(s) (if any) from the accept-params.
        atoms = q_separator.split(elementstr, 1)
        media_range = atoms.pop(0).strip()
        if atoms:
            # The qvalue for an Accept header can have extensions. The other
            # headers cannot, but it's easier to parse them as if they did.
            qvalue = HeaderElement.from_str(atoms[0].strip())

        media_type, params = cls.parse(media_range)
        if qvalue is not None:
            params['q'] = qvalue
        return cls(media_type, params)

    @property
    def qvalue(self):
        """The qvalue, or priority, of this value, as a float.

        Defaults to 1.0 when no q parameter is present. Raises
        ``cherrypy.HTTPError`` (400) when the q parameter is not a
        valid float.
        """
        val = self.params.get('q', '1')
        if isinstance(val, HeaderElement):
            val = val.value
        try:
            return float(val)
        except ValueError as val_err:
            # Fail client requests with invalid quality value.
            #
            # Ref: https://github.com/cherrypy/cherrypy/issues/1370
            raise cherrypy.HTTPError(
                400,
                'Malformed HTTP header: `{}`'.
                format(str(self)),
            ) from val_err

    def __cmp__(self, other):
        """Three-way compare by qvalue, then by string form; -1, 0 or 1.

        Python 3 has no ``cmp`` builtin and never calls ``__cmp__``
        itself, so the comparison is spelled out for explicit callers.
        """
        mine, theirs = self.qvalue, other.qvalue
        diff = (mine > theirs) - (mine < theirs)
        if diff == 0:
            # Equal priority: fall back to the textual form as tie-breaker.
            mine, theirs = str(self), str(other)
            diff = (mine > theirs) - (mine < theirs)
        return diff

    def __lt__(self, other):
        """Order by qvalue; elements with equal qvalue order by str()."""
        if self.qvalue == other.qvalue:
            return str(self) < str(other)
        else:
            return self.qvalue < other.qvalue
229
230
# Splits on every comma that is followed by an even number of double
# quotes up to the end of the string, i.e. a comma outside a quoted run.
RE_HEADER_SPLIT = re.compile(',(?=(?:[^"]*"[^"]*")*[^"]*$)')


def header_elements(fieldname, fieldvalue):
    """Parse a comma-separated header string into a list of elements.

    Headers whose name starts with 'Accept', and the 'TE' header, yield
    AcceptElement objects; every other header yields HeaderElement
    objects. The list is returned in descending order as defined by the
    elements' ``<`` comparison. An empty or missing ``fieldvalue``
    gives an empty list.
    """
    if not fieldvalue:
        return []

    pieces = RE_HEADER_SPLIT.split(fieldvalue)
    if fieldname.startswith('Accept') or fieldname == 'TE':
        element_type = AcceptElement
    else:
        element_type = HeaderElement

    elements = [element_type.from_str(piece) for piece in pieces]
    # Ascending sort followed by a reversal, so that elements comparing
    # equal also come out in reverse of their sorted (stable) order.
    elements.sort()
    elements.reverse()
    return elements
249
250
def decode_TEXT(value):
    r"""
    Decode :rfc:`2047` TEXT

    Returns ``value`` as a ``str`` with every encoded-word replaced by
    its decoded text. Unencoded text next to an encoded-word is kept.

    >>> decode_TEXT("=?utf-8?q?f=C3=BCr?=") == b'f\xfcr'.decode('latin-1')
    True
    >>> decode_TEXT("hello =?utf-8?q?f=C3=BCr?=") == 'hello f\xfcr'
    True
    """
    parts = []
    for atom, charset in decode_header(value):
        if isinstance(atom, bytes):
            # When the value contains at least one encoded-word,
            # decode_header returns *every* part as bytes, and the
            # unencoded parts carry charset None. Treat those as
            # ISO-8859-1, which maps each byte to one character and so
            # cannot fail.
            atom = atom.decode(charset if charset is not None
                               else 'ISO-8859-1')
        parts.append(atom)
    return ''.join(parts)
265
266
def decode_TEXT_maybe(value):
    """
    Return ``value`` decoded with ``decode_TEXT`` when it contains the
    '=?' marker, and unchanged otherwise.
    """
    if '=?' not in value:
        # No encoded-word marker anywhere: nothing to decode.
        return value
    return decode_TEXT(value)
272
273
def valid_status(status):
    """Return legal HTTP status Code, Reason-phrase and Message.

    The status arg must be an int, a str that begins with an int
    or the constant from ``http.client`` stdlib module. A false value
    (``None``, ``0``, ``''``) is treated as 200.

    If no reason-phrase is supplied with the status, a default
    reason-phrase will be provided. Codes in the 100-599 range that are
    not in ``response_codes`` get an empty reason and message.

    Raises ``ValueError`` when the code is not numeric or lies outside
    100-599.

    >>> import http.client
    >>> from http.server import BaseHTTPRequestHandler
    >>> valid_status(http.client.ACCEPTED) == (
    ...     int(http.client.ACCEPTED),
    ... ) + BaseHTTPRequestHandler.responses[http.client.ACCEPTED]
    True
    """
    status = status or 200

    if isinstance(status, str):
        # "404 Not Found" -> code "404", reason "Not Found".
        code, _, phrase = status.partition(' ')
        reason = phrase.strip() or None
    else:
        code, reason = status, None

    try:
        code = int(code)
    except (TypeError, ValueError):
        raise ValueError('Illegal response status from server '
                         '(%s is non-numeric).' % repr(code))

    if not 100 <= code <= 599:
        raise ValueError('Illegal response status from server '
                         '(%s is out of range).' % repr(code))

    # An unknown code is not illegal; it just has no default wording.
    default_reason, message = response_codes.get(code, ('', ''))

    return code, (default_reason if reason is None else reason), message
319
320
# NOTE: the parse_qs functions that follow are modified versions of those
# in the python3.0 source - we need to pass through an encoding to the unquote
# method, but the default parse_qs function doesn't allow us to.  These do.
324
def _parse_qs(qs, keep_blank_values=0, strict_parsing=0, encoding='utf-8'):
    """Parse a URL-encoded query string into a dict.

    Arguments:

    qs: URL-encoded query string to be parsed. Fields are separated by
        either '&' or ';'.

    keep_blank_values: when true, fields with an empty value (or with
        no '=' at all) are kept and mapped to ''. When false (the
        default) they are dropped.

    strict_parsing: when true, a field without '=' (including an empty
        field) raises ValueError. When false (the default) such fields
        are handled according to keep_blank_values.

    encoding: passed to unquote_plus for decoding percent-escapes in
        names and values; decoding errors are raised ('strict').

    Returns a dict mapping each name to its value, or to a list of
    values when the name occurs more than once.
    """
    parsed = {}
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not (field or strict_parsing):
                continue

            key, equals, raw_value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError('bad query field: %r' % (field,))
                # A control-name with no equal sign counts as blank.
                if not keep_blank_values:
                    continue

            if not (raw_value or keep_blank_values):
                continue

            name = unquote_plus(key, encoding, errors='strict')
            value = unquote_plus(raw_value, encoding, errors='strict')
            if name not in parsed:
                parsed[name] = value
            elif isinstance(parsed[name], list):
                parsed[name].append(value)
            else:
                # Second occurrence: promote the single value to a list.
                parsed[name] = [parsed[name], value]
    return parsed
368
369
# A bare "<x>,<y>" pair of non-negative integers, as sent for a
# server-side image map.
image_map_pattern = re.compile(r'[0-9]+,[0-9]+')


def parse_query_string(query_string, keep_blank_values=True, encoding='utf-8'):
    """Build a params dictionary from a query_string.

    Duplicate key/value pairs in the provided query_string will be
    returned as {'key': [val1, val2, ...]}. Single key/values will
    be returned as strings: {'key': 'value'}.

    A query string that consists solely of two comma-separated integers
    is treated as server-side image map coordinates and returned as
    {'x': int, 'y': int}.
    """
    # The *whole* string must be "x,y". Something that merely starts
    # with coordinates (e.g. "1,2&a=b") is an ordinary query string;
    # treating it as an image map would crash in int().
    if image_map_pattern.fullmatch(query_string):
        # Server-side image map. Map the coords to 'x' and 'y'
        # (like CGI::Request does).
        x, y = query_string.split(',')
        return {'x': int(x), 'y': int(y)}
    return _parse_qs(query_string, keep_blank_values, encoding=encoding)
388
389
class CaseInsensitiveDict(jaraco.collections.KeyTransformingDict):

    """A case-insensitive dict subclass.

    Keys are normalized to title case through ``transform_key``.
    """

    @staticmethod
    def transform_key(key):
        """Return ``key`` in title case; ``None`` becomes the str 'None'."""
        # TODO(#1830): why is None mapped to the string 'None'?
        return 'None' if key is None else key.title()
403
404
# From RFC 2616 sec 2.2:
#   TEXT = <any OCTET except CTLs, but including LWS>
#
# "A CRLF is allowed in the definition of TEXT only as part of a header
# field continuation. It is expected that the folding LWS will be
# replaced with a single SP before interpretation of the TEXT value."
# Arguments for the ``translate`` call that strips control characters
# (codes 0-31 and 127) from encoded header names and values.
if str is not bytes:
    # bytes.translate accepts None as the table: delete only, no mapping.
    header_translate_table = None
    header_translate_deletechars = bytes(list(range(32)) + [127])
else:
    # Only reached where ``str`` and ``bytes`` are the same type: build
    # an identity table and the characters to delete as native strings.
    header_translate_table = ''.join(map(chr, range(256)))
    header_translate_deletechars = ''.join(map(chr, range(32))) + chr(127)
417
418
class HeaderMap(CaseInsensitiveDict):

    """A dict subclass for HTTP request and response headers.

    Each key is changed on entry to str(key).title(). This allows headers
    to be case-insensitive and avoid duplicates.

    Values are header values (decoded according to :rfc:`2047` if necessary).
    """

    # Protocol version consulted by encode() before using RFC 2047.
    protocol = (1, 1)
    # Encodings tried, in order, by encode().
    encodings = ['ISO-8859-1']

    # Someday, when http-bis is done, this will probably get dropped
    # since few servers, clients, or intermediaries do it. But until then,
    # we're going to obey the spec as is.
    # "Words of *TEXT MAY contain characters from character sets other than
    # ISO-8859-1 only when encoded according to the rules of RFC 2047."
    use_rfc_2047 = True

    def elements(self, key):
        """Return a sorted list of HeaderElements for the given header."""
        normalized = self.transform_key(key)
        return header_elements(normalized, self.get(key))

    def values(self, key):
        """Return a sorted list of HeaderElement.value for the given header."""
        return [element.value for element in self.elements(key)]

    def output(self):
        """Transform self into a list of (name, value) tuples."""
        return [*self.encode_header_items(self.items())]

    @classmethod
    def encode_header_items(cls, header_items):
        """
        Yield each (name, value) pair of ``header_items`` in the form
        used for transmitting on the wire for HTTP.

        Values that are neither str nor bytes are converted with str()
        first; name and value are then each passed through
        ``encode_header_item``.
        """
        for name, value in header_items:
            if not isinstance(value, (str, bytes)):
                value = str(value)

            yield cls.encode_header_item(name), cls.encode_header_item(value)

    @classmethod
    def encode_header_item(cls, item):
        """Encode ``item`` if it is a str, then drop control characters."""
        encoded = cls.encode(item) if isinstance(item, str) else item

        # See header_translate_* constants above.
        # Replace only if you really know what you're doing.
        return encoded.translate(
            header_translate_table, header_translate_deletechars)

    @classmethod
    def encode(cls, v):
        """Return the given header name or value, encoded for HTTP output.

        Each of ``cls.encodings`` is tried in turn. If none can encode
        ``v`` and RFC 2047 encoding is enabled for HTTP/1.1, a base64
        utf-8 encoded-word is returned; otherwise ValueError is raised.
        """
        for enc in cls.encodings:
            try:
                return v.encode(enc)
            except UnicodeEncodeError:
                pass

        if cls.protocol != (1, 1) or not cls.use_rfc_2047:
            raise ValueError('Could not encode header part %r using '
                             'any of the encodings %r.' %
                             (v, cls.encodings))

        # Encode RFC-2047 TEXT
        # (e.g. u"\u8200" -> "=?utf-8?b?6IiA?=").
        # We do our own here instead of using the email module
        # because we never want to fold lines--folding has
        # been deprecated by the HTTP working group.
        payload = b2a_base64(v.encode('utf-8')).strip(b'\n')
        return b'=?utf-8?b?' + payload + b'?='
494
495
class Host(object):

    """An internet address.

    ip
        The address, stored exactly as given to the constructor.

    port
        The port, stored exactly as given to the constructor.

    name
        Should be the client's host name. If not available (because no DNS
        lookup is performed), the IP address should be used instead; that
        is also what the constructor does when ``name`` is None.

    """

    # Class-level defaults; every instance overrides them in __init__.
    ip = '0.0.0.0'
    port = 80
    name = 'unknown.tld'

    def __init__(self, ip, port, name=None):
        self.ip, self.port = ip, port
        # Without an explicit name, fall back to the IP address.
        self.name = ip if name is None else name

    def __repr__(self):
        fields = (self.ip, self.port, self.name)
        return 'httputil.Host(%r, %r, %r)' % fields
519