"""HTTP library functions.

This module contains functions for building an HTTP application
framework: any one, not just one whose name starts with "Ch". ;) If you
reference any modules from some popular framework inside *this* module,
FuManChu will personally hang you up by your thumbs and submit you
to a public caning.
"""

import functools
import email.utils
import re
import builtins
from binascii import b2a_base64
from cgi import parse_header
from email.header import decode_header
from http.server import BaseHTTPRequestHandler
from urllib.parse import unquote_plus

import jaraco.collections

import cherrypy
from cherrypy._cpcompat import ntob, ntou

response_codes = BaseHTTPRequestHandler.responses.copy()

# From https://github.com/cherrypy/cherrypy/issues/361
response_codes[500] = ('Internal Server Error',
                       'The server encountered an unexpected condition '
                       'which prevented it from fulfilling the request.')
response_codes[503] = ('Service Unavailable',
                       'The server is currently unable to handle the '
                       'request due to a temporary overloading or '
                       'maintenance of the server.')


HTTPDate = functools.partial(email.utils.formatdate, usegmt=True)


def urljoin(*atoms):
    r"""Return the given path \*atoms, joined into a single URL.

    This will correctly join a SCRIPT_NAME and PATH_INFO into the
    original URL, even if either atom is blank.
    """
    url = '/'.join([x for x in atoms if x])
    # Collapse any run of slashes produced by the join into a single one.
    while '//' in url:
        url = url.replace('//', '/')
    # Special-case the final url of "", and return "/" instead.
    return url or '/'


def urljoin_bytes(*atoms):
    """Return the given path `*atoms`, joined into a single URL.

    This is the bytes counterpart of :func:`urljoin`.

    This will correctly join a SCRIPT_NAME and PATH_INFO into the
    original URL, even if either atom is blank.
    """
    url = b'/'.join([x for x in atoms if x])
    # Collapse any run of slashes produced by the join into a single one.
    while b'//' in url:
        url = url.replace(b'//', b'/')
    # Special-case the final url of "", and return "/" instead.
    return url or b'/'


def protocol_from_http(protocol_str):
    """Return a protocol tuple from the given 'HTTP/x.y' string.

    Only the characters at index 5 and 7 are read, so both the major and
    the minor version are expected to be single digits.
    """
    return int(protocol_str[5]), int(protocol_str[7])


def get_ranges(headervalue, content_length):
    """Return a list of (start, stop) indices from a Range header, or None.

    Each (start, stop) tuple will be composed of two ints, which are suitable
    for use in a slicing operation. That is, the header "Range: bytes=3-6",
    if applied against a Python string, is requesting resource[3:7]. This
    function will return the list [(3, 7)].

    If this function returns an empty list, you should return HTTP 416.

    None is returned when the header is empty or syntactically invalid
    (no ``=``, a range without ``-``, or positions that are not
    non-negative integers); the caller should then behave as if no Range
    header had been sent.
    """

    if not headervalue:
        return None

    # The unit (the part before "=") is not inspected.
    _bytesunit, equals, byteranges = headervalue.partition('=')
    if not equals:
        # From rfc 2616 sec 14.16:
        # "If the server ignores a byte-range-spec because it
        # is syntactically invalid, the server SHOULD treat
        # the request as if the invalid Range header field
        # did not exist. (Normally, this means return a 200
        # response containing the full entity)."
        return None

    result = []
    for brange in byteranges.split(','):
        start, dash, stop = [x.strip() for x in brange.partition('-')]
        if not dash:
            # See rfc quote above.
            return None
        if start:
            if not stop:
                stop = content_length - 1
            try:
                start, stop = int(start), int(stop)
            except ValueError:
                # Non-numeric position; see rfc quote above.
                return None
            if start >= content_length:
                # From rfc 2616 sec 14.16:
                # "If the server receives a request (other than one
                # including an If-Range request-header field) with an
                # unsatisfiable Range request-header field (that is,
                # all of whose byte-range-spec values have a first-byte-pos
                # value greater than the current length of the selected
                # resource), it SHOULD return a response code of 416
                # (Requested range not satisfiable)."
                continue
            if stop < start:
                # See rfc quote above.
                return None
            result.append((start, stop + 1))
        else:
            if not stop:
                # See rfc quote above.
                return None
            try:
                suffix_length = int(stop)
            except ValueError:
                # Non-numeric suffix-length; see rfc quote above.
                return None
            if suffix_length < 0:
                # e.g. "bytes=--5"; see rfc quote above.
                return None
            # Negative subscript (last N bytes)
            #
            # RFC 2616 Section 14.35.1:
            #   If the entity is shorter than the specified suffix-length,
            #   the entire entity-body is used.
            if suffix_length > content_length:
                result.append((0, content_length))
            else:
                result.append(
                    (content_length - suffix_length, content_length))

    return result


class HeaderElement(object):

    """An element (with parameters) from an HTTP header's element list."""

    def __init__(self, value, params=None):
        """Store the element value and its parameter dict.

        A fresh empty dict is used when ``params`` is None.
        """
        self.value = value
        if params is None:
            params = {}
        self.params = params

    def __cmp__(self, other):
        return builtins.cmp(self.value, other.value)

    def __lt__(self, other):
        """Order elements by their ``value`` attribute."""
        return self.value < other.value

    def __str__(self):
        """Render as 'value;key=val;...' using the stored params."""
        p = [';%s=%s' % (k, v) for k, v in self.params.items()]
        return str('%s%s' % (self.value, ''.join(p)))

    def __bytes__(self):
        return ntob(self.__str__())

    def __unicode__(self):
        return ntou(self.__str__())

    @staticmethod
    def parse(elementstr):
        """Transform 'token;key=val' to ('token', {'key': 'val'})."""
        initial_value, params = parse_header(elementstr)
        return initial_value, params

    @classmethod
    def from_str(cls, elementstr):
        """Construct an instance from a string of the form 'token;key=val'."""
        ival, params = cls.parse(elementstr)
        return cls(ival, params)


q_separator = re.compile(r'; *q *=')


class AcceptElement(HeaderElement):

    """An element (with parameters) from an Accept* header's element list.

    AcceptElement objects are comparable: they are ordered by qvalue and,
    when the qvalues are equal, by their string form. An element with a
    lower qvalue therefore compares "less than" one with a higher qvalue.
    :func:`header_elements` reverses the sorted list, so the elements it
    returns are in priority order with the most preferred value first.
    """

    @classmethod
    def from_str(cls, elementstr):
        """Construct an instance from an Accept*-style element string.

        Everything after the first ``;q=`` is parsed as its own
        HeaderElement and stored under the ``'q'`` key of ``params``.
        """
        qvalue = None
        # The first "q" parameter (if any) separates the initial
        # media-range parameter(s) (if any) from the accept-params.
        atoms = q_separator.split(elementstr, 1)
        media_range = atoms.pop(0).strip()
        if atoms:
            # The qvalue for an Accept header can have extensions. The other
            # headers cannot, but it's easier to parse them as if they did.
            qvalue = HeaderElement.from_str(atoms[0].strip())

        media_type, params = cls.parse(media_range)
        if qvalue is not None:
            params['q'] = qvalue
        return cls(media_type, params)

    @property
    def qvalue(self):
        """The qvalue, or priority, of this value.

        Defaults to 1.0 when no ``q`` parameter is present. Raises
        ``cherrypy.HTTPError(400)`` when the value is not a valid float.
        """
        val = self.params.get('q', '1')
        if isinstance(val, HeaderElement):
            val = val.value
        try:
            return float(val)
        except ValueError as val_err:
            # Fail client requests with invalid quality value.
            #
            # Ref: https://github.com/cherrypy/cherrypy/issues/1370
            raise cherrypy.HTTPError(
                400,
                'Malformed HTTP header: `{}`'.
                format(str(self)),
            ) from val_err

    def __cmp__(self, other):
        diff = builtins.cmp(self.qvalue, other.qvalue)
        if diff == 0:
            diff = builtins.cmp(str(self), str(other))
        return diff

    def __lt__(self, other):
        """Order by qvalue, breaking ties on the string form."""
        if self.qvalue == other.qvalue:
            return str(self) < str(other)
        else:
            return self.qvalue < other.qvalue


# Split on commas that are not inside a double-quoted string.
RE_HEADER_SPLIT = re.compile(',(?=(?:[^"]*"[^"]*")*[^"]*$)')


def header_elements(fieldname, fieldvalue):
    """Return a sorted HeaderElement list from a comma-separated header string.

    Elements of ``Accept*`` and ``TE`` headers are parsed as AcceptElement,
    all others as HeaderElement. The list is sorted and then reversed.
    An empty or missing ``fieldvalue`` yields an empty list.
    """
    if not fieldvalue:
        return []

    result = []
    for element in RE_HEADER_SPLIT.split(fieldvalue):
        if fieldname.startswith('Accept') or fieldname == 'TE':
            hv = AcceptElement.from_str(element)
        else:
            hv = HeaderElement.from_str(element)
        result.append(hv)

    return list(reversed(sorted(result)))


def decode_TEXT(value):
    r"""
    Decode :rfc:`2047` TEXT

    Parts of the value that are not encoded words are taken to be
    ISO-8859-1 text.

    >>> decode_TEXT("=?utf-8?q?f=C3=BCr?=") == b'f\xfcr'.decode('latin-1')
    True
    """
    decoded_atoms = []
    for atom, charset in decode_header(value):
        if isinstance(atom, bytes):
            # When the value contains at least one encoded word,
            # decode_header returns *every* part as bytes; the parts that
            # were not encoded words carry a charset of None.
            if charset is None:
                charset = 'ISO-8859-1'
            atom = atom.decode(charset)
        decoded_atoms.append(atom)
    return ''.join(decoded_atoms)


def decode_TEXT_maybe(value):
    """
    Decode the text but only if '=?' appears in it.
    """
    return decode_TEXT(value) if '=?' in value else value


def valid_status(status):
    """Return legal HTTP status Code, Reason-phrase and Message.

    The status arg must be an int, a str that begins with an int
    or the constant from ``http.client`` stdlib module.

    If no reason-phrase is supplied in status, a default reason-phrase
    will be provided. A falsy status is treated as 200.

    Raises ValueError if the code is non-numeric or outside 100..599.

    >>> import http.client
    >>> from http.server import BaseHTTPRequestHandler
    >>> valid_status(http.client.ACCEPTED) == (
    ...     int(http.client.ACCEPTED),
    ... ) + BaseHTTPRequestHandler.responses[http.client.ACCEPTED]
    True
    """

    if not status:
        status = 200

    code, reason = status, None
    if isinstance(status, str):
        # "404 Not Found" -> code "404", reason "Not Found"
        code, _, reason = status.partition(' ')
        reason = reason.strip() or None

    try:
        code = int(code)
    except (TypeError, ValueError):
        raise ValueError('Illegal response status from server '
                         '(%s is non-numeric).' % repr(code))

    if code < 100 or code > 599:
        raise ValueError('Illegal response status from server '
                         '(%s is out of range).' % repr(code))

    if code not in response_codes:
        # code is unknown but not illegal
        default_reason, message = '', ''
    else:
        default_reason, message = response_codes[code]

    if reason is None:
        reason = default_reason

    return code, reason, message


# NOTE: the parse_qs functions that follow are modified version of those
# in the python3.0 source - we need to pass through an encoding to the unquote
# method, but the default parse_qs function doesn't allow us to. These do.

def _parse_qs(qs, keep_blank_values=0, strict_parsing=0, encoding='utf-8'):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding: the encoding passed to ``unquote_plus`` for both names and
        values; decoding errors are raised (``errors='strict'``).

    Returns a dict, as G-d intended.
    """
    # Both "&" and ";" are accepted as pair separators.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    d = {}
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError('bad query field: %r' % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote_plus(nv[0], encoding, errors='strict')
            value = unquote_plus(nv[1], encoding, errors='strict')
            if name in d:
                # Repeated name: promote the entry to a list of values.
                if not isinstance(d[name], list):
                    d[name] = [d[name]]
                d[name].append(value)
            else:
                d[name] = value
    return d


image_map_pattern = re.compile(r'[0-9]+,[0-9]+')


def parse_query_string(query_string, keep_blank_values=True, encoding='utf-8'):
    """Build a params dictionary from a query_string.

    Duplicate key/value pairs in the provided query_string will be
    returned as {'key': [val1, val2, ...]}. Single key/values will
    be returned as strings: {'key': 'value'}.

    A query string that begins with 'N,N' is treated as server-side
    image map coordinates and returned as {'x': int, 'y': int}.
    """
    if image_map_pattern.match(query_string):
        # Server-side image map. Map the coords to 'x' and 'y'
        # (like CGI::Request does).
        coords = query_string.split(',')
        try:
            return {'x': int(coords[0]), 'y': int(coords[1])}
        except ValueError:
            # The pattern only anchors at the start, so something like
            # "1,2&a=b" gets here too. It is not a pair of coordinates;
            # parse it as an ordinary query string instead of failing.
            pass
    return _parse_qs(query_string, keep_blank_values, encoding=encoding)


class CaseInsensitiveDict(jaraco.collections.KeyTransformingDict):

    """A case-insensitive dict subclass.

    Each key is changed on entry to title case.
    """

    @staticmethod
    def transform_key(key):
        """Return ``key`` in title case (or 'None' for a None key)."""
        if key is None:
            # TODO(#1830): why?
            return 'None'
        return key.title()


#   TEXT = <any OCTET except CTLs, but including LWS>
#
# "A CRLF is allowed in the definition of TEXT only as part of a header
# field continuation. It is expected that the folding LWS will be
# replaced with a single SP before interpretation of the TEXT value."
if str == bytes:
    header_translate_table = ''.join([chr(i) for i in range(256)])
    header_translate_deletechars = ''.join(
        [chr(i) for i in range(32)]) + chr(127)
else:
    header_translate_table = None
    header_translate_deletechars = bytes(range(32)) + bytes([127])


class HeaderMap(CaseInsensitiveDict):

    """A dict subclass for HTTP request and response headers.

    Each key is changed on entry to str(key).title(). This allows headers
    to be case-insensitive and avoid duplicates.

    Values are header values (decoded according to :rfc:`2047` if necessary).
    """

    protocol = (1, 1)
    encodings = ['ISO-8859-1']

    # Someday, when http-bis is done, this will probably get dropped
    # since few servers, clients, or intermediaries do it. But until then,
    # we're going to obey the spec as is.
    # "Words of *TEXT MAY contain characters from character sets other than
    # ISO-8859-1 only when encoded according to the rules of RFC 2047."
    use_rfc_2047 = True

    def elements(self, key):
        """Return a sorted list of HeaderElements for the given header."""
        return header_elements(self.transform_key(key), self.get(key))

    def values(self, key):
        """Return a sorted list of HeaderElement.value for the given header."""
        return [e.value for e in self.elements(key)]

    def output(self):
        """Transform self into a list of (name, value) tuples."""
        return list(self.encode_header_items(self.items()))

    @classmethod
    def encode_header_items(cls, header_items):
        """
        Prepare the sequence of name, value tuples into a form suitable for
        transmitting on the wire for HTTP.
        """
        for k, v in header_items:
            # Values that are neither str nor bytes are stringified first.
            if not isinstance(v, str) and not isinstance(v, bytes):
                v = str(v)

            yield tuple(map(cls.encode_header_item, (k, v)))

    @classmethod
    def encode_header_item(cls, item):
        """Encode a str item to bytes and delete the characters listed in
        ``header_translate_deletechars``.
        """
        if isinstance(item, str):
            item = cls.encode(item)

        # See header_translate_* constants above.
        # Replace only if you really know what you're doing.
        return item.translate(
            header_translate_table, header_translate_deletechars)

    @classmethod
    def encode(cls, v):
        """Return the given header name or value, encoded for HTTP output.

        Each encoding in ``cls.encodings`` is tried in order. If none of
        them can represent ``v``, it is RFC 2047 encoded (base64, utf-8)
        when ``cls.protocol`` is (1, 1) and ``cls.use_rfc_2047`` is true;
        otherwise ValueError is raised.
        """
        for enc in cls.encodings:
            try:
                return v.encode(enc)
            except UnicodeEncodeError:
                continue

        if cls.protocol == (1, 1) and cls.use_rfc_2047:
            # Encode RFC-2047 TEXT
            # (e.g. u"\u8200" -> "=?utf-8?b?6IiA?=").
            # We do our own here instead of using the email module
            # because we never want to fold lines--folding has
            # been deprecated by the HTTP working group.
            v = b2a_base64(v.encode('utf-8'))
            return (b'=?utf-8?b?' + v.strip(b'\n') + b'?=')

        raise ValueError('Could not encode header part %r using '
                         'any of the encodings %r.' %
                         (v, cls.encodings))


class Host(object):

    """An internet address.

    name
        Should be the client's host name. If not available (because no DNS
        lookup is performed), the IP address should be used instead.

    """

    ip = '0.0.0.0'
    port = 80
    name = 'unknown.tld'

    def __init__(self, ip, port, name=None):
        """Store ip and port; ``name`` falls back to ``ip`` when None."""
        self.ip = ip
        self.port = port
        if name is None:
            name = ip
        self.name = name

    def __repr__(self):
        return 'httputil.Host(%r, %r, %r)' % (self.ip, self.port, self.name)