1from __future__ import with_statement, absolute_import, print_function 2 3from six import ( 4 binary_type, 5 text_type, 6 PY3, 7) 8 9from .decoders import * 10from .exceptions import * 11 12try: 13 from urlparse import parse_qs 14except ImportError: 15 from urllib.parse import parse_qs 16 17import os 18import re 19import sys 20import shutil 21import logging 22import tempfile 23from io import BytesIO 24from numbers import Number 25 26# Unique missing object. 27_missing = object() 28 29# States for the querystring parser. 30STATE_BEFORE_FIELD = 0 31STATE_FIELD_NAME = 1 32STATE_FIELD_DATA = 2 33 34# States for the multipart parser 35STATE_START = 0 36STATE_START_BOUNDARY = 1 37STATE_HEADER_FIELD_START = 2 38STATE_HEADER_FIELD = 3 39STATE_HEADER_VALUE_START = 4 40STATE_HEADER_VALUE = 5 41STATE_HEADER_VALUE_ALMOST_DONE = 6 42STATE_HEADERS_ALMOST_DONE = 7 43STATE_PART_DATA_START = 8 44STATE_PART_DATA = 9 45STATE_PART_DATA_END = 10 46STATE_END = 11 47 48STATES = [ 49 "START", 50 "START_BOUNDARY", "HEADER_FEILD_START", "HEADER_FIELD", "HEADER_VALUE_START", "HEADER_VALUE", 51 "HEADER_VALUE_ALMOST_DONE", "HEADRES_ALMOST_DONE", "PART_DATA_START", "PART_DATA", "PART_DATA_END", "END" 52] 53 54 55# Flags for the multipart parser. 56FLAG_PART_BOUNDARY = 1 57FLAG_LAST_BOUNDARY = 2 58 59# Get constants. Since iterating over a str on Python 2 gives you a 1-length 60# string, but iterating over a bytes object on Python 3 gives you an integer, 61# we need to save these constants. 62CR = b'\r'[0] 63LF = b'\n'[0] 64COLON = b':'[0] 65SPACE = b' '[0] 66HYPHEN = b'-'[0] 67AMPERSAND = b'&'[0] 68SEMICOLON = b';'[0] 69LOWER_A = b'a'[0] 70LOWER_Z = b'z'[0] 71NULL = b'\x00'[0] 72 73# Lower-casing a character is different, because of the difference between 74# str on Py2, and bytes on Py3. Same with getting the ordinal value of a byte, 75# and joining a list of bytes together. 76# These functions abstract that. 
77if PY3: # pragma: no cover 78 lower_char = lambda c: c | 0x20 79 ord_char = lambda c: c 80 join_bytes = lambda b: bytes(list(b)) 81else: # pragma: no cover 82 lower_char = lambda c: c.lower() 83 ord_char = lambda c: ord(c) 84 join_bytes = lambda b: b''.join(list(b)) 85 86# These are regexes for parsing header values. 87SPECIAL_CHARS = re.escape(b'()<>@,;:\\"/[]?={} \t') 88QUOTED_STR = br'"(?:\\.|[^"])*"' 89VALUE_STR = br'(?:[^' + SPECIAL_CHARS + br']+|' + QUOTED_STR + br')' 90OPTION_RE_STR = ( 91 br'(?:;|^)\s*([^' + SPECIAL_CHARS + br']+)\s*=\s*(' + VALUE_STR + br')' 92) 93OPTION_RE = re.compile(OPTION_RE_STR) 94QUOTE = b'"'[0] 95 96 97def parse_options_header(value): 98 """ 99 Parses a Content-Type header into a value in the following format: 100 (content_type, {parameters}) 101 """ 102 if not value: 103 return (b'', {}) 104 105 # If we are passed a string, we assume that it conforms to WSGI and does 106 # not contain any code point that's not in latin-1. 107 if isinstance(value, text_type): # pragma: no cover 108 value = value.encode('latin-1') 109 110 # If we have no options, return the string as-is. 111 if b';' not in value: 112 return (value.lower().strip(), {}) 113 114 # Split at the first semicolon, to get our value and then options. 115 ctype, rest = value.split(b';', 1) 116 options = {} 117 118 # Parse the options. 119 for match in OPTION_RE.finditer(rest): 120 key = match.group(1).lower() 121 value = match.group(2) 122 if value[0] == QUOTE and value[-1] == QUOTE: 123 # Unquote the value. 124 value = value[1:-1] 125 value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"') 126 127 # If the value is a filename, we need to fix a bug on IE6 that sends 128 # the full file path instead of the filename. 
129 if key == b'filename': 130 if value[1:3] == b':\\' or value[:2] == b'\\\\': 131 value = value.split(b'\\')[-1] 132 133 options[key] = value 134 135 return ctype, options 136 137 138class Field(object): 139 """A Field object represents a (parsed) form field. It represents a single 140 field with a corresponding name and value. 141 142 The name that a :class:`Field` will be instantiated with is the same name 143 that would be found in the following HTML:: 144 145 <input name="name_goes_here" type="text"/> 146 147 This class defines two methods, :meth:`on_data` and :meth:`on_end`, that 148 will be called when data is written to the Field, and when the Field is 149 finalized, respectively. 150 151 :param name: the name of the form field 152 """ 153 def __init__(self, name): 154 self._name = name 155 self._value = [] 156 157 # We cache the joined version of _value for speed. 158 self._cache = _missing 159 160 @classmethod 161 def from_value(klass, name, value): 162 """Create an instance of a :class:`Field`, and set the corresponding 163 value - either None or an actual value. This method will also 164 finalize the Field itself. 165 166 :param name: the name of the form field 167 :param value: the value of the form field - either a bytestring or 168 None 169 """ 170 171 f = klass(name) 172 if value is None: 173 f.set_none() 174 else: 175 f.write(value) 176 f.finalize() 177 return f 178 179 def write(self, data): 180 """Write some data into the form field. 181 182 :param data: a bytestring 183 """ 184 return self.on_data(data) 185 186 def on_data(self, data): 187 """This method is a callback that will be called whenever data is 188 written to the Field. 189 190 :param data: a bytestring 191 """ 192 self._value.append(data) 193 self._cache = _missing 194 return len(data) 195 196 def on_end(self): 197 """This method is called whenever the Field is finalized. 
198 """ 199 if self._cache is _missing: 200 self._cache = b''.join(self._value) 201 202 def finalize(self): 203 """Finalize the form field. 204 """ 205 self.on_end() 206 207 def close(self): 208 """Close the Field object. This will free any underlying cache. 209 """ 210 # Free our value array. 211 if self._cache is _missing: 212 self._cache = b''.join(self._value) 213 214 del self._value 215 216 def set_none(self): 217 """Some fields in a querystring can possibly have a value of None - for 218 example, the string "foo&bar=&baz=asdf" will have a field with the 219 name "foo" and value None, one with name "bar" and value "", and one 220 with name "baz" and value "asdf". Since the write() interface doesn't 221 support writing None, this function will set the field value to None. 222 """ 223 self._cache = None 224 225 @property 226 def field_name(self): 227 """This property returns the name of the field.""" 228 return self._name 229 230 @property 231 def value(self): 232 """This property returns the value of the form field.""" 233 if self._cache is _missing: 234 self._cache = b''.join(self._value) 235 236 return self._cache 237 238 def __eq__(self, other): 239 if isinstance(other, Field): 240 return ( 241 self.field_name == other.field_name and 242 self.value == other.value 243 ) 244 else: 245 return NotImplemented 246 247 def __repr__(self): 248 if len(self.value) > 97: 249 # We get the repr, and then insert three dots before the final 250 # quote. 251 v = repr(self.value[:97])[:-1] + "...'" 252 else: 253 v = repr(self.value) 254 255 return "%s(field_name=%r, value=%s)" % ( 256 self.__class__.__name__, 257 self.field_name, 258 v 259 ) 260 261 262class File(object): 263 """This class represents an uploaded file. It handles writing file data to 264 either an in-memory file or a temporary file on-disk, if the optional 265 threshold is passed. 266 267 There are some options that can be passed to the File to change behavior 268 of the class. 
Valid options are as follows: 269 270 .. list-table:: 271 :widths: 15 5 5 30 272 :header-rows: 1 273 274 * - Name 275 - Type 276 - Default 277 - Description 278 * - UPLOAD_DIR 279 - `str` 280 - None 281 - The directory to store uploaded files in. If this is None, a 282 temporary file will be created in the system's standard location. 283 * - UPLOAD_DELETE_TMP 284 - `bool` 285 - True 286 - Delete automatically created TMP file 287 * - UPLOAD_KEEP_FILENAME 288 - `bool` 289 - False 290 - Whether or not to keep the filename of the uploaded file. If True, 291 then the filename will be converted to a safe representation (e.g. 292 by removing any invalid path segments), and then saved with the 293 same name). Otherwise, a temporary name will be used. 294 * - UPLOAD_KEEP_EXTENSIONS 295 - `bool` 296 - False 297 - Whether or not to keep the uploaded file's extension. If False, the 298 file will be saved with the default temporary extension (usually 299 ".tmp"). Otherwise, the file's extension will be maintained. Note 300 that this will properly combine with the UPLOAD_KEEP_FILENAME 301 setting. 302 * - MAX_MEMORY_FILE_SIZE 303 - `int` 304 - 1 MiB 305 - The maximum number of bytes of a File to keep in memory. By 306 default, the contents of a File are kept into memory until a certain 307 limit is reached, after which the contents of the File are written 308 to a temporary file. This behavior can be disabled by setting this 309 value to an appropriately large value (or, for example, infinity, 310 such as `float('inf')`. 311 312 :param file_name: The name of the file that this :class:`File` represents 313 314 :param field_name: The field name that uploaded this file. Note that this 315 can be None, if, for example, the file was uploaded 316 with Content-Type application/octet-stream 317 318 :param config: The configuration for this File. See above for valid 319 configuration keys and their corresponding values. 
320 """ 321 def __init__(self, file_name, field_name=None, config={}): 322 # Save configuration, set other variables default. 323 self.logger = logging.getLogger(__name__) 324 self._config = config 325 self._in_memory = True 326 self._bytes_written = 0 327 self._fileobj = BytesIO() 328 329 # Save the provided field/file name. 330 self._field_name = field_name 331 self._file_name = file_name 332 333 # Our actual file name is None by default, since, depending on our 334 # config, we may not actually use the provided name. 335 self._actual_file_name = None 336 337 # Split the extension from the filename. 338 if file_name is not None: 339 base, ext = os.path.splitext(file_name) 340 self._file_base = base 341 self._ext = ext 342 343 @property 344 def field_name(self): 345 """The form field associated with this file. May be None if there isn't 346 one, for example when we have an application/octet-stream upload. 347 """ 348 return self._field_name 349 350 @property 351 def file_name(self): 352 """The file name given in the upload request. 353 """ 354 return self._file_name 355 356 @property 357 def actual_file_name(self): 358 """The file name that this file is saved as. Will be None if it's not 359 currently saved on disk. 360 """ 361 return self._actual_file_name 362 363 @property 364 def file_object(self): 365 """The file object that we're currently writing to. Note that this 366 will either be an instance of a :class:`io.BytesIO`, or a regular file 367 object. 368 """ 369 return self._fileobj 370 371 @property 372 def size(self): 373 """The total size of this file, counted as the number of bytes that 374 currently have been written to the file. 375 """ 376 return self._bytes_written 377 378 @property 379 def in_memory(self): 380 """A boolean representing whether or not this file object is currently 381 stored in-memory or on-disk. 382 """ 383 return self._in_memory 384 385 def flush_to_disk(self): 386 """If the file is already on-disk, do nothing. 
Otherwise, copy from 387 the in-memory buffer to a disk file, and then reassign our internal 388 file object to this new disk file. 389 390 Note that if you attempt to flush a file that is already on-disk, a 391 warning will be logged to this module's logger. 392 """ 393 if not self._in_memory: 394 self.logger.warning( 395 "Trying to flush to disk when we're not in memory" 396 ) 397 return 398 399 # Go back to the start of our file. 400 self._fileobj.seek(0) 401 402 # Open a new file. 403 new_file = self._get_disk_file() 404 405 # Copy the file objects. 406 shutil.copyfileobj(self._fileobj, new_file) 407 408 # Seek to the new position in our new file. 409 new_file.seek(self._bytes_written) 410 411 # Reassign the fileobject. 412 old_fileobj = self._fileobj 413 self._fileobj = new_file 414 415 # We're no longer in memory. 416 self._in_memory = False 417 418 # Close the old file object. 419 old_fileobj.close() 420 421 def _get_disk_file(self): 422 """This function is responsible for getting a file object on-disk for us. 423 """ 424 self.logger.info("Opening a file on disk") 425 426 file_dir = self._config.get('UPLOAD_DIR') 427 keep_filename = self._config.get('UPLOAD_KEEP_FILENAME', False) 428 keep_extensions = self._config.get('UPLOAD_KEEP_EXTENSIONS', False) 429 delete_tmp = self._config.get('UPLOAD_DELETE_TMP', True) 430 431 # If we have a directory and are to keep the filename... 432 if file_dir is not None and keep_filename: 433 self.logger.info("Saving with filename in: %r", file_dir) 434 435 # Build our filename. 436 # TODO: what happens if we don't have a filename? 
437 fname = self._file_base 438 if keep_extensions: 439 fname = fname + self._ext 440 441 path = os.path.join(file_dir, fname) 442 try: 443 self.logger.info("Opening file: %r", path) 444 tmp_file = open(path, 'w+b') 445 except (IOError, OSError) as e: 446 tmp_file = None 447 448 self.logger.exception("Error opening temporary file") 449 raise FileError("Error opening temporary file: %r" % path) 450 else: 451 # Build options array. 452 # Note that on Python 3, tempfile doesn't support byte names. We 453 # encode our paths using the default filesystem encoding. 454 options = {} 455 if keep_extensions: 456 ext = self._ext 457 if isinstance(ext, binary_type): 458 ext = ext.decode(sys.getfilesystemencoding()) 459 460 options['suffix'] = ext 461 if file_dir is not None: 462 d = file_dir 463 if isinstance(d, binary_type): 464 d = d.decode(sys.getfilesystemencoding()) 465 466 options['dir'] = d 467 options['delete'] = delete_tmp 468 469 # Create a temporary (named) file with the appropriate settings. 470 self.logger.info("Creating a temporary file with options: %r", 471 options) 472 try: 473 tmp_file = tempfile.NamedTemporaryFile(**options) 474 except (IOError, OSError): 475 self.logger.exception("Error creating named temporary file") 476 raise FileError("Error creating named temporary file") 477 478 fname = tmp_file.name 479 480 # Encode filename as bytes. 481 if isinstance(fname, text_type): 482 fname = fname.encode(sys.getfilesystemencoding()) 483 484 self._actual_file_name = fname 485 return tmp_file 486 487 def write(self, data): 488 """Write some data to the File. 489 490 :param data: a bytestring 491 """ 492 return self.on_data(data) 493 494 def on_data(self, data): 495 """This method is a callback that will be called whenever data is 496 written to the File. 
497 498 :param data: a bytestring 499 """ 500 pos = self._fileobj.tell() 501 bwritten = self._fileobj.write(data) 502 # true file objects write returns None 503 if bwritten is None: 504 bwritten = self._fileobj.tell() - pos 505 506 # If the bytes written isn't the same as the length, just return. 507 if bwritten != len(data): 508 self.logger.warning("bwritten != len(data) (%d != %d)", bwritten, 509 len(data)) 510 return bwritten 511 512 # Keep track of how many bytes we've written. 513 self._bytes_written += bwritten 514 515 # If we're in-memory and are over our limit, we create a file. 516 if (self._in_memory and 517 self._config.get('MAX_MEMORY_FILE_SIZE') is not None and 518 (self._bytes_written > 519 self._config.get('MAX_MEMORY_FILE_SIZE'))): 520 self.logger.info("Flushing to disk") 521 self.flush_to_disk() 522 523 # Return the number of bytes written. 524 return bwritten 525 526 def on_end(self): 527 """This method is called whenever the Field is finalized. 528 """ 529 # Flush the underlying file object 530 self._fileobj.flush() 531 532 def finalize(self): 533 """Finalize the form file. This will not close the underlying file, 534 but simply signal that we are finished writing to the File. 535 """ 536 self.on_end() 537 538 def close(self): 539 """Close the File object. This will actually close the underlying 540 file object (whether it's a :class:`io.BytesIO` or an actual file 541 object). 542 """ 543 self._fileobj.close() 544 545 def __repr__(self): 546 return "%s(file_name=%r, field_name=%r)" % ( 547 self.__class__.__name__, 548 self.file_name, 549 self.field_name 550 ) 551 552 553class BaseParser(object): 554 """This class is the base class for all parsers. It contains the logic for 555 calling and adding callbacks. 556 557 A callback can be one of two different forms. "Notification callbacks" are 558 callbacks that are called when something happens - for example, when a new 559 part of a multipart message is encountered by the parser. 
"Data callbacks" 560 are called when we get some sort of data - for example, part of the body of 561 a multipart chunk. Notification callbacks are called with no parameters, 562 whereas data callbacks are called with three, as follows:: 563 564 data_callback(data, start, end) 565 566 The "data" parameter is a bytestring (i.e. "foo" on Python 2, or b"foo" on 567 Python 3). "start" and "end" are integer indexes into the "data" string 568 that represent the data of interest. Thus, in a data callback, the slice 569 `data[start:end]` represents the data that the callback is "interested in". 570 The callback is not passed a copy of the data, since copying severely hurts 571 performance. 572 """ 573 def __init__(self): 574 self.logger = logging.getLogger(__name__) 575 576 def callback(self, name, data=None, start=None, end=None): 577 """This function calls a provided callback with some data. If the 578 callback is not set, will do nothing. 579 580 :param name: The name of the callback to call (as a string). 581 582 :param data: Data to pass to the callback. If None, then it is 583 assumed that the callback is a notification callback, 584 and no parameters are given. 585 586 :param end: An integer that is passed to the data callback. 587 588 :param start: An integer that is passed to the data callback. 589 """ 590 name = "on_" + name 591 func = self.callbacks.get(name) 592 if func is None: 593 return 594 595 # Depending on whether we're given a buffer... 596 if data is not None: 597 # Don't do anything if we have start == end. 598 if start is not None and start == end: 599 return 600 601 self.logger.debug("Calling %s with data[%d:%d]", name, start, end) 602 func(data, start, end) 603 else: 604 self.logger.debug("Calling %s with no data", name) 605 func() 606 607 def set_callback(self, name, new_func): 608 """Update the function for a callback. Removes from the callbacks dict 609 if new_func is None. 610 611 :param name: The name of the callback to call (as a string). 
612 613 :param new_func: The new function for the callback. If None, then the 614 callback will be removed (with no error if it does not 615 exist). 616 """ 617 if new_func is None: 618 self.callbacks.pop('on_' + name, None) 619 else: 620 self.callbacks['on_' + name] = new_func 621 622 def close(self): 623 pass # pragma: no cover 624 625 def finalize(self): 626 pass # pragma: no cover 627 628 def __repr__(self): 629 return "%s()" % self.__class__.__name__ 630 631 632class OctetStreamParser(BaseParser): 633 """This parser parses an octet-stream request body and calls callbacks when 634 incoming data is received. Callbacks are as follows: 635 636 .. list-table:: 637 :widths: 15 10 30 638 :header-rows: 1 639 640 * - Callback Name 641 - Parameters 642 - Description 643 * - on_start 644 - None 645 - Called when the first data is parsed. 646 * - on_data 647 - data, start, end 648 - Called for each data chunk that is parsed. 649 * - on_end 650 - None 651 - Called when the parser is finished parsing all data. 652 653 :param callbacks: A dictionary of callbacks. See the documentation for 654 :class:`BaseParser`. 655 656 :param max_size: The maximum size of body to parse. Defaults to infinity - 657 i.e. unbounded. 658 """ 659 def __init__(self, callbacks={}, max_size=float('inf')): 660 super(OctetStreamParser, self).__init__() 661 self.callbacks = callbacks 662 self._started = False 663 664 if not isinstance(max_size, Number) or max_size < 1: 665 raise ValueError("max_size must be a positive number, not %r" % 666 max_size) 667 self.max_size = max_size 668 self._current_size = 0 669 670 def write(self, data): 671 """Write some data to the parser, which will perform size verification, 672 and then pass the data to the underlying callback. 673 674 :param data: a bytestring 675 """ 676 if not self._started: 677 self.callback('start') 678 self._started = True 679 680 # Truncate data length. 
681 data_len = len(data) 682 if (self._current_size + data_len) > self.max_size: 683 # We truncate the length of data that we are to process. 684 new_size = int(self.max_size - self._current_size) 685 self.logger.warning("Current size is %d (max %d), so truncating " 686 "data length from %d to %d", 687 self._current_size, self.max_size, data_len, 688 new_size) 689 data_len = new_size 690 691 # Increment size, then callback, in case there's an exception. 692 self._current_size += data_len 693 self.callback('data', data, 0, data_len) 694 return data_len 695 696 def finalize(self): 697 """Finalize this parser, which signals to that we are finished parsing, 698 and sends the on_end callback. 699 """ 700 self.callback('end') 701 702 def __repr__(self): 703 return "%s()" % self.__class__.__name__ 704 705 706class QuerystringParser(BaseParser): 707 """This is a streaming querystring parser. It will consume data, and call 708 the callbacks given when it has data. 709 710 .. list-table:: 711 :widths: 15 10 30 712 :header-rows: 1 713 714 * - Callback Name 715 - Parameters 716 - Description 717 * - on_field_start 718 - None 719 - Called when a new field is encountered. 720 * - on_field_name 721 - data, start, end 722 - Called when a portion of a field's name is encountered. 723 * - on_field_data 724 - data, start, end 725 - Called when a portion of a field's data is encountered. 726 * - on_field_end 727 - None 728 - Called when the end of a field is encountered. 729 * - on_end 730 - None 731 - Called when the parser is finished parsing all data. 732 733 :param callbacks: A dictionary of callbacks. See the documentation for 734 :class:`BaseParser`. 735 736 :param strict_parsing: Whether or not to parse the body strictly. Defaults 737 to False. If this is set to True, then the behavior 738 of the parser changes as the following: if a field 739 has a value with an equal sign (e.g. "foo=bar", or 740 "foo="), it is always included. If a field has no 741 equals sign (e.g. 
"...&name&..."), it will be 742 treated as an error if 'strict_parsing' is True, 743 otherwise included. If an error is encountered, 744 then a 745 :class:`multipart.exceptions.QuerystringParseError` 746 will be raised. 747 748 :param max_size: The maximum size of body to parse. Defaults to infinity - 749 i.e. unbounded. 750 """ 751 def __init__(self, callbacks={}, strict_parsing=False, 752 max_size=float('inf')): 753 super(QuerystringParser, self).__init__() 754 self.state = STATE_BEFORE_FIELD 755 self._found_sep = False 756 757 self.callbacks = callbacks 758 759 # Max-size stuff 760 if not isinstance(max_size, Number) or max_size < 1: 761 raise ValueError("max_size must be a positive number, not %r" % 762 max_size) 763 self.max_size = max_size 764 self._current_size = 0 765 766 # Should parsing be strict? 767 self.strict_parsing = strict_parsing 768 769 def write(self, data): 770 """Write some data to the parser, which will perform size verification, 771 parse into either a field name or value, and then pass the 772 corresponding data to the underlying callback. If an error is 773 encountered while parsing, a QuerystringParseError will be raised. The 774 "offset" attribute of the raised exception will be set to the offset in 775 the input data chunk (NOT the overall stream) that caused the error. 776 777 :param data: a bytestring 778 """ 779 # Handle sizing. 780 data_len = len(data) 781 if (self._current_size + data_len) > self.max_size: 782 # We truncate the length of data that we are to process. 
783 new_size = int(self.max_size - self._current_size) 784 self.logger.warning("Current size is %d (max %d), so truncating " 785 "data length from %d to %d", 786 self._current_size, self.max_size, data_len, 787 new_size) 788 data_len = new_size 789 790 l = 0 791 try: 792 l = self._internal_write(data, data_len) 793 finally: 794 self._current_size += l 795 796 return l 797 798 def _internal_write(self, data, length): 799 state = self.state 800 strict_parsing = self.strict_parsing 801 found_sep = self._found_sep 802 803 i = 0 804 while i < length: 805 ch = data[i] 806 807 # Depending on our state... 808 if state == STATE_BEFORE_FIELD: 809 # If the 'found_sep' flag is set, we've already encountered 810 # and skipped a single seperator. If so, we check our strict 811 # parsing flag and decide what to do. Otherwise, we haven't 812 # yet reached a seperator, and thus, if we do, we need to skip 813 # it as it will be the boundary between fields that's supposed 814 # to be there. 815 if ch == AMPERSAND or ch == SEMICOLON: 816 if found_sep: 817 # If we're parsing strictly, we disallow blank chunks. 818 if strict_parsing: 819 e = QuerystringParseError( 820 "Skipping duplicate ampersand/semicolon at " 821 "%d" % i 822 ) 823 e.offset = i 824 raise e 825 else: 826 self.logger.debug("Skipping duplicate ampersand/" 827 "semicolon at %d", i) 828 else: 829 # This case is when we're skipping the (first) 830 # seperator between fields, so we just set our flag 831 # and continue on. 832 found_sep = True 833 else: 834 # Emit a field-start event, and go to that state. Also, 835 # reset the "found_sep" flag, for the next time we get to 836 # this state. 837 self.callback('field_start') 838 i -= 1 839 state = STATE_FIELD_NAME 840 found_sep = False 841 842 elif state == STATE_FIELD_NAME: 843 # Try and find a seperator - we ensure that, if we do, we only 844 # look for the equal sign before it. 
845 sep_pos = data.find(b'&', i) 846 if sep_pos == -1: 847 sep_pos = data.find(b';', i) 848 849 # See if we can find an equals sign in the remaining data. If 850 # so, we can immedately emit the field name and jump to the 851 # data state. 852 if sep_pos != -1: 853 equals_pos = data.find(b'=', i, sep_pos) 854 else: 855 equals_pos = data.find(b'=', i) 856 857 if equals_pos != -1: 858 # Emit this name. 859 self.callback('field_name', data, i, equals_pos) 860 861 # Jump i to this position. Note that it will then have 1 862 # added to it below, which means the next iteration of this 863 # loop will inspect the character after the equals sign. 864 i = equals_pos 865 state = STATE_FIELD_DATA 866 else: 867 # No equals sign found. 868 if not strict_parsing: 869 # See also comments in the STATE_FIELD_DATA case below. 870 # If we found the seperator, we emit the name and just 871 # end - there's no data callback at all (not even with 872 # a blank value). 873 if sep_pos != -1: 874 self.callback('field_name', data, i, sep_pos) 875 self.callback('field_end') 876 877 i = sep_pos - 1 878 state = STATE_BEFORE_FIELD 879 else: 880 # Otherwise, no seperator in this block, so the 881 # rest of this chunk must be a name. 882 self.callback('field_name', data, i, length) 883 i = length 884 885 else: 886 # We're parsing strictly. If we find a seperator, 887 # this is an error - we require an equals sign. 888 if sep_pos != -1: 889 e = QuerystringParseError( 890 "When strict_parsing is True, we require an " 891 "equals sign in all field chunks. Did not " 892 "find one in the chunk that starts at %d" % 893 (i,) 894 ) 895 e.offset = i 896 raise e 897 898 # No seperator in the rest of this chunk, so it's just 899 # a field name. 900 self.callback('field_name', data, i, length) 901 i = length 902 903 elif state == STATE_FIELD_DATA: 904 # Try finding either an ampersand or a semicolon after this 905 # position. 
906 sep_pos = data.find(b'&', i) 907 if sep_pos == -1: 908 sep_pos = data.find(b';', i) 909 910 # If we found it, callback this bit as data and then go back 911 # to expecting to find a field. 912 if sep_pos != -1: 913 self.callback('field_data', data, i, sep_pos) 914 self.callback('field_end') 915 916 # Note that we go to the seperator, which brings us to the 917 # "before field" state. This allows us to properly emit 918 # "field_start" events only when we actually have data for 919 # a field of some sort. 920 i = sep_pos - 1 921 state = STATE_BEFORE_FIELD 922 923 # Otherwise, emit the rest as data and finish. 924 else: 925 self.callback('field_data', data, i, length) 926 i = length 927 928 else: # pragma: no cover (error case) 929 msg = "Reached an unknown state %d at %d" % (state, i) 930 self.logger.warning(msg) 931 e = QuerystringParseError(msg) 932 e.offset = i 933 raise e 934 935 i += 1 936 937 self.state = state 938 self._found_sep = found_sep 939 return len(data) 940 941 def finalize(self): 942 """Finalize this parser, which signals to that we are finished parsing, 943 if we're still in the middle of a field, an on_field_end callback, and 944 then the on_end callback. 945 """ 946 # If we're currently in the middle of a field, we finish it. 947 if self.state == STATE_FIELD_DATA: 948 self.callback('field_end') 949 self.callback('end') 950 951 def __repr__(self): 952 return "%s(keep_blank_values=%r, strict_parsing=%r, max_size=%r)" % ( 953 self.__class__.__name__, 954 self.keep_blank_values, self.strict_parsing, self.max_size 955 ) 956 957 958class MultipartParser(BaseParser): 959 """This class is a streaming multipart/form-data parser. 960 961 .. list-table:: 962 :widths: 15 10 30 963 :header-rows: 1 964 965 * - Callback Name 966 - Parameters 967 - Description 968 * - on_part_begin 969 - None 970 - Called when a new part of the multipart message is encountered. 
971 * - on_part_data 972 - data, start, end 973 - Called when a portion of a part's data is encountered. 974 * - on_part_end 975 - None 976 - Called when the end of a part is reached. 977 * - on_header_begin 978 - None 979 - Called when we've found a new header in a part of a multipart 980 message 981 * - on_header_field 982 - data, start, end 983 - Called each time an additional portion of a header is read (i.e. the 984 part of the header that is before the colon; the "Foo" in 985 "Foo: Bar"). 986 * - on_header_value 987 - data, start, end 988 - Called when we get data for a header. 989 * - on_header_end 990 - None 991 - Called when the current header is finished - i.e. we've reached the 992 newline at the end of the header. 993 * - on_headers_finished 994 - None 995 - Called when all headers are finished, and before the part data 996 starts. 997 * - on_end 998 - None 999 - Called when the parser is finished parsing all data. 1000 1001 1002 :param boundary: The multipart boundary. This is required, and must match 1003 what is given in the HTTP request - usually in the 1004 Content-Type header. 1005 1006 :param callbacks: A dictionary of callbacks. See the documentation for 1007 :class:`BaseParser`. 1008 1009 :param max_size: The maximum size of body to parse. Defaults to infinity - 1010 i.e. unbounded. 1011 """ 1012 1013 def __init__(self, boundary, callbacks={}, max_size=float('inf')): 1014 # Initialize parser state. 1015 super(MultipartParser, self).__init__() 1016 self.state = STATE_START 1017 self.index = self.flags = 0 1018 1019 self.callbacks = callbacks 1020 1021 if not isinstance(max_size, Number) or max_size < 1: 1022 raise ValueError("max_size must be a positive number, not %r" % 1023 max_size) 1024 self.max_size = max_size 1025 self._current_size = 0 1026 1027 # Setup marks. These are used to track the state of data recieved. 
1028 self.marks = {} 1029 1030 # TODO: Actually use this rather than the dumb version we currently use 1031 # # Precompute the skip table for the Boyer-Moore-Horspool algorithm. 1032 # skip = [len(boundary) for x in range(256)] 1033 # for i in range(len(boundary) - 1): 1034 # skip[ord_char(boundary[i])] = len(boundary) - i - 1 1035 # 1036 # # We use a tuple since it's a constant, and marginally faster. 1037 # self.skip = tuple(skip) 1038 1039 # Save our boundary. 1040 if isinstance(boundary, text_type): # pragma: no cover 1041 boundary = boundary.encode('latin-1') 1042 self.boundary = b'\r\n--' + boundary 1043 1044 # Get a set of characters that belong to our boundary. 1045 self.boundary_chars = frozenset(self.boundary) 1046 1047 # We also create a lookbehind list. 1048 # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary + 1049 # "--\r\n" at the final boundary, and the length of '\r\n--' and 1050 # '--\r\n' is 8 bytes. 1051 self.lookbehind = [NULL for x in range(len(boundary) + 8)] 1052 1053 def write(self, data): 1054 """Write some data to the parser, which will perform size verification, 1055 and then parse the data into the appropriate location (e.g. header, 1056 data, etc.), and pass this on to the underlying callback. If an error 1057 is encountered, a MultipartParseError will be raised. The "offset" 1058 attribute on the raised exception will be set to the offset of the byte 1059 in the input chunk that caused the error. 1060 1061 :param data: a bytestring 1062 """ 1063 # Handle sizing. 1064 data_len = len(data) 1065 if (self._current_size + data_len) > self.max_size: 1066 # We truncate the length of data that we are to process. 
1067 new_size = int(self.max_size - self._current_size) 1068 self.logger.warning("Current size is %d (max %d), so truncating " 1069 "data length from %d to %d", 1070 self._current_size, self.max_size, data_len, 1071 new_size) 1072 data_len = new_size 1073 1074 l = 0 1075 try: 1076 l = self._internal_write(data, data_len) 1077 finally: 1078 self._current_size += l 1079 1080 return l 1081 1082 def _internal_write(self, data, length): 1083 # Get values from locals. 1084 boundary = self.boundary 1085 1086 # Get our state, flags and index. These are persisted between calls to 1087 # this function. 1088 state = self.state 1089 index = self.index 1090 flags = self.flags 1091 1092 # Our index defaults to 0. 1093 i = 0 1094 1095 # Set a mark. 1096 def set_mark(name): 1097 self.marks[name] = i 1098 1099 # Remove a mark. 1100 def delete_mark(name, reset=False): 1101 self.marks.pop(name, None) 1102 1103 # Helper function that makes calling a callback with data easier. The 1104 # 'remaining' parameter will callback from the marked value until the 1105 # end of the buffer, and reset the mark, instead of deleting it. This 1106 # is used at the end of the function to call our callbacks with any 1107 # remaining data in this chunk. 1108 def data_callback(name, remaining=False): 1109 marked_index = self.marks.get(name) 1110 if marked_index is None: 1111 return 1112 1113 # If we're getting remaining data, we ignore the current i value 1114 # and just call with the remaining data. 1115 if remaining: 1116 self.callback(name, data, marked_index, length) 1117 self.marks[name] = 0 1118 1119 # Otherwise, we call it from the mark to the current byte we're 1120 # processing. 1121 else: 1122 self.callback(name, data, marked_index, i) 1123 self.marks.pop(name, None) 1124 1125 # For each byte... 
        while i < length:
            c = data[i]

            if state == STATE_START:
                # Skip leading newlines
                if c == CR or c == LF:
                    i += 1
                    self.logger.debug("Skipping leading CR/LF at %d", i)
                    continue

                # index is used as an index into our boundary.  Set to 0.
                index = 0

                # Move to the next state, but decrement i so that we re-process
                # this character.
                state = STATE_START_BOUNDARY
                i -= 1

            elif state == STATE_START_BOUNDARY:
                # NOTE: self.boundary is b'\r\n--' + boundary, but the very
                # first delimiter in a message is not preceded by CRLF; that
                # is why matching below starts at boundary offset `index + 2`
                # and why the terminating CRLF checks use len(boundary) - 2.

                # Check to ensure that the last 2 characters in our boundary
                # are CRLF.
                if index == len(boundary) - 2:
                    if c != CR:
                        # Error!
                        msg = "Did not find CR at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    index += 1

                elif index == len(boundary) - 2 + 1:
                    if c != LF:
                        msg = "Did not find LF at end of boundary (%d)" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # The index is now used for indexing into our boundary.
                    index = 0

                    # Callback for the start of a part.
                    self.callback('part_begin')

                    # Move to the next character and state.
                    state = STATE_HEADER_FIELD_START

                else:
                    # Check to ensure our boundary matches
                    if c != boundary[index + 2]:
                        msg = "Did not find boundary character %r at index " \
                              "%d" % (c, index + 2)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Increment index into boundary and continue.
                    index += 1

            elif state == STATE_HEADER_FIELD_START:
                # Mark the start of a header field here, reset the index, and
                # continue parsing our header field.
                index = 0

                # Set a mark of our header field.
                set_mark('header_field')

                # Move to parsing header fields.
                state = STATE_HEADER_FIELD
                i -= 1

            elif state == STATE_HEADER_FIELD:
                # If we've reached a CR at the beginning of a header, it means
                # that we've reached the second of 2 newlines, and so there are
                # no more headers to parse.
                if c == CR:
                    delete_mark('header_field')
                    state = STATE_HEADERS_ALMOST_DONE
                    i += 1
                    continue

                # Increment our index in the header.
                index += 1

                # Do nothing if we encounter a hyphen.
                if c == HYPHEN:
                    pass

                # If we've reached a colon, we're done with this header.
                elif c == COLON:
                    # A 0-length header is an error.
                    if index == 1:
                        msg = "Found 0-length header at %d" % (i,)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

                    # Call our callback with the header field.
                    data_callback('header_field')

                    # Move to parsing the header value.
                    state = STATE_HEADER_VALUE_START

                else:
                    # Lower-case this character, and ensure that it is in fact
                    # a valid letter.  If not, it's an error.
                    cl = lower_char(c)
                    if cl < LOWER_A or cl > LOWER_Z:
                        msg = "Found non-alphanumeric character %r in " \
                              "header at %d" % (c, i)
                        self.logger.warning(msg)
                        e = MultipartParseError(msg)
                        e.offset = i
                        raise e

            elif state == STATE_HEADER_VALUE_START:
                # Skip leading spaces.
                if c == SPACE:
                    i += 1
                    continue

                # Mark the start of the header value.
                set_mark('header_value')

                # Move to the header-value state, reprocessing this character.
                state = STATE_HEADER_VALUE
                i -= 1

            elif state == STATE_HEADER_VALUE:
                # If we've got a CR, we're nearly done our headers.  Otherwise,
                # we do nothing and just move past this character.
                if c == CR:
                    data_callback('header_value')
                    self.callback('header_end')
                    state = STATE_HEADER_VALUE_ALMOST_DONE

            elif state == STATE_HEADER_VALUE_ALMOST_DONE:
                # The last character should be a LF.  If not, it's an error.
                if c != LF:
                    msg = "Did not find LF character at end of header " \
                          "(found %r)" % (c,)
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                # Move back to the start of another header.  Note that if that
                # state detects ANOTHER newline, it'll trigger the end of our
                # headers.
                state = STATE_HEADER_FIELD_START

            elif state == STATE_HEADERS_ALMOST_DONE:
                # We're almost done our headers.  This is reached when we parse
                # a CR at the beginning of a header, so our next character
                # should be a LF, or it's an error.
                if c != LF:
                    msg = "Did not find LF at end of headers (found %r)" % (c,)
                    self.logger.warning(msg)
                    e = MultipartParseError(msg)
                    e.offset = i
                    raise e

                self.callback('headers_finished')
                state = STATE_PART_DATA_START

            elif state == STATE_PART_DATA_START:
                # Mark the start of our part data.
                set_mark('part_data')

                # Start processing part data, including this character.
                state = STATE_PART_DATA
                i -= 1

            elif state == STATE_PART_DATA:
                # We're processing our part data right now.  During this, we
                # need to efficiently search for our boundary, since any data
                # on any number of lines can be a part of the current data.
                # We use the Boyer-Moore-Horspool algorithm to efficiently
                # search through the remainder of the buffer looking for our
                # boundary.

                # Save the current value of our index.  We use this in case we
                # find part of a boundary, but it doesn't match fully.
                prev_index = index

                # Set up variables.
                boundary_length = len(boundary)
                boundary_end = boundary_length - 1
                data_length = length
                boundary_chars = self.boundary_chars

                # If our index is 0, we're starting a new part, so start our
                # search.
                if index == 0:
                    # Search forward until we either hit the end of our buffer,
                    # or reach a character that's in our boundary.
                    i += boundary_end
                    while i < data_length - 1 and data[i] not in boundary_chars:
                        i += boundary_length

                    # Reset i back the length of our boundary, which is the
                    # earliest possible location that could be our match (i.e.
                    # if we've just broken out of our loop since we saw the
                    # last character in our boundary)
                    i -= boundary_end
                    c = data[i]

                # Now, we have a couple of cases here.  If our index is before
                # the end of the boundary...
                if index < boundary_length:
                    # If the character matches...
                    if boundary[index] == c:
                        # If we found a match for our boundary, we send the
                        # existing data.
                        if index == 0:
                            data_callback('part_data')

                        # The current character matches, so continue!
                        index += 1
                    else:
                        index = 0

                # Our index is equal to the length of our boundary!
                elif index == boundary_length:
                    # First we increment it.
                    index += 1

                    # Now, if we've reached a newline, we need to set this as
                    # the potential end of our boundary.
                    if c == CR:
                        flags |= FLAG_PART_BOUNDARY

                    # Otherwise, if this is a hyphen, we might be at the last
                    # of all boundaries.
                    elif c == HYPHEN:
                        flags |= FLAG_LAST_BOUNDARY

                    # Otherwise, we reset our index, since this isn't either a
                    # newline or a hyphen.
                    else:
                        index = 0

                # Our index is right after the part boundary, which should be
                # a LF.
                elif index == boundary_length + 1:
                    # If we're at a part boundary (i.e. we've seen a CR
                    # character already)...
                    if flags & FLAG_PART_BOUNDARY:
                        # We need a LF character next.
                        if c == LF:
                            # Unset the part boundary flag.
                            flags &= (~FLAG_PART_BOUNDARY)

                            # Callback indicating that we've reached the end of
                            # a part, and are starting a new one.
                            self.callback('part_end')
                            self.callback('part_begin')

                            # Move to parsing new headers.
                            index = 0
                            state = STATE_HEADER_FIELD_START
                            i += 1
                            continue

                        # We didn't find an LF character, so no match.  Reset
                        # our index and clear our flag.
                        index = 0
                        flags &= (~FLAG_PART_BOUNDARY)

                    # Otherwise, if we're at the last boundary (i.e. we've
                    # seen a hyphen already)...
                    elif flags & FLAG_LAST_BOUNDARY:
                        # We need a second hyphen here.
                        if c == HYPHEN:
                            # Callback to end the current part, and then the
                            # message.
                            self.callback('part_end')
                            self.callback('end')
                            state = STATE_END
                        else:
                            # No match, so reset index.
                            index = 0

                # If we have an index, we need to keep this byte for later, in
                # case we can't match the full boundary.
                if index > 0:
                    self.lookbehind[index - 1] = c

                # Otherwise, our index is 0.  If the previous index is not, it
                # means we reset something, and we need to take the data we
                # thought was part of our boundary and send it along as actual
                # data.
                elif prev_index > 0:
                    # Callback to write the saved data.
                    lb_data = join_bytes(self.lookbehind)
                    self.callback('part_data', lb_data, 0, prev_index)

                    # Overwrite our previous index.
                    prev_index = 0

                    # Re-set our mark for part data.
                    set_mark('part_data')

                    # Re-consider the current character, since this could be
                    # the start of the boundary itself.
                    i -= 1

            elif state == STATE_END:
                # Do nothing and just consume a byte in the end state.
                if c not in (CR, LF):
                    self.logger.warning("Consuming a byte '0x%x' in the end state", c)

            else:  # pragma: no cover (error case)
                # We got into a strange state somehow!  Just stop processing.
                msg = "Reached an unknown state %d at %d" % (state, i)
                self.logger.warning(msg)
                e = MultipartParseError(msg)
                e.offset = i
                raise e

            # Move to the next byte.
            i += 1

        # We call our callbacks with any remaining data.  Note that we pass
        # the 'remaining' flag, which sets the mark back to 0 instead of
        # deleting it, if it's found.  This is because, if the mark is found
        # at this point, we assume that there's data for one of these things
        # that has been parsed, but not yet emitted.  And, as such, it implies
        # that we haven't yet reached the end of this 'thing'.  So, by setting
        # the mark to 0, we cause any data callbacks that take place in future
        # calls to this function to start from the beginning of that buffer.
        data_callback('header_field', True)
        data_callback('header_value', True)
        data_callback('part_data', True)

        # Persist the parsing state back onto the instance so the next call
        # to this function resumes where this chunk left off.
        self.state = state
        self.index = index
        self.flags = flags

        # Return our data length to indicate no errors, and that we processed
        # all of it.
        return length

    def finalize(self):
        """Finalize this parser, which signals that we are finished parsing.

        Note: It does not currently, but in the future, it will verify that we
        are in the final state of the parser (i.e. the end of the multipart
        message is well-formed), and, if not, throw an error.
        """
        # TODO: verify that we're in the state STATE_END, otherwise throw an
        # error or otherwise state that we're not finished parsing.
1483 pass 1484 1485 def __repr__(self): 1486 return "%s(boundary=%r)" % (self.__class__.__name__, self.boundary) 1487 1488 1489class FormParser(object): 1490 """This class is the all-in-one form parser. Given all the information 1491 necessary to parse a form, it will instantiate the correct parser, create 1492 the proper :class:`Field` and :class:`File` classes to store the data that 1493 is parsed, and call the two given callbacks with each field and file as 1494 they become available. 1495 1496 :param content_type: The Content-Type of the incoming request. This is 1497 used to select the appropriate parser. 1498 1499 :param on_field: The callback to call when a field has been parsed and is 1500 ready for usage. See above for parameters. 1501 1502 :param on_file: The callback to call when a file has been parsed and is 1503 ready for usage. See above for parameters. 1504 1505 :param on_end: An optional callback to call when all fields and files in a 1506 request has been parsed. Can be None. 1507 1508 :param boundary: If the request is a multipart/form-data request, this 1509 should be the boundary of the request, as given in the 1510 Content-Type header, as a bytestring. 1511 1512 :param file_name: If the request is of type application/octet-stream, then 1513 the body of the request will not contain any information 1514 about the uploaded file. In such cases, you can provide 1515 the file name of the uploaded file manually. 1516 1517 :param FileClass: The class to use for uploaded files. Defaults to 1518 :class:`File`, but you can provide your own class if you 1519 wish to customize behaviour. The class will be 1520 instantiated as FileClass(file_name, field_name), and it 1521 must provide the folllowing functions:: 1522 file_instance.write(data) 1523 file_instance.finalize() 1524 file_instance.close() 1525 1526 :param FieldClass: The class to use for uploaded fields. 
Defaults to 1527 :class:`Field`, but you can provide your own class if 1528 you wish to customize behaviour. The class will be 1529 instantiated as FieldClass(field_name), and it must 1530 provide the folllowing functions:: 1531 field_instance.write(data) 1532 field_instance.finalize() 1533 field_instance.close() 1534 1535 :param config: Configuration to use for this FormParser. The default 1536 values are taken from the DEFAULT_CONFIG value, and then 1537 any keys present in this dictionary will overwrite the 1538 default values. 1539 1540 """ 1541 #: This is the default configuration for our form parser. 1542 #: Note: all file sizes should be in bytes. 1543 DEFAULT_CONFIG = { 1544 'MAX_BODY_SIZE': float('inf'), 1545 'MAX_MEMORY_FILE_SIZE': 1 * 1024 * 1024, 1546 'UPLOAD_DIR': None, 1547 'UPLOAD_KEEP_FILENAME': False, 1548 'UPLOAD_KEEP_EXTENSIONS': False, 1549 1550 # Error on invalid Content-Transfer-Encoding? 1551 'UPLOAD_ERROR_ON_BAD_CTE': False, 1552 } 1553 1554 def __init__(self, content_type, on_field, on_file, on_end=None, 1555 boundary=None, file_name=None, FileClass=File, 1556 FieldClass=Field, config={}): 1557 1558 self.logger = logging.getLogger(__name__) 1559 1560 # Save variables. 1561 self.content_type = content_type 1562 self.boundary = boundary 1563 self.bytes_received = 0 1564 self.parser = None 1565 1566 # Save callbacks. 1567 self.on_field = on_field 1568 self.on_file = on_file 1569 self.on_end = on_end 1570 1571 # Save classes. 1572 self.FileClass = File 1573 self.FieldClass = Field 1574 1575 # Set configuration options. 1576 self.config = self.DEFAULT_CONFIG.copy() 1577 self.config.update(config) 1578 1579 # Depending on the Content-Type, we instantiate the correct parser. 
        if content_type == 'application/octet-stream':
            # Work around the lack of 'nonlocal' in Py2.
            # NOTE: this tiny state-holder class deliberately shadows the
            # builtin `vars` within this branch only.
            class vars(object):
                f = None

            def on_start():
                vars.f = FileClass(file_name, None, config=self.config)

            def on_data(data, start, end):
                vars.f.write(data[start:end])

            def on_end():
                # Finalize the file itself.
                vars.f.finalize()

                # Call our callback.
                on_file(vars.f)

                # Call the on-end callback.
                if self.on_end is not None:
                    self.on_end()

            callbacks = {
                'on_start': on_start,
                'on_data': on_data,
                'on_end': on_end,
            }

            # Instantiate an octet-stream parser
            parser = OctetStreamParser(callbacks,
                                       max_size=self.config['MAX_BODY_SIZE'])

        elif (content_type == 'application/x-www-form-urlencoded' or
                content_type == 'application/x-url-encoded'):

            # Accumulates chunks of the current field's name until its first
            # data byte (or its end) is seen.
            name_buffer = []

            class vars(object):
                f = None

            def on_field_start():
                pass

            def on_field_name(data, start, end):
                name_buffer.append(data[start:end])

            def on_field_data(data, start, end):
                if vars.f is None:
                    vars.f = FieldClass(b''.join(name_buffer))
                    del name_buffer[:]
                vars.f.write(data[start:end])

            def on_field_end():
                # Finalize and call callback.
                if vars.f is None:
                    # If we get here, it's because there was no field data.
                    # We create a field, set it to None, and then continue.
                    vars.f = FieldClass(b''.join(name_buffer))
                    del name_buffer[:]
                    vars.f.set_none()

                vars.f.finalize()
                on_field(vars.f)
                vars.f = None

            def on_end():
                if self.on_end is not None:
                    self.on_end()

            # Setup callbacks.
            callbacks = {
                'on_field_start': on_field_start,
                'on_field_name': on_field_name,
                'on_field_data': on_field_data,
                'on_field_end': on_field_end,
                'on_end': on_end,
            }

            # Instantiate parser.
            parser = QuerystringParser(
                callbacks=callbacks,
                max_size=self.config['MAX_BODY_SIZE']
            )

        elif content_type == 'multipart/form-data':
            if boundary is None:
                self.logger.error("No boundary given")
                raise FormParserError("No boundary given")

            # Accumulate header name/value fragments between the multipart
            # parser's header callbacks; `headers` maps complete header names
            # (bytes) to complete values (bytes) for the current part.
            header_name = []
            header_value = []
            headers = {}

            # No 'nonlocal' on Python 2 :-(
            class vars(object):
                f = None
                writer = None
                is_file = False

            def on_part_begin():
                pass

            def on_part_data(data, start, end):
                bytes_processed = vars.writer.write(data[start:end])
                # TODO: check for error here.
                return bytes_processed

            def on_part_end():
                vars.f.finalize()
                if vars.is_file:
                    on_file(vars.f)
                else:
                    on_field(vars.f)

            def on_header_field(data, start, end):
                header_name.append(data[start:end])

            def on_header_value(data, start, end):
                header_value.append(data[start:end])

            def on_header_end():
                headers[b''.join(header_name)] = b''.join(header_value)
                del header_name[:]
                del header_value[:]

            def on_headers_finished():
                # Reset the 'is file' flag.
                vars.is_file = False

                # Parse the content-disposition header.
                # TODO: handle mixed case (lookup is currently
                # case-sensitive, so e.g. b'content-disposition' is missed).
                content_disp = headers.get(b'Content-Disposition')
                disp, options = parse_options_header(content_disp)

                # Get the field and filename.
                field_name = options.get(b'name')
                file_name = options.get(b'filename')
                # TODO: check for errors

                # Create the proper class.
                if file_name is None:
                    vars.f = FieldClass(field_name)
                else:
                    vars.f = FileClass(file_name, field_name, config=self.config)
                    vars.is_file = True

                # Parse the given Content-Transfer-Encoding to determine what
                # we need to do with the incoming data.
                # TODO: check that we properly handle 8bit / 7bit encoding.
                transfer_encoding = headers.get(b'Content-Transfer-Encoding',
                                                b'7bit')

                if (transfer_encoding == b'binary' or
                        transfer_encoding == b'8bit' or
                        transfer_encoding == b'7bit'):
                    vars.writer = vars.f

                elif transfer_encoding == b'base64':
                    vars.writer = Base64Decoder(vars.f)

                elif transfer_encoding == b'quoted-printable':
                    vars.writer = QuotedPrintableDecoder(vars.f)

                else:
                    self.logger.warning("Unknown Content-Transfer-Encoding: "
                                        "%r", transfer_encoding)
                    if self.config['UPLOAD_ERROR_ON_BAD_CTE']:
                        raise FormParserError(
                            'Unknown Content-Transfer-Encoding "{0}"'.format(
                                transfer_encoding
                            )
                        )
                    else:
                        # If we aren't erroring, then we just treat this as an
                        # unencoded Content-Transfer-Encoding.
                        vars.writer = vars.f

            def on_end():
                vars.writer.finalize()
                if self.on_end is not None:
                    self.on_end()

            # These are our callbacks for the parser.
            callbacks = {
                'on_part_begin': on_part_begin,
                'on_part_data': on_part_data,
                'on_part_end': on_part_end,
                'on_header_field': on_header_field,
                'on_header_value': on_header_value,
                'on_header_end': on_header_end,
                'on_headers_finished': on_headers_finished,
                'on_end': on_end,
            }

            # Instantiate a multipart parser.
            parser = MultipartParser(boundary, callbacks,
                                     max_size=self.config['MAX_BODY_SIZE'])

        else:
            self.logger.warning("Unknown Content-Type: %r", content_type)
            raise FormParserError("Unknown Content-Type: {0}".format(
                content_type
            ))

        self.parser = parser

    def write(self, data):
        """Write some data. The parser will forward this to the appropriate
        underlying parser.

        :param data: a bytestring
        """
        self.bytes_received += len(data)
        # TODO: check the parser's return value for errors?
1794 return self.parser.write(data) 1795 1796 def finalize(self): 1797 """Finalize the parser.""" 1798 if self.parser is not None and hasattr(self.parser, 'finalize'): 1799 self.parser.finalize() 1800 1801 def close(self): 1802 """Close the parser.""" 1803 if self.parser is not None and hasattr(self.parser, 'close'): 1804 self.parser.close() 1805 1806 def __repr__(self): 1807 return "%s(content_type=%r, parser=%r)" % ( 1808 self.__class__.__name__, 1809 self.content_type, 1810 self.parser, 1811 ) 1812 1813 1814def create_form_parser(headers, on_field, on_file, trust_x_headers=False, 1815 config={}): 1816 """This function is a helper function to aid in creating a FormParser 1817 instances. Given a dictionary-like headers object, it will determine 1818 the correct information needed, instantiate a FormParser with the 1819 appropriate values and given callbacks, and then return the corresponding 1820 parser. 1821 1822 :param headers: A dictionary-like object of HTTP headers. The only 1823 required header is Content-Type. 1824 1825 :param on_field: Callback to call with each parsed field. 1826 1827 :param on_file: Callback to call with each parsed file. 1828 1829 :param trust_x_headers: Whether or not to trust information received from 1830 certain X-Headers - for example, the file name from 1831 X-File-Name. 1832 1833 :param config: Configuration variables to pass to the FormParser. 1834 """ 1835 content_type = headers.get('Content-Type') 1836 if content_type is None: 1837 logging.getLogger(__name__).warning("No Content-Type header given") 1838 raise ValueError("No Content-Type header given!") 1839 1840 # Boundaries are optional (the FormParser will raise if one is needed 1841 # but not given). 1842 content_type, params = parse_options_header(content_type) 1843 boundary = params.get(b'boundary') 1844 1845 # We need content_type to be a string, not a bytes object. 1846 content_type = content_type.decode('latin-1') 1847 1848 # File names are optional. 
1849 file_name = headers.get('X-File-Name') 1850 1851 # Instantiate a form parser. 1852 form_parser = FormParser(content_type, 1853 on_field, 1854 on_file, 1855 boundary=boundary, 1856 file_name=file_name, 1857 config=config) 1858 1859 # Return our parser. 1860 return form_parser 1861 1862 1863def parse_form(headers, input_stream, on_field, on_file, chunk_size=1048576, 1864 **kwargs): 1865 """This function is useful if you just want to parse a request body, 1866 without too much work. Pass it a dictionary-like object of the request's 1867 headers, and a file-like object for the input stream, along with two 1868 callbacks that will get called whenever a field or file is parsed. 1869 1870 :param headers: A dictionary-like object of HTTP headers. The only 1871 required header is Content-Type. 1872 1873 :param input_stream: A file-like object that represents the request body. 1874 The read() method must return bytestrings. 1875 1876 :param on_field: Callback to call with each parsed field. 1877 1878 :param on_file: Callback to call with each parsed file. 1879 1880 :param chunk_size: The maximum size to read from the input stream and write 1881 to the parser at one time. Defaults to 1 MiB. 1882 """ 1883 1884 # Create our form parser. 1885 parser = create_form_parser(headers, on_field, on_file) 1886 1887 # Read chunks of 100KiB and write to the parser, but never read more than 1888 # the given Content-Length, if any. 1889 content_length = headers.get('Content-Length') 1890 if content_length is not None: 1891 content_length = int(content_length) 1892 else: 1893 content_length = float('inf') 1894 bytes_read = 0 1895 1896 while True: 1897 # Read only up to the Content-Length given. 1898 max_readable = min(content_length - bytes_read, 1048576) 1899 buff = input_stream.read(max_readable) 1900 1901 # Write to the parser and update our length. 
1902 parser.write(buff) 1903 bytes_read += len(buff) 1904 1905 # If we get a buffer that's smaller than the size requested, or if we 1906 # have read up to our content length, we're done. 1907 if len(buff) != max_readable or bytes_read == content_length: 1908 break 1909 1910 # Tell our parser that we're done writing data. 1911 parser.finalize() 1912