1from __future__ import with_statement, absolute_import, print_function
2
3from six import (
4    binary_type,
5    text_type,
6    PY3,
7)
8
9from .decoders import *
10from .exceptions import *
11
12try:
13    from urlparse import parse_qs
14except ImportError:
15    from urllib.parse import parse_qs
16
17import os
18import re
19import sys
20import shutil
21import logging
22import tempfile
23from io import BytesIO
24from numbers import Number
25
26# Unique missing object.
27_missing = object()
28
29# States for the querystring parser.
30STATE_BEFORE_FIELD = 0
31STATE_FIELD_NAME   = 1
32STATE_FIELD_DATA   = 2
33
34# States for the multipart parser
35STATE_START                     = 0
36STATE_START_BOUNDARY            = 1
37STATE_HEADER_FIELD_START        = 2
38STATE_HEADER_FIELD              = 3
39STATE_HEADER_VALUE_START        = 4
40STATE_HEADER_VALUE              = 5
41STATE_HEADER_VALUE_ALMOST_DONE  = 6
42STATE_HEADERS_ALMOST_DONE       = 7
43STATE_PART_DATA_START           = 8
44STATE_PART_DATA                 = 9
45STATE_PART_DATA_END             = 10
46STATE_END                       = 11
47
48STATES = [
49    "START",
50    "START_BOUNDARY", "HEADER_FEILD_START", "HEADER_FIELD", "HEADER_VALUE_START", "HEADER_VALUE",
51    "HEADER_VALUE_ALMOST_DONE", "HEADRES_ALMOST_DONE", "PART_DATA_START", "PART_DATA", "PART_DATA_END", "END"
52]
53
54
55# Flags for the multipart parser.
56FLAG_PART_BOUNDARY              = 1
57FLAG_LAST_BOUNDARY              = 2
58
59# Get constants.  Since iterating over a str on Python 2 gives you a 1-length
60# string, but iterating over a bytes object on Python 3 gives you an integer,
61# we need to save these constants.
62CR = b'\r'[0]
63LF = b'\n'[0]
64COLON = b':'[0]
65SPACE = b' '[0]
66HYPHEN = b'-'[0]
67AMPERSAND = b'&'[0]
68SEMICOLON = b';'[0]
69LOWER_A = b'a'[0]
70LOWER_Z = b'z'[0]
71NULL = b'\x00'[0]
72
73# Lower-casing a character is different, because of the difference between
74# str on Py2, and bytes on Py3.  Same with getting the ordinal value of a byte,
75# and joining a list of bytes together.
76# These functions abstract that.
77if PY3:                         # pragma: no cover
78    lower_char = lambda c: c | 0x20
79    ord_char = lambda c: c
80    join_bytes = lambda b: bytes(list(b))
81else:                           # pragma: no cover
82    lower_char = lambda c: c.lower()
83    ord_char = lambda c: ord(c)
84    join_bytes = lambda b: b''.join(list(b))
85
86# These are regexes for parsing header values.
87SPECIAL_CHARS = re.escape(b'()<>@,;:\\"/[]?={} \t')
88QUOTED_STR = br'"(?:\\.|[^"])*"'
89VALUE_STR = br'(?:[^' + SPECIAL_CHARS + br']+|' + QUOTED_STR + br')'
90OPTION_RE_STR = (
91    br'(?:;|^)\s*([^' + SPECIAL_CHARS + br']+)\s*=\s*(' + VALUE_STR + br')'
92)
93OPTION_RE = re.compile(OPTION_RE_STR)
94QUOTE = b'"'[0]
95
96
97def parse_options_header(value):
98    """
99    Parses a Content-Type header into a value in the following format:
100        (content_type, {parameters})
101    """
102    if not value:
103        return (b'', {})
104
105    # If we are passed a string, we assume that it conforms to WSGI and does
106    # not contain any code point that's not in latin-1.
107    if isinstance(value, text_type):            # pragma: no cover
108        value = value.encode('latin-1')
109
110    # If we have no options, return the string as-is.
111    if b';' not in value:
112        return (value.lower().strip(), {})
113
114    # Split at the first semicolon, to get our value and then options.
115    ctype, rest = value.split(b';', 1)
116    options = {}
117
118    # Parse the options.
119    for match in OPTION_RE.finditer(rest):
120        key = match.group(1).lower()
121        value = match.group(2)
122        if value[0] == QUOTE and value[-1] == QUOTE:
123            # Unquote the value.
124            value = value[1:-1]
125            value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"')
126
127        # If the value is a filename, we need to fix a bug on IE6 that sends
128        # the full file path instead of the filename.
129        if key == b'filename':
130            if value[1:3] == b':\\' or value[:2] == b'\\\\':
131                value = value.split(b'\\')[-1]
132
133        options[key] = value
134
135    return ctype, options
136
137
138class Field(object):
139    """A Field object represents a (parsed) form field.  It represents a single
140    field with a corresponding name and value.
141
142    The name that a :class:`Field` will be instantiated with is the same name
143    that would be found in the following HTML::
144
145        <input name="name_goes_here" type="text"/>
146
147    This class defines two methods, :meth:`on_data` and :meth:`on_end`, that
148    will be called when data is written to the Field, and when the Field is
149    finalized, respectively.
150
151    :param name: the name of the form field
152    """
153    def __init__(self, name):
154        self._name = name
155        self._value = []
156
157        # We cache the joined version of _value for speed.
158        self._cache = _missing
159
160    @classmethod
161    def from_value(klass, name, value):
162        """Create an instance of a :class:`Field`, and set the corresponding
163        value - either None or an actual value.  This method will also
164        finalize the Field itself.
165
166        :param name: the name of the form field
167        :param value: the value of the form field - either a bytestring or
168                      None
169        """
170
171        f = klass(name)
172        if value is None:
173            f.set_none()
174        else:
175            f.write(value)
176        f.finalize()
177        return f
178
179    def write(self, data):
180        """Write some data into the form field.
181
182        :param data: a bytestring
183        """
184        return self.on_data(data)
185
186    def on_data(self, data):
187        """This method is a callback that will be called whenever data is
188        written to the Field.
189
190        :param data: a bytestring
191        """
192        self._value.append(data)
193        self._cache = _missing
194        return len(data)
195
196    def on_end(self):
197        """This method is called whenever the Field is finalized.
198        """
199        if self._cache is _missing:
200            self._cache = b''.join(self._value)
201
202    def finalize(self):
203        """Finalize the form field.
204        """
205        self.on_end()
206
207    def close(self):
208        """Close the Field object.  This will free any underlying cache.
209        """
210        # Free our value array.
211        if self._cache is _missing:
212            self._cache = b''.join(self._value)
213
214        del self._value
215
216    def set_none(self):
217        """Some fields in a querystring can possibly have a value of None - for
218        example, the string "foo&bar=&baz=asdf" will have a field with the
219        name "foo" and value None, one with name "bar" and value "", and one
220        with name "baz" and value "asdf".  Since the write() interface doesn't
221        support writing None, this function will set the field value to None.
222        """
223        self._cache = None
224
225    @property
226    def field_name(self):
227        """This property returns the name of the field."""
228        return self._name
229
230    @property
231    def value(self):
232        """This property returns the value of the form field."""
233        if self._cache is _missing:
234            self._cache = b''.join(self._value)
235
236        return self._cache
237
238    def __eq__(self, other):
239        if isinstance(other, Field):
240            return (
241                self.field_name == other.field_name and
242                self.value == other.value
243            )
244        else:
245            return NotImplemented
246
247    def __repr__(self):
248        if len(self.value) > 97:
249            # We get the repr, and then insert three dots before the final
250            # quote.
251            v = repr(self.value[:97])[:-1] + "...'"
252        else:
253            v = repr(self.value)
254
255        return "%s(field_name=%r, value=%s)" % (
256            self.__class__.__name__,
257            self.field_name,
258            v
259        )
260
261
262class File(object):
263    """This class represents an uploaded file.  It handles writing file data to
264    either an in-memory file or a temporary file on-disk, if the optional
265    threshold is passed.
266
267    There are some options that can be passed to the File to change behavior
268    of the class.  Valid options are as follows:
269
270    .. list-table::
271       :widths: 15 5 5 30
272       :header-rows: 1
273
274       * - Name
275         - Type
276         - Default
277         - Description
278       * - UPLOAD_DIR
279         - `str`
280         - None
281         - The directory to store uploaded files in.  If this is None, a
282           temporary file will be created in the system's standard location.
283       * - UPLOAD_DELETE_TMP
284         - `bool`
285         - True
286         - Delete automatically created TMP file
287       * - UPLOAD_KEEP_FILENAME
288         - `bool`
289         - False
290         - Whether or not to keep the filename of the uploaded file.  If True,
291           then the filename will be converted to a safe representation (e.g.
292           by removing any invalid path segments), and then saved with the
293           same name).  Otherwise, a temporary name will be used.
294       * - UPLOAD_KEEP_EXTENSIONS
295         - `bool`
296         - False
297         - Whether or not to keep the uploaded file's extension.  If False, the
298           file will be saved with the default temporary extension (usually
299           ".tmp").  Otherwise, the file's extension will be maintained.  Note
300           that this will properly combine with the UPLOAD_KEEP_FILENAME
301           setting.
302       * - MAX_MEMORY_FILE_SIZE
303         - `int`
304         - 1 MiB
305         - The maximum number of bytes of a File to keep in memory.  By
306           default, the contents of a File are kept into memory until a certain
307           limit is reached, after which the contents of the File are written
308           to a temporary file.  This behavior can be disabled by setting this
309           value to an appropriately large value (or, for example, infinity,
310           such as `float('inf')`.
311
312    :param file_name: The name of the file that this :class:`File` represents
313
314    :param field_name: The field name that uploaded this file.  Note that this
315                       can be None, if, for example, the file was uploaded
316                       with Content-Type application/octet-stream
317
318    :param config: The configuration for this File.  See above for valid
319                   configuration keys and their corresponding values.
320    """
321    def __init__(self, file_name, field_name=None, config={}):
322        # Save configuration, set other variables default.
323        self.logger = logging.getLogger(__name__)
324        self._config = config
325        self._in_memory = True
326        self._bytes_written = 0
327        self._fileobj = BytesIO()
328
329        # Save the provided field/file name.
330        self._field_name = field_name
331        self._file_name = file_name
332
333        # Our actual file name is None by default, since, depending on our
334        # config, we may not actually use the provided name.
335        self._actual_file_name = None
336
337        # Split the extension from the filename.
338        if file_name is not None:
339            base, ext = os.path.splitext(file_name)
340            self._file_base = base
341            self._ext = ext
342
343    @property
344    def field_name(self):
345        """The form field associated with this file.  May be None if there isn't
346        one, for example when we have an application/octet-stream upload.
347        """
348        return self._field_name
349
350    @property
351    def file_name(self):
352        """The file name given in the upload request.
353        """
354        return self._file_name
355
356    @property
357    def actual_file_name(self):
358        """The file name that this file is saved as.  Will be None if it's not
359        currently saved on disk.
360        """
361        return self._actual_file_name
362
363    @property
364    def file_object(self):
365        """The file object that we're currently writing to.  Note that this
366        will either be an instance of a :class:`io.BytesIO`, or a regular file
367        object.
368        """
369        return self._fileobj
370
371    @property
372    def size(self):
373        """The total size of this file, counted as the number of bytes that
374        currently have been written to the file.
375        """
376        return self._bytes_written
377
378    @property
379    def in_memory(self):
380        """A boolean representing whether or not this file object is currently
381        stored in-memory or on-disk.
382        """
383        return self._in_memory
384
385    def flush_to_disk(self):
386        """If the file is already on-disk, do nothing.  Otherwise, copy from
387        the in-memory buffer to a disk file, and then reassign our internal
388        file object to this new disk file.
389
390        Note that if you attempt to flush a file that is already on-disk, a
391        warning will be logged to this module's logger.
392        """
393        if not self._in_memory:
394            self.logger.warning(
395                "Trying to flush to disk when we're not in memory"
396            )
397            return
398
399        # Go back to the start of our file.
400        self._fileobj.seek(0)
401
402        # Open a new file.
403        new_file = self._get_disk_file()
404
405        # Copy the file objects.
406        shutil.copyfileobj(self._fileobj, new_file)
407
408        # Seek to the new position in our new file.
409        new_file.seek(self._bytes_written)
410
411        # Reassign the fileobject.
412        old_fileobj = self._fileobj
413        self._fileobj = new_file
414
415        # We're no longer in memory.
416        self._in_memory = False
417
418        # Close the old file object.
419        old_fileobj.close()
420
421    def _get_disk_file(self):
422        """This function is responsible for getting a file object on-disk for us.
423        """
424        self.logger.info("Opening a file on disk")
425
426        file_dir = self._config.get('UPLOAD_DIR')
427        keep_filename = self._config.get('UPLOAD_KEEP_FILENAME', False)
428        keep_extensions = self._config.get('UPLOAD_KEEP_EXTENSIONS', False)
429        delete_tmp = self._config.get('UPLOAD_DELETE_TMP', True)
430
431        # If we have a directory and are to keep the filename...
432        if file_dir is not None and keep_filename:
433            self.logger.info("Saving with filename in: %r", file_dir)
434
435            # Build our filename.
436            # TODO: what happens if we don't have a filename?
437            fname = self._file_base
438            if keep_extensions:
439                fname = fname + self._ext
440
441            path = os.path.join(file_dir, fname)
442            try:
443                self.logger.info("Opening file: %r", path)
444                tmp_file = open(path, 'w+b')
445            except (IOError, OSError) as e:
446                tmp_file = None
447
448                self.logger.exception("Error opening temporary file")
449                raise FileError("Error opening temporary file: %r" % path)
450        else:
451            # Build options array.
452            # Note that on Python 3, tempfile doesn't support byte names.  We
453            # encode our paths using the default filesystem encoding.
454            options = {}
455            if keep_extensions:
456                ext = self._ext
457                if isinstance(ext, binary_type):
458                    ext = ext.decode(sys.getfilesystemencoding())
459
460                options['suffix'] = ext
461            if file_dir is not None:
462                d = file_dir
463                if isinstance(d, binary_type):
464                    d = d.decode(sys.getfilesystemencoding())
465
466                options['dir'] = d
467            options['delete'] = delete_tmp
468
469            # Create a temporary (named) file with the appropriate settings.
470            self.logger.info("Creating a temporary file with options: %r",
471                             options)
472            try:
473                tmp_file = tempfile.NamedTemporaryFile(**options)
474            except (IOError, OSError):
475                self.logger.exception("Error creating named temporary file")
476                raise FileError("Error creating named temporary file")
477
478            fname = tmp_file.name
479
480            # Encode filename as bytes.
481            if isinstance(fname, text_type):
482                fname = fname.encode(sys.getfilesystemencoding())
483
484        self._actual_file_name = fname
485        return tmp_file
486
487    def write(self, data):
488        """Write some data to the File.
489
490        :param data: a bytestring
491        """
492        return self.on_data(data)
493
494    def on_data(self, data):
495        """This method is a callback that will be called whenever data is
496        written to the File.
497
498        :param data: a bytestring
499        """
500        pos = self._fileobj.tell()
501        bwritten = self._fileobj.write(data)
502        # true file objects write  returns None
503        if bwritten is None:
504            bwritten = self._fileobj.tell() - pos
505
506        # If the bytes written isn't the same as the length, just return.
507        if bwritten != len(data):
508            self.logger.warning("bwritten != len(data) (%d != %d)", bwritten,
509                                len(data))
510            return bwritten
511
512        # Keep track of how many bytes we've written.
513        self._bytes_written += bwritten
514
515        # If we're in-memory and are over our limit, we create a file.
516        if (self._in_memory and
517                self._config.get('MAX_MEMORY_FILE_SIZE') is not None and
518                (self._bytes_written >
519                 self._config.get('MAX_MEMORY_FILE_SIZE'))):
520            self.logger.info("Flushing to disk")
521            self.flush_to_disk()
522
523        # Return the number of bytes written.
524        return bwritten
525
526    def on_end(self):
527        """This method is called whenever the Field is finalized.
528        """
529        # Flush the underlying file object
530        self._fileobj.flush()
531
532    def finalize(self):
533        """Finalize the form file.  This will not close the underlying file,
534        but simply signal that we are finished writing to the File.
535        """
536        self.on_end()
537
538    def close(self):
539        """Close the File object.  This will actually close the underlying
540        file object (whether it's a :class:`io.BytesIO` or an actual file
541        object).
542        """
543        self._fileobj.close()
544
545    def __repr__(self):
546        return "%s(file_name=%r, field_name=%r)" % (
547            self.__class__.__name__,
548            self.file_name,
549            self.field_name
550        )
551
552
553class BaseParser(object):
554    """This class is the base class for all parsers.  It contains the logic for
555    calling and adding callbacks.
556
557    A callback can be one of two different forms.  "Notification callbacks" are
558    callbacks that are called when something happens - for example, when a new
559    part of a multipart message is encountered by the parser.  "Data callbacks"
560    are called when we get some sort of data - for example, part of the body of
561    a multipart chunk.  Notification callbacks are called with no parameters,
562    whereas data callbacks are called with three, as follows::
563
564        data_callback(data, start, end)
565
566    The "data" parameter is a bytestring (i.e. "foo" on Python 2, or b"foo" on
567    Python 3).  "start" and "end" are integer indexes into the "data" string
568    that represent the data of interest.  Thus, in a data callback, the slice
569    `data[start:end]` represents the data that the callback is "interested in".
570    The callback is not passed a copy of the data, since copying severely hurts
571    performance.
572    """
573    def __init__(self):
574        self.logger = logging.getLogger(__name__)
575
576    def callback(self, name, data=None, start=None, end=None):
577        """This function calls a provided callback with some data.  If the
578        callback is not set, will do nothing.
579
580        :param name: The name of the callback to call (as a string).
581
582        :param data: Data to pass to the callback.  If None, then it is
583                     assumed that the callback is a notification callback,
584                     and no parameters are given.
585
586        :param end: An integer that is passed to the data callback.
587
588        :param start: An integer that is passed to the data callback.
589        """
590        name = "on_" + name
591        func = self.callbacks.get(name)
592        if func is None:
593            return
594
595        # Depending on whether we're given a buffer...
596        if data is not None:
597            # Don't do anything if we have start == end.
598            if start is not None and start == end:
599                return
600
601            self.logger.debug("Calling %s with data[%d:%d]", name, start, end)
602            func(data, start, end)
603        else:
604            self.logger.debug("Calling %s with no data", name)
605            func()
606
607    def set_callback(self, name, new_func):
608        """Update the function for a callback.  Removes from the callbacks dict
609        if new_func is None.
610
611        :param name: The name of the callback to call (as a string).
612
613        :param new_func: The new function for the callback.  If None, then the
614                         callback will be removed (with no error if it does not
615                         exist).
616        """
617        if new_func is None:
618            self.callbacks.pop('on_' + name, None)
619        else:
620            self.callbacks['on_' + name] = new_func
621
622    def close(self):
623        pass                # pragma: no cover
624
625    def finalize(self):
626        pass                # pragma: no cover
627
628    def __repr__(self):
629        return "%s()" % self.__class__.__name__
630
631
632class OctetStreamParser(BaseParser):
633    """This parser parses an octet-stream request body and calls callbacks when
634    incoming data is received.  Callbacks are as follows:
635
636    .. list-table::
637       :widths: 15 10 30
638       :header-rows: 1
639
640       * - Callback Name
641         - Parameters
642         - Description
643       * - on_start
644         - None
645         - Called when the first data is parsed.
646       * - on_data
647         - data, start, end
648         - Called for each data chunk that is parsed.
649       * - on_end
650         - None
651         - Called when the parser is finished parsing all data.
652
653    :param callbacks: A dictionary of callbacks.  See the documentation for
654                      :class:`BaseParser`.
655
656    :param max_size: The maximum size of body to parse.  Defaults to infinity -
657                     i.e. unbounded.
658    """
659    def __init__(self, callbacks={}, max_size=float('inf')):
660        super(OctetStreamParser, self).__init__()
661        self.callbacks = callbacks
662        self._started = False
663
664        if not isinstance(max_size, Number) or max_size < 1:
665            raise ValueError("max_size must be a positive number, not %r" %
666                             max_size)
667        self.max_size = max_size
668        self._current_size = 0
669
670    def write(self, data):
671        """Write some data to the parser, which will perform size verification,
672        and then pass the data to the underlying callback.
673
674        :param data: a bytestring
675        """
676        if not self._started:
677            self.callback('start')
678            self._started = True
679
680        # Truncate data length.
681        data_len = len(data)
682        if (self._current_size + data_len) > self.max_size:
683            # We truncate the length of data that we are to process.
684            new_size = int(self.max_size - self._current_size)
685            self.logger.warning("Current size is %d (max %d), so truncating "
686                                "data length from %d to %d",
687                                self._current_size, self.max_size, data_len,
688                                new_size)
689            data_len = new_size
690
691        # Increment size, then callback, in case there's an exception.
692        self._current_size += data_len
693        self.callback('data', data, 0, data_len)
694        return data_len
695
696    def finalize(self):
697        """Finalize this parser, which signals to that we are finished parsing,
698        and sends the on_end callback.
699        """
700        self.callback('end')
701
702    def __repr__(self):
703        return "%s()" % self.__class__.__name__
704
705
706class QuerystringParser(BaseParser):
707    """This is a streaming querystring parser.  It will consume data, and call
708    the callbacks given when it has data.
709
710    .. list-table::
711       :widths: 15 10 30
712       :header-rows: 1
713
714       * - Callback Name
715         - Parameters
716         - Description
717       * - on_field_start
718         - None
719         - Called when a new field is encountered.
720       * - on_field_name
721         - data, start, end
722         - Called when a portion of a field's name is encountered.
723       * - on_field_data
724         - data, start, end
725         - Called when a portion of a field's data is encountered.
726       * - on_field_end
727         - None
728         - Called when the end of a field is encountered.
729       * - on_end
730         - None
731         - Called when the parser is finished parsing all data.
732
733    :param callbacks: A dictionary of callbacks.  See the documentation for
734                      :class:`BaseParser`.
735
736    :param strict_parsing: Whether or not to parse the body strictly.  Defaults
737                           to False.  If this is set to True, then the behavior
738                           of the parser changes as the following: if a field
739                           has a value with an equal sign (e.g. "foo=bar", or
740                           "foo="), it is always included.  If a field has no
741                           equals sign (e.g. "...&name&..."), it will be
742                           treated as an error if 'strict_parsing' is True,
743                           otherwise included.  If an error is encountered,
744                           then a
745                           :class:`multipart.exceptions.QuerystringParseError`
746                           will be raised.
747
748    :param max_size: The maximum size of body to parse.  Defaults to infinity -
749                     i.e. unbounded.
750    """
751    def __init__(self, callbacks={}, strict_parsing=False,
752                 max_size=float('inf')):
753        super(QuerystringParser, self).__init__()
754        self.state = STATE_BEFORE_FIELD
755        self._found_sep = False
756
757        self.callbacks = callbacks
758
759        # Max-size stuff
760        if not isinstance(max_size, Number) or max_size < 1:
761            raise ValueError("max_size must be a positive number, not %r" %
762                             max_size)
763        self.max_size = max_size
764        self._current_size = 0
765
766        # Should parsing be strict?
767        self.strict_parsing = strict_parsing
768
769    def write(self, data):
770        """Write some data to the parser, which will perform size verification,
771        parse into either a field name or value, and then pass the
772        corresponding data to the underlying callback.  If an error is
773        encountered while parsing, a QuerystringParseError will be raised.  The
774        "offset" attribute of the raised exception will be set to the offset in
775        the input data chunk (NOT the overall stream) that caused the error.
776
777        :param data: a bytestring
778        """
779        # Handle sizing.
780        data_len = len(data)
781        if (self._current_size + data_len) > self.max_size:
782            # We truncate the length of data that we are to process.
783            new_size = int(self.max_size - self._current_size)
784            self.logger.warning("Current size is %d (max %d), so truncating "
785                                "data length from %d to %d",
786                                self._current_size, self.max_size, data_len,
787                                new_size)
788            data_len = new_size
789
790        l = 0
791        try:
792            l = self._internal_write(data, data_len)
793        finally:
794            self._current_size += l
795
796        return l
797
798    def _internal_write(self, data, length):
799        state = self.state
800        strict_parsing = self.strict_parsing
801        found_sep = self._found_sep
802
803        i = 0
804        while i < length:
805            ch = data[i]
806
807            # Depending on our state...
808            if state == STATE_BEFORE_FIELD:
809                # If the 'found_sep' flag is set, we've already encountered
810                # and skipped a single seperator.  If so, we check our strict
811                # parsing flag and decide what to do.  Otherwise, we haven't
812                # yet reached a seperator, and thus, if we do, we need to skip
813                # it as it will be the boundary between fields that's supposed
814                # to be there.
815                if ch == AMPERSAND or ch == SEMICOLON:
816                    if found_sep:
817                        # If we're parsing strictly, we disallow blank chunks.
818                        if strict_parsing:
819                            e = QuerystringParseError(
820                                "Skipping duplicate ampersand/semicolon at "
821                                "%d" % i
822                            )
823                            e.offset = i
824                            raise e
825                        else:
826                            self.logger.debug("Skipping duplicate ampersand/"
827                                         "semicolon at %d", i)
828                    else:
829                        # This case is when we're skipping the (first)
830                        # seperator between fields, so we just set our flag
831                        # and continue on.
832                        found_sep = True
833                else:
834                    # Emit a field-start event, and go to that state.  Also,
835                    # reset the "found_sep" flag, for the next time we get to
836                    # this state.
837                    self.callback('field_start')
838                    i -= 1
839                    state = STATE_FIELD_NAME
840                    found_sep = False
841
842            elif state == STATE_FIELD_NAME:
843                # Try and find a seperator - we ensure that, if we do, we only
844                # look for the equal sign before it.
845                sep_pos = data.find(b'&', i)
846                if sep_pos == -1:
847                    sep_pos = data.find(b';', i)
848
849                # See if we can find an equals sign in the remaining data.  If
850                # so, we can immedately emit the field name and jump to the
851                # data state.
852                if sep_pos != -1:
853                    equals_pos = data.find(b'=', i, sep_pos)
854                else:
855                    equals_pos = data.find(b'=', i)
856
857                if equals_pos != -1:
858                    # Emit this name.
859                    self.callback('field_name', data, i, equals_pos)
860
861                    # Jump i to this position.  Note that it will then have 1
862                    # added to it below, which means the next iteration of this
863                    # loop will inspect the character after the equals sign.
864                    i = equals_pos
865                    state = STATE_FIELD_DATA
866                else:
867                    # No equals sign found.
868                    if not strict_parsing:
869                        # See also comments in the STATE_FIELD_DATA case below.
870                        # If we found the seperator, we emit the name and just
871                        # end - there's no data callback at all (not even with
872                        # a blank value).
873                        if sep_pos != -1:
874                            self.callback('field_name', data, i, sep_pos)
875                            self.callback('field_end')
876
877                            i = sep_pos - 1
878                            state = STATE_BEFORE_FIELD
879                        else:
880                            # Otherwise, no seperator in this block, so the
881                            # rest of this chunk must be a name.
882                            self.callback('field_name', data, i, length)
883                            i = length
884
885                    else:
886                        # We're parsing strictly.  If we find a seperator,
887                        # this is an error - we require an equals sign.
888                        if sep_pos != -1:
889                            e =  QuerystringParseError(
890                                "When strict_parsing is True, we require an "
891                                "equals sign in all field chunks. Did not "
892                                "find one in the chunk that starts at %d" %
893                                (i,)
894                            )
895                            e.offset = i
896                            raise e
897
898                        # No seperator in the rest of this chunk, so it's just
899                        # a field name.
900                        self.callback('field_name', data, i, length)
901                        i = length
902
903            elif state == STATE_FIELD_DATA:
904                # Try finding either an ampersand or a semicolon after this
905                # position.
906                sep_pos = data.find(b'&', i)
907                if sep_pos == -1:
908                    sep_pos = data.find(b';', i)
909
910                # If we found it, callback this bit as data and then go back
911                # to expecting to find a field.
912                if sep_pos != -1:
913                    self.callback('field_data', data, i, sep_pos)
914                    self.callback('field_end')
915
916                    # Note that we go to the seperator, which brings us to the
917                    # "before field" state.  This allows us to properly emit
918                    # "field_start" events only when we actually have data for
919                    # a field of some sort.
920                    i = sep_pos - 1
921                    state = STATE_BEFORE_FIELD
922
923                # Otherwise, emit the rest as data and finish.
924                else:
925                    self.callback('field_data', data, i, length)
926                    i = length
927
928            else:                   # pragma: no cover (error case)
929                msg = "Reached an unknown state %d at %d" % (state, i)
930                self.logger.warning(msg)
931                e = QuerystringParseError(msg)
932                e.offset = i
933                raise e
934
935            i += 1
936
937        self.state = state
938        self._found_sep = found_sep
939        return len(data)
940
941    def finalize(self):
942        """Finalize this parser, which signals to that we are finished parsing,
943        if we're still in the middle of a field, an on_field_end callback, and
944        then the on_end callback.
945        """
946        # If we're currently in the middle of a field, we finish it.
947        if self.state == STATE_FIELD_DATA:
948            self.callback('field_end')
949        self.callback('end')
950
951    def __repr__(self):
952        return "%s(keep_blank_values=%r, strict_parsing=%r, max_size=%r)" % (
953            self.__class__.__name__,
954            self.keep_blank_values, self.strict_parsing, self.max_size
955        )
956
957
958class MultipartParser(BaseParser):
959    """This class is a streaming multipart/form-data parser.
960
961    .. list-table::
962       :widths: 15 10 30
963       :header-rows: 1
964
965       * - Callback Name
966         - Parameters
967         - Description
968       * - on_part_begin
969         - None
970         - Called when a new part of the multipart message is encountered.
971       * - on_part_data
972         - data, start, end
973         - Called when a portion of a part's data is encountered.
974       * - on_part_end
975         - None
976         - Called when the end of a part is reached.
977       * - on_header_begin
978         - None
979         - Called when we've found a new header in a part of a multipart
980           message
981       * - on_header_field
982         - data, start, end
983         - Called each time an additional portion of a header is read (i.e. the
984           part of the header that is before the colon; the "Foo" in
985           "Foo: Bar").
986       * - on_header_value
987         - data, start, end
988         - Called when we get data for a header.
989       * - on_header_end
990         - None
991         - Called when the current header is finished - i.e. we've reached the
992           newline at the end of the header.
993       * - on_headers_finished
994         - None
995         - Called when all headers are finished, and before the part data
996           starts.
997       * - on_end
998         - None
999         - Called when the parser is finished parsing all data.
1000
1001
1002    :param boundary: The multipart boundary.  This is required, and must match
1003                     what is given in the HTTP request - usually in the
1004                     Content-Type header.
1005
1006    :param callbacks: A dictionary of callbacks.  See the documentation for
1007                      :class:`BaseParser`.
1008
1009    :param max_size: The maximum size of body to parse.  Defaults to infinity -
1010                     i.e. unbounded.
1011    """
1012
1013    def __init__(self, boundary, callbacks={}, max_size=float('inf')):
1014        # Initialize parser state.
1015        super(MultipartParser, self).__init__()
1016        self.state = STATE_START
1017        self.index = self.flags = 0
1018
1019        self.callbacks = callbacks
1020
1021        if not isinstance(max_size, Number) or max_size < 1:
1022            raise ValueError("max_size must be a positive number, not %r" %
1023                             max_size)
1024        self.max_size = max_size
1025        self._current_size = 0
1026
1027        # Setup marks.  These are used to track the state of data recieved.
1028        self.marks = {}
1029
1030        # TODO: Actually use this rather than the dumb version we currently use
1031        # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
1032        # skip = [len(boundary) for x in range(256)]
1033        # for i in range(len(boundary) - 1):
1034        #     skip[ord_char(boundary[i])] = len(boundary) - i - 1
1035        #
1036        # # We use a tuple since it's a constant, and marginally faster.
1037        # self.skip = tuple(skip)
1038
1039        # Save our boundary.
1040        if isinstance(boundary, text_type):         # pragma: no cover
1041            boundary = boundary.encode('latin-1')
1042        self.boundary = b'\r\n--' + boundary
1043
1044        # Get a set of characters that belong to our boundary.
1045        self.boundary_chars = frozenset(self.boundary)
1046
1047        # We also create a lookbehind list.
1048        # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
1049        # "--\r\n" at the final boundary, and the length of '\r\n--' and
1050        # '--\r\n' is 8 bytes.
1051        self.lookbehind = [NULL for x in range(len(boundary) + 8)]
1052
1053    def write(self, data):
1054        """Write some data to the parser, which will perform size verification,
1055        and then parse the data into the appropriate location (e.g. header,
1056        data, etc.), and pass this on to the underlying callback.  If an error
1057        is encountered, a MultipartParseError will be raised.  The "offset"
1058        attribute on the raised exception will be set to the offset of the byte
1059        in the input chunk that caused the error.
1060
1061        :param data: a bytestring
1062        """
1063        # Handle sizing.
1064        data_len = len(data)
1065        if (self._current_size + data_len) > self.max_size:
1066            # We truncate the length of data that we are to process.
1067            new_size = int(self.max_size - self._current_size)
1068            self.logger.warning("Current size is %d (max %d), so truncating "
1069                                "data length from %d to %d",
1070                                self._current_size, self.max_size, data_len,
1071                                new_size)
1072            data_len = new_size
1073
1074        l = 0
1075        try:
1076            l = self._internal_write(data, data_len)
1077        finally:
1078            self._current_size += l
1079
1080        return l
1081
1082    def _internal_write(self, data, length):
1083        # Get values from locals.
1084        boundary = self.boundary
1085
1086        # Get our state, flags and index.  These are persisted between calls to
1087        # this function.
1088        state = self.state
1089        index = self.index
1090        flags = self.flags
1091
1092        # Our index defaults to 0.
1093        i = 0
1094
1095        # Set a mark.
1096        def set_mark(name):
1097            self.marks[name] = i
1098
1099        # Remove a mark.
1100        def delete_mark(name, reset=False):
1101            self.marks.pop(name, None)
1102
1103        # Helper function that makes calling a callback with data easier. The
1104        # 'remaining' parameter will callback from the marked value until the
1105        # end of the buffer, and reset the mark, instead of deleting it.  This
1106        # is used at the end of the function to call our callbacks with any
1107        # remaining data in this chunk.
1108        def data_callback(name, remaining=False):
1109            marked_index = self.marks.get(name)
1110            if marked_index is None:
1111                return
1112
1113            # If we're getting remaining data, we ignore the current i value
1114            # and just call with the remaining data.
1115            if remaining:
1116                self.callback(name, data, marked_index, length)
1117                self.marks[name] = 0
1118
1119            # Otherwise, we call it from the mark to the current byte we're
1120            # processing.
1121            else:
1122                self.callback(name, data, marked_index, i)
1123                self.marks.pop(name, None)
1124
1125        # For each byte...
1126        while i < length:
1127            c = data[i]
1128
1129            if state == STATE_START:
1130                # Skip leading newlines
1131                if c == CR or c == LF:
1132                    i += 1
1133                    self.logger.debug("Skipping leading CR/LF at %d", i)
1134                    continue
1135
1136                # index is used as in index into our boundary.  Set to 0.
1137                index = 0
1138
1139                # Move to the next state, but decrement i so that we re-process
1140                # this character.
1141                state = STATE_START_BOUNDARY
1142                i -= 1
1143
1144            elif state == STATE_START_BOUNDARY:
1145                # Check to ensure that the last 2 characters in our boundary
1146                # are CRLF.
1147                if index == len(boundary) - 2:
1148                    if c != CR:
1149                        # Error!
1150                        msg = "Did not find CR at end of boundary (%d)" % (i,)
1151                        self.logger.warning(msg)
1152                        e = MultipartParseError(msg)
1153                        e.offset = i
1154                        raise e
1155
1156                    index += 1
1157
1158                elif index == len(boundary) - 2 + 1:
1159                    if c != LF:
1160                        msg = "Did not find LF at end of boundary (%d)" % (i,)
1161                        self.logger.warning(msg)
1162                        e = MultipartParseError(msg)
1163                        e.offset = i
1164                        raise e
1165
1166                    # The index is now used for indexing into our boundary.
1167                    index = 0
1168
1169                    # Callback for the start of a part.
1170                    self.callback('part_begin')
1171
1172                    # Move to the next character and state.
1173                    state = STATE_HEADER_FIELD_START
1174
1175                else:
1176                    # Check to ensure our boundary matches
1177                    if c != boundary[index + 2]:
1178                        msg = "Did not find boundary character %r at index " \
1179                              "%d" % (c, index + 2)
1180                        self.logger.warning(msg)
1181                        e = MultipartParseError(msg)
1182                        e.offset = i
1183                        raise e
1184
1185                    # Increment index into boundary and continue.
1186                    index += 1
1187
1188            elif state == STATE_HEADER_FIELD_START:
1189                # Mark the start of a header field here, reset the index, and
1190                # continue parsing our header field.
1191                index = 0
1192
1193                # Set a mark of our header field.
1194                set_mark('header_field')
1195
1196                # Move to parsing header fields.
1197                state = STATE_HEADER_FIELD
1198                i -= 1
1199
1200            elif state == STATE_HEADER_FIELD:
1201                # If we've reached a CR at the beginning of a header, it means
1202                # that we've reached the second of 2 newlines, and so there are
1203                # no more headers to parse.
1204                if c == CR:
1205                    delete_mark('header_field')
1206                    state = STATE_HEADERS_ALMOST_DONE
1207                    i += 1
1208                    continue
1209
1210                # Increment our index in the header.
1211                index += 1
1212
1213                # Do nothing if we encounter a hyphen.
1214                if c == HYPHEN:
1215                    pass
1216
1217                # If we've reached a colon, we're done with this header.
1218                elif c == COLON:
1219                    # A 0-length header is an error.
1220                    if index == 1:
1221                        msg = "Found 0-length header at %d" % (i,)
1222                        self.logger.warning(msg)
1223                        e = MultipartParseError(msg)
1224                        e.offset = i
1225                        raise e
1226
1227                    # Call our callback with the header field.
1228                    data_callback('header_field')
1229
1230                    # Move to parsing the header value.
1231                    state = STATE_HEADER_VALUE_START
1232
1233                else:
1234                    # Lower-case this character, and ensure that it is in fact
1235                    # a valid letter.  If not, it's an error.
1236                    cl = lower_char(c)
1237                    if cl < LOWER_A or cl > LOWER_Z:
1238                        msg = "Found non-alphanumeric character %r in " \
1239                              "header at %d" % (c, i)
1240                        self.logger.warning(msg)
1241                        e = MultipartParseError(msg)
1242                        e.offset = i
1243                        raise e
1244
1245            elif state == STATE_HEADER_VALUE_START:
1246                # Skip leading spaces.
1247                if c == SPACE:
1248                    i += 1
1249                    continue
1250
1251                # Mark the start of the header value.
1252                set_mark('header_value')
1253
1254                # Move to the header-value state, reprocessing this character.
1255                state = STATE_HEADER_VALUE
1256                i -= 1
1257
1258            elif state == STATE_HEADER_VALUE:
1259                # If we've got a CR, we're nearly done our headers.  Otherwise,
1260                # we do nothing and just move past this character.
1261                if c == CR:
1262                    data_callback('header_value')
1263                    self.callback('header_end')
1264                    state = STATE_HEADER_VALUE_ALMOST_DONE
1265
1266            elif state == STATE_HEADER_VALUE_ALMOST_DONE:
1267                # The last character should be a LF.  If not, it's an error.
1268                if c != LF:
1269                    msg = "Did not find LF character at end of header " \
1270                          "(found %r)" % (c,)
1271                    self.logger.warning(msg)
1272                    e = MultipartParseError(msg)
1273                    e.offset = i
1274                    raise e
1275
1276                # Move back to the start of another header.  Note that if that
1277                # state detects ANOTHER newline, it'll trigger the end of our
1278                # headers.
1279                state = STATE_HEADER_FIELD_START
1280
1281            elif state == STATE_HEADERS_ALMOST_DONE:
1282                # We're almost done our headers.  This is reached when we parse
1283                # a CR at the beginning of a header, so our next character
1284                # should be a LF, or it's an error.
1285                if c != LF:
1286                    msg = "Did not find LF at end of headers (found %r)" % (c,)
1287                    self.logger.warning(msg)
1288                    e = MultipartParseError(msg)
1289                    e.offset = i
1290                    raise e
1291
1292                self.callback('headers_finished')
1293                state = STATE_PART_DATA_START
1294
1295            elif state == STATE_PART_DATA_START:
1296                # Mark the start of our part data.
1297                set_mark('part_data')
1298
1299                # Start processing part data, including this character.
1300                state = STATE_PART_DATA
1301                i -= 1
1302
1303            elif state == STATE_PART_DATA:
1304                # We're processing our part data right now.  During this, we
1305                # need to efficiently search for our boundary, since any data
1306                # on any number of lines can be a part of the current data.
1307                # We use the Boyer-Moore-Horspool algorithm to efficiently
1308                # search through the remainder of the buffer looking for our
1309                # boundary.
1310
1311                # Save the current value of our index.  We use this in case we
1312                # find part of a boundary, but it doesn't match fully.
1313                prev_index = index
1314
1315                # Set up variables.
1316                boundary_length = len(boundary)
1317                boundary_end = boundary_length - 1
1318                data_length = length
1319                boundary_chars = self.boundary_chars
1320
1321                # If our index is 0, we're starting a new part, so start our
1322                # search.
1323                if index == 0:
1324                    # Search forward until we either hit the end of our buffer,
1325                    # or reach a character that's in our boundary.
1326                    i += boundary_end
1327                    while i < data_length - 1 and data[i] not in boundary_chars:
1328                        i += boundary_length
1329
1330                    # Reset i back the length of our boundary, which is the
1331                    # earliest possible location that could be our match (i.e.
1332                    # if we've just broken out of our loop since we saw the
1333                    # last character in our boundary)
1334                    i -= boundary_end
1335                    c = data[i]
1336
1337                # Now, we have a couple of cases here.  If our index is before
1338                # the end of the boundary...
1339                if index < boundary_length:
1340                    # If the character matches...
1341                    if boundary[index] == c:
1342                        # If we found a match for our boundary, we send the
1343                        # existing data.
1344                        if index == 0:
1345                            data_callback('part_data')
1346
1347                        # The current character matches, so continue!
1348                        index += 1
1349                    else:
1350                        index = 0
1351
1352                # Our index is equal to the length of our boundary!
1353                elif index == boundary_length:
1354                    # First we increment it.
1355                    index += 1
1356
1357                    # Now, if we've reached a newline, we need to set this as
1358                    # the potential end of our boundary.
1359                    if c == CR:
1360                        flags |= FLAG_PART_BOUNDARY
1361
1362                    # Otherwise, if this is a hyphen, we might be at the last
1363                    # of all boundaries.
1364                    elif c == HYPHEN:
1365                        flags |= FLAG_LAST_BOUNDARY
1366
1367                    # Otherwise, we reset our index, since this isn't either a
1368                    # newline or a hyphen.
1369                    else:
1370                        index = 0
1371
1372                # Our index is right after the part boundary, which should be
1373                # a LF.
1374                elif index == boundary_length + 1:
1375                    # If we're at a part boundary (i.e. we've seen a CR
1376                    # character already)...
1377                    if flags & FLAG_PART_BOUNDARY:
1378                        # We need a LF character next.
1379                        if c == LF:
1380                            # Unset the part boundary flag.
1381                            flags &= (~FLAG_PART_BOUNDARY)
1382
1383                            # Callback indicating that we've reached the end of
1384                            # a part, and are starting a new one.
1385                            self.callback('part_end')
1386                            self.callback('part_begin')
1387
1388                            # Move to parsing new headers.
1389                            index = 0
1390                            state = STATE_HEADER_FIELD_START
1391                            i += 1
1392                            continue
1393
1394                        # We didn't find an LF character, so no match.  Reset
1395                        # our index and clear our flag.
1396                        index = 0
1397                        flags &= (~FLAG_PART_BOUNDARY)
1398
1399                    # Otherwise, if we're at the last boundary (i.e. we've
1400                    # seen a hyphen already)...
1401                    elif flags & FLAG_LAST_BOUNDARY:
1402                        # We need a second hyphen here.
1403                        if c == HYPHEN:
1404                            # Callback to end the current part, and then the
1405                            # message.
1406                            self.callback('part_end')
1407                            self.callback('end')
1408                            state = STATE_END
1409                        else:
1410                            # No match, so reset index.
1411                            index = 0
1412
1413                # If we have an index, we need to keep this byte for later, in
1414                # case we can't match the full boundary.
1415                if index > 0:
1416                    self.lookbehind[index - 1] = c
1417
1418                # Otherwise, our index is 0.  If the previous index is not, it
1419                # means we reset something, and we need to take the data we
1420                # thought was part of our boundary and send it along as actual
1421                # data.
1422                elif prev_index > 0:
1423                    # Callback to write the saved data.
1424                    lb_data = join_bytes(self.lookbehind)
1425                    self.callback('part_data', lb_data, 0, prev_index)
1426
1427                    # Overwrite our previous index.
1428                    prev_index = 0
1429
1430                    # Re-set our mark for part data.
1431                    set_mark('part_data')
1432
1433                    # Re-consider the current character, since this could be
1434                    # the start of the boundary itself.
1435                    i -= 1
1436
1437            elif state == STATE_END:
1438                # Do nothing and just consume a byte in the end state.
1439                if c not in (CR, LF):
1440                    self.logger.warning("Consuming a byte '0x%x' in the end state", c)
1441
1442            else:                   # pragma: no cover (error case)
1443                # We got into a strange state somehow!  Just stop processing.
1444                msg = "Reached an unknown state %d at %d" % (state, i)
1445                self.logger.warning(msg)
1446                e = MultipartParseError(msg)
1447                e.offset = i
1448                raise e
1449
1450            # Move to the next byte.
1451            i += 1
1452
1453        # We call our callbacks with any remaining data.  Note that we pass
1454        # the 'remaining' flag, which sets the mark back to 0 instead of
1455        # deleting it, if it's found.  This is because, if the mark is found
1456        # at this point, we assume that there's data for one of these things
1457        # that has been parsed, but not yet emitted.  And, as such, it implies
1458        # that we haven't yet reached the end of this 'thing'.  So, by setting
1459        # the mark to 0, we cause any data callbacks that take place in future
1460        # calls to this function to start from the beginning of that buffer.
1461        data_callback('header_field', True)
1462        data_callback('header_value', True)
1463        data_callback('part_data', True)
1464
1465        # Save values to locals.
1466        self.state = state
1467        self.index = index
1468        self.flags = flags
1469
1470        # Return our data length to indicate no errors, and that we processed
1471        # all of it.
1472        return length
1473
1474    def finalize(self):
1475        """Finalize this parser, which signals to that we are finished parsing.
1476
1477        Note: It does not currently, but in the future, it will verify that we
1478        are in the final state of the parser (i.e. the end of the multipart
1479        message is well-formed), and, if not, throw an error.
1480        """
1481        # TODO: verify that we're in the state STATE_END, otherwise throw an
1482        # error or otherwise state that we're not finished parsing.
1483        pass
1484
1485    def __repr__(self):
1486        return "%s(boundary=%r)" % (self.__class__.__name__, self.boundary)
1487
1488
1489class FormParser(object):
1490    """This class is the all-in-one form parser.  Given all the information
1491    necessary to parse a form, it will instantiate the correct parser, create
1492    the proper :class:`Field` and :class:`File` classes to store the data that
1493    is parsed, and call the two given callbacks with each field and file as
1494    they become available.
1495
1496    :param content_type: The Content-Type of the incoming request.  This is
1497                         used to select the appropriate parser.
1498
1499    :param on_field: The callback to call when a field has been parsed and is
1500                     ready for usage.  See above for parameters.
1501
1502    :param on_file: The callback to call when a file has been parsed and is
1503                    ready for usage.  See above for parameters.
1504
1505    :param on_end: An optional callback to call when all fields and files in a
1506                   request has been parsed.  Can be None.
1507
1508    :param boundary: If the request is a multipart/form-data request, this
1509                     should be the boundary of the request, as given in the
1510                     Content-Type header, as a bytestring.
1511
1512    :param file_name: If the request is of type application/octet-stream, then
1513                      the body of the request will not contain any information
1514                      about the uploaded file.  In such cases, you can provide
1515                      the file name of the uploaded file manually.
1516
1517    :param FileClass: The class to use for uploaded files.  Defaults to
1518                      :class:`File`, but you can provide your own class if you
1519                      wish to customize behaviour.  The class will be
1520                      instantiated as FileClass(file_name, field_name), and it
1521                      must provide the folllowing functions::
1522                          file_instance.write(data)
1523                          file_instance.finalize()
1524                          file_instance.close()
1525
1526    :param FieldClass: The class to use for uploaded fields.  Defaults to
1527                       :class:`Field`, but you can provide your own class if
1528                       you wish to customize behaviour.  The class will be
1529                       instantiated as FieldClass(field_name), and it must
1530                       provide the folllowing functions::
1531                           field_instance.write(data)
1532                           field_instance.finalize()
1533                           field_instance.close()
1534
1535    :param config: Configuration to use for this FormParser.  The default
1536                   values are taken from the DEFAULT_CONFIG value, and then
1537                   any keys present in this dictionary will overwrite the
1538                   default values.
1539
1540    """
1541    #: This is the default configuration for our form parser.
1542    #: Note: all file sizes should be in bytes.
1543    DEFAULT_CONFIG = {
1544        'MAX_BODY_SIZE': float('inf'),
1545        'MAX_MEMORY_FILE_SIZE': 1 * 1024 * 1024,
1546        'UPLOAD_DIR': None,
1547        'UPLOAD_KEEP_FILENAME': False,
1548        'UPLOAD_KEEP_EXTENSIONS': False,
1549
1550        # Error on invalid Content-Transfer-Encoding?
1551        'UPLOAD_ERROR_ON_BAD_CTE': False,
1552    }
1553
1554    def __init__(self, content_type, on_field, on_file, on_end=None,
1555                 boundary=None, file_name=None, FileClass=File,
1556                 FieldClass=Field, config={}):
1557
1558        self.logger = logging.getLogger(__name__)
1559
1560        # Save variables.
1561        self.content_type = content_type
1562        self.boundary = boundary
1563        self.bytes_received = 0
1564        self.parser = None
1565
1566        # Save callbacks.
1567        self.on_field = on_field
1568        self.on_file = on_file
1569        self.on_end = on_end
1570
1571        # Save classes.
1572        self.FileClass = File
1573        self.FieldClass = Field
1574
1575        # Set configuration options.
1576        self.config = self.DEFAULT_CONFIG.copy()
1577        self.config.update(config)
1578
1579        # Depending on the Content-Type, we instantiate the correct parser.
1580        if content_type == 'application/octet-stream':
1581            # Work around the lack of 'nonlocal' in Py2
1582            class vars(object):
1583                f = None
1584
1585            def on_start():
1586                vars.f = FileClass(file_name, None, config=self.config)
1587
1588            def on_data(data, start, end):
1589                vars.f.write(data[start:end])
1590
1591            def on_end():
1592                # Finalize the file itself.
1593                vars.f.finalize()
1594
1595                # Call our callback.
1596                on_file(vars.f)
1597
1598                # Call the on-end callback.
1599                if self.on_end is not None:
1600                    self.on_end()
1601
1602            callbacks = {
1603                'on_start': on_start,
1604                'on_data': on_data,
1605                'on_end': on_end,
1606            }
1607
1608            # Instantiate an octet-stream parser
1609            parser = OctetStreamParser(callbacks,
1610                                       max_size=self.config['MAX_BODY_SIZE'])
1611
1612        elif (content_type == 'application/x-www-form-urlencoded' or
1613              content_type == 'application/x-url-encoded'):
1614
1615            name_buffer = []
1616
1617            class vars(object):
1618                f = None
1619
1620            def on_field_start():
1621                pass
1622
1623            def on_field_name(data, start, end):
1624                name_buffer.append(data[start:end])
1625
1626            def on_field_data(data, start, end):
1627                if vars.f is None:
1628                    vars.f = FieldClass(b''.join(name_buffer))
1629                    del name_buffer[:]
1630                vars.f.write(data[start:end])
1631
1632            def on_field_end():
1633                # Finalize and call callback.
1634                if vars.f is None:
1635                    # If we get here, it's because there was no field data.
1636                    # We create a field, set it to None, and then continue.
1637                    vars.f = FieldClass(b''.join(name_buffer))
1638                    del name_buffer[:]
1639                    vars.f.set_none()
1640
1641                vars.f.finalize()
1642                on_field(vars.f)
1643                vars.f = None
1644
1645            def on_end():
1646                if self.on_end is not None:
1647                    self.on_end()
1648
1649            # Setup callbacks.
1650            callbacks = {
1651                'on_field_start': on_field_start,
1652                'on_field_name': on_field_name,
1653                'on_field_data': on_field_data,
1654                'on_field_end': on_field_end,
1655                'on_end': on_end,
1656            }
1657
1658            # Instantiate parser.
1659            parser = QuerystringParser(
1660                callbacks=callbacks,
1661                max_size=self.config['MAX_BODY_SIZE']
1662            )
1663
1664        elif content_type == 'multipart/form-data':
1665            if boundary is None:
1666                self.logger.error("No boundary given")
1667                raise FormParserError("No boundary given")
1668
1669            header_name = []
1670            header_value = []
1671            headers = {}
1672
1673            # No 'nonlocal' on Python 2 :-(
1674            class vars(object):
1675                f = None
1676                writer = None
1677                is_file = False
1678
1679            def on_part_begin():
1680                pass
1681
1682            def on_part_data(data, start, end):
1683                bytes_processed = vars.writer.write(data[start:end])
1684                # TODO: check for error here.
1685                return bytes_processed
1686
1687            def on_part_end():
1688                vars.f.finalize()
1689                if vars.is_file:
1690                    on_file(vars.f)
1691                else:
1692                    on_field(vars.f)
1693
1694            def on_header_field(data, start, end):
1695                header_name.append(data[start:end])
1696
1697            def on_header_value(data, start, end):
1698                header_value.append(data[start:end])
1699
1700            def on_header_end():
1701                headers[b''.join(header_name)] = b''.join(header_value)
1702                del header_name[:]
1703                del header_value[:]
1704
1705            def on_headers_finished():
1706                # Reset the 'is file' flag.
1707                vars.is_file = False
1708
1709                # Parse the content-disposition header.
1710                # TODO: handle mixed case
1711                content_disp = headers.get(b'Content-Disposition')
1712                disp, options = parse_options_header(content_disp)
1713
1714                # Get the field and filename.
1715                field_name = options.get(b'name')
1716                file_name = options.get(b'filename')
1717                # TODO: check for errors
1718
1719                # Create the proper class.
1720                if file_name is None:
1721                    vars.f = FieldClass(field_name)
1722                else:
1723                    vars.f = FileClass(file_name, field_name, config=self.config)
1724                    vars.is_file = True
1725
1726                # Parse the given Content-Transfer-Encoding to determine what
1727                # we need to do with the incoming data.
1728                # TODO: check that we properly handle 8bit / 7bit encoding.
1729                transfer_encoding = headers.get(b'Content-Transfer-Encoding',
1730                                                b'7bit')
1731
1732                if (transfer_encoding == b'binary' or
1733                        transfer_encoding == b'8bit' or
1734                        transfer_encoding == b'7bit'):
1735                    vars.writer = vars.f
1736
1737                elif transfer_encoding == b'base64':
1738                    vars.writer = Base64Decoder(vars.f)
1739
1740                elif transfer_encoding == b'quoted-printable':
1741                    vars.writer = QuotedPrintableDecoder(vars.f)
1742
1743                else:
1744                    self.logger.warning("Unknown Content-Transfer-Encoding: "
1745                                        "%r", transfer_encoding)
1746                    if self.config['UPLOAD_ERROR_ON_BAD_CTE']:
1747                        raise FormParserError(
1748                            'Unknown Content-Transfer-Encoding "{0}"'.format(
1749                                transfer_encoding
1750                            )
1751                        )
1752                    else:
1753                        # If we aren't erroring, then we just treat this as an
1754                        # unencoded Content-Transfer-Encoding.
1755                        vars.writer = vars.f
1756
1757            def on_end():
1758                vars.writer.finalize()
1759                if self.on_end is not None:
1760                    self.on_end()
1761
1762            # These are our callbacks for the parser.
1763            callbacks = {
1764                'on_part_begin': on_part_begin,
1765                'on_part_data': on_part_data,
1766                'on_part_end': on_part_end,
1767                'on_header_field': on_header_field,
1768                'on_header_value': on_header_value,
1769                'on_header_end': on_header_end,
1770                'on_headers_finished': on_headers_finished,
1771                'on_end': on_end,
1772            }
1773
1774            # Instantiate a multipart parser.
1775            parser = MultipartParser(boundary, callbacks,
1776                                     max_size=self.config['MAX_BODY_SIZE'])
1777
1778        else:
1779            self.logger.warning("Unknown Content-Type: %r", content_type)
1780            raise FormParserError("Unknown Content-Type: {0}".format(
1781                content_type
1782            ))
1783
1784        self.parser = parser
1785
1786    def write(self, data):
1787        """Write some data.  The parser will forward this to the appropriate
1788        underlying parser.
1789
1790        :param data: a bytestring
1791        """
1792        self.bytes_received += len(data)
1793        # TODO: check the parser's return value for errors?
1794        return self.parser.write(data)
1795
1796    def finalize(self):
1797        """Finalize the parser."""
1798        if self.parser is not None and hasattr(self.parser, 'finalize'):
1799            self.parser.finalize()
1800
1801    def close(self):
1802        """Close the parser."""
1803        if self.parser is not None and hasattr(self.parser, 'close'):
1804            self.parser.close()
1805
1806    def __repr__(self):
1807        return "%s(content_type=%r, parser=%r)" % (
1808            self.__class__.__name__,
1809            self.content_type,
1810            self.parser,
1811        )
1812
1813
1814def create_form_parser(headers, on_field, on_file, trust_x_headers=False,
1815                       config={}):
1816    """This function is a helper function to aid in creating a FormParser
1817    instances.  Given a dictionary-like headers object, it will determine
1818    the correct information needed, instantiate a FormParser with the
1819    appropriate values and given callbacks, and then return the corresponding
1820    parser.
1821
1822    :param headers: A dictionary-like object of HTTP headers.  The only
1823                    required header is Content-Type.
1824
1825    :param on_field: Callback to call with each parsed field.
1826
1827    :param on_file: Callback to call with each parsed file.
1828
1829    :param trust_x_headers: Whether or not to trust information received from
1830                            certain X-Headers - for example, the file name from
1831                            X-File-Name.
1832
1833    :param config: Configuration variables to pass to the FormParser.
1834    """
1835    content_type = headers.get('Content-Type')
1836    if content_type is None:
1837        logging.getLogger(__name__).warning("No Content-Type header given")
1838        raise ValueError("No Content-Type header given!")
1839
1840    # Boundaries are optional (the FormParser will raise if one is needed
1841    # but not given).
1842    content_type, params = parse_options_header(content_type)
1843    boundary = params.get(b'boundary')
1844
1845    # We need content_type to be a string, not a bytes object.
1846    content_type = content_type.decode('latin-1')
1847
1848    # File names are optional.
1849    file_name = headers.get('X-File-Name')
1850
1851    # Instantiate a form parser.
1852    form_parser = FormParser(content_type,
1853                             on_field,
1854                             on_file,
1855                             boundary=boundary,
1856                             file_name=file_name,
1857                             config=config)
1858
1859    # Return our parser.
1860    return form_parser
1861
1862
1863def parse_form(headers, input_stream, on_field, on_file, chunk_size=1048576,
1864               **kwargs):
1865    """This function is useful if you just want to parse a request body,
1866    without too much work.  Pass it a dictionary-like object of the request's
1867    headers, and a file-like object for the input stream, along with two
1868    callbacks that will get called whenever a field or file is parsed.
1869
1870    :param headers: A dictionary-like object of HTTP headers.  The only
1871                    required header is Content-Type.
1872
1873    :param input_stream: A file-like object that represents the request body.
1874                         The read() method must return bytestrings.
1875
1876    :param on_field: Callback to call with each parsed field.
1877
1878    :param on_file: Callback to call with each parsed file.
1879
1880    :param chunk_size: The maximum size to read from the input stream and write
1881                       to the parser at one time.  Defaults to 1 MiB.
1882    """
1883
1884    # Create our form parser.
1885    parser = create_form_parser(headers, on_field, on_file)
1886
1887    # Read chunks of 100KiB and write to the parser, but never read more than
1888    # the given Content-Length, if any.
1889    content_length = headers.get('Content-Length')
1890    if content_length is not None:
1891        content_length = int(content_length)
1892    else:
1893        content_length = float('inf')
1894    bytes_read = 0
1895
1896    while True:
1897        # Read only up to the Content-Length given.
1898        max_readable = min(content_length - bytes_read, 1048576)
1899        buff = input_stream.read(max_readable)
1900
1901        # Write to the parser and update our length.
1902        parser.write(buff)
1903        bytes_read += len(buff)
1904
1905        # If we get a buffer that's smaller than the size requested, or if we
1906        # have read up to our content length, we're done.
1907        if len(buff) != max_readable or bytes_read == content_length:
1908            break
1909
1910    # Tell our parser that we're done writing data.
1911    parser.finalize()
1912