1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""
9
10import builtins
11import sys
12
13### Registry and builtin stateless codec functions
14
15try:
16    from _codecs import *
17except ImportError as why:
18    raise SystemError('Failed to load the builtin codecs: %s' % why)
19
20__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
21           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
22           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
23           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
24           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
25           "StreamReader", "StreamWriter",
26           "StreamReaderWriter", "StreamRecoder",
27           "getencoder", "getdecoder", "getincrementalencoder",
28           "getincrementaldecoder", "getreader", "getwriter",
29           "encode", "decode", "iterencode", "iterdecode",
30           "strict_errors", "ignore_errors", "replace_errors",
31           "xmlcharrefreplace_errors",
32           "backslashreplace_errors", "namereplace_errors",
33           "register_error", "lookup_error"]
34
35### Constants
36
37#
38# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
39# and its possible byte string values
40# for UTF8/UTF16/UTF32 output and little/big endian machines
41#
42
43# UTF-8
44BOM_UTF8 = b'\xef\xbb\xbf'
45
46# UTF-16, little endian
47BOM_LE = BOM_UTF16_LE = b'\xff\xfe'
48
49# UTF-16, big endian
50BOM_BE = BOM_UTF16_BE = b'\xfe\xff'
51
52# UTF-32, little endian
53BOM_UTF32_LE = b'\xff\xfe\x00\x00'
54
55# UTF-32, big endian
56BOM_UTF32_BE = b'\x00\x00\xfe\xff'
57
58if sys.byteorder == 'little':
59
60    # UTF-16, native endianness
61    BOM = BOM_UTF16 = BOM_UTF16_LE
62
63    # UTF-32, native endianness
64    BOM_UTF32 = BOM_UTF32_LE
65
66else:
67
68    # UTF-16, native endianness
69    BOM = BOM_UTF16 = BOM_UTF16_BE
70
71    # UTF-32, native endianness
72    BOM_UTF32 = BOM_UTF32_BE
73
74# Old broken names (don't use in new code)
75BOM32_LE = BOM_UTF16_LE
76BOM32_BE = BOM_UTF16_BE
77BOM64_LE = BOM_UTF32_LE
78BOM64_BE = BOM_UTF32_BE
79
80
81### Codec base classes (defining the API)
82
83class CodecInfo(tuple):
84    """Codec details when looking up the codec registry"""
85
86    # Private API to allow Python 3.4 to blacklist the known non-Unicode
87    # codecs in the standard library. A more general mechanism to
88    # reliably distinguish test encodings from other codecs will hopefully
89    # be defined for Python 3.5
90    #
91    # See http://bugs.python.org/issue19619
92    _is_text_encoding = True # Assume codecs are text encodings by default
93
94    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
95        incrementalencoder=None, incrementaldecoder=None, name=None,
96        *, _is_text_encoding=None):
97        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
98        self.name = name
99        self.encode = encode
100        self.decode = decode
101        self.incrementalencoder = incrementalencoder
102        self.incrementaldecoder = incrementaldecoder
103        self.streamwriter = streamwriter
104        self.streamreader = streamreader
105        if _is_text_encoding is not None:
106            self._is_text_encoding = _is_text_encoding
107        return self
108
109    def __repr__(self):
110        return "<%s.%s object for encoding %s at %#x>" % \
111                (self.__class__.__module__, self.__class__.__qualname__,
112                 self.name, id(self))
113
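# Usage sketch (illustrative only, not part of this module's code path):
# the registry returns CodecInfo instances whose attributes give access
# to the stateless and stateful APIs of a codec.
#
#   info = lookup('utf-8')                   # -> CodecInfo object
#   data, consumed = info.encode('caf\xe9')  # -> (b'caf\xc3\xa9', 4)
#   text, consumed = info.decode(data)       # -> ('caf\xe9', 5)
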
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be a bytes-like object that supports the buffer
            protocol. Bytes objects, bytearrays and memory mapped files
            are examples of objects providing this interface.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

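# Error handling sketch (illustrative only): the predefined handlers can
# be selected by name via the errors argument of any codec API, and new
# handlers can be added via register_error().
#
#   '\xff'.encode('ascii', errors='replace')            # -> b'?'
#   '\xff'.encode('ascii', errors='xmlcharrefreplace')  # -> b'&#255;'
#
#   def my_handler(exc):                      # hypothetical custom handler
#       return ('?', exc.end)                 # replacement, resume position
#   register_error('myreplace', my_handler)   # usable as errors='myreplace'
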
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

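# Usage sketch (illustrative only): an incremental encoder obtained from
# the registry keeps state between calls, so input can be fed in chunks.
#
#   encoder = getincrementalencoder('utf-8')()
#   chunks = [encoder.encode(part) for part in ('Hello, ', 'world')]
#   chunks.append(encoder.encode('', final=True))  # flush pending state
#   b''.join(chunks)                               # -> b'Hello, world'
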
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the input in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Override this method in subclasses: it must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Override this method in subclasses: it must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

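# Subclassing sketch (illustrative, using a made-up two-byte format): a
# BufferedIncrementalDecoder subclass only needs _buffer_decode(), which
# must return an (output, bytes consumed) tuple; incomplete trailing
# input is buffered automatically until the next call.
#
#   class PairDecoder(BufferedIncrementalDecoder):   # hypothetical codec
#       def _buffer_decode(self, input, errors, final):
#           pairs = len(input) // 2                  # only decode full pairs
#           out = input[:pairs * 2].decode('ascii', errors)
#           return (out, pairs * 2)
#
#   d = PairDecoder()
#   d.decode(b'ab')             # -> 'ab'
#   d.decode(b'c')              # -> ''   (b'c' is buffered)
#   d.decode(b'd', final=True)  # -> 'cd'
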
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done; a condensed sketch of that pattern follows below.
#

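# Sketch of the pattern used by the submodules in the encodings package
# ('mycodec_encode' and 'mycodec_decode' stand in for a codec's stateless
# encode/decode functions, e.g. the ones exported by _codecs):
#
#   class Codec(codecs.Codec):
#       def encode(self, input, errors='strict'):
#           return mycodec_encode(input, errors)
#       def decode(self, input, errors='strict'):
#           return mycodec_decode(input, errors)
#
#   class StreamWriter(Codec, codecs.StreamWriter):
#       pass
#
#   class StreamReader(Codec, codecs.StreamReader):
#       pass
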
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamReader(Codec):

    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, only the first
            line will be returned; the rest of the input will be kept
            until the next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to find the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with StreamReaderWriter(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = b''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # Seeks must be propagated to both the reader and the writer
        # as they might need to reset their internal buffers.
        self.reader.seek(offset, whence)
        self.writer.seek(offset, whence)

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

### Shortcuts

def open(filename, mode='r', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        file.close()
        raise

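# Usage sketch for this function (illustrative; 'data.txt' is a
# hypothetical file name):
#
#   with open('data.txt', 'w', encoding='utf-8') as f:
#       f.write('caf\xe9')              # encoded to UTF-8 on write
#
#   with open('data.txt', encoding='utf-8') as f:
#       f.read()                        # -> 'caf\xe9', decoded on read
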
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according
        to the given data_encoding and then encoded to the underlying
        file using file_encoding. The intermediate data type
        will usually be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and then
        passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

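# Usage sketch (illustrative only): wrap a binary stream so that the
# caller sees Latin-1 bytes while the underlying file stores UTF-8.
#
#   import io
#   raw = io.BytesIO()
#   f = EncodedFile(raw, data_encoding='latin-1', file_encoding='utf-8')
#   f.write(b'caf\xe9')      # Latin-1 input ...
#   raw.getvalue()           # -> b'caf\xc3\xa9' (... stored as UTF-8)
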
### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

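# Usage sketch (illustrative only): the lookup helpers return factories
# that wrap an existing byte stream.
#
#   import io
#   raw = io.BytesIO(b'caf\xc3\xa9')
#   reader = getreader('utf-8')(raw)
#   reader.read()                     # -> 'caf\xe9'
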
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    output = decoder.decode(b"", True)
    if output:
        yield output

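# Usage sketch (illustrative only):
#
#   list(iterencode(['Hello, ', 'world'], 'utf-8'))
#   # -> [b'Hello, ', b'world']
#
#   ''.join(iterdecode([b'caf', b'\xc3', b'\xa9'], 'utf-8'))
#   # -> 'caf\xe9' (the split multi-byte sequence is handled correctly)
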
### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    return {i: i for i in rng}

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m

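# Usage sketch (illustrative only, with a tiny made-up decoding map):
#
#   decoding_map = make_identity_dict(range(3))   # {0: 0, 1: 1, 2: 2}
#   decoding_map[2] = 1                           # 1 is now a duplicate target
#   make_encoding_map(decoding_map)               # -> {0: 0, 1: None}
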
### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
    namereplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')