1# coding: utf-8
2r"""
3Decodes single-byte encodings, filling their "holes" in the same messy way that
4everyone else does.
5
6A single-byte encoding maps each byte to a Unicode character, except that some
7bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
8example, bytes 0x81 and 0x8D, among others, have no meaning.
9
10Python, wanting to preserve some sense of decorum, will handle these bytes
11as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
12different from each other. It just hasn't defined what they are in terms of
13Unicode.
14
15Software that has to interoperate with Windows-1252 and Unicode -- such as all
16the common Web browsers -- will pick some Unicode characters for them to map
17to, and the characters they pick are the Unicode characters with the same
18numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
19resulting characters tend to fall into a range of Unicode that's set aside for
obsolete Latin-1 control characters anyway.
21
These sloppy codecs let Python do the same thing, thus interoperating with
other software that works this way. This module defines a sloppy version of
many single-byte encodings with holes. (There is no need for a sloppy version
of an encoding without holes: for example, there is no such thing as
sloppy-iso-8859-2 or sloppy-macroman.)
27
28The following encodings will become defined:
29
30- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
31- sloppy-windows-1251 (Cyrillic)
32- sloppy-windows-1252 (Western European, based on Latin-1)
33- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
34- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
35- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
36- sloppy-windows-1256 (Arabic)
37- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
38- sloppy-windows-1258 (Vietnamese)
39- sloppy-cp874 (Thai, based on ISO-8859-11)
40- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
41- sloppy-iso-8859-6 (different Arabic)
42- sloppy-iso-8859-7 (Greek)
43- sloppy-iso-8859-8 (Hebrew)
44- sloppy-iso-8859-11 (Thai)
45
46Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
47defined.
48
49Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
50the rest are rather uncommon.
51
52Here are some examples, using `ftfy.explain_unicode` to illustrate how
53sloppy-windows-1252 merges Windows-1252 with Latin-1:
54
55    >>> from ftfy import explain_unicode
56    >>> some_bytes = b'\x80\x81\x82'
57    >>> explain_unicode(some_bytes.decode('latin-1'))
58    U+0080  \x80    [Cc] <unknown>
59    U+0081  \x81    [Cc] <unknown>
60    U+0082  \x82    [Cc] <unknown>
61
62    >>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
63    U+20AC  €       [Sc] EURO SIGN
64    U+FFFD  �       [So] REPLACEMENT CHARACTER
65    U+201A  ‚       [Ps] SINGLE LOW-9 QUOTATION MARK
66
67    >>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
68    U+20AC  €       [Sc] EURO SIGN
69    U+0081  \x81    [Cc] <unknown>
70    U+201A  ‚       [Ps] SINGLE LOW-9 QUOTATION MARK
71"""
72from __future__ import unicode_literals
73import codecs
74from encodings import normalize_encoding
75import sys
76
# U+FFFD REPLACEMENT CHARACTER: what decoding with the 'replace' error handler
# produces for a byte that an encoding leaves unassigned.
REPLACEMENT_CHAR = '\ufffd'

# True only when running on Python 2.6. Kept as a module-level name for
# backward compatibility; make_sloppy_codec does not need to branch on it.
PY26 = sys.version_info[:2] == (2, 6)


def make_sloppy_codec(encoding):
    """
    Take a codec name, and return a 'sloppy' version of that codec that can
    encode and decode the unassigned bytes in that encoding.

    Every byte that `encoding` assigns keeps its meaning. Every byte that it
    leaves unassigned is mapped to the Unicode character with the same
    number, which is what Latin-1 would do. The one deliberate exception is
    byte 0x1A, which is mapped to U+FFFD in both directions.

    Single-byte encodings in the standard library are defined using some
    boilerplate classes surrounding the functions that do the actual work,
    `codecs.charmap_decode` and `charmap_encode`. This function, given an
    encoding name, *defines* those boilerplate classes.

    `encoding` must name a single-byte encoding that Python knows. The return
    value is a `codecs.CodecInfo` whose name is 'sloppy-' followed by
    `encoding`.

    Raises LookupError if Python does not know `encoding`, and ValueError if
    `encoding` does not decode each of the 256 byte values to exactly one
    character (that is, if it is not a single-byte encoding).
    """
    # Make an array of all 256 possible bytes.
    all_bytes = bytearray(range(256))

    # Get a list of what they would decode to in Latin-1.
    sloppy_chars = list(all_bytes.decode('latin-1'))

    # Get a list of what they decode to in the given encoding. Use the
    # replacement character for unassigned bytes. The error handler is passed
    # positionally because that form works on every Python version this
    # module supports (including 2.6), so no version check is needed.
    decoded_chars = all_bytes.decode(encoding, 'replace')

    # The table below is indexed by byte value, so the decoded text has to
    # line up one character per byte. A multi-byte encoding would silently
    # produce a misaligned table; refuse it instead.
    if len(decoded_chars) != len(all_bytes):
        raise ValueError(
            '%r is not a single-byte encoding: %d bytes decoded to %d '
            'characters' % (encoding, len(all_bytes), len(decoded_chars))
        )

    # Update the sloppy_chars list. Each byte that was successfully decoded
    # gets its decoded value in the list. The unassigned bytes are left as
    # they are, which gives their decoding in Latin-1.
    for i, char in enumerate(decoded_chars):
        if char != REPLACEMENT_CHAR:
            sloppy_chars[i] = char

    # For ftfy's own purposes, we're going to allow byte 1A, the "Substitute"
    # control code, to encode the Unicode replacement character U+FFFD.
    sloppy_chars[0x1a] = REPLACEMENT_CHAR

    # Create the data structures that tell the charmap methods how to encode
    # and decode in this sloppy encoding.
    decoding_table = ''.join(sloppy_chars)
    encoding_table = codecs.charmap_build(decoding_table)

    # Now produce all the class boilerplate. Look at the Python source for
    # `encodings.cp1252` for comparison; this is almost exactly the same,
    # except that it follows pep8. The classes close over the two tables
    # built above.
    class Codec(codecs.Codec):
        def encode(self, input, errors='strict'):
            return codecs.charmap_encode(input, errors, encoding_table)

        def decode(self, input, errors='strict'):
            return codecs.charmap_decode(input, errors, decoding_table)

    class IncrementalEncoder(codecs.IncrementalEncoder):
        def encode(self, input, final=False):
            # charmap_encode returns (output, length consumed); the
            # incremental interface only wants the output.
            return codecs.charmap_encode(input, self.errors, encoding_table)[0]

    class IncrementalDecoder(codecs.IncrementalDecoder):
        def decode(self, input, final=False):
            # As above: keep the decoded text, drop the consumed length.
            return codecs.charmap_decode(input, self.errors, decoding_table)[0]

    class StreamWriter(Codec, codecs.StreamWriter):
        pass

    class StreamReader(Codec, codecs.StreamReader):
        pass

    return codecs.CodecInfo(
        name='sloppy-' + encoding,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
152
# The encodings that leave some byte values unassigned, listed first under
# their "windows-NNNN" and "iso-8859-N" names, then again under the "cpNNNN"
# spellings.
INCOMPLETE_ENCODINGS = ['windows-%s' % num for num in range(1250, 1259)]
INCOMPLETE_ENCODINGS += ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)]
INCOMPLETE_ENCODINGS += ['cp%s' % num for num in range(1250, 1259)]
INCOMPLETE_ENCODINGS += ['cp874']

# Build one sloppy codec per incomplete encoding, keyed by the normalized
# form of its "sloppy-<encoding>" name. The resulting CODECS dictionary can
# be used by the main module of ftfy.bad_codecs.
CODECS = {}
for _encoding in INCOMPLETE_ENCODINGS:
    _new_name = normalize_encoding('sloppy-' + _encoding)
    CODECS[_new_name] = make_sloppy_codec(_encoding)
165