1r"""
2`ftfy.bad_codecs.sloppy` provides character-map encodings that fill their "holes"
3in a messy but common way: by outputting the Unicode codepoints with the same
4numbers.
5
6This is incredibly ugly, and it's also in the HTML5 standard.
7
8A single-byte encoding maps each byte to a Unicode character, except that some
9bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
10example, bytes 0x81 and 0x8D, among others, have no meaning.
11
12Python, wanting to preserve some sense of decorum, will handle these bytes
13as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
14different from each other. It just hasn't defined what they are in terms of
15Unicode.
16
17Software that has to interoperate with Windows-1252 and Unicode -- such as all
18the common Web browsers -- will pick some Unicode characters for them to map
19to, and the characters they pick are the Unicode characters with the same
20numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
21resulting characters tend to fall into a range of Unicode that's set aside for
22obsolete Latin-1 control characters anyway.
23
These sloppy codecs let Python do the same thing, thus interoperating with
other software that works this way. This module defines a sloppy version of
many single-byte encodings with holes. (There is no need for a sloppy version
of an encoding without holes: for example, there is no such thing as
sloppy-iso-8859-2 or sloppy-macroman.)
29
30The following encodings will become defined:
31
32- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
33- sloppy-windows-1251 (Cyrillic)
34- sloppy-windows-1252 (Western European, based on Latin-1)
35- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
36- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
37- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
38- sloppy-windows-1256 (Arabic)
39- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
40- sloppy-windows-1258 (Vietnamese)
41- sloppy-cp874 (Thai, based on ISO-8859-11)
42- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
43- sloppy-iso-8859-6 (different Arabic)
44- sloppy-iso-8859-7 (Greek)
45- sloppy-iso-8859-8 (Hebrew)
46- sloppy-iso-8859-11 (Thai)
47
48Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
49defined.
50
51Five of these encodings (`sloppy-windows-1250` through `sloppy-windows-1254`)
52are used within ftfy.
53
54Here are some examples, using :func:`ftfy.explain_unicode` to illustrate how
55sloppy-windows-1252 merges Windows-1252 with Latin-1:
56
57    >>> from ftfy import explain_unicode
58    >>> some_bytes = b'\x80\x81\x82'
59    >>> explain_unicode(some_bytes.decode('latin-1'))
60    U+0080  \x80    [Cc] <unknown>
61    U+0081  \x81    [Cc] <unknown>
62    U+0082  \x82    [Cc] <unknown>
63
64    >>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
65    U+20AC  €       [Sc] EURO SIGN
66    U+FFFD  �       [So] REPLACEMENT CHARACTER
67    U+201A  ‚       [Ps] SINGLE LOW-9 QUOTATION MARK
68
69    >>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
70    U+20AC  €       [Sc] EURO SIGN
71    U+0081  \x81    [Cc] <unknown>
72    U+201A  ‚       [Ps] SINGLE LOW-9 QUOTATION MARK
73"""
74import codecs
75from encodings import normalize_encoding
76import sys
77
# U+FFFD is what Python's 'replace' error handler emits for an undecodable
# byte, so it doubles as the marker for "this byte is unassigned".
REPLACEMENT_CHAR = '\ufffd'

# No longer consulted by `make_sloppy_codec`; kept as a module-level name in
# case other code imports it.
PY26 = sys.version_info[:2] == (2, 6)


def make_sloppy_codec(encoding: str) -> codecs.CodecInfo:
    """
    Take a codec name, and return a 'sloppy' version of that codec that can
    encode and decode the unassigned bytes in that encoding.

    Single-byte encodings in the standard library are defined using some
    boilerplate classes surrounding the functions that do the actual work,
    `codecs.charmap_decode` and `charmap_encode`. This function, given an
    encoding name, *defines* those boilerplate classes.

    Every byte that `encoding` assigns keeps its usual meaning. Every byte it
    leaves unassigned maps to the Unicode codepoint with the same number (its
    Latin-1 decoding). As a special case, byte 0x1A maps to and from U+FFFD.

    :param encoding: name of an existing single-byte codec, such as
        ``'windows-1252'``.
    :returns: a `codecs.CodecInfo` whose name is ``'sloppy-' + encoding``.
    :raises LookupError: if Python does not know a codec called `encoding`.
    :raises ValueError: if `encoding` does not decode the 256 possible bytes
        to exactly 256 characters, i.e. it is not a single-byte encoding.
    """
    # Make a bytestring of all 256 possible bytes.
    all_bytes = bytes(range(256))

    # Decode them in the given encoding. The 'replace' handler turns each
    # unassigned byte into the replacement character.
    decoded_chars = all_bytes.decode(encoding, 'replace')

    # The table below is built position by position, so a decoding that is
    # not one character per byte would silently produce a misaligned table.
    if len(decoded_chars) != len(all_bytes):
        raise ValueError(
            '%r is not a single-byte encoding: 256 bytes decoded to %d '
            'characters' % (encoding, len(decoded_chars))
        )

    # Latin-1 maps byte N to codepoint N; that is the fallback for each byte
    # the real encoding leaves unassigned.
    latin1_chars = all_bytes.decode('latin-1')
    sloppy_chars = [
        fallback if char == REPLACEMENT_CHAR else char
        for char, fallback in zip(decoded_chars, latin1_chars)
    ]

    # For ftfy's own purposes, we're going to allow byte 1A, the "Substitute"
    # control code, to encode the Unicode replacement character U+FFFD.
    sloppy_chars[0x1a] = REPLACEMENT_CHAR

    # Create the data structures that tell the charmap methods how to encode
    # and decode in this sloppy encoding.
    decoding_table = ''.join(sloppy_chars)
    encoding_table = codecs.charmap_build(decoding_table)

    # Now produce all the class boilerplate. Look at the Python source for
    # `encodings.cp1252` for comparison; this is almost exactly the same,
    # except it follows pep8. The parameter name `input` shadows a builtin
    # but is kept because it is the name the `codecs` base classes use.
    class Codec(codecs.Codec):
        def encode(self, input, errors='strict'):
            return codecs.charmap_encode(input, errors, encoding_table)

        def decode(self, input, errors='strict'):
            return codecs.charmap_decode(input, errors, decoding_table)

    # The charmap functions return (result, length_consumed); the incremental
    # interfaces return only the result, hence the `[0]`.
    class IncrementalEncoder(codecs.IncrementalEncoder):
        def encode(self, input, final=False):
            return codecs.charmap_encode(input, self.errors, encoding_table)[0]

    class IncrementalDecoder(codecs.IncrementalDecoder):
        def decode(self, input, final=False):
            return codecs.charmap_decode(input, self.errors, decoding_table)[0]

    class StreamWriter(Codec, codecs.StreamWriter):
        pass

    class StreamReader(Codec, codecs.StreamReader):
        pass

    return codecs.CodecInfo(
        name='sloppy-' + encoding,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
154
155
# Build one sloppy codec per incomplete encoding (an encoding that leaves some
# bytes unassigned). The resulting CODECS dictionary can be used by the main
# module of ftfy.bad_codecs; its keys are the 'sloppy-...' names as normalized
# by `encodings.normalize_encoding`.
INCOMPLETE_ENCODINGS = []
INCOMPLETE_ENCODINGS.extend('windows-%s' % num for num in range(1250, 1259))
INCOMPLETE_ENCODINGS.extend('iso-8859-%s' % num for num in (3, 6, 7, 8, 11))
INCOMPLETE_ENCODINGS.extend('cp%s' % num for num in range(1250, 1259))
INCOMPLETE_ENCODINGS.append('cp874')

CODECS = {}
for _encoding in INCOMPLETE_ENCODINGS:
    _new_name = normalize_encoding('sloppy-' + _encoding)
    CODECS[_new_name] = make_sloppy_codec(_encoding)
168