r"""
`ftfy.bad_codecs.sloppy` provides character-map encodings that fill their "holes"
in a messy but common way: by outputting the Unicode codepoints with the same
numbers.

This is incredibly ugly, and it's also in the HTML5 standard.

A single-byte encoding maps each byte to a Unicode character, except that some
bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
example, bytes 0x81 and 0x8D, among others, have no meaning.

Python, wanting to preserve some sense of decorum, will handle these bytes
as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
different from each other. It just hasn't defined what they are in terms of
Unicode.

Software that has to interoperate with Windows-1252 and Unicode -- such as all
the common Web browsers -- will pick some Unicode characters for them to map
to, and the characters they pick are the Unicode characters with the same
numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
resulting characters tend to fall into a range of Unicode that's set aside for
obsolete Latin-1 control characters anyway.

These sloppy codecs let Python do the same thing, thus interoperating with
other software that works this way. This module defines a sloppy version of
many single-byte encodings with holes. (There is no need for a sloppy version
of an encoding without holes: for example, there is no such thing as
sloppy-iso-8859-2 or sloppy-macroman.)

The following encodings will become defined:

- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
- sloppy-windows-1251 (Cyrillic)
- sloppy-windows-1252 (Western European, based on Latin-1)
- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
- sloppy-windows-1256 (Arabic)
- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
- sloppy-windows-1258 (Vietnamese)
- sloppy-cp874 (Thai, based on ISO-8859-11)
- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
- sloppy-iso-8859-6 (different Arabic)
- sloppy-iso-8859-7 (Greek)
- sloppy-iso-8859-8 (Hebrew)
- sloppy-iso-8859-11 (Thai)

Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
defined.

Five of these encodings (`sloppy-windows-1250` through `sloppy-windows-1254`)
are used within ftfy.

Here are some examples, using :func:`ftfy.explain_unicode` to illustrate how
sloppy-windows-1252 merges Windows-1252 with Latin-1:

    >>> from ftfy import explain_unicode
    >>> some_bytes = b'\x80\x81\x82'
    >>> explain_unicode(some_bytes.decode('latin-1'))
    U+0080 \x80 [Cc] <unknown>
    U+0081 \x81 [Cc] <unknown>
    U+0082 \x82 [Cc] <unknown>

    >>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
    U+20AC € [Sc] EURO SIGN
    U+FFFD � [So] REPLACEMENT CHARACTER
    U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK

    >>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
    U+20AC € [Sc] EURO SIGN
    U+0081 \x81 [Cc] <unknown>
    U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
"""
import codecs
from encodings import normalize_encoding
import sys

REPLACEMENT_CHAR = '\ufffd'

# Kept only so that existing imports of this name keep working. The code below
# already relies on Python 3 semantics (`bytes(range(256))`), so this flag is
# no longer consulted.
PY26 = sys.version_info[:2] == (2, 6)


def make_sloppy_codec(encoding):
    """
    Take a codec name, and return a 'sloppy' version of that codec that can
    encode and decode the unassigned bytes in that encoding.

    Single-byte encodings in the standard library are defined using some
    boilerplate classes surrounding the functions that do the actual work,
    `codecs.charmap_decode` and `charmap_encode`. This function, given an
    encoding name, *defines* those boilerplate classes.

    Args:
        encoding: The name of an existing single-byte encoding, such as
            'windows-1252'.

    Returns:
        A `codecs.CodecInfo` named 'sloppy-' + `encoding`. Bytes that the
        base encoding leaves unassigned map to the Unicode codepoints with
        the same numbers (as in Latin-1), and byte 0x1A maps to U+FFFD.

    Raises:
        LookupError: If `encoding` is not a known text encoding.
        ValueError: If `encoding` does not decode each of the 256 possible
            bytes to exactly one character, in which case no byte-to-character
            table can be built from it.
    """
    # Make a bytestring of all 256 possible bytes.
    all_bytes = bytes(range(256))

    # Get a list of what they would decode to in Latin-1.
    sloppy_chars = list(all_bytes.decode('latin-1'))

    # Get a string of what they decode to in the given encoding. Use the
    # replacement character for unassigned bytes.
    decoded_chars = all_bytes.decode(encoding, errors='replace')

    # The loop below pairs the i-th decoded character with byte value i. That
    # is only meaningful when every byte produced exactly one character, so
    # refuse encodings (such as UTF-16) where it did not, instead of quietly
    # building a misaligned table.
    if len(decoded_chars) != len(all_bytes):
        raise ValueError(
            "%r is not a single-byte encoding: decoding %d bytes produced "
            "%d characters" % (encoding, len(all_bytes), len(decoded_chars))
        )

    # Update the sloppy_chars list. Each byte that was successfully decoded
    # gets its decoded value in the list. The unassigned bytes are left as
    # they are, which gives their decoding in Latin-1.
    for i, char in enumerate(decoded_chars):
        if char != REPLACEMENT_CHAR:
            sloppy_chars[i] = char

    # For ftfy's own purposes, we're going to allow byte 1A, the "Substitute"
    # control code, to encode the Unicode replacement character U+FFFD.
    sloppy_chars[0x1a] = REPLACEMENT_CHAR

    # Create the data structures that tell the charmap methods how to encode
    # and decode in this sloppy encoding.
    decoding_table = ''.join(sloppy_chars)
    encoding_table = codecs.charmap_build(decoding_table)

    # Now produce all the class boilerplate. Look at the Python source for
    # `encodings.cp1252` for comparison; this is almost exactly the same,
    # except that it follows pep8.
    class Codec(codecs.Codec):
        def encode(self, input, errors='strict'):
            return codecs.charmap_encode(input, errors, encoding_table)

        def decode(self, input, errors='strict'):
            return codecs.charmap_decode(input, errors, decoding_table)

    class IncrementalEncoder(codecs.IncrementalEncoder):
        def encode(self, input, final=False):
            return codecs.charmap_encode(input, self.errors, encoding_table)[0]

    class IncrementalDecoder(codecs.IncrementalDecoder):
        def decode(self, input, final=False):
            return codecs.charmap_decode(input, self.errors, decoding_table)[0]

    class StreamWriter(Codec, codecs.StreamWriter):
        pass

    class StreamReader(Codec, codecs.StreamReader):
        pass

    # The Codec class keeps no state, so one instance can supply both of the
    # stateless entry points.
    stateless_codec = Codec()
    return codecs.CodecInfo(
        name='sloppy-' + encoding,
        encode=stateless_codec.encode,
        decode=stateless_codec.decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )


# Define a codec for each incomplete encoding. The resulting CODECS dictionary
# can be used by the main module of ftfy.bad_codecs.
CODECS = {}
INCOMPLETE_ENCODINGS = (
    ['windows-%s' % num for num in range(1250, 1259)] +
    ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)] +
    ['cp%s' % num for num in range(1250, 1259)] + ['cp874']
)

for _encoding in INCOMPLETE_ENCODINGS:
    # Keys are in the normalized form produced by `normalize_encoding`.
    _new_name = normalize_encoding('sloppy-' + _encoding)
    CODECS[_new_name] = make_sloppy_codec(_encoding)