# coding: utf-8
r"""
Decodes single-byte encodings, filling their "holes" in the same messy way that
everyone else does.

A single-byte encoding maps each byte to a Unicode character, except that some
bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
example, bytes 0x81 and 0x8D, among others, have no meaning.

Python, wanting to preserve some sense of decorum, will handle these bytes
as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
different from each other. It just hasn't defined what they are in terms of
Unicode.

Software that has to interoperate with Windows-1252 and Unicode -- such as all
the common Web browsers -- will pick some Unicode characters for them to map
to, and the characters they pick are the Unicode characters with the same
numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
resulting characters tend to fall into a range of Unicode that's set aside for
obsolete Latin-1 control characters anyway.

These sloppy codecs let Python do the same thing, thus interoperating with
other software that works this way. This module defines a sloppy version of
many single-byte encodings with holes. (There is no need for a sloppy version
of an encoding without holes: for example, there is no such thing as
sloppy-iso-8859-2 or sloppy-macroman.)

The following encodings will become defined:

- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
- sloppy-windows-1251 (Cyrillic)
- sloppy-windows-1252 (Western European, based on Latin-1)
- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
- sloppy-windows-1256 (Arabic)
- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
- sloppy-windows-1258 (Vietnamese)
- sloppy-cp874 (Thai, based on ISO-8859-11)
- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
- sloppy-iso-8859-6 (different Arabic)
- sloppy-iso-8859-7 (Greek)
- sloppy-iso-8859-8 (Hebrew)
- sloppy-iso-8859-11 (Thai)

Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
defined.

Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
the rest are rather uncommon.

Here are some examples, using `ftfy.explain_unicode` to illustrate how
sloppy-windows-1252 merges Windows-1252 with Latin-1:

    >>> from ftfy import explain_unicode
    >>> some_bytes = b'\x80\x81\x82'
    >>> explain_unicode(some_bytes.decode('latin-1'))
    U+0080 \x80 [Cc] <unknown>
    U+0081 \x81 [Cc] <unknown>
    U+0082 \x82 [Cc] <unknown>

    >>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
    U+20AC € [Sc] EURO SIGN
    U+FFFD � [So] REPLACEMENT CHARACTER
    U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK

    >>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
    U+20AC € [Sc] EURO SIGN
    U+0081 \x81 [Cc] <unknown>
    U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
"""
from __future__ import unicode_literals
import codecs
from encodings import normalize_encoding
import sys

REPLACEMENT_CHAR = '\ufffd'

# No longer consulted inside this module (the decode call below uses positional
# arguments, which every supported Python version accepts), but kept as a
# module-level name in case other code imports it.
PY26 = sys.version_info[:2] == (2, 6)


def make_sloppy_codec(encoding):
    """
    Take a codec name, and return a 'sloppy' version of that codec that can
    encode and decode the unassigned bytes in that encoding.

    Single-byte encodings in the standard library are defined using some
    boilerplate classes surrounding the functions that do the actual work,
    `codecs.charmap_decode` and `charmap_encode`. This function, given an
    encoding name, *defines* those boilerplate classes.

    `encoding` is the name of an existing single-byte codec, such as
    'windows-1252'. The return value is a `codecs.CodecInfo` whose name is
    'sloppy-' followed by `encoding`.

    Raises LookupError (from the codec machinery) if `encoding` is unknown,
    and ValueError if `encoding` does not decode the 256 possible bytes to
    exactly 256 characters, which means it is not a single-byte encoding and
    no byte-for-byte table can be built from it.
    """
    # Make an array of all 256 possible bytes.
    all_bytes = bytearray(range(256))

    # Get what they would decode to in Latin-1: byte N becomes U+00NN.
    latin1_chars = all_bytes.decode('latin-1')

    # Get what they decode to in the given encoding, with the replacement
    # character standing in for unassigned bytes. The error handler is passed
    # positionally because Python 2.6 rejects it as a keyword argument.
    decoded_chars = all_bytes.decode(encoding, 'replace')

    # The table below pairs characters with byte values by position, so a
    # multi-byte encoding (where the lengths differ) would silently produce a
    # scrambled codec. Refuse it instead.
    if len(decoded_chars) != len(all_bytes):
        raise ValueError(
            "%r is not a single-byte encoding: %d bytes decoded to %d "
            "characters" % (encoding, len(all_bytes), len(decoded_chars))
        )

    # Each byte that was successfully decoded keeps its decoded value. The
    # unassigned bytes fall back to their decoding in Latin-1.
    sloppy_chars = [
        fallback if char == REPLACEMENT_CHAR else char
        for char, fallback in zip(decoded_chars, latin1_chars)
    ]

    # For ftfy's own purposes, we're going to allow byte 1A, the "Substitute"
    # control code, to encode the Unicode replacement character U+FFFD.
    sloppy_chars[0x1a] = REPLACEMENT_CHAR

    # Create the data structures that tell the charmap methods how to encode
    # and decode in this sloppy encoding.
    decoding_table = ''.join(sloppy_chars)
    encoding_table = codecs.charmap_build(decoding_table)

    # Now produce all the class boilerplate. Look at the Python source for
    # `encodings.cp1252` for comparison; this is almost exactly the same,
    # except I made it follow pep8.
    class Codec(codecs.Codec):
        """Stateless encoder/decoder backed by the sloppy tables."""

        def encode(self, input, errors='strict'):
            return codecs.charmap_encode(input, errors, encoding_table)

        def decode(self, input, errors='strict'):
            return codecs.charmap_decode(input, errors, decoding_table)

    class IncrementalEncoder(codecs.IncrementalEncoder):
        """Incremental encoder; each chunk is encoded independently."""

        def encode(self, input, final=False):
            return codecs.charmap_encode(input, self.errors, encoding_table)[0]

    class IncrementalDecoder(codecs.IncrementalDecoder):
        """Incremental decoder; each chunk is decoded independently."""

        def decode(self, input, final=False):
            return codecs.charmap_decode(input, self.errors, decoding_table)[0]

    class StreamWriter(Codec, codecs.StreamWriter):
        pass

    class StreamReader(Codec, codecs.StreamReader):
        pass

    return codecs.CodecInfo(
        name='sloppy-' + encoding,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )


# Define a codec for each incomplete encoding. The resulting CODECS dictionary
# can be used by the main module of ftfy.bad_codecs.
CODECS = {}
INCOMPLETE_ENCODINGS = (
    ['windows-%s' % num for num in range(1250, 1259)] +
    ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)] +
    ['cp%s' % num for num in range(1250, 1259)] + ['cp874']
)

# A plain loop rather than a dict comprehension, because this module still
# carries Python 2.6 support and 2.6 has no dict comprehensions.
for _encoding in INCOMPLETE_ENCODINGS:
    _new_name = normalize_encoding('sloppy-' + _encoding)
    CODECS[_new_name] = make_sloppy_codec(_encoding)