1# coding: utf-8 2""" 3 4 webencodings 5 ~~~~~~~~~~~~ 6 7 This is a Python implementation of the `WHATWG Encoding standard 8 <http://encoding.spec.whatwg.org/>`. See README for details. 9 10 :copyright: Copyright 2012 by Simon Sapin 11 :license: BSD, see LICENSE for details. 12 13""" 14 15from __future__ import unicode_literals 16 17import codecs 18 19from .labels import LABELS 20 21 22VERSION = '0.5.1' 23 24 25# Some names in Encoding are not valid Python aliases. Remap these. 26PYTHON_NAMES = { 27 'iso-8859-8-i': 'iso-8859-8', 28 'x-mac-cyrillic': 'mac-cyrillic', 29 'macintosh': 'mac-roman', 30 'windows-874': 'cp874'} 31 32CACHE = {} 33 34 35def ascii_lower(string): 36 r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z. 37 38 :param string: An Unicode string. 39 :returns: A new Unicode string. 40 41 This is used for `ASCII case-insensitive 42 <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_ 43 matching of encoding labels. 44 The same matching is also used, among other things, 45 for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_. 46 47 This is different from the :meth:`~py:str.lower` method of Unicode strings 48 which also affect non-ASCII characters, 49 sometimes mapping them into the ASCII range: 50 51 >>> keyword = u'Bac\N{KELVIN SIGN}ground' 52 >>> assert keyword.lower() == u'background' 53 >>> assert ascii_lower(keyword) != keyword.lower() 54 >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground' 55 56 """ 57 # This turns out to be faster than unicode.translate() 58 return string.encode('utf8').lower().decode('utf8') 59 60 61def lookup(label): 62 """ 63 Look for an encoding by its label. 64 This is the spec’s `get an encoding 65 <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm. 66 Supported labels are listed there. 67 68 :param label: A string. 69 :returns: 70 An :class:`Encoding` object, or :obj:`None` for an unknown label. 71 72 """ 73 # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020. 74 label = ascii_lower(label.strip('\t\n\f\r ')) 75 name = LABELS.get(label) 76 if name is None: 77 return None 78 encoding = CACHE.get(name) 79 if encoding is None: 80 if name == 'x-user-defined': 81 from .x_user_defined import codec_info 82 else: 83 python_name = PYTHON_NAMES.get(name, name) 84 # Any python_name value that gets to here should be valid. 85 codec_info = codecs.lookup(python_name) 86 encoding = Encoding(name, codec_info) 87 CACHE[name] = encoding 88 return encoding 89 90 91def _get_encoding(encoding_or_label): 92 """ 93 Accept either an encoding object or label. 94 95 :param encoding: An :class:`Encoding` object or a label string. 96 :returns: An :class:`Encoding` object. 97 :raises: :exc:`~exceptions.LookupError` for an unknown label. 98 99 """ 100 if hasattr(encoding_or_label, 'codec_info'): 101 return encoding_or_label 102 103 encoding = lookup(encoding_or_label) 104 if encoding is None: 105 raise LookupError('Unknown encoding label: %r' % encoding_or_label) 106 return encoding 107 108 109class Encoding(object): 110 """Reresents a character encoding such as UTF-8, 111 that can be used for decoding or encoding. 112 113 .. attribute:: name 114 115 Canonical name of the encoding 116 117 .. attribute:: codec_info 118 119 The actual implementation of the encoding, 120 a stdlib :class:`~codecs.CodecInfo` object. 121 See :func:`codecs.register`. 122 123 """ 124 def __init__(self, name, codec_info): 125 self.name = name 126 self.codec_info = codec_info 127 128 def __repr__(self): 129 return '<Encoding %s>' % self.name 130 131 132#: The UTF-8 encoding. Should be used for new content and formats. 133UTF8 = lookup('utf-8') 134 135_UTF16LE = lookup('utf-16le') 136_UTF16BE = lookup('utf-16be') 137 138 139def decode(input, fallback_encoding, errors='replace'): 140 """ 141 Decode a single string. 142 143 :param input: A byte string 144 :param fallback_encoding: 145 An :class:`Encoding` object or a label string. 146 The encoding to use if :obj:`input` does note have a BOM. 147 :param errors: Type of error handling. See :func:`codecs.register`. 148 :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 149 :return: 150 A ``(output, encoding)`` tuple of an Unicode string 151 and an :obj:`Encoding`. 152 153 """ 154 # Fail early if `encoding` is an invalid label. 155 fallback_encoding = _get_encoding(fallback_encoding) 156 bom_encoding, input = _detect_bom(input) 157 encoding = bom_encoding or fallback_encoding 158 return encoding.codec_info.decode(input, errors)[0], encoding 159 160 161def _detect_bom(input): 162 """Return (bom_encoding, input), with any BOM removed from the input.""" 163 if input.startswith(b'\xFF\xFE'): 164 return _UTF16LE, input[2:] 165 if input.startswith(b'\xFE\xFF'): 166 return _UTF16BE, input[2:] 167 if input.startswith(b'\xEF\xBB\xBF'): 168 return UTF8, input[3:] 169 return None, input 170 171 172def encode(input, encoding=UTF8, errors='strict'): 173 """ 174 Encode a single string. 175 176 :param input: An Unicode string. 177 :param encoding: An :class:`Encoding` object or a label string. 178 :param errors: Type of error handling. See :func:`codecs.register`. 179 :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 180 :return: A byte string. 181 182 """ 183 return _get_encoding(encoding).codec_info.encode(input, errors)[0] 184 185 186def iter_decode(input, fallback_encoding, errors='replace'): 187 """ 188 "Pull"-based decoder. 189 190 :param input: 191 An iterable of byte strings. 192 193 The input is first consumed just enough to determine the encoding 194 based on the precense of a BOM, 195 then consumed on demand when the return value is. 196 :param fallback_encoding: 197 An :class:`Encoding` object or a label string. 198 The encoding to use if :obj:`input` does note have a BOM. 199 :param errors: Type of error handling. See :func:`codecs.register`. 200 :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 201 :returns: 202 An ``(output, encoding)`` tuple. 203 :obj:`output` is an iterable of Unicode strings, 204 :obj:`encoding` is the :obj:`Encoding` that is being used. 205 206 """ 207 208 decoder = IncrementalDecoder(fallback_encoding, errors) 209 generator = _iter_decode_generator(input, decoder) 210 encoding = next(generator) 211 return generator, encoding 212 213 214def _iter_decode_generator(input, decoder): 215 """Return a generator that first yields the :obj:`Encoding`, 216 then yields output chukns as Unicode strings. 217 218 """ 219 decode = decoder.decode 220 input = iter(input) 221 for chunck in input: 222 output = decode(chunck) 223 if output: 224 assert decoder.encoding is not None 225 yield decoder.encoding 226 yield output 227 break 228 else: 229 # Input exhausted without determining the encoding 230 output = decode(b'', final=True) 231 assert decoder.encoding is not None 232 yield decoder.encoding 233 if output: 234 yield output 235 return 236 237 for chunck in input: 238 output = decode(chunck) 239 if output: 240 yield output 241 output = decode(b'', final=True) 242 if output: 243 yield output 244 245 246def iter_encode(input, encoding=UTF8, errors='strict'): 247 """ 248 “Pull”-based encoder. 249 250 :param input: An iterable of Unicode strings. 251 :param encoding: An :class:`Encoding` object or a label string. 252 :param errors: Type of error handling. See :func:`codecs.register`. 253 :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 254 :returns: An iterable of byte strings. 255 256 """ 257 # Fail early if `encoding` is an invalid label. 258 encode = IncrementalEncoder(encoding, errors).encode 259 return _iter_encode_generator(input, encode) 260 261 262def _iter_encode_generator(input, encode): 263 for chunck in input: 264 output = encode(chunck) 265 if output: 266 yield output 267 output = encode('', final=True) 268 if output: 269 yield output 270 271 272class IncrementalDecoder(object): 273 """ 274 “Push”-based decoder. 275 276 :param fallback_encoding: 277 An :class:`Encoding` object or a label string. 278 The encoding to use if :obj:`input` does note have a BOM. 279 :param errors: Type of error handling. See :func:`codecs.register`. 280 :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 281 282 """ 283 def __init__(self, fallback_encoding, errors='replace'): 284 # Fail early if `encoding` is an invalid label. 285 self._fallback_encoding = _get_encoding(fallback_encoding) 286 self._errors = errors 287 self._buffer = b'' 288 self._decoder = None 289 #: The actual :class:`Encoding` that is being used, 290 #: or :obj:`None` if that is not determined yet. 291 #: (Ie. if there is not enough input yet to determine 292 #: if there is a BOM.) 293 self.encoding = None # Not known yet. 294 295 def decode(self, input, final=False): 296 """Decode one chunk of the input. 297 298 :param input: A byte string. 299 :param final: 300 Indicate that no more input is available. 301 Must be :obj:`True` if this is the last call. 302 :returns: An Unicode string. 303 304 """ 305 decoder = self._decoder 306 if decoder is not None: 307 return decoder(input, final) 308 309 input = self._buffer + input 310 encoding, input = _detect_bom(input) 311 if encoding is None: 312 if len(input) < 3 and not final: # Not enough data yet. 313 self._buffer = input 314 return '' 315 else: # No BOM 316 encoding = self._fallback_encoding 317 decoder = encoding.codec_info.incrementaldecoder(self._errors).decode 318 self._decoder = decoder 319 self.encoding = encoding 320 return decoder(input, final) 321 322 323class IncrementalEncoder(object): 324 """ 325 “Push”-based encoder. 326 327 :param encoding: An :class:`Encoding` object or a label string. 328 :param errors: Type of error handling. See :func:`codecs.register`. 329 :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. 330 331 .. method:: encode(input, final=False) 332 333 :param input: An Unicode string. 334 :param final: 335 Indicate that no more input is available. 336 Must be :obj:`True` if this is the last call. 337 :returns: A byte string. 338 339 """ 340 def __init__(self, encoding=UTF8, errors='strict'): 341 encoding = _get_encoding(encoding) 342 self.encode = encoding.codec_info.incrementalencoder(errors).encode 343