1# coding: utf-8
2"""
3
4    webencodings
5    ~~~~~~~~~~~~
6
    This is a Python implementation of the `WHATWG Encoding standard
    <http://encoding.spec.whatwg.org/>`_. See README for details.
9
10    :copyright: Copyright 2012 by Simon Sapin
11    :license: BSD, see LICENSE for details.
12
13"""
14
15from __future__ import unicode_literals
16
17import codecs
18
19from .labels import LABELS
20
21
# Version of this package.
VERSION = '0.5.1'


# Some encoding names from the Encoding standard are not valid Python codec
# aliases. Map each of those to the Python codec name to look up instead.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874',
}

# Canonical encoding name -> Encoding object, filled in lazily by lookup().
CACHE = {}
33
34
def ascii_lower(string):
    r"""Lower-case the ASCII letters of a string, and nothing else.

    :param string: An Unicode string.
    :returns: A new Unicode string in which A-Z is mapped to a-z.

    Encoding labels are compared with `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching; this function does the case folding for it.
    `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_
    are matched the same way, among other things.

    The :meth:`~py:str.lower` method of Unicode strings is not suitable:
    it also changes non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    """
    # Lower-casing a byte string only affects ASCII letters, so go through
    # UTF-8. This turns out to be faster than unicode.translate().
    utf8_bytes = string.encode('utf8')
    return utf8_bytes.lower().decode('utf8')
59
60
def lookup(label):
    """
    Find the encoding that a label refers to.

    This is the `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm
    of the Encoding standard, which also lists the supported labels.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # Only ASCII whitespace is stripped: U+0009, U+000A, U+000C, U+000D,
    # and U+0020. Labels are then matched ASCII case-insensitively.
    ascii_whitespace = '\t\n\f\r '
    normalized = ascii_lower(label.strip(ascii_whitespace))
    name = LABELS.get(normalized)
    if name is None:
        return None

    cached = CACHE.get(name)
    if cached is not None:
        return cached

    # First lookup of this encoding: build the object and remember it.
    if name == 'x-user-defined':
        # This codec is provided by a module of this package.
        from .x_user_defined import codec_info
    else:
        # Once remapped, any name that gets to here should be valid.
        codec_info = codecs.lookup(PYTHON_NAMES.get(name, name))
    encoding = Encoding(name, codec_info)
    CACHE[name] = encoding
    return encoding
89
90
91def _get_encoding(encoding_or_label):
92    """
93    Accept either an encoding object or label.
94
95    :param encoding: An :class:`Encoding` object or a label string.
96    :returns: An :class:`Encoding` object.
97    :raises: :exc:`~exceptions.LookupError` for an unknown label.
98
99    """
100    if hasattr(encoding_or_label, 'codec_info'):
101        return encoding_or_label
102
103    encoding = lookup(encoding_or_label)
104    if encoding is None:
105        raise LookupError('Unknown encoding label: %r' % encoding_or_label)
106    return encoding
107
108
class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The object that actually implements the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """
    def __init__(self, name, codec_info):
        self.codec_info = codec_info
        self.name = name

    def __repr__(self):
        template = '<Encoding %s>'
        return template % self.name
130
131
#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')

# The two UTF-16 byte orders, looked up once at import time. These are
# what `_detect_bom` returns when the input starts with a UTF-16 BOM.
_UTF16LE, _UTF16BE = lookup('utf-16le'), lookup('utf-16be')
137
138
def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single byte string, giving precedence to a BOM if it has one.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of an Unicode string
        and an :obj:`Encoding`.

    """
    # Resolve the fallback before looking at the input, so that an invalid
    # label fails early even when a BOM ends up deciding the encoding.
    fallback = _get_encoding(fallback_encoding)
    detected, payload = _detect_bom(input)
    chosen = detected if detected else fallback
    # The codec returns (text, length consumed); only the text is needed.
    decoded = chosen.codec_info.decode(payload, errors)
    return decoded[0], chosen
159
160
161def _detect_bom(input):
162    """Return (bom_encoding, input), with any BOM removed from the input."""
163    if input.startswith(b'\xFF\xFE'):
164        return _UTF16LE, input[2:]
165    if input.startswith(b'\xFE\xFF'):
166        return _UTF16BE, input[2:]
167    if input.startswith(b'\xEF\xBB\xBF'):
168        return UTF8, input[3:]
169    return None, input
170
171
def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single Unicode string into bytes.

    :param input: An Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    codec_info = _get_encoding(encoding).codec_info
    # The codec returns (bytes, length consumed); only the bytes are needed.
    encoded = codec_info.encode(input, errors)
    return encoded[0]
184
185
def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    incremental = IncrementalDecoder(fallback_encoding, errors)
    chunks = _iter_decode_generator(input, incremental)
    # The first item of the generator is the encoding in use.
    # Everything after that is decoded output, left for the caller to pull.
    encoding = next(chunks)
    return chunks, encoding
212
213
214def _iter_decode_generator(input, decoder):
215    """Return a generator that first yields the :obj:`Encoding`,
216    then yields output chukns as Unicode strings.
217
218    """
219    decode = decoder.decode
220    input = iter(input)
221    for chunck in input:
222        output = decode(chunck)
223        if output:
224            assert decoder.encoding is not None
225            yield decoder.encoding
226            yield output
227            break
228    else:
229        # Input exhausted without determining the encoding
230        output = decode(b'', final=True)
231        assert decoder.encoding is not None
232        yield decoder.encoding
233        if output:
234            yield output
235        return
236
237    for chunck in input:
238        output = decode(chunck)
239        if output:
240            yield output
241    output = decode(b'', final=True)
242    if output:
243        yield output
244
245
def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    "Pull"-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # Build the encoder here rather than inside the generator,
    # so that an invalid label fails early instead of on first iteration.
    incremental = IncrementalEncoder(encoding, errors)
    return _iter_encode_generator(input, incremental.encode)
260
261
262def _iter_encode_generator(input, encode):
263    for chunck in input:
264        output = encode(chunck)
265        if output:
266            yield output
267    output = encode('', final=True)
268    if output:
269        yield output
270
271
class IncrementalDecoder(object):
    """
    "Push"-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """
    def __init__(self, fallback_encoding, errors='replace'):
        # Resolve the label right away so that an invalid one fails early.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Input received so far while the encoding is still undetermined.
        self._buffer = b''
        # ``decode`` method of the codec's incremental decoder,
        # set once the encoding has been determined.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (Ie. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: An Unicode string.

        """
        # Encoding already determined: hand over to the codec's decoder.
        if self._decoder is not None:
            return self._decoder(input, final)

        pending = self._buffer + input
        encoding, pending = _detect_bom(pending)
        if encoding is None:
            if len(pending) < 3 and not final:
                # Fewer than three bytes might still be the start of a BOM:
                # keep them and wait for more data.
                self._buffer = pending
                return ''
            # No BOM
            encoding = self._fallback_encoding

        incremental = encoding.codec_info.incrementaldecoder(self._errors)
        decode_chunk = incremental.decode
        self._decoder = decode_chunk
        self.encoding = encoding
        return decode_chunk(pending, final)
321
322
class IncrementalEncoder(object):
    """
    "Push"-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: An Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """
    def __init__(self, encoding=UTF8, errors='strict'):
        codec_info = _get_encoding(encoding).codec_info
        # ``encode`` is the bound method of the codec's own incremental
        # encoder, exposed directly as an instance attribute.
        incremental = codec_info.incrementalencoder(errors)
        self.encode = incremental.encode
343