1# coding: utf-8
2"""
3
4    webencodings.tests
5    ~~~~~~~~~~~~~~~~~~
6
7    A basic test suite for Encoding.
8
9    :copyright: Copyright 2012 by Simon Sapin
10    :license: BSD, see LICENSE for details.
11
12"""
13
14from __future__ import unicode_literals
15
16from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
17               IncrementalDecoder, IncrementalEncoder, UTF8)
18
19
def assert_raises(exception, function, *args, **kwargs):
    """Check that calling ``function`` raises ``exception``.

    ``function`` is called with ``*args`` and ``**kwargs``.  Returns ``None``
    when ``exception`` is raised by the call.  Raises ``AssertionError`` when
    the call completes without raising.  Exceptions that do not match
    ``exception`` are not caught and propagate to the caller.
    """
    raised = False
    try:
        function(*args, **kwargs)
    except exception:
        raised = True
    if not raised:  # pragma: no cover
        raise AssertionError('Did not raise %s.' % exception)
27
28
def test_labels():
    """Check how ``lookup`` normalises encoding labels.

    The assertions pin the following expectations:

    * matching ignores ASCII case and surrounding ASCII whitespace;
    * labels that are not in the label table (Python-only aliases such as
      'u8', or 'latin-1') give ``None``;
    * non-ASCII whitespace and non-ASCII letters are not normalised, so such
      labels give ``None`` as well;
    * 'US-ASCII', 'iso-8859-1' and 'latin1' all resolve to 'windows-1252'.
    """
    assert lookup('utf-8').name == 'utf-8'
    assert lookup('Utf-8').name == 'utf-8'
    assert lookup('UTF-8').name == 'utf-8'
    assert lookup('utf8').name == 'utf-8'
    assert lookup('UTF8').name == 'utf-8'
    assert lookup('utf8 ').name == 'utf-8'
    assert lookup(' \r\nutf8\t').name == 'utf-8'
    assert lookup('u8') is None  # Python label.
    # U+00A0 NO-BREAK SPACE is spelled as an escape: written literally it is
    # indistinguishable from an ASCII space and is easily replaced by one,
    # which would turn this into a label that must be accepted.
    assert lookup('utf-8\u00A0') is None  # Non-ASCII white space.

    assert lookup('US-ASCII').name == 'windows-1252'
    assert lookup('iso-8859-1').name == 'windows-1252'
    assert lookup('latin1').name == 'windows-1252'
    assert lookup('LATIN1').name == 'windows-1252'
    assert lookup('latin-1') is None
    assert lookup('LATİN1') is None  # ASCII-only case insensitivity.
46
47
def test_all_labels():
    """Smoke-test every label in ``LABELS`` with empty input.

    For each label, the one-shot, iterator-based and incremental APIs are
    called with empty bytes / empty text and must produce empty output.
    Finally, every encoding name found among the values of ``LABELS`` must
    itself be accepted by ``lookup`` and map to the same name.
    """
    no_bytes = b''
    no_text = ''
    for label in LABELS:
        # One-shot API.
        assert decode(no_bytes, label) == (no_text, lookup(label))
        assert encode(no_text, label) == no_bytes

        # Iterator API, fed zero, one and several empty chunks.
        for chunk_count in (0, 1, 12):
            decoded_chunks, _ = iter_decode([no_bytes] * chunk_count, label)
            assert list(decoded_chunks) == []
            encoded_chunks = iter_encode([no_text] * chunk_count, label)
            assert list(encoded_chunks) == []

        # Incremental API, both mid-stream and final calls.
        incremental_decoder = IncrementalDecoder(label)
        assert incremental_decoder.decode(no_bytes) == no_text
        assert incremental_decoder.decode(no_bytes, final=True) == no_text
        incremental_encoder = IncrementalEncoder(label)
        assert incremental_encoder.encode(no_text) == no_bytes
        assert incremental_encoder.encode(no_text, final=True) == no_bytes

    # All encoding names are valid labels too:
    canonical_names = set(LABELS.values())
    for canonical_name in canonical_names:
        assert lookup(canonical_name).name == canonical_name
65
66
def test_invalid_label():
    """Check that every entry point raises ``LookupError`` for an unknown label."""
    unknown_label = 'invalid'

    # Functions taking (input, label).
    calls_with_input = [
        (decode, b'\xEF\xBB\xBF\xc3\xa9'),
        (encode, 'é'),
        (iter_decode, []),
        (iter_encode, []),
    ]
    for function, payload in calls_with_input:
        assert_raises(LookupError, function, payload, unknown_label)

    # Classes constructed from the label alone.
    for incremental_class in (IncrementalDecoder, IncrementalEncoder):
        assert_raises(LookupError, incremental_class, unknown_label)
74
75
def test_decode():
    """Check ``decode`` with label and object fallbacks, with and without a BOM.

    Each assertion compares the ``(text, encoding)`` pair returned by
    ``decode`` against the expected text and the encoding expected to have
    been used.  When the input starts with a UTF-8 or UTF-16 byte order mark,
    the expected encoding is the one announced by the mark rather than the
    fallback passed in.
    """
    # Fallback given as a label or as an encoding object.
    assert decode(b'\x80', 'latin1') == ('€', lookup('latin1'))
    assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1'))
    assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8'))
    assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8'))
    # No BOM, so the single-byte fallback is used and each of the two bytes
    # becomes one character (U+00C3, U+00A9).  The expected text is spelled
    # with escapes so that it cannot be mistaken for, or "repaired" into,
    # a single U+00E9.
    assert decode(b'\xc3\xa9', 'ascii') == ('\u00C3\u00A9', lookup('ascii'))
    assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8'))  # UTF-8 with BOM

    assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be'))  # UTF-16-BE with BOM
    assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le'))  # UTF-16-LE with BOM
    # Same BOMs followed by the two payload bytes in the opposite order.
    assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be'))
    assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le'))

    # Explicit UTF-16 labels without a BOM; 'UTF-16' is expected to behave
    # like 'utf-16le'.
    assert decode(b'\x00\xe9', 'UTF-16BE') == ('é', lookup('utf-16be'))
    assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le'))
    assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le'))

    assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be'))
    assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le'))
    assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))
96
97
def test_encode():
    """Check ``encode`` for single-byte, UTF-8 and UTF-16 targets.

    The target encoding is given as a label in every case but one, which
    passes the ``UTF8`` encoding object (mirroring ``test_decode``).
    'utf-16' is expected to produce the same bytes as 'utf-16le', with no
    byte order mark.
    """
    assert encode('é', 'latin1') == b'\xe9'
    assert encode('é', 'utf8') == b'\xc3\xa9'
    # Encoding object instead of a label.
    assert encode('é', UTF8) == b'\xc3\xa9'
    assert encode('é', 'utf-16') == b'\xe9\x00'
    assert encode('é', 'utf-16le') == b'\xe9\x00'
    assert encode('é', 'utf-16be') == b'\x00\xe9'
105
106
def test_iter_decode():
    """Check ``iter_decode`` on chunked input, with and without a BOM.

    The chunks are split at various positions, including in the middle of a
    byte order mark and in the middle of a multi-byte sequence, and empty
    chunks are interleaved.  Only the joined decoded text is compared; the
    encoding returned by ``iter_decode`` is ignored.
    """
    def iter_decode_to_string(chunks, fallback_encoding):
        # Join the decoded pieces; the detected encoding is not needed here.
        output, _encoding = iter_decode(chunks, fallback_encoding)
        return ''.join(output)

    # Plain single-byte input with the fallback encoding.
    assert iter_decode_to_string([], 'latin1') == ''
    assert iter_decode_to_string([b''], 'latin1') == ''
    assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é'
    assert iter_decode_to_string([b'hello'], 'latin1') == 'hello'
    assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello'
    assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello'
    # No BOM: the two bytes are decoded with the single-byte fallback, one
    # character per byte (U+00C3, U+00A9).  Escapes keep the expected text
    # from being mistaken for, or "repaired" into, a single U+00E9.
    assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == '\u00C3\u00A9'

    # UTF-8 BOM, whole or split across chunks.
    assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é'
    # Input ends in the middle of a UTF-8 sequence.
    assert iter_decode_to_string([
        b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD'
    assert iter_decode_to_string([
        b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é'
    assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == ''
    # An incomplete BOM is ordinary data for the fallback encoding
    # (U+00EF, U+00BB).
    assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == '\u00EF\u00BB'

    # UTF-16 BOMs, whole or split across chunks.
    assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é'
    assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é'

    assert iter_decode_to_string([
        b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo'
133
134
def test_iter_encode():
    """Check ``iter_encode`` by joining its output for chunked text input.

    Each case lists the text chunks fed in, the target encoding label and
    the bytes expected once all encoded pieces are concatenated.
    """
    cases = [
        ([], 'latin1', b''),
        ([''], 'latin1', b''),
        (['é'], 'latin1', b'\xe9'),
        (['', 'é', '', ''], 'latin1', b'\xe9'),
        (['', 'é', '', ''], 'utf-16', b'\xe9\x00'),
        (['', 'é', '', ''], 'utf-16le', b'\xe9\x00'),
        (['', 'é', '', ''], 'utf-16be', b'\x00\xe9'),
        (['', 'h\uF7E9', '', 'llo'], 'x-user-defined', b'h\xe9llo'),
    ]
    for text_chunks, label, expected_bytes in cases:
        encoded = b''.join(iter_encode(text_chunks, label))
        assert encoded == expected_bytes
145
146
def test_x_user_defined():
    """Round-trip samples through the 'x-user-defined' encoding.

    Two samples are checked in both directions: one mixing ASCII bytes with
    bytes >= 0x80 (expected as code points in the U+F7xx range), and one
    that is plain ASCII.  Previously the first sample was overwritten by the
    second before any assertion ran, so it was never exercised.
    """
    samples = [
        (b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca',
         '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca'),
        (b'aa', 'aa'),
    ]
    for encoded, decoded in samples:
        assert decode(encoded, 'x-user-defined') == (decoded, lookup('x-user-defined'))
        assert encode(decoded, 'x-user-defined') == encoded
154