# coding: utf-8
"""

    webencodings.tests
    ~~~~~~~~~~~~~~~~~~

    A basic test suite for Encoding.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""

from __future__ import unicode_literals

from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
               IncrementalDecoder, IncrementalEncoder, UTF8)


def assert_raises(exception, function, *args, **kwargs):
    """Check that calling ``function(*args, **kwargs)`` raises ``exception``.

    :param exception:
        An exception class, or a tuple of exception classes, as accepted
        by an ``except`` clause.
    :param function: The callable to invoke.
    :raises AssertionError: If the call returns without raising.

    Exceptions that do not match ``exception`` propagate unchanged.

    """
    try:
        function(*args, **kwargs)
    except exception:
        return
    else:  # pragma: no cover
        # Wrap in a 1-tuple: a bare tuple of exception classes on the
        # right of ``%`` would be unpacked as several format arguments
        # and raise TypeError instead of the intended AssertionError.
        raise AssertionError('Did not raise %s.' % (exception,))


def test_labels():
    """Check how ``lookup`` resolves a handful of labels.

    Known labels give an object whose ``name`` is the canonical encoding
    name; unknown labels give ``None``.

    """
    assert lookup('utf-8').name == 'utf-8'
    assert lookup('Utf-8').name == 'utf-8'
    assert lookup('UTF-8').name == 'utf-8'
    assert lookup('utf8').name == 'utf-8'
    assert lookup('utf8 ').name == 'utf-8'
    assert lookup(' \r\nutf8\t').name == 'utf-8'
    assert lookup('u8') is None  # Python label.
    # Non-ASCII white space. Written as an escape (NO-BREAK SPACE) so the
    # character stays visible and survives whitespace normalisation.
    assert lookup('utf-8\u00a0') is None

    assert lookup('US-ASCII').name == 'windows-1252'
    assert lookup('iso-8859-1').name == 'windows-1252'
    assert lookup('latin1').name == 'windows-1252'
    assert lookup('LATIN1').name == 'windows-1252'
    assert lookup('latin-1') is None
    assert lookup('LATİN1') is None  # ASCII-only case insensitivity.
def test_all_labels():
    """Run every label in ``LABELS`` through each public entry point.

    Only empty input is used, so this checks that each label can be
    looked up and used, not what the encodings produce.

    """
    for label in LABELS:
        assert decode(b'', label) == ('', lookup(label))
        assert encode('', label) == b''
        for repeat in [0, 1, 12]:
            output, _ = iter_decode([b''] * repeat, label)
            assert list(output) == []
            assert list(iter_encode([''] * repeat, label)) == []
        decoder = IncrementalDecoder(label)
        assert decoder.decode(b'') == ''
        assert decoder.decode(b'', final=True) == ''
        encoder = IncrementalEncoder(label)
        assert encoder.encode('') == b''
        assert encoder.encode('', final=True) == b''
    # All encoding names are valid labels too:
    for name in set(LABELS.values()):
        assert lookup(name).name == name


def test_invalid_label():
    """Check that every entry point raises ``LookupError`` for a bad label."""
    assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid')
    assert_raises(LookupError, encode, 'é', 'invalid')
    assert_raises(LookupError, iter_decode, [], 'invalid')
    assert_raises(LookupError, iter_encode, [], 'invalid')
    assert_raises(LookupError, IncrementalDecoder, 'invalid')
    assert_raises(LookupError, IncrementalEncoder, 'invalid')


def test_decode():
    """Check ``decode`` with labels, encoding objects and BOM-prefixed input.

    ``decode`` returns a ``(text, encoding)`` pair.  When the input starts
    with a UTF-8 or UTF-16 BOM, the reported encoding is the one matching
    the BOM rather than the fallback that was passed in.

    """
    assert decode(b'\x80', 'latin1') == ('€', lookup('latin1'))
    assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1'))
    assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8'))
    assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8'))
    assert decode(b'\xc3\xa9', 'ascii') == ('é', lookup('ascii'))
    # UTF-8 with BOM
    assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8'))

    # UTF-16-BE with BOM
    assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be'))
    # UTF-16-LE with BOM
    assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le'))
    assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be'))
    assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le'))

    assert decode(b'\x00\xe9', 'UTF-16BE') == ('é', lookup('utf-16be'))
    assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le'))
    assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le'))

    assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be'))
    assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le'))
    assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))


def test_encode():
    """Check ``encode`` output for a single non-ASCII character."""
    assert encode('é', 'latin1') == b'\xe9'
    assert encode('é', 'utf8') == b'\xc3\xa9'
    assert encode('é', 'utf-16') == b'\xe9\x00'
    assert encode('é', 'utf-16le') == b'\xe9\x00'
    assert encode('é', 'utf-16be') == b'\x00\xe9'


def test_iter_decode():
    """Check ``iter_decode`` on input split into chunks in various places.

    The cases cover a BOM split across several chunks, empty chunks, a
    truncated multi-byte sequence at the end of the input, and an
    incomplete BOM that ends up decoded with the fallback encoding.

    """
    def iter_decode_to_string(chunks, fallback_encoding):
        # Join the decoded pieces; the detected encoding is not checked here.
        output, _encoding = iter_decode(chunks, fallback_encoding)
        return ''.join(output)

    assert iter_decode_to_string([], 'latin1') == ''
    assert iter_decode_to_string([b''], 'latin1') == ''
    assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é'
    assert iter_decode_to_string([b'hello'], 'latin1') == 'hello'
    assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello'
    assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello'
    assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == 'Ã©'
    assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD'
    assert iter_decode_to_string([
        b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é'
    assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == ''
    assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»'
    assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é'
    assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo'


def test_iter_encode():
    """Check ``iter_encode`` on chunked input, including empty chunks."""
    assert b''.join(iter_encode([], 'latin1')) == b''
    assert b''.join(iter_encode([''], 'latin1')) == b''
    assert b''.join(iter_encode(['é'], 'latin1')) == b'\xe9'
    assert b''.join(iter_encode(['', 'é', '', ''], 'latin1')) == b'\xe9'
    assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16')) == b'\xe9\x00'
    assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16le')) == b'\xe9\x00'
    assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16be')) == b'\x00\xe9'
    assert b''.join(iter_encode([
        '', 'h\uF7E9', '', 'llo'], 'x-user-defined')) == b'h\xe9llo'


def test_x_user_defined():
    """Round-trip sample data through the ``x-user-defined`` encoding.

    Both sample pairs are checked.  Previously the first pair was
    overwritten by the second before any assertion ran, so bytes
    outside the ASCII range were never exercised by this test.

    """
    samples = [
        (b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca',
         '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca'),
        (b'aa', 'aa'),
    ]
    for encoded, decoded in samples:
        assert decode(encoded, 'x-user-defined') == (
            decoded, lookup('x-user-defined'))
        assert encode(decoded, 'x-user-defined') == encoded