""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import unittest
import unicodedata

from test import support
from http.client import HTTPException

try:
    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
except ImportError:
    # Without _testcapi all three limits share one placeholder value, so
    # the "INT_MAX < PY_SSIZE_T_MAX" guard on test_issue16335 is false and
    # that test is skipped.
    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1

class UnicodeNamesTest(unittest.TestCase):
    # Tests for character-name handling: \N{...} escapes, unicodedata.name()
    # and unicodedata.lookup().

    def checkletter(self, name, code):
        # Helper that put all \N escapes inside eval'd raw strings,
        # to make sure this script runs even if the compiler
        # chokes on \N escapes
        #
        # Evaluates the literal "\N{<name>}", asserts that it equals `code`
        # and returns the evaluated string.  `name` only ever comes from the
        # literals in this file or from the Unicode data file fetched below.
        res = eval(r'"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def test_general(self):
        # General and case insensitivity test:
        # each name below (in deliberately mixed case) must resolve to the
        # character at the same position in `string`.
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP"
        ]
        string = "The rEd fOx ate the sheep."

        self.assertEqual(
            "".join([self.checkletter(*args) for args in zip(chars, string)]),
            string
        )

    def test_ascii_letters(self):
        # lookup() followed by name() must round-trip for every lowercase
        # ASCII letter.  range() excludes its end point, hence the "+ 1" so
        # that "z" is covered as well.
        for char in map(chr, range(ord("a"), ord("z") + 1)):
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(code), name)

    def test_hangul_syllables(self):
        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")

        # The code point immediately after HANGUL SYLLABLE HIH has no name.
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
        # The expected character is always the code point whose hex value
        # is spelled out in the name.
        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
        self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
        self.checkletter("CJK UNIFIED IDEOGRAPH-3134A", "\U0003134A")

    def test_bmp_characters(self):
        # Every code point below 0x10000 that has a name must be found
        # again by looking that name up; unnamed code points are ignored.
        for code in range(0x10000):
            char = chr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

    def test_misc_symbols(self):
        self.checkletter("PILCROW SIGN", "\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")

    def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
        ]
        for alias, codepoint in aliases:
            self.checkletter(alias, chr(codepoint))
            name = unicodedata.name(chr(codepoint))
            # The alias resolves to the character but is never reported
            # back by name() as the character's name.
            self.assertNotEqual(name, alias)
            self.assertEqual(unicodedata.lookup(alias),
                             unicodedata.lookup(name))
            # The 3.2.0 database must not know the alias.
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias)

    def test_aliases_names_in_pua_range(self):
        # We are storing aliases in the PUA 15, but their names shouldn't leak
        for cp in range(0xf0000, 0xf0100):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_names_in_pua_range(self):
        # We are storing named seq in the PUA 15, but their names shouldn't leak
        # NOTE(review): the range end is exclusive, so 0xf0fff itself is not
        # checked -- confirm whether that is intended.
        for cp in range(0xf0100, 0xf0fff):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_sample(self):
        # Check a few named sequences.  See #12753.
        sequences = [
            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
        ]
        for seqname, codepoints in sequences:
            # lookup() resolves the sequence name ...
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            # ... but a \N{...} escape with that name must not compile ...
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            # ... and the 3.2.0 database must not know it.
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

    def test_named_sequences_full(self):
        # Check all the named sequences
        def check_version(testfile):
            # Accept the resource only if its first line mentions the
            # Unicode version of the unicodedata module under test.
            hdr = testfile.readline()
            return unicodedata.unidata_version in hdr
        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
               unicodedata.unidata_version)
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for line in testdata:
            line = line.strip()
            # Skip blank lines and comment lines.
            if not line or line.startswith('#'):
                continue
            # Remaining lines are "<name>;<hex code points separated by
            # whitespace>".
            seqname, codepoints = line.split(';')
            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

    def test_errors(self):
        # Missing arguments and a two-character string are type errors;
        # an unknown name is a KeyError.
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

    def test_strict_error_handling(self):
        # Each malformed \N escape below must make a strict 'unicode-escape'
        # decode raise UnicodeError.
        # bogus character name
        self.assertRaises(
            UnicodeError,
            str, b"\\N{blah}", 'unicode-escape', 'strict'
        )
        # long bogus character name
        self.assertRaises(
            UnicodeError,
            str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
        )
        # missing closing brace
        self.assertRaises(
            UnicodeError,
            str, b"\\N{SPACE", 'unicode-escape', 'strict'
        )
        # missing opening brace
        self.assertRaises(
            UnicodeError,
            str, b"\\NSPACE", 'unicode-escape', 'strict'
        )

    # NOTE(review): the skip reason below mentions UINT_MAX < SIZE_MAX while
    # the condition compares INT_MAX with PY_SSIZE_T_MAX -- confirm which one
    # is meant.
    @support.cpython_only
    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
    @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
    def test_issue16335(self, size):
        # very very long bogus character name
        x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
        self.assertRaisesRegex(UnicodeError,
            'unknown Unicode character name',
            x.decode, 'unicode-escape'
        )


if __name__ == "__main__":
    unittest.main()