1""" Test script for the Unicode implementation.
2
3Written by Bill Tutt.
4Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import unittest
11import unicodedata
12
13from test import support
14from http.client import HTTPException
15
try:
    from _testcapi import (
        INT_MAX,
        PY_SSIZE_T_MAX,
        UINT_MAX,
    )
except ImportError:
    # _testcapi could not be imported: give all three limits the same
    # placeholder value.  With equal limits the INT_MAX < PY_SSIZE_T_MAX
    # guard on test_issue16335 is false, so that test is skipped.
    UINT_MAX = 2**64 - 1
    PY_SSIZE_T_MAX = UINT_MAX
    INT_MAX = UINT_MAX
20
21class UnicodeNamesTest(unittest.TestCase):
22
23    def checkletter(self, name, code):
24        # Helper that put all \N escapes inside eval'd raw strings,
25        # to make sure this script runs even if the compiler
26        # chokes on \N escapes
27        res = eval(r'"\N{%s}"' % name)
28        self.assertEqual(res, code)
29        return res
30
31    def test_general(self):
32        # General and case insensitivity test:
33        chars = [
34            "LATIN CAPITAL LETTER T",
35            "LATIN SMALL LETTER H",
36            "LATIN SMALL LETTER E",
37            "SPACE",
38            "LATIN SMALL LETTER R",
39            "LATIN CAPITAL LETTER E",
40            "LATIN SMALL LETTER D",
41            "SPACE",
42            "LATIN SMALL LETTER f",
43            "LATIN CAPITAL LeTtEr o",
44            "LATIN SMaLl LETTER x",
45            "SPACE",
46            "LATIN SMALL LETTER A",
47            "LATIN SMALL LETTER T",
48            "LATIN SMALL LETTER E",
49            "SPACE",
50            "LATIN SMALL LETTER T",
51            "LATIN SMALL LETTER H",
52            "LATIN SMALL LETTER E",
53            "SpAcE",
54            "LATIN SMALL LETTER S",
55            "LATIN SMALL LETTER H",
56            "LATIN small LETTER e",
57            "LATIN small LETTER e",
58            "LATIN SMALL LETTER P",
59            "FULL STOP"
60        ]
61        string = "The rEd fOx ate the sheep."
62
63        self.assertEqual(
64            "".join([self.checkletter(*args) for args in zip(chars, string)]),
65            string
66        )
67
68    def test_ascii_letters(self):
69        for char in "".join(map(chr, range(ord("a"), ord("z")))):
70            name = "LATIN SMALL LETTER %s" % char.upper()
71            code = unicodedata.lookup(name)
72            self.assertEqual(unicodedata.name(code), name)
73
74    def test_hangul_syllables(self):
75        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
76        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
77        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
78        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
79        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
80        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
81        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
82        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
83        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
84        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
85        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
86        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
87        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
88
89        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
90
91    def test_cjk_unified_ideographs(self):
92        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
93        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
94        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
95        self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
96        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
97        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
98        self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
99        self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
100        self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
101        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
102        self.checkletter("CJK UNIFIED IDEOGRAPH-3134A", "\U0003134A")
103
104    def test_bmp_characters(self):
105        for code in range(0x10000):
106            char = chr(code)
107            name = unicodedata.name(char, None)
108            if name is not None:
109                self.assertEqual(unicodedata.lookup(name), char)
110
111    def test_misc_symbols(self):
112        self.checkletter("PILCROW SIGN", "\u00b6")
113        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
114        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
115        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
116
117    def test_aliases(self):
118        # Check that the aliases defined in the NameAliases.txt file work.
119        # This should be updated when new aliases are added or the file
120        # should be downloaded and parsed instead.  See #12753.
121        aliases = [
122            ('LATIN CAPITAL LETTER GHA', 0x01A2),
123            ('LATIN SMALL LETTER GHA', 0x01A3),
124            ('KANNADA LETTER LLLA', 0x0CDE),
125            ('LAO LETTER FO FON', 0x0E9D),
126            ('LAO LETTER FO FAY', 0x0E9F),
127            ('LAO LETTER RO', 0x0EA3),
128            ('LAO LETTER LO', 0x0EA5),
129            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
130            ('YI SYLLABLE ITERATION MARK', 0xA015),
131            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
132            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
133        ]
134        for alias, codepoint in aliases:
135            self.checkletter(alias, chr(codepoint))
136            name = unicodedata.name(chr(codepoint))
137            self.assertNotEqual(name, alias)
138            self.assertEqual(unicodedata.lookup(alias),
139                             unicodedata.lookup(name))
140            with self.assertRaises(KeyError):
141                unicodedata.ucd_3_2_0.lookup(alias)
142
143    def test_aliases_names_in_pua_range(self):
144        # We are storing aliases in the PUA 15, but their names shouldn't leak
145        for cp in range(0xf0000, 0xf0100):
146            with self.assertRaises(ValueError) as cm:
147                unicodedata.name(chr(cp))
148            self.assertEqual(str(cm.exception), 'no such name')
149
150    def test_named_sequences_names_in_pua_range(self):
151        # We are storing named seq in the PUA 15, but their names shouldn't leak
152        for cp in range(0xf0100, 0xf0fff):
153            with self.assertRaises(ValueError) as cm:
154                unicodedata.name(chr(cp))
155            self.assertEqual(str(cm.exception), 'no such name')
156
157    def test_named_sequences_sample(self):
158        # Check a few named sequences.  See #12753.
159        sequences = [
160            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
161            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
162            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
163            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
164            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
165        ]
166        for seqname, codepoints in sequences:
167            self.assertEqual(unicodedata.lookup(seqname), codepoints)
168            with self.assertRaises(SyntaxError):
169                self.checkletter(seqname, None)
170            with self.assertRaises(KeyError):
171                unicodedata.ucd_3_2_0.lookup(seqname)
172
173    def test_named_sequences_full(self):
174        # Check all the named sequences
175        def check_version(testfile):
176            hdr = testfile.readline()
177            return unicodedata.unidata_version in hdr
178        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
179               unicodedata.unidata_version)
180        try:
181            testdata = support.open_urlresource(url, encoding="utf-8",
182                                                check=check_version)
183        except (OSError, HTTPException):
184            self.skipTest("Could not retrieve " + url)
185        self.addCleanup(testdata.close)
186        for line in testdata:
187            line = line.strip()
188            if not line or line.startswith('#'):
189                continue
190            seqname, codepoints = line.split(';')
191            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
192            self.assertEqual(unicodedata.lookup(seqname), codepoints)
193            with self.assertRaises(SyntaxError):
194                self.checkletter(seqname, None)
195            with self.assertRaises(KeyError):
196                unicodedata.ucd_3_2_0.lookup(seqname)
197
198    def test_errors(self):
199        self.assertRaises(TypeError, unicodedata.name)
200        self.assertRaises(TypeError, unicodedata.name, 'xx')
201        self.assertRaises(TypeError, unicodedata.lookup)
202        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')
203
204    def test_strict_error_handling(self):
205        # bogus character name
206        self.assertRaises(
207            UnicodeError,
208            str, b"\\N{blah}", 'unicode-escape', 'strict'
209        )
210        # long bogus character name
211        self.assertRaises(
212            UnicodeError,
213            str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
214        )
215        # missing closing brace
216        self.assertRaises(
217            UnicodeError,
218            str, b"\\N{SPACE", 'unicode-escape', 'strict'
219        )
220        # missing opening brace
221        self.assertRaises(
222            UnicodeError,
223            str, b"\\NSPACE", 'unicode-escape', 'strict'
224        )
225
226    @support.cpython_only
227    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
228    @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
229    def test_issue16335(self, size):
230        # very very long bogus character name
231        x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
232        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
233        self.assertRaisesRegex(UnicodeError,
234            'unknown Unicode character name',
235            x.decode, 'unicode-escape'
236        )
237
238
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
241