1""" Test script for the Unicode implementation.
2
3Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7"""
8import _string
9import codecs
10import itertools
11import operator
12import struct
13import sys
14import unicodedata
15import unittest
16import warnings
17from test import support, string_tests
18
19# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that serves two deliberately broken codecs.

    "test.unicode1" maps to an encoder/decoder pair that returns a bare
    integer, "test.unicode2" to a pair that returns a tuple holding no
    string.  Any other encoding name yields None.
    """
    # Codec pair 1: the return value is not a tuple at all.
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def decode1(input, errors="strict"):
        return 42 # not a tuple

    # Codec pair 2: a tuple, but its first item is not text.
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None
codecs.register(search_function)
36
def duplicate_string(text):
    """
    Try to get a fresh clone of the specified text:
    new object with a reference count of 1.

    This is a best-effort: latin1 single letters and the empty
    string ('') are singletons and cannot be cloned.

    The clone is produced by a UTF-8 encode/decode round trip.  The
    'surrogatepass' error handler is used in both directions so that
    text containing lone surrogates is cloned as well instead of
    raising UnicodeEncodeError.
    """
    encoded = text.encode('utf-8', 'surrogatepass')
    return encoded.decode('utf-8', 'surrogatepass')
46
class StrSubclass(str):
    """Trivial str subclass that adds and overrides nothing."""
49
50class UnicodeTest(string_tests.CommonTest,
51        string_tests.MixinStrUnicodeUserStringTest,
52        string_tests.MixinStrUnicodeTest,
53        unittest.TestCase):
54
55    type2test = str
56
57    def checkequalnofix(self, result, object, methodname, *args):
58        method = getattr(object, methodname)
59        realresult = method(*args)
60        self.assertEqual(realresult, result)
61        self.assertTrue(type(realresult) is type(result))
62
63        # if the original is returned make sure that
64        # this doesn't happen with subclasses
65        if realresult is object:
66            class usub(str):
67                def __repr__(self):
68                    return 'usub(%r)' % str.__repr__(self)
69            object = usub(object)
70            method = getattr(object, methodname)
71            realresult = method(*args)
72            self.assertEqual(realresult, result)
73            self.assertTrue(object is not realresult)
74
75    def test_literals(self):
76        self.assertEqual('\xff', '\u00ff')
77        self.assertEqual('\uffff', '\U0000ffff')
78        self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
79        self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
80        self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
81        # raw strings should not have unicode escapes
82        self.assertNotEqual(r"\u0020", " ")
83
84    def test_ascii(self):
85        if not sys.platform.startswith('java'):
86            # Test basic sanity of repr()
87            self.assertEqual(ascii('abc'), "'abc'")
88            self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
89            self.assertEqual(ascii('ab\\'), "'ab\\\\'")
90            self.assertEqual(ascii('\\c'), "'\\\\c'")
91            self.assertEqual(ascii('\\'), "'\\\\'")
92            self.assertEqual(ascii('\n'), "'\\n'")
93            self.assertEqual(ascii('\r'), "'\\r'")
94            self.assertEqual(ascii('\t'), "'\\t'")
95            self.assertEqual(ascii('\b'), "'\\x08'")
96            self.assertEqual(ascii("'\""), """'\\'"'""")
97            self.assertEqual(ascii("'\""), """'\\'"'""")
98            self.assertEqual(ascii("'"), '''"'"''')
99            self.assertEqual(ascii('"'), """'"'""")
100            latin1repr = (
101                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
102                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
103                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
104                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
105                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
106                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
107                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
108                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
109                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
110                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
111                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
112                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
113                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
114                "\\xfe\\xff'")
115            testrepr = ascii(''.join(map(chr, range(256))))
116            self.assertEqual(testrepr, latin1repr)
117            # Test ascii works on wide unicode escapes without overflow.
118            self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
119                             ascii("\U00010000" * 39 + "\uffff" * 4096))
120
121            class WrongRepr:
122                def __repr__(self):
123                    return b'byte-repr'
124            self.assertRaises(TypeError, ascii, WrongRepr())
125
126    def test_repr(self):
127        if not sys.platform.startswith('java'):
128            # Test basic sanity of repr()
129            self.assertEqual(repr('abc'), "'abc'")
130            self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
131            self.assertEqual(repr('ab\\'), "'ab\\\\'")
132            self.assertEqual(repr('\\c'), "'\\\\c'")
133            self.assertEqual(repr('\\'), "'\\\\'")
134            self.assertEqual(repr('\n'), "'\\n'")
135            self.assertEqual(repr('\r'), "'\\r'")
136            self.assertEqual(repr('\t'), "'\\t'")
137            self.assertEqual(repr('\b'), "'\\x08'")
138            self.assertEqual(repr("'\""), """'\\'"'""")
139            self.assertEqual(repr("'\""), """'\\'"'""")
140            self.assertEqual(repr("'"), '''"'"''')
141            self.assertEqual(repr('"'), """'"'""")
142            latin1repr = (
143                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
144                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
145                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
146                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
147                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
148                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
149                "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
150                "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
151                "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
152                "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
153                "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
154                "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
155                "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
156                "\xfe\xff'")
157            testrepr = repr(''.join(map(chr, range(256))))
158            self.assertEqual(testrepr, latin1repr)
159            # Test repr works on wide unicode escapes without overflow.
160            self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
161                             repr("\U00010000" * 39 + "\uffff" * 4096))
162
163            class WrongRepr:
164                def __repr__(self):
165                    return b'byte-repr'
166            self.assertRaises(TypeError, repr, WrongRepr())
167
168    def test_iterators(self):
169        # Make sure unicode objects have an __iter__ method
170        it = "\u1111\u2222\u3333".__iter__()
171        self.assertEqual(next(it), "\u1111")
172        self.assertEqual(next(it), "\u2222")
173        self.assertEqual(next(it), "\u3333")
174        self.assertRaises(StopIteration, next, it)
175
176    def test_count(self):
177        string_tests.CommonTest.test_count(self)
178        # check mixed argument types
179        self.checkequalnofix(3,  'aaa', 'count', 'a')
180        self.checkequalnofix(0,  'aaa', 'count', 'b')
181        self.checkequalnofix(3, 'aaa', 'count',  'a')
182        self.checkequalnofix(0, 'aaa', 'count',  'b')
183        self.checkequalnofix(0, 'aaa', 'count',  'b')
184        self.checkequalnofix(1, 'aaa', 'count',  'a', -1)
185        self.checkequalnofix(3, 'aaa', 'count',  'a', -10)
186        self.checkequalnofix(2, 'aaa', 'count',  'a', 0, -1)
187        self.checkequalnofix(0, 'aaa', 'count',  'a', 0, -10)
188        # test mixed kinds
189        self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
190        self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
191        self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
192        self.checkequal(0, 'a' * 10, 'count', '\u0102')
193        self.checkequal(0, 'a' * 10, 'count', '\U00100304')
194        self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
195        self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
196        self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
197        self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
198        self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
199        self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
200        self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
201
202    def test_find(self):
203        string_tests.CommonTest.test_find(self)
204        # test implementation details of the memchr fast path
205        self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
206        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
207        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
208        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
209        self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
210        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
211        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
212        # check mixed argument types
213        self.checkequalnofix(0,  'abcdefghiabc', 'find', 'abc')
214        self.checkequalnofix(9,  'abcdefghiabc', 'find', 'abc', 1)
215        self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
216
217        self.assertRaises(TypeError, 'hello'.find)
218        self.assertRaises(TypeError, 'hello'.find, 42)
219        # test mixed kinds
220        self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
221        self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
222        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
223        self.checkequal(-1, 'a' * 100, 'find', '\u0102')
224        self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
225        self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
226        self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
227        self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
228        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
229        self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
230        self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
231        self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
232
233    def test_rfind(self):
234        string_tests.CommonTest.test_rfind(self)
235        # test implementation details of the memrchr fast path
236        self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
237        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
238        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
239        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
240        self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
241        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
242        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
243        # check mixed argument types
244        self.checkequalnofix(9,   'abcdefghiabc', 'rfind', 'abc')
245        self.checkequalnofix(12,  'abcdefghiabc', 'rfind', '')
246        self.checkequalnofix(12, 'abcdefghiabc', 'rfind',  '')
247        # test mixed kinds
248        self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
249        self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
250        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
251        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
252        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
253        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
254        self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
255        self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
256        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
257        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
258        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
259        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
260
261    def test_index(self):
262        string_tests.CommonTest.test_index(self)
263        self.checkequalnofix(0, 'abcdefghiabc', 'index',  '')
264        self.checkequalnofix(3, 'abcdefghiabc', 'index',  'def')
265        self.checkequalnofix(0, 'abcdefghiabc', 'index',  'abc')
266        self.checkequalnofix(9, 'abcdefghiabc', 'index',  'abc', 1)
267        self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
268        self.assertRaises(ValueError, 'abcdefghiab'.index,  'abc', 1)
269        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', 8)
270        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', -1)
271        # test mixed kinds
272        self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
273        self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
274        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
275        self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
276        self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
277        self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
278        self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
279        self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
280        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
281        self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
282        self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
283        self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
284
285    def test_rindex(self):
286        string_tests.CommonTest.test_rindex(self)
287        self.checkequalnofix(12, 'abcdefghiabc', 'rindex',  '')
288        self.checkequalnofix(3,  'abcdefghiabc', 'rindex',  'def')
289        self.checkequalnofix(9,  'abcdefghiabc', 'rindex',  'abc')
290        self.checkequalnofix(0,  'abcdefghiabc', 'rindex',  'abc', 0, -1)
291
292        self.assertRaises(ValueError, 'abcdefghiabc'.rindex,  'hib')
293        self.assertRaises(ValueError, 'defghiabc'.rindex,  'def', 1)
294        self.assertRaises(ValueError, 'defghiabc'.rindex,  'abc', 0, -1)
295        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, 8)
296        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, -1)
297        # test mixed kinds
298        self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
299        self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
300        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
301        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
302        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
303        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
304        self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
305        self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
306        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
307        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
308        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
309        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
310
311    def test_maketrans_translate(self):
312        # these work with plain translate()
313        self.checkequalnofix('bbbc', 'abababc', 'translate',
314                             {ord('a'): None})
315        self.checkequalnofix('iiic', 'abababc', 'translate',
316                             {ord('a'): None, ord('b'): ord('i')})
317        self.checkequalnofix('iiix', 'abababc', 'translate',
318                             {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
319        self.checkequalnofix('c', 'abababc', 'translate',
320                             {ord('a'): None, ord('b'): ''})
321        self.checkequalnofix('xyyx', 'xzx', 'translate',
322                             {ord('z'): 'yy'})
323
324        # this needs maketrans()
325        self.checkequalnofix('abababc', 'abababc', 'translate',
326                             {'b': '<i>'})
327        tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
328        self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
329        # test alternative way of calling maketrans()
330        tbl = self.type2test.maketrans('abc', 'xyz', 'd')
331        self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
332
333        # various tests switching from ASCII to latin1 or the opposite;
334        # same length, remove a letter, or replace with a longer string.
335        self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
336                         "[X]")
337        self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
338                         "[X]")
339        self.assertEqual("[a]".translate(str.maketrans({'a': None})),
340                         "[]")
341        self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
342                         "[XXX]")
343        self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
344                         "[\xe9]")
345        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
346                         "x123")
347        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
348                         "x\xe9")
349
350        # test non-ASCII (don't take the fast-path)
351        self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
352                         "[<\xe9>]")
353        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
354                         "[a]")
355        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
356                         "[]")
357        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
358                         "[123]")
359        self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
360                         "[<\u20ac>\xe9]")
361
362        # invalid Unicode characters
363        invalid_char = 0x10ffff+1
364        for before in "a\xe9\u20ac\U0010ffff":
365            mapping = str.maketrans({before: invalid_char})
366            text = "[%s]" % before
367            self.assertRaises(ValueError, text.translate, mapping)
368
369        # errors
370        self.assertRaises(TypeError, self.type2test.maketrans)
371        self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
372        self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
373        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
374        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
375        self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
376        self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
377
378        self.assertRaises(TypeError, 'hello'.translate)
379        self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
380
381    def test_split(self):
382        string_tests.CommonTest.test_split(self)
383
384        # test mixed kinds
385        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
386            left *= 9
387            right *= 9
388            for delim in ('c', '\u0102', '\U00010302'):
389                self.checkequal([left + right],
390                                left + right, 'split', delim)
391                self.checkequal([left, right],
392                                left + delim + right, 'split', delim)
393                self.checkequal([left + right],
394                                left + right, 'split', delim * 2)
395                self.checkequal([left, right],
396                                left + delim * 2 + right, 'split', delim *2)
397
398    def test_rsplit(self):
399        string_tests.CommonTest.test_rsplit(self)
400        # test mixed kinds
401        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
402            left *= 9
403            right *= 9
404            for delim in ('c', '\u0102', '\U00010302'):
405                self.checkequal([left + right],
406                                left + right, 'rsplit', delim)
407                self.checkequal([left, right],
408                                left + delim + right, 'rsplit', delim)
409                self.checkequal([left + right],
410                                left + right, 'rsplit', delim * 2)
411                self.checkequal([left, right],
412                                left + delim * 2 + right, 'rsplit', delim *2)
413
414    def test_partition(self):
415        string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
416        # test mixed kinds
417        self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
418        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
419            left *= 9
420            right *= 9
421            for delim in ('c', '\u0102', '\U00010302'):
422                self.checkequal((left + right, '', ''),
423                                left + right, 'partition', delim)
424                self.checkequal((left, delim, right),
425                                left + delim + right, 'partition', delim)
426                self.checkequal((left + right, '', ''),
427                                left + right, 'partition', delim * 2)
428                self.checkequal((left, delim * 2, right),
429                                left + delim * 2 + right, 'partition', delim * 2)
430
431    def test_rpartition(self):
432        string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
433        # test mixed kinds
434        self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
435        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
436            left *= 9
437            right *= 9
438            for delim in ('c', '\u0102', '\U00010302'):
439                self.checkequal(('', '', left + right),
440                                left + right, 'rpartition', delim)
441                self.checkequal((left, delim, right),
442                                left + delim + right, 'rpartition', delim)
443                self.checkequal(('', '', left + right),
444                                left + right, 'rpartition', delim * 2)
445                self.checkequal((left, delim * 2, right),
446                                left + delim * 2 + right, 'rpartition', delim * 2)
447
448    def test_join(self):
449        string_tests.MixinStrUnicodeUserStringTest.test_join(self)
450
451        class MyWrapper:
452            def __init__(self, sval): self.sval = sval
453            def __str__(self): return self.sval
454
455        # mixed arguments
456        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
457        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
458        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
459        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
460        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
461        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
462        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
463        self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
464        self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
465        self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
466        self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
467
468    @unittest.skipIf(sys.maxsize > 2**32,
469        'needs too much memory on a 64-bit platform')
470    def test_join_overflow(self):
471        size = int(sys.maxsize**0.5) + 1
472        seq = ('A' * size,) * size
473        self.assertRaises(OverflowError, ''.join, seq)
474
475    def test_replace(self):
476        string_tests.CommonTest.test_replace(self)
477
478        # method call forwarded from str implementation because of unicode argument
479        self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
480        self.assertRaises(TypeError, 'replace'.replace, "r", 42)
481        # test mixed kinds
482        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
483            left *= 9
484            right *= 9
485            for delim in ('c', '\u0102', '\U00010302'):
486                for repl in ('d', '\u0103', '\U00010303'):
487                    self.checkequal(left + right,
488                                    left + right, 'replace', delim, repl)
489                    self.checkequal(left + repl + right,
490                                    left + delim + right,
491                                    'replace', delim, repl)
492                    self.checkequal(left + right,
493                                    left + right, 'replace', delim * 2, repl)
494                    self.checkequal(left + repl + right,
495                                    left + delim * 2 + right,
496                                    'replace', delim * 2, repl)
497
498    @support.cpython_only
499    def test_replace_id(self):
500        pattern = 'abc'
501        text = 'abc def'
502        self.assertIs(text.replace(pattern, pattern), text)
503
504    def test_bytes_comparison(self):
505        with support.check_warnings():
506            warnings.simplefilter('ignore', BytesWarning)
507            self.assertEqual('abc' == b'abc', False)
508            self.assertEqual('abc' != b'abc', True)
509            self.assertEqual('abc' == bytearray(b'abc'), False)
510            self.assertEqual('abc' != bytearray(b'abc'), True)
511
512    def test_comparison(self):
513        # Comparisons:
514        self.assertEqual('abc', 'abc')
515        self.assertTrue('abcd' > 'abc')
516        self.assertTrue('abc' < 'abcd')
517
518        if 0:
519            # Move these tests to a Unicode collation module test...
520            # Testing UTF-16 code point order comparisons...
521
522            # No surrogates, no fixup required.
523            self.assertTrue('\u0061' < '\u20ac')
524            # Non surrogate below surrogate value, no fixup required
525            self.assertTrue('\u0061' < '\ud800\udc02')
526
527            # Non surrogate above surrogate value, fixup required
528            def test_lecmp(s, s2):
529                self.assertTrue(s < s2)
530
531            def test_fixup(s):
532                s2 = '\ud800\udc01'
533                test_lecmp(s, s2)
534                s2 = '\ud900\udc01'
535                test_lecmp(s, s2)
536                s2 = '\uda00\udc01'
537                test_lecmp(s, s2)
538                s2 = '\udb00\udc01'
539                test_lecmp(s, s2)
540                s2 = '\ud800\udd01'
541                test_lecmp(s, s2)
542                s2 = '\ud900\udd01'
543                test_lecmp(s, s2)
544                s2 = '\uda00\udd01'
545                test_lecmp(s, s2)
546                s2 = '\udb00\udd01'
547                test_lecmp(s, s2)
548                s2 = '\ud800\ude01'
549                test_lecmp(s, s2)
550                s2 = '\ud900\ude01'
551                test_lecmp(s, s2)
552                s2 = '\uda00\ude01'
553                test_lecmp(s, s2)
554                s2 = '\udb00\ude01'
555                test_lecmp(s, s2)
556                s2 = '\ud800\udfff'
557                test_lecmp(s, s2)
558                s2 = '\ud900\udfff'
559                test_lecmp(s, s2)
560                s2 = '\uda00\udfff'
561                test_lecmp(s, s2)
562                s2 = '\udb00\udfff'
563                test_lecmp(s, s2)
564
565                test_fixup('\ue000')
566                test_fixup('\uff61')
567
568        # Surrogates on both sides, no fixup required
569        self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
570
571    def test_islower(self):
572        super().test_islower()
573        self.checkequalnofix(False, '\u1FFc', 'islower')
574        self.assertFalse('\u2167'.islower())
575        self.assertTrue('\u2177'.islower())
576        # non-BMP, uppercase
577        self.assertFalse('\U00010401'.islower())
578        self.assertFalse('\U00010427'.islower())
579        # non-BMP, lowercase
580        self.assertTrue('\U00010429'.islower())
581        self.assertTrue('\U0001044E'.islower())
582        # non-BMP, non-cased
583        self.assertFalse('\U0001F40D'.islower())
584        self.assertFalse('\U0001F46F'.islower())
585
586    def test_isupper(self):
587        super().test_isupper()
588        if not sys.platform.startswith('java'):
589            self.checkequalnofix(False, '\u1FFc', 'isupper')
590        self.assertTrue('\u2167'.isupper())
591        self.assertFalse('\u2177'.isupper())
592        # non-BMP, uppercase
593        self.assertTrue('\U00010401'.isupper())
594        self.assertTrue('\U00010427'.isupper())
595        # non-BMP, lowercase
596        self.assertFalse('\U00010429'.isupper())
597        self.assertFalse('\U0001044E'.isupper())
598        # non-BMP, non-cased
599        self.assertFalse('\U0001F40D'.isupper())
600        self.assertFalse('\U0001F46F'.isupper())
601
602    def test_istitle(self):
603        super().test_istitle()
604        self.checkequalnofix(True, '\u1FFc', 'istitle')
605        self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
606
607        # non-BMP, uppercase + lowercase
608        self.assertTrue('\U00010401\U00010429'.istitle())
609        self.assertTrue('\U00010427\U0001044E'.istitle())
610        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
611        for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
612            self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
613
614    def test_isspace(self):
615        super().test_isspace()
616        self.checkequalnofix(True, '\u2000', 'isspace')
617        self.checkequalnofix(True, '\u200a', 'isspace')
618        self.checkequalnofix(False, '\u2014', 'isspace')
619        # There are no non-BMP whitespace chars as of Unicode 12.
620        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
621                   '\U0001F40D', '\U0001F46F']:
622            self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
623
624    @support.requires_resource('cpu')
625    def test_isspace_invariant(self):
626        for codepoint in range(sys.maxunicode + 1):
627            char = chr(codepoint)
628            bidirectional = unicodedata.bidirectional(char)
629            category = unicodedata.category(char)
630            self.assertEqual(char.isspace(),
631                             (bidirectional in ('WS', 'B', 'S')
632                              or category == 'Zs'))
633
634    def test_isalnum(self):
635        super().test_isalnum()
636        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
637                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
638            self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
639
640    def test_isalpha(self):
641        super().test_isalpha()
642        self.checkequalnofix(True, '\u1FFc', 'isalpha')
643        # non-BMP, cased
644        self.assertTrue('\U00010401'.isalpha())
645        self.assertTrue('\U00010427'.isalpha())
646        self.assertTrue('\U00010429'.isalpha())
647        self.assertTrue('\U0001044E'.isalpha())
648        # non-BMP, non-cased
649        self.assertFalse('\U0001F40D'.isalpha())
650        self.assertFalse('\U0001F46F'.isalpha())
651
652    def test_isascii(self):
653        super().test_isascii()
654        self.assertFalse("\u20ac".isascii())
655        self.assertFalse("\U0010ffff".isascii())
656
657    def test_isdecimal(self):
658        self.checkequalnofix(False, '', 'isdecimal')
659        self.checkequalnofix(False, 'a', 'isdecimal')
660        self.checkequalnofix(True, '0', 'isdecimal')
661        self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
662        self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
663        self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
664        self.checkequalnofix(True, '0123456789', 'isdecimal')
665        self.checkequalnofix(False, '0123456789a', 'isdecimal')
666
667        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
668
669        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
670                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
671            self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
672        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
673            self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
674
675    def test_isdigit(self):
676        super().test_isdigit()
677        self.checkequalnofix(True, '\u2460', 'isdigit')
678        self.checkequalnofix(False, '\xbc', 'isdigit')
679        self.checkequalnofix(True, '\u0660', 'isdigit')
680
681        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
682                   '\U0001F40D', '\U0001F46F', '\U00011065']:
683            self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
684        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
685            self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
686
687    def test_isnumeric(self):
688        self.checkequalnofix(False, '', 'isnumeric')
689        self.checkequalnofix(False, 'a', 'isnumeric')
690        self.checkequalnofix(True, '0', 'isnumeric')
691        self.checkequalnofix(True, '\u2460', 'isnumeric')
692        self.checkequalnofix(True, '\xbc', 'isnumeric')
693        self.checkequalnofix(True, '\u0660', 'isnumeric')
694        self.checkequalnofix(True, '0123456789', 'isnumeric')
695        self.checkequalnofix(False, '0123456789a', 'isnumeric')
696
697        self.assertRaises(TypeError, "abc".isnumeric, 42)
698
699        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
700                   '\U0001F40D', '\U0001F46F']:
701            self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
702        for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
703                   '\U000104A0', '\U0001F107']:
704            self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
705
706    def test_isidentifier(self):
707        self.assertTrue("a".isidentifier())
708        self.assertTrue("Z".isidentifier())
709        self.assertTrue("_".isidentifier())
710        self.assertTrue("b0".isidentifier())
711        self.assertTrue("bc".isidentifier())
712        self.assertTrue("b_".isidentifier())
713        self.assertTrue("µ".isidentifier())
714        self.assertTrue("��������������".isidentifier())
715
716        self.assertFalse(" ".isidentifier())
717        self.assertFalse("[".isidentifier())
718        self.assertFalse("©".isidentifier())
719        self.assertFalse("0".isidentifier())
720
721    def test_isprintable(self):
722        self.assertTrue("".isprintable())
723        self.assertTrue(" ".isprintable())
724        self.assertTrue("abcdefg".isprintable())
725        self.assertFalse("abcdefg\n".isprintable())
726        # some defined Unicode character
727        self.assertTrue("\u0374".isprintable())
728        # undefined character
729        self.assertFalse("\u0378".isprintable())
730        # single surrogate character
731        self.assertFalse("\ud800".isprintable())
732
733        self.assertTrue('\U0001F46F'.isprintable())
734        self.assertFalse('\U000E0020'.isprintable())
735
736    def test_surrogates(self):
737        for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
738                  'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
739            self.assertTrue(s.islower())
740            self.assertFalse(s.isupper())
741            self.assertFalse(s.istitle())
742        for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
743                  'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
744            self.assertFalse(s.islower())
745            self.assertTrue(s.isupper())
746            self.assertTrue(s.istitle())
747
748        for meth_name in ('islower', 'isupper', 'istitle'):
749            meth = getattr(str, meth_name)
750            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
751                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
752
753        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
754                          'isdecimal', 'isnumeric',
755                          'isidentifier', 'isprintable'):
756            meth = getattr(str, meth_name)
757            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
758                      'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
759                      'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
760                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
761
762
763    def test_lower(self):
764        string_tests.CommonTest.test_lower(self)
765        self.assertEqual('\U00010427'.lower(), '\U0001044F')
766        self.assertEqual('\U00010427\U00010427'.lower(),
767                         '\U0001044F\U0001044F')
768        self.assertEqual('\U00010427\U0001044F'.lower(),
769                         '\U0001044F\U0001044F')
770        self.assertEqual('X\U00010427x\U0001044F'.lower(),
771                         'x\U0001044Fx\U0001044F')
772        self.assertEqual('fi'.lower(), 'fi')
773        self.assertEqual('\u0130'.lower(), '\u0069\u0307')
774        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
775        self.assertEqual('\u03a3'.lower(), '\u03c3')
776        self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
777        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
778        self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
779        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
780        self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
781        self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
782        self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
783        self.assertEqual('\u2177'.lower(), '\u2177')
784
785    def test_casefold(self):
786        self.assertEqual('hello'.casefold(), 'hello')
787        self.assertEqual('hELlo'.casefold(), 'hello')
788        self.assertEqual('ß'.casefold(), 'ss')
789        self.assertEqual('fi'.casefold(), 'fi')
790        self.assertEqual('\u03a3'.casefold(), '\u03c3')
791        self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
792        self.assertEqual('\u00b5'.casefold(), '\u03bc')
793
794    def test_upper(self):
795        string_tests.CommonTest.test_upper(self)
796        self.assertEqual('\U0001044F'.upper(), '\U00010427')
797        self.assertEqual('\U0001044F\U0001044F'.upper(),
798                         '\U00010427\U00010427')
799        self.assertEqual('\U00010427\U0001044F'.upper(),
800                         '\U00010427\U00010427')
801        self.assertEqual('X\U00010427x\U0001044F'.upper(),
802                         'X\U00010427X\U00010427')
803        self.assertEqual('fi'.upper(), 'FI')
804        self.assertEqual('\u0130'.upper(), '\u0130')
805        self.assertEqual('\u03a3'.upper(), '\u03a3')
806        self.assertEqual('ß'.upper(), 'SS')
807        self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
808        self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
809        self.assertEqual('\u2177'.upper(), '\u2167')
810
811    def test_capitalize(self):
812        string_tests.CommonTest.test_capitalize(self)
813        self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
814        self.assertEqual('\U0001044F\U0001044F'.capitalize(),
815                         '\U00010427\U0001044F')
816        self.assertEqual('\U00010427\U0001044F'.capitalize(),
817                         '\U00010427\U0001044F')
818        self.assertEqual('\U0001044F\U00010427'.capitalize(),
819                         '\U00010427\U0001044F')
820        self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
821                         'X\U0001044Fx\U0001044F')
822        self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
823        exp = '\u0399\u0308\u0300\u0069\u0307'
824        self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
825        self.assertEqual('finnish'.capitalize(), 'FInnish')
826        self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
827
828    def test_title(self):
829        super().test_title()
830        self.assertEqual('\U0001044F'.title(), '\U00010427')
831        self.assertEqual('\U0001044F\U0001044F'.title(),
832                         '\U00010427\U0001044F')
833        self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
834                         '\U00010427\U0001044F \U00010427\U0001044F')
835        self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
836                         '\U00010427\U0001044F \U00010427\U0001044F')
837        self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
838                         '\U00010427\U0001044F \U00010427\U0001044F')
839        self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
840                         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
841        self.assertEqual('fiNNISH'.title(), 'Finnish')
842        self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
843        self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
844
845    def test_swapcase(self):
846        string_tests.CommonTest.test_swapcase(self)
847        self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
848        self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
849        self.assertEqual('\U0001044F\U0001044F'.swapcase(),
850                         '\U00010427\U00010427')
851        self.assertEqual('\U00010427\U0001044F'.swapcase(),
852                         '\U0001044F\U00010427')
853        self.assertEqual('\U0001044F\U00010427'.swapcase(),
854                         '\U00010427\U0001044F')
855        self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
856                         'x\U0001044FX\U00010427')
857        self.assertEqual('fi'.swapcase(), 'FI')
858        self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
859        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
860        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
861        self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
862        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
863        self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
864        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
865        self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
866        self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
867        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
868        self.assertEqual('ß'.swapcase(), 'SS')
869        self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
870
871    def test_center(self):
872        string_tests.CommonTest.test_center(self)
873        self.assertEqual('x'.center(2, '\U0010FFFF'),
874                         'x\U0010FFFF')
875        self.assertEqual('x'.center(3, '\U0010FFFF'),
876                         '\U0010FFFFx\U0010FFFF')
877        self.assertEqual('x'.center(4, '\U0010FFFF'),
878                         '\U0010FFFFx\U0010FFFF\U0010FFFF')
879
880    @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
881    @support.cpython_only
882    def test_case_operation_overflow(self):
883        # Issue #22643
884        size = 2**32//12 + 1
885        try:
886            s = "ü" * size
887        except MemoryError:
888            self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20))
889        try:
890            self.assertRaises(OverflowError, s.upper)
891        finally:
892            del s
893
894    def test_contains(self):
895        # Testing Unicode contains method
896        self.assertIn('a', 'abdb')
897        self.assertIn('a', 'bdab')
898        self.assertIn('a', 'bdaba')
899        self.assertIn('a', 'bdba')
900        self.assertNotIn('a', 'bdb')
901        self.assertIn('a', 'bdba')
902        self.assertIn('a', ('a',1,None))
903        self.assertIn('a', (1,None,'a'))
904        self.assertIn('a', ('a',1,None))
905        self.assertIn('a', (1,None,'a'))
906        self.assertNotIn('a', ('x',1,'y'))
907        self.assertNotIn('a', ('x',1,None))
908        self.assertNotIn('abcd', 'abcxxxx')
909        self.assertIn('ab', 'abcd')
910        self.assertIn('ab', 'abc')
911        self.assertIn('ab', (1,None,'ab'))
912        self.assertIn('', 'abc')
913        self.assertIn('', '')
914        self.assertIn('', 'abc')
915        self.assertNotIn('\0', 'abc')
916        self.assertIn('\0', '\0abc')
917        self.assertIn('\0', 'abc\0')
918        self.assertIn('a', '\0abc')
919        self.assertIn('asdf', 'asdf')
920        self.assertNotIn('asdf', 'asd')
921        self.assertNotIn('asdf', '')
922
923        self.assertRaises(TypeError, "abc".__contains__)
924        # test mixed kinds
925        for fill in ('a', '\u0100', '\U00010300'):
926            fill *= 9
927            for delim in ('c', '\u0102', '\U00010302'):
928                self.assertNotIn(delim, fill)
929                self.assertIn(delim, fill + delim)
930                self.assertNotIn(delim * 2, fill)
931                self.assertIn(delim * 2, fill + delim * 2)
932
933    def test_issue18183(self):
934        '\U00010000\U00100000'.lower()
935        '\U00010000\U00100000'.casefold()
936        '\U00010000\U00100000'.upper()
937        '\U00010000\U00100000'.capitalize()
938        '\U00010000\U00100000'.title()
939        '\U00010000\U00100000'.swapcase()
940        '\U00100000'.center(3, '\U00010000')
941        '\U00100000'.ljust(3, '\U00010000')
942        '\U00100000'.rjust(3, '\U00010000')
943
    def test_format(self):
        # Exercise str.format(): literal text and brace escaping, field
        # lookup by index/attribute/key, string format specs, conversions
        # (!r, !s, !a), user-defined __format__, nested specs and the
        # exceptions raised for malformed format strings.
        self.assertEqual(''.format(), '')
        self.assertEqual('a'.format(), 'a')
        self.assertEqual('ab'.format(), 'ab')
        self.assertEqual('a{{'.format(), 'a{')
        self.assertEqual('a}}'.format(), 'a}')
        self.assertEqual('{{b'.format(), '{b')
        self.assertEqual('}}b'.format(), '}b')
        self.assertEqual('a{{b'.format(), 'a{b')

        # examples from the PEP:
        import datetime
        self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
        self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
                         "My name is Fred")
        self.assertEqual("My name is {0} :-{{}}".format('Fred'),
                         "My name is Fred :-{}")

        d = datetime.date(2007, 8, 18)
        self.assertEqual("The year is {0.year}".format(d),
                         "The year is 2007")

        # classes we'll use for testing
        # C: __format__ returns the format spec it was given.
        class C:
            def __init__(self, x=100):
                self._x = x
            def __format__(self, spec):
                return spec

        # D: __format__ ignores the spec and returns str(self.x).
        class D:
            def __init__(self, x):
                self.x = x
            def __format__(self, spec):
                return str(self.x)

        # class with __str__, but no __format__
        class E:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return 'E(' + self.x + ')'

        # class with __repr__, but no __format__ or __str__
        class F:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'F(' + self.x + ')'

        # class with __format__ that forwards to string, for some format_spec's
        class G:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return "string is " + self.x
            def __format__(self, format_spec):
                if format_spec == 'd':
                    return 'G(' + self.x + ')'
                return object.__format__(self, format_spec)

        # I: a date whose __format__ passes the spec to strftime().
        class I(datetime.date):
            def __format__(self, format_spec):
                return self.strftime(format_spec)

        # J: an int whose __format__ formats twice its value.
        class J(int):
            def __format__(self, format_spec):
                return int.__format__(self * 2, format_spec)

        # M: has __repr__ but sets __str__ to None.
        class M:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'M(' + self.x + ')'
            __str__ = None

        # N: has __repr__ but sets __format__ to None.
        class N:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'N(' + self.x + ')'
            __format__ = None

        self.assertEqual(''.format(), '')
        self.assertEqual('abc'.format(), 'abc')
        self.assertEqual('{0}'.format('abc'), 'abc')
        self.assertEqual('{0:}'.format('abc'), 'abc')
#        self.assertEqual('{ 0 }'.format('abc'), 'abc')
        self.assertEqual('X{0}'.format('abc'), 'Xabc')
        self.assertEqual('{0}X'.format('abc'), 'abcX')
        self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
        self.assertEqual('{1}'.format(1, 'abc'), 'abc')
        self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
        self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
        self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
        self.assertEqual('{0}'.format(-15), '-15')
        self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
        self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
        self.assertEqual('{{'.format(), '{')
        self.assertEqual('}}'.format(), '}')
        self.assertEqual('{{}}'.format(), '{}')
        self.assertEqual('{{x}}'.format(), '{x}')
        self.assertEqual('{{{0}}}'.format(123), '{123}')
        self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
        self.assertEqual('}}{{'.format(), '}{')
        self.assertEqual('}}x{{'.format(), '}x{')

        # weird field names
        self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
        self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
        self.assertEqual("{0[ ]}".format({' ':3}), '3')

        # attribute and index lookups, including chained ones
        self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
        self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
        self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
        self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
        self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
        self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
        self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')

        # strings
        self.assertEqual('{0:.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:.3s}'.format('ab'), 'ab')
        self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
        self.assertEqual('{0:.0s}'.format('abcdef'), '')
        self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
        self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
        self.assertEqual('{0:x<0s}'.format('result'), 'result')
        self.assertEqual('{0:x<5s}'.format('result'), 'result')
        self.assertEqual('{0:x<6s}'.format('result'), 'result')
        self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
        self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
        self.assertEqual('{0: <7s}'.format('result'), 'result ')
        self.assertEqual('{0:<7s}'.format('result'), 'result ')
        self.assertEqual('{0:>7s}'.format('result'), ' result')
        self.assertEqual('{0:>8s}'.format('result'), '  result')
        self.assertEqual('{0:^8s}'.format('result'), ' result ')
        self.assertEqual('{0:^9s}'.format('result'), ' result  ')
        self.assertEqual('{0:^10s}'.format('result'), '  result  ')
        # very wide fields
        self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
        self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
        self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)

        # issue 12546: use \x00 as a fill character
        self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
        self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
        self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
        self.assertEqual('{0:^6s}'.format('foo'), ' foo  ')

        self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
        self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
        self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
        self.assertEqual('{0:<6}'.format(3), '3     ')

        self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
        self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
        self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
        self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')

        self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
        self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
        self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
        self.assertEqual('{0:^12}'.format(3+2.0j), '   (3+2j)   ')

        # format specifiers for user defined type
        self.assertEqual('{0:abc}'.format(C()), 'abc')

        # !r, !s and !a coercions
        self.assertEqual('{0!s}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:15}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
        self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
        self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'")  # printable
        self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
        self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
        self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
        self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
        self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')

        # test fallback to object.__format__
        self.assertEqual('{0}'.format({}), '{}')
        self.assertEqual('{0}'.format([]), '[]')
        self.assertEqual('{0}'.format([1]), '[1]')

        self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
        self.assertEqual('{0!s}'.format(G('data')), 'string is data')

        # a non-empty format spec is a TypeError for E (no __format__) and
        # for G when the spec is not 'd'
        self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
        self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
        self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))

        self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
                                                       month=8,
                                                       day=27)),
                         "date: 2007-08-27")

        # test deriving from a builtin type and overriding __format__
        self.assertEqual("{0}".format(J(10)), "20")


        # string format specifiers
        self.assertEqual('{0:}'.format('a'), 'a')

        # computed format specifiers
        self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello     ')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello     ')

        # test various errors
        self.assertRaises(ValueError, '{'.format)
        self.assertRaises(ValueError, '}'.format)
        self.assertRaises(ValueError, 'a{'.format)
        self.assertRaises(ValueError, 'a}'.format)
        self.assertRaises(ValueError, '{a'.format)
        self.assertRaises(ValueError, '}a'.format)
        self.assertRaises(IndexError, '{0}'.format)
        self.assertRaises(IndexError, '{1}'.format, 'abc')
        self.assertRaises(KeyError,   '{x}'.format)
        self.assertRaises(ValueError, "}{".format)
        self.assertRaises(ValueError, "abc{0:{}".format)
        self.assertRaises(ValueError, "{0".format)
        self.assertRaises(IndexError, "{0.}".format)
        self.assertRaises(ValueError, "{0.}".format, 0)
        self.assertRaises(ValueError, "{0[}".format)
        self.assertRaises(ValueError, "{0[}".format, [])
        self.assertRaises(KeyError,   "{0]}".format)
        self.assertRaises(ValueError, "{0.[]}".format, 0)
        self.assertRaises(ValueError, "{0..foo}".format, 0)
        self.assertRaises(ValueError, "{0[0}".format, 0)
        self.assertRaises(ValueError, "{0[0:foo}".format, 0)
        self.assertRaises(KeyError,   "{c]}".format)
        self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
        self.assertRaises(ValueError, "{0}}".format, 0)
        self.assertRaises(KeyError,   "{foo}".format, bar=3)
        self.assertRaises(ValueError, "{0!x}".format, 3)
        self.assertRaises(ValueError, "{0!}".format, 0)
        self.assertRaises(ValueError, "{0!rs}".format, 0)
        self.assertRaises(ValueError, "{!}".format)
        self.assertRaises(IndexError, "{:}".format)
        self.assertRaises(IndexError, "{:s}".format)
        self.assertRaises(IndexError, "{}".format)
        # an index with too many digits is rejected with ValueError
        big = "23098475029384702983476098230754973209482573"
        self.assertRaises(ValueError, ("{" + big + "}").format)
        self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])

        # issue 6089
        self.assertRaises(ValueError, "{0[0]x}".format, [None])
        self.assertRaises(ValueError, "{0[0](10)}".format, [None])

        # can't have a replacement on the field name portion
        self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)

        # exceed maximum recursion depth
        self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
        self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                          0, 1, 2, 3, 4, 5, 6, 7)

        # string format spec errors
        self.assertRaises(ValueError, "{0:-s}".format, '')
        self.assertRaises(ValueError, format, "", "-")
        self.assertRaises(ValueError, "{0:=s}".format, '')

        # Alternate formatting is not supported
        self.assertRaises(ValueError, format, '', '#')
        self.assertRaises(ValueError, format, '', '#20')

        # Non-ASCII
        self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
                         'ABC\u0410\u0411\u0412')
        self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
                         'ABC')
        self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
                         '')

        # braces, brackets and '!' used as dictionary keys inside [...]
        self.assertEqual("{[{}]}".format({"{}": 5}), "5")
        self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
        self.assertEqual("{[{]}".format({"{" : "a"}), "a")
        self.assertEqual("{[}]}".format({"}" : "a"}), "a")
        self.assertEqual("{[[]}".format({"[" : "a"}), "a")
        self.assertEqual("{[!]}".format({"!" : "a"}), "a")
        self.assertRaises(ValueError, "{a{}b}".format, 42)
        self.assertRaises(ValueError, "{a{b}".format, 42)
        self.assertRaises(ValueError, "{[}".format, 42)

        self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")

        # Blocking fallback
        m = M('data')
        self.assertEqual("{!r}".format(m), 'M(data)')
        self.assertRaises(TypeError, "{!s}".format, m)
        self.assertRaises(TypeError, "{}".format, m)
        n = N('data')
        self.assertEqual("{!r}".format(n), 'N(data)')
        self.assertEqual("{!s}".format(n), 'N(data)')
        self.assertRaises(TypeError, "{}".format, n)
1248
1249    def test_format_map(self):
1250        self.assertEqual(''.format_map({}), '')
1251        self.assertEqual('a'.format_map({}), 'a')
1252        self.assertEqual('ab'.format_map({}), 'ab')
1253        self.assertEqual('a{{'.format_map({}), 'a{')
1254        self.assertEqual('a}}'.format_map({}), 'a}')
1255        self.assertEqual('{{b'.format_map({}), '{b')
1256        self.assertEqual('}}b'.format_map({}), '}b')
1257        self.assertEqual('a{{b'.format_map({}), 'a{b')
1258
1259        # using mappings
1260        class Mapping(dict):
1261            def __missing__(self, key):
1262                return key
1263        self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1264        self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1265
1266        class InternalMapping:
1267            def __init__(self):
1268                self.mapping = {'a': 'hello'}
1269            def __getitem__(self, key):
1270                return self.mapping[key]
1271        self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1272
1273
1274        class C:
1275            def __init__(self, x=100):
1276                self._x = x
1277            def __format__(self, spec):
1278                return spec
1279        self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1280
1281        # test various errors
1282        self.assertRaises(TypeError, ''.format_map)
1283        self.assertRaises(TypeError, 'a'.format_map)
1284
1285        self.assertRaises(ValueError, '{'.format_map, {})
1286        self.assertRaises(ValueError, '}'.format_map, {})
1287        self.assertRaises(ValueError, 'a{'.format_map, {})
1288        self.assertRaises(ValueError, 'a}'.format_map, {})
1289        self.assertRaises(ValueError, '{a'.format_map, {})
1290        self.assertRaises(ValueError, '}a'.format_map, {})
1291
1292        # issue #12579: can't supply positional params to format_map
1293        self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1294        self.assertRaises(ValueError, '{}'.format_map, 'a')
1295        self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1296
1297        class BadMapping:
1298            def __getitem__(self, key):
1299                return 1/0
1300        self.assertRaises(KeyError, '{a}'.format_map, {})
1301        self.assertRaises(TypeError, '{a}'.format_map, [])
1302        self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping())
1303
1304    def test_format_huge_precision(self):
1305        format_string = ".{}f".format(sys.maxsize + 1)
1306        with self.assertRaises(ValueError):
1307            result = format(2.34, format_string)
1308
1309    def test_format_huge_width(self):
1310        format_string = "{}f".format(sys.maxsize + 1)
1311        with self.assertRaises(ValueError):
1312            result = format(2.34, format_string)
1313
1314    def test_format_huge_item_number(self):
1315        format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1316        with self.assertRaises(ValueError):
1317            result = format_string.format(2.34)
1318
1319    def test_format_auto_numbering(self):
1320        class C:
1321            def __init__(self, x=100):
1322                self._x = x
1323            def __format__(self, spec):
1324                return spec
1325
1326        self.assertEqual('{}'.format(10), '10')
1327        self.assertEqual('{:5}'.format('s'), 's    ')
1328        self.assertEqual('{!r}'.format('s'), "'s'")
1329        self.assertEqual('{._x}'.format(C(10)), '10')
1330        self.assertEqual('{[1]}'.format([1, 2]), '2')
1331        self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1332        self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1333
1334        self.assertEqual('a{:{}}b'.format('x', '^10'), 'a    x     b')
1335        self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1336
1337        # can't mix and match numbering and auto-numbering
1338        self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1339        self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1340        self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1341        self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1342
1343        # can mix and match auto-numbering and named
1344        self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1345        self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1346        self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1347        self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1348
1349    def test_formatting(self):
1350        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
1351        # Testing Unicode formatting strings...
1352        self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
1353        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
1354        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
1355        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
1356        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
1357        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
1358        if not sys.platform.startswith('java'):
1359            self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
1360            self.assertEqual("%r" % ("\u1234",), "'\u1234'")
1361            self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
1362        self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
1363        self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
1364
1365        self.assertEqual('%c' % 0x1234, '\u1234')
1366        self.assertEqual('%c' % 0x21483, '\U00021483')
1367        self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
1368        self.assertEqual('%c' % '\U00021483', '\U00021483')
1369        self.assertRaises(TypeError, "%c".__mod__, "aa")
1370        self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
1371        self.assertRaises(TypeError, "%i".__mod__, "aa")
1372
1373        # formatting jobs delegated from the string implementation:
1374        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1375        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1376        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1377        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1378        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123},  '...abc...')
1379        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1380        self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
1381        self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
1382        self.assertEqual('...%s...' % "abc", '...abc...')
1383        self.assertEqual('%*s' % (5,'abc',), '  abc')
1384        self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
1385        self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
1386        self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
1387        self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
1388        self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
1389        self.assertEqual('%c' % 'a', 'a')
1390        class Wrapper:
1391            def __str__(self):
1392                return '\u1234'
1393        self.assertEqual('%s' % Wrapper(), '\u1234')
1394
1395        # issue 3382
1396        NAN = float('nan')
1397        INF = float('inf')
1398        self.assertEqual('%f' % NAN, 'nan')
1399        self.assertEqual('%F' % NAN, 'NAN')
1400        self.assertEqual('%f' % INF, 'inf')
1401        self.assertEqual('%F' % INF, 'INF')
1402
1403        # PEP 393
1404        self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
1405        self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
1406
1407        #issue 19995
1408        class PseudoInt:
1409            def __init__(self, value):
1410                self.value = int(value)
1411            def __int__(self):
1412                return self.value
1413            def __index__(self):
1414                return self.value
1415        class PseudoFloat:
1416            def __init__(self, value):
1417                self.value = float(value)
1418            def __int__(self):
1419                return int(self.value)
1420        pi = PseudoFloat(3.1415)
1421        letter_m = PseudoInt(109)
1422        self.assertEqual('%x' % 42, '2a')
1423        self.assertEqual('%X' % 15, 'F')
1424        self.assertEqual('%o' % 9, '11')
1425        self.assertEqual('%c' % 109, 'm')
1426        self.assertEqual('%x' % letter_m, '6d')
1427        self.assertEqual('%X' % letter_m, '6D')
1428        self.assertEqual('%o' % letter_m, '155')
1429        self.assertEqual('%c' % letter_m, 'm')
1430        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14),
1431        self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11),
1432        self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79),
1433        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi),
1434        self.assertRaises(TypeError, operator.mod, '%c', pi),
1435
1436    def test_formatting_with_enum(self):
1437        # issue18780
1438        import enum
1439        class Float(float, enum.Enum):
1440            PI = 3.1415926
1441        class Int(enum.IntEnum):
1442            IDES = 15
1443        class Str(str, enum.Enum):
1444            ABC = 'abc'
1445        # Testing Unicode formatting strings...
1446        self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1447                         'Str.ABC, Str.ABC')
1448        self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1449                        (Str.ABC, Str.ABC,
1450                         Int.IDES, Int.IDES, Int.IDES,
1451                         Float.PI, Float.PI),
1452                         'Str.ABC, Str.ABC, 15, 15, 15, 3.141593,  3.14')
1453
1454        # formatting jobs delegated from the string implementation:
1455        self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1456                         '...Str.ABC...')
1457        self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1458                         '...Int.IDES...')
1459        self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1460                         '...15...')
1461        self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1462                         '...15...')
1463        self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1464                         '...15...')
1465        self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1466                         '...3.141593...')
1467
1468    def test_formatting_huge_precision(self):
1469        format_string = "%.{}f".format(sys.maxsize + 1)
1470        with self.assertRaises(ValueError):
1471            result = format_string % 2.34
1472
1473    def test_issue28598_strsubclass_rhs(self):
1474        # A subclass of str with an __rmod__ method should be able to hook
1475        # into the % operator
1476        class SubclassedStr(str):
1477            def __rmod__(self, other):
1478                return 'Success, self.__rmod__({!r}) was called'.format(other)
1479        self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
1480                         "Success, self.__rmod__('lhs %% %r') was called")
1481
1482    @support.cpython_only
1483    def test_formatting_huge_precision_c_limits(self):
1484        from _testcapi import INT_MAX
1485        format_string = "%.{}f".format(INT_MAX + 1)
1486        with self.assertRaises(ValueError):
1487            result = format_string % 2.34
1488
1489    def test_formatting_huge_width(self):
1490        format_string = "%{}f".format(sys.maxsize + 1)
1491        with self.assertRaises(ValueError):
1492            result = format_string % 2.34
1493
1494    def test_startswith_endswith_errors(self):
1495        for meth in ('foo'.startswith, 'foo'.endswith):
1496            with self.assertRaises(TypeError) as cm:
1497                meth(['f'])
1498            exc = str(cm.exception)
1499            self.assertIn('str', exc)
1500            self.assertIn('tuple', exc)
1501
1502    @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
1503    def test_format_float(self):
1504        # should not format with a comma, but always with C locale
1505        self.assertEqual('1.0', '%.1f' % 1.0)
1506
1507    def test_constructor(self):
1508        # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1509
1510        self.assertEqual(
1511            str('unicode remains unicode'),
1512            'unicode remains unicode'
1513        )
1514
1515        for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1516            subclass = StrSubclass(text)
1517            self.assertEqual(str(subclass), text)
1518            self.assertEqual(len(subclass), len(text))
1519            if text == 'ascii':
1520                self.assertEqual(subclass.encode('ascii'), b'ascii')
1521                self.assertEqual(subclass.encode('utf-8'), b'ascii')
1522
1523        self.assertEqual(
1524            str('strings are converted to unicode'),
1525            'strings are converted to unicode'
1526        )
1527
1528        class StringCompat:
1529            def __init__(self, x):
1530                self.x = x
1531            def __str__(self):
1532                return self.x
1533
1534        self.assertEqual(
1535            str(StringCompat('__str__ compatible objects are recognized')),
1536            '__str__ compatible objects are recognized'
1537        )
1538
1539        # unicode(obj) is compatible to str():
1540
1541        o = StringCompat('unicode(obj) is compatible to str()')
1542        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1543        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1544
1545        for obj in (123, 123.45, 123):
1546            self.assertEqual(str(obj), str(str(obj)))
1547
1548        # unicode(obj, encoding, error) tests (this maps to
1549        # PyUnicode_FromEncodedObject() at C level)
1550
1551        if not sys.platform.startswith('java'):
1552            self.assertRaises(
1553                TypeError,
1554                str,
1555                'decoding unicode is not supported',
1556                'utf-8',
1557                'strict'
1558            )
1559
1560        self.assertEqual(
1561            str(b'strings are decoded to unicode', 'utf-8', 'strict'),
1562            'strings are decoded to unicode'
1563        )
1564
1565        if not sys.platform.startswith('java'):
1566            self.assertEqual(
1567                str(
1568                    memoryview(b'character buffers are decoded to unicode'),
1569                    'utf-8',
1570                    'strict'
1571                ),
1572                'character buffers are decoded to unicode'
1573            )
1574
1575        self.assertRaises(TypeError, str, 42, 42, 42)
1576
1577    def test_constructor_keyword_args(self):
1578        """Pass various keyword argument combinations to the constructor."""
1579        # The object argument can be passed as a keyword.
1580        self.assertEqual(str(object='foo'), 'foo')
1581        self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1582        # The errors argument without encoding triggers "decode" mode.
1583        self.assertEqual(str(b'foo', errors='strict'), 'foo')  # not "b'foo'"
1584        self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1585
1586    def test_constructor_defaults(self):
1587        """Check the constructor argument defaults."""
1588        # The object argument defaults to '' or b''.
1589        self.assertEqual(str(), '')
1590        self.assertEqual(str(errors='strict'), '')
1591        utf8_cent = '¢'.encode('utf-8')
1592        # The encoding argument defaults to utf-8.
1593        self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1594        # The errors argument defaults to strict.
1595        self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1596
1597    def test_codecs_utf7(self):
1598        utfTests = [
1599            ('A\u2262\u0391.', b'A+ImIDkQ.'),             # RFC2152 example
1600            ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'),     # RFC2152 example
1601            ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'),        # RFC2152 example
1602            ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1603            ('+', b'+-'),
1604            ('+-', b'+--'),
1605            ('+?', b'+-?'),
1606            (r'\?', b'+AFw?'),
1607            ('+?', b'+-?'),
1608            (r'\\?', b'+AFwAXA?'),
1609            (r'\\\?', b'+AFwAXABc?'),
1610            (r'++--', b'+-+---'),
1611            ('\U000abcde', b'+2m/c3g-'),                  # surrogate pairs
1612            ('/', b'/'),
1613        ]
1614
1615        for (x, y) in utfTests:
1616            self.assertEqual(x.encode('utf-7'), y)
1617
1618        # Unpaired surrogates are passed through
1619        self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1620        self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1621        self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1622        self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1623        self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1624        self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1625        self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1626        self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
1627
1628        self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1629        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
1630
1631        # Issue #2242: crash on some Windows/MSVC versions
1632        self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
1633
1634        # Direct encoded characters
1635        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1636        # Optional direct characters
1637        set_o = '!"#$%&*;<=>@[]^_`{|}'
1638        for c in set_d:
1639            self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1640            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1641        for c in set_o:
1642            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1643
1644    def test_codecs_utf8(self):
1645        self.assertEqual(''.encode('utf-8'), b'')
1646        self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
1647        self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1648        self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
1649        self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1650        self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
1651        self.assertEqual(('\U00010002'*10).encode('utf-8'),
1652                         b'\xf0\x90\x80\x82'*10)
1653        self.assertEqual(
1654            '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1655            '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1656            '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1657            '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1658            '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1659            ' Nunstuck git und'.encode('utf-8'),
1660            b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1661            b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1662            b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1663            b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1664            b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1665            b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1666            b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1667            b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1668            b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1669            b'\xe3\x80\x8cWenn ist das Nunstuck git und'
1670        )
1671
1672        # UTF-8 specific decoding tests
1673        self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1674        self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1675        self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
1676
1677        # Other possible utf-8 test cases:
1678        # * strict decoding testing for all of the
1679        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
1680
1681    def test_utf8_decode_valid_sequences(self):
1682        sequences = [
1683            # single byte
1684            (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1685            # 2 bytes
1686            (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1687            # 3 bytes
1688            (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1689            (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1690            # 4 bytes
1691            (b'\xF0\x90\x80\x80', '\U00010000'),
1692            (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1693        ]
1694        for seq, res in sequences:
1695            self.assertEqual(seq.decode('utf-8'), res)
1696
1697
1698    def test_utf8_decode_invalid_sequences(self):
1699        # continuation bytes in a sequence of 2, 3, or 4 bytes
1700        continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1701        # start bytes of a 2-byte sequence equivalent to code points < 0x7F
1702        invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1703        # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
1704        invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1705        invalid_start_bytes = (
1706            continuation_bytes + invalid_2B_seq_start_bytes +
1707            invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1708        )
1709
1710        for byte in invalid_start_bytes:
1711            self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1712
1713        for sb in invalid_2B_seq_start_bytes:
1714            for cb in continuation_bytes:
1715                self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1716
1717        for sb in invalid_4B_seq_start_bytes:
1718            for cb1 in continuation_bytes[:3]:
1719                for cb3 in continuation_bytes[:3]:
1720                    self.assertRaises(UnicodeDecodeError,
1721                                      (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1722
1723        for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1724            self.assertRaises(UnicodeDecodeError,
1725                              (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1726            self.assertRaises(UnicodeDecodeError,
1727                              (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1728        # surrogates
1729        for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1730            self.assertRaises(UnicodeDecodeError,
1731                              (b'\xED'+cb+b'\x80').decode, 'utf-8')
1732            self.assertRaises(UnicodeDecodeError,
1733                              (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1734        for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1735            self.assertRaises(UnicodeDecodeError,
1736                              (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1737            self.assertRaises(UnicodeDecodeError,
1738                              (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1739        for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1740            self.assertRaises(UnicodeDecodeError,
1741                              (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1742            self.assertRaises(UnicodeDecodeError,
1743                              (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1744
1745    def test_issue8271(self):
1746        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1747        # only the start byte and the continuation byte(s) are now considered
1748        # invalid, instead of the number of bytes specified by the start byte.
1749        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1750        # table 3-8, Row 2) for more information about the algorithm used.
1751        FFFD = '\ufffd'
1752        sequences = [
1753            # invalid start bytes
1754            (b'\x80', FFFD), # continuation byte
1755            (b'\x80\x80', FFFD*2), # 2 continuation bytes
1756            (b'\xc0', FFFD),
1757            (b'\xc0\xc0', FFFD*2),
1758            (b'\xc1', FFFD),
1759            (b'\xc1\xc0', FFFD*2),
1760            (b'\xc0\xc1', FFFD*2),
1761            # with start byte of a 2-byte sequence
1762            (b'\xc2', FFFD), # only the start byte
1763            (b'\xc2\xc2', FFFD*2), # 2 start bytes
1764            (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
1765            (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1766            # with start byte of a 3-byte sequence
1767            (b'\xe1', FFFD), # only the start byte
1768            (b'\xe1\xe1', FFFD*2), # 2 start bytes
1769            (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1770            (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1771            (b'\xe1\x80', FFFD), # only 1 continuation byte
1772            (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1773            (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1774            (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1775            (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1776            (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1777            (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1778            # with start byte of a 4-byte sequence
1779            (b'\xf1', FFFD), # only the start byte
1780            (b'\xf1\xf1', FFFD*2), # 2 start bytes
1781            (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1782            (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1783            (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1784            (b'\xf1\x80', FFFD), # only 1 continuation bytes
1785            (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1786            (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1787            (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1788            (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1789            (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1790            (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1791            (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1792            (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1793            (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1794            (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1795            (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1796            (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1797            # with invalid start byte of a 4-byte sequence (rfc2279)
1798            (b'\xf5', FFFD), # only the start byte
1799            (b'\xf5\xf5', FFFD*2), # 2 start bytes
1800            (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1801            (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1802            (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1803            (b'\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
1804            (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1805            (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1806            # with invalid start byte of a 5-byte sequence (rfc2279)
1807            (b'\xf8', FFFD), # only the start byte
1808            (b'\xf8\xf8', FFFD*2), # 2 start bytes
1809            (b'\xf8\x80', FFFD*2), # only one continuation byte
1810            (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1811            (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1812            # with invalid start byte of a 6-byte sequence (rfc2279)
1813            (b'\xfc', FFFD), # only the start byte
1814            (b'\xfc\xfc', FFFD*2), # 2 start bytes
1815            (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1816            (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1817            # invalid start byte
1818            (b'\xfe', FFFD),
1819            (b'\xfe\x80\x80', FFFD*3),
1820            # other sequences
1821            (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1822            (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1823            (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1824            (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1825             '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1826        ]
1827        for n, (seq, res) in enumerate(sequences):
1828            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1829            self.assertEqual(seq.decode('utf-8', 'replace'), res)
1830            self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1831            self.assertEqual(seq.decode('utf-8', 'ignore'),
1832                             res.replace('\uFFFD', ''))
1833
1834    def assertCorrectUTF8Decoding(self, seq, res, err):
1835        """
1836        Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when
1837        'strict' is used, returns res when 'replace' is used, and that doesn't
1838        return anything when 'ignore' is used.
1839        """
1840        with self.assertRaises(UnicodeDecodeError) as cm:
1841            seq.decode('utf-8')
1842        exc = cm.exception
1843
1844        self.assertIn(err, str(exc))
1845        self.assertEqual(seq.decode('utf-8', 'replace'), res)
1846        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
1847                         'aaaa' + res + 'bbbb')
1848        res = res.replace('\ufffd', '')
1849        self.assertEqual(seq.decode('utf-8', 'ignore'), res)
1850        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
1851                          'aaaa' + res + 'bbbb')
1852
1853    def test_invalid_start_byte(self):
1854        """
1855        Test that an 'invalid start byte' error is raised when the first byte
1856        is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1857        4-bytes sequence. The invalid start byte is replaced with a single
1858        U+FFFD when errors='replace'.
1859        E.g. <80> is a continuation byte and can appear only after a start byte.
1860        """
1861        FFFD = '\ufffd'
1862        for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1863            self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1864                                           'invalid start byte')
1865
1866    def test_unexpected_end_of_data(self):
1867        """
1868        Test that an 'unexpected end of data' error is raised when the string
1869        ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1870        enough continuation bytes.  The incomplete sequence is replaced with a
1871        single U+FFFD when errors='replace'.
1872        E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1873        sequence, but it's followed by only 2 valid continuation bytes and the
1874        last continuation bytes is missing.
1875        Note: the continuation bytes must be all valid, if one of them is
1876        invalid another error will be raised.
1877        """
1878        sequences = [
1879            'C2', 'DF',
1880            'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1881            'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1882            'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1883            'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1884            'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1885            'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1886        ]
1887        FFFD = '\ufffd'
1888        for seq in sequences:
1889            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd',
1890                                           'unexpected end of data')
1891
1892    def test_invalid_cb_for_2bytes_seq(self):
1893        """
1894        Test that an 'invalid continuation byte' error is raised when the
1895        continuation byte of a 2-bytes sequence is invalid.  The start byte
1896        is replaced by a single U+FFFD and the second byte is handled
1897        separately when errors='replace'.
1898        E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1899        sequence, but 41 is not a valid continuation byte because it's the
1900        ASCII letter 'A'.
1901        """
1902        FFFD = '\ufffd'
1903        FFFDx2 = FFFD * 2
1904        sequences = [
1905            ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1906            ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1907            ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1908            ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1909        ]
1910        for seq, res in sequences:
1911            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1912                                           'invalid continuation byte')
1913
1914    def test_invalid_cb_for_3bytes_seq(self):
1915        """
1916        Test that an 'invalid continuation byte' error is raised when the
1917        continuation byte(s) of a 3-bytes sequence are invalid.  When
1918        errors='replace', if the first continuation byte is valid, the first
1919        two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
1920        third byte is handled separately, otherwise only the start byte is
1921        replaced with a U+FFFD and the other continuation bytes are handled
1922        separately.
1923        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1924        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1925        because it's the ASCII letter 'A'.
1926        Note: when the start byte is E0 or ED, the valid ranges for the first
1927        continuation byte are limited to A0..BF and 80..9F respectively.
1928        Python 2 used to consider all the bytes in range 80..BF valid when the
1929        start byte was ED.  This is fixed in Python 3.
1930        """
1931        FFFD = '\ufffd'
1932        FFFDx2 = FFFD * 2
1933        sequences = [
1934            ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
1935            ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
1936            ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
1937            ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
1938            ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
1939            ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
1940            ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
1941            ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
1942            ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
1943            ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
1944            ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
1945            ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
1946            ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
1947            ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
1948            ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
1949            ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
1950            ('ED 7F', FFFD+'\x7f'),
1951            ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
1952            ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
1953            ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
1954            ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
1955            ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
1956            ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
1957            ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
1958            ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
1959            ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
1960            ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
1961            ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
1962            ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
1963            ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
1964            ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
1965            ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
1966            ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
1967        ]
1968        for seq, res in sequences:
1969            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1970                                           'invalid continuation byte')
1971
1972    def test_invalid_cb_for_4bytes_seq(self):
1973        """
1974        Test that an 'invalid continuation byte' error is raised when the
1975        continuation byte(s) of a 4-bytes sequence are invalid.  When
1976        errors='replace',the start byte and all the following valid
1977        continuation bytes are replaced with a single U+FFFD, and all the bytes
1978        starting from the first invalid continuation bytes (included) are
1979        handled separately.
1980        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1981        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1982        because it's the ASCII letter 'A'.
1983        Note: when the start byte is E0 or ED, the valid ranges for the first
1984        continuation byte are limited to A0..BF and 80..9F respectively.
1985        However, when the start byte is ED, Python 2 considers all the bytes
1986        in range 80..BF valid.  This is fixed in Python 3.
1987        """
1988        FFFD = '\ufffd'
1989        FFFDx2 = FFFD * 2
1990        sequences = [
1991            ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
1992            ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
1993            ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
1994            ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
1995            ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
1996            ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
1997            ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
1998            ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
1999            ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
2000            ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
2001            ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
2002            ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
2003            ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
2004            ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
2005            ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
2006            ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
2007            ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
2008            ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
2009            ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
2010            ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
2011            ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
2012            ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
2013            ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
2014            ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
2015            ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
2016            ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
2017            ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
2018            ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
2019            ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
2020            ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
2021            ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
2022            ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
2023            ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
2024            ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
2025            ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
2026            ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
2027            ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
2028            ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
2029            ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
2030            ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
2031            ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
2032            ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
2033            ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
2034            ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
2035            ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
2036            ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
2037            ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
2038            ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
2039            ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
2040            ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
2041            ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
2042            ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
2043            ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
2044            ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
2045            ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
2046        ]
2047        for seq, res in sequences:
2048            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
2049                                           'invalid continuation byte')
2050
2051    def test_codecs_idna(self):
2052        # Test whether trailing dot is preserved
2053        self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
2054
2055    def test_codecs_errors(self):
2056        # Error handling (encoding)
2057        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
2058        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
2059        self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
2060        self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
2061        self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
2062                         'Andr\202 x'.encode('ascii', errors='replace'))
2063        self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
2064                         'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
2065
2066        # Error handling (decoding)
2067        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
2068        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
2069        self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
2070        self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
2071        self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
2072
2073        # Error handling (unknown character names)
2074        self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
2075
2076        # Error handling (truncated escape sequence)
2077        self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
2078
2079        self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
2080        self.assertRaises(TypeError, str, b"hello", "test.unicode2")
2081        self.assertRaises(TypeError, "hello".encode, "test.unicode1")
2082        self.assertRaises(TypeError, "hello".encode, "test.unicode2")
2083
2084        # Error handling (wrong arguments)
2085        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
2086
2087        # Error handling (lone surrogate in
2088        # _PyUnicode_TransformDecimalAndSpaceToASCII())
2089        self.assertRaises(ValueError, int, "\ud800")
2090        self.assertRaises(ValueError, int, "\udf00")
2091        self.assertRaises(ValueError, float, "\ud800")
2092        self.assertRaises(ValueError, float, "\udf00")
2093        self.assertRaises(ValueError, complex, "\ud800")
2094        self.assertRaises(ValueError, complex, "\udf00")
2095
2096    def test_codecs(self):
2097        # Encoding
2098        self.assertEqual('hello'.encode('ascii'), b'hello')
2099        self.assertEqual('hello'.encode('utf-7'), b'hello')
2100        self.assertEqual('hello'.encode('utf-8'), b'hello')
2101        self.assertEqual('hello'.encode('utf-8'), b'hello')
2102        self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2103        self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2104        self.assertEqual('hello'.encode('latin-1'), b'hello')
2105
2106        # Default encoding is utf-8
2107        self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2108
2109        # Roundtrip safety for BMP (just the first 1024 chars)
2110        for c in range(1024):
2111            u = chr(c)
2112            for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2113                             'utf-16-be', 'raw_unicode_escape',
2114                             'unicode_escape', 'unicode_internal'):
2115                with warnings.catch_warnings():
2116                    # unicode-internal has been deprecated
2117                    warnings.simplefilter("ignore", DeprecationWarning)
2118
2119                    self.assertEqual(str(u.encode(encoding),encoding), u)
2120
2121        # Roundtrip safety for BMP (just the first 256 chars)
2122        for c in range(256):
2123            u = chr(c)
2124            for encoding in ('latin-1',):
2125                self.assertEqual(str(u.encode(encoding),encoding), u)
2126
2127        # Roundtrip safety for BMP (just the first 128 chars)
2128        for c in range(128):
2129            u = chr(c)
2130            for encoding in ('ascii',):
2131                self.assertEqual(str(u.encode(encoding),encoding), u)
2132
2133        # Roundtrip safety for non-BMP (just a few chars)
2134        with warnings.catch_warnings():
2135            # unicode-internal has been deprecated
2136            warnings.simplefilter("ignore", DeprecationWarning)
2137
2138            u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2139            for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2140                             'raw_unicode_escape',
2141                             'unicode_escape', 'unicode_internal'):
2142                self.assertEqual(str(u.encode(encoding),encoding), u)
2143
2144        # UTF-8 must be roundtrip safe for all code points
2145        # (except surrogates, which are forbidden).
2146        u = ''.join(map(chr, list(range(0, 0xd800)) +
2147                             list(range(0xe000, 0x110000))))
2148        for encoding in ('utf-8',):
2149            self.assertEqual(str(u.encode(encoding),encoding), u)
2150
2151    def test_codecs_charmap(self):
2152        # 0-127
2153        s = bytes(range(128))
2154        for encoding in (
2155            'cp037', 'cp1026', 'cp273',
2156            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2157            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2158            'cp863', 'cp865', 'cp866', 'cp1125',
2159            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2160            'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
2161            'iso8859_7', 'iso8859_9',
2162            'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
2163            'mac_cyrillic', 'mac_latin2',
2164
2165            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2166            'cp1256', 'cp1257', 'cp1258',
2167            'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2168
2169            'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2170            'cp1006', 'iso8859_8',
2171
2172            ### These have undefined mappings:
2173            #'cp424',
2174
2175            ### These fail the round-trip:
2176            #'cp875'
2177
2178            ):
2179            self.assertEqual(str(s, encoding).encode(encoding), s)
2180
2181        # 128-255
2182        s = bytes(range(128, 256))
2183        for encoding in (
2184            'cp037', 'cp1026', 'cp273',
2185            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2186            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2187            'cp863', 'cp865', 'cp866', 'cp1125',
2188            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2189            'iso8859_2', 'iso8859_4', 'iso8859_5',
2190            'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
2191            'mac_cyrillic', 'mac_latin2',
2192
2193            ### These have undefined mappings:
2194            #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2195            #'cp1256', 'cp1257', 'cp1258',
2196            #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2197            #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
2198            #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2199
2200            ### These fail the round-trip:
2201            #'cp1006', 'cp875', 'iso8859_8',
2202
2203            ):
2204            self.assertEqual(str(s, encoding).encode(encoding), s)
2205
2206    def test_concatenation(self):
2207        self.assertEqual(("abc" "def"), "abcdef")
2208        self.assertEqual(("abc" "def"), "abcdef")
2209        self.assertEqual(("abc" "def"), "abcdef")
2210        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2211        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2212
2213    def test_printing(self):
2214        class BitBucket:
2215            def write(self, text):
2216                pass
2217
2218        out = BitBucket()
2219        print('abc', file=out)
2220        print('abc', 'def', file=out)
2221        print('abc', 'def', file=out)
2222        print('abc', 'def', file=out)
2223        print('abc\n', file=out)
2224        print('abc\n', end=' ', file=out)
2225        print('abc\n', end=' ', file=out)
2226        print('def\n', file=out)
2227        print('def\n', file=out)
2228
2229    def test_ucs4(self):
2230        x = '\U00100000'
2231        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2232        self.assertEqual(x, y)
2233
2234        y = br'\U00100000'
2235        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2236        self.assertEqual(x, y)
2237        y = br'\U00010000'
2238        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2239        self.assertEqual(x, y)
2240
2241        try:
2242            br'\U11111111'.decode("raw-unicode-escape")
2243        except UnicodeDecodeError as e:
2244            self.assertEqual(e.start, 0)
2245            self.assertEqual(e.end, 10)
2246        else:
2247            self.fail("Should have raised UnicodeDecodeError")
2248
2249    def test_conversion(self):
2250        # Make sure __str__() works properly
2251        class ObjectToStr:
2252            def __str__(self):
2253                return "foo"
2254
2255        class StrSubclassToStr(str):
2256            def __str__(self):
2257                return "foo"
2258
2259        class StrSubclassToStrSubclass(str):
2260            def __new__(cls, content=""):
2261                return str.__new__(cls, 2*content)
2262            def __str__(self):
2263                return self
2264
2265        self.assertEqual(str(ObjectToStr()), "foo")
2266        self.assertEqual(str(StrSubclassToStr("bar")), "foo")
2267        s = str(StrSubclassToStrSubclass("foo"))
2268        self.assertEqual(s, "foofoo")
2269        self.assertIs(type(s), StrSubclassToStrSubclass)
2270        s = StrSubclass(StrSubclassToStrSubclass("foo"))
2271        self.assertEqual(s, "foofoo")
2272        self.assertIs(type(s), StrSubclass)
2273
2274    def test_unicode_repr(self):
2275        class s1:
2276            def __repr__(self):
2277                return '\\n'
2278
2279        class s2:
2280            def __repr__(self):
2281                return '\\n'
2282
2283        self.assertEqual(repr(s1()), '\\n')
2284        self.assertEqual(repr(s2()), '\\n')
2285
2286    def test_printable_repr(self):
2287        self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
2288        self.assertEqual(repr('\U00014000'), "'\\U00014000'")     # nonprintable
2289
2290    # This test only affects 32-bit platforms because expandtabs can only take
2291    # an int as the max value, not a 64-bit C long.  If expandtabs is changed
2292    # to take a 64-bit long, this test should apply to all platforms.
2293    @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2294                     'only applies to 32-bit platforms')
2295    def test_expandtabs_overflows_gracefully(self):
2296        self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
2297
2298    @support.cpython_only
2299    def test_expandtabs_optimization(self):
2300        s = 'abc'
2301        self.assertIs(s.expandtabs(), s)
2302
2303    def test_raiseMemError(self):
2304        if struct.calcsize('P') == 8:
2305            # 64 bits pointers
2306            ascii_struct_size = 48
2307            compact_struct_size = 72
2308        else:
2309            # 32 bits pointers
2310            ascii_struct_size = 24
2311            compact_struct_size = 36
2312
2313        for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2314            code = ord(char)
2315            if code < 0x100:
2316                char_size = 1  # sizeof(Py_UCS1)
2317                struct_size = ascii_struct_size
2318            elif code < 0x10000:
2319                char_size = 2  # sizeof(Py_UCS2)
2320                struct_size = compact_struct_size
2321            else:
2322                char_size = 4  # sizeof(Py_UCS4)
2323                struct_size = compact_struct_size
2324            # Note: sys.maxsize is half of the actual max allocation because of
2325            # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2326            # be allocatable, given enough memory.
2327            maxlen = ((sys.maxsize - struct_size) // char_size)
2328            alloc = lambda: char * maxlen
2329            self.assertRaises(MemoryError, alloc)
2330            self.assertRaises(MemoryError, alloc)
2331
2332    def test_format_subclass(self):
2333        class S(str):
2334            def __str__(self):
2335                return '__str__ overridden'
2336        s = S('xxx')
2337        self.assertEqual("%s" % s, '__str__ overridden')
2338        self.assertEqual("{}".format(s), '__str__ overridden')
2339
2340    def test_subclass_add(self):
2341        class S(str):
2342            def __add__(self, o):
2343                return "3"
2344        self.assertEqual(S("4") + S("5"), "3")
2345        class S(str):
2346            def __iadd__(self, o):
2347                return "3"
2348        s = S("1")
2349        s += "4"
2350        self.assertEqual(s, "3")
2351
2352    def test_getnewargs(self):
2353        text = 'abc'
2354        args = text.__getnewargs__()
2355        self.assertIsNot(args[0], text)
2356        self.assertEqual(args[0], text)
2357        self.assertEqual(len(args), 1)
2358
    def test_resize(self):
        # Check that a string grown with "+=" still encodes to, and decodes
        # from, unicode_internal consistently with its new content.  Several
        # starting lengths are tried (1, 8, 15, ... up to 99).
        # NOTE(review): the statement order matters here; the test relies on
        # "text" being a fresh object before "+=" is applied.
        for length in range(1, 100, 7):
            # generate a fresh string (refcount=1)
            text = 'a' * length + 'b'

            # The codec is deprecated: the helper is given the expected
            # DeprecationWarning message for the calls made in this block.
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                # fill wstr internal field
                abc = text.encode('unicode_internal')
                self.assertEqual(abc.decode('unicode_internal'), text)

                # resize text: wstr field must be cleared and then recomputed
                text += 'c'
                abcdef = text.encode('unicode_internal')
                # The encoded form must reflect the appended character ...
                self.assertNotEqual(abc, abcdef)
                # ... and must decode back to the resized string.
                self.assertEqual(abcdef.decode('unicode_internal'), text)
2375
2376    def test_compare(self):
2377        # Issue #17615
2378        N = 10
2379        ascii = 'a' * N
2380        ascii2 = 'z' * N
2381        latin = '\x80' * N
2382        latin2 = '\xff' * N
2383        bmp = '\u0100' * N
2384        bmp2 = '\uffff' * N
2385        astral = '\U00100000' * N
2386        astral2 = '\U0010ffff' * N
2387        strings = (
2388            ascii, ascii2,
2389            latin, latin2,
2390            bmp, bmp2,
2391            astral, astral2)
2392        for text1, text2 in itertools.combinations(strings, 2):
2393            equal = (text1 is text2)
2394            self.assertEqual(text1 == text2, equal)
2395            self.assertEqual(text1 != text2, not equal)
2396
2397            if equal:
2398                self.assertTrue(text1 <= text2)
2399                self.assertTrue(text1 >= text2)
2400
2401                # text1 is text2: duplicate strings to skip the "str1 == str2"
2402                # optimization in unicode_compare_eq() and really compare
2403                # character per character
2404                copy1 = duplicate_string(text1)
2405                copy2 = duplicate_string(text2)
2406                self.assertIsNot(copy1, copy2)
2407
2408                self.assertTrue(copy1 == copy2)
2409                self.assertFalse(copy1 != copy2)
2410
2411                self.assertTrue(copy1 <= copy2)
2412                self.assertTrue(copy2 >= copy2)
2413
2414        self.assertTrue(ascii < ascii2)
2415        self.assertTrue(ascii < latin)
2416        self.assertTrue(ascii < bmp)
2417        self.assertTrue(ascii < astral)
2418        self.assertFalse(ascii >= ascii2)
2419        self.assertFalse(ascii >= latin)
2420        self.assertFalse(ascii >= bmp)
2421        self.assertFalse(ascii >= astral)
2422
2423        self.assertFalse(latin < ascii)
2424        self.assertTrue(latin < latin2)
2425        self.assertTrue(latin < bmp)
2426        self.assertTrue(latin < astral)
2427        self.assertTrue(latin >= ascii)
2428        self.assertFalse(latin >= latin2)
2429        self.assertFalse(latin >= bmp)
2430        self.assertFalse(latin >= astral)
2431
2432        self.assertFalse(bmp < ascii)
2433        self.assertFalse(bmp < latin)
2434        self.assertTrue(bmp < bmp2)
2435        self.assertTrue(bmp < astral)
2436        self.assertTrue(bmp >= ascii)
2437        self.assertTrue(bmp >= latin)
2438        self.assertFalse(bmp >= bmp2)
2439        self.assertFalse(bmp >= astral)
2440
2441        self.assertFalse(astral < ascii)
2442        self.assertFalse(astral < latin)
2443        self.assertFalse(astral < bmp2)
2444        self.assertTrue(astral < astral2)
2445        self.assertTrue(astral >= ascii)
2446        self.assertTrue(astral >= latin)
2447        self.assertTrue(astral >= bmp2)
2448        self.assertFalse(astral >= astral2)
2449
2450    def test_free_after_iterating(self):
2451        support.check_free_after_iterating(self, iter, str)
2452        support.check_free_after_iterating(self, reversed, str)
2453
2454
2455class CAPITest(unittest.TestCase):
2456
2457    # Test PyUnicode_FromFormat()
2458    def test_from_format(self):
2459        support.import_module('ctypes')
2460        from ctypes import (
2461            pythonapi, py_object, sizeof,
2462            c_int, c_long, c_longlong, c_ssize_t,
2463            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
2464        name = "PyUnicode_FromFormat"
2465        _PyUnicode_FromFormat = getattr(pythonapi, name)
2466        _PyUnicode_FromFormat.restype = py_object
2467
2468        def PyUnicode_FromFormat(format, *args):
2469            cargs = tuple(
2470                py_object(arg) if isinstance(arg, str) else arg
2471                for arg in args)
2472            return _PyUnicode_FromFormat(format, *cargs)
2473
2474        def check_format(expected, format, *args):
2475            text = PyUnicode_FromFormat(format, *args)
2476            self.assertEqual(expected, text)
2477
2478        # ascii format, non-ascii argument
2479        check_format('ascii\x7f=unicode\xe9',
2480                     b'ascii\x7f=%U', 'unicode\xe9')
2481
2482        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
2483        # raises an error
2484        self.assertRaisesRegex(ValueError,
2485            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
2486            'string, got a non-ASCII byte: 0xe9$',
2487            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
2488
2489        # test "%c"
2490        check_format('\uabcd',
2491                     b'%c', c_int(0xabcd))
2492        check_format('\U0010ffff',
2493                     b'%c', c_int(0x10ffff))
2494        with self.assertRaises(OverflowError):
2495            PyUnicode_FromFormat(b'%c', c_int(0x110000))
2496        # Issue #18183
2497        check_format('\U00010000\U00100000',
2498                     b'%c%c', c_int(0x10000), c_int(0x100000))
2499
2500        # test "%"
2501        check_format('%',
2502                     b'%')
2503        check_format('%',
2504                     b'%%')
2505        check_format('%s',
2506                     b'%%s')
2507        check_format('[%]',
2508                     b'[%%]')
2509        check_format('%abc',
2510                     b'%%%s', b'abc')
2511
2512        # truncated string
2513        check_format('abc',
2514                     b'%.3s', b'abcdef')
2515        check_format('abc[\ufffd',
2516                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
2517        check_format("'\\u20acABC'",
2518                     b'%A', '\u20acABC')
2519        check_format("'\\u20",
2520                     b'%.5A', '\u20acABCDEF')
2521        check_format("'\u20acABC'",
2522                     b'%R', '\u20acABC')
2523        check_format("'\u20acA",
2524                     b'%.3R', '\u20acABCDEF')
2525        check_format('\u20acAB',
2526                     b'%.3S', '\u20acABCDEF')
2527        check_format('\u20acAB',
2528                     b'%.3U', '\u20acABCDEF')
2529        check_format('\u20acAB',
2530                     b'%.3V', '\u20acABCDEF', None)
2531        check_format('abc[\ufffd',
2532                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
2533
2534        # following tests comes from #7330
2535        # test width modifier and precision modifier with %S
2536        check_format("repr=  abc",
2537                     b'repr=%5S', 'abc')
2538        check_format("repr=ab",
2539                     b'repr=%.2S', 'abc')
2540        check_format("repr=   ab",
2541                     b'repr=%5.2S', 'abc')
2542
2543        # test width modifier and precision modifier with %R
2544        check_format("repr=   'abc'",
2545                     b'repr=%8R', 'abc')
2546        check_format("repr='ab",
2547                     b'repr=%.3R', 'abc')
2548        check_format("repr=  'ab",
2549                     b'repr=%5.3R', 'abc')
2550
2551        # test width modifier and precision modifier with %A
2552        check_format("repr=   'abc'",
2553                     b'repr=%8A', 'abc')
2554        check_format("repr='ab",
2555                     b'repr=%.3A', 'abc')
2556        check_format("repr=  'ab",
2557                     b'repr=%5.3A', 'abc')
2558
2559        # test width modifier and precision modifier with %s
2560        check_format("repr=  abc",
2561                     b'repr=%5s', b'abc')
2562        check_format("repr=ab",
2563                     b'repr=%.2s', b'abc')
2564        check_format("repr=   ab",
2565                     b'repr=%5.2s', b'abc')
2566
2567        # test width modifier and precision modifier with %U
2568        check_format("repr=  abc",
2569                     b'repr=%5U', 'abc')
2570        check_format("repr=ab",
2571                     b'repr=%.2U', 'abc')
2572        check_format("repr=   ab",
2573                     b'repr=%5.2U', 'abc')
2574
2575        # test width modifier and precision modifier with %V
2576        check_format("repr=  abc",
2577                     b'repr=%5V', 'abc', b'123')
2578        check_format("repr=ab",
2579                     b'repr=%.2V', 'abc', b'123')
2580        check_format("repr=   ab",
2581                     b'repr=%5.2V', 'abc', b'123')
2582        check_format("repr=  123",
2583                     b'repr=%5V', None, b'123')
2584        check_format("repr=12",
2585                     b'repr=%.2V', None, b'123')
2586        check_format("repr=   12",
2587                     b'repr=%5.2V', None, b'123')
2588
2589        # test integer formats (%i, %d, %u)
2590        check_format('010',
2591                     b'%03i', c_int(10))
2592        check_format('0010',
2593                     b'%0.4i', c_int(10))
2594        check_format('-123',
2595                     b'%i', c_int(-123))
2596        check_format('-123',
2597                     b'%li', c_long(-123))
2598        check_format('-123',
2599                     b'%lli', c_longlong(-123))
2600        check_format('-123',
2601                     b'%zi', c_ssize_t(-123))
2602
2603        check_format('-123',
2604                     b'%d', c_int(-123))
2605        check_format('-123',
2606                     b'%ld', c_long(-123))
2607        check_format('-123',
2608                     b'%lld', c_longlong(-123))
2609        check_format('-123',
2610                     b'%zd', c_ssize_t(-123))
2611
2612        check_format('123',
2613                     b'%u', c_uint(123))
2614        check_format('123',
2615                     b'%lu', c_ulong(123))
2616        check_format('123',
2617                     b'%llu', c_ulonglong(123))
2618        check_format('123',
2619                     b'%zu', c_size_t(123))
2620
2621        # test long output
2622        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
2623        max_longlong = -min_longlong - 1
2624        check_format(str(min_longlong),
2625                     b'%lld', c_longlong(min_longlong))
2626        check_format(str(max_longlong),
2627                     b'%lld', c_longlong(max_longlong))
2628        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
2629        check_format(str(max_ulonglong),
2630                     b'%llu', c_ulonglong(max_ulonglong))
2631        PyUnicode_FromFormat(b'%p', c_void_p(-1))
2632
2633        # test padding (width and/or precision)
2634        check_format('123'.rjust(10, '0'),
2635                     b'%010i', c_int(123))
2636        check_format('123'.rjust(100),
2637                     b'%100i', c_int(123))
2638        check_format('123'.rjust(100, '0'),
2639                     b'%.100i', c_int(123))
2640        check_format('123'.rjust(80, '0').rjust(100),
2641                     b'%100.80i', c_int(123))
2642
2643        check_format('123'.rjust(10, '0'),
2644                     b'%010u', c_uint(123))
2645        check_format('123'.rjust(100),
2646                     b'%100u', c_uint(123))
2647        check_format('123'.rjust(100, '0'),
2648                     b'%.100u', c_uint(123))
2649        check_format('123'.rjust(80, '0').rjust(100),
2650                     b'%100.80u', c_uint(123))
2651
2652        check_format('123'.rjust(10, '0'),
2653                     b'%010x', c_int(0x123))
2654        check_format('123'.rjust(100),
2655                     b'%100x', c_int(0x123))
2656        check_format('123'.rjust(100, '0'),
2657                     b'%.100x', c_int(0x123))
2658        check_format('123'.rjust(80, '0').rjust(100),
2659                     b'%100.80x', c_int(0x123))
2660
2661        # test %A
2662        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
2663                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
2664
2665        # test %V
2666        check_format('repr=abc',
2667                     b'repr=%V', 'abc', b'xyz')
2668
        # Test that the byte string passed to %V is decoded using utf-8.
        # b'\xe4\xba\xba\xe6\xb0\x91' is the utf-8 encoded byte sequence of
        # '\u4eba\u6c11'
2672        check_format('repr=\u4eba\u6c11',
2673                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
2674
        # Test the replace error handler.
2676        check_format('repr=abc\ufffd',
2677                     b'repr=%V', None, b'abc\xff')
2678
2679        # not supported: copy the raw format string. these tests are just here
2680        # to check for crashes and should not be considered as specifications
2681        check_format('%s',
2682                     b'%1%s', b'abc')
2683        check_format('%1abc',
2684                     b'%1abc')
2685        check_format('%+i',
2686                     b'%+i', c_int(10))
2687        check_format('%.%s',
2688                     b'%.%s', b'abc')
2689
2690        # Issue #33817: empty strings
2691        check_format('',
2692                     b'')
2693        check_format('',
2694                     b'%s', b'')
2695
2696    # Test PyUnicode_AsWideChar()
2697    @support.cpython_only
2698    def test_aswidechar(self):
2699        from _testcapi import unicode_aswidechar
2700        support.import_module('ctypes')
2701        from ctypes import c_wchar, sizeof
2702
2703        wchar, size = unicode_aswidechar('abcdef', 2)
2704        self.assertEqual(size, 2)
2705        self.assertEqual(wchar, 'ab')
2706
2707        wchar, size = unicode_aswidechar('abc', 3)
2708        self.assertEqual(size, 3)
2709        self.assertEqual(wchar, 'abc')
2710
2711        wchar, size = unicode_aswidechar('abc', 4)
2712        self.assertEqual(size, 3)
2713        self.assertEqual(wchar, 'abc\0')
2714
2715        wchar, size = unicode_aswidechar('abc', 10)
2716        self.assertEqual(size, 3)
2717        self.assertEqual(wchar, 'abc\0')
2718
2719        wchar, size = unicode_aswidechar('abc\0def', 20)
2720        self.assertEqual(size, 7)
2721        self.assertEqual(wchar, 'abc\0def\0')
2722
2723        nonbmp = chr(0x10ffff)
2724        if sizeof(c_wchar) == 2:
2725            buflen = 3
2726            nchar = 2
2727        else: # sizeof(c_wchar) == 4
2728            buflen = 2
2729            nchar = 1
2730        wchar, size = unicode_aswidechar(nonbmp, buflen)
2731        self.assertEqual(size, nchar)
2732        self.assertEqual(wchar, nonbmp + '\0')
2733
2734    # Test PyUnicode_AsWideCharString()
2735    @support.cpython_only
2736    def test_aswidecharstring(self):
2737        from _testcapi import unicode_aswidecharstring
2738        support.import_module('ctypes')
2739        from ctypes import c_wchar, sizeof
2740
2741        wchar, size = unicode_aswidecharstring('abc')
2742        self.assertEqual(size, 3)
2743        self.assertEqual(wchar, 'abc\0')
2744
2745        wchar, size = unicode_aswidecharstring('abc\0def')
2746        self.assertEqual(size, 7)
2747        self.assertEqual(wchar, 'abc\0def\0')
2748
2749        nonbmp = chr(0x10ffff)
2750        if sizeof(c_wchar) == 2:
2751            nchar = 2
2752        else: # sizeof(c_wchar) == 4
2753            nchar = 1
2754        wchar, size = unicode_aswidecharstring(nonbmp)
2755        self.assertEqual(size, nchar)
2756        self.assertEqual(wchar, nonbmp + '\0')
2757
2758    # Test PyUnicode_AsUCS4()
2759    @support.cpython_only
2760    def test_asucs4(self):
2761        from _testcapi import unicode_asucs4
2762        for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
2763                  'a\ud800b\udfffc', '\ud834\udd1e']:
2764            l = len(s)
2765            self.assertEqual(unicode_asucs4(s, l, 1), s+'\0')
2766            self.assertEqual(unicode_asucs4(s, l, 0), s+'\uffff')
2767            self.assertEqual(unicode_asucs4(s, l+1, 1), s+'\0\uffff')
2768            self.assertEqual(unicode_asucs4(s, l+1, 0), s+'\0\uffff')
2769            self.assertRaises(SystemError, unicode_asucs4, s, l-1, 1)
2770            self.assertRaises(SystemError, unicode_asucs4, s, l-2, 0)
2771            s = '\0'.join([s, s])
2772            self.assertEqual(unicode_asucs4(s, len(s), 1), s+'\0')
2773            self.assertEqual(unicode_asucs4(s, len(s), 0), s+'\uffff')
2774
2775    # Test PyUnicode_FindChar()
2776    @support.cpython_only
2777    def test_findchar(self):
2778        from _testcapi import unicode_findchar
2779
2780        for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
2781            for i, ch in enumerate(str):
2782                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
2783                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
2784
2785        str = "!>_<!"
2786        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
2787        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
2788        # start < end
2789        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
2790        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
2791        # start >= end
2792        self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
2793        self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
2794        # negative
2795        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
2796        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
2797
2798    # Test PyUnicode_CopyCharacters()
2799    @support.cpython_only
2800    def test_copycharacters(self):
2801        from _testcapi import unicode_copycharacters
2802
2803        strings = [
2804            'abcde', '\xa1\xa2\xa3\xa4\xa5',
2805            '\u4f60\u597d\u4e16\u754c\uff01',
2806            '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
2807        ]
2808
2809        for idx, from_ in enumerate(strings):
2810            # wide -> narrow: exceed maxchar limitation
2811            for to in strings[:idx]:
2812                self.assertRaises(
2813                    SystemError,
2814                    unicode_copycharacters, to, 0, from_, 0, 5
2815                )
2816            # same kind
2817            for from_start in range(5):
2818                self.assertEqual(
2819                    unicode_copycharacters(from_, 0, from_, from_start, 5),
2820                    (from_[from_start:from_start+5].ljust(5, '\0'),
2821                     5-from_start)
2822                )
2823            for to_start in range(5):
2824                self.assertEqual(
2825                    unicode_copycharacters(from_, to_start, from_, to_start, 5),
2826                    (from_[to_start:to_start+5].rjust(5, '\0'),
2827                     5-to_start)
2828                )
2829            # narrow -> wide
2830            # Tests omitted since this creates invalid strings.
2831
2832        s = strings[0]
2833        self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
2834        self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
2835        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
2836        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
2837        self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
2838        self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
2839        self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
2840
2841    @support.cpython_only
2842    def test_encode_decimal(self):
2843        from _testcapi import unicode_encodedecimal
2844        self.assertEqual(unicode_encodedecimal('123'),
2845                         b'123')
2846        self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
2847                         b'3.14')
2848        self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2849                         b' 3.14 ')
2850        self.assertRaises(UnicodeEncodeError,
2851                          unicode_encodedecimal, "123\u20ac", "strict")
2852        self.assertRaisesRegex(
2853            ValueError,
2854            "^'decimal' codec can't encode character",
2855            unicode_encodedecimal, "123\u20ac", "replace")
2856
2857    @support.cpython_only
2858    def test_transform_decimal(self):
2859        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
2860        self.assertEqual(transform_decimal('123'),
2861                         '123')
2862        self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
2863                         '3.14')
2864        self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2865                         "\N{EM SPACE}3.14\N{EN SPACE}")
2866        self.assertEqual(transform_decimal('123\u20ac'),
2867                         '123\u20ac')
2868
2869    @support.cpython_only
    def test_pep393_utf8_caching_bug(self):
        # Regression test: grow a string one character at a time and check
        # after every step that getargs_s_hash() returns the UTF-8 encoding
        # of the *current* string, on a first and on a repeated call.
        # Issue #25709: Problem with string concatenation and utf-8 cache
        from _testcapi import getargs_s_hash
        # One run per code point (presumably picked so that the UTF-8
        # encoding is 1, 2, 3 and 4 bytes long respectively).
        for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
            s = ''
            for i in range(5):
                # Due to CPython specific optimization the 's' string can be
                # resized in-place.
                # NOTE: keep the plain ``s += ...`` form; the test is about
                # this concatenation pattern.
                s += chr(k)
                # Parsing with the "s#" format code calls indirectly
                # PyUnicode_AsUTF8AndSize() which creates the UTF-8
                # encoded string cached in the Unicode object.
                # After i + 1 appends the expected bytes are the encoded
                # character repeated i + 1 times.
                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
                # Check that the second call returns the same result
                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
2885
class StringModuleTest(unittest.TestCase):
    """Checks for the two parsing helpers of the ``_string`` module."""

    def test_formatter_parser(self):
        # Each row: (format string, expected parse result).  A parsed item
        # is a (literal text, field name, format spec, conversion) tuple.
        expectations = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for template, expected in expectations:
            parsed = [*_string.formatter_parser(template)]
            self.assertEqual(parsed, expected)

        # Only strings are accepted.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        def split(name):
            # Materialise the lazily produced second element so the whole
            # result can be compared with plain lists.
            first, rest = _string.formatter_field_name_split(name)
            return [first, list(rest)]

        # Each row: (field name, expected [first, [(is_attr, key), ...]]).
        expectations = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                ]]),
        )
        for field_name, expected in expectations:
            self.assertEqual(split(field_name), expected)

        # Only strings are accepted.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
2935
2936
# Run the tests with unittest's command-line runner when this file is
# executed as a script (nothing happens on import).
if __name__ == "__main__":
    unittest.main()
2939