1import codecs
2import html.entities
3import sys
4import unicodedata
5import unittest
6
7
class PosReturn:
    """Configurable error callback that resumes at a caller-chosen position.

    Set ``pos`` to the position the codec should continue at, then use
    ``handle`` as the error handler.  The replacement text is always
    ``"<?>"``.
    """

    def __init__(self):
        self.pos = 0

    def handle(self, exc):
        """Return ``("<?>", pos)`` using the value ``pos`` had on entry.

        A negative ``pos`` counts from the end of ``exc.object`` for the
        progress check, but is returned unchanged.
        """
        resume_at = self.pos
        absolute = resume_at if resume_at >= 0 else len(exc.object) + resume_at
        # If this call makes no forward progress, make the next one jump to
        # the end of the input; otherwise we would loop forever.
        if exc.start >= absolute:
            self.pos = len(exc.object)
        return ("<?>", resume_at)
24
class BadStartUnicodeEncodeError(UnicodeEncodeError):
    """A UnicodeEncodeError whose ``start`` attribute is replaced by a list."""

    def __init__(self):
        super().__init__("ascii", "", 0, 1, "bad")
        self.start = []
30
class BadObjectUnicodeEncodeError(UnicodeEncodeError):
    """A UnicodeEncodeError whose ``object`` attribute is replaced by a list."""

    def __init__(self):
        super().__init__("ascii", "", 0, 1, "bad")
        self.object = []
36
class NoEndUnicodeDecodeError(UnicodeDecodeError):
    """A UnicodeDecodeError whose ``end`` attribute is deleted after init."""

    def __init__(self):
        super().__init__("ascii", bytearray(b""), 0, 1, "bad")
        del self.end
42
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
    """A UnicodeDecodeError whose ``object`` attribute is replaced by a list."""

    def __init__(self):
        super().__init__("ascii", bytearray(b""), 0, 1, "bad")
        self.object = []
48
class NoStartUnicodeTranslateError(UnicodeTranslateError):
    """A UnicodeTranslateError whose ``start`` attribute is deleted after init."""

    def __init__(self):
        super().__init__("", 0, 1, "bad")
        del self.start
54
class NoEndUnicodeTranslateError(UnicodeTranslateError):
    """A UnicodeTranslateError whose ``end`` attribute is deleted after init."""

    def __init__(self):
        super().__init__("", 0, 1, "bad")
        del self.end
60
class NoObjectUnicodeTranslateError(UnicodeTranslateError):
    """A UnicodeTranslateError whose ``object`` attribute is deleted after init."""

    def __init__(self):
        super().__init__("", 0, 1, "bad")
        del self.object
66
67class CodecCallbackTest(unittest.TestCase):
68
69    def test_xmlcharrefreplace(self):
70        # replace unencodable characters which numeric character entities.
71        # For ascii, latin-1 and charmaps this is completely implemented
72        # in C and should be reasonably fast.
73        s = "\u30b9\u30d1\u30e2 \xe4nd eggs"
74        self.assertEqual(
75            s.encode("ascii", "xmlcharrefreplace"),
76            b"&#12473;&#12497;&#12514; &#228;nd eggs"
77        )
78        self.assertEqual(
79            s.encode("latin-1", "xmlcharrefreplace"),
80            b"&#12473;&#12497;&#12514; \xe4nd eggs"
81        )
82
83    def test_xmlcharnamereplace(self):
84        # This time use a named character entity for unencodable
85        # characters, if one is available.
86
87        def xmlcharnamereplace(exc):
88            if not isinstance(exc, UnicodeEncodeError):
89                raise TypeError("don't know how to handle %r" % exc)
90            l = []
91            for c in exc.object[exc.start:exc.end]:
92                try:
93                    l.append("&%s;" % html.entities.codepoint2name[ord(c)])
94                except KeyError:
95                    l.append("&#%d;" % ord(c))
96            return ("".join(l), exc.end)
97
98        codecs.register_error(
99            "test.xmlcharnamereplace", xmlcharnamereplace)
100
101        sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
102        sout = b"&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
103        self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
104        sout = b"\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
105        self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
106        sout = b"\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
107        self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
108
109    def test_uninamereplace(self):
110        # We're using the names from the unicode database this time,
111        # and we're doing "syntax highlighting" here, i.e. we include
112        # the replaced text in ANSI escape sequences. For this it is
113        # useful that the error handler is not called for every single
114        # unencodable character, but for a complete sequence of
115        # unencodable characters, otherwise we would output many
116        # unnecessary escape sequences.
117
118        def uninamereplace(exc):
119            if not isinstance(exc, UnicodeEncodeError):
120                raise TypeError("don't know how to handle %r" % exc)
121            l = []
122            for c in exc.object[exc.start:exc.end]:
123                l.append(unicodedata.name(c, "0x%x" % ord(c)))
124            return ("\033[1m%s\033[0m" % ", ".join(l), exc.end)
125
126        codecs.register_error(
127            "test.uninamereplace", uninamereplace)
128
129        sin = "\xac\u1234\u20ac\u8000"
130        sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
131        self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
132
133        sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
134        self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
135
136        sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
137        self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
138
139    def test_backslashescape(self):
140        # Does the same as the "unicode-escape" encoding, but with different
141        # base encodings.
142        sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
143        sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
144        self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
145
146        sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff"
147        self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
148
149        sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
150        self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
151
152    def test_nameescape(self):
153        # Does the same as backslashescape, but prefers ``\N{...}`` escape
154        # sequences.
155        sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
156        sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
157                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
158        self.assertEqual(sin.encode("ascii", "namereplace"), sout)
159
160        sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
161                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
162        self.assertEqual(sin.encode("latin-1", "namereplace"), sout)
163
164        sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4'
165                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
166        self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout)
167
168    def test_decoding_callbacks(self):
169        # This is a test for a decoding callback handler
170        # that allows the decoding of the invalid sequence
171        # "\xc0\x80" and returns "\x00" instead of raising an error.
172        # All other illegal sequences will be handled strictly.
173        def relaxedutf8(exc):
174            if not isinstance(exc, UnicodeDecodeError):
175                raise TypeError("don't know how to handle %r" % exc)
176            if exc.object[exc.start:exc.start+2] == b"\xc0\x80":
177                return ("\x00", exc.start+2) # retry after two bytes
178            else:
179                raise exc
180
181        codecs.register_error("test.relaxedutf8", relaxedutf8)
182
183        # all the "\xc0\x80" will be decoded to "\x00"
184        sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
185        sout = "a\x00b\x00c\xfc\x00\x00"
186        self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
187
188        # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
189        sin = b"\xc0\x80\xc0\x81"
190        self.assertRaises(UnicodeDecodeError, sin.decode,
191                          "utf-8", "test.relaxedutf8")
192
193    def test_charmapencode(self):
194        # For charmap encodings the replacement string will be
195        # mapped through the encoding again. This means, that
196        # to be able to use e.g. the "replace" handler, the
197        # charmap has to have a mapping for "?".
198        charmap = dict((ord(c), bytes(2*c.upper(), 'ascii')) for c in "abcdefgh")
199        sin = "abc"
200        sout = b"AABBCC"
201        self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
202
203        sin = "abcA"
204        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
205
206        charmap[ord("?")] = b"XYZ"
207        sin = "abcDEF"
208        sout = b"AABBCCXYZXYZXYZ"
209        self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
210
211        charmap[ord("?")] = "XYZ" # wrong type in mapping
212        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
213
214    def test_callbacks(self):
215        def handler1(exc):
216            r = range(exc.start, exc.end)
217            if isinstance(exc, UnicodeEncodeError):
218                l = ["<%d>" % ord(exc.object[pos]) for pos in r]
219            elif isinstance(exc, UnicodeDecodeError):
220                l = ["<%d>" % exc.object[pos] for pos in r]
221            else:
222                raise TypeError("don't know how to handle %r" % exc)
223            return ("[%s]" % "".join(l), exc.end)
224
225        codecs.register_error("test.handler1", handler1)
226
227        def handler2(exc):
228            if not isinstance(exc, UnicodeDecodeError):
229                raise TypeError("don't know how to handle %r" % exc)
230            l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)]
231            return ("[%s]" % "".join(l), exc.end+1) # skip one character
232
233        codecs.register_error("test.handler2", handler2)
234
235        s = b"\x00\x81\x7f\x80\xff"
236
237        self.assertEqual(
238            s.decode("ascii", "test.handler1"),
239            "\x00[<129>]\x7f[<128>][<255>]"
240        )
241        self.assertEqual(
242            s.decode("ascii", "test.handler2"),
243            "\x00[<129>][<128>]"
244        )
245
246        self.assertEqual(
247            b"\\u3042\\u3xxx".decode("unicode-escape", "test.handler1"),
248            "\u3042[<92><117><51>]xxx"
249        )
250
251        self.assertEqual(
252            b"\\u3042\\u3xx".decode("unicode-escape", "test.handler1"),
253            "\u3042[<92><117><51>]xx"
254        )
255
256        self.assertEqual(
257            codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0],
258            "z[<98>][<99>]"
259        )
260
261        self.assertEqual(
262            "g\xfc\xdfrk".encode("ascii", "test.handler1"),
263            b"g[<252><223>]rk"
264        )
265
266        self.assertEqual(
267            "g\xfc\xdf".encode("ascii", "test.handler1"),
268            b"g[<252><223>]"
269        )
270
271    def test_longstrings(self):
272        # test long strings to check for memory overflow problems
273        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
274                   "backslashreplace", "namereplace"]
275        # register the handlers under different names,
276        # to prevent the codec from recognizing the name
277        for err in errors:
278            codecs.register_error("test." + err, codecs.lookup_error(err))
279        l = 1000
280        errors += [ "test." + err for err in errors ]
281        for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
282            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
283                        "utf-8", "utf-7", "utf-16", "utf-32"):
284                for err in errors:
285                    try:
286                        uni.encode(enc, err)
287                    except UnicodeError:
288                        pass
289
290    def check_exceptionobjectargs(self, exctype, args, msg):
291        # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
292        # check with one missing argument
293        self.assertRaises(TypeError, exctype, *args[:-1])
294        # check with one argument too much
295        self.assertRaises(TypeError, exctype, *(args + ["too much"]))
296        # check with one argument of the wrong type
297        wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ]
298        for i in range(len(args)):
299            for wrongarg in wrongargs:
300                if type(wrongarg) is type(args[i]):
301                    continue
302                # build argument array
303                callargs = []
304                for j in range(len(args)):
305                    if i==j:
306                        callargs.append(wrongarg)
307                    else:
308                        callargs.append(args[i])
309                self.assertRaises(TypeError, exctype, *callargs)
310
311        # check with the correct number and type of arguments
312        exc = exctype(*args)
313        self.assertEqual(str(exc), msg)
314
315    def test_unicodeencodeerror(self):
316        self.check_exceptionobjectargs(
317            UnicodeEncodeError,
318            ["ascii", "g\xfcrk", 1, 2, "ouch"],
319            "'ascii' codec can't encode character '\\xfc' in position 1: ouch"
320        )
321        self.check_exceptionobjectargs(
322            UnicodeEncodeError,
323            ["ascii", "g\xfcrk", 1, 4, "ouch"],
324            "'ascii' codec can't encode characters in position 1-3: ouch"
325        )
326        self.check_exceptionobjectargs(
327            UnicodeEncodeError,
328            ["ascii", "\xfcx", 0, 1, "ouch"],
329            "'ascii' codec can't encode character '\\xfc' in position 0: ouch"
330        )
331        self.check_exceptionobjectargs(
332            UnicodeEncodeError,
333            ["ascii", "\u0100x", 0, 1, "ouch"],
334            "'ascii' codec can't encode character '\\u0100' in position 0: ouch"
335        )
336        self.check_exceptionobjectargs(
337            UnicodeEncodeError,
338            ["ascii", "\uffffx", 0, 1, "ouch"],
339            "'ascii' codec can't encode character '\\uffff' in position 0: ouch"
340        )
341        self.check_exceptionobjectargs(
342            UnicodeEncodeError,
343            ["ascii", "\U00010000x", 0, 1, "ouch"],
344            "'ascii' codec can't encode character '\\U00010000' in position 0: ouch"
345        )
346
347    def test_unicodedecodeerror(self):
348        self.check_exceptionobjectargs(
349            UnicodeDecodeError,
350            ["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"],
351            "'ascii' codec can't decode byte 0xfc in position 1: ouch"
352        )
353        self.check_exceptionobjectargs(
354            UnicodeDecodeError,
355            ["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"],
356            "'ascii' codec can't decode bytes in position 1-2: ouch"
357        )
358
359    def test_unicodetranslateerror(self):
360        self.check_exceptionobjectargs(
361            UnicodeTranslateError,
362            ["g\xfcrk", 1, 2, "ouch"],
363            "can't translate character '\\xfc' in position 1: ouch"
364        )
365        self.check_exceptionobjectargs(
366            UnicodeTranslateError,
367            ["g\u0100rk", 1, 2, "ouch"],
368            "can't translate character '\\u0100' in position 1: ouch"
369        )
370        self.check_exceptionobjectargs(
371            UnicodeTranslateError,
372            ["g\uffffrk", 1, 2, "ouch"],
373            "can't translate character '\\uffff' in position 1: ouch"
374        )
375        self.check_exceptionobjectargs(
376            UnicodeTranslateError,
377            ["g\U00010000rk", 1, 2, "ouch"],
378            "can't translate character '\\U00010000' in position 1: ouch"
379        )
380        self.check_exceptionobjectargs(
381            UnicodeTranslateError,
382            ["g\xfcrk", 1, 3, "ouch"],
383            "can't translate characters in position 1-2: ouch"
384        )
385
386    def test_badandgoodstrictexceptions(self):
387        # "strict" complains about a non-exception passed in
388        self.assertRaises(
389            TypeError,
390            codecs.strict_errors,
391            42
392        )
393        # "strict" complains about the wrong exception type
394        self.assertRaises(
395            Exception,
396            codecs.strict_errors,
397            Exception("ouch")
398        )
399
400        # If the correct exception is passed in, "strict" raises it
401        self.assertRaises(
402            UnicodeEncodeError,
403            codecs.strict_errors,
404            UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")
405        )
406        self.assertRaises(
407            UnicodeDecodeError,
408            codecs.strict_errors,
409            UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
410        )
411        self.assertRaises(
412            UnicodeTranslateError,
413            codecs.strict_errors,
414            UnicodeTranslateError("\u3042", 0, 1, "ouch")
415        )
416
417    def test_badandgoodignoreexceptions(self):
418        # "ignore" complains about a non-exception passed in
419        self.assertRaises(
420           TypeError,
421           codecs.ignore_errors,
422           42
423        )
424        # "ignore" complains about the wrong exception type
425        self.assertRaises(
426           TypeError,
427           codecs.ignore_errors,
428           UnicodeError("ouch")
429        )
430        # If the correct exception is passed in, "ignore" returns an empty replacement
431        self.assertEqual(
432            codecs.ignore_errors(
433                UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
434            ("", 2)
435        )
436        self.assertEqual(
437            codecs.ignore_errors(
438                UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
439            ("", 2)
440        )
441        self.assertEqual(
442            codecs.ignore_errors(
443                UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
444            ("", 2)
445        )
446
447    def test_badandgoodreplaceexceptions(self):
448        # "replace" complains about a non-exception passed in
449        self.assertRaises(
450           TypeError,
451           codecs.replace_errors,
452           42
453        )
454        # "replace" complains about the wrong exception type
455        self.assertRaises(
456           TypeError,
457           codecs.replace_errors,
458           UnicodeError("ouch")
459        )
460        self.assertRaises(
461            TypeError,
462            codecs.replace_errors,
463            BadObjectUnicodeEncodeError()
464        )
465        self.assertRaises(
466            TypeError,
467            codecs.replace_errors,
468            BadObjectUnicodeDecodeError()
469        )
470        # With the correct exception, "replace" returns an "?" or "\ufffd" replacement
471        self.assertEqual(
472            codecs.replace_errors(
473                UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
474            ("?", 2)
475        )
476        self.assertEqual(
477            codecs.replace_errors(
478                UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
479            ("\ufffd", 2)
480        )
481        self.assertEqual(
482            codecs.replace_errors(
483                UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
484            ("\ufffd", 2)
485        )
486
487    def test_badandgoodxmlcharrefreplaceexceptions(self):
488        # "xmlcharrefreplace" complains about a non-exception passed in
489        self.assertRaises(
490           TypeError,
491           codecs.xmlcharrefreplace_errors,
492           42
493        )
494        # "xmlcharrefreplace" complains about the wrong exception types
495        self.assertRaises(
496           TypeError,
497           codecs.xmlcharrefreplace_errors,
498           UnicodeError("ouch")
499        )
500        # "xmlcharrefreplace" can only be used for encoding
501        self.assertRaises(
502            TypeError,
503            codecs.xmlcharrefreplace_errors,
504            UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
505        )
506        self.assertRaises(
507            TypeError,
508            codecs.xmlcharrefreplace_errors,
509            UnicodeTranslateError("\u3042", 0, 1, "ouch")
510        )
511        # Use the correct exception
512        cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000,
513              999999, 1000000)
514        cs += (0xd800, 0xdfff)
515        s = "".join(chr(c) for c in cs)
516        self.assertEqual(
517            codecs.xmlcharrefreplace_errors(
518                UnicodeEncodeError("ascii", "a" + s + "b",
519                                   1, 1 + len(s), "ouch")
520            ),
521            ("".join("&#%d;" % c for c in cs), 1 + len(s))
522        )
523
524    def test_badandgoodbackslashreplaceexceptions(self):
525        # "backslashreplace" complains about a non-exception passed in
526        self.assertRaises(
527           TypeError,
528           codecs.backslashreplace_errors,
529           42
530        )
531        # "backslashreplace" complains about the wrong exception types
532        self.assertRaises(
533           TypeError,
534           codecs.backslashreplace_errors,
535           UnicodeError("ouch")
536        )
537        # Use the correct exception
538        tests = [
539            ("\u3042", "\\u3042"),
540            ("\n", "\\x0a"),
541            ("a", "\\x61"),
542            ("\x00", "\\x00"),
543            ("\xff", "\\xff"),
544            ("\u0100", "\\u0100"),
545            ("\uffff", "\\uffff"),
546            ("\U00010000", "\\U00010000"),
547            ("\U0010ffff", "\\U0010ffff"),
548            # Lone surrogates
549            ("\ud800", "\\ud800"),
550            ("\udfff", "\\udfff"),
551            ("\ud800\udfff", "\\ud800\\udfff"),
552        ]
553        for s, r in tests:
554            with self.subTest(str=s):
555                self.assertEqual(
556                    codecs.backslashreplace_errors(
557                        UnicodeEncodeError("ascii", "a" + s + "b",
558                                           1, 1 + len(s), "ouch")),
559                    (r, 1 + len(s))
560                )
561                self.assertEqual(
562                    codecs.backslashreplace_errors(
563                        UnicodeTranslateError("a" + s + "b",
564                                              1, 1 + len(s), "ouch")),
565                    (r, 1 + len(s))
566                )
567        tests = [
568            (b"a", "\\x61"),
569            (b"\n", "\\x0a"),
570            (b"\x00", "\\x00"),
571            (b"\xff", "\\xff"),
572        ]
573        for b, r in tests:
574            with self.subTest(bytes=b):
575                self.assertEqual(
576                    codecs.backslashreplace_errors(
577                        UnicodeDecodeError("ascii", bytearray(b"a" + b + b"b"),
578                                           1, 2, "ouch")),
579                    (r, 2)
580                )
581
582    def test_badandgoodnamereplaceexceptions(self):
583        # "namereplace" complains about a non-exception passed in
584        self.assertRaises(
585           TypeError,
586           codecs.namereplace_errors,
587           42
588        )
589        # "namereplace" complains about the wrong exception types
590        self.assertRaises(
591           TypeError,
592           codecs.namereplace_errors,
593           UnicodeError("ouch")
594        )
595        # "namereplace" can only be used for encoding
596        self.assertRaises(
597            TypeError,
598            codecs.namereplace_errors,
599            UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
600        )
601        self.assertRaises(
602            TypeError,
603            codecs.namereplace_errors,
604            UnicodeTranslateError("\u3042", 0, 1, "ouch")
605        )
606        # Use the correct exception
607        tests = [
608            ("\u3042", "\\N{HIRAGANA LETTER A}"),
609            ("\x00", "\\x00"),
610            ("\ufbf9", "\\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH "
611                       "HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}"),
612            ("\U000e007f", "\\N{CANCEL TAG}"),
613            ("\U0010ffff", "\\U0010ffff"),
614            # Lone surrogates
615            ("\ud800", "\\ud800"),
616            ("\udfff", "\\udfff"),
617            ("\ud800\udfff", "\\ud800\\udfff"),
618        ]
619        for s, r in tests:
620            with self.subTest(str=s):
621                self.assertEqual(
622                    codecs.namereplace_errors(
623                        UnicodeEncodeError("ascii", "a" + s + "b",
624                                           1, 1 + len(s), "ouch")),
625                    (r, 1 + len(s))
626                )
627
628    def test_badandgoodsurrogateescapeexceptions(self):
629        surrogateescape_errors = codecs.lookup_error('surrogateescape')
630        # "surrogateescape" complains about a non-exception passed in
631        self.assertRaises(
632           TypeError,
633           surrogateescape_errors,
634           42
635        )
636        # "surrogateescape" complains about the wrong exception types
637        self.assertRaises(
638           TypeError,
639           surrogateescape_errors,
640           UnicodeError("ouch")
641        )
642        # "surrogateescape" can not be used for translating
643        self.assertRaises(
644            TypeError,
645            surrogateescape_errors,
646            UnicodeTranslateError("\udc80", 0, 1, "ouch")
647        )
648        # Use the correct exception
649        for s in ("a", "\udc7f", "\udd00"):
650            with self.subTest(str=s):
651                self.assertRaises(
652                    UnicodeEncodeError,
653                    surrogateescape_errors,
654                    UnicodeEncodeError("ascii", s, 0, 1, "ouch")
655                )
656        self.assertEqual(
657            surrogateescape_errors(
658                UnicodeEncodeError("ascii", "a\udc80b", 1, 2, "ouch")),
659            (b"\x80", 2)
660        )
661        self.assertRaises(
662            UnicodeDecodeError,
663            surrogateescape_errors,
664            UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch")
665        )
666        self.assertEqual(
667            surrogateescape_errors(
668                UnicodeDecodeError("ascii", bytearray(b"a\x80b"), 1, 2, "ouch")),
669            ("\udc80", 2)
670        )
671
    def test_badandgoodsurrogatepassexceptions(self):
        # Call the "surrogatepass" error handler directly with bad and good
        # exception objects and check what it raises or returns.
        surrogatepass_errors = codecs.lookup_error('surrogatepass')
        # "surrogatepass" complains about a non-exception passed in
        self.assertRaises(
           TypeError,
           surrogatepass_errors,
           42
        )
        # "surrogatepass" complains about the wrong exception types
        self.assertRaises(
           TypeError,
           surrogatepass_errors,
           UnicodeError("ouch")
        )
        # "surrogatepass" can not be used for translating
        self.assertRaises(
            TypeError,
            surrogatepass_errors,
            UnicodeTranslateError("\ud800", 0, 1, "ouch")
        )
        # Use the correct exception
        # A non-surrogate character ("a") makes the handler raise the
        # exception it was given, for both encoding and decoding.
        for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"):
            with self.subTest(encoding=enc):
                self.assertRaises(
                    UnicodeEncodeError,
                    surrogatepass_errors,
                    UnicodeEncodeError(enc, "a", 0, 1, "ouch")
                )
                self.assertRaises(
                    UnicodeDecodeError,
                    surrogatepass_errors,
                    UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch")
                )
        # Surrogates combined with the "ascii" encoding are expected to
        # raise the encode error as well.
        for s in ("\ud800", "\udfff", "\ud800\udfff"):
            with self.subTest(str=s):
                self.assertRaises(
                    UnicodeEncodeError,
                    surrogatepass_errors,
                    UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
                )
        # Each row is (encoding, surrogate string, its expected encoded
        # bytes, n).  n is the length of the byte prefix b[:n] handed to
        # the decode check below, which must yield the first character
        # s[:1] of the string.
        tests = [
            ("utf-8", "\ud800", b'\xed\xa0\x80', 3),
            ("utf-16le", "\ud800", b'\x00\xd8', 2),
            ("utf-16be", "\ud800", b'\xd8\x00', 2),
            ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4),
            ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4),
            ("utf-8", "\udfff", b'\xed\xbf\xbf', 3),
            ("utf-16le", "\udfff", b'\xff\xdf', 2),
            ("utf-16be", "\udfff", b'\xdf\xff', 2),
            ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4),
            ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4),
            ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3),
            ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2),
            ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2),
            ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4),
            ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4),
        ]
        for enc, s, b, n in tests:
            with self.subTest(encoding=enc, str=s, bytes=b):
                # Encoding: the whole surrogate string s (embedded between
                # "a" and "b") is replaced by b.
                self.assertEqual(
                    surrogatepass_errors(
                        UnicodeEncodeError(enc, "a" + s + "b",
                                           1, 1 + len(s), "ouch")),
                    (b, 1 + len(s))
                )
                # Decoding: only the first n bytes of b are offered, and a
                # single character comes back.
                self.assertEqual(
                    surrogatepass_errors(
                        UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"),
                                           1, 1 + n, "ouch")),
                    (s[:1], 1 + n)
                )
743
744    def test_badhandlerresults(self):
745        results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
746        encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
747
748        for res in results:
749            codecs.register_error("test.badhandler", lambda x: res)
750            for enc in encs:
751                self.assertRaises(
752                    TypeError,
753                    "\u3042".encode,
754                    enc,
755                    "test.badhandler"
756                )
757            for (enc, bytes) in (
758                ("ascii", b"\xff"),
759                ("utf-8", b"\xff"),
760                ("utf-7", b"+x-"),
761            ):
762                self.assertRaises(
763                    TypeError,
764                    bytes.decode,
765                    enc,
766                    "test.badhandler"
767                )
768
769    def test_lookup(self):
770        self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
771        self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
772        self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
773        self.assertEqual(
774            codecs.xmlcharrefreplace_errors,
775            codecs.lookup_error("xmlcharrefreplace")
776        )
777        self.assertEqual(
778            codecs.backslashreplace_errors,
779            codecs.lookup_error("backslashreplace")
780        )
781        self.assertEqual(
782            codecs.namereplace_errors,
783            codecs.lookup_error("namereplace")
784        )
785
786    def test_unencodablereplacement(self):
787        def unencrepl(exc):
788            if isinstance(exc, UnicodeEncodeError):
789                return ("\u4242", exc.end)
790            else:
791                raise TypeError("don't know how to handle %r" % exc)
792        codecs.register_error("test.unencreplhandler", unencrepl)
793        for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
794            self.assertRaises(
795                UnicodeEncodeError,
796                "\u4242".encode,
797                enc,
798                "test.unencreplhandler"
799            )
800
801    def test_badregistercall(self):
802        # enhance coverage of:
803        # Modules/_codecsmodule.c::register_error()
804        # Python/codecs.c::PyCodec_RegisterError()
805        self.assertRaises(TypeError, codecs.register_error, 42)
806        self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
807
808    def test_badlookupcall(self):
809        # enhance coverage of:
810        # Modules/_codecsmodule.c::lookup_error()
811        self.assertRaises(TypeError, codecs.lookup_error)
812
813    def test_unknownhandler(self):
814        # enhance coverage of:
815        # Modules/_codecsmodule.c::lookup_error()
816        self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
817
818    def test_xmlcharrefvalues(self):
819        # enhance coverage of:
820        # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
821        # and inline implementations
822        v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000,
823             500000, 1000000)
824        s = "".join([chr(x) for x in v])
825        codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
826        for enc in ("ascii", "iso-8859-15"):
827            for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
828                s.encode(enc, err)
829
830    def test_decodehelper(self):
831        # enhance coverage of:
832        # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
833        # and callers
834        self.assertRaises(LookupError, b"\xff".decode, "ascii", "test.unknown")
835
836        def baddecodereturn1(exc):
837            return 42
838        codecs.register_error("test.baddecodereturn1", baddecodereturn1)
839        self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1")
840        self.assertRaises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1")
841        self.assertRaises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1")
842        self.assertRaises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
843        self.assertRaises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
844        self.assertRaises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
845
846        def baddecodereturn2(exc):
847            return ("?", None)
848        codecs.register_error("test.baddecodereturn2", baddecodereturn2)
849        self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn2")
850
851        handler = PosReturn()
852        codecs.register_error("test.posreturn", handler.handle)
853
854        # Valid negative position
855        handler.pos = -1
856        self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0")
857
858        # Valid negative position
859        handler.pos = -2
860        self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?><?>")
861
862        # Negative position out of bounds
863        handler.pos = -3
864        self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn")
865
866        # Valid positive position
867        handler.pos = 1
868        self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0")
869
870        # Largest valid positive position (one beyond end of input)
871        handler.pos = 2
872        self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>")
873
874        # Invalid positive position
875        handler.pos = 3
876        self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn")
877
878        # Restart at the "0"
879        handler.pos = 6
880        self.assertEqual(b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), "<?>0")
881
882        class D(dict):
883            def __getitem__(self, key):
884                raise ValueError
885        self.assertRaises(UnicodeError, codecs.charmap_decode, b"\xff", "strict", {0xff: None})
886        self.assertRaises(ValueError, codecs.charmap_decode, b"\xff", "strict", D())
887        self.assertRaises(TypeError, codecs.charmap_decode, b"\xff", "strict", {0xff: sys.maxunicode+1})
888
889    def test_encodehelper(self):
890        # enhance coverage of:
891        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
892        # and callers
893        self.assertRaises(LookupError, "\xff".encode, "ascii", "test.unknown")
894
895        def badencodereturn1(exc):
896            return 42
897        codecs.register_error("test.badencodereturn1", badencodereturn1)
898        self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn1")
899
900        def badencodereturn2(exc):
901            return ("?", None)
902        codecs.register_error("test.badencodereturn2", badencodereturn2)
903        self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn2")
904
905        handler = PosReturn()
906        codecs.register_error("test.posreturn", handler.handle)
907
908        # Valid negative position
909        handler.pos = -1
910        self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0")
911
912        # Valid negative position
913        handler.pos = -2
914        self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?><?>")
915
916        # Negative position out of bounds
917        handler.pos = -3
918        self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn")
919
920        # Valid positive position
921        handler.pos = 1
922        self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0")
923
924        # Largest valid positive position (one beyond end of input
925        handler.pos = 2
926        self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>")
927
928        # Invalid positive position
929        handler.pos = 3
930        self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn")
931
932        handler.pos = 0
933
934        class D(dict):
935            def __getitem__(self, key):
936                raise ValueError
937        for err in ("strict", "replace", "xmlcharrefreplace",
938                    "backslashreplace", "namereplace", "test.posreturn"):
939            self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None})
940            self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
941            self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
942
943    def test_translatehelper(self):
944        # enhance coverage of:
945        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
946        # and callers
947        # (Unfortunately the errors argument is not directly accessible
948        # from Python, so we can't test that much)
949        class D(dict):
950            def __getitem__(self, key):
951                raise ValueError
952        #self.assertRaises(ValueError, "\xff".translate, D())
953        self.assertRaises(ValueError, "\xff".translate, {0xff: sys.maxunicode+1})
954        self.assertRaises(TypeError, "\xff".translate, {0xff: ()})
955
956    def test_bug828737(self):
957        charmap = {
958            ord("&"): "&amp;",
959            ord("<"): "&lt;",
960            ord(">"): "&gt;",
961            ord('"'): "&quot;",
962        }
963
964        for n in (1, 10, 100, 1000):
965            text = 'abc<def>ghi'*n
966            text.translate(charmap)
967
968    def test_mutatingdecodehandler(self):
969        baddata = [
970            ("ascii", b"\xff"),
971            ("utf-7", b"++"),
972            ("utf-8",  b"\xff"),
973            ("utf-16", b"\xff"),
974            ("utf-32", b"\xff"),
975            ("unicode-escape", b"\\u123g"),
976            ("raw-unicode-escape", b"\\u123g"),
977        ]
978
979        def replacing(exc):
980            if isinstance(exc, UnicodeDecodeError):
981                exc.object = 42
982                return ("\u4242", 0)
983            else:
984                raise TypeError("don't know how to handle %r" % exc)
985        codecs.register_error("test.replacing", replacing)
986
987        for (encoding, data) in baddata:
988            with self.assertRaises(TypeError):
989                data.decode(encoding, "test.replacing")
990
991        def mutating(exc):
992            if isinstance(exc, UnicodeDecodeError):
993                exc.object = b""
994                return ("\u4242", 0)
995            else:
996                raise TypeError("don't know how to handle %r" % exc)
997        codecs.register_error("test.mutating", mutating)
998        # If the decoder doesn't pick up the modified input the following
999        # will lead to an endless loop
1000        for (encoding, data) in baddata:
1001            self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
1002
1003    # issue32583
1004    def test_crashing_decode_handler(self):
1005        # better generating one more character to fill the extra space slot
1006        # so in debug build it can steadily fail
1007        def forward_shorter_than_end(exc):
1008            if isinstance(exc, UnicodeDecodeError):
1009                # size one character, 0 < forward < exc.end
1010                return ('\ufffd', exc.start+1)
1011            else:
1012                raise TypeError("don't know how to handle %r" % exc)
1013        codecs.register_error(
1014            "test.forward_shorter_than_end", forward_shorter_than_end)
1015
1016        self.assertEqual(
1017            b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
1018                'utf-16-le', 'test.forward_shorter_than_end'),
1019            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
1020        )
1021        self.assertEqual(
1022            b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
1023                'utf-16-be', 'test.forward_shorter_than_end'),
1024            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
1025        )
1026        self.assertEqual(
1027            b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
1028                'utf-32-le', 'test.forward_shorter_than_end'),
1029            '\ufffd\ufffd\ufffd\u1111\x00'
1030        )
1031        self.assertEqual(
1032            b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
1033                'utf-32-be', 'test.forward_shorter_than_end'),
1034            '\ufffd\ufffd\ufffd\u1111\x00'
1035        )
1036
1037        def replace_with_long(exc):
1038            if isinstance(exc, UnicodeDecodeError):
1039                exc.object = b"\x00" * 8
1040                return ('\ufffd', exc.start)
1041            else:
1042                raise TypeError("don't know how to handle %r" % exc)
1043        codecs.register_error("test.replace_with_long", replace_with_long)
1044
1045        self.assertEqual(
1046            b'\x00'.decode('utf-16', 'test.replace_with_long'),
1047            '\ufffd\x00\x00\x00\x00'
1048        )
1049        self.assertEqual(
1050            b'\x00'.decode('utf-32', 'test.replace_with_long'),
1051            '\ufffd\x00\x00'
1052        )
1053
1054
1055    def test_fake_error_class(self):
1056        handlers = [
1057            codecs.strict_errors,
1058            codecs.ignore_errors,
1059            codecs.replace_errors,
1060            codecs.backslashreplace_errors,
1061            codecs.namereplace_errors,
1062            codecs.xmlcharrefreplace_errors,
1063            codecs.lookup_error('surrogateescape'),
1064            codecs.lookup_error('surrogatepass'),
1065        ]
1066        for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError:
1067            class FakeUnicodeError(str):
1068                __class__ = cls
1069            for handler in handlers:
1070                with self.subTest(handler=handler, error_class=cls):
1071                    self.assertRaises(TypeError, handler, FakeUnicodeError())
1072            class FakeUnicodeError(Exception):
1073                __class__ = cls
1074            for handler in handlers:
1075                with self.subTest(handler=handler, error_class=cls):
1076                    with self.assertRaises((TypeError, FakeUnicodeError)):
1077                        handler(FakeUnicodeError())
1078
1079
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
1082