1import codecs 2import contextlib 3import io 4import locale 5import sys 6import unittest 7import encodings 8from unittest import mock 9 10from test import support 11 12try: 13 import _testcapi 14except ImportError as exc: 15 _testcapi = None 16 17try: 18 import ctypes 19except ImportError: 20 ctypes = None 21 SIZEOF_WCHAR_T = -1 22else: 23 SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) 24 25def coding_checker(self, coder): 26 def check(input, expect): 27 self.assertEqual(coder(input), (expect, len(input))) 28 return check 29 30# On small versions of Windows like Windows IoT or Windows Nano Server not all codepages are present 31def is_code_page_present(cp): 32 from ctypes import POINTER, WINFUNCTYPE, WinDLL 33 from ctypes.wintypes import BOOL, UINT, BYTE, WCHAR, UINT, DWORD 34 35 MAX_LEADBYTES = 12 # 5 ranges, 2 bytes ea., 0 term. 36 MAX_DEFAULTCHAR = 2 # single or double byte 37 MAX_PATH = 260 38 class CPINFOEXW(ctypes.Structure): 39 _fields_ = [("MaxCharSize", UINT), 40 ("DefaultChar", BYTE*MAX_DEFAULTCHAR), 41 ("LeadByte", BYTE*MAX_LEADBYTES), 42 ("UnicodeDefaultChar", WCHAR), 43 ("CodePage", UINT), 44 ("CodePageName", WCHAR*MAX_PATH)] 45 46 prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW)) 47 GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32"))) 48 info = CPINFOEXW() 49 return GetCPInfoEx(cp, 0, info) 50 51class Queue(object): 52 """ 53 queue: write bytes at one end, read bytes from the other end 54 """ 55 def __init__(self, buffer): 56 self._buffer = buffer 57 58 def write(self, chars): 59 self._buffer += chars 60 61 def read(self, size=-1): 62 if size<0: 63 s = self._buffer 64 self._buffer = self._buffer[:0] # make empty 65 return s 66 else: 67 s = self._buffer[:size] 68 self._buffer = self._buffer[size:] 69 return s 70 71 72class MixInCheckStateHandling: 73 def check_state_handling_decode(self, encoding, u, s): 74 for i in range(len(s)+1): 75 d = codecs.getincrementaldecoder(encoding)() 76 part1 = d.decode(s[:i]) 77 state = d.getstate() 
78 self.assertIsInstance(state[1], int) 79 # Check that the condition stated in the documentation for 80 # IncrementalDecoder.getstate() holds 81 if not state[1]: 82 # reset decoder to the default state without anything buffered 83 d.setstate((state[0][:0], 0)) 84 # Feeding the previous input may not produce any output 85 self.assertTrue(not d.decode(state[0])) 86 # The decoder must return to the same state 87 self.assertEqual(state, d.getstate()) 88 # Create a new decoder and set it to the state 89 # we extracted from the old one 90 d = codecs.getincrementaldecoder(encoding)() 91 d.setstate(state) 92 part2 = d.decode(s[i:], True) 93 self.assertEqual(u, part1+part2) 94 95 def check_state_handling_encode(self, encoding, u, s): 96 for i in range(len(u)+1): 97 d = codecs.getincrementalencoder(encoding)() 98 part1 = d.encode(u[:i]) 99 state = d.getstate() 100 d = codecs.getincrementalencoder(encoding)() 101 d.setstate(state) 102 part2 = d.encode(u[i:], True) 103 self.assertEqual(s, part1+part2) 104 105 106class ReadTest(MixInCheckStateHandling): 107 def check_partial(self, input, partialresults): 108 # get a StreamReader for the encoding and feed the bytestring version 109 # of input to the reader byte by byte. Read everything available from 110 # the StreamReader and check that the results equal the appropriate 111 # entries from partialresults. 
112 q = Queue(b"") 113 r = codecs.getreader(self.encoding)(q) 114 result = "" 115 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 116 q.write(bytes([c])) 117 result += r.read() 118 self.assertEqual(result, partialresult) 119 # check that there's nothing left in the buffers 120 self.assertEqual(r.read(), "") 121 self.assertEqual(r.bytebuffer, b"") 122 123 # do the check again, this time using an incremental decoder 124 d = codecs.getincrementaldecoder(self.encoding)() 125 result = "" 126 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 127 result += d.decode(bytes([c])) 128 self.assertEqual(result, partialresult) 129 # check that there's nothing left in the buffers 130 self.assertEqual(d.decode(b"", True), "") 131 self.assertEqual(d.buffer, b"") 132 133 # Check whether the reset method works properly 134 d.reset() 135 result = "" 136 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 137 result += d.decode(bytes([c])) 138 self.assertEqual(result, partialresult) 139 # check that there's nothing left in the buffers 140 self.assertEqual(d.decode(b"", True), "") 141 self.assertEqual(d.buffer, b"") 142 143 # check iterdecode() 144 encoded = input.encode(self.encoding) 145 self.assertEqual( 146 input, 147 "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding)) 148 ) 149 150 def test_readline(self): 151 def getreader(input): 152 stream = io.BytesIO(input.encode(self.encoding)) 153 return codecs.getreader(self.encoding)(stream) 154 155 def readalllines(input, keepends=True, size=None): 156 reader = getreader(input) 157 lines = [] 158 while True: 159 line = reader.readline(size=size, keepends=keepends) 160 if not line: 161 break 162 lines.append(line) 163 return "|".join(lines) 164 165 s = "foo\nbar\r\nbaz\rspam\u2028eggs" 166 sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs" 167 sexpectednoends = "foo|bar|baz|spam|eggs" 168 self.assertEqual(readalllines(s, True), sexpected) 169 
self.assertEqual(readalllines(s, False), sexpectednoends) 170 self.assertEqual(readalllines(s, True, 10), sexpected) 171 self.assertEqual(readalllines(s, False, 10), sexpectednoends) 172 173 lineends = ("\n", "\r\n", "\r", "\u2028") 174 # Test long lines (multiple calls to read() in readline()) 175 vw = [] 176 vwo = [] 177 for (i, lineend) in enumerate(lineends): 178 vw.append((i*200+200)*"\u3042" + lineend) 179 vwo.append((i*200+200)*"\u3042") 180 self.assertEqual(readalllines("".join(vw), True), "|".join(vw)) 181 self.assertEqual(readalllines("".join(vw), False), "|".join(vwo)) 182 183 # Test lines where the first read might end with \r, so the 184 # reader has to look ahead whether this is a lone \r or a \r\n 185 for size in range(80): 186 for lineend in lineends: 187 s = 10*(size*"a" + lineend + "xxx\n") 188 reader = getreader(s) 189 for i in range(10): 190 self.assertEqual( 191 reader.readline(keepends=True), 192 size*"a" + lineend, 193 ) 194 self.assertEqual( 195 reader.readline(keepends=True), 196 "xxx\n", 197 ) 198 reader = getreader(s) 199 for i in range(10): 200 self.assertEqual( 201 reader.readline(keepends=False), 202 size*"a", 203 ) 204 self.assertEqual( 205 reader.readline(keepends=False), 206 "xxx", 207 ) 208 209 def test_mixed_readline_and_read(self): 210 lines = ["Humpty Dumpty sat on a wall,\n", 211 "Humpty Dumpty had a great fall.\r\n", 212 "All the king's horses and all the king's men\r", 213 "Couldn't put Humpty together again."] 214 data = ''.join(lines) 215 def getreader(): 216 stream = io.BytesIO(data.encode(self.encoding)) 217 return codecs.getreader(self.encoding)(stream) 218 219 # Issue #8260: Test readline() followed by read() 220 f = getreader() 221 self.assertEqual(f.readline(), lines[0]) 222 self.assertEqual(f.read(), ''.join(lines[1:])) 223 self.assertEqual(f.read(), '') 224 225 # Issue #32110: Test readline() followed by read(n) 226 f = getreader() 227 self.assertEqual(f.readline(), lines[0]) 228 self.assertEqual(f.read(1), 
lines[1][0]) 229 self.assertEqual(f.read(0), '') 230 self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100]) 231 232 # Issue #16636: Test readline() followed by readlines() 233 f = getreader() 234 self.assertEqual(f.readline(), lines[0]) 235 self.assertEqual(f.readlines(), lines[1:]) 236 self.assertEqual(f.read(), '') 237 238 # Test read(n) followed by read() 239 f = getreader() 240 self.assertEqual(f.read(size=40, chars=5), data[:5]) 241 self.assertEqual(f.read(), data[5:]) 242 self.assertEqual(f.read(), '') 243 244 # Issue #32110: Test read(n) followed by read(n) 245 f = getreader() 246 self.assertEqual(f.read(size=40, chars=5), data[:5]) 247 self.assertEqual(f.read(1), data[5]) 248 self.assertEqual(f.read(0), '') 249 self.assertEqual(f.read(100), data[6:106]) 250 251 # Issue #12446: Test read(n) followed by readlines() 252 f = getreader() 253 self.assertEqual(f.read(size=40, chars=5), data[:5]) 254 self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:]) 255 self.assertEqual(f.read(), '') 256 257 def test_bug1175396(self): 258 s = [ 259 '<%!--===================================================\r\n', 260 ' BLOG index page: show recent articles,\r\n', 261 ' today\'s articles, or articles of a specific date.\r\n', 262 '========================================================--%>\r\n', 263 '<%@inputencoding="ISO-8859-1"%>\r\n', 264 '<%@pagetemplate=TEMPLATE.y%>\r\n', 265 '<%@import=import frog.util, frog%>\r\n', 266 '<%@import=import frog.objects%>\r\n', 267 '<%@import=from frog.storageerrors import StorageError%>\r\n', 268 '<%\r\n', 269 '\r\n', 270 'import logging\r\n', 271 'log=logging.getLogger("Snakelets.logger")\r\n', 272 '\r\n', 273 '\r\n', 274 'user=self.SessionCtx.user\r\n', 275 'storageEngine=self.SessionCtx.storageEngine\r\n', 276 '\r\n', 277 '\r\n', 278 'def readArticlesFromDate(date, count=None):\r\n', 279 ' entryids=storageEngine.listBlogEntries(date)\r\n', 280 ' entryids.reverse() # descending\r\n', 281 ' if count:\r\n', 282 ' 
entryids=entryids[:count]\r\n', 283 ' try:\r\n', 284 ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n', 285 ' except StorageError,x:\r\n', 286 ' log.error("Error loading articles: "+str(x))\r\n', 287 ' self.abort("cannot load articles")\r\n', 288 '\r\n', 289 'showdate=None\r\n', 290 '\r\n', 291 'arg=self.Request.getArg()\r\n', 292 'if arg=="today":\r\n', 293 ' #-------------------- TODAY\'S ARTICLES\r\n', 294 ' self.write("<h2>Today\'s articles</h2>")\r\n', 295 ' showdate = frog.util.isodatestr() \r\n', 296 ' entries = readArticlesFromDate(showdate)\r\n', 297 'elif arg=="active":\r\n', 298 ' #-------------------- ACTIVE ARTICLES redirect\r\n', 299 ' self.Yredirect("active.y")\r\n', 300 'elif arg=="login":\r\n', 301 ' #-------------------- LOGIN PAGE redirect\r\n', 302 ' self.Yredirect("login.y")\r\n', 303 'elif arg=="date":\r\n', 304 ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n', 305 ' showdate = self.Request.getParameter("date")\r\n', 306 ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n', 307 ' entries = readArticlesFromDate(showdate)\r\n', 308 'else:\r\n', 309 ' #-------------------- RECENT ARTICLES\r\n', 310 ' self.write("<h2>Recent articles</h2>")\r\n', 311 ' dates=storageEngine.listBlogEntryDates()\r\n', 312 ' if dates:\r\n', 313 ' entries=[]\r\n', 314 ' SHOWAMOUNT=10\r\n', 315 ' for showdate in dates:\r\n', 316 ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n', 317 ' if len(entries)>=SHOWAMOUNT:\r\n', 318 ' break\r\n', 319 ' \r\n', 320 ] 321 stream = io.BytesIO("".join(s).encode(self.encoding)) 322 reader = codecs.getreader(self.encoding)(stream) 323 for (i, line) in enumerate(reader): 324 self.assertEqual(line, s[i]) 325 326 def test_readlinequeue(self): 327 q = Queue(b"") 328 writer = codecs.getwriter(self.encoding)(q) 329 reader = codecs.getreader(self.encoding)(q) 330 331 # No lineends 332 writer.write("foo\r") 333 
self.assertEqual(reader.readline(keepends=False), "foo") 334 writer.write("\nbar\r") 335 self.assertEqual(reader.readline(keepends=False), "") 336 self.assertEqual(reader.readline(keepends=False), "bar") 337 writer.write("baz") 338 self.assertEqual(reader.readline(keepends=False), "baz") 339 self.assertEqual(reader.readline(keepends=False), "") 340 341 # Lineends 342 writer.write("foo\r") 343 self.assertEqual(reader.readline(keepends=True), "foo\r") 344 writer.write("\nbar\r") 345 self.assertEqual(reader.readline(keepends=True), "\n") 346 self.assertEqual(reader.readline(keepends=True), "bar\r") 347 writer.write("baz") 348 self.assertEqual(reader.readline(keepends=True), "baz") 349 self.assertEqual(reader.readline(keepends=True), "") 350 writer.write("foo\r\n") 351 self.assertEqual(reader.readline(keepends=True), "foo\r\n") 352 353 def test_bug1098990_a(self): 354 s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n" 355 s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n" 356 s3 = "next line.\r\n" 357 358 s = (s1+s2+s3).encode(self.encoding) 359 stream = io.BytesIO(s) 360 reader = codecs.getreader(self.encoding)(stream) 361 self.assertEqual(reader.readline(), s1) 362 self.assertEqual(reader.readline(), s2) 363 self.assertEqual(reader.readline(), s3) 364 self.assertEqual(reader.readline(), "") 365 366 def test_bug1098990_b(self): 367 s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n" 368 s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n" 369 s3 = "stillokay:bbbbxx\r\n" 370 s4 = "broken!!!!badbad\r\n" 371 s5 = "againokay.\r\n" 372 373 s = (s1+s2+s3+s4+s5).encode(self.encoding) 374 stream = io.BytesIO(s) 375 reader = codecs.getreader(self.encoding)(stream) 376 self.assertEqual(reader.readline(), s1) 377 self.assertEqual(reader.readline(), s2) 378 self.assertEqual(reader.readline(), s3) 379 self.assertEqual(reader.readline(), s4) 380 self.assertEqual(reader.readline(), s5) 381 self.assertEqual(reader.readline(), "") 382 383 
ill_formed_sequence_replace = "\ufffd" 384 385 def test_lone_surrogates(self): 386 self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding) 387 self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"), 388 "[\\udc80]".encode(self.encoding)) 389 self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"), 390 "[\\udc80]".encode(self.encoding)) 391 self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"), 392 "[�]".encode(self.encoding)) 393 self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"), 394 "[]".encode(self.encoding)) 395 self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), 396 "[?]".encode(self.encoding)) 397 398 # sequential surrogate characters 399 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"), 400 "[]".encode(self.encoding)) 401 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"), 402 "[??]".encode(self.encoding)) 403 404 bom = "".encode(self.encoding) 405 for before, after in [("\U00010fff", "A"), ("[", "]"), 406 ("A", "\U00010fff")]: 407 before_sequence = before.encode(self.encoding)[len(bom):] 408 after_sequence = after.encode(self.encoding)[len(bom):] 409 test_string = before + "\uDC80" + after 410 test_sequence = (bom + before_sequence + 411 self.ill_formed_sequence + after_sequence) 412 self.assertRaises(UnicodeDecodeError, test_sequence.decode, 413 self.encoding) 414 self.assertEqual(test_string.encode(self.encoding, 415 "surrogatepass"), 416 test_sequence) 417 self.assertEqual(test_sequence.decode(self.encoding, 418 "surrogatepass"), 419 test_string) 420 self.assertEqual(test_sequence.decode(self.encoding, "ignore"), 421 before + after) 422 self.assertEqual(test_sequence.decode(self.encoding, "replace"), 423 before + self.ill_formed_sequence_replace + after) 424 backslashreplace = ''.join('\\x%02x' % b 425 for b in self.ill_formed_sequence) 426 self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"), 427 before + backslashreplace + 
after) 428 429 def test_incremental_surrogatepass(self): 430 # Test incremental decoder for surrogatepass handler: 431 # see issue #24214 432 # High surrogate 433 data = '\uD901'.encode(self.encoding, 'surrogatepass') 434 for i in range(1, len(data)): 435 dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass') 436 self.assertEqual(dec.decode(data[:i]), '') 437 self.assertEqual(dec.decode(data[i:], True), '\uD901') 438 # Low surrogate 439 data = '\uDC02'.encode(self.encoding, 'surrogatepass') 440 for i in range(1, len(data)): 441 dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass') 442 self.assertEqual(dec.decode(data[:i]), '') 443 self.assertEqual(dec.decode(data[i:]), '\uDC02') 444 445 446class UTF32Test(ReadTest, unittest.TestCase): 447 encoding = "utf-32" 448 if sys.byteorder == 'little': 449 ill_formed_sequence = b"\x80\xdc\x00\x00" 450 else: 451 ill_formed_sequence = b"\x00\x00\xdc\x80" 452 453 spamle = (b'\xff\xfe\x00\x00' 454 b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' 455 b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00') 456 spambe = (b'\x00\x00\xfe\xff' 457 b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' 458 b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') 459 460 def test_only_one_bom(self): 461 _,_,reader,writer = codecs.lookup(self.encoding) 462 # encode some stream 463 s = io.BytesIO() 464 f = writer(s) 465 f.write("spam") 466 f.write("spam") 467 d = s.getvalue() 468 # check whether there is exactly one BOM in it 469 self.assertTrue(d == self.spamle or d == self.spambe) 470 # try to read it back 471 s = io.BytesIO(d) 472 f = reader(s) 473 self.assertEqual(f.read(), "spamspam") 474 475 def test_badbom(self): 476 s = io.BytesIO(4*b"\xff") 477 f = codecs.getreader(self.encoding)(s) 478 self.assertRaises(UnicodeError, f.read) 479 480 s = io.BytesIO(8*b"\xff") 481 f = codecs.getreader(self.encoding)(s) 482 self.assertRaises(UnicodeError, f.read) 483 484 def test_partial(self): 485 
self.check_partial( 486 "\x00\xff\u0100\uffff\U00010000", 487 [ 488 "", # first byte of BOM read 489 "", # second byte of BOM read 490 "", # third byte of BOM read 491 "", # fourth byte of BOM read => byteorder known 492 "", 493 "", 494 "", 495 "\x00", 496 "\x00", 497 "\x00", 498 "\x00", 499 "\x00\xff", 500 "\x00\xff", 501 "\x00\xff", 502 "\x00\xff", 503 "\x00\xff\u0100", 504 "\x00\xff\u0100", 505 "\x00\xff\u0100", 506 "\x00\xff\u0100", 507 "\x00\xff\u0100\uffff", 508 "\x00\xff\u0100\uffff", 509 "\x00\xff\u0100\uffff", 510 "\x00\xff\u0100\uffff", 511 "\x00\xff\u0100\uffff\U00010000", 512 ] 513 ) 514 515 def test_handlers(self): 516 self.assertEqual(('\ufffd', 1), 517 codecs.utf_32_decode(b'\x01', 'replace', True)) 518 self.assertEqual(('', 1), 519 codecs.utf_32_decode(b'\x01', 'ignore', True)) 520 521 def test_errors(self): 522 self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode, 523 b"\xff", "strict", True) 524 525 def test_decoder_state(self): 526 self.check_state_handling_decode(self.encoding, 527 "spamspam", self.spamle) 528 self.check_state_handling_decode(self.encoding, 529 "spamspam", self.spambe) 530 531 def test_issue8941(self): 532 # Issue #8941: insufficient result allocation when decoding into 533 # surrogate pairs on UCS-2 builds. 
534 encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024 535 self.assertEqual('\U00010000' * 1024, 536 codecs.utf_32_decode(encoded_le)[0]) 537 encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024 538 self.assertEqual('\U00010000' * 1024, 539 codecs.utf_32_decode(encoded_be)[0]) 540 541 542class UTF32LETest(ReadTest, unittest.TestCase): 543 encoding = "utf-32-le" 544 ill_formed_sequence = b"\x80\xdc\x00\x00" 545 546 def test_partial(self): 547 self.check_partial( 548 "\x00\xff\u0100\uffff\U00010000", 549 [ 550 "", 551 "", 552 "", 553 "\x00", 554 "\x00", 555 "\x00", 556 "\x00", 557 "\x00\xff", 558 "\x00\xff", 559 "\x00\xff", 560 "\x00\xff", 561 "\x00\xff\u0100", 562 "\x00\xff\u0100", 563 "\x00\xff\u0100", 564 "\x00\xff\u0100", 565 "\x00\xff\u0100\uffff", 566 "\x00\xff\u0100\uffff", 567 "\x00\xff\u0100\uffff", 568 "\x00\xff\u0100\uffff", 569 "\x00\xff\u0100\uffff\U00010000", 570 ] 571 ) 572 573 def test_simple(self): 574 self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00") 575 576 def test_errors(self): 577 self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode, 578 b"\xff", "strict", True) 579 580 def test_issue8941(self): 581 # Issue #8941: insufficient result allocation when decoding into 582 # surrogate pairs on UCS-2 builds. 
583 encoded = b'\x00\x00\x01\x00' * 1024 584 self.assertEqual('\U00010000' * 1024, 585 codecs.utf_32_le_decode(encoded)[0]) 586 587 588class UTF32BETest(ReadTest, unittest.TestCase): 589 encoding = "utf-32-be" 590 ill_formed_sequence = b"\x00\x00\xdc\x80" 591 592 def test_partial(self): 593 self.check_partial( 594 "\x00\xff\u0100\uffff\U00010000", 595 [ 596 "", 597 "", 598 "", 599 "\x00", 600 "\x00", 601 "\x00", 602 "\x00", 603 "\x00\xff", 604 "\x00\xff", 605 "\x00\xff", 606 "\x00\xff", 607 "\x00\xff\u0100", 608 "\x00\xff\u0100", 609 "\x00\xff\u0100", 610 "\x00\xff\u0100", 611 "\x00\xff\u0100\uffff", 612 "\x00\xff\u0100\uffff", 613 "\x00\xff\u0100\uffff", 614 "\x00\xff\u0100\uffff", 615 "\x00\xff\u0100\uffff\U00010000", 616 ] 617 ) 618 619 def test_simple(self): 620 self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03") 621 622 def test_errors(self): 623 self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode, 624 b"\xff", "strict", True) 625 626 def test_issue8941(self): 627 # Issue #8941: insufficient result allocation when decoding into 628 # surrogate pairs on UCS-2 builds. 
629 encoded = b'\x00\x01\x00\x00' * 1024 630 self.assertEqual('\U00010000' * 1024, 631 codecs.utf_32_be_decode(encoded)[0]) 632 633 634class UTF16Test(ReadTest, unittest.TestCase): 635 encoding = "utf-16" 636 if sys.byteorder == 'little': 637 ill_formed_sequence = b"\x80\xdc" 638 else: 639 ill_formed_sequence = b"\xdc\x80" 640 641 spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00' 642 spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m' 643 644 def test_only_one_bom(self): 645 _,_,reader,writer = codecs.lookup(self.encoding) 646 # encode some stream 647 s = io.BytesIO() 648 f = writer(s) 649 f.write("spam") 650 f.write("spam") 651 d = s.getvalue() 652 # check whether there is exactly one BOM in it 653 self.assertTrue(d == self.spamle or d == self.spambe) 654 # try to read it back 655 s = io.BytesIO(d) 656 f = reader(s) 657 self.assertEqual(f.read(), "spamspam") 658 659 def test_badbom(self): 660 s = io.BytesIO(b"\xff\xff") 661 f = codecs.getreader(self.encoding)(s) 662 self.assertRaises(UnicodeError, f.read) 663 664 s = io.BytesIO(b"\xff\xff\xff\xff") 665 f = codecs.getreader(self.encoding)(s) 666 self.assertRaises(UnicodeError, f.read) 667 668 def test_partial(self): 669 self.check_partial( 670 "\x00\xff\u0100\uffff\U00010000", 671 [ 672 "", # first byte of BOM read 673 "", # second byte of BOM read => byteorder known 674 "", 675 "\x00", 676 "\x00", 677 "\x00\xff", 678 "\x00\xff", 679 "\x00\xff\u0100", 680 "\x00\xff\u0100", 681 "\x00\xff\u0100\uffff", 682 "\x00\xff\u0100\uffff", 683 "\x00\xff\u0100\uffff", 684 "\x00\xff\u0100\uffff", 685 "\x00\xff\u0100\uffff\U00010000", 686 ] 687 ) 688 689 def test_handlers(self): 690 self.assertEqual(('\ufffd', 1), 691 codecs.utf_16_decode(b'\x01', 'replace', True)) 692 self.assertEqual(('', 1), 693 codecs.utf_16_decode(b'\x01', 'ignore', True)) 694 695 def test_errors(self): 696 self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, 697 b"\xff", "strict", True) 698 699 def test_decoder_state(self): 700 
self.check_state_handling_decode(self.encoding, 701 "spamspam", self.spamle) 702 self.check_state_handling_decode(self.encoding, 703 "spamspam", self.spambe) 704 705 def test_bug691291(self): 706 # Files are always opened in binary mode, even if no binary mode was 707 # specified. This means that no automatic conversion of '\n' is done 708 # on reading and writing. 709 s1 = 'Hello\r\nworld\r\n' 710 711 s = s1.encode(self.encoding) 712 self.addCleanup(support.unlink, support.TESTFN) 713 with open(support.TESTFN, 'wb') as fp: 714 fp.write(s) 715 with support.check_warnings(('', DeprecationWarning)): 716 reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding) 717 with reader: 718 self.assertEqual(reader.read(), s1) 719 720class UTF16LETest(ReadTest, unittest.TestCase): 721 encoding = "utf-16-le" 722 ill_formed_sequence = b"\x80\xdc" 723 724 def test_partial(self): 725 self.check_partial( 726 "\x00\xff\u0100\uffff\U00010000", 727 [ 728 "", 729 "\x00", 730 "\x00", 731 "\x00\xff", 732 "\x00\xff", 733 "\x00\xff\u0100", 734 "\x00\xff\u0100", 735 "\x00\xff\u0100\uffff", 736 "\x00\xff\u0100\uffff", 737 "\x00\xff\u0100\uffff", 738 "\x00\xff\u0100\uffff", 739 "\x00\xff\u0100\uffff\U00010000", 740 ] 741 ) 742 743 def test_errors(self): 744 tests = [ 745 (b'\xff', '\ufffd'), 746 (b'A\x00Z', 'A\ufffd'), 747 (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'), 748 (b'\x00\xd8', '\ufffd'), 749 (b'\x00\xd8A', '\ufffd'), 750 (b'\x00\xd8A\x00', '\ufffdA'), 751 (b'\x00\xdcA\x00', '\ufffdA'), 752 ] 753 for raw, expected in tests: 754 self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, 755 raw, 'strict', True) 756 self.assertEqual(raw.decode('utf-16le', 'replace'), expected) 757 758 def test_nonbmp(self): 759 self.assertEqual("\U00010203".encode(self.encoding), 760 b'\x00\xd8\x03\xde') 761 self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding), 762 "\U00010203") 763 764class UTF16BETest(ReadTest, unittest.TestCase): 765 encoding = "utf-16-be" 766 ill_formed_sequence = 
b"\xdc\x80" 767 768 def test_partial(self): 769 self.check_partial( 770 "\x00\xff\u0100\uffff\U00010000", 771 [ 772 "", 773 "\x00", 774 "\x00", 775 "\x00\xff", 776 "\x00\xff", 777 "\x00\xff\u0100", 778 "\x00\xff\u0100", 779 "\x00\xff\u0100\uffff", 780 "\x00\xff\u0100\uffff", 781 "\x00\xff\u0100\uffff", 782 "\x00\xff\u0100\uffff", 783 "\x00\xff\u0100\uffff\U00010000", 784 ] 785 ) 786 787 def test_errors(self): 788 tests = [ 789 (b'\xff', '\ufffd'), 790 (b'\x00A\xff', 'A\ufffd'), 791 (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'), 792 (b'\xd8\x00', '\ufffd'), 793 (b'\xd8\x00\xdc', '\ufffd'), 794 (b'\xd8\x00\x00A', '\ufffdA'), 795 (b'\xdc\x00\x00A', '\ufffdA'), 796 ] 797 for raw, expected in tests: 798 self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, 799 raw, 'strict', True) 800 self.assertEqual(raw.decode('utf-16be', 'replace'), expected) 801 802 def test_nonbmp(self): 803 self.assertEqual("\U00010203".encode(self.encoding), 804 b'\xd8\x00\xde\x03') 805 self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding), 806 "\U00010203") 807 808class UTF8Test(ReadTest, unittest.TestCase): 809 encoding = "utf-8" 810 ill_formed_sequence = b"\xed\xb2\x80" 811 ill_formed_sequence_replace = "\ufffd" * 3 812 BOM = b'' 813 814 def test_partial(self): 815 self.check_partial( 816 "\x00\xff\u07ff\u0800\uffff\U00010000", 817 [ 818 "\x00", 819 "\x00", 820 "\x00\xff", 821 "\x00\xff", 822 "\x00\xff\u07ff", 823 "\x00\xff\u07ff", 824 "\x00\xff\u07ff", 825 "\x00\xff\u07ff\u0800", 826 "\x00\xff\u07ff\u0800", 827 "\x00\xff\u07ff\u0800", 828 "\x00\xff\u07ff\u0800\uffff", 829 "\x00\xff\u07ff\u0800\uffff", 830 "\x00\xff\u07ff\u0800\uffff", 831 "\x00\xff\u07ff\u0800\uffff", 832 "\x00\xff\u07ff\u0800\uffff\U00010000", 833 ] 834 ) 835 836 def test_decoder_state(self): 837 u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff" 838 self.check_state_handling_decode(self.encoding, 839 u, u.encode(self.encoding)) 840 841 def test_decode_error(self): 842 for data, error_handler, expected in ( 
843 (b'[\x80\xff]', 'ignore', '[]'), 844 (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), 845 (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), 846 (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), 847 ): 848 with self.subTest(data=data, error_handler=error_handler, 849 expected=expected): 850 self.assertEqual(data.decode(self.encoding, error_handler), 851 expected) 852 853 def test_lone_surrogates(self): 854 super().test_lone_surrogates() 855 # not sure if this is making sense for 856 # UTF-16 and UTF-32 857 self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"), 858 self.BOM + b'[\x80]') 859 860 with self.assertRaises(UnicodeEncodeError) as cm: 861 "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape") 862 exc = cm.exception 863 self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF') 864 865 def test_surrogatepass_handler(self): 866 self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"), 867 self.BOM + b"abc\xed\xa0\x80def") 868 self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"), 869 self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80") 870 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"), 871 self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]') 872 873 self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"), 874 "abc\ud800def") 875 self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"), 876 "\U00010fff\uD800") 877 878 self.assertTrue(codecs.lookup_error("surrogatepass")) 879 with self.assertRaises(UnicodeDecodeError): 880 b"abc\xed\xa0".decode(self.encoding, "surrogatepass") 881 with self.assertRaises(UnicodeDecodeError): 882 b"abc\xed\xa0z".decode(self.encoding, "surrogatepass") 883 884 def test_incremental_errors(self): 885 # Test that the incremental decoder can fail with final=False. 
        # An incremental decoder must reject invalid and truncated UTF-8
        # sequences outright instead of waiting for more input.
        # See issue #24214
        cases = [b'\x80', b'\xBF', b'\xC0', b'\xC1', b'\xF5', b'\xF6', b'\xFF']
        for prefix in (b'\xC2', b'\xDF', b'\xE0', b'\xE0\xA0', b'\xEF',
                       b'\xEF\xBF', b'\xF0', b'\xF0\x90', b'\xF0\x90\x80',
                       b'\xF4', b'\xF4\x8F', b'\xF4\x8F\xBF'):
            for suffix in b'\x7F', b'\xC0':
                cases.append(prefix + suffix)
        cases.extend((b'\xE0\x80', b'\xE0\x9F', b'\xED\xA0\x80',
                      b'\xED\xBF\xBF', b'\xF0\x80', b'\xF0\x8F', b'\xF4\x90'))

        for data in cases:
            with self.subTest(data=data):
                dec = codecs.getincrementaldecoder(self.encoding)()
                self.assertRaises(UnicodeDecodeError, dec.decode, data)


class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-7 codec (RFC 2152)."""
    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # "+" starts a base64 run; "+-" is the escape for a literal "+"
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters must round-trip through a base64 run
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # Feed the encoded form byte by byte and check the decoded prefix
        # after each byte (indices line up with the encoded byte stream).
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # Malformed input must raise in strict mode and substitute U+FFFD
        # in replace mode; each tuple is (raw bytes, expected replacement).
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # Non-BMP characters travel as UTF-16 surrogate pairs in the
        # base64 run; a paired surrogate literal encodes identically.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # A lone high surrogate decodes as-is when complete, but is
        # replaced when the base64 run is malformed or truncated.
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)


class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() helper."""

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode() on buffer-like objects."""

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the BOM-aware "utf-8-sig" codec.

    Inherits all plain UTF-8 cases from UTF8Test and adds BOM-specific
    behavior: a leading BOM is consumed, any later BOM is real data.
    """
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # A BOM produced by encoding must be stripped again on decode.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        # Stream reading with a leading BOM, for many read chunk sizes.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Same as test_stream_bom but without a BOM: decoding must still
        # succeed unchanged.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)
with self.assertWarns(DeprecationWarning): 1191 check(br"\9", b"\\9") 1192 with self.assertWarns(DeprecationWarning): 1193 check(b"\\\xfa", b"\\\xfa") 1194 1195 def test_errors(self): 1196 decode = codecs.escape_decode 1197 self.assertRaises(ValueError, decode, br"\x") 1198 self.assertRaises(ValueError, decode, br"[\x]") 1199 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6)) 1200 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6)) 1201 self.assertRaises(ValueError, decode, br"\x0") 1202 self.assertRaises(ValueError, decode, br"[\x0]") 1203 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8)) 1204 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8)) 1205 1206 1207# From RFC 3492 1208punycode_testcases = [ 1209 # A Arabic (Egyptian): 1210 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644" 1211 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F", 1212 b"egbpdaj6bu4bxfgehfvwxn"), 1213 # B Chinese (simplified): 1214 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587", 1215 b"ihqwcrb4cv8a8dqg056pqjye"), 1216 # C Chinese (traditional): 1217 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587", 1218 b"ihqwctvzc91f659drss3x8bo0yb"), 1219 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky 1220 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074" 1221 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D" 1222 "\u0065\u0073\u006B\u0079", 1223 b"Proprostnemluvesky-uyb24dma41a"), 1224 # E Hebrew: 1225 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8" 1226 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2" 1227 "\u05D1\u05E8\u05D9\u05EA", 1228 b"4dbcagdahymbxekheh6e0a7fei0b"), 1229 # F Hindi (Devanagari): 1230 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D" 1231 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939" 1232 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947" 1233 "\u0939\u0948\u0902", 1234 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"), 1235 1236 #(G) Japanese 
(kanji and hiragana): 1237 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092" 1238 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B", 1239 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"), 1240 1241 # (H) Korean (Hangul syllables): 1242 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774" 1243 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74" 1244 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C", 1245 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j" 1246 b"psd879ccm6fea98c"), 1247 1248 # (I) Russian (Cyrillic): 1249 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E" 1250 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440" 1251 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A" 1252 "\u0438", 1253 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"), 1254 1255 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol 1256 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070" 1257 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070" 1258 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061" 1259 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070" 1260 "\u0061\u00F1\u006F\u006C", 1261 b"PorqunopuedensimplementehablarenEspaol-fmd56a"), 1262 1263 # (K) Vietnamese: 1264 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\ 1265 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t 1266 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B" 1267 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068" 1268 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067" 1269 "\u0056\u0069\u1EC7\u0074", 1270 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"), 1271 1272 #(L) 3<nen>B<gumi><kinpachi><sensei> 1273 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F", 1274 b"3B-ww4c5e180e575a65lsy2b"), 1275 1276 # (M) <amuro><namie>-with-SUPER-MONKEYS 1277 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074" 1278 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D" 1279 "\u004F\u004E\u004B\u0045\u0059\u0053", 1280 
b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"), 1281 1282 # (N) Hello-Another-Way-<sorezore><no><basho> 1283 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F" 1284 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D" 1285 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240", 1286 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"), 1287 1288 # (O) <hitotsu><yane><no><shita>2 1289 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032", 1290 b"2-u9tlzr9756bt3uc0v"), 1291 1292 # (P) Maji<de>Koi<suru>5<byou><mae> 1293 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059" 1294 "\u308B\u0035\u79D2\u524D", 1295 b"MajiKoi5-783gue6qz075azm5e"), 1296 1297 # (Q) <pafii>de<runba> 1298 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0", 1299 b"de-jg4avhby1noc0d"), 1300 1301 # (R) <sono><supiido><de> 1302 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067", 1303 b"d9juau41awczczp"), 1304 1305 # (S) -> $1.00 <- 1306 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020" 1307 "\u003C\u002D", 1308 b"-> $1.00 <--") 1309 ] 1310 1311for i in punycode_testcases: 1312 if len(i)!=2: 1313 print(repr(i)) 1314 1315 1316class PunycodeTest(unittest.TestCase): 1317 def test_encode(self): 1318 for uni, puny in punycode_testcases: 1319 # Need to convert both strings to lower case, since 1320 # some of the extended encodings use upper case, but our 1321 # code produces only lower case. Converting just puny to 1322 # lower is also insufficient, since some of the input characters 1323 # are upper case. 
class PunycodeTest(unittest.TestCase):
    """Round-trip tests for the "punycode" codec against RFC 3492 samples."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(
                str(uni.encode("punycode"), "ascii").lower(),
                str(puny, "ascii").lower()
            )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))

    def test_decode_invalid(self):
        # "&" is not a valid extended-encoding digit: strict raises,
        # ignore drops the offending tail.
        testcases = [
            (b"xn--w&", "strict", UnicodeError()),
            (b"xn--w&", "ignore", "xn-"),
        ]
        for puny, errors, expected in testcases:
            with self.subTest(puny=puny, errors=errors):
                if isinstance(expected, Exception):
                    self.assertRaises(UnicodeError, puny.decode, "punycode", errors)
                else:
                    self.assertEqual(puny.decode("punycode", errors), expected)


# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Pairs of (input UTF-8 bytes, expected nameprep output UTF-8 bytes);
# expected None means the input contains prohibited characters, and a
# (None, None) entry marks a skipped vector.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    """Run the libidn nameprep test vectors against encodings.idna.nameprep."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Re-raise with the vector number (3.1 .. 3.45) so the
                    # failing case is identifiable.
                    raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: whole-string, stream, and incremental.

    The incremental codec buffers input until a complete label
    (terminated by ".") or end-of-input is seen.
    """

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # Labels are only emitted once complete (terminated by "." or EOF).
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # The incremental encoder likewise buffers until a label is complete.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
class CodecsModuleTest(unittest.TestCase):
    """Tests for the public module-level API of the codecs module."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

        # test keywords
        self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
                         '\xe4\xf6\xfc')
        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
                         '[]')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

        # test keywords
        self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
                         b'[]')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

    def test_all(self):
        # codecs.__all__ must match the documented public API exactly,
        # and every listed name must actually exist.
        api = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(api, codecs.__all__)
        for api in codecs.__all__:
            getattr(codecs, api)

    def test_open(self):
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode), \
                    codecs.open(support.TESTFN, mode, 'ascii') as file:
                self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            self.assertRaises(UnicodeError,
                              codecs.encode, 'abc', 'undefined', errors)
            self.assertRaises(UnicodeError,
                              codecs.decode, b'abc', 'undefined', errors)

    def test_file_closes_if_lookup_error_raised(self):
        # The file opened by codecs.open() must not leak when the codec
        # lookup fails afterwards.
        mock_open = mock.mock_open()
        with mock.patch('builtins.open', mock_open) as file:
            with self.assertRaises(LookupError):
                codecs.open(support.TESTFN, 'wt', 'invalid-encoding')

            file().close.assert_called()


class StreamReaderTest(unittest.TestCase):
    """Tests for codecs.StreamReader behavior on multi-byte input."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
class EncodedFileTest(unittest.TestCase):
    """Tests for codecs.EncodedFile transcoding wrappers."""

    def test_basic(self):
        # Reading transcodes utf-8 file contents to utf-16-le.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing transcodes utf-8 input to latin-1 file contents.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')

# Every codec in this list is exercised by the generic round-trip and
# state-handling tests in BasicUnicodeTest below.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# Windows-only codecs are tested when the interpreter provides them.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = [
    "punycode",
]
1912 encodedresult += encoder.encode(c) 1913 encodedresult += encoder.encode("", True) 1914 decoder = codecs.getincrementaldecoder(encoding)() 1915 decodedresult = "" 1916 for c in encodedresult: 1917 decodedresult += decoder.decode(bytes([c])) 1918 decodedresult += decoder.decode(b"", True) 1919 self.assertEqual(decodedresult, s, 1920 "encoding=%r" % encoding) 1921 1922 # check iterencode()/iterdecode() 1923 result = "".join(codecs.iterdecode( 1924 codecs.iterencode(s, encoding), encoding)) 1925 self.assertEqual(result, s, "encoding=%r" % encoding) 1926 1927 # check iterencode()/iterdecode() with empty string 1928 result = "".join(codecs.iterdecode( 1929 codecs.iterencode("", encoding), encoding)) 1930 self.assertEqual(result, "") 1931 1932 if encoding not in ("idna", "mbcs"): 1933 # check incremental decoder/encoder with errors argument 1934 try: 1935 encoder = codecs.getincrementalencoder(encoding)("ignore") 1936 except LookupError: # no IncrementalEncoder 1937 pass 1938 else: 1939 encodedresult = b"".join(encoder.encode(c) for c in s) 1940 decoder = codecs.getincrementaldecoder(encoding)("ignore") 1941 decodedresult = "".join(decoder.decode(bytes([c])) 1942 for c in encodedresult) 1943 self.assertEqual(decodedresult, s, 1944 "encoding=%r" % encoding) 1945 1946 @support.cpython_only 1947 def test_basics_capi(self): 1948 s = "abc123" # all codecs should be able to encode these 1949 for encoding in all_unicode_encodings: 1950 if encoding not in broken_unicode_with_stateful: 1951 # check incremental decoder/encoder (fetched via the C API) 1952 try: 1953 cencoder = _testcapi.codec_incrementalencoder(encoding) 1954 except LookupError: # no IncrementalEncoder 1955 pass 1956 else: 1957 # check C API 1958 encodedresult = b"" 1959 for c in s: 1960 encodedresult += cencoder.encode(c) 1961 encodedresult += cencoder.encode("", True) 1962 cdecoder = _testcapi.codec_incrementaldecoder(encoding) 1963 decodedresult = "" 1964 for c in encodedresult: 1965 decodedresult += 
cdecoder.decode(bytes([c])) 1966 decodedresult += cdecoder.decode(b"", True) 1967 self.assertEqual(decodedresult, s, 1968 "encoding=%r" % encoding) 1969 1970 if encoding not in ("idna", "mbcs"): 1971 # check incremental decoder/encoder with errors argument 1972 try: 1973 cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore") 1974 except LookupError: # no IncrementalEncoder 1975 pass 1976 else: 1977 encodedresult = b"".join(cencoder.encode(c) for c in s) 1978 cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore") 1979 decodedresult = "".join(cdecoder.decode(bytes([c])) 1980 for c in encodedresult) 1981 self.assertEqual(decodedresult, s, 1982 "encoding=%r" % encoding) 1983 1984 def test_seek(self): 1985 # all codecs should be able to encode these 1986 s = "%s\n%s\n" % (100*"abc123", 100*"def456") 1987 for encoding in all_unicode_encodings: 1988 if encoding == "idna": # FIXME: See SF bug #1163178 1989 continue 1990 if encoding in broken_unicode_with_stateful: 1991 continue 1992 reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding))) 1993 for t in range(5): 1994 # Test that calling seek resets the internal codec state and buffers 1995 reader.seek(0, 0) 1996 data = reader.read() 1997 self.assertEqual(s, data) 1998 1999 def test_bad_decode_args(self): 2000 for encoding in all_unicode_encodings: 2001 decoder = codecs.getdecoder(encoding) 2002 self.assertRaises(TypeError, decoder) 2003 if encoding not in ("idna", "punycode"): 2004 self.assertRaises(TypeError, decoder, 42) 2005 2006 def test_bad_encode_args(self): 2007 for encoding in all_unicode_encodings: 2008 encoder = codecs.getencoder(encoding) 2009 self.assertRaises(TypeError, encoder) 2010 2011 def test_encoding_map_type_initialized(self): 2012 from encodings import cp1140 2013 # This used to crash, we are only verifying there's no crash. 
        # Merely touching encoding_table would have crashed before the fix;
        # the assertion below is intentionally trivial -- reaching it at all
        # is the real check.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))


class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with its supported mapping types."""

    def test_decode_with_string_map(self):
        # A str mapping maps each byte value to the character at that index.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # Non-BMP characters are valid mapping targets.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # A byte beyond the end of the map, or one mapping to U+FFFE, is an
        # error under "strict" (see the replace/ignore variants below).
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab"),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab\ufffe"),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # With an empty map and "ignore", every byte is consumed but none
        # produces output.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        # Replacement strings may be longer than a single character.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        # An empty replacement string drops the byte, even in strict mode.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        # A missing key, a None value, and '\ufffe' all mark the byte as
        # undecodable; under "strict" each raises UnicodeDecodeError.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
            {0: 'a', 1: 'b'}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
            {0: 'a', 1: 'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
            {0: 'a', 1: 'b', 2: '\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: 'a', 1: 'b'}),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\\x02", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b'}),
2164 ("ab", 3) 2165 ) 2166 2167 self.assertEqual( 2168 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2169 {0: 'a', 1: 'b', 2: None}), 2170 ("ab", 3) 2171 ) 2172 2173 # Issue #14850 2174 self.assertEqual( 2175 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2176 {0: 'a', 1: 'b', 2: '\ufffe'}), 2177 ("ab", 3) 2178 ) 2179 2180 allbytes = bytes(range(256)) 2181 self.assertEqual( 2182 codecs.charmap_decode(allbytes, "ignore", {}), 2183 ("", len(allbytes)) 2184 ) 2185 2186 self.assertRaisesRegex(TypeError, 2187 "character mapping must be in range\\(0x110000\\)", 2188 codecs.charmap_decode, 2189 b"\x00\x01\x02", "strict", {0: "A", 1: 'Bb', 2: -2} 2190 ) 2191 2192 self.assertRaisesRegex(TypeError, 2193 "character mapping must be in range\\(0x110000\\)", 2194 codecs.charmap_decode, 2195 b"\x00\x01\x02", "strict", {0: "A", 1: 'Bb', 2: 999999999} 2196 ) 2197 2198 def test_decode_with_int2int_map(self): 2199 a = ord('a') 2200 b = ord('b') 2201 c = ord('c') 2202 2203 self.assertEqual( 2204 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2205 {0: a, 1: b, 2: c}), 2206 ("abc", 3) 2207 ) 2208 2209 # Issue #15379 2210 self.assertEqual( 2211 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2212 {0: 0x10FFFF, 1: b, 2: c}), 2213 ("\U0010FFFFbc", 3) 2214 ) 2215 2216 self.assertEqual( 2217 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2218 {0: sys.maxunicode, 1: b, 2: c}), 2219 (chr(sys.maxunicode) + "bc", 3) 2220 ) 2221 2222 self.assertRaises(TypeError, 2223 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2224 {0: sys.maxunicode + 1, 1: b, 2: c} 2225 ) 2226 2227 self.assertRaises(UnicodeDecodeError, 2228 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2229 {0: a, 1: b}, 2230 ) 2231 2232 self.assertRaises(UnicodeDecodeError, 2233 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2234 {0: a, 1: b, 2: 0xFFFE}, 2235 ) 2236 2237 self.assertEqual( 2238 codecs.charmap_decode(b"\x00\x01\x02", "replace", 2239 {0: a, 1: b}), 2240 ("ab\ufffd", 3) 2241 ) 2242 2243 self.assertEqual( 
2244 codecs.charmap_decode(b"\x00\x01\x02", "replace", 2245 {0: a, 1: b, 2: 0xFFFE}), 2246 ("ab\ufffd", 3) 2247 ) 2248 2249 self.assertEqual( 2250 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", 2251 {0: a, 1: b}), 2252 ("ab\\x02", 3) 2253 ) 2254 2255 self.assertEqual( 2256 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", 2257 {0: a, 1: b, 2: 0xFFFE}), 2258 ("ab\\x02", 3) 2259 ) 2260 2261 self.assertEqual( 2262 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2263 {0: a, 1: b}), 2264 ("ab", 3) 2265 ) 2266 2267 self.assertEqual( 2268 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2269 {0: a, 1: b, 2: 0xFFFE}), 2270 ("ab", 3) 2271 ) 2272 2273 2274class WithStmtTest(unittest.TestCase): 2275 def test_encodedfile(self): 2276 f = io.BytesIO(b"\xc3\xbc") 2277 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef: 2278 self.assertEqual(ef.read(), b"\xfc") 2279 self.assertTrue(f.closed) 2280 2281 def test_streamreaderwriter(self): 2282 f = io.BytesIO(b"\xc3\xbc") 2283 info = codecs.lookup("utf-8") 2284 with codecs.StreamReaderWriter(f, info.streamreader, 2285 info.streamwriter, 'strict') as srw: 2286 self.assertEqual(srw.read(), "\xfc") 2287 2288 2289class TypesTest(unittest.TestCase): 2290 def test_decode_unicode(self): 2291 # Most decoders don't accept unicode input 2292 decoders = [ 2293 codecs.utf_7_decode, 2294 codecs.utf_8_decode, 2295 codecs.utf_16_le_decode, 2296 codecs.utf_16_be_decode, 2297 codecs.utf_16_ex_decode, 2298 codecs.utf_32_decode, 2299 codecs.utf_32_le_decode, 2300 codecs.utf_32_be_decode, 2301 codecs.utf_32_ex_decode, 2302 codecs.latin_1_decode, 2303 codecs.ascii_decode, 2304 codecs.charmap_decode, 2305 ] 2306 if hasattr(codecs, "mbcs_decode"): 2307 decoders.append(codecs.mbcs_decode) 2308 for decoder in decoders: 2309 self.assertRaises(TypeError, decoder, "xxx") 2310 2311 def test_unicode_escape(self): 2312 # Escape-decoding a unicode string is supported and gives the same 2313 # result as decoding the equivalent ASCII bytes 
string. 2314 self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6)) 2315 self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6)) 2316 self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6)) 2317 self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6)) 2318 2319 self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000") 2320 self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10)) 2321 self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"), 2322 (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10)) 2323 2324 self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000") 2325 self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10)) 2326 self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"), 2327 (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10)) 2328 2329 2330class UnicodeEscapeTest(unittest.TestCase): 2331 def test_empty(self): 2332 self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0)) 2333 self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0)) 2334 2335 def test_raw_encode(self): 2336 encode = codecs.unicode_escape_encode 2337 for b in range(32, 127): 2338 if b != b'\\'[0]: 2339 self.assertEqual(encode(chr(b)), (bytes([b]), 1)) 2340 2341 def test_raw_decode(self): 2342 decode = codecs.unicode_escape_decode 2343 for b in range(256): 2344 if b != b'\\'[0]: 2345 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2)) 2346 2347 def test_escape_encode(self): 2348 encode = codecs.unicode_escape_encode 2349 check = coding_checker(self, encode) 2350 check('\t', br'\t') 2351 check('\n', br'\n') 2352 check('\r', br'\r') 2353 check('\\', br'\\') 2354 for b in range(32): 2355 if chr(b) not in '\t\n\r': 2356 check(chr(b), ('\\x%02x' % b).encode()) 2357 for b in range(127, 256): 2358 check(chr(b), ('\\x%02x' % 
b).encode()) 2359 check('\u20ac', br'\u20ac') 2360 check('\U0001d120', br'\U0001d120') 2361 2362 def test_escape_decode(self): 2363 decode = codecs.unicode_escape_decode 2364 check = coding_checker(self, decode) 2365 check(b"[\\\n]", "[]") 2366 check(br'[\"]', '["]') 2367 check(br"[\']", "[']") 2368 check(br"[\\]", r"[\]") 2369 check(br"[\a]", "[\x07]") 2370 check(br"[\b]", "[\x08]") 2371 check(br"[\t]", "[\x09]") 2372 check(br"[\n]", "[\x0a]") 2373 check(br"[\v]", "[\x0b]") 2374 check(br"[\f]", "[\x0c]") 2375 check(br"[\r]", "[\x0d]") 2376 check(br"[\7]", "[\x07]") 2377 check(br"[\78]", "[\x078]") 2378 check(br"[\41]", "[!]") 2379 check(br"[\418]", "[!8]") 2380 check(br"[\101]", "[A]") 2381 check(br"[\1010]", "[A0]") 2382 check(br"[\x41]", "[A]") 2383 check(br"[\x410]", "[A0]") 2384 check(br"\u20ac", "\u20ac") 2385 check(br"\U0001d120", "\U0001d120") 2386 for i in range(97, 123): 2387 b = bytes([i]) 2388 if b not in b'abfnrtuvx': 2389 with self.assertWarns(DeprecationWarning): 2390 check(b"\\" + b, "\\" + chr(i)) 2391 if b.upper() not in b'UN': 2392 with self.assertWarns(DeprecationWarning): 2393 check(b"\\" + b.upper(), "\\" + chr(i-32)) 2394 with self.assertWarns(DeprecationWarning): 2395 check(br"\8", "\\8") 2396 with self.assertWarns(DeprecationWarning): 2397 check(br"\9", "\\9") 2398 with self.assertWarns(DeprecationWarning): 2399 check(b"\\\xfa", "\\\xfa") 2400 2401 def test_decode_errors(self): 2402 decode = codecs.unicode_escape_decode 2403 for c, d in (b'x', 2), (b'u', 4), (b'U', 4): 2404 for i in range(d): 2405 self.assertRaises(UnicodeDecodeError, decode, 2406 b"\\" + c + b"0"*i) 2407 self.assertRaises(UnicodeDecodeError, decode, 2408 b"[\\" + c + b"0"*i + b"]") 2409 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i 2410 self.assertEqual(decode(data, "ignore"), ("[]", len(data))) 2411 self.assertEqual(decode(data, "replace"), 2412 ("[\ufffd]\ufffd", len(data))) 2413 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000") 2414 
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) 2415 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10)) 2416 2417 2418class RawUnicodeEscapeTest(unittest.TestCase): 2419 def test_empty(self): 2420 self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0)) 2421 self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0)) 2422 2423 def test_raw_encode(self): 2424 encode = codecs.raw_unicode_escape_encode 2425 for b in range(256): 2426 self.assertEqual(encode(chr(b)), (bytes([b]), 1)) 2427 2428 def test_raw_decode(self): 2429 decode = codecs.raw_unicode_escape_decode 2430 for b in range(256): 2431 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2)) 2432 2433 def test_escape_encode(self): 2434 encode = codecs.raw_unicode_escape_encode 2435 check = coding_checker(self, encode) 2436 for b in range(256): 2437 if b not in b'uU': 2438 check('\\' + chr(b), b'\\' + bytes([b])) 2439 check('\u20ac', br'\u20ac') 2440 check('\U0001d120', br'\U0001d120') 2441 2442 def test_escape_decode(self): 2443 decode = codecs.raw_unicode_escape_decode 2444 check = coding_checker(self, decode) 2445 for b in range(256): 2446 if b not in b'uU': 2447 check(b'\\' + bytes([b]), '\\' + chr(b)) 2448 check(br"\u20ac", "\u20ac") 2449 check(br"\U0001d120", "\U0001d120") 2450 2451 def test_decode_errors(self): 2452 decode = codecs.raw_unicode_escape_decode 2453 for c, d in (b'u', 4), (b'U', 4): 2454 for i in range(d): 2455 self.assertRaises(UnicodeDecodeError, decode, 2456 b"\\" + c + b"0"*i) 2457 self.assertRaises(UnicodeDecodeError, decode, 2458 b"[\\" + c + b"0"*i + b"]") 2459 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i 2460 self.assertEqual(decode(data, "ignore"), ("[]", len(data))) 2461 self.assertEqual(decode(data, "replace"), 2462 ("[\ufffd]\ufffd", len(data))) 2463 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000") 2464 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) 2465 self.assertEqual(decode(br"\U00110000", 
"replace"), ("\ufffd", 10)) 2466 2467 2468class EscapeEncodeTest(unittest.TestCase): 2469 2470 def test_escape_encode(self): 2471 tests = [ 2472 (b'', (b'', 0)), 2473 (b'foobar', (b'foobar', 6)), 2474 (b'spam\0eggs', (b'spam\\x00eggs', 9)), 2475 (b'a\'b', (b"a\\'b", 3)), 2476 (b'b\\c', (b'b\\\\c', 3)), 2477 (b'c\nd', (b'c\\nd', 3)), 2478 (b'd\re', (b'd\\re', 3)), 2479 (b'f\x7fg', (b'f\\x7fg', 3)), 2480 ] 2481 for data, output in tests: 2482 with self.subTest(data=data): 2483 self.assertEqual(codecs.escape_encode(data), output) 2484 self.assertRaises(TypeError, codecs.escape_encode, 'spam') 2485 self.assertRaises(TypeError, codecs.escape_encode, bytearray(b'spam')) 2486 2487 2488class SurrogateEscapeTest(unittest.TestCase): 2489 2490 def test_utf8(self): 2491 # Bad byte 2492 self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"), 2493 "foo\udc80bar") 2494 self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"), 2495 b"foo\x80bar") 2496 # bad-utf-8 encoded surrogate 2497 self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"), 2498 "\udced\udcb0\udc80") 2499 self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"), 2500 b"\xed\xb0\x80") 2501 2502 def test_ascii(self): 2503 # bad byte 2504 self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"), 2505 "foo\udc80bar") 2506 self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"), 2507 b"foo\x80bar") 2508 2509 def test_charmap(self): 2510 # bad byte: \xa5 is unmapped in iso-8859-3 2511 self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"), 2512 "foo\udca5bar") 2513 self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"), 2514 b"foo\xa5bar") 2515 2516 def test_latin1(self): 2517 # Issue6373 2518 self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"), 2519 b"\xe4\xeb\xef\xf6\xfc") 2520 2521 2522class BomTest(unittest.TestCase): 2523 def test_seek0(self): 2524 data = "1234567890" 2525 tests 
= ("utf-16", 2526 "utf-16-le", 2527 "utf-16-be", 2528 "utf-32", 2529 "utf-32-le", 2530 "utf-32-be") 2531 self.addCleanup(support.unlink, support.TESTFN) 2532 for encoding in tests: 2533 # Check if the BOM is written only once 2534 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2535 f.write(data) 2536 f.write(data) 2537 f.seek(0) 2538 self.assertEqual(f.read(), data * 2) 2539 f.seek(0) 2540 self.assertEqual(f.read(), data * 2) 2541 2542 # Check that the BOM is written after a seek(0) 2543 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2544 f.write(data[0]) 2545 self.assertNotEqual(f.tell(), 0) 2546 f.seek(0) 2547 f.write(data) 2548 f.seek(0) 2549 self.assertEqual(f.read(), data) 2550 2551 # (StreamWriter) Check that the BOM is written after a seek(0) 2552 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2553 f.writer.write(data[0]) 2554 self.assertNotEqual(f.writer.tell(), 0) 2555 f.writer.seek(0) 2556 f.writer.write(data) 2557 f.seek(0) 2558 self.assertEqual(f.read(), data) 2559 2560 # Check that the BOM is not written after a seek() at a position 2561 # different than the start 2562 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2563 f.write(data) 2564 f.seek(f.tell()) 2565 f.write(data) 2566 f.seek(0) 2567 self.assertEqual(f.read(), data * 2) 2568 2569 # (StreamWriter) Check that the BOM is not written after a seek() 2570 # at a position different than the start 2571 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2572 f.writer.write(data) 2573 f.writer.seek(f.writer.tell()) 2574 f.writer.write(data) 2575 f.seek(0) 2576 self.assertEqual(f.read(), data * 2) 2577 2578 2579bytes_transform_encodings = [ 2580 "base64_codec", 2581 "uu_codec", 2582 "quopri_codec", 2583 "hex_codec", 2584] 2585 2586transform_aliases = { 2587 "base64_codec": ["base64", "base_64"], 2588 "uu_codec": ["uu"], 2589 "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"], 2590 "hex_codec": ["hex"], 2591 
    "rot_13": ["rot13"],
}

# zlib and bz2 are optional build-time modules; only register their codecs
# for testing when they are importable.
try:
    import zlib
except ImportError:
    zlib = None
else:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]


class TransformCodecTest(unittest.TestCase):
    """Tests for the bytes-to-bytes transform codecs (base64, hex, ...)."""

    def test_basics(self):
        # Every transform codec must round-trip all 256 byte values through
        # the generic encoder/decoder interface.
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        # StreamReader.read() must decode data produced by the same codec.
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        # readline() must work even though the decoded "text" is bytes.
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
self.assertEqual(data, original) 2654 view_decoded = codecs.decode(view, encoding) 2655 self.assertEqual(view_decoded, data) 2656 2657 def test_text_to_binary_blacklists_binary_transforms(self): 2658 # Check binary -> binary codecs give a good error for str input 2659 bad_input = "bad input type" 2660 for encoding in bytes_transform_encodings: 2661 with self.subTest(encoding=encoding): 2662 fmt = (r"{!r} is not a text encoding; " 2663 r"use codecs.encode\(\) to handle arbitrary codecs") 2664 msg = fmt.format(encoding) 2665 with self.assertRaisesRegex(LookupError, msg) as failure: 2666 bad_input.encode(encoding) 2667 self.assertIsNone(failure.exception.__cause__) 2668 2669 def test_text_to_binary_blacklists_text_transforms(self): 2670 # Check str.encode gives a good error message for str -> str codecs 2671 msg = (r"^'rot_13' is not a text encoding; " 2672 r"use codecs.encode\(\) to handle arbitrary codecs") 2673 with self.assertRaisesRegex(LookupError, msg): 2674 "just an example message".encode("rot_13") 2675 2676 def test_binary_to_text_blacklists_binary_transforms(self): 2677 # Check bytes.decode and bytearray.decode give a good error 2678 # message for binary -> binary codecs 2679 data = b"encode first to ensure we meet any format restrictions" 2680 for encoding in bytes_transform_encodings: 2681 with self.subTest(encoding=encoding): 2682 encoded_data = codecs.encode(data, encoding) 2683 fmt = (r"{!r} is not a text encoding; " 2684 r"use codecs.decode\(\) to handle arbitrary codecs") 2685 msg = fmt.format(encoding) 2686 with self.assertRaisesRegex(LookupError, msg): 2687 encoded_data.decode(encoding) 2688 with self.assertRaisesRegex(LookupError, msg): 2689 bytearray(encoded_data).decode(encoding) 2690 2691 def test_binary_to_text_blacklists_text_transforms(self): 2692 # Check str -> str codec gives a good error for binary input 2693 for bad_input in (b"immutable", bytearray(b"mutable")): 2694 with self.subTest(bad_input=bad_input): 2695 msg = (r"^'rot_13' is not 
a text encoding; " 2696 r"use codecs.decode\(\) to handle arbitrary codecs") 2697 with self.assertRaisesRegex(LookupError, msg) as failure: 2698 bad_input.decode("rot_13") 2699 self.assertIsNone(failure.exception.__cause__) 2700 2701 @unittest.skipUnless(zlib, "Requires zlib support") 2702 def test_custom_zlib_error_is_wrapped(self): 2703 # Check zlib codec gives a good error for malformed input 2704 msg = "^decoding with 'zlib_codec' codec failed" 2705 with self.assertRaisesRegex(Exception, msg) as failure: 2706 codecs.decode(b"hello", "zlib_codec") 2707 self.assertIsInstance(failure.exception.__cause__, 2708 type(failure.exception)) 2709 2710 def test_custom_hex_error_is_wrapped(self): 2711 # Check hex codec gives a good error for malformed input 2712 msg = "^decoding with 'hex_codec' codec failed" 2713 with self.assertRaisesRegex(Exception, msg) as failure: 2714 codecs.decode(b"hello", "hex_codec") 2715 self.assertIsInstance(failure.exception.__cause__, 2716 type(failure.exception)) 2717 2718 # Unfortunately, the bz2 module throws OSError, which the codec 2719 # machinery currently can't wrap :( 2720 2721 # Ensure codec aliases from http://bugs.python.org/issue7475 work 2722 def test_aliases(self): 2723 for codec_name, aliases in transform_aliases.items(): 2724 expected_name = codecs.lookup(codec_name).name 2725 for alias in aliases: 2726 with self.subTest(alias=alias): 2727 info = codecs.lookup(alias) 2728 self.assertEqual(info.name, expected_name) 2729 2730 def test_quopri_stateless(self): 2731 # Should encode with quotetabs=True 2732 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec") 2733 self.assertEqual(encoded, b"space=20tab=09eol=20\n") 2734 # But should still support unescaped tabs and spaces 2735 unescaped = b"space tab eol\n" 2736 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped) 2737 2738 def test_uu_invalid(self): 2739 # Missing "begin" line 2740 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec") 2741 2742 
2743# The codec system tries to wrap exceptions in order to ensure the error 2744# mentions the operation being performed and the codec involved. We 2745# currently *only* want this to happen for relatively stateless 2746# exceptions, where the only significant information they contain is their 2747# type and a single str argument. 2748 2749# Use a local codec registry to avoid appearing to leak objects when 2750# registering multiple search functions 2751_TEST_CODECS = {} 2752 2753def _get_test_codec(codec_name): 2754 return _TEST_CODECS.get(codec_name) 2755codecs.register(_get_test_codec) # Returns None, not usable as a decorator 2756 2757try: 2758 # Issue #22166: Also need to clear the internal cache in CPython 2759 from _codecs import _forget_codec 2760except ImportError: 2761 def _forget_codec(codec_name): 2762 pass 2763 2764 2765class ExceptionChainingTest(unittest.TestCase): 2766 2767 def setUp(self): 2768 # There's no way to unregister a codec search function, so we just 2769 # ensure we render this one fairly harmless after the test 2770 # case finishes by using the test case repr as the codec name 2771 # The codecs module normalizes codec names, although this doesn't 2772 # appear to be formally documented... 2773 # We also make sure we use a truly unique id for the custom codec 2774 # to avoid issues with the codec cache when running these tests 2775 # multiple times (e.g. when hunting for refleaks) 2776 unique_id = repr(self) + str(id(self)) 2777 self.codec_name = encodings.normalize_encoding(unique_id).lower() 2778 2779 # We store the object to raise on the instance because of a bad 2780 # interaction between the codec caching (which means we can't 2781 # recreate the codec entry) and regrtest refleak hunting (which 2782 # runs the same test instance multiple times). 
This means we 2783 # need to ensure the codecs call back in to the instance to find 2784 # out which exception to raise rather than binding them in a 2785 # closure to an object that may change on the next run 2786 self.obj_to_raise = RuntimeError 2787 2788 def tearDown(self): 2789 _TEST_CODECS.pop(self.codec_name, None) 2790 # Issue #22166: Also pop from caches to avoid appearance of ref leaks 2791 encodings._cache.pop(self.codec_name, None) 2792 try: 2793 _forget_codec(self.codec_name) 2794 except KeyError: 2795 pass 2796 2797 def set_codec(self, encode, decode): 2798 codec_info = codecs.CodecInfo(encode, decode, 2799 name=self.codec_name) 2800 _TEST_CODECS[self.codec_name] = codec_info 2801 2802 @contextlib.contextmanager 2803 def assertWrapped(self, operation, exc_type, msg): 2804 full_msg = r"{} with {!r} codec failed \({}: {}\)".format( 2805 operation, self.codec_name, exc_type.__name__, msg) 2806 with self.assertRaisesRegex(exc_type, full_msg) as caught: 2807 yield caught 2808 self.assertIsInstance(caught.exception.__cause__, exc_type) 2809 self.assertIsNotNone(caught.exception.__cause__.__traceback__) 2810 2811 def raise_obj(self, *args, **kwds): 2812 # Helper to dynamically change the object raised by a test codec 2813 raise self.obj_to_raise 2814 2815 def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError): 2816 self.obj_to_raise = obj_to_raise 2817 self.set_codec(self.raise_obj, self.raise_obj) 2818 with self.assertWrapped("encoding", exc_type, msg): 2819 "str_input".encode(self.codec_name) 2820 with self.assertWrapped("encoding", exc_type, msg): 2821 codecs.encode("str_input", self.codec_name) 2822 with self.assertWrapped("decoding", exc_type, msg): 2823 b"bytes input".decode(self.codec_name) 2824 with self.assertWrapped("decoding", exc_type, msg): 2825 codecs.decode(b"bytes input", self.codec_name) 2826 2827 def test_raise_by_type(self): 2828 self.check_wrapped(RuntimeError, "") 2829 2830 def test_raise_by_value(self): 2831 msg = "This should 
be wrapped" 2832 self.check_wrapped(RuntimeError(msg), msg) 2833 2834 def test_raise_grandchild_subclass_exact_size(self): 2835 msg = "This should be wrapped" 2836 class MyRuntimeError(RuntimeError): 2837 __slots__ = () 2838 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError) 2839 2840 def test_raise_subclass_with_weakref_support(self): 2841 msg = "This should be wrapped" 2842 class MyRuntimeError(RuntimeError): 2843 pass 2844 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError) 2845 2846 def check_not_wrapped(self, obj_to_raise, msg): 2847 def raise_obj(*args, **kwds): 2848 raise obj_to_raise 2849 self.set_codec(raise_obj, raise_obj) 2850 with self.assertRaisesRegex(RuntimeError, msg): 2851 "str input".encode(self.codec_name) 2852 with self.assertRaisesRegex(RuntimeError, msg): 2853 codecs.encode("str input", self.codec_name) 2854 with self.assertRaisesRegex(RuntimeError, msg): 2855 b"bytes input".decode(self.codec_name) 2856 with self.assertRaisesRegex(RuntimeError, msg): 2857 codecs.decode(b"bytes input", self.codec_name) 2858 2859 def test_init_override_is_not_wrapped(self): 2860 class CustomInit(RuntimeError): 2861 def __init__(self): 2862 pass 2863 self.check_not_wrapped(CustomInit, "") 2864 2865 def test_new_override_is_not_wrapped(self): 2866 class CustomNew(RuntimeError): 2867 def __new__(cls): 2868 return super().__new__(cls) 2869 self.check_not_wrapped(CustomNew, "") 2870 2871 def test_instance_attribute_is_not_wrapped(self): 2872 msg = "This should NOT be wrapped" 2873 exc = RuntimeError(msg) 2874 exc.attr = 1 2875 self.check_not_wrapped(exc, "^{}$".format(msg)) 2876 2877 def test_non_str_arg_is_not_wrapped(self): 2878 self.check_not_wrapped(RuntimeError(1), "1") 2879 2880 def test_multiple_args_is_not_wrapped(self): 2881 msg_re = r"^\('a', 'b', 'c'\)$" 2882 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re) 2883 2884 # http://bugs.python.org/issue19609 2885 def test_codec_lookup_failure_not_wrapped(self): 2886 msg = "^unknown 
encoding: {}$".format(self.codec_name) 2887 # The initial codec lookup should not be wrapped 2888 with self.assertRaisesRegex(LookupError, msg): 2889 "str input".encode(self.codec_name) 2890 with self.assertRaisesRegex(LookupError, msg): 2891 codecs.encode("str input", self.codec_name) 2892 with self.assertRaisesRegex(LookupError, msg): 2893 b"bytes input".decode(self.codec_name) 2894 with self.assertRaisesRegex(LookupError, msg): 2895 codecs.decode(b"bytes input", self.codec_name) 2896 2897 def test_unflagged_non_text_codec_handling(self): 2898 # The stdlib non-text codecs are now marked so they're 2899 # pre-emptively skipped by the text model related methods 2900 # However, third party codecs won't be flagged, so we still make 2901 # sure the case where an inappropriate output type is produced is 2902 # handled appropriately 2903 def encode_to_str(*args, **kwds): 2904 return "not bytes!", 0 2905 def decode_to_bytes(*args, **kwds): 2906 return b"not str!", 0 2907 self.set_codec(encode_to_str, decode_to_bytes) 2908 # No input or output type checks on the codecs module functions 2909 encoded = codecs.encode(None, self.codec_name) 2910 self.assertEqual(encoded, "not bytes!") 2911 decoded = codecs.decode(None, self.codec_name) 2912 self.assertEqual(decoded, b"not str!") 2913 # Text model methods should complain 2914 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; " 2915 r"use codecs.encode\(\) to encode to arbitrary types$") 2916 msg = fmt.format(self.codec_name) 2917 with self.assertRaisesRegex(TypeError, msg): 2918 "str_input".encode(self.codec_name) 2919 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; " 2920 r"use codecs.decode\(\) to decode to arbitrary types$") 2921 msg = fmt.format(self.codec_name) 2922 with self.assertRaisesRegex(TypeError, msg): 2923 b"bytes input".decode(self.codec_name) 2924 2925 2926 2927@unittest.skipUnless(sys.platform == 'win32', 2928 'code pages are specific to Windows') 2929class CodePageTest(unittest.TestCase): 
    # 65001 is Windows' code-page number for UTF-8.
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # Negative code pages are rejected up front (ValueError);
        # 123 is well-formed but unknown, so the OS reports OSError.
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The code-page name ("cp932", "CP_UTF8") must appear in the
        # Unicode error message.
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
                               codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
                               codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
                               codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # Each test is (raw bytes, error handler, expected str);
        # expected None means a UnicodeDecodeError is required.
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors, True)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                                 '%a.decode("cp%s", %r)=%a != %a'
                                 % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                                  codecs.code_page_decode, cp, raw, errors, True)

    def check_encode(self, cp, tests):
        # Each test is (text, error handler, expected bytes);
        # expected None means a UnicodeEncodeError is required.
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                                 '%a.encode("cp%s", %r)=%a != %a'
                                 % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                                  codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        ))

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict',  b'+AOkgrA-'),
            ('\U0010ffff', 'strict',  b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        # Truncated lead bytes before a valid multi-byte sequence.
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        self.check_encode(self.CP_UTF8, (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))

    def test_code_page_decode_flags(self):
        # Issue #36312: For some code pages (e.g. UTF-7) flags for
        # MultiByteToWideChar() must be set to 0.
        if support.verbose:
            sys.stdout.write('\n')
        for cp in (50220, 50221, 50222, 50225, 50227, 50229,
                   *range(57002, 57011+1), 65000):
            # On small versions of Windows like Windows IoT
            # not all codepages are present.
            # A missing codepage causes an OSError exception
            # so check for the codepage before decoding
            if is_code_page_present(cp):
                self.assertEqual(codecs.code_page_decode(cp, b'abc'), ('abc', 3), f'cp{cp}')
            else:
                if support.verbose:
                    print(f" skipping cp={cp}")
        self.assertEqual(codecs.code_page_decode(42, b'abc'),
                         ('\uf061\uf062\uf063', 3))

    def test_incremental(self):
        # final=False: a lone cp932 lead byte decodes to nothing yet
        # and consumes 0 bytes.
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        # Trailing incomplete lead byte is left unconsumed (2 of 3 bytes).
        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            codec = codecs.lookup('cp123')
            self.assertEqual(codec.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        encoded = (b'01234567' * ((size//8)-1) +
                   b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
        self.assertEqual(len(encoded), size+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        # free the big bytes object before checking the decoded text
        del encoded
        self.assertEqual(len(decoded[0]), decoded[1])
        self.assertEqual(decoded[0][:10], '0123456701')
        self.assertEqual(decoded[0][-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')

    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
    def test_large_utf8_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain a decodable multi-byte character
        # surrounding INT_MAX
        encoded = (b'0123456\xed\x84\x80' * (size//8))
        self.assertEqual(len(encoded), size // 8 * 10)
        decoded = codecs.code_page_decode(65001, encoded, 'ignore', True)
        self.assertEqual(decoded[1], len(encoded))
        del encoded
        self.assertEqual(len(decoded[0]), size)
        self.assertEqual(decoded[0][:10], '0123456\ud10001')
        self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100')


class ASCIITest(unittest.TestCase):
    def test_encode(self):
        self.assertEqual('abc123'.encode('ascii'), b'abc123')

    def test_encode_error(self):
        # Every stdlib error handler on the same undecodable input.
        for data, error_handler, expected in (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('ascii', error_handler),
                                 expected)

    def test_encode_surrogateescape_error(self):
        with self.assertRaises(UnicodeEncodeError):
            # the first character can
            # be decoded, but not the second
            '\udc80\xff'.encode('ascii', 'surrogateescape')

    def test_decode(self):
        self.assertEqual(b'abc'.decode('ascii'), 'abc')

    def test_decode_error(self):
        # Each decode error handler on the same non-ASCII bytes.
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode('ascii', error_handler),
                                 expected)


class Latin1Test(unittest.TestCase):
    def test_encode(self):
        for data, expected in (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.encode('latin1'), expected)

    def test_encode_errors(self):
        for data, error_handler, expected in (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('latin1', error_handler),
                                 expected)

    def test_encode_surrogateescape_error(self):
        with self.assertRaises(UnicodeEncodeError):
            # the first character can be decoded, but not the second
            '\udc80\u20ac'.encode('latin1', 'surrogateescape')

    def test_decode(self):
        for data, expected in (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.decode('latin1'), expected)


class StreamRecoderTest(unittest.TestCase):
    def test_writelines(self):
        bio = io.BytesIO()
        codec = codecs.lookup('ascii')
        sr = codecs.StreamRecoder(bio, codec.encode, codec.decode,
                                  encodings.ascii.StreamReader,
                                  encodings.ascii.StreamWriter)
        sr.writelines([b'a', b'b'])
        self.assertEqual(bio.getvalue(), b'ab')

    def test_write(self):
        bio = io.BytesIO()
        codec = codecs.lookup('latin1')
        # Recode from Latin-1 to utf-8.
        sr = codecs.StreamRecoder(bio, codec.encode, codec.decode,
                                  encodings.utf_8.StreamReader,
                                  encodings.utf_8.StreamWriter)

        text = 'àñé'
        sr.write(text.encode('latin1'))
        self.assertEqual(bio.getvalue(), text.encode('utf-8'))

    def test_seeking_read(self):
        bio = io.BytesIO('line1\nline2\nline3\n'.encode('utf-16-le'))
        sr = codecs.EncodedFile(bio, 'utf-8', 'utf-16-le')

        self.assertEqual(sr.readline(), b'line1\n')
        sr.seek(0)
        self.assertEqual(sr.readline(), b'line1\n')
        self.assertEqual(sr.readline(), b'line2\n')
        self.assertEqual(sr.readline(), b'line3\n')
        self.assertEqual(sr.readline(), b'')

    def test_seeking_write(self):
        bio = io.BytesIO('123456789\n'.encode('utf-16-le'))
        sr = codecs.EncodedFile(bio, 'utf-8', 'utf-16-le')

        # Test that seek() only resets its internal buffer when offset
        # and whence are zero.
        sr.seek(2)
        sr.write(b'\nabc\n')
        self.assertEqual(sr.readline(), b'789\n')
        sr.seek(0)
        self.assertEqual(sr.readline(), b'1\n')
        self.assertEqual(sr.readline(), b'abc\n')
        self.assertEqual(sr.readline(), b'789\n')


@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
    """
    # Encoding under test is the current filesystem encoding; the
    # sample strings cover ASCII, Latin-1, U+00FF, astral characters
    # and lone surrogates.
    ENCODING = sys.getfilesystemencoding()
    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
               "u255:\xff",
               "UCS:\xe9\u20ac\U0010ffff",
               "surrogates:\uDC80\uDCFF")
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        # Exercises _Py_EncodeUTF8Ex() via the test C API.
        return _testcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        # The C-level encoder must agree with str.encode(); where
        # str.encode() fails, the C API raises RuntimeError instead.
        for text in self.STRINGS:
            with self.subTest(text=text):
                try:
                    expected = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.encode(text, errors)
                    errmsg = str(cm.exception)
                    self.assertRegex(errmsg, r"encode error: pos=[0-9]+, reason=")
                else:
                    encoded = self.encode(text, errors)
                    self.assertEqual(encoded, expected)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        # Not every locale encoder supports surrogatepass; skip if so.
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_encode_strings("surrogatepass")

    def test_encode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as cm:
            self.encode('', 'backslashreplace')
        self.assertEqual(str(cm.exception), 'unsupported error handler')

    def decode(self, encoded, errors="strict"):
        # Exercises _Py_DecodeUTF8Ex() via the test C API.
        return _testcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        is_utf8 = (self.ENCODING == "utf-8")
        if is_utf8:
            encode_errors = 'surrogateescape'
        else:
            encode_errors = 'strict'

        # Build the byte-string corpus: the fixed BYTES_STRINGS plus
        # every STRINGS sample that can be encoded (and, for UTF-8, the
        # surrogatepass variant when it differs).
        strings = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
                if encoded not in strings:
                    strings.append(encoded)
            except UnicodeEncodeError:
                encoded = None

            if is_utf8:
                encoded2 = text.encode(self.ENCODING, 'surrogatepass')
                if encoded2 != encoded:
                    strings.append(encoded2)

        for encoded in strings:
            with self.subTest(encoded=encoded):
                try:
                    expected = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.decode(encoded, errors)
                    errmsg = str(cm.exception)
                    self.assertTrue(errmsg.startswith("decode error: "), errmsg)
                else:
                    decoded = self.decode(encoded, errors)
                    self.assertEqual(decoded, expected)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        # Not every locale decoder supports surrogatepass; skip if so.
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_decode_strings("surrogatepass")

    def test_decode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as cm:
            self.decode(b'', 'backslashreplace')
        self.assertEqual(str(cm.exception), 'unsupported error handler')


class Rot13Test(unittest.TestCase):
    """Test the educational ROT-13 codec."""
    def test_encode(self):
        ciphertext = codecs.encode("Caesar liked ciphers", 'rot-13')
        self.assertEqual(ciphertext, 'Pnrfne yvxrq pvcuref')

    def test_decode(self):
        plaintext = codecs.decode('Rg gh, Oehgr?', 'rot-13')
        self.assertEqual(plaintext, 'Et tu, Brute?')

    def test_incremental_encode(self):
        encoder = codecs.getincrementalencoder('rot-13')()
        ciphertext = encoder.encode('ABBA nag Cheryl Baker')
        self.assertEqual(ciphertext, 'NOON ant Purely Onxre')

    def test_incremental_decode(self):
        decoder = codecs.getincrementaldecoder('rot-13')()
        plaintext = decoder.decode('terra Ares envy tha')
        self.assertEqual(plaintext, 'green Nerf rail gun')


class Rot13UtilTest(unittest.TestCase):
    """Test the ROT-13 codec via rot13 function,
    i.e. the user has done something like:
    $ echo "Hello World" | python -m encodings.rot_13
    """
    def test_rot13_func(self):
        # Drive the module's file-to-file helper directly with in-memory
        # streams instead of spawning a subprocess.
        infile = io.StringIO('Gb or, be abg gb or, gung vf gur dhrfgvba')
        outfile = io.StringIO()
        encodings.rot_13.rot13(infile, outfile)
        outfile.seek(0)
        plain_text = outfile.read()
        self.assertEqual(
            plain_text,
            'To be, or not to be, that is the question')


if __name__ == "__main__":
    unittest.main()