--TEST--
Exhaustive test of EUC-JP-2004 encoding verification and conversion
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
/*
 * Builds two lookup tables from data/EUC-JP-2004.txt and hands them to the
 * helpers loaded from encoding_tests.inc:
 *
 *   $validChars  - EUC-JP-2004 byte string => UTF-32BE byte string
 *   $fromUnicode - UTF-16BE byte string    => EUC-JP-2004 byte string
 *
 * A few entries are then overridden by hand (see the comments below) before
 * the tables are used to check conversion in both directions.
 */
srand(200); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

$validChars = []; /* EUC-JP-2004 string -> UTF-32BE */
$fromUnicode = []; /* UTF-16BE -> EUC-JP-2004 */

/* The mapping table is only ever read, so open it read-only; 'r+' would
 * needlessly fail when the source tree is not writable. */
$dataFile = __DIR__ . '/data/EUC-JP-2004.txt';
$fp = fopen($dataFile, 'r');
if ($fp === false) {
    die("Could not open $dataFile\n");
}

/* Compare against false explicitly so that a falsy line (such as "0") cannot
 * end the loop early. */
while (($line = fgets($fp, 256)) !== false) {
    /* Lines starting with '#' are comments in the data file */
    if ($line[0] === '#') {
        continue;
    }

    /* Reset on every iteration: the second codepoint is optional in the
     * format below, and a value left over from an earlier line must not be
     * mistaken for one on this line. */
    $codepoint2 = null;
    if (sscanf($line, "0x%x\tU+%x+%x", $bytes, $codepoint1, $codepoint2) >= 2) {
        /* Serialize the numeric byte value as a 1-, 2- or 3-byte string */
        if ($bytes < 256) {
            $eucjp = chr($bytes);
        } elseif ($bytes <= 0xFFFF) {
            $eucjp = pack('n', $bytes);
        } else {
            $eucjp = chr($bytes >> 16) . pack('n', $bytes & 0xFFFF);
        }

        if ($codepoint2) {
            /* Two-codepoint sequence: only recorded for the
             * EUC-JP-2004 -> Unicode direction */
            $validChars[$eucjp] = pack('NN', $codepoint1, $codepoint2);
        } else {
            $validChars[$eucjp] = pack('N', $codepoint1);
            /* Only codepoints which fit in a single UTF-16 code unit are
             * recorded for the reverse direction */
            if ($codepoint1 <= 0xFFFF) {
                $fromUnicode[pack('n', $codepoint1)] = $eucjp;
            }
        }
    }
}
fclose($fp);

/* Convert 0xA1B1 to U+FFE3 (FULLWIDTH MACRON), not U+203E (OVERLINE) */
$validChars["\xA1\xB1"] = "\x00\x00\xFF\xE3";
$fromUnicode["\xFF\xE3"] = "\xA1\xB1";

/* Convert 0xA1EF to U+FFE5 (FULLWIDTH YEN SIGN), not U+00A5 (YEN SIGN) */
$validChars["\xA1\xEF"] = "\x00\x00\xFF\xE5";
$fromUnicode["\xFF\xE5"] = "\xA1\xEF";

/* Convert U+00A5 (YEN SIGN) to 0x5C; that is one of the single bytes
 * which many legacy Japanese text encodings used to represent something
 * different from its normal ASCII meaning. In ASCII it's a backslash,
 * but legacy Japanese software often used it for a yen sign. */
$fromUnicode["\x00\xA5"] = "\x5C";
/* The other one is 0x7E, which is a tilde in ASCII, but was used in
 * legacy Japanese software for an overline */
$fromUnicode["\x20\x3E"] = "\x7E";

testAllValidChars($validChars, 'EUC-JP-2004', 'UTF-32BE');
echo "EUC-JP-2004 verification and conversion works for all valid characters\n";

findInvalidChars($validChars, $invalidChars, $truncated);
testAllInvalidChars($invalidChars, $validChars, 'EUC-JP-2004', 'UTF-32BE', "\x00\x00\x00%");
testTruncatedChars($truncated, 'EUC-JP-2004', 'UTF-32BE', "\x00\x00\x00%");
echo "EUC-JP-2004 verification and conversion rejects all invalid characters\n";

testAllValidChars($fromUnicode, 'UTF-16BE', 'EUC-JP-2004', false);
echo "Unicode -> EUC-JP-2004 conversion works on all valid characters\n";

findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'EUC-JP-2004', '%');
echo "Unicode -> EUC-JP-2004 conversion works on all invalid characters\n";

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "%", "EUC-JP-2004", "UTF-8");
convertInvalidString("\xFE\xFF", "%", "EUC-JP-2004", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
EUC-JP-2004 verification and conversion works for all valid characters
EUC-JP-2004 verification and conversion rejects all invalid characters
Unicode -> EUC-JP-2004 conversion works on all valid characters
Unicode -> EUC-JP-2004 conversion works on all invalid characters
Done!