1--TEST--
2Exhaustive test of SJIS-2004 encoding verification and conversion
3--EXTENSIONS--
4mbstring
5--SKIPIF--
<?php
/* Opt out of this test when the SKIP_SLOW_TESTS environment variable holds a
 * truthy value (i.e. it is set, non-empty and not "0") */
$skipSlowTests = getenv("SKIP_SLOW_TESTS");
if ($skipSlowTests) {
	die("skip slow test");
}
?>
9--FILE--
<?php
srand(101); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in the table of all characters in SJIS-2004.
 *
 * Lines starting with '#' are skipped. Every other line is parsed as
 * "0x<SJIS bytes>\tU+<codepoint>" with an optional "+<codepoint>" suffix for
 * SJIS-2004 characters which map to a pair of Unicode codepoints; lines which
 * yield fewer than two values are ignored. */
$validChars = array(); /* SJIS-2004 string -> UTF-32BE string */
$fromUnicode = array(); /* UTF-16BE -> SJIS-2004 */

/* The table is only ever read, so open it read-only; 'r+' also asks for write
 * access and would fail needlessly on a read-only source tree */
$fp = fopen(__DIR__ . '/data/SJIS-2004.txt', 'r');
if ($fp === false)
	die("Could not open data/SJIS-2004.txt\n");

/* Compare against false explicitly, so that a falsy line (such as a final "0"
 * with no trailing newline) cannot end the loop early */
while (($line = fgets($fp, 256)) !== false) {
	if ($line[0] === '#')
		continue;

	/* Reset before each sscanf() call so that a second codepoint parsed from
	 * an earlier line is never carried over to this one */
	$codepoint2 = null;
	if (sscanf($line, "0x%x\tU+%x+%x", $bytes, $codepoint1, $codepoint2) >= 2) {
		/* Values below 256 are single-byte characters; everything else is
		 * stored as two bytes, most significant byte first */
		$sjis = ($bytes < 256) ? chr($bytes) : pack('n', $bytes);
		if ($codepoint2) {
			/* Codepoint pairs are only recorded for the SJIS-2004 -> Unicode
			 * direction; they get no entry in $fromUnicode */
			$validChars[$sjis] = pack('NN', $codepoint1, $codepoint2);
		} else {
			/* Two input byte sequences can translate to either a 'halfwidth' or a
			 * 'fullwidth' version of a character; our implementation of SJIS-2004
			 * translates them to the fullwidth versions */
			if (preg_match('/Fullwidth: U\+([0-9A-F]+)/', $line, $match))
				$codepoint1 = hexdec($match[1]);
			$validChars[$sjis] = pack('N', $codepoint1);
			/* $fromUnicode is keyed by a single UTF-16BE code unit, so
			 * codepoints above U+FFFF are left out of it */
			if ($codepoint1 <= 0xFFFF)
				$fromUnicode[pack('n', $codepoint1)] = $sjis;
		}
	}
}
fclose($fp);
40
/* Explicit Unicode -> SJIS-2004 mappings, which replace any entry the table
 * produced for the same UTF-16BE key:
 *   - U+007E is TILDE, Shift-JIS 0x8160 is WAVE DASH
 *   - U+005C is backslash, Shift-JIS 0x815F is REVERSE SOLIDUS
 *     (ie. a fancy way to say "backslash") */
$fromUnicodeOverrides = array(
	"\x00\x7E" => "\x81\x60",
	"\x00\x5C" => "\x81\x5F",
);
foreach ($fromUnicodeOverrides as $overrideUtf16 => $overrideSjis)
	$fromUnicode[$overrideUtf16] = $overrideSjis;
47
/* Phase 1: SJIS-2004 -> UTF-32BE for every byte sequence read from the table */
testAllValidChars($validChars, 'SJIS-2004', 'UTF-32BE');
echo "SJIS-2004 verification and conversion works for all valid characters\n";

/* Phase 2: SJIS-2004 byte sequences which are NOT in the table.
 * Bytes 0x81-0x9F and 0xE0-0xFC are handed to findInvalidChars() with the value 2;
 * NOTE(review): presumably this marks them as lead bytes of 2-byte characters --
 * confirm against encoding_tests.inc */
$sjisLeadBytes = array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2);
$utf32Substitute = "\x00\x00\x00%"; /* '%' encoded as UTF-32BE */
findInvalidChars($validChars, $invalidChars, $truncated, $sjisLeadBytes);
testAllInvalidChars($invalidChars, $validChars, 'SJIS-2004', 'UTF-32BE', $utf32Substitute);
testTruncatedChars($truncated, 'SJIS-2004', 'UTF-32BE', $utf32Substitute);
echo "SJIS-2004 verification and conversion rejects all invalid characters\n";

/* Phase 3: UTF-16BE -> SJIS-2004 for every mapping collected in $fromUnicode */
testAllValidChars($fromUnicode, 'UTF-16BE', 'SJIS-2004', false);
echo "Unicode -> SJIS-2004 conversion works on all valid characters\n";

/* Phase 4: UTF-16BE sequences which are NOT in $fromUnicode; every possible
 * first byte (0x00-0xFF) is passed with the value 2 */
$utf16FirstBytes = array_fill_keys(range(0, 0xFF), 2);
findInvalidChars($fromUnicode, $invalidChars, $unused, $utf16FirstBytes);
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-2004', '%');
echo "Unicode -> SJIS-2004 conversion works on all invalid characters\n";
63
// SJIS-2004 represents certain pairs of Unicode codepoints as a single character.
// Here the first codepoint (U+304B) looks like it could start one of those pairs,
// but the codepoint after it ('A') does not complete one
convertValidString("\x30\x4B\x00A", "\x82\xA9A", 'UTF-16BE', 'SJIS-2004', false);

// Test "long" illegal character markers
mb_substitute_character("long");
$invalidSjisInputs = array("\x80", "\x81\x20", "\xFC\xF5");
foreach ($invalidSjisInputs as $invalidSjis)
	convertInvalidString($invalidSjis, "%", "SJIS-2004", "UTF-8");

echo "Done!\n";
?>
77--EXPECT--
78SJIS-2004 verification and conversion works for all valid characters
79SJIS-2004 verification and conversion rejects all invalid characters
80Unicode -> SJIS-2004 conversion works on all valid characters
81Unicode -> SJIS-2004 conversion works on all invalid characters
82Done!
83