1<?php
2# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
3# http://www.mediawiki.org/
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation; either version 2 of the License, or
8# (at your option) any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License along
16# with this program; if not, write to the Free Software Foundation, Inc.,
17# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18# http://www.gnu.org/copyleft/gpl.html
19
20
// This script drives PHPUnit's text runner directly and ends with exit(),
// so it is only meaningful when started from the command line.
if (php_sapi_name() !== 'cli') {
    die("Run me from the command line please.\n");
}

// Passing --icu asks for the php_utfnormal.so extension to be loaded before
// the library is included (presumably so the tests exercise the native
// normalizer instead of the pure-PHP one -- confirm against UtfNormal.php).
if (isset($_SERVER['argv']) && in_array('--icu', $_SERVER['argv'], true)) {
    // dl() returns false when the library cannot be loaded. Continuing in
    // that case would run the tests without the extension that was
    // explicitly requested, so stop with a clear message instead.
    if (!dl('php_utfnormal.so')) {
        fwrite(STDERR, "Could not load php_utfnormal.so (requested with --icu).\n");
        exit(1);
    }
}
29
30#ini_set( 'memory_limit', '40M' );
31
32require_once 'PHPUnit/Framework.php';
33require_once 'PHPUnit/TextUI/TestRunner.php';
34
35require_once 'include/Unicode/UtfNormal.php';
36
/**
 * Additional tests for the UtfNormal::cleanUp() function, including
 * regression checks for known problems.
 * Requires PHPUnit.
 *
 * Most checks compare bin2hex() dumps rather than raw strings so that a
 * failure message shows readable hex instead of arbitrary bytes. The dumps
 * are compared with assertSame(): hex dumps such as "00e1" and "00e2" are
 * numeric strings that PHP's loose comparison treats as equal, so a loose
 * comparison could hide a real difference.
 *
 * @addtogroup UtfNormal
 * @private
 */
class CleanUpTest extends PHPUnit_Framework_TestCase
{
    /**
     * No fixture is required; the method is intentionally empty.
     */
    public function setUp()
    {
    }

    /**
     * Nothing to release; the method is intentionally empty.
     */
    public function tearDown()
    {
    }

    /**
     * Assert that two byte strings are identical, reporting them as hex.
     *
     * @param string $expected Expected bytes.
     * @param string $actual   Bytes actually produced.
     * @param string $message  Optional failure message.
     */
    private function assertSameBytes($expected, $actual, $message = '')
    {
        $this->assertSame(bin2hex($expected), bin2hex($actual), $message);
    }

    /**
     * Plain ASCII text must come back unchanged.
     */
    public function testAscii()
    {
        $text = 'This is plain ASCII text.';
        $this->assertSame($text, UtfNormal::cleanUp($text));
    }

    /**
     * A NUL byte must be replaced by "\xef\xbf\xbd" (UTF-8 for U+FFFD).
     */
    public function testNull()
    {
        $text = "a \x00 null";
        $expect = "a \xef\xbf\xbd null";
        $this->assertSameBytes($expect, UtfNormal::cleanUp($text));
    }

    /**
     * Text containing a precomposed "e with acute" (C3 A9) must come back
     * unchanged.
     */
    public function testLatin()
    {
        $text = "L'\xc3\xa9cole";
        $this->assertSame($text, UtfNormal::cleanUp($text));
    }

    /**
     * "e" followed by a combining acute accent (CC 81) must come back as
     * the precomposed character (C3 A9).
     */
    public function testLatinNormal()
    {
        $text = "L'e\xcc\x81cole";
        $expect = "L'\xc3\xa9cole";
        $this->assertSame($expect, UtfNormal::cleanUp($text));
    }

    /**
     * Run cleanUp() on every code point from 0 through UNICODE_MAX.
     *
     * This test is *very* expensive! The "X" prefix keeps PHPUnit from
     * picking it up as a test method; rename it to testAllChars to run it.
     *
     * Expectations:
     *  - tab, LF, CR and every other code point accepted by the range
     *    check below must either be left intact or, when it appears in the
     *    canonical composition/decomposition tables, match UtfNormal::NFC();
     *  - everything else must become UTF8_REPLACEMENT.
     */
    public function XtestAllChars()
    {
        $rep = UTF8_REPLACEMENT;
        global $utfCanonicalComp, $utfCanonicalDecomp;
        // The upper bound is inclusive: the range check below explicitly
        // accepts $i == UNICODE_MAX, so the last code point must be visited.
        for ($i = 0x0; $i <= UNICODE_MAX; $i++) {
            $char = codepointToUtf8($i);
            $clean = UtfNormal::cleanUp($char);
            $x = sprintf("%04X", $i);
            if ($i % 0x1000 === 0) {
                // Progress indicator, one line per 0x1000 code points.
                echo "U+$x\n";
            }
            $allowed = $i === 0x0009 ||
                $i === 0x000a ||
                $i === 0x000d ||
                ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
                ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe) ||
                ($i > 0xffff && $i <= UNICODE_MAX);
            if (!$allowed) {
                $this->assertSameBytes($rep, $clean, $x);
                continue;
            }
            if (isset($utfCanonicalComp[$char]) || isset($utfCanonicalDecomp[$char])) {
                $comp = UtfNormal::NFC($char);
                $this->assertSameBytes($comp, $clean, "U+$x should be decomposed");
            } else {
                $this->assertSameBytes($char, $clean, "U+$x should be intact");
            }
        }
    }

    /**
     * Check every single byte value on its own and with an ASCII "x"
     * before and/or after it.
     */
    public function testAllBytes()
    {
        $this->doTestBytes('', '');
        $this->doTestBytes('x', '');
        $this->doTestBytes('', 'x');
        $this->doTestBytes('x', 'x');
    }

    /**
     * Run cleanUp() on each byte 0x00-0xFF wrapped in $head and $tail.
     *
     * Tab, LF, CR and bytes 0x20-0x7F must be preserved; any other byte
     * must be replaced by UTF8_REPLACEMENT. A failing assertion throws,
     * which ends the loop at the first mismatch.
     *
     * @param string $head Text placed before the byte under test.
     * @param string $tail Text placed after the byte under test.
     */
    public function doTestBytes($head, $tail)
    {
        for ($i = 0x0; $i < 256; $i++) {
            $char = $head . chr($i) . $tail;
            $clean = UtfNormal::cleanUp($char);
            $x = sprintf("%02X", $i);
            $allowed = $i === 0x0009 ||
                $i === 0x000a ||
                $i === 0x000d ||
                ($i > 0x001f && $i < 0x80);
            if ($allowed) {
                $this->assertSameBytes($char, $clean, "ASCII byte $x should be intact");
            } else {
                $norm = $head . UTF8_REPLACEMENT . $tail;
                $this->assertSameBytes($norm, $clean, "Forbidden byte $x should be rejected");
            }
        }
    }

    /**
     * Check two-byte combinations on their own and with an ASCII "x"
     * before and/or after them.
     */
    public function testDoubleBytes()
    {
        $this->doTestDoubleBytes('', '');
        $this->doTestDoubleBytes('x', '');
        $this->doTestDoubleBytes('', 'x');
        $this->doTestDoubleBytes('x', 'x');
    }

    /**
     * Run cleanUp() on every pair of a first byte 0xC0-0xFF and a second
     * byte 0x80-0xFF, wrapped in $head and $tail.
     *
     * A failing assertion throws, which ends the loops at the first
     * mismatch.
     *
     * @param string $head Text placed before the pair under test.
     * @param string $tail Text placed after the pair under test.
     */
    public function doTestDoubleBytes($head, $tail)
    {
        for ($first = 0xc0; $first < 0x100; $first++) {
            for ($second = 0x80; $second < 0x100; $second++) {
                $char = $head . chr($first) . chr($second) . $tail;
                $clean = UtfNormal::cleanUp($char);
                $x = sprintf("%02X,%02X", $first, $second);
                if ($first > 0xc1 && $first < 0xe0 && $second < 0xc0) {
                    // Head byte C2-DF followed by a tail byte 80-BF: a
                    // well-formed pair, so only normalization may apply.
                    $norm = UtfNormal::NFC($char);
                    $message = "Pair $x should be intact";
                } elseif ($first > 0xfd || $second > 0xbf) {
                    // fe and ff are not legal head bytes, and a second
                    // byte above bf is not a tail byte -- expect two
                    // replacement chars.
                    $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                    $message = "Forbidden pair $x should be rejected";
                } else {
                    // Remaining cases (first byte c0/c1 or e0-fd, followed
                    // by a tail byte): expect one replacement char.
                    $norm = $head . UTF8_REPLACEMENT . $tail;
                    $message = "Forbidden pair $x should be rejected";
                }
                $this->assertSameBytes($norm, $clean, $message);
            }
        }
    }

    /**
     * Check three-byte combinations on their own and with an ASCII "x"
     * before and/or after them.
     */
    public function testTripleBytes()
    {
        $this->doTestTripleBytes('', '');
        $this->doTestTripleBytes('x', '');
        $this->doTestTripleBytes('', 'x');
        $this->doTestTripleBytes('x', 'x');
    }

    /**
     * Run cleanUp() on three-byte combinations wrapped in $head and $tail.
     *
     * The first byte runs over 0xC0-0xFF and the second over 0x80-0xFF.
     * The third byte is currently pinned to 0x80; the full 0x80-0xFF loop
     * is left commented out below (presumably to keep the run time
     * manageable).
     *
     * @param string $head Text placed before the triplet under test.
     * @param string $tail Text placed after the triplet under test.
     */
    public function doTestTripleBytes($head, $tail)
    {
        $one = UTF8_REPLACEMENT;
        $two = UTF8_REPLACEMENT . UTF8_REPLACEMENT;
        $three = UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT;

        for ($first = 0xc0; $first < 0x100; $first++) {
            for ($second = 0x80; $second < 0x100; $second++) {
                #for( $third = 0x80; $third < 0x100; $third++ ) {
                for ($third = 0x80; $third < 0x81; $third++) {
                    $triplet = chr($first) . chr($second) . chr($third);
                    $char = $head . $triplet . $tail;
                    $clean = UtfNormal::cleanUp($char);
                    $x = sprintf("%02X,%02X,%02X", $first, $second, $third);

                    if ($first >= 0xe0 &&
                        $first < 0xf0 &&
                        $second < 0xc0 &&
                        $third < 0xc0) {
                        // Three-byte head followed by two tail bytes.
                        if ($first === 0xe0 && $second < 0xa0) {
                            $expect = $head . $one . $tail;
                            $message = "Overlong triplet $x should be rejected";
                        } elseif ($first === 0xed && $triplet >= UTF8_SURROGATE_FIRST) {
                            // Both operands are byte strings; the ed lead
                            // byte makes the left one non-numeric.
                            $expect = $head . $one . $tail;
                            $message = "Surrogate triplet $x should be rejected";
                        } else {
                            $expect = UtfNormal::NFC($char);
                            $message = "Triplet $x should be intact";
                        }
                    } elseif ($first > 0xc1 && $first < 0xe0 && $second < 0xc0) {
                        $expect = UtfNormal::NFC($head . chr($first) . chr($second)) . $one . $tail;
                        $message = "Valid 2-byte $x + broken tail";
                    } elseif ($second > 0xc1 && $second < 0xe0 && $third < 0xc0) {
                        $expect = $head . $one . UtfNormal::NFC(chr($second) . chr($third) . $tail);
                        $message = "Broken head + valid 2-byte $x";
                    } elseif (($first > 0xfd || $second > 0xfd) &&
                                (($second > 0xbf && $third > 0xbf) ||
                                  ($second < 0xc0 && $third < 0xc0) ||
                                  ($second > 0xfd) ||
                                  ($third > 0xfd))) {
                        # fe and ff are not legal head bytes -- expect three replacement chars
                        $expect = $head . $three . $tail;
                        $message = "Forbidden triplet $x should be rejected";
                    } elseif ($first > 0xc2 && $second < 0xc0 && $third < 0xc0) {
                        $expect = $head . $one . $tail;
                        $message = "Forbidden triplet $x should be rejected";
                    } else {
                        $expect = $head . $two . $tail;
                        $message = "Forbidden triplet $x should be rejected";
                    }

                    $this->assertSameBytes($expect, $clean, $message);
                }
            }
        }
    }

    /**
     * Fixed input/output pair kept as a regression check for a chunking
     * bug: each invalid byte must become one replacement character while
     * the valid pair DC 96 and the ASCII bytes are kept.
     */
    public function testChunkRegression()
    {
        # Check for regression against a chunking bug
        $text = "\x46\x55\xb8" .
                  "\xdc\x96" .
                  "\xee" .
                  "\xe7" .
                  "\x44" .
                  "\xaa" .
                  "\x2f\x25";
        $expect = "\x46\x55\xef\xbf\xbd" .
                  "\xdc\x96" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\x44" .
                  "\xef\xbf\xbd" .
                  "\x2f\x25";

        $this->assertSameBytes($expect, UtfNormal::cleanUp($text));
    }

    /**
     * Fixed input/output pair with stray tail and head bytes interposed
     * between ASCII characters; each stray byte must become one
     * replacement character and the final valid pair D9 95 must be kept.
     */
    public function testInterposeRegression()
    {
        $text = "\x4e\x30" .
                  "\xb1" .		# bad tail
                  "\x3a" .
                  "\x92" .		# bad tail
                  "\x62\x3a" .
                  "\x84" .		# bad tail
                  "\x43" .
                  "\xc6" .		# bad head
                  "\x3f" .
                  "\x92" .		# bad tail
                  "\xad" .		# bad tail
                  "\x7d" .
                  "\xd9\x95";

        $expect = "\x4e\x30" .
                  "\xef\xbf\xbd" .
                  "\x3a" .
                  "\xef\xbf\xbd" .
                  "\x62\x3a" .
                  "\xef\xbf\xbd" .
                  "\x43" .
                  "\xef\xbf\xbd" .
                  "\x3f" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\x7d" .
                  "\xd9\x95";

        $this->assertSameBytes($expect, UtfNormal::cleanUp($text));
    }

    /**
     * Fixed input/output pair mixing forbidden ASCII control bytes, stray
     * head/tail bytes and the overlong pair C1 A6. The overlong pair is
     * expected to yield a single replacement character.
     */
    public function testOverlongRegression()
    {
        $text = "\x67" .
                  "\x1a" . # forbidden ascii
                  "\xea" . # bad head
                  "\xc1\xa6" . # overlong sequence
                  "\xad" . # bad tail
                  "\x1c" . # forbidden ascii
                  "\xb0" . # bad tail
                  "\x3c" .
                  "\x9e";  # bad tail
        $expect = "\x67" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\x3c" .
                  "\xef\xbf\xbd";
        $this->assertSameBytes($expect, UtfNormal::cleanUp($text));
    }

    /**
     * Fixed input/output pair: an encoded surrogate followed by three
     * stray tail bytes must yield four replacement characters.
     */
    public function testSurrogateRegression()
    {
        $text = "\xed\xb4\x96" . # surrogate 0xDD16
                  "\x83" . # bad tail
                  "\xb4" . # bad tail
                  "\xac";  # bad tail (0xac is a continuation byte, not a head)
        $expect = "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd";
        $this->assertSameBytes($expect, UtfNormal::cleanUp($text));
    }

    /**
     * Fixed input/output pair: U+FFFE, a stray tail byte and a head byte
     * without its tail must each yield one replacement character.
     */
    public function testBomRegression()
    {
        $text = "\xef\xbf\xbe" . # U+FFFE, illegal char
                  "\xb2" . # bad tail
                  "\xef" . # bad head
                  "\x59";
        $expect = "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\x59";
        $this->assertSameBytes($expect, UtfNormal::cleanUp($text));
    }

    /**
     * U+FFFF must be replaced by a single replacement character.
     */
    public function testForbiddenRegression()
    {
        $text = "\xef\xbf\xbf"; # U+FFFF, illegal char
        $expect = "\xef\xbf\xbd";
        $this->assertSameBytes($expect, UtfNormal::cleanUp($text));
    }

    /**
     * A Hangul character followed by another final jamo must not be
     * altered by cleanUp().
     */
    public function testHangulRegression()
    {
        $text = "\xed\x9c\xaf" . # Hangul char
                "\xe1\x87\x81";  # followed by another final jamo
        $expect = $text;         # Should *not* change.
        $this->assertSameBytes($expect, UtfNormal::cleanUp($text));
    }
}
460
461
// Run every test* method of CleanUpTest through PHPUnit's text UI runner.
$suite = new PHPUnit_Framework_TestSuite('CleanUpTest');
$result = PHPUnit_TextUI_TestRunner::run($suite);

// Report the outcome through the process exit status: 0 on success, 1 on
// failure. The previous exit(-1) produced status 255, which the PHP manual
// reserves for PHP itself; any caller that treats non-zero as failure is
// unaffected.
exit($result->wasSuccessful() ? 0 : 1);
469