<?php
# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
# http://www.mediawiki.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html


# This script runs its own test suite at the bottom of the file, so it is
# only meaningful when started from the command line.
if (php_sapi_name() !== 'cli') {
    die("Run me from the command line please.\n");
}

/**
 * When '--icu' is given on the command line, load the php_utfnormal.so
 * extension before the tests run.
 * NOTE(review): the result of dl() is not checked, so a failed load does not
 * stop the run -- confirm that this best-effort behaviour is intended.
 */
if (isset($_SERVER['argv']) && in_array('--icu', $_SERVER['argv'], true)) {
    dl('php_utfnormal.so');
}

#ini_set( 'memory_limit', '40M' );

require_once 'PHPUnit/Framework.php';
require_once 'PHPUnit/TextUI/TestRunner.php';

require_once 'include/Unicode/UtfNormal.php';

/**
 * Additional tests for UtfNormal::cleanUp() function, inclusion
 * regression checks for known problems.
 * Requires PHPUnit.
 *
 * Most assertions compare bin2hex() of the expected and the actual string, so
 * a failure message shows readable hex instead of raw (possibly invalid)
 * bytes.
 *
 * @addtogroup UtfNormal
 * @private
 */
class CleanUpTest extends PHPUnit_Framework_TestCase
{
    /** No fixture is needed; every test builds its own input string. */
    public function setUp()
    {
    }

    /** Nothing to release; see setUp(). */
    public function tearDown()
    {
    }

    /** Plain ASCII text is expected to come back from cleanUp() unchanged. */
    public function testAscii()
    {
        $text = 'This is plain ASCII text.';
        $this->assertEquals($text, UtfNormal::cleanUp($text));
    }

    /** A NUL byte is expected to be replaced by the bytes EF BF BD (U+FFFD). */
    public function testNull()
    {
        $text = "a \x00 null";
        $expect = "a \xef\xbf\xbd null";
        $this->assertEquals(
            bin2hex($expect),
            bin2hex(UtfNormal::cleanUp($text))
        );
    }

    /** Text containing the two-byte sequence C3 A9 is expected to stay unchanged. */
    public function testLatin()
    {
        $text = "L'\xc3\xa9cole";
        $this->assertEquals($text, UtfNormal::cleanUp($text));
    }

    /**
     * 'e' followed by CC 81 (combining acute accent) is expected to be
     * turned into the single sequence C3 A9.
     */
    public function testLatinNormal()
    {
        $text = "L'e\xcc\x81cole";
        $expect = "L'\xc3\xa9cole";
        $this->assertEquals($expect, UtfNormal::cleanUp($text));
    }

    /**
     * This test is *very* expensive!
     *
     * Runs every code point from 0 up to and including UNICODE_MAX through
     * cleanUp(). The method name does not start with "test"; presumably that
     * is what keeps the runner from picking it up -- rename it to
     * testAllChars to enable it.
     *
     * Expectations per code point:
     *  - tab, LF, CR and everything above U+001F that is neither a surrogate
     *    nor U+FFFE/U+FFFF must equal UtfNormal::NFC() of the character when
     *    it has an entry in $utfCanonicalComp or $utfCanonicalDecomp, and
     *    must be byte-identical to the input otherwise;
     *  - every other code point must come back as UTF8_REPLACEMENT.
     */
    public function XtestAllChars()
    {
        $rep = UTF8_REPLACEMENT;
        global $utfCanonicalComp, $utfCanonicalDecomp;
        # The bound is inclusive: the range check below explicitly allows
        # $i == UNICODE_MAX, so the last code point has to be visited too.
        for ($i = 0x0; $i <= UNICODE_MAX; $i++) {
            $char = codepointToUtf8($i);
            $clean = UtfNormal::cleanUp($char);
            $x = sprintf("%04X", $i);
            if ($i % 0x1000 == 0) {
                # Progress marker every 0x1000 code points.
                echo "U+$x\n";
            }
            if ($i == 0x0009 ||
                $i == 0x000a ||
                $i == 0x000d ||
                ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
                ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe) ||
                ($i > 0xffff && $i <= UNICODE_MAX)) {
                if (isset($utfCanonicalComp[$char]) || isset($utfCanonicalDecomp[$char])) {
                    $comp = UtfNormal::NFC($char);
                    $this->assertEquals(
                        bin2hex($comp),
                        bin2hex($clean),
                        "U+$x should be decomposed"
                    );
                } else {
                    $this->assertEquals(
                        bin2hex($char),
                        bin2hex($clean),
                        "U+$x should be intact"
                    );
                }
            } else {
                $this->assertEquals(bin2hex($rep), bin2hex($clean), $x);
            }
        }
    }

    /**
     * Runs the single-byte check with no context, with a leading 'x', with a
     * trailing 'x', and with both.
     */
    public function testAllBytes()
    {
        $this->doTestBytes('', '');
        $this->doTestBytes('x', '');
        $this->doTestBytes('', 'x');
        $this->doTestBytes('x', 'x');
    }

    /**
     * Feeds every byte value 0x00..0xFF, wrapped in $head and $tail, through
     * cleanUp().
     *
     * Tab, LF, CR and the bytes 0x20..0x7F are expected to be kept; every
     * other byte is expected to be replaced by UTF8_REPLACEMENT.
     *
     * @param string $head Text placed in front of the byte under test.
     * @param string $tail Text placed after the byte under test.
     */
    public function doTestBytes($head, $tail)
    {
        for ($i = 0x0; $i < 256; $i++) {
            $char = $head . chr($i) . $tail;
            $clean = UtfNormal::cleanUp($char);
            $x = sprintf("%02X", $i);
            if ($i == 0x0009 ||
                $i == 0x000a ||
                $i == 0x000d ||
                ($i > 0x001f && $i < 0x80)) {
                $this->assertEquals(
                    bin2hex($char),
                    bin2hex($clean),
                    "ASCII byte $x should be intact"
                );
                # Stop at the first mismatch. Strict comparison, so that two
                # different numeric-looking strings never count as equal.
                if ($char !== $clean) {
                    return;
                }
            } else {
                $norm = $head . UTF8_REPLACEMENT . $tail;
                $this->assertEquals(
                    bin2hex($norm),
                    bin2hex($clean),
                    "Forbidden byte $x should be rejected"
                );
                if ($norm !== $clean) {
                    return;
                }
            }
        }
    }

    /**
     * Runs the two-byte check with no context, with a leading 'x', with a
     * trailing 'x', and with both.
     */
    public function testDoubleBytes()
    {
        $this->doTestDoubleBytes('', '');
        $this->doTestDoubleBytes('x', '');
        $this->doTestDoubleBytes('', 'x');
        $this->doTestDoubleBytes('x', 'x');
    }

    /**
     * Feeds every pair of a first byte 0xC0..0xFF and a second byte
     * 0x80..0xFF, wrapped in $head and $tail, through cleanUp().
     *
     * Expectations:
     *  - first byte 0xC2..0xDF with second byte 0x80..0xBF: the result must
     *    equal UtfNormal::NFC() of the input;
     *  - first byte 0xFE/0xFF, or second byte above 0xBF: two replacement
     *    characters;
     *  - anything else: one replacement character.
     *
     * @param string $head Text placed in front of the pair under test.
     * @param string $tail Text placed after the pair under test.
     */
    public function doTestDoubleBytes($head, $tail)
    {
        for ($first = 0xc0; $first < 0x100; $first++) {
            for ($second = 0x80; $second < 0x100; $second++) {
                $char = $head . chr($first) . chr($second) . $tail;
                $clean = UtfNormal::cleanUp($char);
                $x = sprintf("%02X,%02X", $first, $second);
                if ($first > 0xc1 &&
                    $first < 0xe0 &&
                    $second < 0xc0) {
                    $norm = UtfNormal::NFC($char);
                    $this->assertEquals(
                        bin2hex($norm),
                        bin2hex($clean),
                        "Pair $x should be intact"
                    );
                    # Stop at the first mismatch (strict byte-wise comparison).
                    if ($norm !== $clean) {
                        return;
                    }
                } elseif ($first > 0xfd || $second > 0xbf) {
                    # fe and ff are not legal head bytes -- expect two replacement chars
                    $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                    $this->assertEquals(
                        bin2hex($norm),
                        bin2hex($clean),
                        "Forbidden pair $x should be rejected"
                    );
                    if ($norm !== $clean) {
                        return;
                    }
                } else {
                    $norm = $head . UTF8_REPLACEMENT . $tail;
                    $this->assertEquals(
                        bin2hex($norm),
                        bin2hex($clean),
                        "Forbidden pair $x should be rejected"
                    );
                    if ($norm !== $clean) {
                        return;
                    }
                }
            }
        }
    }

    /**
     * Runs the three-byte check with no context, with a leading 'x', with a
     * trailing 'x', and with both.
     */
    public function testTripleBytes()
    {
        $this->doTestTripleBytes('', '');
        $this->doTestTripleBytes('x', '');
        $this->doTestTripleBytes('', 'x');
        $this->doTestTripleBytes('x', 'x');
    }

    /**
     * Feeds three-byte combinations, wrapped in $head and $tail, through
     * cleanUp(): first byte 0xC0..0xFF, second byte 0x80..0xFF, and the
     * third byte fixed at 0x80 (the full third-byte loop is commented out
     * below).
     *
     * Expectations, checked in this order:
     *  - first byte 0xE0..0xEF with second and third byte below 0xC0: one
     *    replacement character for an overlong form (E0 with second byte
     *    below 0xA0) or for ED sequences at or above UTF8_SURROGATE_FIRST;
     *    otherwise the result must equal UtfNormal::NFC() of the input;
     *  - a valid two-byte sequence in front: its NFC form followed by one
     *    replacement character;
     *  - a valid two-byte sequence at the end: one replacement character
     *    followed by the NFC form of the rest;
     *  - the remaining combinations: three, one or two replacement
     *    characters, as spelled out by the conditions below.
     *
     * @param string $head Text placed in front of the triplet under test.
     * @param string $tail Text placed after the triplet under test.
     */
    public function doTestTripleBytes($head, $tail)
    {
        for ($first = 0xc0; $first < 0x100; $first++) {
            for ($second = 0x80; $second < 0x100; $second++) {
                #for( $third = 0x80; $third < 0x100; $third++ ) {
                for ($third = 0x80; $third < 0x81; $third++) {
                    $char = $head . chr($first) . chr($second) . chr($third) . $tail;
                    $clean = UtfNormal::cleanUp($char);
                    $x = sprintf("%02X,%02X,%02X", $first, $second, $third);
                    if ($first >= 0xe0 &&
                        $first < 0xf0 &&
                        $second < 0xc0 &&
                        $third < 0xc0) {
                        if ($first == 0xe0 && $second < 0xa0) {
                            $this->assertEquals(
                                bin2hex($head . UTF8_REPLACEMENT . $tail),
                                bin2hex($clean),
                                "Overlong triplet $x should be rejected"
                            );
                        } elseif ($first == 0xed &&
                            (chr($first) . chr($second) . chr($third)) >= UTF8_SURROGATE_FIRST) {
                            $this->assertEquals(
                                bin2hex($head . UTF8_REPLACEMENT . $tail),
                                bin2hex($clean),
                                "Surrogate triplet $x should be rejected"
                            );
                        } else {
                            $this->assertEquals(
                                bin2hex(UtfNormal::NFC($char)),
                                bin2hex($clean),
                                "Triplet $x should be intact"
                            );
                        }
                    } elseif ($first > 0xc1 && $first < 0xe0 && $second < 0xc0) {
                        $this->assertEquals(
                            bin2hex(UtfNormal::NFC($head . chr($first) . chr($second)) . UTF8_REPLACEMENT . $tail),
                            bin2hex($clean),
                            "Valid 2-byte $x + broken tail"
                        );
                    } elseif ($second > 0xc1 && $second < 0xe0 && $third < 0xc0) {
                        $this->assertEquals(
                            bin2hex($head . UTF8_REPLACEMENT . UtfNormal::NFC(chr($second) . chr($third) . $tail)),
                            bin2hex($clean),
                            "Broken head + valid 2-byte $x"
                        );
                    } elseif (($first > 0xfd || $second > 0xfd) &&
                        (($second > 0xbf && $third > 0xbf) ||
                            ($second < 0xc0 && $third < 0xc0) ||
                            ($second > 0xfd) ||
                            ($third > 0xfd))) {
                        # fe and ff are not legal head bytes -- expect three replacement chars
                        $this->assertEquals(
                            bin2hex($head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail),
                            bin2hex($clean),
                            "Forbidden triplet $x should be rejected"
                        );
                    } elseif ($first > 0xc2 && $second < 0xc0 && $third < 0xc0) {
                        $this->assertEquals(
                            bin2hex($head . UTF8_REPLACEMENT . $tail),
                            bin2hex($clean),
                            "Forbidden triplet $x should be rejected"
                        );
                    } else {
                        $this->assertEquals(
                            bin2hex($head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail),
                            bin2hex($clean),
                            "Forbidden triplet $x should be rejected"
                        );
                    }
                }
            }
        }
    }

    /**
     * Regression check for a chunking bug: the stray bytes B8, EE, E7 and AA
     * are each expected to become one replacement character, while the
     * two-byte sequence DC 96 and the ASCII bytes stay as they are.
     */
    public function testChunkRegression()
    {
        # Check for regression against a chunking bug
        $text = "\x46\x55\xb8" .
            "\xdc\x96" .
            "\xee" .
            "\xe7" .
            "\x44" .
            "\xaa" .
            "\x2f\x25";
        $expect = "\x46\x55\xef\xbf\xbd" .
            "\xdc\x96" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x44" .
            "\xef\xbf\xbd" .
            "\x2f\x25";

        $this->assertEquals(
            bin2hex($expect),
            bin2hex(UtfNormal::cleanUp($text))
        );
    }

    /**
     * Regression check with bad bytes scattered between ASCII characters:
     * each bad byte is expected to become one replacement character, and the
     * final two-byte sequence D9 95 is expected to survive.
     */
    public function testInterposeRegression()
    {
        $text = "\x4e\x30" .
            "\xb1" . # bad tail
            "\x3a" .
            "\x92" . # bad tail
            "\x62\x3a" .
            "\x84" . # bad tail
            "\x43" .
            "\xc6" . # bad head
            "\x3f" .
            "\x92" . # bad tail
            "\xad" . # bad tail
            "\x7d" .
            "\xd9\x95";

        $expect = "\x4e\x30" .
            "\xef\xbf\xbd" .
            "\x3a" .
            "\xef\xbf\xbd" .
            "\x62\x3a" .
            "\xef\xbf\xbd" .
            "\x43" .
            "\xef\xbf\xbd" .
            "\x3f" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x7d" .
            "\xd9\x95";

        $this->assertEquals(
            bin2hex($expect),
            bin2hex(UtfNormal::cleanUp($text))
        );
    }

    /**
     * Regression check around an overlong sequence. The seven bad bytes
     * between 0x67 and 0x3C are expected to yield six replacement
     * characters, i.e. the overlong pair C1 A6 counts as a single one.
     */
    public function testOverlongRegression()
    {
        $text = "\x67" .
            "\x1a" . # forbidden ascii
            "\xea" . # bad head
            "\xc1\xa6" . # overlong sequence
            "\xad" . # bad tail
            "\x1c" . # forbidden ascii
            "\xb0" . # bad tail
            "\x3c" .
            "\x9e"; # bad tail
        $expect = "\x67" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x3c" .
            "\xef\xbf\xbd";
        $this->assertEquals(
            bin2hex($expect),
            bin2hex(UtfNormal::cleanUp($text))
        );
    }

    /**
     * Regression check around an encoded surrogate: the three-byte surrogate
     * is expected to become one replacement character, and each of the three
     * stray bytes after it one more.
     */
    public function testSurrogateRegression()
    {
        $text = "\xed\xb4\x96" . # surrogate 0xDD16
            "\x83" . # bad tail
            "\xb4" . # bad tail
            "\xac"; # bad tail (0xAC is a continuation byte, not a head byte)
        $expect = "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd";
        $this->assertEquals(
            bin2hex($expect),
            bin2hex(UtfNormal::cleanUp($text))
        );
    }

    /**
     * Regression check around U+FFFE: the illegal character and the two bad
     * bytes after it are each expected to become one replacement character;
     * the trailing ASCII byte is kept.
     */
    public function testBomRegression()
    {
        $text = "\xef\xbf\xbe" . # U+FFFE, illegal char
            "\xb2" . # bad tail
            "\xef" . # bad head
            "\x59";
        $expect = "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\xef\xbf\xbd" .
            "\x59";
        $this->assertEquals(
            bin2hex($expect),
            bin2hex(UtfNormal::cleanUp($text))
        );
    }

    /** U+FFFF on its own is expected to be replaced by one replacement character. */
    public function testForbiddenRegression()
    {
        $text = "\xef\xbf\xbf"; # U+FFFF, illegal char
        $expect = "\xef\xbf\xbd";
        $this->assertEquals(
            bin2hex($expect),
            bin2hex(UtfNormal::cleanUp($text))
        );
    }

    /**
     * A Hangul character followed by another final jamo is expected to pass
     * through cleanUp() without any change.
     */
    public function testHangulRegression()
    {
        $text = "\xed\x9c\xaf" . # Hangul char
            "\xe1\x87\x81"; # followed by another final jamo
        $expect = $text; # Should *not* change.
        $this->assertEquals(
            bin2hex($expect),
            bin2hex(UtfNormal::cleanUp($text))
        );
    }
}


$suite = new PHPUnit_Framework_TestSuite('CleanUpTest');
$result = PHPUnit_TextUI_TestRunner::run($suite);

if (!$result->wasSuccessful()) {
    # Exit statuses should stay within 0..254; -1 would surface as 255,
    # which PHP reserves for itself.
    exit(1);
}
exit(0);