<?php
# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
# http://www.mediawiki.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html

/**
 * Unicode normalization for UTF-8 strings: global table placeholders,
 * code point / byte-sequence constants, and the UtfNormal class.
 */
require_once dirname(__FILE__) . '/UtfNormalUtil.php';

# Lookup tables; left null here and filled in on demand
# (see UtfNormal::loadData()).
global $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp;
$utfCombiningClass = null;
$utfCanonicalComp = null;
$utfCanonicalDecomp = null;

# Load compatibility decompositions on demand if they are needed.
global $utfCompatibilityDecomp;
$utfCompatibilityDecomp = null;

# Code point ranges and counts used for algorithmic Hangul handling.
define('UNICODE_HANGUL_FIRST', 0xac00);
define('UNICODE_HANGUL_LAST', 0xd7a3);

define('UNICODE_HANGUL_LBASE', 0x1100);
define('UNICODE_HANGUL_VBASE', 0x1161);
define('UNICODE_HANGUL_TBASE', 0x11a7);

define('UNICODE_HANGUL_LCOUNT', 19);
define('UNICODE_HANGUL_VCOUNT', 21);
define('UNICODE_HANGUL_TCOUNT', 28);
define('UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT);

define('UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1);
define('UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1);
define('UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1);

define('UNICODE_SURROGATE_FIRST', 0xd800);
define('UNICODE_SURROGATE_LAST', 0xdfff);
define('UNICODE_MAX', 0x10ffff);
define('UNICODE_REPLACEMENT', 0xfffd);


# The same boundaries as hardcoded UTF-8 byte sequences, so that they can
# be compared directly against sequences cut out of a string.
define('UTF8_HANGUL_FIRST', "\xea\xb0\x80" /*codepointToUtf8( UNICODE_HANGUL_FIRST )*/);
define('UTF8_HANGUL_LAST', "\xed\x9e\xa3" /*codepointToUtf8( UNICODE_HANGUL_LAST )*/);

define('UTF8_HANGUL_LBASE', "\xe1\x84\x80" /*codepointToUtf8( UNICODE_HANGUL_LBASE )*/);
define('UTF8_HANGUL_VBASE', "\xe1\x85\xa1" /*codepointToUtf8( UNICODE_HANGUL_VBASE )*/);
define('UTF8_HANGUL_TBASE', "\xe1\x86\xa7" /*codepointToUtf8( UNICODE_HANGUL_TBASE )*/);

define('UTF8_HANGUL_LEND', "\xe1\x84\x92" /*codepointToUtf8( UNICODE_HANGUL_LEND )*/);
define('UTF8_HANGUL_VEND', "\xe1\x85\xb5" /*codepointToUtf8( UNICODE_HANGUL_VEND )*/);
define('UTF8_HANGUL_TEND', "\xe1\x87\x82" /*codepointToUtf8( UNICODE_HANGUL_TEND )*/);

define('UTF8_SURROGATE_FIRST', "\xed\xa0\x80" /*codepointToUtf8( UNICODE_SURROGATE_FIRST )*/);
define('UTF8_SURROGATE_LAST', "\xed\xbf\xbf" /*codepointToUtf8( UNICODE_SURROGATE_LAST )*/);
define('UTF8_MAX', "\xf4\x8f\xbf\xbf" /*codepointToUtf8( UNICODE_MAX )*/);
# May already have been defined by the including code; keep that value if so.
if (!defined('UTF8_REPLACEMENT')) {
    define('UTF8_REPLACEMENT', "\xef\xbf\xbd" /*codepointToUtf8( UNICODE_REPLACEMENT )*/);
}
#define( 'UTF8_REPLACEMENT', '!' );

# Largest byte sequences of each length that are still "overlong" encodings.
define('UTF8_OVERLONG_A', "\xc1\xbf");
define('UTF8_OVERLONG_B', "\xe0\x9f\xbf");
define('UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf");

# These two ranges are illegal
define('UTF8_FDD0', "\xef\xb7\x90" /*codepointToUtf8( 0xfdd0 )*/);
define('UTF8_FDEF', "\xef\xb7\xaf" /*codepointToUtf8( 0xfdef )*/);
define('UTF8_FFFE', "\xef\xbf\xbe" /*codepointToUtf8( 0xfffe )*/);
define('UTF8_FFFF', "\xef\xbf\xbf" /*codepointToUtf8( 0xffff )*/);

define('UTF8_HEAD', false);
define('UTF8_TAIL', true);


/**
 * For using the ICU wrapper
 */
define('UNORM_NONE', 1);
define('UNORM_NFD', 2);
define('UNORM_NFKD', 3);
define('UNORM_NFC', 4);
define('UNORM_DEFAULT', UNORM_NFC);
define('UNORM_NFKC', 5);
define('UNORM_FCD', 6);

# True when a utf8_normalize() function is available; the public
# entry points below then delegate to it instead of the PHP code.
define('NORMALIZE_ICU', function_exists('utf8_normalize'));

/**
 * Unicode normalization routines for working with UTF-8 strings.
 * Currently assumes that input strings are valid UTF-8!
 *
 * Not as fast as I'd like, but should be usable for most purposes.
 * UtfNormal::toNFC() will bail early if given ASCII text or text
 * it can quickly determine is already normalized.
 *
 * All functions can be called static.
 *
 * See description of forms at http://www.unicode.org/reports/tr15/
 *
 * @addtogroup UtfNormal
 */
class UtfNormal
{
    /**
     * The ultimate convenience function! Clean up invalid UTF-8 sequences,
     * and convert to normal form C, canonical composition.
     *
     * Fast return for pure ASCII strings; some lesser optimizations for
     * strings containing only known-good characters. Not as fast as toNFC().
     *
     * @param string $string a UTF-8 string
     * @return string a clean, shiny, normalized UTF-8 string
     * @static
     */
    public static function cleanUp($string)
    {
        if (NORMALIZE_ICU) {
            # We exclude a few chars that ICU would not.
            $string = preg_replace(
                '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
                UTF8_REPLACEMENT,
                $string
            );
            $string = str_replace(UTF8_FFFE, UTF8_REPLACEMENT, $string);
            $string = str_replace(UTF8_FFFF, UTF8_REPLACEMENT, $string);

            # UnicodeString constructor fails if the string ends with a
            # head byte. Add a junk char at the end, we'll strip it off.
            # (Any \x01 in the input was already replaced above, so rtrim()
            # only ever removes the byte appended here.)
            return rtrim(utf8_normalize($string . "\x01", UNORM_NFC), "\x01");
        } elseif (self::quickIsNFCVerify($string)) {
            # Side effect -- $string has had UTF-8 errors cleaned up.
            return $string;
        } else {
            return self::NFC($string);
        }
    }

    /**
     * Convert a UTF-8 string to normal form C, canonical composition.
     * Fast return for pure ASCII strings; some lesser optimizations for
     * strings containing only known-good characters.
     *
     * @param string $string a valid UTF-8 string. Input is not validated.
     * @return string a UTF-8 string in normal form C
     * @static
     */
    public static function toNFC($string)
    {
        if (NORMALIZE_ICU) {
            return utf8_normalize($string, UNORM_NFC);
        } elseif (self::quickIsNFC($string)) {
            return $string;
        } else {
            return self::NFC($string);
        }
    }

    /**
     * Convert a UTF-8 string to normal form D, canonical decomposition.
     * Fast return for pure ASCII strings.
     *
     * @param string $string a valid UTF-8 string. Input is not validated.
     * @return string a UTF-8 string in normal form D
     * @static
     */
    public static function toNFD($string)
    {
        if (NORMALIZE_ICU) {
            return utf8_normalize($string, UNORM_NFD);
        } elseif (preg_match('/[\x80-\xff]/', $string)) {
            return self::NFD($string);
        } else {
            return $string;
        }
    }

    /**
     * Convert a UTF-8 string to normal form KC, compatibility composition.
     * This may cause irreversible information loss, use judiciously.
     * Fast return for pure ASCII strings.
     *
     * @param string $string a valid UTF-8 string. Input is not validated.
     * @return string a UTF-8 string in normal form KC
     * @static
     */
    public static function toNFKC($string)
    {
        if (NORMALIZE_ICU) {
            return utf8_normalize($string, UNORM_NFKC);
        } elseif (preg_match('/[\x80-\xff]/', $string)) {
            return self::NFKC($string);
        } else {
            return $string;
        }
    }

    /**
     * Convert a UTF-8 string to normal form KD, compatibility decomposition.
     * This may cause irreversible information loss, use judiciously.
     * Fast return for pure ASCII strings.
     *
     * @param string $string a valid UTF-8 string. Input is not validated.
     * @return string a UTF-8 string in normal form KD
     * @static
     */
    public static function toNFKD($string)
    {
        if (NORMALIZE_ICU) {
            return utf8_normalize($string, UNORM_NFKD);
        } elseif (preg_match('/[\x80-\xff]/', $string)) {
            return self::NFKD($string);
        } else {
            return $string;
        }
    }

    /**
     * Load the basic composition data if necessary
     * @private
     * @static
     */
    public static function loadData()
    {
        global $utfCombiningClass;
        if (!isset($utfCombiningClass)) {
            require_once(dirname(__FILE__) . '/UtfNormalData.inc');
        }
    }

    /**
     * Returns true if the string is _definitely_ in NFC.
     * Returns false if not or uncertain.
     * @param string $string a valid UTF-8 string. Input is not validated.
     * @return bool
     * @static
     */
    public static function quickIsNFC($string)
    {
        # ASCII is always valid NFC!
        # If it's pure ASCII, let it through.
        if (!preg_match('/[\x80-\xff]/', $string)) {
            return true;
        }

        self::loadData();
        global $utfCheckNFC, $utfCombiningClass;
        $len = strlen($string);
        for ($i = 0; $i < $len; $i++) {
            $c = $string[$i];
            $n = ord($c);
            if ($n < 0x80) {
                continue;
            } elseif ($n >= 0xf0) {
                # Head byte of a four-byte sequence
                $c = substr($string, $i, 4);
                $i += 3;
            } elseif ($n >= 0xe0) {
                # Head byte of a three-byte sequence
                $c = substr($string, $i, 3);
                $i += 2;
            } elseif ($n >= 0xc0) {
                # Head byte of a two-byte sequence
                $c = substr($string, $i, 2);
                $i++;
            }
            if (isset($utfCheckNFC[$c])) {
                # If it's NO or MAYBE, bail and do the slow check.
                return false;
            }
            if (isset($utfCombiningClass[$c])) {
                # Combining character? We might have to do sorting, at least.
                return false;
            }
        }
        return true;
    }

    /**
     * Returns true if the string is _definitely_ in NFC.
     * Returns false if not or uncertain.
     *
     * Unlike quickIsNFC() this also validates the input: control characters
     * and invalid or forbidden UTF-8 sequences are replaced in $string
     * (by reference) with UTF8_REPLACEMENT.
     *
     * @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
     * @return bool
     * @static
     */
    public static function quickIsNFCVerify(&$string)
    {
        # Screen out some characters that eg won't be allowed in XML
        $string = preg_replace('/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string);

        # ASCII is always valid NFC!
        # If we're only ever given plain ASCII, we can avoid the overhead
        # of initializing the decomposition tables by skipping out early.
        if (!preg_match('/[\x80-\xff]/', $string)) {
            return true;
        }

        # Built once per request and reused across calls.
        static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
        if (!isset($checkit)) {
            # Load/build some scary lookup tables...
            self::loadData();
            global $utfCheckNFC, $utfCombiningClass;

            $utfCheckOrCombining = array_merge($utfCheckNFC, $utfCombiningClass);

            # Head bytes for sequences which we should do further validity checks
            $checkit = array_flip(array_map(
                'chr',
                array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
                       0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
                       0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff )
            ));

            # Each UTF-8 head byte is followed by a certain
            # number of tail bytes.
            $tailBytes = array();
            for ($n = 0; $n < 256; $n++) {
                if ($n < 0xc0) {
                    $remaining = 0;
                } elseif ($n < 0xe0) {
                    $remaining = 1;
                } elseif ($n < 0xf0) {
                    $remaining = 2;
                } elseif ($n < 0xf8) {
                    $remaining = 3;
                } elseif ($n < 0xfc) {
                    $remaining = 4;
                } elseif ($n < 0xfe) {
                    $remaining = 5;
                } else {
                    $remaining = 0;
                }
                $tailBytes[chr($n)] = $remaining;
            }
        }

        # Chop the text into pure-ASCII and non-ASCII areas;
        # large ASCII parts can be handled much more quickly.
        # Don't chop up Unicode areas for punctuation, though,
        # that wastes energy.
        $matches = array();
        preg_match_all(
            '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
            $string,
            $matches
        );

        $looksNormal = true;
        # Byte offset of the current chunk within $string
        $base = 0;
        # List of array( replacement text, start offset, length ) fixups
        $replace = array();
        foreach ($matches[1] as $str) {
            $chunk = strlen($str);

            if ($str[0] < "\x80") {
                # ASCII chunk: guaranteed to be valid UTF-8
                # and in normal form C, so skip over it.
                $base += $chunk;
                continue;
            }

            # We'll have to examine the chunk byte by byte to ensure
            # that it consists of valid UTF-8 sequences, and to see
            # if any of them might not be normalized.
            #
            # Since PHP is not the fastest language on earth, some of
            # this code is a little ugly with inner loop optimizations.

            $head = '';
            $len = $chunk + 1; # Counting down is faster. I'm *so* sorry.

            for ($i = -1; --$len;) {
                if ($remaining = $tailBytes[$c = $str[++$i]]) {
                    # UTF-8 head byte!
                    $sequence = $head = $c;
                    do {
                        # Look for the defined number of tail bytes...
                        if (--$len && ($c = $str[++$i]) >= "\x80" && $c < "\xc0") {
                            # Legal tail bytes are nice.
                            $sequence .= $c;
                        } else {
                            if (0 == $len) {
                                # Premature end of string!
                                # Drop a replacement character into output to
                                # represent the invalid UTF-8 sequence.
                                $replace[] = array( UTF8_REPLACEMENT,
                                    $base + $i + 1 - strlen($sequence),
                                    strlen($sequence) );
                                break 2;
                            } else {
                                # Illegal tail byte; abandon the sequence.
                                $replace[] = array( UTF8_REPLACEMENT,
                                    $base + $i - strlen($sequence),
                                    strlen($sequence) );
                                # Back up and reprocess this byte; it may itself
                                # be a legal ASCII or UTF-8 sequence head.
                                --$i;
                                ++$len;
                                continue 2;
                            }
                        }
                    } while (--$remaining);

                    if (isset($checkit[$head])) {
                        # Do some more detailed validity checks, for
                        # invalid characters and illegal sequences.
                        if ($head == "\xed") {
                            # 0xed is relatively frequent in Korean, which
                            # abuts the surrogate area, so we're doing
                            # this check separately to speed things up.

                            if ($sequence >= UTF8_SURROGATE_FIRST) {
                                # Surrogates are legal only in UTF-16 code.
                                # They are totally forbidden here in UTF-8
                                # utopia.
                                $replace[] = array( UTF8_REPLACEMENT,
                                    $base + $i + 1 - strlen($sequence),
                                    strlen($sequence) );
                                $head = '';
                                continue;
                            }
                        } else {
                            # Slower, but rarer checks...
                            $n = ord($head);
                            if (
                                # "Overlong sequences" are those that are syntactically
                                # correct but use more UTF-8 bytes than are necessary to
                                # encode a character. Naïve string comparisons can be
                                # tricked into failing to see a match for an ASCII
                                # character, for instance, which can be a security hole
                                # if blacklist checks are being used.
                                ($n < 0xc2 && $sequence <= UTF8_OVERLONG_A)
                                || ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
                                || ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)

                                # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
                                # (Grouped explicitly; both sequences start with 0xef,
                                # so this matches exactly the same inputs as before.)
                                || ($n == 0xef &&
                                    ($sequence == UTF8_FFFE || $sequence == UTF8_FFFF))

                                # Unicode has been limited to 21 bits; longer
                                # sequences are not allowed.
                                || ($n >= 0xf0 && $sequence > UTF8_MAX)) {
                                $replace[] = array( UTF8_REPLACEMENT,
                                    $base + $i + 1 - strlen($sequence),
                                    strlen($sequence) );
                                $head = '';
                                continue;
                            }
                        }
                    }

                    if (isset($utfCheckOrCombining[$sequence])) {
                        # If it's NO or MAYBE, we'll have to rip
                        # the string apart and put it back together.
                        # That's going to be mighty slow.
                        $looksNormal = false;
                    }

                    # The sequence is legal!
                    $head = '';
                } elseif ($c < "\x80") {
                    # ASCII byte.
                    $head = '';
                } elseif ($c < "\xc0") {
                    # Illegal tail bytes
                    if ($head == '') {
                        # Out of the blue!
                        $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                    } else {
                        # Don't add if we're continuing a broken sequence;
                        # we already put a replacement character when we looked
                        # at the broken sequence.
                        $replace[] = array( '', $base + $i, 1 );
                    }
                } else {
                    # Miscellaneous freaks.
                    $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
                    $head = '';
                }
            }
            $base += $chunk;
        }
        if (count($replace)) {
            # There were illegal UTF-8 sequences we need to fix up.
            $out = '';
            $last = 0;
            foreach ($replace as $rep) {
                list($replacement, $start, $length) = $rep;
                if ($last < $start) {
                    # Copy the untouched text between two fixups
                    $out .= substr($string, $last, $start - $last);
                }
                $out .= $replacement;
                $last = $start + $length;
            }
            if ($last < strlen($string)) {
                $out .= substr($string, $last);
            }
            $string = $out;
        }
        return $looksNormal;
    }

    # These take a string and run the normalization on them, without
    # checking for validity or any optimization etc. Input must be
    # VALID UTF-8!
    /**
     * Canonical decomposition followed by canonical composition.
     *
     * @param string $string
     * @return string
     * @private
     * @static
     */
    public static function NFC($string)
    {
        return self::fastCompose(self::NFD($string));
    }

    /**
     * Canonical decomposition followed by sorting of combining characters.
     *
     * @param string $string
     * @return string
     * @private
     * @static
     */
    public static function NFD($string)
    {
        self::loadData();
        global $utfCanonicalDecomp;
        return self::fastCombiningSort(
            self::fastDecompose($string, $utfCanonicalDecomp)
        );
    }

    /**
     * Compatibility decomposition followed by canonical composition.
     *
     * @param string $string
     * @return string
     * @private
     * @static
     */
    public static function NFKC($string)
    {
        return self::fastCompose(self::NFKD($string));
    }

    /**
     * Compatibility decomposition followed by sorting of combining characters.
     * Loads the compatibility decomposition table on first use.
     *
     * @param string $string
     * @return string
     * @private
     * @static
     */
    public static function NFKD($string)
    {
        global $utfCompatibilityDecomp;
        if (!isset($utfCompatibilityDecomp)) {
            # Resolve relative to this file, like loadData() does, rather
            # than depending on the include_path / working directory.
            require_once(dirname(__FILE__) . '/UtfNormalDataK.inc');
        }
        return self::fastCombiningSort(
            self::fastDecompose($string, $utfCompatibilityDecomp)
        );
    }


    /**
     * Perform decomposition of a UTF-8 string into either D or KD form
     * (depending on which decomposition map is passed to us).
     * Input is assumed to be *valid* UTF-8. Invalid code will break.
     * @private
     * @param string $string Valid UTF-8 string
     * @param array $map hash of expanded decomposition map
     * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
     * @static
     */
    public static function fastDecompose($string, $map)
    {
        self::loadData();
        $len = strlen($string);
        $out = '';
        for ($i = 0; $i < $len; $i++) {
            $c = $string[$i];
            $n = ord($c);
            if ($n < 0x80) {
                # ASCII chars never decompose
                # THEY ARE IMMORTAL
                $out .= $c;
                continue;
            } elseif ($n >= 0xf0) {
                $c = substr($string, $i, 4);
                $i += 3;
            } elseif ($n >= 0xe0) {
                $c = substr($string, $i, 3);
                $i += 2;
            } elseif ($n >= 0xc0) {
                $c = substr($string, $i, 2);
                $i++;
            }
            if (isset($map[$c])) {
                $out .= $map[$c];
                continue;
            }
            if ($c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST) {
                # Decompose a hangul syllable into jamo;
                # hardcoded for three-byte UTF-8 sequence.
                # A lookup table would be slightly faster,
                # but adds a lot of memory & disk needs.
                #
                $index = ((ord($c[0]) & 0x0f) << 12
                        | (ord($c[1]) & 0x3f) << 6
                        | (ord($c[2]) & 0x3f))
                    - UNICODE_HANGUL_FIRST;
                $l = intval($index / UNICODE_HANGUL_NCOUNT);
                $v = intval(($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
                $t = $index % UNICODE_HANGUL_TCOUNT;
                $out .= "\xe1\x84" . chr(0x80 + $l) . "\xe1\x85" . chr(0xa1 + $v);
                if ($t >= 25) {
                    # U+11C0 and up no longer fit under the \xe1\x86 prefix
                    $out .= "\xe1\x87" . chr(0x80 + $t - 25);
                } elseif ($t) {
                    $out .= "\xe1\x86" . chr(0xa7 + $t);
                }
                continue;
            }
            $out .= $c;
        }
        return $out;
    }

    /**
     * Sorts combining characters into canonical order. This is the
     * final step in creating decomposed normal forms D and KD.
     * @private
     * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
     * @return string a UTF-8 string with combining characters sorted in canonical order
     * @static
     */
    public static function fastCombiningSort($string)
    {
        self::loadData();
        global $utfCombiningClass;
        $len = strlen($string);
        $out = '';
        # Pending combining characters, grouped by combining class
        $combiners = array();
        $lastClass = -1;
        for ($i = 0; $i < $len; $i++) {
            $c = $string[$i];
            $n = ord($c);
            if ($n >= 0x80) {
                if ($n >= 0xf0) {
                    $c = substr($string, $i, 4);
                    $i += 3;
                } elseif ($n >= 0xe0) {
                    $c = substr($string, $i, 3);
                    $i += 2;
                } elseif ($n >= 0xc0) {
                    $c = substr($string, $i, 2);
                    $i++;
                }
                if (isset($utfCombiningClass[$c])) {
                    $lastClass = $utfCombiningClass[$c];
                    if (isset($combiners[$lastClass])) {
                        $combiners[$lastClass] .= $c;
                    } else {
                        $combiners[$lastClass] = $c;
                    }
                    continue;
                }
            }
            if ($lastClass) {
                # A non-combining character ends the run: flush what
                # has been collected, ordered by combining class.
                ksort($combiners);
                $out .= implode('', $combiners);
                $combiners = array();
            }
            $out .= $c;
            $lastClass = 0;
        }
        if ($lastClass) {
            ksort($combiners);
            $out .= implode('', $combiners);
        }
        return $out;
    }

    /**
     * Produces canonically composed sequences, i.e. normal form C or KC.
     *
     * @private
     * @param string $string a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
     * @return string a UTF-8 string with canonical precomposed characters used where possible
     * @static
     */
    public static function fastCompose($string)
    {
        self::loadData();
        global $utfCanonicalComp, $utfCombiningClass;
        $len = strlen($string);
        $out = '';
        $lastClass = -1;
        $lastHangul = 0;
        $startChar = '';
        $combining = '';
        # First bytes bounding the conjoining jamo range, for a cheap pre-check
        $x1 = ord(substr(UTF8_HANGUL_VBASE, 0, 1));
        $x2 = ord(substr(UTF8_HANGUL_TEND, 0, 1));
        for ($i = 0; $i < $len; $i++) {
            $c = $string[$i];
            $n = ord($c);
            if ($n < 0x80) {
                # No combining characters here...
                $out .= $startChar;
                $out .= $combining;
                $startChar = $c;
                $combining = '';
                $lastClass = 0;
                continue;
            } elseif ($n >= 0xf0) {
                $c = substr($string, $i, 4);
                $i += 3;
            } elseif ($n >= 0xe0) {
                $c = substr($string, $i, 3);
                $i += 2;
            } elseif ($n >= 0xc0) {
                $c = substr($string, $i, 2);
                $i++;
            }
            $pair = $startChar . $c;
            if ($n > 0x80) {
                if (isset($utfCombiningClass[$c])) {
                    # A combining char; see what we can do with it
                    $class = $utfCombiningClass[$c];
                    # Compare against '' explicitly: empty() would also
                    # treat the starter "0" as missing.
                    if ($startChar !== '' &&
                        $lastClass < $class &&
                        $class > 0 &&
                        isset($utfCanonicalComp[$pair])) {
                        $startChar = $utfCanonicalComp[$pair];
                        $class = 0;
                    } else {
                        $combining .= $c;
                    }
                    $lastClass = $class;
                    $lastHangul = 0;
                    continue;
                }
            }
            # New start char
            if ($lastClass == 0) {
                if (isset($utfCanonicalComp[$pair])) {
                    $startChar = $utfCanonicalComp[$pair];
                    $lastHangul = 0;
                    continue;
                }
                if ($n >= $x1 && $n <= $x2) {
                    # WARNING: Hangul code is painfully slow.
                    # I apologize for this ugly, ugly code; however
                    # performance is even more teh suck if we call
                    # out to nice clean functions. Lookup tables are
                    # marginally faster, but require a lot of space.
                    #
                    if ($c >= UTF8_HANGUL_VBASE &&
                        $c <= UTF8_HANGUL_VEND &&
                        $startChar >= UTF8_HANGUL_LBASE &&
                        $startChar <= UTF8_HANGUL_LEND) {
                        #
                        #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
                        #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
                        $lIndex = ord($startChar[2]) - 0x80;
                        $vIndex = ord($c[2]) - 0xa1;

                        $hangulPoint = UNICODE_HANGUL_FIRST +
                            UNICODE_HANGUL_TCOUNT *
                            (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);

                        # Hardcode the limited-range UTF-8 conversion:
                        $startChar = chr($hangulPoint >> 12 & 0x0f | 0xe0) .
                                     chr($hangulPoint >> 6 & 0x3f | 0x80) .
                                     chr($hangulPoint & 0x3f | 0x80);
                        $lastHangul = 0;
                        continue;
                    } elseif ($c > UTF8_HANGUL_TBASE &&
                        $c <= UTF8_HANGUL_TEND &&
                        $startChar >= UTF8_HANGUL_FIRST &&
                        $startChar <= UTF8_HANGUL_LAST &&
                        !$lastHangul) {
                        # Strictly greater than TBASE: TBASE itself has
                        # index 0 and is not a trailing consonant, so it must
                        # not be merged (that would silently drop it).
                        #
                        # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
                        $tIndex = ord($c[2]) - 0xa7;
                        if ($tIndex < 0) {
                            $tIndex = ord($c[2]) - 0x80 + (0x11c0 - 0x11a7);
                        }

                        # Increment the code point by $tIndex, without
                        # the function overhead of decoding and recoding UTF-8
                        #
                        $tail = ord($startChar[2]) + $tIndex;
                        if ($tail > 0xbf) {
                            $tail -= 0x40;
                            $mid = ord($startChar[1]) + 1;
                            if ($mid > 0xbf) {
                                $startChar[0] = chr(ord($startChar[0]) + 1);
                                $mid -= 0x40;
                            }
                            $startChar[1] = chr($mid);
                        }
                        $startChar[2] = chr($tail);

                        # If there's another jamo char after this, *don't* try to merge it.
                        $lastHangul = 1;
                        continue;
                    }
                }
            }
            $out .= $startChar;
            $out .= $combining;
            $startChar = $c;
            $combining = '';
            $lastClass = 0;
            $lastHangul = 0;
        }
        $out .= $startChar . $combining;
        return $out;
    }

    /**
     * This is just used for the benchmark, comparing how long it takes to
     * iterate through a string without really doing anything of substance.
     * @param string $string
     * @return string
     * @static
     */
    public static function placebo($string)
    {
        $len = strlen($string);
        $out = '';
        for ($i = 0; $i < $len; $i++) {
            $out .= $string[$i];
        }
        return $out;
    }
}