<?php

/**
 * Convert a string into valid UTF-8. This function is quite slow.
 *
 * When invalid byte subsequences are encountered, they will be replaced with
 * U+FFFD, the Unicode replacement character.
 *
 * This function treats overlong encodings as invalid.
 *
 * @param string String to convert to valid UTF-8.
 * @return string String with invalid UTF-8 byte subsequences replaced with
 *                U+FFFD.
 */
function phutil_utf8ize($string) {
  if (phutil_is_utf8($string)) {
    return $string;
  }

  // There is no function in iconv, mbstring or ICU to do this, so do it
  // (very very slowly) in pure PHP.

  // TODO: Provide an optional fast C implementation ala fb_utf8ize() if this
  // ever shows up in profiles?

  $result = array();

  // The first capture group matches exactly one well-formed UTF-8 sequence
  // (excluding NUL and overlong forms). The second group matches any other
  // single byte, which is then replaced.
  $regex =
    "/([\x01-\x7F]".
    "|[\xC2-\xDF][\x80-\xBF]".
    "|[\xE0][\xA0-\xBF][\x80-\xBF]".
    "|[\xE1-\xEF][\x80-\xBF][\x80-\xBF]".
    "|[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]".
    "|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]".
    "|[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF])".
    "|(.)/";

  $replacement = "\xEF\xBF\xBD";

  $offset = 0;
  $matches = null;
  while (preg_match($regex, $string, $matches, 0, $offset)) {
    if (!isset($matches[2])) {
      $match = $matches[1];

      if ($match[0] == "\xED") {
        // If this is a 3-byte character that may be part of one of the
        // surrogate ranges, check if it's actually in those ranges. Reject
        // it as invalid if it is. These sequences are used in UTF16 and
        // functions like json_encode() refuse to encode them.

        $codepoint = ((ord($match[0]) & 0x0F) << 12)
                   + ((ord($match[1]) & 0x3F) << 6)
                   + ((ord($match[2]) & 0x3F));
        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
          // Emit one replacement character per rejected byte.
          $result[] = str_repeat($replacement, strlen($match));
          $offset += strlen($matches[0]);
          continue;
        }
      }

      $result[] = $match;
    } else {
      // Unicode replacement character, U+FFFD.
      $result[] = $replacement;
    }

    $offset += strlen($matches[0]);
  }

  return implode('', $result);
}


/**
 * Determine if a string is valid UTF-8, with only basic multilingual plane
 * characters. This is particularly important because MySQL's `utf8` column
 * types silently truncate strings which contain characters outside of this
 * set.
 *
 * @param string  String to test for being valid UTF-8 with only characters in
 *                the basic multilingual plane.
 * @return bool   True if the string is valid UTF-8 with only BMP characters.
 */
function phutil_is_utf8_with_only_bmp_characters($string) {
  return phutil_is_utf8_slowly($string, $only_bmp = true);
}


/**
 * Determine if a string is valid UTF-8.
 *
 * @param string  Some string which may or may not be valid UTF-8.
 * @return bool   True if the string is valid UTF-8.
 */
function phutil_is_utf8($string) {
  if (function_exists('mb_check_encoding')) {
    // See T13527. In some versions of PHP, "mb_check_encoding()" strictly
    // requires a string parameter.
    $string = phutil_string_cast($string);

    // If mbstring is available, this is significantly faster than using PHP.
    return mb_check_encoding($string, 'UTF-8');
  }

  return phutil_is_utf8_slowly($string);
}


/**
 * Determine if a string is valid UTF-8, slowly.
 *
 * This works on any system, but has very poor performance.
 *
 * You should call @{function:phutil_is_utf8} instead of this function, as
 * that function can use more performant mechanisms if they are available on
 * the system.
 *
 * Overlong encodings, UTF-16 surrogate codepoints, NUL bytes and codepoints
 * above U+10FFFF are all rejected.
 *
 * @param string  Some string which may or may not be valid UTF-8.
 * @param bool    True to require all characters be part of the basic
 *                multilingual plane (no more than 3-bytes long).
 * @return bool   True if the string is valid UTF-8.
 */
function phutil_is_utf8_slowly($string, $only_bmp = false) {
  // First, check the common case of normal ASCII strings. We're fine if
  // the string contains no bytes larger than 127.
  if (preg_match('/^[\x01-\x7F]+\z/', $string)) {
    return true;
  }

  // NOTE: In the past, we used a large regular expression in the form of
  // '(x|y|z)+' to match UTF8 strings. However, PCRE can segfault on patterns
  // like this at relatively small input sizes, at least on some systems
  // (observed on OSX and Windows). This is apparently because the internal
  // implementation is recursive and it blows the stack.

  // See <https://bugs.php.net/bug.php?id=45735> for some discussion. Since the
  // input limit is extremely low (less than 50KB on my system), do this check
  // very very slowly in PHP instead. See also T5316.

  $len = strlen($string);
  for ($ii = 0; $ii < $len; $ii++) {
    $chr = ord($string[$ii]);
    if ($chr >= 0x01 && $chr <= 0x7F) {
      continue;
    } else if ($chr >= 0xC2 && $chr <= 0xDF) {
      ++$ii;
      if ($ii >= $len) {
        return false;
      }
      $chr = ord($string[$ii]);
      if ($chr >= 0x80 && $chr <= 0xBF) {
        continue;
      }
      return false;
    } else if ($chr == 0xED) {
      // See T11525. Some sequences in this block are surrogate codepoints
      // that are reserved for use in UTF16. We should reject them.
      $codepoint = ($chr & 0x0F) << 12;
      ++$ii;
      if ($ii >= $len) {
        return false;
      }
      $chr = ord($string[$ii]);
      $codepoint += ($chr & 0x3F) << 6;
      if ($chr >= 0x80 && $chr <= 0xBF) {
        ++$ii;
        if ($ii >= $len) {
          return false;
        }
        $chr = ord($string[$ii]);
        $codepoint += ($chr & 0x3F);

        if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
          // Reject these surrogate codepoints.
          return false;
        }

        if ($chr >= 0x80 && $chr <= 0xBF) {
          continue;
        }
      }
      return false;
    } else if ($chr > 0xE0 && $chr <= 0xEF) {
      ++$ii;
      if ($ii >= $len) {
        return false;
      }
      $chr = ord($string[$ii]);
      if ($chr >= 0x80 && $chr <= 0xBF) {
        ++$ii;
        if ($ii >= $len) {
          return false;
        }
        $chr = ord($string[$ii]);
        if ($chr >= 0x80 && $chr <= 0xBF) {
          continue;
        }
      }
      return false;
    } else if ($chr == 0xE0) {
      ++$ii;
      if ($ii >= $len) {
        return false;
      }
      $chr = ord($string[$ii]);

      // NOTE: This range starts at 0xA0, not 0x80. The values 0x80-0xA0 are
      // "valid", but not minimal representations, and MySQL rejects them. We're
      // special casing this part of the range.

      if ($chr >= 0xA0 && $chr <= 0xBF) {
        ++$ii;
        if ($ii >= $len) {
          return false;
        }
        $chr = ord($string[$ii]);
        if ($chr >= 0x80 && $chr <= 0xBF) {
          continue;
        }
      }
      return false;
    } else if (!$only_bmp && $chr >= 0xF0 && $chr <= 0xF4) {
      // Four-byte sequences, which encode characters outside the BMP. The
      // valid range of the second byte depends on the lead byte:
      //
      //   - After 0xF0, the range starts at 0x90, not 0x80. The values
      //     0x80-0x8F are not minimal representations.
      //   - After 0xF4, the range ends at 0x8F. Larger values would encode
      //     codepoints above U+10FFFF, which are not valid Unicode. This
      //     matches the pattern used by phutil_utf8ize().
      if ($chr == 0xF0) {
        $min = 0x90;
        $max = 0xBF;
      } else if ($chr == 0xF4) {
        $min = 0x80;
        $max = 0x8F;
      } else {
        $min = 0x80;
        $max = 0xBF;
      }

      ++$ii;
      if ($ii >= $len) {
        return false;
      }
      $chr = ord($string[$ii]);
      if ($chr >= $min && $chr <= $max) {
        ++$ii;
        if ($ii >= $len) {
          return false;
        }
        $chr = ord($string[$ii]);
        if ($chr >= 0x80 && $chr <= 0xBF) {
          ++$ii;
          if ($ii >= $len) {
            return false;
          }
          $chr = ord($string[$ii]);
          if ($chr >= 0x80 && $chr <= 0xBF) {
            continue;
          }
        }
      }
    }

    // Any lead byte or continuation byte which was not accepted above makes
    // the whole string invalid.
    return false;
  }

  return true;
}


/**
 * Find the character length of a UTF-8 string.
 *
 * @param string A valid utf-8 string.
 * @return int   The character length of the string.
 */
function phutil_utf8_strlen($string) {
  // Prefer mbstring when it is available: "utf8_decode()" is deprecated in
  // recent versions of PHP.
  if (function_exists('mb_strlen')) {
    return mb_strlen($string, 'UTF-8');
  }

  if (function_exists('utf8_decode')) {
    return strlen(utf8_decode($string));
  }

  return count(phutil_utf8v($string));
}


/**
 * Find the console display length of a UTF-8 string. This may differ from the
 * character length of the string if it contains double-width characters, like
 * many Chinese characters.
 *
 * This method is based on a C implementation here, which is based on the IEEE
 * standards. The source has more discussion and addresses more considerations
 * than this implementation does.
 *
 *   http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 *
 * NOTE: We currently assume width 1 for East-Asian ambiguous characters.
 *
 * NOTE: This function is VERY slow.
 *
 * @param   string  A valid UTF-8 string.
 * @return  int     The console display length of the string.
 */
function phutil_utf8_console_strlen($string) {
  // Formatting and colors don't contribute any width in the console.
  $string = preg_replace("/\x1B\[\d*m/", '', $string);

  // In the common case of an ASCII string, just return the string length.
  if (preg_match('/^[\x01-\x7F]*\z/', $string)) {
    return strlen($string);
  }

  $len = 0;

  // NOTE: To deal with combining characters, we're splitting the string into
  // glyphs first (characters with combiners) and then counting just the width
  // of the first character in each glyph.

  $display_glyphs = phutil_utf8v_combined($string);
  foreach ($display_glyphs as $display_glyph) {
    $glyph_codepoints = phutil_utf8v_codepoints($display_glyph);
    foreach ($glyph_codepoints as $c) {
      if ($c == 0) {
        break;
      }

      // Every glyph is at least one column wide; glyphs in the ranges below
      // take up a second column.
      $len += 1 +
        ($c >= 0x1100 &&
          ($c <= 0x115F || /* Hangul Jamo init. consonants */
            $c == 0x2329 || $c == 0x232A ||
            ($c >= 0x2E80 && $c <= 0xA4CF &&
              $c != 0x303F) || /* CJK ... Yi */
            ($c >= 0xAC00 && $c <= 0xD7A3) || /* Hangul Syllables */
            ($c >= 0xF900 && $c <= 0xFAFF) || /* CJK Compatibility Ideographs */
            ($c >= 0xFE10 && $c <= 0xFE19) || /* Vertical forms */
            ($c >= 0xFE30 && $c <= 0xFE6F) || /* CJK Compatibility Forms */
            ($c >= 0xFF00 && $c <= 0xFF60) || /* Fullwidth Forms */
            ($c >= 0xFFE0 && $c <= 0xFFE6) ||
            ($c >= 0x20000 && $c <= 0x2FFFD) ||
            ($c >= 0x30000 && $c <= 0x3FFFD)));

      // Only the first codepoint of each glyph contributes width.
      break;
    }
  }

  return $len;
}


/**
 * Test if a string contains Chinese, Japanese, or Korean characters.
 *
 * Most languages use spaces to separate words, but these languages do not.
 *
 * @param string String to examine, in UTF8.
 * @return bool True if the string contains Chinese, Japanese, or Korean
 *   characters.
 */
function phutil_utf8_is_cjk($string) {
  $codepoints = phutil_utf8v_codepoints($string);

  foreach ($codepoints as $codepoint) {
    // CJK Unified Ideographs
    if ($codepoint >= 0x4E00 && $codepoint <= 0x9FFF) {
      return true;
    }

    // CJK Unified Ideographs Extension A
    if ($codepoint >= 0x3400 && $codepoint <= 0x4DBF) {
      return true;
    }

    // CJK Unified Ideographs Extension B
    if ($codepoint >= 0x20000 && $codepoint <= 0x2A6DF) {
      return true;
    }

    // CJK Unified Ideographs Extension C
    if ($codepoint >= 0x2A700 && $codepoint <= 0x2B73F) {
      return true;
    }

    // CJK Unified Ideographs Extension D
    if ($codepoint >= 0x2B740 && $codepoint <= 0x2B81F) {
      return true;
    }

    // CJK Unified Ideographs Extension E
    if ($codepoint >= 0x2B820 && $codepoint <= 0x2CEAF) {
      return true;
    }

    // CJK Unified Ideographs Extension F
    if ($codepoint >= 0x2CEB0 && $codepoint <= 0x2EBEF) {
      return true;
    }

    // CJK Compatibility Ideographs
    if ($codepoint >= 0xF900 && $codepoint <= 0xFAFF) {
      return true;
    }
  }

  return false;
}


/**
 * Split a UTF-8 string into an array of characters. Combining characters are
 * also split.
 *
 * @param string A valid utf-8 string.
 * @param int|null Stop processing after examining this many bytes.
 * @return list  A list of characters in the string.
 * @throws Exception If the string has an invalid lead byte, a truncated
 *                   sequence, or a lead byte in a continuation position.
 */
function phutil_utf8v($string, $byte_limit = null) {
  $string = phutil_string_cast($string);

  $res = array();
  $len = strlen($string);

  $ii = 0;
  while ($ii < $len) {
    $byte = $string[$ii];
    if ($byte <= "\x7F") {
      $res[] = $byte;
      $ii += 1;

      if ($byte_limit && ($ii >= $byte_limit)) {
        break;
      }

      continue;
    } else if ($byte < "\xC0") {
      throw new Exception(
        pht('Invalid UTF-8 string passed to %s.', __FUNCTION__));
    } else if ($byte <= "\xDF") {
      $seq_len = 2;
    } else if ($byte <= "\xEF") {
      $seq_len = 3;
    } else if ($byte <= "\xF7") {
      $seq_len = 4;
    } else if ($byte <= "\xFB") {
      $seq_len = 5;
    } else if ($byte <= "\xFD") {
      $seq_len = 6;
    } else {
      throw new Exception(
        pht('Invalid UTF-8 string passed to %s.', __FUNCTION__));
    }

    if ($ii + $seq_len > $len) {
      throw new Exception(
        pht('Invalid UTF-8 string passed to %s.', __FUNCTION__));
    }

    // NOTE: This only rejects lead bytes in continuation positions. ASCII
    // bytes in continuation positions are not detected here, so callers
    // should pass strings which are already known to be valid UTF-8.
    for ($jj = 1; $jj < $seq_len; ++$jj) {
      if ($string[$ii + $jj] >= "\xC0") {
        throw new Exception(
          pht('Invalid UTF-8 string passed to %s.', __FUNCTION__));
      }
    }
    $res[] = substr($string, $ii, $seq_len);
    $ii += $seq_len;

    if ($byte_limit && ($ii >= $byte_limit)) {
      break;
    }
  }

  return $res;
}


/**
 * Split a UTF-8 string into an array of codepoints (as integers).
 *
 * @param   string  A valid UTF-8 string.
 * @return  list    A list of codepoints, as integers.
 */
function phutil_utf8v_codepoints($string) {
  $str_v = phutil_utf8v($string);

  foreach ($str_v as $key => $char) {
    $c = ord($char[0]);
    $v = 0;

    // The high bits of the lead byte determine the sequence length; the
    // remaining bits and the low six bits of each continuation byte carry
    // the codepoint value.
    if (($c & 0x80) == 0) {
      $v = $c;
    } else if (($c & 0xE0) == 0xC0) {
      $v = (($c & 0x1F) << 6)
         + ((ord($char[1]) & 0x3F));
    } else if (($c & 0xF0) == 0xE0) {
      $v = (($c & 0x0F) << 12)
         + ((ord($char[1]) & 0x3F) << 6)
         + ((ord($char[2]) & 0x3F));
    } else if (($c & 0xF8) == 0xF0) {
      $v = (($c & 0x07) << 18)
         + ((ord($char[1]) & 0x3F) << 12)
         + ((ord($char[2]) & 0x3F) << 6)
         + ((ord($char[3]) & 0x3F));
    } else if (($c & 0xFC) == 0xF8) {
      $v = (($c & 0x03) << 24)
         + ((ord($char[1]) & 0x3F) << 18)
         + ((ord($char[2]) & 0x3F) << 12)
         + ((ord($char[3]) & 0x3F) << 6)
         + ((ord($char[4]) & 0x3F));
    } else if (($c & 0xFE) == 0xFC) {
      $v = (($c & 0x01) << 30)
         + ((ord($char[1]) & 0x3F) << 24)
         + ((ord($char[2]) & 0x3F) << 18)
         + ((ord($char[3]) & 0x3F) << 12)
         + ((ord($char[4]) & 0x3F) << 6)
         + ((ord($char[5]) & 0x3F));
    }

    $str_v[$key] = $v;
  }

  return $str_v;
}


/**
 * Convert a Unicode codepoint into a UTF8-encoded string.
 *
 * @param int Unicode codepoint.
 * @return string UTF8 encoding.
 * @throws Exception If the codepoint is 0x110000 or larger.
 */
function phutil_utf8_encode_codepoint($codepoint) {
  if ($codepoint < 0x80) {
    $r = chr($codepoint);
  } else if ($codepoint < 0x800) {
    $r = chr(0xC0 | (($codepoint >> 6) & 0x1F)).
         chr(0x80 | (($codepoint) & 0x3F));
  } else if ($codepoint < 0x10000) {
    $r = chr(0xE0 | (($codepoint >> 12) & 0x0F)).
         chr(0x80 | (($codepoint >> 6) & 0x3F)).
         chr(0x80 | (($codepoint) & 0x3F));
  } else if ($codepoint < 0x110000) {
    $r = chr(0xF0 | (($codepoint >> 18) & 0x07)).
         chr(0x80 | (($codepoint >> 12) & 0x3F)).
         chr(0x80 | (($codepoint >> 6) & 0x3F)).
         chr(0x80 | (($codepoint) & 0x3F));
  } else {
    throw new Exception(
      pht(
        'Encoding UTF8 codepoint "%s" is not supported.',
        $codepoint));
  }

  return $r;
}


/**
 * Hard-wrap a block of UTF-8 text with embedded HTML tags and entities.
 *
 * Entities count as a single character and tags count as zero characters.
 * An unterminated entity or tag is consumed through the end of the string.
 *
 * @param   string An HTML string with tags and entities.
 * @param   int    Width of the hard-wrapped lines, in characters.
 * @return  list   List of hard-wrapped lines.
 */
function phutil_utf8_hard_wrap_html($string, $width) {
  $break_here = array();

  // Convert the UTF-8 string into a list of UTF-8 characters.
  $vector = phutil_utf8v($string);
  $len = count($vector);
  $char_pos = 0;
  for ($ii = 0; $ii < $len; ++$ii) {
    // An ampersand indicates an HTML entity; consume the whole thing (until
    // ";") but treat it all as one character.
    if ($vector[$ii] == '&') {
      // Stop at the end of the input so an unterminated entity can not run
      // past the end of the vector.
      do {
        ++$ii;
      } while ($ii < $len && $vector[$ii] != ';');
      ++$char_pos;
    // An "<" indicates an HTML tag, consume the whole thing but don't treat
    // it as a character.
    } else if ($vector[$ii] == '<') {
      // As above, stop at the end of the input for an unterminated tag.
      do {
        ++$ii;
      } while ($ii < $len && $vector[$ii] != '>');
    } else {
      ++$char_pos;
    }

    // Keep track of where we need to break the string later.
    if ($char_pos == $width) {
      $break_here[$ii] = true;
      $char_pos = 0;
    }
  }

  $result = array();
  $string = '';
  foreach ($vector as $ii => $char) {
    $string .= $char;
    if (isset($break_here[$ii])) {
      $result[] = $string;
      $string = '';
    }
  }

  if (strlen($string)) {
    $result[] = $string;
  }

  return $result;
}

/**
 * Hard-wrap a block of UTF-8 text with no embedded HTML tags and entities.
 *
 * @param string A non HTML string
 * @param int Width of the hard-wrapped lines
 * @return list List of hard-wrapped lines.
 */
function phutil_utf8_hard_wrap($string, $width) {
  $result = array();

  $lines = phutil_split_lines($string, $retain_endings = false);
  foreach ($lines as $line) {

    // Convert the UTF-8 string into a list of UTF-8 characters.
    $vector = phutil_utf8v($line);

    $len = count($vector);
    $buffer = '';

    // Count from 1 so that a full line is flushed after every "$width"
    // characters.
    for ($ii = 1; $ii <= $len; ++$ii) {
      $buffer .= $vector[$ii - 1];
      if (($ii % $width) === 0) {
        $result[] = $buffer;
        $buffer = '';
      }
    }

    if (strlen($buffer)) {
      $result[] = $buffer;
    }
  }

  return $result;
}

/**
 * Convert a string from one encoding (like ISO-8859-1) to another encoding
 * (like UTF-8).
 *
 * This is primarily a thin wrapper around `mb_convert_encoding()` which checks
 * you have the extension installed, since we try to require the extension
 * only if you actually need it (i.e., you want to work with encodings other
 * than UTF-8).
 *
 * NOTE: This function assumes that the input is in the given source encoding.
 * If it is not, it may not output in the specified target encoding. If you
 * need to perform a hard conversion to UTF-8, use this function in conjunction
 * with @{function:phutil_utf8ize}. We can detect failures caused by invalid
 * encoding names, but `mb_convert_encoding()` fails silently if the
 * encoding name identifies a real encoding but the string is not actually
 * encoded with that encoding.
 *
 * @param string String to re-encode.
 * @param string Target encoding name, like "UTF-8".
 * @param string Source encoding name, like "ISO-8859-1".
 * @return string Input string, with converted character encoding.
 * @throws InvalidArgumentException If either encoding name is empty.
 * @throws Exception If mbstring is unavailable or the conversion fails.
 *
 * @phutil-external-symbol function mb_convert_encoding
 */
function phutil_utf8_convert($string, $to_encoding, $from_encoding) {
  if (!$from_encoding) {
    throw new InvalidArgumentException(
      pht(
        'Attempting to convert a string encoding, but no source encoding '.
        'was provided. Explicitly provide the source encoding.'));
  }
  if (!$to_encoding) {
    throw new InvalidArgumentException(
      pht(
        'Attempting to convert a string encoding, but no target encoding '.
        'was provided. Explicitly provide the target encoding.'));
  }

  // Normalize encoding names so we can no-op the very common case of UTF8
  // to UTF8 (or any other conversion where both encodings are identical).
  $to_upper = strtoupper(str_replace('-', '', $to_encoding));
  $from_upper = strtoupper(str_replace('-', '', $from_encoding));
  if ($from_upper == $to_upper) {
    return $string;
  }

  if (!function_exists('mb_convert_encoding')) {
    throw new Exception(
      pht(
        "Attempting to convert a string encoding from '%s' to '%s', ".
        "but the '%s' PHP extension is not available. Install %s to ".
        "work with encodings other than UTF-8.",
        $from_encoding,
        $to_encoding,
        'mbstring',
        'mbstring'));
  }

  $result = @mb_convert_encoding($string, $to_encoding, $from_encoding);

  if ($result === false) {
    $message = error_get_last();
    if ($message) {
      $message = idx($message, 'message', pht('Unknown error.'));
    } else {
      // No error was recorded, so there is nothing more specific to report.
      $message = pht('Unknown error.');
    }
    throw new Exception(
      pht(
        "String conversion from encoding '%s' to encoding '%s' failed: %s",
        $from_encoding,
        $to_encoding,
        $message));
  }

  return $result;
}


/**
 * Convert a string to title case in a UTF8-aware way. This function doesn't
 * necessarily do a great job, but the builtin implementation of `ucwords()` can
 * completely destroy inputs, so it just has to be better than that. Similar to
 * @{function:ucwords}.
 *
 * @param   string  UTF-8 input string.
 * @return  string  Input, in some semblance of title case.
 */
function phutil_utf8_ucwords($str) {
  // NOTE: mb_convert_case() discards uppercase letters in words when converting
  // to title case. For example, it will convert "AAA" into "Aaa", which is
  // undesirable.

  $v = phutil_utf8v($str);
  $result = '';
  $last = null;

  $ord_a = ord('a');
  $ord_z = ord('z');
  foreach ($v as $c) {
    // Only convert lowercase ASCII letters which begin the string or follow
    // a space.
    $convert = false;
    if ($last === null || $last === ' ') {
      $o = ord($c[0]);
      if ($o >= $ord_a && $o <= $ord_z) {
        $convert = true;
      }
    }

    if ($convert) {
      $result .= phutil_utf8_strtoupper($c);
    } else {
      $result .= $c;
    }

    $last = $c;
  }

  return $result;
}


/**
 * Convert a string to lower case in a UTF8-aware way. Similar to
 * @{function:strtolower}.
 *
 * @param   string  UTF-8 input string.
 * @return  string  Input, in some semblance of lower case.
 *
 * @phutil-external-symbol function mb_convert_case
 */
function phutil_utf8_strtolower($str) {
  if (function_exists('mb_convert_case')) {
    return mb_convert_case($str, MB_CASE_LOWER, 'UTF-8');
  }

  // Without mbstring, only ASCII letters are converted.
  static $map;
  if ($map === null) {
    $map = array_combine(
      range('A', 'Z'),
      range('a', 'z'));
  }

  return phutil_utf8_strtr($str, $map);
}


/**
 * Convert a string to upper case in a UTF8-aware way. Similar to
 * @{function:strtoupper}.
 *
 * @param   string  UTF-8 input string.
 * @return  string  Input, in some semblance of upper case.
 *
 * @phutil-external-symbol function mb_convert_case
 */
function phutil_utf8_strtoupper($str) {
  if (function_exists('mb_convert_case')) {
    return mb_convert_case($str, MB_CASE_UPPER, 'UTF-8');
  }

  // Without mbstring, only ASCII letters are converted.
  static $map;
  if ($map === null) {
    $map = array_combine(
      range('a', 'z'),
      range('A', 'Z'));
  }

  return phutil_utf8_strtr($str, $map);
}


/**
 * Replace characters in a string in a UTF-aware way. Similar to
 * @{function:strtr}.
 *
 * @param   string              UTF-8 input string.
 * @param   map<string, string> Map of characters to replace.
 * @return  string              Input with translated characters.
 */
function phutil_utf8_strtr($str, array $map) {
  $v = phutil_utf8v($str);
  $result = '';
  foreach ($v as $c) {
    if (isset($map[$c])) {
      $result .= $map[$c];
    } else {
      $result .= $c;
    }
  }

  return $result;
}

/**
 * Determine if a given unicode character is a combining character or not.
 *
 * @param   string              A single unicode character.
 * @return  boolean             True or false.
 */
function phutil_utf8_is_combining_character($character) {
  $components = phutil_utf8v_codepoints($character);

  // Combining Diacritical Marks (0300 - 036F).
  // Combining Diacritical Marks Supplement (1DC0 - 1DFF).
  // Combining Diacritical Marks for Symbols (20D0 - 20FF).
  // Combining Half Marks (FE20 - FE2F).

  foreach ($components as $codepoint) {
    if ($codepoint >= 0x0300 && $codepoint <= 0x036F ||
        $codepoint >= 0x1DC0 && $codepoint <= 0x1DFF ||
        $codepoint >= 0x20D0 && $codepoint <= 0x20FF ||
        $codepoint >= 0xFE20 && $codepoint <= 0xFE2F) {
      return true;
    }
  }

  return false;
}


/**
 * Split a UTF-8 string into an array of characters. Combining characters
 * are not split.
 *
 * @param string A valid utf-8 string.
 * @return list  A list of characters in the string.
 */
function phutil_utf8v_combined($string) {
  $components = phutil_utf8v($string);
  return phutil_utf8v_combine_characters($components);
}


/**
 * Merge combining characters in a UTF-8 string.
 *
 * This is a low-level method which can allow other operations to do less work.
 * If you have a string, call @{method:phutil_utf8v_combined} instead.
 *
 * @param list List of UTF-8 characters.
 * @return list List of UTF-8 strings with combining characters merged.
 */
function phutil_utf8v_combine_characters(array $characters) {
  if (!$characters) {
    return array();
  }

  // If the first character in the string is a combining character,
  // start with a space.
  if (phutil_utf8_is_combining_character($characters[0])) {
    $buf = ' ';
  } else {
    $buf = null;
  }

  $parts = array();
  foreach ($characters as $character) {
    if (!isset($character[1])) {
      // This is an optimization: there are no one-byte combining characters,
      // so we can just pass these through unmodified.
      $is_combining = false;
    } else {
      $is_combining = phutil_utf8_is_combining_character($character);
    }

    if ($is_combining) {
      $buf .= $character;
    } else {
      // A new base character begins; flush the glyph built so far.
      if ($buf !== null) {
        $parts[] = $buf;
      }
      $buf = $character;
    }
  }

  $parts[] = $buf;

  return $parts;
}


/**
 * Return the current system locale setting (LC_ALL).
 *
 * @return string Current system locale setting.
 * @throws Exception If the current locale can not be determined.
 */
function phutil_get_system_locale() {
  $locale = setlocale(LC_ALL, 0);

  if ($locale === false) {
    throw new Exception(
      pht(
        'Unable to determine current system locale (call to '.
        '"setlocale(LC_ALL, 0)" failed).'));
  }

  return $locale;
}


/**
 * Test if a system locale (LC_ALL) is available on the system.
 *
 * @param string Locale name like "en_US.UTF-8".
 * @return bool True if the locale is available.
 */
function phutil_is_system_locale_available($locale) {
  // Try to switch to the locale, then restore the previous setting.
  $old_locale = phutil_get_system_locale();
  $is_available = @setlocale(LC_ALL, $locale);
  setlocale(LC_ALL, $old_locale);

  return ($is_available !== false);
}


/**
 * Set the system locale (LC_ALL) to a particular value.
 *
 * @param string New locale setting.
 * @return void
 * @throws Exception If the locale can not be set.
 */
function phutil_set_system_locale($locale) {
  $ok = @setlocale(LC_ALL, $locale);
  if (!$ok) {
    throw new Exception(
      pht(
        'Failed to set system locale (to "%s").',
        $locale));
  }
}