1<?php
2
/**
 * Coerce a string into valid UTF-8. This function is quite slow.
 *
 * Strings which already pass @{function:phutil_is_utf8} are returned
 * unmodified. Otherwise, each byte which is not part of a well-formed
 * sequence is replaced with U+FFFD, the Unicode replacement character.
 *
 * Overlong encodings are treated as invalid. Three-byte sequences which
 * encode UTF-16 surrogates (U+D800 through U+DFFF) are also treated as
 * invalid, and each of their bytes is replaced with U+FFFD.
 *
 * @param   string  String to convert to valid UTF-8.
 * @return  string  String with invalid UTF-8 byte subsequences replaced with
 *                  U+FFFD.
 */
function phutil_utf8ize($string) {
  if (phutil_is_utf8($string)) {
    return $string;
  }

  // There is no function in iconv, mbstring or ICU to do this, so do it
  // (very very slowly) in pure PHP.

  // TODO: Provide an optional fast C implementation ala fb_utf8ize() if this
  // ever shows up in profiles?

  // The first capture group matches exactly one well-formed sequence; the
  // second group matches any single leftover byte.
  $regex =
    "/([\x01-\x7F]".
      "|[\xC2-\xDF][\x80-\xBF]".
      "|[\xE0][\xA0-\xBF][\x80-\xBF]".
      "|[\xE1-\xEF][\x80-\xBF][\x80-\xBF]".
      "|[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]".
      "|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]".
      "|[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF])".
    "|(.)/";

  // Unicode replacement character, U+FFFD.
  $replacement = "\xEF\xBF\xBD";

  $pieces = array();
  $cursor = 0;
  $found = null;
  while (preg_match($regex, $string, $found, 0, $cursor)) {
    $cursor += strlen($found[0]);

    // The fallback group matched: this byte is not part of a valid sequence.
    if (isset($found[2])) {
      $pieces[] = $replacement;
      continue;
    }

    $sequence = $found[1];

    // A sequence led by 0xED encodes U+D000 through U+DFFF. It falls in the
    // surrogate range (U+D800 through U+DFFF) exactly when the second byte
    // is 0xA0 or larger. These sequences are used in UTF16 and functions
    // like json_encode() refuse to encode them, so reject them.
    $is_surrogate =
      ($sequence[0] === "\xED") &&
      (ord($sequence[1]) >= 0xA0);

    if ($is_surrogate) {
      $pieces[] = str_repeat($replacement, strlen($sequence));
    } else {
      $pieces[] = $sequence;
    }
  }

  return implode('', $pieces);
}
73
74
/**
 * Test whether a string is valid UTF-8 and contains only characters from the
 * basic multilingual plane. This is particularly important because MySQL's
 * `utf8` column types silently truncate strings which contain characters
 * outside of this set.
 *
 * This always uses the pure PHP check in @{function:phutil_is_utf8_slowly}.
 *
 * @param string  String to test for being valid UTF-8 with only characters in
 *                the basic multilingual plane.
 * @return bool   True if the string is valid UTF-8 with only BMP characters.
 */
function phutil_is_utf8_with_only_bmp_characters($string) {
  $only_bmp = true;
  return phutil_is_utf8_slowly($string, $only_bmp);
}
88
89
/**
 * Test whether a string is valid UTF-8.
 *
 * Uses `mb_check_encoding()` when the mbstring extension is available, and
 * falls back to @{function:phutil_is_utf8_slowly} otherwise.
 *
 * @param string  Some string which may or may not be valid UTF-8.
 * @return bool    True if the string is valid UTF-8.
 */
function phutil_is_utf8($string) {
  if (!function_exists('mb_check_encoding')) {
    // Without mbstring, fall back to the pure PHP implementation.
    return phutil_is_utf8_slowly($string);
  }

  // See T13527. In some versions of PHP, "mb_check_encoding()" strictly
  // requires a string parameter.
  $as_string = phutil_string_cast($string);

  // This is significantly faster than doing the check in PHP.
  return mb_check_encoding($as_string, 'UTF-8');
}
108
109
/**
 * Determine if a string is valid UTF-8, slowly.
 *
 * This works on any system, but has very poor performance.
 *
 * You should call @{function:phutil_is_utf8} instead of this function, as
 * that function can use more performant mechanisms if they are available on
 * the system.
 *
 * The check rejects NUL bytes, overlong encodings, UTF-16 surrogates
 * (U+D800 through U+DFFF), truncated sequences, and codepoints above
 * U+10FFFF.
 *
 * @param string  Some string which may or may not be valid UTF-8.
 * @param bool    True to require all characters be part of the basic
 *                multilingual plane (no more than 3-bytes long).
 * @return bool   True if the string is valid UTF-8.
 */
function phutil_is_utf8_slowly($string, $only_bmp = false) {
  // First, check the common case of normal ASCII strings. We're fine if
  // the string contains no bytes larger than 127.
  if (preg_match('/^[\x01-\x7F]+\z/', $string)) {
    return true;
  }

  // NOTE: In the past, we used a large regular expression in the form of
  // '(x|y|z)+' to match UTF8 strings. However, PCRE can segfault on patterns
  // like this at relatively small input sizes, at least on some systems
  // (observed on OSX and Windows). This is apparently because the internal
  // implementation is recursive and it blows the stack.

  // See <https://bugs.php.net/bug.php?id=45735> for some discussion. Since the
  // input limit is extremely low (less than 50KB on my system), do this check
  // very very slowly in PHP instead. See also T5316.

  $len = strlen($string);
  $ii = 0;
  while ($ii < $len) {
    $lead = ord($string[$ii]);
    ++$ii;

    // Single-byte characters. NUL (0x00) is deliberately not accepted.
    if ($lead >= 0x01 && $lead <= 0x7F) {
      continue;
    }

    // For each multibyte lead byte, work out how many trailing bytes follow
    // and which range the FIRST trailing byte must fall in. Every later
    // trailing byte must always be in the range 0x80-0xBF.
    $min = 0x80;
    $max = 0xBF;

    if ($lead >= 0xC2 && $lead <= 0xDF) {
      $tail = 1;
    } else if ($lead >= 0xE0 && $lead <= 0xEF) {
      $tail = 2;
      if ($lead == 0xE0) {
        // NOTE: This range starts at 0xA0, not 0x80. Smaller values are not
        // minimal representations, and MySQL rejects them.
        $min = 0xA0;
      } else if ($lead == 0xED) {
        // See T11525. Second bytes 0xA0-0xBF encode the surrogate codepoints
        // U+D800 through U+DFFF, which are reserved for use in UTF16. We
        // should reject them.
        $max = 0x9F;
      }
    } else if (!$only_bmp && $lead >= 0xF0 && $lead <= 0xF4) {
      $tail = 3;
      if ($lead == 0xF0) {
        // NOTE: As above, this range starts at 0x90, not 0x80. Smaller
        // values are not minimal representations.
        $min = 0x90;
      } else if ($lead == 0xF4) {
        // Second bytes above 0x8F encode codepoints beyond U+10FFFF, which
        // are not valid Unicode.
        $max = 0x8F;
      }
    } else {
      // Stray continuation byte, overlong lead (0xC0, 0xC1), NUL, a lead
      // byte above 0xF4, or a 4-byte lead when only BMP is permitted.
      return false;
    }

    // The sequence must not run off the end of the string.
    if ($ii + $tail > $len) {
      return false;
    }

    $byte = ord($string[$ii]);
    if ($byte < $min || $byte > $max) {
      return false;
    }
    ++$ii;

    for ($jj = 1; $jj < $tail; ++$jj) {
      $byte = ord($string[$ii]);
      if ($byte < 0x80 || $byte > 0xBF) {
        return false;
      }
      ++$ii;
    }
  }

  return true;
}
282
283
/**
 * Find the character length of a UTF-8 string.
 *
 * Prefers `mb_strlen()` when the mbstring extension is available. On older
 * runtimes without mbstring this falls back to `utf8_decode()`, and finally
 * to counting the characters produced by @{function:phutil_utf8v}.
 *
 * @param string A valid utf-8 string.
 * @return int   The character length of the string.
 *
 * @phutil-external-symbol function mb_strlen
 */
function phutil_utf8_strlen($string) {
  if (function_exists('mb_strlen')) {
    return mb_strlen($string, 'UTF-8');
  }

  // NOTE: "utf8_decode()" is deprecated as of PHP 8.2 and raises a
  // deprecation notice there, so only use it on older runtimes.
  if (PHP_VERSION_ID < 80200 && function_exists('utf8_decode')) {
    return strlen(utf8_decode($string));
  }

  return count(phutil_utf8v($string));
}
296
297
/**
 * Find the console display length of a UTF-8 string. This may differ from the
 * character length of the string if it contains double-width characters, like
 * many Chinese characters.
 *
 * This method is based on a C implementation here, which is based on the IEEE
 * standards. The source has more discussion and addresses more considerations
 * than this implementation does.
 *
 *   http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 *
 * ANSI formatting sequences of the form `ESC [ <params> m`, where the
 * parameters are digits optionally separated by semicolons, are stripped
 * before measuring and contribute no width.
 *
 * NOTE: We currently assume width 1 for East-Asian ambiguous characters.
 *
 * NOTE: This function is VERY slow.
 *
 * @param   string  A valid UTF-8 string.
 * @return  int     The console display length of the string.
 */
function phutil_utf8_console_strlen($string) {
  // Formatting and colors don't contribute any width in the console. Accept
  // semicolon-separated parameter lists (like "ESC[1;31m") as well as the
  // single-parameter form (like "ESC[1m").
  $string = preg_replace("/\x1B\[[\d;]*m/", '', $string);

  // In the common case of an ASCII string, just return the string length.
  if (preg_match('/^[\x01-\x7F]*\z/', $string)) {
    return strlen($string);
  }

  $len = 0;

  // NOTE: To deal with combining characters, we're splitting the string into
  // glyphs first (characters with combiners) and then counting just the width
  // of the first character in each glyph.

  $display_glyphs = phutil_utf8v_combined($string);
  foreach ($display_glyphs as $display_glyph) {
    $glyph_codepoints = phutil_utf8v_codepoints($display_glyph);
    if (!$glyph_codepoints) {
      continue;
    }

    $c = reset($glyph_codepoints);

    // A glyph which begins with NUL contributes no width.
    if ($c == 0) {
      continue;
    }

    $is_wide =
      ($c >= 0x1100 &&
        ($c <= 0x115F ||                    /* Hangul Jamo init. consonants */
          $c == 0x2329 || $c == 0x232A ||
          ($c >= 0x2E80 && $c <= 0xA4CF &&
            $c != 0x303F) ||                  /* CJK ... Yi */
          ($c >= 0xAC00 && $c <= 0xD7A3) || /* Hangul Syllables */
          ($c >= 0xF900 && $c <= 0xFAFF) || /* CJK Compatibility Ideographs */
          ($c >= 0xFE10 && $c <= 0xFE19) || /* Vertical forms */
          ($c >= 0xFE30 && $c <= 0xFE6F) || /* CJK Compatibility Forms */
          ($c >= 0xFF00 && $c <= 0xFF60) || /* Fullwidth Forms */
          ($c >= 0xFFE0 && $c <= 0xFFE6) ||
          ($c >= 0x20000 && $c <= 0x2FFFD) ||
          ($c >= 0x30000 && $c <= 0x3FFFD)));

    $len += ($is_wide ? 2 : 1);
  }

  return $len;
}
360
361
/**
 * Test if a string contains Chinese, Japanese, or Korean characters.
 *
 * Most languages use spaces to separate words, but these languages do not.
 *
 * The test looks for any codepoint in the CJK Unified Ideographs block, its
 * extensions A through F, or the CJK Compatibility Ideographs block.
 *
 * @param string String to examine, in UTF8.
 * @return bool True if the string contains Chinese, Japanese, or Korean
 *   characters.
 */
function phutil_utf8_is_cjk($string) {
  // Inclusive codepoint ranges, as pairs of (first, last).
  $cjk_ranges = array(
    // CJK Unified Ideographs
    array(0x4E00, 0x9FFF),
    // CJK Unified Ideographs Extension A
    array(0x3400, 0x4DBF),
    // CJK Unified Ideographs Extension B
    array(0x20000, 0x2A6DF),
    // CJK Unified Ideographs Extension C
    array(0x2A700, 0x2B73F),
    // CJK Unified Ideographs Extension D
    array(0x2B740, 0x2B81F),
    // CJK Unified Ideographs Extension E
    array(0x2B820, 0x2CEAF),
    // CJK Unified Ideographs Extension F
    array(0x2CEB0, 0x2EBEF),
    // CJK Compatibility Ideographs
    array(0xF900, 0xFAFF),
  );

  foreach (phutil_utf8v_codepoints($string) as $codepoint) {
    foreach ($cjk_ranges as $range) {
      list($first, $last) = $range;
      if ($codepoint >= $first && $codepoint <= $last) {
        return true;
      }
    }
  }

  return false;
}
418
419
/**
 * Split a UTF-8 string into an array of characters. Combining characters are
 * also split.
 *
 * This performs a structural check only: each lead byte must be followed by
 * the right number of continuation bytes (0x80-0xBF). It does not reject
 * overlong encodings or surrogates, and it accepts legacy 5-byte and 6-byte
 * sequences.
 *
 * An exception is thrown if the string contains a stray continuation byte,
 * an impossible lead byte (0xFE or 0xFF), a truncated sequence, or a lead
 * byte followed by something other than a continuation byte.
 *
 * @param string A valid utf-8 string.
 * @param int|null Stop processing after examining this many bytes. The
 *                 character which crosses the limit is still included.
 * @return list  A list of characters in the string.
 */
function phutil_utf8v($string, $byte_limit = null) {
  $string = phutil_string_cast($string);

  $res = array();
  $len = strlen($string);

  $ii = 0;
  while ($ii < $len) {
    $lead = ord($string[$ii]);

    // Determine the expected sequence length from the lead byte. A length
    // of zero marks a byte which can not begin a character.
    if ($lead <= 0x7F) {
      $seq_len = 1;
    } else if ($lead < 0xC0) {
      $seq_len = 0;
    } else if ($lead <= 0xDF) {
      $seq_len = 2;
    } else if ($lead <= 0xEF) {
      $seq_len = 3;
    } else if ($lead <= 0xF7) {
      $seq_len = 4;
    } else if ($lead <= 0xFB) {
      $seq_len = 5;
    } else if ($lead <= 0xFD) {
      $seq_len = 6;
    } else {
      $seq_len = 0;
    }

    $is_valid = ($seq_len > 0) && ($ii + $seq_len <= $len);

    // Every trailing byte must be a continuation byte. Checking only the
    // upper bound would let a lead byte swallow a following ASCII character
    // (for example, "\xC3<" would become a single "character").
    for ($jj = 1; $is_valid && $jj < $seq_len; ++$jj) {
      $trail = ord($string[$ii + $jj]);
      $is_valid = ($trail >= 0x80 && $trail <= 0xBF);
    }

    if (!$is_valid) {
      throw new Exception(
        pht('Invalid UTF-8 string passed to %s.', __FUNCTION__));
    }

    $res[] = substr($string, $ii, $seq_len);
    $ii += $seq_len;

    if ($byte_limit && ($ii >= $byte_limit)) {
      break;
    }
  }

  return $res;
}
484
485
/**
 * Split a UTF-8 string into an array of codepoints (as integers).
 *
 * The string is first split with @{function:phutil_utf8v}, then each
 * character is decoded by taking the payload bits of its lead byte and
 * appending six payload bits from each trailing byte.
 *
 * @param   string  A valid UTF-8 string.
 * @return  list    A list of codepoints, as integers.
 */
function phutil_utf8v_codepoints($string) {
  $codepoints = array();

  foreach (phutil_utf8v($string) as $index => $char) {
    $lead = ord($char[0]);

    // Pick the payload bits of the lead byte and the number of trailing
    // bytes based on the lead byte's high-bit pattern.
    if (($lead & 0x80) == 0) {
      $value = $lead;
      $trailing = 0;
    } else if (($lead & 0xE0) == 0xC0) {
      $value = ($lead & 0x1F);
      $trailing = 1;
    } else if (($lead & 0xF0) == 0xE0) {
      $value = ($lead & 0x0F);
      $trailing = 2;
    } else if (($lead & 0xF8) == 0xF0) {
      $value = ($lead & 0x07);
      $trailing = 3;
    } else if (($lead & 0xFC) == 0xF8) {
      $value = ($lead & 0x03);
      $trailing = 4;
    } else if (($lead & 0xFE) == 0xFC) {
      $value = ($lead & 0x01);
      $trailing = 5;
    } else {
      // Unrecognized lead byte: report codepoint 0.
      $value = 0;
      $trailing = 0;
    }

    // Each trailing byte contributes its low six bits.
    for ($ii = 1; $ii <= $trailing; ++$ii) {
      $value = ($value << 6) | (ord($char[$ii]) & 0x3F);
    }

    $codepoints[$index] = $value;
  }

  return $codepoints;
}
533
534
/**
 * Convert a Unicode codepoint into a UTF8-encoded string.
 *
 * Codepoints from 0 through 0x10FFFF are encoded as one to four bytes. Any
 * other value (including negative values) raises an exception.
 *
 * NOTE: Surrogate codepoints (U+D800 through U+DFFF) are not rejected here;
 * they are encoded as three bytes like any other value in that range.
 *
 * @param int Unicode codepoint.
 * @return string UTF8 encoding.
 */
function phutil_utf8_encode_codepoint($codepoint) {
  // Reject out-of-range values up front. Without the lower bound, a negative
  // value would be passed to chr(), which wraps it into an arbitrary byte.
  if ($codepoint < 0 || $codepoint >= 0x110000) {
    throw new Exception(
      pht(
        'Encoding UTF8 codepoint "%s" is not supported.',
        $codepoint));
  }

  if ($codepoint < 0x80) {
    return chr($codepoint);
  }

  if ($codepoint < 0x800) {
    return
      chr(0xC0 | (($codepoint >> 6)  & 0x1F)).
      chr(0x80 | (($codepoint)       & 0x3F));
  }

  if ($codepoint < 0x10000) {
    return
      chr(0xE0 | (($codepoint >> 12) & 0x0F)).
      chr(0x80 | (($codepoint >> 6)  & 0x3F)).
      chr(0x80 | (($codepoint)       & 0x3F));
  }

  return
    chr(0xF0 | (($codepoint >> 18) & 0x07)).
    chr(0x80 | (($codepoint >> 12) & 0x3F)).
    chr(0x80 | (($codepoint >> 6)  & 0x3F)).
    chr(0x80 | (($codepoint)       & 0x3F));
}
565
566
/**
 * Hard-wrap a block of UTF-8 text with embedded HTML tags and entities.
 *
 * Each entity (from "&" through the next ";") counts as a single character,
 * and each tag (from "<" through the next ">") counts as zero characters.
 * If an entity or tag is never terminated, the remainder of the string is
 * treated as part of it and ends up on the final line.
 *
 * @param   string An HTML string with tags and entities.
 * @param   int    Maximum number of visible characters per line.
 * @return  list   List of hard-wrapped lines.
 */
function phutil_utf8_hard_wrap_html($string, $width) {
  $break_here = array();

  // Convert the UTF-8 string into a list of UTF-8 characters.
  $vector = phutil_utf8v($string);
  $len = count($vector);
  $char_pos = 0;
  for ($ii = 0; $ii < $len; ++$ii) {
    if ($vector[$ii] == '&') {
      // An ampersand indicates an HTML entity; consume the whole thing (until
      // ";") but treat it all as one character. The scan is bounded by the
      // length of the input so an unterminated entity can not loop forever.
      do {
        ++$ii;
      } while ($ii < $len && $vector[$ii] != ';');
      ++$char_pos;
    } else if ($vector[$ii] == '<') {
      // An "<" indicates an HTML tag, consume the whole thing but don't treat
      // it as a character. As above, stop at the end of the input.
      do {
        ++$ii;
      } while ($ii < $len && $vector[$ii] != '>');
    } else {
      ++$char_pos;
    }

    // Keep track of where we need to break the string later.
    if ($char_pos == $width) {
      $break_here[$ii] = true;
      $char_pos = 0;
    }
  }

  // Reassemble the characters into lines, cutting after each marked index.
  $result = array();
  $line = '';
  foreach ($vector as $ii => $char) {
    $line .= $char;
    if (isset($break_here[$ii])) {
      $result[] = $line;
      $line = '';
    }
  }

  if (strlen($line)) {
    $result[] = $line;
  }

  return $result;
}
621
/**
 * Hard-wrap a block of UTF-8 text with no embedded HTML tags and entities.
 *
 * The text is first split into lines with `phutil_split_lines()`, then each
 * line is cut into pieces of at most `$width` characters. Pieces never span
 * two input lines.
 *
 * @param string A non HTML string
 * @param int Width of the hard-wrapped lines
 * @return list List of hard-wrapped lines.
 */
function phutil_utf8_hard_wrap($string, $width) {
  $wrapped = array();

  foreach (phutil_split_lines($string, false) as $line) {
    $piece = '';
    $count = 0;

    // Walk the line one UTF-8 character at a time, emitting a piece every
    // time the running character count reaches a multiple of the width.
    foreach (phutil_utf8v($line) as $char) {
      $piece .= $char;
      ++$count;

      if (($count % $width) === 0) {
        $wrapped[] = $piece;
        $piece = '';
      }
    }

    // Emit whatever is left over at the end of the line.
    if (strlen($piece)) {
      $wrapped[] = $piece;
    }
  }

  return $wrapped;
}
656
/**
 * Convert a string from one encoding (like ISO-8859-1) to another encoding
 * (like UTF-8).
 *
 * This is primarily a thin wrapper around `mb_convert_encoding()` which checks
 * you have the extension installed, since we try to require the extension
 * only if you actually need it (i.e., you want to work with encodings other
 * than UTF-8).
 *
 * If both encoding names are identical after removing hyphens and ignoring
 * case, the input is returned unmodified and the extension is not required.
 *
 * NOTE: This function assumes that the input is in the given source encoding.
 * If it is not, it may not output in the specified target encoding. If you
 * need to perform a hard conversion to UTF-8, use this function in conjunction
 * with @{function:phutil_utf8ize}. We can detect failures caused by invalid
 * encoding names, but `mb_convert_encoding()` fails silently if the
 * encoding name identifies a real encoding but the string is not actually
 * encoded with that encoding.
 *
 * Failures are always reported by throwing an `Exception` (or, for a missing
 * encoding name, an `InvalidArgumentException`), including on runtimes where
 * `mb_convert_encoding()` itself raises a `ValueError`.
 *
 * @param string String to re-encode.
 * @param string Target encoding name, like "UTF-8".
 * @param string Source encoding name, like "ISO-8859-1".
 * @return string Input string, with converted character encoding.
 *
 * @phutil-external-symbol function mb_convert_encoding
 */
function phutil_utf8_convert($string, $to_encoding, $from_encoding) {
  if (!$from_encoding) {
    throw new InvalidArgumentException(
      pht(
        'Attempting to convert a string encoding, but no source encoding '.
        'was provided. Explicitly provide the source encoding.'));
  }
  if (!$to_encoding) {
    throw new InvalidArgumentException(
      pht(
        'Attempting to convert a string encoding, but no target encoding '.
        'was provided. Explicitly provide the target encoding.'));
  }

  // Normalize encoding names so we can no-op the very common case of UTF8
  // to UTF8 (or any other conversion where both encodings are identical).
  $to_upper = strtoupper(str_replace('-', '', $to_encoding));
  $from_upper = strtoupper(str_replace('-', '', $from_encoding));
  if ($from_upper == $to_upper) {
    return $string;
  }

  if (!function_exists('mb_convert_encoding')) {
    throw new Exception(
      pht(
        "Attempting to convert a string encoding from '%s' to '%s', ".
        "but the '%s' PHP extension is not available. Install %s to ".
        "work with encodings other than UTF-8.",
        $from_encoding,
        $to_encoding,
        'mbstring',
        'mbstring'));
  }

  $message = null;
  $previous = null;
  try {
    $result = @mb_convert_encoding($string, $to_encoding, $from_encoding);
  } catch (ValueError $ex) {
    // As of PHP 8, an unknown encoding name raises a ValueError instead of
    // returning false. Treat it like any other conversion failure so callers
    // continue to see an Exception.
    $result = false;
    $message = $ex->getMessage();
    $previous = $ex;
  }

  if ($result === false) {
    if ($message === null) {
      $last_error = error_get_last();
      if ($last_error && isset($last_error['message'])) {
        $message = $last_error['message'];
      } else {
        $message = pht('Unknown error.');
      }
    }

    throw new Exception(
      pht(
        "String conversion from encoding '%s' to encoding '%s' failed: %s",
        $from_encoding,
        $to_encoding,
        $message),
      0,
      $previous);
  }

  return $result;
}
732
733
/**
 * Convert a string to title case in a UTF8-aware way. This function doesn't
 * necessarily do a great job, but the builtin implementation of `ucwords()` can
 * completely destroy inputs, so it just has to be better than that. Similar to
 * @{function:ucwords}.
 *
 * Only ASCII letters "a" through "z" which appear at the start of the string
 * or directly after a space character are converted; everything else is
 * passed through unmodified.
 *
 * @param   string  UTF-8 input string.
 * @return  string  Input, in some semblance of title case.
 */
function phutil_utf8_ucwords($str) {
  // NOTE: mb_convert_case() discards uppercase letters in words when converting
  // to title case. For example, it will convert "AAA" into "Aaa", which is
  // undesirable.

  $lower_a = ord('a');
  $lower_z = ord('z');

  $output = '';

  // True when the next character begins a word: at the very start of the
  // string, or immediately after a space.
  $at_word_start = true;

  foreach (phutil_utf8v($str) as $char) {
    $should_convert = false;
    if ($at_word_start) {
      $first_byte = ord($char[0]);
      $should_convert = ($first_byte >= $lower_a && $first_byte <= $lower_z);
    }

    $output .= $should_convert
      ? phutil_utf8_strtoupper($char)
      : $char;

    $at_word_start = ($char === ' ');
  }

  return $output;
}
774
775
/**
 * Convert a string to lower case in a UTF8-aware way. Similar to
 * @{function:strtolower}.
 *
 * When the mbstring extension is unavailable, only the ASCII letters "A"
 * through "Z" are converted.
 *
 * @param   string  UTF-8 input string.
 * @return  string  Input, in some semblance of lower case.
 *
 * @phutil-external-symbol function mb_convert_case
 */
function phutil_utf8_strtolower($str) {
  static $ascii_map = null;

  if (!function_exists('mb_convert_case')) {
    // Build the ASCII-only fallback map once and reuse it on later calls.
    if ($ascii_map === null) {
      $ascii_map = array_combine(
        range('A', 'Z'),
        range('a', 'z'));
    }

    return phutil_utf8_strtr($str, $ascii_map);
  }

  return mb_convert_case($str, MB_CASE_LOWER, 'UTF-8');
}
799
800
/**
 * Convert a string to upper case in a UTF8-aware way. Similar to
 * @{function:strtoupper}.
 *
 * When the mbstring extension is unavailable, only the ASCII letters "a"
 * through "z" are converted.
 *
 * @param   string  UTF-8 input string.
 * @return  string  Input, in some semblance of upper case.
 *
 * @phutil-external-symbol function mb_convert_case
 */
function phutil_utf8_strtoupper($str) {
  static $ascii_map = null;

  if (!function_exists('mb_convert_case')) {
    // Build the ASCII-only fallback map once and reuse it on later calls.
    if ($ascii_map === null) {
      $ascii_map = array_combine(
        range('a', 'z'),
        range('A', 'Z'));
    }

    return phutil_utf8_strtr($str, $ascii_map);
  }

  return mb_convert_case($str, MB_CASE_UPPER, 'UTF-8');
}
824
825
/**
 * Replace characters in a string in a UTF-aware way. Similar to
 * @{function:strtr}.
 *
 * The string is split into UTF-8 characters, and each character which
 * appears as a key in the map is replaced with the corresponding value.
 *
 * @param   string              UTF-8 input string.
 * @param   map<string, string> Map of characters to replace.
 * @return  string              Input with translated characters.
 */
function phutil_utf8_strtr($str, array $map) {
  $translated = array();

  foreach (phutil_utf8v($str) as $char) {
    $translated[] = isset($map[$char])
      ? $map[$char]
      : $char;
  }

  return implode('', $translated);
}
847
/**
 * Determine if a given unicode character is a combining character or not.
 *
 * The input is decoded into codepoints, and the result is true if any of
 * them falls in one of the combining mark blocks listed in the body.
 *
 * @param   string              A single unicode character.
 * @return  boolean             True or false.
 */
function phutil_utf8_is_combining_character($character) {
  // Inclusive codepoint ranges, as pairs of (first, last).
  $combining_ranges = array(
    // Combining Diacritical Marks (0300 - 036F).
    array(0x0300, 0x036F),
    // Combining Diacritical Marks Supplement (1DC0 - 1DFF).
    array(0x1DC0, 0x1DFF),
    // Combining Diacritical Marks for Symbols (20D0 - 20FF).
    array(0x20D0, 0x20FF),
    // Combining Half Marks (FE20 - FE2F).
    array(0xFE20, 0xFE2F),
  );

  foreach (phutil_utf8v_codepoints($character) as $codepoint) {
    foreach ($combining_ranges as $range) {
      list($first, $last) = $range;
      if ($codepoint >= $first && $codepoint <= $last) {
        return true;
      }
    }
  }

  return false;
}
873
874
/**
 * Split a UTF-8 string into an array of characters. Combining characters
 * are not split: each one stays attached to the character before it.
 *
 * @param string A valid utf-8 string.
 * @return list  A list of characters in the string.
 */
function phutil_utf8v_combined($string) {
  return phutil_utf8v_combine_characters(phutil_utf8v($string));
}
886
887
/**
 * Merge combining characters in a list of UTF-8 characters.
 *
 * This is a low-level method which can allow other operations to do less work.
 * If you have a string, call @{function:phutil_utf8v_combined} instead.
 *
 * Each combining character is appended to the entry which precedes it. If the
 * list begins with a combining character, it is attached to a space.
 *
 * @param list List of UTF-8 characters.
 * @return list List of UTF-8 strings with combining characters merged.
 */
function phutil_utf8v_combine_characters(array $characters) {
  if (!$characters) {
    return array();
  }

  // If the first character in the string is a combining character,
  // start with a space so it has something to attach to.
  $pending = phutil_utf8_is_combining_character($characters[0])
    ? ' '
    : null;

  $glyphs = array();
  foreach ($characters as $character) {
    // This is an optimization: there are no one-byte combining characters,
    // so single-byte characters skip the codepoint test entirely.
    $is_combining =
      isset($character[1]) &&
      phutil_utf8_is_combining_character($character);

    if ($is_combining) {
      $pending .= $character;
      continue;
    }

    // A new base character begins: flush the glyph built so far, if any.
    if ($pending !== null) {
      $glyphs[] = $pending;
    }
    $pending = $character;
  }

  $glyphs[] = $pending;

  return $glyphs;
}
934
935
/**
 * Return the current system locale setting (LC_ALL).
 *
 * Throws an exception if the current locale can not be queried.
 *
 * @return string Current system locale setting.
 */
function phutil_get_system_locale() {
  // Passing 0 as the locale queries the current setting without changing it.
  $current = setlocale(LC_ALL, 0);

  if ($current !== false) {
    return $current;
  }

  throw new Exception(
    pht(
      'Unable to determine current system locale (call to '.
      '"setlocale(LC_ALL, 0)" failed).'));
}
953
954
/**
 * Test if a system locale (LC_ALL) is available on the system.
 *
 * The test works by briefly switching to the locale and then restoring the
 * previous setting.
 *
 * @param string Locale name like "en_US.UTF-8".
 * @return bool True if the locale is available.
 */
function phutil_is_system_locale_available($locale) {
  $previous = phutil_get_system_locale();

  // Try the candidate locale, then put the original setting back.
  $outcome = @setlocale(LC_ALL, $locale);
  setlocale(LC_ALL, $previous);

  if ($outcome === false) {
    return false;
  }

  return true;
}
968
969
/**
 * Set the system locale (LC_ALL) to a particular value.
 *
 * Throws an exception if the locale can not be set.
 *
 * @param string New locale setting.
 * @return void
 */
function phutil_set_system_locale($locale) {
  if (@setlocale(LC_ALL, $locale)) {
    return;
  }

  throw new Exception(
    pht(
      'Failed to set system locale (to "%s").',
      $locale));
}
985