<?php

/*
 * This file is part of the Symfony package.
 *
 * (c) Fabien Potencier <fabien@symfony.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Symfony\Polyfill\Intl\Normalizer;

/**
 * Normalizer is a PHP fallback implementation of the Normalizer class provided by the intl extension.
 *
 * It has been validated with Unicode 6.3 Normalization Conformance Test.
 * See http://www.unicode.org/reports/tr15/ for detailed info about Unicode normalizations.
 *
 * @author Nicolas Grekas <p@tchwork.com>
 *
 * @internal
 */
class Normalizer
{
    const FORM_D = \Normalizer::FORM_D;
    const FORM_KD = \Normalizer::FORM_KD;
    const FORM_C = \Normalizer::FORM_C;
    const FORM_KC = \Normalizer::FORM_KC;
    const NFD = \Normalizer::NFD;
    const NFKD = \Normalizer::NFKD;
    const NFC = \Normalizer::NFC;
    const NFKC = \Normalizer::NFKC;

    // Lookup tables, lazily loaded through getData() on first use by normalize().
    private static $C;  // 'canonicalComposition' table
    private static $D;  // 'canonicalDecomposition' table
    private static $KD; // 'compatibilityDecomposition' table
    private static $cC; // 'combiningClass' table

    // Byte length of a multi-byte UTF-8 sequence, keyed by its lead byte masked with "\xF0".
    private static $ulenMask = array("\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4);

    // Mask of ASCII bytes handed to strspn() to measure runs of ASCII characters.
    private static $ASCII = "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";

    /**
     * Tells whether a string is already in the given normalization form.
     *
     * @param string $s    The string to check (cast to string)
     * @param int    $form One of the NFD, NFKD, NFC or NFKC constants
     *
     * @return bool False when $form is not one of the four forms, or when the
     *              string differs from its normalized version (which includes
     *              the case where normalize() fails and returns false)
     */
    public static function isNormalized($s, $form = self::NFC)
    {
        if (!\in_array($form, array(self::NFD, self::NFKD, self::NFC, self::NFKC))) {
            return false;
        }
        $s = (string) $s;

        // The ASCII span covers the whole string (or the string is empty): nothing to normalize.
        if (!isset($s[strspn($s, self::$ASCII)])) {
            return true;
        }

        // NFC shortcut: valid UTF-8 made only of code points up to U+02FF is reported as normalized.
        if (self::NFC == $form && preg_match('//u', $s) && !preg_match('/[^\x00-\x{2FF}]/u', $s)) {
            return true;
        }

        return self::normalize($s, $form) === $s;
    }

    /**
     * Normalizes a UTF-8 string to the requested form.
     *
     * @param string $s    The string to normalize (cast to string)
     * @param int    $form One of the NFD, NFKD, NFC or NFKC constants
     *
     * @return string|false The normalized string; $s unchanged when $form equals
     *                      \Normalizer::NONE (if that constant is defined); false
     *                      when $s is not valid UTF-8 or $form is not recognized
     */
    public static function normalize($s, $form = self::NFC)
    {
        $s = (string) $s;
        if (!preg_match('//u', $s)) {
            return false;
        }

        // $C: recompose after decomposing; $K: use compatibility decomposition.
        switch ($form) {
            case self::NFC: $C = true; $K = false; break;
            case self::NFD: $C = false; $K = false; break;
            case self::NFKC: $C = true; $K = true; break;
            case self::NFKD: $C = false; $K = true; break;
            default:
                if (\defined('Normalizer::NONE') && \Normalizer::NONE == $form) {
                    return $s;
                }

                return false;
        }

        if ('' === $s) {
            return '';
        }

        if ($K && null === self::$KD) {
            self::$KD = self::getData('compatibilityDecomposition');
        }

        if (null === self::$D) {
            self::$D = self::getData('canonicalDecomposition');
            self::$cC = self::getData('combiningClass');
        }

        // When mbstring overloads the string functions, switch to '8bit' so that
        // strlen()/substr() below work on bytes; the previous encoding is restored afterwards.
        if (null !== $mbEncoding = (2 /* MB_OVERLOAD_STRING */ & (int) ini_get('mbstring.func_overload')) ? mb_internal_encoding() : null) {
            mb_internal_encoding('8bit');
        }

        $r = self::decompose($s, $K);

        if ($C) {
            if (null === self::$C) {
                self::$C = self::getData('canonicalComposition');
            }

            $r = self::recompose($r);
        }
        if (null !== $mbEncoding) {
            mb_internal_encoding($mbEncoding);
        }

        return $r;
    }

    /**
     * Composes an already decomposed, non-empty UTF-8 string.
     *
     * Pairs are composed through the canonical composition table, except
     * Hangul L+V(+T) jamo sequences which are composed arithmetically.
     *
     * @param string $s A non-empty string, as returned by decompose()
     *
     * @return string The composed string
     */
    private static function recompose($s)
    {
        $ASCII = self::$ASCII;
        $compMap = self::$C;
        $combClass = self::$cC;
        $ulenMask = self::$ulenMask;

        // $result: finished output; $lastUchr: pending char that may still absorb
        // following chars; $tail: combining chars that did not compose with it.
        $result = $tail = '';

        $i = $s[0] < "\x80" ? 1 : $ulenMask[$s[0] & "\xF0"];
        $len = \strlen($s);

        $lastUchr = substr($s, 0, $i);
        // A leading char that has a combining class gets the marker value 256, which
        // makes the "$lastUcls < $ucls" test below fail
        // (presumably 256 exceeds every class stored in the table - TODO confirm).
        $lastUcls = isset($combClass[$lastUchr]) ? 256 : 0;

        while ($i < $len) {
            if ($s[$i] < "\x80") {
                // ASCII chars

                if ($tail) {
                    $lastUchr .= $tail;
                    $tail = '';
                }

                // Flush the whole ASCII run at once, keeping only its last byte pending.
                if ($j = strspn($s, $ASCII, $i + 1)) {
                    $lastUchr .= substr($s, $i, $j);
                    $i += $j;
                }

                $result .= $lastUchr;
                $lastUchr = $s[$i];
                $lastUcls = 0;
                ++$i;
                continue;
            }

            $ulen = $ulenMask[$s[$i] & "\xF0"];
            $uchr = substr($s, $i, $ulen);

            // Anything but "leading jamo U+1100..U+1112 followed by vowel jamo U+1161..U+1175".
            if ($lastUchr < "\xE1\x84\x80" || "\xE1\x84\x92" < $lastUchr
                || $uchr < "\xE1\x85\xA1" || "\xE1\x85\xB5" < $uchr
                || $lastUcls) {
                // Table lookup and combining chars composition

                $ucls = isset($combClass[$uchr]) ? $combClass[$uchr] : 0;

                if (isset($compMap[$lastUchr.$uchr]) && (!$lastUcls || $lastUcls < $ucls)) {
                    $lastUchr = $compMap[$lastUchr.$uchr];
                } elseif ($lastUcls = $ucls) {
                    // Combining char that did not compose: keep it after the pending char.
                    $tail .= $uchr;
                } else {
                    if ($tail) {
                        $lastUchr .= $tail;
                        $tail = '';
                    }

                    $result .= $lastUchr;
                    $lastUchr = $uchr;
                }
            } else {
                // Hangul chars

                $L = \ord($lastUchr[2]) - 0x80;
                $V = \ord($uchr[2]) - 0xA1;
                $T = 0;

                $uchr = substr($s, $i + $ulen, 3);

                // Trailing consonants are U+11A8..U+11C2. U+11A7 is only the base used to
                // compute the index; it must not be consumed, otherwise it would be
                // dropped from the output.
                if ("\xE1\x86\xA8" <= $uchr && $uchr <= "\xE1\x87\x82") {
                    $T = \ord($uchr[2]) - 0xA7;
                    // Third byte wrapped around (second byte is \x87): fix the index up.
                    0 > $T && $T += 0x40;
                    $ulen += 3;
                }

                // Code point of the syllable, then its 3-byte UTF-8 encoding.
                $L = 0xAC00 + ($L * 21 + $V) * 28 + $T;
                $lastUchr = \chr(0xE0 | $L >> 12).\chr(0x80 | $L >> 6 & 0x3F).\chr(0x80 | $L & 0x3F);
            }

            $i += $ulen;
        }

        return $result.$lastUchr.$tail;
    }

    /**
     * Decomposes a UTF-8 string and reorders its combining chars by combining class.
     *
     * @param string $s The string to decompose
     * @param bool   $c Whether to also apply the compatibility decomposition table
     *
     * @return string The decomposed string
     */
    private static function decompose($s, $c)
    {
        $result = '';

        $ASCII = self::$ASCII;
        $decompMap = self::$D;
        $combClass = self::$cC;
        $ulenMask = self::$ulenMask;
        if ($c) {
            $compatMap = self::$KD;
        }

        // From here on, $c buffers pending combining chars, keyed by combining class.
        $c = array();
        $i = 0;
        $len = \strlen($s);

        while ($i < $len) {
            if ($s[$i] < "\x80") {
                // ASCII chars

                if ($c) {
                    ksort($c);
                    $result .= implode('', $c);
                    $c = array();
                }

                $j = 1 + strspn($s, $ASCII, $i + 1);
                $result .= substr($s, $i, $j);
                $i += $j;
                continue;
            }

            $ulen = $ulenMask[$s[$i] & "\xF0"];
            $uchr = substr($s, $i, $ulen);
            $i += $ulen;

            // Anything outside the precomposed Hangul syllables U+AC00..U+D7A3.
            if ($uchr < "\xEA\xB0\x80" || "\xED\x9E\xA3" < $uchr) {
                // Table lookup

                if ($uchr !== $j = isset($compatMap[$uchr]) ? $compatMap[$uchr] : (isset($decompMap[$uchr]) ? $decompMap[$uchr] : $uchr)) {
                    $uchr = $j;

                    $j = \strlen($uchr);
                    $ulen = $uchr[0] < "\x80" ? 1 : $ulenMask[$uchr[0] & "\xF0"];

                    if ($ulen != $j) {
                        // Put trailing chars in $s
                        // (they overwrite the bytes just before $i, so the loop reads them next)

                        $j -= $ulen;
                        $i -= $j;

                        // Not enough room before the cursor: left-pad $s.
                        if (0 > $i) {
                            $s = str_repeat(' ', -$i).$s;
                            $len -= $i;
                            $i = 0;
                        }

                        while ($j--) {
                            $s[$i + $j] = $uchr[$ulen + $j];
                        }

                        $uchr = substr($uchr, 0, $ulen);
                    }
                }
                if (isset($combClass[$uchr])) {
                    // Combining chars, for sorting

                    if (!isset($c[$combClass[$uchr]])) {
                        $c[$combClass[$uchr]] = '';
                    }
                    $c[$combClass[$uchr]] .= $uchr;
                    continue;
                }
            } else {
                // Hangul chars

                // Decode the 3-byte sequence to its offset from U+AC00
                // (0xAC80 = 0xAC00 plus the 0x80 marker bits of the third byte).
                $uchr = unpack('C*', $uchr);
                $j = (($uchr[1] - 224) << 12) + (($uchr[2] - 128) << 6) + $uchr[3] - 0xAC80;

                // Leading consonant (588 syllables each) and vowel (28 syllables each).
                $uchr = "\xE1\x84".\chr(0x80 + (int) ($j / 588))
                       ."\xE1\x85".\chr(0xA1 + (int) (($j % 588) / 28));

                // Optional trailing consonant; its UTF-8 second byte changes from \x86 to \x87 at index 25.
                if ($j %= 28) {
                    $uchr .= $j < 25
                        ? ("\xE1\x86".\chr(0xA7 + $j))
                        : ("\xE1\x87".\chr(0x67 + $j));
                }
            }
            // A non-combining char ends the current combining sequence: flush it, sorted by class.
            if ($c) {
                ksort($c);
                $result .= implode('', $c);
                $c = array();
            }

            $result .= $uchr;
        }

        if ($c) {
            ksort($c);
            $result .= implode('', $c);
        }

        return $result;
    }

    /**
     * Loads a data table from the Resources/unidata directory next to this file.
     *
     * @param string $file The table name, without the ".php" extension
     *
     * @return mixed The value returned by the included file, or false when the file does not exist
     */
    private static function getData($file)
    {
        if (file_exists($file = __DIR__.'/Resources/unidata/'.$file.'.php')) {
            return require $file;
        }

        return false;
    }
}