# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is machine-generated by lib/unicore/mktables from the Unicode
# database, Version 6.3.0.  Any changes made here will be lost!


# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
# This file is for internal use by core Perl only.  The format and even the
# name or existence of this file are subject to change without notice.  Don't
# use it directly.  Use Unicode::UCD to access the Unicode character data
# base.


package charnames;

use strict;

# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names.  The following
# routines can be used to translate between name and code point and vice
# versa.
#
# NOTE(review): because this file is generated, the corrections made below
# (lexical $name, anchored Hangul prefix removal) must also be applied to the
# generator, lib/unicore/mktables, or they will be lost on regeneration.

{ # Closure

    # Matches a legal code point: 4-6 hex digits.  If there are 6, the first
    # two must be 10; if there are 5, the first must not be a 0.  Written
    # this way to decrease backtracking.  The first regex allows the code
    # point to be at the end of a word, but to work properly, the word
    # shouldn't end with a valid hex character.  The second one won't match a
    # code point at the end of a word, and doesn't have the run-on issue.
    my $run_on_code_point_re = qr/(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b)/;
    my $code_point_re = qr/(?^aa:\b(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b))/;

    # In the following hash, the keys are the bases of names which include
    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
    # of each key is another hash which is used to get the low and high ends
    # for each range of code points that apply to the name.  The 'low' and
    # 'high' arrays are parallel and sorted in ascending order.
    my %names_ending_in_code_point = (
        'CJK COMPATIBILITY IDEOGRAPH' => {
            'high' => [
                64109,
                64217,
                195101,
            ],
            'low' => [
                63744,
                64112,
                194560,
            ],
        },
        'CJK UNIFIED IDEOGRAPH' => {
            'high' => [
                19893,
                40908,
                173782,
                177972,
                178205,
            ],
            'low' => [
                13312,
                19968,
                131072,
                173824,
                177984,
            ],
        },
    );

    # The following hash is a copy of the previous one, except it is for
    # loose matching, so each name has blanks and dashes squeezed out.
    my %loose_names_ending_in_code_point = (
        'CJKCOMPATIBILITYIDEOGRAPH' => {
            'high' => [
                64109,
                64217,
                195101,
            ],
            'low' => [
                63744,
                64112,
                194560,
            ],
        },
        'CJKUNIFIEDIDEOGRAPH' => {
            'high' => [
                19893,
                40908,
                173782,
                177972,
                178205,
            ],
            'low' => [
                13312,
                19968,
                131072,
                173824,
                177984,
            ],
        },
    );

    # And the following array gives the inverse mapping from code points to
    # names.  Lowest code points are first.
    my @code_points_ending_in_code_point = (
        {
            'high' => 19893,
            'low'  => 13312,
            'name' => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high' => 40908,
            'low'  => 19968,
            'name' => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high' => 64109,
            'low'  => 63744,
            'name' => 'CJK COMPATIBILITY IDEOGRAPH',
        },
        {
            'high' => 64217,
            'low'  => 64112,
            'name' => 'CJK COMPATIBILITY IDEOGRAPH',
        },
        {
            'high' => 173782,
            'low'  => 131072,
            'name' => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high' => 177972,
            'low'  => 173824,
            'name' => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high' => 178205,
            'low'  => 177984,
            'name' => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high' => 195101,
            'low'  => 194560,
            'name' => 'CJK COMPATIBILITY IDEOGRAPH',
        },
    );

    # Convert from code point to Jamo short name for use in composing Hangul
    # syllable names
    my %Jamo = (
        4352 => 'G',
        4353 => 'GG',
        4354 => 'N',
        4355 => 'D',
        4356 => 'DD',
        4357 => 'R',
        4358 => 'M',
        4359 => 'B',
        4360 => 'BB',
        4361 => 'S',
        4362 => 'SS',
        4363 => '',
        4364 => 'J',
        4365 => 'JJ',
        4366 => 'C',
        4367 => 'K',
        4368 => 'T',
        4369 => 'P',
        4370 => 'H',
        4449 => 'A',
        4450 => 'AE',
        4451 => 'YA',
        4452 => 'YAE',
        4453 => 'EO',
        4454 => 'E',
        4455 => 'YEO',
        4456 => 'YE',
        4457 => 'O',
        4458 => 'WA',
        4459 => 'WAE',
        4460 => 'OE',
        4461 => 'YO',
        4462 => 'U',
        4463 => 'WEO',
        4464 => 'WE',
        4465 => 'WI',
        4466 => 'YU',
        4467 => 'EU',
        4468 => 'YI',
        4469 => 'I',
        4520 => 'G',
        4521 => 'GG',
        4522 => 'GS',
        4523 => 'N',
        4524 => 'NJ',
        4525 => 'NH',
        4526 => 'D',
        4527 => 'L',
        4528 => 'LG',
        4529 => 'LM',
        4530 => 'LB',
        4531 => 'LS',
        4532 => 'LT',
        4533 => 'LP',
        4534 => 'LH',
        4535 => 'M',
        4536 => 'B',
        4537 => 'BS',
        4538 => 'S',
        4539 => 'SS',
        4540 => 'NG',
        4541 => 'J',
        4542 => 'C',
        4543 => 'K',
        4544 => 'T',
        4545 => 'P',
        4546 => 'H',
    );

    # Leading consonant (can be null)
    my %Jamo_L = (
        ''   => 11,
        'B'  => 7,
        'BB' => 8,
        'C'  => 14,
        'D'  => 3,
        'DD' => 4,
        'G'  => 0,
        'GG' => 1,
        'H'  => 18,
        'J'  => 12,
        'JJ' => 13,
        'K'  => 15,
        'M'  => 6,
        'N'  => 2,
        'P'  => 17,
        'R'  => 5,
        'S'  => 9,
        'SS' => 10,
        'T'  => 16,
    );

    # Vowel
    my %Jamo_V = (
        'A'   => 0,
        'AE'  => 1,
        'E'   => 5,
        'EO'  => 4,
        'EU'  => 18,
        'I'   => 20,
        'O'   => 8,
        'OE'  => 11,
        'U'   => 13,
        'WA'  => 9,
        'WAE' => 10,
        'WE'  => 15,
        'WEO' => 14,
        'WI'  => 16,
        'YA'  => 2,
        'YAE' => 3,
        'YE'  => 7,
        'YEO' => 6,
        'YI'  => 19,
        'YO'  => 12,
        'YU'  => 17,
    );

    # Optional trailing consonant
    my %Jamo_T = (
        'B'  => 17,
        'BS' => 18,
        'C'  => 23,
        'D'  => 7,
        'G'  => 1,
        'GG' => 2,
        'GS' => 3,
        'H'  => 27,
        'J'  => 22,
        'K'  => 24,
        'L'  => 8,
        'LB' => 11,
        'LG' => 9,
        'LH' => 15,
        'LM' => 10,
        'LP' => 14,
        'LS' => 12,
        'LT' => 13,
        'M'  => 16,
        'N'  => 4,
        'NG' => 21,
        'NH' => 6,
        'NJ' => 5,
        'P'  => 26,
        'S'  => 19,
        'SS' => 20,
        'T'  => 25,
    );

    # Computed re that splits up a Hangul name into LVT or LV syllables
    my $syllable_re = qr/(|B|BB|C|D|DD|G|GG|H|J|JJ|K|M|N|P|R|S|SS|T)(A|AE|E|EO|EU|I|O|OE|U|WA|WAE|WE|WEO|WI|YA|YAE|YE|YEO|YI|YO|YU)(B|BS|C|D|G|GG|GS|H|J|K|L|LB|LG|LH|LM|LP|LS|LT|M|N|NG|NH|NJ|P|S|SS|T)?/;

    my $HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
    my $loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";

    # These constant names and values were taken from the Unicode standard,
    # version 5.1, section 3.12.  They are used in conjunction with Hangul
    # syllables
    my $SBase = 0xAC00;
    my $LBase = 0x1100;
    my $VBase = 0x1161;
    my $TBase = 0x11A7;
    my $SCount = 11172;
    my $LCount = 19;
    my $VCount = 21;
    my $TCount = 28;
    my $NCount = $VCount * $TCount;

    sub name_to_code_point_special {
        my ($name, $loose) = @_;

        # Returns undef if not one of the specially handled names; otherwise
        # returns the code point equivalent to the input name.
        # $loose is non-zero if to use loose matching; 'name' in that case
        # must be input as upper case with all blanks and dashes squeezed out.

        # Hangul syllables: strip the fixed prefix and decompose the rest into
        # its Jamo short names.  The prefix removal is anchored to the start
        # of the name so that a bogus name which merely contains the prefix
        # somewhere inside it (e.g. "GHANGUL SYLLABLE A") is not collapsed
        # into a valid-looking syllable.
        if ((! $loose && $name =~ s/^$HANGUL_SYLLABLE//)
            || ($loose && $name =~ s/^$loose_HANGUL_SYLLABLE//))
        {
            return if $name !~ qr/^$syllable_re$/;
            my $L = $Jamo_L{$1};
            my $V = $Jamo_V{$2};
            my $T = (defined $3) ? $Jamo_T{$3} : 0;    # Trailing part is optional
            return ($L * $VCount + $V) * $TCount + $T + $SBase;
        }

        # Name must end in 'code_point' for this to handle.
        return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
                   || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));

        my $base = $1;
        my $code_point = CORE::hex $2;
        my $names_ref;

        if ($loose) {
            $names_ref = \%loose_names_ending_in_code_point;
        }
        else {
            # In strict matching the base and the code point must be
            # separated by a hyphen, which is not part of the table key.
            return if $base !~ s/-$//;
            $names_ref = \%names_ending_in_code_point;
        }

        # Name must be one of the ones which has the code point in it.
        return if ! $names_ref->{$base};

        # Look through the list of ranges that apply to this name to see if
        # the code point is in one of them.  The ranges are in ascending
        # order, so the search can stop as soon as one starts above the code
        # point.
        my $lows  = $names_ref->{$base}{'low'};
        my $highs = $names_ref->{$base}{'high'};
        for my $i (0 .. $#{$lows}) {
            return if $lows->[$i] > $code_point;
            next if $highs->[$i] < $code_point;

            # Here, the code point is in the range.
            return $code_point;
        }

        # Here, looked like the name had a code point number in it, but
        # did not match one of the valid ones.
        return;
    }

    sub code_point_to_name_special {
        my $code_point = shift;

        # Returns the name of a code point if algorithmically determinable;
        # undef if not

        # If in the Hangul range, calculate the name based on Unicode's
        # algorithm
        if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
            use integer;
            my $SIndex = $code_point - $SBase;
            my $L = $LBase + $SIndex / $NCount;
            my $V = $VBase + ($SIndex % $NCount) / $TCount;
            my $T = $TBase + $SIndex % $TCount;

            # Lexical, so that the package variable $charnames::name is not
            # clobbered as a side effect of a lookup.
            my $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
            $name .= $Jamo{$T} if $T != $TBase;    # $TBase means "no trailing"
            return $name;
        }

        # Look through list of these code points for one in range.  The list
        # is sorted by 'low', so stop once a range starts above the code
        # point.
        foreach my $hash (@code_points_ending_in_code_point) {
            return if $code_point < $hash->{'low'};
            if ($code_point <= $hash->{'high'}) {
                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
            }
        }
        return;            # None found
    }
} # End closure

1;