1# !!!!!!! INTERNAL PERL USE ONLY !!!!!!! 2# This helper module is for internal use by core Perl only. This module is 3# subject to change or removal at any time without notice. Don't use it 4# directly. Use the public <charnames> module instead. 5 6package _charnames; 7use strict; 8use warnings; 9our $VERSION = '1.50'; 10use unicore::Name; # mktables-generated algorithmically-defined names 11 12use bytes (); # for $bytes::hint_bits 13use re "/aa"; # Everything in here should be ASCII 14 15$Carp::Internal{ (__PACKAGE__) } = 1; 16 17# Translate between Unicode character names and their code points. This is a 18# submodule of package <charnames>, used to allow \N{...} to be autoloaded, 19# but it was decided not to autoload the various functions in charnames; the 20# splitting allows this behavior. 21# 22# The official names with their code points are stored in a table in 23# lib/unicore/Name.pl which is read in as a large string (almost 3/4 Mb in 24# Unicode 6.0). Each code point sequence appears on a line by itself, with 25# its corresponding name occupying the next line in the string. (Some of the 26# CJK and the Hangul syllable names are instead determined algorithmically via 27# subroutines stored instead in lib/unicore/Name.pm). Because of the large 28# size of this table, it isn't converted into hashes for faster lookup. 29# 30# But, user defined aliases are stored in their own hashes, as are Perl 31# extensions to the official names. These are checked first before looking at 32# the official table. 33# 34# Basically, the table is grepped for the input code point (viacode()) or 35# name (the other functions), and the corresponding value on the next or 36# previous line is returned. The grepping is done by turning the input into a 37# regular expression. Thus, the same table does double duty, used by both 38# name and code point lookup. (If we were to have hashes, we would need two, 39# one for each lookup direction.) 40# 41# For loose name matching, the logical thing would be to have a table 42# with all the ignorable characters squeezed out, and then grep it with the 43# similiarly-squeezed input name. (And this is in fact how the lookups are 44# done with the small Perl extension hashes.) But since we need to be able to 45# go from code point to official name, the original table would still need to 46# exist. Due to the large size of the table, it was decided to not read 47# another very large string into memory for a second table. Instead, the 48# regular expression of the input name is modified to have optional spaces and 49# dashes between characters. For example, in strict matching, the regular 50# expression would be: 51# qr/^DIGIT ONE$/m 52# Under loose matching, the blank would be squeezed out, and the re would be: 53# qr/^D[- ]?I[- ]?G[- ]?I[- ]?T[- ]?O[- ]?N[- ]?E$/m 54# which matches a blank or dash between any characters in the official table. 55# 56# This is also how script lookup is done. Basically the re looks like 57# qr/ (?:LATIN|GREEK|CYRILLIC) (?:SMALL )?LETTER $name/ 58# where $name is the loose or strict regex for the remainder of the name. 59 60# The hashes are stored as utf8 strings. This makes it easier to deal with 61# sequences. I (khw) also tried making Name.pl utf8, but it slowed things 62# down by a factor of 7. I then tried making Name.pl store the utf8 63# equivalents but not calling them utf8. That led to similar speed as leaving 64# it alone, but since that is harder for a human to parse, I left it as-is. 65 66my %system_aliases = ( 67 68 'SINGLE-SHIFT 2' => chr utf8::unicode_to_native(0x8E), 69 'SINGLE-SHIFT 3' => chr utf8::unicode_to_native(0x8F), 70 'PRIVATE USE 1' => chr utf8::unicode_to_native(0x91), 71 'PRIVATE USE 2' => chr utf8::unicode_to_native(0x92), 72); 73 74# These are the aliases above that differ under :loose and :full matching 75# because the :full versions have blanks or hyphens in them. 76#my %loose_system_aliases = ( 77#); 78 79#my %deprecated_aliases; 80#$deprecated_aliases{'BELL'} = chr utf8::unicode_to_native(0x07) if $^V lt v5.17.0; 81 82#my %loose_deprecated_aliases = ( 83#); 84 85# These are special cased in :loose matching, differing only in a medial 86# hyphen 87my $HANGUL_JUNGSEONG_O_E_utf8 = chr 0x1180; 88my $HANGUL_JUNGSEONG_OE_utf8 = chr 0x116C; 89 90 91my $txt; # The table of official character names 92 93my %full_names_cache; # Holds already-looked-up names, so don't have to 94# re-look them up again. The previous versions of charnames had scoping 95# bugs. For example if we use script A in one scope and find and cache 96# what Z resolves to, we can't use that cache in a different scope that 97# uses script B instead of A, as Z might be an entirely different letter 98# there; or there might be different aliases in effect in different 99# scopes, or :short may be in effect or not effect in different scopes, 100# or various combinations thereof. This was solved in this version 101# mostly by moving things to %^H. But some things couldn't be moved 102# there. One of them was the cache of runtime looked-up names, in part 103# because %^H is read-only at runtime. I (khw) don't know why the cache 104# was run-time only in the previous versions: perhaps oversight; perhaps 105# that compile time looking doesn't happen in a loop so didn't think it 106# was worthwhile; perhaps not wanting to make the cache too large. But 107# I decided to make it compile time as well; this could easily be 108# changed. 109# Anyway, this hash is not scoped, and is added to at runtime. It 110# doesn't have scoping problems because the data in it is restricted to 111# official names, which are always invariant, and we only set it and 112# look at it at during :full lookups, so is unaffected by any other 113# scoped options. I put this in to maintain parity with the older 114# version. If desired, a %short_names cache could also be made, as well 115# as one for each script, say in %script_names_cache, with each key 116# being a hash for a script named in a 'use charnames' statement. I 117# decided not to do that for now, just because it's added complication, 118# and because I'm just trying to maintain parity, not extend it. 119 120# Like %full_names_cache, but for use when :loose is in effect. There needs 121# to be two caches because :loose may not be in effect for a scope, and a 122# loose name could inappropriately be returned when only exact matching is 123# called for. 124my %loose_names_cache; 125 126# Designed so that test decimal first, and then hex. Leading zeros 127# imply non-decimal, as do non-[0-9] 128my $decimal_qr = qr/^[1-9]\d*$/; 129 130# Returns the hex number in $1. 131my $hex_qr = qr/^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/; 132 133sub croak 134{ 135 require Carp; goto &Carp::croak; 136} # croak 137 138sub carp 139{ 140 require Carp; goto &Carp::carp; 141} # carp 142 143sub populate_txt() 144{ 145 return if $txt; 146 147 $txt = do "unicore/Name.pl"; 148 Internals::SvREADONLY($txt, 1); 149} 150 151sub alias (@) # Set up a single alias 152{ 153 my @errors; 154 my $nbsp = chr utf8::unicode_to_native(0xA0); 155 156 my $alias = ref $_[0] ? $_[0] : { @_ }; 157 foreach my $name (sort keys %$alias) { # Sort only because it helps having 158 # deterministic output for 159 # t/lib/charnames/alias 160 my $value = $alias->{$name}; 161 next unless defined $value; # Omit if screwed up. 162 163 # Is slightly slower to just after this statement see if it is 164 # decimal, since we already know it is after having converted from 165 # hex, but makes the code easier to maintain, and is called 166 # infrequently, only at compile-time 167 if ($value !~ $decimal_qr && $value =~ $hex_qr) { 168 my $temp = CORE::hex $1; 169 $temp = utf8::unicode_to_native($temp) if $value =~ /^[Uu]\+/; 170 $value = $temp; 171 } 172 if ($value =~ $decimal_qr) { 173 no warnings qw(non_unicode surrogate nonchar); # Allow any of these 174 $^H{charnames_ord_aliases}{$name} = chr $value; 175 176 # Use a canonical form. 177 $^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name; 178 } 179 else { 180 my $ok_portion = ""; 181 $ok_portion = $1 if $name =~ / ^ ( 182 \p{_Perl_Charname_Begin} 183 \p{_Perl_Charname_Continue}* 184 ) /x; 185 186 # If the name was fully correct, the above should have matched all of 187 # it. 188 if (length $ok_portion < length $name) { 189 my $first_bad = substr($name, length($ok_portion), 1); 190 push @errors, "Invalid character in charnames alias definition; " 191 . "marked by <-- HERE in '$ok_portion$first_bad<-- HERE " 192 . substr($name, length($ok_portion) + 1) 193 . "'"; 194 } 195 else { 196 if ($name =~ / ( .* \s ) ( \s* ) $ /x) { 197 push @errors, "charnames alias definitions may not contain " 198 . "trailing white-space; marked by <-- HERE in " 199 . "'$1 <-- HERE " . $2 . "'"; 200 next; 201 } 202 203 # Use '+' instead of '*' in this regex, because any trailing 204 # blanks have already been found 205 if ($name =~ / ( .*? \s{2} ) ( .+ ) /x) { 206 push @errors, "charnames alias definitions may not contain a " 207 . "sequence of multiple spaces; marked by <-- HERE " 208 . "in '$1 <-- HERE " . $2 . "'"; 209 next; 210 } 211 212 $^H{charnames_name_aliases}{$name} = $value; 213 } 214 } 215 } 216 217 # We find and output all errors from this :alias definition, rather than 218 # failing on the first one, so fewer runs are needed to get it to compile 219 if (@errors) { 220 croak join "\n", @errors; 221 } 222 223 return; 224} # alias 225 226sub not_legal_use_bytes_msg { 227 my ($name, $utf8) = @_; 228 my $return; 229 230 if (length($utf8) == 1) { 231 $return = sprintf("Character 0x%04x with name '%s' is", ord $utf8, $name); 232 } else { 233 $return = sprintf("String with name '%s' (and ordinals %s) contains character(s)", $name, join(" ", map { sprintf "0x%04X", ord $_ } split(//, $utf8))); 234 } 235 return $return . " above 0xFF with 'use bytes' in effect"; 236} 237 238sub alias_file ($) # Reads a file containing alias definitions 239{ 240 require File::Spec; 241 my ($arg, $file) = @_; 242 if (-f $arg && File::Spec->file_name_is_absolute ($arg)) { 243 $file = $arg; 244 } 245 elsif ($arg =~ m/ ^ \p{_Perl_IDStart} \p{_Perl_IDCont}* $/x) { 246 $file = "unicore/${arg}_alias.pl"; 247 } 248 else { 249 croak "Charnames alias file names can only have identifier characters"; 250 } 251 if (my @alias = do $file) { 252 @alias == 1 && !defined $alias[0] and 253 croak "$file cannot be used as alias file for charnames"; 254 @alias % 2 and 255 croak "$file did not return a (valid) list of alias pairs"; 256 alias (@alias); 257 return (1); 258 } 259 0; 260} # alias_file 261 262# For use when don't import anything. This structure must be kept in 263# sync with the one that import() fills up. 264my %dummy_H = ( 265 charnames_stringified_names => "", 266 charnames_stringified_ords => "", 267 charnames_scripts => "", 268 charnames_full => 1, 269 charnames_loose => 0, 270 charnames_short => 0, 271 ); 272 273 274sub lookup_name ($$$;$) { 275 my ($name, $wants_ord, $runtime, $regex_loose) = @_; 276 $regex_loose //= 0; 277 278 # Lookup the name or sequence $name in the tables. If $wants_ord is false, 279 # returns the string equivalent of $name; if true, returns the ordinal value 280 # instead, but in this case $name must not be a sequence; otherwise undef is 281 # returned and a warning raised. $runtime is 0 if compiletime, otherwise 282 # gives the number of stack frames to go back to get the application caller 283 # info. 284 # If $name is not found, returns undef in runtime with no warning; and in 285 # compiletime, the Unicode replacement character, with a warning. 286 287 # It looks first in the aliases, then in the large table of official Unicode 288 # names. 289 290 my $result; # The string result 291 my $save_input; 292 293 if ($runtime && ! $regex_loose) { 294 295 my $hints_ref = (caller($runtime))[10]; 296 297 # If we didn't import anything (which happens with 'use charnames ()', 298 # substitute a dummy structure. 299 $hints_ref = \%dummy_H if ! defined $hints_ref 300 || (! defined $hints_ref->{charnames_full} 301 && ! defined $hints_ref->{charnames_loose}); 302 303 # At runtime, but currently not at compile time, %^H gets 304 # stringified, so un-stringify back to the original data structures. 305 # These get thrown away by perl before the next invocation 306 # Also fill in the hash with the non-stringified data. 307 # N.B. New fields must be also added to %dummy_H 308 309 %{$^H{charnames_name_aliases}} = split ',', 310 $hints_ref->{charnames_stringified_names}; 311 %{$^H{charnames_ord_aliases}} = split ',', 312 $hints_ref->{charnames_stringified_ords}; 313 $^H{charnames_scripts} = $hints_ref->{charnames_scripts}; 314 $^H{charnames_full} = $hints_ref->{charnames_full}; 315 $^H{charnames_loose} = $hints_ref->{charnames_loose}; 316 $^H{charnames_short} = $hints_ref->{charnames_short}; 317 } 318 319 my $loose = $regex_loose || $^H{charnames_loose}; 320 my $lookup_name; # Input name suitably modified for grepping for in the 321 # table 322 323 # User alias should be checked first or else can't override ours, and if we 324 # were to add any, could conflict with theirs. 325 if (! $regex_loose && exists $^H{charnames_ord_aliases}{$name}) { 326 $result = $^H{charnames_ord_aliases}{$name}; 327 } 328 elsif (! $regex_loose && exists $^H{charnames_name_aliases}{$name}) { 329 $name = $^H{charnames_name_aliases}{$name}; 330 $save_input = $lookup_name = $name; # Cache the result for any error 331 # message 332 # The aliases are documented to not match loosely, so change loose match 333 # into full. 334 if ($loose) { 335 $loose = 0; 336 $^H{charnames_full} = 1; 337 } 338 } 339 else { 340 341 # Here, not a user alias. That means that loose matching may be in 342 # effect; will have to modify the input name. 343 $lookup_name = $name; 344 if ($loose) { 345 $lookup_name = uc $lookup_name; 346 347 # Squeeze out all underscores 348 $lookup_name =~ s/_//g; 349 350 # Remove all medial hyphens 351 $lookup_name =~ s/ (?<= \S ) - (?= \S )//gx; 352 353 # Squeeze out all spaces 354 $lookup_name =~ s/\s//g; 355 } 356 357 # Here, $lookup_name has been modified as necessary for looking in the 358 # hashes. Check the system alias files next. Most of these aliases are 359 # the same for both strict and loose matching. To save space, the ones 360 # which differ are in their own separate hash, which is checked if loose 361 # matching is selected and the regular match fails. To save time, the 362 # loose hashes could be expanded to include all aliases, and there would 363 # only have to be one check. But if someone specifies :loose, they are 364 # interested in convenience over speed, and the time for this second check 365 # is miniscule compared to the rest of the routine. 366 if (exists $system_aliases{$lookup_name}) { 367 $result = $system_aliases{$lookup_name}; 368 } 369 # There are currently no entries in this hash, so don't waste time looking 370 # for them. But the code is retained for the unlikely possibility that 371 # some will be added in the future. 372# elsif ($loose && exists $loose_system_aliases{$lookup_name}) { 373# $result = $loose_system_aliases{$lookup_name}; 374# } 375# if (exists $deprecated_aliases{$lookup_name}) { 376# require warnings; 377# warnings::warnif('deprecated', 378# "Unicode character name \"$name\" is deprecated, use \"" 379# . viacode(ord $deprecated_aliases{$lookup_name}) 380# . "\" instead"); 381# $result = $deprecated_aliases{$lookup_name}; 382# } 383 # There are currently no entries in this hash, so don't waste time looking 384 # for them. But the code is retained for the unlikely possibility that 385 # some will be added in the future. 386# elsif ($loose && exists $loose_deprecated_aliases{$lookup_name}) { 387# require warnings; 388# warnings::warnif('deprecated', 389# "Unicode character name \"$name\" is deprecated, use \"" 390# . viacode(ord $loose_deprecated_aliases{$lookup_name}) 391# . "\" instead"); 392# $result = $loose_deprecated_aliases{$lookup_name}; 393# } 394 } 395 396 my @off; # Offsets into table of pattern match begin and end 397 398 # If haven't found it yet... 399 if (! defined $result) { 400 401 # See if has looked this input up earlier. 402 if (! $loose && $^H{charnames_full} && exists $full_names_cache{$name}) { 403 $result = $full_names_cache{$name}; 404 } 405 elsif ($loose && exists $loose_names_cache{$name}) { 406 $result = $loose_names_cache{$name}; 407 } 408 else { # Here, must do a look-up 409 410 # If full or loose matching succeeded, points to where to cache the 411 # result 412 my $cache_ref; 413 414 ## Suck in the code/name list as a big string. 415 ## Entries look like: 416 ## "00052\nLATIN CAPITAL LETTER R\n\n" 417 # or 418 # "0052 0303\nLATIN CAPITAL LETTER R WITH TILDE\n\n" 419 populate_txt() unless $txt; 420 421 ## @off will hold the index into the code/name string of the start and 422 ## end of the name as we find it. 423 424 ## If :loose, look for a loose match; if :full, look for the name 425 ## exactly 426 # First, see if the name is one which is algorithmically determinable. 427 # The subroutine is included in Name.pl. The table contained in 428 # $txt doesn't contain these. Experiments show that checking 429 # for these before checking for the regular names has no 430 # noticeable impact on performance for the regular names, but 431 # the other way around slows down finding these immensely. 432 # Algorithmically determinables are not placed in the cache because 433 # that uses up memory, and finding these again is fast. 434 if ( ($loose || $^H{charnames_full}) 435 && (defined (my $ord = charnames::name_to_code_point_special($lookup_name, $loose)))) 436 { 437 $result = chr $ord; 438 } 439 else { 440 441 # Not algorithmically determinable; look up in the table. The name 442 # will be turned into a regex, so quote any meta characters. 443 $lookup_name = quotemeta $lookup_name; 444 445 if ($loose) { 446 447 # For loose matches, $lookup_name has already squeezed out the 448 # non-essential characters. We have to add in code to make the 449 # squeezed version match the non-squeezed equivalent in the table. 450 # The only remaining hyphens are ones that start or end a word in 451 # the original. They have been quoted in $lookup_name so they look 452 # like "\-". Change all other characters except the backslash 453 # quotes for any metacharacters, and the final character, so that 454 # e.g., COLON gets transformed into: /C[- ]?O[- ]?L[- ]?O[- ]?N/ 455 $lookup_name =~ s/ (?! \\ -) # Don't do this to the \- sequence 456 ( [^-\\] ) # Nor the "-" within that sequence, 457 # nor the "\" that quotes metachars, 458 # but otherwise put the char into $1 459 (?=.) # And don't do it for the final char 460 /$1\[- \]?/gx; # And add an optional blank or 461 # '-' after each $1 char 462 463 # Those remaining hyphens were originally at the beginning or end of 464 # a word, so they can match either a blank before or after, but not 465 # both. (Keep in mind that they have been quoted, so are a '\-' 466 # sequence) 467 $lookup_name =~ s/\\ -/(?:- | -)/xg; 468 } 469 470 # Do the lookup in the full table if asked for, and if succeeds 471 # save the offsets and set where to cache the result. 472 if (($loose || $^H{charnames_full}) && $txt =~ /^$lookup_name$/m) { 473 @off = ($-[0], $+[0]); 474 $cache_ref = ($loose) ? \%loose_names_cache : \%full_names_cache; 475 } 476 elsif ($regex_loose) { 477 # Currently don't allow :short when this is set 478 return; 479 } 480 else { 481 482 # Here, didn't look for, or didn't find the name. 483 # If :short is allowed, see if input is like "greek:Sigma". 484 # Keep in mind that $lookup_name has had the metas quoted. 485 my $scripts_trie = ""; 486 my $name_has_uppercase; 487 my @scripts; 488 if (($^H{charnames_short}) 489 && $lookup_name =~ /^ (?: \\ \s)* # Quoted space 490 (.+?) # $1 = the script 491 (?: \\ \s)* 492 \\ : # Quoted colon 493 (?: \\ \s)* 494 (.+?) # $2 = the name 495 (?: \\ \s)* $ 496 /xs) 497 { 498 # Even in non-loose matching, the script traditionally has been 499 # case insensitive 500 $scripts_trie = "\U$1"; 501 $lookup_name = $2; 502 503 # Use original name to find its input casing, but ignore the 504 # script part of that to make the determination. 505 $save_input = $name if ! defined $save_input; 506 $name =~ s/.*?://; 507 $name_has_uppercase = $name =~ /[[:upper:]]/; 508 } 509 else { # Otherwise look in allowed scripts 510 # We want to search first by script name then by letter name, so that 511 # if the user imported `use charnames qw(arabic hebrew)` and asked for 512 # \N{alef} they get ARABIC LETTER ALEF, and if they imported 513 # `... (hebrew arabic)` and ask for \N{alef} they get HEBREW LETTER ALEF. 514 # We can't rely on the regex engine to preserve ordering like that, so 515 # pick the pipe-seperated string apart so we can iterate over it. 516 @scripts = split(/\|/, $^H{charnames_scripts}); 517 518 # Use original name to find its input casing 519 $name_has_uppercase = $name =~ /[[:upper:]]/; 520 } 521 my $case = $name_has_uppercase ? "CAPITAL" : "SMALL"; 522 523 if(@scripts) { 524 SCRIPTS: foreach my $script (@scripts) { 525 if($txt =~ /^ (?: $script ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm) { 526 @off = ($-[0], $+[0]); 527 last SCRIPTS; 528 } 529 } 530 return unless(@off); 531 } 532 else { 533 return if (! $scripts_trie || $txt !~ 534 /^ (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm); 535 @off = ($-[0], $+[0]); 536 } 537 } 538 539 # Here, the input name has been found; we haven't set up the output, 540 # but we know where in the string 541 # the name starts. The string is set up so that for single characters 542 # (and not named sequences), the name is on a line by itself, and the 543 # previous line contains precisely 5 hex digits for its code point. 544 # Named sequences won't have the 7th preceding character be a \n. 545 # (Actually, for the very first entry in the table this isn't strictly 546 # true: subtracting 7 will yield -1, and the substr below will 547 # therefore yield the very last character in the table, which should 548 # also be a \n, so the statement works anyway.) 549 if (substr($txt, $off[0] - 7, 1) eq "\n") { 550 $result = chr CORE::hex substr($txt, $off[0] - 6, 5); 551 552 # Handle the single loose matching special case, in which two names 553 # differ only by a single medial hyphen. If the original had a 554 # hyphen (or more) in the right place, then it is that one. 555 $result = $HANGUL_JUNGSEONG_O_E_utf8 556 if $loose 557 && $result eq $HANGUL_JUNGSEONG_OE_utf8 558 && $name =~ m/O \s* - [-\s]* E/ix; 559 # Note that this wouldn't work if there were a 2nd 560 # OE in the name 561 } 562 else { 563 564 # Here, is a named sequence. Need to go looking for the beginning, 565 # which is just after the \n from the previous entry in the table. 566 # The +1 skips past that newline, or, if the rindex() fails, to put 567 # us to an offset of zero. 568 my $charstart = rindex($txt, "\n", $off[0] - 7) + 1; 569 $result = pack("W*", map { CORE::hex } 570 split " ", substr($txt, $charstart, $off[0] - $charstart - 1)); 571 } 572 } 573 574 # Cache the input so as to not have to search the large table 575 # again, but only if it came from the one search that we cache. 576 # (Haven't bothered with the pain of sorting out scoping issues for the 577 # scripts searches.) 578 $cache_ref->{$name} = $result if defined $cache_ref; 579 } 580 } 581 582 # Here, have the result character. If the return is to be an ord, must be 583 # any single character. 584 if ($wants_ord) { 585 return ord($result) if length $result == 1; 586 } 587 elsif (! utf8::is_utf8($result)) { 588 589 # Here isn't UTF-8. That's OK if it is all ASCII, or we are being called 590 # at compile time where we know we can guarantee that Unicode rules are 591 # correctly imposed on the result, or under 'bytes' where we don't want 592 # those rules. But otherwise we have to make it UTF8 to guarantee Unicode 593 # rules on the returned string. 594 return $result if ! $runtime 595 || (caller $runtime)[8] & $bytes::hint_bits 596 || $result !~ /[[:^ascii:]]/; 597 utf8::upgrade($result); 598 return $result; 599 } 600 else { 601 602 # Here, wants string output. If utf8 is acceptable, just return what 603 # we've got; otherwise attempt to convert it to non-utf8 and return that. 604 my $in_bytes = ! $regex_loose # \p{name=} doesn't currently care if 605 # in bytes or not 606 && (($runtime) 607 ? (caller $runtime)[8] & $bytes::hint_bits 608 : $^H & $bytes::hint_bits); 609 return $result if (! $in_bytes || utf8::downgrade($result, 1)) # The 1 arg 610 # means don't die on failure 611 } 612 613 # Here, there is an error: either there are too many characters, or the 614 # result string needs to be non-utf8, and at least one character requires 615 # utf8. Prefer any official name over the input one for the error message. 616 if (@off) { 617 $name = substr($txt, $off[0], $off[1] - $off[0]) if @off; 618 } 619 else { 620 $name = (defined $save_input) ? $save_input : $_[0]; 621 } 622 623 if ($wants_ord) { 624 # Only way to get here in this case is if result too long. Message 625 # assumes that our only caller that requires single char result is 626 # vianame. 627 carp "charnames::vianame() doesn't handle named sequences ($name). Use charnames::string_vianame() instead"; 628 return; 629 } 630 631 # Only other possible failure here is from use bytes. 632 if ($runtime) { 633 carp not_legal_use_bytes_msg($name, $result); 634 return; 635 } else { 636 croak not_legal_use_bytes_msg($name, $result); 637 } 638 639} # lookup_name 640 641sub charnames { 642 643 # For \N{...}. Looks up the character name and returns the string 644 # representation of it. 645 646 # The first 0 arg means wants a string returned; the second that we are in 647 # compile time 648 return lookup_name($_[0], 0, 0); 649} 650 651sub _loose_regcomp_lookup { 652 # For use only by regcomp.c to compile \p{name=...} 653 # khw thinks it best to not do :short matching, and only official names. 654 # But that is only a guess, and if demand warrants, could be changed 655 return lookup_name($_[0], 0, 1, 656 1 # Always use :loose matching 657 ); 658} 659 660sub _get_names_info { 661 # For use only by regcomp.c to compile \p{name=/.../} 662 populate_txt() unless $txt; 663 664 665 return ( \$txt, \@charnames::code_points_ending_in_code_point ); 666} 667 668sub import 669{ 670 shift; ## ignore class name 671 672 populate_txt() unless $txt; 673 674 if (not @_) { 675 carp("'use charnames' needs explicit imports list"); 676 } 677 $^H{charnames} = \&charnames ; 678 $^H{charnames_ord_aliases} = {}; 679 $^H{charnames_name_aliases} = {}; 680 $^H{charnames_inverse_ords} = {}; 681 # New fields must be added to %dummy_H, and the code in lookup_name() 682 # that copies fields from the runtime structure 683 684 ## 685 ## fill %h keys with our @_ args. 686 ## 687 my ($promote, %h, @args) = (0); 688 while (my $arg = shift) { 689 if ($arg eq ":alias") { 690 @_ or 691 croak ":alias needs an argument in charnames"; 692 my $alias = shift; 693 if (ref $alias) { 694 ref $alias eq "HASH" or 695 croak "Only HASH reference supported as argument to :alias"; 696 alias ($alias); 697 $promote = 1; 698 next; 699 } 700 if ($alias =~ m{:(\w+)$}) { 701 $1 eq "full" || $1 eq "loose" || $1 eq "short" and 702 croak ":alias cannot use existing pragma :$1 (reversed order?)"; 703 alias_file ($1) and $promote = 1; 704 next; 705 } 706 alias_file ($alias) and $promote = 1; 707 next; 708 } 709 if (substr($arg, 0, 1) eq ':' 710 and ! ($arg eq ":full" || $arg eq ":short" || $arg eq ":loose")) 711 { 712 warn "unsupported special '$arg' in charnames"; 713 next; 714 } 715 push @args, $arg; 716 } 717 718 @args == 0 && $promote and @args = (":full"); 719 @h{@args} = (1) x @args; 720 721 # Don't leave these undefined as are tested for in lookup_names 722 $^H{charnames_full} = delete $h{':full'} || 0; 723 $^H{charnames_loose} = delete $h{':loose'} || 0; 724 $^H{charnames_short} = delete $h{':short'} || 0; 725 my @scripts = map { uc quotemeta } grep { /^[^:]/ } @args; 726 727 ## 728 ## If utf8? warnings are enabled, and some scripts were given, 729 ## see if at least we can find one letter from each script. 730 ## 731 if (warnings::enabled('utf8') && @scripts) { 732 for my $script (@scripts) { 733 if (not $txt =~ m/^$script (?:CAPITAL |SMALL )?LETTER /m) { 734 warnings::warn('utf8', "No such script: '$script'"); 735 $script = quotemeta $script; # Escape it, for use in the re. 736 } 737 } 738 } 739 740 # %^H gets stringified, so serialize it ourselves so can extract the 741 # real data back later. 742 $^H{charnames_stringified_ords} = join ",", %{$^H{charnames_ord_aliases}}; 743 $^H{charnames_stringified_names} = join ",", %{$^H{charnames_name_aliases}}; 744 $^H{charnames_stringified_inverse_ords} = join ",", %{$^H{charnames_inverse_ords}}; 745 746 # Modify the input script names for loose name matching if that is also 747 # specified, similar to the way the base character name is prepared. They 748 # don't (currently, and hopefully never will) have dashes. These go into a 749 # regex, and have already been uppercased and quotemeta'd. Squeeze out all 750 # input underscores, blanks, and dashes. Then convert so will match a blank 751 # between any characters. 752 if ($^H{charnames_loose}) { 753 for (my $i = 0; $i < @scripts; $i++) { 754 $scripts[$i] =~ s/[_ -]//g; 755 $scripts[$i] =~ s/ ( [^\\] ) (?= . ) /$1\\ ?/gx; 756 } 757 } 758 759 my %letters_by_script = map { 760 $_ => [ 761 ($txt =~ m/$_(?: (?:small|capital))? letter (.*)/ig) 762 ] 763 } @scripts; 764 SCRIPTS: foreach my $this_script (@scripts) { 765 my @other_scripts = grep { $_ ne $this_script } @scripts; 766 my @this_script_letters = @{$letters_by_script{$this_script}}; 767 my @other_script_letters = map { @{$letters_by_script{$_}} } @other_scripts; 768 foreach my $this_letter (@this_script_letters) { 769 if(grep { $_ eq $this_letter } @other_script_letters) { 770 warn "charnames: some short character names may clash in [".join(', ', sort @scripts)."], for example $this_letter\n"; 771 last SCRIPTS; 772 } 773 } 774 } 775 776 $^H{charnames_scripts} = join "|", @scripts; # Stringifiy them as a trie 777} # import 778 779# Cache of already looked-up values. This is set to only contain 780# official values, and user aliases can't override them, so scoping is 781# not an issue. 782my %viacode; 783 784my $no_name_code_points_re = join "|", map { sprintf("%05X", 785 utf8::unicode_to_native($_)) } 786 0x80, 0x81, 0x84, 0x99; 787$no_name_code_points_re = qr/$no_name_code_points_re/; 788 789sub viacode { 790 791 # Returns the name of the code point argument 792 793 if (@_ != 1) { 794 carp "charnames::viacode() expects one argument"; 795 return; 796 } 797 798 my $arg = shift; 799 800 # This is derived from Unicode::UCD, where it is nearly the same as the 801 # function _getcode(), but here it makes sure that even a hex argument 802 # has the proper number of leading zeros, which is critical in 803 # matching against $txt below 804 # Must check if decimal first; see comments at that definition 805 my $hex; 806 if ($arg =~ $decimal_qr) { 807 $hex = sprintf "%05X", $arg; 808 } elsif ($arg =~ $hex_qr) { 809 $hex = CORE::hex $1; 810 $hex = utf8::unicode_to_native($hex) if $arg =~ /^[Uu]\+/; 811 # Below is the line that differs from the _getcode() source 812 $hex = sprintf "%05X", $hex; 813 } else { 814 carp("unexpected arg \"$arg\" to charnames::viacode()"); 815 return; 816 } 817 818 return $viacode{$hex} if exists $viacode{$hex}; 819 820 my $return; 821 822 # If the code point is above the max in the table, there's no point 823 # looking through it. Checking the length first is slightly faster 824 if (length($hex) <= 5 || CORE::hex($hex) <= 0x10FFFF) { 825 populate_txt() unless $txt; 826 827 # See if the name is algorithmically determinable. 828 my $algorithmic = charnames::code_point_to_name_special(CORE::hex $hex); 829 if (defined $algorithmic) { 830 $viacode{$hex} = $algorithmic; 831 return $algorithmic; 832 } 833 834 # Return the official name, if exists. It's unclear to me (khw) at 835 # this juncture if it is better to return a user-defined override, so 836 # leaving it as is for now. 837 if ($txt =~ m/^$hex\n/m) { 838 839 # The name starts with the next character and goes up to the 840 # next new-line. Using capturing parentheses above instead of 841 # @+ more than doubles the execution time in Perl 5.13 842 $return = substr($txt, $+[0], index($txt, "\n", $+[0]) - $+[0]); 843 844 # If not one of these 4 code points, return what we've found. 845 if ($hex !~ / ^ $no_name_code_points_re $ /x) { 846 $viacode{$hex} = $return; 847 return $return; 848 } 849 850 # For backwards compatibility, we don't return the official name of 851 # the 4 code points if there are user-defined aliases for them -- so 852 # continue looking. 853 } 854 } 855 856 # See if there is a user name for it, before giving up completely. 857 # First get the scoped aliases, give up if have none. 858 my $H_ref = (caller(1))[10]; 859 return if ! defined $return 860 && (! defined $H_ref 861 || ! exists $H_ref->{charnames_stringified_inverse_ords}); 862 863 my %code_point_aliases; 864 if (defined $H_ref->{charnames_stringified_inverse_ords}) { 865 %code_point_aliases = split ',', 866 $H_ref->{charnames_stringified_inverse_ords}; 867 return $code_point_aliases{$hex} if exists $code_point_aliases{$hex}; 868 } 869 870 # Here there is no user-defined alias, return any official one. 871 return $return if defined $return; 872 873 if (CORE::hex($hex) > 0x10FFFF 874 && warnings::enabled('non_unicode')) 875 { 876 carp "Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)"; 877 } 878 return; 879 880} # viacode 881 8821; 883 884# ex: set ts=8 sts=2 sw=2 et: 885