1package Unicode::UCD; 2 3use strict; 4use warnings; 5 6our $VERSION = '0.27'; 7 8use Storable qw(dclone); 9 10require Exporter; 11 12our @ISA = qw(Exporter); 13 14our @EXPORT_OK = qw(charinfo 15 charblock charscript 16 charblocks charscripts 17 charinrange 18 general_categories bidi_types 19 compexcl 20 casefold casespec 21 namedseq); 22 23use Carp; 24 25=head1 NAME 26 27Unicode::UCD - Unicode character database 28 29=head1 SYNOPSIS 30 31 use Unicode::UCD 'charinfo'; 32 my $charinfo = charinfo($codepoint); 33 34 use Unicode::UCD 'casefold'; 35 my $casefold = casefold(0xFB00); 36 37 use Unicode::UCD 'casespec'; 38 my $casespec = casespec(0xFB00); 39 40 use Unicode::UCD 'charblock'; 41 my $charblock = charblock($codepoint); 42 43 use Unicode::UCD 'charscript'; 44 my $charscript = charscript($codepoint); 45 46 use Unicode::UCD 'charblocks'; 47 my $charblocks = charblocks(); 48 49 use Unicode::UCD 'charscripts'; 50 my $charscripts = charscripts(); 51 52 use Unicode::UCD qw(charscript charinrange); 53 my $range = charscript($script); 54 print "looks like $script\n" if charinrange($range, $codepoint); 55 56 use Unicode::UCD qw(general_categories bidi_types); 57 my $categories = general_categories(); 58 my $types = bidi_types(); 59 60 use Unicode::UCD 'compexcl'; 61 my $compexcl = compexcl($codepoint); 62 63 use Unicode::UCD 'namedseq'; 64 my $namedseq = namedseq($named_sequence_name); 65 66 my $unicode_version = Unicode::UCD::UnicodeVersion(); 67 68=head1 DESCRIPTION 69 70The Unicode::UCD module offers a series of functions that 71provide a simple interface to the Unicode 72Character Database. 73 74=head2 code point argument 75 76Some of the functions are called with a I<code point argument>, which is either 77a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+> 78followed by hexadecimals designating a Unicode code point. In other words, if 79you want a code point to be interpreted as a hexadecimal number, you must 80prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be 81interpreted as a decimal code point. Also note that Unicode is B<not> limited 82to 16 bits (the number of Unicode code points is open-ended, in theory 83unlimited): you may have more than 4 hexdigits. 84=cut 85 86my $UNICODEFH; 87my $BLOCKSFH; 88my $SCRIPTSFH; 89my $VERSIONFH; 90my $COMPEXCLFH; 91my $CASEFOLDFH; 92my $CASESPECFH; 93my $NAMEDSEQFH; 94 95sub openunicode { 96 my ($rfh, @path) = @_; 97 my $f; 98 unless (defined $$rfh) { 99 for my $d (@INC) { 100 use File::Spec; 101 $f = File::Spec->catfile($d, "unicore", @path); 102 last if open($$rfh, $f); 103 undef $f; 104 } 105 croak __PACKAGE__, ": failed to find ", 106 File::Spec->catfile(@path), " in @INC" 107 unless defined $f; 108 } 109 return $f; 110} 111 112=head2 B<charinfo()> 113 114 use Unicode::UCD 'charinfo'; 115 116 my $charinfo = charinfo(0x41); 117 118This returns information about the input L</code point argument> 119as a reference to a hash of fields as defined by the Unicode 120standard. If the L</code point argument> is not assigned in the standard 121(i.e., has the general category C<Cn> meaning C<Unassigned>) 122or is a non-character (meaning it is guaranteed to never be assigned in 123the standard), 124B<undef> is returned. 125 126Fields that aren't applicable to the particular code point argument exist in the 127returned hash, and are empty. 128 129The keys in the hash with the meanings of their values are: 130 131=over 132 133=item B<code> 134 135the input L</code point argument> expressed in hexadecimal, with leading zeros 136added if necessary to make it contain at least four hexdigits 137 138=item B<name> 139 140name of I<code>, all IN UPPER CASE. 141Some control-type code points do not have names. 142This field will be empty for C<Surrogate> and C<Private Use> code points, 143and for the others without a name, 144it will contain a description enclosed in angle brackets, like 145C<E<lt>controlE<gt>>. 146 147 148=item B<category> 149 150The short name of the general category of I<code>. 151This will match one of the keys in the hash returned by L</general_categories()>. 152 153=item B<combining> 154 155the combining class number for I<code> used in the Canonical Ordering Algorithm. 156For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior> 157available at 158L<http://www.unicode.org/versions/Unicode5.1.0/> 159 160=item B<bidi> 161 162bidirectional type of I<code>. 163This will match one of the keys in the hash returned by L</bidi_types()>. 164 165=item B<decomposition> 166 167is empty if I<code> has no decomposition; or is one or more codes 168(separated by spaces) that taken in order represent a decomposition for 169I<code>. Each has at least four hexdigits. 170The codes may be preceded by a word enclosed in angle brackets then a space, 171like C<E<lt>compatE<gt> >, giving the type of decomposition 172 173=item B<decimal> 174 175if I<code> is a decimal digit this is its integer numeric value 176 177=item B<digit> 178 179if I<code> represents a whole number, this is its integer numeric value 180 181=item B<numeric> 182 183if I<code> represents a whole or rational number, this is its numeric value. 184Rational values are expressed as a string like C<1/4>. 185 186=item B<mirrored> 187 188C<Y> or C<N> designating if I<code> is mirrored in bidirectional text 189 190=item B<unicode10> 191 192name of I<code> in the Unicode 1.0 standard if one 193existed for this code point and is different from the current name 194 195=item B<comment> 196 197ISO 10646 comment field. 198It appears in parentheses in the ISO 10646 names list, 199or contains an asterisk to indicate there is 200a note for this code point in Annex P of that standard. 201 202=item B<upper> 203 204is empty if there is no single code point uppercase mapping for I<code>; 205otherwise it is that mapping expressed as at least four hexdigits. 206(L</casespec()> should be used in addition to B<charinfo()> 207for case mappings when the calling program can cope with multiple code point 208mappings.) 209 210=item B<lower> 211 212is empty if there is no single code point lowercase mapping for I<code>; 213otherwise it is that mapping expressed as at least four hexdigits. 214(L</casespec()> should be used in addition to B<charinfo()> 215for case mappings when the calling program can cope with multiple code point 216mappings.) 217 218=item B<title> 219 220is empty if there is no single code point titlecase mapping for I<code>; 221otherwise it is that mapping expressed as at least four hexdigits. 222(L</casespec()> should be used in addition to B<charinfo()> 223for case mappings when the calling program can cope with multiple code point 224mappings.) 225 226=item B<block> 227 228block I<code> belongs to (used in \p{In...}). 229See L</Blocks versus Scripts>. 230 231 232=item B<script> 233 234script I<code> belongs to. 235See L</Blocks versus Scripts>. 236 237=back 238 239Note that you cannot do (de)composition and casing based solely on the 240I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields; 241you will need also the L</compexcl()>, and L</casespec()> functions. 242 243=cut 244 245# NB: This function is duplicated in charnames.pm 246sub _getcode { 247 my $arg = shift; 248 249 if ($arg =~ /^[1-9]\d*$/) { 250 return $arg; 251 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { 252 return hex($1); 253 } 254 255 return; 256} 257 258# Lingua::KO::Hangul::Util not part of the standard distribution 259# but it will be used if available. 260 261eval { require Lingua::KO::Hangul::Util }; 262my $hasHangulUtil = ! $@; 263if ($hasHangulUtil) { 264 Lingua::KO::Hangul::Util->import(); 265} 266 267sub hangul_decomp { # internal: called from charinfo 268 if ($hasHangulUtil) { 269 my @tmp = decomposeHangul(shift); 270 return sprintf("%04X %04X", @tmp) if @tmp == 2; 271 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3; 272 } 273 return; 274} 275 276sub hangul_charname { # internal: called from charinfo 277 return sprintf("HANGUL SYLLABLE-%04X", shift); 278} 279 280sub han_charname { # internal: called from charinfo 281 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift); 282} 283 284# Overwritten by data in file 285my %first_last = ( 286 'CJK Ideograph Extension A' => [ 0x3400, 0x4DB5 ], 287 'CJK Ideograph' => [ 0x4E00, 0x9FA5 ], 288 'CJK Ideograph Extension B' => [ 0x20000, 0x2A6D6 ], 289); 290 291get_charinfo_ranges(); 292 293sub get_charinfo_ranges { 294 my @blocks = keys %first_last; 295 296 my $fh; 297 openunicode( \$fh, 'UnicodeData.txt' ); 298 if( defined $fh ){ 299 while( my $line = <$fh> ){ 300 next unless $line =~ /(?:First|Last)/; 301 if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){ 302 my ($number,$block,$type); 303 ($number,$block) = split /;/, $line; 304 $block =~ s/<|>//g; 305 ($block,$type) = split /, /, $block; 306 my $index = $type eq 'First' ? 0 : 1; 307 $first_last{ $block }->[$index] = hex $number; 308 } 309 } 310 } 311} 312 313my @CharinfoRanges = ( 314# block name 315# [ first, last, coderef to name, coderef to decompose ], 316# CJK Ideographs Extension A 317 [ @{ $first_last{'CJK Ideograph Extension A'} }, \&han_charname, undef ], 318# CJK Ideographs 319 [ @{ $first_last{'CJK Ideograph'} }, \&han_charname, undef ], 320# Hangul Syllables 321 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ], 322# Non-Private Use High Surrogates 323 [ 0xD800, 0xDB7F, undef, undef ], 324# Private Use High Surrogates 325 [ 0xDB80, 0xDBFF, undef, undef ], 326# Low Surrogates 327 [ 0xDC00, 0xDFFF, undef, undef ], 328# The Private Use Area 329 [ 0xE000, 0xF8FF, undef, undef ], 330# CJK Ideographs Extension B 331 [ @{ $first_last{'CJK Ideograph Extension B'} }, \&han_charname, undef ], 332# Plane 15 Private Use Area 333 [ 0xF0000, 0xFFFFD, undef, undef ], 334# Plane 16 Private Use Area 335 [ 0x100000, 0x10FFFD, undef, undef ], 336); 337 338sub charinfo { 339 my $arg = shift; 340 my $code = _getcode($arg); 341 croak __PACKAGE__, "::charinfo: unknown code '$arg'" 342 unless defined $code; 343 my $hexk = sprintf("%06X", $code); 344 my($rcode,$rname,$rdec); 345 foreach my $range (@CharinfoRanges){ 346 if ($range->[0] <= $code && $code <= $range->[1]) { 347 $rcode = $hexk; 348 $rcode =~ s/^0+//; 349 $rcode = sprintf("%04X", hex($rcode)); 350 $rname = $range->[2] ? $range->[2]->($code) : ''; 351 $rdec = $range->[3] ? $range->[3]->($code) : ''; 352 $hexk = sprintf("%06X", $range->[0]); # replace by the first 353 last; 354 } 355 } 356 openunicode(\$UNICODEFH, "UnicodeData.txt"); 357 if (defined $UNICODEFH) { 358 use Search::Dict 1.02; 359 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { 360 my $line = <$UNICODEFH>; 361 return unless defined $line; 362 chomp $line; 363 my %prop; 364 @prop{qw( 365 code name category 366 combining bidi decomposition 367 decimal digit numeric 368 mirrored unicode10 comment 369 upper lower title 370 )} = split(/;/, $line, -1); 371 $hexk =~ s/^0+//; 372 $hexk = sprintf("%04X", hex($hexk)); 373 if ($prop{code} eq $hexk) { 374 $prop{block} = charblock($code); 375 $prop{script} = charscript($code); 376 if(defined $rname){ 377 $prop{code} = $rcode; 378 $prop{name} = $rname; 379 $prop{decomposition} = $rdec; 380 } 381 return \%prop; 382 } 383 } 384 } 385 return; 386} 387 388sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. 389 my ($table, $lo, $hi, $code) = @_; 390 391 return if $lo > $hi; 392 393 my $mid = int(($lo+$hi) / 2); 394 395 if ($table->[$mid]->[0] < $code) { 396 if ($table->[$mid]->[1] >= $code) { 397 return $table->[$mid]->[2]; 398 } else { 399 _search($table, $mid + 1, $hi, $code); 400 } 401 } elsif ($table->[$mid]->[0] > $code) { 402 _search($table, $lo, $mid - 1, $code); 403 } else { 404 return $table->[$mid]->[2]; 405 } 406} 407 408sub charinrange { 409 my ($range, $arg) = @_; 410 my $code = _getcode($arg); 411 croak __PACKAGE__, "::charinrange: unknown code '$arg'" 412 unless defined $code; 413 _search($range, 0, $#$range, $code); 414} 415 416=head2 B<charblock()> 417 418 use Unicode::UCD 'charblock'; 419 420 my $charblock = charblock(0x41); 421 my $charblock = charblock(1234); 422 my $charblock = charblock(0x263a); 423 my $charblock = charblock("U+263a"); 424 425 my $range = charblock('Armenian'); 426 427With a L</code point argument> charblock() returns the I<block> the code point 428belongs to, e.g. C<Basic Latin>. 429If the code point is unassigned, this returns the block it would belong to if 430it were assigned (which it may in future versions of the Unicode Standard). 431 432See also L</Blocks versus Scripts>. 433 434If supplied with an argument that can't be a code point, charblock() tries 435to do the opposite and interpret the argument as a code point block. The 436return value is a I<range>: an anonymous list of lists that contain 437I<start-of-range>, I<end-of-range> code point pairs. You can test whether 438a code point is in a range using the L</charinrange()> function. If the 439argument is not a known code point block, B<undef> is returned. 440 441=cut 442 443my @BLOCKS; 444my %BLOCKS; 445 446sub _charblocks { 447 unless (@BLOCKS) { 448 if (openunicode(\$BLOCKSFH, "Blocks.txt")) { 449 local $_; 450 while (<$BLOCKSFH>) { 451 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { 452 my ($lo, $hi) = (hex($1), hex($2)); 453 my $subrange = [ $lo, $hi, $3 ]; 454 push @BLOCKS, $subrange; 455 push @{$BLOCKS{$3}}, $subrange; 456 } 457 } 458 close($BLOCKSFH); 459 } 460 } 461} 462 463sub charblock { 464 my $arg = shift; 465 466 _charblocks() unless @BLOCKS; 467 468 my $code = _getcode($arg); 469 470 if (defined $code) { 471 _search(\@BLOCKS, 0, $#BLOCKS, $code); 472 } else { 473 if (exists $BLOCKS{$arg}) { 474 return dclone $BLOCKS{$arg}; 475 } else { 476 return; 477 } 478 } 479} 480 481=head2 B<charscript()> 482 483 use Unicode::UCD 'charscript'; 484 485 my $charscript = charscript(0x41); 486 my $charscript = charscript(1234); 487 my $charscript = charscript("U+263a"); 488 489 my $range = charscript('Thai'); 490 491With a L</code point argument> charscript() returns the I<script> the 492code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>. 493If the code point is unassigned, it returns B<undef> 494 495If supplied with an argument that can't be a code point, charscript() tries 496to do the opposite and interpret the argument as a code point script. The 497return value is a I<range>: an anonymous list of lists that contain 498I<start-of-range>, I<end-of-range> code point pairs. You can test whether a 499code point is in a range using the L</charinrange()> function. If the 500argument is not a known code point script, B<undef> is returned. 501 502See also L</Blocks versus Scripts>. 503 504=cut 505 506my @SCRIPTS; 507my %SCRIPTS; 508 509sub _charscripts { 510 unless (@SCRIPTS) { 511 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { 512 local $_; 513 while (<$SCRIPTSFH>) { 514 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { 515 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); 516 my $script = lc($3); 517 $script =~ s/\b(\w)/uc($1)/ge; 518 my $subrange = [ $lo, $hi, $script ]; 519 push @SCRIPTS, $subrange; 520 push @{$SCRIPTS{$script}}, $subrange; 521 } 522 } 523 close($SCRIPTSFH); 524 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; 525 } 526 } 527} 528 529sub charscript { 530 my $arg = shift; 531 532 _charscripts() unless @SCRIPTS; 533 534 my $code = _getcode($arg); 535 536 if (defined $code) { 537 _search(\@SCRIPTS, 0, $#SCRIPTS, $code); 538 } else { 539 if (exists $SCRIPTS{$arg}) { 540 return dclone $SCRIPTS{$arg}; 541 } else { 542 return; 543 } 544 } 545} 546 547=head2 B<charblocks()> 548 549 use Unicode::UCD 'charblocks'; 550 551 my $charblocks = charblocks(); 552 553charblocks() returns a reference to a hash with the known block names 554as the keys, and the code point ranges (see L</charblock()>) as the values. 555 556See also L</Blocks versus Scripts>. 557 558=cut 559 560sub charblocks { 561 _charblocks() unless %BLOCKS; 562 return dclone \%BLOCKS; 563} 564 565=head2 B<charscripts()> 566 567 use Unicode::UCD 'charscripts'; 568 569 my $charscripts = charscripts(); 570 571charscripts() returns a reference to a hash with the known script 572names as the keys, and the code point ranges (see L</charscript()>) as 573the values. 574 575See also L</Blocks versus Scripts>. 576 577=cut 578 579sub charscripts { 580 _charscripts() unless %SCRIPTS; 581 return dclone \%SCRIPTS; 582} 583 584=head2 B<charinrange()> 585 586In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you 587can also test whether a code point is in the I<range> as returned by 588L</charblock()> and L</charscript()> or as the values of the hash returned 589by L</charblocks()> and L</charscripts()> by using charinrange(): 590 591 use Unicode::UCD qw(charscript charinrange); 592 593 $range = charscript('Hiragana'); 594 print "looks like hiragana\n" if charinrange($range, $codepoint); 595 596=cut 597 598my %GENERAL_CATEGORIES = 599 ( 600 'L' => 'Letter', 601 'LC' => 'CasedLetter', 602 'Lu' => 'UppercaseLetter', 603 'Ll' => 'LowercaseLetter', 604 'Lt' => 'TitlecaseLetter', 605 'Lm' => 'ModifierLetter', 606 'Lo' => 'OtherLetter', 607 'M' => 'Mark', 608 'Mn' => 'NonspacingMark', 609 'Mc' => 'SpacingMark', 610 'Me' => 'EnclosingMark', 611 'N' => 'Number', 612 'Nd' => 'DecimalNumber', 613 'Nl' => 'LetterNumber', 614 'No' => 'OtherNumber', 615 'P' => 'Punctuation', 616 'Pc' => 'ConnectorPunctuation', 617 'Pd' => 'DashPunctuation', 618 'Ps' => 'OpenPunctuation', 619 'Pe' => 'ClosePunctuation', 620 'Pi' => 'InitialPunctuation', 621 'Pf' => 'FinalPunctuation', 622 'Po' => 'OtherPunctuation', 623 'S' => 'Symbol', 624 'Sm' => 'MathSymbol', 625 'Sc' => 'CurrencySymbol', 626 'Sk' => 'ModifierSymbol', 627 'So' => 'OtherSymbol', 628 'Z' => 'Separator', 629 'Zs' => 'SpaceSeparator', 630 'Zl' => 'LineSeparator', 631 'Zp' => 'ParagraphSeparator', 632 'C' => 'Other', 633 'Cc' => 'Control', 634 'Cf' => 'Format', 635 'Cs' => 'Surrogate', 636 'Co' => 'PrivateUse', 637 'Cn' => 'Unassigned', 638 ); 639 640sub general_categories { 641 return dclone \%GENERAL_CATEGORIES; 642} 643 644=head2 B<general_categories()> 645 646 use Unicode::UCD 'general_categories'; 647 648 my $categories = general_categories(); 649 650This returns a reference to a hash which has short 651general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long 652names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>, 653C<Symbol>) as values. The hash is reversible in case you need to go 654from the long names to the short names. The general category is the 655one returned from 656L</charinfo()> under the C<category> key. 657 658=cut 659 660my %BIDI_TYPES = 661 ( 662 'L' => 'Left-to-Right', 663 'LRE' => 'Left-to-Right Embedding', 664 'LRO' => 'Left-to-Right Override', 665 'R' => 'Right-to-Left', 666 'AL' => 'Right-to-Left Arabic', 667 'RLE' => 'Right-to-Left Embedding', 668 'RLO' => 'Right-to-Left Override', 669 'PDF' => 'Pop Directional Format', 670 'EN' => 'European Number', 671 'ES' => 'European Number Separator', 672 'ET' => 'European Number Terminator', 673 'AN' => 'Arabic Number', 674 'CS' => 'Common Number Separator', 675 'NSM' => 'Non-Spacing Mark', 676 'BN' => 'Boundary Neutral', 677 'B' => 'Paragraph Separator', 678 'S' => 'Segment Separator', 679 'WS' => 'Whitespace', 680 'ON' => 'Other Neutrals', 681 ); 682 683=head2 B<bidi_types()> 684 685 use Unicode::UCD 'bidi_types'; 686 687 my $categories = bidi_types(); 688 689This returns a reference to a hash which has the short 690bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long 691names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The 692hash is reversible in case you need to go from the long names to the 693short names. The bidi type is the one returned from 694L</charinfo()> 695under the C<bidi> key. For the exact meaning of the various bidi classes 696the Unicode TR9 is recommended reading: 697L<http://www.unicode.org/reports/tr9/> 698(as of Unicode 5.0.0) 699 700=cut 701 702sub bidi_types { 703 return dclone \%BIDI_TYPES; 704} 705 706=head2 B<compexcl()> 707 708 use Unicode::UCD 'compexcl'; 709 710 my $compexcl = compexcl(0x09dc); 711 712This returns B<true> if the 713L</code point argument> should not be produced by composition normalization, 714B<AND> if that fact is not otherwise determinable from the Unicode data base. 715It currently does not return B<true> if the code point has a decomposition 716consisting of another single code point, nor if its decomposition starts 717with a code point whose combining class is non-zero. Code points that meet 718either of these conditions should also not be produced by composition 719normalization. 720 721It returns B<false> otherwise. 722 723=cut 724 725my %COMPEXCL; 726 727sub _compexcl { 728 unless (%COMPEXCL) { 729 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { 730 local $_; 731 while (<$COMPEXCLFH>) { 732 if (/^([0-9A-F]+)\s+\#\s+/) { 733 my $code = hex($1); 734 $COMPEXCL{$code} = undef; 735 } 736 } 737 close($COMPEXCLFH); 738 } 739 } 740} 741 742sub compexcl { 743 my $arg = shift; 744 my $code = _getcode($arg); 745 croak __PACKAGE__, "::compexcl: unknown code '$arg'" 746 unless defined $code; 747 748 _compexcl() unless %COMPEXCL; 749 750 return exists $COMPEXCL{$code}; 751} 752 753=head2 B<casefold()> 754 755 use Unicode::UCD 'casefold'; 756 757 my $casefold = casefold(0xDF); 758 if (defined $casefold) { 759 my @full_fold_hex = split / /, $casefold->{'full'}; 760 my $full_fold_string = 761 join "", map {chr(hex($_))} @full_fold_hex; 762 my @turkic_fold_hex = 763 split / /, ($casefold->{'turkic'} ne "") 764 ? $casefold->{'turkic'} 765 : $casefold->{'full'}; 766 my $turkic_fold_string = 767 join "", map {chr(hex($_))} @turkic_fold_hex; 768 } 769 if (defined $casefold && $casefold->{'simple'} ne "") { 770 my $simple_fold_hex = $casefold->{'simple'}; 771 my $simple_fold_string = chr(hex($simple_fold_hex)); 772 } 773 774This returns the (almost) locale-independent case folding of the 775character specified by the L</code point argument>. 776 777If there is no case folding for that code point, B<undef> is returned. 778 779If there is a case folding for that code point, a reference to a hash 780with the following fields is returned: 781 782=over 783 784=item B<code> 785 786the input L</code point argument> expressed in hexadecimal, with leading zeros 787added if necessary to make it contain at least four hexdigits 788 789=item B<full> 790 791one or more codes (separated by spaces) that taken in order give the 792code points for the case folding for I<code>. 793Each has at least four hexdigits. 794 795=item B<simple> 796 797is empty, or is exactly one code with at least four hexdigits which can be used 798as an alternative case folding when the calling program cannot cope with the 799fold being a sequence of multiple code points. If I<full> is just one code 800point, then I<simple> equals I<full>. If there is no single code point folding 801defined for I<code>, then I<simple> is the empty string. Otherwise, it is an 802inferior, but still better-than-nothing alternative folding to I<full>. 803 804=item B<mapping> 805 806is the same as I<simple> if I<simple> is not empty, and it is the same as I<full> 807otherwise. It can be considered to be the simplest possible folding for 808I<code>. It is defined primarily for backwards compatibility. 809 810=item B<status> 811 812is C<C> (for C<common>) if the best possible fold is a single code point 813(I<simple> equals I<full> equals I<mapping>). It is C<S> if there are distinct 814folds, I<simple> and I<full> (I<mapping> equals I<simple>). And it is C<F> if 815there only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty). Note 816that this 817describes the contents of I<mapping>. It is defined primarily for backwards 818compatibility. 819 820On versions 3.1 and earlier of Unicode, I<status> can also be 821C<I> which is the same as C<C> but is a special case for dotted uppercase I and 822dotless lowercase i: 823 824=over 825 826=item B<*> 827 828If you use this C<I> mapping, the result is case-insensitive, 829but dotless and dotted I's are not distinguished 830 831=item B<*> 832 833If you exclude this C<I> mapping, the result is not fully case-insensitive, but 834dotless and dotted I's are distinguished 835 836=back 837 838=item B<turkic> 839 840contains any special folding for Turkic languages. For versions of Unicode 841starting with 3.2, this field is empty unless I<code> has a different folding 842in Turkic languages, in which case it is one or more codes (separated by 843spaces) that taken in order give the code points for the case folding for 844I<code> in those languages. 845Each code has at least four hexdigits. 846Note that this folding does not maintain canonical equivalence without 847additional processing. 848 849For versions of Unicode 3.1 and earlier, this field is empty unless there is a 850special folding for Turkic languages, in which case I<status> is C<I>, and 851I<mapping>, I<full>, I<simple>, and I<turkic> are all equal. 852 853=back 854 855Programs that want complete generality and the best folding results should use 856the folding contained in the I<full> field. But note that the fold for some 857code points will be a sequence of multiple code points. 858 859Programs that can't cope with the fold mapping being multiple code points can 860use the folding contained in the I<simple> field, with the loss of some 861generality. In Unicode 5.1, about 7% of the defined foldings have no single 862code point folding. 863 864The I<mapping> and I<status> fields are provided for backwards compatibility for 865existing programs. They contain the same values as in previous versions of 866this function. 867 868Locale is not completely independent. The I<turkic> field contains results to 869use when the locale is a Turkic language. 870 871For more information about case mappings see 872L<http://www.unicode.org/unicode/reports/tr21> 873 874=cut 875 876my %CASEFOLD; 877 878sub _casefold { 879 unless (%CASEFOLD) { 880 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { 881 local $_; 882 while (<$CASEFOLDFH>) { 883 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { 884 my $code = hex($1); 885 $CASEFOLD{$code}{'code'} = $1; 886 $CASEFOLD{$code}{'turkic'} = "" unless 887 defined $CASEFOLD{$code}{'turkic'}; 888 if ($2 eq 'C' || $2 eq 'I') { # 'I' is only on 3.1 and 889 # earlier Unicodes 890 # Both entries there (I 891 # only checked 3.1) are 892 # the same as C, and 893 # there are no other 894 # entries for those 895 # codepoints, so treat 896 # as if C, but override 897 # the turkic one for 898 # 'I'. 899 $CASEFOLD{$code}{'status'} = $2; 900 $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} = 901 $CASEFOLD{$code}{'mapping'} = $3; 902 $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I'; 903 } elsif ($2 eq 'F') { 904 $CASEFOLD{$code}{'full'} = $3; 905 unless (defined $CASEFOLD{$code}{'simple'}) { 906 $CASEFOLD{$code}{'simple'} = ""; 907 $CASEFOLD{$code}{'mapping'} = $3; 908 $CASEFOLD{$code}{'status'} = $2; 909 } 910 } elsif ($2 eq 'S') { 911 912 913 # There can't be a simple without a full, and simple 914 # overrides all but full 915 916 $CASEFOLD{$code}{'simple'} = $3; 917 $CASEFOLD{$code}{'mapping'} = $3; 918 $CASEFOLD{$code}{'status'} = $2; 919 } elsif ($2 eq 'T') { 920 $CASEFOLD{$code}{'turkic'} = $3; 921 } # else can't happen because only [CIFST] are possible 922 } 923 } 924 close($CASEFOLDFH); 925 } 926 } 927} 928 929sub casefold { 930 my $arg = shift; 931 my $code = _getcode($arg); 932 croak __PACKAGE__, "::casefold: unknown code '$arg'" 933 unless defined $code; 934 935 _casefold() unless %CASEFOLD; 936 937 return $CASEFOLD{$code}; 938} 939 940=head2 B<casespec()> 941 942 use Unicode::UCD 'casespec'; 943 944 my $casespec = casespec(0xFB00); 945 946This returns the potentially locale-dependent case mappings of the L</code point 947argument>. The mappings may be longer than a single code point (which the basic 948Unicode case mappings as returned by L</charinfo()> never are). 949 950If there are no case mappings for the L</code point argument>, or if all three 951possible mappings (I<lower>, I<title> and I<upper>) result in single code 952points and are locale independent and unconditional, B<undef> is returned 953(which means that the case mappings, if any, for the code point are those 954returned by L</charinfo()>). 955 956Otherwise, a reference to a hash giving the mappings (or a reference to a hash 957of such hashes, explained below) is returned with the following keys and their 958meanings: 959 960The keys in the bottom layer hash with the meanings of their values are: 961 962=over 963 964=item B<code> 965 966the input L</code point argument> expressed in hexadecimal, with leading zeros 967added if necessary to make it contain at least four hexdigits 968 969=item B<lower> 970 971one or more codes (separated by spaces) that taken in order give the 972code points for the lower case of I<code>. 973Each has at least four hexdigits. 974 975=item B<title> 976 977one or more codes (separated by spaces) that taken in order give the 978code points for the title case of I<code>. 979Each has at least four hexdigits. 980 981=item B<lower> 982 983one or more codes (separated by spaces) that taken in order give the 984code points for the upper case of I<code>. 985Each has at least four hexdigits. 986 987=item B<condition> 988 989the conditions for the mappings to be valid. 990If B<undef>, the mappings are always valid. 991When defined, this field is a list of conditions, 992all of which must be true for the mappings to be valid. 993The list consists of one or more 994I<locales> (see below) 995and/or I<contexts> (explained in the next paragraph), 996separated by spaces. 997(Other than as used to separate elements, spaces are to be ignored.) 998Case distinctions in the condition list are not significant. 999Conditions preceded by "NON_" represent the negation of the condition. 1000 1001A I<context> is one of those defined in the Unicode standard. 1002For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations> 1003available at 1004L<http://www.unicode.org/versions/Unicode5.1.0/>. 1005These are for context-sensitive casing. 1006 1007=back 1008 1009The hash described above is returned for locale-independent casing, where 1010at least one of the mappings has length longer than one. If B<undef> is 1011returned, the code point may have mappings, but if so, all are length one, 1012and are returned by L</charinfo()>. 1013Note that when this function does return a value, it will be for the complete 1014set of mappings for a code point, even those whose length is one. 1015 1016If there are additional casing rules that apply only in certain locales, 1017an additional key for each will be defined in the returned hash. Each such key 1018will be its locale name, defined as a 2-letter ISO 3166 country code, possibly 1019followed by a "_" and a 2-letter ISO language code (possibly followed by a "_" 1020and a variant code). You can find the lists of all possible locales, see 1021L<Locale::Country> and L<Locale::Language>. 1022(In Unicode 5.1, the only locales returned by this function 1023are C<lt>, C<tr>, and C<az>.) 1024 1025Each locale key is a reference to a hash that has the form above, and gives 1026the casing rules for that particular locale, which take precedence over the 1027locale-independent ones when in that locale. 1028 1029If the only casing for a code point is locale-dependent, then the returned 1030hash will not have any of the base keys, like C<code>, C<upper>, etc., but 1031will contain only locale keys. 1032 1033For more information about case mappings see 1034L<http://www.unicode.org/unicode/reports/tr21/> 1035 1036=cut 1037 1038my %CASESPEC; 1039 1040sub _casespec { 1041 unless (%CASESPEC) { 1042 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { 1043 local $_; 1044 while (<$CASESPECFH>) { 1045 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { 1046 my ($hexcode, $lower, $title, $upper, $condition) = 1047 ($1, $2, $3, $4, $5); 1048 my $code = hex($hexcode); 1049 if (exists $CASESPEC{$code}) { 1050 if (exists $CASESPEC{$code}->{code}) { 1051 my ($oldlower, 1052 $oldtitle, 1053 $oldupper, 1054 $oldcondition) = 1055 @{$CASESPEC{$code}}{qw(lower 1056 title 1057 upper 1058 condition)}; 1059 if (defined $oldcondition) { 1060 my ($oldlocale) = 1061 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); 1062 delete $CASESPEC{$code}; 1063 $CASESPEC{$code}->{$oldlocale} = 1064 { code => $hexcode, 1065 lower => $oldlower, 1066 title => $oldtitle, 1067 upper => $oldupper, 1068 condition => $oldcondition }; 1069 } 1070 } 1071 my ($locale) = 1072 ($condition =~ /^([a-z][a-z](?:_\S+)?)/); 1073 $CASESPEC{$code}->{$locale} = 1074 { code => $hexcode, 1075 lower => $lower, 1076 title => $title, 1077 upper => $upper, 1078 condition => $condition }; 1079 } else { 1080 $CASESPEC{$code} = 1081 { code => $hexcode, 1082 lower => $lower, 1083 title => $title, 1084 upper => $upper, 1085 condition => $condition }; 1086 } 1087 } 1088 } 1089 close($CASESPECFH); 1090 } 1091 } 1092} 1093 1094sub casespec { 1095 my $arg = shift; 1096 my $code = _getcode($arg); 1097 croak __PACKAGE__, "::casespec: unknown code '$arg'" 1098 unless defined $code; 1099 1100 _casespec() unless %CASESPEC; 1101 1102 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; 1103} 1104 1105=head2 B<namedseq()> 1106 1107 use Unicode::UCD 'namedseq'; 1108 1109 my $namedseq = namedseq("KATAKANA LETTER AINU P"); 1110 my @namedseq = namedseq("KATAKANA LETTER AINU P"); 1111 my %namedseq = namedseq(); 1112 1113If used with a single argument in a scalar context, returns the string 1114consisting of the code points of the named sequence, or B<undef> if no 1115named sequence by that name exists. If used with a single argument in 1116a list context, it returns the list of the ordinals of the code points. If used 1117with no 1118arguments in a list context, returns a hash with the names of the 1119named sequences as the keys and the named sequences as strings as 1120the values. Otherwise, it returns B<undef> or an empty list depending 1121on the context. 1122 1123This function only operates on officially approved (not provisional) named 1124sequences. 1125 1126=cut 1127 1128my %NAMEDSEQ; 1129 1130sub _namedseq { 1131 unless (%NAMEDSEQ) { 1132 if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) { 1133 local $_; 1134 while (<$NAMEDSEQFH>) { 1135 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) { 1136 my ($n, $s) = ($1, $2); 1137 my @s = map { chr(hex($_)) } split(' ', $s); 1138 $NAMEDSEQ{$n} = join("", @s); 1139 } 1140 } 1141 close($NAMEDSEQFH); 1142 } 1143 } 1144} 1145 1146sub namedseq { 1147 _namedseq() unless %NAMEDSEQ; 1148 my $wantarray = wantarray(); 1149 if (defined $wantarray) { 1150 if ($wantarray) { 1151 if (@_ == 0) { 1152 return %NAMEDSEQ; 1153 } elsif (@_ == 1) { 1154 my $s = $NAMEDSEQ{ $_[0] }; 1155 return defined $s ? map { ord($_) } split('', $s) : (); 1156 } 1157 } elsif (@_ == 1) { 1158 return $NAMEDSEQ{ $_[0] }; 1159 } 1160 } 1161 return; 1162} 1163 1164=head2 Unicode::UCD::UnicodeVersion 1165 1166This returns the version of the Unicode Character Database, in other words, the 1167version of the Unicode standard the database implements. The version is a 1168string of numbers delimited by dots (C<'.'>). 1169 1170=cut 1171 1172my $UNICODEVERSION; 1173 1174sub UnicodeVersion { 1175 unless (defined $UNICODEVERSION) { 1176 openunicode(\$VERSIONFH, "version"); 1177 chomp($UNICODEVERSION = <$VERSIONFH>); 1178 close($VERSIONFH); 1179 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" 1180 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; 1181 } 1182 return $UNICODEVERSION; 1183} 1184 1185=head2 B<Blocks versus Scripts> 1186 1187The difference between a block and a script is that scripts are closer 1188to the linguistic notion of a set of code points required to present 1189languages, while block is more of an artifact of the Unicode code point 1190numbering and separation into blocks of (mostly) 256 code points. 1191 1192For example the Latin B<script> is spread over several B<blocks>, such 1193as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and 1194C<Latin Extended-B>. On the other hand, the Latin script does not 1195contain all the characters of the C<Basic Latin> block (also known as 1196ASCII): it includes only the letters, and not, for example, the digits 1197or the punctuation. 1198 1199For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt> 1200 1201For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/> 1202 1203=head2 B<Matching Scripts and Blocks> 1204 1205Scripts are matched with the regular-expression construct 1206C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script), 1207while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches 1208any of the 256 code points in the Tibetan block). 1209 1210 1211=head2 Implementation Note 1212 1213The first use of charinfo() opens a read-only filehandle to the Unicode 1214Character Database (the database is included in the Perl distribution). 1215The filehandle is then kept open for further queries. In other words, 1216if you are wondering where one of your filehandles went, that's where. 1217 1218=head1 BUGS 1219 1220Does not yet support EBCDIC platforms. 1221 1222L</compexcl()> should give a complete list of excluded code points. 1223 1224=head1 AUTHOR 1225 1226Jarkko Hietaniemi 1227 1228=cut 1229 12301; 1231