1package Unicode::Normalize; 2 3use 5.006; 4use strict; 5use warnings; 6use Carp; 7 8no warnings 'utf8'; 9 10our $VERSION = '1.32'; 11our $PACKAGE = __PACKAGE__; 12 13our @EXPORT = qw( NFC NFD NFKC NFKD ); 14our @EXPORT_OK = qw( 15 normalize decompose reorder compose 16 checkNFD checkNFKD checkNFC checkNFKC check 17 getCanon getCompat getComposite getCombinClass 18 isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex 19 isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE 20 FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter 21 normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial 22); 23our %EXPORT_TAGS = ( 24 all => [ @EXPORT, @EXPORT_OK ], 25 normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ], 26 check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ], 27 fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ], 28); 29 30## 31## utilities for tests 32## 33 34 # No EBCDIC support on early perls 35*to_native = ($::IS_ASCII || $] < 5.008) 36 ? sub { return shift } 37 : sub { utf8::unicode_to_native(shift) }; 38 39*from_native = ($::IS_ASCII || $] < 5.008) 40 ? sub { return shift } 41 : sub { utf8::native_to_unicode(shift) }; 42 43# The .t files are all in terms of Unicode, so xlate to/from native 44sub dot_t_pack_U { 45 return pack('U*', map { to_native($_) } @_); 46} 47 48sub dot_t_unpack_U { 49 50 # The empty pack returns an empty UTF-8 string, so the effect is to force 51 # the shifted parameter into being UTF-8. This allows this to work on 52 # Perl 5.6, where there is no utf8::upgrade(). 53 return map { from_native($_) } unpack('U*', shift(@_).pack('U*')); 54} 55 56sub get_printable_string ($) { 57 use bytes; 58 my $s = shift; 59 60 # DeMorgan's laws cause this to mean ascii printables 61 return $s if $s =~ /[^[:^ascii:][:^print:]]/; 62 63 return join " ", map { sprintf "\\x%02x", ord $_ } split "", $s; 64} 65 66sub ok ($$;$) { 67 my $count_ref = shift; # Test number in caller 68 my $p = my $r = shift; 69 my $x; 70 if (@_) { 71 $x = shift; 72 $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; 73 } 74 75 print $p ? "ok" : "not ok", ' ', ++$$count_ref, "\n"; 76 77 return if $p; 78 79 my (undef, $file, $line) = caller(1); 80 print STDERR "# Failed test $$count_ref at $file line $line\n"; 81 82 return unless defined $x; 83 84 print STDERR "# got ", get_printable_string($r), "\n"; 85 print STDERR "# expected ", get_printable_string($x), "\n"; 86} 87 88require Exporter; 89 90##### The above part is common to XS and PP ##### 91 92our @ISA = qw(Exporter); 93use XSLoader (); 94XSLoader::load( 'Unicode::Normalize', $VERSION ); 95 96##### The below part is common to XS and PP ##### 97 98## 99## normalize 100## 101 102sub FCD ($) { 103 my $str = shift; 104 return checkFCD($str) ? $str : NFD($str); 105} 106 107our %formNorm = ( 108 NFC => \&NFC, C => \&NFC, 109 NFD => \&NFD, D => \&NFD, 110 NFKC => \&NFKC, KC => \&NFKC, 111 NFKD => \&NFKD, KD => \&NFKD, 112 FCD => \&FCD, FCC => \&FCC, 113); 114 115sub normalize($$) 116{ 117 my $form = shift; 118 my $str = shift; 119 if (exists $formNorm{$form}) { 120 return $formNorm{$form}->($str); 121 } 122 croak($PACKAGE."::normalize: invalid form name: $form"); 123} 124 125## 126## partial 127## 128 129sub normalize_partial ($$) { 130 if (exists $formNorm{$_[0]}) { 131 my $n = normalize($_[0], $_[1]); 132 my($p, $u) = splitOnLastStarter($n); 133 $_[1] = $u; 134 return $p; 135 } 136 croak($PACKAGE."::normalize_partial: invalid form name: $_[0]"); 137} 138 139sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) } 140sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) } 141sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) } 142sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) } 143 144## 145## check 146## 147 148our %formCheck = ( 149 NFC => \&checkNFC, C => \&checkNFC, 150 NFD => \&checkNFD, D => \&checkNFD, 151 NFKC => \&checkNFKC, KC => \&checkNFKC, 152 NFKD => \&checkNFKD, KD => \&checkNFKD, 153 FCD => \&checkFCD, FCC => \&checkFCC, 154); 155 156sub check($$) 157{ 158 my $form = shift; 159 my $str = shift; 160 if (exists $formCheck{$form}) { 161 return $formCheck{$form}->($str); 162 } 163 croak($PACKAGE."::check: invalid form name: $form"); 164} 165 1661; 167__END__ 168 169=head1 NAME 170 171Unicode::Normalize - Unicode Normalization Forms 172 173=head1 SYNOPSIS 174 175(1) using function names exported by default: 176 177 use Unicode::Normalize; 178 179 $NFD_string = NFD($string); # Normalization Form D 180 $NFC_string = NFC($string); # Normalization Form C 181 $NFKD_string = NFKD($string); # Normalization Form KD 182 $NFKC_string = NFKC($string); # Normalization Form KC 183 184(2) using function names exported on request: 185 186 use Unicode::Normalize 'normalize'; 187 188 $NFD_string = normalize('D', $string); # Normalization Form D 189 $NFC_string = normalize('C', $string); # Normalization Form C 190 $NFKD_string = normalize('KD', $string); # Normalization Form KD 191 $NFKC_string = normalize('KC', $string); # Normalization Form KC 192 193=head1 DESCRIPTION 194 195Parameters: 196 197C<$string> is used as a string under character semantics (see L<perlunicode>). 198 199C<$code_point> should be an unsigned integer representing a Unicode code point. 200 201Note: Between XSUB and pure Perl, there is an incompatibility 202about the interpretation of C<$code_point> as a decimal number. 203XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not. 204Do not use a floating point nor a negative sign in C<$code_point>. 205 206=head2 Normalization Forms 207 208=over 4 209 210=item C<$NFD_string = NFD($string)> 211 212It returns the Normalization Form D (formed by canonical decomposition). 213 214=item C<$NFC_string = NFC($string)> 215 216It returns the Normalization Form C (formed by canonical decomposition 217followed by canonical composition). 218 219=item C<$NFKD_string = NFKD($string)> 220 221It returns the Normalization Form KD (formed by compatibility decomposition). 222 223=item C<$NFKC_string = NFKC($string)> 224 225It returns the Normalization Form KC (formed by compatibility decomposition 226followed by B<canonical> composition). 227 228=item C<$FCD_string = FCD($string)> 229 230If the given string is in FCD ("Fast C or D" form; cf. UTN #5), 231it returns the string without modification; otherwise it returns an FCD string. 232 233Note: FCD is not always unique, then plural forms may be equivalent 234each other. C<FCD()> will return one of these equivalent forms. 235 236=item C<$FCC_string = FCC($string)> 237 238It returns the FCC form ("Fast C Contiguous"; cf. UTN #5). 239 240Note: FCC is unique, as well as four normalization forms (NF*). 241 242=item C<$normalized_string = normalize($form_name, $string)> 243 244It returns the normalization form of C<$form_name>. 245 246As C<$form_name>, one of the following names must be given. 247 248 'C' or 'NFC' for Normalization Form C (UAX #15) 249 'D' or 'NFD' for Normalization Form D (UAX #15) 250 'KC' or 'NFKC' for Normalization Form KC (UAX #15) 251 'KD' or 'NFKD' for Normalization Form KD (UAX #15) 252 253 'FCD' for "Fast C or D" Form (UTN #5) 254 'FCC' for "Fast C Contiguous" (UTN #5) 255 256=back 257 258=head2 Decomposition and Composition 259 260=over 4 261 262=item C<$decomposed_string = decompose($string [, $useCompatMapping])> 263 264It returns the concatenation of the decomposition of each character 265in the string. 266 267If the second parameter (a boolean) is omitted or false, 268the decomposition is canonical decomposition; 269if the second parameter (a boolean) is true, 270the decomposition is compatibility decomposition. 271 272The string returned is not always in NFD/NFKD. Reordering may be required. 273 274 $NFD_string = reorder(decompose($string)); # eq. to NFD() 275 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD() 276 277=item C<$reordered_string = reorder($string)> 278 279It returns the result of reordering the combining characters 280according to Canonical Ordering Behavior. 281 282For example, when you have a list of NFD/NFKD strings, 283you can get the concatenated NFD/NFKD string from them, by saying 284 285 $concat_NFD = reorder(join '', @NFD_strings); 286 $concat_NFKD = reorder(join '', @NFKD_strings); 287 288=item C<$composed_string = compose($string)> 289 290It returns the result of canonical composition 291without applying any decomposition. 292 293For example, when you have a NFD/NFKD string, 294you can get its NFC/NFKC string, by saying 295 296 $NFC_string = compose($NFD_string); 297 $NFKC_string = compose($NFKD_string); 298 299=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)> 300 301It returns two strings: the first one, C<$processed>, is a part 302before the last starter, and the second one, C<$unprocessed> is 303another part after the first part. A starter is a character having 304a combining class of zero (see UAX #15). 305 306Note that C<$processed> may be empty (when C<$normalized> contains no 307starter or starts with the last starter), and then C<$unprocessed> 308should be equal to the entire C<$normalized>. 309 310When you have a C<$normalized> string and an C<$unnormalized> string 311following it, a simple concatenation is wrong: 312 313 $concat = $normalized . normalize($form, $unnormalized); # wrong! 314 315Instead of it, do like this: 316 317 ($processed, $unprocessed) = splitOnLastStarter($normalized); 318 $concat = $processed . normalize($form,$unprocessed.$unnormalized); 319 320C<splitOnLastStarter()> should be called with a pre-normalized parameter 321C<$normalized>, that is in the same form as C<$form> you want. 322 323If you have an array of C<@string> that should be concatenated and then 324normalized, you can do like this: 325 326 my $result = ""; 327 my $unproc = ""; 328 foreach my $str (@string) { 329 $unproc .= $str; 330 my $n = normalize($form, $unproc); 331 my($p, $u) = splitOnLastStarter($n); 332 $result .= $p; 333 $unproc = $u; 334 } 335 $result .= $unproc; 336 # instead of normalize($form, join('', @string)) 337 338=item C<$processed = normalize_partial($form, $unprocessed)> 339 340A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>. 341Note that C<$unprocessed> will be modified as a side-effect. 342 343If you have an array of C<@string> that should be concatenated and then 344normalized, you can do like this: 345 346 my $result = ""; 347 my $unproc = ""; 348 foreach my $str (@string) { 349 $unproc .= $str; 350 $result .= normalize_partial($form, $unproc); 351 } 352 $result .= $unproc; 353 # instead of normalize($form, join('', @string)) 354 355=item C<$processed = NFD_partial($unprocessed)> 356 357It does like C<normalize_partial('NFD', $unprocessed)>. 358Note that C<$unprocessed> will be modified as a side-effect. 359 360=item C<$processed = NFC_partial($unprocessed)> 361 362It does like C<normalize_partial('NFC', $unprocessed)>. 363Note that C<$unprocessed> will be modified as a side-effect. 364 365=item C<$processed = NFKD_partial($unprocessed)> 366 367It does like C<normalize_partial('NFKD', $unprocessed)>. 368Note that C<$unprocessed> will be modified as a side-effect. 369 370=item C<$processed = NFKC_partial($unprocessed)> 371 372It does like C<normalize_partial('NFKC', $unprocessed)>. 373Note that C<$unprocessed> will be modified as a side-effect. 374 375=back 376 377=head2 Quick Check 378 379(see Annex 8, UAX #15; and F<lib/unicore/DerivedNormalizationProps.txt>) 380 381The following functions check whether the string is in that normalization form. 382 383The result returned will be one of the following: 384 385 YES The string is in that normalization form. 386 NO The string is not in that normalization form. 387 MAYBE Dubious. Maybe yes, maybe no. 388 389=over 4 390 391=item C<$result = checkNFD($string)> 392 393It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. 394 395=item C<$result = checkNFC($string)> 396 397It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; 398C<undef> if C<MAYBE>. 399 400=item C<$result = checkNFKD($string)> 401 402It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. 403 404=item C<$result = checkNFKC($string)> 405 406It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; 407C<undef> if C<MAYBE>. 408 409=item C<$result = checkFCD($string)> 410 411It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. 412 413=item C<$result = checkFCC($string)> 414 415It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; 416C<undef> if C<MAYBE>. 417 418Note: If a string is not in FCD, it must not be in FCC. 419So C<checkFCC($not_FCD_string)> should return C<NO>. 420 421=item C<$result = check($form_name, $string)> 422 423It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; 424C<undef> if C<MAYBE>. 425 426As C<$form_name>, one of the following names must be given. 427 428 'C' or 'NFC' for Normalization Form C (UAX #15) 429 'D' or 'NFD' for Normalization Form D (UAX #15) 430 'KC' or 'NFKC' for Normalization Form KC (UAX #15) 431 'KD' or 'NFKD' for Normalization Form KD (UAX #15) 432 433 'FCD' for "Fast C or D" Form (UTN #5) 434 'FCC' for "Fast C Contiguous" (UTN #5) 435 436=back 437 438B<Note> 439 440In the cases of NFD, NFKD, and FCD, the answer must be 441either C<YES> or C<NO>. The answer C<MAYBE> may be returned 442in the cases of NFC, NFKC, and FCC. 443 444A C<MAYBE> string should contain at least one combining character 445or the like. For example, C<COMBINING ACUTE ACCENT> has 446the MAYBE_NFC/MAYBE_NFKC property. 447 448Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")> 449and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>. 450C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC 451(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), 452while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. 453 454If you want to check exactly, compare the string with its NFC/NFKC/FCC. 455 456 if ($string eq NFC($string)) { 457 # $string is exactly normalized in NFC; 458 } else { 459 # $string is not normalized in NFC; 460 } 461 462 if ($string eq NFKC($string)) { 463 # $string is exactly normalized in NFKC; 464 } else { 465 # $string is not normalized in NFKC; 466 } 467 468=head2 Character Data 469 470These functions are interface of character data used internally. 471If you want only to get Unicode normalization forms, you don't need 472call them yourself. 473 474=over 4 475 476=item C<$canonical_decomposition = getCanon($code_point)> 477 478If the character is canonically decomposable (including Hangul Syllables), 479it returns the (full) canonical decomposition as a string. 480Otherwise it returns C<undef>. 481 482B<Note:> According to the Unicode standard, the canonical decomposition 483of the character that is not canonically decomposable is same as 484the character itself. 485 486=item C<$compatibility_decomposition = getCompat($code_point)> 487 488If the character is compatibility decomposable (including Hangul Syllables), 489it returns the (full) compatibility decomposition as a string. 490Otherwise it returns C<undef>. 491 492B<Note:> According to the Unicode standard, the compatibility decomposition 493of the character that is not compatibility decomposable is same as 494the character itself. 495 496=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)> 497 498If two characters here and next (as code points) are composable 499(including Hangul Jamo/Syllables and Composition Exclusions), 500it returns the code point of the composite. 501 502If they are not composable, it returns C<undef>. 503 504=item C<$combining_class = getCombinClass($code_point)> 505 506It returns the combining class (as an integer) of the character. 507 508=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)> 509 510It returns a boolean whether the character of the specified codepoint 511may be composed with the previous one in a certain composition 512(including Hangul Compositions, but excluding 513Composition Exclusions and Non-Starter Decompositions). 514 515=item C<$is_exclusion = isExclusion($code_point)> 516 517It returns a boolean whether the code point is a composition exclusion. 518 519=item C<$is_singleton = isSingleton($code_point)> 520 521It returns a boolean whether the code point is a singleton 522 523=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)> 524 525It returns a boolean whether the code point has Non-Starter Decomposition. 526 527=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)> 528 529It returns a boolean of the derived property Comp_Ex 530(Full_Composition_Exclusion). This property is generated from 531Composition Exclusions + Singletons + Non-Starter Decompositions. 532 533=item C<$NFD_is_NO = isNFD_NO($code_point)> 534 535It returns a boolean of the derived property NFD_NO 536(NFD_Quick_Check=No). 537 538=item C<$NFC_is_NO = isNFC_NO($code_point)> 539 540It returns a boolean of the derived property NFC_NO 541(NFC_Quick_Check=No). 542 543=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)> 544 545It returns a boolean of the derived property NFC_MAYBE 546(NFC_Quick_Check=Maybe). 547 548=item C<$NFKD_is_NO = isNFKD_NO($code_point)> 549 550It returns a boolean of the derived property NFKD_NO 551(NFKD_Quick_Check=No). 552 553=item C<$NFKC_is_NO = isNFKC_NO($code_point)> 554 555It returns a boolean of the derived property NFKC_NO 556(NFKC_Quick_Check=No). 557 558=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)> 559 560It returns a boolean of the derived property NFKC_MAYBE 561(NFKC_Quick_Check=Maybe). 562 563=back 564 565=head1 EXPORT 566 567C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. 568 569C<normalize> and other some functions: on request. 570 571=head1 CAVEATS 572 573=over 4 574 575=item Perl's version vs. Unicode version 576 577Since this module refers to perl core's Unicode database in the directory 578F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of 579normalization implemented by this module depends on what has been 580compiled into your perl. The following table lists the default Unicode 581version that comes with various perl versions. (It is possible to change 582the Unicode version in any perl version to be any earlier Unicode version, 583so one could cause Unicode 3.2 to be used in any perl version starting with 5845.8.0. Read F<C<$Config{privlib}>/unicore/README.perl> for details. 585 586 perl's version implemented Unicode version 587 5.6.1 3.0.1 588 5.7.2 3.1.0 589 5.7.3 3.1.1 (normalization is same as 3.1.0) 590 5.8.0 3.2.0 591 5.8.1-5.8.3 4.0.0 592 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0) 593 5.8.7-5.8.8 4.1.0 594 5.10.0 5.0.0 595 5.8.9, 5.10.1 5.1.0 596 5.12.x 5.2.0 597 5.14.x 6.0.0 598 5.16.x 6.1.0 599 5.18.x 6.2.0 600 5.20.x 6.3.0 601 5.22.x 7.0.0 602 603=item Correction of decomposition mapping 604 605In older Unicode versions, a small number of characters (all of which are 606CJK compatibility ideographs as far as they have been found) may have 607an erroneous decomposition mapping (see 608F<lib/unicore/NormalizationCorrections.txt>). 609Anyhow, this module will neither refer to 610F<lib/unicore/NormalizationCorrections.txt> 611nor provide any specific version of normalization. Therefore this module 612running on an older perl with an older Unicode database may use 613the erroneous decomposition mapping blindly conforming to the Unicode database. 614 615=item Revised definition of canonical composition 616 617In Unicode 4.1.0, the definition D2 of canonical composition (which 618affects NFC and NFKC) has been changed (see Public Review Issue #29 619and recent UAX #15). This module has used the newer definition 620since the version 0.07 (Oct 31, 2001). 621This module will not support the normalization according to the older 622definition, even if the Unicode version implemented by perl is 623lower than 4.1.0. 624 625=back 626 627=head1 AUTHOR 628 629SADAHIRO Tomoyuki <SADAHIRO@cpan.org> 630 631Currently maintained by <perl5-porters@perl.org> 632 633Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved. 634 635=head1 LICENSE 636 637This module is free software; you can redistribute it 638and/or modify it under the same terms as Perl itself. 639 640=head1 SEE ALSO 641 642=over 4 643 644=item L<http://www.unicode.org/reports/tr15/> 645 646Unicode Normalization Forms - UAX #15 647 648=item L<http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt> 649 650Composition Exclusion Table 651 652=item L<http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt> 653 654Derived Normalization Properties 655 656=item L<http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt> 657 658Normalization Corrections 659 660=item L<http://www.unicode.org/review/pr-29.html> 661 662Public Review Issue #29: Normalization Issue 663 664=item L<http://www.unicode.org/notes/tn5/> 665 666Canonical Equivalence in Applications - UTN #5 667 668=back 669 670=cut 671