1package Unicode::Collate; 2 3BEGIN { 4 unless ("A" eq pack('U', 0x41)) { 5 die "Unicode::Collate cannot stringify a Unicode code point\n"; 6 } 7 unless (0x41 == unpack('U', 'A')) { 8 die "Unicode::Collate cannot get a Unicode code point\n"; 9 } 10} 11 12use 5.006; 13use strict; 14use warnings; 15use Carp; 16use File::Spec; 17 18no warnings 'utf8'; 19 20our $VERSION = '1.29'; 21our $PACKAGE = __PACKAGE__; 22 23### begin XS only ### 24use XSLoader (); 25XSLoader::load('Unicode::Collate', $VERSION); 26### end XS only ### 27 28my @Path = qw(Unicode Collate); 29my $KeyFile = 'allkeys.txt'; 30 31# Perl's boolean 32use constant TRUE => 1; 33use constant FALSE => ""; 34use constant NOMATCHPOS => -1; 35 36# A coderef to get combining class imported from Unicode::Normalize 37# (i.e. \&Unicode::Normalize::getCombinClass). 38# This is also used as a HAS_UNICODE_NORMALIZE flag. 39my $CVgetCombinClass; 40 41# Supported Levels 42use constant MinLevel => 1; 43use constant MaxLevel => 4; 44 45# Minimum weights at level 2 and 3, respectively 46use constant Min2Wt => 0x20; 47use constant Min3Wt => 0x02; 48 49# Shifted weight at 4th level 50use constant Shift4Wt => 0xFFFF; 51 52# A boolean for Variable and 16-bit weights at 4 levels of Collation Element 53use constant VCE_TEMPLATE => 'Cn4'; 54 55# A sort key: 16-bit weights 56use constant KEY_TEMPLATE => 'n*'; 57 58# The tie-breaking: 32-bit weights 59use constant TIE_TEMPLATE => 'N*'; 60 61# Level separator in a sort key: 62# i.e. pack(KEY_TEMPLATE, 0) 63use constant LEVEL_SEP => "\0\0"; 64 65# As Unicode code point separator for hash keys. 66# A joined code point string (denoted by JCPS below) 67# like "65;768" is used for internal processing 68# instead of Perl's Unicode string like "\x41\x{300}", 69# as the native code point is different from the Unicode code point 70# on EBCDIC platform. 71# This character must not be included in any stringified 72# representation of an integer. 73use constant CODE_SEP => ';'; 74 # NOTE: in regex /;/ is used for $jcps! 75 76# boolean values of variable weights 77use constant NON_VAR => 0; # Non-Variable character 78use constant VAR => 1; # Variable character 79 80# specific code points 81use constant Hangul_SIni => 0xAC00; 82use constant Hangul_SFin => 0xD7A3; 83 84# Logical_Order_Exception in PropList.txt 85my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; 86 87# for highestFFFF and minimalFFFE 88my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF); 89my $minimalVCE = pack(VCE_TEMPLATE, 0, 1, 0x20, 0x5, 0xFFFE); 90 91sub UCA_Version { '43' } 92 93sub Base_Unicode_Version { '13.0.0' } 94 95###### 96 97sub pack_U { 98 return pack('U*', @_); 99} 100 101sub unpack_U { 102 return unpack('U*', shift(@_).pack('U*')); 103} 104 105###### 106 107my (%VariableOK); 108@VariableOK{ qw/ 109 blanked non-ignorable shifted shift-trimmed 110 / } = (); # keys lowercased 111 112our @ChangeOK = qw/ 113 alternate backwards level normalization rearrange 114 katakana_before_hiragana upper_before_lower ignore_level2 115 overrideCJK overrideHangul overrideOut preprocess UCA_Version 116 hangul_terminator variable identical highestFFFF minimalFFFE 117 long_contraction 118 /; 119 120our @ChangeNG = qw/ 121 entry mapping table maxlength contraction 122 ignoreChar ignoreName undefChar undefName rewrite 123 versionTable alternateTable backwardsTable forwardsTable 124 rearrangeTable variableTable 125 derivCode normCode rearrangeHash backwardsFlag 126 suppress suppressHash 127 __useXS /; ### XS only 128# The hash key 'ignored' was deleted at v 0.21. 129# The hash key 'isShift' was deleted at v 0.23. 130# The hash key 'combining' was deleted at v 0.24. 131# The hash key 'entries' was deleted at v 0.30. 132# The hash key 'L3_ignorable' was deleted at v 0.40. 133 134sub version { 135 my $self = shift; 136 return $self->{versionTable} || 'unknown'; 137} 138 139my (%ChangeOK, %ChangeNG); 140@ChangeOK{ @ChangeOK } = (); 141@ChangeNG{ @ChangeNG } = (); 142 143sub change { 144 my $self = shift; 145 my %hash = @_; 146 my %old; 147 if (exists $hash{alternate}) { 148 if (exists $hash{variable}) { 149 delete $hash{alternate}; 150 } else { 151 $hash{variable} = $hash{alternate}; 152 } 153 } 154 foreach my $k (keys %hash) { 155 if (exists $ChangeOK{$k}) { 156 $old{$k} = $self->{$k}; 157 $self->{$k} = $hash{$k}; 158 } elsif (exists $ChangeNG{$k}) { 159 croak "change of $k via change() is not allowed!"; 160 } 161 # else => ignored 162 } 163 $self->checkCollator(); 164 return wantarray ? %old : $self; 165} 166 167sub _checkLevel { 168 my $level = shift; 169 my $key = shift; # 'level' or 'backwards' 170 MinLevel <= $level or croak sprintf 171 "Illegal level %d (in value for key '%s') lower than %d.", 172 $level, $key, MinLevel; 173 $level <= MaxLevel or croak sprintf 174 "Unsupported level %d (in value for key '%s') higher than %d.", 175 $level, $key, MaxLevel; 176} 177 178my %DerivCode = ( 179 8 => \&_derivCE_8, 180 9 => \&_derivCE_9, 181 11 => \&_derivCE_9, # 11 == 9 182 14 => \&_derivCE_14, 183 16 => \&_derivCE_14, # 16 == 14 184 18 => \&_derivCE_18, 185 20 => \&_derivCE_20, 186 22 => \&_derivCE_22, 187 24 => \&_derivCE_24, 188 26 => \&_derivCE_24, # 26 == 24 189 28 => \&_derivCE_24, # 28 == 24 190 30 => \&_derivCE_24, # 30 == 24 191 32 => \&_derivCE_32, 192 34 => \&_derivCE_34, 193 36 => \&_derivCE_36, 194 38 => \&_derivCE_38, 195 40 => \&_derivCE_40, 196 41 => \&_derivCE_40, # 41 == 40 197 43 => \&_derivCE_43, 198); 199 200sub checkCollator { 201 my $self = shift; 202 _checkLevel($self->{level}, 'level'); 203 204 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} } 205 or croak "Illegal UCA version (passed $self->{UCA_Version})."; 206 207 $self->{variable} ||= $self->{alternate} || $self->{variableTable} || 208 $self->{alternateTable} || 'shifted'; 209 $self->{variable} = $self->{alternate} = lc($self->{variable}); 210 exists $VariableOK{ $self->{variable} } 211 or croak "$PACKAGE unknown variable parameter name: $self->{variable}"; 212 213 if (! defined $self->{backwards}) { 214 $self->{backwardsFlag} = 0; 215 } elsif (! ref $self->{backwards}) { 216 _checkLevel($self->{backwards}, 'backwards'); 217 $self->{backwardsFlag} = 1 << $self->{backwards}; 218 } else { 219 my %level; 220 $self->{backwardsFlag} = 0; 221 for my $b (@{ $self->{backwards} }) { 222 _checkLevel($b, 'backwards'); 223 $level{$b} = 1; 224 } 225 for my $v (sort keys %level) { 226 $self->{backwardsFlag} += 1 << $v; 227 } 228 } 229 230 defined $self->{rearrange} or $self->{rearrange} = []; 231 ref $self->{rearrange} 232 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"; 233 234 # keys of $self->{rearrangeHash} are $self->{rearrange}. 235 $self->{rearrangeHash} = undef; 236 237 if (@{ $self->{rearrange} }) { 238 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = (); 239 } 240 241 $self->{normCode} = undef; 242 243 if (defined $self->{normalization}) { 244 eval { require Unicode::Normalize }; 245 $@ and croak "Unicode::Normalize is required to normalize strings"; 246 247 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass; 248 249 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default 250 $self->{normCode} = \&Unicode::Normalize::NFD; 251 } 252 elsif ($self->{normalization} ne 'prenormalized') { 253 my $norm = $self->{normalization}; 254 $self->{normCode} = sub { 255 Unicode::Normalize::normalize($norm, shift); 256 }; 257 eval { $self->{normCode}->("") }; # try 258 $@ and croak "$PACKAGE unknown normalization form name: $norm"; 259 } 260 } 261 return; 262} 263 264sub new 265{ 266 my $class = shift; 267 my $self = bless { @_ }, $class; 268 269### begin XS only ### 270 if (! exists $self->{table} && !defined $self->{rewrite} && 271 !defined $self->{undefName} && !defined $self->{ignoreName} && 272 !defined $self->{undefChar} && !defined $self->{ignoreChar}) { 273 $self->{__useXS} = \&_fetch_simple; 274 } else { 275 $self->{__useXS} = undef; 276 } 277### end XS only ### 278 279 # keys of $self->{suppressHash} are $self->{suppress}. 280 if ($self->{suppress} && @{ $self->{suppress} }) { 281 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = (); 282 } # before read_table() 283 284 # If undef is passed explicitly, no file is read. 285 $self->{table} = $KeyFile if ! exists $self->{table}; 286 $self->read_table() if defined $self->{table}; 287 288 if ($self->{entry}) { 289 while ($self->{entry} =~ /([^\n]+)/g) { 290 $self->parseEntry($1, TRUE); 291 } 292 } 293 294 # only in new(), not in change() 295 $self->{level} ||= MaxLevel; 296 $self->{UCA_Version} ||= UCA_Version(); 297 298 $self->{overrideHangul} = FALSE 299 if ! exists $self->{overrideHangul}; 300 $self->{overrideCJK} = FALSE 301 if ! exists $self->{overrideCJK}; 302 $self->{normalization} = 'NFD' 303 if ! exists $self->{normalization}; 304 $self->{rearrange} = $self->{rearrangeTable} || 305 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []) 306 if ! exists $self->{rearrange}; 307 $self->{backwards} = $self->{backwardsTable} 308 if ! exists $self->{backwards}; 309 exists $self->{long_contraction} or $self->{long_contraction} 310 = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24; 311 312 # checkCollator() will be called in change() 313 $self->checkCollator(); 314 315 return $self; 316} 317 318sub parseAtmark { 319 my $self = shift; 320 my $line = shift; # after s/^\s*\@// 321 322 if ($line =~ /^version\s*(\S*)/) { 323 $self->{versionTable} ||= $1; 324 } 325 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9 326 $self->{variableTable} ||= $1; 327 } 328 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8 329 $self->{alternateTable} ||= $1; 330 } 331 elsif ($line =~ /^backwards\s+(\S*)/) { 332 push @{ $self->{backwardsTable} }, $1; 333 } 334 elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use 335 push @{ $self->{forwardsTable} }, $1; 336 } 337 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG 338 push @{ $self->{rearrangeTable} }, _getHexArray($1); 339 } 340} 341 342sub read_table { 343 my $self = shift; 344 345### begin XS only ### 346 if ($self->{__useXS}) { 347 my @rest = _fetch_rest(); # complex matter need to parse 348 for my $line (@rest) { 349 next if $line =~ /^\s*#/; 350 351 if ($line =~ s/^\s*\@//) { 352 $self->parseAtmark($line); 353 } else { 354 $self->parseEntry($line); 355 } 356 } 357 return; 358 } 359### end XS only ### 360 361 my($f, $fh); 362 foreach my $d (@INC) { 363 $f = File::Spec->catfile($d, @Path, $self->{table}); 364 last if open($fh, $f); 365 $f = undef; 366 } 367 if (!defined $f) { 368 $f = File::Spec->catfile(@Path, $self->{table}); 369 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)"); 370 } 371 372 while (my $line = <$fh>) { 373 next if $line =~ /^\s*#/; 374 375 if ($line =~ s/^\s*\@//) { 376 $self->parseAtmark($line); 377 } else { 378 $self->parseEntry($line); 379 } 380 } 381 close $fh; 382} 383 384 385## 386## get $line, parse it, and write an entry in $self 387## 388sub parseEntry 389{ 390 my $self = shift; 391 my $line = shift; 392 my $tailoring = shift; 393 my($name, $entry, @uv, @key); 394 395 if (defined $self->{rewrite}) { 396 $line = $self->{rewrite}->($line); 397 } 398 399 return if $line !~ /^\s*[0-9A-Fa-f]/; 400 401 # removes comment and gets name 402 $name = $1 403 if $line =~ s/[#%]\s*(.*)//; 404 return if defined $self->{undefName} && $name =~ /$self->{undefName}/; 405 406 # gets element 407 my($e, $k) = split /;/, $line; 408 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>" 409 if ! $k; 410 411 @uv = _getHexArray($e); 412 return if !@uv; 413 return if @uv > 1 && $self->{suppressHash} && !$tailoring && 414 exists $self->{suppressHash}{$uv[0]}; 415 $entry = join(CODE_SEP, @uv); # in JCPS 416 417 if (defined $self->{undefChar} || defined $self->{ignoreChar}) { 418 my $ele = pack_U(@uv); 419 420 # regarded as if it were not stored in the table 421 return 422 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; 423 424 # replaced as completely ignorable 425 $k = '[.0000.0000.0000.0000]' 426 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/; 427 } 428 429 # replaced as completely ignorable 430 $k = '[.0000.0000.0000.0000]' 431 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/; 432 433 my $is_L3_ignorable = TRUE; 434 435 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed 436 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. 437 my @wt = _getHexArray($arr); 438 push @key, pack(VCE_TEMPLATE, $var, @wt); 439 $is_L3_ignorable = FALSE 440 if $wt[0] || $wt[1] || $wt[2]; 441 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable 442 # is completely ignorable. 443 # For expansion, an entry $is_L3_ignorable 444 # if and only if "all" CEs are [.0000.0000.0000]. 445 } 446 447 # mapping: be an array ref or not exists (any false value is disallowed) 448 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key; 449 450 # maxlength: be more than 1 or not exists (any false value is disallowed) 451 if (@uv > 1) { 452 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) { 453 $self->{maxlength}{$uv[0]} = @uv; 454 } 455 } 456 457 # contraction: be 1 or not exists (any false value is disallowed) 458 while (@uv > 2) { 459 pop @uv; 460 my $fake_entry = join(CODE_SEP, @uv); # in JCPS 461 $self->{contraction}{$fake_entry} = 1; 462 } 463} 464 465 466sub viewSortKey 467{ 468 my $self = shift; 469 my $str = shift; 470 $self->visualizeSortKey($self->getSortKey($str)); 471} 472 473 474sub process 475{ 476 my $self = shift; 477 my $str = shift; 478 my $prep = $self->{preprocess}; 479 my $norm = $self->{normCode}; 480 481 $str = &$prep($str) if ref $prep; 482 $str = &$norm($str) if ref $norm; 483 return $str; 484} 485 486## 487## arrayref of JCPS = splitEnt(string to be collated) 488## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE) 489## 490sub splitEnt 491{ 492 my $self = shift; 493 my $str = shift; 494 my $wLen = shift; # with Length 495 496 my $map = $self->{mapping}; 497 my $max = $self->{maxlength}; 498 my $reH = $self->{rearrangeHash}; 499 my $vers = $self->{UCA_Version}; 500 my $ver9 = $vers >= 9 && $vers <= 11; 501 my $long = $self->{long_contraction}; 502 my $uXS = $self->{__useXS}; ### XS only 503 504 my @buf; 505 506 # get array of Unicode code point of string. 507 my @src = unpack_U($str); 508 509 # rearrangement: 510 # Character positions are not kept if rearranged, 511 # then neglected if $wLen is true. 512 if ($reH && ! $wLen) { 513 for (my $i = 0; $i < @src; $i++) { 514 if (exists $reH->{ $src[$i] } && $i + 1 < @src) { 515 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]); 516 $i++; 517 } 518 } 519 } 520 521 # remove a code point marked as a completely ignorable. 522 for (my $i = 0; $i < @src; $i++) { 523 if ($vers <= 20 && _isIllegal($src[$i])) { 524 $src[$i] = undef; 525 } elsif ($ver9) { 526 $src[$i] = undef if exists $map->{ $src[$i] } 527 ? @{ $map->{ $src[$i] } } == 0 528 : $uXS && _ignorable_simple($src[$i]); ### XS only 529 } 530 } 531 532 for (my $i = 0; $i < @src; $i++) { 533 my $jcps = $src[$i]; 534 535 # skip removed code point 536 if (! defined $jcps) { 537 if ($wLen && @buf) { 538 $buf[-1][2] = $i + 1; 539 } 540 next; 541 } 542 543 my $i_orig = $i; 544 545 # find contraction 546 if (exists $max->{$jcps}) { 547 my $temp_jcps = $jcps; 548 my $jcpsLen = 1; 549 my $maxLen = $max->{$jcps}; 550 551 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) { 552 next if ! defined $src[$p]; 553 $temp_jcps .= CODE_SEP . $src[$p]; 554 $jcpsLen++; 555 if (exists $map->{$temp_jcps}) { 556 $jcps = $temp_jcps; 557 $i = $p; 558 } 559 } 560 561 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1). 562 # This process requires Unicode::Normalize. 563 # If "normalization" is undef, here should be skipped *always* 564 # (in spite of bool value of $CVgetCombinClass), 565 # since canonical ordering cannot be expected. 566 # Blocked combining character should not be contracted. 567 568 # $self->{normCode} is false in the case of "prenormalized". 569 if ($self->{normalization}) { 570 my $cont = $self->{contraction}; 571 my $preCC = 0; 572 my $preCC_uc = 0; 573 my $jcps_uc = $jcps; 574 my(@out, @out_uc); 575 576 for (my $p = $i + 1; $p < @src; $p++) { 577 next if ! defined $src[$p]; 578 my $curCC = $CVgetCombinClass->($src[$p]); 579 last unless $curCC; 580 my $tail = CODE_SEP . $src[$p]; 581 582 if ($preCC != $curCC && exists $map->{$jcps.$tail}) { 583 $jcps .= $tail; 584 push @out, $p; 585 } else { 586 $preCC = $curCC; 587 } 588 589 next if !$long; 590 591 if ($preCC_uc != $curCC && 592 (exists $map->{$jcps_uc.$tail} || 593 exists $cont->{$jcps_uc.$tail})) { 594 $jcps_uc .= $tail; 595 push @out_uc, $p; 596 } else { 597 $preCC_uc = $curCC; 598 } 599 } 600 601 if (@out_uc && exists $map->{$jcps_uc}) { 602 $jcps = $jcps_uc; 603 $src[$_] = undef for @out_uc; 604 } else { 605 $src[$_] = undef for @out; 606 } 607 } 608 } 609 610 # skip completely ignorable 611 if (exists $map->{$jcps} ? @{ $map->{$jcps} } == 0 : 612 $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only 613 if ($wLen && @buf) { 614 $buf[-1][2] = $i + 1; 615 } 616 next; 617 } 618 619 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps; 620 } 621 return \@buf; 622} 623 624## 625## VCE = _pack_override(input, codepoint, derivCode) 626## 627sub _pack_override ($$$) { 628 my $r = shift; 629 my $u = shift; 630 my $der = shift; 631 632 if (ref $r) { 633 return pack(VCE_TEMPLATE, NON_VAR, @$r); 634 } elsif (defined $r) { 635 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u); 636 } else { 637 $u = 0xFFFD if 0x10FFFF < $u; 638 return $der->($u); 639 } 640} 641 642## 643## list of VCE = getWt(JCPS) 644## 645sub getWt 646{ 647 my $self = shift; 648 my $u = shift; 649 my $map = $self->{mapping}; 650 my $der = $self->{derivCode}; 651 my $out = $self->{overrideOut}; 652 my $uXS = $self->{__useXS}; ### XS only 653 654 return if !defined $u; 655 return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF}; 656 return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE}; 657 $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out; 658 659 my @ce; 660 if (exists $map->{$u}) { 661 @ce = @{ $map->{$u} }; # $u may be a contraction 662### begin XS only ### 663 } elsif ($uXS && _exists_simple($u)) { 664 @ce = _fetch_simple($u); 665### end XS only ### 666 } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) { 667 my $hang = $self->{overrideHangul}; 668 if ($hang) { 669 @ce = map _pack_override($_, $u, $der), $hang->($u); 670 } elsif (!defined $hang) { 671 @ce = $der->($u); 672 } else { 673 my $max = $self->{maxlength}; 674 my @decH = _decompHangul($u); 675 676 if (@decH == 2) { 677 my $contract = join(CODE_SEP, @decH); 678 @decH = ($contract) if exists $map->{$contract}; 679 } else { # must be <@decH == 3> 680 if (exists $max->{$decH[0]}) { 681 my $contract = join(CODE_SEP, @decH); 682 if (exists $map->{$contract}) { 683 @decH = ($contract); 684 } else { 685 $contract = join(CODE_SEP, @decH[0,1]); 686 exists $map->{$contract} and @decH = ($contract, $decH[2]); 687 } 688 # even if V's ignorable, LT contraction is not supported. 689 # If such a situation were required, NFD should be used. 690 } 691 if (@decH == 3 && exists $max->{$decH[1]}) { 692 my $contract = join(CODE_SEP, @decH[1,2]); 693 exists $map->{$contract} and @decH = ($decH[0], $contract); 694 } 695 } 696 697 @ce = map({ 698 exists $map->{$_} ? @{ $map->{$_} } : 699 $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only 700 $der->($_); 701 } @decH); 702 } 703 } elsif ($out && 0x10FFFF < $u) { 704 @ce = map _pack_override($_, $u, $der), $out->($u); 705 } else { 706 my $cjk = $self->{overrideCJK}; 707 my $vers = $self->{UCA_Version}; 708 if ($cjk && _isUIdeo($u, $vers)) { 709 @ce = map _pack_override($_, $u, $der), $cjk->($u); 710 } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) { 711 @ce = _uideoCE_8($u); 712 } else { 713 @ce = $der->($u); 714 } 715 } 716 return map $self->varCE($_), @ce; 717} 718 719 720## 721## string sortkey = getSortKey(string arg) 722## 723sub getSortKey 724{ 725 my $self = shift; 726 my $orig = shift; 727 my $str = $self->process($orig); 728 my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS 729 my $vers = $self->{UCA_Version}; 730 my $term = $self->{hangul_terminator}; 731 my $lev = $self->{level}; 732 my $iden = $self->{identical}; 733 734 my @buf; # weight arrays 735 if ($term) { 736 my $preHST = ''; 737 my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0)); 738 foreach my $jcps (@$rEnt) { 739 # weird things like VL, TL-contraction are not considered! 740 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps; 741 if ($preHST && !$curHST || # hangul before non-hangul 742 $preHST =~ /L\z/ && $curHST =~ /^T/ || 743 $preHST =~ /V\z/ && $curHST =~ /^L/ || 744 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) { 745 push @buf, $termCE; 746 } 747 $preHST = $curHST; 748 push @buf, $self->getWt($jcps); 749 } 750 push @buf, $termCE if $preHST; # end at hangul 751 } else { 752 foreach my $jcps (@$rEnt) { 753 push @buf, $self->getWt($jcps); 754 } 755 } 756 757 my $rkey = $self->mk_SortKey(\@buf); ### XS only 758 759 if ($iden || $vers >= 26 && $lev == MaxLevel) { 760 $rkey .= LEVEL_SEP; 761 $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden; 762 } 763 return $rkey; 764} 765 766 767## 768## int compare = cmp(string a, string b) 769## 770sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) } 771sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) } 772sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) } 773sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) } 774sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) } 775sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) } 776sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) } 777 778## 779## list[strings] sorted = sort(list[strings] arg) 780## 781sub sort { 782 my $obj = shift; 783 return 784 map { $_->[1] } 785 sort{ $a->[0] cmp $b->[0] } 786 map [ $obj->getSortKey($_), $_ ], @_; 787} 788 789 790## 791## bool _nonIgnorAtLevel(arrayref weights, int level) 792## 793sub _nonIgnorAtLevel($$) 794{ 795 my $wt = shift; 796 return if ! defined $wt; 797 my $lv = shift; 798 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE; 799} 800 801## 802## bool _eqArray( 803## arrayref of arrayref[weights] source, 804## arrayref of arrayref[weights] substr, 805## int level) 806## * comparison of graphemes vs graphemes. 807## @$source >= @$substr must be true (check it before call this); 808## 809sub _eqArray($$$) 810{ 811 my $source = shift; 812 my $substr = shift; 813 my $lev = shift; 814 815 for my $g (0..@$substr-1){ 816 # Do the $g'th graphemes have the same number of AV weights? 817 return if @{ $source->[$g] } != @{ $substr->[$g] }; 818 819 for my $w (0..@{ $substr->[$g] }-1) { 820 for my $v (0..$lev-1) { 821 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v]; 822 } 823 } 824 } 825 return 1; 826} 827 828## 829## (int position, int length) 830## int position = index(string, substring, position, [undoc'ed global]) 831## 832## With "global" (only for the list context), 833## returns list of arrayref[position, length]. 834## 835sub index 836{ 837 my $self = shift; 838 $self->{preprocess} and 839 croak "Don't use Preprocess with index(), match(), etc."; 840 $self->{normCode} and 841 croak "Don't use Normalization with index(), match(), etc."; 842 843 my $str = shift; 844 my $len = length($str); 845 my $sub = shift; 846 my $subE = $self->splitEnt($sub); 847 my $pos = @_ ? shift : 0; 848 $pos = 0 if $pos < 0; 849 my $glob = shift; 850 851 my $lev = $self->{level}; 852 my $v2i = $self->{UCA_Version} >= 9 && 853 $self->{variable} ne 'non-ignorable'; 854 855 if (! @$subE) { 856 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos; 857 return $glob 858 ? map([$_, 0], $temp..$len) 859 : wantarray ? ($temp,0) : $temp; 860 } 861 $len < $pos 862 and return wantarray ? () : NOMATCHPOS; 863 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE); 864 @$strE 865 or return wantarray ? () : NOMATCHPOS; 866 867 my(@strWt, @iniPos, @finPos, @subWt, @g_ret); 868 869 my $last_is_variable; 870 for my $vwt (map $self->getWt($_), @$subE) { 871 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 872 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 873 874 # "Ignorable (L1, L2) after Variable" since track. v. 9 875 if ($v2i) { 876 if ($var) { 877 $last_is_variable = TRUE; 878 } 879 elsif (!$wt[0]) { # ignorable 880 $to_be_pushed = FALSE if $last_is_variable; 881 } 882 else { 883 $last_is_variable = FALSE; 884 } 885 } 886 887 if (@subWt && !$var && !$wt[0]) { 888 push @{ $subWt[-1] }, \@wt if $to_be_pushed; 889 } elsif ($to_be_pushed) { 890 push @subWt, [ \@wt ]; 891 } 892 # else ===> skipped 893 } 894 895 my $count = 0; 896 my $end = @$strE - 1; 897 898 $last_is_variable = FALSE; # reuse 899 for (my $i = 0; $i <= $end; ) { # no $i++ 900 my $found_base = 0; 901 902 # fetch a grapheme 903 while ($i <= $end && $found_base == 0) { 904 for my $vwt ($self->getWt($strE->[$i][0])) { 905 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 906 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 907 908 # "Ignorable (L1, L2) after Variable" since track. v. 9 909 if ($v2i) { 910 if ($var) { 911 $last_is_variable = TRUE; 912 } 913 elsif (!$wt[0]) { # ignorable 914 $to_be_pushed = FALSE if $last_is_variable; 915 } 916 else { 917 $last_is_variable = FALSE; 918 } 919 } 920 921 if (@strWt && !$var && !$wt[0]) { 922 push @{ $strWt[-1] }, \@wt if $to_be_pushed; 923 $finPos[-1] = $strE->[$i][2]; 924 } elsif ($to_be_pushed) { 925 push @strWt, [ \@wt ]; 926 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1]; 927 $finPos[-1] = NOMATCHPOS if $found_base; 928 push @finPos, $strE->[$i][2]; 929 $found_base++; 930 } 931 # else ===> no-op 932 } 933 $i++; 934 } 935 936 # try to match 937 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) { 938 if ($iniPos[0] != NOMATCHPOS && 939 $finPos[$#subWt] != NOMATCHPOS && 940 _eqArray(\@strWt, \@subWt, $lev)) { 941 my $temp = $iniPos[0] + $pos; 942 943 if ($glob) { 944 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]]; 945 splice @strWt, 0, $#subWt; 946 splice @iniPos, 0, $#subWt; 947 splice @finPos, 0, $#subWt; 948 } 949 else { 950 return wantarray 951 ? ($temp, $finPos[$#subWt] - $iniPos[0]) 952 : $temp; 953 } 954 } 955 shift @strWt; 956 shift @iniPos; 957 shift @finPos; 958 } 959 } 960 961 return $glob 962 ? @g_ret 963 : wantarray ? () : NOMATCHPOS; 964} 965 966## 967## scalarref to matching part = match(string, substring) 968## 969sub match 970{ 971 my $self = shift; 972 if (my($pos,$len) = $self->index($_[0], $_[1])) { 973 my $temp = substr($_[0], $pos, $len); 974 return wantarray ? $temp : \$temp; 975 # An lvalue ref \substr should be avoided, 976 # since its value is affected by modification of its referent. 977 } 978 else { 979 return; 980 } 981} 982 983## 984## arrayref matching parts = gmatch(string, substring) 985## 986sub gmatch 987{ 988 my $self = shift; 989 my $str = shift; 990 my $sub = shift; 991 return map substr($str, $_->[0], $_->[1]), 992 $self->index($str, $sub, 0, 'g'); 993} 994 995## 996## bool subst'ed = subst(string, substring, replace) 997## 998sub subst 999{ 1000 my $self = shift; 1001 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1002 1003 if (my($pos,$len) = $self->index($_[0], $_[1])) { 1004 if ($code) { 1005 my $mat = substr($_[0], $pos, $len); 1006 substr($_[0], $pos, $len, $code->($mat)); 1007 } else { 1008 substr($_[0], $pos, $len, $_[2]); 1009 } 1010 return TRUE; 1011 } 1012 else { 1013 return FALSE; 1014 } 1015} 1016 1017## 1018## int count = gsubst(string, substring, replace) 1019## 1020sub gsubst 1021{ 1022 my $self = shift; 1023 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1024 my $cnt = 0; 1025 1026 # Replacement is carried out from the end, then use reverse. 1027 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) { 1028 if ($code) { 1029 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]); 1030 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat)); 1031 } else { 1032 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]); 1033 } 1034 $cnt++; 1035 } 1036 return $cnt; 1037} 1038 10391; 1040__END__ 1041 1042=head1 NAME 1043 1044Unicode::Collate - Unicode Collation Algorithm 1045 1046=head1 SYNOPSIS 1047 1048 use Unicode::Collate; 1049 1050 #construct 1051 $Collator = Unicode::Collate->new(%tailoring); 1052 1053 #sort 1054 @sorted = $Collator->sort(@not_sorted); 1055 1056 #compare 1057 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. 1058 1059B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted 1060according to Perl's Unicode support. See L<perlunicode>, 1061L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1062Otherwise you can use C<preprocess> or should decode them before. 1063 1064=head1 DESCRIPTION 1065 1066This module is an implementation of Unicode Technical Standard #10 1067(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA). 1068 1069=head2 Constructor and Tailoring 1070 1071The C<new> method returns a collator object. If new() is called 1072with no parameters, the collator should do the default collation. 1073 1074 $Collator = Unicode::Collate->new( 1075 UCA_Version => $UCA_Version, 1076 alternate => $alternate, # alias for 'variable' 1077 backwards => $levelNumber, # or \@levelNumbers 1078 entry => $element, 1079 hangul_terminator => $term_primary_weight, 1080 highestFFFF => $bool, 1081 identical => $bool, 1082 ignoreName => qr/$ignoreName/, 1083 ignoreChar => qr/$ignoreChar/, 1084 ignore_level2 => $bool, 1085 katakana_before_hiragana => $bool, 1086 level => $collationLevel, 1087 long_contraction => $bool, 1088 minimalFFFE => $bool, 1089 normalization => $normalization_form, 1090 overrideCJK => \&overrideCJK, 1091 overrideHangul => \&overrideHangul, 1092 preprocess => \&preprocess, 1093 rearrange => \@charList, 1094 rewrite => \&rewrite, 1095 suppress => \@charList, 1096 table => $filename, 1097 undefName => qr/$undefName/, 1098 undefChar => qr/$undefChar/, 1099 upper_before_lower => $bool, 1100 variable => $variable, 1101 ); 1102 1103=over 4 1104 1105=item UCA_Version 1106 1107If the revision (previously "tracking version") number of UCA is given, 1108behavior of that revision is emulated on collating. 1109If omitted, the return value of C<UCA_Version()> is used. 1110 1111The following revisions are supported. The default is 43. 1112 1113 UCA Unicode Standard DUCET (@version) 1114 ------------------------------------------------------- 1115 8 3.1 3.0.1 (3.0.1d9) 1116 9 3.1 with Corrigendum 3 3.1.1 1117 11 4.0.0 1118 14 4.1.0 1119 16 5.0.0 1120 18 5.1.0 1121 20 5.2.0 1122 22 6.0.0 1123 24 6.1.0 1124 26 6.2.0 1125 28 6.3.0 1126 30 7.0.0 1127 32 8.0.0 1128 34 9.0.0 1129 36 10.0.0 1130 38 11.0.0 1131 40 12.0.0 1132 41 12.1.0 1133 43 13.0.0 1134 1135* See below for C<long_contraction> with C<UCA_Version> 22 and 24. 1136 1137* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden 1138since C<UCA_Version> 22. 1139 1140* Out-of-range codepoints (greater than U+10FFFF) are not ignored, 1141and can be overridden since C<UCA_Version> 22. 1142 1143* Fully ignorable characters were ignored, and would not interrupt 1144contractions with C<UCA_Version> 9 and 11. 1145 1146* Treatment of ignorables after variables and some behaviors 1147were changed at C<UCA_Version> 9. 1148 1149* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>) 1150depend on C<UCA_Version>. 1151 1152* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect 1153C<hangul_terminator>. 1154 1155=item alternate 1156 1157-- see 3.2.2 Alternate Weighting, version 8 of UTS #10 1158 1159For backward compatibility, C<alternate> (old name) can be used 1160as an alias for C<variable>. 1161 1162=item backwards 1163 1164-- see 3.4 Backward Accents, UTS #10. 1165 1166 backwards => $levelNumber or \@levelNumbers 1167 1168Weights in reverse order; ex. level 2 (diacritic ordering) in French. 1169If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>), 1170forwards at all the levels. 1171 1172=item entry 1173 1174-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10. 1175 1176If the same character (or a sequence of characters) exists 1177in the collation element table through C<table>, 1178mapping to collation elements is overridden. 1179If it does not exist, the mapping is defined additionally. 1180 1181 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 11820063 0068 ; [.0E6A.0020.0002.0063] # ch 11830043 0068 ; [.0E6A.0020.0007.0043] # Ch 11840043 0048 ; [.0E6A.0020.0008.0043] # CH 1185006C 006C ; [.0F4C.0020.0002.006C] # ll 1186004C 006C ; [.0F4C.0020.0007.004C] # Ll 1187004C 004C ; [.0F4C.0020.0008.004C] # LL 118800F1 ; [.0F7B.0020.0002.00F1] # n-tilde 1189006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde 119000D1 ; [.0F7B.0020.0008.00D1] # N-tilde 1191004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde 1192ENTRY 1193 1194 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 119500E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e> 119600C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E> 1197ENTRY 1198 1199B<NOTE:> The code point in the UCA file format (before C<';'>) 1200B<must> be a Unicode code point (defined as hexadecimal), 1201but not a native code point. 1202So C<0063> must always denote C<U+0063>, 1203but not a character of C<"\x63">. 1204 1205Weighting may vary depending on collation element table. 1206So ensure the weights defined in C<entry> will be consistent with 1207those in the collation element table loaded via C<table>. 1208 1209In DUCET v4.0.0, primary weight of C<C> is C<0E60> 1210and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A> 1211(as a value between C<0E60> and C<0E6D>) 1212makes ordering as C<C E<lt> CH E<lt> D>. 1213Exactly speaking DUCET already has some characters between C<C> and C<D>: 1214C<small capital C> (C<U+1D04>) with primary weight C<0E64>, 1215C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>, 1216and C<c-curl> (C<U+0255>) with C<0E69>. 1217Then primary weight C<0E6A> for C<CH> makes C<CH> 1218ordered between C<c-curl> and C<D>. 1219 1220=item hangul_terminator 1221 1222-- see 7.1.4 Trailing Weights, UTS #10. 1223 1224If a true value is given (non-zero but should be positive), 1225it will be added as a terminator primary weight to the end of 1226every standard Hangul syllable. Secondary and any higher weights 1227for terminator are set to zero. 1228If the value is false or C<hangul_terminator> key does not exist, 1229insertion of terminator weights will not be performed. 1230 1231Boundaries of Hangul syllables are determined 1232according to conjoining Jamo behavior in F<the Unicode Standard> 1233and F<HangulSyllableType.txt>. 1234 1235B<Implementation Note:> 1236(1) For expansion mapping (Unicode character mapped 1237to a sequence of collation elements), a terminator will not be added 1238between collation elements, even if Hangul syllable boundary exists there. 1239Addition of terminator is restricted to the next position 1240to the last collation element. 1241 1242(2) Non-conjoining Hangul letters 1243(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not 1244automatically terminated with a terminator primary weight. 1245These characters may need terminator included in a collation element 1246table beforehand. 1247 1248=item highestFFFF 1249 1250-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation. 1251 1252If the parameter is made true, C<U+FFFF> has a highest primary weight. 1253When a boolean of C<$coll-E<gt>ge($str, "abc")> and 1254C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str> 1255begins with C<"abc">, or another primary equivalent. 1256C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF> 1257such as C<"abc\x{FFFF}xyz">. 1258 1259C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")> 1260almost, but the latter has a problem that you should know which letter is 1261next to C<c>. For a certain language where C<ch> as the next letter, 1262C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">. 1263 1264Note: 1265This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>. 1266Any other character than C<U+FFFF> can be tailored by C<entry>. 1267 1268=item identical 1269 1270-- see A.3 Deterministic Comparison, UTS #10. 1271 1272By default, strings whose weights are equal should be equal, 1273even though their code points are not equal. 1274Completely ignorable characters are ignored. 1275 1276If the parameter is made true, a final, tie-breaking level is used. 1277If no difference of weights is found after the comparison through 1278all the level specified by C<level>, the comparison with code points 1279will be performed. 1280For the tie-breaking comparison, the sort key has code points 1281of the original string appended. 1282Completely ignorable characters are not ignored. 1283 1284If C<preprocess> and/or C<normalization> is applied, the code points 1285of the string after them (in NFD by default) are used. 1286 1287=item ignoreChar 1288 1289=item ignoreName 1290 1291-- see 3.6 Variable Weighting, UTS #10. 1292 1293Makes the entry in the table completely ignorable; 1294i.e. as if the weights were zero at all level. 1295 1296Through C<ignoreChar>, any character matching C<qr/$ignoreChar/> 1297will be ignored. Through C<ignoreName>, any character whose name 1298(given in the C<table> file as a comment) matches C<qr/$ignoreName/> 1299will be ignored. 1300 1301E.g. when 'a' and 'e' are ignorable, 1302'element' is equal to 'lament' (or 'lmnt'). 1303 1304=item ignore_level2 1305 1306-- see 5.1 Parametric Tailoring, UTS #10. 1307 1308By default, case-sensitive comparison (that is level 3 difference) 1309won't ignore accents (that is level 2 difference). 1310 1311If the parameter is made true, accents (and other primary ignorable 1312characters) are ignored, even though cases are taken into account. 1313 1314B<NOTE>: C<level> should be 3 or greater. 1315 1316=item katakana_before_hiragana 1317 1318-- see 7.2 Tertiary Weight Table, UTS #10. 1319 1320By default, hiragana is before katakana. 1321If the parameter is made true, this is reversed. 1322 1323B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana 1324distinctions must occur in level 3, and their weights at level 3 must be 1325same as those mentioned in 7.3.1, UTS #10. 1326If you define your collation elements which violate this requirement, 1327this parameter does not work validly. 1328 1329=item level 1330 1331-- see 4.3 Form Sort Key, UTS #10. 1332 1333Set the maximum level. 1334Any higher levels than the specified one are ignored. 1335 1336 Level 1: alphabetic ordering 1337 Level 2: diacritic ordering 1338 Level 3: case ordering 1339 Level 4: tie-breaking (e.g. in the case when variable is 'shifted') 1340 1341 ex.level => 2, 1342 1343If omitted, the maximum is the 4th. 1344 1345B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level. 1346But this module only uses weights within 0xFFFF. 1347When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted' 1348and 'shift-trimmed'), the level 4 may be unreliable. 1349 1350See also C<identical>. 1351 1352=item long_contraction 1353 1354-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10. 1355 1356If the parameter is made true, for a contraction with three or more 1357characters (here nicknamed "long contraction"), initial substrings 1358will be handled. 1359For example, a contraction ABC, where A is a starter, and B and C 1360are non-starters (character with non-zero combining character class), 1361will be detected even if there is not AB as a contraction. 1362 1363B<Default:> Usually false. 1364If C<UCA_Version> is 22 or 24, and the value of C<long_contraction> 1365is not specified in C<new()>, a true value is set implicitly. 1366This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0. 1367 1368C<change()> handles C<long_contraction> explicitly only. 1369If C<long_contraction> is not specified in C<change()>, even though 1370C<UCA_Version> is changed, C<long_contraction> will not be changed. 1371 1372B<Limitation:> Scanning non-starters is one-way (no back tracking). 1373If AB is found but not ABC is not found, other long contraction where 1374the first character is A and the second is not B may not be found. 1375 1376Under C<(normalization =E<gt> undef)>, detection step of discontiguous 1377contractions will be skipped. 1378 1379B<Note:> The following contractions in DUCET are not considered 1380in steps S2.1.1 to S2.1.3, where they are discontiguous. 1381 1382 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR) 1383 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL) 1384 1385For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY> 1386(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD. 1387In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected, 1388instead of C<0FB2 0F71 0F80>. 1389Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of 1390contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected. 1391 1392=item minimalFFFE 1393 1394-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation. 1395 1396If the parameter is made true, C<U+FFFE> has a minimal primary weight. 1397The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2"> 1398first compares C<$a1> and C<$b1> at level 1, and 1399then C<$a2> and C<$b2> at level 1, as followed. 1400 1401 "ab\x{FFFE}a" 1402 "Ab\x{FFFE}a" 1403 "ab\x{FFFE}c" 1404 "Ab\x{FFFE}c" 1405 "ab\x{FFFE}xyz" 1406 "abc\x{FFFE}def" 1407 "abc\x{FFFE}xYz" 1408 "aBc\x{FFFE}xyz" 1409 "abcX\x{FFFE}def" 1410 "abcx\x{FFFE}xyz" 1411 "b\x{FFFE}aaa" 1412 "bbb\x{FFFE}a" 1413 1414Note: 1415This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>. 1416Any other character than C<U+FFFE> can be tailored by C<entry>. 1417 1418=item normalization 1419 1420-- see 4.1 Normalize, UTS #10. 1421 1422If specified, strings are normalized before preparation of sort keys 1423(the normalization is executed after preprocess). 1424 1425A form name C<Unicode::Normalize::normalize()> accepts will be applied 1426as C<$normalization_form>. 1427Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. 1428See C<Unicode::Normalize::normalize()> for detail. 1429If omitted, C<'NFD'> is used. 1430 1431C<normalization> is performed after C<preprocess> (if defined). 1432 1433Furthermore, special values, C<undef> and C<"prenormalized">, can be used, 1434though they are not concerned with C<Unicode::Normalize::normalize()>. 1435 1436If C<undef> (not a string C<"undef">) is passed explicitly 1437as the value for this key, 1438any normalization is not carried out (this may make tailoring easier 1439if any normalization is not desired). Under C<(normalization =E<gt> undef)>, 1440only contiguous contractions are resolved; 1441e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>, 1442C<A-cedilla-ring> would be primary equal to C<A>. 1443In this point, 1444C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })> 1445B<is not> equivalent to C<(normalization =E<gt> 'NFD')>. 1446 1447In the case of C<(normalization =E<gt> "prenormalized")>, 1448any normalization is not performed, but 1449discontiguous contractions with combining characters are performed. 1450Therefore 1451C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })> 1452B<is> equivalent to C<(normalization =E<gt> 'NFD')>. 1453If source strings are finely prenormalized, 1454C<(normalization =E<gt> 'prenormalized')> may save time for normalization. 1455 1456Except C<(normalization =E<gt> undef)>, 1457B<Unicode::Normalize> is required (see also B<CAVEAT>). 1458 1459=item overrideCJK 1460 1461-- see 7.1 Derived Collation Elements, UTS #10. 1462 1463By default, CJK unified ideographs are ordered in Unicode codepoint 1464order, but those in the CJK Unified Ideographs block are less than 1465those in the CJK Unified Ideographs Extension A etc. 1466 1467 In the CJK Unified Ideographs block: 1468 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11. 1469 U+4E00..U+9FBB if UCA_Version is 14 or 16. 1470 U+4E00..U+9FC3 if UCA_Version is 18. 1471 U+4E00..U+9FCB if UCA_Version is 20 or 22. 1472 U+4E00..U+9FCC if UCA_Version is 24 to 30. 1473 U+4E00..U+9FD5 if UCA_Version is 32 or 34. 1474 U+4E00..U+9FEA if UCA_Version is 36. 1475 U+4E00..U+9FEF if UCA_Version is 38, 40 or 41. 1476 U+4E00..U+9FFC if UCA_Version is 43. 1477 1478 In the CJK Unified Ideographs Extension blocks: 1479 Ext.A (U+3400..U+4DB5) if UCA_Version is 8 to 41. 1480 Ext.A (U+3400..U+4DBF) if UCA_Version is 43. 1481 Ext.B (U+20000..U+2A6D6) if UCA_Version is 8 to 41. 1482 Ext.B (U+20000..U+2A6DD) if UCA_Version is 43. 1483 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later. 1484 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later. 1485 Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32 or later. 1486 Ext.F (U+2CEB0..U+2EBE0) if UCA_Version is 36 or later. 1487 Ext.G (U+30000..U+3134A) if UCA_Version is 43. 1488 1489Through C<overrideCJK>, ordering of CJK unified ideographs (including 1490extensions) can be overridden. 1491 1492ex. CJK unified ideographs in the JIS code point order. 1493 1494 overrideCJK => sub { 1495 my $u = shift; # get a Unicode codepoint 1496 my $b = pack('n', $u); # to UTF-16BE 1497 my $s = your_unicode_to_sjis_converter($b); # convert 1498 my $n = unpack('n', $s); # convert sjis to short 1499 [ $n, 0x20, 0x2, $u ]; # return the collation element 1500 }, 1501 1502The return value may be an arrayref of 1st to 4th weights as shown 1503above. The return value may be an integer as the primary weight 1504as shown below. If C<undef> is returned, the default derived 1505collation element will be used. 1506 1507 overrideCJK => sub { 1508 my $u = shift; # get a Unicode codepoint 1509 my $b = pack('n', $u); # to UTF-16BE 1510 my $s = your_unicode_to_sjis_converter($b); # convert 1511 my $n = unpack('n', $s); # convert sjis to short 1512 return $n; # return the primary weight 1513 }, 1514 1515The return value may be a list containing zero or more of 1516an arrayref, an integer, or C<undef>. 1517 1518ex. ignores all CJK unified ideographs. 1519 1520 overrideCJK => sub {()}, # CODEREF returning empty list 1521 1522 # where ->eq("Pe\x{4E00}rl", "Perl") is true 1523 # as U+4E00 is a CJK unified ideograph and to be ignorable. 1524 1525If a false value (including C<undef>) is passed, C<overrideCJK> 1526has no effect. 1527C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one. 1528 1529But assignment of weight for CJK unified ideographs 1530in C<table> or C<entry> is still valid. 1531If C<undef> is passed explicitly as the value for this key, 1532weights for CJK unified ideographs are treated as undefined. 1533However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)> 1534has no special meaning. 1535 1536B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>, 1537C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>, 1538C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified 1539ideographs. But they can't be overridden via C<overrideCJK> when you use 1540DUCET, as the table includes weights for them. C<table> or C<entry> has 1541priority over C<overrideCJK>. 1542 1543=item overrideHangul 1544 1545-- see 7.1 Derived Collation Elements, UTS #10. 1546 1547By default, Hangul syllables are decomposed into Hangul Jamo, 1548even if C<(normalization =E<gt> undef)>. 1549But the mapping of Hangul syllables may be overridden. 1550 1551This parameter works like C<overrideCJK>, so see there for examples. 1552 1553If you want to override the mapping of Hangul syllables, 1554NFD and NFKD are not appropriate, since NFD and NFKD will decompose 1555Hangul syllables before overriding. FCD may decompose Hangul syllables 1556as the case may be. 1557 1558If a false value (but not C<undef>) is passed, C<overrideHangul> 1559has no effect. 1560C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one. 1561 1562If C<undef> is passed explicitly as the value for this key, 1563weight for Hangul syllables is treated as undefined 1564without decomposition into Hangul Jamo. 1565But definition of weight for Hangul syllables 1566in C<table> or C<entry> is still valid. 1567 1568=item overrideOut 1569 1570-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10. 1571 1572Perl seems to allow out-of-range values (greater than 0x10FFFF). 1573By default, out-of-range values are replaced with C<U+FFFD> 1574(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22, 1575or ignored when C<UCA_Version> E<lt>= 20. 1576 1577When C<UCA_Version> E<gt>= 22, the weights of out-of-range values 1578can be overridden. Though C<table> or C<entry> are available for them, 1579out-of-range values are too many. 1580 1581C<overrideOut> can perform it algorithmically. 1582This parameter works like C<overrideCJK>, so see there for examples. 1583 1584ex. ignores all out-of-range values. 1585 1586 overrideOut => sub {()}, # CODEREF returning empty list 1587 1588If a false value (including C<undef>) is passed, C<overrideOut> 1589has no effect. 1590C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one. 1591 1592B<NOTE ABOUT U+FFFD:> 1593 1594UCA recommends that out-of-range values should not be ignored for security 1595reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">. 1596However, C<U+FFFD> is wrongly mapped to a variable collation element 1597in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be 1598ignored when C<variable> isn't C<Non-ignorable>. 1599 1600The mapping of C<U+FFFD> is corrected in Unicode 6.3.0. 1601see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights> 1602(7.1.4 Trailing Weights). Such a correction is reproduced by this. 1603 1604 overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer 1605 1606This workaround is unnecessary since Unicode 6.3.0. 1607 1608=item preprocess 1609 1610-- see 5.4 Preprocessing, UTS #10. 1611 1612If specified, the coderef is used to preprocess each string 1613before the formation of sort keys. 1614 1615ex. dropping English articles, such as "a" or "the". 1616Then, "the pen" is before "a pencil". 1617 1618 preprocess => sub { 1619 my $str = shift; 1620 $str =~ s/\b(?:an?|the)\s+//gi; 1621 return $str; 1622 }, 1623 1624C<preprocess> is performed before C<normalization> (if defined). 1625 1626ex. decoding strings in a legacy encoding such as shift-jis: 1627 1628 $sjis_collator = Unicode::Collate->new( 1629 preprocess => \&your_shiftjis_to_unicode_decoder, 1630 ); 1631 @result = $sjis_collator->sort(@shiftjis_strings); 1632 1633B<Note:> Strings returned from the coderef will be interpreted 1634according to Perl's Unicode support. See L<perlunicode>, 1635L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1636 1637=item rearrange 1638 1639-- see 3.5 Rearrangement, UTS #10. 1640 1641Characters that are not coded in logical order and to be rearranged. 1642If C<UCA_Version> is equal to or less than 11, default is: 1643 1644 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ], 1645 1646If you want to disallow any rearrangement, pass C<undef> or C<[]> 1647(a reference to empty list) as the value for this key. 1648 1649If C<UCA_Version> is equal to or greater than 14, default is C<[]> 1650(i.e. no rearrangement). 1651 1652B<According to the version 9 of UCA, this parameter shall not be used; 1653but it is not warned at present.> 1654 1655=item rewrite 1656 1657If specified, the coderef is used to rewrite lines in C<table> or C<entry>. 1658The coderef will get each line, and then should return a rewritten line 1659according to the UCA file format. 1660If the coderef returns an empty line, the line will be skipped. 1661 1662e.g. any primary ignorable characters into tertiary ignorable: 1663 1664 rewrite => sub { 1665 my $line = shift; 1666 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g; 1667 return $line; 1668 }, 1669 1670This example shows rewriting weights. C<rewrite> is allowed to 1671affect code points, weights, and the name. 1672 1673B<NOTE>: C<table> is available to use another table file; 1674preparing a modified table once would be more efficient than 1675rewriting lines on reading an unmodified table every time. 1676 1677=item suppress 1678 1679-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation. 1680 1681Contractions beginning with the specified characters are suppressed, 1682even if those contractions are defined in C<table>. 1683 1684An example for Russian and some languages using the Cyrillic script: 1685 1686 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F], 1687 1688where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE. 1689 1690B<NOTE>: Contractions via C<entry> will not be suppressed. 1691 1692=item table 1693 1694-- see 3.8 Default Unicode Collation Element Table, UTS #10. 1695 1696You can use another collation element table if desired. 1697 1698The table file should locate in the F<Unicode/Collate> directory 1699on C<@INC>. Say, if the filename is F<Foo.txt>, 1700the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>. 1701 1702By default, F<allkeys.txt> (as the filename of DUCET) is used. 1703If you will prepare your own table file, any name other than F<allkeys.txt> 1704may be better to avoid namespace conflict. 1705 1706B<NOTE>: When XSUB is used, the DUCET is compiled on building this 1707module, and it may save time at the run time. 1708Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table, 1709or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or 1710C<rewrite> will prevent this module from using the compiled DUCET. 1711 1712If C<undef> is passed explicitly as the value for this key, 1713no file is read (but you can define collation elements via C<entry>). 1714 1715A typical way to define a collation element table 1716without any file of table: 1717 1718 $onlyABC = Unicode::Collate->new( 1719 table => undef, 1720 entry => << 'ENTRIES', 17210061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A 17220041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A 17230062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B 17240042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B 17250063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C 17260043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C 1727ENTRIES 1728 ); 1729 1730If C<ignoreName> or C<undefName> is used, character names should be 1731specified as a comment (following C<#>) on each line. 1732 1733=item undefChar 1734 1735=item undefName 1736 1737-- see 6.3.3 Reducing the Repertoire, UTS #10. 1738 1739Undefines the collation element as if it were unassigned in the C<table>. 1740This reduces the size of the table. 1741If an unassigned character appears in the string to be collated, 1742the sort key is made from its codepoint 1743as a single-character collation element, 1744as it is greater than any other assigned collation elements 1745(in the codepoint order among the unassigned characters). 1746But, it'd be better to ignore characters 1747unfamiliar to you and maybe never used. 1748 1749Through C<undefChar>, any character matching C<qr/$undefChar/> 1750will be undefined. Through C<undefName>, any character whose name 1751(given in the C<table> file as a comment) matches C<qr/$undefName/> 1752will be undefined. 1753 1754ex. Collation weights for beyond-BMP characters are not stored in object: 1755 1756 undefChar => qr/[^\0-\x{fffd}]/, 1757 1758=item upper_before_lower 1759 1760-- see 6.6 Case Comparisons, UTS #10. 1761 1762By default, lowercase is before uppercase. 1763If the parameter is made true, this is reversed. 1764 1765B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase 1766distinctions must occur in level 3, and their weights at level 3 must be 1767same as those mentioned in 7.3.1, UTS #10. 1768If you define your collation elements which differs from this requirement, 1769this parameter doesn't work validly. 1770 1771=item variable 1772 1773-- see 3.6 Variable Weighting, UTS #10. 1774 1775This key allows for variable weighting of variable collation elements, 1776which are marked with an ASTERISK in the table 1777(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>). 1778 1779 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. 1780 1781These names are case-insensitive. 1782By default (if specification is omitted), 'shifted' is adopted. 1783 1784 'Blanked' Variable elements are made ignorable at levels 1 through 3; 1785 considered at the 4th level. 1786 1787 'Non-Ignorable' Variable elements are not reset to ignorable. 1788 1789 'Shifted' Variable elements are made ignorable at levels 1 through 3 1790 their level 4 weight is replaced by the old level 1 weight. 1791 Level 4 weight for Non-Variable elements is 0xFFFF. 1792 1793 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level 1794 are trimmed. 1795 1796=back 1797 1798=head2 Methods for Collation 1799 1800=over 4 1801 1802=item C<@sorted = $Collator-E<gt>sort(@not_sorted)> 1803 1804Sorts a list of strings. 1805 1806=item C<$result = $Collator-E<gt>cmp($a, $b)> 1807 1808Returns 1 (when C<$a> is greater than C<$b>) 1809or 0 (when C<$a> is equal to C<$b>) 1810or -1 (when C<$a> is less than C<$b>). 1811 1812=item C<$result = $Collator-E<gt>eq($a, $b)> 1813 1814=item C<$result = $Collator-E<gt>ne($a, $b)> 1815 1816=item C<$result = $Collator-E<gt>lt($a, $b)> 1817 1818=item C<$result = $Collator-E<gt>le($a, $b)> 1819 1820=item C<$result = $Collator-E<gt>gt($a, $b)> 1821 1822=item C<$result = $Collator-E<gt>ge($a, $b)> 1823 1824They works like the same name operators as theirs. 1825 1826 eq : whether $a is equal to $b. 1827 ne : whether $a is not equal to $b. 1828 lt : whether $a is less than $b. 1829 le : whether $a is less than $b or equal to $b. 1830 gt : whether $a is greater than $b. 1831 ge : whether $a is greater than $b or equal to $b. 1832 1833=item C<$sortKey = $Collator-E<gt>getSortKey($string)> 1834 1835-- see 4.3 Form Sort Key, UTS #10. 1836 1837Returns a sort key. 1838 1839You compare the sort keys using a binary comparison 1840and get the result of the comparison of the strings using UCA. 1841 1842 $Collator->getSortKey($a) cmp $Collator->getSortKey($b) 1843 1844 is equivalent to 1845 1846 $Collator->cmp($a, $b) 1847 1848=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)> 1849 1850Converts a sorting key into its representation form. 1851If C<UCA_Version> is 8, the output is slightly different. 1852 1853 use Unicode::Collate; 1854 my $c = Unicode::Collate->new(); 1855 print $c->viewSortKey("Perl"),"\n"; 1856 1857 # output: 1858 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] 1859 # Level 1 Level 2 Level 3 Level 4 1860 1861=back 1862 1863=head2 Methods for Searching 1864 1865The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work 1866like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively, 1867but they are not aware of any pattern, but only a literal substring. 1868 1869B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true 1870for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>, 1871C<subst>, C<gsubst>) is croaked, as the position and the length might 1872differ from those on the specified string. 1873 1874C<rearrange> and C<hangul_terminator> parameters are neglected. 1875C<katakana_before_hiragana> and C<upper_before_lower> don't affect 1876matching and searching, as it doesn't matter whether greater or less. 1877 1878=over 4 1879 1880=item C<$position = $Collator-E<gt>index($string, $substring[, $position])> 1881 1882=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])> 1883 1884If C<$substring> matches a part of C<$string>, returns 1885the position of the first occurrence of the matching part in scalar context; 1886in list context, returns a two-element list of 1887the position and the length of the matching part. 1888 1889If C<$substring> does not match any part of C<$string>, 1890returns C<-1> in scalar context and 1891an empty list in list context. 1892 1893e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">, 1894you say the following where C<$sub> is C<"M>E<uuml>C<SS">, 1895 1896 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1897 # (normalization => undef) is REQUIRED. 1898 my $match; 1899 if (my($pos,$len) = $Collator->index($str, $sub)) { 1900 $match = substr($str, $pos, $len); 1901 } 1902 1903and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<"> 1904is primary equal to C<"M>E<uuml>C<SS">. 1905 1906=item C<$match_ref = $Collator-E<gt>match($string, $substring)> 1907 1908=item C<($match) = $Collator-E<gt>match($string, $substring)> 1909 1910If C<$substring> matches a part of C<$string>, in scalar context, returns 1911B<a reference to> the first occurrence of the matching part 1912(C<$match_ref> is always true if matches, 1913since every reference is B<true>); 1914in list context, returns the first occurrence of the matching part. 1915 1916If C<$substring> does not match any part of C<$string>, 1917returns C<undef> in scalar context and 1918an empty list in list context. 1919 1920e.g. 1921 1922 if ($match_ref = $Collator->match($str, $sub)) { # scalar context 1923 print "matches [$$match_ref].\n"; 1924 } else { 1925 print "doesn't match.\n"; 1926 } 1927 1928 or 1929 1930 if (($match) = $Collator->match($str, $sub)) { # list context 1931 print "matches [$match].\n"; 1932 } else { 1933 print "doesn't match.\n"; 1934 } 1935 1936=item C<@match = $Collator-E<gt>gmatch($string, $substring)> 1937 1938If C<$substring> matches a part of C<$string>, returns 1939all the matching parts (or matching count in scalar context). 1940 1941If C<$substring> does not match any part of C<$string>, 1942returns an empty list. 1943 1944=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)> 1945 1946If C<$substring> matches a part of C<$string>, 1947the first occurrence of the matching part is replaced by C<$replacement> 1948(C<$string> is modified) and C<$count> (always equals to C<1>) is returned. 1949 1950C<$replacement> can be a C<CODEREF>, 1951taking the matching part as an argument, 1952and returning a string to replace the matching part 1953(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>). 1954 1955=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)> 1956 1957If C<$substring> matches a part of C<$string>, 1958all the occurrences of the matching part are replaced by C<$replacement> 1959(C<$string> is modified) and C<$count> is returned. 1960 1961C<$replacement> can be a C<CODEREF>, 1962taking the matching part as an argument, 1963and returning a string to replace the matching part 1964(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>). 1965 1966e.g. 1967 1968 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1969 # (normalization => undef) is REQUIRED. 1970 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l..."; 1971 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); 1972 1973 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>..."; 1974 # i.e., all the camels are made bold-faced. 1975 1976 Examples: levels and ignore_level2 - what does camel match? 1977 --------------------------------------------------------------------------- 1978 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l 1979 -----------------------|--------------------------------------------------- 1980 1 false | yes yes yes yes yes 1981 2 false | yes yes no yes yes 1982 3 false | yes no no yes yes 1983 4 false | yes no no no yes 1984 -----------------------|--------------------------------------------------- 1985 1 true | yes yes yes yes yes 1986 2 true | yes yes yes yes yes 1987 3 true | yes no yes yes yes 1988 4 true | yes no yes no yes 1989 --------------------------------------------------------------------------- 1990 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l 1991 at any level. 1992 1993=back 1994 1995=head2 Other Methods 1996 1997=over 4 1998 1999=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)> 2000 2001=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)> 2002 2003Changes the value of specified keys and returns the changed part. 2004 2005 $Collator = Unicode::Collate->new(level => 4); 2006 2007 $Collator->eq("perl", "PERL"); # false 2008 2009 %old = $Collator->change(level => 2); # returns (level => 4). 2010 2011 $Collator->eq("perl", "PERL"); # true 2012 2013 $Collator->change(%old); # returns (level => 2). 2014 2015 $Collator->eq("perl", "PERL"); # false 2016 2017Not all C<(key,value)>s are allowed to be changed. 2018See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>. 2019 2020In the scalar context, returns the modified collator 2021(but it is B<not> a clone from the original). 2022 2023 $Collator->change(level => 2)->eq("perl", "PERL"); # true 2024 2025 $Collator->eq("perl", "PERL"); # true; now max level is 2nd. 2026 2027 $Collator->change(level => 4)->eq("perl", "PERL"); # false 2028 2029=item C<$version = $Collator-E<gt>version()> 2030 2031Returns the version number (a string) of the Unicode Standard 2032which the C<table> file used by the collator object is based on. 2033If the table does not include a version line (starting with C<@version>), 2034returns C<"unknown">. 2035 2036=item C<UCA_Version()> 2037 2038Returns the revision number of UTS #10 this module consults, 2039that should correspond with the DUCET incorporated. 2040 2041=item C<Base_Unicode_Version()> 2042 2043Returns the version number of UTS #10 this module consults, 2044that should correspond with the DUCET incorporated. 2045 2046=back 2047 2048=head1 EXPORT 2049 2050No method will be exported. 2051 2052=head1 INSTALL 2053 2054Though this module can be used without any C<table> file, 2055to use this module easily, it is recommended to install a table file 2056in the UCA format, by copying it under the directory 2057<a place in @INC>/Unicode/Collate. 2058 2059The most preferable one is "The Default Unicode Collation Element Table" 2060(aka DUCET), available from the Unicode Consortium's website: 2061 2062 http://www.unicode.org/Public/UCA/ 2063 2064 http://www.unicode.org/Public/UCA/latest/allkeys.txt 2065 (latest version) 2066 2067If DUCET is not installed, it is recommended to copy the file 2068from http://www.unicode.org/Public/UCA/latest/allkeys.txt 2069to <a place in @INC>/Unicode/Collate/allkeys.txt 2070manually. 2071 2072=head1 CAVEATS 2073 2074=over 4 2075 2076=item Normalization 2077 2078Use of the C<normalization> parameter requires the B<Unicode::Normalize> 2079module (see L<Unicode::Normalize>). 2080 2081If you need not it (say, in the case when you need not 2082handle any combining characters), 2083assign C<(normalization =E<gt> undef)> explicitly. 2084 2085-- see 6.5 Avoiding Normalization, UTS #10. 2086 2087=item Conformance Test 2088 2089The Conformance Test for the UCA is available 2090under L<http://www.unicode.org/Public/UCA/>. 2091 2092For F<CollationTest_SHIFTED.txt>, 2093a collator via C<Unicode::Collate-E<gt>new( )> should be used; 2094for F<CollationTest_NON_IGNORABLE.txt>, a collator via 2095C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. 2096 2097If C<UCA_Version> is 26 or later, the C<identical> level is preferred; 2098C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and 2099C<Unicode::Collate-E<gt>new(identical =E<gt> 1,> 2100C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used. 2101 2102B<Unicode::Normalize is required to try The Conformance Test.> 2103 2104=back 2105 2106=head1 AUTHOR, COPYRIGHT AND LICENSE 2107 2108The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, 2109<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2020, 2110SADAHIRO Tomoyuki. Japan. All rights reserved. 2111 2112This module is free software; you can redistribute it and/or 2113modify it under the same terms as Perl itself. 2114 2115The file Unicode/Collate/allkeys.txt was copied verbatim 2116from L<http://www.unicode.org/Public/UCA/13.0.0/allkeys.txt>. 2117For this file, Copyright (c) 2020 Unicode, Inc.; distributed 2118under the Terms of Use in L<http://www.unicode.org/terms_of_use.html> 2119 2120=head1 SEE ALSO 2121 2122=over 4 2123 2124=item Unicode Collation Algorithm - UTS #10 2125 2126L<http://www.unicode.org/reports/tr10/> 2127 2128=item The Default Unicode Collation Element Table (DUCET) 2129 2130L<http://www.unicode.org/Public/UCA/latest/allkeys.txt> 2131 2132=item The conformance test for the UCA 2133 2134L<http://www.unicode.org/Public/UCA/latest/CollationTest.html> 2135 2136L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip> 2137 2138=item Hangul Syllable Type 2139 2140L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> 2141 2142=item Unicode Normalization Forms - UAX #15 2143 2144L<http://www.unicode.org/reports/tr15/> 2145 2146=item Unicode Locale Data Markup Language (LDML) - UTS #35 2147 2148L<http://www.unicode.org/reports/tr35/> 2149 2150=back 2151 2152=cut 2153