1package Unicode::Collate; 2 3BEGIN { 4 unless ("A" eq pack('U', 0x41)) { 5 die "Unicode::Collate cannot stringify a Unicode code point\n"; 6 } 7 unless (0x41 == unpack('U', 'A')) { 8 die "Unicode::Collate cannot get a Unicode code point\n"; 9 } 10} 11 12use 5.006; 13use strict; 14use warnings; 15use Carp; 16use File::Spec; 17 18no warnings 'utf8'; 19 20our $VERSION = '1.27'; 21our $PACKAGE = __PACKAGE__; 22 23### begin XS only ### 24use XSLoader (); 25XSLoader::load('Unicode::Collate', $VERSION); 26### end XS only ### 27 28my @Path = qw(Unicode Collate); 29my $KeyFile = 'allkeys.txt'; 30 31# Perl's boolean 32use constant TRUE => 1; 33use constant FALSE => ""; 34use constant NOMATCHPOS => -1; 35 36# A coderef to get combining class imported from Unicode::Normalize 37# (i.e. \&Unicode::Normalize::getCombinClass). 38# This is also used as a HAS_UNICODE_NORMALIZE flag. 39my $CVgetCombinClass; 40 41# Supported Levels 42use constant MinLevel => 1; 43use constant MaxLevel => 4; 44 45# Minimum weights at level 2 and 3, respectively 46use constant Min2Wt => 0x20; 47use constant Min3Wt => 0x02; 48 49# Shifted weight at 4th level 50use constant Shift4Wt => 0xFFFF; 51 52# A boolean for Variable and 16-bit weights at 4 levels of Collation Element 53use constant VCE_TEMPLATE => 'Cn4'; 54 55# A sort key: 16-bit weights 56use constant KEY_TEMPLATE => 'n*'; 57 58# The tie-breaking: 32-bit weights 59use constant TIE_TEMPLATE => 'N*'; 60 61# Level separator in a sort key: 62# i.e. pack(KEY_TEMPLATE, 0) 63use constant LEVEL_SEP => "\0\0"; 64 65# As Unicode code point separator for hash keys. 66# A joined code point string (denoted by JCPS below) 67# like "65;768" is used for internal processing 68# instead of Perl's Unicode string like "\x41\x{300}", 69# as the native code point is different from the Unicode code point 70# on EBCDIC platform. 71# This character must not be included in any stringified 72# representation of an integer. 73use constant CODE_SEP => ';'; 74 # NOTE: in regex /;/ is used for $jcps! 75 76# boolean values of variable weights 77use constant NON_VAR => 0; # Non-Variable character 78use constant VAR => 1; # Variable character 79 80# specific code points 81use constant Hangul_SIni => 0xAC00; 82use constant Hangul_SFin => 0xD7A3; 83 84# Logical_Order_Exception in PropList.txt 85my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; 86 87# for highestFFFF and minimalFFFE 88my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF); 89my $minimalVCE = pack(VCE_TEMPLATE, 0, 1, 0x20, 0x5, 0xFFFE); 90 91sub UCA_Version { '36' } 92 93sub Base_Unicode_Version { '10.0.0' } 94 95###### 96 97sub pack_U { 98 return pack('U*', @_); 99} 100 101sub unpack_U { 102 return unpack('U*', shift(@_).pack('U*')); 103} 104 105###### 106 107my (%VariableOK); 108@VariableOK{ qw/ 109 blanked non-ignorable shifted shift-trimmed 110 / } = (); # keys lowercased 111 112our @ChangeOK = qw/ 113 alternate backwards level normalization rearrange 114 katakana_before_hiragana upper_before_lower ignore_level2 115 overrideCJK overrideHangul overrideOut preprocess UCA_Version 116 hangul_terminator variable identical highestFFFF minimalFFFE 117 long_contraction 118 /; 119 120our @ChangeNG = qw/ 121 entry mapping table maxlength contraction 122 ignoreChar ignoreName undefChar undefName rewrite 123 versionTable alternateTable backwardsTable forwardsTable 124 rearrangeTable variableTable 125 derivCode normCode rearrangeHash backwardsFlag 126 suppress suppressHash 127 __useXS /; ### XS only 128# The hash key 'ignored' was deleted at v 0.21. 129# The hash key 'isShift' was deleted at v 0.23. 130# The hash key 'combining' was deleted at v 0.24. 131# The hash key 'entries' was deleted at v 0.30. 132# The hash key 'L3_ignorable' was deleted at v 0.40. 133 134sub version { 135 my $self = shift; 136 return $self->{versionTable} || 'unknown'; 137} 138 139my (%ChangeOK, %ChangeNG); 140@ChangeOK{ @ChangeOK } = (); 141@ChangeNG{ @ChangeNG } = (); 142 143sub change { 144 my $self = shift; 145 my %hash = @_; 146 my %old; 147 if (exists $hash{alternate}) { 148 if (exists $hash{variable}) { 149 delete $hash{alternate}; 150 } else { 151 $hash{variable} = $hash{alternate}; 152 } 153 } 154 foreach my $k (keys %hash) { 155 if (exists $ChangeOK{$k}) { 156 $old{$k} = $self->{$k}; 157 $self->{$k} = $hash{$k}; 158 } elsif (exists $ChangeNG{$k}) { 159 croak "change of $k via change() is not allowed!"; 160 } 161 # else => ignored 162 } 163 $self->checkCollator(); 164 return wantarray ? %old : $self; 165} 166 167sub _checkLevel { 168 my $level = shift; 169 my $key = shift; # 'level' or 'backwards' 170 MinLevel <= $level or croak sprintf 171 "Illegal level %d (in value for key '%s') lower than %d.", 172 $level, $key, MinLevel; 173 $level <= MaxLevel or croak sprintf 174 "Unsupported level %d (in value for key '%s') higher than %d.", 175 $level, $key, MaxLevel; 176} 177 178my %DerivCode = ( 179 8 => \&_derivCE_8, 180 9 => \&_derivCE_9, 181 11 => \&_derivCE_9, # 11 == 9 182 14 => \&_derivCE_14, 183 16 => \&_derivCE_14, # 16 == 14 184 18 => \&_derivCE_18, 185 20 => \&_derivCE_20, 186 22 => \&_derivCE_22, 187 24 => \&_derivCE_24, 188 26 => \&_derivCE_24, # 26 == 24 189 28 => \&_derivCE_24, # 28 == 24 190 30 => \&_derivCE_24, # 30 == 24 191 32 => \&_derivCE_32, 192 34 => \&_derivCE_34, 193 36 => \&_derivCE_36, 194); 195 196sub checkCollator { 197 my $self = shift; 198 _checkLevel($self->{level}, 'level'); 199 200 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} } 201 or croak "Illegal UCA version (passed $self->{UCA_Version})."; 202 203 $self->{variable} ||= $self->{alternate} || $self->{variableTable} || 204 $self->{alternateTable} || 'shifted'; 205 $self->{variable} = $self->{alternate} = lc($self->{variable}); 206 exists $VariableOK{ $self->{variable} } 207 or croak "$PACKAGE unknown variable parameter name: $self->{variable}"; 208 209 if (! defined $self->{backwards}) { 210 $self->{backwardsFlag} = 0; 211 } elsif (! ref $self->{backwards}) { 212 _checkLevel($self->{backwards}, 'backwards'); 213 $self->{backwardsFlag} = 1 << $self->{backwards}; 214 } else { 215 my %level; 216 $self->{backwardsFlag} = 0; 217 for my $b (@{ $self->{backwards} }) { 218 _checkLevel($b, 'backwards'); 219 $level{$b} = 1; 220 } 221 for my $v (sort keys %level) { 222 $self->{backwardsFlag} += 1 << $v; 223 } 224 } 225 226 defined $self->{rearrange} or $self->{rearrange} = []; 227 ref $self->{rearrange} 228 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"; 229 230 # keys of $self->{rearrangeHash} are $self->{rearrange}. 231 $self->{rearrangeHash} = undef; 232 233 if (@{ $self->{rearrange} }) { 234 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = (); 235 } 236 237 $self->{normCode} = undef; 238 239 if (defined $self->{normalization}) { 240 eval { require Unicode::Normalize }; 241 $@ and croak "Unicode::Normalize is required to normalize strings"; 242 243 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass; 244 245 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default 246 $self->{normCode} = \&Unicode::Normalize::NFD; 247 } 248 elsif ($self->{normalization} ne 'prenormalized') { 249 my $norm = $self->{normalization}; 250 $self->{normCode} = sub { 251 Unicode::Normalize::normalize($norm, shift); 252 }; 253 eval { $self->{normCode}->("") }; # try 254 $@ and croak "$PACKAGE unknown normalization form name: $norm"; 255 } 256 } 257 return; 258} 259 260sub new 261{ 262 my $class = shift; 263 my $self = bless { @_ }, $class; 264 265### begin XS only ### 266 if (! exists $self->{table} && !defined $self->{rewrite} && 267 !defined $self->{undefName} && !defined $self->{ignoreName} && 268 !defined $self->{undefChar} && !defined $self->{ignoreChar}) { 269 $self->{__useXS} = \&_fetch_simple; 270 } else { 271 $self->{__useXS} = undef; 272 } 273### end XS only ### 274 275 # keys of $self->{suppressHash} are $self->{suppress}. 276 if ($self->{suppress} && @{ $self->{suppress} }) { 277 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = (); 278 } # before read_table() 279 280 # If undef is passed explicitly, no file is read. 281 $self->{table} = $KeyFile if ! exists $self->{table}; 282 $self->read_table() if defined $self->{table}; 283 284 if ($self->{entry}) { 285 while ($self->{entry} =~ /([^\n]+)/g) { 286 $self->parseEntry($1, TRUE); 287 } 288 } 289 290 # only in new(), not in change() 291 $self->{level} ||= MaxLevel; 292 $self->{UCA_Version} ||= UCA_Version(); 293 294 $self->{overrideHangul} = FALSE 295 if ! exists $self->{overrideHangul}; 296 $self->{overrideCJK} = FALSE 297 if ! exists $self->{overrideCJK}; 298 $self->{normalization} = 'NFD' 299 if ! exists $self->{normalization}; 300 $self->{rearrange} = $self->{rearrangeTable} || 301 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []) 302 if ! exists $self->{rearrange}; 303 $self->{backwards} = $self->{backwardsTable} 304 if ! exists $self->{backwards}; 305 exists $self->{long_contraction} or $self->{long_contraction} 306 = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24; 307 308 # checkCollator() will be called in change() 309 $self->checkCollator(); 310 311 return $self; 312} 313 314sub parseAtmark { 315 my $self = shift; 316 my $line = shift; # after s/^\s*\@// 317 318 if ($line =~ /^version\s*(\S*)/) { 319 $self->{versionTable} ||= $1; 320 } 321 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9 322 $self->{variableTable} ||= $1; 323 } 324 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8 325 $self->{alternateTable} ||= $1; 326 } 327 elsif ($line =~ /^backwards\s+(\S*)/) { 328 push @{ $self->{backwardsTable} }, $1; 329 } 330 elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use 331 push @{ $self->{forwardsTable} }, $1; 332 } 333 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG 334 push @{ $self->{rearrangeTable} }, _getHexArray($1); 335 } 336} 337 338sub read_table { 339 my $self = shift; 340 341### begin XS only ### 342 if ($self->{__useXS}) { 343 my @rest = _fetch_rest(); # complex matter need to parse 344 for my $line (@rest) { 345 next if $line =~ /^\s*#/; 346 347 if ($line =~ s/^\s*\@//) { 348 $self->parseAtmark($line); 349 } else { 350 $self->parseEntry($line); 351 } 352 } 353 return; 354 } 355### end XS only ### 356 357 my($f, $fh); 358 foreach my $d (@INC) { 359 $f = File::Spec->catfile($d, @Path, $self->{table}); 360 last if open($fh, $f); 361 $f = undef; 362 } 363 if (!defined $f) { 364 $f = File::Spec->catfile(@Path, $self->{table}); 365 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)"); 366 } 367 368 while (my $line = <$fh>) { 369 next if $line =~ /^\s*#/; 370 371 if ($line =~ s/^\s*\@//) { 372 $self->parseAtmark($line); 373 } else { 374 $self->parseEntry($line); 375 } 376 } 377 close $fh; 378} 379 380 381## 382## get $line, parse it, and write an entry in $self 383## 384sub parseEntry 385{ 386 my $self = shift; 387 my $line = shift; 388 my $tailoring = shift; 389 my($name, $entry, @uv, @key); 390 391 if (defined $self->{rewrite}) { 392 $line = $self->{rewrite}->($line); 393 } 394 395 return if $line !~ /^\s*[0-9A-Fa-f]/; 396 397 # removes comment and gets name 398 $name = $1 399 if $line =~ s/[#%]\s*(.*)//; 400 return if defined $self->{undefName} && $name =~ /$self->{undefName}/; 401 402 # gets element 403 my($e, $k) = split /;/, $line; 404 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>" 405 if ! $k; 406 407 @uv = _getHexArray($e); 408 return if !@uv; 409 return if @uv > 1 && $self->{suppressHash} && !$tailoring && 410 exists $self->{suppressHash}{$uv[0]}; 411 $entry = join(CODE_SEP, @uv); # in JCPS 412 413 if (defined $self->{undefChar} || defined $self->{ignoreChar}) { 414 my $ele = pack_U(@uv); 415 416 # regarded as if it were not stored in the table 417 return 418 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; 419 420 # replaced as completely ignorable 421 $k = '[.0000.0000.0000.0000]' 422 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/; 423 } 424 425 # replaced as completely ignorable 426 $k = '[.0000.0000.0000.0000]' 427 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/; 428 429 my $is_L3_ignorable = TRUE; 430 431 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed 432 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. 433 my @wt = _getHexArray($arr); 434 push @key, pack(VCE_TEMPLATE, $var, @wt); 435 $is_L3_ignorable = FALSE 436 if $wt[0] || $wt[1] || $wt[2]; 437 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable 438 # is completely ignorable. 439 # For expansion, an entry $is_L3_ignorable 440 # if and only if "all" CEs are [.0000.0000.0000]. 441 } 442 443 # mapping: be an array ref or not exists (any false value is disallowed) 444 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key; 445 446 # maxlength: be more than 1 or not exists (any false value is disallowed) 447 if (@uv > 1) { 448 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) { 449 $self->{maxlength}{$uv[0]} = @uv; 450 } 451 } 452 453 # contraction: be 1 or not exists (any false value is disallowed) 454 while (@uv > 2) { 455 pop @uv; 456 my $fake_entry = join(CODE_SEP, @uv); # in JCPS 457 $self->{contraction}{$fake_entry} = 1; 458 } 459} 460 461 462sub viewSortKey 463{ 464 my $self = shift; 465 my $str = shift; 466 $self->visualizeSortKey($self->getSortKey($str)); 467} 468 469 470sub process 471{ 472 my $self = shift; 473 my $str = shift; 474 my $prep = $self->{preprocess}; 475 my $norm = $self->{normCode}; 476 477 $str = &$prep($str) if ref $prep; 478 $str = &$norm($str) if ref $norm; 479 return $str; 480} 481 482## 483## arrayref of JCPS = splitEnt(string to be collated) 484## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE) 485## 486sub splitEnt 487{ 488 my $self = shift; 489 my $str = shift; 490 my $wLen = shift; # with Length 491 492 my $map = $self->{mapping}; 493 my $max = $self->{maxlength}; 494 my $reH = $self->{rearrangeHash}; 495 my $vers = $self->{UCA_Version}; 496 my $ver9 = $vers >= 9 && $vers <= 11; 497 my $long = $self->{long_contraction}; 498 my $uXS = $self->{__useXS}; ### XS only 499 500 my @buf; 501 502 # get array of Unicode code point of string. 503 my @src = unpack_U($str); 504 505 # rearrangement: 506 # Character positions are not kept if rearranged, 507 # then neglected if $wLen is true. 508 if ($reH && ! $wLen) { 509 for (my $i = 0; $i < @src; $i++) { 510 if (exists $reH->{ $src[$i] } && $i + 1 < @src) { 511 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]); 512 $i++; 513 } 514 } 515 } 516 517 # remove a code point marked as a completely ignorable. 518 for (my $i = 0; $i < @src; $i++) { 519 if ($vers <= 20 && _isIllegal($src[$i])) { 520 $src[$i] = undef; 521 } elsif ($ver9) { 522 $src[$i] = undef if exists $map->{ $src[$i] } 523 ? @{ $map->{ $src[$i] } } == 0 524 : $uXS && _ignorable_simple($src[$i]); ### XS only 525 } 526 } 527 528 for (my $i = 0; $i < @src; $i++) { 529 my $jcps = $src[$i]; 530 531 # skip removed code point 532 if (! defined $jcps) { 533 if ($wLen && @buf) { 534 $buf[-1][2] = $i + 1; 535 } 536 next; 537 } 538 539 my $i_orig = $i; 540 541 # find contraction 542 if (exists $max->{$jcps}) { 543 my $temp_jcps = $jcps; 544 my $jcpsLen = 1; 545 my $maxLen = $max->{$jcps}; 546 547 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) { 548 next if ! defined $src[$p]; 549 $temp_jcps .= CODE_SEP . $src[$p]; 550 $jcpsLen++; 551 if (exists $map->{$temp_jcps}) { 552 $jcps = $temp_jcps; 553 $i = $p; 554 } 555 } 556 557 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1). 558 # This process requires Unicode::Normalize. 559 # If "normalization" is undef, here should be skipped *always* 560 # (in spite of bool value of $CVgetCombinClass), 561 # since canonical ordering cannot be expected. 562 # Blocked combining character should not be contracted. 563 564 # $self->{normCode} is false in the case of "prenormalized". 565 if ($self->{normalization}) { 566 my $cont = $self->{contraction}; 567 my $preCC = 0; 568 my $preCC_uc = 0; 569 my $jcps_uc = $jcps; 570 my(@out, @out_uc); 571 572 for (my $p = $i + 1; $p < @src; $p++) { 573 next if ! defined $src[$p]; 574 my $curCC = $CVgetCombinClass->($src[$p]); 575 last unless $curCC; 576 my $tail = CODE_SEP . $src[$p]; 577 578 if ($preCC != $curCC && exists $map->{$jcps.$tail}) { 579 $jcps .= $tail; 580 push @out, $p; 581 } else { 582 $preCC = $curCC; 583 } 584 585 next if !$long; 586 587 if ($preCC_uc != $curCC && 588 (exists $map->{$jcps_uc.$tail} || 589 exists $cont->{$jcps_uc.$tail})) { 590 $jcps_uc .= $tail; 591 push @out_uc, $p; 592 } else { 593 $preCC_uc = $curCC; 594 } 595 } 596 597 if (@out_uc && exists $map->{$jcps_uc}) { 598 $jcps = $jcps_uc; 599 $src[$_] = undef for @out_uc; 600 } else { 601 $src[$_] = undef for @out; 602 } 603 } 604 } 605 606 # skip completely ignorable 607 if (exists $map->{$jcps} ? @{ $map->{$jcps} } == 0 : 608 $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only 609 if ($wLen && @buf) { 610 $buf[-1][2] = $i + 1; 611 } 612 next; 613 } 614 615 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps; 616 } 617 return \@buf; 618} 619 620## 621## VCE = _pack_override(input, codepoint, derivCode) 622## 623sub _pack_override ($$$) { 624 my $r = shift; 625 my $u = shift; 626 my $der = shift; 627 628 if (ref $r) { 629 return pack(VCE_TEMPLATE, NON_VAR, @$r); 630 } elsif (defined $r) { 631 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u); 632 } else { 633 $u = 0xFFFD if 0x10FFFF < $u; 634 return $der->($u); 635 } 636} 637 638## 639## list of VCE = getWt(JCPS) 640## 641sub getWt 642{ 643 my $self = shift; 644 my $u = shift; 645 my $map = $self->{mapping}; 646 my $der = $self->{derivCode}; 647 my $out = $self->{overrideOut}; 648 my $uXS = $self->{__useXS}; ### XS only 649 650 return if !defined $u; 651 return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF}; 652 return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE}; 653 $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out; 654 655 my @ce; 656 if (exists $map->{$u}) { 657 @ce = @{ $map->{$u} }; # $u may be a contraction 658### begin XS only ### 659 } elsif ($uXS && _exists_simple($u)) { 660 @ce = _fetch_simple($u); 661### end XS only ### 662 } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) { 663 my $hang = $self->{overrideHangul}; 664 if ($hang) { 665 @ce = map _pack_override($_, $u, $der), $hang->($u); 666 } elsif (!defined $hang) { 667 @ce = $der->($u); 668 } else { 669 my $max = $self->{maxlength}; 670 my @decH = _decompHangul($u); 671 672 if (@decH == 2) { 673 my $contract = join(CODE_SEP, @decH); 674 @decH = ($contract) if exists $map->{$contract}; 675 } else { # must be <@decH == 3> 676 if (exists $max->{$decH[0]}) { 677 my $contract = join(CODE_SEP, @decH); 678 if (exists $map->{$contract}) { 679 @decH = ($contract); 680 } else { 681 $contract = join(CODE_SEP, @decH[0,1]); 682 exists $map->{$contract} and @decH = ($contract, $decH[2]); 683 } 684 # even if V's ignorable, LT contraction is not supported. 685 # If such a situation were required, NFD should be used. 686 } 687 if (@decH == 3 && exists $max->{$decH[1]}) { 688 my $contract = join(CODE_SEP, @decH[1,2]); 689 exists $map->{$contract} and @decH = ($decH[0], $contract); 690 } 691 } 692 693 @ce = map({ 694 exists $map->{$_} ? @{ $map->{$_} } : 695 $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only 696 $der->($_); 697 } @decH); 698 } 699 } elsif ($out && 0x10FFFF < $u) { 700 @ce = map _pack_override($_, $u, $der), $out->($u); 701 } else { 702 my $cjk = $self->{overrideCJK}; 703 my $vers = $self->{UCA_Version}; 704 if ($cjk && _isUIdeo($u, $vers)) { 705 @ce = map _pack_override($_, $u, $der), $cjk->($u); 706 } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) { 707 @ce = _uideoCE_8($u); 708 } else { 709 @ce = $der->($u); 710 } 711 } 712 return map $self->varCE($_), @ce; 713} 714 715 716## 717## string sortkey = getSortKey(string arg) 718## 719sub getSortKey 720{ 721 my $self = shift; 722 my $orig = shift; 723 my $str = $self->process($orig); 724 my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS 725 my $vers = $self->{UCA_Version}; 726 my $term = $self->{hangul_terminator}; 727 my $lev = $self->{level}; 728 my $iden = $self->{identical}; 729 730 my @buf; # weight arrays 731 if ($term) { 732 my $preHST = ''; 733 my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0)); 734 foreach my $jcps (@$rEnt) { 735 # weird things like VL, TL-contraction are not considered! 736 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps; 737 if ($preHST && !$curHST || # hangul before non-hangul 738 $preHST =~ /L\z/ && $curHST =~ /^T/ || 739 $preHST =~ /V\z/ && $curHST =~ /^L/ || 740 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) { 741 push @buf, $termCE; 742 } 743 $preHST = $curHST; 744 push @buf, $self->getWt($jcps); 745 } 746 push @buf, $termCE if $preHST; # end at hangul 747 } else { 748 foreach my $jcps (@$rEnt) { 749 push @buf, $self->getWt($jcps); 750 } 751 } 752 753 my $rkey = $self->mk_SortKey(\@buf); ### XS only 754 755 if ($iden || $vers >= 26 && $lev == MaxLevel) { 756 $rkey .= LEVEL_SEP; 757 $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden; 758 } 759 return $rkey; 760} 761 762 763## 764## int compare = cmp(string a, string b) 765## 766sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) } 767sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) } 768sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) } 769sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) } 770sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) } 771sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) } 772sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) } 773 774## 775## list[strings] sorted = sort(list[strings] arg) 776## 777sub sort { 778 my $obj = shift; 779 return 780 map { $_->[1] } 781 sort{ $a->[0] cmp $b->[0] } 782 map [ $obj->getSortKey($_), $_ ], @_; 783} 784 785 786## 787## bool _nonIgnorAtLevel(arrayref weights, int level) 788## 789sub _nonIgnorAtLevel($$) 790{ 791 my $wt = shift; 792 return if ! defined $wt; 793 my $lv = shift; 794 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE; 795} 796 797## 798## bool _eqArray( 799## arrayref of arrayref[weights] source, 800## arrayref of arrayref[weights] substr, 801## int level) 802## * comparison of graphemes vs graphemes. 803## @$source >= @$substr must be true (check it before call this); 804## 805sub _eqArray($$$) 806{ 807 my $source = shift; 808 my $substr = shift; 809 my $lev = shift; 810 811 for my $g (0..@$substr-1){ 812 # Do the $g'th graphemes have the same number of AV weights? 813 return if @{ $source->[$g] } != @{ $substr->[$g] }; 814 815 for my $w (0..@{ $substr->[$g] }-1) { 816 for my $v (0..$lev-1) { 817 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v]; 818 } 819 } 820 } 821 return 1; 822} 823 824## 825## (int position, int length) 826## int position = index(string, substring, position, [undoc'ed global]) 827## 828## With "global" (only for the list context), 829## returns list of arrayref[position, length]. 830## 831sub index 832{ 833 my $self = shift; 834 $self->{preprocess} and 835 croak "Don't use Preprocess with index(), match(), etc."; 836 $self->{normCode} and 837 croak "Don't use Normalization with index(), match(), etc."; 838 839 my $str = shift; 840 my $len = length($str); 841 my $sub = shift; 842 my $subE = $self->splitEnt($sub); 843 my $pos = @_ ? shift : 0; 844 $pos = 0 if $pos < 0; 845 my $glob = shift; 846 847 my $lev = $self->{level}; 848 my $v2i = $self->{UCA_Version} >= 9 && 849 $self->{variable} ne 'non-ignorable'; 850 851 if (! @$subE) { 852 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos; 853 return $glob 854 ? map([$_, 0], $temp..$len) 855 : wantarray ? ($temp,0) : $temp; 856 } 857 $len < $pos 858 and return wantarray ? () : NOMATCHPOS; 859 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE); 860 @$strE 861 or return wantarray ? () : NOMATCHPOS; 862 863 my(@strWt, @iniPos, @finPos, @subWt, @g_ret); 864 865 my $last_is_variable; 866 for my $vwt (map $self->getWt($_), @$subE) { 867 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 868 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 869 870 # "Ignorable (L1, L2) after Variable" since track. v. 9 871 if ($v2i) { 872 if ($var) { 873 $last_is_variable = TRUE; 874 } 875 elsif (!$wt[0]) { # ignorable 876 $to_be_pushed = FALSE if $last_is_variable; 877 } 878 else { 879 $last_is_variable = FALSE; 880 } 881 } 882 883 if (@subWt && !$var && !$wt[0]) { 884 push @{ $subWt[-1] }, \@wt if $to_be_pushed; 885 } elsif ($to_be_pushed) { 886 push @subWt, [ \@wt ]; 887 } 888 # else ===> skipped 889 } 890 891 my $count = 0; 892 my $end = @$strE - 1; 893 894 $last_is_variable = FALSE; # reuse 895 for (my $i = 0; $i <= $end; ) { # no $i++ 896 my $found_base = 0; 897 898 # fetch a grapheme 899 while ($i <= $end && $found_base == 0) { 900 for my $vwt ($self->getWt($strE->[$i][0])) { 901 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 902 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 903 904 # "Ignorable (L1, L2) after Variable" since track. v. 9 905 if ($v2i) { 906 if ($var) { 907 $last_is_variable = TRUE; 908 } 909 elsif (!$wt[0]) { # ignorable 910 $to_be_pushed = FALSE if $last_is_variable; 911 } 912 else { 913 $last_is_variable = FALSE; 914 } 915 } 916 917 if (@strWt && !$var && !$wt[0]) { 918 push @{ $strWt[-1] }, \@wt if $to_be_pushed; 919 $finPos[-1] = $strE->[$i][2]; 920 } elsif ($to_be_pushed) { 921 push @strWt, [ \@wt ]; 922 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1]; 923 $finPos[-1] = NOMATCHPOS if $found_base; 924 push @finPos, $strE->[$i][2]; 925 $found_base++; 926 } 927 # else ===> no-op 928 } 929 $i++; 930 } 931 932 # try to match 933 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) { 934 if ($iniPos[0] != NOMATCHPOS && 935 $finPos[$#subWt] != NOMATCHPOS && 936 _eqArray(\@strWt, \@subWt, $lev)) { 937 my $temp = $iniPos[0] + $pos; 938 939 if ($glob) { 940 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]]; 941 splice @strWt, 0, $#subWt; 942 splice @iniPos, 0, $#subWt; 943 splice @finPos, 0, $#subWt; 944 } 945 else { 946 return wantarray 947 ? ($temp, $finPos[$#subWt] - $iniPos[0]) 948 : $temp; 949 } 950 } 951 shift @strWt; 952 shift @iniPos; 953 shift @finPos; 954 } 955 } 956 957 return $glob 958 ? @g_ret 959 : wantarray ? () : NOMATCHPOS; 960} 961 962## 963## scalarref to matching part = match(string, substring) 964## 965sub match 966{ 967 my $self = shift; 968 if (my($pos,$len) = $self->index($_[0], $_[1])) { 969 my $temp = substr($_[0], $pos, $len); 970 return wantarray ? $temp : \$temp; 971 # An lvalue ref \substr should be avoided, 972 # since its value is affected by modification of its referent. 973 } 974 else { 975 return; 976 } 977} 978 979## 980## arrayref matching parts = gmatch(string, substring) 981## 982sub gmatch 983{ 984 my $self = shift; 985 my $str = shift; 986 my $sub = shift; 987 return map substr($str, $_->[0], $_->[1]), 988 $self->index($str, $sub, 0, 'g'); 989} 990 991## 992## bool subst'ed = subst(string, substring, replace) 993## 994sub subst 995{ 996 my $self = shift; 997 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 998 999 if (my($pos,$len) = $self->index($_[0], $_[1])) { 1000 if ($code) { 1001 my $mat = substr($_[0], $pos, $len); 1002 substr($_[0], $pos, $len, $code->($mat)); 1003 } else { 1004 substr($_[0], $pos, $len, $_[2]); 1005 } 1006 return TRUE; 1007 } 1008 else { 1009 return FALSE; 1010 } 1011} 1012 1013## 1014## int count = gsubst(string, substring, replace) 1015## 1016sub gsubst 1017{ 1018 my $self = shift; 1019 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1020 my $cnt = 0; 1021 1022 # Replacement is carried out from the end, then use reverse. 1023 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) { 1024 if ($code) { 1025 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]); 1026 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat)); 1027 } else { 1028 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]); 1029 } 1030 $cnt++; 1031 } 1032 return $cnt; 1033} 1034 10351; 1036__END__ 1037 1038=head1 NAME 1039 1040Unicode::Collate - Unicode Collation Algorithm 1041 1042=head1 SYNOPSIS 1043 1044 use Unicode::Collate; 1045 1046 #construct 1047 $Collator = Unicode::Collate->new(%tailoring); 1048 1049 #sort 1050 @sorted = $Collator->sort(@not_sorted); 1051 1052 #compare 1053 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. 1054 1055B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted 1056according to Perl's Unicode support. See L<perlunicode>, 1057L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1058Otherwise you can use C<preprocess> or should decode them before. 1059 1060=head1 DESCRIPTION 1061 1062This module is an implementation of Unicode Technical Standard #10 1063(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA). 1064 1065=head2 Constructor and Tailoring 1066 1067The C<new> method returns a collator object. If new() is called 1068with no parameters, the collator should do the default collation. 1069 1070 $Collator = Unicode::Collate->new( 1071 UCA_Version => $UCA_Version, 1072 alternate => $alternate, # alias for 'variable' 1073 backwards => $levelNumber, # or \@levelNumbers 1074 entry => $element, 1075 hangul_terminator => $term_primary_weight, 1076 highestFFFF => $bool, 1077 identical => $bool, 1078 ignoreName => qr/$ignoreName/, 1079 ignoreChar => qr/$ignoreChar/, 1080 ignore_level2 => $bool, 1081 katakana_before_hiragana => $bool, 1082 level => $collationLevel, 1083 long_contraction => $bool, 1084 minimalFFFE => $bool, 1085 normalization => $normalization_form, 1086 overrideCJK => \&overrideCJK, 1087 overrideHangul => \&overrideHangul, 1088 preprocess => \&preprocess, 1089 rearrange => \@charList, 1090 rewrite => \&rewrite, 1091 suppress => \@charList, 1092 table => $filename, 1093 undefName => qr/$undefName/, 1094 undefChar => qr/$undefChar/, 1095 upper_before_lower => $bool, 1096 variable => $variable, 1097 ); 1098 1099=over 4 1100 1101=item UCA_Version 1102 1103If the revision (previously "tracking version") number of UCA is given, 1104behavior of that revision is emulated on collating. 1105If omitted, the return value of C<UCA_Version()> is used. 1106 1107The following revisions are supported. The default is 36. 1108 1109 UCA Unicode Standard DUCET (@version) 1110 ------------------------------------------------------- 1111 8 3.1 3.0.1 (3.0.1d9) 1112 9 3.1 with Corrigendum 3 3.1.1 (3.1.1) 1113 11 4.0 4.0.0 (4.0.0) 1114 14 4.1.0 4.1.0 (4.1.0) 1115 16 5.0 5.0.0 (5.0.0) 1116 18 5.1.0 5.1.0 (5.1.0) 1117 20 5.2.0 5.2.0 (5.2.0) 1118 22 6.0.0 6.0.0 (6.0.0) 1119 24 6.1.0 6.1.0 (6.1.0) 1120 26 6.2.0 6.2.0 (6.2.0) 1121 28 6.3.0 6.3.0 (6.3.0) 1122 30 7.0.0 7.0.0 (7.0.0) 1123 32 8.0.0 8.0.0 (8.0.0) 1124 34 9.0.0 9.0.0 (9.0.0) 1125 36 10.0.0 10.0.0(10.0.0) 1126 1127* See below for C<long_contraction> with C<UCA_Version> 22 and 24. 1128 1129* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden 1130since C<UCA_Version> 22. 1131 1132* Out-of-range codepoints (greater than U+10FFFF) are not ignored, 1133and can be overridden since C<UCA_Version> 22. 1134 1135* Fully ignorable characters were ignored, and would not interrupt 1136contractions with C<UCA_Version> 9 and 11. 1137 1138* Treatment of ignorables after variables and some behaviors 1139were changed at C<UCA_Version> 9. 1140 1141* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>) 1142depend on C<UCA_Version>. 1143 1144* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect 1145C<hangul_terminator>. 1146 1147=item alternate 1148 1149-- see 3.2.2 Alternate Weighting, version 8 of UTS #10 1150 1151For backward compatibility, C<alternate> (old name) can be used 1152as an alias for C<variable>. 1153 1154=item backwards 1155 1156-- see 3.4 Backward Accents, UTS #10. 1157 1158 backwards => $levelNumber or \@levelNumbers 1159 1160Weights in reverse order; ex. level 2 (diacritic ordering) in French. 1161If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>), 1162forwards at all the levels. 1163 1164=item entry 1165 1166-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10. 1167 1168If the same character (or a sequence of characters) exists 1169in the collation element table through C<table>, 1170mapping to collation elements is overridden. 1171If it does not exist, the mapping is defined additionally. 1172 1173 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 11740063 0068 ; [.0E6A.0020.0002.0063] # ch 11750043 0068 ; [.0E6A.0020.0007.0043] # Ch 11760043 0048 ; [.0E6A.0020.0008.0043] # CH 1177006C 006C ; [.0F4C.0020.0002.006C] # ll 1178004C 006C ; [.0F4C.0020.0007.004C] # Ll 1179004C 004C ; [.0F4C.0020.0008.004C] # LL 118000F1 ; [.0F7B.0020.0002.00F1] # n-tilde 1181006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde 118200D1 ; [.0F7B.0020.0008.00D1] # N-tilde 1183004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde 1184ENTRY 1185 1186 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 118700E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e> 118800C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E> 1189ENTRY 1190 1191B<NOTE:> The code point in the UCA file format (before C<';'>) 1192B<must> be a Unicode code point (defined as hexadecimal), 1193but not a native code point. 1194So C<0063> must always denote C<U+0063>, 1195but not a character of C<"\x63">. 1196 1197Weighting may vary depending on collation element table. 1198So ensure the weights defined in C<entry> will be consistent with 1199those in the collation element table loaded via C<table>. 1200 1201In DUCET v4.0.0, primary weight of C<C> is C<0E60> 1202and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A> 1203(as a value between C<0E60> and C<0E6D>) 1204makes ordering as C<C E<lt> CH E<lt> D>. 1205Exactly speaking DUCET already has some characters between C<C> and C<D>: 1206C<small capital C> (C<U+1D04>) with primary weight C<0E64>, 1207C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>, 1208and C<c-curl> (C<U+0255>) with C<0E69>. 1209Then primary weight C<0E6A> for C<CH> makes C<CH> 1210ordered between C<c-curl> and C<D>. 1211 1212=item hangul_terminator 1213 1214-- see 7.1.4 Trailing Weights, UTS #10. 1215 1216If a true value is given (non-zero but should be positive), 1217it will be added as a terminator primary weight to the end of 1218every standard Hangul syllable. Secondary and any higher weights 1219for terminator are set to zero. 1220If the value is false or C<hangul_terminator> key does not exist, 1221insertion of terminator weights will not be performed. 1222 1223Boundaries of Hangul syllables are determined 1224according to conjoining Jamo behavior in F<the Unicode Standard> 1225and F<HangulSyllableType.txt>. 1226 1227B<Implementation Note:> 1228(1) For expansion mapping (Unicode character mapped 1229to a sequence of collation elements), a terminator will not be added 1230between collation elements, even if Hangul syllable boundary exists there. 1231Addition of terminator is restricted to the next position 1232to the last collation element. 1233 1234(2) Non-conjoining Hangul letters 1235(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not 1236automatically terminated with a terminator primary weight. 1237These characters may need terminator included in a collation element 1238table beforehand. 1239 1240=item highestFFFF 1241 1242-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation. 1243 1244If the parameter is made true, C<U+FFFF> has a highest primary weight. 1245When a boolean of C<$coll-E<gt>ge($str, "abc")> and 1246C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str> 1247begins with C<"abc">, or another primary equivalent. 1248C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF> 1249such as C<"abc\x{FFFF}xyz">. 1250 1251C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")> 1252almost, but the latter has a problem that you should know which letter is 1253next to C<c>. For a certain language where C<ch> as the next letter, 1254C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">. 1255 1256Note: 1257This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>. 1258Any other character than C<U+FFFF> can be tailored by C<entry>. 1259 1260=item identical 1261 1262-- see A.3 Deterministic Comparison, UTS #10. 1263 1264By default, strings whose weights are equal should be equal, 1265even though their code points are not equal. 1266Completely ignorable characters are ignored. 1267 1268If the parameter is made true, a final, tie-breaking level is used. 1269If no difference of weights is found after the comparison through 1270all the level specified by C<level>, the comparison with code points 1271will be performed. 1272For the tie-breaking comparison, the sort key has code points 1273of the original string appended. 1274Completely ignorable characters are not ignored. 1275 1276If C<preprocess> and/or C<normalization> is applied, the code points 1277of the string after them (in NFD by default) are used. 1278 1279=item ignoreChar 1280 1281=item ignoreName 1282 1283-- see 3.6 Variable Weighting, UTS #10. 1284 1285Makes the entry in the table completely ignorable; 1286i.e. as if the weights were zero at all level. 1287 1288Through C<ignoreChar>, any character matching C<qr/$ignoreChar/> 1289will be ignored. Through C<ignoreName>, any character whose name 1290(given in the C<table> file as a comment) matches C<qr/$ignoreName/> 1291will be ignored. 1292 1293E.g. when 'a' and 'e' are ignorable, 1294'element' is equal to 'lament' (or 'lmnt'). 1295 1296=item ignore_level2 1297 1298-- see 5.1 Parametric Tailoring, UTS #10. 1299 1300By default, case-sensitive comparison (that is level 3 difference) 1301won't ignore accents (that is level 2 difference). 1302 1303If the parameter is made true, accents (and other primary ignorable 1304characters) are ignored, even though cases are taken into account. 1305 1306B<NOTE>: C<level> should be 3 or greater. 1307 1308=item katakana_before_hiragana 1309 1310-- see 7.2 Tertiary Weight Table, UTS #10. 1311 1312By default, hiragana is before katakana. 1313If the parameter is made true, this is reversed. 1314 1315B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana 1316distinctions must occur in level 3, and their weights at level 3 must be 1317same as those mentioned in 7.3.1, UTS #10. 1318If you define your collation elements which violate this requirement, 1319this parameter does not work validly. 1320 1321=item level 1322 1323-- see 4.3 Form Sort Key, UTS #10. 1324 1325Set the maximum level. 1326Any higher levels than the specified one are ignored. 1327 1328 Level 1: alphabetic ordering 1329 Level 2: diacritic ordering 1330 Level 3: case ordering 1331 Level 4: tie-breaking (e.g. in the case when variable is 'shifted') 1332 1333 ex.level => 2, 1334 1335If omitted, the maximum is the 4th. 1336 1337B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level. 1338But this module only uses weights within 0xFFFF. 1339When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted' 1340and 'shift-trimmed'), the level 4 may be unreliable. 1341 1342See also C<identical>. 1343 1344=item long_contraction 1345 1346-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10. 1347 1348If the parameter is made true, for a contraction with three or more 1349characters (here nicknamed "long contraction"), initial substrings 1350will be handled. 1351For example, a contraction ABC, where A is a starter, and B and C 1352are non-starters (character with non-zero combining character class), 1353will be detected even if there is not AB as a contraction. 1354 1355B<Default:> Usually false. 1356If C<UCA_Version> is 22 or 24, and the value of C<long_contraction> 1357is not specified in C<new()>, a true value is set implicitly. 1358This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0. 1359 1360C<change()> handles C<long_contraction> explicitly only. 1361If C<long_contraction> is not specified in C<change()>, even though 1362C<UCA_Version> is changed, C<long_contraction> will not be changed. 1363 1364B<Limitation:> Scanning non-starters is one-way (no back tracking). 1365If AB is found but not ABC is not found, other long contraction where 1366the first character is A and the second is not B may not be found. 1367 1368Under C<(normalization =E<gt> undef)>, detection step of discontiguous 1369contractions will be skipped. 1370 1371B<Note:> The following contractions in DUCET are not considered 1372in steps S2.1.1 to S2.1.3, where they are discontiguous. 1373 1374 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR) 1375 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL) 1376 1377For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY> 1378(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD. 1379In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected, 1380instead of C<0FB2 0F71 0F80>. 1381Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of 1382contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected. 1383 1384=item minimalFFFE 1385 1386-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation. 1387 1388If the parameter is made true, C<U+FFFE> has a minimal primary weight. 1389The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2"> 1390first compares C<$a1> and C<$b1> at level 1, and 1391then C<$a2> and C<$b2> at level 1, as followed. 1392 1393 "ab\x{FFFE}a" 1394 "Ab\x{FFFE}a" 1395 "ab\x{FFFE}c" 1396 "Ab\x{FFFE}c" 1397 "ab\x{FFFE}xyz" 1398 "abc\x{FFFE}def" 1399 "abc\x{FFFE}xYz" 1400 "aBc\x{FFFE}xyz" 1401 "abcX\x{FFFE}def" 1402 "abcx\x{FFFE}xyz" 1403 "b\x{FFFE}aaa" 1404 "bbb\x{FFFE}a" 1405 1406Note: 1407This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>. 1408Any other character than C<U+FFFE> can be tailored by C<entry>. 1409 1410=item normalization 1411 1412-- see 4.1 Normalize, UTS #10. 1413 1414If specified, strings are normalized before preparation of sort keys 1415(the normalization is executed after preprocess). 1416 1417A form name C<Unicode::Normalize::normalize()> accepts will be applied 1418as C<$normalization_form>. 1419Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. 1420See C<Unicode::Normalize::normalize()> for detail. 1421If omitted, C<'NFD'> is used. 1422 1423C<normalization> is performed after C<preprocess> (if defined). 1424 1425Furthermore, special values, C<undef> and C<"prenormalized">, can be used, 1426though they are not concerned with C<Unicode::Normalize::normalize()>. 1427 1428If C<undef> (not a string C<"undef">) is passed explicitly 1429as the value for this key, 1430any normalization is not carried out (this may make tailoring easier 1431if any normalization is not desired). Under C<(normalization =E<gt> undef)>, 1432only contiguous contractions are resolved; 1433e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>, 1434C<A-cedilla-ring> would be primary equal to C<A>. 1435In this point, 1436C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })> 1437B<is not> equivalent to C<(normalization =E<gt> 'NFD')>. 1438 1439In the case of C<(normalization =E<gt> "prenormalized")>, 1440any normalization is not performed, but 1441discontiguous contractions with combining characters are performed. 1442Therefore 1443C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })> 1444B<is> equivalent to C<(normalization =E<gt> 'NFD')>. 1445If source strings are finely prenormalized, 1446C<(normalization =E<gt> 'prenormalized')> may save time for normalization. 1447 1448Except C<(normalization =E<gt> undef)>, 1449B<Unicode::Normalize> is required (see also B<CAVEAT>). 1450 1451=item overrideCJK 1452 1453-- see 7.1 Derived Collation Elements, UTS #10. 1454 1455By default, CJK unified ideographs are ordered in Unicode codepoint 1456order, but those in the CJK Unified Ideographs block are less than 1457those in the CJK Unified Ideographs Extension A etc. 1458 1459 In the CJK Unified Ideographs block: 1460 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11. 1461 U+4E00..U+9FBB if UCA_Version is 14 or 16. 1462 U+4E00..U+9FC3 if UCA_Version is 18. 1463 U+4E00..U+9FCB if UCA_Version is 20 or 22. 1464 U+4E00..U+9FCC if UCA_Version is 24 to 30. 1465 U+4E00..U+9FD5 if UCA_Version is 32 or 34. 1466 U+4E00..U+9FEA if UCA_Version is 36. 1467 1468 In the CJK Unified Ideographs Extension blocks: 1469 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version. 1470 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later. 1471 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later. 1472 Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32 or later. 1473 Ext.F (U+2CEB0..U+2EBE0) if UCA_Version is 36. 1474 1475Through C<overrideCJK>, ordering of CJK unified ideographs (including 1476extensions) can be overridden. 1477 1478ex. CJK unified ideographs in the JIS code point order. 1479 1480 overrideCJK => sub { 1481 my $u = shift; # get a Unicode codepoint 1482 my $b = pack('n', $u); # to UTF-16BE 1483 my $s = your_unicode_to_sjis_converter($b); # convert 1484 my $n = unpack('n', $s); # convert sjis to short 1485 [ $n, 0x20, 0x2, $u ]; # return the collation element 1486 }, 1487 1488The return value may be an arrayref of 1st to 4th weights as shown 1489above. The return value may be an integer as the primary weight 1490as shown below. If C<undef> is returned, the default derived 1491collation element will be used. 1492 1493 overrideCJK => sub { 1494 my $u = shift; # get a Unicode codepoint 1495 my $b = pack('n', $u); # to UTF-16BE 1496 my $s = your_unicode_to_sjis_converter($b); # convert 1497 my $n = unpack('n', $s); # convert sjis to short 1498 return $n; # return the primary weight 1499 }, 1500 1501The return value may be a list containing zero or more of 1502an arrayref, an integer, or C<undef>. 1503 1504ex. ignores all CJK unified ideographs. 1505 1506 overrideCJK => sub {()}, # CODEREF returning empty list 1507 1508 # where ->eq("Pe\x{4E00}rl", "Perl") is true 1509 # as U+4E00 is a CJK unified ideograph and to be ignorable. 1510 1511If a false value (including C<undef>) is passed, C<overrideCJK> 1512has no effect. 1513C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one. 1514 1515But assignment of weight for CJK unified ideographs 1516in C<table> or C<entry> is still valid. 1517If C<undef> is passed explicitly as the value for this key, 1518weights for CJK unified ideographs are treated as undefined. 1519However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)> 1520has no special meaning. 1521 1522B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>, 1523C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>, 1524C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified 1525ideographs. But they can't be overridden via C<overrideCJK> when you use 1526DUCET, as the table includes weights for them. C<table> or C<entry> has 1527priority over C<overrideCJK>. 1528 1529=item overrideHangul 1530 1531-- see 7.1 Derived Collation Elements, UTS #10. 1532 1533By default, Hangul syllables are decomposed into Hangul Jamo, 1534even if C<(normalization =E<gt> undef)>. 1535But the mapping of Hangul syllables may be overridden. 1536 1537This parameter works like C<overrideCJK>, so see there for examples. 1538 1539If you want to override the mapping of Hangul syllables, 1540NFD and NFKD are not appropriate, since NFD and NFKD will decompose 1541Hangul syllables before overriding. FCD may decompose Hangul syllables 1542as the case may be. 1543 1544If a false value (but not C<undef>) is passed, C<overrideHangul> 1545has no effect. 1546C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one. 1547 1548If C<undef> is passed explicitly as the value for this key, 1549weight for Hangul syllables is treated as undefined 1550without decomposition into Hangul Jamo. 1551But definition of weight for Hangul syllables 1552in C<table> or C<entry> is still valid. 1553 1554=item overrideOut 1555 1556-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10. 1557 1558Perl seems to allow out-of-range values (greater than 0x10FFFF). 1559By default, out-of-range values are replaced with C<U+FFFD> 1560(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22, 1561or ignored when C<UCA_Version> E<lt>= 20. 1562 1563When C<UCA_Version> E<gt>= 22, the weights of out-of-range values 1564can be overridden. Though C<table> or C<entry> are available for them, 1565out-of-range values are too many. 1566 1567C<overrideOut> can perform it algorithmically. 1568This parameter works like C<overrideCJK>, so see there for examples. 1569 1570ex. ignores all out-of-range values. 1571 1572 overrideOut => sub {()}, # CODEREF returning empty list 1573 1574If a false value (including C<undef>) is passed, C<overrideOut> 1575has no effect. 1576C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one. 1577 1578B<NOTE ABOUT U+FFFD:> 1579 1580UCA recommends that out-of-range values should not be ignored for security 1581reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">. 1582However, C<U+FFFD> is wrongly mapped to a variable collation element 1583in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be 1584ignored when C<variable> isn't C<Non-ignorable>. 1585 1586The mapping of C<U+FFFD> is corrected in Unicode 6.3.0. 1587see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights> 1588(7.1.4 Trailing Weights). Such a correction is reproduced by this. 1589 1590 overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer 1591 1592This workaround is unnecessary since Unicode 6.3.0. 1593 1594=item preprocess 1595 1596-- see 5.4 Preprocessing, UTS #10. 1597 1598If specified, the coderef is used to preprocess each string 1599before the formation of sort keys. 1600 1601ex. dropping English articles, such as "a" or "the". 1602Then, "the pen" is before "a pencil". 1603 1604 preprocess => sub { 1605 my $str = shift; 1606 $str =~ s/\b(?:an?|the)\s+//gi; 1607 return $str; 1608 }, 1609 1610C<preprocess> is performed before C<normalization> (if defined). 1611 1612ex. decoding strings in a legacy encoding such as shift-jis: 1613 1614 $sjis_collator = Unicode::Collate->new( 1615 preprocess => \&your_shiftjis_to_unicode_decoder, 1616 ); 1617 @result = $sjis_collator->sort(@shiftjis_strings); 1618 1619B<Note:> Strings returned from the coderef will be interpreted 1620according to Perl's Unicode support. See L<perlunicode>, 1621L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1622 1623=item rearrange 1624 1625-- see 3.5 Rearrangement, UTS #10. 1626 1627Characters that are not coded in logical order and to be rearranged. 1628If C<UCA_Version> is equal to or less than 11, default is: 1629 1630 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ], 1631 1632If you want to disallow any rearrangement, pass C<undef> or C<[]> 1633(a reference to empty list) as the value for this key. 1634 1635If C<UCA_Version> is equal to or greater than 14, default is C<[]> 1636(i.e. no rearrangement). 1637 1638B<According to the version 9 of UCA, this parameter shall not be used; 1639but it is not warned at present.> 1640 1641=item rewrite 1642 1643If specified, the coderef is used to rewrite lines in C<table> or C<entry>. 1644The coderef will get each line, and then should return a rewritten line 1645according to the UCA file format. 1646If the coderef returns an empty line, the line will be skipped. 1647 1648e.g. any primary ignorable characters into tertiary ignorable: 1649 1650 rewrite => sub { 1651 my $line = shift; 1652 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g; 1653 return $line; 1654 }, 1655 1656This example shows rewriting weights. C<rewrite> is allowed to 1657affect code points, weights, and the name. 1658 1659B<NOTE>: C<table> is available to use another table file; 1660preparing a modified table once would be more efficient than 1661rewriting lines on reading an unmodified table every time. 1662 1663=item suppress 1664 1665-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation. 1666 1667Contractions beginning with the specified characters are suppressed, 1668even if those contractions are defined in C<table>. 1669 1670An example for Russian and some languages using the Cyrillic script: 1671 1672 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F], 1673 1674where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE. 1675 1676B<NOTE>: Contractions via C<entry> will not be suppressed. 1677 1678=item table 1679 1680-- see 3.8 Default Unicode Collation Element Table, UTS #10. 1681 1682You can use another collation element table if desired. 1683 1684The table file should locate in the F<Unicode/Collate> directory 1685on C<@INC>. Say, if the filename is F<Foo.txt>, 1686the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>. 1687 1688By default, F<allkeys.txt> (as the filename of DUCET) is used. 1689If you will prepare your own table file, any name other than F<allkeys.txt> 1690may be better to avoid namespace conflict. 1691 1692B<NOTE>: When XSUB is used, the DUCET is compiled on building this 1693module, and it may save time at the run time. 1694Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table, 1695or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or 1696C<rewrite> will prevent this module from using the compiled DUCET. 1697 1698If C<undef> is passed explicitly as the value for this key, 1699no file is read (but you can define collation elements via C<entry>). 1700 1701A typical way to define a collation element table 1702without any file of table: 1703 1704 $onlyABC = Unicode::Collate->new( 1705 table => undef, 1706 entry => << 'ENTRIES', 17070061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A 17080041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A 17090062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B 17100042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B 17110063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C 17120043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C 1713ENTRIES 1714 ); 1715 1716If C<ignoreName> or C<undefName> is used, character names should be 1717specified as a comment (following C<#>) on each line. 1718 1719=item undefChar 1720 1721=item undefName 1722 1723-- see 6.3.3 Reducing the Repertoire, UTS #10. 1724 1725Undefines the collation element as if it were unassigned in the C<table>. 1726This reduces the size of the table. 1727If an unassigned character appears in the string to be collated, 1728the sort key is made from its codepoint 1729as a single-character collation element, 1730as it is greater than any other assigned collation elements 1731(in the codepoint order among the unassigned characters). 1732But, it'd be better to ignore characters 1733unfamiliar to you and maybe never used. 1734 1735Through C<undefChar>, any character matching C<qr/$undefChar/> 1736will be undefined. Through C<undefName>, any character whose name 1737(given in the C<table> file as a comment) matches C<qr/$undefName/> 1738will be undefined. 1739 1740ex. Collation weights for beyond-BMP characters are not stored in object: 1741 1742 undefChar => qr/[^\0-\x{fffd}]/, 1743 1744=item upper_before_lower 1745 1746-- see 6.6 Case Comparisons, UTS #10. 1747 1748By default, lowercase is before uppercase. 1749If the parameter is made true, this is reversed. 1750 1751B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase 1752distinctions must occur in level 3, and their weights at level 3 must be 1753same as those mentioned in 7.3.1, UTS #10. 1754If you define your collation elements which differs from this requirement, 1755this parameter doesn't work validly. 1756 1757=item variable 1758 1759-- see 3.6 Variable Weighting, UTS #10. 1760 1761This key allows for variable weighting of variable collation elements, 1762which are marked with an ASTERISK in the table 1763(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>). 1764 1765 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. 1766 1767These names are case-insensitive. 1768By default (if specification is omitted), 'shifted' is adopted. 1769 1770 'Blanked' Variable elements are made ignorable at levels 1 through 3; 1771 considered at the 4th level. 1772 1773 'Non-Ignorable' Variable elements are not reset to ignorable. 1774 1775 'Shifted' Variable elements are made ignorable at levels 1 through 3 1776 their level 4 weight is replaced by the old level 1 weight. 1777 Level 4 weight for Non-Variable elements is 0xFFFF. 1778 1779 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level 1780 are trimmed. 1781 1782=back 1783 1784=head2 Methods for Collation 1785 1786=over 4 1787 1788=item C<@sorted = $Collator-E<gt>sort(@not_sorted)> 1789 1790Sorts a list of strings. 1791 1792=item C<$result = $Collator-E<gt>cmp($a, $b)> 1793 1794Returns 1 (when C<$a> is greater than C<$b>) 1795or 0 (when C<$a> is equal to C<$b>) 1796or -1 (when C<$a> is less than C<$b>). 1797 1798=item C<$result = $Collator-E<gt>eq($a, $b)> 1799 1800=item C<$result = $Collator-E<gt>ne($a, $b)> 1801 1802=item C<$result = $Collator-E<gt>lt($a, $b)> 1803 1804=item C<$result = $Collator-E<gt>le($a, $b)> 1805 1806=item C<$result = $Collator-E<gt>gt($a, $b)> 1807 1808=item C<$result = $Collator-E<gt>ge($a, $b)> 1809 1810They works like the same name operators as theirs. 1811 1812 eq : whether $a is equal to $b. 1813 ne : whether $a is not equal to $b. 1814 lt : whether $a is less than $b. 1815 le : whether $a is less than $b or equal to $b. 1816 gt : whether $a is greater than $b. 1817 ge : whether $a is greater than $b or equal to $b. 1818 1819=item C<$sortKey = $Collator-E<gt>getSortKey($string)> 1820 1821-- see 4.3 Form Sort Key, UTS #10. 1822 1823Returns a sort key. 1824 1825You compare the sort keys using a binary comparison 1826and get the result of the comparison of the strings using UCA. 1827 1828 $Collator->getSortKey($a) cmp $Collator->getSortKey($b) 1829 1830 is equivalent to 1831 1832 $Collator->cmp($a, $b) 1833 1834=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)> 1835 1836Converts a sorting key into its representation form. 1837If C<UCA_Version> is 8, the output is slightly different. 1838 1839 use Unicode::Collate; 1840 my $c = Unicode::Collate->new(); 1841 print $c->viewSortKey("Perl"),"\n"; 1842 1843 # output: 1844 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] 1845 # Level 1 Level 2 Level 3 Level 4 1846 1847=back 1848 1849=head2 Methods for Searching 1850 1851The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work 1852like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively, 1853but they are not aware of any pattern, but only a literal substring. 1854 1855B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true 1856for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>, 1857C<subst>, C<gsubst>) is croaked, as the position and the length might 1858differ from those on the specified string. 1859 1860C<rearrange> and C<hangul_terminator> parameters are neglected. 1861C<katakana_before_hiragana> and C<upper_before_lower> don't affect 1862matching and searching, as it doesn't matter whether greater or less. 1863 1864=over 4 1865 1866=item C<$position = $Collator-E<gt>index($string, $substring[, $position])> 1867 1868=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])> 1869 1870If C<$substring> matches a part of C<$string>, returns 1871the position of the first occurrence of the matching part in scalar context; 1872in list context, returns a two-element list of 1873the position and the length of the matching part. 1874 1875If C<$substring> does not match any part of C<$string>, 1876returns C<-1> in scalar context and 1877an empty list in list context. 1878 1879e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">, 1880you say the following where C<$sub> is C<"M>E<uuml>C<SS">, 1881 1882 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1883 # (normalization => undef) is REQUIRED. 1884 my $match; 1885 if (my($pos,$len) = $Collator->index($str, $sub)) { 1886 $match = substr($str, $pos, $len); 1887 } 1888 1889and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<"> 1890is primary equal to C<"M>E<uuml>C<SS">. 1891 1892=item C<$match_ref = $Collator-E<gt>match($string, $substring)> 1893 1894=item C<($match) = $Collator-E<gt>match($string, $substring)> 1895 1896If C<$substring> matches a part of C<$string>, in scalar context, returns 1897B<a reference to> the first occurrence of the matching part 1898(C<$match_ref> is always true if matches, 1899since every reference is B<true>); 1900in list context, returns the first occurrence of the matching part. 1901 1902If C<$substring> does not match any part of C<$string>, 1903returns C<undef> in scalar context and 1904an empty list in list context. 1905 1906e.g. 1907 1908 if ($match_ref = $Collator->match($str, $sub)) { # scalar context 1909 print "matches [$$match_ref].\n"; 1910 } else { 1911 print "doesn't match.\n"; 1912 } 1913 1914 or 1915 1916 if (($match) = $Collator->match($str, $sub)) { # list context 1917 print "matches [$match].\n"; 1918 } else { 1919 print "doesn't match.\n"; 1920 } 1921 1922=item C<@match = $Collator-E<gt>gmatch($string, $substring)> 1923 1924If C<$substring> matches a part of C<$string>, returns 1925all the matching parts (or matching count in scalar context). 1926 1927If C<$substring> does not match any part of C<$string>, 1928returns an empty list. 1929 1930=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)> 1931 1932If C<$substring> matches a part of C<$string>, 1933the first occurrence of the matching part is replaced by C<$replacement> 1934(C<$string> is modified) and C<$count> (always equals to C<1>) is returned. 1935 1936C<$replacement> can be a C<CODEREF>, 1937taking the matching part as an argument, 1938and returning a string to replace the matching part 1939(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>). 1940 1941=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)> 1942 1943If C<$substring> matches a part of C<$string>, 1944all the occurrences of the matching part are replaced by C<$replacement> 1945(C<$string> is modified) and C<$count> is returned. 1946 1947C<$replacement> can be a C<CODEREF>, 1948taking the matching part as an argument, 1949and returning a string to replace the matching part 1950(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>). 1951 1952e.g. 1953 1954 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1955 # (normalization => undef) is REQUIRED. 1956 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l..."; 1957 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); 1958 1959 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>..."; 1960 # i.e., all the camels are made bold-faced. 1961 1962 Examples: levels and ignore_level2 - what does camel match? 1963 --------------------------------------------------------------------------- 1964 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l 1965 -----------------------|--------------------------------------------------- 1966 1 false | yes yes yes yes yes 1967 2 false | yes yes no yes yes 1968 3 false | yes no no yes yes 1969 4 false | yes no no no yes 1970 -----------------------|--------------------------------------------------- 1971 1 true | yes yes yes yes yes 1972 2 true | yes yes yes yes yes 1973 3 true | yes no yes yes yes 1974 4 true | yes no yes no yes 1975 --------------------------------------------------------------------------- 1976 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l 1977 at any level. 1978 1979=back 1980 1981=head2 Other Methods 1982 1983=over 4 1984 1985=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)> 1986 1987=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)> 1988 1989Changes the value of specified keys and returns the changed part. 1990 1991 $Collator = Unicode::Collate->new(level => 4); 1992 1993 $Collator->eq("perl", "PERL"); # false 1994 1995 %old = $Collator->change(level => 2); # returns (level => 4). 1996 1997 $Collator->eq("perl", "PERL"); # true 1998 1999 $Collator->change(%old); # returns (level => 2). 2000 2001 $Collator->eq("perl", "PERL"); # false 2002 2003Not all C<(key,value)>s are allowed to be changed. 2004See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>. 2005 2006In the scalar context, returns the modified collator 2007(but it is B<not> a clone from the original). 2008 2009 $Collator->change(level => 2)->eq("perl", "PERL"); # true 2010 2011 $Collator->eq("perl", "PERL"); # true; now max level is 2nd. 2012 2013 $Collator->change(level => 4)->eq("perl", "PERL"); # false 2014 2015=item C<$version = $Collator-E<gt>version()> 2016 2017Returns the version number (a string) of the Unicode Standard 2018which the C<table> file used by the collator object is based on. 2019If the table does not include a version line (starting with C<@version>), 2020returns C<"unknown">. 2021 2022=item C<UCA_Version()> 2023 2024Returns the revision number of UTS #10 this module consults, 2025that should correspond with the DUCET incorporated. 2026 2027=item C<Base_Unicode_Version()> 2028 2029Returns the version number of UTS #10 this module consults, 2030that should correspond with the DUCET incorporated. 2031 2032=back 2033 2034=head1 EXPORT 2035 2036No method will be exported. 2037 2038=head1 INSTALL 2039 2040Though this module can be used without any C<table> file, 2041to use this module easily, it is recommended to install a table file 2042in the UCA format, by copying it under the directory 2043<a place in @INC>/Unicode/Collate. 2044 2045The most preferable one is "The Default Unicode Collation Element Table" 2046(aka DUCET), available from the Unicode Consortium's website: 2047 2048 http://www.unicode.org/Public/UCA/ 2049 2050 http://www.unicode.org/Public/UCA/latest/allkeys.txt 2051 (latest version) 2052 2053If DUCET is not installed, it is recommended to copy the file 2054from http://www.unicode.org/Public/UCA/latest/allkeys.txt 2055to <a place in @INC>/Unicode/Collate/allkeys.txt 2056manually. 2057 2058=head1 CAVEATS 2059 2060=over 4 2061 2062=item Normalization 2063 2064Use of the C<normalization> parameter requires the B<Unicode::Normalize> 2065module (see L<Unicode::Normalize>). 2066 2067If you need not it (say, in the case when you need not 2068handle any combining characters), 2069assign C<(normalization =E<gt> undef)> explicitly. 2070 2071-- see 6.5 Avoiding Normalization, UTS #10. 2072 2073=item Conformance Test 2074 2075The Conformance Test for the UCA is available 2076under L<http://www.unicode.org/Public/UCA/>. 2077 2078For F<CollationTest_SHIFTED.txt>, 2079a collator via C<Unicode::Collate-E<gt>new( )> should be used; 2080for F<CollationTest_NON_IGNORABLE.txt>, a collator via 2081C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. 2082 2083If C<UCA_Version> is 26 or later, the C<identical> level is preferred; 2084C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and 2085C<Unicode::Collate-E<gt>new(identical =E<gt> 1,> 2086C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used. 2087 2088B<Unicode::Normalize is required to try The Conformance Test.> 2089 2090=back 2091 2092=head1 AUTHOR, COPYRIGHT AND LICENSE 2093 2094The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, 2095<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2018, 2096SADAHIRO Tomoyuki. Japan. All rights reserved. 2097 2098This module is free software; you can redistribute it and/or 2099modify it under the same terms as Perl itself. 2100 2101The file Unicode/Collate/allkeys.txt was copied verbatim 2102from L<http://www.unicode.org/Public/UCA/9.0.0/allkeys.txt>. 2103For this file, Copyright (c) 2016 Unicode, Inc.; distributed 2104under the Terms of Use in L<http://www.unicode.org/terms_of_use.html> 2105 2106=head1 SEE ALSO 2107 2108=over 4 2109 2110=item Unicode Collation Algorithm - UTS #10 2111 2112L<http://www.unicode.org/reports/tr10/> 2113 2114=item The Default Unicode Collation Element Table (DUCET) 2115 2116L<http://www.unicode.org/Public/UCA/latest/allkeys.txt> 2117 2118=item The conformance test for the UCA 2119 2120L<http://www.unicode.org/Public/UCA/latest/CollationTest.html> 2121 2122L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip> 2123 2124=item Hangul Syllable Type 2125 2126L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> 2127 2128=item Unicode Normalization Forms - UAX #15 2129 2130L<http://www.unicode.org/reports/tr15/> 2131 2132=item Unicode Locale Data Markup Language (LDML) - UTS #35 2133 2134L<http://www.unicode.org/reports/tr35/> 2135 2136=back 2137 2138=cut 2139