1package Unicode::Collate; 2 3use 5.006; 4use strict; 5use warnings; 6use Carp; 7use File::Spec; 8 9no warnings 'utf8'; 10 11our $VERSION = '1.31'; 12our $PACKAGE = __PACKAGE__; 13 14### begin XS only ### 15use XSLoader (); 16XSLoader::load('Unicode::Collate', $VERSION); 17### end XS only ### 18 19my @Path = qw(Unicode Collate); 20my $KeyFile = 'allkeys.txt'; 21 22# Perl's boolean 23use constant TRUE => 1; 24use constant FALSE => ""; 25use constant NOMATCHPOS => -1; 26 27# A coderef to get combining class imported from Unicode::Normalize 28# (i.e. \&Unicode::Normalize::getCombinClass). 29# This is also used as a HAS_UNICODE_NORMALIZE flag. 30my $CVgetCombinClass; 31 32# Supported Levels 33use constant MinLevel => 1; 34use constant MaxLevel => 4; 35 36# Minimum weights at level 2 and 3, respectively 37use constant Min2Wt => 0x20; 38use constant Min3Wt => 0x02; 39 40# Shifted weight at 4th level 41use constant Shift4Wt => 0xFFFF; 42 43# A boolean for Variable and 16-bit weights at 4 levels of Collation Element 44use constant VCE_TEMPLATE => 'Cn4'; 45 46# A sort key: 16-bit weights 47use constant KEY_TEMPLATE => 'n*'; 48 49# The tie-breaking: 32-bit weights 50use constant TIE_TEMPLATE => 'N*'; 51 52# Level separator in a sort key: 53# i.e. pack(KEY_TEMPLATE, 0) 54use constant LEVEL_SEP => "\0\0"; 55 56# As Unicode code point separator for hash keys. 57# A joined code point string (denoted by JCPS below) 58# like "65;768" is used for internal processing 59# instead of Perl's Unicode string like "\x41\x{300}", 60# as the native code point is different from the Unicode code point 61# on EBCDIC platform. 62# This character must not be included in any stringified 63# representation of an integer. 64use constant CODE_SEP => ';'; 65 # NOTE: in regex /;/ is used for $jcps! 66 67# boolean values of variable weights 68use constant NON_VAR => 0; # Non-Variable character 69use constant VAR => 1; # Variable character 70 71# specific code points 72use constant Hangul_SIni => 0xAC00; 73use constant Hangul_SFin => 0xD7A3; 74 75# Logical_Order_Exception in PropList.txt 76my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; 77 78# for highestFFFF and minimalFFFE 79my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF); 80my $minimalVCE = pack(VCE_TEMPLATE, 0, 1, 0x20, 0x5, 0xFFFE); 81 82sub UCA_Version { '43' } 83 84sub Base_Unicode_Version { '13.0.0' } 85 86###### 87 88my $native_to_unicode = ($::IS_ASCII || $] < 5.008) 89 ? sub { return shift } 90 : sub { utf8::native_to_unicode(shift) }; 91 92my $unicode_to_native = ($::IS_ASCII || $] < 5.008) 93 ? sub { return shift } 94 : sub { utf8::unicode_to_native(shift) }; 95 96# pack_U() should get Unicode code points. 97sub pack_U { 98 return pack('U*', map $unicode_to_native->($_), @_); 99} 100 101# unpack_U() should return Unicode code points. 102sub unpack_U { 103 return map $native_to_unicode->($_), unpack('U*', shift(@_).pack('U*')); 104} 105# for older perl version, pack('U*') generates empty string with utf8 flag. 106 107###### 108 109my (%VariableOK); 110@VariableOK{ qw/ 111 blanked non-ignorable shifted shift-trimmed 112 / } = (); # keys lowercased 113 114our @ChangeOK = qw/ 115 alternate backwards level normalization rearrange 116 katakana_before_hiragana upper_before_lower ignore_level2 117 overrideCJK overrideHangul overrideOut preprocess UCA_Version 118 hangul_terminator variable identical highestFFFF minimalFFFE 119 long_contraction 120 /; 121 122our @ChangeNG = qw/ 123 entry mapping table maxlength contraction 124 ignoreChar ignoreName undefChar undefName rewrite 125 versionTable alternateTable backwardsTable forwardsTable 126 rearrangeTable variableTable 127 derivCode normCode rearrangeHash backwardsFlag 128 suppress suppressHash 129 __useXS /; ### XS only 130# The hash key 'ignored' was deleted at v 0.21. 131# The hash key 'isShift' was deleted at v 0.23. 132# The hash key 'combining' was deleted at v 0.24. 133# The hash key 'entries' was deleted at v 0.30. 134# The hash key 'L3_ignorable' was deleted at v 0.40. 135 136sub version { 137 my $self = shift; 138 return $self->{versionTable} || 'unknown'; 139} 140 141my (%ChangeOK, %ChangeNG); 142@ChangeOK{ @ChangeOK } = (); 143@ChangeNG{ @ChangeNG } = (); 144 145sub change { 146 my $self = shift; 147 my %hash = @_; 148 my %old; 149 if (exists $hash{alternate}) { 150 if (exists $hash{variable}) { 151 delete $hash{alternate}; 152 } else { 153 $hash{variable} = $hash{alternate}; 154 } 155 } 156 foreach my $k (keys %hash) { 157 if (exists $ChangeOK{$k}) { 158 $old{$k} = $self->{$k}; 159 $self->{$k} = $hash{$k}; 160 } elsif (exists $ChangeNG{$k}) { 161 croak "change of $k via change() is not allowed!"; 162 } 163 # else => ignored 164 } 165 $self->checkCollator(); 166 return wantarray ? %old : $self; 167} 168 169sub _checkLevel { 170 my $level = shift; 171 my $key = shift; # 'level' or 'backwards' 172 MinLevel <= $level or croak sprintf 173 "Illegal level %d (in value for key '%s') lower than %d.", 174 $level, $key, MinLevel; 175 $level <= MaxLevel or croak sprintf 176 "Unsupported level %d (in value for key '%s') higher than %d.", 177 $level, $key, MaxLevel; 178} 179 180my %DerivCode = ( 181 8 => \&_derivCE_8, 182 9 => \&_derivCE_9, 183 11 => \&_derivCE_9, # 11 == 9 184 14 => \&_derivCE_14, 185 16 => \&_derivCE_14, # 16 == 14 186 18 => \&_derivCE_18, 187 20 => \&_derivCE_20, 188 22 => \&_derivCE_22, 189 24 => \&_derivCE_24, 190 26 => \&_derivCE_24, # 26 == 24 191 28 => \&_derivCE_24, # 28 == 24 192 30 => \&_derivCE_24, # 30 == 24 193 32 => \&_derivCE_32, 194 34 => \&_derivCE_34, 195 36 => \&_derivCE_36, 196 38 => \&_derivCE_38, 197 40 => \&_derivCE_40, 198 41 => \&_derivCE_40, # 41 == 40 199 43 => \&_derivCE_43, 200); 201 202sub checkCollator { 203 my $self = shift; 204 _checkLevel($self->{level}, 'level'); 205 206 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} } 207 or croak "Illegal UCA version (passed $self->{UCA_Version})."; 208 209 $self->{variable} ||= $self->{alternate} || $self->{variableTable} || 210 $self->{alternateTable} || 'shifted'; 211 $self->{variable} = $self->{alternate} = lc($self->{variable}); 212 exists $VariableOK{ $self->{variable} } 213 or croak "$PACKAGE unknown variable parameter name: $self->{variable}"; 214 215 if (! defined $self->{backwards}) { 216 $self->{backwardsFlag} = 0; 217 } elsif (! ref $self->{backwards}) { 218 _checkLevel($self->{backwards}, 'backwards'); 219 $self->{backwardsFlag} = 1 << $self->{backwards}; 220 } else { 221 my %level; 222 $self->{backwardsFlag} = 0; 223 for my $b (@{ $self->{backwards} }) { 224 _checkLevel($b, 'backwards'); 225 $level{$b} = 1; 226 } 227 for my $v (sort keys %level) { 228 $self->{backwardsFlag} += 1 << $v; 229 } 230 } 231 232 defined $self->{rearrange} or $self->{rearrange} = []; 233 ref $self->{rearrange} 234 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"; 235 236 # keys of $self->{rearrangeHash} are $self->{rearrange}. 237 $self->{rearrangeHash} = undef; 238 239 if (@{ $self->{rearrange} }) { 240 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = (); 241 } 242 243 $self->{normCode} = undef; 244 245 if (defined $self->{normalization}) { 246 eval { require Unicode::Normalize }; 247 $@ and croak "Unicode::Normalize is required to normalize strings"; 248 249 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass; 250 251 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default 252 $self->{normCode} = \&Unicode::Normalize::NFD; 253 } 254 elsif ($self->{normalization} ne 'prenormalized') { 255 my $norm = $self->{normalization}; 256 $self->{normCode} = sub { 257 Unicode::Normalize::normalize($norm, shift); 258 }; 259 eval { $self->{normCode}->("") }; # try 260 $@ and croak "$PACKAGE unknown normalization form name: $norm"; 261 } 262 } 263 return; 264} 265 266sub new 267{ 268 my $class = shift; 269 my $self = bless { @_ }, $class; 270 271### begin XS only ### 272 if (! exists $self->{table} && !defined $self->{rewrite} && 273 !defined $self->{undefName} && !defined $self->{ignoreName} && 274 !defined $self->{undefChar} && !defined $self->{ignoreChar}) { 275 $self->{__useXS} = \&_fetch_simple; 276 } else { 277 $self->{__useXS} = undef; 278 } 279### end XS only ### 280 281 # keys of $self->{suppressHash} are $self->{suppress}. 282 if ($self->{suppress} && @{ $self->{suppress} }) { 283 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = (); 284 } # before read_table() 285 286 # If undef is passed explicitly, no file is read. 287 $self->{table} = $KeyFile if ! exists $self->{table}; 288 $self->read_table() if defined $self->{table}; 289 290 if ($self->{entry}) { 291 while ($self->{entry} =~ /([^\n]+)/g) { 292 $self->parseEntry($1, TRUE); 293 } 294 } 295 296 # only in new(), not in change() 297 $self->{level} ||= MaxLevel; 298 $self->{UCA_Version} ||= UCA_Version(); 299 300 $self->{overrideHangul} = FALSE 301 if ! exists $self->{overrideHangul}; 302 $self->{overrideCJK} = FALSE 303 if ! exists $self->{overrideCJK}; 304 $self->{normalization} = 'NFD' 305 if ! exists $self->{normalization}; 306 $self->{rearrange} = $self->{rearrangeTable} || 307 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []) 308 if ! exists $self->{rearrange}; 309 $self->{backwards} = $self->{backwardsTable} 310 if ! exists $self->{backwards}; 311 exists $self->{long_contraction} or $self->{long_contraction} 312 = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24; 313 314 # checkCollator() will be called in change() 315 $self->checkCollator(); 316 317 return $self; 318} 319 320sub parseAtmark { 321 my $self = shift; 322 my $line = shift; # after s/^\s*\@// 323 324 if ($line =~ /^version\s*(\S*)/) { 325 $self->{versionTable} ||= $1; 326 } 327 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9 328 $self->{variableTable} ||= $1; 329 } 330 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8 331 $self->{alternateTable} ||= $1; 332 } 333 elsif ($line =~ /^backwards\s+(\S*)/) { 334 push @{ $self->{backwardsTable} }, $1; 335 } 336 elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use 337 push @{ $self->{forwardsTable} }, $1; 338 } 339 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG 340 push @{ $self->{rearrangeTable} }, _getHexArray($1); 341 } 342} 343 344sub read_table { 345 my $self = shift; 346 347### begin XS only ### 348 if ($self->{__useXS}) { 349 my @rest = _fetch_rest(); # complex matter need to parse 350 for my $line (@rest) { 351 next if $line =~ /^\s*#/; 352 353 if ($line =~ s/^\s*\@//) { 354 $self->parseAtmark($line); 355 } else { 356 $self->parseEntry($line); 357 } 358 } 359 return; 360 } 361### end XS only ### 362 363 my($f, $fh); 364 foreach my $d (@INC) { 365 $f = File::Spec->catfile($d, @Path, $self->{table}); 366 last if open($fh, $f); 367 $f = undef; 368 } 369 if (!defined $f) { 370 $f = File::Spec->catfile(@Path, $self->{table}); 371 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)"); 372 } 373 374 while (my $line = <$fh>) { 375 next if $line =~ /^\s*#/; 376 377 if ($line =~ s/^\s*\@//) { 378 $self->parseAtmark($line); 379 } else { 380 $self->parseEntry($line); 381 } 382 } 383 close $fh; 384} 385 386 387## 388## get $line, parse it, and write an entry in $self 389## 390sub parseEntry 391{ 392 my $self = shift; 393 my $line = shift; 394 my $tailoring = shift; 395 my($name, $entry, @uv, @key); 396 397 if (defined $self->{rewrite}) { 398 $line = $self->{rewrite}->($line); 399 } 400 401 return if $line !~ /^\s*[0-9A-Fa-f]/; 402 403 # removes comment and gets name 404 $name = $1 405 if $line =~ s/[#%]\s*(.*)//; 406 return if defined $self->{undefName} && $name =~ /$self->{undefName}/; 407 408 # gets element 409 my($e, $k) = split /;/, $line; 410 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>" 411 if ! $k; 412 413 @uv = _getHexArray($e); 414 return if !@uv; 415 return if @uv > 1 && $self->{suppressHash} && !$tailoring && 416 exists $self->{suppressHash}{$uv[0]}; 417 $entry = join(CODE_SEP, @uv); # in JCPS 418 419 if (defined $self->{undefChar} || defined $self->{ignoreChar}) { 420 my $ele = pack_U(@uv); 421 422 # regarded as if it were not stored in the table 423 return 424 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; 425 426 # replaced as completely ignorable 427 $k = '[.0000.0000.0000.0000]' 428 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/; 429 } 430 431 # replaced as completely ignorable 432 $k = '[.0000.0000.0000.0000]' 433 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/; 434 435 my $is_L3_ignorable = TRUE; 436 437 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed 438 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. 439 my @wt = _getHexArray($arr); 440 push @key, pack(VCE_TEMPLATE, $var, @wt); 441 $is_L3_ignorable = FALSE 442 if $wt[0] || $wt[1] || $wt[2]; 443 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable 444 # is completely ignorable. 445 # For expansion, an entry $is_L3_ignorable 446 # if and only if "all" CEs are [.0000.0000.0000]. 447 } 448 449 # mapping: be an array ref or not exists (any false value is disallowed) 450 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key; 451 452 # maxlength: be more than 1 or not exists (any false value is disallowed) 453 if (@uv > 1) { 454 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) { 455 $self->{maxlength}{$uv[0]} = @uv; 456 } 457 } 458 459 # contraction: be 1 or not exists (any false value is disallowed) 460 while (@uv > 2) { 461 pop @uv; 462 my $fake_entry = join(CODE_SEP, @uv); # in JCPS 463 $self->{contraction}{$fake_entry} = 1; 464 } 465} 466 467 468sub viewSortKey 469{ 470 my $self = shift; 471 my $str = shift; 472 $self->visualizeSortKey($self->getSortKey($str)); 473} 474 475 476sub process 477{ 478 my $self = shift; 479 my $str = shift; 480 my $prep = $self->{preprocess}; 481 my $norm = $self->{normCode}; 482 483 $str = &$prep($str) if ref $prep; 484 $str = &$norm($str) if ref $norm; 485 return $str; 486} 487 488## 489## arrayref of JCPS = splitEnt(string to be collated) 490## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE) 491## 492sub splitEnt 493{ 494 my $self = shift; 495 my $str = shift; 496 my $wLen = shift; # with Length 497 498 my $map = $self->{mapping}; 499 my $max = $self->{maxlength}; 500 my $reH = $self->{rearrangeHash}; 501 my $vers = $self->{UCA_Version}; 502 my $ver9 = $vers >= 9 && $vers <= 11; 503 my $long = $self->{long_contraction}; 504 my $uXS = $self->{__useXS}; ### XS only 505 506 my @buf; 507 508 # get array of Unicode code point of string. 509 my @src = unpack_U($str); 510 511 # rearrangement: 512 # Character positions are not kept if rearranged, 513 # then neglected if $wLen is true. 514 if ($reH && ! $wLen) { 515 for (my $i = 0; $i < @src; $i++) { 516 if (exists $reH->{ $src[$i] } && $i + 1 < @src) { 517 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]); 518 $i++; 519 } 520 } 521 } 522 523 # remove a code point marked as a completely ignorable. 524 for (my $i = 0; $i < @src; $i++) { 525 if ($vers <= 20 && _isIllegal($src[$i])) { 526 $src[$i] = undef; 527 } elsif ($ver9) { 528 $src[$i] = undef if exists $map->{ $src[$i] } 529 ? @{ $map->{ $src[$i] } } == 0 530 : $uXS && _ignorable_simple($src[$i]); ### XS only 531 } 532 } 533 534 for (my $i = 0; $i < @src; $i++) { 535 my $jcps = $src[$i]; 536 537 # skip removed code point 538 if (! defined $jcps) { 539 if ($wLen && @buf) { 540 $buf[-1][2] = $i + 1; 541 } 542 next; 543 } 544 545 my $i_orig = $i; 546 547 # find contraction 548 if (exists $max->{$jcps}) { 549 my $temp_jcps = $jcps; 550 my $jcpsLen = 1; 551 my $maxLen = $max->{$jcps}; 552 553 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) { 554 next if ! defined $src[$p]; 555 $temp_jcps .= CODE_SEP . $src[$p]; 556 $jcpsLen++; 557 if (exists $map->{$temp_jcps}) { 558 $jcps = $temp_jcps; 559 $i = $p; 560 } 561 } 562 563 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1). 564 # This process requires Unicode::Normalize. 565 # If "normalization" is undef, here should be skipped *always* 566 # (in spite of bool value of $CVgetCombinClass), 567 # since canonical ordering cannot be expected. 568 # Blocked combining character should not be contracted. 569 570 # $self->{normCode} is false in the case of "prenormalized". 571 if ($self->{normalization}) { 572 my $cont = $self->{contraction}; 573 my $preCC = 0; 574 my $preCC_uc = 0; 575 my $jcps_uc = $jcps; 576 my(@out, @out_uc); 577 578 for (my $p = $i + 1; $p < @src; $p++) { 579 next if ! defined $src[$p]; 580 my $curCC = $CVgetCombinClass->($src[$p]); 581 last unless $curCC; 582 my $tail = CODE_SEP . $src[$p]; 583 584 if ($preCC != $curCC && exists $map->{$jcps.$tail}) { 585 $jcps .= $tail; 586 push @out, $p; 587 } else { 588 $preCC = $curCC; 589 } 590 591 next if !$long; 592 593 if ($preCC_uc != $curCC && 594 (exists $map->{$jcps_uc.$tail} || 595 exists $cont->{$jcps_uc.$tail})) { 596 $jcps_uc .= $tail; 597 push @out_uc, $p; 598 } else { 599 $preCC_uc = $curCC; 600 } 601 } 602 603 if (@out_uc && exists $map->{$jcps_uc}) { 604 $jcps = $jcps_uc; 605 $src[$_] = undef for @out_uc; 606 } else { 607 $src[$_] = undef for @out; 608 } 609 } 610 } 611 612 # skip completely ignorable 613 if (exists $map->{$jcps} ? @{ $map->{$jcps} } == 0 : 614 $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only 615 if ($wLen && @buf) { 616 $buf[-1][2] = $i + 1; 617 } 618 next; 619 } 620 621 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps; 622 } 623 return \@buf; 624} 625 626## 627## VCE = _pack_override(input, codepoint, derivCode) 628## 629sub _pack_override ($$$) { 630 my $r = shift; 631 my $u = shift; 632 my $der = shift; 633 634 if (ref $r) { 635 return pack(VCE_TEMPLATE, NON_VAR, @$r); 636 } elsif (defined $r) { 637 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u); 638 } else { 639 $u = 0xFFFD if 0x10FFFF < $u; 640 return $der->($u); 641 } 642} 643 644## 645## list of VCE = getWt(JCPS) 646## 647sub getWt 648{ 649 my $self = shift; 650 my $u = shift; 651 my $map = $self->{mapping}; 652 my $der = $self->{derivCode}; 653 my $out = $self->{overrideOut}; 654 my $uXS = $self->{__useXS}; ### XS only 655 656 return if !defined $u; 657 return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF}; 658 return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE}; 659 $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out; 660 661 my @ce; 662 if (exists $map->{$u}) { 663 @ce = @{ $map->{$u} }; # $u may be a contraction 664### begin XS only ### 665 } elsif ($uXS && _exists_simple($u)) { 666 @ce = _fetch_simple($u); 667### end XS only ### 668 } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) { 669 my $hang = $self->{overrideHangul}; 670 if ($hang) { 671 @ce = map _pack_override($_, $u, $der), $hang->($u); 672 } elsif (!defined $hang) { 673 @ce = $der->($u); 674 } else { 675 my $max = $self->{maxlength}; 676 my @decH = _decompHangul($u); 677 678 if (@decH == 2) { 679 my $contract = join(CODE_SEP, @decH); 680 @decH = ($contract) if exists $map->{$contract}; 681 } else { # must be <@decH == 3> 682 if (exists $max->{$decH[0]}) { 683 my $contract = join(CODE_SEP, @decH); 684 if (exists $map->{$contract}) { 685 @decH = ($contract); 686 } else { 687 $contract = join(CODE_SEP, @decH[0,1]); 688 exists $map->{$contract} and @decH = ($contract, $decH[2]); 689 } 690 # even if V's ignorable, LT contraction is not supported. 691 # If such a situation were required, NFD should be used. 692 } 693 if (@decH == 3 && exists $max->{$decH[1]}) { 694 my $contract = join(CODE_SEP, @decH[1,2]); 695 exists $map->{$contract} and @decH = ($decH[0], $contract); 696 } 697 } 698 699 @ce = map({ 700 exists $map->{$_} ? @{ $map->{$_} } : 701 $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only 702 $der->($_); 703 } @decH); 704 } 705 } elsif ($out && 0x10FFFF < $u) { 706 @ce = map _pack_override($_, $u, $der), $out->($u); 707 } else { 708 my $cjk = $self->{overrideCJK}; 709 my $vers = $self->{UCA_Version}; 710 if ($cjk && _isUIdeo($u, $vers)) { 711 @ce = map _pack_override($_, $u, $der), $cjk->($u); 712 } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) { 713 @ce = _uideoCE_8($u); 714 } else { 715 @ce = $der->($u); 716 } 717 } 718 return map $self->varCE($_), @ce; 719} 720 721 722## 723## string sortkey = getSortKey(string arg) 724## 725sub getSortKey 726{ 727 my $self = shift; 728 my $orig = shift; 729 my $str = $self->process($orig); 730 my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS 731 my $vers = $self->{UCA_Version}; 732 my $term = $self->{hangul_terminator}; 733 my $lev = $self->{level}; 734 my $iden = $self->{identical}; 735 736 my @buf; # weight arrays 737 if ($term) { 738 my $preHST = ''; 739 my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0)); 740 foreach my $jcps (@$rEnt) { 741 # weird things like VL, TL-contraction are not considered! 742 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps; 743 if ($preHST && !$curHST || # hangul before non-hangul 744 $preHST =~ /L\z/ && $curHST =~ /^T/ || 745 $preHST =~ /V\z/ && $curHST =~ /^L/ || 746 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) { 747 push @buf, $termCE; 748 } 749 $preHST = $curHST; 750 push @buf, $self->getWt($jcps); 751 } 752 push @buf, $termCE if $preHST; # end at hangul 753 } else { 754 foreach my $jcps (@$rEnt) { 755 push @buf, $self->getWt($jcps); 756 } 757 } 758 759 my $rkey = $self->mk_SortKey(\@buf); ### XS only 760 761 if ($iden || $vers >= 26 && $lev == MaxLevel) { 762 $rkey .= LEVEL_SEP; 763 $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden; 764 } 765 return $rkey; 766} 767 768 769## 770## int compare = cmp(string a, string b) 771## 772sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) } 773sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) } 774sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) } 775sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) } 776sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) } 777sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) } 778sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) } 779 780## 781## list[strings] sorted = sort(list[strings] arg) 782## 783sub sort { 784 my $obj = shift; 785 return 786 map { $_->[1] } 787 sort{ $a->[0] cmp $b->[0] } 788 map [ $obj->getSortKey($_), $_ ], @_; 789} 790 791 792## 793## bool _nonIgnorAtLevel(arrayref weights, int level) 794## 795sub _nonIgnorAtLevel($$) 796{ 797 my $wt = shift; 798 return if ! defined $wt; 799 my $lv = shift; 800 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE; 801} 802 803## 804## bool _eqArray( 805## arrayref of arrayref[weights] source, 806## arrayref of arrayref[weights] substr, 807## int level) 808## * comparison of graphemes vs graphemes. 809## @$source >= @$substr must be true (check it before call this); 810## 811sub _eqArray($$$) 812{ 813 my $source = shift; 814 my $substr = shift; 815 my $lev = shift; 816 817 for my $g (0..@$substr-1){ 818 # Do the $g'th graphemes have the same number of AV weights? 819 return if @{ $source->[$g] } != @{ $substr->[$g] }; 820 821 for my $w (0..@{ $substr->[$g] }-1) { 822 for my $v (0..$lev-1) { 823 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v]; 824 } 825 } 826 } 827 return 1; 828} 829 830## 831## (int position, int length) 832## int position = index(string, substring, position, [undoc'ed global]) 833## 834## With "global" (only for the list context), 835## returns list of arrayref[position, length]. 836## 837sub index 838{ 839 my $self = shift; 840 $self->{preprocess} and 841 croak "Don't use Preprocess with index(), match(), etc."; 842 $self->{normCode} and 843 croak "Don't use Normalization with index(), match(), etc."; 844 845 my $str = shift; 846 my $len = length($str); 847 my $sub = shift; 848 my $subE = $self->splitEnt($sub); 849 my $pos = @_ ? shift : 0; 850 $pos = 0 if $pos < 0; 851 my $glob = shift; 852 853 my $lev = $self->{level}; 854 my $v2i = $self->{UCA_Version} >= 9 && 855 $self->{variable} ne 'non-ignorable'; 856 857 if (! @$subE) { 858 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos; 859 return $glob 860 ? map([$_, 0], $temp..$len) 861 : wantarray ? ($temp,0) : $temp; 862 } 863 $len < $pos 864 and return wantarray ? () : NOMATCHPOS; 865 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE); 866 @$strE 867 or return wantarray ? () : NOMATCHPOS; 868 869 my(@strWt, @iniPos, @finPos, @subWt, @g_ret); 870 871 my $last_is_variable; 872 for my $vwt (map $self->getWt($_), @$subE) { 873 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 874 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 875 876 # "Ignorable (L1, L2) after Variable" since track. v. 9 877 if ($v2i) { 878 if ($var) { 879 $last_is_variable = TRUE; 880 } 881 elsif (!$wt[0]) { # ignorable 882 $to_be_pushed = FALSE if $last_is_variable; 883 } 884 else { 885 $last_is_variable = FALSE; 886 } 887 } 888 889 if (@subWt && !$var && !$wt[0]) { 890 push @{ $subWt[-1] }, \@wt if $to_be_pushed; 891 } elsif ($to_be_pushed) { 892 push @subWt, [ \@wt ]; 893 } 894 # else ===> skipped 895 } 896 897 my $count = 0; 898 my $end = @$strE - 1; 899 900 $last_is_variable = FALSE; # reuse 901 for (my $i = 0; $i <= $end; ) { # no $i++ 902 my $found_base = 0; 903 904 # fetch a grapheme 905 while ($i <= $end && $found_base == 0) { 906 for my $vwt ($self->getWt($strE->[$i][0])) { 907 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 908 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 909 910 # "Ignorable (L1, L2) after Variable" since track. v. 9 911 if ($v2i) { 912 if ($var) { 913 $last_is_variable = TRUE; 914 } 915 elsif (!$wt[0]) { # ignorable 916 $to_be_pushed = FALSE if $last_is_variable; 917 } 918 else { 919 $last_is_variable = FALSE; 920 } 921 } 922 923 if (@strWt && !$var && !$wt[0]) { 924 push @{ $strWt[-1] }, \@wt if $to_be_pushed; 925 $finPos[-1] = $strE->[$i][2]; 926 } elsif ($to_be_pushed) { 927 push @strWt, [ \@wt ]; 928 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1]; 929 $finPos[-1] = NOMATCHPOS if $found_base; 930 push @finPos, $strE->[$i][2]; 931 $found_base++; 932 } 933 # else ===> no-op 934 } 935 $i++; 936 } 937 938 # try to match 939 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) { 940 if ($iniPos[0] != NOMATCHPOS && 941 $finPos[$#subWt] != NOMATCHPOS && 942 _eqArray(\@strWt, \@subWt, $lev)) { 943 my $temp = $iniPos[0] + $pos; 944 945 if ($glob) { 946 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]]; 947 splice @strWt, 0, $#subWt; 948 splice @iniPos, 0, $#subWt; 949 splice @finPos, 0, $#subWt; 950 } 951 else { 952 return wantarray 953 ? ($temp, $finPos[$#subWt] - $iniPos[0]) 954 : $temp; 955 } 956 } 957 shift @strWt; 958 shift @iniPos; 959 shift @finPos; 960 } 961 } 962 963 return $glob 964 ? @g_ret 965 : wantarray ? () : NOMATCHPOS; 966} 967 968## 969## scalarref to matching part = match(string, substring) 970## 971sub match 972{ 973 my $self = shift; 974 if (my($pos,$len) = $self->index($_[0], $_[1])) { 975 my $temp = substr($_[0], $pos, $len); 976 return wantarray ? $temp : \$temp; 977 # An lvalue ref \substr should be avoided, 978 # since its value is affected by modification of its referent. 979 } 980 else { 981 return; 982 } 983} 984 985## 986## arrayref matching parts = gmatch(string, substring) 987## 988sub gmatch 989{ 990 my $self = shift; 991 my $str = shift; 992 my $sub = shift; 993 return map substr($str, $_->[0], $_->[1]), 994 $self->index($str, $sub, 0, 'g'); 995} 996 997## 998## bool subst'ed = subst(string, substring, replace) 999## 1000sub subst 1001{ 1002 my $self = shift; 1003 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1004 1005 if (my($pos,$len) = $self->index($_[0], $_[1])) { 1006 if ($code) { 1007 my $mat = substr($_[0], $pos, $len); 1008 substr($_[0], $pos, $len, $code->($mat)); 1009 } else { 1010 substr($_[0], $pos, $len, $_[2]); 1011 } 1012 return TRUE; 1013 } 1014 else { 1015 return FALSE; 1016 } 1017} 1018 1019## 1020## int count = gsubst(string, substring, replace) 1021## 1022sub gsubst 1023{ 1024 my $self = shift; 1025 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1026 my $cnt = 0; 1027 1028 # Replacement is carried out from the end, then use reverse. 1029 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) { 1030 if ($code) { 1031 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]); 1032 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat)); 1033 } else { 1034 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]); 1035 } 1036 $cnt++; 1037 } 1038 return $cnt; 1039} 1040 10411; 1042__END__ 1043 1044=head1 NAME 1045 1046Unicode::Collate - Unicode Collation Algorithm 1047 1048=head1 SYNOPSIS 1049 1050 use Unicode::Collate; 1051 1052 #construct 1053 $Collator = Unicode::Collate->new(%tailoring); 1054 1055 #sort 1056 @sorted = $Collator->sort(@not_sorted); 1057 1058 #compare 1059 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. 1060 1061B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted 1062according to Perl's Unicode support. See L<perlunicode>, 1063L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1064Otherwise you can use C<preprocess> or should decode them before. 1065 1066=head1 DESCRIPTION 1067 1068This module is an implementation of Unicode Technical Standard #10 1069(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA). 1070 1071=head2 Constructor and Tailoring 1072 1073The C<new> method returns a collator object. If new() is called 1074with no parameters, the collator should do the default collation. 1075 1076 $Collator = Unicode::Collate->new( 1077 UCA_Version => $UCA_Version, 1078 alternate => $alternate, # alias for 'variable' 1079 backwards => $levelNumber, # or \@levelNumbers 1080 entry => $element, 1081 hangul_terminator => $term_primary_weight, 1082 highestFFFF => $bool, 1083 identical => $bool, 1084 ignoreName => qr/$ignoreName/, 1085 ignoreChar => qr/$ignoreChar/, 1086 ignore_level2 => $bool, 1087 katakana_before_hiragana => $bool, 1088 level => $collationLevel, 1089 long_contraction => $bool, 1090 minimalFFFE => $bool, 1091 normalization => $normalization_form, 1092 overrideCJK => \&overrideCJK, 1093 overrideHangul => \&overrideHangul, 1094 preprocess => \&preprocess, 1095 rearrange => \@charList, 1096 rewrite => \&rewrite, 1097 suppress => \@charList, 1098 table => $filename, 1099 undefName => qr/$undefName/, 1100 undefChar => qr/$undefChar/, 1101 upper_before_lower => $bool, 1102 variable => $variable, 1103 ); 1104 1105=over 4 1106 1107=item UCA_Version 1108 1109If the revision (previously "tracking version") number of UCA is given, 1110behavior of that revision is emulated on collating. 1111If omitted, the return value of C<UCA_Version()> is used. 1112 1113The following revisions are supported. The default is 43. 1114 1115 UCA Unicode Standard DUCET (@version) 1116 ------------------------------------------------------- 1117 8 3.1 3.0.1 (3.0.1d9) 1118 9 3.1 with Corrigendum 3 3.1.1 1119 11 4.0.0 1120 14 4.1.0 1121 16 5.0.0 1122 18 5.1.0 1123 20 5.2.0 1124 22 6.0.0 1125 24 6.1.0 1126 26 6.2.0 1127 28 6.3.0 1128 30 7.0.0 1129 32 8.0.0 1130 34 9.0.0 1131 36 10.0.0 1132 38 11.0.0 1133 40 12.0.0 1134 41 12.1.0 1135 43 13.0.0 1136 1137* See below for C<long_contraction> with C<UCA_Version> 22 and 24. 1138 1139* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden 1140since C<UCA_Version> 22. 1141 1142* Out-of-range codepoints (greater than U+10FFFF) are not ignored, 1143and can be overridden since C<UCA_Version> 22. 1144 1145* Fully ignorable characters were ignored, and would not interrupt 1146contractions with C<UCA_Version> 9 and 11. 1147 1148* Treatment of ignorables after variables and some behaviors 1149were changed at C<UCA_Version> 9. 1150 1151* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>) 1152depend on C<UCA_Version>. 1153 1154* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect 1155C<hangul_terminator>. 1156 1157=item alternate 1158 1159-- see 3.2.2 Alternate Weighting, version 8 of UTS #10 1160 1161For backward compatibility, C<alternate> (old name) can be used 1162as an alias for C<variable>. 1163 1164=item backwards 1165 1166-- see 3.4 Backward Accents, UTS #10. 1167 1168 backwards => $levelNumber or \@levelNumbers 1169 1170Weights in reverse order; ex. level 2 (diacritic ordering) in French. 1171If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>), 1172forwards at all the levels. 1173 1174=item entry 1175 1176-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10. 1177 1178If the same character (or a sequence of characters) exists 1179in the collation element table through C<table>, 1180mapping to collation elements is overridden. 1181If it does not exist, the mapping is defined additionally. 1182 1183 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 11840063 0068 ; [.0E6A.0020.0002.0063] # ch 11850043 0068 ; [.0E6A.0020.0007.0043] # Ch 11860043 0048 ; [.0E6A.0020.0008.0043] # CH 1187006C 006C ; [.0F4C.0020.0002.006C] # ll 1188004C 006C ; [.0F4C.0020.0007.004C] # Ll 1189004C 004C ; [.0F4C.0020.0008.004C] # LL 119000F1 ; [.0F7B.0020.0002.00F1] # n-tilde 1191006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde 119200D1 ; [.0F7B.0020.0008.00D1] # N-tilde 1193004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde 1194ENTRY 1195 1196 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 119700E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e> 119800C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E> 1199ENTRY 1200 1201B<NOTE:> The code point in the UCA file format (before C<';'>) 1202B<must> be a Unicode code point (defined as hexadecimal), 1203but not a native code point. 1204So C<0063> must always denote C<U+0063>, 1205but not a character of C<"\x63">. 1206 1207Weighting may vary depending on collation element table. 1208So ensure the weights defined in C<entry> will be consistent with 1209those in the collation element table loaded via C<table>. 1210 1211In DUCET v4.0.0, primary weight of C<C> is C<0E60> 1212and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A> 1213(as a value between C<0E60> and C<0E6D>) 1214makes ordering as C<C E<lt> CH E<lt> D>. 1215Exactly speaking DUCET already has some characters between C<C> and C<D>: 1216C<small capital C> (C<U+1D04>) with primary weight C<0E64>, 1217C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>, 1218and C<c-curl> (C<U+0255>) with C<0E69>. 1219Then primary weight C<0E6A> for C<CH> makes C<CH> 1220ordered between C<c-curl> and C<D>. 1221 1222=item hangul_terminator 1223 1224-- see 7.1.4 Trailing Weights, UTS #10. 1225 1226If a true value is given (non-zero but should be positive), 1227it will be added as a terminator primary weight to the end of 1228every standard Hangul syllable. Secondary and any higher weights 1229for terminator are set to zero. 1230If the value is false or C<hangul_terminator> key does not exist, 1231insertion of terminator weights will not be performed. 1232 1233Boundaries of Hangul syllables are determined 1234according to conjoining Jamo behavior in F<the Unicode Standard> 1235and F<HangulSyllableType.txt>. 1236 1237B<Implementation Note:> 1238(1) For expansion mapping (Unicode character mapped 1239to a sequence of collation elements), a terminator will not be added 1240between collation elements, even if Hangul syllable boundary exists there. 1241Addition of terminator is restricted to the next position 1242to the last collation element. 1243 1244(2) Non-conjoining Hangul letters 1245(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not 1246automatically terminated with a terminator primary weight. 1247These characters may need terminator included in a collation element 1248table beforehand. 1249 1250=item highestFFFF 1251 1252-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation. 1253 1254If the parameter is made true, C<U+FFFF> has a highest primary weight. 1255When a boolean of C<$coll-E<gt>ge($str, "abc")> and 1256C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str> 1257begins with C<"abc">, or another primary equivalent. 1258C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF> 1259such as C<"abc\x{FFFF}xyz">. 1260 1261C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")> 1262almost, but the latter has a problem that you should know which letter is 1263next to C<c>. For a certain language where C<ch> as the next letter, 1264C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">. 1265 1266Note: 1267This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>. 1268Any other character than C<U+FFFF> can be tailored by C<entry>. 1269 1270=item identical 1271 1272-- see A.3 Deterministic Comparison, UTS #10. 1273 1274By default, strings whose weights are equal should be equal, 1275even though their code points are not equal. 1276Completely ignorable characters are ignored. 1277 1278If the parameter is made true, a final, tie-breaking level is used. 1279If no difference of weights is found after the comparison through 1280all the level specified by C<level>, the comparison with code points 1281will be performed. 1282For the tie-breaking comparison, the sort key has code points 1283of the original string appended. 1284Completely ignorable characters are not ignored. 1285 1286If C<preprocess> and/or C<normalization> is applied, the code points 1287of the string after them (in NFD by default) are used. 1288 1289=item ignoreChar 1290 1291=item ignoreName 1292 1293-- see 3.6 Variable Weighting, UTS #10. 1294 1295Makes the entry in the table completely ignorable; 1296i.e. as if the weights were zero at all level. 1297 1298Through C<ignoreChar>, any character matching C<qr/$ignoreChar/> 1299will be ignored. Through C<ignoreName>, any character whose name 1300(given in the C<table> file as a comment) matches C<qr/$ignoreName/> 1301will be ignored. 1302 1303E.g. when 'a' and 'e' are ignorable, 1304'element' is equal to 'lament' (or 'lmnt'). 1305 1306=item ignore_level2 1307 1308-- see 5.1 Parametric Tailoring, UTS #10. 1309 1310By default, case-sensitive comparison (that is level 3 difference) 1311won't ignore accents (that is level 2 difference). 1312 1313If the parameter is made true, accents (and other primary ignorable 1314characters) are ignored, even though cases are taken into account. 1315 1316B<NOTE>: C<level> should be 3 or greater. 1317 1318=item katakana_before_hiragana 1319 1320-- see 7.2 Tertiary Weight Table, UTS #10. 1321 1322By default, hiragana is before katakana. 1323If the parameter is made true, this is reversed. 1324 1325B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana 1326distinctions must occur in level 3, and their weights at level 3 must be 1327same as those mentioned in 7.3.1, UTS #10. 1328If you define your collation elements which violate this requirement, 1329this parameter does not work validly. 1330 1331=item level 1332 1333-- see 4.3 Form Sort Key, UTS #10. 1334 1335Set the maximum level. 1336Any higher levels than the specified one are ignored. 1337 1338 Level 1: alphabetic ordering 1339 Level 2: diacritic ordering 1340 Level 3: case ordering 1341 Level 4: tie-breaking (e.g. in the case when variable is 'shifted') 1342 1343 ex.level => 2, 1344 1345If omitted, the maximum is the 4th. 1346 1347B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level. 1348But this module only uses weights within 0xFFFF. 1349When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted' 1350and 'shift-trimmed'), the level 4 may be unreliable. 1351 1352See also C<identical>. 1353 1354=item long_contraction 1355 1356-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10. 1357 1358If the parameter is made true, for a contraction with three or more 1359characters (here nicknamed "long contraction"), initial substrings 1360will be handled. 1361For example, a contraction ABC, where A is a starter, and B and C 1362are non-starters (character with non-zero combining character class), 1363will be detected even if there is not AB as a contraction. 1364 1365B<Default:> Usually false. 1366If C<UCA_Version> is 22 or 24, and the value of C<long_contraction> 1367is not specified in C<new()>, a true value is set implicitly. 1368This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0. 1369 1370C<change()> handles C<long_contraction> explicitly only. 1371If C<long_contraction> is not specified in C<change()>, even though 1372C<UCA_Version> is changed, C<long_contraction> will not be changed. 1373 1374B<Limitation:> Scanning non-starters is one-way (no back tracking). 1375If AB is found but not ABC is not found, other long contraction where 1376the first character is A and the second is not B may not be found. 1377 1378Under C<(normalization =E<gt> undef)>, detection step of discontiguous 1379contractions will be skipped. 1380 1381B<Note:> The following contractions in DUCET are not considered 1382in steps S2.1.1 to S2.1.3, where they are discontiguous. 1383 1384 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR) 1385 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL) 1386 1387For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY> 1388(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD. 1389In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected, 1390instead of C<0FB2 0F71 0F80>. 1391Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of 1392contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected. 1393 1394=item minimalFFFE 1395 1396-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation. 1397 1398If the parameter is made true, C<U+FFFE> has a minimal primary weight. 1399The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2"> 1400first compares C<$a1> and C<$b1> at level 1, and 1401then C<$a2> and C<$b2> at level 1, as followed. 1402 1403 "ab\x{FFFE}a" 1404 "Ab\x{FFFE}a" 1405 "ab\x{FFFE}c" 1406 "Ab\x{FFFE}c" 1407 "ab\x{FFFE}xyz" 1408 "abc\x{FFFE}def" 1409 "abc\x{FFFE}xYz" 1410 "aBc\x{FFFE}xyz" 1411 "abcX\x{FFFE}def" 1412 "abcx\x{FFFE}xyz" 1413 "b\x{FFFE}aaa" 1414 "bbb\x{FFFE}a" 1415 1416Note: 1417This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>. 1418Any other character than C<U+FFFE> can be tailored by C<entry>. 1419 1420=item normalization 1421 1422-- see 4.1 Normalize, UTS #10. 1423 1424If specified, strings are normalized before preparation of sort keys 1425(the normalization is executed after preprocess). 1426 1427A form name C<Unicode::Normalize::normalize()> accepts will be applied 1428as C<$normalization_form>. 1429Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. 1430See C<Unicode::Normalize::normalize()> for detail. 1431If omitted, C<'NFD'> is used. 1432 1433C<normalization> is performed after C<preprocess> (if defined). 1434 1435Furthermore, special values, C<undef> and C<"prenormalized">, can be used, 1436though they are not concerned with C<Unicode::Normalize::normalize()>. 1437 1438If C<undef> (not a string C<"undef">) is passed explicitly 1439as the value for this key, 1440any normalization is not carried out (this may make tailoring easier 1441if any normalization is not desired). Under C<(normalization =E<gt> undef)>, 1442only contiguous contractions are resolved; 1443e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>, 1444C<A-cedilla-ring> would be primary equal to C<A>. 1445In this point, 1446C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })> 1447B<is not> equivalent to C<(normalization =E<gt> 'NFD')>. 1448 1449In the case of C<(normalization =E<gt> "prenormalized")>, 1450any normalization is not performed, but 1451discontiguous contractions with combining characters are performed. 1452Therefore 1453C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })> 1454B<is> equivalent to C<(normalization =E<gt> 'NFD')>. 1455If source strings are finely prenormalized, 1456C<(normalization =E<gt> 'prenormalized')> may save time for normalization. 1457 1458Except C<(normalization =E<gt> undef)>, 1459B<Unicode::Normalize> is required (see also B<CAVEAT>). 1460 1461=item overrideCJK 1462 1463-- see 7.1 Derived Collation Elements, UTS #10. 1464 1465By default, CJK unified ideographs are ordered in Unicode codepoint 1466order, but those in the CJK Unified Ideographs block are less than 1467those in the CJK Unified Ideographs Extension A etc. 1468 1469 In the CJK Unified Ideographs block: 1470 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11. 1471 U+4E00..U+9FBB if UCA_Version is 14 or 16. 1472 U+4E00..U+9FC3 if UCA_Version is 18. 1473 U+4E00..U+9FCB if UCA_Version is 20 or 22. 1474 U+4E00..U+9FCC if UCA_Version is 24 to 30. 1475 U+4E00..U+9FD5 if UCA_Version is 32 or 34. 1476 U+4E00..U+9FEA if UCA_Version is 36. 1477 U+4E00..U+9FEF if UCA_Version is 38, 40 or 41. 1478 U+4E00..U+9FFC if UCA_Version is 43. 1479 1480 In the CJK Unified Ideographs Extension blocks: 1481 Ext.A (U+3400..U+4DB5) if UCA_Version is 8 to 41. 1482 Ext.A (U+3400..U+4DBF) if UCA_Version is 43. 1483 Ext.B (U+20000..U+2A6D6) if UCA_Version is 8 to 41. 1484 Ext.B (U+20000..U+2A6DD) if UCA_Version is 43. 1485 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later. 1486 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later. 1487 Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32 or later. 1488 Ext.F (U+2CEB0..U+2EBE0) if UCA_Version is 36 or later. 1489 Ext.G (U+30000..U+3134A) if UCA_Version is 43. 1490 1491Through C<overrideCJK>, ordering of CJK unified ideographs (including 1492extensions) can be overridden. 1493 1494ex. CJK unified ideographs in the JIS code point order. 1495 1496 overrideCJK => sub { 1497 my $u = shift; # get a Unicode codepoint 1498 my $b = pack('n', $u); # to UTF-16BE 1499 my $s = your_unicode_to_sjis_converter($b); # convert 1500 my $n = unpack('n', $s); # convert sjis to short 1501 [ $n, 0x20, 0x2, $u ]; # return the collation element 1502 }, 1503 1504The return value may be an arrayref of 1st to 4th weights as shown 1505above. The return value may be an integer as the primary weight 1506as shown below. If C<undef> is returned, the default derived 1507collation element will be used. 1508 1509 overrideCJK => sub { 1510 my $u = shift; # get a Unicode codepoint 1511 my $b = pack('n', $u); # to UTF-16BE 1512 my $s = your_unicode_to_sjis_converter($b); # convert 1513 my $n = unpack('n', $s); # convert sjis to short 1514 return $n; # return the primary weight 1515 }, 1516 1517The return value may be a list containing zero or more of 1518an arrayref, an integer, or C<undef>. 1519 1520ex. ignores all CJK unified ideographs. 1521 1522 overrideCJK => sub {()}, # CODEREF returning empty list 1523 1524 # where ->eq("Pe\x{4E00}rl", "Perl") is true 1525 # as U+4E00 is a CJK unified ideograph and to be ignorable. 1526 1527If a false value (including C<undef>) is passed, C<overrideCJK> 1528has no effect. 1529C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one. 1530 1531But assignment of weight for CJK unified ideographs 1532in C<table> or C<entry> is still valid. 1533If C<undef> is passed explicitly as the value for this key, 1534weights for CJK unified ideographs are treated as undefined. 1535However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)> 1536has no special meaning. 1537 1538B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>, 1539C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>, 1540C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified 1541ideographs. But they can't be overridden via C<overrideCJK> when you use 1542DUCET, as the table includes weights for them. C<table> or C<entry> has 1543priority over C<overrideCJK>. 1544 1545=item overrideHangul 1546 1547-- see 7.1 Derived Collation Elements, UTS #10. 1548 1549By default, Hangul syllables are decomposed into Hangul Jamo, 1550even if C<(normalization =E<gt> undef)>. 1551But the mapping of Hangul syllables may be overridden. 1552 1553This parameter works like C<overrideCJK>, so see there for examples. 1554 1555If you want to override the mapping of Hangul syllables, 1556NFD and NFKD are not appropriate, since NFD and NFKD will decompose 1557Hangul syllables before overriding. FCD may decompose Hangul syllables 1558as the case may be. 1559 1560If a false value (but not C<undef>) is passed, C<overrideHangul> 1561has no effect. 1562C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one. 1563 1564If C<undef> is passed explicitly as the value for this key, 1565weight for Hangul syllables is treated as undefined 1566without decomposition into Hangul Jamo. 1567But definition of weight for Hangul syllables 1568in C<table> or C<entry> is still valid. 1569 1570=item overrideOut 1571 1572-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10. 1573 1574Perl seems to allow out-of-range values (greater than 0x10FFFF). 1575By default, out-of-range values are replaced with C<U+FFFD> 1576(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22, 1577or ignored when C<UCA_Version> E<lt>= 20. 1578 1579When C<UCA_Version> E<gt>= 22, the weights of out-of-range values 1580can be overridden. Though C<table> or C<entry> are available for them, 1581out-of-range values are too many. 1582 1583C<overrideOut> can perform it algorithmically. 1584This parameter works like C<overrideCJK>, so see there for examples. 1585 1586ex. ignores all out-of-range values. 1587 1588 overrideOut => sub {()}, # CODEREF returning empty list 1589 1590If a false value (including C<undef>) is passed, C<overrideOut> 1591has no effect. 1592C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one. 1593 1594B<NOTE ABOUT U+FFFD:> 1595 1596UCA recommends that out-of-range values should not be ignored for security 1597reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">. 1598However, C<U+FFFD> is wrongly mapped to a variable collation element 1599in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be 1600ignored when C<variable> isn't C<Non-ignorable>. 1601 1602The mapping of C<U+FFFD> is corrected in Unicode 6.3.0. 1603see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights> 1604(7.1.4 Trailing Weights). Such a correction is reproduced by this. 1605 1606 overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer 1607 1608This workaround is unnecessary since Unicode 6.3.0. 1609 1610=item preprocess 1611 1612-- see 5.4 Preprocessing, UTS #10. 1613 1614If specified, the coderef is used to preprocess each string 1615before the formation of sort keys. 1616 1617ex. dropping English articles, such as "a" or "the". 1618Then, "the pen" is before "a pencil". 1619 1620 preprocess => sub { 1621 my $str = shift; 1622 $str =~ s/\b(?:an?|the)\s+//gi; 1623 return $str; 1624 }, 1625 1626C<preprocess> is performed before C<normalization> (if defined). 1627 1628ex. decoding strings in a legacy encoding such as shift-jis: 1629 1630 $sjis_collator = Unicode::Collate->new( 1631 preprocess => \&your_shiftjis_to_unicode_decoder, 1632 ); 1633 @result = $sjis_collator->sort(@shiftjis_strings); 1634 1635B<Note:> Strings returned from the coderef will be interpreted 1636according to Perl's Unicode support. See L<perlunicode>, 1637L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1638 1639=item rearrange 1640 1641-- see 3.5 Rearrangement, UTS #10. 1642 1643Characters that are not coded in logical order and to be rearranged. 1644If C<UCA_Version> is equal to or less than 11, default is: 1645 1646 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ], 1647 1648If you want to disallow any rearrangement, pass C<undef> or C<[]> 1649(a reference to empty list) as the value for this key. 1650 1651If C<UCA_Version> is equal to or greater than 14, default is C<[]> 1652(i.e. no rearrangement). 1653 1654B<According to the version 9 of UCA, this parameter shall not be used; 1655but it is not warned at present.> 1656 1657=item rewrite 1658 1659If specified, the coderef is used to rewrite lines in C<table> or C<entry>. 1660The coderef will get each line, and then should return a rewritten line 1661according to the UCA file format. 1662If the coderef returns an empty line, the line will be skipped. 1663 1664e.g. any primary ignorable characters into tertiary ignorable: 1665 1666 rewrite => sub { 1667 my $line = shift; 1668 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g; 1669 return $line; 1670 }, 1671 1672This example shows rewriting weights. C<rewrite> is allowed to 1673affect code points, weights, and the name. 1674 1675B<NOTE>: C<table> is available to use another table file; 1676preparing a modified table once would be more efficient than 1677rewriting lines on reading an unmodified table every time. 1678 1679=item suppress 1680 1681-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation. 1682 1683Contractions beginning with the specified characters are suppressed, 1684even if those contractions are defined in C<table>. 1685 1686An example for Russian and some languages using the Cyrillic script: 1687 1688 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F], 1689 1690where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE. 1691 1692B<NOTE>: Contractions via C<entry> will not be suppressed. 1693 1694=item table 1695 1696-- see 3.8 Default Unicode Collation Element Table, UTS #10. 1697 1698You can use another collation element table if desired. 1699 1700The table file should locate in the F<Unicode/Collate> directory 1701on C<@INC>. Say, if the filename is F<Foo.txt>, 1702the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>. 1703 1704By default, F<allkeys.txt> (as the filename of DUCET) is used. 1705If you will prepare your own table file, any name other than F<allkeys.txt> 1706may be better to avoid namespace conflict. 1707 1708B<NOTE>: When XSUB is used, the DUCET is compiled on building this 1709module, and it may save time at the run time. 1710Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table, 1711or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or 1712C<rewrite> will prevent this module from using the compiled DUCET. 1713 1714If C<undef> is passed explicitly as the value for this key, 1715no file is read (but you can define collation elements via C<entry>). 1716 1717A typical way to define a collation element table 1718without any file of table: 1719 1720 $onlyABC = Unicode::Collate->new( 1721 table => undef, 1722 entry => << 'ENTRIES', 17230061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A 17240041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A 17250062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B 17260042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B 17270063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C 17280043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C 1729ENTRIES 1730 ); 1731 1732If C<ignoreName> or C<undefName> is used, character names should be 1733specified as a comment (following C<#>) on each line. 1734 1735=item undefChar 1736 1737=item undefName 1738 1739-- see 6.3.3 Reducing the Repertoire, UTS #10. 1740 1741Undefines the collation element as if it were unassigned in the C<table>. 1742This reduces the size of the table. 1743If an unassigned character appears in the string to be collated, 1744the sort key is made from its codepoint 1745as a single-character collation element, 1746as it is greater than any other assigned collation elements 1747(in the codepoint order among the unassigned characters). 1748But, it'd be better to ignore characters 1749unfamiliar to you and maybe never used. 1750 1751Through C<undefChar>, any character matching C<qr/$undefChar/> 1752will be undefined. Through C<undefName>, any character whose name 1753(given in the C<table> file as a comment) matches C<qr/$undefName/> 1754will be undefined. 1755 1756ex. Collation weights for beyond-BMP characters are not stored in object: 1757 1758 undefChar => qr/[^\0-\x{fffd}]/, 1759 1760=item upper_before_lower 1761 1762-- see 6.6 Case Comparisons, UTS #10. 1763 1764By default, lowercase is before uppercase. 1765If the parameter is made true, this is reversed. 1766 1767B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase 1768distinctions must occur in level 3, and their weights at level 3 must be 1769same as those mentioned in 7.3.1, UTS #10. 1770If you define your collation elements which differs from this requirement, 1771this parameter doesn't work validly. 1772 1773=item variable 1774 1775-- see 3.6 Variable Weighting, UTS #10. 1776 1777This key allows for variable weighting of variable collation elements, 1778which are marked with an ASTERISK in the table 1779(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>). 1780 1781 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. 1782 1783These names are case-insensitive. 1784By default (if specification is omitted), 'shifted' is adopted. 1785 1786 'Blanked' Variable elements are made ignorable at levels 1 through 3; 1787 considered at the 4th level. 1788 1789 'Non-Ignorable' Variable elements are not reset to ignorable. 1790 1791 'Shifted' Variable elements are made ignorable at levels 1 through 3 1792 their level 4 weight is replaced by the old level 1 weight. 1793 Level 4 weight for Non-Variable elements is 0xFFFF. 1794 1795 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level 1796 are trimmed. 1797 1798=back 1799 1800=head2 Methods for Collation 1801 1802=over 4 1803 1804=item C<@sorted = $Collator-E<gt>sort(@not_sorted)> 1805 1806Sorts a list of strings. 1807 1808=item C<$result = $Collator-E<gt>cmp($a, $b)> 1809 1810Returns 1 (when C<$a> is greater than C<$b>) 1811or 0 (when C<$a> is equal to C<$b>) 1812or -1 (when C<$a> is less than C<$b>). 1813 1814=item C<$result = $Collator-E<gt>eq($a, $b)> 1815 1816=item C<$result = $Collator-E<gt>ne($a, $b)> 1817 1818=item C<$result = $Collator-E<gt>lt($a, $b)> 1819 1820=item C<$result = $Collator-E<gt>le($a, $b)> 1821 1822=item C<$result = $Collator-E<gt>gt($a, $b)> 1823 1824=item C<$result = $Collator-E<gt>ge($a, $b)> 1825 1826They works like the same name operators as theirs. 1827 1828 eq : whether $a is equal to $b. 1829 ne : whether $a is not equal to $b. 1830 lt : whether $a is less than $b. 1831 le : whether $a is less than $b or equal to $b. 1832 gt : whether $a is greater than $b. 1833 ge : whether $a is greater than $b or equal to $b. 1834 1835=item C<$sortKey = $Collator-E<gt>getSortKey($string)> 1836 1837-- see 4.3 Form Sort Key, UTS #10. 1838 1839Returns a sort key. 1840 1841You compare the sort keys using a binary comparison 1842and get the result of the comparison of the strings using UCA. 1843 1844 $Collator->getSortKey($a) cmp $Collator->getSortKey($b) 1845 1846 is equivalent to 1847 1848 $Collator->cmp($a, $b) 1849 1850=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)> 1851 1852Converts a sorting key into its representation form. 1853If C<UCA_Version> is 8, the output is slightly different. 1854 1855 use Unicode::Collate; 1856 my $c = Unicode::Collate->new(); 1857 print $c->viewSortKey("Perl"),"\n"; 1858 1859 # output: 1860 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] 1861 # Level 1 Level 2 Level 3 Level 4 1862 1863=back 1864 1865=head2 Methods for Searching 1866 1867The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work 1868like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively, 1869but they are not aware of any pattern, but only a literal substring. 1870 1871B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true 1872for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>, 1873C<subst>, C<gsubst>) is croaked, as the position and the length might 1874differ from those on the specified string. 1875 1876C<rearrange> and C<hangul_terminator> parameters are neglected. 1877C<katakana_before_hiragana> and C<upper_before_lower> don't affect 1878matching and searching, as it doesn't matter whether greater or less. 1879 1880=over 4 1881 1882=item C<$position = $Collator-E<gt>index($string, $substring[, $position])> 1883 1884=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])> 1885 1886If C<$substring> matches a part of C<$string>, returns 1887the position of the first occurrence of the matching part in scalar context; 1888in list context, returns a two-element list of 1889the position and the length of the matching part. 1890 1891If C<$substring> does not match any part of C<$string>, 1892returns C<-1> in scalar context and 1893an empty list in list context. 1894 1895e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">, 1896you say the following where C<$sub> is C<"M>E<uuml>C<SS">, 1897 1898 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1899 # (normalization => undef) is REQUIRED. 1900 my $match; 1901 if (my($pos,$len) = $Collator->index($str, $sub)) { 1902 $match = substr($str, $pos, $len); 1903 } 1904 1905and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<"> 1906is primary equal to C<"M>E<uuml>C<SS">. 1907 1908=item C<$match_ref = $Collator-E<gt>match($string, $substring)> 1909 1910=item C<($match) = $Collator-E<gt>match($string, $substring)> 1911 1912If C<$substring> matches a part of C<$string>, in scalar context, returns 1913B<a reference to> the first occurrence of the matching part 1914(C<$match_ref> is always true if matches, 1915since every reference is B<true>); 1916in list context, returns the first occurrence of the matching part. 1917 1918If C<$substring> does not match any part of C<$string>, 1919returns C<undef> in scalar context and 1920an empty list in list context. 1921 1922e.g. 1923 1924 if ($match_ref = $Collator->match($str, $sub)) { # scalar context 1925 print "matches [$$match_ref].\n"; 1926 } else { 1927 print "doesn't match.\n"; 1928 } 1929 1930 or 1931 1932 if (($match) = $Collator->match($str, $sub)) { # list context 1933 print "matches [$match].\n"; 1934 } else { 1935 print "doesn't match.\n"; 1936 } 1937 1938=item C<@match = $Collator-E<gt>gmatch($string, $substring)> 1939 1940If C<$substring> matches a part of C<$string>, returns 1941all the matching parts (or matching count in scalar context). 1942 1943If C<$substring> does not match any part of C<$string>, 1944returns an empty list. 1945 1946=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)> 1947 1948If C<$substring> matches a part of C<$string>, 1949the first occurrence of the matching part is replaced by C<$replacement> 1950(C<$string> is modified) and C<$count> (always equals to C<1>) is returned. 1951 1952C<$replacement> can be a C<CODEREF>, 1953taking the matching part as an argument, 1954and returning a string to replace the matching part 1955(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>). 1956 1957=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)> 1958 1959If C<$substring> matches a part of C<$string>, 1960all the occurrences of the matching part are replaced by C<$replacement> 1961(C<$string> is modified) and C<$count> is returned. 1962 1963C<$replacement> can be a C<CODEREF>, 1964taking the matching part as an argument, 1965and returning a string to replace the matching part 1966(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>). 1967 1968e.g. 1969 1970 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1971 # (normalization => undef) is REQUIRED. 1972 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l..."; 1973 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); 1974 1975 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>..."; 1976 # i.e., all the camels are made bold-faced. 1977 1978 Examples: levels and ignore_level2 - what does camel match? 1979 --------------------------------------------------------------------------- 1980 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l 1981 -----------------------|--------------------------------------------------- 1982 1 false | yes yes yes yes yes 1983 2 false | yes yes no yes yes 1984 3 false | yes no no yes yes 1985 4 false | yes no no no yes 1986 -----------------------|--------------------------------------------------- 1987 1 true | yes yes yes yes yes 1988 2 true | yes yes yes yes yes 1989 3 true | yes no yes yes yes 1990 4 true | yes no yes no yes 1991 --------------------------------------------------------------------------- 1992 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l 1993 at any level. 1994 1995=back 1996 1997=head2 Other Methods 1998 1999=over 4 2000 2001=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)> 2002 2003=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)> 2004 2005Changes the value of specified keys and returns the changed part. 2006 2007 $Collator = Unicode::Collate->new(level => 4); 2008 2009 $Collator->eq("perl", "PERL"); # false 2010 2011 %old = $Collator->change(level => 2); # returns (level => 4). 2012 2013 $Collator->eq("perl", "PERL"); # true 2014 2015 $Collator->change(%old); # returns (level => 2). 2016 2017 $Collator->eq("perl", "PERL"); # false 2018 2019Not all C<(key,value)>s are allowed to be changed. 2020See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>. 2021 2022In the scalar context, returns the modified collator 2023(but it is B<not> a clone from the original). 2024 2025 $Collator->change(level => 2)->eq("perl", "PERL"); # true 2026 2027 $Collator->eq("perl", "PERL"); # true; now max level is 2nd. 2028 2029 $Collator->change(level => 4)->eq("perl", "PERL"); # false 2030 2031=item C<$version = $Collator-E<gt>version()> 2032 2033Returns the version number (a string) of the Unicode Standard 2034which the C<table> file used by the collator object is based on. 2035If the table does not include a version line (starting with C<@version>), 2036returns C<"unknown">. 2037 2038=item C<UCA_Version()> 2039 2040Returns the revision number of UTS #10 this module consults, 2041that should correspond with the DUCET incorporated. 2042 2043=item C<Base_Unicode_Version()> 2044 2045Returns the version number of UTS #10 this module consults, 2046that should correspond with the DUCET incorporated. 2047 2048=back 2049 2050=head1 EXPORT 2051 2052No method will be exported. 2053 2054=head1 INSTALL 2055 2056Though this module can be used without any C<table> file, 2057to use this module easily, it is recommended to install a table file 2058in the UCA format, by copying it under the directory 2059<a place in @INC>/Unicode/Collate. 2060 2061The most preferable one is "The Default Unicode Collation Element Table" 2062(aka DUCET), available from the Unicode Consortium's website: 2063 2064 http://www.unicode.org/Public/UCA/ 2065 2066 http://www.unicode.org/Public/UCA/latest/allkeys.txt 2067 (latest version) 2068 2069If DUCET is not installed, it is recommended to copy the file 2070from http://www.unicode.org/Public/UCA/latest/allkeys.txt 2071to <a place in @INC>/Unicode/Collate/allkeys.txt 2072manually. 2073 2074=head1 CAVEATS 2075 2076=over 4 2077 2078=item Normalization 2079 2080Use of the C<normalization> parameter requires the B<Unicode::Normalize> 2081module (see L<Unicode::Normalize>). 2082 2083If you need not it (say, in the case when you need not 2084handle any combining characters), 2085assign C<(normalization =E<gt> undef)> explicitly. 2086 2087-- see 6.5 Avoiding Normalization, UTS #10. 2088 2089=item Conformance Test 2090 2091The Conformance Test for the UCA is available 2092under L<http://www.unicode.org/Public/UCA/>. 2093 2094For F<CollationTest_SHIFTED.txt>, 2095a collator via C<Unicode::Collate-E<gt>new( )> should be used; 2096for F<CollationTest_NON_IGNORABLE.txt>, a collator via 2097C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. 2098 2099If C<UCA_Version> is 26 or later, the C<identical> level is preferred; 2100C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and 2101C<Unicode::Collate-E<gt>new(identical =E<gt> 1,> 2102C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used. 2103 2104B<Unicode::Normalize is required to try The Conformance Test.> 2105 2106B<EBCDIC-SUPPORT IS EXPERIMENTAL.> 2107 2108=back 2109 2110=head1 AUTHOR, COPYRIGHT AND LICENSE 2111 2112The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, 2113<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2021, 2114SADAHIRO Tomoyuki. Japan. All rights reserved. 2115 2116This module is free software; you can redistribute it and/or 2117modify it under the same terms as Perl itself. 2118 2119The file Unicode/Collate/allkeys.txt was copied verbatim 2120from L<http://www.unicode.org/Public/UCA/13.0.0/allkeys.txt>. 2121For this file, Copyright (c) 2020 Unicode, Inc.; distributed 2122under the Terms of Use in L<http://www.unicode.org/terms_of_use.html> 2123 2124=head1 SEE ALSO 2125 2126=over 4 2127 2128=item Unicode Collation Algorithm - UTS #10 2129 2130L<http://www.unicode.org/reports/tr10/> 2131 2132=item The Default Unicode Collation Element Table (DUCET) 2133 2134L<http://www.unicode.org/Public/UCA/latest/allkeys.txt> 2135 2136=item The conformance test for the UCA 2137 2138L<http://www.unicode.org/Public/UCA/latest/CollationTest.html> 2139 2140L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip> 2141 2142=item Hangul Syllable Type 2143 2144L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> 2145 2146=item Unicode Normalization Forms - UAX #15 2147 2148L<http://www.unicode.org/reports/tr15/> 2149 2150=item Unicode Locale Data Markup Language (LDML) - UTS #35 2151 2152L<http://www.unicode.org/reports/tr35/> 2153 2154=back 2155 2156=cut 2157