1package Unicode::Collate; 2 3use 5.006; 4use strict; 5use warnings; 6use Carp; 7use File::Spec; 8 9no warnings 'utf8'; 10 11our $VERSION = '1.31'; 12our $PACKAGE = __PACKAGE__; 13 14my @Path = qw(Unicode Collate); 15my $KeyFile = 'allkeys.txt'; 16 17# Perl's boolean 18use constant TRUE => 1; 19use constant FALSE => ""; 20use constant NOMATCHPOS => -1; 21 22# A coderef to get combining class imported from Unicode::Normalize 23# (i.e. \&Unicode::Normalize::getCombinClass). 24# This is also used as a HAS_UNICODE_NORMALIZE flag. 25my $CVgetCombinClass; 26 27# Supported Levels 28use constant MinLevel => 1; 29use constant MaxLevel => 4; 30 31# Minimum weights at level 2 and 3, respectively 32use constant Min2Wt => 0x20; 33use constant Min3Wt => 0x02; 34 35# Shifted weight at 4th level 36use constant Shift4Wt => 0xFFFF; 37 38# A boolean for Variable and 16-bit weights at 4 levels of Collation Element 39use constant VCE_TEMPLATE => 'Cn4'; 40 41# A sort key: 16-bit weights 42use constant KEY_TEMPLATE => 'n*'; 43 44# The tie-breaking: 32-bit weights 45use constant TIE_TEMPLATE => 'N*'; 46 47# Level separator in a sort key: 48# i.e. pack(KEY_TEMPLATE, 0) 49use constant LEVEL_SEP => "\0\0"; 50 51# As Unicode code point separator for hash keys. 52# A joined code point string (denoted by JCPS below) 53# like "65;768" is used for internal processing 54# instead of Perl's Unicode string like "\x41\x{300}", 55# as the native code point is different from the Unicode code point 56# on EBCDIC platform. 57# This character must not be included in any stringified 58# representation of an integer. 59use constant CODE_SEP => ';'; 60 # NOTE: in regex /;/ is used for $jcps! 61 62# boolean values of variable weights 63use constant NON_VAR => 0; # Non-Variable character 64use constant VAR => 1; # Variable character 65 66# specific code points 67use constant Hangul_SBase => 0xAC00; ### PP only 68use constant Hangul_SIni => 0xAC00; 69use constant Hangul_SFin => 0xD7A3; 70### begin PP only ### 71use constant Hangul_NCount => 588; 72use constant Hangul_TCount => 28; 73use constant Hangul_LBase => 0x1100; 74use constant Hangul_LIni => 0x1100; 75use constant Hangul_LFin => 0x1159; 76use constant Hangul_LFill => 0x115F; 77use constant Hangul_LEnd => 0x115F; # Unicode 5.2 78use constant Hangul_VBase => 0x1161; 79use constant Hangul_VIni => 0x1160; # from Vowel Filler 80use constant Hangul_VFin => 0x11A2; 81use constant Hangul_VEnd => 0x11A7; # Unicode 5.2 82use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint 83use constant Hangul_TIni => 0x11A8; 84use constant Hangul_TFin => 0x11F9; 85use constant Hangul_TEnd => 0x11FF; # Unicode 5.2 86use constant HangulL2Ini => 0xA960; # Unicode 5.2 87use constant HangulL2Fin => 0xA97C; # Unicode 5.2 88use constant HangulV2Ini => 0xD7B0; # Unicode 5.2 89use constant HangulV2Fin => 0xD7C6; # Unicode 5.2 90use constant HangulT2Ini => 0xD7CB; # Unicode 5.2 91use constant HangulT2Fin => 0xD7FB; # Unicode 5.2 92 93use constant CJK_UidIni => 0x4E00; 94use constant CJK_UidFin => 0x9FA5; 95use constant CJK_UidF41 => 0x9FBB; # Unicode 4.1 96use constant CJK_UidF51 => 0x9FC3; # Unicode 5.1 97use constant CJK_UidF52 => 0x9FCB; # Unicode 5.2 98use constant CJK_UidF61 => 0x9FCC; # Unicode 6.1 99use constant CJK_UidF80 => 0x9FD5; # Unicode 8.0 100use constant CJK_UidF100 => 0x9FEA; # Unicode 10.0 101use constant CJK_UidF110 => 0x9FEF; # Unicode 11.0 102use constant CJK_UidF130 => 0x9FFC; # Unicode 13.0 103 104use constant CJK_ExtAIni => 0x3400; # Unicode 3.0 105use constant CJK_ExtAFin => 0x4DB5; # Unicode 3.0 106use constant CJK_ExtA130 => 0x4DBF; # Unicode 13.0 107use constant CJK_ExtBIni => 0x20000; # Unicode 3.1 108use constant CJK_ExtBFin => 0x2A6D6; # Unicode 3.1 109use constant CJK_ExtB130 => 0x2A6DD; # Unicode 13.0 110use constant CJK_ExtCIni => 0x2A700; # Unicode 5.2 111use constant CJK_ExtCFin => 0x2B734; # Unicode 5.2 112use constant CJK_ExtDIni => 0x2B740; # Unicode 6.0 113use constant CJK_ExtDFin => 0x2B81D; # Unicode 6.0 114use constant CJK_ExtEIni => 0x2B820; # Unicode 8.0 115use constant CJK_ExtEFin => 0x2CEA1; # Unicode 8.0 116use constant CJK_ExtFIni => 0x2CEB0; # Unicode 10.0 117use constant CJK_ExtFFin => 0x2EBE0; # Unicode 10.0 118use constant CJK_ExtGIni => 0x30000; # Unicode 13.0 119use constant CJK_ExtGFin => 0x3134A; # Unicode 13.0 120 121my %CompatUI = map +($_ => 1), ( 122 0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F, 123 0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29, 124); 125 126use constant TangIdeoIni => 0x17000; # Unicode 9.0 127use constant TangIdeoFin => 0x187EC; # Unicode 9.0 128use constant TangIdeo110 => 0x187F1; # Unicode 11.0 129use constant TangIdeo120 => 0x187F7; # Unicode 12.0 130use constant TangCompIni => 0x18800; # Unicode 9.0 131use constant TangCompFin => 0x18AF2; # Unicode 9.0 132use constant TangComp130 => 0x18AFF; # Unicode 13.0 133use constant TangSuppIni => 0x18D00; # Unicode 13.0 134use constant TangSuppFin => 0x18D08; # Unicode 13.0 135use constant NushuIni => 0x1B170; # Unicode 10.0 136use constant NushuFin => 0x1B2FB; # Unicode 10.0 137use constant KhitanIni => 0x18B00; # Unicode 13.0 138use constant KhitanFin => 0x18CD5; # Unicode 13.0 139 140### end PP only ### 141 142# Logical_Order_Exception in PropList.txt 143my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; 144 145# for highestFFFF and minimalFFFE 146my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF); 147my $minimalVCE = pack(VCE_TEMPLATE, 0, 1, 0x20, 0x5, 0xFFFE); 148 149sub UCA_Version { '43' } 150 151sub Base_Unicode_Version { '13.0.0' } 152 153###### 154 155my $native_to_unicode = ($::IS_ASCII || $] < 5.008) 156 ? sub { return shift } 157 : sub { utf8::native_to_unicode(shift) }; 158 159my $unicode_to_native = ($::IS_ASCII || $] < 5.008) 160 ? sub { return shift } 161 : sub { utf8::unicode_to_native(shift) }; 162 163# pack_U() should get Unicode code points. 164sub pack_U { 165 return pack('U*', map $unicode_to_native->($_), @_); 166} 167 168# unpack_U() should return Unicode code points. 169sub unpack_U { 170 return map $native_to_unicode->($_), unpack('U*', shift(@_).pack('U*')); 171} 172# for older perl version, pack('U*') generates empty string with utf8 flag. 173 174###### 175 176my (%VariableOK); 177@VariableOK{ qw/ 178 blanked non-ignorable shifted shift-trimmed 179 / } = (); # keys lowercased 180 181our @ChangeOK = qw/ 182 alternate backwards level normalization rearrange 183 katakana_before_hiragana upper_before_lower ignore_level2 184 overrideCJK overrideHangul overrideOut preprocess UCA_Version 185 hangul_terminator variable identical highestFFFF minimalFFFE 186 long_contraction 187 /; 188 189our @ChangeNG = qw/ 190 entry mapping table maxlength contraction 191 ignoreChar ignoreName undefChar undefName rewrite 192 versionTable alternateTable backwardsTable forwardsTable 193 rearrangeTable variableTable 194 derivCode normCode rearrangeHash backwardsFlag 195 suppress suppressHash 196 /; 197# The hash key 'ignored' was deleted at v 0.21. 198# The hash key 'isShift' was deleted at v 0.23. 199# The hash key 'combining' was deleted at v 0.24. 200# The hash key 'entries' was deleted at v 0.30. 201# The hash key 'L3_ignorable' was deleted at v 0.40. 202 203sub version { 204 my $self = shift; 205 return $self->{versionTable} || 'unknown'; 206} 207 208my (%ChangeOK, %ChangeNG); 209@ChangeOK{ @ChangeOK } = (); 210@ChangeNG{ @ChangeNG } = (); 211 212sub change { 213 my $self = shift; 214 my %hash = @_; 215 my %old; 216 if (exists $hash{alternate}) { 217 if (exists $hash{variable}) { 218 delete $hash{alternate}; 219 } else { 220 $hash{variable} = $hash{alternate}; 221 } 222 } 223 foreach my $k (keys %hash) { 224 if (exists $ChangeOK{$k}) { 225 $old{$k} = $self->{$k}; 226 $self->{$k} = $hash{$k}; 227 } elsif (exists $ChangeNG{$k}) { 228 croak "change of $k via change() is not allowed!"; 229 } 230 # else => ignored 231 } 232 $self->checkCollator(); 233 return wantarray ? %old : $self; 234} 235 236sub _checkLevel { 237 my $level = shift; 238 my $key = shift; # 'level' or 'backwards' 239 MinLevel <= $level or croak sprintf 240 "Illegal level %d (in value for key '%s') lower than %d.", 241 $level, $key, MinLevel; 242 $level <= MaxLevel or croak sprintf 243 "Unsupported level %d (in value for key '%s') higher than %d.", 244 $level, $key, MaxLevel; 245} 246 247my %DerivCode = ( 248 8 => \&_derivCE_8, 249 9 => \&_derivCE_9, 250 11 => \&_derivCE_9, # 11 == 9 251 14 => \&_derivCE_14, 252 16 => \&_derivCE_14, # 16 == 14 253 18 => \&_derivCE_18, 254 20 => \&_derivCE_20, 255 22 => \&_derivCE_22, 256 24 => \&_derivCE_24, 257 26 => \&_derivCE_24, # 26 == 24 258 28 => \&_derivCE_24, # 28 == 24 259 30 => \&_derivCE_24, # 30 == 24 260 32 => \&_derivCE_32, 261 34 => \&_derivCE_34, 262 36 => \&_derivCE_36, 263 38 => \&_derivCE_38, 264 40 => \&_derivCE_40, 265 41 => \&_derivCE_40, # 41 == 40 266 43 => \&_derivCE_43, 267); 268 269sub checkCollator { 270 my $self = shift; 271 _checkLevel($self->{level}, 'level'); 272 273 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} } 274 or croak "Illegal UCA version (passed $self->{UCA_Version})."; 275 276 $self->{variable} ||= $self->{alternate} || $self->{variableTable} || 277 $self->{alternateTable} || 'shifted'; 278 $self->{variable} = $self->{alternate} = lc($self->{variable}); 279 exists $VariableOK{ $self->{variable} } 280 or croak "$PACKAGE unknown variable parameter name: $self->{variable}"; 281 282 if (! defined $self->{backwards}) { 283 $self->{backwardsFlag} = 0; 284 } elsif (! ref $self->{backwards}) { 285 _checkLevel($self->{backwards}, 'backwards'); 286 $self->{backwardsFlag} = 1 << $self->{backwards}; 287 } else { 288 my %level; 289 $self->{backwardsFlag} = 0; 290 for my $b (@{ $self->{backwards} }) { 291 _checkLevel($b, 'backwards'); 292 $level{$b} = 1; 293 } 294 for my $v (sort keys %level) { 295 $self->{backwardsFlag} += 1 << $v; 296 } 297 } 298 299 defined $self->{rearrange} or $self->{rearrange} = []; 300 ref $self->{rearrange} 301 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"; 302 303 # keys of $self->{rearrangeHash} are $self->{rearrange}. 304 $self->{rearrangeHash} = undef; 305 306 if (@{ $self->{rearrange} }) { 307 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = (); 308 } 309 310 $self->{normCode} = undef; 311 312 if (defined $self->{normalization}) { 313 eval { require Unicode::Normalize }; 314 $@ and croak "Unicode::Normalize is required to normalize strings"; 315 316 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass; 317 318 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default 319 $self->{normCode} = \&Unicode::Normalize::NFD; 320 } 321 elsif ($self->{normalization} ne 'prenormalized') { 322 my $norm = $self->{normalization}; 323 $self->{normCode} = sub { 324 Unicode::Normalize::normalize($norm, shift); 325 }; 326 eval { $self->{normCode}->("") }; # try 327 $@ and croak "$PACKAGE unknown normalization form name: $norm"; 328 } 329 } 330 return; 331} 332 333sub new 334{ 335 my $class = shift; 336 my $self = bless { @_ }, $class; 337 338 # keys of $self->{suppressHash} are $self->{suppress}. 339 if ($self->{suppress} && @{ $self->{suppress} }) { 340 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = (); 341 } # before read_table() 342 343 # If undef is passed explicitly, no file is read. 344 $self->{table} = $KeyFile if ! exists $self->{table}; 345 $self->read_table() if defined $self->{table}; 346 347 if ($self->{entry}) { 348 while ($self->{entry} =~ /([^\n]+)/g) { 349 $self->parseEntry($1, TRUE); 350 } 351 } 352 353 # only in new(), not in change() 354 $self->{level} ||= MaxLevel; 355 $self->{UCA_Version} ||= UCA_Version(); 356 357 $self->{overrideHangul} = FALSE 358 if ! exists $self->{overrideHangul}; 359 $self->{overrideCJK} = FALSE 360 if ! exists $self->{overrideCJK}; 361 $self->{normalization} = 'NFD' 362 if ! exists $self->{normalization}; 363 $self->{rearrange} = $self->{rearrangeTable} || 364 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []) 365 if ! exists $self->{rearrange}; 366 $self->{backwards} = $self->{backwardsTable} 367 if ! exists $self->{backwards}; 368 exists $self->{long_contraction} or $self->{long_contraction} 369 = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24; 370 371 # checkCollator() will be called in change() 372 $self->checkCollator(); 373 374 return $self; 375} 376 377sub parseAtmark { 378 my $self = shift; 379 my $line = shift; # after s/^\s*\@// 380 381 if ($line =~ /^version\s*(\S*)/) { 382 $self->{versionTable} ||= $1; 383 } 384 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9 385 $self->{variableTable} ||= $1; 386 } 387 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8 388 $self->{alternateTable} ||= $1; 389 } 390 elsif ($line =~ /^backwards\s+(\S*)/) { 391 push @{ $self->{backwardsTable} }, $1; 392 } 393 elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use 394 push @{ $self->{forwardsTable} }, $1; 395 } 396 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG 397 push @{ $self->{rearrangeTable} }, _getHexArray($1); 398 } 399} 400 401sub read_table { 402 my $self = shift; 403 404 my($f, $fh); 405 foreach my $d (@INC) { 406 $f = File::Spec->catfile($d, @Path, $self->{table}); 407 last if open($fh, $f); 408 $f = undef; 409 } 410 if (!defined $f) { 411 $f = File::Spec->catfile(@Path, $self->{table}); 412 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)"); 413 } 414 415 while (my $line = <$fh>) { 416 next if $line =~ /^\s*#/; 417 418 if ($line =~ s/^\s*\@//) { 419 $self->parseAtmark($line); 420 } else { 421 $self->parseEntry($line); 422 } 423 } 424 close $fh; 425} 426 427 428## 429## get $line, parse it, and write an entry in $self 430## 431sub parseEntry 432{ 433 my $self = shift; 434 my $line = shift; 435 my $tailoring = shift; 436 my($name, $entry, @uv, @key); 437 438 if (defined $self->{rewrite}) { 439 $line = $self->{rewrite}->($line); 440 } 441 442 return if $line !~ /^\s*[0-9A-Fa-f]/; 443 444 # removes comment and gets name 445 $name = $1 446 if $line =~ s/[#%]\s*(.*)//; 447 return if defined $self->{undefName} && $name =~ /$self->{undefName}/; 448 449 # gets element 450 my($e, $k) = split /;/, $line; 451 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>" 452 if ! $k; 453 454 @uv = _getHexArray($e); 455 return if !@uv; 456 return if @uv > 1 && $self->{suppressHash} && !$tailoring && 457 exists $self->{suppressHash}{$uv[0]}; 458 $entry = join(CODE_SEP, @uv); # in JCPS 459 460 if (defined $self->{undefChar} || defined $self->{ignoreChar}) { 461 my $ele = pack_U(@uv); 462 463 # regarded as if it were not stored in the table 464 return 465 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; 466 467 # replaced as completely ignorable 468 $k = '[.0000.0000.0000.0000]' 469 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/; 470 } 471 472 # replaced as completely ignorable 473 $k = '[.0000.0000.0000.0000]' 474 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/; 475 476 my $is_L3_ignorable = TRUE; 477 478 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed 479 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. 480 my @wt = _getHexArray($arr); 481 push @key, pack(VCE_TEMPLATE, $var, @wt); 482 $is_L3_ignorable = FALSE 483 if $wt[0] || $wt[1] || $wt[2]; 484 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable 485 # is completely ignorable. 486 # For expansion, an entry $is_L3_ignorable 487 # if and only if "all" CEs are [.0000.0000.0000]. 488 } 489 490 # mapping: be an array ref or not exists (any false value is disallowed) 491 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key; 492 493 # maxlength: be more than 1 or not exists (any false value is disallowed) 494 if (@uv > 1) { 495 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) { 496 $self->{maxlength}{$uv[0]} = @uv; 497 } 498 } 499 500 # contraction: be 1 or not exists (any false value is disallowed) 501 while (@uv > 2) { 502 pop @uv; 503 my $fake_entry = join(CODE_SEP, @uv); # in JCPS 504 $self->{contraction}{$fake_entry} = 1; 505 } 506} 507 508### begin PP only ### 509# 510# modify CE 511# 512sub varCE 513{ 514 my $self = shift; 515 my $vbl = $self->{variable}; 516 my $vce = shift; 517 my ($var, @wt) = unpack VCE_TEMPLATE, $vce; 518 519 # primary weight == 0 && secondary weight != 0 520 if ($self->{ignore_level2} && $wt[0] == 0 && $wt[1] != 0) { 521 $wt[1] = $wt[2] = 0; 522 } 523 524 if ($vbl eq 'non-ignorable') { 525 return pack(VCE_TEMPLATE, $var, @wt); 526 } elsif ($var) { 527 return pack(VCE_TEMPLATE, $var, 0, 0, 0, 528 $vbl eq 'blanked' ? $wt[3] : $wt[0]); 529 } elsif ($vbl eq 'blanked') { 530 return pack(VCE_TEMPLATE, $var, @wt); 531 } elsif ($self->{UCA_Version} >= 36 && 532 $vbl eq 'shifted' && !$wt[1] && !$wt[2]) { 533 # completely ignorable or the second derived CE 534 return pack(VCE_TEMPLATE, $var, $wt[0], 0, 0, 0); 535 } else { 536 return pack(VCE_TEMPLATE, $var, @wt[0..2], 537 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? 538 $wt[0] eq 1 ? $wt[0] : # XXX: CollationAuxiliary-6.2.0 539 Shift4Wt : 0); 540 } 541} 542 543sub visualizeSortKey 544{ 545 my $self = shift; 546 my $view = join ' ', map sprintf('%04X', $_), unpack(KEY_TEMPLATE, shift); 547 548 if ($self->{UCA_Version} <= 8) { 549 $view =~ s/ ?0000 ?/|/ for MinLevel..MaxLevel; 550 } else { 551 $view =~ s/\b0000\b/|/ for MinLevel..MaxLevel; 552 } 553 return "[$view]"; 554} 555### end PP only ### 556 557 558sub viewSortKey 559{ 560 my $self = shift; 561 my $str = shift; 562 $self->visualizeSortKey($self->getSortKey($str)); 563} 564 565 566sub process 567{ 568 my $self = shift; 569 my $str = shift; 570 my $prep = $self->{preprocess}; 571 my $norm = $self->{normCode}; 572 573 $str = &$prep($str) if ref $prep; 574 $str = &$norm($str) if ref $norm; 575 return $str; 576} 577 578## 579## arrayref of JCPS = splitEnt(string to be collated) 580## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE) 581## 582sub splitEnt 583{ 584 my $self = shift; 585 my $str = shift; 586 my $wLen = shift; # with Length 587 588 my $map = $self->{mapping}; 589 my $max = $self->{maxlength}; 590 my $reH = $self->{rearrangeHash}; 591 my $vers = $self->{UCA_Version}; 592 my $ver9 = $vers >= 9 && $vers <= 11; 593 my $long = $self->{long_contraction}; 594 595 my @buf; 596 597 # get array of Unicode code point of string. 598 my @src = unpack_U($str); 599 600 # rearrangement: 601 # Character positions are not kept if rearranged, 602 # then neglected if $wLen is true. 603 if ($reH && ! $wLen) { 604 for (my $i = 0; $i < @src; $i++) { 605 if (exists $reH->{ $src[$i] } && $i + 1 < @src) { 606 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]); 607 $i++; 608 } 609 } 610 } 611 612 # remove a code point marked as a completely ignorable. 613 for (my $i = 0; $i < @src; $i++) { 614 if ($vers <= 20 && _isIllegal($src[$i])) { 615 $src[$i] = undef; 616 } elsif ($ver9) { 617 $src[$i] = undef if exists $map->{ $src[$i] } 618 && @{ $map->{ $src[$i] } } == 0; 619 } 620 } 621 622 for (my $i = 0; $i < @src; $i++) { 623 my $jcps = $src[$i]; 624 625 # skip removed code point 626 if (! defined $jcps) { 627 if ($wLen && @buf) { 628 $buf[-1][2] = $i + 1; 629 } 630 next; 631 } 632 633 my $i_orig = $i; 634 635 # find contraction 636 if (exists $max->{$jcps}) { 637 my $temp_jcps = $jcps; 638 my $jcpsLen = 1; 639 my $maxLen = $max->{$jcps}; 640 641 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) { 642 next if ! defined $src[$p]; 643 $temp_jcps .= CODE_SEP . $src[$p]; 644 $jcpsLen++; 645 if (exists $map->{$temp_jcps}) { 646 $jcps = $temp_jcps; 647 $i = $p; 648 } 649 } 650 651 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1). 652 # This process requires Unicode::Normalize. 653 # If "normalization" is undef, here should be skipped *always* 654 # (in spite of bool value of $CVgetCombinClass), 655 # since canonical ordering cannot be expected. 656 # Blocked combining character should not be contracted. 657 658 # $self->{normCode} is false in the case of "prenormalized". 659 if ($self->{normalization}) { 660 my $cont = $self->{contraction}; 661 my $preCC = 0; 662 my $preCC_uc = 0; 663 my $jcps_uc = $jcps; 664 my(@out, @out_uc); 665 666 for (my $p = $i + 1; $p < @src; $p++) { 667 next if ! defined $src[$p]; 668 my $curCC = $CVgetCombinClass->($src[$p]); 669 last unless $curCC; 670 my $tail = CODE_SEP . $src[$p]; 671 672 if ($preCC != $curCC && exists $map->{$jcps.$tail}) { 673 $jcps .= $tail; 674 push @out, $p; 675 } else { 676 $preCC = $curCC; 677 } 678 679 next if !$long; 680 681 if ($preCC_uc != $curCC && 682 (exists $map->{$jcps_uc.$tail} || 683 exists $cont->{$jcps_uc.$tail})) { 684 $jcps_uc .= $tail; 685 push @out_uc, $p; 686 } else { 687 $preCC_uc = $curCC; 688 } 689 } 690 691 if (@out_uc && exists $map->{$jcps_uc}) { 692 $jcps = $jcps_uc; 693 $src[$_] = undef for @out_uc; 694 } else { 695 $src[$_] = undef for @out; 696 } 697 } 698 } 699 700 # skip completely ignorable 701 if (exists $map->{$jcps} && @{ $map->{$jcps} } == 0) { 702 if ($wLen && @buf) { 703 $buf[-1][2] = $i + 1; 704 } 705 next; 706 } 707 708 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps; 709 } 710 return \@buf; 711} 712 713## 714## VCE = _pack_override(input, codepoint, derivCode) 715## 716sub _pack_override ($$$) { 717 my $r = shift; 718 my $u = shift; 719 my $der = shift; 720 721 if (ref $r) { 722 return pack(VCE_TEMPLATE, NON_VAR, @$r); 723 } elsif (defined $r) { 724 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u); 725 } else { 726 $u = 0xFFFD if 0x10FFFF < $u; 727 return $der->($u); 728 } 729} 730 731## 732## list of VCE = getWt(JCPS) 733## 734sub getWt 735{ 736 my $self = shift; 737 my $u = shift; 738 my $map = $self->{mapping}; 739 my $der = $self->{derivCode}; 740 my $out = $self->{overrideOut}; 741 742 return if !defined $u; 743 return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF}; 744 return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE}; 745 $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out; 746 747 my @ce; 748 if (exists $map->{$u}) { 749 @ce = @{ $map->{$u} }; # $u may be a contraction 750 } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) { 751 my $hang = $self->{overrideHangul}; 752 if ($hang) { 753 @ce = map _pack_override($_, $u, $der), $hang->($u); 754 } elsif (!defined $hang) { 755 @ce = $der->($u); 756 } else { 757 my $max = $self->{maxlength}; 758 my @decH = _decompHangul($u); 759 760 if (@decH == 2) { 761 my $contract = join(CODE_SEP, @decH); 762 @decH = ($contract) if exists $map->{$contract}; 763 } else { # must be <@decH == 3> 764 if (exists $max->{$decH[0]}) { 765 my $contract = join(CODE_SEP, @decH); 766 if (exists $map->{$contract}) { 767 @decH = ($contract); 768 } else { 769 $contract = join(CODE_SEP, @decH[0,1]); 770 exists $map->{$contract} and @decH = ($contract, $decH[2]); 771 } 772 # even if V's ignorable, LT contraction is not supported. 773 # If such a situation were required, NFD should be used. 774 } 775 if (@decH == 3 && exists $max->{$decH[1]}) { 776 my $contract = join(CODE_SEP, @decH[1,2]); 777 exists $map->{$contract} and @decH = ($decH[0], $contract); 778 } 779 } 780 781 @ce = map({ 782 exists $map->{$_} ? @{ $map->{$_} } : $der->($_); 783 } @decH); 784 } 785 } elsif ($out && 0x10FFFF < $u) { 786 @ce = map _pack_override($_, $u, $der), $out->($u); 787 } else { 788 my $cjk = $self->{overrideCJK}; 789 my $vers = $self->{UCA_Version}; 790 if ($cjk && _isUIdeo($u, $vers)) { 791 @ce = map _pack_override($_, $u, $der), $cjk->($u); 792 } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) { 793 @ce = _uideoCE_8($u); 794 } else { 795 @ce = $der->($u); 796 } 797 } 798 return map $self->varCE($_), @ce; 799} 800 801 802## 803## string sortkey = getSortKey(string arg) 804## 805sub getSortKey 806{ 807 my $self = shift; 808 my $orig = shift; 809 my $str = $self->process($orig); 810 my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS 811 my $vers = $self->{UCA_Version}; 812 my $term = $self->{hangul_terminator}; 813 my $lev = $self->{level}; 814 my $iden = $self->{identical}; 815### begin PP only ## 816 my $v2i = $vers >= 9 && $self->{variable} ne 'non-ignorable'; 817### end PP only ## 818 819 my @buf; # weight arrays 820 if ($term) { 821 my $preHST = ''; 822 my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0)); 823 foreach my $jcps (@$rEnt) { 824 # weird things like VL, TL-contraction are not considered! 825 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps; 826 if ($preHST && !$curHST || # hangul before non-hangul 827 $preHST =~ /L\z/ && $curHST =~ /^T/ || 828 $preHST =~ /V\z/ && $curHST =~ /^L/ || 829 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) { 830 push @buf, $termCE; 831 } 832 $preHST = $curHST; 833 push @buf, $self->getWt($jcps); 834 } 835 push @buf, $termCE if $preHST; # end at hangul 836 } else { 837 foreach my $jcps (@$rEnt) { 838 push @buf, $self->getWt($jcps); 839 } 840 } 841 842### begin PP only ## 843 # make sort key 844 my @ret = ([],[],[],[]); 845 my $last_is_variable; 846 847 foreach my $vwt (@buf) { 848 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 849 850 # "Ignorable (L1, L2) after Variable" since track. v. 9 851 if ($v2i) { 852 if ($var) { 853 $last_is_variable = TRUE; 854 } elsif (!$wt[0]) { # ignorable 855 next if $last_is_variable; 856 } else { 857 $last_is_variable = FALSE; 858 } 859 } 860 foreach my $v (0..$lev-1) { 861 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v]; 862 } 863 } 864 865 # modification of tertiary weights 866 if ($self->{upper_before_lower}) { 867 foreach my $w (@{ $ret[2] }) { 868 if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower 869 elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper 870 elsif ($w == 0x1C) { $w += 1 } # square upper 871 elsif ($w == 0x1D) { $w -= 1 } # square lower 872 } 873 } 874 if ($self->{katakana_before_hiragana}) { 875 foreach my $w (@{ $ret[2] }) { 876 if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana 877 elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana 878 } 879 } 880 881 if ($self->{backwardsFlag}) { 882 for (my $v = MinLevel; $v <= MaxLevel; $v++) { 883 if ($self->{backwardsFlag} & (1 << $v)) { 884 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] }; 885 } 886 } 887 } 888 889 my $rkey = join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret; 890### end PP only ## 891 892 if ($iden || $vers >= 26 && $lev == MaxLevel) { 893 $rkey .= LEVEL_SEP; 894 $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden; 895 } 896 return $rkey; 897} 898 899 900## 901## int compare = cmp(string a, string b) 902## 903sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) } 904sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) } 905sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) } 906sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) } 907sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) } 908sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) } 909sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) } 910 911## 912## list[strings] sorted = sort(list[strings] arg) 913## 914sub sort { 915 my $obj = shift; 916 return 917 map { $_->[1] } 918 sort{ $a->[0] cmp $b->[0] } 919 map [ $obj->getSortKey($_), $_ ], @_; 920} 921 922### begin PP only ## 923sub _derivCE_43 { 924 my $u = shift; 925 my $tang = (TangIdeoIni <= $u && $u <= TangIdeo120 || 926 TangCompIni <= $u && $u <= TangComp130 || 927 TangSuppIni <= $u && $u <= TangSuppFin); 928 my $nush = (NushuIni <= $u && $u <= NushuFin); 929 my $ktan = (KhitanIni <= $u && $u <= KhitanFin); 930 931 my $base = ($tang) ? 0xFB00 : # Tangut 932 ($nush) ? 0xFB01 : # Nushu 933 ($ktan) ? 0xFB02 : # Khitan 934 (CJK_UidIni <= $u && $u <= CJK_UidF130 || $CompatUI{$u}) 935 ? 0xFB40 : # CJK 936 (CJK_ExtAIni <= $u && $u <= CJK_ExtA130 || 937 CJK_ExtBIni <= $u && $u <= CJK_ExtB130 || 938 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin || 939 CJK_ExtDIni <= $u && $u <= CJK_ExtDFin || 940 CJK_ExtEIni <= $u && $u <= CJK_ExtEFin || 941 CJK_ExtFIni <= $u && $u <= CJK_ExtFFin || 942 CJK_ExtGIni <= $u && $u <= CJK_ExtGFin) 943 ? 0xFB80 # CJK ext. 944 : 0xFBC0; # others 945 my $aaaa = $tang || $nush || $ktan ? $base : $base + ($u >> 15); 946 my $bbbb = ($tang ? ($u - TangIdeoIni) : 947 $nush ? ($u - NushuIni) : 948 $ktan ? ($u - KhitanIni) : ($u & 0x7FFF)) | 0x8000; 949 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 950 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 951} 952 953sub _derivCE_40 { 954 my $u = shift; 955 my $tang = (TangIdeoIni <= $u && $u <= TangIdeo120 || 956 TangCompIni <= $u && $u <= TangCompFin); 957 my $nush = (NushuIni <= $u && $u <= NushuFin); 958 my $base = ($tang) 959 ? 0xFB00 : # Tangut 960 ($nush) 961 ? 0xFB01 : # Nushu 962 (CJK_UidIni <= $u && $u <= CJK_UidF110 || $CompatUI{$u}) 963 ? 0xFB40 : # CJK 964 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 965 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || 966 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin || 967 CJK_ExtDIni <= $u && $u <= CJK_ExtDFin || 968 CJK_ExtEIni <= $u && $u <= CJK_ExtEFin || 969 CJK_ExtFIni <= $u && $u <= CJK_ExtFFin) 970 ? 0xFB80 # CJK ext. 971 : 0xFBC0; # others 972 my $aaaa = $tang || $nush ? $base : $base + ($u >> 15); 973 my $bbbb = ($tang ? ($u - TangIdeoIni) : 974 $nush ? ($u - NushuIni) : ($u & 0x7FFF)) | 0x8000; 975 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 976 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 977} 978 979sub _derivCE_38 { 980 my $u = shift; 981 my $tang = (TangIdeoIni <= $u && $u <= TangIdeo110 || 982 TangCompIni <= $u && $u <= TangCompFin); 983 my $nush = (NushuIni <= $u && $u <= NushuFin); 984 my $base = ($tang) 985 ? 0xFB00 : # Tangut 986 ($nush) 987 ? 0xFB01 : # Nushu 988 (CJK_UidIni <= $u && $u <= CJK_UidF110 || $CompatUI{$u}) 989 ? 0xFB40 : # CJK 990 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 991 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || 992 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin || 993 CJK_ExtDIni <= $u && $u <= CJK_ExtDFin || 994 CJK_ExtEIni <= $u && $u <= CJK_ExtEFin || 995 CJK_ExtFIni <= $u && $u <= CJK_ExtFFin) 996 ? 0xFB80 # CJK ext. 997 : 0xFBC0; # others 998 my $aaaa = $tang || $nush ? $base : $base + ($u >> 15); 999 my $bbbb = ($tang ? ($u - TangIdeoIni) : 1000 $nush ? ($u - NushuIni) : ($u & 0x7FFF)) | 0x8000; 1001 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 1002 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 1003} 1004 1005sub _derivCE_36 { 1006 my $u = shift; 1007 my $tang = (TangIdeoIni <= $u && $u <= TangIdeoFin || 1008 TangCompIni <= $u && $u <= TangCompFin); 1009 my $nush = (NushuIni <= $u && $u <= NushuFin); 1010 my $base = ($tang) 1011 ? 0xFB00 : # Tangut 1012 ($nush) 1013 ? 0xFB01 : # Nushu 1014 (CJK_UidIni <= $u && $u <= CJK_UidF100 || $CompatUI{$u}) 1015 ? 0xFB40 : # CJK 1016 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 1017 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || 1018 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin || 1019 CJK_ExtDIni <= $u && $u <= CJK_ExtDFin || 1020 CJK_ExtEIni <= $u && $u <= CJK_ExtEFin || 1021 CJK_ExtFIni <= $u && $u <= CJK_ExtFFin) 1022 ? 0xFB80 # CJK ext. 1023 : 0xFBC0; # others 1024 my $aaaa = $tang || $nush ? $base : $base + ($u >> 15); 1025 my $bbbb = ($tang ? ($u - TangIdeoIni) : 1026 $nush ? ($u - NushuIni) : ($u & 0x7FFF)) | 0x8000; 1027 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 1028 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 1029} 1030 1031sub _derivCE_34 { 1032 my $u = shift; 1033 my $tang = (TangIdeoIni <= $u && $u <= TangIdeoFin || 1034 TangCompIni <= $u && $u <= TangCompFin); 1035 my $base = ($tang) 1036 ? 0xFB00 : # Tangut 1037 (CJK_UidIni <= $u && $u <= CJK_UidF80 || $CompatUI{$u}) 1038 ? 0xFB40 : # CJK 1039 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 1040 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || 1041 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin || 1042 CJK_ExtDIni <= $u && $u <= CJK_ExtDFin || 1043 CJK_ExtEIni <= $u && $u <= CJK_ExtEFin) 1044 ? 0xFB80 # CJK ext. 1045 : 0xFBC0; # others 1046 my $aaaa = $tang ? $base : $base + ($u >> 15); 1047 my $bbbb = ($tang ? ($u - TangIdeoIni) : ($u & 0x7FFF)) | 0x8000; 1048 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 1049 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 1050} 1051 1052sub _derivCE_32 { 1053 my $u = shift; 1054 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF80 || $CompatUI{$u}) 1055 ? 0xFB40 : # CJK 1056 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 1057 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || 1058 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin || 1059 CJK_ExtDIni <= $u && $u <= CJK_ExtDFin || 1060 CJK_ExtEIni <= $u && $u <= CJK_ExtEFin) 1061 ? 0xFB80 # CJK ext. 1062 : 0xFBC0; # others 1063 my $aaaa = $base + ($u >> 15); 1064 my $bbbb = ($u & 0x7FFF) | 0x8000; 1065 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 1066 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 1067} 1068 1069sub _derivCE_24 { 1070 my $u = shift; 1071 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF61 || $CompatUI{$u}) 1072 ? 0xFB40 : # CJK 1073 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 1074 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || 1075 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin || 1076 CJK_ExtDIni <= $u && $u <= CJK_ExtDFin) 1077 ? 0xFB80 # CJK ext. 1078 : 0xFBC0; # others 1079 my $aaaa = $base + ($u >> 15); 1080 my $bbbb = ($u & 0x7FFF) | 0x8000; 1081 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 1082 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 1083} 1084 1085sub _derivCE_22 { 1086 my $u = shift; 1087 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u}) 1088 ? 0xFB40 : # CJK 1089 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 1090 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || 1091 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin || 1092 CJK_ExtDIni <= $u && $u <= CJK_ExtDFin) 1093 ? 0xFB80 # CJK ext. 1094 : 0xFBC0; # others 1095 my $aaaa = $base + ($u >> 15); 1096 my $bbbb = ($u & 0x7FFF) | 0x8000; 1097 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 1098 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 1099} 1100 1101sub _derivCE_20 { 1102 my $u = shift; 1103 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u}) 1104 ? 0xFB40 : # CJK 1105 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 1106 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || 1107 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) 1108 ? 0xFB80 # CJK ext. 1109 : 0xFBC0; # others 1110 my $aaaa = $base + ($u >> 15); 1111 my $bbbb = ($u & 0x7FFF) | 0x8000; 1112 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 1113 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 1114} 1115 1116sub _derivCE_18 { 1117 my $u = shift; 1118 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF51 || $CompatUI{$u}) 1119 ? 0xFB40 : # CJK 1120 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 1121 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) 1122 ? 0xFB80 # CJK ext. 1123 : 0xFBC0; # others 1124 my $aaaa = $base + ($u >> 15); 1125 my $bbbb = ($u & 0x7FFF) | 0x8000; 1126 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 1127 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 1128} 1129 1130sub _derivCE_14 { 1131 my $u = shift; 1132 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF41 || $CompatUI{$u}) 1133 ? 0xFB40 : # CJK 1134 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 1135 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) 1136 ? 0xFB80 # CJK ext. 1137 : 0xFBC0; # others 1138 my $aaaa = $base + ($u >> 15); 1139 my $bbbb = ($u & 0x7FFF) | 0x8000; 1140 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 1141 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 1142} 1143 1144sub _derivCE_9 { 1145 my $u = shift; 1146 my $base = (CJK_UidIni <= $u && $u <= CJK_UidFin || $CompatUI{$u}) 1147 ? 0xFB40 : # CJK 1148 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 1149 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) 1150 ? 0xFB80 # CJK ext. 1151 : 0xFBC0; # others 1152 my $aaaa = $base + ($u >> 15); 1153 my $bbbb = ($u & 0x7FFF) | 0x8000; 1154 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 1155 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 1156} 1157 1158sub _derivCE_8 { 1159 my $code = shift; 1160 my $aaaa = 0xFF80 + ($code >> 15); 1161 my $bbbb = ($code & 0x7FFF) | 0x8000; 1162 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code), 1163 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code); 1164} 1165 1166sub _uideoCE_8 { 1167 my $u = shift; 1168 return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u); 1169} 1170 1171sub _isUIdeo { 1172 # $uca_vers = 0 for _uideoCE_8() 1173 my ($u, $uca_vers) = @_; 1174 return((CJK_UidIni <= $u && ( 1175 $uca_vers >= 43 ? ($u <= CJK_UidF130) : 1176 $uca_vers >= 38 ? ($u <= CJK_UidF110) : 1177 $uca_vers >= 36 ? ($u <= CJK_UidF100) : 1178 $uca_vers >= 32 ? ($u <= CJK_UidF80) : 1179 $uca_vers >= 24 ? ($u <= CJK_UidF61) : 1180 $uca_vers >= 20 ? ($u <= CJK_UidF52) : 1181 $uca_vers >= 18 ? ($u <= CJK_UidF51) : 1182 $uca_vers >= 14 ? ($u <= CJK_UidF41) : 1183 ($u <= CJK_UidFin))) || $CompatUI{$u} 1184 || 1185 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin) 1186 || 1187 ($uca_vers >= 43 && CJK_ExtAIni <= $u && $u <= CJK_ExtA130) 1188 || 1189 ($uca_vers >= 8 && CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) 1190 || 1191 ($uca_vers >= 43 && CJK_ExtBIni <= $u && $u <= CJK_ExtB130) 1192 || 1193 ($uca_vers >= 20 && CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) 1194 || 1195 ($uca_vers >= 22 && CJK_ExtDIni <= $u && $u <= CJK_ExtDFin) 1196 || 1197 ($uca_vers >= 32 && CJK_ExtEIni <= $u && $u <= CJK_ExtEFin) 1198 || 1199 ($uca_vers >= 36 && CJK_ExtFIni <= $u && $u <= CJK_ExtFFin) 1200 || 1201 ($uca_vers >= 43 && CJK_ExtGIni <= $u && $u <= CJK_ExtGFin) 1202 ); 1203} 1204 1205 1206## 1207## "hhhh hhhh hhhh" to (dddd, dddd, dddd) 1208## 1209sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } 1210 1211# 1212# $code *must* be in Hangul syllable. 1213# Check it before you enter here. 1214# 1215sub _decompHangul { 1216 my $code = shift; 1217 my $si = $code - Hangul_SBase; 1218 my $li = int( $si / Hangul_NCount); 1219 my $vi = int(($si % Hangul_NCount) / Hangul_TCount); 1220 my $ti = $si % Hangul_TCount; 1221 return ( 1222 Hangul_LBase + $li, 1223 Hangul_VBase + $vi, 1224 $ti ? (Hangul_TBase + $ti) : (), 1225 ); 1226} 1227 1228sub _isIllegal { 1229 my $code = shift; 1230 return((! defined $code) # removed 1231 || ($code < 0 || 0x10FFFF < $code) # out of range 1232 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] 1233 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates 1234 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters 1235 ); 1236} 1237 1238# Hangul Syllable Type 1239sub getHST { 1240 my $u = shift; 1241 my $vers = shift || 0; 1242 1243 if (Hangul_SIni <= $u && $u <= Hangul_SFin) { 1244 return +($u - Hangul_SBase) % Hangul_TCount ? 'LVT' : 'LV'; 1245 } 1246 1247 if ($vers < 20) { 1248 return Hangul_LIni <= $u && $u <= Hangul_LFin || 1249 $u == Hangul_LFill ? 'L' : 1250 Hangul_VIni <= $u && $u <= Hangul_VFin ? 'V' : 1251 Hangul_TIni <= $u && $u <= Hangul_TFin ? 'T' : ''; 1252 } else { 1253 return Hangul_LIni <= $u && $u <= Hangul_LEnd || 1254 HangulL2Ini <= $u && $u <= HangulL2Fin ? 'L' : 1255 Hangul_VIni <= $u && $u <= Hangul_VEnd || 1256 HangulV2Ini <= $u && $u <= HangulV2Fin ? 'V' : 1257 Hangul_TIni <= $u && $u <= Hangul_TEnd || 1258 HangulT2Ini <= $u && $u <= HangulT2Fin ? 'T' : ''; 1259 } 1260} 1261### end PP only ## 1262 1263 1264## 1265## bool _nonIgnorAtLevel(arrayref weights, int level) 1266## 1267sub _nonIgnorAtLevel($$) 1268{ 1269 my $wt = shift; 1270 return if ! defined $wt; 1271 my $lv = shift; 1272 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE; 1273} 1274 1275## 1276## bool _eqArray( 1277## arrayref of arrayref[weights] source, 1278## arrayref of arrayref[weights] substr, 1279## int level) 1280## * comparison of graphemes vs graphemes. 1281## @$source >= @$substr must be true (check it before call this); 1282## 1283sub _eqArray($$$) 1284{ 1285 my $source = shift; 1286 my $substr = shift; 1287 my $lev = shift; 1288 1289 for my $g (0..@$substr-1){ 1290 # Do the $g'th graphemes have the same number of AV weights? 1291 return if @{ $source->[$g] } != @{ $substr->[$g] }; 1292 1293 for my $w (0..@{ $substr->[$g] }-1) { 1294 for my $v (0..$lev-1) { 1295 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v]; 1296 } 1297 } 1298 } 1299 return 1; 1300} 1301 1302## 1303## (int position, int length) 1304## int position = index(string, substring, position, [undoc'ed global]) 1305## 1306## With "global" (only for the list context), 1307## returns list of arrayref[position, length]. 1308## 1309sub index 1310{ 1311 my $self = shift; 1312 $self->{preprocess} and 1313 croak "Don't use Preprocess with index(), match(), etc."; 1314 $self->{normCode} and 1315 croak "Don't use Normalization with index(), match(), etc."; 1316 1317 my $str = shift; 1318 my $len = length($str); 1319 my $sub = shift; 1320 my $subE = $self->splitEnt($sub); 1321 my $pos = @_ ? shift : 0; 1322 $pos = 0 if $pos < 0; 1323 my $glob = shift; 1324 1325 my $lev = $self->{level}; 1326 my $v2i = $self->{UCA_Version} >= 9 && 1327 $self->{variable} ne 'non-ignorable'; 1328 1329 if (! @$subE) { 1330 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos; 1331 return $glob 1332 ? map([$_, 0], $temp..$len) 1333 : wantarray ? ($temp,0) : $temp; 1334 } 1335 $len < $pos 1336 and return wantarray ? () : NOMATCHPOS; 1337 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE); 1338 @$strE 1339 or return wantarray ? () : NOMATCHPOS; 1340 1341 my(@strWt, @iniPos, @finPos, @subWt, @g_ret); 1342 1343 my $last_is_variable; 1344 for my $vwt (map $self->getWt($_), @$subE) { 1345 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 1346 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 1347 1348 # "Ignorable (L1, L2) after Variable" since track. v. 9 1349 if ($v2i) { 1350 if ($var) { 1351 $last_is_variable = TRUE; 1352 } 1353 elsif (!$wt[0]) { # ignorable 1354 $to_be_pushed = FALSE if $last_is_variable; 1355 } 1356 else { 1357 $last_is_variable = FALSE; 1358 } 1359 } 1360 1361 if (@subWt && !$var && !$wt[0]) { 1362 push @{ $subWt[-1] }, \@wt if $to_be_pushed; 1363 } elsif ($to_be_pushed) { 1364 push @subWt, [ \@wt ]; 1365 } 1366 # else ===> skipped 1367 } 1368 1369 my $count = 0; 1370 my $end = @$strE - 1; 1371 1372 $last_is_variable = FALSE; # reuse 1373 for (my $i = 0; $i <= $end; ) { # no $i++ 1374 my $found_base = 0; 1375 1376 # fetch a grapheme 1377 while ($i <= $end && $found_base == 0) { 1378 for my $vwt ($self->getWt($strE->[$i][0])) { 1379 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 1380 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 1381 1382 # "Ignorable (L1, L2) after Variable" since track. v. 9 1383 if ($v2i) { 1384 if ($var) { 1385 $last_is_variable = TRUE; 1386 } 1387 elsif (!$wt[0]) { # ignorable 1388 $to_be_pushed = FALSE if $last_is_variable; 1389 } 1390 else { 1391 $last_is_variable = FALSE; 1392 } 1393 } 1394 1395 if (@strWt && !$var && !$wt[0]) { 1396 push @{ $strWt[-1] }, \@wt if $to_be_pushed; 1397 $finPos[-1] = $strE->[$i][2]; 1398 } elsif ($to_be_pushed) { 1399 push @strWt, [ \@wt ]; 1400 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1]; 1401 $finPos[-1] = NOMATCHPOS if $found_base; 1402 push @finPos, $strE->[$i][2]; 1403 $found_base++; 1404 } 1405 # else ===> no-op 1406 } 1407 $i++; 1408 } 1409 1410 # try to match 1411 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) { 1412 if ($iniPos[0] != NOMATCHPOS && 1413 $finPos[$#subWt] != NOMATCHPOS && 1414 _eqArray(\@strWt, \@subWt, $lev)) { 1415 my $temp = $iniPos[0] + $pos; 1416 1417 if ($glob) { 1418 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]]; 1419 splice @strWt, 0, $#subWt; 1420 splice @iniPos, 0, $#subWt; 1421 splice @finPos, 0, $#subWt; 1422 } 1423 else { 1424 return wantarray 1425 ? ($temp, $finPos[$#subWt] - $iniPos[0]) 1426 : $temp; 1427 } 1428 } 1429 shift @strWt; 1430 shift @iniPos; 1431 shift @finPos; 1432 } 1433 } 1434 1435 return $glob 1436 ? @g_ret 1437 : wantarray ? () : NOMATCHPOS; 1438} 1439 1440## 1441## scalarref to matching part = match(string, substring) 1442## 1443sub match 1444{ 1445 my $self = shift; 1446 if (my($pos,$len) = $self->index($_[0], $_[1])) { 1447 my $temp = substr($_[0], $pos, $len); 1448 return wantarray ? $temp : \$temp; 1449 # An lvalue ref \substr should be avoided, 1450 # since its value is affected by modification of its referent. 1451 } 1452 else { 1453 return; 1454 } 1455} 1456 1457## 1458## arrayref matching parts = gmatch(string, substring) 1459## 1460sub gmatch 1461{ 1462 my $self = shift; 1463 my $str = shift; 1464 my $sub = shift; 1465 return map substr($str, $_->[0], $_->[1]), 1466 $self->index($str, $sub, 0, 'g'); 1467} 1468 1469## 1470## bool subst'ed = subst(string, substring, replace) 1471## 1472sub subst 1473{ 1474 my $self = shift; 1475 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1476 1477 if (my($pos,$len) = $self->index($_[0], $_[1])) { 1478 if ($code) { 1479 my $mat = substr($_[0], $pos, $len); 1480 substr($_[0], $pos, $len, $code->($mat)); 1481 } else { 1482 substr($_[0], $pos, $len, $_[2]); 1483 } 1484 return TRUE; 1485 } 1486 else { 1487 return FALSE; 1488 } 1489} 1490 1491## 1492## int count = gsubst(string, substring, replace) 1493## 1494sub gsubst 1495{ 1496 my $self = shift; 1497 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1498 my $cnt = 0; 1499 1500 # Replacement is carried out from the end, then use reverse. 1501 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) { 1502 if ($code) { 1503 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]); 1504 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat)); 1505 } else { 1506 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]); 1507 } 1508 $cnt++; 1509 } 1510 return $cnt; 1511} 1512 15131; 1514__END__ 1515 1516=head1 NAME 1517 1518Unicode::Collate - Unicode Collation Algorithm 1519 1520=head1 SYNOPSIS 1521 1522 use Unicode::Collate; 1523 1524 #construct 1525 $Collator = Unicode::Collate->new(%tailoring); 1526 1527 #sort 1528 @sorted = $Collator->sort(@not_sorted); 1529 1530 #compare 1531 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. 1532 1533B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted 1534according to Perl's Unicode support. See L<perlunicode>, 1535L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 1536Otherwise you can use C<preprocess> or should decode them before. 1537 1538=head1 DESCRIPTION 1539 1540This module is an implementation of Unicode Technical Standard #10 1541(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA). 1542 1543=head2 Constructor and Tailoring 1544 1545The C<new> method returns a collator object. If new() is called 1546with no parameters, the collator should do the default collation. 1547 1548 $Collator = Unicode::Collate->new( 1549 UCA_Version => $UCA_Version, 1550 alternate => $alternate, # alias for 'variable' 1551 backwards => $levelNumber, # or \@levelNumbers 1552 entry => $element, 1553 hangul_terminator => $term_primary_weight, 1554 highestFFFF => $bool, 1555 identical => $bool, 1556 ignoreName => qr/$ignoreName/, 1557 ignoreChar => qr/$ignoreChar/, 1558 ignore_level2 => $bool, 1559 katakana_before_hiragana => $bool, 1560 level => $collationLevel, 1561 long_contraction => $bool, 1562 minimalFFFE => $bool, 1563 normalization => $normalization_form, 1564 overrideCJK => \&overrideCJK, 1565 overrideHangul => \&overrideHangul, 1566 preprocess => \&preprocess, 1567 rearrange => \@charList, 1568 rewrite => \&rewrite, 1569 suppress => \@charList, 1570 table => $filename, 1571 undefName => qr/$undefName/, 1572 undefChar => qr/$undefChar/, 1573 upper_before_lower => $bool, 1574 variable => $variable, 1575 ); 1576 1577=over 4 1578 1579=item UCA_Version 1580 1581If the revision (previously "tracking version") number of UCA is given, 1582behavior of that revision is emulated on collating. 1583If omitted, the return value of C<UCA_Version()> is used. 1584 1585The following revisions are supported. The default is 43. 1586 1587 UCA Unicode Standard DUCET (@version) 1588 ------------------------------------------------------- 1589 8 3.1 3.0.1 (3.0.1d9) 1590 9 3.1 with Corrigendum 3 3.1.1 1591 11 4.0.0 1592 14 4.1.0 1593 16 5.0.0 1594 18 5.1.0 1595 20 5.2.0 1596 22 6.0.0 1597 24 6.1.0 1598 26 6.2.0 1599 28 6.3.0 1600 30 7.0.0 1601 32 8.0.0 1602 34 9.0.0 1603 36 10.0.0 1604 38 11.0.0 1605 40 12.0.0 1606 41 12.1.0 1607 43 13.0.0 1608 1609* See below for C<long_contraction> with C<UCA_Version> 22 and 24. 1610 1611* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden 1612since C<UCA_Version> 22. 1613 1614* Out-of-range codepoints (greater than U+10FFFF) are not ignored, 1615and can be overridden since C<UCA_Version> 22. 1616 1617* Fully ignorable characters were ignored, and would not interrupt 1618contractions with C<UCA_Version> 9 and 11. 1619 1620* Treatment of ignorables after variables and some behaviors 1621were changed at C<UCA_Version> 9. 1622 1623* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>) 1624depend on C<UCA_Version>. 1625 1626* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect 1627C<hangul_terminator>. 1628 1629=item alternate 1630 1631-- see 3.2.2 Alternate Weighting, version 8 of UTS #10 1632 1633For backward compatibility, C<alternate> (old name) can be used 1634as an alias for C<variable>. 1635 1636=item backwards 1637 1638-- see 3.4 Backward Accents, UTS #10. 1639 1640 backwards => $levelNumber or \@levelNumbers 1641 1642Weights in reverse order; ex. level 2 (diacritic ordering) in French. 1643If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>), 1644forwards at all the levels. 1645 1646=item entry 1647 1648-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10. 1649 1650If the same character (or a sequence of characters) exists 1651in the collation element table through C<table>, 1652mapping to collation elements is overridden. 1653If it does not exist, the mapping is defined additionally. 1654 1655 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 16560063 0068 ; [.0E6A.0020.0002.0063] # ch 16570043 0068 ; [.0E6A.0020.0007.0043] # Ch 16580043 0048 ; [.0E6A.0020.0008.0043] # CH 1659006C 006C ; [.0F4C.0020.0002.006C] # ll 1660004C 006C ; [.0F4C.0020.0007.004C] # Ll 1661004C 004C ; [.0F4C.0020.0008.004C] # LL 166200F1 ; [.0F7B.0020.0002.00F1] # n-tilde 1663006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde 166400D1 ; [.0F7B.0020.0008.00D1] # N-tilde 1665004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde 1666ENTRY 1667 1668 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 166900E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e> 167000C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E> 1671ENTRY 1672 1673B<NOTE:> The code point in the UCA file format (before C<';'>) 1674B<must> be a Unicode code point (defined as hexadecimal), 1675but not a native code point. 1676So C<0063> must always denote C<U+0063>, 1677but not a character of C<"\x63">. 1678 1679Weighting may vary depending on collation element table. 1680So ensure the weights defined in C<entry> will be consistent with 1681those in the collation element table loaded via C<table>. 1682 1683In DUCET v4.0.0, primary weight of C<C> is C<0E60> 1684and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A> 1685(as a value between C<0E60> and C<0E6D>) 1686makes ordering as C<C E<lt> CH E<lt> D>. 1687Exactly speaking DUCET already has some characters between C<C> and C<D>: 1688C<small capital C> (C<U+1D04>) with primary weight C<0E64>, 1689C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>, 1690and C<c-curl> (C<U+0255>) with C<0E69>. 1691Then primary weight C<0E6A> for C<CH> makes C<CH> 1692ordered between C<c-curl> and C<D>. 1693 1694=item hangul_terminator 1695 1696-- see 7.1.4 Trailing Weights, UTS #10. 1697 1698If a true value is given (non-zero but should be positive), 1699it will be added as a terminator primary weight to the end of 1700every standard Hangul syllable. Secondary and any higher weights 1701for terminator are set to zero. 1702If the value is false or C<hangul_terminator> key does not exist, 1703insertion of terminator weights will not be performed. 1704 1705Boundaries of Hangul syllables are determined 1706according to conjoining Jamo behavior in F<the Unicode Standard> 1707and F<HangulSyllableType.txt>. 1708 1709B<Implementation Note:> 1710(1) For expansion mapping (Unicode character mapped 1711to a sequence of collation elements), a terminator will not be added 1712between collation elements, even if Hangul syllable boundary exists there. 1713Addition of terminator is restricted to the next position 1714to the last collation element. 1715 1716(2) Non-conjoining Hangul letters 1717(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not 1718automatically terminated with a terminator primary weight. 1719These characters may need terminator included in a collation element 1720table beforehand. 1721 1722=item highestFFFF 1723 1724-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation. 1725 1726If the parameter is made true, C<U+FFFF> has a highest primary weight. 1727When a boolean of C<$coll-E<gt>ge($str, "abc")> and 1728C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str> 1729begins with C<"abc">, or another primary equivalent. 1730C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF> 1731such as C<"abc\x{FFFF}xyz">. 1732 1733C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")> 1734almost, but the latter has a problem that you should know which letter is 1735next to C<c>. For a certain language where C<ch> as the next letter, 1736C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">. 1737 1738Note: 1739This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>. 1740Any other character than C<U+FFFF> can be tailored by C<entry>. 1741 1742=item identical 1743 1744-- see A.3 Deterministic Comparison, UTS #10. 1745 1746By default, strings whose weights are equal should be equal, 1747even though their code points are not equal. 1748Completely ignorable characters are ignored. 1749 1750If the parameter is made true, a final, tie-breaking level is used. 1751If no difference of weights is found after the comparison through 1752all the level specified by C<level>, the comparison with code points 1753will be performed. 1754For the tie-breaking comparison, the sort key has code points 1755of the original string appended. 1756Completely ignorable characters are not ignored. 1757 1758If C<preprocess> and/or C<normalization> is applied, the code points 1759of the string after them (in NFD by default) are used. 1760 1761=item ignoreChar 1762 1763=item ignoreName 1764 1765-- see 3.6 Variable Weighting, UTS #10. 1766 1767Makes the entry in the table completely ignorable; 1768i.e. as if the weights were zero at all level. 1769 1770Through C<ignoreChar>, any character matching C<qr/$ignoreChar/> 1771will be ignored. Through C<ignoreName>, any character whose name 1772(given in the C<table> file as a comment) matches C<qr/$ignoreName/> 1773will be ignored. 1774 1775E.g. when 'a' and 'e' are ignorable, 1776'element' is equal to 'lament' (or 'lmnt'). 1777 1778=item ignore_level2 1779 1780-- see 5.1 Parametric Tailoring, UTS #10. 1781 1782By default, case-sensitive comparison (that is level 3 difference) 1783won't ignore accents (that is level 2 difference). 1784 1785If the parameter is made true, accents (and other primary ignorable 1786characters) are ignored, even though cases are taken into account. 1787 1788B<NOTE>: C<level> should be 3 or greater. 1789 1790=item katakana_before_hiragana 1791 1792-- see 7.2 Tertiary Weight Table, UTS #10. 1793 1794By default, hiragana is before katakana. 1795If the parameter is made true, this is reversed. 1796 1797B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana 1798distinctions must occur in level 3, and their weights at level 3 must be 1799same as those mentioned in 7.3.1, UTS #10. 1800If you define your collation elements which violate this requirement, 1801this parameter does not work validly. 1802 1803=item level 1804 1805-- see 4.3 Form Sort Key, UTS #10. 1806 1807Set the maximum level. 1808Any higher levels than the specified one are ignored. 1809 1810 Level 1: alphabetic ordering 1811 Level 2: diacritic ordering 1812 Level 3: case ordering 1813 Level 4: tie-breaking (e.g. in the case when variable is 'shifted') 1814 1815 ex.level => 2, 1816 1817If omitted, the maximum is the 4th. 1818 1819B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level. 1820But this module only uses weights within 0xFFFF. 1821When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted' 1822and 'shift-trimmed'), the level 4 may be unreliable. 1823 1824See also C<identical>. 1825 1826=item long_contraction 1827 1828-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10. 1829 1830If the parameter is made true, for a contraction with three or more 1831characters (here nicknamed "long contraction"), initial substrings 1832will be handled. 1833For example, a contraction ABC, where A is a starter, and B and C 1834are non-starters (character with non-zero combining character class), 1835will be detected even if there is not AB as a contraction. 1836 1837B<Default:> Usually false. 1838If C<UCA_Version> is 22 or 24, and the value of C<long_contraction> 1839is not specified in C<new()>, a true value is set implicitly. 1840This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0. 1841 1842C<change()> handles C<long_contraction> explicitly only. 1843If C<long_contraction> is not specified in C<change()>, even though 1844C<UCA_Version> is changed, C<long_contraction> will not be changed. 1845 1846B<Limitation:> Scanning non-starters is one-way (no back tracking). 1847If AB is found but not ABC is not found, other long contraction where 1848the first character is A and the second is not B may not be found. 1849 1850Under C<(normalization =E<gt> undef)>, detection step of discontiguous 1851contractions will be skipped. 1852 1853B<Note:> The following contractions in DUCET are not considered 1854in steps S2.1.1 to S2.1.3, where they are discontiguous. 1855 1856 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR) 1857 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL) 1858 1859For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY> 1860(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD. 1861In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected, 1862instead of C<0FB2 0F71 0F80>. 1863Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of 1864contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected. 1865 1866=item minimalFFFE 1867 1868-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation. 1869 1870If the parameter is made true, C<U+FFFE> has a minimal primary weight. 1871The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2"> 1872first compares C<$a1> and C<$b1> at level 1, and 1873then C<$a2> and C<$b2> at level 1, as followed. 1874 1875 "ab\x{FFFE}a" 1876 "Ab\x{FFFE}a" 1877 "ab\x{FFFE}c" 1878 "Ab\x{FFFE}c" 1879 "ab\x{FFFE}xyz" 1880 "abc\x{FFFE}def" 1881 "abc\x{FFFE}xYz" 1882 "aBc\x{FFFE}xyz" 1883 "abcX\x{FFFE}def" 1884 "abcx\x{FFFE}xyz" 1885 "b\x{FFFE}aaa" 1886 "bbb\x{FFFE}a" 1887 1888Note: 1889This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>. 1890Any other character than C<U+FFFE> can be tailored by C<entry>. 1891 1892=item normalization 1893 1894-- see 4.1 Normalize, UTS #10. 1895 1896If specified, strings are normalized before preparation of sort keys 1897(the normalization is executed after preprocess). 1898 1899A form name C<Unicode::Normalize::normalize()> accepts will be applied 1900as C<$normalization_form>. 1901Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. 1902See C<Unicode::Normalize::normalize()> for detail. 1903If omitted, C<'NFD'> is used. 1904 1905C<normalization> is performed after C<preprocess> (if defined). 1906 1907Furthermore, special values, C<undef> and C<"prenormalized">, can be used, 1908though they are not concerned with C<Unicode::Normalize::normalize()>. 1909 1910If C<undef> (not a string C<"undef">) is passed explicitly 1911as the value for this key, 1912any normalization is not carried out (this may make tailoring easier 1913if any normalization is not desired). Under C<(normalization =E<gt> undef)>, 1914only contiguous contractions are resolved; 1915e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>, 1916C<A-cedilla-ring> would be primary equal to C<A>. 1917In this point, 1918C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })> 1919B<is not> equivalent to C<(normalization =E<gt> 'NFD')>. 1920 1921In the case of C<(normalization =E<gt> "prenormalized")>, 1922any normalization is not performed, but 1923discontiguous contractions with combining characters are performed. 1924Therefore 1925C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })> 1926B<is> equivalent to C<(normalization =E<gt> 'NFD')>. 1927If source strings are finely prenormalized, 1928C<(normalization =E<gt> 'prenormalized')> may save time for normalization. 1929 1930Except C<(normalization =E<gt> undef)>, 1931B<Unicode::Normalize> is required (see also B<CAVEAT>). 1932 1933=item overrideCJK 1934 1935-- see 7.1 Derived Collation Elements, UTS #10. 1936 1937By default, CJK unified ideographs are ordered in Unicode codepoint 1938order, but those in the CJK Unified Ideographs block are less than 1939those in the CJK Unified Ideographs Extension A etc. 1940 1941 In the CJK Unified Ideographs block: 1942 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11. 1943 U+4E00..U+9FBB if UCA_Version is 14 or 16. 1944 U+4E00..U+9FC3 if UCA_Version is 18. 1945 U+4E00..U+9FCB if UCA_Version is 20 or 22. 1946 U+4E00..U+9FCC if UCA_Version is 24 to 30. 1947 U+4E00..U+9FD5 if UCA_Version is 32 or 34. 1948 U+4E00..U+9FEA if UCA_Version is 36. 1949 U+4E00..U+9FEF if UCA_Version is 38, 40 or 41. 1950 U+4E00..U+9FFC if UCA_Version is 43. 1951 1952 In the CJK Unified Ideographs Extension blocks: 1953 Ext.A (U+3400..U+4DB5) if UCA_Version is 8 to 41. 1954 Ext.A (U+3400..U+4DBF) if UCA_Version is 43. 1955 Ext.B (U+20000..U+2A6D6) if UCA_Version is 8 to 41. 1956 Ext.B (U+20000..U+2A6DD) if UCA_Version is 43. 1957 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later. 1958 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later. 1959 Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32 or later. 1960 Ext.F (U+2CEB0..U+2EBE0) if UCA_Version is 36 or later. 1961 Ext.G (U+30000..U+3134A) if UCA_Version is 43. 1962 1963Through C<overrideCJK>, ordering of CJK unified ideographs (including 1964extensions) can be overridden. 1965 1966ex. CJK unified ideographs in the JIS code point order. 1967 1968 overrideCJK => sub { 1969 my $u = shift; # get a Unicode codepoint 1970 my $b = pack('n', $u); # to UTF-16BE 1971 my $s = your_unicode_to_sjis_converter($b); # convert 1972 my $n = unpack('n', $s); # convert sjis to short 1973 [ $n, 0x20, 0x2, $u ]; # return the collation element 1974 }, 1975 1976The return value may be an arrayref of 1st to 4th weights as shown 1977above. The return value may be an integer as the primary weight 1978as shown below. If C<undef> is returned, the default derived 1979collation element will be used. 1980 1981 overrideCJK => sub { 1982 my $u = shift; # get a Unicode codepoint 1983 my $b = pack('n', $u); # to UTF-16BE 1984 my $s = your_unicode_to_sjis_converter($b); # convert 1985 my $n = unpack('n', $s); # convert sjis to short 1986 return $n; # return the primary weight 1987 }, 1988 1989The return value may be a list containing zero or more of 1990an arrayref, an integer, or C<undef>. 1991 1992ex. ignores all CJK unified ideographs. 1993 1994 overrideCJK => sub {()}, # CODEREF returning empty list 1995 1996 # where ->eq("Pe\x{4E00}rl", "Perl") is true 1997 # as U+4E00 is a CJK unified ideograph and to be ignorable. 1998 1999If a false value (including C<undef>) is passed, C<overrideCJK> 2000has no effect. 2001C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one. 2002 2003But assignment of weight for CJK unified ideographs 2004in C<table> or C<entry> is still valid. 2005If C<undef> is passed explicitly as the value for this key, 2006weights for CJK unified ideographs are treated as undefined. 2007However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)> 2008has no special meaning. 2009 2010B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>, 2011C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>, 2012C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified 2013ideographs. But they can't be overridden via C<overrideCJK> when you use 2014DUCET, as the table includes weights for them. C<table> or C<entry> has 2015priority over C<overrideCJK>. 2016 2017=item overrideHangul 2018 2019-- see 7.1 Derived Collation Elements, UTS #10. 2020 2021By default, Hangul syllables are decomposed into Hangul Jamo, 2022even if C<(normalization =E<gt> undef)>. 2023But the mapping of Hangul syllables may be overridden. 2024 2025This parameter works like C<overrideCJK>, so see there for examples. 2026 2027If you want to override the mapping of Hangul syllables, 2028NFD and NFKD are not appropriate, since NFD and NFKD will decompose 2029Hangul syllables before overriding. FCD may decompose Hangul syllables 2030as the case may be. 2031 2032If a false value (but not C<undef>) is passed, C<overrideHangul> 2033has no effect. 2034C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one. 2035 2036If C<undef> is passed explicitly as the value for this key, 2037weight for Hangul syllables is treated as undefined 2038without decomposition into Hangul Jamo. 2039But definition of weight for Hangul syllables 2040in C<table> or C<entry> is still valid. 2041 2042=item overrideOut 2043 2044-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10. 2045 2046Perl seems to allow out-of-range values (greater than 0x10FFFF). 2047By default, out-of-range values are replaced with C<U+FFFD> 2048(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22, 2049or ignored when C<UCA_Version> E<lt>= 20. 2050 2051When C<UCA_Version> E<gt>= 22, the weights of out-of-range values 2052can be overridden. Though C<table> or C<entry> are available for them, 2053out-of-range values are too many. 2054 2055C<overrideOut> can perform it algorithmically. 2056This parameter works like C<overrideCJK>, so see there for examples. 2057 2058ex. ignores all out-of-range values. 2059 2060 overrideOut => sub {()}, # CODEREF returning empty list 2061 2062If a false value (including C<undef>) is passed, C<overrideOut> 2063has no effect. 2064C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one. 2065 2066B<NOTE ABOUT U+FFFD:> 2067 2068UCA recommends that out-of-range values should not be ignored for security 2069reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">. 2070However, C<U+FFFD> is wrongly mapped to a variable collation element 2071in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be 2072ignored when C<variable> isn't C<Non-ignorable>. 2073 2074The mapping of C<U+FFFD> is corrected in Unicode 6.3.0. 2075see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights> 2076(7.1.4 Trailing Weights). Such a correction is reproduced by this. 2077 2078 overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer 2079 2080This workaround is unnecessary since Unicode 6.3.0. 2081 2082=item preprocess 2083 2084-- see 5.4 Preprocessing, UTS #10. 2085 2086If specified, the coderef is used to preprocess each string 2087before the formation of sort keys. 2088 2089ex. dropping English articles, such as "a" or "the". 2090Then, "the pen" is before "a pencil". 2091 2092 preprocess => sub { 2093 my $str = shift; 2094 $str =~ s/\b(?:an?|the)\s+//gi; 2095 return $str; 2096 }, 2097 2098C<preprocess> is performed before C<normalization> (if defined). 2099 2100ex. decoding strings in a legacy encoding such as shift-jis: 2101 2102 $sjis_collator = Unicode::Collate->new( 2103 preprocess => \&your_shiftjis_to_unicode_decoder, 2104 ); 2105 @result = $sjis_collator->sort(@shiftjis_strings); 2106 2107B<Note:> Strings returned from the coderef will be interpreted 2108according to Perl's Unicode support. See L<perlunicode>, 2109L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. 2110 2111=item rearrange 2112 2113-- see 3.5 Rearrangement, UTS #10. 2114 2115Characters that are not coded in logical order and to be rearranged. 2116If C<UCA_Version> is equal to or less than 11, default is: 2117 2118 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ], 2119 2120If you want to disallow any rearrangement, pass C<undef> or C<[]> 2121(a reference to empty list) as the value for this key. 2122 2123If C<UCA_Version> is equal to or greater than 14, default is C<[]> 2124(i.e. no rearrangement). 2125 2126B<According to the version 9 of UCA, this parameter shall not be used; 2127but it is not warned at present.> 2128 2129=item rewrite 2130 2131If specified, the coderef is used to rewrite lines in C<table> or C<entry>. 2132The coderef will get each line, and then should return a rewritten line 2133according to the UCA file format. 2134If the coderef returns an empty line, the line will be skipped. 2135 2136e.g. any primary ignorable characters into tertiary ignorable: 2137 2138 rewrite => sub { 2139 my $line = shift; 2140 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g; 2141 return $line; 2142 }, 2143 2144This example shows rewriting weights. C<rewrite> is allowed to 2145affect code points, weights, and the name. 2146 2147B<NOTE>: C<table> is available to use another table file; 2148preparing a modified table once would be more efficient than 2149rewriting lines on reading an unmodified table every time. 2150 2151=item suppress 2152 2153-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation. 2154 2155Contractions beginning with the specified characters are suppressed, 2156even if those contractions are defined in C<table>. 2157 2158An example for Russian and some languages using the Cyrillic script: 2159 2160 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F], 2161 2162where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE. 2163 2164B<NOTE>: Contractions via C<entry> will not be suppressed. 2165 2166=item table 2167 2168-- see 3.8 Default Unicode Collation Element Table, UTS #10. 2169 2170You can use another collation element table if desired. 2171 2172The table file should locate in the F<Unicode/Collate> directory 2173on C<@INC>. Say, if the filename is F<Foo.txt>, 2174the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>. 2175 2176By default, F<allkeys.txt> (as the filename of DUCET) is used. 2177If you will prepare your own table file, any name other than F<allkeys.txt> 2178may be better to avoid namespace conflict. 2179 2180B<NOTE>: When XSUB is used, the DUCET is compiled on building this 2181module, and it may save time at the run time. 2182Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table, 2183or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or 2184C<rewrite> will prevent this module from using the compiled DUCET. 2185 2186If C<undef> is passed explicitly as the value for this key, 2187no file is read (but you can define collation elements via C<entry>). 2188 2189A typical way to define a collation element table 2190without any file of table: 2191 2192 $onlyABC = Unicode::Collate->new( 2193 table => undef, 2194 entry => << 'ENTRIES', 21950061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A 21960041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A 21970062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B 21980042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B 21990063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C 22000043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C 2201ENTRIES 2202 ); 2203 2204If C<ignoreName> or C<undefName> is used, character names should be 2205specified as a comment (following C<#>) on each line. 2206 2207=item undefChar 2208 2209=item undefName 2210 2211-- see 6.3.3 Reducing the Repertoire, UTS #10. 2212 2213Undefines the collation element as if it were unassigned in the C<table>. 2214This reduces the size of the table. 2215If an unassigned character appears in the string to be collated, 2216the sort key is made from its codepoint 2217as a single-character collation element, 2218as it is greater than any other assigned collation elements 2219(in the codepoint order among the unassigned characters). 2220But, it'd be better to ignore characters 2221unfamiliar to you and maybe never used. 2222 2223Through C<undefChar>, any character matching C<qr/$undefChar/> 2224will be undefined. Through C<undefName>, any character whose name 2225(given in the C<table> file as a comment) matches C<qr/$undefName/> 2226will be undefined. 2227 2228ex. Collation weights for beyond-BMP characters are not stored in object: 2229 2230 undefChar => qr/[^\0-\x{fffd}]/, 2231 2232=item upper_before_lower 2233 2234-- see 6.6 Case Comparisons, UTS #10. 2235 2236By default, lowercase is before uppercase. 2237If the parameter is made true, this is reversed. 2238 2239B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase 2240distinctions must occur in level 3, and their weights at level 3 must be 2241same as those mentioned in 7.3.1, UTS #10. 2242If you define your collation elements which differs from this requirement, 2243this parameter doesn't work validly. 2244 2245=item variable 2246 2247-- see 3.6 Variable Weighting, UTS #10. 2248 2249This key allows for variable weighting of variable collation elements, 2250which are marked with an ASTERISK in the table 2251(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>). 2252 2253 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. 2254 2255These names are case-insensitive. 2256By default (if specification is omitted), 'shifted' is adopted. 2257 2258 'Blanked' Variable elements are made ignorable at levels 1 through 3; 2259 considered at the 4th level. 2260 2261 'Non-Ignorable' Variable elements are not reset to ignorable. 2262 2263 'Shifted' Variable elements are made ignorable at levels 1 through 3 2264 their level 4 weight is replaced by the old level 1 weight. 2265 Level 4 weight for Non-Variable elements is 0xFFFF. 2266 2267 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level 2268 are trimmed. 2269 2270=back 2271 2272=head2 Methods for Collation 2273 2274=over 4 2275 2276=item C<@sorted = $Collator-E<gt>sort(@not_sorted)> 2277 2278Sorts a list of strings. 2279 2280=item C<$result = $Collator-E<gt>cmp($a, $b)> 2281 2282Returns 1 (when C<$a> is greater than C<$b>) 2283or 0 (when C<$a> is equal to C<$b>) 2284or -1 (when C<$a> is less than C<$b>). 2285 2286=item C<$result = $Collator-E<gt>eq($a, $b)> 2287 2288=item C<$result = $Collator-E<gt>ne($a, $b)> 2289 2290=item C<$result = $Collator-E<gt>lt($a, $b)> 2291 2292=item C<$result = $Collator-E<gt>le($a, $b)> 2293 2294=item C<$result = $Collator-E<gt>gt($a, $b)> 2295 2296=item C<$result = $Collator-E<gt>ge($a, $b)> 2297 2298They works like the same name operators as theirs. 2299 2300 eq : whether $a is equal to $b. 2301 ne : whether $a is not equal to $b. 2302 lt : whether $a is less than $b. 2303 le : whether $a is less than $b or equal to $b. 2304 gt : whether $a is greater than $b. 2305 ge : whether $a is greater than $b or equal to $b. 2306 2307=item C<$sortKey = $Collator-E<gt>getSortKey($string)> 2308 2309-- see 4.3 Form Sort Key, UTS #10. 2310 2311Returns a sort key. 2312 2313You compare the sort keys using a binary comparison 2314and get the result of the comparison of the strings using UCA. 2315 2316 $Collator->getSortKey($a) cmp $Collator->getSortKey($b) 2317 2318 is equivalent to 2319 2320 $Collator->cmp($a, $b) 2321 2322=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)> 2323 2324Converts a sorting key into its representation form. 2325If C<UCA_Version> is 8, the output is slightly different. 2326 2327 use Unicode::Collate; 2328 my $c = Unicode::Collate->new(); 2329 print $c->viewSortKey("Perl"),"\n"; 2330 2331 # output: 2332 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] 2333 # Level 1 Level 2 Level 3 Level 4 2334 2335=back 2336 2337=head2 Methods for Searching 2338 2339The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work 2340like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively, 2341but they are not aware of any pattern, but only a literal substring. 2342 2343B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true 2344for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>, 2345C<subst>, C<gsubst>) is croaked, as the position and the length might 2346differ from those on the specified string. 2347 2348C<rearrange> and C<hangul_terminator> parameters are neglected. 2349C<katakana_before_hiragana> and C<upper_before_lower> don't affect 2350matching and searching, as it doesn't matter whether greater or less. 2351 2352=over 4 2353 2354=item C<$position = $Collator-E<gt>index($string, $substring[, $position])> 2355 2356=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])> 2357 2358If C<$substring> matches a part of C<$string>, returns 2359the position of the first occurrence of the matching part in scalar context; 2360in list context, returns a two-element list of 2361the position and the length of the matching part. 2362 2363If C<$substring> does not match any part of C<$string>, 2364returns C<-1> in scalar context and 2365an empty list in list context. 2366 2367e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">, 2368you say the following where C<$sub> is C<"M>E<uuml>C<SS">, 2369 2370 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 2371 # (normalization => undef) is REQUIRED. 2372 my $match; 2373 if (my($pos,$len) = $Collator->index($str, $sub)) { 2374 $match = substr($str, $pos, $len); 2375 } 2376 2377and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<"> 2378is primary equal to C<"M>E<uuml>C<SS">. 2379 2380=item C<$match_ref = $Collator-E<gt>match($string, $substring)> 2381 2382=item C<($match) = $Collator-E<gt>match($string, $substring)> 2383 2384If C<$substring> matches a part of C<$string>, in scalar context, returns 2385B<a reference to> the first occurrence of the matching part 2386(C<$match_ref> is always true if matches, 2387since every reference is B<true>); 2388in list context, returns the first occurrence of the matching part. 2389 2390If C<$substring> does not match any part of C<$string>, 2391returns C<undef> in scalar context and 2392an empty list in list context. 2393 2394e.g. 2395 2396 if ($match_ref = $Collator->match($str, $sub)) { # scalar context 2397 print "matches [$$match_ref].\n"; 2398 } else { 2399 print "doesn't match.\n"; 2400 } 2401 2402 or 2403 2404 if (($match) = $Collator->match($str, $sub)) { # list context 2405 print "matches [$match].\n"; 2406 } else { 2407 print "doesn't match.\n"; 2408 } 2409 2410=item C<@match = $Collator-E<gt>gmatch($string, $substring)> 2411 2412If C<$substring> matches a part of C<$string>, returns 2413all the matching parts (or matching count in scalar context). 2414 2415If C<$substring> does not match any part of C<$string>, 2416returns an empty list. 2417 2418=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)> 2419 2420If C<$substring> matches a part of C<$string>, 2421the first occurrence of the matching part is replaced by C<$replacement> 2422(C<$string> is modified) and C<$count> (always equals to C<1>) is returned. 2423 2424C<$replacement> can be a C<CODEREF>, 2425taking the matching part as an argument, 2426and returning a string to replace the matching part 2427(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>). 2428 2429=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)> 2430 2431If C<$substring> matches a part of C<$string>, 2432all the occurrences of the matching part are replaced by C<$replacement> 2433(C<$string> is modified) and C<$count> is returned. 2434 2435C<$replacement> can be a C<CODEREF>, 2436taking the matching part as an argument, 2437and returning a string to replace the matching part 2438(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>). 2439 2440e.g. 2441 2442 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 2443 # (normalization => undef) is REQUIRED. 2444 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l..."; 2445 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); 2446 2447 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>..."; 2448 # i.e., all the camels are made bold-faced. 2449 2450 Examples: levels and ignore_level2 - what does camel match? 2451 --------------------------------------------------------------------------- 2452 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l 2453 -----------------------|--------------------------------------------------- 2454 1 false | yes yes yes yes yes 2455 2 false | yes yes no yes yes 2456 3 false | yes no no yes yes 2457 4 false | yes no no no yes 2458 -----------------------|--------------------------------------------------- 2459 1 true | yes yes yes yes yes 2460 2 true | yes yes yes yes yes 2461 3 true | yes no yes yes yes 2462 4 true | yes no yes no yes 2463 --------------------------------------------------------------------------- 2464 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l 2465 at any level. 2466 2467=back 2468 2469=head2 Other Methods 2470 2471=over 4 2472 2473=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)> 2474 2475=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)> 2476 2477Changes the value of specified keys and returns the changed part. 2478 2479 $Collator = Unicode::Collate->new(level => 4); 2480 2481 $Collator->eq("perl", "PERL"); # false 2482 2483 %old = $Collator->change(level => 2); # returns (level => 4). 2484 2485 $Collator->eq("perl", "PERL"); # true 2486 2487 $Collator->change(%old); # returns (level => 2). 2488 2489 $Collator->eq("perl", "PERL"); # false 2490 2491Not all C<(key,value)>s are allowed to be changed. 2492See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>. 2493 2494In the scalar context, returns the modified collator 2495(but it is B<not> a clone from the original). 2496 2497 $Collator->change(level => 2)->eq("perl", "PERL"); # true 2498 2499 $Collator->eq("perl", "PERL"); # true; now max level is 2nd. 2500 2501 $Collator->change(level => 4)->eq("perl", "PERL"); # false 2502 2503=item C<$version = $Collator-E<gt>version()> 2504 2505Returns the version number (a string) of the Unicode Standard 2506which the C<table> file used by the collator object is based on. 2507If the table does not include a version line (starting with C<@version>), 2508returns C<"unknown">. 2509 2510=item C<UCA_Version()> 2511 2512Returns the revision number of UTS #10 this module consults, 2513that should correspond with the DUCET incorporated. 2514 2515=item C<Base_Unicode_Version()> 2516 2517Returns the version number of UTS #10 this module consults, 2518that should correspond with the DUCET incorporated. 2519 2520=back 2521 2522=head1 EXPORT 2523 2524No method will be exported. 2525 2526=head1 INSTALL 2527 2528Though this module can be used without any C<table> file, 2529to use this module easily, it is recommended to install a table file 2530in the UCA format, by copying it under the directory 2531<a place in @INC>/Unicode/Collate. 2532 2533The most preferable one is "The Default Unicode Collation Element Table" 2534(aka DUCET), available from the Unicode Consortium's website: 2535 2536 http://www.unicode.org/Public/UCA/ 2537 2538 http://www.unicode.org/Public/UCA/latest/allkeys.txt 2539 (latest version) 2540 2541If DUCET is not installed, it is recommended to copy the file 2542from http://www.unicode.org/Public/UCA/latest/allkeys.txt 2543to <a place in @INC>/Unicode/Collate/allkeys.txt 2544manually. 2545 2546=head1 CAVEATS 2547 2548=over 4 2549 2550=item Normalization 2551 2552Use of the C<normalization> parameter requires the B<Unicode::Normalize> 2553module (see L<Unicode::Normalize>). 2554 2555If you need not it (say, in the case when you need not 2556handle any combining characters), 2557assign C<(normalization =E<gt> undef)> explicitly. 2558 2559-- see 6.5 Avoiding Normalization, UTS #10. 2560 2561=item Conformance Test 2562 2563The Conformance Test for the UCA is available 2564under L<http://www.unicode.org/Public/UCA/>. 2565 2566For F<CollationTest_SHIFTED.txt>, 2567a collator via C<Unicode::Collate-E<gt>new( )> should be used; 2568for F<CollationTest_NON_IGNORABLE.txt>, a collator via 2569C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. 2570 2571If C<UCA_Version> is 26 or later, the C<identical> level is preferred; 2572C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and 2573C<Unicode::Collate-E<gt>new(identical =E<gt> 1,> 2574C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used. 2575 2576B<Unicode::Normalize is required to try The Conformance Test.> 2577 2578B<EBCDIC-SUPPORT IS EXPERIMENTAL.> 2579 2580=back 2581 2582=head1 AUTHOR, COPYRIGHT AND LICENSE 2583 2584The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, 2585<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2021, 2586SADAHIRO Tomoyuki. Japan. All rights reserved. 2587 2588This module is free software; you can redistribute it and/or 2589modify it under the same terms as Perl itself. 2590 2591The file Unicode/Collate/allkeys.txt was copied verbatim 2592from L<http://www.unicode.org/Public/UCA/13.0.0/allkeys.txt>. 2593For this file, Copyright (c) 2020 Unicode, Inc.; distributed 2594under the Terms of Use in L<http://www.unicode.org/terms_of_use.html> 2595 2596=head1 SEE ALSO 2597 2598=over 4 2599 2600=item Unicode Collation Algorithm - UTS #10 2601 2602L<http://www.unicode.org/reports/tr10/> 2603 2604=item The Default Unicode Collation Element Table (DUCET) 2605 2606L<http://www.unicode.org/Public/UCA/latest/allkeys.txt> 2607 2608=item The conformance test for the UCA 2609 2610L<http://www.unicode.org/Public/UCA/latest/CollationTest.html> 2611 2612L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip> 2613 2614=item Hangul Syllable Type 2615 2616L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> 2617 2618=item Unicode Normalization Forms - UAX #15 2619 2620L<http://www.unicode.org/reports/tr15/> 2621 2622=item Unicode Locale Data Markup Language (LDML) - UTS #35 2623 2624L<http://www.unicode.org/reports/tr35/> 2625 2626=back 2627 2628=cut 2629