1package Unicode::Collate;
2
3BEGIN {
4    unless ("A" eq pack('U', 0x41)) {
5	die "Unicode::Collate cannot stringify a Unicode code point\n";
6    }
7    unless (0x41 == unpack('U', 'A')) {
8	die "Unicode::Collate cannot get a Unicode code point\n";
9    }
10}
11
12use 5.006;
13use strict;
14use warnings;
15use Carp;
16use File::Spec;
17
18no warnings 'utf8';
19
20our $VERSION = '1.29';
21our $PACKAGE = __PACKAGE__;
22
23### begin XS only ###
24use XSLoader ();
25XSLoader::load('Unicode::Collate', $VERSION);
26### end XS only ###
27
28my @Path = qw(Unicode Collate);
29my $KeyFile = 'allkeys.txt';
30
31# Perl's boolean
32use constant TRUE  => 1;
33use constant FALSE => "";
34use constant NOMATCHPOS => -1;
35
36# A coderef to get combining class imported from Unicode::Normalize
37# (i.e. \&Unicode::Normalize::getCombinClass).
38# This is also used as a HAS_UNICODE_NORMALIZE flag.
39my $CVgetCombinClass;
40
41# Supported Levels
42use constant MinLevel => 1;
43use constant MaxLevel => 4;
44
45# Minimum weights at level 2 and 3, respectively
46use constant Min2Wt => 0x20;
47use constant Min3Wt => 0x02;
48
49# Shifted weight at 4th level
50use constant Shift4Wt => 0xFFFF;
51
52# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
53use constant VCE_TEMPLATE => 'Cn4';
54
55# A sort key: 16-bit weights
56use constant KEY_TEMPLATE => 'n*';
57
58# The tie-breaking: 32-bit weights
59use constant TIE_TEMPLATE => 'N*';
60
61# Level separator in a sort key:
62# i.e. pack(KEY_TEMPLATE, 0)
63use constant LEVEL_SEP => "\0\0";
64
65# As Unicode code point separator for hash keys.
66# A joined code point string (denoted by JCPS below)
67# like "65;768" is used for internal processing
68# instead of Perl's Unicode string like "\x41\x{300}",
69# as the native code point is different from the Unicode code point
70# on EBCDIC platform.
71# This character must not be included in any stringified
72# representation of an integer.
73use constant CODE_SEP => ';';
74	# NOTE: in regex /;/ is used for $jcps!
75
76# boolean values of variable weights
77use constant NON_VAR => 0; # Non-Variable character
78use constant VAR     => 1; # Variable character
79
80# specific code points
81use constant Hangul_SIni   => 0xAC00;
82use constant Hangul_SFin   => 0xD7A3;
83
84# Logical_Order_Exception in PropList.txt
85my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
86
87# for highestFFFF and minimalFFFE
88my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
89my $minimalVCE = pack(VCE_TEMPLATE, 0,      1, 0x20, 0x5, 0xFFFE);
90
91sub UCA_Version { '43' }
92
93sub Base_Unicode_Version { '13.0.0' }
94
95######
96
97sub pack_U {
98    return pack('U*', @_);
99}
100
101sub unpack_U {
102    return unpack('U*', shift(@_).pack('U*'));
103}
104
105######
106
107my (%VariableOK);
108@VariableOK{ qw/
109    blanked  non-ignorable  shifted  shift-trimmed
110  / } = (); # keys lowercased
111
112our @ChangeOK = qw/
113    alternate backwards level normalization rearrange
114    katakana_before_hiragana upper_before_lower ignore_level2
115    overrideCJK overrideHangul overrideOut preprocess UCA_Version
116    hangul_terminator variable identical highestFFFF minimalFFFE
117    long_contraction
118  /;
119
120our @ChangeNG = qw/
121    entry mapping table maxlength contraction
122    ignoreChar ignoreName undefChar undefName rewrite
123    versionTable alternateTable backwardsTable forwardsTable
124    rearrangeTable variableTable
125    derivCode normCode rearrangeHash backwardsFlag
126    suppress suppressHash
127    __useXS /; ### XS only
128# The hash key 'ignored' was deleted at v 0.21.
129# The hash key 'isShift' was deleted at v 0.23.
130# The hash key 'combining' was deleted at v 0.24.
131# The hash key 'entries' was deleted at v 0.30.
132# The hash key 'L3_ignorable' was deleted at v 0.40.
133
134sub version {
135    my $self = shift;
136    return $self->{versionTable} || 'unknown';
137}
138
139my (%ChangeOK, %ChangeNG);
140@ChangeOK{ @ChangeOK } = ();
141@ChangeNG{ @ChangeNG } = ();
142
143sub change {
144    my $self = shift;
145    my %hash = @_;
146    my %old;
147    if (exists $hash{alternate}) {
148	if (exists $hash{variable}) {
149	    delete $hash{alternate};
150	} else {
151	    $hash{variable} = $hash{alternate};
152	}
153    }
154    foreach my $k (keys %hash) {
155	if (exists $ChangeOK{$k}) {
156	    $old{$k} = $self->{$k};
157	    $self->{$k} = $hash{$k};
158	} elsif (exists $ChangeNG{$k}) {
159	    croak "change of $k via change() is not allowed!";
160	}
161	# else => ignored
162    }
163    $self->checkCollator();
164    return wantarray ? %old : $self;
165}
166
167sub _checkLevel {
168    my $level = shift;
169    my $key   = shift; # 'level' or 'backwards'
170    MinLevel <= $level or croak sprintf
171	"Illegal level %d (in value for key '%s') lower than %d.",
172	    $level, $key, MinLevel;
173    $level <= MaxLevel or croak sprintf
174	"Unsupported level %d (in value for key '%s') higher than %d.",
175	    $level, $key, MaxLevel;
176}
177
178my %DerivCode = (
179    8 => \&_derivCE_8,
180    9 => \&_derivCE_9,
181   11 => \&_derivCE_9, # 11 == 9
182   14 => \&_derivCE_14,
183   16 => \&_derivCE_14, # 16 == 14
184   18 => \&_derivCE_18,
185   20 => \&_derivCE_20,
186   22 => \&_derivCE_22,
187   24 => \&_derivCE_24,
188   26 => \&_derivCE_24, # 26 == 24
189   28 => \&_derivCE_24, # 28 == 24
190   30 => \&_derivCE_24, # 30 == 24
191   32 => \&_derivCE_32,
192   34 => \&_derivCE_34,
193   36 => \&_derivCE_36,
194   38 => \&_derivCE_38,
195   40 => \&_derivCE_40,
196   41 => \&_derivCE_40, # 41 == 40
197   43 => \&_derivCE_43,
198);
199
200sub checkCollator {
201    my $self = shift;
202    _checkLevel($self->{level}, 'level');
203
204    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
205	or croak "Illegal UCA version (passed $self->{UCA_Version}).";
206
207    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
208				$self->{alternateTable} || 'shifted';
209    $self->{variable} = $self->{alternate} = lc($self->{variable});
210    exists $VariableOK{ $self->{variable} }
211	or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
212
213    if (! defined $self->{backwards}) {
214	$self->{backwardsFlag} = 0;
215    } elsif (! ref $self->{backwards}) {
216	_checkLevel($self->{backwards}, 'backwards');
217	$self->{backwardsFlag} = 1 << $self->{backwards};
218    } else {
219	my %level;
220	$self->{backwardsFlag} = 0;
221	for my $b (@{ $self->{backwards} }) {
222	    _checkLevel($b, 'backwards');
223	    $level{$b} = 1;
224	}
225	for my $v (sort keys %level) {
226	    $self->{backwardsFlag} += 1 << $v;
227	}
228    }
229
230    defined $self->{rearrange} or $self->{rearrange} = [];
231    ref $self->{rearrange}
232	or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
233
234    # keys of $self->{rearrangeHash} are $self->{rearrange}.
235    $self->{rearrangeHash} = undef;
236
237    if (@{ $self->{rearrange} }) {
238	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
239    }
240
241    $self->{normCode} = undef;
242
243    if (defined $self->{normalization}) {
244	eval { require Unicode::Normalize };
245	$@ and croak "Unicode::Normalize is required to normalize strings";
246
247	$CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
248
249	if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
250	    $self->{normCode} = \&Unicode::Normalize::NFD;
251	}
252	elsif ($self->{normalization} ne 'prenormalized') {
253	    my $norm = $self->{normalization};
254	    $self->{normCode} = sub {
255		Unicode::Normalize::normalize($norm, shift);
256	    };
257	    eval { $self->{normCode}->("") }; # try
258	    $@ and croak "$PACKAGE unknown normalization form name: $norm";
259	}
260    }
261    return;
262}
263
264sub new
265{
266    my $class = shift;
267    my $self = bless { @_ }, $class;
268
269### begin XS only ###
270    if (! exists $self->{table}     && !defined $self->{rewrite} &&
271	!defined $self->{undefName} && !defined $self->{ignoreName} &&
272	!defined $self->{undefChar} && !defined $self->{ignoreChar}) {
273	$self->{__useXS} = \&_fetch_simple;
274    } else {
275	$self->{__useXS} = undef;
276    }
277### end XS only ###
278
279    # keys of $self->{suppressHash} are $self->{suppress}.
280    if ($self->{suppress} && @{ $self->{suppress} }) {
281	@{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
282    } # before read_table()
283
284    # If undef is passed explicitly, no file is read.
285    $self->{table} = $KeyFile if ! exists $self->{table};
286    $self->read_table() if defined $self->{table};
287
288    if ($self->{entry}) {
289	while ($self->{entry} =~ /([^\n]+)/g) {
290	    $self->parseEntry($1, TRUE);
291	}
292    }
293
294    # only in new(), not in change()
295    $self->{level} ||= MaxLevel;
296    $self->{UCA_Version} ||= UCA_Version();
297
298    $self->{overrideHangul} = FALSE
299	if ! exists $self->{overrideHangul};
300    $self->{overrideCJK} = FALSE
301	if ! exists $self->{overrideCJK};
302    $self->{normalization} = 'NFD'
303	if ! exists $self->{normalization};
304    $self->{rearrange} = $self->{rearrangeTable} ||
305	($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
306	if ! exists $self->{rearrange};
307    $self->{backwards} = $self->{backwardsTable}
308	if ! exists $self->{backwards};
309    exists $self->{long_contraction} or $self->{long_contraction}
310	= 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
311
312    # checkCollator() will be called in change()
313    $self->checkCollator();
314
315    return $self;
316}
317
318sub parseAtmark {
319    my $self = shift;
320    my $line = shift; # after s/^\s*\@//
321
322    if ($line =~ /^version\s*(\S*)/) {
323	$self->{versionTable} ||= $1;
324    }
325    elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
326	$self->{variableTable} ||= $1;
327    }
328    elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
329	$self->{alternateTable} ||= $1;
330    }
331    elsif ($line =~ /^backwards\s+(\S*)/) {
332	push @{ $self->{backwardsTable} }, $1;
333    }
334    elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
335	push @{ $self->{forwardsTable} }, $1;
336    }
337    elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
338	push @{ $self->{rearrangeTable} }, _getHexArray($1);
339    }
340}
341
342sub read_table {
343    my $self = shift;
344
345### begin XS only ###
346    if ($self->{__useXS}) {
347	my @rest = _fetch_rest(); # complex matter need to parse
348	for my $line (@rest) {
349	    next if $line =~ /^\s*#/;
350
351	    if ($line =~ s/^\s*\@//) {
352		$self->parseAtmark($line);
353	    } else {
354		$self->parseEntry($line);
355	    }
356	}
357	return;
358    }
359### end XS only ###
360
361    my($f, $fh);
362    foreach my $d (@INC) {
363	$f = File::Spec->catfile($d, @Path, $self->{table});
364	last if open($fh, $f);
365	$f = undef;
366    }
367    if (!defined $f) {
368	$f = File::Spec->catfile(@Path, $self->{table});
369	croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
370    }
371
372    while (my $line = <$fh>) {
373	next if $line =~ /^\s*#/;
374
375	if ($line =~ s/^\s*\@//) {
376	    $self->parseAtmark($line);
377	} else {
378	    $self->parseEntry($line);
379	}
380    }
381    close $fh;
382}
383
384
385##
386## get $line, parse it, and write an entry in $self
387##
388sub parseEntry
389{
390    my $self = shift;
391    my $line = shift;
392    my $tailoring = shift;
393    my($name, $entry, @uv, @key);
394
395    if (defined $self->{rewrite}) {
396	$line = $self->{rewrite}->($line);
397    }
398
399    return if $line !~ /^\s*[0-9A-Fa-f]/;
400
401    # removes comment and gets name
402    $name = $1
403	if $line =~ s/[#%]\s*(.*)//;
404    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
405
406    # gets element
407    my($e, $k) = split /;/, $line;
408    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
409	if ! $k;
410
411    @uv = _getHexArray($e);
412    return if !@uv;
413    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
414		  exists $self->{suppressHash}{$uv[0]};
415    $entry = join(CODE_SEP, @uv); # in JCPS
416
417    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
418	my $ele = pack_U(@uv);
419
420	# regarded as if it were not stored in the table
421	return
422	    if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
423
424	# replaced as completely ignorable
425	$k = '[.0000.0000.0000.0000]'
426	    if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
427    }
428
429    # replaced as completely ignorable
430    $k = '[.0000.0000.0000.0000]'
431	if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
432
433    my $is_L3_ignorable = TRUE;
434
435    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
436	my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
437	my @wt = _getHexArray($arr);
438	push @key, pack(VCE_TEMPLATE, $var, @wt);
439	$is_L3_ignorable = FALSE
440	    if $wt[0] || $wt[1] || $wt[2];
441	# Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
442	# is completely ignorable.
443	# For expansion, an entry $is_L3_ignorable
444	# if and only if "all" CEs are [.0000.0000.0000].
445    }
446
447    # mapping: be an array ref or not exists (any false value is disallowed)
448    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
449
450    # maxlength: be more than 1 or not exists (any false value is disallowed)
451    if (@uv > 1) {
452	if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
453	    $self->{maxlength}{$uv[0]} = @uv;
454	}
455    }
456
457    # contraction: be 1 or not exists (any false value is disallowed)
458    while (@uv > 2) {
459	pop @uv;
460	my $fake_entry = join(CODE_SEP, @uv); # in JCPS
461	$self->{contraction}{$fake_entry} = 1;
462    }
463}
464
465
466sub viewSortKey
467{
468    my $self = shift;
469    my $str  = shift;
470    $self->visualizeSortKey($self->getSortKey($str));
471}
472
473
474sub process
475{
476    my $self = shift;
477    my $str  = shift;
478    my $prep = $self->{preprocess};
479    my $norm = $self->{normCode};
480
481    $str = &$prep($str) if ref $prep;
482    $str = &$norm($str) if ref $norm;
483    return $str;
484}
485
486##
487## arrayref of JCPS   = splitEnt(string to be collated)
488## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
489##
490sub splitEnt
491{
492    my $self = shift;
493    my $str  = shift;
494    my $wLen = shift; # with Length
495
496    my $map  = $self->{mapping};
497    my $max  = $self->{maxlength};
498    my $reH  = $self->{rearrangeHash};
499    my $vers = $self->{UCA_Version};
500    my $ver9 = $vers >= 9 && $vers <= 11;
501    my $long = $self->{long_contraction};
502    my $uXS  = $self->{__useXS}; ### XS only
503
504    my @buf;
505
506    # get array of Unicode code point of string.
507    my @src = unpack_U($str);
508
509    # rearrangement:
510    # Character positions are not kept if rearranged,
511    # then neglected if $wLen is true.
512    if ($reH && ! $wLen) {
513	for (my $i = 0; $i < @src; $i++) {
514	    if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
515		($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
516		$i++;
517	    }
518	}
519    }
520
521    # remove a code point marked as a completely ignorable.
522    for (my $i = 0; $i < @src; $i++) {
523	if ($vers <= 20 && _isIllegal($src[$i])) {
524	    $src[$i] = undef;
525	} elsif ($ver9) {
526	    $src[$i] = undef if exists $map->{ $src[$i] }
527			   ? @{ $map->{ $src[$i] } } == 0
528			   : $uXS && _ignorable_simple($src[$i]); ### XS only
529	}
530    }
531
532    for (my $i = 0; $i < @src; $i++) {
533	my $jcps = $src[$i];
534
535	# skip removed code point
536	if (! defined $jcps) {
537	    if ($wLen && @buf) {
538		$buf[-1][2] = $i + 1;
539	    }
540	    next;
541	}
542
543	my $i_orig = $i;
544
545	# find contraction
546	if (exists $max->{$jcps}) {
547	    my $temp_jcps = $jcps;
548	    my $jcpsLen = 1;
549	    my $maxLen = $max->{$jcps};
550
551	    for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
552		next if ! defined $src[$p];
553		$temp_jcps .= CODE_SEP . $src[$p];
554		$jcpsLen++;
555		if (exists $map->{$temp_jcps}) {
556		    $jcps = $temp_jcps;
557		    $i = $p;
558		}
559	    }
560
561	# discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
562	# This process requires Unicode::Normalize.
563	# If "normalization" is undef, here should be skipped *always*
564	# (in spite of bool value of $CVgetCombinClass),
565	# since canonical ordering cannot be expected.
566	# Blocked combining character should not be contracted.
567
568	    # $self->{normCode} is false in the case of "prenormalized".
569	    if ($self->{normalization}) {
570		my $cont = $self->{contraction};
571		my $preCC = 0;
572		my $preCC_uc = 0;
573		my $jcps_uc = $jcps;
574		my(@out, @out_uc);
575
576		for (my $p = $i + 1; $p < @src; $p++) {
577		    next if ! defined $src[$p];
578		    my $curCC = $CVgetCombinClass->($src[$p]);
579		    last unless $curCC;
580		    my $tail = CODE_SEP . $src[$p];
581
582		    if ($preCC != $curCC && exists $map->{$jcps.$tail}) {
583			$jcps .= $tail;
584			push @out, $p;
585		    } else {
586			$preCC = $curCC;
587		    }
588
589		    next if !$long;
590
591		    if ($preCC_uc != $curCC &&
592			    (exists $map->{$jcps_uc.$tail} ||
593			    exists $cont->{$jcps_uc.$tail})) {
594			$jcps_uc .= $tail;
595			push @out_uc, $p;
596		    } else {
597			$preCC_uc = $curCC;
598		    }
599		}
600
601		if (@out_uc && exists $map->{$jcps_uc}) {
602		    $jcps = $jcps_uc;
603		    $src[$_] = undef for @out_uc;
604		} else {
605		    $src[$_] = undef for @out;
606		}
607	    }
608	}
609
610	# skip completely ignorable
611	if (exists $map->{$jcps} ? @{ $map->{$jcps} } == 0 :
612	    $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
613	    if ($wLen && @buf) {
614		$buf[-1][2] = $i + 1;
615	    }
616	    next;
617	}
618
619	push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
620    }
621    return \@buf;
622}
623
624##
625## VCE = _pack_override(input, codepoint, derivCode)
626##
627sub _pack_override ($$$) {
628    my $r = shift;
629    my $u = shift;
630    my $der = shift;
631
632    if (ref $r) {
633	return pack(VCE_TEMPLATE, NON_VAR, @$r);
634    } elsif (defined $r) {
635	return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
636    } else {
637	$u = 0xFFFD if 0x10FFFF < $u;
638	return $der->($u);
639    }
640}
641
642##
643## list of VCE = getWt(JCPS)
644##
645sub getWt
646{
647    my $self = shift;
648    my $u    = shift;
649    my $map  = $self->{mapping};
650    my $der  = $self->{derivCode};
651    my $out  = $self->{overrideOut};
652    my $uXS  = $self->{__useXS}; ### XS only
653
654    return if !defined $u;
655    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
656    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
657    $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;
658
659    my @ce;
660    if (exists $map->{$u}) {
661	@ce = @{ $map->{$u} }; # $u may be a contraction
662### begin XS only ###
663    } elsif ($uXS && _exists_simple($u)) {
664	@ce = _fetch_simple($u);
665### end XS only ###
666    } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
667	my $hang = $self->{overrideHangul};
668	if ($hang) {
669	    @ce = map _pack_override($_, $u, $der), $hang->($u);
670	} elsif (!defined $hang) {
671	    @ce = $der->($u);
672	} else {
673	    my $max  = $self->{maxlength};
674	    my @decH = _decompHangul($u);
675
676	    if (@decH == 2) {
677		my $contract = join(CODE_SEP, @decH);
678		@decH = ($contract) if exists $map->{$contract};
679	    } else { # must be <@decH == 3>
680		if (exists $max->{$decH[0]}) {
681		    my $contract = join(CODE_SEP, @decH);
682		    if (exists $map->{$contract}) {
683			@decH = ($contract);
684		    } else {
685			$contract = join(CODE_SEP, @decH[0,1]);
686			exists $map->{$contract} and @decH = ($contract, $decH[2]);
687		    }
688		    # even if V's ignorable, LT contraction is not supported.
689		    # If such a situation were required, NFD should be used.
690		}
691		if (@decH == 3 && exists $max->{$decH[1]}) {
692		    my $contract = join(CODE_SEP, @decH[1,2]);
693		    exists $map->{$contract} and @decH = ($decH[0], $contract);
694		}
695	    }
696
697	    @ce = map({
698		    exists $map->{$_} ? @{ $map->{$_} } :
699		$uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
700		    $der->($_);
701		} @decH);
702	}
703    } elsif ($out && 0x10FFFF < $u) {
704	@ce = map _pack_override($_, $u, $der), $out->($u);
705    } else {
706	my $cjk  = $self->{overrideCJK};
707	my $vers = $self->{UCA_Version};
708	if ($cjk && _isUIdeo($u, $vers)) {
709	    @ce = map _pack_override($_, $u, $der), $cjk->($u);
710	} elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
711	    @ce = _uideoCE_8($u);
712	} else {
713	    @ce = $der->($u);
714	}
715    }
716    return map $self->varCE($_), @ce;
717}
718
719
720##
721## string sortkey = getSortKey(string arg)
722##
723sub getSortKey
724{
725    my $self = shift;
726    my $orig = shift;
727    my $str  = $self->process($orig);
728    my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
729    my $vers = $self->{UCA_Version};
730    my $term = $self->{hangul_terminator};
731    my $lev  = $self->{level};
732    my $iden = $self->{identical};
733
734    my @buf; # weight arrays
735    if ($term) {
736	my $preHST = '';
737	my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
738	foreach my $jcps (@$rEnt) {
739	    # weird things like VL, TL-contraction are not considered!
740	    my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
741	    if ($preHST && !$curHST || # hangul before non-hangul
742		$preHST =~ /L\z/ && $curHST =~ /^T/ ||
743		$preHST =~ /V\z/ && $curHST =~ /^L/ ||
744		$preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
745		push @buf, $termCE;
746	    }
747	    $preHST = $curHST;
748	    push @buf, $self->getWt($jcps);
749	}
750	push @buf, $termCE if $preHST; # end at hangul
751    } else {
752	foreach my $jcps (@$rEnt) {
753	    push @buf, $self->getWt($jcps);
754	}
755    }
756
757    my $rkey = $self->mk_SortKey(\@buf); ### XS only
758
759    if ($iden || $vers >= 26 && $lev == MaxLevel) {
760	$rkey .= LEVEL_SEP;
761	$rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
762    }
763    return $rkey;
764}
765
766
767##
768## int compare = cmp(string a, string b)
769##
770sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
771sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
772sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
773sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
774sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
775sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
776sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
777
778##
779## list[strings] sorted = sort(list[strings] arg)
780##
781sub sort {
782    my $obj = shift;
783    return
784	map { $_->[1] }
785	    sort{ $a->[0] cmp $b->[0] }
786		map [ $obj->getSortKey($_), $_ ], @_;
787}
788
789
790##
791## bool _nonIgnorAtLevel(arrayref weights, int level)
792##
793sub _nonIgnorAtLevel($$)
794{
795    my $wt = shift;
796    return if ! defined $wt;
797    my $lv = shift;
798    return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
799}
800
801##
802## bool _eqArray(
803##    arrayref of arrayref[weights] source,
804##    arrayref of arrayref[weights] substr,
805##    int level)
806## * comparison of graphemes vs graphemes.
807##   @$source >= @$substr must be true (check it before call this);
808##
809sub _eqArray($$$)
810{
811    my $source = shift;
812    my $substr = shift;
813    my $lev = shift;
814
815    for my $g (0..@$substr-1){
816	# Do the $g'th graphemes have the same number of AV weights?
817	return if @{ $source->[$g] } != @{ $substr->[$g] };
818
819	for my $w (0..@{ $substr->[$g] }-1) {
820	    for my $v (0..$lev-1) {
821		return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
822	    }
823	}
824    }
825    return 1;
826}
827
828##
829## (int position, int length)
830## int position = index(string, substring, position, [undoc'ed global])
831##
832## With "global" (only for the list context),
833##  returns list of arrayref[position, length].
834##
835sub index
836{
837    my $self = shift;
838    $self->{preprocess} and
839	croak "Don't use Preprocess with index(), match(), etc.";
840    $self->{normCode} and
841	croak "Don't use Normalization with index(), match(), etc.";
842
843    my $str  = shift;
844    my $len  = length($str);
845    my $sub  = shift;
846    my $subE = $self->splitEnt($sub);
847    my $pos  = @_ ? shift : 0;
848       $pos  = 0 if $pos < 0;
849    my $glob = shift;
850
851    my $lev  = $self->{level};
852    my $v2i  = $self->{UCA_Version} >= 9 &&
853		$self->{variable} ne 'non-ignorable';
854
855    if (! @$subE) {
856	my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
857	return $glob
858	    ? map([$_, 0], $temp..$len)
859	    : wantarray ? ($temp,0) : $temp;
860    }
861    $len < $pos
862	and return wantarray ? () : NOMATCHPOS;
863    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
864    @$strE
865	or return wantarray ? () : NOMATCHPOS;
866
867    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
868
869    my $last_is_variable;
870    for my $vwt (map $self->getWt($_), @$subE) {
871	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
872	my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
873
874	# "Ignorable (L1, L2) after Variable" since track. v. 9
875	if ($v2i) {
876	    if ($var) {
877		$last_is_variable = TRUE;
878	    }
879	    elsif (!$wt[0]) { # ignorable
880		$to_be_pushed = FALSE if $last_is_variable;
881	    }
882	    else {
883		$last_is_variable = FALSE;
884	    }
885	}
886
887	if (@subWt && !$var && !$wt[0]) {
888	    push @{ $subWt[-1] }, \@wt if $to_be_pushed;
889	} elsif ($to_be_pushed) {
890	    push @subWt, [ \@wt ];
891	}
892	# else ===> skipped
893    }
894
895    my $count = 0;
896    my $end = @$strE - 1;
897
898    $last_is_variable = FALSE; # reuse
899    for (my $i = 0; $i <= $end; ) { # no $i++
900	my $found_base = 0;
901
902	# fetch a grapheme
903	while ($i <= $end && $found_base == 0) {
904	    for my $vwt ($self->getWt($strE->[$i][0])) {
905		my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
906		my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
907
908		# "Ignorable (L1, L2) after Variable" since track. v. 9
909		if ($v2i) {
910		    if ($var) {
911			$last_is_variable = TRUE;
912		    }
913		    elsif (!$wt[0]) { # ignorable
914			$to_be_pushed = FALSE if $last_is_variable;
915		    }
916		    else {
917			$last_is_variable = FALSE;
918		    }
919		}
920
921		if (@strWt && !$var && !$wt[0]) {
922		    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
923		    $finPos[-1] = $strE->[$i][2];
924		} elsif ($to_be_pushed) {
925		    push @strWt, [ \@wt ];
926		    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
927		    $finPos[-1] = NOMATCHPOS if $found_base;
928		    push @finPos, $strE->[$i][2];
929		    $found_base++;
930		}
931		# else ===> no-op
932	    }
933	    $i++;
934	}
935
936	# try to match
937	while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
938	    if ($iniPos[0] != NOMATCHPOS &&
939		    $finPos[$#subWt] != NOMATCHPOS &&
940			_eqArray(\@strWt, \@subWt, $lev)) {
941		my $temp = $iniPos[0] + $pos;
942
943		if ($glob) {
944		    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
945		    splice @strWt,  0, $#subWt;
946		    splice @iniPos, 0, $#subWt;
947		    splice @finPos, 0, $#subWt;
948		}
949		else {
950		    return wantarray
951			? ($temp, $finPos[$#subWt] - $iniPos[0])
952			:  $temp;
953		}
954	    }
955	    shift @strWt;
956	    shift @iniPos;
957	    shift @finPos;
958	}
959    }
960
961    return $glob
962	? @g_ret
963	: wantarray ? () : NOMATCHPOS;
964}
965
966##
967## scalarref to matching part = match(string, substring)
968##
969sub match
970{
971    my $self = shift;
972    if (my($pos,$len) = $self->index($_[0], $_[1])) {
973	my $temp = substr($_[0], $pos, $len);
974	return wantarray ? $temp : \$temp;
975	# An lvalue ref \substr should be avoided,
976	# since its value is affected by modification of its referent.
977    }
978    else {
979	return;
980    }
981}
982
983##
984## arrayref matching parts = gmatch(string, substring)
985##
986sub gmatch
987{
988    my $self = shift;
989    my $str  = shift;
990    my $sub  = shift;
991    return map substr($str, $_->[0], $_->[1]),
992		$self->index($str, $sub, 0, 'g');
993}
994
995##
996## bool subst'ed = subst(string, substring, replace)
997##
998sub subst
999{
1000    my $self = shift;
1001    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1002
1003    if (my($pos,$len) = $self->index($_[0], $_[1])) {
1004	if ($code) {
1005	    my $mat = substr($_[0], $pos, $len);
1006	    substr($_[0], $pos, $len, $code->($mat));
1007	} else {
1008	    substr($_[0], $pos, $len, $_[2]);
1009	}
1010	return TRUE;
1011    }
1012    else {
1013	return FALSE;
1014    }
1015}
1016
1017##
1018## int count = gsubst(string, substring, replace)
1019##
1020sub gsubst
1021{
1022    my $self = shift;
1023    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1024    my $cnt = 0;
1025
1026    # Replacement is carried out from the end, then use reverse.
1027    for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1028	if ($code) {
1029	    my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1030	    substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1031	} else {
1032	    substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1033	}
1034	$cnt++;
1035    }
1036    return $cnt;
1037}
1038
10391;
1040__END__
1041
1042=head1 NAME
1043
1044Unicode::Collate - Unicode Collation Algorithm
1045
1046=head1 SYNOPSIS
1047
1048  use Unicode::Collate;
1049
1050  #construct
1051  $Collator = Unicode::Collate->new(%tailoring);
1052
1053  #sort
1054  @sorted = $Collator->sort(@not_sorted);
1055
1056  #compare
1057  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1058
1059B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1060according to Perl's Unicode support. See L<perlunicode>,
1061L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1062Otherwise you can use C<preprocess> or should decode them before.
1063
1064=head1 DESCRIPTION
1065
1066This module is an implementation of Unicode Technical Standard #10
1067(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1068
1069=head2 Constructor and Tailoring
1070
1071The C<new> method returns a collator object. If new() is called
1072with no parameters, the collator should do the default collation.
1073
1074   $Collator = Unicode::Collate->new(
1075      UCA_Version => $UCA_Version,
1076      alternate => $alternate, # alias for 'variable'
1077      backwards => $levelNumber, # or \@levelNumbers
1078      entry => $element,
1079      hangul_terminator => $term_primary_weight,
1080      highestFFFF => $bool,
1081      identical => $bool,
1082      ignoreName => qr/$ignoreName/,
1083      ignoreChar => qr/$ignoreChar/,
1084      ignore_level2 => $bool,
1085      katakana_before_hiragana => $bool,
1086      level => $collationLevel,
1087      long_contraction => $bool,
1088      minimalFFFE => $bool,
1089      normalization  => $normalization_form,
1090      overrideCJK => \&overrideCJK,
1091      overrideHangul => \&overrideHangul,
1092      preprocess => \&preprocess,
1093      rearrange => \@charList,
1094      rewrite => \&rewrite,
1095      suppress => \@charList,
1096      table => $filename,
1097      undefName => qr/$undefName/,
1098      undefChar => qr/$undefChar/,
1099      upper_before_lower => $bool,
1100      variable => $variable,
1101   );
1102
1103=over 4
1104
1105=item UCA_Version
1106
1107If the revision (previously "tracking version") number of UCA is given,
1108behavior of that revision is emulated on collating.
1109If omitted, the return value of C<UCA_Version()> is used.
1110
1111The following revisions are supported.  The default is 43.
1112
1113     UCA       Unicode Standard         DUCET (@version)
1114   -------------------------------------------------------
1115      8              3.1                3.0.1 (3.0.1d9)
1116      9     3.1 with Corrigendum 3      3.1.1
1117     11             4.0.0
1118     14             4.1.0
1119     16             5.0.0
1120     18             5.1.0
1121     20             5.2.0
1122     22             6.0.0
1123     24             6.1.0
1124     26             6.2.0
1125     28             6.3.0
1126     30             7.0.0
1127     32             8.0.0
1128     34             9.0.0
1129     36            10.0.0
1130     38            11.0.0
1131     40            12.0.0
1132     41            12.1.0
1133     43            13.0.0
1134
1135* See below for C<long_contraction> with C<UCA_Version> 22 and 24.
1136
1137* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1138since C<UCA_Version> 22.
1139
1140* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
1141and can be overridden since C<UCA_Version> 22.
1142
1143* Fully ignorable characters were ignored, and would not interrupt
1144contractions with C<UCA_Version> 9 and 11.
1145
1146* Treatment of ignorables after variables and some behaviors
1147were changed at C<UCA_Version> 9.
1148
1149* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1150depend on C<UCA_Version>.
1151
1152* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1153C<hangul_terminator>.
1154
1155=item alternate
1156
1157-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1158
1159For backward compatibility, C<alternate> (old name) can be used
1160as an alias for C<variable>.
1161
1162=item backwards
1163
1164-- see 3.4 Backward Accents, UTS #10.
1165
1166     backwards => $levelNumber or \@levelNumbers
1167
1168Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1169If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1170forwards at all the levels.
1171
1172=item entry
1173
1174-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
1175
1176If the same character (or a sequence of characters) exists
1177in the collation element table through C<table>,
1178mapping to collation elements is overridden.
1179If it does not exist, the mapping is defined additionally.
1180
1181    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11820063 0068 ; [.0E6A.0020.0002.0063] # ch
11830043 0068 ; [.0E6A.0020.0007.0043] # Ch
11840043 0048 ; [.0E6A.0020.0008.0043] # CH
1185006C 006C ; [.0F4C.0020.0002.006C] # ll
1186004C 006C ; [.0F4C.0020.0007.004C] # Ll
1187004C 004C ; [.0F4C.0020.0008.004C] # LL
118800F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1189006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
119000D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1191004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1192ENTRY
1193
1194    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
119500E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
119600C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1197ENTRY
1198
1199B<NOTE:> The code point in the UCA file format (before C<';'>)
1200B<must> be a Unicode code point (defined as hexadecimal),
1201but not a native code point.
1202So C<0063> must always denote C<U+0063>,
1203but not a character of C<"\x63">.
1204
1205Weighting may vary depending on collation element table.
1206So ensure the weights defined in C<entry> will be consistent with
1207those in the collation element table loaded via C<table>.
1208
1209In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1210and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1211(as a value between C<0E60> and C<0E6D>)
1212makes ordering as C<C E<lt> CH E<lt> D>.
1213Exactly speaking DUCET already has some characters between C<C> and C<D>:
1214C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1215C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1216and C<c-curl> (C<U+0255>) with C<0E69>.
1217Then primary weight C<0E6A> for C<CH> makes C<CH>
1218ordered between C<c-curl> and C<D>.
1219
1220=item hangul_terminator
1221
1222-- see 7.1.4 Trailing Weights, UTS #10.
1223
1224If a true value is given (non-zero but should be positive),
1225it will be added as a terminator primary weight to the end of
1226every standard Hangul syllable. Secondary and any higher weights
1227for terminator are set to zero.
1228If the value is false or C<hangul_terminator> key does not exist,
1229insertion of terminator weights will not be performed.
1230
1231Boundaries of Hangul syllables are determined
1232according to conjoining Jamo behavior in F<the Unicode Standard>
1233and F<HangulSyllableType.txt>.
1234
1235B<Implementation Note:>
1236(1) For expansion mapping (Unicode character mapped
1237to a sequence of collation elements), a terminator will not be added
1238between collation elements, even if Hangul syllable boundary exists there.
1239Addition of terminator is restricted to the next position
1240to the last collation element.
1241
1242(2) Non-conjoining Hangul letters
1243(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1244automatically terminated with a terminator primary weight.
1245These characters may need terminator included in a collation element
1246table beforehand.
1247
1248=item highestFFFF
1249
1250-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation.
1251
1252If the parameter is made true, C<U+FFFF> has a highest primary weight.
1253When a boolean of C<$coll-E<gt>ge($str, "abc")> and
1254C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
1255begins with C<"abc">, or another primary equivalent.
1256C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1257such as C<"abc\x{FFFF}xyz">.
1258
1259C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
1260almost, but the latter has a problem that you should know which letter is
1261next to C<c>. For a certain language where C<ch> as the next letter,
1262C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
1263
1264Note:
1265This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
1266Any other character than C<U+FFFF> can be tailored by C<entry>.
1267
1268=item identical
1269
1270-- see A.3 Deterministic Comparison, UTS #10.
1271
1272By default, strings whose weights are equal should be equal,
1273even though their code points are not equal.
1274Completely ignorable characters are ignored.
1275
1276If the parameter is made true, a final, tie-breaking level is used.
1277If no difference of weights is found after the comparison through
1278all the level specified by C<level>, the comparison with code points
1279will be performed.
1280For the tie-breaking comparison, the sort key has code points
1281of the original string appended.
1282Completely ignorable characters are not ignored.
1283
1284If C<preprocess> and/or C<normalization> is applied, the code points
1285of the string after them (in NFD by default) are used.
1286
1287=item ignoreChar
1288
1289=item ignoreName
1290
1291-- see 3.6 Variable Weighting, UTS #10.
1292
1293Makes the entry in the table completely ignorable;
1294i.e. as if the weights were zero at all level.
1295
1296Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1297will be ignored. Through C<ignoreName>, any character whose name
1298(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1299will be ignored.
1300
1301E.g. when 'a' and 'e' are ignorable,
1302'element' is equal to 'lament' (or 'lmnt').
1303
1304=item ignore_level2
1305
1306-- see 5.1 Parametric Tailoring, UTS #10.
1307
1308By default, case-sensitive comparison (that is level 3 difference)
1309won't ignore accents (that is level 2 difference).
1310
1311If the parameter is made true, accents (and other primary ignorable
1312characters) are ignored, even though cases are taken into account.
1313
1314B<NOTE>: C<level> should be 3 or greater.
1315
1316=item katakana_before_hiragana
1317
1318-- see 7.2 Tertiary Weight Table, UTS #10.
1319
1320By default, hiragana is before katakana.
1321If the parameter is made true, this is reversed.
1322
1323B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1324distinctions must occur in level 3, and their weights at level 3 must be
1325same as those mentioned in 7.3.1, UTS #10.
1326If you define your collation elements which violate this requirement,
1327this parameter does not work validly.
1328
1329=item level
1330
1331-- see 4.3 Form Sort Key, UTS #10.
1332
1333Set the maximum level.
1334Any higher levels than the specified one are ignored.
1335
1336  Level 1: alphabetic ordering
1337  Level 2: diacritic ordering
1338  Level 3: case ordering
1339  Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1340
1341  ex.level => 2,
1342
1343If omitted, the maximum is the 4th.
1344
1345B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1346But this module only uses weights within 0xFFFF.
1347When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1348and 'shift-trimmed'), the level 4 may be unreliable.
1349
1350See also C<identical>.
1351
1352=item long_contraction
1353
1354-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
1355
1356If the parameter is made true, for a contraction with three or more
1357characters (here nicknamed "long contraction"), initial substrings
1358will be handled.
1359For example, a contraction ABC, where A is a starter, and B and C
1360are non-starters (character with non-zero combining character class),
1361will be detected even if there is not AB as a contraction.
1362
1363B<Default:> Usually false.
1364If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
1365is not specified in C<new()>, a true value is set implicitly.
1366This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
1367
1368C<change()> handles C<long_contraction> explicitly only.
1369If C<long_contraction> is not specified in C<change()>, even though
1370C<UCA_Version> is changed, C<long_contraction> will not be changed.
1371
1372B<Limitation:> Scanning non-starters is one-way (no back tracking).
1373If AB is found but not ABC is not found, other long contraction where
1374the first character is A and the second is not B may not be found.
1375
1376Under C<(normalization =E<gt> undef)>, detection step of discontiguous
1377contractions will be skipped.
1378
1379B<Note:> The following contractions in DUCET are not considered
1380in steps S2.1.1 to S2.1.3, where they are discontiguous.
1381
1382    0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
1383    0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
1384
1385For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
1386(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
1387In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
1388instead of C<0FB2 0F71 0F80>.
1389Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
1390contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
1391
1392=item minimalFFFE
1393
1394-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation.
1395
1396If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1397The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1398first compares C<$a1> and C<$b1> at level 1, and
1399then C<$a2> and C<$b2> at level 1, as followed.
1400
1401        "ab\x{FFFE}a"
1402        "Ab\x{FFFE}a"
1403        "ab\x{FFFE}c"
1404        "Ab\x{FFFE}c"
1405        "ab\x{FFFE}xyz"
1406        "abc\x{FFFE}def"
1407        "abc\x{FFFE}xYz"
1408        "aBc\x{FFFE}xyz"
1409        "abcX\x{FFFE}def"
1410        "abcx\x{FFFE}xyz"
1411        "b\x{FFFE}aaa"
1412        "bbb\x{FFFE}a"
1413
1414Note:
1415This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
1416Any other character than C<U+FFFE> can be tailored by C<entry>.
1417
1418=item normalization
1419
1420-- see 4.1 Normalize, UTS #10.
1421
1422If specified, strings are normalized before preparation of sort keys
1423(the normalization is executed after preprocess).
1424
1425A form name C<Unicode::Normalize::normalize()> accepts will be applied
1426as C<$normalization_form>.
1427Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1428See C<Unicode::Normalize::normalize()> for detail.
1429If omitted, C<'NFD'> is used.
1430
1431C<normalization> is performed after C<preprocess> (if defined).
1432
1433Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1434though they are not concerned with C<Unicode::Normalize::normalize()>.
1435
1436If C<undef> (not a string C<"undef">) is passed explicitly
1437as the value for this key,
1438any normalization is not carried out (this may make tailoring easier
1439if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1440only contiguous contractions are resolved;
1441e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1442C<A-cedilla-ring> would be primary equal to C<A>.
1443In this point,
1444C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1445B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1446
1447In the case of C<(normalization =E<gt> "prenormalized")>,
1448any normalization is not performed, but
1449discontiguous contractions with combining characters are performed.
1450Therefore
1451C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1452B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1453If source strings are finely prenormalized,
1454C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1455
1456Except C<(normalization =E<gt> undef)>,
1457B<Unicode::Normalize> is required (see also B<CAVEAT>).
1458
1459=item overrideCJK
1460
1461-- see 7.1 Derived Collation Elements, UTS #10.
1462
1463By default, CJK unified ideographs are ordered in Unicode codepoint
1464order, but those in the CJK Unified Ideographs block are less than
1465those in the CJK Unified Ideographs Extension A etc.
1466
1467    In the CJK Unified Ideographs block:
1468    U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1469    U+4E00..U+9FBB if UCA_Version is 14 or 16.
1470    U+4E00..U+9FC3 if UCA_Version is 18.
1471    U+4E00..U+9FCB if UCA_Version is 20 or 22.
1472    U+4E00..U+9FCC if UCA_Version is 24 to 30.
1473    U+4E00..U+9FD5 if UCA_Version is 32 or 34.
1474    U+4E00..U+9FEA if UCA_Version is 36.
1475    U+4E00..U+9FEF if UCA_Version is 38, 40 or 41.
1476    U+4E00..U+9FFC if UCA_Version is 43.
1477
1478    In the CJK Unified Ideographs Extension blocks:
1479    Ext.A (U+3400..U+4DB5)   if UCA_Version is  8 to 41.
1480    Ext.A (U+3400..U+4DBF)   if UCA_Version is 43.
1481    Ext.B (U+20000..U+2A6D6) if UCA_Version is  8 to 41.
1482    Ext.B (U+20000..U+2A6DD) if UCA_Version is 43.
1483    Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
1484    Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
1485    Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32 or later.
1486    Ext.F (U+2CEB0..U+2EBE0) if UCA_Version is 36 or later.
1487    Ext.G (U+30000..U+3134A) if UCA_Version is 43.
1488
1489Through C<overrideCJK>, ordering of CJK unified ideographs (including
1490extensions) can be overridden.
1491
1492ex. CJK unified ideographs in the JIS code point order.
1493
1494  overrideCJK => sub {
1495      my $u = shift;             # get a Unicode codepoint
1496      my $b = pack('n', $u);     # to UTF-16BE
1497      my $s = your_unicode_to_sjis_converter($b); # convert
1498      my $n = unpack('n', $s);   # convert sjis to short
1499      [ $n, 0x20, 0x2, $u ];     # return the collation element
1500  },
1501
1502The return value may be an arrayref of 1st to 4th weights as shown
1503above. The return value may be an integer as the primary weight
1504as shown below.  If C<undef> is returned, the default derived
1505collation element will be used.
1506
1507  overrideCJK => sub {
1508      my $u = shift;             # get a Unicode codepoint
1509      my $b = pack('n', $u);     # to UTF-16BE
1510      my $s = your_unicode_to_sjis_converter($b); # convert
1511      my $n = unpack('n', $s);   # convert sjis to short
1512      return $n;                 # return the primary weight
1513  },
1514
1515The return value may be a list containing zero or more of
1516an arrayref, an integer, or C<undef>.
1517
1518ex. ignores all CJK unified ideographs.
1519
1520  overrideCJK => sub {()}, # CODEREF returning empty list
1521
1522   # where ->eq("Pe\x{4E00}rl", "Perl") is true
1523   # as U+4E00 is a CJK unified ideograph and to be ignorable.
1524
1525If a false value (including C<undef>) is passed, C<overrideCJK>
1526has no effect.
1527C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
1528
1529But assignment of weight for CJK unified ideographs
1530in C<table> or C<entry> is still valid.
1531If C<undef> is passed explicitly as the value for this key,
1532weights for CJK unified ideographs are treated as undefined.
1533However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
1534has no special meaning.
1535
1536B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1537C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1538C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1539ideographs. But they can't be overridden via C<overrideCJK> when you use
1540DUCET, as the table includes weights for them. C<table> or C<entry> has
1541priority over C<overrideCJK>.
1542
1543=item overrideHangul
1544
1545-- see 7.1 Derived Collation Elements, UTS #10.
1546
1547By default, Hangul syllables are decomposed into Hangul Jamo,
1548even if C<(normalization =E<gt> undef)>.
1549But the mapping of Hangul syllables may be overridden.
1550
1551This parameter works like C<overrideCJK>, so see there for examples.
1552
1553If you want to override the mapping of Hangul syllables,
1554NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1555Hangul syllables before overriding. FCD may decompose Hangul syllables
1556as the case may be.
1557
1558If a false value (but not C<undef>) is passed, C<overrideHangul>
1559has no effect.
1560C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
1561
1562If C<undef> is passed explicitly as the value for this key,
1563weight for Hangul syllables is treated as undefined
1564without decomposition into Hangul Jamo.
1565But definition of weight for Hangul syllables
1566in C<table> or C<entry> is still valid.
1567
1568=item overrideOut
1569
1570-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
1571
1572Perl seems to allow out-of-range values (greater than 0x10FFFF).
1573By default, out-of-range values are replaced with C<U+FFFD>
1574(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
1575or ignored when C<UCA_Version> E<lt>= 20.
1576
1577When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
1578can be overridden. Though C<table> or C<entry> are available for them,
1579out-of-range values are too many.
1580
1581C<overrideOut> can perform it algorithmically.
1582This parameter works like C<overrideCJK>, so see there for examples.
1583
1584ex. ignores all out-of-range values.
1585
1586  overrideOut => sub {()}, # CODEREF returning empty list
1587
1588If a false value (including C<undef>) is passed, C<overrideOut>
1589has no effect.
1590C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
1591
1592B<NOTE ABOUT U+FFFD:>
1593
1594UCA recommends that out-of-range values should not be ignored for security
1595reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
1596However, C<U+FFFD> is wrongly mapped to a variable collation element
1597in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
1598ignored when C<variable> isn't C<Non-ignorable>.
1599
1600The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
1601see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
1602(7.1.4 Trailing Weights). Such a correction is reproduced by this.
1603
1604  overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
1605
1606This workaround is unnecessary since Unicode 6.3.0.
1607
1608=item preprocess
1609
1610-- see 5.4 Preprocessing, UTS #10.
1611
1612If specified, the coderef is used to preprocess each string
1613before the formation of sort keys.
1614
1615ex. dropping English articles, such as "a" or "the".
1616Then, "the pen" is before "a pencil".
1617
1618     preprocess => sub {
1619           my $str = shift;
1620           $str =~ s/\b(?:an?|the)\s+//gi;
1621           return $str;
1622        },
1623
1624C<preprocess> is performed before C<normalization> (if defined).
1625
1626ex. decoding strings in a legacy encoding such as shift-jis:
1627
1628    $sjis_collator = Unicode::Collate->new(
1629        preprocess => \&your_shiftjis_to_unicode_decoder,
1630    );
1631    @result = $sjis_collator->sort(@shiftjis_strings);
1632
1633B<Note:> Strings returned from the coderef will be interpreted
1634according to Perl's Unicode support. See L<perlunicode>,
1635L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1636
1637=item rearrange
1638
1639-- see 3.5 Rearrangement, UTS #10.
1640
1641Characters that are not coded in logical order and to be rearranged.
1642If C<UCA_Version> is equal to or less than 11, default is:
1643
1644    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1645
1646If you want to disallow any rearrangement, pass C<undef> or C<[]>
1647(a reference to empty list) as the value for this key.
1648
1649If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1650(i.e. no rearrangement).
1651
1652B<According to the version 9 of UCA, this parameter shall not be used;
1653but it is not warned at present.>
1654
1655=item rewrite
1656
1657If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1658The coderef will get each line, and then should return a rewritten line
1659according to the UCA file format.
1660If the coderef returns an empty line, the line will be skipped.
1661
1662e.g. any primary ignorable characters into tertiary ignorable:
1663
1664    rewrite => sub {
1665        my $line = shift;
1666        $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1667        return $line;
1668    },
1669
1670This example shows rewriting weights. C<rewrite> is allowed to
1671affect code points, weights, and the name.
1672
1673B<NOTE>: C<table> is available to use another table file;
1674preparing a modified table once would be more efficient than
1675rewriting lines on reading an unmodified table every time.
1676
1677=item suppress
1678
1679-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation.
1680
1681Contractions beginning with the specified characters are suppressed,
1682even if those contractions are defined in C<table>.
1683
1684An example for Russian and some languages using the Cyrillic script:
1685
1686    suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1687
1688where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1689
1690B<NOTE>: Contractions via C<entry> will not be suppressed.
1691
1692=item table
1693
1694-- see 3.8 Default Unicode Collation Element Table, UTS #10.
1695
1696You can use another collation element table if desired.
1697
1698The table file should locate in the F<Unicode/Collate> directory
1699on C<@INC>. Say, if the filename is F<Foo.txt>,
1700the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1701
1702By default, F<allkeys.txt> (as the filename of DUCET) is used.
1703If you will prepare your own table file, any name other than F<allkeys.txt>
1704may be better to avoid namespace conflict.
1705
1706B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1707module, and it may save time at the run time.
1708Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
1709or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1710C<rewrite> will prevent this module from using the compiled DUCET.
1711
1712If C<undef> is passed explicitly as the value for this key,
1713no file is read (but you can define collation elements via C<entry>).
1714
1715A typical way to define a collation element table
1716without any file of table:
1717
1718   $onlyABC = Unicode::Collate->new(
1719       table => undef,
1720       entry => << 'ENTRIES',
17210061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
17220041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
17230062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
17240042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
17250063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
17260043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1727ENTRIES
1728    );
1729
1730If C<ignoreName> or C<undefName> is used, character names should be
1731specified as a comment (following C<#>) on each line.
1732
1733=item undefChar
1734
1735=item undefName
1736
1737-- see 6.3.3 Reducing the Repertoire, UTS #10.
1738
1739Undefines the collation element as if it were unassigned in the C<table>.
1740This reduces the size of the table.
1741If an unassigned character appears in the string to be collated,
1742the sort key is made from its codepoint
1743as a single-character collation element,
1744as it is greater than any other assigned collation elements
1745(in the codepoint order among the unassigned characters).
1746But, it'd be better to ignore characters
1747unfamiliar to you and maybe never used.
1748
1749Through C<undefChar>, any character matching C<qr/$undefChar/>
1750will be undefined. Through C<undefName>, any character whose name
1751(given in the C<table> file as a comment) matches C<qr/$undefName/>
1752will be undefined.
1753
1754ex. Collation weights for beyond-BMP characters are not stored in object:
1755
1756    undefChar => qr/[^\0-\x{fffd}]/,
1757
1758=item upper_before_lower
1759
1760-- see 6.6 Case Comparisons, UTS #10.
1761
1762By default, lowercase is before uppercase.
1763If the parameter is made true, this is reversed.
1764
1765B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1766distinctions must occur in level 3, and their weights at level 3 must be
1767same as those mentioned in 7.3.1, UTS #10.
1768If you define your collation elements which differs from this requirement,
1769this parameter doesn't work validly.
1770
1771=item variable
1772
1773-- see 3.6 Variable Weighting, UTS #10.
1774
1775This key allows for variable weighting of variable collation elements,
1776which are marked with an ASTERISK in the table
1777(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1778
1779   variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1780
1781These names are case-insensitive.
1782By default (if specification is omitted), 'shifted' is adopted.
1783
1784   'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1785                    considered at the 4th level.
1786
1787   'Non-Ignorable'  Variable elements are not reset to ignorable.
1788
1789   'Shifted'        Variable elements are made ignorable at levels 1 through 3
1790                    their level 4 weight is replaced by the old level 1 weight.
1791                    Level 4 weight for Non-Variable elements is 0xFFFF.
1792
1793   'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1794                    are trimmed.
1795
1796=back
1797
1798=head2 Methods for Collation
1799
1800=over 4
1801
1802=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1803
1804Sorts a list of strings.
1805
1806=item C<$result = $Collator-E<gt>cmp($a, $b)>
1807
1808Returns 1 (when C<$a> is greater than C<$b>)
1809or 0 (when C<$a> is equal to C<$b>)
1810or -1 (when C<$a> is less than C<$b>).
1811
1812=item C<$result = $Collator-E<gt>eq($a, $b)>
1813
1814=item C<$result = $Collator-E<gt>ne($a, $b)>
1815
1816=item C<$result = $Collator-E<gt>lt($a, $b)>
1817
1818=item C<$result = $Collator-E<gt>le($a, $b)>
1819
1820=item C<$result = $Collator-E<gt>gt($a, $b)>
1821
1822=item C<$result = $Collator-E<gt>ge($a, $b)>
1823
1824They works like the same name operators as theirs.
1825
1826   eq : whether $a is equal to $b.
1827   ne : whether $a is not equal to $b.
1828   lt : whether $a is less than $b.
1829   le : whether $a is less than $b or equal to $b.
1830   gt : whether $a is greater than $b.
1831   ge : whether $a is greater than $b or equal to $b.
1832
1833=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1834
1835-- see 4.3 Form Sort Key, UTS #10.
1836
1837Returns a sort key.
1838
1839You compare the sort keys using a binary comparison
1840and get the result of the comparison of the strings using UCA.
1841
1842   $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1843
1844      is equivalent to
1845
1846   $Collator->cmp($a, $b)
1847
1848=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1849
1850Converts a sorting key into its representation form.
1851If C<UCA_Version> is 8, the output is slightly different.
1852
1853   use Unicode::Collate;
1854   my $c = Unicode::Collate->new();
1855   print $c->viewSortKey("Perl"),"\n";
1856
1857   # output:
1858   # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1859   #  Level 1               Level 2               Level 3               Level 4
1860
1861=back
1862
1863=head2 Methods for Searching
1864
1865The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1866like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1867but they are not aware of any pattern, but only a literal substring.
1868
1869B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1870for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1871C<subst>, C<gsubst>) is croaked, as the position and the length might
1872differ from those on the specified string.
1873
1874C<rearrange> and C<hangul_terminator> parameters are neglected.
1875C<katakana_before_hiragana> and C<upper_before_lower> don't affect
1876matching and searching, as it doesn't matter whether greater or less.
1877
1878=over 4
1879
1880=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1881
1882=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1883
1884If C<$substring> matches a part of C<$string>, returns
1885the position of the first occurrence of the matching part in scalar context;
1886in list context, returns a two-element list of
1887the position and the length of the matching part.
1888
1889If C<$substring> does not match any part of C<$string>,
1890returns C<-1> in scalar context and
1891an empty list in list context.
1892
1893e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
1894you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
1895
1896  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1897                                     # (normalization => undef) is REQUIRED.
1898  my $match;
1899  if (my($pos,$len) = $Collator->index($str, $sub)) {
1900      $match = substr($str, $pos, $len);
1901  }
1902
1903and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
1904is primary equal to C<"M>E<uuml>C<SS">.
1905
1906=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1907
1908=item C<($match)   = $Collator-E<gt>match($string, $substring)>
1909
1910If C<$substring> matches a part of C<$string>, in scalar context, returns
1911B<a reference to> the first occurrence of the matching part
1912(C<$match_ref> is always true if matches,
1913since every reference is B<true>);
1914in list context, returns the first occurrence of the matching part.
1915
1916If C<$substring> does not match any part of C<$string>,
1917returns C<undef> in scalar context and
1918an empty list in list context.
1919
1920e.g.
1921
1922    if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1923	print "matches [$$match_ref].\n";
1924    } else {
1925	print "doesn't match.\n";
1926    }
1927
1928     or
1929
1930    if (($match) = $Collator->match($str, $sub)) { # list context
1931	print "matches [$match].\n";
1932    } else {
1933	print "doesn't match.\n";
1934    }
1935
1936=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1937
1938If C<$substring> matches a part of C<$string>, returns
1939all the matching parts (or matching count in scalar context).
1940
1941If C<$substring> does not match any part of C<$string>,
1942returns an empty list.
1943
1944=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1945
1946If C<$substring> matches a part of C<$string>,
1947the first occurrence of the matching part is replaced by C<$replacement>
1948(C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
1949
1950C<$replacement> can be a C<CODEREF>,
1951taking the matching part as an argument,
1952and returning a string to replace the matching part
1953(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1954
1955=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1956
1957If C<$substring> matches a part of C<$string>,
1958all the occurrences of the matching part are replaced by C<$replacement>
1959(C<$string> is modified) and C<$count> is returned.
1960
1961C<$replacement> can be a C<CODEREF>,
1962taking the matching part as an argument,
1963and returning a string to replace the matching part
1964(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1965
1966e.g.
1967
1968  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1969                                     # (normalization => undef) is REQUIRED.
1970  my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
1971  $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1972
1973  # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
1974  # i.e., all the camels are made bold-faced.
1975
1976   Examples: levels and ignore_level2 - what does camel match?
1977  ---------------------------------------------------------------------------
1978   level  ignore_level2  |  camel  Camel  came\x{301}l  c-a-m-e-l  cam\0e\0l
1979  -----------------------|---------------------------------------------------
1980     1        false      |   yes    yes      yes          yes        yes
1981     2        false      |   yes    yes      no           yes        yes
1982     3        false      |   yes    no       no           yes        yes
1983     4        false      |   yes    no       no           no         yes
1984  -----------------------|---------------------------------------------------
1985     1        true       |   yes    yes      yes          yes        yes
1986     2        true       |   yes    yes      yes          yes        yes
1987     3        true       |   yes    no       yes          yes        yes
1988     4        true       |   yes    no       yes          no         yes
1989  ---------------------------------------------------------------------------
1990   note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1991         at any level.
1992
1993=back
1994
1995=head2 Other Methods
1996
1997=over 4
1998
1999=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
2000
2001=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
2002
2003Changes the value of specified keys and returns the changed part.
2004
2005    $Collator = Unicode::Collate->new(level => 4);
2006
2007    $Collator->eq("perl", "PERL"); # false
2008
2009    %old = $Collator->change(level => 2); # returns (level => 4).
2010
2011    $Collator->eq("perl", "PERL"); # true
2012
2013    $Collator->change(%old); # returns (level => 2).
2014
2015    $Collator->eq("perl", "PERL"); # false
2016
2017Not all C<(key,value)>s are allowed to be changed.
2018See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
2019
2020In the scalar context, returns the modified collator
2021(but it is B<not> a clone from the original).
2022
2023    $Collator->change(level => 2)->eq("perl", "PERL"); # true
2024
2025    $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
2026
2027    $Collator->change(level => 4)->eq("perl", "PERL"); # false
2028
2029=item C<$version = $Collator-E<gt>version()>
2030
2031Returns the version number (a string) of the Unicode Standard
2032which the C<table> file used by the collator object is based on.
2033If the table does not include a version line (starting with C<@version>),
2034returns C<"unknown">.
2035
2036=item C<UCA_Version()>
2037
2038Returns the revision number of UTS #10 this module consults,
2039that should correspond with the DUCET incorporated.
2040
2041=item C<Base_Unicode_Version()>
2042
2043Returns the version number of UTS #10 this module consults,
2044that should correspond with the DUCET incorporated.
2045
2046=back
2047
2048=head1 EXPORT
2049
2050No method will be exported.
2051
2052=head1 INSTALL
2053
2054Though this module can be used without any C<table> file,
2055to use this module easily, it is recommended to install a table file
2056in the UCA format, by copying it under the directory
2057<a place in @INC>/Unicode/Collate.
2058
2059The most preferable one is "The Default Unicode Collation Element Table"
2060(aka DUCET), available from the Unicode Consortium's website:
2061
2062   http://www.unicode.org/Public/UCA/
2063
2064   http://www.unicode.org/Public/UCA/latest/allkeys.txt
2065   (latest version)
2066
2067If DUCET is not installed, it is recommended to copy the file
2068from http://www.unicode.org/Public/UCA/latest/allkeys.txt
2069to <a place in @INC>/Unicode/Collate/allkeys.txt
2070manually.
2071
2072=head1 CAVEATS
2073
2074=over 4
2075
2076=item Normalization
2077
2078Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2079module (see L<Unicode::Normalize>).
2080
2081If you need not it (say, in the case when you need not
2082handle any combining characters),
2083assign C<(normalization =E<gt> undef)> explicitly.
2084
2085-- see 6.5 Avoiding Normalization, UTS #10.
2086
2087=item Conformance Test
2088
2089The Conformance Test for the UCA is available
2090under L<http://www.unicode.org/Public/UCA/>.
2091
2092For F<CollationTest_SHIFTED.txt>,
2093a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2094for F<CollationTest_NON_IGNORABLE.txt>, a collator via
2095C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
2096
2097If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
2098C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
2099C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
2100C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2101
2102B<Unicode::Normalize is required to try The Conformance Test.>
2103
2104=back
2105
2106=head1 AUTHOR, COPYRIGHT AND LICENSE
2107
2108The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
2109<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2020,
2110SADAHIRO Tomoyuki. Japan. All rights reserved.
2111
2112This module is free software; you can redistribute it and/or
2113modify it under the same terms as Perl itself.
2114
2115The file Unicode/Collate/allkeys.txt was copied verbatim
2116from L<http://www.unicode.org/Public/UCA/13.0.0/allkeys.txt>.
2117For this file, Copyright (c) 2020 Unicode, Inc.; distributed
2118under the Terms of Use in L<http://www.unicode.org/terms_of_use.html>
2119
2120=head1 SEE ALSO
2121
2122=over 4
2123
2124=item Unicode Collation Algorithm - UTS #10
2125
2126L<http://www.unicode.org/reports/tr10/>
2127
2128=item The Default Unicode Collation Element Table (DUCET)
2129
2130L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
2131
2132=item The conformance test for the UCA
2133
2134L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
2135
2136L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
2137
2138=item Hangul Syllable Type
2139
2140L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
2141
2142=item Unicode Normalization Forms - UAX #15
2143
2144L<http://www.unicode.org/reports/tr15/>
2145
2146=item Unicode Locale Data Markup Language (LDML) - UTS #35
2147
2148L<http://www.unicode.org/reports/tr35/>
2149
2150=back
2151
2152=cut
2153