1package Unicode::Collate;
2
3BEGIN {
4    unless ("A" eq pack('U', 0x41)) {
5	die "Unicode::Collate cannot stringify a Unicode code point\n";
6    }
7    unless (0x41 == unpack('U', 'A')) {
8	die "Unicode::Collate cannot get a Unicode code point\n";
9    }
10}
11
12use 5.006;
13use strict;
14use warnings;
15use Carp;
16use File::Spec;
17
18no warnings 'utf8';
19
20our $VERSION = '1.27';
21our $PACKAGE = __PACKAGE__;
22
23### begin XS only ###
24use XSLoader ();
25XSLoader::load('Unicode::Collate', $VERSION);
26### end XS only ###
27
28my @Path = qw(Unicode Collate);
29my $KeyFile = 'allkeys.txt';
30
31# Perl's boolean
32use constant TRUE  => 1;
33use constant FALSE => "";
34use constant NOMATCHPOS => -1;
35
36# A coderef to get combining class imported from Unicode::Normalize
37# (i.e. \&Unicode::Normalize::getCombinClass).
38# This is also used as a HAS_UNICODE_NORMALIZE flag.
39my $CVgetCombinClass;
40
41# Supported Levels
42use constant MinLevel => 1;
43use constant MaxLevel => 4;
44
45# Minimum weights at level 2 and 3, respectively
46use constant Min2Wt => 0x20;
47use constant Min3Wt => 0x02;
48
49# Shifted weight at 4th level
50use constant Shift4Wt => 0xFFFF;
51
52# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
53use constant VCE_TEMPLATE => 'Cn4';
54
55# A sort key: 16-bit weights
56use constant KEY_TEMPLATE => 'n*';
57
58# The tie-breaking: 32-bit weights
59use constant TIE_TEMPLATE => 'N*';
60
61# Level separator in a sort key:
62# i.e. pack(KEY_TEMPLATE, 0)
63use constant LEVEL_SEP => "\0\0";
64
65# As Unicode code point separator for hash keys.
66# A joined code point string (denoted by JCPS below)
67# like "65;768" is used for internal processing
68# instead of Perl's Unicode string like "\x41\x{300}",
69# as the native code point is different from the Unicode code point
70# on EBCDIC platform.
71# This character must not be included in any stringified
72# representation of an integer.
73use constant CODE_SEP => ';';
74	# NOTE: in regex /;/ is used for $jcps!
75
76# boolean values of variable weights
77use constant NON_VAR => 0; # Non-Variable character
78use constant VAR     => 1; # Variable character
79
80# specific code points
81use constant Hangul_SIni   => 0xAC00;
82use constant Hangul_SFin   => 0xD7A3;
83
84# Logical_Order_Exception in PropList.txt
85my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
86
87# for highestFFFF and minimalFFFE
88my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
89my $minimalVCE = pack(VCE_TEMPLATE, 0,      1, 0x20, 0x5, 0xFFFE);
90
91sub UCA_Version { '36' }
92
93sub Base_Unicode_Version { '10.0.0' }
94
95######
96
97sub pack_U {
98    return pack('U*', @_);
99}
100
101sub unpack_U {
102    return unpack('U*', shift(@_).pack('U*'));
103}
104
105######
106
107my (%VariableOK);
108@VariableOK{ qw/
109    blanked  non-ignorable  shifted  shift-trimmed
110  / } = (); # keys lowercased
111
112our @ChangeOK = qw/
113    alternate backwards level normalization rearrange
114    katakana_before_hiragana upper_before_lower ignore_level2
115    overrideCJK overrideHangul overrideOut preprocess UCA_Version
116    hangul_terminator variable identical highestFFFF minimalFFFE
117    long_contraction
118  /;
119
120our @ChangeNG = qw/
121    entry mapping table maxlength contraction
122    ignoreChar ignoreName undefChar undefName rewrite
123    versionTable alternateTable backwardsTable forwardsTable
124    rearrangeTable variableTable
125    derivCode normCode rearrangeHash backwardsFlag
126    suppress suppressHash
127    __useXS /; ### XS only
128# The hash key 'ignored' was deleted at v 0.21.
129# The hash key 'isShift' was deleted at v 0.23.
130# The hash key 'combining' was deleted at v 0.24.
131# The hash key 'entries' was deleted at v 0.30.
132# The hash key 'L3_ignorable' was deleted at v 0.40.
133
134sub version {
135    my $self = shift;
136    return $self->{versionTable} || 'unknown';
137}
138
139my (%ChangeOK, %ChangeNG);
140@ChangeOK{ @ChangeOK } = ();
141@ChangeNG{ @ChangeNG } = ();
142
143sub change {
144    my $self = shift;
145    my %hash = @_;
146    my %old;
147    if (exists $hash{alternate}) {
148	if (exists $hash{variable}) {
149	    delete $hash{alternate};
150	} else {
151	    $hash{variable} = $hash{alternate};
152	}
153    }
154    foreach my $k (keys %hash) {
155	if (exists $ChangeOK{$k}) {
156	    $old{$k} = $self->{$k};
157	    $self->{$k} = $hash{$k};
158	} elsif (exists $ChangeNG{$k}) {
159	    croak "change of $k via change() is not allowed!";
160	}
161	# else => ignored
162    }
163    $self->checkCollator();
164    return wantarray ? %old : $self;
165}
166
167sub _checkLevel {
168    my $level = shift;
169    my $key   = shift; # 'level' or 'backwards'
170    MinLevel <= $level or croak sprintf
171	"Illegal level %d (in value for key '%s') lower than %d.",
172	    $level, $key, MinLevel;
173    $level <= MaxLevel or croak sprintf
174	"Unsupported level %d (in value for key '%s') higher than %d.",
175	    $level, $key, MaxLevel;
176}
177
178my %DerivCode = (
179    8 => \&_derivCE_8,
180    9 => \&_derivCE_9,
181   11 => \&_derivCE_9, # 11 == 9
182   14 => \&_derivCE_14,
183   16 => \&_derivCE_14, # 16 == 14
184   18 => \&_derivCE_18,
185   20 => \&_derivCE_20,
186   22 => \&_derivCE_22,
187   24 => \&_derivCE_24,
188   26 => \&_derivCE_24, # 26 == 24
189   28 => \&_derivCE_24, # 28 == 24
190   30 => \&_derivCE_24, # 30 == 24
191   32 => \&_derivCE_32,
192   34 => \&_derivCE_34,
193   36 => \&_derivCE_36,
194);
195
196sub checkCollator {
197    my $self = shift;
198    _checkLevel($self->{level}, 'level');
199
200    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
201	or croak "Illegal UCA version (passed $self->{UCA_Version}).";
202
203    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
204				$self->{alternateTable} || 'shifted';
205    $self->{variable} = $self->{alternate} = lc($self->{variable});
206    exists $VariableOK{ $self->{variable} }
207	or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
208
209    if (! defined $self->{backwards}) {
210	$self->{backwardsFlag} = 0;
211    } elsif (! ref $self->{backwards}) {
212	_checkLevel($self->{backwards}, 'backwards');
213	$self->{backwardsFlag} = 1 << $self->{backwards};
214    } else {
215	my %level;
216	$self->{backwardsFlag} = 0;
217	for my $b (@{ $self->{backwards} }) {
218	    _checkLevel($b, 'backwards');
219	    $level{$b} = 1;
220	}
221	for my $v (sort keys %level) {
222	    $self->{backwardsFlag} += 1 << $v;
223	}
224    }
225
226    defined $self->{rearrange} or $self->{rearrange} = [];
227    ref $self->{rearrange}
228	or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
229
230    # keys of $self->{rearrangeHash} are $self->{rearrange}.
231    $self->{rearrangeHash} = undef;
232
233    if (@{ $self->{rearrange} }) {
234	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
235    }
236
237    $self->{normCode} = undef;
238
239    if (defined $self->{normalization}) {
240	eval { require Unicode::Normalize };
241	$@ and croak "Unicode::Normalize is required to normalize strings";
242
243	$CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
244
245	if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
246	    $self->{normCode} = \&Unicode::Normalize::NFD;
247	}
248	elsif ($self->{normalization} ne 'prenormalized') {
249	    my $norm = $self->{normalization};
250	    $self->{normCode} = sub {
251		Unicode::Normalize::normalize($norm, shift);
252	    };
253	    eval { $self->{normCode}->("") }; # try
254	    $@ and croak "$PACKAGE unknown normalization form name: $norm";
255	}
256    }
257    return;
258}
259
260sub new
261{
262    my $class = shift;
263    my $self = bless { @_ }, $class;
264
265### begin XS only ###
266    if (! exists $self->{table}     && !defined $self->{rewrite} &&
267	!defined $self->{undefName} && !defined $self->{ignoreName} &&
268	!defined $self->{undefChar} && !defined $self->{ignoreChar}) {
269	$self->{__useXS} = \&_fetch_simple;
270    } else {
271	$self->{__useXS} = undef;
272    }
273### end XS only ###
274
275    # keys of $self->{suppressHash} are $self->{suppress}.
276    if ($self->{suppress} && @{ $self->{suppress} }) {
277	@{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
278    } # before read_table()
279
280    # If undef is passed explicitly, no file is read.
281    $self->{table} = $KeyFile if ! exists $self->{table};
282    $self->read_table() if defined $self->{table};
283
284    if ($self->{entry}) {
285	while ($self->{entry} =~ /([^\n]+)/g) {
286	    $self->parseEntry($1, TRUE);
287	}
288    }
289
290    # only in new(), not in change()
291    $self->{level} ||= MaxLevel;
292    $self->{UCA_Version} ||= UCA_Version();
293
294    $self->{overrideHangul} = FALSE
295	if ! exists $self->{overrideHangul};
296    $self->{overrideCJK} = FALSE
297	if ! exists $self->{overrideCJK};
298    $self->{normalization} = 'NFD'
299	if ! exists $self->{normalization};
300    $self->{rearrange} = $self->{rearrangeTable} ||
301	($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
302	if ! exists $self->{rearrange};
303    $self->{backwards} = $self->{backwardsTable}
304	if ! exists $self->{backwards};
305    exists $self->{long_contraction} or $self->{long_contraction}
306	= 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
307
308    # checkCollator() will be called in change()
309    $self->checkCollator();
310
311    return $self;
312}
313
314sub parseAtmark {
315    my $self = shift;
316    my $line = shift; # after s/^\s*\@//
317
318    if ($line =~ /^version\s*(\S*)/) {
319	$self->{versionTable} ||= $1;
320    }
321    elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
322	$self->{variableTable} ||= $1;
323    }
324    elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
325	$self->{alternateTable} ||= $1;
326    }
327    elsif ($line =~ /^backwards\s+(\S*)/) {
328	push @{ $self->{backwardsTable} }, $1;
329    }
330    elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
331	push @{ $self->{forwardsTable} }, $1;
332    }
333    elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
334	push @{ $self->{rearrangeTable} }, _getHexArray($1);
335    }
336}
337
338sub read_table {
339    my $self = shift;
340
341### begin XS only ###
342    if ($self->{__useXS}) {
343	my @rest = _fetch_rest(); # complex matter need to parse
344	for my $line (@rest) {
345	    next if $line =~ /^\s*#/;
346
347	    if ($line =~ s/^\s*\@//) {
348		$self->parseAtmark($line);
349	    } else {
350		$self->parseEntry($line);
351	    }
352	}
353	return;
354    }
355### end XS only ###
356
357    my($f, $fh);
358    foreach my $d (@INC) {
359	$f = File::Spec->catfile($d, @Path, $self->{table});
360	last if open($fh, $f);
361	$f = undef;
362    }
363    if (!defined $f) {
364	$f = File::Spec->catfile(@Path, $self->{table});
365	croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
366    }
367
368    while (my $line = <$fh>) {
369	next if $line =~ /^\s*#/;
370
371	if ($line =~ s/^\s*\@//) {
372	    $self->parseAtmark($line);
373	} else {
374	    $self->parseEntry($line);
375	}
376    }
377    close $fh;
378}
379
380
381##
382## get $line, parse it, and write an entry in $self
383##
384sub parseEntry
385{
386    my $self = shift;
387    my $line = shift;
388    my $tailoring = shift;
389    my($name, $entry, @uv, @key);
390
391    if (defined $self->{rewrite}) {
392	$line = $self->{rewrite}->($line);
393    }
394
395    return if $line !~ /^\s*[0-9A-Fa-f]/;
396
397    # removes comment and gets name
398    $name = $1
399	if $line =~ s/[#%]\s*(.*)//;
400    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
401
402    # gets element
403    my($e, $k) = split /;/, $line;
404    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
405	if ! $k;
406
407    @uv = _getHexArray($e);
408    return if !@uv;
409    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
410		  exists $self->{suppressHash}{$uv[0]};
411    $entry = join(CODE_SEP, @uv); # in JCPS
412
413    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
414	my $ele = pack_U(@uv);
415
416	# regarded as if it were not stored in the table
417	return
418	    if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
419
420	# replaced as completely ignorable
421	$k = '[.0000.0000.0000.0000]'
422	    if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
423    }
424
425    # replaced as completely ignorable
426    $k = '[.0000.0000.0000.0000]'
427	if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
428
429    my $is_L3_ignorable = TRUE;
430
431    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
432	my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
433	my @wt = _getHexArray($arr);
434	push @key, pack(VCE_TEMPLATE, $var, @wt);
435	$is_L3_ignorable = FALSE
436	    if $wt[0] || $wt[1] || $wt[2];
437	# Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
438	# is completely ignorable.
439	# For expansion, an entry $is_L3_ignorable
440	# if and only if "all" CEs are [.0000.0000.0000].
441    }
442
443    # mapping: be an array ref or not exists (any false value is disallowed)
444    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
445
446    # maxlength: be more than 1 or not exists (any false value is disallowed)
447    if (@uv > 1) {
448	if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
449	    $self->{maxlength}{$uv[0]} = @uv;
450	}
451    }
452
453    # contraction: be 1 or not exists (any false value is disallowed)
454    while (@uv > 2) {
455	pop @uv;
456	my $fake_entry = join(CODE_SEP, @uv); # in JCPS
457	$self->{contraction}{$fake_entry} = 1;
458    }
459}
460
461
462sub viewSortKey
463{
464    my $self = shift;
465    my $str  = shift;
466    $self->visualizeSortKey($self->getSortKey($str));
467}
468
469
470sub process
471{
472    my $self = shift;
473    my $str  = shift;
474    my $prep = $self->{preprocess};
475    my $norm = $self->{normCode};
476
477    $str = &$prep($str) if ref $prep;
478    $str = &$norm($str) if ref $norm;
479    return $str;
480}
481
482##
483## arrayref of JCPS   = splitEnt(string to be collated)
484## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
485##
486sub splitEnt
487{
488    my $self = shift;
489    my $str  = shift;
490    my $wLen = shift; # with Length
491
492    my $map  = $self->{mapping};
493    my $max  = $self->{maxlength};
494    my $reH  = $self->{rearrangeHash};
495    my $vers = $self->{UCA_Version};
496    my $ver9 = $vers >= 9 && $vers <= 11;
497    my $long = $self->{long_contraction};
498    my $uXS  = $self->{__useXS}; ### XS only
499
500    my @buf;
501
502    # get array of Unicode code point of string.
503    my @src = unpack_U($str);
504
505    # rearrangement:
506    # Character positions are not kept if rearranged,
507    # then neglected if $wLen is true.
508    if ($reH && ! $wLen) {
509	for (my $i = 0; $i < @src; $i++) {
510	    if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
511		($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
512		$i++;
513	    }
514	}
515    }
516
517    # remove a code point marked as a completely ignorable.
518    for (my $i = 0; $i < @src; $i++) {
519	if ($vers <= 20 && _isIllegal($src[$i])) {
520	    $src[$i] = undef;
521	} elsif ($ver9) {
522	    $src[$i] = undef if exists $map->{ $src[$i] }
523			   ? @{ $map->{ $src[$i] } } == 0
524			   : $uXS && _ignorable_simple($src[$i]); ### XS only
525	}
526    }
527
528    for (my $i = 0; $i < @src; $i++) {
529	my $jcps = $src[$i];
530
531	# skip removed code point
532	if (! defined $jcps) {
533	    if ($wLen && @buf) {
534		$buf[-1][2] = $i + 1;
535	    }
536	    next;
537	}
538
539	my $i_orig = $i;
540
541	# find contraction
542	if (exists $max->{$jcps}) {
543	    my $temp_jcps = $jcps;
544	    my $jcpsLen = 1;
545	    my $maxLen = $max->{$jcps};
546
547	    for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
548		next if ! defined $src[$p];
549		$temp_jcps .= CODE_SEP . $src[$p];
550		$jcpsLen++;
551		if (exists $map->{$temp_jcps}) {
552		    $jcps = $temp_jcps;
553		    $i = $p;
554		}
555	    }
556
557	# discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
558	# This process requires Unicode::Normalize.
559	# If "normalization" is undef, here should be skipped *always*
560	# (in spite of bool value of $CVgetCombinClass),
561	# since canonical ordering cannot be expected.
562	# Blocked combining character should not be contracted.
563
564	    # $self->{normCode} is false in the case of "prenormalized".
565	    if ($self->{normalization}) {
566		my $cont = $self->{contraction};
567		my $preCC = 0;
568		my $preCC_uc = 0;
569		my $jcps_uc = $jcps;
570		my(@out, @out_uc);
571
572		for (my $p = $i + 1; $p < @src; $p++) {
573		    next if ! defined $src[$p];
574		    my $curCC = $CVgetCombinClass->($src[$p]);
575		    last unless $curCC;
576		    my $tail = CODE_SEP . $src[$p];
577
578		    if ($preCC != $curCC && exists $map->{$jcps.$tail}) {
579			$jcps .= $tail;
580			push @out, $p;
581		    } else {
582			$preCC = $curCC;
583		    }
584
585		    next if !$long;
586
587		    if ($preCC_uc != $curCC &&
588			    (exists $map->{$jcps_uc.$tail} ||
589			    exists $cont->{$jcps_uc.$tail})) {
590			$jcps_uc .= $tail;
591			push @out_uc, $p;
592		    } else {
593			$preCC_uc = $curCC;
594		    }
595		}
596
597		if (@out_uc && exists $map->{$jcps_uc}) {
598		    $jcps = $jcps_uc;
599		    $src[$_] = undef for @out_uc;
600		} else {
601		    $src[$_] = undef for @out;
602		}
603	    }
604	}
605
606	# skip completely ignorable
607	if (exists $map->{$jcps} ? @{ $map->{$jcps} } == 0 :
608	    $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
609	    if ($wLen && @buf) {
610		$buf[-1][2] = $i + 1;
611	    }
612	    next;
613	}
614
615	push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
616    }
617    return \@buf;
618}
619
620##
621## VCE = _pack_override(input, codepoint, derivCode)
622##
623sub _pack_override ($$$) {
624    my $r = shift;
625    my $u = shift;
626    my $der = shift;
627
628    if (ref $r) {
629	return pack(VCE_TEMPLATE, NON_VAR, @$r);
630    } elsif (defined $r) {
631	return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
632    } else {
633	$u = 0xFFFD if 0x10FFFF < $u;
634	return $der->($u);
635    }
636}
637
638##
639## list of VCE = getWt(JCPS)
640##
641sub getWt
642{
643    my $self = shift;
644    my $u    = shift;
645    my $map  = $self->{mapping};
646    my $der  = $self->{derivCode};
647    my $out  = $self->{overrideOut};
648    my $uXS  = $self->{__useXS}; ### XS only
649
650    return if !defined $u;
651    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
652    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
653    $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;
654
655    my @ce;
656    if (exists $map->{$u}) {
657	@ce = @{ $map->{$u} }; # $u may be a contraction
658### begin XS only ###
659    } elsif ($uXS && _exists_simple($u)) {
660	@ce = _fetch_simple($u);
661### end XS only ###
662    } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
663	my $hang = $self->{overrideHangul};
664	if ($hang) {
665	    @ce = map _pack_override($_, $u, $der), $hang->($u);
666	} elsif (!defined $hang) {
667	    @ce = $der->($u);
668	} else {
669	    my $max  = $self->{maxlength};
670	    my @decH = _decompHangul($u);
671
672	    if (@decH == 2) {
673		my $contract = join(CODE_SEP, @decH);
674		@decH = ($contract) if exists $map->{$contract};
675	    } else { # must be <@decH == 3>
676		if (exists $max->{$decH[0]}) {
677		    my $contract = join(CODE_SEP, @decH);
678		    if (exists $map->{$contract}) {
679			@decH = ($contract);
680		    } else {
681			$contract = join(CODE_SEP, @decH[0,1]);
682			exists $map->{$contract} and @decH = ($contract, $decH[2]);
683		    }
684		    # even if V's ignorable, LT contraction is not supported.
685		    # If such a situation were required, NFD should be used.
686		}
687		if (@decH == 3 && exists $max->{$decH[1]}) {
688		    my $contract = join(CODE_SEP, @decH[1,2]);
689		    exists $map->{$contract} and @decH = ($decH[0], $contract);
690		}
691	    }
692
693	    @ce = map({
694		    exists $map->{$_} ? @{ $map->{$_} } :
695		$uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
696		    $der->($_);
697		} @decH);
698	}
699    } elsif ($out && 0x10FFFF < $u) {
700	@ce = map _pack_override($_, $u, $der), $out->($u);
701    } else {
702	my $cjk  = $self->{overrideCJK};
703	my $vers = $self->{UCA_Version};
704	if ($cjk && _isUIdeo($u, $vers)) {
705	    @ce = map _pack_override($_, $u, $der), $cjk->($u);
706	} elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
707	    @ce = _uideoCE_8($u);
708	} else {
709	    @ce = $der->($u);
710	}
711    }
712    return map $self->varCE($_), @ce;
713}
714
715
716##
717## string sortkey = getSortKey(string arg)
718##
719sub getSortKey
720{
721    my $self = shift;
722    my $orig = shift;
723    my $str  = $self->process($orig);
724    my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
725    my $vers = $self->{UCA_Version};
726    my $term = $self->{hangul_terminator};
727    my $lev  = $self->{level};
728    my $iden = $self->{identical};
729
730    my @buf; # weight arrays
731    if ($term) {
732	my $preHST = '';
733	my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
734	foreach my $jcps (@$rEnt) {
735	    # weird things like VL, TL-contraction are not considered!
736	    my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
737	    if ($preHST && !$curHST || # hangul before non-hangul
738		$preHST =~ /L\z/ && $curHST =~ /^T/ ||
739		$preHST =~ /V\z/ && $curHST =~ /^L/ ||
740		$preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
741		push @buf, $termCE;
742	    }
743	    $preHST = $curHST;
744	    push @buf, $self->getWt($jcps);
745	}
746	push @buf, $termCE if $preHST; # end at hangul
747    } else {
748	foreach my $jcps (@$rEnt) {
749	    push @buf, $self->getWt($jcps);
750	}
751    }
752
753    my $rkey = $self->mk_SortKey(\@buf); ### XS only
754
755    if ($iden || $vers >= 26 && $lev == MaxLevel) {
756	$rkey .= LEVEL_SEP;
757	$rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
758    }
759    return $rkey;
760}
761
762
763##
764## int compare = cmp(string a, string b)
765##
766sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
767sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
768sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
769sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
770sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
771sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
772sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
773
774##
775## list[strings] sorted = sort(list[strings] arg)
776##
777sub sort {
778    my $obj = shift;
779    return
780	map { $_->[1] }
781	    sort{ $a->[0] cmp $b->[0] }
782		map [ $obj->getSortKey($_), $_ ], @_;
783}
784
785
786##
787## bool _nonIgnorAtLevel(arrayref weights, int level)
788##
789sub _nonIgnorAtLevel($$)
790{
791    my $wt = shift;
792    return if ! defined $wt;
793    my $lv = shift;
794    return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
795}
796
797##
798## bool _eqArray(
799##    arrayref of arrayref[weights] source,
800##    arrayref of arrayref[weights] substr,
801##    int level)
802## * comparison of graphemes vs graphemes.
803##   @$source >= @$substr must be true (check it before call this);
804##
805sub _eqArray($$$)
806{
807    my $source = shift;
808    my $substr = shift;
809    my $lev = shift;
810
811    for my $g (0..@$substr-1){
812	# Do the $g'th graphemes have the same number of AV weights?
813	return if @{ $source->[$g] } != @{ $substr->[$g] };
814
815	for my $w (0..@{ $substr->[$g] }-1) {
816	    for my $v (0..$lev-1) {
817		return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
818	    }
819	}
820    }
821    return 1;
822}
823
824##
825## (int position, int length)
826## int position = index(string, substring, position, [undoc'ed global])
827##
828## With "global" (only for the list context),
829##  returns list of arrayref[position, length].
830##
831sub index
832{
833    my $self = shift;
834    $self->{preprocess} and
835	croak "Don't use Preprocess with index(), match(), etc.";
836    $self->{normCode} and
837	croak "Don't use Normalization with index(), match(), etc.";
838
839    my $str  = shift;
840    my $len  = length($str);
841    my $sub  = shift;
842    my $subE = $self->splitEnt($sub);
843    my $pos  = @_ ? shift : 0;
844       $pos  = 0 if $pos < 0;
845    my $glob = shift;
846
847    my $lev  = $self->{level};
848    my $v2i  = $self->{UCA_Version} >= 9 &&
849		$self->{variable} ne 'non-ignorable';
850
851    if (! @$subE) {
852	my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
853	return $glob
854	    ? map([$_, 0], $temp..$len)
855	    : wantarray ? ($temp,0) : $temp;
856    }
857    $len < $pos
858	and return wantarray ? () : NOMATCHPOS;
859    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
860    @$strE
861	or return wantarray ? () : NOMATCHPOS;
862
863    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
864
865    my $last_is_variable;
866    for my $vwt (map $self->getWt($_), @$subE) {
867	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
868	my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
869
870	# "Ignorable (L1, L2) after Variable" since track. v. 9
871	if ($v2i) {
872	    if ($var) {
873		$last_is_variable = TRUE;
874	    }
875	    elsif (!$wt[0]) { # ignorable
876		$to_be_pushed = FALSE if $last_is_variable;
877	    }
878	    else {
879		$last_is_variable = FALSE;
880	    }
881	}
882
883	if (@subWt && !$var && !$wt[0]) {
884	    push @{ $subWt[-1] }, \@wt if $to_be_pushed;
885	} elsif ($to_be_pushed) {
886	    push @subWt, [ \@wt ];
887	}
888	# else ===> skipped
889    }
890
891    my $count = 0;
892    my $end = @$strE - 1;
893
894    $last_is_variable = FALSE; # reuse
895    for (my $i = 0; $i <= $end; ) { # no $i++
896	my $found_base = 0;
897
898	# fetch a grapheme
899	while ($i <= $end && $found_base == 0) {
900	    for my $vwt ($self->getWt($strE->[$i][0])) {
901		my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
902		my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
903
904		# "Ignorable (L1, L2) after Variable" since track. v. 9
905		if ($v2i) {
906		    if ($var) {
907			$last_is_variable = TRUE;
908		    }
909		    elsif (!$wt[0]) { # ignorable
910			$to_be_pushed = FALSE if $last_is_variable;
911		    }
912		    else {
913			$last_is_variable = FALSE;
914		    }
915		}
916
917		if (@strWt && !$var && !$wt[0]) {
918		    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
919		    $finPos[-1] = $strE->[$i][2];
920		} elsif ($to_be_pushed) {
921		    push @strWt, [ \@wt ];
922		    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
923		    $finPos[-1] = NOMATCHPOS if $found_base;
924		    push @finPos, $strE->[$i][2];
925		    $found_base++;
926		}
927		# else ===> no-op
928	    }
929	    $i++;
930	}
931
932	# try to match
933	while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
934	    if ($iniPos[0] != NOMATCHPOS &&
935		    $finPos[$#subWt] != NOMATCHPOS &&
936			_eqArray(\@strWt, \@subWt, $lev)) {
937		my $temp = $iniPos[0] + $pos;
938
939		if ($glob) {
940		    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
941		    splice @strWt,  0, $#subWt;
942		    splice @iniPos, 0, $#subWt;
943		    splice @finPos, 0, $#subWt;
944		}
945		else {
946		    return wantarray
947			? ($temp, $finPos[$#subWt] - $iniPos[0])
948			:  $temp;
949		}
950	    }
951	    shift @strWt;
952	    shift @iniPos;
953	    shift @finPos;
954	}
955    }
956
957    return $glob
958	? @g_ret
959	: wantarray ? () : NOMATCHPOS;
960}
961
962##
963## scalarref to matching part = match(string, substring)
964##
965sub match
966{
967    my $self = shift;
968    if (my($pos,$len) = $self->index($_[0], $_[1])) {
969	my $temp = substr($_[0], $pos, $len);
970	return wantarray ? $temp : \$temp;
971	# An lvalue ref \substr should be avoided,
972	# since its value is affected by modification of its referent.
973    }
974    else {
975	return;
976    }
977}
978
979##
980## arrayref matching parts = gmatch(string, substring)
981##
982sub gmatch
983{
984    my $self = shift;
985    my $str  = shift;
986    my $sub  = shift;
987    return map substr($str, $_->[0], $_->[1]),
988		$self->index($str, $sub, 0, 'g');
989}
990
991##
992## bool subst'ed = subst(string, substring, replace)
993##
994sub subst
995{
996    my $self = shift;
997    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
998
999    if (my($pos,$len) = $self->index($_[0], $_[1])) {
1000	if ($code) {
1001	    my $mat = substr($_[0], $pos, $len);
1002	    substr($_[0], $pos, $len, $code->($mat));
1003	} else {
1004	    substr($_[0], $pos, $len, $_[2]);
1005	}
1006	return TRUE;
1007    }
1008    else {
1009	return FALSE;
1010    }
1011}
1012
1013##
1014## int count = gsubst(string, substring, replace)
1015##
1016sub gsubst
1017{
1018    my $self = shift;
1019    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1020    my $cnt = 0;
1021
1022    # Replacement is carried out from the end, then use reverse.
1023    for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1024	if ($code) {
1025	    my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1026	    substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1027	} else {
1028	    substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1029	}
1030	$cnt++;
1031    }
1032    return $cnt;
1033}
1034
10351;
1036__END__
1037
1038=head1 NAME
1039
1040Unicode::Collate - Unicode Collation Algorithm
1041
1042=head1 SYNOPSIS
1043
1044  use Unicode::Collate;
1045
1046  #construct
1047  $Collator = Unicode::Collate->new(%tailoring);
1048
1049  #sort
1050  @sorted = $Collator->sort(@not_sorted);
1051
1052  #compare
1053  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1054
1055B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1056according to Perl's Unicode support. See L<perlunicode>,
1057L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1058Otherwise you can use C<preprocess> or should decode them before.
1059
1060=head1 DESCRIPTION
1061
1062This module is an implementation of Unicode Technical Standard #10
1063(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1064
1065=head2 Constructor and Tailoring
1066
1067The C<new> method returns a collator object. If new() is called
1068with no parameters, the collator should do the default collation.
1069
1070   $Collator = Unicode::Collate->new(
1071      UCA_Version => $UCA_Version,
1072      alternate => $alternate, # alias for 'variable'
1073      backwards => $levelNumber, # or \@levelNumbers
1074      entry => $element,
1075      hangul_terminator => $term_primary_weight,
1076      highestFFFF => $bool,
1077      identical => $bool,
1078      ignoreName => qr/$ignoreName/,
1079      ignoreChar => qr/$ignoreChar/,
1080      ignore_level2 => $bool,
1081      katakana_before_hiragana => $bool,
1082      level => $collationLevel,
1083      long_contraction => $bool,
1084      minimalFFFE => $bool,
1085      normalization  => $normalization_form,
1086      overrideCJK => \&overrideCJK,
1087      overrideHangul => \&overrideHangul,
1088      preprocess => \&preprocess,
1089      rearrange => \@charList,
1090      rewrite => \&rewrite,
1091      suppress => \@charList,
1092      table => $filename,
1093      undefName => qr/$undefName/,
1094      undefChar => qr/$undefChar/,
1095      upper_before_lower => $bool,
1096      variable => $variable,
1097   );
1098
1099=over 4
1100
1101=item UCA_Version
1102
1103If the revision (previously "tracking version") number of UCA is given,
1104behavior of that revision is emulated on collating.
1105If omitted, the return value of C<UCA_Version()> is used.
1106
1107The following revisions are supported.  The default is 36.
1108
1109     UCA       Unicode Standard         DUCET (@version)
1110   -------------------------------------------------------
1111      8              3.1                3.0.1 (3.0.1d9)
1112      9     3.1 with Corrigendum 3      3.1.1 (3.1.1)
1113     11              4.0                4.0.0 (4.0.0)
1114     14             4.1.0               4.1.0 (4.1.0)
1115     16              5.0                5.0.0 (5.0.0)
1116     18             5.1.0               5.1.0 (5.1.0)
1117     20             5.2.0               5.2.0 (5.2.0)
1118     22             6.0.0               6.0.0 (6.0.0)
1119     24             6.1.0               6.1.0 (6.1.0)
1120     26             6.2.0               6.2.0 (6.2.0)
1121     28             6.3.0               6.3.0 (6.3.0)
1122     30             7.0.0               7.0.0 (7.0.0)
1123     32             8.0.0               8.0.0 (8.0.0)
1124     34             9.0.0               9.0.0 (9.0.0)
1125     36            10.0.0              10.0.0(10.0.0)
1126
1127* See below for C<long_contraction> with C<UCA_Version> 22 and 24.
1128
1129* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1130since C<UCA_Version> 22.
1131
1132* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
1133and can be overridden since C<UCA_Version> 22.
1134
1135* Fully ignorable characters were ignored, and would not interrupt
1136contractions with C<UCA_Version> 9 and 11.
1137
1138* Treatment of ignorables after variables and some behaviors
1139were changed at C<UCA_Version> 9.
1140
1141* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1142depend on C<UCA_Version>.
1143
1144* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1145C<hangul_terminator>.
1146
1147=item alternate
1148
1149-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1150
1151For backward compatibility, C<alternate> (old name) can be used
1152as an alias for C<variable>.
1153
1154=item backwards
1155
1156-- see 3.4 Backward Accents, UTS #10.
1157
1158     backwards => $levelNumber or \@levelNumbers
1159
1160Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1161If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1162forwards at all the levels.
1163
1164=item entry
1165
1166-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
1167
1168If the same character (or a sequence of characters) exists
1169in the collation element table through C<table>,
1170mapping to collation elements is overridden.
1171If it does not exist, the mapping is defined additionally.
1172
1173    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11740063 0068 ; [.0E6A.0020.0002.0063] # ch
11750043 0068 ; [.0E6A.0020.0007.0043] # Ch
11760043 0048 ; [.0E6A.0020.0008.0043] # CH
1177006C 006C ; [.0F4C.0020.0002.006C] # ll
1178004C 006C ; [.0F4C.0020.0007.004C] # Ll
1179004C 004C ; [.0F4C.0020.0008.004C] # LL
118000F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1181006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
118200D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1183004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1184ENTRY
1185
1186    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
118700E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
118800C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1189ENTRY
1190
1191B<NOTE:> The code point in the UCA file format (before C<';'>)
1192B<must> be a Unicode code point (defined as hexadecimal),
1193but not a native code point.
1194So C<0063> must always denote C<U+0063>,
1195but not a character of C<"\x63">.
1196
1197Weighting may vary depending on collation element table.
1198So ensure the weights defined in C<entry> will be consistent with
1199those in the collation element table loaded via C<table>.
1200
1201In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1202and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1203(as a value between C<0E60> and C<0E6D>)
1204makes ordering as C<C E<lt> CH E<lt> D>.
1205Exactly speaking DUCET already has some characters between C<C> and C<D>:
1206C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1207C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1208and C<c-curl> (C<U+0255>) with C<0E69>.
1209Then primary weight C<0E6A> for C<CH> makes C<CH>
1210ordered between C<c-curl> and C<D>.
1211
1212=item hangul_terminator
1213
1214-- see 7.1.4 Trailing Weights, UTS #10.
1215
1216If a true value is given (non-zero but should be positive),
1217it will be added as a terminator primary weight to the end of
1218every standard Hangul syllable. Secondary and any higher weights
1219for terminator are set to zero.
1220If the value is false or C<hangul_terminator> key does not exist,
1221insertion of terminator weights will not be performed.
1222
1223Boundaries of Hangul syllables are determined
1224according to conjoining Jamo behavior in F<the Unicode Standard>
1225and F<HangulSyllableType.txt>.
1226
1227B<Implementation Note:>
1228(1) For expansion mapping (Unicode character mapped
1229to a sequence of collation elements), a terminator will not be added
1230between collation elements, even if Hangul syllable boundary exists there.
1231Addition of terminator is restricted to the next position
1232to the last collation element.
1233
1234(2) Non-conjoining Hangul letters
1235(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1236automatically terminated with a terminator primary weight.
1237These characters may need terminator included in a collation element
1238table beforehand.
1239
1240=item highestFFFF
1241
1242-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation.
1243
1244If the parameter is made true, C<U+FFFF> has a highest primary weight.
1245When a boolean of C<$coll-E<gt>ge($str, "abc")> and
1246C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
1247begins with C<"abc">, or another primary equivalent.
1248C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1249such as C<"abc\x{FFFF}xyz">.
1250
1251C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
1252almost, but the latter has a problem that you should know which letter is
1253next to C<c>. For a certain language where C<ch> as the next letter,
1254C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
1255
1256Note:
1257This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
1258Any other character than C<U+FFFF> can be tailored by C<entry>.
1259
1260=item identical
1261
1262-- see A.3 Deterministic Comparison, UTS #10.
1263
1264By default, strings whose weights are equal should be equal,
1265even though their code points are not equal.
1266Completely ignorable characters are ignored.
1267
1268If the parameter is made true, a final, tie-breaking level is used.
1269If no difference of weights is found after the comparison through
1270all the level specified by C<level>, the comparison with code points
1271will be performed.
1272For the tie-breaking comparison, the sort key has code points
1273of the original string appended.
1274Completely ignorable characters are not ignored.
1275
1276If C<preprocess> and/or C<normalization> is applied, the code points
1277of the string after them (in NFD by default) are used.
1278
1279=item ignoreChar
1280
1281=item ignoreName
1282
1283-- see 3.6 Variable Weighting, UTS #10.
1284
1285Makes the entry in the table completely ignorable;
1286i.e. as if the weights were zero at all level.
1287
1288Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1289will be ignored. Through C<ignoreName>, any character whose name
1290(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1291will be ignored.
1292
1293E.g. when 'a' and 'e' are ignorable,
1294'element' is equal to 'lament' (or 'lmnt').
1295
1296=item ignore_level2
1297
1298-- see 5.1 Parametric Tailoring, UTS #10.
1299
1300By default, case-sensitive comparison (that is level 3 difference)
1301won't ignore accents (that is level 2 difference).
1302
1303If the parameter is made true, accents (and other primary ignorable
1304characters) are ignored, even though cases are taken into account.
1305
1306B<NOTE>: C<level> should be 3 or greater.
1307
1308=item katakana_before_hiragana
1309
1310-- see 7.2 Tertiary Weight Table, UTS #10.
1311
1312By default, hiragana is before katakana.
1313If the parameter is made true, this is reversed.
1314
1315B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1316distinctions must occur in level 3, and their weights at level 3 must be
1317same as those mentioned in 7.3.1, UTS #10.
1318If you define your collation elements which violate this requirement,
1319this parameter does not work validly.
1320
1321=item level
1322
1323-- see 4.3 Form Sort Key, UTS #10.
1324
1325Set the maximum level.
1326Any higher levels than the specified one are ignored.
1327
1328  Level 1: alphabetic ordering
1329  Level 2: diacritic ordering
1330  Level 3: case ordering
1331  Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1332
1333  ex.level => 2,
1334
1335If omitted, the maximum is the 4th.
1336
1337B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1338But this module only uses weights within 0xFFFF.
1339When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1340and 'shift-trimmed'), the level 4 may be unreliable.
1341
1342See also C<identical>.
1343
1344=item long_contraction
1345
1346-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
1347
1348If the parameter is made true, for a contraction with three or more
1349characters (here nicknamed "long contraction"), initial substrings
1350will be handled.
1351For example, a contraction ABC, where A is a starter, and B and C
1352are non-starters (character with non-zero combining character class),
1353will be detected even if there is not AB as a contraction.
1354
1355B<Default:> Usually false.
1356If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
1357is not specified in C<new()>, a true value is set implicitly.
1358This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
1359
1360C<change()> handles C<long_contraction> explicitly only.
1361If C<long_contraction> is not specified in C<change()>, even though
1362C<UCA_Version> is changed, C<long_contraction> will not be changed.
1363
1364B<Limitation:> Scanning non-starters is one-way (no back tracking).
1365If AB is found but not ABC is not found, other long contraction where
1366the first character is A and the second is not B may not be found.
1367
1368Under C<(normalization =E<gt> undef)>, detection step of discontiguous
1369contractions will be skipped.
1370
1371B<Note:> The following contractions in DUCET are not considered
1372in steps S2.1.1 to S2.1.3, where they are discontiguous.
1373
1374    0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
1375    0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
1376
1377For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
1378(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
1379In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
1380instead of C<0FB2 0F71 0F80>.
1381Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
1382contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
1383
1384=item minimalFFFE
1385
1386-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation.
1387
1388If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1389The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1390first compares C<$a1> and C<$b1> at level 1, and
1391then C<$a2> and C<$b2> at level 1, as followed.
1392
1393        "ab\x{FFFE}a"
1394        "Ab\x{FFFE}a"
1395        "ab\x{FFFE}c"
1396        "Ab\x{FFFE}c"
1397        "ab\x{FFFE}xyz"
1398        "abc\x{FFFE}def"
1399        "abc\x{FFFE}xYz"
1400        "aBc\x{FFFE}xyz"
1401        "abcX\x{FFFE}def"
1402        "abcx\x{FFFE}xyz"
1403        "b\x{FFFE}aaa"
1404        "bbb\x{FFFE}a"
1405
1406Note:
1407This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
1408Any other character than C<U+FFFE> can be tailored by C<entry>.
1409
1410=item normalization
1411
1412-- see 4.1 Normalize, UTS #10.
1413
1414If specified, strings are normalized before preparation of sort keys
1415(the normalization is executed after preprocess).
1416
1417A form name C<Unicode::Normalize::normalize()> accepts will be applied
1418as C<$normalization_form>.
1419Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1420See C<Unicode::Normalize::normalize()> for detail.
1421If omitted, C<'NFD'> is used.
1422
1423C<normalization> is performed after C<preprocess> (if defined).
1424
1425Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1426though they are not concerned with C<Unicode::Normalize::normalize()>.
1427
1428If C<undef> (not a string C<"undef">) is passed explicitly
1429as the value for this key,
1430any normalization is not carried out (this may make tailoring easier
1431if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1432only contiguous contractions are resolved;
1433e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1434C<A-cedilla-ring> would be primary equal to C<A>.
1435In this point,
1436C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1437B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1438
1439In the case of C<(normalization =E<gt> "prenormalized")>,
1440any normalization is not performed, but
1441discontiguous contractions with combining characters are performed.
1442Therefore
1443C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1444B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1445If source strings are finely prenormalized,
1446C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1447
1448Except C<(normalization =E<gt> undef)>,
1449B<Unicode::Normalize> is required (see also B<CAVEAT>).
1450
1451=item overrideCJK
1452
1453-- see 7.1 Derived Collation Elements, UTS #10.
1454
1455By default, CJK unified ideographs are ordered in Unicode codepoint
1456order, but those in the CJK Unified Ideographs block are less than
1457those in the CJK Unified Ideographs Extension A etc.
1458
1459    In the CJK Unified Ideographs block:
1460    U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1461    U+4E00..U+9FBB if UCA_Version is 14 or 16.
1462    U+4E00..U+9FC3 if UCA_Version is 18.
1463    U+4E00..U+9FCB if UCA_Version is 20 or 22.
1464    U+4E00..U+9FCC if UCA_Version is 24 to 30.
1465    U+4E00..U+9FD5 if UCA_Version is 32 or 34.
1466    U+4E00..U+9FEA if UCA_Version is 36.
1467
1468    In the CJK Unified Ideographs Extension blocks:
1469    Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
1470    Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
1471    Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
1472    Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32 or later.
1473    Ext.F (U+2CEB0..U+2EBE0) if UCA_Version is 36.
1474
1475Through C<overrideCJK>, ordering of CJK unified ideographs (including
1476extensions) can be overridden.
1477
1478ex. CJK unified ideographs in the JIS code point order.
1479
1480  overrideCJK => sub {
1481      my $u = shift;             # get a Unicode codepoint
1482      my $b = pack('n', $u);     # to UTF-16BE
1483      my $s = your_unicode_to_sjis_converter($b); # convert
1484      my $n = unpack('n', $s);   # convert sjis to short
1485      [ $n, 0x20, 0x2, $u ];     # return the collation element
1486  },
1487
1488The return value may be an arrayref of 1st to 4th weights as shown
1489above. The return value may be an integer as the primary weight
1490as shown below.  If C<undef> is returned, the default derived
1491collation element will be used.
1492
1493  overrideCJK => sub {
1494      my $u = shift;             # get a Unicode codepoint
1495      my $b = pack('n', $u);     # to UTF-16BE
1496      my $s = your_unicode_to_sjis_converter($b); # convert
1497      my $n = unpack('n', $s);   # convert sjis to short
1498      return $n;                 # return the primary weight
1499  },
1500
1501The return value may be a list containing zero or more of
1502an arrayref, an integer, or C<undef>.
1503
1504ex. ignores all CJK unified ideographs.
1505
1506  overrideCJK => sub {()}, # CODEREF returning empty list
1507
1508   # where ->eq("Pe\x{4E00}rl", "Perl") is true
1509   # as U+4E00 is a CJK unified ideograph and to be ignorable.
1510
1511If a false value (including C<undef>) is passed, C<overrideCJK>
1512has no effect.
1513C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
1514
1515But assignment of weight for CJK unified ideographs
1516in C<table> or C<entry> is still valid.
1517If C<undef> is passed explicitly as the value for this key,
1518weights for CJK unified ideographs are treated as undefined.
1519However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
1520has no special meaning.
1521
1522B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1523C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1524C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1525ideographs. But they can't be overridden via C<overrideCJK> when you use
1526DUCET, as the table includes weights for them. C<table> or C<entry> has
1527priority over C<overrideCJK>.
1528
1529=item overrideHangul
1530
1531-- see 7.1 Derived Collation Elements, UTS #10.
1532
1533By default, Hangul syllables are decomposed into Hangul Jamo,
1534even if C<(normalization =E<gt> undef)>.
1535But the mapping of Hangul syllables may be overridden.
1536
1537This parameter works like C<overrideCJK>, so see there for examples.
1538
1539If you want to override the mapping of Hangul syllables,
1540NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1541Hangul syllables before overriding. FCD may decompose Hangul syllables
1542as the case may be.
1543
1544If a false value (but not C<undef>) is passed, C<overrideHangul>
1545has no effect.
1546C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
1547
1548If C<undef> is passed explicitly as the value for this key,
1549weight for Hangul syllables is treated as undefined
1550without decomposition into Hangul Jamo.
1551But definition of weight for Hangul syllables
1552in C<table> or C<entry> is still valid.
1553
1554=item overrideOut
1555
1556-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
1557
1558Perl seems to allow out-of-range values (greater than 0x10FFFF).
1559By default, out-of-range values are replaced with C<U+FFFD>
1560(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
1561or ignored when C<UCA_Version> E<lt>= 20.
1562
1563When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
1564can be overridden. Though C<table> or C<entry> are available for them,
1565out-of-range values are too many.
1566
1567C<overrideOut> can perform it algorithmically.
1568This parameter works like C<overrideCJK>, so see there for examples.
1569
1570ex. ignores all out-of-range values.
1571
1572  overrideOut => sub {()}, # CODEREF returning empty list
1573
1574If a false value (including C<undef>) is passed, C<overrideOut>
1575has no effect.
1576C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
1577
1578B<NOTE ABOUT U+FFFD:>
1579
1580UCA recommends that out-of-range values should not be ignored for security
1581reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
1582However, C<U+FFFD> is wrongly mapped to a variable collation element
1583in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
1584ignored when C<variable> isn't C<Non-ignorable>.
1585
1586The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
1587see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
1588(7.1.4 Trailing Weights). Such a correction is reproduced by this.
1589
1590  overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
1591
1592This workaround is unnecessary since Unicode 6.3.0.
1593
1594=item preprocess
1595
1596-- see 5.4 Preprocessing, UTS #10.
1597
1598If specified, the coderef is used to preprocess each string
1599before the formation of sort keys.
1600
1601ex. dropping English articles, such as "a" or "the".
1602Then, "the pen" is before "a pencil".
1603
1604     preprocess => sub {
1605           my $str = shift;
1606           $str =~ s/\b(?:an?|the)\s+//gi;
1607           return $str;
1608        },
1609
1610C<preprocess> is performed before C<normalization> (if defined).
1611
1612ex. decoding strings in a legacy encoding such as shift-jis:
1613
1614    $sjis_collator = Unicode::Collate->new(
1615        preprocess => \&your_shiftjis_to_unicode_decoder,
1616    );
1617    @result = $sjis_collator->sort(@shiftjis_strings);
1618
1619B<Note:> Strings returned from the coderef will be interpreted
1620according to Perl's Unicode support. See L<perlunicode>,
1621L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1622
1623=item rearrange
1624
1625-- see 3.5 Rearrangement, UTS #10.
1626
1627Characters that are not coded in logical order and to be rearranged.
1628If C<UCA_Version> is equal to or less than 11, default is:
1629
1630    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1631
1632If you want to disallow any rearrangement, pass C<undef> or C<[]>
1633(a reference to empty list) as the value for this key.
1634
1635If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1636(i.e. no rearrangement).
1637
1638B<According to the version 9 of UCA, this parameter shall not be used;
1639but it is not warned at present.>
1640
1641=item rewrite
1642
1643If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1644The coderef will get each line, and then should return a rewritten line
1645according to the UCA file format.
1646If the coderef returns an empty line, the line will be skipped.
1647
1648e.g. any primary ignorable characters into tertiary ignorable:
1649
1650    rewrite => sub {
1651        my $line = shift;
1652        $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1653        return $line;
1654    },
1655
1656This example shows rewriting weights. C<rewrite> is allowed to
1657affect code points, weights, and the name.
1658
1659B<NOTE>: C<table> is available to use another table file;
1660preparing a modified table once would be more efficient than
1661rewriting lines on reading an unmodified table every time.
1662
1663=item suppress
1664
1665-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation.
1666
1667Contractions beginning with the specified characters are suppressed,
1668even if those contractions are defined in C<table>.
1669
1670An example for Russian and some languages using the Cyrillic script:
1671
1672    suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1673
1674where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1675
1676B<NOTE>: Contractions via C<entry> will not be suppressed.
1677
1678=item table
1679
1680-- see 3.8 Default Unicode Collation Element Table, UTS #10.
1681
1682You can use another collation element table if desired.
1683
1684The table file should locate in the F<Unicode/Collate> directory
1685on C<@INC>. Say, if the filename is F<Foo.txt>,
1686the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1687
1688By default, F<allkeys.txt> (as the filename of DUCET) is used.
1689If you will prepare your own table file, any name other than F<allkeys.txt>
1690may be better to avoid namespace conflict.
1691
1692B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1693module, and it may save time at the run time.
1694Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
1695or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1696C<rewrite> will prevent this module from using the compiled DUCET.
1697
1698If C<undef> is passed explicitly as the value for this key,
1699no file is read (but you can define collation elements via C<entry>).
1700
1701A typical way to define a collation element table
1702without any file of table:
1703
1704   $onlyABC = Unicode::Collate->new(
1705       table => undef,
1706       entry => << 'ENTRIES',
17070061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
17080041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
17090062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
17100042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
17110063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
17120043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1713ENTRIES
1714    );
1715
1716If C<ignoreName> or C<undefName> is used, character names should be
1717specified as a comment (following C<#>) on each line.
1718
1719=item undefChar
1720
1721=item undefName
1722
1723-- see 6.3.3 Reducing the Repertoire, UTS #10.
1724
1725Undefines the collation element as if it were unassigned in the C<table>.
1726This reduces the size of the table.
1727If an unassigned character appears in the string to be collated,
1728the sort key is made from its codepoint
1729as a single-character collation element,
1730as it is greater than any other assigned collation elements
1731(in the codepoint order among the unassigned characters).
1732But, it'd be better to ignore characters
1733unfamiliar to you and maybe never used.
1734
1735Through C<undefChar>, any character matching C<qr/$undefChar/>
1736will be undefined. Through C<undefName>, any character whose name
1737(given in the C<table> file as a comment) matches C<qr/$undefName/>
1738will be undefined.
1739
1740ex. Collation weights for beyond-BMP characters are not stored in object:
1741
1742    undefChar => qr/[^\0-\x{fffd}]/,
1743
1744=item upper_before_lower
1745
1746-- see 6.6 Case Comparisons, UTS #10.
1747
1748By default, lowercase is before uppercase.
1749If the parameter is made true, this is reversed.
1750
1751B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1752distinctions must occur in level 3, and their weights at level 3 must be
1753same as those mentioned in 7.3.1, UTS #10.
1754If you define your collation elements which differs from this requirement,
1755this parameter doesn't work validly.
1756
1757=item variable
1758
1759-- see 3.6 Variable Weighting, UTS #10.
1760
1761This key allows for variable weighting of variable collation elements,
1762which are marked with an ASTERISK in the table
1763(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1764
1765   variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1766
1767These names are case-insensitive.
1768By default (if specification is omitted), 'shifted' is adopted.
1769
1770   'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1771                    considered at the 4th level.
1772
1773   'Non-Ignorable'  Variable elements are not reset to ignorable.
1774
1775   'Shifted'        Variable elements are made ignorable at levels 1 through 3
1776                    their level 4 weight is replaced by the old level 1 weight.
1777                    Level 4 weight for Non-Variable elements is 0xFFFF.
1778
1779   'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1780                    are trimmed.
1781
1782=back
1783
1784=head2 Methods for Collation
1785
1786=over 4
1787
1788=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1789
1790Sorts a list of strings.
1791
1792=item C<$result = $Collator-E<gt>cmp($a, $b)>
1793
1794Returns 1 (when C<$a> is greater than C<$b>)
1795or 0 (when C<$a> is equal to C<$b>)
1796or -1 (when C<$a> is less than C<$b>).
1797
1798=item C<$result = $Collator-E<gt>eq($a, $b)>
1799
1800=item C<$result = $Collator-E<gt>ne($a, $b)>
1801
1802=item C<$result = $Collator-E<gt>lt($a, $b)>
1803
1804=item C<$result = $Collator-E<gt>le($a, $b)>
1805
1806=item C<$result = $Collator-E<gt>gt($a, $b)>
1807
1808=item C<$result = $Collator-E<gt>ge($a, $b)>
1809
1810They works like the same name operators as theirs.
1811
1812   eq : whether $a is equal to $b.
1813   ne : whether $a is not equal to $b.
1814   lt : whether $a is less than $b.
1815   le : whether $a is less than $b or equal to $b.
1816   gt : whether $a is greater than $b.
1817   ge : whether $a is greater than $b or equal to $b.
1818
1819=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1820
1821-- see 4.3 Form Sort Key, UTS #10.
1822
1823Returns a sort key.
1824
1825You compare the sort keys using a binary comparison
1826and get the result of the comparison of the strings using UCA.
1827
1828   $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1829
1830      is equivalent to
1831
1832   $Collator->cmp($a, $b)
1833
1834=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1835
1836Converts a sorting key into its representation form.
1837If C<UCA_Version> is 8, the output is slightly different.
1838
1839   use Unicode::Collate;
1840   my $c = Unicode::Collate->new();
1841   print $c->viewSortKey("Perl"),"\n";
1842
1843   # output:
1844   # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1845   #  Level 1               Level 2               Level 3               Level 4
1846
1847=back
1848
1849=head2 Methods for Searching
1850
1851The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1852like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1853but they are not aware of any pattern, but only a literal substring.
1854
1855B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1856for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1857C<subst>, C<gsubst>) is croaked, as the position and the length might
1858differ from those on the specified string.
1859
1860C<rearrange> and C<hangul_terminator> parameters are neglected.
1861C<katakana_before_hiragana> and C<upper_before_lower> don't affect
1862matching and searching, as it doesn't matter whether greater or less.
1863
1864=over 4
1865
1866=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1867
1868=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1869
1870If C<$substring> matches a part of C<$string>, returns
1871the position of the first occurrence of the matching part in scalar context;
1872in list context, returns a two-element list of
1873the position and the length of the matching part.
1874
1875If C<$substring> does not match any part of C<$string>,
1876returns C<-1> in scalar context and
1877an empty list in list context.
1878
1879e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
1880you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
1881
1882  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1883                                     # (normalization => undef) is REQUIRED.
1884  my $match;
1885  if (my($pos,$len) = $Collator->index($str, $sub)) {
1886      $match = substr($str, $pos, $len);
1887  }
1888
1889and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
1890is primary equal to C<"M>E<uuml>C<SS">.
1891
1892=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1893
1894=item C<($match)   = $Collator-E<gt>match($string, $substring)>
1895
1896If C<$substring> matches a part of C<$string>, in scalar context, returns
1897B<a reference to> the first occurrence of the matching part
1898(C<$match_ref> is always true if matches,
1899since every reference is B<true>);
1900in list context, returns the first occurrence of the matching part.
1901
1902If C<$substring> does not match any part of C<$string>,
1903returns C<undef> in scalar context and
1904an empty list in list context.
1905
1906e.g.
1907
1908    if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1909	print "matches [$$match_ref].\n";
1910    } else {
1911	print "doesn't match.\n";
1912    }
1913
1914     or
1915
1916    if (($match) = $Collator->match($str, $sub)) { # list context
1917	print "matches [$match].\n";
1918    } else {
1919	print "doesn't match.\n";
1920    }
1921
1922=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1923
1924If C<$substring> matches a part of C<$string>, returns
1925all the matching parts (or matching count in scalar context).
1926
1927If C<$substring> does not match any part of C<$string>,
1928returns an empty list.
1929
1930=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1931
1932If C<$substring> matches a part of C<$string>,
1933the first occurrence of the matching part is replaced by C<$replacement>
1934(C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
1935
1936C<$replacement> can be a C<CODEREF>,
1937taking the matching part as an argument,
1938and returning a string to replace the matching part
1939(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1940
1941=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1942
1943If C<$substring> matches a part of C<$string>,
1944all the occurrences of the matching part are replaced by C<$replacement>
1945(C<$string> is modified) and C<$count> is returned.
1946
1947C<$replacement> can be a C<CODEREF>,
1948taking the matching part as an argument,
1949and returning a string to replace the matching part
1950(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1951
1952e.g.
1953
1954  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1955                                     # (normalization => undef) is REQUIRED.
1956  my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
1957  $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1958
1959  # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
1960  # i.e., all the camels are made bold-faced.
1961
1962   Examples: levels and ignore_level2 - what does camel match?
1963  ---------------------------------------------------------------------------
1964   level  ignore_level2  |  camel  Camel  came\x{301}l  c-a-m-e-l  cam\0e\0l
1965  -----------------------|---------------------------------------------------
1966     1        false      |   yes    yes      yes          yes        yes
1967     2        false      |   yes    yes      no           yes        yes
1968     3        false      |   yes    no       no           yes        yes
1969     4        false      |   yes    no       no           no         yes
1970  -----------------------|---------------------------------------------------
1971     1        true       |   yes    yes      yes          yes        yes
1972     2        true       |   yes    yes      yes          yes        yes
1973     3        true       |   yes    no       yes          yes        yes
1974     4        true       |   yes    no       yes          no         yes
1975  ---------------------------------------------------------------------------
1976   note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1977         at any level.
1978
1979=back
1980
1981=head2 Other Methods
1982
1983=over 4
1984
1985=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1986
1987=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1988
1989Changes the value of specified keys and returns the changed part.
1990
1991    $Collator = Unicode::Collate->new(level => 4);
1992
1993    $Collator->eq("perl", "PERL"); # false
1994
1995    %old = $Collator->change(level => 2); # returns (level => 4).
1996
1997    $Collator->eq("perl", "PERL"); # true
1998
1999    $Collator->change(%old); # returns (level => 2).
2000
2001    $Collator->eq("perl", "PERL"); # false
2002
2003Not all C<(key,value)>s are allowed to be changed.
2004See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
2005
2006In the scalar context, returns the modified collator
2007(but it is B<not> a clone from the original).
2008
2009    $Collator->change(level => 2)->eq("perl", "PERL"); # true
2010
2011    $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
2012
2013    $Collator->change(level => 4)->eq("perl", "PERL"); # false
2014
2015=item C<$version = $Collator-E<gt>version()>
2016
2017Returns the version number (a string) of the Unicode Standard
2018which the C<table> file used by the collator object is based on.
2019If the table does not include a version line (starting with C<@version>),
2020returns C<"unknown">.
2021
2022=item C<UCA_Version()>
2023
2024Returns the revision number of UTS #10 this module consults,
2025that should correspond with the DUCET incorporated.
2026
2027=item C<Base_Unicode_Version()>
2028
2029Returns the version number of UTS #10 this module consults,
2030that should correspond with the DUCET incorporated.
2031
2032=back
2033
2034=head1 EXPORT
2035
2036No method will be exported.
2037
2038=head1 INSTALL
2039
2040Though this module can be used without any C<table> file,
2041to use this module easily, it is recommended to install a table file
2042in the UCA format, by copying it under the directory
2043<a place in @INC>/Unicode/Collate.
2044
2045The most preferable one is "The Default Unicode Collation Element Table"
2046(aka DUCET), available from the Unicode Consortium's website:
2047
2048   http://www.unicode.org/Public/UCA/
2049
2050   http://www.unicode.org/Public/UCA/latest/allkeys.txt
2051   (latest version)
2052
2053If DUCET is not installed, it is recommended to copy the file
2054from http://www.unicode.org/Public/UCA/latest/allkeys.txt
2055to <a place in @INC>/Unicode/Collate/allkeys.txt
2056manually.
2057
2058=head1 CAVEATS
2059
2060=over 4
2061
2062=item Normalization
2063
2064Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2065module (see L<Unicode::Normalize>).
2066
2067If you need not it (say, in the case when you need not
2068handle any combining characters),
2069assign C<(normalization =E<gt> undef)> explicitly.
2070
2071-- see 6.5 Avoiding Normalization, UTS #10.
2072
2073=item Conformance Test
2074
2075The Conformance Test for the UCA is available
2076under L<http://www.unicode.org/Public/UCA/>.
2077
2078For F<CollationTest_SHIFTED.txt>,
2079a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2080for F<CollationTest_NON_IGNORABLE.txt>, a collator via
2081C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
2082
2083If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
2084C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
2085C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
2086C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2087
2088B<Unicode::Normalize is required to try The Conformance Test.>
2089
2090=back
2091
2092=head1 AUTHOR, COPYRIGHT AND LICENSE
2093
2094The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
2095<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2018,
2096SADAHIRO Tomoyuki. Japan. All rights reserved.
2097
2098This module is free software; you can redistribute it and/or
2099modify it under the same terms as Perl itself.
2100
2101The file Unicode/Collate/allkeys.txt was copied verbatim
2102from L<http://www.unicode.org/Public/UCA/9.0.0/allkeys.txt>.
2103For this file, Copyright (c) 2016 Unicode, Inc.; distributed
2104under the Terms of Use in L<http://www.unicode.org/terms_of_use.html>
2105
2106=head1 SEE ALSO
2107
2108=over 4
2109
2110=item Unicode Collation Algorithm - UTS #10
2111
2112L<http://www.unicode.org/reports/tr10/>
2113
2114=item The Default Unicode Collation Element Table (DUCET)
2115
2116L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
2117
2118=item The conformance test for the UCA
2119
2120L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
2121
2122L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
2123
2124=item Hangul Syllable Type
2125
2126L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
2127
2128=item Unicode Normalization Forms - UAX #15
2129
2130L<http://www.unicode.org/reports/tr15/>
2131
2132=item Unicode Locale Data Markup Language (LDML) - UTS #35
2133
2134L<http://www.unicode.org/reports/tr35/>
2135
2136=back
2137
2138=cut
2139