1package Unicode::Collate;
2
3use 5.006;
4use strict;
5use warnings;
6use Carp;
7use File::Spec;
8
9no warnings 'utf8';
10
11our $VERSION = '1.31';
12our $PACKAGE = __PACKAGE__;
13
14### begin XS only ###
15use XSLoader ();
16XSLoader::load('Unicode::Collate', $VERSION);
17### end XS only ###
18
19my @Path = qw(Unicode Collate);
20my $KeyFile = 'allkeys.txt';
21
22# Perl's boolean
23use constant TRUE  => 1;
24use constant FALSE => "";
25use constant NOMATCHPOS => -1;
26
27# A coderef to get combining class imported from Unicode::Normalize
28# (i.e. \&Unicode::Normalize::getCombinClass).
29# This is also used as a HAS_UNICODE_NORMALIZE flag.
30my $CVgetCombinClass;
31
32# Supported Levels
33use constant MinLevel => 1;
34use constant MaxLevel => 4;
35
36# Minimum weights at level 2 and 3, respectively
37use constant Min2Wt => 0x20;
38use constant Min3Wt => 0x02;
39
40# Shifted weight at 4th level
41use constant Shift4Wt => 0xFFFF;
42
43# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
44use constant VCE_TEMPLATE => 'Cn4';
45
46# A sort key: 16-bit weights
47use constant KEY_TEMPLATE => 'n*';
48
49# The tie-breaking: 32-bit weights
50use constant TIE_TEMPLATE => 'N*';
51
52# Level separator in a sort key:
53# i.e. pack(KEY_TEMPLATE, 0)
54use constant LEVEL_SEP => "\0\0";
55
56# As Unicode code point separator for hash keys.
57# A joined code point string (denoted by JCPS below)
58# like "65;768" is used for internal processing
59# instead of Perl's Unicode string like "\x41\x{300}",
60# as the native code point is different from the Unicode code point
61# on EBCDIC platform.
62# This character must not be included in any stringified
63# representation of an integer.
64use constant CODE_SEP => ';';
65	# NOTE: in regex /;/ is used for $jcps!
66
67# boolean values of variable weights
68use constant NON_VAR => 0; # Non-Variable character
69use constant VAR     => 1; # Variable character
70
71# specific code points
72use constant Hangul_SIni   => 0xAC00;
73use constant Hangul_SFin   => 0xD7A3;
74
75# Logical_Order_Exception in PropList.txt
76my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
77
78# for highestFFFF and minimalFFFE
79my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
80my $minimalVCE = pack(VCE_TEMPLATE, 0,      1, 0x20, 0x5, 0xFFFE);
81
82sub UCA_Version { '43' }
83
84sub Base_Unicode_Version { '13.0.0' }
85
86######
87
88my $native_to_unicode = ($::IS_ASCII || $] < 5.008)
89	? sub { return shift }
90	: sub { utf8::native_to_unicode(shift) };
91
92my $unicode_to_native = ($::IS_ASCII || $] < 5.008)
93	? sub { return shift }
94	: sub { utf8::unicode_to_native(shift) };
95
96# pack_U() should get Unicode code points.
97sub pack_U {
98    return pack('U*', map $unicode_to_native->($_), @_);
99}
100
101# unpack_U() should return Unicode code points.
102sub unpack_U {
103    return map $native_to_unicode->($_), unpack('U*', shift(@_).pack('U*'));
104}
105# for older perl version, pack('U*') generates empty string with utf8 flag.
106
107######
108
109my (%VariableOK);
110@VariableOK{ qw/
111    blanked  non-ignorable  shifted  shift-trimmed
112  / } = (); # keys lowercased
113
114our @ChangeOK = qw/
115    alternate backwards level normalization rearrange
116    katakana_before_hiragana upper_before_lower ignore_level2
117    overrideCJK overrideHangul overrideOut preprocess UCA_Version
118    hangul_terminator variable identical highestFFFF minimalFFFE
119    long_contraction
120  /;
121
122our @ChangeNG = qw/
123    entry mapping table maxlength contraction
124    ignoreChar ignoreName undefChar undefName rewrite
125    versionTable alternateTable backwardsTable forwardsTable
126    rearrangeTable variableTable
127    derivCode normCode rearrangeHash backwardsFlag
128    suppress suppressHash
129    __useXS /; ### XS only
130# The hash key 'ignored' was deleted at v 0.21.
131# The hash key 'isShift' was deleted at v 0.23.
132# The hash key 'combining' was deleted at v 0.24.
133# The hash key 'entries' was deleted at v 0.30.
134# The hash key 'L3_ignorable' was deleted at v 0.40.
135
136sub version {
137    my $self = shift;
138    return $self->{versionTable} || 'unknown';
139}
140
141my (%ChangeOK, %ChangeNG);
142@ChangeOK{ @ChangeOK } = ();
143@ChangeNG{ @ChangeNG } = ();
144
145sub change {
146    my $self = shift;
147    my %hash = @_;
148    my %old;
149    if (exists $hash{alternate}) {
150	if (exists $hash{variable}) {
151	    delete $hash{alternate};
152	} else {
153	    $hash{variable} = $hash{alternate};
154	}
155    }
156    foreach my $k (keys %hash) {
157	if (exists $ChangeOK{$k}) {
158	    $old{$k} = $self->{$k};
159	    $self->{$k} = $hash{$k};
160	} elsif (exists $ChangeNG{$k}) {
161	    croak "change of $k via change() is not allowed!";
162	}
163	# else => ignored
164    }
165    $self->checkCollator();
166    return wantarray ? %old : $self;
167}
168
169sub _checkLevel {
170    my $level = shift;
171    my $key   = shift; # 'level' or 'backwards'
172    MinLevel <= $level or croak sprintf
173	"Illegal level %d (in value for key '%s') lower than %d.",
174	    $level, $key, MinLevel;
175    $level <= MaxLevel or croak sprintf
176	"Unsupported level %d (in value for key '%s') higher than %d.",
177	    $level, $key, MaxLevel;
178}
179
180my %DerivCode = (
181    8 => \&_derivCE_8,
182    9 => \&_derivCE_9,
183   11 => \&_derivCE_9, # 11 == 9
184   14 => \&_derivCE_14,
185   16 => \&_derivCE_14, # 16 == 14
186   18 => \&_derivCE_18,
187   20 => \&_derivCE_20,
188   22 => \&_derivCE_22,
189   24 => \&_derivCE_24,
190   26 => \&_derivCE_24, # 26 == 24
191   28 => \&_derivCE_24, # 28 == 24
192   30 => \&_derivCE_24, # 30 == 24
193   32 => \&_derivCE_32,
194   34 => \&_derivCE_34,
195   36 => \&_derivCE_36,
196   38 => \&_derivCE_38,
197   40 => \&_derivCE_40,
198   41 => \&_derivCE_40, # 41 == 40
199   43 => \&_derivCE_43,
200);
201
202sub checkCollator {
203    my $self = shift;
204    _checkLevel($self->{level}, 'level');
205
206    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
207	or croak "Illegal UCA version (passed $self->{UCA_Version}).";
208
209    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
210				$self->{alternateTable} || 'shifted';
211    $self->{variable} = $self->{alternate} = lc($self->{variable});
212    exists $VariableOK{ $self->{variable} }
213	or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
214
215    if (! defined $self->{backwards}) {
216	$self->{backwardsFlag} = 0;
217    } elsif (! ref $self->{backwards}) {
218	_checkLevel($self->{backwards}, 'backwards');
219	$self->{backwardsFlag} = 1 << $self->{backwards};
220    } else {
221	my %level;
222	$self->{backwardsFlag} = 0;
223	for my $b (@{ $self->{backwards} }) {
224	    _checkLevel($b, 'backwards');
225	    $level{$b} = 1;
226	}
227	for my $v (sort keys %level) {
228	    $self->{backwardsFlag} += 1 << $v;
229	}
230    }
231
232    defined $self->{rearrange} or $self->{rearrange} = [];
233    ref $self->{rearrange}
234	or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
235
236    # keys of $self->{rearrangeHash} are $self->{rearrange}.
237    $self->{rearrangeHash} = undef;
238
239    if (@{ $self->{rearrange} }) {
240	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
241    }
242
243    $self->{normCode} = undef;
244
245    if (defined $self->{normalization}) {
246	eval { require Unicode::Normalize };
247	$@ and croak "Unicode::Normalize is required to normalize strings";
248
249	$CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
250
251	if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
252	    $self->{normCode} = \&Unicode::Normalize::NFD;
253	}
254	elsif ($self->{normalization} ne 'prenormalized') {
255	    my $norm = $self->{normalization};
256	    $self->{normCode} = sub {
257		Unicode::Normalize::normalize($norm, shift);
258	    };
259	    eval { $self->{normCode}->("") }; # try
260	    $@ and croak "$PACKAGE unknown normalization form name: $norm";
261	}
262    }
263    return;
264}
265
266sub new
267{
268    my $class = shift;
269    my $self = bless { @_ }, $class;
270
271### begin XS only ###
272    if (! exists $self->{table}     && !defined $self->{rewrite} &&
273	!defined $self->{undefName} && !defined $self->{ignoreName} &&
274	!defined $self->{undefChar} && !defined $self->{ignoreChar}) {
275	$self->{__useXS} = \&_fetch_simple;
276    } else {
277	$self->{__useXS} = undef;
278    }
279### end XS only ###
280
281    # keys of $self->{suppressHash} are $self->{suppress}.
282    if ($self->{suppress} && @{ $self->{suppress} }) {
283	@{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
284    } # before read_table()
285
286    # If undef is passed explicitly, no file is read.
287    $self->{table} = $KeyFile if ! exists $self->{table};
288    $self->read_table() if defined $self->{table};
289
290    if ($self->{entry}) {
291	while ($self->{entry} =~ /([^\n]+)/g) {
292	    $self->parseEntry($1, TRUE);
293	}
294    }
295
296    # only in new(), not in change()
297    $self->{level} ||= MaxLevel;
298    $self->{UCA_Version} ||= UCA_Version();
299
300    $self->{overrideHangul} = FALSE
301	if ! exists $self->{overrideHangul};
302    $self->{overrideCJK} = FALSE
303	if ! exists $self->{overrideCJK};
304    $self->{normalization} = 'NFD'
305	if ! exists $self->{normalization};
306    $self->{rearrange} = $self->{rearrangeTable} ||
307	($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
308	if ! exists $self->{rearrange};
309    $self->{backwards} = $self->{backwardsTable}
310	if ! exists $self->{backwards};
311    exists $self->{long_contraction} or $self->{long_contraction}
312	= 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
313
314    # checkCollator() will be called in change()
315    $self->checkCollator();
316
317    return $self;
318}
319
320sub parseAtmark {
321    my $self = shift;
322    my $line = shift; # after s/^\s*\@//
323
324    if ($line =~ /^version\s*(\S*)/) {
325	$self->{versionTable} ||= $1;
326    }
327    elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
328	$self->{variableTable} ||= $1;
329    }
330    elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
331	$self->{alternateTable} ||= $1;
332    }
333    elsif ($line =~ /^backwards\s+(\S*)/) {
334	push @{ $self->{backwardsTable} }, $1;
335    }
336    elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
337	push @{ $self->{forwardsTable} }, $1;
338    }
339    elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
340	push @{ $self->{rearrangeTable} }, _getHexArray($1);
341    }
342}
343
344sub read_table {
345    my $self = shift;
346
347### begin XS only ###
348    if ($self->{__useXS}) {
349	my @rest = _fetch_rest(); # complex matter need to parse
350	for my $line (@rest) {
351	    next if $line =~ /^\s*#/;
352
353	    if ($line =~ s/^\s*\@//) {
354		$self->parseAtmark($line);
355	    } else {
356		$self->parseEntry($line);
357	    }
358	}
359	return;
360    }
361### end XS only ###
362
363    my($f, $fh);
364    foreach my $d (@INC) {
365	$f = File::Spec->catfile($d, @Path, $self->{table});
366	last if open($fh, $f);
367	$f = undef;
368    }
369    if (!defined $f) {
370	$f = File::Spec->catfile(@Path, $self->{table});
371	croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
372    }
373
374    while (my $line = <$fh>) {
375	next if $line =~ /^\s*#/;
376
377	if ($line =~ s/^\s*\@//) {
378	    $self->parseAtmark($line);
379	} else {
380	    $self->parseEntry($line);
381	}
382    }
383    close $fh;
384}
385
386
387##
388## get $line, parse it, and write an entry in $self
389##
390sub parseEntry
391{
392    my $self = shift;
393    my $line = shift;
394    my $tailoring = shift;
395    my($name, $entry, @uv, @key);
396
397    if (defined $self->{rewrite}) {
398	$line = $self->{rewrite}->($line);
399    }
400
401    return if $line !~ /^\s*[0-9A-Fa-f]/;
402
403    # removes comment and gets name
404    $name = $1
405	if $line =~ s/[#%]\s*(.*)//;
406    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
407
408    # gets element
409    my($e, $k) = split /;/, $line;
410    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
411	if ! $k;
412
413    @uv = _getHexArray($e);
414    return if !@uv;
415    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
416		  exists $self->{suppressHash}{$uv[0]};
417    $entry = join(CODE_SEP, @uv); # in JCPS
418
419    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
420	my $ele = pack_U(@uv);
421
422	# regarded as if it were not stored in the table
423	return
424	    if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
425
426	# replaced as completely ignorable
427	$k = '[.0000.0000.0000.0000]'
428	    if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
429    }
430
431    # replaced as completely ignorable
432    $k = '[.0000.0000.0000.0000]'
433	if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
434
435    my $is_L3_ignorable = TRUE;
436
437    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
438	my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
439	my @wt = _getHexArray($arr);
440	push @key, pack(VCE_TEMPLATE, $var, @wt);
441	$is_L3_ignorable = FALSE
442	    if $wt[0] || $wt[1] || $wt[2];
443	# Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
444	# is completely ignorable.
445	# For expansion, an entry $is_L3_ignorable
446	# if and only if "all" CEs are [.0000.0000.0000].
447    }
448
449    # mapping: be an array ref or not exists (any false value is disallowed)
450    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
451
452    # maxlength: be more than 1 or not exists (any false value is disallowed)
453    if (@uv > 1) {
454	if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
455	    $self->{maxlength}{$uv[0]} = @uv;
456	}
457    }
458
459    # contraction: be 1 or not exists (any false value is disallowed)
460    while (@uv > 2) {
461	pop @uv;
462	my $fake_entry = join(CODE_SEP, @uv); # in JCPS
463	$self->{contraction}{$fake_entry} = 1;
464    }
465}
466
467
468sub viewSortKey
469{
470    my $self = shift;
471    my $str  = shift;
472    $self->visualizeSortKey($self->getSortKey($str));
473}
474
475
476sub process
477{
478    my $self = shift;
479    my $str  = shift;
480    my $prep = $self->{preprocess};
481    my $norm = $self->{normCode};
482
483    $str = &$prep($str) if ref $prep;
484    $str = &$norm($str) if ref $norm;
485    return $str;
486}
487
488##
489## arrayref of JCPS   = splitEnt(string to be collated)
490## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
491##
492sub splitEnt
493{
494    my $self = shift;
495    my $str  = shift;
496    my $wLen = shift; # with Length
497
498    my $map  = $self->{mapping};
499    my $max  = $self->{maxlength};
500    my $reH  = $self->{rearrangeHash};
501    my $vers = $self->{UCA_Version};
502    my $ver9 = $vers >= 9 && $vers <= 11;
503    my $long = $self->{long_contraction};
504    my $uXS  = $self->{__useXS}; ### XS only
505
506    my @buf;
507
508    # get array of Unicode code point of string.
509    my @src = unpack_U($str);
510
511    # rearrangement:
512    # Character positions are not kept if rearranged,
513    # then neglected if $wLen is true.
514    if ($reH && ! $wLen) {
515	for (my $i = 0; $i < @src; $i++) {
516	    if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
517		($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
518		$i++;
519	    }
520	}
521    }
522
523    # remove a code point marked as a completely ignorable.
524    for (my $i = 0; $i < @src; $i++) {
525	if ($vers <= 20 && _isIllegal($src[$i])) {
526	    $src[$i] = undef;
527	} elsif ($ver9) {
528	    $src[$i] = undef if exists $map->{ $src[$i] }
529			   ? @{ $map->{ $src[$i] } } == 0
530			   : $uXS && _ignorable_simple($src[$i]); ### XS only
531	}
532    }
533
534    for (my $i = 0; $i < @src; $i++) {
535	my $jcps = $src[$i];
536
537	# skip removed code point
538	if (! defined $jcps) {
539	    if ($wLen && @buf) {
540		$buf[-1][2] = $i + 1;
541	    }
542	    next;
543	}
544
545	my $i_orig = $i;
546
547	# find contraction
548	if (exists $max->{$jcps}) {
549	    my $temp_jcps = $jcps;
550	    my $jcpsLen = 1;
551	    my $maxLen = $max->{$jcps};
552
553	    for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
554		next if ! defined $src[$p];
555		$temp_jcps .= CODE_SEP . $src[$p];
556		$jcpsLen++;
557		if (exists $map->{$temp_jcps}) {
558		    $jcps = $temp_jcps;
559		    $i = $p;
560		}
561	    }
562
563	# discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
564	# This process requires Unicode::Normalize.
565	# If "normalization" is undef, here should be skipped *always*
566	# (in spite of bool value of $CVgetCombinClass),
567	# since canonical ordering cannot be expected.
568	# Blocked combining character should not be contracted.
569
570	    # $self->{normCode} is false in the case of "prenormalized".
571	    if ($self->{normalization}) {
572		my $cont = $self->{contraction};
573		my $preCC = 0;
574		my $preCC_uc = 0;
575		my $jcps_uc = $jcps;
576		my(@out, @out_uc);
577
578		for (my $p = $i + 1; $p < @src; $p++) {
579		    next if ! defined $src[$p];
580		    my $curCC = $CVgetCombinClass->($src[$p]);
581		    last unless $curCC;
582		    my $tail = CODE_SEP . $src[$p];
583
584		    if ($preCC != $curCC && exists $map->{$jcps.$tail}) {
585			$jcps .= $tail;
586			push @out, $p;
587		    } else {
588			$preCC = $curCC;
589		    }
590
591		    next if !$long;
592
593		    if ($preCC_uc != $curCC &&
594			    (exists $map->{$jcps_uc.$tail} ||
595			    exists $cont->{$jcps_uc.$tail})) {
596			$jcps_uc .= $tail;
597			push @out_uc, $p;
598		    } else {
599			$preCC_uc = $curCC;
600		    }
601		}
602
603		if (@out_uc && exists $map->{$jcps_uc}) {
604		    $jcps = $jcps_uc;
605		    $src[$_] = undef for @out_uc;
606		} else {
607		    $src[$_] = undef for @out;
608		}
609	    }
610	}
611
612	# skip completely ignorable
613	if (exists $map->{$jcps} ? @{ $map->{$jcps} } == 0 :
614	    $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
615	    if ($wLen && @buf) {
616		$buf[-1][2] = $i + 1;
617	    }
618	    next;
619	}
620
621	push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
622    }
623    return \@buf;
624}
625
626##
627## VCE = _pack_override(input, codepoint, derivCode)
628##
629sub _pack_override ($$$) {
630    my $r = shift;
631    my $u = shift;
632    my $der = shift;
633
634    if (ref $r) {
635	return pack(VCE_TEMPLATE, NON_VAR, @$r);
636    } elsif (defined $r) {
637	return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
638    } else {
639	$u = 0xFFFD if 0x10FFFF < $u;
640	return $der->($u);
641    }
642}
643
644##
645## list of VCE = getWt(JCPS)
646##
647sub getWt
648{
649    my $self = shift;
650    my $u    = shift;
651    my $map  = $self->{mapping};
652    my $der  = $self->{derivCode};
653    my $out  = $self->{overrideOut};
654    my $uXS  = $self->{__useXS}; ### XS only
655
656    return if !defined $u;
657    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
658    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
659    $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;
660
661    my @ce;
662    if (exists $map->{$u}) {
663	@ce = @{ $map->{$u} }; # $u may be a contraction
664### begin XS only ###
665    } elsif ($uXS && _exists_simple($u)) {
666	@ce = _fetch_simple($u);
667### end XS only ###
668    } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
669	my $hang = $self->{overrideHangul};
670	if ($hang) {
671	    @ce = map _pack_override($_, $u, $der), $hang->($u);
672	} elsif (!defined $hang) {
673	    @ce = $der->($u);
674	} else {
675	    my $max  = $self->{maxlength};
676	    my @decH = _decompHangul($u);
677
678	    if (@decH == 2) {
679		my $contract = join(CODE_SEP, @decH);
680		@decH = ($contract) if exists $map->{$contract};
681	    } else { # must be <@decH == 3>
682		if (exists $max->{$decH[0]}) {
683		    my $contract = join(CODE_SEP, @decH);
684		    if (exists $map->{$contract}) {
685			@decH = ($contract);
686		    } else {
687			$contract = join(CODE_SEP, @decH[0,1]);
688			exists $map->{$contract} and @decH = ($contract, $decH[2]);
689		    }
690		    # even if V's ignorable, LT contraction is not supported.
691		    # If such a situation were required, NFD should be used.
692		}
693		if (@decH == 3 && exists $max->{$decH[1]}) {
694		    my $contract = join(CODE_SEP, @decH[1,2]);
695		    exists $map->{$contract} and @decH = ($decH[0], $contract);
696		}
697	    }
698
699	    @ce = map({
700		    exists $map->{$_} ? @{ $map->{$_} } :
701		$uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
702		    $der->($_);
703		} @decH);
704	}
705    } elsif ($out && 0x10FFFF < $u) {
706	@ce = map _pack_override($_, $u, $der), $out->($u);
707    } else {
708	my $cjk  = $self->{overrideCJK};
709	my $vers = $self->{UCA_Version};
710	if ($cjk && _isUIdeo($u, $vers)) {
711	    @ce = map _pack_override($_, $u, $der), $cjk->($u);
712	} elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
713	    @ce = _uideoCE_8($u);
714	} else {
715	    @ce = $der->($u);
716	}
717    }
718    return map $self->varCE($_), @ce;
719}
720
721
722##
723## string sortkey = getSortKey(string arg)
724##
725sub getSortKey
726{
727    my $self = shift;
728    my $orig = shift;
729    my $str  = $self->process($orig);
730    my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
731    my $vers = $self->{UCA_Version};
732    my $term = $self->{hangul_terminator};
733    my $lev  = $self->{level};
734    my $iden = $self->{identical};
735
736    my @buf; # weight arrays
737    if ($term) {
738	my $preHST = '';
739	my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
740	foreach my $jcps (@$rEnt) {
741	    # weird things like VL, TL-contraction are not considered!
742	    my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
743	    if ($preHST && !$curHST || # hangul before non-hangul
744		$preHST =~ /L\z/ && $curHST =~ /^T/ ||
745		$preHST =~ /V\z/ && $curHST =~ /^L/ ||
746		$preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
747		push @buf, $termCE;
748	    }
749	    $preHST = $curHST;
750	    push @buf, $self->getWt($jcps);
751	}
752	push @buf, $termCE if $preHST; # end at hangul
753    } else {
754	foreach my $jcps (@$rEnt) {
755	    push @buf, $self->getWt($jcps);
756	}
757    }
758
759    my $rkey = $self->mk_SortKey(\@buf); ### XS only
760
761    if ($iden || $vers >= 26 && $lev == MaxLevel) {
762	$rkey .= LEVEL_SEP;
763	$rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
764    }
765    return $rkey;
766}
767
768
769##
770## int compare = cmp(string a, string b)
771##
772sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
773sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
774sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
775sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
776sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
777sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
778sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
779
780##
781## list[strings] sorted = sort(list[strings] arg)
782##
783sub sort {
784    my $obj = shift;
785    return
786	map { $_->[1] }
787	    sort{ $a->[0] cmp $b->[0] }
788		map [ $obj->getSortKey($_), $_ ], @_;
789}
790
791
792##
793## bool _nonIgnorAtLevel(arrayref weights, int level)
794##
795sub _nonIgnorAtLevel($$)
796{
797    my $wt = shift;
798    return if ! defined $wt;
799    my $lv = shift;
800    return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
801}
802
803##
804## bool _eqArray(
805##    arrayref of arrayref[weights] source,
806##    arrayref of arrayref[weights] substr,
807##    int level)
808## * comparison of graphemes vs graphemes.
809##   @$source >= @$substr must be true (check it before call this);
810##
811sub _eqArray($$$)
812{
813    my $source = shift;
814    my $substr = shift;
815    my $lev = shift;
816
817    for my $g (0..@$substr-1){
818	# Do the $g'th graphemes have the same number of AV weights?
819	return if @{ $source->[$g] } != @{ $substr->[$g] };
820
821	for my $w (0..@{ $substr->[$g] }-1) {
822	    for my $v (0..$lev-1) {
823		return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
824	    }
825	}
826    }
827    return 1;
828}
829
830##
831## (int position, int length)
832## int position = index(string, substring, position, [undoc'ed global])
833##
834## With "global" (only for the list context),
835##  returns list of arrayref[position, length].
836##
837sub index
838{
839    my $self = shift;
840    $self->{preprocess} and
841	croak "Don't use Preprocess with index(), match(), etc.";
842    $self->{normCode} and
843	croak "Don't use Normalization with index(), match(), etc.";
844
845    my $str  = shift;
846    my $len  = length($str);
847    my $sub  = shift;
848    my $subE = $self->splitEnt($sub);
849    my $pos  = @_ ? shift : 0;
850       $pos  = 0 if $pos < 0;
851    my $glob = shift;
852
853    my $lev  = $self->{level};
854    my $v2i  = $self->{UCA_Version} >= 9 &&
855		$self->{variable} ne 'non-ignorable';
856
857    if (! @$subE) {
858	my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
859	return $glob
860	    ? map([$_, 0], $temp..$len)
861	    : wantarray ? ($temp,0) : $temp;
862    }
863    $len < $pos
864	and return wantarray ? () : NOMATCHPOS;
865    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
866    @$strE
867	or return wantarray ? () : NOMATCHPOS;
868
869    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
870
871    my $last_is_variable;
872    for my $vwt (map $self->getWt($_), @$subE) {
873	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
874	my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
875
876	# "Ignorable (L1, L2) after Variable" since track. v. 9
877	if ($v2i) {
878	    if ($var) {
879		$last_is_variable = TRUE;
880	    }
881	    elsif (!$wt[0]) { # ignorable
882		$to_be_pushed = FALSE if $last_is_variable;
883	    }
884	    else {
885		$last_is_variable = FALSE;
886	    }
887	}
888
889	if (@subWt && !$var && !$wt[0]) {
890	    push @{ $subWt[-1] }, \@wt if $to_be_pushed;
891	} elsif ($to_be_pushed) {
892	    push @subWt, [ \@wt ];
893	}
894	# else ===> skipped
895    }
896
897    my $count = 0;
898    my $end = @$strE - 1;
899
900    $last_is_variable = FALSE; # reuse
901    for (my $i = 0; $i <= $end; ) { # no $i++
902	my $found_base = 0;
903
904	# fetch a grapheme
905	while ($i <= $end && $found_base == 0) {
906	    for my $vwt ($self->getWt($strE->[$i][0])) {
907		my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
908		my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
909
910		# "Ignorable (L1, L2) after Variable" since track. v. 9
911		if ($v2i) {
912		    if ($var) {
913			$last_is_variable = TRUE;
914		    }
915		    elsif (!$wt[0]) { # ignorable
916			$to_be_pushed = FALSE if $last_is_variable;
917		    }
918		    else {
919			$last_is_variable = FALSE;
920		    }
921		}
922
923		if (@strWt && !$var && !$wt[0]) {
924		    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
925		    $finPos[-1] = $strE->[$i][2];
926		} elsif ($to_be_pushed) {
927		    push @strWt, [ \@wt ];
928		    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
929		    $finPos[-1] = NOMATCHPOS if $found_base;
930		    push @finPos, $strE->[$i][2];
931		    $found_base++;
932		}
933		# else ===> no-op
934	    }
935	    $i++;
936	}
937
938	# try to match
939	while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
940	    if ($iniPos[0] != NOMATCHPOS &&
941		    $finPos[$#subWt] != NOMATCHPOS &&
942			_eqArray(\@strWt, \@subWt, $lev)) {
943		my $temp = $iniPos[0] + $pos;
944
945		if ($glob) {
946		    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
947		    splice @strWt,  0, $#subWt;
948		    splice @iniPos, 0, $#subWt;
949		    splice @finPos, 0, $#subWt;
950		}
951		else {
952		    return wantarray
953			? ($temp, $finPos[$#subWt] - $iniPos[0])
954			:  $temp;
955		}
956	    }
957	    shift @strWt;
958	    shift @iniPos;
959	    shift @finPos;
960	}
961    }
962
963    return $glob
964	? @g_ret
965	: wantarray ? () : NOMATCHPOS;
966}
967
968##
969## scalarref to matching part = match(string, substring)
970##
971sub match
972{
973    my $self = shift;
974    if (my($pos,$len) = $self->index($_[0], $_[1])) {
975	my $temp = substr($_[0], $pos, $len);
976	return wantarray ? $temp : \$temp;
977	# An lvalue ref \substr should be avoided,
978	# since its value is affected by modification of its referent.
979    }
980    else {
981	return;
982    }
983}
984
985##
986## arrayref matching parts = gmatch(string, substring)
987##
988sub gmatch
989{
990    my $self = shift;
991    my $str  = shift;
992    my $sub  = shift;
993    return map substr($str, $_->[0], $_->[1]),
994		$self->index($str, $sub, 0, 'g');
995}
996
997##
998## bool subst'ed = subst(string, substring, replace)
999##
1000sub subst
1001{
1002    my $self = shift;
1003    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1004
1005    if (my($pos,$len) = $self->index($_[0], $_[1])) {
1006	if ($code) {
1007	    my $mat = substr($_[0], $pos, $len);
1008	    substr($_[0], $pos, $len, $code->($mat));
1009	} else {
1010	    substr($_[0], $pos, $len, $_[2]);
1011	}
1012	return TRUE;
1013    }
1014    else {
1015	return FALSE;
1016    }
1017}
1018
1019##
1020## int count = gsubst(string, substring, replace)
1021##
1022sub gsubst
1023{
1024    my $self = shift;
1025    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1026    my $cnt = 0;
1027
1028    # Replacement is carried out from the end, then use reverse.
1029    for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1030	if ($code) {
1031	    my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1032	    substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1033	} else {
1034	    substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1035	}
1036	$cnt++;
1037    }
1038    return $cnt;
1039}
1040
10411;
1042__END__
1043
1044=head1 NAME
1045
1046Unicode::Collate - Unicode Collation Algorithm
1047
1048=head1 SYNOPSIS
1049
1050  use Unicode::Collate;
1051
1052  #construct
1053  $Collator = Unicode::Collate->new(%tailoring);
1054
1055  #sort
1056  @sorted = $Collator->sort(@not_sorted);
1057
1058  #compare
1059  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1060
1061B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1062according to Perl's Unicode support. See L<perlunicode>,
1063L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1064Otherwise you can use C<preprocess> or should decode them before.
1065
1066=head1 DESCRIPTION
1067
1068This module is an implementation of Unicode Technical Standard #10
1069(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1070
1071=head2 Constructor and Tailoring
1072
1073The C<new> method returns a collator object. If new() is called
1074with no parameters, the collator should do the default collation.
1075
1076   $Collator = Unicode::Collate->new(
1077      UCA_Version => $UCA_Version,
1078      alternate => $alternate, # alias for 'variable'
1079      backwards => $levelNumber, # or \@levelNumbers
1080      entry => $element,
1081      hangul_terminator => $term_primary_weight,
1082      highestFFFF => $bool,
1083      identical => $bool,
1084      ignoreName => qr/$ignoreName/,
1085      ignoreChar => qr/$ignoreChar/,
1086      ignore_level2 => $bool,
1087      katakana_before_hiragana => $bool,
1088      level => $collationLevel,
1089      long_contraction => $bool,
1090      minimalFFFE => $bool,
1091      normalization  => $normalization_form,
1092      overrideCJK => \&overrideCJK,
1093      overrideHangul => \&overrideHangul,
1094      preprocess => \&preprocess,
1095      rearrange => \@charList,
1096      rewrite => \&rewrite,
1097      suppress => \@charList,
1098      table => $filename,
1099      undefName => qr/$undefName/,
1100      undefChar => qr/$undefChar/,
1101      upper_before_lower => $bool,
1102      variable => $variable,
1103   );
1104
1105=over 4
1106
1107=item UCA_Version
1108
1109If the revision (previously "tracking version") number of UCA is given,
1110behavior of that revision is emulated on collating.
1111If omitted, the return value of C<UCA_Version()> is used.
1112
1113The following revisions are supported.  The default is 43.
1114
1115     UCA       Unicode Standard         DUCET (@version)
1116   -------------------------------------------------------
1117      8              3.1                3.0.1 (3.0.1d9)
1118      9     3.1 with Corrigendum 3      3.1.1
1119     11             4.0.0
1120     14             4.1.0
1121     16             5.0.0
1122     18             5.1.0
1123     20             5.2.0
1124     22             6.0.0
1125     24             6.1.0
1126     26             6.2.0
1127     28             6.3.0
1128     30             7.0.0
1129     32             8.0.0
1130     34             9.0.0
1131     36            10.0.0
1132     38            11.0.0
1133     40            12.0.0
1134     41            12.1.0
1135     43            13.0.0
1136
1137* See below for C<long_contraction> with C<UCA_Version> 22 and 24.
1138
1139* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1140since C<UCA_Version> 22.
1141
1142* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
1143and can be overridden since C<UCA_Version> 22.
1144
1145* Fully ignorable characters were ignored, and would not interrupt
1146contractions with C<UCA_Version> 9 and 11.
1147
1148* Treatment of ignorables after variables and some behaviors
1149were changed at C<UCA_Version> 9.
1150
1151* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1152depend on C<UCA_Version>.
1153
1154* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1155C<hangul_terminator>.
1156
1157=item alternate
1158
1159-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1160
1161For backward compatibility, C<alternate> (old name) can be used
1162as an alias for C<variable>.
1163
1164=item backwards
1165
1166-- see 3.4 Backward Accents, UTS #10.
1167
1168     backwards => $levelNumber or \@levelNumbers
1169
1170Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1171If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1172forwards at all the levels.
1173
1174=item entry
1175
1176-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
1177
1178If the same character (or a sequence of characters) exists
1179in the collation element table through C<table>,
1180mapping to collation elements is overridden.
1181If it does not exist, the mapping is defined additionally.
1182
1183    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11840063 0068 ; [.0E6A.0020.0002.0063] # ch
11850043 0068 ; [.0E6A.0020.0007.0043] # Ch
11860043 0048 ; [.0E6A.0020.0008.0043] # CH
1187006C 006C ; [.0F4C.0020.0002.006C] # ll
1188004C 006C ; [.0F4C.0020.0007.004C] # Ll
1189004C 004C ; [.0F4C.0020.0008.004C] # LL
119000F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1191006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
119200D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1193004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1194ENTRY
1195
1196    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
119700E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
119800C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1199ENTRY
1200
1201B<NOTE:> The code point in the UCA file format (before C<';'>)
1202B<must> be a Unicode code point (defined as hexadecimal),
1203but not a native code point.
1204So C<0063> must always denote C<U+0063>,
1205but not a character of C<"\x63">.
1206
1207Weighting may vary depending on collation element table.
1208So ensure the weights defined in C<entry> will be consistent with
1209those in the collation element table loaded via C<table>.
1210
1211In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1212and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1213(as a value between C<0E60> and C<0E6D>)
1214makes ordering as C<C E<lt> CH E<lt> D>.
1215Exactly speaking DUCET already has some characters between C<C> and C<D>:
1216C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1217C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1218and C<c-curl> (C<U+0255>) with C<0E69>.
1219Then primary weight C<0E6A> for C<CH> makes C<CH>
1220ordered between C<c-curl> and C<D>.
1221
1222=item hangul_terminator
1223
1224-- see 7.1.4 Trailing Weights, UTS #10.
1225
1226If a true value is given (non-zero but should be positive),
1227it will be added as a terminator primary weight to the end of
1228every standard Hangul syllable. Secondary and any higher weights
1229for terminator are set to zero.
1230If the value is false or C<hangul_terminator> key does not exist,
1231insertion of terminator weights will not be performed.
1232
1233Boundaries of Hangul syllables are determined
1234according to conjoining Jamo behavior in F<the Unicode Standard>
1235and F<HangulSyllableType.txt>.
1236
1237B<Implementation Note:>
1238(1) For expansion mapping (Unicode character mapped
1239to a sequence of collation elements), a terminator will not be added
1240between collation elements, even if Hangul syllable boundary exists there.
1241Addition of terminator is restricted to the next position
1242to the last collation element.
1243
1244(2) Non-conjoining Hangul letters
1245(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1246automatically terminated with a terminator primary weight.
1247These characters may need terminator included in a collation element
1248table beforehand.
1249
1250=item highestFFFF
1251
1252-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation.
1253
1254If the parameter is made true, C<U+FFFF> has a highest primary weight.
1255When a boolean of C<$coll-E<gt>ge($str, "abc")> and
1256C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
1257begins with C<"abc">, or another primary equivalent.
1258C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1259such as C<"abc\x{FFFF}xyz">.
1260
1261C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
1262almost, but the latter has a problem that you should know which letter is
1263next to C<c>. For a certain language where C<ch> as the next letter,
1264C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
1265
1266Note:
1267This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
1268Any other character than C<U+FFFF> can be tailored by C<entry>.
1269
1270=item identical
1271
1272-- see A.3 Deterministic Comparison, UTS #10.
1273
1274By default, strings whose weights are equal should be equal,
1275even though their code points are not equal.
1276Completely ignorable characters are ignored.
1277
1278If the parameter is made true, a final, tie-breaking level is used.
1279If no difference of weights is found after the comparison through
1280all the level specified by C<level>, the comparison with code points
1281will be performed.
1282For the tie-breaking comparison, the sort key has code points
1283of the original string appended.
1284Completely ignorable characters are not ignored.
1285
1286If C<preprocess> and/or C<normalization> is applied, the code points
1287of the string after them (in NFD by default) are used.
1288
1289=item ignoreChar
1290
1291=item ignoreName
1292
1293-- see 3.6 Variable Weighting, UTS #10.
1294
1295Makes the entry in the table completely ignorable;
1296i.e. as if the weights were zero at all level.
1297
1298Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1299will be ignored. Through C<ignoreName>, any character whose name
1300(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1301will be ignored.
1302
1303E.g. when 'a' and 'e' are ignorable,
1304'element' is equal to 'lament' (or 'lmnt').
1305
1306=item ignore_level2
1307
1308-- see 5.1 Parametric Tailoring, UTS #10.
1309
1310By default, case-sensitive comparison (that is level 3 difference)
1311won't ignore accents (that is level 2 difference).
1312
1313If the parameter is made true, accents (and other primary ignorable
1314characters) are ignored, even though cases are taken into account.
1315
1316B<NOTE>: C<level> should be 3 or greater.
1317
1318=item katakana_before_hiragana
1319
1320-- see 7.2 Tertiary Weight Table, UTS #10.
1321
1322By default, hiragana is before katakana.
1323If the parameter is made true, this is reversed.
1324
1325B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1326distinctions must occur in level 3, and their weights at level 3 must be
1327same as those mentioned in 7.3.1, UTS #10.
1328If you define your collation elements which violate this requirement,
1329this parameter does not work validly.
1330
1331=item level
1332
1333-- see 4.3 Form Sort Key, UTS #10.
1334
1335Set the maximum level.
1336Any higher levels than the specified one are ignored.
1337
1338  Level 1: alphabetic ordering
1339  Level 2: diacritic ordering
1340  Level 3: case ordering
1341  Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1342
1343  ex.level => 2,
1344
1345If omitted, the maximum is the 4th.
1346
1347B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1348But this module only uses weights within 0xFFFF.
1349When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1350and 'shift-trimmed'), the level 4 may be unreliable.
1351
1352See also C<identical>.
1353
1354=item long_contraction
1355
1356-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
1357
1358If the parameter is made true, for a contraction with three or more
1359characters (here nicknamed "long contraction"), initial substrings
1360will be handled.
1361For example, a contraction ABC, where A is a starter, and B and C
1362are non-starters (character with non-zero combining character class),
1363will be detected even if there is not AB as a contraction.
1364
1365B<Default:> Usually false.
1366If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
1367is not specified in C<new()>, a true value is set implicitly.
1368This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
1369
1370C<change()> handles C<long_contraction> explicitly only.
1371If C<long_contraction> is not specified in C<change()>, even though
1372C<UCA_Version> is changed, C<long_contraction> will not be changed.
1373
1374B<Limitation:> Scanning non-starters is one-way (no back tracking).
1375If AB is found but not ABC is not found, other long contraction where
1376the first character is A and the second is not B may not be found.
1377
1378Under C<(normalization =E<gt> undef)>, detection step of discontiguous
1379contractions will be skipped.
1380
1381B<Note:> The following contractions in DUCET are not considered
1382in steps S2.1.1 to S2.1.3, where they are discontiguous.
1383
1384    0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
1385    0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
1386
1387For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
1388(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
1389In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
1390instead of C<0FB2 0F71 0F80>.
1391Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
1392contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
1393
1394=item minimalFFFE
1395
1396-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation.
1397
1398If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1399The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1400first compares C<$a1> and C<$b1> at level 1, and
1401then C<$a2> and C<$b2> at level 1, as followed.
1402
1403        "ab\x{FFFE}a"
1404        "Ab\x{FFFE}a"
1405        "ab\x{FFFE}c"
1406        "Ab\x{FFFE}c"
1407        "ab\x{FFFE}xyz"
1408        "abc\x{FFFE}def"
1409        "abc\x{FFFE}xYz"
1410        "aBc\x{FFFE}xyz"
1411        "abcX\x{FFFE}def"
1412        "abcx\x{FFFE}xyz"
1413        "b\x{FFFE}aaa"
1414        "bbb\x{FFFE}a"
1415
1416Note:
1417This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
1418Any other character than C<U+FFFE> can be tailored by C<entry>.
1419
1420=item normalization
1421
1422-- see 4.1 Normalize, UTS #10.
1423
1424If specified, strings are normalized before preparation of sort keys
1425(the normalization is executed after preprocess).
1426
1427A form name C<Unicode::Normalize::normalize()> accepts will be applied
1428as C<$normalization_form>.
1429Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1430See C<Unicode::Normalize::normalize()> for detail.
1431If omitted, C<'NFD'> is used.
1432
1433C<normalization> is performed after C<preprocess> (if defined).
1434
1435Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1436though they are not concerned with C<Unicode::Normalize::normalize()>.
1437
1438If C<undef> (not a string C<"undef">) is passed explicitly
1439as the value for this key,
1440any normalization is not carried out (this may make tailoring easier
1441if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1442only contiguous contractions are resolved;
1443e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1444C<A-cedilla-ring> would be primary equal to C<A>.
1445In this point,
1446C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1447B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1448
1449In the case of C<(normalization =E<gt> "prenormalized")>,
1450any normalization is not performed, but
1451discontiguous contractions with combining characters are performed.
1452Therefore
1453C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1454B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1455If source strings are finely prenormalized,
1456C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1457
1458Except C<(normalization =E<gt> undef)>,
1459B<Unicode::Normalize> is required (see also B<CAVEAT>).
1460
1461=item overrideCJK
1462
1463-- see 7.1 Derived Collation Elements, UTS #10.
1464
1465By default, CJK unified ideographs are ordered in Unicode codepoint
1466order, but those in the CJK Unified Ideographs block are less than
1467those in the CJK Unified Ideographs Extension A etc.
1468
1469    In the CJK Unified Ideographs block:
1470    U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1471    U+4E00..U+9FBB if UCA_Version is 14 or 16.
1472    U+4E00..U+9FC3 if UCA_Version is 18.
1473    U+4E00..U+9FCB if UCA_Version is 20 or 22.
1474    U+4E00..U+9FCC if UCA_Version is 24 to 30.
1475    U+4E00..U+9FD5 if UCA_Version is 32 or 34.
1476    U+4E00..U+9FEA if UCA_Version is 36.
1477    U+4E00..U+9FEF if UCA_Version is 38, 40 or 41.
1478    U+4E00..U+9FFC if UCA_Version is 43.
1479
1480    In the CJK Unified Ideographs Extension blocks:
1481    Ext.A (U+3400..U+4DB5)   if UCA_Version is  8 to 41.
1482    Ext.A (U+3400..U+4DBF)   if UCA_Version is 43.
1483    Ext.B (U+20000..U+2A6D6) if UCA_Version is  8 to 41.
1484    Ext.B (U+20000..U+2A6DD) if UCA_Version is 43.
1485    Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
1486    Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
1487    Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32 or later.
1488    Ext.F (U+2CEB0..U+2EBE0) if UCA_Version is 36 or later.
1489    Ext.G (U+30000..U+3134A) if UCA_Version is 43.
1490
1491Through C<overrideCJK>, ordering of CJK unified ideographs (including
1492extensions) can be overridden.
1493
1494ex. CJK unified ideographs in the JIS code point order.
1495
1496  overrideCJK => sub {
1497      my $u = shift;             # get a Unicode codepoint
1498      my $b = pack('n', $u);     # to UTF-16BE
1499      my $s = your_unicode_to_sjis_converter($b); # convert
1500      my $n = unpack('n', $s);   # convert sjis to short
1501      [ $n, 0x20, 0x2, $u ];     # return the collation element
1502  },
1503
1504The return value may be an arrayref of 1st to 4th weights as shown
1505above. The return value may be an integer as the primary weight
1506as shown below.  If C<undef> is returned, the default derived
1507collation element will be used.
1508
1509  overrideCJK => sub {
1510      my $u = shift;             # get a Unicode codepoint
1511      my $b = pack('n', $u);     # to UTF-16BE
1512      my $s = your_unicode_to_sjis_converter($b); # convert
1513      my $n = unpack('n', $s);   # convert sjis to short
1514      return $n;                 # return the primary weight
1515  },
1516
1517The return value may be a list containing zero or more of
1518an arrayref, an integer, or C<undef>.
1519
1520ex. ignores all CJK unified ideographs.
1521
1522  overrideCJK => sub {()}, # CODEREF returning empty list
1523
1524   # where ->eq("Pe\x{4E00}rl", "Perl") is true
1525   # as U+4E00 is a CJK unified ideograph and to be ignorable.
1526
1527If a false value (including C<undef>) is passed, C<overrideCJK>
1528has no effect.
1529C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
1530
1531But assignment of weight for CJK unified ideographs
1532in C<table> or C<entry> is still valid.
1533If C<undef> is passed explicitly as the value for this key,
1534weights for CJK unified ideographs are treated as undefined.
1535However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
1536has no special meaning.
1537
1538B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1539C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1540C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1541ideographs. But they can't be overridden via C<overrideCJK> when you use
1542DUCET, as the table includes weights for them. C<table> or C<entry> has
1543priority over C<overrideCJK>.
1544
1545=item overrideHangul
1546
1547-- see 7.1 Derived Collation Elements, UTS #10.
1548
1549By default, Hangul syllables are decomposed into Hangul Jamo,
1550even if C<(normalization =E<gt> undef)>.
1551But the mapping of Hangul syllables may be overridden.
1552
1553This parameter works like C<overrideCJK>, so see there for examples.
1554
1555If you want to override the mapping of Hangul syllables,
1556NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1557Hangul syllables before overriding. FCD may decompose Hangul syllables
1558as the case may be.
1559
1560If a false value (but not C<undef>) is passed, C<overrideHangul>
1561has no effect.
1562C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
1563
1564If C<undef> is passed explicitly as the value for this key,
1565weight for Hangul syllables is treated as undefined
1566without decomposition into Hangul Jamo.
1567But definition of weight for Hangul syllables
1568in C<table> or C<entry> is still valid.
1569
1570=item overrideOut
1571
1572-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
1573
1574Perl seems to allow out-of-range values (greater than 0x10FFFF).
1575By default, out-of-range values are replaced with C<U+FFFD>
1576(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
1577or ignored when C<UCA_Version> E<lt>= 20.
1578
1579When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
1580can be overridden. Though C<table> or C<entry> are available for them,
1581out-of-range values are too many.
1582
1583C<overrideOut> can perform it algorithmically.
1584This parameter works like C<overrideCJK>, so see there for examples.
1585
1586ex. ignores all out-of-range values.
1587
1588  overrideOut => sub {()}, # CODEREF returning empty list
1589
1590If a false value (including C<undef>) is passed, C<overrideOut>
1591has no effect.
1592C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
1593
1594B<NOTE ABOUT U+FFFD:>
1595
1596UCA recommends that out-of-range values should not be ignored for security
1597reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
1598However, C<U+FFFD> is wrongly mapped to a variable collation element
1599in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
1600ignored when C<variable> isn't C<Non-ignorable>.
1601
1602The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
1603see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
1604(7.1.4 Trailing Weights). Such a correction is reproduced by this.
1605
1606  overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
1607
1608This workaround is unnecessary since Unicode 6.3.0.
1609
1610=item preprocess
1611
1612-- see 5.4 Preprocessing, UTS #10.
1613
1614If specified, the coderef is used to preprocess each string
1615before the formation of sort keys.
1616
1617ex. dropping English articles, such as "a" or "the".
1618Then, "the pen" is before "a pencil".
1619
1620     preprocess => sub {
1621           my $str = shift;
1622           $str =~ s/\b(?:an?|the)\s+//gi;
1623           return $str;
1624        },
1625
1626C<preprocess> is performed before C<normalization> (if defined).
1627
1628ex. decoding strings in a legacy encoding such as shift-jis:
1629
1630    $sjis_collator = Unicode::Collate->new(
1631        preprocess => \&your_shiftjis_to_unicode_decoder,
1632    );
1633    @result = $sjis_collator->sort(@shiftjis_strings);
1634
1635B<Note:> Strings returned from the coderef will be interpreted
1636according to Perl's Unicode support. See L<perlunicode>,
1637L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1638
1639=item rearrange
1640
1641-- see 3.5 Rearrangement, UTS #10.
1642
1643Characters that are not coded in logical order and to be rearranged.
1644If C<UCA_Version> is equal to or less than 11, default is:
1645
1646    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1647
1648If you want to disallow any rearrangement, pass C<undef> or C<[]>
1649(a reference to empty list) as the value for this key.
1650
1651If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1652(i.e. no rearrangement).
1653
1654B<According to the version 9 of UCA, this parameter shall not be used;
1655but it is not warned at present.>
1656
1657=item rewrite
1658
1659If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1660The coderef will get each line, and then should return a rewritten line
1661according to the UCA file format.
1662If the coderef returns an empty line, the line will be skipped.
1663
1664e.g. any primary ignorable characters into tertiary ignorable:
1665
1666    rewrite => sub {
1667        my $line = shift;
1668        $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1669        return $line;
1670    },
1671
1672This example shows rewriting weights. C<rewrite> is allowed to
1673affect code points, weights, and the name.
1674
1675B<NOTE>: C<table> is available to use another table file;
1676preparing a modified table once would be more efficient than
1677rewriting lines on reading an unmodified table every time.
1678
1679=item suppress
1680
1681-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation.
1682
1683Contractions beginning with the specified characters are suppressed,
1684even if those contractions are defined in C<table>.
1685
1686An example for Russian and some languages using the Cyrillic script:
1687
1688    suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1689
1690where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1691
1692B<NOTE>: Contractions via C<entry> will not be suppressed.
1693
1694=item table
1695
1696-- see 3.8 Default Unicode Collation Element Table, UTS #10.
1697
1698You can use another collation element table if desired.
1699
1700The table file should locate in the F<Unicode/Collate> directory
1701on C<@INC>. Say, if the filename is F<Foo.txt>,
1702the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1703
1704By default, F<allkeys.txt> (as the filename of DUCET) is used.
1705If you will prepare your own table file, any name other than F<allkeys.txt>
1706may be better to avoid namespace conflict.
1707
1708B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1709module, and it may save time at the run time.
1710Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
1711or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1712C<rewrite> will prevent this module from using the compiled DUCET.
1713
1714If C<undef> is passed explicitly as the value for this key,
1715no file is read (but you can define collation elements via C<entry>).
1716
1717A typical way to define a collation element table
1718without any file of table:
1719
1720   $onlyABC = Unicode::Collate->new(
1721       table => undef,
1722       entry => << 'ENTRIES',
17230061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
17240041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
17250062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
17260042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
17270063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
17280043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1729ENTRIES
1730    );
1731
1732If C<ignoreName> or C<undefName> is used, character names should be
1733specified as a comment (following C<#>) on each line.
1734
1735=item undefChar
1736
1737=item undefName
1738
1739-- see 6.3.3 Reducing the Repertoire, UTS #10.
1740
1741Undefines the collation element as if it were unassigned in the C<table>.
1742This reduces the size of the table.
1743If an unassigned character appears in the string to be collated,
1744the sort key is made from its codepoint
1745as a single-character collation element,
1746as it is greater than any other assigned collation elements
1747(in the codepoint order among the unassigned characters).
1748But, it'd be better to ignore characters
1749unfamiliar to you and maybe never used.
1750
1751Through C<undefChar>, any character matching C<qr/$undefChar/>
1752will be undefined. Through C<undefName>, any character whose name
1753(given in the C<table> file as a comment) matches C<qr/$undefName/>
1754will be undefined.
1755
1756ex. Collation weights for beyond-BMP characters are not stored in object:
1757
1758    undefChar => qr/[^\0-\x{fffd}]/,
1759
1760=item upper_before_lower
1761
1762-- see 6.6 Case Comparisons, UTS #10.
1763
1764By default, lowercase is before uppercase.
1765If the parameter is made true, this is reversed.
1766
1767B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1768distinctions must occur in level 3, and their weights at level 3 must be
1769same as those mentioned in 7.3.1, UTS #10.
1770If you define your collation elements which differs from this requirement,
1771this parameter doesn't work validly.
1772
1773=item variable
1774
1775-- see 3.6 Variable Weighting, UTS #10.
1776
1777This key allows for variable weighting of variable collation elements,
1778which are marked with an ASTERISK in the table
1779(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1780
1781   variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1782
1783These names are case-insensitive.
1784By default (if specification is omitted), 'shifted' is adopted.
1785
1786   'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1787                    considered at the 4th level.
1788
1789   'Non-Ignorable'  Variable elements are not reset to ignorable.
1790
1791   'Shifted'        Variable elements are made ignorable at levels 1 through 3
1792                    their level 4 weight is replaced by the old level 1 weight.
1793                    Level 4 weight for Non-Variable elements is 0xFFFF.
1794
1795   'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1796                    are trimmed.
1797
1798=back
1799
1800=head2 Methods for Collation
1801
1802=over 4
1803
1804=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1805
1806Sorts a list of strings.
1807
1808=item C<$result = $Collator-E<gt>cmp($a, $b)>
1809
1810Returns 1 (when C<$a> is greater than C<$b>)
1811or 0 (when C<$a> is equal to C<$b>)
1812or -1 (when C<$a> is less than C<$b>).
1813
1814=item C<$result = $Collator-E<gt>eq($a, $b)>
1815
1816=item C<$result = $Collator-E<gt>ne($a, $b)>
1817
1818=item C<$result = $Collator-E<gt>lt($a, $b)>
1819
1820=item C<$result = $Collator-E<gt>le($a, $b)>
1821
1822=item C<$result = $Collator-E<gt>gt($a, $b)>
1823
1824=item C<$result = $Collator-E<gt>ge($a, $b)>
1825
1826They works like the same name operators as theirs.
1827
1828   eq : whether $a is equal to $b.
1829   ne : whether $a is not equal to $b.
1830   lt : whether $a is less than $b.
1831   le : whether $a is less than $b or equal to $b.
1832   gt : whether $a is greater than $b.
1833   ge : whether $a is greater than $b or equal to $b.
1834
1835=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1836
1837-- see 4.3 Form Sort Key, UTS #10.
1838
1839Returns a sort key.
1840
1841You compare the sort keys using a binary comparison
1842and get the result of the comparison of the strings using UCA.
1843
1844   $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1845
1846      is equivalent to
1847
1848   $Collator->cmp($a, $b)
1849
1850=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1851
1852Converts a sorting key into its representation form.
1853If C<UCA_Version> is 8, the output is slightly different.
1854
1855   use Unicode::Collate;
1856   my $c = Unicode::Collate->new();
1857   print $c->viewSortKey("Perl"),"\n";
1858
1859   # output:
1860   # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1861   #  Level 1               Level 2               Level 3               Level 4
1862
1863=back
1864
1865=head2 Methods for Searching
1866
1867The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1868like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1869but they are not aware of any pattern, but only a literal substring.
1870
1871B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1872for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1873C<subst>, C<gsubst>) is croaked, as the position and the length might
1874differ from those on the specified string.
1875
1876C<rearrange> and C<hangul_terminator> parameters are neglected.
1877C<katakana_before_hiragana> and C<upper_before_lower> don't affect
1878matching and searching, as it doesn't matter whether greater or less.
1879
1880=over 4
1881
1882=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1883
1884=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1885
1886If C<$substring> matches a part of C<$string>, returns
1887the position of the first occurrence of the matching part in scalar context;
1888in list context, returns a two-element list of
1889the position and the length of the matching part.
1890
1891If C<$substring> does not match any part of C<$string>,
1892returns C<-1> in scalar context and
1893an empty list in list context.
1894
1895e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
1896you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
1897
1898  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1899                                     # (normalization => undef) is REQUIRED.
1900  my $match;
1901  if (my($pos,$len) = $Collator->index($str, $sub)) {
1902      $match = substr($str, $pos, $len);
1903  }
1904
1905and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
1906is primary equal to C<"M>E<uuml>C<SS">.
1907
1908=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1909
1910=item C<($match)   = $Collator-E<gt>match($string, $substring)>
1911
1912If C<$substring> matches a part of C<$string>, in scalar context, returns
1913B<a reference to> the first occurrence of the matching part
1914(C<$match_ref> is always true if matches,
1915since every reference is B<true>);
1916in list context, returns the first occurrence of the matching part.
1917
1918If C<$substring> does not match any part of C<$string>,
1919returns C<undef> in scalar context and
1920an empty list in list context.
1921
1922e.g.
1923
1924    if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1925	print "matches [$$match_ref].\n";
1926    } else {
1927	print "doesn't match.\n";
1928    }
1929
1930     or
1931
1932    if (($match) = $Collator->match($str, $sub)) { # list context
1933	print "matches [$match].\n";
1934    } else {
1935	print "doesn't match.\n";
1936    }
1937
1938=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1939
1940If C<$substring> matches a part of C<$string>, returns
1941all the matching parts (or matching count in scalar context).
1942
1943If C<$substring> does not match any part of C<$string>,
1944returns an empty list.
1945
1946=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1947
1948If C<$substring> matches a part of C<$string>,
1949the first occurrence of the matching part is replaced by C<$replacement>
1950(C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
1951
1952C<$replacement> can be a C<CODEREF>,
1953taking the matching part as an argument,
1954and returning a string to replace the matching part
1955(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1956
1957=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1958
1959If C<$substring> matches a part of C<$string>,
1960all the occurrences of the matching part are replaced by C<$replacement>
1961(C<$string> is modified) and C<$count> is returned.
1962
1963C<$replacement> can be a C<CODEREF>,
1964taking the matching part as an argument,
1965and returning a string to replace the matching part
1966(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1967
1968e.g.
1969
1970  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1971                                     # (normalization => undef) is REQUIRED.
1972  my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
1973  $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1974
1975  # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
1976  # i.e., all the camels are made bold-faced.
1977
1978   Examples: levels and ignore_level2 - what does camel match?
1979  ---------------------------------------------------------------------------
1980   level  ignore_level2  |  camel  Camel  came\x{301}l  c-a-m-e-l  cam\0e\0l
1981  -----------------------|---------------------------------------------------
1982     1        false      |   yes    yes      yes          yes        yes
1983     2        false      |   yes    yes      no           yes        yes
1984     3        false      |   yes    no       no           yes        yes
1985     4        false      |   yes    no       no           no         yes
1986  -----------------------|---------------------------------------------------
1987     1        true       |   yes    yes      yes          yes        yes
1988     2        true       |   yes    yes      yes          yes        yes
1989     3        true       |   yes    no       yes          yes        yes
1990     4        true       |   yes    no       yes          no         yes
1991  ---------------------------------------------------------------------------
1992   note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1993         at any level.
1994
1995=back
1996
1997=head2 Other Methods
1998
1999=over 4
2000
2001=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
2002
2003=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
2004
2005Changes the value of specified keys and returns the changed part.
2006
2007    $Collator = Unicode::Collate->new(level => 4);
2008
2009    $Collator->eq("perl", "PERL"); # false
2010
2011    %old = $Collator->change(level => 2); # returns (level => 4).
2012
2013    $Collator->eq("perl", "PERL"); # true
2014
2015    $Collator->change(%old); # returns (level => 2).
2016
2017    $Collator->eq("perl", "PERL"); # false
2018
2019Not all C<(key,value)>s are allowed to be changed.
2020See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
2021
2022In the scalar context, returns the modified collator
2023(but it is B<not> a clone from the original).
2024
2025    $Collator->change(level => 2)->eq("perl", "PERL"); # true
2026
2027    $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
2028
2029    $Collator->change(level => 4)->eq("perl", "PERL"); # false
2030
2031=item C<$version = $Collator-E<gt>version()>
2032
2033Returns the version number (a string) of the Unicode Standard
2034which the C<table> file used by the collator object is based on.
2035If the table does not include a version line (starting with C<@version>),
2036returns C<"unknown">.
2037
2038=item C<UCA_Version()>
2039
2040Returns the revision number of UTS #10 this module consults,
2041that should correspond with the DUCET incorporated.
2042
2043=item C<Base_Unicode_Version()>
2044
2045Returns the version number of UTS #10 this module consults,
2046that should correspond with the DUCET incorporated.
2047
2048=back
2049
2050=head1 EXPORT
2051
2052No method will be exported.
2053
2054=head1 INSTALL
2055
2056Though this module can be used without any C<table> file,
2057to use this module easily, it is recommended to install a table file
2058in the UCA format, by copying it under the directory
2059<a place in @INC>/Unicode/Collate.
2060
2061The most preferable one is "The Default Unicode Collation Element Table"
2062(aka DUCET), available from the Unicode Consortium's website:
2063
2064   http://www.unicode.org/Public/UCA/
2065
2066   http://www.unicode.org/Public/UCA/latest/allkeys.txt
2067   (latest version)
2068
2069If DUCET is not installed, it is recommended to copy the file
2070from http://www.unicode.org/Public/UCA/latest/allkeys.txt
2071to <a place in @INC>/Unicode/Collate/allkeys.txt
2072manually.
2073
2074=head1 CAVEATS
2075
2076=over 4
2077
2078=item Normalization
2079
2080Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2081module (see L<Unicode::Normalize>).
2082
2083If you need not it (say, in the case when you need not
2084handle any combining characters),
2085assign C<(normalization =E<gt> undef)> explicitly.
2086
2087-- see 6.5 Avoiding Normalization, UTS #10.
2088
2089=item Conformance Test
2090
2091The Conformance Test for the UCA is available
2092under L<http://www.unicode.org/Public/UCA/>.
2093
2094For F<CollationTest_SHIFTED.txt>,
2095a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2096for F<CollationTest_NON_IGNORABLE.txt>, a collator via
2097C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
2098
2099If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
2100C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
2101C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
2102C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2103
2104B<Unicode::Normalize is required to try The Conformance Test.>
2105
2106B<EBCDIC-SUPPORT IS EXPERIMENTAL.>
2107
2108=back
2109
2110=head1 AUTHOR, COPYRIGHT AND LICENSE
2111
2112The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
2113<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2021,
2114SADAHIRO Tomoyuki. Japan. All rights reserved.
2115
2116This module is free software; you can redistribute it and/or
2117modify it under the same terms as Perl itself.
2118
2119The file Unicode/Collate/allkeys.txt was copied verbatim
2120from L<http://www.unicode.org/Public/UCA/13.0.0/allkeys.txt>.
2121For this file, Copyright (c) 2020 Unicode, Inc.; distributed
2122under the Terms of Use in L<http://www.unicode.org/terms_of_use.html>
2123
2124=head1 SEE ALSO
2125
2126=over 4
2127
2128=item Unicode Collation Algorithm - UTS #10
2129
2130L<http://www.unicode.org/reports/tr10/>
2131
2132=item The Default Unicode Collation Element Table (DUCET)
2133
2134L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
2135
2136=item The conformance test for the UCA
2137
2138L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
2139
2140L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
2141
2142=item Hangul Syllable Type
2143
2144L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
2145
2146=item Unicode Normalization Forms - UAX #15
2147
2148L<http://www.unicode.org/reports/tr15/>
2149
2150=item Unicode Locale Data Markup Language (LDML) - UTS #35
2151
2152L<http://www.unicode.org/reports/tr35/>
2153
2154=back
2155
2156=cut
2157