1package Unicode::Collate;
2
3use 5.006;
4use strict;
5use warnings;
6use Carp;
7use File::Spec;
8
9no warnings 'utf8';
10
11our $VERSION = '1.31';
12our $PACKAGE = __PACKAGE__;
13
14my @Path = qw(Unicode Collate);
15my $KeyFile = 'allkeys.txt';
16
17# Perl's boolean
18use constant TRUE  => 1;
19use constant FALSE => "";
20use constant NOMATCHPOS => -1;
21
22# A coderef to get combining class imported from Unicode::Normalize
23# (i.e. \&Unicode::Normalize::getCombinClass).
24# This is also used as a HAS_UNICODE_NORMALIZE flag.
25my $CVgetCombinClass;
26
27# Supported Levels
28use constant MinLevel => 1;
29use constant MaxLevel => 4;
30
31# Minimum weights at level 2 and 3, respectively
32use constant Min2Wt => 0x20;
33use constant Min3Wt => 0x02;
34
35# Shifted weight at 4th level
36use constant Shift4Wt => 0xFFFF;
37
38# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
39use constant VCE_TEMPLATE => 'Cn4';
40
41# A sort key: 16-bit weights
42use constant KEY_TEMPLATE => 'n*';
43
44# The tie-breaking: 32-bit weights
45use constant TIE_TEMPLATE => 'N*';
46
47# Level separator in a sort key:
48# i.e. pack(KEY_TEMPLATE, 0)
49use constant LEVEL_SEP => "\0\0";
50
51# As Unicode code point separator for hash keys.
52# A joined code point string (denoted by JCPS below)
53# like "65;768" is used for internal processing
54# instead of Perl's Unicode string like "\x41\x{300}",
55# as the native code point is different from the Unicode code point
56# on EBCDIC platform.
57# This character must not be included in any stringified
58# representation of an integer.
59use constant CODE_SEP => ';';
60	# NOTE: in regex /;/ is used for $jcps!
61
62# boolean values of variable weights
63use constant NON_VAR => 0; # Non-Variable character
64use constant VAR     => 1; # Variable character
65
66# specific code points
67use constant Hangul_SBase  => 0xAC00; ### PP only
68use constant Hangul_SIni   => 0xAC00;
69use constant Hangul_SFin   => 0xD7A3;
70### begin PP only ###
71use constant Hangul_NCount => 588;
72use constant Hangul_TCount => 28;
73use constant Hangul_LBase  => 0x1100;
74use constant Hangul_LIni   => 0x1100;
75use constant Hangul_LFin   => 0x1159;
76use constant Hangul_LFill  => 0x115F;
77use constant Hangul_LEnd   => 0x115F; # Unicode 5.2
78use constant Hangul_VBase  => 0x1161;
79use constant Hangul_VIni   => 0x1160; # from Vowel Filler
80use constant Hangul_VFin   => 0x11A2;
81use constant Hangul_VEnd   => 0x11A7; # Unicode 5.2
82use constant Hangul_TBase  => 0x11A7; # from "no-final" codepoint
83use constant Hangul_TIni   => 0x11A8;
84use constant Hangul_TFin   => 0x11F9;
85use constant Hangul_TEnd   => 0x11FF; # Unicode 5.2
86use constant HangulL2Ini   => 0xA960; # Unicode 5.2
87use constant HangulL2Fin   => 0xA97C; # Unicode 5.2
88use constant HangulV2Ini   => 0xD7B0; # Unicode 5.2
89use constant HangulV2Fin   => 0xD7C6; # Unicode 5.2
90use constant HangulT2Ini   => 0xD7CB; # Unicode 5.2
91use constant HangulT2Fin   => 0xD7FB; # Unicode 5.2
92
93use constant CJK_UidIni   =>  0x4E00;
94use constant CJK_UidFin   =>  0x9FA5;
95use constant CJK_UidF41   =>  0x9FBB; # Unicode 4.1
96use constant CJK_UidF51   =>  0x9FC3; # Unicode 5.1
97use constant CJK_UidF52   =>  0x9FCB; # Unicode 5.2
98use constant CJK_UidF61   =>  0x9FCC; # Unicode 6.1
99use constant CJK_UidF80   =>  0x9FD5; # Unicode 8.0
100use constant CJK_UidF100  =>  0x9FEA; # Unicode 10.0
101use constant CJK_UidF110  =>  0x9FEF; # Unicode 11.0
102use constant CJK_UidF130  =>  0x9FFC; # Unicode 13.0
103
104use constant CJK_ExtAIni  =>  0x3400; # Unicode 3.0
105use constant CJK_ExtAFin  =>  0x4DB5; # Unicode 3.0
106use constant CJK_ExtA130  =>  0x4DBF; # Unicode 13.0
107use constant CJK_ExtBIni  => 0x20000; # Unicode 3.1
108use constant CJK_ExtBFin  => 0x2A6D6; # Unicode 3.1
109use constant CJK_ExtB130  => 0x2A6DD; # Unicode 13.0
110use constant CJK_ExtCIni  => 0x2A700; # Unicode 5.2
111use constant CJK_ExtCFin  => 0x2B734; # Unicode 5.2
112use constant CJK_ExtDIni  => 0x2B740; # Unicode 6.0
113use constant CJK_ExtDFin  => 0x2B81D; # Unicode 6.0
114use constant CJK_ExtEIni  => 0x2B820; # Unicode 8.0
115use constant CJK_ExtEFin  => 0x2CEA1; # Unicode 8.0
116use constant CJK_ExtFIni  => 0x2CEB0; # Unicode 10.0
117use constant CJK_ExtFFin  => 0x2EBE0; # Unicode 10.0
118use constant CJK_ExtGIni  => 0x30000; # Unicode 13.0
119use constant CJK_ExtGFin  => 0x3134A; # Unicode 13.0
120
121my %CompatUI = map +($_ => 1), (
122    0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F,
123    0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29,
124);
125
126use constant TangIdeoIni  => 0x17000; # Unicode 9.0
127use constant TangIdeoFin  => 0x187EC; # Unicode 9.0
128use constant TangIdeo110  => 0x187F1; # Unicode 11.0
129use constant TangIdeo120  => 0x187F7; # Unicode 12.0
130use constant TangCompIni  => 0x18800; # Unicode 9.0
131use constant TangCompFin  => 0x18AF2; # Unicode 9.0
132use constant TangComp130  => 0x18AFF; # Unicode 13.0
133use constant TangSuppIni  => 0x18D00; # Unicode 13.0
134use constant TangSuppFin  => 0x18D08; # Unicode 13.0
135use constant NushuIni     => 0x1B170; # Unicode 10.0
136use constant NushuFin     => 0x1B2FB; # Unicode 10.0
137use constant KhitanIni    => 0x18B00; # Unicode 13.0
138use constant KhitanFin    => 0x18CD5; # Unicode 13.0
139
140### end PP only ###
141
142# Logical_Order_Exception in PropList.txt
143my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
144
145# for highestFFFF and minimalFFFE
146my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
147my $minimalVCE = pack(VCE_TEMPLATE, 0,      1, 0x20, 0x5, 0xFFFE);
148
149sub UCA_Version { '43' }
150
151sub Base_Unicode_Version { '13.0.0' }
152
153######
154
155my $native_to_unicode = ($::IS_ASCII || $] < 5.008)
156	? sub { return shift }
157	: sub { utf8::native_to_unicode(shift) };
158
159my $unicode_to_native = ($::IS_ASCII || $] < 5.008)
160	? sub { return shift }
161	: sub { utf8::unicode_to_native(shift) };
162
163# pack_U() should get Unicode code points.
164sub pack_U {
165    return pack('U*', map $unicode_to_native->($_), @_);
166}
167
168# unpack_U() should return Unicode code points.
169sub unpack_U {
170    return map $native_to_unicode->($_), unpack('U*', shift(@_).pack('U*'));
171}
172# for older perl version, pack('U*') generates empty string with utf8 flag.
173
174######
175
176my (%VariableOK);
177@VariableOK{ qw/
178    blanked  non-ignorable  shifted  shift-trimmed
179  / } = (); # keys lowercased
180
181our @ChangeOK = qw/
182    alternate backwards level normalization rearrange
183    katakana_before_hiragana upper_before_lower ignore_level2
184    overrideCJK overrideHangul overrideOut preprocess UCA_Version
185    hangul_terminator variable identical highestFFFF minimalFFFE
186    long_contraction
187  /;
188
189our @ChangeNG = qw/
190    entry mapping table maxlength contraction
191    ignoreChar ignoreName undefChar undefName rewrite
192    versionTable alternateTable backwardsTable forwardsTable
193    rearrangeTable variableTable
194    derivCode normCode rearrangeHash backwardsFlag
195    suppress suppressHash
196  /;
197# The hash key 'ignored' was deleted at v 0.21.
198# The hash key 'isShift' was deleted at v 0.23.
199# The hash key 'combining' was deleted at v 0.24.
200# The hash key 'entries' was deleted at v 0.30.
201# The hash key 'L3_ignorable' was deleted at v 0.40.
202
203sub version {
204    my $self = shift;
205    return $self->{versionTable} || 'unknown';
206}
207
208my (%ChangeOK, %ChangeNG);
209@ChangeOK{ @ChangeOK } = ();
210@ChangeNG{ @ChangeNG } = ();
211
212sub change {
213    my $self = shift;
214    my %hash = @_;
215    my %old;
216    if (exists $hash{alternate}) {
217	if (exists $hash{variable}) {
218	    delete $hash{alternate};
219	} else {
220	    $hash{variable} = $hash{alternate};
221	}
222    }
223    foreach my $k (keys %hash) {
224	if (exists $ChangeOK{$k}) {
225	    $old{$k} = $self->{$k};
226	    $self->{$k} = $hash{$k};
227	} elsif (exists $ChangeNG{$k}) {
228	    croak "change of $k via change() is not allowed!";
229	}
230	# else => ignored
231    }
232    $self->checkCollator();
233    return wantarray ? %old : $self;
234}
235
236sub _checkLevel {
237    my $level = shift;
238    my $key   = shift; # 'level' or 'backwards'
239    MinLevel <= $level or croak sprintf
240	"Illegal level %d (in value for key '%s') lower than %d.",
241	    $level, $key, MinLevel;
242    $level <= MaxLevel or croak sprintf
243	"Unsupported level %d (in value for key '%s') higher than %d.",
244	    $level, $key, MaxLevel;
245}
246
247my %DerivCode = (
248    8 => \&_derivCE_8,
249    9 => \&_derivCE_9,
250   11 => \&_derivCE_9, # 11 == 9
251   14 => \&_derivCE_14,
252   16 => \&_derivCE_14, # 16 == 14
253   18 => \&_derivCE_18,
254   20 => \&_derivCE_20,
255   22 => \&_derivCE_22,
256   24 => \&_derivCE_24,
257   26 => \&_derivCE_24, # 26 == 24
258   28 => \&_derivCE_24, # 28 == 24
259   30 => \&_derivCE_24, # 30 == 24
260   32 => \&_derivCE_32,
261   34 => \&_derivCE_34,
262   36 => \&_derivCE_36,
263   38 => \&_derivCE_38,
264   40 => \&_derivCE_40,
265   41 => \&_derivCE_40, # 41 == 40
266   43 => \&_derivCE_43,
267);
268
269sub checkCollator {
270    my $self = shift;
271    _checkLevel($self->{level}, 'level');
272
273    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
274	or croak "Illegal UCA version (passed $self->{UCA_Version}).";
275
276    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
277				$self->{alternateTable} || 'shifted';
278    $self->{variable} = $self->{alternate} = lc($self->{variable});
279    exists $VariableOK{ $self->{variable} }
280	or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
281
282    if (! defined $self->{backwards}) {
283	$self->{backwardsFlag} = 0;
284    } elsif (! ref $self->{backwards}) {
285	_checkLevel($self->{backwards}, 'backwards');
286	$self->{backwardsFlag} = 1 << $self->{backwards};
287    } else {
288	my %level;
289	$self->{backwardsFlag} = 0;
290	for my $b (@{ $self->{backwards} }) {
291	    _checkLevel($b, 'backwards');
292	    $level{$b} = 1;
293	}
294	for my $v (sort keys %level) {
295	    $self->{backwardsFlag} += 1 << $v;
296	}
297    }
298
299    defined $self->{rearrange} or $self->{rearrange} = [];
300    ref $self->{rearrange}
301	or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
302
303    # keys of $self->{rearrangeHash} are $self->{rearrange}.
304    $self->{rearrangeHash} = undef;
305
306    if (@{ $self->{rearrange} }) {
307	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
308    }
309
310    $self->{normCode} = undef;
311
312    if (defined $self->{normalization}) {
313	eval { require Unicode::Normalize };
314	$@ and croak "Unicode::Normalize is required to normalize strings";
315
316	$CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
317
318	if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
319	    $self->{normCode} = \&Unicode::Normalize::NFD;
320	}
321	elsif ($self->{normalization} ne 'prenormalized') {
322	    my $norm = $self->{normalization};
323	    $self->{normCode} = sub {
324		Unicode::Normalize::normalize($norm, shift);
325	    };
326	    eval { $self->{normCode}->("") }; # try
327	    $@ and croak "$PACKAGE unknown normalization form name: $norm";
328	}
329    }
330    return;
331}
332
333sub new
334{
335    my $class = shift;
336    my $self = bless { @_ }, $class;
337
338    # keys of $self->{suppressHash} are $self->{suppress}.
339    if ($self->{suppress} && @{ $self->{suppress} }) {
340	@{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
341    } # before read_table()
342
343    # If undef is passed explicitly, no file is read.
344    $self->{table} = $KeyFile if ! exists $self->{table};
345    $self->read_table() if defined $self->{table};
346
347    if ($self->{entry}) {
348	while ($self->{entry} =~ /([^\n]+)/g) {
349	    $self->parseEntry($1, TRUE);
350	}
351    }
352
353    # only in new(), not in change()
354    $self->{level} ||= MaxLevel;
355    $self->{UCA_Version} ||= UCA_Version();
356
357    $self->{overrideHangul} = FALSE
358	if ! exists $self->{overrideHangul};
359    $self->{overrideCJK} = FALSE
360	if ! exists $self->{overrideCJK};
361    $self->{normalization} = 'NFD'
362	if ! exists $self->{normalization};
363    $self->{rearrange} = $self->{rearrangeTable} ||
364	($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
365	if ! exists $self->{rearrange};
366    $self->{backwards} = $self->{backwardsTable}
367	if ! exists $self->{backwards};
368    exists $self->{long_contraction} or $self->{long_contraction}
369	= 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
370
371    # checkCollator() will be called in change()
372    $self->checkCollator();
373
374    return $self;
375}
376
377sub parseAtmark {
378    my $self = shift;
379    my $line = shift; # after s/^\s*\@//
380
381    if ($line =~ /^version\s*(\S*)/) {
382	$self->{versionTable} ||= $1;
383    }
384    elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
385	$self->{variableTable} ||= $1;
386    }
387    elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
388	$self->{alternateTable} ||= $1;
389    }
390    elsif ($line =~ /^backwards\s+(\S*)/) {
391	push @{ $self->{backwardsTable} }, $1;
392    }
393    elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
394	push @{ $self->{forwardsTable} }, $1;
395    }
396    elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
397	push @{ $self->{rearrangeTable} }, _getHexArray($1);
398    }
399}
400
401sub read_table {
402    my $self = shift;
403
404    my($f, $fh);
405    foreach my $d (@INC) {
406	$f = File::Spec->catfile($d, @Path, $self->{table});
407	last if open($fh, $f);
408	$f = undef;
409    }
410    if (!defined $f) {
411	$f = File::Spec->catfile(@Path, $self->{table});
412	croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
413    }
414
415    while (my $line = <$fh>) {
416	next if $line =~ /^\s*#/;
417
418	if ($line =~ s/^\s*\@//) {
419	    $self->parseAtmark($line);
420	} else {
421	    $self->parseEntry($line);
422	}
423    }
424    close $fh;
425}
426
427
428##
429## get $line, parse it, and write an entry in $self
430##
431sub parseEntry
432{
433    my $self = shift;
434    my $line = shift;
435    my $tailoring = shift;
436    my($name, $entry, @uv, @key);
437
438    if (defined $self->{rewrite}) {
439	$line = $self->{rewrite}->($line);
440    }
441
442    return if $line !~ /^\s*[0-9A-Fa-f]/;
443
444    # removes comment and gets name
445    $name = $1
446	if $line =~ s/[#%]\s*(.*)//;
447    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
448
449    # gets element
450    my($e, $k) = split /;/, $line;
451    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
452	if ! $k;
453
454    @uv = _getHexArray($e);
455    return if !@uv;
456    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
457		  exists $self->{suppressHash}{$uv[0]};
458    $entry = join(CODE_SEP, @uv); # in JCPS
459
460    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
461	my $ele = pack_U(@uv);
462
463	# regarded as if it were not stored in the table
464	return
465	    if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
466
467	# replaced as completely ignorable
468	$k = '[.0000.0000.0000.0000]'
469	    if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
470    }
471
472    # replaced as completely ignorable
473    $k = '[.0000.0000.0000.0000]'
474	if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
475
476    my $is_L3_ignorable = TRUE;
477
478    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
479	my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
480	my @wt = _getHexArray($arr);
481	push @key, pack(VCE_TEMPLATE, $var, @wt);
482	$is_L3_ignorable = FALSE
483	    if $wt[0] || $wt[1] || $wt[2];
484	# Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
485	# is completely ignorable.
486	# For expansion, an entry $is_L3_ignorable
487	# if and only if "all" CEs are [.0000.0000.0000].
488    }
489
490    # mapping: be an array ref or not exists (any false value is disallowed)
491    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
492
493    # maxlength: be more than 1 or not exists (any false value is disallowed)
494    if (@uv > 1) {
495	if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
496	    $self->{maxlength}{$uv[0]} = @uv;
497	}
498    }
499
500    # contraction: be 1 or not exists (any false value is disallowed)
501    while (@uv > 2) {
502	pop @uv;
503	my $fake_entry = join(CODE_SEP, @uv); # in JCPS
504	$self->{contraction}{$fake_entry} = 1;
505    }
506}
507
508### begin PP only ###
509#
510# modify CE
511#
512sub varCE
513{
514    my $self = shift;
515    my $vbl = $self->{variable};
516    my $vce = shift;
517    my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
518
519    # primary weight == 0 && secondary weight != 0
520    if ($self->{ignore_level2} && $wt[0] == 0 && $wt[1] != 0) {
521	$wt[1] = $wt[2] = 0;
522    }
523
524    if ($vbl eq 'non-ignorable') {
525	return pack(VCE_TEMPLATE, $var, @wt);
526    } elsif ($var) {
527	return pack(VCE_TEMPLATE, $var, 0, 0, 0,
528		$vbl eq 'blanked' ? $wt[3] : $wt[0]);
529    } elsif ($vbl eq 'blanked') {
530	return pack(VCE_TEMPLATE, $var, @wt);
531    } elsif ($self->{UCA_Version} >= 36 &&
532	     $vbl eq 'shifted' && !$wt[1] && !$wt[2]) {
533	# completely ignorable or the second derived CE
534	return pack(VCE_TEMPLATE, $var, $wt[0], 0, 0, 0);
535    } else {
536	return pack(VCE_TEMPLATE, $var, @wt[0..2],
537	    $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ?
538		$wt[0] eq 1 ? $wt[0] : # XXX: CollationAuxiliary-6.2.0
539		Shift4Wt : 0);
540    }
541}
542
543sub visualizeSortKey
544{
545    my $self = shift;
546    my $view = join ' ', map sprintf('%04X', $_), unpack(KEY_TEMPLATE, shift);
547
548    if ($self->{UCA_Version} <= 8) {
549	$view =~ s/ ?0000 ?/|/ for MinLevel..MaxLevel;
550    } else {
551	$view =~ s/\b0000\b/|/ for MinLevel..MaxLevel;
552    }
553    return "[$view]";
554}
555### end PP only ###
556
557
558sub viewSortKey
559{
560    my $self = shift;
561    my $str  = shift;
562    $self->visualizeSortKey($self->getSortKey($str));
563}
564
565
566sub process
567{
568    my $self = shift;
569    my $str  = shift;
570    my $prep = $self->{preprocess};
571    my $norm = $self->{normCode};
572
573    $str = &$prep($str) if ref $prep;
574    $str = &$norm($str) if ref $norm;
575    return $str;
576}
577
578##
579## arrayref of JCPS   = splitEnt(string to be collated)
580## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
581##
582sub splitEnt
583{
584    my $self = shift;
585    my $str  = shift;
586    my $wLen = shift; # with Length
587
588    my $map  = $self->{mapping};
589    my $max  = $self->{maxlength};
590    my $reH  = $self->{rearrangeHash};
591    my $vers = $self->{UCA_Version};
592    my $ver9 = $vers >= 9 && $vers <= 11;
593    my $long = $self->{long_contraction};
594
595    my @buf;
596
597    # get array of Unicode code point of string.
598    my @src = unpack_U($str);
599
600    # rearrangement:
601    # Character positions are not kept if rearranged,
602    # then neglected if $wLen is true.
603    if ($reH && ! $wLen) {
604	for (my $i = 0; $i < @src; $i++) {
605	    if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
606		($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
607		$i++;
608	    }
609	}
610    }
611
612    # remove a code point marked as a completely ignorable.
613    for (my $i = 0; $i < @src; $i++) {
614	if ($vers <= 20 && _isIllegal($src[$i])) {
615	    $src[$i] = undef;
616	} elsif ($ver9) {
617	    $src[$i] = undef if exists $map->{ $src[$i] }
618			  && @{ $map->{ $src[$i] } } == 0;
619	}
620    }
621
622    for (my $i = 0; $i < @src; $i++) {
623	my $jcps = $src[$i];
624
625	# skip removed code point
626	if (! defined $jcps) {
627	    if ($wLen && @buf) {
628		$buf[-1][2] = $i + 1;
629	    }
630	    next;
631	}
632
633	my $i_orig = $i;
634
635	# find contraction
636	if (exists $max->{$jcps}) {
637	    my $temp_jcps = $jcps;
638	    my $jcpsLen = 1;
639	    my $maxLen = $max->{$jcps};
640
641	    for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
642		next if ! defined $src[$p];
643		$temp_jcps .= CODE_SEP . $src[$p];
644		$jcpsLen++;
645		if (exists $map->{$temp_jcps}) {
646		    $jcps = $temp_jcps;
647		    $i = $p;
648		}
649	    }
650
651	# discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
652	# This process requires Unicode::Normalize.
653	# If "normalization" is undef, here should be skipped *always*
654	# (in spite of bool value of $CVgetCombinClass),
655	# since canonical ordering cannot be expected.
656	# Blocked combining character should not be contracted.
657
658	    # $self->{normCode} is false in the case of "prenormalized".
659	    if ($self->{normalization}) {
660		my $cont = $self->{contraction};
661		my $preCC = 0;
662		my $preCC_uc = 0;
663		my $jcps_uc = $jcps;
664		my(@out, @out_uc);
665
666		for (my $p = $i + 1; $p < @src; $p++) {
667		    next if ! defined $src[$p];
668		    my $curCC = $CVgetCombinClass->($src[$p]);
669		    last unless $curCC;
670		    my $tail = CODE_SEP . $src[$p];
671
672		    if ($preCC != $curCC && exists $map->{$jcps.$tail}) {
673			$jcps .= $tail;
674			push @out, $p;
675		    } else {
676			$preCC = $curCC;
677		    }
678
679		    next if !$long;
680
681		    if ($preCC_uc != $curCC &&
682			    (exists $map->{$jcps_uc.$tail} ||
683			    exists $cont->{$jcps_uc.$tail})) {
684			$jcps_uc .= $tail;
685			push @out_uc, $p;
686		    } else {
687			$preCC_uc = $curCC;
688		    }
689		}
690
691		if (@out_uc && exists $map->{$jcps_uc}) {
692		    $jcps = $jcps_uc;
693		    $src[$_] = undef for @out_uc;
694		} else {
695		    $src[$_] = undef for @out;
696		}
697	    }
698	}
699
700	# skip completely ignorable
701	if (exists $map->{$jcps} && @{ $map->{$jcps} } == 0) {
702	    if ($wLen && @buf) {
703		$buf[-1][2] = $i + 1;
704	    }
705	    next;
706	}
707
708	push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
709    }
710    return \@buf;
711}
712
713##
714## VCE = _pack_override(input, codepoint, derivCode)
715##
716sub _pack_override ($$$) {
717    my $r = shift;
718    my $u = shift;
719    my $der = shift;
720
721    if (ref $r) {
722	return pack(VCE_TEMPLATE, NON_VAR, @$r);
723    } elsif (defined $r) {
724	return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
725    } else {
726	$u = 0xFFFD if 0x10FFFF < $u;
727	return $der->($u);
728    }
729}
730
731##
732## list of VCE = getWt(JCPS)
733##
734sub getWt
735{
736    my $self = shift;
737    my $u    = shift;
738    my $map  = $self->{mapping};
739    my $der  = $self->{derivCode};
740    my $out  = $self->{overrideOut};
741
742    return if !defined $u;
743    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
744    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
745    $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;
746
747    my @ce;
748    if (exists $map->{$u}) {
749	@ce = @{ $map->{$u} }; # $u may be a contraction
750    } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
751	my $hang = $self->{overrideHangul};
752	if ($hang) {
753	    @ce = map _pack_override($_, $u, $der), $hang->($u);
754	} elsif (!defined $hang) {
755	    @ce = $der->($u);
756	} else {
757	    my $max  = $self->{maxlength};
758	    my @decH = _decompHangul($u);
759
760	    if (@decH == 2) {
761		my $contract = join(CODE_SEP, @decH);
762		@decH = ($contract) if exists $map->{$contract};
763	    } else { # must be <@decH == 3>
764		if (exists $max->{$decH[0]}) {
765		    my $contract = join(CODE_SEP, @decH);
766		    if (exists $map->{$contract}) {
767			@decH = ($contract);
768		    } else {
769			$contract = join(CODE_SEP, @decH[0,1]);
770			exists $map->{$contract} and @decH = ($contract, $decH[2]);
771		    }
772		    # even if V's ignorable, LT contraction is not supported.
773		    # If such a situation were required, NFD should be used.
774		}
775		if (@decH == 3 && exists $max->{$decH[1]}) {
776		    my $contract = join(CODE_SEP, @decH[1,2]);
777		    exists $map->{$contract} and @decH = ($decH[0], $contract);
778		}
779	    }
780
781	    @ce = map({
782		    exists $map->{$_} ? @{ $map->{$_} } : $der->($_);
783		} @decH);
784	}
785    } elsif ($out && 0x10FFFF < $u) {
786	@ce = map _pack_override($_, $u, $der), $out->($u);
787    } else {
788	my $cjk  = $self->{overrideCJK};
789	my $vers = $self->{UCA_Version};
790	if ($cjk && _isUIdeo($u, $vers)) {
791	    @ce = map _pack_override($_, $u, $der), $cjk->($u);
792	} elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
793	    @ce = _uideoCE_8($u);
794	} else {
795	    @ce = $der->($u);
796	}
797    }
798    return map $self->varCE($_), @ce;
799}
800
801
802##
803## string sortkey = getSortKey(string arg)
804##
805sub getSortKey
806{
807    my $self = shift;
808    my $orig = shift;
809    my $str  = $self->process($orig);
810    my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
811    my $vers = $self->{UCA_Version};
812    my $term = $self->{hangul_terminator};
813    my $lev  = $self->{level};
814    my $iden = $self->{identical};
815### begin PP only ##
816    my $v2i  = $vers >= 9 && $self->{variable} ne 'non-ignorable';
817### end PP only ##
818
819    my @buf; # weight arrays
820    if ($term) {
821	my $preHST = '';
822	my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
823	foreach my $jcps (@$rEnt) {
824	    # weird things like VL, TL-contraction are not considered!
825	    my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
826	    if ($preHST && !$curHST || # hangul before non-hangul
827		$preHST =~ /L\z/ && $curHST =~ /^T/ ||
828		$preHST =~ /V\z/ && $curHST =~ /^L/ ||
829		$preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
830		push @buf, $termCE;
831	    }
832	    $preHST = $curHST;
833	    push @buf, $self->getWt($jcps);
834	}
835	push @buf, $termCE if $preHST; # end at hangul
836    } else {
837	foreach my $jcps (@$rEnt) {
838	    push @buf, $self->getWt($jcps);
839	}
840    }
841
842### begin PP only ##
843    # make sort key
844    my @ret = ([],[],[],[]);
845    my $last_is_variable;
846
847    foreach my $vwt (@buf) {
848	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
849
850	# "Ignorable (L1, L2) after Variable" since track. v. 9
851	if ($v2i) {
852	    if ($var) {
853		$last_is_variable = TRUE;
854	    } elsif (!$wt[0]) { # ignorable
855		next if $last_is_variable;
856	    } else {
857		$last_is_variable = FALSE;
858	    }
859	}
860	foreach my $v (0..$lev-1) {
861	    0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
862	}
863    }
864
865    # modification of tertiary weights
866    if ($self->{upper_before_lower}) {
867	foreach my $w (@{ $ret[2] }) {
868	    if    (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
869	    elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
870	    elsif ($w == 0x1C)             { $w += 1 } # square upper
871	    elsif ($w == 0x1D)             { $w -= 1 } # square lower
872	}
873    }
874    if ($self->{katakana_before_hiragana}) {
875	foreach my $w (@{ $ret[2] }) {
876	    if    (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
877	    elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
878	}
879    }
880
881    if ($self->{backwardsFlag}) {
882	for (my $v = MinLevel; $v <= MaxLevel; $v++) {
883	    if ($self->{backwardsFlag} & (1 << $v)) {
884		@{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
885	    }
886	}
887    }
888
889    my $rkey = join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
890### end PP only ##
891
892    if ($iden || $vers >= 26 && $lev == MaxLevel) {
893	$rkey .= LEVEL_SEP;
894	$rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
895    }
896    return $rkey;
897}
898
899
900##
901## int compare = cmp(string a, string b)
902##
903sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
904sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
905sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
906sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
907sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
908sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
909sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
910
911##
912## list[strings] sorted = sort(list[strings] arg)
913##
914sub sort {
915    my $obj = shift;
916    return
917	map { $_->[1] }
918	    sort{ $a->[0] cmp $b->[0] }
919		map [ $obj->getSortKey($_), $_ ], @_;
920}
921
922### begin PP only ##
923sub _derivCE_43 {
924    my $u = shift;
925    my $tang = (TangIdeoIni <= $u && $u <= TangIdeo120 ||
926		TangCompIni <= $u && $u <= TangComp130 ||
927		TangSuppIni <= $u && $u <= TangSuppFin);
928    my $nush = (NushuIni  <= $u && $u <= NushuFin);
929    my $ktan = (KhitanIni <= $u && $u <= KhitanFin);
930
931    my $base = ($tang) ? 0xFB00 : # Tangut
932	       ($nush) ? 0xFB01 : # Nushu
933	       ($ktan) ? 0xFB02 : # Khitan
934	       (CJK_UidIni  <= $u && $u <= CJK_UidF130 || $CompatUI{$u})
935		? 0xFB40 : # CJK
936	       (CJK_ExtAIni <= $u && $u <= CJK_ExtA130 ||
937		CJK_ExtBIni <= $u && $u <= CJK_ExtB130 ||
938		CJK_ExtCIni <= $u && $u <= CJK_ExtCFin ||
939		CJK_ExtDIni <= $u && $u <= CJK_ExtDFin ||
940		CJK_ExtEIni <= $u && $u <= CJK_ExtEFin ||
941		CJK_ExtFIni <= $u && $u <= CJK_ExtFFin ||
942		CJK_ExtGIni <= $u && $u <= CJK_ExtGFin)
943		? 0xFB80  # CJK ext.
944		: 0xFBC0; # others
945    my $aaaa = $tang || $nush || $ktan ? $base : $base + ($u >> 15);
946    my $bbbb = ($tang ? ($u - TangIdeoIni) :
947		$nush ? ($u - NushuIni) :
948		$ktan ? ($u - KhitanIni) : ($u & 0x7FFF)) | 0x8000;
949    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
950	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
951}
952
953sub _derivCE_40 {
954    my $u = shift;
955    my $tang = (TangIdeoIni <= $u && $u <= TangIdeo120 ||
956		TangCompIni <= $u && $u <= TangCompFin);
957    my $nush = (NushuIni <= $u && $u <= NushuFin);
958    my $base = ($tang)
959		? 0xFB00 : # Tangut
960	       ($nush)
961		? 0xFB01 : # Nushu
962	       (CJK_UidIni  <= $u && $u <= CJK_UidF110 || $CompatUI{$u})
963		? 0xFB40 : # CJK
964	       (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
965		CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
966		CJK_ExtCIni <= $u && $u <= CJK_ExtCFin ||
967		CJK_ExtDIni <= $u && $u <= CJK_ExtDFin ||
968		CJK_ExtEIni <= $u && $u <= CJK_ExtEFin ||
969		CJK_ExtFIni <= $u && $u <= CJK_ExtFFin)
970		? 0xFB80  # CJK ext.
971		: 0xFBC0; # others
972    my $aaaa = $tang || $nush ? $base : $base + ($u >> 15);
973    my $bbbb = ($tang ? ($u - TangIdeoIni) :
974		$nush ? ($u - NushuIni) : ($u & 0x7FFF)) | 0x8000;
975    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
976	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
977}
978
979sub _derivCE_38 {
980    my $u = shift;
981    my $tang = (TangIdeoIni <= $u && $u <= TangIdeo110 ||
982		TangCompIni <= $u && $u <= TangCompFin);
983    my $nush = (NushuIni <= $u && $u <= NushuFin);
984    my $base = ($tang)
985		? 0xFB00 : # Tangut
986	       ($nush)
987		? 0xFB01 : # Nushu
988	       (CJK_UidIni  <= $u && $u <= CJK_UidF110 || $CompatUI{$u})
989		? 0xFB40 : # CJK
990	       (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
991		CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
992		CJK_ExtCIni <= $u && $u <= CJK_ExtCFin ||
993		CJK_ExtDIni <= $u && $u <= CJK_ExtDFin ||
994		CJK_ExtEIni <= $u && $u <= CJK_ExtEFin ||
995		CJK_ExtFIni <= $u && $u <= CJK_ExtFFin)
996		? 0xFB80  # CJK ext.
997		: 0xFBC0; # others
998    my $aaaa = $tang || $nush ? $base : $base + ($u >> 15);
999    my $bbbb = ($tang ? ($u - TangIdeoIni) :
1000		$nush ? ($u - NushuIni) : ($u & 0x7FFF)) | 0x8000;
1001    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
1002	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
1003}
1004
1005sub _derivCE_36 {
1006    my $u = shift;
1007    my $tang = (TangIdeoIni <= $u && $u <= TangIdeoFin ||
1008		TangCompIni <= $u && $u <= TangCompFin);
1009    my $nush = (NushuIni <= $u && $u <= NushuFin);
1010    my $base = ($tang)
1011		? 0xFB00 : # Tangut
1012	       ($nush)
1013		? 0xFB01 : # Nushu
1014	       (CJK_UidIni  <= $u && $u <= CJK_UidF100 || $CompatUI{$u})
1015		? 0xFB40 : # CJK
1016	       (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
1017		CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
1018		CJK_ExtCIni <= $u && $u <= CJK_ExtCFin ||
1019		CJK_ExtDIni <= $u && $u <= CJK_ExtDFin ||
1020		CJK_ExtEIni <= $u && $u <= CJK_ExtEFin ||
1021		CJK_ExtFIni <= $u && $u <= CJK_ExtFFin)
1022		? 0xFB80  # CJK ext.
1023		: 0xFBC0; # others
1024    my $aaaa = $tang || $nush ? $base : $base + ($u >> 15);
1025    my $bbbb = ($tang ? ($u - TangIdeoIni) :
1026		$nush ? ($u - NushuIni) : ($u & 0x7FFF)) | 0x8000;
1027    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
1028	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
1029}
1030
1031sub _derivCE_34 {
1032    my $u = shift;
1033    my $tang = (TangIdeoIni <= $u && $u <= TangIdeoFin ||
1034		TangCompIni <= $u && $u <= TangCompFin);
1035    my $base = ($tang)
1036		? 0xFB00 : # Tangut
1037	       (CJK_UidIni  <= $u && $u <= CJK_UidF80 || $CompatUI{$u})
1038		? 0xFB40 : # CJK
1039	       (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
1040		CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
1041		CJK_ExtCIni <= $u && $u <= CJK_ExtCFin ||
1042		CJK_ExtDIni <= $u && $u <= CJK_ExtDFin ||
1043		CJK_ExtEIni <= $u && $u <= CJK_ExtEFin)
1044		? 0xFB80  # CJK ext.
1045		: 0xFBC0; # others
1046    my $aaaa = $tang ? $base : $base + ($u >> 15);
1047    my $bbbb = ($tang ? ($u - TangIdeoIni) : ($u & 0x7FFF)) | 0x8000;
1048    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
1049	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
1050}
1051
1052sub _derivCE_32 {
1053    my $u = shift;
1054    my $base = (CJK_UidIni  <= $u && $u <= CJK_UidF80 || $CompatUI{$u})
1055		? 0xFB40 : # CJK
1056	       (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
1057		CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
1058		CJK_ExtCIni <= $u && $u <= CJK_ExtCFin ||
1059		CJK_ExtDIni <= $u && $u <= CJK_ExtDFin ||
1060		CJK_ExtEIni <= $u && $u <= CJK_ExtEFin)
1061		? 0xFB80  # CJK ext.
1062		: 0xFBC0; # others
1063    my $aaaa = $base + ($u >> 15);
1064    my $bbbb = ($u & 0x7FFF) | 0x8000;
1065    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
1066	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
1067}
1068
1069sub _derivCE_24 {
1070    my $u = shift;
1071    my $base = (CJK_UidIni  <= $u && $u <= CJK_UidF61 || $CompatUI{$u})
1072		? 0xFB40 : # CJK
1073	       (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
1074		CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
1075		CJK_ExtCIni <= $u && $u <= CJK_ExtCFin ||
1076		CJK_ExtDIni <= $u && $u <= CJK_ExtDFin)
1077		? 0xFB80  # CJK ext.
1078		: 0xFBC0; # others
1079    my $aaaa = $base + ($u >> 15);
1080    my $bbbb = ($u & 0x7FFF) | 0x8000;
1081    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
1082	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
1083}
1084
1085sub _derivCE_22 {
1086    my $u = shift;
1087    my $base = (CJK_UidIni  <= $u && $u <= CJK_UidF52 || $CompatUI{$u})
1088		? 0xFB40 : # CJK
1089	       (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
1090		CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
1091		CJK_ExtCIni <= $u && $u <= CJK_ExtCFin ||
1092		CJK_ExtDIni <= $u && $u <= CJK_ExtDFin)
1093		? 0xFB80  # CJK ext.
1094		: 0xFBC0; # others
1095    my $aaaa = $base + ($u >> 15);
1096    my $bbbb = ($u & 0x7FFF) | 0x8000;
1097    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
1098	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
1099}
1100
1101sub _derivCE_20 {
1102    my $u = shift;
1103    my $base = (CJK_UidIni  <= $u && $u <= CJK_UidF52 || $CompatUI{$u})
1104		? 0xFB40 : # CJK
1105	       (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
1106		CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
1107		CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
1108		? 0xFB80  # CJK ext.
1109		: 0xFBC0; # others
1110    my $aaaa = $base + ($u >> 15);
1111    my $bbbb = ($u & 0x7FFF) | 0x8000;
1112    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
1113	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
1114}
1115
1116sub _derivCE_18 {
1117    my $u = shift;
1118    my $base = (CJK_UidIni  <= $u && $u <= CJK_UidF51 || $CompatUI{$u})
1119		? 0xFB40 : # CJK
1120	       (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
1121		CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
1122		? 0xFB80  # CJK ext.
1123		: 0xFBC0; # others
1124    my $aaaa = $base + ($u >> 15);
1125    my $bbbb = ($u & 0x7FFF) | 0x8000;
1126    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
1127	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
1128}
1129
1130sub _derivCE_14 {
1131    my $u = shift;
1132    my $base = (CJK_UidIni  <= $u && $u <= CJK_UidF41 || $CompatUI{$u})
1133		? 0xFB40 : # CJK
1134	       (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
1135		CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
1136		? 0xFB80  # CJK ext.
1137		: 0xFBC0; # others
1138    my $aaaa = $base + ($u >> 15);
1139    my $bbbb = ($u & 0x7FFF) | 0x8000;
1140    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
1141	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
1142}
1143
1144sub _derivCE_9 {
1145    my $u = shift;
1146    my $base = (CJK_UidIni  <= $u && $u <= CJK_UidFin || $CompatUI{$u})
1147		? 0xFB40 : # CJK
1148	       (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
1149		CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
1150		? 0xFB80  # CJK ext.
1151		: 0xFBC0; # others
1152    my $aaaa = $base + ($u >> 15);
1153    my $bbbb = ($u & 0x7FFF) | 0x8000;
1154    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
1155	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
1156}
1157
1158sub _derivCE_8 {
1159    my $code = shift;
1160    my $aaaa =  0xFF80 + ($code >> 15);
1161    my $bbbb = ($code & 0x7FFF) | 0x8000;
1162    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
1163	   pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
1164}
1165
1166sub _uideoCE_8 {
1167    my $u = shift;
1168    return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
1169}
1170
1171sub _isUIdeo {
1172    # $uca_vers = 0 for _uideoCE_8()
1173    my ($u, $uca_vers) = @_;
1174    return((CJK_UidIni <= $u && (
1175	    $uca_vers >= 43 ? ($u <= CJK_UidF130) :
1176	    $uca_vers >= 38 ? ($u <= CJK_UidF110) :
1177	    $uca_vers >= 36 ? ($u <= CJK_UidF100) :
1178	    $uca_vers >= 32 ? ($u <= CJK_UidF80) :
1179	    $uca_vers >= 24 ? ($u <= CJK_UidF61) :
1180	    $uca_vers >= 20 ? ($u <= CJK_UidF52) :
1181	    $uca_vers >= 18 ? ($u <= CJK_UidF51) :
1182	    $uca_vers >= 14 ? ($u <= CJK_UidF41) :
1183			      ($u <= CJK_UidFin))) || $CompatUI{$u}
1184		||
1185	(CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
1186		||
1187	($uca_vers >= 43 && CJK_ExtAIni <= $u && $u <= CJK_ExtA130)
1188		||
1189	($uca_vers >=  8 && CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
1190		||
1191	($uca_vers >= 43 && CJK_ExtBIni <= $u && $u <= CJK_ExtB130)
1192		||
1193	($uca_vers >= 20 && CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
1194		||
1195	($uca_vers >= 22 && CJK_ExtDIni <= $u && $u <= CJK_ExtDFin)
1196		||
1197	($uca_vers >= 32 && CJK_ExtEIni <= $u && $u <= CJK_ExtEFin)
1198		||
1199	($uca_vers >= 36 && CJK_ExtFIni <= $u && $u <= CJK_ExtFFin)
1200		||
1201	($uca_vers >= 43 && CJK_ExtGIni <= $u && $u <= CJK_ExtGFin)
1202    );
1203}
1204
1205
1206##
1207## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
1208##
1209sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
1210
1211#
1212# $code *must* be in Hangul syllable.
1213# Check it before you enter here.
1214#
1215sub _decompHangul {
1216    my $code = shift;
1217    my $si = $code - Hangul_SBase;
1218    my $li = int( $si / Hangul_NCount);
1219    my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
1220    my $ti =      $si % Hangul_TCount;
1221    return (
1222	Hangul_LBase + $li,
1223	Hangul_VBase + $vi,
1224	$ti ? (Hangul_TBase + $ti) : (),
1225    );
1226}
1227
1228sub _isIllegal {
1229    my $code = shift;
1230    return((! defined $code)                    # removed
1231	|| ($code < 0 || 0x10FFFF < $code)      # out of range
1232	|| (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF]
1233	|| (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
1234	|| (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
1235    );
1236}
1237
1238# Hangul Syllable Type
1239sub getHST {
1240    my $u = shift;
1241    my $vers = shift || 0;
1242
1243    if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1244	return +($u - Hangul_SBase) % Hangul_TCount ? 'LVT' : 'LV';
1245    }
1246
1247    if ($vers < 20) {
1248	return Hangul_LIni <= $u && $u <= Hangul_LFin ||
1249			            $u == Hangul_LFill ? 'L' :
1250	       Hangul_VIni <= $u && $u <= Hangul_VFin  ? 'V' :
1251	       Hangul_TIni <= $u && $u <= Hangul_TFin  ? 'T' : '';
1252    } else {
1253	return Hangul_LIni <= $u && $u <= Hangul_LEnd ||
1254	       HangulL2Ini <= $u && $u <= HangulL2Fin  ? 'L' :
1255	       Hangul_VIni <= $u && $u <= Hangul_VEnd ||
1256	       HangulV2Ini <= $u && $u <= HangulV2Fin  ? 'V' :
1257	       Hangul_TIni <= $u && $u <= Hangul_TEnd ||
1258	       HangulT2Ini <= $u && $u <= HangulT2Fin  ? 'T' : '';
1259    }
1260}
1261### end PP only ##
1262
1263
1264##
1265## bool _nonIgnorAtLevel(arrayref weights, int level)
1266##
1267sub _nonIgnorAtLevel($$)
1268{
1269    my $wt = shift;
1270    return if ! defined $wt;
1271    my $lv = shift;
1272    return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
1273}
1274
1275##
1276## bool _eqArray(
1277##    arrayref of arrayref[weights] source,
1278##    arrayref of arrayref[weights] substr,
1279##    int level)
1280## * comparison of graphemes vs graphemes.
1281##   @$source >= @$substr must be true (check it before call this);
1282##
1283sub _eqArray($$$)
1284{
1285    my $source = shift;
1286    my $substr = shift;
1287    my $lev = shift;
1288
1289    for my $g (0..@$substr-1){
1290	# Do the $g'th graphemes have the same number of AV weights?
1291	return if @{ $source->[$g] } != @{ $substr->[$g] };
1292
1293	for my $w (0..@{ $substr->[$g] }-1) {
1294	    for my $v (0..$lev-1) {
1295		return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
1296	    }
1297	}
1298    }
1299    return 1;
1300}
1301
1302##
1303## (int position, int length)
1304## int position = index(string, substring, position, [undoc'ed global])
1305##
1306## With "global" (only for the list context),
1307##  returns list of arrayref[position, length].
1308##
1309sub index
1310{
1311    my $self = shift;
1312    $self->{preprocess} and
1313	croak "Don't use Preprocess with index(), match(), etc.";
1314    $self->{normCode} and
1315	croak "Don't use Normalization with index(), match(), etc.";
1316
1317    my $str  = shift;
1318    my $len  = length($str);
1319    my $sub  = shift;
1320    my $subE = $self->splitEnt($sub);
1321    my $pos  = @_ ? shift : 0;
1322       $pos  = 0 if $pos < 0;
1323    my $glob = shift;
1324
1325    my $lev  = $self->{level};
1326    my $v2i  = $self->{UCA_Version} >= 9 &&
1327		$self->{variable} ne 'non-ignorable';
1328
1329    if (! @$subE) {
1330	my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
1331	return $glob
1332	    ? map([$_, 0], $temp..$len)
1333	    : wantarray ? ($temp,0) : $temp;
1334    }
1335    $len < $pos
1336	and return wantarray ? () : NOMATCHPOS;
1337    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
1338    @$strE
1339	or return wantarray ? () : NOMATCHPOS;
1340
1341    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
1342
1343    my $last_is_variable;
1344    for my $vwt (map $self->getWt($_), @$subE) {
1345	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
1346	my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
1347
1348	# "Ignorable (L1, L2) after Variable" since track. v. 9
1349	if ($v2i) {
1350	    if ($var) {
1351		$last_is_variable = TRUE;
1352	    }
1353	    elsif (!$wt[0]) { # ignorable
1354		$to_be_pushed = FALSE if $last_is_variable;
1355	    }
1356	    else {
1357		$last_is_variable = FALSE;
1358	    }
1359	}
1360
1361	if (@subWt && !$var && !$wt[0]) {
1362	    push @{ $subWt[-1] }, \@wt if $to_be_pushed;
1363	} elsif ($to_be_pushed) {
1364	    push @subWt, [ \@wt ];
1365	}
1366	# else ===> skipped
1367    }
1368
1369    my $count = 0;
1370    my $end = @$strE - 1;
1371
1372    $last_is_variable = FALSE; # reuse
1373    for (my $i = 0; $i <= $end; ) { # no $i++
1374	my $found_base = 0;
1375
1376	# fetch a grapheme
1377	while ($i <= $end && $found_base == 0) {
1378	    for my $vwt ($self->getWt($strE->[$i][0])) {
1379		my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
1380		my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
1381
1382		# "Ignorable (L1, L2) after Variable" since track. v. 9
1383		if ($v2i) {
1384		    if ($var) {
1385			$last_is_variable = TRUE;
1386		    }
1387		    elsif (!$wt[0]) { # ignorable
1388			$to_be_pushed = FALSE if $last_is_variable;
1389		    }
1390		    else {
1391			$last_is_variable = FALSE;
1392		    }
1393		}
1394
1395		if (@strWt && !$var && !$wt[0]) {
1396		    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
1397		    $finPos[-1] = $strE->[$i][2];
1398		} elsif ($to_be_pushed) {
1399		    push @strWt, [ \@wt ];
1400		    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
1401		    $finPos[-1] = NOMATCHPOS if $found_base;
1402		    push @finPos, $strE->[$i][2];
1403		    $found_base++;
1404		}
1405		# else ===> no-op
1406	    }
1407	    $i++;
1408	}
1409
1410	# try to match
1411	while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1412	    if ($iniPos[0] != NOMATCHPOS &&
1413		    $finPos[$#subWt] != NOMATCHPOS &&
1414			_eqArray(\@strWt, \@subWt, $lev)) {
1415		my $temp = $iniPos[0] + $pos;
1416
1417		if ($glob) {
1418		    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1419		    splice @strWt,  0, $#subWt;
1420		    splice @iniPos, 0, $#subWt;
1421		    splice @finPos, 0, $#subWt;
1422		}
1423		else {
1424		    return wantarray
1425			? ($temp, $finPos[$#subWt] - $iniPos[0])
1426			:  $temp;
1427		}
1428	    }
1429	    shift @strWt;
1430	    shift @iniPos;
1431	    shift @finPos;
1432	}
1433    }
1434
1435    return $glob
1436	? @g_ret
1437	: wantarray ? () : NOMATCHPOS;
1438}
1439
1440##
1441## scalarref to matching part = match(string, substring)
1442##
1443sub match
1444{
1445    my $self = shift;
1446    if (my($pos,$len) = $self->index($_[0], $_[1])) {
1447	my $temp = substr($_[0], $pos, $len);
1448	return wantarray ? $temp : \$temp;
1449	# An lvalue ref \substr should be avoided,
1450	# since its value is affected by modification of its referent.
1451    }
1452    else {
1453	return;
1454    }
1455}
1456
1457##
1458## arrayref matching parts = gmatch(string, substring)
1459##
1460sub gmatch
1461{
1462    my $self = shift;
1463    my $str  = shift;
1464    my $sub  = shift;
1465    return map substr($str, $_->[0], $_->[1]),
1466		$self->index($str, $sub, 0, 'g');
1467}
1468
1469##
1470## bool subst'ed = subst(string, substring, replace)
1471##
1472sub subst
1473{
1474    my $self = shift;
1475    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1476
1477    if (my($pos,$len) = $self->index($_[0], $_[1])) {
1478	if ($code) {
1479	    my $mat = substr($_[0], $pos, $len);
1480	    substr($_[0], $pos, $len, $code->($mat));
1481	} else {
1482	    substr($_[0], $pos, $len, $_[2]);
1483	}
1484	return TRUE;
1485    }
1486    else {
1487	return FALSE;
1488    }
1489}
1490
1491##
1492## int count = gsubst(string, substring, replace)
1493##
1494sub gsubst
1495{
1496    my $self = shift;
1497    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1498    my $cnt = 0;
1499
1500    # Replacement is carried out from the end, then use reverse.
1501    for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1502	if ($code) {
1503	    my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1504	    substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1505	} else {
1506	    substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1507	}
1508	$cnt++;
1509    }
1510    return $cnt;
1511}
1512
15131;
1514__END__
1515
1516=head1 NAME
1517
1518Unicode::Collate - Unicode Collation Algorithm
1519
1520=head1 SYNOPSIS
1521
1522  use Unicode::Collate;
1523
1524  #construct
1525  $Collator = Unicode::Collate->new(%tailoring);
1526
1527  #sort
1528  @sorted = $Collator->sort(@not_sorted);
1529
1530  #compare
1531  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1532
1533B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1534according to Perl's Unicode support. See L<perlunicode>,
1535L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1536Otherwise you can use C<preprocess> or should decode them before.
1537
1538=head1 DESCRIPTION
1539
1540This module is an implementation of Unicode Technical Standard #10
1541(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1542
1543=head2 Constructor and Tailoring
1544
1545The C<new> method returns a collator object. If new() is called
1546with no parameters, the collator should do the default collation.
1547
1548   $Collator = Unicode::Collate->new(
1549      UCA_Version => $UCA_Version,
1550      alternate => $alternate, # alias for 'variable'
1551      backwards => $levelNumber, # or \@levelNumbers
1552      entry => $element,
1553      hangul_terminator => $term_primary_weight,
1554      highestFFFF => $bool,
1555      identical => $bool,
1556      ignoreName => qr/$ignoreName/,
1557      ignoreChar => qr/$ignoreChar/,
1558      ignore_level2 => $bool,
1559      katakana_before_hiragana => $bool,
1560      level => $collationLevel,
1561      long_contraction => $bool,
1562      minimalFFFE => $bool,
1563      normalization  => $normalization_form,
1564      overrideCJK => \&overrideCJK,
1565      overrideHangul => \&overrideHangul,
1566      preprocess => \&preprocess,
1567      rearrange => \@charList,
1568      rewrite => \&rewrite,
1569      suppress => \@charList,
1570      table => $filename,
1571      undefName => qr/$undefName/,
1572      undefChar => qr/$undefChar/,
1573      upper_before_lower => $bool,
1574      variable => $variable,
1575   );
1576
1577=over 4
1578
1579=item UCA_Version
1580
1581If the revision (previously "tracking version") number of UCA is given,
1582behavior of that revision is emulated on collating.
1583If omitted, the return value of C<UCA_Version()> is used.
1584
1585The following revisions are supported.  The default is 43.
1586
1587     UCA       Unicode Standard         DUCET (@version)
1588   -------------------------------------------------------
1589      8              3.1                3.0.1 (3.0.1d9)
1590      9     3.1 with Corrigendum 3      3.1.1
1591     11             4.0.0
1592     14             4.1.0
1593     16             5.0.0
1594     18             5.1.0
1595     20             5.2.0
1596     22             6.0.0
1597     24             6.1.0
1598     26             6.2.0
1599     28             6.3.0
1600     30             7.0.0
1601     32             8.0.0
1602     34             9.0.0
1603     36            10.0.0
1604     38            11.0.0
1605     40            12.0.0
1606     41            12.1.0
1607     43            13.0.0
1608
1609* See below for C<long_contraction> with C<UCA_Version> 22 and 24.
1610
1611* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1612since C<UCA_Version> 22.
1613
1614* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
1615and can be overridden since C<UCA_Version> 22.
1616
1617* Fully ignorable characters were ignored, and would not interrupt
1618contractions with C<UCA_Version> 9 and 11.
1619
1620* Treatment of ignorables after variables and some behaviors
1621were changed at C<UCA_Version> 9.
1622
1623* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1624depend on C<UCA_Version>.
1625
1626* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1627C<hangul_terminator>.
1628
1629=item alternate
1630
1631-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1632
1633For backward compatibility, C<alternate> (old name) can be used
1634as an alias for C<variable>.
1635
1636=item backwards
1637
1638-- see 3.4 Backward Accents, UTS #10.
1639
1640     backwards => $levelNumber or \@levelNumbers
1641
1642Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1643If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1644forwards at all the levels.
1645
1646=item entry
1647
1648-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
1649
1650If the same character (or a sequence of characters) exists
1651in the collation element table through C<table>,
1652mapping to collation elements is overridden.
1653If it does not exist, the mapping is defined additionally.
1654
1655    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
16560063 0068 ; [.0E6A.0020.0002.0063] # ch
16570043 0068 ; [.0E6A.0020.0007.0043] # Ch
16580043 0048 ; [.0E6A.0020.0008.0043] # CH
1659006C 006C ; [.0F4C.0020.0002.006C] # ll
1660004C 006C ; [.0F4C.0020.0007.004C] # Ll
1661004C 004C ; [.0F4C.0020.0008.004C] # LL
166200F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1663006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
166400D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1665004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1666ENTRY
1667
1668    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
166900E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
167000C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1671ENTRY
1672
1673B<NOTE:> The code point in the UCA file format (before C<';'>)
1674B<must> be a Unicode code point (defined as hexadecimal),
1675but not a native code point.
1676So C<0063> must always denote C<U+0063>,
1677but not a character of C<"\x63">.
1678
1679Weighting may vary depending on collation element table.
1680So ensure the weights defined in C<entry> will be consistent with
1681those in the collation element table loaded via C<table>.
1682
1683In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1684and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1685(as a value between C<0E60> and C<0E6D>)
1686makes ordering as C<C E<lt> CH E<lt> D>.
1687Exactly speaking DUCET already has some characters between C<C> and C<D>:
1688C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1689C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1690and C<c-curl> (C<U+0255>) with C<0E69>.
1691Then primary weight C<0E6A> for C<CH> makes C<CH>
1692ordered between C<c-curl> and C<D>.
1693
1694=item hangul_terminator
1695
1696-- see 7.1.4 Trailing Weights, UTS #10.
1697
1698If a true value is given (non-zero but should be positive),
1699it will be added as a terminator primary weight to the end of
1700every standard Hangul syllable. Secondary and any higher weights
1701for terminator are set to zero.
1702If the value is false or C<hangul_terminator> key does not exist,
1703insertion of terminator weights will not be performed.
1704
1705Boundaries of Hangul syllables are determined
1706according to conjoining Jamo behavior in F<the Unicode Standard>
1707and F<HangulSyllableType.txt>.
1708
1709B<Implementation Note:>
1710(1) For expansion mapping (Unicode character mapped
1711to a sequence of collation elements), a terminator will not be added
1712between collation elements, even if Hangul syllable boundary exists there.
1713Addition of terminator is restricted to the next position
1714to the last collation element.
1715
1716(2) Non-conjoining Hangul letters
1717(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1718automatically terminated with a terminator primary weight.
1719These characters may need terminator included in a collation element
1720table beforehand.
1721
1722=item highestFFFF
1723
1724-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation.
1725
1726If the parameter is made true, C<U+FFFF> has a highest primary weight.
1727When a boolean of C<$coll-E<gt>ge($str, "abc")> and
1728C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
1729begins with C<"abc">, or another primary equivalent.
1730C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1731such as C<"abc\x{FFFF}xyz">.
1732
1733C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
1734almost, but the latter has a problem that you should know which letter is
1735next to C<c>. For a certain language where C<ch> as the next letter,
1736C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
1737
1738Note:
1739This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
1740Any other character than C<U+FFFF> can be tailored by C<entry>.
1741
1742=item identical
1743
1744-- see A.3 Deterministic Comparison, UTS #10.
1745
1746By default, strings whose weights are equal should be equal,
1747even though their code points are not equal.
1748Completely ignorable characters are ignored.
1749
1750If the parameter is made true, a final, tie-breaking level is used.
1751If no difference of weights is found after the comparison through
1752all the level specified by C<level>, the comparison with code points
1753will be performed.
1754For the tie-breaking comparison, the sort key has code points
1755of the original string appended.
1756Completely ignorable characters are not ignored.
1757
1758If C<preprocess> and/or C<normalization> is applied, the code points
1759of the string after them (in NFD by default) are used.
1760
1761=item ignoreChar
1762
1763=item ignoreName
1764
1765-- see 3.6 Variable Weighting, UTS #10.
1766
1767Makes the entry in the table completely ignorable;
1768i.e. as if the weights were zero at all level.
1769
1770Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1771will be ignored. Through C<ignoreName>, any character whose name
1772(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1773will be ignored.
1774
1775E.g. when 'a' and 'e' are ignorable,
1776'element' is equal to 'lament' (or 'lmnt').
1777
1778=item ignore_level2
1779
1780-- see 5.1 Parametric Tailoring, UTS #10.
1781
1782By default, case-sensitive comparison (that is level 3 difference)
1783won't ignore accents (that is level 2 difference).
1784
1785If the parameter is made true, accents (and other primary ignorable
1786characters) are ignored, even though cases are taken into account.
1787
1788B<NOTE>: C<level> should be 3 or greater.
1789
1790=item katakana_before_hiragana
1791
1792-- see 7.2 Tertiary Weight Table, UTS #10.
1793
1794By default, hiragana is before katakana.
1795If the parameter is made true, this is reversed.
1796
1797B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1798distinctions must occur in level 3, and their weights at level 3 must be
1799same as those mentioned in 7.3.1, UTS #10.
1800If you define your collation elements which violate this requirement,
1801this parameter does not work validly.
1802
1803=item level
1804
1805-- see 4.3 Form Sort Key, UTS #10.
1806
1807Set the maximum level.
1808Any higher levels than the specified one are ignored.
1809
1810  Level 1: alphabetic ordering
1811  Level 2: diacritic ordering
1812  Level 3: case ordering
1813  Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1814
1815  ex.level => 2,
1816
1817If omitted, the maximum is the 4th.
1818
1819B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1820But this module only uses weights within 0xFFFF.
1821When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1822and 'shift-trimmed'), the level 4 may be unreliable.
1823
1824See also C<identical>.
1825
1826=item long_contraction
1827
1828-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
1829
1830If the parameter is made true, for a contraction with three or more
1831characters (here nicknamed "long contraction"), initial substrings
1832will be handled.
1833For example, a contraction ABC, where A is a starter, and B and C
1834are non-starters (character with non-zero combining character class),
1835will be detected even if there is not AB as a contraction.
1836
1837B<Default:> Usually false.
1838If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
1839is not specified in C<new()>, a true value is set implicitly.
1840This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
1841
1842C<change()> handles C<long_contraction> explicitly only.
1843If C<long_contraction> is not specified in C<change()>, even though
1844C<UCA_Version> is changed, C<long_contraction> will not be changed.
1845
1846B<Limitation:> Scanning non-starters is one-way (no back tracking).
1847If AB is found but not ABC is not found, other long contraction where
1848the first character is A and the second is not B may not be found.
1849
1850Under C<(normalization =E<gt> undef)>, detection step of discontiguous
1851contractions will be skipped.
1852
1853B<Note:> The following contractions in DUCET are not considered
1854in steps S2.1.1 to S2.1.3, where they are discontiguous.
1855
1856    0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
1857    0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
1858
1859For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
1860(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
1861In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
1862instead of C<0FB2 0F71 0F80>.
1863Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
1864contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
1865
1866=item minimalFFFE
1867
1868-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation.
1869
1870If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1871The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1872first compares C<$a1> and C<$b1> at level 1, and
1873then C<$a2> and C<$b2> at level 1, as followed.
1874
1875        "ab\x{FFFE}a"
1876        "Ab\x{FFFE}a"
1877        "ab\x{FFFE}c"
1878        "Ab\x{FFFE}c"
1879        "ab\x{FFFE}xyz"
1880        "abc\x{FFFE}def"
1881        "abc\x{FFFE}xYz"
1882        "aBc\x{FFFE}xyz"
1883        "abcX\x{FFFE}def"
1884        "abcx\x{FFFE}xyz"
1885        "b\x{FFFE}aaa"
1886        "bbb\x{FFFE}a"
1887
1888Note:
1889This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
1890Any other character than C<U+FFFE> can be tailored by C<entry>.
1891
1892=item normalization
1893
1894-- see 4.1 Normalize, UTS #10.
1895
1896If specified, strings are normalized before preparation of sort keys
1897(the normalization is executed after preprocess).
1898
1899A form name C<Unicode::Normalize::normalize()> accepts will be applied
1900as C<$normalization_form>.
1901Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1902See C<Unicode::Normalize::normalize()> for detail.
1903If omitted, C<'NFD'> is used.
1904
1905C<normalization> is performed after C<preprocess> (if defined).
1906
1907Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1908though they are not concerned with C<Unicode::Normalize::normalize()>.
1909
1910If C<undef> (not a string C<"undef">) is passed explicitly
1911as the value for this key,
1912any normalization is not carried out (this may make tailoring easier
1913if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1914only contiguous contractions are resolved;
1915e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1916C<A-cedilla-ring> would be primary equal to C<A>.
1917In this point,
1918C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1919B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1920
1921In the case of C<(normalization =E<gt> "prenormalized")>,
1922any normalization is not performed, but
1923discontiguous contractions with combining characters are performed.
1924Therefore
1925C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1926B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1927If source strings are finely prenormalized,
1928C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1929
1930Except C<(normalization =E<gt> undef)>,
1931B<Unicode::Normalize> is required (see also B<CAVEAT>).
1932
1933=item overrideCJK
1934
1935-- see 7.1 Derived Collation Elements, UTS #10.
1936
1937By default, CJK unified ideographs are ordered in Unicode codepoint
1938order, but those in the CJK Unified Ideographs block are less than
1939those in the CJK Unified Ideographs Extension A etc.
1940
1941    In the CJK Unified Ideographs block:
1942    U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1943    U+4E00..U+9FBB if UCA_Version is 14 or 16.
1944    U+4E00..U+9FC3 if UCA_Version is 18.
1945    U+4E00..U+9FCB if UCA_Version is 20 or 22.
1946    U+4E00..U+9FCC if UCA_Version is 24 to 30.
1947    U+4E00..U+9FD5 if UCA_Version is 32 or 34.
1948    U+4E00..U+9FEA if UCA_Version is 36.
1949    U+4E00..U+9FEF if UCA_Version is 38, 40 or 41.
1950    U+4E00..U+9FFC if UCA_Version is 43.
1951
1952    In the CJK Unified Ideographs Extension blocks:
1953    Ext.A (U+3400..U+4DB5)   if UCA_Version is  8 to 41.
1954    Ext.A (U+3400..U+4DBF)   if UCA_Version is 43.
1955    Ext.B (U+20000..U+2A6D6) if UCA_Version is  8 to 41.
1956    Ext.B (U+20000..U+2A6DD) if UCA_Version is 43.
1957    Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
1958    Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
1959    Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32 or later.
1960    Ext.F (U+2CEB0..U+2EBE0) if UCA_Version is 36 or later.
1961    Ext.G (U+30000..U+3134A) if UCA_Version is 43.
1962
1963Through C<overrideCJK>, ordering of CJK unified ideographs (including
1964extensions) can be overridden.
1965
1966ex. CJK unified ideographs in the JIS code point order.
1967
1968  overrideCJK => sub {
1969      my $u = shift;             # get a Unicode codepoint
1970      my $b = pack('n', $u);     # to UTF-16BE
1971      my $s = your_unicode_to_sjis_converter($b); # convert
1972      my $n = unpack('n', $s);   # convert sjis to short
1973      [ $n, 0x20, 0x2, $u ];     # return the collation element
1974  },
1975
1976The return value may be an arrayref of 1st to 4th weights as shown
1977above. The return value may be an integer as the primary weight
1978as shown below.  If C<undef> is returned, the default derived
1979collation element will be used.
1980
1981  overrideCJK => sub {
1982      my $u = shift;             # get a Unicode codepoint
1983      my $b = pack('n', $u);     # to UTF-16BE
1984      my $s = your_unicode_to_sjis_converter($b); # convert
1985      my $n = unpack('n', $s);   # convert sjis to short
1986      return $n;                 # return the primary weight
1987  },
1988
1989The return value may be a list containing zero or more of
1990an arrayref, an integer, or C<undef>.
1991
1992ex. ignores all CJK unified ideographs.
1993
1994  overrideCJK => sub {()}, # CODEREF returning empty list
1995
1996   # where ->eq("Pe\x{4E00}rl", "Perl") is true
1997   # as U+4E00 is a CJK unified ideograph and to be ignorable.
1998
1999If a false value (including C<undef>) is passed, C<overrideCJK>
2000has no effect.
2001C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
2002
2003But assignment of weight for CJK unified ideographs
2004in C<table> or C<entry> is still valid.
2005If C<undef> is passed explicitly as the value for this key,
2006weights for CJK unified ideographs are treated as undefined.
2007However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
2008has no special meaning.
2009
2010B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
2011C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
2012C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
2013ideographs. But they can't be overridden via C<overrideCJK> when you use
2014DUCET, as the table includes weights for them. C<table> or C<entry> has
2015priority over C<overrideCJK>.
2016
2017=item overrideHangul
2018
2019-- see 7.1 Derived Collation Elements, UTS #10.
2020
2021By default, Hangul syllables are decomposed into Hangul Jamo,
2022even if C<(normalization =E<gt> undef)>.
2023But the mapping of Hangul syllables may be overridden.
2024
2025This parameter works like C<overrideCJK>, so see there for examples.
2026
2027If you want to override the mapping of Hangul syllables,
2028NFD and NFKD are not appropriate, since NFD and NFKD will decompose
2029Hangul syllables before overriding. FCD may decompose Hangul syllables
2030as the case may be.
2031
2032If a false value (but not C<undef>) is passed, C<overrideHangul>
2033has no effect.
2034C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
2035
2036If C<undef> is passed explicitly as the value for this key,
2037weight for Hangul syllables is treated as undefined
2038without decomposition into Hangul Jamo.
2039But definition of weight for Hangul syllables
2040in C<table> or C<entry> is still valid.
2041
2042=item overrideOut
2043
2044-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
2045
2046Perl seems to allow out-of-range values (greater than 0x10FFFF).
2047By default, out-of-range values are replaced with C<U+FFFD>
2048(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
2049or ignored when C<UCA_Version> E<lt>= 20.
2050
2051When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
2052can be overridden. Though C<table> or C<entry> are available for them,
2053out-of-range values are too many.
2054
2055C<overrideOut> can perform it algorithmically.
2056This parameter works like C<overrideCJK>, so see there for examples.
2057
2058ex. ignores all out-of-range values.
2059
2060  overrideOut => sub {()}, # CODEREF returning empty list
2061
2062If a false value (including C<undef>) is passed, C<overrideOut>
2063has no effect.
2064C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
2065
2066B<NOTE ABOUT U+FFFD:>
2067
2068UCA recommends that out-of-range values should not be ignored for security
2069reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
2070However, C<U+FFFD> is wrongly mapped to a variable collation element
2071in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
2072ignored when C<variable> isn't C<Non-ignorable>.
2073
2074The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
2075see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
2076(7.1.4 Trailing Weights). Such a correction is reproduced by this.
2077
2078  overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
2079
2080This workaround is unnecessary since Unicode 6.3.0.
2081
2082=item preprocess
2083
2084-- see 5.4 Preprocessing, UTS #10.
2085
2086If specified, the coderef is used to preprocess each string
2087before the formation of sort keys.
2088
2089ex. dropping English articles, such as "a" or "the".
2090Then, "the pen" is before "a pencil".
2091
2092     preprocess => sub {
2093           my $str = shift;
2094           $str =~ s/\b(?:an?|the)\s+//gi;
2095           return $str;
2096        },
2097
2098C<preprocess> is performed before C<normalization> (if defined).
2099
2100ex. decoding strings in a legacy encoding such as shift-jis:
2101
2102    $sjis_collator = Unicode::Collate->new(
2103        preprocess => \&your_shiftjis_to_unicode_decoder,
2104    );
2105    @result = $sjis_collator->sort(@shiftjis_strings);
2106
2107B<Note:> Strings returned from the coderef will be interpreted
2108according to Perl's Unicode support. See L<perlunicode>,
2109L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
2110
2111=item rearrange
2112
2113-- see 3.5 Rearrangement, UTS #10.
2114
2115Characters that are not coded in logical order and to be rearranged.
2116If C<UCA_Version> is equal to or less than 11, default is:
2117
2118    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
2119
2120If you want to disallow any rearrangement, pass C<undef> or C<[]>
2121(a reference to empty list) as the value for this key.
2122
2123If C<UCA_Version> is equal to or greater than 14, default is C<[]>
2124(i.e. no rearrangement).
2125
2126B<According to the version 9 of UCA, this parameter shall not be used;
2127but it is not warned at present.>
2128
2129=item rewrite
2130
2131If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
2132The coderef will get each line, and then should return a rewritten line
2133according to the UCA file format.
2134If the coderef returns an empty line, the line will be skipped.
2135
2136e.g. any primary ignorable characters into tertiary ignorable:
2137
2138    rewrite => sub {
2139        my $line = shift;
2140        $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
2141        return $line;
2142    },
2143
2144This example shows rewriting weights. C<rewrite> is allowed to
2145affect code points, weights, and the name.
2146
2147B<NOTE>: C<table> is available to use another table file;
2148preparing a modified table once would be more efficient than
2149rewriting lines on reading an unmodified table every time.
2150
2151=item suppress
2152
2153-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation.
2154
2155Contractions beginning with the specified characters are suppressed,
2156even if those contractions are defined in C<table>.
2157
2158An example for Russian and some languages using the Cyrillic script:
2159
2160    suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
2161
2162where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
2163
2164B<NOTE>: Contractions via C<entry> will not be suppressed.
2165
2166=item table
2167
2168-- see 3.8 Default Unicode Collation Element Table, UTS #10.
2169
2170You can use another collation element table if desired.
2171
2172The table file should locate in the F<Unicode/Collate> directory
2173on C<@INC>. Say, if the filename is F<Foo.txt>,
2174the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
2175
2176By default, F<allkeys.txt> (as the filename of DUCET) is used.
2177If you will prepare your own table file, any name other than F<allkeys.txt>
2178may be better to avoid namespace conflict.
2179
2180B<NOTE>: When XSUB is used, the DUCET is compiled on building this
2181module, and it may save time at the run time.
2182Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
2183or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
2184C<rewrite> will prevent this module from using the compiled DUCET.
2185
2186If C<undef> is passed explicitly as the value for this key,
2187no file is read (but you can define collation elements via C<entry>).
2188
2189A typical way to define a collation element table
2190without any file of table:
2191
2192   $onlyABC = Unicode::Collate->new(
2193       table => undef,
2194       entry => << 'ENTRIES',
21950061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
21960041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
21970062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
21980042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
21990063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
22000043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
2201ENTRIES
2202    );
2203
2204If C<ignoreName> or C<undefName> is used, character names should be
2205specified as a comment (following C<#>) on each line.
2206
2207=item undefChar
2208
2209=item undefName
2210
2211-- see 6.3.3 Reducing the Repertoire, UTS #10.
2212
2213Undefines the collation element as if it were unassigned in the C<table>.
2214This reduces the size of the table.
2215If an unassigned character appears in the string to be collated,
2216the sort key is made from its codepoint
2217as a single-character collation element,
2218as it is greater than any other assigned collation elements
2219(in the codepoint order among the unassigned characters).
2220But, it'd be better to ignore characters
2221unfamiliar to you and maybe never used.
2222
2223Through C<undefChar>, any character matching C<qr/$undefChar/>
2224will be undefined. Through C<undefName>, any character whose name
2225(given in the C<table> file as a comment) matches C<qr/$undefName/>
2226will be undefined.
2227
2228ex. Collation weights for beyond-BMP characters are not stored in object:
2229
2230    undefChar => qr/[^\0-\x{fffd}]/,
2231
2232=item upper_before_lower
2233
2234-- see 6.6 Case Comparisons, UTS #10.
2235
2236By default, lowercase is before uppercase.
2237If the parameter is made true, this is reversed.
2238
2239B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
2240distinctions must occur in level 3, and their weights at level 3 must be
2241same as those mentioned in 7.3.1, UTS #10.
2242If you define your collation elements which differs from this requirement,
2243this parameter doesn't work validly.
2244
2245=item variable
2246
2247-- see 3.6 Variable Weighting, UTS #10.
2248
2249This key allows for variable weighting of variable collation elements,
2250which are marked with an ASTERISK in the table
2251(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
2252
2253   variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
2254
2255These names are case-insensitive.
2256By default (if specification is omitted), 'shifted' is adopted.
2257
2258   'Blanked'        Variable elements are made ignorable at levels 1 through 3;
2259                    considered at the 4th level.
2260
2261   'Non-Ignorable'  Variable elements are not reset to ignorable.
2262
2263   'Shifted'        Variable elements are made ignorable at levels 1 through 3
2264                    their level 4 weight is replaced by the old level 1 weight.
2265                    Level 4 weight for Non-Variable elements is 0xFFFF.
2266
2267   'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
2268                    are trimmed.
2269
2270=back
2271
2272=head2 Methods for Collation
2273
2274=over 4
2275
2276=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
2277
2278Sorts a list of strings.
2279
2280=item C<$result = $Collator-E<gt>cmp($a, $b)>
2281
2282Returns 1 (when C<$a> is greater than C<$b>)
2283or 0 (when C<$a> is equal to C<$b>)
2284or -1 (when C<$a> is less than C<$b>).
2285
2286=item C<$result = $Collator-E<gt>eq($a, $b)>
2287
2288=item C<$result = $Collator-E<gt>ne($a, $b)>
2289
2290=item C<$result = $Collator-E<gt>lt($a, $b)>
2291
2292=item C<$result = $Collator-E<gt>le($a, $b)>
2293
2294=item C<$result = $Collator-E<gt>gt($a, $b)>
2295
2296=item C<$result = $Collator-E<gt>ge($a, $b)>
2297
2298They works like the same name operators as theirs.
2299
2300   eq : whether $a is equal to $b.
2301   ne : whether $a is not equal to $b.
2302   lt : whether $a is less than $b.
2303   le : whether $a is less than $b or equal to $b.
2304   gt : whether $a is greater than $b.
2305   ge : whether $a is greater than $b or equal to $b.
2306
2307=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
2308
2309-- see 4.3 Form Sort Key, UTS #10.
2310
2311Returns a sort key.
2312
2313You compare the sort keys using a binary comparison
2314and get the result of the comparison of the strings using UCA.
2315
2316   $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
2317
2318      is equivalent to
2319
2320   $Collator->cmp($a, $b)
2321
2322=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
2323
2324Converts a sorting key into its representation form.
2325If C<UCA_Version> is 8, the output is slightly different.
2326
2327   use Unicode::Collate;
2328   my $c = Unicode::Collate->new();
2329   print $c->viewSortKey("Perl"),"\n";
2330
2331   # output:
2332   # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
2333   #  Level 1               Level 2               Level 3               Level 4
2334
2335=back
2336
2337=head2 Methods for Searching
2338
2339The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
2340like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
2341but they are not aware of any pattern, but only a literal substring.
2342
2343B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
2344for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
2345C<subst>, C<gsubst>) is croaked, as the position and the length might
2346differ from those on the specified string.
2347
2348C<rearrange> and C<hangul_terminator> parameters are neglected.
2349C<katakana_before_hiragana> and C<upper_before_lower> don't affect
2350matching and searching, as it doesn't matter whether greater or less.
2351
2352=over 4
2353
2354=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
2355
2356=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
2357
2358If C<$substring> matches a part of C<$string>, returns
2359the position of the first occurrence of the matching part in scalar context;
2360in list context, returns a two-element list of
2361the position and the length of the matching part.
2362
2363If C<$substring> does not match any part of C<$string>,
2364returns C<-1> in scalar context and
2365an empty list in list context.
2366
2367e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
2368you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
2369
2370  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
2371                                     # (normalization => undef) is REQUIRED.
2372  my $match;
2373  if (my($pos,$len) = $Collator->index($str, $sub)) {
2374      $match = substr($str, $pos, $len);
2375  }
2376
2377and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
2378is primary equal to C<"M>E<uuml>C<SS">.
2379
2380=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
2381
2382=item C<($match)   = $Collator-E<gt>match($string, $substring)>
2383
2384If C<$substring> matches a part of C<$string>, in scalar context, returns
2385B<a reference to> the first occurrence of the matching part
2386(C<$match_ref> is always true if matches,
2387since every reference is B<true>);
2388in list context, returns the first occurrence of the matching part.
2389
2390If C<$substring> does not match any part of C<$string>,
2391returns C<undef> in scalar context and
2392an empty list in list context.
2393
2394e.g.
2395
2396    if ($match_ref = $Collator->match($str, $sub)) { # scalar context
2397	print "matches [$$match_ref].\n";
2398    } else {
2399	print "doesn't match.\n";
2400    }
2401
2402     or
2403
2404    if (($match) = $Collator->match($str, $sub)) { # list context
2405	print "matches [$match].\n";
2406    } else {
2407	print "doesn't match.\n";
2408    }
2409
2410=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
2411
2412If C<$substring> matches a part of C<$string>, returns
2413all the matching parts (or matching count in scalar context).
2414
2415If C<$substring> does not match any part of C<$string>,
2416returns an empty list.
2417
2418=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
2419
2420If C<$substring> matches a part of C<$string>,
2421the first occurrence of the matching part is replaced by C<$replacement>
2422(C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
2423
2424C<$replacement> can be a C<CODEREF>,
2425taking the matching part as an argument,
2426and returning a string to replace the matching part
2427(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
2428
2429=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
2430
2431If C<$substring> matches a part of C<$string>,
2432all the occurrences of the matching part are replaced by C<$replacement>
2433(C<$string> is modified) and C<$count> is returned.
2434
2435C<$replacement> can be a C<CODEREF>,
2436taking the matching part as an argument,
2437and returning a string to replace the matching part
2438(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
2439
2440e.g.
2441
2442  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
2443                                     # (normalization => undef) is REQUIRED.
2444  my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
2445  $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
2446
2447  # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
2448  # i.e., all the camels are made bold-faced.
2449
2450   Examples: levels and ignore_level2 - what does camel match?
2451  ---------------------------------------------------------------------------
2452   level  ignore_level2  |  camel  Camel  came\x{301}l  c-a-m-e-l  cam\0e\0l
2453  -----------------------|---------------------------------------------------
2454     1        false      |   yes    yes      yes          yes        yes
2455     2        false      |   yes    yes      no           yes        yes
2456     3        false      |   yes    no       no           yes        yes
2457     4        false      |   yes    no       no           no         yes
2458  -----------------------|---------------------------------------------------
2459     1        true       |   yes    yes      yes          yes        yes
2460     2        true       |   yes    yes      yes          yes        yes
2461     3        true       |   yes    no       yes          yes        yes
2462     4        true       |   yes    no       yes          no         yes
2463  ---------------------------------------------------------------------------
2464   note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
2465         at any level.
2466
2467=back
2468
2469=head2 Other Methods
2470
2471=over 4
2472
2473=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
2474
2475=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
2476
2477Changes the value of specified keys and returns the changed part.
2478
2479    $Collator = Unicode::Collate->new(level => 4);
2480
2481    $Collator->eq("perl", "PERL"); # false
2482
2483    %old = $Collator->change(level => 2); # returns (level => 4).
2484
2485    $Collator->eq("perl", "PERL"); # true
2486
2487    $Collator->change(%old); # returns (level => 2).
2488
2489    $Collator->eq("perl", "PERL"); # false
2490
2491Not all C<(key,value)>s are allowed to be changed.
2492See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
2493
2494In the scalar context, returns the modified collator
2495(but it is B<not> a clone from the original).
2496
2497    $Collator->change(level => 2)->eq("perl", "PERL"); # true
2498
2499    $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
2500
2501    $Collator->change(level => 4)->eq("perl", "PERL"); # false
2502
2503=item C<$version = $Collator-E<gt>version()>
2504
2505Returns the version number (a string) of the Unicode Standard
2506which the C<table> file used by the collator object is based on.
2507If the table does not include a version line (starting with C<@version>),
2508returns C<"unknown">.
2509
2510=item C<UCA_Version()>
2511
2512Returns the revision number of UTS #10 this module consults,
2513that should correspond with the DUCET incorporated.
2514
2515=item C<Base_Unicode_Version()>
2516
2517Returns the version number of UTS #10 this module consults,
2518that should correspond with the DUCET incorporated.
2519
2520=back
2521
2522=head1 EXPORT
2523
2524No method will be exported.
2525
2526=head1 INSTALL
2527
2528Though this module can be used without any C<table> file,
2529to use this module easily, it is recommended to install a table file
2530in the UCA format, by copying it under the directory
2531<a place in @INC>/Unicode/Collate.
2532
2533The most preferable one is "The Default Unicode Collation Element Table"
2534(aka DUCET), available from the Unicode Consortium's website:
2535
2536   http://www.unicode.org/Public/UCA/
2537
2538   http://www.unicode.org/Public/UCA/latest/allkeys.txt
2539   (latest version)
2540
2541If DUCET is not installed, it is recommended to copy the file
2542from http://www.unicode.org/Public/UCA/latest/allkeys.txt
2543to <a place in @INC>/Unicode/Collate/allkeys.txt
2544manually.
2545
2546=head1 CAVEATS
2547
2548=over 4
2549
2550=item Normalization
2551
2552Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2553module (see L<Unicode::Normalize>).
2554
2555If you need not it (say, in the case when you need not
2556handle any combining characters),
2557assign C<(normalization =E<gt> undef)> explicitly.
2558
2559-- see 6.5 Avoiding Normalization, UTS #10.
2560
2561=item Conformance Test
2562
2563The Conformance Test for the UCA is available
2564under L<http://www.unicode.org/Public/UCA/>.
2565
2566For F<CollationTest_SHIFTED.txt>,
2567a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2568for F<CollationTest_NON_IGNORABLE.txt>, a collator via
2569C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
2570
2571If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
2572C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
2573C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
2574C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2575
2576B<Unicode::Normalize is required to try The Conformance Test.>
2577
2578B<EBCDIC-SUPPORT IS EXPERIMENTAL.>
2579
2580=back
2581
2582=head1 AUTHOR, COPYRIGHT AND LICENSE
2583
2584The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
2585<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2021,
2586SADAHIRO Tomoyuki. Japan. All rights reserved.
2587
2588This module is free software; you can redistribute it and/or
2589modify it under the same terms as Perl itself.
2590
2591The file Unicode/Collate/allkeys.txt was copied verbatim
2592from L<http://www.unicode.org/Public/UCA/13.0.0/allkeys.txt>.
2593For this file, Copyright (c) 2020 Unicode, Inc.; distributed
2594under the Terms of Use in L<http://www.unicode.org/terms_of_use.html>
2595
2596=head1 SEE ALSO
2597
2598=over 4
2599
2600=item Unicode Collation Algorithm - UTS #10
2601
2602L<http://www.unicode.org/reports/tr10/>
2603
2604=item The Default Unicode Collation Element Table (DUCET)
2605
2606L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
2607
2608=item The conformance test for the UCA
2609
2610L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
2611
2612L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
2613
2614=item Hangul Syllable Type
2615
2616L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
2617
2618=item Unicode Normalization Forms - UAX #15
2619
2620L<http://www.unicode.org/reports/tr15/>
2621
2622=item Unicode Locale Data Markup Language (LDML) - UTS #35
2623
2624L<http://www.unicode.org/reports/tr35/>
2625
2626=back
2627
2628=cut
2629