1package Unicode::Collate;
2
3BEGIN {
4    unless ("A" eq pack('U', 0x41)) {
5	die "Unicode::Collate cannot stringify a Unicode code point\n";
6    }
7}
8
9use 5.006;
10use strict;
11use warnings;
12use Carp;
13use File::Spec;
14
15no warnings 'utf8';
16
17our $VERSION = '0.52_01';
18our $PACKAGE = __PACKAGE__;
19
20my @Path = qw(Unicode Collate);
21my $KeyFile = "allkeys.txt";
22
23# Perl's boolean
24use constant TRUE  => 1;
25use constant FALSE => "";
26use constant NOMATCHPOS => -1;
27
28# A coderef to get combining class imported from Unicode::Normalize
29# (i.e. \&Unicode::Normalize::getCombinClass).
30# This is also used as a HAS_UNICODE_NORMALIZE flag.
31my $CVgetCombinClass;
32
33# Supported Levels
34use constant MinLevel => 1;
35use constant MaxLevel => 4;
36
37# Minimum weights at level 2 and 3, respectively
38use constant Min2Wt => 0x20;
39use constant Min3Wt => 0x02;
40
41# Shifted weight at 4th level
42use constant Shift4Wt => 0xFFFF;
43
44# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
45# PROBLEM: The Default Unicode Collation Element Table
46# has weights over 0xFFFF at the 4th level.
47# The tie-breaking in the variable weights
48# other than "shift" (as well as "shift-trimmed") is unreliable.
49use constant VCE_TEMPLATE => 'Cn4';
50
51# A sort key: 16-bit weights
52# See also the PROBLEM on VCE_TEMPLATE above.
53use constant KEY_TEMPLATE => 'n*';
54
55# Level separator in a sort key:
56# i.e. pack(KEY_TEMPLATE, 0)
57use constant LEVEL_SEP => "\0\0";
58
59# As Unicode code point separator for hash keys.
60# A joined code point string (denoted by JCPS below)
61# like "65;768" is used for internal processing
62# instead of Perl's Unicode string like "\x41\x{300}",
63# as the native code point is different from the Unicode code point
64# on EBCDIC platform.
65# This character must not be included in any stringified
66# representation of an integer.
67use constant CODE_SEP => ';';
68
69# boolean values of variable weights
70use constant NON_VAR => 0; # Non-Variable character
71use constant VAR     => 1; # Variable character
72
73# specific code points
74use constant Hangul_LBase  => 0x1100;
75use constant Hangul_LIni   => 0x1100;
76use constant Hangul_LFin   => 0x1159;
77use constant Hangul_LFill  => 0x115F;
78use constant Hangul_VBase  => 0x1161;
79use constant Hangul_VIni   => 0x1160; # from Vowel Filler
80use constant Hangul_VFin   => 0x11A2;
81use constant Hangul_TBase  => 0x11A7; # from "no-final" codepoint
82use constant Hangul_TIni   => 0x11A8;
83use constant Hangul_TFin   => 0x11F9;
84use constant Hangul_TCount => 28;
85use constant Hangul_NCount => 588;
86use constant Hangul_SBase  => 0xAC00;
87use constant Hangul_SIni   => 0xAC00;
88use constant Hangul_SFin   => 0xD7A3;
89use constant CJK_UidIni    => 0x4E00;
90use constant CJK_UidFin    => 0x9FA5;
91use constant CJK_UidF41    => 0x9FBB;
92use constant CJK_ExtAIni   => 0x3400;
93use constant CJK_ExtAFin   => 0x4DB5;
94use constant CJK_ExtBIni   => 0x20000;
95use constant CJK_ExtBFin   => 0x2A6D6;
96use constant BMP_Max       => 0xFFFF;
97
98# Logical_Order_Exception in PropList.txt
99my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
100
101sub UCA_Version { "14" }
102
103sub Base_Unicode_Version { "4.1.0" }
104
105######
106
107sub pack_U {
108    return pack('U*', @_);
109}
110
111sub unpack_U {
112    return unpack('U*', shift(@_).pack('U*'));
113}
114
115######
116
117my (%VariableOK);
118@VariableOK{ qw/
119    blanked  non-ignorable  shifted  shift-trimmed
120  / } = (); # keys lowercased
121
122our @ChangeOK = qw/
123    alternate backwards level normalization rearrange
124    katakana_before_hiragana upper_before_lower
125    overrideHangul overrideCJK preprocess UCA_Version
126    hangul_terminator variable
127  /;
128
129our @ChangeNG = qw/
130    entry mapping table maxlength
131    ignoreChar ignoreName undefChar undefName variableTable
132    versionTable alternateTable backwardsTable forwardsTable rearrangeTable
133    derivCode normCode rearrangeHash
134    backwardsFlag
135  /;
136# The hash key 'ignored' is deleted at v 0.21.
137# The hash key 'isShift' is deleted at v 0.23.
138# The hash key 'combining' is deleted at v 0.24.
139# The hash key 'entries' is deleted at v 0.30.
140# The hash key 'L3_ignorable' is deleted at v 0.40.
141
142sub version {
143    my $self = shift;
144    return $self->{versionTable} || 'unknown';
145}
146
147my (%ChangeOK, %ChangeNG);
148@ChangeOK{ @ChangeOK } = ();
149@ChangeNG{ @ChangeNG } = ();
150
151sub change {
152    my $self = shift;
153    my %hash = @_;
154    my %old;
155    if (exists $hash{variable} && exists $hash{alternate}) {
156	delete $hash{alternate};
157    }
158    elsif (!exists $hash{variable} && exists $hash{alternate}) {
159	$hash{variable} = $hash{alternate};
160    }
161    foreach my $k (keys %hash) {
162	if (exists $ChangeOK{$k}) {
163	    $old{$k} = $self->{$k};
164	    $self->{$k} = $hash{$k};
165	}
166	elsif (exists $ChangeNG{$k}) {
167	    croak "change of $k via change() is not allowed!";
168	}
169	# else => ignored
170    }
171    $self->checkCollator();
172    return wantarray ? %old : $self;
173}
174
175sub _checkLevel {
176    my $level = shift;
177    my $key   = shift; # 'level' or 'backwards'
178    MinLevel <= $level or croak sprintf
179	"Illegal level %d (in value for key '%s') lower than %d.",
180	    $level, $key, MinLevel;
181    $level <= MaxLevel or croak sprintf
182	"Unsupported level %d (in value for key '%s') higher than %d.",
183	    $level, $key, MaxLevel;
184}
185
186my %DerivCode = (
187    8 => \&_derivCE_8,
188    9 => \&_derivCE_9,
189   11 => \&_derivCE_9, # 11 == 9
190   14 => \&_derivCE_14,
191);
192
193sub checkCollator {
194    my $self = shift;
195    _checkLevel($self->{level}, "level");
196
197    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
198	or croak "Illegal UCA version (passed $self->{UCA_Version}).";
199
200    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
201				$self->{alternateTable} || 'shifted';
202    $self->{variable} = $self->{alternate} = lc($self->{variable});
203    exists $VariableOK{ $self->{variable} }
204	or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
205
206    if (! defined $self->{backwards}) {
207	$self->{backwardsFlag} = 0;
208    }
209    elsif (! ref $self->{backwards}) {
210	_checkLevel($self->{backwards}, "backwards");
211	$self->{backwardsFlag} = 1 << $self->{backwards};
212    }
213    else {
214	my %level;
215	$self->{backwardsFlag} = 0;
216	for my $b (@{ $self->{backwards} }) {
217	    _checkLevel($b, "backwards");
218	    $level{$b} = 1;
219	}
220	for my $v (sort keys %level) {
221	    $self->{backwardsFlag} += 1 << $v;
222	}
223    }
224
225    defined $self->{rearrange} or $self->{rearrange} = [];
226    ref $self->{rearrange}
227	or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
228
229    # keys of $self->{rearrangeHash} are $self->{rearrange}.
230    $self->{rearrangeHash} = undef;
231
232    if (@{ $self->{rearrange} }) {
233	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
234    }
235
236    $self->{normCode} = undef;
237
238    if (defined $self->{normalization}) {
239	eval { require Unicode::Normalize };
240	$@ and croak "Unicode::Normalize is required to normalize strings";
241
242	$CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
243
244	if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
245	    $self->{normCode} = \&Unicode::Normalize::NFD;
246	}
247	elsif ($self->{normalization} ne 'prenormalized') {
248	    my $norm = $self->{normalization};
249	    $self->{normCode} = sub {
250		Unicode::Normalize::normalize($norm, shift);
251	    };
252	    eval { $self->{normCode}->("") }; # try
253	    $@ and croak "$PACKAGE unknown normalization form name: $norm";
254	}
255    }
256    return;
257}
258
259sub new
260{
261    my $class = shift;
262    my $self = bless { @_ }, $class;
263
264    # If undef is passed explicitly, no file is read.
265    $self->{table} = $KeyFile if ! exists $self->{table};
266    $self->read_table() if defined $self->{table};
267
268    if ($self->{entry}) {
269	while ($self->{entry} =~ /([^\n]+)/g) {
270	    $self->parseEntry($1);
271	}
272    }
273
274    $self->{level} ||= MaxLevel;
275    $self->{UCA_Version} ||= UCA_Version();
276
277    $self->{overrideHangul} = FALSE
278	if ! exists $self->{overrideHangul};
279    $self->{overrideCJK} = FALSE
280	if ! exists $self->{overrideCJK};
281    $self->{normalization} = 'NFD'
282	if ! exists $self->{normalization};
283    $self->{rearrange} = $self->{rearrangeTable} ||
284	($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
285	if ! exists $self->{rearrange};
286    $self->{backwards} = $self->{backwardsTable}
287	if ! exists $self->{backwards};
288
289    $self->checkCollator();
290
291    return $self;
292}
293
294sub read_table {
295    my $self = shift;
296
297    my($f, $fh);
298    foreach my $d (@INC) {
299	$f = File::Spec->catfile($d, @Path, $self->{table});
300	last if open($fh, $f);
301	$f = undef;
302    }
303    if (!defined $f) {
304	$f = File::Spec->catfile(@Path, $self->{table});
305	croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
306    }
307
308    while (my $line = <$fh>) {
309	next if $line =~ /^\s*#/;
310	unless ($line =~ s/^\s*\@//) {
311	    $self->parseEntry($line);
312	    next;
313	}
314
315	# matched ^\s*\@
316	if ($line =~ /^version\s*(\S*)/) {
317	    $self->{versionTable} ||= $1;
318	}
319	elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
320	    $self->{variableTable} ||= $1;
321	}
322	elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
323	    $self->{alternateTable} ||= $1;
324	}
325	elsif ($line =~ /^backwards\s+(\S*)/) {
326	    push @{ $self->{backwardsTable} }, $1;
327	}
328	elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
329	    push @{ $self->{forwardsTable} }, $1;
330	}
331	elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
332	    push @{ $self->{rearrangeTable} }, _getHexArray($1);
333	}
334    }
335    close $fh;
336}
337
338
339##
340## get $line, parse it, and write an entry in $self
341##
342sub parseEntry
343{
344    my $self = shift;
345    my $line = shift;
346    my($name, $entry, @uv, @key);
347
348    return if $line !~ /^\s*[0-9A-Fa-f]/;
349
350    # removes comment and gets name
351    $name = $1
352	if $line =~ s/[#%]\s*(.*)//;
353    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
354
355    # gets element
356    my($e, $k) = split /;/, $line;
357    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
358	if ! $k;
359
360    @uv = _getHexArray($e);
361    return if !@uv;
362
363    $entry = join(CODE_SEP, @uv); # in JCPS
364
365    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
366	my $ele = pack_U(@uv);
367
368	# regarded as if it were not entried in the table
369	return
370	    if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
371
372	# replaced as completely ignorable
373	$k = '[.0000.0000.0000.0000]'
374	    if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
375    }
376
377    # replaced as completely ignorable
378    $k = '[.0000.0000.0000.0000]'
379	if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
380
381    my $is_L3_ignorable = TRUE;
382
383    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
384	my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
385	my @wt = _getHexArray($arr);
386	push @key, pack(VCE_TEMPLATE, $var, @wt);
387	$is_L3_ignorable = FALSE
388	    if $wt[0] || $wt[1] || $wt[2];
389	# Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
390	# is completely ignorable.
391	# For expansion, an entry $is_L3_ignorable
392	# if and only if "all" CEs are [.0000.0000.0000].
393    }
394
395    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
396
397    if (@uv > 1) {
398	(!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
399	    and $self->{maxlength}{$uv[0]} = @uv;
400    }
401}
402
403
404##
405## VCE = _varCE(variable term, VCE)
406##
407sub _varCE
408{
409    my $vbl = shift;
410    my $vce = shift;
411    if ($vbl eq 'non-ignorable') {
412	return $vce;
413    }
414    my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
415
416    if ($var) {
417	return pack(VCE_TEMPLATE, $var, 0, 0, 0,
418		$vbl eq 'blanked' ? $wt[3] : $wt[0]);
419    }
420    elsif ($vbl eq 'blanked') {
421	return $vce;
422    }
423    else {
424	return pack(VCE_TEMPLATE, $var, @wt[0..2],
425	    $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
426    }
427}
428
429sub viewSortKey
430{
431    my $self = shift;
432    $self->visualizeSortKey($self->getSortKey(@_));
433}
434
435sub visualizeSortKey
436{
437    my $self = shift;
438    my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
439
440    if ($self->{UCA_Version} <= 8) {
441	$view =~ s/ ?0000 ?/|/g;
442    } else {
443	$view =~ s/\b0000\b/|/g;
444    }
445    return "[$view]";
446}
447
448
449##
450## arrayref of JCPS   = splitEnt(string to be collated)
451## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
452##
453sub splitEnt
454{
455    my $self = shift;
456    my $wLen = $_[1];
457
458    my $code = $self->{preprocess};
459    my $norm = $self->{normCode};
460    my $map  = $self->{mapping};
461    my $max  = $self->{maxlength};
462    my $reH  = $self->{rearrangeHash};
463    my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
464
465    my ($str, @buf);
466
467    if ($wLen) {
468	$code and croak "Preprocess breaks character positions. "
469			. "Don't use with index(), match(), etc.";
470	$norm and croak "Normalization breaks character positions. "
471			. "Don't use with index(), match(), etc.";
472	$str = $_[0];
473    }
474    else {
475	$str = $_[0];
476	$str = &$code($str) if ref $code;
477	$str = &$norm($str) if ref $norm;
478    }
479
480    # get array of Unicode code point of string.
481    my @src = unpack_U($str);
482
483    # rearrangement:
484    # Character positions are not kept if rearranged,
485    # then neglected if $wLen is true.
486    if ($reH && ! $wLen) {
487	for (my $i = 0; $i < @src; $i++) {
488	    if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
489		($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
490		$i++;
491	    }
492	}
493    }
494
495    # remove a code point marked as a completely ignorable.
496    for (my $i = 0; $i < @src; $i++) {
497	$src[$i] = undef
498	    if _isIllegal($src[$i]) || ($ver9 &&
499		$map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
500    }
501
502    for (my $i = 0; $i < @src; $i++) {
503	my $jcps = $src[$i];
504
505	# skip removed code point
506	if (! defined $jcps) {
507	    if ($wLen && @buf) {
508		$buf[-1][2] = $i + 1;
509	    }
510	    next;
511	}
512
513	my $i_orig = $i;
514
515	# find contraction
516	if ($max->{$jcps}) {
517	    my $temp_jcps = $jcps;
518	    my $jcpsLen = 1;
519	    my $maxLen = $max->{$jcps};
520
521	    for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
522		next if ! defined $src[$p];
523		$temp_jcps .= CODE_SEP . $src[$p];
524		$jcpsLen++;
525		if ($map->{$temp_jcps}) {
526		    $jcps = $temp_jcps;
527		    $i = $p;
528		}
529	    }
530
531	# not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
532	# This process requires Unicode::Normalize.
533	# If "normalization" is undef, here should be skipped *always*
534	# (in spite of bool value of $CVgetCombinClass),
535	# since canonical ordering cannot be expected.
536	# Blocked combining character should not be contracted.
537
538	    if ($self->{normalization})
539	    # $self->{normCode} is false in the case of "prenormalized".
540	    {
541		my $preCC = 0;
542		my $curCC = 0;
543
544		for (my $p = $i + 1; $p < @src; $p++) {
545		    next if ! defined $src[$p];
546		    $curCC = $CVgetCombinClass->($src[$p]);
547		    last unless $curCC;
548		    my $tail = CODE_SEP . $src[$p];
549		    if ($preCC != $curCC && $map->{$jcps.$tail}) {
550			$jcps .= $tail;
551			$src[$p] = undef;
552		    } else {
553			$preCC = $curCC;
554		    }
555		}
556	    }
557	}
558
559	# skip completely ignorable
560	if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
561	    if ($wLen && @buf) {
562		$buf[-1][2] = $i + 1;
563	    }
564	    next;
565	}
566
567	push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
568    }
569    return \@buf;
570}
571
572
573##
574## list of VCE = getWt(JCPS)
575##
576sub getWt
577{
578    my $self = shift;
579    my $u    = shift;
580    my $vbl  = $self->{variable};
581    my $map  = $self->{mapping};
582    my $der  = $self->{derivCode};
583
584    return if !defined $u;
585    return map(_varCE($vbl, $_), @{ $map->{$u} })
586	if $map->{$u};
587
588    # JCPS must not be a contraction, then it's a code point.
589    if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
590	my $hang = $self->{overrideHangul};
591	my @hangulCE;
592	if ($hang) {
593	    @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
594	}
595	elsif (!defined $hang) {
596	    @hangulCE = $der->($u);
597	}
598	else {
599	    my $max  = $self->{maxlength};
600	    my @decH = _decompHangul($u);
601
602	    if (@decH == 2) {
603		my $contract = join(CODE_SEP, @decH);
604		@decH = ($contract) if $map->{$contract};
605	    } else { # must be <@decH == 3>
606		if ($max->{$decH[0]}) {
607		    my $contract = join(CODE_SEP, @decH);
608		    if ($map->{$contract}) {
609			@decH = ($contract);
610		    } else {
611			$contract = join(CODE_SEP, @decH[0,1]);
612			$map->{$contract} and @decH = ($contract, $decH[2]);
613		    }
614		    # even if V's ignorable, LT contraction is not supported.
615		    # If such a situatution were required, NFD should be used.
616		}
617		if (@decH == 3 && $max->{$decH[1]}) {
618		    my $contract = join(CODE_SEP, @decH[1,2]);
619		    $map->{$contract} and @decH = ($decH[0], $contract);
620		}
621	    }
622
623	    @hangulCE = map({
624		    $map->{$_} ? @{ $map->{$_} } : $der->($_);
625		} @decH);
626	}
627	return map _varCE($vbl, $_), @hangulCE;
628    }
629    elsif (_isUIdeo($u, $self->{UCA_Version})) {
630	my $cjk  = $self->{overrideCJK};
631	return map _varCE($vbl, $_),
632	    $cjk
633		? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
634		: defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
635		    ? _uideoCE_8($u)
636		    : $der->($u);
637    }
638    else {
639	return map _varCE($vbl, $_), $der->($u);
640    }
641}
642
643
644##
645## string sortkey = getSortKey(string arg)
646##
647sub getSortKey
648{
649    my $self = shift;
650    my $lev  = $self->{level};
651    my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
652    my $v2i  = $self->{UCA_Version} >= 9 &&
653		$self->{variable} ne 'non-ignorable';
654
655    my @buf; # weight arrays
656    if ($self->{hangul_terminator}) {
657	my $preHST = '';
658	foreach my $jcps (@$rEnt) {
659	    # weird things like VL, TL-contraction are not considered!
660	    my $curHST = '';
661	    foreach my $u (split /;/, $jcps) {
662		$curHST .= getHST($u);
663	    }
664	    if ($preHST && !$curHST || # hangul before non-hangul
665		$preHST =~ /L\z/ && $curHST =~ /^T/ ||
666		$preHST =~ /V\z/ && $curHST =~ /^L/ ||
667		$preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
668
669		push @buf, $self->getWtHangulTerm();
670	    }
671	    $preHST = $curHST;
672
673	    push @buf, $self->getWt($jcps);
674	}
675	$preHST # end at hangul
676	    and push @buf, $self->getWtHangulTerm();
677    }
678    else {
679	foreach my $jcps (@$rEnt) {
680	    push @buf, $self->getWt($jcps);
681	}
682    }
683
684    # make sort key
685    my @ret = ([],[],[],[]);
686    my $last_is_variable;
687
688    foreach my $vwt (@buf) {
689	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
690
691	# "Ignorable (L1, L2) after Variable" since track. v. 9
692	if ($v2i) {
693	    if ($var) {
694		$last_is_variable = TRUE;
695	    }
696	    elsif (!$wt[0]) { # ignorable
697		next if $last_is_variable;
698	    }
699	    else {
700		$last_is_variable = FALSE;
701	    }
702	}
703	foreach my $v (0..$lev-1) {
704	    0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
705	}
706    }
707
708    # modification of tertiary weights
709    if ($self->{upper_before_lower}) {
710	foreach my $w (@{ $ret[2] }) {
711	    if    (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
712	    elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
713	    elsif ($w == 0x1C)             { $w += 1 } # square upper
714	    elsif ($w == 0x1D)             { $w -= 1 } # square lower
715	}
716    }
717    if ($self->{katakana_before_hiragana}) {
718	foreach my $w (@{ $ret[2] }) {
719	    if    (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
720	    elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
721	}
722    }
723
724    if ($self->{backwardsFlag}) {
725	for (my $v = MinLevel; $v <= MaxLevel; $v++) {
726	    if ($self->{backwardsFlag} & (1 << $v)) {
727		@{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
728	    }
729	}
730    }
731
732    join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
733}
734
735
736##
737## int compare = cmp(string a, string b)
738##
739sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
740sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
741sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
742sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
743sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
744sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
745sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
746
747##
748## list[strings] sorted = sort(list[strings] arg)
749##
750sub sort {
751    my $obj = shift;
752    return
753	map { $_->[1] }
754	    sort{ $a->[0] cmp $b->[0] }
755		map [ $obj->getSortKey($_), $_ ], @_;
756}
757
758
759sub _derivCE_14 {
760    my $u = shift;
761    my $base =
762	(CJK_UidIni  <= $u && $u <= CJK_UidF41)
763	    ? 0xFB40 : # CJK
764	(CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
765	 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
766	    ? 0xFB80   # CJK ext.
767	    : 0xFBC0;  # others
768
769    my $aaaa = $base + ($u >> 15);
770    my $bbbb = ($u & 0x7FFF) | 0x8000;
771    return
772	pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
773	pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
774}
775
776sub _derivCE_9 {
777    my $u = shift;
778    my $base =
779	(CJK_UidIni  <= $u && $u <= CJK_UidFin)
780	    ? 0xFB40 : # CJK
781	(CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
782	 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
783	    ? 0xFB80   # CJK ext.
784	    : 0xFBC0;  # others
785
786    my $aaaa = $base + ($u >> 15);
787    my $bbbb = ($u & 0x7FFF) | 0x8000;
788    return
789	pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
790	pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
791}
792
793sub _derivCE_8 {
794    my $code = shift;
795    my $aaaa =  0xFF80 + ($code >> 15);
796    my $bbbb = ($code & 0x7FFF) | 0x8000;
797    return
798	pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
799	pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
800}
801
802sub _uideoCE_8 {
803    my $u = shift;
804    return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
805}
806
807sub _isUIdeo {
808    my ($u, $uca_vers) = @_;
809    return(
810	(CJK_UidIni <= $u &&
811	    ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin)))
812		||
813	(CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
814		||
815	(CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
816    );
817}
818
819
820sub getWtHangulTerm {
821    my $self = shift;
822    return _varCE($self->{variable},
823	pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
824}
825
826
827##
828## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
829##
830sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
831
832#
833# $code *must* be in Hangul syllable.
834# Check it before you enter here.
835#
836sub _decompHangul {
837    my $code = shift;
838    my $si = $code - Hangul_SBase;
839    my $li = int( $si / Hangul_NCount);
840    my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
841    my $ti =      $si % Hangul_TCount;
842    return (
843	Hangul_LBase + $li,
844	Hangul_VBase + $vi,
845	$ti ? (Hangul_TBase + $ti) : (),
846    );
847}
848
849sub _isIllegal {
850    my $code = shift;
851    return ! defined $code                      # removed
852	|| ($code < 0 || 0x10FFFF < $code)      # out of range
853	|| (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF] (cf. utf8.c)
854	|| (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
855	|| (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
856    ;
857}
858
859# Hangul Syllable Type
860sub getHST {
861    my $u = shift;
862    return
863	Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
864	Hangul_VIni <= $u && $u <= Hangul_VFin	     ? "V" :
865	Hangul_TIni <= $u && $u <= Hangul_TFin	     ? "T" :
866	Hangul_SIni <= $u && $u <= Hangul_SFin ?
867	    ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
868}
869
870
871##
872## bool _nonIgnorAtLevel(arrayref weights, int level)
873##
874sub _nonIgnorAtLevel($$)
875{
876    my $wt = shift;
877    return if ! defined $wt;
878    my $lv = shift;
879    return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
880}
881
882##
883## bool _eqArray(
884##    arrayref of arrayref[weights] source,
885##    arrayref of arrayref[weights] substr,
886##    int level)
887## * comparison of graphemes vs graphemes.
888##   @$source >= @$substr must be true (check it before call this);
889##
890sub _eqArray($$$)
891{
892    my $source = shift;
893    my $substr = shift;
894    my $lev = shift;
895
896    for my $g (0..@$substr-1){
897	# Do the $g'th graphemes have the same number of AV weigths?
898	return if @{ $source->[$g] } != @{ $substr->[$g] };
899
900	for my $w (0..@{ $substr->[$g] }-1) {
901	    for my $v (0..$lev-1) {
902		return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
903	    }
904	}
905    }
906    return 1;
907}
908
909##
910## (int position, int length)
911## int position = index(string, substring, position, [undoc'ed grobal])
912##
913## With "grobal" (only for the list context),
914##  returns list of arrayref[position, length].
915##
916sub index
917{
918    my $self = shift;
919    my $str  = shift;
920    my $len  = length($str);
921    my $subE = $self->splitEnt(shift);
922    my $pos  = @_ ? shift : 0;
923       $pos  = 0 if $pos < 0;
924    my $grob = shift;
925
926    my $lev  = $self->{level};
927    my $v2i  = $self->{UCA_Version} >= 9 &&
928		$self->{variable} ne 'non-ignorable';
929
930    if (! @$subE) {
931	my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
932	return $grob
933	    ? map([$_, 0], $temp..$len)
934	    : wantarray ? ($temp,0) : $temp;
935    }
936    $len < $pos
937	and return wantarray ? () : NOMATCHPOS;
938    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
939    @$strE
940	or return wantarray ? () : NOMATCHPOS;
941
942    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
943
944    my $last_is_variable;
945    for my $vwt (map $self->getWt($_), @$subE) {
946	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
947	my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
948
949	# "Ignorable (L1, L2) after Variable" since track. v. 9
950	if ($v2i) {
951	    if ($var) {
952		$last_is_variable = TRUE;
953	    }
954	    elsif (!$wt[0]) { # ignorable
955		$to_be_pushed = FALSE if $last_is_variable;
956	    }
957	    else {
958		$last_is_variable = FALSE;
959	    }
960	}
961
962	if (@subWt && !$var && !$wt[0]) {
963	    push @{ $subWt[-1] }, \@wt if $to_be_pushed;
964	} else {
965	    push @subWt, [ \@wt ];
966	}
967    }
968
969    my $count = 0;
970    my $end = @$strE - 1;
971
972    $last_is_variable = FALSE; # reuse
973    for (my $i = 0; $i <= $end; ) { # no $i++
974	my $found_base = 0;
975
976	# fetch a grapheme
977	while ($i <= $end && $found_base == 0) {
978	    for my $vwt ($self->getWt($strE->[$i][0])) {
979		my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
980		my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
981
982		# "Ignorable (L1, L2) after Variable" since track. v. 9
983		if ($v2i) {
984		    if ($var) {
985			$last_is_variable = TRUE;
986		    }
987		    elsif (!$wt[0]) { # ignorable
988			$to_be_pushed = FALSE if $last_is_variable;
989		    }
990		    else {
991			$last_is_variable = FALSE;
992		    }
993		}
994
995		if (@strWt && !$var && !$wt[0]) {
996		    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
997		    $finPos[-1] = $strE->[$i][2];
998		} elsif ($to_be_pushed) {
999		    push @strWt, [ \@wt ];
1000		    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
1001		    $finPos[-1] = NOMATCHPOS if $found_base;
1002		    push @finPos, $strE->[$i][2];
1003		    $found_base++;
1004		}
1005		# else ===> no-op
1006	    }
1007	    $i++;
1008	}
1009
1010	# try to match
1011	while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1012	    if ($iniPos[0] != NOMATCHPOS &&
1013		    $finPos[$#subWt] != NOMATCHPOS &&
1014			_eqArray(\@strWt, \@subWt, $lev)) {
1015		my $temp = $iniPos[0] + $pos;
1016
1017		if ($grob) {
1018		    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1019		    splice @strWt,  0, $#subWt;
1020		    splice @iniPos, 0, $#subWt;
1021		    splice @finPos, 0, $#subWt;
1022		}
1023		else {
1024		    return wantarray
1025			? ($temp, $finPos[$#subWt] - $iniPos[0])
1026			:  $temp;
1027		}
1028	    }
1029	    shift @strWt;
1030	    shift @iniPos;
1031	    shift @finPos;
1032	}
1033    }
1034
1035    return $grob
1036	? @g_ret
1037	: wantarray ? () : NOMATCHPOS;
1038}
1039
1040##
1041## scalarref to matching part = match(string, substring)
1042##
1043sub match
1044{
1045    my $self = shift;
1046    if (my($pos,$len) = $self->index($_[0], $_[1])) {
1047	my $temp = substr($_[0], $pos, $len);
1048	return wantarray ? $temp : \$temp;
1049	# An lvalue ref \substr should be avoided,
1050	# since its value is affected by modification of its referent.
1051    }
1052    else {
1053	return;
1054    }
1055}
1056
1057##
1058## arrayref matching parts = gmatch(string, substring)
1059##
1060sub gmatch
1061{
1062    my $self = shift;
1063    my $str  = shift;
1064    my $sub  = shift;
1065    return map substr($str, $_->[0], $_->[1]),
1066		$self->index($str, $sub, 0, 'g');
1067}
1068
1069##
1070## bool subst'ed = subst(string, substring, replace)
1071##
1072sub subst
1073{
1074    my $self = shift;
1075    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1076
1077    if (my($pos,$len) = $self->index($_[0], $_[1])) {
1078	if ($code) {
1079	    my $mat = substr($_[0], $pos, $len);
1080	    substr($_[0], $pos, $len, $code->($mat));
1081	} else {
1082	    substr($_[0], $pos, $len, $_[2]);
1083	}
1084	return TRUE;
1085    }
1086    else {
1087	return FALSE;
1088    }
1089}
1090
1091##
1092## int count = gsubst(string, substring, replace)
1093##
1094sub gsubst
1095{
1096    my $self = shift;
1097    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1098    my $cnt = 0;
1099
1100    # Replacement is carried out from the end, then use reverse.
1101    for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1102	if ($code) {
1103	    my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1104	    substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1105	} else {
1106	    substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1107	}
1108	$cnt++;
1109    }
1110    return $cnt;
1111}
1112
11131;
1114__END__
1115
1116=head1 NAME
1117
1118Unicode::Collate - Unicode Collation Algorithm
1119
1120=head1 SYNOPSIS
1121
1122  use Unicode::Collate;
1123
1124  #construct
1125  $Collator = Unicode::Collate->new(%tailoring);
1126
1127  #sort
1128  @sorted = $Collator->sort(@not_sorted);
1129
1130  #compare
1131  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1132
1133  # If %tailoring is false (i.e. empty),
1134  # $Collator should do the default collation.
1135
1136=head1 DESCRIPTION
1137
1138This module is an implementation of Unicode Technical Standard #10
1139(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1140
1141=head2 Constructor and Tailoring
1142
1143The C<new> method returns a collator object.
1144
1145   $Collator = Unicode::Collate->new(
1146      UCA_Version => $UCA_Version,
1147      alternate => $alternate, # deprecated: use of 'variable' is recommended.
1148      backwards => $levelNumber, # or \@levelNumbers
1149      entry => $element,
1150      hangul_terminator => $term_primary_weight,
1151      ignoreName => qr/$ignoreName/,
1152      ignoreChar => qr/$ignoreChar/,
1153      katakana_before_hiragana => $bool,
1154      level => $collationLevel,
1155      normalization  => $normalization_form,
1156      overrideCJK => \&overrideCJK,
1157      overrideHangul => \&overrideHangul,
1158      preprocess => \&preprocess,
1159      rearrange => \@charList,
1160      table => $filename,
1161      undefName => qr/$undefName/,
1162      undefChar => qr/$undefChar/,
1163      upper_before_lower => $bool,
1164      variable => $variable,
1165   );
1166
1167=over 4
1168
1169=item UCA_Version
1170
1171If the tracking version number of UCA is given,
1172behavior of that tracking version is emulated on collating.
1173If omitted, the return value of C<UCA_Version()> is used.
1174C<UCA_Version()> should return the latest tracking version supported.
1175
1176The supported tracking version: 8, 9, 11, or 14.
1177
1178     UCA       Unicode Standard         DUCET (@version)
1179     ---------------------------------------------------
1180      8              3.1                3.0.1 (3.0.1d9)
1181      9     3.1 with Corrigendum 3      3.1.1 (3.1.1)
1182     11              4.0                4.0.0 (4.0.0)
1183     14             4.1.0               4.1.0 (4.1.0)
1184
1185Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1186
1187=item alternate
1188
1189-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1190
1191For backward compatibility, C<alternate> (old name) can be used
1192as an alias for C<variable>.
1193
1194=item backwards
1195
1196-- see 3.1.2 French Accents, UTS #10.
1197
1198     backwards => $levelNumber or \@levelNumbers
1199
1200Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1201If omitted, forwards at all the levels.
1202
1203=item entry
1204
1205-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1206
1207If the same character (or a sequence of characters) exists
1208in the collation element table through C<table>,
1209mapping to collation elements is overrided.
1210If it does not exist, the mapping is defined additionally.
1211
1212    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
12130063 0068 ; [.0E6A.0020.0002.0063] # ch
12140043 0068 ; [.0E6A.0020.0007.0043] # Ch
12150043 0048 ; [.0E6A.0020.0008.0043] # CH
1216006C 006C ; [.0F4C.0020.0002.006C] # ll
1217004C 006C ; [.0F4C.0020.0007.004C] # Ll
1218004C 004C ; [.0F4C.0020.0008.004C] # LL
121900F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1220006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
122100D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1222004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1223ENTRY
1224
1225    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
122600E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
122700C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1228ENTRY
1229
1230B<NOTE:> The code point in the UCA file format (before C<';'>)
1231B<must> be a Unicode code point (defined as hexadecimal),
1232but not a native code point.
1233So C<0063> must always denote C<U+0063>,
1234but not a character of C<"\x63">.
1235
1236Weighting may vary depending on collation element table.
1237So ensure the weights defined in C<entry> will be consistent with
1238those in the collation element table loaded via C<table>.
1239
1240In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1241and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1242(as a value between C<0E60> and C<0E6D>)
1243makes ordering as C<C E<lt> CH E<lt> D>.
1244Exactly speaking DUCET already has some characters between C<C> and C<D>:
1245C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1246C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1247and C<c-curl> (C<U+0255>) with C<0E69>.
1248Then primary weight C<0E6A> for C<CH> makes C<CH>
1249ordered between C<c-curl> and C<D>.
1250
1251=item hangul_terminator
1252
1253-- see 7.1.4 Trailing Weights, UTS #10.
1254
1255If a true value is given (non-zero but should be positive),
1256it will be added as a terminator primary weight to the end of
1257every standard Hangul syllable. Secondary and any higher weights
1258for terminator are set to zero.
1259If the value is false or C<hangul_terminator> key does not exist,
1260insertion of terminator weights will not be performed.
1261
1262Boundaries of Hangul syllables are determined
1263according to conjoining Jamo behavior in F<the Unicode Standard>
1264and F<HangulSyllableType.txt>.
1265
1266B<Implementation Note:>
1267(1) For expansion mapping (Unicode character mapped
1268to a sequence of collation elements), a terminator will not be added
1269between collation elements, even if Hangul syllable boundary exists there.
1270Addition of terminator is restricted to the next position
1271to the last collation element.
1272
1273(2) Non-conjoining Hangul letters
1274(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1275automatically terminated with a terminator primary weight.
1276These characters may need terminator included in a collation element
1277table beforehand.
1278
1279=item ignoreChar
1280
1281=item ignoreName
1282
1283-- see 3.2.2 Variable Weighting, UTS #10.
1284
1285Makes the entry in the table completely ignorable;
1286i.e. as if the weights were zero at all level.
1287
1288Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1289will be ignored. Through C<ignoreName>, any character whose name
1290(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1291will be ignored.
1292
1293E.g. when 'a' and 'e' are ignorable,
1294'element' is equal to 'lament' (or 'lmnt').
1295
1296=item katakana_before_hiragana
1297
1298-- see 7.3.1 Tertiary Weight Table, UTS #10.
1299
1300By default, hiragana is before katakana.
1301If the parameter is made true, this is reversed.
1302
1303B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1304distinctions must occur in level 3, and their weights at level 3 must be
1305same as those mentioned in 7.3.1, UTS #10.
1306If you define your collation elements which violate this requirement,
1307this parameter does not work validly.
1308
1309=item level
1310
1311-- see 4.3 Form Sort Key, UTS #10.
1312
1313Set the maximum level.
1314Any higher levels than the specified one are ignored.
1315
1316  Level 1: alphabetic ordering
1317  Level 2: diacritic ordering
1318  Level 3: case ordering
1319  Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1320
1321  ex.level => 2,
1322
1323If omitted, the maximum is the 4th.
1324
1325=item normalization
1326
1327-- see 4.1 Normalize, UTS #10.
1328
1329If specified, strings are normalized before preparation of sort keys
1330(the normalization is executed after preprocess).
1331
1332A form name C<Unicode::Normalize::normalize()> accepts will be applied
1333as C<$normalization_form>.
1334Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1335See C<Unicode::Normalize::normalize()> for detail.
1336If omitted, C<'NFD'> is used.
1337
1338C<normalization> is performed after C<preprocess> (if defined).
1339
1340Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1341though they are not concerned with C<Unicode::Normalize::normalize()>.
1342
1343If C<undef> (not a string C<"undef">) is passed explicitly
1344as the value for this key,
1345any normalization is not carried out (this may make tailoring easier
1346if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1347only contiguous contractions are resolved;
1348e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1349C<A-cedilla-ring> would be primary equal to C<A>.
1350In this point,
1351C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1352B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1353
1354In the case of C<(normalization =E<gt> "prenormalized")>,
1355any normalization is not performed, but
1356non-contiguous contractions with combining characters are performed.
1357Therefore
1358C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1359B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1360If source strings are finely prenormalized,
1361C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1362
1363Except C<(normalization =E<gt> undef)>,
1364B<Unicode::Normalize> is required (see also B<CAVEAT>).
1365
1366=item overrideCJK
1367
1368-- see 7.1 Derived Collation Elements, UTS #10.
1369
1370By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1371but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is
1372C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>)
1373are lesser than C<CJK Unified Ideographs Extension> (its range is
1374C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
1375
1376Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1377
1378ex. CJK Unified Ideographs in the JIS code point order.
1379
1380  overrideCJK => sub {
1381      my $u = shift;             # get a Unicode codepoint
1382      my $b = pack('n', $u);     # to UTF-16BE
1383      my $s = your_unicode_to_sjis_converter($b); # convert
1384      my $n = unpack('n', $s);   # convert sjis to short
1385      [ $n, 0x20, 0x2, $u ];     # return the collation element
1386  },
1387
1388ex. ignores all CJK Unified Ideographs.
1389
1390  overrideCJK => sub {()}, # CODEREF returning empty list
1391
1392   # where ->eq("Pe\x{4E00}rl", "Perl") is true
1393   # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1394
1395If C<undef> is passed explicitly as the value for this key,
1396weights for CJK Unified Ideographs are treated as undefined.
1397But assignment of weight for CJK Unified Ideographs
1398in table or C<entry> is still valid.
1399
1400=item overrideHangul
1401
1402-- see 7.1 Derived Collation Elements, UTS #10.
1403
1404By default, Hangul Syllables are decomposed into Hangul Jamo,
1405even if C<(normalization =E<gt> undef)>.
1406But the mapping of Hangul Syllables may be overrided.
1407
1408This parameter works like C<overrideCJK>, so see there for examples.
1409
1410If you want to override the mapping of Hangul Syllables,
1411NFD, NFKD, and FCD are not appropriate,
1412since they will decompose Hangul Syllables before overriding.
1413
1414If C<undef> is passed explicitly as the value for this key,
1415weight for Hangul Syllables is treated as undefined
1416without decomposition into Hangul Jamo.
1417But definition of weight for Hangul Syllables
1418in table or C<entry> is still valid.
1419
1420=item preprocess
1421
1422-- see 5.1 Preprocessing, UTS #10.
1423
1424If specified, the coderef is used to preprocess
1425before the formation of sort keys.
1426
1427ex. dropping English articles, such as "a" or "the".
1428Then, "the pen" is before "a pencil".
1429
1430     preprocess => sub {
1431           my $str = shift;
1432           $str =~ s/\b(?:an?|the)\s+//gi;
1433           return $str;
1434        },
1435
1436C<preprocess> is performed before C<normalization> (if defined).
1437
1438=item rearrange
1439
1440-- see 3.1.3 Rearrangement, UTS #10.
1441
1442Characters that are not coded in logical order and to be rearranged.
1443If C<UCA_Version> is equal to or lesser than 11, default is:
1444
1445    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1446
1447If you want to disallow any rearrangement, pass C<undef> or C<[]>
1448(a reference to empty list) as the value for this key.
1449
1450If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement).
1451
1452B<According to the version 9 of UCA, this parameter shall not be used;
1453but it is not warned at present.>
1454
1455=item table
1456
1457-- see 3.2 Default Unicode Collation Element Table, UTS #10.
1458
1459You can use another collation element table if desired.
1460
1461The table file should locate in the F<Unicode/Collate> directory
1462on C<@INC>. Say, if the filename is F<Foo.txt>,
1463the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1464
1465By default, F<allkeys.txt> (as the filename of DUCET) is used.
1466If you will prepare your own table file, any name other than F<allkeys.txt>
1467may be better to avoid namespace conflict.
1468
1469If C<undef> is passed explicitly as the value for this key,
1470no file is read (but you can define collation elements via C<entry>).
1471
1472A typical way to define a collation element table
1473without any file of table:
1474
1475   $onlyABC = Unicode::Collate->new(
1476       table => undef,
1477       entry => << 'ENTRIES',
14780061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
14790041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
14800062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
14810042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
14820063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
14830043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1484ENTRIES
1485    );
1486
1487If C<ignoreName> or C<undefName> is used, character names should be
1488specified as a comment (following C<#>) on each line.
1489
1490=item undefChar
1491
1492=item undefName
1493
1494-- see 6.3.4 Reducing the Repertoire, UTS #10.
1495
1496Undefines the collation element as if it were unassigned in the table.
1497This reduces the size of the table.
1498If an unassigned character appears in the string to be collated,
1499the sort key is made from its codepoint
1500as a single-character collation element,
1501as it is greater than any other assigned collation elements
1502(in the codepoint order among the unassigned characters).
1503But, it'd be better to ignore characters
1504unfamiliar to you and maybe never used.
1505
1506Through C<undefChar>, any character matching C<qr/$undefChar/>
1507will be undefined. Through C<undefName>, any character whose name
1508(given in the C<table> file as a comment) matches C<qr/$undefName/>
1509will be undefined.
1510
1511ex. Collation weights for beyond-BMP characters are not stored in object:
1512
1513    undefChar => qr/[^\0-\x{fffd}]/,
1514
1515=item upper_before_lower
1516
1517-- see 6.6 Case Comparisons, UTS #10.
1518
1519By default, lowercase is before uppercase.
1520If the parameter is made true, this is reversed.
1521
1522B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1523distinctions must occur in level 3, and their weights at level 3 must be
1524same as those mentioned in 7.3.1, UTS #10.
1525If you define your collation elements which differs from this requirement,
1526this parameter doesn't work validly.
1527
1528=item variable
1529
1530-- see 3.2.2 Variable Weighting, UTS #10.
1531
1532This key allows to variable weighting for variable collation elements,
1533which are marked with an ASTERISK in the table
1534(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1535
1536   variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1537
1538These names are case-insensitive.
1539By default (if specification is omitted), 'shifted' is adopted.
1540
1541   'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1542                    considered at the 4th level.
1543
1544   'Non-Ignorable'  Variable elements are not reset to ignorable.
1545
1546   'Shifted'        Variable elements are made ignorable at levels 1 through 3
1547                    their level 4 weight is replaced by the old level 1 weight.
1548                    Level 4 weight for Non-Variable elements is 0xFFFF.
1549
1550   'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1551                    are trimmed.
1552
1553=back
1554
1555=head2 Methods for Collation
1556
1557=over 4
1558
1559=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1560
1561Sorts a list of strings.
1562
1563=item C<$result = $Collator-E<gt>cmp($a, $b)>
1564
1565Returns 1 (when C<$a> is greater than C<$b>)
1566or 0 (when C<$a> is equal to C<$b>)
1567or -1 (when C<$a> is lesser than C<$b>).
1568
1569=item C<$result = $Collator-E<gt>eq($a, $b)>
1570
1571=item C<$result = $Collator-E<gt>ne($a, $b)>
1572
1573=item C<$result = $Collator-E<gt>lt($a, $b)>
1574
1575=item C<$result = $Collator-E<gt>le($a, $b)>
1576
1577=item C<$result = $Collator-E<gt>gt($a, $b)>
1578
1579=item C<$result = $Collator-E<gt>ge($a, $b)>
1580
1581They works like the same name operators as theirs.
1582
1583   eq : whether $a is equal to $b.
1584   ne : whether $a is not equal to $b.
1585   lt : whether $a is lesser than $b.
1586   le : whether $a is lesser than $b or equal to $b.
1587   gt : whether $a is greater than $b.
1588   ge : whether $a is greater than $b or equal to $b.
1589
1590=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1591
1592-- see 4.3 Form Sort Key, UTS #10.
1593
1594Returns a sort key.
1595
1596You compare the sort keys using a binary comparison
1597and get the result of the comparison of the strings using UCA.
1598
1599   $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1600
1601      is equivalent to
1602
1603   $Collator->cmp($a, $b)
1604
1605=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1606
1607Converts a sorting key into its representation form.
1608If C<UCA_Version> is 8, the output is slightly different.
1609
1610   use Unicode::Collate;
1611   my $c = Unicode::Collate->new();
1612   print $c->viewSortKey("Perl"),"\n";
1613
1614   # output:
1615   # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1616   #  Level 1               Level 2               Level 3               Level 4
1617
1618=back
1619
1620=head2 Methods for Searching
1621
1622B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1623for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1624C<subst>, C<gsubst>) is croaked,
1625as the position and the length might differ
1626from those on the specified string.
1627(And C<rearrange> and C<hangul_terminator> parameters are neglected.)
1628
1629The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1630like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1631but they are not aware of any pattern, but only a literal substring.
1632
1633=over 4
1634
1635=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1636
1637=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1638
1639If C<$substring> matches a part of C<$string>, returns
1640the position of the first occurrence of the matching part in scalar context;
1641in list context, returns a two-element list of
1642the position and the length of the matching part.
1643
1644If C<$substring> does not match any part of C<$string>,
1645returns C<-1> in scalar context and
1646an empty list in list context.
1647
1648e.g. you say
1649
1650  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1651                                     # (normalization => undef) is REQUIRED.
1652  my $str = "Ich mu� studieren Perl.";
1653  my $sub = "M�SS";
1654  my $match;
1655  if (my($pos,$len) = $Collator->index($str, $sub)) {
1656      $match = substr($str, $pos, $len);
1657  }
1658
1659and get C<"mu�"> in C<$match> since C<"mu�">
1660is primary equal to C<"M�SS">.
1661
1662=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1663
1664=item C<($match)   = $Collator-E<gt>match($string, $substring)>
1665
1666If C<$substring> matches a part of C<$string>, in scalar context, returns
1667B<a reference to> the first occurrence of the matching part
1668(C<$match_ref> is always true if matches,
1669since every reference is B<true>);
1670in list context, returns the first occurrence of the matching part.
1671
1672If C<$substring> does not match any part of C<$string>,
1673returns C<undef> in scalar context and
1674an empty list in list context.
1675
1676e.g.
1677
1678    if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1679	print "matches [$$match_ref].\n";
1680    } else {
1681	print "doesn't match.\n";
1682    }
1683
1684     or
1685
1686    if (($match) = $Collator->match($str, $sub)) { # list context
1687	print "matches [$match].\n";
1688    } else {
1689	print "doesn't match.\n";
1690    }
1691
1692=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1693
1694If C<$substring> matches a part of C<$string>, returns
1695all the matching parts (or matching count in scalar context).
1696
1697If C<$substring> does not match any part of C<$string>,
1698returns an empty list.
1699
1700=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1701
1702If C<$substring> matches a part of C<$string>,
1703the first occurrence of the matching part is replaced by C<$replacement>
1704(C<$string> is modified) and return C<$count> (always equals to C<1>).
1705
1706C<$replacement> can be a C<CODEREF>,
1707taking the matching part as an argument,
1708and returning a string to replace the matching part
1709(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1710
1711=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1712
1713If C<$substring> matches a part of C<$string>,
1714all the occurrences of the matching part is replaced by C<$replacement>
1715(C<$string> is modified) and return C<$count>.
1716
1717C<$replacement> can be a C<CODEREF>,
1718taking the matching part as an argument,
1719and returning a string to replace the matching part
1720(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1721
1722e.g.
1723
1724  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1725                                     # (normalization => undef) is REQUIRED.
1726  my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
1727  $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1728
1729  # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1730  # i.e., all the camels are made bold-faced.
1731
1732=back
1733
1734=head2 Other Methods
1735
1736=over 4
1737
1738=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1739
1740Change the value of specified keys and returns the changed part.
1741
1742    $Collator = Unicode::Collate->new(level => 4);
1743
1744    $Collator->eq("perl", "PERL"); # false
1745
1746    %old = $Collator->change(level => 2); # returns (level => 4).
1747
1748    $Collator->eq("perl", "PERL"); # true
1749
1750    $Collator->change(%old); # returns (level => 2).
1751
1752    $Collator->eq("perl", "PERL"); # false
1753
1754Not all C<(key,value)>s are allowed to be changed.
1755See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1756
1757In the scalar context, returns the modified collator
1758(but it is B<not> a clone from the original).
1759
1760    $Collator->change(level => 2)->eq("perl", "PERL"); # true
1761
1762    $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1763
1764    $Collator->change(level => 4)->eq("perl", "PERL"); # false
1765
1766=item C<$version = $Collator-E<gt>version()>
1767
1768Returns the version number (a string) of the Unicode Standard
1769which the C<table> file used by the collator object is based on.
1770If the table does not include a version line (starting with C<@version>),
1771returns C<"unknown">.
1772
1773=item C<UCA_Version()>
1774
1775Returns the tracking version number of UTS #10 this module consults.
1776
1777=item C<Base_Unicode_Version()>
1778
1779Returns the version number of UTS #10 this module consults.
1780
1781=back
1782
1783=head1 EXPORT
1784
1785No method will be exported.
1786
1787=head1 INSTALL
1788
1789Though this module can be used without any C<table> file,
1790to use this module easily, it is recommended to install a table file
1791in the UCA format, by copying it under the directory
1792<a place in @INC>/Unicode/Collate.
1793
1794The most preferable one is "The Default Unicode Collation Element Table"
1795(aka DUCET), available from the Unicode Consortium's website:
1796
1797   http://www.unicode.org/Public/UCA/
1798
1799   http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1800
1801If DUCET is not installed, it is recommended to copy the file
1802from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1803to <a place in @INC>/Unicode/Collate/allkeys.txt
1804manually.
1805
1806=head1 CAVEATS
1807
1808=over 4
1809
1810=item Normalization
1811
1812Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1813module (see L<Unicode::Normalize>).
1814
1815If you need not it (say, in the case when you need not
1816handle any combining characters),
1817assign C<normalization =E<gt> undef> explicitly.
1818
1819-- see 6.5 Avoiding Normalization, UTS #10.
1820
1821=item Conformance Test
1822
1823The Conformance Test for the UCA is available
1824under L<http://www.unicode.org/Public/UCA/>.
1825
1826For F<CollationTest_SHIFTED.txt>,
1827a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1828for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1829C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1830
1831B<Unicode::Normalize is required to try The Conformance Test.>
1832
1833=back
1834
1835=head1 AUTHOR, COPYRIGHT AND LICENSE
1836
1837The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1838<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2005,
1839SADAHIRO Tomoyuki. Japan. All rights reserved.
1840
1841This module is free software; you can redistribute it and/or
1842modify it under the same terms as Perl itself.
1843
1844The file Unicode/Collate/allkeys.txt was copied directly
1845from L<http://www.unicode.org/Public/UCA/4.1.0/allkeys.txt>.
1846This file is Copyright (c) 1991-2005 Unicode, Inc. All rights reserved.
1847Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
1848
1849=head1 SEE ALSO
1850
1851=over 4
1852
1853=item Unicode Collation Algorithm - UTS #10
1854
1855L<http://www.unicode.org/reports/tr10/>
1856
1857=item The Default Unicode Collation Element Table (DUCET)
1858
1859L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1860
1861=item The conformance test for the UCA
1862
1863L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1864
1865L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1866
1867=item Hangul Syllable Type
1868
1869L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1870
1871=item Unicode Normalization Forms - UAX #15
1872
1873L<http://www.unicode.org/reports/tr15/>
1874
1875=back
1876
1877=cut
1878