1#!perl
2#
3# This auxiliary script makes locale .pl files and copes with Korean.pm
4# used by Unicode::Collate::Locale.
5#
6# Usage:
7#    <do './mklocale'> in perl, or <perl mklocale> in command line
8#
9# Input files:
10#    data/*.txt
11#    Collate/CJK/Korean.pm
12#    Collate/allkeys.txt
13#
14# Output files:
15#    Locale/*.pl (need to be moved to Collate/Locale/*.pl to install them)
16#    Korean.pm   (need to be moved to Collate/CJK/Korean.pm to install it)
17#
18# What to do:
19#   * convert data/*.txt to Locale/*.pl
20#   * rewrite %jamo2prim in Korean.pm
21#
22# == Examples of the Rules for data/*.txt ==
23#
24# 00F1;n+1 ===> primary weight of 00F1 is greater than that of n by 1.
25#    Among literals, only [A-Za-z'"] can be the base.
26#    +1 for primary weight (greater by 1),
27#    -1 for primary weight (less by 1),
28#    ++1 and --1 for secondary weight,
29#    +++1 and ---1 for tertiary weight.
30#    A number followed by + or - is decimal integer.
31#
32# 0491;<0433>+1
33# 1D2D;<00C6>+++12
34#    XXXX is hexadecimal for U+XXXX.
35#    <XXXX> can be the base followed by +1 etc.
36#
37# 3220;<0028 4E00 0029>+++?
38#    '+++?' means implicit tertiary weights and borrows them from DUCET.
39#    The lefthand (3220) and the righthand (0028 4E00 0029) *must* be
40#    secondary equal in DUCET too (usually compatibility equivalent).
41#    The example shown works like "3220;<0028>+++2 <4E00>+++2 <0029>+++29",
42#    since the tertiary weights of U+3200 are (0x4,0x4,0x1F), those of
43#    <0028 4E00 0029> are (0x2,0x2,0x2), and the difference is (+2,+2,+29).
44#    Limitation: cannot be combined with other rules in the same line.
45#
46# 01FD;<00E6><0301> ===> U+01FD eq U+00E6,U+0301
47#    simple decomposition (without + and -): identical each other.
48#
49# 0902;<0950>+0 [.FFF1.0.0.0]
50#     [...] means constant weights in hexadecimal.
51#
52# 0064 0335;=
53# 0111;d++1<0335>
54#    '=' saves DUCET weights as it is.
55#    0064 0335;= prevents 0064 0335 from being equal to 0111.
56#
57# {ch};c+1   ===> 0063 0068;c+1
58# {K'};Q++1  ===> 004B 0027;Q++1
59#     { } before ; encloses a literal: [A-Za-z'] (alphabets or apostrophe).
60#
61# {gh}0335;<{g}0127> ===> U+0067,U+0068,U+0335 eq U+0067,U+0127
62# {dZ}030C;<{d}017D> ===> U+0064,U+005A,U+030C eq U+0064,U+017D
63#     < > after ; can enclose a unit comprising plural XXXX or {literal}.
64#
65# alternate
66#    "alternate XXX" gives "variable => 'XXX'," and "alternate => 'XXX',".
67#
68# backwards
69#    It gives "backwards => 2,".
70#
71# upper
72#    It gives "upper_before_lower => 1,".
73#
74# suppress
75#    "suppress XXXX YYYY" gives "suppress => [0xXXXX, 0xYYYY],",
76#    "suppress XXXX-YYYY" gives "suppress => [0xXXXX..0xYYYY],".
77#    Values XXXX etc. should be hexadecimal, and may be listed in few lines.
78#
79# use
80#    "use PACKAGE" gives "use PACKAGE;".  The PACKAGE stands for a module,
81#    such as CJK/*.pm for overrideCJK, to be loaded from Locale/*.pl.
82#
83# overrideCJK and overrideHangul
84#    "overrideCJK FUNCNAME" or "overrideCJK \&FUNCNAME" gives
85#    "overrideCJK => \&FUNCNAME,", where FUNCNAME should be visible
86#    from Locale/*.pl. As well, overrideHangul works similar.
87#
88# locale_version
89#    The version number of Locale/*.pl. It should be updated when a *.pl file
90#    will be changed. The default value is $DEFAULT_LOCALE_VERSION.
91#
92use 5.006;
93use strict;
94use warnings;
95use Carp;
96use File::Spec;
97
98my $Use4th; # Use 4th level (Unicode 6.2.0 or before)
99my $DEFAULT_LOCALE_VERSION = '1.31'; # should be same as $VERSION of Collate.pm
100
101BEGIN {
102    unless ("A" eq pack('U', 0x41)) {
103	die "Unicode::Collate cannot stringify a Unicode code point\n";
104    }
105    unless (0x41 == unpack('U', 'A')) {
106	die "Unicode::Collate cannot get a Unicode code point\n";
107    }
108}
109
110sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
111sub trim { $_[0] =~ s/^\ +//; $_[0] =~ s/\ +\z// }
112
113sub ce {
114    my $var = shift;
115    my $vc = $var ? '*' : '.';
116    my $hx = join '.', map { sprintf '%04X', $_ } @_;
117    return "[$vc$hx]";
118}
119
120our $PACKAGE = 'Unicode::Collate, locale';
121our $ENT_FMT = "%-9s ; %s # %s\n";
122our $RE_CE   = '(?:\[[0-9A-Fa-f\.\*]+\])';
123our $CUR_DIR = File::Spec->curdir();
124
125use constant SBase  => 0xAC00;
126use constant SFinal => 0xD7A3;
127use constant NCount =>    588;
128use constant TCount =>     28;
129use constant LBase  => 0x1100;
130use constant VBase  => 0x1161;
131use constant TBase  => 0x11A7;
132
133use constant Min2Wt => 0x20;
134use constant Min3Wt => 0x02;
135
136our $OvCJK  = 'overrideCJK';
137our $OvHang = 'overrideHangul';
138
139sub decHangul {
140    my $code = shift;
141    my $si = $code - SBase;
142    my $li = int( $si / NCount);
143    my $vi = int(($si % NCount) / TCount);
144    my $ti =      $si % TCount;
145    return (
146	LBase + $li,
147	VBase + $vi,
148	$ti ? (TBase + $ti) : (),
149    );
150}
151
152use constant CJK_UidFr =>  0x4E00;
153use constant CJK_UidTo =>  0x9FFF;
154
155sub simple_cjk_deriv { # $u should be CJK U.I. or Ext.
156    my $u = hex shift;
157    my $base = (CJK_UidFr <= $u && $u <= CJK_UidTo) ? 0xFB40 : 0xFB80;
158    my $aaaa = $base + ($u >> 15);
159    my $bbbb = ($u & 0x7FFF) | 0x8000;
160    my @u = $Use4th ? ($u) : ();
161    return ce(0, $aaaa, Min2Wt, Min3Wt, @u).ce(0, $bbbb, 0, 0, @u);
162}
163
164my %Keys;        # "0300" => "[.0000.0035.0002.0300]"
165my %Code;        # "[.0000.0035.0002.0300]" => "0300"
166my %Name;        # "0300" => "COMBINING GRAVE ACCENT"
167my %Equiv;       # "[.0000.0035.0002.0300]" => ["0340", "0953"]
168my $vDUCET;      # from @version, such as "6.0.0"
169my %JamoWt1;     # primary weights of modern jamo (codept => wt) decimal
170my $JamoWt2 = 0; # the maximum secondary weight of modern jamo (decimal)
171my @CompId;      # decomp. mapping of CJK compat. id. like ["FA00", 0x5207]
172
173my @OtherEquiv = split /\n=/, <<'OTHEREQUIV';
174=00C2= (A-circ)
1751EA7;<00E2><0300>
1761EA6;<00C2><0300>
1771EA5;<00E2><0301>
1781EA4;<00C2><0301>
1791EAB;<00E2><0303>
1801EAA;<00C2><0303>
1811EA9;<00E2><0309>
1821EA8;<00C2><0309>
1831EAD;<00E2><0323>
1841EAC;<00C2><0323>
185=00C4= (A-uml)
18601DF;<00E4><0304>
18701DE;<00C4><0304>
188=00C5= (A-ring)
18901FB;<00E5><0301>
19001FA;<00C5><0301>
191=00C6= (AE-lig)
1921D2D;<00C6>+++12
19301FD;<00E6><0301>
19401FC;<00C6><0301>
19501E3;<00E6><0304>
19601E2;<00C6><0304>
197=00CA= (E-circ)
1981EC1;<00EA><0300>
1991EC0;<00CA><0300>
2001EBF;<00EA><0301>
2011EBE;<00CA><0301>
2021EC5;<00EA><0303>
2031EC4;<00CA><0303>
2041EC3;<00EA><0309>
2051EC2;<00CA><0309>
2061EC7;<00EA><0323>
2071EC6;<00CA><0323>
208=00D4= (O-circ)
2091ED3;<00F4><0300>
2101ED2;<00D4><0300>
2111ED1;<00F4><0301>
2121ED0;<00D4><0301>
2131ED7;<00F4><0303>
2141ED6;<00D4><0303>
2151ED5;<00F4><0309>
2161ED4;<00D4><0309>
2171ED9;<00F4><0323>
2181ED8;<00D4><0323>
219=00D5= (O-tilde)
2201E4D;<00F5><0301>
2211E4C;<00D5><0301>
222022D;<00F5><0304>
223022C;<00D5><0304>
2241E4F;<00F5><0308>
2251E4E;<00D5><0308>
2261EE1;<00F5><031B>
2271EE0;<00D5><031B>
228=00D6= (O-uml)
229022B;<00F6><0304>
230022A;<00D6><0304>
231=00D8= (O-slash)
23201FF;<00F8><0301>
23301FE;<00D8><0301>
234=00DC= (U-uml)
23501DC;<00FC><0300>
23601DB;<00DC><0300>
23701D8;<00FC><0301>
23801D7;<00DC><0301>
23901D6;<00FC><0304>
24001D5;<00DC><0304>
24101DA;<00FC><030C>
24201D9;<00DC><030C>
243=0102= (A-breve)
2441EB1;<0103><0300>
2451EB0;<0102><0300>
2461EAF;<0103><0301>
2471EAE;<0102><0301>
2481EB5;<0103><0303>
2491EB4;<0102><0303>
2501EB3;<0103><0309>
2511EB2;<0102><0309>
2521EB7;<0103><0323>
2531EB6;<0102><0323>
254=0186= (open-O)
2551D53;<0186>+++?
256=0190= (open-E)
2572107;<0190>+++?
2581D4B;<0190>+++?
259=01A0= (O-horn)
2601EDD;<01A1><0300>
2611EDC;<01A0><0300>
2621EDB;<01A1><0301>
2631EDA;<01A0><0301>
2641EE1;<01A1><0303>
2651EE0;<01A0><0303>
2661EDF;<01A1><0309>
2671EDE;<01A0><0309>
2681EE3;<01A1><0323>
2691EE2;<01A0><0323>
270=01AF= (U-horn)
2711EEB;<01B0><0300>
2721EEA;<01AF><0300>
2731EE9;<01B0><0301>
2741EE8;<01AF><0301>
2751EEF;<01B0><0303>
2761EEE;<01AF><0303>
2771EED;<01B0><0309>
2781EEC;<01AF><0309>
2791EF1;<01B0><0323>
2801EF0;<01AF><0323>
281=0300= (grave)
28200E0;a<0300>
28300C0;A<0300>
28400E8;e<0300>
28500C8;E<0300>
28600EC;i<0300>
28700CC;I<0300>
28800F2;o<0300>
28900D2;O<0300>
29000F9;u<0300>
29100D9;U<0300>
2921EF3;y<0300>
2931EF2;Y<0300>
294=0301= (acute)
29500E1;a<0301>
29600C1;A<0301>
29700E9;e<0301>
29800C9;E<0301>
29900ED;i<0301>
30000CD;I<0301>
30100F3;o<0301>
30200D3;O<0301>
30300FA;u<0301>
30400DA;U<0301>
30500FD;y<0301>
30600DD;Y<0301>
307=0302= (circum)
30800E2;a<0302>
30900C2;A<0302>
31000EA;e<0302>
31100CA;E<0302>
31200EE;i<0302>
31300CE;I<0302>
31400F4;o<0302>
31500D4;O<0302>
31600FB;u<0302>
31700DB;U<0302>
3180177;y<0302>
3190176;Y<0302>
320=0303= (tilde)
32100E3;a<0303>
32200C3;A<0303>
3231EBD;e<0303>
3241EBC;E<0303>
3250129;i<0303>
3260128;I<0303>
32700F5;o<0303>
32800D5;O<0303>
3290169;u<0303>
3300168;U<0303>
3311EF9;y<0303>
3321EF8;Y<0303>
333=0308= (diaeresis)
33400E4;a<0308>
33500C4;A<0308>
33600EB;e<0308>
33700CB;E<0308>
33800EF;i<0308>
33900CF;I<0308>
34000F6;o<0308>
34100D6;O<0308>
34200FC;u<0308>
34300DC;U<0308>
34400FF;y<0308>
3450178;Y<0308>
346=0309= (hook-above)
3471EA3;a<0309>
3481EA2;A<0309>
3491EBB;e<0309>
3501EBA;E<0309>
3511EC9;i<0309>
3521EC8;I<0309>
3531ECF;o<0309>
3541ECE;O<0309>
3551EE7;u<0309>
3561EE6;U<0309>
3571EF7;y<0309>
3581EF6;Y<0309>
359=0323= (dot-below)
3601EA1;a<0323>
3611EA0;A<0323>
3621EB9;e<0323>
3631EB8;E<0323>
3641ECB;i<0323>
3651ECA;I<0323>
3661ECD;o<0323>
3671ECC;O<0323>
3681EE5;u<0323>
3691EE4;U<0323>
3701EF5;y<0323>
3711EF4;Y<0323>
372=0406= (Cyrillic-Byelorussian-Ukrainian-I)
3730457;<0456><0308>
374A676;<0456>+++2 <0308>+++2
3750407;<0406><0308>
376=041E= (Cyrillic-O)
37704E7;<043E><0308>
37804E6;<041E><0308>
379=0423= (Cyrillic-U)
380045E;<0443><0306>
381040E;<0423><0306>
38204F1;<0443><0308>
38304F0;<0423><0308>
38404F3;<0443><030B>
38504F2;<0423><030B>
38604EF;<0443><0304>
38704EE;<0423><0304>
388=0933= (Devanagari-LLA)
3890934;<0933><093C>
390=0A3C= (Gurmukhi-Nukta)
3910A33;<0A32><0A3C>
3920A36;<0A38><0A3C>
3930A59;<0A16><0A3C>
3940A5A;<0A17><0A3C>
3950A5B;<0A1C><0A3C>
3960A5E;<0A2B><0A3C>
397=1EB8= (E-dot-below)
3981EC7;<1EB9><0302>
3991EC6;<1EB8><0302>
400=1ECC= (O-dot-below)
4011ED9;<1ECD><0302>
4021ED8;<1ECC><0302>
4031EE3;<1ECD><031B>
4041EE2;<1ECC><031B>
405=1EE4= (U-dot-below)
4061EF1;<1EE5><031B>
4071EF0;<1EE4><031B>
408OTHEREQUIV
409
410my %OtherEquiv;
411for my $o (@OtherEquiv) {
412    my @ln = split /\n/, $o;
413    my $uv = shift @ln;
414    $uv =~ s/ *\([a-zA-Z-]+\) *//;
415    $uv =~ tr/=//d;
416    croak "$PACKAGE: $uv is invalid in OTHEREQUIV" if $uv !~ /^[0-9A-F]+\z/;
417    $OtherEquiv{$uv} = \@ln;
418}
419
420##### read DUCET #####
421
422{
423    my($f, $fh);
424    foreach my $d ($CUR_DIR) {
425	$f = File::Spec->catfile($d, "Collate", "allkeys.txt");
426	last if open($fh, $f);
427	$f = undef;
428    }
429    croak "$PACKAGE: Collate/allkeys.txt is not found" if !defined $f;
430
431    while (my $line = <$fh>) {
432	chomp $line;
433	next if $line =~ /^\s*#/;
434	$vDUCET = $1 if $line =~ /^\@version\s*(\S*)/;
435
436	next if $line !~ /^\s*[0-9A-Fa-f]/;
437
438	my $name = ($line =~ s/[#%]\s*(.*)//) ? $1 : '';
439
440	# gets element
441	my($e, $k) = split /;/, $line;
442	trim($e);
443	trim($k);
444	$name =~ s/; QQ[A-Z]+//;
445	$name =~ s/^ ?\[[0-9A-F]+\] ?//;
446
447	if ($k =~ /\[\.0000\.0000\.0000(\.?0*)\]/) {
448	    $Use4th = 1 if $1;
449	    $Name{$e} = $name;
450	    next;
451	}
452	croak "Wrong Entry: <charList> must be separated by ';' ".
453	      "from <collElement>" if ! $k;
454	push @{ $Equiv{$k} }, $e if exists $Code{$k};
455
456	$Keys{$e} = $k;
457	$Code{$k} = $e if !exists $Code{$k};
458	$Name{$e} = $name;
459
460	# Hangul Jamo (modern only)
461	if ($e =~ /^11[0-9A-F]{2}\z/) {
462	    my @ec = _getHexArray($e);
463	    my @kc = _getHexArray($k);
464	    if (@ec == 1) {
465		$JamoWt1{$ec[0]} = $kc[0]; # each jamo
466		$JamoWt2 = $kc[1] if $JamoWt2 < $kc[1]; # max
467	    }
468	}
469
470	# CJK compatibility ideographs
471	if ($k =~ /^\[\.F[0-9A-F]+\.[0-9A-F]+\.0002[\.\]]/) {
472	    my @p = map hex($_), $k =~ /\[\.([0-9A-F]+)\./g;
473	    if (@p == 2) {
474		my $ui = ((($p[0] & 0x3F) << 15) | ($p[1] & 0x7FFF));
475		my $h = sprintf "%04X", $ui;
476		push @CompId, [$e, $ui] if $e ne $h; # should be decomposable
477	    }
478	}
479    }
480    close $fh;
481}
482
483# $Code{$k} : precomposed (such as 04D1, CYRILLIC SMALL LETTER A WITH BREVE)
484# $eqs      : equivalent sequence (such as <0430><0306>)
485# $starter  : starter codepoint (integer such as hex '0430')
486my @Contractions; # store Cyrillic, currently required, and others.
487for my $k (sort keys %Equiv) {
488    if ($Code{$k} !~ / / && $Equiv{$k}[0] =~ / /) {
489	(my $eqs = "<$Equiv{$k}[0]>") =~ s/ /></g;
490	my $starter = $eqs =~ /^<([0-9A-Fa-f]+)>/ ? hex($1) : '';
491	push @Contractions, [$starter, "$Code{$k};$eqs"];
492    }
493}
494
495##### Korean.pm #####
496
497{
498    my($f, $fh);
499    foreach my $d ($CUR_DIR) {
500	$f = File::Spec->catfile($d, "Collate", "CJK", "Korean.pm");
501	last if open($fh, $f);
502	$f = undef;
503    }
504    croak "$PACKAGE: Collate/CJK/Korean.pm is not found" if !defined $f;
505
506    my %KO_jamo; # (codepoint => 1) for jamo that a syllable comprises.
507    my $KO_head = ''; # before /^my %jamo2prim/
508    my $KO_foot = ''; # after /^\); # for DUCET/
509    while (my $line = <$fh>) {
510	chomp $line;
511
512	# check jamo that a syllable in __DATA__ comprises.
513	if (($line =~ /^__DATA__/) .. ($line =~ /^__END__/)) {
514	    if ($line =~ /^[A-D]/) { # Hangul syllable
515		$line =~ s/:.*//;
516		my @u = _getHexArray($line);
517		croak "unexpected $line" if @u != 1;
518		my $uv = $u[0];
519		croak "unexpected $line" unless SBase <= $uv && $uv <= SFinal;
520		my @dec = decHangul($uv);
521		$KO_jamo{$_} = 1 for @dec;
522		$line .= ':'.join '-', map sprintf('%04X', $_), @dec;
523	    }
524	}
525
526	$line .= "\n"; # guarantee the last LF at the file end.
527	$KO_head .= $line if 1 .. ($line =~ /^my %jamo2prim/);
528	$KO_foot .= $line if ($line =~ /^\); # for DUCET/) .. 1; # eof($fh)
529    }
530    close $fh;
531
532    open my $pm, ">Korean.pm" or die 'Korean.pm';
533    binmode $pm;
534    print $pm $KO_head;
535
536    my $count = 0;
537    for (sort keys %KO_jamo) {
538	print $pm ' ' if ($count % 4) == 0;
539	++$count;
540	print $pm '   ';
541	print $pm sprintf q!'%04X', 0x%04X,!, $_, $JamoWt1{$_};
542	print $pm "\n" if ($count % 4) == 0;
543    }
544    print $pm "\n" if $count % 4 != 0;
545
546    $JamoWt2 = sprintf "0x%02X", $JamoWt2;
547    $KO_foot =~ s/(\$wt = )[0-9x]+/$1$JamoWt2/;
548    $KO_foot =~ s/^(\); # for DUCET).*/$1 v$vDUCET/;
549    print $pm $KO_foot;
550    close $pm;
551}
552
553##### Locale/*.pl #####
554
555my $source = 'data';
556opendir DIR, $source or croak "no $source";
557my @txts = grep /^[a-zA-Z]/, readdir DIR;
558closedir DIR;
559
560my $target = 'Locale';
561mkdir $target;
562for my $txt (@txts) {
563    my %locale_keys;
564    my $txtfile = File::Spec->catfile($source, $txt);
565    my $pl = $txt;
566       $pl =~ s/\.txt\z/.pl/ or croak "$PACKAGE: $source/$txt is not .txt";
567    my $plfile = File::Spec->catfile($target, $pl);
568
569    open my $fh,  $txtfile  or croak "$PACKAGE: $source/$txt is not found: $!";
570    open my $ph, ">$plfile" or croak "$PACKAGE: $target/$pl can't be made: $!";
571    binmode $ph;
572
573    my $ptxt  = '';
574    my $entry = '';
575    my $locale_version = $DEFAULT_LOCALE_VERSION;
576
577    while (<$fh>) {
578	chomp;
579	next if /^\s*\z/;
580	if (s/^locale_version//) {
581	    $locale_version = $1 if /(\S+)/;
582	    next;
583	}
584	if (/^use/) {
585	    print $ph "$_;\n";
586	    s/^[^:]+:://; # assume that will be located under ./Collate,
587			  # then omit the first identifier (Unicode::).
588	    s/\s*\z/.pm/;
589	    my $f = File::Spec->catfile($CUR_DIR, split /::/, $_);
590	    $f = 'Korean.pm' if /::Korean\.pm/; # using the newer one
591	    require "./$f"; # File::Spec->catfile($CUR_DIR, $f) seems not work.
592	    next;
593	}
594	if (/^(alternate)\s+(\S+)/) {
595	    my $v = "variable";
596	    $ptxt .= "   $v => '$2',\n";
597	    $ptxt .= "   $1 => '$2',\n";
598	    next;
599	}
600	if (/^backwards$/) {
601	    $ptxt .= "   backwards => 2,\n";
602	    next;
603	}
604	if (s/^(override)(CJK|Hangul)[ \t]+(?:\\&|)/\\&/) {
605	    my $key = $1.$2;
606	    $ptxt .= "   $key => $_,\n";
607	    $locale_keys{$key} = eval $_;
608	    next;
609	}
610	if (/^upper$/) {
611	    $ptxt .= "   upper_before_lower => 1,\n";
612	    next;
613	}
614	if (s/^suppress//) {
615	    s/\s*-\s*/../g;
616	    my @c = split;
617	    s/(?:0[Xx])?([0-9A-Fa-f]+)/0x$1/g for @c;
618	    my $list = join ", ", @c;
619	    $ptxt .= "   suppress => [$list],\n";
620	    next;
621	}
622	if (/^([\s\-0-9A-Fa-fXx]+)\z/) { # continue the last list
623	    s/\s*-\s*/../g;
624	    my @c = split;
625	    s/(?:0[Xx])?([0-9A-Fa-f]+)/0x$1/g for @c;
626	    my $list = join ", ", @c;
627	    $ptxt =~ s/\](,$)/$1/;
628	    $ptxt .= "\t\t$list],\n";
629	    next;
630	}
631	if (/^\s*(#\s*)/) {
632	    $ptxt .= "$_\n" if $1 ne '#';
633	    next;
634	}
635
636	$entry .= parse_entry($_, \%locale_keys);
637    }
638
639    # precomposed chars to be suppressed as additional equivalents
640    if ($ptxt =~ /suppress => \[(.*)\]/s) {
641	my @suplist = eval $1;
642	my %suppressed;
643	@suppressed{@suplist} = (1) x @suplist;
644
645	for my $contract (@Contractions) {
646	    my $starter = $contract->[0];
647	    my $addline = $contract->[1];
648	    next if ! $suppressed{$starter};
649	    $entry .= parse_entry($addline, \%locale_keys);
650	}
651    }
652
653    # Compatibility ideographs are tailored as equivalent unified ideographs.
654    if ($locale_keys{$OvCJK}) {
655	for my $c (@CompId) {
656	    my $r = $locale_keys{$OvCJK}->($c->[1]);
657	    next if !defined $r;
658	    my $addline = sprintf '%s;<%04X>', $c->[0], $c->[1];
659	    $entry .= parse_entry($addline, \%locale_keys);
660	}
661    }
662
663    if ($entry) {
664	my $v = $vDUCET ? " # for DUCET v$vDUCET" : '';
665	$ptxt .= "   entry => <<'ENTRY',$v\n";
666	$ptxt .= $entry;
667	$ptxt .= "ENTRY\n";
668    }
669
670    my $lv = "   locale_version => $locale_version,\n";
671    print $ph "+{\n$lv$ptxt};\n";
672    close $fh;
673    close $ph;
674}
675
676sub parse_entry {
677    my $line = shift;
678    my $lockeys = shift;
679
680    my($e,$rule) = split_e_rule($line);
681    my $name = getname($e);
682    my $eq_rule = $rule eq '=';
683    $rule = join '', map "<$_>", split ' ', $e if $eq_rule;
684    my ($newce, $simpdec) = parse_rule($e, $rule, $lockeys);
685
686    my $newentry = '';
687
688    if (!$lockeys->{$e}) {
689	$newentry .= sprintf $ENT_FMT, $e, $newce, $name if !$eq_rule;
690	$lockeys->{$e} = $newce;
691    } else {
692	$newentry .= "# already tailored: $_\n";
693    }
694
695    if (!$simpdec && $Keys{$e}) { # duplicate for the decomposition
696	my $key = $Keys{$e};
697	my @ce = $key =~ /$RE_CE/go;
698	if (@ce > 1) {
699	    my $ok = 1;
700	    my $ee = '';
701	    for my $c (@ce) {
702		$ok = 0, last if !$Code{$c};
703		$ee .= ' ' if $ee ne '';
704		$ee .= $Code{$c};
705	    }
706	    if ($ok && !$lockeys->{$ee}) {
707		$newentry .= sprintf $ENT_FMT, $ee, $newce, $name;
708		$lockeys->{$ee} = $newce;
709	    }
710	    if ($ee =~ s/ 030([01])/ 034$1/ &&
711		$ok && !$lockeys->{$ee}) {
712		$newentry .= sprintf $ENT_FMT, $ee, $newce, $name;
713		$lockeys->{$ee} = $newce;
714	    }
715	}
716	if ($Equiv{$key}) {
717	    for my $eq (@{ $Equiv{$key} }) {
718		next if $key =~ /^\[\.0000\.[^]]+\]\z/; # primary ignorable
719		next if $lockeys->{$eq};
720		next if $eq eq '3038'; # 3038 is identical to 2F17 in DUCET,
721				       # but not canonical equivalent.
722		$newentry .= sprintf $ENT_FMT, $eq, $newce, $Name{$eq};
723		$lockeys->{$eq} = $newce;
724	    }
725	}
726    }
727
728    if ($OtherEquiv{$e}) {
729	for my $o (@{ $OtherEquiv{$e} }) {
730	    my($e,$rule) = split_e_rule($o);
731	    my $name = getname($e);
732	    (my $newce, undef) = parse_rule($e, $rule, $lockeys);
733	    next if $lockeys->{$e};
734	    $newentry .= sprintf $ENT_FMT, $e, $newce, $name;
735	    $lockeys->{$e} = $newce;
736	}
737    }
738
739    return $newentry;
740}
741
742sub getunicode {
743    return join ' ', map { sprintf '%04X', $_ } unpack 'U*', shift;
744}
745
746sub parse_element {
747    my $e = shift;
748    $e =~ s/\{([A-Za-z']+)\}/' '.getunicode($1).' '/ge;
749    $e =~ s/ +/ /g;
750    trim($e);
751    return $e;
752}
753
754sub split_e_rule {
755    my $line = shift;
756    my($e, $r) = split /;/, $line;
757    return (parse_element($e), $r);
758}
759
760sub getname {
761    my $e = shift;
762    return $Name{$e} if $Name{$e};  # single collation element (without <>)
763    my @e = split ' ', $e;
764    my @name = map { $Name{$_} ? $Name{$_} :
765		     /^FD[DE][0-9A-F]\z/ ? "noncharacter-$_" :
766		     'unknown' } @e;
767    return sprintf '<%s>', join ', ', @name;
768}
769
770sub parse_rule {
771    my $e    = shift;
772    my $e1   = $e =~ /^([0-9A-F]+)/ ? $1 : '';
773    my $rule = shift;
774    my $lockeys = shift;
775    my $result = '';
776    my $simple_decomp = 1; # rules containing only [A-Za-z'"] or <XXXX>
777
778    for (my $prerule = $rule; $rule ne ''; $prerule = $rule) {
779	$rule =~ s/^ +//;
780	last if $rule =~ /^#/;
781	if ($rule =~ s/^($RE_CE)//o) {
782	    my $k = $1;
783	    my $var = $k =~ /^\[\*/ ? 1 : 0;
784	    my @c = _getHexArray($k);
785	    @c = @c[0..2] if !$Use4th;
786	    $result .= ce($var, @c);
787	    next;
788	}
789
790	if ($rule =~ s/^(<([0-9A-F ]+)>\+\+\+\?)//) {
791	    my $cr = $1;
792	    my @c = split ' ', $2;
793	    my $compat = $Keys{$e};
794	    my $decomp = join '', map {
795			$Keys{$_} ? $Keys{$_} : simple_cjk_deriv($_)
796		    } @c;
797	    my $regexp = $decomp;
798	    $regexp =~ s/([\[\]\.\*])/\\$1/g;
799	    $regexp =~ s/\.00(?:0[1-9A-F]|1[0-9A-F])(?:\\\.[0-9A-F]+|)\\\]
800			/.(00(?:0[1-9A-F]|1[0-9A-F]))(?:\\.[0-9A-F]+|)\\\]/gx;
801			# tertiary weights of 01-1F (excluding 00)
802	    my @tD = map hex($_), $decomp =~ /^$regexp\z/;
803	    my @tC = map hex($_), $compat =~ /^$regexp\z/;
804	    croak "wrong at $cr" unless @c == @tD && @c == @tC;
805	    my $r = join ' ', map "<$c[$_]>+++".($tC[$_] - $tD[$_]), 0..@c-1;
806	    $rule = $r.$rule;
807	    next;
808	}
809
810	my $key;
811	if ($rule =~ s/^(<[0-9A-Za-z'{ }]+>|[A-Za-z'"])//) {
812	    my $e = $1;
813	    my $c = $e =~ tr/<>//d ? parse_element($e) : getunicode($e);
814	    croak "<$c> is too short" if 4 > length $c;
815	    $key = $lockeys->{$c} || $Keys{$c};
816	    if (!defined $key) {
817		my $u = hex $c;
818		my @u = $Use4th ? ($u) : ();
819		my @r;
820		if (SBase <= $u && $u <= SFinal) {
821		    @r = $lockeys->{$OvHang}->($u) if $lockeys->{$OvHang};
822		} else {
823		    # but no check if $u is in CJK ideographs
824		    @r = $lockeys->{$OvCJK} ->($u) if $lockeys->{$OvCJK};
825		}
826		if (@r) {
827		    $key = join '', map {
828			ref $_ ? ce(0, @$_) : ce(0, $_, Min2Wt, Min3Wt, @u)
829			    } @r;
830		}
831	    }
832	}
833
834	my @base;
835	for my $k ($key =~ /$RE_CE/go) {
836	    my $var = $k =~ /^\[\*/ ? 1 : 0;
837	    push @base, [$var, _getHexArray($k)];
838	}
839	croak "the rule seems wrong at $prerule" if !@base;
840
841	my $replaced = 0;
842	while ($rule =~ s/^(([+-])\2*)(\d+)//) {
843	    my $idx = length($1);
844	    my $num = $2 eq '-' ? -$3 : $3;
845	    $base[0][$idx] += $num;
846	    ++$replaced;
847	}
848
849	$simple_decomp = 0 if $replaced;
850	for my $c (@base) {
851	    $c->[4] = hex $e1 if $replaced && $Use4th;
852	    $result .= ce(@$c);
853	}
854	croak "something wrong at $rule" if $prerule eq $rule;
855    }
856    return($result, $simple_decomp);
857}
858