1#!perl 2# 3# This auxiliary script makes locale .pl files and copes with Korean.pm 4# used by Unicode::Collate::Locale. 5# 6# Usage: 7# <do './mklocale'> in perl, or <perl mklocale> in command line 8# 9# Input files: 10# data/*.txt 11# Collate/CJK/Korean.pm 12# Collate/allkeys.txt 13# 14# Output files: 15# Locale/*.pl (need to be moved to Collate/Locale/*.pl to install them) 16# Korean.pm (need to be moved to Collate/CJK/Korean.pm to install it) 17# 18# What to do: 19# * convert data/*.txt to Locale/*.pl 20# * rewrite %jamo2prim in Korean.pm 21# 22# == Examples of the Rules for data/*.txt == 23# 24# 00F1;n+1 ===> primary weight of 00F1 is greater than that of n by 1. 25# Among literals, only [A-Za-z'"] can be the base. 26# +1 for primary weight (greater by 1), 27# -1 for primary weight (less by 1), 28# ++1 and --1 for secondary weight, 29# +++1 and ---1 for tertiary weight. 30# A number followed by + or - is decimal integer. 31# 32# 0491;<0433>+1 33# 1D2D;<00C6>+++12 34# XXXX is hexadecimal for U+XXXX. 35# <XXXX> can be the base followed by +1 etc. 36# 37# 3220;<0028 4E00 0029>+++? 38# '+++?' means implicit tertiary weights and borrows them from DUCET. 39# The lefthand (3220) and the righthand (0028 4E00 0029) *must* be 40# secondary equal in DUCET too (usually compatibility equivalent). 41# The example shown works like "3220;<0028>+++2 <4E00>+++2 <0029>+++29", 42# since the tertiary weights of U+3200 are (0x4,0x4,0x1F), those of 43# <0028 4E00 0029> are (0x2,0x2,0x2), and the difference is (+2,+2,+29). 44# Limitation: cannot be combined with other rules in the same line. 45# 46# 01FD;<00E6><0301> ===> U+01FD eq U+00E6,U+0301 47# simple decomposition (without + and -): identical each other. 48# 49# 0902;<0950>+0 [.FFF1.0.0.0] 50# [...] means constant weights in hexadecimal. 51# 52# 0064 0335;= 53# 0111;d++1<0335> 54# '=' saves DUCET weights as it is. 55# 0064 0335;= prevents 0064 0335 from being equal to 0111. 56# 57# {ch};c+1 ===> 0063 0068;c+1 58# {K'};Q++1 ===> 004B 0027;Q++1 59# { } before ; encloses a literal: [A-Za-z'] (alphabets or apostrophe). 60# 61# {gh}0335;<{g}0127> ===> U+0067,U+0068,U+0335 eq U+0067,U+0127 62# {dZ}030C;<{d}017D> ===> U+0064,U+005A,U+030C eq U+0064,U+017D 63# < > after ; can enclose a unit comprising plural XXXX or {literal}. 64# 65# alternate 66# "alternate XXX" gives "variable => 'XXX'," and "alternate => 'XXX',". 67# 68# backwards 69# It gives "backwards => 2,". 70# 71# upper 72# It gives "upper_before_lower => 1,". 73# 74# suppress 75# "suppress XXXX YYYY" gives "suppress => [0xXXXX, 0xYYYY],", 76# "suppress XXXX-YYYY" gives "suppress => [0xXXXX..0xYYYY],". 77# Values XXXX etc. should be hexadecimal, and may be listed in few lines. 78# 79# use 80# "use PACKAGE" gives "use PACKAGE;". The PACKAGE stands for a module, 81# such as CJK/*.pm for overrideCJK, to be loaded from Locale/*.pl. 82# 83# overrideCJK and overrideHangul 84# "overrideCJK FUNCNAME" or "overrideCJK \&FUNCNAME" gives 85# "overrideCJK => \&FUNCNAME,", where FUNCNAME should be visible 86# from Locale/*.pl. As well, overrideHangul works similar. 87# 88# locale_version 89# The version number of Locale/*.pl. It should be updated when a *.pl file 90# will be changed. The default value is $DEFAULT_LOCALE_VERSION. 91# 92use 5.006; 93use strict; 94use warnings; 95use Carp; 96use File::Spec; 97 98my $Use4th; # Use 4th level (Unicode 6.2.0 or before) 99my $DEFAULT_LOCALE_VERSION = '1.31'; # should be same as $VERSION of Collate.pm 100 101BEGIN { 102 unless ("A" eq pack('U', 0x41)) { 103 die "Unicode::Collate cannot stringify a Unicode code point\n"; 104 } 105 unless (0x41 == unpack('U', 'A')) { 106 die "Unicode::Collate cannot get a Unicode code point\n"; 107 } 108} 109 110sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g } 111sub trim { $_[0] =~ s/^\ +//; $_[0] =~ s/\ +\z// } 112 113sub ce { 114 my $var = shift; 115 my $vc = $var ? '*' : '.'; 116 my $hx = join '.', map { sprintf '%04X', $_ } @_; 117 return "[$vc$hx]"; 118} 119 120our $PACKAGE = 'Unicode::Collate, locale'; 121our $ENT_FMT = "%-9s ; %s # %s\n"; 122our $RE_CE = '(?:\[[0-9A-Fa-f\.\*]+\])'; 123our $CUR_DIR = File::Spec->curdir(); 124 125use constant SBase => 0xAC00; 126use constant SFinal => 0xD7A3; 127use constant NCount => 588; 128use constant TCount => 28; 129use constant LBase => 0x1100; 130use constant VBase => 0x1161; 131use constant TBase => 0x11A7; 132 133use constant Min2Wt => 0x20; 134use constant Min3Wt => 0x02; 135 136our $OvCJK = 'overrideCJK'; 137our $OvHang = 'overrideHangul'; 138 139sub decHangul { 140 my $code = shift; 141 my $si = $code - SBase; 142 my $li = int( $si / NCount); 143 my $vi = int(($si % NCount) / TCount); 144 my $ti = $si % TCount; 145 return ( 146 LBase + $li, 147 VBase + $vi, 148 $ti ? (TBase + $ti) : (), 149 ); 150} 151 152use constant CJK_UidFr => 0x4E00; 153use constant CJK_UidTo => 0x9FFF; 154 155sub simple_cjk_deriv { # $u should be CJK U.I. or Ext. 156 my $u = hex shift; 157 my $base = (CJK_UidFr <= $u && $u <= CJK_UidTo) ? 0xFB40 : 0xFB80; 158 my $aaaa = $base + ($u >> 15); 159 my $bbbb = ($u & 0x7FFF) | 0x8000; 160 my @u = $Use4th ? ($u) : (); 161 return ce(0, $aaaa, Min2Wt, Min3Wt, @u).ce(0, $bbbb, 0, 0, @u); 162} 163 164my %Keys; # "0300" => "[.0000.0035.0002.0300]" 165my %Code; # "[.0000.0035.0002.0300]" => "0300" 166my %Name; # "0300" => "COMBINING GRAVE ACCENT" 167my %Equiv; # "[.0000.0035.0002.0300]" => ["0340", "0953"] 168my $vDUCET; # from @version, such as "6.0.0" 169my %JamoWt1; # primary weights of modern jamo (codept => wt) decimal 170my $JamoWt2 = 0; # the maximum secondary weight of modern jamo (decimal) 171my @CompId; # decomp. mapping of CJK compat. id. like ["FA00", 0x5207] 172 173my @OtherEquiv = split /\n=/, <<'OTHEREQUIV'; 174=00C2= (A-circ) 1751EA7;<00E2><0300> 1761EA6;<00C2><0300> 1771EA5;<00E2><0301> 1781EA4;<00C2><0301> 1791EAB;<00E2><0303> 1801EAA;<00C2><0303> 1811EA9;<00E2><0309> 1821EA8;<00C2><0309> 1831EAD;<00E2><0323> 1841EAC;<00C2><0323> 185=00C4= (A-uml) 18601DF;<00E4><0304> 18701DE;<00C4><0304> 188=00C5= (A-ring) 18901FB;<00E5><0301> 19001FA;<00C5><0301> 191=00C6= (AE-lig) 1921D2D;<00C6>+++12 19301FD;<00E6><0301> 19401FC;<00C6><0301> 19501E3;<00E6><0304> 19601E2;<00C6><0304> 197=00CA= (E-circ) 1981EC1;<00EA><0300> 1991EC0;<00CA><0300> 2001EBF;<00EA><0301> 2011EBE;<00CA><0301> 2021EC5;<00EA><0303> 2031EC4;<00CA><0303> 2041EC3;<00EA><0309> 2051EC2;<00CA><0309> 2061EC7;<00EA><0323> 2071EC6;<00CA><0323> 208=00D4= (O-circ) 2091ED3;<00F4><0300> 2101ED2;<00D4><0300> 2111ED1;<00F4><0301> 2121ED0;<00D4><0301> 2131ED7;<00F4><0303> 2141ED6;<00D4><0303> 2151ED5;<00F4><0309> 2161ED4;<00D4><0309> 2171ED9;<00F4><0323> 2181ED8;<00D4><0323> 219=00D5= (O-tilde) 2201E4D;<00F5><0301> 2211E4C;<00D5><0301> 222022D;<00F5><0304> 223022C;<00D5><0304> 2241E4F;<00F5><0308> 2251E4E;<00D5><0308> 2261EE1;<00F5><031B> 2271EE0;<00D5><031B> 228=00D6= (O-uml) 229022B;<00F6><0304> 230022A;<00D6><0304> 231=00D8= (O-slash) 23201FF;<00F8><0301> 23301FE;<00D8><0301> 234=00DC= (U-uml) 23501DC;<00FC><0300> 23601DB;<00DC><0300> 23701D8;<00FC><0301> 23801D7;<00DC><0301> 23901D6;<00FC><0304> 24001D5;<00DC><0304> 24101DA;<00FC><030C> 24201D9;<00DC><030C> 243=0102= (A-breve) 2441EB1;<0103><0300> 2451EB0;<0102><0300> 2461EAF;<0103><0301> 2471EAE;<0102><0301> 2481EB5;<0103><0303> 2491EB4;<0102><0303> 2501EB3;<0103><0309> 2511EB2;<0102><0309> 2521EB7;<0103><0323> 2531EB6;<0102><0323> 254=0186= (open-O) 2551D53;<0186>+++? 256=0190= (open-E) 2572107;<0190>+++? 2581D4B;<0190>+++? 259=01A0= (O-horn) 2601EDD;<01A1><0300> 2611EDC;<01A0><0300> 2621EDB;<01A1><0301> 2631EDA;<01A0><0301> 2641EE1;<01A1><0303> 2651EE0;<01A0><0303> 2661EDF;<01A1><0309> 2671EDE;<01A0><0309> 2681EE3;<01A1><0323> 2691EE2;<01A0><0323> 270=01AF= (U-horn) 2711EEB;<01B0><0300> 2721EEA;<01AF><0300> 2731EE9;<01B0><0301> 2741EE8;<01AF><0301> 2751EEF;<01B0><0303> 2761EEE;<01AF><0303> 2771EED;<01B0><0309> 2781EEC;<01AF><0309> 2791EF1;<01B0><0323> 2801EF0;<01AF><0323> 281=0300= (grave) 28200E0;a<0300> 28300C0;A<0300> 28400E8;e<0300> 28500C8;E<0300> 28600EC;i<0300> 28700CC;I<0300> 28800F2;o<0300> 28900D2;O<0300> 29000F9;u<0300> 29100D9;U<0300> 2921EF3;y<0300> 2931EF2;Y<0300> 294=0301= (acute) 29500E1;a<0301> 29600C1;A<0301> 29700E9;e<0301> 29800C9;E<0301> 29900ED;i<0301> 30000CD;I<0301> 30100F3;o<0301> 30200D3;O<0301> 30300FA;u<0301> 30400DA;U<0301> 30500FD;y<0301> 30600DD;Y<0301> 307=0302= (circum) 30800E2;a<0302> 30900C2;A<0302> 31000EA;e<0302> 31100CA;E<0302> 31200EE;i<0302> 31300CE;I<0302> 31400F4;o<0302> 31500D4;O<0302> 31600FB;u<0302> 31700DB;U<0302> 3180177;y<0302> 3190176;Y<0302> 320=0303= (tilde) 32100E3;a<0303> 32200C3;A<0303> 3231EBD;e<0303> 3241EBC;E<0303> 3250129;i<0303> 3260128;I<0303> 32700F5;o<0303> 32800D5;O<0303> 3290169;u<0303> 3300168;U<0303> 3311EF9;y<0303> 3321EF8;Y<0303> 333=0308= (diaeresis) 33400E4;a<0308> 33500C4;A<0308> 33600EB;e<0308> 33700CB;E<0308> 33800EF;i<0308> 33900CF;I<0308> 34000F6;o<0308> 34100D6;O<0308> 34200FC;u<0308> 34300DC;U<0308> 34400FF;y<0308> 3450178;Y<0308> 346=0309= (hook-above) 3471EA3;a<0309> 3481EA2;A<0309> 3491EBB;e<0309> 3501EBA;E<0309> 3511EC9;i<0309> 3521EC8;I<0309> 3531ECF;o<0309> 3541ECE;O<0309> 3551EE7;u<0309> 3561EE6;U<0309> 3571EF7;y<0309> 3581EF6;Y<0309> 359=0323= (dot-below) 3601EA1;a<0323> 3611EA0;A<0323> 3621EB9;e<0323> 3631EB8;E<0323> 3641ECB;i<0323> 3651ECA;I<0323> 3661ECD;o<0323> 3671ECC;O<0323> 3681EE5;u<0323> 3691EE4;U<0323> 3701EF5;y<0323> 3711EF4;Y<0323> 372=0406= (Cyrillic-Byelorussian-Ukrainian-I) 3730457;<0456><0308> 374A676;<0456>+++2 <0308>+++2 3750407;<0406><0308> 376=041E= (Cyrillic-O) 37704E7;<043E><0308> 37804E6;<041E><0308> 379=0423= (Cyrillic-U) 380045E;<0443><0306> 381040E;<0423><0306> 38204F1;<0443><0308> 38304F0;<0423><0308> 38404F3;<0443><030B> 38504F2;<0423><030B> 38604EF;<0443><0304> 38704EE;<0423><0304> 388=0933= (Devanagari-LLA) 3890934;<0933><093C> 390=0A3C= (Gurmukhi-Nukta) 3910A33;<0A32><0A3C> 3920A36;<0A38><0A3C> 3930A59;<0A16><0A3C> 3940A5A;<0A17><0A3C> 3950A5B;<0A1C><0A3C> 3960A5E;<0A2B><0A3C> 397=1EB8= (E-dot-below) 3981EC7;<1EB9><0302> 3991EC6;<1EB8><0302> 400=1ECC= (O-dot-below) 4011ED9;<1ECD><0302> 4021ED8;<1ECC><0302> 4031EE3;<1ECD><031B> 4041EE2;<1ECC><031B> 405=1EE4= (U-dot-below) 4061EF1;<1EE5><031B> 4071EF0;<1EE4><031B> 408OTHEREQUIV 409 410my %OtherEquiv; 411for my $o (@OtherEquiv) { 412 my @ln = split /\n/, $o; 413 my $uv = shift @ln; 414 $uv =~ s/ *\([a-zA-Z-]+\) *//; 415 $uv =~ tr/=//d; 416 croak "$PACKAGE: $uv is invalid in OTHEREQUIV" if $uv !~ /^[0-9A-F]+\z/; 417 $OtherEquiv{$uv} = \@ln; 418} 419 420##### read DUCET ##### 421 422{ 423 my($f, $fh); 424 foreach my $d ($CUR_DIR) { 425 $f = File::Spec->catfile($d, "Collate", "allkeys.txt"); 426 last if open($fh, $f); 427 $f = undef; 428 } 429 croak "$PACKAGE: Collate/allkeys.txt is not found" if !defined $f; 430 431 while (my $line = <$fh>) { 432 chomp $line; 433 next if $line =~ /^\s*#/; 434 $vDUCET = $1 if $line =~ /^\@version\s*(\S*)/; 435 436 next if $line !~ /^\s*[0-9A-Fa-f]/; 437 438 my $name = ($line =~ s/[#%]\s*(.*)//) ? $1 : ''; 439 440 # gets element 441 my($e, $k) = split /;/, $line; 442 trim($e); 443 trim($k); 444 $name =~ s/; QQ[A-Z]+//; 445 $name =~ s/^ ?\[[0-9A-F]+\] ?//; 446 447 if ($k =~ /\[\.0000\.0000\.0000(\.?0*)\]/) { 448 $Use4th = 1 if $1; 449 $Name{$e} = $name; 450 next; 451 } 452 croak "Wrong Entry: <charList> must be separated by ';' ". 453 "from <collElement>" if ! $k; 454 push @{ $Equiv{$k} }, $e if exists $Code{$k}; 455 456 $Keys{$e} = $k; 457 $Code{$k} = $e if !exists $Code{$k}; 458 $Name{$e} = $name; 459 460 # Hangul Jamo (modern only) 461 if ($e =~ /^11[0-9A-F]{2}\z/) { 462 my @ec = _getHexArray($e); 463 my @kc = _getHexArray($k); 464 if (@ec == 1) { 465 $JamoWt1{$ec[0]} = $kc[0]; # each jamo 466 $JamoWt2 = $kc[1] if $JamoWt2 < $kc[1]; # max 467 } 468 } 469 470 # CJK compatibility ideographs 471 if ($k =~ /^\[\.F[0-9A-F]+\.[0-9A-F]+\.0002[\.\]]/) { 472 my @p = map hex($_), $k =~ /\[\.([0-9A-F]+)\./g; 473 if (@p == 2) { 474 my $ui = ((($p[0] & 0x3F) << 15) | ($p[1] & 0x7FFF)); 475 my $h = sprintf "%04X", $ui; 476 push @CompId, [$e, $ui] if $e ne $h; # should be decomposable 477 } 478 } 479 } 480 close $fh; 481} 482 483# $Code{$k} : precomposed (such as 04D1, CYRILLIC SMALL LETTER A WITH BREVE) 484# $eqs : equivalent sequence (such as <0430><0306>) 485# $starter : starter codepoint (integer such as hex '0430') 486my @Contractions; # store Cyrillic, currently required, and others. 487for my $k (sort keys %Equiv) { 488 if ($Code{$k} !~ / / && $Equiv{$k}[0] =~ / /) { 489 (my $eqs = "<$Equiv{$k}[0]>") =~ s/ /></g; 490 my $starter = $eqs =~ /^<([0-9A-Fa-f]+)>/ ? hex($1) : ''; 491 push @Contractions, [$starter, "$Code{$k};$eqs"]; 492 } 493} 494 495##### Korean.pm ##### 496 497{ 498 my($f, $fh); 499 foreach my $d ($CUR_DIR) { 500 $f = File::Spec->catfile($d, "Collate", "CJK", "Korean.pm"); 501 last if open($fh, $f); 502 $f = undef; 503 } 504 croak "$PACKAGE: Collate/CJK/Korean.pm is not found" if !defined $f; 505 506 my %KO_jamo; # (codepoint => 1) for jamo that a syllable comprises. 507 my $KO_head = ''; # before /^my %jamo2prim/ 508 my $KO_foot = ''; # after /^\); # for DUCET/ 509 while (my $line = <$fh>) { 510 chomp $line; 511 512 # check jamo that a syllable in __DATA__ comprises. 513 if (($line =~ /^__DATA__/) .. ($line =~ /^__END__/)) { 514 if ($line =~ /^[A-D]/) { # Hangul syllable 515 $line =~ s/:.*//; 516 my @u = _getHexArray($line); 517 croak "unexpected $line" if @u != 1; 518 my $uv = $u[0]; 519 croak "unexpected $line" unless SBase <= $uv && $uv <= SFinal; 520 my @dec = decHangul($uv); 521 $KO_jamo{$_} = 1 for @dec; 522 $line .= ':'.join '-', map sprintf('%04X', $_), @dec; 523 } 524 } 525 526 $line .= "\n"; # guarantee the last LF at the file end. 527 $KO_head .= $line if 1 .. ($line =~ /^my %jamo2prim/); 528 $KO_foot .= $line if ($line =~ /^\); # for DUCET/) .. 1; # eof($fh) 529 } 530 close $fh; 531 532 open my $pm, ">Korean.pm" or die 'Korean.pm'; 533 binmode $pm; 534 print $pm $KO_head; 535 536 my $count = 0; 537 for (sort keys %KO_jamo) { 538 print $pm ' ' if ($count % 4) == 0; 539 ++$count; 540 print $pm ' '; 541 print $pm sprintf q!'%04X', 0x%04X,!, $_, $JamoWt1{$_}; 542 print $pm "\n" if ($count % 4) == 0; 543 } 544 print $pm "\n" if $count % 4 != 0; 545 546 $JamoWt2 = sprintf "0x%02X", $JamoWt2; 547 $KO_foot =~ s/(\$wt = )[0-9x]+/$1$JamoWt2/; 548 $KO_foot =~ s/^(\); # for DUCET).*/$1 v$vDUCET/; 549 print $pm $KO_foot; 550 close $pm; 551} 552 553##### Locale/*.pl ##### 554 555my $source = 'data'; 556opendir DIR, $source or croak "no $source"; 557my @txts = grep /^[a-zA-Z]/, readdir DIR; 558closedir DIR; 559 560my $target = 'Locale'; 561mkdir $target; 562for my $txt (@txts) { 563 my %locale_keys; 564 my $txtfile = File::Spec->catfile($source, $txt); 565 my $pl = $txt; 566 $pl =~ s/\.txt\z/.pl/ or croak "$PACKAGE: $source/$txt is not .txt"; 567 my $plfile = File::Spec->catfile($target, $pl); 568 569 open my $fh, $txtfile or croak "$PACKAGE: $source/$txt is not found: $!"; 570 open my $ph, ">$plfile" or croak "$PACKAGE: $target/$pl can't be made: $!"; 571 binmode $ph; 572 573 my $ptxt = ''; 574 my $entry = ''; 575 my $locale_version = $DEFAULT_LOCALE_VERSION; 576 577 while (<$fh>) { 578 chomp; 579 next if /^\s*\z/; 580 if (s/^locale_version//) { 581 $locale_version = $1 if /(\S+)/; 582 next; 583 } 584 if (/^use/) { 585 print $ph "$_;\n"; 586 s/^[^:]+:://; # assume that will be located under ./Collate, 587 # then omit the first identifier (Unicode::). 588 s/\s*\z/.pm/; 589 my $f = File::Spec->catfile($CUR_DIR, split /::/, $_); 590 $f = 'Korean.pm' if /::Korean\.pm/; # using the newer one 591 require "./$f"; # File::Spec->catfile($CUR_DIR, $f) seems not work. 592 next; 593 } 594 if (/^(alternate)\s+(\S+)/) { 595 my $v = "variable"; 596 $ptxt .= " $v => '$2',\n"; 597 $ptxt .= " $1 => '$2',\n"; 598 next; 599 } 600 if (/^backwards$/) { 601 $ptxt .= " backwards => 2,\n"; 602 next; 603 } 604 if (s/^(override)(CJK|Hangul)[ \t]+(?:\\&|)/\\&/) { 605 my $key = $1.$2; 606 $ptxt .= " $key => $_,\n"; 607 $locale_keys{$key} = eval $_; 608 next; 609 } 610 if (/^upper$/) { 611 $ptxt .= " upper_before_lower => 1,\n"; 612 next; 613 } 614 if (s/^suppress//) { 615 s/\s*-\s*/../g; 616 my @c = split; 617 s/(?:0[Xx])?([0-9A-Fa-f]+)/0x$1/g for @c; 618 my $list = join ", ", @c; 619 $ptxt .= " suppress => [$list],\n"; 620 next; 621 } 622 if (/^([\s\-0-9A-Fa-fXx]+)\z/) { # continue the last list 623 s/\s*-\s*/../g; 624 my @c = split; 625 s/(?:0[Xx])?([0-9A-Fa-f]+)/0x$1/g for @c; 626 my $list = join ", ", @c; 627 $ptxt =~ s/\](,$)/$1/; 628 $ptxt .= "\t\t$list],\n"; 629 next; 630 } 631 if (/^\s*(#\s*)/) { 632 $ptxt .= "$_\n" if $1 ne '#'; 633 next; 634 } 635 636 $entry .= parse_entry($_, \%locale_keys); 637 } 638 639 # precomposed chars to be suppressed as additional equivalents 640 if ($ptxt =~ /suppress => \[(.*)\]/s) { 641 my @suplist = eval $1; 642 my %suppressed; 643 @suppressed{@suplist} = (1) x @suplist; 644 645 for my $contract (@Contractions) { 646 my $starter = $contract->[0]; 647 my $addline = $contract->[1]; 648 next if ! $suppressed{$starter}; 649 $entry .= parse_entry($addline, \%locale_keys); 650 } 651 } 652 653 # Compatibility ideographs are tailored as equivalent unified ideographs. 654 if ($locale_keys{$OvCJK}) { 655 for my $c (@CompId) { 656 my $r = $locale_keys{$OvCJK}->($c->[1]); 657 next if !defined $r; 658 my $addline = sprintf '%s;<%04X>', $c->[0], $c->[1]; 659 $entry .= parse_entry($addline, \%locale_keys); 660 } 661 } 662 663 if ($entry) { 664 my $v = $vDUCET ? " # for DUCET v$vDUCET" : ''; 665 $ptxt .= " entry => <<'ENTRY',$v\n"; 666 $ptxt .= $entry; 667 $ptxt .= "ENTRY\n"; 668 } 669 670 my $lv = " locale_version => $locale_version,\n"; 671 print $ph "+{\n$lv$ptxt};\n"; 672 close $fh; 673 close $ph; 674} 675 676sub parse_entry { 677 my $line = shift; 678 my $lockeys = shift; 679 680 my($e,$rule) = split_e_rule($line); 681 my $name = getname($e); 682 my $eq_rule = $rule eq '='; 683 $rule = join '', map "<$_>", split ' ', $e if $eq_rule; 684 my ($newce, $simpdec) = parse_rule($e, $rule, $lockeys); 685 686 my $newentry = ''; 687 688 if (!$lockeys->{$e}) { 689 $newentry .= sprintf $ENT_FMT, $e, $newce, $name if !$eq_rule; 690 $lockeys->{$e} = $newce; 691 } else { 692 $newentry .= "# already tailored: $_\n"; 693 } 694 695 if (!$simpdec && $Keys{$e}) { # duplicate for the decomposition 696 my $key = $Keys{$e}; 697 my @ce = $key =~ /$RE_CE/go; 698 if (@ce > 1) { 699 my $ok = 1; 700 my $ee = ''; 701 for my $c (@ce) { 702 $ok = 0, last if !$Code{$c}; 703 $ee .= ' ' if $ee ne ''; 704 $ee .= $Code{$c}; 705 } 706 if ($ok && !$lockeys->{$ee}) { 707 $newentry .= sprintf $ENT_FMT, $ee, $newce, $name; 708 $lockeys->{$ee} = $newce; 709 } 710 if ($ee =~ s/ 030([01])/ 034$1/ && 711 $ok && !$lockeys->{$ee}) { 712 $newentry .= sprintf $ENT_FMT, $ee, $newce, $name; 713 $lockeys->{$ee} = $newce; 714 } 715 } 716 if ($Equiv{$key}) { 717 for my $eq (@{ $Equiv{$key} }) { 718 next if $key =~ /^\[\.0000\.[^]]+\]\z/; # primary ignorable 719 next if $lockeys->{$eq}; 720 next if $eq eq '3038'; # 3038 is identical to 2F17 in DUCET, 721 # but not canonical equivalent. 722 $newentry .= sprintf $ENT_FMT, $eq, $newce, $Name{$eq}; 723 $lockeys->{$eq} = $newce; 724 } 725 } 726 } 727 728 if ($OtherEquiv{$e}) { 729 for my $o (@{ $OtherEquiv{$e} }) { 730 my($e,$rule) = split_e_rule($o); 731 my $name = getname($e); 732 (my $newce, undef) = parse_rule($e, $rule, $lockeys); 733 next if $lockeys->{$e}; 734 $newentry .= sprintf $ENT_FMT, $e, $newce, $name; 735 $lockeys->{$e} = $newce; 736 } 737 } 738 739 return $newentry; 740} 741 742sub getunicode { 743 return join ' ', map { sprintf '%04X', $_ } unpack 'U*', shift; 744} 745 746sub parse_element { 747 my $e = shift; 748 $e =~ s/\{([A-Za-z']+)\}/' '.getunicode($1).' '/ge; 749 $e =~ s/ +/ /g; 750 trim($e); 751 return $e; 752} 753 754sub split_e_rule { 755 my $line = shift; 756 my($e, $r) = split /;/, $line; 757 return (parse_element($e), $r); 758} 759 760sub getname { 761 my $e = shift; 762 return $Name{$e} if $Name{$e}; # single collation element (without <>) 763 my @e = split ' ', $e; 764 my @name = map { $Name{$_} ? $Name{$_} : 765 /^FD[DE][0-9A-F]\z/ ? "noncharacter-$_" : 766 'unknown' } @e; 767 return sprintf '<%s>', join ', ', @name; 768} 769 770sub parse_rule { 771 my $e = shift; 772 my $e1 = $e =~ /^([0-9A-F]+)/ ? $1 : ''; 773 my $rule = shift; 774 my $lockeys = shift; 775 my $result = ''; 776 my $simple_decomp = 1; # rules containing only [A-Za-z'"] or <XXXX> 777 778 for (my $prerule = $rule; $rule ne ''; $prerule = $rule) { 779 $rule =~ s/^ +//; 780 last if $rule =~ /^#/; 781 if ($rule =~ s/^($RE_CE)//o) { 782 my $k = $1; 783 my $var = $k =~ /^\[\*/ ? 1 : 0; 784 my @c = _getHexArray($k); 785 @c = @c[0..2] if !$Use4th; 786 $result .= ce($var, @c); 787 next; 788 } 789 790 if ($rule =~ s/^(<([0-9A-F ]+)>\+\+\+\?)//) { 791 my $cr = $1; 792 my @c = split ' ', $2; 793 my $compat = $Keys{$e}; 794 my $decomp = join '', map { 795 $Keys{$_} ? $Keys{$_} : simple_cjk_deriv($_) 796 } @c; 797 my $regexp = $decomp; 798 $regexp =~ s/([\[\]\.\*])/\\$1/g; 799 $regexp =~ s/\.00(?:0[1-9A-F]|1[0-9A-F])(?:\\\.[0-9A-F]+|)\\\] 800 /.(00(?:0[1-9A-F]|1[0-9A-F]))(?:\\.[0-9A-F]+|)\\\]/gx; 801 # tertiary weights of 01-1F (excluding 00) 802 my @tD = map hex($_), $decomp =~ /^$regexp\z/; 803 my @tC = map hex($_), $compat =~ /^$regexp\z/; 804 croak "wrong at $cr" unless @c == @tD && @c == @tC; 805 my $r = join ' ', map "<$c[$_]>+++".($tC[$_] - $tD[$_]), 0..@c-1; 806 $rule = $r.$rule; 807 next; 808 } 809 810 my $key; 811 if ($rule =~ s/^(<[0-9A-Za-z'{ }]+>|[A-Za-z'"])//) { 812 my $e = $1; 813 my $c = $e =~ tr/<>//d ? parse_element($e) : getunicode($e); 814 croak "<$c> is too short" if 4 > length $c; 815 $key = $lockeys->{$c} || $Keys{$c}; 816 if (!defined $key) { 817 my $u = hex $c; 818 my @u = $Use4th ? ($u) : (); 819 my @r; 820 if (SBase <= $u && $u <= SFinal) { 821 @r = $lockeys->{$OvHang}->($u) if $lockeys->{$OvHang}; 822 } else { 823 # but no check if $u is in CJK ideographs 824 @r = $lockeys->{$OvCJK} ->($u) if $lockeys->{$OvCJK}; 825 } 826 if (@r) { 827 $key = join '', map { 828 ref $_ ? ce(0, @$_) : ce(0, $_, Min2Wt, Min3Wt, @u) 829 } @r; 830 } 831 } 832 } 833 834 my @base; 835 for my $k ($key =~ /$RE_CE/go) { 836 my $var = $k =~ /^\[\*/ ? 1 : 0; 837 push @base, [$var, _getHexArray($k)]; 838 } 839 croak "the rule seems wrong at $prerule" if !@base; 840 841 my $replaced = 0; 842 while ($rule =~ s/^(([+-])\2*)(\d+)//) { 843 my $idx = length($1); 844 my $num = $2 eq '-' ? -$3 : $3; 845 $base[0][$idx] += $num; 846 ++$replaced; 847 } 848 849 $simple_decomp = 0 if $replaced; 850 for my $c (@base) { 851 $c->[4] = hex $e1 if $replaced && $Use4th; 852 $result .= ce(@$c); 853 } 854 croak "something wrong at $rule" if $prerule eq $rule; 855 } 856 return($result, $simple_decomp); 857} 858