#!/usr/local/bin/perl -wC

# SPDX-License-Identifier: BSD-2-Clause
#
# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
# Copyright 2015 John Marino <draco@marino.st>
# Copyright 2020 Hiroki Sato <hrs@FreeBSD.org>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#

use strict;
use File::Copy;
use XML::Parser;
use Tie::IxHash;
use Text::Iconv;
#use Data::Dumper;
use Getopt::Long;
use Digest::SHA qw(sha1_hex);
require "charmaps.pm";

# NOTE: this file intentionally has no "use utf8"; the non-ASCII string
# literals in callback_ampm() are used exactly as written in the source.

my $USAGE = "Usage: $0 --unidir=<unidir> --etc=<etcdir> --type=<type>\n";

if ($#ARGV < 2) {
	print $USAGE;
	exit(1);
}

my $DEFENCODING = "UTF-8";

my $UNIDIR = undef;
my $ETCDIR = undef;
my $TYPE = undef;

my $CLDR_VERSION = undef;

GetOptions (
	"unidir=s"	=> \$UNIDIR,
	"etc=s"		=> \$ETCDIR,
	"type=s"	=> \$TYPE,
);

# All three options are required; without them every path built below
# would contain an undefined value.
if (!defined $UNIDIR || !defined $ETCDIR || !defined $TYPE) {
	print $USAGE;
	exit(1);
}

# NOTE(review): %convertors is only ever read in this file; nothing visible
# here fills it.  Confirm where it is expected to be populated.
my %convertors = ();

my %ucd = ();
my %values = ();
my %hashtable = ();
my %languages = ();
my %translations = ();
my %alternativemonths = ();
get_languages();

# Character-name => code maps, keyed by encoding name.
my %utfmap = ();
$utfmap{'UTF-8'} = {};
$utfmap{'UTF-32'} = {};
get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'});
get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'});

# Both hashes are tied to Tie::IxHash so that keys() returns entries in
# insertion order.
my %keys = ();
tie(%keys, "Tie::IxHash");
tie(%hashtable, "Tie::IxHash");

# --type value => locale category file name written into the Makefile.
my %FILESNAMES = (
	"monetdef"	=> "LC_MONETARY",
	"timedef"	=> "LC_TIME",
	"msgdef"	=> "LC_MESSAGES",
	"numericdef"	=> "LC_NUMERIC",
	"colldef"	=> "LC_COLLATE",
	"ctypedef"	=> "LC_CTYPE"
);

die "Unknown type '$TYPE'\n" if (!exists $FILESNAMES{$TYPE});

# Callback name (as used in the "<name<source<format" key specs below)
# => code reference.  $callback{data} carries the current locale context
# (l, f, c, k, e) while a callback runs; see print_fields().
my %callback = (
	mdorder		=> \&callback_mdorder,
	altmon		=> \&callback_altmon,
	cformat		=> \&callback_cformat,
	dformat		=> \&callback_dformat,
	dtformat	=> \&callback_dtformat,
	cbabmon		=> \&callback_abmon,
	cbampm		=> \&callback_ampm,
	data		=> undef,
);

# Field name => description emitted as a comment above each value.
my %DESC = (

	# numericdef
	"decimal_point"	=> "decimal_point",
	"thousands_sep"	=> "thousands_sep",
	"grouping"	=> "grouping",

	# monetdef
	"int_curr_symbol"	=> "int_curr_symbol (last character always " .
				   "SPACE)",
	"currency_symbol"	=> "currency_symbol",
	"mon_decimal_point"	=> "mon_decimal_point",
	"mon_thousands_sep"	=> "mon_thousands_sep",
	"mon_grouping"		=> "mon_grouping",
	"positive_sign"		=> "positive_sign",
	"negative_sign"		=> "negative_sign",
	"int_frac_digits"	=> "int_frac_digits",
	"frac_digits"		=> "frac_digits",
	"p_cs_precedes"		=> "p_cs_precedes",
	"p_sep_by_space"	=> "p_sep_by_space",
	"n_cs_precedes"		=> "n_cs_precedes",
	"n_sep_by_space"	=> "n_sep_by_space",
	"p_sign_posn"		=> "p_sign_posn",
	"n_sign_posn"		=> "n_sign_posn",

	# msgdef
	"yesexpr"	=> "yesexpr",
	"noexpr"	=> "noexpr",
	"yesstr"	=> "yesstr",
	"nostr"		=> "nostr",

	# timedef
	"abmon"		=> "Short month names",
	"mon"		=> "Long month names (as in a date)",
	"abday"		=> "Short weekday names",
	"day"		=> "Long weekday names",
	"t_fmt"		=> "X_fmt",
	"d_fmt"		=> "x_fmt",
	"c_fmt"		=> "c_fmt",
	"am_pm"		=> "AM/PM",
	"d_t_fmt"	=> "date_fmt",
	"altmon"	=> "Long month names (without case ending)",
	"md_order"	=> "md_order",
	"t_fmt_ampm"	=> "ampm_fmt",
);

#
# Main dispatch.  For the field-based types, %keys maps each field to a
# format spec understood by print_fields():
#   "i" / "ai"  value copied through unchanged
#   "s"         string whose <NAME> tokens are decoded
#   "as"        ';'-separated list of such strings
#   "<cb<src<fmt"  run callback "cb" on the value of field "src", then
#                  treat the result as format "fmt"
#
if ($TYPE eq "colldef") {
	transform_collation();
	make_makefile();
}

if ($TYPE eq "ctypedef") {
	transform_ctypes();
	make_makefile();
}

if ($TYPE eq "numericdef") {
	%keys = (
		"decimal_point"	=> "s",
		"thousands_sep"	=> "s",
		"grouping"	=> "ai",
	);
	get_fields();
	print_fields();
	make_makefile();
}

if ($TYPE eq "monetdef") {
	%keys = (
		"int_curr_symbol"	=> "s",
		"currency_symbol"	=> "s",
		"mon_decimal_point"	=> "s",
		"mon_thousands_sep"	=> "s",
		"mon_grouping"		=> "ai",
		"positive_sign"		=> "s",
		"negative_sign"		=> "s",
		"int_frac_digits"	=> "i",
		"frac_digits"		=> "i",
		"p_cs_precedes"		=> "i",
		"p_sep_by_space"	=> "i",
		"n_cs_precedes"		=> "i",
		"n_sep_by_space"	=> "i",
		"p_sign_posn"		=> "i",
		"n_sign_posn"		=> "i"
	);
	get_fields();
	print_fields();
	make_makefile();
}

if ($TYPE eq "msgdef") {
	%keys = (
		"yesexpr"	=> "s",
		"noexpr"	=> "s",
		"yesstr"	=> "s",
		"nostr"		=> "s"
	);
	get_fields();
	print_fields();
	make_makefile();
}

if ($TYPE eq "timedef") {
	%keys = (
		"abmon"		=> "<cbabmon<abmon<as",
		"mon"		=> "as",
		"abday"		=> "as",
		"day"		=> "as",
		"t_fmt"		=> "s",
		"d_fmt"		=> "<dformat<d_fmt<s",
		"c_fmt"		=> "<cformat<d_t_fmt<s",
		"am_pm"		=> "<cbampm<am_pm<as",
		"d_t_fmt"	=> "<dtformat<d_t_fmt<s",
		"altmon"	=> "<altmon<mon<as",
		"md_order"	=> "<mdorder<d_fmt<s",
		"t_fmt_ampm"	=> "s",
	);
	get_fields();
	print_fields();
	make_makefile();
}

############################
# Callbacks.  Each receives the source field value as its only argument
# and returns the replacement value; the locale being processed is read
# from $callback{data}.

# am_pm: replace the value for ru_RU with a fixed pair, converted with
# Text::Iconv when the target encoding is not UTF-8.
sub callback_ampm {
	my $s = shift;
	my $nl = $callback{data}{l} . "_" . $callback{data}{c};
	my $enc = $callback{data}{e};

	if ($nl eq 'ru_RU') {
		if ($enc eq 'UTF-8') {
			$s = 'дп;пп';
		} else {
			my $converter = Text::Iconv->new("utf-8", "$enc");
			$s = $converter->convert("дп;пп");
		}
	}
	return $s;
}

# c_fmt: derived from d_t_fmt; drops %Z/%z and inserts the weekday (%A)
# for the patterns matched below.
sub callback_cformat {
	my $s = shift;
	my $nl = $callback{data}{l} . "_" . $callback{data}{c};

	if ($nl eq 'ko_KR') {
		$s =~ s/(> )(%p)/$1%A $2/;
	}
	$s =~ s/\.,/\./;
	$s =~ s/ %Z//;
	$s =~ s/ %z//;
	$s =~ s/^"%e\./%A %e/;
	$s =~ s/^"(%B %e, )/"%A, $1/;
	$s =~ s/^"(%e %B )/"%A $1/;
	return $s;
}

# d_fmt: replace %e by %d when it is adjacent to %m with a '/', '-' or
# '.' separator.
sub callback_dformat {
	my $s = shift;

	$s =~ s/(%m(<SOLIDUS>|[-.]))%e/$1%d/;
	$s =~ s/%e((<SOLIDUS>|[-.])%m)/%d$1/;
	return $s;
}

# d_t_fmt: per-locale tweaks for ja_JP, ko_KR, zh_CN and zh_TW, then the
# same weekday insertions as callback_cformat.
sub callback_dtformat {
	my $s = shift;
	my $nl = $callback{data}{l} . "_" . $callback{data}{c};

	if ($nl eq 'ja_JP') {
		$s =~ s/(> )(%H)/$1%A $2/;
	} elsif ($nl eq 'ko_KR' || $nl eq 'zh_CN' || $nl eq 'zh_TW') {
		if ($nl ne 'ko_KR') {
			$s =~ s/%m/%_m/;
		}
		$s =~ s/(> )(%p)/$1%A $2/;
	}
	$s =~ s/\.,/\./;
	$s =~ s/^"%e\./%A %e/;
	$s =~ s/^"(%B %e, )/"%A, $1/;
	$s =~ s/^"(%e %B )/"%A $1/;
	return $s;
}

# md_order: reduce d_fmt to its d/e/m letters, mapping e to d.
# Returns undef when there is no d_fmt value.
sub callback_mdorder {
	my $s = shift;
	return undef if (!defined $s);
	$s =~ s/[^dem]//g;
	$s =~ s/e/d/g;
	return $s;
}

# altmon: if the language/country is known in %alternativemonths then
# return that (each name trimmed), otherwise repeat mon.
sub callback_altmon {
	my $s = shift;

	my $alt = $alternativemonths{$callback{data}{l}}{$callback{data}{c}};
	if (defined $alt) {
		my @cleaned;
		foreach my $name (split(";", $alt)) {
			$name =~ s/^\s+//;
			$name =~ s/\s+$//;
			push @cleaned, $name;
		}
		return join(";", @cleaned);
	}

	return $s;
}

# abmon: for specified CJK locales, pad result with a space to enable
# columns to line up (style established in FreeBSD in 2001).
# Names starting with a single digit 1-9 get the padding; 10, 11 and 12
# do not.
sub callback_abmon {
	my $s = shift;
	my $nl = $callback{data}{l} . "_" . $callback{data}{c};

	if ($nl eq 'ja_JP' || $nl eq 'ko_KR' || $nl eq 'zh_CN' ||
	    $nl eq 'zh_HK' || $nl eq 'zh_TW') {
		my @cleaned;
		foreach my $name (split(";", $s)) {
			if ($name =~ /^"<(two|three|four|five|six|seven|eight|nine)>/ ||
			    ($name =~ /^"<one>/ &&
			     $name !~ /^"<one>(<zero>|<one>|<two>)/)) {
				$name =~ s/^"/"<space>/;
			}
			push @cleaned, $name;
		}
		return join(";", @cleaned);
	}
	return $s;
}

############################

# Read the CHARMAP section of charmap file $file into hash ref $db as
# symbolic name => code string (with every "\x" removed).
# An unreadable file is reported on STDERR and leaves $db untouched.
sub get_utfmap {
	my ($file, $db) = @_;

	my $fin;
	if (!open($fin, "<", $file)) {
		print STDERR "Cannot open $file: $!\n";
		return;
	}
	my @lines = <$fin>;
	close($fin);
	chomp(@lines);

	my $incharmap = 0;
	foreach my $l (@lines) {
		next if ($l =~ /^\#/);
		next if ($l eq "");

		if ($l eq "CHARMAP") {
			$incharmap = 1;
			next;
		}

		next if (!$incharmap);
		last if ($l eq "END CHARMAP");

		# Skip anything that is not "<NAME> value" instead of
		# reusing the captures of an earlier line.
		next if ($l !~ /^<([^\s]+)>\s+(.*)/);
		my ($k, $v) = ($1, $2);
		$v =~ s/\\x//g;		# UTF-8 char code
		$db->{$k} = $v;
	}
}

# Concatenate the '+'-separated hex terms of the argument, dropping any
# leading "0x"/"0X" from each term.
sub resolve_enc_addition {
	my $ret = '';

	foreach my $t (split(/\+/, $_[0])) {
		$t =~ s/^0[xX]//;
		$ret .= $t;
	}
	return $ret;
}

# Load %languages, %translations and %alternativemonths from the data
# returned by get_xmldata() (provided by charmaps.pm).
sub get_languages {
	my %data = get_xmldata($ETCDIR);
	%languages = %{$data{L}};
	%translations = %{$data{T}};
	%alternativemonths = %{$data{AM}};
}

# Generate the LC_CTYPE sources in $TYPE.draft/ for every locale and
# encoding in %languages, recording a SHA-1 of each in %hashtable so
# that make_makefile() can link identical files.
sub transform_ctypes {
	# Add the C.UTF-8
	$languages{"C"}{"x"}{data}{"x"}{$DEFENCODING} = undef;

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file = $l;
		$file .= "_" . $f if ($f ne "x");
		$file .= "_" . $c if ($c ne "x");
		my $actfile = $file;

		# The UTF-8 variant of every locale is a copy of the common
		# xx_Comm_C.UTF-8 source.
		my $filename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
		if (! -f $filename) {
			print STDERR "Cannot open $filename\n";
			next;
		}
		my $fin;
		if (!open($fin, "<", $filename)) {
			print STDERR "Cannot open $filename: $!\n";
			next;
		}
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines = <$fin>;
		close($fin);

		my $shex = sha1_hex(join("\n", @lines));
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
		$hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;

		my $outname = "$TYPE.draft/$actfile.$DEFENCODING.src";
		open(my $fout, ">", $outname)
		    or die "Cannot create $outname: $!\n";
		print {$fout} @lines;
		close($fout);

		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			next if ($enc eq $DEFENCODING);
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
			if ($file eq 'ja_JP') {
				# Override $filename for ja_JP because
				# its CTYPE is not compatible with UTF-8.
				$filename = "$UNIDIR/posix/$file.eucJP.src";
			}
			if (! -f $filename) {
				print STDERR "Cannot open $filename\n";
				next;
			}
			my $efin;
			if (!open($efin, "<", $filename)) {
				print STDERR "Cannot open $filename: $!\n";
				next;
			}
			# Keep only the comment/escape char declarations and
			# the LC_CTYPE section.
			@lines = ();
			while (<$efin>) {
				if ((/^comment_char\s/) || (/^escape_char\s/)) {
					push @lines, $_;
				}
				if (/^LC_CTYPE/../^END LC_CTYPE/) {
					push @lines, $_;
				}
			}
			close($efin);

			# The encoding name is mixed into the digest, so two
			# encodings only share a hash if both match.
			my $uhex = sha1_hex(join("\n", @lines) . $enc);
			$languages{$l}{$f}{data}{$c}{$enc} = $uhex;
			$hashtable{$uhex}{"${l}_${f}_${c}.$enc"} = 1;

			$outname = "$TYPE.draft/$actfile.$enc.src";
			open(my $efout, ">", $outname)
			    or die "Cannot create $outname: $!\n";
			print {$efout} <<EOF;
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
			print {$efout} @lines;
			close($efout);
		}
	}
	}
	}
}

# Generate the LC_COLLATE sources in $TYPE.draft/.  The UTF-8 file is
# extracted from the locale source; each other encoding is derived from
# that UTF-8 draft by dropping lines whose characters have no entry in
# %convertors for the encoding.  Also reads $CLDR_VERSION.
sub transform_collation {
	# Read the CLDR version
	open(my $vfh, "<", "$UNIDIR/cldr-version")
	    or die "Cannot open cldr-version";
	read $vfh, $CLDR_VERSION, -s $vfh;
	close($vfh);
	$CLDR_VERSION =~ s/\s*$//;

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file;
		$file = $l . "_";
		$file .= $f . "_" if ($f ne "x");
		$file .= $c;
		my $actfile = $file;

		# Search order: $UNIDIR/posix, then $ETCDIR, then the
		# configured fallback locale in $UNIDIR/posix.
		my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		$filename = "$ETCDIR/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename
		    && defined $languages{$l}{$f}{fallback}) {
			$file = $languages{$l}{$f}{fallback};
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		}
		$filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename) {
			print STDERR
			    "Cannot open $file.$DEFENCODING.src or fallback\n";
			next;
		}
		my $fin;
		if (!open($fin, "<", $filename)) {
			print STDERR "Cannot open $filename: $!\n";
			next;
		}
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines;
		while (<$fin>) {
			if ((/^comment_char\s/) || (/^escape_char\s/)) {
				push @lines, $_;
			}
			if (/^LC_COLLATE/../^END LC_COLLATE/) {
				$_ =~ s/[ ]+/ /g;
				push @lines, $_;
			}
		}
		close($fin);

		my $shex = sha1_hex(join("\n", @lines));
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
		$hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;

		my $utfname = "$TYPE.draft/$actfile.$DEFENCODING.src";
		open(my $fout, ">", $utfname)
		    or die "Cannot create $utfname: $!\n";
		print {$fout} <<EOF;
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
		print {$fout} @lines;
		close($fout);

		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			next if ($enc eq $DEFENCODING);

			my $encname = "$TYPE.draft/$actfile.$enc.src";
			open(my $ein, "<", $utfname)
			    or die "Cannot open $utfname: $!\n";
			open(my $eout, ">", $encname)
			    or die "Cannot create $encname: $!\n";
			my $order_start = 0;
			my $print_p = 0;
			#
			# %c_elem: collation elements
			#
			# undef: not defined
			# 1: defined
			# 2: invalid in this encoding
			#
			my %c_elem = ();
			while (<$ein>) {	# XXX: this loop should be refactored.
				chomp;
				$print_p = 1;
				if ($order_start) {
					$order_start = 0 if (m/^order_end/);
					if (m/^<([^>]+)>/) {
						if (not defined $c_elem{$1}) {
							# A plain character: keep it only
							# if the encoding can express it.
							my $u32 = $utfmap{'UTF-32'}->{$1};
							die "order, $1\n" if (not defined $u32);
							if (not defined $convertors{$enc}{$u32}) {
								$print_p = 0;
							}
						} elsif ($c_elem{$1} == 2) {
							$print_p = 0;
						}
					}
				} elsif (m/^collating-element/) {
					my ($elem, $from);
					if (m/<([^>]+)> from (.+)/) {
						($elem, $from) = ($1, $2);
					}
					# Every component character must exist in
					# the encoding, else the element is invalid.
					while ($print_p and
					    defined $from and
					    $from =~ m/<([^>]+)>/g) {
						my $u32 = $utfmap{'UTF-32'}->{$1};
						die "collating-element, $1\n" if (not defined $u32);
						if (not $convertors{$enc}{$u32}) {
							$print_p = 0;
							$c_elem{$elem} = 2;
						}
					}
					if ($print_p and defined $elem) {
						$c_elem{$elem} = 1;
					}
				} elsif (m/^collating-symbol <([^>]+)>/) {
					$c_elem{$1} = 1;
				} elsif (m/^order_start/) {
					$order_start = 1;
				}
				print {$eout} $_, "\n" if ($print_p);
			}
			close($eout);
			close($ein);
			# The UTF-8 digest is recorded for every encoding.
			$languages{$l}{$f}{data}{$c}{$enc} = $shex;
			$hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1;
		}
	}
	}
	}
}

# For every locale, read its UTF-8 source file and store the raw value
# of each field named in %keys into $values{$l}{$f}{$c}{$field},
# joining continuation lines (lines ending in '/').
sub get_fields {
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);

		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file;
		$file = $l . "_";
		$file .= $f . "_" if ($f ne "x");
		$file .= $c;

		# Same search order as transform_collation().
		my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		$filename = "$ETCDIR/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename
		    && defined $languages{$l}{$f}{fallback}) {
			$file = $languages{$l}{$f}{fallback};
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		}
		$filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename) {
			print STDERR
			    "Cannot open $file.$DEFENCODING.src or fallback\n";
			next;
		}
		my $fin;
		if (!open($fin, "<", $filename)) {
			print STDERR "Cannot open $filename: $!\n";
			next;
		}
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines = <$fin>;
		chomp(@lines);
		close($fin);

		my $continue = 0;
		foreach my $k (keys(%keys)) {
			# $line aliases the element of @lines, so the edits
			# below persist for the following keys.
			foreach my $line (@lines) {
				$line =~ s/\r//;
				next if (!$continue && $line !~ /^$k\s/);
				if ($continue) {
					$line =~ s/^\s+//;
				} else {
					$line =~ s/^$k\s+//;
				}

				$values{$l}{$f}{$c}{$k} = ""
				    if (!defined $values{$l}{$f}{$c}{$k});

				# A trailing '/' continues on the next line.
				$continue = ($line =~ /\/$/);
				$line =~ s/\/$// if ($continue);

				$values{$l}{$f}{$c}{$k} .= $line;

				last if (!$continue);
			}
		}
	}
	}
	}
}

# Convert symbolic character name $s into its byte sequence in
# encoding $e.  Dies when no mapping exists or the code is longer than
# three bytes; never returns undef.
sub decodecldr {
	my $e = shift;
	my $s = shift;

	my $v = undef;

	if ($e eq "UTF-8") {
		#
		# Conversion to UTF-8 can be done from the Unicode name to
		# the UTF-8 character code.
		#
		$v = $utfmap{'UTF-8'}->{$s};
		die "Cannot convert $s in $e (charmap)" if (!defined $v);
	} else {
		#
		# Conversion to these encodings can be done from the Unicode
		# name to Unicode code to the encodings code.
		#
		# hex - hex or string attr
		# unicode - unicode attr
		# ucc - ucc attr
		my $hex = $translations{$e}{$s}{hex};
		my $ucc = $utfmap{'UTF-32'}->{$s};
		my $ucc_attr = $translations{$e}{$s}{ucc};
		my $unicode = $translations{$e}{$s}{unicode};

		if (defined $hex) {		# hex is in local encoding
			$v = $hex;
		} elsif (defined $unicode) {	# unicode is in name
			$v = $convertors{$e}{$utfmap{'UTF-32'}->{$unicode}};
		} elsif (defined $ucc_attr) {	# ucc is in code point
			# An explicit ucc attribute overrides the UTF-32 map.
			# normalize
			$ucc_attr = sprintf("%08X", hex($ucc_attr));
			$v = $convertors{$e}{$ucc_attr};
		} elsif (defined $ucc) {
			# normalize
			$ucc = sprintf("%08X", hex($ucc));
			$v = $convertors{$e}{$ucc};
		}
		die "Cannot convert $s in $e" if (!defined $v);
	}

	# XXX: length = 8 is not supported yet.
	$v =~ s/^[0]+//g;
	$v = "0" . $v if (length($v) % 2);
	return pack("C", hex($v)) if (length($v) == 2);
	return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)))
	    if (length($v) == 4);
	return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)),
	    hex(substr($v, 4, 2))) if (length($v) == 6);
	die "Cannot convert $s in $e (length = " . length($v) . ")\n";
}

# Return the %translations entry for name $v in encoding $enc, or undef.
sub translate {
	my $enc = shift;
	my $v = shift;

	return $translations{$enc}{$v} if (defined $translations{$enc}{$v});
	return undef;
}

# Write $TYPE.draft/<locale>.<enc>.src for every locale/encoding from
# the values collected by get_fields(), applying the per-field format
# and callbacks from %keys.  A file with conversion problems is renamed
# to .failed instead of .src.
sub print_fields {
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			if ($languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
				print "Skipping ${l}_" .
				    ($f eq "x" ? "" : "${f}_") .
				    "${c} - not read\n";
				next;
			}
			my $file = $l;
			$file .= "_" . $f if ($f ne "x");
			$file .= "_" . $c;
			print "Writing to $file in $enc\n";

			if ($enc ne $DEFENCODING &&
			    !defined $convertors{$enc}) {
				print "Failed! Cannot convert to $enc.\n";
				next;
			}

			my $newname = "$TYPE.draft/$file.$enc.new";
			open(my $fout, ">", $newname)
			    or die "Cannot create $newname: $!\n";
			my $okay = 1;
			my $output = "";
			print {$fout} <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
			foreach my $k (keys(%keys)) {
				my $g = $keys{$k};

				die("Unknown $k in \%DESC")
				    if (!defined $DESC{$k});

				$output .= "#\n# $DESC{$k}\n";

				# Replace one row with another
				if ($g =~ /^>/) {
					$k = substr($g, 1);
					$g = $keys{$k};
				}

				# Callback function: "<name<source<format"
				if ($g =~ /^\</) {
					$callback{data}{c} = $c;
					$callback{data}{k} = $k;
					$callback{data}{f} = $f;
					$callback{data}{l} = $l;
					$callback{data}{e} = $enc;
					my @a = split(/\</, substr($g, 1));
					my $rv = $callback{$a[0]}->(
					    $values{$l}{$f}{$c}{$a[1]});
					# The result replaces the stored value,
					# so it is also what later encodings
					# and dependent fields see.
					$values{$l}{$f}{$c}{$k} = $rv;
					$g = $a[2];
					$callback{data} = undef;
				}

				my $v = $values{$l}{$f}{$c}{$k};
				$v = "undef" if (!defined $v);

				if ($g eq "i" || $g eq "ai") {
					$output .= "$v\n";
					next;
				}
				if ($g eq "s") {
					$v =~ s/^"//;
					$v =~ s/"$//;
					# Replace <NAME> tokens left to right.
					while ($v =~ /^(.*?)<(.*?)>(.*)/) {
						my $p1 = $1;
						my $cm = $2;
						my $p3 = $3;

						my $rv = decodecldr($enc, $cm);
						if (!defined $rv) {
							print STDERR
"Could not convert $k ($cm) from $DEFENCODING to $enc\n";
							$okay = 0;
							# $v is unchanged, so
							# retrying would never
							# terminate.
							last;
						}

						$v = $p1 . $rv . $p3;
					}
					$output .= "$v\n";
					next;
				}
				if ($g eq "as") {
					foreach my $item (split(/;/, $v)) {
						$item =~ s/^"//;
						$item =~ s/"$//;
						while ($item =~ /^(.*?)<(.*?)>(.*)/) {
							my $p1 = $1;
							my $cm = $2;
							my $p3 = $3;

							my $rv = decodecldr($enc, $cm);
							if (!defined $rv) {
								print STDERR
"Could not convert $k ($cm) from $DEFENCODING to $enc\n";
								$okay = 0;
								last;
							}

							$item = $p1 . $rv . $p3;
						}
						$output .= "$item\n";
					}
					next;
				}

				die("$k is '$g'");
			}

			# The header above is not part of the digest.
			my $digest = sha1_hex($output);
			$languages{$l}{$f}{data}{$c}{$enc} = $digest;
			$hashtable{$digest}{"${l}_${f}_${c}.$enc"} = 1;
			print {$fout} "$output# EOF\n";
			close($fout);

			if ($okay) {
				rename($newname,
				    "$TYPE.draft/$file.$enc.src");
			} else {
				rename($newname,
				    "$TYPE.draft/$file.$enc.failed");
			}
		}
	}
	}
	}
}

# Write $TYPE.draft/Makefile: one LOCALES+= line per generated source
# and one SAME+= line for every file whose digest in %hashtable matches
# an earlier one, plus the legacy links from %languages.
sub make_makefile {
	print "Creating Makefile for $TYPE\n";
	my $SRCOUT;
	my $SRCOUT2;
	my $SRCOUT3 = "";
	my $SRCOUT4 = "";
	my $MAPLOC;
	if ($TYPE eq "colldef") {
		# In future, we might want to try to put the CLDR version into
		# the .src files with some new syntax, instead of the makefile.
		$SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U " .
			"-i \${.IMPSRC} \\\n" .
			"\t-V \${CLDR_VERSION} \\\n" .
			"\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} " .
			"\${.OBJDIR}/\${.IMPSRC:T:R}";
		$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
				"locale/etc/final-maps\n";
		$SRCOUT2 = "LC_COLLATE";
		$SRCOUT3 = "" .
			".for f t in \${LOCALES_MAPPED}\n" .
			"FILES+=\t\$t.LC_COLLATE\n" .
			"FILESDIR_\$t.LC_COLLATE=\t\${LOCALEDIR}/\$t\n" .
			"\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" .
			"\tlocaledef \${LOCALEDEF_ENDIAN} -D -U " .
			"-i \${.ALLSRC} \\\n" .
			"\t-V \${CLDR_VERSION} \\\n" .
			"\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} \\\n" .
			"\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" .
			".endfor\n\n";
		$SRCOUT4 = "## LOCALES_MAPPED\n";
	}
	elsif ($TYPE eq "ctypedef") {
		$SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U -c " .
			"-w \${MAPLOC}/widths.txt \\\n" .
			"\t-f \${MAPLOC}/map.\${.IMPSRC:T:R:E} " .
			"\\\n\t-i \${.IMPSRC} \${.OBJDIR}/\${.IMPSRC:T:R} " .
			" || true";
		$SRCOUT2 = "LC_CTYPE";
		$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
				"locale/etc/final-maps\n";
		$SRCOUT3 = "## SYMPAIRS\n\n" .
			".for s t in \${SYMPAIRS}\n" .
			"\${t:S/src\$/LC_CTYPE/}: " .
			"\$s\n" .
			"\tlocaledef \${LOCALEDEF_ENDIAN} -D -U -c " .
			"-w \${MAPLOC}/widths.txt \\\n" .
			"\t-f \${MAPLOC}/map.\${.TARGET:T:R:C/^.*\\.//} " .
			"\\\n\t-i \${.ALLSRC} \${.OBJDIR}/\${.TARGET:T:R} " .
			" || true\n" .
			".endfor\n\n";
	}
	else {
		$SRCOUT = "grep -v -E '^(\#\$\$|\#[ ])' < \${.IMPSRC} > \${.TARGET}";
		$SRCOUT2 = "out";
		$MAPLOC = "";
	}

	my $mkname = "$TYPE.draft/Makefile";
	open(my $fout, ">", $mkname)
	    or die "Cannot create $mkname: $!\n";

	# Tabs inside the heredocs are written as \t escapes so that the
	# generated Makefile does not depend on this file's own whitespace.
	print {$fout} <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale.

PACKAGE=\tlocales
LOCALEDIR=\t\${SHAREDIR}/locale
FILESNAME=\t$FILESNAMES{$TYPE}
.SUFFIXES:\t.src .${SRCOUT2}
${MAPLOC}
EOF

	if ($TYPE eq "colldef") {
		print {$fout} <<EOF;
CLDR_VERSION=\t"${CLDR_VERSION}"

EOF
	}

	if ($TYPE eq "colldef" || $TYPE eq "ctypedef") {
		print {$fout} <<EOF;
.include <bsd.endian.mk>

EOF
	}

	print {$fout} <<EOF;
.src.${SRCOUT2}:
\t$SRCOUT

## PLACEHOLDER

${SRCOUT4}

EOF

	# For each digest, the first file in sort order becomes the link
	# source and all others become SAME+= entries pointing at it.
	foreach my $hash (keys(%hashtable)) {
		# For colldef, weight LOCALES to UTF-8
		#     Sort as upper-case and reverse to achieve it
		#     Make en_US, ru_RU, and ca_AD preferred
		my @files;
		if ($TYPE eq "colldef") {
			@files = sort {
				if ($a eq 'en_x_US.UTF-8' ||
				    $a eq 'ru_x_RU.UTF-8' ||
				    $a eq 'ca_x_AD.UTF-8') { return -1; }
				elsif ($b eq 'en_x_US.UTF-8' ||
				       $b eq 'ru_x_RU.UTF-8' ||
				       $b eq 'ca_x_AD.UTF-8') { return 1; }
				else { return uc($b) cmp uc($a); }
			} keys(%{$hashtable{$hash}});
		} elsif ($TYPE eq "ctypedef") {
			@files = sort {
				if ($a eq 'C_x_x.UTF-8') { return -1; }
				elsif ($b eq 'C_x_x.UTF-8') { return 1; }
				if ($a =~ /^en_x_US/) { return -1; }
				elsif ($b =~ /^en_x_US/) { return 1; }

				if ($a =~ /^en_x_GB.ISO8859-15/ ||
				    $a =~ /^ru_x_RU/) { return -1; }
				elsif ($b =~ /^en_x_GB.ISO8859-15/ ||
				       $b =~ /^ru_x_RU/) { return 1; }
				else { return uc($b) cmp uc($a); }

			} keys(%{$hashtable{$hash}});
		} else {
			@files = sort {
				if ($a =~ /_Comm_/ ||
				    $b eq 'en_x_US.UTF-8') { return 1; }
				elsif ($b =~ /_Comm_/ ||
				       $a eq 'en_x_US.UTF-8') { return -1; }
				else { return uc($b) cmp uc($a); }
			} keys(%{$hashtable{$hash}});
		}
		if ($#files > 0) {
			my $link = shift(@files);
			$link =~ s/_x_x//;	# special case for C
			$link =~ s/_x_/_/;	# strip family if none there
			foreach my $file (@files) {
				my @a = split(/_/, $file);
				my @b = split(/\./, $a[-1]);
				$file =~ s/_x_/_/;
				print {$fout} "SAME+=\t\t$link $file\n";
				# Mark as linked so that no LOCALES+= line is
				# emitted for it below.
				undef($languages{$a[0]}{$a[1]}{data}{$b[0]}{$b[1]});
			}
		}
	}

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		if (defined $languages{$l}{$f}{data}{$c}{$DEFENCODING}
		 && $languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
			print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") .
			    "${c} - not read\n";
			next;
		}
		foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			my $file = $l;
			$file .= "_" . $f if ($f ne "x");
			$file .= "_" . $c if ($c ne "x");
			next if (!defined $languages{$l}{$f}{data}{$c}{$e});
			print {$fout} "LOCALES+=\t$file.$e\n";
		}

		if (defined $languages{$l}{$f}{nc_link}) {
			foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;
				print {$fout} "SAME+=\t\t$file.$e $languages{$l}{$f}{nc_link}.$e\t# legacy (lang/country change)\n";
			}
		}

		if (defined $languages{$l}{$f}{e_link}) {
			foreach my $el (split(" ", $languages{$l}{$f}{e_link})) {
				my @a = split(/:/, $el);
				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;
				print {$fout} "SAME+=\t\t$file.$a[0] $file.$a[1]\t# legacy (same charset)\n";
			}
		}

	}
	}
	}

	print {$fout} <<EOF;

FILES=\t\t\${LOCALES:S/\$/.${SRCOUT2}/}
CLEANFILES=\t\${FILES}

.for f t in \${SAME}
DIRS+=\t\tLOCALEDIR_\$t
LOCALEDIR_\$t=\t\${LOCALEDIR}/\$t
LOCALEDIR_\$tPACKAGE=\tlocales
SYMLINKS+=\t../\$f/\${FILESNAME} \\
    \${LOCALEDIR}/\$t/\${FILESNAME}
.endfor

.for f in \${LOCALES}
FILESDIR_\${f}.${SRCOUT2}=\t\${LOCALEDIR}/\${f}
.endfor

${SRCOUT3}.include <bsd.prog.mk>
EOF

	close($fout);
}