#!/usr/local/bin/perl -wC

# SPDX-License-Identifier: BSD-2-Clause-FreeBSD
#
# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
# Copyright 2015 John Marino <draco@marino.st>
# Copyright 2020 Hiroki Sato <hrs@FreeBSD.org>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
# $FreeBSD$

#
# Reads POSIX locale sources (<unidir>/posix/*.src) and the locale
# description returned by get_xmldata() (charmaps.pm), and writes per-locale
# definition files plus a Makefile into "<type>.draft/".
#
# <type> is one of the keys of %FILESNAMES below.
#

use strict;
use File::Copy;
use XML::Parser;
use Tie::IxHash;
use Text::Iconv;
#use Data::Dumper;
use Getopt::Long;
use Digest::SHA qw(sha1_hex);
require "charmaps.pm";

my $USAGE = "Usage: $0 --unidir=<unidir> --etc=<etcdir> --type=<type>\n";

if ($#ARGV < 2) {
    print $USAGE;
    exit(1);
}

my $DEFENCODING = "UTF-8";

my $UNIDIR = undef;
my $ETCDIR = undef;
my $TYPE = undef;

my $CLDR_VERSION = undef;

my $result = GetOptions (
    "unidir=s" => \$UNIDIR,
    "etc=s"    => \$ETCDIR,
    "type=s"   => \$TYPE,
);

# Three stray positional arguments pass the count check above but leave the
# options unset; refuse to continue with undefined paths.
if (!$result || !defined $UNIDIR || !defined $ETCDIR || !defined $TYPE) {
    print $USAGE;
    exit(1);
}

# NOTE(review): nothing in this file stores values into %convertors (it is
# only read, and autovivified by lookups) -- confirm where it is meant to be
# populated.
my %convertors = ();

my %ucd = ();
my %values = ();            # $values{lang}{family}{country}{key} = raw text
my %hashtable = ();         # $hashtable{sha1}{"l_f_c.enc"} = 1
my %languages = ();
my %translations = ();
my %alternativemonths = ();
get_languages();

my %utfmap = ();
$utfmap{'UTF-8'} = {};
$utfmap{'UTF-32'} = {};
get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'});
get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'});

# Both hashes are tied so that iteration follows insertion order.
my %keys = ();
tie(%keys, "Tie::IxHash");
tie(%hashtable, "Tie::IxHash");

my %FILESNAMES = (
    "monetdef"   => "LC_MONETARY",
    "timedef"    => "LC_TIME",
    "msgdef"     => "LC_MESSAGES",
    "numericdef" => "LC_NUMERIC",
    "colldef"    => "LC_COLLATE",
    "ctypedef"   => "LC_CTYPE"
);

if (!exists $FILESNAMES{$TYPE}) {
    print STDERR "Unknown type '$TYPE'; expected one of: " .
        join(", ", sort keys(%FILESNAMES)) . "\n";
    exit(1);
}

# Callback dispatch table used by print_fields(); the "data" slot carries the
# current language/family/country/key/encoding while a callback runs.
my %callback = (
    mdorder  => \&callback_mdorder,
    altmon   => \&callback_altmon,
    cformat  => \&callback_cformat,
    dformat  => \&callback_dformat,
    dtformat => \&callback_dtformat,
    cbabmon  => \&callback_abmon,
    cbampm   => \&callback_ampm,
    data     => undef,
);

# Comment text emitted above each field in the generated files.
my %DESC = (

    # numericdef
    "decimal_point" => "decimal_point",
    "thousands_sep" => "thousands_sep",
    "grouping"      => "grouping",

    # monetdef
    "int_curr_symbol"   => "int_curr_symbol (last character always " .
                           "SPACE)",
    "currency_symbol"   => "currency_symbol",
    "mon_decimal_point" => "mon_decimal_point",
    "mon_thousands_sep" => "mon_thousands_sep",
    "mon_grouping"      => "mon_grouping",
    "positive_sign"     => "positive_sign",
    "negative_sign"     => "negative_sign",
    "int_frac_digits"   => "int_frac_digits",
    "frac_digits"       => "frac_digits",
    "p_cs_precedes"     => "p_cs_precedes",
    "p_sep_by_space"    => "p_sep_by_space",
    "n_cs_precedes"     => "n_cs_precedes",
    "n_sep_by_space"    => "n_sep_by_space",
    "p_sign_posn"       => "p_sign_posn",
    "n_sign_posn"       => "n_sign_posn",

    # msgdef
    "yesexpr" => "yesexpr",
    "noexpr"  => "noexpr",
    "yesstr"  => "yesstr",
    "nostr"   => "nostr",

    # timedef
    "abmon"      => "Short month names",
    "mon"        => "Long month names (as in a date)",
    "abday"      => "Short weekday names",
    "day"        => "Long weekday names",
    "t_fmt"      => "X_fmt",
    "d_fmt"      => "x_fmt",
    "c_fmt"      => "c_fmt",
    "am_pm"      => "AM/PM",
    "d_t_fmt"    => "date_fmt",
    "altmon"     => "Long month names (without case ending)",
    "md_order"   => "md_order",
    "t_fmt_ampm" => "ampm_fmt",
);

#
# Field type codes used in %keys:
#   "s"  string, "as" ';'-separated list of strings,
#   "i"  integer, "ai" list of integers,
#   "<callback<sourcekey<type"  run %callback{callback} on the value of
#                               sourcekey, then treat the result as <type>.
#

if ($TYPE eq "colldef") {
    transform_collation();
    make_makefile();
}

if ($TYPE eq "ctypedef") {
    transform_ctypes();
    make_makefile();
}

if ($TYPE eq "numericdef") {
    %keys = (
        "decimal_point" => "s",
        "thousands_sep" => "s",
        "grouping"      => "ai",
    );
    get_fields();
    print_fields();
    make_makefile();
}

if ($TYPE eq "monetdef") {
    %keys = (
        "int_curr_symbol"   => "s",
        "currency_symbol"   => "s",
        "mon_decimal_point" => "s",
        "mon_thousands_sep" => "s",
        "mon_grouping"      => "ai",
        "positive_sign"     => "s",
        "negative_sign"     => "s",
        "int_frac_digits"   => "i",
        "frac_digits"       => "i",
        "p_cs_precedes"     => "i",
        "p_sep_by_space"    => "i",
        "n_cs_precedes"     => "i",
        "n_sep_by_space"    => "i",
        "p_sign_posn"       => "i",
        "n_sign_posn"       => "i"
    );
    get_fields();
    print_fields();
    make_makefile();
}

if ($TYPE eq "msgdef") {
    %keys = (
        "yesexpr" => "s",
        "noexpr"  => "s",
        "yesstr"  => "s",
        "nostr"   => "s"
    );
    get_fields();
    print_fields();
    make_makefile();
}

if ($TYPE eq "timedef") {
    %keys = (
        "abmon"      => "<cbabmon<abmon<as",
        "mon"        => "as",
        "abday"      => "as",
        "day"        => "as",
        "t_fmt"      => "s",
        "d_fmt"      => "<dformat<d_fmt<s",
        "c_fmt"      => "<cformat<d_t_fmt<s",
        "am_pm"      => "<cbampm<am_pm<as",
        "d_t_fmt"    => "<dtformat<d_t_fmt<s",
        "altmon"     => "<altmon<mon<as",
        "md_order"   => "<mdorder<d_fmt<s",
        "t_fmt_ampm" => "s",
    );
    get_fields();
    print_fields();
    make_makefile();
}

# callback_ampm(am_pm) -> string
# Replaces the AM/PM pair for ru_RU with a fixed value, converted to the
# current encoding through Text::Iconv when that encoding is not UTF-8.
# Every other locale gets its input back unchanged.
sub callback_ampm {
    my $s = shift;
    my $nl = $callback{data}{l} . "_" . $callback{data}{c};
    my $enc = $callback{data}{e};

    if ($nl eq 'ru_RU') {
        if ($enc eq 'UTF-8') {
            $s = 'дп;пп';
        } else {
            my $converter = Text::Iconv->new("utf-8", "$enc");
            $s = $converter->convert("дп;пп");
        }
    }
    return $s;
}

# callback_cformat(d_t_fmt) -> string
# Derives c_fmt from d_t_fmt: drops the %Z/%z time zone, removes a ".,"
# sequence and inserts the weekday (%A) in front of the date.
sub callback_cformat {
    my $s = shift;
    my $nl = $callback{data}{l} . "_" . $callback{data}{c};

    if ($nl eq 'ko_KR') {
        $s =~ s/(> )(%p)/$1%A $2/;
    }
    $s =~ s/\.,/\./;
    $s =~ s/ %Z//;
    $s =~ s/ %z//;
    $s =~ s/^"%e\./%A %e/;
    $s =~ s/^"(%B %e, )/"%A, $1/;
    $s =~ s/^"(%e %B )/"%A $1/;
    return $s;
}

# callback_dformat(d_fmt) -> string
# Turns %e into %d when it sits next to %m, separated by '-', '.' or
# <SOLIDUS>.
sub callback_dformat {
    my $s = shift;

    $s =~ s/(%m(<SOLIDUS>|[-.]))%e/$1%d/;
    $s =~ s/%e((<SOLIDUS>|[-.])%m)/%d$1/;
    return $s;
}

# callback_dtformat(d_t_fmt) -> string
# Per-locale adjustments of d_t_fmt for ja_JP, ko_KR, zh_CN and zh_TW,
# followed by the same weekday insertion callback_cformat performs.
sub callback_dtformat {
    my $s = shift;
    my $nl = $callback{data}{l} . "_" . $callback{data}{c};

    if ($nl eq 'ja_JP') {
        $s =~ s/(> )(%H)/$1%A $2/;
    } elsif ($nl eq 'ko_KR' || $nl eq 'zh_CN' || $nl eq 'zh_TW') {
        if ($nl ne 'ko_KR') {
            $s =~ s/%m/%_m/;
        }
        $s =~ s/(> )(%p)/$1%A $2/;
    }
    $s =~ s/\.,/\./;
    $s =~ s/^"%e\./%A %e/;
    $s =~ s/^"(%B %e, )/"%A, $1/;
    $s =~ s/^"(%e %B )/"%A $1/;
    return $s;
}

# callback_mdorder(d_fmt) -> string or undef
# Reduces a date format to the order of its 'd' and 'm' letters; 'e' counts
# as 'd'.  Returns undef when no format was read.
sub callback_mdorder {
    my $s = shift;
    return undef if (!defined $s);
    $s =~ s/[^dem]//g;
    $s =~ s/e/d/g;
    return $s;
}

# callback_altmon(mon) -> string
# If the language/country is known in %alternativemonths, return that list
# with surrounding whitespace trimmed from every name; otherwise repeat mon.
sub callback_altmon {
    my $s = shift;

    my $alt = $alternativemonths{$callback{data}{l}}{$callback{data}{c}};
    if (defined $alt) {
        my @cleaned;
        foreach my $name (split(";", $alt)) {
            $name =~ s/^\s+//;
            $name =~ s/\s+$//;
            push @cleaned, $name;
        }
        return join(";", @cleaned);
    }

    return $s;
}

# callback_abmon(abmon) -> string
# For specified CJK locales, pad single-digit month names with <space> so
# that columns line up (style established in FreeBSD in 2001).
sub callback_abmon {
    my $s = shift;
    my $nl = $callback{data}{l} . "_" . $callback{data}{c};

    if ($nl eq 'ja_JP' || $nl eq 'ko_KR' || $nl eq 'zh_CN' ||
        $nl eq 'zh_HK' || $nl eq 'zh_TW') {
        my @cleaned;
        foreach my $month (split(";", $s)) {
            # <one> alone is single-digit; <one><zero|one|two> is 10..12.
            if ($month =~ /^"<(two|three|four|five|six|seven|eight|nine)>/ ||
                ($month =~ /^"<one>/ &&
                 $month !~ /^"<one>(<zero>|<one>|<two>)/)) {
                $month =~ s/^"/"<space>/;
            }
            push @cleaned, $month;
        }
        return join(";", @cleaned);
    }
    return $s;
}

############################

# get_utfmap(file, hashref)
# Loads the "<name> value" pairs found between CHARMAP and END CHARMAP in
# file into hashref, with every "\x" removed from the value.  A file that
# cannot be opened is reported on STDERR and leaves hashref untouched.
sub get_utfmap {
    my ($file, $db) = @_;

    my $fin;
    if (!open($fin, '<', $file)) {
        print STDERR "Cannot open $file: $!\n";
        return;
    }
    my @lines = <$fin>;
    close($fin);
    chomp(@lines);

    my $incharmap = 0;
    foreach my $l (@lines) {
        next if ($l =~ /^\#/);
        next if ($l eq "");

        if ($l eq "CHARMAP") {
            $incharmap = 1;
            next;
        }

        next if (!$incharmap);
        last if ($l eq "END CHARMAP");

        # Skip anything that is not a "<name> value" entry instead of
        # reusing stale capture variables.
        next if ($l !~ /^<([^\s]+)>\s+(.*)/);
        my ($k, $v) = ($1, $2);
        $v =~ s/\\x//g;     # UTF-8 char code
        $db->{$k} = $v;
    }
}

# resolve_enc_addition("0xAA+0xBB") -> "AABB"
# Concatenates '+'-separated terms after stripping a leading 0x/0X from each.
sub resolve_enc_addition {
    my $ret = '';

    foreach my $t (split(/\+/, $_[0])) {
        $t =~ s/^0[xX]//;
        $ret .= $t;
    }
    return $ret;
}

# Fills %languages, %translations and %alternativemonths from the L, T and
# AM entries returned by get_xmldata($ETCDIR).
sub get_languages {
    my %data = get_xmldata($ETCDIR);
    %languages = %{$data{L}};
    %translations = %{$data{T}};
    %alternativemonths = %{$data{AM}};
}

# Writes "$TYPE.draft/<locale>.<enc>.src" for every locale/encoding pair:
# the UTF-8 file is a copy of xx_Comm_C.UTF-8.src, the others hold the
# LC_CTYPE section of the locale's own source.  Content hashes are recorded
# in %languages and %hashtable for make_makefile().
sub transform_ctypes {
    # Add the C.UTF-8
    $languages{"C"}{"x"}{data}{"x"}{$DEFENCODING} = undef;

    foreach my $l (sort keys(%languages)) {
        foreach my $f (sort keys(%{$languages{$l}})) {
            foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
                next if (defined $languages{$l}{$f}{definitions}
                    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
                $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;  # unread
                my $file = $l;
                $file .= "_" . $f if ($f ne "x");
                $file .= "_" . $c if ($c ne "x");
                my $actfile = $file;

                my $filename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
                if (! -f $filename) {
                    print STDERR "Cannot open $filename\n";
                    next;
                }
                my $fin;
                if (!open($fin, '<', $filename)) {
                    print STDERR "Cannot open $filename: $!\n";
                    next;
                }
                print "Reading from $filename for ${l}_${f}_${c}\n";
                $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;  # read
                my @lines = <$fin>;
                close($fin);

                my $shex = sha1_hex(join("\n", @lines));
                $languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
                $hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;

                my $outname = "$TYPE.draft/$actfile.$DEFENCODING.src";
                open(my $fout, '>', $outname)
                    or die "Cannot create $outname: $!\n";
                print {$fout} @lines;
                close($fout);

                foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
                    next if ($enc eq $DEFENCODING);
                    $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
                    if ($file eq 'ja_JP') {
                        # Override $filename for ja_JP because
                        # its CTYPE is not compatible with UTF-8.
                        $filename = "$UNIDIR/posix/$file.eucJP.src";
                    }
                    if (! -f $filename) {
                        print STDERR "Cannot open $filename\n";
                        next;
                    }
                    if (!open($fin, '<', $filename)) {
                        print STDERR "Cannot open $filename: $!\n";
                        next;
                    }
                    @lines = ();
                    while (<$fin>) {
                        if ((/^comment_char\s/) || (/^escape_char\s/)) {
                            push @lines, $_;
                        }
                        if (/^LC_CTYPE/../^END LC_CTYPE/) {
                            push @lines, $_;
                        }
                    }
                    close($fin);

                    # The encoding is part of the hash so that identical
                    # text in different encodings is not linked together.
                    my $uhex = sha1_hex(join("\n", @lines) . $enc);
                    $languages{$l}{$f}{data}{$c}{$enc} = $uhex;
                    $hashtable{$uhex}{"${l}_${f}_${c}.$enc"} = 1;

                    $outname = "$TYPE.draft/$actfile.$enc.src";
                    open($fout, '>', $outname)
                        or die "Cannot create $outname: $!\n";
                    print {$fout} <<"EOF";
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
                    print {$fout} @lines;
                    close($fout);
                }
            }
        }
    }
}


# Writes "$TYPE.draft/<locale>.<enc>.src" holding the LC_COLLATE section of
# each locale.  The UTF-8 file is extracted from the locale source (or its
# fallback); every other encoding is a filtered copy of the UTF-8 file in
# which lines referring to characters without a %convertors entry for that
# encoding are dropped.  Also loads $CLDR_VERSION for make_makefile().
sub transform_collation {
    # Read the CLDR version
    my $verfile = "$UNIDIR/cldr-version";
    open(my $vin, '<', $verfile)
        or die "Cannot open cldr-version ($verfile): $!\n";
    read($vin, $CLDR_VERSION, -s $vin);
    close($vin);
    $CLDR_VERSION =~ s/\s*$//;

    foreach my $l (sort keys(%languages)) {
        foreach my $f (sort keys(%{$languages{$l}})) {
            foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
                next if (defined $languages{$l}{$f}{definitions}
                    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
                $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;  # unread
                my $file;
                $file = $l . "_";
                $file .= $f . "_" if ($f ne "x");
                $file .= $c;
                my $actfile = $file;

                # Search order: unidir, etcdir, then the fallback locale.
                my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
                $filename = "$ETCDIR/$file.$DEFENCODING.src"
                    if (! -f $filename);
                if (! -f $filename
                    && defined $languages{$l}{$f}{fallback}) {
                    $file = $languages{$l}{$f}{fallback};
                    $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
                }
                $filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
                    if (! -f $filename);
                if (! -f $filename) {
                    print STDERR
                        "Cannot open $file.$DEFENCODING.src or fallback\n";
                    next;
                }
                my $fin;
                if (!open($fin, '<', $filename)) {
                    print STDERR "Cannot open $filename: $!\n";
                    next;
                }
                print "Reading from $filename for ${l}_${f}_${c}\n";
                $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;  # read
                my @lines;
                while (<$fin>) {
                    if ((/^comment_char\s/) || (/^escape_char\s/)) {
                        push @lines, $_;
                    }
                    if (/^LC_COLLATE/../^END LC_COLLATE/) {
                        $_ =~ s/[ ]+/ /g;
                        push @lines, $_;
                    }
                }
                close($fin);

                my $shex = sha1_hex(join("\n", @lines));
                $languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
                $hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;

                my $utfname = "$TYPE.draft/$actfile.$DEFENCODING.src";
                open(my $fout, '>', $utfname)
                    or die "Cannot create $utfname: $!\n";
                print {$fout} <<"EOF";
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
                print {$fout} @lines;
                close($fout);

                foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
                    next if ($enc eq $DEFENCODING);

                    my $encname = "$TYPE.draft/$actfile.$enc.src";
                    open($fin, '<', $utfname)
                        or die "Cannot open $utfname: $!\n";
                    open($fout, '>', $encname)
                        or die "Cannot create $encname: $!\n";
                    my $order_start = 0;
                    my $print_p = 0;
                    #
                    # %c_elem: collation elements
                    #
                    # undef: not defined
                    # 1: defined
                    # 2: invalid in this encoding
                    #
                    my %c_elem = ();
                    while (<$fin>) { # XXX: this loop should be refactored.
                        chomp;
                        $print_p = 1;
                        if ($order_start) {
                            $order_start = 0 if (m/^order_end/);
                            if (m/^<([^>]+)>/) {
                                if (not defined $c_elem{$1}) {
                                    my $u32 = $utfmap{'UTF-32'}->{$1};
                                    die "order, $1\n" if (not defined $u32);
                                    if (not defined $convertors{$enc}{$u32}) {
                                        $print_p = 0;
                                    }
                                } elsif ($c_elem{$1} == 2) {
                                    $print_p = 0;
                                }
                            }
                        } elsif (m/^collating-element/) {
                            my ($elem, $members);
                            if (m/<([^>]+)> from (.+)/) {
                                ($elem, $members) = ($1, $2);
                            }
                            # The element is invalid as soon as one of its
                            # member characters cannot be converted.
                            while ($print_p and
                                defined $members and
                                $members =~ m/<([^>]+)>/g) {
                                my $u32 = $utfmap{'UTF-32'}->{$1};
                                die "collating-element, $1\n"
                                    if (not defined $u32);
                                if (not $convertors{$enc}{$u32}) {
                                    $print_p = 0;
                                    $c_elem{$elem} = 2;
                                }
                            }
                            if ($print_p) {
                                $c_elem{$elem} = 1;
                            }
                        } elsif (m/^collating-symbol <([^>]+)>/) {
                            $c_elem{$1} = 1;
                        } elsif (m/^order_start/) {
                            $order_start = 1;
                        }
                        print {$fout} $_, "\n" if ($print_p);
                    }
                    close($fout);
                    close($fin);
                    # The hash of the UTF-8 text is reused for every
                    # encoding of this locale.
                    $languages{$l}{$f}{data}{$c}{$enc} = $shex;
                    $hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1;
                }
            }
        }
    }
}

# Reads, for every locale, the value of each key in %keys from the locale's
# UTF-8 source (or its fallback) into $values{l}{f}{c}{key}.  A value may
# continue over several lines; a trailing '/' marks continuation.
sub get_fields {
    foreach my $l (sort keys(%languages)) {
        foreach my $f (sort keys(%{$languages{$l}})) {
            foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
                next if (defined $languages{$l}{$f}{definitions}
                    && $languages{$l}{$f}{definitions} !~ /$TYPE/);

                $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;  # unread
                my $file;
                $file = $l . "_";
                $file .= $f . "_" if ($f ne "x");
                $file .= $c;

                # Search order: unidir, etcdir, then the fallback locale.
                my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
                $filename = "$ETCDIR/$file.$DEFENCODING.src"
                    if (! -f $filename);
                if (! -f $filename
                    && defined $languages{$l}{$f}{fallback}) {
                    $file = $languages{$l}{$f}{fallback};
                    $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
                }
                $filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
                    if (! -f $filename);
                if (! -f $filename) {
                    print STDERR
                        "Cannot open $file.$DEFENCODING.src or fallback\n";
                    next;
                }
                my $fin;
                if (!open($fin, '<', $filename)) {
                    print STDERR "Cannot open $filename: $!\n";
                    next;
                }
                print "Reading from $filename for ${l}_${f}_${c}\n";
                $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;  # read
                my @lines = <$fin>;
                chomp(@lines);
                close($fin);

                foreach my $k (keys(%keys)) {
                    # Reset per key so an unterminated continuation at the
                    # end of the file cannot leak into the next key.
                    my $continue = 0;
                    foreach my $raw (@lines) {
                        # Work on a copy; @lines is scanned again for the
                        # next key.
                        my $line = $raw;
                        $line =~ s/\r//;
                        next if (!$continue && $line !~ /^\Q$k\E\s/);
                        if ($continue) {
                            $line =~ s/^\s+//;
                        } else {
                            $line =~ s/^\Q$k\E\s+//;
                        }

                        $values{$l}{$f}{$c}{$k} = ""
                            if (!defined $values{$l}{$f}{$c}{$k});

                        $continue = ($line =~ /\/$/);
                        $line =~ s/\/$// if ($continue);

                        $values{$l}{$f}{$c}{$k} .= $line;

                        last if (!$continue);
                    }
                }
            }
        }
    }
}

# decodecldr(encoding, charname) -> byte string
# Converts a symbolic character name into the 1..3 bytes that represent it
# in the given encoding.  Dies when the name cannot be converted or when
# the code is not 1, 2 or 3 bytes long.
sub decodecldr {
    my $e = shift;
    my $s = shift;

    my $v = undef;

    if ($e eq "UTF-8") {
        #
        # Conversion to UTF-8 can be done from the Unicode name to
        # the UTF-8 character code.
        #
        $v = $utfmap{'UTF-8'}->{$s};
        die "Cannot convert $s in $e (charmap)" if (!defined $v);
    } else {
        #
        # Conversion to these encodings can be done from the Unicode
        # name to Unicode code to the encodings code.
        #
        # hex - hex or string attr
        # unicode - unicode attr
        # ucc - ucc attr
        my $hex = $translations{$e}{$s}{hex};
        my $ucc = $utfmap{'UTF-32'}->{$s};
        my $ucc_attr = $translations{$e}{$s}{ucc};
        my $unicode = $translations{$e}{$s}{unicode};

        if (defined $hex) {             # hex is in local encoding
            $v = $hex;
        } elsif (defined $unicode) {    # unicode is in name
            $v = $convertors{$e}{$utfmap{'UTF-32'}->{$unicode}};
        } elsif (defined $ucc_attr) {   # ucc is in code point
            # An explicit ucc attribute overrides the UTF-32 charmap.
            # normalize
            $ucc_attr = sprintf("%08X", hex($ucc_attr));
            $v = $convertors{$e}{$ucc_attr};
        } elsif (defined $ucc) {
            # normalize
            $ucc = sprintf("%08X", hex($ucc));
            $v = $convertors{$e}{$ucc};
        }
        die "Cannot convert $s in $e" if (!defined $v);
    }

    # Strip leading zeroes, then pad back to a whole number of bytes.
    $v =~ s/^[0]+//g;
    $v = "0" . $v if (length($v) % 2);

    # XXX: length = 8 is not supported yet.
    my $len = length($v);
    if ($len == 2 || $len == 4 || $len == 6) {
        return pack("C*", map { hex($_) } ($v =~ /../g));
    }
    die "Cannot convert $s in $e (length = $len)\n";
}

# translate(encoding, name) -> $translations{encoding}{name}, or undef when
# there is no such entry.
sub translate {
    my $enc = shift;
    my $v = shift;

    return $translations{$enc}{$v};
}

# Writes "$TYPE.draft/<locale>.<enc>.src" for every locale/encoding pair
# from the values collected by get_fields().  Symbolic <names> inside string
# values are replaced through decodecldr().  A file with a failed conversion
# is kept as "<locale>.<enc>.failed" instead.
sub print_fields {
    foreach my $l (sort keys(%languages)) {
        foreach my $f (sort keys(%{$languages{$l}})) {
            foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
                next if (defined $languages{$l}{$f}{definitions}
                    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
                foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
                    if ($languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
                        print "Skipping ${l}_" .
                            ($f eq "x" ? "" : "${f}_") .
                            "${c} - not read\n";
                        next;
                    }
                    my $file = $l;
                    $file .= "_" . $f if ($f ne "x");
                    $file .= "_" . $c;
                    print "Writing to $file in $enc\n";

                    if ($enc ne $DEFENCODING &&
                        !defined $convertors{$enc}) {
                        print "Failed! Cannot convert to $enc.\n";
                        next;
                    }

                    my $newname = "$TYPE.draft/$file.$enc.new";
                    open(my $fout, '>', $newname)
                        or die "Cannot create $newname: $!\n";
                    my $okay = 1;
                    my $output = "";
                    print {$fout} <<"EOF";
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
                    foreach my $k (keys(%keys)) {
                        my $g = $keys{$k};

                        die("Unknown $k in \%DESC")
                            if (!defined $DESC{$k});

                        $output .= "#\n# $DESC{$k}\n";

                        # Replace one row with another
                        if ($g =~ /^>/) {
                            $k = substr($g, 1);
                            $g = $keys{$k};
                        }

                        # Callback function: "<callback<sourcekey<type"
                        if ($g =~ /^\</) {
                            $callback{data}{c} = $c;
                            $callback{data}{k} = $k;
                            $callback{data}{f} = $f;
                            $callback{data}{l} = $l;
                            $callback{data}{e} = $enc;
                            my @a = split(/\</, substr($g, 1));
                            my $rv = $callback{$a[0]}->(
                                $values{$l}{$f}{$c}{$a[1]});
                            $values{$l}{$f}{$c}{$k} = $rv;
                            $g = $a[2];
                            $callback{data} = undef;
                        }

                        my $v = $values{$l}{$f}{$c}{$k};
                        $v = "undef" if (!defined $v);

                        if ($g eq "i") {
                            $output .= "$v\n";
                            next;
                        }
                        if ($g eq "ai") {
                            $output .= "$v\n";
                            next;
                        }
                        if ($g eq "s") {
                            $v =~ s/^"//;
                            $v =~ s/"$//;
                            while ($v =~ /^(.*?)<(.*?)>(.*)/) {
                                my ($p1, $cm, $p3) = ($1, $2, $3);

                                my $rv = decodecldr($enc, $cm);
                                if (!defined $rv) {
                                    print STDERR
"Could not convert $k ($cm) from $DEFENCODING to $enc\n";
                                    $okay = 0;
                                    # $v is unchanged, so retrying the
                                    # match would never terminate.
                                    last;
                                }

                                $v = $p1 . $rv . $p3;
                            }
                            $output .= "$v\n";
                            next;
                        }
                        if ($g eq "as") {
                            foreach my $item (split(/;/, $v)) {
                                $item =~ s/^"//;
                                $item =~ s/"$//;
                                while ($item =~ /^(.*?)<(.*?)>(.*)/) {
                                    my ($p1, $cm, $p3) = ($1, $2, $3);

                                    my $rv = decodecldr($enc, $cm);
                                    if (!defined $rv) {
                                        print STDERR
"Could not convert $k ($cm) from $DEFENCODING to $enc\n";
                                        $okay = 0;
                                        # See the "s" case above.
                                        last;
                                    }

                                    $item = $p1 . $rv . $p3;
                                }
                                $output .= "$item\n";
                            }
                            next;
                        }

                        die("$k is '$g'");

                    }

                    $languages{$l}{$f}{data}{$c}{$enc} = sha1_hex($output);
                    $hashtable{sha1_hex($output)}{"${l}_${f}_${c}.$enc"} = 1;
                    print {$fout} "$output# EOF\n";
                    close($fout);

                    if ($okay) {
                        rename($newname, "$TYPE.draft/$file.$enc.src");
                    } else {
                        rename($newname, "$TYPE.draft/$file.$enc.failed");
                    }
                }
            }
        }
    }
}

# Sort helpers for make_makefile(): each puts the file every identical
# locale links to first and orders the rest by reverse upper-case name.

# colldef: prefer en_US, ru_RU and ca_AD in UTF-8.
sub _cmp_colldef_files {
    my ($x, $y) = @_;
    my %preferred = map { $_ => 1 }
        ('en_x_US.UTF-8', 'ru_x_RU.UTF-8', 'ca_x_AD.UTF-8');

    return -1 if ($preferred{$x});
    return 1 if ($preferred{$y});
    return uc($y) cmp uc($x);
}

# ctypedef: prefer C.UTF-8, then en_US, then en_GB.ISO8859-15 and ru_RU.
sub _cmp_ctypedef_files {
    my ($x, $y) = @_;

    return -1 if ($x eq 'C_x_x.UTF-8');
    return 1 if ($y eq 'C_x_x.UTF-8');
    return -1 if ($x =~ /^en_x_US/);
    return 1 if ($y =~ /^en_x_US/);
    return -1 if ($x =~ /^en_x_GB\.ISO8859-15/ || $x =~ /^ru_x_RU/);
    return 1 if ($y =~ /^en_x_GB\.ISO8859-15/ || $y =~ /^ru_x_RU/);
    return uc($y) cmp uc($x);
}

# Other types: *_Comm_* first, en_US.UTF-8 last.
sub _cmp_other_files {
    my ($x, $y) = @_;

    return 1 if ($x =~ /_Comm_/ || $y eq 'en_x_US.UTF-8');
    return -1 if ($y =~ /_Comm_/ || $x eq 'en_x_US.UTF-8');
    return uc($y) cmp uc($x);
}

# Writes "$TYPE.draft/Makefile".  Locales whose generated content hashes are
# identical are emitted as SAME+= pairs (link target first) and removed from
# %languages; the remaining ones are listed in LOCALES+=.
sub make_makefile {
    print "Creating Makefile for $TYPE\n";
    my $SRCOUT;
    my $SRCOUT2;
    my $SRCOUT3 = "";
    my $SRCOUT4 = "";
    my $MAPLOC;
    if ($TYPE eq "colldef") {
        # In future, we might want to try to put the CLDR version into
        # the .src files with some new syntax, instead of the makefile.
        $SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U " .
            "-i \${.IMPSRC} \\\n" .
            "\t-V \${CLDR_VERSION} \\\n" .
            "\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} " .
            "\${.OBJDIR}/\${.IMPSRC:T:R}";
        $MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
            "locale/etc/final-maps\n";
        $SRCOUT2 = "LC_COLLATE";
        $SRCOUT3 = "" .
            ".for f t in \${LOCALES_MAPPED}\n" .
            "FILES+=\t\$t.LC_COLLATE\n" .
            "FILESDIR_\$t.LC_COLLATE=\t\${LOCALEDIR}/\$t\n" .
            "\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" .
            "\tlocaledef \${LOCALEDEF_ENDIAN} -D -U " .
            "-i \${.ALLSRC} \\\n" .
            "\t-V \${CLDR_VERSION} \\\n" .
            "\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} \\\n" .
            "\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" .
            ".endfor\n\n";
        $SRCOUT4 = "## LOCALES_MAPPED\n";
    }
    elsif ($TYPE eq "ctypedef") {
        $SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U -c " .
            "-w \${MAPLOC}/widths.txt \\\n" .
            "\t-f \${MAPLOC}/map.\${.IMPSRC:T:R:E} " .
            "\\\n\t-i \${.IMPSRC} \${.OBJDIR}/\${.IMPSRC:T:R} " .
            " || true";
        $SRCOUT2 = "LC_CTYPE";
        $MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
            "locale/etc/final-maps\n";
        $SRCOUT3 = "## SYMPAIRS\n\n" .
            ".for s t in \${SYMPAIRS}\n" .
            "\${t:S/src\$/LC_CTYPE/}: " .
            "\$s\n" .
            "\tlocaledef \${LOCALEDEF_ENDIAN} -D -U -c " .
            "-w \${MAPLOC}/widths.txt \\\n" .
            "\t-f \${MAPLOC}/map.\${.TARGET:T:R:C/^.*\\.//} " .
            "\\\n\t-i \${.ALLSRC} \${.OBJDIR}/\${.TARGET:T:R} " .
            " || true\n" .
            ".endfor\n\n";
    }
    else {
        $SRCOUT = "grep -v -E '^(\#\$\$|\#[ ])' < \${.IMPSRC} > \${.TARGET}";
        $SRCOUT2 = "out";
        $MAPLOC = "";
    }

    my $mkname = "$TYPE.draft/Makefile";
    open(my $fout, '>', $mkname)
        or die "Cannot create $mkname: $!\n";

    # Tabs in the Makefile text are written as \t so that the recipe line,
    # which make requires to start with a tab, survives reformatting.
    print {$fout} <<"EOF";
# \$FreeBSD\$
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale.

PACKAGE=\tlocales
LOCALEDIR=\t\${SHAREDIR}/locale
FILESNAME=\t$FILESNAMES{$TYPE}
.SUFFIXES:\t.src .${SRCOUT2}
${MAPLOC}
EOF

    if ($TYPE eq "colldef") {
        print {$fout} <<"EOF";
CLDR_VERSION=\t"${CLDR_VERSION}"

EOF
    }

    if ($TYPE eq "colldef" || $TYPE eq "ctypedef") {
        print {$fout} <<"EOF";
.include <bsd.endian.mk>

EOF
    }

    print {$fout} <<"EOF";
.src.${SRCOUT2}:
\t$SRCOUT

## PLACEHOLDER

${SRCOUT4}

EOF

    foreach my $hash (keys(%hashtable)) {
        # For colldef, weight LOCALES to UTF-8
        # Sort as upper-case and reverse to achieve it
        # Make en_US, ru_RU, and ca_AD preferred
        my @files;
        if ($TYPE eq "colldef") {
            @files = sort { _cmp_colldef_files($a, $b) }
                keys(%{$hashtable{$hash}});
        } elsif ($TYPE eq "ctypedef") {
            @files = sort { _cmp_ctypedef_files($a, $b) }
                keys(%{$hashtable{$hash}});
        } else {
            @files = sort { _cmp_other_files($a, $b) }
                keys(%{$hashtable{$hash}});
        }
        if ($#files > 0) {
            my $link = shift(@files);
            $link =~ s/_x_x//;    # special case for C
            $link =~ s/_x_/_/;    # strip family if none there
            foreach my $file (@files) {
                # "l_f_c.enc" -> (l, f, "c.enc") -> (c, enc)
                my @a = split(/_/, $file);
                my @b = split(/\./, $a[-1]);
                $file =~ s/_x_/_/;
                print {$fout} "SAME+=\t\t$link $file\n";
                # Linked locales must not show up in LOCALES below.
                undef($languages{$a[0]}{$a[1]}{data}{$b[0]}{$b[1]});
            }
        }
    }

    foreach my $l (sort keys(%languages)) {
        foreach my $f (sort keys(%{$languages{$l}})) {
            foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
                next if (defined $languages{$l}{$f}{definitions}
                    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
                if (defined $languages{$l}{$f}{data}{$c}{$DEFENCODING}
                    && $languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
                    print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") .
                        "${c} - not read\n";
                    next;
                }
                foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
                    my $file = $l;
                    $file .= "_" . $f if ($f ne "x");
                    $file .= "_" . $c if ($c ne "x");
                    next if (!defined $languages{$l}{$f}{data}{$c}{$e});
                    print {$fout} "LOCALES+=\t$file.$e\n";
                }

                if (defined $languages{$l}{$f}{nc_link}) {
                    foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
                        my $file = $l . "_";
                        $file .= $f . "_" if ($f ne "x");
                        $file .= $c;
                        print {$fout} "SAME+=\t\t$file.$e $languages{$l}{$f}{nc_link}.$e\t# legacy (lang/country change)\n";
                    }
                }

                if (defined $languages{$l}{$f}{e_link}) {
                    foreach my $el (split(" ", $languages{$l}{$f}{e_link})) {
                        my @a = split(/:/, $el);
                        my $file = $l . "_";
                        $file .= $f . "_" if ($f ne "x");
                        $file .= $c;
                        print {$fout} "SAME+=\t\t$file.$a[0] $file.$a[1]\t# legacy (same charset)\n";
                    }
                }

            }
        }
    }

    print {$fout} <<"EOF";

FILES=\t\t\${LOCALES:S/\$/.${SRCOUT2}/}
CLEANFILES=\t\${FILES}

.for f t in \${SAME}
DIRS+=\t\tLOCALEDIR_\$t
LOCALEDIR_\$t=\t\${LOCALEDIR}/\$t
LOCALEDIR_\$tPACKAGE=\tlocales
SYMLINKS+=\t../\$f/\${FILESNAME} \\
    \${LOCALEDIR}/\$t/\${FILESNAME}
.endfor

.for f in \${LOCALES}
FILESDIR_\${f}.${SRCOUT2}= \${LOCALEDIR}/\${f}
.endfor

${SRCOUT3}.include <bsd.prog.mk>
EOF

    close($fout);
}