xref: /freebsd/tools/tools/locale/tools/cldr2def.pl (revision e0c4386e)
1#!/usr/local/bin/perl -wC
2
3# SPDX-License-Identifier: BSD-2-Clause
4#
5# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
6# Copyright 2015 John Marino <draco@marino.st>
7# Copyright 2020 Hiroki Sato <hrs@FreeBSD.org>
8#
9# Redistribution and use in source and binary forms, with or without
10# modification, are permitted provided that the following conditions
11# are met:
12# 1. Redistributions of source code must retain the above copyright
13#    notice, this list of conditions and the following disclaimer.
14# 2. Redistributions in binary form must reproduce the above copyright
15#    notice, this list of conditions and the following disclaimer in the
16#    documentation and/or other materials provided with the distribution.
17#
18# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28# SUCH DAMAGE.
29#
30
31use strict;
32use File::Copy;
33use XML::Parser;
34use Tie::IxHash;
35use Text::Iconv;
36#use Data::Dumper;
37use Getopt::Long;
38use Digest::SHA qw(sha1_hex);
39require "charmaps.pm";
40
# Usage text shared by both argument checks below.
my $USAGE = "Usage: $0 --unidir=<unidir> --etc=<etcdir> --type=<type>\n";

# Cheap early check: fewer than three command-line words cannot possibly
# carry the three required options.
if ($#ARGV < 2) {
	print $USAGE;
	exit(1);
}

# Encoding whose source files are read; every other encoding is derived
# from it.
my $DEFENCODING = "UTF-8";

# Filled in by GetOptions() below.
my $UNIDIR = undef;
my $ETCDIR = undef;
my $TYPE = undef;

# Set by transform_collation() from the cldr-version file.
my $CLDR_VERSION = undef;

my $result = GetOptions (
		"unidir=s"	=> \$UNIDIR,
		"etc=s"		=> \$ETCDIR,
		"type=s"	=> \$TYPE,
	    );

# GetOptions() returns false on a malformed command line, and any of the
# three options may simply be absent even when enough words were given.
# Stop here instead of continuing with undefined paths or type.
if (!$result || !defined $UNIDIR || !defined $ETCDIR || !defined $TYPE) {
	print $USAGE;
	exit(1);
}
59
# File-wide state shared by the subroutines below.

# Per-encoding lookup, read as $convertors{<encoding>}{<UTF-32 code>}.
# NOTE(review): no assignment to this hash is visible in this file --
# confirm where it is filled.
my %convertors = ();

# NOTE(review): %ucd is not referenced anywhere else in the visible code.
my %ucd = ();
# $values{<lang>}{<family>}{<country>}{<key>}: raw field text collected by
# get_fields() and consumed by print_fields().
my %values = ();
# $hashtable{<sha1>}{"<lang>_<family>_<country>.<encoding>"} = 1: groups
# outputs with identical content so make_makefile() can link them.
my %hashtable = ();
# The next three hashes are loaded by get_languages() from get_xmldata().
my %languages = ();
my %translations = ();
my %alternativemonths = ();
get_languages();

# Character-name => hex-code maps read from the POSIX charmap files.
my %utfmap = ();
$utfmap{'UTF-8'} = {};
$utfmap{'UTF-32'} = {};
get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'});
get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'});

# %keys holds the field list for the selected type.  Both hashes are tied
# to Tie::IxHash so that iteration follows insertion order.
my %keys = ();
tie(%keys, "Tie::IxHash");
tie(%hashtable, "Tie::IxHash");
79
# Locale category file name for each --type value; make_makefile() writes
# the selected entry out as FILESNAME.
my %FILESNAMES = (
	colldef		=> 'LC_COLLATE',
	ctypedef	=> 'LC_CTYPE',
	monetdef	=> 'LC_MONETARY',
	msgdef		=> 'LC_MESSAGES',
	numericdef	=> 'LC_NUMERIC',
	timedef		=> 'LC_TIME',
);
88
# Dispatch table for the "<name<source<format" field descriptors used in
# %keys.  The 'data' slot carries the locale context (l, f, c, k, e) that
# print_fields() sets before invoking a callback.
my %callback = (
	'data'		=> undef,
	'altmon'	=> \&callback_altmon,
	'cbabmon'	=> \&callback_abmon,
	'cbampm'	=> \&callback_ampm,
	'cformat'	=> \&callback_cformat,
	'dformat'	=> \&callback_dformat,
	'dtformat'	=> \&callback_dtformat,
	'mdorder'	=> \&callback_mdorder,
);
99
# Heading text written as "# <text>" above each field by print_fields().
# Every key that can appear in %keys needs an entry here; print_fields()
# dies on a missing one.
my %DESC = (

	# Fields of numericdef
	'decimal_point'		=> 'decimal_point',
	'thousands_sep'		=> 'thousands_sep',
	'grouping'		=> 'grouping',

	# Fields of monetdef
	'int_curr_symbol'	=>
	    'int_curr_symbol (last character always SPACE)',
	'currency_symbol'	=> 'currency_symbol',
	'mon_decimal_point'	=> 'mon_decimal_point',
	'mon_thousands_sep'	=> 'mon_thousands_sep',
	'mon_grouping'		=> 'mon_grouping',
	'positive_sign'		=> 'positive_sign',
	'negative_sign'		=> 'negative_sign',
	'int_frac_digits'	=> 'int_frac_digits',
	'frac_digits'		=> 'frac_digits',
	'p_cs_precedes'		=> 'p_cs_precedes',
	'p_sep_by_space'	=> 'p_sep_by_space',
	'n_cs_precedes'		=> 'n_cs_precedes',
	'n_sep_by_space'	=> 'n_sep_by_space',
	'p_sign_posn'		=> 'p_sign_posn',
	'n_sign_posn'		=> 'n_sign_posn',

	# Fields of msgdef
	'yesexpr'		=> 'yesexpr',
	'noexpr'		=> 'noexpr',
	'yesstr'		=> 'yesstr',
	'nostr'			=> 'nostr',

	# Fields of timedef
	'abmon'			=> 'Short month names',
	'mon'			=> 'Long month names (as in a date)',
	'abday'			=> 'Short weekday names',
	'day'			=> 'Long weekday names',
	't_fmt'			=> 'X_fmt',
	'd_fmt'			=> 'x_fmt',
	'c_fmt'			=> 'c_fmt',
	'am_pm'			=> 'AM/PM',
	'd_t_fmt'		=> 'date_fmt',
	'altmon'		=> 'Long month names (without case ending)',
	'md_order'		=> 'md_order',
	't_fmt_ampm'		=> 'ampm_fmt',
);
145
# Ordered field lists for the field-based definition types.  Each list is
# a sequence of key => descriptor pairs that is copied into the
# insertion-ordered %keys hash.  Descriptors understood by print_fields():
#   "i"   value written as-is
#   "ai"  value written as-is
#   "s"   one string; <Name> symbols are decoded
#   "as"  ';'-separated list of strings, one output line each
#   "<callback<source<format"
#         run $callback{callback} on the collected value of field
#         "source", store the result under this key, then treat it as
#         "format".
my %KEYS_FOR = (
	"numericdef" => [
	    "decimal_point"	=> "s",
	    "thousands_sep"	=> "s",
	    "grouping"		=> "ai",
	],
	"monetdef" => [
	    "int_curr_symbol"	=> "s",
	    "currency_symbol"	=> "s",
	    "mon_decimal_point"	=> "s",
	    "mon_thousands_sep"	=> "s",
	    "mon_grouping"	=> "ai",
	    "positive_sign"	=> "s",
	    "negative_sign"	=> "s",
	    "int_frac_digits"	=> "i",
	    "frac_digits"	=> "i",
	    "p_cs_precedes"	=> "i",
	    "p_sep_by_space"	=> "i",
	    "n_cs_precedes"	=> "i",
	    "n_sep_by_space"	=> "i",
	    "p_sign_posn"	=> "i",
	    "n_sign_posn"	=> "i",
	],
	"msgdef" => [
	    "yesexpr"		=> "s",
	    "noexpr"		=> "s",
	    "yesstr"		=> "s",
	    "nostr"		=> "s",
	],
	"timedef" => [
	    "abmon"		=> "<cbabmon<abmon<as",
	    "mon"		=> "as",
	    "abday"		=> "as",
	    "day"		=> "as",
	    "t_fmt"		=> "s",
	    "d_fmt"		=> "<dformat<d_fmt<s",
	    "c_fmt"		=> "<cformat<d_t_fmt<s",
	    "am_pm"		=> "<cbampm<am_pm<as",
	    "d_t_fmt"		=> "<dtformat<d_t_fmt<s",
	    "altmon"		=> "<altmon<mon<as",
	    "md_order"		=> "<mdorder<d_fmt<s",
	    "t_fmt_ampm"	=> "s",
	],
);

# Reject a missing or misspelled --type instead of silently doing nothing
# and exiting successfully.
if (!defined $TYPE || !defined $FILESNAMES{$TYPE}) {
	print STDERR "Unknown type '" . (defined $TYPE ? $TYPE : "") .
	    "'; expected one of: " .
	    join(", ", sort keys(%FILESNAMES)) . "\n";
	exit(1);
}

if ($TYPE eq "colldef") {
	transform_collation();
} elsif ($TYPE eq "ctypedef") {
	transform_ctypes();
} else {
	# Field-based types: load the field list in order, collect the
	# values from the source files, then write the converted drafts.
	%keys = @{$KEYS_FOR{$TYPE}};
	get_fields();
	print_fields();
}
# Every type finishes by generating its Makefile.
make_makefile();
221
sub callback_ampm {
	# Field callback for am_pm.  ru_RU gets a hard-coded AM/PM pair;
	# every other locale gets its input back unchanged.
	my ($ampm) = @_;
	my $locale = join("_", $callback{data}{l}, $callback{data}{c});
	my $enc = $callback{data}{e};

	return $ampm if ($locale ne 'ru_RU');
	return 'дп;пп' if ($enc eq 'UTF-8');

	# Any other target encoding: recode the pair from UTF-8.
	my $iconv = Text::Iconv->new("utf-8", "$enc");
	my $recoded = $iconv->convert("дп;пп");
	return $recoded;
}
237
sub callback_cformat {
	# Field callback for c_fmt: derives it from the d_t_fmt text.
	my ($fmt) = @_;
	my $locale = join("_", $callback{data}{l}, $callback{data}{c});

	# ko_KR only: insert the weekday name ahead of the AM/PM marker.
	$fmt =~ s/(> )(%p)/$1%A $2/ if ($locale eq 'ko_KR');

	# Each edit below applies to the first match only, in this order.
	for ($fmt) {
		s/\.,/\./;		# ".," becomes "."
		s/ %Z//;		# drop the time zone name
		s/ %z//;		# drop the numeric time zone
		s/^"%e\./%A %e/;	# leading day-of-month: add weekday
		s/^"(%B %e, )/"%A, $1/;
		s/^"(%e %B )/"%A $1/;
	}
	return $fmt;
}
253
sub callback_dformat {
	# Field callback for d_fmt: when %e sits directly next to %m with a
	# <SOLIDUS>, '-' or '.' between them, use %d instead of %e.
	my ($fmt) = @_;

	for ($fmt) {
		s/(%m(<SOLIDUS>|[-.]))%e/$1%d/;	# month first
		s/%e((<SOLIDUS>|[-.])%m)/%d$1/;	# day first
	}
	return $fmt;
}
261
sub callback_dtformat {
	# Field callback for d_t_fmt: locale-specific tweaks for the CJK
	# locales followed by edits applied to every locale.
	my ($fmt) = @_;
	my $locale = join("_", $callback{data}{l}, $callback{data}{c});

	if ($locale eq 'ja_JP') {
		# Weekday name goes in front of the hour.
		$fmt =~ s/(> )(%H)/$1%A $2/;
	} else {
		my $is_zh = ($locale eq 'zh_CN' || $locale eq 'zh_TW');

		# zh_CN/zh_TW switch the first %m to %_m; those two and
		# ko_KR put the weekday name in front of the AM/PM marker.
		$fmt =~ s/%m/%_m/ if ($is_zh);
		$fmt =~ s/(> )(%p)/$1%A $2/
		    if ($is_zh || $locale eq 'ko_KR');
	}

	# Edits applied to every locale, first match only, in this order.
	for ($fmt) {
		s/\.,/\./;
		s/^"%e\./%A %e/;
		s/^"(%B %e, )/"%A, $1/;
		s/^"(%e %B )/"%A $1/;
	}
	return $fmt;
}
280
sub callback_mdorder {
	# Field callback for md_order: reduce a date format to the order of
	# its 'd' and 'm' letters ('e' counts as 'd').  An undefined input
	# yields undef.
	my ($order) = @_;

	unless (defined $order) {
		return undef;
	}
	for ($order) {
		s/[^dem]//g;	# keep only the letters d, e and m
		s/e/d/g;
	}
	return $order;
}
288
sub callback_altmon {
	# Field callback for altmon.  When %alternativemonths has an entry
	# for the current language/country, return its ';'-separated names
	# with surrounding whitespace trimmed from each one; otherwise
	# repeat the mon value that was passed in.
	my ($mon) = @_;
	my $alt = $alternativemonths{$callback{data}{l}}{$callback{data}{c}};

	return $mon if (!defined $alt);

	my @names = split(";", $alt);
	for my $name (@names) {
		$name =~ s/^\s+//;
		$name =~ s/\s+$//;
	}
	return join(";", @names);
}
308
sub callback_abmon {
	# Field callback for abmon.  For the listed CJK locales, names that
	# start with a single-digit month number get a leading <space> so
	# that columns line up (style established in FreeBSD in 2001).
	my ($abmon) = @_;
	my $locale = join("_", $callback{data}{l}, $callback{data}{c});

	my %pad_locale =
	    map { $_ => 1 } qw(ja_JP ko_KR zh_CN zh_HK zh_TW);
	return $abmon if (!$pad_locale{$locale});

	my @names = split(";", $abmon);
	for my $name (@names) {
		# <two>..<nine>, or <one> not followed by <zero>, <one>
		# or <two> (which would make it 10, 11 or 12).
		my $single_digit =
		    $name =~ /^"<(two|three|four|five|six|seven|eight|nine)>/ ||
		    ($name =~ /^"<one>/ && $name !~ /^"<one>(<zero>|<one>|<two>)/);
		$name =~ s/^"/"<space>/ if ($single_digit);
	}
	return join(";", @names);
}
332
333############################
334
sub get_utfmap {
	# Load the CHARMAP section of a POSIX charmap file into a hash.
	#
	#   $file  path of the charmap file
	#   $db    hash reference to fill: symbol name (without the angle
	#          brackets) => code with every "\x" removed, for example
	#          "\xE2\x82\xAC" is stored as "E282AC"
	#
	# Only lines between "CHARMAP" and "END CHARMAP" are used; blank
	# lines and lines starting with '#' are ignored.  If the file
	# cannot be opened a message goes to STDERR and $db is left as is.
	my ($file, $db) = @_;

	my $fin;
	if (!open($fin, '<', $file)) {
		print STDERR "Cannot open $file: $!\n";
		return;
	}
	my @lines = <$fin>;
	close($fin);
	chomp(@lines);

	my $incharmap = 0;
	foreach my $l (@lines) {
		next if ($l =~ /^\#/);
		next if ($l eq "");

		if ($l eq "CHARMAP") {
			$incharmap = 1;
			next;
		}

		next if (!$incharmap);
		last if ($l eq "END CHARMAP");

		# Skip anything that is not "<name> code"; reading $1/$2
		# after a failed match would store a bogus entry.
		next if ($l !~ /^<([^\s]+)>\s+(.*)/);
		my ($k, $v) = ($1, $2);
		$v =~ s/\\x//g;		# UTF-8 char code
		$db->{$k} = $v;
	}
	return;
}
373
sub resolve_enc_addition {
	# Takes a '+'-separated list of codes, strips one leading "0x" or
	# "0X" from each item and returns the items concatenated.
	my @codes = split(/\+/, $_[0]);

	s/^0[xX]// for @codes;
	return join('', @codes);
}
383
sub get_languages {
	# Load the locale description returned by get_xmldata() for
	# $ETCDIR and copy its L, T and AM sections into the file-wide
	# hashes %languages, %translations and %alternativemonths.
	my %xml = get_xmldata($ETCDIR);
	my ($lang, $trans, $altmon) = @xml{qw(L T AM)};

	%languages = %{$lang};
	%translations = %{$trans};
	%alternativemonths = %{$altmon};
}
390
# Write the LC_CTYPE draft sources into "$TYPE.draft/" for every
# language/family/country in %languages, one file per encoding, and record
# a SHA-1 of each output in %languages and %hashtable so make_makefile()
# can detect identical files.
sub transform_ctypes {
	# Add the C.UTF-8
	# (family and country "x" are left out of the file name below)
	$languages{"C"}{"x"}{data}{"x"}{$DEFENCODING} = undef;

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		# Skip locales whose definitions list does not mention
		# this type.
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file = $l;
		$file .= "_" . $f if ($f ne "x");
		$file .= "_" . $c if ($c ne "x");
		my $actfile = $file;

		# The UTF-8 output of every locale is a verbatim copy of
		# this one common source file.
		my $filename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
		if (! -f $filename) {
			print STDERR "Cannot open $filename\n";
			next;
		}
		open(FIN, "$filename");
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines;
		my $shex;
		my $uhex;
		while (<FIN>) {
			push @lines, $_;
		}
		close(FIN);
		# The status value is now replaced by the content hash.
		$shex = sha1_hex(join("\n", @lines));
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
		$hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;
		open(FOUT, ">$TYPE.draft/$actfile.$DEFENCODING.src");
		print FOUT @lines;
		close(FOUT);
		# All remaining encodings are extracted from the locale's
		# own source file.
		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			next if ($enc eq $DEFENCODING);
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
			if ($file eq 'ja_JP') {
				# Override $filename for ja_JP because
				# its CTYPE is not compatible with UTF-8.
				$filename = "$UNIDIR/posix/$file.eucJP.src";
			}
			if (! -f $filename) {
				print STDERR "Cannot open $filename\n";
				next;
			}
			@lines = ();
			open(FIN, "$filename");
			while (<FIN>) {
				# Keep the comment_char/escape_char settings
				# and the whole LC_CTYPE section, including
				# its opening and closing lines.
				if ((/^comment_char\s/) || (/^escape_char\s/)){
					push @lines, $_;
				}
				if (/^LC_CTYPE/../^END LC_CTYPE/) {
					push @lines, $_;
				}
			}
			close(FIN);
			# The encoding name is part of the hash, so equal
			# text in different encodings hashes differently.
			$uhex = sha1_hex(join("\n", @lines) . $enc);
			$languages{$l}{$f}{data}{$c}{$enc} = $uhex;
			$hashtable{$uhex}{"${l}_${f}_${c}.$enc"} = 1;
			open(FOUT, ">$TYPE.draft/$actfile.$enc.src");
			print FOUT <<EOF;
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
			print FOUT @lines;
			close(FOUT);
		}
	}
	}
	}
}
467
468
# Write the LC_COLLATE draft sources into "$TYPE.draft/".  The UTF-8 file
# of each locale is extracted from its source file; the file for every
# other encoding is a filtered copy of that UTF-8 draft with the lines
# that %convertors has no mapping for left out.  Also loads $CLDR_VERSION.
sub transform_collation {
	# Read the CLDR version
	# (the whole file in one read; trailing whitespace is removed)
	open(FIN, "$UNIDIR/cldr-version") or die "Cannot open cldr-version";
	read FIN, $CLDR_VERSION, -s FIN;
	close(FIN);
	$CLDR_VERSION =~ s/\s*$//;

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		# Skip locales whose definitions list does not mention
		# this type.
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file;
		$file = $l . "_";
		$file .= $f . "_" if ($f ne "x");
		$file .= $c;
		# Output name; stays the locale's own name even when the
		# fallback source is used below.
		my $actfile = $file;

		# Source lookup order: $UNIDIR/posix, then $ETCDIR, then
		# the configured fallback locale in $UNIDIR/posix.
		my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		$filename = "$ETCDIR/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename
		 && defined $languages{$l}{$f}{fallback}) {
			$file = $languages{$l}{$f}{fallback};
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		}
		$filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename) {
			print STDERR
			    "Cannot open $file.$DEFENCODING.src or fallback\n";
			next;
		}
		open(FIN, "$filename");
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines;
		my $shex;
		while (<FIN>) {
			# Keep the comment_char/escape_char settings and the
			# LC_COLLATE section; runs of spaces inside the
			# section are squeezed to a single space.
			if ((/^comment_char\s/) || (/^escape_char\s/)){
				push @lines, $_;
			}
			if (/^LC_COLLATE/../^END LC_COLLATE/) {
				$_ =~ s/[ ]+/ /g;
				push @lines, $_;
			}
		}
		close(FIN);
		# The status value is now replaced by the content hash.
		$shex = sha1_hex(join("\n", @lines));
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
		$hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;
		open(FOUT, ">$TYPE.draft/$actfile.$DEFENCODING.src");
		print FOUT <<EOF;
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
		print FOUT @lines;
		close(FOUT);

		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			next if ($enc eq $DEFENCODING);

			# Re-read the UTF-8 draft written above and copy it
			# line by line into the draft for $enc.
			open FIN, "<$TYPE.draft/$actfile.$DEFENCODING.src";
			open FOUT, ">$TYPE.draft/$actfile.$enc.src";
			# True while between order_start and order_end.
			my $order_start = 0;
			# Per line: whether the current line is copied.
			my $print_p = 0;
			#
			# %c_elem: collation elements
			#
			#   undef: not defined
			#   1: defined
			#   2: invalid in this encoding
			#
			my %c_elem = ();
			while (<FIN>) {	# XXX: this loop should be refactored.
				chomp;
				$print_p = 1;
				if ($order_start) {
					$order_start = 0 if (m/^order_end/);
					# An order line starting with <name>:
					# a name not declared as collating
					# element/symbol must be a character
					# known to the UTF-32 map (otherwise
					# die) and is dropped when $enc has
					# no mapping for it; a declared
					# element marked invalid is dropped.
					if (m/^<([^>]+)>/) {
						if (not defined $c_elem{$1}) {
#							print STDERR "$1:\n";

							my $u32 = $utfmap{'UTF-32'}->{$1};
							die "order, $1\n" if (not defined $u32);
#							print STDERR "u32 for $1 = $u32\n";
							if (not defined $convertors{$enc}{$u32}) {
#								print STDERR "$1 - $u32 not defined in $enc\n";
								$print_p = 0;
							}
						} elsif ($c_elem{$1} == 2) {
#							print STDERR "$1 is marked as invalid in $enc\n";
							$print_p = 0;
						}
					}
				} elsif (m/^collating-element/) {
					# NOTE: this $l is a new lexical that
					# shadows the language loop variable
					# inside this branch only; it holds
					# the text after "from".
					my ($elem, $l);
					if (m/<([^>]+)> from (.+)/) {
						($elem, $l) = ($1, $2);
					}
#					print STDERR "$elem: enter ($print_p, $l,)\n";
					# Check every <name> the element is
					# built from; the first one without a
					# true %convertors entry marks the
					# element invalid and ends the loop.
					while ($print_p and
					    defined $l and
					    $l =~ m/<([^>]+)>/g) {
#						print STDERR "$elem: $1\n";
						my $u32 = $utfmap{'UTF-32'}->{$1};
						die "collating-element, $1\n" if (not defined $u32);
#						print STDERR "u32 for $1 = $u32\n";
						if (not $convertors{$enc}{$u32}) {
#							print STDERR "$1 - $u32 not defined in $enc\n";
							$print_p = 0;
#							print STDERR "Mark $elem as invalid\n";
							$c_elem{$elem} = 2;
						}
					}
					if ($print_p) {
#						print STDERR "Add $elem\n";
						$c_elem{$elem} = 1;
					}
				} elsif (m/^collating-symbol <([^>]+)>/) {
#					print STDERR "Add $1\n";
					$c_elem{$1} = 1;
				} elsif (m/^order_start/) {
					$order_start = 1;
					# do nothing
				}
				print FOUT $_, "\n" if ($print_p);
			}
			close FOUT;
			close FIN;
			# The hash of the UTF-8 lines is recorded for this
			# encoding as well, not a hash of the filtered file.
			$languages{$l}{$f}{data}{$c}{$enc} = $shex;
			$hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1;
		}
	}
	}
	}
}
609
# Collect the raw text of every field named in %keys from each locale's
# UTF-8 source file into $values{<lang>}{<family>}{<country>}{<key>}.
sub get_fields {
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		# Skip locales whose definitions list does not mention
		# this type.
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);

		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file;
		$file = $l . "_";
		$file .= $f . "_" if ($f ne "x");
		$file .= $c;

		# Source lookup order: $UNIDIR/posix, then $ETCDIR, then
		# the configured fallback locale in $UNIDIR/posix.
		my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		$filename = "$ETCDIR/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename
		 && defined $languages{$l}{$f}{fallback}) {
			$file = $languages{$l}{$f}{fallback};
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		}
		$filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename) {
			print STDERR
			    "Cannot open $file.$DEFENCODING.src or fallback\n";
			next;
		}
		open(FIN, "$filename");
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines = <FIN>;
		chomp(@lines);
		close(FIN);
		# True while the previous line ended in the continuation
		# marker "/".  It is declared outside the key loop, so it
		# is not reset between keys.
		my $continue = 0;
		foreach my $k (keys(%keys)) {
			foreach my $line (@lines) {
				# $line aliases the @lines element, so the
				# edits below change the stored line too.
				$line =~ s/\r//;
				# Look for "<key><whitespace>" unless we are
				# inside a continued value.
				next if (!$continue && $line !~ /^$k\s/);
				if ($continue) {
					$line =~ s/^\s+//;
				} else {
					$line =~ s/^$k\s+//;
				}

				$values{$l}{$f}{$c}{$k} = ""
					if (!defined $values{$l}{$f}{$c}{$k});

				# A trailing "/" means the value goes on in
				# the next line; the marker itself is dropped.
				$continue = ($line =~ /\/$/);
				$line =~ s/\/$// if ($continue);

#				while ($line =~ /_/) {
#					$line =~
#					    s/\<([^>_]+)_([^>]+)\>/<$1 $2>/;
#				}
#				die "_ in data - $line" if ($line =~ /_/);
				$values{$l}{$f}{$c}{$k} .= $line;

				# Only the first occurrence of a key is used.
				last if (!$continue);
			}
		}
	}
	}
	}
}
675
sub decodecldr {
	# Convert one character symbol name into the character's byte
	# sequence in the requested encoding.
	#
	#   $e  target encoding name
	#   $s  symbol name (the text between '<' and '>')
	#
	# Returns a string of one to four bytes.  Dies when no code can be
	# found for the name or when the code is empty or longer than four
	# bytes.
	my $e = shift;
	my $s = shift;

	my $v = undef;

	if ($e eq "UTF-8") {
		#
		# Conversion to UTF-8 can be done from the Unicode name to
		# the UTF-8 character code.
		#
		$v = $utfmap{'UTF-8'}->{$s};
		die "Cannot convert $s in $e (charmap)" if (!defined $v);
	} else {
		#
		# Conversion to these encodings can be done from the Unicode
		# name to Unicode code to the encodings code.
		#
		# hex - hex or string attr
		# unicode - unicode attr
		# ucc - ucc attr
		my $hex = $translations{$e}{$s}{hex};
		my $ucc = $utfmap{'UTF-32'}->{$s};
		my $ucc_attr = $translations{$e}{$s}{ucc};
		my $unicode = $translations{$e}{$s}{unicode};

		if (defined $hex) {		# hex is in local encoding
			$v = $hex;
		} elsif (defined $unicode) {	# unicode is in name
			$v = $convertors{$e}{$utfmap{'UTF-32'}->{$unicode}};
		} elsif (defined $ucc_attr) {	# ucc is in code point
			# An explicit ucc attribute takes precedence over
			# the code found in the UTF-32 map.
			# normalize
			$ucc_attr = sprintf("%08X", hex($ucc_attr));
			$v = $convertors{$e}{$ucc_attr};
		} elsif (defined $ucc) {
			# normalize
			$ucc = sprintf("%08X", hex($ucc));
			$v = $convertors{$e}{$ucc};
		}
		die "Cannot convert $s in $e" if (!defined $v);
	}

	# Drop leading zero digits, then pad back to whole bytes.
	$v =~ s/^[0]+//g;
	$v = "0" . $v if (length($v) % 2);

	# One to four bytes (two to eight hex digits) are supported.
	my $len = length($v);
	die "Cannot convert $s in $e (length = $len)\n"
		if ($len < 2 || $len > 8);

	# Turn each pair of hex digits into one byte.
	return pack("C*", map { hex($_) } unpack("(A2)*", $v));
}
734
sub translate {
	# Look up the %translations entry for symbol $v in encoding $enc;
	# returns undef when there is none.
	my ($enc, $v) = @_;

	return defined $translations{$enc}{$v}
	    ? $translations{$enc}{$v}
	    : undef;
}
742
# Write one draft file per locale and encoding from the values collected
# by get_fields().  Each file is written as "<name>.<enc>.new" and then
# renamed to ".src" on success or ".failed" otherwise.  The SHA-1 of the
# generated text is recorded in %languages and %hashtable.
sub print_fields {
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		# Skip locales whose definitions list does not mention
		# this type.
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			# get_fields() leaves 0 here when no source file
			# was found for the locale.
			if ($languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
				print "Skipping ${l}_" .
				    ($f eq "x" ? "" : "${f}_") .
				    "${c} - not read\n";
				next;
			}
			my $file = $l;
			$file .= "_" . $f if ($f ne "x");
			$file .= "_" . $c;
			print "Writing to $file in $enc\n";

			# Non-UTF-8 output needs a %convertors entry for
			# the encoding.
			if ($enc ne $DEFENCODING &&
			    !defined $convertors{$enc}) {
				print "Failed! Cannot convert to $enc.\n";
				next;
			};

			open(FOUT, ">$TYPE.draft/$file.$enc.new");
			my $okay = 1;
			# Everything below the header is built up here so
			# that it can be hashed before it is written.
			my $output = "";
			print FOUT <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
			# %keys is tied to Tie::IxHash, so the fields come
			# out in the order they were defined.
			foreach my $k (keys(%keys)) {
				# $g is the field's format descriptor.
				my $g = $keys{$k};

				die("Unknown $k in \%DESC")
					if (!defined $DESC{$k});

				$output .= "#\n# $DESC{$k}\n";

				# Replace one row with another
				# (descriptor ">other": use that key and its
				# descriptor instead)
				if ($g =~ /^>/) {
					$k = substr($g, 1);
					$g = $keys{$k};
				}

				# Callback function
				# Descriptor "<name<source<format": publish
				# the locale context in $callback{data}, run
				# $callback{name} on the collected value of
				# field "source", store the result under this
				# key and carry on with "format".
				if ($g =~ /^\</) {
					$callback{data}{c} = $c;
					$callback{data}{k} = $k;
					$callback{data}{f} = $f;
					$callback{data}{l} = $l;
					$callback{data}{e} = $enc;
					my @a = split(/\</, substr($g, 1));
					my $rv =
					    &{$callback{$a[0]}}($values{$l}{$f}{$c}{$a[1]});
					$values{$l}{$f}{$c}{$k} = $rv;
					$g = $a[2];
					# Clear the context again; the empty
					# list assigns undef here.
					$callback{data} = ();
				}

				my $v = $values{$l}{$f}{$c}{$k};
				# A missing value is written as the literal
				# text "undef".
				$v = "undef" if (!defined $v);

				# "i" and "ai": written as-is.
				if ($g eq "i") {
					$output .= "$v\n";
					next;
				}
				if ($g eq "ai") {
					$output .= "$v\n";
					next;
				}
				# "s": strip the surrounding double quotes,
				# then replace each <Name> symbol, leftmost
				# first, by its character in $enc.
				if ($g eq "s") {
					$v =~ s/^"//;
					$v =~ s/"$//;
					my $cm = "";
					while ($v =~ /^(.*?)<(.*?)>(.*)/) {
						my $p1 = $1;
						$cm = $2;
						my $p3 = $3;

						my $rv = decodecldr($enc, $cm);
#						$rv = translate($enc, $cm)
#							if (!defined $rv);
						# NOTE(review): decodecldr()
						# as defined in this file dies
						# rather than returning undef.
						# If this branch ever ran, the
						# "next" would retest the
						# unchanged $v.
						if (!defined $rv) {
							print STDERR
"Could not convert $k ($cm) from $DEFENCODING to $enc\n";
							$okay = 0;
							next;
						}

						$v = $p1 . $rv . $p3;
					}
					$output .= "$v\n";
					next;
				}
				# "as": same treatment for every item of a
				# ';'-separated list, one output line each.
				if ($g eq "as") {
					foreach my $v (split(/;/, $v)) {
						$v =~ s/^"//;
						$v =~ s/"$//;
						my $cm = "";
						while ($v =~ /^(.*?)<(.*?)>(.*)/) {
							my $p1 = $1;
							$cm = $2;
							my $p3 = $3;

							my $rv =
							    decodecldr($enc,
								$cm);
#							$rv = translate($enc,
#							    $cm)
#							    if (!defined $rv);
							if (!defined $rv) {
								print STDERR
"Could not convert $k ($cm) from $DEFENCODING to $enc\n";
								$okay = 0;
								next;
							}

							# Unlike the "s" case
							# this reads $1/$3
							# directly; $p1 and
							# $p3 are unused here.
							$v = $1 . $rv . $3;
						}
						$output .= "$v\n";
					}
					next;
				}

				# Any other descriptor is a programming error.
				die("$k is '$g'");

			}

			# The status value is now replaced by the content
			# hash; the header is not part of the hash.
			$languages{$l}{$f}{data}{$c}{$enc} = sha1_hex($output);
			$hashtable{sha1_hex($output)}{"${l}_${f}_${c}.$enc"} = 1;
			print FOUT "$output# EOF\n";
			close(FOUT);

			if ($okay) {
				rename("$TYPE.draft/$file.$enc.new",
				    "$TYPE.draft/$file.$enc.src");
			} else {
				rename("$TYPE.draft/$file.$enc.new",
				    "$TYPE.draft/$file.$enc.failed");
			}
		}
	}
	}
	}
}
891
# Write "$TYPE.draft/Makefile": the build rule for the selected type, a
# "SAME+=" line for every output that is identical to another one, a
# "LOCALES+=" line for every remaining output, and the legacy links
# configured in %languages.
sub make_makefile {
	print "Creating Makefile for $TYPE\n";
	# $SRCOUT:  command of the .src suffix rule
	# $SRCOUT2: suffix of the generated files
	# $SRCOUT3: extra rules placed before the final .include
	# $SRCOUT4: extra marker line placed after "## PLACEHOLDER"
	# $MAPLOC:  MAPLOC= assignment, or empty when not needed
	my $SRCOUT;
	my $SRCOUT2;
	my $SRCOUT3 = "";
	my $SRCOUT4 = "";
	my $MAPLOC;
	if ($TYPE eq "colldef") {
		# In future, we might want to try to put the CLDR version into
		# the .src files with some new syntax, instead of the makefile.
		$SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U " .
			"-i \${.IMPSRC} \\\n" .
			"\t-V \${CLDR_VERSION} \\\n" .
			"\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} " .
			"\${.OBJDIR}/\${.IMPSRC:T:R}";
		$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
				"locale/etc/final-maps\n";
		$SRCOUT2 = "LC_COLLATE";
		$SRCOUT3 = "" .
			".for f t in \${LOCALES_MAPPED}\n" .
			"FILES+=\t\$t.LC_COLLATE\n" .
			"FILESDIR_\$t.LC_COLLATE=\t\${LOCALEDIR}/\$t\n" .
			"\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" .
			"\tlocaledef \${LOCALEDEF_ENDIAN} -D -U " .
			"-i \${.ALLSRC} \\\n" .
			"\t-V \${CLDR_VERSION} \\\n" .
			"\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} \\\n" .
			"\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" .
			".endfor\n\n";
		$SRCOUT4 = "## LOCALES_MAPPED\n";
	}
	elsif ($TYPE eq "ctypedef") {
		$SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U -c " .
			"-w \${MAPLOC}/widths.txt \\\n" .
			"\t-f \${MAPLOC}/map.\${.IMPSRC:T:R:E} " .
			"\\\n\t-i \${.IMPSRC} \${.OBJDIR}/\${.IMPSRC:T:R} " .
			" || true";
		$SRCOUT2 = "LC_CTYPE";
		$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
				"locale/etc/final-maps\n";
		$SRCOUT3 = "## SYMPAIRS\n\n" .
			".for s t in \${SYMPAIRS}\n" .
			"\${t:S/src\$/LC_CTYPE/}: " .
			"\$s\n" .
			"\tlocaledef \${LOCALEDEF_ENDIAN} -D -U -c " .
			"-w \${MAPLOC}/widths.txt \\\n" .
			"\t-f \${MAPLOC}/map.\${.TARGET:T:R:C/^.*\\.//} " .
			"\\\n\t-i \${.ALLSRC} \${.OBJDIR}/\${.TARGET:T:R} " .
			" || true\n" .
			".endfor\n\n";
	}
	else {
		# Field-based types: the output is the draft source with
		# the lines that are "#" alone or start with "# " removed.
		$SRCOUT = "grep -v -E '^(\#\$\$|\#[ ])' < \${.IMPSRC} > \${.TARGET}";
		$SRCOUT2 = "out";
		$MAPLOC = "";
	}
	open(FOUT, ">$TYPE.draft/Makefile");
	print FOUT <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale.

PACKAGE=	locales
LOCALEDIR=	\${SHAREDIR}/locale
FILESNAME=	$FILESNAMES{$TYPE}
.SUFFIXES:	.src .${SRCOUT2}
${MAPLOC}
EOF

	if ($TYPE eq "colldef") {
		print FOUT <<EOF;
CLDR_VERSION=	"${CLDR_VERSION}"

EOF
	}

	if ($TYPE eq "colldef" || $TYPE eq "ctypedef") {
		print FOUT <<EOF;
.include <bsd.endian.mk>

EOF
	}

	print FOUT <<EOF;
.src.${SRCOUT2}:
	$SRCOUT

## PLACEHOLDER

${SRCOUT4}

EOF

	# Each %hashtable entry lists the outputs that share one content
	# hash.  The entry that sorts first is kept as the real file; all
	# others become links to it.
	foreach my $hash (keys(%hashtable)) {
		# For colldef, weight LOCALES to UTF-8
		#     Sort as upper-case and reverse to achieve it
		#     Make en_US, ru_RU, and ca_AD preferred
		my @files;
		if ($TYPE eq "colldef") {
			@files = sort {
				if ($a eq 'en_x_US.UTF-8' ||
				    $a eq 'ru_x_RU.UTF-8' ||
				    $a eq 'ca_x_AD.UTF-8') { return -1; }
				elsif ($b eq 'en_x_US.UTF-8' ||
				       $b eq 'ru_x_RU.UTF-8' ||
				       $b eq 'ca_x_AD.UTF-8') { return 1; }
				else { return uc($b) cmp uc($a); }
				} keys(%{$hashtable{$hash}});
		} elsif ($TYPE eq "ctypedef") {
			# Preference order: C.UTF-8, then en_US, then
			# en_GB.ISO8859-15 or ru_RU, then reverse
			# case-insensitive name order.
			@files = sort {
				if ($a eq 'C_x_x.UTF-8') { return -1; }
				elsif ($b eq 'C_x_x.UTF-8') { return 1; }
				if ($a =~ /^en_x_US/) { return -1; }
				elsif ($b =~ /^en_x_US/) { return 1; }

				if ($a =~ /^en_x_GB.ISO8859-15/ ||
				    $a =~ /^ru_x_RU/) { return -1; }
				elsif ($b =~ /^en_x_GB.ISO8859-15/ ||
				       $b =~ /ru_x_RU/) { return 1; }
				else { return uc($b) cmp uc($a); }

				} keys(%{$hashtable{$hash}});
		} else {
			# Names containing "_Comm_" sort last and
			# en_x_US.UTF-8 sorts first; otherwise reverse
			# case-insensitive name order.
			@files = sort {
				if ($a =~ /_Comm_/ ||
				    $b eq 'en_x_US.UTF-8') { return 1; }
				elsif ($b =~ /_Comm_/ ||
				       $a eq 'en_x_US.UTF-8') { return -1; }
				else { return uc($b) cmp uc($a); }
				} keys(%{$hashtable{$hash}});
		}
		# Only groups with more than one member produce links.
		if ($#files > 0) {
			my $link = shift(@files);
			$link =~ s/_x_x//;	# special case for C
			$link =~ s/_x_/_/;	# strip family if none there
			foreach my $file (@files) {
				# Split "lang_family_country.encoding" into
				# its parts to address the %languages entry.
				my @a = split(/_/, $file);
				my @b = split(/\./, $a[-1]);
				$file =~ s/_x_/_/;
				print FOUT "SAME+=\t\t$link $file\n";
				# Undefined entries are left out of LOCALES
				# in the loop below.
				undef($languages{$a[0]}{$a[1]}{data}{$b[0]}{$b[1]});
			}
		}
	}

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		# Skip locales whose definitions list does not mention
		# this type.
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		if (defined $languages{$l}{$f}{data}{$c}{$DEFENCODING}
		 && $languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
			print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") .
			    "${c} - not read\n";
			next;
		}
		# One LOCALES entry per encoding that still has a value.
		foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			my $file = $l;
			$file .= "_" . $f if ($f ne "x");
			$file .= "_" . $c if ($c ne "x");
			next if (!defined $languages{$l}{$f}{data}{$c}{$e});
			print FOUT "LOCALES+=\t$file.$e\n";
		}

		# nc_link: an additional locale name that is linked to
		# this locale in every encoding.
		if (defined $languages{$l}{$f}{nc_link}) {
			foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;
				print FOUT "SAME+=\t\t$file.$e $languages{$l}{$f}{nc_link}.$e\t# legacy (lang/country change)\n";
			}
		}

		# e_link: space-separated "from:to" encoding pairs that are
		# linked within this locale.
		if (defined $languages{$l}{$f}{e_link}) {
			foreach my $el (split(" ", $languages{$l}{$f}{e_link})) {
				my @a = split(/:/, $el);
				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;
				print FOUT "SAME+=\t\t$file.$a[0] $file.$a[1]\t# legacy (same charset)\n";
			}
		}

	}
	}
	}

	print FOUT <<EOF;

FILES=		\${LOCALES:S/\$/.${SRCOUT2}/}
CLEANFILES=	\${FILES}

.for f t in \${SAME}
DIRS+=		LOCALEDIR_\$t
LOCALEDIR_\$t=	\${LOCALEDIR}/\$t
LOCALEDIR_\$tPACKAGE=	locales
SYMLINKS+=	../\$f/\${FILESNAME} \\
    \${LOCALEDIR}/\$t/\${FILESNAME}
.endfor

.for f in \${LOCALES}
FILESDIR_\${f}.${SRCOUT2}= \${LOCALEDIR}/\${f}
.endfor

${SRCOUT3}.include <bsd.prog.mk>
EOF

	close(FOUT);
}
1100