xref: /freebsd/tools/tools/locale/tools/cldr2def.pl (revision 7cc42f6d)
1#!/usr/local/bin/perl -wC
2
3# SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4#
5# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
6# Copyright 2015 John Marino <draco@marino.st>
7#
8# Redistribution and use in source and binary forms, with or without
9# modification, are permitted provided that the following conditions
10# are met:
11# 1. Redistributions of source code must retain the above copyright
12#    notice, this list of conditions and the following disclaimer.
13# 2. Redistributions in binary form must reproduce the above copyright
14#    notice, this list of conditions and the following disclaimer in the
15#    documentation and/or other materials provided with the distribution.
16#
17# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27# SUCH DAMAGE.
28#
29# $FreeBSD$
30
31use strict;
32use File::Copy;
33use XML::Parser;
34use Tie::IxHash;
35use Text::Iconv;
36#use Data::Dumper;
37use Getopt::Long;
38use Digest::SHA qw(sha1_hex);
39require "charmaps.pm";
40
41
# Encoding of the CLDR source data; it is also used as a component of the
# source and output file names.
my $DEFENCODING = "UTF-8";

# Command line options: all three are mandatory.
my $UNIDIR = undef;	# --unidir
my $ETCDIR = undef;	# --etc
my $TYPE = undef;	# --type

my $result = GetOptions (
		"unidir=s"	=> \$UNIDIR,
		"etc=s"		=> \$ETCDIR,
		"type=s"	=> \$TYPE,
	    );

# Validate after parsing rather than by counting raw arguments: this also
# catches unknown options and options that were given without the others.
if (!$result || !defined $UNIDIR || !defined $ETCDIR || !defined $TYPE) {
	print "Usage: $0 --unidir=<unidir> --etc=<etcdir> --type=<type>\n";
	exit(1);
}
58
# File-level tables shared by the subroutines in this file.  They are
# declared before the loader calls that fill them.
my %convertors;

my (%ucd, %values, %hashtable);
my (%languages, %translations, %encodings, %alternativemonths);
get_languages();

my (%utf8map, %utf8aliases);
get_unidata($UNIDIR);
get_utf8map("$UNIDIR/posix/$DEFENCODING.cm");
get_encodings("$ETCDIR/charmaps");

# Both of these hashes are tied to Tie::IxHash.
my %keys;
tie(%hashtable, "Tie::IxHash");
tie(%keys, "Tie::IxHash");
79
# Definition type (the --type value) => locale category file name.
my %FILESNAMES = (
	monetdef	=> "LC_MONETARY",
	timedef		=> "LC_TIME",
	msgdef		=> "LC_MESSAGES",
	numericdef	=> "LC_NUMERIC",
	colldef		=> "LC_COLLATE",
	ctypedef	=> "LC_CTYPE",
);

# Callback name (as used in the "<name<key<type" field formats) => handler.
# The "data" slot is filled in by print_fields() around each call.
my %callback = (
	mdorder		=> \&callback_mdorder,
	altmon		=> \&callback_altmon,
	cformat		=> \&callback_cformat,
	dformat		=> \&callback_dformat,
	dtformat	=> \&callback_dtformat,
	cbabmon		=> \&callback_abmon,
	cbampm		=> \&callback_ampm,
	data		=> undef,
);

# Field keyword => description written as a comment above the field.
my %DESC = (
	# Fields whose description is simply the keyword itself
	# (numericdef, monetdef, msgdef and two timedef fields).
	(map { ($_ => $_) } qw(
	    decimal_point thousands_sep grouping
	    currency_symbol mon_decimal_point mon_thousands_sep mon_grouping
	    positive_sign negative_sign int_frac_digits frac_digits
	    p_cs_precedes p_sep_by_space n_cs_precedes n_sep_by_space
	    p_sign_posn n_sign_posn
	    yesexpr noexpr yesstr nostr
	    c_fmt md_order
	)),

	# monetdef
	int_curr_symbol	=> "int_curr_symbol (last character always SPACE)",

	# timedef
	abmon		=> "Short month names",
	mon		=> "Long month names (as in a date)",
	abday		=> "Short weekday names",
	day		=> "Long weekday names",
	t_fmt		=> "X_fmt",
	d_fmt		=> "x_fmt",
	am_pm		=> "AM/PM",
	d_t_fmt		=> "date_fmt",
	altmon		=> "Long month names (without case ending)",
	t_fmt_ampm	=> "ampm_fmt",
);
145
# Run the generator selected by --type.
#
# For the field based types, %keys lists the fields in output order together
# with a format consumed by print_fields():
#   "i" / "ai"	value is written as read
#   "s"		one string; <NAME> tokens are converted to the encoding
#   "as"	";" separated list of such strings
#   "<cb<key<fmt"	run callback "cb" on the value read for "key", then
#		treat the result as format "fmt"
die "No --type given\n" if (!defined $TYPE);

if ($TYPE eq "colldef") {
	transform_collation();
	make_makefile();
}
elsif ($TYPE eq "ctypedef") {
	transform_ctypes();
	make_makefile();
}
elsif ($TYPE eq "numericdef") {
	%keys = (
	    "decimal_point"	=> "s",
	    "thousands_sep"	=> "s",
	    "grouping"		=> "ai",
	);
	get_fields();
	print_fields();
	make_makefile();
}
elsif ($TYPE eq "monetdef") {
	%keys = (
	    "int_curr_symbol"	=> "s",
	    "currency_symbol"	=> "s",
	    "mon_decimal_point"	=> "s",
	    "mon_thousands_sep"	=> "s",
	    "mon_grouping"	=> "ai",
	    "positive_sign"	=> "s",
	    "negative_sign"	=> "s",
	    "int_frac_digits"	=> "i",
	    "frac_digits"	=> "i",
	    "p_cs_precedes"	=> "i",
	    "p_sep_by_space"	=> "i",
	    "n_cs_precedes"	=> "i",
	    "n_sep_by_space"	=> "i",
	    "p_sign_posn"	=> "i",
	    "n_sign_posn"	=> "i"
	);
	get_fields();
	print_fields();
	make_makefile();
}
elsif ($TYPE eq "msgdef") {
	%keys = (
	    "yesexpr"		=> "s",
	    "noexpr"		=> "s",
	    "yesstr"		=> "s",
	    "nostr"		=> "s"
	);
	get_fields();
	print_fields();
	make_makefile();
}
elsif ($TYPE eq "timedef") {
	%keys = (
	    "abmon"		=> "<cbabmon<abmon<as",
	    "mon"		=> "as",
	    "abday"		=> "as",
	    "day"		=> "as",
	    "t_fmt"		=> "s",
	    "d_fmt"		=> "<dformat<d_fmt<s",
	    "c_fmt"		=> "<cformat<d_t_fmt<s",
	    "am_pm"		=> "<cbampm<am_pm<as",
	    "d_t_fmt"		=> "<dtformat<d_t_fmt<s",
	    "altmon"		=> "<altmon<mon<as",
	    "md_order"		=> "<mdorder<d_fmt<s",
	    "t_fmt_ampm"	=> "s",
	);
	get_fields();
	print_fields();
	make_makefile();
}
else {
	# Previously an unrecognised type fell through every test and the
	# script exited successfully without producing anything.
	die "Unknown type '$TYPE'\n";
}
221
sub callback_ampm {
	# Replace the AM/PM strings for ru_RU with a fixed pair; every other
	# locale keeps the value that was passed in.  For encodings other
	# than UTF-8 the fixed pair is converted with Text::Iconv.
	my $ampm = shift;
	my $locale = join("_", $callback{data}{l}, $callback{data}{c});
	my $enc = $callback{data}{e};

	return $ampm if ($locale ne 'ru_RU');

	if ($enc eq 'UTF-8') {
		$ampm = 'дп;пп';
		return $ampm;
	}

	my $iconv = Text::Iconv->new("utf-8", "$enc");
	$ampm = $iconv->convert("дп;пп");
	return $ampm;
}
237
sub callback_cformat {
	# Derive c_fmt from the date/time format passed in: drop the time
	# zone conversions and add the weekday (%A) to the known layouts.
	my $fmt = shift;
	my $locale = join("_", $callback{data}{l}, $callback{data}{c});

	for ($fmt) {
		s/(> )(%p)/$1%A $2/ if ($locale eq 'ko_KR');
		s/\.,/\./;
		s/ %Z//;
		s/ %z//;
		s/^"%e\./%A %e/;
		s/^"(%B %e, )/"%A, $1/;
		s/^"(%e %B )/"%A $1/;
	}
	return $fmt;
}
253
sub callback_dformat {
	# In a numeric date, replace %e by %d when it is directly next to
	# %m, separated by <SOLIDUS>, "-" or ".".
	my ($fmt) = @_;

	for ($fmt) {
		s/(%m(<SOLIDUS>|[-.]))%e/$1%d/;		# month first
		s/%e((<SOLIDUS>|[-.])%m)/%d$1/;		# day first
	}
	return $fmt;
}
261
sub callback_dtformat {
	# Adjust the date/time format: locale specific tweaks for ja_JP,
	# ko_KR, zh_CN and zh_TW first, then the weekday (%A) additions that
	# apply to every locale.
	my $fmt = shift;
	my $locale = join("_", $callback{data}{l}, $callback{data}{c});
	my %uses_ampm = map { ($_ => 1) } qw(ko_KR zh_CN zh_TW);

	for ($fmt) {
		if ($locale eq 'ja_JP') {
			s/(> )(%H)/$1%A $2/;
		}
		elsif ($uses_ampm{$locale}) {
			# ko_KR keeps %m as it is
			s/%m/%_m/ unless ($locale eq 'ko_KR');
			s/(> )(%p)/$1%A $2/;
		}
		s/\.,/\./;
		s/^"%e\./%A %e/;
		s/^"(%B %e, )/"%A, $1/;
		s/^"(%e %B )/"%A $1/;
	}
	return $fmt;
}
280
sub callback_mdorder {
	# Reduce a date format to the order of its "d" and "m" letters:
	# everything except d, e and m is removed and e is turned into d.
	# An undefined format is passed through as undef.
	my ($fmt) = @_;

	if (defined $fmt) {
		$fmt =~ s/[^dem]//g;
		$fmt =~ s/e/d/g;
	}
	return $fmt;
}
288
sub callback_altmon {
	# Return the alternative month names registered for the current
	# language/country in %alternativemonths, with the whitespace around
	# each name removed.  Without such an entry the value passed in (the
	# "mon" value) is returned unchanged.
	my $months = shift;
	my $alt = $alternativemonths{$callback{data}{l}}{$callback{data}{c}};

	return $months if (!defined $alt);

	my @names = map {
		my $name = $_;
		$name =~ s/^\s+//;
		$name =~ s/\s+$//;
		$name;
	} split(";", $alt);
	return join(";", @names);
}
308
sub callback_abmon {
	# For the listed CJK locales, put a <space> in front of the short
	# month names that start with <two>..<nine>, or with <one> not
	# followed by <zero>, <one> or <two>, so that columns line up
	# (style established in FreeBSD in 2001).
	my $abmon = shift;
	my $locale = join("_", $callback{data}{l}, $callback{data}{c});
	my %padded = map { ($_ => 1) } qw(ja_JP ko_KR zh_CN zh_HK zh_TW);

	return $abmon if (!$padded{$locale});

	my @names;
	foreach my $name (split(";", $abmon)) {
		my $needs_pad =
		    $name =~ /^"<(two|three|four|five|six|seven|eight|nine)>/ ||
		    ($name =~ /^"<one>/ && $name !~ /^"<one>(<zero>|<one>|<two>)/);
		$name =~ s/^"/"<space>/ if ($needs_pad);
		push @names, $name;
	}
	return join(";", @names);
}
332
333############################
334
sub get_unidata {
	# Load UnicodeData.txt from the given directory into %ucd:
	#   $ucd{code2name}{<code>} = <name>
	#   $ucd{name2code}{<name>} = <code>
	# where <code> and <name> are the first two ";" separated fields of
	# each line.  Dies when the file cannot be opened.
	my $directory = shift;
	my $path = "$directory/UnicodeData.txt";

	open(my $fin, "<", $path)
	    or die("Cannot open $path: $!");
	while (my $l = <$fin>) {
		chomp($l);
		my @a = split(/;/, $l);
		# Ignore lines that do not carry both fields instead of
		# storing undef entries.
		next if (@a < 2);

		$ucd{code2name}{"$a[0]"} = $a[1];	# Unicode name
		$ucd{name2code}{"$a[1]"} = $a[0];	# Unicode code
	}
	close($fin);
}
351
sub get_utf8map {
	# Read the CHARMAP section of the given charmap file into %utf8map
	# (character name with "_" turned into " " => hex byte string with
	# the "\x" prefixes removed).  A name whose code equals the code of
	# the entry right before it is recorded in %utf8aliases as an alias
	# of that previous name.  Dies when the file cannot be opened.
	my $file = shift;

	open(my $fin, "<", $file)
	    or die("Cannot open $file: $!");
	my @lines = <$fin>;
	close($fin);
	chomp(@lines);

	my $prev_k = undef;
	my $prev_v = "";
	my $incharmap = 0;
	foreach my $l (@lines) {
		$l =~ s/\r//;
		next if ($l =~ /^\#/);
		next if ($l eq "");

		if ($l eq "CHARMAP") {
			$incharmap = 1;
			next;
		}

		next if (!$incharmap);
		last if ($l eq "END CHARMAP");

		# Skip lines that are not "<NAME> code"; without this check
		# the captures of the previous line would be reused.
		next unless ($l =~ /^<([^\s]+)>\s+(.*)/);
		my ($k, $v) = ($1, $2);
		$k =~ s/_/ /g;		# unicode char string
		$v =~ s/\\x//g;		# UTF-8 char code
		$utf8map{$k} = $v;

		$utf8aliases{$k} = $prev_k if ($prev_v eq $v);

		$prev_v = $v;
		$prev_k = $k;
	}
}
389
sub get_encodings {
	# For every encoding named in %encodings, read "$dir/<encoding>.TXT"
	# and fill $convertors{<encoding>}{<unicode code>} = <local code>
	# (both upper-cased hex without the 0x prefix).  Encodings whose
	# file cannot be opened are reported and skipped.
	my $dir = shift;
	foreach my $e (sort(keys(%encodings))) {
		my $fin;
		if (!open($fin, "<", "$dir/$e.TXT")) {
			print "Cannot open charmap for $e\n";
			next;
		}
		$encodings{$e} = 1;
		my @lines = <$fin>;
		close($fin);
		chomp(@lines);
		foreach my $l (@lines) {
			$l =~ s/\r//;
			next if ($l =~ /^\#/);
			next if ($l eq "");

			my @a = split(" ", $l);
			next if ($#a < 1);
			$a[0] =~ s/^0[xX]//;	# local char code
			$a[1] =~ s/^0[xX]//;	# unicode char code
			$convertors{$e}{uc($a[1])} = uc($a[0]);
		}
	}
}
415
sub get_languages {
	# Fetch the XML data for $ETCDIR and copy its L, T, AM and E
	# sections into the corresponding file-level hashes.
	my %xml = get_xmldata($ETCDIR);
	my @sections = (
		[ "L",  \%languages ],
		[ "T",  \%translations ],
		[ "AM", \%alternativemonths ],
		[ "E",  \%encodings ],
	);

	foreach my $section (@sections) {
		my ($name, $target) = @{$section};
		%{$target} = %{$xml{$name}};
	}
}
423
sub transform_ctypes {
	# Write the LC_CTYPE sources into "$TYPE.draft".
	#
	# For each language/family/country the common UTF-8 source
	# (xx_Comm_C.UTF-8.src) is copied for the default encoding.  For
	# every other encoding the comment_char/escape_char lines and the
	# LC_CTYPE section of the locale's own source are written under a
	# "do not edit" header.  The SHA-1 of what was written is stored in
	# %languages and %hashtable, which make_makefile() reads.
	#
	# Dies when a file cannot be opened or written.

	# Add the C.UTF-8
	$languages{"C"}{"x"}{data}{"x"}{$DEFENCODING} = undef;

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file = $l;
		$file .= "_" . $f if ($f ne "x");
		$file .= "_" . $c if ($c ne "x");
		my $actfile = $file;

		my $filename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
		if (! -f $filename) {
			print STDERR "Cannot open $filename\n";
			next;
		}
		open(my $fin, "<", $filename)
		    or die("Cannot open $filename: $!");
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines = <$fin>;
		close($fin);
		my $shex = sha1_hex(join("\n", @lines));
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
		$hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;

		my $outname = "$TYPE.draft/$actfile.$DEFENCODING.src";
		open(my $fout, ">", $outname)
		    or die("Cannot write $outname: $!");
		print $fout @lines;
		close($fout)
		    or die("Cannot write $outname: $!");

		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			next if ($enc eq $DEFENCODING);
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
			if ($file eq 'ja_JP') {
				# Override $filename for ja_JP because
				# its CTYPE is not compatible with UTF-8.
				$filename = "$UNIDIR/posix/$file.eucJP.src";
			}
			if (! -f $filename) {
				print STDERR "Cannot open $filename\n";
				next;
			}
			@lines = ();
			open(my $encin, "<", $filename)
			    or die("Cannot open $filename: $!");
			while (<$encin>) {
				if ((/^comment_char\s/) || (/^escape_char\s/)){
					push @lines, $_;
				}
				if (/^LC_CTYPE/../^END LC_CTYPE/) {
					push @lines, $_;
				}
			}
			close($encin);
			# The encoding name is part of the digest, so equal
			# content in different encodings hashes differently.
			my $uhex = sha1_hex(join("\n", @lines) . $enc);
			$languages{$l}{$f}{data}{$c}{$enc} = $uhex;
			$hashtable{$uhex}{"${l}_${f}_${c}.$enc"} = 1;

			my $encout = "$TYPE.draft/$actfile.$enc.src";
			open(my $eout, ">", $encout)
			    or die("Cannot write $encout: $!");
			print $eout <<EOF;
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
			print $eout @lines;
			close($eout)
			    or die("Cannot write $encout: $!");
		}
	}
	}
	}
}
500
501
sub transform_collation {
	# Write the LC_COLLATE sources into "$TYPE.draft".
	#
	# For each language/family/country the UTF-8 source is looked up in
	# "$UNIDIR/posix", then in $ETCDIR, then under the configured
	# fallback name.  Its comment_char/escape_char lines and LC_COLLATE
	# section (runs of spaces squeezed to one) are written under a
	# "do not edit" header, and that file is copied for every other
	# encoding.  The SHA-1 of the extracted lines is stored in
	# %languages and %hashtable, which make_makefile() reads.
	#
	# Dies when a file cannot be opened, written or copied.
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file;
		$file = $l . "_";
		$file .= $f . "_" if ($f ne "x");
		$file .= $c;
		my $actfile = $file;

		my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		$filename = "$ETCDIR/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename
		 && defined $languages{$l}{$f}{fallback}) {
			$file = $languages{$l}{$f}{fallback};
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		}
		$filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename) {
			print STDERR
			    "Cannot open $file.$DEFENCODING.src or fallback\n";
			next;
		}
		open(my $fin, "<", $filename)
		    or die("Cannot open $filename: $!");
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines;
		while (<$fin>) {
			if ((/^comment_char\s/) || (/^escape_char\s/)){
				push @lines, $_;
			}
			if (/^LC_COLLATE/../^END LC_COLLATE/) {
				$_ =~ s/[ ]+/ /g;
				push @lines, $_;
			}
		}
		close($fin);
		my $shex = sha1_hex(join("\n", @lines));
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
		$hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;

		my $outname = "$TYPE.draft/$actfile.$DEFENCODING.src";
		open(my $fout, ">", $outname)
		    or die("Cannot write $outname: $!");
		print $fout <<EOF;
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
		print $fout @lines;
		close($fout)
		    or die("Cannot write $outname: $!");

		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			next if ($enc eq $DEFENCODING);
			my $encname = "$TYPE.draft/$actfile.$enc.src";
			copy ($outname, $encname)
			    or die("Cannot copy $outname to $encname: $!");
			$languages{$l}{$f}{data}{$c}{$enc} = $shex;
			$hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1;
		}
	}
	}
	}
}
569
sub get_fields {
	# Read the raw value of every field listed in %keys from the UTF-8
	# source of each language/family/country into
	# $values{$l}{$f}{$c}{<field>}.
	#
	# A value starts on the line that begins with the field name and
	# continues over following lines while a line ends in "/".  Inside
	# <...> tokens every "_" is replaced by a space.
	#
	# Dies when a source cannot be opened or when a "_" is left in the
	# data after that replacement.
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);

		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file;
		$file = $l . "_";
		$file .= $f . "_" if ($f ne "x");
		$file .= $c;

		my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		$filename = "$ETCDIR/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename
		 && defined $languages{$l}{$f}{fallback}) {
			$file = $languages{$l}{$f}{fallback};
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		}
		$filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename) {
			print STDERR
			    "Cannot open $file.$DEFENCODING.src or fallback\n";
			next;
		}
		open(my $fin, "<", $filename)
		    or die("Cannot open $filename: $!");
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines = <$fin>;
		chomp(@lines);
		close($fin);
		foreach my $k (keys(%keys)) {
			# Reset per field, so that a continuation left open
			# at the end of the file cannot leak into the next
			# field.
			my $continue = 0;
			foreach my $line (@lines) {
				$line =~ s/\r//;
				next if (!$continue && $line !~ /^$k\s/);
				if ($continue) {
					$line =~ s/^\s+//;
				} else {
					$line =~ s/^$k\s+//;
				}

				$values{$l}{$f}{$c}{$k} = ""
					if (!defined $values{$l}{$f}{$c}{$k});

				$continue = ($line =~ /\/$/);
				$line =~ s/\/$// if ($continue);

				# Replace until nothing matches any more.
				# Looping on "line contains _" instead never
				# ended for a "_" outside <...> and made the
				# die below unreachable.
				1 while ($line =~
				    s/\<([^>_]+)_([^>]+)\>/<$1 $2>/);
				die "_ in data - $line" if ($line =~ /_/);
				$values{$l}{$f}{$c}{$k} .= $line;

				last if (!$continue);
			}
		}
	}
	}
	}
}
635
sub decodecldr {
	# Convert the character name $s to its byte sequence in encoding $e.
	#
	# For UTF-8 the hex code is taken straight from %utf8map.  For other
	# encodings the name is mapped to a Unicode code (%ucd, via
	# %utf8aliases if needed, or %translations) and then to the local
	# code through %convertors; %translations may also supply the hex
	# code directly or name a substitute Unicode character.
	#
	# Returns the bytes as a string, one character per byte.  Dies when
	# no conversion is known.  A hex code of unexpected length is
	# reported on STDERR and "length = <n>" is returned instead.
	my $e = shift;
	my $s = shift;

	my $v = undef;

	if ($e eq "UTF-8") {
		#
		# Conversion to UTF-8 can be done from the Unicode name to
		# the UTF-8 character code.
		#
		$v = $utf8map{$s};
		die "Cannot convert $s in $e (charmap)" if (!defined $v);
	} else {
		#
		# Conversion to these encodings can be done from the Unicode
		# name to Unicode code to the encodings code.
		#
		my $ucc = undef;
		$ucc = $ucd{name2code}{$s} if (defined $ucd{name2code}{$s});
		$ucc = $ucd{name2code}{$utf8aliases{$s}}
			if (!defined $ucc
			 && $utf8aliases{$s}
			 && defined $ucd{name2code}{$utf8aliases{$s}});

		if (!defined $ucc) {
			if (defined $translations{$e}{$s}{hex}) {
				$v = $translations{$e}{$s}{hex};
				$ucc = 0;
			} elsif (defined $translations{$e}{$s}{ucc}) {
				$ucc = $translations{$e}{$s}{ucc};
			}
		}

		die "Cannot convert $s in $e (ucd string)" if (!defined $ucc);
		$v = $convertors{$e}{$ucc} if (!defined $v);

		$v = $translations{$e}{$s}{hex}
			if (!defined $v && defined $translations{$e}{$s}{hex});

		if (!defined $v && defined $translations{$e}{$s}{unicode}) {
			my $ucn = $translations{$e}{$s}{unicode};
			$ucc = $ucd{name2code}{$ucn}
				if (defined $ucd{name2code}{$ucn});
			$ucc = $ucd{name2code}{$utf8aliases{$ucn}}
				if (!defined $ucc
				 && defined $ucd{name2code}{$utf8aliases{$ucn}});
			$v = $convertors{$e}{$ucc};
		}

		die "Cannot convert $s in $e (charmap)" if (!defined $v);
	}

	# Pack one byte per pair of hex digits.  Codes of one to four bytes
	# are accepted; four bytes are needed for UTF-8 sequences outside
	# the Basic Multilingual Plane.
	my $len = length($v);
	if ($len >= 2 && $len <= 8 && $len % 2 == 0) {
		return pack("C*",
		    map { hex(substr($v, 2 * $_, 2)) } 0 .. ($len / 2 - 1));
	}
	print STDERR "Cannot convert $e $s\n";
	return "length = " . length($v);

}
698
sub translate {
	# Look up $translations{<encoding>}{<name>}; undef when there is no
	# defined entry.
	my ($enc, $v) = @_;

	my $entry = $translations{$enc}{$v};
	return defined($entry) ? $entry : undef;
}
706
sub print_fields {
	# Write one definition file per language/family/country and encoding
	# into "$TYPE.draft", using the values collected in %values and the
	# field formats in %keys.
	#
	# The file is written as "<locale>.<encoding>.new" and renamed to
	# ".src" on success or ".failed" when a character could not be
	# converted.  The SHA-1 of the generated text is stored in %languages
	# and %hashtable, which make_makefile() reads.
	#
	# Dies on an unknown field or format and when the output file cannot
	# be written or renamed.
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			if ($languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
				print "Skipping ${l}_" .
				    ($f eq "x" ? "" : "${f}_") .
				    "${c} - not read\n";
				next;
			}
			my $file = $l;
			$file .= "_" . $f if ($f ne "x");
			$file .= "_" . $c;
			print "Writing to $file in $enc\n";

			if ($enc ne $DEFENCODING &&
			    !defined $convertors{$enc}) {
				print "Failed! Cannot convert to $enc.\n";
				next;
			};

			my $draft = "$TYPE.draft/$file.$enc";
			open(my $fout, ">", "$draft.new")
			    or die("Cannot write $draft.new: $!");
			my $okay = 1;
			my $output = "";
			print $fout <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
			foreach my $k (keys(%keys)) {
				my $g = $keys{$k};

				die("Unknown $k in \%DESC")
					if (!defined $DESC{$k});

				$output .= "#\n# $DESC{$k}\n";

				# Replace one row with another
				if ($g =~ /^>/) {
					$k = substr($g, 1);
					$g = $keys{$k};
				}

				# Callback function: "<callback<source key<type"
				if ($g =~ /^\</) {
					$callback{data}{c} = $c;
					$callback{data}{k} = $k;
					$callback{data}{f} = $f;
					$callback{data}{l} = $l;
					$callback{data}{e} = $enc;
					my @a = split(/\</, substr($g, 1));
					my $rv =
					    $callback{$a[0]}->($values{$l}{$f}{$c}{$a[1]});
					$values{$l}{$f}{$c}{$k} = $rv;
					$g = $a[2];
					$callback{data} = ();
				}

				my $v = $values{$l}{$f}{$c}{$k};
				$v = "undef" if (!defined $v);

				if ($g eq "i") {
					$output .= "$v\n";
					next;
				}
				if ($g eq "ai") {
					$output .= "$v\n";
					next;
				}
				if ($g eq "s") {
					$v =~ s/^"//;
					$v =~ s/"$//;
					my $cm = "";
					while ($v =~ /^(.*?)<(.*?)>(.*)/) {
						my $p1 = $1;
						$cm = $2;
						my $p3 = $3;

						my $rv = decodecldr($enc, $cm);
#						$rv = translate($enc, $cm)
#							if (!defined $rv);
						if (!defined $rv) {
							print STDERR
							    "Could not convert $k ($cm) from $DEFENCODING to $enc\n";
							$okay = 0;
							# $v is unchanged, so
							# trying again would
							# never terminate.
							last;
						}

						$v = $p1 . $rv . $p3;
					}
					$output .= "$v\n";
					next;
				}
				if ($g eq "as") {
					foreach my $item (split(/;/, $v)) {
						$item =~ s/^"//;
						$item =~ s/"$//;
						my $cm = "";
						while ($item =~ /^(.*?)<(.*?)>(.*)/) {
							my $p1 = $1;
							$cm = $2;
							my $p3 = $3;

							my $rv =
							    decodecldr($enc,
								$cm);
#							$rv = translate($enc,
#							    $cm)
#							    if (!defined $rv);
							if (!defined $rv) {
								print STDERR
								    "Could not convert $k ($cm) from $DEFENCODING to $enc\n";
								$okay = 0;
								# See the "s" case.
								last;
							}

							# Use the saved copies
							# rather than $1/$3
							# after a sub call.
							$item = $p1 . $rv . $p3;
						}
						$output .= "$item\n";
					}
					next;
				}

				die("$k is '$g'");

			}

			$languages{$l}{$f}{data}{$c}{$enc} = sha1_hex($output);
			$hashtable{sha1_hex($output)}{"${l}_${f}_${c}.$enc"} = 1;
			print $fout "$output# EOF\n";
			close($fout)
			    or die("Cannot write $draft.new: $!");

			my $final = $okay ? "$draft.src" : "$draft.failed";
			rename("$draft.new", $final)
			    or die("Cannot rename $draft.new to $final: $!");
		}
	}
	}
	}
}
855
sub make_makefile {
	# Write "$TYPE.draft/Makefile" for the files generated earlier.
	#
	# The Makefile contains the suffix rule for the type, one
	# "SAME+= <kept> <duplicate>" line for every locale whose content
	# hash (see %hashtable) equals that of another locale, one
	# "LOCALES+=" line for every locale that is kept, and the legacy
	# links configured through nc_link and e_link.
	#
	# Within a group of identical files the sort order decides which
	# one is kept: it is the first element after sorting.  The entries
	# of the others are undefined in %languages so that they are left
	# out of LOCALES.
	#
	# Dies when the Makefile cannot be written.
	print "Creating Makefile for $TYPE\n";
	my $SRCOUT;
	my $SRCOUT2;
	my $SRCOUT3 = "";
	my $SRCOUT4 = "";
	my $MAPLOC;
	if ($TYPE eq "colldef") {
		$SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U " .
			"-i \${.IMPSRC} \\\n" .
			"\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} " .
			"\${.OBJDIR}/\${.IMPSRC:T:R}";
		$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
				"locale/etc/final-maps\n";
		$SRCOUT2 = "LC_COLLATE";
		$SRCOUT3 = "" .
			".for f t in \${LOCALES_MAPPED}\n" .
			"FILES+=\t\$t.LC_COLLATE\n" .
			"FILESDIR_\$t.LC_COLLATE=\t\${LOCALEDIR}/\$t\n" .
			"\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" .
			"\tlocaledef \${LOCALEDEF_ENDIAN} -D -U " .
			"-i \${.ALLSRC} \\\n" .
			"\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} \\\n" .
			"\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" .
			".endfor\n\n";
		$SRCOUT4 = "## LOCALES_MAPPED\n";
	}
	elsif ($TYPE eq "ctypedef") {
		$SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U -c " .
			"-w \${MAPLOC}/widths.txt \\\n" .
			"\t-f \${MAPLOC}/map.\${.IMPSRC:T:R:E} " .
			"\\\n\t-i \${.IMPSRC} \${.OBJDIR}/\${.IMPSRC:T:R} " .
			" || true";
		$SRCOUT2 = "LC_CTYPE";
		$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
				"locale/etc/final-maps\n";
		$SRCOUT3 = "## SYMPAIRS\n\n" .
			".for s t in \${SYMPAIRS}\n" .
			"\${t:S/src\$/LC_CTYPE/}: " .
			"\$s\n" .
			"\tlocaledef \${LOCALEDEF_ENDIAN} -D -U -c " .
			"-w \${MAPLOC}/widths.txt \\\n" .
			"\t-f \${MAPLOC}/map.\${.TARGET:T:R:C/^.*\\.//} " .
			"\\\n\t-i \${.ALLSRC} \${.OBJDIR}/\${.TARGET:T:R} " .
			" || true\n" .
			".endfor\n\n";
	}
	else {
		$SRCOUT = "grep -v -E '^(\#\$\$|\#[ ])' < \${.IMPSRC} > \${.TARGET}";
		$SRCOUT2 = "out";
		$MAPLOC = "";
	}
	my $makefile = "$TYPE.draft/Makefile";
	open(my $fout, ">", $makefile)
	    or die("Cannot write $makefile: $!");
	print $fout <<EOF;
# \$FreeBSD\$
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale.

LOCALEDIR=	\${SHAREDIR}/locale
FILESNAME=	$FILESNAMES{$TYPE}
.SUFFIXES:	.src .${SRCOUT2}
${MAPLOC}
EOF

	if ($TYPE eq "colldef" || $TYPE eq "ctypedef") {
		print $fout <<EOF;
.include <bsd.endian.mk>

EOF
	}

	print $fout <<EOF;
.src.${SRCOUT2}:
	$SRCOUT

## PLACEHOLDER

${SRCOUT4}

EOF

	foreach my $hash (keys(%hashtable)) {
		# For colldef, weight LOCALES to UTF-8
		#     Sort as upper-case and reverse to achieve it
		#     Make en_US, ru_RU, and ca_AD preferred
		my @files;
		if ($TYPE eq "colldef") {
			@files = sort {
				if ($a eq 'en_x_US.UTF-8' ||
				    $a eq 'ru_x_RU.UTF-8' ||
				    $a eq 'ca_x_AD.UTF-8') { return -1; }
				elsif ($b eq 'en_x_US.UTF-8' ||
				       $b eq 'ru_x_RU.UTF-8' ||
				       $b eq 'ca_x_AD.UTF-8') { return 1; }
				else { return uc($b) cmp uc($a); }
				} keys(%{$hashtable{$hash}});
		} elsif ($TYPE eq "ctypedef") {
			@files = sort {
				if ($a eq 'C_x_x.UTF-8') { return -1; }
				elsif ($b eq 'C_x_x.UTF-8') { return 1; }
				if ($a =~ /^en_x_US/) { return -1; }
				elsif ($b =~ /^en_x_US/) { return 1; }

				if ($a =~ /^en_x_GB.ISO8859-15/ ||
				    $a =~ /^ru_x_RU/) { return -1; }
				elsif ($b =~ /^en_x_GB.ISO8859-15/ ||
				       $b =~ /ru_x_RU/) { return 1; }
				else { return uc($b) cmp uc($a); }

				} keys(%{$hashtable{$hash}});
		} else {
			@files = sort {
				if ($a =~ /_Comm_/ ||
				    $b eq 'en_x_US.UTF-8') { return 1; }
				elsif ($b =~ /_Comm_/ ||
				       $a eq 'en_x_US.UTF-8') { return -1; }
				else { return uc($b) cmp uc($a); }
				} keys(%{$hashtable{$hash}});
		}
		if ($#files > 0) {
			# The first file is kept, the others link to it.
			my $link = shift(@files);
			$link =~ s/_x_x//;	# special case for C
			$link =~ s/_x_/_/;	# strip family if none there
			foreach my $file (@files) {
				# "<l>_<f>_<c>.<enc>" => (l, f, "<c>.<enc>")
				my @a = split(/_/, $file);
				my @b = split(/\./, $a[-1]);
				$file =~ s/_x_/_/;
				print $fout "SAME+=\t\t$link $file\n";
				undef($languages{$a[0]}{$a[1]}{data}{$b[0]}{$b[1]});
			}
		}
	}

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		if (defined $languages{$l}{$f}{data}{$c}{$DEFENCODING}
		 && $languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
			print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") .
			    "${c} - not read\n";
			next;
		}
		foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			my $file = $l;
			$file .= "_" . $f if ($f ne "x");
			$file .= "_" . $c if ($c ne "x");
			# Undefined entries were turned into SAME links above.
			next if (!defined $languages{$l}{$f}{data}{$c}{$e});
			print $fout "LOCALES+=\t$file.$e\n";
		}

		if (defined $languages{$l}{$f}{nc_link}) {
			foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;
				print $fout "SAME+=\t\t$file.$e $languages{$l}{$f}{nc_link}.$e\t# legacy (lang/country change)\n";
			}
		}

		if (defined $languages{$l}{$f}{e_link}) {
			foreach my $el (split(" ", $languages{$l}{$f}{e_link})) {
				my @a = split(/:/, $el);
				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;
				print $fout "SAME+=\t\t$file.$a[0] $file.$a[1]\t# legacy (same charset)\n";
			}
		}

	}
	}
	}

	print $fout <<EOF;

FILES=		\${LOCALES:S/\$/.${SRCOUT2}/}
CLEANFILES=	\${FILES}

.for f t in \${SAME}
SYMLINKS+=	../\$f/\${FILESNAME} \\
    \${LOCALEDIR}/\$t/\${FILESNAME}
.endfor

.for f in \${LOCALES}
FILESDIR_\${f}.${SRCOUT2}= \${LOCALEDIR}/\${f}
.endfor

${SRCOUT3}.include <bsd.prog.mk>
EOF

	close($fout)
	    or die("Cannot write $makefile: $!");
}
1050