xref: /freebsd/tools/tools/locale/tools/cldr2def.pl (revision 1d386b48)
1#!/usr/local/bin/perl -wC
2
3# SPDX-License-Identifier: BSD-2-Clause
4#
5# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
6# Copyright 2015 John Marino <draco@marino.st>
7# Copyright 2020 Hiroki Sato <hrs@FreeBSD.org>
8#
9# Redistribution and use in source and binary forms, with or without
10# modification, are permitted provided that the following conditions
11# are met:
12# 1. Redistributions of source code must retain the above copyright
13#    notice, this list of conditions and the following disclaimer.
14# 2. Redistributions in binary form must reproduce the above copyright
15#    notice, this list of conditions and the following disclaimer in the
16#    documentation and/or other materials provided with the distribution.
17#
18# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28# SUCH DAMAGE.
29#
30# $FreeBSD$
31
32use strict;
33use File::Copy;
34use XML::Parser;
35use Tie::IxHash;
36use Text::Iconv;
37#use Data::Dumper;
38use Getopt::Long;
39use Digest::SHA qw(sha1_hex);
40require "charmaps.pm";
41
# Command-line synopsis, printed whenever the invocation is unusable.
my $USAGE = "Usage: $0 --unidir=<unidir> --etc=<etcdir> --type=<type>\n";

# Cheap early check: fewer than three arguments cannot carry all three
# mandatory options.
if ($#ARGV < 2) {
	print $USAGE;
	exit(1);
}

# Encoding whose .src/.cm files are read first; every other encoding is
# derived from it.
my $DEFENCODING = "UTF-8";

# Values of the --unidir, --etc and --type options.
my $UNIDIR = undef;
my $ETCDIR = undef;
my $TYPE = undef;

# Filled in by transform_collation() and written to the colldef Makefile.
my $CLDR_VERSION = undef;

my $result = GetOptions (
		"unidir=s"	=> \$UNIDIR,
		"etc=s"		=> \$ETCDIR,
		"type=s"	=> \$TYPE,
	    );

# The argument count alone proves nothing: reject unparsable options and
# any invocation that leaves one of the three mandatory values unset, since
# all of them are interpolated into paths and comparisons below.
if (!$result || !defined $UNIDIR || !defined $ETCDIR || !defined $TYPE) {
	print $USAGE;
	exit(1);
}
60
# File-scoped state shared by the subs below.
my %convertors;
my (%ucd, %values, %hashtable);
my (%languages, %translations, %alternativemonths);
get_languages();

# Character name => hex code tables, one per charmap file.
my %utfmap = (
	'UTF-8'  => {},
	'UTF-32' => {},
);
get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'});
get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'});

# Both hashes are tied to Tie::IxHash so that they iterate in insertion
# order.
my %keys;
tie %keys, "Tie::IxHash";
tie %hashtable, "Tie::IxHash";
80
# Maps each supported --type to the value written as FILESNAME in the
# generated Makefile.
my %FILESNAMES = map { $_->[0] => "LC_" . $_->[1] } (
	[ monetdef   => "MONETARY" ],
	[ timedef    => "TIME" ],
	[ msgdef     => "MESSAGES" ],
	[ numericdef => "NUMERIC" ],
	[ colldef    => "COLLATE" ],
	[ ctypedef   => "CTYPE" ],
);
89
# Dispatch table for the "<name<source<type" field specifications used by
# print_fields().  The "data" slot carries the locale currently being
# processed (keys c, k, f, l, e) while a callback runs.
my %callback = (data => undef);
$callback{mdorder}  = \&callback_mdorder;
$callback{altmon}   = \&callback_altmon;
$callback{cformat}  = \&callback_cformat;
$callback{dformat}  = \&callback_dformat;
$callback{dtformat} = \&callback_dtformat;
$callback{cbabmon}  = \&callback_abmon;
$callback{cbampm}   = \&callback_ampm;
100
# Description written as a "# ..." comment above each field by
# print_fields().
my %DESC = (
	# Fields that are described by their own name.
	(map { $_ => $_ } qw(
		decimal_point thousands_sep grouping
		currency_symbol mon_decimal_point mon_thousands_sep
		mon_grouping positive_sign negative_sign
		int_frac_digits frac_digits
		p_cs_precedes p_sep_by_space
		n_cs_precedes n_sep_by_space
		p_sign_posn n_sign_posn
		yesexpr noexpr yesstr nostr
		c_fmt md_order
	)),

	# monetdef
	int_curr_symbol	=> "int_curr_symbol (last character always SPACE)",

	# timedef
	abmon		=> "Short month names",
	mon		=> "Long month names (as in a date)",
	abday		=> "Short weekday names",
	day		=> "Long weekday names",
	t_fmt		=> "X_fmt",
	d_fmt		=> "x_fmt",
	am_pm		=> "AM/PM",
	d_t_fmt		=> "date_fmt",
	altmon		=> "Long month names (without case ending)",
	t_fmt_ampm	=> "ampm_fmt",
);
146
# Main dispatch on --type.
#
# colldef and ctypedef copy and filter whole sections of the .src files.
# The remaining types are table driven: %keys lists the fields to emit, in
# output order (the hash is tied to Tie::IxHash), together with a type code
# interpreted by print_fields():
#   s   string            as  ";"-separated list of strings
#   i   integer           ai  list of integers
#   <callback<source<type  pass field "source" through $callback{callback}
#                          and then treat the result as "type"
if ($TYPE eq "colldef") {
	transform_collation();
	make_makefile();
} elsif ($TYPE eq "ctypedef") {
	transform_ctypes();
	make_makefile();
} elsif ($TYPE eq "numericdef") {
	%keys = (
	    "decimal_point"	=> "s",
	    "thousands_sep"	=> "s",
	    "grouping"		=> "ai",
	);
	get_fields();
	print_fields();
	make_makefile();
} elsif ($TYPE eq "monetdef") {
	%keys = (
	    "int_curr_symbol"	=> "s",
	    "currency_symbol"	=> "s",
	    "mon_decimal_point"	=> "s",
	    "mon_thousands_sep"	=> "s",
	    "mon_grouping"	=> "ai",
	    "positive_sign"	=> "s",
	    "negative_sign"	=> "s",
	    "int_frac_digits"	=> "i",
	    "frac_digits"	=> "i",
	    "p_cs_precedes"	=> "i",
	    "p_sep_by_space"	=> "i",
	    "n_cs_precedes"	=> "i",
	    "n_sep_by_space"	=> "i",
	    "p_sign_posn"	=> "i",
	    "n_sign_posn"	=> "i"
	);
	get_fields();
	print_fields();
	make_makefile();
} elsif ($TYPE eq "msgdef") {
	%keys = (
	    "yesexpr"		=> "s",
	    "noexpr"		=> "s",
	    "yesstr"		=> "s",
	    "nostr"		=> "s"
	);
	get_fields();
	print_fields();
	make_makefile();
} elsif ($TYPE eq "timedef") {
	%keys = (
	    "abmon"		=> "<cbabmon<abmon<as",
	    "mon"		=> "as",
	    "abday"		=> "as",
	    "day"		=> "as",
	    "t_fmt"		=> "s",
	    "d_fmt"		=> "<dformat<d_fmt<s",
	    "c_fmt"		=> "<cformat<d_t_fmt<s",
	    "am_pm"		=> "<cbampm<am_pm<as",
	    "d_t_fmt"		=> "<dtformat<d_t_fmt<s",
	    "altmon"		=> "<altmon<mon<as",
	    "md_order"		=> "<mdorder<d_fmt<s",
	    "t_fmt_ampm"	=> "s",
	);
	get_fields();
	print_fields();
	make_makefile();
} else {
	# An unrecognised type used to fall through every test and exit
	# successfully without producing anything; report it instead.
	print STDERR "Unknown type '" . (defined $TYPE ? $TYPE : "") .
	    "'; expected one of: " . join(", ", sort keys(%FILESNAMES)) . "\n";
	exit(1);
}
222
sub callback_ampm {
	# Callback for am_pm: ru_RU gets a hard-wired value (converted from
	# UTF-8 with Text::Iconv for other encodings); every other locale
	# keeps the value it was called with.
	my ($ampm) = @_;
	my $locale = "$callback{data}{l}_$callback{data}{c}";
	return $ampm if ($locale ne 'ru_RU');

	my $enc = $callback{data}{e};
	return 'дп;пп' if ($enc eq 'UTF-8');

	return Text::Iconv->new("utf-8", "$enc")->convert("дп;пп");
}
238
sub callback_cformat {
	# Callback for c_fmt: derives the format from the string it is given
	# (d_t_fmt, per the %keys specification) by removing the time zone
	# and adding the weekday name.
	my ($fmt) = @_;
	my $locale = "$callback{data}{l}_$callback{data}{c}";

	# ko_KR: put the weekday in front of the AM/PM marker.
	$fmt =~ s/(> )(%p)/$1%A $2/ if ($locale eq 'ko_KR');

	for ($fmt) {
		s/\.,/\./;			# ".," collapses to "."
		s/ %Z//;			# drop the time zone name
		s/ %z//;			# drop the numeric time zone
		s/^"%e\./%A %e/;		# leading day-of-month forms get
		s/^"(%B %e, )/"%A, $1/;		# the weekday name prepended
		s/^"(%e %B )/"%A $1/;
	}
	return $fmt;
}
254
sub callback_dformat {
	# Callback for d_fmt: when the day of month sits directly next to a
	# numeric month, separated by <SOLIDUS>, "-" or ".", replace the
	# %e day with %d.  Only the first occurrence of each order is
	# rewritten.
	my ($fmt) = @_;
	my $sep = qr/<SOLIDUS>|[-.]/;

	$fmt =~ s/(%m(?:$sep))%e/$1%d/;		# month first
	$fmt =~ s/%e((?:$sep)%m)/%d$1/;		# day first
	return $fmt;
}
262
sub callback_dtformat {
	# Callback for d_t_fmt: applies per-locale tweaks for ja_JP, ko_KR,
	# zh_CN and zh_TW, then prepends the weekday name to formats that
	# start with the day of month or the month name.
	my ($fmt) = @_;
	my $locale = "$callback{data}{l}_$callback{data}{c}";

	# Conversion in front of which "%A " is inserted, per locale.
	my %weekday_before = (
		ja_JP => '%H',
		ko_KR => '%p',
		zh_CN => '%p',
		zh_TW => '%p',
	);

	# The two zh locales additionally switch the first %m to %_m.
	$fmt =~ s/%m/%_m/ if ($locale eq 'zh_CN' || $locale eq 'zh_TW');

	my $anchor = $weekday_before{$locale};
	$fmt =~ s/(> )($anchor)/$1%A $2/ if (defined $anchor);

	for ($fmt) {
		s/\.,/\./;
		s/^"%e\./%A %e/;
		s/^"(%B %e, )/"%A, $1/;
		s/^"(%e %B )/"%A $1/;
	}
	return $fmt;
}
281
sub callback_mdorder {
	# Callback for md_order: reduces a date format to the order of its
	# month and day letters.  Everything except "d", "e" and "m" is
	# deleted and "e" is then folded into "d".  Returns undef when no
	# format was supplied.
	my ($fmt) = @_;
	return undef if (!defined $fmt);

	$fmt =~ tr/dem//cd;	# keep only the letters d, e and m
	$fmt =~ tr/e/d/;
	return $fmt;
}
289
sub callback_altmon {
	# Callback for altmon: when %alternativemonths has an entry for the
	# current language/country, return it with surrounding whitespace
	# removed from each ";"-separated name; otherwise return the month
	# list unchanged.
	my ($mon) = @_;

	my $alt = $alternativemonths{$callback{data}{l}}{$callback{data}{c}};
	return $mon if (!defined $alt);

	my @trimmed = map {
		my $name = $_;
		$name =~ s/^\s+//;
		$name =~ s/\s+$//;
		$name;
	} split(";", $alt);
	return join(";", @trimmed);
}
309
sub callback_abmon {
	# Callback for abmon: in the listed CJK locales, month names that
	# begin with a single digit are padded with a leading <space> so that
	# columns line up (style established in FreeBSD in 2001).
	my ($abmon) = @_;
	my $locale = "$callback{data}{l}_$callback{data}{c}";

	my %padded = map { $_ => 1 } qw(ja_JP ko_KR zh_CN zh_HK zh_TW);
	return $abmon if (!$padded{$locale});

	my @names;
	foreach my $name (split(";", $abmon)) {
		# <two>..<nine> are always single-digit months; <one> is
		# single-digit unless followed by <zero>, <one> or <two>.
		if ($name =~ /^"<(?:two|three|four|five|six|seven|eight|nine)>/ ||
		    $name =~ /^"<one>(?!<zero>|<one>|<two>)/) {
			$name =~ s/^"/"<space>/;
		}
		push @names, $name;
	}
	return join(";", @names);
}
333
334############################
335
# Load the CHARMAP section of a charmap file into a hash.
#
#   $file - path of the charmap (.cm) file
#   $db   - hash reference that receives "character name => hex code";
#           the "\x" prefixes are removed, so "\xE2\x82\xAC" is stored as
#           "E282AC"
#
# Only lines between "CHARMAP" and "END CHARMAP" are considered.  Comment
# lines, empty lines and lines that do not have the "<name> value" shape are
# skipped.  If the file cannot be opened a warning is printed and $db is
# left untouched.
sub get_utfmap {
	my ($file, $db) = @_;

	my $fin;
	if (!open($fin, '<', $file)) {
		warn "Cannot open $file: $!\n";
		return;
	}

	my $incharmap = 0;
	while (my $l = <$fin>) {
		chomp($l);
		next if ($l =~ /^\#/);
		next if ($l eq "");

		if ($l eq "CHARMAP") {
			$incharmap = 1;
			next;
		}

		next if (!$incharmap);
		last if ($l eq "END CHARMAP");

		# Use the captures only after a successful match; otherwise
		# $1 and $2 do not belong to this line.
		next if ($l !~ /^<([^\s]+)>\s+(.*)/);
		my ($k, $v) = ($1, $2);
		$v =~ s/\\x//g;		# UTF-8 char code
		$db->{$k} = $v;
	}
	close($fin);
	return;
}
374
# Turn a "+"-separated list of hex codes into one hex string: each part
# loses a leading "0x"/"0X" and the parts are concatenated in order.
sub resolve_enc_addition {
	my @parts = map {
		my $code = $_;
		$code =~ s/^0[xX]//;
		$code;
	} split(/\+/, $_[0]);

	return join('', @parts);
}
384
# Populate %languages, %translations and %alternativemonths from the "L",
# "T" and "AM" entries returned by get_xmldata() for $ETCDIR.
sub get_languages {
	my %xml = get_xmldata($ETCDIR);
	my ($lang, $trans, $altmon) = @xml{qw(L T AM)};

	%languages = %$lang;
	%translations = %$trans;
	%alternativemonths = %$altmon;
}
391
# Generate the LC_CTYPE source files in "$TYPE.draft/".
#
# For every language/family/country in %languages (plus a synthetic "C"
# entry):
#   - the $DEFENCODING file is a verbatim copy of
#     $UNIDIR/posix/xx_Comm_C.UTF-8.src;
#   - each other encoding gets the comment_char/escape_char lines and the
#     LC_CTYPE section of the locale's own .src file (the eucJP one for
#     ja_JP), below a generated warning header.
# A SHA-1 of what was read is stored in %languages and %hashtable so that
# make_makefile() can link identical results.
#
# Dies if a file that passed the -f test cannot be opened, or if an output
# file cannot be written.
sub transform_ctypes {
	# Add the C.UTF-8
	$languages{"C"}{"x"}{data}{"x"}{$DEFENCODING} = undef;

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file = $l;
		$file .= "_" . $f if ($f ne "x");
		$file .= "_" . $c if ($c ne "x");
		my $actfile = $file;

		my $filename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
		if (! -f $filename) {
			print STDERR "Cannot open $filename\n";
			next;
		}
		open(my $fin, '<', $filename)
		    or die "Cannot open $filename: $!\n";
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines = <$fin>;
		close($fin);

		my $shex = sha1_hex(join("\n", @lines));
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
		$hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;

		my $outname = "$TYPE.draft/$actfile.$DEFENCODING.src";
		open(my $fout, '>', $outname)
		    or die "Cannot write $outname: $!\n";
		print {$fout} @lines;
		close($fout) or die "Cannot write $outname: $!\n";

		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			next if ($enc eq $DEFENCODING);
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
			if ($file eq 'ja_JP') {
				# Override $filename for ja_JP because
				# its CTYPE is not compatible with UTF-8.
				$filename = "$UNIDIR/posix/$file.eucJP.src";
			}
			if (! -f $filename) {
				print STDERR "Cannot open $filename\n";
				next;
			}
			@lines = ();
			open(my $src, '<', $filename)
			    or die "Cannot open $filename: $!\n";
			while (my $line = <$src>) {
				if (($line =~ /^comment_char\s/) ||
				    ($line =~ /^escape_char\s/)) {
					push @lines, $line;
				}
				# Range operator: true from the LC_CTYPE line
				# through the END LC_CTYPE line.
				if ($line =~ /^LC_CTYPE/ .. $line =~ /^END LC_CTYPE/) {
					push @lines, $line;
				}
			}
			close($src);

			# The encoding name is mixed into the hash so that
			# equal text in different encodings is not linked.
			my $uhex = sha1_hex(join("\n", @lines) . $enc);
			$languages{$l}{$f}{data}{$c}{$enc} = $uhex;
			$hashtable{$uhex}{"${l}_${f}_${c}.$enc"} = 1;

			$outname = "$TYPE.draft/$actfile.$enc.src";
			open(my $encout, '>', $outname)
			    or die "Cannot write $outname: $!\n";
			print {$encout} <<EOF;
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
			print {$encout} @lines;
			close($encout) or die "Cannot write $outname: $!\n";
		}
	}
	}
	}
}
468
469
# Generate the LC_COLLATE source files in "$TYPE.draft/".
#
# Reads the CLDR version from $UNIDIR/cldr-version into $CLDR_VERSION.  Then,
# for every language/family/country in %languages:
#   - the $DEFENCODING file receives the comment_char/escape_char lines and
#     the LC_COLLATE section of the locale's .src file (looked up in
#     $UNIDIR/posix, then $ETCDIR, then under the configured fallback name),
#     with runs of spaces collapsed;
#   - every other encoding receives a copy of that file from which order
#     entries and collating-elements that have no %convertors entry for the
#     encoding are left out.
# Hashes are recorded in %languages and %hashtable for make_makefile().
#
# Dies when the version file or a generated file cannot be opened, or when
# a symbol has no UTF-32 mapping.
sub transform_collation {
	# Read the CLDR version
	open(my $vfh, '<', "$UNIDIR/cldr-version")
	    or die "Cannot open cldr-version";
	read($vfh, $CLDR_VERSION, -s $vfh);
	close($vfh);
	$CLDR_VERSION =~ s/\s*$//;

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file;
		$file = $l . "_";
		$file .= $f . "_" if ($f ne "x");
		$file .= $c;
		my $actfile = $file;

		my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		$filename = "$ETCDIR/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename
		 && defined $languages{$l}{$f}{fallback}) {
			$file = $languages{$l}{$f}{fallback};
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		}
		$filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename) {
			print STDERR
			    "Cannot open $file.$DEFENCODING.src or fallback\n";
			next;
		}
		open(my $fin, '<', $filename)
		    or die "Cannot open $filename: $!\n";
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines;
		while (my $line = <$fin>) {
			if (($line =~ /^comment_char\s/) ||
			    ($line =~ /^escape_char\s/)) {
				push @lines, $line;
			}
			# Range operator: true from the LC_COLLATE line
			# through the END LC_COLLATE line.
			if ($line =~ /^LC_COLLATE/ .. $line =~ /^END LC_COLLATE/) {
				$line =~ s/[ ]+/ /g;
				push @lines, $line;
			}
		}
		close($fin);
		my $shex = sha1_hex(join("\n", @lines));
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
		$hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;

		my $defname = "$TYPE.draft/$actfile.$DEFENCODING.src";
		open(my $fout, '>', $defname)
		    or die "Cannot write $defname: $!\n";
		print {$fout} <<EOF;
# Warning: Do not edit. This file is automatically extracted from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
		print {$fout} @lines;
		close($fout) or die "Cannot write $defname: $!\n";

		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			next if ($enc eq $DEFENCODING);

			my $encname = "$TYPE.draft/$actfile.$enc.src";
			open(my $src, '<', $defname)
			    or die "Cannot open $defname: $!\n";
			open(my $dst, '>', $encname)
			    or die "Cannot write $encname: $!\n";
			my $order_start = 0;
			#
			# %c_elem: collation elements
			#
			#   undef: not defined
			#   1: defined
			#   2: invalid in this encoding
			#
			my %c_elem = ();
			while (my $line = <$src>) {	# XXX: this loop should be refactored.
				chomp($line);
				# Each line is copied unless one of the
				# checks below rejects it.
				my $print_p = 1;
				if ($order_start) {
					$order_start = 0 if ($line =~ m/^order_end/);
					if ($line =~ m/^<([^>]+)>/) {
						my $sym = $1;
						if (not defined $c_elem{$sym}) {
							# A plain character: keep it
							# only if the encoding has it.
							my $u32 = $utfmap{'UTF-32'}->{$sym};
							die "order, $sym\n" if (not defined $u32);
							if (not defined $convertors{$enc}{$u32}) {
								$print_p = 0;
							}
						} elsif ($c_elem{$sym} == 2) {
							# Collating element that was
							# rejected earlier.
							$print_p = 0;
						}
					}
				} elsif ($line =~ m/^collating-element/) {
					my ($elem, $from);
					if ($line =~ m/<([^>]+)> from (.+)/) {
						($elem, $from) = ($1, $2);
					}
					# The element is invalid as soon as one
					# of its components is missing.
					# NOTE(review): this tests truth where
					# the order branch tests definedness;
					# kept as found.
					while ($print_p and
					    defined $from and
					    $from =~ m/<([^>]+)>/g) {
						my $part = $1;
						my $u32 = $utfmap{'UTF-32'}->{$part};
						die "collating-element, $part\n" if (not defined $u32);
						if (not $convertors{$enc}{$u32}) {
							$print_p = 0;
							$c_elem{$elem} = 2;
						}
					}
					# Lines that did not parse are copied but
					# define no element.
					if ($print_p and defined $elem) {
						$c_elem{$elem} = 1;
					}
				} elsif ($line =~ m/^collating-symbol <([^>]+)>/) {
					$c_elem{$1} = 1;
				} elsif ($line =~ m/^order_start/) {
					$order_start = 1;
				}
				print {$dst} $line, "\n" if ($print_p);
			}
			close($dst) or die "Cannot write $encname: $!\n";
			close($src);
			# The hash of the $DEFENCODING text is reused for
			# every encoding.
			$languages{$l}{$f}{data}{$c}{$enc} = $shex;
			$hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1;
		}
	}
	}
	}
}
610
# Read the raw value of every field listed in %keys from each locale's
# $DEFENCODING .src file into $values{$l}{$f}{$c}{$field}.
#
# The file is looked up in $UNIDIR/posix, then $ETCDIR, then under the
# language's configured fallback name.  A value starts on the first line
# that begins with the field name followed by whitespace and continues over
# following lines for as long as a line ends in "/"; the pieces are
# concatenated without the "/" and without leading whitespace.
#
# $languages{...}{$DEFENCODING} is set to 0 for locales whose file was not
# found and to 1 for those that were read.  Dies if a file that passed the
# -f test cannot be opened.
sub get_fields {
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);

		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0;	# unread
		my $file;
		$file = $l . "_";
		$file .= $f . "_" if ($f ne "x");
		$file .= $c;

		my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		$filename = "$ETCDIR/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename
		 && defined $languages{$l}{$f}{fallback}) {
			$file = $languages{$l}{$f}{fallback};
			$filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
		}
		$filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
		    if (! -f $filename);
		if (! -f $filename) {
			print STDERR
			    "Cannot open $file.$DEFENCODING.src or fallback\n";
			next;
		}
		open(my $fin, '<', $filename)
		    or die "Cannot open $filename: $!\n";
		print "Reading from $filename for ${l}_${f}_${c}\n";
		$languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1;	# read
		my @lines = <$fin>;
		chomp(@lines);
		close($fin);

		foreach my $k (keys(%keys)) {
			# Reset for every field: a continuation that runs to
			# the end of the file must not make the next field
			# swallow the file from its first line.
			my $continue = 0;
			foreach my $raw (@lines) {
				# Work on a copy so the cached lines stay
				# intact for the remaining fields.
				my $line = $raw;
				$line =~ s/\r//;
				next if (!$continue && $line !~ /^\Q$k\E\s/);
				if ($continue) {
					$line =~ s/^\s+//;
				} else {
					$line =~ s/^\Q$k\E\s+//;
				}

				$values{$l}{$f}{$c}{$k} = ""
					if (!defined $values{$l}{$f}{$c}{$k});

				# A trailing "/" means the value goes on.
				$continue = ($line =~ /\/$/);
				$line =~ s/\/$// if ($continue);

				$values{$l}{$f}{$c}{$k} .= $line;

				last if (!$continue);
			}
		}
	}
	}
	}
}
676
# Convert a character name into the bytes that represent it in an encoding.
#
#   $e - target encoding name
#   $s - character name, i.e. the text between "<" and ">" in a .src value
#
# Returns a string of one to four bytes.  Dies when the name cannot be
# resolved, or when the resulting code is empty or longer than four bytes.
sub decodecldr {
	my $e = shift;
	my $s = shift;

	my $v = undef;

	if ($e eq "UTF-8") {
		#
		# Conversion to UTF-8 can be done from the Unicode name to
		# the UTF-8 character code.
		#
		$v = $utfmap{'UTF-8'}->{$s};
		die "Cannot convert $s in $e (charmap)" if (!defined $v);
	} else {
		#
		# Conversion to these encodings can be done from the Unicode
		# name to Unicode code to the encodings code.
		#
		# hex - hex or string attr
		# unicode - unicode attr
		# ucc - ucc attr
		my $hex = $translations{$e}{$s}{hex};
		my $ucc = $utfmap{'UTF-32'}->{$s};
		my $ucc_attr = $translations{$e}{$s}{ucc};
		my $unicode = $translations{$e}{$s}{unicode};

		# First source that is present wins, in this order.
		if (defined $hex) {		# hex is in local encoding
			$v = $hex;
		} elsif (defined $unicode) {	# unicode is in name
			$v = $convertors{$e}{$utfmap{'UTF-32'}->{$unicode}};
		} elsif (defined $ucc_attr) {	# ucc is in code point
			# An explicit ucc attribute overrides the UTF-32
			# charmap value.  Normalize to eight upper-case hex
			# digits, the form used to key %convertors here.
			$ucc_attr = sprintf("%08X", hex($ucc_attr));
			$v = $convertors{$e}{$ucc_attr};
		} elsif (defined $ucc) {
			# normalize
			$ucc = sprintf("%08X", hex($ucc));
			$v = $convertors{$e}{$ucc};
		}
		die "Cannot convert $s in $e" if (!defined $v);
	}

	# Drop leading zeros, then left-pad to a whole number of bytes.
	$v =~ s/^0+//;
	$v = "0" . $v if (length($v) % 2);

	# One to four bytes are supported; an all-zero code ends up empty
	# here and is rejected, as are longer codes.
	my $len = length($v);
	die "Cannot convert $s in $e (length = $len)\n"
		if ($len < 2 || $len > 8);

	# Two hex digits per output byte.
	return pack("C*", map { hex($_) } unpack("(a2)*", $v));
}
735
# Look up the %translations entry for character name $v in encoding $enc;
# returns undef when there is none.
sub translate {
	my ($enc, $v) = @_;

	my $entry = $translations{$enc}{$v};
	return defined $entry ? $entry : undef;
}
743
# Replace every <NAME> token in $text by the character decodecldr() returns
# for it in encoding $enc.
#
#   $enc      - target encoding
#   $k        - field name, used only in the diagnostic
#   $text     - value to expand
#   $okay_ref - reference to a flag that is cleared when a token cannot be
#               converted
#
# Returns the expanded text.  Expansion stops at the first token that cannot
# be converted; retrying the same token would never terminate.
sub _expand_charnames {
	my ($enc, $k, $text, $okay_ref) = @_;

	while ($text =~ /^(.*?)<(.*?)>(.*)/) {
		my ($p1, $cm, $p3) = ($1, $2, $3);

		my $rv = decodecldr($enc, $cm);
		if (!defined $rv) {
			print STDERR
"Could not convert $k ($cm) from $DEFENCODING to $enc\n";
			$$okay_ref = 0;
			last;
		}

		$text = $p1 . $rv . $p3;
	}
	return $text;
}

# Write "$TYPE.draft/<locale>.<encoding>.src" for every locale read by
# get_fields() and every encoding listed for it.
#
# For each field in %keys the description from %DESC is emitted as a
# comment, followed by the value: integers ("i", "ai") verbatim, strings
# ("s") and string lists ("as", one entry per line) with the surrounding
# quotes removed and <NAME> tokens converted to the target encoding.  A
# "<callback<source<type" specification first passes field "source" through
# the named callback.
#
# The SHA-1 of the generated text is recorded in %languages and %hashtable.
# The file is written as ".new" and renamed to ".src", or to ".failed" when
# a token could not be converted.  Dies on unknown field names or type codes
# and on I/O errors.
sub print_fields {
	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			my $state = $languages{$l}{$f}{data}{$c}{$DEFENCODING};
			if (defined $state && $state eq "0") {
				print "Skipping ${l}_" .
				    ($f eq "x" ? "" : "${f}_") .
				    "${c} - not read\n";
				next;
			}
			my $file = $l;
			$file .= "_" . $f if ($f ne "x");
			$file .= "_" . $c;
			print "Writing to $file in $enc\n";

			if ($enc ne $DEFENCODING &&
			    !defined $convertors{$enc}) {
				print "Failed! Cannot convert to $enc.\n";
				next;
			}

			my $newfile = "$TYPE.draft/$file.$enc.new";
			open(my $fout, '>', $newfile)
			    or die "Cannot write $newfile: $!\n";
			my $okay = 1;
			my $output = "";
			print {$fout} <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------
EOF
			foreach my $k (keys(%keys)) {
				my $g = $keys{$k};

				die("Unknown $k in \%DESC")
					if (!defined $DESC{$k});

				$output .= "#\n# $DESC{$k}\n";

				# Replace one row with another
				if ($g =~ /^>/) {
					$k = substr($g, 1);
					$g = $keys{$k};
				}

				# Callback function
				if ($g =~ /^\</) {
					$callback{data}{c} = $c;
					$callback{data}{k} = $k;
					$callback{data}{f} = $f;
					$callback{data}{l} = $l;
					$callback{data}{e} = $enc;
					# $a[0] callback, $a[1] source field,
					# $a[2] resulting type code
					my @a = split(/\</, substr($g, 1));
					my $rv =
					    $callback{$a[0]}->($values{$l}{$f}{$c}{$a[1]});
					# The result replaces the stored value,
					# so later encodings of this locale see
					# the transformed text.
					$values{$l}{$f}{$c}{$k} = $rv;
					$g = $a[2];
					$callback{data} = undef;
				}

				my $v = $values{$l}{$f}{$c}{$k};
				$v = "undef" if (!defined $v);

				if ($g eq "i" || $g eq "ai") {
					$output .= "$v\n";
					next;
				}
				if ($g eq "s") {
					$v =~ s/^"//;
					$v =~ s/"$//;
					$output .= _expand_charnames($enc, $k,
					    $v, \$okay) . "\n";
					next;
				}
				if ($g eq "as") {
					foreach my $item (split(/;/, $v)) {
						$item =~ s/^"//;
						$item =~ s/"$//;
						$output .= _expand_charnames($enc,
						    $k, $item, \$okay) . "\n";
					}
					next;
				}

				die("$k is '$g'");

			}

			my $digest = sha1_hex($output);
			$languages{$l}{$f}{data}{$c}{$enc} = $digest;
			$hashtable{$digest}{"${l}_${f}_${c}.$enc"} = 1;
			print {$fout} "$output# EOF\n";
			close($fout) or die "Cannot write $newfile: $!\n";

			my $final = "$TYPE.draft/$file.$enc." .
			    ($okay ? "src" : "failed");
			rename($newfile, $final)
			    or die "Cannot rename $newfile to $final: $!\n";
		}
	}
	}
	}
}
892
# Write "$TYPE.draft/Makefile" for the files generated for $TYPE.
#
# The Makefile contains:
#   - the suffix rule that turns a .src file into the installed file
#     (localedef for colldef/ctypedef, a comment-stripping grep otherwise);
#   - one "SAME+=" line per locale whose generated content equals that of
#     another one (grouped through %hashtable); the first name in a
#     type-specific preference order is kept as the link target, the others
#     are cleared in %languages so they are not listed again;
#   - one "LOCALES+=" line per remaining locale/encoding;
#   - "SAME+=" lines for the nc_link and e_link entries of %languages.
#
# Dies if the Makefile cannot be written.
sub make_makefile {
	print "Creating Makefile for $TYPE\n";
	my $SRCOUT;
	my $SRCOUT2;
	my $SRCOUT3 = "";
	my $SRCOUT4 = "";
	my $MAPLOC;
	if ($TYPE eq "colldef") {
		# In future, we might want to try to put the CLDR version into
		# the .src files with some new syntax, instead of the makefile.
		$SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U " .
			"-i \${.IMPSRC} \\\n" .
			"\t-V \${CLDR_VERSION} \\\n" .
			"\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} " .
			"\${.OBJDIR}/\${.IMPSRC:T:R}";
		$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
				"locale/etc/final-maps\n";
		$SRCOUT2 = "LC_COLLATE";
		$SRCOUT3 = "" .
			".for f t in \${LOCALES_MAPPED}\n" .
			"FILES+=\t\$t.LC_COLLATE\n" .
			"FILESDIR_\$t.LC_COLLATE=\t\${LOCALEDIR}/\$t\n" .
			"\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" .
			"\tlocaledef \${LOCALEDEF_ENDIAN} -D -U " .
			"-i \${.ALLSRC} \\\n" .
			"\t-V \${CLDR_VERSION} \\\n" .
			"\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} \\\n" .
			"\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" .
			".endfor\n\n";
		$SRCOUT4 = "## LOCALES_MAPPED\n";
	}
	elsif ($TYPE eq "ctypedef") {
		$SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U -c " .
			"-w \${MAPLOC}/widths.txt \\\n" .
			"\t-f \${MAPLOC}/map.\${.IMPSRC:T:R:E} " .
			"\\\n\t-i \${.IMPSRC} \${.OBJDIR}/\${.IMPSRC:T:R} " .
			" || true";
		$SRCOUT2 = "LC_CTYPE";
		$MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
				"locale/etc/final-maps\n";
		$SRCOUT3 = "## SYMPAIRS\n\n" .
			".for s t in \${SYMPAIRS}\n" .
			"\${t:S/src\$/LC_CTYPE/}: " .
			"\$s\n" .
			"\tlocaledef \${LOCALEDEF_ENDIAN} -D -U -c " .
			"-w \${MAPLOC}/widths.txt \\\n" .
			"\t-f \${MAPLOC}/map.\${.TARGET:T:R:C/^.*\\.//} " .
			"\\\n\t-i \${.ALLSRC} \${.OBJDIR}/\${.TARGET:T:R} " .
			" || true\n" .
			".endfor\n\n";
	}
	else {
		$SRCOUT = "grep -v -E '^(\#\$\$|\#[ ])' < \${.IMPSRC} > \${.TARGET}";
		$SRCOUT2 = "out";
		$MAPLOC = "";
	}

	my $makefile = "$TYPE.draft/Makefile";
	open(my $fout, '>', $makefile)
	    or die "Cannot write $makefile: $!\n";
	print {$fout} <<EOF;
# \$FreeBSD\$
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale.

PACKAGE=	locales
LOCALEDIR=	\${SHAREDIR}/locale
FILESNAME=	$FILESNAMES{$TYPE}
.SUFFIXES:	.src .${SRCOUT2}
${MAPLOC}
EOF

	if ($TYPE eq "colldef") {
		print {$fout} <<EOF;
CLDR_VERSION=	"${CLDR_VERSION}"

EOF
	}

	if ($TYPE eq "colldef" || $TYPE eq "ctypedef") {
		print {$fout} <<EOF;
.include <bsd.endian.mk>

EOF
	}

	print {$fout} <<EOF;
.src.${SRCOUT2}:
	$SRCOUT

## PLACEHOLDER

${SRCOUT4}

EOF

	# Every %hashtable entry groups the locales that produced identical
	# content.  The first name after sorting becomes the link target.
	foreach my $hash (keys(%hashtable)) {
		# For colldef, weight LOCALES to UTF-8
		#     Sort as upper-case and reverse to achieve it
		#     Make en_US, ru_RU, and ca_AD preferred
		my @files;
		if ($TYPE eq "colldef") {
			@files = sort {
				if ($a eq 'en_x_US.UTF-8' ||
				    $a eq 'ru_x_RU.UTF-8' ||
				    $a eq 'ca_x_AD.UTF-8') { return -1; }
				elsif ($b eq 'en_x_US.UTF-8' ||
				       $b eq 'ru_x_RU.UTF-8' ||
				       $b eq 'ca_x_AD.UTF-8') { return 1; }
				else { return uc($b) cmp uc($a); }
				} keys(%{$hashtable{$hash}});
		} elsif ($TYPE eq "ctypedef") {
			# Preference: C.UTF-8, then en_US, then
			# en_GB.ISO8859-15 and ru_RU.
			@files = sort {
				if ($a eq 'C_x_x.UTF-8') { return -1; }
				elsif ($b eq 'C_x_x.UTF-8') { return 1; }
				if ($a =~ /^en_x_US/) { return -1; }
				elsif ($b =~ /^en_x_US/) { return 1; }

				if ($a =~ /^en_x_GB.ISO8859-15/ ||
				    $a =~ /^ru_x_RU/) { return -1; }
				elsif ($b =~ /^en_x_GB.ISO8859-15/ ||
				       $b =~ /^ru_x_RU/) { return 1; }
				else { return uc($b) cmp uc($a); }

				} keys(%{$hashtable{$hash}});
		} else {
			# en_US.UTF-8 first, *_Comm_* names last.
			@files = sort {
				if ($a =~ /_Comm_/ ||
				    $b eq 'en_x_US.UTF-8') { return 1; }
				elsif ($b =~ /_Comm_/ ||
				       $a eq 'en_x_US.UTF-8') { return -1; }
				else { return uc($b) cmp uc($a); }
				} keys(%{$hashtable{$hash}});
		}
		if ($#files > 0) {
			my $link = shift(@files);
			$link =~ s/_x_x//;	# special case for C
			$link =~ s/_x_/_/;	# strip family if none there
			foreach my $file (@files) {
				# Split "lang_family_country.encoding" so
				# the %languages entry can be cleared below.
				my @a = split(/_/, $file);
				my @b = split(/\./, $a[-1]);
				$file =~ s/_x_/_/;
				print {$fout} "SAME+=\t\t$link $file\n";
				undef($languages{$a[0]}{$a[1]}{data}{$b[0]}{$b[1]});
			}
		}
	}

	foreach my $l (sort keys(%languages)) {
	foreach my $f (sort keys(%{$languages{$l}})) {
	foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
		next if (defined $languages{$l}{$f}{definitions}
		    && $languages{$l}{$f}{definitions} !~ /$TYPE/);
		if (defined $languages{$l}{$f}{data}{$c}{$DEFENCODING}
		 && $languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
			print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") .
			    "${c} - not read\n";
			next;
		}
		foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
			my $file = $l;
			$file .= "_" . $f if ($f ne "x");
			$file .= "_" . $c if ($c ne "x");
			# Entries cleared above are links, not locales.
			next if (!defined $languages{$l}{$f}{data}{$c}{$e});
			print {$fout} "LOCALES+=\t$file.$e\n";
		}

		if (defined $languages{$l}{$f}{nc_link}) {
			foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;
				print {$fout} "SAME+=\t\t$file.$e $languages{$l}{$f}{nc_link}.$e\t# legacy (lang/country change)\n";
			}
		}

		if (defined $languages{$l}{$f}{e_link}) {
			# e_link is a space-separated list of "from:to"
			# encoding pairs.
			foreach my $el (split(" ", $languages{$l}{$f}{e_link})) {
				my @a = split(/:/, $el);
				my $file = $l . "_";
				$file .= $f . "_" if ($f ne "x");
				$file .= $c;
				print {$fout} "SAME+=\t\t$file.$a[0] $file.$a[1]\t# legacy (same charset)\n";
			}
		}

	}
	}
	}

	print {$fout} <<EOF;

FILES=		\${LOCALES:S/\$/.${SRCOUT2}/}
CLEANFILES=	\${FILES}

.for f t in \${SAME}
DIRS+=		LOCALEDIR_\$t
LOCALEDIR_\$t=	\${LOCALEDIR}/\$t
LOCALEDIR_\$tPACKAGE=	locales
SYMLINKS+=	../\$f/\${FILESNAME} \\
    \${LOCALEDIR}/\$t/\${FILESNAME}
.endfor

.for f in \${LOCALES}
FILESDIR_\${f}.${SRCOUT2}= \${LOCALEDIR}/\${f}
.endfor

${SRCOUT3}.include <bsd.prog.mk>
EOF

	close($fout) or die "Cannot write $makefile: $!\n";
}
1102