1#!/usr/bin/env perl -wC
2
3#
4# $FreeBSD$
5#
6
7use strict;
8use XML::Parser;
9use Tie::IxHash;
10use Data::Dumper;
11use Getopt::Long;
12use Digest::SHA qw(sha1_hex);
13
14
15if ($#ARGV < 2) {
16	print "Usage: $0 --cldr=<cldrdir> --unidata=<unidatadir> --etc=<etcdir> --input=<inputfile> --output=<outputfile>\n";
17	exit(1);
18}
19
20my @filter = ();
21
22my $CLDRDIR = undef;
23my $UNIDATADIR = undef;
24my $ETCDIR = undef;
25my $TYPE = undef;
26my $INPUT = undef;
27my $OUTPUT = undef;
28
29my $result = GetOptions (
30		"cldr=s"	=> \$CLDRDIR,
31		"unidata=s"	=> \$UNIDATADIR,
32		"etc=s"		=> \$ETCDIR,
33		"type=s"	=> \$TYPE,
34		"input=s"	=> \$INPUT,
35		"output=s"	=> \$OUTPUT,
36	    );
37
38my %ucd = ();
39my %utf8map = ();
40my %utf8aliases = ();
41get_unidata($UNIDATADIR);
42get_utf8map("$CLDRDIR/posix/UTF-8.cm");
43convert($INPUT, $OUTPUT);
44
45############################
46
47sub get_unidata {
48	my $directory = shift;
49
50	open(FIN, "$directory/UnicodeData.txt")
51	    or die("Cannot open $directory/UnicodeData.txt");;
52	my @lines = <FIN>;
53	chomp(@lines);
54	close(FIN);
55
56	foreach my $l (@lines) {
57		my @a = split(/;/, $l);
58
59		$ucd{code2name}{"$a[0]"} = $a[1];	# Unicode name
60		$ucd{name2code}{"$a[1]"} = $a[0];	# Unicode code
61	}
62}
63
64sub get_utf8map {
65	my $file = shift;
66
67	open(FIN, $file);
68	my @lines = <FIN>;
69	close(FIN);
70	chomp(@lines);
71
72	my $prev_k = undef;
73	my $prev_v = "";
74	my $incharmap = 0;
75	foreach my $l (@lines) {
76		$l =~ s/\r//;
77		next if ($l =~ /^\#/);
78		next if ($l eq "");
79
80		if ($l eq "CHARMAP") {
81			$incharmap = 1;
82			next;
83		}
84
85		next if (!$incharmap);
86		last if ($l eq "END CHARMAP");
87
88		$l =~ /^<([^\s]+)>\s+(.*)/;
89		my $k = $1;
90		my $v = $2;
91		$k =~ s/_/ /g;		# unicode char string
92		$v =~ s/\\x//g;		# UTF-8 char code
93		$utf8map{$k} = $v;
94
95		$utf8aliases{$k} = $prev_k if ($prev_v eq $v);
96
97		$prev_v = $v;
98		$prev_k = $k;
99	}
100}
101
102sub decode_cldr {
103	my $s = shift;
104
105	my $v = $utf8map{$s};
106	$v = $utf8aliases{$s} if (!defined $v);
107	die "Cannot convert $s" if (!defined $v);
108
109	return pack("C", hex($v)) if (length($v) == 2);
110	return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)))
111		if (length($v) == 4);
112	return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)),
113	    hex(substr($v, 4, 2))) if (length($v) == 6);
114	print STDERR "Cannot convert $s\n";
115	return "length = " . length($v);
116}
117
118sub convert {
119	my $IN = shift;
120	my $OUT = shift;
121
122	open(FIN, "$IN");
123	open(FOUT, ">$OUT");
124
125#	print Dumper(%utf8map);
126
127	my $l;
128	while (defined ($l = <FIN>)) {
129		chomp($l);
130
131		if ($l =~ /^#/) {
132			print FOUT $l, "\n";
133			next;
134		}
135
136		while ($l =~ /^(.*?)<(.*?)>(.*)$/) {
137			$l = $1 . decode_cldr($2) . $3;
138		}
139		print FOUT $l, "\n";
140	}
141
142	close(FOUT);
143	close(FIN);
144}
145