1#! /usr/local/bin/perl
2#
3#
4# This file and its contents are supplied under the terms of the
5# Common Development and Distribution License ("CDDL"), version 1.0.
6# You may only use this file in accordance with the terms of version
7# 1.0 of the CDDL.
8#
9# A full copy of the text of the CDDL should have accompanied this
10# source.  A copy is of the CDDL is also available via the Internet
11# at http://www.illumos.org/license/CDDL.
12#
13
14#
15# Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
16# Copyright 2015 John Marino <draco@marino.st>
17#
18
19# This converts MAPPING files to localedef character maps
20# suitable for use with the UTF-8 derived localedef data.
21
#
# Convert a UCS code point (given as an integer) into its UTF-8 encoding,
# returned as a string of localedef-style escapes, one "\xHH" per byte with
# upper-case hex digits.  For example 0x20AC yields '\xE2\x82\xAC'.
#
# Besides the 1-4 byte forms, the 5 and 6 byte forms of the original UTF-8
# scheme are produced for values above 0x1fffff.
#
sub ucs_to_utf8
{
    my $ucs = shift;
    my $utf8 = "";
    my $ntrail;		# number of 10xxxxxx continuation bytes
    my $lead;		# marker bits of the leading byte

    if ($ucs <= 0x7f) {
	# Single byte: the value is its own encoding.
	return (sprintf("\\x%02X", $ucs));
    } elsif ($ucs <= 0x7ff) {
	($ntrail, $lead) = (1, 0xc0);
    } elsif ($ucs <= 0xffff) {
	($ntrail, $lead) = (2, 0xe0);
    } elsif ($ucs <= 0x1fffff) {
	($ntrail, $lead) = (3, 0xf0);
    } elsif ($ucs <= 0x03ffffff) {
	($ntrail, $lead) = (4, 0xf8);
    } else {
	# Six byte form: the leading byte marker is 1111110x (0xfc),
	# not the five byte marker 0xf8.
	($ntrail, $lead) = (5, 0xfc);
    }

    # Peel off six payload bits at a time, least significant first,
    # prepending each continuation byte to the result.
    foreach my $i (1 .. $ntrail) {
	$utf8 = sprintf("\\x%02X", ($ucs & 0x3f) | 0x80).$utf8;
	$ucs >>= 6;
    }

    # Whatever bits remain belong in the leading byte.
    return (sprintf("\\x%02X", $ucs | $lead).$utf8);
}
77
78my %unames;
79my %uvalues;
80
81#
# This is not a general purpose Character Map parser, but it's good enough
# for the stock one supplied with CLDR.
84#
#
# Read the character map in $file and record it in the file-level hashes:
#   %unames  - maps a value (second field) to the name(s) in the first
#              field; several names for one value are joined with "\n".
#   %uvalues - maps each name to its value.
#
# Comment lines, blank lines and the CHARMAP / END CHARMAP markers are
# skipped.  Dies if the file cannot be opened.
#
sub load_utf8_cm
{
    my $file = shift;

    # Three-argument open with a lexical handle, so that the file name is
    # never interpreted as a mode or a pipe and the handle stays private.
    open(my $fh, '<', $file) || die "$!: open: $file";

    while (my $line = <$fh>) {
	next if ($line =~ /^#/);
	next if ($line =~ /^\s*$/);
	next if ($line =~ /^\s*CHARMAP\s*$/);
	next if ($line =~ /^\s*END\s*CHARMAP\s*$/);
	chomp($line);
	my ($name, $utf8val) = split(/\s+/, $line);

	# A line without a value field cannot be mapped; ignore it rather
	# than recording an entry under an undefined key.
	next unless (defined($utf8val));

	if (defined($unames{$utf8val})) {
	    $unames{$utf8val} .= "\n" .$name;
	} else {
	    $unames{$utf8val} = $name;
	}
	$uvalues{$name} = $utf8val;
    }
    close($fh);
}
110
111my %map;
112
#
# Read the MAPPING file $file into the file-level %map hash.  The first
# field of each line is used as the key, exactly as written; the second
# field is parsed as a hexadecimal UCS value and converted with
# ucs_to_utf8().  When a key occurs more than once, the resulting UTF-8
# strings are joined with a single space.
#
# Lines matching any of the skip patterns below are ignored.  Dies if the
# file cannot be opened.
#
sub load_map
{
    my $file = shift;

    # Three-argument open with a lexical handle; report which file failed
    # and why.
    open(my $fh, '<', $file) || die "$!: open: $file";

    while (my $line = <$fh>) {
	next if ($line =~ /^#/);		# comment
	next if ($line =~ /^\s*$/);		# blank
	next if ($line =~ /^0x..\+0x../);	# "0xNN+0xNN" style key
	# four digit code, a tab, then two space-separated four digit codes
	next if ($line =~ /^0x[0-9A-F]{4}\t0x[0-9A-F]{4} 0x[0-9A-F]{4}/);
	next if ($line =~ /^0x[0-9A-F]{2}\s+#/);	# code followed only by a comment
	next if ($line =~ /# ... NO MAPPING .../);
	chomp($line);
	my ($val, $ucs) = split(/\s+/, $line);

	# Without a second field there is nothing to map; hex() would
	# otherwise turn the missing value into a bogus U+0000 entry.
	next unless (defined($ucs));

	$ucs =~ s/^\\x[0]*//;
	my $utf8 = ucs_to_utf8(hex($ucs));
	if (defined($map{$val})) {
	    $map{$val} .= " ".$utf8;
	} else {
	    $map{$val} = $utf8;
	}
    }
    close($fh);
}
139
#
# Turn a hexadecimal string (anything hex() accepts) into a string of
# "\xhh" escapes, one per byte, most significant byte first and with
# lower-case hex digits.  Leading zero bytes are dropped; a value of zero
# yields the single escape "\x00".
#
sub mb_str
{
    my $hexstr = shift;
    my $num = hex($hexstr);
    my @bytes;

    return ("\\x00") unless ($num);

    # Collect bytes from the low end, pushing each to the front so that
    # the final order is most significant first.
    for (; $num; $num >>= 8) {
	unshift(@bytes, sprintf("\\x%02x", $num & 0xff));
    }
    return (join("", @bytes));
}
155
#
# Main program.  Arguments: the MAPPING file to convert and the name of
# the code set.  The UTF-8 character map is read from
# $ETCDIR/final-maps/map.UTF-8 (ETCDIR defaults to "etc"), and the
# resulting charmap is written to standard output.
#
my $mf = shift(@ARGV);
my $codeset = shift(@ARGV);
my $max_mb;

# Without a mapping file there is nothing to do; say so instead of
# failing later inside open().
die "usage: $0 mapping-file codeset\n" unless (defined($mf));

my $etcdir = (exists $ENV{'ETCDIR'}) ? $ENV{'ETCDIR'} : "etc";
load_utf8_cm("${etcdir}/final-maps/map.UTF-8");
load_map($mf);

# Value printed as <mb_cur_max> for the multibyte code sets known here;
# every other code set gets 1.
my %max_mb_for = (
    "SJIS"	=> 2,
    "eucCN"	=> 2,
    "eucJP"	=> 3,
    "eucKR"	=> 2,
    "GBK"	=> 2,
    "GB2312"	=> 2,
    "Big5"	=> 2,
);
$max_mb = (defined($codeset) && exists($max_mb_for{$codeset}))
    ? $max_mb_for{$codeset} : 1;

print("<code_set_name> \"$codeset\"\n");
print("<mb_cur_min> 1\n");
print("<mb_cur_max> $max_mb\n");

print("CHARMAP\n");
foreach my $val (sort (keys (%map))) {
    foreach my $utf8 (split / /, $map{$val}) {
	# Names recorded for this UTF-8 sequence, newline separated.
	# Sequences absent from the UTF-8 map produce no output.
	my $ref = $unames{$utf8};
	next unless (defined($ref));
	foreach my $name (sort (split /\n/, $ref)) {
	    # Pad with tabs towards column 64.  Always emit at least one
	    # tab: for names of 64 or more characters the computed count
	    # is zero or negative, which previously left no separator
	    # (or, when negative, never terminated the padding loop).
	    my $nt = int((64 - length($name) + 7) / 8);
	    $nt = 1 if ($nt < 1);
	    print($name . ("\t" x $nt) . mb_str($val) . "\n");
	}
    }
}
print "END CHARMAP\n";
194