1#!/usr/local/bin/perl -wC
2# $FreeBSD$
3
4use strict;
5use Getopt::Long;
6
7if ($#ARGV != 0) {
8	print "Usage: $0 --unidir=<unidir>\n";
9	exit(1);
10}
11
12my $UNIDIR = undef;
13
14my $result = GetOptions (
15		"unidir=s"	=> \$UNIDIR
16	    );
17
18my %utf8map = ();
19my $outfilename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
20
21get_utf8map("$UNIDIR/posix/UTF-8.cm");
22generate_header ();
23parse_unidata ("$UNIDIR/UnicodeData.txt");
24generate_footer ();
25
26############################
27
28sub get_utf8map {
29	my $file = shift;
30
31	open(FIN, $file);
32	my @lines = <FIN>;
33	close(FIN);
34	chomp(@lines);
35
36	my $incharmap = 0;
37	foreach my $l (@lines) {
38		$l =~ s/\r//;
39		next if ($l =~ /^\#/);
40		next if ($l eq "");
41
42		if ($l eq "CHARMAP") {
43			$incharmap = 1;
44			next;
45		}
46
47		next if (!$incharmap);
48		last if ($l eq "END CHARMAP");
49
50		$l =~ /^(<[^\s]+>)\s+(.*)/;
51		my $k = $2;
52		my $v = $1;
53		$k =~ s/\\x//g;		# UTF-8 char code
54		$utf8map{$k} = $v;
55	}
56}
57
58sub generate_header {
59	open(FOUT, ">", "$outfilename")
60		or die ("can't write to $outfilename\n");
61	print FOUT <<EOF;
62# Warning: Do not edit. This file is automatically generated from the
63# tools in /usr/src/tools/tools/locale. The data is obtained from the
64# CLDR project, obtained from http://cldr.unicode.org/
65# -----------------------------------------------------------------------------
66
67comment_char *
68escape_char /
69
70LC_CTYPE
71EOF
72}
73
74sub generate_footer {
75	print FOUT "\nEND LC_CTYPE\n";
76	close (FOUT);
77}
78
79sub wctomb {
80	my $wc = hex(shift);
81	my $lead;
82	my $len;
83	my $ret = "";
84	my $i;
85
86	if (($wc & ~0x7f) == 0) {
87		return sprintf "%02X", $wc;
88	} elsif (($wc & ~0x7ff) == 0) {
89		$lead = 0xc0;
90		$len = 2;
91	} elsif (($wc & ~0xffff) == 0) {
92		$lead = 0xe0;
93		$len = 3;
94	} elsif ($wc >= 0 && $wc <= 0x10ffff) {
95		$lead = 0xf0;
96		$len = 4;
97	}
98
99	for ($i = $len - 1; $i > 0; $i--) {
100		$ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret;
101		$wc >>= 6;
102	}
103	$ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret;
104
105	return $ret;
106}
107
108sub parse_unidata {
109	my $file = shift;
110	my %data = ();
111
112	open(FIN, $file);
113	my @lines = <FIN>;
114	close(FIN);
115	chomp(@lines);
116
117	foreach my $l (@lines) {
118		my @d = split(/;/, $l, -1);
119		my $mb = wctomb($d[0]);
120		my $cat;
121
122		# XXX There are code points present in UnicodeData.txt
123		# and missing from UTF-8.cm
124		next if !defined $utf8map{$mb};
125
126		# Define the category
127		if ($d[2] =~ /^Lu/) {
128			$cat = "upper";
129		} elsif ($d[2] =~ /^Ll/) {
130			$cat = "lower";
131		} elsif ($d[2] =~ /^Nd/) {
132			$cat = "digit";
133		} elsif ($d[2] =~ /^L/) {
134			$cat = "alpha";
135		} elsif ($d[2] =~ /^P/) {
136			$cat = "punct";
137		} elsif ($d[2] =~ /^M/ || $d[2] =~ /^N/ || $d[2] =~ /^S/) {
138			$cat = "graph";
139		} elsif ($d[2] =~ /^C/) {
140			$cat = "cntrl";
141		} elsif ($d[2] =~ /^Z/) {
142			$cat = "space";
143		}
144		$data{$cat}{$mb}{'wc'} = $d[0];
145
146		# Check if it's a start or end of range
147		if ($d[1] =~ /First>$/) {
148			$data{$cat}{$mb}{'start'} = 1;
149		} elsif ($d[1] =~ /Last>$/) {
150			$data{$cat}{$mb}{'end'} = 1;
151		}
152
153		# Check if there's upper/lower mapping
154		if ($d[12] ne "") {
155			$data{'toupper'}{$mb} = wctomb($d[12]);
156		} elsif ($d[13] ne "") {
157			$data{'tolower'}{$mb} = wctomb($d[13]);
158		}
159	}
160
161	my $first;
162	my $inrange = 0;
163
164	# Now write out the categories
165	foreach my $cat (sort keys (%data)) {
166		print FOUT "$cat\t";
167		$first = 1;
168	foreach my $mb (sort keys (%{$data{$cat}})) {
169		if ($first == 1) {
170			$first = 0;
171		} elsif ($inrange == 1) {
172			# Safety belt
173			die "broken range end wc=$data{$cat}{$mb}{'wc'}"
174			    if !defined $data{$cat}{$mb}{'end'};
175			print FOUT ";...;";
176			$inrange = 0;
177		} else {
178			print FOUT ";/\n\t";
179		}
180
181		if ($cat eq "tolower" || $cat eq "toupper") {
182			print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})";
183		} else {
184			if (defined($data{$cat}{$mb}{'start'})) {
185				$inrange = 1;
186			}
187			print FOUT "$utf8map{$mb}";
188		}
189	}
190		print FOUT "\n";
191	}
192}
193