1#!/usr/local/bin/perl -wC
2
3# SPDX-License-Identifier: BSD-2-Clause
4#
5# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
6# Copyright 2015 John Marino <draco@marino.st>
7#
8# Redistribution and use in source and binary forms, with or without
9# modification, are permitted provided that the following conditions
10# are met:
11# 1. Redistributions of source code must retain the above copyright
12#    notice, this list of conditions and the following disclaimer.
13# 2. Redistributions in binary form must reproduce the above copyright
14#    notice, this list of conditions and the following disclaimer in the
15#    documentation and/or other materials provided with the distribution.
16#
17# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27# SUCH DAMAGE.
28#
29# $FreeBSD$
30
31use strict;
32use Getopt::Long;
33use Encode qw(encode decode);
34
35if ($#ARGV != 0) {
36	print "Usage: $0 --unidir=<unidir>\n";
37	exit(1);
38}
39
40my $UNIDIR = undef;
41
42my $result = GetOptions (
43		"unidir=s"	=> \$UNIDIR
44	    );
45
46my %utf8map = ();
47my $outfilename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
48
49get_utf8map("$UNIDIR/posix/UTF-8.cm");
50generate_header ();
51parse_unidata ("$UNIDIR/UnicodeData.txt");
52generate_footer ();
53
54############################
55
56sub utf8to32 {
57	my @kl = split /\\x/, $_[0];
58
59	shift @kl if ($kl[0] eq '');
60	my $k = pack('H2' x scalar @kl, @kl);
61	my $ux = encode('UTF-32BE', decode('UTF-8', $k));
62	my $u = uc(unpack('H*', $ux));
63	# Remove BOM
64	$u =~ s/^0000FEFF//;
65	# Remove heading bytes of 0
66	while ($u =~ m/^0/ and length($u) > 4) {
67		$u =~ s/^0//;
68	}
69
70	return $u;
71}
72
73sub get_utf8map {
74	my $file = shift;
75
76	open(FIN, $file);
77	my @lines = <FIN>;
78	close(FIN);
79	chomp(@lines);
80
81	my $incharmap = 0;
82	foreach my $l (@lines) {
83		$l =~ s/\r//;
84		next if ($l =~ /^\#/);
85		next if ($l eq "");
86
87		if ($l eq "CHARMAP") {
88			$incharmap = 1;
89			next;
90		}
91
92		next if (!$incharmap);
93		last if ($l eq "END CHARMAP");
94
95		$l =~ /^(<[^\s]+>)\s+(.*)/;
96		my $k = utf8to32($2);	# UTF-8 char code
97		my $v = $1;
98
99#		print STDERR "register: $k - $v\n";
100		$utf8map{$k} = $v;
101	}
102}
103
104sub generate_header {
105	open(FOUT, ">", "$outfilename")
106		or die ("can't write to $outfilename\n");
107	print FOUT <<EOF;
108# Warning: Do not edit. This file is automatically generated from the
109# tools in /usr/src/tools/tools/locale. The data is obtained from the
110# CLDR project, obtained from http://cldr.unicode.org/
111# -----------------------------------------------------------------------------
112
113comment_char *
114escape_char /
115
116LC_CTYPE
117EOF
118}
119
120sub generate_footer {
121	print FOUT "\nEND LC_CTYPE\n";
122	close (FOUT);
123}
124
125sub wctomb {
126	my $wc = hex(shift);
127	my $lead;
128	my $len;
129	my $ret = "";
130	my $i;
131
132	if (($wc & ~0x7f) == 0) {
133		return sprintf "%02X", $wc;
134	} elsif (($wc & ~0x7ff) == 0) {
135		$lead = 0xc0;
136		$len = 2;
137	} elsif (($wc & ~0xffff) == 0) {
138		$lead = 0xe0;
139		$len = 3;
140	} elsif ($wc >= 0 && $wc <= 0x10ffff) {
141		$lead = 0xf0;
142		$len = 4;
143	}
144
145	for ($i = $len - 1; $i > 0; $i--) {
146		$ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret;
147		$wc >>= 6;
148	}
149	$ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret;
150
151	return $ret;
152}
153
154sub parse_unidata {
155	my $file = shift;
156	my %data = ();
157
158	open(FIN, $file);
159	my @lines = <FIN>;
160	close(FIN);
161	chomp(@lines);
162
163	foreach my $l (@lines) {
164		my @d = split(/;/, $l, -1);
165		my $mb = $d[0];
166		my $cat;
167
168		# XXX There are code points present in UnicodeData.txt
169		# and missing from UTF-8.cm
170		next if !defined $utf8map{$mb};
171
172		# Define the category
173		if ($d[2] =~ /^Lu/) {
174			$cat = "upper";
175		} elsif ($d[2] =~ /^Ll/) {
176			$cat = "lower";
177		} elsif ($d[2] =~ /^Nd/) {
178			$cat = "digit";
179		} elsif ($d[2] =~ /^L/) {
180			$cat = "alpha";
181		} elsif ($d[2] =~ /^P/) {
182			$cat = "punct";
183		} elsif ($d[2] =~ /^Co/ || $d[2] =~ /^M/ || $d[2] =~ /^N/ ||
184		    $d[2] =~ /^S/) {
185			$cat = "graph";
186		} elsif ($d[2] =~ /^C/) {
187			$cat = "cntrl";
188		} elsif ($d[2] =~ /^Z/) {
189			$cat = "space";
190		}
191		$data{$cat}{$mb}{'wc'} = $d[0];
192
193		# Check if it's a start or end of range
194		if ($d[1] =~ /First>$/) {
195			$data{$cat}{$mb}{'start'} = 1;
196		} elsif ($d[1] =~ /Last>$/) {
197			$data{$cat}{$mb}{'end'} = 1;
198		}
199
200		# Check if there's upper/lower mapping
201		if ($d[12] ne "") {
202			$data{'toupper'}{$mb} = $d[12];
203		} elsif ($d[13] ne "") {
204			$data{'tolower'}{$mb} = $d[13];
205		}
206	}
207
208	my $first;
209	my $inrange = 0;
210
211	# Now write out the categories
212	foreach my $cat (sort keys (%data)) {
213		print FOUT "$cat\t";
214		$first = 1;
215	foreach my $mb (sort {hex($a) <=> hex($b)} keys (%{$data{$cat}})) {
216		if ($first == 1) {
217			$first = 0;
218		} elsif ($inrange == 1) {
219			# Safety belt
220			die "broken range end wc=$data{$cat}{$mb}{'wc'}"
221			    if !defined $data{$cat}{$mb}{'end'};
222			print FOUT ";...;";
223			$inrange = 0;
224		} else {
225			print FOUT ";/\n\t";
226		}
227
228		if ($cat eq "tolower" || $cat eq "toupper") {
229			print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})";
230		} else {
231			if (defined($data{$cat}{$mb}{'start'})) {
232				$inrange = 1;
233			}
234			print FOUT "$utf8map{$mb}";
235		}
236	}
237		print FOUT "\n";
238	}
239}
240