1#!/usr/local/bin/perl -wC
2
3# SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4#
5# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
6# Copyright 2015 John Marino <draco@marino.st>
7#
8# Redistribution and use in source and binary forms, with or without
9# modification, are permitted provided that the following conditions
10# are met:
11# 1. Redistributions of source code must retain the above copyright
12#    notice, this list of conditions and the following disclaimer.
13# 2. Redistributions in binary form must reproduce the above copyright
14#    notice, this list of conditions and the following disclaimer in the
15#    documentation and/or other materials provided with the distribution.
16#
17# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27# SUCH DAMAGE.
28#
29# $FreeBSD$
30
31use strict;
32use Getopt::Long;
33
34if ($#ARGV != 0) {
35	print "Usage: $0 --unidir=<unidir>\n";
36	exit(1);
37}
38
39my $UNIDIR = undef;
40
41my $result = GetOptions (
42		"unidir=s"	=> \$UNIDIR
43	    );
44
45my %utf8map = ();
46my $outfilename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
47
48get_utf8map("$UNIDIR/posix/UTF-8.cm");
49generate_header ();
50parse_unidata ("$UNIDIR/UnicodeData.txt");
51generate_footer ();
52
53############################
54
55sub get_utf8map {
56	my $file = shift;
57
58	open(FIN, $file);
59	my @lines = <FIN>;
60	close(FIN);
61	chomp(@lines);
62
63	my $incharmap = 0;
64	foreach my $l (@lines) {
65		$l =~ s/\r//;
66		next if ($l =~ /^\#/);
67		next if ($l eq "");
68
69		if ($l eq "CHARMAP") {
70			$incharmap = 1;
71			next;
72		}
73
74		next if (!$incharmap);
75		last if ($l eq "END CHARMAP");
76
77		$l =~ /^(<[^\s]+>)\s+(.*)/;
78		my $k = $2;
79		my $v = $1;
80		$k =~ s/\\x//g;		# UTF-8 char code
81		$utf8map{$k} = $v;
82	}
83}
84
85sub generate_header {
86	open(FOUT, ">", "$outfilename")
87		or die ("can't write to $outfilename\n");
88	print FOUT <<EOF;
89# Warning: Do not edit. This file is automatically generated from the
90# tools in /usr/src/tools/tools/locale. The data is obtained from the
91# CLDR project, obtained from http://cldr.unicode.org/
92# -----------------------------------------------------------------------------
93
94comment_char *
95escape_char /
96
97LC_CTYPE
98EOF
99}
100
101sub generate_footer {
102	print FOUT "\nEND LC_CTYPE\n";
103	close (FOUT);
104}
105
106sub wctomb {
107	my $wc = hex(shift);
108	my $lead;
109	my $len;
110	my $ret = "";
111	my $i;
112
113	if (($wc & ~0x7f) == 0) {
114		return sprintf "%02X", $wc;
115	} elsif (($wc & ~0x7ff) == 0) {
116		$lead = 0xc0;
117		$len = 2;
118	} elsif (($wc & ~0xffff) == 0) {
119		$lead = 0xe0;
120		$len = 3;
121	} elsif ($wc >= 0 && $wc <= 0x10ffff) {
122		$lead = 0xf0;
123		$len = 4;
124	}
125
126	for ($i = $len - 1; $i > 0; $i--) {
127		$ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret;
128		$wc >>= 6;
129	}
130	$ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret;
131
132	return $ret;
133}
134
135sub parse_unidata {
136	my $file = shift;
137	my %data = ();
138
139	open(FIN, $file);
140	my @lines = <FIN>;
141	close(FIN);
142	chomp(@lines);
143
144	foreach my $l (@lines) {
145		my @d = split(/;/, $l, -1);
146		my $mb = wctomb($d[0]);
147		my $cat;
148
149		# XXX There are code points present in UnicodeData.txt
150		# and missing from UTF-8.cm
151		next if !defined $utf8map{$mb};
152
153		# Define the category
154		if ($d[2] =~ /^Lu/) {
155			$cat = "upper";
156		} elsif ($d[2] =~ /^Ll/) {
157			$cat = "lower";
158		} elsif ($d[2] =~ /^Nd/) {
159			$cat = "digit";
160		} elsif ($d[2] =~ /^L/) {
161			$cat = "alpha";
162		} elsif ($d[2] =~ /^P/) {
163			$cat = "punct";
164		} elsif ($d[2] =~ /^Co/ || $d[2] =~ /^M/ || $d[2] =~ /^N/ ||
165		    $d[2] =~ /^S/) {
166			$cat = "graph";
167		} elsif ($d[2] =~ /^C/) {
168			$cat = "cntrl";
169		} elsif ($d[2] =~ /^Z/) {
170			$cat = "space";
171		}
172		$data{$cat}{$mb}{'wc'} = $d[0];
173
174		# Check if it's a start or end of range
175		if ($d[1] =~ /First>$/) {
176			$data{$cat}{$mb}{'start'} = 1;
177		} elsif ($d[1] =~ /Last>$/) {
178			$data{$cat}{$mb}{'end'} = 1;
179		}
180
181		# Check if there's upper/lower mapping
182		if ($d[12] ne "") {
183			$data{'toupper'}{$mb} = wctomb($d[12]);
184		} elsif ($d[13] ne "") {
185			$data{'tolower'}{$mb} = wctomb($d[13]);
186		}
187	}
188
189	my $first;
190	my $inrange = 0;
191
192	# Now write out the categories
193	foreach my $cat (sort keys (%data)) {
194		print FOUT "$cat\t";
195		$first = 1;
196	foreach my $mb (sort keys (%{$data{$cat}})) {
197		if ($first == 1) {
198			$first = 0;
199		} elsif ($inrange == 1) {
200			# Safety belt
201			die "broken range end wc=$data{$cat}{$mb}{'wc'}"
202			    if !defined $data{$cat}{$mb}{'end'};
203			print FOUT ";...;";
204			$inrange = 0;
205		} else {
206			print FOUT ";/\n\t";
207		}
208
209		if ($cat eq "tolower" || $cat eq "toupper") {
210			print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})";
211		} else {
212			if (defined($data{$cat}{$mb}{'start'})) {
213				$inrange = 1;
214			}
215			print FOUT "$utf8map{$mb}";
216		}
217	}
218		print FOUT "\n";
219	}
220}
221