1#!/usr/local/bin/perl -wC 2 3# SPDX-License-Identifier: BSD-2-Clause 4# 5# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org> 6# Copyright 2015 John Marino <draco@marino.st> 7# 8# Redistribution and use in source and binary forms, with or without 9# modification, are permitted provided that the following conditions 10# are met: 11# 1. Redistributions of source code must retain the above copyright 12# notice, this list of conditions and the following disclaimer. 13# 2. Redistributions in binary form must reproduce the above copyright 14# notice, this list of conditions and the following disclaimer in the 15# documentation and/or other materials provided with the distribution. 16# 17# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27# SUCH DAMAGE. 28# 29# $FreeBSD$ 30 31use strict; 32use Getopt::Long; 33use Encode qw(encode decode); 34 35if ($#ARGV != 0) { 36 print "Usage: $0 --unidir=<unidir>\n"; 37 exit(1); 38} 39 40my $UNIDIR = undef; 41 42my $result = GetOptions ( 43 "unidir=s" => \$UNIDIR 44 ); 45 46my %utf8map = (); 47my $outfilename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src"; 48 49get_utf8map("$UNIDIR/posix/UTF-8.cm"); 50generate_header (); 51parse_unidata ("$UNIDIR/UnicodeData.txt"); 52generate_footer (); 53 54############################ 55 56sub utf8to32 { 57 my @kl = split /\\x/, $_[0]; 58 59 shift @kl if ($kl[0] eq ''); 60 my $k = pack('H2' x scalar @kl, @kl); 61 my $ux = encode('UTF-32BE', decode('UTF-8', $k)); 62 my $u = uc(unpack('H*', $ux)); 63 # Remove BOM 64 $u =~ s/^0000FEFF//; 65 # Remove heading bytes of 0 66 while ($u =~ m/^0/ and length($u) > 4) { 67 $u =~ s/^0//; 68 } 69 70 return $u; 71} 72 73sub get_utf8map { 74 my $file = shift; 75 76 open(FIN, $file); 77 my @lines = <FIN>; 78 close(FIN); 79 chomp(@lines); 80 81 my $incharmap = 0; 82 foreach my $l (@lines) { 83 $l =~ s/\r//; 84 next if ($l =~ /^\#/); 85 next if ($l eq ""); 86 87 if ($l eq "CHARMAP") { 88 $incharmap = 1; 89 next; 90 } 91 92 next if (!$incharmap); 93 last if ($l eq "END CHARMAP"); 94 95 $l =~ /^(<[^\s]+>)\s+(.*)/; 96 my $k = utf8to32($2); # UTF-8 char code 97 my $v = $1; 98 99# print STDERR "register: $k - $v\n"; 100 $utf8map{$k} = $v; 101 } 102} 103 104sub generate_header { 105 open(FOUT, ">", "$outfilename") 106 or die ("can't write to $outfilename\n"); 107 print FOUT <<EOF; 108# Warning: Do not edit. This file is automatically generated from the 109# tools in /usr/src/tools/tools/locale. The data is obtained from the 110# CLDR project, obtained from http://cldr.unicode.org/ 111# ----------------------------------------------------------------------------- 112 113comment_char * 114escape_char / 115 116LC_CTYPE 117EOF 118} 119 120sub generate_footer { 121 print FOUT "\nEND LC_CTYPE\n"; 122 close (FOUT); 123} 124 125sub wctomb { 126 my $wc = hex(shift); 127 my $lead; 128 my $len; 129 my $ret = ""; 130 my $i; 131 132 if (($wc & ~0x7f) == 0) { 133 return sprintf "%02X", $wc; 134 } elsif (($wc & ~0x7ff) == 0) { 135 $lead = 0xc0; 136 $len = 2; 137 } elsif (($wc & ~0xffff) == 0) { 138 $lead = 0xe0; 139 $len = 3; 140 } elsif ($wc >= 0 && $wc <= 0x10ffff) { 141 $lead = 0xf0; 142 $len = 4; 143 } 144 145 for ($i = $len - 1; $i > 0; $i--) { 146 $ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret; 147 $wc >>= 6; 148 } 149 $ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret; 150 151 return $ret; 152} 153 154sub parse_unidata { 155 my $file = shift; 156 my %data = (); 157 158 open(FIN, $file); 159 my @lines = <FIN>; 160 close(FIN); 161 chomp(@lines); 162 163 foreach my $l (@lines) { 164 my @d = split(/;/, $l, -1); 165 my $mb = $d[0]; 166 my $cat; 167 168 # XXX There are code points present in UnicodeData.txt 169 # and missing from UTF-8.cm 170 next if !defined $utf8map{$mb}; 171 172 # Define the category 173 if ($d[2] =~ /^Lu/) { 174 $cat = "upper"; 175 } elsif ($d[2] =~ /^Ll/) { 176 $cat = "lower"; 177 } elsif ($d[2] =~ /^Nd/) { 178 $cat = "digit"; 179 } elsif ($d[2] =~ /^L/) { 180 $cat = "alpha"; 181 } elsif ($d[2] =~ /^P/) { 182 $cat = "punct"; 183 } elsif ($d[2] =~ /^Co/ || $d[2] =~ /^M/ || $d[2] =~ /^N/ || 184 $d[2] =~ /^S/) { 185 $cat = "graph"; 186 } elsif ($d[2] =~ /^C/) { 187 $cat = "cntrl"; 188 } elsif ($d[2] =~ /^Z/) { 189 $cat = "space"; 190 } 191 $data{$cat}{$mb}{'wc'} = $d[0]; 192 193 # Check if it's a start or end of range 194 if ($d[1] =~ /First>$/) { 195 $data{$cat}{$mb}{'start'} = 1; 196 } elsif ($d[1] =~ /Last>$/) { 197 $data{$cat}{$mb}{'end'} = 1; 198 } 199 200 # Check if there's upper/lower mapping 201 if ($d[12] ne "") { 202 $data{'toupper'}{$mb} = $d[12]; 203 } elsif ($d[13] ne "") { 204 $data{'tolower'}{$mb} = $d[13]; 205 } 206 } 207 208 my $first; 209 my $inrange = 0; 210 211 # Now write out the categories 212 foreach my $cat (sort keys (%data)) { 213 print FOUT "$cat\t"; 214 $first = 1; 215 foreach my $mb (sort {hex($a) <=> hex($b)} keys (%{$data{$cat}})) { 216 if ($first == 1) { 217 $first = 0; 218 } elsif ($inrange == 1) { 219 # Safety belt 220 die "broken range end wc=$data{$cat}{$mb}{'wc'}" 221 if !defined $data{$cat}{$mb}{'end'}; 222 print FOUT ";...;"; 223 $inrange = 0; 224 } else { 225 print FOUT ";/\n\t"; 226 } 227 228 if ($cat eq "tolower" || $cat eq "toupper") { 229 print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})"; 230 } else { 231 if (defined($data{$cat}{$mb}{'start'})) { 232 $inrange = 1; 233 } 234 print FOUT "$utf8map{$mb}"; 235 } 236 } 237 print FOUT "\n"; 238 } 239} 240