1#!/usr/local/bin/perl -wC 2 3# SPDX-License-Identifier: BSD-2-Clause-FreeBSD 4# 5# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org> 6# Copyright 2015 John Marino <draco@marino.st> 7# 8# Redistribution and use in source and binary forms, with or without 9# modification, are permitted provided that the following conditions 10# are met: 11# 1. Redistributions of source code must retain the above copyright 12# notice, this list of conditions and the following disclaimer. 13# 2. Redistributions in binary form must reproduce the above copyright 14# notice, this list of conditions and the following disclaimer in the 15# documentation and/or other materials provided with the distribution. 16# 17# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27# SUCH DAMAGE. 28# 29# $FreeBSD$ 30 31use strict; 32use Getopt::Long; 33 34if ($#ARGV != 0) { 35 print "Usage: $0 --unidir=<unidir>\n"; 36 exit(1); 37} 38 39my $UNIDIR = undef; 40 41my $result = GetOptions ( 42 "unidir=s" => \$UNIDIR 43 ); 44 45my %utf8map = (); 46my $outfilename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src"; 47 48get_utf8map("$UNIDIR/posix/UTF-8.cm"); 49generate_header (); 50parse_unidata ("$UNIDIR/UnicodeData.txt"); 51generate_footer (); 52 53############################ 54 55sub get_utf8map { 56 my $file = shift; 57 58 open(FIN, $file); 59 my @lines = <FIN>; 60 close(FIN); 61 chomp(@lines); 62 63 my $incharmap = 0; 64 foreach my $l (@lines) { 65 $l =~ s/\r//; 66 next if ($l =~ /^\#/); 67 next if ($l eq ""); 68 69 if ($l eq "CHARMAP") { 70 $incharmap = 1; 71 next; 72 } 73 74 next if (!$incharmap); 75 last if ($l eq "END CHARMAP"); 76 77 $l =~ /^(<[^\s]+>)\s+(.*)/; 78 my $k = $2; 79 my $v = $1; 80 $k =~ s/\\x//g; # UTF-8 char code 81 $utf8map{$k} = $v; 82 } 83} 84 85sub generate_header { 86 open(FOUT, ">", "$outfilename") 87 or die ("can't write to $outfilename\n"); 88 print FOUT <<EOF; 89# Warning: Do not edit. This file is automatically generated from the 90# tools in /usr/src/tools/tools/locale. The data is obtained from the 91# CLDR project, obtained from http://cldr.unicode.org/ 92# ----------------------------------------------------------------------------- 93 94comment_char * 95escape_char / 96 97LC_CTYPE 98EOF 99} 100 101sub generate_footer { 102 print FOUT "\nEND LC_CTYPE\n"; 103 close (FOUT); 104} 105 106sub wctomb { 107 my $wc = hex(shift); 108 my $lead; 109 my $len; 110 my $ret = ""; 111 my $i; 112 113 if (($wc & ~0x7f) == 0) { 114 return sprintf "%02X", $wc; 115 } elsif (($wc & ~0x7ff) == 0) { 116 $lead = 0xc0; 117 $len = 2; 118 } elsif (($wc & ~0xffff) == 0) { 119 $lead = 0xe0; 120 $len = 3; 121 } elsif ($wc >= 0 && $wc <= 0x10ffff) { 122 $lead = 0xf0; 123 $len = 4; 124 } 125 126 for ($i = $len - 1; $i > 0; $i--) { 127 $ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret; 128 $wc >>= 6; 129 } 130 $ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret; 131 132 return $ret; 133} 134 135sub parse_unidata { 136 my $file = shift; 137 my %data = (); 138 139 open(FIN, $file); 140 my @lines = <FIN>; 141 close(FIN); 142 chomp(@lines); 143 144 foreach my $l (@lines) { 145 my @d = split(/;/, $l, -1); 146 my $mb = wctomb($d[0]); 147 my $cat; 148 149 # XXX There are code points present in UnicodeData.txt 150 # and missing from UTF-8.cm 151 next if !defined $utf8map{$mb}; 152 153 # Define the category 154 if ($d[2] =~ /^Lu/) { 155 $cat = "upper"; 156 } elsif ($d[2] =~ /^Ll/) { 157 $cat = "lower"; 158 } elsif ($d[2] =~ /^Nd/) { 159 $cat = "digit"; 160 } elsif ($d[2] =~ /^L/) { 161 $cat = "alpha"; 162 } elsif ($d[2] =~ /^P/) { 163 $cat = "punct"; 164 } elsif ($d[2] =~ /^Co/ || $d[2] =~ /^M/ || $d[2] =~ /^N/ || 165 $d[2] =~ /^S/) { 166 $cat = "graph"; 167 } elsif ($d[2] =~ /^C/) { 168 $cat = "cntrl"; 169 } elsif ($d[2] =~ /^Z/) { 170 $cat = "space"; 171 } 172 $data{$cat}{$mb}{'wc'} = $d[0]; 173 174 # Check if it's a start or end of range 175 if ($d[1] =~ /First>$/) { 176 $data{$cat}{$mb}{'start'} = 1; 177 } elsif ($d[1] =~ /Last>$/) { 178 $data{$cat}{$mb}{'end'} = 1; 179 } 180 181 # Check if there's upper/lower mapping 182 if ($d[12] ne "") { 183 $data{'toupper'}{$mb} = wctomb($d[12]); 184 } elsif ($d[13] ne "") { 185 $data{'tolower'}{$mb} = wctomb($d[13]); 186 } 187 } 188 189 my $first; 190 my $inrange = 0; 191 192 # Now write out the categories 193 foreach my $cat (sort keys (%data)) { 194 print FOUT "$cat\t"; 195 $first = 1; 196 foreach my $mb (sort keys (%{$data{$cat}})) { 197 if ($first == 1) { 198 $first = 0; 199 } elsif ($inrange == 1) { 200 # Safety belt 201 die "broken range end wc=$data{$cat}{$mb}{'wc'}" 202 if !defined $data{$cat}{$mb}{'end'}; 203 print FOUT ";...;"; 204 $inrange = 0; 205 } else { 206 print FOUT ";/\n\t"; 207 } 208 209 if ($cat eq "tolower" || $cat eq "toupper") { 210 print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})"; 211 } else { 212 if (defined($data{$cat}{$mb}{'start'})) { 213 $inrange = 1; 214 } 215 print FOUT "$utf8map{$mb}"; 216 } 217 } 218 print FOUT "\n"; 219 } 220} 221