1#!/usr/local/bin/perl -wC 2# $FreeBSD$ 3 4use strict; 5use Getopt::Long; 6 7if ($#ARGV != 0) { 8 print "Usage: $0 --unidir=<unidir>\n"; 9 exit(1); 10} 11 12my $UNIDIR = undef; 13 14my $result = GetOptions ( 15 "unidir=s" => \$UNIDIR 16 ); 17 18my %utf8map = (); 19my $outfilename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src"; 20 21get_utf8map("$UNIDIR/posix/UTF-8.cm"); 22generate_header (); 23parse_unidata ("$UNIDIR/UnicodeData.txt"); 24generate_footer (); 25 26############################ 27 28sub get_utf8map { 29 my $file = shift; 30 31 open(FIN, $file); 32 my @lines = <FIN>; 33 close(FIN); 34 chomp(@lines); 35 36 my $incharmap = 0; 37 foreach my $l (@lines) { 38 $l =~ s/\r//; 39 next if ($l =~ /^\#/); 40 next if ($l eq ""); 41 42 if ($l eq "CHARMAP") { 43 $incharmap = 1; 44 next; 45 } 46 47 next if (!$incharmap); 48 last if ($l eq "END CHARMAP"); 49 50 $l =~ /^(<[^\s]+>)\s+(.*)/; 51 my $k = $2; 52 my $v = $1; 53 $k =~ s/\\x//g; # UTF-8 char code 54 $utf8map{$k} = $v; 55 } 56} 57 58sub generate_header { 59 open(FOUT, ">", "$outfilename") 60 or die ("can't write to $outfilename\n"); 61 print FOUT <<EOF; 62# Warning: Do not edit. This file is automatically generated from the 63# tools in /usr/src/tools/tools/locale. The data is obtained from the 64# CLDR project, obtained from http://cldr.unicode.org/ 65# ----------------------------------------------------------------------------- 66 67comment_char * 68escape_char / 69 70LC_CTYPE 71EOF 72} 73 74sub generate_footer { 75 print FOUT "\nEND LC_CTYPE\n"; 76 close (FOUT); 77} 78 79sub wctomb { 80 my $wc = hex(shift); 81 my $lead; 82 my $len; 83 my $ret = ""; 84 my $i; 85 86 if (($wc & ~0x7f) == 0) { 87 return sprintf "%02X", $wc; 88 } elsif (($wc & ~0x7ff) == 0) { 89 $lead = 0xc0; 90 $len = 2; 91 } elsif (($wc & ~0xffff) == 0) { 92 $lead = 0xe0; 93 $len = 3; 94 } elsif ($wc >= 0 && $wc <= 0x10ffff) { 95 $lead = 0xf0; 96 $len = 4; 97 } 98 99 for ($i = $len - 1; $i > 0; $i--) { 100 $ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret; 101 $wc >>= 6; 102 } 103 $ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret; 104 105 return $ret; 106} 107 108sub parse_unidata { 109 my $file = shift; 110 my %data = (); 111 112 open(FIN, $file); 113 my @lines = <FIN>; 114 close(FIN); 115 chomp(@lines); 116 117 foreach my $l (@lines) { 118 my @d = split(/;/, $l, -1); 119 my $mb = wctomb($d[0]); 120 my $cat; 121 122 # XXX There are code points present in UnicodeData.txt 123 # and missing from UTF-8.cm 124 next if !defined $utf8map{$mb}; 125 126 # Define the category 127 if ($d[2] =~ /^Lu/) { 128 $cat = "upper"; 129 } elsif ($d[2] =~ /^Ll/) { 130 $cat = "lower"; 131 } elsif ($d[2] =~ /^Nd/) { 132 $cat = "digit"; 133 } elsif ($d[2] =~ /^L/) { 134 $cat = "alpha"; 135 } elsif ($d[2] =~ /^P/) { 136 $cat = "punct"; 137 } elsif ($d[2] =~ /^M/ || $d[2] =~ /^N/ || $d[2] =~ /^S/) { 138 $cat = "graph"; 139 } elsif ($d[2] =~ /^C/) { 140 $cat = "cntrl"; 141 } elsif ($d[2] =~ /^Z/) { 142 $cat = "space"; 143 } 144 $data{$cat}{$mb}{'wc'} = $d[0]; 145 146 # Check if it's a start or end of range 147 if ($d[1] =~ /First>$/) { 148 $data{$cat}{$mb}{'start'} = 1; 149 } elsif ($d[1] =~ /Last>$/) { 150 $data{$cat}{$mb}{'end'} = 1; 151 } 152 153 # Check if there's upper/lower mapping 154 if ($d[12] ne "") { 155 $data{'toupper'}{$mb} = wctomb($d[12]); 156 } elsif ($d[13] ne "") { 157 $data{'tolower'}{$mb} = wctomb($d[13]); 158 } 159 } 160 161 my $first; 162 my $inrange = 0; 163 164 # Now write out the categories 165 foreach my $cat (sort keys (%data)) { 166 print FOUT "$cat\t"; 167 $first = 1; 168 foreach my $mb (sort keys (%{$data{$cat}})) { 169 if ($first == 1) { 170 $first = 0; 171 } elsif ($inrange == 1) { 172 # Safety belt 173 die "broken range end wc=$data{$cat}{$mb}{'wc'}" 174 if !defined $data{$cat}{$mb}{'end'}; 175 print FOUT ";...;"; 176 $inrange = 0; 177 } else { 178 print FOUT ";/\n\t"; 179 } 180 181 if ($cat eq "tolower" || $cat eq "toupper") { 182 print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})"; 183 } else { 184 if (defined($data{$cat}{$mb}{'start'})) { 185 $inrange = 1; 186 } 187 print FOUT "$utf8map{$mb}"; 188 } 189 } 190 print FOUT "\n"; 191 } 192} 193