1*c87b03e5Sespie#! /usr/bin/perl 2*c87b03e5Sespie 3*c87b03e5Sespie# Copyright (C) 2000, 2001 Free Software Foundation 4*c87b03e5Sespie 5*c87b03e5Sespie# This program is free software; you can redistribute it and/or modify 6*c87b03e5Sespie# it under the terms of the GNU General Public License as published by 7*c87b03e5Sespie# the Free Software Foundation; either version 2, or (at your option) 8*c87b03e5Sespie# any later version. 9*c87b03e5Sespie 10*c87b03e5Sespie# This program is distributed in the hope that it will be useful, 11*c87b03e5Sespie# but WITHOUT ANY WARRANTY; without even the implied warranty of 12*c87b03e5Sespie# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13*c87b03e5Sespie# GNU General Public License for more details. 14*c87b03e5Sespie 15*c87b03e5Sespie# You should have received a copy of the GNU General Public License 16*c87b03e5Sespie# along with this program; if not, write to the Free Software 17*c87b03e5Sespie# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 18*c87b03e5Sespie# 02111-1307, USA. 19*c87b03e5Sespie 20*c87b03e5Sespie# gen-table.pl - Generate tables for gcj from Unicode data. 21*c87b03e5Sespie# Usage: perl gen-table.pl DATA-FILE 22*c87b03e5Sespie# 23*c87b03e5Sespie# You can find the Unicode data file here: 24*c87b03e5Sespie# ftp://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.txt 25*c87b03e5Sespie# Please update this URL when this program is used with a more 26*c87b03e5Sespie# recent version of the table. Note that this table cannot be 27*c87b03e5Sespie# distributed with gcc. 28*c87b03e5Sespie# This program should not be re-run indiscriminately. Care must be 29*c87b03e5Sespie# taken that what it generates is in sync with the Java specification. 30*c87b03e5Sespie 31*c87b03e5Sespie# Names of fields in Unicode data table. 32*c87b03e5Sespie$CODE = 0; 33*c87b03e5Sespie$NAME = 1; 34*c87b03e5Sespie$CATEGORY = 2; 35*c87b03e5Sespie$COMBINING_CLASSES = 3; 36*c87b03e5Sespie$BIDI_CATEGORY = 4; 37*c87b03e5Sespie$DECOMPOSITION = 5; 38*c87b03e5Sespie$DECIMAL_VALUE = 6; 39*c87b03e5Sespie$DIGIT_VALUE = 7; 40*c87b03e5Sespie$NUMERIC_VALUE = 8; 41*c87b03e5Sespie$MIRRORED = 9; 42*c87b03e5Sespie$OLD_NAME = 10; 43*c87b03e5Sespie$COMMENT = 11; 44*c87b03e5Sespie$UPPER = 12; 45*c87b03e5Sespie$LOWER = 13; 46*c87b03e5Sespie$TITLE = 14; 47*c87b03e5Sespie 48*c87b03e5Sespie# Start of special-cased gaps in Unicode data table. 49*c87b03e5Sespie%gaps = ( 50*c87b03e5Sespie 0x4e00 => "CJK", 51*c87b03e5Sespie 0xac00 => "Hangul", 52*c87b03e5Sespie 0xd800 => "Unassigned High Surrogate", 53*c87b03e5Sespie 0xdb80 => "Private Use High Surrogate", 54*c87b03e5Sespie 0xdc00 => "Low Surrogate", 55*c87b03e5Sespie 0xe000 => "Private Use" 56*c87b03e5Sespie ); 57*c87b03e5Sespie 58*c87b03e5Sespie# This lists control characters which are also considered whitespace. 59*c87b03e5Sespie# This is a somewhat odd list, taken from the JCL definition of 60*c87b03e5Sespie# Character.isIdentifierIgnorable. 61*c87b03e5Sespie%whitespace_controls = 62*c87b03e5Sespie ( 63*c87b03e5Sespie 0x0009 => 1, 64*c87b03e5Sespie 0x000a => 1, 65*c87b03e5Sespie 0x000b => 1, 66*c87b03e5Sespie 0x000c => 1, 67*c87b03e5Sespie 0x000d => 1, 68*c87b03e5Sespie 0x001c => 1, 69*c87b03e5Sespie 0x001d => 1, 70*c87b03e5Sespie 0x001e => 1, 71*c87b03e5Sespie 0x001f => 1 72*c87b03e5Sespie ); 73*c87b03e5Sespie 74*c87b03e5Sespieopen (INPUT, "< $ARGV[0]") || exit 1; 75*c87b03e5Sespie 76*c87b03e5Sespie$last_code = -1; 77*c87b03e5Sespiewhile (<INPUT>) 78*c87b03e5Sespie{ 79*c87b03e5Sespie chop; 80*c87b03e5Sespie @fields = split (';', $_, 30); 81*c87b03e5Sespie if ($#fields != 14) 82*c87b03e5Sespie { 83*c87b03e5Sespie print STDERR "Entry for $fields[$CODE] has wrong number of fields\n"; 84*c87b03e5Sespie } 85*c87b03e5Sespie 86*c87b03e5Sespie $code = hex ($fields[$CODE]); 87*c87b03e5Sespie last if $code > 0xffff; 88*c87b03e5Sespie if ($code > $last_code + 1) 89*c87b03e5Sespie { 90*c87b03e5Sespie # Found a gap. 91*c87b03e5Sespie if (defined $gaps{$code}) 92*c87b03e5Sespie { 93*c87b03e5Sespie # Fill the gap with the last character read. 94*c87b03e5Sespie @gfields = @fields; 95*c87b03e5Sespie } 96*c87b03e5Sespie else 97*c87b03e5Sespie { 98*c87b03e5Sespie # The gap represents undefined characters. Only the type 99*c87b03e5Sespie # matters. 100*c87b03e5Sespie @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '', 101*c87b03e5Sespie '', '', '', ''); 102*c87b03e5Sespie } 103*c87b03e5Sespie for (++$last_code; $last_code < $code; ++$last_code) 104*c87b03e5Sespie { 105*c87b03e5Sespie $gfields{$CODE} = sprintf ("%04x", $last_code); 106*c87b03e5Sespie &process_one ($last_code, @gfields); 107*c87b03e5Sespie } 108*c87b03e5Sespie } 109*c87b03e5Sespie &process_one ($code, @fields); 110*c87b03e5Sespie $last_code = $code; 111*c87b03e5Sespie} 112*c87b03e5Sespie 113*c87b03e5Sespieclose (INPUT); 114*c87b03e5Sespie 115*c87b03e5Sespie@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '', 116*c87b03e5Sespie '', '', '', ''); 117*c87b03e5Sespiefor (++$last_code; $last_code < 0x10000; ++$last_code) 118*c87b03e5Sespie{ 119*c87b03e5Sespie $gfields{$CODE} = sprintf ("%04x", $last_code); 120*c87b03e5Sespie &process_one ($last_code, @gfields); 121*c87b03e5Sespie} 122*c87b03e5Sespie--$last_code; # Want last to be 0xFFFF. 123*c87b03e5Sespie 124*c87b03e5Sespie&print_tables ($last_code); 125*c87b03e5Sespie 126*c87b03e5Sespieexit 0; 127*c87b03e5Sespie 128*c87b03e5Sespie# Process a single character. 129*c87b03e5Sespiesub process_one 130*c87b03e5Sespie{ 131*c87b03e5Sespie my ($code, @fields) = @_; 132*c87b03e5Sespie 133*c87b03e5Sespie my $value = ''; 134*c87b03e5Sespie my $type = $fields[$CATEGORY]; 135*c87b03e5Sespie 136*c87b03e5Sespie # See if the character is a valid identifier start. 137*c87b03e5Sespie if ($type =~ /L./ # Letter 138*c87b03e5Sespie || $type eq 'Pc' # Connecting punctuation 139*c87b03e5Sespie || $type eq 'Sc') # Currency symbol 140*c87b03e5Sespie { 141*c87b03e5Sespie $value = 'LETTER_START'; 142*c87b03e5Sespie } 143*c87b03e5Sespie 144*c87b03e5Sespie # See if the character is a valid identifier member. 145*c87b03e5Sespie if ($type =~ /L./ # Letter 146*c87b03e5Sespie || $type eq 'Pc' # Connecting punctuation 147*c87b03e5Sespie || $type eq 'Sc' # Currency symbol 148*c87b03e5Sespie || $type =~ /N[dl]/ # Number: decimal or letter 149*c87b03e5Sespie || $type =~ /M[nc]/ # Mark: non-spacing or combining 150*c87b03e5Sespie || ($type eq 'Cc' # Certain controls 151*c87b03e5Sespie && ! defined $whitespace_controls{$code}) 152*c87b03e5Sespie || ($code >= 0x200c # Join controls 153*c87b03e5Sespie && $code <= 0x200f) 154*c87b03e5Sespie || ($code >= 0x202a # Bidi controls -- note that there 155*c87b03e5Sespie # is a typo in the JCL where these are 156*c87b03e5Sespie # concerned. 157*c87b03e5Sespie && $code <= 0x202e) 158*c87b03e5Sespie || ($code >= 0x206a # Format controls 159*c87b03e5Sespie && $code <= 0x206f) 160*c87b03e5Sespie || $code == 0xfeff) # ZWNBSP 161*c87b03e5Sespie { 162*c87b03e5Sespie if ($value eq '') 163*c87b03e5Sespie { 164*c87b03e5Sespie $value = 'LETTER_PART'; 165*c87b03e5Sespie } 166*c87b03e5Sespie else 167*c87b03e5Sespie { 168*c87b03e5Sespie $value = 'LETTER_PART | ' . $value; 169*c87b03e5Sespie } 170*c87b03e5Sespie } 171*c87b03e5Sespie 172*c87b03e5Sespie if ($value eq '') 173*c87b03e5Sespie { 174*c87b03e5Sespie $value = '0'; 175*c87b03e5Sespie } 176*c87b03e5Sespie else 177*c87b03e5Sespie { 178*c87b03e5Sespie $value = '(' . $value . ')'; 179*c87b03e5Sespie } 180*c87b03e5Sespie 181*c87b03e5Sespie $map[$code] = $value; 182*c87b03e5Sespie} 183*c87b03e5Sespie 184*c87b03e5Sespiesub print_tables 185*c87b03e5Sespie{ 186*c87b03e5Sespie my ($last) = @_; 187*c87b03e5Sespie 188*c87b03e5Sespie local ($bytes_out) = 0; 189*c87b03e5Sespie 190*c87b03e5Sespie open (OUT, "> chartables.h"); 191*c87b03e5Sespie 192*c87b03e5Sespie print OUT "/* This file is automatically generated. DO NOT EDIT!\n"; 193*c87b03e5Sespie print OUT " Instead, edit gen-table.pl and re-run. */\n\n"; 194*c87b03e5Sespie 195*c87b03e5Sespie print OUT "#ifndef GCC_CHARTABLES_H\n"; 196*c87b03e5Sespie print OUT "#define GCC_CHARTABLES_H\n\n"; 197*c87b03e5Sespie 198*c87b03e5Sespie print OUT "#define LETTER_START 1\n"; 199*c87b03e5Sespie print OUT "#define LETTER_PART 2\n\n"; 200*c87b03e5Sespie 201*c87b03e5Sespie for ($count = 0; $count <= $last; $count += 256) 202*c87b03e5Sespie { 203*c87b03e5Sespie $row[$count / 256] = &print_row ($count, '(char *) ', 'const char', 1, 204*c87b03e5Sespie 'page'); 205*c87b03e5Sespie } 206*c87b03e5Sespie 207*c87b03e5Sespie print OUT "static const char *const type_table[256] = {\n"; 208*c87b03e5Sespie for ($count = 0; $count <= $last; $count += 256) 209*c87b03e5Sespie { 210*c87b03e5Sespie print OUT ",\n" if $count > 0; 211*c87b03e5Sespie print OUT " ", $row[$count / 256]; 212*c87b03e5Sespie $bytes_out += 4; 213*c87b03e5Sespie } 214*c87b03e5Sespie print OUT "\n};\n\n"; 215*c87b03e5Sespie 216*c87b03e5Sespie print OUT "#endif /* ! GCC_CHARTABLES_H */\n"; 217*c87b03e5Sespie 218*c87b03e5Sespie close (OUT); 219*c87b03e5Sespie 220*c87b03e5Sespie printf "Generated %d bytes\n", $bytes_out; 221*c87b03e5Sespie} 222*c87b03e5Sespie 223*c87b03e5Sespie# Print a single "row" of a two-level table. 224*c87b03e5Sespiesub print_row 225*c87b03e5Sespie{ 226*c87b03e5Sespie my ($start, $def_pfx, $typname, $typsize, $name) = @_; 227*c87b03e5Sespie 228*c87b03e5Sespie my ($i); 229*c87b03e5Sespie my (@values); 230*c87b03e5Sespie my ($flag) = 1; 231*c87b03e5Sespie my ($off); 232*c87b03e5Sespie for ($off = 0; $off < 256; ++$off) 233*c87b03e5Sespie { 234*c87b03e5Sespie $values[$off] = $map[$off + $start]; 235*c87b03e5Sespie if ($values[$off] ne $values[0]) 236*c87b03e5Sespie { 237*c87b03e5Sespie $flag = 0; 238*c87b03e5Sespie } 239*c87b03e5Sespie } 240*c87b03e5Sespie if ($flag) 241*c87b03e5Sespie { 242*c87b03e5Sespie return $def_pfx . $values[0]; 243*c87b03e5Sespie } 244*c87b03e5Sespie 245*c87b03e5Sespie printf OUT "static %s %s%d[256] = {\n ", $typname, $name, $start / 256; 246*c87b03e5Sespie my ($column) = 2; 247*c87b03e5Sespie for ($i = $start; $i < $start + 256; ++$i) 248*c87b03e5Sespie { 249*c87b03e5Sespie print OUT ", " 250*c87b03e5Sespie if $i > $start; 251*c87b03e5Sespie my ($text) = $values[$i - $start]; 252*c87b03e5Sespie if (length ($text) + $column + 2 > 78) 253*c87b03e5Sespie { 254*c87b03e5Sespie print OUT "\n "; 255*c87b03e5Sespie $column = 2; 256*c87b03e5Sespie } 257*c87b03e5Sespie print OUT $text; 258*c87b03e5Sespie $column += length ($text) + 2; 259*c87b03e5Sespie } 260*c87b03e5Sespie print OUT "\n};\n\n"; 261*c87b03e5Sespie 262*c87b03e5Sespie $bytes_out += 256 * $typsize; 263*c87b03e5Sespie 264*c87b03e5Sespie return sprintf "%s%d", $name, $start / 256; 265*c87b03e5Sespie} 266