1#! /usr/bin/perl 2 3# Copyright (C) 2000, 2001 Free Software Foundation 4 5# This program is free software; you can redistribute it and/or modify 6# it under the terms of the GNU General Public License as published by 7# the Free Software Foundation; either version 2, or (at your option) 8# any later version. 9 10# This program is distributed in the hope that it will be useful, 11# but WITHOUT ANY WARRANTY; without even the implied warranty of 12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13# GNU General Public License for more details. 14 15# You should have received a copy of the GNU General Public License 16# along with this program; if not, write to the Free Software 17# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 18# 02111-1307, USA. 19 20# gen-table.pl - Generate tables for gcj from Unicode data. 21# Usage: perl gen-table.pl DATA-FILE 22# 23# You can find the Unicode data file here: 24# ftp://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.txt 25# Please update this URL when this program is used with a more 26# recent version of the table. Note that this table cannot be 27# distributed with gcc. 28# This program should not be re-run indiscriminately. Care must be 29# taken that what it generates is in sync with the Java specification. 30 31# Names of fields in Unicode data table. 32$CODE = 0; 33$NAME = 1; 34$CATEGORY = 2; 35$COMBINING_CLASSES = 3; 36$BIDI_CATEGORY = 4; 37$DECOMPOSITION = 5; 38$DECIMAL_VALUE = 6; 39$DIGIT_VALUE = 7; 40$NUMERIC_VALUE = 8; 41$MIRRORED = 9; 42$OLD_NAME = 10; 43$COMMENT = 11; 44$UPPER = 12; 45$LOWER = 13; 46$TITLE = 14; 47 48# Start of special-cased gaps in Unicode data table. 49%gaps = ( 50 0x4e00 => "CJK", 51 0xac00 => "Hangul", 52 0xd800 => "Unassigned High Surrogate", 53 0xdb80 => "Private Use High Surrogate", 54 0xdc00 => "Low Surrogate", 55 0xe000 => "Private Use" 56 ); 57 58# This lists control characters which are also considered whitespace. 59# This is a somewhat odd list, taken from the JCL definition of 60# Character.isIdentifierIgnorable. 61%whitespace_controls = 62 ( 63 0x0009 => 1, 64 0x000a => 1, 65 0x000b => 1, 66 0x000c => 1, 67 0x000d => 1, 68 0x001c => 1, 69 0x001d => 1, 70 0x001e => 1, 71 0x001f => 1 72 ); 73 74open (INPUT, "< $ARGV[0]") || exit 1; 75 76$last_code = -1; 77while (<INPUT>) 78{ 79 chop; 80 @fields = split (';', $_, 30); 81 if ($#fields != 14) 82 { 83 print STDERR "Entry for $fields[$CODE] has wrong number of fields\n"; 84 } 85 86 $code = hex ($fields[$CODE]); 87 last if $code > 0xffff; 88 if ($code > $last_code + 1) 89 { 90 # Found a gap. 91 if (defined $gaps{$code}) 92 { 93 # Fill the gap with the last character read. 94 @gfields = @fields; 95 } 96 else 97 { 98 # The gap represents undefined characters. Only the type 99 # matters. 100 @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '', 101 '', '', '', ''); 102 } 103 for (++$last_code; $last_code < $code; ++$last_code) 104 { 105 $gfields{$CODE} = sprintf ("%04x", $last_code); 106 &process_one ($last_code, @gfields); 107 } 108 } 109 &process_one ($code, @fields); 110 $last_code = $code; 111} 112 113close (INPUT); 114 115@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '', 116 '', '', '', ''); 117for (++$last_code; $last_code < 0x10000; ++$last_code) 118{ 119 $gfields{$CODE} = sprintf ("%04x", $last_code); 120 &process_one ($last_code, @gfields); 121} 122--$last_code; # Want last to be 0xFFFF. 123 124&print_tables ($last_code); 125 126exit 0; 127 128# Process a single character. 129sub process_one 130{ 131 my ($code, @fields) = @_; 132 133 my $value = ''; 134 my $type = $fields[$CATEGORY]; 135 136 # See if the character is a valid identifier start. 137 if ($type =~ /L./ # Letter 138 || $type eq 'Pc' # Connecting punctuation 139 || $type eq 'Sc') # Currency symbol 140 { 141 $value = 'LETTER_START'; 142 } 143 144 # See if the character is a valid identifier member. 145 if ($type =~ /L./ # Letter 146 || $type eq 'Pc' # Connecting punctuation 147 || $type eq 'Sc' # Currency symbol 148 || $type =~ /N[dl]/ # Number: decimal or letter 149 || $type =~ /M[nc]/ # Mark: non-spacing or combining 150 || ($type eq 'Cc' # Certain controls 151 && ! defined $whitespace_controls{$code}) 152 || ($code >= 0x200c # Join controls 153 && $code <= 0x200f) 154 || ($code >= 0x202a # Bidi controls -- note that there 155 # is a typo in the JCL where these are 156 # concerned. 157 && $code <= 0x202e) 158 || ($code >= 0x206a # Format controls 159 && $code <= 0x206f) 160 || $code == 0xfeff) # ZWNBSP 161 { 162 if ($value eq '') 163 { 164 $value = 'LETTER_PART'; 165 } 166 else 167 { 168 $value = 'LETTER_PART | ' . $value; 169 } 170 } 171 172 if ($value eq '') 173 { 174 $value = '0'; 175 } 176 else 177 { 178 $value = '(' . $value . ')'; 179 } 180 181 $map[$code] = $value; 182} 183 184sub print_tables 185{ 186 my ($last) = @_; 187 188 local ($bytes_out) = 0; 189 190 open (OUT, "> chartables.h"); 191 192 print OUT "/* This file is automatically generated. DO NOT EDIT!\n"; 193 print OUT " Instead, edit gen-table.pl and re-run. */\n\n"; 194 195 print OUT "#ifndef GCC_CHARTABLES_H\n"; 196 print OUT "#define GCC_CHARTABLES_H\n\n"; 197 198 print OUT "#define LETTER_START 1\n"; 199 print OUT "#define LETTER_PART 2\n\n"; 200 201 for ($count = 0; $count <= $last; $count += 256) 202 { 203 $row[$count / 256] = &print_row ($count, '(char *) ', 'const char', 1, 204 'page'); 205 } 206 207 print OUT "static const char *const type_table[256] = {\n"; 208 for ($count = 0; $count <= $last; $count += 256) 209 { 210 print OUT ",\n" if $count > 0; 211 print OUT " ", $row[$count / 256]; 212 $bytes_out += 4; 213 } 214 print OUT "\n};\n\n"; 215 216 print OUT "#endif /* ! GCC_CHARTABLES_H */\n"; 217 218 close (OUT); 219 220 printf "Generated %d bytes\n", $bytes_out; 221} 222 223# Print a single "row" of a two-level table. 224sub print_row 225{ 226 my ($start, $def_pfx, $typname, $typsize, $name) = @_; 227 228 my ($i); 229 my (@values); 230 my ($flag) = 1; 231 my ($off); 232 for ($off = 0; $off < 256; ++$off) 233 { 234 $values[$off] = $map[$off + $start]; 235 if ($values[$off] ne $values[0]) 236 { 237 $flag = 0; 238 } 239 } 240 if ($flag) 241 { 242 return $def_pfx . $values[0]; 243 } 244 245 printf OUT "static %s %s%d[256] = {\n ", $typname, $name, $start / 256; 246 my ($column) = 2; 247 for ($i = $start; $i < $start + 256; ++$i) 248 { 249 print OUT ", " 250 if $i > $start; 251 my ($text) = $values[$i - $start]; 252 if (length ($text) + $column + 2 > 78) 253 { 254 print OUT "\n "; 255 $column = 2; 256 } 257 print OUT $text; 258 $column += length ($text) + 2; 259 } 260 print OUT "\n};\n\n"; 261 262 $bytes_out += 256 * $typsize; 263 264 return sprintf "%s%d", $name, $start / 256; 265} 266