xref: /openbsd/gnu/usr.bin/gcc/gcc/java/gen-table.pl (revision c87b03e5)
1*c87b03e5Sespie#! /usr/bin/perl
2*c87b03e5Sespie
3*c87b03e5Sespie#    Copyright (C) 2000, 2001 Free Software Foundation
4*c87b03e5Sespie
5*c87b03e5Sespie#    This program is free software; you can redistribute it and/or modify
6*c87b03e5Sespie#    it under the terms of the GNU General Public License as published by
7*c87b03e5Sespie#    the Free Software Foundation; either version 2, or (at your option)
8*c87b03e5Sespie#    any later version.
9*c87b03e5Sespie
10*c87b03e5Sespie#    This program is distributed in the hope that it will be useful,
11*c87b03e5Sespie#    but WITHOUT ANY WARRANTY; without even the implied warranty of
12*c87b03e5Sespie#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13*c87b03e5Sespie#    GNU General Public License for more details.
14*c87b03e5Sespie
15*c87b03e5Sespie#    You should have received a copy of the GNU General Public License
16*c87b03e5Sespie#    along with this program; if not, write to the Free Software
17*c87b03e5Sespie#    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
18*c87b03e5Sespie#    02111-1307, USA.
19*c87b03e5Sespie
20*c87b03e5Sespie# gen-table.pl - Generate tables for gcj from Unicode data.
21*c87b03e5Sespie# Usage: perl gen-table.pl DATA-FILE
22*c87b03e5Sespie#
23*c87b03e5Sespie# You can find the Unicode data file here:
24*c87b03e5Sespie#   ftp://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.txt
25*c87b03e5Sespie# Please update this URL when this program is used with a more
26*c87b03e5Sespie# recent version of the table.  Note that this table cannot be
27*c87b03e5Sespie# distributed with gcc.
28*c87b03e5Sespie# This program should not be re-run indiscriminately.  Care must be
29*c87b03e5Sespie# taken that what it generates is in sync with the Java specification.
30*c87b03e5Sespie
31*c87b03e5Sespie# Names of fields in Unicode data table.
32*c87b03e5Sespie$CODE = 0;
33*c87b03e5Sespie$NAME = 1;
34*c87b03e5Sespie$CATEGORY = 2;
35*c87b03e5Sespie$COMBINING_CLASSES = 3;
36*c87b03e5Sespie$BIDI_CATEGORY = 4;
37*c87b03e5Sespie$DECOMPOSITION = 5;
38*c87b03e5Sespie$DECIMAL_VALUE = 6;
39*c87b03e5Sespie$DIGIT_VALUE = 7;
40*c87b03e5Sespie$NUMERIC_VALUE = 8;
41*c87b03e5Sespie$MIRRORED = 9;
42*c87b03e5Sespie$OLD_NAME = 10;
43*c87b03e5Sespie$COMMENT = 11;
44*c87b03e5Sespie$UPPER = 12;
45*c87b03e5Sespie$LOWER = 13;
46*c87b03e5Sespie$TITLE = 14;
47*c87b03e5Sespie
48*c87b03e5Sespie# Start of special-cased gaps in Unicode data table.
49*c87b03e5Sespie%gaps = (
50*c87b03e5Sespie	 0x4e00 => "CJK",
51*c87b03e5Sespie	 0xac00 => "Hangul",
52*c87b03e5Sespie	 0xd800 => "Unassigned High Surrogate",
53*c87b03e5Sespie	 0xdb80 => "Private Use High Surrogate",
54*c87b03e5Sespie	 0xdc00 => "Low Surrogate",
55*c87b03e5Sespie	 0xe000 => "Private Use"
56*c87b03e5Sespie	 );
57*c87b03e5Sespie
58*c87b03e5Sespie# This lists control characters which are also considered whitespace.
59*c87b03e5Sespie# This is a somewhat odd list, taken from the JCL definition of
60*c87b03e5Sespie# Character.isIdentifierIgnorable.
61*c87b03e5Sespie%whitespace_controls =
62*c87b03e5Sespie    (
63*c87b03e5Sespie     0x0009 => 1,
64*c87b03e5Sespie     0x000a => 1,
65*c87b03e5Sespie     0x000b => 1,
66*c87b03e5Sespie     0x000c => 1,
67*c87b03e5Sespie     0x000d => 1,
68*c87b03e5Sespie     0x001c => 1,
69*c87b03e5Sespie     0x001d => 1,
70*c87b03e5Sespie     0x001e => 1,
71*c87b03e5Sespie     0x001f => 1
72*c87b03e5Sespie     );
73*c87b03e5Sespie
74*c87b03e5Sespieopen (INPUT, "< $ARGV[0]") || exit 1;
75*c87b03e5Sespie
76*c87b03e5Sespie$last_code = -1;
77*c87b03e5Sespiewhile (<INPUT>)
78*c87b03e5Sespie{
79*c87b03e5Sespie    chop;
80*c87b03e5Sespie    @fields = split (';', $_, 30);
81*c87b03e5Sespie    if ($#fields != 14)
82*c87b03e5Sespie    {
83*c87b03e5Sespie	print STDERR "Entry for $fields[$CODE] has wrong number of fields\n";
84*c87b03e5Sespie    }
85*c87b03e5Sespie
86*c87b03e5Sespie    $code = hex ($fields[$CODE]);
87*c87b03e5Sespie    last if $code > 0xffff;
88*c87b03e5Sespie    if ($code > $last_code + 1)
89*c87b03e5Sespie    {
90*c87b03e5Sespie	# Found a gap.
91*c87b03e5Sespie	if (defined $gaps{$code})
92*c87b03e5Sespie	{
93*c87b03e5Sespie	    # Fill the gap with the last character read.
94*c87b03e5Sespie	    @gfields = @fields;
95*c87b03e5Sespie	}
96*c87b03e5Sespie	else
97*c87b03e5Sespie	{
98*c87b03e5Sespie	    # The gap represents undefined characters.  Only the type
99*c87b03e5Sespie	    # matters.
100*c87b03e5Sespie	    @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
101*c87b03e5Sespie			'', '', '', '');
102*c87b03e5Sespie	}
103*c87b03e5Sespie	for (++$last_code; $last_code < $code; ++$last_code)
104*c87b03e5Sespie	{
105*c87b03e5Sespie	    $gfields{$CODE} = sprintf ("%04x", $last_code);
106*c87b03e5Sespie	    &process_one ($last_code, @gfields);
107*c87b03e5Sespie	}
108*c87b03e5Sespie    }
109*c87b03e5Sespie    &process_one ($code, @fields);
110*c87b03e5Sespie    $last_code = $code;
111*c87b03e5Sespie}
112*c87b03e5Sespie
113*c87b03e5Sespieclose (INPUT);
114*c87b03e5Sespie
115*c87b03e5Sespie@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
116*c87b03e5Sespie	    '', '', '', '');
117*c87b03e5Sespiefor (++$last_code; $last_code < 0x10000; ++$last_code)
118*c87b03e5Sespie{
119*c87b03e5Sespie    $gfields{$CODE} = sprintf ("%04x", $last_code);
120*c87b03e5Sespie    &process_one ($last_code, @gfields);
121*c87b03e5Sespie}
122*c87b03e5Sespie--$last_code;			# Want last to be 0xFFFF.
123*c87b03e5Sespie
124*c87b03e5Sespie&print_tables ($last_code);
125*c87b03e5Sespie
126*c87b03e5Sespieexit 0;
127*c87b03e5Sespie
128*c87b03e5Sespie# Process a single character.
129*c87b03e5Sespiesub process_one
130*c87b03e5Sespie{
131*c87b03e5Sespie    my ($code, @fields) = @_;
132*c87b03e5Sespie
133*c87b03e5Sespie    my $value = '';
134*c87b03e5Sespie    my $type = $fields[$CATEGORY];
135*c87b03e5Sespie
136*c87b03e5Sespie    # See if the character is a valid identifier start.
137*c87b03e5Sespie    if ($type =~ /L./		# Letter
138*c87b03e5Sespie	|| $type eq 'Pc'	# Connecting punctuation
139*c87b03e5Sespie	|| $type eq 'Sc')	# Currency symbol
140*c87b03e5Sespie    {
141*c87b03e5Sespie	$value = 'LETTER_START';
142*c87b03e5Sespie    }
143*c87b03e5Sespie
144*c87b03e5Sespie    # See if the character is a valid identifier member.
145*c87b03e5Sespie    if ($type =~ /L./		# Letter
146*c87b03e5Sespie	|| $type eq 'Pc'	# Connecting punctuation
147*c87b03e5Sespie	|| $type eq 'Sc'	# Currency symbol
148*c87b03e5Sespie	|| $type =~ /N[dl]/	# Number: decimal or letter
149*c87b03e5Sespie	|| $type =~ /M[nc]/	# Mark: non-spacing or combining
150*c87b03e5Sespie	|| ($type eq 'Cc'	# Certain controls
151*c87b03e5Sespie	    && ! defined $whitespace_controls{$code})
152*c87b03e5Sespie	|| ($code >= 0x200c	# Join controls
153*c87b03e5Sespie	    && $code <= 0x200f)
154*c87b03e5Sespie	|| ($code >= 0x202a	# Bidi controls -- note that there
155*c87b03e5Sespie				# is a typo in the JCL where these are
156*c87b03e5Sespie				# concerned.
157*c87b03e5Sespie	    && $code <= 0x202e)
158*c87b03e5Sespie	|| ($code >= 0x206a	# Format controls
159*c87b03e5Sespie	    && $code <= 0x206f)
160*c87b03e5Sespie	|| $code == 0xfeff)	# ZWNBSP
161*c87b03e5Sespie    {
162*c87b03e5Sespie	if ($value eq '')
163*c87b03e5Sespie	{
164*c87b03e5Sespie	    $value = 'LETTER_PART';
165*c87b03e5Sespie	}
166*c87b03e5Sespie	else
167*c87b03e5Sespie	{
168*c87b03e5Sespie	    $value = 'LETTER_PART | ' . $value;
169*c87b03e5Sespie	}
170*c87b03e5Sespie    }
171*c87b03e5Sespie
172*c87b03e5Sespie    if ($value eq '')
173*c87b03e5Sespie    {
174*c87b03e5Sespie	$value = '0';
175*c87b03e5Sespie    }
176*c87b03e5Sespie    else
177*c87b03e5Sespie    {
178*c87b03e5Sespie	$value = '(' . $value . ')';
179*c87b03e5Sespie    }
180*c87b03e5Sespie
181*c87b03e5Sespie    $map[$code] = $value;
182*c87b03e5Sespie}
183*c87b03e5Sespie
184*c87b03e5Sespiesub print_tables
185*c87b03e5Sespie{
186*c87b03e5Sespie    my ($last) = @_;
187*c87b03e5Sespie
188*c87b03e5Sespie    local ($bytes_out) = 0;
189*c87b03e5Sespie
190*c87b03e5Sespie    open (OUT, "> chartables.h");
191*c87b03e5Sespie
192*c87b03e5Sespie    print OUT "/* This file is automatically generated.  DO NOT EDIT!\n";
193*c87b03e5Sespie    print OUT "   Instead, edit gen-table.pl and re-run.  */\n\n";
194*c87b03e5Sespie
195*c87b03e5Sespie    print OUT "#ifndef GCC_CHARTABLES_H\n";
196*c87b03e5Sespie    print OUT "#define GCC_CHARTABLES_H\n\n";
197*c87b03e5Sespie
198*c87b03e5Sespie    print OUT "#define LETTER_START 1\n";
199*c87b03e5Sespie    print OUT "#define LETTER_PART  2\n\n";
200*c87b03e5Sespie
201*c87b03e5Sespie    for ($count = 0; $count <= $last; $count += 256)
202*c87b03e5Sespie    {
203*c87b03e5Sespie	$row[$count / 256] = &print_row ($count, '(char *) ', 'const char', 1,
204*c87b03e5Sespie					 'page');
205*c87b03e5Sespie    }
206*c87b03e5Sespie
207*c87b03e5Sespie    print OUT "static const char *const type_table[256] = {\n";
208*c87b03e5Sespie    for ($count = 0; $count <= $last; $count += 256)
209*c87b03e5Sespie    {
210*c87b03e5Sespie	print OUT ",\n" if $count > 0;
211*c87b03e5Sespie	print OUT "  ", $row[$count / 256];
212*c87b03e5Sespie	$bytes_out += 4;
213*c87b03e5Sespie    }
214*c87b03e5Sespie    print OUT "\n};\n\n";
215*c87b03e5Sespie
216*c87b03e5Sespie    print OUT "#endif /* ! GCC_CHARTABLES_H */\n";
217*c87b03e5Sespie
218*c87b03e5Sespie    close (OUT);
219*c87b03e5Sespie
220*c87b03e5Sespie    printf "Generated %d bytes\n", $bytes_out;
221*c87b03e5Sespie}
222*c87b03e5Sespie
223*c87b03e5Sespie# Print a single "row" of a two-level table.
224*c87b03e5Sespiesub print_row
225*c87b03e5Sespie{
226*c87b03e5Sespie    my ($start, $def_pfx, $typname, $typsize, $name) = @_;
227*c87b03e5Sespie
228*c87b03e5Sespie    my ($i);
229*c87b03e5Sespie    my (@values);
230*c87b03e5Sespie    my ($flag) = 1;
231*c87b03e5Sespie    my ($off);
232*c87b03e5Sespie    for ($off = 0; $off < 256; ++$off)
233*c87b03e5Sespie    {
234*c87b03e5Sespie	$values[$off] = $map[$off + $start];
235*c87b03e5Sespie	if ($values[$off] ne $values[0])
236*c87b03e5Sespie	{
237*c87b03e5Sespie	    $flag = 0;
238*c87b03e5Sespie	}
239*c87b03e5Sespie    }
240*c87b03e5Sespie    if ($flag)
241*c87b03e5Sespie    {
242*c87b03e5Sespie	return $def_pfx . $values[0];
243*c87b03e5Sespie    }
244*c87b03e5Sespie
245*c87b03e5Sespie    printf OUT "static %s %s%d[256] = {\n  ", $typname, $name, $start / 256;
246*c87b03e5Sespie    my ($column) = 2;
247*c87b03e5Sespie    for ($i = $start; $i < $start + 256; ++$i)
248*c87b03e5Sespie    {
249*c87b03e5Sespie	print OUT ", "
250*c87b03e5Sespie	    if $i > $start;
251*c87b03e5Sespie	my ($text) = $values[$i - $start];
252*c87b03e5Sespie	if (length ($text) + $column + 2 > 78)
253*c87b03e5Sespie	{
254*c87b03e5Sespie	    print OUT "\n  ";
255*c87b03e5Sespie	    $column = 2;
256*c87b03e5Sespie	}
257*c87b03e5Sespie	print OUT $text;
258*c87b03e5Sespie	$column += length ($text) + 2;
259*c87b03e5Sespie    }
260*c87b03e5Sespie    print OUT "\n};\n\n";
261*c87b03e5Sespie
262*c87b03e5Sespie    $bytes_out += 256 * $typsize;
263*c87b03e5Sespie
264*c87b03e5Sespie    return sprintf "%s%d", $name, $start / 256;
265*c87b03e5Sespie}
266