xref: /openbsd/gnu/usr.bin/gcc/gcc/java/gen-table.pl (revision c87b03e5)
1#! /usr/bin/perl
2
3#    Copyright (C) 2000, 2001 Free Software Foundation
4
5#    This program is free software; you can redistribute it and/or modify
6#    it under the terms of the GNU General Public License as published by
7#    the Free Software Foundation; either version 2, or (at your option)
8#    any later version.
9
10#    This program is distributed in the hope that it will be useful,
11#    but WITHOUT ANY WARRANTY; without even the implied warranty of
12#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13#    GNU General Public License for more details.
14
15#    You should have received a copy of the GNU General Public License
16#    along with this program; if not, write to the Free Software
17#    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
18#    02111-1307, USA.
19
20# gen-table.pl - Generate tables for gcj from Unicode data.
21# Usage: perl gen-table.pl DATA-FILE
22#
23# You can find the Unicode data file here:
24#   ftp://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.txt
25# Please update this URL when this program is used with a more
26# recent version of the table.  Note that this table cannot be
27# distributed with gcc.
28# This program should not be re-run indiscriminately.  Care must be
29# taken that what it generates is in sync with the Java specification.
30
31# Names of fields in Unicode data table.
32$CODE = 0;
33$NAME = 1;
34$CATEGORY = 2;
35$COMBINING_CLASSES = 3;
36$BIDI_CATEGORY = 4;
37$DECOMPOSITION = 5;
38$DECIMAL_VALUE = 6;
39$DIGIT_VALUE = 7;
40$NUMERIC_VALUE = 8;
41$MIRRORED = 9;
42$OLD_NAME = 10;
43$COMMENT = 11;
44$UPPER = 12;
45$LOWER = 13;
46$TITLE = 14;
47
48# Start of special-cased gaps in Unicode data table.
49%gaps = (
50	 0x4e00 => "CJK",
51	 0xac00 => "Hangul",
52	 0xd800 => "Unassigned High Surrogate",
53	 0xdb80 => "Private Use High Surrogate",
54	 0xdc00 => "Low Surrogate",
55	 0xe000 => "Private Use"
56	 );
57
58# This lists control characters which are also considered whitespace.
59# This is a somewhat odd list, taken from the JCL definition of
60# Character.isIdentifierIgnorable.
61%whitespace_controls =
62    (
63     0x0009 => 1,
64     0x000a => 1,
65     0x000b => 1,
66     0x000c => 1,
67     0x000d => 1,
68     0x001c => 1,
69     0x001d => 1,
70     0x001e => 1,
71     0x001f => 1
72     );
73
74open (INPUT, "< $ARGV[0]") || exit 1;
75
76$last_code = -1;
77while (<INPUT>)
78{
79    chop;
80    @fields = split (';', $_, 30);
81    if ($#fields != 14)
82    {
83	print STDERR "Entry for $fields[$CODE] has wrong number of fields\n";
84    }
85
86    $code = hex ($fields[$CODE]);
87    last if $code > 0xffff;
88    if ($code > $last_code + 1)
89    {
90	# Found a gap.
91	if (defined $gaps{$code})
92	{
93	    # Fill the gap with the last character read.
94	    @gfields = @fields;
95	}
96	else
97	{
98	    # The gap represents undefined characters.  Only the type
99	    # matters.
100	    @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
101			'', '', '', '');
102	}
103	for (++$last_code; $last_code < $code; ++$last_code)
104	{
105	    $gfields{$CODE} = sprintf ("%04x", $last_code);
106	    &process_one ($last_code, @gfields);
107	}
108    }
109    &process_one ($code, @fields);
110    $last_code = $code;
111}
112
113close (INPUT);
114
115@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
116	    '', '', '', '');
117for (++$last_code; $last_code < 0x10000; ++$last_code)
118{
119    $gfields{$CODE} = sprintf ("%04x", $last_code);
120    &process_one ($last_code, @gfields);
121}
122--$last_code;			# Want last to be 0xFFFF.
123
124&print_tables ($last_code);
125
126exit 0;
127
128# Process a single character.
129sub process_one
130{
131    my ($code, @fields) = @_;
132
133    my $value = '';
134    my $type = $fields[$CATEGORY];
135
136    # See if the character is a valid identifier start.
137    if ($type =~ /L./		# Letter
138	|| $type eq 'Pc'	# Connecting punctuation
139	|| $type eq 'Sc')	# Currency symbol
140    {
141	$value = 'LETTER_START';
142    }
143
144    # See if the character is a valid identifier member.
145    if ($type =~ /L./		# Letter
146	|| $type eq 'Pc'	# Connecting punctuation
147	|| $type eq 'Sc'	# Currency symbol
148	|| $type =~ /N[dl]/	# Number: decimal or letter
149	|| $type =~ /M[nc]/	# Mark: non-spacing or combining
150	|| ($type eq 'Cc'	# Certain controls
151	    && ! defined $whitespace_controls{$code})
152	|| ($code >= 0x200c	# Join controls
153	    && $code <= 0x200f)
154	|| ($code >= 0x202a	# Bidi controls -- note that there
155				# is a typo in the JCL where these are
156				# concerned.
157	    && $code <= 0x202e)
158	|| ($code >= 0x206a	# Format controls
159	    && $code <= 0x206f)
160	|| $code == 0xfeff)	# ZWNBSP
161    {
162	if ($value eq '')
163	{
164	    $value = 'LETTER_PART';
165	}
166	else
167	{
168	    $value = 'LETTER_PART | ' . $value;
169	}
170    }
171
172    if ($value eq '')
173    {
174	$value = '0';
175    }
176    else
177    {
178	$value = '(' . $value . ')';
179    }
180
181    $map[$code] = $value;
182}
183
184sub print_tables
185{
186    my ($last) = @_;
187
188    local ($bytes_out) = 0;
189
190    open (OUT, "> chartables.h");
191
192    print OUT "/* This file is automatically generated.  DO NOT EDIT!\n";
193    print OUT "   Instead, edit gen-table.pl and re-run.  */\n\n";
194
195    print OUT "#ifndef GCC_CHARTABLES_H\n";
196    print OUT "#define GCC_CHARTABLES_H\n\n";
197
198    print OUT "#define LETTER_START 1\n";
199    print OUT "#define LETTER_PART  2\n\n";
200
201    for ($count = 0; $count <= $last; $count += 256)
202    {
203	$row[$count / 256] = &print_row ($count, '(char *) ', 'const char', 1,
204					 'page');
205    }
206
207    print OUT "static const char *const type_table[256] = {\n";
208    for ($count = 0; $count <= $last; $count += 256)
209    {
210	print OUT ",\n" if $count > 0;
211	print OUT "  ", $row[$count / 256];
212	$bytes_out += 4;
213    }
214    print OUT "\n};\n\n";
215
216    print OUT "#endif /* ! GCC_CHARTABLES_H */\n";
217
218    close (OUT);
219
220    printf "Generated %d bytes\n", $bytes_out;
221}
222
223# Print a single "row" of a two-level table.
224sub print_row
225{
226    my ($start, $def_pfx, $typname, $typsize, $name) = @_;
227
228    my ($i);
229    my (@values);
230    my ($flag) = 1;
231    my ($off);
232    for ($off = 0; $off < 256; ++$off)
233    {
234	$values[$off] = $map[$off + $start];
235	if ($values[$off] ne $values[0])
236	{
237	    $flag = 0;
238	}
239    }
240    if ($flag)
241    {
242	return $def_pfx . $values[0];
243    }
244
245    printf OUT "static %s %s%d[256] = {\n  ", $typname, $name, $start / 256;
246    my ($column) = 2;
247    for ($i = $start; $i < $start + 256; ++$i)
248    {
249	print OUT ", "
250	    if $i > $start;
251	my ($text) = $values[$i - $start];
252	if (length ($text) + $column + 2 > 78)
253	{
254	    print OUT "\n  ";
255	    $column = 2;
256	}
257	print OUT $text;
258	$column += length ($text) + 2;
259    }
260    print OUT "\n};\n\n";
261
262    $bytes_out += 256 * $typsize;
263
264    return sprintf "%s%d", $name, $start / 256;
265}
266