1 // $Id: gencode.java,v 1.14 2004/02/17 12:58:45 ericb Exp $
2 //
3 // This software is subject to the terms of the IBM Jikes Compiler
4 // License Agreement available at the following URL:
5 // http://www.ibm.com/research/jikes.
6 // Copyright (C) 1999, 2004 IBM Corporation and others.  All Rights Reserved.
7 // You must accept the terms of that agreement to use this software.
8 //
9 
10 import java.io.*;
11 import java.util.*;
12 
13 /**
14  * This helper class generates code.h and code.cpp, the lookup table used
15  * by Jikes in deciding how to categorize Unicode input characters.  This
16  * is written in Java in order to accurately track the Unicode rules as
17  * implemented by the latest version of java.lang.Character.
18  */
19 class gencode
20 {
21     static final int SPACE_CODE    = 0; // Character.isWhitespace
22     static final int BAD_CODE      = 1; // everything else ...
23     static final int DIGIT_CODE    = 2; // Character.isDigit
24     static final int ID_PART_CODE  = 3; // Character.isJavaIdentifierPart
25     static final int LOWER_CODE    = 4; // Character.isLowerCase
26     static final int UPPER_CODE    = 5; // Character.isUpperCase
27     static final int ID_START_CODE = 6; // Character.isJavaIdentifierStart
28     static final int MAX_CODEPOINT = 0x10ffff; // Maximum Unicode.
29 
30     static final String[] CODE_NAMES = {
31         "SPACE_CODE", "BAD_CODE", "DIGIT_CODE", "ID_PART_CODE", "LOWER_CODE",
32         "UPPER_CODE", "ID_START_CODE"
33     };
34 
main(String args[])35     static public void main(String args[])
36         throws FileNotFoundException, IOException
37     {
38         System.out.println("Gathering data...");
39         // Use char[] for ease in building strings, despite only using 8 bits.
40         int numElements = MAX_CODEPOINT + 1;
41         char[] info = new char[numElements];
42         for (int i = 0; i < info.length; i++)
43         {
44             if (Character.isWhitespace(i))
45                 info[i] = SPACE_CODE;
46             else if (Character.isLowerCase(i))
47             {
48                 assert Character.isJavaIdentifierStart(i);
49                 info[i] = LOWER_CODE;
50             }
51             else if (Character.isUpperCase(i))
52             {
53                 assert Character.isJavaIdentifierStart(i);
54                 info[i] = UPPER_CODE;
55             }
56             else if (Character.isDigit(i))
57             {
58                 assert Character.isJavaIdentifierPart(i);
59                 info[i] = DIGIT_CODE;
60             }
61             else if (Character.isJavaIdentifierStart(i))
62                 info[i] = ID_START_CODE;
63             else if (Character.isJavaIdentifierPart(i))
64                 info[i] = ID_PART_CODE;
65             else
66             {
67                 info[i] = BAD_CODE;
68                 numElements--;
69             }
70         }
71 
72         System.out.println("Compressing tables...");
73         int bestShift = 0;
74         int bestEst = info.length;
75         String bestBlkStr = null;
76 
77         for (int i = 3; i < 12; i++)
78         {
79             int blkSize = 1 << i;
80             Map blocks = new HashMap();
81             List blkArray = new ArrayList();
82             System.out.print("shift: " + i);
83 
84             for (int j = 0; j < info.length; j += blkSize)
85             {
86                 String key = new String(info, j, blkSize);
87                 if (blocks.get(key) == null)
88                 {
89                     blkArray.add(key);
90                     blocks.put(key, new Integer(blkArray.size()));
91                 }
92             }
93             int blkNum = blkArray.size();
94             int blockLen = blkNum * blkSize;
95             System.out.print(" before " + blockLen);
96 
97             //
98             // Try to pack blkArray, by finding successively smaller matches
99             // between heads and tails of blocks.
100             //
101             for (int j = blkSize - 1; j > 0; j--)
102             {
103                 Map tails = new HashMap();
104                 for (int k = 0; k < blkArray.size(); k++)
105                 {
106                     String str = (String) blkArray.get(k);
107                     if (str == null)
108                         continue;
109                     String tail = str.substring(str.length() - j);
110                     List l = (List) tails.get(tail);
111                     if (l == null)
112                         tails.put(tail,
113                                   new LinkedList(Collections
114                                                  .singleton(new Integer(k))));
115                     else l.add(new Integer(k));
116                 }
117 
118                 //
119                 // Now calculate the heads, and merge overlapping blocks
120                 //
121             block:
122                 for (int k = 0; k < blkArray.size(); k++)
123                 {
124                     String tomerge = (String) blkArray.get(k);
125                     if (tomerge == null)
126                         continue;
127                     while (true)
128                     {
129                         String head = tomerge.substring(0, j);
130                         LinkedList entry = (LinkedList) tails.get(head);
131                         if (entry == null)
132                             continue block;
133                         Integer other = (Integer) entry.removeFirst();
134                         if (other.intValue() == k)
135                         {
136                             if (entry.size() > 0)
137                             {
138                                 entry.add(other);
139                                 other = (Integer) entry.removeFirst();
140                             }
141                             else
142                             {
143                                 entry.add(other);
144                                 continue block;
145                             }
146                         }
147                         if (entry.size() == 0)
148                             tails.remove(head);
149 
150                         //
151                         // A match was found.
152                         //
153                         String merge = blkArray.get(other.intValue()) +
154                             tomerge.substring(j);
155                         blockLen -= j;
156                         blkNum--;
157                         if (other.intValue() < k)
158                         {
159                             blkArray.set(k, null);
160                             blkArray.set(other.intValue(), merge);
161                             String tail = merge.substring(merge.length() - j);
162                             List l = (List) tails.get(tail);
163                             Collections.replaceAll(l, new Integer(k), other);
164                             continue block;
165                         }
166                         blkArray.set(k, merge);
167                         blkArray.set(other.intValue(), null);
168                         tomerge = merge;
169                     }
170                 }
171             }
172             StringBuffer blockStr = new StringBuffer(blockLen);
173             for (int k = 0; k < blkArray.size(); k++)
174             {
175                 String str = (String) blkArray.get(k);
176                 if (str != null)
177                     blockStr.append(str);
178             }
179             assert blockStr.length() == blockLen :
180                 "Unexpected blockLen " + blockLen;
181             int estimate = blockLen + (info.length >> (i - 1));
182             System.out.println(" after merge " + blockLen + ": " + estimate +
183                                " bytes");
184             if (estimate < bestEst)
185             {
186                 bestEst = estimate;
187                 bestShift = i;
188                 bestBlkStr = blockStr.toString();
189             }
190         }
191 
192         System.out.println("Generating code.h with shift of " + bestShift);
193         int blkSize = 1 << bestShift;
194         char[] blocks = new char[info.length >> bestShift];
195         for (int j = 0; j < info.length; j += blkSize)
196         {
197             String key = new String(info, j, blkSize);
198             int index = bestBlkStr.indexOf(key);
199             assert index != -1 : "Unexpected index for " + j;
200             blocks[j >> bestShift] = (char) (index - j);
201         }
202 
203         //
204         // Process the code.h file
205         //
206         PrintStream hfile = new PrintStream(new FileOutputStream("code.h"));
207         printHeader(hfile, new String[] {"\"platform.h\""});
208         hfile.println("#ifndef code_INCLUDED");
209         hfile.println("#define code_INCLUDED");
210         hfile.println();
211         hfile.println("class Code");
212         hfile.println("{");
213         hfile.println("    //");
214         hfile.println("    // To facilitate the scanning, the character set is partitioned into");
215         hfile.println("    // categories using the array CODE. These are described below together");
216         hfile.println("    // with some self-explanatory functions defined on CODE.");
217         hfile.println("    //");
218         hfile.println("    enum {");
219         hfile.println("        SHIFT = " + bestShift + ",");
220         hfile.println("        SPACE_CODE = " + SPACE_CODE + ',');
221         hfile.println("        BAD_CODE = " + BAD_CODE + ',');
222         hfile.println("        DIGIT_CODE = " + DIGIT_CODE + ',');
223         hfile.println("        ID_PART_CODE = " + ID_PART_CODE + ',');
224         hfile.println("        LOWER_CODE = " + LOWER_CODE + ',');
225         hfile.println("        UPPER_CODE = " + UPPER_CODE + ',');
226         hfile.println("        ID_START_CODE = " + ID_START_CODE);
227         hfile.println("    };");
228         hfile.println();
229         hfile.println("    static char codes[" + bestBlkStr.length() + "];");
230         hfile.println("    static u2 blocks[" +  blocks.length + "];");
231         hfile.println();
232         hfile.println();
233         hfile.println("public:");
234         hfile.println("#ifdef JIKES_DEBUG");
235         hfile.println("    static inline void CodeCheck(u4 c)");
236         hfile.println("    {");
237         hfile.println("        assert((u2) (blocks[c >> SHIFT] + c) < " +
238                       bestBlkStr.length() + ");");
239         hfile.println("    }");
240         hfile.println();
241         hfile.println("    static inline bool CodeCheck(void)");
242         hfile.println("    {");
243         hfile.println("        for (u4 c = 0; c <= " + MAX_CODEPOINT +
244                       "; c++)");
245         hfile.println("            CodeCheck(c);");
246         hfile.println("        return true;");
247         hfile.println("    }");
248         hfile.println("#endif // JIKES_DEBUG");
249         hfile.println();
250         hfile.println("//");
251         hfile.println("// These methods test for Unicode surrogate pairs.");
252         hfile.println("//");
253         hfile.println("    static inline bool IsHighSurrogate(wchar_t c)");
254         hfile.println("    {");
255         hfile.println("        return c >= 0xd800 && c <= 0xdbff;");
256         hfile.println("    }");
257         hfile.println("    static inline bool IsLowSurrogate(wchar_t c)");
258         hfile.println("    {");
259         hfile.println("        return c >= 0xdc00 && c <= 0xdfff;");
260         hfile.println("    }");
261         hfile.println();
262         hfile.println("    static inline u4 Codepoint(wchar_t hi, wchar_t lo)");
263         hfile.println("    {");
264         hfile.println("        assert(IsHighSurrogate(hi) && IsLowSurrogate(lo));");
265         hfile.println("        return (hi << 10) + lo + (0x10000 - (0xd800 << 10) - 0xdc00);");
266         hfile.println("    }");
267         hfile.println("    static inline u4 Codepoint(const wchar_t* p)");
268         hfile.println("    {");
269         hfile.println("        u4 result = (u4) *p;");
270         hfile.println("        if (IsHighSurrogate(result) && IsLowSurrogate(p[1]))");
271         hfile.println("            result = Codepoint(result, p[1]);");
272         hfile.println("        return result;");
273         hfile.println("    }");
274         hfile.println("    static inline int Codelength(const wchar_t* p)");
275         hfile.println("    {");
276         hfile.println("        return (IsHighSurrogate(*p) && IsLowSurrogate(p[1])) ? 2 : 1;");
277         hfile.println("    }");
278         hfile.println();
279         hfile.println("//");
280         hfile.println("// These methods test for ASCII characteristics. Since it is strictly ASCII,");
281         hfile.println("// there is no need to check for Unicode surrogate pairs.");
282         hfile.println("//");
283         hfile.println("    static inline bool IsNewline(wchar_t c)");
284         hfile.println("    {");
285         hfile.println("        return c == U_LF || c == U_CR;");
286         hfile.println("    }");
287         hfile.println("    static inline bool IsSpaceButNotNewline(wchar_t c)");
288         hfile.println("    {");
289         hfile.println("        return c == U_SP || c == U_FF || c == U_HT;");
290         hfile.println("    }");
291         hfile.println("    static inline bool IsSpace(wchar_t c)");
292         hfile.println("    {");
293         hfile.println("        return c == U_SP || c == U_CR || c == U_LF ||");
294         hfile.println("            c == U_HT || c == U_FF;");
295         hfile.println("    }");
296         hfile.println();
297         hfile.println("    static inline bool IsDecimalDigit(wchar_t c)");
298         hfile.println("    {");
299         hfile.println("        return c <= U_9 && c >= U_0;");
300         hfile.println("    }");
301         hfile.println("    static inline bool IsOctalDigit(wchar_t c)");
302         hfile.println("    {");
303         hfile.println("        return c <= U_7 && c >= U_0;");
304         hfile.println("    }");
305         hfile.println("    static inline bool IsHexDigit(wchar_t c)");
306         hfile.println("    {");
307         hfile.println("        return c <= U_f && (c >= U_a ||");
308         hfile.println("                            (c >= U_A && c <= U_F) ||");
309         hfile.println("                            (c >= U_0 && c <= U_9));");
310         hfile.println("    }");
311         hfile.println("    static inline int Value(wchar_t c)");
312         hfile.println("    {");
313         hfile.println("        assert(IsHexDigit(c));");
314         hfile.println("        return c - (c <= U_9 ? U_0 : c < U_a ? U_A - 10 : U_a - 10);");
315         hfile.println("    }");
316         hfile.println("    static inline bool IsSign(wchar_t c)");
317         hfile.println("    {");
318         hfile.println("        return c == U_MINUS || c == U_PLUS;");
319         hfile.println("    }");
320         hfile.println();
321         hfile.println("    static inline bool IsAsciiUpper(wchar_t c)");
322         hfile.println("    {");
323         hfile.println("        return c <= U_Z && c >= U_A;");
324         hfile.println("    }");
325         hfile.println("    static inline bool IsAsciiLower(wchar_t c)");
326         hfile.println("    {");
327         hfile.println("        return c <= U_z && c >= U_a;");
328         hfile.println("    }");
329         hfile.println();
330         hfile.println("//");
331         hfile.println("// The following methods recognize Unicode surrogate pairs, hence the need to");
332         hfile.println("// pass a pointer. Use Codelength() to determine if one or two characters");
333         hfile.println("// were used in the formation of a character.");
334         hfile.println("//");
335         printMethod(hfile, "IsWhitespace", "SPACE_CODE", "==");
336         printMethod(hfile, "IsDigit", "DIGIT_CODE", "==");
337         printMethod(hfile, "IsUpper", "UPPER_CODE", "==");
338         printMethod(hfile, "IsLower", "LOWER_CODE", "==");
339         printMethod(hfile, "IsAlpha", "LOWER_CODE", ">=");
340         printMethod(hfile, "IsAlnum", "DIGIT_CODE", ">=");
341         hfile.println("};");
342         hfile.println();
343         hfile.println("#endif // code_INCLUDED");
344         printFooter(hfile);
345         hfile.close();
346 
347         //
348         // Process the code.cpp file
349         //
350         System.out.println("Generating code.cpp");
351         PrintStream cfile = new PrintStream(new FileOutputStream("code.cpp"));
352         printHeader(cfile, new String[] {"\"code.h\""});
353         cfile.println("char Code::codes[" + bestBlkStr.length() + "] =");
354         cfile.println("{");
355         for (int j = 0; j < bestBlkStr.length(); j += 5)
356         {
357             for (int k = 0; k < 5; k++)
358             {
359                 if (k + j >= bestBlkStr.length())
360                     break;
361                 if (k == 0)
362                     cfile.print("   ");
363                 cfile.print(" " + CODE_NAMES[bestBlkStr.charAt(k + j)] + ",");
364             }
365             cfile.println();
366         }
367         cfile.println("};");
368         cfile.println();
369         cfile.println();
370         cfile.println("//");
371         cfile.println("// The Blocks vector:");
372         cfile.println("//");
373         cfile.println("u2 Code::blocks[" + blocks.length + "] =");
374         cfile.println("{");
375         for (int k = 0; k < blocks.length; k += 9)
376         {
377             for (int i = 0; i < 9; i++)
378             {
379                 if (k + i >= blocks.length)
380                     break;
381                 if (i == 0)
382                     cfile.print("   ");
383                 cfile.print(" 0x" + Integer.toHexString(blocks[k + i]) + ",");
384             }
385             cfile.println();
386         }
387         cfile.println("};");
388         printFooter(cfile);
389         cfile.close();
390 
391         //
392         // Print statistics.
393         //
394         System.out.println("Total static storage utilization is " +
395                            blocks.length * 2 + " bytes for block lookup");
396         System.out.println("   plus " + bestBlkStr.length() +
397                            " bytes for the encodings");
398         System.out.println("Number of legal unicode codepoints:" +
399                            numElements);
400     }
401 
printHeader(PrintStream file, String[] includes)402     static void printHeader(PrintStream file, String[] includes)
403     {
404         file.println("// $I" + /* CVS hack */ "d$ -*- c++ -*-");
405         file.println("// DO NOT MODIFY THIS FILE - it is generated using gencode.java.");
406         file.println("//");
407         file.println("// This software is subject to the terms of the IBM Jikes Compiler");
408         file.println("// License Agreement available at the following URL:");
409         file.println("// http://www.ibm.com/research/jikes.");
410         file.println("// Copyright (C) 1999, 2004 IBM Corporation and others.  All Rights Reserved.");
411         file.println("// You must accept the terms of that agreement to use this software.");
412         file.println("//");
413         file.println();
414 
415         for (int i = 0; i < includes.length; i++)
416         {
417             file.println("#include " + includes[i]);
418         }
419 
420         file.println();
421         file.println("#ifdef HAVE_JIKES_NAMESPACE");
422         file.println("namespace Jikes { // Open namespace Jikes block");
423         file.println("#endif");
424         file.println();
425     }
426 
printMethod(PrintStream file, String name, String value, String relation)427     static void printMethod(PrintStream file, String name, String value,
428                             String relation)
429     {
430         file.println("    static inline bool " + name + "(const wchar_t* p)");
431         file.println("    {");
432         file.println("        u4 c = Codepoint(p);");
433         file.println("        return codes[(u2) (blocks[c >> SHIFT] + c)] " +
434                      relation + ' ' + value + ";");
435         file.println("    }");
436     }
437 
printFooter(PrintStream file)438     static void printFooter(PrintStream file)
439     {
440         file.println();
441         file.println("#ifdef HAVE_JIKES_NAMESPACE");
442         file.println("} // Close namespace Jikes block");
443         file.println("#endif");
444         file.println();
445     }
446 }
447