1 // $Id: gencode.java,v 1.14 2004/02/17 12:58:45 ericb Exp $ 2 // 3 // This software is subject to the terms of the IBM Jikes Compiler 4 // License Agreement available at the following URL: 5 // http://www.ibm.com/research/jikes. 6 // Copyright (C) 1999, 2004 IBM Corporation and others. All Rights Reserved. 7 // You must accept the terms of that agreement to use this software. 8 // 9 10 import java.io.*; 11 import java.util.*; 12 13 /** 14 * This helper class generates code.h and code.cpp, the lookup table used 15 * by Jikes in deciding how to categorize Unicode input characters. This 16 * is written in Java in order to accurately track the Unicode rules as 17 * implemented by the latest version of java.lang.Character. 18 */ 19 class gencode 20 { 21 static final int SPACE_CODE = 0; // Character.isWhitespace 22 static final int BAD_CODE = 1; // everything else ... 23 static final int DIGIT_CODE = 2; // Character.isDigit 24 static final int ID_PART_CODE = 3; // Character.isJavaIdentifierPart 25 static final int LOWER_CODE = 4; // Character.isLowerCase 26 static final int UPPER_CODE = 5; // Character.isUpperCase 27 static final int ID_START_CODE = 6; // Character.isJavaIdentifierStart 28 static final int MAX_CODEPOINT = 0x10ffff; // Maximum Unicode. 29 30 static final String[] CODE_NAMES = { 31 "SPACE_CODE", "BAD_CODE", "DIGIT_CODE", "ID_PART_CODE", "LOWER_CODE", 32 "UPPER_CODE", "ID_START_CODE" 33 }; 34 main(String args[])35 static public void main(String args[]) 36 throws FileNotFoundException, IOException 37 { 38 System.out.println("Gathering data..."); 39 // Use char[] for ease in building strings, despite only using 8 bits. 40 int numElements = MAX_CODEPOINT + 1; 41 char[] info = new char[numElements]; 42 for (int i = 0; i < info.length; i++) 43 { 44 if (Character.isWhitespace(i)) 45 info[i] = SPACE_CODE; 46 else if (Character.isLowerCase(i)) 47 { 48 assert Character.isJavaIdentifierStart(i); 49 info[i] = LOWER_CODE; 50 } 51 else if (Character.isUpperCase(i)) 52 { 53 assert Character.isJavaIdentifierStart(i); 54 info[i] = UPPER_CODE; 55 } 56 else if (Character.isDigit(i)) 57 { 58 assert Character.isJavaIdentifierPart(i); 59 info[i] = DIGIT_CODE; 60 } 61 else if (Character.isJavaIdentifierStart(i)) 62 info[i] = ID_START_CODE; 63 else if (Character.isJavaIdentifierPart(i)) 64 info[i] = ID_PART_CODE; 65 else 66 { 67 info[i] = BAD_CODE; 68 numElements--; 69 } 70 } 71 72 System.out.println("Compressing tables..."); 73 int bestShift = 0; 74 int bestEst = info.length; 75 String bestBlkStr = null; 76 77 for (int i = 3; i < 12; i++) 78 { 79 int blkSize = 1 << i; 80 Map blocks = new HashMap(); 81 List blkArray = new ArrayList(); 82 System.out.print("shift: " + i); 83 84 for (int j = 0; j < info.length; j += blkSize) 85 { 86 String key = new String(info, j, blkSize); 87 if (blocks.get(key) == null) 88 { 89 blkArray.add(key); 90 blocks.put(key, new Integer(blkArray.size())); 91 } 92 } 93 int blkNum = blkArray.size(); 94 int blockLen = blkNum * blkSize; 95 System.out.print(" before " + blockLen); 96 97 // 98 // Try to pack blkArray, by finding successively smaller matches 99 // between heads and tails of blocks. 100 // 101 for (int j = blkSize - 1; j > 0; j--) 102 { 103 Map tails = new HashMap(); 104 for (int k = 0; k < blkArray.size(); k++) 105 { 106 String str = (String) blkArray.get(k); 107 if (str == null) 108 continue; 109 String tail = str.substring(str.length() - j); 110 List l = (List) tails.get(tail); 111 if (l == null) 112 tails.put(tail, 113 new LinkedList(Collections 114 .singleton(new Integer(k)))); 115 else l.add(new Integer(k)); 116 } 117 118 // 119 // Now calculate the heads, and merge overlapping blocks 120 // 121 block: 122 for (int k = 0; k < blkArray.size(); k++) 123 { 124 String tomerge = (String) blkArray.get(k); 125 if (tomerge == null) 126 continue; 127 while (true) 128 { 129 String head = tomerge.substring(0, j); 130 LinkedList entry = (LinkedList) tails.get(head); 131 if (entry == null) 132 continue block; 133 Integer other = (Integer) entry.removeFirst(); 134 if (other.intValue() == k) 135 { 136 if (entry.size() > 0) 137 { 138 entry.add(other); 139 other = (Integer) entry.removeFirst(); 140 } 141 else 142 { 143 entry.add(other); 144 continue block; 145 } 146 } 147 if (entry.size() == 0) 148 tails.remove(head); 149 150 // 151 // A match was found. 152 // 153 String merge = blkArray.get(other.intValue()) + 154 tomerge.substring(j); 155 blockLen -= j; 156 blkNum--; 157 if (other.intValue() < k) 158 { 159 blkArray.set(k, null); 160 blkArray.set(other.intValue(), merge); 161 String tail = merge.substring(merge.length() - j); 162 List l = (List) tails.get(tail); 163 Collections.replaceAll(l, new Integer(k), other); 164 continue block; 165 } 166 blkArray.set(k, merge); 167 blkArray.set(other.intValue(), null); 168 tomerge = merge; 169 } 170 } 171 } 172 StringBuffer blockStr = new StringBuffer(blockLen); 173 for (int k = 0; k < blkArray.size(); k++) 174 { 175 String str = (String) blkArray.get(k); 176 if (str != null) 177 blockStr.append(str); 178 } 179 assert blockStr.length() == blockLen : 180 "Unexpected blockLen " + blockLen; 181 int estimate = blockLen + (info.length >> (i - 1)); 182 System.out.println(" after merge " + blockLen + ": " + estimate + 183 " bytes"); 184 if (estimate < bestEst) 185 { 186 bestEst = estimate; 187 bestShift = i; 188 bestBlkStr = blockStr.toString(); 189 } 190 } 191 192 System.out.println("Generating code.h with shift of " + bestShift); 193 int blkSize = 1 << bestShift; 194 char[] blocks = new char[info.length >> bestShift]; 195 for (int j = 0; j < info.length; j += blkSize) 196 { 197 String key = new String(info, j, blkSize); 198 int index = bestBlkStr.indexOf(key); 199 assert index != -1 : "Unexpected index for " + j; 200 blocks[j >> bestShift] = (char) (index - j); 201 } 202 203 // 204 // Process the code.h file 205 // 206 PrintStream hfile = new PrintStream(new FileOutputStream("code.h")); 207 printHeader(hfile, new String[] {"\"platform.h\""}); 208 hfile.println("#ifndef code_INCLUDED"); 209 hfile.println("#define code_INCLUDED"); 210 hfile.println(); 211 hfile.println("class Code"); 212 hfile.println("{"); 213 hfile.println(" //"); 214 hfile.println(" // To facilitate the scanning, the character set is partitioned into"); 215 hfile.println(" // categories using the array CODE. These are described below together"); 216 hfile.println(" // with some self-explanatory functions defined on CODE."); 217 hfile.println(" //"); 218 hfile.println(" enum {"); 219 hfile.println(" SHIFT = " + bestShift + ","); 220 hfile.println(" SPACE_CODE = " + SPACE_CODE + ','); 221 hfile.println(" BAD_CODE = " + BAD_CODE + ','); 222 hfile.println(" DIGIT_CODE = " + DIGIT_CODE + ','); 223 hfile.println(" ID_PART_CODE = " + ID_PART_CODE + ','); 224 hfile.println(" LOWER_CODE = " + LOWER_CODE + ','); 225 hfile.println(" UPPER_CODE = " + UPPER_CODE + ','); 226 hfile.println(" ID_START_CODE = " + ID_START_CODE); 227 hfile.println(" };"); 228 hfile.println(); 229 hfile.println(" static char codes[" + bestBlkStr.length() + "];"); 230 hfile.println(" static u2 blocks[" + blocks.length + "];"); 231 hfile.println(); 232 hfile.println(); 233 hfile.println("public:"); 234 hfile.println("#ifdef JIKES_DEBUG"); 235 hfile.println(" static inline void CodeCheck(u4 c)"); 236 hfile.println(" {"); 237 hfile.println(" assert((u2) (blocks[c >> SHIFT] + c) < " + 238 bestBlkStr.length() + ");"); 239 hfile.println(" }"); 240 hfile.println(); 241 hfile.println(" static inline bool CodeCheck(void)"); 242 hfile.println(" {"); 243 hfile.println(" for (u4 c = 0; c <= " + MAX_CODEPOINT + 244 "; c++)"); 245 hfile.println(" CodeCheck(c);"); 246 hfile.println(" return true;"); 247 hfile.println(" }"); 248 hfile.println("#endif // JIKES_DEBUG"); 249 hfile.println(); 250 hfile.println("//"); 251 hfile.println("// These methods test for Unicode surrogate pairs."); 252 hfile.println("//"); 253 hfile.println(" static inline bool IsHighSurrogate(wchar_t c)"); 254 hfile.println(" {"); 255 hfile.println(" return c >= 0xd800 && c <= 0xdbff;"); 256 hfile.println(" }"); 257 hfile.println(" static inline bool IsLowSurrogate(wchar_t c)"); 258 hfile.println(" {"); 259 hfile.println(" return c >= 0xdc00 && c <= 0xdfff;"); 260 hfile.println(" }"); 261 hfile.println(); 262 hfile.println(" static inline u4 Codepoint(wchar_t hi, wchar_t lo)"); 263 hfile.println(" {"); 264 hfile.println(" assert(IsHighSurrogate(hi) && IsLowSurrogate(lo));"); 265 hfile.println(" return (hi << 10) + lo + (0x10000 - (0xd800 << 10) - 0xdc00);"); 266 hfile.println(" }"); 267 hfile.println(" static inline u4 Codepoint(const wchar_t* p)"); 268 hfile.println(" {"); 269 hfile.println(" u4 result = (u4) *p;"); 270 hfile.println(" if (IsHighSurrogate(result) && IsLowSurrogate(p[1]))"); 271 hfile.println(" result = Codepoint(result, p[1]);"); 272 hfile.println(" return result;"); 273 hfile.println(" }"); 274 hfile.println(" static inline int Codelength(const wchar_t* p)"); 275 hfile.println(" {"); 276 hfile.println(" return (IsHighSurrogate(*p) && IsLowSurrogate(p[1])) ? 2 : 1;"); 277 hfile.println(" }"); 278 hfile.println(); 279 hfile.println("//"); 280 hfile.println("// These methods test for ASCII characteristics. Since it is strictly ASCII,"); 281 hfile.println("// there is no need to check for Unicode surrogate pairs."); 282 hfile.println("//"); 283 hfile.println(" static inline bool IsNewline(wchar_t c)"); 284 hfile.println(" {"); 285 hfile.println(" return c == U_LF || c == U_CR;"); 286 hfile.println(" }"); 287 hfile.println(" static inline bool IsSpaceButNotNewline(wchar_t c)"); 288 hfile.println(" {"); 289 hfile.println(" return c == U_SP || c == U_FF || c == U_HT;"); 290 hfile.println(" }"); 291 hfile.println(" static inline bool IsSpace(wchar_t c)"); 292 hfile.println(" {"); 293 hfile.println(" return c == U_SP || c == U_CR || c == U_LF ||"); 294 hfile.println(" c == U_HT || c == U_FF;"); 295 hfile.println(" }"); 296 hfile.println(); 297 hfile.println(" static inline bool IsDecimalDigit(wchar_t c)"); 298 hfile.println(" {"); 299 hfile.println(" return c <= U_9 && c >= U_0;"); 300 hfile.println(" }"); 301 hfile.println(" static inline bool IsOctalDigit(wchar_t c)"); 302 hfile.println(" {"); 303 hfile.println(" return c <= U_7 && c >= U_0;"); 304 hfile.println(" }"); 305 hfile.println(" static inline bool IsHexDigit(wchar_t c)"); 306 hfile.println(" {"); 307 hfile.println(" return c <= U_f && (c >= U_a ||"); 308 hfile.println(" (c >= U_A && c <= U_F) ||"); 309 hfile.println(" (c >= U_0 && c <= U_9));"); 310 hfile.println(" }"); 311 hfile.println(" static inline int Value(wchar_t c)"); 312 hfile.println(" {"); 313 hfile.println(" assert(IsHexDigit(c));"); 314 hfile.println(" return c - (c <= U_9 ? U_0 : c < U_a ? U_A - 10 : U_a - 10);"); 315 hfile.println(" }"); 316 hfile.println(" static inline bool IsSign(wchar_t c)"); 317 hfile.println(" {"); 318 hfile.println(" return c == U_MINUS || c == U_PLUS;"); 319 hfile.println(" }"); 320 hfile.println(); 321 hfile.println(" static inline bool IsAsciiUpper(wchar_t c)"); 322 hfile.println(" {"); 323 hfile.println(" return c <= U_Z && c >= U_A;"); 324 hfile.println(" }"); 325 hfile.println(" static inline bool IsAsciiLower(wchar_t c)"); 326 hfile.println(" {"); 327 hfile.println(" return c <= U_z && c >= U_a;"); 328 hfile.println(" }"); 329 hfile.println(); 330 hfile.println("//"); 331 hfile.println("// The following methods recognize Unicode surrogate pairs, hence the need to"); 332 hfile.println("// pass a pointer. Use Codelength() to determine if one or two characters"); 333 hfile.println("// were used in the formation of a character."); 334 hfile.println("//"); 335 printMethod(hfile, "IsWhitespace", "SPACE_CODE", "=="); 336 printMethod(hfile, "IsDigit", "DIGIT_CODE", "=="); 337 printMethod(hfile, "IsUpper", "UPPER_CODE", "=="); 338 printMethod(hfile, "IsLower", "LOWER_CODE", "=="); 339 printMethod(hfile, "IsAlpha", "LOWER_CODE", ">="); 340 printMethod(hfile, "IsAlnum", "DIGIT_CODE", ">="); 341 hfile.println("};"); 342 hfile.println(); 343 hfile.println("#endif // code_INCLUDED"); 344 printFooter(hfile); 345 hfile.close(); 346 347 // 348 // Process the code.cpp file 349 // 350 System.out.println("Generating code.cpp"); 351 PrintStream cfile = new PrintStream(new FileOutputStream("code.cpp")); 352 printHeader(cfile, new String[] {"\"code.h\""}); 353 cfile.println("char Code::codes[" + bestBlkStr.length() + "] ="); 354 cfile.println("{"); 355 for (int j = 0; j < bestBlkStr.length(); j += 5) 356 { 357 for (int k = 0; k < 5; k++) 358 { 359 if (k + j >= bestBlkStr.length()) 360 break; 361 if (k == 0) 362 cfile.print(" "); 363 cfile.print(" " + CODE_NAMES[bestBlkStr.charAt(k + j)] + ","); 364 } 365 cfile.println(); 366 } 367 cfile.println("};"); 368 cfile.println(); 369 cfile.println(); 370 cfile.println("//"); 371 cfile.println("// The Blocks vector:"); 372 cfile.println("//"); 373 cfile.println("u2 Code::blocks[" + blocks.length + "] ="); 374 cfile.println("{"); 375 for (int k = 0; k < blocks.length; k += 9) 376 { 377 for (int i = 0; i < 9; i++) 378 { 379 if (k + i >= blocks.length) 380 break; 381 if (i == 0) 382 cfile.print(" "); 383 cfile.print(" 0x" + Integer.toHexString(blocks[k + i]) + ","); 384 } 385 cfile.println(); 386 } 387 cfile.println("};"); 388 printFooter(cfile); 389 cfile.close(); 390 391 // 392 // Print statistics. 393 // 394 System.out.println("Total static storage utilization is " + 395 blocks.length * 2 + " bytes for block lookup"); 396 System.out.println(" plus " + bestBlkStr.length() + 397 " bytes for the encodings"); 398 System.out.println("Number of legal unicode codepoints:" + 399 numElements); 400 } 401 printHeader(PrintStream file, String[] includes)402 static void printHeader(PrintStream file, String[] includes) 403 { 404 file.println("// $I" + /* CVS hack */ "d$ -*- c++ -*-"); 405 file.println("// DO NOT MODIFY THIS FILE - it is generated using gencode.java."); 406 file.println("//"); 407 file.println("// This software is subject to the terms of the IBM Jikes Compiler"); 408 file.println("// License Agreement available at the following URL:"); 409 file.println("// http://www.ibm.com/research/jikes."); 410 file.println("// Copyright (C) 1999, 2004 IBM Corporation and others. All Rights Reserved."); 411 file.println("// You must accept the terms of that agreement to use this software."); 412 file.println("//"); 413 file.println(); 414 415 for (int i = 0; i < includes.length; i++) 416 { 417 file.println("#include " + includes[i]); 418 } 419 420 file.println(); 421 file.println("#ifdef HAVE_JIKES_NAMESPACE"); 422 file.println("namespace Jikes { // Open namespace Jikes block"); 423 file.println("#endif"); 424 file.println(); 425 } 426 printMethod(PrintStream file, String name, String value, String relation)427 static void printMethod(PrintStream file, String name, String value, 428 String relation) 429 { 430 file.println(" static inline bool " + name + "(const wchar_t* p)"); 431 file.println(" {"); 432 file.println(" u4 c = Codepoint(p);"); 433 file.println(" return codes[(u2) (blocks[c >> SHIFT] + c)] " + 434 relation + ' ' + value + ";"); 435 file.println(" }"); 436 } 437 printFooter(PrintStream file)438 static void printFooter(PrintStream file) 439 { 440 file.println(); 441 file.println("#ifdef HAVE_JIKES_NAMESPACE"); 442 file.println("} // Close namespace Jikes block"); 443 file.println("#endif"); 444 file.println(); 445 } 446 } 447