1 /* StreamTokenizer.java -- parses streams of characters into tokens 2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 Free Software Foundation 3 4 This file is part of GNU Classpath. 5 6 GNU Classpath is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 GNU Classpath is distributed in the hope that it will be useful, but 12 WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GNU Classpath; see the file COPYING. If not, write to the 18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19 02110-1301 USA. 20 21 Linking this library statically or dynamically with other modules is 22 making a combined work based on this library. Thus, the terms and 23 conditions of the GNU General Public License cover the whole 24 combination. 25 26 As a special exception, the copyright holders of this library give you 27 permission to link this library with independent modules to produce an 28 executable, regardless of the license terms of these independent 29 modules, and to copy and distribute the resulting executable under 30 terms of your choice, provided that you also meet, for each linked 31 independent module, the terms and conditions of the license of that 32 module. An independent module is a module which is not derived from 33 or based on this library. If you modify this library, you may extend 34 this exception to your version of the library, but you are not 35 obligated to do so. If you do not wish to do so, delete this 36 exception statement from your version. */ 37 38 package java.io; 39 40 import gnu.java.lang.CPStringBuilder; 41 42 /** 43 * This class parses streams of characters into tokens. There are a 44 * million-zillion flags that can be set to control the parsing, as 45 * described under the various method headings. 46 * 47 * @author Warren Levy (warrenl@cygnus.com) 48 * @date October 25, 1998. 49 */ 50 /* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3 51 * "The Java Language Specification", ISBN 0-201-63451-1 52 * plus online API docs for JDK 1.2 beta from http://www.javasoft.com. 53 * Status: Believed complete and correct. 54 */ 55 56 public class StreamTokenizer 57 { 58 /** A constant indicating that the end of the stream has been read. */ 59 public static final int TT_EOF = -1; 60 61 /** A constant indicating that the end of the line has been read. */ 62 public static final int TT_EOL = '\n'; 63 64 /** A constant indicating that a number token has been read. */ 65 public static final int TT_NUMBER = -2; 66 67 /** A constant indicating that a word token has been read. */ 68 public static final int TT_WORD = -3; 69 70 /** A constant indicating that no tokens have been read yet. */ 71 private static final int TT_NONE = -4; 72 73 /** 74 * Contains the type of the token read resulting from a call to nextToken 75 * The rules are as follows: 76 * <ul> 77 * <li>For a token consisting of a single ordinary character, this is the 78 * value of that character.</li> 79 * <li>For a quoted string, this is the value of the quote character</li> 80 * <li>For a word, this is TT_WORD</li> 81 * <li>For a number, this is TT_NUMBER</li> 82 * <li>For the end of the line, this is TT_EOL</li> 83 * <li>For the end of the stream, this is TT_EOF</li> 84 * </ul> 85 */ 86 public int ttype = TT_NONE; 87 88 /** The String associated with word and string tokens. */ 89 public String sval; 90 91 /** The numeric value associated with number tokens. */ 92 public double nval; 93 94 /* Indicates whether end-of-line is recognized as a token. */ 95 private boolean eolSignificant = false; 96 97 /* Indicates whether word tokens are automatically made lower case. */ 98 private boolean lowerCase = false; 99 100 /* Indicates whether C++ style comments are recognized and skipped. */ 101 private boolean slashSlash = false; 102 103 /* Indicates whether C style comments are recognized and skipped. */ 104 private boolean slashStar = false; 105 106 /* Attribute tables of each byte from 0x00 to 0xFF. */ 107 private boolean[] whitespace = new boolean[256]; 108 private boolean[] alphabetic = new boolean[256]; 109 private boolean[] numeric = new boolean[256]; 110 private boolean[] quote = new boolean[256]; 111 private boolean[] comment = new boolean[256]; 112 113 /* The Reader associated with this class. */ 114 private PushbackReader in; 115 116 /* Indicates if a token has been pushed back. */ 117 private boolean pushedBack = false; 118 119 /* Contains the current line number of the reader. */ 120 private int lineNumber = 1; 121 122 /** 123 * This method reads bytes from an <code>InputStream</code> and tokenizes 124 * them. For details on how this method operates by default, see 125 * <code>StreamTokenizer(Reader)</code>. 126 * 127 * @param is The <code>InputStream</code> to read from 128 * 129 * @deprecated Since JDK 1.1. 130 */ StreamTokenizer(InputStream is)131 public StreamTokenizer(InputStream is) 132 { 133 this(new InputStreamReader(is)); 134 } 135 136 /** 137 * This method initializes a new <code>StreamTokenizer</code> to read 138 * characters from a <code>Reader</code> and parse them. The char values 139 * have their hight bits masked so that the value is treated a character 140 * in the range of 0x0000 to 0x00FF. 141 * <p> 142 * This constructor sets up the parsing table to parse the stream in the 143 * following manner: 144 * <ul> 145 * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF 146 * are initialized as alphabetic</li> 147 * <li>The values 0x00 through 0x20 are initialized as whitespace</li> 148 * <li>The values '\'' and '"' are initialized as quote characters</li> 149 * <li>'/' is a comment character</li> 150 * <li>Numbers will be parsed</li> 151 * <li>EOL is not treated as significant</li> 152 * <li>C and C++ (//) comments are not recognized</li> 153 * </ul> 154 * 155 * @param r The <code>Reader</code> to read chars from 156 */ StreamTokenizer(Reader r)157 public StreamTokenizer(Reader r) 158 { 159 in = new PushbackReader(r); 160 161 whitespaceChars(0x00, 0x20); 162 wordChars('A', 'Z'); 163 wordChars('a', 'z'); 164 wordChars(0xA0, 0xFF); 165 commentChar('/'); 166 quoteChar('\''); 167 quoteChar('"'); 168 parseNumbers(); 169 } 170 171 /** 172 * This method sets the comment attribute on the specified 173 * character. Other attributes for the character are cleared. 174 * 175 * @param ch The character to set the comment attribute for, passed as an int 176 */ commentChar(int ch)177 public void commentChar(int ch) 178 { 179 if (ch >= 0 && ch <= 255) 180 { 181 comment[ch] = true; 182 whitespace[ch] = false; 183 alphabetic[ch] = false; 184 numeric[ch] = false; 185 quote[ch] = false; 186 } 187 } 188 189 /** 190 * This method sets a flag that indicates whether or not the end of line 191 * sequence terminates and is a token. The defaults to <code>false</code> 192 * 193 * @param flag <code>true</code> if EOF is significant, <code>false</code> 194 * otherwise 195 */ eolIsSignificant(boolean flag)196 public void eolIsSignificant(boolean flag) 197 { 198 eolSignificant = flag; 199 } 200 201 /** 202 * This method returns the current line number. Note that if the 203 * <code>pushBack()</code> method is called, it has no effect on the 204 * line number returned by this method. 205 * 206 * @return The current line number 207 */ lineno()208 public int lineno() 209 { 210 return lineNumber; 211 } 212 213 /** 214 * This method sets a flag that indicates whether or not alphabetic 215 * tokens that are returned should be converted to lower case. 216 * 217 * @param flag <code>true</code> to convert to lower case, 218 * <code>false</code> otherwise 219 */ lowerCaseMode(boolean flag)220 public void lowerCaseMode(boolean flag) 221 { 222 lowerCase = flag; 223 } 224 isWhitespace(int ch)225 private boolean isWhitespace(int ch) 226 { 227 return (ch >= 0 && ch <= 255 && whitespace[ch]); 228 } 229 isAlphabetic(int ch)230 private boolean isAlphabetic(int ch) 231 { 232 return ((ch > 255) || (ch >= 0 && alphabetic[ch])); 233 } 234 isNumeric(int ch)235 private boolean isNumeric(int ch) 236 { 237 return (ch >= 0 && ch <= 255 && numeric[ch]); 238 } 239 isQuote(int ch)240 private boolean isQuote(int ch) 241 { 242 return (ch >= 0 && ch <= 255 && quote[ch]); 243 } 244 isComment(int ch)245 private boolean isComment(int ch) 246 { 247 return (ch >= 0 && ch <= 255 && comment[ch]); 248 } 249 250 /** 251 * This method reads the next token from the stream. It sets the 252 * <code>ttype</code> variable to the appropriate token type and 253 * returns it. It also can set <code>sval</code> or <code>nval</code> 254 * as described below. The parsing strategy is as follows: 255 * <ul> 256 * <li>Skip any whitespace characters.</li> 257 * <li>If a numeric character is encountered, attempt to parse a numeric 258 * value. Leading '-' characters indicate a numeric only if followed by 259 * another non-'-' numeric. The value of the numeric token is terminated 260 * by either the first non-numeric encountered, or the second occurrence of 261 * '-' or '.'. The token type returned is TT_NUMBER and <code>nval</code> 262 * is set to the value parsed.</li> 263 * <li>If an alphabetic character is parsed, all subsequent characters 264 * are read until the first non-alphabetic or non-numeric character is 265 * encountered. The token type returned is TT_WORD and the value parsed 266 * is stored in <code>sval</code>. If lower case mode is set, the token 267 * stored in <code>sval</code> is converted to lower case. The end of line 268 * sequence terminates a word only if EOL signficance has been turned on. 269 * The start of a comment also terminates a word. Any character with a 270 * non-alphabetic and non-numeric attribute (such as white space, a quote, 271 * or a commet) are treated as non-alphabetic and terminate the word.</li> 272 * <li>If a comment character is parsed, then all remaining characters on 273 * the current line are skipped and another token is parsed. Any EOL or 274 * EOF's encountered are not discarded, but rather terminate the comment.</li> 275 * <li>If a quote character is parsed, then all characters up to the 276 * second occurrence of the same quote character are parsed into a 277 * <code>String</code>. This <code>String</code> is stored as 278 * <code>sval</code>, but is not converted to lower case, even if lower case 279 * mode is enabled. The token type returned is the value of the quote 280 * character encountered. Any escape sequences 281 * (\b (backspace), \t (HTAB), \n (linefeed), \f (form feed), \r 282 * (carriage return), \" (double quote), \' (single quote), \\ 283 * (backslash), \XXX (octal esacpe)) are converted to the appropriate 284 * char values. Invalid esacape sequences are left in untranslated. 285 * Unicode characters like ('\ u0000') are not recognized. </li> 286 * <li>If the C++ comment sequence "//" is encountered, and the parser 287 * is configured to handle that sequence, then the remainder of the line 288 * is skipped and another token is read exactly as if a character with 289 * the comment attribute was encountered.</li> 290 * <li>If the C comment sequence "/*" is encountered, and the parser 291 * is configured to handle that sequence, then all characters up to and 292 * including the comment terminator sequence are discarded and another 293 * token is parsed.</li> 294 * <li>If all cases above are not met, then the character is an ordinary 295 * character that is parsed as a token by itself. The char encountered 296 * is returned as the token type.</li> 297 * </ul> 298 * 299 * @return The token type 300 * @exception IOException If an I/O error occurs 301 */ nextToken()302 public int nextToken() throws IOException 303 { 304 if (pushedBack) 305 { 306 pushedBack = false; 307 if (ttype != TT_NONE) 308 return ttype; 309 } 310 311 sval = null; 312 int ch; 313 314 // Skip whitespace. Deal with EOL along the way. 315 while (isWhitespace(ch = in.read())) 316 if (ch == '\n' || ch == '\r') 317 { 318 lineNumber++; 319 320 // Throw away \n if in combination with \r. 321 if (ch == '\r' && (ch = in.read()) != '\n') 322 { 323 if (ch != TT_EOF) 324 in.unread(ch); 325 } 326 if (eolSignificant) 327 return (ttype = TT_EOL); 328 } 329 330 if (ch == '/') 331 if ((ch = in.read()) == '/' && slashSlash) 332 { 333 while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF) 334 ; 335 336 if (ch != TT_EOF) 337 in.unread(ch); 338 return nextToken(); // Recursive, but not too deep in normal cases 339 } 340 else if (ch == '*' && slashStar) 341 { 342 while (true) 343 { 344 ch = in.read(); 345 if (ch == '*') 346 { 347 if ((ch = in.read()) == '/') 348 break; 349 else if (ch != TT_EOF) 350 in.unread(ch); 351 } 352 else if (ch == '\n' || ch == '\r') 353 { 354 lineNumber++; 355 if (ch == '\r' && (ch = in.read()) != '\n') 356 { 357 if (ch != TT_EOF) 358 in.unread(ch); 359 } 360 } 361 else if (ch == TT_EOF) 362 { 363 break; 364 } 365 } 366 return nextToken(); // Recursive, but not too deep in normal cases 367 } 368 else 369 { 370 if (ch != TT_EOF) 371 in.unread(ch); 372 ch = '/'; 373 } 374 375 if (ch == TT_EOF) 376 ttype = TT_EOF; 377 else if (isNumeric(ch)) 378 { 379 boolean isNegative = false; 380 if (ch == '-') 381 { 382 // Read ahead to see if this is an ordinary '-' rather than numeric. 383 ch = in.read(); 384 if (isNumeric(ch) && ch != '-') 385 { 386 isNegative = true; 387 } 388 else 389 { 390 if (ch != TT_EOF) 391 in.unread(ch); 392 return (ttype = '-'); 393 } 394 } 395 396 CPStringBuilder tokbuf = new CPStringBuilder(); 397 tokbuf.append((char) ch); 398 399 int decCount = 0; 400 while (isNumeric(ch = in.read()) && ch != '-') 401 if (ch == '.' && decCount++ > 0) 402 break; 403 else 404 tokbuf.append((char) ch); 405 406 if (ch != TT_EOF) 407 in.unread(ch); 408 ttype = TT_NUMBER; 409 try 410 { 411 nval = Double.valueOf(tokbuf.toString()).doubleValue(); 412 } 413 catch (NumberFormatException _) 414 { 415 nval = 0.0; 416 } 417 if (isNegative) 418 nval = -nval; 419 } 420 else if (isAlphabetic(ch)) 421 { 422 CPStringBuilder tokbuf = new CPStringBuilder(); 423 tokbuf.append((char) ch); 424 while (isAlphabetic(ch = in.read()) || isNumeric(ch)) 425 tokbuf.append((char) ch); 426 if (ch != TT_EOF) 427 in.unread(ch); 428 ttype = TT_WORD; 429 sval = tokbuf.toString(); 430 if (lowerCase) 431 sval = sval.toLowerCase(); 432 } 433 else if (isComment(ch)) 434 { 435 while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF) 436 ; 437 438 if (ch != TT_EOF) 439 in.unread(ch); 440 return nextToken(); // Recursive, but not too deep in normal cases. 441 } 442 else if (isQuote(ch)) 443 { 444 ttype = ch; 445 CPStringBuilder tokbuf = new CPStringBuilder(); 446 while ((ch = in.read()) != ttype && ch != '\n' && ch != '\r' && 447 ch != TT_EOF) 448 { 449 if (ch == '\\') 450 switch (ch = in.read()) 451 { 452 case 'a': ch = 0x7; 453 break; 454 case 'b': ch = '\b'; 455 break; 456 case 'f': ch = 0xC; 457 break; 458 case 'n': ch = '\n'; 459 break; 460 case 'r': ch = '\r'; 461 break; 462 case 't': ch = '\t'; 463 break; 464 case 'v': ch = 0xB; 465 break; 466 case '\n': ch = '\n'; 467 break; 468 case '\r': ch = '\r'; 469 break; 470 case '\"': 471 case '\'': 472 case '\\': 473 break; 474 default: 475 int ch1, nextch; 476 if ((nextch = ch1 = ch) >= '0' && ch <= '7') 477 { 478 ch -= '0'; 479 if ((nextch = in.read()) >= '0' && nextch <= '7') 480 { 481 ch = ch * 8 + nextch - '0'; 482 if ((nextch = in.read()) >= '0' && nextch <= '7' && 483 ch1 >= '0' && ch1 <= '3') 484 { 485 ch = ch * 8 + nextch - '0'; 486 nextch = in.read(); 487 } 488 } 489 } 490 491 if (nextch != TT_EOF) 492 in.unread(nextch); 493 } 494 495 tokbuf.append((char) ch); 496 } 497 498 // Throw away matching quote char. 499 if (ch != ttype && ch != TT_EOF) 500 in.unread(ch); 501 502 sval = tokbuf.toString(); 503 } 504 else 505 { 506 ttype = ch; 507 } 508 509 return ttype; 510 } 511 resetChar(int ch)512 private void resetChar(int ch) 513 { 514 whitespace[ch] = alphabetic[ch] = numeric[ch] = quote[ch] = comment[ch] = 515 false; 516 } 517 518 /** 519 * This method makes the specified character an ordinary character. This 520 * means that none of the attributes (whitespace, alphabetic, numeric, 521 * quote, or comment) will be set on this character. This character will 522 * parse as its own token. 523 * 524 * @param ch The character to make ordinary, passed as an int 525 */ ordinaryChar(int ch)526 public void ordinaryChar(int ch) 527 { 528 if (ch >= 0 && ch <= 255) 529 resetChar(ch); 530 } 531 532 /** 533 * This method makes all the characters in the specified range, range 534 * terminators included, ordinary. This means the none of the attributes 535 * (whitespace, alphabetic, numeric, quote, or comment) will be set on 536 * any of the characters in the range. This makes each character in this 537 * range parse as its own token. 538 * 539 * @param low The low end of the range of values to set the whitespace 540 * attribute for 541 * @param hi The high end of the range of values to set the whitespace 542 * attribute for 543 */ ordinaryChars(int low, int hi)544 public void ordinaryChars(int low, int hi) 545 { 546 if (low < 0) 547 low = 0; 548 if (hi > 255) 549 hi = 255; 550 for (int i = low; i <= hi; i++) 551 resetChar(i); 552 } 553 554 /** 555 * This method sets the numeric attribute on the characters '0' - '9' and 556 * the characters '.' and '-'. 557 * When this method is used, the result of giving other attributes 558 * (whitespace, quote, or comment) to the numeric characters may 559 * vary depending on the implementation. For example, if 560 * parseNumbers() and then whitespaceChars('1', '1') are called, 561 * this implementation reads "121" as 2, while some other implementation 562 * will read it as 21. 563 */ parseNumbers()564 public void parseNumbers() 565 { 566 for (int i = 0; i <= 9; i++) 567 numeric['0' + i] = true; 568 569 numeric['.'] = true; 570 numeric['-'] = true; 571 } 572 573 /** 574 * Puts the current token back into the StreamTokenizer so 575 * <code>nextToken</code> will return the same value on the next call. 576 * May cause the lineno method to return an incorrect value 577 * if lineno is called before the next call to nextToken. 578 */ pushBack()579 public void pushBack() 580 { 581 pushedBack = true; 582 } 583 584 /** 585 * This method sets the quote attribute on the specified character. 586 * Other attributes for the character are cleared. 587 * 588 * @param ch The character to set the quote attribute for, passed as an int. 589 */ quoteChar(int ch)590 public void quoteChar(int ch) 591 { 592 if (ch >= 0 && ch <= 255) 593 { 594 quote[ch] = true; 595 comment[ch] = false; 596 whitespace[ch] = false; 597 alphabetic[ch] = false; 598 numeric[ch] = false; 599 } 600 } 601 602 /** 603 * This method removes all attributes (whitespace, alphabetic, numeric, 604 * quote, and comment) from all characters. It is equivalent to calling 605 * <code>ordinaryChars(0x00, 0xFF)</code>. 606 * 607 * @see #ordinaryChars(int, int) 608 */ resetSyntax()609 public void resetSyntax() 610 { 611 ordinaryChars(0x00, 0xFF); 612 } 613 614 /** 615 * This method sets a flag that indicates whether or not "C++" language style 616 * comments ("//" comments through EOL ) are handled by the parser. 617 * If this is <code>true</code> commented out sequences are skipped and 618 * ignored by the parser. This defaults to <code>false</code>. 619 * 620 * @param flag <code>true</code> to recognized and handle "C++" style 621 * comments, <code>false</code> otherwise 622 */ slashSlashComments(boolean flag)623 public void slashSlashComments(boolean flag) 624 { 625 slashSlash = flag; 626 } 627 628 /** 629 * This method sets a flag that indicates whether or not "C" language style 630 * comments (with nesting not allowed) are handled by the parser. 631 * If this is <code>true</code> commented out sequences are skipped and 632 * ignored by the parser. This defaults to <code>false</code>. 633 * 634 * @param flag <code>true</code> to recognized and handle "C" style comments, 635 * <code>false</code> otherwise 636 */ slashStarComments(boolean flag)637 public void slashStarComments(boolean flag) 638 { 639 slashStar = flag; 640 } 641 642 /** 643 * This method returns the current token value as a <code>String</code> in 644 * the form "Token[x], line n", where 'n' is the current line numbers and 645 * 'x' is determined as follows. 646 * <p> 647 * <ul> 648 * <li>If no token has been read, then 'x' is "NOTHING" and 'n' is 0</li> 649 * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"</li> 650 * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"</li> 651 * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code></li> 652 * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where 653 * 'strnval' is <code>String.valueOf(nval)</code>.</li> 654 * <li>If <code>ttype</code> is a quote character, then 'x' is 655 * <code>sval</code></li> 656 * <li>For all other cases, 'x' is <code>ttype</code></li> 657 * </ul> 658 */ toString()659 public String toString() 660 { 661 String tempstr; 662 if (ttype == TT_EOF) 663 tempstr = "EOF"; 664 else if (ttype == TT_EOL) 665 tempstr = "EOL"; 666 else if (ttype == TT_WORD) 667 tempstr = sval; 668 else if (ttype == TT_NUMBER) 669 tempstr = "n=" + nval; 670 else if (ttype == TT_NONE) 671 tempstr = "NOTHING"; 672 else // must be an ordinary char. 673 tempstr = "\'" + (char) ttype + "\'"; 674 675 return "Token[" + tempstr + "], line " + lineno(); 676 } 677 678 /** 679 * This method sets the whitespace attribute for all characters in the 680 * specified range, range terminators included. 681 * 682 * @param low The low end of the range of values to set the whitespace 683 * attribute for 684 * @param hi The high end of the range of values to set the whitespace 685 * attribute for 686 */ whitespaceChars(int low, int hi)687 public void whitespaceChars(int low, int hi) 688 { 689 if (low < 0) 690 low = 0; 691 if (hi > 255) 692 hi = 255; 693 for (int i = low; i <= hi; i++) 694 { 695 resetChar(i); 696 whitespace[i] = true; 697 } 698 } 699 700 /** 701 * This method sets the alphabetic attribute for all characters in the 702 * specified range, range terminators included. 703 * 704 * @param low The low end of the range of values to set the alphabetic 705 * attribute for 706 * @param hi The high end of the range of values to set the alphabetic 707 * attribute for 708 */ wordChars(int low, int hi)709 public void wordChars(int low, int hi) 710 { 711 if (low < 0) 712 low = 0; 713 if (hi > 255) 714 hi = 255; 715 for (int i = low; i <= hi; i++) 716 alphabetic[i] = true; 717 } 718 } 719