/* XPathTokenizer.java --
   Copyright (C) 2004 Free Software Foundation, Inc.

This file is part of GNU Classpath.

GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING. If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA.

Linking this library statically or dynamically with other modules is
making a combined work based on this library. Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.

As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module. An independent module is a module which is not derived from
or based on this library. If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so. If you do not wish to do so, delete this
exception statement from your version. */

package gnu.xml.xpath;

import gnu.java.lang.CPStringBuilder;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Map;
import java.util.TreeMap;

/*import antlr.Token;
import antlr.TokenStream;
import antlr.TokenStreamException;
import antlr.TokenStreamIOException;*/

/**
 * XPath 1.0 expression tokenizer.
 *
 * Characters are pulled from a mark-capable {@link Reader}; each call to
 * {@link #advance()} produces one token whose type and value are then
 * available through {@link #token()} and {@link #value()}.
 *
 * @author <a href='mailto:dog@gnu.org'>Chris Burdess</a>
 */
public class XPathTokenizer
  implements XPathParser.yyInput
  //implements TokenStream
{

  /**
   * A single lexical token: a parser token type plus an optional text value.
   */
  static class XPathToken
    //extends Token
  {

    /** Token type, one of the <code>XPathParser</code> constants. */
    int type;

    /** Token text, or <code>null</code> for tokens that carry no value. */
    String val;

    /**
     * Creates a token with no text value.
     *
     * @param type the token type
     */
    XPathToken (int type)
    {
      this (type, null);
    }

    /**
     * Creates a token with the given type and text value.
     *
     * @param type the token type
     * @param val the token text, may be <code>null</code>
     */
    XPathToken (int type, String val)
    {
      //super (type);
      this.type = type;
      this.val = val;
    }

    /**
     * Returns the token text.
     *
     * @return the text value, or <code>null</code> if the token has none
     */
    public String getText ()
    {
      return val;
    }

    /**
     * Returns the token text (the same value as {@link #getText()}).
     *
     * @return the text value, or <code>null</code> if the token has none
     */
    @Override
    public String toString ()
    {
      return val;
    }

  }

  /**
   * Maps reserved words (axis names, operator names and node types) to
   * their parser token types.
   */
  static final Map<String,Integer> keywords = new TreeMap<String,Integer> ();
  static
  {
    keywords.put ("ancestor", Integer.valueOf (XPathParser.ANCESTOR));
    keywords.put ("ancestor-or-self",
                  Integer.valueOf (XPathParser.ANCESTOR_OR_SELF));
    keywords.put ("attribute", Integer.valueOf (XPathParser.ATTRIBUTE));
    keywords.put ("child", Integer.valueOf (XPathParser.CHILD));
    keywords.put ("descendant", Integer.valueOf (XPathParser.DESCENDANT));
    keywords.put ("descendant-or-self",
                  Integer.valueOf (XPathParser.DESCENDANT_OR_SELF));
    keywords.put ("following", Integer.valueOf (XPathParser.FOLLOWING));
    keywords.put ("following-sibling",
                  Integer.valueOf (XPathParser.FOLLOWING_SIBLING));
    keywords.put ("namespace", Integer.valueOf (XPathParser.NAMESPACE));
    keywords.put ("parent", Integer.valueOf (XPathParser.PARENT));
    keywords.put ("preceding", Integer.valueOf (XPathParser.PRECEDING));
    keywords.put ("preceding-sibling",
                  Integer.valueOf (XPathParser.PRECEDING_SIBLING));
    keywords.put ("self", Integer.valueOf (XPathParser.SELF));
    keywords.put ("div", Integer.valueOf (XPathParser.DIV));
    keywords.put ("mod", Integer.valueOf (XPathParser.MOD));
    keywords.put ("or", Integer.valueOf (XPathParser.OR));
    keywords.put ("and", Integer.valueOf (XPathParser.AND));
    keywords.put ("comment", Integer.valueOf (XPathParser.COMMENT));
    keywords.put ("processing-instruction",
                  Integer.valueOf (XPathParser.PROCESSING_INSTRUCTION));
    keywords.put ("text", Integer.valueOf (XPathParser.TEXT));
    keywords.put ("node", Integer.valueOf (XPathParser.NODE));
  }

  /** Character source; always supports mark/reset (see the constructor). */
  Reader in;

  /** The token produced by the most recent successful advance. */
  XPathToken token;

  /** The token that was current before the most recent advance. */
  XPathToken lastToken;

  /**
   * Creates a tokenizer over the given expression text.
   *
   * @param expr the XPath expression
   */
  public XPathTokenizer (String expr)
  {
    this (new StringReader (expr));
  }

  /**
   * Creates a tokenizer over the given character stream. A reader that
   * does not support mark/reset is wrapped in a {@link BufferedReader},
   * because the tokenizer relies on look-ahead.
   *
   * @param in the character source
   */
  XPathTokenizer (Reader in)
  {
    this.in = in.markSupported () ? in : new BufferedReader (in);
  }

  /* Begin ANTLR specific *

  public Token nextToken ()
    throws TokenStreamException
  {
    try
      {
        if (!advance ())
          {
            throw new TokenStreamException ("eof");
          }
        token ();
        return token;
      }
    catch (IOException e)
      {
        throw new TokenStreamIOException (e);
      }
  }

  * End ANTLR specific */

  /**
   * Reads the next token from the input, skipping leading whitespace
   * (space, tab, CR, LF). The previous token is remembered in
   * <code>lastToken</code>. Characters that cannot start any token yield a
   * token of type <code>XPathParser.yyErrorCode</code>.
   *
   * @return <code>false</code> at end of input (the current token is then
   * left unchanged), <code>true</code> if a token was read
   * @throws IOException if the underlying reader fails
   */
  public boolean advance ()
    throws IOException
  {
    lastToken = token;
    // Skip whitespace iteratively so that a long run of blanks cannot
    // exhaust the stack.
    int c;
    do
      {
        c = in.read ();
      }
    while (c == 0x20 || c == 0x09 || c == 0x0d || c == 0x0a);

    switch (c)
      {
      case -1: // eof
        return false;
      case 0x22: // "
      case 0x27: // '
        token = consume_literal (c);
        break;
      case 0x28: // (
        token = new XPathToken (XPathParser.LP);
        break;
      case 0x29: // )
        token = new XPathToken (XPathParser.RP);
        break;
      case 0x5b: // [
        token = new XPathToken (XPathParser.LB);
        break;
      case 0x5d: // ]
        token = new XPathToken (XPathParser.RB);
        break;
      case 0x2c: // ,
        token = new XPathToken (XPathParser.COMMA);
        break;
      case 0x7c: // |
        token = new XPathToken (XPathParser.PIPE);
        break;
      case 0x2f: // /
        token = new XPathToken (consumeIf (0x2f)
                                ? XPathParser.DOUBLE_SLASH
                                : XPathParser.SLASH);
        break;
      case 0x3d: // =
        token = new XPathToken (XPathParser.EQ);
        break;
      case 0x21: // !
        // A lone '!' is not a valid XPath token.
        token = new XPathToken (consumeIf (0x3d)
                                ? XPathParser.NE
                                : XPathParser.yyErrorCode);
        break;
      case 0x3e: // >
        token = new XPathToken (consumeIf (0x3d)
                                ? XPathParser.GTE
                                : XPathParser.GT);
        break;
      case 0x3c: // <
        token = new XPathToken (consumeIf (0x3d)
                                ? XPathParser.LTE
                                : XPathParser.LT);
        break;
      case 0x2b: // +
        token = new XPathToken (XPathParser.PLUS);
        break;
      case 0x2d: // -
        token = new XPathToken (XPathParser.MINUS);
        break;
      case 0x40: // @
        token = new XPathToken (XPathParser.AT);
        break;
      case 0x2a: // *
        token = new XPathToken (XPathParser.STAR);
        break;
      case 0x24: // $
        token = new XPathToken (XPathParser.DOLLAR);
        break;
      case 0x3a: // :
        token = new XPathToken (consumeIf (0x3a)
                                ? XPathParser.DOUBLE_COLON
                                : XPathParser.COLON);
        break;
      case 0x2e: // .
        token = new XPathToken (consumeIf (0x2e)
                                ? XPathParser.DOUBLE_DOT
                                : XPathParser.DOT);
        break;
      default:
        if (c >= 0x30 && c <= 0x39)
          {
            token = consume_digits (c);
          }
        else if (c == 0x5f || Character.isLetter ((char) c))
          {
            token = consume_name (c);
          }
        else
          {
            token = new XPathToken (XPathParser.yyErrorCode);
          }
      }
    return true;
  }

  /**
   * Consumes the next character only if it equals <code>expected</code>;
   * otherwise the input position is left untouched.
   *
   * @param expected the character to look for
   * @return <code>true</code> if the character was present and consumed
   * @throws IOException if the underlying reader fails
   */
  private boolean consumeIf (int expected)
    throws IOException
  {
    in.mark (1);
    if (in.read () == expected)
      {
        return true;
      }
    in.reset ();
    return false;
  }

  /**
   * Returns the type of the current token.
   *
   * @return the token type
   * @throws NullPointerException if no token has been read yet
   */
  public int token ()
  {
    return token.type;
  }

  /**
   * Returns the text value of the current token.
   *
   * @return the token text as a <code>String</code>, or <code>null</code>
   * if the token carries no value
   * @throws NullPointerException if no token has been read yet
   */
  public Object value ()
  {
    return token.val;
  }

  /**
   * Reads a quoted literal up to the matching closing delimiter. The
   * opening delimiter has already been consumed by the caller.
   *
   * @param delimiter the quote character that opened the literal
   * @return a <code>LITERAL</code> token holding the text between the
   * quotes, or an error token if the input ends before the closing quote
   * @throws IOException if the underlying reader fails
   */
  XPathToken consume_literal (int delimiter)
    throws IOException
  {
    CPStringBuilder buf = new CPStringBuilder ();
    while (true)
      {
        int c = in.read ();
        if (c == -1)
          {
            // Unterminated literal
            return new XPathToken (XPathParser.yyErrorCode);
          }
        if (c == delimiter)
          {
            return new XPathToken (XPathParser.LITERAL, buf.toString ());
          }
        buf.append ((char) c);
      }
  }

  /**
   * Reads a run of ASCII decimal digits.
   *
   * @param c the first digit, already consumed by the caller
   * @return a <code>DIGITS</code> token holding the digit string
   * @throws IOException if the underlying reader fails
   */
  XPathToken consume_digits (int c)
    throws IOException
  {
    CPStringBuilder buf = new CPStringBuilder ();
    buf.append ((char) c);
    while (true)
      {
        in.mark (1);
        c = in.read ();
        if (c < 0x30 || c > 0x39)
          {
            // Not a digit: push it back and finish
            in.reset ();
            return new XPathToken (XPathParser.DIGITS, buf.toString ());
          }
        buf.append ((char) c);
      }
  }

  /**
   * Reads a name and classifies it as a plain <code>NAME</code> or as one
   * of the reserved words in {@link #keywords}.
   *
   * A reserved word is only reported as such where its context requires:
   * <ul>
   * <li>node types (<code>node</code>, <code>comment</code>,
   * <code>text</code>, <code>processing-instruction</code>) must be
   * followed by <code>(</code>, which is consumed as part of the
   * token;</li>
   * <li>axis names must be followed by <code>::</code>, which is left in
   * the input;</li>
   * <li><code>div</code> and <code>mod</code> are names when they are the
   * first token or follow a token after which an operand is expected.</li>
   * </ul>
   * Spaces and tabs between the word and the following <code>(</code> or
   * <code>::</code> are skipped during look-ahead.
   *
   * @param c the first character of the name, already consumed
   * @return the resulting token
   * @throws IOException if the underlying reader fails
   */
  XPathToken consume_name (int c)
    throws IOException
  {
    CPStringBuilder buf = new CPStringBuilder ();
    buf.append ((char) c);
    while (true)
      {
        in.mark (1);
        c = in.read ();
        if (!isNameChar (c))
          {
            in.reset ();
            break;
          }
        buf.append ((char) c);
      }

    String name = buf.toString ();
    Integer keyword = keywords.get (name);
    if (keyword == null)
      {
        return new XPathToken (XPathParser.NAME, name);
      }

    int val = keyword.intValue ();
    switch (val)
      {
      case XPathParser.NODE:
      case XPathParser.COMMENT:
      case XPathParser.TEXT:
      case XPathParser.PROCESSING_INSTRUCTION:
        // Consume subsequent (
        // The mark is renewed before every read so that the read-ahead
        // limit is never exceeded, however many blanks are skipped. Blanks
        // skipped here would be skipped by advance () anyway.
        do
          {
            in.mark (1);
            c = in.read ();
          }
        while (c == 0x20 || c == 0x09);
        if (c != 0x28)
          {
            in.reset ();
            return new XPathToken (XPathParser.NAME, name);
          }
        break;
      case XPathParser.CHILD:
      case XPathParser.PARENT:
      case XPathParser.SELF:
      case XPathParser.DESCENDANT:
      case XPathParser.ANCESTOR:
      case XPathParser.DESCENDANT_OR_SELF:
      case XPathParser.ANCESTOR_OR_SELF:
      case XPathParser.ATTRIBUTE:
      case XPathParser.NAMESPACE:
      case XPathParser.FOLLOWING:
      case XPathParser.FOLLOWING_SIBLING:
      case XPathParser.PRECEDING:
      case XPathParser.PRECEDING_SIBLING:
        // Check that this is an axis specifier
        // Up to two characters ("::") are read past the mark, hence the
        // read-ahead limit of 2.
        do
          {
            in.mark (2);
            c = in.read ();
          }
        while (c == 0x20 || c == 0x09);
        boolean axis = (c == 0x3a && in.read () == 0x3a);
        // The "::" itself is tokenized by the next advance ()
        in.reset ();
        if (axis)
          {
            return new XPathToken (val);
          }
        return new XPathToken (XPathParser.NAME, name);
      case XPathParser.DIV:
      case XPathParser.MOD:
        // May be a name
        if (nameExpected ())
          {
            return new XPathToken (XPathParser.NAME, name);
          }
        break;
      }
    return new XPathToken (val);
  }

  /**
   * Tells whether the previous token leaves the tokenizer in a position
   * where <code>div</code> or <code>mod</code> must be read as a name
   * rather than as an operator: at the very start of the expression, or
   * after an opening bracket, a comma, <code>@</code>, <code>$</code>,
   * a colon or an operator (including both <code>/</code> and
   * <code>//</code>).
   *
   * @return <code>true</code> if a name is expected at this point
   */
  private boolean nameExpected ()
  {
    if (lastToken == null)
      {
        return true;
      }
    switch (lastToken.type)
      {
      case XPathParser.LP:
      case XPathParser.LB:
      case XPathParser.COMMA:
      case XPathParser.PIPE:
      case XPathParser.EQ:
      case XPathParser.NE:
      case XPathParser.GT:
      case XPathParser.LT:
      case XPathParser.GTE:
      case XPathParser.LTE:
      case XPathParser.PLUS:
      case XPathParser.MINUS:
      case XPathParser.STAR:
      case XPathParser.AT:
      case XPathParser.DOLLAR:
      case XPathParser.COLON:
      case XPathParser.DOUBLE_COLON:
      case XPathParser.DIV:
      case XPathParser.MOD:
      case XPathParser.OR:
      case XPathParser.AND:
      case XPathParser.SLASH:
      case XPathParser.DOUBLE_SLASH: // e.g. "//div" selects div elements
        return true;
      default:
        return false;
      }
  }

  /**
   * Tells whether the given character may appear inside a name after its
   * first character: underscore, hyphen, full stop, ASCII digit, one of
   * the listed combining or extender characters, or any character that
   * {@link Character#isLetter(char)} accepts. The colon is not included.
   *
   * @param c the character to test; <code>-1</code> (end of input) is
   * rejected
   * @return <code>true</code> if the character can continue a name
   */
  boolean isNameChar (int c)
  {
    /* Name */
    return (c == 0x5f
            || c == 0x2d
            || c == 0x2e
            || (c >= 0x30 && c <= 0x39)
            /* CombiningChar */
            || (c >= 0x0300 && c <= 0x0345)
            || (c >= 0x0360 && c <= 0x0361)
            || (c >= 0x0483 && c <= 0x0486)
            || (c >= 0x0591 && c <= 0x05A1)
            || (c >= 0x05A3 && c <= 0x05B9)
            || (c >= 0x05BB && c <= 0x05BD)
            || c == 0x05BF
            || (c >= 0x05C1 && c <= 0x05C2)
            || c == 0x05C4
            || (c >= 0x064B && c <= 0x0652)
            || c == 0x0670
            || (c >= 0x06D6 && c <= 0x06DC)
            || (c >= 0x06DD && c <= 0x06DF)
            || (c >= 0x06E0 && c <= 0x06E4)
            || (c >= 0x06E7 && c <= 0x06E8)
            || (c >= 0x06EA && c <= 0x06ED)
            || (c >= 0x0901 && c <= 0x0903)
            || c == 0x093C
            || (c >= 0x093E && c <= 0x094C)
            || c == 0x094D
            || (c >= 0x0951 && c <= 0x0954)
            || (c >= 0x0962 && c <= 0x0963)
            || (c >= 0x0981 && c <= 0x0983)
            || c == 0x09BC
            || c == 0x09BE
            || c == 0x09BF
            || (c >= 0x09C0 && c <= 0x09C4)
            || (c >= 0x09C7 && c <= 0x09C8)
            || (c >= 0x09CB && c <= 0x09CD)
            || c == 0x09D7
            || (c >= 0x09E2 && c <= 0x09E3)
            || c == 0x0A02
            || c == 0x0A3C
            || c == 0x0A3E
            || c == 0x0A3F
            || (c >= 0x0A40 && c <= 0x0A42)
            || (c >= 0x0A47 && c <= 0x0A48)
            || (c >= 0x0A4B && c <= 0x0A4D)
            || (c >= 0x0A70 && c <= 0x0A71)
            || (c >= 0x0A81 && c <= 0x0A83)
            || c == 0x0ABC
            || (c >= 0x0ABE && c <= 0x0AC5)
            || (c >= 0x0AC7 && c <= 0x0AC9)
            || (c >= 0x0ACB && c <= 0x0ACD)
            || (c >= 0x0B01 && c <= 0x0B03)
            || c == 0x0B3C
            || (c >= 0x0B3E && c <= 0x0B43)
            || (c >= 0x0B47 && c <= 0x0B48)
            || (c >= 0x0B4B && c <= 0x0B4D)
            || (c >= 0x0B56 && c <= 0x0B57)
            || (c >= 0x0B82 && c <= 0x0B83)
            || (c >= 0x0BBE && c <= 0x0BC2)
            || (c >= 0x0BC6 && c <= 0x0BC8)
            || (c >= 0x0BCA && c <= 0x0BCD)
            || c == 0x0BD7
            || (c >= 0x0C01 && c <= 0x0C03)
            || (c >= 0x0C3E && c <= 0x0C44)
            || (c >= 0x0C46 && c <= 0x0C48)
            || (c >= 0x0C4A && c <= 0x0C4D)
            || (c >= 0x0C55 && c <= 0x0C56)
            || (c >= 0x0C82 && c <= 0x0C83)
            || (c >= 0x0CBE && c <= 0x0CC4)
            || (c >= 0x0CC6 && c <= 0x0CC8)
            || (c >= 0x0CCA && c <= 0x0CCD)
            || (c >= 0x0CD5 && c <= 0x0CD6)
            || (c >= 0x0D02 && c <= 0x0D03)
            || (c >= 0x0D3E && c <= 0x0D43)
            || (c >= 0x0D46 && c <= 0x0D48)
            || (c >= 0x0D4A && c <= 0x0D4D)
            || c == 0x0D57
            || c == 0x0E31
            || (c >= 0x0E34 && c <= 0x0E3A)
            || (c >= 0x0E47 && c <= 0x0E4E)
            || c == 0x0EB1
            || (c >= 0x0EB4 && c <= 0x0EB9)
            || (c >= 0x0EBB && c <= 0x0EBC)
            || (c >= 0x0EC8 && c <= 0x0ECD)
            || (c >= 0x0F18 && c <= 0x0F19)
            || c == 0x0F35
            || c == 0x0F37
            || c == 0x0F39
            || c == 0x0F3E
            || c == 0x0F3F
            || (c >= 0x0F71 && c <= 0x0F84)
            || (c >= 0x0F86 && c <= 0x0F8B)
            || (c >= 0x0F90 && c <= 0x0F95)
            || c == 0x0F97
            || (c >= 0x0F99 && c <= 0x0FAD)
            || (c >= 0x0FB1 && c <= 0x0FB7)
            || c == 0x0FB9
            || (c >= 0x20D0 && c <= 0x20DC)
            || c == 0x20E1
            || (c >= 0x302A && c <= 0x302F)
            || c == 0x3099
            || c == 0x309A
            /* Extender */
            || c == 0x00B7
            || c == 0x02D0
            || c == 0x02D1
            || c == 0x0387
            || c == 0x0640
            || c == 0x0E46
            || c == 0x0EC6
            || c == 0x3005
            || (c >= 0x3031 && c <= 0x3035)
            || (c >= 0x309D && c <= 0x309E)
            || (c >= 0x30FC && c <= 0x30FE)
            /* Name */
            || Character.isLetter ((char) c));
  }

}