1 /* 2 * Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved. 3 */ 4 /* 5 * Licensed to the Apache Software Foundation (ASF) under one or more 6 * contributor license agreements. See the NOTICE file distributed with 7 * this work for additional information regarding copyright ownership. 8 * The ASF licenses this file to You under the Apache License, Version 2.0 9 * (the "License"); you may not use this file except in compliance with 10 * the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 package com.sun.org.apache.xerces.internal.impl.xpath.regex; 22 23 import java.util.ArrayList; 24 import java.util.Locale; 25 import java.util.MissingResourceException; 26 import java.util.ResourceBundle; 27 import jdk.xml.internal.SecuritySupport; 28 29 /** 30 * A Regular Expression Parser. 31 * 32 * @xerces.internal 33 * 34 * @LastModified: Sep 2017 35 */ 36 class RegexParser { 37 static final int T_CHAR = 0; 38 static final int T_EOF = 1; 39 static final int T_OR = 2; // '|' 40 static final int T_STAR = 3; // '*' 41 static final int T_PLUS = 4; // '+' 42 static final int T_QUESTION = 5; // '?' 43 static final int T_LPAREN = 6; // '(' 44 static final int T_RPAREN = 7; // ')' 45 static final int T_DOT = 8; // '.' 46 static final int T_LBRACKET = 9; // '[' 47 static final int T_BACKSOLIDUS = 10; // '\' 48 static final int T_CARET = 11; // '^' 49 static final int T_DOLLAR = 12; // '$' 50 static final int T_LPAREN2 = 13; // '(?:' 51 static final int T_LOOKAHEAD = 14; // '(?=' 52 static final int T_NEGATIVELOOKAHEAD = 15; // '(?!' 53 static final int T_LOOKBEHIND = 16; // '(?<=' 54 static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!' 55 static final int T_INDEPENDENT = 18; // '(?>' 56 static final int T_SET_OPERATIONS = 19; // '(?[' 57 static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class 58 static final int T_COMMENT = 21; // '(?#' 59 static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z] 60 static final int T_CONDITION = 23; // '(?(' 61 static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class 62 63 static class ReferencePosition { 64 int refNumber; 65 int position; ReferencePosition(int n, int pos)66 ReferencePosition(int n, int pos) { 67 this.refNumber = n; 68 this.position = pos; 69 } 70 } 71 72 int offset; 73 String regex; 74 int regexlen; 75 int options; 76 ResourceBundle resources; 77 int chardata; 78 int nexttoken; 79 static protected final int S_NORMAL = 0; 80 static protected final int S_INBRACKETS = 1; 81 static protected final int S_INXBRACKETS = 2; 82 int context = S_NORMAL; 83 int parenOpened = 1; 84 int parennumber = 1; 85 boolean hasBackReferences; 86 ArrayList<ReferencePosition> references = null; 87 RegexParser()88 public RegexParser() { 89 this.setLocale(Locale.getDefault()); 90 } RegexParser(Locale locale)91 public RegexParser(Locale locale) { 92 this.setLocale(locale); 93 } 94 setLocale(Locale locale)95 public void setLocale(Locale locale) { 96 try { 97 if (locale != null) { 98 this.resources = SecuritySupport.getResourceBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message", locale); 99 } 100 else { 101 this.resources = SecuritySupport.getResourceBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message"); 102 } 103 } 104 catch (MissingResourceException mre) { 105 throw new RuntimeException("Installation Problem??? Couldn't load messages: " 106 + mre.getMessage()); 107 } 108 } 109 ex(String key, int loc)110 final ParseException ex(String key, int loc) { 111 return new ParseException(this.resources.getString(key), loc); 112 } 113 isSet(int flag)114 protected final boolean isSet(int flag) { 115 return (this.options & flag) == flag; 116 } 117 parse(String regex, int options)118 Token parse(String regex, int options) throws ParseException { 119 this.options = options; 120 this.offset = 0; 121 this.setContext(S_NORMAL); 122 this.parennumber = 1; 123 this.parenOpened = 1; 124 this.hasBackReferences = false; 125 this.regex = regex; 126 if (this.isSet(RegularExpression.EXTENDED_COMMENT)) 127 this.regex = REUtil.stripExtendedComment(this.regex); 128 this.regexlen = this.regex.length(); 129 130 131 this.next(); 132 Token ret = this.parseRegex(); 133 if (this.offset != this.regexlen) 134 throw ex("parser.parse.1", this.offset); 135 if (this.read() != T_EOF) { 136 throw ex("parser.parse.1", this.offset-1); 137 } 138 if (this.references != null) { 139 for (int i = 0; i < this.references.size(); i ++) { 140 ReferencePosition position = this.references.get(i); 141 if (this.parennumber <= position.refNumber) 142 throw ex("parser.parse.2", position.position); 143 } 144 this.references.clear(); 145 } 146 return ret; 147 } 148 149 /* 150 public RegularExpression createRegex(String regex, int options) throws ParseException { 151 Token tok = this.parse(regex, options); 152 return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options); 153 } 154 */ 155 setContext(int con)156 protected final void setContext(int con) { 157 this.context = con; 158 } 159 read()160 final int read() { 161 return this.nexttoken; 162 } 163 164 @SuppressWarnings("fallthrough") next()165 final void next() { 166 if (this.offset >= this.regexlen) { 167 this.chardata = -1; 168 this.nexttoken = T_EOF; 169 return; 170 } 171 172 int ret; 173 int ch = this.regex.charAt(this.offset++); 174 this.chardata = ch; 175 176 if (this.context == S_INBRACKETS) { 177 // In a character class, this.chardata has one character, that is to say, 178 // a pair of surrogates is composed and stored to this.chardata. 179 switch (ch) { 180 case '\\': 181 ret = T_BACKSOLIDUS; 182 if (this.offset >= this.regexlen) 183 throw ex("parser.next.1", this.offset-1); 184 this.chardata = this.regex.charAt(this.offset++); 185 break; 186 187 case '-': 188 // Allow character class subtraction (regardless of whether we are in 189 // XML Schema mode or not) 190 if (this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') { 191 this.offset++; 192 ret = T_XMLSCHEMA_CC_SUBTRACTION; 193 } else 194 ret = T_CHAR; 195 break; 196 197 case '[': 198 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE) 199 && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') { 200 this.offset++; 201 ret = T_POSIX_CHARCLASS_START; 202 break; 203 } // Through down 204 default: 205 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) { 206 int low = this.regex.charAt(this.offset); 207 if (REUtil.isLowSurrogate(low)) { 208 this.chardata = REUtil.composeFromSurrogates(ch, low); 209 this.offset ++; 210 } 211 } 212 ret = T_CHAR; 213 } 214 this.nexttoken = ret; 215 return; 216 } 217 218 switch (ch) { 219 case '|': ret = T_OR; break; 220 case '*': ret = T_STAR; break; 221 case '+': ret = T_PLUS; break; 222 case '?': ret = T_QUESTION; break; 223 case ')': ret = T_RPAREN; break; 224 case '.': ret = T_DOT; break; 225 case '[': ret = T_LBRACKET; break; 226 case '^': 227 if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) { 228 ret = T_CHAR; 229 } 230 else { 231 ret = T_CARET; 232 } 233 break; 234 case '$': 235 if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) { 236 ret = T_CHAR; 237 } 238 else { 239 ret = T_DOLLAR; 240 } 241 break; 242 case '(': 243 ret = T_LPAREN; 244 if (this.offset >= this.regexlen) 245 break; 246 if (this.regex.charAt(this.offset) != '?') 247 break; 248 if (++this.offset >= this.regexlen) 249 throw ex("parser.next.2", this.offset-1); 250 ch = this.regex.charAt(this.offset++); 251 switch (ch) { 252 case ':': ret = T_LPAREN2; break; 253 case '=': ret = T_LOOKAHEAD; break; 254 case '!': ret = T_NEGATIVELOOKAHEAD; break; 255 case '[': ret = T_SET_OPERATIONS; break; 256 case '>': ret = T_INDEPENDENT; break; 257 case '<': 258 if (this.offset >= this.regexlen) 259 throw ex("parser.next.2", this.offset-3); 260 ch = this.regex.charAt(this.offset++); 261 if (ch == '=') { 262 ret = T_LOOKBEHIND; 263 } else if (ch == '!') { 264 ret = T_NEGATIVELOOKBEHIND; 265 } else 266 throw ex("parser.next.3", this.offset-3); 267 break; 268 case '#': 269 while (this.offset < this.regexlen) { 270 ch = this.regex.charAt(this.offset++); 271 if (ch == ')') break; 272 } 273 if (ch != ')') 274 throw ex("parser.next.4", this.offset-1); 275 ret = T_COMMENT; 276 break; 277 default: 278 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options 279 this.offset --; 280 ret = T_MODIFIERS; 281 break; 282 } else if (ch == '(') { // conditional 283 ret = T_CONDITION; // this.offsets points the next of '('. 284 break; 285 } 286 throw ex("parser.next.2", this.offset-2); 287 } 288 break; 289 290 case '\\': 291 ret = T_BACKSOLIDUS; 292 if (this.offset >= this.regexlen) 293 throw ex("parser.next.1", this.offset-1); 294 this.chardata = this.regex.charAt(this.offset++); 295 break; 296 297 default: 298 ret = T_CHAR; 299 } 300 this.nexttoken = ret; 301 } 302 303 /** 304 * regex ::= term (`|` term)* 305 * term ::= factor+ 306 * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' 307 * | atom (('*' | '+' | '?' | minmax ) '?'? )?) 308 * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' 309 * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9] 310 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block 311 */ parseRegex()312 Token parseRegex() throws ParseException { 313 Token tok = this.parseTerm(); 314 Token parent = null; 315 while (this.read() == T_OR) { 316 this.next(); // '|' 317 if (parent == null) { 318 parent = Token.createUnion(); 319 parent.addChild(tok); 320 tok = parent; 321 } 322 tok.addChild(this.parseTerm()); 323 } 324 return tok; 325 } 326 327 /** 328 * term ::= factor+ 329 */ parseTerm()330 Token parseTerm() throws ParseException { 331 int ch = this.read(); 332 if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) { 333 return Token.createEmpty(); 334 } else { 335 Token tok = this.parseFactor(); 336 Token concat = null; 337 while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) { 338 if (concat == null) { 339 concat = Token.createConcat(); 340 concat.addChild(tok); 341 tok = concat; 342 } 343 concat.addChild(this.parseFactor()); 344 //tok = Token.createConcat(tok, this.parseFactor()); 345 } 346 return tok; 347 } 348 } 349 350 // ---------------------------------------------------------------- 351 processCaret()352 Token processCaret() throws ParseException { 353 this.next(); 354 return Token.token_linebeginning; 355 } processDollar()356 Token processDollar() throws ParseException { 357 this.next(); 358 return Token.token_lineend; 359 } processLookahead()360 Token processLookahead() throws ParseException { 361 this.next(); 362 Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex()); 363 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 364 this.next(); // ')' 365 return tok; 366 } processNegativelookahead()367 Token processNegativelookahead() throws ParseException { 368 this.next(); 369 Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex()); 370 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 371 this.next(); // ')' 372 return tok; 373 } processLookbehind()374 Token processLookbehind() throws ParseException { 375 this.next(); 376 Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex()); 377 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 378 this.next(); // ')' 379 return tok; 380 } processNegativelookbehind()381 Token processNegativelookbehind() throws ParseException { 382 this.next(); 383 Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex()); 384 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 385 this.next(); // ')' 386 return tok; 387 } processBacksolidus_A()388 Token processBacksolidus_A() throws ParseException { 389 this.next(); 390 return Token.token_stringbeginning; 391 } processBacksolidus_Z()392 Token processBacksolidus_Z() throws ParseException { 393 this.next(); 394 return Token.token_stringend2; 395 } processBacksolidus_z()396 Token processBacksolidus_z() throws ParseException { 397 this.next(); 398 return Token.token_stringend; 399 } processBacksolidus_b()400 Token processBacksolidus_b() throws ParseException { 401 this.next(); 402 return Token.token_wordedge; 403 } processBacksolidus_B()404 Token processBacksolidus_B() throws ParseException { 405 this.next(); 406 return Token.token_not_wordedge; 407 } processBacksolidus_lt()408 Token processBacksolidus_lt() throws ParseException { 409 this.next(); 410 return Token.token_wordbeginning; 411 } processBacksolidus_gt()412 Token processBacksolidus_gt() throws ParseException { 413 this.next(); 414 return Token.token_wordend; 415 } processStar(Token tok)416 Token processStar(Token tok) throws ParseException { 417 this.next(); 418 if (this.read() == T_QUESTION) { 419 this.next(); 420 return Token.createNGClosure(tok); 421 } else 422 return Token.createClosure(tok); 423 } processPlus(Token tok)424 Token processPlus(Token tok) throws ParseException { 425 // X+ -> XX* 426 this.next(); 427 if (this.read() == T_QUESTION) { 428 this.next(); 429 return Token.createConcat(tok, Token.createNGClosure(tok)); 430 } else 431 return Token.createConcat(tok, Token.createClosure(tok)); 432 } processQuestion(Token tok)433 Token processQuestion(Token tok) throws ParseException { 434 // X? -> X| 435 this.next(); 436 Token par = Token.createUnion(); 437 if (this.read() == T_QUESTION) { 438 this.next(); 439 par.addChild(Token.createEmpty()); 440 par.addChild(tok); 441 } else { 442 par.addChild(tok); 443 par.addChild(Token.createEmpty()); 444 } 445 return par; 446 } checkQuestion(int off)447 boolean checkQuestion(int off) { 448 return off < this.regexlen && this.regex.charAt(off) == '?'; 449 } processParen()450 Token processParen() throws ParseException { 451 this.next(); 452 int p = this.parenOpened++; 453 Token tok = Token.createParen(this.parseRegex(), p); 454 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 455 this.parennumber++; 456 this.next(); // Skips ')' 457 return tok; 458 } processParen2()459 Token processParen2() throws ParseException { 460 this.next(); 461 Token tok = Token.createParen(this.parseRegex(), 0); 462 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 463 this.next(); // Skips ')' 464 return tok; 465 } processCondition()466 Token processCondition() throws ParseException { 467 // this.offset points the next of '(' 468 if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset); 469 // Parses a condition. 470 int refno = -1; 471 Token condition = null; 472 int ch = this.regex.charAt(this.offset); 473 if ('1' <= ch && ch <= '9') { 474 refno = ch-'0'; 475 int finalRefno = refno; 476 477 if (this.parennumber <= refno) 478 throw ex("parser.parse.2", this.offset); 479 480 while (this.offset + 1 < this.regexlen) { 481 ch = this.regex.charAt(this.offset + 1); 482 if ('0' <= ch && ch <= '9') { 483 refno = (refno * 10) + (ch - '0'); 484 if (refno < this.parennumber) { 485 finalRefno= refno; 486 ++this.offset; 487 } 488 else { 489 break; 490 } 491 } 492 else { 493 break; 494 } 495 } 496 497 this.hasBackReferences = true; 498 if (this.references == null) this.references = new ArrayList<>(); 499 this.references.add(new ReferencePosition(finalRefno, this.offset)); 500 this.offset ++; 501 if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset); 502 this.offset ++; 503 } else { 504 if (ch == '?') this.offset --; // Points '('. 505 this.next(); 506 condition = this.parseFactor(); 507 switch (condition.type) { 508 case Token.LOOKAHEAD: 509 case Token.NEGATIVELOOKAHEAD: 510 case Token.LOOKBEHIND: 511 case Token.NEGATIVELOOKBEHIND: 512 break; 513 case Token.ANCHOR: 514 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 515 break; 516 default: 517 throw ex("parser.factor.5", this.offset); 518 } 519 } 520 // Parses yes/no-patterns. 521 this.next(); 522 Token yesPattern = this.parseRegex(); 523 Token noPattern = null; 524 if (yesPattern.type == Token.UNION) { 525 if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset); 526 noPattern = yesPattern.getChild(1); 527 yesPattern = yesPattern.getChild(0); 528 } 529 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 530 this.next(); 531 return Token.createCondition(refno, condition, yesPattern, noPattern); 532 } processModifiers()533 Token processModifiers() throws ParseException { 534 // this.offset points the next of '?'. 535 // modifiers ::= [imsw]* ('-' [imsw]*)? ':' 536 int add = 0, mask = 0, ch = -1; 537 while (this.offset < this.regexlen) { 538 ch = this.regex.charAt(this.offset); 539 int v = REUtil.getOptionValue(ch); 540 if (v == 0) break; // '-' or ':'? 541 add |= v; 542 this.offset ++; 543 } 544 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); 545 if (ch == '-') { 546 this.offset ++; 547 while (this.offset < this.regexlen) { 548 ch = this.regex.charAt(this.offset); 549 int v = REUtil.getOptionValue(ch); 550 if (v == 0) break; // ':'? 551 mask |= v; 552 this.offset ++; 553 } 554 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); 555 } 556 Token tok; 557 if (ch == ':') { 558 this.offset ++; 559 this.next(); 560 tok = Token.createModifierGroup(this.parseRegex(), add, mask); 561 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 562 this.next(); 563 } else if (ch == ')') { // such as (?-i) 564 this.offset ++; 565 this.next(); 566 tok = Token.createModifierGroup(this.parseRegex(), add, mask); 567 } else 568 throw ex("parser.factor.3", this.offset); 569 570 return tok; 571 } processIndependent()572 Token processIndependent() throws ParseException { 573 this.next(); 574 Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex()); 575 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 576 this.next(); // Skips ')' 577 return tok; 578 } processBacksolidus_c()579 Token processBacksolidus_c() throws ParseException { 580 int ch2; // Must be in 0x0040-0x005f 581 if (this.offset >= this.regexlen 582 || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040) 583 throw ex("parser.atom.1", this.offset-1); 584 this.next(); 585 return Token.createChar(ch2-0x40); 586 } processBacksolidus_C()587 Token processBacksolidus_C() throws ParseException { 588 throw ex("parser.process.1", this.offset); 589 } processBacksolidus_i()590 Token processBacksolidus_i() throws ParseException { 591 Token tok = Token.createChar('i'); 592 this.next(); 593 return tok; 594 } processBacksolidus_I()595 Token processBacksolidus_I() throws ParseException { 596 throw ex("parser.process.1", this.offset); 597 } processBacksolidus_g()598 Token processBacksolidus_g() throws ParseException { 599 this.next(); 600 return Token.getGraphemePattern(); 601 } processBacksolidus_X()602 Token processBacksolidus_X() throws ParseException { 603 this.next(); 604 return Token.getCombiningCharacterSequence(); 605 } processBackreference()606 Token processBackreference() throws ParseException { 607 int refnum = this.chardata-'0'; 608 int finalRefnum = refnum; 609 610 if (this.parennumber <= refnum) 611 throw ex("parser.parse.2", this.offset-2); 612 613 while (this.offset < this.regexlen) { 614 final int ch = this.regex.charAt(this.offset); 615 if ('0' <= ch && ch <= '9') { 616 refnum = (refnum * 10) + (ch - '0'); 617 if (refnum < this.parennumber) { 618 ++this.offset; 619 finalRefnum = refnum; 620 this.chardata = ch; 621 } 622 else { 623 break; 624 } 625 } 626 else { 627 break; 628 } 629 } 630 631 Token tok = Token.createBackReference(finalRefnum); 632 this.hasBackReferences = true; 633 if (this.references == null) this.references = new ArrayList<>(); 634 this.references.add(new ReferencePosition(finalRefnum, this.offset-2)); 635 this.next(); 636 return tok; 637 } 638 639 // ---------------------------------------------------------------- 640 641 /** 642 * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' 643 * | atom (('*' | '+' | '?' | minmax ) '?'? )?) 644 * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' 645 * | '(?#' [^)]* ')' 646 * minmax ::= '{' min (',' max?)? '}' 647 * min ::= [0-9]+ 648 * max ::= [0-9]+ 649 */ parseFactor()650 Token parseFactor() throws ParseException { 651 int ch = this.read(); 652 Token tok; 653 switch (ch) { 654 case T_CARET: return this.processCaret(); 655 case T_DOLLAR: return this.processDollar(); 656 case T_LOOKAHEAD: return this.processLookahead(); 657 case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead(); 658 case T_LOOKBEHIND: return this.processLookbehind(); 659 case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind(); 660 661 case T_COMMENT: 662 this.next(); 663 return Token.createEmpty(); 664 665 case T_BACKSOLIDUS: 666 switch (this.chardata) { 667 case 'A': return this.processBacksolidus_A(); 668 case 'Z': return this.processBacksolidus_Z(); 669 case 'z': return this.processBacksolidus_z(); 670 case 'b': return this.processBacksolidus_b(); 671 case 'B': return this.processBacksolidus_B(); 672 case '<': return this.processBacksolidus_lt(); 673 case '>': return this.processBacksolidus_gt(); 674 } 675 // through down 676 } 677 tok = this.parseAtom(); 678 ch = this.read(); 679 switch (ch) { 680 case T_STAR: return this.processStar(tok); 681 case T_PLUS: return this.processPlus(tok); 682 case T_QUESTION: return this.processQuestion(tok); 683 case T_CHAR: 684 if (this.chardata == '{' && this.offset < this.regexlen) { 685 686 int off = this.offset; // this.offset -> next of '{' 687 int min = 0, max = -1; 688 689 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 690 691 min = ch -'0'; 692 while (off < this.regexlen 693 && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 694 min = min*10 +ch-'0'; 695 if (min < 0) 696 throw ex("parser.quantifier.5", this.offset); 697 } 698 } 699 else { 700 throw ex("parser.quantifier.1", this.offset); 701 } 702 703 max = min; 704 if (ch == ',') { 705 706 if (off >= this.regexlen) { 707 throw ex("parser.quantifier.3", this.offset); 708 } 709 else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 710 711 max = ch -'0'; // {min,max} 712 while (off < this.regexlen 713 && (ch = this.regex.charAt(off++)) >= '0' 714 && ch <= '9') { 715 max = max*10 +ch-'0'; 716 if (max < 0) 717 throw ex("parser.quantifier.5", this.offset); 718 } 719 720 if (min > max) 721 throw ex("parser.quantifier.4", this.offset); 722 } 723 else { // assume {min,} 724 max = -1; 725 } 726 } 727 728 if (ch != '}') 729 throw ex("parser.quantifier.2", this.offset); 730 731 if (this.checkQuestion(off)) { // off -> next of '}' 732 tok = Token.createNGClosure(tok); 733 this.offset = off+1; 734 } else { 735 tok = Token.createClosure(tok); 736 this.offset = off; 737 } 738 739 tok.setMin(min); 740 tok.setMax(max); 741 //System.err.println("CLOSURE: "+min+", "+max); 742 this.next(); 743 } 744 } 745 return tok; 746 } 747 748 /** 749 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] 750 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block 751 * | '(?>' regex ')' 752 * char ::= '\\' | '\' [efnrt] | bmp-code | character-1 753 */ parseAtom()754 Token parseAtom() throws ParseException { 755 int ch = this.read(); 756 Token tok = null; 757 switch (ch) { 758 case T_LPAREN: return this.processParen(); 759 case T_LPAREN2: return this.processParen2(); // '(?:' 760 case T_CONDITION: return this.processCondition(); // '(?(' 761 case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... ) 762 case T_INDEPENDENT: return this.processIndependent(); 763 case T_DOT: 764 this.next(); // Skips '.' 765 tok = Token.token_dot; 766 break; 767 768 /** 769 * char-class ::= '[' ( '^'? range ','?)+ ']' 770 * range ::= '\d' | '\w' | '\s' | category-block | range-char 771 * | range-char '-' range-char 772 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 773 * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 774 */ 775 case T_LBRACKET: return this.parseCharacterClass(true); 776 case T_SET_OPERATIONS: return this.parseSetOperations(); 777 778 case T_BACKSOLIDUS: 779 switch (this.chardata) { 780 case 'd': case 'D': 781 case 'w': case 'W': 782 case 's': case 'S': 783 tok = this.getTokenForShorthand(this.chardata); 784 this.next(); 785 return tok; 786 787 case 'e': case 'f': case 'n': case 'r': 788 case 't': case 'u': case 'v': case 'x': 789 { 790 int ch2 = this.decodeEscaped(); 791 if (ch2 < 0x10000) { 792 tok = Token.createChar(ch2); 793 } else { 794 tok = Token.createString(REUtil.decomposeToSurrogates(ch2)); 795 } 796 } 797 break; 798 799 case 'c': return this.processBacksolidus_c(); 800 case 'C': return this.processBacksolidus_C(); 801 case 'i': return this.processBacksolidus_i(); 802 case 'I': return this.processBacksolidus_I(); 803 case 'g': return this.processBacksolidus_g(); 804 case 'X': return this.processBacksolidus_X(); 805 case '1': case '2': case '3': case '4': 806 case '5': case '6': case '7': case '8': case '9': 807 return this.processBackreference(); 808 809 case 'P': 810 case 'p': 811 int pstart = this.offset; 812 tok = processBacksolidus_pP(this.chardata); 813 if (tok == null) throw this.ex("parser.atom.5", pstart); 814 break; 815 816 default: 817 tok = Token.createChar(this.chardata); 818 } 819 this.next(); 820 break; 821 822 case T_CHAR: 823 if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}') 824 throw this.ex("parser.atom.4", this.offset-1); 825 tok = Token.createChar(this.chardata); 826 int high = this.chardata; 827 this.next(); 828 if (REUtil.isHighSurrogate(high) 829 && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) { 830 char[] sur = new char[2]; 831 sur[0] = (char)high; 832 sur[1] = (char)this.chardata; 833 tok = Token.createParen(Token.createString(new String(sur)), 0); 834 this.next(); 835 } 836 break; 837 838 default: 839 throw this.ex("parser.atom.4", this.offset-1); 840 } 841 return tok; 842 } 843 processBacksolidus_pP(int c)844 protected RangeToken processBacksolidus_pP(int c) throws ParseException { 845 846 this.next(); 847 if (this.read() != T_CHAR || this.chardata != '{') 848 throw this.ex("parser.atom.2", this.offset-1); 849 850 // handle category escape 851 boolean positive = c == 'p'; 852 int namestart = this.offset; 853 int nameend = this.regex.indexOf('}', namestart); 854 855 if (nameend < 0) 856 throw this.ex("parser.atom.3", this.offset); 857 858 String pname = this.regex.substring(namestart, nameend); 859 this.offset = nameend+1; 860 861 return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE)); 862 } 863 processCIinCharacterClass(RangeToken tok, int c)864 int processCIinCharacterClass(RangeToken tok, int c) { 865 return this.decodeEscaped(); 866 } 867 868 /** 869 * char-class ::= '[' ( '^'? range ','?)+ ']' 870 * range ::= '\d' | '\w' | '\s' | category-block | range-char 871 * | range-char '-' range-char 872 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 873 * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 874 */ parseCharacterClass(boolean useNrange)875 protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException { 876 this.setContext(S_INBRACKETS); 877 this.next(); // '[' 878 boolean nrange = false; 879 RangeToken base = null; 880 RangeToken tok; 881 if (this.read() == T_CHAR && this.chardata == '^') { 882 nrange = true; 883 this.next(); // '^' 884 if (useNrange) { 885 tok = Token.createNRange(); 886 } else { 887 base = Token.createRange(); 888 base.addRange(0, Token.UTF16_MAX); 889 tok = Token.createRange(); 890 } 891 } else { 892 tok = Token.createRange(); 893 } 894 int type; 895 boolean firstloop = true; 896 while ((type = this.read()) != T_EOF) { 897 if (type == T_CHAR && this.chardata == ']' && !firstloop) 898 break; 899 int c = this.chardata; 900 boolean end = false; 901 if (type == T_BACKSOLIDUS) { 902 switch (c) { 903 case 'd': case 'D': 904 case 'w': case 'W': 905 case 's': case 'S': 906 tok.mergeRanges(this.getTokenForShorthand(c)); 907 end = true; 908 break; 909 910 case 'i': case 'I': 911 case 'c': case 'C': 912 c = this.processCIinCharacterClass(tok, c); 913 if (c < 0) end = true; 914 break; 915 916 case 'p': 917 case 'P': 918 int pstart = this.offset; 919 RangeToken tok2 = this.processBacksolidus_pP(c); 920 if (tok2 == null) throw this.ex("parser.atom.5", pstart); 921 tok.mergeRanges(tok2); 922 end = true; 923 break; 924 925 default: 926 c = this.decodeEscaped(); 927 } // \ + c 928 } // backsolidus 929 // POSIX Character class such as [:alnum:] 930 else if (type == T_POSIX_CHARCLASS_START) { 931 int nameend = this.regex.indexOf(':', this.offset); 932 if (nameend < 0) throw this.ex("parser.cc.1", this.offset); 933 boolean positive = true; 934 if (this.regex.charAt(this.offset) == '^') { 935 this.offset ++; 936 positive = false; 937 } 938 String name = this.regex.substring(this.offset, nameend); 939 RangeToken range = Token.getRange(name, positive, 940 this.isSet(RegularExpression.XMLSCHEMA_MODE)); 941 if (range == null) throw this.ex("parser.cc.3", this.offset); 942 tok.mergeRanges(range); 943 end = true; 944 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']') 945 throw this.ex("parser.cc.1", nameend); 946 this.offset = nameend+2; 947 } 948 else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) { 949 if (nrange) { 950 nrange = false; 951 if (useNrange) { 952 tok = (RangeToken) Token.complementRanges(tok); 953 } 954 else { 955 base.subtractRanges(tok); 956 tok = base; 957 } 958 } 959 RangeToken range2 = this.parseCharacterClass(false); 960 tok.subtractRanges(range2); 961 if (this.read() != T_CHAR || this.chardata != ']') { 962 throw this.ex("parser.cc.5", this.offset); 963 } 964 break; // Exit this loop 965 } 966 this.next(); 967 if (!end) { // if not shorthands... 968 if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'. 969 if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) { 970 tok.addRange(c, c); 971 } 972 else { 973 addCaseInsensitiveChar(tok, c); 974 } 975 } 976 else if (type == T_XMLSCHEMA_CC_SUBTRACTION) { 977 throw this.ex("parser.cc.8", this.offset-1); 978 } 979 else { 980 this.next(); // Skips '-' 981 if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset); 982 if (type == T_CHAR && this.chardata == ']') { 983 if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) { 984 tok.addRange(c, c); 985 } 986 else { 987 addCaseInsensitiveChar(tok, c); 988 } 989 tok.addRange('-', '-'); 990 } else { 991 int rangeend = this.chardata; 992 if (type == T_BACKSOLIDUS) { 993 rangeend = this.decodeEscaped(); 994 } 995 this.next(); 996 if (c > rangeend) { 997 throw this.ex("parser.ope.3", this.offset-1); 998 } 999 if (!this.isSet(RegularExpression.IGNORE_CASE) || 1000 (c > 0xffff && rangeend > 0xffff)) { 1001 tok.addRange(c, rangeend); 1002 } 1003 else { 1004 addCaseInsensitiveCharRange(tok, c, rangeend); 1005 } 1006 } 1007 } 1008 } 1009 if (this.isSet(RegularExpression.SPECIAL_COMMA) 1010 && this.read() == T_CHAR && this.chardata == ',') { 1011 this.next(); 1012 } 1013 firstloop = false; 1014 } 1015 if (this.read() == T_EOF) { 1016 throw this.ex("parser.cc.2", this.offset); 1017 } 1018 1019 if (!useNrange && nrange) { 1020 base.subtractRanges(tok); 1021 tok = base; 1022 } 1023 tok.sortRanges(); 1024 tok.compactRanges(); 1025 this.setContext(S_NORMAL); 1026 this.next(); // Skips ']' 1027 1028 return tok; 1029 } 1030 1031 /** 1032 * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')' 1033 */ parseSetOperations()1034 protected RangeToken parseSetOperations() throws ParseException { 1035 RangeToken tok = this.parseCharacterClass(false); 1036 int type; 1037 while ((type = this.read()) != T_RPAREN) { 1038 int ch = this.chardata; 1039 if (type == T_CHAR && (ch == '-' || ch == '&') 1040 || type == T_PLUS) { 1041 this.next(); 1042 if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1); 1043 RangeToken t2 = this.parseCharacterClass(false); 1044 if (type == T_PLUS) 1045 tok.mergeRanges(t2); 1046 else if (ch == '-') 1047 tok.subtractRanges(t2); 1048 else if (ch == '&') 1049 tok.intersectRanges(t2); 1050 else 1051 throw new RuntimeException("ASSERT"); 1052 } else { 1053 throw ex("parser.ope.2", this.offset-1); 1054 } 1055 } 1056 this.next(); 1057 return tok; 1058 } 1059 getTokenForShorthand(int ch)1060 Token getTokenForShorthand(int ch) { 1061 Token tok; 1062 switch (ch) { 1063 case 'd': 1064 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1065 ? Token.getRange("Nd", true) : Token.token_0to9; 1066 break; 1067 case 'D': 1068 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1069 ? Token.getRange("Nd", false) : Token.token_not_0to9; 1070 break; 1071 case 'w': 1072 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1073 ? Token.getRange("IsWord", true) : Token.token_wordchars; 1074 break; 1075 case 'W': 1076 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1077 ? Token.getRange("IsWord", false) : Token.token_not_wordchars; 1078 break; 1079 case 's': 1080 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1081 ? Token.getRange("IsSpace", true) : Token.token_spaces; 1082 break; 1083 case 'S': 1084 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1085 ? Token.getRange("IsSpace", false) : Token.token_not_spaces; 1086 break; 1087 1088 default: 1089 throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16)); 1090 } 1091 return tok; 1092 } 1093 1094 /** 1095 */ decodeEscaped()1096 int decodeEscaped() throws ParseException { 1097 if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1); 1098 int c = this.chardata; 1099 switch (c) { 1100 case 'e': c = 0x1b; break; // ESCAPE U+001B 1101 case 'f': c = '\f'; break; // FORM FEED U+000C 1102 case 'n': c = '\n'; break; // LINE FEED U+000A 1103 case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D 1104 case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009 1105 //case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B 1106 case 'x': 1107 this.next(); 1108 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); 1109 if (this.chardata == '{') { 1110 int v1 = 0; 1111 int uv = 0; 1112 do { 1113 this.next(); 1114 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); 1115 if ((v1 = hexChar(this.chardata)) < 0) 1116 break; 1117 if (uv > uv*16) throw ex("parser.descape.2", this.offset-1); 1118 uv = uv*16+v1; 1119 } while (true); 1120 if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1); 1121 if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1); 1122 c = uv; 1123 } else { 1124 int v1 = 0; 1125 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1126 throw ex("parser.descape.1", this.offset-1); 1127 int uv = v1; 1128 this.next(); 1129 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1130 throw ex("parser.descape.1", this.offset-1); 1131 uv = uv*16+v1; 1132 c = uv; 1133 } 1134 break; 1135 1136 case 'u': 1137 int v1 = 0; 1138 this.next(); 1139 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1140 throw ex("parser.descape.1", this.offset-1); 1141 int uv = v1; 1142 this.next(); 1143 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1144 throw ex("parser.descape.1", this.offset-1); 1145 uv = uv*16+v1; 1146 this.next(); 1147 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1148 throw ex("parser.descape.1", this.offset-1); 1149 uv = uv*16+v1; 1150 this.next(); 1151 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1152 throw ex("parser.descape.1", this.offset-1); 1153 uv = uv*16+v1; 1154 c = uv; 1155 break; 1156 1157 case 'v': 1158 this.next(); 1159 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1160 throw ex("parser.descape.1", this.offset-1); 1161 uv = v1; 1162 this.next(); 1163 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1164 throw ex("parser.descape.1", this.offset-1); 1165 uv = uv*16+v1; 1166 this.next(); 1167 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1168 throw ex("parser.descape.1", this.offset-1); 1169 uv = uv*16+v1; 1170 this.next(); 1171 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1172 throw ex("parser.descape.1", this.offset-1); 1173 uv = uv*16+v1; 1174 this.next(); 1175 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1176 throw ex("parser.descape.1", this.offset-1); 1177 uv = uv*16+v1; 1178 this.next(); 1179 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1180 throw ex("parser.descape.1", this.offset-1); 1181 uv = uv*16+v1; 1182 if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1); 1183 c = uv; 1184 break; 1185 case 'A': 1186 case 'Z': 1187 case 'z': 1188 throw ex("parser.descape.5", this.offset-2); 1189 default: 1190 } 1191 return c; 1192 } 1193 hexChar(int ch)1194 static private final int hexChar(int ch) { 1195 if (ch < '0') return -1; 1196 if (ch > 'f') return -1; 1197 if (ch <= '9') return ch-'0'; 1198 if (ch < 'A') return -1; 1199 if (ch <= 'F') return ch-'A'+10; 1200 if (ch < 'a') return -1; 1201 return ch-'a'+10; 1202 } 1203 addCaseInsensitiveChar(RangeToken tok, int c)1204 static protected final void addCaseInsensitiveChar(RangeToken tok, int c) { 1205 final int[] caseMap = CaseInsensitiveMap.get(c); 1206 tok.addRange(c, c); 1207 1208 if (caseMap != null) { 1209 for (int i=0; i<caseMap.length; i+=2) { 1210 tok.addRange(caseMap[i], caseMap[i]); 1211 } 1212 } 1213 1214 } 1215 addCaseInsensitiveCharRange(RangeToken tok, int start, int end)1216 static protected final void addCaseInsensitiveCharRange(RangeToken tok, int start, int end) { 1217 int[] caseMap; 1218 int r1, r2; 1219 if (start <= end) { 1220 r1 = start; 1221 r2 = end; 1222 } else { 1223 r1 = end; 1224 r2 = start; 1225 } 1226 1227 tok.addRange(r1, r2); 1228 for (int ch = r1; ch <= r2; ch++) { 1229 caseMap = CaseInsensitiveMap.get(ch); 1230 if (caseMap != null) { 1231 for (int i=0; i<caseMap.length; i+=2) { 1232 tok.addRange(caseMap[i], caseMap[i]); 1233 } 1234 } 1235 } 1236 } 1237 } 1238