1 /* gnu/regexp/RE.java 2 Copyright (C) 1998-2001, 2004, 2005 Free Software Foundation, Inc. 3 4 This file is part of GNU Classpath. 5 6 GNU Classpath is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 GNU Classpath is distributed in the hope that it will be useful, but 12 WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GNU Classpath; see the file COPYING. If not, write to the 18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19 02110-1301 USA. 20 21 Linking this library statically or dynamically with other modules is 22 making a combined work based on this library. Thus, the terms and 23 conditions of the GNU General Public License cover the whole 24 combination. 25 26 As a special exception, the copyright holders of this library give you 27 permission to link this library with independent modules to produce an 28 executable, regardless of the license terms of these independent 29 modules, and to copy and distribute the resulting executable under 30 terms of your choice, provided that you also meet, for each linked 31 independent module, the terms and conditions of the license of that 32 module. An independent module is a module which is not derived from 33 or based on this library. If you modify this library, you may extend 34 this exception to your version of the library, but you are not 35 obligated to do so. If you do not wish to do so, delete this 36 exception statement from your version. */ 37 38 package gnu.regexp; 39 import java.io.InputStream; 40 import java.io.Serializable; 41 import java.util.Locale; 42 import java.util.PropertyResourceBundle; 43 import java.util.ResourceBundle; 44 import java.util.Vector; 45 46 /** 47 * RE provides the user interface for compiling and matching regular 48 * expressions. 49 * <P> 50 * A regular expression object (class RE) is compiled by constructing it 51 * from a String, StringBuffer or character array, with optional 52 * compilation flags (below) 53 * and an optional syntax specification (see RESyntax; if not specified, 54 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used). 55 * <P> 56 * Once compiled, a regular expression object is reusable as well as 57 * threadsafe: multiple threads can use the RE instance simultaneously 58 * to match against different input text. 59 * <P> 60 * Various methods attempt to match input text against a compiled 61 * regular expression. These methods are: 62 * <LI><code>isMatch</code>: returns true if the input text in its 63 * entirety matches the regular expression pattern. 64 * <LI><code>getMatch</code>: returns the first match found in the 65 * input text, or null if no match is found. 66 * <LI><code>getAllMatches</code>: returns an array of all 67 * non-overlapping matches found in the input text. If no matches are 68 * found, the array is zero-length. 69 * <LI><code>substitute</code>: substitute the first occurence of the 70 * pattern in the input text with a replacement string (which may 71 * include metacharacters $0-$9, see REMatch.substituteInto). 72 * <LI><code>substituteAll</code>: same as above, but repeat for each 73 * match before returning. 74 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration 75 * object that allows iteration over the matches (see 76 * REMatchEnumeration for some reasons why you may want to do this 77 * instead of using <code>getAllMatches</code>. 78 * <P> 79 * 80 * These methods all have similar argument lists. The input can be a 81 * String, a character array, a StringBuffer, or an 82 * InputStream of some sort. Note that when using an 83 * InputStream, the stream read position cannot be guaranteed after 84 * attempting a match (this is not a bug, but a consequence of the way 85 * regular expressions work). Using an REMatchEnumeration can 86 * eliminate most positioning problems. 87 * 88 * <P> 89 * 90 * The optional index argument specifies the offset from the beginning 91 * of the text at which the search should start (see the descriptions 92 * of some of the execution flags for how this can affect positional 93 * pattern operators). For an InputStream, this means an 94 * offset from the current read position, so subsequent calls with the 95 * same index argument on an InputStream will not 96 * necessarily access the same position on the stream, whereas 97 * repeated searches at a given index in a fixed string will return 98 * consistent results. 99 * 100 * <P> 101 * You can optionally affect the execution environment by using a 102 * combination of execution flags (constants listed below). 103 * 104 * <P> 105 * All operations on a regular expression are performed in a 106 * thread-safe manner. 107 * 108 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A> 109 * @version 1.1.5-dev, to be released 110 */ 111 112 public class RE extends REToken { 113 114 private static final class IntPair implements Serializable { 115 public int first, second; 116 } 117 118 private static final class CharUnit implements Serializable { 119 public char ch; 120 public boolean bk; 121 } 122 123 // This String will be returned by getVersion() 124 private static final String VERSION = "1.1.5-dev"; 125 126 // The localized strings are kept in a separate file 127 private static ResourceBundle messages = PropertyResourceBundle.getBundle("gnu/regexp/MessagesBundle", Locale.getDefault()); 128 129 // These are, respectively, the first and last tokens in our linked list 130 // If there is only one token, firstToken == lastToken 131 private REToken firstToken, lastToken; 132 133 // This is the number of subexpressions in this regular expression, 134 // with a minimum value of zero. Returned by getNumSubs() 135 private int numSubs; 136 137 /** Minimum length, in characters, of any possible match. */ 138 private int minimumLength; 139 140 /** 141 * Compilation flag. Do not differentiate case. Subsequent 142 * searches using this RE will be case insensitive. 143 */ 144 public static final int REG_ICASE = 2; 145 146 /** 147 * Compilation flag. The match-any-character operator (dot) 148 * will match a newline character. When set this overrides the syntax 149 * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to 150 * the "/s" operator in Perl. 151 */ 152 public static final int REG_DOT_NEWLINE = 4; 153 154 /** 155 * Compilation flag. Use multiline mode. In this mode, the ^ and $ 156 * anchors will match based on newlines within the input. This is 157 * equivalent to the "/m" operator in Perl. 158 */ 159 public static final int REG_MULTILINE = 8; 160 161 /** 162 * Execution flag. 163 * The match-beginning operator (^) will not match at the beginning 164 * of the input string. Useful for matching on a substring when you 165 * know the context of the input is such that position zero of the 166 * input to the match test is not actually position zero of the text. 167 * <P> 168 * This example demonstrates the results of various ways of matching on 169 * a substring. 170 * <P> 171 * <CODE> 172 * String s = "food bar fool";<BR> 173 * RE exp = new RE("^foo.");<BR> 174 * REMatch m0 = exp.getMatch(s);<BR> 175 * REMatch m1 = exp.getMatch(s.substring(8));<BR> 176 * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR> 177 * REMatch m3 = exp.getMatch(s,8); <BR> 178 * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR> 179 * <P> 180 * // Results:<BR> 181 * // m0.toString(): "food"<BR> 182 * // m1.toString(): "fool"<BR> 183 * // m2.toString(): null<BR> 184 * // m3.toString(): null<BR> 185 * // m4.toString(): "fool"<BR> 186 * </CODE> 187 */ 188 public static final int REG_NOTBOL = 16; 189 190 /** 191 * Execution flag. 192 * The match-end operator ($) does not match at the end 193 * of the input string. Useful for matching on substrings. 194 */ 195 public static final int REG_NOTEOL = 32; 196 197 /** 198 * Execution flag. 199 * When a match method is invoked that starts matching at a non-zero 200 * index into the input, treat the input as if it begins at the index 201 * given. The effect of this flag is that the engine does not "see" 202 * any text in the input before the given index. This is useful so 203 * that the match-beginning operator (^) matches not at position 0 204 * in the input string, but at the position the search started at 205 * (based on the index input given to the getMatch function). See 206 * the example under REG_NOTBOL. It also affects the use of the \< 207 * and \b operators. 208 */ 209 public static final int REG_ANCHORINDEX = 64; 210 211 /** 212 * Execution flag. 213 * The substitute and substituteAll methods will not attempt to 214 * interpolate occurrences of $1-$9 in the replacement text with 215 * the corresponding subexpressions. For example, you may want to 216 * replace all matches of "one dollar" with "$1". 217 */ 218 public static final int REG_NO_INTERPOLATE = 128; 219 220 /** Returns a string representing the version of the gnu.regexp package. */ version()221 public static final String version() { 222 return VERSION; 223 } 224 225 // Retrieves a message from the ResourceBundle getLocalizedMessage(String key)226 static final String getLocalizedMessage(String key) { 227 return messages.getString(key); 228 } 229 230 /** 231 * Constructs a regular expression pattern buffer without any compilation 232 * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5). 233 * 234 * @param pattern A regular expression pattern, in the form of a String, 235 * StringBuffer or char[]. Other input types will be converted to 236 * strings using the toString() method. 237 * @exception REException The input pattern could not be parsed. 238 * @exception NullPointerException The pattern was null. 239 */ RE(Object pattern)240 public RE(Object pattern) throws REException { 241 this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0); 242 } 243 244 /** 245 * Constructs a regular expression pattern buffer using the specified 246 * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5). 247 * 248 * @param pattern A regular expression pattern, in the form of a String, 249 * StringBuffer, or char[]. Other input types will be converted to 250 * strings using the toString() method. 251 * @param cflags The logical OR of any combination of the compilation flags listed above. 252 * @exception REException The input pattern could not be parsed. 253 * @exception NullPointerException The pattern was null. 254 */ RE(Object pattern, int cflags)255 public RE(Object pattern, int cflags) throws REException { 256 this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0); 257 } 258 259 /** 260 * Constructs a regular expression pattern buffer using the specified 261 * compilation flags and regular expression syntax. 262 * 263 * @param pattern A regular expression pattern, in the form of a String, 264 * StringBuffer, or char[]. Other input types will be converted to 265 * strings using the toString() method. 266 * @param cflags The logical OR of any combination of the compilation flags listed above. 267 * @param syntax The type of regular expression syntax to use. 268 * @exception REException The input pattern could not be parsed. 269 * @exception NullPointerException The pattern was null. 270 */ RE(Object pattern, int cflags, RESyntax syntax)271 public RE(Object pattern, int cflags, RESyntax syntax) throws REException { 272 this(pattern,cflags,syntax,0,0); 273 } 274 275 // internal constructor used for alternation RE(REToken first, REToken last,int subs, int subIndex, int minLength)276 private RE(REToken first, REToken last,int subs, int subIndex, int minLength) { 277 super(subIndex); 278 firstToken = first; 279 lastToken = last; 280 numSubs = subs; 281 minimumLength = minLength; 282 addToken(new RETokenEndSub(subIndex)); 283 } 284 RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub)285 private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException { 286 super(myIndex); // Subexpression index of this token. 287 initialize(patternObj, cflags, syntax, myIndex, nextSub); 288 } 289 290 // For use by subclasses RE()291 protected RE() { super(0); } 292 293 // The meat of construction initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub)294 protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException { 295 char[] pattern; 296 if (patternObj instanceof String) { 297 pattern = ((String) patternObj).toCharArray(); 298 } else if (patternObj instanceof char[]) { 299 pattern = (char[]) patternObj; 300 } else if (patternObj instanceof StringBuffer) { 301 pattern = new char [((StringBuffer) patternObj).length()]; 302 ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0); 303 } else { 304 pattern = patternObj.toString().toCharArray(); 305 } 306 307 int pLength = pattern.length; 308 309 numSubs = 0; // Number of subexpressions in this token. 310 Vector branches = null; 311 312 // linked list of tokens (sort of -- some closed loops can exist) 313 firstToken = lastToken = null; 314 315 // Precalculate these so we don't pay for the math every time we 316 // need to access them. 317 boolean insens = ((cflags & REG_ICASE) > 0); 318 319 // Parse pattern into tokens. Does anyone know if it's more efficient 320 // to use char[] than a String.charAt()? I'm assuming so. 321 322 // index tracks the position in the char array 323 int index = 0; 324 325 // this will be the current parse character (pattern[index]) 326 CharUnit unit = new CharUnit(); 327 328 // This is used for {x,y} calculations 329 IntPair minMax = new IntPair(); 330 331 // Buffer a token so we can create a TokenRepeated, etc. 332 REToken currentToken = null; 333 char ch; 334 boolean quot = false; 335 336 while (index < pLength) { 337 // read the next character unit (including backslash escapes) 338 index = getCharUnit(pattern,index,unit,quot); 339 340 if (unit.bk) 341 if (unit.ch == 'Q') { 342 quot = true; 343 continue; 344 } else if (unit.ch == 'E') { 345 quot = false; 346 continue; 347 } 348 if (quot) 349 unit.bk = false; 350 351 // ALTERNATION OPERATOR 352 // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT) 353 // not available if RE_LIMITED_OPS is set 354 355 // TODO: the '\n' literal here should be a test against REToken.newline, 356 // which unfortunately may be more than a single character. 357 if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot))) 358 || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !(unit.bk || quot)) ) 359 && !syntax.get(RESyntax.RE_LIMITED_OPS)) { 360 // make everything up to here be a branch. create vector if nec. 361 addToken(currentToken); 362 RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength); 363 minimumLength = 0; 364 if (branches == null) { 365 branches = new Vector(); 366 } 367 branches.addElement(theBranch); 368 firstToken = lastToken = currentToken = null; 369 } 370 371 // INTERVAL OPERATOR: 372 // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES) 373 // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES) 374 // 375 // OPEN QUESTION: 376 // what is proper interpretation of '{' at start of string? 377 378 else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot))) { 379 int newIndex = getMinMax(pattern,index,minMax,syntax); 380 if (newIndex > index) { 381 if (minMax.first > minMax.second) 382 throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex); 383 if (currentToken == null) 384 throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex); 385 if (currentToken instanceof RETokenRepeated) 386 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex); 387 if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary) 388 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex); 389 if ((currentToken.getMinimumLength() == 0) && (minMax.second == Integer.MAX_VALUE)) 390 throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,newIndex); 391 index = newIndex; 392 currentToken = setRepeated(currentToken,minMax.first,minMax.second,index); 393 } 394 else { 395 addToken(currentToken); 396 currentToken = new RETokenChar(subIndex,unit.ch,insens); 397 } 398 } 399 400 // LIST OPERATOR: 401 // [...] | [^...] 402 403 else if ((unit.ch == '[') && !(unit.bk || quot)) { 404 Vector options = new Vector(); 405 boolean negative = false; 406 char lastChar = 0; 407 if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index); 408 409 // Check for initial caret, negation 410 if ((ch = pattern[index]) == '^') { 411 negative = true; 412 if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); 413 ch = pattern[index]; 414 } 415 416 // Check for leading right bracket literal 417 if (ch == ']') { 418 lastChar = ch; 419 if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); 420 } 421 422 while ((ch = pattern[index++]) != ']') { 423 if ((ch == '-') && (lastChar != 0)) { 424 if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); 425 if ((ch = pattern[index]) == ']') { 426 options.addElement(new RETokenChar(subIndex,lastChar,insens)); 427 lastChar = '-'; 428 } else { 429 options.addElement(new RETokenRange(subIndex,lastChar,ch,insens)); 430 lastChar = 0; 431 index++; 432 } 433 } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) { 434 if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); 435 int posixID = -1; 436 boolean negate = false; 437 char asciiEsc = 0; 438 if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) { 439 switch (pattern[index]) { 440 case 'D': 441 negate = true; 442 case 'd': 443 posixID = RETokenPOSIX.DIGIT; 444 break; 445 case 'S': 446 negate = true; 447 case 's': 448 posixID = RETokenPOSIX.SPACE; 449 break; 450 case 'W': 451 negate = true; 452 case 'w': 453 posixID = RETokenPOSIX.ALNUM; 454 break; 455 } 456 } 457 else if ("nrt".indexOf(pattern[index]) != -1) { 458 switch (pattern[index]) { 459 case 'n': 460 asciiEsc = '\n'; 461 break; 462 case 't': 463 asciiEsc = '\t'; 464 break; 465 case 'r': 466 asciiEsc = '\r'; 467 break; 468 } 469 } 470 if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens)); 471 472 if (posixID != -1) { 473 options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate)); 474 } else if (asciiEsc != 0) { 475 lastChar = asciiEsc; 476 } else { 477 lastChar = pattern[index]; 478 } 479 ++index; 480 } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) { 481 StringBuffer posixSet = new StringBuffer(); 482 index = getPosixSet(pattern,index+1,posixSet); 483 int posixId = RETokenPOSIX.intValue(posixSet.toString()); 484 if (posixId != -1) 485 options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false)); 486 } else { 487 if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens)); 488 lastChar = ch; 489 } 490 if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); 491 } // while in list 492 // Out of list, index is one past ']' 493 494 if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens)); 495 496 // Create a new RETokenOneOf 497 addToken(currentToken); 498 options.trimToSize(); 499 currentToken = new RETokenOneOf(subIndex,options,negative); 500 } 501 502 // SUBEXPRESSIONS 503 // (...) | \(...\) depending on RE_NO_BK_PARENS 504 505 else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) { 506 boolean pure = false; 507 boolean comment = false; 508 boolean lookAhead = false; 509 boolean negativelh = false; 510 if ((index+1 < pLength) && (pattern[index] == '?')) { 511 switch (pattern[index+1]) { 512 case '!': 513 if (syntax.get(RESyntax.RE_LOOKAHEAD)) { 514 pure = true; 515 negativelh = true; 516 lookAhead = true; 517 index += 2; 518 } 519 break; 520 case '=': 521 if (syntax.get(RESyntax.RE_LOOKAHEAD)) { 522 pure = true; 523 lookAhead = true; 524 index += 2; 525 } 526 break; 527 case ':': 528 if (syntax.get(RESyntax.RE_PURE_GROUPING)) { 529 pure = true; 530 index += 2; 531 } 532 break; 533 case '#': 534 if (syntax.get(RESyntax.RE_COMMENTS)) { 535 comment = true; 536 } 537 break; 538 default: 539 throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index); 540 } 541 } 542 543 if (index >= pLength) { 544 throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index); 545 } 546 547 // find end of subexpression 548 int endIndex = index; 549 int nextIndex = index; 550 int nested = 0; 551 552 while ( ((nextIndex = getCharUnit(pattern,endIndex,unit,false)) > 0) 553 && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) ) 554 if ((endIndex = nextIndex) >= pLength) 555 throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex); 556 else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) 557 nested++; 558 else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) 559 nested--; 560 561 // endIndex is now position at a ')','\)' 562 // nextIndex is end of string or position after ')' or '\)' 563 564 if (comment) index = nextIndex; 565 else { // not a comment 566 // create RE subexpression as token. 567 addToken(currentToken); 568 if (!pure) { 569 numSubs++; 570 } 571 572 int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs; 573 currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs); 574 numSubs += ((RE) currentToken).getNumSubs(); 575 576 if (lookAhead) { 577 currentToken = new RETokenLookAhead(currentToken,negativelh); 578 } 579 580 index = nextIndex; 581 } // not a comment 582 } // subexpression 583 584 // UNMATCHED RIGHT PAREN 585 // ) or \) throw exception if 586 // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) 587 else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))) { 588 throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index); 589 } 590 591 // START OF LINE OPERATOR 592 // ^ 593 594 else if ((unit.ch == '^') && !(unit.bk || quot)) { 595 addToken(currentToken); 596 currentToken = null; 597 addToken(new RETokenStart(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null)); 598 } 599 600 // END OF LINE OPERATOR 601 // $ 602 603 else if ((unit.ch == '$') && !(unit.bk || quot)) { 604 addToken(currentToken); 605 currentToken = null; 606 addToken(new RETokenEnd(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null)); 607 } 608 609 // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null) 610 // . 611 612 else if ((unit.ch == '.') && !(unit.bk || quot)) { 613 addToken(currentToken); 614 currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL)); 615 } 616 617 // ZERO-OR-MORE REPEAT OPERATOR 618 // * 619 620 else if ((unit.ch == '*') && !(unit.bk || quot)) { 621 if (currentToken == null) 622 throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index); 623 if (currentToken instanceof RETokenRepeated) 624 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index); 625 if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary) 626 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index); 627 if (currentToken.getMinimumLength() == 0) 628 throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index); 629 currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index); 630 } 631 632 // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR 633 // + | \+ depending on RE_BK_PLUS_QM 634 // not available if RE_LIMITED_OPS is set 635 636 else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) { 637 if (currentToken == null) 638 throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index); 639 640 // Check for possessive matching on RETokenRepeated 641 if (currentToken instanceof RETokenRepeated) { 642 RETokenRepeated tokenRep = (RETokenRepeated)currentToken; 643 if (syntax.get(RESyntax.RE_POSSESSIVE_OPS) && !tokenRep.isPossessive() && !tokenRep.isStingy()) 644 tokenRep.makePossessive(); 645 else 646 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index); 647 648 } 649 else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary) 650 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index); 651 else if (currentToken.getMinimumLength() == 0) 652 throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index); 653 else 654 currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index); 655 } 656 657 // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR 658 // ? | \? depending on RE_BK_PLUS_QM 659 // not available if RE_LIMITED_OPS is set 660 // stingy matching if RE_STINGY_OPS is set and it follows a quantifier 661 662 else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) { 663 if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index); 664 665 // Check for stingy matching on RETokenRepeated 666 if (currentToken instanceof RETokenRepeated) { 667 RETokenRepeated tokenRep = (RETokenRepeated)currentToken; 668 if (syntax.get(RESyntax.RE_STINGY_OPS) && !tokenRep.isStingy() && !tokenRep.isPossessive()) 669 tokenRep.makeStingy(); 670 else 671 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index); 672 } 673 else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary) 674 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index); 675 else 676 currentToken = setRepeated(currentToken,0,1,index); 677 } 678 679 // BACKREFERENCE OPERATOR 680 // \1 \2 ... \9 681 // not available if RE_NO_BK_REFS is set 682 683 else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) { 684 addToken(currentToken); 685 currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens); 686 } 687 688 // START OF STRING OPERATOR 689 // \A if RE_STRING_ANCHORS is set 690 691 else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) { 692 addToken(currentToken); 693 currentToken = new RETokenStart(subIndex,null); 694 } 695 696 // WORD BREAK OPERATOR 697 // \b if ???? 698 699 else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) { 700 addToken(currentToken); 701 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false); 702 } 703 704 // WORD BEGIN OPERATOR 705 // \< if ???? 706 else if (unit.bk && (unit.ch == '<')) { 707 addToken(currentToken); 708 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false); 709 } 710 711 // WORD END OPERATOR 712 // \> if ???? 713 else if (unit.bk && (unit.ch == '>')) { 714 addToken(currentToken); 715 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false); 716 } 717 718 // NON-WORD BREAK OPERATOR 719 // \B if ???? 720 721 else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) { 722 addToken(currentToken); 723 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true); 724 } 725 726 727 // DIGIT OPERATOR 728 // \d if RE_CHAR_CLASS_ESCAPES is set 729 730 else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { 731 addToken(currentToken); 732 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false); 733 } 734 735 // NON-DIGIT OPERATOR 736 // \D 737 738 else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { 739 addToken(currentToken); 740 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true); 741 } 742 743 // NEWLINE ESCAPE 744 // \n 745 746 else if (unit.bk && (unit.ch == 'n')) { 747 addToken(currentToken); 748 currentToken = new RETokenChar(subIndex,'\n',false); 749 } 750 751 // RETURN ESCAPE 752 // \r 753 754 else if (unit.bk && (unit.ch == 'r')) { 755 addToken(currentToken); 756 currentToken = new RETokenChar(subIndex,'\r',false); 757 } 758 759 // WHITESPACE OPERATOR 760 // \s if RE_CHAR_CLASS_ESCAPES is set 761 762 else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { 763 addToken(currentToken); 764 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false); 765 } 766 767 // NON-WHITESPACE OPERATOR 768 // \S 769 770 else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { 771 addToken(currentToken); 772 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true); 773 } 774 775 // TAB ESCAPE 776 // \t 777 778 else if (unit.bk && (unit.ch == 't')) { 779 addToken(currentToken); 780 currentToken = new RETokenChar(subIndex,'\t',false); 781 } 782 783 // ALPHANUMERIC OPERATOR 784 // \w 785 786 else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { 787 addToken(currentToken); 788 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false); 789 } 790 791 // NON-ALPHANUMERIC OPERATOR 792 // \W 793 794 else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { 795 addToken(currentToken); 796 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true); 797 } 798 799 // END OF STRING OPERATOR 800 // \Z 801 802 else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) { 803 addToken(currentToken); 804 currentToken = new RETokenEnd(subIndex,null); 805 } 806 807 // NON-SPECIAL CHARACTER (or escape to make literal) 808 // c | \* for example 809 810 else { // not a special character 811 addToken(currentToken); 812 currentToken = new RETokenChar(subIndex,unit.ch,insens); 813 } 814 } // end while 815 816 // Add final buffered token and an EndSub marker 817 addToken(currentToken); 818 819 if (branches != null) { 820 branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength)); 821 branches.trimToSize(); // compact the Vector 822 minimumLength = 0; 823 firstToken = lastToken = null; 824 addToken(new RETokenOneOf(subIndex,branches,false)); 825 } 826 else addToken(new RETokenEndSub(subIndex)); 827 828 } 829 getCharUnit(char[] input, int index, CharUnit unit, boolean quot)830 private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException { 831 unit.ch = input[index++]; 832 unit.bk = (unit.ch == '\\' 833 && (!quot || index >= input.length || input[index] == 'E')); 834 if (unit.bk) 835 if (index < input.length) 836 unit.ch = input[index++]; 837 else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index); 838 return index; 839 } 840 841 /** 842 * Checks if the regular expression matches the input in its entirety. 843 * 844 * @param input The input text. 845 */ isMatch(Object input)846 public boolean isMatch(Object input) { 847 return isMatch(input,0,0); 848 } 849 850 /** 851 * Checks if the input string, starting from index, is an exact match of 852 * this regular expression. 853 * 854 * @param input The input text. 855 * @param index The offset index at which the search should be begin. 856 */ isMatch(Object input,int index)857 public boolean isMatch(Object input,int index) { 858 return isMatch(input,index,0); 859 } 860 861 862 /** 863 * Checks if the input, starting from index and using the specified 864 * execution flags, is an exact match of this regular expression. 865 * 866 * @param input The input text. 867 * @param index The offset index at which the search should be begin. 868 * @param eflags The logical OR of any execution flags above. 869 */ isMatch(Object input,int index,int eflags)870 public boolean isMatch(Object input,int index,int eflags) { 871 return isMatchImpl(makeCharIndexed(input,index),index,eflags); 872 } 873 isMatchImpl(CharIndexed input, int index, int eflags)874 private boolean isMatchImpl(CharIndexed input, int index, int eflags) { 875 if (firstToken == null) // Trivial case 876 return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS); 877 REMatch m = new REMatch(numSubs, index, eflags); 878 if (firstToken.match(input, m)) { 879 while (m != null) { 880 if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) { 881 return true; 882 } 883 m = m.next; 884 } 885 } 886 return false; 887 } 888 889 /** 890 * Returns the maximum number of subexpressions in this regular expression. 891 * If the expression contains branches, the value returned will be the 892 * maximum subexpressions in any of the branches. 893 */ getNumSubs()894 public int getNumSubs() { 895 return numSubs; 896 } 897 898 // Overrides REToken.setUncle setUncle(REToken uncle)899 void setUncle(REToken uncle) { 900 if (lastToken != null) { 901 lastToken.setUncle(uncle); 902 } else super.setUncle(uncle); // to deal with empty subexpressions 903 } 904 905 // Overrides REToken.chain 906 chain(REToken next)907 boolean chain(REToken next) { 908 super.chain(next); 909 setUncle(next); 910 return true; 911 } 912 913 /** 914 * Returns the minimum number of characters that could possibly 915 * constitute a match of this regular expression. 916 */ getMinimumLength()917 public int getMinimumLength() { 918 return minimumLength; 919 } 920 921 /** 922 * Returns an array of all matches found in the input. 923 * 924 * If the regular expression allows the empty string to match, it will 925 * substitute matches at all positions except the end of the input. 926 * 927 * @param input The input text. 928 * @return a non-null (but possibly zero-length) array of matches 929 */ getAllMatches(Object input)930 public REMatch[] getAllMatches(Object input) { 931 return getAllMatches(input,0,0); 932 } 933 934 /** 935 * Returns an array of all matches found in the input, 936 * beginning at the specified index position. 937 * 938 * If the regular expression allows the empty string to match, it will 939 * substitute matches at all positions except the end of the input. 940 * 941 * @param input The input text. 942 * @param index The offset index at which the search should be begin. 943 * @return a non-null (but possibly zero-length) array of matches 944 */ getAllMatches(Object input, int index)945 public REMatch[] getAllMatches(Object input, int index) { 946 return getAllMatches(input,index,0); 947 } 948 949 /** 950 * Returns an array of all matches found in the input string, 951 * beginning at the specified index position and using the specified 952 * execution flags. 953 * 954 * If the regular expression allows the empty string to match, it will 955 * substitute matches at all positions except the end of the input. 956 * 957 * @param input The input text. 958 * @param index The offset index at which the search should be begin. 959 * @param eflags The logical OR of any execution flags above. 960 * @return a non-null (but possibly zero-length) array of matches 961 */ getAllMatches(Object input, int index, int eflags)962 public REMatch[] getAllMatches(Object input, int index, int eflags) { 963 return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags); 964 } 965 966 // this has been changed since 1.03 to be non-overlapping matches getAllMatchesImpl(CharIndexed input, int index, int eflags)967 private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) { 968 Vector all = new Vector(); 969 REMatch m = null; 970 while ((m = getMatchImpl(input,index,eflags,null)) != null) { 971 all.addElement(m); 972 index = m.getEndIndex(); 973 if (m.end[0] == 0) { // handle pathological case of zero-length match 974 index++; 975 input.move(1); 976 } else { 977 input.move(m.end[0]); 978 } 979 if (!input.isValid()) break; 980 } 981 REMatch[] mset = new REMatch[all.size()]; 982 all.copyInto(mset); 983 return mset; 984 } 985 986 /* Implements abstract method REToken.match() */ match(CharIndexed input, REMatch mymatch)987 boolean match(CharIndexed input, REMatch mymatch) { 988 if (firstToken == null) return next(input, mymatch); 989 990 // Note the start of this subexpression 991 mymatch.start[subIndex] = mymatch.index; 992 993 return firstToken.match(input, mymatch); 994 } 995 996 /** 997 * Returns the first match found in the input. If no match is found, 998 * null is returned. 999 * 1000 * @param input The input text. 1001 * @return An REMatch instance referencing the match, or null if none. 1002 */ getMatch(Object input)1003 public REMatch getMatch(Object input) { 1004 return getMatch(input,0,0); 1005 } 1006 1007 /** 1008 * Returns the first match found in the input, beginning 1009 * the search at the specified index. If no match is found, 1010 * returns null. 1011 * 1012 * @param input The input text. 1013 * @param index The offset within the text to begin looking for a match. 1014 * @return An REMatch instance referencing the match, or null if none. 1015 */ getMatch(Object input, int index)1016 public REMatch getMatch(Object input, int index) { 1017 return getMatch(input,index,0); 1018 } 1019 1020 /** 1021 * Returns the first match found in the input, beginning 1022 * the search at the specified index, and using the specified 1023 * execution flags. If no match is found, returns null. 1024 * 1025 * @param input The input text. 1026 * @param index The offset index at which the search should be begin. 1027 * @param eflags The logical OR of any execution flags above. 1028 * @return An REMatch instance referencing the match, or null if none. 1029 */ getMatch(Object input, int index, int eflags)1030 public REMatch getMatch(Object input, int index, int eflags) { 1031 return getMatch(input,index,eflags,null); 1032 } 1033 1034 /** 1035 * Returns the first match found in the input, beginning the search 1036 * at the specified index, and using the specified execution flags. 1037 * If no match is found, returns null. If a StringBuffer is 1038 * provided and is non-null, the contents of the input text from the 1039 * index to the beginning of the match (or to the end of the input, 1040 * if there is no match) are appended to the StringBuffer. 1041 * 1042 * @param input The input text. 1043 * @param index The offset index at which the search should be begin. 1044 * @param eflags The logical OR of any execution flags above. 1045 * @param buffer The StringBuffer to save pre-match text in. 1046 * @return An REMatch instance referencing the match, or null if none. */ getMatch(Object input, int index, int eflags, StringBuffer buffer)1047 public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) { 1048 return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer); 1049 } 1050 getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer)1051 REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) { 1052 // Create a new REMatch to hold results 1053 REMatch mymatch = new REMatch(numSubs, anchor, eflags); 1054 do { 1055 // Optimization: check if anchor + minimumLength > length 1056 if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) { 1057 if (match(input, mymatch)) { 1058 // Find longest match of them all to observe leftmost longest 1059 REMatch longest = mymatch; 1060 while ((mymatch = mymatch.next) != null) { 1061 if (mymatch.index > longest.index) { 1062 longest = mymatch; 1063 } 1064 } 1065 1066 longest.end[0] = longest.index; 1067 longest.finish(input); 1068 return longest; 1069 } 1070 } 1071 mymatch.clear(++anchor); 1072 // Append character to buffer if needed 1073 if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) { 1074 buffer.append(input.charAt(0)); 1075 } 1076 } while (input.move(1)); 1077 1078 // Special handling at end of input for e.g. "$" 1079 if (minimumLength == 0) { 1080 if (match(input, mymatch)) { 1081 mymatch.finish(input); 1082 return mymatch; 1083 } 1084 } 1085 1086 return null; 1087 } 1088 1089 /** 1090 * Returns an REMatchEnumeration that can be used to iterate over the 1091 * matches found in the input text. 1092 * 1093 * @param input The input text. 1094 * @return A non-null REMatchEnumeration instance. 1095 */ getMatchEnumeration(Object input)1096 public REMatchEnumeration getMatchEnumeration(Object input) { 1097 return getMatchEnumeration(input,0,0); 1098 } 1099 1100 1101 /** 1102 * Returns an REMatchEnumeration that can be used to iterate over the 1103 * matches found in the input text. 1104 * 1105 * @param input The input text. 1106 * @param index The offset index at which the search should be begin. 1107 * @return A non-null REMatchEnumeration instance, with its input cursor 1108 * set to the index position specified. 1109 */ getMatchEnumeration(Object input, int index)1110 public REMatchEnumeration getMatchEnumeration(Object input, int index) { 1111 return getMatchEnumeration(input,index,0); 1112 } 1113 1114 /** 1115 * Returns an REMatchEnumeration that can be used to iterate over the 1116 * matches found in the input text. 1117 * 1118 * @param input The input text. 1119 * @param index The offset index at which the search should be begin. 1120 * @param eflags The logical OR of any execution flags above. 1121 * @return A non-null REMatchEnumeration instance, with its input cursor 1122 * set to the index position specified. 1123 */ getMatchEnumeration(Object input, int index, int eflags)1124 public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) { 1125 return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags); 1126 } 1127 1128 1129 /** 1130 * Substitutes the replacement text for the first match found in the input. 1131 * 1132 * @param input The input text. 1133 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 1134 * @return A String interpolating the substituted text. 1135 * @see REMatch#substituteInto 1136 */ substitute(Object input,String replace)1137 public String substitute(Object input,String replace) { 1138 return substitute(input,replace,0,0); 1139 } 1140 1141 /** 1142 * Substitutes the replacement text for the first match found in the input 1143 * beginning at the specified index position. Specifying an index 1144 * effectively causes the regular expression engine to throw away the 1145 * specified number of characters. 1146 * 1147 * @param input The input text. 1148 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 1149 * @param index The offset index at which the search should be begin. 1150 * @return A String containing the substring of the input, starting 1151 * at the index position, and interpolating the substituted text. 1152 * @see REMatch#substituteInto 1153 */ substitute(Object input,String replace,int index)1154 public String substitute(Object input,String replace,int index) { 1155 return substitute(input,replace,index,0); 1156 } 1157 1158 /** 1159 * Substitutes the replacement text for the first match found in the input 1160 * string, beginning at the specified index position and using the 1161 * specified execution flags. 1162 * 1163 * @param input The input text. 1164 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 1165 * @param index The offset index at which the search should be begin. 1166 * @param eflags The logical OR of any execution flags above. 1167 * @return A String containing the substring of the input, starting 1168 * at the index position, and interpolating the substituted text. 1169 * @see REMatch#substituteInto 1170 */ substitute(Object input,String replace,int index,int eflags)1171 public String substitute(Object input,String replace,int index,int eflags) { 1172 return substituteImpl(makeCharIndexed(input,index),replace,index,eflags); 1173 } 1174 substituteImpl(CharIndexed input,String replace,int index,int eflags)1175 private String substituteImpl(CharIndexed input,String replace,int index,int eflags) { 1176 StringBuffer buffer = new StringBuffer(); 1177 REMatch m = getMatchImpl(input,index,eflags,buffer); 1178 if (m==null) return buffer.toString(); 1179 buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ? 1180 replace : m.substituteInto(replace) ); 1181 if (input.move(m.end[0])) { 1182 do { 1183 buffer.append(input.charAt(0)); 1184 } while (input.move(1)); 1185 } 1186 return buffer.toString(); 1187 } 1188 1189 /** 1190 * Substitutes the replacement text for each non-overlapping match found 1191 * in the input text. 1192 * 1193 * @param input The input text. 1194 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 1195 * @return A String interpolating the substituted text. 1196 * @see REMatch#substituteInto 1197 */ substituteAll(Object input,String replace)1198 public String substituteAll(Object input,String replace) { 1199 return substituteAll(input,replace,0,0); 1200 } 1201 1202 /** 1203 * Substitutes the replacement text for each non-overlapping match found 1204 * in the input text, starting at the specified index. 1205 * 1206 * If the regular expression allows the empty string to match, it will 1207 * substitute matches at all positions except the end of the input. 1208 * 1209 * @param input The input text. 1210 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 1211 * @param index The offset index at which the search should be begin. 1212 * @return A String containing the substring of the input, starting 1213 * at the index position, and interpolating the substituted text. 1214 * @see REMatch#substituteInto 1215 */ substituteAll(Object input,String replace,int index)1216 public String substituteAll(Object input,String replace,int index) { 1217 return substituteAll(input,replace,index,0); 1218 } 1219 1220 /** 1221 * Substitutes the replacement text for each non-overlapping match found 1222 * in the input text, starting at the specified index and using the 1223 * specified execution flags. 1224 * 1225 * @param input The input text. 1226 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 1227 * @param index The offset index at which the search should be begin. 1228 * @param eflags The logical OR of any execution flags above. 1229 * @return A String containing the substring of the input, starting 1230 * at the index position, and interpolating the substituted text. 1231 * @see REMatch#substituteInto 1232 */ substituteAll(Object input,String replace,int index,int eflags)1233 public String substituteAll(Object input,String replace,int index,int eflags) { 1234 return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags); 1235 } 1236 substituteAllImpl(CharIndexed input,String replace,int index,int eflags)1237 private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) { 1238 StringBuffer buffer = new StringBuffer(); 1239 REMatch m; 1240 while ((m = getMatchImpl(input,index,eflags,buffer)) != null) { 1241 buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ? 1242 replace : m.substituteInto(replace) ); 1243 index = m.getEndIndex(); 1244 if (m.end[0] == 0) { 1245 char ch = input.charAt(0); 1246 if (ch != CharIndexed.OUT_OF_BOUNDS) 1247 buffer.append(ch); 1248 input.move(1); 1249 } else { 1250 input.move(m.end[0]); 1251 } 1252 1253 if (!input.isValid()) break; 1254 } 1255 return buffer.toString(); 1256 } 1257 1258 /* Helper function for constructor */ addToken(REToken next)1259 private void addToken(REToken next) { 1260 if (next == null) return; 1261 minimumLength += next.getMinimumLength(); 1262 if (firstToken == null) { 1263 lastToken = firstToken = next; 1264 } else { 1265 // if chain returns false, it "rejected" the token due to 1266 // an optimization, and next was combined with lastToken 1267 if (lastToken.chain(next)) { 1268 lastToken = next; 1269 } 1270 } 1271 } 1272 setRepeated(REToken current, int min, int max, int index)1273 private static REToken setRepeated(REToken current, int min, int max, int index) throws REException { 1274 if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index); 1275 return new RETokenRepeated(current.subIndex,current,min,max); 1276 } 1277 getPosixSet(char[] pattern,int index,StringBuffer buf)1278 private static int getPosixSet(char[] pattern,int index,StringBuffer buf) { 1279 // Precondition: pattern[index-1] == ':' 1280 // we will return pos of closing ']'. 1281 int i; 1282 for (i=index; i<(pattern.length-1); i++) { 1283 if ((pattern[i] == ':') && (pattern[i+1] == ']')) 1284 return i+2; 1285 buf.append(pattern[i]); 1286 } 1287 return index; // didn't match up 1288 } 1289 getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax)1290 private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException { 1291 // Precondition: input[index-1] == '{', minMax != null 1292 1293 boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES); 1294 int startIndex = index; 1295 if (index == input.length) { 1296 if (mustMatch) 1297 throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index); 1298 else 1299 return startIndex; 1300 } 1301 1302 int min,max=0; 1303 CharUnit unit = new CharUnit(); 1304 StringBuffer buf = new StringBuffer(); 1305 1306 // Read string of digits 1307 do { 1308 index = getCharUnit(input,index,unit,false); 1309 if (Character.isDigit(unit.ch)) 1310 buf.append(unit.ch); 1311 } while ((index != input.length) && Character.isDigit(unit.ch)); 1312 1313 // Check for {} tomfoolery 1314 if (buf.length() == 0) { 1315 if (mustMatch) 1316 throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index); 1317 else 1318 return startIndex; 1319 } 1320 1321 min = Integer.parseInt(buf.toString()); 1322 1323 if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) 1324 max = min; 1325 else if (index == input.length) 1326 if (mustMatch) 1327 throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index); 1328 else 1329 return startIndex; 1330 else if ((unit.ch == ',') && !unit.bk) { 1331 buf = new StringBuffer(); 1332 // Read string of digits 1333 while (((index = getCharUnit(input,index,unit,false)) != input.length) && Character.isDigit(unit.ch)) 1334 buf.append(unit.ch); 1335 1336 if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))) 1337 if (mustMatch) 1338 throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index); 1339 else 1340 return startIndex; 1341 1342 // This is the case of {x,} 1343 if (buf.length() == 0) max = Integer.MAX_VALUE; 1344 else max = Integer.parseInt(buf.toString()); 1345 } else 1346 if (mustMatch) 1347 throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index); 1348 else 1349 return startIndex; 1350 1351 // We know min and max now, and they are valid. 1352 1353 minMax.first = min; 1354 minMax.second = max; 1355 1356 // return the index following the '}' 1357 return index; 1358 } 1359 1360 /** 1361 * Return a human readable form of the compiled regular expression, 1362 * useful for debugging. 1363 */ toString()1364 public String toString() { 1365 StringBuffer sb = new StringBuffer(); 1366 dump(sb); 1367 return sb.toString(); 1368 } 1369 dump(StringBuffer os)1370 void dump(StringBuffer os) { 1371 os.append('('); 1372 if (subIndex == 0) 1373 os.append("?:"); 1374 if (firstToken != null) 1375 firstToken.dumpAll(os); 1376 os.append(')'); 1377 } 1378 1379 // Cast input appropriately or throw exception makeCharIndexed(Object input, int index)1380 private static CharIndexed makeCharIndexed(Object input, int index) { 1381 // We could let a String fall through to final input, but since 1382 // it's the most likely input type, we check it first. 1383 if (input instanceof String) 1384 return new CharIndexedString((String) input,index); 1385 else if (input instanceof char[]) 1386 return new CharIndexedCharArray((char[]) input,index); 1387 else if (input instanceof StringBuffer) 1388 return new CharIndexedStringBuffer((StringBuffer) input,index); 1389 else if (input instanceof InputStream) 1390 return new CharIndexedInputStream((InputStream) input,index); 1391 else if (input instanceof CharIndexed) 1392 return (CharIndexed) input; // do we lose index info? 1393 else 1394 return new CharIndexedString(input.toString(), index); 1395 } 1396 } 1397