1 /* gnu/regexp/RE.java 2 Copyright (C) 2006 Free Software Foundation, Inc. 3 4 This file is part of GNU Classpath. 5 6 GNU Classpath is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 GNU Classpath is distributed in the hope that it will be useful, but 12 WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GNU Classpath; see the file COPYING. If not, write to the 18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19 02110-1301 USA. 20 21 Linking this library statically or dynamically with other modules is 22 making a combined work based on this library. Thus, the terms and 23 conditions of the GNU General Public License cover the whole 24 combination. 25 26 As a special exception, the copyright holders of this library give you 27 permission to link this library with independent modules to produce an 28 executable, regardless of the license terms of these independent 29 modules, and to copy and distribute the resulting executable under 30 terms of your choice, provided that you also meet, for each linked 31 independent module, the terms and conditions of the license of that 32 module. An independent module is a module which is not derived from 33 or based on this library. If you modify this library, you may extend 34 this exception to your version of the library, but you are not 35 obligated to do so. If you do not wish to do so, delete this 36 exception statement from your version. */ 37 38 package gnu.java.util.regex; 39 40 import gnu.java.lang.CPStringBuilder; 41 42 import java.io.InputStream; 43 import java.io.Serializable; 44 45 import java.util.ArrayList; 46 import java.util.List; 47 import java.util.Locale; 48 import java.util.PropertyResourceBundle; 49 import java.util.ResourceBundle; 50 51 /** 52 * RE provides the user interface for compiling and matching regular 53 * expressions. 54 * <P> 55 * A regular expression object (class RE) is compiled by constructing it 56 * from a String, StringBuffer or character array, with optional 57 * compilation flags (below) 58 * and an optional syntax specification (see RESyntax; if not specified, 59 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used). 60 * <P> 61 * Once compiled, a regular expression object is reusable as well as 62 * threadsafe: multiple threads can use the RE instance simultaneously 63 * to match against different input text. 64 * <P> 65 * Various methods attempt to match input text against a compiled 66 * regular expression. These methods are: 67 * <LI><code>isMatch</code>: returns true if the input text in its 68 * entirety matches the regular expression pattern. 69 * <LI><code>getMatch</code>: returns the first match found in the 70 * input text, or null if no match is found. 71 * <LI><code>getAllMatches</code>: returns an array of all 72 * non-overlapping matches found in the input text. If no matches are 73 * found, the array is zero-length. 74 * <LI><code>substitute</code>: substitute the first occurence of the 75 * pattern in the input text with a replacement string (which may 76 * include metacharacters $0-$9, see REMatch.substituteInto). 77 * <LI><code>substituteAll</code>: same as above, but repeat for each 78 * match before returning. 79 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration 80 * object that allows iteration over the matches (see 81 * REMatchEnumeration for some reasons why you may want to do this 82 * instead of using <code>getAllMatches</code>. 83 * <P> 84 * 85 * These methods all have similar argument lists. The input can be a 86 * CharIndexed, String, a character array, a StringBuffer, or an 87 * InputStream of some sort. Note that when using an 88 * InputStream, the stream read position cannot be guaranteed after 89 * attempting a match (this is not a bug, but a consequence of the way 90 * regular expressions work). Using an REMatchEnumeration can 91 * eliminate most positioning problems. 92 * 93 * Although the input object can be of various types, it is recommended 94 * that it should be a CharIndexed because {@link CharIndexed#getLastMatch()} 95 * can show the last match found on this input, which helps the expression 96 * \G work as the end of the previous match. 97 * 98 * <P> 99 * 100 * The optional index argument specifies the offset from the beginning 101 * of the text at which the search should start (see the descriptions 102 * of some of the execution flags for how this can affect positional 103 * pattern operators). For an InputStream, this means an 104 * offset from the current read position, so subsequent calls with the 105 * same index argument on an InputStream will not 106 * necessarily access the same position on the stream, whereas 107 * repeated searches at a given index in a fixed string will return 108 * consistent results. 109 * 110 * <P> 111 * You can optionally affect the execution environment by using a 112 * combination of execution flags (constants listed below). 113 * 114 * <P> 115 * All operations on a regular expression are performed in a 116 * thread-safe manner. 117 * 118 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A> 119 * @version 1.1.5-dev, to be released 120 */ 121 122 public class RE extends REToken 123 { 124 125 private static final class IntPair implements Serializable 126 { 127 public int first, second; 128 } 129 130 private static final class CharUnit implements Serializable 131 { 132 public char ch; 133 public boolean bk; 134 } 135 136 // This String will be returned by getVersion() 137 private static final String VERSION = "1.1.5-dev"; 138 139 // The localized strings are kept in a separate file 140 // Used by getLocalizedMessage(). 141 private static ResourceBundle messages; 142 143 // Name of the bundle that contains the localized messages. 144 private static final String bundle = "gnu/java/util/regex/MessagesBundle"; 145 146 // These are, respectively, the first and last tokens in our linked list 147 // If there is only one token, firstToken == lastToken 148 private REToken firstToken, lastToken; 149 150 // This is the number of subexpressions in this regular expression, 151 // with a minimum value of zero. Returned by getNumSubs() 152 private int numSubs; 153 154 /** Minimum length, in characters, of any possible match. */ 155 private int minimumLength; 156 private int maximumLength; 157 158 /** 159 * Compilation flag. Do not differentiate case. Subsequent 160 * searches using this RE will be case insensitive. 161 */ 162 public static final int REG_ICASE = 0x02; 163 164 /** 165 * Compilation flag. The match-any-character operator (dot) 166 * will match a newline character. When set this overrides the syntax 167 * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to 168 * the "/s" operator in Perl. 169 */ 170 public static final int REG_DOT_NEWLINE = 0x04; 171 172 /** 173 * Compilation flag. Use multiline mode. In this mode, the ^ and $ 174 * anchors will match based on newlines within the input. This is 175 * equivalent to the "/m" operator in Perl. 176 */ 177 public static final int REG_MULTILINE = 0x08; 178 179 /** 180 * Execution flag. 181 * The match-beginning operator (^) will not match at the beginning 182 * of the input string. Useful for matching on a substring when you 183 * know the context of the input is such that position zero of the 184 * input to the match test is not actually position zero of the text. 185 * <P> 186 * This example demonstrates the results of various ways of matching on 187 * a substring. 188 * <P> 189 * <CODE> 190 * String s = "food bar fool";<BR> 191 * RE exp = new RE("^foo.");<BR> 192 * REMatch m0 = exp.getMatch(s);<BR> 193 * REMatch m1 = exp.getMatch(s.substring(8));<BR> 194 * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR> 195 * REMatch m3 = exp.getMatch(s,8); <BR> 196 * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR> 197 * <P> 198 * // Results:<BR> 199 * // m0.toString(): "food"<BR> 200 * // m1.toString(): "fool"<BR> 201 * // m2.toString(): null<BR> 202 * // m3.toString(): null<BR> 203 * // m4.toString(): "fool"<BR> 204 * </CODE> 205 */ 206 public static final int REG_NOTBOL = 0x10; 207 208 /** 209 * Execution flag. 210 * The match-end operator ($) does not match at the end 211 * of the input string. Useful for matching on substrings. 212 */ 213 public static final int REG_NOTEOL = 0x20; 214 215 /** 216 * Execution flag. 217 * When a match method is invoked that starts matching at a non-zero 218 * index into the input, treat the input as if it begins at the index 219 * given. The effect of this flag is that the engine does not "see" 220 * any text in the input before the given index. This is useful so 221 * that the match-beginning operator (^) matches not at position 0 222 * in the input string, but at the position the search started at 223 * (based on the index input given to the getMatch function). See 224 * the example under REG_NOTBOL. It also affects the use of the \< 225 * and \b operators. 226 */ 227 public static final int REG_ANCHORINDEX = 0x40; 228 229 /** 230 * Execution flag. 231 * The substitute and substituteAll methods will not attempt to 232 * interpolate occurrences of $1-$9 in the replacement text with 233 * the corresponding subexpressions. For example, you may want to 234 * replace all matches of "one dollar" with "$1". 235 */ 236 public static final int REG_NO_INTERPOLATE = 0x80; 237 238 /** 239 * Execution flag. 240 * Try to match the whole input string. An implicit match-end operator 241 * is added to this regexp. 242 */ 243 public static final int REG_TRY_ENTIRE_MATCH = 0x0100; 244 245 /** 246 * Execution flag. 247 * The substitute and substituteAll methods will treat the 248 * character '\' in the replacement as an escape to a literal 249 * character. In this case "\n", "\$", "\\", "\x40" and "\012" 250 * will become "n", "$", "\", "x40" and "012" respectively. 251 * This flag has no effect if REG_NO_INTERPOLATE is set on. 252 */ 253 public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200; 254 255 /** 256 * Compilation flag. Allow whitespace and comments in pattern. 257 * This is equivalent to the "/x" operator in Perl. 258 */ 259 public static final int REG_X_COMMENTS = 0x0400; 260 261 /** 262 * Compilation flag. If set, REG_ICASE is effective only for US-ASCII. 263 */ 264 public static final int REG_ICASE_USASCII = 0x0800; 265 266 /** 267 * Execution flag. 268 * Do not move the position at which the search begins. If not set, 269 * the starting position will be moved until a match is found. 270 */ 271 public static final int REG_FIX_STARTING_POSITION = 0x1000; 272 273 /** Returns a string representing the version of the gnu.regexp package. */ version()274 public static final String version () 275 { 276 return VERSION; 277 } 278 279 // Retrieves a message from the ResourceBundle getLocalizedMessage(String key)280 static final String getLocalizedMessage (String key) 281 { 282 if (messages == null) 283 messages = 284 PropertyResourceBundle.getBundle (bundle, Locale.getDefault ()); 285 return messages.getString (key); 286 } 287 288 /** 289 * Constructs a regular expression pattern buffer without any compilation 290 * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5). 291 * 292 * @param pattern A regular expression pattern, in the form of a String, 293 * StringBuffer or char[]. Other input types will be converted to 294 * strings using the toString() method. 295 * @exception REException The input pattern could not be parsed. 296 * @exception NullPointerException The pattern was null. 297 */ RE(Object pattern)298 public RE (Object pattern) throws REException 299 { 300 this (pattern, 0, RESyntax.RE_SYNTAX_PERL5, 0, 0); 301 } 302 303 /** 304 * Constructs a regular expression pattern buffer using the specified 305 * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5). 306 * 307 * @param pattern A regular expression pattern, in the form of a String, 308 * StringBuffer, or char[]. Other input types will be converted to 309 * strings using the toString() method. 310 * @param cflags The logical OR of any combination of the compilation flags listed above. 311 * @exception REException The input pattern could not be parsed. 312 * @exception NullPointerException The pattern was null. 313 */ RE(Object pattern, int cflags)314 public RE (Object pattern, int cflags) throws REException 315 { 316 this (pattern, cflags, RESyntax.RE_SYNTAX_PERL5, 0, 0); 317 } 318 319 /** 320 * Constructs a regular expression pattern buffer using the specified 321 * compilation flags and regular expression syntax. 322 * 323 * @param pattern A regular expression pattern, in the form of a String, 324 * StringBuffer, or char[]. Other input types will be converted to 325 * strings using the toString() method. 326 * @param cflags The logical OR of any combination of the compilation flags listed above. 327 * @param syntax The type of regular expression syntax to use. 328 * @exception REException The input pattern could not be parsed. 329 * @exception NullPointerException The pattern was null. 330 */ RE(Object pattern, int cflags, RESyntax syntax)331 public RE (Object pattern, int cflags, RESyntax syntax) throws REException 332 { 333 this (pattern, cflags, syntax, 0, 0); 334 } 335 336 // internal constructor used for alternation RE(REToken first, REToken last, int subs, int subIndex, int minLength, int maxLength)337 private RE (REToken first, REToken last, int subs, int subIndex, 338 int minLength, int maxLength) 339 { 340 super (subIndex); 341 firstToken = first; 342 lastToken = last; 343 numSubs = subs; 344 minimumLength = minLength; 345 maximumLength = maxLength; 346 addToken (new RETokenEndSub (subIndex)); 347 } 348 RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub)349 private RE (Object patternObj, int cflags, RESyntax syntax, int myIndex, 350 int nextSub) throws REException 351 { 352 super (myIndex); // Subexpression index of this token. 353 initialize (patternObj, cflags, syntax, myIndex, nextSub); 354 } 355 356 // For use by subclasses RE()357 protected RE () 358 { 359 super (0); 360 } 361 362 // The meat of construction initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub)363 protected void initialize (Object patternObj, int cflags, RESyntax syntax, 364 int myIndex, int nextSub) throws REException 365 { 366 char[] pattern; 367 if (patternObj instanceof String) 368 { 369 pattern = ((String) patternObj).toCharArray (); 370 } 371 else if (patternObj instanceof char[]) 372 { 373 pattern = (char[]) patternObj; 374 } 375 else if (patternObj instanceof StringBuffer) 376 { 377 pattern = new char[((StringBuffer) patternObj).length ()]; 378 ((StringBuffer) patternObj).getChars (0, pattern.length, pattern, 0); 379 } 380 else if (patternObj instanceof StringBuilder) 381 { 382 pattern = new char[((StringBuilder) patternObj).length ()]; 383 ((StringBuilder) patternObj).getChars (0, pattern.length, pattern, 0); 384 } 385 else if (patternObj instanceof CPStringBuilder) 386 { 387 pattern = new char[((CPStringBuilder) patternObj).length ()]; 388 ((CPStringBuilder) patternObj).getChars (0, pattern.length, pattern, 389 0); 390 } 391 else 392 { 393 pattern = patternObj.toString ().toCharArray (); 394 } 395 396 int pLength = pattern.length; 397 398 numSubs = 0; // Number of subexpressions in this token. 399 ArrayList < REToken > branches = null; 400 401 // linked list of tokens (sort of -- some closed loops can exist) 402 firstToken = lastToken = null; 403 404 // Precalculate these so we don't pay for the math every time we 405 // need to access them. 406 boolean insens = ((cflags & REG_ICASE) > 0); 407 boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); 408 409 // Parse pattern into tokens. Does anyone know if it's more efficient 410 // to use char[] than a String.charAt()? I'm assuming so. 411 412 // index tracks the position in the char array 413 int index = 0; 414 415 // this will be the current parse character (pattern[index]) 416 CharUnit unit = new CharUnit (); 417 418 // This is used for {x,y} calculations 419 IntPair minMax = new IntPair (); 420 421 // Buffer a token so we can create a TokenRepeated, etc. 422 REToken currentToken = null; 423 boolean quot = false; 424 425 // Saved syntax and flags. 426 RESyntax savedSyntax = null; 427 int savedCflags = 0; 428 boolean flagsSaved = false; 429 430 while (index < pLength) 431 { 432 // read the next character unit (including backslash escapes) 433 index = getCharUnit (pattern, index, unit, quot); 434 435 if (unit.bk) 436 if (unit.ch == 'Q') 437 { 438 quot = true; 439 continue; 440 } 441 else if (unit.ch == 'E') 442 { 443 quot = false; 444 continue; 445 } 446 if (quot) 447 unit.bk = false; 448 449 if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot)) 450 { 451 if (Character.isWhitespace (unit.ch)) 452 { 453 continue; 454 } 455 if (unit.ch == '#') 456 { 457 for (int i = index; i < pLength; i++) 458 { 459 if (pattern[i] == '\n') 460 { 461 index = i + 1; 462 continue; 463 } 464 else if (pattern[i] == '\r') 465 { 466 if (i + 1 < pLength && pattern[i + 1] == '\n') 467 { 468 index = i + 2; 469 } 470 else 471 { 472 index = i + 1; 473 } 474 continue; 475 } 476 } 477 index = pLength; 478 continue; 479 } 480 } 481 482 // ALTERNATION OPERATOR 483 // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT) 484 // not available if RE_LIMITED_OPS is set 485 486 // TODO: the '\n' literal here should be a test against REToken.newline, 487 // which unfortunately may be more than a single character. 488 if (((unit.ch == '|' 489 && (syntax.get (RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot))) 490 || (syntax.get (RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') 491 && !(unit.bk || quot))) 492 && !syntax.get (RESyntax.RE_LIMITED_OPS)) 493 { 494 // make everything up to here be a branch. create vector if nec. 495 addToken (currentToken); 496 RE theBranch = 497 new RE (firstToken, lastToken, numSubs, subIndex, minimumLength, 498 maximumLength); 499 minimumLength = 0; 500 maximumLength = 0; 501 if (branches == null) 502 { 503 branches = new ArrayList < REToken > (); 504 } 505 branches.add (theBranch); 506 firstToken = lastToken = currentToken = null; 507 } 508 509 // INTERVAL OPERATOR: 510 // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES) 511 // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES) 512 // 513 // OPEN QUESTION: 514 // what is proper interpretation of '{' at start of string? 515 // 516 // This method used to check "repeat.empty.token" to avoid such regexp 517 // as "(a*){2,}", but now "repeat.empty.token" is allowed. 518 519 else if ((unit.ch == '{') && syntax.get (RESyntax.RE_INTERVALS) 520 && (syntax. 521 get (RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot))) 522 { 523 int newIndex = getMinMax (pattern, index, minMax, syntax); 524 if (newIndex > index) 525 { 526 if (minMax.first > minMax.second) 527 throw new 528 REException (getLocalizedMessage ("interval.order"), 529 REException.REG_BADRPT, newIndex); 530 if (currentToken == null) 531 throw new 532 REException (getLocalizedMessage ("repeat.no.token"), 533 REException.REG_BADRPT, newIndex); 534 if (currentToken instanceof RETokenRepeated) 535 throw new 536 REException (getLocalizedMessage ("repeat.chained"), 537 REException.REG_BADRPT, newIndex); 538 if (currentToken instanceof RETokenWordBoundary 539 || currentToken instanceof RETokenWordBoundary) 540 throw new 541 REException (getLocalizedMessage ("repeat.assertion"), 542 REException.REG_BADRPT, newIndex); 543 index = newIndex; 544 currentToken = 545 setRepeated (currentToken, minMax.first, minMax.second, 546 index); 547 } 548 else 549 { 550 addToken (currentToken); 551 currentToken = new RETokenChar (subIndex, unit.ch, insens); 552 if (insensUSASCII) 553 currentToken.unicodeAware = false; 554 } 555 } 556 557 // LIST OPERATOR: 558 // [...] | [^...] 559 560 else if ((unit.ch == '[') && !(unit.bk || quot)) 561 { 562 // Create a new RETokenOneOf 563 ParseCharClassResult result = 564 parseCharClass (subIndex, pattern, index, pLength, cflags, 565 syntax, 0); 566 addToken (currentToken); 567 currentToken = result.token; 568 index = result.index; 569 } 570 571 // SUBEXPRESSIONS 572 // (...) | \(...\) depending on RE_NO_BK_PARENS 573 574 else if ((unit.ch == '(') 575 && (syntax. 576 get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) 577 { 578 boolean pure = false; 579 boolean comment = false; 580 boolean lookAhead = false; 581 boolean lookBehind = false; 582 boolean independent = false; 583 boolean negativelh = false; 584 boolean negativelb = false; 585 if ((index + 1 < pLength) && (pattern[index] == '?')) 586 { 587 switch (pattern[index + 1]) 588 { 589 case '!': 590 if (syntax.get (RESyntax.RE_LOOKAHEAD)) 591 { 592 pure = true; 593 negativelh = true; 594 lookAhead = true; 595 index += 2; 596 } 597 break; 598 case '=': 599 if (syntax.get (RESyntax.RE_LOOKAHEAD)) 600 { 601 pure = true; 602 lookAhead = true; 603 index += 2; 604 } 605 break; 606 case '<': 607 // We assume that if the syntax supports look-ahead, 608 // it also supports look-behind. 609 if (syntax.get (RESyntax.RE_LOOKAHEAD)) 610 { 611 index++; 612 switch (pattern[index + 1]) 613 { 614 case '!': 615 pure = true; 616 negativelb = true; 617 lookBehind = true; 618 index += 2; 619 break; 620 case '=': 621 pure = true; 622 lookBehind = true; 623 index += 2; 624 } 625 } 626 break; 627 case '>': 628 // We assume that if the syntax supports look-ahead, 629 // it also supports independent group. 630 if (syntax.get (RESyntax.RE_LOOKAHEAD)) 631 { 632 pure = true; 633 independent = true; 634 index += 2; 635 } 636 break; 637 case 'i': 638 case 'd': 639 case 'm': 640 case 's': 641 case 'u': 642 case 'x': 643 case '-': 644 if (!syntax.get (RESyntax.RE_EMBEDDED_FLAGS)) 645 break; 646 // Set or reset syntax flags. 647 int flagIndex = index + 1; 648 int endFlag = -1; 649 RESyntax newSyntax = new RESyntax (syntax); 650 int newCflags = cflags; 651 boolean negate = false; 652 while (flagIndex < pLength && endFlag < 0) 653 { 654 switch (pattern[flagIndex]) 655 { 656 case 'i': 657 if (negate) 658 newCflags &= ~REG_ICASE; 659 else 660 newCflags |= REG_ICASE; 661 flagIndex++; 662 break; 663 case 'd': 664 if (negate) 665 newSyntax.setLineSeparator (RESyntax. 666 DEFAULT_LINE_SEPARATOR); 667 else 668 newSyntax.setLineSeparator ("\n"); 669 flagIndex++; 670 break; 671 case 'm': 672 if (negate) 673 newCflags &= ~REG_MULTILINE; 674 else 675 newCflags |= REG_MULTILINE; 676 flagIndex++; 677 break; 678 case 's': 679 if (negate) 680 newCflags &= ~REG_DOT_NEWLINE; 681 else 682 newCflags |= REG_DOT_NEWLINE; 683 flagIndex++; 684 break; 685 case 'u': 686 if (negate) 687 newCflags |= REG_ICASE_USASCII; 688 else 689 newCflags &= ~REG_ICASE_USASCII; 690 flagIndex++; 691 break; 692 case 'x': 693 if (negate) 694 newCflags &= ~REG_X_COMMENTS; 695 else 696 newCflags |= REG_X_COMMENTS; 697 flagIndex++; 698 break; 699 case '-': 700 negate = true; 701 flagIndex++; 702 break; 703 case ':': 704 case ')': 705 endFlag = pattern[flagIndex]; 706 break; 707 default: 708 throw new 709 REException (getLocalizedMessage 710 ("repeat.no.token"), 711 REException.REG_BADRPT, index); 712 } 713 } 714 if (endFlag == ')') 715 { 716 syntax = newSyntax; 717 cflags = newCflags; 718 insens = ((cflags & REG_ICASE) > 0); 719 insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); 720 // This can be treated as though it were a comment. 721 comment = true; 722 index = flagIndex - 1; 723 break; 724 } 725 if (endFlag == ':') 726 { 727 savedSyntax = syntax; 728 savedCflags = cflags; 729 flagsSaved = true; 730 syntax = newSyntax; 731 cflags = newCflags; 732 insens = ((cflags & REG_ICASE) > 0); 733 insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); 734 index = flagIndex - 1; 735 // Fall through to the next case. 736 } 737 else 738 { 739 throw new 740 REException (getLocalizedMessage 741 ("unmatched.paren"), 742 REException.REG_ESUBREG, index); 743 } 744 case ':': 745 if (syntax.get (RESyntax.RE_PURE_GROUPING)) 746 { 747 pure = true; 748 index += 2; 749 } 750 break; 751 case '#': 752 if (syntax.get (RESyntax.RE_COMMENTS)) 753 { 754 comment = true; 755 } 756 break; 757 default: 758 throw new 759 REException (getLocalizedMessage ("repeat.no.token"), 760 REException.REG_BADRPT, index); 761 } 762 } 763 764 if (index >= pLength) 765 { 766 throw new 767 REException (getLocalizedMessage ("unmatched.paren"), 768 REException.REG_ESUBREG, index); 769 } 770 771 // find end of subexpression 772 int endIndex = index; 773 int nextIndex = index; 774 int nested = 0; 775 776 while (((nextIndex = 777 getCharUnit (pattern, endIndex, unit, false)) > 0) 778 && !(nested == 0 && (unit.ch == ')') 779 && (syntax. 780 get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk 781 || quot)))) 782 { 783 if ((endIndex = nextIndex) >= pLength) 784 throw new 785 REException (getLocalizedMessage ("subexpr.no.end"), 786 REException.REG_ESUBREG, nextIndex); 787 else 788 if ((unit.ch == '[') && !(unit.bk || quot)) 789 { 790 // I hate to do something similar to the LIST OPERATOR matters 791 // above, but ... 792 int listIndex = nextIndex; 793 if (listIndex < pLength && pattern[listIndex] == '^') 794 listIndex++; 795 if (listIndex < pLength && pattern[listIndex] == ']') 796 listIndex++; 797 int listEndIndex = -1; 798 int listNest = 0; 799 while (listIndex < pLength && listEndIndex < 0) 800 { 801 switch (pattern[listIndex++]) 802 { 803 case '\\': 804 listIndex++; 805 break; 806 case '[': 807 // Sun's API document says that regexp like "[a-d[m-p]]" 808 // is legal. Even something like "[[[^]]]]" is accepted. 809 listNest++; 810 if (listIndex < pLength 811 && pattern[listIndex] == '^') 812 listIndex++; 813 if (listIndex < pLength 814 && pattern[listIndex] == ']') 815 listIndex++; 816 break; 817 case ']': 818 if (listNest == 0) 819 listEndIndex = listIndex; 820 listNest--; 821 break; 822 } 823 } 824 if (listEndIndex >= 0) 825 { 826 nextIndex = listEndIndex; 827 if ((endIndex = nextIndex) >= pLength) 828 throw new 829 REException (getLocalizedMessage ("subexpr.no.end"), 830 REException.REG_ESUBREG, nextIndex); 831 else 832 continue; 833 } 834 throw new 835 REException (getLocalizedMessage ("subexpr.no.end"), 836 REException.REG_ESUBREG, nextIndex); 837 } 838 else if (unit.ch == '(' 839 && (syntax. 840 get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk 841 || quot))) 842 nested++; 843 else if (unit.ch == ')' 844 && (syntax. 845 get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk 846 || quot))) 847 nested--; 848 } 849 850 // endIndex is now position at a ')','\)' 851 // nextIndex is end of string or position after ')' or '\)' 852 853 if (comment) 854 index = nextIndex; 855 else 856 { // not a comment 857 // create RE subexpression as token. 858 addToken (currentToken); 859 if (!pure) 860 { 861 numSubs++; 862 } 863 864 int useIndex = (pure || lookAhead || lookBehind 865 || independent) ? 0 : nextSub + numSubs; 866 currentToken = 867 new RE (String.valueOf (pattern, index, endIndex - index). 868 toCharArray (), cflags, syntax, useIndex, 869 nextSub + numSubs); 870 numSubs += ((RE) currentToken).getNumSubs (); 871 872 if (lookAhead) 873 { 874 currentToken = 875 new RETokenLookAhead (currentToken, negativelh); 876 } 877 else if (lookBehind) 878 { 879 currentToken = 880 new RETokenLookBehind (currentToken, negativelb); 881 } 882 else if (independent) 883 { 884 currentToken = new RETokenIndependent (currentToken); 885 } 886 887 index = nextIndex; 888 if (flagsSaved) 889 { 890 syntax = savedSyntax; 891 cflags = savedCflags; 892 insens = ((cflags & REG_ICASE) > 0); 893 insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); 894 flagsSaved = false; 895 } 896 } // not a comment 897 } // subexpression 898 899 // UNMATCHED RIGHT PAREN 900 // ) or \) throw exception if 901 // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) 902 else if (!syntax.get (RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) 903 && ((unit.ch == ')') 904 && (syntax. 905 get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))) 906 { 907 throw new REException (getLocalizedMessage ("unmatched.paren"), 908 REException.REG_EPAREN, index); 909 } 910 911 // START OF LINE OPERATOR 912 // ^ 913 914 else if ((unit.ch == '^') && !(unit.bk || quot)) 915 { 916 addToken (currentToken); 917 currentToken = null; 918 RETokenStart token = null; 919 if ((cflags & REG_MULTILINE) > 0) 920 { 921 String sep = syntax.getLineSeparator (); 922 if (sep == null) 923 { 924 token = new RETokenStart (subIndex, null, true); 925 } 926 else 927 { 928 token = new RETokenStart (subIndex, sep); 929 } 930 } 931 else 932 { 933 token = new RETokenStart (subIndex, null); 934 } 935 addToken (token); 936 } 937 938 // END OF LINE OPERATOR 939 // $ 940 941 else if ((unit.ch == '$') && !(unit.bk || quot)) 942 { 943 addToken (currentToken); 944 currentToken = null; 945 RETokenEnd token = null; 946 if ((cflags & REG_MULTILINE) > 0) 947 { 948 String sep = syntax.getLineSeparator (); 949 if (sep == null) 950 { 951 token = new RETokenEnd (subIndex, null, true); 952 } 953 else 954 { 955 token = new RETokenEnd (subIndex, sep); 956 } 957 } 958 else 959 { 960 token = new RETokenEnd (subIndex, null); 961 } 962 addToken (token); 963 } 964 965 // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null) 966 // . 967 968 else if ((unit.ch == '.') && !(unit.bk || quot)) 969 { 970 addToken (currentToken); 971 currentToken = 972 new RETokenAny (subIndex, syntax.get (RESyntax.RE_DOT_NEWLINE) 973 || ((cflags & REG_DOT_NEWLINE) > 0), 974 syntax.get (RESyntax.RE_DOT_NOT_NULL)); 975 } 976 977 // ZERO-OR-MORE REPEAT OPERATOR 978 // * 979 // 980 // This method used to check "repeat.empty.token" to avoid such regexp 981 // as "(a*)*", but now "repeat.empty.token" is allowed. 982 983 else if ((unit.ch == '*') && !(unit.bk || quot)) 984 { 985 if (currentToken == null) 986 throw new REException (getLocalizedMessage ("repeat.no.token"), 987 REException.REG_BADRPT, index); 988 if (currentToken instanceof RETokenRepeated) 989 throw new REException (getLocalizedMessage ("repeat.chained"), 990 REException.REG_BADRPT, index); 991 if (currentToken instanceof RETokenWordBoundary 992 || currentToken instanceof RETokenWordBoundary) 993 throw new REException (getLocalizedMessage ("repeat.assertion"), 994 REException.REG_BADRPT, index); 995 currentToken = 996 setRepeated (currentToken, 0, Integer.MAX_VALUE, index); 997 } 998 999 // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR 1000 // + | \+ depending on RE_BK_PLUS_QM 1001 // not available if RE_LIMITED_OPS is set 1002 // 1003 // This method used to check "repeat.empty.token" to avoid such regexp 1004 // as "(a*)+", but now "repeat.empty.token" is allowed. 1005 1006 else if ((unit.ch == '+') && !syntax.get (RESyntax.RE_LIMITED_OPS) 1007 && (!syntax. 1008 get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) 1009 { 1010 if (currentToken == null) 1011 throw new REException (getLocalizedMessage ("repeat.no.token"), 1012 REException.REG_BADRPT, index); 1013 1014 // Check for possessive matching on RETokenRepeated 1015 if (currentToken instanceof RETokenRepeated) 1016 { 1017 RETokenRepeated tokenRep = (RETokenRepeated) currentToken; 1018 if (syntax.get (RESyntax.RE_POSSESSIVE_OPS) 1019 && !tokenRep.isPossessive () && !tokenRep.isStingy ()) 1020 tokenRep.makePossessive (); 1021 else 1022 throw new 1023 REException (getLocalizedMessage ("repeat.chained"), 1024 REException.REG_BADRPT, index); 1025 1026 } 1027 else if (currentToken instanceof RETokenWordBoundary 1028 || currentToken instanceof RETokenWordBoundary) 1029 throw new REException (getLocalizedMessage ("repeat.assertion"), 1030 REException.REG_BADRPT, index); 1031 else 1032 currentToken = 1033 setRepeated (currentToken, 1, Integer.MAX_VALUE, index); 1034 } 1035 1036 // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR 1037 // ? | \? depending on RE_BK_PLUS_QM 1038 // not available if RE_LIMITED_OPS is set 1039 // stingy matching if RE_STINGY_OPS is set and it follows a quantifier 1040 1041 else if ((unit.ch == '?') && !syntax.get (RESyntax.RE_LIMITED_OPS) 1042 && (!syntax. 1043 get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) 1044 { 1045 if (currentToken == null) 1046 throw new REException (getLocalizedMessage ("repeat.no.token"), 1047 REException.REG_BADRPT, index); 1048 1049 // Check for stingy matching on RETokenRepeated 1050 if (currentToken instanceof RETokenRepeated) 1051 { 1052 RETokenRepeated tokenRep = (RETokenRepeated) currentToken; 1053 if (syntax.get (RESyntax.RE_STINGY_OPS) 1054 && !tokenRep.isStingy () && !tokenRep.isPossessive ()) 1055 tokenRep.makeStingy (); 1056 else 1057 throw new 1058 REException (getLocalizedMessage ("repeat.chained"), 1059 REException.REG_BADRPT, index); 1060 } 1061 else if (currentToken instanceof RETokenWordBoundary 1062 || currentToken instanceof RETokenWordBoundary) 1063 throw new REException (getLocalizedMessage ("repeat.assertion"), 1064 REException.REG_BADRPT, index); 1065 else 1066 currentToken = setRepeated (currentToken, 0, 1, index); 1067 } 1068 1069 // OCTAL CHARACTER 1070 // \0377 1071 1072 else if (unit.bk && (unit.ch == '0') 1073 && syntax.get (RESyntax.RE_OCTAL_CHAR)) 1074 { 1075 CharExpression ce = 1076 getCharExpression (pattern, index - 2, pLength, syntax); 1077 if (ce == null) 1078 throw new REException ("invalid octal character", 1079 REException.REG_ESCAPE, index); 1080 index = index - 2 + ce.len; 1081 addToken (currentToken); 1082 currentToken = new RETokenChar (subIndex, ce.ch, insens); 1083 if (insensUSASCII) 1084 currentToken.unicodeAware = false; 1085 } 1086 1087 // BACKREFERENCE OPERATOR 1088 // \1 \2 ... \9 and \10 \11 \12 ... 1089 // not available if RE_NO_BK_REFS is set 1090 // Perl recognizes \10, \11, and so on only if enough number of 1091 // parentheses have opened before it, otherwise they are treated 1092 // as aliases of \010, \011, ... (octal characters). In case of 1093 // Sun's JDK, octal character expression must always begin with \0. 1094 // We will do as JDK does. But FIXME, take a look at "(a)(b)\29". 1095 // JDK treats \2 as a back reference to the 2nd group because 1096 // there are only two groups. But in our poor implementation, 1097 // we cannot help but treat \29 as a back reference to the 29th group. 1098 1099 else if (unit.bk && Character.isDigit (unit.ch) 1100 && !syntax.get (RESyntax.RE_NO_BK_REFS)) 1101 { 1102 addToken (currentToken); 1103 int numBegin = index - 1; 1104 int numEnd = pLength; 1105 for (int i = index; i < pLength; i++) 1106 { 1107 if (!Character.isDigit (pattern[i])) 1108 { 1109 numEnd = i; 1110 break; 1111 } 1112 } 1113 int num = parseInt (pattern, numBegin, numEnd - numBegin, 10); 1114 1115 currentToken = new RETokenBackRef (subIndex, num, insens); 1116 if (insensUSASCII) 1117 currentToken.unicodeAware = false; 1118 index = numEnd; 1119 } 1120 1121 // START OF STRING OPERATOR 1122 // \A if RE_STRING_ANCHORS is set 1123 1124 else if (unit.bk && (unit.ch == 'A') 1125 && syntax.get (RESyntax.RE_STRING_ANCHORS)) 1126 { 1127 addToken (currentToken); 1128 currentToken = new RETokenStart (subIndex, null); 1129 } 1130 1131 // WORD BREAK OPERATOR 1132 // \b if ???? 1133 1134 else if (unit.bk && (unit.ch == 'b') 1135 && syntax.get (RESyntax.RE_STRING_ANCHORS)) 1136 { 1137 addToken (currentToken); 1138 currentToken = 1139 new RETokenWordBoundary (subIndex, 1140 RETokenWordBoundary. 1141 BEGIN | RETokenWordBoundary.END, 1142 false); 1143 } 1144 1145 // WORD BEGIN OPERATOR 1146 // \< if ???? 1147 else if (unit.bk && (unit.ch == '<')) 1148 { 1149 addToken (currentToken); 1150 currentToken = 1151 new RETokenWordBoundary (subIndex, RETokenWordBoundary.BEGIN, 1152 false); 1153 } 1154 1155 // WORD END OPERATOR 1156 // \> if ???? 1157 else if (unit.bk && (unit.ch == '>')) 1158 { 1159 addToken (currentToken); 1160 currentToken = 1161 new RETokenWordBoundary (subIndex, RETokenWordBoundary.END, 1162 false); 1163 } 1164 1165 // NON-WORD BREAK OPERATOR 1166 // \B if ???? 1167 1168 else if (unit.bk && (unit.ch == 'B') 1169 && syntax.get (RESyntax.RE_STRING_ANCHORS)) 1170 { 1171 addToken (currentToken); 1172 currentToken = 1173 new RETokenWordBoundary (subIndex, 1174 RETokenWordBoundary. 1175 BEGIN | RETokenWordBoundary.END, true); 1176 } 1177 1178 1179 // DIGIT OPERATOR 1180 // \d if RE_CHAR_CLASS_ESCAPES is set 1181 1182 else if (unit.bk && (unit.ch == 'd') 1183 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) 1184 { 1185 addToken (currentToken); 1186 currentToken = 1187 new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, false); 1188 if (insensUSASCII) 1189 currentToken.unicodeAware = false; 1190 } 1191 1192 // NON-DIGIT OPERATOR 1193 // \D 1194 1195 else if (unit.bk && (unit.ch == 'D') 1196 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) 1197 { 1198 addToken (currentToken); 1199 currentToken = 1200 new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, true); 1201 if (insensUSASCII) 1202 currentToken.unicodeAware = false; 1203 } 1204 1205 // NEWLINE ESCAPE 1206 // \n 1207 1208 else if (unit.bk && (unit.ch == 'n')) 1209 { 1210 addToken (currentToken); 1211 currentToken = new RETokenChar (subIndex, '\n', false); 1212 } 1213 1214 // RETURN ESCAPE 1215 // \r 1216 1217 else if (unit.bk && (unit.ch == 'r')) 1218 { 1219 addToken (currentToken); 1220 currentToken = new RETokenChar (subIndex, '\r', false); 1221 } 1222 1223 // WHITESPACE OPERATOR 1224 // \s if RE_CHAR_CLASS_ESCAPES is set 1225 1226 else if (unit.bk && (unit.ch == 's') 1227 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) 1228 { 1229 addToken (currentToken); 1230 currentToken = 1231 new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, false); 1232 if (insensUSASCII) 1233 currentToken.unicodeAware = false; 1234 } 1235 1236 // NON-WHITESPACE OPERATOR 1237 // \S 1238 1239 else if (unit.bk && (unit.ch == 'S') 1240 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) 1241 { 1242 addToken (currentToken); 1243 currentToken = 1244 new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, true); 1245 if (insensUSASCII) 1246 currentToken.unicodeAware = false; 1247 } 1248 1249 // TAB ESCAPE 1250 // \t 1251 1252 else if (unit.bk && (unit.ch == 't')) 1253 { 1254 addToken (currentToken); 1255 currentToken = new RETokenChar (subIndex, '\t', false); 1256 } 1257 1258 // ALPHANUMERIC OPERATOR 1259 // \w 1260 1261 else if (unit.bk && (unit.ch == 'w') 1262 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) 1263 { 1264 addToken (currentToken); 1265 currentToken = 1266 new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, false); 1267 if (insensUSASCII) 1268 currentToken.unicodeAware = false; 1269 } 1270 1271 // NON-ALPHANUMERIC OPERATOR 1272 // \W 1273 1274 else if (unit.bk && (unit.ch == 'W') 1275 && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) 1276 { 1277 addToken (currentToken); 1278 currentToken = 1279 new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, true); 1280 if (insensUSASCII) 1281 currentToken.unicodeAware = false; 1282 } 1283 1284 // END OF STRING OPERATOR 1285 // \Z, \z 1286 1287 // FIXME: \Z and \z are different in that if the input string 1288 // ends with a line terminator, \Z matches the position before 1289 // the final terminator. This special behavior of \Z is yet 1290 // to be implemented. 1291 1292 else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') && 1293 syntax.get (RESyntax.RE_STRING_ANCHORS)) 1294 { 1295 addToken (currentToken); 1296 currentToken = new RETokenEnd (subIndex, null); 1297 } 1298 1299 // HEX CHARACTER, UNICODE CHARACTER 1300 // \x1B, \u1234 1301 1302 else 1303 if ((unit.bk && (unit.ch == 'x') 1304 && syntax.get (RESyntax.RE_HEX_CHAR)) || (unit.bk 1305 && (unit.ch == 'u') 1306 && syntax. 1307 get (RESyntax. 1308 RE_UNICODE_CHAR))) 1309 { 1310 CharExpression ce = 1311 getCharExpression (pattern, index - 2, pLength, syntax); 1312 if (ce == null) 1313 throw new REException ("invalid hex character", 1314 REException.REG_ESCAPE, index); 1315 index = index - 2 + ce.len; 1316 addToken (currentToken); 1317 currentToken = new RETokenChar (subIndex, ce.ch, insens); 1318 if (insensUSASCII) 1319 currentToken.unicodeAware = false; 1320 } 1321 1322 // NAMED PROPERTY 1323 // \p{prop}, \P{prop} 1324 1325 else 1326 if ((unit.bk && (unit.ch == 'p') 1327 && syntax.get (RESyntax.RE_NAMED_PROPERTY)) || (unit.bk 1328 && (unit.ch == 1329 'P') 1330 && syntax. 1331 get (RESyntax. 1332 RE_NAMED_PROPERTY))) 1333 { 1334 NamedProperty np = getNamedProperty (pattern, index - 2, pLength); 1335 if (np == null) 1336 throw new REException ("invalid escape sequence", 1337 REException.REG_ESCAPE, index); 1338 index = index - 2 + np.len; 1339 addToken (currentToken); 1340 currentToken = 1341 getRETokenNamedProperty (subIndex, np, insens, index); 1342 if (insensUSASCII) 1343 currentToken.unicodeAware = false; 1344 } 1345 1346 // END OF PREVIOUS MATCH 1347 // \G 1348 1349 else if (unit.bk && (unit.ch == 'G') && 1350 syntax.get (RESyntax.RE_STRING_ANCHORS)) 1351 { 1352 addToken (currentToken); 1353 currentToken = new RETokenEndOfPreviousMatch (subIndex); 1354 } 1355 1356 // NON-SPECIAL CHARACTER (or escape to make literal) 1357 // c | \* for example 1358 1359 else 1360 { // not a special character 1361 addToken (currentToken); 1362 currentToken = new RETokenChar (subIndex, unit.ch, insens); 1363 if (insensUSASCII) 1364 currentToken.unicodeAware = false; 1365 } 1366 } // end while 1367 1368 // Add final buffered token and an EndSub marker 1369 addToken (currentToken); 1370 1371 if (branches != null) 1372 { 1373 branches. 1374 add (new 1375 RE (firstToken, lastToken, numSubs, subIndex, minimumLength, 1376 maximumLength)); 1377 branches.trimToSize (); // compact the Vector 1378 minimumLength = 0; 1379 maximumLength = 0; 1380 firstToken = lastToken = null; 1381 addToken (new RETokenOneOf (subIndex, branches, false)); 1382 } 1383 else 1384 addToken (new RETokenEndSub (subIndex)); 1385 1386 } 1387 1388 private static class ParseCharClassResult 1389 { 1390 RETokenOneOf token; 1391 int index; 1392 boolean returnAtAndOperator = false; 1393 } 1394 1395 /** 1396 * Parse [...] or [^...] and make an RETokenOneOf instance. 1397 * @param subIndex subIndex to be given to the created RETokenOneOf instance. 1398 * @param pattern Input array of characters to be parsed. 1399 * @param index Index pointing to the character next to the beginning '['. 1400 * @param pLength Limit of the input array. 1401 * @param cflags Compilation flags used to parse the pattern. 1402 * @param pflags Flags that affect the behavior of this method. 1403 * @param syntax Syntax used to parse the pattern. 1404 */ parseCharClass(int subIndex, char[]pattern, int index, int pLength, int cflags, RESyntax syntax, int pflags)1405 private static ParseCharClassResult parseCharClass (int subIndex, 1406 char[]pattern, 1407 int index, int pLength, 1408 int cflags, 1409 RESyntax syntax, 1410 int pflags) throws 1411 REException 1412 { 1413 1414 boolean insens = ((cflags & REG_ICASE) > 0); 1415 boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); 1416 final ArrayList < REToken > options = new ArrayList < REToken > (); 1417 ArrayList < Object > addition = new ArrayList < Object > (); 1418 boolean additionAndAppeared = false; 1419 final int RETURN_AT_AND = 0x01; 1420 boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0); 1421 boolean negative = false; 1422 char ch; 1423 1424 char lastChar = 0; 1425 boolean lastCharIsSet = false; 1426 if (index == pLength) 1427 throw new REException (getLocalizedMessage ("unmatched.bracket"), 1428 REException.REG_EBRACK, index); 1429 1430 // Check for initial caret, negation 1431 if ((ch = pattern[index]) == '^') 1432 { 1433 negative = true; 1434 if (++index == pLength) 1435 throw new REException (getLocalizedMessage ("class.no.end"), 1436 REException.REG_EBRACK, index); 1437 ch = pattern[index]; 1438 } 1439 1440 // Check for leading right bracket literal 1441 if (ch == ']') 1442 { 1443 lastChar = ch; 1444 lastCharIsSet = true; 1445 if (++index == pLength) 1446 throw new REException (getLocalizedMessage ("class.no.end"), 1447 REException.REG_EBRACK, index); 1448 } 1449 1450 while ((ch = pattern[index++]) != ']') 1451 { 1452 if ((ch == '-') && (lastCharIsSet)) 1453 { 1454 if (index == pLength) 1455 throw new REException (getLocalizedMessage ("class.no.end"), 1456 REException.REG_EBRACK, index); 1457 if ((ch = pattern[index]) == ']') 1458 { 1459 RETokenChar t = new RETokenChar (subIndex, lastChar, insens); 1460 if (insensUSASCII) 1461 t.unicodeAware = false; 1462 options.add (t); 1463 lastChar = '-'; 1464 } 1465 else 1466 { 1467 if ((ch == '\\') 1468 && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) 1469 { 1470 CharExpression ce = 1471 getCharExpression (pattern, index, pLength, syntax); 1472 if (ce == null) 1473 throw new REException ("invalid escape sequence", 1474 REException.REG_ESCAPE, index); 1475 ch = ce.ch; 1476 index = index + ce.len - 1; 1477 } 1478 RETokenRange t = 1479 new RETokenRange (subIndex, lastChar, ch, insens); 1480 if (insensUSASCII) 1481 t.unicodeAware = false; 1482 options.add (t); 1483 lastChar = 0; 1484 lastCharIsSet = false; 1485 index++; 1486 } 1487 } 1488 else if ((ch == '\\') 1489 && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) 1490 { 1491 if (index == pLength) 1492 throw new REException (getLocalizedMessage ("class.no.end"), 1493 REException.REG_EBRACK, index); 1494 int posixID = -1; 1495 boolean negate = false; 1496 char asciiEsc = 0; 1497 boolean asciiEscIsSet = false; 1498 NamedProperty np = null; 1499 if (("dswDSW".indexOf (pattern[index]) != -1) 1500 && syntax.get (RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) 1501 { 1502 switch (pattern[index]) 1503 { 1504 case 'D': 1505 negate = true; 1506 case 'd': 1507 posixID = RETokenPOSIX.DIGIT; 1508 break; 1509 case 'S': 1510 negate = true; 1511 case 's': 1512 posixID = RETokenPOSIX.SPACE; 1513 break; 1514 case 'W': 1515 negate = true; 1516 case 'w': 1517 posixID = RETokenPOSIX.ALNUM; 1518 break; 1519 } 1520 } 1521 if (("pP".indexOf (pattern[index]) != -1) 1522 && syntax.get (RESyntax.RE_NAMED_PROPERTY)) 1523 { 1524 np = getNamedProperty (pattern, index - 1, pLength); 1525 if (np == null) 1526 throw new REException ("invalid escape sequence", 1527 REException.REG_ESCAPE, index); 1528 index = index - 1 + np.len - 1; 1529 } 1530 else 1531 { 1532 CharExpression ce = 1533 getCharExpression (pattern, index - 1, pLength, syntax); 1534 if (ce == null) 1535 throw new REException ("invalid escape sequence", 1536 REException.REG_ESCAPE, index); 1537 asciiEsc = ce.ch; 1538 asciiEscIsSet = true; 1539 index = index - 1 + ce.len - 1; 1540 } 1541 if (lastCharIsSet) 1542 { 1543 RETokenChar t = new RETokenChar (subIndex, lastChar, insens); 1544 if (insensUSASCII) 1545 t.unicodeAware = false; 1546 options.add (t); 1547 } 1548 1549 if (posixID != -1) 1550 { 1551 RETokenPOSIX t = 1552 new RETokenPOSIX (subIndex, posixID, insens, negate); 1553 if (insensUSASCII) 1554 t.unicodeAware = false; 1555 options.add (t); 1556 } 1557 else if (np != null) 1558 { 1559 RETokenNamedProperty t = 1560 getRETokenNamedProperty (subIndex, np, insens, index); 1561 if (insensUSASCII) 1562 t.unicodeAware = false; 1563 options.add (t); 1564 } 1565 else if (asciiEscIsSet) 1566 { 1567 lastChar = asciiEsc; 1568 lastCharIsSet = true; 1569 } 1570 else 1571 { 1572 lastChar = pattern[index]; 1573 lastCharIsSet = true; 1574 } 1575 ++index; 1576 } 1577 else if ((ch == '[') && (syntax.get (RESyntax.RE_CHAR_CLASSES)) 1578 && (index < pLength) && (pattern[index] == ':')) 1579 { 1580 CPStringBuilder posixSet = new CPStringBuilder (); 1581 index = getPosixSet (pattern, index + 1, posixSet); 1582 int posixId = RETokenPOSIX.intValue (posixSet.toString ()); 1583 if (posixId != -1) 1584 { 1585 RETokenPOSIX t = 1586 new RETokenPOSIX (subIndex, posixId, insens, false); 1587 if (insensUSASCII) 1588 t.unicodeAware = false; 1589 options.add (t); 1590 } 1591 } 1592 else if ((ch == '[') && (syntax.get (RESyntax.RE_NESTED_CHARCLASS))) 1593 { 1594 ParseCharClassResult result = 1595 parseCharClass (subIndex, pattern, index, pLength, cflags, 1596 syntax, 0); 1597 addition.add (result.token); 1598 addition.add ("|"); 1599 index = result.index; 1600 } 1601 else if ((ch == '&') && 1602 (syntax.get (RESyntax.RE_NESTED_CHARCLASS)) && 1603 (index < pLength) && (pattern[index] == '&')) 1604 { 1605 if (returnAtAndOperator) 1606 { 1607 ParseCharClassResult result = new ParseCharClassResult (); 1608 options.trimToSize (); 1609 if (additionAndAppeared) 1610 addition.add ("&"); 1611 if (addition.size () == 0) 1612 addition = null; 1613 result.token = new RETokenOneOf (subIndex, 1614 options, addition, negative); 1615 result.index = index - 1; 1616 result.returnAtAndOperator = true; 1617 return result; 1618 } 1619 // The precedence of the operator "&&" is the lowest. 1620 // So we postpone adding "&" until other elements 1621 // are added. And we insert Boolean.FALSE at the 1622 // beginning of the list of tokens following "&&". 1623 // So, "&&[a-b][k-m]" will be stored in the Vecter 1624 // addition in this order: 1625 // Boolean.FALSE, [a-b], "|", [k-m], "|", "&" 1626 if (additionAndAppeared) 1627 addition.add ("&"); 1628 addition.add (Boolean.FALSE); 1629 additionAndAppeared = true; 1630 1631 // The part on which "&&" operates may be either 1632 // (1) explicitly enclosed by [] 1633 // or 1634 // (2) not enclosed by [] and terminated by the 1635 // next "&&" or the end of the character list. 1636 // Let the preceding else if block do the case (1). 1637 // We must do something in case of (2). 1638 if ((index + 1 < pLength) && (pattern[index + 1] != '[')) 1639 { 1640 ParseCharClassResult result = 1641 parseCharClass (subIndex, pattern, index + 1, pLength, 1642 cflags, syntax, 1643 RETURN_AT_AND); 1644 addition.add (result.token); 1645 addition.add ("|"); 1646 // If the method returned at the next "&&", it is OK. 1647 // Otherwise we have eaten the mark of the end of this 1648 // character list "]". In this case we must give back 1649 // the end mark. 1650 index = (result.returnAtAndOperator ? 1651 result.index : result.index - 1); 1652 } 1653 } 1654 else 1655 { 1656 if (lastCharIsSet) 1657 { 1658 RETokenChar t = new RETokenChar (subIndex, lastChar, insens); 1659 if (insensUSASCII) 1660 t.unicodeAware = false; 1661 options.add (t); 1662 } 1663 lastChar = ch; 1664 lastCharIsSet = true; 1665 } 1666 if (index == pLength) 1667 throw new REException (getLocalizedMessage ("class.no.end"), 1668 REException.REG_EBRACK, index); 1669 } // while in list 1670 // Out of list, index is one past ']' 1671 1672 if (lastCharIsSet) 1673 { 1674 RETokenChar t = new RETokenChar (subIndex, lastChar, insens); 1675 if (insensUSASCII) 1676 t.unicodeAware = false; 1677 options.add (t); 1678 } 1679 1680 ParseCharClassResult result = new ParseCharClassResult (); 1681 // Create a new RETokenOneOf 1682 options.trimToSize (); 1683 if (additionAndAppeared) 1684 addition.add ("&"); 1685 if (addition.size () == 0) 1686 addition = null; 1687 result.token = new RETokenOneOf (subIndex, options, addition, negative); 1688 result.index = index; 1689 return result; 1690 } 1691 getCharUnit(char[]input, int index, CharUnit unit, boolean quot)1692 private static int getCharUnit (char[]input, int index, CharUnit unit, 1693 boolean quot) throws REException 1694 { 1695 unit.ch = input[index++]; 1696 unit.bk = (unit.ch == '\\' 1697 && (!quot || index >= input.length || input[index] == 'E')); 1698 if (unit.bk) 1699 if (index < input.length) 1700 unit.ch = input[index++]; 1701 else 1702 throw new REException (getLocalizedMessage ("ends.with.backslash"), 1703 REException.REG_ESCAPE, index); 1704 return index; 1705 } 1706 parseInt(char[]input, int pos, int len, int radix)1707 private static int parseInt (char[]input, int pos, int len, int radix) 1708 { 1709 int ret = 0; 1710 for (int i = pos; i < pos + len; i++) 1711 { 1712 ret = ret * radix + Character.digit (input[i], radix); 1713 } 1714 return ret; 1715 } 1716 1717 /** 1718 * This class represents various expressions for a character. 1719 * "a" : 'a' itself. 1720 * "\0123" : Octal char 0123 1721 * "\x1b" : Hex char 0x1b 1722 * "\u1234" : Unicode char \u1234 1723 */ 1724 private static class CharExpression 1725 { 1726 /** character represented by this expression */ 1727 char ch; 1728 /** String expression */ 1729 String expr; 1730 /** length of this expression */ 1731 int len; toString()1732 public String toString () 1733 { 1734 return expr; 1735 } 1736 } 1737 getCharExpression(char[]input, int pos, int lim, RESyntax syntax)1738 private static CharExpression getCharExpression (char[]input, int pos, 1739 int lim, RESyntax syntax) 1740 { 1741 CharExpression ce = new CharExpression (); 1742 char c = input[pos]; 1743 if (c == '\\') 1744 { 1745 if (pos + 1 >= lim) 1746 return null; 1747 c = input[pos + 1]; 1748 switch (c) 1749 { 1750 case 't': 1751 ce.ch = '\t'; 1752 ce.len = 2; 1753 break; 1754 case 'n': 1755 ce.ch = '\n'; 1756 ce.len = 2; 1757 break; 1758 case 'r': 1759 ce.ch = '\r'; 1760 ce.len = 2; 1761 break; 1762 case 'x': 1763 case 'u': 1764 if ((c == 'x' && syntax.get (RESyntax.RE_HEX_CHAR)) || 1765 (c == 'u' && syntax.get (RESyntax.RE_UNICODE_CHAR))) 1766 { 1767 int l = 0; 1768 int expectedLength = (c == 'x' ? 2 : 4); 1769 for (int i = pos + 2; i < pos + 2 + expectedLength; i++) 1770 { 1771 if (i >= lim) 1772 break; 1773 if (!((input[i] >= '0' && input[i] <= '9') || 1774 (input[i] >= 'A' && input[i] <= 'F') || 1775 (input[i] >= 'a' && input[i] <= 'f'))) 1776 break; 1777 l++; 1778 } 1779 if (l != expectedLength) 1780 return null; 1781 ce.ch = (char) (parseInt (input, pos + 2, l, 16)); 1782 ce.len = l + 2; 1783 } 1784 else 1785 { 1786 ce.ch = c; 1787 ce.len = 2; 1788 } 1789 break; 1790 case '0': 1791 if (syntax.get (RESyntax.RE_OCTAL_CHAR)) 1792 { 1793 int l = 0; 1794 for (int i = pos + 2; i < pos + 2 + 3; i++) 1795 { 1796 if (i >= lim) 1797 break; 1798 if (input[i] < '0' || input[i] > '7') 1799 break; 1800 l++; 1801 } 1802 if (l == 3 && input[pos + 2] > '3') 1803 l--; 1804 if (l <= 0) 1805 return null; 1806 ce.ch = (char) (parseInt (input, pos + 2, l, 8)); 1807 ce.len = l + 2; 1808 } 1809 else 1810 { 1811 ce.ch = c; 1812 ce.len = 2; 1813 } 1814 break; 1815 default: 1816 ce.ch = c; 1817 ce.len = 2; 1818 break; 1819 } 1820 } 1821 else 1822 { 1823 ce.ch = input[pos]; 1824 ce.len = 1; 1825 } 1826 ce.expr = new String (input, pos, ce.len); 1827 return ce; 1828 } 1829 1830 /** 1831 * This class represents a substring in a pattern string expressing 1832 * a named property. 1833 * "\pA" : Property named "A" 1834 * "\p{prop}" : Property named "prop" 1835 * "\PA" : Property named "A" (Negated) 1836 * "\P{prop}" : Property named "prop" (Negated) 1837 */ 1838 private static class NamedProperty 1839 { 1840 /** Property name */ 1841 String name; 1842 /** Negated or not */ 1843 boolean negate; 1844 /** length of this expression */ 1845 int len; 1846 } 1847 getNamedProperty(char[]input, int pos, int lim)1848 private static NamedProperty getNamedProperty (char[]input, int pos, 1849 int lim) 1850 { 1851 NamedProperty np = new NamedProperty (); 1852 char c = input[pos]; 1853 if (c == '\\') 1854 { 1855 if (++pos >= lim) 1856 return null; 1857 c = input[pos++]; 1858 switch (c) 1859 { 1860 case 'p': 1861 np.negate = false; 1862 break; 1863 case 'P': 1864 np.negate = true; 1865 break; 1866 default: 1867 return null; 1868 } 1869 c = input[pos++]; 1870 if (c == '{') 1871 { 1872 int p = -1; 1873 for (int i = pos; i < lim; i++) 1874 { 1875 if (input[i] == '}') 1876 { 1877 p = i; 1878 break; 1879 } 1880 } 1881 if (p < 0) 1882 return null; 1883 int len = p - pos; 1884 np.name = new String (input, pos, len); 1885 np.len = len + 4; 1886 } 1887 else 1888 { 1889 np.name = new String (input, pos - 1, 1); 1890 np.len = 3; 1891 } 1892 return np; 1893 } 1894 else 1895 return null; 1896 } 1897 getRETokenNamedProperty(int subIndex, NamedProperty np, boolean insens, int index)1898 private static RETokenNamedProperty getRETokenNamedProperty (int subIndex, 1899 NamedProperty 1900 np, 1901 boolean insens, 1902 int index) 1903 throws REException 1904 { 1905 try 1906 { 1907 return new RETokenNamedProperty (subIndex, np.name, insens, np.negate); 1908 } 1909 catch (REException e) 1910 { 1911 REException ree; 1912 ree = new REException (e.getMessage (), REException.REG_ESCAPE, index); 1913 ree.initCause (e); 1914 throw ree; 1915 } 1916 } 1917 1918 /** 1919 * Checks if the regular expression matches the input in its entirety. 1920 * 1921 * @param input The input text. 1922 */ isMatch(Object input)1923 public boolean isMatch (Object input) 1924 { 1925 return isMatch (input, 0, 0); 1926 } 1927 1928 /** 1929 * Checks if the input string, starting from index, is an exact match of 1930 * this regular expression. 1931 * 1932 * @param input The input text. 1933 * @param index The offset index at which the search should be begin. 1934 */ isMatch(Object input, int index)1935 public boolean isMatch (Object input, int index) 1936 { 1937 return isMatch (input, index, 0); 1938 } 1939 1940 1941 /** 1942 * Checks if the input, starting from index and using the specified 1943 * execution flags, is an exact match of this regular expression. 1944 * 1945 * @param input The input text. 1946 * @param index The offset index at which the search should be begin. 1947 * @param eflags The logical OR of any execution flags above. 1948 */ isMatch(Object input, int index, int eflags)1949 public boolean isMatch (Object input, int index, int eflags) 1950 { 1951 return isMatchImpl (makeCharIndexed (input, index), index, eflags); 1952 } 1953 isMatchImpl(CharIndexed input, int index, int eflags)1954 private boolean isMatchImpl (CharIndexed input, int index, int eflags) 1955 { 1956 if (firstToken == null) // Trivial case 1957 return (input.charAt (0) == CharIndexed.OUT_OF_BOUNDS); 1958 REMatch m = new REMatch (numSubs, index, eflags); 1959 if (firstToken.match (input, m)) 1960 { 1961 if (m != null) 1962 { 1963 if (input.charAt (m.index) == CharIndexed.OUT_OF_BOUNDS) 1964 { 1965 return true; 1966 } 1967 } 1968 } 1969 return false; 1970 } 1971 1972 /** 1973 * Returns the maximum number of subexpressions in this regular expression. 1974 * If the expression contains branches, the value returned will be the 1975 * maximum subexpressions in any of the branches. 1976 */ getNumSubs()1977 public int getNumSubs () 1978 { 1979 return numSubs; 1980 } 1981 1982 // Overrides REToken.setUncle setUncle(REToken uncle)1983 void setUncle (REToken uncle) 1984 { 1985 if (lastToken != null) 1986 { 1987 lastToken.setUncle (uncle); 1988 } 1989 else 1990 super.setUncle (uncle); // to deal with empty subexpressions 1991 } 1992 1993 // Overrides REToken.chain 1994 chain(REToken next)1995 boolean chain (REToken next) 1996 { 1997 super.chain (next); 1998 setUncle (next); 1999 return true; 2000 } 2001 2002 /** 2003 * Returns the minimum number of characters that could possibly 2004 * constitute a match of this regular expression. 2005 */ getMinimumLength()2006 public int getMinimumLength () 2007 { 2008 return minimumLength; 2009 } 2010 getMaximumLength()2011 public int getMaximumLength () 2012 { 2013 return maximumLength; 2014 } 2015 2016 /** 2017 * Returns an array of all matches found in the input. 2018 * 2019 * If the regular expression allows the empty string to match, it will 2020 * substitute matches at all positions except the end of the input. 2021 * 2022 * @param input The input text. 2023 * @return a non-null (but possibly zero-length) array of matches 2024 */ getAllMatches(Object input)2025 public REMatch[] getAllMatches (Object input) 2026 { 2027 return getAllMatches (input, 0, 0); 2028 } 2029 2030 /** 2031 * Returns an array of all matches found in the input, 2032 * beginning at the specified index position. 2033 * 2034 * If the regular expression allows the empty string to match, it will 2035 * substitute matches at all positions except the end of the input. 2036 * 2037 * @param input The input text. 2038 * @param index The offset index at which the search should be begin. 2039 * @return a non-null (but possibly zero-length) array of matches 2040 */ getAllMatches(Object input, int index)2041 public REMatch[] getAllMatches (Object input, int index) 2042 { 2043 return getAllMatches (input, index, 0); 2044 } 2045 2046 /** 2047 * Returns an array of all matches found in the input string, 2048 * beginning at the specified index position and using the specified 2049 * execution flags. 2050 * 2051 * If the regular expression allows the empty string to match, it will 2052 * substitute matches at all positions except the end of the input. 2053 * 2054 * @param input The input text. 2055 * @param index The offset index at which the search should be begin. 2056 * @param eflags The logical OR of any execution flags above. 2057 * @return a non-null (but possibly zero-length) array of matches 2058 */ getAllMatches(Object input, int index, int eflags)2059 public REMatch[] getAllMatches (Object input, int index, int eflags) 2060 { 2061 return getAllMatchesImpl (makeCharIndexed (input, index), index, eflags); 2062 } 2063 2064 // this has been changed since 1.03 to be non-overlapping matches getAllMatchesImpl(CharIndexed input, int index, int eflags)2065 private REMatch[] getAllMatchesImpl (CharIndexed input, int index, 2066 int eflags) 2067 { 2068 List < REMatch > all = new ArrayList < REMatch > (); 2069 REMatch m = null; 2070 while ((m = getMatchImpl (input, index, eflags, null)) != null) 2071 { 2072 all.add (m); 2073 index = m.getEndIndex (); 2074 if (m.end[0] == 0) 2075 { // handle pathological case of zero-length match 2076 index++; 2077 input.move (1); 2078 } 2079 else 2080 { 2081 input.move (m.end[0]); 2082 } 2083 if (!input.isValid ()) 2084 break; 2085 } 2086 return all.toArray (new REMatch[all.size ()]); 2087 } 2088 2089 /* Implements abstract method REToken.match() */ match(CharIndexed input, REMatch mymatch)2090 boolean match (CharIndexed input, REMatch mymatch) 2091 { 2092 input.setHitEnd (mymatch); 2093 if (firstToken == null) 2094 { 2095 return next (input, mymatch); 2096 } 2097 2098 // Note the start of this subexpression 2099 mymatch.start1[subIndex] = mymatch.index; 2100 2101 return firstToken.match (input, mymatch); 2102 } 2103 findMatch(CharIndexed input, REMatch mymatch)2104 REMatch findMatch (CharIndexed input, REMatch mymatch) 2105 { 2106 if (mymatch.backtrackStack == null) 2107 mymatch.backtrackStack = new BacktrackStack (); 2108 boolean b = match (input, mymatch); 2109 if (b) 2110 { 2111 return mymatch; 2112 } 2113 return null; 2114 } 2115 2116 /** 2117 * Returns the first match found in the input. If no match is found, 2118 * null is returned. 2119 * 2120 * @param input The input text. 2121 * @return An REMatch instance referencing the match, or null if none. 2122 */ getMatch(Object input)2123 public REMatch getMatch (Object input) 2124 { 2125 return getMatch (input, 0, 0); 2126 } 2127 2128 /** 2129 * Returns the first match found in the input, beginning 2130 * the search at the specified index. If no match is found, 2131 * returns null. 2132 * 2133 * @param input The input text. 2134 * @param index The offset within the text to begin looking for a match. 2135 * @return An REMatch instance referencing the match, or null if none. 2136 */ getMatch(Object input, int index)2137 public REMatch getMatch (Object input, int index) 2138 { 2139 return getMatch (input, index, 0); 2140 } 2141 2142 /** 2143 * Returns the first match found in the input, beginning 2144 * the search at the specified index, and using the specified 2145 * execution flags. If no match is found, returns null. 2146 * 2147 * @param input The input text. 2148 * @param index The offset index at which the search should be begin. 2149 * @param eflags The logical OR of any execution flags above. 2150 * @return An REMatch instance referencing the match, or null if none. 2151 */ getMatch(Object input, int index, int eflags)2152 public REMatch getMatch (Object input, int index, int eflags) 2153 { 2154 return getMatch (input, index, eflags, null); 2155 } 2156 2157 /** 2158 * Returns the first match found in the input, beginning the search 2159 * at the specified index, and using the specified execution flags. 2160 * If no match is found, returns null. If a StringBuffer is 2161 * provided and is non-null, the contents of the input text from the 2162 * index to the beginning of the match (or to the end of the input, 2163 * if there is no match) are appended to the StringBuffer. 2164 * 2165 * @param input The input text. 2166 * @param index The offset index at which the search should be begin. 2167 * @param eflags The logical OR of any execution flags above. 2168 * @param buffer The StringBuffer to save pre-match text in. 2169 * @return An REMatch instance referencing the match, or null if none. */ getMatch(Object input, int index, int eflags, CPStringBuilder buffer)2170 public REMatch getMatch (Object input, int index, int eflags, 2171 CPStringBuilder buffer) 2172 { 2173 return getMatchImpl (makeCharIndexed (input, index), index, eflags, 2174 buffer); 2175 } 2176 getMatchImpl(CharIndexed input, int anchor, int eflags, CPStringBuilder buffer)2177 REMatch getMatchImpl (CharIndexed input, int anchor, int eflags, 2178 CPStringBuilder buffer) 2179 { 2180 boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0); 2181 boolean doMove = ((eflags & REG_FIX_STARTING_POSITION) == 0); 2182 RE re = (tryEntireMatch ? (RE) this.clone () : this); 2183 if (tryEntireMatch) 2184 { 2185 RETokenEnd reEnd = new RETokenEnd (0, null); 2186 reEnd.setFake (true); 2187 re.chain (reEnd); 2188 } 2189 // Create a new REMatch to hold results 2190 REMatch mymatch = new REMatch (numSubs, anchor, eflags); 2191 do 2192 { 2193 /* The following potimization is commented out because 2194 the matching should be tried even if the length of 2195 input is obviously too short in order that 2196 java.util.regex.Matcher#hitEnd() may work correctly. 2197 // Optimization: check if anchor + minimumLength > length 2198 if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) { 2199 */ 2200 if (re.match (input, mymatch)) 2201 { 2202 REMatch best = mymatch; 2203 // We assume that the match that coms first is the best. 2204 // And the following "The longer, the better" rule has 2205 // been commented out. The longest is not neccesarily 2206 // the best. For example, "a" out of "aaa" is the best 2207 // match for /a+?/. 2208 /* 2209 // Find best match of them all to observe leftmost longest 2210 while ((mymatch = mymatch.next) != null) { 2211 if (mymatch.index > best.index) { 2212 best = mymatch; 2213 } 2214 } 2215 */ 2216 best.end[0] = best.index; 2217 best.finish (input); 2218 input.setLastMatch (best); 2219 return best; 2220 } 2221 /* End of the optimization commented out 2222 } 2223 */ 2224 mymatch.clear (++anchor); 2225 // Append character to buffer if needed 2226 if (buffer != null && input.charAt (0) != CharIndexed.OUT_OF_BOUNDS) 2227 { 2228 buffer.append (input.charAt (0)); 2229 } 2230 // java.util.regex.Matcher#hitEnd() requires that the search should 2231 // be tried at the end of input, so we use move1(1) instead of move(1) 2232 } 2233 while (doMove && input.move1 (1)); 2234 2235 // Special handling at end of input for e.g. "$" 2236 if (minimumLength == 0) 2237 { 2238 if (match (input, mymatch)) 2239 { 2240 mymatch.finish (input); 2241 return mymatch; 2242 } 2243 } 2244 2245 return null; 2246 } 2247 2248 /** 2249 * Returns an REMatchEnumeration that can be used to iterate over the 2250 * matches found in the input text. 2251 * 2252 * @param input The input text. 2253 * @return A non-null REMatchEnumeration instance. 2254 */ getMatchEnumeration(Object input)2255 public REMatchEnumeration getMatchEnumeration (Object input) 2256 { 2257 return getMatchEnumeration (input, 0, 0); 2258 } 2259 2260 2261 /** 2262 * Returns an REMatchEnumeration that can be used to iterate over the 2263 * matches found in the input text. 2264 * 2265 * @param input The input text. 2266 * @param index The offset index at which the search should be begin. 2267 * @return A non-null REMatchEnumeration instance, with its input cursor 2268 * set to the index position specified. 2269 */ getMatchEnumeration(Object input, int index)2270 public REMatchEnumeration getMatchEnumeration (Object input, int index) 2271 { 2272 return getMatchEnumeration (input, index, 0); 2273 } 2274 2275 /** 2276 * Returns an REMatchEnumeration that can be used to iterate over the 2277 * matches found in the input text. 2278 * 2279 * @param input The input text. 2280 * @param index The offset index at which the search should be begin. 2281 * @param eflags The logical OR of any execution flags above. 2282 * @return A non-null REMatchEnumeration instance, with its input cursor 2283 * set to the index position specified. 2284 */ getMatchEnumeration(Object input, int index, int eflags)2285 public REMatchEnumeration getMatchEnumeration (Object input, int index, 2286 int eflags) 2287 { 2288 return new REMatchEnumeration (this, makeCharIndexed (input, index), 2289 index, eflags); 2290 } 2291 2292 2293 /** 2294 * Substitutes the replacement text for the first match found in the input. 2295 * 2296 * @param input The input text. 2297 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 2298 * @return A String interpolating the substituted text. 2299 * @see REMatch#substituteInto 2300 */ substitute(Object input, String replace)2301 public String substitute (Object input, String replace) 2302 { 2303 return substitute (input, replace, 0, 0); 2304 } 2305 2306 /** 2307 * Substitutes the replacement text for the first match found in the input 2308 * beginning at the specified index position. Specifying an index 2309 * effectively causes the regular expression engine to throw away the 2310 * specified number of characters. 2311 * 2312 * @param input The input text. 2313 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 2314 * @param index The offset index at which the search should be begin. 2315 * @return A String containing the substring of the input, starting 2316 * at the index position, and interpolating the substituted text. 2317 * @see REMatch#substituteInto 2318 */ substitute(Object input, String replace, int index)2319 public String substitute (Object input, String replace, int index) 2320 { 2321 return substitute (input, replace, index, 0); 2322 } 2323 2324 /** 2325 * Substitutes the replacement text for the first match found in the input 2326 * string, beginning at the specified index position and using the 2327 * specified execution flags. 2328 * 2329 * @param input The input text. 2330 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 2331 * @param index The offset index at which the search should be begin. 2332 * @param eflags The logical OR of any execution flags above. 2333 * @return A String containing the substring of the input, starting 2334 * at the index position, and interpolating the substituted text. 2335 * @see REMatch#substituteInto 2336 */ substitute(Object input, String replace, int index, int eflags)2337 public String substitute (Object input, String replace, int index, 2338 int eflags) 2339 { 2340 return substituteImpl (makeCharIndexed (input, index), replace, index, 2341 eflags); 2342 } 2343 substituteImpl(CharIndexed input, String replace, int index, int eflags)2344 private String substituteImpl (CharIndexed input, String replace, int index, 2345 int eflags) 2346 { 2347 CPStringBuilder buffer = new CPStringBuilder (); 2348 REMatch m = getMatchImpl (input, index, eflags, buffer); 2349 if (m == null) 2350 return buffer.toString (); 2351 buffer.append (getReplacement (replace, m, eflags)); 2352 if (input.move (m.end[0])) 2353 { 2354 do 2355 { 2356 buffer.append (input.charAt (0)); 2357 } 2358 while (input.move (1)); 2359 } 2360 return buffer.toString (); 2361 } 2362 2363 /** 2364 * Substitutes the replacement text for each non-overlapping match found 2365 * in the input text. 2366 * 2367 * @param input The input text. 2368 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 2369 * @return A String interpolating the substituted text. 2370 * @see REMatch#substituteInto 2371 */ substituteAll(Object input, String replace)2372 public String substituteAll (Object input, String replace) 2373 { 2374 return substituteAll (input, replace, 0, 0); 2375 } 2376 2377 /** 2378 * Substitutes the replacement text for each non-overlapping match found 2379 * in the input text, starting at the specified index. 2380 * 2381 * If the regular expression allows the empty string to match, it will 2382 * substitute matches at all positions except the end of the input. 2383 * 2384 * @param input The input text. 2385 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 2386 * @param index The offset index at which the search should be begin. 2387 * @return A String containing the substring of the input, starting 2388 * at the index position, and interpolating the substituted text. 2389 * @see REMatch#substituteInto 2390 */ substituteAll(Object input, String replace, int index)2391 public String substituteAll (Object input, String replace, int index) 2392 { 2393 return substituteAll (input, replace, index, 0); 2394 } 2395 2396 /** 2397 * Substitutes the replacement text for each non-overlapping match found 2398 * in the input text, starting at the specified index and using the 2399 * specified execution flags. 2400 * 2401 * @param input The input text. 2402 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). 2403 * @param index The offset index at which the search should be begin. 2404 * @param eflags The logical OR of any execution flags above. 2405 * @return A String containing the substring of the input, starting 2406 * at the index position, and interpolating the substituted text. 2407 * @see REMatch#substituteInto 2408 */ substituteAll(Object input, String replace, int index, int eflags)2409 public String substituteAll (Object input, String replace, int index, 2410 int eflags) 2411 { 2412 return substituteAllImpl (makeCharIndexed (input, index), replace, index, 2413 eflags); 2414 } 2415 substituteAllImpl(CharIndexed input, String replace, int index, int eflags)2416 private String substituteAllImpl (CharIndexed input, String replace, 2417 int index, int eflags) 2418 { 2419 CPStringBuilder buffer = new CPStringBuilder (); 2420 REMatch m; 2421 while ((m = getMatchImpl (input, index, eflags, buffer)) != null) 2422 { 2423 buffer.append (getReplacement (replace, m, eflags)); 2424 index = m.getEndIndex (); 2425 if (m.end[0] == 0) 2426 { 2427 char ch = input.charAt (0); 2428 if (ch != CharIndexed.OUT_OF_BOUNDS) 2429 buffer.append (ch); 2430 input.move (1); 2431 } 2432 else 2433 { 2434 input.move (m.end[0]); 2435 } 2436 2437 if (!input.isValid ()) 2438 break; 2439 } 2440 return buffer.toString (); 2441 } 2442 getReplacement(String replace, REMatch m, int eflags)2443 public static String getReplacement (String replace, REMatch m, int eflags) 2444 { 2445 if ((eflags & REG_NO_INTERPOLATE) > 0) 2446 return replace; 2447 else 2448 { 2449 if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) 2450 { 2451 CPStringBuilder sb = new CPStringBuilder (); 2452 int l = replace.length (); 2453 for (int i = 0; i < l; i++) 2454 { 2455 char c = replace.charAt (i); 2456 switch (c) 2457 { 2458 case '\\': 2459 i++; 2460 // Let StringIndexOutOfBoundsException be thrown. 2461 sb.append (replace.charAt (i)); 2462 break; 2463 case '$': 2464 int i1 = i + 1; 2465 while (i1 < replace.length () && 2466 Character.isDigit (replace.charAt (i1))) 2467 i1++; 2468 sb.append (m.substituteInto (replace.substring (i, i1))); 2469 i = i1 - 1; 2470 break; 2471 default: 2472 sb.append (c); 2473 } 2474 } 2475 return sb.toString (); 2476 } 2477 else 2478 return m.substituteInto (replace); 2479 } 2480 } 2481 2482 /* Helper function for constructor */ addToken(REToken next)2483 private void addToken (REToken next) 2484 { 2485 if (next == null) 2486 return; 2487 minimumLength += next.getMinimumLength (); 2488 int nmax = next.getMaximumLength (); 2489 if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE) 2490 maximumLength += nmax; 2491 else 2492 maximumLength = Integer.MAX_VALUE; 2493 2494 if (firstToken == null) 2495 { 2496 lastToken = firstToken = next; 2497 } 2498 else 2499 { 2500 // if chain returns false, it "rejected" the token due to 2501 // an optimization, and next was combined with lastToken 2502 if (lastToken.chain (next)) 2503 { 2504 lastToken = next; 2505 } 2506 } 2507 } 2508 setRepeated(REToken current, int min, int max, int index)2509 private static REToken setRepeated (REToken current, int min, int max, 2510 int index) throws REException 2511 { 2512 if (current == null) 2513 throw new REException (getLocalizedMessage ("repeat.no.token"), 2514 REException.REG_BADRPT, index); 2515 return new RETokenRepeated (current.subIndex, current, min, max); 2516 } 2517 getPosixSet(char[]pattern, int index, CPStringBuilder buf)2518 private static int getPosixSet (char[]pattern, int index, 2519 CPStringBuilder buf) 2520 { 2521 // Precondition: pattern[index-1] == ':' 2522 // we will return pos of closing ']'. 2523 int i; 2524 for (i = index; i < (pattern.length - 1); i++) 2525 { 2526 if ((pattern[i] == ':') && (pattern[i + 1] == ']')) 2527 return i + 2; 2528 buf.append (pattern[i]); 2529 } 2530 return index; // didn't match up 2531 } 2532 getMinMax(char[]input, int index, IntPair minMax, RESyntax syntax)2533 private int getMinMax (char[]input, int index, IntPair minMax, 2534 RESyntax syntax) throws REException 2535 { 2536 // Precondition: input[index-1] == '{', minMax != null 2537 2538 boolean mustMatch = !syntax.get (RESyntax.RE_NO_BK_BRACES); 2539 int startIndex = index; 2540 if (index == input.length) 2541 { 2542 if (mustMatch) 2543 throw new REException (getLocalizedMessage ("unmatched.brace"), 2544 REException.REG_EBRACE, index); 2545 else 2546 return startIndex; 2547 } 2548 2549 int min, max = 0; 2550 CharUnit unit = new CharUnit (); 2551 CPStringBuilder buf = new CPStringBuilder (); 2552 2553 // Read string of digits 2554 do 2555 { 2556 index = getCharUnit (input, index, unit, false); 2557 if (Character.isDigit (unit.ch)) 2558 buf.append (unit.ch); 2559 } 2560 while ((index != input.length) && Character.isDigit (unit.ch)); 2561 2562 // Check for {} tomfoolery 2563 if (buf.length () == 0) 2564 { 2565 if (mustMatch) 2566 throw new REException (getLocalizedMessage ("interval.error"), 2567 REException.REG_EBRACE, index); 2568 else 2569 return startIndex; 2570 } 2571 2572 min = Integer.parseInt (buf.toString ()); 2573 2574 if ((unit.ch == '}') && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) 2575 max = min; 2576 else if (index == input.length) 2577 if (mustMatch) 2578 throw new REException (getLocalizedMessage ("interval.no.end"), 2579 REException.REG_EBRACE, index); 2580 else 2581 return startIndex; 2582 else 2583 if ((unit.ch == ',') && !unit.bk) 2584 { 2585 buf = new CPStringBuilder (); 2586 // Read string of digits 2587 while (((index = 2588 getCharUnit (input, index, unit, false)) != input.length) 2589 && Character.isDigit (unit.ch)) 2590 buf.append (unit.ch); 2591 2592 if (! 2593 ((unit.ch == '}') 2594 && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk))) 2595 if (mustMatch) 2596 throw new REException (getLocalizedMessage ("interval.error"), 2597 REException.REG_EBRACE, index); 2598 else 2599 return startIndex; 2600 2601 // This is the case of {x,} 2602 if (buf.length () == 0) 2603 max = Integer.MAX_VALUE; 2604 else 2605 max = Integer.parseInt (buf.toString ()); 2606 } 2607 else if (mustMatch) 2608 throw new REException (getLocalizedMessage ("interval.error"), 2609 REException.REG_EBRACE, index); 2610 else 2611 return startIndex; 2612 2613 // We know min and max now, and they are valid. 2614 2615 minMax.first = min; 2616 minMax.second = max; 2617 2618 // return the index following the '}' 2619 return index; 2620 } 2621 2622 /** 2623 * Return a human readable form of the compiled regular expression, 2624 * useful for debugging. 2625 */ toString()2626 public String toString () 2627 { 2628 CPStringBuilder sb = new CPStringBuilder (); 2629 dump (sb); 2630 return sb.toString (); 2631 } 2632 dump(CPStringBuilder os)2633 void dump (CPStringBuilder os) 2634 { 2635 os.append ("(?#startRE subIndex=" + subIndex + ")"); 2636 if (subIndex == 0) 2637 os.append ("?:"); 2638 if (firstToken != null) 2639 firstToken.dumpAll (os); 2640 if (subIndex == 0) 2641 os.append (")"); 2642 os.append ("(?#endRE subIndex=" + subIndex + ")"); 2643 } 2644 2645 // Cast input appropriately or throw exception 2646 // This method was originally a private method, but has been made 2647 // public because java.util.regex.Matcher uses this. makeCharIndexed(Object input, int index)2648 public static CharIndexed makeCharIndexed (Object input, int index) 2649 { 2650 // The case where input is already a CharIndexed is supposed 2651 // be the most likely because this is the case with 2652 // java.util.regex.Matcher. 2653 // We could let a String or a CharSequence fall through 2654 // to final input, but since it'a very likely input type, 2655 // we check it first. 2656 if (input instanceof CharIndexed) 2657 { 2658 CharIndexed ci = (CharIndexed) input; 2659 ci.setAnchor (index); 2660 return ci; 2661 } 2662 else if (input instanceof CharSequence) 2663 return new CharIndexedCharSequence ((CharSequence) input, index); 2664 else if (input instanceof String) 2665 return new CharIndexedString ((String) input, index); 2666 else if (input instanceof char[]) 2667 return new CharIndexedCharArray ((char[]) input, index); 2668 else if (input instanceof StringBuffer) 2669 return new CharIndexedStringBuffer ((StringBuffer) input, index); 2670 else if (input instanceof InputStream) 2671 return new CharIndexedInputStream ((InputStream) input, index); 2672 else 2673 return new CharIndexedString (input.toString (), index); 2674 } 2675 } 2676