1 /* gnu/regexp/RE.java
2    Copyright (C) 1998-2001, 2004, 2005 Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 package gnu.regexp;
39 import java.io.InputStream;
40 import java.io.Serializable;
41 import java.util.Locale;
42 import java.util.PropertyResourceBundle;
43 import java.util.ResourceBundle;
44 import java.util.Vector;
45 
46 /**
47  * RE provides the user interface for compiling and matching regular
48  * expressions.
49  * <P>
50  * A regular expression object (class RE) is compiled by constructing it
51  * from a String, StringBuffer or character array, with optional
52  * compilation flags (below)
53  * and an optional syntax specification (see RESyntax; if not specified,
54  * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
55  * <P>
56  * Once compiled, a regular expression object is reusable as well as
57  * threadsafe: multiple threads can use the RE instance simultaneously
58  * to match against different input text.
59  * <P>
60  * Various methods attempt to match input text against a compiled
61  * regular expression.  These methods are:
62  * <LI><code>isMatch</code>: returns true if the input text in its
63  * entirety matches the regular expression pattern.
64  * <LI><code>getMatch</code>: returns the first match found in the
65  * input text, or null if no match is found.
66  * <LI><code>getAllMatches</code>: returns an array of all
67  * non-overlapping matches found in the input text.  If no matches are
68  * found, the array is zero-length.
69  * <LI><code>substitute</code>: substitute the first occurence of the
70  * pattern in the input text with a replacement string (which may
71  * include metacharacters $0-$9, see REMatch.substituteInto).
72  * <LI><code>substituteAll</code>: same as above, but repeat for each
73  * match before returning.
74  * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
75  * object that allows iteration over the matches (see
76  * REMatchEnumeration for some reasons why you may want to do this
77  * instead of using <code>getAllMatches</code>.
78  * <P>
79  *
80  * These methods all have similar argument lists.  The input can be a
81  * String, a character array, a StringBuffer, or an
82  * InputStream of some sort.  Note that when using an
83  * InputStream, the stream read position cannot be guaranteed after
84  * attempting a match (this is not a bug, but a consequence of the way
85  * regular expressions work).  Using an REMatchEnumeration can
86  * eliminate most positioning problems.
87  *
88  * <P>
89  *
90  * The optional index argument specifies the offset from the beginning
91  * of the text at which the search should start (see the descriptions
92  * of some of the execution flags for how this can affect positional
93  * pattern operators).  For an InputStream, this means an
94  * offset from the current read position, so subsequent calls with the
95  * same index argument on an InputStream will not
96  * necessarily access the same position on the stream, whereas
97  * repeated searches at a given index in a fixed string will return
98  * consistent results.
99  *
100  * <P>
101  * You can optionally affect the execution environment by using a
102  * combination of execution flags (constants listed below).
103  *
104  * <P>
105  * All operations on a regular expression are performed in a
106  * thread-safe manner.
107  *
108  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
109  * @version 1.1.5-dev, to be released
110  */
111 
112 public class RE extends REToken {
113 
114   private static final class IntPair implements Serializable {
115     public int first, second;
116   }
117 
118   private static final class CharUnit implements Serializable {
119     public char ch;
120     public boolean bk;
121   }
122 
123   // This String will be returned by getVersion()
124   private static final String VERSION = "1.1.5-dev";
125 
126   // The localized strings are kept in a separate file
127   private static ResourceBundle messages = PropertyResourceBundle.getBundle("gnu/regexp/MessagesBundle", Locale.getDefault());
128 
129   // These are, respectively, the first and last tokens in our linked list
130   // If there is only one token, firstToken == lastToken
131   private REToken firstToken, lastToken;
132 
133   // This is the number of subexpressions in this regular expression,
134   // with a minimum value of zero.  Returned by getNumSubs()
135   private int numSubs;
136 
137     /** Minimum length, in characters, of any possible match. */
138     private int minimumLength;
139 
140   /**
141    * Compilation flag. Do  not  differentiate  case.   Subsequent
142    * searches  using  this  RE will be case insensitive.
143    */
144   public static final int REG_ICASE = 2;
145 
146   /**
147    * Compilation flag. The match-any-character operator (dot)
148    * will match a newline character.  When set this overrides the syntax
149    * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
150    * the "/s" operator in Perl.
151    */
152   public static final int REG_DOT_NEWLINE = 4;
153 
154   /**
155    * Compilation flag. Use multiline mode.  In this mode, the ^ and $
156    * anchors will match based on newlines within the input. This is
157    * equivalent to the "/m" operator in Perl.
158    */
159   public static final int REG_MULTILINE = 8;
160 
161   /**
162    * Execution flag.
163    * The match-beginning operator (^) will not match at the beginning
164    * of the input string. Useful for matching on a substring when you
165    * know the context of the input is such that position zero of the
166    * input to the match test is not actually position zero of the text.
167    * <P>
168    * This example demonstrates the results of various ways of matching on
169    * a substring.
170    * <P>
171    * <CODE>
172    * String s = "food bar fool";<BR>
173    * RE exp = new RE("^foo.");<BR>
174    * REMatch m0 = exp.getMatch(s);<BR>
175    * REMatch m1 = exp.getMatch(s.substring(8));<BR>
176    * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
177    * REMatch m3 = exp.getMatch(s,8);                            <BR>
178    * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
179    * <P>
180    * // Results:<BR>
181    * //  m0.toString(): "food"<BR>
182    * //  m1.toString(): "fool"<BR>
183    * //  m2.toString(): null<BR>
184    * //  m3.toString(): null<BR>
185    * //  m4.toString(): "fool"<BR>
186    * </CODE>
187    */
188   public static final int REG_NOTBOL = 16;
189 
190   /**
191    * Execution flag.
192    * The match-end operator ($) does not match at the end
193    * of the input string. Useful for matching on substrings.
194    */
195   public static final int REG_NOTEOL = 32;
196 
197   /**
198    * Execution flag.
199    * When a match method is invoked that starts matching at a non-zero
200    * index into the input, treat the input as if it begins at the index
201    * given.  The effect of this flag is that the engine does not "see"
202    * any text in the input before the given index.  This is useful so
203    * that the match-beginning operator (^) matches not at position 0
204    * in the input string, but at the position the search started at
205    * (based on the index input given to the getMatch function).  See
206    * the example under REG_NOTBOL.  It also affects the use of the \&lt;
207    * and \b operators.
208    */
209   public static final int REG_ANCHORINDEX = 64;
210 
211   /**
212    * Execution flag.
213    * The substitute and substituteAll methods will not attempt to
214    * interpolate occurrences of $1-$9 in the replacement text with
215    * the corresponding subexpressions.  For example, you may want to
216    * replace all matches of "one dollar" with "$1".
217    */
218   public static final int REG_NO_INTERPOLATE = 128;
219 
220   /** Returns a string representing the version of the gnu.regexp package. */
version()221   public static final String version() {
222     return VERSION;
223   }
224 
225   // Retrieves a message from the ResourceBundle
getLocalizedMessage(String key)226   static final String getLocalizedMessage(String key) {
227     return messages.getString(key);
228   }
229 
230   /**
231    * Constructs a regular expression pattern buffer without any compilation
232    * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
233    *
234    * @param pattern A regular expression pattern, in the form of a String,
235    *   StringBuffer or char[].  Other input types will be converted to
236    *   strings using the toString() method.
237    * @exception REException The input pattern could not be parsed.
238    * @exception NullPointerException The pattern was null.
239    */
RE(Object pattern)240   public RE(Object pattern) throws REException {
241     this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0);
242   }
243 
244   /**
245    * Constructs a regular expression pattern buffer using the specified
246    * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
247    *
248    * @param pattern A regular expression pattern, in the form of a String,
249    *   StringBuffer, or char[].  Other input types will be converted to
250    *   strings using the toString() method.
251    * @param cflags The logical OR of any combination of the compilation flags listed above.
252    * @exception REException The input pattern could not be parsed.
253    * @exception NullPointerException The pattern was null.
254    */
RE(Object pattern, int cflags)255   public RE(Object pattern, int cflags) throws REException {
256     this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0);
257   }
258 
259   /**
260    * Constructs a regular expression pattern buffer using the specified
261    * compilation flags and regular expression syntax.
262    *
263    * @param pattern A regular expression pattern, in the form of a String,
264    *   StringBuffer, or char[].  Other input types will be converted to
265    *   strings using the toString() method.
266    * @param cflags The logical OR of any combination of the compilation flags listed above.
267    * @param syntax The type of regular expression syntax to use.
268    * @exception REException The input pattern could not be parsed.
269    * @exception NullPointerException The pattern was null.
270    */
RE(Object pattern, int cflags, RESyntax syntax)271   public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
272     this(pattern,cflags,syntax,0,0);
273   }
274 
275   // internal constructor used for alternation
RE(REToken first, REToken last,int subs, int subIndex, int minLength)276   private RE(REToken first, REToken last,int subs, int subIndex, int minLength) {
277     super(subIndex);
278     firstToken = first;
279     lastToken = last;
280     numSubs = subs;
281     minimumLength = minLength;
282     addToken(new RETokenEndSub(subIndex));
283   }
284 
RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub)285   private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
286     super(myIndex); // Subexpression index of this token.
287     initialize(patternObj, cflags, syntax, myIndex, nextSub);
288   }
289 
290     // For use by subclasses
RE()291     protected RE() { super(0); }
292 
293     // The meat of construction
initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub)294   protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
295       char[] pattern;
296     if (patternObj instanceof String) {
297       pattern = ((String) patternObj).toCharArray();
298     } else if (patternObj instanceof char[]) {
299       pattern = (char[]) patternObj;
300     } else if (patternObj instanceof StringBuffer) {
301       pattern = new char [((StringBuffer) patternObj).length()];
302       ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
303     } else {
304 	pattern = patternObj.toString().toCharArray();
305     }
306 
307     int pLength = pattern.length;
308 
309     numSubs = 0; // Number of subexpressions in this token.
310     Vector branches = null;
311 
312     // linked list of tokens (sort of -- some closed loops can exist)
313     firstToken = lastToken = null;
314 
315     // Precalculate these so we don't pay for the math every time we
316     // need to access them.
317     boolean insens = ((cflags & REG_ICASE) > 0);
318 
319     // Parse pattern into tokens.  Does anyone know if it's more efficient
320     // to use char[] than a String.charAt()?  I'm assuming so.
321 
322     // index tracks the position in the char array
323     int index = 0;
324 
325     // this will be the current parse character (pattern[index])
326     CharUnit unit = new CharUnit();
327 
328     // This is used for {x,y} calculations
329     IntPair minMax = new IntPair();
330 
331     // Buffer a token so we can create a TokenRepeated, etc.
332     REToken currentToken = null;
333     char ch;
334     boolean quot = false;
335 
336     while (index < pLength) {
337       // read the next character unit (including backslash escapes)
338       index = getCharUnit(pattern,index,unit,quot);
339 
340       if (unit.bk)
341         if (unit.ch == 'Q') {
342           quot = true;
343           continue;
344         } else if (unit.ch == 'E') {
345           quot = false;
346           continue;
347         }
348       if (quot)
349       	unit.bk = false;
350 
351       // ALTERNATION OPERATOR
352       //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
353       //  not available if RE_LIMITED_OPS is set
354 
355       // TODO: the '\n' literal here should be a test against REToken.newline,
356       // which unfortunately may be more than a single character.
357       if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
358 	     || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !(unit.bk || quot)) )
359 	   && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
360 	// make everything up to here be a branch. create vector if nec.
361 	addToken(currentToken);
362 	RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength);
363 	minimumLength = 0;
364 	if (branches == null) {
365 	    branches = new Vector();
366 	}
367 	branches.addElement(theBranch);
368 	firstToken = lastToken = currentToken = null;
369       }
370 
371       // INTERVAL OPERATOR:
372       //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
373       //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
374       //
375       // OPEN QUESTION:
376       //  what is proper interpretation of '{' at start of string?
377 
378       else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot))) {
379 	int newIndex = getMinMax(pattern,index,minMax,syntax);
380         if (newIndex > index) {
381           if (minMax.first > minMax.second)
382             throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex);
383           if (currentToken == null)
384             throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex);
385           if (currentToken instanceof RETokenRepeated)
386             throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex);
387           if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
388             throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex);
389           if ((currentToken.getMinimumLength() == 0) && (minMax.second == Integer.MAX_VALUE))
390             throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,newIndex);
391           index = newIndex;
392           currentToken = setRepeated(currentToken,minMax.first,minMax.second,index);
393         }
394         else {
395           addToken(currentToken);
396           currentToken = new RETokenChar(subIndex,unit.ch,insens);
397         }
398       }
399 
400       // LIST OPERATOR:
401       //  [...] | [^...]
402 
403       else if ((unit.ch == '[') && !(unit.bk || quot)) {
404 	Vector options = new Vector();
405 	boolean negative = false;
406 	char lastChar = 0;
407 	if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
408 
409 	// Check for initial caret, negation
410 	if ((ch = pattern[index]) == '^') {
411 	  negative = true;
412 	  if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
413 	  ch = pattern[index];
414 	}
415 
416 	// Check for leading right bracket literal
417 	if (ch == ']') {
418 	  lastChar = ch;
419 	  if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
420 	}
421 
422 	while ((ch = pattern[index++]) != ']') {
423 	  if ((ch == '-') && (lastChar != 0)) {
424 	    if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
425 	    if ((ch = pattern[index]) == ']') {
426 	      options.addElement(new RETokenChar(subIndex,lastChar,insens));
427 	      lastChar = '-';
428 	    } else {
429 	      options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
430 	      lastChar = 0;
431 	      index++;
432 	    }
433           } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
434             if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
435 	    int posixID = -1;
436 	    boolean negate = false;
437             char asciiEsc = 0;
438 	    if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
439 	      switch (pattern[index]) {
440 	      case 'D':
441 		negate = true;
442 	      case 'd':
443 		posixID = RETokenPOSIX.DIGIT;
444 		break;
445 	      case 'S':
446 		negate = true;
447 	      case 's':
448 		posixID = RETokenPOSIX.SPACE;
449 		break;
450 	      case 'W':
451 		negate = true;
452 	      case 'w':
453 		posixID = RETokenPOSIX.ALNUM;
454 		break;
455 	      }
456 	    }
457             else if ("nrt".indexOf(pattern[index]) != -1) {
458               switch (pattern[index]) {
459                 case 'n':
460                   asciiEsc = '\n';
461                   break;
462                 case 't':
463                   asciiEsc = '\t';
464                   break;
465                 case 'r':
466                   asciiEsc = '\r';
467                   break;
468               }
469             }
470 	    if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
471 
472 	    if (posixID != -1) {
473 	      options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
474 	    } else if (asciiEsc != 0) {
475 	      lastChar = asciiEsc;
476 	    } else {
477 	      lastChar = pattern[index];
478 	    }
479 	    ++index;
480 	  } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
481 	    StringBuffer posixSet = new StringBuffer();
482 	    index = getPosixSet(pattern,index+1,posixSet);
483 	    int posixId = RETokenPOSIX.intValue(posixSet.toString());
484 	    if (posixId != -1)
485 	      options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
486 	  } else {
487 	    if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
488 	    lastChar = ch;
489 	  }
490 	  if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
491 	} // while in list
492 	// Out of list, index is one past ']'
493 
494 	if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
495 
496 	// Create a new RETokenOneOf
497 	addToken(currentToken);
498 	options.trimToSize();
499 	currentToken = new RETokenOneOf(subIndex,options,negative);
500       }
501 
502       // SUBEXPRESSIONS
503       //  (...) | \(...\) depending on RE_NO_BK_PARENS
504 
505       else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) {
506 	boolean pure = false;
507 	boolean comment = false;
508         boolean lookAhead = false;
509         boolean negativelh = false;
510 	if ((index+1 < pLength) && (pattern[index] == '?')) {
511 	  switch (pattern[index+1]) {
512           case '!':
513             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
514               pure = true;
515               negativelh = true;
516               lookAhead = true;
517               index += 2;
518             }
519             break;
520           case '=':
521             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
522               pure = true;
523               lookAhead = true;
524               index += 2;
525             }
526             break;
527 	  case ':':
528 	    if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
529 	      pure = true;
530 	      index += 2;
531 	    }
532 	    break;
533 	  case '#':
534 	    if (syntax.get(RESyntax.RE_COMMENTS)) {
535 	      comment = true;
536 	    }
537 	    break;
538           default:
539             throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
540 	  }
541 	}
542 
543 	if (index >= pLength) {
544 	    throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
545 	}
546 
547 	// find end of subexpression
548 	int endIndex = index;
549 	int nextIndex = index;
550 	int nested = 0;
551 
552 	while ( ((nextIndex = getCharUnit(pattern,endIndex,unit,false)) > 0)
553 		&& !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) )
554 	  if ((endIndex = nextIndex) >= pLength)
555 	    throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
556 	  else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
557 	    nested++;
558 	  else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
559 	    nested--;
560 
561 	// endIndex is now position at a ')','\)'
562 	// nextIndex is end of string or position after ')' or '\)'
563 
564 	if (comment) index = nextIndex;
565 	else { // not a comment
566 	  // create RE subexpression as token.
567 	  addToken(currentToken);
568 	  if (!pure) {
569 	    numSubs++;
570 	  }
571 
572 	  int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs;
573 	  currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
574 	  numSubs += ((RE) currentToken).getNumSubs();
575 
576           if (lookAhead) {
577 	      currentToken = new RETokenLookAhead(currentToken,negativelh);
578 	  }
579 
580 	  index = nextIndex;
581 	} // not a comment
582       } // subexpression
583 
584       // UNMATCHED RIGHT PAREN
585       // ) or \) throw exception if
586       // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
587       else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))) {
588 	throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index);
589       }
590 
591       // START OF LINE OPERATOR
592       //  ^
593 
594       else if ((unit.ch == '^') && !(unit.bk || quot)) {
595 	addToken(currentToken);
596 	currentToken = null;
597 	addToken(new RETokenStart(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
598       }
599 
600       // END OF LINE OPERATOR
601       //  $
602 
603       else if ((unit.ch == '$') && !(unit.bk || quot)) {
604 	addToken(currentToken);
605 	currentToken = null;
606 	addToken(new RETokenEnd(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
607       }
608 
609       // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
610       //  .
611 
612       else if ((unit.ch == '.') && !(unit.bk || quot)) {
613 	addToken(currentToken);
614 	currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
615       }
616 
617       // ZERO-OR-MORE REPEAT OPERATOR
618       //  *
619 
620       else if ((unit.ch == '*') && !(unit.bk || quot)) {
621 	if (currentToken == null)
622           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
623 	if (currentToken instanceof RETokenRepeated)
624           throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
625 	if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
626 	  throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
627 	if (currentToken.getMinimumLength() == 0)
628 	  throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
629 	currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
630       }
631 
632       // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
633       //  + | \+ depending on RE_BK_PLUS_QM
634       //  not available if RE_LIMITED_OPS is set
635 
636       else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
637 	if (currentToken == null)
638           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
639 
640 	// Check for possessive matching on RETokenRepeated
641 	if (currentToken instanceof RETokenRepeated) {
642 	  RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
643 	  if (syntax.get(RESyntax.RE_POSSESSIVE_OPS) && !tokenRep.isPossessive() && !tokenRep.isStingy())
644 	    tokenRep.makePossessive();
645 	  else
646 	    throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
647 
648 	}
649 	else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
650 	  throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
651 	else if (currentToken.getMinimumLength() == 0)
652 	  throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
653 	else
654 	  currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
655       }
656 
657       // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
658       //  ? | \? depending on RE_BK_PLUS_QM
659       //  not available if RE_LIMITED_OPS is set
660       //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
661 
662       else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
663 	if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
664 
665 	// Check for stingy matching on RETokenRepeated
666 	if (currentToken instanceof RETokenRepeated) {
667 	  RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
668 	  if (syntax.get(RESyntax.RE_STINGY_OPS) && !tokenRep.isStingy() && !tokenRep.isPossessive())
669 	    tokenRep.makeStingy();
670 	  else
671 	    throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
672 	}
673 	else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
674 	  throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
675 	else
676 	  currentToken = setRepeated(currentToken,0,1,index);
677       }
678 
679       // BACKREFERENCE OPERATOR
680       //  \1 \2 ... \9
681       // not available if RE_NO_BK_REFS is set
682 
683       else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
684 	addToken(currentToken);
685 	currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens);
686       }
687 
688       // START OF STRING OPERATOR
689       //  \A if RE_STRING_ANCHORS is set
690 
691       else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
692 	addToken(currentToken);
693 	currentToken = new RETokenStart(subIndex,null);
694       }
695 
696       // WORD BREAK OPERATOR
697       //  \b if ????
698 
699       else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
700 	  addToken(currentToken);
701 	  currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false);
702       }
703 
704       // WORD BEGIN OPERATOR
705       //  \< if ????
706       else if (unit.bk && (unit.ch == '<')) {
707 	  addToken(currentToken);
708 	  currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false);
709       }
710 
711       // WORD END OPERATOR
712       //  \> if ????
713       else if (unit.bk && (unit.ch == '>')) {
714 	  addToken(currentToken);
715 	  currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false);
716       }
717 
718       // NON-WORD BREAK OPERATOR
719       // \B if ????
720 
721       else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
722 	  addToken(currentToken);
723 	  currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true);
724       }
725 
726 
727       // DIGIT OPERATOR
728       //  \d if RE_CHAR_CLASS_ESCAPES is set
729 
730       else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
731 	addToken(currentToken);
732 	currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
733       }
734 
735       // NON-DIGIT OPERATOR
736       //  \D
737 
738 	else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
739 	  addToken(currentToken);
740 	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
741 	}
742 
743 	// NEWLINE ESCAPE
744         //  \n
745 
746 	else if (unit.bk && (unit.ch == 'n')) {
747 	  addToken(currentToken);
748 	  currentToken = new RETokenChar(subIndex,'\n',false);
749 	}
750 
751 	// RETURN ESCAPE
752         //  \r
753 
754 	else if (unit.bk && (unit.ch == 'r')) {
755 	  addToken(currentToken);
756 	  currentToken = new RETokenChar(subIndex,'\r',false);
757 	}
758 
759 	// WHITESPACE OPERATOR
760         //  \s if RE_CHAR_CLASS_ESCAPES is set
761 
762 	else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
763 	  addToken(currentToken);
764 	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
765 	}
766 
767 	// NON-WHITESPACE OPERATOR
768         //  \S
769 
770 	else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
771 	  addToken(currentToken);
772 	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
773 	}
774 
775 	// TAB ESCAPE
776         //  \t
777 
778 	else if (unit.bk && (unit.ch == 't')) {
779 	  addToken(currentToken);
780 	  currentToken = new RETokenChar(subIndex,'\t',false);
781 	}
782 
783 	// ALPHANUMERIC OPERATOR
784         //  \w
785 
786 	else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
787 	  addToken(currentToken);
788 	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
789 	}
790 
791 	// NON-ALPHANUMERIC OPERATOR
792         //  \W
793 
794 	else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
795 	  addToken(currentToken);
796 	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
797 	}
798 
799 	// END OF STRING OPERATOR
800         //  \Z
801 
802 	else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
803 	  addToken(currentToken);
804 	  currentToken = new RETokenEnd(subIndex,null);
805 	}
806 
807 	// NON-SPECIAL CHARACTER (or escape to make literal)
808         //  c | \* for example
809 
810 	else {  // not a special character
811 	  addToken(currentToken);
812 	  currentToken = new RETokenChar(subIndex,unit.ch,insens);
813 	}
814       } // end while
815 
816     // Add final buffered token and an EndSub marker
817     addToken(currentToken);
818 
819     if (branches != null) {
820 	branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength));
821 	branches.trimToSize(); // compact the Vector
822 	minimumLength = 0;
823 	firstToken = lastToken = null;
824 	addToken(new RETokenOneOf(subIndex,branches,false));
825     }
826     else addToken(new RETokenEndSub(subIndex));
827 
828   }
829 
getCharUnit(char[] input, int index, CharUnit unit, boolean quot)830   private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException {
831     unit.ch = input[index++];
832     unit.bk = (unit.ch == '\\'
833 	       && (!quot || index >= input.length || input[index] == 'E'));
834     if (unit.bk)
835       if (index < input.length)
836 	unit.ch = input[index++];
837       else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index);
838     return index;
839   }
840 
841   /**
842    * Checks if the regular expression matches the input in its entirety.
843    *
844    * @param input The input text.
845    */
isMatch(Object input)846   public boolean isMatch(Object input) {
847     return isMatch(input,0,0);
848   }
849 
850   /**
851    * Checks if the input string, starting from index, is an exact match of
852    * this regular expression.
853    *
854    * @param input The input text.
855    * @param index The offset index at which the search should be begin.
856    */
isMatch(Object input,int index)857   public boolean isMatch(Object input,int index) {
858     return isMatch(input,index,0);
859   }
860 
861 
862   /**
863    * Checks if the input, starting from index and using the specified
864    * execution flags, is an exact match of this regular expression.
865    *
866    * @param input The input text.
867    * @param index The offset index at which the search should be begin.
868    * @param eflags The logical OR of any execution flags above.
869    */
isMatch(Object input,int index,int eflags)870   public boolean isMatch(Object input,int index,int eflags) {
871     return isMatchImpl(makeCharIndexed(input,index),index,eflags);
872   }
873 
isMatchImpl(CharIndexed input, int index, int eflags)874   private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
875     if (firstToken == null)  // Trivial case
876       return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
877     REMatch m = new REMatch(numSubs, index, eflags);
878     if (firstToken.match(input, m)) {
879 	while (m != null) {
880 	    if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
881 		return true;
882 	    }
883 	    m = m.next;
884 	}
885     }
886     return false;
887   }
888 
889   /**
890    * Returns the maximum number of subexpressions in this regular expression.
891    * If the expression contains branches, the value returned will be the
892    * maximum subexpressions in any of the branches.
893    */
getNumSubs()894   public int getNumSubs() {
895     return numSubs;
896   }
897 
898   // Overrides REToken.setUncle
setUncle(REToken uncle)899   void setUncle(REToken uncle) {
900       if (lastToken != null) {
901 	  lastToken.setUncle(uncle);
902       } else super.setUncle(uncle); // to deal with empty subexpressions
903   }
904 
905   // Overrides REToken.chain
906 
chain(REToken next)907   boolean chain(REToken next) {
908     super.chain(next);
909     setUncle(next);
910     return true;
911   }
912 
913   /**
914    * Returns the minimum number of characters that could possibly
915    * constitute a match of this regular expression.
916    */
getMinimumLength()917   public int getMinimumLength() {
918       return minimumLength;
919   }
920 
921   /**
922    * Returns an array of all matches found in the input.
923    *
924    * If the regular expression allows the empty string to match, it will
925    * substitute matches at all positions except the end of the input.
926    *
927    * @param input The input text.
928    * @return a non-null (but possibly zero-length) array of matches
929    */
getAllMatches(Object input)930   public REMatch[] getAllMatches(Object input) {
931     return getAllMatches(input,0,0);
932   }
933 
934   /**
935    * Returns an array of all matches found in the input,
936    * beginning at the specified index position.
937    *
938    * If the regular expression allows the empty string to match, it will
939    * substitute matches at all positions except the end of the input.
940    *
941    * @param input The input text.
942    * @param index The offset index at which the search should be begin.
943    * @return a non-null (but possibly zero-length) array of matches
944    */
getAllMatches(Object input, int index)945   public REMatch[] getAllMatches(Object input, int index) {
946     return getAllMatches(input,index,0);
947   }
948 
949   /**
950    * Returns an array of all matches found in the input string,
951    * beginning at the specified index position and using the specified
952    * execution flags.
953    *
954    * If the regular expression allows the empty string to match, it will
955    * substitute matches at all positions except the end of the input.
956    *
957    * @param input The input text.
958    * @param index The offset index at which the search should be begin.
959    * @param eflags The logical OR of any execution flags above.
960    * @return a non-null (but possibly zero-length) array of matches
961    */
getAllMatches(Object input, int index, int eflags)962   public REMatch[] getAllMatches(Object input, int index, int eflags) {
963     return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags);
964   }
965 
966   // this has been changed since 1.03 to be non-overlapping matches
getAllMatchesImpl(CharIndexed input, int index, int eflags)967   private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) {
968     Vector all = new Vector();
969     REMatch m = null;
970     while ((m = getMatchImpl(input,index,eflags,null)) != null) {
971       all.addElement(m);
972       index = m.getEndIndex();
973       if (m.end[0] == 0) {   // handle pathological case of zero-length match
974 	index++;
975 	input.move(1);
976       } else {
977 	input.move(m.end[0]);
978       }
979       if (!input.isValid()) break;
980     }
981     REMatch[] mset = new REMatch[all.size()];
982     all.copyInto(mset);
983     return mset;
984   }
985 
986     /* Implements abstract method REToken.match() */
match(CharIndexed input, REMatch mymatch)987     boolean match(CharIndexed input, REMatch mymatch) {
988 	if (firstToken == null) return next(input, mymatch);
989 
990 	// Note the start of this subexpression
991 	mymatch.start[subIndex] = mymatch.index;
992 
993 	return firstToken.match(input, mymatch);
994     }
995 
996   /**
997    * Returns the first match found in the input.  If no match is found,
998    * null is returned.
999    *
1000    * @param input The input text.
1001    * @return An REMatch instance referencing the match, or null if none.
1002    */
getMatch(Object input)1003   public REMatch getMatch(Object input) {
1004     return getMatch(input,0,0);
1005   }
1006 
1007   /**
1008    * Returns the first match found in the input, beginning
1009    * the search at the specified index.  If no match is found,
1010    * returns null.
1011    *
1012    * @param input The input text.
1013    * @param index The offset within the text to begin looking for a match.
1014    * @return An REMatch instance referencing the match, or null if none.
1015    */
getMatch(Object input, int index)1016   public REMatch getMatch(Object input, int index) {
1017     return getMatch(input,index,0);
1018   }
1019 
1020   /**
1021    * Returns the first match found in the input, beginning
1022    * the search at the specified index, and using the specified
1023    * execution flags.  If no match is found, returns null.
1024    *
1025    * @param input The input text.
1026    * @param index The offset index at which the search should be begin.
1027    * @param eflags The logical OR of any execution flags above.
1028    * @return An REMatch instance referencing the match, or null if none.
1029    */
getMatch(Object input, int index, int eflags)1030   public REMatch getMatch(Object input, int index, int eflags) {
1031     return getMatch(input,index,eflags,null);
1032   }
1033 
1034   /**
1035    * Returns the first match found in the input, beginning the search
1036    * at the specified index, and using the specified execution flags.
1037    * If no match is found, returns null.  If a StringBuffer is
1038    * provided and is non-null, the contents of the input text from the
1039    * index to the beginning of the match (or to the end of the input,
1040    * if there is no match) are appended to the StringBuffer.
1041    *
1042    * @param input The input text.
1043    * @param index The offset index at which the search should be begin.
1044    * @param eflags The logical OR of any execution flags above.
1045    * @param buffer The StringBuffer to save pre-match text in.
1046    * @return An REMatch instance referencing the match, or null if none.  */
getMatch(Object input, int index, int eflags, StringBuffer buffer)1047   public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
1048     return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer);
1049   }
1050 
getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer)1051   REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
1052       // Create a new REMatch to hold results
1053       REMatch mymatch = new REMatch(numSubs, anchor, eflags);
1054       do {
1055 	  // Optimization: check if anchor + minimumLength > length
1056 	  if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
1057 	      if (match(input, mymatch)) {
1058 		  // Find longest match of them all to observe leftmost longest
1059 		  REMatch longest = mymatch;
1060 		  while ((mymatch = mymatch.next) != null) {
1061 		      if (mymatch.index > longest.index) {
1062 			  longest = mymatch;
1063 		      }
1064 		  }
1065 
1066 		  longest.end[0] = longest.index;
1067 		  longest.finish(input);
1068 		  return longest;
1069 	      }
1070 	  }
1071 	  mymatch.clear(++anchor);
1072 	  // Append character to buffer if needed
1073 	  if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
1074 	      buffer.append(input.charAt(0));
1075 	  }
1076       } while (input.move(1));
1077 
1078       // Special handling at end of input for e.g. "$"
1079       if (minimumLength == 0) {
1080 	  if (match(input, mymatch)) {
1081 	      mymatch.finish(input);
1082 	      return mymatch;
1083 	  }
1084       }
1085 
1086       return null;
1087   }
1088 
1089   /**
1090    * Returns an REMatchEnumeration that can be used to iterate over the
1091    * matches found in the input text.
1092    *
1093    * @param input The input text.
1094    * @return A non-null REMatchEnumeration instance.
1095    */
getMatchEnumeration(Object input)1096   public REMatchEnumeration getMatchEnumeration(Object input) {
1097     return getMatchEnumeration(input,0,0);
1098   }
1099 
1100 
1101   /**
1102    * Returns an REMatchEnumeration that can be used to iterate over the
1103    * matches found in the input text.
1104    *
1105    * @param input The input text.
1106    * @param index The offset index at which the search should be begin.
1107    * @return A non-null REMatchEnumeration instance, with its input cursor
1108    *  set to the index position specified.
1109    */
getMatchEnumeration(Object input, int index)1110   public REMatchEnumeration getMatchEnumeration(Object input, int index) {
1111     return getMatchEnumeration(input,index,0);
1112   }
1113 
1114   /**
1115    * Returns an REMatchEnumeration that can be used to iterate over the
1116    * matches found in the input text.
1117    *
1118    * @param input The input text.
1119    * @param index The offset index at which the search should be begin.
1120    * @param eflags The logical OR of any execution flags above.
1121    * @return A non-null REMatchEnumeration instance, with its input cursor
1122    *  set to the index position specified.
1123    */
getMatchEnumeration(Object input, int index, int eflags)1124   public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
1125     return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags);
1126   }
1127 
1128 
1129   /**
1130    * Substitutes the replacement text for the first match found in the input.
1131    *
1132    * @param input The input text.
1133    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1134    * @return A String interpolating the substituted text.
1135    * @see REMatch#substituteInto
1136    */
substitute(Object input,String replace)1137   public String substitute(Object input,String replace) {
1138     return substitute(input,replace,0,0);
1139   }
1140 
1141   /**
1142    * Substitutes the replacement text for the first match found in the input
1143    * beginning at the specified index position.  Specifying an index
1144    * effectively causes the regular expression engine to throw away the
1145    * specified number of characters.
1146    *
1147    * @param input The input text.
1148    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1149    * @param index The offset index at which the search should be begin.
1150    * @return A String containing the substring of the input, starting
1151    *   at the index position, and interpolating the substituted text.
1152    * @see REMatch#substituteInto
1153    */
substitute(Object input,String replace,int index)1154   public String substitute(Object input,String replace,int index) {
1155     return substitute(input,replace,index,0);
1156   }
1157 
1158   /**
1159    * Substitutes the replacement text for the first match found in the input
1160    * string, beginning at the specified index position and using the
1161    * specified execution flags.
1162    *
1163    * @param input The input text.
1164    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1165    * @param index The offset index at which the search should be begin.
1166    * @param eflags The logical OR of any execution flags above.
1167    * @return A String containing the substring of the input, starting
1168    *   at the index position, and interpolating the substituted text.
1169    * @see REMatch#substituteInto
1170    */
substitute(Object input,String replace,int index,int eflags)1171   public String substitute(Object input,String replace,int index,int eflags) {
1172     return substituteImpl(makeCharIndexed(input,index),replace,index,eflags);
1173   }
1174 
substituteImpl(CharIndexed input,String replace,int index,int eflags)1175   private String substituteImpl(CharIndexed input,String replace,int index,int eflags) {
1176     StringBuffer buffer = new StringBuffer();
1177     REMatch m = getMatchImpl(input,index,eflags,buffer);
1178     if (m==null) return buffer.toString();
1179     buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
1180 		   replace : m.substituteInto(replace) );
1181     if (input.move(m.end[0])) {
1182       do {
1183 	buffer.append(input.charAt(0));
1184       } while (input.move(1));
1185     }
1186     return buffer.toString();
1187   }
1188 
1189   /**
1190    * Substitutes the replacement text for each non-overlapping match found
1191    * in the input text.
1192    *
1193    * @param input The input text.
1194    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1195    * @return A String interpolating the substituted text.
1196    * @see REMatch#substituteInto
1197    */
substituteAll(Object input,String replace)1198   public String substituteAll(Object input,String replace) {
1199     return substituteAll(input,replace,0,0);
1200   }
1201 
1202   /**
1203    * Substitutes the replacement text for each non-overlapping match found
1204    * in the input text, starting at the specified index.
1205    *
1206    * If the regular expression allows the empty string to match, it will
1207    * substitute matches at all positions except the end of the input.
1208    *
1209    * @param input The input text.
1210    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1211    * @param index The offset index at which the search should be begin.
1212    * @return A String containing the substring of the input, starting
1213    *   at the index position, and interpolating the substituted text.
1214    * @see REMatch#substituteInto
1215    */
substituteAll(Object input,String replace,int index)1216   public String substituteAll(Object input,String replace,int index) {
1217     return substituteAll(input,replace,index,0);
1218   }
1219 
1220   /**
1221    * Substitutes the replacement text for each non-overlapping match found
1222    * in the input text, starting at the specified index and using the
1223    * specified execution flags.
1224    *
1225    * @param input The input text.
1226    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1227    * @param index The offset index at which the search should be begin.
1228    * @param eflags The logical OR of any execution flags above.
1229    * @return A String containing the substring of the input, starting
1230    *   at the index position, and interpolating the substituted text.
1231    * @see REMatch#substituteInto
1232    */
substituteAll(Object input,String replace,int index,int eflags)1233   public String substituteAll(Object input,String replace,int index,int eflags) {
1234     return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags);
1235   }
1236 
substituteAllImpl(CharIndexed input,String replace,int index,int eflags)1237   private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) {
1238     StringBuffer buffer = new StringBuffer();
1239     REMatch m;
1240     while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
1241 	buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
1242 		       replace : m.substituteInto(replace) );
1243       index = m.getEndIndex();
1244       if (m.end[0] == 0) {
1245 	char ch = input.charAt(0);
1246 	if (ch != CharIndexed.OUT_OF_BOUNDS)
1247 	    buffer.append(ch);
1248 	input.move(1);
1249       } else {
1250 	  input.move(m.end[0]);
1251       }
1252 
1253       if (!input.isValid()) break;
1254     }
1255     return buffer.toString();
1256   }
1257 
1258   /* Helper function for constructor */
addToken(REToken next)1259   private void addToken(REToken next) {
1260     if (next == null) return;
1261     minimumLength += next.getMinimumLength();
1262     if (firstToken == null) {
1263 	lastToken = firstToken = next;
1264     } else {
1265       // if chain returns false, it "rejected" the token due to
1266       // an optimization, and next was combined with lastToken
1267       if (lastToken.chain(next)) {
1268 	  lastToken = next;
1269       }
1270     }
1271   }
1272 
setRepeated(REToken current, int min, int max, int index)1273   private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
1274     if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
1275     return new RETokenRepeated(current.subIndex,current,min,max);
1276   }
1277 
getPosixSet(char[] pattern,int index,StringBuffer buf)1278   private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
1279     // Precondition: pattern[index-1] == ':'
1280     // we will return pos of closing ']'.
1281     int i;
1282     for (i=index; i<(pattern.length-1); i++) {
1283       if ((pattern[i] == ':') && (pattern[i+1] == ']'))
1284 	return i+2;
1285       buf.append(pattern[i]);
1286     }
1287     return index; // didn't match up
1288   }
1289 
getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax)1290   private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
1291     // Precondition: input[index-1] == '{', minMax != null
1292 
1293     boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
1294     int startIndex = index;
1295     if (index == input.length) {
1296       if (mustMatch)
1297         throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index);
1298       else
1299         return startIndex;
1300     }
1301 
1302     int min,max=0;
1303     CharUnit unit = new CharUnit();
1304     StringBuffer buf = new StringBuffer();
1305 
1306     // Read string of digits
1307     do {
1308       index = getCharUnit(input,index,unit,false);
1309       if (Character.isDigit(unit.ch))
1310         buf.append(unit.ch);
1311     } while ((index != input.length) && Character.isDigit(unit.ch));
1312 
1313     // Check for {} tomfoolery
1314     if (buf.length() == 0) {
1315       if (mustMatch)
1316         throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1317       else
1318         return startIndex;
1319     }
1320 
1321     min = Integer.parseInt(buf.toString());
1322 
1323     if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
1324       max = min;
1325     else if (index == input.length)
1326       if (mustMatch)
1327         throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index);
1328       else
1329         return startIndex;
1330     else if ((unit.ch == ',') && !unit.bk) {
1331       buf = new StringBuffer();
1332       // Read string of digits
1333       while (((index = getCharUnit(input,index,unit,false)) != input.length) && Character.isDigit(unit.ch))
1334 	buf.append(unit.ch);
1335 
1336       if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
1337         if (mustMatch)
1338           throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1339         else
1340           return startIndex;
1341 
1342       // This is the case of {x,}
1343       if (buf.length() == 0) max = Integer.MAX_VALUE;
1344       else max = Integer.parseInt(buf.toString());
1345     } else
1346       if (mustMatch)
1347         throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1348       else
1349         return startIndex;
1350 
1351     // We know min and max now, and they are valid.
1352 
1353     minMax.first = min;
1354     minMax.second = max;
1355 
1356     // return the index following the '}'
1357     return index;
1358   }
1359 
1360    /**
1361     * Return a human readable form of the compiled regular expression,
1362     * useful for debugging.
1363     */
toString()1364    public String toString() {
1365      StringBuffer sb = new StringBuffer();
1366      dump(sb);
1367      return sb.toString();
1368    }
1369 
dump(StringBuffer os)1370   void dump(StringBuffer os) {
1371     os.append('(');
1372     if (subIndex == 0)
1373       os.append("?:");
1374     if (firstToken != null)
1375       firstToken.dumpAll(os);
1376     os.append(')');
1377   }
1378 
1379   // Cast input appropriately or throw exception
makeCharIndexed(Object input, int index)1380   private static CharIndexed makeCharIndexed(Object input, int index) {
1381       // We could let a String fall through to final input, but since
1382       // it's the most likely input type, we check it first.
1383     if (input instanceof String)
1384       return new CharIndexedString((String) input,index);
1385     else if (input instanceof char[])
1386       return new CharIndexedCharArray((char[]) input,index);
1387     else if (input instanceof StringBuffer)
1388       return new CharIndexedStringBuffer((StringBuffer) input,index);
1389     else if (input instanceof InputStream)
1390       return new CharIndexedInputStream((InputStream) input,index);
1391     else if (input instanceof CharIndexed)
1392 	return (CharIndexed) input; // do we lose index info?
1393     else
1394 	return new CharIndexedString(input.toString(), index);
1395   }
1396 }
1397