1 /* gnu/regexp/RE.java
2    Copyright (C) 2006 Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 package gnu.java.util.regex;
39 
40 import gnu.java.lang.CPStringBuilder;
41 
42 import java.io.InputStream;
43 import java.io.Serializable;
44 
45 import java.util.ArrayList;
46 import java.util.List;
47 import java.util.Locale;
48 import java.util.PropertyResourceBundle;
49 import java.util.ResourceBundle;
50 
51 /**
52  * RE provides the user interface for compiling and matching regular
53  * expressions.
54  * <P>
55  * A regular expression object (class RE) is compiled by constructing it
56  * from a String, StringBuffer or character array, with optional
57  * compilation flags (below)
58  * and an optional syntax specification (see RESyntax; if not specified,
59  * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
60  * <P>
61  * Once compiled, a regular expression object is reusable as well as
62  * threadsafe: multiple threads can use the RE instance simultaneously
63  * to match against different input text.
64  * <P>
65  * Various methods attempt to match input text against a compiled
66  * regular expression.  These methods are:
67  * <LI><code>isMatch</code>: returns true if the input text in its
68  * entirety matches the regular expression pattern.
69  * <LI><code>getMatch</code>: returns the first match found in the
70  * input text, or null if no match is found.
71  * <LI><code>getAllMatches</code>: returns an array of all
72  * non-overlapping matches found in the input text.  If no matches are
73  * found, the array is zero-length.
74  * <LI><code>substitute</code>: substitute the first occurence of the
75  * pattern in the input text with a replacement string (which may
76  * include metacharacters $0-$9, see REMatch.substituteInto).
77  * <LI><code>substituteAll</code>: same as above, but repeat for each
78  * match before returning.
79  * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
80  * object that allows iteration over the matches (see
81  * REMatchEnumeration for some reasons why you may want to do this
82  * instead of using <code>getAllMatches</code>.
83  * <P>
84  *
85  * These methods all have similar argument lists.  The input can be a
86  * CharIndexed, String, a character array, a StringBuffer, or an
87  * InputStream of some sort.  Note that when using an
88  * InputStream, the stream read position cannot be guaranteed after
89  * attempting a match (this is not a bug, but a consequence of the way
90  * regular expressions work).  Using an REMatchEnumeration can
91  * eliminate most positioning problems.
92  *
93  * Although the input object can be of various types, it is recommended
94  * that it should be a CharIndexed because {@link CharIndexed#getLastMatch()}
95  * can show the last match found on this input, which helps the expression
96  * \G work as the end of the previous match.
97  *
98  * <P>
99  *
100  * The optional index argument specifies the offset from the beginning
101  * of the text at which the search should start (see the descriptions
102  * of some of the execution flags for how this can affect positional
103  * pattern operators).  For an InputStream, this means an
104  * offset from the current read position, so subsequent calls with the
105  * same index argument on an InputStream will not
106  * necessarily access the same position on the stream, whereas
107  * repeated searches at a given index in a fixed string will return
108  * consistent results.
109  *
110  * <P>
111  * You can optionally affect the execution environment by using a
112  * combination of execution flags (constants listed below).
113  *
114  * <P>
115  * All operations on a regular expression are performed in a
116  * thread-safe manner.
117  *
118  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
119  * @version 1.1.5-dev, to be released
120  */
121 
122 public class RE extends REToken
123 {
124 
125   private static final class IntPair implements Serializable
126   {
127     public int first, second;
128   }
129 
130   private static final class CharUnit implements Serializable
131   {
132     public char ch;
133     public boolean bk;
134   }
135 
136   // This String will be returned by getVersion()
137   private static final String VERSION = "1.1.5-dev";
138 
139   // The localized strings are kept in a separate file
140   // Used by getLocalizedMessage().
141   private static ResourceBundle messages;
142 
143   // Name of the bundle that contains the localized messages.
144   private static final String bundle = "gnu/java/util/regex/MessagesBundle";
145 
146   // These are, respectively, the first and last tokens in our linked list
147   // If there is only one token, firstToken == lastToken
148   private REToken firstToken, lastToken;
149 
150   // This is the number of subexpressions in this regular expression,
151   // with a minimum value of zero.  Returned by getNumSubs()
152   private int numSubs;
153 
154     /** Minimum length, in characters, of any possible match. */
155   private int minimumLength;
156   private int maximumLength;
157 
158   /**
159    * Compilation flag. Do  not  differentiate  case.   Subsequent
160    * searches  using  this  RE will be case insensitive.
161    */
162   public static final int REG_ICASE = 0x02;
163 
164   /**
165    * Compilation flag. The match-any-character operator (dot)
166    * will match a newline character.  When set this overrides the syntax
167    * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
168    * the "/s" operator in Perl.
169    */
170   public static final int REG_DOT_NEWLINE = 0x04;
171 
172   /**
173    * Compilation flag. Use multiline mode.  In this mode, the ^ and $
174    * anchors will match based on newlines within the input. This is
175    * equivalent to the "/m" operator in Perl.
176    */
177   public static final int REG_MULTILINE = 0x08;
178 
179   /**
180    * Execution flag.
181    * The match-beginning operator (^) will not match at the beginning
182    * of the input string. Useful for matching on a substring when you
183    * know the context of the input is such that position zero of the
184    * input to the match test is not actually position zero of the text.
185    * <P>
186    * This example demonstrates the results of various ways of matching on
187    * a substring.
188    * <P>
189    * <CODE>
190    * String s = "food bar fool";<BR>
191    * RE exp = new RE("^foo.");<BR>
192    * REMatch m0 = exp.getMatch(s);<BR>
193    * REMatch m1 = exp.getMatch(s.substring(8));<BR>
194    * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
195    * REMatch m3 = exp.getMatch(s,8);                            <BR>
196    * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
197    * <P>
198    * // Results:<BR>
199    * //  m0.toString(): "food"<BR>
200    * //  m1.toString(): "fool"<BR>
201    * //  m2.toString(): null<BR>
202    * //  m3.toString(): null<BR>
203    * //  m4.toString(): "fool"<BR>
204    * </CODE>
205    */
206   public static final int REG_NOTBOL = 0x10;
207 
208   /**
209    * Execution flag.
210    * The match-end operator ($) does not match at the end
211    * of the input string. Useful for matching on substrings.
212    */
213   public static final int REG_NOTEOL = 0x20;
214 
215   /**
216    * Execution flag.
217    * When a match method is invoked that starts matching at a non-zero
218    * index into the input, treat the input as if it begins at the index
219    * given.  The effect of this flag is that the engine does not "see"
220    * any text in the input before the given index.  This is useful so
221    * that the match-beginning operator (^) matches not at position 0
222    * in the input string, but at the position the search started at
223    * (based on the index input given to the getMatch function).  See
224    * the example under REG_NOTBOL.  It also affects the use of the \&lt;
225    * and \b operators.
226    */
227   public static final int REG_ANCHORINDEX = 0x40;
228 
229   /**
230    * Execution flag.
231    * The substitute and substituteAll methods will not attempt to
232    * interpolate occurrences of $1-$9 in the replacement text with
233    * the corresponding subexpressions.  For example, you may want to
234    * replace all matches of "one dollar" with "$1".
235    */
236   public static final int REG_NO_INTERPOLATE = 0x80;
237 
238   /**
239    * Execution flag.
240    * Try to match the whole input string. An implicit match-end operator
241    * is added to this regexp.
242    */
243   public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
244 
245   /**
246    * Execution flag.
247    * The substitute and substituteAll methods will treat the
248    * character '\' in the replacement as an escape to a literal
249    * character. In this case "\n", "\$", "\\", "\x40" and "\012"
250    * will become "n", "$", "\", "x40" and "012" respectively.
251    * This flag has no effect if REG_NO_INTERPOLATE is set on.
252    */
253   public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
254 
255   /**
256    * Compilation flag. Allow whitespace and comments in pattern.
257    * This is equivalent to the "/x" operator in Perl.
258    */
259   public static final int REG_X_COMMENTS = 0x0400;
260 
261   /**
262    * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
263    */
264   public static final int REG_ICASE_USASCII = 0x0800;
265 
266   /**
267    * Execution flag.
268    * Do not move the position at which the search begins.  If not set,
269    * the starting position will be moved until a match is found.
270    */
271   public static final int REG_FIX_STARTING_POSITION = 0x1000;
272 
273   /** Returns a string representing the version of the gnu.regexp package. */
version()274   public static final String version ()
275   {
276     return VERSION;
277   }
278 
279   // Retrieves a message from the ResourceBundle
getLocalizedMessage(String key)280   static final String getLocalizedMessage (String key)
281   {
282     if (messages == null)
283       messages =
284         PropertyResourceBundle.getBundle (bundle, Locale.getDefault ());
285     return messages.getString (key);
286   }
287 
288   /**
289    * Constructs a regular expression pattern buffer without any compilation
290    * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
291    *
292    * @param pattern A regular expression pattern, in the form of a String,
293    *   StringBuffer or char[].  Other input types will be converted to
294    *   strings using the toString() method.
295    * @exception REException The input pattern could not be parsed.
296    * @exception NullPointerException The pattern was null.
297    */
RE(Object pattern)298   public RE (Object pattern) throws REException
299   {
300     this (pattern, 0, RESyntax.RE_SYNTAX_PERL5, 0, 0);
301   }
302 
303   /**
304    * Constructs a regular expression pattern buffer using the specified
305    * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
306    *
307    * @param pattern A regular expression pattern, in the form of a String,
308    *   StringBuffer, or char[].  Other input types will be converted to
309    *   strings using the toString() method.
310    * @param cflags The logical OR of any combination of the compilation flags listed above.
311    * @exception REException The input pattern could not be parsed.
312    * @exception NullPointerException The pattern was null.
313    */
RE(Object pattern, int cflags)314   public RE (Object pattern, int cflags) throws REException
315   {
316     this (pattern, cflags, RESyntax.RE_SYNTAX_PERL5, 0, 0);
317   }
318 
319   /**
320    * Constructs a regular expression pattern buffer using the specified
321    * compilation flags and regular expression syntax.
322    *
323    * @param pattern A regular expression pattern, in the form of a String,
324    *   StringBuffer, or char[].  Other input types will be converted to
325    *   strings using the toString() method.
326    * @param cflags The logical OR of any combination of the compilation flags listed above.
327    * @param syntax The type of regular expression syntax to use.
328    * @exception REException The input pattern could not be parsed.
329    * @exception NullPointerException The pattern was null.
330    */
RE(Object pattern, int cflags, RESyntax syntax)331   public RE (Object pattern, int cflags, RESyntax syntax) throws REException
332   {
333     this (pattern, cflags, syntax, 0, 0);
334   }
335 
336   // internal constructor used for alternation
RE(REToken first, REToken last, int subs, int subIndex, int minLength, int maxLength)337   private RE (REToken first, REToken last, int subs, int subIndex,
338               int minLength, int maxLength)
339   {
340     super (subIndex);
341     firstToken = first;
342     lastToken = last;
343     numSubs = subs;
344     minimumLength = minLength;
345     maximumLength = maxLength;
346     addToken (new RETokenEndSub (subIndex));
347   }
348 
RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub)349   private RE (Object patternObj, int cflags, RESyntax syntax, int myIndex,
350               int nextSub) throws REException
351   {
352     super (myIndex);            // Subexpression index of this token.
353     initialize (patternObj, cflags, syntax, myIndex, nextSub);
354   }
355 
356   // For use by subclasses
RE()357   protected RE ()
358   {
359     super (0);
360   }
361 
362   // The meat of construction
initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub)363   protected void initialize (Object patternObj, int cflags, RESyntax syntax,
364                              int myIndex, int nextSub) throws REException
365   {
366     char[] pattern;
367     if (patternObj instanceof String)
368       {
369         pattern = ((String) patternObj).toCharArray ();
370       }
371     else if (patternObj instanceof char[])
372       {
373         pattern = (char[]) patternObj;
374       }
375     else if (patternObj instanceof StringBuffer)
376       {
377         pattern = new char[((StringBuffer) patternObj).length ()];
378         ((StringBuffer) patternObj).getChars (0, pattern.length, pattern, 0);
379       }
380     else if (patternObj instanceof StringBuilder)
381       {
382         pattern = new char[((StringBuilder) patternObj).length ()];
383         ((StringBuilder) patternObj).getChars (0, pattern.length, pattern, 0);
384       }
385     else if (patternObj instanceof CPStringBuilder)
386       {
387         pattern = new char[((CPStringBuilder) patternObj).length ()];
388         ((CPStringBuilder) patternObj).getChars (0, pattern.length, pattern,
389                                                  0);
390       }
391     else
392       {
393         pattern = patternObj.toString ().toCharArray ();
394       }
395 
396     int pLength = pattern.length;
397 
398     numSubs = 0;                // Number of subexpressions in this token.
399     ArrayList < REToken > branches = null;
400 
401     // linked list of tokens (sort of -- some closed loops can exist)
402     firstToken = lastToken = null;
403 
404     // Precalculate these so we don't pay for the math every time we
405     // need to access them.
406     boolean insens = ((cflags & REG_ICASE) > 0);
407     boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
408 
409     // Parse pattern into tokens.  Does anyone know if it's more efficient
410     // to use char[] than a String.charAt()?  I'm assuming so.
411 
412     // index tracks the position in the char array
413     int index = 0;
414 
415     // this will be the current parse character (pattern[index])
416     CharUnit unit = new CharUnit ();
417 
418     // This is used for {x,y} calculations
419     IntPair minMax = new IntPair ();
420 
421     // Buffer a token so we can create a TokenRepeated, etc.
422     REToken currentToken = null;
423     boolean quot = false;
424 
425     // Saved syntax and flags.
426     RESyntax savedSyntax = null;
427     int savedCflags = 0;
428     boolean flagsSaved = false;
429 
430     while (index < pLength)
431       {
432         // read the next character unit (including backslash escapes)
433         index = getCharUnit (pattern, index, unit, quot);
434 
435         if (unit.bk)
436           if (unit.ch == 'Q')
437             {
438               quot = true;
439               continue;
440             }
441           else if (unit.ch == 'E')
442             {
443               quot = false;
444               continue;
445             }
446         if (quot)
447           unit.bk = false;
448 
449         if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot))
450           {
451             if (Character.isWhitespace (unit.ch))
452               {
453                 continue;
454               }
455             if (unit.ch == '#')
456               {
457                 for (int i = index; i < pLength; i++)
458                   {
459                     if (pattern[i] == '\n')
460                       {
461                         index = i + 1;
462                         continue;
463                       }
464                     else if (pattern[i] == '\r')
465                       {
466                         if (i + 1 < pLength && pattern[i + 1] == '\n')
467                           {
468                             index = i + 2;
469                           }
470                         else
471                           {
472                             index = i + 1;
473                           }
474                         continue;
475                       }
476                   }
477                 index = pLength;
478                 continue;
479               }
480           }
481 
482         // ALTERNATION OPERATOR
483         //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
484         //  not available if RE_LIMITED_OPS is set
485 
486         // TODO: the '\n' literal here should be a test against REToken.newline,
487         // which unfortunately may be more than a single character.
488         if (((unit.ch == '|'
489               && (syntax.get (RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
490              || (syntax.get (RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n')
491                  && !(unit.bk || quot)))
492             && !syntax.get (RESyntax.RE_LIMITED_OPS))
493           {
494             // make everything up to here be a branch. create vector if nec.
495             addToken (currentToken);
496             RE theBranch =
497               new RE (firstToken, lastToken, numSubs, subIndex, minimumLength,
498                       maximumLength);
499             minimumLength = 0;
500             maximumLength = 0;
501             if (branches == null)
502               {
503                 branches = new ArrayList < REToken > ();
504               }
505             branches.add (theBranch);
506             firstToken = lastToken = currentToken = null;
507           }
508 
509         // INTERVAL OPERATOR:
510         //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
511         //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
512         //
513         // OPEN QUESTION:
514         //  what is proper interpretation of '{' at start of string?
515         //
516         // This method used to check "repeat.empty.token" to avoid such regexp
517         // as "(a*){2,}", but now "repeat.empty.token" is allowed.
518 
519         else if ((unit.ch == '{') && syntax.get (RESyntax.RE_INTERVALS)
520                  && (syntax.
521                      get (RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot)))
522           {
523             int newIndex = getMinMax (pattern, index, minMax, syntax);
524             if (newIndex > index)
525               {
526                 if (minMax.first > minMax.second)
527                   throw new
528                     REException (getLocalizedMessage ("interval.order"),
529                                  REException.REG_BADRPT, newIndex);
530                 if (currentToken == null)
531                   throw new
532                     REException (getLocalizedMessage ("repeat.no.token"),
533                                  REException.REG_BADRPT, newIndex);
534                 if (currentToken instanceof RETokenRepeated)
535                   throw new
536                     REException (getLocalizedMessage ("repeat.chained"),
537                                  REException.REG_BADRPT, newIndex);
538                 if (currentToken instanceof RETokenWordBoundary
539                     || currentToken instanceof RETokenWordBoundary)
540                   throw new
541                     REException (getLocalizedMessage ("repeat.assertion"),
542                                  REException.REG_BADRPT, newIndex);
543                 index = newIndex;
544                 currentToken =
545                   setRepeated (currentToken, minMax.first, minMax.second,
546                                index);
547               }
548             else
549               {
550                 addToken (currentToken);
551                 currentToken = new RETokenChar (subIndex, unit.ch, insens);
552                 if (insensUSASCII)
553                   currentToken.unicodeAware = false;
554               }
555           }
556 
557         // LIST OPERATOR:
558         //  [...] | [^...]
559 
560         else if ((unit.ch == '[') && !(unit.bk || quot))
561           {
562             // Create a new RETokenOneOf
563             ParseCharClassResult result =
564               parseCharClass (subIndex, pattern, index, pLength, cflags,
565                               syntax, 0);
566             addToken (currentToken);
567             currentToken = result.token;
568             index = result.index;
569           }
570 
571         // SUBEXPRESSIONS
572         //  (...) | \(...\) depending on RE_NO_BK_PARENS
573 
574         else if ((unit.ch == '(')
575                  && (syntax.
576                      get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
577           {
578             boolean pure = false;
579             boolean comment = false;
580             boolean lookAhead = false;
581             boolean lookBehind = false;
582             boolean independent = false;
583             boolean negativelh = false;
584             boolean negativelb = false;
585             if ((index + 1 < pLength) && (pattern[index] == '?'))
586               {
587                 switch (pattern[index + 1])
588                   {
589                   case '!':
590                     if (syntax.get (RESyntax.RE_LOOKAHEAD))
591                       {
592                         pure = true;
593                         negativelh = true;
594                         lookAhead = true;
595                         index += 2;
596                       }
597                     break;
598                   case '=':
599                     if (syntax.get (RESyntax.RE_LOOKAHEAD))
600                       {
601                         pure = true;
602                         lookAhead = true;
603                         index += 2;
604                       }
605                     break;
606                   case '<':
607                     // We assume that if the syntax supports look-ahead,
608                     // it also supports look-behind.
609                     if (syntax.get (RESyntax.RE_LOOKAHEAD))
610                       {
611                         index++;
612                         switch (pattern[index + 1])
613                           {
614                           case '!':
615                             pure = true;
616                             negativelb = true;
617                             lookBehind = true;
618                             index += 2;
619                             break;
620                           case '=':
621                             pure = true;
622                             lookBehind = true;
623                             index += 2;
624                           }
625                       }
626                     break;
627                   case '>':
628                     // We assume that if the syntax supports look-ahead,
629                     // it also supports independent group.
630                     if (syntax.get (RESyntax.RE_LOOKAHEAD))
631                       {
632                         pure = true;
633                         independent = true;
634                         index += 2;
635                       }
636                     break;
637                   case 'i':
638                   case 'd':
639                   case 'm':
640                   case 's':
641                   case 'u':
642                   case 'x':
643                   case '-':
644                     if (!syntax.get (RESyntax.RE_EMBEDDED_FLAGS))
645                       break;
646                     // Set or reset syntax flags.
647                     int flagIndex = index + 1;
648                     int endFlag = -1;
649                     RESyntax newSyntax = new RESyntax (syntax);
650                     int newCflags = cflags;
651                     boolean negate = false;
652                     while (flagIndex < pLength && endFlag < 0)
653                       {
654                         switch (pattern[flagIndex])
655                           {
656                           case 'i':
657                             if (negate)
658                               newCflags &= ~REG_ICASE;
659                             else
660                               newCflags |= REG_ICASE;
661                             flagIndex++;
662                             break;
663                           case 'd':
664                             if (negate)
665                               newSyntax.setLineSeparator (RESyntax.
666                                                           DEFAULT_LINE_SEPARATOR);
667                             else
668                               newSyntax.setLineSeparator ("\n");
669                             flagIndex++;
670                             break;
671                           case 'm':
672                             if (negate)
673                               newCflags &= ~REG_MULTILINE;
674                             else
675                               newCflags |= REG_MULTILINE;
676                             flagIndex++;
677                             break;
678                           case 's':
679                             if (negate)
680                               newCflags &= ~REG_DOT_NEWLINE;
681                             else
682                               newCflags |= REG_DOT_NEWLINE;
683                             flagIndex++;
684                             break;
685                           case 'u':
686                             if (negate)
687                               newCflags |= REG_ICASE_USASCII;
688                             else
689                               newCflags &= ~REG_ICASE_USASCII;
690                             flagIndex++;
691                             break;
692                           case 'x':
693                             if (negate)
694                               newCflags &= ~REG_X_COMMENTS;
695                             else
696                               newCflags |= REG_X_COMMENTS;
697                             flagIndex++;
698                             break;
699                           case '-':
700                             negate = true;
701                             flagIndex++;
702                             break;
703                           case ':':
704                           case ')':
705                             endFlag = pattern[flagIndex];
706                             break;
707                           default:
708                             throw new
709                               REException (getLocalizedMessage
710                                            ("repeat.no.token"),
711                                            REException.REG_BADRPT, index);
712                           }
713                       }
714                     if (endFlag == ')')
715                       {
716                         syntax = newSyntax;
717                         cflags = newCflags;
718                         insens = ((cflags & REG_ICASE) > 0);
719                         insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
720                         // This can be treated as though it were a comment.
721                         comment = true;
722                         index = flagIndex - 1;
723                         break;
724                       }
725                     if (endFlag == ':')
726                       {
727                         savedSyntax = syntax;
728                         savedCflags = cflags;
729                         flagsSaved = true;
730                         syntax = newSyntax;
731                         cflags = newCflags;
732                         insens = ((cflags & REG_ICASE) > 0);
733                         insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
734                         index = flagIndex - 1;
735                         // Fall through to the next case.
736                       }
737                     else
738                       {
739                         throw new
740                           REException (getLocalizedMessage
741                                        ("unmatched.paren"),
742                                        REException.REG_ESUBREG, index);
743                       }
744                   case ':':
745                     if (syntax.get (RESyntax.RE_PURE_GROUPING))
746                       {
747                         pure = true;
748                         index += 2;
749                       }
750                     break;
751                   case '#':
752                     if (syntax.get (RESyntax.RE_COMMENTS))
753                       {
754                         comment = true;
755                       }
756                     break;
757                   default:
758                     throw new
759                       REException (getLocalizedMessage ("repeat.no.token"),
760                                    REException.REG_BADRPT, index);
761                   }
762               }
763 
764             if (index >= pLength)
765               {
766                 throw new
767                   REException (getLocalizedMessage ("unmatched.paren"),
768                                REException.REG_ESUBREG, index);
769               }
770 
771             // find end of subexpression
772             int endIndex = index;
773             int nextIndex = index;
774             int nested = 0;
775 
776             while (((nextIndex =
777                      getCharUnit (pattern, endIndex, unit, false)) > 0)
778                    && !(nested == 0 && (unit.ch == ')')
779                         && (syntax.
780                             get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
781                                                               || quot))))
782               {
783                 if ((endIndex = nextIndex) >= pLength)
784                   throw new
785                     REException (getLocalizedMessage ("subexpr.no.end"),
786                                  REException.REG_ESUBREG, nextIndex);
787                 else
788               if ((unit.ch == '[') && !(unit.bk || quot))
789                 {
790                   // I hate to do something similar to the LIST OPERATOR matters
791                   // above, but ...
792                   int listIndex = nextIndex;
793                   if (listIndex < pLength && pattern[listIndex] == '^')
794                     listIndex++;
795                   if (listIndex < pLength && pattern[listIndex] == ']')
796                     listIndex++;
797                   int listEndIndex = -1;
798                   int listNest = 0;
799                   while (listIndex < pLength && listEndIndex < 0)
800                     {
801                       switch (pattern[listIndex++])
802                         {
803                         case '\\':
804                           listIndex++;
805                           break;
806                         case '[':
807                           // Sun's API document says that regexp like "[a-d[m-p]]"
808                           // is legal. Even something like "[[[^]]]]" is accepted.
809                           listNest++;
810                           if (listIndex < pLength
811                               && pattern[listIndex] == '^')
812                             listIndex++;
813                           if (listIndex < pLength
814                               && pattern[listIndex] == ']')
815                             listIndex++;
816                           break;
817                         case ']':
818                           if (listNest == 0)
819                             listEndIndex = listIndex;
820                           listNest--;
821                           break;
822                         }
823                     }
824                   if (listEndIndex >= 0)
825                     {
826                       nextIndex = listEndIndex;
827                       if ((endIndex = nextIndex) >= pLength)
828                         throw new
829                           REException (getLocalizedMessage ("subexpr.no.end"),
830                                        REException.REG_ESUBREG, nextIndex);
831                       else
832                       continue;
833                     }
834                   throw new
835                     REException (getLocalizedMessage ("subexpr.no.end"),
836                                  REException.REG_ESUBREG, nextIndex);
837                 }
838               else if (unit.ch == '('
839                        && (syntax.
840                            get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
841                                                              || quot)))
842                 nested++;
843               else if (unit.ch == ')'
844                        && (syntax.
845                            get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
846                                                              || quot)))
847                 nested--;
848               }
849 
850             // endIndex is now position at a ')','\)'
851             // nextIndex is end of string or position after ')' or '\)'
852 
853             if (comment)
854               index = nextIndex;
855             else
856               {                 // not a comment
857                 // create RE subexpression as token.
858                 addToken (currentToken);
859                 if (!pure)
860                   {
861                     numSubs++;
862                   }
863 
864                 int useIndex = (pure || lookAhead || lookBehind
865                                 || independent) ? 0 : nextSub + numSubs;
866                 currentToken =
867                   new RE (String.valueOf (pattern, index, endIndex - index).
868                           toCharArray (), cflags, syntax, useIndex,
869                           nextSub + numSubs);
870                 numSubs += ((RE) currentToken).getNumSubs ();
871 
872                 if (lookAhead)
873                   {
874                     currentToken =
875                       new RETokenLookAhead (currentToken, negativelh);
876                   }
877                 else if (lookBehind)
878                   {
879                     currentToken =
880                       new RETokenLookBehind (currentToken, negativelb);
881                   }
882                 else if (independent)
883                   {
884                     currentToken = new RETokenIndependent (currentToken);
885                   }
886 
887                 index = nextIndex;
888                 if (flagsSaved)
889                   {
890                     syntax = savedSyntax;
891                     cflags = savedCflags;
892                     insens = ((cflags & REG_ICASE) > 0);
893                     insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
894                     flagsSaved = false;
895                   }
896               }                 // not a comment
897           }                     // subexpression
898 
899         // UNMATCHED RIGHT PAREN
900         // ) or \) throw exception if
901         // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
902         else if (!syntax.get (RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
903                  && ((unit.ch == ')')
904                      && (syntax.
905                          get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))))
906           {
907             throw new REException (getLocalizedMessage ("unmatched.paren"),
908                                    REException.REG_EPAREN, index);
909           }
910 
911         // START OF LINE OPERATOR
912         //  ^
913 
914         else if ((unit.ch == '^') && !(unit.bk || quot))
915           {
916             addToken (currentToken);
917             currentToken = null;
918             RETokenStart token = null;
919             if ((cflags & REG_MULTILINE) > 0)
920               {
921                 String sep = syntax.getLineSeparator ();
922                 if (sep == null)
923                   {
924                     token = new RETokenStart (subIndex, null, true);
925                   }
926                 else
927                   {
928                     token = new RETokenStart (subIndex, sep);
929                   }
930               }
931             else
932               {
933                 token = new RETokenStart (subIndex, null);
934               }
935             addToken (token);
936           }
937 
938         // END OF LINE OPERATOR
939         //  $
940 
941         else if ((unit.ch == '$') && !(unit.bk || quot))
942           {
943             addToken (currentToken);
944             currentToken = null;
945             RETokenEnd token = null;
946             if ((cflags & REG_MULTILINE) > 0)
947               {
948                 String sep = syntax.getLineSeparator ();
949                 if (sep == null)
950                   {
951                     token = new RETokenEnd (subIndex, null, true);
952                   }
953                 else
954                   {
955                     token = new RETokenEnd (subIndex, sep);
956                   }
957               }
958             else
959               {
960                 token = new RETokenEnd (subIndex, null);
961               }
962             addToken (token);
963           }
964 
965         // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
966         //  .
967 
968         else if ((unit.ch == '.') && !(unit.bk || quot))
969           {
970             addToken (currentToken);
971             currentToken =
972               new RETokenAny (subIndex, syntax.get (RESyntax.RE_DOT_NEWLINE)
973                               || ((cflags & REG_DOT_NEWLINE) > 0),
974                               syntax.get (RESyntax.RE_DOT_NOT_NULL));
975           }
976 
977         // ZERO-OR-MORE REPEAT OPERATOR
978         //  *
979         //
980         // This method used to check "repeat.empty.token" to avoid such regexp
981         // as "(a*)*", but now "repeat.empty.token" is allowed.
982 
983         else if ((unit.ch == '*') && !(unit.bk || quot))
984           {
985             if (currentToken == null)
986               throw new REException (getLocalizedMessage ("repeat.no.token"),
987                                      REException.REG_BADRPT, index);
988             if (currentToken instanceof RETokenRepeated)
989               throw new REException (getLocalizedMessage ("repeat.chained"),
990                                      REException.REG_BADRPT, index);
991             if (currentToken instanceof RETokenWordBoundary
992                 || currentToken instanceof RETokenWordBoundary)
993               throw new REException (getLocalizedMessage ("repeat.assertion"),
994                                      REException.REG_BADRPT, index);
995             currentToken =
996               setRepeated (currentToken, 0, Integer.MAX_VALUE, index);
997           }
998 
999         // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
1000         //  + | \+ depending on RE_BK_PLUS_QM
1001         //  not available if RE_LIMITED_OPS is set
1002         //
1003         // This method used to check "repeat.empty.token" to avoid such regexp
1004         // as "(a*)+", but now "repeat.empty.token" is allowed.
1005 
1006         else if ((unit.ch == '+') && !syntax.get (RESyntax.RE_LIMITED_OPS)
1007                  && (!syntax.
1008                      get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot)))
1009           {
1010             if (currentToken == null)
1011               throw new REException (getLocalizedMessage ("repeat.no.token"),
1012                                      REException.REG_BADRPT, index);
1013 
1014             // Check for possessive matching on RETokenRepeated
1015             if (currentToken instanceof RETokenRepeated)
1016               {
1017                 RETokenRepeated tokenRep = (RETokenRepeated) currentToken;
1018                 if (syntax.get (RESyntax.RE_POSSESSIVE_OPS)
1019                     && !tokenRep.isPossessive () && !tokenRep.isStingy ())
1020                   tokenRep.makePossessive ();
1021                 else
1022                   throw new
1023                     REException (getLocalizedMessage ("repeat.chained"),
1024                                  REException.REG_BADRPT, index);
1025 
1026               }
1027             else if (currentToken instanceof RETokenWordBoundary
1028                      || currentToken instanceof RETokenWordBoundary)
1029               throw new REException (getLocalizedMessage ("repeat.assertion"),
1030                                      REException.REG_BADRPT, index);
1031             else
1032             currentToken =
1033               setRepeated (currentToken, 1, Integer.MAX_VALUE, index);
1034           }
1035 
1036         // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
1037         //  ? | \? depending on RE_BK_PLUS_QM
1038         //  not available if RE_LIMITED_OPS is set
1039         //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
1040 
1041         else if ((unit.ch == '?') && !syntax.get (RESyntax.RE_LIMITED_OPS)
1042                  && (!syntax.
1043                      get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot)))
1044           {
1045             if (currentToken == null)
1046               throw new REException (getLocalizedMessage ("repeat.no.token"),
1047                                      REException.REG_BADRPT, index);
1048 
1049             // Check for stingy matching on RETokenRepeated
1050             if (currentToken instanceof RETokenRepeated)
1051               {
1052                 RETokenRepeated tokenRep = (RETokenRepeated) currentToken;
1053                 if (syntax.get (RESyntax.RE_STINGY_OPS)
1054                     && !tokenRep.isStingy () && !tokenRep.isPossessive ())
1055                   tokenRep.makeStingy ();
1056                 else
1057                   throw new
1058                     REException (getLocalizedMessage ("repeat.chained"),
1059                                  REException.REG_BADRPT, index);
1060               }
1061             else if (currentToken instanceof RETokenWordBoundary
1062                      || currentToken instanceof RETokenWordBoundary)
1063               throw new REException (getLocalizedMessage ("repeat.assertion"),
1064                                      REException.REG_BADRPT, index);
1065             else
1066             currentToken = setRepeated (currentToken, 0, 1, index);
1067           }
1068 
1069         // OCTAL CHARACTER
1070         //  \0377
1071 
1072         else if (unit.bk && (unit.ch == '0')
1073                  && syntax.get (RESyntax.RE_OCTAL_CHAR))
1074           {
1075             CharExpression ce =
1076               getCharExpression (pattern, index - 2, pLength, syntax);
1077             if (ce == null)
1078               throw new REException ("invalid octal character",
1079                                      REException.REG_ESCAPE, index);
1080             index = index - 2 + ce.len;
1081             addToken (currentToken);
1082             currentToken = new RETokenChar (subIndex, ce.ch, insens);
1083             if (insensUSASCII)
1084               currentToken.unicodeAware = false;
1085           }
1086 
1087         // BACKREFERENCE OPERATOR
1088         //  \1 \2 ... \9 and \10 \11 \12 ...
1089         // not available if RE_NO_BK_REFS is set
1090         // Perl recognizes \10, \11, and so on only if enough number of
1091         // parentheses have opened before it, otherwise they are treated
1092         // as aliases of \010, \011, ... (octal characters).  In case of
1093         // Sun's JDK, octal character expression must always begin with \0.
1094         // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
1095         // JDK treats \2 as a back reference to the 2nd group because
1096         // there are only two groups. But in our poor implementation,
1097         // we cannot help but treat \29 as a back reference to the 29th group.
1098 
1099         else if (unit.bk && Character.isDigit (unit.ch)
1100                  && !syntax.get (RESyntax.RE_NO_BK_REFS))
1101           {
1102             addToken (currentToken);
1103             int numBegin = index - 1;
1104             int numEnd = pLength;
1105             for (int i = index; i < pLength; i++)
1106               {
1107                 if (!Character.isDigit (pattern[i]))
1108                   {
1109                     numEnd = i;
1110                     break;
1111                   }
1112               }
1113             int num = parseInt (pattern, numBegin, numEnd - numBegin, 10);
1114 
1115             currentToken = new RETokenBackRef (subIndex, num, insens);
1116             if (insensUSASCII)
1117               currentToken.unicodeAware = false;
1118             index = numEnd;
1119           }
1120 
1121         // START OF STRING OPERATOR
1122         //  \A if RE_STRING_ANCHORS is set
1123 
1124         else if (unit.bk && (unit.ch == 'A')
1125                  && syntax.get (RESyntax.RE_STRING_ANCHORS))
1126           {
1127             addToken (currentToken);
1128             currentToken = new RETokenStart (subIndex, null);
1129           }
1130 
1131         // WORD BREAK OPERATOR
1132         //  \b if ????
1133 
1134         else if (unit.bk && (unit.ch == 'b')
1135                  && syntax.get (RESyntax.RE_STRING_ANCHORS))
1136           {
1137             addToken (currentToken);
1138             currentToken =
1139               new RETokenWordBoundary (subIndex,
1140                                        RETokenWordBoundary.
1141                                        BEGIN | RETokenWordBoundary.END,
1142                                        false);
1143           }
1144 
1145         // WORD BEGIN OPERATOR
1146         //  \< if ????
1147         else if (unit.bk && (unit.ch == '<'))
1148           {
1149             addToken (currentToken);
1150             currentToken =
1151               new RETokenWordBoundary (subIndex, RETokenWordBoundary.BEGIN,
1152                                        false);
1153           }
1154 
1155         // WORD END OPERATOR
1156         //  \> if ????
1157         else if (unit.bk && (unit.ch == '>'))
1158           {
1159             addToken (currentToken);
1160             currentToken =
1161               new RETokenWordBoundary (subIndex, RETokenWordBoundary.END,
1162                                        false);
1163           }
1164 
1165         // NON-WORD BREAK OPERATOR
1166         // \B if ????
1167 
1168         else if (unit.bk && (unit.ch == 'B')
1169                  && syntax.get (RESyntax.RE_STRING_ANCHORS))
1170           {
1171             addToken (currentToken);
1172             currentToken =
1173               new RETokenWordBoundary (subIndex,
1174                                        RETokenWordBoundary.
1175                                        BEGIN | RETokenWordBoundary.END, true);
1176           }
1177 
1178 
1179         // DIGIT OPERATOR
1180         //  \d if RE_CHAR_CLASS_ESCAPES is set
1181 
1182         else if (unit.bk && (unit.ch == 'd')
1183                  && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1184           {
1185             addToken (currentToken);
1186             currentToken =
1187               new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, false);
1188             if (insensUSASCII)
1189               currentToken.unicodeAware = false;
1190           }
1191 
1192         // NON-DIGIT OPERATOR
1193         //  \D
1194 
1195         else if (unit.bk && (unit.ch == 'D')
1196                  && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1197           {
1198             addToken (currentToken);
1199             currentToken =
1200               new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, true);
1201             if (insensUSASCII)
1202               currentToken.unicodeAware = false;
1203           }
1204 
1205         // NEWLINE ESCAPE
1206         //  \n
1207 
1208         else if (unit.bk && (unit.ch == 'n'))
1209           {
1210             addToken (currentToken);
1211             currentToken = new RETokenChar (subIndex, '\n', false);
1212           }
1213 
1214         // RETURN ESCAPE
1215         //  \r
1216 
1217         else if (unit.bk && (unit.ch == 'r'))
1218           {
1219             addToken (currentToken);
1220             currentToken = new RETokenChar (subIndex, '\r', false);
1221           }
1222 
1223         // WHITESPACE OPERATOR
1224         //  \s if RE_CHAR_CLASS_ESCAPES is set
1225 
1226         else if (unit.bk && (unit.ch == 's')
1227                  && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1228           {
1229             addToken (currentToken);
1230             currentToken =
1231               new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, false);
1232             if (insensUSASCII)
1233               currentToken.unicodeAware = false;
1234           }
1235 
1236         // NON-WHITESPACE OPERATOR
1237         //  \S
1238 
1239         else if (unit.bk && (unit.ch == 'S')
1240                  && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1241           {
1242             addToken (currentToken);
1243             currentToken =
1244               new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, true);
1245             if (insensUSASCII)
1246               currentToken.unicodeAware = false;
1247           }
1248 
1249         // TAB ESCAPE
1250         //  \t
1251 
1252         else if (unit.bk && (unit.ch == 't'))
1253           {
1254             addToken (currentToken);
1255             currentToken = new RETokenChar (subIndex, '\t', false);
1256           }
1257 
1258         // ALPHANUMERIC OPERATOR
1259         //  \w
1260 
1261         else if (unit.bk && (unit.ch == 'w')
1262                  && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1263           {
1264             addToken (currentToken);
1265             currentToken =
1266               new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, false);
1267             if (insensUSASCII)
1268               currentToken.unicodeAware = false;
1269           }
1270 
1271         // NON-ALPHANUMERIC OPERATOR
1272         //  \W
1273 
1274         else if (unit.bk && (unit.ch == 'W')
1275                  && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1276           {
1277             addToken (currentToken);
1278             currentToken =
1279               new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, true);
1280             if (insensUSASCII)
1281               currentToken.unicodeAware = false;
1282           }
1283 
1284         // END OF STRING OPERATOR
1285         //  \Z, \z
1286 
1287         // FIXME: \Z and \z are different in that if the input string
1288         // ends with a line terminator, \Z matches the position before
1289         // the final terminator.  This special behavior of \Z is yet
1290         // to be implemented.
1291 
1292         else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') &&
1293                  syntax.get (RESyntax.RE_STRING_ANCHORS))
1294           {
1295             addToken (currentToken);
1296             currentToken = new RETokenEnd (subIndex, null);
1297           }
1298 
1299         // HEX CHARACTER, UNICODE CHARACTER
1300         //  \x1B, \u1234
1301 
1302         else
1303           if ((unit.bk && (unit.ch == 'x')
1304                && syntax.get (RESyntax.RE_HEX_CHAR)) || (unit.bk
1305                                                          && (unit.ch == 'u')
1306                                                          && syntax.
1307                                                          get (RESyntax.
1308                                                               RE_UNICODE_CHAR)))
1309           {
1310             CharExpression ce =
1311               getCharExpression (pattern, index - 2, pLength, syntax);
1312             if (ce == null)
1313               throw new REException ("invalid hex character",
1314                                      REException.REG_ESCAPE, index);
1315             index = index - 2 + ce.len;
1316             addToken (currentToken);
1317             currentToken = new RETokenChar (subIndex, ce.ch, insens);
1318             if (insensUSASCII)
1319               currentToken.unicodeAware = false;
1320           }
1321 
1322         // NAMED PROPERTY
1323         // \p{prop}, \P{prop}
1324 
1325         else
1326           if ((unit.bk && (unit.ch == 'p')
1327                && syntax.get (RESyntax.RE_NAMED_PROPERTY)) || (unit.bk
1328                                                                && (unit.ch ==
1329                                                                    'P')
1330                                                                && syntax.
1331                                                                get (RESyntax.
1332                                                                     RE_NAMED_PROPERTY)))
1333           {
1334             NamedProperty np = getNamedProperty (pattern, index - 2, pLength);
1335             if (np == null)
1336               throw new REException ("invalid escape sequence",
1337                                      REException.REG_ESCAPE, index);
1338             index = index - 2 + np.len;
1339             addToken (currentToken);
1340             currentToken =
1341               getRETokenNamedProperty (subIndex, np, insens, index);
1342             if (insensUSASCII)
1343               currentToken.unicodeAware = false;
1344           }
1345 
1346         // END OF PREVIOUS MATCH
1347         //  \G
1348 
1349         else if (unit.bk && (unit.ch == 'G') &&
1350                  syntax.get (RESyntax.RE_STRING_ANCHORS))
1351           {
1352             addToken (currentToken);
1353             currentToken = new RETokenEndOfPreviousMatch (subIndex);
1354           }
1355 
1356         // NON-SPECIAL CHARACTER (or escape to make literal)
1357         //  c | \* for example
1358 
1359         else
1360           {                     // not a special character
1361             addToken (currentToken);
1362             currentToken = new RETokenChar (subIndex, unit.ch, insens);
1363             if (insensUSASCII)
1364               currentToken.unicodeAware = false;
1365           }
1366       }                         // end while
1367 
1368     // Add final buffered token and an EndSub marker
1369     addToken (currentToken);
1370 
1371     if (branches != null)
1372       {
1373         branches.
1374           add (new
1375                RE (firstToken, lastToken, numSubs, subIndex, minimumLength,
1376                    maximumLength));
1377         branches.trimToSize (); // compact the Vector
1378         minimumLength = 0;
1379         maximumLength = 0;
1380         firstToken = lastToken = null;
1381         addToken (new RETokenOneOf (subIndex, branches, false));
1382       }
1383     else
1384       addToken (new RETokenEndSub (subIndex));
1385 
1386   }
1387 
1388   private static class ParseCharClassResult
1389   {
1390     RETokenOneOf token;
1391     int index;
1392     boolean returnAtAndOperator = false;
1393   }
1394 
1395   /**
1396    * Parse [...] or [^...] and make an RETokenOneOf instance.
1397    * @param subIndex subIndex to be given to the created RETokenOneOf instance.
1398    * @param pattern Input array of characters to be parsed.
1399    * @param index Index pointing to the character next to the beginning '['.
1400    * @param pLength Limit of the input array.
1401    * @param cflags Compilation flags used to parse the pattern.
1402    * @param pflags Flags that affect the behavior of this method.
1403    * @param syntax Syntax used to parse the pattern.
1404    */
parseCharClass(int subIndex, char[]pattern, int index, int pLength, int cflags, RESyntax syntax, int pflags)1405   private static ParseCharClassResult parseCharClass (int subIndex,
1406                                                       char[]pattern,
1407                                                       int index, int pLength,
1408                                                       int cflags,
1409                                                       RESyntax syntax,
1410                                                       int pflags) throws
1411     REException
1412   {
1413 
1414     boolean insens = ((cflags & REG_ICASE) > 0);
1415     boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
1416     final ArrayList < REToken > options = new ArrayList < REToken > ();
1417       ArrayList < Object > addition = new ArrayList < Object > ();
1418     boolean additionAndAppeared = false;
1419     final int RETURN_AT_AND = 0x01;
1420     boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
1421     boolean negative = false;
1422     char ch;
1423 
1424     char lastChar = 0;
1425     boolean lastCharIsSet = false;
1426     if (index == pLength)
1427       throw new REException (getLocalizedMessage ("unmatched.bracket"),
1428                              REException.REG_EBRACK, index);
1429 
1430     // Check for initial caret, negation
1431     if ((ch = pattern[index]) == '^')
1432       {
1433         negative = true;
1434         if (++index == pLength)
1435           throw new REException (getLocalizedMessage ("class.no.end"),
1436                                  REException.REG_EBRACK, index);
1437           ch = pattern[index];
1438       }
1439 
1440     // Check for leading right bracket literal
1441     if (ch == ']')
1442       {
1443         lastChar = ch;
1444         lastCharIsSet = true;
1445         if (++index == pLength)
1446           throw new REException (getLocalizedMessage ("class.no.end"),
1447                                  REException.REG_EBRACK, index);
1448       }
1449 
1450     while ((ch = pattern[index++]) != ']')
1451       {
1452         if ((ch == '-') && (lastCharIsSet))
1453           {
1454             if (index == pLength)
1455               throw new REException (getLocalizedMessage ("class.no.end"),
1456                                      REException.REG_EBRACK, index);
1457             if ((ch = pattern[index]) == ']')
1458               {
1459                 RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1460                 if (insensUSASCII)
1461                   t.unicodeAware = false;
1462                 options.add (t);
1463                 lastChar = '-';
1464               }
1465             else
1466               {
1467                 if ((ch == '\\')
1468                     && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS))
1469                   {
1470                     CharExpression ce =
1471                       getCharExpression (pattern, index, pLength, syntax);
1472                     if (ce == null)
1473                       throw new REException ("invalid escape sequence",
1474                                              REException.REG_ESCAPE, index);
1475                     ch = ce.ch;
1476                     index = index + ce.len - 1;
1477                   }
1478                 RETokenRange t =
1479                   new RETokenRange (subIndex, lastChar, ch, insens);
1480                 if (insensUSASCII)
1481                   t.unicodeAware = false;
1482                 options.add (t);
1483                 lastChar = 0;
1484                 lastCharIsSet = false;
1485                 index++;
1486               }
1487           }
1488         else if ((ch == '\\')
1489                  && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS))
1490           {
1491             if (index == pLength)
1492               throw new REException (getLocalizedMessage ("class.no.end"),
1493                                      REException.REG_EBRACK, index);
1494             int posixID = -1;
1495             boolean negate = false;
1496             char asciiEsc = 0;
1497             boolean asciiEscIsSet = false;
1498             NamedProperty np = null;
1499             if (("dswDSW".indexOf (pattern[index]) != -1)
1500                 && syntax.get (RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS))
1501               {
1502                 switch (pattern[index])
1503                   {
1504                   case 'D':
1505                     negate = true;
1506                   case 'd':
1507                     posixID = RETokenPOSIX.DIGIT;
1508                     break;
1509                   case 'S':
1510                     negate = true;
1511                   case 's':
1512                     posixID = RETokenPOSIX.SPACE;
1513                     break;
1514                   case 'W':
1515                     negate = true;
1516                   case 'w':
1517                     posixID = RETokenPOSIX.ALNUM;
1518                     break;
1519                   }
1520               }
1521             if (("pP".indexOf (pattern[index]) != -1)
1522                 && syntax.get (RESyntax.RE_NAMED_PROPERTY))
1523               {
1524                 np = getNamedProperty (pattern, index - 1, pLength);
1525                 if (np == null)
1526                   throw new REException ("invalid escape sequence",
1527                                          REException.REG_ESCAPE, index);
1528                 index = index - 1 + np.len - 1;
1529               }
1530             else
1531               {
1532                 CharExpression ce =
1533                   getCharExpression (pattern, index - 1, pLength, syntax);
1534                 if (ce == null)
1535                   throw new REException ("invalid escape sequence",
1536                                          REException.REG_ESCAPE, index);
1537                 asciiEsc = ce.ch;
1538                 asciiEscIsSet = true;
1539                 index = index - 1 + ce.len - 1;
1540               }
1541             if (lastCharIsSet)
1542               {
1543                 RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1544                 if (insensUSASCII)
1545                   t.unicodeAware = false;
1546                 options.add (t);
1547               }
1548 
1549             if (posixID != -1)
1550               {
1551                 RETokenPOSIX t =
1552                   new RETokenPOSIX (subIndex, posixID, insens, negate);
1553                 if (insensUSASCII)
1554                   t.unicodeAware = false;
1555                 options.add (t);
1556               }
1557             else if (np != null)
1558               {
1559                 RETokenNamedProperty t =
1560                   getRETokenNamedProperty (subIndex, np, insens, index);
1561                 if (insensUSASCII)
1562                   t.unicodeAware = false;
1563                 options.add (t);
1564               }
1565             else if (asciiEscIsSet)
1566               {
1567                 lastChar = asciiEsc;
1568                 lastCharIsSet = true;
1569               }
1570             else
1571               {
1572                 lastChar = pattern[index];
1573                 lastCharIsSet = true;
1574               }
1575             ++index;
1576           }
1577         else if ((ch == '[') && (syntax.get (RESyntax.RE_CHAR_CLASSES))
1578                  && (index < pLength) && (pattern[index] == ':'))
1579           {
1580             CPStringBuilder posixSet = new CPStringBuilder ();
1581             index = getPosixSet (pattern, index + 1, posixSet);
1582             int posixId = RETokenPOSIX.intValue (posixSet.toString ());
1583             if (posixId != -1)
1584               {
1585                 RETokenPOSIX t =
1586                   new RETokenPOSIX (subIndex, posixId, insens, false);
1587                 if (insensUSASCII)
1588                   t.unicodeAware = false;
1589                 options.add (t);
1590               }
1591           }
1592         else if ((ch == '[') && (syntax.get (RESyntax.RE_NESTED_CHARCLASS)))
1593           {
1594             ParseCharClassResult result =
1595               parseCharClass (subIndex, pattern, index, pLength, cflags,
1596                               syntax, 0);
1597             addition.add (result.token);
1598             addition.add ("|");
1599             index = result.index;
1600           }
1601         else if ((ch == '&') &&
1602                  (syntax.get (RESyntax.RE_NESTED_CHARCLASS)) &&
1603                  (index < pLength) && (pattern[index] == '&'))
1604           {
1605             if (returnAtAndOperator)
1606               {
1607                 ParseCharClassResult result = new ParseCharClassResult ();
1608                 options.trimToSize ();
1609                 if (additionAndAppeared)
1610                   addition.add ("&");
1611                 if (addition.size () == 0)
1612                   addition = null;
1613                 result.token = new RETokenOneOf (subIndex,
1614                                                  options, addition, negative);
1615                 result.index = index - 1;
1616                 result.returnAtAndOperator = true;
1617                 return result;
1618               }
1619             // The precedence of the operator "&&" is the lowest.
1620             // So we postpone adding "&" until other elements
1621             // are added. And we insert Boolean.FALSE at the
1622             // beginning of the list of tokens following "&&".
1623             // So, "&&[a-b][k-m]" will be stored in the Vecter
1624             // addition in this order:
1625             //     Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
1626             if (additionAndAppeared)
1627               addition.add ("&");
1628             addition.add (Boolean.FALSE);
1629             additionAndAppeared = true;
1630 
1631             // The part on which "&&" operates may be either
1632             //   (1) explicitly enclosed by []
1633             //   or
1634             //   (2) not enclosed by [] and terminated by the
1635             //       next "&&" or the end of the character list.
1636             //  Let the preceding else if block do the case (1).
1637             //  We must do something in case of (2).
1638             if ((index + 1 < pLength) && (pattern[index + 1] != '['))
1639               {
1640                 ParseCharClassResult result =
1641                   parseCharClass (subIndex, pattern, index + 1, pLength,
1642                                   cflags, syntax,
1643                                   RETURN_AT_AND);
1644                 addition.add (result.token);
1645                 addition.add ("|");
1646                 // If the method returned at the next "&&", it is OK.
1647                 // Otherwise we have eaten the mark of the end of this
1648                 // character list "]".  In this case we must give back
1649                 // the end mark.
1650                 index = (result.returnAtAndOperator ?
1651                          result.index : result.index - 1);
1652               }
1653           }
1654         else
1655           {
1656             if (lastCharIsSet)
1657               {
1658                 RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1659                 if (insensUSASCII)
1660                   t.unicodeAware = false;
1661                 options.add (t);
1662               }
1663             lastChar = ch;
1664             lastCharIsSet = true;
1665           }
1666         if (index == pLength)
1667           throw new REException (getLocalizedMessage ("class.no.end"),
1668                                  REException.REG_EBRACK, index);
1669       }                         // while in list
1670     // Out of list, index is one past ']'
1671 
1672     if (lastCharIsSet)
1673       {
1674         RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1675         if (insensUSASCII)
1676           t.unicodeAware = false;
1677         options.add (t);
1678       }
1679 
1680     ParseCharClassResult result = new ParseCharClassResult ();
1681     // Create a new RETokenOneOf
1682     options.trimToSize ();
1683     if (additionAndAppeared)
1684       addition.add ("&");
1685     if (addition.size () == 0)
1686       addition = null;
1687     result.token = new RETokenOneOf (subIndex, options, addition, negative);
1688     result.index = index;
1689     return result;
1690   }
1691 
getCharUnit(char[]input, int index, CharUnit unit, boolean quot)1692   private static int getCharUnit (char[]input, int index, CharUnit unit,
1693                                   boolean quot) throws REException
1694   {
1695     unit.ch = input[index++];
1696     unit.bk = (unit.ch == '\\'
1697                && (!quot || index >= input.length || input[index] == 'E'));
1698     if (unit.bk)
1699       if (index < input.length)
1700         unit.ch = input[index++];
1701       else
1702         throw new REException (getLocalizedMessage ("ends.with.backslash"),
1703                                REException.REG_ESCAPE, index);
1704     return index;
1705   }
1706 
parseInt(char[]input, int pos, int len, int radix)1707   private static int parseInt (char[]input, int pos, int len, int radix)
1708   {
1709     int ret = 0;
1710     for (int i = pos; i < pos + len; i++)
1711       {
1712         ret = ret * radix + Character.digit (input[i], radix);
1713       }
1714     return ret;
1715   }
1716 
1717   /**
1718    * This class represents various expressions for a character.
1719    * "a"      : 'a' itself.
1720    * "\0123"  : Octal char 0123
1721    * "\x1b"   : Hex char 0x1b
1722    * "\u1234" : Unicode char \u1234
1723    */
1724   private static class CharExpression
1725   {
1726     /** character represented by this expression */
1727     char ch;
1728     /** String expression */
1729     String expr;
1730     /** length of this expression */
1731     int len;
toString()1732     public String toString ()
1733     {
1734       return expr;
1735     }
1736   }
1737 
getCharExpression(char[]input, int pos, int lim, RESyntax syntax)1738   private static CharExpression getCharExpression (char[]input, int pos,
1739                                                    int lim, RESyntax syntax)
1740   {
1741     CharExpression ce = new CharExpression ();
1742     char c = input[pos];
1743     if (c == '\\')
1744       {
1745         if (pos + 1 >= lim)
1746           return null;
1747         c = input[pos + 1];
1748         switch (c)
1749           {
1750           case 't':
1751             ce.ch = '\t';
1752             ce.len = 2;
1753             break;
1754           case 'n':
1755             ce.ch = '\n';
1756             ce.len = 2;
1757             break;
1758           case 'r':
1759             ce.ch = '\r';
1760             ce.len = 2;
1761             break;
1762           case 'x':
1763           case 'u':
1764             if ((c == 'x' && syntax.get (RESyntax.RE_HEX_CHAR)) ||
1765                 (c == 'u' && syntax.get (RESyntax.RE_UNICODE_CHAR)))
1766               {
1767                 int l = 0;
1768                 int expectedLength = (c == 'x' ? 2 : 4);
1769                 for (int i = pos + 2; i < pos + 2 + expectedLength; i++)
1770                   {
1771                     if (i >= lim)
1772                       break;
1773                     if (!((input[i] >= '0' && input[i] <= '9') ||
1774                           (input[i] >= 'A' && input[i] <= 'F') ||
1775                           (input[i] >= 'a' && input[i] <= 'f')))
1776                       break;
1777                     l++;
1778                   }
1779                 if (l != expectedLength)
1780                   return null;
1781                 ce.ch = (char) (parseInt (input, pos + 2, l, 16));
1782                 ce.len = l + 2;
1783               }
1784             else
1785               {
1786                 ce.ch = c;
1787                 ce.len = 2;
1788               }
1789             break;
1790           case '0':
1791             if (syntax.get (RESyntax.RE_OCTAL_CHAR))
1792               {
1793                 int l = 0;
1794                 for (int i = pos + 2; i < pos + 2 + 3; i++)
1795                   {
1796                     if (i >= lim)
1797                       break;
1798                     if (input[i] < '0' || input[i] > '7')
1799                       break;
1800                     l++;
1801                   }
1802                 if (l == 3 && input[pos + 2] > '3')
1803                   l--;
1804                 if (l <= 0)
1805                   return null;
1806                 ce.ch = (char) (parseInt (input, pos + 2, l, 8));
1807                 ce.len = l + 2;
1808               }
1809             else
1810               {
1811                 ce.ch = c;
1812                 ce.len = 2;
1813               }
1814             break;
1815           default:
1816             ce.ch = c;
1817             ce.len = 2;
1818             break;
1819           }
1820       }
1821     else
1822       {
1823         ce.ch = input[pos];
1824         ce.len = 1;
1825       }
1826     ce.expr = new String (input, pos, ce.len);
1827     return ce;
1828   }
1829 
1830   /**
1831    * This class represents a substring in a pattern string expressing
1832    * a named property.
1833    * "\pA"      : Property named "A"
1834    * "\p{prop}" : Property named "prop"
1835    * "\PA"      : Property named "A" (Negated)
1836    * "\P{prop}" : Property named "prop" (Negated)
1837    */
1838   private static class NamedProperty
1839   {
1840     /** Property name */
1841     String name;
1842     /** Negated or not */
1843     boolean negate;
1844     /** length of this expression */
1845     int len;
1846   }
1847 
getNamedProperty(char[]input, int pos, int lim)1848   private static NamedProperty getNamedProperty (char[]input, int pos,
1849                                                  int lim)
1850   {
1851     NamedProperty np = new NamedProperty ();
1852     char c = input[pos];
1853     if (c == '\\')
1854       {
1855         if (++pos >= lim)
1856           return null;
1857         c = input[pos++];
1858         switch (c)
1859           {
1860           case 'p':
1861             np.negate = false;
1862             break;
1863           case 'P':
1864             np.negate = true;
1865             break;
1866           default:
1867             return null;
1868           }
1869         c = input[pos++];
1870         if (c == '{')
1871           {
1872             int p = -1;
1873             for (int i = pos; i < lim; i++)
1874               {
1875                 if (input[i] == '}')
1876                   {
1877                     p = i;
1878                     break;
1879                   }
1880               }
1881             if (p < 0)
1882               return null;
1883             int len = p - pos;
1884             np.name = new String (input, pos, len);
1885             np.len = len + 4;
1886           }
1887         else
1888           {
1889             np.name = new String (input, pos - 1, 1);
1890             np.len = 3;
1891           }
1892         return np;
1893       }
1894     else
1895       return null;
1896   }
1897 
getRETokenNamedProperty(int subIndex, NamedProperty np, boolean insens, int index)1898   private static RETokenNamedProperty getRETokenNamedProperty (int subIndex,
1899                                                                NamedProperty
1900                                                                np,
1901                                                                boolean insens,
1902                                                                int index)
1903     throws REException
1904   {
1905     try
1906     {
1907       return new RETokenNamedProperty (subIndex, np.name, insens, np.negate);
1908     }
1909     catch (REException e)
1910     {
1911       REException ree;
1912       ree = new REException (e.getMessage (), REException.REG_ESCAPE, index);
1913       ree.initCause (e);
1914       throw ree;
1915     }
1916   }
1917 
1918   /**
1919    * Checks if the regular expression matches the input in its entirety.
1920    *
1921    * @param input The input text.
1922    */
isMatch(Object input)1923   public boolean isMatch (Object input)
1924   {
1925     return isMatch (input, 0, 0);
1926   }
1927 
1928   /**
1929    * Checks if the input string, starting from index, is an exact match of
1930    * this regular expression.
1931    *
1932    * @param input The input text.
1933    * @param index The offset index at which the search should be begin.
1934    */
isMatch(Object input, int index)1935   public boolean isMatch (Object input, int index)
1936   {
1937     return isMatch (input, index, 0);
1938   }
1939 
1940 
1941   /**
1942    * Checks if the input, starting from index and using the specified
1943    * execution flags, is an exact match of this regular expression.
1944    *
1945    * @param input The input text.
1946    * @param index The offset index at which the search should be begin.
1947    * @param eflags The logical OR of any execution flags above.
1948    */
isMatch(Object input, int index, int eflags)1949   public boolean isMatch (Object input, int index, int eflags)
1950   {
1951     return isMatchImpl (makeCharIndexed (input, index), index, eflags);
1952   }
1953 
isMatchImpl(CharIndexed input, int index, int eflags)1954   private boolean isMatchImpl (CharIndexed input, int index, int eflags)
1955   {
1956     if (firstToken == null)     // Trivial case
1957       return (input.charAt (0) == CharIndexed.OUT_OF_BOUNDS);
1958     REMatch m = new REMatch (numSubs, index, eflags);
1959     if (firstToken.match (input, m))
1960       {
1961         if (m != null)
1962           {
1963             if (input.charAt (m.index) == CharIndexed.OUT_OF_BOUNDS)
1964               {
1965                 return true;
1966               }
1967           }
1968       }
1969     return false;
1970   }
1971 
1972   /**
1973    * Returns the maximum number of subexpressions in this regular expression.
1974    * If the expression contains branches, the value returned will be the
1975    * maximum subexpressions in any of the branches.
1976    */
getNumSubs()1977   public int getNumSubs ()
1978   {
1979     return numSubs;
1980   }
1981 
1982   // Overrides REToken.setUncle
setUncle(REToken uncle)1983   void setUncle (REToken uncle)
1984   {
1985     if (lastToken != null)
1986       {
1987         lastToken.setUncle (uncle);
1988       }
1989     else
1990       super.setUncle (uncle);   // to deal with empty subexpressions
1991   }
1992 
1993   // Overrides REToken.chain
1994 
chain(REToken next)1995   boolean chain (REToken next)
1996   {
1997     super.chain (next);
1998     setUncle (next);
1999     return true;
2000   }
2001 
2002   /**
2003    * Returns the minimum number of characters that could possibly
2004    * constitute a match of this regular expression.
2005    */
getMinimumLength()2006   public int getMinimumLength ()
2007   {
2008     return minimumLength;
2009   }
2010 
getMaximumLength()2011   public int getMaximumLength ()
2012   {
2013     return maximumLength;
2014   }
2015 
2016   /**
2017    * Returns an array of all matches found in the input.
2018    *
2019    * If the regular expression allows the empty string to match, it will
2020    * substitute matches at all positions except the end of the input.
2021    *
2022    * @param input The input text.
2023    * @return a non-null (but possibly zero-length) array of matches
2024    */
getAllMatches(Object input)2025   public REMatch[] getAllMatches (Object input)
2026   {
2027     return getAllMatches (input, 0, 0);
2028   }
2029 
2030   /**
2031    * Returns an array of all matches found in the input,
2032    * beginning at the specified index position.
2033    *
2034    * If the regular expression allows the empty string to match, it will
2035    * substitute matches at all positions except the end of the input.
2036    *
2037    * @param input The input text.
2038    * @param index The offset index at which the search should be begin.
2039    * @return a non-null (but possibly zero-length) array of matches
2040    */
getAllMatches(Object input, int index)2041   public REMatch[] getAllMatches (Object input, int index)
2042   {
2043     return getAllMatches (input, index, 0);
2044   }
2045 
2046   /**
2047    * Returns an array of all matches found in the input string,
2048    * beginning at the specified index position and using the specified
2049    * execution flags.
2050    *
2051    * If the regular expression allows the empty string to match, it will
2052    * substitute matches at all positions except the end of the input.
2053    *
2054    * @param input The input text.
2055    * @param index The offset index at which the search should be begin.
2056    * @param eflags The logical OR of any execution flags above.
2057    * @return a non-null (but possibly zero-length) array of matches
2058    */
getAllMatches(Object input, int index, int eflags)2059   public REMatch[] getAllMatches (Object input, int index, int eflags)
2060   {
2061     return getAllMatchesImpl (makeCharIndexed (input, index), index, eflags);
2062   }
2063 
2064   // this has been changed since 1.03 to be non-overlapping matches
getAllMatchesImpl(CharIndexed input, int index, int eflags)2065   private REMatch[] getAllMatchesImpl (CharIndexed input, int index,
2066                                        int eflags)
2067   {
2068     List < REMatch > all = new ArrayList < REMatch > ();
2069     REMatch m = null;
2070     while ((m = getMatchImpl (input, index, eflags, null)) != null)
2071       {
2072         all.add (m);
2073         index = m.getEndIndex ();
2074         if (m.end[0] == 0)
2075           {                     // handle pathological case of zero-length match
2076             index++;
2077             input.move (1);
2078           }
2079         else
2080           {
2081             input.move (m.end[0]);
2082           }
2083         if (!input.isValid ())
2084           break;
2085       }
2086     return all.toArray (new REMatch[all.size ()]);
2087   }
2088 
2089   /* Implements abstract method REToken.match() */
match(CharIndexed input, REMatch mymatch)2090   boolean match (CharIndexed input, REMatch mymatch)
2091   {
2092     input.setHitEnd (mymatch);
2093     if (firstToken == null)
2094       {
2095         return next (input, mymatch);
2096       }
2097 
2098     // Note the start of this subexpression
2099     mymatch.start1[subIndex] = mymatch.index;
2100 
2101     return firstToken.match (input, mymatch);
2102   }
2103 
findMatch(CharIndexed input, REMatch mymatch)2104   REMatch findMatch (CharIndexed input, REMatch mymatch)
2105   {
2106     if (mymatch.backtrackStack == null)
2107       mymatch.backtrackStack = new BacktrackStack ();
2108     boolean b = match (input, mymatch);
2109     if (b)
2110       {
2111         return mymatch;
2112       }
2113     return null;
2114   }
2115 
2116   /**
2117    * Returns the first match found in the input.  If no match is found,
2118    * null is returned.
2119    *
2120    * @param input The input text.
2121    * @return An REMatch instance referencing the match, or null if none.
2122    */
getMatch(Object input)2123   public REMatch getMatch (Object input)
2124   {
2125     return getMatch (input, 0, 0);
2126   }
2127 
2128   /**
2129    * Returns the first match found in the input, beginning
2130    * the search at the specified index.  If no match is found,
2131    * returns null.
2132    *
2133    * @param input The input text.
2134    * @param index The offset within the text to begin looking for a match.
2135    * @return An REMatch instance referencing the match, or null if none.
2136    */
getMatch(Object input, int index)2137   public REMatch getMatch (Object input, int index)
2138   {
2139     return getMatch (input, index, 0);
2140   }
2141 
2142   /**
2143    * Returns the first match found in the input, beginning
2144    * the search at the specified index, and using the specified
2145    * execution flags.  If no match is found, returns null.
2146    *
2147    * @param input The input text.
2148    * @param index The offset index at which the search should be begin.
2149    * @param eflags The logical OR of any execution flags above.
2150    * @return An REMatch instance referencing the match, or null if none.
2151    */
getMatch(Object input, int index, int eflags)2152   public REMatch getMatch (Object input, int index, int eflags)
2153   {
2154     return getMatch (input, index, eflags, null);
2155   }
2156 
2157   /**
2158    * Returns the first match found in the input, beginning the search
2159    * at the specified index, and using the specified execution flags.
2160    * If no match is found, returns null.  If a StringBuffer is
2161    * provided and is non-null, the contents of the input text from the
2162    * index to the beginning of the match (or to the end of the input,
2163    * if there is no match) are appended to the StringBuffer.
2164    *
2165    * @param input The input text.
2166    * @param index The offset index at which the search should be begin.
2167    * @param eflags The logical OR of any execution flags above.
2168    * @param buffer The StringBuffer to save pre-match text in.
2169    * @return An REMatch instance referencing the match, or null if none.  */
getMatch(Object input, int index, int eflags, CPStringBuilder buffer)2170   public REMatch getMatch (Object input, int index, int eflags,
2171                            CPStringBuilder buffer)
2172   {
2173     return getMatchImpl (makeCharIndexed (input, index), index, eflags,
2174                          buffer);
2175   }
2176 
getMatchImpl(CharIndexed input, int anchor, int eflags, CPStringBuilder buffer)2177   REMatch getMatchImpl (CharIndexed input, int anchor, int eflags,
2178                         CPStringBuilder buffer)
2179   {
2180     boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
2181     boolean doMove = ((eflags & REG_FIX_STARTING_POSITION) == 0);
2182     RE re = (tryEntireMatch ? (RE) this.clone () : this);
2183     if (tryEntireMatch)
2184       {
2185         RETokenEnd reEnd = new RETokenEnd (0, null);
2186         reEnd.setFake (true);
2187         re.chain (reEnd);
2188       }
2189     // Create a new REMatch to hold results
2190     REMatch mymatch = new REMatch (numSubs, anchor, eflags);
2191     do
2192       {
2193         /* The following potimization is commented out because
2194            the matching should be tried even if the length of
2195            input is obviously too short in order that
2196            java.util.regex.Matcher#hitEnd() may work correctly.
2197            // Optimization: check if anchor + minimumLength > length
2198            if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
2199          */
2200         if (re.match (input, mymatch))
2201           {
2202             REMatch best = mymatch;
2203             // We assume that the match that coms first is the best.
2204             // And the following "The longer, the better" rule has
2205             // been commented out. The longest is not neccesarily
2206             // the best. For example, "a" out of "aaa" is the best
2207             // match for /a+?/.
2208             /*
2209                // Find best match of them all to observe leftmost longest
2210                while ((mymatch = mymatch.next) != null) {
2211                if (mymatch.index > best.index) {
2212                best = mymatch;
2213                }
2214                }
2215              */
2216             best.end[0] = best.index;
2217             best.finish (input);
2218             input.setLastMatch (best);
2219             return best;
2220           }
2221         /* End of the optimization commented out
2222            }
2223          */
2224         mymatch.clear (++anchor);
2225         // Append character to buffer if needed
2226         if (buffer != null && input.charAt (0) != CharIndexed.OUT_OF_BOUNDS)
2227           {
2228             buffer.append (input.charAt (0));
2229           }
2230         // java.util.regex.Matcher#hitEnd() requires that the search should
2231         // be tried at the end of input, so we use move1(1) instead of move(1)
2232       }
2233     while (doMove && input.move1 (1));
2234 
2235     // Special handling at end of input for e.g. "$"
2236     if (minimumLength == 0)
2237       {
2238         if (match (input, mymatch))
2239           {
2240             mymatch.finish (input);
2241             return mymatch;
2242           }
2243       }
2244 
2245     return null;
2246   }
2247 
2248   /**
2249    * Returns an REMatchEnumeration that can be used to iterate over the
2250    * matches found in the input text.
2251    *
2252    * @param input The input text.
2253    * @return A non-null REMatchEnumeration instance.
2254    */
getMatchEnumeration(Object input)2255   public REMatchEnumeration getMatchEnumeration (Object input)
2256   {
2257     return getMatchEnumeration (input, 0, 0);
2258   }
2259 
2260 
2261   /**
2262    * Returns an REMatchEnumeration that can be used to iterate over the
2263    * matches found in the input text.
2264    *
2265    * @param input The input text.
2266    * @param index The offset index at which the search should be begin.
2267    * @return A non-null REMatchEnumeration instance, with its input cursor
2268    *  set to the index position specified.
2269    */
getMatchEnumeration(Object input, int index)2270   public REMatchEnumeration getMatchEnumeration (Object input, int index)
2271   {
2272     return getMatchEnumeration (input, index, 0);
2273   }
2274 
2275   /**
2276    * Returns an REMatchEnumeration that can be used to iterate over the
2277    * matches found in the input text.
2278    *
2279    * @param input The input text.
2280    * @param index The offset index at which the search should be begin.
2281    * @param eflags The logical OR of any execution flags above.
2282    * @return A non-null REMatchEnumeration instance, with its input cursor
2283    *  set to the index position specified.
2284    */
getMatchEnumeration(Object input, int index, int eflags)2285   public REMatchEnumeration getMatchEnumeration (Object input, int index,
2286                                                  int eflags)
2287   {
2288     return new REMatchEnumeration (this, makeCharIndexed (input, index),
2289                                    index, eflags);
2290   }
2291 
2292 
2293   /**
2294    * Substitutes the replacement text for the first match found in the input.
2295    *
2296    * @param input The input text.
2297    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2298    * @return A String interpolating the substituted text.
2299    * @see REMatch#substituteInto
2300    */
substitute(Object input, String replace)2301   public String substitute (Object input, String replace)
2302   {
2303     return substitute (input, replace, 0, 0);
2304   }
2305 
2306   /**
2307    * Substitutes the replacement text for the first match found in the input
2308    * beginning at the specified index position.  Specifying an index
2309    * effectively causes the regular expression engine to throw away the
2310    * specified number of characters.
2311    *
2312    * @param input The input text.
2313    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2314    * @param index The offset index at which the search should be begin.
2315    * @return A String containing the substring of the input, starting
2316    *   at the index position, and interpolating the substituted text.
2317    * @see REMatch#substituteInto
2318    */
substitute(Object input, String replace, int index)2319   public String substitute (Object input, String replace, int index)
2320   {
2321     return substitute (input, replace, index, 0);
2322   }
2323 
2324   /**
2325    * Substitutes the replacement text for the first match found in the input
2326    * string, beginning at the specified index position and using the
2327    * specified execution flags.
2328    *
2329    * @param input The input text.
2330    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2331    * @param index The offset index at which the search should be begin.
2332    * @param eflags The logical OR of any execution flags above.
2333    * @return A String containing the substring of the input, starting
2334    *   at the index position, and interpolating the substituted text.
2335    * @see REMatch#substituteInto
2336    */
substitute(Object input, String replace, int index, int eflags)2337   public String substitute (Object input, String replace, int index,
2338                             int eflags)
2339   {
2340     return substituteImpl (makeCharIndexed (input, index), replace, index,
2341                            eflags);
2342   }
2343 
substituteImpl(CharIndexed input, String replace, int index, int eflags)2344   private String substituteImpl (CharIndexed input, String replace, int index,
2345                                  int eflags)
2346   {
2347     CPStringBuilder buffer = new CPStringBuilder ();
2348     REMatch m = getMatchImpl (input, index, eflags, buffer);
2349     if (m == null)
2350       return buffer.toString ();
2351     buffer.append (getReplacement (replace, m, eflags));
2352     if (input.move (m.end[0]))
2353       {
2354         do
2355           {
2356             buffer.append (input.charAt (0));
2357           }
2358         while (input.move (1));
2359       }
2360     return buffer.toString ();
2361   }
2362 
2363   /**
2364    * Substitutes the replacement text for each non-overlapping match found
2365    * in the input text.
2366    *
2367    * @param input The input text.
2368    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2369    * @return A String interpolating the substituted text.
2370    * @see REMatch#substituteInto
2371    */
substituteAll(Object input, String replace)2372   public String substituteAll (Object input, String replace)
2373   {
2374     return substituteAll (input, replace, 0, 0);
2375   }
2376 
2377   /**
2378    * Substitutes the replacement text for each non-overlapping match found
2379    * in the input text, starting at the specified index.
2380    *
2381    * If the regular expression allows the empty string to match, it will
2382    * substitute matches at all positions except the end of the input.
2383    *
2384    * @param input The input text.
2385    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2386    * @param index The offset index at which the search should be begin.
2387    * @return A String containing the substring of the input, starting
2388    *   at the index position, and interpolating the substituted text.
2389    * @see REMatch#substituteInto
2390    */
substituteAll(Object input, String replace, int index)2391   public String substituteAll (Object input, String replace, int index)
2392   {
2393     return substituteAll (input, replace, index, 0);
2394   }
2395 
2396   /**
2397    * Substitutes the replacement text for each non-overlapping match found
2398    * in the input text, starting at the specified index and using the
2399    * specified execution flags.
2400    *
2401    * @param input The input text.
2402    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2403    * @param index The offset index at which the search should be begin.
2404    * @param eflags The logical OR of any execution flags above.
2405    * @return A String containing the substring of the input, starting
2406    *   at the index position, and interpolating the substituted text.
2407    * @see REMatch#substituteInto
2408    */
substituteAll(Object input, String replace, int index, int eflags)2409   public String substituteAll (Object input, String replace, int index,
2410                                int eflags)
2411   {
2412     return substituteAllImpl (makeCharIndexed (input, index), replace, index,
2413                               eflags);
2414   }
2415 
substituteAllImpl(CharIndexed input, String replace, int index, int eflags)2416   private String substituteAllImpl (CharIndexed input, String replace,
2417                                     int index, int eflags)
2418   {
2419     CPStringBuilder buffer = new CPStringBuilder ();
2420     REMatch m;
2421     while ((m = getMatchImpl (input, index, eflags, buffer)) != null)
2422       {
2423         buffer.append (getReplacement (replace, m, eflags));
2424         index = m.getEndIndex ();
2425         if (m.end[0] == 0)
2426           {
2427             char ch = input.charAt (0);
2428             if (ch != CharIndexed.OUT_OF_BOUNDS)
2429               buffer.append (ch);
2430             input.move (1);
2431           }
2432         else
2433           {
2434             input.move (m.end[0]);
2435           }
2436 
2437         if (!input.isValid ())
2438           break;
2439       }
2440     return buffer.toString ();
2441   }
2442 
getReplacement(String replace, REMatch m, int eflags)2443   public static String getReplacement (String replace, REMatch m, int eflags)
2444   {
2445     if ((eflags & REG_NO_INTERPOLATE) > 0)
2446       return replace;
2447     else
2448       {
2449         if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0)
2450           {
2451             CPStringBuilder sb = new CPStringBuilder ();
2452             int l = replace.length ();
2453             for (int i = 0; i < l; i++)
2454               {
2455                 char c = replace.charAt (i);
2456                 switch (c)
2457                   {
2458                   case '\\':
2459                     i++;
2460                     // Let StringIndexOutOfBoundsException be thrown.
2461                     sb.append (replace.charAt (i));
2462                     break;
2463                   case '$':
2464                     int i1 = i + 1;
2465                     while (i1 < replace.length () &&
2466                            Character.isDigit (replace.charAt (i1)))
2467                       i1++;
2468                     sb.append (m.substituteInto (replace.substring (i, i1)));
2469                     i = i1 - 1;
2470                     break;
2471                   default:
2472                     sb.append (c);
2473                   }
2474               }
2475             return sb.toString ();
2476           }
2477         else
2478           return m.substituteInto (replace);
2479       }
2480   }
2481 
2482   /* Helper function for constructor */
addToken(REToken next)2483   private void addToken (REToken next)
2484   {
2485     if (next == null)
2486       return;
2487     minimumLength += next.getMinimumLength ();
2488     int nmax = next.getMaximumLength ();
2489     if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
2490       maximumLength += nmax;
2491     else
2492       maximumLength = Integer.MAX_VALUE;
2493 
2494     if (firstToken == null)
2495       {
2496         lastToken = firstToken = next;
2497       }
2498     else
2499       {
2500         // if chain returns false, it "rejected" the token due to
2501         // an optimization, and next was combined with lastToken
2502         if (lastToken.chain (next))
2503           {
2504             lastToken = next;
2505           }
2506       }
2507   }
2508 
setRepeated(REToken current, int min, int max, int index)2509   private static REToken setRepeated (REToken current, int min, int max,
2510                                       int index) throws REException
2511   {
2512     if (current == null)
2513       throw new REException (getLocalizedMessage ("repeat.no.token"),
2514                              REException.REG_BADRPT, index);
2515       return new RETokenRepeated (current.subIndex, current, min, max);
2516   }
2517 
getPosixSet(char[]pattern, int index, CPStringBuilder buf)2518   private static int getPosixSet (char[]pattern, int index,
2519                                   CPStringBuilder buf)
2520   {
2521     // Precondition: pattern[index-1] == ':'
2522     // we will return pos of closing ']'.
2523     int i;
2524     for (i = index; i < (pattern.length - 1); i++)
2525       {
2526         if ((pattern[i] == ':') && (pattern[i + 1] == ']'))
2527           return i + 2;
2528         buf.append (pattern[i]);
2529       }
2530     return index;               // didn't match up
2531   }
2532 
getMinMax(char[]input, int index, IntPair minMax, RESyntax syntax)2533   private int getMinMax (char[]input, int index, IntPair minMax,
2534                          RESyntax syntax) throws REException
2535   {
2536     // Precondition: input[index-1] == '{', minMax != null
2537 
2538     boolean mustMatch = !syntax.get (RESyntax.RE_NO_BK_BRACES);
2539     int startIndex = index;
2540     if (index == input.length)
2541       {
2542         if (mustMatch)
2543           throw new REException (getLocalizedMessage ("unmatched.brace"),
2544                                  REException.REG_EBRACE, index);
2545         else
2546           return startIndex;
2547       }
2548 
2549     int min, max = 0;
2550     CharUnit unit = new CharUnit ();
2551     CPStringBuilder buf = new CPStringBuilder ();
2552 
2553     // Read string of digits
2554     do
2555       {
2556         index = getCharUnit (input, index, unit, false);
2557         if (Character.isDigit (unit.ch))
2558           buf.append (unit.ch);
2559       }
2560     while ((index != input.length) && Character.isDigit (unit.ch));
2561 
2562     // Check for {} tomfoolery
2563     if (buf.length () == 0)
2564       {
2565         if (mustMatch)
2566           throw new REException (getLocalizedMessage ("interval.error"),
2567                                  REException.REG_EBRACE, index);
2568         else
2569         return startIndex;
2570       }
2571 
2572     min = Integer.parseInt (buf.toString ());
2573 
2574     if ((unit.ch == '}') && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
2575       max = min;
2576     else if (index == input.length)
2577       if (mustMatch)
2578         throw new REException (getLocalizedMessage ("interval.no.end"),
2579                                REException.REG_EBRACE, index);
2580     else
2581     return startIndex;
2582     else
2583   if ((unit.ch == ',') && !unit.bk)
2584     {
2585       buf = new CPStringBuilder ();
2586       // Read string of digits
2587       while (((index =
2588                getCharUnit (input, index, unit, false)) != input.length)
2589              && Character.isDigit (unit.ch))
2590         buf.append (unit.ch);
2591 
2592       if (!
2593           ((unit.ch == '}')
2594            && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
2595         if (mustMatch)
2596           throw new REException (getLocalizedMessage ("interval.error"),
2597                                  REException.REG_EBRACE, index);
2598       else
2599       return startIndex;
2600 
2601       // This is the case of {x,}
2602       if (buf.length () == 0)
2603         max = Integer.MAX_VALUE;
2604       else
2605         max = Integer.parseInt (buf.toString ());
2606     }
2607   else if (mustMatch)
2608     throw new REException (getLocalizedMessage ("interval.error"),
2609                            REException.REG_EBRACE, index);
2610   else
2611   return startIndex;
2612 
2613   // We know min and max now, and they are valid.
2614 
2615   minMax.first = min;
2616   minMax.second = max;
2617 
2618   // return the index following the '}'
2619   return index;
2620   }
2621 
2622    /**
2623     * Return a human readable form of the compiled regular expression,
2624     * useful for debugging.
2625     */
toString()2626   public String toString ()
2627   {
2628     CPStringBuilder sb = new CPStringBuilder ();
2629     dump (sb);
2630     return sb.toString ();
2631   }
2632 
dump(CPStringBuilder os)2633   void dump (CPStringBuilder os)
2634   {
2635     os.append ("(?#startRE subIndex=" + subIndex + ")");
2636     if (subIndex == 0)
2637       os.append ("?:");
2638     if (firstToken != null)
2639       firstToken.dumpAll (os);
2640     if (subIndex == 0)
2641       os.append (")");
2642     os.append ("(?#endRE subIndex=" + subIndex + ")");
2643   }
2644 
2645   // Cast input appropriately or throw exception
2646   // This method was originally a private method, but has been made
2647   // public because java.util.regex.Matcher uses this.
makeCharIndexed(Object input, int index)2648   public static CharIndexed makeCharIndexed (Object input, int index)
2649   {
2650     // The case where input is already a CharIndexed is supposed
2651     // be the most likely because this is the case with
2652     // java.util.regex.Matcher.
2653     // We could let a String or a CharSequence fall through
2654     // to final input, but since it'a very likely input type,
2655     // we check it first.
2656     if (input instanceof CharIndexed)
2657       {
2658         CharIndexed ci = (CharIndexed) input;
2659         ci.setAnchor (index);
2660         return ci;
2661       }
2662     else if (input instanceof CharSequence)
2663       return new CharIndexedCharSequence ((CharSequence) input, index);
2664     else if (input instanceof String)
2665       return new CharIndexedString ((String) input, index);
2666     else if (input instanceof char[])
2667       return new CharIndexedCharArray ((char[]) input, index);
2668     else if (input instanceof StringBuffer)
2669       return new CharIndexedStringBuffer ((StringBuffer) input, index);
2670     else if (input instanceof InputStream)
2671       return new CharIndexedInputStream ((InputStream) input, index);
2672     else
2673       return new CharIndexedString (input.toString (), index);
2674   }
2675 }
2676