1 package org.apache.regexp;
2 
3 /*
4  * ====================================================================
5  *
6  * The Apache Software License, Version 1.1
7  *
8  * Copyright (c) 1999 The Apache Software Foundation.  All rights
9  * reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  *
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  *
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in
20  *    the documentation and/or other materials provided with the
21  *    distribution.
22  *
23  * 3. The end-user documentation included with the redistribution, if
24  *    any, must include the following acknowlegement:
25  *       "This product includes software developed by the
26  *        Apache Software Foundation (http://www.apache.org/)."
27  *    Alternately, this acknowlegement may appear in the software itself,
28  *    if and wherever such third-party acknowlegements normally appear.
29  *
30  * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
31  *    Foundation" must not be used to endorse or promote products derived
32  *    from this software without prior written permission. For written
33  *    permission, please contact apache@apache.org.
34  *
35  * 5. Products derived from this software may not be called "Apache"
36  *    nor may "Apache" appear in their names without prior written
37  *    permission of the Apache Group.
38  *
39  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
40  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
43  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
46  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
48  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
49  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  * ====================================================================
52  *
53  * This software consists of voluntary contributions made by many
54  * individuals on behalf of the Apache Software Foundation.  For more
55  * information on the Apache Software Foundation, please see
56  * <http://www.apache.org/>.
57  *
58  */
59 
60 import org.apache.regexp.RE;
61 import java.util.Hashtable;
62 
63 /**
64  * A regular expression compiler class.  This class compiles a pattern string into a
65  * regular expression program interpretable by the RE evaluator class.  The 'recompile'
66  * command line tool uses this compiler to pre-compile regular expressions for use
67  * with RE.  For a description of the syntax accepted by RECompiler and what you can
68  * do with regular expressions, see the documentation for the RE matcher class.
69  *
70  * @see RE
71  * @see recompile
72  *
73  * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
74  * @version $Id: RECompiler.java,v 1.2 2000/05/14 21:04:17 jon Exp $
75  */
76 public class RECompiler
77 {
78     // The compiled program
79     char[] instruction;                                 // The compiled RE 'program' instruction buffer
80     int lenInstruction;                                 // The amount of the program buffer currently in use
81 
82     // Input state for compiling regular expression
83     String pattern;                                     // Input string
84     int len;                                            // Length of the pattern string
85     int idx;                                            // Current input index into ac
86     int parens;                                         // Total number of paren pairs
87 
88     // Node flags
89     static final int NODE_NORMAL   = 0;                 // No flags (nothing special)
90     static final int NODE_NULLABLE = 1;                 // True if node is potentially null
91     static final int NODE_TOPLEVEL = 2;                 // True if top level expr
92 
93     // Special types of 'escapes'
94     static final char ESC_MASK     = 0xfff0;            // Escape complexity mask
95     static final char ESC_BACKREF  = 0xffff;            // Escape is really a backreference
96     static final char ESC_COMPLEX  = 0xfffe;            // Escape isn't really a true character
97     static final char ESC_CLASS    = 0xfffd;            // Escape represents a whole class of characters
98 
99     // {m,n} stacks
100     static final int maxBrackets = 10;                  // Maximum number of bracket pairs
101     static int brackets = 0;                            // Number of bracket sets
102     static int[] bracketStart = null;                   // Starting point
103     static int[] bracketEnd = null;                     // Ending point
104     static int[] bracketMin = null;                     // Minimum number of matches
105     static int[] bracketOpt = null;                     // Additional optional matches
106     static final int bracketUnbounded = -1;             // Unbounded value
107     static final int bracketFinished = -2;              // Unbounded value
108 
109     // Lookup table for POSIX character class names
110     static Hashtable hashPOSIX = new Hashtable();
111     static
112     {
113         hashPOSIX.put("alnum",     new Character(RE.POSIX_CLASS_ALNUM));
114         hashPOSIX.put("alpha",     new Character(RE.POSIX_CLASS_ALPHA));
115         hashPOSIX.put("blank",     new Character(RE.POSIX_CLASS_BLANK));
116         hashPOSIX.put("cntrl",     new Character(RE.POSIX_CLASS_CNTRL));
117         hashPOSIX.put("digit",     new Character(RE.POSIX_CLASS_DIGIT));
118         hashPOSIX.put("graph",     new Character(RE.POSIX_CLASS_GRAPH));
119         hashPOSIX.put("lower",     new Character(RE.POSIX_CLASS_LOWER));
120         hashPOSIX.put("print",     new Character(RE.POSIX_CLASS_PRINT));
121         hashPOSIX.put("punct",     new Character(RE.POSIX_CLASS_PUNCT));
122         hashPOSIX.put("space",     new Character(RE.POSIX_CLASS_SPACE));
123         hashPOSIX.put("upper",     new Character(RE.POSIX_CLASS_UPPER));
124         hashPOSIX.put("xdigit",    new Character(RE.POSIX_CLASS_XDIGIT));
125         hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART));
126         hashPOSIX.put("javapart",  new Character(RE.POSIX_CLASS_JPART));
127     }
128 
129     /**
130      * Constructor.  Creates (initially empty) storage for a regular expression program.
131      */
RECompiler()132     public RECompiler()
133     {
134         // Start off with a generous, yet reasonable, initial size
135         instruction = new char[128];
136         lenInstruction = 0;
137     }
138 
139     /**
140      * Ensures that n more characters can fit in the program buffer.
141      * If n more can't fit, then the size is doubled until it can.
142      * @param n Number of additional characters to ensure will fit.
143      */
ensure(int n)144     void ensure(int n)
145     {
146         // Get current program length
147         int curlen = instruction.length;
148 
149         // If the current length + n more is too much
150         if (lenInstruction + n >= curlen)
151         {
152             // Double the size of the program array until n more will fit
153             while (lenInstruction + n >= curlen)
154             {
155                 curlen *= 2;
156             }
157 
158             // Allocate new program array and move data into it
159             char[] newInstruction = new char[curlen];
160             System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
161             instruction = newInstruction;
162         }
163     }
164 
165     /**
166      * Emit a single character into the program stream.
167      * @param c Character to add
168      */
emit(char c)169     void emit(char c)
170     {
171         // Make room for character
172         ensure(1);
173 
174         // Add character
175         instruction[lenInstruction++] = c;
176     }
177 
178     /**
179      * Inserts a node with a given opcode and opdata at insertAt.  The node relative next
180      * pointer is initialized to 0.
181      * @param opcode Opcode for new node
182      * @param opdata Opdata for new node (only the low 16 bits are currently used)
183      * @param insertAt Index at which to insert the new node in the program
184      */
nodeInsert(char opcode, int opdata, int insertAt)185     void nodeInsert(char opcode, int opdata, int insertAt)
186     {
187         // Make room for a new node
188         ensure(RE.nodeSize);
189 
190         // Move everything from insertAt to the end down nodeSize elements
191         System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt);
192         instruction[insertAt + RE.offsetOpcode] = opcode;
193         instruction[insertAt + RE.offsetOpdata] = (char)opdata;
194         instruction[insertAt + RE.offsetNext] = 0;
195         lenInstruction += RE.nodeSize;
196     }
197 
198     /**
199      * Appends a node to the end of a node chain
200      * @param node Start of node chain to traverse
201      * @param pointTo Node to have the tail of the chain point to
202      */
setNextOfEnd(int node, int pointTo)203     void setNextOfEnd(int node, int pointTo)
204     {
205         // Traverse the chain until the next offset is 0
206         int next;
207         while ((next = instruction[node + RE.offsetNext]) != 0)
208         {
209             node += next;
210         }
211 
212         // Point the last node in the chain to pointTo.
213         instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
214     }
215 
216     /**
217      * Adds a new node
218      * @param opcode Opcode for node
219      * @param opdata Opdata for node (only the low 16 bits are currently used)
220      * @return Index of new node in program
221      */
node(char opcode, int opdata)222     int node(char opcode, int opdata)
223     {
224         // Make room for a new node
225         ensure(RE.nodeSize);
226 
227         // Add new node at end
228         instruction[lenInstruction + RE.offsetOpcode] = opcode;
229         instruction[lenInstruction + RE.offsetOpdata] = (char)opdata;
230         instruction[lenInstruction + RE.offsetNext] = 0;
231         lenInstruction += RE.nodeSize;
232 
233         // Return index of new node
234         return lenInstruction - RE.nodeSize;
235     }
236 
237 
238     /**
239      * Throws a new internal error exception
240      * @exception Error Thrown in the event of an internal error.
241      */
internalError()242     void internalError() throws Error
243     {
244         throw new Error("Internal error!");
245     }
246 
247     /**
248      * Throws a new syntax error exception
249      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
250      */
syntaxError(String s)251     void syntaxError(String s) throws RESyntaxException
252     {
253         throw new RESyntaxException(s);
254     }
255 
256     /**
257      * Allocate storage for brackets only as needed
258      */
allocBrackets()259     void allocBrackets()
260     {
261         // Allocate bracket stacks if not already done
262         if (bracketStart == null)
263         {
264             // Allocate storage
265             bracketStart = new int[maxBrackets];
266             bracketEnd   = new int[maxBrackets];
267             bracketMin   = new int[maxBrackets];
268             bracketOpt   = new int[maxBrackets];
269 
270             // Initialize to invalid values
271             for (int i = 0; i < maxBrackets; i++)
272             {
273                 bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
274             }
275         }
276     }
277 
278     /**
279      * Match bracket {m,n} expression put results in bracket member variables
280      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
281      */
bracket()282     void bracket() throws RESyntaxException
283     {
284         // Current character must be a '{'
285         if (idx >= len || pattern.charAt(idx++) != '{')
286         {
287             internalError();
288         }
289 
290         // Next char must be a digit
291         if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
292         {
293             syntaxError("Expected digit");
294         }
295 
296         // Get min ('m' of {m,n}) number
297         StringBuffer number = new StringBuffer();
298         while (idx < len && Character.isDigit(pattern.charAt(idx)))
299         {
300             number.append(pattern.charAt(idx++));
301         }
302         try
303         {
304             bracketMin[brackets] = Integer.parseInt(number.toString());
305         }
306         catch (NumberFormatException e)
307         {
308             syntaxError("Expected valid number");
309         }
310 
311         // If out of input, fail
312         if (idx >= len)
313         {
314             syntaxError("Expected comma or right bracket");
315         }
316 
317         // If end of expr, optional limit is 0
318         if (pattern.charAt(idx) == '}')
319         {
320             idx++;
321             bracketOpt[brackets] = 0;
322             return;
323         }
324 
325         // Must have at least {m,} and maybe {m,n}.
326         if (idx >= len || pattern.charAt(idx++) != ',')
327         {
328             syntaxError("Expected comma");
329         }
330 
331         // If out of input, fail
332         if (idx >= len)
333         {
334             syntaxError("Expected comma or right bracket");
335         }
336 
337         // If {m,} max is unlimited
338         if (pattern.charAt(idx) == '}')
339         {
340             idx++;
341             bracketOpt[brackets] = bracketUnbounded;
342             return;
343         }
344 
345         // Next char must be a digit
346         if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
347         {
348             syntaxError("Expected digit");
349         }
350 
351         // Get max number
352         number.setLength(0);
353         while (idx < len && Character.isDigit(pattern.charAt(idx)))
354         {
355             number.append(pattern.charAt(idx++));
356         }
357         try
358         {
359             bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets];
360         }
361         catch (NumberFormatException e)
362         {
363             syntaxError("Expected valid number");
364         }
365 
366         // Optional repetitions must be > 0
367         if (bracketOpt[brackets] <= 0)
368         {
369             syntaxError("Bad range");
370         }
371 
372         // Must have close brace
373         if (idx >= len || pattern.charAt(idx++) != '}')
374         {
375             syntaxError("Missing close brace");
376         }
377     }
378 
379     /**
380      * Match an escape sequence.  Handles quoted chars and octal escapes as well
381      * as normal escape characters.  Always advances the input stream by the
382      * right amount. This code "understands" the subtle difference between an
383      * octal escape and a backref.  You can access the type of ESC_CLASS or
384      * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
385      * @return ESC_* code or character if simple escape
386      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
387      */
escape()388     char escape() throws RESyntaxException
389     {
390         // "Shouldn't" happen
391         if (pattern.charAt(idx) != '\\')
392         {
393             internalError();
394         }
395 
396         // Escape shouldn't occur as last character in string!
397         if (idx + 1 == len)
398         {
399             syntaxError("Escape terminates string");
400         }
401 
402         // Switch on character after backslash
403         idx += 2;
404         char escapeChar = pattern.charAt(idx - 1);
405         switch (escapeChar)
406         {
407             case RE.E_BOUND:
408             case RE.E_NBOUND:
409                 return ESC_COMPLEX;
410 
411             case RE.E_ALNUM:
412             case RE.E_NALNUM:
413             case RE.E_SPACE:
414             case RE.E_NSPACE:
415             case RE.E_DIGIT:
416             case RE.E_NDIGIT:
417                 return ESC_CLASS;
418 
419             case 'u':
420             case 'x':
421                 {
422                     // Exact required hex digits for escape type
423                     int hexDigits = (escapeChar == 'u' ? 4 : 2);
424 
425                     // Parse up to hexDigits characters from input
426                     int val = 0;
427                     for ( ; idx < len && hexDigits-- > 0; idx++)
428                     {
429                         // Get char
430                         char c = pattern.charAt(idx);
431 
432                         // If it's a hexadecimal digit (0-9)
433                         if (c >= '0' && c <= '9')
434                         {
435                             // Compute new value
436                             val = (val << 4) + c - '0';
437                         }
438                         else
439                         {
440                             // If it's a hexadecimal letter (a-f)
441                             c = Character.toLowerCase(c);
442                             if (c >= 'a' && c <= 'f')
443                             {
444                                 // Compute new value
445                                 val = (val << 4) + (c - 'a') + 10;
446                             }
447                             else
448                             {
449                                 // If it's not a valid digit or hex letter, the escape must be invalid
450                                 // because hexDigits of input have not been absorbed yet.
451                                 syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
452                             }
453                         }
454                     }
455                     return (char)val;
456                 }
457 
458             case 't':
459                 return '\t';
460 
461             case 'n':
462                 return '\n';
463 
464             case 'r':
465                 return '\r';
466 
467             case 'f':
468                 return '\f';
469 
470             case '0':
471             case '1':
472             case '2':
473             case '3':
474             case '4':
475             case '5':
476             case '6':
477             case '7':
478             case '8':
479             case '9':
480 
481                 // An octal escape starts with a 0 or has two digits in a row
482                 if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0')
483                 {
484                     // Handle \nnn octal escapes
485                     int val = escapeChar - '0';
486                     if (idx < len && Character.isDigit(pattern.charAt(idx)))
487                     {
488                         val = ((val << 3) + (pattern.charAt(idx++) - '0'));
489                         if (idx < len && Character.isDigit(pattern.charAt(idx)))
490                         {
491                             val = ((val << 3) + (pattern.charAt(idx++) - '0'));
492                         }
493                     }
494                     return (char)val;
495                 }
496 
497                 // It's actually a backreference (\[1-9]), not an escape
498                 return ESC_BACKREF;
499 
500             default:
501 
502                 // Simple quoting of a character
503                 return escapeChar;
504         }
505     }
506 
507     /**
508      * Compile a character class
509      * @return Index of class node
510      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
511      */
characterClass()512     int characterClass() throws RESyntaxException
513     {
514         // Check for bad calling or empty class
515         if (pattern.charAt(idx) != '[')
516         {
517             internalError();
518         }
519 
520         // Check for unterminated or empty class
521         if ((idx + 1) >= len || pattern.charAt(++idx) == ']')
522         {
523             syntaxError("Empty or unterminated class");
524         }
525 
526         // Check for POSIX character class
527         if (idx < len && pattern.charAt(idx) == ':')
528         {
529             // Skip colon
530             idx++;
531 
532             // POSIX character classes are denoted with lowercase ASCII strings
533             int idxStart = idx;
534             while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z')
535             {
536                 idx++;
537             }
538 
539             // Should be a ":]" to terminate the POSIX character class
540             if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']')
541             {
542                 // Get character class
543                 String charClass = pattern.substring(idxStart, idx);
544 
545                 // Select the POSIX class id
546                 Character i = (Character)hashPOSIX.get(charClass);
547                 if (i != null)
548                 {
549                     // Move past colon and right bracket
550                     idx += 2;
551 
552                     // Return new POSIX character class node
553                     return node(RE.OP_POSIXCLASS, i.charValue());
554                 }
555                 syntaxError("Invalid POSIX character class '" + charClass + "'");
556             }
557             syntaxError("Invalid POSIX character class syntax");
558         }
559 
560         // Try to build a class.  Create OP_ANYOF node
561         int ret = node(RE.OP_ANYOF, 0);
562 
563         // Parse class declaration
564         char CHAR_INVALID = Character.MAX_VALUE;
565         char last = CHAR_INVALID;
566         char simpleChar = 0;
567         boolean include = true;
568         boolean definingRange = false;
569         int idxFirst = idx;
570         char rangeStart = Character.MIN_VALUE;
571         char rangeEnd;
572         RERange range = new RERange();
573         while (idx < len && pattern.charAt(idx) != ']')
574         {
575 
576             switchOnCharacter:
577 
578             // Switch on character
579             switch (pattern.charAt(idx))
580             {
581                 case '^':
582                     include = !include;
583                     if (idx == idxFirst)
584                     {
585                         range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
586                     }
587                     idx++;
588                     continue;
589 
590                 case '\\':
591                 {
592                     // Escape always advances the stream
593                     char c;
594                     switch (c = escape ())
595                     {
596                         case ESC_COMPLEX:
597                         case ESC_BACKREF:
598 
599                             // Word boundaries and backrefs not allowed in a character class!
600                             syntaxError("Bad character class");
601 
602                         case ESC_CLASS:
603 
604                             // Classes can't be an endpoint of a range
605                             if (definingRange)
606                             {
607                                 syntaxError("Bad character class");
608                             }
609 
610                             // Handle specific type of class (some are ok)
611                             switch (pattern.charAt(idx - 1))
612                             {
613                                 case RE.E_NSPACE:
614                                 case RE.E_NDIGIT:
615                                 case RE.E_NALNUM:
616                                     syntaxError("Bad character class");
617 
618                                 case RE.E_SPACE:
619                                     range.include('\t', include);
620                                     range.include('\r', include);
621                                     range.include('\f', include);
622                                     range.include('\n', include);
623                                     range.include('\b', include);
624                                     range.include(' ', include);
625                                     break;
626 
627                                 case RE.E_ALNUM:
628                                     range.include('a', 'z', include);
629                                     range.include('A', 'Z', include);
630                                     range.include('_', include);
631 
632                                     // Fall through!
633 
634                                 case RE.E_DIGIT:
635                                     range.include('0', '9', include);
636                                     break;
637                             }
638 
639                             // Make last char invalid (can't be a range start)
640                             last = CHAR_INVALID;
641                             break;
642 
643                         default:
644 
645                             // Escape is simple so treat as a simple char
646                             simpleChar = c;
647                             break switchOnCharacter;
648                     }
649                 }
650                 continue;
651 
652                 case '-':
653 
654                     // Start a range if one isn't already started
655                     if (definingRange)
656                     {
657                         syntaxError("Bad class range");
658                     }
659                     definingRange = true;
660 
661                     // If no last character, start of range is 0
662                     rangeStart = (last == CHAR_INVALID ? 0 : last);
663 
664                     // Premature end of range. define up to Character.MAX_VALUE
665                     if ((idx + 1) < len && pattern.charAt(++idx) == ']')
666                     {
667                         simpleChar = Character.MAX_VALUE;
668                         break;
669                     }
670                     continue;
671 
672                 default:
673                     simpleChar = pattern.charAt(idx++);
674                     break;
675             }
676 
677             // Handle simple character simpleChar
678             if (definingRange)
679             {
680                 // if we are defining a range make it now
681                 rangeEnd = simpleChar;
682 
683                 // Actually create a range if the range is ok
684                 if (rangeStart >= rangeEnd)
685                 {
686                     syntaxError("Bad character class");
687                 }
688                 range.include(rangeStart, rangeEnd, include);
689 
690                 // We are done defining the range
691                 last = CHAR_INVALID;
692                 definingRange = false;
693             }
694             else
695             {
696                 // If simple character and not start of range, include it
697                 if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-')
698                 {
699                     range.include(simpleChar, include);
700                 }
701                 last = simpleChar;
702             }
703         }
704 
705         // Shouldn't be out of input
706         if (idx == len)
707         {
708             syntaxError("Unterminated character class");
709         }
710 
711         // Absorb the ']' end of class marker
712         idx++;
713 
714         // Emit character class definition
715         instruction[ret + RE.offsetOpdata] = (char)range.num;
716         for (int i = 0; i < range.num; i++)
717         {
718             emit((char)range.minRange[i]);
719             emit((char)range.maxRange[i]);
720         }
721         return ret;
722     }
723 
724     /**
725      * Absorb an atomic character string.  This method is a little tricky because
726      * it can un-include the last character of string if a closure operator follows.
727      * This is correct because *+? have higher precedence than concatentation (thus
728      * ABC* means AB(C*) and NOT (ABC)*).
729      * @return Index of new atom node
730      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
731      */
atom()732     int atom() throws RESyntaxException
733     {
734         // Create a string node
735         int ret = node(RE.OP_ATOM, 0);
736 
737         // Length of atom
738         int lenAtom = 0;
739 
740         // Loop while we've got input
741 
742         atomLoop:
743 
744         while (idx < len)
745         {
746             // Is there a next char?
747             if ((idx + 1) < len)
748             {
749                 char c = pattern.charAt(idx + 1);
750 
751                 // If the next 'char' is an escape, look past the whole escape
752                 if (pattern.charAt(idx) == '\\')
753                 {
754                     int idxEscape = idx;
755                     escape();
756                     if (idx < len)
757                     {
758                         c = pattern.charAt(idx);
759                     }
760                     idx = idxEscape;
761                 }
762 
763                 // Switch on next char
764                 switch (c)
765                 {
766                     case '{':
767                     case '?':
768                     case '*':
769                     case '+':
770 
771                         // If the next character is a closure operator and our atom is non-empty, the
772                         // current character should bind to the closure operator rather than the atom
773                         if (lenAtom != 0)
774                         {
775                             break atomLoop;
776                         }
777                 }
778             }
779 
780             // Switch on current char
781             switch (pattern.charAt(idx))
782             {
783                 case ']':
784                 case '^':
785                 case '$':
786                 case '.':
787                 case '[':
788                 case '(':
789                 case ')':
790                 case '|':
791                     break atomLoop;
792 
793                 case '{':
794                 case '?':
795                 case '*':
796                 case '+':
797 
798                     // We should have an atom by now
799                     if (lenAtom == 0)
800                     {
801                         // No atom before closure
802                         syntaxError("Missing operand to closure");
803                     }
804                     break atomLoop;
805 
806                 case '\\':
807 
808                     {
809                         // Get the escaped character (advances input automatically)
810                         int idxBeforeEscape = idx;
811                         char c = escape();
812 
813                         // Check if it's a simple escape (as opposed to, say, a backreference)
814                         if ((c & ESC_MASK) == ESC_MASK)
815                         {
816                             // Not a simple escape, so backup to where we were before the escape.
817                             idx = idxBeforeEscape;
818                             break atomLoop;
819                         }
820 
821                         // Add escaped char to atom
822                         emit(c);
823                         lenAtom++;
824                     }
825                     break;
826 
827                 default:
828 
829                     // Add normal character to atom
830                     emit(pattern.charAt(idx++));
831                     lenAtom++;
832                     break;
833             }
834         }
835 
836         // This "shouldn't" happen
837         if (lenAtom == 0)
838         {
839             internalError();
840         }
841 
842         // Emit the atom length into the program
843         instruction[ret + RE.offsetOpdata] = (char)lenAtom;
844         return ret;
845     }
846 
847     /**
848      * Match a terminal node.
849      * @param flags Flags
850      * @return Index of terminal node (closeable)
851      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
852      */
terminal(int[] flags)853     int terminal(int[] flags) throws RESyntaxException
854     {
855         switch (pattern.charAt(idx))
856         {
857         case RE.OP_EOL:
858         case RE.OP_BOL:
859         case RE.OP_ANY:
860             return node(pattern.charAt(idx++), 0);
861 
862         case '[':
863             return characterClass();
864 
865         case '(':
866             return expr(flags);
867 
868         case ')':
869             syntaxError("Unexpected close paren");
870 
871         case '|':
872             internalError();
873 
874         case ']':
875             syntaxError("Mismatched class");
876 
877         case 0:
878             syntaxError("Unexpected end of input");
879 
880         case '?':
881         case '+':
882         case '{':
883         case '*':
884             syntaxError("Missing operand to closure");
885 
886         case '\\':
887             {
888                 // Don't forget, escape() advances the input stream!
889                 int idxBeforeEscape = idx;
890 
891                 // Switch on escaped character
892                 switch (escape())
893                 {
894                     case ESC_CLASS:
895                     case ESC_COMPLEX:
896                         flags[0] &= ~NODE_NULLABLE;
897                         return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
898 
899                     case ESC_BACKREF:
900                         {
901                             char backreference = (char)(pattern.charAt(idx - 1) - '0');
902                             if (parens <= backreference)
903                             {
904                                 syntaxError("Bad backreference");
905                             }
906                             flags[0] |= NODE_NULLABLE;
907                             return node(RE.OP_BACKREF, backreference);
908                         }
909 
910                     default:
911 
912                         // We had a simple escape and we want to have it end up in
913                         // an atom, so we back up and fall though to the default handling
914                         idx = idxBeforeEscape;
915                         flags[0] &= ~NODE_NULLABLE;
916                         break;
917                 }
918             }
919         }
920 
921         // Everything above either fails or returns.
922         // If it wasn't one of the above, it must be the start of an atom.
923         flags[0] &= ~NODE_NULLABLE;
924         return atom();
925     }
926 
927     /**
928      * Compile a possibly closured terminal
929      * @param flags Flags passed by reference
930      * @return Index of closured node
931      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
932      */
closure(int[] flags)933     int closure(int[] flags) throws RESyntaxException
934     {
935         // Before terminal
936         int idxBeforeTerminal = idx;
937 
938         // Values to pass by reference to terminal()
939         int[] terminalFlags = { NODE_NORMAL };
940 
941         // Get terminal symbol
942         int ret = terminal(terminalFlags);
943 
944         // Or in flags from terminal symbol
945         flags[0] |= terminalFlags[0];
946 
947         // Advance input, set NODE_NULLABLE flag and do sanity checks
948         if (idx >= len)
949         {
950             return ret;
951         }
952         boolean greedy = true;
953         char closureType = pattern.charAt(idx);
954         switch (closureType)
955         {
956             case '?':
957             case '*':
958 
959                 // The current node can be null
960                 flags[0] |= NODE_NULLABLE;
961 
962             case '+':
963 
964                 // Eat closure character
965                 idx++;
966 
967             case '{':
968 
969                 // Don't allow blantant stupidity
970                 int opcode = instruction[ret + RE.offsetOpcode];
971                 if (opcode == RE.OP_BOL || opcode == RE.OP_EOL)
972                 {
973                     syntaxError("Bad closure operand");
974                 }
975                 if ((terminalFlags[0] & NODE_NULLABLE) != 0)
976                 {
977                     syntaxError("Closure operand can't be nullable");
978                 }
979                 break;
980         }
981 
982         // If the next character is a '?', make the closure non-greedy (reluctant)
983         if (idx < len && pattern.charAt(idx) == '?')
984         {
985             idx++;
986             greedy = false;
987         }
988 
989         if (greedy)
990         {
991             // Actually do the closure now
992             switch (closureType)
993             {
994                 case '{':
995                 {
996                     // We look for our bracket in the list
997                     boolean found = false;
998                     int i;
999                     allocBrackets();
1000                     for (i = 0; i < brackets; i++)
1001                     {
1002                         if (bracketStart[i] == idx)
1003                         {
1004                             found = true;
1005                             break;
1006                         }
1007                     }
1008 
1009                     // If its not in the list we parse the {m,n}
1010                     if (!found)
1011                     {
1012                         if (brackets >= maxBrackets)
1013                         {
1014                             syntaxError("Too many bracketed closures (limit is 10)");
1015                         }
1016                         bracketStart[brackets] = idx;
1017                         bracket();
1018                         bracketEnd[brackets] = idx;
1019                         i = brackets++;
1020                     }
1021 
1022                     // If there's a min, rewind stream and reparse
1023                     if (--bracketMin[i] > 0)
1024                     {
1025                         // Rewind stream and run it through again
1026                         idx = idxBeforeTerminal;
1027                         break;
1028                     }
1029 
1030                     // Do the right thing for maximum ({m,})
1031                     if (bracketOpt[i] == bracketFinished)
1032                     {
1033                         // Drop through now and closure expression.
1034                         // We are done with the {m,} expr, so skip rest
1035                         closureType = '*';
1036                         bracketOpt[i] = 0;
1037                         idx = bracketEnd[i];
1038                     }
1039                     else
1040                         if (bracketOpt[i] == bracketUnbounded)
1041                         {
1042                             idx = idxBeforeTerminal;
1043                             bracketOpt[i] = bracketFinished;
1044                             break;
1045                         }
1046                         else
1047                             if (bracketOpt[i]-- > 0)
1048                             {
1049                                 // Drop through to optionally close and then 'play it again sam!'
1050                                 idx = idxBeforeTerminal;
1051                                 closureType = '?';
1052                             }
1053                             else
1054                             {
1055                                 // We are done. skip the rest of {m,n} expr
1056                                 idx = bracketEnd[i];
1057                                 break;
1058                             }
1059                 }
1060 
1061                 // Fall through!
1062 
1063                 case '?':
1064                 case '*':
1065 
1066                     if (!greedy)
1067                     {
1068                         break;
1069                     }
1070 
1071                     if (closureType == '?')
1072                     {
1073                         // X? is compiled as (X|)
1074                         nodeInsert(RE.OP_BRANCH, 0, ret);                 // branch before X
1075                         setNextOfEnd(ret, node (RE.OP_BRANCH, 0));        // inserted branch to option
1076                         int nothing = node (RE.OP_NOTHING, 0);            // which is OP_NOTHING
1077                         setNextOfEnd(ret, nothing);                       // point (second) branch to OP_NOTHING
1078                         setNextOfEnd(ret + RE.nodeSize, nothing);         // point the end of X to OP_NOTHING node
1079                     }
1080 
1081                     if (closureType == '*')
1082                     {
1083                         // X* is compiled as (X{gotoX}|)
1084                         nodeInsert(RE.OP_BRANCH, 0, ret);                         // branch before X
1085                         setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0));   // end of X points to an option
1086                         setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0));     // to goto
1087                         setNextOfEnd(ret + RE.nodeSize, ret);                     // the start again
1088                         setNextOfEnd(ret, node(RE.OP_BRANCH, 0));                 // the other option is
1089                         setNextOfEnd(ret, node(RE.OP_NOTHING, 0));                // OP_NOTHING
1090                     }
1091                     break;
1092 
1093                 case '+':
1094                 {
1095                     // X+ is compiled as X({gotoX}|)
1096                     int branch;
1097                     branch = node(RE.OP_BRANCH, 0);                   // a new branch
1098                     setNextOfEnd(ret, branch);                        // is added to the end of X
1099                     setNextOfEnd(node(RE.OP_GOTO, 0), ret);           // one option is to go back to the start
1100                     setNextOfEnd(branch, node(RE.OP_BRANCH, 0));      // the other option
1101                     setNextOfEnd(ret, node(RE.OP_NOTHING, 0));        // is OP_NOTHING
1102                 }
1103                 break;
1104             }
1105         }
1106         else
1107         {
1108             // Add end after closured subexpr
1109             setNextOfEnd(ret, node(RE.OP_END, 0));
1110 
1111             // Actually do the closure now
1112             switch (closureType)
1113             {
1114                 case '?':
1115                     nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
1116                     break;
1117 
1118                 case '*':
1119                     nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
1120                     break;
1121 
1122                 case '+':
1123                     nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
1124                     break;
1125             }
1126 
1127             // Point to the expr after the closure
1128             setNextOfEnd(ret, lenInstruction);
1129         }
1130         return ret;
1131     }
1132 
1133     /**
1134      * Compile one branch of an or operator (implements concatenation)
1135      * @param flags Flags passed by reference
1136      * @return Pointer to branch node
1137      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1138      */
branch(int[] flags)1139     int branch(int[] flags) throws RESyntaxException
1140     {
1141         // Get each possibly closured piece and concat
1142         int node;
1143         int ret = node(RE.OP_BRANCH, 0);
1144         int chain = -1;
1145         int[] closureFlags = new int[1];
1146         boolean nullable = true;
1147         while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')')
1148         {
1149             // Get new node
1150             closureFlags[0] = NODE_NORMAL;
1151             node = closure(closureFlags);
1152             if (closureFlags[0] == NODE_NORMAL)
1153             {
1154                 nullable = false;
1155             }
1156 
1157             // If there's a chain, append to the end
1158             if (chain != -1)
1159             {
1160                 setNextOfEnd(chain, node);
1161             }
1162 
1163             // Chain starts at current
1164             chain = node;
1165         }
1166 
1167         // If we don't run loop, make a nothing node
1168         if (chain == -1)
1169         {
1170             node(RE.OP_NOTHING, 0);
1171         }
1172 
1173         // Set nullable flag for this branch
1174         if (nullable)
1175         {
1176             flags[0] |= NODE_NULLABLE;
1177         }
1178         return ret;
1179     }
1180 
1181     /**
1182      * Compile an expression with possible parens around it.  Paren matching
1183      * is done at this level so we can tie the branch tails together.
1184      * @param flags Flag value passed by reference
1185      * @return Node index of expression in instruction array
1186      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1187      */
expr(int[] flags)1188     int expr(int[] flags) throws RESyntaxException
1189     {
1190         // Create open paren node unless we were called from the top level (which has no parens)
1191         boolean paren = false;
1192         int ret = -1;
1193         int closeParens = parens;
1194         if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(')
1195         {
1196             idx++;
1197             paren = true;
1198             ret = node(RE.OP_OPEN, parens++);
1199         }
1200         flags[0] &= ~NODE_TOPLEVEL;
1201 
1202         // Create a branch node
1203         int branch = branch(flags);
1204         if (ret == -1)
1205         {
1206             ret = branch;
1207         }
1208         else
1209         {
1210             setNextOfEnd(ret, branch);
1211         }
1212 
1213         // Loop through branches
1214         while (idx < len && pattern.charAt(idx) == '|')
1215         {
1216             idx++;
1217             branch = branch(flags);
1218             setNextOfEnd(ret, branch);
1219         }
1220 
1221         // Create an ending node (either a close paren or an OP_END)
1222         int end;
1223         if (paren)
1224         {
1225             if (idx < len && pattern.charAt(idx) == ')')
1226             {
1227                 idx++;
1228             }
1229             else
1230             {
1231                 syntaxError("Missing close paren");
1232             }
1233             end = node(RE.OP_CLOSE, closeParens);
1234         }
1235         else
1236         {
1237             end = node(RE.OP_END, 0);
1238         }
1239 
1240         // Append the ending node to the ret nodelist
1241         setNextOfEnd(ret, end);
1242 
1243         // Hook the ends of each branch to the end node
1244         for (int next = -1, i = ret; next != 0; next = instruction[i + RE.offsetNext], i += next)
1245         {
1246             // If branch, make the end of the branch's operand chain point to the end node.
1247             if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH)
1248             {
1249                 setNextOfEnd(i + RE.nodeSize, end);
1250             }
1251         }
1252 
1253         // Return the node list
1254         return ret;
1255     }
1256 
1257     /**
1258      * Compiles a regular expression pattern into a program runnable by the pattern
1259      * matcher class 'RE'.
1260      * @param pattern Regular expression pattern to compile (see RECompiler class
1261      * for details).
1262      * @return A compiled regular expression program.
1263      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1264      * @see RECompiler
1265      * @see RE
1266      */
compile(String pattern)1267     public REProgram compile(String pattern) throws RESyntaxException
1268     {
1269         // Initialize variables for compilation
1270         this.pattern = pattern;                         // Save pattern in instance variable
1271         len = pattern.length();                         // Precompute pattern length for speed
1272         idx = 0;                                        // Set parsing index to the first character
1273         lenInstruction = 0;                             // Set emitted instruction count to zero
1274         parens = 1;                                     // Set paren level to 1 (the implicit outer parens)
1275         brackets = 0;                                   // No bracketed closures yet
1276 
1277         // Initialize pass by reference flags value
1278         int[] flags = { NODE_TOPLEVEL };
1279 
1280         // Parse expression
1281         expr(flags);
1282 
1283         // Should be at end of input
1284         if (idx != len)
1285         {
1286             if (pattern.charAt(idx) == ')')
1287             {
1288                 syntaxError("Unmatched close paren");
1289             }
1290             syntaxError("Unexpected input remains");
1291         }
1292 
1293         // Return the result
1294         char[] ins = new char[lenInstruction];
1295         System.arraycopy(instruction, 0, ins, 0, lenInstruction);
1296         return new REProgram(ins);
1297     }
1298 
1299     /**
1300      * Local, nested class for maintaining character ranges for character classes.
1301      */
1302     class RERange
1303     {
1304         int size = 16;                      // Capacity of current range arrays
1305         int[] minRange = new int[size];     // Range minima
1306         int[] maxRange = new int[size];     // Range maxima
1307         int num = 0;                        // Number of range array elements in use
1308 
1309         /**
1310          * Deletes the range at a given index from the range lists
1311          * @param index Index of range to delete from minRange and maxRange arrays.
1312          */
delete(int index)1313         void delete(int index)
1314         {
1315             // Return if no elements left or index is out of range
1316             if (num == 0 || index >= num)
1317             {
1318                 return;
1319             }
1320 
1321             // Move elements down
1322             while (index++ < num)
1323             {
1324                 if (index - 1 >= 0)
1325                 {
1326                     minRange[index-1] = minRange[index];
1327                     maxRange[index-1] = maxRange[index];
1328                 }
1329             }
1330 
1331             // One less element now
1332             num--;
1333         }
1334 
1335         /**
1336          * Merges a range into the range list, coalescing ranges if possible.
1337          * @param min Minimum end of range
1338          * @param max Maximum end of range
1339          */
merge(int min, int max)1340         void merge(int min, int max)
1341         {
1342             // Loop through ranges
1343             for (int i = 0; i < num; i++)
1344             {
1345                 // Min-max is subsumed by minRange[i]-maxRange[i]
1346                 if (min >= minRange[i] && max <= maxRange[i])
1347                 {
1348                     return;
1349                 }
1350 
1351                 // Min-max subsumes minRange[i]-maxRange[i]
1352                 else if (min <= minRange[i] && max >= maxRange[i])
1353                 {
1354                     delete(i);
1355                     merge(min, max);
1356                     return;
1357                 }
1358 
1359                 // Min is in the range, but max is outside
1360                 else if (min >= minRange[i] && min <= maxRange[i])
1361                 {
1362                     delete(i);
1363                     min = minRange[i];
1364                     merge(min, max);
1365                     return;
1366                 }
1367 
1368                 // Max is in the range, but min is outside
1369                 else if (max >= minRange[i] && max <= maxRange[i])
1370                 {
1371                     delete(i);
1372                     max = maxRange[i];
1373                     merge(min, max);
1374                     return;
1375                 }
1376             }
1377 
1378             // Must not overlap any other ranges
1379             if (num >= size)
1380             {
1381                 size *= 2;
1382                 int[] newMin = new int[size];
1383                 int[] newMax = new int[size];
1384                 System.arraycopy(minRange, 0, newMin, 0, num);
1385                 System.arraycopy(maxRange, 0, newMax, 0, num);
1386                 minRange = newMin;
1387                 maxRange = newMax;
1388             }
1389             minRange[num] = min;
1390             maxRange[num] = max;
1391             num++;
1392         }
1393 
1394         /**
1395          * Removes a range by deleting or shrinking all other ranges
1396          * @param min Minimum end of range
1397          * @param max Maximum end of range
1398          */
remove(int min, int max)1399         void remove(int min, int max)
1400         {
1401             // Loop through ranges
1402             for (int i = 0; i < num; i++)
1403             {
1404                 // minRange[i]-maxRange[i] is subsumed by min-max
1405                 if (minRange[i] >= min && maxRange[i] <= max)
1406                 {
1407                     delete(i);
1408                     i--;
1409                     return;
1410                 }
1411 
1412                 // min-max is subsumed by minRange[i]-maxRange[i]
1413                 else if (min >= minRange[i] && max <= maxRange[i])
1414                 {
1415                     int minr = minRange[i];
1416                     int maxr = maxRange[i];
1417                     delete(i);
1418                     if (minr < min - 1)
1419                     {
1420                         merge(minr, min - 1);
1421                     }
1422                     if (max + 1 < maxr)
1423                     {
1424                         merge(max + 1, maxr);
1425                     }
1426                     return;
1427                 }
1428 
1429                 // minRange is in the range, but maxRange is outside
1430                 else if (minRange[i] >= min && minRange[i] <= max)
1431                 {
1432                     minRange[i] = max + 1;
1433                     return;
1434                 }
1435 
1436                 // maxRange is in the range, but minRange is outside
1437                 else if (maxRange[i] >= min && maxRange[i] <= max)
1438                 {
1439                     maxRange[i] = min - 1;
1440                     return;
1441                 }
1442             }
1443         }
1444 
1445         /**
1446          * Includes (or excludes) the range from min to max, inclusive.
1447          * @param min Minimum end of range
1448          * @param max Maximum end of range
1449          * @param include True if range should be included.  False otherwise.
1450          */
include(int min, int max, boolean include)1451         void include(int min, int max, boolean include)
1452         {
1453             if (include)
1454             {
1455                 merge(min, max);
1456             }
1457             else
1458             {
1459                 remove(min, max);
1460             }
1461         }
1462 
1463         /**
1464          * Includes a range with the same min and max
1465          * @param minmax Minimum and maximum end of range (inclusive)
1466          * @param include True if range should be included.  False otherwise.
1467          */
include(char minmax, boolean include)1468         void include(char minmax, boolean include)
1469         {
1470             include(minmax, minmax, include);
1471         }
1472     }
1473 }
1474