1 /*
2  * Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved.
3  */
4 /*
5  * Licensed to the Apache Software Foundation (ASF) under one or more
6  * contributor license agreements.  See the NOTICE file distributed with
7  * this work for additional information regarding copyright ownership.
8  * The ASF licenses this file to You under the Apache License, Version 2.0
9  * (the "License"); you may not use this file except in compliance with
10  * the License.  You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */
20 
21 package com.sun.org.apache.xerces.internal.impl.xpath.regex;
22 
23 import java.util.ArrayList;
24 import java.util.Locale;
25 import java.util.MissingResourceException;
26 import java.util.ResourceBundle;
27 import jdk.xml.internal.SecuritySupport;
28 
29 /**
30  * A Regular Expression Parser.
31  *
32  * @xerces.internal
33  *
34  * @LastModified: Sep 2017
35  */
36 class RegexParser {
37     static final int T_CHAR = 0;
38     static final int T_EOF = 1;
39     static final int T_OR = 2;                  // '|'
40     static final int T_STAR = 3;                // '*'
41     static final int T_PLUS = 4;                // '+'
42     static final int T_QUESTION = 5;            // '?'
43     static final int T_LPAREN = 6;              // '('
44     static final int T_RPAREN = 7;              // ')'
45     static final int T_DOT = 8;                 // '.'
46     static final int T_LBRACKET = 9;            // '['
47     static final int T_BACKSOLIDUS = 10;        // '\'
48     static final int T_CARET = 11;              // '^'
49     static final int T_DOLLAR = 12;             // '$'
50     static final int T_LPAREN2 = 13;            // '(?:'
51     static final int T_LOOKAHEAD = 14;          // '(?='
52     static final int T_NEGATIVELOOKAHEAD = 15;  // '(?!'
53     static final int T_LOOKBEHIND = 16;         // '(?<='
54     static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
55     static final int T_INDEPENDENT = 18;        // '(?>'
56     static final int T_SET_OPERATIONS = 19;     // '(?['
57     static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
58     static final int T_COMMENT = 21;            // '(?#'
59     static final int T_MODIFIERS = 22;          // '(?' [\-,a-z,A-Z]
60     static final int T_CONDITION = 23;          // '(?('
61     static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
62 
63     static class ReferencePosition {
64         int refNumber;
65         int position;
ReferencePosition(int n, int pos)66         ReferencePosition(int n, int pos) {
67             this.refNumber = n;
68             this.position = pos;
69         }
70     }
71 
72     int offset;
73     String regex;
74     int regexlen;
75     int options;
76     ResourceBundle resources;
77     int chardata;
78     int nexttoken;
79     static protected final int S_NORMAL = 0;
80     static protected final int S_INBRACKETS = 1;
81     static protected final int S_INXBRACKETS = 2;
82     int context = S_NORMAL;
83     int parenOpened = 1;
84     int parennumber = 1;
85     boolean hasBackReferences;
86     ArrayList<ReferencePosition> references = null;
87 
RegexParser()88     public RegexParser() {
89         this.setLocale(Locale.getDefault());
90     }
RegexParser(Locale locale)91     public RegexParser(Locale locale) {
92         this.setLocale(locale);
93     }
94 
setLocale(Locale locale)95     public void setLocale(Locale locale) {
96         try {
97             if (locale != null) {
98                 this.resources = SecuritySupport.getResourceBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message", locale);
99             }
100             else {
101                 this.resources = SecuritySupport.getResourceBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message");
102             }
103         }
104         catch (MissingResourceException mre) {
105             throw new RuntimeException("Installation Problem???  Couldn't load messages: "
106                                        + mre.getMessage());
107         }
108     }
109 
ex(String key, int loc)110     final ParseException ex(String key, int loc) {
111         return new ParseException(this.resources.getString(key), loc);
112     }
113 
isSet(int flag)114     protected final boolean isSet(int flag) {
115         return (this.options & flag) == flag;
116     }
117 
parse(String regex, int options)118     Token parse(String regex, int options) throws ParseException {
119         this.options = options;
120         this.offset = 0;
121         this.setContext(S_NORMAL);
122         this.parennumber = 1;
123         this.parenOpened = 1;
124         this.hasBackReferences = false;
125         this.regex = regex;
126         if (this.isSet(RegularExpression.EXTENDED_COMMENT))
127             this.regex = REUtil.stripExtendedComment(this.regex);
128         this.regexlen = this.regex.length();
129 
130 
131         this.next();
132         Token ret = this.parseRegex();
133         if (this.offset != this.regexlen)
134             throw ex("parser.parse.1", this.offset);
135         if (this.read() != T_EOF) {
136             throw ex("parser.parse.1", this.offset-1);
137         }
138         if (this.references != null) {
139             for (int i = 0;  i < this.references.size();  i ++) {
140                 ReferencePosition position = this.references.get(i);
141                 if (this.parennumber <= position.refNumber)
142                     throw ex("parser.parse.2", position.position);
143             }
144             this.references.clear();
145         }
146         return ret;
147     }
148 
149     /*
150     public RegularExpression createRegex(String regex, int options) throws ParseException {
151         Token tok = this.parse(regex, options);
152         return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
153     }
154     */
155 
setContext(int con)156     protected final void setContext(int con) {
157         this.context = con;
158     }
159 
read()160     final int read() {
161         return this.nexttoken;
162     }
163 
164     @SuppressWarnings("fallthrough")
next()165     final void next() {
166         if (this.offset >= this.regexlen) {
167             this.chardata = -1;
168             this.nexttoken = T_EOF;
169             return;
170         }
171 
172         int ret;
173         int ch = this.regex.charAt(this.offset++);
174         this.chardata = ch;
175 
176         if (this.context == S_INBRACKETS) {
177             // In a character class, this.chardata has one character, that is to say,
178             // a pair of surrogates is composed and stored to this.chardata.
179             switch (ch) {
180               case '\\':
181                 ret = T_BACKSOLIDUS;
182                 if (this.offset >= this.regexlen)
183                     throw ex("parser.next.1", this.offset-1);
184                 this.chardata = this.regex.charAt(this.offset++);
185                 break;
186 
187               case '-':
188                 // Allow character class subtraction (regardless of whether we are in
189                 // XML Schema mode or not)
190                 if (this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
191                     this.offset++;
192                     ret = T_XMLSCHEMA_CC_SUBTRACTION;
193                 } else
194                     ret = T_CHAR;
195                 break;
196 
197               case '[':
198                 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
199                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
200                     this.offset++;
201                     ret = T_POSIX_CHARCLASS_START;
202                     break;
203                 } // Through down
204               default:
205                 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
206                     int low = this.regex.charAt(this.offset);
207                     if (REUtil.isLowSurrogate(low)) {
208                         this.chardata = REUtil.composeFromSurrogates(ch, low);
209                         this.offset ++;
210                     }
211                 }
212                 ret = T_CHAR;
213             }
214             this.nexttoken = ret;
215             return;
216         }
217 
218         switch (ch) {
219           case '|': ret = T_OR;             break;
220           case '*': ret = T_STAR;           break;
221           case '+': ret = T_PLUS;           break;
222           case '?': ret = T_QUESTION;       break;
223           case ')': ret = T_RPAREN;         break;
224           case '.': ret = T_DOT;            break;
225           case '[': ret = T_LBRACKET;       break;
226           case '^':
227               if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
228                   ret = T_CHAR;
229               }
230               else {
231                   ret = T_CARET;
232               }
233               break;
234           case '$':
235               if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
236                   ret = T_CHAR;
237               }
238               else {
239                   ret = T_DOLLAR;
240               }
241               break;
242           case '(':
243             ret = T_LPAREN;
244             if (this.offset >= this.regexlen)
245                 break;
246             if (this.regex.charAt(this.offset) != '?')
247                 break;
248             if (++this.offset >= this.regexlen)
249                 throw ex("parser.next.2", this.offset-1);
250             ch = this.regex.charAt(this.offset++);
251             switch (ch) {
252               case ':':  ret = T_LPAREN2;            break;
253               case '=':  ret = T_LOOKAHEAD;          break;
254               case '!':  ret = T_NEGATIVELOOKAHEAD;  break;
255               case '[':  ret = T_SET_OPERATIONS;     break;
256               case '>':  ret = T_INDEPENDENT;        break;
257               case '<':
258                 if (this.offset >= this.regexlen)
259                     throw ex("parser.next.2", this.offset-3);
260                 ch = this.regex.charAt(this.offset++);
261                 if (ch == '=') {
262                     ret = T_LOOKBEHIND;
263                 } else if (ch == '!') {
264                     ret = T_NEGATIVELOOKBEHIND;
265                 } else
266                     throw ex("parser.next.3", this.offset-3);
267                 break;
268               case '#':
269                 while (this.offset < this.regexlen) {
270                     ch = this.regex.charAt(this.offset++);
271                     if (ch == ')')  break;
272                 }
273                 if (ch != ')')
274                     throw ex("parser.next.4", this.offset-1);
275                 ret = T_COMMENT;
276                 break;
277               default:
278                 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
279                     this.offset --;
280                     ret = T_MODIFIERS;
281                     break;
282                 } else if (ch == '(') {         // conditional
283                     ret = T_CONDITION;          // this.offsets points the next of '('.
284                     break;
285                 }
286                 throw ex("parser.next.2", this.offset-2);
287             }
288             break;
289 
290           case '\\':
291             ret = T_BACKSOLIDUS;
292             if (this.offset >= this.regexlen)
293                 throw ex("parser.next.1", this.offset-1);
294             this.chardata = this.regex.charAt(this.offset++);
295             break;
296 
297           default:
298             ret = T_CHAR;
299         }
300         this.nexttoken = ret;
301     }
302 
303     /**
304      * regex ::= term (`|` term)*
305      * term ::= factor+
306      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
307      *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
308      *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
309      * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
310      *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
311      */
parseRegex()312     Token parseRegex() throws ParseException {
313         Token tok = this.parseTerm();
314         Token parent = null;
315         while (this.read() == T_OR) {
316             this.next();                    // '|'
317             if (parent == null) {
318                 parent = Token.createUnion();
319                 parent.addChild(tok);
320                 tok = parent;
321             }
322             tok.addChild(this.parseTerm());
323         }
324         return tok;
325     }
326 
327     /**
328      * term ::= factor+
329      */
parseTerm()330     Token parseTerm() throws ParseException {
331         int ch = this.read();
332         if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
333             return Token.createEmpty();
334         } else {
335             Token tok = this.parseFactor();
336             Token concat = null;
337             while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
338                 if (concat == null) {
339                     concat = Token.createConcat();
340                     concat.addChild(tok);
341                     tok = concat;
342                 }
343                 concat.addChild(this.parseFactor());
344                 //tok = Token.createConcat(tok, this.parseFactor());
345             }
346             return tok;
347         }
348     }
349 
350     // ----------------------------------------------------------------
351 
processCaret()352     Token processCaret() throws ParseException {
353         this.next();
354         return Token.token_linebeginning;
355     }
processDollar()356     Token processDollar() throws ParseException {
357         this.next();
358         return Token.token_lineend;
359     }
processLookahead()360     Token processLookahead() throws ParseException {
361         this.next();
362         Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
363         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
364         this.next();                            // ')'
365         return tok;
366     }
processNegativelookahead()367     Token processNegativelookahead() throws ParseException {
368         this.next();
369         Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
370         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
371         this.next();                            // ')'
372         return tok;
373     }
processLookbehind()374     Token processLookbehind() throws ParseException {
375         this.next();
376         Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
377         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
378         this.next();                            // ')'
379         return tok;
380     }
processNegativelookbehind()381     Token processNegativelookbehind() throws ParseException {
382         this.next();
383         Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
384         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
385         this.next();                    // ')'
386         return tok;
387     }
processBacksolidus_A()388     Token processBacksolidus_A() throws ParseException {
389         this.next();
390         return Token.token_stringbeginning;
391     }
processBacksolidus_Z()392     Token processBacksolidus_Z() throws ParseException {
393         this.next();
394         return Token.token_stringend2;
395     }
processBacksolidus_z()396     Token processBacksolidus_z() throws ParseException {
397         this.next();
398         return Token.token_stringend;
399     }
processBacksolidus_b()400     Token processBacksolidus_b() throws ParseException {
401         this.next();
402         return Token.token_wordedge;
403     }
processBacksolidus_B()404     Token processBacksolidus_B() throws ParseException {
405         this.next();
406         return Token.token_not_wordedge;
407     }
processBacksolidus_lt()408     Token processBacksolidus_lt() throws ParseException {
409         this.next();
410         return Token.token_wordbeginning;
411     }
processBacksolidus_gt()412     Token processBacksolidus_gt() throws ParseException {
413         this.next();
414         return Token.token_wordend;
415     }
processStar(Token tok)416     Token processStar(Token tok) throws ParseException {
417         this.next();
418         if (this.read() == T_QUESTION) {
419             this.next();
420             return Token.createNGClosure(tok);
421         } else
422             return Token.createClosure(tok);
423     }
processPlus(Token tok)424     Token processPlus(Token tok) throws ParseException {
425         // X+ -> XX*
426         this.next();
427         if (this.read() == T_QUESTION) {
428             this.next();
429             return Token.createConcat(tok, Token.createNGClosure(tok));
430         } else
431             return Token.createConcat(tok, Token.createClosure(tok));
432     }
processQuestion(Token tok)433     Token processQuestion(Token tok) throws ParseException {
434         // X? -> X|
435         this.next();
436         Token par = Token.createUnion();
437         if (this.read() == T_QUESTION) {
438             this.next();
439             par.addChild(Token.createEmpty());
440             par.addChild(tok);
441         } else {
442             par.addChild(tok);
443             par.addChild(Token.createEmpty());
444         }
445         return par;
446     }
checkQuestion(int off)447     boolean checkQuestion(int off) {
448         return off < this.regexlen && this.regex.charAt(off) == '?';
449     }
processParen()450     Token processParen() throws ParseException {
451         this.next();
452         int p = this.parenOpened++;
453         Token tok = Token.createParen(this.parseRegex(), p);
454         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
455         this.parennumber++;
456         this.next();                            // Skips ')'
457         return tok;
458     }
processParen2()459     Token processParen2() throws ParseException {
460         this.next();
461         Token tok = Token.createParen(this.parseRegex(), 0);
462         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
463         this.next();                            // Skips ')'
464         return tok;
465     }
processCondition()466     Token processCondition() throws ParseException {
467                                                 // this.offset points the next of '('
468         if (this.offset+1 >= this.regexlen)  throw ex("parser.factor.4", this.offset);
469                                                 // Parses a condition.
470         int refno = -1;
471         Token condition = null;
472         int ch = this.regex.charAt(this.offset);
473         if ('1' <= ch && ch <= '9') {
474             refno = ch-'0';
475             int finalRefno = refno;
476 
477             if (this.parennumber <= refno)
478                 throw ex("parser.parse.2", this.offset);
479 
480             while (this.offset + 1 < this.regexlen) {
481                 ch = this.regex.charAt(this.offset + 1);
482                 if ('0' <= ch && ch <= '9') {
483                     refno = (refno * 10) + (ch - '0');
484                     if (refno < this.parennumber) {
485                         finalRefno= refno;
486                         ++this.offset;
487                     }
488                     else {
489                         break;
490                     }
491                 }
492                 else {
493                     break;
494                 }
495             }
496 
497             this.hasBackReferences = true;
498             if (this.references == null)  this.references = new ArrayList<>();
499             this.references.add(new ReferencePosition(finalRefno, this.offset));
500             this.offset ++;
501             if (this.regex.charAt(this.offset) != ')')  throw ex("parser.factor.1", this.offset);
502             this.offset ++;
503         } else {
504             if (ch == '?')  this.offset --; // Points '('.
505             this.next();
506             condition = this.parseFactor();
507             switch (condition.type) {
508               case Token.LOOKAHEAD:
509               case Token.NEGATIVELOOKAHEAD:
510               case Token.LOOKBEHIND:
511               case Token.NEGATIVELOOKBEHIND:
512                 break;
513               case Token.ANCHOR:
514                 if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
515                 break;
516               default:
517                 throw ex("parser.factor.5", this.offset);
518             }
519         }
520                                                 // Parses yes/no-patterns.
521         this.next();
522         Token yesPattern = this.parseRegex();
523         Token noPattern = null;
524         if (yesPattern.type == Token.UNION) {
525             if (yesPattern.size() != 2)  throw ex("parser.factor.6", this.offset);
526             noPattern = yesPattern.getChild(1);
527             yesPattern = yesPattern.getChild(0);
528         }
529         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
530         this.next();
531         return Token.createCondition(refno, condition, yesPattern, noPattern);
532     }
processModifiers()533     Token processModifiers() throws ParseException {
534                                                 // this.offset points the next of '?'.
535                                                 // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
536         int add = 0, mask = 0, ch = -1;
537         while (this.offset < this.regexlen) {
538             ch = this.regex.charAt(this.offset);
539             int v = REUtil.getOptionValue(ch);
540             if (v == 0)  break;                 // '-' or ':'?
541             add |= v;
542             this.offset ++;
543         }
544         if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
545         if (ch == '-') {
546             this.offset ++;
547             while (this.offset < this.regexlen) {
548                 ch = this.regex.charAt(this.offset);
549                 int v = REUtil.getOptionValue(ch);
550                 if (v == 0)  break;             // ':'?
551                 mask |= v;
552                 this.offset ++;
553             }
554             if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
555         }
556         Token tok;
557         if (ch == ':') {
558             this.offset ++;
559             this.next();
560             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
561             if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
562             this.next();
563         } else if (ch == ')') {                 // such as (?-i)
564             this.offset ++;
565             this.next();
566             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
567         } else
568             throw ex("parser.factor.3", this.offset);
569 
570         return tok;
571     }
processIndependent()572     Token processIndependent() throws ParseException {
573         this.next();
574         Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
575         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
576         this.next();                            // Skips ')'
577         return tok;
578     }
processBacksolidus_c()579     Token processBacksolidus_c() throws ParseException {
580         int ch2;                                // Must be in 0x0040-0x005f
581         if (this.offset >= this.regexlen
582             || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
583             throw ex("parser.atom.1", this.offset-1);
584         this.next();
585         return Token.createChar(ch2-0x40);
586     }
processBacksolidus_C()587     Token processBacksolidus_C() throws ParseException {
588         throw ex("parser.process.1", this.offset);
589     }
processBacksolidus_i()590     Token processBacksolidus_i() throws ParseException {
591         Token tok = Token.createChar('i');
592         this.next();
593         return tok;
594     }
processBacksolidus_I()595     Token processBacksolidus_I() throws ParseException {
596         throw ex("parser.process.1", this.offset);
597     }
processBacksolidus_g()598     Token processBacksolidus_g() throws ParseException {
599         this.next();
600         return Token.getGraphemePattern();
601     }
processBacksolidus_X()602     Token processBacksolidus_X() throws ParseException {
603         this.next();
604         return Token.getCombiningCharacterSequence();
605     }
processBackreference()606     Token processBackreference() throws ParseException {
607         int refnum = this.chardata-'0';
608         int finalRefnum = refnum;
609 
610         if (this.parennumber <= refnum)
611             throw ex("parser.parse.2", this.offset-2);
612 
613         while  (this.offset < this.regexlen) {
614             final int ch = this.regex.charAt(this.offset);
615             if ('0' <= ch && ch <= '9') {
616                 refnum = (refnum * 10) + (ch - '0');
617                 if (refnum < this.parennumber) {
618                     ++this.offset;
619                     finalRefnum = refnum;
620                     this.chardata = ch;
621                 }
622                 else {
623                     break;
624                 }
625             }
626             else {
627                 break;
628             }
629         }
630 
631         Token tok = Token.createBackReference(finalRefnum);
632         this.hasBackReferences = true;
633         if (this.references == null)  this.references = new ArrayList<>();
634         this.references.add(new ReferencePosition(finalRefnum, this.offset-2));
635         this.next();
636         return tok;
637     }
638 
639     // ----------------------------------------------------------------
640 
641     /**
642      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
643      *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
644      *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
645      *            | '(?#' [^)]* ')'
646      * minmax ::= '{' min (',' max?)? '}'
647      * min ::= [0-9]+
648      * max ::= [0-9]+
649      */
parseFactor()650     Token parseFactor() throws ParseException {
651         int ch = this.read();
652         Token tok;
653         switch (ch) {
654           case T_CARET:         return this.processCaret();
655           case T_DOLLAR:        return this.processDollar();
656           case T_LOOKAHEAD:     return this.processLookahead();
657           case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
658           case T_LOOKBEHIND:    return this.processLookbehind();
659           case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
660 
661           case T_COMMENT:
662             this.next();
663             return Token.createEmpty();
664 
665           case T_BACKSOLIDUS:
666             switch (this.chardata) {
667               case 'A': return this.processBacksolidus_A();
668               case 'Z': return this.processBacksolidus_Z();
669               case 'z': return this.processBacksolidus_z();
670               case 'b': return this.processBacksolidus_b();
671               case 'B': return this.processBacksolidus_B();
672               case '<': return this.processBacksolidus_lt();
673               case '>': return this.processBacksolidus_gt();
674             }
675                                                 // through down
676         }
677         tok = this.parseAtom();
678         ch = this.read();
679         switch (ch) {
680           case T_STAR:  return this.processStar(tok);
681           case T_PLUS:  return this.processPlus(tok);
682           case T_QUESTION: return this.processQuestion(tok);
683           case T_CHAR:
684             if (this.chardata == '{' && this.offset < this.regexlen) {
685 
686                 int off = this.offset;          // this.offset -> next of '{'
687                 int min = 0, max = -1;
688 
689                 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
690 
691                     min = ch -'0';
692                     while (off < this.regexlen
693                            && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
694                         min = min*10 +ch-'0';
695                         if (min < 0)
696                             throw ex("parser.quantifier.5", this.offset);
697                     }
698                 }
699                 else {
700                     throw ex("parser.quantifier.1", this.offset);
701                 }
702 
703                 max = min;
704                 if (ch == ',') {
705 
706                    if (off >= this.regexlen) {
707                        throw ex("parser.quantifier.3", this.offset);
708                    }
709                    else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
710 
711                         max = ch -'0';       // {min,max}
712                         while (off < this.regexlen
713                                && (ch = this.regex.charAt(off++)) >= '0'
714                                && ch <= '9') {
715                             max = max*10 +ch-'0';
716                             if (max < 0)
717                                 throw ex("parser.quantifier.5", this.offset);
718                         }
719 
720                         if (min > max)
721                             throw ex("parser.quantifier.4", this.offset);
722                    }
723                    else { // assume {min,}
724                         max = -1;
725                     }
726                 }
727 
728                if (ch != '}')
729                    throw ex("parser.quantifier.2", this.offset);
730 
731                if (this.checkQuestion(off)) {  // off -> next of '}'
732                     tok = Token.createNGClosure(tok);
733                     this.offset = off+1;
734                 } else {
735                     tok = Token.createClosure(tok);
736                     this.offset = off;
737                 }
738 
739                 tok.setMin(min);
740                 tok.setMax(max);
741                 //System.err.println("CLOSURE: "+min+", "+max);
742                 this.next();
743             }
744         }
745         return tok;
746     }
747 
748     /**
749      * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
750      *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
751      *          | '(?>' regex ')'
752      * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
753      */
parseAtom()754     Token parseAtom() throws ParseException {
755         int ch = this.read();
756         Token tok = null;
757         switch (ch) {
758           case T_LPAREN:        return this.processParen();
759           case T_LPAREN2:       return this.processParen2(); // '(?:'
760           case T_CONDITION:     return this.processCondition(); // '(?('
761           case T_MODIFIERS:     return this.processModifiers(); // (?modifiers ... )
762           case T_INDEPENDENT:   return this.processIndependent();
763           case T_DOT:
764             this.next();                    // Skips '.'
765             tok = Token.token_dot;
766             break;
767 
768             /**
769              * char-class ::= '[' ( '^'? range ','?)+ ']'
770              * range ::= '\d' | '\w' | '\s' | category-block | range-char
771              *           | range-char '-' range-char
772              * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
773              * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
774              */
775           case T_LBRACKET:      return this.parseCharacterClass(true);
776           case T_SET_OPERATIONS: return this.parseSetOperations();
777 
778           case T_BACKSOLIDUS:
779             switch (this.chardata) {
780               case 'd':  case 'D':
781               case 'w':  case 'W':
782               case 's':  case 'S':
783                 tok = this.getTokenForShorthand(this.chardata);
784                 this.next();
785                 return tok;
786 
787               case 'e':  case 'f':  case 'n':  case 'r':
788               case 't':  case 'u':  case 'v':  case 'x':
789                 {
790                     int ch2 = this.decodeEscaped();
791                     if (ch2 < 0x10000) {
792                         tok = Token.createChar(ch2);
793                     } else {
794                         tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
795                     }
796                 }
797                 break;
798 
799               case 'c': return this.processBacksolidus_c();
800               case 'C': return this.processBacksolidus_C();
801               case 'i': return this.processBacksolidus_i();
802               case 'I': return this.processBacksolidus_I();
803               case 'g': return this.processBacksolidus_g();
804               case 'X': return this.processBacksolidus_X();
805               case '1':  case '2':  case '3':  case '4':
806               case '5':  case '6':  case '7':  case '8':  case '9':
807                 return this.processBackreference();
808 
809               case 'P':
810               case 'p':
811                 int pstart = this.offset;
812                 tok = processBacksolidus_pP(this.chardata);
813                 if (tok == null)  throw this.ex("parser.atom.5", pstart);
814                 break;
815 
816               default:
817                 tok = Token.createChar(this.chardata);
818             }
819             this.next();
820             break;
821 
822           case T_CHAR:
823             if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
824                 throw this.ex("parser.atom.4", this.offset-1);
825             tok = Token.createChar(this.chardata);
826             int high = this.chardata;
827             this.next();
828             if (REUtil.isHighSurrogate(high)
829                 && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
830                 char[] sur = new char[2];
831                 sur[0] = (char)high;
832                 sur[1] = (char)this.chardata;
833                 tok = Token.createParen(Token.createString(new String(sur)), 0);
834                 this.next();
835             }
836             break;
837 
838           default:
839             throw this.ex("parser.atom.4", this.offset-1);
840         }
841         return tok;
842     }
843 
processBacksolidus_pP(int c)844     protected RangeToken processBacksolidus_pP(int c) throws ParseException {
845 
846         this.next();
847         if (this.read() != T_CHAR || this.chardata != '{')
848             throw this.ex("parser.atom.2", this.offset-1);
849 
850         // handle category escape
851         boolean positive = c == 'p';
852         int namestart = this.offset;
853         int nameend = this.regex.indexOf('}', namestart);
854 
855         if (nameend < 0)
856             throw this.ex("parser.atom.3", this.offset);
857 
858         String pname = this.regex.substring(namestart, nameend);
859         this.offset = nameend+1;
860 
861         return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
862     }
863 
processCIinCharacterClass(RangeToken tok, int c)864     int processCIinCharacterClass(RangeToken tok, int c) {
865         return this.decodeEscaped();
866     }
867 
868     /**
869      * char-class ::= '[' ( '^'? range ','?)+ ']'
870      * range ::= '\d' | '\w' | '\s' | category-block | range-char
871      *           | range-char '-' range-char
872      * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
873      * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
874      */
parseCharacterClass(boolean useNrange)875     protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
876         this.setContext(S_INBRACKETS);
877         this.next();                            // '['
878         boolean nrange = false;
879         RangeToken base = null;
880         RangeToken tok;
881         if (this.read() == T_CHAR && this.chardata == '^') {
882             nrange = true;
883             this.next();                        // '^'
884             if (useNrange) {
885                 tok = Token.createNRange();
886             } else {
887                 base = Token.createRange();
888                 base.addRange(0, Token.UTF16_MAX);
889                 tok = Token.createRange();
890             }
891         } else {
892             tok = Token.createRange();
893         }
894         int type;
895         boolean firstloop = true;
896         while ((type = this.read()) != T_EOF) {
897             if (type == T_CHAR && this.chardata == ']' && !firstloop)
898                 break;
899             int c = this.chardata;
900             boolean end = false;
901             if (type == T_BACKSOLIDUS) {
902                 switch (c) {
903                   case 'd':  case 'D':
904                   case 'w':  case 'W':
905                   case 's':  case 'S':
906                     tok.mergeRanges(this.getTokenForShorthand(c));
907                     end = true;
908                     break;
909 
910                   case 'i':  case 'I':
911                   case 'c':  case 'C':
912                     c = this.processCIinCharacterClass(tok, c);
913                     if (c < 0)  end = true;
914                     break;
915 
916                   case 'p':
917                   case 'P':
918                     int pstart = this.offset;
919                     RangeToken tok2 = this.processBacksolidus_pP(c);
920                     if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
921                     tok.mergeRanges(tok2);
922                     end = true;
923                     break;
924 
925                   default:
926                     c = this.decodeEscaped();
927                 } // \ + c
928             } // backsolidus
929                                                 // POSIX Character class such as [:alnum:]
930             else if (type == T_POSIX_CHARCLASS_START) {
931                 int nameend = this.regex.indexOf(':', this.offset);
932                 if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
933                 boolean positive = true;
934                 if (this.regex.charAt(this.offset) == '^') {
935                     this.offset ++;
936                     positive = false;
937                 }
938                 String name = this.regex.substring(this.offset, nameend);
939                 RangeToken range = Token.getRange(name, positive,
940                                                   this.isSet(RegularExpression.XMLSCHEMA_MODE));
941                 if (range == null)  throw this.ex("parser.cc.3", this.offset);
942                 tok.mergeRanges(range);
943                 end = true;
944                 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
945                     throw this.ex("parser.cc.1", nameend);
946                 this.offset = nameend+2;
947             }
948             else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) {
949                 if (nrange) {
950                     nrange = false;
951                     if (useNrange) {
952                         tok = (RangeToken) Token.complementRanges(tok);
953                     }
954                     else {
955                         base.subtractRanges(tok);
956                         tok = base;
957                     }
958                 }
959                 RangeToken range2 = this.parseCharacterClass(false);
960                 tok.subtractRanges(range2);
961                 if (this.read() != T_CHAR || this.chardata != ']') {
962                     throw this.ex("parser.cc.5", this.offset);
963                 }
964                 break;                          // Exit this loop
965             }
966             this.next();
967             if (!end) {                         // if not shorthands...
968                 if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
969                     if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
970                         tok.addRange(c, c);
971                     }
972                     else {
973                         addCaseInsensitiveChar(tok, c);
974                     }
975                 }
976                 else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
977                     throw this.ex("parser.cc.8", this.offset-1);
978                 }
979                 else {
980                     this.next(); // Skips '-'
981                     if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
982                     if (type == T_CHAR && this.chardata == ']') {
983                         if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
984                             tok.addRange(c, c);
985                         }
986                         else {
987                             addCaseInsensitiveChar(tok, c);
988                         }
989                         tok.addRange('-', '-');
990                     } else {
991                         int rangeend = this.chardata;
992                         if (type == T_BACKSOLIDUS) {
993                             rangeend = this.decodeEscaped();
994                         }
995                         this.next();
996                         if (c > rangeend) {
997                             throw this.ex("parser.ope.3", this.offset-1);
998                         }
999                         if (!this.isSet(RegularExpression.IGNORE_CASE) ||
1000                                 (c > 0xffff && rangeend > 0xffff)) {
1001                             tok.addRange(c, rangeend);
1002                         }
1003                         else {
1004                             addCaseInsensitiveCharRange(tok, c, rangeend);
1005                         }
1006                     }
1007                 }
1008             }
1009             if (this.isSet(RegularExpression.SPECIAL_COMMA)
1010                 && this.read() == T_CHAR && this.chardata == ',') {
1011                 this.next();
1012             }
1013             firstloop = false;
1014         }
1015         if (this.read() == T_EOF) {
1016             throw this.ex("parser.cc.2", this.offset);
1017         }
1018 
1019         if (!useNrange && nrange) {
1020             base.subtractRanges(tok);
1021             tok = base;
1022         }
1023         tok.sortRanges();
1024         tok.compactRanges();
1025         this.setContext(S_NORMAL);
1026         this.next();                    // Skips ']'
1027 
1028         return tok;
1029     }
1030 
1031     /**
1032      * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
1033      */
parseSetOperations()1034     protected RangeToken parseSetOperations() throws ParseException {
1035         RangeToken tok = this.parseCharacterClass(false);
1036         int type;
1037         while ((type = this.read()) != T_RPAREN) {
1038             int ch = this.chardata;
1039             if (type == T_CHAR && (ch == '-' || ch == '&')
1040                 || type == T_PLUS) {
1041                 this.next();
1042                 if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
1043                 RangeToken t2 = this.parseCharacterClass(false);
1044                 if (type == T_PLUS)
1045                     tok.mergeRanges(t2);
1046                 else if (ch == '-')
1047                     tok.subtractRanges(t2);
1048                 else if (ch == '&')
1049                     tok.intersectRanges(t2);
1050                 else
1051                     throw new RuntimeException("ASSERT");
1052             } else {
1053                 throw ex("parser.ope.2", this.offset-1);
1054             }
1055         }
1056         this.next();
1057         return tok;
1058     }
1059 
getTokenForShorthand(int ch)1060     Token getTokenForShorthand(int ch) {
1061         Token tok;
1062         switch (ch) {
1063           case 'd':
1064             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1065                 ? Token.getRange("Nd", true) : Token.token_0to9;
1066             break;
1067           case 'D':
1068             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1069                 ? Token.getRange("Nd", false) : Token.token_not_0to9;
1070             break;
1071           case 'w':
1072             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1073                 ? Token.getRange("IsWord", true) : Token.token_wordchars;
1074             break;
1075           case 'W':
1076             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1077                 ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
1078             break;
1079           case 's':
1080             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1081                 ? Token.getRange("IsSpace", true) : Token.token_spaces;
1082             break;
1083           case 'S':
1084             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1085                 ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
1086             break;
1087 
1088           default:
1089             throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
1090         }
1091         return tok;
1092     }
1093 
1094     /**
1095      */
decodeEscaped()1096     int decodeEscaped() throws ParseException {
1097         if (this.read() != T_BACKSOLIDUS)  throw ex("parser.next.1", this.offset-1);
1098         int c = this.chardata;
1099         switch (c) {
1100           case 'e':  c = 0x1b;  break; // ESCAPE U+001B
1101           case 'f':  c = '\f';  break; // FORM FEED U+000C
1102           case 'n':  c = '\n';  break; // LINE FEED U+000A
1103           case 'r':  c = '\r';  break; // CRRIAGE RETURN U+000D
1104           case 't':  c = '\t';  break; // HORIZONTAL TABULATION U+0009
1105           //case 'v':  c = 0x0b;  break; // VERTICAL TABULATION U+000B
1106           case 'x':
1107             this.next();
1108             if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
1109             if (this.chardata == '{') {
1110                 int v1 = 0;
1111                 int uv = 0;
1112                 do {
1113                     this.next();
1114                     if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
1115                     if ((v1 = hexChar(this.chardata)) < 0)
1116                         break;
1117                     if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
1118                     uv = uv*16+v1;
1119                 } while (true);
1120                 if (this.chardata != '}')  throw ex("parser.descape.3", this.offset-1);
1121                 if (uv > Token.UTF16_MAX)  throw ex("parser.descape.4", this.offset-1);
1122                 c = uv;
1123             } else {
1124                 int v1 = 0;
1125                 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1126                     throw ex("parser.descape.1", this.offset-1);
1127                 int uv = v1;
1128                 this.next();
1129                 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1130                     throw ex("parser.descape.1", this.offset-1);
1131                 uv = uv*16+v1;
1132                 c = uv;
1133             }
1134             break;
1135 
1136           case 'u':
1137             int v1 = 0;
1138             this.next();
1139             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1140                 throw ex("parser.descape.1", this.offset-1);
1141             int uv = v1;
1142             this.next();
1143             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1144                 throw ex("parser.descape.1", this.offset-1);
1145             uv = uv*16+v1;
1146             this.next();
1147             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1148                 throw ex("parser.descape.1", this.offset-1);
1149             uv = uv*16+v1;
1150             this.next();
1151             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1152                 throw ex("parser.descape.1", this.offset-1);
1153             uv = uv*16+v1;
1154             c = uv;
1155             break;
1156 
1157           case 'v':
1158             this.next();
1159             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1160                 throw ex("parser.descape.1", this.offset-1);
1161             uv = v1;
1162             this.next();
1163             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1164                 throw ex("parser.descape.1", this.offset-1);
1165             uv = uv*16+v1;
1166             this.next();
1167             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1168                 throw ex("parser.descape.1", this.offset-1);
1169             uv = uv*16+v1;
1170             this.next();
1171             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1172                 throw ex("parser.descape.1", this.offset-1);
1173             uv = uv*16+v1;
1174             this.next();
1175             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1176                 throw ex("parser.descape.1", this.offset-1);
1177             uv = uv*16+v1;
1178             this.next();
1179             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1180                 throw ex("parser.descape.1", this.offset-1);
1181             uv = uv*16+v1;
1182             if (uv > Token.UTF16_MAX)  throw ex("parser.descappe.4", this.offset-1);
1183             c = uv;
1184             break;
1185           case 'A':
1186           case 'Z':
1187           case 'z':
1188             throw ex("parser.descape.5", this.offset-2);
1189           default:
1190         }
1191         return c;
1192     }
1193 
hexChar(int ch)1194     static private final int hexChar(int ch) {
1195         if (ch < '0')  return -1;
1196         if (ch > 'f')  return -1;
1197         if (ch <= '9')  return ch-'0';
1198         if (ch < 'A')  return -1;
1199         if (ch <= 'F')  return ch-'A'+10;
1200         if (ch < 'a')  return -1;
1201         return ch-'a'+10;
1202     }
1203 
addCaseInsensitiveChar(RangeToken tok, int c)1204     static protected final void addCaseInsensitiveChar(RangeToken tok, int c) {
1205         final int[] caseMap = CaseInsensitiveMap.get(c);
1206         tok.addRange(c, c);
1207 
1208         if (caseMap != null) {
1209             for (int i=0; i<caseMap.length; i+=2) {
1210                 tok.addRange(caseMap[i], caseMap[i]);
1211             }
1212         }
1213 
1214     }
1215 
addCaseInsensitiveCharRange(RangeToken tok, int start, int end)1216     static protected final void addCaseInsensitiveCharRange(RangeToken tok, int start, int end) {
1217         int[] caseMap;
1218         int r1, r2;
1219         if (start <= end) {
1220             r1 = start;
1221             r2 = end;
1222         } else {
1223             r1 = end;
1224             r2 = start;
1225         }
1226 
1227         tok.addRange(r1, r2);
1228         for (int ch = r1;  ch <= r2;  ch++) {
1229             caseMap = CaseInsensitiveMap.get(ch);
1230             if (caseMap != null) {
1231                 for (int i=0; i<caseMap.length; i+=2) {
1232                     tok.addRange(caseMap[i], caseMap[i]);
1233                 }
1234             }
1235         }
1236     }
1237 }
1238