1 /*
2  * Permission is hereby granted, free of charge, to any person obtaining a copy of
3  * this software and associated documentation files (the "Software"), to deal in
4  * the Software without restriction, including without limitation the rights to
5  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
6  * of the Software, and to permit persons to whom the Software is furnished to do
7  * so, subject to the following conditions:
8  *
9  * The above copyright notice and this permission notice shall be included in all
10  * copies or substantial portions of the Software.
11  *
12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
17  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
18  * SOFTWARE.
19  */
20 package jdk.nashorn.internal.runtime.regexp.joni;
21 
22 import static jdk.nashorn.internal.runtime.regexp.joni.Option.isSingleline;
23 import static jdk.nashorn.internal.runtime.regexp.joni.ast.QuantifierNode.isRepeatInfinite;
24 import jdk.nashorn.internal.runtime.regexp.joni.ast.QuantifierNode;
25 import jdk.nashorn.internal.runtime.regexp.joni.constants.AnchorType;
26 import jdk.nashorn.internal.runtime.regexp.joni.constants.MetaChar;
27 import jdk.nashorn.internal.runtime.regexp.joni.constants.TokenType;
28 import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
29 import jdk.nashorn.internal.runtime.regexp.joni.exception.ErrorMessages;
30 import jdk.nashorn.internal.runtime.regexp.joni.exception.SyntaxException;
31 import jdk.nashorn.internal.runtime.regexp.joni.exception.ValueException;
32 
33 class Lexer extends ScannerSupport {
34     protected final ScanEnvironment env;
35     protected final Syntax syntax;              // fast access to syntax
36     protected final Token token = new Token();  // current token
37 
Lexer(final ScanEnvironment env, final char[] chars, final int p, final int end)38     protected Lexer(final ScanEnvironment env, final char[] chars, final int p, final int end) {
39         super(chars, p, end);
40         this.env = env;
41         this.syntax = env.syntax;
42     }
43 
44     /**
45      * @return 0: normal {n,m}, 2: fixed {n}
46      * !introduce returnCode here
47      */
fetchRangeQuantifier()48     private int fetchRangeQuantifier() {
49         mark();
50         final boolean synAllow = syntax.allowInvalidInterval();
51 
52         if (!left()) {
53             if (synAllow) {
54                 return 1; /* "....{" : OK! */
55             }
56             throw new SyntaxException(ERR_END_PATTERN_AT_LEFT_BRACE);
57         }
58 
59         if (!synAllow) {
60             c = peek();
61             if (c == ')' || c == '(' || c == '|') {
62                 throw new SyntaxException(ERR_END_PATTERN_AT_LEFT_BRACE);
63             }
64         }
65 
66         int low = scanUnsignedNumber();
67         if (low < 0) {
68             throw new SyntaxException(ErrorMessages.ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE);
69         }
70         if (low > Config.MAX_REPEAT_NUM) {
71             throw new SyntaxException(ErrorMessages.ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE);
72         }
73 
74         boolean nonLow = false;
75         if (p == _p) { /* can't read low */
76             if (syntax.allowIntervalLowAbbrev()) {
77                 low = 0;
78                 nonLow = true;
79             } else {
80                 return invalidRangeQuantifier(synAllow);
81             }
82         }
83 
84         if (!left()) {
85             return invalidRangeQuantifier(synAllow);
86         }
87 
88         fetch();
89         int up;
90         int ret = 0;
91         if (c == ',') {
92             final int prev = p; // ??? last
93             up = scanUnsignedNumber();
94             if (up < 0) {
95                 throw new ValueException(ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE);
96             }
97             if (up > Config.MAX_REPEAT_NUM) {
98                 throw new ValueException(ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE);
99             }
100 
101             if (p == prev) {
102                 if (nonLow) {
103                     return invalidRangeQuantifier(synAllow);
104                 }
105                 up = QuantifierNode.REPEAT_INFINITE; /* {n,} : {n,infinite} */
106             }
107         } else {
108             if (nonLow) {
109                 return invalidRangeQuantifier(synAllow);
110             }
111             unfetch();
112             up = low; /* {n} : exact n times */
113             ret = 2; /* fixed */
114         }
115 
116         if (!left()) {
117             return invalidRangeQuantifier(synAllow);
118         }
119         fetch();
120 
121         if (syntax.opEscBraceInterval()) {
122             if (c != syntax.metaCharTable.esc) {
123                 return invalidRangeQuantifier(synAllow);
124             }
125             fetch();
126         }
127 
128         if (c != '}') {
129             return invalidRangeQuantifier(synAllow);
130         }
131 
132         if (!isRepeatInfinite(up) && low > up) {
133             throw new ValueException(ERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE);
134         }
135 
136         token.type = TokenType.INTERVAL;
137         token.setRepeatLower(low);
138         token.setRepeatUpper(up);
139 
140         return ret; /* 0: normal {n,m}, 2: fixed {n} */
141     }
142 
invalidRangeQuantifier(final boolean synAllow)143     private int invalidRangeQuantifier(final boolean synAllow) {
144         if (synAllow) {
145             restore();
146             return 1;
147         }
148         throw new SyntaxException(ERR_INVALID_REPEAT_RANGE_PATTERN);
149     }
150 
151     @SuppressWarnings("fallthrough")
152     /* \M-, \C-, \c, or \... */
fetchEscapedValue()153     private int fetchEscapedValue() {
154         if (!left()) {
155             throw new SyntaxException(ERR_END_PATTERN_AT_ESCAPE);
156         }
157         fetch();
158 
159         switch(c) {
160 
161         case 'M':
162             if (syntax.op2EscCapitalMBarMeta()) {
163                 if (!left()) {
164                     throw new SyntaxException(ERR_END_PATTERN_AT_META);
165                 }
166                 fetch();
167                 if (c != '-') {
168                     throw new SyntaxException(ERR_META_CODE_SYNTAX);
169                 }
170                 if (!left()) {
171                     throw new SyntaxException(ERR_END_PATTERN_AT_META);
172                 }
173                 fetch();
174                 if (c == syntax.metaCharTable.esc) {
175                     c = fetchEscapedValue();
176                 }
177                 c = ((c & 0xff) | 0x80);
178             } else {
179                 fetchEscapedValueBackSlash();
180             }
181             break;
182 
183         case 'C':
184             if (syntax.op2EscCapitalCBarControl()) {
185                 if (!left()) {
186                     throw new SyntaxException(ERR_END_PATTERN_AT_CONTROL);
187                 }
188                 fetch();
189                 if (c != '-') {
190                     throw new SyntaxException(ERR_CONTROL_CODE_SYNTAX);
191                 }
192                 fetchEscapedValueControl();
193             } else {
194                 fetchEscapedValueBackSlash();
195             }
196             break;
197 
198         case 'c':
199             if (syntax.opEscCControl()) {
200                 fetchEscapedValueControl();
201             }
202             /* fall through */
203 
204         default:
205             fetchEscapedValueBackSlash();
206         } // switch
207 
208         return c; // ???
209     }
210 
fetchEscapedValueBackSlash()211     private void fetchEscapedValueBackSlash() {
212         c = env.convertBackslashValue(c);
213     }
214 
fetchEscapedValueControl()215     private void fetchEscapedValueControl() {
216         if (!left()) {
217             throw new SyntaxException(ERR_END_PATTERN_AT_CONTROL);
218         }
219         fetch();
220         if (c == '?') {
221             c = 0177;
222         } else {
223             if (c == syntax.metaCharTable.esc) {
224                 c = fetchEscapedValue();
225             }
226             c &= 0x9f;
227         }
228     }
229 
    /**
     * Turns the current token into a {@code CHAR_TYPE} token.
     *
     * @param flag value stored through {@code token.setPropNot}; callers in this class
     *             pass {@code true} for the upper-case escapes (\W, \D, \S, \H)
     * @param type character type constant stored through {@code token.setPropCType}
     */
    private void fetchTokenInCCFor_charType(final boolean flag, final int type) {
        token.type = TokenType.CHAR_TYPE;
        token.setPropCType(type);
        token.setPropNot(flag);
    }
235 
fetchTokenInCCFor_x()236     private void fetchTokenInCCFor_x() {
237         if (!left()) {
238             return;
239         }
240         final int last = p;
241 
242         if (peekIs('{') && syntax.opEscXBraceHex8()) {
243             inc();
244             final int num = scanUnsignedHexadecimalNumber(8);
245             if (num < 0) {
246                 throw new ValueException(ERR_TOO_BIG_WIDE_CHAR_VALUE);
247             }
248             if (left()) {
249                 final int c2 = peek();
250                 if (EncodingHelper.isXDigit(c2)) {
251                     throw new ValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE);
252                 }
253             }
254 
255             if (p > last + 1 && left() && peekIs('}')) {
256                 inc();
257                 token.type = TokenType.CODE_POINT;
258                 token.setCode(num);
259             } else {
260                 /* can't read nothing or invalid format */
261                 p = last;
262             }
263         } else if (syntax.opEscXHex2()) {
264             int num = scanUnsignedHexadecimalNumber(2);
265             if (num < 0) {
266                 throw new ValueException(ERR_TOO_BIG_NUMBER);
267             }
268             if (p == last) { /* can't read nothing. */
269                 num = 0; /* but, it's not error */
270             }
271             token.type = TokenType.RAW_BYTE;
272             token.setC(num);
273         }
274     }
275 
fetchTokenInCCFor_u()276     private void fetchTokenInCCFor_u() {
277         if (!left()) {
278             return;
279         }
280         final int last = p;
281 
282         if (syntax.op2EscUHex4()) {
283             int num = scanUnsignedHexadecimalNumber(4);
284             if (num < 0) {
285                 throw new ValueException(ERR_TOO_BIG_NUMBER);
286             }
287             if (p == last) {  /* can't read nothing. */
288                 num = 0; /* but, it's not error */
289             }
290             token.type = TokenType.CODE_POINT;
291             token.setCode(num);
292         }
293     }
294 
fetchTokenInCCFor_digit()295     private void fetchTokenInCCFor_digit() {
296         if (syntax.opEscOctal3()) {
297             unfetch();
298             final int last = p;
299             int num = scanUnsignedOctalNumber(3);
300             if (num < 0) {
301                 throw new ValueException(ERR_TOO_BIG_NUMBER);
302             }
303             if (p == last) {  /* can't read nothing. */
304                 num = 0; /* but, it's not error */
305             }
306             token.type = TokenType.RAW_BYTE;
307             token.setC(num);
308         }
309     }
310 
fetchTokenInCCFor_and()311     private void fetchTokenInCCFor_and() {
312         if (syntax.op2CClassSetOp() && left() && peekIs('&')) {
313             inc();
314             token.type = TokenType.CC_AND;
315         }
316     }
317 
fetchTokenInCC()318     protected final TokenType fetchTokenInCC() {
319         if (!left()) {
320             token.type = TokenType.EOT;
321             return token.type;
322         }
323 
324         fetch();
325         token.type = TokenType.CHAR;
326         token.setC(c);
327         token.escaped = false;
328 
329         if (c == ']') {
330             token.type = TokenType.CC_CLOSE;
331         } else if (c == '-') {
332             token.type = TokenType.CC_RANGE;
333         } else if (c == syntax.metaCharTable.esc) {
334             if (!syntax.backSlashEscapeInCC()) {
335                 return token.type;
336             }
337             if (!left()) {
338                 throw new SyntaxException(ERR_END_PATTERN_AT_ESCAPE);
339             }
340             fetch();
341             token.escaped = true;
342             token.setC(c);
343 
344             switch (c) {
345             case 'w':
346                 fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
347                 break;
348             case 'W':
349                 fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
350                 break;
351             case 'd':
352                 fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
353                 break;
354             case 'D':
355                 fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
356                 break;
357             case 's':
358                 fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
359                 break;
360             case 'S':
361                 fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
362                 break;
363             case 'h':
364                 if (syntax.op2EscHXDigit()) {
365                     fetchTokenInCCFor_charType(false, CharacterType.XDIGIT);
366                 }
367                 break;
368             case 'H':
369                 if (syntax.op2EscHXDigit()) {
370                     fetchTokenInCCFor_charType(true, CharacterType.XDIGIT);
371                 }
372                 break;
373             case 'x':
374                 fetchTokenInCCFor_x();
375                 break;
376             case 'u':
377                 fetchTokenInCCFor_u();
378                 break;
379             case '0':
380             case '1':
381             case '2':
382             case '3':
383             case '4':
384             case '5':
385             case '6':
386             case '7':
387                 fetchTokenInCCFor_digit();
388                 break;
389 
390             default:
391                 unfetch();
392                 final int num = fetchEscapedValue();
393                 if (token.getC() != num) {
394                     token.setCode(num);
395                     token.type = TokenType.CODE_POINT;
396                 }
397                 break;
398             } // switch
399 
400         } else if (c == '&') {
401             fetchTokenInCCFor_and();
402         }
403         return token.type;
404     }
405 
    /**
     * Turns the current token into an {@code OP_REPEAT} token with the given bounds and
     * then looks for a trailing greediness modifier.
     *
     * @param lower lower repeat bound
     * @param upper upper repeat bound (callers pass {@code QuantifierNode.REPEAT_INFINITE}
     *              for unbounded repeats)
     */
    private void fetchTokenFor_repeat(final int lower, final int upper) {
        token.type = TokenType.OP_REPEAT;
        token.setRepeatLower(lower);
        token.setRepeatUpper(upper);
        // must run after token.type is set: possessiveCheck() inspects the token type
        greedyCheck();
    }
412 
fetchTokenFor_openBrace()413     private void fetchTokenFor_openBrace() {
414         switch (fetchRangeQuantifier()) {
415         case 0:
416             greedyCheck();
417             break;
418         case 2:
419             if (syntax.fixedIntervalIsGreedyOnly()) {
420                 possessiveCheck();
421             } else {
422                 greedyCheck();
423             }
424             break;
425         default: /* 1 : normal char */
426         } // inner switch
427     }
428 
    /**
     * Turns the current token into an {@code ANCHOR} token.
     *
     * @param subType anchor kind stored on the token; callers in this class pass
     *                {@code AnchorType} constants
     */
    private void fetchTokenFor_anchor(final int subType) {
        token.type = TokenType.ANCHOR;
        token.setAnchor(subType);
    }
433 
fetchTokenFor_xBrace()434     private void fetchTokenFor_xBrace() {
435         if (!left()) {
436             return;
437         }
438 
439         final int last = p;
440         if (peekIs('{') && syntax.opEscXBraceHex8()) {
441             inc();
442             final int num = scanUnsignedHexadecimalNumber(8);
443             if (num < 0) {
444                 throw new ValueException(ERR_TOO_BIG_WIDE_CHAR_VALUE);
445             }
446             if (left()) {
447                 if (EncodingHelper.isXDigit(peek())) {
448                     throw new ValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE);
449                 }
450             }
451 
452             if (p > last + 1 && left() && peekIs('}')) {
453                 inc();
454                 token.type = TokenType.CODE_POINT;
455                 token.setCode(num);
456             } else {
457                 /* can't read nothing or invalid format */
458                 p = last;
459             }
460         } else if (syntax.opEscXHex2()) {
461             int num = scanUnsignedHexadecimalNumber(2);
462             if (num < 0) {
463                 throw new ValueException(ERR_TOO_BIG_NUMBER);
464             }
465             if (p == last) { /* can't read nothing. */
466                 num = 0; /* but, it's not error */
467             }
468             token.type = TokenType.RAW_BYTE;
469             token.setC(num);
470         }
471     }
472 
fetchTokenFor_uHex()473     private void fetchTokenFor_uHex() {
474         if (!left()) {
475             return;
476         }
477         final int last = p;
478 
479         if (syntax.op2EscUHex4()) {
480             int num = scanUnsignedHexadecimalNumber(4);
481             if (num < 0) {
482                 throw new ValueException(ERR_TOO_BIG_NUMBER);
483             }
484             if (p == last) { /* can't read nothing. */
485                 num = 0; /* but, it's not error */
486             }
487             token.type = TokenType.CODE_POINT;
488             token.setCode(num);
489         }
490     }
491 
fetchTokenFor_digit()492     private void fetchTokenFor_digit() {
493         unfetch();
494         final int last = p;
495         final int num = scanUnsignedNumber();
496         if (num < 0 || num > Config.MAX_BACKREF_NUM) { // goto skip_backref
497         } else if (syntax.opDecimalBackref() && (num <= env.numMem || num <= 9)) { /* This spec. from GNU regex */
498             if (syntax.strictCheckBackref()) {
499                 if (num > env.numMem || env.memNodes == null || env.memNodes[num] == null) {
500                     throw new ValueException(ERR_INVALID_BACKREF);
501                 }
502             }
503             token.type = TokenType.BACKREF;
504             token.setBackrefRef(num);
505             return;
506         }
507 
508         if (c == '8' || c == '9') { /* normal char */ // skip_backref:
509             p = last;
510             inc();
511             return;
512         }
513         p = last;
514 
515         fetchTokenFor_zero(); /* fall through */
516     }
517 
fetchTokenFor_zero()518     private void fetchTokenFor_zero() {
519         if (syntax.opEscOctal3()) {
520             final int last = p;
521             int num = scanUnsignedOctalNumber(c == '0' ? 2 : 3);
522             if (num < 0) {
523                 throw new ValueException(ERR_TOO_BIG_NUMBER);
524             }
525             if (p == last) { /* can't read nothing. */
526                 num = 0; /* but, it's not error */
527             }
528             token.type = TokenType.RAW_BYTE;
529             token.setC(num);
530         } else if (c != '0') {
531             inc();
532         }
533     }
534 
fetchTokenFor_metaChars()535     private void fetchTokenFor_metaChars() {
536         if (c == syntax.metaCharTable.anyChar) {
537             token.type = TokenType.ANYCHAR;
538         } else if (c == syntax.metaCharTable.anyTime) {
539             fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
540         }  else if (c == syntax.metaCharTable.zeroOrOneTime) {
541             fetchTokenFor_repeat(0, 1);
542         } else if (c == syntax.metaCharTable.oneOrMoreTime) {
543             fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
544         } else if (c == syntax.metaCharTable.anyCharAnyTime) {
545             token.type = TokenType.ANYCHAR_ANYTIME;
546             // goto out
547         }
548     }
549 
    /**
     * Scans the next token outside a character class and stores it in {@link #token}.
     *
     * The token starts out as {@code STRING}; each case below upgrades it only when the
     * corresponding syntax option is enabled, so a disabled operator stays a plain
     * character. {@code (?#...)} comment groups and, in extended mode, whitespace and
     * {@code #} line comments are skipped by restarting the scan loop.
     *
     * @return the type of the scanned token ({@code EOT} when no input is left)
     * @throws SyntaxException if the pattern ends right after an escape character or
     *         inside a {@code (?#...)} comment group
     */
    protected final TokenType fetchToken() {
        // mark(); // out
        start:
        while(true) {
            if (!left()) {
                token.type = TokenType.EOT;
                return token.type;
            }

            token.type = TokenType.STRING;
            token.backP = p; // position of the character about to be read

            fetch();

            if (c == syntax.metaCharTable.esc && !syntax.op2IneffectiveEscape()) { // IS_MC_ESC_CODE(code, syn)
                if (!left()) {
                    throw new SyntaxException(ERR_END_PATTERN_AT_ESCAPE);
                }

                token.backP = p; // now the position of the escaped character
                fetch();

                token.setC(c);
                token.escaped = true;
                switch(c) {

                case '*':
                    if (syntax.opEscAsteriskZeroInf()) {
                        fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
                    }
                    break;
                case '+':
                    if (syntax.opEscPlusOneInf()) {
                        fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
                    }
                    break;
                case '?':
                    if (syntax.opEscQMarkZeroOne()) {
                        fetchTokenFor_repeat(0, 1);
                    }
                    break;
                case '{':
                    if (syntax.opEscBraceInterval()) {
                        fetchTokenFor_openBrace();
                    }
                    break;
                case '|':
                    if (syntax.opEscVBarAlt()) {
                        token.type = TokenType.ALT;
                    }
                    break;
                case '(':
                    if (syntax.opEscLParenSubexp()) {
                        token.type = TokenType.SUBEXP_OPEN;
                    }
                    break;
                case ')':
                    if (syntax.opEscLParenSubexp()) {
                        token.type = TokenType.SUBEXP_CLOSE;
                    }
                    break;
                case 'w':
                    if (syntax.opEscWWord()) {
                        fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
                    }
                    break;
                case 'W':
                    if (syntax.opEscWWord()) {
                        fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
                    }
                    break;
                case 'b':
                    if (syntax.opEscBWordBound()) {
                        fetchTokenFor_anchor(AnchorType.WORD_BOUND);
                    }
                    break;
                case 'B':
                    if (syntax.opEscBWordBound()) {
                        fetchTokenFor_anchor(AnchorType.NOT_WORD_BOUND);
                    }
                    break;
                case '<':
                    if (Config.USE_WORD_BEGIN_END && syntax.opEscLtGtWordBeginEnd()) {
                        fetchTokenFor_anchor(AnchorType.WORD_BEGIN);
                    }
                    break;
                case '>':
                    if (Config.USE_WORD_BEGIN_END && syntax.opEscLtGtWordBeginEnd()) {
                        fetchTokenFor_anchor(AnchorType.WORD_END);
                    }
                    break;
                case 's':
                    if (syntax.opEscSWhiteSpace()) {
                        fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
                    }
                    break;
                case 'S':
                    if (syntax.opEscSWhiteSpace()) {
                        fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
                    }
                    break;
                case 'd':
                    if (syntax.opEscDDigit()) {
                        fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
                    }
                    break;
                case 'D':
                    if (syntax.opEscDDigit()) {
                        fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
                    }
                    break;
                case 'h':
                    if (syntax.op2EscHXDigit()) {
                        fetchTokenInCCFor_charType(false, CharacterType.XDIGIT);
                    }
                    break;
                case 'H':
                    if (syntax.op2EscHXDigit()) {
                        fetchTokenInCCFor_charType(true, CharacterType.XDIGIT);
                    }
                    break;
                case 'A':
                    if (syntax.opEscAZBufAnchor()) {
                        fetchTokenFor_anchor(AnchorType.BEGIN_BUF);
                    }
                    break;
                case 'Z':
                    if (syntax.opEscAZBufAnchor()) {
                        fetchTokenFor_anchor(AnchorType.SEMI_END_BUF);
                    }
                    break;
                case 'z':
                    if (syntax.opEscAZBufAnchor()) {
                        fetchTokenFor_anchor(AnchorType.END_BUF);
                    }
                    break;
                case 'G':
                    if (syntax.opEscCapitalGBeginAnchor()) {
                        fetchTokenFor_anchor(AnchorType.BEGIN_POSITION);
                    }
                    break;
                case '`':
                    if (syntax.op2EscGnuBufAnchor()) {
                        fetchTokenFor_anchor(AnchorType.BEGIN_BUF);
                    }
                    break;
                case '\'':
                    if (syntax.op2EscGnuBufAnchor()) {
                        fetchTokenFor_anchor(AnchorType.END_BUF);
                    }
                    break;
                case 'x':
                    fetchTokenFor_xBrace();
                    break;
                case 'u':
                    fetchTokenFor_uHex();
                    break;
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9':
                    fetchTokenFor_digit();
                    break;
                case '0':
                    fetchTokenFor_zero();
                    break;

                default:
                    // step back so fetchEscapedValue() re-reads the escaped character
                    unfetch();
                    final int num = fetchEscapedValue();

                    /* set_raw: */
                    if (token.getC() != num) {
                        token.type = TokenType.CODE_POINT;
                        token.setCode(num);
                    } else { /* string */
                        // the escape resolved to the character itself: resume right after it
                        p = token.backP + 1;
                    }
                    break;

                } // switch (c)

            } else {
                token.setC(c);
                token.escaped = false;

                if (Config.USE_VARIABLE_META_CHARS && (c != MetaChar.INEFFECTIVE_META_CHAR && syntax.opVariableMetaCharacters())) {
                    fetchTokenFor_metaChars();
                    break; // not inside a switch here: this leaves the scan loop
                }

                {
                    switch(c) {
                    case '.':
                        if (syntax.opDotAnyChar()) {
                            token.type = TokenType.ANYCHAR;
                        }
                        break;
                    case '*':
                        if (syntax.opAsteriskZeroInf()) {
                            fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
                        }
                        break;
                    case '+':
                        if (syntax.opPlusOneInf()) {
                            fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
                        }
                        break;
                    case '?':
                        if (syntax.opQMarkZeroOne()) {
                            fetchTokenFor_repeat(0, 1);
                        }
                        break;
                    case '{':
                        if (syntax.opBraceInterval()) {
                            fetchTokenFor_openBrace();
                        }
                        break;
                    case '|':
                        if (syntax.opVBarAlt()) {
                            token.type = TokenType.ALT;
                        }
                        break;

                    case '(':
                        if (peekIs('?') && syntax.op2QMarkGroupEffect()) {
                            inc();
                            if (peekIs('#')) {
                                // "(?#...)" comment group: skip up to the first unescaped ')'
                                fetch();
                                while (true) {
                                    if (!left()) {
                                        throw new SyntaxException(ERR_END_PATTERN_IN_GROUP);
                                    }
                                    fetch();
                                    if (c == syntax.metaCharTable.esc) {
                                        // skip the escaped character so an escaped ')' does not end the comment
                                        if (left()) {
                                            fetch();
                                        }
                                    } else {
                                        if (c == ')') {
                                            break;
                                        }
                                    }
                                }
                                continue start; // goto start
                            }
                            // not a comment group: step back onto the '?'
                            unfetch();
                        }

                        if (syntax.opLParenSubexp()) {
                            token.type = TokenType.SUBEXP_OPEN;
                        }
                        break;
                    case ')':
                        if (syntax.opLParenSubexp()) {
                            token.type = TokenType.SUBEXP_CLOSE;
                        }
                        break;
                    case '^':
                        if (syntax.opLineAnchor()) {
                            fetchTokenFor_anchor(isSingleline(env.option) ? AnchorType.BEGIN_BUF : AnchorType.BEGIN_LINE);
                        }
                        break;
                    case '$':
                        if (syntax.opLineAnchor()) {
                            fetchTokenFor_anchor(isSingleline(env.option) ? AnchorType.END_BUF : AnchorType.END_LINE);
                        }
                        break;
                    case '[':
                        if (syntax.opBracketCC()) {
                            token.type = TokenType.CC_CC_OPEN;
                        }
                        break;
                    case ']':
                        // a closing bracket outside a character class stays a plain character
                        //if (*src > env->pattern)   /* /].../ is allowed. */
                        //CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
                        break;
                    case '#':
                        if (Option.isExtend(env.option)) {
                            // extended mode: skip the comment through the next newline (or end of input)
                            while (left()) {
                                fetch();
                                if (EncodingHelper.isNewLine(c)) {
                                    break;
                                }
                            }
                            continue start; // goto start
                        }
                        break;

                    case ' ':
                    case '\t':
                    case '\n':
                    case '\r':
                    case '\f':
                        // extended mode: whitespace is skipped
                        if (Option.isExtend(env.option))
                         {
                            continue start; // goto start
                        }
                        break;

                    default: // string
                        break;

                    } // switch
                }
            }

            break;
        } // while
        return token.type;
    }
866 
greedyCheck()867     private void greedyCheck() {
868         if (left() && peekIs('?') && syntax.opQMarkNonGreedy()) {
869 
870             fetch();
871 
872             token.setRepeatGreedy(false);
873             token.setRepeatPossessive(false);
874         } else {
875             possessiveCheck();
876         }
877     }
878 
possessiveCheck()879     private void possessiveCheck() {
880         if (left() && peekIs('+') &&
881             (syntax.op2PlusPossessiveRepeat() && token.type != TokenType.INTERVAL ||
882              syntax.op2PlusPossessiveInterval() && token.type == TokenType.INTERVAL)) {
883 
884             fetch();
885 
886             token.setRepeatGreedy(true);
887             token.setRepeatPossessive(true);
888         } else {
889             token.setRepeatGreedy(true);
890             token.setRepeatPossessive(false);
891         }
892     }
893 
syntaxWarn(final String message, final char ch)894     protected final void syntaxWarn(final String message, final char ch) {
895         syntaxWarn(message.replace("<%n>", Character.toString(ch)));
896     }
897 
syntaxWarn(final String message)898     protected final void syntaxWarn(final String message) {
899         if (Config.USE_WARN) {
900             env.reg.warnings.warn(message + ": /" + new String(chars, getBegin(), getEnd()) + "/");
901         }
902     }
903 }
904