1 /*
2  * Copyright (c) 2010, 2019, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 package jdk.nashorn.internal.runtime.regexp;
27 
28 import java.util.HashMap;
29 import java.util.Iterator;
30 import java.util.LinkedList;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.regex.PatternSyntaxException;
34 import jdk.nashorn.internal.parser.Lexer;
35 import jdk.nashorn.internal.parser.Scanner;
36 import jdk.nashorn.internal.runtime.BitVector;
37 
38 /**
39  * Scan a JavaScript regexp, converting to Java regex if necessary.
40  *
41  */
42 final class RegExpScanner extends Scanner {
43 
44     /**
45      * String builder used to rewrite the pattern for the currently used regexp factory.
46      */
47     private final StringBuilder sb;
48 
49     /** Expected token table */
50     private final Map<Character, Integer> expected = new HashMap<>();
51 
52     /** Capturing parenthesis that have been found so far. */
53     private final List<Capture> caps = new LinkedList<>();
54 
55     /** Forward references to capturing parenthesis to be resolved later.*/
56     private final LinkedList<Integer> forwardReferences = new LinkedList<>();
57 
58     /** Current level of zero-width negative lookahead assertions. */
59     private int negLookaheadLevel;
60 
61     /** Sequential id of current top-level zero-width negative lookahead assertion. */
62     private int negLookaheadGroup;
63 
64     /** Are we currently inside a character class? */
65     private boolean inCharClass = false;
66 
67     /** Are we currently inside a negated character class? */
68     private boolean inNegativeClass = false;
69 
70     private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?-";
71 
72     private static class Capture {
73         /** Zero-width negative lookaheads enclosing the capture. */
74         private final int negLookaheadLevel;
75         /** Sequential id of top-level negative lookaheads containing the capture. */
76         private  final int negLookaheadGroup;
77 
Capture(final int negLookaheadGroup, final int negLookaheadLevel)78         Capture(final int negLookaheadGroup, final int negLookaheadLevel) {
79             this.negLookaheadGroup = negLookaheadGroup;
80             this.negLookaheadLevel = negLookaheadLevel;
81         }
82 
83         /**
84          * Returns true if this Capture can be referenced from the position specified by the
85          * group and level parameters. This is the case if either the group is not within
86          * a negative lookahead, or the position of the referrer is in the same negative lookahead.
87          *
88          * @param group current negative lookahead group
89          * @param level current negative lokahead level
90          * @return true if this capture group can be referenced from the given position
91          */
canBeReferencedFrom(final int group, final int level)92         boolean canBeReferencedFrom(final int group, final int level) {
93             return this.negLookaheadLevel == 0 || (group == this.negLookaheadGroup && level >= this.negLookaheadLevel);
94         }
95 
96     }
97 
98     /**
99      * Constructor
100      * @param string the JavaScript regexp to parse
101      */
RegExpScanner(final String string)102     private RegExpScanner(final String string) {
103         super(string);
104         sb = new StringBuilder(limit);
105         reset(0);
106         expected.put(']', 0);
107         expected.put('}', 0);
108     }
109 
processForwardReferences()110     private void processForwardReferences() {
111 
112         final Iterator<Integer> iterator = forwardReferences.descendingIterator();
113         while (iterator.hasNext()) {
114             final int pos = iterator.next();
115             final int num = iterator.next();
116             if (num > caps.size()) {
117                 // Non-existing backreference. If the number begins with a valid octal convert it to
118                 // Unicode escape and append the rest to a literal character sequence.
119                 final StringBuilder buffer = new StringBuilder();
120                 octalOrLiteral(Integer.toString(num), buffer);
121                 sb.insert(pos, buffer);
122             }
123         }
124 
125         forwardReferences.clear();
126     }
127 
128     /**
129      * Scan a JavaScript regexp string returning a Java safe regex string.
130      *
131      * @param string
132      *            JavaScript regexp string.
133      * @return Java safe regex string.
134      */
scan(final String string)135     public static RegExpScanner scan(final String string) {
136         final RegExpScanner scanner = new RegExpScanner(string);
137 
138         try {
139             scanner.disjunction();
140         } catch (final Exception e) {
141             throw new PatternSyntaxException(e.getMessage(), string, scanner.position);
142         }
143 
144         // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors
145         if (scanner.position != string.length()) {
146             final String p = scanner.getStringBuilder().toString();
147             throw new PatternSyntaxException(string, p, p.length() + 1);
148         }
149 
150         scanner.processForwardReferences();
151 
152         return scanner;
153     }
154 
getStringBuilder()155     final StringBuilder getStringBuilder() {
156         return sb;
157     }
158 
getJavaPattern()159     String getJavaPattern() {
160         return sb.toString();
161     }
162 
getGroupsInNegativeLookahead()163     BitVector getGroupsInNegativeLookahead() {
164         BitVector vec = null;
165         for (int i = 0; i < caps.size(); i++) {
166             final Capture cap = caps.get(i);
167             if (cap.negLookaheadLevel > 0) {
168                 if (vec == null) {
169                     vec = new BitVector(caps.size() + 1);
170                 }
171                 vec.set(i + 1);
172             }
173         }
174         return vec;
175     }
176 
177     /**
178      * Commit n characters to the builder and to a given token
179      * @param n     Number of characters.
180      * @return Committed token
181      */
commit(final int n)182     private boolean commit(final int n) {
183         switch (n) {
184         case 1:
185             sb.append(ch0);
186             skip(1);
187             break;
188         case 2:
189             sb.append(ch0);
190             sb.append(ch1);
191             skip(2);
192             break;
193         case 3:
194             sb.append(ch0);
195             sb.append(ch1);
196             sb.append(ch2);
197             skip(3);
198             break;
199         default:
200             assert false : "Should not reach here";
201         }
202 
203         return true;
204     }
205 
206     /**
207      * Restart the buffers back at an earlier position.
208      *
209      * @param startIn
210      *            Position in the input stream.
211      * @param startOut
212      *            Position in the output stream.
213      */
restart(final int startIn, final int startOut)214     private void restart(final int startIn, final int startOut) {
215         reset(startIn);
216         sb.setLength(startOut);
217     }
218 
push(final char ch)219     private void push(final char ch) {
220         expected.put(ch, expected.get(ch) + 1);
221     }
222 
pop(final char ch)223     private void pop(final char ch) {
224         expected.put(ch, Math.min(0, expected.get(ch) - 1));
225     }
226 
227     /*
228      * Recursive descent tokenizer starts below.
229      */
230 
231     /*
232      * Disjunction ::
233      *      Alternative
234      *      Alternative | Disjunction
235      */
disjunction()236     private void disjunction() {
237         while (true) {
238             alternative();
239 
240             if (ch0 == '|') {
241                 commit(1);
242             } else {
243                 break;
244             }
245         }
246     }
247 
248     /*
249      * Alternative ::
250      *      [empty]
251      *      Alternative Term
252      */
alternative()253     private void alternative() {
254         while (term()) {
255             // do nothing
256         }
257     }
258 
259     /*
260      * Term ::
261      *      Assertion
262      *      Atom
263      *      Atom Quantifier
264      */
term()265     private boolean term() {
266         final int startIn  = position;
267         final int startOut = sb.length();
268 
269         if (assertion()) {
270             return true;
271         }
272 
273         if (atom()) {
274             quantifier();
275             return true;
276         }
277 
278         restart(startIn, startOut);
279         return false;
280     }
281 
282     /*
283      * Assertion ::
284      *      ^
285      *      $
286      *      \b
287      *      \B
288      *      ( ? = Disjunction )
289      *      ( ? ! Disjunction )
290      */
assertion()291     private boolean assertion() {
292         final int startIn  = position;
293         final int startOut = sb.length();
294 
295         switch (ch0) {
296         case '^':
297         case '$':
298             return commit(1);
299 
300         case '\\':
301             if (ch1 == 'b' || ch1 == 'B') {
302                 return commit(2);
303             }
304             break;
305 
306         case '(':
307             if (ch1 != '?') {
308                 break;
309             }
310             if (ch2 != '=' && ch2 != '!') {
311                 break;
312             }
313             final boolean isNegativeLookahead = (ch2 == '!');
314             commit(3);
315 
316             if (isNegativeLookahead) {
317                 if (negLookaheadLevel == 0) {
318                     negLookaheadGroup++;
319                 }
320                 negLookaheadLevel++;
321             }
322             disjunction();
323             if (isNegativeLookahead) {
324                 negLookaheadLevel--;
325             }
326 
327             if (ch0 == ')') {
328                 return commit(1);
329             }
330             break;
331 
332         default:
333             break;
334         }
335 
336         restart(startIn, startOut);
337         return false;
338     }
339 
340     /*
341      * Quantifier ::
342      *      QuantifierPrefix
343      *      QuantifierPrefix ?
344      */
quantifier()345     private boolean quantifier() {
346         if (quantifierPrefix()) {
347             if (ch0 == '?') {
348                 commit(1);
349             }
350             return true;
351         }
352         return false;
353     }
354 
355     /*
356      * QuantifierPrefix ::
357      *      *
358      *      +
359      *      ?
360      *      { DecimalDigits }
361      *      { DecimalDigits , }
362      *      { DecimalDigits , DecimalDigits }
363      */
quantifierPrefix()364     private boolean quantifierPrefix() {
365         final int startIn  = position;
366         final int startOut = sb.length();
367 
368         switch (ch0) {
369         case '*':
370         case '+':
371         case '?':
372             return commit(1);
373 
374         case '{':
375             commit(1);
376 
377             if (!decimalDigits()) {
378                 break; // not a quantifier - back out
379             }
380             push('}');
381 
382             if (ch0 == ',') {
383                 commit(1);
384                 decimalDigits();
385             }
386 
387             if (ch0 == '}') {
388                 pop('}');
389                 commit(1);
390             } else {
391                 // Bad quantifier should be rejected but is accepted by all major engines
392                 restart(startIn, startOut);
393                 return false;
394             }
395 
396             return true;
397 
398         default:
399             break;
400         }
401 
402         restart(startIn, startOut);
403         return false;
404     }
405 
406     /*
407      * Atom ::
408      *      PatternCharacter
409      *      .
410      *      \ AtomEscape
411      *      CharacterClass
412      *      ( Disjunction )
413      *      ( ? : Disjunction )
414      *
415      */
atom()416     private boolean atom() {
417         final int startIn  = position;
418         final int startOut = sb.length();
419 
420         if (patternCharacter()) {
421             return true;
422         }
423 
424         if (ch0 == '.') {
425             return commit(1);
426         }
427 
428         if (ch0 == '\\') {
429             commit(1);
430 
431             if (atomEscape()) {
432                 return true;
433             }
434         }
435 
436         if (characterClass()) {
437             return true;
438         }
439 
440         if (ch0 == '(') {
441             commit(1);
442             if (ch0 == '?' && ch1 == ':') {
443                 commit(2);
444             } else {
445                 caps.add(new Capture(negLookaheadGroup, negLookaheadLevel));
446             }
447 
448             disjunction();
449 
450             if (ch0 == ')') {
451                 commit(1);
452                 return true;
453             }
454         }
455 
456         restart(startIn, startOut);
457         return false;
458     }
459 
460     /*
461      * PatternCharacter ::
462      *      SourceCharacter but not any of: ^$\.*+?()[]{}|
463      */
464     @SuppressWarnings("fallthrough")
patternCharacter()465     private boolean patternCharacter() {
466         if (atEOF()) {
467             return false;
468         }
469 
470         switch (ch0) {
471         case '^':
472         case '$':
473         case '\\':
474         case '.':
475         case '*':
476         case '+':
477         case '?':
478         case '(':
479         case ')':
480         case '[':
481         case '|':
482             return false;
483 
484         case '}':
485         case ']':
486             final int n = expected.get(ch0);
487             if (n != 0) {
488                 return false;
489             }
490 
491        case '{':
492            // if not a valid quantifier escape curly brace to match itself
493            // this ensures compatibility with other JS implementations
494            if (!quantifierPrefix()) {
495                sb.append('\\');
496                return commit(1);
497            }
498            return false;
499 
500         default:
501             return commit(1); // SOURCECHARACTER
502         }
503     }
504 
505     /*
506      * AtomEscape ::
507      *      DecimalEscape
508      *      CharacterEscape
509      *      CharacterClassEscape
510      */
atomEscape()511     private boolean atomEscape() {
512         // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
513         return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape();
514     }
515 
516     /*
517      * CharacterEscape ::
518      *      ControlEscape
519      *      c ControlLetter
520      *      HexEscapeSequence
521      *      UnicodeEscapeSequence
522      *      IdentityEscape
523      */
characterEscape()524     private boolean characterEscape() {
525         final int startIn  = position;
526         final int startOut = sb.length();
527 
528         if (controlEscape()) {
529             return true;
530         }
531 
532         if (ch0 == 'c') {
533             commit(1);
534             if (controlLetter()) {
535                 return true;
536             }
537             restart(startIn, startOut);
538         }
539 
540         if (hexEscapeSequence() || unicodeEscapeSequence()) {
541             return true;
542         }
543 
544         restart(startIn, startOut);
545         return false;
546     }
547 
scanEscapeSequence(final char leader, final int length)548     private boolean scanEscapeSequence(final char leader, final int length) {
549         final int startIn  = position;
550         final int startOut = sb.length();
551 
552         if (ch0 != leader) {
553             return false;
554         }
555 
556         commit(1);
557         for (int i = 0; i < length; i++) {
558             final char ch0l = Character.toLowerCase(ch0);
559             if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
560                 commit(1);
561             } else {
562                 restart(startIn, startOut);
563                 return false;
564             }
565         }
566 
567         return true;
568     }
569 
hexEscapeSequence()570     private boolean hexEscapeSequence() {
571         return scanEscapeSequence('x', 2);
572     }
573 
unicodeEscapeSequence()574     private boolean unicodeEscapeSequence() {
575         return scanEscapeSequence('u', 4);
576     }
577 
578     /*
579      * ControlEscape ::
580      *      one of fnrtv
581      */
controlEscape()582     private boolean controlEscape() {
583         switch (ch0) {
584         case 'f':
585         case 'n':
586         case 'r':
587         case 't':
588         case 'v':
589             return commit(1);
590 
591         default:
592             return false;
593         }
594     }
595 
596     /*
597      * ControlLetter ::
598      *      one of abcdefghijklmnopqrstuvwxyz
599      *      ABCDEFGHIJKLMNOPQRSTUVWXYZ
600      */
controlLetter()601     private boolean controlLetter() {
602         // To match other engines we also accept '0'..'9' and '_' as control letters inside a character class.
603         if ((ch0 >= 'A' && ch0 <= 'Z') || (ch0 >= 'a' && ch0 <= 'z')
604                 || (inCharClass && (isDecimalDigit(ch0) || ch0 == '_'))) {
605             // for some reason java regexps don't like control characters on the
606             // form "\\ca".match([string with ascii 1 at char0]). Translating
607             // them to unicode does it though.
608             sb.setLength(sb.length() - 1);
609             unicode(ch0 % 32, sb);
610             skip(1);
611             return true;
612         }
613         return false;
614     }
615 
616     /*
617      * IdentityEscape ::
618      *      SourceCharacter but not IdentifierPart
619      *      <ZWJ>  (200c)
620      *      <ZWNJ> (200d)
621      */
identityEscape()622     private boolean identityEscape() {
623         if (atEOF()) {
624             throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException
625         }
626         // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here.
627         if (ch0 == 'c') {
628             sb.append('\\'); // Treat invalid \c control sequence as \\c
629         } else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
630             sb.setLength(sb.length() - 1);
631         }
632         return commit(1);
633     }
634 
635     /*
636      * DecimalEscape ::
637      *      DecimalIntegerLiteral [lookahead DecimalDigit]
638      */
decimalEscape()639     private boolean decimalEscape() {
640         final int startIn  = position;
641         final int startOut = sb.length();
642 
643         if (ch0 == '0' && !isOctalDigit(ch1)) {
644             skip(1);
645             //  DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
646             sb.append("\u0000");
647             return true;
648         }
649 
650         if (isDecimalDigit(ch0)) {
651 
652             if (ch0 == '0') {
653                 // We know this is an octal escape.
654                 if (inCharClass) {
655                     // Convert octal escape to unicode escape if inside character class.
656                     int octalValue = 0;
657                     while (isOctalDigit(ch0)) {
658                         octalValue = octalValue * 8 + ch0 - '0';
659                         skip(1);
660                     }
661 
662                     unicode(octalValue, sb);
663 
664                 } else {
665                     // Copy decimal escape as-is
666                     decimalDigits();
667                 }
668             } else {
669                 // This should be a backreference, but could also be an octal escape or even a literal string.
670                 int decimalValue = 0;
671                 while (isDecimalDigit(ch0)) {
672                     decimalValue = decimalValue * 10 + ch0 - '0';
673                     skip(1);
674                 }
675 
676                 if (inCharClass) {
677                     // No backreferences in character classes. Encode as unicode escape or literal char sequence
678                     sb.setLength(sb.length() - 1);
679                     octalOrLiteral(Integer.toString(decimalValue), sb);
680 
681                 } else if (decimalValue <= caps.size()) {
682                     //  Captures inside a negative lookahead are undefined when referenced from the outside.
683                     final Capture capture = caps.get(decimalValue - 1);
684                     if (!capture.canBeReferencedFrom(negLookaheadGroup, negLookaheadLevel)) {
685                         // Outside reference to capture in negative lookahead, omit from output buffer.
686                         sb.setLength(sb.length() - 1);
687                     } else {
688                         // Append backreference to output buffer.
689                         sb.append(decimalValue);
690                     }
691                 } else {
692                     // Forward references to a capture group are always undefined so we can omit it from the output buffer.
693                     // However, if the target capture does not exist, we need to rewrite the reference as hex escape
694                     // or literal string, so register the reference for later processing.
695                     sb.setLength(sb.length() - 1);
696                     forwardReferences.add(decimalValue);
697                     forwardReferences.add(sb.length());
698                 }
699 
700             }
701             return true;
702         }
703 
704         restart(startIn, startOut);
705         return false;
706     }
707 
708     /*
709      * CharacterClassEscape ::
710      *  one of dDsSwW
711      */
characterClassEscape()712     private boolean characterClassEscape() {
713         switch (ch0) {
714         // java.util.regex requires translation of \s and \S to explicit character list
715         case 's':
716             if (RegExpFactory.usesJavaUtilRegex()) {
717                 sb.setLength(sb.length() - 1);
718                 // No nested class required if we already are inside a character class
719                 if (inCharClass) {
720                     sb.append(Lexer.getWhitespaceRegExp());
721                 } else {
722                     sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']');
723                 }
724                 skip(1);
725                 return true;
726             }
727             return commit(1);
728         case 'S':
729             if (RegExpFactory.usesJavaUtilRegex()) {
730                 sb.setLength(sb.length() - 1);
731                 // In negative class we must use intersection to get double negation ("not anything else than space")
732                 sb.append(inNegativeClass ? "&&[" : "[^").append(Lexer.getWhitespaceRegExp()).append(']');
733                 skip(1);
734                 return true;
735             }
736             return commit(1);
737         case 'd':
738         case 'D':
739         case 'w':
740         case 'W':
741             return commit(1);
742 
743         default:
744             return false;
745         }
746     }
747 
748     /*
749      * CharacterClass ::
750      *      [ [lookahead {^}] ClassRanges ]
751      *      [ ^ ClassRanges ]
752      */
characterClass()753     private boolean characterClass() {
754         final int startIn  = position;
755         final int startOut = sb.length();
756 
757         if (ch0 == '[') {
758             try {
759                 inCharClass = true;
760                 push(']');
761                 commit(1);
762 
763                 if (ch0 == '^') {
764                     inNegativeClass = true;
765                     commit(1);
766                 }
767 
768                 if (classRanges() && ch0 == ']') {
769                     pop(']');
770                     commit(1);
771 
772                     // Substitute empty character classes [] and [^] that never or always match
773                     if (position == startIn + 2) {
774                         sb.setLength(sb.length() - 1);
775                         sb.append("^\\s\\S]");
776                     } else if (position == startIn + 3 && inNegativeClass) {
777                         sb.setLength(sb.length() - 2);
778                         sb.append("\\s\\S]");
779                     }
780 
781                     return true;
782                 }
783             } finally {
784                 inCharClass = false;  // no nested character classes in JavaScript
785                 inNegativeClass = false;
786             }
787         }
788 
789         restart(startIn, startOut);
790         return false;
791     }
792 
793     /*
794      * ClassRanges ::
795      *      [empty]
796      *      NonemptyClassRanges
797      */
classRanges()798     private boolean classRanges() {
799         nonemptyClassRanges();
800         return true;
801     }
802 
803     /*
804      * NonemptyClassRanges ::
805      *      ClassAtom
806      *      ClassAtom NonemptyClassRangesNoDash
807      *      ClassAtom - ClassAtom ClassRanges
808      */
nonemptyClassRanges()809     private boolean nonemptyClassRanges() {
810         final int startIn  = position;
811         final int startOut = sb.length();
812 
813         if (classAtom()) {
814 
815             if (ch0 == '-') {
816                 commit(1);
817 
818                 if (classAtom() && classRanges()) {
819                     return true;
820                 }
821             }
822 
823             nonemptyClassRangesNoDash();
824 
825             return true;
826         }
827 
828         restart(startIn, startOut);
829         return false;
830     }
831 
832     /*
833      * NonemptyClassRangesNoDash ::
834      *      ClassAtom
835      *      ClassAtomNoDash NonemptyClassRangesNoDash
836      *      ClassAtomNoDash - ClassAtom ClassRanges
837      */
nonemptyClassRangesNoDash()838     private boolean nonemptyClassRangesNoDash() {
839         final int startIn  = position;
840         final int startOut = sb.length();
841 
842         if (classAtomNoDash()) {
843 
844             // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
845             if (ch0 == '-') {
846                commit(1);
847 
848                if (classAtom() && classRanges()) {
849                    return true;
850                }
851                //fallthru
852            }
853 
854             nonemptyClassRangesNoDash();
855             return true; // still a class atom
856         }
857 
858         if (classAtom()) {
859             return true;
860         }
861 
862         restart(startIn, startOut);
863         return false;
864     }
865 
866     /*
867      * ClassAtom : - ClassAtomNoDash
868      */
classAtom()869     private boolean classAtom() {
870 
871         if (ch0 == '-') {
872             return commit(1);
873         }
874 
875         return classAtomNoDash();
876     }
877 
878     /*
879      * ClassAtomNoDash ::
880      *      SourceCharacter but not one of \ or ] or -
881      *      \ ClassEscape
882      */
classAtomNoDash()883     private boolean classAtomNoDash() {
884         if (atEOF()) {
885             return false;
886         }
887         final int startIn  = position;
888         final int startOut = sb.length();
889 
890         switch (ch0) {
891         case ']':
892         case '-':
893             return false;
894 
895         case '[':
896             // unescaped left square bracket - add escape
897             sb.append('\\');
898             return commit(1);
899 
900         case '\\':
901             commit(1);
902             if (classEscape()) {
903                 return true;
904             }
905 
906             restart(startIn, startOut);
907             return false;
908 
909         default:
910             return commit(1);
911         }
912     }
913 
914     /*
915      * ClassEscape ::
916      *      DecimalEscape
917      *      b
918      *      CharacterEscape
919      *      CharacterClassEscape
920      */
classEscape()921     private boolean classEscape() {
922 
923         if (decimalEscape()) {
924             return true;
925         }
926 
927         if (ch0 == 'b') {
928             sb.setLength(sb.length() - 1);
929             sb.append('\b');
930             skip(1);
931             return true;
932         }
933 
934         // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
935         return characterEscape() || characterClassEscape() || identityEscape();
936     }
937 
938     /*
939      * DecimalDigits
940      */
decimalDigits()941     private boolean decimalDigits() {
942         if (!isDecimalDigit(ch0)) {
943             return false;
944         }
945 
946         while (isDecimalDigit(ch0)) {
947             commit(1);
948         }
949 
950         return true;
951     }
952 
unicode(final int value, final StringBuilder buffer)953     private static void unicode(final int value, final StringBuilder buffer) {
954         final String hex = Integer.toHexString(value);
955         buffer.append('u');
956         for (int i = 0; i < 4 - hex.length(); i++) {
957             buffer.append('0');
958         }
959         buffer.append(hex);
960     }
961 
962     // Convert what would have been a backreference into a unicode escape, or a number literal, or both.
octalOrLiteral(final String numberLiteral, final StringBuilder buffer)963     private static void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) {
964         final int length = numberLiteral.length();
965         int octalValue = 0;
966         int pos = 0;
967         // Maximum value for octal escape is 0377 (255) so we stop the loop at 32
968         while (pos < length && octalValue < 0x20) {
969             final char ch = numberLiteral.charAt(pos);
970             if (isOctalDigit(ch)) {
971                 octalValue = octalValue * 8 + ch - '0';
972             } else {
973                 break;
974             }
975             pos++;
976         }
977         if (octalValue > 0) {
978             buffer.append('\\');
979             unicode(octalValue, buffer);
980             buffer.append(numberLiteral.substring(pos));
981         } else {
982             buffer.append(numberLiteral);
983         }
984     }
985 
isOctalDigit(final char ch)986     private static boolean isOctalDigit(final char ch) {
987         return ch >= '0' && ch <= '7';
988     }
989 
isDecimalDigit(final char ch)990     private static boolean isDecimalDigit(final char ch) {
991         return ch >= '0' && ch <= '9';
992     }
993 }
994