1 /*
2  * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
3  */
4 /*
5  * Licensed to the Apache Software Foundation (ASF) under one or more
6  * contributor license agreements.  See the NOTICE file distributed with
7  * this work for additional information regarding copyright ownership.
8  * The ASF licenses this file to You under the Apache License, Version 2.0
9  * (the "License"); you may not use this file except in compliance with
10  * the License.  You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */
20 
21 package com.sun.org.apache.xerces.internal.impl.xpath.regex;
22 
23 import java.io.IOException;
24 import java.io.ObjectInputStream;
25 import java.io.ObjectOutputStream;
26 import java.io.ObjectStreamField;
27 import java.util.ArrayList;
28 import java.util.Collections;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.Set;
34 import java.util.Vector;
35 
36 /**
 * This class represents a node in the parse tree of a regular expression.
38  *
39  * @xerces.internal
40  * @LastModified: May 2018
41  */
42 class Token implements java.io.Serializable {
43 
44     private static final long serialVersionUID = 8484976002585487481L;
45 
46     static final boolean COUNTTOKENS = true;
47     static int tokens = 0;
48 
49     static final int CHAR = 0;                  // Literal char
50     static final int DOT = 11;                  // .
51     static final int CONCAT = 1;                // XY
52     static final int UNION = 2;                 // X|Y|Z
53     static final int CLOSURE = 3;               // X*
54     static final int RANGE = 4;                 // [a-zA-Z] etc.
55     static final int NRANGE = 5;                // [^a-zA-Z] etc.
56     static final int PAREN = 6;                 // (X) or (?:X)
57     static final int EMPTY = 7;                 //
58     static final int ANCHOR = 8;                // ^ $ \b \B \< \> \A \Z \z
59     static final int NONGREEDYCLOSURE = 9;      // *? +?
60     static final int STRING = 10;               // strings
61     static final int BACKREFERENCE = 12;        // back references
62     static final int LOOKAHEAD = 20;            // (?=...)
63     static final int NEGATIVELOOKAHEAD = 21;    // (?!...)
64     static final int LOOKBEHIND = 22;           // (?<=...)
65     static final int NEGATIVELOOKBEHIND = 23;   // (?<!...)
66     static final int INDEPENDENT = 24;          // (?>...)
67     static final int MODIFIERGROUP = 25;        // (?ims-ims:...)
68     static final int CONDITION = 26;            // (?(...)yes|no)
69 
70     static final int UTF16_MAX = 0x10ffff;
71 
72     final int type;
73 
74     static Token token_dot;
75     static Token token_0to9;
76     static Token token_wordchars;
77     static Token token_not_0to9;
78     static Token token_not_wordchars;
79     static Token token_spaces;
80     static Token token_not_spaces;
81     static Token token_empty;
82     static Token token_linebeginning;
83     static Token token_linebeginning2;
84     static Token token_lineend;
85     static Token token_stringbeginning;
86     static Token token_stringend;
87     static Token token_stringend2;
88     static Token token_wordedge;
89     static Token token_not_wordedge;
90     static Token token_wordbeginning;
91     static Token token_wordend;
92     static {
93         Token.token_empty = new Token(Token.EMPTY);
94 
95         Token.token_linebeginning = Token.createAnchor('^');
96         Token.token_linebeginning2 = Token.createAnchor('@');
97         Token.token_lineend = Token.createAnchor('$');
98         Token.token_stringbeginning = Token.createAnchor('A');
99         Token.token_stringend = Token.createAnchor('z');
100         Token.token_stringend2 = Token.createAnchor('Z');
101         Token.token_wordedge = Token.createAnchor('b');
102         Token.token_not_wordedge = Token.createAnchor('B');
103         Token.token_wordbeginning = Token.createAnchor('<');
104         Token.token_wordend = Token.createAnchor('>');
105 
106         Token.token_dot = new Token(Token.DOT);
107 
108         Token.token_0to9 = Token.createRange();
109         Token.token_0to9.addRange('0', '9');
110         Token.token_wordchars = Token.createRange();
111         Token.token_wordchars.addRange('0', '9');
112         Token.token_wordchars.addRange('A', 'Z');
113         Token.token_wordchars.addRange('_', '_');
114         Token.token_wordchars.addRange('a', 'z');
115         Token.token_spaces = Token.createRange();
116         Token.token_spaces.addRange('\t', '\t');
117         Token.token_spaces.addRange('\n', '\n');
118         Token.token_spaces.addRange('\f', '\f');
119         Token.token_spaces.addRange('\r', '\r');
120         Token.token_spaces.addRange(' ', ' ');
121 
122         Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
123         Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
124         Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
125     }
126 
createLook(int type, Token child)127     static Token.ParenToken createLook(int type, Token child) {
128         if (COUNTTOKENS)  Token.tokens ++;
129         return new Token.ParenToken(type, child, 0);
130     }
createParen(Token child, int pnumber)131     static Token.ParenToken createParen(Token child, int pnumber) {
132         if (COUNTTOKENS)  Token.tokens ++;
133         return new Token.ParenToken(Token.PAREN, child, pnumber);
134     }
createClosure(Token tok)135     static Token.ClosureToken createClosure(Token tok) {
136         if (COUNTTOKENS)  Token.tokens ++;
137         return new Token.ClosureToken(Token.CLOSURE, tok);
138     }
createNGClosure(Token tok)139     static Token.ClosureToken createNGClosure(Token tok) {
140         if (COUNTTOKENS)  Token.tokens ++;
141         return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
142     }
createConcat(Token tok1, Token tok2)143     static Token.ConcatToken createConcat(Token tok1, Token tok2) {
144         if (COUNTTOKENS)  Token.tokens ++;
145         return new Token.ConcatToken(tok1, tok2);
146     }
createConcat()147     static Token.UnionToken createConcat() {
148         if (COUNTTOKENS)  Token.tokens ++;
149         return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
150     }
createUnion()151     static Token.UnionToken createUnion() {
152         if (COUNTTOKENS)  Token.tokens ++;
153         return new Token.UnionToken(Token.UNION);
154     }
createEmpty()155     static Token createEmpty() {
156         return Token.token_empty;
157     }
createRange()158     static RangeToken createRange() {
159         if (COUNTTOKENS)  Token.tokens ++;
160         return new RangeToken(Token.RANGE);
161     }
createNRange()162     static RangeToken createNRange() {
163         if (COUNTTOKENS)  Token.tokens ++;
164         return new RangeToken(Token.NRANGE);
165     }
createChar(int ch)166     static Token.CharToken createChar(int ch) {
167         if (COUNTTOKENS)  Token.tokens ++;
168         return new Token.CharToken(Token.CHAR, ch);
169     }
createAnchor(int ch)170     static private Token.CharToken createAnchor(int ch) {
171         if (COUNTTOKENS)  Token.tokens ++;
172         return new Token.CharToken(Token.ANCHOR, ch);
173     }
createBackReference(int refno)174     static Token.StringToken createBackReference(int refno) {
175         if (COUNTTOKENS)  Token.tokens ++;
176         return new Token.StringToken(Token.BACKREFERENCE, null, refno);
177     }
createString(String str)178     static Token.StringToken createString(String str) {
179         if (COUNTTOKENS)  Token.tokens ++;
180         return new Token.StringToken(Token.STRING, str, 0);
181     }
createModifierGroup(Token child, int add, int mask)182     static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
183         if (COUNTTOKENS)  Token.tokens ++;
184         return new Token.ModifierToken(child, add, mask);
185     }
createCondition(int refno, Token condition, Token yespat, Token nopat)186     static Token.ConditionToken createCondition(int refno, Token condition,
187                                                 Token yespat, Token nopat) {
188         if (COUNTTOKENS)  Token.tokens ++;
189         return new Token.ConditionToken(refno, condition, yespat, nopat);
190     }
191 
Token(int type)192     protected Token(int type) {
193         this.type = type;
194     }
195 
196     /**
197      * A number of children.
198      */
size()199     int size() {
200         return 0;
201     }
getChild(int index)202     Token getChild(int index) {
203         return null;
204     }
addChild(Token tok)205     void addChild(Token tok) {
206         throw new RuntimeException("Not supported.");
207     }
208 
209                                                 // for RANGE or NRANGE
addRange(int start, int end)210     protected void addRange(int start, int end) {
211         throw new RuntimeException("Not supported.");
212     }
sortRanges()213     protected void sortRanges() {
214         throw new RuntimeException("Not supported.");
215     }
compactRanges()216     protected void compactRanges() {
217         throw new RuntimeException("Not supported.");
218     }
mergeRanges(Token tok)219     protected void mergeRanges(Token tok) {
220         throw new RuntimeException("Not supported.");
221     }
subtractRanges(Token tok)222     protected void subtractRanges(Token tok) {
223         throw new RuntimeException("Not supported.");
224     }
intersectRanges(Token tok)225     protected void intersectRanges(Token tok) {
226         throw new RuntimeException("Not supported.");
227     }
complementRanges(Token tok)228     static Token complementRanges(Token tok) {
229         return RangeToken.complementRanges(tok);
230     }
231 
232 
setMin(int min)233     void setMin(int min) {                      // for CLOSURE
234     }
setMax(int max)235     void setMax(int max) {                      // for CLOSURE
236     }
getMin()237     int getMin() {                              // for CLOSURE
238         return -1;
239     }
getMax()240     int getMax() {                              // for CLOSURE
241         return -1;
242     }
getReferenceNumber()243     int getReferenceNumber() {                  // for STRING
244         return 0;
245     }
getString()246     String getString() {                        // for STRING
247         return null;
248     }
249 
getParenNumber()250     int getParenNumber() {
251         return 0;
252     }
getChar()253     int getChar() {
254         return -1;
255     }
256 
toString()257     public String toString() {
258         return this.toString(0);
259     }
toString(int options)260     public String toString(int options) {
261         return this.type == Token.DOT ? "." : "";
262     }
263 
264     /**
265      * How many characters are needed?
266      */
getMinLength()267     final int getMinLength() {
268         switch (this.type) {
269           case CONCAT:
270             int sum = 0;
271             for (int i = 0;  i < this.size();  i ++)
272                 sum += this.getChild(i).getMinLength();
273             return sum;
274 
275           case CONDITION:
276           case UNION:
277             if (this.size() == 0)
278                 return 0;
279             int ret = this.getChild(0).getMinLength();
280             for (int i = 1;  i < this.size();  i ++) {
281                 int min = this.getChild(i).getMinLength();
282                 if (min < ret)  ret = min;
283             }
284             return ret;
285 
286           case CLOSURE:
287           case NONGREEDYCLOSURE:
288             if (this.getMin() >= 0)
289                 return this.getMin() * this.getChild(0).getMinLength();
290             return 0;
291 
292           case EMPTY:
293           case ANCHOR:
294             return 0;
295 
296           case DOT:
297           case CHAR:
298           case RANGE:
299           case NRANGE:
300             return 1;
301 
302           case INDEPENDENT:
303           case PAREN:
304           case MODIFIERGROUP:
305             return this.getChild(0).getMinLength();
306 
307           case BACKREFERENCE:
308             return 0;                           // *******
309 
310           case STRING:
311             return this.getString().length();
312 
313           case LOOKAHEAD:
314           case NEGATIVELOOKAHEAD:
315           case LOOKBEHIND:
316           case NEGATIVELOOKBEHIND:
317             return 0;                           // ***** Really?
318 
319           default:
320             throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
321         }
322     }
323 
getMaxLength()324     final int getMaxLength() {
325         switch (this.type) {
326           case CONCAT:
327             int sum = 0;
328             for (int i = 0;  i < this.size();  i ++) {
329                 int d = this.getChild(i).getMaxLength();
330                 if (d < 0)  return -1;
331                 sum += d;
332             }
333             return sum;
334 
335           case CONDITION:
336           case UNION:
337             if (this.size() == 0)
338                 return 0;
339             int ret = this.getChild(0).getMaxLength();
340             for (int i = 1;  ret >= 0 && i < this.size();  i ++) {
341                 int max = this.getChild(i).getMaxLength();
342                 if (max < 0) {                  // infinity
343                     ret = -1;
344                     break;
345                 }
346                 if (max > ret)  ret = max;
347             }
348             return ret;
349 
350           case CLOSURE:
351           case NONGREEDYCLOSURE:
352             if (this.getMax() >= 0)
353                                                 // When this.child.getMaxLength() < 0,
354                                                 // this returns minus value
355                 return this.getMax() * this.getChild(0).getMaxLength();
356             return -1;
357 
358           case EMPTY:
359           case ANCHOR:
360             return 0;
361 
362           case CHAR:
363             return 1;
364           case DOT:
365           case RANGE:
366           case NRANGE:
367             return 2;
368 
369           case INDEPENDENT:
370           case PAREN:
371           case MODIFIERGROUP:
372             return this.getChild(0).getMaxLength();
373 
374           case BACKREFERENCE:
375             return -1;                          // ******
376 
377           case STRING:
378             return this.getString().length();
379 
380           case LOOKAHEAD:
381           case NEGATIVELOOKAHEAD:
382           case LOOKBEHIND:
383           case NEGATIVELOOKBEHIND:
384             return 0;                           // ***** Really?
385 
386           default:
387             throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
388         }
389     }
390 
391     static final int FC_CONTINUE = 0;
392     static final int FC_TERMINAL = 1;
393     static final int FC_ANY = 2;
isSet(int options, int flag)394     private static final boolean isSet(int options, int flag) {
395         return (options & flag) == flag;
396     }
analyzeFirstCharacter(RangeToken result, int options)397     final int analyzeFirstCharacter(RangeToken result, int options) {
398         switch (this.type) {
399           case CONCAT:
400             int ret = FC_CONTINUE;
401             for (int i = 0;  i < this.size();  i ++)
402                 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
403                     break;
404             return ret;
405 
406           case UNION:
407             if (this.size() == 0)
408                 return FC_CONTINUE;
409             /*
410              *  a|b|c -> FC_TERMINAL
411              *  a|.|c -> FC_ANY
412              *  a|b|  -> FC_CONTINUE
413              */
414             int ret2 = FC_CONTINUE;
415             boolean hasEmpty = false;
416             for (int i = 0;  i < this.size();  i ++) {
417                 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
418                 if (ret2 == FC_ANY)
419                     break;
420                 else if (ret2 == FC_CONTINUE)
421                     hasEmpty = true;
422             }
423             return hasEmpty ? FC_CONTINUE : ret2;
424 
425           case CONDITION:
426             int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
427             if (this.size() == 1)  return FC_CONTINUE;
428             if (ret3 == FC_ANY)  return ret3;
429             int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
430             if (ret4 == FC_ANY)  return ret4;
431             return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
432 
433           case CLOSURE:
434           case NONGREEDYCLOSURE:
435             this.getChild(0).analyzeFirstCharacter(result, options);
436             return FC_CONTINUE;
437 
438           case EMPTY:
439           case ANCHOR:
440             return FC_CONTINUE;
441 
442           case CHAR:
443             int ch = this.getChar();
444             result.addRange(ch, ch);
445             if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
446                 ch = Character.toUpperCase((char)ch);
447                 result.addRange(ch, ch);
448                 ch = Character.toLowerCase((char)ch);
449                 result.addRange(ch, ch);
450             }
451             return FC_TERMINAL;
452 
453           case DOT:
454               return FC_ANY;
455 
456           case RANGE:
457             result.mergeRanges(this);
458             return FC_TERMINAL;
459 
460           case NRANGE:                          // ****
461             result.mergeRanges(Token.complementRanges(this));
462             return FC_TERMINAL;
463 
464           case INDEPENDENT:
465           case PAREN:
466             return this.getChild(0).analyzeFirstCharacter(result, options);
467 
468           case MODIFIERGROUP:
469             options |= ((ModifierToken)this).getOptions();
470             options &= ~((ModifierToken)this).getOptionsMask();
471             return this.getChild(0).analyzeFirstCharacter(result, options);
472 
473           case BACKREFERENCE:
474             result.addRange(0, UTF16_MAX);  // **** We can not optimize.
475             return FC_ANY;
476 
477           case STRING:
478             int cha = this.getString().charAt(0);
479             int ch2;
480             if (REUtil.isHighSurrogate(cha)
481                 && this.getString().length() >= 2
482                 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
483                 cha = REUtil.composeFromSurrogates(cha, ch2);
484             result.addRange(cha, cha);
485             if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
486                 cha = Character.toUpperCase((char)cha);
487                 result.addRange(cha, cha);
488                 cha = Character.toLowerCase((char)cha);
489                 result.addRange(cha, cha);
490             }
491             return FC_TERMINAL;
492 
493           case LOOKAHEAD:
494           case NEGATIVELOOKAHEAD:
495           case LOOKBEHIND:
496           case NEGATIVELOOKBEHIND:
497             return FC_CONTINUE;
498 
499           default:
500             throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
501         }
502     }
503 
isShorterThan(Token tok)504     private final boolean isShorterThan(Token tok) {
505         if (tok == null)  return false;
506         /*
507         int mylength;
508         if (this.type == STRING)  mylength = this.getString().length();
509         else if (this.type == CHAR)  mylength = this.getChar() >= 0x10000 ? 2 : 1;
510         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
511         int otherlength;
512         if (tok.type == STRING)  otherlength = tok.getString().length();
513         else if (tok.type == CHAR)  otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
514         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
515         */
516         int mylength;
517         if (this.type == STRING)  mylength = this.getString().length();
518         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
519         int otherlength;
520         if (tok.type == STRING)  otherlength = tok.getString().length();
521         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
522         return mylength < otherlength;
523     }
524 
525     static class FixedStringContainer {
526         Token token = null;
527         int options = 0;
FixedStringContainer()528         FixedStringContainer() {
529         }
530     }
531 
findFixedString(FixedStringContainer container, int options)532     final void findFixedString(FixedStringContainer container, int options) {
533         switch (this.type) {
534           case CONCAT:
535             Token prevToken = null;
536             int prevOptions = 0;
537             for (int i = 0;  i < this.size();  i ++) {
538                 this.getChild(i).findFixedString(container, options);
539                 if (prevToken == null || prevToken.isShorterThan(container.token)) {
540                     prevToken = container.token;
541                     prevOptions = container.options;
542                 }
543             }
544             container.token = prevToken;
545             container.options = prevOptions;
546             return;
547 
548           case UNION:
549           case CLOSURE:
550           case NONGREEDYCLOSURE:
551           case EMPTY:
552           case ANCHOR:
553           case RANGE:
554           case DOT:
555           case NRANGE:
556           case BACKREFERENCE:
557           case LOOKAHEAD:
558           case NEGATIVELOOKAHEAD:
559           case LOOKBEHIND:
560           case NEGATIVELOOKBEHIND:
561           case CONDITION:
562             container.token = null;
563             return;
564 
565           case CHAR:                            // Ignore CHAR tokens.
566             container.token = null;             // **
567             return;                             // **
568 
569           case STRING:
570             container.token = this;
571             container.options = options;
572             return;
573 
574           case INDEPENDENT:
575           case PAREN:
576             this.getChild(0).findFixedString(container, options);
577             return;
578 
579           case MODIFIERGROUP:
580             options |= ((ModifierToken)this).getOptions();
581             options &= ~((ModifierToken)this).getOptionsMask();
582             this.getChild(0).findFixedString(container, options);
583             return;
584 
585           default:
586             throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
587         }
588     }
589 
match(int ch)590     boolean match(int ch) {
591         throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
592     }
593 
594     // ------------------------------------------------------
595     private static volatile Map<String, Token> categories = null;
596     private static volatile Map<String, Token> categories2 = null;
597     private static final Object lock = new Object();
598     private static final String[] categoryNames = {
599         "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
600         "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
601         "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
602         "Pi", "Pf",  // 29, 30
603         "L", "M", "N", "Z", "C", "P", "S",      // 31-37
604     };
605 
606     // Schema Rec. {Datatypes} - Punctuation
607     static final int CHAR_INIT_QUOTE  = 29;     // Pi - initial quote
608     static final int CHAR_FINAL_QUOTE = 30;     // Pf - final quote
609     static final int CHAR_LETTER = 31;
610     static final int CHAR_MARK = 32;
611     static final int CHAR_NUMBER = 33;
612     static final int CHAR_SEPARATOR = 34;
613     static final int CHAR_OTHER = 35;
614     static final int CHAR_PUNCTUATION = 36;
615     static final int CHAR_SYMBOL = 37;
616 
    // Block names in Unicode 3.1 that are supported by the XML Schema REC
618     private static final String[] blockNames = {
619         /*0000..007F;*/ "Basic Latin",
620         /*0080..00FF;*/ "Latin-1 Supplement",
621         /*0100..017F;*/ "Latin Extended-A",
622         /*0180..024F;*/ "Latin Extended-B",
623         /*0250..02AF;*/ "IPA Extensions",
624         /*02B0..02FF;*/ "Spacing Modifier Letters",
625         /*0300..036F;*/ "Combining Diacritical Marks",
626         /*0370..03FF;*/ "Greek",
627         /*0400..04FF;*/ "Cyrillic",
628         /*0530..058F;*/ "Armenian",
629         /*0590..05FF;*/ "Hebrew",
630         /*0600..06FF;*/ "Arabic",
631         /*0700..074F;*/ "Syriac",
632         /*0780..07BF;*/ "Thaana",
633         /*0900..097F;*/ "Devanagari",
634         /*0980..09FF;*/ "Bengali",
635         /*0A00..0A7F;*/ "Gurmukhi",
636         /*0A80..0AFF;*/ "Gujarati",
637         /*0B00..0B7F;*/ "Oriya",
638         /*0B80..0BFF;*/ "Tamil",
639         /*0C00..0C7F;*/ "Telugu",
640         /*0C80..0CFF;*/ "Kannada",
641         /*0D00..0D7F;*/ "Malayalam",
642         /*0D80..0DFF;*/ "Sinhala",
643         /*0E00..0E7F;*/ "Thai",
644         /*0E80..0EFF;*/ "Lao",
645         /*0F00..0FFF;*/ "Tibetan",
646         /*1000..109F;*/ "Myanmar",
647         /*10A0..10FF;*/ "Georgian",
648         /*1100..11FF;*/ "Hangul Jamo",
649         /*1200..137F;*/ "Ethiopic",
650         /*13A0..13FF;*/ "Cherokee",
651         /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
652         /*1680..169F;*/ "Ogham",
653         /*16A0..16FF;*/ "Runic",
654         /*1780..17FF;*/ "Khmer",
655         /*1800..18AF;*/ "Mongolian",
656         /*1E00..1EFF;*/ "Latin Extended Additional",
657         /*1F00..1FFF;*/ "Greek Extended",
658         /*2000..206F;*/ "General Punctuation",
659         /*2070..209F;*/ "Superscripts and Subscripts",
660         /*20A0..20CF;*/ "Currency Symbols",
661         /*20D0..20FF;*/ "Combining Marks for Symbols",
662         /*2100..214F;*/ "Letterlike Symbols",
663         /*2150..218F;*/ "Number Forms",
664         /*2190..21FF;*/ "Arrows",
665         /*2200..22FF;*/ "Mathematical Operators",
666         /*2300..23FF;*/ "Miscellaneous Technical",
667         /*2400..243F;*/ "Control Pictures",
668         /*2440..245F;*/ "Optical Character Recognition",
669         /*2460..24FF;*/ "Enclosed Alphanumerics",
670         /*2500..257F;*/ "Box Drawing",
671         /*2580..259F;*/ "Block Elements",
672         /*25A0..25FF;*/ "Geometric Shapes",
673         /*2600..26FF;*/ "Miscellaneous Symbols",
674         /*2700..27BF;*/ "Dingbats",
675         /*2800..28FF;*/ "Braille Patterns",
676         /*2E80..2EFF;*/ "CJK Radicals Supplement",
677         /*2F00..2FDF;*/ "Kangxi Radicals",
678         /*2FF0..2FFF;*/ "Ideographic Description Characters",
679         /*3000..303F;*/ "CJK Symbols and Punctuation",
680         /*3040..309F;*/ "Hiragana",
681         /*30A0..30FF;*/ "Katakana",
682         /*3100..312F;*/ "Bopomofo",
683         /*3130..318F;*/ "Hangul Compatibility Jamo",
684         /*3190..319F;*/ "Kanbun",
685         /*31A0..31BF;*/ "Bopomofo Extended",
686         /*3200..32FF;*/ "Enclosed CJK Letters and Months",
687         /*3300..33FF;*/ "CJK Compatibility",
688         /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
689         /*4E00..9FFF;*/ "CJK Unified Ideographs",
690         /*A000..A48F;*/ "Yi Syllables",
691         /*A490..A4CF;*/ "Yi Radicals",
692         /*AC00..D7A3;*/ "Hangul Syllables",
693         /*E000..F8FF;*/ "Private Use",
694         /*F900..FAFF;*/ "CJK Compatibility Ideographs",
695         /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
696         /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
697         /*FE20..FE2F;*/ "Combining Half Marks",
698         /*FE30..FE4F;*/ "CJK Compatibility Forms",
699         /*FE50..FE6F;*/ "Small Form Variants",
700         /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
701         /*FEFF..FEFF;*/ "Specials",
702         /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
703          //missing Specials add manually
704         /*10300..1032F;*/ "Old Italic",         // 84
705         /*10330..1034F;*/ "Gothic",
706         /*10400..1044F;*/ "Deseret",
707         /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
708         /*1D100..1D1FF;*/ "Musical Symbols",
709         /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
710         /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
711         /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
712         /*E0000..E007F;*/ "Tags",
713         //missing 2 private use add manually
714 
715     };
716     //ADD THOSE MANUALLY
717     //F0000..FFFFD; "Private Use",
718     //100000..10FFFD; "Private Use"
719     //FFF0..FFFD; "Specials",
720     static final String blockRanges =
721        "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
722         +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
723         +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
724         +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
725         +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
726         +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
727         +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
728         +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
729         +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
730         +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
731         +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
732     static final int[] nonBMPBlockRanges = {
733         0x10300, 0x1032F,       // 84
734         0x10330, 0x1034F,
735         0x10400, 0x1044F,
736         0x1D000, 0x1D0FF,
737         0x1D100, 0x1D1FF,
738         0x1D400, 0x1D7FF,
739         0x20000, 0x2A6D6,
740         0x2F800, 0x2FA1F,
741         0xE0000, 0xE007F
742     };
743     private static final int NONBMP_BLOCK_START = 84;
744 
getRange(String name, boolean positive)745     static protected RangeToken getRange(String name, boolean positive) {
746         // use local variable for better performance
747         Map<String, Token> localCat = Token.categories;
748         if (localCat == null) {
749             synchronized (lock) {
750                 localCat = Token.categories;
751                 if (localCat == null) {
752                     Map<String, Token> tmpCat = new HashMap<>();
753                     Map<String, Token> tmpCat2 = new HashMap<>();
754 
755                     Token[] ranges = new Token[Token.categoryNames.length];
756                     for (int i = 0;  i < ranges.length;  i ++) {
757                         ranges[i] = Token.createRange();
758                     }
759                     int type;
760                     for (int i = 0;  i < 0x10000;  i ++) {
761                         type = Character.getType((char)i);
762                         if (type == Character.START_PUNCTUATION ||
763                             type == Character.END_PUNCTUATION) {
764                             //build table of Pi values
765                             if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
766                                 i == 0x201F || i == 0x2039) {
767                                 type = CHAR_INIT_QUOTE;
768                             }
769                             //build table of Pf values
770                             if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
771                                 type = CHAR_FINAL_QUOTE;
772                             }
773                         }
774                         ranges[type].addRange(i, i);
775                         switch (type) {
776                           case Character.UPPERCASE_LETTER:
777                           case Character.LOWERCASE_LETTER:
778                           case Character.TITLECASE_LETTER:
779                           case Character.MODIFIER_LETTER:
780                           case Character.OTHER_LETTER:
781                             type = CHAR_LETTER;
782                             break;
783                           case Character.NON_SPACING_MARK:
784                           case Character.COMBINING_SPACING_MARK:
785                           case Character.ENCLOSING_MARK:
786                             type = CHAR_MARK;
787                             break;
788                           case Character.DECIMAL_DIGIT_NUMBER:
789                           case Character.LETTER_NUMBER:
790                           case Character.OTHER_NUMBER:
791                             type = CHAR_NUMBER;
792                             break;
793                           case Character.SPACE_SEPARATOR:
794                           case Character.LINE_SEPARATOR:
795                           case Character.PARAGRAPH_SEPARATOR:
796                             type = CHAR_SEPARATOR;
797                             break;
798                           case Character.CONTROL:
799                           case Character.FORMAT:
800                           case Character.SURROGATE:
801                           case Character.PRIVATE_USE:
802                           case Character.UNASSIGNED:
803                             type = CHAR_OTHER;
804                             break;
805                           case Character.CONNECTOR_PUNCTUATION:
806                           case Character.DASH_PUNCTUATION:
807                           case Character.START_PUNCTUATION:
808                           case Character.END_PUNCTUATION:
809                           case CHAR_INIT_QUOTE:
810                           case CHAR_FINAL_QUOTE:
811                           case Character.OTHER_PUNCTUATION:
812                             type = CHAR_PUNCTUATION;
813                             break;
814                           case Character.MATH_SYMBOL:
815                           case Character.CURRENCY_SYMBOL:
816                           case Character.MODIFIER_SYMBOL:
817                           case Character.OTHER_SYMBOL:
818                             type = CHAR_SYMBOL;
819                             break;
820                           default:
821                             throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
822                         }
823                         ranges[type].addRange(i, i);
824                     } // for all characters
825                     ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
826 
827                     for (int i = 0;  i < ranges.length;  i ++) {
828                         if (Token.categoryNames[i] != null) {
829                             if (i == Character.UNASSIGNED) { // Unassigned
830                                 ranges[i].addRange(0x10000, Token.UTF16_MAX);
831                             }
832                             tmpCat.put(Token.categoryNames[i], ranges[i]);
833                             tmpCat2.put(Token.categoryNames[i],
834                                                   Token.complementRanges(ranges[i]));
835                         }
836                     }
837                     //REVISIT: do we really need to support block names as in Unicode 3.1
838                     //         or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
839                     //
840                     StringBuilder buffer = new StringBuilder(50);
841                     for (int i = 0;  i < Token.blockNames.length;  i ++) {
842                         Token r1 = Token.createRange();
843                         int location;
844                         if (i < NONBMP_BLOCK_START) {
845                             location = i*2;
846                             int rstart = Token.blockRanges.charAt(location);
847                             int rend = Token.blockRanges.charAt(location+1);
848                             //DEBUGING
849                             //System.out.println(n+" " +Integer.toHexString(rstart)
850                             //                     +"-"+ Integer.toHexString(rend));
851                             r1.addRange(rstart, rend);
852                         } else {
853                             location = (i - NONBMP_BLOCK_START) * 2;
854                             r1.addRange(Token.nonBMPBlockRanges[location],
855                                         Token.nonBMPBlockRanges[location + 1]);
856                         }
857                         String n = Token.blockNames[i];
858                         if (n.equals("Specials"))
859                             r1.addRange(0xfff0, 0xfffd);
860                         if (n.equals("Private Use")) {
861                             r1.addRange(0xF0000,0xFFFFD);
862                             r1.addRange(0x100000,0x10FFFD);
863                         }
864                         tmpCat.put(n, r1);
865                         tmpCat2.put(n, Token.complementRanges(r1));
866                         buffer.setLength(0);
867                         buffer.append("Is");
868                         if (n.indexOf(' ') >= 0) {
869                             for (int ci = 0;  ci < n.length();  ci ++)
870                                 if (n.charAt(ci) != ' ')  buffer.append(n.charAt(ci));
871                         }
872                         else {
873                             buffer.append(n);
874                         }
875                         Token.setAlias(tmpCat, tmpCat2, buffer.toString(), n, true);
876                     }
877 
878                     // TR#18 1.2
879                     Token.setAlias(tmpCat, tmpCat2, "ASSIGNED", "Cn", false);
880                     Token.setAlias(tmpCat, tmpCat2, "UNASSIGNED", "Cn", true);
881                     Token all = Token.createRange();
882                     all.addRange(0, Token.UTF16_MAX);
883                     tmpCat.put("ALL", all);
884                     tmpCat2.put("ALL", Token.complementRanges(all));
885                     Token.registerNonXS("ASSIGNED");
886                     Token.registerNonXS("UNASSIGNED");
887                     Token.registerNonXS("ALL");
888 
889                     Token isalpha = Token.createRange();
890                     isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
891                     isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
892                     isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
893                     tmpCat.put("IsAlpha", isalpha);
894                     tmpCat2.put("IsAlpha", Token.complementRanges(isalpha));
895                     Token.registerNonXS("IsAlpha");
896 
897                     Token isalnum = Token.createRange();
898                     isalnum.mergeRanges(isalpha);   // Lu Ll Lo
899                     isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
900                     tmpCat.put("IsAlnum", isalnum);
901                     tmpCat2.put("IsAlnum", Token.complementRanges(isalnum));
902                     Token.registerNonXS("IsAlnum");
903 
904                     Token isspace = Token.createRange();
905                     isspace.mergeRanges(Token.token_spaces);
906                     isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
907                     tmpCat.put("IsSpace", isspace);
908                     tmpCat2.put("IsSpace", Token.complementRanges(isspace));
909                     Token.registerNonXS("IsSpace");
910 
911                     Token isword = Token.createRange();
912                     isword.mergeRanges(isalnum);     // Lu Ll Lo Nd
913                     isword.addRange('_', '_');
914                     tmpCat.put("IsWord", isword);
915                     tmpCat2.put("IsWord", Token.complementRanges(isword));
916                     Token.registerNonXS("IsWord");
917 
918                     Token isascii = Token.createRange();
919                     isascii.addRange(0, 127);
920                     tmpCat.put("IsASCII", isascii);
921                     tmpCat2.put("IsASCII", Token.complementRanges(isascii));
922                     Token.registerNonXS("IsASCII");
923 
924                     Token isnotgraph = Token.createRange();
925                     isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
926                     isnotgraph.addRange(' ', ' ');
927                     tmpCat.put("IsGraph", Token.complementRanges(isnotgraph));
928                     tmpCat2.put("IsGraph", isnotgraph);
929                     Token.registerNonXS("IsGraph");
930 
931                     Token isxdigit = Token.createRange();
932                     isxdigit.addRange('0', '9');
933                     isxdigit.addRange('A', 'F');
934                     isxdigit.addRange('a', 'f');
935                     tmpCat.put("IsXDigit", Token.complementRanges(isxdigit));
936                     tmpCat2.put("IsXDigit", isxdigit);
937                     Token.registerNonXS("IsXDigit");
938 
939                     Token.setAlias(tmpCat, tmpCat2, "IsDigit", "Nd", true);
940                     Token.setAlias(tmpCat, tmpCat2, "IsUpper", "Lu", true);
941                     Token.setAlias(tmpCat, tmpCat2, "IsLower", "Ll", true);
942                     Token.setAlias(tmpCat, tmpCat2, "IsCntrl", "C", true);
943                     Token.setAlias(tmpCat, tmpCat2, "IsPrint", "C", false);
944                     Token.setAlias(tmpCat, tmpCat2, "IsPunct", "P", true);
945                     Token.registerNonXS("IsDigit");
946                     Token.registerNonXS("IsUpper");
947                     Token.registerNonXS("IsLower");
948                     Token.registerNonXS("IsCntrl");
949                     Token.registerNonXS("IsPrint");
950                     Token.registerNonXS("IsPunct");
951 
952                     Token.setAlias(tmpCat, tmpCat2, "alpha", "IsAlpha", true);
953                     Token.setAlias(tmpCat, tmpCat2, "alnum", "IsAlnum", true);
954                     Token.setAlias(tmpCat, tmpCat2, "ascii", "IsASCII", true);
955                     Token.setAlias(tmpCat, tmpCat2, "cntrl", "IsCntrl", true);
956                     Token.setAlias(tmpCat, tmpCat2, "digit", "IsDigit", true);
957                     Token.setAlias(tmpCat, tmpCat2, "graph", "IsGraph", true);
958                     Token.setAlias(tmpCat, tmpCat2, "lower", "IsLower", true);
959                     Token.setAlias(tmpCat, tmpCat2, "print", "IsPrint", true);
960                     Token.setAlias(tmpCat, tmpCat2, "punct", "IsPunct", true);
961                     Token.setAlias(tmpCat, tmpCat2, "space", "IsSpace", true);
962                     Token.setAlias(tmpCat, tmpCat2, "upper", "IsUpper", true);
963                     Token.setAlias(tmpCat, tmpCat2, "word", "IsWord", true); // Perl extension
964                     Token.setAlias(tmpCat, tmpCat2, "xdigit", "IsXDigit", true);
965                     Token.registerNonXS("alpha");
966                     Token.registerNonXS("alnum");
967                     Token.registerNonXS("ascii");
968                     Token.registerNonXS("cntrl");
969                     Token.registerNonXS("digit");
970                     Token.registerNonXS("graph");
971                     Token.registerNonXS("lower");
972                     Token.registerNonXS("print");
973                     Token.registerNonXS("punct");
974                     Token.registerNonXS("space");
975                     Token.registerNonXS("upper");
976                     Token.registerNonXS("word");
977                     Token.registerNonXS("xdigit");
978                     Token.categories = localCat = Collections.unmodifiableMap(tmpCat);
979                     Token.categories2 = Collections.unmodifiableMap(tmpCat2);
980                 } // localCat == null
981             } // synchronized
982         } // if null
983         return positive ? (RangeToken)localCat.get(name)
984             : (RangeToken)Token.categories2.get(name);
985     }
getRange(String name, boolean positive, boolean xs)986     static protected RangeToken getRange(String name, boolean positive, boolean xs) {
987         RangeToken range = Token.getRange(name, positive);
988         if (xs && range != null && Token.isRegisterNonXS(name))
989             range = null;
990         return range;
991     }
992 
    // Names added by registerNonXS(); getRange(name, positive, xs) returns null
    // for these names when xs is true.
    static final Set<String> nonxs = Collections.synchronizedSet(new HashSet<>());
994     /**
995      * This method is called by only getRange().
996      * So this method need not MT-safe.
997      */
registerNonXS(String name)998     static protected void registerNonXS(String name) {
999         Token.nonxs.add(name);
1000     }
1001 
isRegisterNonXS(String name)1002     static protected boolean isRegisterNonXS(String name) {
1003         return Token.nonxs.contains(name);
1004     }
1005 
setAlias(Map<String, Token> tmpCat, Map<String, Token> tmpCat2, String newName, String name, boolean positive)1006     private static void setAlias(Map<String, Token> tmpCat, Map<String, Token> tmpCat2,
1007             String newName, String name, boolean positive) {
1008         Token t1 = tmpCat.get(name);
1009         Token t2 = tmpCat2.get(name);
1010         if (positive) {
1011             tmpCat.put(newName, t1);
1012             tmpCat2.put(newName, t2);
1013         } else {
1014             tmpCat2.put(newName, t1);
1015             tmpCat.put(newName, t2);
1016         }
1017     }
1018 
1019     // ------------------------------------------------------
1020 
    // One character per virama-like sign. getGraphemePattern() loops over this
    // string while building its virama range. The trailing comment on each line
    // appears to be the character's UnicodeData entry (name and properties).
    static final String viramaString =
    "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
    +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
    +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
    +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
    +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
    +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
    +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
    +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
    +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
    +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
    +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1033 
1034     static private Token token_grapheme = null;
getGraphemePattern()1035     static synchronized Token getGraphemePattern() {
1036         if (Token.token_grapheme != null)
1037             return Token.token_grapheme;
1038 
1039         Token base_char = Token.createRange();  // [{ASSIGNED}]-[{M},{C}]
1040         base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1041         base_char.subtractRanges(Token.getRange("M", true));
1042         base_char.subtractRanges(Token.getRange("C", true));
1043 
1044         Token virama = Token.createRange();
1045         for (int i = 0;  i < Token.viramaString.length(); i++) {
1046             virama.addRange(i, i);
1047         }
1048 
1049         Token combiner_wo_virama = Token.createRange();
1050         combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1051         combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1052         combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1053 
1054         Token left = Token.createUnion();       // base_char?
1055         left.addChild(base_char);
1056         left.addChild(Token.token_empty);
1057 
1058         Token foo = Token.createUnion();
1059         foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1060         foo.addChild(combiner_wo_virama);
1061 
1062         foo = Token.createClosure(foo);
1063 
1064         foo = Token.createConcat(left, foo);
1065 
1066         Token.token_grapheme = foo;
1067         return Token.token_grapheme;
1068     }
1069 
1070     /**
1071      * Combing Character Sequence in Perl 5.6.
1072      */
1073     static private Token token_ccs = null;
getCombiningCharacterSequence()1074     static synchronized Token getCombiningCharacterSequence() {
1075         if (Token.token_ccs != null)
1076             return Token.token_ccs;
1077 
1078         Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1079         foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1080         Token.token_ccs = foo;
1081         return Token.token_ccs;
1082     }
1083 
1084     // ------------------------------------------------------
1085 
1086     // ------------------------------------------------------
1087     /**
1088      * This class represents a node in parse tree.
1089      */
1090     static class StringToken extends Token implements java.io.Serializable {
1091 
1092         private static final long serialVersionUID = -4614366944218504172L;
1093 
1094         String string;
1095         final int refNumber;
1096 
StringToken(int type, String str, int n)1097         StringToken(int type, String str, int n) {
1098             super(type);
1099             this.string = str;
1100             this.refNumber = n;
1101         }
1102 
getReferenceNumber()1103         int getReferenceNumber() {              // for STRING
1104             return this.refNumber;
1105         }
getString()1106         String getString() {                    // for STRING
1107             return this.string;
1108         }
1109 
toString(int options)1110         public String toString(int options) {
1111             if (this.type == BACKREFERENCE)
1112                 return "\\"+this.refNumber;
1113             else
1114                 return REUtil.quoteMeta(this.string);
1115         }
1116     }
1117 
1118     /**
1119      * This class represents a node in parse tree.
1120      */
1121     static class ConcatToken extends Token implements java.io.Serializable {
1122 
1123         private static final long serialVersionUID = 8717321425541346381L;
1124 
1125         final Token child;
1126         final Token child2;
1127 
ConcatToken(Token t1, Token t2)1128         ConcatToken(Token t1, Token t2) {
1129             super(Token.CONCAT);
1130             this.child = t1;
1131             this.child2 = t2;
1132         }
1133 
size()1134         int size() {
1135             return 2;
1136         }
getChild(int index)1137         Token getChild(int index) {
1138             return index == 0 ? this.child : this.child2;
1139         }
1140 
toString(int options)1141         public String toString(int options) {
1142             String ret;
1143             if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1144                 ret = this.child.toString(options)+"+";
1145             } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1146                 ret = this.child.toString(options)+"+?";
1147             } else
1148                 ret = this.child.toString(options)+this.child2.toString(options);
1149             return ret;
1150         }
1151     }
1152 
1153     /**
1154      * This class represents a node in parse tree.
1155      */
1156     static class CharToken extends Token implements java.io.Serializable {
1157 
1158         private static final long serialVersionUID = -4394272816279496989L;
1159 
1160         final int chardata;
1161 
CharToken(int type, int ch)1162         CharToken(int type, int ch) {
1163             super(type);
1164             this.chardata = ch;
1165         }
1166 
getChar()1167         int getChar() {
1168             return this.chardata;
1169         }
1170 
toString(int options)1171         public String toString(int options) {
1172             String ret;
1173             switch (this.type) {
1174               case CHAR:
1175                 switch (this.chardata) {
1176                   case '|':  case '*':  case '+':  case '?':
1177                   case '(':  case ')':  case '.':  case '[':
1178                   case '{':  case '\\':
1179                     ret = "\\"+(char)this.chardata;
1180                     break;
1181                   case '\f':  ret = "\\f";  break;
1182                   case '\n':  ret = "\\n";  break;
1183                   case '\r':  ret = "\\r";  break;
1184                   case '\t':  ret = "\\t";  break;
1185                   case 0x1b:  ret = "\\e";  break;
1186                     //case 0x0b:  ret = "\\v";  break;
1187                   default:
1188                     if (this.chardata >= 0x10000) {
1189                         String pre = "0"+Integer.toHexString(this.chardata);
1190                         ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1191                     } else
1192                         ret = ""+(char)this.chardata;
1193                 }
1194                 break;
1195 
1196               case ANCHOR:
1197                 if (this == Token.token_linebeginning || this == Token.token_lineend)
1198                     ret = ""+(char)this.chardata;
1199                 else
1200                     ret = "\\"+(char)this.chardata;
1201                 break;
1202 
1203               default:
1204                 ret = null;
1205             }
1206             return ret;
1207         }
1208 
match(int ch)1209         boolean match(int ch) {
1210             if (this.type == CHAR) {
1211                 return ch == this.chardata;
1212             } else
1213                 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
1214         }
1215     }
1216 
1217     /**
1218      * This class represents a node in parse tree.
1219      */
1220     static class ClosureToken extends Token implements java.io.Serializable {
1221 
1222         private static final long serialVersionUID = 1308971930673997452L;
1223 
1224         int min;
1225         int max;
1226         final Token child;
1227 
ClosureToken(int type, Token tok)1228         ClosureToken(int type, Token tok) {
1229             super(type);
1230             this.child = tok;
1231             this.setMin(-1);
1232             this.setMax(-1);
1233         }
1234 
size()1235         int size() {
1236             return 1;
1237         }
getChild(int index)1238         Token getChild(int index) {
1239             return this.child;
1240         }
1241 
setMin(int min)1242         final void setMin(int min) {
1243             this.min = min;
1244         }
setMax(int max)1245         final void setMax(int max) {
1246             this.max = max;
1247         }
getMin()1248         final int getMin() {
1249             return this.min;
1250         }
getMax()1251         final int getMax() {
1252             return this.max;
1253         }
1254 
toString(int options)1255         public String toString(int options) {
1256             String ret;
1257             if (this.type == CLOSURE) {
1258                 if (this.getMin() < 0 && this.getMax() < 0) {
1259                     ret = this.child.toString(options)+"*";
1260                 } else if (this.getMin() == this.getMax()) {
1261                     ret = this.child.toString(options)+"{"+this.getMin()+"}";
1262                 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1263                     ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1264                 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1265                     ret = this.child.toString(options)+"{"+this.getMin()+",}";
1266                 } else
1267                     throw new RuntimeException("Token#toString(): CLOSURE "
1268                                                +this.getMin()+", "+this.getMax());
1269             } else {
1270                 if (this.getMin() < 0 && this.getMax() < 0) {
1271                     ret = this.child.toString(options)+"*?";
1272                 } else if (this.getMin() == this.getMax()) {
1273                     ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1274                 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1275                     ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1276                 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1277                     ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1278                 } else
1279                     throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
1280                                                +this.getMin()+", "+this.getMax());
1281             }
1282             return ret;
1283         }
1284     }
1285 
1286     /**
1287      * This class represents a node in parse tree.
1288      */
1289     static class ParenToken extends Token implements java.io.Serializable {
1290 
1291         private static final long serialVersionUID = -5938014719827987704L;
1292 
1293         final Token child;
1294         final int parennumber;
1295 
ParenToken(int type, Token tok, int paren)1296         ParenToken(int type, Token tok, int paren) {
1297             super(type);
1298             this.child = tok;
1299             this.parennumber = paren;
1300         }
1301 
size()1302         int size() {
1303             return 1;
1304         }
getChild(int index)1305         Token getChild(int index) {
1306             return this.child;
1307         }
1308 
getParenNumber()1309         int getParenNumber() {
1310             return this.parennumber;
1311         }
1312 
toString(int options)1313         public String toString(int options) {
1314             String ret = null;
1315             switch (this.type) {
1316               case PAREN:
1317                 if (this.parennumber == 0) {
1318                     ret = "(?:"+this.child.toString(options)+")";
1319                 } else {
1320                     ret = "("+this.child.toString(options)+")";
1321                 }
1322                 break;
1323 
1324               case LOOKAHEAD:
1325                 ret = "(?="+this.child.toString(options)+")";
1326                 break;
1327               case NEGATIVELOOKAHEAD:
1328                 ret = "(?!"+this.child.toString(options)+")";
1329                 break;
1330               case LOOKBEHIND:
1331                 ret = "(?<="+this.child.toString(options)+")";
1332                 break;
1333               case NEGATIVELOOKBEHIND:
1334                 ret = "(?<!"+this.child.toString(options)+")";
1335                 break;
1336               case INDEPENDENT:
1337                 ret = "(?>"+this.child.toString(options)+")";
1338                 break;
1339             }
1340             return ret;
1341         }
1342     }
1343 
1344     /**
1345      * (?(condition)yes-pattern|no-pattern)
1346      */
1347     static class ConditionToken extends Token implements java.io.Serializable {
1348 
1349         private static final long serialVersionUID = 4353765277910594411L;
1350 
1351         final int refNumber;
1352         final Token condition;
1353         final Token yes;
1354         final Token no;
ConditionToken(int refno, Token cond, Token yespat, Token nopat)1355         ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1356             super(Token.CONDITION);
1357             this.refNumber = refno;
1358             this.condition = cond;
1359             this.yes = yespat;
1360             this.no = nopat;
1361         }
size()1362         int size() {
1363             return this.no == null ? 1 : 2;
1364         }
getChild(int index)1365         Token getChild(int index) {
1366             if (index == 0)  return this.yes;
1367             if (index == 1)  return this.no;
1368             throw new RuntimeException("Internal Error: "+index);
1369         }
1370 
toString(int options)1371         public String toString(int options) {
1372             String ret;
1373             if (refNumber > 0) {
1374                 ret = "(?("+refNumber+")";
1375             } else if (this.condition.type == Token.ANCHOR) {
1376                 ret = "(?("+this.condition+")";
1377             } else {
1378                 ret = "(?"+this.condition;
1379             }
1380 
1381             if (this.no == null) {
1382                 ret += this.yes+")";
1383             } else {
1384                 ret += this.yes+"|"+this.no+")";
1385             }
1386             return ret;
1387         }
1388     }
1389 
1390     /**
1391      * (ims-ims: .... )
1392      */
1393     static class ModifierToken extends Token implements java.io.Serializable {
1394 
1395         private static final long serialVersionUID = -9114536559696480356L;
1396 
1397         final Token child;
1398         final int add;
1399         final int mask;
1400 
ModifierToken(Token tok, int add, int mask)1401         ModifierToken(Token tok, int add, int mask) {
1402             super(Token.MODIFIERGROUP);
1403             this.child = tok;
1404             this.add = add;
1405             this.mask = mask;
1406         }
1407 
size()1408         int size() {
1409             return 1;
1410         }
getChild(int index)1411         Token getChild(int index) {
1412             return this.child;
1413         }
1414 
getOptions()1415         int getOptions() {
1416             return this.add;
1417         }
getOptionsMask()1418         int getOptionsMask() {
1419             return this.mask;
1420         }
1421 
toString(int options)1422         public String toString(int options) {
1423             return "(?"
1424                 +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1425                 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1426                 +":"
1427                 +this.child.toString(options)
1428                 +")";
1429         }
1430     }
1431 
1432     /**
1433      * This class represents a node in parse tree.
1434      * for UNION or CONCAT.
1435      */
1436     static class UnionToken extends Token implements java.io.Serializable {
1437 
1438         private static final long serialVersionUID = -2568843945989489861L;
1439 
1440         List<Token> children;
1441 
1442         /**
1443          * @serialField children Vector children
1444          */
1445         private static final ObjectStreamField[] serialPersistentFields =
1446             new ObjectStreamField[] {
1447                 new ObjectStreamField("children", Vector.class),
1448             };
1449 
UnionToken(int type)1450         UnionToken(int type) {
1451             super(type);
1452         }
1453 
1454         @Override
addChild(Token tok)1455         void addChild(Token tok) {
1456             if (tok == null)  return;
1457             if (this.children == null)  this.children = new ArrayList<>();
1458             if (this.type == UNION) {
1459                 this.children.add(tok);
1460                 return;
1461             }
1462                                                 // This is CONCAT, and new child is CONCAT.
1463             if (tok.type == CONCAT) {
1464                 for (int i = 0;  i < tok.size();  i ++)
1465                     this.addChild(tok.getChild(i)); // Recursion
1466                 return;
1467             }
1468             int size = this.children.size();
1469             if (size == 0) {
1470                 this.children.add(tok);
1471                 return;
1472             }
1473             Token previous = this.children.get(size - 1);
1474             if (!((previous.type == CHAR || previous.type == STRING)
1475                   && (tok.type == CHAR || tok.type == STRING))) {
1476                 this.children.add(tok);
1477                 return;
1478             }
1479 
1480             //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1481 
1482             StringBuilder buffer;
1483             int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1484             if (previous.type == CHAR) {        // Replace previous token by STRING
1485                 buffer = new StringBuilder(2 + nextMaxLength);
1486                 int ch = previous.getChar();
1487                 if (ch >= 0x10000)
1488                     buffer.append(REUtil.decomposeToSurrogates(ch));
1489                 else
1490                     buffer.append((char)ch);
1491                 previous = Token.createString(null);
1492                 this.children.set(size - 1, previous);
1493             } else {                            // STRING
1494                 buffer = new StringBuilder(previous.getString().length() + nextMaxLength);
1495                 buffer.append(previous.getString());
1496             }
1497 
1498             if (tok.type == CHAR) {
1499                 int ch = tok.getChar();
1500                 if (ch >= 0x10000)
1501                     buffer.append(REUtil.decomposeToSurrogates(ch));
1502                 else
1503                     buffer.append((char)ch);
1504             } else {
1505                 buffer.append(tok.getString());
1506             }
1507 
1508             ((StringToken)previous).string = new String(buffer);
1509         }
1510 
1511         @Override
size()1512         int size() {
1513             return this.children == null ? 0 : this.children.size();
1514         }
1515         @Override
getChild(int index)1516         Token getChild(int index) {
1517             return this.children.get(index);
1518         }
1519 
1520         @Override
toString(int options)1521         public String toString(int options) {
1522             String ret;
1523             if (this.type == CONCAT) {
1524                 if (this.children.size() == 2) {
1525                     Token ch = this.getChild(0);
1526                     Token ch2 = this.getChild(1);
1527                     if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1528                         ret = ch.toString(options)+"+";
1529                     } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1530                         ret = ch.toString(options)+"+?";
1531                     } else
1532                         ret = ch.toString(options)+ch2.toString(options);
1533                 } else {
1534                     StringBuilder sb = new StringBuilder();
1535                     this.children.stream().forEach((children1) -> {
1536                         sb.append((children1).toString(options));
1537                     });
1538                     ret = sb.toString();
1539                 }
1540                 return ret;
1541             }
1542             if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1543                 ret = this.getChild(0).toString(options)+"?";
1544             } else if (this.children.size() == 2
1545                        && this.getChild(0).type == EMPTY) {
1546                 ret = this.getChild(1).toString(options)+"??";
1547             } else {
1548                 StringBuilder sb = new StringBuilder();
1549                 sb.append((this.children.get(0)).toString(options));
1550                 for (int i = 1;  i < this.children.size();  i ++) {
1551                     sb.append('|');
1552                     sb.append((this.children.get(i)).toString(options));
1553                 }
1554                 ret = sb.toString();
1555             }
1556             return ret;
1557         }
1558 
1559         /**
1560          * @serialData Serialized fields. Convert the List to Vector for backward compatibility.
1561          */
writeObject(ObjectOutputStream out)1562         private void writeObject(ObjectOutputStream out) throws IOException {
1563             // Convert List to Vector
1564             Vector<Token> vChildren = (children == null)? null : new Vector<>(children);
1565 
1566             // Write serialized fields
1567             ObjectOutputStream.PutField pf = out.putFields();
1568             pf.put("children", vChildren);
1569             out.writeFields();
1570     }
1571 
1572         @SuppressWarnings("unchecked")
readObject(ObjectInputStream in)1573         private void readObject(ObjectInputStream in)
1574                             throws IOException, ClassNotFoundException {
1575             // We have to read serialized fields first.
1576             ObjectInputStream.GetField gf = in.readFields();
1577             Vector<Token> vChildren = (Vector<Token>)gf.get("children", null);
1578 
1579             //convert Vector back to List
1580             if (vChildren != null) children = new ArrayList<>(vChildren);
1581         }
1582     }
1583 }
1584