1 /*
2  * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
3  */
4 /*
5  * Licensed to the Apache Software Foundation (ASF) under one or more
6  * contributor license agreements.  See the NOTICE file distributed with
7  * this work for additional information regarding copyright ownership.
8  * The ASF licenses this file to You under the Apache License, Version 2.0
9  * (the "License"); you may not use this file except in compliance with
10  * the License.  You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */
20 
21 package com.sun.org.apache.xerces.internal.impl.xpath.regex;
22 
23 /**
24  * This class represents a character class such as [a-z] or a period.
25  *
26  * @xerces.internal
27  *
28  */
29 final class RangeToken extends Token implements java.io.Serializable {
30 
31     private static final long serialVersionUID = 3257568399592010545L;
32 
33     int[] ranges;
34     boolean sorted;
35     boolean compacted;
36     RangeToken icaseCache = null;
37     int[] map = null;
38     int nonMapIndex;
39 
RangeToken(int type)40     RangeToken(int type) {
41         super(type);
42         this.setSorted(false);
43     }
44 
45     // for RANGE or NRANGE
addRange(int start, int end)46     protected void addRange(int start, int end) {
47         this.icaseCache = null;
48         //System.err.println("Token#addRange(): "+start+" "+end);
49         int r1, r2;
50         if (start <= end) {
51             r1 = start;
52             r2 = end;
53         } else {
54             r1 = end;
55             r2 = start;
56         }
57 
58         int pos = 0;
59         if (this.ranges == null) {
60             this.ranges = new int[2];
61             this.ranges[0] = r1;
62             this.ranges[1] = r2;
63             this.setSorted(true);
64         } else {
65             pos = this.ranges.length;
66             if (this.ranges[pos-1]+1 == r1) {
67                 this.ranges[pos-1] = r2;
68                 return;
69             }
70             int[] temp = new int[pos+2];
71             System.arraycopy(this.ranges, 0, temp, 0, pos);
72             this.ranges = temp;
73             if (this.ranges[pos-1] >= r1)
74                 this.setSorted(false);
75             this.ranges[pos++] = r1;
76             this.ranges[pos] = r2;
77             if (!this.sorted)
78                 this.sortRanges();
79         }
80     }
81 
isSorted()82     private final boolean isSorted() {
83         return this.sorted;
84     }
setSorted(boolean sort)85     private final void setSorted(boolean sort) {
86         this.sorted = sort;
87         if (!sort)  this.compacted = false;
88     }
isCompacted()89     private final boolean isCompacted() {
90         return this.compacted;
91     }
setCompacted()92     private final void setCompacted() {
93         this.compacted = true;
94     }
95 
sortRanges()96     protected void sortRanges() {
97         if (this.isSorted())
98             return;
99         if (this.ranges == null)
100             return;
101         //System.err.println("Do sorting: "+this.ranges.length);
102 
103                                                 // Bubble sort
104                                                 // Why? -- In many cases,
105                                                 //         this.ranges has few elements.
106         for (int i = this.ranges.length-4;  i >= 0;  i -= 2) {
107             for (int j = 0;  j <= i;  j += 2) {
108                 if (this.ranges[j] > this.ranges[j+2]
109                     || this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) {
110                     int tmp;
111                     tmp = this.ranges[j+2];
112                     this.ranges[j+2] = this.ranges[j];
113                     this.ranges[j] = tmp;
114                     tmp = this.ranges[j+3];
115                     this.ranges[j+3] = this.ranges[j+1];
116                     this.ranges[j+1] = tmp;
117                 }
118             }
119         }
120         this.setSorted(true);
121     }
122 
123     /**
124      * this.ranges is sorted.
125      */
compactRanges()126     protected void compactRanges() {
127         boolean DEBUG = false;
128         if (this.ranges == null || this.ranges.length <= 2)
129             return;
130         if (this.isCompacted())
131             return;
132         int base = 0;                           // Index of writing point
133         int target = 0;                         // Index of processing point
134 
135         while (target < this.ranges.length) {
136             if (base != target) {
137                 this.ranges[base] = this.ranges[target++];
138                 this.ranges[base+1] = this.ranges[target++];
139             } else
140                 target += 2;
141             int baseend = this.ranges[base+1];
142             while (target < this.ranges.length) {
143                 if (baseend+1 < this.ranges[target])
144                     break;
145                 if (baseend+1 == this.ranges[target]) {
146                     if (DEBUG)
147                         System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
148                                            +", "+this.ranges[base+1]
149                                            +"], ["+this.ranges[target]
150                                            +", "+this.ranges[target+1]
151                                            +"] -> ["+this.ranges[base]
152                                            +", "+this.ranges[target+1]
153                                            +"]");
154                     this.ranges[base+1] = this.ranges[target+1];
155                     baseend = this.ranges[base+1];
156                     target += 2;
157                 } else if (baseend >= this.ranges[target+1]) {
158                     if (DEBUG)
159                         System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
160                                            +", "+this.ranges[base+1]
161                                            +"], ["+this.ranges[target]
162                                            +", "+this.ranges[target+1]
163                                            +"] -> ["+this.ranges[base]
164                                            +", "+this.ranges[base+1]
165                                            +"]");
166                     target += 2;
167                 } else if (baseend < this.ranges[target+1]) {
168                     if (DEBUG)
169                         System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
170                                            +", "+this.ranges[base+1]
171                                            +"], ["+this.ranges[target]
172                                            +", "+this.ranges[target+1]
173                                            +"] -> ["+this.ranges[base]
174                                            +", "+this.ranges[target+1]
175                                            +"]");
176                     this.ranges[base+1] = this.ranges[target+1];
177                     baseend = this.ranges[base+1];
178                     target += 2;
179                 } else {
180                     throw new RuntimeException("Token#compactRanges(): Internel Error: ["
181                                                +this.ranges[base]
182                                                +","+this.ranges[base+1]
183                                                +"] ["+this.ranges[target]
184                                                +","+this.ranges[target+1]+"]");
185                 }
186             } // while
187             base += 2;
188         }
189 
190         if (base != this.ranges.length) {
191             int[] result = new int[base];
192             System.arraycopy(this.ranges, 0, result, 0, base);
193             this.ranges = result;
194         }
195         this.setCompacted();
196     }
197 
mergeRanges(Token token)198     protected void mergeRanges(Token token) {
199         RangeToken tok = (RangeToken)token;
200         this.sortRanges();
201         tok.sortRanges();
202         if (tok.ranges == null)
203             return;
204         this.icaseCache = null;
205         this.setSorted(true);
206         if (this.ranges == null) {
207             this.ranges = new int[tok.ranges.length];
208             System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length);
209             return;
210         }
211         int[] result = new int[this.ranges.length+tok.ranges.length];
212         for (int i = 0, j = 0, k = 0;  i < this.ranges.length || j < tok.ranges.length;) {
213             if (i >= this.ranges.length) {
214                 result[k++] = tok.ranges[j++];
215                 result[k++] = tok.ranges[j++];
216             } else if (j >= tok.ranges.length) {
217                 result[k++] = this.ranges[i++];
218                 result[k++] = this.ranges[i++];
219             } else if (tok.ranges[j] < this.ranges[i]
220                        || tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) {
221                 result[k++] = tok.ranges[j++];
222                 result[k++] = tok.ranges[j++];
223             } else {
224                 result[k++] = this.ranges[i++];
225                 result[k++] = this.ranges[i++];
226             }
227         }
228         this.ranges = result;
229     }
230 
subtractRanges(Token token)231     protected void subtractRanges(Token token) {
232         if (token.type == NRANGE) {
233             this.intersectRanges(token);
234             return;
235         }
236         RangeToken tok = (RangeToken)token;
237         if (tok.ranges == null || this.ranges == null)
238             return;
239         this.icaseCache = null;
240         this.sortRanges();
241         this.compactRanges();
242         tok.sortRanges();
243         tok.compactRanges();
244 
245         //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);
246 
247         int[] result = new int[this.ranges.length+tok.ranges.length];
248         int wp = 0, src = 0, sub = 0;
249         while (src < this.ranges.length && sub < tok.ranges.length) {
250             int srcbegin = this.ranges[src];
251             int srcend = this.ranges[src+1];
252             int subbegin = tok.ranges[sub];
253             int subend = tok.ranges[sub+1];
254             if (srcend < subbegin) {            // Not overlapped
255                                                 // src: o-----o
256                                                 // sub:         o-----o
257                                                 // res: o-----o
258                                                 // Reuse sub
259                 result[wp++] = this.ranges[src++];
260                 result[wp++] = this.ranges[src++];
261             } else if (srcend >= subbegin
262                        && srcbegin <= subend) { // Overlapped
263                                                 // src:    o--------o
264                                                 // sub:  o----o
265                                                 // sub:      o----o
266                                                 // sub:          o----o
267                                                 // sub:  o------------o
268                 if (subbegin <= srcbegin && srcend <= subend) {
269                                                 // src:    o--------o
270                                                 // sub:  o------------o
271                                                 // res: empty
272                                                 // Reuse sub
273                     src += 2;
274                 } else if (subbegin <= srcbegin) {
275                                                 // src:    o--------o
276                                                 // sub:  o----o
277                                                 // res:       o-----o
278                                                 // Reuse src(=res)
279                     this.ranges[src] = subend+1;
280                     sub += 2;
281                 } else if (srcend <= subend) {
282                                                 // src:    o--------o
283                                                 // sub:          o----o
284                                                 // res:    o-----o
285                                                 // Reuse sub
286                     result[wp++] = srcbegin;
287                     result[wp++] = subbegin-1;
288                     src += 2;
289                 } else {
290                                                 // src:    o--------o
291                                                 // sub:      o----o
292                                                 // res:    o-o    o-o
293                                                 // Reuse src(=right res)
294                     result[wp++] = srcbegin;
295                     result[wp++] = subbegin-1;
296                     this.ranges[src] = subend+1;
297                     sub += 2;
298                 }
299             } else if (subend < srcbegin) {
300                                                 // Not overlapped
301                                                 // src:          o-----o
302                                                 // sub: o----o
303                 sub += 2;
304             } else {
305                 throw new RuntimeException("Token#subtractRanges(): Internal Error: ["+this.ranges[src]
306                                            +","+this.ranges[src+1]
307                                            +"] - ["+tok.ranges[sub]
308                                            +","+tok.ranges[sub+1]
309                                            +"]");
310             }
311         }
312         while (src < this.ranges.length) {
313             result[wp++] = this.ranges[src++];
314             result[wp++] = this.ranges[src++];
315         }
316         this.ranges = new int[wp];
317         System.arraycopy(result, 0, this.ranges, 0, wp);
318                                                 // this.ranges is sorted and compacted.
319     }
320 
321     /**
322      * @param tok Ignore whether it is NRANGE or not.
323      */
intersectRanges(Token token)324     protected void intersectRanges(Token token) {
325         RangeToken tok = (RangeToken)token;
326         if (tok.ranges == null || this.ranges == null)
327             return;
328         this.icaseCache = null;
329         this.sortRanges();
330         this.compactRanges();
331         tok.sortRanges();
332         tok.compactRanges();
333 
334         int[] result = new int[this.ranges.length+tok.ranges.length];
335         int wp = 0, src1 = 0, src2 = 0;
336         while (src1 < this.ranges.length && src2 < tok.ranges.length) {
337             int src1begin = this.ranges[src1];
338             int src1end = this.ranges[src1+1];
339             int src2begin = tok.ranges[src2];
340             int src2end = tok.ranges[src2+1];
341             if (src1end < src2begin) {          // Not overlapped
342                                                 // src1: o-----o
343                                                 // src2:         o-----o
344                                                 // res:  empty
345                                                 // Reuse src2
346                 src1 += 2;
347             } else if (src1end >= src2begin
348                        && src1begin <= src2end) { // Overlapped
349                                                 // src1:    o--------o
350                                                 // src2:  o----o
351                                                 // src2:      o----o
352                                                 // src2:          o----o
353                                                 // src2:  o------------o
354                 if (src2begin <= src1begin && src1end <= src2end) {
355                                                 // src1:    o--------o
356                                                 // src2:  o------------o
357                                                 // res:     o--------o
358                                                 // Reuse src2
359                     result[wp++] = src1begin;
360                     result[wp++] = src1end;
361                     src1 += 2;
362                 } else if (src2begin <= src1begin) {
363                                                 // src1:    o--------o
364                                                 // src2:  o----o
365                                                 // res:     o--o
366                                                 // Reuse the rest of src1
367                     result[wp++] = src1begin;
368                     result[wp++] = src2end;
369                     this.ranges[src1] = src2end+1;
370                     src2 += 2;
371                 } else if (src1end <= src2end) {
372                                                 // src1:    o--------o
373                                                 // src2:          o----o
374                                                 // res:           o--o
375                                                 // Reuse src2
376                     result[wp++] = src2begin;
377                     result[wp++] = src1end;
378                     src1 += 2;
379                 } else {
380                                                 // src1:    o--------o
381                                                 // src2:      o----o
382                                                 // res:       o----o
383                                                 // Reuse the rest of src1
384                     result[wp++] = src2begin;
385                     result[wp++] = src2end;
386                     this.ranges[src1] = src2end+1;
387                     src2 += 2;
388                 }
389             } else if (src2end < src1begin) {
390                                                 // Not overlapped
391                                                 // src1:          o-----o
392                                                 // src2: o----o
393                 src2 += 2;
394             } else {
395                 throw new RuntimeException("Token#intersectRanges(): Internal Error: ["
396                                            +this.ranges[src1]
397                                            +","+this.ranges[src1+1]
398                                            +"] & ["+tok.ranges[src2]
399                                            +","+tok.ranges[src2+1]
400                                            +"]");
401             }
402         }
403         this.ranges = new int[wp];
404         System.arraycopy(result, 0, this.ranges, 0, wp);
405                                                 // this.ranges is sorted and compacted.
406     }
407 
408     /**
409      * for RANGE: Creates complement.
410      * for NRANGE: Creates the same meaning RANGE.
411      */
complementRanges(Token token)412     static Token complementRanges(Token token) {
413         if (token.type != RANGE && token.type != NRANGE)
414             throw new IllegalArgumentException("Token#complementRanges(): must be RANGE: "+token.type);
415         RangeToken tok = (RangeToken)token;
416         tok.sortRanges();
417         tok.compactRanges();
418         int len = tok.ranges.length+2;
419         if (tok.ranges[0] == 0)
420             len -= 2;
421         int last = tok.ranges[tok.ranges.length-1];
422         if (last == UTF16_MAX)
423             len -= 2;
424         RangeToken ret = Token.createRange();
425         ret.ranges = new int[len];
426         int wp = 0;
427         if (tok.ranges[0] > 0) {
428             ret.ranges[wp++] = 0;
429             ret.ranges[wp++] = tok.ranges[0]-1;
430         }
431         for (int i = 1;  i < tok.ranges.length-2;  i += 2) {
432             ret.ranges[wp++] = tok.ranges[i]+1;
433             ret.ranges[wp++] = tok.ranges[i+1]-1;
434         }
435         if (last != UTF16_MAX) {
436             ret.ranges[wp++] = last+1;
437             ret.ranges[wp] = UTF16_MAX;
438         }
439         ret.setCompacted();
440         return ret;
441     }
442 
getCaseInsensitiveToken()443     synchronized RangeToken getCaseInsensitiveToken() {
444         if (this.icaseCache != null)
445             return this.icaseCache;
446 
447         RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
448         for (int i = 0;  i < this.ranges.length;  i += 2) {
449             for (int ch = this.ranges[i];  ch <= this.ranges[i+1];  ch ++) {
450                 if (ch > 0xffff)
451                     uppers.addRange(ch, ch);
452                 else {
453                     char uch = Character.toUpperCase((char)ch);
454                     uppers.addRange(uch, uch);
455                 }
456             }
457         }
458         RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
459         for (int i = 0;  i < uppers.ranges.length;  i += 2) {
460             for (int ch = uppers.ranges[i];  ch <= uppers.ranges[i+1];  ch ++) {
461                 if (ch > 0xffff)
462                     lowers.addRange(ch, ch);
463                 else {
464                     char lch = Character.toLowerCase((char)ch);
465                     lowers.addRange(lch, lch);
466                 }
467             }
468         }
469         lowers.mergeRanges(uppers);
470         lowers.mergeRanges(this);
471         lowers.compactRanges();
472 
473         this.icaseCache = lowers;
474         return lowers;
475     }
476 
dumpRanges()477     void dumpRanges() {
478         System.err.print("RANGE: ");
479         if (this.ranges == null) {
480             System.err.println(" NULL");
481             return;
482         }
483         for (int i = 0;  i < this.ranges.length;  i += 2) {
484             System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] ");
485         }
486         System.err.println("");
487     }
488 
match(int ch)489     boolean match(int ch) {
490         if (this.map == null)  this.createMap();
491         boolean ret;
492         if (this.type == RANGE) {
493             if (ch < MAPSIZE)
494                 return (this.map[ch/32] & (1<<(ch&0x1f))) != 0;
495             ret = false;
496             for (int i = this.nonMapIndex;  i < this.ranges.length;  i += 2) {
497                 if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
498                     return true;
499             }
500         } else {
501             if (ch < MAPSIZE)
502                 return (this.map[ch/32] & (1<<(ch&0x1f))) == 0;
503             ret = true;
504             for (int i = this.nonMapIndex;  i < this.ranges.length;  i += 2) {
505                 if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
506                     return false;
507             }
508         }
509         return ret;
510     }
511 
512     private static final int MAPSIZE = 256;
createMap()513     private void createMap() {
514         int asize = MAPSIZE/32;                 // 32 is the number of bits in `int'.
515         int [] map = new int[asize];
516         int nonMapIndex = this.ranges.length;
517         for (int i = 0; i < asize; ++i) {
518             map[i] = 0;
519         }
520         for (int i = 0; i < this.ranges.length;  i += 2) {
521             int s = this.ranges[i];
522             int e = this.ranges[i+1];
523             if (s < MAPSIZE) {
524                 for (int j = s; j <= e && j < MAPSIZE; j++) {
525                     map[j/32] |= 1<<(j&0x1f); // s&0x1f : 0-31
526                 }
527             }
528             else {
529                 nonMapIndex = i;
530                 break;
531             }
532             if (e >= MAPSIZE) {
533                 nonMapIndex = i;
534                 break;
535             }
536         }
537         this.map = map;
538         this.nonMapIndex = nonMapIndex;
539         //for (int i = 0;  i < asize;  i ++)  System.err.println("Map: "+Integer.toString(this.map[i], 16));
540     }
541 
toString(int options)542     public String toString(int options) {
543         String ret;
544         if (this.type == RANGE) {
545             if (this == Token.token_dot)
546                 ret = ".";
547             else if (this == Token.token_0to9)
548                 ret = "\\d";
549             else if (this == Token.token_wordchars)
550                 ret = "\\w";
551             else if (this == Token.token_spaces)
552                 ret = "\\s";
553             else {
554                 StringBuilder sb = new StringBuilder();
555                 sb.append('[');
556                 for (int i = 0;  i < this.ranges.length;  i += 2) {
557                     if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0)  sb.append(',');
558                     if (this.ranges[i] == this.ranges[i+1]) {
559                         sb.append(escapeCharInCharClass(this.ranges[i]));
560                     } else {
561                         sb.append(escapeCharInCharClass(this.ranges[i]));
562                         sb.append('-');
563                         sb.append(escapeCharInCharClass(this.ranges[i+1]));
564                     }
565                 }
566                 sb.append(']');
567                 ret = sb.toString();
568             }
569         } else {
570             if (this == Token.token_not_0to9)
571                 ret = "\\D";
572             else if (this == Token.token_not_wordchars)
573                 ret = "\\W";
574             else if (this == Token.token_not_spaces)
575                 ret = "\\S";
576             else {
577                 StringBuffer sb = new StringBuffer();
578                 sb.append("[^");
579                 for (int i = 0;  i < this.ranges.length;  i += 2) {
580                     if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0)  sb.append(',');
581                     if (this.ranges[i] == this.ranges[i+1]) {
582                         sb.append(escapeCharInCharClass(this.ranges[i]));
583                     } else {
584                         sb.append(escapeCharInCharClass(this.ranges[i]));
585                         sb.append('-');
586                         sb.append(escapeCharInCharClass(this.ranges[i+1]));
587                     }
588                 }
589                 sb.append(']');
590                 ret = sb.toString();
591             }
592         }
593         return ret;
594     }
595 
escapeCharInCharClass(int ch)596     private static String escapeCharInCharClass(int ch) {
597         String ret;
598         switch (ch) {
599           case '[':  case ']':  case '-':  case '^':
600           case ',':  case '\\':
601             ret = "\\"+(char)ch;
602             break;
603           case '\f':  ret = "\\f";  break;
604           case '\n':  ret = "\\n";  break;
605           case '\r':  ret = "\\r";  break;
606           case '\t':  ret = "\\t";  break;
607           case 0x1b:  ret = "\\e";  break;
608           //case 0x0b:  ret = "\\v";  break;
609           default:
610             if (ch < 0x20) {
611                 String pre = "0"+Integer.toHexString(ch);
612                 ret = "\\x"+pre.substring(pre.length()-2, pre.length());
613             } else if (ch >= 0x10000) {
614                 String pre = "0"+Integer.toHexString(ch);
615                 ret = "\\v"+pre.substring(pre.length()-6, pre.length());
616             } else
617                 ret = ""+(char)ch;
618         }
619         return ret;
620     }
621 
622 }
623