1 /* 2 * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. 3 */ 4 /* 5 * Licensed to the Apache Software Foundation (ASF) under one or more 6 * contributor license agreements. See the NOTICE file distributed with 7 * this work for additional information regarding copyright ownership. 8 * The ASF licenses this file to You under the Apache License, Version 2.0 9 * (the "License"); you may not use this file except in compliance with 10 * the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 package com.sun.org.apache.xerces.internal.impl.xpath.regex; 22 23 /** 24 * This class represents a character class such as [a-z] or a period. 25 * 26 * @xerces.internal 27 * 28 */ 29 final class RangeToken extends Token implements java.io.Serializable { 30 31 private static final long serialVersionUID = 3257568399592010545L; 32 33 int[] ranges; 34 boolean sorted; 35 boolean compacted; 36 RangeToken icaseCache = null; 37 int[] map = null; 38 int nonMapIndex; 39 RangeToken(int type)40 RangeToken(int type) { 41 super(type); 42 this.setSorted(false); 43 } 44 45 // for RANGE or NRANGE addRange(int start, int end)46 protected void addRange(int start, int end) { 47 this.icaseCache = null; 48 //System.err.println("Token#addRange(): "+start+" "+end); 49 int r1, r2; 50 if (start <= end) { 51 r1 = start; 52 r2 = end; 53 } else { 54 r1 = end; 55 r2 = start; 56 } 57 58 int pos = 0; 59 if (this.ranges == null) { 60 this.ranges = new int[2]; 61 this.ranges[0] = r1; 62 this.ranges[1] = r2; 63 this.setSorted(true); 64 } else { 65 pos = this.ranges.length; 66 if (this.ranges[pos-1]+1 == r1) { 67 this.ranges[pos-1] = r2; 68 return; 69 } 70 int[] temp = new int[pos+2]; 71 System.arraycopy(this.ranges, 0, temp, 0, pos); 72 this.ranges = temp; 73 if (this.ranges[pos-1] >= r1) 74 this.setSorted(false); 75 this.ranges[pos++] = r1; 76 this.ranges[pos] = r2; 77 if (!this.sorted) 78 this.sortRanges(); 79 } 80 } 81 isSorted()82 private final boolean isSorted() { 83 return this.sorted; 84 } setSorted(boolean sort)85 private final void setSorted(boolean sort) { 86 this.sorted = sort; 87 if (!sort) this.compacted = false; 88 } isCompacted()89 private final boolean isCompacted() { 90 return this.compacted; 91 } setCompacted()92 private final void setCompacted() { 93 this.compacted = true; 94 } 95 sortRanges()96 protected void sortRanges() { 97 if (this.isSorted()) 98 return; 99 if (this.ranges == null) 100 return; 101 //System.err.println("Do sorting: "+this.ranges.length); 102 103 // Bubble sort 104 // Why? -- In many cases, 105 // this.ranges has few elements. 106 for (int i = this.ranges.length-4; i >= 0; i -= 2) { 107 for (int j = 0; j <= i; j += 2) { 108 if (this.ranges[j] > this.ranges[j+2] 109 || this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) { 110 int tmp; 111 tmp = this.ranges[j+2]; 112 this.ranges[j+2] = this.ranges[j]; 113 this.ranges[j] = tmp; 114 tmp = this.ranges[j+3]; 115 this.ranges[j+3] = this.ranges[j+1]; 116 this.ranges[j+1] = tmp; 117 } 118 } 119 } 120 this.setSorted(true); 121 } 122 123 /** 124 * this.ranges is sorted. 125 */ compactRanges()126 protected void compactRanges() { 127 boolean DEBUG = false; 128 if (this.ranges == null || this.ranges.length <= 2) 129 return; 130 if (this.isCompacted()) 131 return; 132 int base = 0; // Index of writing point 133 int target = 0; // Index of processing point 134 135 while (target < this.ranges.length) { 136 if (base != target) { 137 this.ranges[base] = this.ranges[target++]; 138 this.ranges[base+1] = this.ranges[target++]; 139 } else 140 target += 2; 141 int baseend = this.ranges[base+1]; 142 while (target < this.ranges.length) { 143 if (baseend+1 < this.ranges[target]) 144 break; 145 if (baseend+1 == this.ranges[target]) { 146 if (DEBUG) 147 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base] 148 +", "+this.ranges[base+1] 149 +"], ["+this.ranges[target] 150 +", "+this.ranges[target+1] 151 +"] -> ["+this.ranges[base] 152 +", "+this.ranges[target+1] 153 +"]"); 154 this.ranges[base+1] = this.ranges[target+1]; 155 baseend = this.ranges[base+1]; 156 target += 2; 157 } else if (baseend >= this.ranges[target+1]) { 158 if (DEBUG) 159 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base] 160 +", "+this.ranges[base+1] 161 +"], ["+this.ranges[target] 162 +", "+this.ranges[target+1] 163 +"] -> ["+this.ranges[base] 164 +", "+this.ranges[base+1] 165 +"]"); 166 target += 2; 167 } else if (baseend < this.ranges[target+1]) { 168 if (DEBUG) 169 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base] 170 +", "+this.ranges[base+1] 171 +"], ["+this.ranges[target] 172 +", "+this.ranges[target+1] 173 +"] -> ["+this.ranges[base] 174 +", "+this.ranges[target+1] 175 +"]"); 176 this.ranges[base+1] = this.ranges[target+1]; 177 baseend = this.ranges[base+1]; 178 target += 2; 179 } else { 180 throw new RuntimeException("Token#compactRanges(): Internel Error: [" 181 +this.ranges[base] 182 +","+this.ranges[base+1] 183 +"] ["+this.ranges[target] 184 +","+this.ranges[target+1]+"]"); 185 } 186 } // while 187 base += 2; 188 } 189 190 if (base != this.ranges.length) { 191 int[] result = new int[base]; 192 System.arraycopy(this.ranges, 0, result, 0, base); 193 this.ranges = result; 194 } 195 this.setCompacted(); 196 } 197 mergeRanges(Token token)198 protected void mergeRanges(Token token) { 199 RangeToken tok = (RangeToken)token; 200 this.sortRanges(); 201 tok.sortRanges(); 202 if (tok.ranges == null) 203 return; 204 this.icaseCache = null; 205 this.setSorted(true); 206 if (this.ranges == null) { 207 this.ranges = new int[tok.ranges.length]; 208 System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length); 209 return; 210 } 211 int[] result = new int[this.ranges.length+tok.ranges.length]; 212 for (int i = 0, j = 0, k = 0; i < this.ranges.length || j < tok.ranges.length;) { 213 if (i >= this.ranges.length) { 214 result[k++] = tok.ranges[j++]; 215 result[k++] = tok.ranges[j++]; 216 } else if (j >= tok.ranges.length) { 217 result[k++] = this.ranges[i++]; 218 result[k++] = this.ranges[i++]; 219 } else if (tok.ranges[j] < this.ranges[i] 220 || tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) { 221 result[k++] = tok.ranges[j++]; 222 result[k++] = tok.ranges[j++]; 223 } else { 224 result[k++] = this.ranges[i++]; 225 result[k++] = this.ranges[i++]; 226 } 227 } 228 this.ranges = result; 229 } 230 subtractRanges(Token token)231 protected void subtractRanges(Token token) { 232 if (token.type == NRANGE) { 233 this.intersectRanges(token); 234 return; 235 } 236 RangeToken tok = (RangeToken)token; 237 if (tok.ranges == null || this.ranges == null) 238 return; 239 this.icaseCache = null; 240 this.sortRanges(); 241 this.compactRanges(); 242 tok.sortRanges(); 243 tok.compactRanges(); 244 245 //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length); 246 247 int[] result = new int[this.ranges.length+tok.ranges.length]; 248 int wp = 0, src = 0, sub = 0; 249 while (src < this.ranges.length && sub < tok.ranges.length) { 250 int srcbegin = this.ranges[src]; 251 int srcend = this.ranges[src+1]; 252 int subbegin = tok.ranges[sub]; 253 int subend = tok.ranges[sub+1]; 254 if (srcend < subbegin) { // Not overlapped 255 // src: o-----o 256 // sub: o-----o 257 // res: o-----o 258 // Reuse sub 259 result[wp++] = this.ranges[src++]; 260 result[wp++] = this.ranges[src++]; 261 } else if (srcend >= subbegin 262 && srcbegin <= subend) { // Overlapped 263 // src: o--------o 264 // sub: o----o 265 // sub: o----o 266 // sub: o----o 267 // sub: o------------o 268 if (subbegin <= srcbegin && srcend <= subend) { 269 // src: o--------o 270 // sub: o------------o 271 // res: empty 272 // Reuse sub 273 src += 2; 274 } else if (subbegin <= srcbegin) { 275 // src: o--------o 276 // sub: o----o 277 // res: o-----o 278 // Reuse src(=res) 279 this.ranges[src] = subend+1; 280 sub += 2; 281 } else if (srcend <= subend) { 282 // src: o--------o 283 // sub: o----o 284 // res: o-----o 285 // Reuse sub 286 result[wp++] = srcbegin; 287 result[wp++] = subbegin-1; 288 src += 2; 289 } else { 290 // src: o--------o 291 // sub: o----o 292 // res: o-o o-o 293 // Reuse src(=right res) 294 result[wp++] = srcbegin; 295 result[wp++] = subbegin-1; 296 this.ranges[src] = subend+1; 297 sub += 2; 298 } 299 } else if (subend < srcbegin) { 300 // Not overlapped 301 // src: o-----o 302 // sub: o----o 303 sub += 2; 304 } else { 305 throw new RuntimeException("Token#subtractRanges(): Internal Error: ["+this.ranges[src] 306 +","+this.ranges[src+1] 307 +"] - ["+tok.ranges[sub] 308 +","+tok.ranges[sub+1] 309 +"]"); 310 } 311 } 312 while (src < this.ranges.length) { 313 result[wp++] = this.ranges[src++]; 314 result[wp++] = this.ranges[src++]; 315 } 316 this.ranges = new int[wp]; 317 System.arraycopy(result, 0, this.ranges, 0, wp); 318 // this.ranges is sorted and compacted. 319 } 320 321 /** 322 * @param tok Ignore whether it is NRANGE or not. 323 */ intersectRanges(Token token)324 protected void intersectRanges(Token token) { 325 RangeToken tok = (RangeToken)token; 326 if (tok.ranges == null || this.ranges == null) 327 return; 328 this.icaseCache = null; 329 this.sortRanges(); 330 this.compactRanges(); 331 tok.sortRanges(); 332 tok.compactRanges(); 333 334 int[] result = new int[this.ranges.length+tok.ranges.length]; 335 int wp = 0, src1 = 0, src2 = 0; 336 while (src1 < this.ranges.length && src2 < tok.ranges.length) { 337 int src1begin = this.ranges[src1]; 338 int src1end = this.ranges[src1+1]; 339 int src2begin = tok.ranges[src2]; 340 int src2end = tok.ranges[src2+1]; 341 if (src1end < src2begin) { // Not overlapped 342 // src1: o-----o 343 // src2: o-----o 344 // res: empty 345 // Reuse src2 346 src1 += 2; 347 } else if (src1end >= src2begin 348 && src1begin <= src2end) { // Overlapped 349 // src1: o--------o 350 // src2: o----o 351 // src2: o----o 352 // src2: o----o 353 // src2: o------------o 354 if (src2begin <= src1begin && src1end <= src2end) { 355 // src1: o--------o 356 // src2: o------------o 357 // res: o--------o 358 // Reuse src2 359 result[wp++] = src1begin; 360 result[wp++] = src1end; 361 src1 += 2; 362 } else if (src2begin <= src1begin) { 363 // src1: o--------o 364 // src2: o----o 365 // res: o--o 366 // Reuse the rest of src1 367 result[wp++] = src1begin; 368 result[wp++] = src2end; 369 this.ranges[src1] = src2end+1; 370 src2 += 2; 371 } else if (src1end <= src2end) { 372 // src1: o--------o 373 // src2: o----o 374 // res: o--o 375 // Reuse src2 376 result[wp++] = src2begin; 377 result[wp++] = src1end; 378 src1 += 2; 379 } else { 380 // src1: o--------o 381 // src2: o----o 382 // res: o----o 383 // Reuse the rest of src1 384 result[wp++] = src2begin; 385 result[wp++] = src2end; 386 this.ranges[src1] = src2end+1; 387 src2 += 2; 388 } 389 } else if (src2end < src1begin) { 390 // Not overlapped 391 // src1: o-----o 392 // src2: o----o 393 src2 += 2; 394 } else { 395 throw new RuntimeException("Token#intersectRanges(): Internal Error: [" 396 +this.ranges[src1] 397 +","+this.ranges[src1+1] 398 +"] & ["+tok.ranges[src2] 399 +","+tok.ranges[src2+1] 400 +"]"); 401 } 402 } 403 this.ranges = new int[wp]; 404 System.arraycopy(result, 0, this.ranges, 0, wp); 405 // this.ranges is sorted and compacted. 406 } 407 408 /** 409 * for RANGE: Creates complement. 410 * for NRANGE: Creates the same meaning RANGE. 411 */ complementRanges(Token token)412 static Token complementRanges(Token token) { 413 if (token.type != RANGE && token.type != NRANGE) 414 throw new IllegalArgumentException("Token#complementRanges(): must be RANGE: "+token.type); 415 RangeToken tok = (RangeToken)token; 416 tok.sortRanges(); 417 tok.compactRanges(); 418 int len = tok.ranges.length+2; 419 if (tok.ranges[0] == 0) 420 len -= 2; 421 int last = tok.ranges[tok.ranges.length-1]; 422 if (last == UTF16_MAX) 423 len -= 2; 424 RangeToken ret = Token.createRange(); 425 ret.ranges = new int[len]; 426 int wp = 0; 427 if (tok.ranges[0] > 0) { 428 ret.ranges[wp++] = 0; 429 ret.ranges[wp++] = tok.ranges[0]-1; 430 } 431 for (int i = 1; i < tok.ranges.length-2; i += 2) { 432 ret.ranges[wp++] = tok.ranges[i]+1; 433 ret.ranges[wp++] = tok.ranges[i+1]-1; 434 } 435 if (last != UTF16_MAX) { 436 ret.ranges[wp++] = last+1; 437 ret.ranges[wp] = UTF16_MAX; 438 } 439 ret.setCompacted(); 440 return ret; 441 } 442 getCaseInsensitiveToken()443 synchronized RangeToken getCaseInsensitiveToken() { 444 if (this.icaseCache != null) 445 return this.icaseCache; 446 447 RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange(); 448 for (int i = 0; i < this.ranges.length; i += 2) { 449 for (int ch = this.ranges[i]; ch <= this.ranges[i+1]; ch ++) { 450 if (ch > 0xffff) 451 uppers.addRange(ch, ch); 452 else { 453 char uch = Character.toUpperCase((char)ch); 454 uppers.addRange(uch, uch); 455 } 456 } 457 } 458 RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange(); 459 for (int i = 0; i < uppers.ranges.length; i += 2) { 460 for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i+1]; ch ++) { 461 if (ch > 0xffff) 462 lowers.addRange(ch, ch); 463 else { 464 char lch = Character.toLowerCase((char)ch); 465 lowers.addRange(lch, lch); 466 } 467 } 468 } 469 lowers.mergeRanges(uppers); 470 lowers.mergeRanges(this); 471 lowers.compactRanges(); 472 473 this.icaseCache = lowers; 474 return lowers; 475 } 476 dumpRanges()477 void dumpRanges() { 478 System.err.print("RANGE: "); 479 if (this.ranges == null) { 480 System.err.println(" NULL"); 481 return; 482 } 483 for (int i = 0; i < this.ranges.length; i += 2) { 484 System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] "); 485 } 486 System.err.println(""); 487 } 488 match(int ch)489 boolean match(int ch) { 490 if (this.map == null) this.createMap(); 491 boolean ret; 492 if (this.type == RANGE) { 493 if (ch < MAPSIZE) 494 return (this.map[ch/32] & (1<<(ch&0x1f))) != 0; 495 ret = false; 496 for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) { 497 if (this.ranges[i] <= ch && ch <= this.ranges[i+1]) 498 return true; 499 } 500 } else { 501 if (ch < MAPSIZE) 502 return (this.map[ch/32] & (1<<(ch&0x1f))) == 0; 503 ret = true; 504 for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) { 505 if (this.ranges[i] <= ch && ch <= this.ranges[i+1]) 506 return false; 507 } 508 } 509 return ret; 510 } 511 512 private static final int MAPSIZE = 256; createMap()513 private void createMap() { 514 int asize = MAPSIZE/32; // 32 is the number of bits in `int'. 515 int [] map = new int[asize]; 516 int nonMapIndex = this.ranges.length; 517 for (int i = 0; i < asize; ++i) { 518 map[i] = 0; 519 } 520 for (int i = 0; i < this.ranges.length; i += 2) { 521 int s = this.ranges[i]; 522 int e = this.ranges[i+1]; 523 if (s < MAPSIZE) { 524 for (int j = s; j <= e && j < MAPSIZE; j++) { 525 map[j/32] |= 1<<(j&0x1f); // s&0x1f : 0-31 526 } 527 } 528 else { 529 nonMapIndex = i; 530 break; 531 } 532 if (e >= MAPSIZE) { 533 nonMapIndex = i; 534 break; 535 } 536 } 537 this.map = map; 538 this.nonMapIndex = nonMapIndex; 539 //for (int i = 0; i < asize; i ++) System.err.println("Map: "+Integer.toString(this.map[i], 16)); 540 } 541 toString(int options)542 public String toString(int options) { 543 String ret; 544 if (this.type == RANGE) { 545 if (this == Token.token_dot) 546 ret = "."; 547 else if (this == Token.token_0to9) 548 ret = "\\d"; 549 else if (this == Token.token_wordchars) 550 ret = "\\w"; 551 else if (this == Token.token_spaces) 552 ret = "\\s"; 553 else { 554 StringBuilder sb = new StringBuilder(); 555 sb.append('['); 556 for (int i = 0; i < this.ranges.length; i += 2) { 557 if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(','); 558 if (this.ranges[i] == this.ranges[i+1]) { 559 sb.append(escapeCharInCharClass(this.ranges[i])); 560 } else { 561 sb.append(escapeCharInCharClass(this.ranges[i])); 562 sb.append('-'); 563 sb.append(escapeCharInCharClass(this.ranges[i+1])); 564 } 565 } 566 sb.append(']'); 567 ret = sb.toString(); 568 } 569 } else { 570 if (this == Token.token_not_0to9) 571 ret = "\\D"; 572 else if (this == Token.token_not_wordchars) 573 ret = "\\W"; 574 else if (this == Token.token_not_spaces) 575 ret = "\\S"; 576 else { 577 StringBuffer sb = new StringBuffer(); 578 sb.append("[^"); 579 for (int i = 0; i < this.ranges.length; i += 2) { 580 if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(','); 581 if (this.ranges[i] == this.ranges[i+1]) { 582 sb.append(escapeCharInCharClass(this.ranges[i])); 583 } else { 584 sb.append(escapeCharInCharClass(this.ranges[i])); 585 sb.append('-'); 586 sb.append(escapeCharInCharClass(this.ranges[i+1])); 587 } 588 } 589 sb.append(']'); 590 ret = sb.toString(); 591 } 592 } 593 return ret; 594 } 595 escapeCharInCharClass(int ch)596 private static String escapeCharInCharClass(int ch) { 597 String ret; 598 switch (ch) { 599 case '[': case ']': case '-': case '^': 600 case ',': case '\\': 601 ret = "\\"+(char)ch; 602 break; 603 case '\f': ret = "\\f"; break; 604 case '\n': ret = "\\n"; break; 605 case '\r': ret = "\\r"; break; 606 case '\t': ret = "\\t"; break; 607 case 0x1b: ret = "\\e"; break; 608 //case 0x0b: ret = "\\v"; break; 609 default: 610 if (ch < 0x20) { 611 String pre = "0"+Integer.toHexString(ch); 612 ret = "\\x"+pre.substring(pre.length()-2, pre.length()); 613 } else if (ch >= 0x10000) { 614 String pre = "0"+Integer.toHexString(ch); 615 ret = "\\v"+pre.substring(pre.length()-6, pre.length()); 616 } else 617 ret = ""+(char)ch; 618 } 619 return ret; 620 } 621 622 } 623