1 /* 2 * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved. 3 */ 4 /* 5 * Licensed to the Apache Software Foundation (ASF) under one or more 6 * contributor license agreements. See the NOTICE file distributed with 7 * this work for additional information regarding copyright ownership. 8 * The ASF licenses this file to You under the Apache License, Version 2.0 9 * (the "License"); you may not use this file except in compliance with 10 * the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 package com.sun.org.apache.xerces.internal.impl.xpath.regex; 22 23 import java.io.IOException; 24 import java.io.ObjectInputStream; 25 import java.io.ObjectOutputStream; 26 import java.io.ObjectStreamField; 27 import java.util.ArrayList; 28 import java.util.Collections; 29 import java.util.HashMap; 30 import java.util.HashSet; 31 import java.util.List; 32 import java.util.Map; 33 import java.util.Set; 34 import java.util.Vector; 35 36 /** 37 * This class represents a node in parse tree. 38 * 39 * @xerces.internal 40 * @LastModified: May 2018 41 */ 42 class Token implements java.io.Serializable { 43 44 private static final long serialVersionUID = 8484976002585487481L; 45 46 static final boolean COUNTTOKENS = true; 47 static int tokens = 0; 48 49 static final int CHAR = 0; // Literal char 50 static final int DOT = 11; // . 51 static final int CONCAT = 1; // XY 52 static final int UNION = 2; // X|Y|Z 53 static final int CLOSURE = 3; // X* 54 static final int RANGE = 4; // [a-zA-Z] etc. 55 static final int NRANGE = 5; // [^a-zA-Z] etc. 
// ---- Token type codes (continued) -----------------------------------------
static final int PAREN = 6;                 // (X) or (?:X)
static final int EMPTY = 7;                 // type of the shared token_empty instance
static final int ANCHOR = 8;                // ^ $ \b \B \< \> \A \Z \z
static final int NONGREEDYCLOSURE = 9;      // *? +?
static final int STRING = 10;               // literal strings
static final int BACKREFERENCE = 12;        // back references
static final int LOOKAHEAD = 20;            // (?=...)
static final int NEGATIVELOOKAHEAD = 21;    // (?!...)
static final int LOOKBEHIND = 22;           // (?<=...)
static final int NEGATIVELOOKBEHIND = 23;   // (?<!...)
static final int INDEPENDENT = 24;          // (?>...)
static final int MODIFIERGROUP = 25;        // (?ims-ims:...)
static final int CONDITION = 26;            // (?(...)yes|no)

/** Largest code point handled by range tokens. */
static final int UTF16_MAX = 0x10ffff;

/** One of the type codes above; fixed at construction. */
final int type;

// ---- Shared, pre-built tokens (populated by the static initializer) --------
static Token token_dot;
static Token token_0to9;
static Token token_wordchars;
static Token token_not_0to9;
static Token token_not_wordchars;
static Token token_spaces;
static Token token_not_spaces;
static Token token_empty;
static Token token_linebeginning;
static Token token_linebeginning2;
static Token token_lineend;
static Token token_stringbeginning;
static Token token_stringend;
static Token token_stringend2;
static Token token_wordedge;
static Token token_not_wordedge;
static Token token_wordbeginning;
static Token token_wordend;

static {
    token_empty = new Token(EMPTY);

    // Anchors are CharTokens of type ANCHOR whose char identifies the anchor.
    token_linebeginning = createAnchor('^');
    token_linebeginning2 = createAnchor('@');
    token_lineend = createAnchor('$');
    token_stringbeginning = createAnchor('A');
    token_stringend = createAnchor('z');
    token_stringend2 = createAnchor('Z');
    token_wordedge = createAnchor('b');
    token_not_wordedge = createAnchor('B');
    token_wordbeginning = createAnchor('<');
    token_wordend = createAnchor('>');

    token_dot = new Token(DOT);

    // Digits: 0-9.
    final Token digits = createRange();
    digits.addRange('0', '9');
    token_0to9 = digits;

    // Word characters: 0-9, A-Z, underscore, a-z.
    final Token wordChars = createRange();
    wordChars.addRange('0', '9');
    wordChars.addRange('A', 'Z');
    wordChars.addRange('_', '_');
    wordChars.addRange('a', 'z');
    token_wordchars = wordChars;

    // White space: tab, line feed, form feed, carriage return, space.
    final Token spaces = createRange();
    spaces.addRange('\t', '\t');
    spaces.addRange('\n', '\n');
    spaces.addRange('\f', '\f');
    spaces.addRange('\r', '\r');
    spaces.addRange(' ', ' ');
    token_spaces = spaces;

    // Negated variants are derived from the positive ones.
    token_not_0to9 = complementRanges(digits);
    token_not_wordchars = complementRanges(wordChars);
    token_not_spaces = complementRanges(spaces);
}

// ---- Factory methods --------------------------------------------------------
// Every factory except createEmpty() bumps the 'tokens' counter while
// COUNTTOKENS is enabled.

/** Creates a ParenToken of the given look-around {@code type} around {@code child}, with paren number 0. */
static Token.ParenToken createLook(int type, Token child) {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new ParenToken(type, child, 0);
}

/** Creates a PAREN token around {@code child} carrying paren number {@code pnumber}. */
static Token.ParenToken createParen(Token child, int pnumber) {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new ParenToken(PAREN, child, pnumber);
}

/** Creates a CLOSURE token (X*) over {@code tok}. */
static Token.ClosureToken createClosure(Token tok) {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new ClosureToken(CLOSURE, tok);
}

/** Creates a NONGREEDYCLOSURE token (*? +?) over {@code tok}. */
static Token.ClosureToken createNGClosure(Token tok) {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new ClosureToken(NONGREEDYCLOSURE, tok);
}

/** Creates a two-child concatenation of {@code tok1} and {@code tok2}. */
static Token.ConcatToken createConcat(Token tok1, Token tok2) {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new ConcatToken(tok1, tok2);
}

/**
 * Creates an empty concatenation to which children can be added.
 * It is intentionally backed by a UnionToken of type CONCAT (not a bug).
 */
static Token.UnionToken createConcat() {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new UnionToken(CONCAT);
}

/** Creates an empty UNION token (X|Y|Z). */
static Token.UnionToken createUnion() {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new UnionToken(UNION);
}

/** Returns the shared {@link #token_empty} instance; nothing is allocated or counted. */
static Token createEmpty() {
    return token_empty;
}

/** Creates an empty RANGE token. */
static RangeToken createRange() {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new RangeToken(RANGE);
}

/** Creates an empty NRANGE token. */
static RangeToken createNRange() {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new RangeToken(NRANGE);
}

/** Creates a CHAR token for {@code ch}. */
static Token.CharToken createChar(int ch) {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new CharToken(CHAR, ch);
}

/** Creates an ANCHOR token identified by {@code ch}; used only by the static initializer here. */
private static Token.CharToken createAnchor(int ch) {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new CharToken(ANCHOR, ch);
}

/** Creates a BACKREFERENCE token for group {@code refno}; it carries no string. */
static Token.StringToken createBackReference(int refno) {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new StringToken(BACKREFERENCE, null, refno);
}

/** Creates a STRING token for {@code str} with reference number 0. */
static Token.StringToken createString(String str) {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new StringToken(STRING, str, 0);
}

/** Creates a modifier group around {@code child} with the given {@code add} and {@code mask} option bits. */
static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new ModifierToken(child, add, mask);
}

/** Creates a CONDITION token from a reference number, a condition token and the yes/no patterns. */
static Token.ConditionToken createCondition(int refno, Token condition,
                                            Token yespat, Token nopat) {
    if (COUNTTOKENS) {
        tokens++;
    }
    return new ConditionToken(refno, condition, yespat, nopat);
}

/** Creates a token of the given type code. */
protected Token(int type) {
    this.type = type;
}
198 */ size()199 int size() { 200 return 0; 201 } getChild(int index)202 Token getChild(int index) { 203 return null; 204 } addChild(Token tok)205 void addChild(Token tok) { 206 throw new RuntimeException("Not supported."); 207 } 208 209 // for RANGE or NRANGE addRange(int start, int end)210 protected void addRange(int start, int end) { 211 throw new RuntimeException("Not supported."); 212 } sortRanges()213 protected void sortRanges() { 214 throw new RuntimeException("Not supported."); 215 } compactRanges()216 protected void compactRanges() { 217 throw new RuntimeException("Not supported."); 218 } mergeRanges(Token tok)219 protected void mergeRanges(Token tok) { 220 throw new RuntimeException("Not supported."); 221 } subtractRanges(Token tok)222 protected void subtractRanges(Token tok) { 223 throw new RuntimeException("Not supported."); 224 } intersectRanges(Token tok)225 protected void intersectRanges(Token tok) { 226 throw new RuntimeException("Not supported."); 227 } complementRanges(Token tok)228 static Token complementRanges(Token tok) { 229 return RangeToken.complementRanges(tok); 230 } 231 232 setMin(int min)233 void setMin(int min) { // for CLOSURE 234 } setMax(int max)235 void setMax(int max) { // for CLOSURE 236 } getMin()237 int getMin() { // for CLOSURE 238 return -1; 239 } getMax()240 int getMax() { // for CLOSURE 241 return -1; 242 } getReferenceNumber()243 int getReferenceNumber() { // for STRING 244 return 0; 245 } getString()246 String getString() { // for STRING 247 return null; 248 } 249 getParenNumber()250 int getParenNumber() { 251 return 0; 252 } getChar()253 int getChar() { 254 return -1; 255 } 256 toString()257 public String toString() { 258 return this.toString(0); 259 } toString(int options)260 public String toString(int options) { 261 return this.type == Token.DOT ? "." : ""; 262 } 263 264 /** 265 * How many characters are needed? 
266 */ getMinLength()267 final int getMinLength() { 268 switch (this.type) { 269 case CONCAT: 270 int sum = 0; 271 for (int i = 0; i < this.size(); i ++) 272 sum += this.getChild(i).getMinLength(); 273 return sum; 274 275 case CONDITION: 276 case UNION: 277 if (this.size() == 0) 278 return 0; 279 int ret = this.getChild(0).getMinLength(); 280 for (int i = 1; i < this.size(); i ++) { 281 int min = this.getChild(i).getMinLength(); 282 if (min < ret) ret = min; 283 } 284 return ret; 285 286 case CLOSURE: 287 case NONGREEDYCLOSURE: 288 if (this.getMin() >= 0) 289 return this.getMin() * this.getChild(0).getMinLength(); 290 return 0; 291 292 case EMPTY: 293 case ANCHOR: 294 return 0; 295 296 case DOT: 297 case CHAR: 298 case RANGE: 299 case NRANGE: 300 return 1; 301 302 case INDEPENDENT: 303 case PAREN: 304 case MODIFIERGROUP: 305 return this.getChild(0).getMinLength(); 306 307 case BACKREFERENCE: 308 return 0; // ******* 309 310 case STRING: 311 return this.getString().length(); 312 313 case LOOKAHEAD: 314 case NEGATIVELOOKAHEAD: 315 case LOOKBEHIND: 316 case NEGATIVELOOKBEHIND: 317 return 0; // ***** Really? 
318 319 default: 320 throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type); 321 } 322 } 323 getMaxLength()324 final int getMaxLength() { 325 switch (this.type) { 326 case CONCAT: 327 int sum = 0; 328 for (int i = 0; i < this.size(); i ++) { 329 int d = this.getChild(i).getMaxLength(); 330 if (d < 0) return -1; 331 sum += d; 332 } 333 return sum; 334 335 case CONDITION: 336 case UNION: 337 if (this.size() == 0) 338 return 0; 339 int ret = this.getChild(0).getMaxLength(); 340 for (int i = 1; ret >= 0 && i < this.size(); i ++) { 341 int max = this.getChild(i).getMaxLength(); 342 if (max < 0) { // infinity 343 ret = -1; 344 break; 345 } 346 if (max > ret) ret = max; 347 } 348 return ret; 349 350 case CLOSURE: 351 case NONGREEDYCLOSURE: 352 if (this.getMax() >= 0) 353 // When this.child.getMaxLength() < 0, 354 // this returns minus value 355 return this.getMax() * this.getChild(0).getMaxLength(); 356 return -1; 357 358 case EMPTY: 359 case ANCHOR: 360 return 0; 361 362 case CHAR: 363 return 1; 364 case DOT: 365 case RANGE: 366 case NRANGE: 367 return 2; 368 369 case INDEPENDENT: 370 case PAREN: 371 case MODIFIERGROUP: 372 return this.getChild(0).getMaxLength(); 373 374 case BACKREFERENCE: 375 return -1; // ****** 376 377 case STRING: 378 return this.getString().length(); 379 380 case LOOKAHEAD: 381 case NEGATIVELOOKAHEAD: 382 case LOOKBEHIND: 383 case NEGATIVELOOKBEHIND: 384 return 0; // ***** Really? 
385 386 default: 387 throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type); 388 } 389 } 390 391 static final int FC_CONTINUE = 0; 392 static final int FC_TERMINAL = 1; 393 static final int FC_ANY = 2; isSet(int options, int flag)394 private static final boolean isSet(int options, int flag) { 395 return (options & flag) == flag; 396 } analyzeFirstCharacter(RangeToken result, int options)397 final int analyzeFirstCharacter(RangeToken result, int options) { 398 switch (this.type) { 399 case CONCAT: 400 int ret = FC_CONTINUE; 401 for (int i = 0; i < this.size(); i ++) 402 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) 403 break; 404 return ret; 405 406 case UNION: 407 if (this.size() == 0) 408 return FC_CONTINUE; 409 /* 410 * a|b|c -> FC_TERMINAL 411 * a|.|c -> FC_ANY 412 * a|b| -> FC_CONTINUE 413 */ 414 int ret2 = FC_CONTINUE; 415 boolean hasEmpty = false; 416 for (int i = 0; i < this.size(); i ++) { 417 ret2 = this.getChild(i).analyzeFirstCharacter(result, options); 418 if (ret2 == FC_ANY) 419 break; 420 else if (ret2 == FC_CONTINUE) 421 hasEmpty = true; 422 } 423 return hasEmpty ? FC_CONTINUE : ret2; 424 425 case CONDITION: 426 int ret3 = this.getChild(0).analyzeFirstCharacter(result, options); 427 if (this.size() == 1) return FC_CONTINUE; 428 if (ret3 == FC_ANY) return ret3; 429 int ret4 = this.getChild(1).analyzeFirstCharacter(result, options); 430 if (ret4 == FC_ANY) return ret4; 431 return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? 
FC_CONTINUE : FC_TERMINAL; 432 433 case CLOSURE: 434 case NONGREEDYCLOSURE: 435 this.getChild(0).analyzeFirstCharacter(result, options); 436 return FC_CONTINUE; 437 438 case EMPTY: 439 case ANCHOR: 440 return FC_CONTINUE; 441 442 case CHAR: 443 int ch = this.getChar(); 444 result.addRange(ch, ch); 445 if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 446 ch = Character.toUpperCase((char)ch); 447 result.addRange(ch, ch); 448 ch = Character.toLowerCase((char)ch); 449 result.addRange(ch, ch); 450 } 451 return FC_TERMINAL; 452 453 case DOT: 454 return FC_ANY; 455 456 case RANGE: 457 result.mergeRanges(this); 458 return FC_TERMINAL; 459 460 case NRANGE: // **** 461 result.mergeRanges(Token.complementRanges(this)); 462 return FC_TERMINAL; 463 464 case INDEPENDENT: 465 case PAREN: 466 return this.getChild(0).analyzeFirstCharacter(result, options); 467 468 case MODIFIERGROUP: 469 options |= ((ModifierToken)this).getOptions(); 470 options &= ~((ModifierToken)this).getOptionsMask(); 471 return this.getChild(0).analyzeFirstCharacter(result, options); 472 473 case BACKREFERENCE: 474 result.addRange(0, UTF16_MAX); // **** We can not optimize. 
475 return FC_ANY; 476 477 case STRING: 478 int cha = this.getString().charAt(0); 479 int ch2; 480 if (REUtil.isHighSurrogate(cha) 481 && this.getString().length() >= 2 482 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1)))) 483 cha = REUtil.composeFromSurrogates(cha, ch2); 484 result.addRange(cha, cha); 485 if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 486 cha = Character.toUpperCase((char)cha); 487 result.addRange(cha, cha); 488 cha = Character.toLowerCase((char)cha); 489 result.addRange(cha, cha); 490 } 491 return FC_TERMINAL; 492 493 case LOOKAHEAD: 494 case NEGATIVELOOKAHEAD: 495 case LOOKBEHIND: 496 case NEGATIVELOOKBEHIND: 497 return FC_CONTINUE; 498 499 default: 500 throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type); 501 } 502 } 503 isShorterThan(Token tok)504 private final boolean isShorterThan(Token tok) { 505 if (tok == null) return false; 506 /* 507 int mylength; 508 if (this.type == STRING) mylength = this.getString().length(); 509 else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1; 510 else throw new RuntimeException("Internal Error: Illegal type: "+this.type); 511 int otherlength; 512 if (tok.type == STRING) otherlength = tok.getString().length(); 513 else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 
2 : 1; 514 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type); 515 */ 516 int mylength; 517 if (this.type == STRING) mylength = this.getString().length(); 518 else throw new RuntimeException("Internal Error: Illegal type: "+this.type); 519 int otherlength; 520 if (tok.type == STRING) otherlength = tok.getString().length(); 521 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type); 522 return mylength < otherlength; 523 } 524 525 static class FixedStringContainer { 526 Token token = null; 527 int options = 0; FixedStringContainer()528 FixedStringContainer() { 529 } 530 } 531 findFixedString(FixedStringContainer container, int options)532 final void findFixedString(FixedStringContainer container, int options) { 533 switch (this.type) { 534 case CONCAT: 535 Token prevToken = null; 536 int prevOptions = 0; 537 for (int i = 0; i < this.size(); i ++) { 538 this.getChild(i).findFixedString(container, options); 539 if (prevToken == null || prevToken.isShorterThan(container.token)) { 540 prevToken = container.token; 541 prevOptions = container.options; 542 } 543 } 544 container.token = prevToken; 545 container.options = prevOptions; 546 return; 547 548 case UNION: 549 case CLOSURE: 550 case NONGREEDYCLOSURE: 551 case EMPTY: 552 case ANCHOR: 553 case RANGE: 554 case DOT: 555 case NRANGE: 556 case BACKREFERENCE: 557 case LOOKAHEAD: 558 case NEGATIVELOOKAHEAD: 559 case LOOKBEHIND: 560 case NEGATIVELOOKBEHIND: 561 case CONDITION: 562 container.token = null; 563 return; 564 565 case CHAR: // Ignore CHAR tokens. 
566 container.token = null; // ** 567 return; // ** 568 569 case STRING: 570 container.token = this; 571 container.options = options; 572 return; 573 574 case INDEPENDENT: 575 case PAREN: 576 this.getChild(0).findFixedString(container, options); 577 return; 578 579 case MODIFIERGROUP: 580 options |= ((ModifierToken)this).getOptions(); 581 options &= ~((ModifierToken)this).getOptionsMask(); 582 this.getChild(0).findFixedString(container, options); 583 return; 584 585 default: 586 throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type); 587 } 588 } 589 match(int ch)590 boolean match(int ch) { 591 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type); 592 } 593 594 // ------------------------------------------------------ 595 private static volatile Map<String, Token> categories = null; 596 private static volatile Map<String, Token> categories2 = null; 597 private static final Object lock = new Object(); 598 private static final String[] categoryNames = { 599 "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd", 600 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs", 601 "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28 602 "Pi", "Pf", // 29, 30 603 "L", "M", "N", "Z", "C", "P", "S", // 31-37 604 }; 605 606 // Schema Rec. 
// Schema Rec. {Datatypes} - Punctuation
// Indexes into categoryNames (and into the ranges array built by getRange())
// for the two quote categories and for the one-letter category groups.
static final int CHAR_INIT_QUOTE  = 29;     // Pi - initial quote
static final int CHAR_FINAL_QUOTE = 30;     // Pf - final quote
static final int CHAR_LETTER = 31;          // "L"
static final int CHAR_MARK = 32;            // "M"
static final int CHAR_NUMBER = 33;          // "N"
static final int CHAR_SEPARATOR = 34;       // "Z"
static final int CHAR_OTHER = 35;           // "C"
static final int CHAR_PUNCTUATION = 36;     // "P"
static final int CHAR_SYMBOL = 37;          // "S"

// Block names of Unicode 3.1 that are supported by the XML Schema REC.
// Entry i below NONBMP_BLOCK_START takes its bounds from blockRanges
// (chars 2*i and 2*i+1); later entries take them from nonBMPBlockRanges.
private static final String[] blockNames = {
    /*0000..007F;*/ "Basic Latin",
    /*0080..00FF;*/ "Latin-1 Supplement",
    /*0100..017F;*/ "Latin Extended-A",
    /*0180..024F;*/ "Latin Extended-B",
    /*0250..02AF;*/ "IPA Extensions",
    /*02B0..02FF;*/ "Spacing Modifier Letters",
    /*0300..036F;*/ "Combining Diacritical Marks",
    /*0370..03FF;*/ "Greek",
    /*0400..04FF;*/ "Cyrillic",
    /*0530..058F;*/ "Armenian",
    /*0590..05FF;*/ "Hebrew",
    /*0600..06FF;*/ "Arabic",
    /*0700..074F;*/ "Syriac",
    /*0780..07BF;*/ "Thaana",
    /*0900..097F;*/ "Devanagari",
    /*0980..09FF;*/ "Bengali",
    /*0A00..0A7F;*/ "Gurmukhi",
    /*0A80..0AFF;*/ "Gujarati",
    /*0B00..0B7F;*/ "Oriya",
    /*0B80..0BFF;*/ "Tamil",
    /*0C00..0C7F;*/ "Telugu",
    /*0C80..0CFF;*/ "Kannada",
    /*0D00..0D7F;*/ "Malayalam",
    /*0D80..0DFF;*/ "Sinhala",
    /*0E00..0E7F;*/ "Thai",
    /*0E80..0EFF;*/ "Lao",
    /*0F00..0FFF;*/ "Tibetan",
    /*1000..109F;*/ "Myanmar",
    /*10A0..10FF;*/ "Georgian",
    /*1100..11FF;*/ "Hangul Jamo",
    /*1200..137F;*/ "Ethiopic",
    /*13A0..13FF;*/ "Cherokee",
    /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
    /*1680..169F;*/ "Ogham",
    /*16A0..16FF;*/ "Runic",
    /*1780..17FF;*/ "Khmer",
    /*1800..18AF;*/ "Mongolian",
    /*1E00..1EFF;*/ "Latin Extended Additional",
    /*1F00..1FFF;*/ "Greek Extended",
    /*2000..206F;*/ "General Punctuation",
    /*2070..209F;*/ "Superscripts and Subscripts",
    /*20A0..20CF;*/ "Currency Symbols",
    /*20D0..20FF;*/ "Combining Marks for Symbols",
    /*2100..214F;*/ "Letterlike Symbols",
    /*2150..218F;*/ "Number Forms",
    /*2190..21FF;*/ "Arrows",
    /*2200..22FF;*/ "Mathematical Operators",
    /*2300..23FF;*/ "Miscellaneous Technical",
    /*2400..243F;*/ "Control Pictures",
    /*2440..245F;*/ "Optical Character Recognition",
    /*2460..24FF;*/ "Enclosed Alphanumerics",
    /*2500..257F;*/ "Box Drawing",
    /*2580..259F;*/ "Block Elements",
    /*25A0..25FF;*/ "Geometric Shapes",
    /*2600..26FF;*/ "Miscellaneous Symbols",
    /*2700..27BF;*/ "Dingbats",
    /*2800..28FF;*/ "Braille Patterns",
    /*2E80..2EFF;*/ "CJK Radicals Supplement",
    /*2F00..2FDF;*/ "Kangxi Radicals",
    /*2FF0..2FFF;*/ "Ideographic Description Characters",
    /*3000..303F;*/ "CJK Symbols and Punctuation",
    /*3040..309F;*/ "Hiragana",
    /*30A0..30FF;*/ "Katakana",
    /*3100..312F;*/ "Bopomofo",
    /*3130..318F;*/ "Hangul Compatibility Jamo",
    /*3190..319F;*/ "Kanbun",
    /*31A0..31BF;*/ "Bopomofo Extended",
    /*3200..32FF;*/ "Enclosed CJK Letters and Months",
    /*3300..33FF;*/ "CJK Compatibility",
    /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
    /*4E00..9FFF;*/ "CJK Unified Ideographs",
    /*A000..A48F;*/ "Yi Syllables",
    /*A490..A4CF;*/ "Yi Radicals",
    /*AC00..D7A3;*/ "Hangul Syllables",
    /*E000..F8FF;*/ "Private Use",
    /*F900..FAFF;*/ "CJK Compatibility Ideographs",
    /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
    /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
    /*FE20..FE2F;*/ "Combining Half Marks",
    /*FE30..FE4F;*/ "CJK Compatibility Forms",
    /*FE50..FE6F;*/ "Small Form Variants",
    /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
    /*FEFF..FEFF;*/ "Specials",
    /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
     // The second "Specials" range (FFF0..FFFD) is not listed here;
     // getRange() adds it manually.
    /*10300..1032F;*/ "Old Italic",         // 84 (first non-BMP entry)
    /*10330..1034F;*/ "Gothic",
    /*10400..1044F;*/ "Deseret",
    /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
    /*1D100..1D1FF;*/ "Musical Symbols",
    /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
    /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
    /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
    /*E0000..E007F;*/ "Tags",
    // The two supplementary "Private Use" ranges are not listed here;
    // getRange() adds them manually.

};
// Ranges that getRange() adds manually on top of the tables:
//   F0000..FFFFD;   "Private Use"
//   100000..10FFFD; "Private Use"
//   FFF0..FFFD;     "Specials"

// Start/end pairs for the BMP blocks, two chars per entry, in blockNames order.
static final String blockRanges =
   "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
    +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
    +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
    +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
    +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
    +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
    +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
    +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
    +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
    +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
    +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";

// Start/end pairs for the blocks from index NONBMP_BLOCK_START onwards.
static final int[] nonBMPBlockRanges = {
    0x10300, 0x1032F,       // 84
    0x10330, 0x1034F,
    0x10400, 0x1044F,
    0x1D000, 0x1D0FF,
    0x1D100, 0x1D1FF,
    0x1D400, 0x1D7FF,
    0x20000, 0x2A6D6,
    0x2F800, 0x2FA1F,
    0xE0000, 0xE007F
};
// Index in blockNames of the first block described by nonBMPBlockRanges.
private static final int NONBMP_BLOCK_START = 84;
positive) { 746 // use local variable for better performance 747 Map<String, Token> localCat = Token.categories; 748 if (localCat == null) { 749 synchronized (lock) { 750 localCat = Token.categories; 751 if (localCat == null) { 752 Map<String, Token> tmpCat = new HashMap<>(); 753 Map<String, Token> tmpCat2 = new HashMap<>(); 754 755 Token[] ranges = new Token[Token.categoryNames.length]; 756 for (int i = 0; i < ranges.length; i ++) { 757 ranges[i] = Token.createRange(); 758 } 759 int type; 760 for (int i = 0; i < 0x10000; i ++) { 761 type = Character.getType((char)i); 762 if (type == Character.START_PUNCTUATION || 763 type == Character.END_PUNCTUATION) { 764 //build table of Pi values 765 if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C || 766 i == 0x201F || i == 0x2039) { 767 type = CHAR_INIT_QUOTE; 768 } 769 //build table of Pf values 770 if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) { 771 type = CHAR_FINAL_QUOTE; 772 } 773 } 774 ranges[type].addRange(i, i); 775 switch (type) { 776 case Character.UPPERCASE_LETTER: 777 case Character.LOWERCASE_LETTER: 778 case Character.TITLECASE_LETTER: 779 case Character.MODIFIER_LETTER: 780 case Character.OTHER_LETTER: 781 type = CHAR_LETTER; 782 break; 783 case Character.NON_SPACING_MARK: 784 case Character.COMBINING_SPACING_MARK: 785 case Character.ENCLOSING_MARK: 786 type = CHAR_MARK; 787 break; 788 case Character.DECIMAL_DIGIT_NUMBER: 789 case Character.LETTER_NUMBER: 790 case Character.OTHER_NUMBER: 791 type = CHAR_NUMBER; 792 break; 793 case Character.SPACE_SEPARATOR: 794 case Character.LINE_SEPARATOR: 795 case Character.PARAGRAPH_SEPARATOR: 796 type = CHAR_SEPARATOR; 797 break; 798 case Character.CONTROL: 799 case Character.FORMAT: 800 case Character.SURROGATE: 801 case Character.PRIVATE_USE: 802 case Character.UNASSIGNED: 803 type = CHAR_OTHER; 804 break; 805 case Character.CONNECTOR_PUNCTUATION: 806 case Character.DASH_PUNCTUATION: 807 case Character.START_PUNCTUATION: 808 case 
Character.END_PUNCTUATION: 809 case CHAR_INIT_QUOTE: 810 case CHAR_FINAL_QUOTE: 811 case Character.OTHER_PUNCTUATION: 812 type = CHAR_PUNCTUATION; 813 break; 814 case Character.MATH_SYMBOL: 815 case Character.CURRENCY_SYMBOL: 816 case Character.MODIFIER_SYMBOL: 817 case Character.OTHER_SYMBOL: 818 type = CHAR_SYMBOL; 819 break; 820 default: 821 throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type); 822 } 823 ranges[type].addRange(i, i); 824 } // for all characters 825 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX); 826 827 for (int i = 0; i < ranges.length; i ++) { 828 if (Token.categoryNames[i] != null) { 829 if (i == Character.UNASSIGNED) { // Unassigned 830 ranges[i].addRange(0x10000, Token.UTF16_MAX); 831 } 832 tmpCat.put(Token.categoryNames[i], ranges[i]); 833 tmpCat2.put(Token.categoryNames[i], 834 Token.complementRanges(ranges[i])); 835 } 836 } 837 //REVISIT: do we really need to support block names as in Unicode 3.1 838 // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)? 
839 // 840 StringBuilder buffer = new StringBuilder(50); 841 for (int i = 0; i < Token.blockNames.length; i ++) { 842 Token r1 = Token.createRange(); 843 int location; 844 if (i < NONBMP_BLOCK_START) { 845 location = i*2; 846 int rstart = Token.blockRanges.charAt(location); 847 int rend = Token.blockRanges.charAt(location+1); 848 //DEBUGING 849 //System.out.println(n+" " +Integer.toHexString(rstart) 850 // +"-"+ Integer.toHexString(rend)); 851 r1.addRange(rstart, rend); 852 } else { 853 location = (i - NONBMP_BLOCK_START) * 2; 854 r1.addRange(Token.nonBMPBlockRanges[location], 855 Token.nonBMPBlockRanges[location + 1]); 856 } 857 String n = Token.blockNames[i]; 858 if (n.equals("Specials")) 859 r1.addRange(0xfff0, 0xfffd); 860 if (n.equals("Private Use")) { 861 r1.addRange(0xF0000,0xFFFFD); 862 r1.addRange(0x100000,0x10FFFD); 863 } 864 tmpCat.put(n, r1); 865 tmpCat2.put(n, Token.complementRanges(r1)); 866 buffer.setLength(0); 867 buffer.append("Is"); 868 if (n.indexOf(' ') >= 0) { 869 for (int ci = 0; ci < n.length(); ci ++) 870 if (n.charAt(ci) != ' ') buffer.append(n.charAt(ci)); 871 } 872 else { 873 buffer.append(n); 874 } 875 Token.setAlias(tmpCat, tmpCat2, buffer.toString(), n, true); 876 } 877 878 // TR#18 1.2 879 Token.setAlias(tmpCat, tmpCat2, "ASSIGNED", "Cn", false); 880 Token.setAlias(tmpCat, tmpCat2, "UNASSIGNED", "Cn", true); 881 Token all = Token.createRange(); 882 all.addRange(0, Token.UTF16_MAX); 883 tmpCat.put("ALL", all); 884 tmpCat2.put("ALL", Token.complementRanges(all)); 885 Token.registerNonXS("ASSIGNED"); 886 Token.registerNonXS("UNASSIGNED"); 887 Token.registerNonXS("ALL"); 888 889 Token isalpha = Token.createRange(); 890 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu 891 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll 892 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo 893 tmpCat.put("IsAlpha", isalpha); 894 tmpCat2.put("IsAlpha", Token.complementRanges(isalpha)); 895 Token.registerNonXS("IsAlpha"); 
896 897 Token isalnum = Token.createRange(); 898 isalnum.mergeRanges(isalpha); // Lu Ll Lo 899 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd 900 tmpCat.put("IsAlnum", isalnum); 901 tmpCat2.put("IsAlnum", Token.complementRanges(isalnum)); 902 Token.registerNonXS("IsAlnum"); 903 904 Token isspace = Token.createRange(); 905 isspace.mergeRanges(Token.token_spaces); 906 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z 907 tmpCat.put("IsSpace", isspace); 908 tmpCat2.put("IsSpace", Token.complementRanges(isspace)); 909 Token.registerNonXS("IsSpace"); 910 911 Token isword = Token.createRange(); 912 isword.mergeRanges(isalnum); // Lu Ll Lo Nd 913 isword.addRange('_', '_'); 914 tmpCat.put("IsWord", isword); 915 tmpCat2.put("IsWord", Token.complementRanges(isword)); 916 Token.registerNonXS("IsWord"); 917 918 Token isascii = Token.createRange(); 919 isascii.addRange(0, 127); 920 tmpCat.put("IsASCII", isascii); 921 tmpCat2.put("IsASCII", Token.complementRanges(isascii)); 922 Token.registerNonXS("IsASCII"); 923 924 Token isnotgraph = Token.createRange(); 925 isnotgraph.mergeRanges(ranges[CHAR_OTHER]); 926 isnotgraph.addRange(' ', ' '); 927 tmpCat.put("IsGraph", Token.complementRanges(isnotgraph)); 928 tmpCat2.put("IsGraph", isnotgraph); 929 Token.registerNonXS("IsGraph"); 930 931 Token isxdigit = Token.createRange(); 932 isxdigit.addRange('0', '9'); 933 isxdigit.addRange('A', 'F'); 934 isxdigit.addRange('a', 'f'); 935 tmpCat.put("IsXDigit", Token.complementRanges(isxdigit)); 936 tmpCat2.put("IsXDigit", isxdigit); 937 Token.registerNonXS("IsXDigit"); 938 939 Token.setAlias(tmpCat, tmpCat2, "IsDigit", "Nd", true); 940 Token.setAlias(tmpCat, tmpCat2, "IsUpper", "Lu", true); 941 Token.setAlias(tmpCat, tmpCat2, "IsLower", "Ll", true); 942 Token.setAlias(tmpCat, tmpCat2, "IsCntrl", "C", true); 943 Token.setAlias(tmpCat, tmpCat2, "IsPrint", "C", false); 944 Token.setAlias(tmpCat, tmpCat2, "IsPunct", "P", true); 945 Token.registerNonXS("IsDigit"); 946 
Token.registerNonXS("IsUpper"); 947 Token.registerNonXS("IsLower"); 948 Token.registerNonXS("IsCntrl"); 949 Token.registerNonXS("IsPrint"); 950 Token.registerNonXS("IsPunct"); 951 952 Token.setAlias(tmpCat, tmpCat2, "alpha", "IsAlpha", true); 953 Token.setAlias(tmpCat, tmpCat2, "alnum", "IsAlnum", true); 954 Token.setAlias(tmpCat, tmpCat2, "ascii", "IsASCII", true); 955 Token.setAlias(tmpCat, tmpCat2, "cntrl", "IsCntrl", true); 956 Token.setAlias(tmpCat, tmpCat2, "digit", "IsDigit", true); 957 Token.setAlias(tmpCat, tmpCat2, "graph", "IsGraph", true); 958 Token.setAlias(tmpCat, tmpCat2, "lower", "IsLower", true); 959 Token.setAlias(tmpCat, tmpCat2, "print", "IsPrint", true); 960 Token.setAlias(tmpCat, tmpCat2, "punct", "IsPunct", true); 961 Token.setAlias(tmpCat, tmpCat2, "space", "IsSpace", true); 962 Token.setAlias(tmpCat, tmpCat2, "upper", "IsUpper", true); 963 Token.setAlias(tmpCat, tmpCat2, "word", "IsWord", true); // Perl extension 964 Token.setAlias(tmpCat, tmpCat2, "xdigit", "IsXDigit", true); 965 Token.registerNonXS("alpha"); 966 Token.registerNonXS("alnum"); 967 Token.registerNonXS("ascii"); 968 Token.registerNonXS("cntrl"); 969 Token.registerNonXS("digit"); 970 Token.registerNonXS("graph"); 971 Token.registerNonXS("lower"); 972 Token.registerNonXS("print"); 973 Token.registerNonXS("punct"); 974 Token.registerNonXS("space"); 975 Token.registerNonXS("upper"); 976 Token.registerNonXS("word"); 977 Token.registerNonXS("xdigit"); 978 Token.categories = localCat = Collections.unmodifiableMap(tmpCat); 979 Token.categories2 = Collections.unmodifiableMap(tmpCat2); 980 } // localCat == null 981 } // synchronized 982 } // if null 983 return positive ? 
(RangeToken)localCat.get(name) 984 : (RangeToken)Token.categories2.get(name); 985 } getRange(String name, boolean positive, boolean xs)986 static protected RangeToken getRange(String name, boolean positive, boolean xs) { 987 RangeToken range = Token.getRange(name, positive); 988 if (xs && range != null && Token.isRegisterNonXS(name)) 989 range = null; 990 return range; 991 } 992 993 static final Set<String> nonxs = Collections.synchronizedSet(new HashSet<>()); 994 /** 995 * This method is called by only getRange(). 996 * So this method need not MT-safe. 997 */ registerNonXS(String name)998 static protected void registerNonXS(String name) { 999 Token.nonxs.add(name); 1000 } 1001 isRegisterNonXS(String name)1002 static protected boolean isRegisterNonXS(String name) { 1003 return Token.nonxs.contains(name); 1004 } 1005 setAlias(Map<String, Token> tmpCat, Map<String, Token> tmpCat2, String newName, String name, boolean positive)1006 private static void setAlias(Map<String, Token> tmpCat, Map<String, Token> tmpCat2, 1007 String newName, String name, boolean positive) { 1008 Token t1 = tmpCat.get(name); 1009 Token t2 = tmpCat2.get(name); 1010 if (positive) { 1011 tmpCat.put(newName, t1); 1012 tmpCat2.put(newName, t2); 1013 } else { 1014 tmpCat2.put(newName, t1); 1015 tmpCat.put(newName, t2); 1016 } 1017 } 1018 1019 // ------------------------------------------------------ 1020 1021 static final String viramaString = 1022 "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1023 +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1024 +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1025 +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1026 +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1027 +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1028 +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1029 +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1030 +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1031 +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI 
VOWEL SIGN PHINTHU;;;; 1032 +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;; 1033 1034 static private Token token_grapheme = null; getGraphemePattern()1035 static synchronized Token getGraphemePattern() { 1036 if (Token.token_grapheme != null) 1037 return Token.token_grapheme; 1038 1039 Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}] 1040 base_char.mergeRanges(Token.getRange("ASSIGNED", true)); 1041 base_char.subtractRanges(Token.getRange("M", true)); 1042 base_char.subtractRanges(Token.getRange("C", true)); 1043 1044 Token virama = Token.createRange(); 1045 for (int i = 0; i < Token.viramaString.length(); i++) { 1046 virama.addRange(i, i); 1047 } 1048 1049 Token combiner_wo_virama = Token.createRange(); 1050 combiner_wo_virama.mergeRanges(Token.getRange("M", true)); 1051 combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final 1052 combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras 1053 1054 Token left = Token.createUnion(); // base_char? 1055 left.addChild(base_char); 1056 left.addChild(Token.token_empty); 1057 1058 Token foo = Token.createUnion(); 1059 foo.addChild(Token.createConcat(virama, Token.getRange("L", true))); 1060 foo.addChild(combiner_wo_virama); 1061 1062 foo = Token.createClosure(foo); 1063 1064 foo = Token.createConcat(left, foo); 1065 1066 Token.token_grapheme = foo; 1067 return Token.token_grapheme; 1068 } 1069 1070 /** 1071 * Combing Character Sequence in Perl 5.6. 
1072 */ 1073 static private Token token_ccs = null; getCombiningCharacterSequence()1074 static synchronized Token getCombiningCharacterSequence() { 1075 if (Token.token_ccs != null) 1076 return Token.token_ccs; 1077 1078 Token foo = Token.createClosure(Token.getRange("M", true)); // \pM* 1079 foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM* 1080 Token.token_ccs = foo; 1081 return Token.token_ccs; 1082 } 1083 1084 // ------------------------------------------------------ 1085 1086 // ------------------------------------------------------ 1087 /** 1088 * This class represents a node in parse tree. 1089 */ 1090 static class StringToken extends Token implements java.io.Serializable { 1091 1092 private static final long serialVersionUID = -4614366944218504172L; 1093 1094 String string; 1095 final int refNumber; 1096 StringToken(int type, String str, int n)1097 StringToken(int type, String str, int n) { 1098 super(type); 1099 this.string = str; 1100 this.refNumber = n; 1101 } 1102 getReferenceNumber()1103 int getReferenceNumber() { // for STRING 1104 return this.refNumber; 1105 } getString()1106 String getString() { // for STRING 1107 return this.string; 1108 } 1109 toString(int options)1110 public String toString(int options) { 1111 if (this.type == BACKREFERENCE) 1112 return "\\"+this.refNumber; 1113 else 1114 return REUtil.quoteMeta(this.string); 1115 } 1116 } 1117 1118 /** 1119 * This class represents a node in parse tree. 1120 */ 1121 static class ConcatToken extends Token implements java.io.Serializable { 1122 1123 private static final long serialVersionUID = 8717321425541346381L; 1124 1125 final Token child; 1126 final Token child2; 1127 ConcatToken(Token t1, Token t2)1128 ConcatToken(Token t1, Token t2) { 1129 super(Token.CONCAT); 1130 this.child = t1; 1131 this.child2 = t2; 1132 } 1133 size()1134 int size() { 1135 return 2; 1136 } getChild(int index)1137 Token getChild(int index) { 1138 return index == 0 ? 
this.child : this.child2; 1139 } 1140 toString(int options)1141 public String toString(int options) { 1142 String ret; 1143 if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) { 1144 ret = this.child.toString(options)+"+"; 1145 } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) { 1146 ret = this.child.toString(options)+"+?"; 1147 } else 1148 ret = this.child.toString(options)+this.child2.toString(options); 1149 return ret; 1150 } 1151 } 1152 1153 /** 1154 * This class represents a node in parse tree. 1155 */ 1156 static class CharToken extends Token implements java.io.Serializable { 1157 1158 private static final long serialVersionUID = -4394272816279496989L; 1159 1160 final int chardata; 1161 CharToken(int type, int ch)1162 CharToken(int type, int ch) { 1163 super(type); 1164 this.chardata = ch; 1165 } 1166 getChar()1167 int getChar() { 1168 return this.chardata; 1169 } 1170 toString(int options)1171 public String toString(int options) { 1172 String ret; 1173 switch (this.type) { 1174 case CHAR: 1175 switch (this.chardata) { 1176 case '|': case '*': case '+': case '?': 1177 case '(': case ')': case '.': case '[': 1178 case '{': case '\\': 1179 ret = "\\"+(char)this.chardata; 1180 break; 1181 case '\f': ret = "\\f"; break; 1182 case '\n': ret = "\\n"; break; 1183 case '\r': ret = "\\r"; break; 1184 case '\t': ret = "\\t"; break; 1185 case 0x1b: ret = "\\e"; break; 1186 //case 0x0b: ret = "\\v"; break; 1187 default: 1188 if (this.chardata >= 0x10000) { 1189 String pre = "0"+Integer.toHexString(this.chardata); 1190 ret = "\\v"+pre.substring(pre.length()-6, pre.length()); 1191 } else 1192 ret = ""+(char)this.chardata; 1193 } 1194 break; 1195 1196 case ANCHOR: 1197 if (this == Token.token_linebeginning || this == Token.token_lineend) 1198 ret = ""+(char)this.chardata; 1199 else 1200 ret = "\\"+(char)this.chardata; 1201 break; 1202 1203 default: 1204 ret = null; 1205 } 1206 return ret; 1207 } 1208 match(int 
ch)1209 boolean match(int ch) { 1210 if (this.type == CHAR) { 1211 return ch == this.chardata; 1212 } else 1213 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type); 1214 } 1215 } 1216 1217 /** 1218 * This class represents a node in parse tree. 1219 */ 1220 static class ClosureToken extends Token implements java.io.Serializable { 1221 1222 private static final long serialVersionUID = 1308971930673997452L; 1223 1224 int min; 1225 int max; 1226 final Token child; 1227 ClosureToken(int type, Token tok)1228 ClosureToken(int type, Token tok) { 1229 super(type); 1230 this.child = tok; 1231 this.setMin(-1); 1232 this.setMax(-1); 1233 } 1234 size()1235 int size() { 1236 return 1; 1237 } getChild(int index)1238 Token getChild(int index) { 1239 return this.child; 1240 } 1241 setMin(int min)1242 final void setMin(int min) { 1243 this.min = min; 1244 } setMax(int max)1245 final void setMax(int max) { 1246 this.max = max; 1247 } getMin()1248 final int getMin() { 1249 return this.min; 1250 } getMax()1251 final int getMax() { 1252 return this.max; 1253 } 1254 toString(int options)1255 public String toString(int options) { 1256 String ret; 1257 if (this.type == CLOSURE) { 1258 if (this.getMin() < 0 && this.getMax() < 0) { 1259 ret = this.child.toString(options)+"*"; 1260 } else if (this.getMin() == this.getMax()) { 1261 ret = this.child.toString(options)+"{"+this.getMin()+"}"; 1262 } else if (this.getMin() >= 0 && this.getMax() >= 0) { 1263 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}"; 1264 } else if (this.getMin() >= 0 && this.getMax() < 0) { 1265 ret = this.child.toString(options)+"{"+this.getMin()+",}"; 1266 } else 1267 throw new RuntimeException("Token#toString(): CLOSURE " 1268 +this.getMin()+", "+this.getMax()); 1269 } else { 1270 if (this.getMin() < 0 && this.getMax() < 0) { 1271 ret = this.child.toString(options)+"*?"; 1272 } else if (this.getMin() == this.getMax()) { 1273 ret = 
this.child.toString(options)+"{"+this.getMin()+"}?"; 1274 } else if (this.getMin() >= 0 && this.getMax() >= 0) { 1275 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?"; 1276 } else if (this.getMin() >= 0 && this.getMax() < 0) { 1277 ret = this.child.toString(options)+"{"+this.getMin()+",}?"; 1278 } else 1279 throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE " 1280 +this.getMin()+", "+this.getMax()); 1281 } 1282 return ret; 1283 } 1284 } 1285 1286 /** 1287 * This class represents a node in parse tree. 1288 */ 1289 static class ParenToken extends Token implements java.io.Serializable { 1290 1291 private static final long serialVersionUID = -5938014719827987704L; 1292 1293 final Token child; 1294 final int parennumber; 1295 ParenToken(int type, Token tok, int paren)1296 ParenToken(int type, Token tok, int paren) { 1297 super(type); 1298 this.child = tok; 1299 this.parennumber = paren; 1300 } 1301 size()1302 int size() { 1303 return 1; 1304 } getChild(int index)1305 Token getChild(int index) { 1306 return this.child; 1307 } 1308 getParenNumber()1309 int getParenNumber() { 1310 return this.parennumber; 1311 } 1312 toString(int options)1313 public String toString(int options) { 1314 String ret = null; 1315 switch (this.type) { 1316 case PAREN: 1317 if (this.parennumber == 0) { 1318 ret = "(?:"+this.child.toString(options)+")"; 1319 } else { 1320 ret = "("+this.child.toString(options)+")"; 1321 } 1322 break; 1323 1324 case LOOKAHEAD: 1325 ret = "(?="+this.child.toString(options)+")"; 1326 break; 1327 case NEGATIVELOOKAHEAD: 1328 ret = "(?!"+this.child.toString(options)+")"; 1329 break; 1330 case LOOKBEHIND: 1331 ret = "(?<="+this.child.toString(options)+")"; 1332 break; 1333 case NEGATIVELOOKBEHIND: 1334 ret = "(?<!"+this.child.toString(options)+")"; 1335 break; 1336 case INDEPENDENT: 1337 ret = "(?>"+this.child.toString(options)+")"; 1338 break; 1339 } 1340 return ret; 1341 } 1342 } 1343 1344 /** 1345 * 
(?(condition)yes-pattern|no-pattern) 1346 */ 1347 static class ConditionToken extends Token implements java.io.Serializable { 1348 1349 private static final long serialVersionUID = 4353765277910594411L; 1350 1351 final int refNumber; 1352 final Token condition; 1353 final Token yes; 1354 final Token no; ConditionToken(int refno, Token cond, Token yespat, Token nopat)1355 ConditionToken(int refno, Token cond, Token yespat, Token nopat) { 1356 super(Token.CONDITION); 1357 this.refNumber = refno; 1358 this.condition = cond; 1359 this.yes = yespat; 1360 this.no = nopat; 1361 } size()1362 int size() { 1363 return this.no == null ? 1 : 2; 1364 } getChild(int index)1365 Token getChild(int index) { 1366 if (index == 0) return this.yes; 1367 if (index == 1) return this.no; 1368 throw new RuntimeException("Internal Error: "+index); 1369 } 1370 toString(int options)1371 public String toString(int options) { 1372 String ret; 1373 if (refNumber > 0) { 1374 ret = "(?("+refNumber+")"; 1375 } else if (this.condition.type == Token.ANCHOR) { 1376 ret = "(?("+this.condition+")"; 1377 } else { 1378 ret = "(?"+this.condition; 1379 } 1380 1381 if (this.no == null) { 1382 ret += this.yes+")"; 1383 } else { 1384 ret += this.yes+"|"+this.no+")"; 1385 } 1386 return ret; 1387 } 1388 } 1389 1390 /** 1391 * (ims-ims: .... 
)
     * <p>
     * Parse-tree node for a modifier group: a child pattern together with the
     * option bits listed before the '-' ({@code add}) and after it ({@code mask}).
     */
    static class ModifierToken extends Token implements java.io.Serializable {

        private static final long serialVersionUID = -9114536559696480356L;

        final Token child;
        final int add;
        final int mask;

        /**
         * @param tok  the pattern inside the group
         * @param add  option bits returned by {@code getOptions()}
         * @param mask option bits returned by {@code getOptionsMask()}
         */
        ModifierToken(Token tok, int add, int mask) {
            super(Token.MODIFIERGROUP);
            this.child = tok;
            this.add = add;
            this.mask = mask;
        }

        /** @return always 1 */
        int size() {
            return 1;
        }

        /** @return the single child, whatever the index */
        Token getChild(int index) {
            return this.child;
        }

        /** @return the {@code add} option bits */
        int getOptions() {
            return this.add;
        }

        /** @return the {@code mask} option bits */
        int getOptionsMask() {
            return this.mask;
        }

        /**
         * Prints the group as {@code (?add-mask:child)}. The '-' separator is
         * written only when there are mask options; without it the mask
         * letters would run into the add letters.
         *
         * @param options formatting options passed through to the child
         * @return the textual form of this modifier group
         */
        public String toString(int options) {
            return "(?"
                +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
                +(this.mask == 0 ? "" : "-"+REUtil.createOptionString(this.mask))
                +":"
                +this.child.toString(options)
                +")";
        }
    }

    /**
     * This class represents a node in parse tree.
     * for UNION or CONCAT.
     */
    static class UnionToken extends Token implements java.io.Serializable {

        private static final long serialVersionUID = -2568843945989489861L;

        /** Child tokens; stays {@code null} until the first child is added. */
        List<Token> children;

        /**
         * @serialField children Vector children
         */
        private static final ObjectStreamField[] serialPersistentFields =
            new ObjectStreamField[] {
                new ObjectStreamField("children", Vector.class),
            };

        UnionToken(int type) {
            super(type);
        }

        /**
         * Appends a child. {@code null} is ignored. For a UNION the child is
         * simply appended. For any other type (CONCAT) a CONCAT child is
         * flattened into this node, and a CHAR/STRING child that follows a
         * CHAR/STRING child is merged with it into a single STRING token.
         *
         * @param tok the token to add; may be {@code null}
         */
        @Override
        void addChild(Token tok) {
            if (tok == null)  return;
            if (this.children == null)  this.children = new ArrayList<>();
            if (this.type == UNION) {
                this.children.add(tok);
                return;
            }
                                                // This is CONCAT, and new child is CONCAT.
            if (tok.type == CONCAT) {
                for (int i = 0;  i < tok.size();  i ++)
                    this.addChild(tok.getChild(i)); // Recursion
                return;
            }
            int size = this.children.size();
            if (size == 0) {
                this.children.add(tok);
                return;
            }
            Token previous = this.children.get(size - 1);
            if (!((previous.type == CHAR || previous.type == STRING)
                  && (tok.type == CHAR || tok.type == STRING))) {
                this.children.add(tok);
                return;
            }

            // Both the last child and the new one are CHAR or STRING: merge them.
            StringBuilder buffer;
            // A CHAR needs up to two chars (surrogate pair) in the buffer.
            int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
            if (previous.type == CHAR) {        // Replace previous token by STRING
                buffer = new StringBuilder(2 + nextMaxLength);
                int ch = previous.getChar();
                if (ch >= 0x10000)
                    buffer.append(REUtil.decomposeToSurrogates(ch));
                else
                    buffer.append((char)ch);
                previous = Token.createString(null);
                this.children.set(size - 1, previous);
            } else {                            // STRING
                buffer = new StringBuilder(previous.getString().length() + nextMaxLength);
                buffer.append(previous.getString());
            }

            if (tok.type == CHAR) {
                int ch = tok.getChar();
                if (ch >= 0x10000)
                    buffer.append(REUtil.decomposeToSurrogates(ch));
                else
                    buffer.append((char)ch);
            } else {
                buffer.append(tok.getString());
            }

            ((StringToken)previous).string = buffer.toString();
        }

        /** @return the number of children; 0 when none has been added yet */
        @Override
        int size() {
            return this.children == null ? 0 : this.children.size();
        }

        /** @return the child at {@code index} */
        @Override
        Token getChild(int index) {
            return this.children.get(index);
        }

        /**
         * Prints this node in regex syntax.
         * <ul>
         *   <li>CONCAT: the children one after the other; a two-child node whose
         *       second child is a (non-greedy) closure over the very same first
         *       child prints as {@code X+} (or {@code X+?}).</li>
         *   <li>Otherwise: the children joined with '|'; a two-child node with an
         *       EMPTY second child prints as {@code X?}, with an EMPTY first
         *       child as {@code X??}.</li>
         * </ul>
         * A node without children prints as the empty string.
         *
         * @param options formatting options passed through to the children
         * @return the textual form of this node
         */
        @Override
        public String toString(int options) {
            // size() copes with children == null, so a child-less node is safe here.
            final int count = this.size();
            if (this.type == CONCAT) {
                if (count == 2) {
                    Token ch = this.getChild(0);
                    Token ch2 = this.getChild(1);
                    if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
                        return ch.toString(options)+"+";
                    }
                    if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
                        return ch.toString(options)+"+?";
                    }
                    return ch.toString(options)+ch2.toString(options);
                }
                StringBuilder sb = new StringBuilder();
                for (int i = 0;  i < count;  i ++) {
                    sb.append(this.getChild(i).toString(options));
                }
                return sb.toString();
            }

            if (count == 0) {
                return "";
            }
            if (count == 2 && this.getChild(1).type == EMPTY) {
                return this.getChild(0).toString(options)+"?";
            }
            if (count == 2 && this.getChild(0).type == EMPTY) {
                return this.getChild(1).toString(options)+"??";
            }
            StringBuilder sb = new StringBuilder();
            sb.append(this.getChild(0).toString(options));
            for (int i = 1;  i < count;  i ++) {
                sb.append('|');
                sb.append(this.getChild(i).toString(options));
            }
            return sb.toString();
        }

        /**
         * @serialData Serialized fields. Convert the List to Vector for backward compatibility.
         */
        private void writeObject(ObjectOutputStream out) throws IOException {
            // Convert List to Vector
            Vector<Token> vChildren = (children == null)? null : new Vector<>(children);

            // Write serialized fields
            ObjectOutputStream.PutField pf = out.putFields();
            pf.put("children", vChildren);
            out.writeFields();
        }

        /**
         * Reads the "children" field written by {@code writeObject} and copies
         * it into a List; {@code children} stays {@code null} when none was stored.
         */
        @SuppressWarnings("unchecked")
        private void readObject(ObjectInputStream in)
                            throws IOException, ClassNotFoundException {
            // We have to read serialized fields first.
            ObjectInputStream.GetField gf = in.readFields();
            Vector<Token> vChildren = (Vector<Token>)gf.get("children", null);

            //convert Vector back to List
            if (vChildren != null) children = new ArrayList<>(vChildren);
        }
    }
}