1 // 2 // This software is now distributed according to 3 // the Lesser Gnu Public License. Please see 4 // http://www.gnu.org/copyleft/lesser.txt for 5 // the details. 6 // -- Happy Computing! 7 // 8 package com.stevesoft.pat; 9 10 import jalview.util.MessageManager; 11 12 import java.io.File; 13 import java.io.FilenameFilter; 14 import java.util.BitSet; 15 import java.util.Hashtable; 16 17 import com.stevesoft.pat.wrap.StringWrap; 18 19 /** Matches a Unicode punctuation character. */ 20 class UnicodePunct extends UniValidator 21 { 22 @Override validate(StringLike s, int from, int to)23 public int validate(StringLike s, int from, int to) 24 { 25 return from < s.length() && Prop.isPunct(s.charAt(from)) ? to : -1; 26 } 27 } 28 29 /** Matches a Unicode white space character. */ 30 class UnicodeWhite extends UniValidator 31 { 32 @Override validate(StringLike s, int from, int to)33 public int validate(StringLike s, int from, int to) 34 { 35 return from < s.length() && Prop.isWhite(s.charAt(from)) ? to : -1; 36 } 37 } 38 39 /** 40 * Matches a character that is not a Unicode punctuation character. 41 */ 42 class NUnicodePunct extends UniValidator 43 { 44 @Override validate(StringLike s, int from, int to)45 public int validate(StringLike s, int from, int to) 46 { 47 return from < s.length() && !Prop.isPunct(s.charAt(from)) ? to : -1; 48 } 49 } 50 51 /** 52 * Matches a character that is not a Unicode white space character. 53 */ 54 class NUnicodeWhite extends UniValidator 55 { 56 @Override validate(StringLike s, int from, int to)57 public int validate(StringLike s, int from, int to) 58 { 59 return from < s.length() && !Prop.isWhite(s.charAt(from)) ? to : -1; 60 } 61 } 62 63 /** Matches a Unicode word character: an alphanumeric or underscore. 
*/ 64 class UnicodeW extends UniValidator 65 { 66 @Override validate(StringLike s, int from, int to)67 public int validate(StringLike s, int from, int to) 68 { 69 if (from >= s.length()) 70 { 71 return -1; 72 } 73 char c = s.charAt(from); 74 return (Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to 75 : -1; 76 } 77 } 78 79 /** Matches a character that is not a Unicode alphanumeric or underscore. */ 80 class NUnicodeW extends UniValidator 81 { 82 @Override validate(StringLike s, int from, int to)83 public int validate(StringLike s, int from, int to) 84 { 85 if (from >= s.length()) 86 { 87 return -1; 88 } 89 char c = s.charAt(from); 90 return !(Prop.isAlphabetic(c) || Prop.isDecimalDigit(c) || c == '_') ? to 91 : -1; 92 } 93 } 94 95 /** Matches a Unicode decimal digit. */ 96 class UnicodeDigit extends UniValidator 97 { 98 @Override validate(StringLike s, int from, int to)99 public int validate(StringLike s, int from, int to) 100 { 101 return from < s.length() && Prop.isDecimalDigit(s.charAt(from)) ? to 102 : -1; 103 } 104 } 105 106 /** Matches a character that is not a Unicode digit. */ 107 class NUnicodeDigit extends UniValidator 108 { 109 @Override validate(StringLike s, int from, int to)110 public int validate(StringLike s, int from, int to) 111 { 112 return from < s.length() && !Prop.isDecimalDigit(s.charAt(from)) ? to 113 : -1; 114 } 115 } 116 117 /** Matches a Unicode math character. */ 118 class UnicodeMath extends UniValidator 119 { 120 @Override validate(StringLike s, int from, int to)121 public int validate(StringLike s, int from, int to) 122 { 123 return from < s.length() && Prop.isMath(s.charAt(from)) ? to : -1; 124 } 125 } 126 127 /** Matches a non-math Unicode character. */ 128 class NUnicodeMath extends UniValidator 129 { 130 @Override validate(StringLike s, int from, int to)131 public int validate(StringLike s, int from, int to) 132 { 133 return from < s.length() && !Prop.isMath(s.charAt(from)) ? 
to : -1; 134 } 135 } 136 137 /** Matches a Unicode currency symbol. */ 138 class UnicodeCurrency extends UniValidator 139 { 140 @Override validate(StringLike s, int from, int to)141 public int validate(StringLike s, int from, int to) 142 { 143 return from < s.length() && Prop.isCurrency(s.charAt(from)) ? to : -1; 144 } 145 } 146 147 /** Matches a non-currency symbol Unicode character. */ 148 class NUnicodeCurrency extends UniValidator 149 { 150 @Override validate(StringLike s, int from, int to)151 public int validate(StringLike s, int from, int to) 152 { 153 return from < s.length() && !Prop.isCurrency(s.charAt(from)) ? to : -1; 154 } 155 } 156 157 /** Matches a Unicode alphabetic character. */ 158 class UnicodeAlpha extends UniValidator 159 { 160 @Override validate(StringLike s, int from, int to)161 public int validate(StringLike s, int from, int to) 162 { 163 return from < s.length() && Prop.isAlphabetic(s.charAt(from)) ? to : -1; 164 } 165 } 166 167 /** Matches a non-alphabetic Unicode character. */ 168 class NUnicodeAlpha extends UniValidator 169 { 170 @Override validate(StringLike s, int from, int to)171 public int validate(StringLike s, int from, int to) 172 { 173 return from < s.length() && !Prop.isAlphabetic(s.charAt(from)) ? to 174 : -1; 175 } 176 } 177 178 /** Matches an upper case Unicode character. */ 179 class UnicodeUpper extends UniValidator 180 { 181 @Override validate(StringLike s, int from, int to)182 public int validate(StringLike s, int from, int to) 183 { 184 return from < s.length() && isUpper(s.charAt(from)) ? to : -1; 185 } 186 isUpper(char c)187 final boolean isUpper(char c) 188 { 189 return c == CaseMgr.toUpperCase(c) && c != CaseMgr.toLowerCase(c); 190 } 191 } 192 193 /** Matches an upper case Unicode character. */ 194 class UnicodeLower extends UniValidator 195 { 196 @Override validate(StringLike s, int from, int to)197 public int validate(StringLike s, int from, int to) 198 { 199 return from < s.length() && isLower(s.charAt(from)) ? 
to : -1; 200 } 201 isLower(char c)202 final boolean isLower(char c) 203 { 204 return c != CaseMgr.toUpperCase(c) && c == CaseMgr.toLowerCase(c); 205 } 206 } 207 208 /** 209 * Regex provides the parser which constructs the linked list of Pattern classes 210 * from a String. 211 * <p> 212 * For the purpose of this documentation, the fact that java interprets the 213 * backslash will be ignored. In practice, however, you will need a double 214 * backslash to obtain a string that contains a single backslash character. 215 * Thus, the example pattern "\b" should really be typed as "\\b" inside java 216 * code. 217 * <p> 218 * Note that Regex is part of package "com.stevesoft.pat". To use it, simply 219 * import com.stevesoft.pat.Regex at the top of your file. 220 * <p> 221 * Regex is made with a constructor that takes a String that defines the regular 222 * expression. Thus, for example 223 * 224 * <pre> 225 * Regex r = new Regex("[a-c]*"); 226 * </pre> 227 * 228 * matches any number of characters so long as the are 'a', 'b', or 'c'). 229 * <p> 230 * To attempt to match the Pattern to a given string, you can use either the 231 * search(String) member function, or the matchAt(String,int position) member 232 * function. These functions return a boolean which tells you whether or not the 233 * thing worked, and sets the methods "charsMatched()" and "matchedFrom()" in 234 * the Regex object appropriately. 235 * <p> 236 * The portion of the string before the match can be obtained by the left() 237 * member, and the portion after the match can be obtained by the right() 238 * member. 239 * <p> 240 * Essentially, this package implements a syntax that is very much like the perl 241 * 5 regular expression syntax. 242 * 243 * Longer example: 244 * 245 * <pre> 246 * Regex r = new Regex("x(a|b)y"); 247 * r.matchAt("xay", 0); 248 * System.out.println("sub = " + r.stringMatched(1)); 249 * </pre> 250 * 251 * The above would print "sub = a". 
*
 * <pre>
 * r.left() // would return "x"
 * r.right() // would return "y"
 * </pre>
 *
 * <p>
 * Differences between this package and perl5:<br>
 * The extended Pattern for setting flags is now supported, but the flags are
 * different. "(?i)" tells the pattern to ignore case, "(?Q)" sets the
 * "dontMatchInQuotes" flag, and "(?iQ)" sets them both. You can change the
 * escape character. The pattern
 *
 * <pre>
 * (?e=#)#d+
 * </pre>
 *
 * is the same as
 *
 * <pre>
 * \d+
 * </pre>
 *
 * , but note that the sequence
 *
 * <pre>
 * (?e=#)
 * </pre>
 *
 * <b>must</b> occur at the very beginning of the pattern. There may be other
 * small differences as well. I will either make my package conform or note them
 * as I become aware of them.
 * <p>
 * This package supports additional patterns not in perl5: <center>
 * <table border="1">
 * <tr>
 * <td>(?@())</td>
 * <td>Group</td>
 * <td>This matches all characters between the '(' character and the balancing
 * ')' character. Thus, it will match "()" as well as "(())". The balancing
 * characters are arbitrary, thus (?@{}) matches on "{}" and "{{}}".</td>
 * </tr>
 * <tr>
 * <td>(?&lt;1)</td>
 * <td>Backup</td>
 * <td>Moves the pointer backwards within the text. This allows you to make a
 * "look behind." It fails if it attempts to move to a position before the
 * beginning of the string. "x(?&lt;1)" is equivalent to "(?=x)". The number, 1
 * in this example, is the number of characters to move backwards.</td>
 * </tr>
 * </table>
 * </center>
 *
 * @author Steven R. Brandt
 * @version package com.stevesoft.pat, release 1.5.3
 * @see Pattern
 */
public class Regex extends RegRes implements FilenameFilter
{
  /**
   * BackRefOffset gives the identity number of the first pattern. Version 1.0
   * used zero, version 1.1 uses 1 to be more compatible with perl.
312 */ 313 static int BackRefOffset = 1; 314 315 private static Pattern none = new NoPattern(); 316 317 Pattern thePattern = none; 318 319 patInt minMatch = new patInt(0); 320 321 static Hashtable validators = new Hashtable(); 322 static 323 { 324 define("p", "(?>1)", new UnicodePunct()); 325 define("P", "(?>1)", new NUnicodePunct()); 326 define("s", "(?>1)", new UnicodeWhite()); 327 define("S", "(?>1)", new NUnicodeWhite()); 328 define("w", "(?>1)", new UnicodeW()); 329 define("W", "(?>1)", new NUnicodeW()); 330 define("d", "(?>1)", new UnicodeDigit()); 331 define("D", "(?>1)", new NUnicodeDigit()); 332 define("m", "(?>1)", new UnicodeMath()); 333 define("M", "(?>1)", new NUnicodeMath()); 334 define("c", "(?>1)", new UnicodeCurrency()); 335 define("C", "(?>1)", new NUnicodeCurrency()); 336 define("a", "(?>1)", new UnicodeAlpha()); 337 define("A", "(?>1)", new NUnicodeAlpha()); 338 define("uc", "(?>1)", new UnicodeUpper()); 339 define("lc", "(?>1)", new UnicodeLower()); 340 } 341 342 /** Set the dontMatch in quotes flag. */ setDontMatchInQuotes(boolean b)343 public void setDontMatchInQuotes(boolean b) 344 { 345 dontMatchInQuotes = b; 346 } 347 348 /** Find out if the dontMatchInQuotes flag is enabled. */ getDontMatchInQuotes()349 public boolean getDontMatchInQuotes() 350 { 351 return dontMatchInQuotes; 352 } 353 354 boolean dontMatchInQuotes = false; 355 356 /** 357 * Set the state of the ignoreCase flag. If set to true, then the pattern 358 * matcher will ignore case when searching for a match. 359 */ setIgnoreCase(boolean b)360 public void setIgnoreCase(boolean b) 361 { 362 ignoreCase = b; 363 } 364 365 /** 366 * Get the state of the ignoreCase flag. Returns true if we are ignoring the 367 * case of the pattern, false otherwise. 368 */ getIgnoreCase()369 public boolean getIgnoreCase() 370 { 371 return ignoreCase; 372 } 373 374 boolean ignoreCase = false; 375 376 static boolean defaultMFlag = false; 377 378 /** 379 * Set the default value of the m flag. 
If it is set to true, then the MFlag 380 * will be on for any regex search executed. 381 */ setDefaultMFlag(boolean mFlag)382 public static void setDefaultMFlag(boolean mFlag) 383 { 384 defaultMFlag = mFlag; 385 } 386 387 /** 388 * Get the default value of the m flag. If it is set to true, then the MFlag 389 * will be on for any regex search executed. 390 */ getDefaultMFlag()391 public static boolean getDefaultMFlag() 392 { 393 return defaultMFlag; 394 } 395 396 /** 397 * Initializes the object without a Pattern. To supply a Pattern use 398 * compile(String s). 399 * 400 * @see com.stevesoft.pat.Regex#compile(java.lang.String) 401 */ Regex()402 public Regex() 403 { 404 } 405 406 /** 407 * Create and compile a Regex, but do not throw any exceptions. If you wish to 408 * have exceptions thrown for syntax errors, you must use the Regex(void) 409 * constructor to create the Regex object, and then call the compile method. 410 * Therefore, you should only call this method when you know your pattern is 411 * right. I will probably become more like 412 * 413 * @see com.stevesoft.pat.Regex#search(java.lang.String) 414 * @see com.stevesoft.pat.Regex#compile(java.lang.String) 415 */ Regex(String s)416 public Regex(String s) 417 { 418 try 419 { 420 compile(s); 421 } catch (RegSyntax rs) 422 { 423 } 424 } 425 426 ReplaceRule rep = null; 427 428 /** 429 * Create and compile both a Regex and a ReplaceRule. 430 * 431 * @see com.stevesoft.pat.ReplaceRule 432 * @see com.stevesoft.pat.Regex#compile(java.lang.String) 433 */ Regex(String s, String rp)434 public Regex(String s, String rp) 435 { 436 this(s); 437 rep = ReplaceRule.perlCode(rp); 438 } 439 440 /** 441 * Create and compile a Regex, but give it the ReplaceRule specified. This 442 * allows the user finer control of the Replacement process, if that is 443 * desired. 
444 * 445 * @see com.stevesoft.pat.ReplaceRule 446 * @see com.stevesoft.pat.Regex#compile(java.lang.String) 447 */ Regex(String s, ReplaceRule rp)448 public Regex(String s, ReplaceRule rp) 449 { 450 this(s); 451 rep = rp; 452 } 453 454 /** 455 * Change the ReplaceRule of this Regex by compiling a new one using String 456 * rp. 457 */ setReplaceRule(String rp)458 public void setReplaceRule(String rp) 459 { 460 rep = ReplaceRule.perlCode(rp); 461 repr = null; // Clear Replacer history 462 } 463 464 /** Change the ReplaceRule of this Regex to rp. */ setReplaceRule(ReplaceRule rp)465 public void setReplaceRule(ReplaceRule rp) 466 { 467 rep = rp; 468 } 469 470 /** 471 * Test to see if a custom defined rule exists. 472 * 473 * @see com.stevesoft.pat#define(java.lang.String,java.lang.String,Validator) 474 */ isDefined(String nm)475 public static boolean isDefined(String nm) 476 { 477 return validators.get(nm) != null; 478 } 479 480 /** 481 * Removes a custom defined rule. 482 * 483 * @see com.stevesoft.pat#define(java.lang.String,java.lang.String,Validator) 484 */ undefine(String nm)485 public static void undefine(String nm) 486 { 487 validators.remove(nm); 488 } 489 490 /** 491 * Defines a method to create a new rule. See test/deriv2.java and 492 * test/deriv3.java for examples of how to use it. 493 */ define(String nm, String pat, Validator v)494 public static void define(String nm, String pat, Validator v) 495 { 496 v.pattern = pat; 497 validators.put(nm, v); 498 } 499 500 /** 501 * Defines a shorthand for a pattern. The pattern will be invoked by a string 502 * that has the form "(??"+nm+")". 503 */ define(String nm, String pat)504 public static void define(String nm, String pat) 505 { 506 validators.put(nm, pat); 507 } 508 509 /** Get the current ReplaceRule. */ getReplaceRule()510 public ReplaceRule getReplaceRule() 511 { 512 return rep; 513 } 514 515 Replacer repr = null; 516 _getReplacer()517 final Replacer _getReplacer() 518 { 519 return repr == null ? 
repr = new Replacer() : repr; 520 } 521 getReplacer()522 public Replacer getReplacer() 523 { 524 if (repr == null) 525 { 526 repr = new Replacer(); 527 } 528 repr.rh.me = this; 529 repr.rh.prev = null; 530 return repr; 531 } 532 533 /** 534 * Replace the first occurence of this pattern in String s according to the 535 * ReplaceRule. 536 * 537 * @see com.stevesoft.pat.ReplaceRule 538 * @see com.stevesoft.pat.Regex#getReplaceRule() 539 */ replaceFirst(String s)540 public String replaceFirst(String s) 541 { 542 return _getReplacer().replaceFirstRegion(s, this, 0, s.length()) 543 .toString(); 544 } 545 546 /** 547 * Replace the first occurence of this pattern in String s beginning with 548 * position pos according to the ReplaceRule. 549 * 550 * @see com.stevesoft.pat.ReplaceRule 551 * @see com.stevesoft.pat.Regex#getReplaceRule() 552 */ replaceFirstFrom(String s, int pos)553 public String replaceFirstFrom(String s, int pos) 554 { 555 return _getReplacer().replaceFirstRegion(s, this, pos, s.length()) 556 .toString(); 557 } 558 559 /** 560 * Replace the first occurence of this pattern in String s beginning with 561 * position start and ending with end according to the ReplaceRule. 562 * 563 * @see com.stevesoft.pat.ReplaceRule 564 * @see com.stevesoft.pat.Regex#getReplaceRule() 565 */ replaceFirstRegion(String s, int start, int end)566 public String replaceFirstRegion(String s, int start, int end) 567 { 568 return _getReplacer().replaceFirstRegion(s, this, start, end) 569 .toString(); 570 } 571 572 /** 573 * Replace all occurences of this pattern in String s according to the 574 * ReplaceRule. 
575 * 576 * @see com.stevesoft.pat.ReplaceRule 577 * @see com.stevesoft.pat.Regex#getReplaceRule() 578 */ replaceAll(String s)579 public String replaceAll(String s) 580 { 581 return _getReplacer().replaceAllRegion(s, this, 0, s.length()) 582 .toString(); 583 } 584 replaceAll(StringLike s)585 public StringLike replaceAll(StringLike s) 586 { 587 return _getReplacer().replaceAllRegion(s, this, 0, s.length()); 588 } 589 590 /** 591 * Replace all occurences of this pattern in String s beginning with position 592 * pos according to the ReplaceRule. 593 * 594 * @see com.stevesoft.pat.ReplaceRule 595 * @see com.stevesoft.pat.Regex#getReplaceRule() 596 */ replaceAllFrom(String s, int pos)597 public String replaceAllFrom(String s, int pos) 598 { 599 return _getReplacer().replaceAllRegion(s, this, pos, s.length()) 600 .toString(); 601 } 602 603 /** 604 * Replace all occurences of this pattern in String s beginning with position 605 * start and ending with end according to the ReplaceRule. 606 * 607 * @see com.stevesoft.pat.ReplaceRule 608 * @see com.stevesoft.pat.Regex#getReplaceRule() 609 */ replaceAllRegion(String s, int start, int end)610 public String replaceAllRegion(String s, int start, int end) 611 { 612 return _getReplacer().replaceAllRegion(s, this, start, end).toString(); 613 } 614 615 /** Essentially clones the Regex object */ Regex(Regex r)616 public Regex(Regex r) 617 { 618 super(r); 619 dontMatchInQuotes = r.dontMatchInQuotes; 620 esc = r.esc; 621 ignoreCase = r.ignoreCase; 622 gFlag = r.gFlag; 623 if (r.rep == null) 624 { 625 rep = null; 626 } 627 else 628 { 629 rep = (ReplaceRule) r.rep.clone(); 630 } 631 /* 632 * try { compile(r.toString()); } catch(RegSyntax r_) {} 633 */ 634 thePattern = r.thePattern.clone(new Hashtable()); 635 minMatch = r.minMatch; 636 skipper = r.skipper; 637 } 638 639 /** 640 * By default, the escape character is the backslash, but you can make it 641 * anything you want by setting this variable. 
642 */ 643 public char esc = Pattern.ESC; 644 645 /** 646 * This method compiles a regular expression, making it possible to call the 647 * search or matchAt methods. 648 * 649 * @exception com.stevesoft.pat.RegSyntax 650 * is thrown if a syntax error is encountered in the pattern. For 651 * example, "x{3,1}" or "*a" are not valid patterns. 652 * @see com.stevesoft.pat.Regex#search 653 * @see com.stevesoft.pat.Regex#matchAt 654 */ compile(String prepat)655 public void compile(String prepat) throws RegSyntax 656 { 657 String postpat = parsePerl.codify(prepat, true); 658 String pat = postpat == null ? prepat : postpat; 659 minMatch = null; 660 ignoreCase = false; 661 dontMatchInQuotes = false; 662 Rthings mk = new Rthings(this); 663 int offset = mk.val; 664 String newpat = pat; 665 thePattern = none; 666 p = null; 667 or = null; 668 minMatch = new patInt(0); 669 StrPos sp = new StrPos(pat, 0); 670 if (sp.incMatch("(?e=")) 671 { 672 char newEsc = sp.c; 673 sp.inc(); 674 if (sp.match(')')) 675 { 676 newpat = reEscape(pat.substring(6), newEsc, Pattern.ESC); 677 } 678 } 679 else if (esc != Pattern.ESC) 680 { 681 newpat = reEscape(pat, esc, Pattern.ESC); 682 } 683 thePattern = _compile(newpat, mk); 684 numSubs_ = mk.val - offset; 685 mk.set(this); 686 } 687 688 /* 689 * If a Regex is compared against a Regex, a check is done to see that the 690 * patterns are equal as well as the most recent match. If a Regex is compare 691 * with a RegRes, only the result of the most recent match is compared. 692 */ 693 @Override equals(Object o)694 public boolean equals(Object o) 695 { 696 if (o instanceof Regex) 697 { 698 if (toString().equals(o.toString())) 699 { 700 return super.equals(o); 701 } 702 else 703 { 704 return false; 705 } 706 } 707 else 708 { 709 return super.equals(o); 710 } 711 } 712 713 /** A clone by any other name would smell as sweet. 
*/ 714 @Override clone()715 public Object clone() 716 { 717 return new Regex(this); 718 } 719 720 /** Return a clone of the underlying RegRes object. */ result()721 public RegRes result() 722 { 723 return (RegRes) super.clone(); 724 } 725 726 // prep sets global variables of class 727 // Pattern so that it can access them 728 // during an attempt at a match 729 Pthings pt = new Pthings(); 730 prep(StringLike s)731 final Pthings prep(StringLike s) 732 { 733 // if(gFlag) 734 pt.lastPos = matchedTo(); 735 if (pt.lastPos < 0) 736 { 737 pt.lastPos = 0; 738 } 739 if ((s == null ? null : s.unwrap()) != (src == null ? null : s.unwrap())) 740 { 741 pt.lastPos = 0; 742 } 743 src = s; 744 pt.dotDoesntMatchCR = dotDoesntMatchCR && (!sFlag); 745 pt.mFlag = (mFlag | defaultMFlag); 746 pt.ignoreCase = ignoreCase; 747 pt.no_check = false; 748 if (pt.marks != null) 749 { 750 for (int i = 0; i < pt.marks.length; i++) 751 { 752 pt.marks[i] = -1; 753 } 754 } 755 pt.marks = null; 756 pt.nMarks = numSubs_; 757 pt.src = s; 758 if (dontMatchInQuotes) 759 { 760 setCbits(s, pt); 761 } 762 else 763 { 764 pt.cbits = null; 765 } 766 return pt; 767 } 768 769 /** 770 * Attempt to match a Pattern beginning at a specified location within the 771 * string. 772 * 773 * @see com.stevesoft.pat.Regex#search 774 */ matchAt(String s, int start_pos)775 public boolean matchAt(String s, int start_pos) 776 { 777 return _search(s, start_pos, start_pos); 778 } 779 780 /** 781 * Attempt to match a Pattern beginning at a specified location within the 782 * StringLike. 783 * 784 * @see com.stevesoft.pat.Regex#search 785 */ matchAt(StringLike s, int start_pos)786 public boolean matchAt(StringLike s, int start_pos) 787 { 788 return _search(s, start_pos, start_pos); 789 } 790 791 /** 792 * Search through a String for the first occurrence of a match. 
793 * 794 * @see com.stevesoft.pat.Regex#searchFrom 795 * @see com.stevesoft.pat.Regex#matchAt 796 */ search(String s)797 public boolean search(String s) 798 { 799 if (s == null) 800 { 801 throw new NullPointerException( 802 MessageManager 803 .getString("exception.null_string_given_to_regex_search")); 804 } 805 return _search(s, 0, s.length()); 806 } 807 search(StringLike sl)808 public boolean search(StringLike sl) 809 { 810 if (sl == null) 811 { 812 throw new NullPointerException( 813 MessageManager 814 .getString("exception.null_string_like_given_to_regex_search")); 815 } 816 return _search(sl, 0, sl.length()); 817 } 818 reverseSearch(String s)819 public boolean reverseSearch(String s) 820 { 821 if (s == null) 822 { 823 throw new NullPointerException( 824 MessageManager 825 .getString("exception.null_string_given_to_regex_reverse_search")); 826 } 827 return _reverseSearch(s, 0, s.length()); 828 } 829 reverseSearch(StringLike sl)830 public boolean reverseSearch(StringLike sl) 831 { 832 if (sl == null) 833 { 834 throw new NullPointerException( 835 MessageManager 836 .getString("exception.null_string_like_given_to_regex_reverse_search")); 837 } 838 return _reverseSearch(sl, 0, sl.length()); 839 } 840 841 /** 842 * Search through a String for the first occurence of a match, but start at 843 * position 844 * 845 * <pre> 846 * start 847 * </pre> 848 */ searchFrom(String s, int start)849 public boolean searchFrom(String s, int start) 850 { 851 if (s == null) 852 { 853 throw new NullPointerException( 854 MessageManager 855 .getString("exception.null_string_like_given_to_regex_search_from")); 856 } 857 return _search(s, start, s.length()); 858 } 859 searchFrom(StringLike s, int start)860 public boolean searchFrom(StringLike s, int start) 861 { 862 if (s == null) 863 { 864 throw new NullPointerException( 865 MessageManager 866 .getString("exception.null_string_like_given_to_regex_search_from")); 867 } 868 return _search(s, start, s.length()); 869 } 870 871 /** 872 * 
Search through a region of a String for the first occurence of a match. 873 */ searchRegion(String s, int start, int end)874 public boolean searchRegion(String s, int start, int end) 875 { 876 if (s == null) 877 { 878 throw new NullPointerException( 879 MessageManager 880 .getString("exception.null_string_like_given_to_regex_search_region")); 881 } 882 return _search(s, start, end); 883 } 884 885 /** 886 * Set this to change the default behavior of the "." pattern. By default it 887 * now matches perl's behavior and fails to match the '\n' character. 888 */ 889 public static boolean dotDoesntMatchCR = true; 890 891 StringLike gFlags; 892 893 int gFlagto = 0; 894 895 boolean gFlag = false; 896 897 /** Set the 'g' flag */ setGFlag(boolean b)898 public void setGFlag(boolean b) 899 { 900 gFlag = b; 901 } 902 903 /** Get the state of the 'g' flag. */ getGFlag()904 public boolean getGFlag() 905 { 906 return gFlag; 907 } 908 909 boolean sFlag = false; 910 911 /** Get the state of the sFlag */ getSFlag()912 public boolean getSFlag() 913 { 914 return sFlag; 915 } 916 917 boolean mFlag = false; 918 919 /** Get the state of the sFlag */ getMFlag()920 public boolean getMFlag() 921 { 922 return mFlag; 923 } 924 _search(String s, int start, int end)925 final boolean _search(String s, int start, int end) 926 { 927 return _search(new StringWrap(s), start, end); 928 } 929 _search(StringLike s, int start, int end)930 final boolean _search(StringLike s, int start, int end) 931 { 932 if (gFlag && gFlagto > 0 && gFlags != null 933 && s.unwrap() == gFlags.unwrap()) 934 { 935 start = gFlagto; 936 } 937 gFlags = null; 938 939 Pthings pt = prep(s); 940 941 int up = (minMatch == null ? 
end : end - minMatch.i); 942 943 if (up < start && end >= start) 944 { 945 up = start; 946 } 947 948 if (skipper == null) 949 { 950 for (int i = start; i <= up; i++) 951 { 952 charsMatched_ = thePattern.matchAt(s, i, pt); 953 if (charsMatched_ >= 0) 954 { 955 matchFrom_ = thePattern.mfrom; 956 marks = pt.marks; 957 gFlagto = matchFrom_ + charsMatched_; 958 gFlags = s; 959 return didMatch_ = true; 960 } 961 } 962 } 963 else 964 { 965 pt.no_check = true; 966 for (int i = start; i <= up; i++) 967 { 968 i = skipper.find(src, i, up); 969 if (i < 0) 970 { 971 charsMatched_ = matchFrom_ = -1; 972 return didMatch_ = false; 973 } 974 charsMatched_ = thePattern.matchAt(s, i, pt); 975 if (charsMatched_ >= 0) 976 { 977 matchFrom_ = thePattern.mfrom; 978 marks = pt.marks; 979 gFlagto = matchFrom_ + charsMatched_; 980 gFlags = s; 981 return didMatch_ = true; 982 } 983 } 984 } 985 return didMatch_ = false; 986 } 987 988 /* 989 * final boolean _search(LongStringLike s,long start,long end) { if(gFlag && 990 * gFlagto > 0 && s==gFlags) start = gFlagto; gFlags = null; 991 * 992 * Pthings pt=prep(s); 993 * 994 * int up = end;//(minMatch == null ? 
end : end-minMatch.i); 995 * 996 * if(up < start && end >= start) up = start; 997 * 998 * if(skipper == null) { for(long i=start;i<=up;i++) { charsMatched_ = 999 * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ = 1000 * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_; 1001 * return didMatch_=true; } } } else { pt.no_check = true; for(long 1002 * i=start;i<=up;i++) { i = skipper.find(src,i,up); if(i<0) { charsMatched_ = 1003 * matchFrom_ = -1; return didMatch_ = false; } charsMatched_ = 1004 * thePattern.matchAt(s,i,pt); if(charsMatched_ >= 0) { matchFrom_ = 1005 * thePattern.mfrom; marks = pt.marks; gFlagto = matchFrom_+charsMatched_; 1006 * gFlags = s; return didMatch_=true; } else { i = s.adjustIndex(i); up = 1007 * s.adjustEnd(i); } } } return didMatch_=false; } 1008 */ 1009 _reverseSearch(String s, int start, int end)1010 boolean _reverseSearch(String s, int start, int end) 1011 { 1012 return _reverseSearch(new StringWrap(s), start, end); 1013 } 1014 _reverseSearch(StringLike s, int start, int end)1015 boolean _reverseSearch(StringLike s, int start, int end) 1016 { 1017 if (gFlag && gFlagto > 0 && s.unwrap() == gFlags.unwrap()) 1018 { 1019 end = gFlagto; 1020 } 1021 gFlags = null; 1022 Pthings pt = prep(s); 1023 for (int i = end; i >= start; i--) 1024 { 1025 charsMatched_ = thePattern.matchAt(s, i, pt); 1026 if (charsMatched_ >= 0) 1027 { 1028 matchFrom_ = thePattern.mfrom; 1029 marks = pt.marks; 1030 gFlagto = matchFrom_ - 1; 1031 gFlags = s; 1032 return didMatch_ = true; 1033 } 1034 } 1035 return didMatch_ = false; 1036 } 1037 1038 // This routine sets the cbits variable 1039 // of class Pattern. Cbits is true for 1040 // the bit corresponding to a character inside 1041 // a set of quotes. 
1042 static StringLike lasts = null; 1043 1044 static BitSet lastbs = null; 1045 setCbits(StringLike s, Pthings pt)1046 static void setCbits(StringLike s, Pthings pt) 1047 { 1048 if (s == lasts) 1049 { 1050 pt.cbits = lastbs; 1051 return; 1052 } 1053 BitSet bs = new BitSet(s.length()); 1054 char qc = ' '; 1055 boolean setBit = false; 1056 for (int i = 0; i < s.length(); i++) 1057 { 1058 if (setBit) 1059 { 1060 bs.set(i); 1061 } 1062 char c = s.charAt(i); 1063 if (!setBit && c == '"') 1064 { 1065 qc = c; 1066 setBit = true; 1067 bs.set(i); 1068 } 1069 else if (!setBit && c == '\'') 1070 { 1071 qc = c; 1072 setBit = true; 1073 bs.set(i); 1074 } 1075 else if (setBit && c == qc) 1076 { 1077 setBit = false; 1078 } 1079 else if (setBit && c == '\\' && i + 1 < s.length()) 1080 { 1081 i++; 1082 if (setBit) 1083 { 1084 bs.set(i); 1085 } 1086 } 1087 } 1088 pt.cbits = lastbs = bs; 1089 lasts = s; 1090 } 1091 1092 // Wanted user to over-ride this in alpha version, 1093 // but it wasn't really necessary because of this trick: newRegex()1094 Regex newRegex() 1095 { 1096 try 1097 { 1098 return getClass().getDeclaredConstructor().newInstance(); 1099 } catch (InstantiationException ie) 1100 { 1101 return null; 1102 } catch (IllegalAccessException iae) 1103 { 1104 return null; 1105 } catch (ReflectiveOperationException roe) 1106 { 1107 return null; 1108 } 1109 } 1110 1111 /** 1112 * Only needed for creating your own extensions of Regex. This method adds the 1113 * next Pattern in the chain of patterns or sets the Pattern if it is the 1114 * first call. 1115 */ add(Pattern p2)1116 protected void add(Pattern p2) 1117 { 1118 if (p == null) 1119 { 1120 p = p2; 1121 } 1122 else 1123 { 1124 p.add(p2); 1125 p2 = p; 1126 } 1127 } 1128 1129 /** 1130 * You only need to use this method if you are creating your own extentions to 1131 * Regex. compile1 compiles one Pattern element, it can be over-ridden to 1132 * allow the Regex compiler to understand new syntax. 
See deriv.java for an 1133 * example. This routine is the heart of class Regex. Rthings has one integer 1134 * member called intValue, it is used to keep track of the number of ()'s in 1135 * the Pattern. 1136 * 1137 * @exception com.stevesoft.pat.RegSyntax 1138 * is thrown when a nonsensensical pattern is supplied. For 1139 * example, a pattern beginning with *. 1140 */ compile1(StrPos sp, Rthings mk)1141 protected void compile1(StrPos sp, Rthings mk) throws RegSyntax 1142 { 1143 if (sp.match('[')) 1144 { 1145 sp.inc(); 1146 add(matchBracket(sp)); 1147 } 1148 else if (sp.match('|')) 1149 { 1150 if (or == null) 1151 { 1152 or = new Or(); 1153 } 1154 if (p == null) 1155 { 1156 p = new NullPattern(); 1157 } 1158 or.addOr(p); 1159 p = null; 1160 } 1161 else if (sp.incMatch("(?<")) 1162 { 1163 patInt i = sp.getPatInt(); 1164 if (i == null) 1165 { 1166 RegSyntaxError.endItAll("No int after (?<"); 1167 } 1168 add(new Backup(i.intValue())); 1169 if (!sp.match(')')) 1170 { 1171 RegSyntaxError.endItAll("No ) after (?<"); 1172 } 1173 } 1174 else if (sp.incMatch("(?>")) 1175 { 1176 patInt i = sp.getPatInt(); 1177 if (i == null) 1178 { 1179 RegSyntaxError.endItAll("No int after (?>"); 1180 } 1181 add(new Backup(-i.intValue())); 1182 if (!sp.match(')')) 1183 { 1184 RegSyntaxError.endItAll("No ) after (?<"); 1185 } 1186 } 1187 else if (sp.incMatch("(?@")) 1188 { 1189 char op = sp.c; 1190 sp.inc(); 1191 char cl = sp.c; 1192 sp.inc(); 1193 if (!sp.match(')')) 1194 { 1195 RegSyntaxError.endItAll("(?@ does not have closing paren"); 1196 } 1197 add(new Group(op, cl)); 1198 } 1199 else if (sp.incMatch("(?#")) 1200 { 1201 while (!sp.match(')')) 1202 { 1203 sp.inc(); 1204 } 1205 } 1206 else if (sp.dontMatch && sp.c == 'w') 1207 { 1208 // Regex r = new Regex(); 1209 // r._compile("[a-zA-Z0-9_]",mk); 1210 // add(new Goop("\\w",r.thePattern)); 1211 Bracket b = new Bracket(false); 1212 b.addOr(new Range('a', 'z')); 1213 b.addOr(new Range('A', 'Z')); 1214 b.addOr(new Range('0', '9')); 1215 
b.addOr(new oneChar('_')); 1216 add(b); 1217 } 1218 else if (sp.dontMatch && sp.c == 'G') 1219 { 1220 add(new BackG()); 1221 } 1222 else if (sp.dontMatch && sp.c == 's') 1223 { 1224 // Regex r = new Regex(); 1225 // r._compile("[ \t\n\r\b]",mk); 1226 // add(new Goop("\\s",r.thePattern)); 1227 Bracket b = new Bracket(false); 1228 b.addOr(new oneChar((char) 32)); 1229 b.addOr(new Range((char) 8, (char) 10)); 1230 b.addOr(new oneChar((char) 13)); 1231 add(b); 1232 } 1233 else if (sp.dontMatch && sp.c == 'd') 1234 { 1235 // Regex r = new Regex(); 1236 // r._compile("[0-9]",mk); 1237 // add(new Goop("\\d",r.thePattern)); 1238 Range digit = new Range('0', '9'); 1239 digit.printBrackets = true; 1240 add(digit); 1241 } 1242 else if (sp.dontMatch && sp.c == 'W') 1243 { 1244 // Regex r = new Regex(); 1245 // r._compile("[^a-zA-Z0-9_]",mk); 1246 // add(new Goop("\\W",r.thePattern)); 1247 Bracket b = new Bracket(true); 1248 b.addOr(new Range('a', 'z')); 1249 b.addOr(new Range('A', 'Z')); 1250 b.addOr(new Range('0', '9')); 1251 b.addOr(new oneChar('_')); 1252 add(b); 1253 } 1254 else if (sp.dontMatch && sp.c == 'S') 1255 { 1256 // Regex r = new Regex(); 1257 // r._compile("[^ \t\n\r\b]",mk); 1258 // add(new Goop("\\S",r.thePattern)); 1259 Bracket b = new Bracket(true); 1260 b.addOr(new oneChar((char) 32)); 1261 b.addOr(new Range((char) 8, (char) 10)); 1262 b.addOr(new oneChar((char) 13)); 1263 add(b); 1264 } 1265 else if (sp.dontMatch && sp.c == 'D') 1266 { 1267 // Regex r = new Regex(); 1268 // r._compile("[^0-9]",mk); 1269 // add(new Goop("\\D",r.thePattern)); 1270 Bracket b = new Bracket(true); 1271 b.addOr(new Range('0', '9')); 1272 add(b); 1273 } 1274 else if (sp.dontMatch && sp.c == 'B') 1275 { 1276 Regex r = new Regex(); 1277 r._compile("(?!" 
+ back_slash + "b)", mk); 1278 add(r.thePattern); 1279 } 1280 else if (isOctalString(sp)) 1281 { 1282 int d = sp.c - '0'; 1283 sp.inc(); 1284 d = 8 * d + sp.c - '0'; 1285 StrPos sp2 = new StrPos(sp); 1286 sp2.inc(); 1287 if (isOctalDigit(sp2, false)) 1288 { 1289 sp.inc(); 1290 d = 8 * d + sp.c - '0'; 1291 } 1292 add(new oneChar((char) d)); 1293 } 1294 else if (sp.dontMatch && sp.c >= '1' && sp.c <= '9') 1295 { 1296 int iv = sp.c - '0'; 1297 StrPos s2 = new StrPos(sp); 1298 s2.inc(); 1299 if (!s2.dontMatch && s2.c >= '0' && s2.c <= '9') 1300 { 1301 iv = 10 * iv + (s2.c - '0'); 1302 sp.inc(); 1303 } 1304 add(new BackMatch(iv)); 1305 } 1306 else if (sp.dontMatch && sp.c == 'b') 1307 { 1308 add(new Boundary()); 1309 } 1310 else if (sp.match('\b')) 1311 { 1312 add(new Boundary()); 1313 } 1314 else if (sp.match('$')) 1315 { 1316 add(new End(true)); 1317 } 1318 else if (sp.dontMatch && sp.c == 'Z') 1319 { 1320 add(new End(false)); 1321 } 1322 else if (sp.match('.')) 1323 { 1324 add(new Any()); 1325 } 1326 else if (sp.incMatch("(??")) 1327 { 1328 StringBuffer sb = new StringBuffer(); 1329 StringBuffer sb2 = new StringBuffer(); 1330 while (!sp.match(')') && !sp.match(':')) 1331 { 1332 sb.append(sp.c); 1333 sp.inc(); 1334 } 1335 if (sp.incMatch(":")) 1336 { 1337 while (!sp.match(')')) 1338 { 1339 sb2.append(sp.c); 1340 sp.inc(); 1341 } 1342 } 1343 String sbs = sb.toString(); 1344 if (validators.get(sbs) instanceof String) 1345 { 1346 String pat = (String) validators.get(sbs); 1347 Regex r = newRegex(); 1348 Rthings rth = new Rthings(this); 1349 rth.noBackRefs = true; 1350 r._compile(pat, rth); 1351 add(r.thePattern); 1352 } 1353 else 1354 { 1355 Custom cm = new Custom(sb.toString()); 1356 if (cm.v != null) 1357 { 1358 Validator v2 = cm.v.arg(sb2.toString()); 1359 if (v2 != null) 1360 { 1361 v2.argsave = sb2.toString(); 1362 String p = cm.v.pattern; 1363 cm.v = v2; 1364 v2.pattern = p; 1365 } 1366 Regex r = newRegex(); 1367 Rthings rth = new Rthings(this); 1368 rth.noBackRefs 
= true; 1369 r._compile(cm.v.pattern, rth); 1370 cm.sub = r.thePattern; 1371 cm.sub.add(new CustomEndpoint(cm)); 1372 cm.sub.setParent(cm); 1373 add(cm); 1374 } 1375 } 1376 } 1377 else if (sp.match('(')) 1378 { 1379 mk.parenLevel++; 1380 Regex r = newRegex(); 1381 // r.or = new Or(); 1382 sp.inc(); 1383 if (sp.incMatch("?:")) 1384 { 1385 r.or = new Or(); 1386 } 1387 else if (sp.incMatch("?=")) 1388 { 1389 r.or = new lookAhead(false); 1390 } 1391 else if (sp.incMatch("?!")) 1392 { 1393 r.or = new lookAhead(true); 1394 } 1395 else if (sp.match('?')) 1396 { 1397 sp.inc(); 1398 do 1399 { 1400 if (sp.c == 'i') 1401 { 1402 mk.ignoreCase = true; 1403 } 1404 if (sp.c == 'Q') 1405 { 1406 mk.dontMatchInQuotes = true; 1407 } 1408 if (sp.c == 'o') 1409 { 1410 mk.optimizeMe = true; 1411 } 1412 if (sp.c == 'g') 1413 { 1414 mk.gFlag = true; 1415 } 1416 if (sp.c == 's') 1417 { 1418 mk.sFlag = true; 1419 } 1420 if (sp.c == 'm') 1421 { 1422 mk.mFlag = true; 1423 } 1424 sp.inc(); 1425 } while (!sp.match(')') && !sp.eos); 1426 r = null; 1427 mk.parenLevel--; 1428 if (sp.eos) // throw new RegSyntax 1429 { 1430 RegSyntaxError.endItAll("Unclosed ()"); 1431 } 1432 } 1433 else 1434 { // just ordinary parenthesis 1435 r.or = mk.noBackRefs ? 
new Or() : new OrMark(mk.val++); 1436 } 1437 if (r != null) 1438 { 1439 add(r._compile(sp, mk)); 1440 } 1441 } 1442 else if (sp.match('^')) 1443 { 1444 add(new Start(true)); 1445 } 1446 else if (sp.dontMatch && sp.c == 'A') 1447 { 1448 add(new Start(false)); 1449 } 1450 else if (sp.match('*')) 1451 { 1452 addMulti(new patInt(0), new patInf()); 1453 } 1454 else if (sp.match('+')) 1455 { 1456 addMulti(new patInt(1), new patInf()); 1457 } 1458 else if (sp.match('?')) 1459 { 1460 addMulti(new patInt(0), new patInt(1)); 1461 } 1462 else if (sp.match('{')) 1463 { 1464 boolean bad = false; 1465 StrPos sp2 = new StrPos(sp); 1466 // StringBuffer sb = new StringBuffer(); 1467 sp.inc(); 1468 patInt i1 = sp.getPatInt(); 1469 patInt i2 = null; 1470 if (sp.match('}')) 1471 { 1472 i2 = i1; 1473 } 1474 else 1475 { 1476 if (!sp.match(',')) 1477 { 1478 /* 1479 * RegSyntaxError.endItAll( "String \"{"+i2+ "\" should be followed 1480 * with , or }"); 1481 */ 1482 bad = true; 1483 } 1484 sp.inc(); 1485 if (sp.match('}')) 1486 { 1487 i2 = new patInf(); 1488 } 1489 else 1490 { 1491 i2 = sp.getPatInt(); 1492 } 1493 } 1494 if (i1 == null || i2 == null) 1495 { 1496 /* 1497 * throw new RegSyntax("Badly formatted Multi: " +"{"+i1+","+i2+"}"); 1498 */ 1499 bad = true; 1500 } 1501 if (bad) 1502 { 1503 sp.dup(sp2); 1504 add(new oneChar(sp.c)); 1505 } 1506 else 1507 { 1508 addMulti(i1, i2); 1509 } 1510 } 1511 else if (sp.escMatch('x') && next2Hex(sp)) 1512 { 1513 sp.inc(); 1514 int d = getHexDigit(sp); 1515 sp.inc(); 1516 d = 16 * d + getHexDigit(sp); 1517 add(new oneChar((char) d)); 1518 } 1519 else if (sp.escMatch('c')) 1520 { 1521 sp.inc(); 1522 if (sp.c < Ctrl.cmap.length) 1523 { 1524 add(new oneChar(Ctrl.cmap[sp.c])); 1525 } 1526 else 1527 { 1528 add(new oneChar(sp.c)); 1529 } 1530 } 1531 else if (sp.escMatch('f')) 1532 { 1533 add(new oneChar((char) 12)); 1534 } 1535 else if (sp.escMatch('a')) 1536 { 1537 add(new oneChar((char) 7)); 1538 } 1539 else if (sp.escMatch('t')) 1540 { 1541 add(new 
oneChar('\t')); 1542 } 1543 else if (sp.escMatch('n')) 1544 { 1545 add(new oneChar('\n')); 1546 } 1547 else if (sp.escMatch('r')) 1548 { 1549 add(new oneChar('\r')); 1550 } 1551 else if (sp.escMatch('b')) 1552 { 1553 add(new oneChar('\b')); 1554 } 1555 else if (sp.escMatch('e')) 1556 { 1557 add(new oneChar((char) 27)); 1558 } 1559 else 1560 { 1561 add(new oneChar(sp.c)); 1562 if (sp.match(')')) 1563 { 1564 RegSyntaxError.endItAll("Unmatched right paren in pattern"); 1565 } 1566 } 1567 } 1568 1569 // compiles all Pattern elements, internal method _compile(String pat, Rthings mk)1570 private Pattern _compile(String pat, Rthings mk) throws RegSyntax 1571 { 1572 minMatch = null; 1573 sFlag = mFlag = ignoreCase = gFlag = false; 1574 StrPos sp = new StrPos(pat, 0); 1575 thePattern = _compile(sp, mk); 1576 pt.marks = null; 1577 return thePattern; 1578 } 1579 1580 Pattern p = null; 1581 1582 Or or = null; 1583 _compile(StrPos sp, Rthings mk)1584 Pattern _compile(StrPos sp, Rthings mk) throws RegSyntax 1585 { 1586 while (!(sp.eos || (or != null && sp.match(')')))) 1587 { 1588 compile1(sp, mk); 1589 sp.inc(); 1590 } 1591 if (sp.match(')')) 1592 { 1593 mk.parenLevel--; 1594 } 1595 else if (sp.eos && mk.parenLevel != 0) 1596 { 1597 RegSyntaxError.endItAll("Unclosed Parenthesis! lvl=" + mk.parenLevel); 1598 } 1599 if (or != null) 1600 { 1601 if (p == null) 1602 { 1603 p = new NullPattern(); 1604 } 1605 or.addOr(p); 1606 return or; 1607 } 1608 return p == null ? 
new NullPattern() : p; 1609 } 1610 1611 // add a multi object to the end of the chain 1612 // which applies to the last object addMulti(patInt i1, patInt i2)1613 void addMulti(patInt i1, patInt i2) throws RegSyntax 1614 { 1615 Pattern last, last2; 1616 for (last = p; last != null && last.next != null; last = last.next) 1617 { 1618 ; 1619 } 1620 if (last == null || last == p) 1621 { 1622 last2 = null; 1623 } 1624 else 1625 { 1626 for (last2 = p; last2.next != last; last2 = last2.next) 1627 { 1628 ; 1629 } 1630 } 1631 if (last instanceof Multi && i1.intValue() == 0 && i2.intValue() == 1) 1632 { 1633 ((Multi) last).matchFewest = true; 1634 } 1635 else if (last instanceof FastMulti && i1.intValue() == 0 1636 && i2.intValue() == 1) 1637 { 1638 ((FastMulti) last).matchFewest = true; 1639 } 1640 else if (last instanceof DotMulti && i1.intValue() == 0 1641 && i2.intValue() == 1) 1642 { 1643 ((DotMulti) last).matchFewest = true; 1644 } 1645 else if (last instanceof Multi || last instanceof DotMulti 1646 || last instanceof FastMulti) 1647 { 1648 throw new RegSyntax("Syntax error."); 1649 } 1650 else if (last2 == null) 1651 { 1652 p = mkMulti(i1, i2, p); 1653 } 1654 else 1655 { 1656 last2.next = mkMulti(i1, i2, last); 1657 } 1658 } 1659 mkMulti(patInt lo, patInt hi, Pattern p)1660 final static Pattern mkMulti(patInt lo, patInt hi, Pattern p) 1661 throws RegSyntax 1662 { 1663 if (p instanceof Any && p.next == null) 1664 { 1665 return new DotMulti(lo, hi); 1666 } 1667 return RegOpt.safe4fm(p) ? 
(Pattern) new FastMulti(lo, hi, p) 1668 : (Pattern) new Multi(lo, hi, p); 1669 } 1670 1671 // process the bracket operator matchBracket(StrPos sp)1672 Pattern matchBracket(StrPos sp) throws RegSyntax 1673 { 1674 Bracket ret; 1675 if (sp.match('^')) 1676 { 1677 ret = new Bracket(true); 1678 sp.inc(); 1679 } 1680 else 1681 { 1682 ret = new Bracket(false); 1683 } 1684 if (sp.match(']')) 1685 { 1686 // throw new RegSyntax 1687 RegSyntaxError.endItAll("Unmatched []"); 1688 } 1689 1690 while (!sp.eos && !sp.match(']')) 1691 { 1692 StrPos s1 = new StrPos(sp); 1693 s1.inc(); 1694 StrPos s1_ = new StrPos(s1); 1695 s1_.inc(); 1696 if (s1.match('-') && !s1_.match(']')) 1697 { 1698 StrPos s2 = new StrPos(s1); 1699 s2.inc(); 1700 if (!s2.eos) 1701 { 1702 ret.addOr(new Range(sp.c, s2.c)); 1703 } 1704 sp.inc(); 1705 sp.inc(); 1706 } 1707 else if (sp.escMatch('Q')) 1708 { 1709 sp.inc(); 1710 while (!sp.escMatch('E')) 1711 { 1712 ret.addOr(new oneChar(sp.c)); 1713 sp.inc(); 1714 } 1715 } 1716 else if (sp.escMatch('d')) 1717 { 1718 ret.addOr(new Range('0', '9')); 1719 } 1720 else if (sp.escMatch('s')) 1721 { 1722 ret.addOr(new oneChar((char) 32)); 1723 ret.addOr(new Range((char) 8, (char) 10)); 1724 ret.addOr(new oneChar((char) 13)); 1725 } 1726 else if (sp.escMatch('w')) 1727 { 1728 ret.addOr(new Range('a', 'z')); 1729 ret.addOr(new Range('A', 'Z')); 1730 ret.addOr(new Range('0', '9')); 1731 ret.addOr(new oneChar('_')); 1732 } 1733 else if (sp.escMatch('D')) 1734 { 1735 ret.addOr(new Range((char) 0, (char) 47)); 1736 ret.addOr(new Range((char) 58, (char) 65535)); 1737 } 1738 else if (sp.escMatch('S')) 1739 { 1740 ret.addOr(new Range((char) 0, (char) 7)); 1741 ret.addOr(new Range((char) 11, (char) 12)); 1742 ret.addOr(new Range((char) 14, (char) 31)); 1743 ret.addOr(new Range((char) 33, (char) 65535)); 1744 } 1745 else if (sp.escMatch('W')) 1746 { 1747 ret.addOr(new Range((char) 0, (char) 64)); 1748 ret.addOr(new Range((char) 91, (char) 94)); 1749 ret.addOr(new oneChar((char) 96)); 
1750 ret.addOr(new Range((char) 123, (char) 65535)); 1751 } 1752 else if (sp.escMatch('x') && next2Hex(sp)) 1753 { 1754 sp.inc(); 1755 int d = getHexDigit(sp); 1756 sp.inc(); 1757 d = 16 * d + getHexDigit(sp); 1758 ret.addOr(new oneChar((char) d)); 1759 } 1760 else if (sp.escMatch('a')) 1761 { 1762 ret.addOr(new oneChar((char) 7)); 1763 } 1764 else if (sp.escMatch('f')) 1765 { 1766 ret.addOr(new oneChar((char) 12)); 1767 } 1768 else if (sp.escMatch('e')) 1769 { 1770 ret.addOr(new oneChar((char) 27)); 1771 } 1772 else if (sp.escMatch('n')) 1773 { 1774 ret.addOr(new oneChar('\n')); 1775 } 1776 else if (sp.escMatch('t')) 1777 { 1778 ret.addOr(new oneChar('\t')); 1779 } 1780 else if (sp.escMatch('r')) 1781 { 1782 ret.addOr(new oneChar('\r')); 1783 } 1784 else if (sp.escMatch('c')) 1785 { 1786 sp.inc(); 1787 if (sp.c < Ctrl.cmap.length) 1788 { 1789 ret.addOr(new oneChar(Ctrl.cmap[sp.c])); 1790 } 1791 else 1792 { 1793 ret.addOr(new oneChar(sp.c)); 1794 } 1795 } 1796 else if (isOctalString(sp)) 1797 { 1798 int d = sp.c - '0'; 1799 sp.inc(); 1800 d = 8 * d + sp.c - '0'; 1801 StrPos sp2 = new StrPos(sp); 1802 sp2.inc(); 1803 if (isOctalDigit(sp2, false)) 1804 { 1805 sp.inc(); 1806 d = 8 * d + sp.c - '0'; 1807 } 1808 ret.addOr(new oneChar((char) d)); 1809 } 1810 else 1811 { 1812 ret.addOr(new oneChar(sp.c)); 1813 } 1814 sp.inc(); 1815 } 1816 return ret; 1817 } 1818 1819 /** 1820 * Converts the stored Pattern to a String -- this is a decompile. Note that 1821 * \t and \n will really print out here, Not just the two character 1822 * representations. Also be prepared to see some strange output if your 1823 * characters are not printable. 
1824 */ 1825 @Override toString()1826 public String toString() 1827 { 1828 if (false && thePattern == null) 1829 { 1830 return ""; 1831 } 1832 else 1833 { 1834 StringBuffer sb = new StringBuffer(); 1835 if (esc != Pattern.ESC) 1836 { 1837 sb.append("(?e="); 1838 sb.append(esc); 1839 sb.append(")"); 1840 } 1841 if (gFlag || mFlag || !dotDoesntMatchCR || sFlag || ignoreCase 1842 || dontMatchInQuotes || optimized()) 1843 { 1844 sb.append("(?"); 1845 if (ignoreCase) 1846 { 1847 sb.append("i"); 1848 } 1849 if (mFlag) 1850 { 1851 sb.append("m"); 1852 } 1853 if (sFlag || !dotDoesntMatchCR) 1854 { 1855 sb.append("s"); 1856 } 1857 if (dontMatchInQuotes) 1858 { 1859 sb.append("Q"); 1860 } 1861 if (optimized()) 1862 { 1863 sb.append("o"); 1864 } 1865 if (gFlag) 1866 { 1867 sb.append("g"); 1868 } 1869 sb.append(")"); 1870 } 1871 String patstr = thePattern.toString(); 1872 if (esc != Pattern.ESC) 1873 { 1874 patstr = reEscape(patstr, Pattern.ESC, esc); 1875 } 1876 sb.append(patstr); 1877 return sb.toString(); 1878 } 1879 } 1880 1881 // Re-escape Pattern, allows us to use a different escape 1882 // character. reEscape(String s, char oldEsc, char newEsc)1883 static String reEscape(String s, char oldEsc, char newEsc) 1884 { 1885 if (oldEsc == newEsc) 1886 { 1887 return s; 1888 } 1889 int i; 1890 StringBuffer sb = new StringBuffer(); 1891 for (i = 0; i < s.length(); i++) 1892 { 1893 if (s.charAt(i) == oldEsc && i + 1 < s.length()) 1894 { 1895 if (s.charAt(i + 1) == oldEsc) 1896 { 1897 sb.append(oldEsc); 1898 } 1899 else 1900 { 1901 sb.append(newEsc); 1902 sb.append(s.charAt(i + 1)); 1903 } 1904 i++; 1905 } 1906 else if (s.charAt(i) == newEsc) 1907 { 1908 sb.append(newEsc); 1909 sb.append(newEsc); 1910 } 1911 else 1912 { 1913 sb.append(s.charAt(i)); 1914 } 1915 } 1916 return sb.toString(); 1917 } 1918 1919 /** 1920 * This method implements FilenameFilter, allowing one to use a Regex to 1921 * search through a directory using File.list. 
There is a FileRegex now that 1922 * does this better. 1923 * 1924 * @see com.stevesoft.pat.FileRegex 1925 */ 1926 @Override accept(File dir, String s)1927 public boolean accept(File dir, String s) 1928 { 1929 return search(s); 1930 } 1931 1932 /** The version of this package */ version()1933 final static public String version() 1934 { 1935 return "lgpl release 1.5.3"; 1936 } 1937 1938 /** 1939 * Once this method is called, the state of variables ignoreCase and 1940 * dontMatchInQuotes should not be changed as the results will be 1941 * unpredictable. However, search and matchAt will run more quickly. Note that 1942 * you can check to see if the pattern has been optimized by calling the 1943 * optimized() method. 1944 * <p> 1945 * This method will attempt to rewrite your pattern in a way that makes it 1946 * faster (not all patterns execute at the same speed). In general, 1947 * "(?: ... )" will be faster than "( ... )" so if you don't need the 1948 * backreference, you should group using the former pattern. 1949 * <p> 1950 * It will also introduce new pattern elements that you can't get to 1951 * otherwise, for example if you have a large table of strings, i.e. the 1952 * months of the year "(January|February|...)" optimize() will make a 1953 * Hashtable that takes it to the next appropriate pattern element -- 1954 * eliminating the need for a linear search. 
1955 * 1956 * @see com.stevesoft.pat.Regex#optimized 1957 * @see com.stevesoft.pat.Regex#ignoreCase 1958 * @see com.stevesoft.pat.Regex#dontMatchInQuotes 1959 * @see com.stevesoft.pat.Regex#matchAt 1960 * @see com.stevesoft.pat.Regex#search 1961 */ optimize()1962 public void optimize() 1963 { 1964 if (optimized() || thePattern == null) 1965 { 1966 return; 1967 } 1968 minMatch = new patInt(0); // thePattern.countMinChars(); 1969 thePattern = RegOpt.opt(thePattern, ignoreCase, dontMatchInQuotes); 1970 skipper = Skip.findSkip(this); 1971 // RegOpt.setParents(this); 1972 return; 1973 } 1974 1975 Skip skipper; 1976 1977 /** 1978 * This function returns true if the optimize method has been called. 1979 */ optimized()1980 public boolean optimized() 1981 { 1982 return minMatch != null; 1983 } 1984 1985 /** 1986 * A bit of syntactic surgar for those who want to make their code look more 1987 * perl-like. To use this initialize your Regex object by saying: 1988 * 1989 * <pre> 1990 * Regex r1 = Regex.perlCode("s/hello/goodbye/"); 1991 * Regex r2 = Regex.perlCode("s'fish'frog'i"); 1992 * Regex r3 = Regex.perlCode("m'hello'); 1993 * </pre> 1994 * 1995 * The i for ignoreCase is supported in this syntax, as well as m, s, and x. 1996 * The g flat is a bit of a special case. 1997 * <p> 1998 * If you wish to replace all occurences of a pattern, you do not put a 'g' in 1999 * the perlCode, but call Regex's replaceAll method. 2000 * <p> 2001 * If you wish to simply and only do a search for r2's pattern, you can do 2002 * this by calling the searchFrom method method repeatedly, or by calling 2003 * search repeatedly if the g flag is set. 2004 * <p> 2005 * Note: Currently perlCode does <em>not</em> support the (?e=#) syntax for 2006 * changing the escape character. 2007 */ 2008 perlCode(String s)2009 public static Regex perlCode(String s) 2010 { 2011 // this file is big enough, see parsePerl.java 2012 // for this function. 
2013 return parsePerl.parse(s); 2014 } 2015 2016 static final char back_slash = '\\'; 2017 2018 /** 2019 * Checks to see if there are only literal and no special pattern elements in 2020 * this Regex. 2021 */ isLiteral()2022 public boolean isLiteral() 2023 { 2024 Pattern x = thePattern; 2025 while (x != null) 2026 { 2027 if (x instanceof oneChar) 2028 { 2029 ; 2030 } 2031 else if (x instanceof Skipped) 2032 { 2033 ; 2034 } 2035 else 2036 { 2037 return false; 2038 } 2039 x = x.next; 2040 } 2041 return true; 2042 } 2043 2044 /** 2045 * You only need to know about this if you are inventing your own pattern 2046 * elements. 2047 */ countMinChars()2048 public patInt countMinChars() 2049 { 2050 return thePattern.countMinChars(); 2051 } 2052 2053 /** 2054 * You only need to know about this if you are inventing your own pattern 2055 * elements. 2056 */ countMaxChars()2057 public patInt countMaxChars() 2058 { 2059 return thePattern.countMaxChars(); 2060 } 2061 isHexDigit(StrPos sp)2062 boolean isHexDigit(StrPos sp) 2063 { 2064 boolean r = !sp.eos 2065 && !sp.dontMatch 2066 && ((sp.c >= '0' && sp.c <= '9') 2067 || (sp.c >= 'a' && sp.c <= 'f') || (sp.c >= 'A' && sp.c <= 'F')); 2068 return r; 2069 } 2070 isOctalDigit(StrPos sp, boolean first)2071 boolean isOctalDigit(StrPos sp, boolean first) 2072 { 2073 boolean r = !sp.eos && !(first ^ sp.dontMatch) && sp.c >= '0' 2074 && sp.c <= '7'; 2075 return r; 2076 } 2077 getHexDigit(StrPos sp)2078 int getHexDigit(StrPos sp) 2079 { 2080 if (sp.c >= '0' && sp.c <= '9') 2081 { 2082 return sp.c - '0'; 2083 } 2084 if (sp.c >= 'a' && sp.c <= 'f') 2085 { 2086 return sp.c - 'a' + 10; 2087 } 2088 return sp.c - 'A' + 10; 2089 } 2090 next2Hex(StrPos sp)2091 boolean next2Hex(StrPos sp) 2092 { 2093 StrPos sp2 = new StrPos(sp); 2094 sp2.inc(); 2095 if (!isHexDigit(sp2)) 2096 { 2097 return false; 2098 } 2099 sp2.inc(); 2100 if (!isHexDigit(sp2)) 2101 { 2102 return false; 2103 } 2104 return true; 2105 } 2106 isOctalString(StrPos sp)2107 boolean 
isOctalString(StrPos sp) 2108 { 2109 if (!isOctalDigit(sp, true)) 2110 { 2111 return false; 2112 } 2113 StrPos sp2 = new StrPos(sp); 2114 sp2.inc(); 2115 if (!isOctalDigit(sp2, false)) 2116 { 2117 return false; 2118 } 2119 return true; 2120 } 2121 } 2122