1 package org.apache.regexp; 2 3 /* 4 * ==================================================================== 5 * 6 * The Apache Software License, Version 1.1 7 * 8 * Copyright (c) 1999 The Apache Software Foundation. All rights 9 * reserved. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in 20 * the documentation and/or other materials provided with the 21 * distribution. 22 * 23 * 3. The end-user documentation included with the redistribution, if 24 * any, must include the following acknowlegement: 25 * "This product includes software developed by the 26 * Apache Software Foundation (http://www.apache.org/)." 27 * Alternately, this acknowlegement may appear in the software itself, 28 * if and wherever such third-party acknowlegements normally appear. 29 * 30 * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software 31 * Foundation" must not be used to endorse or promote products derived 32 * from this software without prior written permission. For written 33 * permission, please contact apache@apache.org. 34 * 35 * 5. Products derived from this software may not be called "Apache" 36 * nor may "Apache" appear in their names without prior written 37 * permission of the Apache Group. 38 * 39 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 40 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 41 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 42 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR 43 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 45 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 46 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 47 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 48 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 49 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 50 * SUCH DAMAGE. 51 * ==================================================================== 52 * 53 * This software consists of voluntary contributions made by many 54 * individuals on behalf of the Apache Software Foundation. For more 55 * information on the Apache Software Foundation, please see 56 * <http://www.apache.org/>. 57 * 58 */ 59 60 import org.apache.regexp.RE; 61 import java.util.Hashtable; 62 63 /** 64 * A regular expression compiler class. This class compiles a pattern string into a 65 * regular expression program interpretable by the RE evaluator class. The 'recompile' 66 * command line tool uses this compiler to pre-compile regular expressions for use 67 * with RE. For a description of the syntax accepted by RECompiler and what you can 68 * do with regular expressions, see the documentation for the RE matcher class. 69 * 70 * @see RE 71 * @see recompile 72 * 73 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a> 74 * @version $Id: RECompiler.java,v 1.2 2000/05/14 21:04:17 jon Exp $ 75 */ 76 public class RECompiler 77 { 78 // The compiled program 79 char[] instruction; // The compiled RE 'program' instruction buffer 80 int lenInstruction; // The amount of the program buffer currently in use 81 82 // Input state for compiling regular expression 83 String pattern; // Input string 84 int len; // Length of the pattern string 85 int idx; // Current input index into ac 86 int parens; // Total number of paren pairs 87 88 // Node flags 89 static final int NODE_NORMAL = 0; // No flags (nothing special) 90 static final int NODE_NULLABLE = 1; // True if node is potentially null 91 static final int NODE_TOPLEVEL = 2; // True if top level expr 92 93 // Special types of 'escapes' 94 static final char ESC_MASK = 0xfff0; // Escape complexity mask 95 static final char ESC_BACKREF = 0xffff; // Escape is really a backreference 96 static final char ESC_COMPLEX = 0xfffe; // Escape isn't really a true character 97 static final char ESC_CLASS = 0xfffd; // Escape represents a whole class of characters 98 99 // {m,n} stacks 100 static final int maxBrackets = 10; // Maximum number of bracket pairs 101 static int brackets = 0; // Number of bracket sets 102 static int[] bracketStart = null; // Starting point 103 static int[] bracketEnd = null; // Ending point 104 static int[] bracketMin = null; // Minimum number of matches 105 static int[] bracketOpt = null; // Additional optional matches 106 static final int bracketUnbounded = -1; // Unbounded value 107 static final int bracketFinished = -2; // Unbounded value 108 109 // Lookup table for POSIX character class names 110 static Hashtable hashPOSIX = new Hashtable(); 111 static 112 { 113 hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM)); 114 hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA)); 115 hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK)); 116 hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL)); 117 hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT)); 118 hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH)); 119 hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER)); 120 hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT)); 121 hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT)); 122 hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE)); 123 hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER)); 124 hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT)); 125 hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART)); 126 hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART)); 127 } 128 129 /** 130 * Constructor. Creates (initially empty) storage for a regular expression program. 131 */ RECompiler()132 public RECompiler() 133 { 134 // Start off with a generous, yet reasonable, initial size 135 instruction = new char[128]; 136 lenInstruction = 0; 137 } 138 139 /** 140 * Ensures that n more characters can fit in the program buffer. 141 * If n more can't fit, then the size is doubled until it can. 142 * @param n Number of additional characters to ensure will fit. 143 */ ensure(int n)144 void ensure(int n) 145 { 146 // Get current program length 147 int curlen = instruction.length; 148 149 // If the current length + n more is too much 150 if (lenInstruction + n >= curlen) 151 { 152 // Double the size of the program array until n more will fit 153 while (lenInstruction + n >= curlen) 154 { 155 curlen *= 2; 156 } 157 158 // Allocate new program array and move data into it 159 char[] newInstruction = new char[curlen]; 160 System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction); 161 instruction = newInstruction; 162 } 163 } 164 165 /** 166 * Emit a single character into the program stream. 167 * @param c Character to add 168 */ emit(char c)169 void emit(char c) 170 { 171 // Make room for character 172 ensure(1); 173 174 // Add character 175 instruction[lenInstruction++] = c; 176 } 177 178 /** 179 * Inserts a node with a given opcode and opdata at insertAt. The node relative next 180 * pointer is initialized to 0. 181 * @param opcode Opcode for new node 182 * @param opdata Opdata for new node (only the low 16 bits are currently used) 183 * @param insertAt Index at which to insert the new node in the program 184 */ nodeInsert(char opcode, int opdata, int insertAt)185 void nodeInsert(char opcode, int opdata, int insertAt) 186 { 187 // Make room for a new node 188 ensure(RE.nodeSize); 189 190 // Move everything from insertAt to the end down nodeSize elements 191 System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt); 192 instruction[insertAt + RE.offsetOpcode] = opcode; 193 instruction[insertAt + RE.offsetOpdata] = (char)opdata; 194 instruction[insertAt + RE.offsetNext] = 0; 195 lenInstruction += RE.nodeSize; 196 } 197 198 /** 199 * Appends a node to the end of a node chain 200 * @param node Start of node chain to traverse 201 * @param pointTo Node to have the tail of the chain point to 202 */ setNextOfEnd(int node, int pointTo)203 void setNextOfEnd(int node, int pointTo) 204 { 205 // Traverse the chain until the next offset is 0 206 int next; 207 while ((next = instruction[node + RE.offsetNext]) != 0) 208 { 209 node += next; 210 } 211 212 // Point the last node in the chain to pointTo. 213 instruction[node + RE.offsetNext] = (char)(short)(pointTo - node); 214 } 215 216 /** 217 * Adds a new node 218 * @param opcode Opcode for node 219 * @param opdata Opdata for node (only the low 16 bits are currently used) 220 * @return Index of new node in program 221 */ node(char opcode, int opdata)222 int node(char opcode, int opdata) 223 { 224 // Make room for a new node 225 ensure(RE.nodeSize); 226 227 // Add new node at end 228 instruction[lenInstruction + RE.offsetOpcode] = opcode; 229 instruction[lenInstruction + RE.offsetOpdata] = (char)opdata; 230 instruction[lenInstruction + RE.offsetNext] = 0; 231 lenInstruction += RE.nodeSize; 232 233 // Return index of new node 234 return lenInstruction - RE.nodeSize; 235 } 236 237 238 /** 239 * Throws a new internal error exception 240 * @exception Error Thrown in the event of an internal error. 241 */ internalError()242 void internalError() throws Error 243 { 244 throw new Error("Internal error!"); 245 } 246 247 /** 248 * Throws a new syntax error exception 249 * @exception RESyntaxException Thrown if the regular expression has invalid syntax. 250 */ syntaxError(String s)251 void syntaxError(String s) throws RESyntaxException 252 { 253 throw new RESyntaxException(s); 254 } 255 256 /** 257 * Allocate storage for brackets only as needed 258 */ allocBrackets()259 void allocBrackets() 260 { 261 // Allocate bracket stacks if not already done 262 if (bracketStart == null) 263 { 264 // Allocate storage 265 bracketStart = new int[maxBrackets]; 266 bracketEnd = new int[maxBrackets]; 267 bracketMin = new int[maxBrackets]; 268 bracketOpt = new int[maxBrackets]; 269 270 // Initialize to invalid values 271 for (int i = 0; i < maxBrackets; i++) 272 { 273 bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1; 274 } 275 } 276 } 277 278 /** 279 * Match bracket {m,n} expression put results in bracket member variables 280 * @exception RESyntaxException Thrown if the regular expression has invalid syntax. 281 */ bracket()282 void bracket() throws RESyntaxException 283 { 284 // Current character must be a '{' 285 if (idx >= len || pattern.charAt(idx++) != '{') 286 { 287 internalError(); 288 } 289 290 // Next char must be a digit 291 if (idx >= len || !Character.isDigit(pattern.charAt(idx))) 292 { 293 syntaxError("Expected digit"); 294 } 295 296 // Get min ('m' of {m,n}) number 297 StringBuffer number = new StringBuffer(); 298 while (idx < len && Character.isDigit(pattern.charAt(idx))) 299 { 300 number.append(pattern.charAt(idx++)); 301 } 302 try 303 { 304 bracketMin[brackets] = Integer.parseInt(number.toString()); 305 } 306 catch (NumberFormatException e) 307 { 308 syntaxError("Expected valid number"); 309 } 310 311 // If out of input, fail 312 if (idx >= len) 313 { 314 syntaxError("Expected comma or right bracket"); 315 } 316 317 // If end of expr, optional limit is 0 318 if (pattern.charAt(idx) == '}') 319 { 320 idx++; 321 bracketOpt[brackets] = 0; 322 return; 323 } 324 325 // Must have at least {m,} and maybe {m,n}. 326 if (idx >= len || pattern.charAt(idx++) != ',') 327 { 328 syntaxError("Expected comma"); 329 } 330 331 // If out of input, fail 332 if (idx >= len) 333 { 334 syntaxError("Expected comma or right bracket"); 335 } 336 337 // If {m,} max is unlimited 338 if (pattern.charAt(idx) == '}') 339 { 340 idx++; 341 bracketOpt[brackets] = bracketUnbounded; 342 return; 343 } 344 345 // Next char must be a digit 346 if (idx >= len || !Character.isDigit(pattern.charAt(idx))) 347 { 348 syntaxError("Expected digit"); 349 } 350 351 // Get max number 352 number.setLength(0); 353 while (idx < len && Character.isDigit(pattern.charAt(idx))) 354 { 355 number.append(pattern.charAt(idx++)); 356 } 357 try 358 { 359 bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets]; 360 } 361 catch (NumberFormatException e) 362 { 363 syntaxError("Expected valid number"); 364 } 365 366 // Optional repetitions must be > 0 367 if (bracketOpt[brackets] <= 0) 368 { 369 syntaxError("Bad range"); 370 } 371 372 // Must have close brace 373 if (idx >= len || pattern.charAt(idx++) != '}') 374 { 375 syntaxError("Missing close brace"); 376 } 377 } 378 379 /** 380 * Match an escape sequence. Handles quoted chars and octal escapes as well 381 * as normal escape characters. Always advances the input stream by the 382 * right amount. This code "understands" the subtle difference between an 383 * octal escape and a backref. You can access the type of ESC_CLASS or 384 * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1]. 385 * @return ESC_* code or character if simple escape 386 * @exception RESyntaxException Thrown if the regular expression has invalid syntax. 387 */ escape()388 char escape() throws RESyntaxException 389 { 390 // "Shouldn't" happen 391 if (pattern.charAt(idx) != '\\') 392 { 393 internalError(); 394 } 395 396 // Escape shouldn't occur as last character in string! 397 if (idx + 1 == len) 398 { 399 syntaxError("Escape terminates string"); 400 } 401 402 // Switch on character after backslash 403 idx += 2; 404 char escapeChar = pattern.charAt(idx - 1); 405 switch (escapeChar) 406 { 407 case RE.E_BOUND: 408 case RE.E_NBOUND: 409 return ESC_COMPLEX; 410 411 case RE.E_ALNUM: 412 case RE.E_NALNUM: 413 case RE.E_SPACE: 414 case RE.E_NSPACE: 415 case RE.E_DIGIT: 416 case RE.E_NDIGIT: 417 return ESC_CLASS; 418 419 case 'u': 420 case 'x': 421 { 422 // Exact required hex digits for escape type 423 int hexDigits = (escapeChar == 'u' ? 4 : 2); 424 425 // Parse up to hexDigits characters from input 426 int val = 0; 427 for ( ; idx < len && hexDigits-- > 0; idx++) 428 { 429 // Get char 430 char c = pattern.charAt(idx); 431 432 // If it's a hexadecimal digit (0-9) 433 if (c >= '0' && c <= '9') 434 { 435 // Compute new value 436 val = (val << 4) + c - '0'; 437 } 438 else 439 { 440 // If it's a hexadecimal letter (a-f) 441 c = Character.toLowerCase(c); 442 if (c >= 'a' && c <= 'f') 443 { 444 // Compute new value 445 val = (val << 4) + (c - 'a') + 10; 446 } 447 else 448 { 449 // If it's not a valid digit or hex letter, the escape must be invalid 450 // because hexDigits of input have not been absorbed yet. 451 syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar); 452 } 453 } 454 } 455 return (char)val; 456 } 457 458 case 't': 459 return '\t'; 460 461 case 'n': 462 return '\n'; 463 464 case 'r': 465 return '\r'; 466 467 case 'f': 468 return '\f'; 469 470 case '0': 471 case '1': 472 case '2': 473 case '3': 474 case '4': 475 case '5': 476 case '6': 477 case '7': 478 case '8': 479 case '9': 480 481 // An octal escape starts with a 0 or has two digits in a row 482 if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0') 483 { 484 // Handle \nnn octal escapes 485 int val = escapeChar - '0'; 486 if (idx < len && Character.isDigit(pattern.charAt(idx))) 487 { 488 val = ((val << 3) + (pattern.charAt(idx++) - '0')); 489 if (idx < len && Character.isDigit(pattern.charAt(idx))) 490 { 491 val = ((val << 3) + (pattern.charAt(idx++) - '0')); 492 } 493 } 494 return (char)val; 495 } 496 497 // It's actually a backreference (\[1-9]), not an escape 498 return ESC_BACKREF; 499 500 default: 501 502 // Simple quoting of a character 503 return escapeChar; 504 } 505 } 506 507 /** 508 * Compile a character class 509 * @return Index of class node 510 * @exception RESyntaxException Thrown if the regular expression has invalid syntax. 511 */ characterClass()512 int characterClass() throws RESyntaxException 513 { 514 // Check for bad calling or empty class 515 if (pattern.charAt(idx) != '[') 516 { 517 internalError(); 518 } 519 520 // Check for unterminated or empty class 521 if ((idx + 1) >= len || pattern.charAt(++idx) == ']') 522 { 523 syntaxError("Empty or unterminated class"); 524 } 525 526 // Check for POSIX character class 527 if (idx < len && pattern.charAt(idx) == ':') 528 { 529 // Skip colon 530 idx++; 531 532 // POSIX character classes are denoted with lowercase ASCII strings 533 int idxStart = idx; 534 while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z') 535 { 536 idx++; 537 } 538 539 // Should be a ":]" to terminate the POSIX character class 540 if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']') 541 { 542 // Get character class 543 String charClass = pattern.substring(idxStart, idx); 544 545 // Select the POSIX class id 546 Character i = (Character)hashPOSIX.get(charClass); 547 if (i != null) 548 { 549 // Move past colon and right bracket 550 idx += 2; 551 552 // Return new POSIX character class node 553 return node(RE.OP_POSIXCLASS, i.charValue()); 554 } 555 syntaxError("Invalid POSIX character class '" + charClass + "'"); 556 } 557 syntaxError("Invalid POSIX character class syntax"); 558 } 559 560 // Try to build a class. Create OP_ANYOF node 561 int ret = node(RE.OP_ANYOF, 0); 562 563 // Parse class declaration 564 char CHAR_INVALID = Character.MAX_VALUE; 565 char last = CHAR_INVALID; 566 char simpleChar = 0; 567 boolean include = true; 568 boolean definingRange = false; 569 int idxFirst = idx; 570 char rangeStart = Character.MIN_VALUE; 571 char rangeEnd; 572 RERange range = new RERange(); 573 while (idx < len && pattern.charAt(idx) != ']') 574 { 575 576 switchOnCharacter: 577 578 // Switch on character 579 switch (pattern.charAt(idx)) 580 { 581 case '^': 582 include = !include; 583 if (idx == idxFirst) 584 { 585 range.include(Character.MIN_VALUE, Character.MAX_VALUE, true); 586 } 587 idx++; 588 continue; 589 590 case '\\': 591 { 592 // Escape always advances the stream 593 char c; 594 switch (c = escape ()) 595 { 596 case ESC_COMPLEX: 597 case ESC_BACKREF: 598 599 // Word boundaries and backrefs not allowed in a character class! 600 syntaxError("Bad character class"); 601 602 case ESC_CLASS: 603 604 // Classes can't be an endpoint of a range 605 if (definingRange) 606 { 607 syntaxError("Bad character class"); 608 } 609 610 // Handle specific type of class (some are ok) 611 switch (pattern.charAt(idx - 1)) 612 { 613 case RE.E_NSPACE: 614 case RE.E_NDIGIT: 615 case RE.E_NALNUM: 616 syntaxError("Bad character class"); 617 618 case RE.E_SPACE: 619 range.include('\t', include); 620 range.include('\r', include); 621 range.include('\f', include); 622 range.include('\n', include); 623 range.include('\b', include); 624 range.include(' ', include); 625 break; 626 627 case RE.E_ALNUM: 628 range.include('a', 'z', include); 629 range.include('A', 'Z', include); 630 range.include('_', include); 631 632 // Fall through! 633 634 case RE.E_DIGIT: 635 range.include('0', '9', include); 636 break; 637 } 638 639 // Make last char invalid (can't be a range start) 640 last = CHAR_INVALID; 641 break; 642 643 default: 644 645 // Escape is simple so treat as a simple char 646 simpleChar = c; 647 break switchOnCharacter; 648 } 649 } 650 continue; 651 652 case '-': 653 654 // Start a range if one isn't already started 655 if (definingRange) 656 { 657 syntaxError("Bad class range"); 658 } 659 definingRange = true; 660 661 // If no last character, start of range is 0 662 rangeStart = (last == CHAR_INVALID ? 0 : last); 663 664 // Premature end of range. define up to Character.MAX_VALUE 665 if ((idx + 1) < len && pattern.charAt(++idx) == ']') 666 { 667 simpleChar = Character.MAX_VALUE; 668 break; 669 } 670 continue; 671 672 default: 673 simpleChar = pattern.charAt(idx++); 674 break; 675 } 676 677 // Handle simple character simpleChar 678 if (definingRange) 679 { 680 // if we are defining a range make it now 681 rangeEnd = simpleChar; 682 683 // Actually create a range if the range is ok 684 if (rangeStart >= rangeEnd) 685 { 686 syntaxError("Bad character class"); 687 } 688 range.include(rangeStart, rangeEnd, include); 689 690 // We are done defining the range 691 last = CHAR_INVALID; 692 definingRange = false; 693 } 694 else 695 { 696 // If simple character and not start of range, include it 697 if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-') 698 { 699 range.include(simpleChar, include); 700 } 701 last = simpleChar; 702 } 703 } 704 705 // Shouldn't be out of input 706 if (idx == len) 707 { 708 syntaxError("Unterminated character class"); 709 } 710 711 // Absorb the ']' end of class marker 712 idx++; 713 714 // Emit character class definition 715 instruction[ret + RE.offsetOpdata] = (char)range.num; 716 for (int i = 0; i < range.num; i++) 717 { 718 emit((char)range.minRange[i]); 719 emit((char)range.maxRange[i]); 720 } 721 return ret; 722 } 723 724 /** 725 * Absorb an atomic character string. This method is a little tricky because 726 * it can un-include the last character of string if a closure operator follows. 727 * This is correct because *+? have higher precedence than concatentation (thus 728 * ABC* means AB(C*) and NOT (ABC)*). 729 * @return Index of new atom node 730 * @exception RESyntaxException Thrown if the regular expression has invalid syntax. 731 */ atom()732 int atom() throws RESyntaxException 733 { 734 // Create a string node 735 int ret = node(RE.OP_ATOM, 0); 736 737 // Length of atom 738 int lenAtom = 0; 739 740 // Loop while we've got input 741 742 atomLoop: 743 744 while (idx < len) 745 { 746 // Is there a next char? 747 if ((idx + 1) < len) 748 { 749 char c = pattern.charAt(idx + 1); 750 751 // If the next 'char' is an escape, look past the whole escape 752 if (pattern.charAt(idx) == '\\') 753 { 754 int idxEscape = idx; 755 escape(); 756 if (idx < len) 757 { 758 c = pattern.charAt(idx); 759 } 760 idx = idxEscape; 761 } 762 763 // Switch on next char 764 switch (c) 765 { 766 case '{': 767 case '?': 768 case '*': 769 case '+': 770 771 // If the next character is a closure operator and our atom is non-empty, the 772 // current character should bind to the closure operator rather than the atom 773 if (lenAtom != 0) 774 { 775 break atomLoop; 776 } 777 } 778 } 779 780 // Switch on current char 781 switch (pattern.charAt(idx)) 782 { 783 case ']': 784 case '^': 785 case '$': 786 case '.': 787 case '[': 788 case '(': 789 case ')': 790 case '|': 791 break atomLoop; 792 793 case '{': 794 case '?': 795 case '*': 796 case '+': 797 798 // We should have an atom by now 799 if (lenAtom == 0) 800 { 801 // No atom before closure 802 syntaxError("Missing operand to closure"); 803 } 804 break atomLoop; 805 806 case '\\': 807 808 { 809 // Get the escaped character (advances input automatically) 810 int idxBeforeEscape = idx; 811 char c = escape(); 812 813 // Check if it's a simple escape (as opposed to, say, a backreference) 814 if ((c & ESC_MASK) == ESC_MASK) 815 { 816 // Not a simple escape, so backup to where we were before the escape. 817 idx = idxBeforeEscape; 818 break atomLoop; 819 } 820 821 // Add escaped char to atom 822 emit(c); 823 lenAtom++; 824 } 825 break; 826 827 default: 828 829 // Add normal character to atom 830 emit(pattern.charAt(idx++)); 831 lenAtom++; 832 break; 833 } 834 } 835 836 // This "shouldn't" happen 837 if (lenAtom == 0) 838 { 839 internalError(); 840 } 841 842 // Emit the atom length into the program 843 instruction[ret + RE.offsetOpdata] = (char)lenAtom; 844 return ret; 845 } 846 847 /** 848 * Match a terminal node. 849 * @param flags Flags 850 * @return Index of terminal node (closeable) 851 * @exception RESyntaxException Thrown if the regular expression has invalid syntax. 852 */ terminal(int[] flags)853 int terminal(int[] flags) throws RESyntaxException 854 { 855 switch (pattern.charAt(idx)) 856 { 857 case RE.OP_EOL: 858 case RE.OP_BOL: 859 case RE.OP_ANY: 860 return node(pattern.charAt(idx++), 0); 861 862 case '[': 863 return characterClass(); 864 865 case '(': 866 return expr(flags); 867 868 case ')': 869 syntaxError("Unexpected close paren"); 870 871 case '|': 872 internalError(); 873 874 case ']': 875 syntaxError("Mismatched class"); 876 877 case 0: 878 syntaxError("Unexpected end of input"); 879 880 case '?': 881 case '+': 882 case '{': 883 case '*': 884 syntaxError("Missing operand to closure"); 885 886 case '\\': 887 { 888 // Don't forget, escape() advances the input stream! 889 int idxBeforeEscape = idx; 890 891 // Switch on escaped character 892 switch (escape()) 893 { 894 case ESC_CLASS: 895 case ESC_COMPLEX: 896 flags[0] &= ~NODE_NULLABLE; 897 return node(RE.OP_ESCAPE, pattern.charAt(idx - 1)); 898 899 case ESC_BACKREF: 900 { 901 char backreference = (char)(pattern.charAt(idx - 1) - '0'); 902 if (parens <= backreference) 903 { 904 syntaxError("Bad backreference"); 905 } 906 flags[0] |= NODE_NULLABLE; 907 return node(RE.OP_BACKREF, backreference); 908 } 909 910 default: 911 912 // We had a simple escape and we want to have it end up in 913 // an atom, so we back up and fall though to the default handling 914 idx = idxBeforeEscape; 915 flags[0] &= ~NODE_NULLABLE; 916 break; 917 } 918 } 919 } 920 921 // Everything above either fails or returns. 922 // If it wasn't one of the above, it must be the start of an atom. 923 flags[0] &= ~NODE_NULLABLE; 924 return atom(); 925 } 926 927 /** 928 * Compile a possibly closured terminal 929 * @param flags Flags passed by reference 930 * @return Index of closured node 931 * @exception RESyntaxException Thrown if the regular expression has invalid syntax. 932 */ closure(int[] flags)933 int closure(int[] flags) throws RESyntaxException 934 { 935 // Before terminal 936 int idxBeforeTerminal = idx; 937 938 // Values to pass by reference to terminal() 939 int[] terminalFlags = { NODE_NORMAL }; 940 941 // Get terminal symbol 942 int ret = terminal(terminalFlags); 943 944 // Or in flags from terminal symbol 945 flags[0] |= terminalFlags[0]; 946 947 // Advance input, set NODE_NULLABLE flag and do sanity checks 948 if (idx >= len) 949 { 950 return ret; 951 } 952 boolean greedy = true; 953 char closureType = pattern.charAt(idx); 954 switch (closureType) 955 { 956 case '?': 957 case '*': 958 959 // The current node can be null 960 flags[0] |= NODE_NULLABLE; 961 962 case '+': 963 964 // Eat closure character 965 idx++; 966 967 case '{': 968 969 // Don't allow blantant stupidity 970 int opcode = instruction[ret + RE.offsetOpcode]; 971 if (opcode == RE.OP_BOL || opcode == RE.OP_EOL) 972 { 973 syntaxError("Bad closure operand"); 974 } 975 if ((terminalFlags[0] & NODE_NULLABLE) != 0) 976 { 977 syntaxError("Closure operand can't be nullable"); 978 } 979 break; 980 } 981 982 // If the next character is a '?', make the closure non-greedy (reluctant) 983 if (idx < len && pattern.charAt(idx) == '?') 984 { 985 idx++; 986 greedy = false; 987 } 988 989 if (greedy) 990 { 991 // Actually do the closure now 992 switch (closureType) 993 { 994 case '{': 995 { 996 // We look for our bracket in the list 997 boolean found = false; 998 int i; 999 allocBrackets(); 1000 for (i = 0; i < brackets; i++) 1001 { 1002 if (bracketStart[i] == idx) 1003 { 1004 found = true; 1005 break; 1006 } 1007 } 1008 1009 // If its not in the list we parse the {m,n} 1010 if (!found) 1011 { 1012 if (brackets >= maxBrackets) 1013 { 1014 syntaxError("Too many bracketed closures (limit is 10)"); 1015 } 1016 bracketStart[brackets] = idx; 1017 bracket(); 1018 bracketEnd[brackets] = idx; 1019 i = brackets++; 1020 } 1021 1022 // If there's a min, rewind stream and reparse 1023 if (--bracketMin[i] > 0) 1024 { 1025 // Rewind stream and run it through again 1026 idx = idxBeforeTerminal; 1027 break; 1028 } 1029 1030 // Do the right thing for maximum ({m,}) 1031 if (bracketOpt[i] == bracketFinished) 1032 { 1033 // Drop through now and closure expression. 1034 // We are done with the {m,} expr, so skip rest 1035 closureType = '*'; 1036 bracketOpt[i] = 0; 1037 idx = bracketEnd[i]; 1038 } 1039 else 1040 if (bracketOpt[i] == bracketUnbounded) 1041 { 1042 idx = idxBeforeTerminal; 1043 bracketOpt[i] = bracketFinished; 1044 break; 1045 } 1046 else 1047 if (bracketOpt[i]-- > 0) 1048 { 1049 // Drop through to optionally close and then 'play it again sam!' 1050 idx = idxBeforeTerminal; 1051 closureType = '?'; 1052 } 1053 else 1054 { 1055 // We are done. skip the rest of {m,n} expr 1056 idx = bracketEnd[i]; 1057 break; 1058 } 1059 } 1060 1061 // Fall through! 1062 1063 case '?': 1064 case '*': 1065 1066 if (!greedy) 1067 { 1068 break; 1069 } 1070 1071 if (closureType == '?') 1072 { 1073 // X? is compiled as (X|) 1074 nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X 1075 setNextOfEnd(ret, node (RE.OP_BRANCH, 0)); // inserted branch to option 1076 int nothing = node (RE.OP_NOTHING, 0); // which is OP_NOTHING 1077 setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING 1078 setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node 1079 } 1080 1081 if (closureType == '*') 1082 { 1083 // X* is compiled as (X{gotoX}|) 1084 nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X 1085 setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option 1086 setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto 1087 setNextOfEnd(ret + RE.nodeSize, ret); // the start again 1088 setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is 1089 setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING 1090 } 1091 break; 1092 1093 case '+': 1094 { 1095 // X+ is compiled as X({gotoX}|) 1096 int branch; 1097 branch = node(RE.OP_BRANCH, 0); // a new branch 1098 setNextOfEnd(ret, branch); // is added to the end of X 1099 setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start 1100 setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option 1101 setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING 1102 } 1103 break; 1104 } 1105 } 1106 else 1107 { 1108 // Add end after closured subexpr 1109 setNextOfEnd(ret, node(RE.OP_END, 0)); 1110 1111 // Actually do the closure now 1112 switch (closureType) 1113 { 1114 case '?': 1115 nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret); 1116 break; 1117 1118 case '*': 1119 nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret); 1120 break; 1121 1122 case '+': 1123 nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret); 1124 break; 1125 } 1126 1127 // Point to the expr after the closure 1128 setNextOfEnd(ret, lenInstruction); 1129 } 1130 return ret; 1131 } 1132 1133 /** 1134 * Compile one branch of an or operator (implements concatenation) 1135 * @param flags Flags passed by reference 1136 * @return Pointer to branch node 1137 * @exception RESyntaxException Thrown if the regular expression has invalid syntax. 1138 */ branch(int[] flags)1139 int branch(int[] flags) throws RESyntaxException 1140 { 1141 // Get each possibly closured piece and concat 1142 int node; 1143 int ret = node(RE.OP_BRANCH, 0); 1144 int chain = -1; 1145 int[] closureFlags = new int[1]; 1146 boolean nullable = true; 1147 while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')') 1148 { 1149 // Get new node 1150 closureFlags[0] = NODE_NORMAL; 1151 node = closure(closureFlags); 1152 if (closureFlags[0] == NODE_NORMAL) 1153 { 1154 nullable = false; 1155 } 1156 1157 // If there's a chain, append to the end 1158 if (chain != -1) 1159 { 1160 setNextOfEnd(chain, node); 1161 } 1162 1163 // Chain starts at current 1164 chain = node; 1165 } 1166 1167 // If we don't run loop, make a nothing node 1168 if (chain == -1) 1169 { 1170 node(RE.OP_NOTHING, 0); 1171 } 1172 1173 // Set nullable flag for this branch 1174 if (nullable) 1175 { 1176 flags[0] |= NODE_NULLABLE; 1177 } 1178 return ret; 1179 } 1180 1181 /** 1182 * Compile an expression with possible parens around it. Paren matching 1183 * is done at this level so we can tie the branch tails together. 1184 * @param flags Flag value passed by reference 1185 * @return Node index of expression in instruction array 1186 * @exception RESyntaxException Thrown if the regular expression has invalid syntax. 1187 */ expr(int[] flags)1188 int expr(int[] flags) throws RESyntaxException 1189 { 1190 // Create open paren node unless we were called from the top level (which has no parens) 1191 boolean paren = false; 1192 int ret = -1; 1193 int closeParens = parens; 1194 if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(') 1195 { 1196 idx++; 1197 paren = true; 1198 ret = node(RE.OP_OPEN, parens++); 1199 } 1200 flags[0] &= ~NODE_TOPLEVEL; 1201 1202 // Create a branch node 1203 int branch = branch(flags); 1204 if (ret == -1) 1205 { 1206 ret = branch; 1207 } 1208 else 1209 { 1210 setNextOfEnd(ret, branch); 1211 } 1212 1213 // Loop through branches 1214 while (idx < len && pattern.charAt(idx) == '|') 1215 { 1216 idx++; 1217 branch = branch(flags); 1218 setNextOfEnd(ret, branch); 1219 } 1220 1221 // Create an ending node (either a close paren or an OP_END) 1222 int end; 1223 if (paren) 1224 { 1225 if (idx < len && pattern.charAt(idx) == ')') 1226 { 1227 idx++; 1228 } 1229 else 1230 { 1231 syntaxError("Missing close paren"); 1232 } 1233 end = node(RE.OP_CLOSE, closeParens); 1234 } 1235 else 1236 { 1237 end = node(RE.OP_END, 0); 1238 } 1239 1240 // Append the ending node to the ret nodelist 1241 setNextOfEnd(ret, end); 1242 1243 // Hook the ends of each branch to the end node 1244 for (int next = -1, i = ret; next != 0; next = instruction[i + RE.offsetNext], i += next) 1245 { 1246 // If branch, make the end of the branch's operand chain point to the end node. 1247 if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH) 1248 { 1249 setNextOfEnd(i + RE.nodeSize, end); 1250 } 1251 } 1252 1253 // Return the node list 1254 return ret; 1255 } 1256 1257 /** 1258 * Compiles a regular expression pattern into a program runnable by the pattern 1259 * matcher class 'RE'. 1260 * @param pattern Regular expression pattern to compile (see RECompiler class 1261 * for details). 1262 * @return A compiled regular expression program. 1263 * @exception RESyntaxException Thrown if the regular expression has invalid syntax. 1264 * @see RECompiler 1265 * @see RE 1266 */ compile(String pattern)1267 public REProgram compile(String pattern) throws RESyntaxException 1268 { 1269 // Initialize variables for compilation 1270 this.pattern = pattern; // Save pattern in instance variable 1271 len = pattern.length(); // Precompute pattern length for speed 1272 idx = 0; // Set parsing index to the first character 1273 lenInstruction = 0; // Set emitted instruction count to zero 1274 parens = 1; // Set paren level to 1 (the implicit outer parens) 1275 brackets = 0; // No bracketed closures yet 1276 1277 // Initialize pass by reference flags value 1278 int[] flags = { NODE_TOPLEVEL }; 1279 1280 // Parse expression 1281 expr(flags); 1282 1283 // Should be at end of input 1284 if (idx != len) 1285 { 1286 if (pattern.charAt(idx) == ')') 1287 { 1288 syntaxError("Unmatched close paren"); 1289 } 1290 syntaxError("Unexpected input remains"); 1291 } 1292 1293 // Return the result 1294 char[] ins = new char[lenInstruction]; 1295 System.arraycopy(instruction, 0, ins, 0, lenInstruction); 1296 return new REProgram(ins); 1297 } 1298 1299 /** 1300 * Local, nested class for maintaining character ranges for character classes. 1301 */ 1302 class RERange 1303 { 1304 int size = 16; // Capacity of current range arrays 1305 int[] minRange = new int[size]; // Range minima 1306 int[] maxRange = new int[size]; // Range maxima 1307 int num = 0; // Number of range array elements in use 1308 1309 /** 1310 * Deletes the range at a given index from the range lists 1311 * @param index Index of range to delete from minRange and maxRange arrays. 1312 */ delete(int index)1313 void delete(int index) 1314 { 1315 // Return if no elements left or index is out of range 1316 if (num == 0 || index >= num) 1317 { 1318 return; 1319 } 1320 1321 // Move elements down 1322 while (index++ < num) 1323 { 1324 if (index - 1 >= 0) 1325 { 1326 minRange[index-1] = minRange[index]; 1327 maxRange[index-1] = maxRange[index]; 1328 } 1329 } 1330 1331 // One less element now 1332 num--; 1333 } 1334 1335 /** 1336 * Merges a range into the range list, coalescing ranges if possible. 1337 * @param min Minimum end of range 1338 * @param max Maximum end of range 1339 */ merge(int min, int max)1340 void merge(int min, int max) 1341 { 1342 // Loop through ranges 1343 for (int i = 0; i < num; i++) 1344 { 1345 // Min-max is subsumed by minRange[i]-maxRange[i] 1346 if (min >= minRange[i] && max <= maxRange[i]) 1347 { 1348 return; 1349 } 1350 1351 // Min-max subsumes minRange[i]-maxRange[i] 1352 else if (min <= minRange[i] && max >= maxRange[i]) 1353 { 1354 delete(i); 1355 merge(min, max); 1356 return; 1357 } 1358 1359 // Min is in the range, but max is outside 1360 else if (min >= minRange[i] && min <= maxRange[i]) 1361 { 1362 delete(i); 1363 min = minRange[i]; 1364 merge(min, max); 1365 return; 1366 } 1367 1368 // Max is in the range, but min is outside 1369 else if (max >= minRange[i] && max <= maxRange[i]) 1370 { 1371 delete(i); 1372 max = maxRange[i]; 1373 merge(min, max); 1374 return; 1375 } 1376 } 1377 1378 // Must not overlap any other ranges 1379 if (num >= size) 1380 { 1381 size *= 2; 1382 int[] newMin = new int[size]; 1383 int[] newMax = new int[size]; 1384 System.arraycopy(minRange, 0, newMin, 0, num); 1385 System.arraycopy(maxRange, 0, newMax, 0, num); 1386 minRange = newMin; 1387 maxRange = newMax; 1388 } 1389 minRange[num] = min; 1390 maxRange[num] = max; 1391 num++; 1392 } 1393 1394 /** 1395 * Removes a range by deleting or shrinking all other ranges 1396 * @param min Minimum end of range 1397 * @param max Maximum end of range 1398 */ remove(int min, int max)1399 void remove(int min, int max) 1400 { 1401 // Loop through ranges 1402 for (int i = 0; i < num; i++) 1403 { 1404 // minRange[i]-maxRange[i] is subsumed by min-max 1405 if (minRange[i] >= min && maxRange[i] <= max) 1406 { 1407 delete(i); 1408 i--; 1409 return; 1410 } 1411 1412 // min-max is subsumed by minRange[i]-maxRange[i] 1413 else if (min >= minRange[i] && max <= maxRange[i]) 1414 { 1415 int minr = minRange[i]; 1416 int maxr = maxRange[i]; 1417 delete(i); 1418 if (minr < min - 1) 1419 { 1420 merge(minr, min - 1); 1421 } 1422 if (max + 1 < maxr) 1423 { 1424 merge(max + 1, maxr); 1425 } 1426 return; 1427 } 1428 1429 // minRange is in the range, but maxRange is outside 1430 else if (minRange[i] >= min && minRange[i] <= max) 1431 { 1432 minRange[i] = max + 1; 1433 return; 1434 } 1435 1436 // maxRange is in the range, but minRange is outside 1437 else if (maxRange[i] >= min && maxRange[i] <= max) 1438 { 1439 maxRange[i] = min - 1; 1440 return; 1441 } 1442 } 1443 } 1444 1445 /** 1446 * Includes (or excludes) the range from min to max, inclusive. 1447 * @param min Minimum end of range 1448 * @param max Maximum end of range 1449 * @param include True if range should be included. False otherwise. 1450 */ include(int min, int max, boolean include)1451 void include(int min, int max, boolean include) 1452 { 1453 if (include) 1454 { 1455 merge(min, max); 1456 } 1457 else 1458 { 1459 remove(min, max); 1460 } 1461 } 1462 1463 /** 1464 * Includes a range with the same min and max 1465 * @param minmax Minimum and maximum end of range (inclusive) 1466 * @param include True if range should be included. False otherwise. 1467 */ include(char minmax, boolean include)1468 void include(char minmax, boolean include) 1469 { 1470 include(minmax, minmax, include); 1471 } 1472 } 1473 } 1474