1 2 /* The INDIRECT_TO_PROGRAM mode can be useful for debugging 3 in 3m to ensure that a regexp "program" is not misinterpreted 4 as a pointer by the conservative collector. (This was a problem 5 once, anyway.) */ 6 /* #define INDIRECT_TO_PROGRAM */ 7 8 struct Regwork; 9 10 typedef int (*Scheme_Regexp_Matcher)(struct Regwork *rw); 11 12 typedef struct regexp { 13 Scheme_Type type; 14 MZ_HASH_KEY_EX 15 Scheme_Object *source; 16 intptr_t nsubexp, ncounter, maxlookback; 17 intptr_t regsize; 18 short flags; 19 unsigned char *regstart; /* Infor about required starting bytes */ 20 intptr_t regmust; /* pointer relative to self to required starting string */ 21 intptr_t regmlen; /* length of the string at regmust */ 22 #ifdef INDIRECT_TO_PROGRAM 23 char *program; 24 #else 25 char program[1]; /* Unwarranted chumminess with compiler. */ 26 #endif 27 } regexp; 28 29 #define REGEXP_IS_UTF8 0x01 30 #define REGEXP_IS_PCRE 0x02 31 #define REGEXP_ANCH 0x04 32 #define REGEXP_MUST_CI 0x08 33 34 #ifdef INDIRECT_TO_PROGRAM 35 # define N_ITO_DELTA(prog, extra, re) extra 36 # define N_ITO_SPACE(v) 0 37 # define ITO(x, y) x 38 #else 39 # define N_ITO_DELTA(prog, extra, re) ((prog+extra) - re) 40 # define N_ITO_SPACE(v) v 41 # define ITO(x, y) y 42 #endif 43 44 /* 156 is octal 234: */ 45 #define MAGIC 156 46 47 /* 48 * The "internal use only" fields in regexp.h are present to pass info from 49 * compile to execute that permits the execute phase to run lots faster on 50 * simple cases. They are: 51 * 52 * regstart bitmap for chars that must begin a match; NULL if none obvious 53 * reganch is the match anchored (at beginning-of-line only)? 54 * regmust string (pointer into program) that match must include, or NULL 55 * regmlen length of regmust string 56 * 57 * Regstart and reganch permit very fast decisions on suitable starting points 58 * for a match, cutting down the work a lot. Regmust permits fast rejection 59 * of lines that cannot possibly match. The regmust tests are costly enough 60 * that regcomp() supplies a regmust only if the r.e. contains something 61 * potentially expensive (at present, the only such thing detected is * or + 62 * at the start of the r.e., which can involve a lot of backup). Regmlen is 63 * supplied because the test in regexec() needs it and regcomp() is computing 64 * it anyway. 65 */ 66 67 /* 68 * Structure for regexp "program". This is essentially a linear encoding 69 * of a nondeterministic finite-state machine (aka syntax charts or 70 * "railroad normal form" in parsing technology). Each node is an opcode 71 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 72 * all nodes except BRANCH implement concatenation; a "next" pointer with 73 * a BRANCH on both ends of it is connecting two alternatives. (Here we 74 * have one of the subtle syntax dependencies: an individual BRANCH (as 75 * opposed to a collection of them) is never concatenated with anything 76 * because of operator precedence.) The operand of some types of node is 77 * a literal string; for others, it is a node leading into a sub-FSM. In 78 * particular, the operand of a BRANCH node is the first node of the branch. 79 * (NB this is *not* a tree structure: the tail of the branch connects 80 * to the thing following the set of BRANCHes.) The opcodes are: 81 */ 82 83 /* definition number opnd? meaning */ 84 #define END 0 /* no End of program. */ 85 #define BOI 1 /* no Match "" at beginning of input. */ 86 #define EOI 2 /* no Match "" at end of input. */ 87 #define ANY 3 /* no Match any one character. */ 88 #define ANYL 4 /* no Anything but a linefeed */ 89 #define ANYOF 5 /* bitmap Match any character in the bitmap. */ 90 #define EXACTLY1 6 /* byte Match the character. */ 91 #define RANGE 7 /* byte,byte Match any character in this range. */ 92 #define NOTRANGE 8 /* byte,byte Match any character not in this range. */ 93 #define BRANCH 9 /* node Match this alternative, or the next... */ 94 #define BACK 10 /* no Match "", "next" ptr points backward. */ 95 #define EXACTLY 11 /* str Match this string. */ 96 #define EXACTLY_CI 12 /* str Match this string. */ 97 #define NOTHING 13 /* no Match empty string. */ 98 #define STAR 14 /* node Match this (simple) thing 0 or more times. */ 99 #define PLUS 15 /* node Match this (simple) thing 1 or more times. */ 100 #define STAR2 16 /* non-greedy star. */ 101 #define PLUS2 17 /* non-greedy plus. */ 102 #define STAR3 18 /* 2 nos Like STAR, but with numeric quantifiers */ 103 #define STAR4 19 /* 2 nos non-greedy STAR3 */ 104 #define OPENN 20 /* like OPEN, but with an n >= 50, or n == 0 means (?:...) */ 105 #define CLOSEN 21 /* like CLOSE, but with an n >= 50 */ 106 #define LOOKT 22 /* (?=...) or (?<=...)*/ 107 #define LOOKF 23 /* (?!...) or (?<!...) */ 108 #define LOOKTX 24 /* (?>...) */ 109 #define LOOKBT 25 /* (?<=...)*/ 110 #define LOOKBF 26 /* (?<!...) */ 111 #define LOOKE 27 /* ender for LOOK* */ 112 #define BACKREF 28 /* \<n>, to match exactly the result for cluster <n> */ 113 #define BACKREF_CI 29 /* case-insensitive version */ 114 #define COUNTINIT 30 115 #define COUNTOVER 31 116 #define COUNTUNDER 32 117 #define COUNTBACK 33 118 #define COUNTBACKFAIL 34 119 #define SAVECONST 35 /* no no Save position and count */ 120 #define MAYBECONST 36 /* no no Save position and count */ 121 #define WORDBOUND 37 122 #define NOTWORDBOUND 38 123 #define BOL 39 /* no Match "" at beginning of line. */ 124 #define EOL 40 /* no Match "" at end of line. */ 125 #define UNIPROP 41 126 #define CONDITIONAL 42 127 #define EXACTLY2 43 /* byte,byte Match either byte (useful for some CI cases) */ 128 #define OPEN 44 /* no Mark this point in input as start of #n. */ 129 /* OPEN+1 is number 1, etc. */ 130 #define CLOSE 78 /* no Analogous to OPEN. */ 131 132 # define OPSTR(o) (o + 2) 133 # define OPSTRx(o) (o + 1) 134 # define OPLEN(o, regstr) ((int)(((unsigned char *)regstr)[o] << 8) | (((unsigned char *)regstr)[o+1])) 135 # define OPRNGS(o, regstr) ((int)(((unsigned char *)regstr)[o])) 136 137 /* 138 * Opcode notes: 139 * 140 * BRANCH The set of branches constituting a single choice are hooked 141 * together with their "next" pointers, since precedence prevents 142 * anything being concatenated to any individual branch. The 143 * "next" pointer of the last BRANCH in a choice points to the 144 * thing following the whole choice. This is also where the 145 * final "next" pointer of each individual branch points; each 146 * branch starts with the operand node of a BRANCH node. 147 * 148 * BACK Normal "next" pointers all implicitly point forward; BACK 149 * exists to make loop structures possible. 150 * 151 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 152 * BRANCH structures using BACK. Simple cases (one character 153 * per match) are implemented with STAR and PLUS for speed 154 * and to minimize recursive plunges. 155 * 156 * OPEN,CLOSE ...are numbered at compile time. 157 */ 158 159 /* 160 * A node is one char of opcode followed by two chars of "next" pointer. 161 * "Next" pointers are stored as two 8-bit pieces, high order first. The 162 * value is a positive offset from the opcode of the node containing it. 163 * An operand, if any, simply follows the node. (Note that much of the 164 * code generation knows about this implicit relationship.) 165 * 166 * Using two bytes for the "next" pointer is vast overkill for most things, 167 * but allows patterns to get big without disasters. 168 */ 169 #define OP(p, regstr) (regstr[p]) 170 #define NEXT(p, regstr) (((((unsigned char *)regstr)[(p)+1]&255)<<8) + (((unsigned char *)regstr)[(p)+2]&255)) 171 #define OPERAND(p) ((p) + 3) 172 #define OPERAND2(p) ((p) + 5) 173 #define OPERAND3(p) ((p) + 7) 174 #define OPERAND4(p) ((p) + 9) 175 176 /* 177 * See regmagic.h for one further detail of program structure. 178 */ 179 180 181 /* 182 * Utility definitions. 183 */ 184 #define UCHAR(v) ((unsigned char)(v)) 185 186 #define ISMULT(c, parse_flags) ((c) == '*' || (c) == '+' || (c) == '?' || ((parse_flags & PARSE_PCRE) && ((c) == '{'))) 187 #define META "^$.[()|?+*\\" 188 #define PCRE_META "^$.[()|?+*\\{}]" 189 190 /* 191 * Flags to be passed up and down. 192 */ 193 #define HASWIDTH 0x01 /* Known never to match null string. */ 194 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */ 195 #define SPSTART 0x04 /* Starts with * or +. */ 196 #define SPFIXED 0x08 /* Always matches a particular length */ 197 #define NEEDSAVECONST 0x10 /* Fixed-size thing inside (), lift out save in case of repeat. */ 198 #define SPNOTHING 0x20 /* Unconditionally matches nothing. */ 199 #define WORST 0 /* Worst case. */ 200 201 /* Parser flags: */ 202 #define PARSE_CASE_SENS 0x1 203 #define PARSE_PCRE 0x2 204 #define PARSE_SINGLE_LINE 0x4 205 206 #define rx_tolower(c) (((c >= 'A') && (c <= 'Z')) ? (c + ('a' - 'A')) : c) 207 #define rx_toupper(c) (((c >= 'a') && (c <= 'z')) ? (c - ('a' - 'A')) : c) 208 #define rx_isword(c) (((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) || ((c >= '0') && (c <= '9')) || (c == '_')) 209 210 /* 211 * Work variables for regtry(). 212 */ 213 typedef struct Regwork { 214 MZTAG_IF_REQUIRED 215 char *str; /* copy of regstr; used only to protect before thread swaps */ 216 char *instr; 217 Scheme_Object *port; 218 Scheme_Object *unless_evt; 219 char nonblock, aborted; 220 rxpos instr_size; /* For port reads */ 221 rxpos input_maxend; /* For port reads */ 222 rxpos input, input_end, input_start; /* String-input pointer. */ 223 rxpos input_min; /* input_start minus prefix_size */ 224 rxpos boi; /* Beginning of input, for ^ check. */ 225 rxpos *startp; /* Pointer to startp array. */ 226 rxpos *maybep; /* Pointer to tentative startp array. */ 227 rxpos *endp; /* Ditto for endp. */ 228 int *counters; /* For {} counters */ 229 Scheme_Object *peekskip; 230 char *prefix; 231 rxpos prefix_len, prefix_delta; 232 struct rx_lazy_str_t *lazy_string; 233 234 int non_tail, rewind_stack_size, rewind_stack_count, rewind_stack_prompt; 235 rxpos *rewind_stack; 236 } Regwork; 237 238 typedef struct rx_lazy_str_t { 239 MZTAG_IF_REQUIRED 240 intptr_t start, done, end, blen; 241 mzchar *chars; 242 char *s; 243 } rx_lazy_str_t; 244