1 
2 /* The INDIRECT_TO_PROGRAM mode can be useful for debugging
3    in 3m to ensure that a regexp "program" is not misinterpreted
4    as a pointer by the conservative collector. (This was a problem
5    once, anyway.) */
6 /* #define INDIRECT_TO_PROGRAM */
7 
8 struct Regwork;
9 
10 typedef int (*Scheme_Regexp_Matcher)(struct Regwork *rw);
11 
12 typedef struct regexp {
13   Scheme_Type type;
14   MZ_HASH_KEY_EX
15   Scheme_Object *source;
16   intptr_t nsubexp, ncounter, maxlookback;
17   intptr_t regsize;
18   short flags;
19   unsigned char *regstart;      /* Infor about required starting bytes */
20   intptr_t regmust;                 /* pointer relative to self to required starting string */
21   intptr_t regmlen;			/* length of the string at regmust */
22 #ifdef INDIRECT_TO_PROGRAM
23   char *program;
24 #else
25   char program[1];		/* Unwarranted chumminess with compiler. */
26 #endif
27 } regexp;
28 
29 #define REGEXP_IS_UTF8    0x01
30 #define REGEXP_IS_PCRE    0x02
31 #define REGEXP_ANCH       0x04
32 #define REGEXP_MUST_CI    0x08
33 
34 #ifdef INDIRECT_TO_PROGRAM
35 # define N_ITO_DELTA(prog, extra, re) extra
36 # define N_ITO_SPACE(v) 0
37 # define ITO(x, y) x
38 #else
39 # define N_ITO_DELTA(prog, extra, re) ((prog+extra) - re)
40 # define N_ITO_SPACE(v) v
41 # define ITO(x, y) y
42 #endif
43 
44 /* 156 is octal 234: */
45 #define MAGIC   156
46 
47 /*
48  * The "internal use only" fields in regexp.h are present to pass info from
49  * compile to execute that permits the execute phase to run lots faster on
50  * simple cases.  They are:
51  *
52  * regstart	bitmap for chars that must begin a match; NULL if none obvious
53  * reganch	is the match anchored (at beginning-of-line only)?
54  * regmust	string (pointer into program) that match must include, or NULL
55  * regmlen	length of regmust string
56  *
57  * Regstart and reganch permit very fast decisions on suitable starting points
58  * for a match, cutting down the work a lot.  Regmust permits fast rejection
59  * of lines that cannot possibly match.  The regmust tests are costly enough
60  * that regcomp() supplies a regmust only if the r.e. contains something
61  * potentially expensive (at present, the only such thing detected is * or +
62  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
63  * supplied because the test in regexec() needs it and regcomp() is computing
64  * it anyway.
65  */
66 
67 /*
68  * Structure for regexp "program".  This is essentially a linear encoding
69  * of a nondeterministic finite-state machine (aka syntax charts or
70  * "railroad normal form" in parsing technology).  Each node is an opcode
71  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
72  * all nodes except BRANCH implement concatenation; a "next" pointer with
73  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
74  * have one of the subtle syntax dependencies:  an individual BRANCH (as
75  * opposed to a collection of them) is never concatenated with anything
76  * because of operator precedence.)  The operand of some types of node is
77  * a literal string; for others, it is a node leading into a sub-FSM.  In
78  * particular, the operand of a BRANCH node is the first node of the branch.
79  * (NB this is *not* a tree structure:  the tail of the branch connects
80  * to the thing following the set of BRANCHes.)  The opcodes are:
81  */
82 
83 /* definition     number  opnd?   meaning */
84 #define END       0       /* no   End of program. */
85 #define BOI       1       /* no   Match "" at beginning of input. */
86 #define EOI       2       /* no   Match "" at end of input. */
87 #define ANY       3       /* no   Match any one character. */
88 #define ANYL      4       /* no   Anything but a linefeed */
89 #define ANYOF     5       /* bitmap  Match any character in the bitmap. */
90 #define EXACTLY1  6       /* byte Match the character. */
91 #define RANGE     7       /* byte,byte  Match any character in this range. */
92 #define NOTRANGE  8       /* byte,byte  Match any character not in this range. */
93 #define BRANCH    9       /* node Match this alternative, or the next... */
94 #define BACK      10      /* no   Match "", "next" ptr points backward. */
95 #define EXACTLY   11      /* str  Match this string. */
96 #define EXACTLY_CI 12     /* str  Match this string. */
97 #define NOTHING   13      /* no   Match empty string. */
98 #define STAR      14      /* node Match this (simple) thing 0 or more times. */
99 #define PLUS      15      /* node Match this (simple) thing 1 or more times. */
100 #define STAR2     16      /* non-greedy star. */
101 #define PLUS2     17      /* non-greedy plus. */
102 #define STAR3     18      /* 2 nos  Like STAR, but with numeric quantifiers */
103 #define STAR4     19      /* 2 nos  non-greedy STAR3 */
104 #define OPENN     20      /* like OPEN, but with an n >= 50, or n == 0 means (?:...) */
105 #define CLOSEN    21      /* like CLOSE, but with an n >= 50 */
106 #define LOOKT     22      /* (?=...) or (?<=...)*/
107 #define LOOKF     23      /* (?!...) or (?<!...) */
108 #define LOOKTX    24      /* (?>...) */
109 #define LOOKBT    25      /* (?<=...)*/
110 #define LOOKBF    26      /* (?<!...) */
111 #define LOOKE     27      /* ender for LOOK* */
112 #define BACKREF   28      /* \<n>, to match exactly the result for cluster <n> */
113 #define BACKREF_CI 29      /* case-insensitive version */
114 #define COUNTINIT 30
115 #define COUNTOVER 31
116 #define COUNTUNDER 32
117 #define COUNTBACK 33
118 #define COUNTBACKFAIL 34
119 #define SAVECONST 35      /* no no   Save position and count */
120 #define MAYBECONST 36      /* no no   Save position and count */
121 #define WORDBOUND 37
122 #define NOTWORDBOUND 38
123 #define BOL       39      /* no   Match "" at beginning of line. */
124 #define EOL       40      /* no   Match "" at end of line. */
125 #define UNIPROP   41
126 #define CONDITIONAL 42
127 #define EXACTLY2  43      /* byte,byte  Match either byte (useful for some CI cases) */
128 #define OPEN      44      /* no   Mark this point in input as start of #n. */
129 /*      OPEN+1 is number 1, etc. */
130 #define CLOSE     78      /* no   Analogous to OPEN. */
131 
132 # define OPSTR(o) (o + 2)
133 # define OPSTRx(o) (o + 1)
134 # define OPLEN(o, regstr) ((int)(((unsigned char *)regstr)[o] << 8) | (((unsigned char *)regstr)[o+1]))
135 # define OPRNGS(o, regstr) ((int)(((unsigned char *)regstr)[o]))
136 
137 /*
138  * Opcode notes:
139  *
140  * BRANCH	The set of branches constituting a single choice are hooked
141  *		together with their "next" pointers, since precedence prevents
142  *		anything being concatenated to any individual branch.  The
143  *		"next" pointer of the last BRANCH in a choice points to the
144  *		thing following the whole choice.  This is also where the
145  *		final "next" pointer of each individual branch points; each
146  *		branch starts with the operand node of a BRANCH node.
147  *
148  * BACK		Normal "next" pointers all implicitly point forward; BACK
149  *		exists to make loop structures possible.
150  *
151  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
152  *		BRANCH structures using BACK.  Simple cases (one character
153  *		per match) are implemented with STAR and PLUS for speed
154  *		and to minimize recursive plunges.
155  *
156  * OPEN,CLOSE	...are numbered at compile time.
157  */
158 
159 /*
160  * A node is one char of opcode followed by two chars of "next" pointer.
161  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
162  * value is a positive offset from the opcode of the node containing it.
163  * An operand, if any, simply follows the node.  (Note that much of the
164  * code generation knows about this implicit relationship.)
165  *
166  * Using two bytes for the "next" pointer is vast overkill for most things,
167  * but allows patterns to get big without disasters.
168  */
169 #define	OP(p, regstr)	(regstr[p])
170 #define	NEXT(p, regstr)	(((((unsigned char *)regstr)[(p)+1]&255)<<8) + (((unsigned char *)regstr)[(p)+2]&255))
171 #define	OPERAND(p)	((p) + 3)
172 #define	OPERAND2(p)	((p) + 5)
173 #define	OPERAND3(p)	((p) + 7)
174 #define	OPERAND4(p)	((p) + 9)
175 
176 /*
177  * See regmagic.h for one further detail of program structure.
178  */
179 
180 
181 /*
182  * Utility definitions.
183  */
184 #define	UCHAR(v) ((unsigned char)(v))
185 
186 #define	ISMULT(c, parse_flags)  ((c) == '*' || (c) == '+' || (c) == '?' || ((parse_flags & PARSE_PCRE) && ((c) == '{')))
187 #define	META	   "^$.[()|?+*\\"
188 #define	PCRE_META  "^$.[()|?+*\\{}]"
189 
190 /*
191  * Flags to be passed up and down.
192  */
193 #define	HASWIDTH       0x01	/* Known never to match null string. */
194 #define	SIMPLE	       0x02	/* Simple enough to be STAR/PLUS operand. */
195 #define	SPSTART	       0x04	/* Starts with * or +. */
196 #define	SPFIXED	       0x08	/* Always matches a particular length */
197 #define NEEDSAVECONST  0x10     /* Fixed-size thing inside (), lift out save in case of repeat. */
198 #define SPNOTHING      0x20     /* Unconditionally matches nothing. */
199 #define	WORST		0	/* Worst case. */
200 
201 /* Parser flags: */
202 #define PARSE_CASE_SENS    0x1
203 #define PARSE_PCRE         0x2
204 #define PARSE_SINGLE_LINE  0x4
205 
206 #define rx_tolower(c) (((c >= 'A') && (c <= 'Z')) ? (c + ('a' - 'A')) : c)
207 #define rx_toupper(c) (((c >= 'a') && (c <= 'z')) ? (c - ('a' - 'A')) : c)
208 #define rx_isword(c) (((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) || ((c >= '0') && (c <= '9')) || (c == '_'))
209 
210 /*
211  * Work variables for regtry().
212  */
213 typedef struct Regwork {
214   MZTAG_IF_REQUIRED
215   char *str;              /* copy of regstr; used only to protect before thread swaps */
216   char *instr;
217   Scheme_Object *port;
218   Scheme_Object *unless_evt;
219   char nonblock, aborted;
220   rxpos instr_size;       /* For port reads */
221   rxpos input_maxend;     /* For port reads */
222   rxpos input, input_end, input_start; /* String-input pointer. */
223   rxpos input_min;        /* input_start minus prefix_size */
224   rxpos boi;	          /* Beginning of input, for ^ check. */
225   rxpos *startp;	  /* Pointer to startp array. */
226   rxpos *maybep;	  /* Pointer to tentative startp array. */
227   rxpos *endp;		  /* Ditto for endp. */
228   int *counters;          /* For {} counters */
229   Scheme_Object *peekskip;
230   char *prefix;
231   rxpos prefix_len, prefix_delta;
232   struct rx_lazy_str_t *lazy_string;
233 
234   int non_tail, rewind_stack_size, rewind_stack_count, rewind_stack_prompt;
235   rxpos *rewind_stack;
236 } Regwork;
237 
238 typedef struct rx_lazy_str_t {
239   MZTAG_IF_REQUIRED
240   intptr_t start, done, end, blen;
241   mzchar *chars;
242   char *s;
243 } rx_lazy_str_t;
244