1 /********************************************
2 rexp2.c
3 copyright 2009-2016,2017, Thomas E. Dickey
4 copyright 2010, Jonathan Nieder
5 copyright 1991-1992,1993, Michael D. Brennan
6
7 This is a source file for mawk, an implementation of
8 the AWK programming language.
9
10 Mawk is distributed without warranty under the terms of
11 the GNU General Public License, version 2, 1991.
12 ********************************************/
13
14 /*
15 * $MawkId: rexp2.c,v 1.26 2017/10/17 01:19:15 tom Exp $
16 */
17
18 /* test a string against a machine */
19
20 #include "rexp.h"
21
22 #define STACKGROWTH 16
23
24 RT_STATE *RE_run_stack_base;
25 RT_STATE *RE_run_stack_limit;
26
27 /* Large model DOS segment arithemetic breaks the current stack.
28 This hack fixes it without rewriting the whole thing, 5/31/91 */
29 RT_STATE *RE_run_stack_empty;
30
31 RT_POS_ENTRY *RE_pos_stack_base;
32 RT_POS_ENTRY *RE_pos_stack_limit;
33 RT_POS_ENTRY *RE_pos_stack_empty;
34
35 void
RE_run_stack_init(void)36 RE_run_stack_init(void)
37 {
38 if (!RE_run_stack_base) {
39 RE_run_stack_base = (RT_STATE *)
40 RE_malloc(sizeof(RT_STATE) * STACKGROWTH);
41 RE_run_stack_limit = RE_run_stack_base + STACKGROWTH;
42 RE_run_stack_empty = RE_run_stack_base - 1;
43 }
44 }
45
46 void
RE_pos_stack_init(void)47 RE_pos_stack_init(void)
48 {
49 if (!RE_pos_stack_base) {
50 RE_pos_stack_base = (RT_POS_ENTRY *)
51 RE_malloc(sizeof(RT_POS_ENTRY) * STACKGROWTH);
52 RE_pos_stack_limit = RE_pos_stack_base + STACKGROWTH;
53 RE_pos_stack_empty = RE_pos_stack_base;
54
55 RE_pos_stack_base->pos = NULL; /* RE_pos_peek(RE_pos_stack_empty) */
56 RE_pos_stack_base->owner = -1; /* avoid popping base */
57 RE_pos_stack_base->prev_offset = 0; /* avoid popping below base */
58 }
59 }
60
61 /* sometimes during REmatch(), this stack can grow pretty large.
62 In real life cases, the back tracking usually fails. Some
63 work is needed here to improve the algorithm.
64 I.e., figure out how not to stack useless paths.
65 */
66
67 RT_STATE *
RE_new_run_stack(void)68 RE_new_run_stack(void)
69 {
70 size_t oldsize = (size_t) (RE_run_stack_limit - RE_run_stack_base);
71 size_t newsize = oldsize + STACKGROWTH;
72
73 #ifdef LMDOS /* large model DOS */
74 /* have to worry about overflow on multiplication (ugh) */
75 if (newsize >= 4096)
76 RE_run_stack_base = (RT_STATE *) 0;
77 else
78 #endif
79
80 RE_run_stack_base = (RT_STATE *) realloc(RE_run_stack_base,
81 newsize * sizeof(RT_STATE));
82
83 if (!RE_run_stack_base) {
84 fprintf(stderr, "out of memory for RE run time stack\n");
85 /* this is pretty unusual, I've only seen it happen on
86 weird input to REmatch() under 16bit DOS , the same
87 situation worked easily on 32bit machine. */
88 mawk_exit(100);
89 }
90
91 RE_run_stack_limit = RE_run_stack_base + newsize;
92 RE_run_stack_empty = RE_run_stack_base - 1;
93
94 /* return the new stackp */
95 return RE_run_stack_base + oldsize;
96 }
97
98 RT_POS_ENTRY *
RE_new_pos_stack(void)99 RE_new_pos_stack(void)
100 {
101 size_t oldsize = (size_t) (RE_pos_stack_limit - RE_pos_stack_base);
102 size_t newsize = oldsize + STACKGROWTH;
103
104 /* FIXME: handle overflow on multiplication for large model DOS
105 * (see RE_new_run_stack()).
106 */
107 RE_pos_stack_base = (RT_POS_ENTRY *)
108 realloc(RE_pos_stack_base, newsize * sizeof(RT_POS_ENTRY));
109
110 if (!RE_pos_stack_base) {
111 fprintf(stderr, "out of memory for RE string position stack\n");
112 mawk_exit(100);
113 }
114
115 RE_pos_stack_limit = RE_pos_stack_base + newsize;
116 RE_pos_stack_empty = RE_pos_stack_base;
117
118 /* return the new stackp */
119 return RE_pos_stack_base + oldsize;
120 }
121
122 #ifdef DEBUG
123 static RT_STATE *
slow_push(RT_STATE * sp,STATE * m,char * s,RT_POS_ENTRY * pos_top,int u)124 slow_push(
125 RT_STATE * sp,
126 STATE * m,
127 char *s,
128 RT_POS_ENTRY * pos_top,
129 int u)
130 {
131 if (sp == RE_run_stack_limit)
132 sp = RE_new_run_stack();
133 sp->m = m;
134 sp->s = s;
135 sp->u = u;
136 sp->sp = pos_top - RE_pos_stack_base;
137 sp->tp = pos_top->prev_offset;
138 return sp;
139 }
140 #endif
141
142 #ifdef DEBUG
143 #define push(mx,sx,px,ux) do { \
144 stackp = slow_push(++stackp, mx, sx, px, ux); \
145 } while(0)
146 #else
147 #define push(mx,sx,px,ux) do { \
148 if (++stackp == RE_run_stack_limit) \
149 stackp = RE_new_run_stack(); \
150 stackp->m = (mx); \
151 stackp->s = (sx); \
152 stackp->u = (ux); \
153 stackp->sp = (int) ((px) - RE_pos_stack_base); \
154 stackp->tp = (px)->prev_offset; \
155 } while(0)
156 #endif
157
158 #define CASE_UANY(x) case (x)+U_OFF: /* FALLTHRU */ case (x)+U_ON
159
160 /*
161 * test if str ~ /machine/
162 */
163 int
REtest(char * str,size_t len,PTR machine)164 REtest(char *str, /* string to test */
165 size_t len, /* ...its length */
166 PTR machine) /* compiled regular-expression */
167 {
168 register STATE *m = (STATE *) machine;
169 char *s = str;
170 register RT_STATE *stackp;
171 int u_flag;
172 char *str_end = str + len;
173 RT_POS_ENTRY *sp;
174 int t; /*convenient temps */
175 STATE *tm;
176
177 /* handle the easy case quickly */
178 if (m->s_type == M_STR && (m + 1)->s_type == M_ACCEPT) {
179 return str_str(s, len, m->s_data.str, (size_t) m->s_len) != (char *) 0;
180 } else {
181 u_flag = U_ON;
182 stackp = RE_run_stack_empty;
183 sp = RE_pos_stack_empty;
184 RE_CASE();
185 }
186
187 refill:
188 if (stackp == RE_run_stack_empty) {
189 return 0;
190 }
191 m = stackp->m;
192 s = stackp->s;
193 sp = RE_pos_stack_base + stackp->sp;
194 sp->prev_offset = stackp->tp;
195 u_flag = (stackp--)->u;
196
197 reswitch:
198
199 switch (m->s_type + u_flag) {
200 case M_STR + U_OFF + END_OFF:
201 if (strncmp(s, m->s_data.str, (size_t) m->s_len)) {
202 RE_FILL();
203 }
204 s += m->s_len;
205 m++;
206 RE_CASE();
207
208 case M_STR + U_OFF + END_ON:
209 if (strcmp(s, m->s_data.str)) {
210 RE_FILL();
211 }
212 s += m->s_len;
213 m++;
214 RE_CASE();
215
216 case M_STR + U_ON + END_OFF:
217 if (!(s = str_str(s, (size_t) (str_end - s), m->s_data.str, (size_t) m->s_len))) {
218 RE_FILL();
219 }
220 push(m, s + 1, sp, U_ON);
221 s += m->s_len;
222 m++;
223 u_flag = U_OFF;
224 RE_CASE();
225
226 case M_STR + U_ON + END_ON:
227 t = (int) (str_end - s) - (int) m->s_len;
228 if (t < 0 || memcmp(s + t, m->s_data.str, (size_t) m->s_len)) {
229 RE_FILL();
230 }
231 s = str_end;
232 m++;
233 u_flag = U_OFF;
234 RE_CASE();
235
236 case M_CLASS + U_OFF + END_OFF:
237 if (s >= str_end || !ison(*m->s_data.bvp, s[0])) {
238 RE_FILL();
239 }
240 s++;
241 m++;
242 RE_CASE();
243
244 case M_CLASS + U_OFF + END_ON:
245 if (s >= str_end) {
246 RE_FILL();
247 }
248 if ((s + 1) < str_end || !ison(*m->s_data.bvp, s[0])) {
249 RE_FILL();
250 }
251 s++;
252 m++;
253 RE_CASE();
254
255 case M_CLASS + U_ON + END_OFF:
256 for (;;) {
257 if (s >= str_end) {
258 RE_FILL();
259 } else if (ison(*m->s_data.bvp, s[0])) {
260 break;
261 }
262 s++;
263 }
264 s++;
265 push(m, s, sp, U_ON);
266 m++;
267 u_flag = U_OFF;
268 RE_CASE();
269
270 case M_CLASS + U_ON + END_ON:
271 if (s >= str_end || !ison(*m->s_data.bvp, str_end[-1])) {
272 RE_FILL();
273 }
274 s = str_end;
275 m++;
276 u_flag = U_OFF;
277 RE_CASE();
278
279 case M_ANY + U_OFF + END_OFF:
280 if (s >= str_end) {
281 RE_FILL();
282 }
283 s++;
284 m++;
285 RE_CASE();
286
287 case M_ANY + U_OFF + END_ON:
288 if (s >= str_end || (s + 1) < str_end) {
289 RE_FILL();
290 }
291 s++;
292 m++;
293 RE_CASE();
294
295 case M_ANY + U_ON + END_OFF:
296 if (s >= str_end) {
297 RE_FILL();
298 }
299 s++;
300 push(m, s, sp, U_ON);
301 m++;
302 u_flag = U_OFF;
303 RE_CASE();
304
305 case M_ANY + U_ON + END_ON:
306 if (s >= str_end) {
307 RE_FILL();
308 }
309 s = str_end;
310 m++;
311 u_flag = U_OFF;
312 RE_CASE();
313
314 case M_START + U_OFF + END_OFF:
315 case M_START + U_ON + END_OFF:
316 if (s != str) {
317 RE_FILL();
318 }
319 m++;
320 u_flag = U_OFF;
321 RE_CASE();
322
323 case M_START + U_OFF + END_ON:
324 case M_START + U_ON + END_ON:
325 if (s != str || s < str_end) {
326 RE_FILL();
327 }
328 m++;
329 u_flag = U_OFF;
330 RE_CASE();
331
332 case M_END + U_OFF:
333 if (s < str_end) {
334 RE_FILL();
335 }
336 m++;
337 RE_CASE();
338
339 case M_END + U_ON:
340 s += strlen(s);
341 m++;
342 u_flag = U_OFF;
343 RE_CASE();
344
345 CASE_UANY(M_U):
346 u_flag = U_ON;
347 m++;
348 RE_CASE();
349
350 CASE_UANY(M_1J):
351 m += m->s_data.jump;
352 RE_CASE();
353
354 CASE_UANY(M_SAVE_POS): /* save position for a later M_2JC */
355 sp = RE_pos_push(sp, stackp, s);
356 m++;
357 RE_CASE();
358
359 CASE_UANY(M_2JA): /* take the non jump branch */
360 /* don't stack an ACCEPT */
361 if ((tm = m + m->s_data.jump)->s_type == M_ACCEPT) {
362 return 1;
363 }
364 push(tm, s, sp, u_flag);
365 m++;
366 RE_CASE();
367
368 CASE_UANY(M_2JC): /* take the jump branch if position changed */
369 if (RE_pos_pop(&sp, stackp) == s) {
370 /* did not advance: do not jump back */
371 m++;
372 RE_CASE();
373 }
374 /* FALLTHRU */
375
376 CASE_UANY(M_2JB): /* take the jump branch */
377 /* don't stack an ACCEPT */
378 if ((tm = m + 1)->s_type == M_ACCEPT) {
379 return 1;
380 }
381 push(tm, s, sp, u_flag);
382 m += m->s_data.jump;
383 RE_CASE();
384
385 CASE_UANY(M_ACCEPT):
386 return 1;
387
388 default:
389 RE_panic("unexpected case in REtest");
390 }
391 }
392
393 #undef push
394