1 /********************************************
2 rexp2.c
3 copyright 2009-2016,2017, Thomas E. Dickey
4 copyright 2010, Jonathan Nieder
5 copyright 1991-1992,1993, Michael D. Brennan
6 
7 This is a source file for mawk, an implementation of
8 the AWK programming language.
9 
10 Mawk is distributed without warranty under the terms of
11 the GNU General Public License, version 2, 1991.
12 ********************************************/
13 
14 /*
15  * $MawkId: rexp2.c,v 1.26 2017/10/17 01:19:15 tom Exp $
16  */
17 
18 /*  test a string against a machine   */
19 
20 #include "rexp.h"
21 
22 #define	 STACKGROWTH	16
23 
24 RT_STATE *RE_run_stack_base;
25 RT_STATE *RE_run_stack_limit;
26 
27 /* Large model DOS segment arithemetic breaks the current stack.
28    This hack fixes it without rewriting the whole thing, 5/31/91 */
29 RT_STATE *RE_run_stack_empty;
30 
31 RT_POS_ENTRY *RE_pos_stack_base;
32 RT_POS_ENTRY *RE_pos_stack_limit;
33 RT_POS_ENTRY *RE_pos_stack_empty;
34 
35 void
RE_run_stack_init(void)36 RE_run_stack_init(void)
37 {
38     if (!RE_run_stack_base) {
39 	RE_run_stack_base = (RT_STATE *)
40 	    RE_malloc(sizeof(RT_STATE) * STACKGROWTH);
41 	RE_run_stack_limit = RE_run_stack_base + STACKGROWTH;
42 	RE_run_stack_empty = RE_run_stack_base - 1;
43     }
44 }
45 
46 void
RE_pos_stack_init(void)47 RE_pos_stack_init(void)
48 {
49     if (!RE_pos_stack_base) {
50 	RE_pos_stack_base = (RT_POS_ENTRY *)
51 	    RE_malloc(sizeof(RT_POS_ENTRY) * STACKGROWTH);
52 	RE_pos_stack_limit = RE_pos_stack_base + STACKGROWTH;
53 	RE_pos_stack_empty = RE_pos_stack_base;
54 
55 	RE_pos_stack_base->pos = NULL;	/* RE_pos_peek(RE_pos_stack_empty) */
56 	RE_pos_stack_base->owner = -1;	/* avoid popping base */
57 	RE_pos_stack_base->prev_offset = 0;	/* avoid popping below base */
58     }
59 }
60 
61 /* sometimes during REmatch(), this stack can grow pretty large.
62    In real life cases, the back tracking usually fails. Some
63    work is needed here to improve the algorithm.
64    I.e., figure out how not to stack useless paths.
65 */
66 
67 RT_STATE *
RE_new_run_stack(void)68 RE_new_run_stack(void)
69 {
70     size_t oldsize = (size_t) (RE_run_stack_limit - RE_run_stack_base);
71     size_t newsize = oldsize + STACKGROWTH;
72 
73 #ifdef	LMDOS			/* large model DOS */
74     /* have to worry about overflow on multiplication (ugh) */
75     if (newsize >= 4096)
76 	RE_run_stack_base = (RT_STATE *) 0;
77     else
78 #endif
79 
80 	RE_run_stack_base = (RT_STATE *) realloc(RE_run_stack_base,
81 						 newsize * sizeof(RT_STATE));
82 
83     if (!RE_run_stack_base) {
84 	fprintf(stderr, "out of memory for RE run time stack\n");
85 	/* this is pretty unusual, I've only seen it happen on
86 	   weird input to REmatch() under 16bit DOS , the same
87 	   situation worked easily on 32bit machine.  */
88 	mawk_exit(100);
89     }
90 
91     RE_run_stack_limit = RE_run_stack_base + newsize;
92     RE_run_stack_empty = RE_run_stack_base - 1;
93 
94     /* return the new stackp */
95     return RE_run_stack_base + oldsize;
96 }
97 
98 RT_POS_ENTRY *
RE_new_pos_stack(void)99 RE_new_pos_stack(void)
100 {
101     size_t oldsize = (size_t) (RE_pos_stack_limit - RE_pos_stack_base);
102     size_t newsize = oldsize + STACKGROWTH;
103 
104     /* FIXME: handle overflow on multiplication for large model DOS
105      * (see RE_new_run_stack()).
106      */
107     RE_pos_stack_base = (RT_POS_ENTRY *)
108 	realloc(RE_pos_stack_base, newsize * sizeof(RT_POS_ENTRY));
109 
110     if (!RE_pos_stack_base) {
111 	fprintf(stderr, "out of memory for RE string position stack\n");
112 	mawk_exit(100);
113     }
114 
115     RE_pos_stack_limit = RE_pos_stack_base + newsize;
116     RE_pos_stack_empty = RE_pos_stack_base;
117 
118     /* return the new stackp */
119     return RE_pos_stack_base + oldsize;
120 }
121 
122 #ifdef	DEBUG
123 static RT_STATE *
slow_push(RT_STATE * sp,STATE * m,char * s,RT_POS_ENTRY * pos_top,int u)124 slow_push(
125 	     RT_STATE * sp,
126 	     STATE * m,
127 	     char *s,
128 	     RT_POS_ENTRY * pos_top,
129 	     int u)
130 {
131     if (sp == RE_run_stack_limit)
132 	sp = RE_new_run_stack();
133     sp->m = m;
134     sp->s = s;
135     sp->u = u;
136     sp->sp = pos_top - RE_pos_stack_base;
137     sp->tp = pos_top->prev_offset;
138     return sp;
139 }
140 #endif
141 
142 #ifdef	 DEBUG
143 #define	 push(mx,sx,px,ux) do { \
144 		stackp = slow_push(++stackp, mx, sx, px, ux); \
145 	} while(0)
146 #else
147 #define	 push(mx,sx,px,ux) do { \
148 		if (++stackp == RE_run_stack_limit) \
149 			stackp = RE_new_run_stack(); \
150 		stackp->m = (mx); \
151 		stackp->s = (sx); \
152 		stackp->u = (ux); \
153 		stackp->sp = (int) ((px) - RE_pos_stack_base); \
154 		stackp->tp = (px)->prev_offset; \
155 	} while(0)
156 #endif
157 
158 #define	CASE_UANY(x) case (x)+U_OFF:  /* FALLTHRU */ case (x)+U_ON
159 
160 /*
161  * test if str ~ /machine/
162  */
163 int
REtest(char * str,size_t len,PTR machine)164 REtest(char *str,		/* string to test */
165        size_t len,		/* ...its length */
166        PTR machine)		/* compiled regular-expression */
167 {
168     register STATE *m = (STATE *) machine;
169     char *s = str;
170     register RT_STATE *stackp;
171     int u_flag;
172     char *str_end = str + len;
173     RT_POS_ENTRY *sp;
174     int t;			/*convenient temps */
175     STATE *tm;
176 
177     /* handle the easy case quickly */
178     if (m->s_type == M_STR && (m + 1)->s_type == M_ACCEPT) {
179 	return str_str(s, len, m->s_data.str, (size_t) m->s_len) != (char *) 0;
180     } else {
181 	u_flag = U_ON;
182 	stackp = RE_run_stack_empty;
183 	sp = RE_pos_stack_empty;
184 	RE_CASE();
185     }
186 
187   refill:
188     if (stackp == RE_run_stack_empty) {
189 	return 0;
190     }
191     m = stackp->m;
192     s = stackp->s;
193     sp = RE_pos_stack_base + stackp->sp;
194     sp->prev_offset = stackp->tp;
195     u_flag = (stackp--)->u;
196 
197   reswitch:
198 
199     switch (m->s_type + u_flag) {
200     case M_STR + U_OFF + END_OFF:
201 	if (strncmp(s, m->s_data.str, (size_t) m->s_len)) {
202 	    RE_FILL();
203 	}
204 	s += m->s_len;
205 	m++;
206 	RE_CASE();
207 
208     case M_STR + U_OFF + END_ON:
209 	if (strcmp(s, m->s_data.str)) {
210 	    RE_FILL();
211 	}
212 	s += m->s_len;
213 	m++;
214 	RE_CASE();
215 
216     case M_STR + U_ON + END_OFF:
217 	if (!(s = str_str(s, (size_t) (str_end - s), m->s_data.str, (size_t) m->s_len))) {
218 	    RE_FILL();
219 	}
220 	push(m, s + 1, sp, U_ON);
221 	s += m->s_len;
222 	m++;
223 	u_flag = U_OFF;
224 	RE_CASE();
225 
226     case M_STR + U_ON + END_ON:
227 	t = (int) (str_end - s) - (int) m->s_len;
228 	if (t < 0 || memcmp(s + t, m->s_data.str, (size_t) m->s_len)) {
229 	    RE_FILL();
230 	}
231 	s = str_end;
232 	m++;
233 	u_flag = U_OFF;
234 	RE_CASE();
235 
236     case M_CLASS + U_OFF + END_OFF:
237 	if (s >= str_end || !ison(*m->s_data.bvp, s[0])) {
238 	    RE_FILL();
239 	}
240 	s++;
241 	m++;
242 	RE_CASE();
243 
244     case M_CLASS + U_OFF + END_ON:
245 	if (s >= str_end) {
246 	    RE_FILL();
247 	}
248 	if ((s + 1) < str_end || !ison(*m->s_data.bvp, s[0])) {
249 	    RE_FILL();
250 	}
251 	s++;
252 	m++;
253 	RE_CASE();
254 
255     case M_CLASS + U_ON + END_OFF:
256 	for (;;) {
257 	    if (s >= str_end) {
258 		RE_FILL();
259 	    } else if (ison(*m->s_data.bvp, s[0])) {
260 		break;
261 	    }
262 	    s++;
263 	}
264 	s++;
265 	push(m, s, sp, U_ON);
266 	m++;
267 	u_flag = U_OFF;
268 	RE_CASE();
269 
270     case M_CLASS + U_ON + END_ON:
271 	if (s >= str_end || !ison(*m->s_data.bvp, str_end[-1])) {
272 	    RE_FILL();
273 	}
274 	s = str_end;
275 	m++;
276 	u_flag = U_OFF;
277 	RE_CASE();
278 
279     case M_ANY + U_OFF + END_OFF:
280 	if (s >= str_end) {
281 	    RE_FILL();
282 	}
283 	s++;
284 	m++;
285 	RE_CASE();
286 
287     case M_ANY + U_OFF + END_ON:
288 	if (s >= str_end || (s + 1) < str_end) {
289 	    RE_FILL();
290 	}
291 	s++;
292 	m++;
293 	RE_CASE();
294 
295     case M_ANY + U_ON + END_OFF:
296 	if (s >= str_end) {
297 	    RE_FILL();
298 	}
299 	s++;
300 	push(m, s, sp, U_ON);
301 	m++;
302 	u_flag = U_OFF;
303 	RE_CASE();
304 
305     case M_ANY + U_ON + END_ON:
306 	if (s >= str_end) {
307 	    RE_FILL();
308 	}
309 	s = str_end;
310 	m++;
311 	u_flag = U_OFF;
312 	RE_CASE();
313 
314     case M_START + U_OFF + END_OFF:
315     case M_START + U_ON + END_OFF:
316 	if (s != str) {
317 	    RE_FILL();
318 	}
319 	m++;
320 	u_flag = U_OFF;
321 	RE_CASE();
322 
323     case M_START + U_OFF + END_ON:
324     case M_START + U_ON + END_ON:
325 	if (s != str || s < str_end) {
326 	    RE_FILL();
327 	}
328 	m++;
329 	u_flag = U_OFF;
330 	RE_CASE();
331 
332     case M_END + U_OFF:
333 	if (s < str_end) {
334 	    RE_FILL();
335 	}
336 	m++;
337 	RE_CASE();
338 
339     case M_END + U_ON:
340 	s += strlen(s);
341 	m++;
342 	u_flag = U_OFF;
343 	RE_CASE();
344 
345       CASE_UANY(M_U):
346 	u_flag = U_ON;
347 	m++;
348 	RE_CASE();
349 
350       CASE_UANY(M_1J):
351 	m += m->s_data.jump;
352 	RE_CASE();
353 
354       CASE_UANY(M_SAVE_POS):	/* save position for a later M_2JC */
355 	sp = RE_pos_push(sp, stackp, s);
356 	m++;
357 	RE_CASE();
358 
359       CASE_UANY(M_2JA):	/* take the non jump branch */
360 	/* don't stack an ACCEPT */
361 	if ((tm = m + m->s_data.jump)->s_type == M_ACCEPT) {
362 	    return 1;
363 	}
364 	push(tm, s, sp, u_flag);
365 	m++;
366 	RE_CASE();
367 
368       CASE_UANY(M_2JC):	/* take the jump branch if position changed */
369 	if (RE_pos_pop(&sp, stackp) == s) {
370 	    /* did not advance: do not jump back */
371 	    m++;
372 	    RE_CASE();
373 	}
374 	/* FALLTHRU */
375 
376       CASE_UANY(M_2JB):	/* take the jump branch */
377 	/* don't stack an ACCEPT */
378 	if ((tm = m + 1)->s_type == M_ACCEPT) {
379 	    return 1;
380 	}
381 	push(tm, s, sp, u_flag);
382 	m += m->s_data.jump;
383 	RE_CASE();
384 
385       CASE_UANY(M_ACCEPT):
386 	return 1;
387 
388     default:
389 	RE_panic("unexpected case in REtest");
390     }
391 }
392 
393 #undef push
394