1 /********************************************
2 re_cmpl.c
3 copyright 2008-2014,2016, Thomas E. Dickey
4 copyright 1991-1994,2014, Michael D. Brennan
5 
6 This is a source file for mawk, an implementation of
7 the AWK programming language.
8 
9 Mawk is distributed without warranty under the terms of
10 the GNU General Public License, version 2, 1991.
11 ********************************************/
12 
13 /*
14  * $MawkId: re_cmpl.c,v 1.30 2016/09/30 13:47:45 tom Exp $
15  */
16 
17 /*  re_cmpl.c  */
18 
19 #include "mawk.h"
20 #include "memory.h"
21 #include "scan.h"
22 #include "regexp.h"
23 #include "repl.h"
24 
25 typedef struct re_node {
26     RE_DATA re;			/* keep this first, for re_destroy() */
27     STRING *sval;
28     struct re_node *link;
29 } RE_NODE;
30 
31 /* a list of compiled regular expressions */
32 static RE_NODE *re_list;
33 
34 static const char efmt[] = "regular expression compile failed (%s)\n%s";
35 
36 /* compile a STRING to a regular expression machine.
37    Search a list of pre-compiled strings first
38 */
39 PTR
re_compile(STRING * sval)40 re_compile(STRING * sval)
41 {
42     register RE_NODE *p;
43     RE_NODE *q;
44     char *s;
45 
46     /* search list */
47     s = sval->str;
48     p = re_list;
49     q = (RE_NODE *) 0;
50 
51     while (p) {
52 	if (sval->len == p->sval->len
53 	    && memcmp(s, p->sval->str, sval->len) == 0) {
54 	    /* found */
55 	    if (!q)		/* already at front */
56 		goto _return;
57 	    else {		/* delete from list for move to front */
58 		q->link = p->link;
59 		goto found;
60 	    }
61 	} else {
62 	    q = p;
63 	    p = p->link;
64 	}
65     }
66 
67     /* not found */
68     p = ZMALLOC(RE_NODE);
69     p->sval = sval;
70 
71     sval->ref_cnt++;
72     p->re.anchored = (*s == '^');
73     p->re.is_empty = (sval->len == 0);
74     if (!(p->re.compiled = REcompile(s, sval->len))) {
75 	ZFREE(p);
76 	sval->ref_cnt--;
77 	if (mawk_state == EXECUTION)
78 	    rt_error(efmt, REerror(), s);
79 	else {			/* compiling */
80 	    compile_error(efmt, REerror(), s);
81 	    return (PTR) 0;
82 	}
83     }
84 
85   found:
86     /* insert p at the front of the list */
87     p->link = re_list;
88     re_list = p;
89 
90   _return:
91 
92 #ifdef DEBUG
93     if (dump_RE)
94 	REmprint(p->re.compiled, stderr);
95 #endif
96     return refRE_DATA(p->re);
97 }
98 
99 /* this is only used by da() */
100 
101 char *
re_uncompile(PTR m)102 re_uncompile(PTR m)
103 {
104     register RE_NODE *p;
105 
106     for (p = re_list; p; p = p->link)
107 	if (p->re.compiled == cast_to_re(m))
108 	    return p->sval->str;
109 #ifdef DEBUG
110     bozo("non compiled machine");
111 #else
112     return NULL;
113 #endif
114 }
115 
116 #ifdef NO_LEAKS
117 void
re_destroy(PTR m)118 re_destroy(PTR m)
119 {
120     RE_NODE *p = (RE_NODE *) m;
121     RE_NODE *q;
122     RE_NODE *r;
123 
124     if (p != 0) {
125 	for (q = re_list, r = 0; q != 0; r = q, q = q->link) {
126 	    if (q == p) {
127 		free_STRING(p->sval);
128 		REdestroy(p->re.compiled);
129 		if (r != 0)
130 		    r->link = q->link;
131 		else
132 		    re_list = q->link;
133 		free(q);
134 		break;
135 	    }
136 	}
137     }
138 }
139 #endif
140 
141 /*=================================================*/
142 /*  replacement	 operations   */
143 
144 /* create a replacement CELL from a STRING *  */
145 
146 /* Here the replacement string gets scanned for &
147  * which calls for using the matched text in the replacement
148  * So to get a literal & in the replacement requires a convention
149  * which is \& is literal & not a matched text &.
150  * Here are the Posix rules which this code supports:
151            \&   -->   &
152 	   \\   -->   \
153 	   \c   -->   \c
154 	   &    -->   matched text
155 
156 */
157 
158 /* FIXME  -- this function doesn't handle embedded nulls
159    split_buff[] and MAX_SPLIT are obsolete, but needed by this
160    function.  Putting
161    them here is temporary until the rewrite to handle nulls.
162 */
163 
164 #define MAX_SPLIT  256		/* handle up to 256 &'s as matched text */
165 static STRING *split_buff[MAX_SPLIT];
166 
167 static CELL *
REPL_compile(STRING * sval)168 REPL_compile(STRING * sval)
169 {
170     VCount count = 0;
171     register char *p = sval->str;
172     register char *q;
173     char *xbuff;
174     CELL *cp;
175 
176     q = xbuff = (char *) zmalloc(sval->len + 1);
177 
178     while (1) {
179 	switch (*p) {
180 	case 0:
181 	    *q = 0;
182 	    goto done;
183 
184 	case '\\':
185 	    if (p[1] == '&' || p[1] == '\\') {
186 		*q++ = p[1];
187 		p += 2;
188 		continue;
189 	    } else
190 		break;
191 
192 	case '&':
193 	    /* if empty we don't need to make a node */
194 	    if (q != xbuff) {
195 		*q = 0;
196 		split_buff[count++] = new_STRING(xbuff);
197 	    }
198 	    /* and a null node for the '&'  */
199 	    split_buff[count++] = (STRING *) 0;
200 	    /*  reset  */
201 	    p++;
202 	    q = xbuff;
203 	    continue;
204 
205 	default:
206 	    break;
207 	}
208 
209 	*q++ = *p++;
210     }
211 
212   done:
213     /* if we have one empty string it will get made now */
214     if (q > xbuff || count == 0)
215 	split_buff[count++] = new_STRING(xbuff);
216 
217     /* This will never happen */
218     if (count > MAX_SPLIT)
219 	overflow("replacement pieces", MAX_SPLIT);
220 
221     cp = ZMALLOC(CELL);
222     if (count == 1 && split_buff[0]) {
223 	cp->type = C_REPL;
224 	cp->ptr = (PTR) split_buff[0];
225 	USED_SPLIT_BUFF(0);
226     } else {
227 	STRING **sp = (STRING **)
228 	(cp->ptr = zmalloc(sizeof(STRING *) * count));
229 	VCount j = 0;
230 
231 	while (j < count) {
232 	    *sp++ = split_buff[j++];
233 	    USED_SPLIT_BUFF(j - 1);
234 	}
235 
236 	cp->type = C_REPLV;
237 	cp->vcnt = count;
238     }
239     zfree(xbuff, sval->len + 1);
240     return cp;
241 }
242 
243 /* free memory used by a replacement CELL  */
244 
245 void
repl_destroy(CELL * cp)246 repl_destroy(CELL *cp)
247 {
248     register STRING **p;
249     VCount cnt;
250 
251     if (cp->type == C_REPL) {
252 	free_STRING(string(cp));
253     } else if (cp->ptr != 0) {	/* an C_REPLV           */
254 	p = (STRING **) cp->ptr;
255 	for (cnt = cp->vcnt; cnt; cnt--) {
256 	    if (*p) {
257 		free_STRING(*p);
258 	    }
259 	    p++;
260 	}
261 	zfree(cp->ptr, cp->vcnt * sizeof(STRING *));
262     }
263 }
264 
265 /* copy a C_REPLV cell to another CELL */
266 
267 CELL *
replv_cpy(CELL * target,CELL * source)268 replv_cpy(CELL *target, CELL *source)
269 {
270     STRING **t, **s;
271     VCount cnt;
272 
273     target->type = C_REPLV;
274     cnt = target->vcnt = source->vcnt;
275     target->ptr = (PTR) zmalloc(cnt * sizeof(STRING *));
276 
277     t = (STRING **) target->ptr;
278     s = (STRING **) source->ptr;
279     while (cnt) {
280 	cnt--;
281 	if (*s)
282 	    (*s)->ref_cnt++;
283 	*t++ = *s++;
284     }
285     return target;
286 }
287 
288 /* here's our old friend linked linear list with move to the front
289    for compilation of replacement CELLs	 */
290 
291 typedef struct repl_node {
292     struct repl_node *link;
293     STRING *sval;		/* the input */
294     CELL *cp;			/* the output */
295 } REPL_NODE;
296 
297 static REPL_NODE *repl_list;
298 
299 /* search the list (with move to the front) for a compiled
300    separator.
301    return a ptr to a CELL (C_REPL or C_REPLV)
302 */
303 
304 CELL *
repl_compile(STRING * sval)305 repl_compile(STRING * sval)
306 {
307     register REPL_NODE *p;
308     REPL_NODE *q;
309     char *s;
310 
311     /* search the list */
312     s = sval->str;
313     p = repl_list;
314     q = (REPL_NODE *) 0;
315     while (p) {
316 	if (strcmp(s, p->sval->str) == 0)	/* found */
317 	{
318 	    if (!q)		/* already at front */
319 		return p->cp;
320 	    else {		/* delete from list for move to front */
321 		q->link = p->link;
322 		goto found;
323 	    }
324 
325 	} else {
326 	    q = p;
327 	    p = p->link;
328 	}
329     }
330 
331     /* not found */
332     p = ZMALLOC(REPL_NODE);
333     p->sval = sval;
334     sval->ref_cnt++;
335     p->cp = REPL_compile(sval);
336 
337   found:
338 /* insert p at the front of the list */
339     p->link = repl_list;
340     repl_list = p;
341     return p->cp;
342 }
343 
344 /* return the string for a CELL or type REPL or REPLV,
345    this is only used by da()  */
346 
347 char *
repl_uncompile(CELL * cp)348 repl_uncompile(CELL *cp)
349 {
350     register REPL_NODE *p = repl_list;
351 
352     if (cp->type == C_REPL) {
353 	while (p) {
354 	    if (p->cp->type == C_REPL && p->cp->ptr == cp->ptr)
355 		return p->sval->str;
356 	    else
357 		p = p->link;
358 	}
359     } else {
360 	while (p) {
361 	    if (p->cp->type == C_REPLV &&
362 		memcmp(cp->ptr, p->cp->ptr, cp->vcnt * sizeof(STRING *))
363 		== 0)
364 		return p->sval->str;
365 	    else
366 		p = p->link;
367 	}
368     }
369 
370 #ifdef DEBUG
371     bozo("unable to uncompile an repl");
372 #else
373     return NULL;
374 #endif
375 }
376 
377 /*
378   convert a C_REPLV to	C_REPL
379      replacing the &s with sval
380 */
381 
382 CELL *
replv_to_repl(CELL * cp,STRING * sval)383 replv_to_repl(CELL *cp, STRING * sval)
384 {
385     register STRING **p;
386     STRING **sblock = (STRING **) cp->ptr;
387     unsigned cnt, vcnt = cp->vcnt;
388     size_t len;
389     char *target;
390 
391 #ifdef DEBUG
392     if (cp->type != C_REPLV)
393 	bozo("not replv");
394 #endif
395 
396     p = sblock;
397     cnt = vcnt;
398     len = 0;
399     while (cnt--) {
400 	if (*p)
401 	    len += (*p++)->len;
402 	else {
403 	    *p++ = sval;
404 	    sval->ref_cnt++;
405 	    len += sval->len;
406 	}
407     }
408     cp->type = C_REPL;
409     cp->ptr = (PTR) new_STRING0(len);
410 
411     p = sblock;
412     cnt = vcnt;
413     target = string(cp)->str;
414     while (cnt--) {
415 	memcpy(target, (*p)->str, (*p)->len);
416 	target += (*p)->len;
417 	free_STRING(*p);
418 	p++;
419     }
420 
421     zfree(sblock, vcnt * sizeof(STRING *));
422     return cp;
423 }
424 
425 #ifdef NO_LEAKS
426 typedef struct _all_ptrs {
427     struct _all_ptrs *next;
428     PTR m;
429 } ALL_PTRS;
430 
431 static ALL_PTRS *all_ptrs;
432 /*
433  * Some regular expressions are parsed, and the pointer stored in the byte-code
434  * where we cannot distinguish it from other constants.  Keep a list here, to
435  * free on exit for auditing.
436  */
437 void
no_leaks_re_ptr(PTR m)438 no_leaks_re_ptr(PTR m)
439 {
440     ALL_PTRS *p = calloc(1, sizeof(ALL_PTRS));
441     p->next = all_ptrs;
442     p->m = m;
443     all_ptrs = p;
444 }
445 
446 void
re_leaks(void)447 re_leaks(void)
448 {
449     while (all_ptrs != 0) {
450 	ALL_PTRS *next = all_ptrs->next;
451 	re_destroy(all_ptrs->m);
452 	free(all_ptrs);
453 	all_ptrs = next;
454     }
455 
456     while (repl_list != 0) {
457 	REPL_NODE *p = repl_list->link;
458 	free_STRING(repl_list->sval);
459 	free_cell_data(repl_list->cp);
460 	ZFREE(repl_list);
461 	repl_list = p;
462     }
463 }
464 #endif
465