1 /********************************************
2 re_cmpl.c
3 copyright 2008-2014,2016, Thomas E. Dickey
4 copyright 1991-1994,2014, Michael D. Brennan
5
6 This is a source file for mawk, an implementation of
7 the AWK programming language.
8
9 Mawk is distributed without warranty under the terms of
10 the GNU General Public License, version 2, 1991.
11 ********************************************/
12
13 /*
14 * $MawkId: re_cmpl.c,v 1.30 2016/09/30 13:47:45 tom Exp $
15 */
16
17 /* re_cmpl.c */
18
19 #include "mawk.h"
20 #include "memory.h"
21 #include "scan.h"
22 #include "regexp.h"
23 #include "repl.h"
24
25 typedef struct re_node {
26 RE_DATA re; /* keep this first, for re_destroy() */
27 STRING *sval;
28 struct re_node *link;
29 } RE_NODE;
30
31 /* a list of compiled regular expressions */
32 static RE_NODE *re_list;
33
34 static const char efmt[] = "regular expression compile failed (%s)\n%s";
35
36 /* compile a STRING to a regular expression machine.
37 Search a list of pre-compiled strings first
38 */
39 PTR
re_compile(STRING * sval)40 re_compile(STRING * sval)
41 {
42 register RE_NODE *p;
43 RE_NODE *q;
44 char *s;
45
46 /* search list */
47 s = sval->str;
48 p = re_list;
49 q = (RE_NODE *) 0;
50
51 while (p) {
52 if (sval->len == p->sval->len
53 && memcmp(s, p->sval->str, sval->len) == 0) {
54 /* found */
55 if (!q) /* already at front */
56 goto _return;
57 else { /* delete from list for move to front */
58 q->link = p->link;
59 goto found;
60 }
61 } else {
62 q = p;
63 p = p->link;
64 }
65 }
66
67 /* not found */
68 p = ZMALLOC(RE_NODE);
69 p->sval = sval;
70
71 sval->ref_cnt++;
72 p->re.anchored = (*s == '^');
73 p->re.is_empty = (sval->len == 0);
74 if (!(p->re.compiled = REcompile(s, sval->len))) {
75 ZFREE(p);
76 sval->ref_cnt--;
77 if (mawk_state == EXECUTION)
78 rt_error(efmt, REerror(), s);
79 else { /* compiling */
80 compile_error(efmt, REerror(), s);
81 return (PTR) 0;
82 }
83 }
84
85 found:
86 /* insert p at the front of the list */
87 p->link = re_list;
88 re_list = p;
89
90 _return:
91
92 #ifdef DEBUG
93 if (dump_RE)
94 REmprint(p->re.compiled, stderr);
95 #endif
96 return refRE_DATA(p->re);
97 }
98
99 /* this is only used by da() */
100
101 char *
re_uncompile(PTR m)102 re_uncompile(PTR m)
103 {
104 register RE_NODE *p;
105
106 for (p = re_list; p; p = p->link)
107 if (p->re.compiled == cast_to_re(m))
108 return p->sval->str;
109 #ifdef DEBUG
110 bozo("non compiled machine");
111 #else
112 return NULL;
113 #endif
114 }
115
116 #ifdef NO_LEAKS
117 void
re_destroy(PTR m)118 re_destroy(PTR m)
119 {
120 RE_NODE *p = (RE_NODE *) m;
121 RE_NODE *q;
122 RE_NODE *r;
123
124 if (p != 0) {
125 for (q = re_list, r = 0; q != 0; r = q, q = q->link) {
126 if (q == p) {
127 free_STRING(p->sval);
128 REdestroy(p->re.compiled);
129 if (r != 0)
130 r->link = q->link;
131 else
132 re_list = q->link;
133 free(q);
134 break;
135 }
136 }
137 }
138 }
139 #endif
140
141 /*=================================================*/
142 /* replacement operations */
143
144 /* create a replacement CELL from a STRING * */
145
146 /* Here the replacement string gets scanned for &
147 * which calls for using the matched text in the replacement
148 * So to get a literal & in the replacement requires a convention
149 * which is \& is literal & not a matched text &.
150 * Here are the Posix rules which this code supports:
151 \& --> &
152 \\ --> \
153 \c --> \c
154 & --> matched text
155
156 */
157
158 /* FIXME -- this function doesn't handle embedded nulls
159 split_buff[] and MAX_SPLIT are obsolete, but needed by this
160 function. Putting
161 them here is temporary until the rewrite to handle nulls.
162 */
163
164 #define MAX_SPLIT 256 /* handle up to 256 &'s as matched text */
165 static STRING *split_buff[MAX_SPLIT];
166
167 static CELL *
REPL_compile(STRING * sval)168 REPL_compile(STRING * sval)
169 {
170 VCount count = 0;
171 register char *p = sval->str;
172 register char *q;
173 char *xbuff;
174 CELL *cp;
175
176 q = xbuff = (char *) zmalloc(sval->len + 1);
177
178 while (1) {
179 switch (*p) {
180 case 0:
181 *q = 0;
182 goto done;
183
184 case '\\':
185 if (p[1] == '&' || p[1] == '\\') {
186 *q++ = p[1];
187 p += 2;
188 continue;
189 } else
190 break;
191
192 case '&':
193 /* if empty we don't need to make a node */
194 if (q != xbuff) {
195 *q = 0;
196 split_buff[count++] = new_STRING(xbuff);
197 }
198 /* and a null node for the '&' */
199 split_buff[count++] = (STRING *) 0;
200 /* reset */
201 p++;
202 q = xbuff;
203 continue;
204
205 default:
206 break;
207 }
208
209 *q++ = *p++;
210 }
211
212 done:
213 /* if we have one empty string it will get made now */
214 if (q > xbuff || count == 0)
215 split_buff[count++] = new_STRING(xbuff);
216
217 /* This will never happen */
218 if (count > MAX_SPLIT)
219 overflow("replacement pieces", MAX_SPLIT);
220
221 cp = ZMALLOC(CELL);
222 if (count == 1 && split_buff[0]) {
223 cp->type = C_REPL;
224 cp->ptr = (PTR) split_buff[0];
225 USED_SPLIT_BUFF(0);
226 } else {
227 STRING **sp = (STRING **)
228 (cp->ptr = zmalloc(sizeof(STRING *) * count));
229 VCount j = 0;
230
231 while (j < count) {
232 *sp++ = split_buff[j++];
233 USED_SPLIT_BUFF(j - 1);
234 }
235
236 cp->type = C_REPLV;
237 cp->vcnt = count;
238 }
239 zfree(xbuff, sval->len + 1);
240 return cp;
241 }
242
243 /* free memory used by a replacement CELL */
244
245 void
repl_destroy(CELL * cp)246 repl_destroy(CELL *cp)
247 {
248 register STRING **p;
249 VCount cnt;
250
251 if (cp->type == C_REPL) {
252 free_STRING(string(cp));
253 } else if (cp->ptr != 0) { /* an C_REPLV */
254 p = (STRING **) cp->ptr;
255 for (cnt = cp->vcnt; cnt; cnt--) {
256 if (*p) {
257 free_STRING(*p);
258 }
259 p++;
260 }
261 zfree(cp->ptr, cp->vcnt * sizeof(STRING *));
262 }
263 }
264
265 /* copy a C_REPLV cell to another CELL */
266
267 CELL *
replv_cpy(CELL * target,CELL * source)268 replv_cpy(CELL *target, CELL *source)
269 {
270 STRING **t, **s;
271 VCount cnt;
272
273 target->type = C_REPLV;
274 cnt = target->vcnt = source->vcnt;
275 target->ptr = (PTR) zmalloc(cnt * sizeof(STRING *));
276
277 t = (STRING **) target->ptr;
278 s = (STRING **) source->ptr;
279 while (cnt) {
280 cnt--;
281 if (*s)
282 (*s)->ref_cnt++;
283 *t++ = *s++;
284 }
285 return target;
286 }
287
288 /* here's our old friend linked linear list with move to the front
289 for compilation of replacement CELLs */
290
291 typedef struct repl_node {
292 struct repl_node *link;
293 STRING *sval; /* the input */
294 CELL *cp; /* the output */
295 } REPL_NODE;
296
297 static REPL_NODE *repl_list;
298
299 /* search the list (with move to the front) for a compiled
300 separator.
301 return a ptr to a CELL (C_REPL or C_REPLV)
302 */
303
304 CELL *
repl_compile(STRING * sval)305 repl_compile(STRING * sval)
306 {
307 register REPL_NODE *p;
308 REPL_NODE *q;
309 char *s;
310
311 /* search the list */
312 s = sval->str;
313 p = repl_list;
314 q = (REPL_NODE *) 0;
315 while (p) {
316 if (strcmp(s, p->sval->str) == 0) /* found */
317 {
318 if (!q) /* already at front */
319 return p->cp;
320 else { /* delete from list for move to front */
321 q->link = p->link;
322 goto found;
323 }
324
325 } else {
326 q = p;
327 p = p->link;
328 }
329 }
330
331 /* not found */
332 p = ZMALLOC(REPL_NODE);
333 p->sval = sval;
334 sval->ref_cnt++;
335 p->cp = REPL_compile(sval);
336
337 found:
338 /* insert p at the front of the list */
339 p->link = repl_list;
340 repl_list = p;
341 return p->cp;
342 }
343
344 /* return the string for a CELL or type REPL or REPLV,
345 this is only used by da() */
346
347 char *
repl_uncompile(CELL * cp)348 repl_uncompile(CELL *cp)
349 {
350 register REPL_NODE *p = repl_list;
351
352 if (cp->type == C_REPL) {
353 while (p) {
354 if (p->cp->type == C_REPL && p->cp->ptr == cp->ptr)
355 return p->sval->str;
356 else
357 p = p->link;
358 }
359 } else {
360 while (p) {
361 if (p->cp->type == C_REPLV &&
362 memcmp(cp->ptr, p->cp->ptr, cp->vcnt * sizeof(STRING *))
363 == 0)
364 return p->sval->str;
365 else
366 p = p->link;
367 }
368 }
369
370 #ifdef DEBUG
371 bozo("unable to uncompile an repl");
372 #else
373 return NULL;
374 #endif
375 }
376
377 /*
378 convert a C_REPLV to C_REPL
379 replacing the &s with sval
380 */
381
382 CELL *
replv_to_repl(CELL * cp,STRING * sval)383 replv_to_repl(CELL *cp, STRING * sval)
384 {
385 register STRING **p;
386 STRING **sblock = (STRING **) cp->ptr;
387 unsigned cnt, vcnt = cp->vcnt;
388 size_t len;
389 char *target;
390
391 #ifdef DEBUG
392 if (cp->type != C_REPLV)
393 bozo("not replv");
394 #endif
395
396 p = sblock;
397 cnt = vcnt;
398 len = 0;
399 while (cnt--) {
400 if (*p)
401 len += (*p++)->len;
402 else {
403 *p++ = sval;
404 sval->ref_cnt++;
405 len += sval->len;
406 }
407 }
408 cp->type = C_REPL;
409 cp->ptr = (PTR) new_STRING0(len);
410
411 p = sblock;
412 cnt = vcnt;
413 target = string(cp)->str;
414 while (cnt--) {
415 memcpy(target, (*p)->str, (*p)->len);
416 target += (*p)->len;
417 free_STRING(*p);
418 p++;
419 }
420
421 zfree(sblock, vcnt * sizeof(STRING *));
422 return cp;
423 }
424
425 #ifdef NO_LEAKS
426 typedef struct _all_ptrs {
427 struct _all_ptrs *next;
428 PTR m;
429 } ALL_PTRS;
430
431 static ALL_PTRS *all_ptrs;
432 /*
433 * Some regular expressions are parsed, and the pointer stored in the byte-code
434 * where we cannot distinguish it from other constants. Keep a list here, to
435 * free on exit for auditing.
436 */
437 void
no_leaks_re_ptr(PTR m)438 no_leaks_re_ptr(PTR m)
439 {
440 ALL_PTRS *p = calloc(1, sizeof(ALL_PTRS));
441 p->next = all_ptrs;
442 p->m = m;
443 all_ptrs = p;
444 }
445
446 void
re_leaks(void)447 re_leaks(void)
448 {
449 while (all_ptrs != 0) {
450 ALL_PTRS *next = all_ptrs->next;
451 re_destroy(all_ptrs->m);
452 free(all_ptrs);
453 all_ptrs = next;
454 }
455
456 while (repl_list != 0) {
457 REPL_NODE *p = repl_list->link;
458 free_STRING(repl_list->sval);
459 free_cell_data(repl_list->cp);
460 ZFREE(repl_list);
461 repl_list = p;
462 }
463 }
464 #endif
465