1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 /* lasciate ogne speranza, voi ch'intrate. */
26
27 #if HAVE_NBTOOL_CONFIG_H
28 #include "nbtool_config.h"
29 #endif
30
31 #define DEBUG
32
33 #include <ctype.h>
34 #include <limits.h>
35 #include <stdio.h>
36 #include <string.h>
37 #include <stdlib.h>
38 #include "awk.h"
39 #include "awkgram.h"
40
41 #define MAXLIN 22
42
43 #define type(v) (v)->nobj /* badly overloaded here */
44 #define info(v) (v)->ntype /* badly overloaded here */
45 #define left(v) (v)->narg[0]
46 #define right(v) (v)->narg[1]
47 #define parent(v) (v)->nnext
48
49 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
50 #define ELEAF case EMPTYRE: /* empty string in regexp */
51 #define UNARY case STAR: case PLUS: case QUEST:
52
53 /* encoding in tree Nodes:
54 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
55 left is index, right contains value or pointer to value
56 unary (STAR, PLUS, QUEST): left is child, right is null
57 binary (CAT, OR): left and right are children
58 parent contains pointer to parent
59 */
60
61
62 int *setvec;
63 int *tmpset;
64 int maxsetvec = 0;
65
66 int rtok; /* next token in current re */
67 int rlxval;
68 static const uschar *rlxstr;
69 static const uschar *prestr; /* current position in current re */
70 static const uschar *lastre; /* origin of last re */
71 static const uschar *lastatom; /* origin of last Atom */
72 static const uschar *starttok;
73 static const uschar *basestr; /* starts with original, replaced during
74 repetition processing */
75 static const uschar *firstbasestr;
76
77 static int setcnt;
78 static int poscnt;
79
80 const char *patbeg;
81 int patlen;
82
83 #define NFA 128 /* cache this many dynamic fa's */
84 fa *fatab[NFA];
85 int nfatab = 0; /* entries in fatab */
86
87 static int *
intalloc(size_t n,const char * f)88 intalloc(size_t n, const char *f)
89 {
90 void *p = calloc(n, sizeof(int));
91 if (p == NULL)
92 overflo(f);
93 return p;
94 }
95
96 static void
resizesetvec(const char * f)97 resizesetvec(const char *f)
98 {
99 if (maxsetvec == 0)
100 maxsetvec = MAXLIN;
101 else
102 maxsetvec *= 4;
103 setvec = realloc(setvec, maxsetvec * sizeof(*setvec));
104 tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset));
105 if (setvec == NULL || tmpset == NULL)
106 overflo(f);
107 }
108
109 static void
resize_state(fa * f,int state)110 resize_state(fa *f, int state)
111 {
112 void *p;
113 int i, new_count;
114
115 if (++state < f->state_count)
116 return;
117
118 new_count = state + 10; /* needs to be tuned */
119
120 p = realloc(f->gototab, new_count * sizeof(f->gototab[0]));
121 if (p == NULL)
122 goto out;
123 f->gototab = p;
124
125 p = realloc(f->out, new_count * sizeof(f->out[0]));
126 if (p == NULL)
127 goto out;
128 f->out = p;
129
130 p = realloc(f->posns, new_count * sizeof(f->posns[0]));
131 if (p == NULL)
132 goto out;
133 f->posns = p;
134
135 for (i = f->state_count; i < new_count; ++i) {
136 f->gototab[i] = calloc(NCHARS, sizeof(**f->gototab));
137 if (f->gototab[i] == NULL)
138 goto out;
139 f->out[i] = 0;
140 f->posns[i] = NULL;
141 }
142 f->state_count = new_count;
143 return;
144 out:
145 overflo(__func__);
146 }
147
makedfa(const char * s,bool anchor)148 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
149 {
150 int i, use, nuse;
151 fa *pfa;
152 static int now = 1;
153
154 if (setvec == NULL) { /* first time through any RE */
155 resizesetvec(__func__);
156 }
157
158 if (compile_time != RUNNING) /* a constant for sure */
159 return mkdfa(s, anchor);
160 for (i = 0; i < nfatab; i++) /* is it there already? */
161 if (fatab[i]->anchor == anchor
162 && strcmp((const char *) fatab[i]->restr, s) == 0) {
163 fatab[i]->use = now++;
164 return fatab[i];
165 }
166 pfa = mkdfa(s, anchor);
167 if (nfatab < NFA) { /* room for another */
168 fatab[nfatab] = pfa;
169 fatab[nfatab]->use = now++;
170 nfatab++;
171 return pfa;
172 }
173 use = fatab[0]->use; /* replace least-recently used */
174 nuse = 0;
175 for (i = 1; i < nfatab; i++)
176 if (fatab[i]->use < use) {
177 use = fatab[i]->use;
178 nuse = i;
179 }
180 freefa(fatab[nuse]);
181 fatab[nuse] = pfa;
182 pfa->use = now++;
183 return pfa;
184 }
185
mkdfa(const char * s,bool anchor)186 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
187 /* anchor = true for anchored matches, else false */
188 {
189 Node *p, *p1;
190 fa *f;
191
192 firstbasestr = (const uschar *) s;
193 basestr = firstbasestr;
194 p = reparse(s);
195 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
196 /* put ALL STAR in front of reg. exp. */
197 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
198 /* put FINAL after reg. exp. */
199
200 poscnt = 0;
201 penter(p1); /* enter parent pointers and leaf indices */
202 if ((f = calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
203 overflo(__func__);
204 f->accept = poscnt-1; /* penter has computed number of positions in re */
205 cfoll(f, p1); /* set up follow sets */
206 freetr(p1);
207 resize_state(f, 1);
208 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
209 f->posns[1] = intalloc(1, __func__);
210 *f->posns[1] = 0;
211 f->initstat = makeinit(f, anchor);
212 f->anchor = anchor;
213 f->restr = (uschar *) tostring(s);
214 if (firstbasestr != basestr) {
215 if (basestr)
216 xfree(basestr);
217 }
218 return f;
219 }
220
makeinit(fa * f,bool anchor)221 int makeinit(fa *f, bool anchor)
222 {
223 int i, k;
224
225 f->curstat = 2;
226 f->out[2] = 0;
227 k = *(f->re[0].lfollow);
228 xfree(f->posns[2]);
229 f->posns[2] = intalloc(k + 1, __func__);
230 for (i = 0; i <= k; i++) {
231 (f->posns[2])[i] = (f->re[0].lfollow)[i];
232 }
233 if ((f->posns[2])[1] == f->accept)
234 f->out[2] = 1;
235 for (i = 0; i < NCHARS; i++)
236 f->gototab[2][i] = 0;
237 f->curstat = cgoto(f, 2, HAT);
238 if (anchor) {
239 *f->posns[2] = k-1; /* leave out position 0 */
240 for (i = 0; i < k; i++) {
241 (f->posns[0])[i] = (f->posns[2])[i];
242 }
243
244 f->out[0] = f->out[2];
245 if (f->curstat != 2)
246 --(*f->posns[f->curstat]);
247 }
248 return f->curstat;
249 }
250
penter(Node * p)251 void penter(Node *p) /* set up parent pointers and leaf indices */
252 {
253 switch (type(p)) {
254 ELEAF
255 LEAF
256 info(p) = poscnt;
257 poscnt++;
258 break;
259 UNARY
260 penter(left(p));
261 parent(left(p)) = p;
262 break;
263 case CAT:
264 case OR:
265 penter(left(p));
266 penter(right(p));
267 parent(left(p)) = p;
268 parent(right(p)) = p;
269 break;
270 case ZERO:
271 break;
272 default: /* can't happen */
273 FATAL("can't happen: unknown type %d in penter", type(p));
274 break;
275 }
276 }
277
freetr(Node * p)278 void freetr(Node *p) /* free parse tree */
279 {
280 switch (type(p)) {
281 ELEAF
282 LEAF
283 xfree(p);
284 break;
285 UNARY
286 case ZERO:
287 freetr(left(p));
288 xfree(p);
289 break;
290 case CAT:
291 case OR:
292 freetr(left(p));
293 freetr(right(p));
294 xfree(p);
295 break;
296 default: /* can't happen */
297 FATAL("can't happen: unknown type %d in freetr", type(p));
298 break;
299 }
300 }
301
302 /* in the parsing of regular expressions, metacharacters like . have */
303 /* to be seen literally; \056 is not a metacharacter. */
304
hexstr(const uschar ** pp)305 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
306 { /* only pick up one 8-bit byte (2 chars) */
307 const uschar *p;
308 int n = 0;
309 int i;
310
311 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
312 if (isdigit(*p))
313 n = 16 * n + *p - '0';
314 else if (*p >= 'a' && *p <= 'f')
315 n = 16 * n + *p - 'a' + 10;
316 else if (*p >= 'A' && *p <= 'F')
317 n = 16 * n + *p - 'A' + 10;
318 }
319 *pp = p;
320 return n;
321 }
322
323 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
324
quoted(const uschar ** pp)325 int quoted(const uschar **pp) /* pick up next thing after a \\ */
326 /* and increment *pp */
327 {
328 const uschar *p = *pp;
329 int c;
330
331 if ((c = *p++) == 't')
332 c = '\t';
333 else if (c == 'n')
334 c = '\n';
335 else if (c == 'f')
336 c = '\f';
337 else if (c == 'r')
338 c = '\r';
339 else if (c == 'b')
340 c = '\b';
341 else if (c == 'v')
342 c = '\v';
343 else if (c == 'a')
344 c = '\a';
345 else if (c == '\\')
346 c = '\\';
347 else if (c == 'x') { /* hexadecimal goo follows */
348 c = hexstr(&p); /* this adds a null if number is invalid */
349 } else if (isoctdigit(c)) { /* \d \dd \ddd */
350 int n = c - '0';
351 if (isoctdigit(*p)) {
352 n = 8 * n + *p++ - '0';
353 if (isoctdigit(*p))
354 n = 8 * n + *p++ - '0';
355 }
356 c = n;
357 } /* else */
358 /* c = c; */
359 *pp = p;
360 return c;
361 }
362
cclenter(const char * argp)363 char *cclenter(const char *argp) /* add a character class */
364 {
365 int i, c, c2;
366 const uschar *op, *p = (const uschar *) argp;
367 uschar *bp;
368 static uschar *buf = NULL;
369 static int bufsz = 100;
370
371 op = p;
372 if (buf == NULL && (buf = malloc(bufsz)) == NULL)
373 FATAL("out of space for character class [%.10s...] 1", p);
374 bp = buf;
375 for (i = 0; (c = *p++) != 0; ) {
376 if (c == '\\') {
377 c = quoted(&p);
378 } else if (c == '-' && i > 0 && bp[-1] != 0) {
379 if (*p != 0) {
380 c = bp[-1];
381 c2 = *p++;
382 if (c2 == '\\')
383 c2 = quoted(&p);
384 if (c > c2) { /* empty; ignore */
385 bp--;
386 i--;
387 continue;
388 }
389 while (c < c2) {
390 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
391 FATAL("out of space for character class [%.10s...] 2", p);
392 *bp++ = ++c;
393 i++;
394 }
395 continue;
396 }
397 }
398 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
399 FATAL("out of space for character class [%.10s...] 3", p);
400 *bp++ = c;
401 i++;
402 }
403 *bp = 0;
404 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
405 xfree(op);
406 return (char *) tostring((char *) buf);
407 }
408
overflo(const char * s)409 void overflo(const char *s)
410 {
411 FATAL("regular expression too big: out of space in %.30s...", s);
412 }
413
cfoll(fa * f,Node * v)414 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
415 {
416 int i;
417 int *p;
418
419 switch (type(v)) {
420 ELEAF
421 LEAF
422 f->re[info(v)].ltype = type(v);
423 f->re[info(v)].lval.np = right(v);
424 while (f->accept >= maxsetvec) { /* guessing here! */
425 resizesetvec(__func__);
426 }
427 for (i = 0; i <= f->accept; i++)
428 setvec[i] = 0;
429 setcnt = 0;
430 follow(v); /* computes setvec and setcnt */
431 p = intalloc(setcnt + 1, __func__);
432 f->re[info(v)].lfollow = p;
433 *p = setcnt;
434 for (i = f->accept; i >= 0; i--)
435 if (setvec[i] == 1)
436 *++p = i;
437 break;
438 UNARY
439 cfoll(f,left(v));
440 break;
441 case CAT:
442 case OR:
443 cfoll(f,left(v));
444 cfoll(f,right(v));
445 break;
446 case ZERO:
447 break;
448 default: /* can't happen */
449 FATAL("can't happen: unknown type %d in cfoll", type(v));
450 }
451 }
452
first(Node * p)453 int first(Node *p) /* collects initially active leaves of p into setvec */
454 /* returns 0 if p matches empty string */
455 {
456 int b, lp;
457
458 switch (type(p)) {
459 ELEAF
460 LEAF
461 lp = info(p); /* look for high-water mark of subscripts */
462 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
463 resizesetvec(__func__);
464 }
465 if (type(p) == EMPTYRE) {
466 setvec[lp] = 0;
467 return(0);
468 }
469 if (setvec[lp] != 1) {
470 setvec[lp] = 1;
471 setcnt++;
472 }
473 if (type(p) == CCL && (*(char *) right(p)) == '\0')
474 return(0); /* empty CCL */
475 return(1);
476 case PLUS:
477 if (first(left(p)) == 0)
478 return(0);
479 return(1);
480 case STAR:
481 case QUEST:
482 first(left(p));
483 return(0);
484 case CAT:
485 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
486 return(1);
487 case OR:
488 b = first(right(p));
489 if (first(left(p)) == 0 || b == 0) return(0);
490 return(1);
491 case ZERO:
492 return 0;
493 }
494 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
495 return(-1);
496 }
497
follow(Node * v)498 void follow(Node *v) /* collects leaves that can follow v into setvec */
499 {
500 Node *p;
501
502 if (type(v) == FINAL)
503 return;
504 p = parent(v);
505 switch (type(p)) {
506 case STAR:
507 case PLUS:
508 first(v);
509 follow(p);
510 return;
511
512 case OR:
513 case QUEST:
514 follow(p);
515 return;
516
517 case CAT:
518 if (v == left(p)) { /* v is left child of p */
519 if (first(right(p)) == 0) {
520 follow(p);
521 return;
522 }
523 } else /* v is right child */
524 follow(p);
525 return;
526 }
527 }
528
member(int c,const char * sarg)529 int member(int c, const char *sarg) /* is c in s? */
530 {
531 const uschar *s = (const uschar *) sarg;
532
533 while (*s)
534 if (c == *s++)
535 return(1);
536 return(0);
537 }
538
match(fa * f,const char * p0)539 int match(fa *f, const char *p0) /* shortest match ? */
540 {
541 int s, ns;
542 const uschar *p = (const uschar *) p0;
543
544 s = f->initstat;
545 assert (s < f->state_count);
546
547 if (f->out[s])
548 return(1);
549 do {
550 /* assert(*p < NCHARS); */
551 if ((ns = f->gototab[s][*p]) != 0)
552 s = ns;
553 else
554 s = cgoto(f, s, *p);
555 if (f->out[s])
556 return(1);
557 } while (*p++ != 0);
558 return(0);
559 }
560
pmatch(fa * f,const char * p0)561 int pmatch(fa *f, const char *p0) /* longest match, for sub */
562 {
563 int s, ns;
564 const uschar *p = (const uschar *) p0;
565 const uschar *q;
566
567 s = f->initstat;
568 assert(s < f->state_count);
569
570 patbeg = (const char *)p;
571 patlen = -1;
572 do {
573 q = p;
574 do {
575 if (f->out[s]) /* final state */
576 patlen = q-p;
577 /* assert(*q < NCHARS); */
578 if ((ns = f->gototab[s][*q]) != 0)
579 s = ns;
580 else
581 s = cgoto(f, s, *q);
582
583 assert(s < f->state_count);
584
585 if (s == 1) { /* no transition */
586 if (patlen >= 0) {
587 patbeg = (const char *) p;
588 return(1);
589 }
590 else
591 goto nextin; /* no match */
592 }
593 } while (*q++ != 0);
594 if (f->out[s])
595 patlen = q-p-1; /* don't count $ */
596 if (patlen >= 0) {
597 patbeg = (const char *) p;
598 return(1);
599 }
600 nextin:
601 s = 2;
602 } while (*p++);
603 return (0);
604 }
605
nematch(fa * f,const char * p0)606 int nematch(fa *f, const char *p0) /* non-empty match, for sub */
607 {
608 int s, ns;
609 const uschar *p = (const uschar *) p0;
610 const uschar *q;
611
612 s = f->initstat;
613 assert(s < f->state_count);
614
615 patbeg = (const char *)p;
616 patlen = -1;
617 while (*p) {
618 q = p;
619 do {
620 if (f->out[s]) /* final state */
621 patlen = q-p;
622 /* assert(*q < NCHARS); */
623 if ((ns = f->gototab[s][*q]) != 0)
624 s = ns;
625 else
626 s = cgoto(f, s, *q);
627 if (s == 1) { /* no transition */
628 if (patlen > 0) {
629 patbeg = (const char *) p;
630 return(1);
631 } else
632 goto nnextin; /* no nonempty match */
633 }
634 } while (*q++ != 0);
635 if (f->out[s])
636 patlen = q-p-1; /* don't count $ */
637 if (patlen > 0 ) {
638 patbeg = (const char *) p;
639 return(1);
640 }
641 nnextin:
642 s = 2;
643 p++;
644 }
645 return (0);
646 }
647
648
649 /*
650 * NAME
651 * fnematch
652 *
653 * DESCRIPTION
654 * A stream-fed version of nematch which transfers characters to a
655 * null-terminated buffer. All characters up to and including the last
656 * character of the matching text or EOF are placed in the buffer. If
657 * a match is found, patbeg and patlen are set appropriately.
658 *
659 * RETURN VALUES
660 * false No match found.
661 * true Match found.
662 */
663
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)664 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
665 {
666 char *buf = *pbuf;
667 int bufsize = *pbufsize;
668 int c, i, j, k, ns, s;
669
670 s = pfa->initstat;
671 patlen = 0;
672
673 /*
674 * All indices relative to buf.
675 * i <= j <= k <= bufsize
676 *
677 * i: origin of active substring
678 * j: current character
679 * k: destination of next getc()
680 */
681 i = -1, k = 0;
682 do {
683 j = i++;
684 do {
685 if (++j == k) {
686 if (k == bufsize)
687 if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
688 FATAL("stream '%.30s...' too long", buf);
689 buf[k++] = (c = getc(f)) != EOF ? c : 0;
690 }
691 c = buf[j];
692 /* assert(c < NCHARS); */
693
694 if ((ns = pfa->gototab[s][c]) != 0)
695 s = ns;
696 else
697 s = cgoto(pfa, s, c);
698
699 if (pfa->out[s]) { /* final state */
700 patlen = j - i + 1;
701 if (c == 0) /* don't count $ */
702 patlen--;
703 }
704 } while (buf[j] && s != 1);
705 s = 2;
706 } while (buf[i] && !patlen);
707
708 /* adjbuf() may have relocated a resized buffer. Inform the world. */
709 *pbuf = buf;
710 *pbufsize = bufsize;
711
712 if (patlen) {
713 patbeg = (char *) buf + i;
714 /*
715 * Under no circumstances is the last character fed to
716 * the automaton part of the match. It is EOF's nullbyte,
717 * or it sent the automaton into a state with no further
718 * transitions available (s==1), or both. Room for a
719 * terminating nullbyte is guaranteed.
720 *
721 * ungetc any chars after the end of matching text
722 * (except for EOF's nullbyte, if present) and null
723 * terminate the buffer.
724 */
725 do
726 if (buf[--k] && ungetc(buf[k], f) == EOF)
727 FATAL("unable to ungetc '%c'", buf[k]);
728 while (k > i + patlen);
729 buf[k] = '\0';
730 return true;
731 }
732 else
733 return false;
734 }
735
reparse(const char * p)736 Node *reparse(const char *p) /* parses regular expression pointed to by p */
737 { /* uses relex() to scan regular expression */
738 Node *np;
739
740 dprintf( ("reparse <%s>\n", p) );
741 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
742 rtok = relex();
743 /* GNU compatibility: an empty regexp matches anything */
744 if (rtok == '\0') {
745 /* FATAL("empty regular expression"); previous */
746 return(op2(EMPTYRE, NIL, NIL));
747 }
748 np = regexp();
749 if (rtok != '\0')
750 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
751 return(np);
752 }
753
regexp(void)754 Node *regexp(void) /* top-level parse of reg expr */
755 {
756 return (alt(concat(primary())));
757 }
758
primary(void)759 Node *primary(void)
760 {
761 Node *np;
762 int savelastatom;
763
764 switch (rtok) {
765 case CHAR:
766 lastatom = starttok;
767 np = op2(CHAR, NIL, itonp(rlxval));
768 rtok = relex();
769 return (unary(np));
770 case ALL:
771 rtok = relex();
772 return (unary(op2(ALL, NIL, NIL)));
773 case EMPTYRE:
774 rtok = relex();
775 return (unary(op2(EMPTYRE, NIL, NIL)));
776 case DOT:
777 lastatom = starttok;
778 rtok = relex();
779 return (unary(op2(DOT, NIL, NIL)));
780 case CCL:
781 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
782 lastatom = starttok;
783 rtok = relex();
784 return (unary(np));
785 case NCCL:
786 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
787 lastatom = starttok;
788 rtok = relex();
789 return (unary(np));
790 case '^':
791 rtok = relex();
792 return (unary(op2(CHAR, NIL, itonp(HAT))));
793 case '$':
794 rtok = relex();
795 return (unary(op2(CHAR, NIL, NIL)));
796 case '(':
797 lastatom = starttok;
798 savelastatom = starttok - basestr; /* Retain over recursion */
799 rtok = relex();
800 if (rtok == ')') { /* special pleading for () */
801 rtok = relex();
802 return unary(op2(CCL, NIL, (Node *) tostring("")));
803 }
804 np = regexp();
805 if (rtok == ')') {
806 lastatom = basestr + savelastatom; /* Restore */
807 rtok = relex();
808 return (unary(np));
809 }
810 else
811 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
812 default:
813 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
814 }
815 return 0; /*NOTREACHED*/
816 }
817
concat(Node * np)818 Node *concat(Node *np)
819 {
820 switch (rtok) {
821 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
822 return (concat(op2(CAT, np, primary())));
823 case EMPTYRE:
824 rtok = relex();
825 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
826 primary())));
827 }
828 return (np);
829 }
830
alt(Node * np)831 Node *alt(Node *np)
832 {
833 if (rtok == OR) {
834 rtok = relex();
835 return (alt(op2(OR, np, concat(primary()))));
836 }
837 return (np);
838 }
839
unary(Node * np)840 Node *unary(Node *np)
841 {
842 switch (rtok) {
843 case STAR:
844 rtok = relex();
845 return (unary(op2(STAR, np, NIL)));
846 case PLUS:
847 rtok = relex();
848 return (unary(op2(PLUS, np, NIL)));
849 case QUEST:
850 rtok = relex();
851 return (unary(op2(QUEST, np, NIL)));
852 case ZERO:
853 rtok = relex();
854 return (unary(op2(ZERO, np, NIL)));
855 default:
856 return (np);
857 }
858 }
859
860 /*
861 * Character class definitions conformant to the POSIX locale as
862 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
863 * and operating character sets are both ASCII (ISO646) or supersets
864 * thereof.
865 *
866 * Note that to avoid overflowing the temporary buffer used in
867 * relex(), the expanded character class (prior to range expansion)
868 * must be less than twice the size of their full name.
869 */
870
871 /* Because isblank doesn't show up in any of the header files on any
872 * system i use, it's defined here. if some other locale has a richer
873 * definition of "blank", define HAS_ISBLANK and provide your own
874 * version.
875 * the parentheses here are an attempt to find a path through the maze
876 * of macro definition and/or function and/or version provided. thanks
877 * to nelson beebe for the suggestion; let's see if it works everywhere.
878 */
879
880 /* #define HAS_ISBLANK */
881 #ifndef HAS_ISBLANK
882
883 int (xisblank)(int c)
884 {
885 return c==' ' || c=='\t';
886 }
887
888 #endif
889
890 static const struct charclass {
891 const char *cc_name;
892 int cc_namelen;
893 int (*cc_func)(int);
894 } charclasses[] = {
895 { "alnum", 5, isalnum },
896 { "alpha", 5, isalpha },
897 #ifndef HAS_ISBLANK
898 { "blank", 5, xisblank },
899 #else
900 { "blank", 5, isblank },
901 #endif
902 { "cntrl", 5, iscntrl },
903 { "digit", 5, isdigit },
904 { "graph", 5, isgraph },
905 { "lower", 5, islower },
906 { "print", 5, isprint },
907 { "punct", 5, ispunct },
908 { "space", 5, isspace },
909 { "upper", 5, isupper },
910 { "xdigit", 6, isxdigit },
911 { NULL, 0, NULL },
912 };
913
914 #define REPEAT_SIMPLE 0
915 #define REPEAT_PLUS_APPENDED 1
916 #define REPEAT_WITH_Q 2
917 #define REPEAT_ZERO 3
918
919 static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)920 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
921 int atomlen, int firstnum, int secondnum, int special_case)
922 {
923 int i, j;
924 uschar *buf = 0;
925 int ret = 1;
926 int init_q = (firstnum == 0); /* first added char will be ? */
927 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
928 int prefix_length = reptok - basestr; /* prefix includes first rep */
929 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
930 int size = prefix_length + suffix_length;
931
932 if (firstnum > 1) { /* add room for reps 2 through firstnum */
933 size += atomlen*(firstnum-1);
934 }
935
936 /* Adjust size of buffer for special cases */
937 if (special_case == REPEAT_PLUS_APPENDED) {
938 size++; /* for the final + */
939 } else if (special_case == REPEAT_WITH_Q) {
940 size += init_q + (atomlen+1)* n_q_reps;
941 } else if (special_case == REPEAT_ZERO) {
942 size += 2; /* just a null ERE: () */
943 }
944 if ((buf = malloc(size + 1)) == NULL)
945 FATAL("out of space in reg expr %.10s..", lastre);
946 memcpy(buf, basestr, prefix_length); /* copy prefix */
947 j = prefix_length;
948 if (special_case == REPEAT_ZERO) {
949 j -= atomlen;
950 buf[j++] = '(';
951 buf[j++] = ')';
952 }
953 for (i = 1; i < firstnum; i++) { /* copy x reps */
954 memcpy(&buf[j], atom, atomlen);
955 j += atomlen;
956 }
957 if (special_case == REPEAT_PLUS_APPENDED) {
958 buf[j++] = '+';
959 } else if (special_case == REPEAT_WITH_Q) {
960 if (init_q)
961 buf[j++] = '?';
962 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
963 memcpy(&buf[j], atom, atomlen);
964 j += atomlen;
965 buf[j++] = '?';
966 }
967 }
968 memcpy(&buf[j], reptok+reptoklen, suffix_length);
969 if (special_case == REPEAT_ZERO) {
970 buf[j+suffix_length] = '\0';
971 } else {
972 buf[size] = '\0';
973 }
974 /* free old basestr */
975 if (firstbasestr != basestr) {
976 if (basestr)
977 xfree(basestr);
978 }
979 basestr = buf;
980 prestr = buf + prefix_length;
981 if (special_case == REPEAT_ZERO) {
982 prestr -= atomlen;
983 ret++;
984 }
985 return ret;
986 }
987
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)988 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
989 int atomlen, int firstnum, int secondnum)
990 {
991 /*
992 In general, the repetition specifier or "bound" is replaced here
993 by an equivalent ERE string, repeating the immediately previous atom
994 and appending ? and + as needed. Note that the first copy of the
995 atom is left in place, except in the special_case of a zero-repeat
996 (i.e., {0}).
997 */
998 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
999 if (firstnum < 2) {
1000 /* 0 or 1: should be handled before you get here */
1001 FATAL("internal error");
1002 } else {
1003 return replace_repeat(reptok, reptoklen, atom, atomlen,
1004 firstnum, secondnum, REPEAT_PLUS_APPENDED);
1005 }
1006 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1007 if (firstnum == 0) { /* {0} or {0,0} */
1008 /* This case is unusual because the resulting
1009 replacement string might actually be SMALLER than
1010 the original ERE */
1011 return replace_repeat(reptok, reptoklen, atom, atomlen,
1012 firstnum, secondnum, REPEAT_ZERO);
1013 } else { /* (firstnum >= 1) */
1014 return replace_repeat(reptok, reptoklen, atom, atomlen,
1015 firstnum, secondnum, REPEAT_SIMPLE);
1016 }
1017 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1018 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1019 return replace_repeat(reptok, reptoklen, atom, atomlen,
1020 firstnum, secondnum, REPEAT_WITH_Q);
1021 } else { /* Error - shouldn't be here (n>m) */
1022 FATAL("internal error");
1023 }
1024 return 0;
1025 }
1026
relex(void)1027 int relex(void) /* lexical analyzer for reparse */
1028 {
1029 int c, n;
1030 int cflag;
1031 static uschar *buf = NULL;
1032 static int bufsz = 100;
1033 uschar *bp;
1034 const struct charclass *cc;
1035 int i;
1036 int num, m;
1037 bool commafound, digitfound;
1038 const uschar *startreptok;
1039 static int parens = 0;
1040
1041 rescan:
1042 starttok = prestr;
1043
1044 switch (c = *prestr++) {
1045 case '|': return OR;
1046 case '*': return STAR;
1047 case '+': return PLUS;
1048 case '?': return QUEST;
1049 case '.': return DOT;
1050 case '\0': prestr--; return '\0';
1051 case '^':
1052 case '$':
1053 return c;
1054 case '(':
1055 parens++;
1056 return c;
1057 case ')':
1058 if (parens) {
1059 parens--;
1060 return c;
1061 }
1062 /* unmatched close parenthesis; per POSIX, treat as literal */
1063 rlxval = c;
1064 return CHAR;
1065 case '\\':
1066 rlxval = quoted(&prestr);
1067 return CHAR;
1068 default:
1069 rlxval = c;
1070 return CHAR;
1071 case '[':
1072 if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1073 FATAL("out of space in reg expr %.10s..", lastre);
1074 bp = buf;
1075 if (*prestr == '^') {
1076 cflag = 1;
1077 prestr++;
1078 }
1079 else
1080 cflag = 0;
1081 n = 2 * strlen((const char *) prestr)+1;
1082 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1083 FATAL("out of space for reg expr %.10s...", lastre);
1084 for (; ; ) {
1085 if ((c = *prestr++) == '\\') {
1086 *bp++ = '\\';
1087 if ((c = *prestr++) == '\0')
1088 FATAL("nonterminated character class %.20s...", lastre);
1089 *bp++ = c;
1090 /* } else if (c == '\n') { */
1091 /* FATAL("newline in character class %.20s...", lastre); */
1092 } else if (c == '[' && *prestr == ':') {
1093 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1094 for (cc = charclasses; cc->cc_name; cc++)
1095 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1096 break;
1097 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1098 prestr[2 + cc->cc_namelen] == ']') {
1099 prestr += cc->cc_namelen + 3;
1100 /*
1101 * BUG: We begin at 1, instead of 0, since we
1102 * would otherwise prematurely terminate the
1103 * string for classes like [[:cntrl:]]. This
1104 * means that we can't match the NUL character,
1105 * not without first adapting the entire
1106 * program to track each string's length.
1107 */
1108 for (i = 1; i <= UCHAR_MAX; i++) {
1109 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1110 FATAL("out of space for reg expr %.10s...", lastre);
1111 if (cc->cc_func(i)) {
1112 *bp++ = i;
1113 n++;
1114 }
1115 }
1116 } else
1117 *bp++ = c;
1118 } else if (c == '[' && *prestr == '.') {
1119 char collate_char;
1120 prestr++;
1121 collate_char = *prestr++;
1122 if (*prestr == '.' && prestr[1] == ']') {
1123 prestr += 2;
1124 /* Found it: map via locale TBD: for
1125 now, simply return this char. This
1126 is sufficient to pass conformance
1127 test awk.ex 156
1128 */
1129 if (*prestr == ']') {
1130 prestr++;
1131 rlxval = collate_char;
1132 return CHAR;
1133 }
1134 }
1135 } else if (c == '[' && *prestr == '=') {
1136 char equiv_char;
1137 prestr++;
1138 equiv_char = *prestr++;
1139 if (*prestr == '=' && prestr[1] == ']') {
1140 prestr += 2;
1141 /* Found it: map via locale TBD: for now
1142 simply return this char. This is
1143 sufficient to pass conformance test
1144 awk.ex 156
1145 */
1146 if (*prestr == ']') {
1147 prestr++;
1148 rlxval = equiv_char;
1149 return CHAR;
1150 }
1151 }
1152 } else if (c == '\0') {
1153 FATAL("nonterminated character class %.20s", lastre);
1154 } else if (bp == buf) { /* 1st char is special */
1155 *bp++ = c;
1156 } else if (c == ']') {
1157 *bp++ = 0;
1158 rlxstr = (uschar *) tostring((char *) buf);
1159 if (cflag == 0)
1160 return CCL;
1161 else
1162 return NCCL;
1163 } else
1164 *bp++ = c;
1165 }
1166 break;
1167 case '{':
1168 if (isdigit(*(prestr))) {
1169 num = 0; /* Process as a repetition */
1170 n = -1; m = -1;
1171 commafound = false;
1172 digitfound = false;
1173 startreptok = prestr-1;
1174 /* Remember start of previous atom here ? */
1175 } else { /* just a { char, not a repetition */
1176 rlxval = c;
1177 return CHAR;
1178 }
1179 for (; ; ) {
1180 if ((c = *prestr++) == '}') {
1181 if (commafound) {
1182 if (digitfound) { /* {n,m} */
1183 m = num;
1184 if (m < n)
1185 FATAL("illegal repetition expression: class %.20s",
1186 lastre);
1187 if (n == 0 && m == 1) {
1188 return QUEST;
1189 }
1190 } else { /* {n,} */
1191 if (n == 0)
1192 return STAR;
1193 else if (n == 1)
1194 return PLUS;
1195 }
1196 } else {
1197 if (digitfound) { /* {n} same as {n,n} */
1198 n = num;
1199 m = num;
1200 } else { /* {} */
1201 FATAL("illegal repetition expression: class %.20s",
1202 lastre);
1203 }
1204 }
1205 if (repeat(starttok, prestr-starttok, lastatom,
1206 startreptok - lastatom, n, m) > 0) {
1207 if (n == 0 && m == 0) {
1208 return ZERO;
1209 }
1210 /* must rescan input for next token */
1211 goto rescan;
1212 }
1213 /* Failed to replace: eat up {...} characters
1214 and treat like just PLUS */
1215 return PLUS;
1216 } else if (c == '\0') {
1217 FATAL("nonterminated character class %.20s",
1218 lastre);
1219 } else if (isdigit(c)) {
1220 num = 10 * num + c - '0';
1221 digitfound = true;
1222 } else if (c == ',') {
1223 if (commafound)
1224 FATAL("illegal repetition expression: class %.20s",
1225 lastre);
1226 /* looking for {n,} or {n,m} */
1227 commafound = true;
1228 n = num;
1229 digitfound = false; /* reset */
1230 num = 0;
1231 } else {
1232 FATAL("illegal repetition expression: class %.20s",
1233 lastre);
1234 }
1235 }
1236 break;
1237 }
1238 }
1239
cgoto(fa * f,int s,int c)1240 int cgoto(fa *f, int s, int c)
1241 {
1242 int *p, *q;
1243 int i, j, k;
1244
1245 assert(c == HAT || c < NCHARS);
1246 while (f->accept >= maxsetvec) { /* guessing here! */
1247 resizesetvec(__func__);
1248 }
1249 for (i = 0; i <= f->accept; i++)
1250 setvec[i] = 0;
1251 setcnt = 0;
1252 resize_state(f, s);
1253 /* compute positions of gototab[s,c] into setvec */
1254 p = f->posns[s];
1255 for (i = 1; i <= *p; i++) {
1256 if ((k = f->re[p[i]].ltype) != FINAL) {
1257 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1258 || (k == DOT && c != 0 && c != HAT)
1259 || (k == ALL && c != 0)
1260 || (k == EMPTYRE && c != 0)
1261 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1262 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1263 q = f->re[p[i]].lfollow;
1264 for (j = 1; j <= *q; j++) {
1265 if (q[j] >= maxsetvec) {
1266 resizesetvec(__func__);
1267 }
1268 if (setvec[q[j]] == 0) {
1269 setcnt++;
1270 setvec[q[j]] = 1;
1271 }
1272 }
1273 }
1274 }
1275 }
1276 /* determine if setvec is a previous state */
1277 tmpset[0] = setcnt;
1278 j = 1;
1279 for (i = f->accept; i >= 0; i--)
1280 if (setvec[i]) {
1281 tmpset[j++] = i;
1282 }
1283 resize_state(f, f->curstat > s ? f->curstat : s);
1284 /* tmpset == previous state? */
1285 for (i = 1; i <= f->curstat; i++) {
1286 p = f->posns[i];
1287 if ((k = tmpset[0]) != p[0])
1288 goto different;
1289 for (j = 1; j <= k; j++)
1290 if (tmpset[j] != p[j])
1291 goto different;
1292 /* setvec is state i */
1293 if (c != HAT)
1294 f->gototab[s][c] = i;
1295 return i;
1296 different:;
1297 }
1298
1299 /* add tmpset to current set of states */
1300 ++(f->curstat);
1301 resize_state(f, f->curstat);
1302 for (i = 0; i < NCHARS; i++)
1303 f->gototab[f->curstat][i] = 0;
1304 xfree(f->posns[f->curstat]);
1305 p = intalloc(setcnt + 1, __func__);
1306
1307 f->posns[f->curstat] = p;
1308 if (c != HAT)
1309 f->gototab[s][c] = f->curstat;
1310 for (i = 0; i <= setcnt; i++)
1311 p[i] = tmpset[i];
1312 if (setvec[f->accept])
1313 f->out[f->curstat] = 1;
1314 else
1315 f->out[f->curstat] = 0;
1316 return f->curstat;
1317 }
1318
1319
freefa(fa * f)1320 void freefa(fa *f) /* free a finite automaton */
1321 {
1322 int i;
1323
1324 if (f == NULL)
1325 return;
1326 for (i = 0; i < f->state_count; i++)
1327 xfree(f->gototab[i])
1328 for (i = 0; i <= f->curstat; i++)
1329 xfree(f->posns[i]);
1330 for (i = 0; i <= f->accept; i++) {
1331 xfree(f->re[i].lfollow);
1332 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1333 xfree(f->re[i].lval.np);
1334 }
1335 xfree(f->restr);
1336 xfree(f->out);
1337 xfree(f->posns);
1338 xfree(f->gototab);
1339 xfree(f);
1340 }
1341