1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 /* lasciate ogne speranza, voi ch'intrate. */
26
27 #if HAVE_NBTOOL_CONFIG_H
28 #include "nbtool_config.h"
29 #endif
30
31 #define DEBUG
32
33 #include <ctype.h>
34 #include <stdio.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include <assert.h>
38 #include "awk.h"
39 #include "awkgram.h"
40
41 #define HAT (NCHARS+2) /* matches ^ in regular expr */
42 /* NCHARS is 2**n */
43 #define MAXLIN 22
44
45 #define type(v) (v)->nobj /* badly overloaded here */
46 #define info(v) (v)->ntype /* badly overloaded here */
47 #define left(v) (v)->narg[0]
48 #define right(v) (v)->narg[1]
49 #define parent(v) (v)->nnext
50
51 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
52 #define ELEAF case EMPTYRE: /* empty string in regexp */
53 #define UNARY case STAR: case PLUS: case QUEST:
54
55 /* encoding in tree Nodes:
56 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
57 left is index, right contains value or pointer to value
58 unary (STAR, PLUS, QUEST): left is child, right is null
59 binary (CAT, OR): left and right are children
60 parent contains pointer to parent
61 */
62
63
64 int *setvec;
65 int *tmpset;
66 int maxsetvec = 0;
67
68 int rtok; /* next token in current re */
69 int rlxval;
70 static const uschar *rlxstr;
71 static const uschar *prestr; /* current position in current re */
72 static const uschar *lastre; /* origin of last re */
73
74 static int setcnt;
75 static int poscnt;
76
77 uschar *patbeg;
78 int patlen;
79
80 #define NFA 128 /* cache this many dynamic fa's */
81 fa *fatab[NFA];
82 int nfatab = 0; /* entries in fatab */
83
84 static void
resizesetvec(const char * msg)85 resizesetvec(const char *msg)
86 {
87 if (maxsetvec == 0)
88 maxsetvec = MAXLIN;
89 else
90 maxsetvec *= 4;
91 setvec = realloc(setvec, maxsetvec * sizeof(*setvec));
92 tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset));
93 if (setvec == 0 || tmpset == 0)
94 overflo(msg);
95 }
96
97 static void
resize_state(fa * f,int state)98 resize_state(fa *f, int state)
99 {
100 void *p;
101 int i, new_count;
102
103 if (++state < f->state_count)
104 return;
105
106 new_count = state + 10; /* needs to be tuned */
107
108 p = realloc(f->gototab, new_count * sizeof(f->gototab[0]));
109 if (p == NULL)
110 goto out;
111 f->gototab = p;
112
113 p = realloc(f->out, new_count * sizeof(f->out[0]));
114 if (p == NULL)
115 goto out;
116 f->out = p;
117
118 p = realloc(f->posns, new_count * sizeof(f->posns[0]));
119 if (p == NULL)
120 goto out;
121 f->posns = p;
122
123 for (i = f->state_count; i < new_count; ++i) {
124 f->gototab[i] = calloc(1, NCHARS * sizeof (**f->gototab));
125 if (f->gototab[i] == NULL)
126 goto out;
127 f->out[i] = 0;
128 f->posns[i] = NULL;
129 }
130 f->state_count = new_count;
131 return;
132 out:
133 overflo("out of memory in resize_state");
134 }
135
makedfa(const char * s,int anchor)136 fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */
137 {
138 int i, use, nuse;
139 fa *pfa;
140 static int now = 1;
141
142 if (setvec == 0) /* first time through any RE */
143 resizesetvec("out of space initializing makedfa");
144
145 if (compile_time) /* a constant for sure */
146 return mkdfa(s, anchor);
147 for (i = 0; i < nfatab; i++) /* is it there already? */
148 if (fatab[i]->anchor == anchor
149 && strcmp((const char *) fatab[i]->restr, s) == 0) {
150 fatab[i]->use = now++;
151 return fatab[i];
152 }
153 pfa = mkdfa(s, anchor);
154 if (nfatab < NFA) { /* room for another */
155 fatab[nfatab] = pfa;
156 fatab[nfatab]->use = now++;
157 nfatab++;
158 return pfa;
159 }
160 use = fatab[0]->use; /* replace least-recently used */
161 nuse = 0;
162 for (i = 1; i < nfatab; i++)
163 if (fatab[i]->use < use) {
164 use = fatab[i]->use;
165 nuse = i;
166 }
167 freefa(fatab[nuse]);
168 fatab[nuse] = pfa;
169 pfa->use = now++;
170 return pfa;
171 }
172
mkdfa(const char * s,int anchor)173 fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
174 /* anchor = 1 for anchored matches, else 0 */
175 {
176 Node *p, *p1;
177 fa *f;
178
179 p = reparse(s);
180 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
181 /* put ALL STAR in front of reg. exp. */
182 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
183 /* put FINAL after reg. exp. */
184
185 poscnt = 0;
186 penter(p1); /* enter parent pointers and leaf indices */
187 if ((f = calloc(1, sizeof(*f) + poscnt*sizeof(rrow))) == NULL)
188 overflo("out of space for fa");
189 f->accept = poscnt-1; /* penter has computed number of positions in re */
190 cfoll(f, p1); /* set up follow sets */
191 freetr(p1);
192 resize_state(f, 1);
193 if ((f->posns[0] = calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
194 overflo("out of space in makedfa");
195 if ((f->posns[1] = calloc(1, sizeof(int))) == NULL)
196 overflo("out of space in makedfa");
197 *f->posns[1] = 0;
198 f->initstat = makeinit(f, anchor);
199 f->anchor = anchor;
200 f->restr = (uschar *) tostring(s);
201 return f;
202 }
203
makeinit(fa * f,int anchor)204 int makeinit(fa *f, int anchor)
205 {
206 int i, k;
207
208 resize_state(f, 2);
209 f->curstat = 2;
210 f->out[2] = 0;
211 k = *(f->re[0].lfollow);
212 xfree(f->posns[2]);
213 if ((f->posns[2] = calloc(1, (k+1)*sizeof(int))) == NULL)
214 overflo("out of space in makeinit");
215 for (i=0; i <= k; i++) {
216 (f->posns[2])[i] = (f->re[0].lfollow)[i];
217 }
218 if ((f->posns[2])[1] == f->accept)
219 f->out[2] = 1;
220 for (i=0; i < NCHARS; i++)
221 f->gototab[2][i] = 0;
222 f->curstat = cgoto(f, 2, HAT);
223 if (anchor) {
224 *f->posns[2] = k-1; /* leave out position 0 */
225 for (i=0; i < k; i++) {
226 (f->posns[0])[i] = (f->posns[2])[i];
227 }
228
229 f->out[0] = f->out[2];
230 if (f->curstat != 2) {
231 resize_state(f, f->curstat);
232 --(*f->posns[f->curstat]);
233 }
234 }
235 return f->curstat;
236 }
237
penter(Node * p)238 void penter(Node *p) /* set up parent pointers and leaf indices */
239 {
240 switch (type(p)) {
241 ELEAF
242 LEAF
243 info(p) = poscnt;
244 poscnt++;
245 break;
246 UNARY
247 penter(left(p));
248 parent(left(p)) = p;
249 break;
250 case CAT:
251 case OR:
252 penter(left(p));
253 penter(right(p));
254 parent(left(p)) = p;
255 parent(right(p)) = p;
256 break;
257 default: /* can't happen */
258 FATAL("can't happen: unknown type %d in penter", type(p));
259 break;
260 }
261 }
262
freetr(Node * p)263 void freetr(Node *p) /* free parse tree */
264 {
265 switch (type(p)) {
266 ELEAF
267 LEAF
268 xfree(p);
269 break;
270 UNARY
271 freetr(left(p));
272 xfree(p);
273 break;
274 case CAT:
275 case OR:
276 freetr(left(p));
277 freetr(right(p));
278 xfree(p);
279 break;
280 default: /* can't happen */
281 FATAL("can't happen: unknown type %d in freetr", type(p));
282 break;
283 }
284 }
285
286 /* in the parsing of regular expressions, metacharacters like . have */
287 /* to be seen literally; \056 is not a metacharacter. */
288
hexstr(const uschar ** pp)289 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
290 { /* only pick up one 8-bit byte (2 chars) */
291 const uschar *p;
292 int n = 0;
293 int i;
294
295 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
296 if (isdigit(*p))
297 n = 16 * n + *p - '0';
298 else if (*p >= 'a' && *p <= 'f')
299 n = 16 * n + *p - 'a' + 10;
300 else if (*p >= 'A' && *p <= 'F')
301 n = 16 * n + *p - 'A' + 10;
302 }
303 *pp = p;
304 return n;
305 }
306
307 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
308
quoted(const uschar ** pp)309 int quoted(const uschar **pp) /* pick up next thing after a \\ */
310 /* and increment *pp */
311 {
312 const uschar *p = *pp;
313 int c;
314
315 if ((c = *p++) == 't')
316 c = '\t';
317 else if (c == 'n')
318 c = '\n';
319 else if (c == 'f')
320 c = '\f';
321 else if (c == 'r')
322 c = '\r';
323 else if (c == 'b')
324 c = '\b';
325 else if (c == '\\')
326 c = '\\';
327 else if (c == 'x') { /* hexadecimal goo follows */
328 c = hexstr(&p); /* this adds a null if number is invalid */
329 } else if (isoctdigit(c)) { /* \d \dd \ddd */
330 int n = c - '0';
331 if (isoctdigit(*p)) {
332 n = 8 * n + *p++ - '0';
333 if (isoctdigit(*p))
334 n = 8 * n + *p++ - '0';
335 }
336 c = n;
337 } /* else */
338 /* c = c; */
339 *pp = p;
340 return c;
341 }
342
cclenter(const char * argp)343 char *cclenter(const char *argp) /* add a character class */
344 {
345 int i, c, c2;
346 const uschar *p = (const uschar *) argp;
347 const uschar *op;
348 uschar *bp;
349 static uschar *buf = 0;
350 static int bufsz = 100;
351
352 op = p;
353 if (buf == 0 && (buf = malloc(bufsz)) == NULL)
354 FATAL("out of space for character class [%.10s...] 1", p);
355 bp = buf;
356 for (i = 0; (c = *p++) != 0; ) {
357 if (c == '\\') {
358 c = quoted(&p);
359 } else if (c == '-' && i > 0 && bp[-1] != 0) {
360 if (*p != 0) {
361 c = bp[-1];
362 c2 = *p++;
363 if (c2 == '\\')
364 c2 = quoted(&p);
365 if (c > c2) { /* empty; ignore */
366 bp--;
367 i--;
368 continue;
369 }
370 while (c < c2) {
371 if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter1"))
372 FATAL("out of space for character class [%.10s...] 2", p);
373 *bp++ = ++c;
374 i++;
375 }
376 continue;
377 }
378 }
379 if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter2"))
380 FATAL("out of space for character class [%.10s...] 3", p);
381 *bp++ = c;
382 i++;
383 }
384 *bp = 0;
385 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
386 free(__UNCONST(op));
387 return (char *) tostring((char *) buf);
388 }
389
overflo(const char * s)390 void overflo(const char *s)
391 {
392 FATAL("regular expression too big: %.30s...", s);
393 }
394
cfoll(fa * f,Node * v)395 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
396 {
397 int i;
398 int *p;
399
400 switch (type(v)) {
401 ELEAF
402 LEAF
403 f->re[info(v)].ltype = type(v);
404 f->re[info(v)].lval.np = right(v);
405 while (f->accept >= maxsetvec) /* guessing here! */
406 resizesetvec("out of space in cfoll()");
407 for (i = 0; i <= f->accept; i++)
408 setvec[i] = 0;
409 setcnt = 0;
410 follow(v); /* computes setvec and setcnt */
411 if ((p = calloc(1, (setcnt+1)*sizeof(int))) == NULL)
412 overflo("out of space building follow set");
413 f->re[info(v)].lfollow = p;
414 *p = setcnt;
415 for (i = f->accept; i >= 0; i--)
416 if (setvec[i] == 1)
417 *++p = i;
418 break;
419 UNARY
420 cfoll(f,left(v));
421 break;
422 case CAT:
423 case OR:
424 cfoll(f,left(v));
425 cfoll(f,right(v));
426 break;
427 default: /* can't happen */
428 FATAL("can't happen: unknown type %d in cfoll", type(v));
429 }
430 }
431
first(Node * p)432 int first(Node *p) /* collects initially active leaves of p into setvec */
433 /* returns 0 if p matches empty string */
434 {
435 int b, lp;
436
437 switch (type(p)) {
438 ELEAF
439 LEAF
440 lp = info(p); /* look for high-water mark of subscripts */
441 while (setcnt >= maxsetvec || lp >= maxsetvec) /* guessing here! */
442 resizesetvec("out of space in first()");
443 if (type(p) == EMPTYRE) {
444 setvec[lp] = 0;
445 return(0);
446 }
447 if (setvec[lp] != 1) {
448 setvec[lp] = 1;
449 setcnt++;
450 }
451 if (type(p) == CCL && (*(char *) right(p)) == '\0')
452 return(0); /* empty CCL */
453 else return(1);
454 case PLUS:
455 if (first(left(p)) == 0) return(0);
456 return(1);
457 case STAR:
458 case QUEST:
459 first(left(p));
460 return(0);
461 case CAT:
462 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
463 return(1);
464 case OR:
465 b = first(right(p));
466 if (first(left(p)) == 0 || b == 0) return(0);
467 return(1);
468 }
469 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
470 return(-1);
471 }
472
follow(Node * v)473 void follow(Node *v) /* collects leaves that can follow v into setvec */
474 {
475 Node *p;
476
477 if (type(v) == FINAL)
478 return;
479 p = parent(v);
480 switch (type(p)) {
481 case STAR:
482 case PLUS:
483 first(v);
484 follow(p);
485 return;
486
487 case OR:
488 case QUEST:
489 follow(p);
490 return;
491
492 case CAT:
493 if (v == left(p)) { /* v is left child of p */
494 if (first(right(p)) == 0) {
495 follow(p);
496 return;
497 }
498 } else /* v is right child */
499 follow(p);
500 return;
501 }
502 }
503
member(int c,const char * sarg)504 int member(int c, const char *sarg) /* is c in s? */
505 {
506 const uschar *s = (const uschar *) sarg;
507
508 while (*s)
509 if (c == *s++)
510 return(1);
511 return(0);
512 }
513
match(fa * f,const char * p0)514 int match(fa *f, const char *p0) /* shortest match ? */
515 {
516 int s, ns;
517 const uschar *p = (const uschar *) p0;
518
519 s = f->initstat;
520 assert (s < f->state_count);
521
522 if (f->out[s])
523 return(1);
524 do {
525 /* assert(*p < NCHARS); */
526 if ((ns = f->gototab[s][*p]) != 0)
527 s = ns;
528 else
529 s = cgoto(f, s, *p);
530
531 assert (s < f->state_count);
532
533 if (f->out[s])
534 return(1);
535 } while (*p++ != 0);
536 return(0);
537 }
538
pmatch(fa * f,const char * p0)539 int pmatch(fa *f, const char *p0) /* longest match, for sub */
540 {
541 int s, ns;
542 uschar *p = __UNCONST(p0);
543 uschar *q;
544
545 s = f->initstat;
546 assert(s < f->state_count);
547 patbeg = p;
548 patlen = -1;
549 do {
550 q = p;
551 do {
552 if (f->out[s]) /* final state */
553 patlen = q-p;
554 /* assert(*q < NCHARS); */
555 if ((ns = f->gototab[s][*q]) != 0)
556 s = ns;
557 else
558 s = cgoto(f, s, *q);
559
560 assert(s < f->state_count);
561
562 if (s == 1) { /* no transition */
563 if (patlen >= 0) {
564 patbeg = p;
565 return(1);
566 }
567 else
568 goto nextin; /* no match */
569 }
570 } while (*q++ != 0);
571 if (f->out[s])
572 patlen = q-p-1; /* don't count $ */
573 if (patlen >= 0) {
574 patbeg = p;
575 return(1);
576 }
577 nextin:
578 s = 2;
579 } while (*p++ != 0);
580 return (0);
581 }
582
nematch(fa * f,const char * p0)583 int nematch(fa *f, const char *p0) /* non-empty match, for sub */
584 {
585 int s, ns;
586 uschar *p = __UNCONST(p0);
587 uschar *q;
588
589 s = f->initstat;
590 assert(s < f->state_count);
591
592 patlen = -1;
593 while (*p) {
594 q = p;
595 do {
596 if (f->out[s]) /* final state */
597 patlen = q-p;
598 /* assert(*q < NCHARS); */
599 if ((ns = f->gototab[s][*q]) != 0)
600 s = ns;
601 else
602 s = cgoto(f, s, *q);
603
604 assert(s < f->state_count);
605
606 if (s == 1) { /* no transition */
607 if (patlen > 0) {
608 patbeg = p;
609 return(1);
610 } else
611 goto nnextin; /* no nonempty match */
612 }
613 } while (*q++ != 0);
614 if (f->out[s])
615 patlen = q-p-1; /* don't count $ */
616 if (patlen > 0 ) {
617 patbeg = p;
618 return(1);
619 }
620 nnextin:
621 s = 2;
622 p++;
623 }
624 return (0);
625 }
626
627
628 /*
629 * NAME
630 * fnematch
631 *
632 * DESCRIPTION
633 * A stream-fed version of nematch which transfers characters to a
634 * null-terminated buffer. All characters up to and including the last
635 * character of the matching text or EOF are placed in the buffer. If
636 * a match is found, patbeg and patlen are set appropriately.
637 *
638 * RETURN VALUES
639 * 0 No match found.
640 * 1 Match found.
641 */
642
fnematch(fa * pfa,FILE * f,uschar ** pbuf,int * pbufsize,int quantum)643 int fnematch(fa *pfa, FILE *f, uschar **pbuf, int *pbufsize, int quantum)
644 {
645 uschar *buf = *pbuf;
646 int bufsize = *pbufsize;
647 int c, i, j, k, ns, s;
648
649 s = pfa->initstat;
650 assert(s < pfa->state_count);
651 patlen = 0;
652
653 /*
654 * All indices relative to buf.
655 * i <= j <= k <= bufsize
656 *
657 * i: origin of active substring
658 * j: current character
659 * k: destination of next getc()
660 */
661 i = -1, k = 0;
662 do {
663 j = i++;
664 do {
665 if (++j == k) {
666 if (k == bufsize)
667 if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
668 FATAL("stream '%.30s...' too long", buf);
669 buf[k++] = (c = getc(f)) != EOF ? c : 0;
670 }
671 c = buf[j];
672 /* assert(c < NCHARS); */
673
674 if ((ns = pfa->gototab[s][c]) != 0)
675 s = ns;
676 else
677 s = cgoto(pfa, s, c);
678 assert(s < pfa->state_count);
679
680 if (pfa->out[s]) { /* final state */
681 patlen = j - i + 1;
682 if (c == 0) /* don't count $ */
683 patlen--;
684 }
685 } while (buf[j] && s != 1);
686 s = 2;
687 } while (buf[i] && !patlen);
688
689 /* adjbuf() may have relocated a resized buffer. Inform the world. */
690 *pbuf = buf;
691 *pbufsize = bufsize;
692
693 if (patlen) {
694 patbeg = buf + i;
695 /*
696 * Under no circumstances is the last character fed to
697 * the automaton part of the match. It is EOF's nullbyte,
698 * or it sent the automaton into a state with no further
699 * transitions available (s==1), or both. Room for a
700 * terminating nullbyte is guaranteed.
701 *
702 * ungetc any chars after the end of matching text
703 * (except for EOF's nullbyte, if present) and null
704 * terminate the buffer.
705 */
706 do
707 if (buf[--k] && ungetc(buf[k], f) == EOF)
708 FATAL("unable to ungetc '%c'", buf[k]);
709 while (k > i + patlen);
710 buf[k] = 0;
711 return 1;
712 }
713 else
714 return 0;
715 }
716
reparse(const char * p)717 Node *reparse(const char *p) /* parses regular expression pointed to by p */
718 { /* uses relex() to scan regular expression */
719 Node *np;
720
721 dprintf( ("reparse <%s>\n", p) );
722 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
723 rtok = relex();
724 /* GNU compatibility: an empty regexp matches anything */
725 if (rtok == '\0') {
726 /* FATAL("empty regular expression"); previous */
727 return(op2(EMPTYRE, NIL, NIL));
728 }
729 np = regexp();
730 if (rtok != '\0')
731 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
732 return(np);
733 }
734
regexp(void)735 Node *regexp(void) /* top-level parse of reg expr */
736 {
737 return (alt(concat(primary())));
738 }
739
primary(void)740 Node *primary(void)
741 {
742 Node *np;
743
744 switch (rtok) {
745 case CHAR:
746 np = op2(CHAR, NIL, itonp(rlxval));
747 rtok = relex();
748 return (unary(np));
749 case ALL:
750 rtok = relex();
751 return (unary(op2(ALL, NIL, NIL)));
752 case EMPTYRE:
753 rtok = relex();
754 return (unary(op2(ALL, NIL, NIL)));
755 case DOT:
756 rtok = relex();
757 return (unary(op2(DOT, NIL, NIL)));
758 case CCL:
759 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
760 rtok = relex();
761 return (unary(np));
762 case NCCL:
763 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
764 rtok = relex();
765 return (unary(np));
766 case '^':
767 rtok = relex();
768 return (unary(op2(CHAR, NIL, itonp(HAT))));
769 case '$':
770 rtok = relex();
771 return (unary(op2(CHAR, NIL, NIL)));
772 case '(':
773 rtok = relex();
774 if (rtok == ')') { /* special pleading for () */
775 rtok = relex();
776 return unary(op2(CCL, NIL, (Node *) tostring("")));
777 }
778 np = regexp();
779 if (rtok == ')') {
780 rtok = relex();
781 return (unary(np));
782 }
783 else
784 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
785 default:
786 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
787 }
788 return 0; /*NOTREACHED*/
789 }
790
concat(Node * np)791 Node *concat(Node *np)
792 {
793 switch (rtok) {
794 case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
795 return (concat(op2(CAT, np, primary())));
796 }
797 return (np);
798 }
799
alt(Node * np)800 Node *alt(Node *np)
801 {
802 if (rtok == OR) {
803 rtok = relex();
804 return (alt(op2(OR, np, concat(primary()))));
805 }
806 return (np);
807 }
808
unary(Node * np)809 Node *unary(Node *np)
810 {
811 switch (rtok) {
812 case STAR:
813 rtok = relex();
814 return (unary(op2(STAR, np, NIL)));
815 case PLUS:
816 rtok = relex();
817 return (unary(op2(PLUS, np, NIL)));
818 case QUEST:
819 rtok = relex();
820 return (unary(op2(QUEST, np, NIL)));
821 default:
822 return (np);
823 }
824 }
825
826 /*
827 * Character class definitions conformant to the POSIX locale as
828 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
829 * and operating character sets are both ASCII (ISO646) or supersets
830 * thereof.
831 *
832 * Note that to avoid overflowing the temporary buffer used in
833 * relex(), the expanded character class (prior to range expansion)
834 * must be less than twice the size of their full name.
835 */
836
837 /* Because isblank doesn't show up in any of the header files on any
838 * system i use, it's defined here. if some other locale has a richer
839 * definition of "blank", define HAS_ISBLANK and provide your own
840 * version.
841 * the parentheses here are an attempt to find a path through the maze
842 * of macro definition and/or function and/or version provided. thanks
843 * to nelson beebe for the suggestion; let's see if it works everywhere.
844 */
845
846 /* #define HAS_ISBLANK */
847
848 static const struct charclass {
849 const char *cc_name;
850 int cc_namelen;
851 int (*cc_func)(int);
852 } charclasses[] = {
853 { "alnum", 5, isalnum },
854 { "alpha", 5, isalpha },
855 #ifndef HAS_ISBLANK
856 { "blank", 5, isspace }, /* was isblank */
857 #else
858 { "blank", 5, isblank },
859 #endif
860 { "cntrl", 5, iscntrl },
861 { "digit", 5, isdigit },
862 { "graph", 5, isgraph },
863 { "lower", 5, islower },
864 { "print", 5, isprint },
865 { "punct", 5, ispunct },
866 { "space", 5, isspace },
867 { "upper", 5, isupper },
868 { "xdigit", 6, isxdigit },
869 { NULL, 0, NULL },
870 };
871
872
relex(void)873 int relex(void) /* lexical analyzer for reparse */
874 {
875 int c, n;
876 int cflag;
877 static uschar *buf = 0;
878 static int bufsz = 100;
879 uschar *bp;
880 const struct charclass *cc;
881 int i;
882
883 switch (c = *prestr++) {
884 case '|': return OR;
885 case '*': return STAR;
886 case '+': return PLUS;
887 case '?': return QUEST;
888 case '.': return DOT;
889 case '\0': prestr--; return '\0';
890 case '^':
891 case '$':
892 case '(':
893 case ')':
894 return c;
895 case '\\':
896 rlxval = quoted(&prestr);
897 return CHAR;
898 default:
899 rlxval = c;
900 return CHAR;
901 case '[':
902 if (buf == 0 && (buf = malloc(bufsz)) == NULL)
903 FATAL("out of space in reg expr %.10s..", lastre);
904 bp = buf;
905 if (*prestr == '^') {
906 cflag = 1;
907 prestr++;
908 }
909 else
910 cflag = 0;
911 n = 2 * strlen((const char *) prestr)+1;
912 if (!adjbuf(&buf, &bufsz, n, n, &bp, "relex1"))
913 FATAL("out of space for reg expr %.10s...", lastre);
914 for (; ; ) {
915 if ((c = *prestr++) == '\\') {
916 *bp++ = '\\';
917 if ((c = *prestr++) == '\0')
918 FATAL("nonterminated character class %.20s...", lastre);
919 *bp++ = c;
920 /* } else if (c == '\n') { */
921 /* FATAL("newline in character class %.20s...", lastre); */
922 } else if (c == '[' && *prestr == ':') {
923 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
924 for (cc = charclasses; cc->cc_name; cc++)
925 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
926 break;
927 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
928 prestr[2 + cc->cc_namelen] == ']') {
929 prestr += cc->cc_namelen + 3;
930 for (i = 1; i < NCHARS; i++) {
931 if (!adjbuf(&buf, &bufsz, bp-buf+1, 100, &bp, "relex2"))
932 FATAL("out of space for reg expr %.10s...", lastre);
933 if (cc->cc_func(i)) {
934 *bp++ = i;
935 n++;
936 }
937 }
938 } else
939 *bp++ = c;
940 } else if (c == '\0') {
941 FATAL("nonterminated character class %.20s", lastre);
942 } else if (bp == buf) { /* 1st char is special */
943 *bp++ = c;
944 } else if (c == ']') {
945 *bp++ = 0;
946 rlxstr = (uschar *) tostring((char *) buf);
947 if (cflag == 0)
948 return CCL;
949 else
950 return NCCL;
951 } else
952 *bp++ = c;
953 }
954 }
955 }
956
cgoto(fa * f,int s,int c)957 int cgoto(fa *f, int s, int c)
958 {
959 int i, j, k;
960 int *p, *q;
961
962 assert(c == HAT || c < NCHARS);
963 while (f->accept >= maxsetvec) /* guessing here! */
964 resizesetvec("out of space in cgoto()");
965 for (i = 0; i <= f->accept; i++)
966 setvec[i] = 0;
967 setcnt = 0;
968 resize_state(f, s);
969 /* compute positions of gototab[s,c] into setvec */
970 p = f->posns[s];
971 for (i = 1; i <= *p; i++) {
972 if ((k = f->re[p[i]].ltype) != FINAL) {
973 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
974 || (k == DOT && c != 0 && c != HAT)
975 || (k == ALL && c != 0)
976 || (k == EMPTYRE && c != 0)
977 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
978 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
979 q = f->re[p[i]].lfollow;
980 for (j = 1; j <= *q; j++) {
981 if (q[j] >= maxsetvec)
982 resizesetvec("cgoto overflow");
983 if (setvec[q[j]] == 0) {
984 setcnt++;
985 setvec[q[j]] = 1;
986 }
987 }
988 }
989 }
990 }
991 /* determine if setvec is a previous state */
992 tmpset[0] = setcnt;
993 j = 1;
994 for (i = f->accept; i >= 0; i--)
995 if (setvec[i]) {
996 tmpset[j++] = i;
997 }
998
999 resize_state(f, f->curstat > s ? f->curstat : s);
1000 /* tmpset == previous state? */
1001 for (i = 1; i <= f->curstat; i++) {
1002 p = f->posns[i];
1003 if ((k = tmpset[0]) != p[0])
1004 goto different;
1005 for (j = 1; j <= k; j++)
1006 if (tmpset[j] != p[j])
1007 goto different;
1008 /* setvec is state i */
1009 if (c != HAT)
1010 f->gototab[s][c] = i;
1011 return i;
1012 different:;
1013 }
1014
1015 /* add tmpset to current set of states */
1016 ++(f->curstat);
1017 resize_state(f, f->curstat);
1018 for (i = 0; i < NCHARS; i++)
1019 f->gototab[f->curstat][i] = 0;
1020 xfree(f->posns[f->curstat]);
1021 if ((p = calloc(1, (setcnt+1)*sizeof(int))) == NULL)
1022 overflo("out of space in cgoto");
1023
1024 f->posns[f->curstat] = p;
1025 if (c != HAT)
1026 f->gototab[s][c] = f->curstat;
1027 for (i = 0; i <= setcnt; i++)
1028 p[i] = tmpset[i];
1029 if (setvec[f->accept])
1030 f->out[f->curstat] = 1;
1031 else
1032 f->out[f->curstat] = 0;
1033 return f->curstat;
1034 }
1035
1036
freefa(fa * f)1037 void freefa(fa *f) /* free a finite automaton */
1038 {
1039 int i;
1040
1041 if (f == NULL)
1042 return;
1043 for (i = 0; i < f->state_count; i++) {
1044 xfree(f->gototab[i])
1045 xfree(f->posns[i]);
1046 }
1047 for (i = 0; i <= f->accept; i++) {
1048 xfree(f->re[i].lfollow);
1049 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1050 xfree((f->re[i].lval.np));
1051 }
1052 xfree(f->restr);
1053 xfree(f->out);
1054 xfree(f->posns);
1055 xfree(f->gototab);
1056 xfree(f);
1057 }
1058