14b588458SPeter Avalos /****************************************************************
24b588458SPeter Avalos Copyright (C) Lucent Technologies 1997
34b588458SPeter Avalos All Rights Reserved
44b588458SPeter Avalos
54b588458SPeter Avalos Permission to use, copy, modify, and distribute this software and
64b588458SPeter Avalos its documentation for any purpose and without fee is hereby
74b588458SPeter Avalos granted, provided that the above copyright notice appear in all
84b588458SPeter Avalos copies and that both that the copyright notice and this
94b588458SPeter Avalos permission notice and warranty disclaimer appear in supporting
104b588458SPeter Avalos documentation, and that the name Lucent Technologies or any of
114b588458SPeter Avalos its entities not be used in advertising or publicity pertaining
124b588458SPeter Avalos to distribution of the software without specific, written prior
134b588458SPeter Avalos permission.
144b588458SPeter Avalos
154b588458SPeter Avalos LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
164b588458SPeter Avalos INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
174b588458SPeter Avalos IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
184b588458SPeter Avalos SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
194b588458SPeter Avalos WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
204b588458SPeter Avalos IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
214b588458SPeter Avalos ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
224b588458SPeter Avalos THIS SOFTWARE.
234b588458SPeter Avalos ****************************************************************/
244b588458SPeter Avalos
254b588458SPeter Avalos /* lasciate ogne speranza, voi ch'intrate. */
264b588458SPeter Avalos
274b588458SPeter Avalos #define DEBUG
284b588458SPeter Avalos
294b588458SPeter Avalos #include <ctype.h>
301d48fce0SDaniel Fojt #include <limits.h>
314b588458SPeter Avalos #include <stdio.h>
324b588458SPeter Avalos #include <string.h>
334b588458SPeter Avalos #include <stdlib.h>
344b588458SPeter Avalos #include "awk.h"
3548f09a05SAntonio Huete Jimenez #include "awkgram.tab.h"
364b588458SPeter Avalos
374b588458SPeter Avalos #define MAXLIN 22
384b588458SPeter Avalos
394b588458SPeter Avalos #define type(v) (v)->nobj /* badly overloaded here */
404b588458SPeter Avalos #define info(v) (v)->ntype /* badly overloaded here */
414b588458SPeter Avalos #define left(v) (v)->narg[0]
424b588458SPeter Avalos #define right(v) (v)->narg[1]
434b588458SPeter Avalos #define parent(v) (v)->nnext
444b588458SPeter Avalos
454b588458SPeter Avalos #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
464b588458SPeter Avalos #define ELEAF case EMPTYRE: /* empty string in regexp */
474b588458SPeter Avalos #define UNARY case STAR: case PLUS: case QUEST:
484b588458SPeter Avalos
494b588458SPeter Avalos /* encoding in tree Nodes:
504b588458SPeter Avalos leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
514b588458SPeter Avalos left is index, right contains value or pointer to value
524b588458SPeter Avalos unary (STAR, PLUS, QUEST): left is child, right is null
534b588458SPeter Avalos binary (CAT, OR): left and right are children
544b588458SPeter Avalos parent contains pointer to parent
554b588458SPeter Avalos */
564b588458SPeter Avalos
574b588458SPeter Avalos
584b588458SPeter Avalos int *setvec;
594b588458SPeter Avalos int *tmpset;
604b588458SPeter Avalos int maxsetvec = 0;
614b588458SPeter Avalos
624b588458SPeter Avalos int rtok; /* next token in current re */
634b588458SPeter Avalos int rlxval;
641d48fce0SDaniel Fojt static const uschar *rlxstr;
651d48fce0SDaniel Fojt static const uschar *prestr; /* current position in current re */
661d48fce0SDaniel Fojt static const uschar *lastre; /* origin of last re */
671d48fce0SDaniel Fojt static const uschar *lastatom; /* origin of last Atom */
681d48fce0SDaniel Fojt static const uschar *starttok;
691d48fce0SDaniel Fojt static const uschar *basestr; /* starts with original, replaced during
701d48fce0SDaniel Fojt repetition processing */
711d48fce0SDaniel Fojt static const uschar *firstbasestr;
724b588458SPeter Avalos
734b588458SPeter Avalos static int setcnt;
744b588458SPeter Avalos static int poscnt;
754b588458SPeter Avalos
761d48fce0SDaniel Fojt const char *patbeg;
774b588458SPeter Avalos int patlen;
784b588458SPeter Avalos
791d48fce0SDaniel Fojt #define NFA 128 /* cache this many dynamic fa's */
804b588458SPeter Avalos fa *fatab[NFA];
814b588458SPeter Avalos int nfatab = 0; /* entries in fatab */
824b588458SPeter Avalos
83*ed569bc2SAaron LI extern int u8_nextlen(const char *s);
84*ed569bc2SAaron LI
85*ed569bc2SAaron LI
86*ed569bc2SAaron LI /* utf-8 mechanism:
87*ed569bc2SAaron LI
88*ed569bc2SAaron LI For most of Awk, utf-8 strings just "work", since they look like
89*ed569bc2SAaron LI null-terminated sequences of 8-bit bytes.
90*ed569bc2SAaron LI
91*ed569bc2SAaron LI Functions like length(), index(), and substr() have to operate
92*ed569bc2SAaron LI in units of utf-8 characters. The u8_* functions in run.c
93*ed569bc2SAaron LI handle this.
94*ed569bc2SAaron LI
95*ed569bc2SAaron LI Regular expressions are more complicated, since the basic
96*ed569bc2SAaron LI mechanism of the goto table used 8-bit byte indices into the
97*ed569bc2SAaron LI gototab entries to compute the next state. Unicode is a lot
98*ed569bc2SAaron LI bigger, so the gototab entries are now structs with a character
99*ed569bc2SAaron LI and a next state. These are sorted by code point and binary
100*ed569bc2SAaron LI searched.
101*ed569bc2SAaron LI
102*ed569bc2SAaron LI Throughout the RE mechanism in b.c, utf-8 characters are
103*ed569bc2SAaron LI converted to their utf-32 value. This mostly shows up in
104*ed569bc2SAaron LI cclenter, which expands character class ranges like a-z and now
105*ed569bc2SAaron LI alpha-omega. The size of a gototab array is still about 256.
106*ed569bc2SAaron LI This should be dynamic, but for now things work ok for a single
107*ed569bc2SAaron LI code page of Unicode, which is the most likely case.
108*ed569bc2SAaron LI
109*ed569bc2SAaron LI The code changes are localized in run.c and b.c. I have added a
110*ed569bc2SAaron LI handful of functions to somewhat better hide the implementation,
111*ed569bc2SAaron LI but a lot more could be done.
112*ed569bc2SAaron LI
113*ed569bc2SAaron LI */
114*ed569bc2SAaron LI
115*ed569bc2SAaron LI static int entry_cmp(const void *l, const void *r);
116*ed569bc2SAaron LI static int get_gototab(fa*, int, int);
117*ed569bc2SAaron LI static int set_gototab(fa*, int, int, int);
118*ed569bc2SAaron LI static void clear_gototab(fa*, int);
119*ed569bc2SAaron LI extern int u8_rune(int *, const char *);
120*ed569bc2SAaron LI
1211d48fce0SDaniel Fojt static int *
intalloc(size_t n,const char * f)1221d48fce0SDaniel Fojt intalloc(size_t n, const char *f)
1231d48fce0SDaniel Fojt {
12448f09a05SAntonio Huete Jimenez int *p = (int *) calloc(n, sizeof(int));
1251d48fce0SDaniel Fojt if (p == NULL)
1261d48fce0SDaniel Fojt overflo(f);
1271d48fce0SDaniel Fojt return p;
1281d48fce0SDaniel Fojt }
1291d48fce0SDaniel Fojt
1301d48fce0SDaniel Fojt static void
resizesetvec(const char * f)1311d48fce0SDaniel Fojt resizesetvec(const char *f)
1321d48fce0SDaniel Fojt {
1331d48fce0SDaniel Fojt if (maxsetvec == 0)
1341d48fce0SDaniel Fojt maxsetvec = MAXLIN;
1351d48fce0SDaniel Fojt else
1361d48fce0SDaniel Fojt maxsetvec *= 4;
13748f09a05SAntonio Huete Jimenez setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
13848f09a05SAntonio Huete Jimenez tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
1391d48fce0SDaniel Fojt if (setvec == NULL || tmpset == NULL)
1401d48fce0SDaniel Fojt overflo(f);
1411d48fce0SDaniel Fojt }
1421d48fce0SDaniel Fojt
1431d48fce0SDaniel Fojt static void
resize_state(fa * f,int state)1441d48fce0SDaniel Fojt resize_state(fa *f, int state)
1451d48fce0SDaniel Fojt {
146*ed569bc2SAaron LI gtt *p;
14748f09a05SAntonio Huete Jimenez uschar *p2;
14848f09a05SAntonio Huete Jimenez int **p3;
1491d48fce0SDaniel Fojt int i, new_count;
1501d48fce0SDaniel Fojt
1511d48fce0SDaniel Fojt if (++state < f->state_count)
1521d48fce0SDaniel Fojt return;
1531d48fce0SDaniel Fojt
1541d48fce0SDaniel Fojt new_count = state + 10; /* needs to be tuned */
1551d48fce0SDaniel Fojt
156*ed569bc2SAaron LI p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
1571d48fce0SDaniel Fojt if (p == NULL)
1581d48fce0SDaniel Fojt goto out;
1591d48fce0SDaniel Fojt f->gototab = p;
1601d48fce0SDaniel Fojt
16148f09a05SAntonio Huete Jimenez p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
16248f09a05SAntonio Huete Jimenez if (p2 == NULL)
1631d48fce0SDaniel Fojt goto out;
16448f09a05SAntonio Huete Jimenez f->out = p2;
1651d48fce0SDaniel Fojt
16648f09a05SAntonio Huete Jimenez p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
16748f09a05SAntonio Huete Jimenez if (p3 == NULL)
1681d48fce0SDaniel Fojt goto out;
16948f09a05SAntonio Huete Jimenez f->posns = p3;
1701d48fce0SDaniel Fojt
1711d48fce0SDaniel Fojt for (i = f->state_count; i < new_count; ++i) {
172*ed569bc2SAaron LI f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173*ed569bc2SAaron LI if (f->gototab[i].entries == NULL)
1741d48fce0SDaniel Fojt goto out;
175*ed569bc2SAaron LI f->gototab[i].allocated = NCHARS;
176*ed569bc2SAaron LI f->gototab[i].inuse = 0;
1771d48fce0SDaniel Fojt f->out[i] = 0;
1781d48fce0SDaniel Fojt f->posns[i] = NULL;
1791d48fce0SDaniel Fojt }
1801d48fce0SDaniel Fojt f->state_count = new_count;
1811d48fce0SDaniel Fojt return;
1821d48fce0SDaniel Fojt out:
1831d48fce0SDaniel Fojt overflo(__func__);
1841d48fce0SDaniel Fojt }
1851d48fce0SDaniel Fojt
makedfa(const char * s,bool anchor)1861d48fce0SDaniel Fojt fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
1874b588458SPeter Avalos {
1884b588458SPeter Avalos int i, use, nuse;
1894b588458SPeter Avalos fa *pfa;
1904b588458SPeter Avalos static int now = 1;
1914b588458SPeter Avalos
1921d48fce0SDaniel Fojt if (setvec == NULL) { /* first time through any RE */
1931d48fce0SDaniel Fojt resizesetvec(__func__);
1944b588458SPeter Avalos }
1954b588458SPeter Avalos
1961d48fce0SDaniel Fojt if (compile_time != RUNNING) /* a constant for sure */
1974b588458SPeter Avalos return mkdfa(s, anchor);
1984b588458SPeter Avalos for (i = 0; i < nfatab; i++) /* is it there already? */
1994b588458SPeter Avalos if (fatab[i]->anchor == anchor
2004b588458SPeter Avalos && strcmp((const char *) fatab[i]->restr, s) == 0) {
2014b588458SPeter Avalos fatab[i]->use = now++;
2024b588458SPeter Avalos return fatab[i];
2034b588458SPeter Avalos }
2044b588458SPeter Avalos pfa = mkdfa(s, anchor);
2054b588458SPeter Avalos if (nfatab < NFA) { /* room for another */
2064b588458SPeter Avalos fatab[nfatab] = pfa;
2074b588458SPeter Avalos fatab[nfatab]->use = now++;
2084b588458SPeter Avalos nfatab++;
2094b588458SPeter Avalos return pfa;
2104b588458SPeter Avalos }
2114b588458SPeter Avalos use = fatab[0]->use; /* replace least-recently used */
2124b588458SPeter Avalos nuse = 0;
2134b588458SPeter Avalos for (i = 1; i < nfatab; i++)
2144b588458SPeter Avalos if (fatab[i]->use < use) {
2154b588458SPeter Avalos use = fatab[i]->use;
2164b588458SPeter Avalos nuse = i;
2174b588458SPeter Avalos }
2184b588458SPeter Avalos freefa(fatab[nuse]);
2194b588458SPeter Avalos fatab[nuse] = pfa;
2204b588458SPeter Avalos pfa->use = now++;
2214b588458SPeter Avalos return pfa;
2224b588458SPeter Avalos }
2234b588458SPeter Avalos
mkdfa(const char * s,bool anchor)2241d48fce0SDaniel Fojt fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
2251d48fce0SDaniel Fojt /* anchor = true for anchored matches, else false */
2264b588458SPeter Avalos {
2274b588458SPeter Avalos Node *p, *p1;
2284b588458SPeter Avalos fa *f;
2294b588458SPeter Avalos
2301d48fce0SDaniel Fojt firstbasestr = (const uschar *) s;
2311d48fce0SDaniel Fojt basestr = firstbasestr;
2324b588458SPeter Avalos p = reparse(s);
2334b588458SPeter Avalos p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
2344b588458SPeter Avalos /* put ALL STAR in front of reg. exp. */
2354b588458SPeter Avalos p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
2364b588458SPeter Avalos /* put FINAL after reg. exp. */
2374b588458SPeter Avalos
2384b588458SPeter Avalos poscnt = 0;
2394b588458SPeter Avalos penter(p1); /* enter parent pointers and leaf indices */
24048f09a05SAntonio Huete Jimenez if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
2411d48fce0SDaniel Fojt overflo(__func__);
2424b588458SPeter Avalos f->accept = poscnt-1; /* penter has computed number of positions in re */
2434b588458SPeter Avalos cfoll(f, p1); /* set up follow sets */
2444b588458SPeter Avalos freetr(p1);
2451d48fce0SDaniel Fojt resize_state(f, 1);
2461d48fce0SDaniel Fojt f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
2471d48fce0SDaniel Fojt f->posns[1] = intalloc(1, __func__);
2484b588458SPeter Avalos *f->posns[1] = 0;
2494b588458SPeter Avalos f->initstat = makeinit(f, anchor);
2504b588458SPeter Avalos f->anchor = anchor;
2514b588458SPeter Avalos f->restr = (uschar *) tostring(s);
2521d48fce0SDaniel Fojt if (firstbasestr != basestr) {
2531d48fce0SDaniel Fojt if (basestr)
2541d48fce0SDaniel Fojt xfree(basestr);
2551d48fce0SDaniel Fojt }
2564b588458SPeter Avalos return f;
2574b588458SPeter Avalos }
2584b588458SPeter Avalos
makeinit(fa * f,bool anchor)2591d48fce0SDaniel Fojt int makeinit(fa *f, bool anchor)
2604b588458SPeter Avalos {
2614b588458SPeter Avalos int i, k;
2624b588458SPeter Avalos
2634b588458SPeter Avalos f->curstat = 2;
2644b588458SPeter Avalos f->out[2] = 0;
2654b588458SPeter Avalos k = *(f->re[0].lfollow);
2664b588458SPeter Avalos xfree(f->posns[2]);
2671d48fce0SDaniel Fojt f->posns[2] = intalloc(k + 1, __func__);
2684b588458SPeter Avalos for (i = 0; i <= k; i++) {
2694b588458SPeter Avalos (f->posns[2])[i] = (f->re[0].lfollow)[i];
2704b588458SPeter Avalos }
2714b588458SPeter Avalos if ((f->posns[2])[1] == f->accept)
2724b588458SPeter Avalos f->out[2] = 1;
273*ed569bc2SAaron LI clear_gototab(f, 2);
2744b588458SPeter Avalos f->curstat = cgoto(f, 2, HAT);
2754b588458SPeter Avalos if (anchor) {
2764b588458SPeter Avalos *f->posns[2] = k-1; /* leave out position 0 */
2774b588458SPeter Avalos for (i = 0; i < k; i++) {
2784b588458SPeter Avalos (f->posns[0])[i] = (f->posns[2])[i];
2794b588458SPeter Avalos }
2804b588458SPeter Avalos
2814b588458SPeter Avalos f->out[0] = f->out[2];
2824b588458SPeter Avalos if (f->curstat != 2)
2834b588458SPeter Avalos --(*f->posns[f->curstat]);
2844b588458SPeter Avalos }
2854b588458SPeter Avalos return f->curstat;
2864b588458SPeter Avalos }
2874b588458SPeter Avalos
penter(Node * p)2884b588458SPeter Avalos void penter(Node *p) /* set up parent pointers and leaf indices */
2894b588458SPeter Avalos {
2904b588458SPeter Avalos switch (type(p)) {
2914b588458SPeter Avalos ELEAF
2924b588458SPeter Avalos LEAF
2934b588458SPeter Avalos info(p) = poscnt;
2944b588458SPeter Avalos poscnt++;
2954b588458SPeter Avalos break;
2964b588458SPeter Avalos UNARY
2974b588458SPeter Avalos penter(left(p));
2984b588458SPeter Avalos parent(left(p)) = p;
2994b588458SPeter Avalos break;
3004b588458SPeter Avalos case CAT:
3014b588458SPeter Avalos case OR:
3024b588458SPeter Avalos penter(left(p));
3034b588458SPeter Avalos penter(right(p));
3044b588458SPeter Avalos parent(left(p)) = p;
3054b588458SPeter Avalos parent(right(p)) = p;
3064b588458SPeter Avalos break;
3071d48fce0SDaniel Fojt case ZERO:
3081d48fce0SDaniel Fojt break;
3094b588458SPeter Avalos default: /* can't happen */
3104b588458SPeter Avalos FATAL("can't happen: unknown type %d in penter", type(p));
3114b588458SPeter Avalos break;
3124b588458SPeter Avalos }
3134b588458SPeter Avalos }
3144b588458SPeter Avalos
freetr(Node * p)3154b588458SPeter Avalos void freetr(Node *p) /* free parse tree */
3164b588458SPeter Avalos {
3174b588458SPeter Avalos switch (type(p)) {
3184b588458SPeter Avalos ELEAF
3194b588458SPeter Avalos LEAF
3204b588458SPeter Avalos xfree(p);
3214b588458SPeter Avalos break;
3224b588458SPeter Avalos UNARY
3231d48fce0SDaniel Fojt case ZERO:
3244b588458SPeter Avalos freetr(left(p));
3254b588458SPeter Avalos xfree(p);
3264b588458SPeter Avalos break;
3274b588458SPeter Avalos case CAT:
3284b588458SPeter Avalos case OR:
3294b588458SPeter Avalos freetr(left(p));
3304b588458SPeter Avalos freetr(right(p));
3314b588458SPeter Avalos xfree(p);
3324b588458SPeter Avalos break;
3334b588458SPeter Avalos default: /* can't happen */
3344b588458SPeter Avalos FATAL("can't happen: unknown type %d in freetr", type(p));
3354b588458SPeter Avalos break;
3364b588458SPeter Avalos }
3374b588458SPeter Avalos }
3384b588458SPeter Avalos
3394b588458SPeter Avalos /* in the parsing of regular expressions, metacharacters like . have */
3404b588458SPeter Avalos /* to be seen literally; \056 is not a metacharacter. */
3414b588458SPeter Avalos
hexstr(const uschar ** pp,int max)342*ed569bc2SAaron LI int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
3434b588458SPeter Avalos { /* only pick up one 8-bit byte (2 chars) */
3441d48fce0SDaniel Fojt const uschar *p;
3454b588458SPeter Avalos int n = 0;
3464b588458SPeter Avalos int i;
3474b588458SPeter Avalos
348*ed569bc2SAaron LI for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349*ed569bc2SAaron LI if (isdigit((int) *p))
3504b588458SPeter Avalos n = 16 * n + *p - '0';
3514b588458SPeter Avalos else if (*p >= 'a' && *p <= 'f')
3524b588458SPeter Avalos n = 16 * n + *p - 'a' + 10;
3534b588458SPeter Avalos else if (*p >= 'A' && *p <= 'F')
3544b588458SPeter Avalos n = 16 * n + *p - 'A' + 10;
3554b588458SPeter Avalos }
3561d48fce0SDaniel Fojt *pp = p;
3574b588458SPeter Avalos return n;
3584b588458SPeter Avalos }
3594b588458SPeter Avalos
360*ed569bc2SAaron LI
361*ed569bc2SAaron LI
3624b588458SPeter Avalos #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
3634b588458SPeter Avalos
quoted(const uschar ** pp)3641d48fce0SDaniel Fojt int quoted(const uschar **pp) /* pick up next thing after a \\ */
3654b588458SPeter Avalos /* and increment *pp */
3664b588458SPeter Avalos {
3671d48fce0SDaniel Fojt const uschar *p = *pp;
3684b588458SPeter Avalos int c;
3694b588458SPeter Avalos
370*ed569bc2SAaron LI /* BUG: should advance by utf-8 char even if makes no sense */
371*ed569bc2SAaron LI
372*ed569bc2SAaron LI if ((c = *p++) == 't') {
3734b588458SPeter Avalos c = '\t';
374*ed569bc2SAaron LI } else if (c == 'n') {
3754b588458SPeter Avalos c = '\n';
376*ed569bc2SAaron LI } else if (c == 'f') {
3774b588458SPeter Avalos c = '\f';
378*ed569bc2SAaron LI } else if (c == 'r') {
3794b588458SPeter Avalos c = '\r';
380*ed569bc2SAaron LI } else if (c == 'b') {
3814b588458SPeter Avalos c = '\b';
382*ed569bc2SAaron LI } else if (c == 'v') {
3831d48fce0SDaniel Fojt c = '\v';
384*ed569bc2SAaron LI } else if (c == 'a') {
3851d48fce0SDaniel Fojt c = '\a';
386*ed569bc2SAaron LI } else if (c == '\\') {
3874b588458SPeter Avalos c = '\\';
388*ed569bc2SAaron LI } else if (c == 'x') { /* 2 hex digits follow */
389*ed569bc2SAaron LI c = hexstr(&p, 2); /* this adds a null if number is invalid */
390*ed569bc2SAaron LI } else if (c == 'u') { /* unicode char number up to 8 hex digits */
391*ed569bc2SAaron LI c = hexstr(&p, 8);
3924b588458SPeter Avalos } else if (isoctdigit(c)) { /* \d \dd \ddd */
3934b588458SPeter Avalos int n = c - '0';
3944b588458SPeter Avalos if (isoctdigit(*p)) {
3954b588458SPeter Avalos n = 8 * n + *p++ - '0';
3964b588458SPeter Avalos if (isoctdigit(*p))
3974b588458SPeter Avalos n = 8 * n + *p++ - '0';
3984b588458SPeter Avalos }
3994b588458SPeter Avalos c = n;
4004b588458SPeter Avalos } /* else */
4014b588458SPeter Avalos /* c = c; */
4024b588458SPeter Avalos *pp = p;
4034b588458SPeter Avalos return c;
4044b588458SPeter Avalos }
4054b588458SPeter Avalos
cclenter(const char * argp)406*ed569bc2SAaron LI int *cclenter(const char *argp) /* add a character class */
4074b588458SPeter Avalos {
4084b588458SPeter Avalos int i, c, c2;
409*ed569bc2SAaron LI int n;
410*ed569bc2SAaron LI const uschar *p = (const uschar *) argp;
411*ed569bc2SAaron LI int *bp, *retp;
412*ed569bc2SAaron LI static int *buf = NULL;
4134b588458SPeter Avalos static int bufsz = 100;
4144b588458SPeter Avalos
415*ed569bc2SAaron LI if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
4164b588458SPeter Avalos FATAL("out of space for character class [%.10s...] 1", p);
4174b588458SPeter Avalos bp = buf;
418*ed569bc2SAaron LI for (i = 0; *p != 0; ) {
419*ed569bc2SAaron LI n = u8_rune(&c, (const char *) p);
420*ed569bc2SAaron LI p += n;
4214b588458SPeter Avalos if (c == '\\') {
422b12bae18SSascha Wildner c = quoted(&p);
4234b588458SPeter Avalos } else if (c == '-' && i > 0 && bp[-1] != 0) {
4244b588458SPeter Avalos if (*p != 0) {
4254b588458SPeter Avalos c = bp[-1];
426*ed569bc2SAaron LI /* c2 = *p++; */
427*ed569bc2SAaron LI n = u8_rune(&c2, (const char *) p);
428*ed569bc2SAaron LI p += n;
4294b588458SPeter Avalos if (c2 == '\\')
430*ed569bc2SAaron LI c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
4314b588458SPeter Avalos if (c > c2) { /* empty; ignore */
4324b588458SPeter Avalos bp--;
4334b588458SPeter Avalos i--;
4344b588458SPeter Avalos continue;
4354b588458SPeter Avalos }
4364b588458SPeter Avalos while (c < c2) {
437*ed569bc2SAaron LI if (i >= bufsz) {
438*ed569bc2SAaron LI bufsz *= 2;
439*ed569bc2SAaron LI buf = (int *) realloc(buf, bufsz * sizeof(int));
440*ed569bc2SAaron LI if (buf == NULL)
4414b588458SPeter Avalos FATAL("out of space for character class [%.10s...] 2", p);
442*ed569bc2SAaron LI bp = buf + i;
443*ed569bc2SAaron LI }
4444b588458SPeter Avalos *bp++ = ++c;
4454b588458SPeter Avalos i++;
4464b588458SPeter Avalos }
4474b588458SPeter Avalos continue;
4484b588458SPeter Avalos }
4494b588458SPeter Avalos }
450*ed569bc2SAaron LI if (i >= bufsz) {
451*ed569bc2SAaron LI bufsz *= 2;
452*ed569bc2SAaron LI buf = (int *) realloc(buf, bufsz * sizeof(int));
453*ed569bc2SAaron LI if (buf == NULL)
454*ed569bc2SAaron LI FATAL("out of space for character class [%.10s...] 2", p);
455*ed569bc2SAaron LI bp = buf + i;
456*ed569bc2SAaron LI }
4574b588458SPeter Avalos *bp++ = c;
4584b588458SPeter Avalos i++;
4594b588458SPeter Avalos }
4604b588458SPeter Avalos *bp = 0;
461*ed569bc2SAaron LI /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
462*ed569bc2SAaron LI /* xfree(op); BUG: what are we freeing here? */
463*ed569bc2SAaron LI retp = (int *) calloc(bp-buf+1, sizeof(int));
464*ed569bc2SAaron LI for (i = 0; i < bp-buf+1; i++)
465*ed569bc2SAaron LI retp[i] = buf[i];
466*ed569bc2SAaron LI return retp;
4674b588458SPeter Avalos }
4684b588458SPeter Avalos
overflo(const char * s)4694b588458SPeter Avalos void overflo(const char *s)
4704b588458SPeter Avalos {
4711d48fce0SDaniel Fojt FATAL("regular expression too big: out of space in %.30s...", s);
4724b588458SPeter Avalos }
4734b588458SPeter Avalos
cfoll(fa * f,Node * v)4744b588458SPeter Avalos void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
4754b588458SPeter Avalos {
4764b588458SPeter Avalos int i;
4774b588458SPeter Avalos int *p;
4784b588458SPeter Avalos
4794b588458SPeter Avalos switch (type(v)) {
4804b588458SPeter Avalos ELEAF
4814b588458SPeter Avalos LEAF
4824b588458SPeter Avalos f->re[info(v)].ltype = type(v);
4834b588458SPeter Avalos f->re[info(v)].lval.np = right(v);
4844b588458SPeter Avalos while (f->accept >= maxsetvec) { /* guessing here! */
4851d48fce0SDaniel Fojt resizesetvec(__func__);
4864b588458SPeter Avalos }
4874b588458SPeter Avalos for (i = 0; i <= f->accept; i++)
4884b588458SPeter Avalos setvec[i] = 0;
4894b588458SPeter Avalos setcnt = 0;
4904b588458SPeter Avalos follow(v); /* computes setvec and setcnt */
4911d48fce0SDaniel Fojt p = intalloc(setcnt + 1, __func__);
4924b588458SPeter Avalos f->re[info(v)].lfollow = p;
4934b588458SPeter Avalos *p = setcnt;
4944b588458SPeter Avalos for (i = f->accept; i >= 0; i--)
4954b588458SPeter Avalos if (setvec[i] == 1)
4964b588458SPeter Avalos *++p = i;
4974b588458SPeter Avalos break;
4984b588458SPeter Avalos UNARY
4994b588458SPeter Avalos cfoll(f,left(v));
5004b588458SPeter Avalos break;
5014b588458SPeter Avalos case CAT:
5024b588458SPeter Avalos case OR:
5034b588458SPeter Avalos cfoll(f,left(v));
5044b588458SPeter Avalos cfoll(f,right(v));
5054b588458SPeter Avalos break;
5061d48fce0SDaniel Fojt case ZERO:
5071d48fce0SDaniel Fojt break;
5084b588458SPeter Avalos default: /* can't happen */
5094b588458SPeter Avalos FATAL("can't happen: unknown type %d in cfoll", type(v));
5104b588458SPeter Avalos }
5114b588458SPeter Avalos }
5124b588458SPeter Avalos
first(Node * p)5134b588458SPeter Avalos int first(Node *p) /* collects initially active leaves of p into setvec */
5144b588458SPeter Avalos /* returns 0 if p matches empty string */
5154b588458SPeter Avalos {
5164b588458SPeter Avalos int b, lp;
5174b588458SPeter Avalos
5184b588458SPeter Avalos switch (type(p)) {
5194b588458SPeter Avalos ELEAF
5204b588458SPeter Avalos LEAF
5214b588458SPeter Avalos lp = info(p); /* look for high-water mark of subscripts */
5224b588458SPeter Avalos while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
5231d48fce0SDaniel Fojt resizesetvec(__func__);
5244b588458SPeter Avalos }
5254b588458SPeter Avalos if (type(p) == EMPTYRE) {
5264b588458SPeter Avalos setvec[lp] = 0;
5274b588458SPeter Avalos return(0);
5284b588458SPeter Avalos }
5294b588458SPeter Avalos if (setvec[lp] != 1) {
5304b588458SPeter Avalos setvec[lp] = 1;
5314b588458SPeter Avalos setcnt++;
5324b588458SPeter Avalos }
533*ed569bc2SAaron LI if (type(p) == CCL && (*(int *) right(p)) == 0)
5344b588458SPeter Avalos return(0); /* empty CCL */
5351d48fce0SDaniel Fojt return(1);
5364b588458SPeter Avalos case PLUS:
5371d48fce0SDaniel Fojt if (first(left(p)) == 0)
5381d48fce0SDaniel Fojt return(0);
5394b588458SPeter Avalos return(1);
5404b588458SPeter Avalos case STAR:
5414b588458SPeter Avalos case QUEST:
5424b588458SPeter Avalos first(left(p));
5434b588458SPeter Avalos return(0);
5444b588458SPeter Avalos case CAT:
5454b588458SPeter Avalos if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
5464b588458SPeter Avalos return(1);
5474b588458SPeter Avalos case OR:
5484b588458SPeter Avalos b = first(right(p));
5494b588458SPeter Avalos if (first(left(p)) == 0 || b == 0) return(0);
5504b588458SPeter Avalos return(1);
5511d48fce0SDaniel Fojt case ZERO:
5521d48fce0SDaniel Fojt return 0;
5534b588458SPeter Avalos }
5544b588458SPeter Avalos FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
5554b588458SPeter Avalos return(-1);
5564b588458SPeter Avalos }
5574b588458SPeter Avalos
follow(Node * v)5584b588458SPeter Avalos void follow(Node *v) /* collects leaves that can follow v into setvec */
5594b588458SPeter Avalos {
5604b588458SPeter Avalos Node *p;
5614b588458SPeter Avalos
5624b588458SPeter Avalos if (type(v) == FINAL)
5634b588458SPeter Avalos return;
5644b588458SPeter Avalos p = parent(v);
5654b588458SPeter Avalos switch (type(p)) {
5664b588458SPeter Avalos case STAR:
5674b588458SPeter Avalos case PLUS:
5684b588458SPeter Avalos first(v);
5694b588458SPeter Avalos follow(p);
5704b588458SPeter Avalos return;
5714b588458SPeter Avalos
5724b588458SPeter Avalos case OR:
5734b588458SPeter Avalos case QUEST:
5744b588458SPeter Avalos follow(p);
5754b588458SPeter Avalos return;
5764b588458SPeter Avalos
5774b588458SPeter Avalos case CAT:
5784b588458SPeter Avalos if (v == left(p)) { /* v is left child of p */
5794b588458SPeter Avalos if (first(right(p)) == 0) {
5804b588458SPeter Avalos follow(p);
5814b588458SPeter Avalos return;
5824b588458SPeter Avalos }
5834b588458SPeter Avalos } else /* v is right child */
5844b588458SPeter Avalos follow(p);
5854b588458SPeter Avalos return;
5864b588458SPeter Avalos }
5874b588458SPeter Avalos }
5884b588458SPeter Avalos
member(int c,int * sarg)589*ed569bc2SAaron LI int member(int c, int *sarg) /* is c in s? */
5904b588458SPeter Avalos {
591*ed569bc2SAaron LI int *s = (int *) sarg;
5924b588458SPeter Avalos
5934b588458SPeter Avalos while (*s)
5944b588458SPeter Avalos if (c == *s++)
5954b588458SPeter Avalos return(1);
5964b588458SPeter Avalos return(0);
5974b588458SPeter Avalos }
5984b588458SPeter Avalos
resize_gototab(fa * f,int state)599*ed569bc2SAaron LI static void resize_gototab(fa *f, int state)
600*ed569bc2SAaron LI {
601*ed569bc2SAaron LI size_t new_size = f->gototab[state].allocated * 2;
602*ed569bc2SAaron LI gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
603*ed569bc2SAaron LI if (p == NULL)
604*ed569bc2SAaron LI overflo(__func__);
605*ed569bc2SAaron LI
606*ed569bc2SAaron LI // need to initialized the new memory to zero
607*ed569bc2SAaron LI size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size
608*ed569bc2SAaron LI memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out
609*ed569bc2SAaron LI
610*ed569bc2SAaron LI f->gototab[state].allocated = new_size; // update gototab info
611*ed569bc2SAaron LI f->gototab[state].entries = p;
612*ed569bc2SAaron LI }
613*ed569bc2SAaron LI
get_gototab(fa * f,int state,int ch)614*ed569bc2SAaron LI static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
615*ed569bc2SAaron LI {
616*ed569bc2SAaron LI gtte key;
617*ed569bc2SAaron LI gtte *item;
618*ed569bc2SAaron LI
619*ed569bc2SAaron LI key.ch = ch;
620*ed569bc2SAaron LI key.state = 0; /* irrelevant */
621*ed569bc2SAaron LI item = (gtte *) bsearch(& key, f->gototab[state].entries,
622*ed569bc2SAaron LI f->gototab[state].inuse, sizeof(gtte),
623*ed569bc2SAaron LI entry_cmp);
624*ed569bc2SAaron LI
625*ed569bc2SAaron LI if (item == NULL)
626*ed569bc2SAaron LI return 0;
627*ed569bc2SAaron LI else
628*ed569bc2SAaron LI return item->state;
629*ed569bc2SAaron LI }
630*ed569bc2SAaron LI
entry_cmp(const void * l,const void * r)631*ed569bc2SAaron LI static int entry_cmp(const void *l, const void *r)
632*ed569bc2SAaron LI {
633*ed569bc2SAaron LI const gtte *left, *right;
634*ed569bc2SAaron LI
635*ed569bc2SAaron LI left = (const gtte *) l;
636*ed569bc2SAaron LI right = (const gtte *) r;
637*ed569bc2SAaron LI
638*ed569bc2SAaron LI return left->ch - right->ch;
639*ed569bc2SAaron LI }
640*ed569bc2SAaron LI
set_gototab(fa * f,int state,int ch,int val)641*ed569bc2SAaron LI static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
642*ed569bc2SAaron LI {
643*ed569bc2SAaron LI if (f->gototab[state].inuse == 0) {
644*ed569bc2SAaron LI f->gototab[state].entries[0].ch = ch;
645*ed569bc2SAaron LI f->gototab[state].entries[0].state = val;
646*ed569bc2SAaron LI f->gototab[state].inuse++;
647*ed569bc2SAaron LI return val;
648*ed569bc2SAaron LI } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
649*ed569bc2SAaron LI // not seen yet, insert and return
650*ed569bc2SAaron LI gtt *tab = & f->gototab[state];
651*ed569bc2SAaron LI if (tab->inuse + 1 >= tab->allocated)
652*ed569bc2SAaron LI resize_gototab(f, state);
653*ed569bc2SAaron LI
654*ed569bc2SAaron LI f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch;
655*ed569bc2SAaron LI f->gototab[state].entries[f->gototab[state].inuse-1].state = val;
656*ed569bc2SAaron LI f->gototab[state].inuse++;
657*ed569bc2SAaron LI return val;
658*ed569bc2SAaron LI } else {
659*ed569bc2SAaron LI // maybe we have it, maybe we don't
660*ed569bc2SAaron LI gtte key;
661*ed569bc2SAaron LI gtte *item;
662*ed569bc2SAaron LI
663*ed569bc2SAaron LI key.ch = ch;
664*ed569bc2SAaron LI key.state = 0; /* irrelevant */
665*ed569bc2SAaron LI item = (gtte *) bsearch(& key, f->gototab[state].entries,
666*ed569bc2SAaron LI f->gototab[state].inuse, sizeof(gtte),
667*ed569bc2SAaron LI entry_cmp);
668*ed569bc2SAaron LI
669*ed569bc2SAaron LI if (item != NULL) {
670*ed569bc2SAaron LI // we have it, update state and return
671*ed569bc2SAaron LI item->state = val;
672*ed569bc2SAaron LI return item->state;
673*ed569bc2SAaron LI }
674*ed569bc2SAaron LI // otherwise, fall through to insert and reallocate.
675*ed569bc2SAaron LI }
676*ed569bc2SAaron LI
677*ed569bc2SAaron LI gtt *tab = & f->gototab[state];
678*ed569bc2SAaron LI if (tab->inuse + 1 >= tab->allocated)
679*ed569bc2SAaron LI resize_gototab(f, state);
680*ed569bc2SAaron LI ++tab->inuse;
681*ed569bc2SAaron LI f->gototab[state].entries[tab->inuse].ch = ch;
682*ed569bc2SAaron LI f->gototab[state].entries[tab->inuse].state = val;
683*ed569bc2SAaron LI
684*ed569bc2SAaron LI qsort(f->gototab[state].entries,
685*ed569bc2SAaron LI f->gototab[state].inuse, sizeof(gtte), entry_cmp);
686*ed569bc2SAaron LI
687*ed569bc2SAaron LI return val; /* not used anywhere at the moment */
688*ed569bc2SAaron LI }
689*ed569bc2SAaron LI
clear_gototab(fa * f,int state)690*ed569bc2SAaron LI static void clear_gototab(fa *f, int state)
691*ed569bc2SAaron LI {
692*ed569bc2SAaron LI memset(f->gototab[state].entries, 0,
693*ed569bc2SAaron LI f->gototab[state].allocated * sizeof(gtte));
694*ed569bc2SAaron LI f->gototab[state].inuse = 0;
695*ed569bc2SAaron LI }
696*ed569bc2SAaron LI
match(fa * f,const char * p0)6974b588458SPeter Avalos int match(fa *f, const char *p0) /* shortest match ? */
6984b588458SPeter Avalos {
6994b588458SPeter Avalos int s, ns;
700*ed569bc2SAaron LI int n;
701*ed569bc2SAaron LI int rune;
7021d48fce0SDaniel Fojt const uschar *p = (const uschar *) p0;
7034b588458SPeter Avalos
704*ed569bc2SAaron LI /* return pmatch(f, p0); does it matter whether longest or shortest? */
705*ed569bc2SAaron LI
7061d48fce0SDaniel Fojt s = f->initstat;
7071d48fce0SDaniel Fojt assert (s < f->state_count);
7081d48fce0SDaniel Fojt
7094b588458SPeter Avalos if (f->out[s])
7104b588458SPeter Avalos return(1);
7114b588458SPeter Avalos do {
7124b588458SPeter Avalos /* assert(*p < NCHARS); */
713*ed569bc2SAaron LI n = u8_rune(&rune, (const char *) p);
714*ed569bc2SAaron LI if ((ns = get_gototab(f, s, rune)) != 0)
7154b588458SPeter Avalos s = ns;
7164b588458SPeter Avalos else
717*ed569bc2SAaron LI s = cgoto(f, s, rune);
7184b588458SPeter Avalos if (f->out[s])
7194b588458SPeter Avalos return(1);
720*ed569bc2SAaron LI if (*p == 0)
721*ed569bc2SAaron LI break;
722*ed569bc2SAaron LI p += n;
723*ed569bc2SAaron LI } while (1); /* was *p++ != 0 */
7244b588458SPeter Avalos return(0);
7254b588458SPeter Avalos }
7264b588458SPeter Avalos
pmatch(fa * f,const char * p0)7274b588458SPeter Avalos int pmatch(fa *f, const char *p0) /* longest match, for sub */
7284b588458SPeter Avalos {
7294b588458SPeter Avalos int s, ns;
730*ed569bc2SAaron LI int n;
731*ed569bc2SAaron LI int rune;
7321d48fce0SDaniel Fojt const uschar *p = (const uschar *) p0;
7331d48fce0SDaniel Fojt const uschar *q;
7344b588458SPeter Avalos
7354b588458SPeter Avalos s = f->initstat;
7361d48fce0SDaniel Fojt assert(s < f->state_count);
7371d48fce0SDaniel Fojt
7381d48fce0SDaniel Fojt patbeg = (const char *)p;
7394b588458SPeter Avalos patlen = -1;
7404b588458SPeter Avalos do {
7414b588458SPeter Avalos q = p;
7424b588458SPeter Avalos do {
7434b588458SPeter Avalos if (f->out[s]) /* final state */
7444b588458SPeter Avalos patlen = q-p;
7454b588458SPeter Avalos /* assert(*q < NCHARS); */
746*ed569bc2SAaron LI n = u8_rune(&rune, (const char *) q);
747*ed569bc2SAaron LI if ((ns = get_gototab(f, s, rune)) != 0)
7484b588458SPeter Avalos s = ns;
7494b588458SPeter Avalos else
750*ed569bc2SAaron LI s = cgoto(f, s, rune);
7511d48fce0SDaniel Fojt
7521d48fce0SDaniel Fojt assert(s < f->state_count);
7531d48fce0SDaniel Fojt
7544b588458SPeter Avalos if (s == 1) { /* no transition */
7554b588458SPeter Avalos if (patlen >= 0) {
7561d48fce0SDaniel Fojt patbeg = (const char *) p;
7574b588458SPeter Avalos return(1);
7584b588458SPeter Avalos }
7594b588458SPeter Avalos else
7604b588458SPeter Avalos goto nextin; /* no match */
7614b588458SPeter Avalos }
762*ed569bc2SAaron LI if (*q == 0)
763*ed569bc2SAaron LI break;
764*ed569bc2SAaron LI q += n;
765*ed569bc2SAaron LI } while (1);
766*ed569bc2SAaron LI q++; /* was *q++ */
7674b588458SPeter Avalos if (f->out[s])
7684b588458SPeter Avalos patlen = q-p-1; /* don't count $ */
7694b588458SPeter Avalos if (patlen >= 0) {
7701d48fce0SDaniel Fojt patbeg = (const char *) p;
7714b588458SPeter Avalos return(1);
7724b588458SPeter Avalos }
7734b588458SPeter Avalos nextin:
7744b588458SPeter Avalos s = 2;
775*ed569bc2SAaron LI if (*p == 0)
776*ed569bc2SAaron LI break;
777*ed569bc2SAaron LI n = u8_rune(&rune, (const char *) p);
778*ed569bc2SAaron LI p += n;
779*ed569bc2SAaron LI } while (1); /* was *p++ */
7804b588458SPeter Avalos return (0);
7814b588458SPeter Avalos }
7824b588458SPeter Avalos
nematch(fa * f,const char * p0)7834b588458SPeter Avalos int nematch(fa *f, const char *p0) /* non-empty match, for sub */
7844b588458SPeter Avalos {
7854b588458SPeter Avalos int s, ns;
786*ed569bc2SAaron LI int n;
787*ed569bc2SAaron LI int rune;
7881d48fce0SDaniel Fojt const uschar *p = (const uschar *) p0;
7891d48fce0SDaniel Fojt const uschar *q;
7904b588458SPeter Avalos
7914b588458SPeter Avalos s = f->initstat;
7921d48fce0SDaniel Fojt assert(s < f->state_count);
7931d48fce0SDaniel Fojt
7941d48fce0SDaniel Fojt patbeg = (const char *)p;
7954b588458SPeter Avalos patlen = -1;
7964b588458SPeter Avalos while (*p) {
7974b588458SPeter Avalos q = p;
7984b588458SPeter Avalos do {
7994b588458SPeter Avalos if (f->out[s]) /* final state */
8004b588458SPeter Avalos patlen = q-p;
8014b588458SPeter Avalos /* assert(*q < NCHARS); */
802*ed569bc2SAaron LI n = u8_rune(&rune, (const char *) q);
803*ed569bc2SAaron LI if ((ns = get_gototab(f, s, rune)) != 0)
8044b588458SPeter Avalos s = ns;
8054b588458SPeter Avalos else
806*ed569bc2SAaron LI s = cgoto(f, s, rune);
8074b588458SPeter Avalos if (s == 1) { /* no transition */
8084b588458SPeter Avalos if (patlen > 0) {
8091d48fce0SDaniel Fojt patbeg = (const char *) p;
8104b588458SPeter Avalos return(1);
8114b588458SPeter Avalos } else
8124b588458SPeter Avalos goto nnextin; /* no nonempty match */
8134b588458SPeter Avalos }
814*ed569bc2SAaron LI if (*q == 0)
815*ed569bc2SAaron LI break;
816*ed569bc2SAaron LI q += n;
817*ed569bc2SAaron LI } while (1);
818*ed569bc2SAaron LI q++;
8194b588458SPeter Avalos if (f->out[s])
8204b588458SPeter Avalos patlen = q-p-1; /* don't count $ */
8214b588458SPeter Avalos if (patlen > 0 ) {
8221d48fce0SDaniel Fojt patbeg = (const char *) p;
8234b588458SPeter Avalos return(1);
8244b588458SPeter Avalos }
8254b588458SPeter Avalos nnextin:
8264b588458SPeter Avalos s = 2;
8274b588458SPeter Avalos p++;
8284b588458SPeter Avalos }
8294b588458SPeter Avalos return (0);
8304b588458SPeter Avalos }
8314b588458SPeter Avalos
8321d48fce0SDaniel Fojt
833*ed569bc2SAaron LI #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
834*ed569bc2SAaron LI
8351d48fce0SDaniel Fojt /*
8361d48fce0SDaniel Fojt * NAME
8371d48fce0SDaniel Fojt * fnematch
8381d48fce0SDaniel Fojt *
8391d48fce0SDaniel Fojt * DESCRIPTION
8401d48fce0SDaniel Fojt * A stream-fed version of nematch which transfers characters to a
8411d48fce0SDaniel Fojt * null-terminated buffer. All characters up to and including the last
8421d48fce0SDaniel Fojt * character of the matching text or EOF are placed in the buffer. If
8431d48fce0SDaniel Fojt * a match is found, patbeg and patlen are set appropriately.
8441d48fce0SDaniel Fojt *
8451d48fce0SDaniel Fojt * RETURN VALUES
8461d48fce0SDaniel Fojt * false No match found.
8471d48fce0SDaniel Fojt * true Match found.
8481d48fce0SDaniel Fojt */
8491d48fce0SDaniel Fojt
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)8501d48fce0SDaniel Fojt bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
8511d48fce0SDaniel Fojt {
852*ed569bc2SAaron LI char *i, *j, *k, *buf = *pbuf;
8531d48fce0SDaniel Fojt int bufsize = *pbufsize;
854*ed569bc2SAaron LI int c, n, ns, s;
8551d48fce0SDaniel Fojt
8561d48fce0SDaniel Fojt s = pfa->initstat;
8571d48fce0SDaniel Fojt patlen = 0;
8581d48fce0SDaniel Fojt
8591d48fce0SDaniel Fojt /*
860*ed569bc2SAaron LI * buf <= i <= j <= k <= buf+bufsize
8611d48fce0SDaniel Fojt *
8621d48fce0SDaniel Fojt * i: origin of active substring
8631d48fce0SDaniel Fojt * j: current character
864*ed569bc2SAaron LI * k: destination of the next getc
8651d48fce0SDaniel Fojt */
8661d48fce0SDaniel Fojt
867*ed569bc2SAaron LI i = j = k = buf;
868*ed569bc2SAaron LI
869*ed569bc2SAaron LI do {
870*ed569bc2SAaron LI /*
871*ed569bc2SAaron LI * Call u8_rune with at least MAX_UTF_BYTES ahead in
872*ed569bc2SAaron LI * the buffer until EOF interferes.
873*ed569bc2SAaron LI */
874*ed569bc2SAaron LI if (k - j < MAX_UTF_BYTES) {
875*ed569bc2SAaron LI if (k + MAX_UTF_BYTES > buf + bufsize) {
876*ed569bc2SAaron LI adjbuf((char **) &buf, &bufsize,
877*ed569bc2SAaron LI bufsize + MAX_UTF_BYTES,
878*ed569bc2SAaron LI quantum, 0, "fnematch");
879*ed569bc2SAaron LI }
880*ed569bc2SAaron LI for (n = MAX_UTF_BYTES ; n > 0; n--) {
881*ed569bc2SAaron LI *k++ = (c = getc(f)) != EOF ? c : 0;
882*ed569bc2SAaron LI if (c == EOF) {
883*ed569bc2SAaron LI if (ferror(f))
884*ed569bc2SAaron LI FATAL("fnematch: getc error");
885*ed569bc2SAaron LI break;
886*ed569bc2SAaron LI }
887*ed569bc2SAaron LI }
888*ed569bc2SAaron LI }
889*ed569bc2SAaron LI
890*ed569bc2SAaron LI j += u8_rune(&c, j);
891*ed569bc2SAaron LI
892*ed569bc2SAaron LI if ((ns = get_gototab(pfa, s, c)) != 0)
8931d48fce0SDaniel Fojt s = ns;
8941d48fce0SDaniel Fojt else
8951d48fce0SDaniel Fojt s = cgoto(pfa, s, c);
8961d48fce0SDaniel Fojt
8971d48fce0SDaniel Fojt if (pfa->out[s]) { /* final state */
898*ed569bc2SAaron LI patbeg = i;
899*ed569bc2SAaron LI patlen = j - i;
9001d48fce0SDaniel Fojt if (c == 0) /* don't count $ */
9011d48fce0SDaniel Fojt patlen--;
9021d48fce0SDaniel Fojt }
903*ed569bc2SAaron LI
904*ed569bc2SAaron LI if (c && s != 1)
905*ed569bc2SAaron LI continue; /* origin i still viable, next j */
906*ed569bc2SAaron LI if (patlen)
907*ed569bc2SAaron LI break; /* best match found */
908*ed569bc2SAaron LI
909*ed569bc2SAaron LI /* no match at origin i, next i and start over */
910*ed569bc2SAaron LI i += u8_rune(&c, i);
911*ed569bc2SAaron LI if (c == 0)
912*ed569bc2SAaron LI break; /* no match */
913*ed569bc2SAaron LI j = i;
9141d48fce0SDaniel Fojt s = 2;
915*ed569bc2SAaron LI } while (1);
9161d48fce0SDaniel Fojt
9171d48fce0SDaniel Fojt /* adjbuf() may have relocated a resized buffer. Inform the world. */
9181d48fce0SDaniel Fojt *pbuf = buf;
9191d48fce0SDaniel Fojt *pbufsize = bufsize;
9201d48fce0SDaniel Fojt
9211d48fce0SDaniel Fojt if (patlen) {
9221d48fce0SDaniel Fojt /*
9231d48fce0SDaniel Fojt * Under no circumstances is the last character fed to
9241d48fce0SDaniel Fojt * the automaton part of the match. It is EOF's nullbyte,
9251d48fce0SDaniel Fojt * or it sent the automaton into a state with no further
9261d48fce0SDaniel Fojt * transitions available (s==1), or both. Room for a
9271d48fce0SDaniel Fojt * terminating nullbyte is guaranteed.
9281d48fce0SDaniel Fojt *
9291d48fce0SDaniel Fojt * ungetc any chars after the end of matching text
9301d48fce0SDaniel Fojt * (except for EOF's nullbyte, if present) and null
9311d48fce0SDaniel Fojt * terminate the buffer.
9321d48fce0SDaniel Fojt */
9331d48fce0SDaniel Fojt do
934*ed569bc2SAaron LI if (*--k && ungetc(*k, f) == EOF)
935*ed569bc2SAaron LI FATAL("unable to ungetc '%c'", *k);
936*ed569bc2SAaron LI while (k > patbeg + patlen);
937*ed569bc2SAaron LI *k = '\0';
9381d48fce0SDaniel Fojt return true;
9391d48fce0SDaniel Fojt }
9401d48fce0SDaniel Fojt else
9411d48fce0SDaniel Fojt return false;
9421d48fce0SDaniel Fojt }
9431d48fce0SDaniel Fojt
reparse(const char * p)9444b588458SPeter Avalos Node *reparse(const char *p) /* parses regular expression pointed to by p */
9454b588458SPeter Avalos { /* uses relex() to scan regular expression */
9464b588458SPeter Avalos Node *np;
9474b588458SPeter Avalos
948e5e686a0SDaniel Fojt DPRINTF("reparse <%s>\n", p);
9491d48fce0SDaniel Fojt lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
9504b588458SPeter Avalos rtok = relex();
9514b588458SPeter Avalos /* GNU compatibility: an empty regexp matches anything */
9524b588458SPeter Avalos if (rtok == '\0') {
9534b588458SPeter Avalos /* FATAL("empty regular expression"); previous */
9544b588458SPeter Avalos return(op2(EMPTYRE, NIL, NIL));
9554b588458SPeter Avalos }
9564b588458SPeter Avalos np = regexp();
9574b588458SPeter Avalos if (rtok != '\0')
9584b588458SPeter Avalos FATAL("syntax error in regular expression %s at %s", lastre, prestr);
9594b588458SPeter Avalos return(np);
9604b588458SPeter Avalos }
9614b588458SPeter Avalos
regexp(void)9624b588458SPeter Avalos Node *regexp(void) /* top-level parse of reg expr */
9634b588458SPeter Avalos {
9644b588458SPeter Avalos return (alt(concat(primary())));
9654b588458SPeter Avalos }
9664b588458SPeter Avalos
primary(void)9674b588458SPeter Avalos Node *primary(void)
9684b588458SPeter Avalos {
9694b588458SPeter Avalos Node *np;
9701d48fce0SDaniel Fojt int savelastatom;
9714b588458SPeter Avalos
9724b588458SPeter Avalos switch (rtok) {
9734b588458SPeter Avalos case CHAR:
9741d48fce0SDaniel Fojt lastatom = starttok;
9754b588458SPeter Avalos np = op2(CHAR, NIL, itonp(rlxval));
9764b588458SPeter Avalos rtok = relex();
9774b588458SPeter Avalos return (unary(np));
9784b588458SPeter Avalos case ALL:
9794b588458SPeter Avalos rtok = relex();
9804b588458SPeter Avalos return (unary(op2(ALL, NIL, NIL)));
9814b588458SPeter Avalos case EMPTYRE:
9824b588458SPeter Avalos rtok = relex();
9831d48fce0SDaniel Fojt return (unary(op2(EMPTYRE, NIL, NIL)));
9844b588458SPeter Avalos case DOT:
9851d48fce0SDaniel Fojt lastatom = starttok;
9864b588458SPeter Avalos rtok = relex();
9874b588458SPeter Avalos return (unary(op2(DOT, NIL, NIL)));
9884b588458SPeter Avalos case CCL:
9891d48fce0SDaniel Fojt np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
9901d48fce0SDaniel Fojt lastatom = starttok;
9914b588458SPeter Avalos rtok = relex();
9924b588458SPeter Avalos return (unary(np));
9934b588458SPeter Avalos case NCCL:
9941d48fce0SDaniel Fojt np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
9951d48fce0SDaniel Fojt lastatom = starttok;
9964b588458SPeter Avalos rtok = relex();
9974b588458SPeter Avalos return (unary(np));
9984b588458SPeter Avalos case '^':
9994b588458SPeter Avalos rtok = relex();
10004b588458SPeter Avalos return (unary(op2(CHAR, NIL, itonp(HAT))));
10014b588458SPeter Avalos case '$':
10024b588458SPeter Avalos rtok = relex();
10034b588458SPeter Avalos return (unary(op2(CHAR, NIL, NIL)));
10044b588458SPeter Avalos case '(':
10051d48fce0SDaniel Fojt lastatom = starttok;
10061d48fce0SDaniel Fojt savelastatom = starttok - basestr; /* Retain over recursion */
10074b588458SPeter Avalos rtok = relex();
10084b588458SPeter Avalos if (rtok == ')') { /* special pleading for () */
10094b588458SPeter Avalos rtok = relex();
1010*ed569bc2SAaron LI return unary(op2(CCL, NIL, (Node *) cclenter("")));
10114b588458SPeter Avalos }
10124b588458SPeter Avalos np = regexp();
10134b588458SPeter Avalos if (rtok == ')') {
10141d48fce0SDaniel Fojt lastatom = basestr + savelastatom; /* Restore */
10154b588458SPeter Avalos rtok = relex();
10164b588458SPeter Avalos return (unary(np));
10174b588458SPeter Avalos }
10184b588458SPeter Avalos else
10194b588458SPeter Avalos FATAL("syntax error in regular expression %s at %s", lastre, prestr);
10204b588458SPeter Avalos default:
10214b588458SPeter Avalos FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
10224b588458SPeter Avalos }
10234b588458SPeter Avalos return 0; /*NOTREACHED*/
10244b588458SPeter Avalos }
10254b588458SPeter Avalos
concat(Node * np)10264b588458SPeter Avalos Node *concat(Node *np)
10274b588458SPeter Avalos {
10284b588458SPeter Avalos switch (rtok) {
10291d48fce0SDaniel Fojt case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
10304b588458SPeter Avalos return (concat(op2(CAT, np, primary())));
10311d48fce0SDaniel Fojt case EMPTYRE:
10321d48fce0SDaniel Fojt rtok = relex();
1033*ed569bc2SAaron LI return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
10341d48fce0SDaniel Fojt primary())));
10354b588458SPeter Avalos }
10364b588458SPeter Avalos return (np);
10374b588458SPeter Avalos }
10384b588458SPeter Avalos
alt(Node * np)10394b588458SPeter Avalos Node *alt(Node *np)
10404b588458SPeter Avalos {
10414b588458SPeter Avalos if (rtok == OR) {
10424b588458SPeter Avalos rtok = relex();
10434b588458SPeter Avalos return (alt(op2(OR, np, concat(primary()))));
10444b588458SPeter Avalos }
10454b588458SPeter Avalos return (np);
10464b588458SPeter Avalos }
10474b588458SPeter Avalos
unary(Node * np)10484b588458SPeter Avalos Node *unary(Node *np)
10494b588458SPeter Avalos {
10504b588458SPeter Avalos switch (rtok) {
10514b588458SPeter Avalos case STAR:
10524b588458SPeter Avalos rtok = relex();
10534b588458SPeter Avalos return (unary(op2(STAR, np, NIL)));
10544b588458SPeter Avalos case PLUS:
10554b588458SPeter Avalos rtok = relex();
10564b588458SPeter Avalos return (unary(op2(PLUS, np, NIL)));
10574b588458SPeter Avalos case QUEST:
10584b588458SPeter Avalos rtok = relex();
10594b588458SPeter Avalos return (unary(op2(QUEST, np, NIL)));
10601d48fce0SDaniel Fojt case ZERO:
10611d48fce0SDaniel Fojt rtok = relex();
10621d48fce0SDaniel Fojt return (unary(op2(ZERO, np, NIL)));
10634b588458SPeter Avalos default:
10644b588458SPeter Avalos return (np);
10654b588458SPeter Avalos }
10664b588458SPeter Avalos }
10674b588458SPeter Avalos
10684b588458SPeter Avalos /*
10694b588458SPeter Avalos * Character class definitions conformant to the POSIX locale as
10704b588458SPeter Avalos * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
10714b588458SPeter Avalos * and operating character sets are both ASCII (ISO646) or supersets
10724b588458SPeter Avalos * thereof.
10734b588458SPeter Avalos *
10744b588458SPeter Avalos * Note that to avoid overflowing the temporary buffer used in
10754b588458SPeter Avalos * relex(), the expanded character class (prior to range expansion)
10764b588458SPeter Avalos * must be less than twice the size of their full name.
10774b588458SPeter Avalos */
10784b588458SPeter Avalos
10794b588458SPeter Avalos /* Because isblank doesn't show up in any of the header files on any
10804b588458SPeter Avalos * system i use, it's defined here. if some other locale has a richer
10814b588458SPeter Avalos * definition of "blank", define HAS_ISBLANK and provide your own
10824b588458SPeter Avalos * version.
10834b588458SPeter Avalos * the parentheses here are an attempt to find a path through the maze
10844b588458SPeter Avalos * of macro definition and/or function and/or version provided. thanks
10854b588458SPeter Avalos * to nelson beebe for the suggestion; let's see if it works everywhere.
10864b588458SPeter Avalos */
10874b588458SPeter Avalos
10884b588458SPeter Avalos /* #define HAS_ISBLANK */
10894b588458SPeter Avalos #ifndef HAS_ISBLANK
10904b588458SPeter Avalos
10914b588458SPeter Avalos int (xisblank)(int c)
10924b588458SPeter Avalos {
10934b588458SPeter Avalos return c==' ' || c=='\t';
10944b588458SPeter Avalos }
10954b588458SPeter Avalos
10964b588458SPeter Avalos #endif
10974b588458SPeter Avalos
10981d48fce0SDaniel Fojt static const struct charclass {
10994b588458SPeter Avalos const char *cc_name;
11004b588458SPeter Avalos int cc_namelen;
11014b588458SPeter Avalos int (*cc_func)(int);
11024b588458SPeter Avalos } charclasses[] = {
11034b588458SPeter Avalos { "alnum", 5, isalnum },
11044b588458SPeter Avalos { "alpha", 5, isalpha },
11050020174dSPeter Avalos #ifndef HAS_ISBLANK
11061d48fce0SDaniel Fojt { "blank", 5, xisblank },
11070020174dSPeter Avalos #else
11080020174dSPeter Avalos { "blank", 5, isblank },
11090020174dSPeter Avalos #endif
11104b588458SPeter Avalos { "cntrl", 5, iscntrl },
11114b588458SPeter Avalos { "digit", 5, isdigit },
11124b588458SPeter Avalos { "graph", 5, isgraph },
11134b588458SPeter Avalos { "lower", 5, islower },
11144b588458SPeter Avalos { "print", 5, isprint },
11154b588458SPeter Avalos { "punct", 5, ispunct },
11164b588458SPeter Avalos { "space", 5, isspace },
11174b588458SPeter Avalos { "upper", 5, isupper },
11184b588458SPeter Avalos { "xdigit", 6, isxdigit },
11194b588458SPeter Avalos { NULL, 0, NULL },
11204b588458SPeter Avalos };
11214b588458SPeter Avalos
11221d48fce0SDaniel Fojt #define REPEAT_SIMPLE 0
11231d48fce0SDaniel Fojt #define REPEAT_PLUS_APPENDED 1
11241d48fce0SDaniel Fojt #define REPEAT_WITH_Q 2
11251d48fce0SDaniel Fojt #define REPEAT_ZERO 3
11261d48fce0SDaniel Fojt
11271d48fce0SDaniel Fojt static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)11281d48fce0SDaniel Fojt replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
11291d48fce0SDaniel Fojt int atomlen, int firstnum, int secondnum, int special_case)
11301d48fce0SDaniel Fojt {
11311d48fce0SDaniel Fojt int i, j;
11321d48fce0SDaniel Fojt uschar *buf = 0;
11331d48fce0SDaniel Fojt int ret = 1;
11341d48fce0SDaniel Fojt int init_q = (firstnum == 0); /* first added char will be ? */
11351d48fce0SDaniel Fojt int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
11361d48fce0SDaniel Fojt int prefix_length = reptok - basestr; /* prefix includes first rep */
11371d48fce0SDaniel Fojt int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
11381d48fce0SDaniel Fojt int size = prefix_length + suffix_length;
11391d48fce0SDaniel Fojt
11401d48fce0SDaniel Fojt if (firstnum > 1) { /* add room for reps 2 through firstnum */
11411d48fce0SDaniel Fojt size += atomlen*(firstnum-1);
11421d48fce0SDaniel Fojt }
11431d48fce0SDaniel Fojt
11441d48fce0SDaniel Fojt /* Adjust size of buffer for special cases */
11451d48fce0SDaniel Fojt if (special_case == REPEAT_PLUS_APPENDED) {
11461d48fce0SDaniel Fojt size++; /* for the final + */
11471d48fce0SDaniel Fojt } else if (special_case == REPEAT_WITH_Q) {
114848f09a05SAntonio Huete Jimenez size += init_q + (atomlen+1)* (n_q_reps-init_q);
11491d48fce0SDaniel Fojt } else if (special_case == REPEAT_ZERO) {
11501d48fce0SDaniel Fojt size += 2; /* just a null ERE: () */
11511d48fce0SDaniel Fojt }
115248f09a05SAntonio Huete Jimenez if ((buf = (uschar *) malloc(size + 1)) == NULL)
11531d48fce0SDaniel Fojt FATAL("out of space in reg expr %.10s..", lastre);
11541d48fce0SDaniel Fojt memcpy(buf, basestr, prefix_length); /* copy prefix */
11551d48fce0SDaniel Fojt j = prefix_length;
11561d48fce0SDaniel Fojt if (special_case == REPEAT_ZERO) {
11571d48fce0SDaniel Fojt j -= atomlen;
11581d48fce0SDaniel Fojt buf[j++] = '(';
11591d48fce0SDaniel Fojt buf[j++] = ')';
11601d48fce0SDaniel Fojt }
11611d48fce0SDaniel Fojt for (i = 1; i < firstnum; i++) { /* copy x reps */
11621d48fce0SDaniel Fojt memcpy(&buf[j], atom, atomlen);
11631d48fce0SDaniel Fojt j += atomlen;
11641d48fce0SDaniel Fojt }
11651d48fce0SDaniel Fojt if (special_case == REPEAT_PLUS_APPENDED) {
11661d48fce0SDaniel Fojt buf[j++] = '+';
11671d48fce0SDaniel Fojt } else if (special_case == REPEAT_WITH_Q) {
11681d48fce0SDaniel Fojt if (init_q)
11691d48fce0SDaniel Fojt buf[j++] = '?';
11701d48fce0SDaniel Fojt for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
11711d48fce0SDaniel Fojt memcpy(&buf[j], atom, atomlen);
11721d48fce0SDaniel Fojt j += atomlen;
11731d48fce0SDaniel Fojt buf[j++] = '?';
11741d48fce0SDaniel Fojt }
11751d48fce0SDaniel Fojt }
11761d48fce0SDaniel Fojt memcpy(&buf[j], reptok+reptoklen, suffix_length);
117748f09a05SAntonio Huete Jimenez j += suffix_length;
117848f09a05SAntonio Huete Jimenez buf[j] = '\0';
11791d48fce0SDaniel Fojt /* free old basestr */
11801d48fce0SDaniel Fojt if (firstbasestr != basestr) {
11811d48fce0SDaniel Fojt if (basestr)
11821d48fce0SDaniel Fojt xfree(basestr);
11831d48fce0SDaniel Fojt }
11841d48fce0SDaniel Fojt basestr = buf;
11851d48fce0SDaniel Fojt prestr = buf + prefix_length;
11861d48fce0SDaniel Fojt if (special_case == REPEAT_ZERO) {
11871d48fce0SDaniel Fojt prestr -= atomlen;
11881d48fce0SDaniel Fojt ret++;
11891d48fce0SDaniel Fojt }
11901d48fce0SDaniel Fojt return ret;
11911d48fce0SDaniel Fojt }
11921d48fce0SDaniel Fojt
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)11931d48fce0SDaniel Fojt static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
11941d48fce0SDaniel Fojt int atomlen, int firstnum, int secondnum)
11951d48fce0SDaniel Fojt {
11961d48fce0SDaniel Fojt /*
11971d48fce0SDaniel Fojt In general, the repetition specifier or "bound" is replaced here
11981d48fce0SDaniel Fojt by an equivalent ERE string, repeating the immediately previous atom
11991d48fce0SDaniel Fojt and appending ? and + as needed. Note that the first copy of the
12001d48fce0SDaniel Fojt atom is left in place, except in the special_case of a zero-repeat
12011d48fce0SDaniel Fojt (i.e., {0}).
12021d48fce0SDaniel Fojt */
12031d48fce0SDaniel Fojt if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
12041d48fce0SDaniel Fojt if (firstnum < 2) {
12051d48fce0SDaniel Fojt /* 0 or 1: should be handled before you get here */
12061d48fce0SDaniel Fojt FATAL("internal error");
12071d48fce0SDaniel Fojt } else {
12081d48fce0SDaniel Fojt return replace_repeat(reptok, reptoklen, atom, atomlen,
12091d48fce0SDaniel Fojt firstnum, secondnum, REPEAT_PLUS_APPENDED);
12101d48fce0SDaniel Fojt }
12111d48fce0SDaniel Fojt } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
12121d48fce0SDaniel Fojt if (firstnum == 0) { /* {0} or {0,0} */
12131d48fce0SDaniel Fojt /* This case is unusual because the resulting
12141d48fce0SDaniel Fojt replacement string might actually be SMALLER than
12151d48fce0SDaniel Fojt the original ERE */
12161d48fce0SDaniel Fojt return replace_repeat(reptok, reptoklen, atom, atomlen,
12171d48fce0SDaniel Fojt firstnum, secondnum, REPEAT_ZERO);
12181d48fce0SDaniel Fojt } else { /* (firstnum >= 1) */
12191d48fce0SDaniel Fojt return replace_repeat(reptok, reptoklen, atom, atomlen,
12201d48fce0SDaniel Fojt firstnum, secondnum, REPEAT_SIMPLE);
12211d48fce0SDaniel Fojt }
12221d48fce0SDaniel Fojt } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
12231d48fce0SDaniel Fojt /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
12241d48fce0SDaniel Fojt return replace_repeat(reptok, reptoklen, atom, atomlen,
12251d48fce0SDaniel Fojt firstnum, secondnum, REPEAT_WITH_Q);
12261d48fce0SDaniel Fojt } else { /* Error - shouldn't be here (n>m) */
12271d48fce0SDaniel Fojt FATAL("internal error");
12281d48fce0SDaniel Fojt }
12291d48fce0SDaniel Fojt return 0;
12301d48fce0SDaniel Fojt }
12314b588458SPeter Avalos
relex(void)12324b588458SPeter Avalos int relex(void) /* lexical analyzer for reparse */
12334b588458SPeter Avalos {
12344b588458SPeter Avalos int c, n;
12354b588458SPeter Avalos int cflag;
12361d48fce0SDaniel Fojt static uschar *buf = NULL;
12374b588458SPeter Avalos static int bufsz = 100;
12384b588458SPeter Avalos uschar *bp;
12391d48fce0SDaniel Fojt const struct charclass *cc;
12404b588458SPeter Avalos int i;
12411d48fce0SDaniel Fojt int num, m;
12421d48fce0SDaniel Fojt bool commafound, digitfound;
12431d48fce0SDaniel Fojt const uschar *startreptok;
12441d48fce0SDaniel Fojt static int parens = 0;
12451d48fce0SDaniel Fojt
12461d48fce0SDaniel Fojt rescan:
12471d48fce0SDaniel Fojt starttok = prestr;
12484b588458SPeter Avalos
1249*ed569bc2SAaron LI if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1250*ed569bc2SAaron LI prestr += n;
1251*ed569bc2SAaron LI starttok = prestr;
1252*ed569bc2SAaron LI return CHAR;
1253*ed569bc2SAaron LI }
1254*ed569bc2SAaron LI
12554b588458SPeter Avalos switch (c = *prestr++) {
12564b588458SPeter Avalos case '|': return OR;
12574b588458SPeter Avalos case '*': return STAR;
12584b588458SPeter Avalos case '+': return PLUS;
12594b588458SPeter Avalos case '?': return QUEST;
12604b588458SPeter Avalos case '.': return DOT;
12614b588458SPeter Avalos case '\0': prestr--; return '\0';
12624b588458SPeter Avalos case '^':
12634b588458SPeter Avalos case '$':
12644b588458SPeter Avalos return c;
12651d48fce0SDaniel Fojt case '(':
12661d48fce0SDaniel Fojt parens++;
12671d48fce0SDaniel Fojt return c;
12681d48fce0SDaniel Fojt case ')':
12691d48fce0SDaniel Fojt if (parens) {
12701d48fce0SDaniel Fojt parens--;
12711d48fce0SDaniel Fojt return c;
12721d48fce0SDaniel Fojt }
12731d48fce0SDaniel Fojt /* unmatched close parenthesis; per POSIX, treat as literal */
12741d48fce0SDaniel Fojt rlxval = c;
12751d48fce0SDaniel Fojt return CHAR;
12764b588458SPeter Avalos case '\\':
1277b12bae18SSascha Wildner rlxval = quoted(&prestr);
12784b588458SPeter Avalos return CHAR;
12794b588458SPeter Avalos default:
12804b588458SPeter Avalos rlxval = c;
12814b588458SPeter Avalos return CHAR;
12824b588458SPeter Avalos case '[':
128348f09a05SAntonio Huete Jimenez if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
12844b588458SPeter Avalos FATAL("out of space in reg expr %.10s..", lastre);
12854b588458SPeter Avalos bp = buf;
12864b588458SPeter Avalos if (*prestr == '^') {
12874b588458SPeter Avalos cflag = 1;
12884b588458SPeter Avalos prestr++;
12894b588458SPeter Avalos }
12904b588458SPeter Avalos else
12914b588458SPeter Avalos cflag = 0;
1292*ed569bc2SAaron LI n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
12934b588458SPeter Avalos if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
12944b588458SPeter Avalos FATAL("out of space for reg expr %.10s...", lastre);
12954b588458SPeter Avalos for (; ; ) {
1296*ed569bc2SAaron LI if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1297*ed569bc2SAaron LI for (i = 0; i < n; i++)
1298*ed569bc2SAaron LI *bp++ = *prestr++;
1299*ed569bc2SAaron LI continue;
1300*ed569bc2SAaron LI }
13014b588458SPeter Avalos if ((c = *prestr++) == '\\') {
13024b588458SPeter Avalos *bp++ = '\\';
13034b588458SPeter Avalos if ((c = *prestr++) == '\0')
13044b588458SPeter Avalos FATAL("nonterminated character class %.20s...", lastre);
13054b588458SPeter Avalos *bp++ = c;
13064b588458SPeter Avalos /* } else if (c == '\n') { */
13074b588458SPeter Avalos /* FATAL("newline in character class %.20s...", lastre); */
13084b588458SPeter Avalos } else if (c == '[' && *prestr == ':') {
13094b588458SPeter Avalos /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
13104b588458SPeter Avalos for (cc = charclasses; cc->cc_name; cc++)
13114b588458SPeter Avalos if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
13124b588458SPeter Avalos break;
13134b588458SPeter Avalos if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
13144b588458SPeter Avalos prestr[2 + cc->cc_namelen] == ']') {
13154b588458SPeter Avalos prestr += cc->cc_namelen + 3;
13161d48fce0SDaniel Fojt /*
13171d48fce0SDaniel Fojt * BUG: We begin at 1, instead of 0, since we
13181d48fce0SDaniel Fojt * would otherwise prematurely terminate the
13191d48fce0SDaniel Fojt * string for classes like [[:cntrl:]]. This
13201d48fce0SDaniel Fojt * means that we can't match the NUL character,
13211d48fce0SDaniel Fojt * not without first adapting the entire
13221d48fce0SDaniel Fojt * program to track each string's length.
13231d48fce0SDaniel Fojt */
13241d48fce0SDaniel Fojt for (i = 1; i <= UCHAR_MAX; i++) {
132548f09a05SAntonio Huete Jimenez if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
13264b588458SPeter Avalos FATAL("out of space for reg expr %.10s...", lastre);
13274b588458SPeter Avalos if (cc->cc_func(i)) {
1328e5e686a0SDaniel Fojt /* escape backslash */
1329e5e686a0SDaniel Fojt if (i == '\\') {
1330e5e686a0SDaniel Fojt *bp++ = '\\';
1331e5e686a0SDaniel Fojt n++;
1332e5e686a0SDaniel Fojt }
1333e5e686a0SDaniel Fojt
13344b588458SPeter Avalos *bp++ = i;
13354b588458SPeter Avalos n++;
13364b588458SPeter Avalos }
13374b588458SPeter Avalos }
13384b588458SPeter Avalos } else
13394b588458SPeter Avalos *bp++ = c;
13401d48fce0SDaniel Fojt } else if (c == '[' && *prestr == '.') {
13411d48fce0SDaniel Fojt char collate_char;
13421d48fce0SDaniel Fojt prestr++;
13431d48fce0SDaniel Fojt collate_char = *prestr++;
13441d48fce0SDaniel Fojt if (*prestr == '.' && prestr[1] == ']') {
13451d48fce0SDaniel Fojt prestr += 2;
13461d48fce0SDaniel Fojt /* Found it: map via locale TBD: for
13471d48fce0SDaniel Fojt now, simply return this char. This
13481d48fce0SDaniel Fojt is sufficient to pass conformance
13491d48fce0SDaniel Fojt test awk.ex 156
13501d48fce0SDaniel Fojt */
13511d48fce0SDaniel Fojt if (*prestr == ']') {
13521d48fce0SDaniel Fojt prestr++;
13531d48fce0SDaniel Fojt rlxval = collate_char;
13541d48fce0SDaniel Fojt return CHAR;
13551d48fce0SDaniel Fojt }
13561d48fce0SDaniel Fojt }
13571d48fce0SDaniel Fojt } else if (c == '[' && *prestr == '=') {
13581d48fce0SDaniel Fojt char equiv_char;
13591d48fce0SDaniel Fojt prestr++;
13601d48fce0SDaniel Fojt equiv_char = *prestr++;
13611d48fce0SDaniel Fojt if (*prestr == '=' && prestr[1] == ']') {
13621d48fce0SDaniel Fojt prestr += 2;
13631d48fce0SDaniel Fojt /* Found it: map via locale TBD: for now
13641d48fce0SDaniel Fojt simply return this char. This is
13651d48fce0SDaniel Fojt sufficient to pass conformance test
13661d48fce0SDaniel Fojt awk.ex 156
13671d48fce0SDaniel Fojt */
13681d48fce0SDaniel Fojt if (*prestr == ']') {
13691d48fce0SDaniel Fojt prestr++;
13701d48fce0SDaniel Fojt rlxval = equiv_char;
13711d48fce0SDaniel Fojt return CHAR;
13721d48fce0SDaniel Fojt }
13731d48fce0SDaniel Fojt }
13744b588458SPeter Avalos } else if (c == '\0') {
13754b588458SPeter Avalos FATAL("nonterminated character class %.20s", lastre);
13764b588458SPeter Avalos } else if (bp == buf) { /* 1st char is special */
13774b588458SPeter Avalos *bp++ = c;
13784b588458SPeter Avalos } else if (c == ']') {
13794b588458SPeter Avalos *bp++ = 0;
13804b588458SPeter Avalos rlxstr = (uschar *) tostring((char *) buf);
13814b588458SPeter Avalos if (cflag == 0)
13824b588458SPeter Avalos return CCL;
13834b588458SPeter Avalos else
13844b588458SPeter Avalos return NCCL;
13854b588458SPeter Avalos } else
13864b588458SPeter Avalos *bp++ = c;
13874b588458SPeter Avalos }
13881d48fce0SDaniel Fojt break;
13891d48fce0SDaniel Fojt case '{':
1390*ed569bc2SAaron LI if (isdigit((int) *(prestr))) {
13911d48fce0SDaniel Fojt num = 0; /* Process as a repetition */
13921d48fce0SDaniel Fojt n = -1; m = -1;
13931d48fce0SDaniel Fojt commafound = false;
13941d48fce0SDaniel Fojt digitfound = false;
13951d48fce0SDaniel Fojt startreptok = prestr-1;
13961d48fce0SDaniel Fojt /* Remember start of previous atom here ? */
13971d48fce0SDaniel Fojt } else { /* just a { char, not a repetition */
13981d48fce0SDaniel Fojt rlxval = c;
13991d48fce0SDaniel Fojt return CHAR;
14001d48fce0SDaniel Fojt }
14011d48fce0SDaniel Fojt for (; ; ) {
14021d48fce0SDaniel Fojt if ((c = *prestr++) == '}') {
14031d48fce0SDaniel Fojt if (commafound) {
14041d48fce0SDaniel Fojt if (digitfound) { /* {n,m} */
14051d48fce0SDaniel Fojt m = num;
14061d48fce0SDaniel Fojt if (m < n)
14071d48fce0SDaniel Fojt FATAL("illegal repetition expression: class %.20s",
14081d48fce0SDaniel Fojt lastre);
14091d48fce0SDaniel Fojt if (n == 0 && m == 1) {
14101d48fce0SDaniel Fojt return QUEST;
14111d48fce0SDaniel Fojt }
14121d48fce0SDaniel Fojt } else { /* {n,} */
14131d48fce0SDaniel Fojt if (n == 0)
14141d48fce0SDaniel Fojt return STAR;
14151d48fce0SDaniel Fojt else if (n == 1)
14161d48fce0SDaniel Fojt return PLUS;
14171d48fce0SDaniel Fojt }
14181d48fce0SDaniel Fojt } else {
14191d48fce0SDaniel Fojt if (digitfound) { /* {n} same as {n,n} */
14201d48fce0SDaniel Fojt n = num;
14211d48fce0SDaniel Fojt m = num;
14221d48fce0SDaniel Fojt } else { /* {} */
14231d48fce0SDaniel Fojt FATAL("illegal repetition expression: class %.20s",
14241d48fce0SDaniel Fojt lastre);
14251d48fce0SDaniel Fojt }
14261d48fce0SDaniel Fojt }
14271d48fce0SDaniel Fojt if (repeat(starttok, prestr-starttok, lastatom,
14281d48fce0SDaniel Fojt startreptok - lastatom, n, m) > 0) {
14291d48fce0SDaniel Fojt if (n == 0 && m == 0) {
14301d48fce0SDaniel Fojt return ZERO;
14311d48fce0SDaniel Fojt }
14321d48fce0SDaniel Fojt /* must rescan input for next token */
14331d48fce0SDaniel Fojt goto rescan;
14341d48fce0SDaniel Fojt }
14351d48fce0SDaniel Fojt /* Failed to replace: eat up {...} characters
14361d48fce0SDaniel Fojt and treat like just PLUS */
14371d48fce0SDaniel Fojt return PLUS;
14381d48fce0SDaniel Fojt } else if (c == '\0') {
14391d48fce0SDaniel Fojt FATAL("nonterminated character class %.20s",
14401d48fce0SDaniel Fojt lastre);
14411d48fce0SDaniel Fojt } else if (isdigit(c)) {
14421d48fce0SDaniel Fojt num = 10 * num + c - '0';
14431d48fce0SDaniel Fojt digitfound = true;
14441d48fce0SDaniel Fojt } else if (c == ',') {
14451d48fce0SDaniel Fojt if (commafound)
14461d48fce0SDaniel Fojt FATAL("illegal repetition expression: class %.20s",
14471d48fce0SDaniel Fojt lastre);
14481d48fce0SDaniel Fojt /* looking for {n,} or {n,m} */
14491d48fce0SDaniel Fojt commafound = true;
14501d48fce0SDaniel Fojt n = num;
14511d48fce0SDaniel Fojt digitfound = false; /* reset */
14521d48fce0SDaniel Fojt num = 0;
14531d48fce0SDaniel Fojt } else {
14541d48fce0SDaniel Fojt FATAL("illegal repetition expression: class %.20s",
14551d48fce0SDaniel Fojt lastre);
14561d48fce0SDaniel Fojt }
14571d48fce0SDaniel Fojt }
14581d48fce0SDaniel Fojt break;
14594b588458SPeter Avalos }
14604b588458SPeter Avalos }
14614b588458SPeter Avalos
cgoto(fa * f,int s,int c)14624b588458SPeter Avalos int cgoto(fa *f, int s, int c)
14634b588458SPeter Avalos {
14644b588458SPeter Avalos int *p, *q;
14651d48fce0SDaniel Fojt int i, j, k;
14664b588458SPeter Avalos
1467*ed569bc2SAaron LI /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
14684b588458SPeter Avalos while (f->accept >= maxsetvec) { /* guessing here! */
14691d48fce0SDaniel Fojt resizesetvec(__func__);
14704b588458SPeter Avalos }
14714b588458SPeter Avalos for (i = 0; i <= f->accept; i++)
14724b588458SPeter Avalos setvec[i] = 0;
14734b588458SPeter Avalos setcnt = 0;
14741d48fce0SDaniel Fojt resize_state(f, s);
14754b588458SPeter Avalos /* compute positions of gototab[s,c] into setvec */
14764b588458SPeter Avalos p = f->posns[s];
14774b588458SPeter Avalos for (i = 1; i <= *p; i++) {
14784b588458SPeter Avalos if ((k = f->re[p[i]].ltype) != FINAL) {
14794b588458SPeter Avalos if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
14804b588458SPeter Avalos || (k == DOT && c != 0 && c != HAT)
14814b588458SPeter Avalos || (k == ALL && c != 0)
14824b588458SPeter Avalos || (k == EMPTYRE && c != 0)
1483*ed569bc2SAaron LI || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1484*ed569bc2SAaron LI || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
14854b588458SPeter Avalos q = f->re[p[i]].lfollow;
14864b588458SPeter Avalos for (j = 1; j <= *q; j++) {
14874b588458SPeter Avalos if (q[j] >= maxsetvec) {
14881d48fce0SDaniel Fojt resizesetvec(__func__);
14894b588458SPeter Avalos }
14904b588458SPeter Avalos if (setvec[q[j]] == 0) {
14914b588458SPeter Avalos setcnt++;
14924b588458SPeter Avalos setvec[q[j]] = 1;
14934b588458SPeter Avalos }
14944b588458SPeter Avalos }
14954b588458SPeter Avalos }
14964b588458SPeter Avalos }
14974b588458SPeter Avalos }
14984b588458SPeter Avalos /* determine if setvec is a previous state */
14994b588458SPeter Avalos tmpset[0] = setcnt;
15004b588458SPeter Avalos j = 1;
15014b588458SPeter Avalos for (i = f->accept; i >= 0; i--)
15024b588458SPeter Avalos if (setvec[i]) {
15034b588458SPeter Avalos tmpset[j++] = i;
15044b588458SPeter Avalos }
15051d48fce0SDaniel Fojt resize_state(f, f->curstat > s ? f->curstat : s);
15064b588458SPeter Avalos /* tmpset == previous state? */
15074b588458SPeter Avalos for (i = 1; i <= f->curstat; i++) {
15084b588458SPeter Avalos p = f->posns[i];
15094b588458SPeter Avalos if ((k = tmpset[0]) != p[0])
15104b588458SPeter Avalos goto different;
15114b588458SPeter Avalos for (j = 1; j <= k; j++)
15124b588458SPeter Avalos if (tmpset[j] != p[j])
15134b588458SPeter Avalos goto different;
15144b588458SPeter Avalos /* setvec is state i */
15151d48fce0SDaniel Fojt if (c != HAT)
1516*ed569bc2SAaron LI set_gototab(f, s, c, i);
15174b588458SPeter Avalos return i;
15184b588458SPeter Avalos different:;
15194b588458SPeter Avalos }
15204b588458SPeter Avalos
15214b588458SPeter Avalos /* add tmpset to current set of states */
15224b588458SPeter Avalos ++(f->curstat);
15231d48fce0SDaniel Fojt resize_state(f, f->curstat);
1524*ed569bc2SAaron LI clear_gototab(f, f->curstat);
15254b588458SPeter Avalos xfree(f->posns[f->curstat]);
15261d48fce0SDaniel Fojt p = intalloc(setcnt + 1, __func__);
15274b588458SPeter Avalos
15284b588458SPeter Avalos f->posns[f->curstat] = p;
15291d48fce0SDaniel Fojt if (c != HAT)
1530*ed569bc2SAaron LI set_gototab(f, s, c, f->curstat);
15314b588458SPeter Avalos for (i = 0; i <= setcnt; i++)
15324b588458SPeter Avalos p[i] = tmpset[i];
15334b588458SPeter Avalos if (setvec[f->accept])
15344b588458SPeter Avalos f->out[f->curstat] = 1;
15354b588458SPeter Avalos else
15364b588458SPeter Avalos f->out[f->curstat] = 0;
15374b588458SPeter Avalos return f->curstat;
15384b588458SPeter Avalos }
15394b588458SPeter Avalos
15404b588458SPeter Avalos
freefa(fa * f)15414b588458SPeter Avalos void freefa(fa *f) /* free a finite automaton */
15424b588458SPeter Avalos {
15434b588458SPeter Avalos int i;
15444b588458SPeter Avalos
15454b588458SPeter Avalos if (f == NULL)
15464b588458SPeter Avalos return;
15471d48fce0SDaniel Fojt for (i = 0; i < f->state_count; i++)
1548*ed569bc2SAaron LI xfree(f->gototab[i].entries);
1549*ed569bc2SAaron LI xfree(f->gototab);
15504b588458SPeter Avalos for (i = 0; i <= f->curstat; i++)
15514b588458SPeter Avalos xfree(f->posns[i]);
15524b588458SPeter Avalos for (i = 0; i <= f->accept; i++) {
15534b588458SPeter Avalos xfree(f->re[i].lfollow);
15544b588458SPeter Avalos if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
15551d48fce0SDaniel Fojt xfree(f->re[i].lval.np);
15564b588458SPeter Avalos }
15574b588458SPeter Avalos xfree(f->restr);
15581d48fce0SDaniel Fojt xfree(f->out);
15591d48fce0SDaniel Fojt xfree(f->posns);
15601d48fce0SDaniel Fojt xfree(f->gototab);
15614b588458SPeter Avalos xfree(f);
15624b588458SPeter Avalos }
1563