1 /*
2  * This file is part of DGD, https://github.com/dworkin/dgd
3  * Copyright (C) 1993-2010 Dworkin B.V.
4  * Copyright (C) 2010,2012 DGD Authors (see the commit log for details)
5  *
6  * This program is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Affero General Public License as
8  * published by the Free Software Foundation, either version 3 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Affero General Public License for more details.
15  *
16  * You should have received a copy of the GNU Affero General Public License
17  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 # include "dgd.h"
21 # include "hash.h"
22 # include "str.h"
23 # include "dfa.h"
24 
25 /*
26  * NAME:	charset->neg()
27  * DESCRIPTION:	negate a charset
28  */
cs_neg(Uint * cs)29 static void cs_neg(Uint *cs)
30 {
31     *cs++ ^= 0xffffffffL;
32     *cs++ ^= 0xffffffffL;
33     *cs++ ^= 0xffffffffL;
34     *cs++ ^= 0xffffffffL;
35     *cs++ ^= 0xffffffffL;
36     *cs++ ^= 0xffffffffL;
37     *cs++ ^= 0xffffffffL;
38     *cs   ^= 0xffffffffL;
39 }
40 
41 /*
42  * NAME:	charset->or()
43  * DESCRIPTION:	or two charsets
44  */
cs_or(Uint * cs1,Uint * cs2)45 static void cs_or(Uint *cs1, Uint *cs2)
46 {
47     *cs1++ |= *cs2++;
48     *cs1++ |= *cs2++;
49     *cs1++ |= *cs2++;
50     *cs1++ |= *cs2++;
51     *cs1++ |= *cs2++;
52     *cs1++ |= *cs2++;
53     *cs1++ |= *cs2++;
54     *cs1   |= *cs2;
55 }
56 
57 /*
58  * NAME:	charset->sub()
59  * DESCRIPTION:	subtract a charset from another one
60  */
cs_sub(Uint * cs1,Uint * cs2)61 static void cs_sub(Uint *cs1, Uint *cs2)
62 {
63     *cs1++ &= ~*cs2++;
64     *cs1++ &= ~*cs2++;
65     *cs1++ &= ~*cs2++;
66     *cs1++ &= ~*cs2++;
67     *cs1++ &= ~*cs2++;
68     *cs1++ &= ~*cs2++;
69     *cs1++ &= ~*cs2++;
70     *cs1   &= ~*cs2;
71 }
72 
73 /*
74  * NAME:	charset->intersect()
75  * DESCRIPTION:	return TRUE if two character sets intersect, FALSE otherwise
76  */
cs_intersect(Uint * cs1,Uint * cs2)77 static bool cs_intersect(Uint *cs1, Uint *cs2)
78 {
79     Uint i;
80 
81     i  = *cs1++ & *cs2++;
82     i |= *cs1++ & *cs2++;
83     i |= *cs1++ & *cs2++;
84     i |= *cs1++ & *cs2++;
85     i |= *cs1++ & *cs2++;
86     i |= *cs1++ & *cs2++;
87     i |= *cs1++ & *cs2++;
88     i |= *cs1   & *cs2;
89 
90     return (i != 0);
91 }
92 
93 /*
94  * NAME:	charset->overlap()
95  * DESCRIPTION:	Check if two character sets overlap.  Return TRUE if they do,
96  *		or if the first set contains the second one.
97  */
cs_overlap(Uint * cs1,Uint * cs2,Uint * cs3,Uint * cs4)98 static bool cs_overlap(Uint *cs1, Uint *cs2, Uint *cs3, Uint *cs4)
99 {
100     Uint s3, s4;
101 
102     s3  = *cs3 = *cs1 & *cs2++;  s4  = *cs4++ = *cs1++ & ~*cs3++;
103     s3 |= *cs3 = *cs1 & *cs2++;  s4 |= *cs4++ = *cs1++ & ~*cs3++;
104     s3 |= *cs3 = *cs1 & *cs2++;  s4 |= *cs4++ = *cs1++ & ~*cs3++;
105     s3 |= *cs3 = *cs1 & *cs2++;  s4 |= *cs4++ = *cs1++ & ~*cs3++;
106     s3 |= *cs3 = *cs1 & *cs2++;  s4 |= *cs4++ = *cs1++ & ~*cs3++;
107     s3 |= *cs3 = *cs1 & *cs2++;  s4 |= *cs4++ = *cs1++ & ~*cs3++;
108     s3 |= *cs3 = *cs1 & *cs2++;  s4 |= *cs4++ = *cs1++ & ~*cs3++;
109     s3 |= *cs3 = *cs1 & *cs2;    s4 |= *cs4   = *cs1   & ~*cs3;
110 
111     return (s3 != 0 && s4 != 0);
112 }
113 
114 /*
115  * NAME:	charset->firstc()
116  * DESCRIPTION:	find the first char in a charset
117  */
cs_firstc(Uint * cset,int c)118 static int cs_firstc(Uint *cset, int c)
119 {
120     Uint x;
121 
122     while (c < 256) {
123 	if ((x=cset[c >> 5] >> (c & 31)) != 0) {
124 	    while ((x & 0xff) == 0) {
125 		x >>= 8;
126 		c += 8;
127 	    }
128 	    while ((x & 1) == 0) {
129 		x >>= 1;
130 		c++;
131 	    }
132 	    return c;
133 	}
134 	c += 32;
135 	c &= ~31;
136     }
137 
138     /* not found */
139     return -1;
140 }
141 
142 /*
143  * NAME:	charset->eclass()
144  * DESCRIPTION:	convert a charset into an equivalence class
145  */
cs_eclass(Uint * cset,char * eclass,int class)146 static int cs_eclass(Uint *cset, char *eclass, int class)
147 {
148     int n, c;
149     Uint x;
150 
151     n = 0;
152     for (c = cs_firstc(cset, 0); c < 256; c += 31, c &= ~31) {
153 	x = cset[c >> 5] >> (c & 31);
154 	if (x != 0) {
155 	    do {
156 		while ((x & 0xff) == 0) {
157 		    x >>= 8;
158 		    c += 8;
159 		}
160 		if (x & 1) {
161 		    eclass[c] = class;
162 		    n++;
163 		}
164 		x >>= 1;
165 		c++;
166 	    } while (x != 0);
167 	} else {
168 	    c++;
169 	}
170     }
171 
172     return n;
173 }
174 
175 
176 typedef struct {
177     hte chain;			/* hash table chain */
178     char *rgx;			/* regular expression this position is in */
179     unsigned short size;	/* size of position (length of string - 2) */
180     unsigned short ruleno;	/* the rule this position is in */
181     Uint nposn;			/* position number */
182     bool final;			/* final position? */
183     bool alloc;			/* position allocated separately? */
184 } rgxposn;
185 
186 # define RPCHUNKSZ	32
187 
188 typedef struct _rpchunk_ {
189     int chunksz;		/* size of chunk */
190     struct _rpchunk_ *next;	/* next in linked list */
191     rgxposn rp[RPCHUNKSZ];	/* rgxposn chunk */
192 } rpchunk;
193 
194 /*
195  * NAME:	rgxposn->alloc()
196  * DESCRIPTION:	allocate a new rgxposn (or return an old one)
197  */
rp_alloc(hashtab * htab,char * posn,unsigned short size,rpchunk ** c,char * rgx,Uint nposn,unsigned short ruleno,bool final)198 static rgxposn *rp_alloc(hashtab *htab, char *posn, unsigned short size, rpchunk **c, char *rgx, Uint nposn, unsigned short ruleno, bool final)
199 {
200     rgxposn **rrp, *rp;
201 
202     rrp = (rgxposn **) ht_lookup(htab, posn, TRUE);
203     if (*rrp != (rgxposn *) NULL) {
204 	return *rrp;	/* already exists */
205     }
206 
207     if (*c == (rpchunk *) NULL || (*c)->chunksz == RPCHUNKSZ) {
208 	rpchunk *x;
209 
210 	x = ALLOC(rpchunk, 1);
211 	x->next = *c;
212 	*c = x;
213 	x->chunksz = 0;
214     }
215     rp = &(*c)->rp[(*c)->chunksz++];
216     rp->chain.next = (hte *) *rrp;
217     *rrp = rp;
218     rp->chain.name = posn;
219     rp->rgx = rgx;
220     rp->size = size;
221     rp->nposn = nposn;
222     rp->ruleno = ruleno;
223     rp->final = final;
224     rp->alloc = FALSE;
225 
226     return rp;
227 }
228 
229 /*
230  * NAME:	rgxposn->new()
231  * DESCRIPTION:	create a new rgxposn
232  */
rp_new(hashtab * htab,char * posn,unsigned short size,rpchunk ** c,char * rgx,Uint nposn,unsigned short ruleno,bool final)233 static rgxposn *rp_new(hashtab *htab, char *posn, unsigned short size, rpchunk **c, char *rgx, Uint nposn, unsigned short ruleno, bool final)
234 {
235     rgxposn *rp;
236 
237     rp = rp_alloc(htab, posn, size, c, rgx, nposn, ruleno, final);
238     if (rp->nposn == nposn) {
239 	strcpy(rp->chain.name = ALLOC(char, size + 3), posn);
240 	rp->alloc = TRUE;
241     }
242     return rp;
243 }
244 
245 /*
246  * NAME:	rgxposn->clear()
247  * DESCRIPTION:	free all rgxposns
248  */
rp_clear(rpchunk * c)249 static void rp_clear(rpchunk *c)
250 {
251     rpchunk *f;
252     rgxposn *rp;
253     int i;
254 
255     while (c != (rpchunk *) NULL) {
256 	for (rp = c->rp, i = c->chunksz; i != 0; rp++, --i) {
257 	    if (rp->alloc) {
258 		FREE(rp->chain.name);
259 	    }
260 	}
261 	f = c;
262 	c = c->next;
263 	FREE(f);
264     }
265 }
266 
267 /*
268  * NAME:	rgxposn->transposn()
269  * DESCRIPTION:	convert a transition into a position
270  */
rp_transposn(char * rgx,char * trans,char * buf,unsigned short * buflen)271 static bool rp_transposn(char *rgx, char *trans, char *buf, unsigned short *buflen)
272 {
273     char a[256], b[256], c[256], heap[256];
274     char *p, *q;
275     int n, place;
276     unsigned short i, j, len;
277 
278     memset(a, '\0', 256);
279     heap[0] = 0;
280     len = 0;
281 
282     /* from transitions to places */
283     if (trans == (char *) NULL) {
284 	n = 1;
285 	b[0] = 1;
286     } else {
287 	n = 0;
288 	for (p = trans; *p != '\0'; p++) {
289 	    for (i = UCHAR(*p); ; i = place + 1) {
290 		place = UCHAR(rgx[i]) + 1;
291 		if (!a[place]) {
292 		    a[place] = TRUE;
293 		    if (place != UCHAR(rgx[0])) {
294 			switch (rgx[place]) {
295 			case '|':
296 			    /* branch */
297 			    b[n++] = place + 2;
298 			    continue;
299 
300 			case '+':
301 			    /* pattern+ */
302 			    b[n++] = place + 2;
303 			    if (place < i) {
304 				continue;
305 			    }
306 			    break;
307 
308 			default:
309 			    /* add to heap */
310 			    for (i = ++len, j = i >> 1;
311 				 UCHAR(heap[j]) > place;
312 				 i = j, j >>= 1) {
313 				heap[i] = heap[j];
314 			    }
315 			    heap[i] = place;
316 			    break;
317 			}
318 		    }
319 		}
320 		break;
321 	    }
322 	}
323     }
324     b[n] = '\0';
325 
326     /* closure: alternate between b and c */
327     for (p = b, q = c; *p != '\0'; p = q, q = (q == b) ? c : b) {
328 	n = 0;
329 	do {
330 	    place = UCHAR(*p++);
331 	    if (!a[place]) {
332 		a[place] = TRUE;
333 		if (place != UCHAR(rgx[0])) {
334 		    switch (rgx[place]) {
335 		    case '|':
336 			/* branch */
337 			q[n++] = place + 2;
338 			q[n++] = UCHAR(rgx[place + 1]) + 1;
339 			continue;
340 
341 		    case '+':
342 			/* pattern+ */
343 			q[n++] = place + 2;
344 			continue;
345 		    }
346 
347 		    /* add to heap */
348 		    for (i = ++len, j = i >> 1;
349 			 UCHAR(heap[j]) > place;
350 			 i = j, j >>= 1) {
351 			heap[i] = heap[j];
352 		    }
353 		    heap[i] = place;
354 		}
355 	    }
356 	} while (*p != '\0');
357 	q[n] = '\0';
358     }
359 
360     /* from heap to buf */
361     *buflen = len;
362     for (p = buf; len != 0; --len) {
363 	*p++ = heap[1];
364 	n = UCHAR(heap[len]);
365 	for (i = 1, j = 2; j < len; i = j, j <<= 1) {
366 	    if (UCHAR(heap[j]) > UCHAR(heap[j + 1])) {
367 		j++;
368 	    }
369 	    if (n <= UCHAR(heap[j])) {
370 		break;
371 	    }
372 	    heap[i] = heap[j];
373 	}
374 	heap[i] = n;
375     }
376     *p = '\0';
377 
378     return a[UCHAR(rgx[0])];	/* final? */
379 }
380 
381 static Uint bits[] = {
382     0x00000001L, 0x00000003L, 0x00000007L, 0x0000000fL,
383     0x0000001fL, 0x0000003fL, 0x0000007fL, 0x000000ffL,
384     0x000001ffL, 0x000003ffL, 0x000007ffL, 0x00000fffL,
385     0x00001fffL, 0x00003fffL, 0x00007fffL, 0x0000ffffL,
386     0x0001ffffL, 0x0003ffffL, 0x0007ffffL, 0x000fffffL,
387     0x001fffffL, 0x003fffffL, 0x007fffffL, 0x00ffffffL,
388     0x01ffffffL, 0x03ffffffL, 0x07ffffffL, 0x0fffffffL,
389     0x1fffffffL, 0x3fffffffL, 0x7fffffffL, 0xffffffffL
390 };
391 
392 /*
393  * NAME:	rgxposn->cset()
394  * DESCRIPTION:	create input sets for a position
395  */
rp_cset(rgxposn * rp,Uint * cset)396 static void rp_cset(rgxposn *rp, Uint *cset)
397 {
398     char *p, *q;
399     int c, n, x;
400     bool negate;
401 
402     for (q = rp->chain.name + 2; *q != '\0'; q++) {
403 	memset(cset, '\0', 32);
404 	p = rp->rgx + UCHAR(*q);
405 	switch (*p) {
406 	case '[':
407 	    /* character class */
408 	    p++;
409 	    if (*p == '^') {
410 		negate = TRUE;
411 		p++;
412 	    } else {
413 		negate = FALSE;
414 	    }
415 	    do {
416 		if (*p == '\\') {
417 		    p++;
418 		}
419 		c = UCHAR(*p++);
420 		cset[c >> 5] |= 1 << (c & 31);
421 		if (p[0] == '-' && p[1] != ']') {
422 		    n = UCHAR(p[1]) - c;
423 		    if (n != 0) {
424 			x = 32 - (++c & 31);
425 			if (x > n) {
426 			    x = n;
427 			}
428 			cset[c >> 5] |= bits[x - 1] << (c & 31);
429 			c += x;
430 			n -= x;
431 			while (n >= 32) {
432 			    cset[c >> 5] |= 0xffffffffL;
433 			    c += 32;
434 			    n -= 32;
435 			}
436 			if (n != 0) {
437 			    cset[c >> 5] |= bits[n - 1];
438 			}
439 		    }
440 		    p += 2;
441 		}
442 	    } while (*p != ']');
443 	    if (negate) {
444 		cs_neg(cset);
445 	    }
446 	    break;
447 
448 	case '.':
449 	    /* anything */
450 	    memset(cset, -1, 32);
451 	    break;
452 
453 	case '\\':
454 	    /* escaped char */
455 	    p++;
456 	default:
457 	    /* normal char */
458 	    c = UCHAR(*p);
459 	    cset[c >> 5] |= 1 << (c & 31);
460 	    break;
461 	}
462 
463 	cset += 8;
464     }
465 }
466 
467 /*
468  * NAME:	rgxposn->trans()
469  * DESCRIPTION:	perform a transition on a position, given an input set
470  */
rp_trans(rgxposn * rp,Uint * cset,char * posn,unsigned short * size)471 static bool rp_trans(rgxposn *rp, Uint *cset, char *posn, unsigned short *size)
472 {
473     char trans[256];
474     char *p, *q;
475     int c, n, x;
476     char *t;
477     Uint found;
478     bool negate;
479 
480     t = trans;
481     for (q = rp->chain.name + 2; *q != '\0'; q++) {
482 	p = rp->rgx + UCHAR(*q);
483 	found = 0;
484 	switch (*p) {
485 	case '[':
486 	    /* character class */
487 	    p++;
488 	    if (*p == '^') {
489 		negate = TRUE;
490 		p++;
491 	    } else {
492 		negate = FALSE;
493 	    }
494 	    do {
495 		if (*p == '\\') {
496 		    p++;
497 		}
498 		c = UCHAR(*p++);
499 		found |= cset[c >> 5] & 1 << (c & 31);
500 		if (p[0] == '-' && p[1] != ']') {
501 		    n = UCHAR(p[1]) - c;
502 		    if (n != 0) {
503 			x = 32 - (++c & 31);
504 			if (x > n) {
505 			    x = n;
506 			}
507 			found |= cset[c >> 5] & (bits[x - 1] << (c & 31));
508 			c += x;
509 			n -= x;
510 			while (n >= 32) {
511 			    found |= cset[c >> 5] & 0xffffffffL;
512 			    c += 32;
513 			    n -= 32;
514 			}
515 			if (n != 0) {
516 			    found |= cset[c >> 5] & bits[n - 1];
517 			}
518 		    }
519 		    p += 2;
520 		}
521 	    } while (*p != ']');
522 	    if (negate) {
523 		found = !found;
524 	    }
525 	    break;
526 
527 	case '.':
528 	    /* anything */
529 	    found = 1;
530 	    break;
531 
532 	case '\\':
533 	    /* escaped char */
534 	    p++;
535 	default:
536 	    /* normal char */
537 	    c = UCHAR(*p);
538 	    found = cset[c >> 5] & (1 << (c & 31));
539 	    break;
540 	}
541 	if (found != 0) {
542 	    *t++ = p - rp->rgx + 1;
543 	}
544     }
545     *t = '\0';
546 
547     return rp_transposn(rp->rgx, trans, posn, size);
548 }
549 
550 /*
551  * NAME:	rgxposn->load()
552  * DESCRIPTION:	load a rgxposn from a buffer
553  */
rp_load(hashtab * htab,rpchunk ** c,Uint nposn,char * buf,char * grammar)554 static rgxposn *rp_load(hashtab *htab, rpchunk **c, Uint nposn, char *buf, char *grammar)
555 {
556     char *rgx;
557     unsigned short ruleno, size;
558     bool final;
559 
560     rgx = grammar + (UCHAR(buf[0]) << 8) + UCHAR(buf[1]);
561     ruleno = (UCHAR(buf[2]) << 8) + UCHAR(buf[3]);
562     buf += 4;
563     if (*buf == '\0') {
564 	final = TRUE;
565 	buf++;
566     } else {
567 	final = FALSE;
568     }
569     size = UCHAR(*buf++);
570 
571     return rp_alloc(htab, buf, size, c, rgx, nposn, ruleno, final);
572 }
573 
574 /*
575  * NAME:	rgxposn->save()
576  * DESCRIPTION:	save a rgxposn to a buffer
577  */
rp_save(rgxposn * rp,char * buf,char * grammar)578 static char *rp_save(rgxposn *rp, char *buf, char *grammar)
579 {
580     unsigned short rgx;
581 
582     rgx = rp->rgx - grammar;
583     *buf++ = rgx >> 8;
584     *buf++ = rgx;
585     *buf++ = rp->ruleno >> 8;
586     *buf++ = rp->ruleno;
587     if (rp->final) {
588 	*buf++ = '\0';
589     }
590     *buf++ = rp->size;
591     memcpy(buf, rp->chain.name, rp->size + 3);
592     return buf + rp->size + 3;
593 }
594 
595 
596 typedef struct {
597     union {			/* regexp positions */
598 	rgxposn *e;		/* 1 */
599 	rgxposn **a;		/* > 1 */
600     } posn;
601     union {			/* strings */
602 	unsigned short e[2];	/* 1, 2 */
603 	unsigned short *a;	/* > 2 */
604     } str;
605     char *trans;		/* transitions */
606     unsigned short nposn;	/* number of positions */
607     unsigned short nstr;	/* number of string positions */
608     unsigned short len;		/* string length */
609     unsigned short ntrans;	/* number of transitions */
610     short final;		/* rule number, -1: not final */
611     unsigned short next;	/* next in hash chain */
612     bool alloc;			/* transitions allocated? */
613 } dfastate;
614 
615 # define POSNA(state)	(((state)->nposn == 1) ? \
616 			  &(state)->posn.e : (state)->posn.a)
617 # define STRA(state)	(((state)->nstr <= 2) ? \
618 			  (state)->str.e : (state)->str.a)
619 
620 /*
621  * NAME:	dfastate->hash()
622  * DESCRIPTION:	put a new state in the hash table, or return an old one
623  */
ds_hash(unsigned short * htab,Uint htabsize,dfastate * states,unsigned short idx)624 static unsigned short ds_hash(unsigned short *htab, Uint htabsize, dfastate *states, unsigned short idx)
625 {
626     Uint x;
627     rgxposn **posn;
628     unsigned short n, *str;
629     dfastate *newstate, *ds;
630     unsigned short *dds;
631 
632     /* hash on position and string pointers */
633     newstate = &states[idx];
634     x = newstate->len ^ newstate->final;
635     for (n = newstate->nposn, posn = POSNA(newstate); n > 0; --n) {
636 	x = (x >> 3) ^ (x << 29) ^ (uintptr_t) *posn++;
637     }
638     for (n = newstate->nstr, str = STRA(newstate); n > 0; --n) {
639 	x = (x >> 3) ^ (x << 29) ^ (uintptr_t) *str++;
640     }
641 
642     /* check state hash table */
643     posn = POSNA(newstate);
644     str = STRA(newstate);
645     dds = &htab[(Uint) x % htabsize];
646     ds = &states[*dds];
647     while (ds != states) {
648 	if (newstate->len == ds->len && newstate->final == ds->final &&
649 	    newstate->nposn == ds->nposn && newstate->nstr == ds->nstr &&
650 	    memcmp(posn, POSNA(ds), newstate->nposn * sizeof(rgxposn*)) == 0 &&
651 	    memcmp(str, STRA(ds), newstate->nstr * sizeof(unsigned short)) == 0)
652 	{
653 	    /* state already exists */
654 	    return *dds;
655 	}
656 	dds = &ds->next;
657 	ds = &states[*dds];
658     }
659 
660     newstate->next = *dds;
661     return *dds = idx;
662 }
663 
664 # define TRANS_NONE	0	/* no transitions */
665 # define TRANS_ZERO	1	/* all transitions to state 0 */
666 # define TRANS_STATES	2	/* normal transitions */
667 
668 /*
669  * NAME:	dfastate->load()
670  * DESCRIPTION:	load a dfastate from a buffer
671  */
ds_load(dfastate * state,char * buf,unsigned short ntrans,char * zerotrans)672 static char *ds_load(dfastate *state, char *buf, unsigned short ntrans, char *zerotrans)
673 {
674     state->posn.a = (rgxposn **) NULL;
675     state->str.a = (unsigned short *) NULL;
676     state->nposn = state->nstr = state->len = 0;
677     state->alloc = FALSE;
678     state->final = (UCHAR(buf[0]) << 8) + UCHAR(buf[1]);
679     buf += 2;
680     switch (*buf++) {
681     case TRANS_NONE:
682 	state->ntrans = 0;
683 	break;
684 
685     case TRANS_ZERO:
686 	state->ntrans = 256;
687 	state->trans = zerotrans;
688 	break;
689 
690     case TRANS_STATES:
691 	state->ntrans = ntrans;
692 	state->trans = buf;
693 	buf += ntrans << 1;
694 	break;
695     }
696 
697     return buf;
698 }
699 
700 /*
701  * NAME:	dfastate->loadtmp()
702  * DESCRIPTION:	load dfastate temporary data from a buffer
703  */
ds_loadtmp(dfastate * state,char * sbuf,char * pbuf,hashtab * htab,rpchunk ** c,Uint * nposn,char * grammar)704 static char *ds_loadtmp(dfastate *state, char *sbuf, char *pbuf, hashtab *htab, rpchunk **c, Uint *nposn, char *grammar)
705 {
706     rgxposn *rp, **rrp;
707     unsigned short i, *s;
708     char *posn;
709 
710     state->nposn = (UCHAR(sbuf[0]) << 8) + UCHAR(sbuf[1]);
711     state->nstr = (UCHAR(sbuf[2]) << 8) + UCHAR(sbuf[3]);
712     sbuf += 4;
713     state->len = UCHAR(*sbuf++);
714 
715     if (state->nposn != 0) {
716 	if (state->nposn != 1) {
717 	    rrp = state->posn.a = ALLOC(rgxposn*, state->nposn);
718 	} else {
719 	    rrp = &state->posn.e;
720 	}
721 	for (i = state->nposn; i > 0; --i) {
722 	    posn = pbuf + ((Uint) UCHAR(sbuf[0]) << 16) +
723 		   (UCHAR(sbuf[1]) << 8) + UCHAR(sbuf[2]);
724 	    sbuf += 3;
725 	    rp = *rrp++ = rp_load(htab, c, *nposn, posn, grammar);
726 	    if (rp->nposn == *nposn) {
727 		(*nposn)++;
728 	    }
729 	}
730     }
731     if (state->nstr != 0) {
732 	if (state->nstr > 2) {
733 	    s = state->str.a = ALLOC(unsigned short, state->nstr);
734 	} else {
735 	    s = state->str.e;
736 	}
737 	for (i = state->nstr; i > 0; --i) {
738 	    *s++ = (UCHAR(sbuf[0]) << 8) + UCHAR(sbuf[1]);
739 	    sbuf += 2;
740 	}
741     }
742 
743     return sbuf;
744 }
745 
746 /*
747  * NAME:	dfastate->save()
748  * DESCRIPTION:	save a dfastate to a buffer
749  */
ds_save(dfastate * state,char * buf)750 static char *ds_save(dfastate *state, char *buf)
751 {
752     buf[0] = state->final >> 8;
753     buf[1] = state->final;
754     buf += 2;
755     if (state->ntrans == 0) {
756 	*buf++ = TRANS_NONE;
757     } else if (state->nposn == 0 && state->nstr == 0) {
758 	*buf++ = TRANS_ZERO;
759     } else {
760 	*buf++ = TRANS_STATES;
761 	memcpy(buf, state->trans, state->ntrans << 1);
762 	buf += state->ntrans << 1;
763     }
764 
765     return buf;
766 }
767 
768 /*
769  * NAME:	dfastate->savetmp()
770  * DESCRIPTION:	save dfastate temporary data to a buffer
771  */
ds_savetmp(dfastate * state,char * sbuf,char ** pbuf,char * pbase,Uint * ptab,Uint * nposn,char * grammar)772 static char *ds_savetmp(dfastate *state, char *sbuf, char **pbuf, char *pbase, Uint *ptab, Uint *nposn, char *grammar)
773 {
774     rgxposn *rp, **rrp;
775     unsigned short i, *s;
776     Uint n;
777 
778     *sbuf++ = state->nposn >> 8;
779     *sbuf++ = state->nposn;
780     *sbuf++ = state->nstr >> 8;
781     *sbuf++ = state->nstr;
782     *sbuf++ = state->len;
783 
784     rrp = POSNA(state);
785     for (i = state->nposn; i > 0; --i) {
786 	rp = *rrp++;
787 	if (rp->nposn == *nposn) {
788 	    ptab[(*nposn)++] = (intptr_t) *pbuf - (intptr_t) pbase;
789 	    *pbuf = rp_save(rp, *pbuf, grammar);
790 	}
791 	n = ptab[rp->nposn];
792 	*sbuf++ = n >> 16;
793 	*sbuf++ = n >> 8;
794 	*sbuf++ = n;
795     }
796 
797     s = STRA(state);
798     for (i = state->nstr; i > 0; --i) {
799 	*sbuf++ = *s >> 8;
800 	*sbuf++ = *s++;
801     }
802 
803     return sbuf;
804 }
805 
806 
807 struct _dfa_ {
808     char *source;		/* source grammar */
809     char *grammar;		/* reference grammar */
810     char *strings;		/* offset of strings in grammar */
811     unsigned short nsstrings;	/* # strings in source grammar */
812     short whitespace;		/* whitespace rule or -1 */
813     short nomatch;		/* nomatch rule or -1 */
814 
815     bool modified;		/* dfa modified */
816     bool allocated;		/* dfa strings allocated locally */
817     Uint dfasize;		/* size of state machine */
818     Uint tmpssize;		/* size of temporary state data */
819     Uint tmppsize;		/* size of temporary posn data */
820     char *dfastr;		/* saved dfa */
821     char *tmpstr;		/* saved temporary data */
822 
823     unsigned short nregexp;	/* # regexps */
824     Uint nposn;			/* number of unique positions */
825     rpchunk *rpc;		/* current rgxposn chunk */
826     hashtab *posnhtab;		/* position hash table */
827 
828     unsigned short nstates;	/* # states */
829     unsigned short nexpanded;	/* # expanded states */
830     unsigned short endstates;	/* # states with no valid transitions */
831     Uint sttsize;		/* state table size */
832     Uint sthsize;		/* size of state hash table */
833     dfastate *states;		/* dfa states */
834     unsigned short *sthtab;	/* state hash table */
835 
836     unsigned short ecnum;	/* number of equivalence classes */
837     char *ecsplit;		/* equivalence class split history */
838     char *ecmembers;		/* members per equivalence class */
839     Uint *ecset;		/* equivalence class sets */
840     char eclass[256];		/* equivalence classes */
841 
842     char zerotrans[2 * 256];	/* shared zero transitions */
843 };
844 
845 # define DFA_VERSION	1
846 
847 /*
848  * NAME:	dfa->new()
849  * DESCRIPTION:	create new dfa instance
850  */
dfa_new(char * source,char * grammar)851 dfa *dfa_new(char *source, char *grammar)
852 {
853     char posn[258];
854     unsigned short nstrings;
855     dfa *fa;
856     dfastate *state;
857     bool final;
858 
859     fa = ALLOC(dfa, 1);
860 
861     /* grammar info */
862     fa->source = source;
863     fa->grammar = grammar;
864     fa->whitespace = (UCHAR(grammar[1]) << 8) + UCHAR(grammar[2]);
865     fa->nomatch = (UCHAR(grammar[3]) << 8) + UCHAR(grammar[4]);
866     fa->nregexp = (UCHAR(grammar[5]) << 8) + UCHAR(grammar[6]);
867     fa->nsstrings = (UCHAR(grammar[9]) << 8) + UCHAR(grammar[10]);
868     nstrings = fa->nsstrings + (UCHAR(grammar[11]) << 8) + UCHAR(grammar[12]);
869     fa->strings = grammar + 17 + (fa->nregexp << 1);
870 
871     /* size info */
872     fa->modified = TRUE;
873     fa->allocated = FALSE;
874     fa->dfasize = 8 + 256 + 3;		/* header + eclasses + state 1 */
875     fa->tmpssize = 4 + 1 + 5;		/* header + ecsplit + state 1 */
876     fa->tmppsize = 0;
877     fa->dfastr = (char *) NULL;
878     fa->tmpstr = (char *) NULL;
879 
880     /* equivalence classes */
881     fa->ecnum = 1;
882     fa->ecsplit = ALLOC(char, 256 + 256 + 32 * 256);
883     fa->ecmembers = fa->ecsplit + 256;
884     fa->ecset = (Uint *) (fa->ecmembers + 256);
885     memset(fa->eclass, '\0', 256);
886     memset(fa->ecmembers, '\0', 256);
887     memset(fa->ecset, -1, 32);
888     memset(fa->ecset + 8, '\0', 32 * 255);
889 
890     /* positions */
891     fa->nposn = (UCHAR(grammar[7]) << 8) + UCHAR(grammar[8]);
892     fa->rpc = (rpchunk *) NULL;
893     fa->posnhtab = ht_new((fa->nposn + 1) << 2, 257, FALSE);
894 
895     /* states */
896     fa->nstates = 2;
897     fa->sttsize = (Uint) (fa->nposn + nstrings + 1) << 1;
898     fa->sthsize = (Uint) fa->sttsize << 1;
899     fa->nexpanded = 0;
900     fa->endstates = 1;
901     fa->states = ALLOC(dfastate, fa->sttsize);
902     fa->sthtab = ALLOC(unsigned short, fa->sthsize);
903     memset(fa->sthtab, '\0', sizeof(unsigned short) * fa->sthsize);
904 
905     /* initial states */
906     state = &fa->states[0];
907     state->posn.a = (rgxposn **) NULL;
908     state->str.a = (unsigned short *) NULL;
909     state->trans = (char *) NULL;
910     state->nposn = state->nstr = 0;
911     state->ntrans = state->len = 0;
912     (state++)->final = -1;
913     state->posn.a = (fa->nposn > 1) ?
914 		     ALLOC(rgxposn*, fa->nposn) : (rgxposn **) NULL;
915     state->str.a = (nstrings > 2) ?
916 		    ALLOC(unsigned short, nstrings) : (unsigned short *) NULL;
917     state->trans = (char *) NULL;
918     state->nposn = fa->nposn;
919     state->nstr = nstrings;
920     state->ntrans = state->len = 0;
921     state->final = -1;
922     state->alloc = FALSE;
923     grammar += 17;
924     /* initial positions */
925     if (state->nposn == 0 && state->nstr == 0) {
926 	/* no valid transitions from initial state */
927 	state->ntrans = 256;
928 	state->trans = fa->zerotrans;
929 	fa->endstates++;
930     } else {
931 	rgxposn **rrp;
932 	unsigned short i, j, n, *s;
933 	char *rgx;
934 	unsigned short size;
935 
936 	rrp = POSNA(state);
937 	for (i = j = 0; i < fa->nregexp; i++) {
938 	    rgx = fa->grammar + (UCHAR(grammar[0]) << 8) + UCHAR(grammar[1]);
939 	    grammar += 2;
940 	    n = j + (UCHAR(rgx[0]) << 8) + UCHAR(rgx[1]);
941 	    rgx += 2;
942 	    while (j < n) {
943 		final = rp_transposn(rgx, (char *) NULL, posn + 2, &size);
944 		if (final && state->final < 0) {
945 		    state->final = i;
946 		}
947 		posn[0] = 1 + j / 255;
948 		posn[1] = 1 + j % 255;
949 		*rrp++ = rp_new(fa->posnhtab, posn, size, &fa->rpc, rgx,
950 				(Uint) j++, i, final);
951 		fa->tmpssize += 3;
952 		fa->tmppsize += 8 + size + final;
953 		rgx += UCHAR(rgx[0]) + 1;
954 	    }
955 	}
956 	/* initial strings */
957 	for (i = 0, s = STRA(state); i < nstrings; i++) {
958 	    *s++ = i;
959 	}
960 	fa->tmpssize += nstrings << 1;
961     }
962     /* add to hashtable */
963     ds_hash(fa->sthtab, fa->sthsize, fa->states, 1);
964 
965     /* zero transitions */
966     memset(fa->zerotrans, '\0', 2 * 256);
967 
968     return fa;
969 }
970 
971 /*
972  * NAME:	dfa->del()
973  * DESCRIPTION:	delete a dfa instance
974  */
dfa_del(dfa * fa)975 void dfa_del(dfa *fa)
976 {
977     dfastate *state;
978     unsigned short i;
979 
980     if (fa->allocated) {
981 	FREE(fa->dfastr);
982     }
983     if (fa->ecsplit != (char *) NULL) {
984 	FREE(fa->ecsplit);
985     }
986     if (fa->rpc != (rpchunk *) NULL) {
987 	rp_clear(fa->rpc);
988     }
989     if (fa->posnhtab != (hashtab *) NULL) {
990 	ht_del(fa->posnhtab);
991     }
992     for (i = fa->nstates, state = &fa->states[1]; --i > 0; state++) {
993 	if (state->nposn > 1) {
994 	    FREE(state->posn.a);
995 	}
996 	if (state->nstr > 2) {
997 	    FREE(state->str.a);
998 	}
999 	if (state->alloc) {
1000 	    FREE(state->trans);
1001 	}
1002     }
1003     FREE(fa->states);
1004     if (fa->sthtab != (unsigned short *) NULL) {
1005 	FREE(fa->sthtab);
1006     }
1007     FREE(fa);
1008 }
1009 
1010 /*
1011  * NAME:	dfa->extend()
1012  * DESCRIPTION:	extend transition table
1013  */
dfa_extend(dfa * fa,dfastate * state,unsigned short limit)1014 static void dfa_extend(dfa *fa, dfastate *state, unsigned short limit)
1015 {
1016     char *p, *q;
1017     unsigned short i;
1018 
1019     /* extend transition table */
1020     if (!state->alloc) {
1021 	p = ALLOC(char, 2 * 256);
1022 	memcpy(p, state->trans, state->ntrans << 1);
1023 	state->trans = p;
1024 	state->alloc = TRUE;
1025     }
1026     p = state->trans + (state->ntrans << 1);
1027     for (i = state->ntrans; i <= limit; i++) {
1028 	q = &state->trans[UCHAR(fa->ecsplit[i]) << 1];
1029 	*p++ = *q++;
1030 	*p++ = *q;
1031     }
1032     state->ntrans = i;
1033 }
1034 
1035 /*
1036  * state & eclass format:
1037  *
1038  * header	[0]	version number
1039  *		[x][y]	# states
1040  *		[x][y]	# expanded states
1041  *		[x][y]	# end states
1042  *		[x]	# equivalence classes
1043  * eclass	[...]	1 - 256 equivalence classes
1044  *
1045  * state	[x][y]	final				} ...
1046  *		[...]	optional: transitions		}
1047  *
1048  *
1049  * temporary data format:
1050  *
1051  * header	[0]	version number
1052  *		[x][y]	number of positions
1053  * ecsplit	[...]	256 ecsplit data
1054  *
1055  * state	[x][y]	# positions			}
1056  *		[x][y]	# strings			}
1057  *		[x]	len				} ...
1058  *		[...]	position data			}
1059  *		[...]	string data			}
1060  *
1061  * position	[x][y]	regexp			}
1062  *		[x][y]	ruleno			}
1063  *		[0]	optional: final position	} ...
1064  *		[x]	size				}
1065  *		[...]	position data			}
1066  */
1067 
1068 /*
1069  * NAME:	dfa->load()
1070  * DESCRIPTION:	load dfa from string
1071  */
dfa_load(char * source,char * grammar,char * str,Uint len)1072 dfa *dfa_load(char *source, char *grammar, char *str, Uint len)
1073 {
1074     dfa *fa;
1075     dfastate *state;
1076     unsigned short i;
1077     char *buf;
1078     unsigned short nstrings;
1079 
1080     if (str[0] != DFA_VERSION) {
1081 	return dfa_new(source, grammar);
1082     }
1083 
1084     fa = ALLOC(dfa, 1);
1085     fa->dfastr = buf = str;
1086 
1087     /* grammar info */
1088     fa->source = source;
1089     fa->grammar = grammar;
1090     fa->whitespace = (UCHAR(grammar[1]) << 8) + UCHAR(grammar[2]);
1091     fa->nomatch = (UCHAR(grammar[3]) << 8) + UCHAR(grammar[4]);
1092     fa->nregexp = (UCHAR(grammar[5]) << 8) + UCHAR(grammar[6]);
1093     fa->nsstrings = (UCHAR(grammar[9]) << 8) + UCHAR(grammar[10]);
1094     nstrings = fa->nsstrings + (UCHAR(grammar[11]) << 8) + UCHAR(grammar[12]);
1095     fa->strings = grammar + 17 + (fa->nregexp << 1);
1096 
1097     /* positions */
1098     fa->nposn = (UCHAR(grammar[7]) << 8) + UCHAR(grammar[8]);
1099     fa->rpc = (rpchunk *) NULL;
1100     fa->posnhtab = (hashtab *) NULL;
1101 
1102     /* states 1 */
1103     fa->nstates = (UCHAR(buf[1]) << 8) + UCHAR(buf[2]);
1104     fa->nexpanded = (UCHAR(buf[3]) << 8) + UCHAR(buf[4]);
1105     fa->endstates = (UCHAR(buf[5]) << 8) + UCHAR(buf[6]);
1106     fa->sttsize = fa->nstates + 1;
1107     fa->sthsize = (Uint) (fa->nposn + nstrings + 1) << 2;
1108     fa->states = ALLOC(dfastate, fa->sttsize);
1109     fa->sthtab = (unsigned short *) NULL;
1110 
1111     /* equivalence classes */
1112     fa->ecnum = UCHAR(buf[7]) + 1;
1113     buf += 8;
1114     memcpy(fa->eclass, buf, 256);
1115     buf += 256;
1116     fa->ecsplit = (char *) NULL;
1117     fa->ecmembers = (char *) NULL;
1118     fa->ecset = (Uint *) NULL;
1119 
1120     /* states 2 */
1121     fa->states[0].posn.a = (rgxposn **) NULL;
1122     fa->states[0].str.a = (unsigned short *) NULL;
1123     fa->states[0].trans = (char *) NULL;
1124     fa->states[0].nposn = fa->states[0].nstr = 0;
1125     fa->states[0].ntrans = fa->states[0].len = 0;
1126     fa->states[0].final = -1;
1127     for (i = fa->nstates, state = &fa->states[1]; --i > 0; state++) {
1128 	buf = ds_load(state, buf, fa->ecnum, fa->zerotrans);
1129     }
1130 
1131     /* temporary data */
1132     fa->tmpstr = buf;
1133 
1134     /* size info */
1135     fa->dfasize = (intptr_t) buf - (intptr_t) str;
1136     fa->tmpssize = 0;
1137     fa->tmppsize = len - fa->dfasize;
1138     fa->modified = fa->allocated = FALSE;
1139 
1140     /* zero transitions */
1141     memset(fa->zerotrans, '\0', 2 * 256);
1142 
1143     return fa;
1144 }
1145 
1146 /*
1147  * NAME:	dfa->loadtmp()
1148  * DESCRIPTION:	load dfa tmp info
1149  */
dfa_loadtmp(dfa * fa)1150 static void dfa_loadtmp(dfa *fa)
1151 {
1152     dfastate *state;
1153     unsigned short i;
1154     int c;
1155     char *buf;
1156 
1157     buf = fa->tmpstr;
1158     fa->nposn = ((Uint) UCHAR(buf[1]) << 16) + (UCHAR(buf[2]) << 8) +
1159 		UCHAR(buf[3]);
1160     buf += 4;
1161 
1162     /* equivalence classes */
1163     fa->ecsplit = ALLOC(char, 256 + 256 + 32 * 256);
1164     fa->ecmembers = fa->ecsplit + 256;
1165     fa->ecset = (Uint *) (fa->ecmembers + 256);
1166     memcpy(fa->ecsplit, buf, fa->ecnum);
1167     buf += fa->ecnum;
1168     memset(fa->ecmembers, '\0', 256);
1169     memset(fa->ecset, '\0', 32 * 256);
1170     for (i = 256; i > 0; ) {
1171 	--i;
1172 	c = UCHAR(fa->eclass[i]);
1173 	fa->ecmembers[c]++;
1174 	fa->ecset[(c << 3) + (i >> 5)] |= 1 << (i & 31);
1175     }
1176 
1177     /* positions */
1178     fa->posnhtab = ht_new((fa->nposn + 1) << 2, 257, FALSE);
1179 
1180     /* states */
1181     fa->sthtab = ALLOC(unsigned short, fa->sthsize);
1182     memset(fa->sthtab, '\0', sizeof(unsigned short) * fa->sthsize);
1183 
1184     fa->nposn = 0;
1185     for (i = 1, state = &fa->states[1]; i < fa->nstates; i++, state++) {
1186 	buf = ds_loadtmp(state, buf, fa->tmpstr, fa->posnhtab, &fa->rpc,
1187 			 &fa->nposn, fa->grammar);
1188 	ds_hash(fa->sthtab, fa->sthsize, fa->states, i);
1189     }
1190 
1191     /* size info */
1192     fa->tmpssize = (intptr_t) buf - (intptr_t) fa->tmpstr;
1193     fa->tmppsize -= fa->tmpssize;
1194 }
1195 
1196 /*
1197  * NAME:	dfa->save()
1198  * DESCRIPTION:	save dfa to string
1199  */
dfa_save(dfa * fa,char ** str,Uint * len)1200 bool dfa_save(dfa *fa, char **str, Uint *len)
1201 {
1202     unsigned short i;
1203     char *buf;
1204     dfastate *state;
1205     char *pbuf;
1206     Uint *ptab, nposn;
1207 
1208     if (!fa->modified) {
1209 	*str = fa->dfastr;
1210 	*len = fa->dfasize + fa->tmpssize + fa->tmppsize;
1211 	return FALSE;
1212     }
1213 
1214     if (fa->nstates == fa->nexpanded + fa->endstates) {
1215 	fa->tmpssize = fa->tmppsize = 0;
1216     }
1217     if (fa->allocated) {
1218 	FREE(fa->dfastr);
1219     }
1220     fa->dfastr = buf = *str =
1221 		 ALLOC(char, *len = fa->dfasize + fa->tmpssize + fa->tmppsize);
1222     *buf++ = DFA_VERSION;
1223     *buf++ = fa->nstates >> 8;
1224     *buf++ = fa->nstates;
1225     *buf++ = fa->nexpanded >> 8;
1226     *buf++ = fa->nexpanded;
1227     *buf++ = fa->endstates >> 8;
1228     *buf++ = fa->endstates;
1229     *buf++ = fa->ecnum - 1;
1230     memcpy(buf, fa->eclass, 256);
1231     buf += 256;
1232 
1233     for (i = fa->nstates, state = &fa->states[1]; --i > 0; state++) {
1234 	if (state->ntrans != 0 && state->ntrans < fa->ecnum) {
1235 	    dfa_extend(fa, state, fa->ecnum - 1);
1236 	}
1237 	buf = ds_save(state, buf);
1238     }
1239 
1240     fa->modified = FALSE;
1241     fa->allocated = TRUE;
1242     if (fa->tmpssize + fa->tmppsize == 0) {
1243 	/* no tmp data */
1244 	return TRUE;
1245     }
1246 
1247     fa->tmpstr = buf;
1248     pbuf = buf + fa->tmpssize;
1249     *buf++ = 0;
1250     *buf++ = fa->nposn >> 16;
1251     *buf++ = fa->nposn >> 8;
1252     *buf++ = fa->nposn;
1253     memcpy(buf, fa->ecsplit, fa->ecnum);
1254     buf += fa->ecnum;
1255 
1256     ptab = ALLOCA(Uint, fa->nposn);
1257     nposn = 0;
1258     for (i = fa->nstates, state = &fa->states[1]; --i > 0; state++) {
1259 	buf = ds_savetmp(state, buf, &pbuf, fa->tmpstr, ptab, &nposn,
1260 			 fa->grammar);
1261     }
1262     AFREE(ptab);
1263 
1264     return TRUE;
1265 }
1266 
1267 /*
1268  * NAME:	dfa->ecsplit()
1269  * DESCRIPTION:	split up equivalence classes along the borders of character
1270  *		sets
1271  */
dfa_ecsplit(dfa * fa,Uint * iset,Uint * cset,Uint ncset)1272 static void dfa_ecsplit(dfa *fa, Uint *iset, Uint *cset, Uint ncset)
1273 {
1274     Uint ec1[8], ec2[8];
1275     Uint i;
1276     int n, c;
1277 
1278     for (c = cs_firstc(iset, 0); c >= 0; c = cs_firstc(iset, c + 1)) {
1279 	for (i = 0; i < ncset; i++) {
1280 	    /*
1281 	     * get the equivalence class of the first char in the input set
1282 	     */
1283 	    n = UCHAR(fa->eclass[c]);
1284 	    if (fa->ecmembers[n] == 1) {
1285 		break;	/* only one character left */
1286 	    }
1287 	    if (cs_overlap(fa->ecset + (n << 3), cset, ec1, ec2)) {
1288 		/*
1289 		 * create new equivalence class
1290 		 */
1291 		memcpy(fa->ecset + (n << 3), ec1, sizeof(ec1));
1292 		memcpy(fa->ecset + (fa->ecnum << 3), ec2, sizeof(ec2));
1293 		fa->ecsplit[fa->ecnum] = n;
1294 		fa->ecmembers[n] -= fa->ecmembers[fa->ecnum] =
1295 				    cs_eclass(ec2, fa->eclass, fa->ecnum);
1296 		fa->ecnum++;
1297 		fa->dfasize += fa->nexpanded << 1;
1298 		fa->tmpssize++;
1299 	    }
1300 	    cset += 8;
1301 	}
1302 	cset -= i << 3;
1303 
1304 	/* remove from input set */
1305 	cs_sub(iset, fa->ecset + (UCHAR(fa->eclass[c]) << 3));
1306     }
1307 }
1308 
1309 /*
1310  * NAME:	dfa->newstate()
1311  * DESCRIPTION:	get the positions and strings for a new state
1312  */
dfa_newstate(dfa * fa,dfastate * state,dfastate * newstate,Uint * ecset,Uint * cset)1313 static unsigned short dfa_newstate(dfa *fa, dfastate *state, dfastate *newstate, Uint *ecset, Uint *cset)
1314 {
1315     char posn[130];
1316     unsigned short i, n, *s;
1317     rgxposn *rp, **rrp;
1318     char *p;
1319     unsigned short size;
1320     Uint posnsize;
1321     bool final;
1322 
1323     newstate->trans = (char *) NULL;
1324     newstate->nposn = newstate->nstr = newstate->ntrans = 0;
1325     newstate->len = state->len + 1;
1326     newstate->final = -1;
1327     newstate->alloc = FALSE;
1328     posnsize = 0;
1329 
1330     /* positions */
1331     for (i = state->nposn, rrp = POSNA(state); i > 0; --i, rrp++) {
1332 	rp = *rrp;
1333 	for (n = rp->size; n > 0; --n) {
1334 	    if (cs_intersect(ecset, cset)) {
1335 		final = rp_trans(rp, ecset, posn + 2, &size);
1336 		if (size != 0) {
1337 		    posn[0] = rp->chain.name[0];
1338 		    posn[1] = rp->chain.name[1];
1339 		    rp = rp_new(fa->posnhtab, posn, size, &fa->rpc, rp->rgx,
1340 				fa->nposn, rp->ruleno, final);
1341 		    if (rp->nposn == fa->nposn) {
1342 			/* new position */
1343 			fa->nposn++;
1344 			posnsize += 8 + rp->size + final;
1345 		    }
1346 		    newstate->posn.a[newstate->nposn++] = rp;
1347 		}
1348 		if (final && newstate->final < 0) {
1349 		    newstate->final = rp->ruleno;
1350 		}
1351 		cset += n << 3;
1352 		break;
1353 	    }
1354 	    cset += 8;
1355 	}
1356     }
1357 
1358     /* strings */
1359     for (i = state->nstr, s = STRA(state); i > 0; --i, s++) {
1360 	if (*s < fa->nsstrings) {
1361 	    p = fa->strings + (*s << 2);
1362 	    n = UCHAR(fa->source[(UCHAR(p[0]) << 16) + (UCHAR(p[1]) << 8) +
1363 				 UCHAR(p[2]) + state->len]);
1364 	    p += 3;
1365 	} else {
1366 	    p = fa->strings + (fa->nsstrings << 2) +
1367 		((*s - fa->nsstrings) << 1);
1368 	    p = fa->grammar + (UCHAR(p[0]) << 8) + UCHAR(p[1]);
1369 	    n = UCHAR(p[newstate->len]);
1370 	}
1371 	if (ecset[n >> 5] & (1 << (n & 31))) {
1372 	    if (newstate->len == UCHAR(p[0])) {
1373 		/* end of string */
1374 		newstate->final = fa->nregexp + *s;
1375 	    } else {
1376 		/* add string */
1377 		newstate->str.a[newstate->nstr++] = *s;
1378 	    }
1379 	}
1380     }
1381 
1382     return posnsize;
1383 }
1384 
1385 /*
1386  * NAME:	dfa->expand()
1387  * DESCRIPTION:	expand a state
1388  */
dfa_expand(dfa * fa,dfastate * state)1389 static dfastate *dfa_expand(dfa *fa, dfastate *state)
1390 {
1391     Uint iset[8];
1392     Uint *cset, *ecset, ncset;
1393     rgxposn **rrp;
1394     unsigned short i, n, *s;
1395     char *p;
1396     dfastate *newstate;
1397     rgxposn **newposn;
1398     unsigned short *newstr;
1399     Uint size;
1400 
1401     newposn = NULL;
1402     newstr = 0;
1403 
1404     if (fa->posnhtab == (hashtab *) NULL) {
1405 	dfa_loadtmp(fa);	/* load tmp info */
1406     }
1407 
1408     memset(iset, '\0', sizeof(iset));
1409 
1410     /* allocate character sets for strings and positions */
1411     ncset = state->nstr;
1412     for (i = state->nposn, rrp = POSNA(state); i > 0; --i, rrp++) {
1413 	ncset += (*rrp)->size;
1414     }
1415     cset = ALLOCA(Uint, ncset << 3);
1416 
1417     /* construct character sets for all string chars */
1418     for (i = state->nstr, s = STRA(state); i > 0; --i, s++) {
1419 	if (*s < fa->nsstrings) {
1420 	    p = fa->strings + (*s << 2);
1421 	    p = fa->source + (UCHAR(p[0]) << 16) + (UCHAR(p[1]) << 8) +
1422 		UCHAR(p[2]);
1423 	} else {
1424 	    p = fa->strings + (fa->nsstrings << 2) +
1425 		((*s - fa->nsstrings) << 1);
1426 	    p = fa->grammar + (UCHAR(p[0]) << 8) + UCHAR(p[1]) + 1;
1427 	}
1428 	n = UCHAR(p[state->len]);
1429 	memset(cset, '\0', 32);
1430 	cset[n >> 5] |= 1 << (n & 31);
1431 	iset[n >> 5] |= 1 << (n & 31);	/* also add to input set */
1432 	cset += 8;
1433     }
1434 
1435     /* construct character sets for all positions */
1436     for (i = state->nposn, rrp = POSNA(state); i > 0; --i, rrp++) {
1437 	rp_cset(*rrp, cset);
1438 	for (n = (*rrp)->size; n > 0; --n) {
1439 	    cs_or(iset, cset);		/* add to input set */
1440 	    cset += 8;
1441 	}
1442     }
1443     cset -= ncset << 3;
1444 
1445     /*
1446      * adjust equivalence classes
1447      */
1448     dfa_ecsplit(fa, iset, cset, ncset);
1449 
1450     /*
1451      * for all equivalence classes, compute transition states
1452      */
1453     if (state->nposn != 0) {
1454 	newposn = ALLOCA(rgxposn*, state->nposn);
1455     }
1456     if (state->nstr != 0) {
1457 	newstr = ALLOCA(unsigned short, state->nstr);
1458     }
1459     p = state->trans = ALLOC(char, 2 * 256);
1460     state->ntrans = fa->ecnum;
1461     state->alloc = TRUE;
1462     cset += (Uint) state->nstr << 3;
1463     for (i = fa->ecnum, ecset = fa->ecset; i > 0; --i, ecset += 8) {
1464 	/* prepare new state */
1465 	newstate = &fa->states[fa->nstates];
1466 
1467 	/* flesh out new state */
1468 	newstate->posn.a = newposn;
1469 	newstate->str.a = newstr;
1470 	size = dfa_newstate(fa, state, newstate, ecset, cset);
1471 
1472 	if (newstate->nposn == 0 && newstate->nstr == 0 && newstate->final < 0)
1473 	{
1474 	    /* stuck in state 0 */
1475 	    n = 0;
1476 	} else {
1477 	    if (newstate->nposn <= 1) {
1478 		if (newstate->nposn == 0) {
1479 		    newstate->posn.a = (rgxposn **) NULL;
1480 		} else {
1481 		    newstate->posn.e = newposn[0];
1482 		}
1483 	    }
1484 	    if (newstate->nstr <= 2) {
1485 		if (newstate->nstr == 0) {
1486 		    newstate->str.a = (unsigned short *) NULL;
1487 		    newstate->len = 0;
1488 		} else {
1489 		    memcpy(newstate->str.e, newstr, 2 * sizeof(unsigned short));
1490 		}
1491 	    }
1492 
1493 	    n = ds_hash(fa->sthtab, fa->sthsize, fa->states,
1494 			(unsigned short) fa->nstates);
1495 	    if (n == fa->nstates) {
1496 		/*
1497 		 * genuinely new state
1498 		 */
1499 		if (newstate->nposn > 1) {
1500 		    newstate->posn.a = ALLOC(rgxposn*, newstate->nposn);
1501 		    memcpy(newstate->posn.a, newposn,
1502 			   newstate->nposn * sizeof(rgxposn*));
1503 		}
1504 		if (newstate->nstr > 2) {
1505 		    newstate->str.a = ALLOC(unsigned short, newstate->nstr);
1506 		    memcpy(newstate->str.a, newstr,
1507 			   newstate->nstr * sizeof(unsigned short));
1508 		}
1509 		if (newstate->nposn == 0 && newstate->nstr == 0) {
1510 		    newstate->ntrans = 256;
1511 		    newstate->trans = fa->zerotrans;
1512 		    fa->endstates++;
1513 		}
1514 		fa->dfasize += 3;
1515 		fa->tmpssize += 5 + newstate->nposn * 3 + newstate->nstr * 2;
1516 		fa->tmppsize += size;
1517 
1518 		if (++fa->nstates == fa->sttsize) {
1519 		    /* grow table */
1520 		    size = state - fa->states;
1521 		    fa->states = REALLOC(fa->states, dfastate, fa->nstates,
1522 					 fa->sttsize <<= 1);
1523 		    state = fa->states + size;
1524 		}
1525 	    }
1526 	}
1527 
1528 	*p++ = n >> 8;
1529 	*p++ = n;
1530     }
1531 
1532     if (state->nstr != 0) {
1533 	AFREE(newstr);
1534     }
1535     if (state->nposn != 0) {
1536 	AFREE(newposn);
1537     }
1538     AFREE(cset - ((Uint) state->nstr << 3));
1539 
1540     fa->modified = TRUE;
1541     fa->nexpanded++;
1542     fa->dfasize += fa->ecnum << 1;
1543     return state;
1544 }
1545 
1546 /*
1547  * NAME:	dfa->scan()
1548  * DESCRIPTION:	Scan input, while lazily constructing a DFA.
1549  *		Return values:	[0 ..>	token
1550  *				-1	end of string
1551  *				-2	Invalid token
1552  *				-3	DFA too large (deallocate)
1553  */
dfa_scan(dfa * fa,string * str,ssizet * strlen,char ** token,ssizet * len)1554 short dfa_scan(dfa *fa, string *str, ssizet *strlen, char **token, ssizet *len)
1555 {
1556     ssizet size;
1557     unsigned short eclass;
1558     char *p, *q;
1559     dfastate *state;
1560     short final;
1561     ssizet fsize, nomatch;
1562 
1563     nomatch = 0;
1564     fsize = 0;
1565     size = *strlen;
1566     *token = str->text + str->len - size;
1567 
1568     while (size != 0) {
1569 	state = &fa->states[1];
1570 	final = -1;
1571 	p = str->text + str->len - size;
1572 
1573 	while (size != 0) {
1574 	    eclass = UCHAR(fa->eclass[UCHAR(*p)]);
1575 	    if (state->ntrans <= eclass) {
1576 		if (state->ntrans == 0) {
1577 		    /* expand state */
1578 		    if (state == fa->states) {
1579 			break;	/* stuck in state 0 */
1580 		    }
1581 		    state = dfa_expand(fa, state);
1582 		    if (fa->dfasize + fa->tmpssize + fa->tmppsize >
1583 					    (Uint) MAX_AUTOMSZ * USHRT_MAX) {
1584 			unsigned short save;
1585 
1586 			/*
1587 			 * too much temporary data: attempt to expand
1588 			 * all states
1589 			 */
1590 			save = state - fa->states;
1591 			for (state = &fa->states[1];
1592 			     fa->nstates != fa->nexpanded + fa->endstates;
1593 			     state++) {
1594 			    if (fa->nstates > USHRT_MAX - 256 ||
1595 				fa->dfasize > (Uint) MAX_AUTOMSZ * USHRT_MAX) {
1596 				return DFA_TOOBIG;
1597 			    }
1598 			    if (state->ntrans == 0) {
1599 				state = dfa_expand(fa, state);
1600 			    }
1601 			}
1602 			state = &fa->states[save];
1603 		    }
1604 		    if (fa->nstates > USHRT_MAX - 256 ||
1605 			fa->dfasize > (Uint) MAX_AUTOMSZ * USHRT_MAX) {
1606 			return DFA_TOOBIG;
1607 		    }
1608 		    eclass = UCHAR(fa->eclass[UCHAR(*p)]);
1609 		} else {
1610 		    /* extend transition table */
1611 		    dfa_extend(fa, state, eclass);
1612 		}
1613 	    }
1614 
1615 	    /* transition */
1616 	    --size;
1617 	    p++;
1618 	    q = &state->trans[eclass << 1];
1619 	    state = &fa->states[(UCHAR(q[0]) << 8) + UCHAR(q[1])];
1620 
1621 	    /* check if final state */
1622 	    if (state->final >= 0) {
1623 		final = state->final;
1624 		fsize = size;
1625 	    }
1626 	}
1627 
1628 	if (final >= 0) {
1629 	    if (nomatch != 0) {
1630 		if (fa->nomatch != fa->whitespace) {
1631 		    *len = nomatch;
1632 		    return fa->nomatch;
1633 		}
1634 		*token += nomatch;
1635 		nomatch = 0;
1636 	    }
1637 
1638 	    /* in a final state */
1639 	    size = fsize;
1640 	    if (final != fa->whitespace) {
1641 		*len = *strlen - size;
1642 		*strlen = size;
1643 		return final;
1644 	    }
1645 	    /* else whitespace: continue */
1646 	    *token = p - 1;
1647 	    *strlen = size;
1648 	} else if (fa->nomatch >= 0) {
1649 	    nomatch++;
1650 	    size = --*strlen;
1651 	} else {
1652 	    return DFA_REJECT;
1653 	}
1654     }
1655 
1656     if (nomatch != 0 && fa->nomatch != fa->whitespace) {
1657 	*len = nomatch;
1658 	return fa->nomatch;
1659     }
1660 
1661     return DFA_EOS;
1662 }
1663