1 /*
2 * This file is part of DGD, https://github.com/dworkin/dgd
3 * Copyright (C) 1993-2010 Dworkin B.V.
4 * Copyright (C) 2010,2012 DGD Authors (see the commit log for details)
5 *
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Affero General Public License as
8 * published by the Free Software Foundation, either version 3 of the
9 * License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Affero General Public License for more details.
15 *
16 * You should have received a copy of the GNU Affero General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 # include "dgd.h"
21 # include "hash.h"
22 # include "str.h"
23 # include "dfa.h"
24
25 /*
26 * NAME: charset->neg()
27 * DESCRIPTION: negate a charset
28 */
cs_neg(Uint * cs)29 static void cs_neg(Uint *cs)
30 {
31 *cs++ ^= 0xffffffffL;
32 *cs++ ^= 0xffffffffL;
33 *cs++ ^= 0xffffffffL;
34 *cs++ ^= 0xffffffffL;
35 *cs++ ^= 0xffffffffL;
36 *cs++ ^= 0xffffffffL;
37 *cs++ ^= 0xffffffffL;
38 *cs ^= 0xffffffffL;
39 }
40
41 /*
42 * NAME: charset->or()
43 * DESCRIPTION: or two charsets
44 */
cs_or(Uint * cs1,Uint * cs2)45 static void cs_or(Uint *cs1, Uint *cs2)
46 {
47 *cs1++ |= *cs2++;
48 *cs1++ |= *cs2++;
49 *cs1++ |= *cs2++;
50 *cs1++ |= *cs2++;
51 *cs1++ |= *cs2++;
52 *cs1++ |= *cs2++;
53 *cs1++ |= *cs2++;
54 *cs1 |= *cs2;
55 }
56
57 /*
58 * NAME: charset->sub()
59 * DESCRIPTION: subtract a charset from another one
60 */
cs_sub(Uint * cs1,Uint * cs2)61 static void cs_sub(Uint *cs1, Uint *cs2)
62 {
63 *cs1++ &= ~*cs2++;
64 *cs1++ &= ~*cs2++;
65 *cs1++ &= ~*cs2++;
66 *cs1++ &= ~*cs2++;
67 *cs1++ &= ~*cs2++;
68 *cs1++ &= ~*cs2++;
69 *cs1++ &= ~*cs2++;
70 *cs1 &= ~*cs2;
71 }
72
73 /*
74 * NAME: charset->intersect()
75 * DESCRIPTION: return TRUE if two character sets intersect, FALSE otherwise
76 */
cs_intersect(Uint * cs1,Uint * cs2)77 static bool cs_intersect(Uint *cs1, Uint *cs2)
78 {
79 Uint i;
80
81 i = *cs1++ & *cs2++;
82 i |= *cs1++ & *cs2++;
83 i |= *cs1++ & *cs2++;
84 i |= *cs1++ & *cs2++;
85 i |= *cs1++ & *cs2++;
86 i |= *cs1++ & *cs2++;
87 i |= *cs1++ & *cs2++;
88 i |= *cs1 & *cs2;
89
90 return (i != 0);
91 }
92
93 /*
94 * NAME: charset->overlap()
95 * DESCRIPTION: Check if two character sets overlap. Return TRUE if they do,
96 * or if the first set contains the second one.
97 */
cs_overlap(Uint * cs1,Uint * cs2,Uint * cs3,Uint * cs4)98 static bool cs_overlap(Uint *cs1, Uint *cs2, Uint *cs3, Uint *cs4)
99 {
100 Uint s3, s4;
101
102 s3 = *cs3 = *cs1 & *cs2++; s4 = *cs4++ = *cs1++ & ~*cs3++;
103 s3 |= *cs3 = *cs1 & *cs2++; s4 |= *cs4++ = *cs1++ & ~*cs3++;
104 s3 |= *cs3 = *cs1 & *cs2++; s4 |= *cs4++ = *cs1++ & ~*cs3++;
105 s3 |= *cs3 = *cs1 & *cs2++; s4 |= *cs4++ = *cs1++ & ~*cs3++;
106 s3 |= *cs3 = *cs1 & *cs2++; s4 |= *cs4++ = *cs1++ & ~*cs3++;
107 s3 |= *cs3 = *cs1 & *cs2++; s4 |= *cs4++ = *cs1++ & ~*cs3++;
108 s3 |= *cs3 = *cs1 & *cs2++; s4 |= *cs4++ = *cs1++ & ~*cs3++;
109 s3 |= *cs3 = *cs1 & *cs2; s4 |= *cs4 = *cs1 & ~*cs3;
110
111 return (s3 != 0 && s4 != 0);
112 }
113
114 /*
115 * NAME: charset->firstc()
116 * DESCRIPTION: find the first char in a charset
117 */
cs_firstc(Uint * cset,int c)118 static int cs_firstc(Uint *cset, int c)
119 {
120 Uint x;
121
122 while (c < 256) {
123 if ((x=cset[c >> 5] >> (c & 31)) != 0) {
124 while ((x & 0xff) == 0) {
125 x >>= 8;
126 c += 8;
127 }
128 while ((x & 1) == 0) {
129 x >>= 1;
130 c++;
131 }
132 return c;
133 }
134 c += 32;
135 c &= ~31;
136 }
137
138 /* not found */
139 return -1;
140 }
141
142 /*
143 * NAME: charset->eclass()
144 * DESCRIPTION: convert a charset into an equivalence class
145 */
cs_eclass(Uint * cset,char * eclass,int class)146 static int cs_eclass(Uint *cset, char *eclass, int class)
147 {
148 int n, c;
149 Uint x;
150
151 n = 0;
152 for (c = cs_firstc(cset, 0); c < 256; c += 31, c &= ~31) {
153 x = cset[c >> 5] >> (c & 31);
154 if (x != 0) {
155 do {
156 while ((x & 0xff) == 0) {
157 x >>= 8;
158 c += 8;
159 }
160 if (x & 1) {
161 eclass[c] = class;
162 n++;
163 }
164 x >>= 1;
165 c++;
166 } while (x != 0);
167 } else {
168 c++;
169 }
170 }
171
172 return n;
173 }
174
175
176 typedef struct {
177 hte chain; /* hash table chain */
178 char *rgx; /* regular expression this position is in */
179 unsigned short size; /* size of position (length of string - 2) */
180 unsigned short ruleno; /* the rule this position is in */
181 Uint nposn; /* position number */
182 bool final; /* final position? */
183 bool alloc; /* position allocated separately? */
184 } rgxposn;
185
186 # define RPCHUNKSZ 32
187
188 typedef struct _rpchunk_ {
189 int chunksz; /* size of chunk */
190 struct _rpchunk_ *next; /* next in linked list */
191 rgxposn rp[RPCHUNKSZ]; /* rgxposn chunk */
192 } rpchunk;
193
194 /*
195 * NAME: rgxposn->alloc()
196 * DESCRIPTION: allocate a new rgxposn (or return an old one)
197 */
rp_alloc(hashtab * htab,char * posn,unsigned short size,rpchunk ** c,char * rgx,Uint nposn,unsigned short ruleno,bool final)198 static rgxposn *rp_alloc(hashtab *htab, char *posn, unsigned short size, rpchunk **c, char *rgx, Uint nposn, unsigned short ruleno, bool final)
199 {
200 rgxposn **rrp, *rp;
201
202 rrp = (rgxposn **) ht_lookup(htab, posn, TRUE);
203 if (*rrp != (rgxposn *) NULL) {
204 return *rrp; /* already exists */
205 }
206
207 if (*c == (rpchunk *) NULL || (*c)->chunksz == RPCHUNKSZ) {
208 rpchunk *x;
209
210 x = ALLOC(rpchunk, 1);
211 x->next = *c;
212 *c = x;
213 x->chunksz = 0;
214 }
215 rp = &(*c)->rp[(*c)->chunksz++];
216 rp->chain.next = (hte *) *rrp;
217 *rrp = rp;
218 rp->chain.name = posn;
219 rp->rgx = rgx;
220 rp->size = size;
221 rp->nposn = nposn;
222 rp->ruleno = ruleno;
223 rp->final = final;
224 rp->alloc = FALSE;
225
226 return rp;
227 }
228
229 /*
230 * NAME: rgxposn->new()
231 * DESCRIPTION: create a new rgxposn
232 */
rp_new(hashtab * htab,char * posn,unsigned short size,rpchunk ** c,char * rgx,Uint nposn,unsigned short ruleno,bool final)233 static rgxposn *rp_new(hashtab *htab, char *posn, unsigned short size, rpchunk **c, char *rgx, Uint nposn, unsigned short ruleno, bool final)
234 {
235 rgxposn *rp;
236
237 rp = rp_alloc(htab, posn, size, c, rgx, nposn, ruleno, final);
238 if (rp->nposn == nposn) {
239 strcpy(rp->chain.name = ALLOC(char, size + 3), posn);
240 rp->alloc = TRUE;
241 }
242 return rp;
243 }
244
245 /*
246 * NAME: rgxposn->clear()
247 * DESCRIPTION: free all rgxposns
248 */
rp_clear(rpchunk * c)249 static void rp_clear(rpchunk *c)
250 {
251 rpchunk *f;
252 rgxposn *rp;
253 int i;
254
255 while (c != (rpchunk *) NULL) {
256 for (rp = c->rp, i = c->chunksz; i != 0; rp++, --i) {
257 if (rp->alloc) {
258 FREE(rp->chain.name);
259 }
260 }
261 f = c;
262 c = c->next;
263 FREE(f);
264 }
265 }
266
267 /*
268 * NAME: rgxposn->transposn()
269 * DESCRIPTION: convert a transition into a position
270 */
rp_transposn(char * rgx,char * trans,char * buf,unsigned short * buflen)271 static bool rp_transposn(char *rgx, char *trans, char *buf, unsigned short *buflen)
272 {
273 char a[256], b[256], c[256], heap[256];
274 char *p, *q;
275 int n, place;
276 unsigned short i, j, len;
277
278 memset(a, '\0', 256);
279 heap[0] = 0;
280 len = 0;
281
282 /* from transitions to places */
283 if (trans == (char *) NULL) {
284 n = 1;
285 b[0] = 1;
286 } else {
287 n = 0;
288 for (p = trans; *p != '\0'; p++) {
289 for (i = UCHAR(*p); ; i = place + 1) {
290 place = UCHAR(rgx[i]) + 1;
291 if (!a[place]) {
292 a[place] = TRUE;
293 if (place != UCHAR(rgx[0])) {
294 switch (rgx[place]) {
295 case '|':
296 /* branch */
297 b[n++] = place + 2;
298 continue;
299
300 case '+':
301 /* pattern+ */
302 b[n++] = place + 2;
303 if (place < i) {
304 continue;
305 }
306 break;
307
308 default:
309 /* add to heap */
310 for (i = ++len, j = i >> 1;
311 UCHAR(heap[j]) > place;
312 i = j, j >>= 1) {
313 heap[i] = heap[j];
314 }
315 heap[i] = place;
316 break;
317 }
318 }
319 }
320 break;
321 }
322 }
323 }
324 b[n] = '\0';
325
326 /* closure: alternate between b and c */
327 for (p = b, q = c; *p != '\0'; p = q, q = (q == b) ? c : b) {
328 n = 0;
329 do {
330 place = UCHAR(*p++);
331 if (!a[place]) {
332 a[place] = TRUE;
333 if (place != UCHAR(rgx[0])) {
334 switch (rgx[place]) {
335 case '|':
336 /* branch */
337 q[n++] = place + 2;
338 q[n++] = UCHAR(rgx[place + 1]) + 1;
339 continue;
340
341 case '+':
342 /* pattern+ */
343 q[n++] = place + 2;
344 continue;
345 }
346
347 /* add to heap */
348 for (i = ++len, j = i >> 1;
349 UCHAR(heap[j]) > place;
350 i = j, j >>= 1) {
351 heap[i] = heap[j];
352 }
353 heap[i] = place;
354 }
355 }
356 } while (*p != '\0');
357 q[n] = '\0';
358 }
359
360 /* from heap to buf */
361 *buflen = len;
362 for (p = buf; len != 0; --len) {
363 *p++ = heap[1];
364 n = UCHAR(heap[len]);
365 for (i = 1, j = 2; j < len; i = j, j <<= 1) {
366 if (UCHAR(heap[j]) > UCHAR(heap[j + 1])) {
367 j++;
368 }
369 if (n <= UCHAR(heap[j])) {
370 break;
371 }
372 heap[i] = heap[j];
373 }
374 heap[i] = n;
375 }
376 *p = '\0';
377
378 return a[UCHAR(rgx[0])]; /* final? */
379 }
380
381 static Uint bits[] = {
382 0x00000001L, 0x00000003L, 0x00000007L, 0x0000000fL,
383 0x0000001fL, 0x0000003fL, 0x0000007fL, 0x000000ffL,
384 0x000001ffL, 0x000003ffL, 0x000007ffL, 0x00000fffL,
385 0x00001fffL, 0x00003fffL, 0x00007fffL, 0x0000ffffL,
386 0x0001ffffL, 0x0003ffffL, 0x0007ffffL, 0x000fffffL,
387 0x001fffffL, 0x003fffffL, 0x007fffffL, 0x00ffffffL,
388 0x01ffffffL, 0x03ffffffL, 0x07ffffffL, 0x0fffffffL,
389 0x1fffffffL, 0x3fffffffL, 0x7fffffffL, 0xffffffffL
390 };
391
392 /*
393 * NAME: rgxposn->cset()
394 * DESCRIPTION: create input sets for a position
395 */
rp_cset(rgxposn * rp,Uint * cset)396 static void rp_cset(rgxposn *rp, Uint *cset)
397 {
398 char *p, *q;
399 int c, n, x;
400 bool negate;
401
402 for (q = rp->chain.name + 2; *q != '\0'; q++) {
403 memset(cset, '\0', 32);
404 p = rp->rgx + UCHAR(*q);
405 switch (*p) {
406 case '[':
407 /* character class */
408 p++;
409 if (*p == '^') {
410 negate = TRUE;
411 p++;
412 } else {
413 negate = FALSE;
414 }
415 do {
416 if (*p == '\\') {
417 p++;
418 }
419 c = UCHAR(*p++);
420 cset[c >> 5] |= 1 << (c & 31);
421 if (p[0] == '-' && p[1] != ']') {
422 n = UCHAR(p[1]) - c;
423 if (n != 0) {
424 x = 32 - (++c & 31);
425 if (x > n) {
426 x = n;
427 }
428 cset[c >> 5] |= bits[x - 1] << (c & 31);
429 c += x;
430 n -= x;
431 while (n >= 32) {
432 cset[c >> 5] |= 0xffffffffL;
433 c += 32;
434 n -= 32;
435 }
436 if (n != 0) {
437 cset[c >> 5] |= bits[n - 1];
438 }
439 }
440 p += 2;
441 }
442 } while (*p != ']');
443 if (negate) {
444 cs_neg(cset);
445 }
446 break;
447
448 case '.':
449 /* anything */
450 memset(cset, -1, 32);
451 break;
452
453 case '\\':
454 /* escaped char */
455 p++;
456 default:
457 /* normal char */
458 c = UCHAR(*p);
459 cset[c >> 5] |= 1 << (c & 31);
460 break;
461 }
462
463 cset += 8;
464 }
465 }
466
467 /*
468 * NAME: rgxposn->trans()
469 * DESCRIPTION: perform a transition on a position, given an input set
470 */
rp_trans(rgxposn * rp,Uint * cset,char * posn,unsigned short * size)471 static bool rp_trans(rgxposn *rp, Uint *cset, char *posn, unsigned short *size)
472 {
473 char trans[256];
474 char *p, *q;
475 int c, n, x;
476 char *t;
477 Uint found;
478 bool negate;
479
480 t = trans;
481 for (q = rp->chain.name + 2; *q != '\0'; q++) {
482 p = rp->rgx + UCHAR(*q);
483 found = 0;
484 switch (*p) {
485 case '[':
486 /* character class */
487 p++;
488 if (*p == '^') {
489 negate = TRUE;
490 p++;
491 } else {
492 negate = FALSE;
493 }
494 do {
495 if (*p == '\\') {
496 p++;
497 }
498 c = UCHAR(*p++);
499 found |= cset[c >> 5] & 1 << (c & 31);
500 if (p[0] == '-' && p[1] != ']') {
501 n = UCHAR(p[1]) - c;
502 if (n != 0) {
503 x = 32 - (++c & 31);
504 if (x > n) {
505 x = n;
506 }
507 found |= cset[c >> 5] & (bits[x - 1] << (c & 31));
508 c += x;
509 n -= x;
510 while (n >= 32) {
511 found |= cset[c >> 5] & 0xffffffffL;
512 c += 32;
513 n -= 32;
514 }
515 if (n != 0) {
516 found |= cset[c >> 5] & bits[n - 1];
517 }
518 }
519 p += 2;
520 }
521 } while (*p != ']');
522 if (negate) {
523 found = !found;
524 }
525 break;
526
527 case '.':
528 /* anything */
529 found = 1;
530 break;
531
532 case '\\':
533 /* escaped char */
534 p++;
535 default:
536 /* normal char */
537 c = UCHAR(*p);
538 found = cset[c >> 5] & (1 << (c & 31));
539 break;
540 }
541 if (found != 0) {
542 *t++ = p - rp->rgx + 1;
543 }
544 }
545 *t = '\0';
546
547 return rp_transposn(rp->rgx, trans, posn, size);
548 }
549
550 /*
551 * NAME: rgxposn->load()
552 * DESCRIPTION: load a rgxposn from a buffer
553 */
rp_load(hashtab * htab,rpchunk ** c,Uint nposn,char * buf,char * grammar)554 static rgxposn *rp_load(hashtab *htab, rpchunk **c, Uint nposn, char *buf, char *grammar)
555 {
556 char *rgx;
557 unsigned short ruleno, size;
558 bool final;
559
560 rgx = grammar + (UCHAR(buf[0]) << 8) + UCHAR(buf[1]);
561 ruleno = (UCHAR(buf[2]) << 8) + UCHAR(buf[3]);
562 buf += 4;
563 if (*buf == '\0') {
564 final = TRUE;
565 buf++;
566 } else {
567 final = FALSE;
568 }
569 size = UCHAR(*buf++);
570
571 return rp_alloc(htab, buf, size, c, rgx, nposn, ruleno, final);
572 }
573
574 /*
575 * NAME: rgxposn->save()
576 * DESCRIPTION: save a rgxposn to a buffer
577 */
rp_save(rgxposn * rp,char * buf,char * grammar)578 static char *rp_save(rgxposn *rp, char *buf, char *grammar)
579 {
580 unsigned short rgx;
581
582 rgx = rp->rgx - grammar;
583 *buf++ = rgx >> 8;
584 *buf++ = rgx;
585 *buf++ = rp->ruleno >> 8;
586 *buf++ = rp->ruleno;
587 if (rp->final) {
588 *buf++ = '\0';
589 }
590 *buf++ = rp->size;
591 memcpy(buf, rp->chain.name, rp->size + 3);
592 return buf + rp->size + 3;
593 }
594
595
596 typedef struct {
597 union { /* regexp positions */
598 rgxposn *e; /* 1 */
599 rgxposn **a; /* > 1 */
600 } posn;
601 union { /* strings */
602 unsigned short e[2]; /* 1, 2 */
603 unsigned short *a; /* > 2 */
604 } str;
605 char *trans; /* transitions */
606 unsigned short nposn; /* number of positions */
607 unsigned short nstr; /* number of string positions */
608 unsigned short len; /* string length */
609 unsigned short ntrans; /* number of transitions */
610 short final; /* rule number, -1: not final */
611 unsigned short next; /* next in hash chain */
612 bool alloc; /* transitions allocated? */
613 } dfastate;
614
615 # define POSNA(state) (((state)->nposn == 1) ? \
616 &(state)->posn.e : (state)->posn.a)
617 # define STRA(state) (((state)->nstr <= 2) ? \
618 (state)->str.e : (state)->str.a)
619
620 /*
621 * NAME: dfastate->hash()
622 * DESCRIPTION: put a new state in the hash table, or return an old one
623 */
ds_hash(unsigned short * htab,Uint htabsize,dfastate * states,unsigned short idx)624 static unsigned short ds_hash(unsigned short *htab, Uint htabsize, dfastate *states, unsigned short idx)
625 {
626 Uint x;
627 rgxposn **posn;
628 unsigned short n, *str;
629 dfastate *newstate, *ds;
630 unsigned short *dds;
631
632 /* hash on position and string pointers */
633 newstate = &states[idx];
634 x = newstate->len ^ newstate->final;
635 for (n = newstate->nposn, posn = POSNA(newstate); n > 0; --n) {
636 x = (x >> 3) ^ (x << 29) ^ (uintptr_t) *posn++;
637 }
638 for (n = newstate->nstr, str = STRA(newstate); n > 0; --n) {
639 x = (x >> 3) ^ (x << 29) ^ (uintptr_t) *str++;
640 }
641
642 /* check state hash table */
643 posn = POSNA(newstate);
644 str = STRA(newstate);
645 dds = &htab[(Uint) x % htabsize];
646 ds = &states[*dds];
647 while (ds != states) {
648 if (newstate->len == ds->len && newstate->final == ds->final &&
649 newstate->nposn == ds->nposn && newstate->nstr == ds->nstr &&
650 memcmp(posn, POSNA(ds), newstate->nposn * sizeof(rgxposn*)) == 0 &&
651 memcmp(str, STRA(ds), newstate->nstr * sizeof(unsigned short)) == 0)
652 {
653 /* state already exists */
654 return *dds;
655 }
656 dds = &ds->next;
657 ds = &states[*dds];
658 }
659
660 newstate->next = *dds;
661 return *dds = idx;
662 }
663
664 # define TRANS_NONE 0 /* no transitions */
665 # define TRANS_ZERO 1 /* all transitions to state 0 */
666 # define TRANS_STATES 2 /* normal transitions */
667
668 /*
669 * NAME: dfastate->load()
670 * DESCRIPTION: load a dfastate from a buffer
671 */
ds_load(dfastate * state,char * buf,unsigned short ntrans,char * zerotrans)672 static char *ds_load(dfastate *state, char *buf, unsigned short ntrans, char *zerotrans)
673 {
674 state->posn.a = (rgxposn **) NULL;
675 state->str.a = (unsigned short *) NULL;
676 state->nposn = state->nstr = state->len = 0;
677 state->alloc = FALSE;
678 state->final = (UCHAR(buf[0]) << 8) + UCHAR(buf[1]);
679 buf += 2;
680 switch (*buf++) {
681 case TRANS_NONE:
682 state->ntrans = 0;
683 break;
684
685 case TRANS_ZERO:
686 state->ntrans = 256;
687 state->trans = zerotrans;
688 break;
689
690 case TRANS_STATES:
691 state->ntrans = ntrans;
692 state->trans = buf;
693 buf += ntrans << 1;
694 break;
695 }
696
697 return buf;
698 }
699
700 /*
701 * NAME: dfastate->loadtmp()
702 * DESCRIPTION: load dfastate temporary data from a buffer
703 */
ds_loadtmp(dfastate * state,char * sbuf,char * pbuf,hashtab * htab,rpchunk ** c,Uint * nposn,char * grammar)704 static char *ds_loadtmp(dfastate *state, char *sbuf, char *pbuf, hashtab *htab, rpchunk **c, Uint *nposn, char *grammar)
705 {
706 rgxposn *rp, **rrp;
707 unsigned short i, *s;
708 char *posn;
709
710 state->nposn = (UCHAR(sbuf[0]) << 8) + UCHAR(sbuf[1]);
711 state->nstr = (UCHAR(sbuf[2]) << 8) + UCHAR(sbuf[3]);
712 sbuf += 4;
713 state->len = UCHAR(*sbuf++);
714
715 if (state->nposn != 0) {
716 if (state->nposn != 1) {
717 rrp = state->posn.a = ALLOC(rgxposn*, state->nposn);
718 } else {
719 rrp = &state->posn.e;
720 }
721 for (i = state->nposn; i > 0; --i) {
722 posn = pbuf + ((Uint) UCHAR(sbuf[0]) << 16) +
723 (UCHAR(sbuf[1]) << 8) + UCHAR(sbuf[2]);
724 sbuf += 3;
725 rp = *rrp++ = rp_load(htab, c, *nposn, posn, grammar);
726 if (rp->nposn == *nposn) {
727 (*nposn)++;
728 }
729 }
730 }
731 if (state->nstr != 0) {
732 if (state->nstr > 2) {
733 s = state->str.a = ALLOC(unsigned short, state->nstr);
734 } else {
735 s = state->str.e;
736 }
737 for (i = state->nstr; i > 0; --i) {
738 *s++ = (UCHAR(sbuf[0]) << 8) + UCHAR(sbuf[1]);
739 sbuf += 2;
740 }
741 }
742
743 return sbuf;
744 }
745
746 /*
747 * NAME: dfastate->save()
748 * DESCRIPTION: save a dfastate to a buffer
749 */
ds_save(dfastate * state,char * buf)750 static char *ds_save(dfastate *state, char *buf)
751 {
752 buf[0] = state->final >> 8;
753 buf[1] = state->final;
754 buf += 2;
755 if (state->ntrans == 0) {
756 *buf++ = TRANS_NONE;
757 } else if (state->nposn == 0 && state->nstr == 0) {
758 *buf++ = TRANS_ZERO;
759 } else {
760 *buf++ = TRANS_STATES;
761 memcpy(buf, state->trans, state->ntrans << 1);
762 buf += state->ntrans << 1;
763 }
764
765 return buf;
766 }
767
768 /*
769 * NAME: dfastate->savetmp()
770 * DESCRIPTION: save dfastate temporary data to a buffer
771 */
ds_savetmp(dfastate * state,char * sbuf,char ** pbuf,char * pbase,Uint * ptab,Uint * nposn,char * grammar)772 static char *ds_savetmp(dfastate *state, char *sbuf, char **pbuf, char *pbase, Uint *ptab, Uint *nposn, char *grammar)
773 {
774 rgxposn *rp, **rrp;
775 unsigned short i, *s;
776 Uint n;
777
778 *sbuf++ = state->nposn >> 8;
779 *sbuf++ = state->nposn;
780 *sbuf++ = state->nstr >> 8;
781 *sbuf++ = state->nstr;
782 *sbuf++ = state->len;
783
784 rrp = POSNA(state);
785 for (i = state->nposn; i > 0; --i) {
786 rp = *rrp++;
787 if (rp->nposn == *nposn) {
788 ptab[(*nposn)++] = (intptr_t) *pbuf - (intptr_t) pbase;
789 *pbuf = rp_save(rp, *pbuf, grammar);
790 }
791 n = ptab[rp->nposn];
792 *sbuf++ = n >> 16;
793 *sbuf++ = n >> 8;
794 *sbuf++ = n;
795 }
796
797 s = STRA(state);
798 for (i = state->nstr; i > 0; --i) {
799 *sbuf++ = *s >> 8;
800 *sbuf++ = *s++;
801 }
802
803 return sbuf;
804 }
805
806
807 struct _dfa_ {
808 char *source; /* source grammar */
809 char *grammar; /* reference grammar */
810 char *strings; /* offset of strings in grammar */
811 unsigned short nsstrings; /* # strings in source grammar */
812 short whitespace; /* whitespace rule or -1 */
813 short nomatch; /* nomatch rule or -1 */
814
815 bool modified; /* dfa modified */
816 bool allocated; /* dfa strings allocated locally */
817 Uint dfasize; /* size of state machine */
818 Uint tmpssize; /* size of temporary state data */
819 Uint tmppsize; /* size of temporary posn data */
820 char *dfastr; /* saved dfa */
821 char *tmpstr; /* saved temporary data */
822
823 unsigned short nregexp; /* # regexps */
824 Uint nposn; /* number of unique positions */
825 rpchunk *rpc; /* current rgxposn chunk */
826 hashtab *posnhtab; /* position hash table */
827
828 unsigned short nstates; /* # states */
829 unsigned short nexpanded; /* # expanded states */
830 unsigned short endstates; /* # states with no valid transitions */
831 Uint sttsize; /* state table size */
832 Uint sthsize; /* size of state hash table */
833 dfastate *states; /* dfa states */
834 unsigned short *sthtab; /* state hash table */
835
836 unsigned short ecnum; /* number of equivalence classes */
837 char *ecsplit; /* equivalence class split history */
838 char *ecmembers; /* members per equivalence class */
839 Uint *ecset; /* equivalence class sets */
840 char eclass[256]; /* equivalence classes */
841
842 char zerotrans[2 * 256]; /* shared zero transitions */
843 };
844
845 # define DFA_VERSION 1
846
847 /*
848 * NAME: dfa->new()
849 * DESCRIPTION: create new dfa instance
850 */
dfa_new(char * source,char * grammar)851 dfa *dfa_new(char *source, char *grammar)
852 {
853 char posn[258];
854 unsigned short nstrings;
855 dfa *fa;
856 dfastate *state;
857 bool final;
858
859 fa = ALLOC(dfa, 1);
860
861 /* grammar info */
862 fa->source = source;
863 fa->grammar = grammar;
864 fa->whitespace = (UCHAR(grammar[1]) << 8) + UCHAR(grammar[2]);
865 fa->nomatch = (UCHAR(grammar[3]) << 8) + UCHAR(grammar[4]);
866 fa->nregexp = (UCHAR(grammar[5]) << 8) + UCHAR(grammar[6]);
867 fa->nsstrings = (UCHAR(grammar[9]) << 8) + UCHAR(grammar[10]);
868 nstrings = fa->nsstrings + (UCHAR(grammar[11]) << 8) + UCHAR(grammar[12]);
869 fa->strings = grammar + 17 + (fa->nregexp << 1);
870
871 /* size info */
872 fa->modified = TRUE;
873 fa->allocated = FALSE;
874 fa->dfasize = 8 + 256 + 3; /* header + eclasses + state 1 */
875 fa->tmpssize = 4 + 1 + 5; /* header + ecsplit + state 1 */
876 fa->tmppsize = 0;
877 fa->dfastr = (char *) NULL;
878 fa->tmpstr = (char *) NULL;
879
880 /* equivalence classes */
881 fa->ecnum = 1;
882 fa->ecsplit = ALLOC(char, 256 + 256 + 32 * 256);
883 fa->ecmembers = fa->ecsplit + 256;
884 fa->ecset = (Uint *) (fa->ecmembers + 256);
885 memset(fa->eclass, '\0', 256);
886 memset(fa->ecmembers, '\0', 256);
887 memset(fa->ecset, -1, 32);
888 memset(fa->ecset + 8, '\0', 32 * 255);
889
890 /* positions */
891 fa->nposn = (UCHAR(grammar[7]) << 8) + UCHAR(grammar[8]);
892 fa->rpc = (rpchunk *) NULL;
893 fa->posnhtab = ht_new((fa->nposn + 1) << 2, 257, FALSE);
894
895 /* states */
896 fa->nstates = 2;
897 fa->sttsize = (Uint) (fa->nposn + nstrings + 1) << 1;
898 fa->sthsize = (Uint) fa->sttsize << 1;
899 fa->nexpanded = 0;
900 fa->endstates = 1;
901 fa->states = ALLOC(dfastate, fa->sttsize);
902 fa->sthtab = ALLOC(unsigned short, fa->sthsize);
903 memset(fa->sthtab, '\0', sizeof(unsigned short) * fa->sthsize);
904
905 /* initial states */
906 state = &fa->states[0];
907 state->posn.a = (rgxposn **) NULL;
908 state->str.a = (unsigned short *) NULL;
909 state->trans = (char *) NULL;
910 state->nposn = state->nstr = 0;
911 state->ntrans = state->len = 0;
912 (state++)->final = -1;
913 state->posn.a = (fa->nposn > 1) ?
914 ALLOC(rgxposn*, fa->nposn) : (rgxposn **) NULL;
915 state->str.a = (nstrings > 2) ?
916 ALLOC(unsigned short, nstrings) : (unsigned short *) NULL;
917 state->trans = (char *) NULL;
918 state->nposn = fa->nposn;
919 state->nstr = nstrings;
920 state->ntrans = state->len = 0;
921 state->final = -1;
922 state->alloc = FALSE;
923 grammar += 17;
924 /* initial positions */
925 if (state->nposn == 0 && state->nstr == 0) {
926 /* no valid transitions from initial state */
927 state->ntrans = 256;
928 state->trans = fa->zerotrans;
929 fa->endstates++;
930 } else {
931 rgxposn **rrp;
932 unsigned short i, j, n, *s;
933 char *rgx;
934 unsigned short size;
935
936 rrp = POSNA(state);
937 for (i = j = 0; i < fa->nregexp; i++) {
938 rgx = fa->grammar + (UCHAR(grammar[0]) << 8) + UCHAR(grammar[1]);
939 grammar += 2;
940 n = j + (UCHAR(rgx[0]) << 8) + UCHAR(rgx[1]);
941 rgx += 2;
942 while (j < n) {
943 final = rp_transposn(rgx, (char *) NULL, posn + 2, &size);
944 if (final && state->final < 0) {
945 state->final = i;
946 }
947 posn[0] = 1 + j / 255;
948 posn[1] = 1 + j % 255;
949 *rrp++ = rp_new(fa->posnhtab, posn, size, &fa->rpc, rgx,
950 (Uint) j++, i, final);
951 fa->tmpssize += 3;
952 fa->tmppsize += 8 + size + final;
953 rgx += UCHAR(rgx[0]) + 1;
954 }
955 }
956 /* initial strings */
957 for (i = 0, s = STRA(state); i < nstrings; i++) {
958 *s++ = i;
959 }
960 fa->tmpssize += nstrings << 1;
961 }
962 /* add to hashtable */
963 ds_hash(fa->sthtab, fa->sthsize, fa->states, 1);
964
965 /* zero transitions */
966 memset(fa->zerotrans, '\0', 2 * 256);
967
968 return fa;
969 }
970
971 /*
972 * NAME: dfa->del()
973 * DESCRIPTION: delete a dfa instance
974 */
dfa_del(dfa * fa)975 void dfa_del(dfa *fa)
976 {
977 dfastate *state;
978 unsigned short i;
979
980 if (fa->allocated) {
981 FREE(fa->dfastr);
982 }
983 if (fa->ecsplit != (char *) NULL) {
984 FREE(fa->ecsplit);
985 }
986 if (fa->rpc != (rpchunk *) NULL) {
987 rp_clear(fa->rpc);
988 }
989 if (fa->posnhtab != (hashtab *) NULL) {
990 ht_del(fa->posnhtab);
991 }
992 for (i = fa->nstates, state = &fa->states[1]; --i > 0; state++) {
993 if (state->nposn > 1) {
994 FREE(state->posn.a);
995 }
996 if (state->nstr > 2) {
997 FREE(state->str.a);
998 }
999 if (state->alloc) {
1000 FREE(state->trans);
1001 }
1002 }
1003 FREE(fa->states);
1004 if (fa->sthtab != (unsigned short *) NULL) {
1005 FREE(fa->sthtab);
1006 }
1007 FREE(fa);
1008 }
1009
1010 /*
1011 * NAME: dfa->extend()
1012 * DESCRIPTION: extend transition table
1013 */
dfa_extend(dfa * fa,dfastate * state,unsigned short limit)1014 static void dfa_extend(dfa *fa, dfastate *state, unsigned short limit)
1015 {
1016 char *p, *q;
1017 unsigned short i;
1018
1019 /* extend transition table */
1020 if (!state->alloc) {
1021 p = ALLOC(char, 2 * 256);
1022 memcpy(p, state->trans, state->ntrans << 1);
1023 state->trans = p;
1024 state->alloc = TRUE;
1025 }
1026 p = state->trans + (state->ntrans << 1);
1027 for (i = state->ntrans; i <= limit; i++) {
1028 q = &state->trans[UCHAR(fa->ecsplit[i]) << 1];
1029 *p++ = *q++;
1030 *p++ = *q;
1031 }
1032 state->ntrans = i;
1033 }
1034
1035 /*
1036 * state & eclass format:
1037 *
1038 * header [0] version number
1039 * [x][y] # states
1040 * [x][y] # expanded states
1041 * [x][y] # end states
1042 * [x] # equivalence classes
1043 * eclass [...] 1 - 256 equivalence classes
1044 *
1045 * state [x][y] final } ...
1046 * [...] optional: transitions }
1047 *
1048 *
1049 * temporary data format:
1050 *
1051 * header [0] version number
1052 * [x][y] number of positions
1053 * ecsplit [...] 256 ecsplit data
1054 *
1055 * state [x][y] # positions }
1056 * [x][y] # strings }
1057 * [x] len } ...
1058 * [...] position data }
1059 * [...] string data }
1060 *
1061 * position [x][y] regexp }
1062 * [x][y] ruleno }
1063 * [0] optional: final position } ...
1064 * [x] size }
1065 * [...] position data }
1066 */
1067
1068 /*
1069 * NAME: dfa->load()
1070 * DESCRIPTION: load dfa from string
1071 */
dfa_load(char * source,char * grammar,char * str,Uint len)1072 dfa *dfa_load(char *source, char *grammar, char *str, Uint len)
1073 {
1074 dfa *fa;
1075 dfastate *state;
1076 unsigned short i;
1077 char *buf;
1078 unsigned short nstrings;
1079
1080 if (str[0] != DFA_VERSION) {
1081 return dfa_new(source, grammar);
1082 }
1083
1084 fa = ALLOC(dfa, 1);
1085 fa->dfastr = buf = str;
1086
1087 /* grammar info */
1088 fa->source = source;
1089 fa->grammar = grammar;
1090 fa->whitespace = (UCHAR(grammar[1]) << 8) + UCHAR(grammar[2]);
1091 fa->nomatch = (UCHAR(grammar[3]) << 8) + UCHAR(grammar[4]);
1092 fa->nregexp = (UCHAR(grammar[5]) << 8) + UCHAR(grammar[6]);
1093 fa->nsstrings = (UCHAR(grammar[9]) << 8) + UCHAR(grammar[10]);
1094 nstrings = fa->nsstrings + (UCHAR(grammar[11]) << 8) + UCHAR(grammar[12]);
1095 fa->strings = grammar + 17 + (fa->nregexp << 1);
1096
1097 /* positions */
1098 fa->nposn = (UCHAR(grammar[7]) << 8) + UCHAR(grammar[8]);
1099 fa->rpc = (rpchunk *) NULL;
1100 fa->posnhtab = (hashtab *) NULL;
1101
1102 /* states 1 */
1103 fa->nstates = (UCHAR(buf[1]) << 8) + UCHAR(buf[2]);
1104 fa->nexpanded = (UCHAR(buf[3]) << 8) + UCHAR(buf[4]);
1105 fa->endstates = (UCHAR(buf[5]) << 8) + UCHAR(buf[6]);
1106 fa->sttsize = fa->nstates + 1;
1107 fa->sthsize = (Uint) (fa->nposn + nstrings + 1) << 2;
1108 fa->states = ALLOC(dfastate, fa->sttsize);
1109 fa->sthtab = (unsigned short *) NULL;
1110
1111 /* equivalence classes */
1112 fa->ecnum = UCHAR(buf[7]) + 1;
1113 buf += 8;
1114 memcpy(fa->eclass, buf, 256);
1115 buf += 256;
1116 fa->ecsplit = (char *) NULL;
1117 fa->ecmembers = (char *) NULL;
1118 fa->ecset = (Uint *) NULL;
1119
1120 /* states 2 */
1121 fa->states[0].posn.a = (rgxposn **) NULL;
1122 fa->states[0].str.a = (unsigned short *) NULL;
1123 fa->states[0].trans = (char *) NULL;
1124 fa->states[0].nposn = fa->states[0].nstr = 0;
1125 fa->states[0].ntrans = fa->states[0].len = 0;
1126 fa->states[0].final = -1;
1127 for (i = fa->nstates, state = &fa->states[1]; --i > 0; state++) {
1128 buf = ds_load(state, buf, fa->ecnum, fa->zerotrans);
1129 }
1130
1131 /* temporary data */
1132 fa->tmpstr = buf;
1133
1134 /* size info */
1135 fa->dfasize = (intptr_t) buf - (intptr_t) str;
1136 fa->tmpssize = 0;
1137 fa->tmppsize = len - fa->dfasize;
1138 fa->modified = fa->allocated = FALSE;
1139
1140 /* zero transitions */
1141 memset(fa->zerotrans, '\0', 2 * 256);
1142
1143 return fa;
1144 }
1145
1146 /*
1147 * NAME: dfa->loadtmp()
1148 * DESCRIPTION: load dfa tmp info
1149 */
dfa_loadtmp(dfa * fa)1150 static void dfa_loadtmp(dfa *fa)
1151 {
1152 dfastate *state;
1153 unsigned short i;
1154 int c;
1155 char *buf;
1156
1157 buf = fa->tmpstr;
1158 fa->nposn = ((Uint) UCHAR(buf[1]) << 16) + (UCHAR(buf[2]) << 8) +
1159 UCHAR(buf[3]);
1160 buf += 4;
1161
1162 /* equivalence classes */
1163 fa->ecsplit = ALLOC(char, 256 + 256 + 32 * 256);
1164 fa->ecmembers = fa->ecsplit + 256;
1165 fa->ecset = (Uint *) (fa->ecmembers + 256);
1166 memcpy(fa->ecsplit, buf, fa->ecnum);
1167 buf += fa->ecnum;
1168 memset(fa->ecmembers, '\0', 256);
1169 memset(fa->ecset, '\0', 32 * 256);
1170 for (i = 256; i > 0; ) {
1171 --i;
1172 c = UCHAR(fa->eclass[i]);
1173 fa->ecmembers[c]++;
1174 fa->ecset[(c << 3) + (i >> 5)] |= 1 << (i & 31);
1175 }
1176
1177 /* positions */
1178 fa->posnhtab = ht_new((fa->nposn + 1) << 2, 257, FALSE);
1179
1180 /* states */
1181 fa->sthtab = ALLOC(unsigned short, fa->sthsize);
1182 memset(fa->sthtab, '\0', sizeof(unsigned short) * fa->sthsize);
1183
1184 fa->nposn = 0;
1185 for (i = 1, state = &fa->states[1]; i < fa->nstates; i++, state++) {
1186 buf = ds_loadtmp(state, buf, fa->tmpstr, fa->posnhtab, &fa->rpc,
1187 &fa->nposn, fa->grammar);
1188 ds_hash(fa->sthtab, fa->sthsize, fa->states, i);
1189 }
1190
1191 /* size info */
1192 fa->tmpssize = (intptr_t) buf - (intptr_t) fa->tmpstr;
1193 fa->tmppsize -= fa->tmpssize;
1194 }
1195
1196 /*
1197 * NAME: dfa->save()
1198 * DESCRIPTION: save dfa to string
1199 */
dfa_save(dfa * fa,char ** str,Uint * len)1200 bool dfa_save(dfa *fa, char **str, Uint *len)
1201 {
1202 unsigned short i;
1203 char *buf;
1204 dfastate *state;
1205 char *pbuf;
1206 Uint *ptab, nposn;
1207
1208 if (!fa->modified) {
1209 *str = fa->dfastr;
1210 *len = fa->dfasize + fa->tmpssize + fa->tmppsize;
1211 return FALSE;
1212 }
1213
1214 if (fa->nstates == fa->nexpanded + fa->endstates) {
1215 fa->tmpssize = fa->tmppsize = 0;
1216 }
1217 if (fa->allocated) {
1218 FREE(fa->dfastr);
1219 }
1220 fa->dfastr = buf = *str =
1221 ALLOC(char, *len = fa->dfasize + fa->tmpssize + fa->tmppsize);
1222 *buf++ = DFA_VERSION;
1223 *buf++ = fa->nstates >> 8;
1224 *buf++ = fa->nstates;
1225 *buf++ = fa->nexpanded >> 8;
1226 *buf++ = fa->nexpanded;
1227 *buf++ = fa->endstates >> 8;
1228 *buf++ = fa->endstates;
1229 *buf++ = fa->ecnum - 1;
1230 memcpy(buf, fa->eclass, 256);
1231 buf += 256;
1232
1233 for (i = fa->nstates, state = &fa->states[1]; --i > 0; state++) {
1234 if (state->ntrans != 0 && state->ntrans < fa->ecnum) {
1235 dfa_extend(fa, state, fa->ecnum - 1);
1236 }
1237 buf = ds_save(state, buf);
1238 }
1239
1240 fa->modified = FALSE;
1241 fa->allocated = TRUE;
1242 if (fa->tmpssize + fa->tmppsize == 0) {
1243 /* no tmp data */
1244 return TRUE;
1245 }
1246
1247 fa->tmpstr = buf;
1248 pbuf = buf + fa->tmpssize;
1249 *buf++ = 0;
1250 *buf++ = fa->nposn >> 16;
1251 *buf++ = fa->nposn >> 8;
1252 *buf++ = fa->nposn;
1253 memcpy(buf, fa->ecsplit, fa->ecnum);
1254 buf += fa->ecnum;
1255
1256 ptab = ALLOCA(Uint, fa->nposn);
1257 nposn = 0;
1258 for (i = fa->nstates, state = &fa->states[1]; --i > 0; state++) {
1259 buf = ds_savetmp(state, buf, &pbuf, fa->tmpstr, ptab, &nposn,
1260 fa->grammar);
1261 }
1262 AFREE(ptab);
1263
1264 return TRUE;
1265 }
1266
1267 /*
1268 * NAME: dfa->ecsplit()
1269 * DESCRIPTION: split up equivalence classes along the borders of character
1270 * sets
1271 */
dfa_ecsplit(dfa * fa,Uint * iset,Uint * cset,Uint ncset)1272 static void dfa_ecsplit(dfa *fa, Uint *iset, Uint *cset, Uint ncset)
1273 {
1274 Uint ec1[8], ec2[8];
1275 Uint i;
1276 int n, c;
1277
1278 for (c = cs_firstc(iset, 0); c >= 0; c = cs_firstc(iset, c + 1)) {
1279 for (i = 0; i < ncset; i++) {
1280 /*
1281 * get the equivalence class of the first char in the input set
1282 */
1283 n = UCHAR(fa->eclass[c]);
1284 if (fa->ecmembers[n] == 1) {
1285 break; /* only one character left */
1286 }
1287 if (cs_overlap(fa->ecset + (n << 3), cset, ec1, ec2)) {
1288 /*
1289 * create new equivalence class
1290 */
1291 memcpy(fa->ecset + (n << 3), ec1, sizeof(ec1));
1292 memcpy(fa->ecset + (fa->ecnum << 3), ec2, sizeof(ec2));
1293 fa->ecsplit[fa->ecnum] = n;
1294 fa->ecmembers[n] -= fa->ecmembers[fa->ecnum] =
1295 cs_eclass(ec2, fa->eclass, fa->ecnum);
1296 fa->ecnum++;
1297 fa->dfasize += fa->nexpanded << 1;
1298 fa->tmpssize++;
1299 }
1300 cset += 8;
1301 }
1302 cset -= i << 3;
1303
1304 /* remove from input set */
1305 cs_sub(iset, fa->ecset + (UCHAR(fa->eclass[c]) << 3));
1306 }
1307 }
1308
1309 /*
1310 * NAME: dfa->newstate()
1311 * DESCRIPTION: get the positions and strings for a new state
1312 */
dfa_newstate(dfa * fa,dfastate * state,dfastate * newstate,Uint * ecset,Uint * cset)1313 static unsigned short dfa_newstate(dfa *fa, dfastate *state, dfastate *newstate, Uint *ecset, Uint *cset)
1314 {
1315 char posn[130];
1316 unsigned short i, n, *s;
1317 rgxposn *rp, **rrp;
1318 char *p;
1319 unsigned short size;
1320 Uint posnsize;
1321 bool final;
1322
1323 newstate->trans = (char *) NULL;
1324 newstate->nposn = newstate->nstr = newstate->ntrans = 0;
1325 newstate->len = state->len + 1;
1326 newstate->final = -1;
1327 newstate->alloc = FALSE;
1328 posnsize = 0;
1329
1330 /* positions */
1331 for (i = state->nposn, rrp = POSNA(state); i > 0; --i, rrp++) {
1332 rp = *rrp;
1333 for (n = rp->size; n > 0; --n) {
1334 if (cs_intersect(ecset, cset)) {
1335 final = rp_trans(rp, ecset, posn + 2, &size);
1336 if (size != 0) {
1337 posn[0] = rp->chain.name[0];
1338 posn[1] = rp->chain.name[1];
1339 rp = rp_new(fa->posnhtab, posn, size, &fa->rpc, rp->rgx,
1340 fa->nposn, rp->ruleno, final);
1341 if (rp->nposn == fa->nposn) {
1342 /* new position */
1343 fa->nposn++;
1344 posnsize += 8 + rp->size + final;
1345 }
1346 newstate->posn.a[newstate->nposn++] = rp;
1347 }
1348 if (final && newstate->final < 0) {
1349 newstate->final = rp->ruleno;
1350 }
1351 cset += n << 3;
1352 break;
1353 }
1354 cset += 8;
1355 }
1356 }
1357
1358 /* strings */
1359 for (i = state->nstr, s = STRA(state); i > 0; --i, s++) {
1360 if (*s < fa->nsstrings) {
1361 p = fa->strings + (*s << 2);
1362 n = UCHAR(fa->source[(UCHAR(p[0]) << 16) + (UCHAR(p[1]) << 8) +
1363 UCHAR(p[2]) + state->len]);
1364 p += 3;
1365 } else {
1366 p = fa->strings + (fa->nsstrings << 2) +
1367 ((*s - fa->nsstrings) << 1);
1368 p = fa->grammar + (UCHAR(p[0]) << 8) + UCHAR(p[1]);
1369 n = UCHAR(p[newstate->len]);
1370 }
1371 if (ecset[n >> 5] & (1 << (n & 31))) {
1372 if (newstate->len == UCHAR(p[0])) {
1373 /* end of string */
1374 newstate->final = fa->nregexp + *s;
1375 } else {
1376 /* add string */
1377 newstate->str.a[newstate->nstr++] = *s;
1378 }
1379 }
1380 }
1381
1382 return posnsize;
1383 }
1384
1385 /*
1386 * NAME: dfa->expand()
1387 * DESCRIPTION: expand a state
1388 */
dfa_expand(dfa * fa,dfastate * state)1389 static dfastate *dfa_expand(dfa *fa, dfastate *state)
1390 {
1391 Uint iset[8];
1392 Uint *cset, *ecset, ncset;
1393 rgxposn **rrp;
1394 unsigned short i, n, *s;
1395 char *p;
1396 dfastate *newstate;
1397 rgxposn **newposn;
1398 unsigned short *newstr;
1399 Uint size;
1400
1401 newposn = NULL;
1402 newstr = 0;
1403
1404 if (fa->posnhtab == (hashtab *) NULL) {
1405 dfa_loadtmp(fa); /* load tmp info */
1406 }
1407
1408 memset(iset, '\0', sizeof(iset));
1409
1410 /* allocate character sets for strings and positions */
1411 ncset = state->nstr;
1412 for (i = state->nposn, rrp = POSNA(state); i > 0; --i, rrp++) {
1413 ncset += (*rrp)->size;
1414 }
1415 cset = ALLOCA(Uint, ncset << 3);
1416
1417 /* construct character sets for all string chars */
1418 for (i = state->nstr, s = STRA(state); i > 0; --i, s++) {
1419 if (*s < fa->nsstrings) {
1420 p = fa->strings + (*s << 2);
1421 p = fa->source + (UCHAR(p[0]) << 16) + (UCHAR(p[1]) << 8) +
1422 UCHAR(p[2]);
1423 } else {
1424 p = fa->strings + (fa->nsstrings << 2) +
1425 ((*s - fa->nsstrings) << 1);
1426 p = fa->grammar + (UCHAR(p[0]) << 8) + UCHAR(p[1]) + 1;
1427 }
1428 n = UCHAR(p[state->len]);
1429 memset(cset, '\0', 32);
1430 cset[n >> 5] |= 1 << (n & 31);
1431 iset[n >> 5] |= 1 << (n & 31); /* also add to input set */
1432 cset += 8;
1433 }
1434
1435 /* construct character sets for all positions */
1436 for (i = state->nposn, rrp = POSNA(state); i > 0; --i, rrp++) {
1437 rp_cset(*rrp, cset);
1438 for (n = (*rrp)->size; n > 0; --n) {
1439 cs_or(iset, cset); /* add to input set */
1440 cset += 8;
1441 }
1442 }
1443 cset -= ncset << 3;
1444
1445 /*
1446 * adjust equivalence classes
1447 */
1448 dfa_ecsplit(fa, iset, cset, ncset);
1449
1450 /*
1451 * for all equivalence classes, compute transition states
1452 */
1453 if (state->nposn != 0) {
1454 newposn = ALLOCA(rgxposn*, state->nposn);
1455 }
1456 if (state->nstr != 0) {
1457 newstr = ALLOCA(unsigned short, state->nstr);
1458 }
1459 p = state->trans = ALLOC(char, 2 * 256);
1460 state->ntrans = fa->ecnum;
1461 state->alloc = TRUE;
1462 cset += (Uint) state->nstr << 3;
1463 for (i = fa->ecnum, ecset = fa->ecset; i > 0; --i, ecset += 8) {
1464 /* prepare new state */
1465 newstate = &fa->states[fa->nstates];
1466
1467 /* flesh out new state */
1468 newstate->posn.a = newposn;
1469 newstate->str.a = newstr;
1470 size = dfa_newstate(fa, state, newstate, ecset, cset);
1471
1472 if (newstate->nposn == 0 && newstate->nstr == 0 && newstate->final < 0)
1473 {
1474 /* stuck in state 0 */
1475 n = 0;
1476 } else {
1477 if (newstate->nposn <= 1) {
1478 if (newstate->nposn == 0) {
1479 newstate->posn.a = (rgxposn **) NULL;
1480 } else {
1481 newstate->posn.e = newposn[0];
1482 }
1483 }
1484 if (newstate->nstr <= 2) {
1485 if (newstate->nstr == 0) {
1486 newstate->str.a = (unsigned short *) NULL;
1487 newstate->len = 0;
1488 } else {
1489 memcpy(newstate->str.e, newstr, 2 * sizeof(unsigned short));
1490 }
1491 }
1492
1493 n = ds_hash(fa->sthtab, fa->sthsize, fa->states,
1494 (unsigned short) fa->nstates);
1495 if (n == fa->nstates) {
1496 /*
1497 * genuinely new state
1498 */
1499 if (newstate->nposn > 1) {
1500 newstate->posn.a = ALLOC(rgxposn*, newstate->nposn);
1501 memcpy(newstate->posn.a, newposn,
1502 newstate->nposn * sizeof(rgxposn*));
1503 }
1504 if (newstate->nstr > 2) {
1505 newstate->str.a = ALLOC(unsigned short, newstate->nstr);
1506 memcpy(newstate->str.a, newstr,
1507 newstate->nstr * sizeof(unsigned short));
1508 }
1509 if (newstate->nposn == 0 && newstate->nstr == 0) {
1510 newstate->ntrans = 256;
1511 newstate->trans = fa->zerotrans;
1512 fa->endstates++;
1513 }
1514 fa->dfasize += 3;
1515 fa->tmpssize += 5 + newstate->nposn * 3 + newstate->nstr * 2;
1516 fa->tmppsize += size;
1517
1518 if (++fa->nstates == fa->sttsize) {
1519 /* grow table */
1520 size = state - fa->states;
1521 fa->states = REALLOC(fa->states, dfastate, fa->nstates,
1522 fa->sttsize <<= 1);
1523 state = fa->states + size;
1524 }
1525 }
1526 }
1527
1528 *p++ = n >> 8;
1529 *p++ = n;
1530 }
1531
1532 if (state->nstr != 0) {
1533 AFREE(newstr);
1534 }
1535 if (state->nposn != 0) {
1536 AFREE(newposn);
1537 }
1538 AFREE(cset - ((Uint) state->nstr << 3));
1539
1540 fa->modified = TRUE;
1541 fa->nexpanded++;
1542 fa->dfasize += fa->ecnum << 1;
1543 return state;
1544 }
1545
1546 /*
1547 * NAME: dfa->scan()
1548 * DESCRIPTION: Scan input, while lazily constructing a DFA.
1549 * Return values: [0 ..> token
1550 * -1 end of string
1551 * -2 Invalid token
1552 * -3 DFA too large (deallocate)
1553 */
dfa_scan(dfa * fa,string * str,ssizet * strlen,char ** token,ssizet * len)1554 short dfa_scan(dfa *fa, string *str, ssizet *strlen, char **token, ssizet *len)
1555 {
1556 ssizet size;
1557 unsigned short eclass;
1558 char *p, *q;
1559 dfastate *state;
1560 short final;
1561 ssizet fsize, nomatch;
1562
1563 nomatch = 0;
1564 fsize = 0;
1565 size = *strlen;
1566 *token = str->text + str->len - size;
1567
1568 while (size != 0) {
1569 state = &fa->states[1];
1570 final = -1;
1571 p = str->text + str->len - size;
1572
1573 while (size != 0) {
1574 eclass = UCHAR(fa->eclass[UCHAR(*p)]);
1575 if (state->ntrans <= eclass) {
1576 if (state->ntrans == 0) {
1577 /* expand state */
1578 if (state == fa->states) {
1579 break; /* stuck in state 0 */
1580 }
1581 state = dfa_expand(fa, state);
1582 if (fa->dfasize + fa->tmpssize + fa->tmppsize >
1583 (Uint) MAX_AUTOMSZ * USHRT_MAX) {
1584 unsigned short save;
1585
1586 /*
1587 * too much temporary data: attempt to expand
1588 * all states
1589 */
1590 save = state - fa->states;
1591 for (state = &fa->states[1];
1592 fa->nstates != fa->nexpanded + fa->endstates;
1593 state++) {
1594 if (fa->nstates > USHRT_MAX - 256 ||
1595 fa->dfasize > (Uint) MAX_AUTOMSZ * USHRT_MAX) {
1596 return DFA_TOOBIG;
1597 }
1598 if (state->ntrans == 0) {
1599 state = dfa_expand(fa, state);
1600 }
1601 }
1602 state = &fa->states[save];
1603 }
1604 if (fa->nstates > USHRT_MAX - 256 ||
1605 fa->dfasize > (Uint) MAX_AUTOMSZ * USHRT_MAX) {
1606 return DFA_TOOBIG;
1607 }
1608 eclass = UCHAR(fa->eclass[UCHAR(*p)]);
1609 } else {
1610 /* extend transition table */
1611 dfa_extend(fa, state, eclass);
1612 }
1613 }
1614
1615 /* transition */
1616 --size;
1617 p++;
1618 q = &state->trans[eclass << 1];
1619 state = &fa->states[(UCHAR(q[0]) << 8) + UCHAR(q[1])];
1620
1621 /* check if final state */
1622 if (state->final >= 0) {
1623 final = state->final;
1624 fsize = size;
1625 }
1626 }
1627
1628 if (final >= 0) {
1629 if (nomatch != 0) {
1630 if (fa->nomatch != fa->whitespace) {
1631 *len = nomatch;
1632 return fa->nomatch;
1633 }
1634 *token += nomatch;
1635 nomatch = 0;
1636 }
1637
1638 /* in a final state */
1639 size = fsize;
1640 if (final != fa->whitespace) {
1641 *len = *strlen - size;
1642 *strlen = size;
1643 return final;
1644 }
1645 /* else whitespace: continue */
1646 *token = p - 1;
1647 *strlen = size;
1648 } else if (fa->nomatch >= 0) {
1649 nomatch++;
1650 size = --*strlen;
1651 } else {
1652 return DFA_REJECT;
1653 }
1654 }
1655
1656 if (nomatch != 0 && fa->nomatch != fa->whitespace) {
1657 *len = nomatch;
1658 return fa->nomatch;
1659 }
1660
1661 return DFA_EOS;
1662 }
1663