1 /*
2 * NFA utilities.
3 * This file is #included by regcomp.c.
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * src/backend/regex/regc_nfa.c
32 *
33 *
34 * One or two things that technically ought to be in here
35 * are actually in color.c, thanks to some incestuous relationships in
36 * the color chains.
37 */
38
39 #define NISERR() VISERR(nfa->v)
40 #define NERR(e) VERR(nfa->v, (e))
41
42
43 /*
44 * newnfa - set up an NFA
45 */
46 static struct nfa * /* the NFA, or NULL */
newnfa(struct vars * v,struct colormap * cm,struct nfa * parent)47 newnfa(struct vars *v,
48 struct colormap *cm,
49 struct nfa *parent) /* NULL if primary NFA */
50 {
51 struct nfa *nfa;
52
53 nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
54 if (nfa == NULL)
55 {
56 ERR(REG_ESPACE);
57 return NULL;
58 }
59
60 /* Make the NFA minimally valid, so freenfa() will behave sanely */
61 nfa->states = NULL;
62 nfa->slast = NULL;
63 nfa->freestates = NULL;
64 nfa->freearcs = NULL;
65 nfa->lastsb = NULL;
66 nfa->lastab = NULL;
67 nfa->lastsbused = 0;
68 nfa->lastabused = 0;
69 nfa->nstates = 0;
70 nfa->cm = cm;
71 nfa->v = v;
72 nfa->bos[0] = nfa->bos[1] = COLORLESS;
73 nfa->eos[0] = nfa->eos[1] = COLORLESS;
74 nfa->flags = 0;
75 nfa->minmatchall = nfa->maxmatchall = -1;
76 nfa->parent = parent; /* Precedes newfstate so parent is valid. */
77
78 /* Create required infrastructure */
79 nfa->post = newfstate(nfa, '@'); /* number 0 */
80 nfa->pre = newfstate(nfa, '>'); /* number 1 */
81 nfa->init = newstate(nfa); /* may become invalid later */
82 nfa->final = newstate(nfa);
83 if (ISERR())
84 {
85 freenfa(nfa);
86 return NULL;
87 }
88 rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
89 newarc(nfa, '^', 1, nfa->pre, nfa->init);
90 newarc(nfa, '^', 0, nfa->pre, nfa->init);
91 rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
92 newarc(nfa, '$', 1, nfa->final, nfa->post);
93 newarc(nfa, '$', 0, nfa->final, nfa->post);
94
95 if (ISERR())
96 {
97 freenfa(nfa);
98 return NULL;
99 }
100 return nfa;
101 }
102
103 /*
104 * freenfa - free an entire NFA
105 */
106 static void
freenfa(struct nfa * nfa)107 freenfa(struct nfa *nfa)
108 {
109 struct statebatch *sb;
110 struct statebatch *sbnext;
111 struct arcbatch *ab;
112 struct arcbatch *abnext;
113
114 for (sb = nfa->lastsb; sb != NULL; sb = sbnext)
115 {
116 sbnext = sb->next;
117 nfa->v->spaceused -= STATEBATCHSIZE(sb->nstates);
118 FREE(sb);
119 }
120 nfa->lastsb = NULL;
121 for (ab = nfa->lastab; ab != NULL; ab = abnext)
122 {
123 abnext = ab->next;
124 nfa->v->spaceused -= ARCBATCHSIZE(ab->narcs);
125 FREE(ab);
126 }
127 nfa->lastab = NULL;
128
129 nfa->nstates = -1;
130 FREE(nfa);
131 }
132
133 /*
134 * newstate - allocate an NFA state, with zero flag value
135 */
136 static struct state * /* NULL on error */
newstate(struct nfa * nfa)137 newstate(struct nfa *nfa)
138 {
139 struct state *s;
140
141 /*
142 * This is a handy place to check for operation cancel during regex
143 * compilation, since no code path will go very long without making a new
144 * state or arc.
145 */
146 if (CANCEL_REQUESTED(nfa->v->re))
147 {
148 NERR(REG_CANCEL);
149 return NULL;
150 }
151
152 /* first, recycle anything that's on the freelist */
153 if (nfa->freestates != NULL)
154 {
155 s = nfa->freestates;
156 nfa->freestates = s->next;
157 }
158 /* otherwise, is there anything left in the last statebatch? */
159 else if (nfa->lastsb != NULL && nfa->lastsbused < nfa->lastsb->nstates)
160 {
161 s = &nfa->lastsb->s[nfa->lastsbused++];
162 }
163 /* otherwise, need to allocate a new statebatch */
164 else
165 {
166 struct statebatch *newSb;
167 size_t nstates;
168
169 if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
170 {
171 NERR(REG_ETOOBIG);
172 return NULL;
173 }
174 nstates = (nfa->lastsb != NULL) ? nfa->lastsb->nstates * 2 : FIRSTSBSIZE;
175 if (nstates > MAXSBSIZE)
176 nstates = MAXSBSIZE;
177 newSb = (struct statebatch *) MALLOC(STATEBATCHSIZE(nstates));
178 if (newSb == NULL)
179 {
180 NERR(REG_ESPACE);
181 return NULL;
182 }
183 nfa->v->spaceused += STATEBATCHSIZE(nstates);
184 newSb->nstates = nstates;
185 newSb->next = nfa->lastsb;
186 nfa->lastsb = newSb;
187 nfa->lastsbused = 1;
188 s = &newSb->s[0];
189 }
190
191 assert(nfa->nstates >= 0);
192 s->no = nfa->nstates++;
193 s->flag = 0;
194 if (nfa->states == NULL)
195 nfa->states = s;
196 s->nins = 0;
197 s->ins = NULL;
198 s->nouts = 0;
199 s->outs = NULL;
200 s->tmp = NULL;
201 s->next = NULL;
202 if (nfa->slast != NULL)
203 {
204 assert(nfa->slast->next == NULL);
205 nfa->slast->next = s;
206 }
207 s->prev = nfa->slast;
208 nfa->slast = s;
209 return s;
210 }
211
212 /*
213 * newfstate - allocate an NFA state with a specified flag value
214 */
215 static struct state * /* NULL on error */
newfstate(struct nfa * nfa,int flag)216 newfstate(struct nfa *nfa, int flag)
217 {
218 struct state *s;
219
220 s = newstate(nfa);
221 if (s != NULL)
222 s->flag = (char) flag;
223 return s;
224 }
225
226 /*
227 * dropstate - delete a state's inarcs and outarcs and free it
228 */
229 static void
dropstate(struct nfa * nfa,struct state * s)230 dropstate(struct nfa *nfa,
231 struct state *s)
232 {
233 struct arc *a;
234
235 while ((a = s->ins) != NULL)
236 freearc(nfa, a);
237 while ((a = s->outs) != NULL)
238 freearc(nfa, a);
239 freestate(nfa, s);
240 }
241
242 /*
243 * freestate - free a state, which has no in-arcs or out-arcs
244 */
245 static void
freestate(struct nfa * nfa,struct state * s)246 freestate(struct nfa *nfa,
247 struct state *s)
248 {
249 assert(s != NULL);
250 assert(s->nins == 0 && s->nouts == 0);
251
252 s->no = FREESTATE;
253 s->flag = 0;
254 if (s->next != NULL)
255 s->next->prev = s->prev;
256 else
257 {
258 assert(s == nfa->slast);
259 nfa->slast = s->prev;
260 }
261 if (s->prev != NULL)
262 s->prev->next = s->next;
263 else
264 {
265 assert(s == nfa->states);
266 nfa->states = s->next;
267 }
268 s->prev = NULL;
269 s->next = nfa->freestates; /* don't delete it, put it on the free list */
270 nfa->freestates = s;
271 }
272
273 /*
274 * newarc - set up a new arc within an NFA
275 *
276 * This function checks to make sure that no duplicate arcs are created.
277 * In general we never want duplicates.
278 *
279 * However: in principle, a RAINBOW arc is redundant with any plain arc
280 * (unless that arc is for a pseudocolor). But we don't try to recognize
281 * that redundancy, either here or in allied operations such as moveins().
282 * The pseudocolor consideration makes that more costly than it seems worth.
283 */
284 static void
newarc(struct nfa * nfa,int t,color co,struct state * from,struct state * to)285 newarc(struct nfa *nfa,
286 int t,
287 color co,
288 struct state *from,
289 struct state *to)
290 {
291 struct arc *a;
292
293 assert(from != NULL && to != NULL);
294
295 /*
296 * This is a handy place to check for operation cancel during regex
297 * compilation, since no code path will go very long without making a new
298 * state or arc.
299 */
300 if (CANCEL_REQUESTED(nfa->v->re))
301 {
302 NERR(REG_CANCEL);
303 return;
304 }
305
306 /* check for duplicate arc, using whichever chain is shorter */
307 if (from->nouts <= to->nins)
308 {
309 for (a = from->outs; a != NULL; a = a->outchain)
310 if (a->to == to && a->co == co && a->type == t)
311 return;
312 }
313 else
314 {
315 for (a = to->ins; a != NULL; a = a->inchain)
316 if (a->from == from && a->co == co && a->type == t)
317 return;
318 }
319
320 /* no dup, so create the arc */
321 createarc(nfa, t, co, from, to);
322 }
323
324 /*
325 * createarc - create a new arc within an NFA
326 *
327 * This function must *only* be used after verifying that there is no existing
328 * identical arc (same type/color/from/to).
329 */
330 static void
createarc(struct nfa * nfa,int t,color co,struct state * from,struct state * to)331 createarc(struct nfa *nfa,
332 int t,
333 color co,
334 struct state *from,
335 struct state *to)
336 {
337 struct arc *a;
338
339 a = allocarc(nfa);
340 if (NISERR())
341 return;
342 assert(a != NULL);
343
344 a->type = t;
345 a->co = co;
346 a->to = to;
347 a->from = from;
348
349 /*
350 * Put the new arc on the beginning, not the end, of the chains; it's
351 * simpler here, and freearc() is the same cost either way. See also the
352 * logic in moveins() and its cohorts, as well as fixempties().
353 */
354 a->inchain = to->ins;
355 a->inchainRev = NULL;
356 if (to->ins)
357 to->ins->inchainRev = a;
358 to->ins = a;
359 a->outchain = from->outs;
360 a->outchainRev = NULL;
361 if (from->outs)
362 from->outs->outchainRev = a;
363 from->outs = a;
364
365 from->nouts++;
366 to->nins++;
367
368 if (COLORED(a) && nfa->parent == NULL)
369 colorchain(nfa->cm, a);
370 }
371
372 /*
373 * allocarc - allocate a new arc within an NFA
374 */
375 static struct arc * /* NULL for failure */
allocarc(struct nfa * nfa)376 allocarc(struct nfa *nfa)
377 {
378 struct arc *a;
379
380 /* first, recycle anything that's on the freelist */
381 if (nfa->freearcs != NULL)
382 {
383 a = nfa->freearcs;
384 nfa->freearcs = a->freechain;
385 }
386 /* otherwise, is there anything left in the last arcbatch? */
387 else if (nfa->lastab != NULL && nfa->lastabused < nfa->lastab->narcs)
388 {
389 a = &nfa->lastab->a[nfa->lastabused++];
390 }
391 /* otherwise, need to allocate a new arcbatch */
392 else
393 {
394 struct arcbatch *newAb;
395 size_t narcs;
396
397 if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
398 {
399 NERR(REG_ETOOBIG);
400 return NULL;
401 }
402 narcs = (nfa->lastab != NULL) ? nfa->lastab->narcs * 2 : FIRSTABSIZE;
403 if (narcs > MAXABSIZE)
404 narcs = MAXABSIZE;
405 newAb = (struct arcbatch *) MALLOC(ARCBATCHSIZE(narcs));
406 if (newAb == NULL)
407 {
408 NERR(REG_ESPACE);
409 return NULL;
410 }
411 nfa->v->spaceused += ARCBATCHSIZE(narcs);
412 newAb->narcs = narcs;
413 newAb->next = nfa->lastab;
414 nfa->lastab = newAb;
415 nfa->lastabused = 1;
416 a = &newAb->a[0];
417 }
418
419 return a;
420 }
421
422 /*
423 * freearc - free an arc
424 */
425 static void
freearc(struct nfa * nfa,struct arc * victim)426 freearc(struct nfa *nfa,
427 struct arc *victim)
428 {
429 struct state *from = victim->from;
430 struct state *to = victim->to;
431 struct arc *predecessor;
432
433 assert(victim->type != 0);
434
435 /* take it off color chain if necessary */
436 if (COLORED(victim) && nfa->parent == NULL)
437 uncolorchain(nfa->cm, victim);
438
439 /* take it off source's out-chain */
440 assert(from != NULL);
441 predecessor = victim->outchainRev;
442 if (predecessor == NULL)
443 {
444 assert(from->outs == victim);
445 from->outs = victim->outchain;
446 }
447 else
448 {
449 assert(predecessor->outchain == victim);
450 predecessor->outchain = victim->outchain;
451 }
452 if (victim->outchain != NULL)
453 {
454 assert(victim->outchain->outchainRev == victim);
455 victim->outchain->outchainRev = predecessor;
456 }
457 from->nouts--;
458
459 /* take it off target's in-chain */
460 assert(to != NULL);
461 predecessor = victim->inchainRev;
462 if (predecessor == NULL)
463 {
464 assert(to->ins == victim);
465 to->ins = victim->inchain;
466 }
467 else
468 {
469 assert(predecessor->inchain == victim);
470 predecessor->inchain = victim->inchain;
471 }
472 if (victim->inchain != NULL)
473 {
474 assert(victim->inchain->inchainRev == victim);
475 victim->inchain->inchainRev = predecessor;
476 }
477 to->nins--;
478
479 /* clean up and place on NFA's free list */
480 victim->type = 0;
481 victim->from = NULL; /* precautions... */
482 victim->to = NULL;
483 victim->inchain = NULL;
484 victim->inchainRev = NULL;
485 victim->outchain = NULL;
486 victim->outchainRev = NULL;
487 victim->freechain = nfa->freearcs;
488 nfa->freearcs = victim;
489 }
490
491 /*
492 * changearcsource - flip an arc to have a different from state
493 *
494 * Caller must have verified that there is no pre-existing duplicate arc.
495 */
496 static void
changearcsource(struct arc * a,struct state * newfrom)497 changearcsource(struct arc *a, struct state *newfrom)
498 {
499 struct state *oldfrom = a->from;
500 struct arc *predecessor;
501
502 assert(oldfrom != newfrom);
503
504 /* take it off old source's out-chain */
505 assert(oldfrom != NULL);
506 predecessor = a->outchainRev;
507 if (predecessor == NULL)
508 {
509 assert(oldfrom->outs == a);
510 oldfrom->outs = a->outchain;
511 }
512 else
513 {
514 assert(predecessor->outchain == a);
515 predecessor->outchain = a->outchain;
516 }
517 if (a->outchain != NULL)
518 {
519 assert(a->outchain->outchainRev == a);
520 a->outchain->outchainRev = predecessor;
521 }
522 oldfrom->nouts--;
523
524 a->from = newfrom;
525
526 /* prepend it to new source's out-chain */
527 a->outchain = newfrom->outs;
528 a->outchainRev = NULL;
529 if (newfrom->outs)
530 newfrom->outs->outchainRev = a;
531 newfrom->outs = a;
532 newfrom->nouts++;
533 }
534
535 /*
536 * changearctarget - flip an arc to have a different to state
537 *
538 * Caller must have verified that there is no pre-existing duplicate arc.
539 */
540 static void
changearctarget(struct arc * a,struct state * newto)541 changearctarget(struct arc *a, struct state *newto)
542 {
543 struct state *oldto = a->to;
544 struct arc *predecessor;
545
546 assert(oldto != newto);
547
548 /* take it off old target's in-chain */
549 assert(oldto != NULL);
550 predecessor = a->inchainRev;
551 if (predecessor == NULL)
552 {
553 assert(oldto->ins == a);
554 oldto->ins = a->inchain;
555 }
556 else
557 {
558 assert(predecessor->inchain == a);
559 predecessor->inchain = a->inchain;
560 }
561 if (a->inchain != NULL)
562 {
563 assert(a->inchain->inchainRev == a);
564 a->inchain->inchainRev = predecessor;
565 }
566 oldto->nins--;
567
568 a->to = newto;
569
570 /* prepend it to new target's in-chain */
571 a->inchain = newto->ins;
572 a->inchainRev = NULL;
573 if (newto->ins)
574 newto->ins->inchainRev = a;
575 newto->ins = a;
576 newto->nins++;
577 }
578
579 /*
580 * hasnonemptyout - Does state have a non-EMPTY out arc?
581 */
582 static int
hasnonemptyout(struct state * s)583 hasnonemptyout(struct state *s)
584 {
585 struct arc *a;
586
587 for (a = s->outs; a != NULL; a = a->outchain)
588 {
589 if (a->type != EMPTY)
590 return 1;
591 }
592 return 0;
593 }
594
595 /*
596 * findarc - find arc, if any, from given source with given type and color
597 * If there is more than one such arc, the result is random.
598 */
599 static struct arc *
findarc(struct state * s,int type,color co)600 findarc(struct state *s,
601 int type,
602 color co)
603 {
604 struct arc *a;
605
606 for (a = s->outs; a != NULL; a = a->outchain)
607 if (a->type == type && a->co == co)
608 return a;
609 return NULL;
610 }
611
612 /*
613 * cparc - allocate a new arc within an NFA, copying details from old one
614 */
615 static void
cparc(struct nfa * nfa,struct arc * oa,struct state * from,struct state * to)616 cparc(struct nfa *nfa,
617 struct arc *oa,
618 struct state *from,
619 struct state *to)
620 {
621 newarc(nfa, oa->type, oa->co, from, to);
622 }
623
624 /*
625 * sortins - sort the in arcs of a state by from/color/type
626 */
627 static void
sortins(struct nfa * nfa,struct state * s)628 sortins(struct nfa *nfa,
629 struct state *s)
630 {
631 struct arc **sortarray;
632 struct arc *a;
633 int n = s->nins;
634 int i;
635
636 if (n <= 1)
637 return; /* nothing to do */
638 /* make an array of arc pointers ... */
639 sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
640 if (sortarray == NULL)
641 {
642 NERR(REG_ESPACE);
643 return;
644 }
645 i = 0;
646 for (a = s->ins; a != NULL; a = a->inchain)
647 sortarray[i++] = a;
648 assert(i == n);
649 /* ... sort the array */
650 qsort(sortarray, n, sizeof(struct arc *), sortins_cmp);
651 /* ... and rebuild arc list in order */
652 /* it seems worth special-casing first and last items to simplify loop */
653 a = sortarray[0];
654 s->ins = a;
655 a->inchain = sortarray[1];
656 a->inchainRev = NULL;
657 for (i = 1; i < n - 1; i++)
658 {
659 a = sortarray[i];
660 a->inchain = sortarray[i + 1];
661 a->inchainRev = sortarray[i - 1];
662 }
663 a = sortarray[i];
664 a->inchain = NULL;
665 a->inchainRev = sortarray[i - 1];
666 FREE(sortarray);
667 }
668
669 static int
sortins_cmp(const void * a,const void * b)670 sortins_cmp(const void *a, const void *b)
671 {
672 const struct arc *aa = *((const struct arc *const *) a);
673 const struct arc *bb = *((const struct arc *const *) b);
674
675 /* we check the fields in the order they are most likely to be different */
676 if (aa->from->no < bb->from->no)
677 return -1;
678 if (aa->from->no > bb->from->no)
679 return 1;
680 if (aa->co < bb->co)
681 return -1;
682 if (aa->co > bb->co)
683 return 1;
684 if (aa->type < bb->type)
685 return -1;
686 if (aa->type > bb->type)
687 return 1;
688 return 0;
689 }
690
691 /*
692 * sortouts - sort the out arcs of a state by to/color/type
693 */
694 static void
sortouts(struct nfa * nfa,struct state * s)695 sortouts(struct nfa *nfa,
696 struct state *s)
697 {
698 struct arc **sortarray;
699 struct arc *a;
700 int n = s->nouts;
701 int i;
702
703 if (n <= 1)
704 return; /* nothing to do */
705 /* make an array of arc pointers ... */
706 sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
707 if (sortarray == NULL)
708 {
709 NERR(REG_ESPACE);
710 return;
711 }
712 i = 0;
713 for (a = s->outs; a != NULL; a = a->outchain)
714 sortarray[i++] = a;
715 assert(i == n);
716 /* ... sort the array */
717 qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp);
718 /* ... and rebuild arc list in order */
719 /* it seems worth special-casing first and last items to simplify loop */
720 a = sortarray[0];
721 s->outs = a;
722 a->outchain = sortarray[1];
723 a->outchainRev = NULL;
724 for (i = 1; i < n - 1; i++)
725 {
726 a = sortarray[i];
727 a->outchain = sortarray[i + 1];
728 a->outchainRev = sortarray[i - 1];
729 }
730 a = sortarray[i];
731 a->outchain = NULL;
732 a->outchainRev = sortarray[i - 1];
733 FREE(sortarray);
734 }
735
736 static int
sortouts_cmp(const void * a,const void * b)737 sortouts_cmp(const void *a, const void *b)
738 {
739 const struct arc *aa = *((const struct arc *const *) a);
740 const struct arc *bb = *((const struct arc *const *) b);
741
742 /* we check the fields in the order they are most likely to be different */
743 if (aa->to->no < bb->to->no)
744 return -1;
745 if (aa->to->no > bb->to->no)
746 return 1;
747 if (aa->co < bb->co)
748 return -1;
749 if (aa->co > bb->co)
750 return 1;
751 if (aa->type < bb->type)
752 return -1;
753 if (aa->type > bb->type)
754 return 1;
755 return 0;
756 }
757
758 /*
759 * Common decision logic about whether to use arc-by-arc operations or
760 * sort/merge. If there's just a few source arcs we cannot recoup the
761 * cost of sorting the destination arc list, no matter how large it is.
762 * Otherwise, limit the number of arc-by-arc comparisons to about 1000
763 * (a somewhat arbitrary choice, but the breakeven point would probably
764 * be machine dependent anyway).
765 */
766 #define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
767 ((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32))
768
769 /*
770 * moveins - move all in arcs of a state to another state
771 *
772 * You might think this could be done better by just updating the
773 * existing arcs, and you would be right if it weren't for the need
774 * for duplicate suppression, which makes it easier to just make new
775 * ones to exploit the suppression built into newarc.
776 *
777 * However, if we have a whole lot of arcs to deal with, retail duplicate
778 * checks become too slow. In that case we proceed by sorting and merging
779 * the arc lists, and then we can indeed just update the arcs in-place.
780 */
781 static void
moveins(struct nfa * nfa,struct state * oldState,struct state * newState)782 moveins(struct nfa *nfa,
783 struct state *oldState,
784 struct state *newState)
785 {
786 assert(oldState != newState);
787
788 if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
789 {
790 /* With not too many arcs, just do them one at a time */
791 struct arc *a;
792
793 while ((a = oldState->ins) != NULL)
794 {
795 cparc(nfa, a, a->from, newState);
796 freearc(nfa, a);
797 }
798 }
799 else
800 {
801 /*
802 * With many arcs, use a sort-merge approach. Note changearctarget()
803 * will put the arc onto the front of newState's chain, so it does not
804 * break our walk through the sorted part of the chain.
805 */
806 struct arc *oa;
807 struct arc *na;
808
809 /*
810 * Because we bypass newarc() in this code path, we'd better include a
811 * cancel check.
812 */
813 if (CANCEL_REQUESTED(nfa->v->re))
814 {
815 NERR(REG_CANCEL);
816 return;
817 }
818
819 sortins(nfa, oldState);
820 sortins(nfa, newState);
821 if (NISERR())
822 return; /* might have failed to sort */
823 oa = oldState->ins;
824 na = newState->ins;
825 while (oa != NULL && na != NULL)
826 {
827 struct arc *a = oa;
828
829 switch (sortins_cmp(&oa, &na))
830 {
831 case -1:
832 /* newState does not have anything matching oa */
833 oa = oa->inchain;
834
835 /*
836 * Rather than doing createarc+freearc, we can just unlink
837 * and relink the existing arc struct.
838 */
839 changearctarget(a, newState);
840 break;
841 case 0:
842 /* match, advance in both lists */
843 oa = oa->inchain;
844 na = na->inchain;
845 /* ... and drop duplicate arc from oldState */
846 freearc(nfa, a);
847 break;
848 case +1:
849 /* advance only na; oa might have a match later */
850 na = na->inchain;
851 break;
852 default:
853 assert(NOTREACHED);
854 }
855 }
856 while (oa != NULL)
857 {
858 /* newState does not have anything matching oa */
859 struct arc *a = oa;
860
861 oa = oa->inchain;
862 changearctarget(a, newState);
863 }
864 }
865
866 assert(oldState->nins == 0);
867 assert(oldState->ins == NULL);
868 }
869
870 /*
871 * copyins - copy in arcs of a state to another state
872 */
873 static void
copyins(struct nfa * nfa,struct state * oldState,struct state * newState)874 copyins(struct nfa *nfa,
875 struct state *oldState,
876 struct state *newState)
877 {
878 assert(oldState != newState);
879
880 if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
881 {
882 /* With not too many arcs, just do them one at a time */
883 struct arc *a;
884
885 for (a = oldState->ins; a != NULL; a = a->inchain)
886 cparc(nfa, a, a->from, newState);
887 }
888 else
889 {
890 /*
891 * With many arcs, use a sort-merge approach. Note that createarc()
892 * will put new arcs onto the front of newState's chain, so it does
893 * not break our walk through the sorted part of the chain.
894 */
895 struct arc *oa;
896 struct arc *na;
897
898 /*
899 * Because we bypass newarc() in this code path, we'd better include a
900 * cancel check.
901 */
902 if (CANCEL_REQUESTED(nfa->v->re))
903 {
904 NERR(REG_CANCEL);
905 return;
906 }
907
908 sortins(nfa, oldState);
909 sortins(nfa, newState);
910 if (NISERR())
911 return; /* might have failed to sort */
912 oa = oldState->ins;
913 na = newState->ins;
914 while (oa != NULL && na != NULL)
915 {
916 struct arc *a = oa;
917
918 switch (sortins_cmp(&oa, &na))
919 {
920 case -1:
921 /* newState does not have anything matching oa */
922 oa = oa->inchain;
923 createarc(nfa, a->type, a->co, a->from, newState);
924 break;
925 case 0:
926 /* match, advance in both lists */
927 oa = oa->inchain;
928 na = na->inchain;
929 break;
930 case +1:
931 /* advance only na; oa might have a match later */
932 na = na->inchain;
933 break;
934 default:
935 assert(NOTREACHED);
936 }
937 }
938 while (oa != NULL)
939 {
940 /* newState does not have anything matching oa */
941 struct arc *a = oa;
942
943 oa = oa->inchain;
944 createarc(nfa, a->type, a->co, a->from, newState);
945 }
946 }
947 }
948
949 /*
950 * mergeins - merge a list of inarcs into a state
951 *
952 * This is much like copyins, but the source arcs are listed in an array,
953 * and are not guaranteed unique. It's okay to clobber the array contents.
954 */
955 static void
mergeins(struct nfa * nfa,struct state * s,struct arc ** arcarray,int arccount)956 mergeins(struct nfa *nfa,
957 struct state *s,
958 struct arc **arcarray,
959 int arccount)
960 {
961 struct arc *na;
962 int i;
963 int j;
964
965 if (arccount <= 0)
966 return;
967
968 /*
969 * Because we bypass newarc() in this code path, we'd better include a
970 * cancel check.
971 */
972 if (CANCEL_REQUESTED(nfa->v->re))
973 {
974 NERR(REG_CANCEL);
975 return;
976 }
977
978 /* Sort existing inarcs as well as proposed new ones */
979 sortins(nfa, s);
980 if (NISERR())
981 return; /* might have failed to sort */
982
983 qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
984
985 /*
986 * arcarray very likely includes dups, so we must eliminate them. (This
987 * could be folded into the next loop, but it's not worth the trouble.)
988 */
989 j = 0;
990 for (i = 1; i < arccount; i++)
991 {
992 switch (sortins_cmp(&arcarray[j], &arcarray[i]))
993 {
994 case -1:
995 /* non-dup */
996 arcarray[++j] = arcarray[i];
997 break;
998 case 0:
999 /* dup */
1000 break;
1001 default:
1002 /* trouble */
1003 assert(NOTREACHED);
1004 }
1005 }
1006 arccount = j + 1;
1007
1008 /*
1009 * Now merge into s' inchain. Note that createarc() will put new arcs
1010 * onto the front of s's chain, so it does not break our walk through the
1011 * sorted part of the chain.
1012 */
1013 i = 0;
1014 na = s->ins;
1015 while (i < arccount && na != NULL)
1016 {
1017 struct arc *a = arcarray[i];
1018
1019 switch (sortins_cmp(&a, &na))
1020 {
1021 case -1:
1022 /* s does not have anything matching a */
1023 createarc(nfa, a->type, a->co, a->from, s);
1024 i++;
1025 break;
1026 case 0:
1027 /* match, advance in both lists */
1028 i++;
1029 na = na->inchain;
1030 break;
1031 case +1:
1032 /* advance only na; array might have a match later */
1033 na = na->inchain;
1034 break;
1035 default:
1036 assert(NOTREACHED);
1037 }
1038 }
1039 while (i < arccount)
1040 {
1041 /* s does not have anything matching a */
1042 struct arc *a = arcarray[i];
1043
1044 createarc(nfa, a->type, a->co, a->from, s);
1045 i++;
1046 }
1047 }
1048
1049 /*
1050 * moveouts - move all out arcs of a state to another state
1051 *
1052 * See comments for moveins()
1053 */
1054 static void
moveouts(struct nfa * nfa,struct state * oldState,struct state * newState)1055 moveouts(struct nfa *nfa,
1056 struct state *oldState,
1057 struct state *newState)
1058 {
1059 assert(oldState != newState);
1060
1061 if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1062 {
1063 /* With not too many arcs, just do them one at a time */
1064 struct arc *a;
1065
1066 while ((a = oldState->outs) != NULL)
1067 {
1068 cparc(nfa, a, newState, a->to);
1069 freearc(nfa, a);
1070 }
1071 }
1072 else
1073 {
1074 /*
1075 * With many arcs, use a sort-merge approach. Note changearcsource()
1076 * will put the arc onto the front of newState's chain, so it does not
1077 * break our walk through the sorted part of the chain.
1078 */
1079 struct arc *oa;
1080 struct arc *na;
1081
1082 /*
1083 * Because we bypass newarc() in this code path, we'd better include a
1084 * cancel check.
1085 */
1086 if (CANCEL_REQUESTED(nfa->v->re))
1087 {
1088 NERR(REG_CANCEL);
1089 return;
1090 }
1091
1092 sortouts(nfa, oldState);
1093 sortouts(nfa, newState);
1094 if (NISERR())
1095 return; /* might have failed to sort */
1096 oa = oldState->outs;
1097 na = newState->outs;
1098 while (oa != NULL && na != NULL)
1099 {
1100 struct arc *a = oa;
1101
1102 switch (sortouts_cmp(&oa, &na))
1103 {
1104 case -1:
1105 /* newState does not have anything matching oa */
1106 oa = oa->outchain;
1107
1108 /*
1109 * Rather than doing createarc+freearc, we can just unlink
1110 * and relink the existing arc struct.
1111 */
1112 changearcsource(a, newState);
1113 break;
1114 case 0:
1115 /* match, advance in both lists */
1116 oa = oa->outchain;
1117 na = na->outchain;
1118 /* ... and drop duplicate arc from oldState */
1119 freearc(nfa, a);
1120 break;
1121 case +1:
1122 /* advance only na; oa might have a match later */
1123 na = na->outchain;
1124 break;
1125 default:
1126 assert(NOTREACHED);
1127 }
1128 }
1129 while (oa != NULL)
1130 {
1131 /* newState does not have anything matching oa */
1132 struct arc *a = oa;
1133
1134 oa = oa->outchain;
1135 changearcsource(a, newState);
1136 }
1137 }
1138
1139 assert(oldState->nouts == 0);
1140 assert(oldState->outs == NULL);
1141 }
1142
1143 /*
1144 * copyouts - copy out arcs of a state to another state
1145 */
1146 static void
copyouts(struct nfa * nfa,struct state * oldState,struct state * newState)1147 copyouts(struct nfa *nfa,
1148 struct state *oldState,
1149 struct state *newState)
1150 {
1151 assert(oldState != newState);
1152
1153 if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1154 {
1155 /* With not too many arcs, just do them one at a time */
1156 struct arc *a;
1157
1158 for (a = oldState->outs; a != NULL; a = a->outchain)
1159 cparc(nfa, a, newState, a->to);
1160 }
1161 else
1162 {
1163 /*
1164 * With many arcs, use a sort-merge approach. Note that createarc()
1165 * will put new arcs onto the front of newState's chain, so it does
1166 * not break our walk through the sorted part of the chain.
1167 */
1168 struct arc *oa;
1169 struct arc *na;
1170
1171 /*
1172 * Because we bypass newarc() in this code path, we'd better include a
1173 * cancel check.
1174 */
1175 if (CANCEL_REQUESTED(nfa->v->re))
1176 {
1177 NERR(REG_CANCEL);
1178 return;
1179 }
1180
1181 sortouts(nfa, oldState);
1182 sortouts(nfa, newState);
1183 if (NISERR())
1184 return; /* might have failed to sort */
1185 oa = oldState->outs;
1186 na = newState->outs;
1187 while (oa != NULL && na != NULL)
1188 {
1189 struct arc *a = oa;
1190
1191 switch (sortouts_cmp(&oa, &na))
1192 {
1193 case -1:
1194 /* newState does not have anything matching oa */
1195 oa = oa->outchain;
1196 createarc(nfa, a->type, a->co, newState, a->to);
1197 break;
1198 case 0:
1199 /* match, advance in both lists */
1200 oa = oa->outchain;
1201 na = na->outchain;
1202 break;
1203 case +1:
1204 /* advance only na; oa might have a match later */
1205 na = na->outchain;
1206 break;
1207 default:
1208 assert(NOTREACHED);
1209 }
1210 }
1211 while (oa != NULL)
1212 {
1213 /* newState does not have anything matching oa */
1214 struct arc *a = oa;
1215
1216 oa = oa->outchain;
1217 createarc(nfa, a->type, a->co, newState, a->to);
1218 }
1219 }
1220 }
1221
1222 /*
1223 * cloneouts - copy out arcs of a state to another state pair, modifying type
1224 *
1225 * This is only used to convert PLAIN arcs to AHEAD/BEHIND arcs, which share
1226 * the same interpretation of "co". It wouldn't be sensible with LACONs.
1227 */
1228 static void
cloneouts(struct nfa * nfa,struct state * old,struct state * from,struct state * to,int type)1229 cloneouts(struct nfa *nfa,
1230 struct state *old,
1231 struct state *from,
1232 struct state *to,
1233 int type)
1234 {
1235 struct arc *a;
1236
1237 assert(old != from);
1238 assert(type == AHEAD || type == BEHIND);
1239
1240 for (a = old->outs; a != NULL; a = a->outchain)
1241 {
1242 assert(a->type == PLAIN);
1243 newarc(nfa, type, a->co, from, to);
1244 }
1245 }
1246
1247 /*
1248 * delsub - delete a sub-NFA, updating subre pointers if necessary
1249 *
1250 * This uses a recursive traversal of the sub-NFA, marking already-seen
1251 * states using their tmp pointer.
1252 */
1253 static void
delsub(struct nfa * nfa,struct state * lp,struct state * rp)1254 delsub(struct nfa *nfa,
1255 struct state *lp, /* the sub-NFA goes from here... */
1256 struct state *rp) /* ...to here, *not* inclusive */
1257 {
1258 assert(lp != rp);
1259
1260 rp->tmp = rp; /* mark end */
1261
1262 deltraverse(nfa, lp, lp);
1263 if (NISERR())
1264 return; /* asserts might not hold after failure */
1265 assert(lp->nouts == 0 && rp->nins == 0); /* did the job */
1266 assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
1267
1268 rp->tmp = NULL; /* unmark end */
1269 lp->tmp = NULL; /* and begin, marked by deltraverse */
1270 }
1271
1272 /*
1273 * deltraverse - the recursive heart of delsub
1274 * This routine's basic job is to destroy all out-arcs of the state.
1275 */
1276 static void
deltraverse(struct nfa * nfa,struct state * leftend,struct state * s)1277 deltraverse(struct nfa *nfa,
1278 struct state *leftend,
1279 struct state *s)
1280 {
1281 struct arc *a;
1282 struct state *to;
1283
1284 /* Since this is recursive, it could be driven to stack overflow */
1285 if (STACK_TOO_DEEP(nfa->v->re))
1286 {
1287 NERR(REG_ETOOBIG);
1288 return;
1289 }
1290
1291 if (s->nouts == 0)
1292 return; /* nothing to do */
1293 if (s->tmp != NULL)
1294 return; /* already in progress */
1295
1296 s->tmp = s; /* mark as in progress */
1297
1298 while ((a = s->outs) != NULL)
1299 {
1300 to = a->to;
1301 deltraverse(nfa, leftend, to);
1302 if (NISERR())
1303 return; /* asserts might not hold after failure */
1304 assert(to->nouts == 0 || to->tmp != NULL);
1305 freearc(nfa, a);
1306 if (to->nins == 0 && to->tmp == NULL)
1307 {
1308 assert(to->nouts == 0);
1309 freestate(nfa, to);
1310 }
1311 }
1312
1313 assert(s->no != FREESTATE); /* we're still here */
1314 assert(s == leftend || s->nins != 0); /* and still reachable */
1315 assert(s->nouts == 0); /* but have no outarcs */
1316
1317 s->tmp = NULL; /* we're done here */
1318 }
1319
1320 /*
1321 * dupnfa - duplicate sub-NFA
1322 *
1323 * Another recursive traversal, this time using tmp to point to duplicates
1324 * as well as mark already-seen states. (You knew there was a reason why
1325 * it's a state pointer, didn't you? :-))
1326 */
1327 static void
dupnfa(struct nfa * nfa,struct state * start,struct state * stop,struct state * from,struct state * to)1328 dupnfa(struct nfa *nfa,
1329 struct state *start, /* duplicate of subNFA starting here */
1330 struct state *stop, /* and stopping here */
1331 struct state *from, /* stringing duplicate from here */
1332 struct state *to) /* to here */
1333 {
1334 if (start == stop)
1335 {
1336 newarc(nfa, EMPTY, 0, from, to);
1337 return;
1338 }
1339
1340 stop->tmp = to;
1341 duptraverse(nfa, start, from);
1342 /* done, except for clearing out the tmp pointers */
1343
1344 stop->tmp = NULL;
1345 cleartraverse(nfa, start);
1346 }
1347
1348 /*
1349 * duptraverse - recursive heart of dupnfa
1350 */
1351 static void
duptraverse(struct nfa * nfa,struct state * s,struct state * stmp)1352 duptraverse(struct nfa *nfa,
1353 struct state *s,
1354 struct state *stmp) /* s's duplicate, or NULL */
1355 {
1356 struct arc *a;
1357
1358 /* Since this is recursive, it could be driven to stack overflow */
1359 if (STACK_TOO_DEEP(nfa->v->re))
1360 {
1361 NERR(REG_ETOOBIG);
1362 return;
1363 }
1364
1365 if (s->tmp != NULL)
1366 return; /* already done */
1367
1368 s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
1369 if (s->tmp == NULL)
1370 {
1371 assert(NISERR());
1372 return;
1373 }
1374
1375 for (a = s->outs; a != NULL && !NISERR(); a = a->outchain)
1376 {
1377 duptraverse(nfa, a->to, (struct state *) NULL);
1378 if (NISERR())
1379 break;
1380 assert(a->to->tmp != NULL);
1381 cparc(nfa, a, s->tmp, a->to->tmp);
1382 }
1383 }
1384
1385 /*
1386 * removeconstraints - remove any constraints in an NFA
1387 *
1388 * Constraint arcs are replaced by empty arcs, essentially treating all
1389 * constraints as automatically satisfied.
1390 */
1391 static void
removeconstraints(struct nfa * nfa,struct state * start,struct state * stop)1392 removeconstraints(struct nfa *nfa,
1393 struct state *start, /* process subNFA starting here */
1394 struct state *stop) /* and stopping here */
1395 {
1396 if (start == stop)
1397 return;
1398
1399 stop->tmp = stop;
1400 removetraverse(nfa, start);
1401 /* done, except for clearing out the tmp pointers */
1402
1403 stop->tmp = NULL;
1404 cleartraverse(nfa, start);
1405 }
1406
1407 /*
1408 * removetraverse - recursive heart of removeconstraints
1409 */
1410 static void
removetraverse(struct nfa * nfa,struct state * s)1411 removetraverse(struct nfa *nfa,
1412 struct state *s)
1413 {
1414 struct arc *a;
1415 struct arc *oa;
1416
1417 /* Since this is recursive, it could be driven to stack overflow */
1418 if (STACK_TOO_DEEP(nfa->v->re))
1419 {
1420 NERR(REG_ETOOBIG);
1421 return;
1422 }
1423
1424 if (s->tmp != NULL)
1425 return; /* already done */
1426
1427 s->tmp = s;
1428 for (a = s->outs; a != NULL && !NISERR(); a = oa)
1429 {
1430 removetraverse(nfa, a->to);
1431 if (NISERR())
1432 break;
1433 oa = a->outchain;
1434 switch (a->type)
1435 {
1436 case PLAIN:
1437 case EMPTY:
1438 /* nothing to do */
1439 break;
1440 case AHEAD:
1441 case BEHIND:
1442 case '^':
1443 case '$':
1444 case LACON:
1445 /* replace it */
1446 newarc(nfa, EMPTY, 0, s, a->to);
1447 freearc(nfa, a);
1448 break;
1449 default:
1450 NERR(REG_ASSERT);
1451 break;
1452 }
1453 }
1454 }
1455
1456 /*
1457 * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
1458 */
1459 static void
cleartraverse(struct nfa * nfa,struct state * s)1460 cleartraverse(struct nfa *nfa,
1461 struct state *s)
1462 {
1463 struct arc *a;
1464
1465 /* Since this is recursive, it could be driven to stack overflow */
1466 if (STACK_TOO_DEEP(nfa->v->re))
1467 {
1468 NERR(REG_ETOOBIG);
1469 return;
1470 }
1471
1472 if (s->tmp == NULL)
1473 return;
1474 s->tmp = NULL;
1475
1476 for (a = s->outs; a != NULL; a = a->outchain)
1477 cleartraverse(nfa, a->to);
1478 }
1479
1480 /*
1481 * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
1482 *
1483 * If traversing from s1 to s2 requires a single PLAIN match (possibly of any
1484 * of a set of colors), return a state whose outarc list contains only PLAIN
1485 * arcs of those color(s). Otherwise return NULL.
1486 *
1487 * This is used before optimizing the NFA, so there may be EMPTY arcs, which
1488 * we should ignore; the possibility of an EMPTY is why the result state could
1489 * be different from s1.
1490 *
1491 * It's worth troubling to handle multiple parallel PLAIN arcs here because a
1492 * bracket construct such as [abc] might yield either one or several parallel
1493 * PLAIN arcs depending on earlier atoms in the expression. We'd rather that
1494 * that implementation detail not create user-visible performance differences.
1495 */
1496 static struct state *
single_color_transition(struct state * s1,struct state * s2)1497 single_color_transition(struct state *s1, struct state *s2)
1498 {
1499 struct arc *a;
1500
1501 /* Ignore leading EMPTY arc, if any */
1502 if (s1->nouts == 1 && s1->outs->type == EMPTY)
1503 s1 = s1->outs->to;
1504 /* Likewise for any trailing EMPTY arc */
1505 if (s2->nins == 1 && s2->ins->type == EMPTY)
1506 s2 = s2->ins->from;
1507 /* Perhaps we could have a single-state loop in between, if so reject */
1508 if (s1 == s2)
1509 return NULL;
1510 /* s1 must have at least one outarc... */
1511 if (s1->outs == NULL)
1512 return NULL;
1513 /* ... and they must all be PLAIN arcs to s2 */
1514 for (a = s1->outs; a != NULL; a = a->outchain)
1515 {
1516 if (a->type != PLAIN || a->to != s2)
1517 return NULL;
1518 }
1519 /* OK, return s1 as the possessor of the relevant outarcs */
1520 return s1;
1521 }
1522
1523 /*
1524 * specialcolors - fill in special colors for an NFA
1525 */
1526 static void
specialcolors(struct nfa * nfa)1527 specialcolors(struct nfa *nfa)
1528 {
1529 /* false colors for BOS, BOL, EOS, EOL */
1530 if (nfa->parent == NULL)
1531 {
1532 nfa->bos[0] = pseudocolor(nfa->cm);
1533 nfa->bos[1] = pseudocolor(nfa->cm);
1534 nfa->eos[0] = pseudocolor(nfa->cm);
1535 nfa->eos[1] = pseudocolor(nfa->cm);
1536 }
1537 else
1538 {
1539 assert(nfa->parent->bos[0] != COLORLESS);
1540 nfa->bos[0] = nfa->parent->bos[0];
1541 assert(nfa->parent->bos[1] != COLORLESS);
1542 nfa->bos[1] = nfa->parent->bos[1];
1543 assert(nfa->parent->eos[0] != COLORLESS);
1544 nfa->eos[0] = nfa->parent->eos[0];
1545 assert(nfa->parent->eos[1] != COLORLESS);
1546 nfa->eos[1] = nfa->parent->eos[1];
1547 }
1548 }
1549
1550 /*
1551 * optimize - optimize an NFA
1552 *
1553 * The main goal of this function is not so much "optimization" (though it
1554 * does try to get rid of useless NFA states) as reducing the NFA to a form
1555 * the regex executor can handle. The executor, and indeed the cNFA format
1556 * that is its input, can only handle PLAIN and LACON arcs. The output of
1557 * the regex parser also includes EMPTY (do-nothing) arcs, as well as
1558 * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here.
1559 * We first get rid of EMPTY arcs and then deal with the constraint arcs.
1560 * The hardest part of either job is to get rid of circular loops of the
1561 * target arc type. We would have to do that in any case, though, as such a
1562 * loop would otherwise allow the executor to cycle through the loop endlessly
1563 * without making any progress in the input string.
1564 */
1565 static long /* re_info bits */
optimize(struct nfa * nfa,FILE * f)1566 optimize(struct nfa *nfa,
1567 FILE *f) /* for debug output; NULL none */
1568 {
1569 #ifdef REG_DEBUG
1570 int verbose = (f != NULL) ? 1 : 0;
1571
1572 if (verbose)
1573 fprintf(f, "\ninitial cleanup:\n");
1574 #endif
1575 cleanup(nfa); /* may simplify situation */
1576 #ifdef REG_DEBUG
1577 if (verbose)
1578 dumpnfa(nfa, f);
1579 if (verbose)
1580 fprintf(f, "\nempties:\n");
1581 #endif
1582 fixempties(nfa, f); /* get rid of EMPTY arcs */
1583 #ifdef REG_DEBUG
1584 if (verbose)
1585 fprintf(f, "\nconstraints:\n");
1586 #endif
1587 fixconstraintloops(nfa, f); /* get rid of constraint loops */
1588 pullback(nfa, f); /* pull back constraints backward */
1589 pushfwd(nfa, f); /* push fwd constraints forward */
1590 #ifdef REG_DEBUG
1591 if (verbose)
1592 fprintf(f, "\nfinal cleanup:\n");
1593 #endif
1594 cleanup(nfa); /* final tidying */
1595 #ifdef REG_DEBUG
1596 if (verbose)
1597 dumpnfa(nfa, f);
1598 #endif
1599 return analyze(nfa); /* and analysis */
1600 }
1601
1602 /*
1603 * pullback - pull back constraints backward to eliminate them
1604 */
1605 static void
pullback(struct nfa * nfa,FILE * f)1606 pullback(struct nfa *nfa,
1607 FILE *f) /* for debug output; NULL none */
1608 {
1609 struct state *s;
1610 struct state *nexts;
1611 struct arc *a;
1612 struct arc *nexta;
1613 struct state *intermediates;
1614 int progress;
1615
1616 /* find and pull until there are no more */
1617 do
1618 {
1619 progress = 0;
1620 for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1621 {
1622 nexts = s->next;
1623 intermediates = NULL;
1624 for (a = s->outs; a != NULL && !NISERR(); a = nexta)
1625 {
1626 nexta = a->outchain;
1627 if (a->type == '^' || a->type == BEHIND)
1628 if (pull(nfa, a, &intermediates))
1629 progress = 1;
1630 }
1631 /* clear tmp fields of intermediate states created here */
1632 while (intermediates != NULL)
1633 {
1634 struct state *ns = intermediates->tmp;
1635
1636 intermediates->tmp = NULL;
1637 intermediates = ns;
1638 }
1639 /* if s is now useless, get rid of it */
1640 if ((s->nins == 0 || s->nouts == 0) && !s->flag)
1641 dropstate(nfa, s);
1642 }
1643 if (progress && f != NULL)
1644 dumpnfa(nfa, f);
1645 } while (progress && !NISERR());
1646 if (NISERR())
1647 return;
1648
1649 /*
1650 * Any ^ constraints we were able to pull to the start state can now be
1651 * replaced by PLAIN arcs referencing the BOS or BOL colors. There should
1652 * be no other ^ or BEHIND arcs left in the NFA, though we do not check
1653 * that here (compact() will fail if so).
1654 */
1655 for (a = nfa->pre->outs; a != NULL; a = nexta)
1656 {
1657 nexta = a->outchain;
1658 if (a->type == '^')
1659 {
1660 assert(a->co == 0 || a->co == 1);
1661 newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
1662 freearc(nfa, a);
1663 }
1664 }
1665 }
1666
1667 /*
1668 * pull - pull a back constraint backward past its source state
1669 *
1670 * Returns 1 if successful (which it always is unless the source is the
1671 * start state or we have an internal error), 0 if nothing happened.
1672 *
1673 * A significant property of this function is that it deletes no pre-existing
1674 * states, and no outarcs of the constraint's from state other than the given
1675 * constraint arc. This makes the loops in pullback() safe, at the cost that
1676 * we may leave useless states behind. Therefore, we leave it to pullback()
1677 * to delete such states.
1678 *
1679 * If the from state has multiple back-constraint outarcs, and/or multiple
1680 * compatible constraint inarcs, we only need to create one new intermediate
1681 * state per combination of predecessor and successor states. *intermediates
1682 * points to a list of such intermediate states for this from state (chained
1683 * through their tmp fields).
1684 */
1685 static int
pull(struct nfa * nfa,struct arc * con,struct state ** intermediates)1686 pull(struct nfa *nfa,
1687 struct arc *con,
1688 struct state **intermediates)
1689 {
1690 struct state *from = con->from;
1691 struct state *to = con->to;
1692 struct arc *a;
1693 struct arc *nexta;
1694 struct state *s;
1695
1696 assert(from != to); /* should have gotten rid of this earlier */
1697 if (from->flag) /* can't pull back beyond start */
1698 return 0;
1699 if (from->nins == 0)
1700 { /* unreachable */
1701 freearc(nfa, con);
1702 return 1;
1703 }
1704
1705 /*
1706 * First, clone from state if necessary to avoid other outarcs. This may
1707 * seem wasteful, but it simplifies the logic, and we'll get rid of the
1708 * clone state again at the bottom.
1709 */
1710 if (from->nouts > 1)
1711 {
1712 s = newstate(nfa);
1713 if (NISERR())
1714 return 0;
1715 copyins(nfa, from, s); /* duplicate inarcs */
1716 cparc(nfa, con, s, to); /* move constraint arc */
1717 freearc(nfa, con);
1718 if (NISERR())
1719 return 0;
1720 from = s;
1721 con = from->outs;
1722 }
1723 assert(from->nouts == 1);
1724
1725 /* propagate the constraint into the from state's inarcs */
1726 for (a = from->ins; a != NULL && !NISERR(); a = nexta)
1727 {
1728 nexta = a->inchain;
1729 switch (combine(nfa, con, a))
1730 {
1731 case INCOMPATIBLE: /* destroy the arc */
1732 freearc(nfa, a);
1733 break;
1734 case SATISFIED: /* no action needed */
1735 break;
1736 case COMPATIBLE: /* swap the two arcs, more or less */
1737 /* need an intermediate state, but might have one already */
1738 for (s = *intermediates; s != NULL; s = s->tmp)
1739 {
1740 assert(s->nins > 0 && s->nouts > 0);
1741 if (s->ins->from == a->from && s->outs->to == to)
1742 break;
1743 }
1744 if (s == NULL)
1745 {
1746 s = newstate(nfa);
1747 if (NISERR())
1748 return 0;
1749 s->tmp = *intermediates;
1750 *intermediates = s;
1751 }
1752 cparc(nfa, con, a->from, s);
1753 cparc(nfa, a, s, to);
1754 freearc(nfa, a);
1755 break;
1756 case REPLACEARC: /* replace arc's color */
1757 newarc(nfa, a->type, con->co, a->from, to);
1758 freearc(nfa, a);
1759 break;
1760 default:
1761 assert(NOTREACHED);
1762 break;
1763 }
1764 }
1765
1766 /* remaining inarcs, if any, incorporate the constraint */
1767 moveins(nfa, from, to);
1768 freearc(nfa, con);
1769 /* from state is now useless, but we leave it to pullback() to clean up */
1770 return 1;
1771 }
1772
1773 /*
1774 * pushfwd - push forward constraints forward to eliminate them
1775 */
1776 static void
pushfwd(struct nfa * nfa,FILE * f)1777 pushfwd(struct nfa *nfa,
1778 FILE *f) /* for debug output; NULL none */
1779 {
1780 struct state *s;
1781 struct state *nexts;
1782 struct arc *a;
1783 struct arc *nexta;
1784 struct state *intermediates;
1785 int progress;
1786
1787 /* find and push until there are no more */
1788 do
1789 {
1790 progress = 0;
1791 for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1792 {
1793 nexts = s->next;
1794 intermediates = NULL;
1795 for (a = s->ins; a != NULL && !NISERR(); a = nexta)
1796 {
1797 nexta = a->inchain;
1798 if (a->type == '$' || a->type == AHEAD)
1799 if (push(nfa, a, &intermediates))
1800 progress = 1;
1801 }
1802 /* clear tmp fields of intermediate states created here */
1803 while (intermediates != NULL)
1804 {
1805 struct state *ns = intermediates->tmp;
1806
1807 intermediates->tmp = NULL;
1808 intermediates = ns;
1809 }
1810 /* if s is now useless, get rid of it */
1811 if ((s->nins == 0 || s->nouts == 0) && !s->flag)
1812 dropstate(nfa, s);
1813 }
1814 if (progress && f != NULL)
1815 dumpnfa(nfa, f);
1816 } while (progress && !NISERR());
1817 if (NISERR())
1818 return;
1819
1820 /*
1821 * Any $ constraints we were able to push to the post state can now be
1822 * replaced by PLAIN arcs referencing the EOS or EOL colors. There should
1823 * be no other $ or AHEAD arcs left in the NFA, though we do not check
1824 * that here (compact() will fail if so).
1825 */
1826 for (a = nfa->post->ins; a != NULL; a = nexta)
1827 {
1828 nexta = a->inchain;
1829 if (a->type == '$')
1830 {
1831 assert(a->co == 0 || a->co == 1);
1832 newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
1833 freearc(nfa, a);
1834 }
1835 }
1836 }
1837
1838 /*
1839 * push - push a forward constraint forward past its destination state
1840 *
1841 * Returns 1 if successful (which it always is unless the destination is the
1842 * post state or we have an internal error), 0 if nothing happened.
1843 *
1844 * A significant property of this function is that it deletes no pre-existing
1845 * states, and no inarcs of the constraint's to state other than the given
1846 * constraint arc. This makes the loops in pushfwd() safe, at the cost that
1847 * we may leave useless states behind. Therefore, we leave it to pushfwd()
1848 * to delete such states.
1849 *
1850 * If the to state has multiple forward-constraint inarcs, and/or multiple
1851 * compatible constraint outarcs, we only need to create one new intermediate
1852 * state per combination of predecessor and successor states. *intermediates
1853 * points to a list of such intermediate states for this to state (chained
1854 * through their tmp fields).
1855 */
1856 static int
push(struct nfa * nfa,struct arc * con,struct state ** intermediates)1857 push(struct nfa *nfa,
1858 struct arc *con,
1859 struct state **intermediates)
1860 {
1861 struct state *from = con->from;
1862 struct state *to = con->to;
1863 struct arc *a;
1864 struct arc *nexta;
1865 struct state *s;
1866
1867 assert(to != from); /* should have gotten rid of this earlier */
1868 if (to->flag) /* can't push forward beyond end */
1869 return 0;
1870 if (to->nouts == 0)
1871 { /* dead end */
1872 freearc(nfa, con);
1873 return 1;
1874 }
1875
1876 /*
1877 * First, clone to state if necessary to avoid other inarcs. This may
1878 * seem wasteful, but it simplifies the logic, and we'll get rid of the
1879 * clone state again at the bottom.
1880 */
1881 if (to->nins > 1)
1882 {
1883 s = newstate(nfa);
1884 if (NISERR())
1885 return 0;
1886 copyouts(nfa, to, s); /* duplicate outarcs */
1887 cparc(nfa, con, from, s); /* move constraint arc */
1888 freearc(nfa, con);
1889 if (NISERR())
1890 return 0;
1891 to = s;
1892 con = to->ins;
1893 }
1894 assert(to->nins == 1);
1895
1896 /* propagate the constraint into the to state's outarcs */
1897 for (a = to->outs; a != NULL && !NISERR(); a = nexta)
1898 {
1899 nexta = a->outchain;
1900 switch (combine(nfa, con, a))
1901 {
1902 case INCOMPATIBLE: /* destroy the arc */
1903 freearc(nfa, a);
1904 break;
1905 case SATISFIED: /* no action needed */
1906 break;
1907 case COMPATIBLE: /* swap the two arcs, more or less */
1908 /* need an intermediate state, but might have one already */
1909 for (s = *intermediates; s != NULL; s = s->tmp)
1910 {
1911 assert(s->nins > 0 && s->nouts > 0);
1912 if (s->ins->from == from && s->outs->to == a->to)
1913 break;
1914 }
1915 if (s == NULL)
1916 {
1917 s = newstate(nfa);
1918 if (NISERR())
1919 return 0;
1920 s->tmp = *intermediates;
1921 *intermediates = s;
1922 }
1923 cparc(nfa, con, s, a->to);
1924 cparc(nfa, a, from, s);
1925 freearc(nfa, a);
1926 break;
1927 case REPLACEARC: /* replace arc's color */
1928 newarc(nfa, a->type, con->co, from, a->to);
1929 freearc(nfa, a);
1930 break;
1931 default:
1932 assert(NOTREACHED);
1933 break;
1934 }
1935 }
1936
1937 /* remaining outarcs, if any, incorporate the constraint */
1938 moveouts(nfa, to, from);
1939 freearc(nfa, con);
1940 /* to state is now useless, but we leave it to pushfwd() to clean up */
1941 return 1;
1942 }
1943
1944 /*
1945 * combine - constraint lands on an arc, what happens?
1946 *
1947 * #def INCOMPATIBLE 1 // destroys arc
1948 * #def SATISFIED 2 // constraint satisfied
1949 * #def COMPATIBLE 3 // compatible but not satisfied yet
1950 * #def REPLACEARC 4 // replace arc's color with constraint color
1951 */
1952 static int
combine(struct nfa * nfa,struct arc * con,struct arc * a)1953 combine(struct nfa *nfa,
1954 struct arc *con,
1955 struct arc *a)
1956 {
1957 #define CA(ct,at) (((ct)<<CHAR_BIT) | (at))
1958
1959 switch (CA(con->type, a->type))
1960 {
1961 case CA('^', PLAIN): /* newlines are handled separately */
1962 case CA('$', PLAIN):
1963 return INCOMPATIBLE;
1964 break;
1965 case CA(AHEAD, PLAIN): /* color constraints meet colors */
1966 case CA(BEHIND, PLAIN):
1967 if (con->co == a->co)
1968 return SATISFIED;
1969 if (con->co == RAINBOW)
1970 {
1971 /* con is satisfied unless arc's color is a pseudocolor */
1972 if (!(nfa->cm->cd[a->co].flags & PSEUDO))
1973 return SATISFIED;
1974 }
1975 else if (a->co == RAINBOW)
1976 {
1977 /* con is incompatible if it's for a pseudocolor */
1978 if (nfa->cm->cd[con->co].flags & PSEUDO)
1979 return INCOMPATIBLE;
1980 /* otherwise, constraint constrains arc to be only its color */
1981 return REPLACEARC;
1982 }
1983 return INCOMPATIBLE;
1984 break;
1985 case CA('^', '^'): /* collision, similar constraints */
1986 case CA('$', '$'):
1987 if (con->co == a->co) /* true duplication */
1988 return SATISFIED;
1989 return INCOMPATIBLE;
1990 break;
1991 case CA(AHEAD, AHEAD): /* collision, similar constraints */
1992 case CA(BEHIND, BEHIND):
1993 if (con->co == a->co) /* true duplication */
1994 return SATISFIED;
1995 if (con->co == RAINBOW)
1996 {
1997 /* con is satisfied unless arc's color is a pseudocolor */
1998 if (!(nfa->cm->cd[a->co].flags & PSEUDO))
1999 return SATISFIED;
2000 }
2001 else if (a->co == RAINBOW)
2002 {
2003 /* con is incompatible if it's for a pseudocolor */
2004 if (nfa->cm->cd[con->co].flags & PSEUDO)
2005 return INCOMPATIBLE;
2006 /* otherwise, constraint constrains arc to be only its color */
2007 return REPLACEARC;
2008 }
2009 return INCOMPATIBLE;
2010 break;
2011 case CA('^', BEHIND): /* collision, dissimilar constraints */
2012 case CA(BEHIND, '^'):
2013 case CA('$', AHEAD):
2014 case CA(AHEAD, '$'):
2015 return INCOMPATIBLE;
2016 break;
2017 case CA('^', '$'): /* constraints passing each other */
2018 case CA('^', AHEAD):
2019 case CA(BEHIND, '$'):
2020 case CA(BEHIND, AHEAD):
2021 case CA('$', '^'):
2022 case CA('$', BEHIND):
2023 case CA(AHEAD, '^'):
2024 case CA(AHEAD, BEHIND):
2025 case CA('^', LACON):
2026 case CA(BEHIND, LACON):
2027 case CA('$', LACON):
2028 case CA(AHEAD, LACON):
2029 return COMPATIBLE;
2030 break;
2031 }
2032 assert(NOTREACHED);
2033 return INCOMPATIBLE; /* for benefit of blind compilers */
2034 }
2035
2036 /*
2037 * fixempties - get rid of EMPTY arcs
2038 */
2039 static void
fixempties(struct nfa * nfa,FILE * f)2040 fixempties(struct nfa *nfa,
2041 FILE *f) /* for debug output; NULL none */
2042 {
2043 struct state *s;
2044 struct state *s2;
2045 struct state *nexts;
2046 struct arc *a;
2047 struct arc *nexta;
2048 int totalinarcs;
2049 struct arc **inarcsorig;
2050 struct arc **arcarray;
2051 int arccount;
2052 int prevnins;
2053 int nskip;
2054
2055 /*
2056 * First, get rid of any states whose sole out-arc is an EMPTY, since
2057 * they're basically just aliases for their successor. The parsing
2058 * algorithm creates enough of these that it's worth special-casing this.
2059 */
2060 for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2061 {
2062 nexts = s->next;
2063 if (s->flag || s->nouts != 1)
2064 continue;
2065 a = s->outs;
2066 assert(a != NULL && a->outchain == NULL);
2067 if (a->type != EMPTY)
2068 continue;
2069 if (s != a->to)
2070 moveins(nfa, s, a->to);
2071 dropstate(nfa, s);
2072 }
2073
2074 /*
2075 * Similarly, get rid of any state with a single EMPTY in-arc, by folding
2076 * it into its predecessor.
2077 */
2078 for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2079 {
2080 nexts = s->next;
2081 /* while we're at it, ensure tmp fields are clear for next step */
2082 assert(s->tmp == NULL);
2083 if (s->flag || s->nins != 1)
2084 continue;
2085 a = s->ins;
2086 assert(a != NULL && a->inchain == NULL);
2087 if (a->type != EMPTY)
2088 continue;
2089 if (s != a->from)
2090 moveouts(nfa, s, a->from);
2091 dropstate(nfa, s);
2092 }
2093
2094 if (NISERR())
2095 return;
2096
2097 /*
2098 * For each remaining NFA state, find all other states from which it is
2099 * reachable by a chain of one or more EMPTY arcs. Then generate new arcs
2100 * that eliminate the need for each such chain.
2101 *
2102 * We could replace a chain of EMPTY arcs that leads from a "from" state
2103 * to a "to" state either by pushing non-EMPTY arcs forward (linking
2104 * directly from "from"'s predecessors to "to") or by pulling them back
2105 * (linking directly from "from" to "to"'s successors). We choose to
2106 * always do the former; this choice is somewhat arbitrary, but the
2107 * approach below requires that we uniformly do one or the other.
2108 *
2109 * Suppose we have a chain of N successive EMPTY arcs (where N can easily
2110 * approach the size of the NFA). All of the intermediate states must
2111 * have additional inarcs and outarcs, else they'd have been removed by
2112 * the steps above. Assuming their inarcs are mostly not empties, we will
2113 * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
2114 * state in the chain must be duplicated to lead to all its successor
2115 * states as well. So there is no hope of doing less than O(N^2) work;
2116 * however, we should endeavor to keep the big-O cost from being even
2117 * worse than that, which it can easily become without care. In
2118 * particular, suppose we were to copy all S1's inarcs forward to S2, and
2119 * then also to S3, and then later we consider pushing S2's inarcs forward
2120 * to S3. If we include the arcs already copied from S1 in that, we'd be
2121 * doing O(N^3) work. (The duplicate-arc elimination built into newarc()
2122 * and its cohorts would get rid of the extra arcs, but not without cost.)
2123 *
2124 * We can avoid this cost by treating only arcs that existed at the start
2125 * of this phase as candidates to be pushed forward. To identify those,
2126 * we remember the first inarc each state had to start with. We rely on
2127 * the fact that newarc() and friends put new arcs on the front of their
2128 * to-states' inchains, and that this phase never deletes arcs, so that
2129 * the original arcs must be the last arcs in their to-states' inchains.
2130 *
2131 * So the process here is that, for each state in the NFA, we gather up
2132 * all non-EMPTY inarcs of states that can reach the target state via
2133 * EMPTY arcs. We then sort, de-duplicate, and merge these arcs into the
2134 * target state's inchain. (We can safely use sort-merge for this as long
2135 * as we update each state's original-arcs pointer after we add arcs to
2136 * it; the sort step of mergeins probably changed the order of the old
2137 * arcs.)
2138 *
2139 * Another refinement worth making is that, because we only add non-EMPTY
2140 * arcs during this phase, and all added arcs have the same from-state as
2141 * the non-EMPTY arc they were cloned from, we know ahead of time that any
2142 * states having only EMPTY outarcs will be useless for lack of outarcs
2143 * after we drop the EMPTY arcs. (They cannot gain non-EMPTY outarcs if
2144 * they had none to start with.) So we need not bother to update the
2145 * inchains of such states at all.
2146 */
2147
2148 /* Remember the states' first original inarcs */
2149 /* ... and while at it, count how many old inarcs there are altogether */
2150 inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
2151 if (inarcsorig == NULL)
2152 {
2153 NERR(REG_ESPACE);
2154 return;
2155 }
2156 totalinarcs = 0;
2157 for (s = nfa->states; s != NULL; s = s->next)
2158 {
2159 inarcsorig[s->no] = s->ins;
2160 totalinarcs += s->nins;
2161 }
2162
2163 /*
2164 * Create a workspace for accumulating the inarcs to be added to the
2165 * current target state. totalinarcs is probably a considerable
2166 * overestimate of the space needed, but the NFA is unlikely to be large
2167 * enough at this point to make it worth being smarter.
2168 */
2169 arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
2170 if (arcarray == NULL)
2171 {
2172 NERR(REG_ESPACE);
2173 FREE(inarcsorig);
2174 return;
2175 }
2176
2177 /* And iterate over the target states */
2178 for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
2179 {
2180 /* Ignore target states without non-EMPTY outarcs, per note above */
2181 if (!s->flag && !hasnonemptyout(s))
2182 continue;
2183
2184 /* Find predecessor states and accumulate their original inarcs */
2185 arccount = 0;
2186 for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts)
2187 {
2188 /* Add s2's original inarcs to arcarray[], but ignore empties */
2189 for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain)
2190 {
2191 if (a->type != EMPTY)
2192 arcarray[arccount++] = a;
2193 }
2194
2195 /* Reset the tmp fields as we walk back */
2196 nexts = s2->tmp;
2197 s2->tmp = NULL;
2198 }
2199 s->tmp = NULL;
2200 assert(arccount <= totalinarcs);
2201
2202 /* Remember how many original inarcs this state has */
2203 prevnins = s->nins;
2204
2205 /* Add non-duplicate inarcs to target state */
2206 mergeins(nfa, s, arcarray, arccount);
2207
2208 /* Now we must update the state's inarcsorig pointer */
2209 nskip = s->nins - prevnins;
2210 a = s->ins;
2211 while (nskip-- > 0)
2212 a = a->inchain;
2213 inarcsorig[s->no] = a;
2214 }
2215
2216 FREE(arcarray);
2217 FREE(inarcsorig);
2218
2219 if (NISERR())
2220 return;
2221
2222 /*
2223 * Now remove all the EMPTY arcs, since we don't need them anymore.
2224 */
2225 for (s = nfa->states; s != NULL; s = s->next)
2226 {
2227 for (a = s->outs; a != NULL; a = nexta)
2228 {
2229 nexta = a->outchain;
2230 if (a->type == EMPTY)
2231 freearc(nfa, a);
2232 }
2233 }
2234
2235 /*
2236 * And remove any states that have become useless. (This cleanup is not
2237 * very thorough, and would be even less so if we tried to combine it with
2238 * the previous step; but cleanup() will take care of anything we miss.)
2239 */
2240 for (s = nfa->states; s != NULL; s = nexts)
2241 {
2242 nexts = s->next;
2243 if ((s->nins == 0 || s->nouts == 0) && !s->flag)
2244 dropstate(nfa, s);
2245 }
2246
2247 if (f != NULL)
2248 dumpnfa(nfa, f);
2249 }
2250
2251 /*
2252 * emptyreachable - recursively find all states that can reach s by EMPTY arcs
2253 *
2254 * The return value is the last such state found. Its tmp field links back
2255 * to the next-to-last such state, and so on back to s, so that all these
2256 * states can be located without searching the whole NFA.
2257 *
2258 * Since this is only used in fixempties(), we pass in the inarcsorig[] array
2259 * maintained by that function. This lets us skip over all new inarcs, which
2260 * are certainly not EMPTY arcs.
2261 *
2262 * The maximum recursion depth here is equal to the length of the longest
2263 * loop-free chain of EMPTY arcs, which is surely no more than the size of
2264 * the NFA ... but that could still be enough to cause trouble.
2265 */
2266 static struct state *
emptyreachable(struct nfa * nfa,struct state * s,struct state * lastfound,struct arc ** inarcsorig)2267 emptyreachable(struct nfa *nfa,
2268 struct state *s,
2269 struct state *lastfound,
2270 struct arc **inarcsorig)
2271 {
2272 struct arc *a;
2273
2274 /* Since this is recursive, it could be driven to stack overflow */
2275 if (STACK_TOO_DEEP(nfa->v->re))
2276 {
2277 NERR(REG_ETOOBIG);
2278 return lastfound;
2279 }
2280
2281 s->tmp = lastfound;
2282 lastfound = s;
2283 for (a = inarcsorig[s->no]; a != NULL; a = a->inchain)
2284 {
2285 if (a->type == EMPTY && a->from->tmp == NULL)
2286 lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig);
2287 }
2288 return lastfound;
2289 }
2290
2291 /*
2292 * isconstraintarc - detect whether an arc is of a constraint type
2293 */
2294 static inline int
isconstraintarc(struct arc * a)2295 isconstraintarc(struct arc *a)
2296 {
2297 switch (a->type)
2298 {
2299 case '^':
2300 case '$':
2301 case BEHIND:
2302 case AHEAD:
2303 case LACON:
2304 return 1;
2305 }
2306 return 0;
2307 }
2308
2309 /*
2310 * hasconstraintout - does state have a constraint out arc?
2311 */
2312 static int
hasconstraintout(struct state * s)2313 hasconstraintout(struct state *s)
2314 {
2315 struct arc *a;
2316
2317 for (a = s->outs; a != NULL; a = a->outchain)
2318 {
2319 if (isconstraintarc(a))
2320 return 1;
2321 }
2322 return 0;
2323 }
2324
2325 /*
2326 * fixconstraintloops - get rid of loops containing only constraint arcs
2327 *
2328 * A loop of states that contains only constraint arcs is useless, since
2329 * passing around the loop represents no forward progress. Moreover, it
2330 * would cause infinite looping in pullback/pushfwd, so we need to get rid
2331 * of such loops before doing that.
2332 */
2333 static void
fixconstraintloops(struct nfa * nfa,FILE * f)2334 fixconstraintloops(struct nfa *nfa,
2335 FILE *f) /* for debug output; NULL none */
2336 {
2337 struct state *s;
2338 struct state *nexts;
2339 struct arc *a;
2340 struct arc *nexta;
2341 int hasconstraints;
2342
2343 /*
2344 * In the trivial case of a state that loops to itself, we can just drop
2345 * the constraint arc altogether. This is worth special-casing because
2346 * such loops are far more common than loops containing multiple states.
2347 * While we're at it, note whether any constraint arcs survive.
2348 */
2349 hasconstraints = 0;
2350 for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2351 {
2352 nexts = s->next;
2353 /* while we're at it, ensure tmp fields are clear for next step */
2354 assert(s->tmp == NULL);
2355 for (a = s->outs; a != NULL && !NISERR(); a = nexta)
2356 {
2357 nexta = a->outchain;
2358 if (isconstraintarc(a))
2359 {
2360 if (a->to == s)
2361 freearc(nfa, a);
2362 else
2363 hasconstraints = 1;
2364 }
2365 }
2366 /* If we removed all the outarcs, the state is useless. */
2367 if (s->nouts == 0 && !s->flag)
2368 dropstate(nfa, s);
2369 }
2370
2371 /* Nothing to do if no remaining constraint arcs */
2372 if (NISERR() || !hasconstraints)
2373 return;
2374
2375 /*
2376 * Starting from each remaining NFA state, search outwards for a
2377 * constraint loop. If we find a loop, break the loop, then start the
2378 * search over. (We could possibly retain some state from the first scan,
2379 * but it would complicate things greatly, and multi-state constraint
2380 * loops are rare enough that it's not worth optimizing the case.)
2381 */
2382 restart:
2383 for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
2384 {
2385 if (findconstraintloop(nfa, s))
2386 goto restart;
2387 }
2388
2389 if (NISERR())
2390 return;
2391
2392 /*
2393 * Now remove any states that have become useless. (This cleanup is not
2394 * very thorough, and would be even less so if we tried to combine it with
2395 * the previous step; but cleanup() will take care of anything we miss.)
2396 *
2397 * Because findconstraintloop intentionally doesn't reset all tmp fields,
2398 * we have to clear them after it's done. This is a convenient place to
2399 * do that, too.
2400 */
2401 for (s = nfa->states; s != NULL; s = nexts)
2402 {
2403 nexts = s->next;
2404 s->tmp = NULL;
2405 if ((s->nins == 0 || s->nouts == 0) && !s->flag)
2406 dropstate(nfa, s);
2407 }
2408
2409 if (f != NULL)
2410 dumpnfa(nfa, f);
2411 }
2412
2413 /*
2414 * findconstraintloop - recursively find a loop of constraint arcs
2415 *
2416 * If we find a loop, break it by calling breakconstraintloop(), then
2417 * return 1; otherwise return 0.
2418 *
2419 * State tmp fields are guaranteed all NULL on a success return, because
2420 * breakconstraintloop does that. After a failure return, any state that
2421 * is known not to be part of a loop is marked with s->tmp == s; this allows
2422 * us not to have to re-prove that fact on later calls. (This convention is
2423 * workable because we already eliminated single-state loops.)
2424 *
2425 * Note that the found loop doesn't necessarily include the first state we
2426 * are called on. Any loop reachable from that state will do.
2427 *
2428 * The maximum recursion depth here is one more than the length of the longest
2429 * loop-free chain of constraint arcs, which is surely no more than the size
2430 * of the NFA ... but that could still be enough to cause trouble.
2431 */
2432 static int
findconstraintloop(struct nfa * nfa,struct state * s)2433 findconstraintloop(struct nfa *nfa, struct state *s)
2434 {
2435 struct arc *a;
2436
2437 /* Since this is recursive, it could be driven to stack overflow */
2438 if (STACK_TOO_DEEP(nfa->v->re))
2439 {
2440 NERR(REG_ETOOBIG);
2441 return 1; /* to exit as quickly as possible */
2442 }
2443
2444 if (s->tmp != NULL)
2445 {
2446 /* Already proven uninteresting? */
2447 if (s->tmp == s)
2448 return 0;
2449 /* Found a loop involving s */
2450 breakconstraintloop(nfa, s);
2451 /* The tmp fields have been cleaned up by breakconstraintloop */
2452 return 1;
2453 }
2454 for (a = s->outs; a != NULL; a = a->outchain)
2455 {
2456 if (isconstraintarc(a))
2457 {
2458 struct state *sto = a->to;
2459
2460 assert(sto != s);
2461 s->tmp = sto;
2462 if (findconstraintloop(nfa, sto))
2463 return 1;
2464 }
2465 }
2466
2467 /*
2468 * If we get here, no constraint loop exists leading out from s. Mark it
2469 * with s->tmp == s so we need not rediscover that fact again later.
2470 */
2471 s->tmp = s;
2472 return 0;
2473 }
2474
2475 /*
2476 * breakconstraintloop - break a loop of constraint arcs
2477 *
2478 * sinitial is any one member state of the loop. Each loop member's tmp
2479 * field links to its successor within the loop. (Note that this function
2480 * will reset all the tmp fields to NULL.)
2481 *
2482 * We can break the loop by, for any one state S1 in the loop, cloning its
2483 * loop successor state S2 (and possibly following states), and then moving
2484 * all S1->S2 constraint arcs to point to the cloned S2. The cloned S2 should
2485 * copy any non-constraint outarcs of S2. Constraint outarcs should be
2486 * dropped if they point back to S1, else they need to be copied as arcs to
2487 * similarly cloned states S3, S4, etc. In general, each cloned state copies
2488 * non-constraint outarcs, drops constraint outarcs that would lead to itself
2489 * or any earlier cloned state, and sends other constraint outarcs to newly
2490 * cloned states. No cloned state will have any inarcs that aren't constraint
2491 * arcs or do not lead from S1 or earlier-cloned states. It's okay to drop
2492 * constraint back-arcs since they would not take us to any state we've not
2493 * already been in; therefore, no new constraint loop is created. In this way
2494 * we generate a modified NFA that can still represent every useful state
2495 * sequence, but not sequences that represent state loops with no consumption
2496 * of input data. Note that the set of cloned states will certainly include
2497 * all of the loop member states other than S1, and it may also include
2498 * non-loop states that are reachable from S2 via constraint arcs. This is
2499 * important because there is no guarantee that findconstraintloop found a
2500 * maximal loop (and searching for one would be NP-hard, so don't try).
2501 * Frequently the "non-loop states" are actually part of a larger loop that
2502 * we didn't notice, and indeed there may be several overlapping loops.
2503 * This technique ensures convergence in such cases, while considering only
2504 * the originally-found loop does not.
2505 *
2506 * If there is only one S1->S2 constraint arc, then that constraint is
2507 * certainly satisfied when we enter any of the clone states. This means that
2508 * in the common case where many of the constraint arcs are identically
2509 * labeled, we can merge together clone states linked by a similarly-labeled
2510 * constraint: if we can get to the first one we can certainly get to the
2511 * second, so there's no need to distinguish. This greatly reduces the number
2512 * of new states needed, so we preferentially break the given loop at a state
2513 * pair where this is true.
2514 *
2515 * Furthermore, it's fairly common to find that a cloned successor state has
2516 * no outarcs, especially if we're a bit aggressive about removing unnecessary
2517 * outarcs. If that happens, then there is simply not any interesting state
2518 * that can be reached through the predecessor's loop arcs, which means we can
2519 * break the loop just by removing those loop arcs, with no new states added.
2520 */
2521 static void
breakconstraintloop(struct nfa * nfa,struct state * sinitial)2522 breakconstraintloop(struct nfa *nfa, struct state *sinitial)
2523 {
2524 struct state *s;
2525 struct state *shead;
2526 struct state *stail;
2527 struct state *sclone;
2528 struct state *nexts;
2529 struct arc *refarc;
2530 struct arc *a;
2531 struct arc *nexta;
2532
2533 /*
2534 * Start by identifying which loop step we want to break at.
2535 * Preferentially this is one with only one constraint arc. (XXX are
2536 * there any other secondary heuristics we want to use here?) Set refarc
2537 * to point to the selected lone constraint arc, if there is one.
2538 */
2539 refarc = NULL;
2540 s = sinitial;
2541 do
2542 {
2543 nexts = s->tmp;
2544 assert(nexts != s); /* should not see any one-element loops */
2545 if (refarc == NULL)
2546 {
2547 int narcs = 0;
2548
2549 for (a = s->outs; a != NULL; a = a->outchain)
2550 {
2551 if (a->to == nexts && isconstraintarc(a))
2552 {
2553 refarc = a;
2554 narcs++;
2555 }
2556 }
2557 assert(narcs > 0);
2558 if (narcs > 1)
2559 refarc = NULL; /* multiple constraint arcs here, no good */
2560 }
2561 s = nexts;
2562 } while (s != sinitial);
2563
2564 if (refarc)
2565 {
2566 /* break at the refarc */
2567 shead = refarc->from;
2568 stail = refarc->to;
2569 assert(stail == shead->tmp);
2570 }
2571 else
2572 {
2573 /* for lack of a better idea, break after sinitial */
2574 shead = sinitial;
2575 stail = sinitial->tmp;
2576 }
2577
2578 /*
2579 * Reset the tmp fields so that we can use them for local storage in
2580 * clonesuccessorstates. (findconstraintloop won't mind, since it's just
2581 * going to abandon its search anyway.)
2582 */
2583 for (s = nfa->states; s != NULL; s = s->next)
2584 s->tmp = NULL;
2585
2586 /*
2587 * Recursively build clone state(s) as needed.
2588 */
2589 sclone = newstate(nfa);
2590 if (sclone == NULL)
2591 {
2592 assert(NISERR());
2593 return;
2594 }
2595
2596 clonesuccessorstates(nfa, stail, sclone, shead, refarc,
2597 NULL, NULL, nfa->nstates);
2598
2599 if (NISERR())
2600 return;
2601
2602 /*
2603 * It's possible that sclone has no outarcs at all, in which case it's
2604 * useless. (We don't try extremely hard to get rid of useless states
2605 * here, but this is an easy and fairly common case.)
2606 */
2607 if (sclone->nouts == 0)
2608 {
2609 freestate(nfa, sclone);
2610 sclone = NULL;
2611 }
2612
2613 /*
2614 * Move shead's constraint-loop arcs to point to sclone, or just drop them
2615 * if we discovered we don't need sclone.
2616 */
2617 for (a = shead->outs; a != NULL; a = nexta)
2618 {
2619 nexta = a->outchain;
2620 if (a->to == stail && isconstraintarc(a))
2621 {
2622 if (sclone)
2623 cparc(nfa, a, shead, sclone);
2624 freearc(nfa, a);
2625 if (NISERR())
2626 break;
2627 }
2628 }
2629 }
2630
2631 /*
2632 * clonesuccessorstates - create a tree of constraint-arc successor states
2633 *
2634 * ssource is the state to be cloned, and sclone is the state to copy its
2635 * outarcs into. sclone's inarcs, if any, should already be set up.
2636 *
2637 * spredecessor is the original predecessor state that we are trying to build
2638 * successors for (it may not be the immediate predecessor of ssource).
2639 * refarc, if not NULL, is the original constraint arc that is known to have
2640 * been traversed out of spredecessor to reach the successor(s).
2641 *
2642 * For each cloned successor state, we transiently create a "donemap" that is
2643 * a boolean array showing which source states we've already visited for this
2644 * clone state. This prevents infinite recursion as well as useless repeat
2645 * visits to the same state subtree (which can add up fast, since typical NFAs
2646 * have multiple redundant arc pathways). Each donemap is a char array
2647 * indexed by state number. The donemaps are all of the same size "nstates",
2648 * which is nfa->nstates as of the start of the recursion. This is enough to
2649 * have entries for all pre-existing states, but *not* entries for clone
2650 * states created during the recursion. That's okay since we have no need to
2651 * mark those.
2652 *
2653 * curdonemap is NULL when recursing to a new sclone state, or sclone's
2654 * donemap when we are recursing without having created a new state (which we
2655 * do when we decide we can merge a successor state into the current clone
2656 * state). outerdonemap is NULL at the top level and otherwise the parent
2657 * clone state's donemap.
2658 *
2659 * The successor states we create and fill here form a strict tree structure,
2660 * with each state having exactly one predecessor, except that the toplevel
2661 * state has no inarcs as yet (breakconstraintloop will add its inarcs from
2662 * spredecessor after we're done). Thus, we can examine sclone's inarcs back
2663 * to the root, plus refarc if any, to identify the set of constraints already
2664 * known valid at the current point. This allows us to avoid generating extra
2665 * successor states.
2666 */
2667 static void
clonesuccessorstates(struct nfa * nfa,struct state * ssource,struct state * sclone,struct state * spredecessor,struct arc * refarc,char * curdonemap,char * outerdonemap,int nstates)2668 clonesuccessorstates(struct nfa *nfa,
2669 struct state *ssource,
2670 struct state *sclone,
2671 struct state *spredecessor,
2672 struct arc *refarc,
2673 char *curdonemap,
2674 char *outerdonemap,
2675 int nstates)
2676 {
2677 char *donemap;
2678 struct arc *a;
2679
2680 /* Since this is recursive, it could be driven to stack overflow */
2681 if (STACK_TOO_DEEP(nfa->v->re))
2682 {
2683 NERR(REG_ETOOBIG);
2684 return;
2685 }
2686
2687 /* If this state hasn't already got a donemap, create one */
2688 donemap = curdonemap;
2689 if (donemap == NULL)
2690 {
2691 donemap = (char *) MALLOC(nstates * sizeof(char));
2692 if (donemap == NULL)
2693 {
2694 NERR(REG_ESPACE);
2695 return;
2696 }
2697
2698 if (outerdonemap != NULL)
2699 {
2700 /*
2701 * Not at outermost recursion level, so copy the outer level's
2702 * donemap; this ensures that we see states in process of being
2703 * visited at outer levels, or already merged into predecessor
2704 * states, as ones we shouldn't traverse back to.
2705 */
2706 memcpy(donemap, outerdonemap, nstates * sizeof(char));
2707 }
2708 else
2709 {
2710 /* At outermost level, only spredecessor is off-limits */
2711 memset(donemap, 0, nstates * sizeof(char));
2712 assert(spredecessor->no < nstates);
2713 donemap[spredecessor->no] = 1;
2714 }
2715 }
2716
2717 /* Mark ssource as visited in the donemap */
2718 assert(ssource->no < nstates);
2719 assert(donemap[ssource->no] == 0);
2720 donemap[ssource->no] = 1;
2721
2722 /*
2723 * We proceed by first cloning all of ssource's outarcs, creating new
2724 * clone states as needed but not doing more with them than that. Then in
2725 * a second pass, recurse to process the child clone states. This allows
2726 * us to have only one child clone state per reachable source state, even
2727 * when there are multiple outarcs leading to the same state. Also, when
2728 * we do visit a child state, its set of inarcs is known exactly, which
2729 * makes it safe to apply the constraint-is-already-checked optimization.
2730 * Also, this ensures that we've merged all the states we can into the
2731 * current clone before we recurse to any children, thus possibly saving
2732 * them from making extra images of those states.
2733 *
2734 * While this function runs, child clone states of the current state are
2735 * marked by setting their tmp fields to point to the original state they
2736 * were cloned from. This makes it possible to detect multiple outarcs
2737 * leading to the same state, and also makes it easy to distinguish clone
2738 * states from original states (which will have tmp == NULL).
2739 */
2740 for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain)
2741 {
2742 struct state *sto = a->to;
2743
2744 /*
2745 * We do not consider cloning successor states that have no constraint
2746 * outarcs; just link to them as-is. They cannot be part of a
2747 * constraint loop so there is no need to make copies. In particular,
2748 * this rule keeps us from trying to clone the post state, which would
2749 * be a bad idea.
2750 */
2751 if (isconstraintarc(a) && hasconstraintout(sto))
2752 {
2753 struct state *prevclone;
2754 int canmerge;
2755 struct arc *a2;
2756
2757 /*
2758 * Back-link constraint arcs must not be followed. Nor is there a
2759 * need to revisit states previously merged into this clone.
2760 */
2761 assert(sto->no < nstates);
2762 if (donemap[sto->no] != 0)
2763 continue;
2764
2765 /*
2766 * Check whether we already have a child clone state for this
2767 * source state.
2768 */
2769 prevclone = NULL;
2770 for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain)
2771 {
2772 if (a2->to->tmp == sto)
2773 {
2774 prevclone = a2->to;
2775 break;
2776 }
2777 }
2778
2779 /*
2780 * If this arc is labeled the same as refarc, or the same as any
2781 * arc we must have traversed to get to sclone, then no additional
2782 * constraints need to be met to get to sto, so we should just
2783 * merge its outarcs into sclone.
2784 */
2785 if (refarc && a->type == refarc->type && a->co == refarc->co)
2786 canmerge = 1;
2787 else
2788 {
2789 struct state *s;
2790
2791 canmerge = 0;
2792 for (s = sclone; s->ins; s = s->ins->from)
2793 {
2794 if (s->nins == 1 &&
2795 a->type == s->ins->type && a->co == s->ins->co)
2796 {
2797 canmerge = 1;
2798 break;
2799 }
2800 }
2801 }
2802
2803 if (canmerge)
2804 {
2805 /*
2806 * We can merge into sclone. If we previously made a child
2807 * clone state, drop it; there's no need to visit it. (This
2808 * can happen if ssource has multiple pathways to sto, and we
2809 * only just now found one that is provably a no-op.)
2810 */
2811 if (prevclone)
2812 dropstate(nfa, prevclone); /* kills our outarc, too */
2813
2814 /* Recurse to merge sto's outarcs into sclone */
2815 clonesuccessorstates(nfa,
2816 sto,
2817 sclone,
2818 spredecessor,
2819 refarc,
2820 donemap,
2821 outerdonemap,
2822 nstates);
2823 /* sto should now be marked as previously visited */
2824 assert(NISERR() || donemap[sto->no] == 1);
2825 }
2826 else if (prevclone)
2827 {
2828 /*
2829 * We already have a clone state for this successor, so just
2830 * make another arc to it.
2831 */
2832 cparc(nfa, a, sclone, prevclone);
2833 }
2834 else
2835 {
2836 /*
2837 * We need to create a new successor clone state.
2838 */
2839 struct state *stoclone;
2840
2841 stoclone = newstate(nfa);
2842 if (stoclone == NULL)
2843 {
2844 assert(NISERR());
2845 break;
2846 }
2847 /* Mark it as to what it's a clone of */
2848 stoclone->tmp = sto;
2849 /* ... and add the outarc leading to it */
2850 cparc(nfa, a, sclone, stoclone);
2851 }
2852 }
2853 else
2854 {
2855 /*
2856 * Non-constraint outarcs just get copied to sclone, as do outarcs
2857 * leading to states with no constraint outarc.
2858 */
2859 cparc(nfa, a, sclone, sto);
2860 }
2861 }
2862
2863 /*
2864 * If we are at outer level for this clone state, recurse to all its child
2865 * clone states, clearing their tmp fields as we go. (If we're not
2866 * outermost for sclone, leave this to be done by the outer call level.)
2867 * Note that if we have multiple outarcs leading to the same clone state,
2868 * it will only be recursed-to once.
2869 */
2870 if (curdonemap == NULL)
2871 {
2872 for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain)
2873 {
2874 struct state *stoclone = a->to;
2875 struct state *sto = stoclone->tmp;
2876
2877 if (sto != NULL)
2878 {
2879 stoclone->tmp = NULL;
2880 clonesuccessorstates(nfa,
2881 sto,
2882 stoclone,
2883 spredecessor,
2884 refarc,
2885 NULL,
2886 donemap,
2887 nstates);
2888 }
2889 }
2890
2891 /* Don't forget to free sclone's donemap when done with it */
2892 FREE(donemap);
2893 }
2894 }
2895
2896 /*
2897 * cleanup - clean up NFA after optimizations
2898 */
2899 static void
cleanup(struct nfa * nfa)2900 cleanup(struct nfa *nfa)
2901 {
2902 struct state *s;
2903 struct state *nexts;
2904 int n;
2905
2906 if (NISERR())
2907 return;
2908
2909 /* clear out unreachable or dead-end states */
2910 /* use pre to mark reachable, then post to mark can-reach-post */
2911 markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
2912 markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
2913 for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2914 {
2915 nexts = s->next;
2916 if (s->tmp != nfa->post && !s->flag)
2917 dropstate(nfa, s);
2918 }
2919 assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
2920 cleartraverse(nfa, nfa->pre);
2921 assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL);
2922 /* the nins==0 (final unreachable) case will be caught later */
2923
2924 /* renumber surviving states */
2925 n = 0;
2926 for (s = nfa->states; s != NULL; s = s->next)
2927 s->no = n++;
2928 nfa->nstates = n;
2929 }
2930
2931 /*
2932 * markreachable - recursive marking of reachable states
2933 */
2934 static void
markreachable(struct nfa * nfa,struct state * s,struct state * okay,struct state * mark)2935 markreachable(struct nfa *nfa,
2936 struct state *s,
2937 struct state *okay, /* consider only states with this mark */
2938 struct state *mark) /* the value to mark with */
2939 {
2940 struct arc *a;
2941
2942 /* Since this is recursive, it could be driven to stack overflow */
2943 if (STACK_TOO_DEEP(nfa->v->re))
2944 {
2945 NERR(REG_ETOOBIG);
2946 return;
2947 }
2948
2949 if (s->tmp != okay)
2950 return;
2951 s->tmp = mark;
2952
2953 for (a = s->outs; a != NULL; a = a->outchain)
2954 markreachable(nfa, a->to, okay, mark);
2955 }
2956
2957 /*
2958 * markcanreach - recursive marking of states which can reach here
2959 */
2960 static void
markcanreach(struct nfa * nfa,struct state * s,struct state * okay,struct state * mark)2961 markcanreach(struct nfa *nfa,
2962 struct state *s,
2963 struct state *okay, /* consider only states with this mark */
2964 struct state *mark) /* the value to mark with */
2965 {
2966 struct arc *a;
2967
2968 /* Since this is recursive, it could be driven to stack overflow */
2969 if (STACK_TOO_DEEP(nfa->v->re))
2970 {
2971 NERR(REG_ETOOBIG);
2972 return;
2973 }
2974
2975 if (s->tmp != okay)
2976 return;
2977 s->tmp = mark;
2978
2979 for (a = s->ins; a != NULL; a = a->inchain)
2980 markcanreach(nfa, a->from, okay, mark);
2981 }
2982
2983 /*
2984 * analyze - ascertain potentially-useful facts about an optimized NFA
2985 */
2986 static long /* re_info bits to be ORed in */
analyze(struct nfa * nfa)2987 analyze(struct nfa *nfa)
2988 {
2989 struct arc *a;
2990 struct arc *aa;
2991
2992 if (NISERR())
2993 return 0;
2994
2995 /* Detect whether NFA can't match anything */
2996 if (nfa->pre->outs == NULL)
2997 return REG_UIMPOSSIBLE;
2998
2999 /* Detect whether NFA matches all strings (possibly with length bounds) */
3000 checkmatchall(nfa);
3001
3002 /* Detect whether NFA can possibly match a zero-length string */
3003 for (a = nfa->pre->outs; a != NULL; a = a->outchain)
3004 for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
3005 if (aa->to == nfa->post)
3006 return REG_UEMPTYMATCH;
3007 return 0;
3008 }
3009
3010 /*
3011 * checkmatchall - does the NFA represent no more than a string length test?
3012 *
3013 * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
3014 * to begin with) and set the MATCHALL bit in nfa->flags.
3015 *
3016 * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
3017 * for pseudocolors (i.e., BOS/BOL/EOS/EOL). We must be able to reach the
3018 * post state via RAINBOW arcs, and if there are any loops in the graph, they
3019 * must be loop-to-self arcs, ensuring that each loop iteration consumes
3020 * exactly one character. (Longer loops are problematic because they create
3021 * non-consecutive possible match lengths; we have no good way to represent
3022 * that situation for lengths beyond the DUPINF limit.)
3023 *
3024 * Pseudocolor arcs complicate things a little. We know that they can only
3025 * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
3026 * EOS/EOL). There, they must exactly replicate the parallel RAINBOW arcs,
3027 * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
3028 * and BOL outarcs to state 2, and no others. Missing or extra pseudocolor
3029 * arcs can occur, meaning that the NFA involves some constraint on the
3030 * adjacent characters, which makes it not a matchall NFA.
3031 */
3032 static void
checkmatchall(struct nfa * nfa)3033 checkmatchall(struct nfa *nfa)
3034 {
3035 bool **haspaths;
3036 struct state *s;
3037 int i;
3038
3039 /*
3040 * If there are too many states, don't bother trying to detect matchall.
3041 * This limit serves to bound the time and memory we could consume below.
3042 * Note that even if the graph is all-RAINBOW, if there are significantly
3043 * more than DUPINF states then it's likely that there are paths of length
3044 * more than DUPINF, which would force us to fail anyhow. In practice,
3045 * plausible ways of writing a matchall regex with maximum finite path
3046 * length K tend not to have very many more than K states.
3047 */
3048 if (nfa->nstates > DUPINF * 2)
3049 return;
3050
3051 /*
3052 * First, scan all the states to verify that only RAINBOW arcs appear,
3053 * plus pseudocolor arcs adjacent to the pre and post states. This lets
3054 * us quickly eliminate most cases that aren't matchall NFAs.
3055 */
3056 for (s = nfa->states; s != NULL; s = s->next)
3057 {
3058 struct arc *a;
3059
3060 for (a = s->outs; a != NULL; a = a->outchain)
3061 {
3062 if (a->type != PLAIN)
3063 return; /* any LACONs make it non-matchall */
3064 if (a->co != RAINBOW)
3065 {
3066 if (nfa->cm->cd[a->co].flags & PSEUDO)
3067 {
3068 /*
3069 * Pseudocolor arc: verify it's in a valid place (this
3070 * seems quite unlikely to fail, but let's be sure).
3071 */
3072 if (s == nfa->pre &&
3073 (a->co == nfa->bos[0] || a->co == nfa->bos[1]))
3074 /* okay BOS/BOL arc */ ;
3075 else if (a->to == nfa->post &&
3076 (a->co == nfa->eos[0] || a->co == nfa->eos[1]))
3077 /* okay EOS/EOL arc */ ;
3078 else
3079 return; /* unexpected pseudocolor arc */
3080 /* We'll check these arcs some more below. */
3081 }
3082 else
3083 return; /* any other color makes it non-matchall */
3084 }
3085 }
3086 /* Also, assert that the tmp fields are available for use. */
3087 assert(s->tmp == NULL);
3088 }
3089
3090 /*
3091 * The next cheapest check we can make is to verify that the BOS/BOL
3092 * outarcs of the pre state reach the same states as its RAINBOW outarcs.
3093 * If they don't, the NFA expresses some constraints on the character
3094 * before the matched string, making it non-matchall. Likewise, the
3095 * EOS/EOL inarcs of the post state must match its RAINBOW inarcs.
3096 */
3097 if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) ||
3098 !check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) ||
3099 !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
3100 !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]))
3101 return;
3102
3103 /*
3104 * Initialize an array of path-length arrays, in which
3105 * checkmatchall_recurse will return per-state results. This lets us
3106 * memo-ize the recursive search and avoid exponential time consumption.
3107 */
3108 haspaths = (bool **) MALLOC(nfa->nstates * sizeof(bool *));
3109 if (haspaths == NULL)
3110 return; /* fail quietly */
3111 memset(haspaths, 0, nfa->nstates * sizeof(bool *));
3112
3113 /*
3114 * Recursively search the graph for all-RAINBOW paths to the "post" state,
3115 * starting at the "pre" state, and computing the lengths of the paths.
3116 * (Given the preceding checks, there should be at least one such path.
3117 * However we could get back a false result anyway, in case there are
3118 * multi-state loops, paths exceeding DUPINF+1 length, or non-algorithmic
3119 * failures such as ENOMEM.)
3120 */
3121 if (checkmatchall_recurse(nfa, nfa->pre, haspaths))
3122 {
3123 /* The useful result is the path length array for the pre state */
3124 bool *haspath = haspaths[nfa->pre->no];
3125 int minmatch,
3126 maxmatch,
3127 morematch;
3128
3129 assert(haspath != NULL);
3130
3131 /*
3132 * haspath[] now represents the set of possible path lengths; but we
3133 * want to reduce that to a min and max value, because it doesn't seem
3134 * worth complicating regexec.c to deal with nonconsecutive possible
3135 * match lengths. Find min and max of first run of lengths, then
3136 * verify there are no nonconsecutive lengths.
3137 */
3138 for (minmatch = 0; minmatch <= DUPINF + 1; minmatch++)
3139 {
3140 if (haspath[minmatch])
3141 break;
3142 }
3143 assert(minmatch <= DUPINF + 1); /* else checkmatchall_recurse lied */
3144 for (maxmatch = minmatch; maxmatch < DUPINF + 1; maxmatch++)
3145 {
3146 if (!haspath[maxmatch + 1])
3147 break;
3148 }
3149 for (morematch = maxmatch + 1; morematch <= DUPINF + 1; morematch++)
3150 {
3151 if (haspath[morematch])
3152 {
3153 haspath = NULL; /* fail, there are nonconsecutive lengths */
3154 break;
3155 }
3156 }
3157
3158 if (haspath != NULL)
3159 {
3160 /*
3161 * Success, so record the info. Here we have a fine point: the
3162 * path length from the pre state includes the pre-to-initial
3163 * transition, so it's one more than the actually matched string
3164 * length. (We avoided counting the final-to-post transition
3165 * within checkmatchall_recurse, but not this one.) This is why
3166 * checkmatchall_recurse allows one more level of path length than
3167 * might seem necessary. This decrement also takes care of
3168 * converting checkmatchall_recurse's definition of "infinity" as
3169 * "DUPINF+1" to our normal representation as "DUPINF".
3170 */
3171 assert(minmatch > 0); /* else pre and post states were adjacent */
3172 nfa->minmatchall = minmatch - 1;
3173 nfa->maxmatchall = maxmatch - 1;
3174 nfa->flags |= MATCHALL;
3175 }
3176 }
3177
3178 /* Clean up */
3179 for (i = 0; i < nfa->nstates; i++)
3180 {
3181 if (haspaths[i] != NULL)
3182 FREE(haspaths[i]);
3183 }
3184 FREE(haspaths);
3185 }
3186
3187 /*
3188 * checkmatchall_recurse - recursive search for checkmatchall
3189 *
3190 * s is the state to be examined in this recursion level.
3191 * haspaths[] is an array of per-state exit path length arrays.
3192 *
3193 * We return true if the search was performed successfully, false if
3194 * we had to fail because of multi-state loops or other internal reasons.
3195 * (Because "dead" states that can't reach the post state have been
3196 * eliminated, and we already verified that only RAINBOW and matching
3197 * pseudocolor arcs exist, every state should have RAINBOW path(s) to
3198 * the post state. Hence we take a false result from recursive calls
3199 * as meaning that we'd better fail altogether, not just that that
3200 * particular state can't reach the post state.)
3201 *
3202 * On success, we store a malloc'd result array in haspaths[s->no],
3203 * showing the possible path lengths from s to the post state.
3204 * Each state's haspath[] array is of length DUPINF+2. The entries from
3205 * k = 0 to DUPINF are true if there is an all-RAINBOW path of length k
3206 * from this state to the string end. haspath[DUPINF+1] is true if all
3207 * path lengths >= DUPINF+1 are possible. (Situations that cannot be
3208 * represented under these rules cause failure.)
3209 *
3210 * checkmatchall is responsible for eventually freeing the haspath[] arrays.
3211 */
3212 static bool
checkmatchall_recurse(struct nfa * nfa,struct state * s,bool ** haspaths)3213 checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths)
3214 {
3215 bool result = false;
3216 bool foundloop = false;
3217 bool *haspath;
3218 struct arc *a;
3219
3220 /*
3221 * Since this is recursive, it could be driven to stack overflow. But we
3222 * need not treat that as a hard failure; just deem the NFA non-matchall.
3223 */
3224 if (STACK_TOO_DEEP(nfa->v->re))
3225 return false;
3226
3227 /* In case the search takes a long time, check for cancel */
3228 if (CANCEL_REQUESTED(nfa->v->re))
3229 {
3230 NERR(REG_CANCEL);
3231 return false;
3232 }
3233
3234 /* Create a haspath array for this state */
3235 haspath = (bool *) MALLOC((DUPINF + 2) * sizeof(bool));
3236 if (haspath == NULL)
3237 return false; /* again, treat as non-matchall */
3238 memset(haspath, 0, (DUPINF + 2) * sizeof(bool));
3239
3240 /* Mark this state as being visited */
3241 assert(s->tmp == NULL);
3242 s->tmp = s;
3243
3244 for (a = s->outs; a != NULL; a = a->outchain)
3245 {
3246 if (a->co != RAINBOW)
3247 continue; /* ignore pseudocolor arcs */
3248 if (a->to == nfa->post)
3249 {
3250 /* We found an all-RAINBOW path to the post state */
3251 result = true;
3252
3253 /*
3254 * Mark this state as being zero steps away from the string end
3255 * (the transition to the post state isn't counted).
3256 */
3257 haspath[0] = true;
3258 }
3259 else if (a->to == s)
3260 {
3261 /* We found a cycle of length 1, which we'll deal with below. */
3262 foundloop = true;
3263 }
3264 else if (a->to->tmp != NULL)
3265 {
3266 /* It's busy, so we found a cycle of length > 1, so fail. */
3267 result = false;
3268 break;
3269 }
3270 else
3271 {
3272 /* Consider paths forward through this to-state. */
3273 bool *nexthaspath;
3274 int i;
3275
3276 /* If to-state was not already visited, recurse */
3277 if (haspaths[a->to->no] == NULL)
3278 {
3279 result = checkmatchall_recurse(nfa, a->to, haspaths);
3280 /* Fail if any recursive path fails */
3281 if (!result)
3282 break;
3283 }
3284 else
3285 {
3286 /* The previous visit must have found path(s) to the end */
3287 result = true;
3288 }
3289 assert(a->to->tmp == NULL);
3290 nexthaspath = haspaths[a->to->no];
3291 assert(nexthaspath != NULL);
3292
3293 /*
3294 * Now, for every path of length i from a->to to the string end,
3295 * there is a path of length i + 1 from s to the string end.
3296 */
3297 if (nexthaspath[DUPINF] != nexthaspath[DUPINF + 1])
3298 {
3299 /*
3300 * a->to has a path of length exactly DUPINF, but not longer;
3301 * or it has paths of all lengths > DUPINF but not one of
3302 * exactly that length. In either case, we cannot represent
3303 * the possible path lengths from s correctly, so fail.
3304 */
3305 result = false;
3306 break;
3307 }
3308 /* Merge knowledge of these path lengths into what we have */
3309 for (i = 0; i < DUPINF; i++)
3310 haspath[i + 1] |= nexthaspath[i];
3311 /* Infinity + 1 is still infinity */
3312 haspath[DUPINF + 1] |= nexthaspath[DUPINF + 1];
3313 }
3314 }
3315
3316 if (result && foundloop)
3317 {
3318 /*
3319 * If there is a length-1 loop at this state, then find the shortest
3320 * known path length to the end. The loop means that every larger
3321 * path length is possible, too. (It doesn't matter whether any of
3322 * the longer lengths were already known possible.)
3323 */
3324 int i;
3325
3326 for (i = 0; i <= DUPINF; i++)
3327 {
3328 if (haspath[i])
3329 break;
3330 }
3331 for (i++; i <= DUPINF + 1; i++)
3332 haspath[i] = true;
3333 }
3334
3335 /* Report out the completed path length map */
3336 assert(s->no < nfa->nstates);
3337 assert(haspaths[s->no] == NULL);
3338 haspaths[s->no] = haspath;
3339
3340 /* Mark state no longer busy */
3341 s->tmp = NULL;
3342
3343 return result;
3344 }
3345
3346 /*
3347 * check_out_colors_match - subroutine for checkmatchall
3348 *
3349 * Check whether the set of states reachable from s by arcs of color co1
3350 * is equivalent to the set reachable by arcs of color co2.
3351 * checkmatchall already verified that all of the NFA's arcs are PLAIN,
3352 * so we need not examine arc types here.
3353 */
3354 static bool
check_out_colors_match(struct state * s,color co1,color co2)3355 check_out_colors_match(struct state *s, color co1, color co2)
3356 {
3357 bool result = true;
3358 struct arc *a;
3359
3360 /*
3361 * To do this in linear time, we assume that the NFA contains no duplicate
3362 * arcs. Run through the out-arcs, marking states reachable by arcs of
3363 * color co1. Run through again, un-marking states reachable by arcs of
3364 * color co2; if we see a not-marked state, we know this co2 arc is
3365 * unmatched. Then run through again, checking for still-marked states,
3366 * and in any case leaving all the tmp fields reset to NULL.
3367 */
3368 for (a = s->outs; a != NULL; a = a->outchain)
3369 {
3370 if (a->co == co1)
3371 {
3372 assert(a->to->tmp == NULL);
3373 a->to->tmp = a->to;
3374 }
3375 }
3376 for (a = s->outs; a != NULL; a = a->outchain)
3377 {
3378 if (a->co == co2)
3379 {
3380 if (a->to->tmp != NULL)
3381 a->to->tmp = NULL;
3382 else
3383 result = false; /* unmatched co2 arc */
3384 }
3385 }
3386 for (a = s->outs; a != NULL; a = a->outchain)
3387 {
3388 if (a->co == co1)
3389 {
3390 if (a->to->tmp != NULL)
3391 {
3392 result = false; /* unmatched co1 arc */
3393 a->to->tmp = NULL;
3394 }
3395 }
3396 }
3397 return result;
3398 }
3399
3400 /*
3401 * check_in_colors_match - subroutine for checkmatchall
3402 *
3403 * Check whether the set of states that can reach s by arcs of color co1
3404 * is equivalent to the set that can reach s by arcs of color co2.
3405 * checkmatchall already verified that all of the NFA's arcs are PLAIN,
3406 * so we need not examine arc types here.
3407 */
3408 static bool
check_in_colors_match(struct state * s,color co1,color co2)3409 check_in_colors_match(struct state *s, color co1, color co2)
3410 {
3411 bool result = true;
3412 struct arc *a;
3413
3414 /*
3415 * Identical algorithm to check_out_colors_match, except examine the
3416 * from-states of s' inarcs.
3417 */
3418 for (a = s->ins; a != NULL; a = a->inchain)
3419 {
3420 if (a->co == co1)
3421 {
3422 assert(a->from->tmp == NULL);
3423 a->from->tmp = a->from;
3424 }
3425 }
3426 for (a = s->ins; a != NULL; a = a->inchain)
3427 {
3428 if (a->co == co2)
3429 {
3430 if (a->from->tmp != NULL)
3431 a->from->tmp = NULL;
3432 else
3433 result = false; /* unmatched co2 arc */
3434 }
3435 }
3436 for (a = s->ins; a != NULL; a = a->inchain)
3437 {
3438 if (a->co == co1)
3439 {
3440 if (a->from->tmp != NULL)
3441 {
3442 result = false; /* unmatched co1 arc */
3443 a->from->tmp = NULL;
3444 }
3445 }
3446 }
3447 return result;
3448 }
3449
3450 /*
3451 * compact - construct the compact representation of an NFA
3452 */
3453 static void
compact(struct nfa * nfa,struct cnfa * cnfa)3454 compact(struct nfa *nfa,
3455 struct cnfa *cnfa)
3456 {
3457 struct state *s;
3458 struct arc *a;
3459 size_t nstates;
3460 size_t narcs;
3461 struct carc *ca;
3462 struct carc *first;
3463
3464 assert(!NISERR());
3465
3466 nstates = 0;
3467 narcs = 0;
3468 for (s = nfa->states; s != NULL; s = s->next)
3469 {
3470 nstates++;
3471 narcs += s->nouts + 1; /* need one extra for endmarker */
3472 }
3473
3474 cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
3475 cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
3476 cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
3477 if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
3478 {
3479 if (cnfa->stflags != NULL)
3480 FREE(cnfa->stflags);
3481 if (cnfa->states != NULL)
3482 FREE(cnfa->states);
3483 if (cnfa->arcs != NULL)
3484 FREE(cnfa->arcs);
3485 NERR(REG_ESPACE);
3486 return;
3487 }
3488 cnfa->nstates = nstates;
3489 cnfa->pre = nfa->pre->no;
3490 cnfa->post = nfa->post->no;
3491 cnfa->bos[0] = nfa->bos[0];
3492 cnfa->bos[1] = nfa->bos[1];
3493 cnfa->eos[0] = nfa->eos[0];
3494 cnfa->eos[1] = nfa->eos[1];
3495 cnfa->ncolors = maxcolor(nfa->cm) + 1;
3496 cnfa->flags = nfa->flags;
3497 cnfa->minmatchall = nfa->minmatchall;
3498 cnfa->maxmatchall = nfa->maxmatchall;
3499
3500 ca = cnfa->arcs;
3501 for (s = nfa->states; s != NULL; s = s->next)
3502 {
3503 assert((size_t) s->no < nstates);
3504 cnfa->stflags[s->no] = 0;
3505 cnfa->states[s->no] = ca;
3506 first = ca;
3507 for (a = s->outs; a != NULL; a = a->outchain)
3508 switch (a->type)
3509 {
3510 case PLAIN:
3511 ca->co = a->co;
3512 ca->to = a->to->no;
3513 ca++;
3514 break;
3515 case LACON:
3516 assert(s->no != cnfa->pre);
3517 assert(a->co >= 0);
3518 ca->co = (color) (cnfa->ncolors + a->co);
3519 ca->to = a->to->no;
3520 ca++;
3521 cnfa->flags |= HASLACONS;
3522 break;
3523 default:
3524 NERR(REG_ASSERT);
3525 return;
3526 }
3527 carcsort(first, ca - first);
3528 ca->co = COLORLESS;
3529 ca->to = 0;
3530 ca++;
3531 }
3532 assert(ca == &cnfa->arcs[narcs]);
3533 assert(cnfa->nstates != 0);
3534
3535 /* mark no-progress states */
3536 for (a = nfa->pre->outs; a != NULL; a = a->outchain)
3537 cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
3538 cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
3539 }
3540
3541 /*
3542 * carcsort - sort compacted-NFA arcs by color
3543 */
3544 static void
carcsort(struct carc * first,size_t n)3545 carcsort(struct carc *first, size_t n)
3546 {
3547 if (n > 1)
3548 qsort(first, n, sizeof(struct carc), carc_cmp);
3549 }
3550
3551 static int
carc_cmp(const void * a,const void * b)3552 carc_cmp(const void *a, const void *b)
3553 {
3554 const struct carc *aa = (const struct carc *) a;
3555 const struct carc *bb = (const struct carc *) b;
3556
3557 if (aa->co < bb->co)
3558 return -1;
3559 if (aa->co > bb->co)
3560 return +1;
3561 if (aa->to < bb->to)
3562 return -1;
3563 if (aa->to > bb->to)
3564 return +1;
3565 return 0;
3566 }
3567
3568 /*
3569 * freecnfa - free a compacted NFA
3570 */
3571 static void
freecnfa(struct cnfa * cnfa)3572 freecnfa(struct cnfa *cnfa)
3573 {
3574 assert(!NULLCNFA(*cnfa)); /* not empty already */
3575 FREE(cnfa->stflags);
3576 FREE(cnfa->states);
3577 FREE(cnfa->arcs);
3578 ZAPCNFA(*cnfa);
3579 }
3580
3581 /*
3582 * dumpnfa - dump an NFA in human-readable form
3583 */
3584 static void
dumpnfa(struct nfa * nfa,FILE * f)3585 dumpnfa(struct nfa *nfa,
3586 FILE *f)
3587 {
3588 #ifdef REG_DEBUG
3589 struct state *s;
3590 int nstates = 0;
3591 int narcs = 0;
3592
3593 fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
3594 if (nfa->bos[0] != COLORLESS)
3595 fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
3596 if (nfa->bos[1] != COLORLESS)
3597 fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
3598 if (nfa->eos[0] != COLORLESS)
3599 fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
3600 if (nfa->eos[1] != COLORLESS)
3601 fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
3602 if (nfa->flags & HASLACONS)
3603 fprintf(f, ", haslacons");
3604 if (nfa->flags & MATCHALL)
3605 {
3606 fprintf(f, ", minmatchall %d", nfa->minmatchall);
3607 if (nfa->maxmatchall == DUPINF)
3608 fprintf(f, ", maxmatchall inf");
3609 else
3610 fprintf(f, ", maxmatchall %d", nfa->maxmatchall);
3611 }
3612 fprintf(f, "\n");
3613 for (s = nfa->states; s != NULL; s = s->next)
3614 {
3615 dumpstate(s, f);
3616 nstates++;
3617 narcs += s->nouts;
3618 }
3619 fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
3620 if (nfa->parent == NULL)
3621 dumpcolors(nfa->cm, f);
3622 fflush(f);
3623 #endif
3624 }
3625
3626 #ifdef REG_DEBUG /* subordinates of dumpnfa */
3627
3628 /*
3629 * dumpstate - dump an NFA state in human-readable form
3630 */
3631 static void
dumpstate(struct state * s,FILE * f)3632 dumpstate(struct state *s,
3633 FILE *f)
3634 {
3635 struct arc *a;
3636
3637 fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
3638 (s->flag) ? s->flag : '.');
3639 if (s->prev != NULL && s->prev->next != s)
3640 fprintf(f, "\tstate chain bad\n");
3641 if (s->nouts == 0)
3642 fprintf(f, "\tno out arcs\n");
3643 else
3644 dumparcs(s, f);
3645 for (a = s->ins; a != NULL; a = a->inchain)
3646 {
3647 if (a->to != s)
3648 fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
3649 a->from->no, a->to->no, s->no);
3650 }
3651 fflush(f);
3652 }
3653
3654 /*
3655 * dumparcs - dump out-arcs in human-readable form
3656 */
3657 static void
dumparcs(struct state * s,FILE * f)3658 dumparcs(struct state *s,
3659 FILE *f)
3660 {
3661 int pos;
3662 struct arc *a;
3663
3664 /* printing oldest arcs first is usually clearer */
3665 a = s->outs;
3666 assert(a != NULL);
3667 while (a->outchain != NULL)
3668 a = a->outchain;
3669 pos = 1;
3670 do
3671 {
3672 dumparc(a, s, f);
3673 if (pos == 5)
3674 {
3675 fprintf(f, "\n");
3676 pos = 1;
3677 }
3678 else
3679 pos++;
3680 a = a->outchainRev;
3681 } while (a != NULL);
3682 if (pos != 1)
3683 fprintf(f, "\n");
3684 }
3685
3686 /*
3687 * dumparc - dump one outarc in readable form, including prefixing tab
3688 */
3689 static void
dumparc(struct arc * a,struct state * s,FILE * f)3690 dumparc(struct arc *a,
3691 struct state *s,
3692 FILE *f)
3693 {
3694 struct arc *aa;
3695
3696 fprintf(f, "\t");
3697 switch (a->type)
3698 {
3699 case PLAIN:
3700 if (a->co == RAINBOW)
3701 fprintf(f, "[*]");
3702 else
3703 fprintf(f, "[%ld]", (long) a->co);
3704 break;
3705 case AHEAD:
3706 if (a->co == RAINBOW)
3707 fprintf(f, ">*>");
3708 else
3709 fprintf(f, ">%ld>", (long) a->co);
3710 break;
3711 case BEHIND:
3712 if (a->co == RAINBOW)
3713 fprintf(f, "<*<");
3714 else
3715 fprintf(f, "<%ld<", (long) a->co);
3716 break;
3717 case LACON:
3718 fprintf(f, ":%ld:", (long) a->co);
3719 break;
3720 case '^':
3721 case '$':
3722 fprintf(f, "%c%d", a->type, (int) a->co);
3723 break;
3724 case EMPTY:
3725 break;
3726 default:
3727 fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
3728 break;
3729 }
3730 if (a->from != s)
3731 fprintf(f, "?%d?", a->from->no);
3732 for (aa = a->from->outs; aa != NULL; aa = aa->outchain)
3733 if (aa == a)
3734 break; /* NOTE BREAK OUT */
3735 if (aa == NULL)
3736 fprintf(f, "?!?"); /* missing from out-chain */
3737 fprintf(f, "->");
3738 if (a->to == NULL)
3739 {
3740 fprintf(f, "NULL");
3741 return;
3742 }
3743 fprintf(f, "%d", a->to->no);
3744 for (aa = a->to->ins; aa != NULL; aa = aa->inchain)
3745 if (aa == a)
3746 break; /* NOTE BREAK OUT */
3747 if (aa == NULL)
3748 fprintf(f, "?!?"); /* missing from in-chain */
3749 }
3750 #endif /* REG_DEBUG */
3751
3752 /*
3753 * dumpcnfa - dump a compacted NFA in human-readable form
3754 */
3755 #ifdef REG_DEBUG
3756 static void
dumpcnfa(struct cnfa * cnfa,FILE * f)3757 dumpcnfa(struct cnfa *cnfa,
3758 FILE *f)
3759 {
3760 int st;
3761
3762 fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
3763 if (cnfa->bos[0] != COLORLESS)
3764 fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
3765 if (cnfa->bos[1] != COLORLESS)
3766 fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
3767 if (cnfa->eos[0] != COLORLESS)
3768 fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
3769 if (cnfa->eos[1] != COLORLESS)
3770 fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
3771 if (cnfa->flags & HASLACONS)
3772 fprintf(f, ", haslacons");
3773 if (cnfa->flags & MATCHALL)
3774 {
3775 fprintf(f, ", minmatchall %d", cnfa->minmatchall);
3776 if (cnfa->maxmatchall == DUPINF)
3777 fprintf(f, ", maxmatchall inf");
3778 else
3779 fprintf(f, ", maxmatchall %d", cnfa->maxmatchall);
3780 }
3781 fprintf(f, "\n");
3782 for (st = 0; st < cnfa->nstates; st++)
3783 dumpcstate(st, cnfa, f);
3784 fflush(f);
3785 }
3786 #endif
3787
3788 #ifdef REG_DEBUG /* subordinates of dumpcnfa */
3789
3790 /*
3791 * dumpcstate - dump a compacted-NFA state in human-readable form
3792 */
3793 static void
dumpcstate(int st,struct cnfa * cnfa,FILE * f)3794 dumpcstate(int st,
3795 struct cnfa *cnfa,
3796 FILE *f)
3797 {
3798 struct carc *ca;
3799 int pos;
3800
3801 fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
3802 pos = 1;
3803 for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
3804 {
3805 if (ca->co == RAINBOW)
3806 fprintf(f, "\t[*]->%d", ca->to);
3807 else if (ca->co < cnfa->ncolors)
3808 fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
3809 else
3810 fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
3811 if (pos == 5)
3812 {
3813 fprintf(f, "\n");
3814 pos = 1;
3815 }
3816 else
3817 pos++;
3818 }
3819 if (ca == cnfa->states[st] || pos != 1)
3820 fprintf(f, "\n");
3821 fflush(f);
3822 }
3823
3824 #endif /* REG_DEBUG */
3825