1 /*
2 * NFA utilities.
3 * This file is #included by regcomp.c.
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation of
18 * software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * One or two things that technically ought to be in here are actually in
32 * color.c, thanks to some incestuous relationships in the color chains.
33 */
34
35 #define NISERR() VISERR(nfa->v)
36 #define NERR(e) VERR(nfa->v, (e))
37 #define STACK_TOO_DEEP(x) (0)
38 #define CANCEL_REQUESTED(x) (0)
39 #define REG_CANCEL 777
40
41 /*
42 - newnfa - set up an NFA
43 ^ static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
44 */
45 static struct nfa * /* the NFA, or NULL */
newnfa(struct vars * v,struct colormap * cm,struct nfa * parent)46 newnfa(
47 struct vars *v,
48 struct colormap *cm,
49 struct nfa *parent) /* NULL if primary NFA */
50 {
51 struct nfa *nfa;
52
53 nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
54 if (nfa == NULL) {
55 ERR(REG_ESPACE);
56 return NULL;
57 }
58
59 nfa->states = NULL;
60 nfa->slast = NULL;
61 nfa->free = NULL;
62 nfa->nstates = 0;
63 nfa->cm = cm;
64 nfa->v = v;
65 nfa->size = 0;
66 nfa->bos[0] = nfa->bos[1] = COLORLESS;
67 nfa->eos[0] = nfa->eos[1] = COLORLESS;
68 nfa->parent = parent; /* Precedes newfstate so parent is valid. */
69 nfa->post = newfstate(nfa, '@'); /* number 0 */
70 nfa->pre = newfstate(nfa, '>'); /* number 1 */
71
72 nfa->init = newstate(nfa); /* May become invalid later. */
73 nfa->final = newstate(nfa);
74 if (ISERR()) {
75 freenfa(nfa);
76 return NULL;
77 }
78 rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
79 newarc(nfa, '^', 1, nfa->pre, nfa->init);
80 newarc(nfa, '^', 0, nfa->pre, nfa->init);
81 rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
82 newarc(nfa, '$', 1, nfa->final, nfa->post);
83 newarc(nfa, '$', 0, nfa->final, nfa->post);
84
85 if (ISERR()) {
86 freenfa(nfa);
87 return NULL;
88 }
89 return nfa;
90 }
91
92 /*
93 - TooManyStates - checks if the max states exceeds the compile-time value
94 ^ static int TooManyStates(struct nfa *);
95 */
96 static int
TooManyStates(struct nfa * nfa)97 TooManyStates(
98 struct nfa *nfa)
99 {
100 struct nfa *parent = nfa->parent;
101 size_t sz = nfa->size;
102
103 while (parent != NULL) {
104 sz = parent->size;
105 parent = parent->parent;
106 }
107 if (sz > REG_MAX_STATES) {
108 return 1;
109 }
110 return 0;
111 }
112
113 /*
114 - IncrementSize - increases the tracked size of the NFA and its parents.
115 ^ static void IncrementSize(struct nfa *);
116 */
117 static void
IncrementSize(struct nfa * nfa)118 IncrementSize(
119 struct nfa *nfa)
120 {
121 struct nfa *parent = nfa->parent;
122
123 nfa->size++;
124 while (parent != NULL) {
125 parent->size++;
126 parent = parent->parent;
127 }
128 }
129
130 /*
131 - DecrementSize - increases the tracked size of the NFA and its parents.
132 ^ static void DecrementSize(struct nfa *);
133 */
134 static void
DecrementSize(struct nfa * nfa)135 DecrementSize(
136 struct nfa *nfa)
137 {
138 struct nfa *parent = nfa->parent;
139
140 nfa->size--;
141 while (parent != NULL) {
142 parent->size--;
143 parent = parent->parent;
144 }
145 }
146
147 /*
148 - freenfa - free an entire NFA
149 ^ static VOID freenfa(struct nfa *);
150 */
151 static void
freenfa(struct nfa * nfa)152 freenfa(
153 struct nfa *nfa)
154 {
155 struct state *s;
156
157 while ((s = nfa->states) != NULL) {
158 s->nins = s->nouts = 0; /* don't worry about arcs */
159 freestate(nfa, s);
160 }
161 while ((s = nfa->free) != NULL) {
162 nfa->free = s->next;
163 destroystate(nfa, s);
164 }
165
166 nfa->slast = NULL;
167 nfa->nstates = -1;
168 nfa->pre = NULL;
169 nfa->post = NULL;
170 FREE(nfa);
171 }
172
173 /*
174 - newstate - allocate an NFA state, with zero flag value
175 ^ static struct state *newstate(struct nfa *);
176 */
177 static struct state * /* NULL on error */
newstate(struct nfa * nfa)178 newstate(
179 struct nfa *nfa)
180 {
181 struct state *s;
182
183 if (TooManyStates(nfa)) {
184 /* XXX: add specific error for this */
185 NERR(REG_ETOOBIG);
186 return NULL;
187 }
188 if (nfa->free != NULL) {
189 s = nfa->free;
190 nfa->free = s->next;
191 } else {
192 s = (struct state *) MALLOC(sizeof(struct state));
193 if (s == NULL) {
194 NERR(REG_ESPACE);
195 return NULL;
196 }
197 s->oas.next = NULL;
198 s->free = NULL;
199 s->noas = 0;
200 }
201
202 assert(nfa->nstates >= 0);
203 s->no = nfa->nstates++;
204 s->flag = 0;
205 if (nfa->states == NULL) {
206 nfa->states = s;
207 }
208 s->nins = 0;
209 s->ins = NULL;
210 s->nouts = 0;
211 s->outs = NULL;
212 s->tmp = NULL;
213 s->next = NULL;
214 if (nfa->slast != NULL) {
215 assert(nfa->slast->next == NULL);
216 nfa->slast->next = s;
217 }
218 s->prev = nfa->slast;
219 nfa->slast = s;
220
221 /*
222 * Track the current size and the parent size.
223 */
224
225 IncrementSize(nfa);
226 return s;
227 }
228
229 /*
230 - newfstate - allocate an NFA state with a specified flag value
231 ^ static struct state *newfstate(struct nfa *, int flag);
232 */
233 static struct state * /* NULL on error */
newfstate(struct nfa * nfa,int flag)234 newfstate(
235 struct nfa *nfa,
236 int flag)
237 {
238 struct state *s;
239
240 s = newstate(nfa);
241 if (s != NULL) {
242 s->flag = (char) flag;
243 }
244 return s;
245 }
246
247 /*
248 - dropstate - delete a state's inarcs and outarcs and free it
249 ^ static VOID dropstate(struct nfa *, struct state *);
250 */
251 static void
dropstate(struct nfa * nfa,struct state * s)252 dropstate(
253 struct nfa *nfa,
254 struct state *s)
255 {
256 struct arc *a;
257
258 while ((a = s->ins) != NULL) {
259 freearc(nfa, a);
260 }
261 while ((a = s->outs) != NULL) {
262 freearc(nfa, a);
263 }
264 freestate(nfa, s);
265 }
266
267 /*
268 - freestate - free a state, which has no in-arcs or out-arcs
269 ^ static VOID freestate(struct nfa *, struct state *);
270 */
271 static void
freestate(struct nfa * nfa,struct state * s)272 freestate(
273 struct nfa *nfa,
274 struct state *s)
275 {
276 assert(s != NULL);
277 assert(s->nins == 0 && s->nouts == 0);
278
279 s->no = FREESTATE;
280 s->flag = 0;
281 if (s->next != NULL) {
282 s->next->prev = s->prev;
283 } else {
284 assert(s == nfa->slast);
285 nfa->slast = s->prev;
286 }
287 if (s->prev != NULL) {
288 s->prev->next = s->next;
289 } else {
290 assert(s == nfa->states);
291 nfa->states = s->next;
292 }
293 s->prev = NULL;
294 s->next = nfa->free; /* don't delete it, put it on the free list */
295 nfa->free = s;
296 DecrementSize(nfa);
297 }
298
299 /*
300 - destroystate - really get rid of an already-freed state
301 ^ static VOID destroystate(struct nfa *, struct state *);
302 */
303 static void
destroystate(struct nfa * nfa,struct state * s)304 destroystate(
305 struct nfa *nfa,
306 struct state *s)
307 {
308 struct arcbatch *ab;
309 struct arcbatch *abnext;
310
311 assert(s->no == FREESTATE);
312 for (ab=s->oas.next ; ab!=NULL ; ab=abnext) {
313 abnext = ab->next;
314 FREE(ab);
315 }
316 s->ins = NULL;
317 s->outs = NULL;
318 s->next = NULL;
319 FREE(s);
320 }
321
322 /*
323 - newarc - set up a new arc within an NFA
324 ^ static VOID newarc(struct nfa *, int, pcolor, struct state *,
325 ^ struct state *);
326 */
327 /*
328 * This function checks to make sure that no duplicate arcs are created.
329 * In general we never want duplicates.
330 */
331 static void
newarc(struct nfa * nfa,int t,pcolor co,struct state * from,struct state * to)332 newarc(
333 struct nfa *nfa,
334 int t,
335 pcolor co,
336 struct state *from,
337 struct state *to)
338 {
339 struct arc *a;
340
341 assert(from != NULL && to != NULL);
342
343 /* check for duplicate arc, using whichever chain is shorter */
344 if (from->nouts <= to->nins) {
345 for (a = from->outs; a != NULL; a = a->outchain) {
346 if (a->to == to && a->co == co && a->type == t) {
347 return;
348 }
349 }
350 } else {
351 for (a = to->ins; a != NULL; a = a->inchain) {
352 if (a->from == from && a->co == co && a->type == t) {
353 return;
354 }
355 }
356 }
357
358 /* no dup, so create the arc */
359 createarc(nfa, t, co, from, to);
360 }
361
362 /*
363 * createarc - create a new arc within an NFA
364 *
365 * This function must *only* be used after verifying that there is no existing
366 * identical arc (same type/color/from/to).
367 */
368 static void
createarc(struct nfa * nfa,int t,pcolor co,struct state * from,struct state * to)369 createarc(
370 struct nfa * nfa,
371 int t,
372 pcolor co,
373 struct state * from,
374 struct state * to)
375 {
376 struct arc *a;
377
378 /* the arc is physically allocated within its from-state */
379 a = allocarc(nfa, from);
380 if (NISERR()) {
381 return;
382 }
383 assert(a != NULL);
384
385 a->type = t;
386 a->co = (color) co;
387 a->to = to;
388 a->from = from;
389
390 /*
391 * Put the new arc on the beginning, not the end, of the chains; it's
392 * simpler here, and freearc() is the same cost either way. See also the
393 * logic in moveins() and its cohorts, as well as fixempties().
394 */
395 a->inchain = to->ins;
396 a->inchainRev = NULL;
397 if (to->ins) {
398 to->ins->inchainRev = a;
399 }
400 to->ins = a;
401 a->outchain = from->outs;
402 a->outchainRev = NULL;
403 if (from->outs) {
404 from->outs->outchainRev = a;
405 }
406 from->outs = a;
407
408 from->nouts++;
409 to->nins++;
410
411 if (COLORED(a) && nfa->parent == NULL) {
412 colorchain(nfa->cm, a);
413 }
414 }
415
416 /*
417 - allocarc - allocate a new out-arc within a state
418 ^ static struct arc *allocarc(struct nfa *, struct state *);
419 */
420 static struct arc * /* NULL for failure */
allocarc(struct nfa * nfa,struct state * s)421 allocarc(
422 struct nfa *nfa,
423 struct state *s)
424 {
425 struct arc *a;
426
427 /*
428 * Shortcut
429 */
430
431 if (s->free == NULL && s->noas < ABSIZE) {
432 a = &s->oas.a[s->noas];
433 s->noas++;
434 return a;
435 }
436
437 /*
438 * if none at hand, get more
439 */
440
441 if (s->free == NULL) {
442 struct arcbatch *newAb = (struct arcbatch *)
443 MALLOC(sizeof(struct arcbatch));
444 int i;
445
446 if (newAb == NULL) {
447 NERR(REG_ESPACE);
448 return NULL;
449 }
450 newAb->next = s->oas.next;
451 s->oas.next = newAb;
452
453 for (i=0 ; i<ABSIZE ; i++) {
454 newAb->a[i].type = 0;
455 newAb->a[i].freechain = &newAb->a[i+1];
456 }
457 newAb->a[ABSIZE-1].freechain = NULL;
458 s->free = &newAb->a[0];
459 }
460 assert(s->free != NULL);
461
462 a = s->free;
463 s->free = a->freechain;
464 return a;
465 }
466
467 /*
468 - freearc - free an arc
469 ^ static VOID freearc(struct nfa *, struct arc *);
470 */
471 static void
freearc(struct nfa * nfa,struct arc * victim)472 freearc(
473 struct nfa *nfa,
474 struct arc *victim)
475 {
476 struct state *from = victim->from;
477 struct state *to = victim->to;
478 struct arc *predecessor;
479
480 assert(victim->type != 0);
481
482 /*
483 * Take it off color chain if necessary.
484 */
485
486 if (COLORED(victim) && nfa->parent == NULL) {
487 uncolorchain(nfa->cm, victim);
488 }
489
490 /*
491 * Take it off source's out-chain.
492 */
493
494 assert(from != NULL);
495 predecessor = victim->outchainRev;
496 if (predecessor == NULL) {
497 assert(from->outs == victim);
498 from->outs = victim->outchain;
499 } else {
500 assert(predecessor->outchain == victim);
501 predecessor->outchain = victim->outchain;
502 }
503 if (victim->outchain != NULL) {
504 assert(victim->outchain->outchainRev == victim);
505 victim->outchain->outchainRev = predecessor;
506 }
507 from->nouts--;
508
509 /*
510 * Take it off target's in-chain.
511 */
512
513 assert(to != NULL);
514 predecessor = victim->inchainRev;
515 if (predecessor == NULL) {
516 assert(to->ins == victim);
517 to->ins = victim->inchain;
518 } else {
519 assert(predecessor->inchain == victim);
520 predecessor->inchain = victim->inchain;
521 }
522 if (victim->inchain != NULL) {
523 assert(victim->inchain->inchainRev == victim);
524 victim->inchain->inchainRev = predecessor;
525 }
526 to->nins--;
527
528 /*
529 * Clean up and place on from-state's free list.
530 */
531
532 victim->type = 0;
533 victim->from = NULL; /* precautions... */
534 victim->to = NULL;
535 victim->inchain = NULL;
536 victim->inchainRev = NULL;
537 victim->outchain = NULL;
538 victim->outchainRev = NULL;
539 victim->freechain = from->free;
540 from->free = victim;
541 }
542
543 /*
544 * changearctarget - flip an arc to have a different to state
545 *
546 * Caller must have verified that there is no pre-existing duplicate arc.
547 *
548 * Note that because we store arcs in their from state, we can't easily have
549 * a similar changearcsource function.
550 */
551 static void
changearctarget(struct arc * a,struct state * newto)552 changearctarget(struct arc * a, struct state * newto)
553 {
554 struct state *oldto = a->to;
555 struct arc *predecessor;
556
557 assert(oldto != newto);
558
559 /* take it off old target's in-chain */
560 assert(oldto != NULL);
561 predecessor = a->inchainRev;
562 if (predecessor == NULL) {
563 assert(oldto->ins == a);
564 oldto->ins = a->inchain;
565 } else {
566 assert(predecessor->inchain == a);
567 predecessor->inchain = a->inchain;
568 }
569 if (a->inchain != NULL) {
570 assert(a->inchain->inchainRev == a);
571 a->inchain->inchainRev = predecessor;
572 }
573 oldto->nins--;
574
575 a->to = newto;
576
577 /* prepend it to new target's in-chain */
578 a->inchain = newto->ins;
579 a->inchainRev = NULL;
580 if (newto->ins) {
581 newto->ins->inchainRev = a;
582 }
583 newto->ins = a;
584 newto->nins++;
585 }
586
587 /*
588 - hasnonemptyout - Does state have a non-EMPTY out arc?
589 ^ static int hasnonemptyout(struct state *);
590 */
591 static int
hasnonemptyout(struct state * s)592 hasnonemptyout(
593 struct state *s)
594 {
595 struct arc *a;
596
597 for (a = s->outs; a != NULL; a = a->outchain) {
598 if (a->type != EMPTY) {
599 return 1;
600 }
601 }
602 return 0;
603 }
604
605 /*
606 - findarc - find arc, if any, from given source with given type and color
607 * If there is more than one such arc, the result is random.
608 ^ static struct arc *findarc(struct state *, int, pcolor);
609 */
610 static struct arc *
findarc(struct state * s,int type,pcolor co)611 findarc(
612 struct state *s,
613 int type,
614 pcolor co)
615 {
616 struct arc *a;
617
618 for (a=s->outs ; a!=NULL ; a=a->outchain) {
619 if (a->type == type && a->co == co) {
620 return a;
621 }
622 }
623 return NULL;
624 }
625
626 /*
627 - cparc - allocate a new arc within an NFA, copying details from old one
628 ^ static VOID cparc(struct nfa *, struct arc *, struct state *,
629 ^ struct state *);
630 */
631 static void
cparc(struct nfa * nfa,struct arc * oa,struct state * from,struct state * to)632 cparc(
633 struct nfa *nfa,
634 struct arc *oa,
635 struct state *from,
636 struct state *to)
637 {
638 newarc(nfa, oa->type, oa->co, from, to);
639 }
640
641 /*
642 * sortins - sort the in arcs of a state by from/color/type
643 */
644 static void
sortins(struct nfa * nfa,struct state * s)645 sortins(
646 struct nfa * nfa,
647 struct state * s)
648 {
649 struct arc **sortarray;
650 struct arc *a;
651 int n = s->nins;
652 int i;
653
654 if (n <= 1) {
655 return; /* nothing to do */
656 }
657 /* make an array of arc pointers ... */
658 sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
659 if (sortarray == NULL) {
660 NERR(REG_ESPACE);
661 return;
662 }
663 i = 0;
664 for (a = s->ins; a != NULL; a = a->inchain) {
665 sortarray[i++] = a;
666 }
667 assert(i == n);
668 /* ... sort the array */
669 qsort(sortarray, n, sizeof(struct arc *), sortins_cmp);
670 /* ... and rebuild arc list in order */
671 /* it seems worth special-casing first and last items to simplify loop */
672 a = sortarray[0];
673 s->ins = a;
674 a->inchain = sortarray[1];
675 a->inchainRev = NULL;
676 for (i = 1; i < n - 1; i++) {
677 a = sortarray[i];
678 a->inchain = sortarray[i + 1];
679 a->inchainRev = sortarray[i - 1];
680 }
681 a = sortarray[i];
682 a->inchain = NULL;
683 a->inchainRev = sortarray[i - 1];
684 FREE(sortarray);
685 }
686
687 static int
sortins_cmp(const void * a,const void * b)688 sortins_cmp(
689 const void *a,
690 const void *b)
691 {
692 const struct arc *aa = *((const struct arc * const *) a);
693 const struct arc *bb = *((const struct arc * const *) b);
694
695 /* we check the fields in the order they are most likely to be different */
696 if (aa->from->no < bb->from->no) {
697 return -1;
698 }
699 if (aa->from->no > bb->from->no) {
700 return 1;
701 }
702 if (aa->co < bb->co) {
703 return -1;
704 }
705 if (aa->co > bb->co) {
706 return 1;
707 }
708 if (aa->type < bb->type) {
709 return -1;
710 }
711 if (aa->type > bb->type) {
712 return 1;
713 }
714 return 0;
715 }
716
717 /*
718 * sortouts - sort the out arcs of a state by to/color/type
719 */
720 static void
sortouts(struct nfa * nfa,struct state * s)721 sortouts(
722 struct nfa * nfa,
723 struct state * s)
724 {
725 struct arc **sortarray;
726 struct arc *a;
727 int n = s->nouts;
728 int i;
729
730 if (n <= 1) {
731 return; /* nothing to do */
732 }
733 /* make an array of arc pointers ... */
734 sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
735 if (sortarray == NULL) {
736 NERR(REG_ESPACE);
737 return;
738 }
739 i = 0;
740 for (a = s->outs; a != NULL; a = a->outchain) {
741 sortarray[i++] = a;
742 }
743 assert(i == n);
744 /* ... sort the array */
745 qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp);
746 /* ... and rebuild arc list in order */
747 /* it seems worth special-casing first and last items to simplify loop */
748 a = sortarray[0];
749 s->outs = a;
750 a->outchain = sortarray[1];
751 a->outchainRev = NULL;
752 for (i = 1; i < n - 1; i++) {
753 a = sortarray[i];
754 a->outchain = sortarray[i + 1];
755 a->outchainRev = sortarray[i - 1];
756 }
757 a = sortarray[i];
758 a->outchain = NULL;
759 a->outchainRev = sortarray[i - 1];
760 FREE(sortarray);
761 }
762
763 static int
sortouts_cmp(const void * a,const void * b)764 sortouts_cmp(
765 const void *a,
766 const void *b)
767 {
768 const struct arc *aa = *((const struct arc * const *) a);
769 const struct arc *bb = *((const struct arc * const *) b);
770
771 /* we check the fields in the order they are most likely to be different */
772 if (aa->to->no < bb->to->no) {
773 return -1;
774 }
775 if (aa->to->no > bb->to->no) {
776 return 1;
777 }
778 if (aa->co < bb->co) {
779 return -1;
780 }
781 if (aa->co > bb->co) {
782 return 1;
783 }
784 if (aa->type < bb->type) {
785 return -1;
786 }
787 if (aa->type > bb->type) {
788 return 1;
789 }
790 return 0;
791 }
792
793 /*
794 * Common decision logic about whether to use arc-by-arc operations or
795 * sort/merge. If there's just a few source arcs we cannot recoup the
796 * cost of sorting the destination arc list, no matter how large it is.
797 * Otherwise, limit the number of arc-by-arc comparisons to about 1000
798 * (a somewhat arbitrary choice, but the breakeven point would probably
799 * be machine dependent anyway).
800 */
801 #define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
802 ((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32))
803
804 /*
805 - moveins - move all in arcs of a state to another state
806 * You might think this could be done better by just updating the
807 * existing arcs, and you would be right if it weren't for the need
808 * for duplicate suppression, which makes it easier to just make new
809 * ones to exploit the suppression built into newarc.
810 *
811 * However, if we have a whole lot of arcs to deal with, retail duplicate
812 * checks become too slow. In that case we proceed by sorting and merging
813 * the arc lists, and then we can indeed just update the arcs in-place.
814 *
815 ^ static VOID moveins(struct nfa *, struct state *, struct state *);
816 */
817 static void
moveins(struct nfa * nfa,struct state * oldState,struct state * newState)818 moveins(
819 struct nfa *nfa,
820 struct state *oldState,
821 struct state *newState)
822 {
823 assert(oldState != newState);
824
825 if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) {
826 /* With not too many arcs, just do them one at a time */
827 struct arc *a;
828
829 while ((a = oldState->ins) != NULL) {
830 cparc(nfa, a, a->from, newState);
831 freearc(nfa, a);
832 }
833 } else {
834 /*
835 * With many arcs, use a sort-merge approach. Note changearctarget()
836 * will put the arc onto the front of newState's chain, so it does not
837 * break our walk through the sorted part of the chain.
838 */
839 struct arc *oa;
840 struct arc *na;
841
842 /*
843 * Because we bypass newarc() in this code path, we'd better include a
844 * cancel check.
845 */
846 if (CANCEL_REQUESTED(nfa->v->re)) {
847 NERR(REG_CANCEL);
848 return;
849 }
850
851 sortins(nfa, oldState);
852 sortins(nfa, newState);
853 if (NISERR()) {
854 return; /* might have failed to sort */
855 }
856 oa = oldState->ins;
857 na = newState->ins;
858 while (oa != NULL && na != NULL) {
859 struct arc *a = oa;
860
861 switch (sortins_cmp(&oa, &na)) {
862 case -1:
863 /* newState does not have anything matching oa */
864 oa = oa->inchain;
865
866 /*
867 * Rather than doing createarc+freearc, we can just unlink
868 * and relink the existing arc struct.
869 */
870 changearctarget(a, newState);
871 break;
872 case 0:
873 /* match, advance in both lists */
874 oa = oa->inchain;
875 na = na->inchain;
876 /* ... and drop duplicate arc from oldState */
877 freearc(nfa, a);
878 break;
879 case +1:
880 /* advance only na; oa might have a match later */
881 na = na->inchain;
882 break;
883 default:
884 assert(NOTREACHED);
885 }
886 }
887 while (oa != NULL) {
888 /* newState does not have anything matching oa */
889 struct arc *a = oa;
890
891 oa = oa->inchain;
892 changearctarget(a, newState);
893 }
894 }
895
896 assert(oldState->nins == 0);
897 assert(oldState->ins == NULL);
898 }
899
900 /*
901 - copyins - copy in arcs of a state to another state
902 ^ static VOID copyins(struct nfa *, struct state *, struct state *, int);
903 */
904 static void
copyins(struct nfa * nfa,struct state * oldState,struct state * newState)905 copyins(
906 struct nfa *nfa,
907 struct state *oldState,
908 struct state *newState)
909 {
910 assert(oldState != newState);
911
912 if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) {
913 /* With not too many arcs, just do them one at a time */
914 struct arc *a;
915
916 for (a = oldState->ins; a != NULL; a = a->inchain) {
917 cparc(nfa, a, a->from, newState);
918 }
919 } else {
920 /*
921 * With many arcs, use a sort-merge approach. Note that createarc()
922 * will put new arcs onto the front of newState's chain, so it does
923 * not break our walk through the sorted part of the chain.
924 */
925 struct arc *oa;
926 struct arc *na;
927
928 /*
929 * Because we bypass newarc() in this code path, we'd better include a
930 * cancel check.
931 */
932 if (CANCEL_REQUESTED(nfa->v->re)) {
933 NERR(REG_CANCEL);
934 return;
935 }
936
937 sortins(nfa, oldState);
938 sortins(nfa, newState);
939 if (NISERR()) {
940 return; /* might have failed to sort */
941 }
942 oa = oldState->ins;
943 na = newState->ins;
944 while (oa != NULL && na != NULL) {
945 struct arc *a = oa;
946
947 switch (sortins_cmp(&oa, &na)) {
948 case -1:
949 /* newState does not have anything matching oa */
950 oa = oa->inchain;
951 createarc(nfa, a->type, a->co, a->from, newState);
952 break;
953 case 0:
954 /* match, advance in both lists */
955 oa = oa->inchain;
956 na = na->inchain;
957 break;
958 case +1:
959 /* advance only na; oa might have a match later */
960 na = na->inchain;
961 break;
962 default:
963 assert(NOTREACHED);
964 }
965 }
966 while (oa != NULL) {
967 /* newState does not have anything matching oa */
968 struct arc *a = oa;
969
970 oa = oa->inchain;
971 createarc(nfa, a->type, a->co, a->from, newState);
972 }
973 }
974 }
975
976 /*
977 * mergeins - merge a list of inarcs into a state
978 *
979 * This is much like copyins, but the source arcs are listed in an array,
980 * and are not guaranteed unique. It's okay to clobber the array contents.
981 */
982 static void
mergeins(struct nfa * nfa,struct state * s,struct arc ** arcarray,int arccount)983 mergeins(
984 struct nfa * nfa,
985 struct state * s,
986 struct arc ** arcarray,
987 int arccount)
988 {
989 struct arc *na;
990 int i;
991 int j;
992
993 if (arccount <= 0) {
994 return;
995 }
996
997 /*
998 * Because we bypass newarc() in this code path, we'd better include a
999 * cancel check.
1000 */
1001 if (CANCEL_REQUESTED(nfa->v->re)) {
1002 NERR(REG_CANCEL);
1003 return;
1004 }
1005
1006 /* Sort existing inarcs as well as proposed new ones */
1007 sortins(nfa, s);
1008 if (NISERR()) {
1009 return; /* might have failed to sort */
1010 }
1011
1012 qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
1013
1014 /*
1015 * arcarray very likely includes dups, so we must eliminate them. (This
1016 * could be folded into the next loop, but it's not worth the trouble.)
1017 */
1018 j = 0;
1019 for (i = 1; i < arccount; i++) {
1020 switch (sortins_cmp(&arcarray[j], &arcarray[i])) {
1021 case -1:
1022 /* non-dup */
1023 arcarray[++j] = arcarray[i];
1024 break;
1025 case 0:
1026 /* dup */
1027 break;
1028 default:
1029 /* trouble */
1030 assert(NOTREACHED);
1031 }
1032 }
1033 arccount = j + 1;
1034
1035 /*
1036 * Now merge into s' inchain. Note that createarc() will put new arcs
1037 * onto the front of s's chain, so it does not break our walk through the
1038 * sorted part of the chain.
1039 */
1040 i = 0;
1041 na = s->ins;
1042 while (i < arccount && na != NULL) {
1043 struct arc *a = arcarray[i];
1044
1045 switch (sortins_cmp(&a, &na)) {
1046 case -1:
1047 /* s does not have anything matching a */
1048 createarc(nfa, a->type, a->co, a->from, s);
1049 i++;
1050 break;
1051 case 0:
1052 /* match, advance in both lists */
1053 i++;
1054 na = na->inchain;
1055 break;
1056 case +1:
1057 /* advance only na; array might have a match later */
1058 na = na->inchain;
1059 break;
1060 default:
1061 assert(NOTREACHED);
1062 }
1063 }
1064 while (i < arccount) {
1065 /* s does not have anything matching a */
1066 struct arc *a = arcarray[i];
1067
1068 createarc(nfa, a->type, a->co, a->from, s);
1069 i++;
1070 }
1071 }
1072
1073 /*
1074 - moveouts - move all out arcs of a state to another state
1075 ^ static VOID moveouts(struct nfa *, struct state *, struct state *);
1076 */
1077 static void
moveouts(struct nfa * nfa,struct state * oldState,struct state * newState)1078 moveouts(
1079 struct nfa *nfa,
1080 struct state *oldState,
1081 struct state *newState)
1082 {
1083 assert(oldState != newState);
1084
1085 if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) {
1086 /* With not too many arcs, just do them one at a time */
1087 struct arc *a;
1088
1089 while ((a = oldState->outs) != NULL) {
1090 cparc(nfa, a, newState, a->to);
1091 freearc(nfa, a);
1092 }
1093 } else {
1094 /*
1095 * With many arcs, use a sort-merge approach. Note that createarc()
1096 * will put new arcs onto the front of newState's chain, so it does
1097 * not break our walk through the sorted part of the chain.
1098 */
1099 struct arc *oa;
1100 struct arc *na;
1101
1102 /*
1103 * Because we bypass newarc() in this code path, we'd better include a
1104 * cancel check.
1105 */
1106 if (CANCEL_REQUESTED(nfa->v->re)) {
1107 NERR(REG_CANCEL);
1108 return;
1109 }
1110
1111 sortouts(nfa, oldState);
1112 sortouts(nfa, newState);
1113 if (NISERR()) {
1114 return; /* might have failed to sort */
1115 }
1116 oa = oldState->outs;
1117 na = newState->outs;
1118 while (oa != NULL && na != NULL) {
1119 struct arc *a = oa;
1120
1121 switch (sortouts_cmp(&oa, &na)) {
1122 case -1:
1123 /* newState does not have anything matching oa */
1124 oa = oa->outchain;
1125 createarc(nfa, a->type, a->co, newState, a->to);
1126 freearc(nfa, a);
1127 break;
1128 case 0:
1129 /* match, advance in both lists */
1130 oa = oa->outchain;
1131 na = na->outchain;
1132 /* ... and drop duplicate arc from oldState */
1133 freearc(nfa, a);
1134 break;
1135 case +1:
1136 /* advance only na; oa might have a match later */
1137 na = na->outchain;
1138 break;
1139 default:
1140 assert(NOTREACHED);
1141 }
1142 }
1143 while (oa != NULL) {
1144 /* newState does not have anything matching oa */
1145 struct arc *a = oa;
1146
1147 oa = oa->outchain;
1148 createarc(nfa, a->type, a->co, newState, a->to);
1149 freearc(nfa, a);
1150 }
1151 }
1152
1153 assert(oldState->nouts == 0);
1154 assert(oldState->outs == NULL);
1155 }
1156
1157 /*
1158 - copyouts - copy out arcs of a state to another state
1159 ^ static VOID copyouts(struct nfa *, struct state *, struct state *, int);
1160 */
1161 static void
copyouts(struct nfa * nfa,struct state * oldState,struct state * newState)1162 copyouts(
1163 struct nfa *nfa,
1164 struct state *oldState,
1165 struct state *newState)
1166 {
1167 assert(oldState != newState);
1168
1169 if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) {
1170 /* With not too many arcs, just do them one at a time */
1171 struct arc *a;
1172
1173 for (a = oldState->outs; a != NULL; a = a->outchain) {
1174 cparc(nfa, a, newState, a->to);
1175 }
1176 } else {
1177 /*
1178 * With many arcs, use a sort-merge approach. Note that createarc()
1179 * will put new arcs onto the front of newState's chain, so it does
1180 * not break our walk through the sorted part of the chain.
1181 */
1182 struct arc *oa;
1183 struct arc *na;
1184
1185 /*
1186 * Because we bypass newarc() in this code path, we'd better include a
1187 * cancel check.
1188 */
1189 if (CANCEL_REQUESTED(nfa->v->re)) {
1190 NERR(REG_CANCEL);
1191 return;
1192 }
1193
1194 sortouts(nfa, oldState);
1195 sortouts(nfa, newState);
1196 if (NISERR()) {
1197 return; /* might have failed to sort */
1198 }
1199 oa = oldState->outs;
1200 na = newState->outs;
1201 while (oa != NULL && na != NULL) {
1202 struct arc *a = oa;
1203
1204 switch (sortouts_cmp(&oa, &na)) {
1205 case -1:
1206 /* newState does not have anything matching oa */
1207 oa = oa->outchain;
1208 createarc(nfa, a->type, a->co, newState, a->to);
1209 break;
1210 case 0:
1211 /* match, advance in both lists */
1212 oa = oa->outchain;
1213 na = na->outchain;
1214 break;
1215 case +1:
1216 /* advance only na; oa might have a match later */
1217 na = na->outchain;
1218 break;
1219 default:
1220 assert(NOTREACHED);
1221 }
1222 }
1223 while (oa != NULL) {
1224 /* newState does not have anything matching oa */
1225 struct arc *a = oa;
1226
1227 oa = oa->outchain;
1228 createarc(nfa, a->type, a->co, newState, a->to);
1229 }
1230 }
1231 }
1232
1233 /*
1234 - cloneouts - copy out arcs of a state to another state pair, modifying type
1235 ^ static VOID cloneouts(struct nfa *, struct state *, struct state *,
1236 ^ struct state *, int);
1237 */
1238 static void
cloneouts(struct nfa * nfa,struct state * old,struct state * from,struct state * to,int type)1239 cloneouts(
1240 struct nfa *nfa,
1241 struct state *old,
1242 struct state *from,
1243 struct state *to,
1244 int type)
1245 {
1246 struct arc *a;
1247
1248 assert(old != from);
1249
1250 for (a=old->outs ; a!=NULL ; a=a->outchain) {
1251 newarc(nfa, type, a->co, from, to);
1252 }
1253 }
1254
1255 /*
1256 - delsub - delete a sub-NFA, updating subre pointers if necessary
1257 * This uses a recursive traversal of the sub-NFA, marking already-seen
1258 * states using their tmp pointer.
1259 ^ static VOID delsub(struct nfa *, struct state *, struct state *);
1260 */
1261 static void
delsub(struct nfa * nfa,struct state * lp,struct state * rp)1262 delsub(
1263 struct nfa *nfa,
1264 struct state *lp, /* the sub-NFA goes from here... */
1265 struct state *rp) /* ...to here, *not* inclusive */
1266 {
1267 assert(lp != rp);
1268
1269 rp->tmp = rp; /* mark end */
1270
1271 deltraverse(nfa, lp, lp);
1272 assert(lp->nouts == 0 && rp->nins == 0); /* did the job */
1273 assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
1274
1275 rp->tmp = NULL; /* unmark end */
1276 lp->tmp = NULL; /* and begin, marked by deltraverse */
1277 }
1278
1279 /*
1280 - deltraverse - the recursive heart of delsub
1281 * This routine's basic job is to destroy all out-arcs of the state.
1282 ^ static VOID deltraverse(struct nfa *, struct state *, struct state *);
1283 */
1284 static void
deltraverse(struct nfa * nfa,struct state * leftend,struct state * s)1285 deltraverse(
1286 struct nfa *nfa,
1287 struct state *leftend,
1288 struct state *s)
1289 {
1290 struct arc *a;
1291 struct state *to;
1292
1293 if (s->nouts == 0) {
1294 return; /* nothing to do */
1295 }
1296 if (s->tmp != NULL) {
1297 return; /* already in progress */
1298 }
1299
1300 s->tmp = s; /* mark as in progress */
1301
1302 while ((a = s->outs) != NULL) {
1303 to = a->to;
1304 deltraverse(nfa, leftend, to);
1305 assert(to->nouts == 0 || to->tmp != NULL);
1306 freearc(nfa, a);
1307 if (to->nins == 0 && to->tmp == NULL) {
1308 assert(to->nouts == 0);
1309 freestate(nfa, to);
1310 }
1311 }
1312
1313 assert(s->no != FREESTATE); /* we're still here */
1314 assert(s == leftend || s->nins != 0); /* and still reachable */
1315 assert(s->nouts == 0); /* but have no outarcs */
1316
1317 s->tmp = NULL; /* we're done here */
1318 }
1319
1320 /*
1321 - dupnfa - duplicate sub-NFA
1322 * Another recursive traversal, this time using tmp to point to duplicates as
1323 * well as mark already-seen states. (You knew there was a reason why it's a
1324 * state pointer, didn't you? :-))
1325 ^ static VOID dupnfa(struct nfa *, struct state *, struct state *,
1326 ^ struct state *, struct state *);
1327 */
1328 static void
dupnfa(struct nfa * nfa,struct state * start,struct state * stop,struct state * from,struct state * to)1329 dupnfa(
1330 struct nfa *nfa,
1331 struct state *start, /* duplicate of subNFA starting here */
1332 struct state *stop, /* and stopping here */
1333 struct state *from, /* stringing duplicate from here */
1334 struct state *to) /* to here */
1335 {
1336 if (start == stop) {
1337 newarc(nfa, EMPTY, 0, from, to);
1338 return;
1339 }
1340
1341 stop->tmp = to;
1342 duptraverse(nfa, start, from);
1343 /* done, except for clearing out the tmp pointers */
1344
1345 stop->tmp = NULL;
1346 cleartraverse(nfa, start);
1347 }
1348
1349 /*
1350 - duptraverse - recursive heart of dupnfa
1351 ^ static VOID duptraverse(struct nfa *, struct state *, struct state *);
1352 */
1353 static void
duptraverse(struct nfa * nfa,struct state * s,struct state * stmp)1354 duptraverse(
1355 struct nfa *nfa,
1356 struct state *s,
1357 struct state *stmp) /* s's duplicate, or NULL */
1358 {
1359 struct arc *a;
1360
1361 if (s->tmp != NULL) {
1362 return; /* already done */
1363 }
1364
1365 s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
1366 if (s->tmp == NULL) {
1367 assert(NISERR());
1368 return;
1369 }
1370
1371 for (a=s->outs ; a!=NULL && !NISERR() ; a=a->outchain) {
1372 duptraverse(nfa, a->to, NULL);
1373 if (NISERR()) {
1374 break;
1375 }
1376 assert(a->to->tmp != NULL);
1377 cparc(nfa, a, s->tmp, a->to->tmp);
1378 }
1379 }
1380
1381 /*
1382 - cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
1383 ^ static VOID cleartraverse(struct nfa *, struct state *);
1384 */
1385 static void
cleartraverse(struct nfa * nfa,struct state * s)1386 cleartraverse(
1387 struct nfa *nfa,
1388 struct state *s)
1389 {
1390 struct arc *a;
1391
1392 if (s->tmp == NULL) {
1393 return;
1394 }
1395 s->tmp = NULL;
1396
1397 for (a=s->outs ; a!=NULL ; a=a->outchain) {
1398 cleartraverse(nfa, a->to);
1399 }
1400 }
1401
1402 /*
1403 - specialcolors - fill in special colors for an NFA
1404 ^ static VOID specialcolors(struct nfa *);
1405 */
1406 static void
specialcolors(struct nfa * nfa)1407 specialcolors(
1408 struct nfa *nfa)
1409 {
1410 /*
1411 * False colors for BOS, BOL, EOS, EOL
1412 */
1413
1414 if (nfa->parent == NULL) {
1415 nfa->bos[0] = pseudocolor(nfa->cm);
1416 nfa->bos[1] = pseudocolor(nfa->cm);
1417 nfa->eos[0] = pseudocolor(nfa->cm);
1418 nfa->eos[1] = pseudocolor(nfa->cm);
1419 } else {
1420 assert(nfa->parent->bos[0] != COLORLESS);
1421 nfa->bos[0] = nfa->parent->bos[0];
1422 assert(nfa->parent->bos[1] != COLORLESS);
1423 nfa->bos[1] = nfa->parent->bos[1];
1424 assert(nfa->parent->eos[0] != COLORLESS);
1425 nfa->eos[0] = nfa->parent->eos[0];
1426 assert(nfa->parent->eos[1] != COLORLESS);
1427 nfa->eos[1] = nfa->parent->eos[1];
1428 }
1429 }
1430
1431 /*
1432 - optimize - optimize an NFA
1433 ^ static long optimize(struct nfa *, FILE *);
1434 */
1435
1436 /*
1437 * The main goal of this function is not so much "optimization" (though it
1438 * does try to get rid of useless NFA states) as reducing the NFA to a form
1439 * the regex executor can handle. The executor, and indeed the cNFA format
1440 * that is its input, can only handle PLAIN and LACON arcs. The output of
1441 * the regex parser also includes EMPTY (do-nothing) arcs, as well as
1442 * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here.
1443 * We first get rid of EMPTY arcs and then deal with the constraint arcs.
1444 * The hardest part of either job is to get rid of circular loops of the
1445 * target arc type. We would have to do that in any case, though, as such a
1446 * loop would otherwise allow the executor to cycle through the loop endlessly
1447 * without making any progress in the input string.
1448 */
1449 static long /* re_info bits */
optimize(struct nfa * nfa,FILE * f)1450 optimize(
1451 struct nfa *nfa,
1452 FILE *f) /* for debug output; NULL none */
1453 {
1454 int verbose = (f != NULL) ? 1 : 0;
1455
1456 if (verbose) {
1457 fprintf(f, "\ninitial cleanup:\n");
1458 }
1459 cleanup(nfa); /* may simplify situation */
1460 if (verbose) {
1461 dumpnfa(nfa, f);
1462 }
1463 if (verbose) {
1464 fprintf(f, "\nempties:\n");
1465 }
1466 fixempties(nfa, f); /* get rid of EMPTY arcs */
1467 if (verbose) {
1468 fprintf(f, "\nconstraints:\n");
1469 }
1470 fixconstraintloops(nfa, f); /* get rid of constraint loops */
1471 pullback(nfa, f); /* pull back constraints backward */
1472 pushfwd(nfa, f); /* push fwd constraints forward */
1473 if (verbose) {
1474 fprintf(f, "\nfinal cleanup:\n");
1475 }
1476 cleanup(nfa); /* final tidying */
1477 #ifdef REG_DEBUG
1478 if (verbose) {
1479 dumpnfa(nfa, f);
1480 }
1481 #endif
1482 return analyze(nfa); /* and analysis */
1483 }
1484
1485 /*
1486 - pullback - pull back constraints backward to eliminate them
1487 ^ static VOID pullback(struct nfa *, FILE *);
1488 */
1489 static void
pullback(struct nfa * nfa,FILE * f)1490 pullback(
1491 struct nfa *nfa,
1492 FILE *f) /* for debug output; NULL none */
1493 {
1494 struct state *s;
1495 struct state *nexts;
1496 struct arc *a;
1497 struct arc *nexta;
1498 struct state *intermediates;
1499 int progress;
1500
1501 /*
1502 * Find and pull until there are no more.
1503 */
1504
1505 do {
1506 progress = 0;
1507 for (s=nfa->states ; s!=NULL && !NISERR() ; s=nexts) {
1508 nexts = s->next;
1509 intermediates = NULL;
1510 for (a=s->outs ; a!=NULL && !NISERR() ; a=nexta) {
1511 nexta = a->outchain;
1512 if (a->type == '^' || a->type == BEHIND) {
1513 if (pull(nfa, a, &intermediates)) {
1514 progress = 1;
1515 }
1516 }
1517 assert(nexta == NULL || s->no != FREESTATE);
1518 }
1519 /* clear tmp fields of intermediate states created here */
1520 while (intermediates != NULL) {
1521 struct state *ns = intermediates->tmp;
1522
1523 intermediates->tmp = NULL;
1524 intermediates = ns;
1525 }
1526 /* if s is now useless, get rid of it */
1527 if ((s->nins == 0 || s->nouts == 0) && !s->flag) {
1528 dropstate(nfa, s);
1529 }
1530 }
1531 if (progress && f != NULL) {
1532 dumpnfa(nfa, f);
1533 }
1534 } while (progress && !NISERR());
1535 if (NISERR()) {
1536 return;
1537 }
1538
1539 /*
1540 * Any ^ constraints we were able to pull to the start state can now be
1541 * replaced by PLAIN arcs referencing the BOS or BOL colors. There should
1542 * be no other ^ or BEHIND arcs left in the NFA, though we do not check
1543 * that here (compact() will fail if so).
1544 */
1545 for (a=nfa->pre->outs ; a!=NULL ; a=nexta) {
1546 nexta = a->outchain;
1547 if (a->type == '^') {
1548 assert(a->co == 0 || a->co == 1);
1549 newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
1550 freearc(nfa, a);
1551 }
1552 }
1553 }
1554
1555 /*
1556 - pull - pull a back constraint backward past its source state
1557 *
1558 * Returns 1 if successful (which it always is unless the source is the
1559 * start state or we have an internal error), 0 if nothing happened.
1560 *
1561 * A significant property of this function is that it deletes no pre-existing
1562 * states, and no outarcs of the constraint's from state other than the given
1563 * constraint arc. This makes the loops in pullback() safe, at the cost that
1564 * we may leave useless states behind. Therefore, we leave it to pullback()
1565 * to delete such states.
1566 *
1567 * If the from state has multiple back-constraint outarcs, and/or multiple
1568 * compatible constraint inarcs, we only need to create one new intermediate
1569 * state per combination of predecessor and successor states. *intermediates
1570 * points to a list of such intermediate states for this from state (chained
1571 * through their tmp fields).
1572 ^ static int pull(struct nfa *, struct arc *);
1573 */
1574 static int
pull(struct nfa * nfa,struct arc * con,struct state ** intermediates)1575 pull(
1576 struct nfa *nfa,
1577 struct arc *con,
1578 struct state **intermediates)
1579 {
1580 struct state *from = con->from;
1581 struct state *to = con->to;
1582 struct arc *a;
1583 struct arc *nexta;
1584 struct state *s;
1585
1586 assert(from != to); /* should have gotten rid of this earlier */
1587 if (from->flag) { /* can't pull back beyond start */
1588 return 0;
1589 }
1590 if (from->nins == 0) { /* unreachable */
1591 freearc(nfa, con);
1592 return 1;
1593 }
1594
1595 /*
1596 * First, clone from state if necessary to avoid other outarcs. This may
1597 * seem wasteful, but it simplifies the logic, and we'll get rid of the
1598 * clone state again at the bottom.
1599 */
1600
1601 if (from->nouts > 1) {
1602 s = newstate(nfa);
1603 if (NISERR()) {
1604 return 0;
1605 }
1606 copyins(nfa, from, s); /* duplicate inarcs */
1607 cparc(nfa, con, s, to); /* move constraint arc */
1608 freearc(nfa, con);
1609 if (NISERR()) {
1610 return 0;
1611 }
1612 from = s;
1613 con = from->outs;
1614 }
1615 assert(from->nouts == 1);
1616
1617 /*
1618 * Propagate the constraint into the from state's inarcs.
1619 */
1620
1621 for (a=from->ins ; a!=NULL && !NISERR(); a=nexta) {
1622 nexta = a->inchain;
1623 switch (combine(con, a)) {
1624 case INCOMPATIBLE: /* destroy the arc */
1625 freearc(nfa, a);
1626 break;
1627 case SATISFIED: /* no action needed */
1628 break;
1629 case COMPATIBLE: /* swap the two arcs, more or less */
1630 /* need an intermediate state, but might have one already */
1631 for (s = *intermediates; s != NULL; s = s->tmp) {
1632 assert(s->nins > 0 && s->nouts > 0);
1633 if (s->ins->from == a->from && s->outs->to == to) {
1634 break;
1635 }
1636 }
1637 if (s == NULL) {
1638 s = newstate(nfa);
1639 if (NISERR()) {
1640 return 0;
1641 }
1642 s->tmp = *intermediates;
1643 *intermediates = s;
1644 }
1645 cparc(nfa, con, a->from, s);
1646 cparc(nfa, a, s, to);
1647 freearc(nfa, a);
1648 break;
1649 default:
1650 assert(NOTREACHED);
1651 break;
1652 }
1653 }
1654
1655 /*
1656 * Remaining inarcs, if any, incorporate the constraint.
1657 */
1658
1659 moveins(nfa, from, to);
1660 freearc(nfa, con);
1661 /* from state is now useless, but we leave it to pullback() to clean up */
1662 return 1;
1663 }
1664
1665 /*
1666 - pushfwd - push forward constraints forward to eliminate them
1667 ^ static VOID pushfwd(struct nfa *, FILE *);
1668 */
1669 static void
pushfwd(struct nfa * nfa,FILE * f)1670 pushfwd(
1671 struct nfa *nfa,
1672 FILE *f) /* for debug output; NULL none */
1673 {
1674 struct state *s;
1675 struct state *nexts;
1676 struct arc *a;
1677 struct arc *nexta;
1678 struct state *intermediates;
1679 int progress;
1680
1681 /*
1682 * Find and push until there are no more.
1683 */
1684
1685 do {
1686 progress = 0;
1687 for (s=nfa->states ; s!=NULL && !NISERR() ; s=nexts) {
1688 nexts = s->next;
1689 intermediates = NULL;
1690 for (a = s->ins; a != NULL && !NISERR(); a = nexta) {
1691 nexta = a->inchain;
1692 if (a->type == '$' || a->type == AHEAD) {
1693 if (push(nfa, a, &intermediates)) {
1694 progress = 1;
1695 }
1696 }
1697 }
1698 /* clear tmp fields of intermediate states created here */
1699 while (intermediates != NULL) {
1700 struct state *ns = intermediates->tmp;
1701
1702 intermediates->tmp = NULL;
1703 intermediates = ns;
1704 }
1705 /* if s is now useless, get rid of it */
1706 if ((s->nins == 0 || s->nouts == 0) && !s->flag) {
1707 dropstate(nfa, s);
1708 }
1709 }
1710 if (progress && f != NULL) {
1711 dumpnfa(nfa, f);
1712 }
1713 } while (progress && !NISERR());
1714 if (NISERR()) {
1715 return;
1716 }
1717
1718 /*
1719 * Any $ constraints we were able to push to the post state can now be
1720 * replaced by PLAIN arcs referencing the EOS or EOL colors. There should
1721 * be no other $ or AHEAD arcs left in the NFA, though we do not check
1722 * that here (compact() will fail if so).
1723 */
1724 for (a = nfa->post->ins; a != NULL; a = nexta) {
1725 nexta = a->inchain;
1726 if (a->type == '$') {
1727 assert(a->co == 0 || a->co == 1);
1728 newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
1729 freearc(nfa, a);
1730 }
1731 }
1732 }
1733
1734 /*
1735 - push - push a forward constraint forward past its destination state
1736 *
1737 * Returns 1 if successful (which it always is unless the destination is the
1738 * post state or we have an internal error), 0 if nothing happened.
1739 *
1740 * A significant property of this function is that it deletes no pre-existing
1741 * states, and no inarcs of the constraint's to state other than the given
1742 * constraint arc. This makes the loops in pushfwd() safe, at the cost that
1743 * we may leave useless states behind. Therefore, we leave it to pushfwd()
1744 * to delete such states.
1745 *
1746 * If the to state has multiple forward-constraint inarcs, and/or multiple
1747 * compatible constraint outarcs, we only need to create one new intermediate
1748 * state per combination of predecessor and successor states. *intermediates
1749 * points to a list of such intermediate states for this to state (chained
1750 * through their tmp fields).
1751 ^ static int push(struct nfa *, struct arc *);
1752 */
1753 static int
push(struct nfa * nfa,struct arc * con,struct state ** intermediates)1754 push(
1755 struct nfa *nfa,
1756 struct arc *con,
1757 struct state **intermediates)
1758 {
1759 struct state *from = con->from;
1760 struct state *to = con->to;
1761 struct arc *a;
1762 struct arc *nexta;
1763 struct state *s;
1764
1765 assert(to != from); /* should have gotten rid of this earlier */
1766 if (to->flag) { /* can't push forward beyond end */
1767 return 0;
1768 }
1769 if (to->nouts == 0) { /* dead end */
1770 freearc(nfa, con);
1771 return 1;
1772 }
1773
1774 /*
1775 * First, clone to state if necessary to avoid other inarcs. This may
1776 * seem wasteful, but it simplifies the logic, and we'll get rid of the
1777 * clone state again at the bottom.
1778 */
1779
1780 if (to->nins > 1) {
1781 s = newstate(nfa);
1782 if (NISERR()) {
1783 return 0;
1784 }
1785 copyouts(nfa, to, s); /* duplicate outarcs */
1786 cparc(nfa, con, from, s); /* move constraint arc */
1787 freearc(nfa, con);
1788 if (NISERR()) {
1789 return 0;
1790 }
1791 to = s;
1792 con = to->ins;
1793 }
1794 assert(to->nins == 1);
1795
1796 /*
1797 * Propagate the constraint into the to state's outarcs.
1798 */
1799
1800 for (a = to->outs; a != NULL && !NISERR(); a = nexta) {
1801 nexta = a->outchain;
1802 switch (combine(con, a)) {
1803 case INCOMPATIBLE: /* destroy the arc */
1804 freearc(nfa, a);
1805 break;
1806 case SATISFIED: /* no action needed */
1807 break;
1808 case COMPATIBLE: /* swap the two arcs, more or less */
1809 /* need an intermediate state, but might have one already */
1810 for (s = *intermediates; s != NULL; s = s->tmp) {
1811 assert(s->nins > 0 && s->nouts > 0);
1812 if (s->ins->from == from && s->outs->to == a->to) {
1813 break;
1814 }
1815 }
1816 if (s == NULL) {
1817 s = newstate(nfa);
1818 if (NISERR()) {
1819 return 0;
1820 }
1821 s->tmp = *intermediates;
1822 *intermediates = s;
1823 }
1824 cparc(nfa, con, s, a->to);
1825 cparc(nfa, a, from, s);
1826 freearc(nfa, a);
1827 break;
1828 default:
1829 assert(NOTREACHED);
1830 break;
1831 }
1832 }
1833
1834 /*
1835 * Remaining outarcs, if any, incorporate the constraint.
1836 */
1837
1838 moveouts(nfa, to, from);
1839 freearc(nfa, con);
1840 /* to state is now useless, but we leave it to pushfwd() to clean up */
1841 return 1;
1842 }
1843
1844 /*
1845 - combine - constraint lands on an arc, what happens?
1846 ^ #def INCOMPATIBLE 1 // destroys arc
1847 ^ #def SATISFIED 2 // constraint satisfied
1848 ^ #def COMPATIBLE 3 // compatible but not satisfied yet
1849 ^ static int combine(struct arc *, struct arc *);
1850 */
1851 static int
combine(struct arc * con,struct arc * a)1852 combine(
1853 struct arc *con,
1854 struct arc *a)
1855 {
1856 #define CA(ct,at) (((ct)<<CHAR_BIT) | (at))
1857
1858 switch (CA(con->type, a->type)) {
1859 case CA('^', PLAIN): /* newlines are handled separately */
1860 case CA('$', PLAIN):
1861 return INCOMPATIBLE;
1862 break;
1863 case CA(AHEAD, PLAIN): /* color constraints meet colors */
1864 case CA(BEHIND, PLAIN):
1865 if (con->co == a->co) {
1866 return SATISFIED;
1867 }
1868 return INCOMPATIBLE;
1869 break;
1870 case CA('^', '^'): /* collision, similar constraints */
1871 case CA('$', '$'):
1872 case CA(AHEAD, AHEAD):
1873 case CA(BEHIND, BEHIND):
1874 if (con->co == a->co) { /* true duplication */
1875 return SATISFIED;
1876 }
1877 return INCOMPATIBLE;
1878 break;
1879 case CA('^', BEHIND): /* collision, dissimilar constraints */
1880 case CA(BEHIND, '^'):
1881 case CA('$', AHEAD):
1882 case CA(AHEAD, '$'):
1883 return INCOMPATIBLE;
1884 break;
1885 case CA('^', '$'): /* constraints passing each other */
1886 case CA('^', AHEAD):
1887 case CA(BEHIND, '$'):
1888 case CA(BEHIND, AHEAD):
1889 case CA('$', '^'):
1890 case CA('$', BEHIND):
1891 case CA(AHEAD, '^'):
1892 case CA(AHEAD, BEHIND):
1893 case CA('^', LACON):
1894 case CA(BEHIND, LACON):
1895 case CA('$', LACON):
1896 case CA(AHEAD, LACON):
1897 return COMPATIBLE;
1898 break;
1899 }
1900 assert(NOTREACHED);
1901 return INCOMPATIBLE; /* for benefit of blind compilers */
1902 }
1903
1904 /*
1905 - fixempties - get rid of EMPTY arcs
1906 ^ static VOID fixempties(struct nfa *, FILE *);
1907 */
1908 static void
fixempties(struct nfa * nfa,FILE * f)1909 fixempties(
1910 struct nfa *nfa,
1911 FILE *f) /* for debug output; NULL none */
1912 {
1913 struct state *s;
1914 struct state *s2;
1915 struct state *nexts;
1916 struct arc *a;
1917 struct arc *nexta;
1918 int totalinarcs;
1919 struct arc **inarcsorig;
1920 struct arc **arcarray;
1921 int arccount;
1922 int prevnins;
1923 int nskip;
1924
1925 /*
1926 * First, get rid of any states whose sole out-arc is an EMPTY,
1927 * since they're basically just aliases for their successor. The
1928 * parsing algorithm creates enough of these that it's worth
1929 * special-casing this.
1930 */
1931 for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
1932 nexts = s->next;
1933 if (s->flag || s->nouts != 1) {
1934 continue;
1935 }
1936 a = s->outs;
1937 assert(a != NULL && a->outchain == NULL);
1938 if (a->type != EMPTY) {
1939 continue;
1940 }
1941 if (s != a->to) {
1942 moveins(nfa, s, a->to);
1943 }
1944 dropstate(nfa, s);
1945 }
1946
1947 /*
1948 * Similarly, get rid of any state with a single EMPTY in-arc, by
1949 * folding it into its predecessor.
1950 */
1951 for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
1952 nexts = s->next;
1953 /* Ensure tmp fields are clear for next step */
1954 assert(s->tmp == NULL);
1955 if (s->flag || s->nins != 1) {
1956 continue;
1957 }
1958 a = s->ins;
1959 assert(a != NULL && a->inchain == NULL);
1960 if (a->type != EMPTY) {
1961 continue;
1962 }
1963 if (s != a->from) {
1964 moveouts(nfa, s, a->from);
1965 }
1966 dropstate(nfa, s);
1967 }
1968
1969 if (NISERR()) {
1970 return;
1971 }
1972
1973 /*
1974 * For each remaining NFA state, find all other states from which it is
1975 * reachable by a chain of one or more EMPTY arcs. Then generate new arcs
1976 * that eliminate the need for each such chain.
1977 *
1978 * We could replace a chain of EMPTY arcs that leads from a "from" state
1979 * to a "to" state either by pushing non-EMPTY arcs forward (linking
1980 * directly from "from"'s predecessors to "to") or by pulling them back
1981 * (linking directly from "from" to "to"'s successors). We choose to
1982 * always do the former; this choice is somewhat arbitrary, but the
1983 * approach below requires that we uniformly do one or the other.
1984 *
1985 * Suppose we have a chain of N successive EMPTY arcs (where N can easily
1986 * approach the size of the NFA). All of the intermediate states must
1987 * have additional inarcs and outarcs, else they'd have been removed by
1988 * the steps above. Assuming their inarcs are mostly not empties, we will
1989 * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
1990 * state in the chain must be duplicated to lead to all its successor
1991 * states as well. So there is no hope of doing less than O(N^2) work;
1992 * however, we should endeavor to keep the big-O cost from being even
1993 * worse than that, which it can easily become without care. In
1994 * particular, suppose we were to copy all S1's inarcs forward to S2, and
1995 * then also to S3, and then later we consider pushing S2's inarcs forward
1996 * to S3. If we include the arcs already copied from S1 in that, we'd be
1997 * doing O(N^3) work. (The duplicate-arc elimination built into newarc()
1998 * and its cohorts would get rid of the extra arcs, but not without cost.)
1999 *
2000 * We can avoid this cost by treating only arcs that existed at the start
2001 * of this phase as candidates to be pushed forward. To identify those,
2002 * we remember the first inarc each state had to start with. We rely on
2003 * the fact that newarc() and friends put new arcs on the front of their
2004 * to-states' inchains, and that this phase never deletes arcs, so that
2005 * the original arcs must be the last arcs in their to-states' inchains.
2006 *
2007 * So the process here is that, for each state in the NFA, we gather up
2008 * all non-EMPTY inarcs of states that can reach the target state via
2009 * EMPTY arcs. We then sort, de-duplicate, and merge these arcs into the
2010 * target state's inchain. (We can safely use sort-merge for this as long
2011 * as we update each state's original-arcs pointer after we add arcs to
2012 * it; the sort step of mergeins probably changed the order of the old
2013 * arcs.)
2014 *
2015 * Another refinement worth making is that, because we only add non-EMPTY
2016 * arcs during this phase, and all added arcs have the same from-state as
2017 * the non-EMPTY arc they were cloned from, we know ahead of time that any
2018 * states having only EMPTY outarcs will be useless for lack of outarcs
2019 * after we drop the EMPTY arcs. (They cannot gain non-EMPTY outarcs if
2020 * they had none to start with.) So we need not bother to update the
2021 * inchains of such states at all.
2022 */
2023
2024 /* Remember the states' first original inarcs */
2025 /* ... and while at it, count how many old inarcs there are altogether */
2026 inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
2027 if (inarcsorig == NULL) {
2028 NERR(REG_ESPACE);
2029 return;
2030 }
2031 totalinarcs = 0;
2032 for (s = nfa->states; s != NULL; s = s->next) {
2033 inarcsorig[s->no] = s->ins;
2034 totalinarcs += s->nins;
2035 }
2036
2037 /*
2038 * Create a workspace for accumulating the inarcs to be added to the
2039 * current target state. totalinarcs is probably a considerable
2040 * overestimate of the space needed, but the NFA is unlikely to be large
2041 * enough at this point to make it worth being smarter.
2042 */
2043 arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
2044 if (arcarray == NULL) {
2045 NERR(REG_ESPACE);
2046 FREE(inarcsorig);
2047 return;
2048 }
2049
2050 /* And iterate over the target states */
2051 for (s = nfa->states; s != NULL && !NISERR(); s = s->next) {
2052 /* Ignore target states without non-EMPTY outarcs, per note above */
2053 if (!s->flag && !hasnonemptyout(s)) {
2054 continue;
2055 }
2056
2057 /* Find predecessor states and accumulate their original inarcs */
2058 arccount = 0;
2059 for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts) {
2060 /* Add s2's original inarcs to arcarray[], but ignore empties */
2061 for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain) {
2062 if (a->type != EMPTY) {
2063 arcarray[arccount++] = a;
2064 }
2065 }
2066
2067 /* Reset the tmp fields as we walk back */
2068 nexts = s2->tmp;
2069 s2->tmp = NULL;
2070 }
2071 s->tmp = NULL;
2072 assert(arccount <= totalinarcs);
2073
2074 /* Remember how many original inarcs this state has */
2075 prevnins = s->nins;
2076
2077 /* Add non-duplicate inarcs to target state */
2078 mergeins(nfa, s, arcarray, arccount);
2079
2080 /* Now we must update the state's inarcsorig pointer */
2081 nskip = s->nins - prevnins;
2082 a = s->ins;
2083 while (nskip-- > 0) {
2084 a = a->inchain;
2085 }
2086 inarcsorig[s->no] = a;
2087 }
2088
2089 FREE(arcarray);
2090 FREE(inarcsorig);
2091
2092 if (NISERR()) {
2093 return;
2094 }
2095
2096 /*
2097 * Remove all the EMPTY arcs, since we don't need them anymore.
2098 */
2099 for (s = nfa->states; s != NULL; s = s->next) {
2100 for (a = s->outs; a != NULL; a = nexta) {
2101 nexta = a->outchain;
2102 if (a->type == EMPTY) {
2103 freearc(nfa, a);
2104 }
2105 }
2106 }
2107
2108 /*
2109 * And remove any states that have become useless. (This cleanup is
2110 * not very thorough, and would be even less so if we tried to
2111 * combine it with the previous step; but cleanup() will take care
2112 * of anything we miss.)
2113 */
2114 for (s = nfa->states; s != NULL; s = nexts) {
2115 nexts = s->next;
2116 if ((s->nins == 0 || s->nouts == 0) && !s->flag) {
2117 dropstate(nfa, s);
2118 }
2119 }
2120
2121 if (f != NULL) {
2122 dumpnfa(nfa, f);
2123 }
2124 }
2125
2126 /*
2127 - emptyreachable - recursively find all states that can reach s by EMPTY arcs
2128 * The return value is the last such state found. Its tmp field links back
2129 * to the next-to-last such state, and so on back to s, so that all these
2130 * states can be located without searching the whole NFA.
2131 *
2132 * Since this is only used in fixempties(), we pass in the inarcsorig[] array
2133 * maintained by that function. This lets us skip over all new inarcs, which
2134 * are certainly not EMPTY arcs.
2135 *
2136 * The maximum recursion depth here is equal to the length of the longest
2137 * loop-free chain of EMPTY arcs, which is surely no more than the size of
2138 * the NFA, and in practice will be less than that.
2139 ^ static struct state *emptyreachable(struct state *, struct state *);
2140 */
2141 static struct state *
emptyreachable(struct nfa * nfa,struct state * s,struct state * lastfound,struct arc ** inarcsorig)2142 emptyreachable(
2143 struct nfa *nfa,
2144 struct state *s,
2145 struct state *lastfound,
2146 struct arc **inarcsorig)
2147 {
2148 struct arc *a;
2149
2150 s->tmp = lastfound;
2151 lastfound = s;
2152 for (a = inarcsorig[s->no]; a != NULL; a = a->inchain) {
2153 if (a->type == EMPTY && a->from->tmp == NULL) {
2154 lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig);
2155 }
2156 }
2157 return lastfound;
2158 }
2159
2160 /*
2161 * isconstraintarc - detect whether an arc is of a constraint type
2162 */
2163 static inline int
isconstraintarc(struct arc * a)2164 isconstraintarc(struct arc * a)
2165 {
2166 switch (a->type)
2167 {
2168 case '^':
2169 case '$':
2170 case BEHIND:
2171 case AHEAD:
2172 case LACON:
2173 return 1;
2174 }
2175 return 0;
2176 }
2177
2178 /*
2179 * hasconstraintout - does state have a constraint out arc?
2180 */
2181 static int
hasconstraintout(struct state * s)2182 hasconstraintout(struct state * s)
2183 {
2184 struct arc *a;
2185
2186 for (a = s->outs; a != NULL; a = a->outchain) {
2187 if (isconstraintarc(a)) {
2188 return 1;
2189 }
2190 }
2191 return 0;
2192 }
2193
2194 /*
2195 * fixconstraintloops - get rid of loops containing only constraint arcs
2196 *
2197 * A loop of states that contains only constraint arcs is useless, since
2198 * passing around the loop represents no forward progress. Moreover, it
2199 * would cause infinite looping in pullback/pushfwd, so we need to get rid
2200 * of such loops before doing that.
2201 */
2202 static void
fixconstraintloops(struct nfa * nfa,FILE * f)2203 fixconstraintloops(
2204 struct nfa * nfa,
2205 FILE *f) /* for debug output; NULL none */
2206 {
2207 struct state *s;
2208 struct state *nexts;
2209 struct arc *a;
2210 struct arc *nexta;
2211 int hasconstraints;
2212
2213 /*
2214 * In the trivial case of a state that loops to itself, we can just drop
2215 * the constraint arc altogether. This is worth special-casing because
2216 * such loops are far more common than loops containing multiple states.
2217 * While we're at it, note whether any constraint arcs survive.
2218 */
2219 hasconstraints = 0;
2220 for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
2221 nexts = s->next;
2222 /* while we're at it, ensure tmp fields are clear for next step */
2223 assert(s->tmp == NULL);
2224 for (a = s->outs; a != NULL && !NISERR(); a = nexta) {
2225 nexta = a->outchain;
2226 if (isconstraintarc(a)) {
2227 if (a->to == s) {
2228 freearc(nfa, a);
2229 } else {
2230 hasconstraints = 1;
2231 }
2232 }
2233 }
2234 /* If we removed all the outarcs, the state is useless. */
2235 if (s->nouts == 0 && !s->flag) {
2236 dropstate(nfa, s);
2237 }
2238 }
2239
2240 /* Nothing to do if no remaining constraint arcs */
2241 if (NISERR() || !hasconstraints) {
2242 return;
2243 }
2244
2245 /*
2246 * Starting from each remaining NFA state, search outwards for a
2247 * constraint loop. If we find a loop, break the loop, then start the
2248 * search over. (We could possibly retain some state from the first scan,
2249 * but it would complicate things greatly, and multi-state constraint
2250 * loops are rare enough that it's not worth optimizing the case.)
2251 */
2252 restart:
2253 for (s = nfa->states; s != NULL && !NISERR(); s = s->next) {
2254 if (findconstraintloop(nfa, s)) {
2255 goto restart;
2256 }
2257 }
2258
2259 if (NISERR()) {
2260 return;
2261 }
2262
2263 /*
2264 * Now remove any states that have become useless. (This cleanup is not
2265 * very thorough, and would be even less so if we tried to combine it with
2266 * the previous step; but cleanup() will take care of anything we miss.)
2267 *
2268 * Because findconstraintloop intentionally doesn't reset all tmp fields,
2269 * we have to clear them after it's done. This is a convenient place to
2270 * do that, too.
2271 */
2272 for (s = nfa->states; s != NULL; s = nexts) {
2273 nexts = s->next;
2274 s->tmp = NULL;
2275 if ((s->nins == 0 || s->nouts == 0) && !s->flag) {
2276 dropstate(nfa, s);
2277 }
2278 }
2279
2280 if (f != NULL) {
2281 dumpnfa(nfa, f);
2282 }
2283 }
2284
2285 /*
2286 * findconstraintloop - recursively find a loop of constraint arcs
2287 *
2288 * If we find a loop, break it by calling breakconstraintloop(), then
2289 * return 1; otherwise return 0.
2290 *
2291 * State tmp fields are guaranteed all NULL on a success return, because
2292 * breakconstraintloop does that. After a failure return, any state that
2293 * is known not to be part of a loop is marked with s->tmp == s; this allows
2294 * us not to have to re-prove that fact on later calls. (This convention is
2295 * workable because we already eliminated single-state loops.)
2296 *
2297 * Note that the found loop doesn't necessarily include the first state we
2298 * are called on. Any loop reachable from that state will do.
2299 *
2300 * The maximum recursion depth here is one more than the length of the longest
2301 * loop-free chain of constraint arcs, which is surely no more than the size
2302 * of the NFA, and in practice will be a lot less than that.
2303 */
2304 static int
findconstraintloop(struct nfa * nfa,struct state * s)2305 findconstraintloop(struct nfa * nfa, struct state * s)
2306 {
2307 struct arc *a;
2308
2309 /* Since this is recursive, it could be driven to stack overflow */
2310 if (STACK_TOO_DEEP(nfa->v->re)) {
2311 NERR(REG_ETOOBIG);
2312 return 1; /* to exit as quickly as possible */
2313 }
2314
2315 if (s->tmp != NULL) {
2316 /* Already proven uninteresting? */
2317 if (s->tmp == s) {
2318 return 0;
2319 }
2320 /* Found a loop involving s */
2321 breakconstraintloop(nfa, s);
2322 /* The tmp fields have been cleaned up by breakconstraintloop */
2323 return 1;
2324 }
2325 for (a = s->outs; a != NULL; a = a->outchain) {
2326 if (isconstraintarc(a)) {
2327 struct state *sto = a->to;
2328
2329 assert(sto != s);
2330 s->tmp = sto;
2331 if (findconstraintloop(nfa, sto)) {
2332 return 1;
2333 }
2334 }
2335 }
2336
2337 /*
2338 * If we get here, no constraint loop exists leading out from s. Mark it
2339 * with s->tmp == s so we need not rediscover that fact again later.
2340 */
2341 s->tmp = s;
2342 return 0;
2343 }
2344
2345 /*
2346 * breakconstraintloop - break a loop of constraint arcs
2347 *
2348 * sinitial is any one member state of the loop. Each loop member's tmp
2349 * field links to its successor within the loop. (Note that this function
2350 * will reset all the tmp fields to NULL.)
2351 *
2352 * We can break the loop by, for any one state S1 in the loop, cloning its
2353 * loop successor state S2 (and possibly following states), and then moving
2354 * all S1->S2 constraint arcs to point to the cloned S2. The cloned S2 should
2355 * copy any non-constraint outarcs of S2. Constraint outarcs should be
2356 * dropped if they point back to S1, else they need to be copied as arcs to
2357 * similarly cloned states S3, S4, etc. In general, each cloned state copies
2358 * non-constraint outarcs, drops constraint outarcs that would lead to itself
2359 * or any earlier cloned state, and sends other constraint outarcs to newly
2360 * cloned states. No cloned state will have any inarcs that aren't constraint
2361 * arcs or do not lead from S1 or earlier-cloned states. It's okay to drop
2362 * constraint back-arcs since they would not take us to any state we've not
2363 * already been in; therefore, no new constraint loop is created. In this way
2364 * we generate a modified NFA that can still represent every useful state
2365 * sequence, but not sequences that represent state loops with no consumption
2366 * of input data. Note that the set of cloned states will certainly include
2367 * all of the loop member states other than S1, and it may also include
2368 * non-loop states that are reachable from S2 via constraint arcs. This is
2369 * important because there is no guarantee that findconstraintloop found a
2370 * maximal loop (and searching for one would be NP-hard, so don't try).
2371 * Frequently the "non-loop states" are actually part of a larger loop that
2372 * we didn't notice, and indeed there may be several overlapping loops.
2373 * This technique ensures convergence in such cases, while considering only
2374 * the originally-found loop does not.
2375 *
2376 * If there is only one S1->S2 constraint arc, then that constraint is
2377 * certainly satisfied when we enter any of the clone states. This means that
2378 * in the common case where many of the constraint arcs are identically
2379 * labeled, we can merge together clone states linked by a similarly-labeled
2380 * constraint: if we can get to the first one we can certainly get to the
2381 * second, so there's no need to distinguish. This greatly reduces the number
2382 * of new states needed, so we preferentially break the given loop at a state
2383 * pair where this is true.
2384 *
2385 * Furthermore, it's fairly common to find that a cloned successor state has
2386 * no outarcs, especially if we're a bit aggressive about removing unnecessary
2387 * outarcs. If that happens, then there is simply not any interesting state
2388 * that can be reached through the predecessor's loop arcs, which means we can
2389 * break the loop just by removing those loop arcs, with no new states added.
2390 */
2391 static void
breakconstraintloop(struct nfa * nfa,struct state * sinitial)2392 breakconstraintloop(struct nfa * nfa, struct state * sinitial)
2393 {
2394 struct state *s;
2395 struct state *shead;
2396 struct state *stail;
2397 struct state *sclone;
2398 struct state *nexts;
2399 struct arc *refarc;
2400 struct arc *a;
2401 struct arc *nexta;
2402
2403 /*
2404 * Start by identifying which loop step we want to break at.
2405 * Preferentially this is one with only one constraint arc. (XXX are
2406 * there any other secondary heuristics we want to use here?) Set refarc
2407 * to point to the selected lone constraint arc, if there is one.
2408 */
2409 refarc = NULL;
2410 s = sinitial;
2411 do {
2412 nexts = s->tmp;
2413 assert(nexts != s); /* should not see any one-element loops */
2414 if (refarc == NULL) {
2415 int narcs = 0;
2416
2417 for (a = s->outs; a != NULL; a = a->outchain) {
2418 if (a->to == nexts && isconstraintarc(a)) {
2419 refarc = a;
2420 narcs++;
2421 }
2422 }
2423 assert(narcs > 0);
2424 if (narcs > 1) {
2425 refarc = NULL; /* multiple constraint arcs here, no good */
2426 }
2427 }
2428 s = nexts;
2429 } while (s != sinitial);
2430
2431 if (refarc) {
2432 /* break at the refarc */
2433 shead = refarc->from;
2434 stail = refarc->to;
2435 assert(stail == shead->tmp);
2436 } else {
2437 /* for lack of a better idea, break after sinitial */
2438 shead = sinitial;
2439 stail = sinitial->tmp;
2440 }
2441
2442 /*
2443 * Reset the tmp fields so that we can use them for local storage in
2444 * clonesuccessorstates. (findconstraintloop won't mind, since it's just
2445 * going to abandon its search anyway.)
2446 */
2447 for (s = nfa->states; s != NULL; s = s->next) {
2448 s->tmp = NULL;
2449 }
2450
2451 /*
2452 * Recursively build clone state(s) as needed.
2453 */
2454 sclone = newstate(nfa);
2455 if (sclone == NULL) {
2456 assert(NISERR());
2457 return;
2458 }
2459
2460 clonesuccessorstates(nfa, stail, sclone, shead, refarc,
2461 NULL, NULL, nfa->nstates);
2462
2463 if (NISERR()) {
2464 return;
2465 }
2466
2467 /*
2468 * It's possible that sclone has no outarcs at all, in which case it's
2469 * useless. (We don't try extremely hard to get rid of useless states
2470 * here, but this is an easy and fairly common case.)
2471 */
2472 if (sclone->nouts == 0) {
2473 freestate(nfa, sclone);
2474 sclone = NULL;
2475 }
2476
2477 /*
2478 * Move shead's constraint-loop arcs to point to sclone, or just drop them
2479 * if we discovered we don't need sclone.
2480 */
2481 for (a = shead->outs; a != NULL; a = nexta) {
2482 nexta = a->outchain;
2483 if (a->to == stail && isconstraintarc(a)) {
2484 if (sclone) {
2485 cparc(nfa, a, shead, sclone);
2486 }
2487 freearc(nfa, a);
2488 if (NISERR()) {
2489 break;
2490 }
2491 }
2492 }
2493 }
2494
2495 /*
2496 * clonesuccessorstates - create a tree of constraint-arc successor states
2497 *
2498 * ssource is the state to be cloned, and sclone is the state to copy its
2499 * outarcs into. sclone's inarcs, if any, should already be set up.
2500 *
2501 * spredecessor is the original predecessor state that we are trying to build
2502 * successors for (it may not be the immediate predecessor of ssource).
2503 * refarc, if not NULL, is the original constraint arc that is known to have
2504 * been traversed out of spredecessor to reach the successor(s).
2505 *
2506 * For each cloned successor state, we transiently create a "donemap" that is
2507 * a boolean array showing which source states we've already visited for this
2508 * clone state. This prevents infinite recursion as well as useless repeat
2509 * visits to the same state subtree (which can add up fast, since typical NFAs
2510 * have multiple redundant arc pathways). Each donemap is a char array
2511 * indexed by state number. The donemaps are all of the same size "nstates",
2512 * which is nfa->nstates as of the start of the recursion. This is enough to
2513 * have entries for all pre-existing states, but *not* entries for clone
2514 * states created during the recursion. That's okay since we have no need to
2515 * mark those.
2516 *
2517 * curdonemap is NULL when recursing to a new sclone state, or sclone's
2518 * donemap when we are recursing without having created a new state (which we
2519 * do when we decide we can merge a successor state into the current clone
2520 * state). outerdonemap is NULL at the top level and otherwise the parent
2521 * clone state's donemap.
2522 *
2523 * The successor states we create and fill here form a strict tree structure,
2524 * with each state having exactly one predecessor, except that the toplevel
2525 * state has no inarcs as yet (breakconstraintloop will add its inarcs from
2526 * spredecessor after we're done). Thus, we can examine sclone's inarcs back
2527 * to the root, plus refarc if any, to identify the set of constraints already
2528 * known valid at the current point. This allows us to avoid generating extra
2529 * successor states.
2530 */
2531 static void
clonesuccessorstates(struct nfa * nfa,struct state * ssource,struct state * sclone,struct state * spredecessor,struct arc * refarc,char * curdonemap,char * outerdonemap,int nstates)2532 clonesuccessorstates(
2533 struct nfa * nfa,
2534 struct state * ssource,
2535 struct state * sclone,
2536 struct state * spredecessor,
2537 struct arc * refarc,
2538 char *curdonemap,
2539 char *outerdonemap,
2540 int nstates)
2541 {
2542 char *donemap;
2543 struct arc *a;
2544
2545 /* Since this is recursive, it could be driven to stack overflow */
2546 if (STACK_TOO_DEEP(nfa->v->re)) {
2547 NERR(REG_ETOOBIG);
2548 return;
2549 }
2550
2551 /* If this state hasn't already got a donemap, create one */
2552 donemap = curdonemap;
2553 if (donemap == NULL) {
2554 donemap = (char *) MALLOC(nstates * sizeof(char));
2555 if (donemap == NULL) {
2556 NERR(REG_ESPACE);
2557 return;
2558 }
2559
2560 if (outerdonemap != NULL) {
2561 /*
2562 * Not at outermost recursion level, so copy the outer level's
2563 * donemap; this ensures that we see states in process of being
2564 * visited at outer levels, or already merged into predecessor
2565 * states, as ones we shouldn't traverse back to.
2566 */
2567 memcpy(donemap, outerdonemap, nstates * sizeof(char));
2568 } else {
2569 /* At outermost level, only spredecessor is off-limits */
2570 memset(donemap, 0, nstates * sizeof(char));
2571 assert(spredecessor->no < nstates);
2572 donemap[spredecessor->no] = 1;
2573 }
2574 }
2575
2576 /* Mark ssource as visited in the donemap */
2577 assert(ssource->no < nstates);
2578 assert(donemap[ssource->no] == 0);
2579 donemap[ssource->no] = 1;
2580
2581 /*
2582 * We proceed by first cloning all of ssource's outarcs, creating new
2583 * clone states as needed but not doing more with them than that. Then in
2584 * a second pass, recurse to process the child clone states. This allows
2585 * us to have only one child clone state per reachable source state, even
2586 * when there are multiple outarcs leading to the same state. Also, when
2587 * we do visit a child state, its set of inarcs is known exactly, which
2588 * makes it safe to apply the constraint-is-already-checked optimization.
2589 * Also, this ensures that we've merged all the states we can into the
2590 * current clone before we recurse to any children, thus possibly saving
2591 * them from making extra images of those states.
2592 *
2593 * While this function runs, child clone states of the current state are
2594 * marked by setting their tmp fields to point to the original state they
2595 * were cloned from. This makes it possible to detect multiple outarcs
2596 * leading to the same state, and also makes it easy to distinguish clone
2597 * states from original states (which will have tmp == NULL).
2598 */
2599 for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain) {
2600 struct state *sto = a->to;
2601
2602 /*
2603 * We do not consider cloning successor states that have no constraint
2604 * outarcs; just link to them as-is. They cannot be part of a
2605 * constraint loop so there is no need to make copies. In particular,
2606 * this rule keeps us from trying to clone the post state, which would
2607 * be a bad idea.
2608 */
2609 if (isconstraintarc(a) && hasconstraintout(sto)) {
2610 struct state *prevclone;
2611 int canmerge;
2612 struct arc *a2;
2613
2614 /*
2615 * Back-link constraint arcs must not be followed. Nor is there a
2616 * need to revisit states previously merged into this clone.
2617 */
2618 assert(sto->no < nstates);
2619 if (donemap[sto->no] != 0) {
2620 continue;
2621 }
2622
2623 /*
2624 * Check whether we already have a child clone state for this
2625 * source state.
2626 */
2627 prevclone = NULL;
2628 for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain) {
2629 if (a2->to->tmp == sto) {
2630 prevclone = a2->to;
2631 break;
2632 }
2633 }
2634
2635 /*
2636 * If this arc is labeled the same as refarc, or the same as any
2637 * arc we must have traversed to get to sclone, then no additional
2638 * constraints need to be met to get to sto, so we should just
2639 * merge its outarcs into sclone.
2640 */
2641 if (refarc && a->type == refarc->type && a->co == refarc->co) {
2642 canmerge = 1;
2643 } else {
2644 struct state *s;
2645
2646 canmerge = 0;
2647 for (s = sclone; s->ins; s = s->ins->from) {
2648 if (s->nins == 1 &&
2649 a->type == s->ins->type && a->co == s->ins->co) {
2650 canmerge = 1;
2651 break;
2652 }
2653 }
2654 }
2655
2656 if (canmerge) {
2657 /*
2658 * We can merge into sclone. If we previously made a child
2659 * clone state, drop it; there's no need to visit it. (This
2660 * can happen if ssource has multiple pathways to sto, and we
2661 * only just now found one that is provably a no-op.)
2662 */
2663 if (prevclone) {
2664 dropstate(nfa, prevclone); /* kills our outarc, too */
2665 }
2666
2667 /* Recurse to merge sto's outarcs into sclone */
2668 clonesuccessorstates(nfa, sto, sclone, spredecessor, refarc,
2669 donemap, outerdonemap, nstates);
2670 /* sto should now be marked as previously visited */
2671 assert(NISERR() || donemap[sto->no] == 1);
2672 } else if (prevclone) {
2673 /*
2674 * We already have a clone state for this successor, so just
2675 * make another arc to it.
2676 */
2677 cparc(nfa, a, sclone, prevclone);
2678 } else {
2679 /*
2680 * We need to create a new successor clone state.
2681 */
2682 struct state *stoclone;
2683
2684 stoclone = newstate(nfa);
2685 if (stoclone == NULL) {
2686 assert(NISERR());
2687 break;
2688 }
2689 /* Mark it as to what it's a clone of */
2690 stoclone->tmp = sto;
2691 /* ... and add the outarc leading to it */
2692 cparc(nfa, a, sclone, stoclone);
2693 }
2694 } else {
2695 /*
2696 * Non-constraint outarcs just get copied to sclone, as do outarcs
2697 * leading to states with no constraint outarc.
2698 */
2699 cparc(nfa, a, sclone, sto);
2700 }
2701 }
2702
2703 /*
2704 * If we are at outer level for this clone state, recurse to all its child
2705 * clone states, clearing their tmp fields as we go. (If we're not
2706 * outermost for sclone, leave this to be done by the outer call level.)
2707 * Note that if we have multiple outarcs leading to the same clone state,
2708 * it will only be recursed-to once.
2709 */
2710 if (curdonemap == NULL) {
2711 for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain) {
2712 struct state *stoclone = a->to;
2713 struct state *sto = stoclone->tmp;
2714
2715 if (sto != NULL) {
2716 stoclone->tmp = NULL;
2717 clonesuccessorstates(nfa, sto, stoclone, spredecessor, refarc,
2718 NULL, donemap, nstates);
2719 }
2720 }
2721
2722 /* Don't forget to free sclone's donemap when done with it */
2723 FREE(donemap);
2724 }
2725 }
2726
2727 /*
2728 - cleanup - clean up NFA after optimizations
2729 ^ static VOID cleanup(struct nfa *);
2730 */
2731 static void
cleanup(struct nfa * nfa)2732 cleanup(
2733 struct nfa *nfa)
2734 {
2735 struct state *s;
2736 struct state *nexts;
2737 int n;
2738
2739 /*
2740 * Clear out unreachable or dead-end states. Use pre to mark reachable,
2741 * then post to mark can-reach-post.
2742 */
2743
2744 markreachable(nfa, nfa->pre, NULL, nfa->pre);
2745 markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
2746 for (s = nfa->states; s != NULL; s = nexts) {
2747 nexts = s->next;
2748 if (s->tmp != nfa->post && !s->flag) {
2749 dropstate(nfa, s);
2750 }
2751 }
2752 assert(nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
2753 cleartraverse(nfa, nfa->pre);
2754 assert(nfa->post->nins == 0 || nfa->post->tmp == NULL);
2755 /* the nins==0 (final unreachable) case will be caught later */
2756
2757 /*
2758 * Renumber surviving states.
2759 */
2760
2761 n = 0;
2762 for (s = nfa->states; s != NULL; s = s->next) {
2763 s->no = n++;
2764 }
2765 nfa->nstates = n;
2766 }
2767
2768 /*
2769 - markreachable - recursive marking of reachable states
2770 ^ static VOID markreachable(struct nfa *, struct state *, struct state *,
2771 ^ struct state *);
2772 */
2773 static void
markreachable(struct nfa * nfa,struct state * s,struct state * okay,struct state * mark)2774 markreachable(
2775 struct nfa *nfa,
2776 struct state *s,
2777 struct state *okay, /* consider only states with this mark */
2778 struct state *mark) /* the value to mark with */
2779 {
2780 struct arc *a;
2781
2782 if (s->tmp != okay) {
2783 return;
2784 }
2785 s->tmp = mark;
2786
2787 for (a = s->outs; a != NULL; a = a->outchain) {
2788 markreachable(nfa, a->to, okay, mark);
2789 }
2790 }
2791
2792 /*
2793 - markcanreach - recursive marking of states which can reach here
2794 ^ static VOID markcanreach(struct nfa *, struct state *, struct state *,
2795 ^ struct state *);
2796 */
2797 static void
markcanreach(struct nfa * nfa,struct state * s,struct state * okay,struct state * mark)2798 markcanreach(
2799 struct nfa *nfa,
2800 struct state *s,
2801 struct state *okay, /* consider only states with this mark */
2802 struct state *mark) /* the value to mark with */
2803 {
2804 struct arc *a;
2805
2806 if (s->tmp != okay) {
2807 return;
2808 }
2809 s->tmp = mark;
2810
2811 for (a = s->ins; a != NULL; a = a->inchain) {
2812 markcanreach(nfa, a->from, okay, mark);
2813 }
2814 }
2815
2816 /*
2817 - analyze - ascertain potentially-useful facts about an optimized NFA
2818 ^ static long analyze(struct nfa *);
2819 */
2820 static long /* re_info bits to be ORed in */
analyze(struct nfa * nfa)2821 analyze(
2822 struct nfa *nfa)
2823 {
2824 struct arc *a;
2825 struct arc *aa;
2826
2827 if (nfa->pre->outs == NULL) {
2828 return REG_UIMPOSSIBLE;
2829 }
2830 for (a = nfa->pre->outs; a != NULL; a = a->outchain) {
2831 for (aa = a->to->outs; aa != NULL; aa = aa->outchain) {
2832 if (aa->to == nfa->post) {
2833 return REG_UEMPTYMATCH;
2834 }
2835 }
2836 }
2837 return 0;
2838 }
2839
2840 /*
2841 - compact - construct the compact representation of an NFA
2842 ^ static VOID compact(struct nfa *, struct cnfa *);
2843 */
2844 static void
compact(struct nfa * nfa,struct cnfa * cnfa)2845 compact(
2846 struct nfa *nfa,
2847 struct cnfa *cnfa)
2848 {
2849 struct state *s;
2850 struct arc *a;
2851 size_t nstates;
2852 size_t narcs;
2853 struct carc *ca;
2854 struct carc *first;
2855
2856 assert(!NISERR());
2857
2858 nstates = 0;
2859 narcs = 0;
2860 for (s = nfa->states; s != NULL; s = s->next) {
2861 nstates++;
2862 narcs += 1 + s->nouts + 1;
2863 /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
2864 }
2865
2866 cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
2867 cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
2868 if (cnfa->states == NULL || cnfa->arcs == NULL) {
2869 if (cnfa->states != NULL) {
2870 FREE(cnfa->states);
2871 }
2872 if (cnfa->arcs != NULL) {
2873 FREE(cnfa->arcs);
2874 }
2875 NERR(REG_ESPACE);
2876 return;
2877 }
2878 cnfa->nstates = nstates;
2879 cnfa->pre = nfa->pre->no;
2880 cnfa->post = nfa->post->no;
2881 cnfa->bos[0] = nfa->bos[0];
2882 cnfa->bos[1] = nfa->bos[1];
2883 cnfa->eos[0] = nfa->eos[0];
2884 cnfa->eos[1] = nfa->eos[1];
2885 cnfa->ncolors = maxcolor(nfa->cm) + 1;
2886 cnfa->flags = 0;
2887
2888 ca = cnfa->arcs;
2889 for (s = nfa->states; s != NULL; s = s->next) {
2890 assert((size_t) s->no < nstates);
2891 cnfa->states[s->no] = ca;
2892 ca->co = 0; /* clear and skip flags "arc" */
2893 ca++;
2894 first = ca;
2895 for (a = s->outs; a != NULL; a = a->outchain) {
2896 switch (a->type) {
2897 case PLAIN:
2898 ca->co = a->co;
2899 ca->to = a->to->no;
2900 ca++;
2901 break;
2902 case LACON:
2903 assert(s->no != cnfa->pre);
2904 ca->co = (color) (cnfa->ncolors + a->co);
2905 ca->to = a->to->no;
2906 ca++;
2907 cnfa->flags |= HASLACONS;
2908 break;
2909 default:
2910 NERR(REG_ASSERT);
2911 break;
2912 }
2913 }
2914 carcsort(first, ca - first);
2915 ca->co = COLORLESS;
2916 ca->to = 0;
2917 ca++;
2918 }
2919 assert(ca == &cnfa->arcs[narcs]);
2920 assert(cnfa->nstates != 0);
2921
2922 /*
2923 * Mark no-progress states.
2924 */
2925
2926 for (a = nfa->pre->outs; a != NULL; a = a->outchain) {
2927 cnfa->states[a->to->no]->co = 1;
2928 }
2929 cnfa->states[nfa->pre->no]->co = 1;
2930 }
2931
2932 /*
2933 - carcsort - sort compacted-NFA arcs by color
2934 ^ static VOID carcsort(struct carc *, struct carc *);
2935 */
2936 static void
carcsort(struct carc * first,size_t n)2937 carcsort(
2938 struct carc *first,
2939 size_t n)
2940 {
2941 if (n > 1) {
2942 qsort(first, n, sizeof(struct carc), carc_cmp);
2943 }
2944 }
2945
2946 static int
carc_cmp(const void * a,const void * b)2947 carc_cmp(
2948 const void *a,
2949 const void *b)
2950 {
2951 const struct carc *aa = (const struct carc *) a;
2952 const struct carc *bb = (const struct carc *) b;
2953
2954 if (aa->co < bb->co) {
2955 return -1;
2956 }
2957 if (aa->co > bb->co) {
2958 return +1;
2959 }
2960 if (aa->to < bb->to) {
2961 return -1;
2962 }
2963 if (aa->to > bb->to) {
2964 return +1;
2965 }
2966 return 0;
2967 }
2968
2969 /*
2970 - freecnfa - free a compacted NFA
2971 ^ static VOID freecnfa(struct cnfa *);
2972 */
2973 static void
freecnfa(struct cnfa * cnfa)2974 freecnfa(
2975 struct cnfa *cnfa)
2976 {
2977 assert(cnfa->nstates != 0); /* not empty already */
2978 cnfa->nstates = 0;
2979 FREE(cnfa->states);
2980 FREE(cnfa->arcs);
2981 }
2982
2983 /*
2984 - dumpnfa - dump an NFA in human-readable form
2985 ^ static VOID dumpnfa(struct nfa *, FILE *);
2986 */
2987 static void
dumpnfa(struct nfa * nfa,FILE * f)2988 dumpnfa(
2989 struct nfa *nfa,
2990 FILE *f)
2991 {
2992 #ifdef REG_DEBUG
2993 struct state *s;
2994 int nstates = 0;
2995 int narcs = 0;
2996
2997 fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
2998 if (nfa->bos[0] != COLORLESS) {
2999 fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
3000 }
3001 if (nfa->bos[1] != COLORLESS) {
3002 fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
3003 }
3004 if (nfa->eos[0] != COLORLESS) {
3005 fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
3006 }
3007 if (nfa->eos[1] != COLORLESS) {
3008 fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
3009 }
3010 fprintf(f, "\n");
3011 for (s = nfa->states; s != NULL; s = s->next) {
3012 dumpstate(s, f);
3013 nstates++;
3014 narcs += s->nouts;
3015 }
3016 fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
3017 if (nfa->parent == NULL) {
3018 dumpcolors(nfa->cm, f);
3019 }
3020 fflush(f);
3021 #endif
3022 }
3023
3024 #ifdef REG_DEBUG /* subordinates of dumpnfa */
3025 /*
3026 ^ #ifdef REG_DEBUG
3027 */
3028
3029 /*
3030 - dumpstate - dump an NFA state in human-readable form
3031 ^ static VOID dumpstate(struct state *, FILE *);
3032 */
3033 static void
dumpstate(struct state * s,FILE * f)3034 dumpstate(
3035 struct state *s,
3036 FILE *f)
3037 {
3038 struct arc *a;
3039
3040 fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
3041 (s->flag) ? s->flag : '.');
3042 if (s->prev != NULL && s->prev->next != s) {
3043 fprintf(f, "\tstate chain bad\n");
3044 }
3045 if (s->nouts == 0) {
3046 fprintf(f, "\tno out arcs\n");
3047 } else {
3048 dumparcs(s, f);
3049 }
3050 fflush(f);
3051 for (a = s->ins; a != NULL; a = a->inchain) {
3052 if (a->to != s) {
3053 fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
3054 a->from->no, a->to->no, s->no);
3055 }
3056 }
3057 }
3058
3059 /*
3060 - dumparcs - dump out-arcs in human-readable form
3061 ^ static VOID dumparcs(struct state *, FILE *);
3062 */
3063 static void
dumparcs(struct state * s,FILE * f)3064 dumparcs(
3065 struct state *s,
3066 FILE *f)
3067 {
3068 int pos;
3069 struct arc *a;
3070
3071 /* printing oldest arcs first is usually clearer */
3072 a = s->outs;
3073 assert(a != NULL);
3074 while (a->outchain != NULL) {
3075 a = a->outchain;
3076 }
3077 pos = 1;
3078 do {
3079 dumparc(a, s, f);
3080 if (pos == 5) {
3081 fprintf(f, "\n");
3082 pos = 1;
3083 } else {
3084 pos++;
3085 }
3086 a = a->outchainRev;
3087 } while (a != NULL);
3088 if (pos != 1) {
3089 fprintf(f, "\n");
3090 }
3091 }
3092
3093 /*
3094 - dumparc - dump one outarc in readable form, including prefixing tab
3095 ^ static VOID dumparc(struct arc *, struct state *, FILE *);
3096 */
3097 static void
dumparc(struct arc * a,struct state * s,FILE * f)3098 dumparc(
3099 struct arc *a,
3100 struct state *s,
3101 FILE *f)
3102 {
3103 struct arc *aa;
3104 struct arcbatch *ab;
3105
3106 fprintf(f, "\t");
3107 switch (a->type) {
3108 case PLAIN:
3109 fprintf(f, "[%ld]", (long) a->co);
3110 break;
3111 case AHEAD:
3112 fprintf(f, ">%ld>", (long) a->co);
3113 break;
3114 case BEHIND:
3115 fprintf(f, "<%ld<", (long) a->co);
3116 break;
3117 case LACON:
3118 fprintf(f, ":%ld:", (long) a->co);
3119 break;
3120 case '^':
3121 case '$':
3122 fprintf(f, "%c%d", a->type, (int) a->co);
3123 break;
3124 case EMPTY:
3125 break;
3126 default:
3127 fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
3128 break;
3129 }
3130 if (a->from != s) {
3131 fprintf(f, "?%d?", a->from->no);
3132 }
3133 for (ab = &a->from->oas; ab != NULL; ab = ab->next) {
3134 for (aa = &ab->a[0]; aa < &ab->a[ABSIZE]; aa++) {
3135 if (aa == a) {
3136 break; /* NOTE BREAK OUT */
3137 }
3138 }
3139 if (aa < &ab->a[ABSIZE]) { /* propagate break */
3140 break; /* NOTE BREAK OUT */
3141 }
3142 }
3143 if (ab == NULL) {
3144 fprintf(f, "?!?"); /* not in allocated space */
3145 }
3146 fprintf(f, "->");
3147 if (a->to == NULL) {
3148 fprintf(f, "NULL");
3149 return;
3150 }
3151 fprintf(f, "%d", a->to->no);
3152 for (aa = a->to->ins; aa != NULL; aa = aa->inchain) {
3153 if (aa == a) {
3154 break; /* NOTE BREAK OUT */
3155 }
3156 }
3157 if (aa == NULL) {
3158 fprintf(f, "?!?"); /* missing from in-chain */
3159 }
3160 }
3161
3162 /*
3163 ^ #endif
3164 */
3165 #endif /* ifdef REG_DEBUG */
3166
3167 /*
3168 - dumpcnfa - dump a compacted NFA in human-readable form
3169 ^ static VOID dumpcnfa(struct cnfa *, FILE *);
3170 */
3171 static void
dumpcnfa(struct cnfa * cnfa,FILE * f)3172 dumpcnfa(
3173 struct cnfa *cnfa,
3174 FILE *f)
3175 {
3176 #ifdef REG_DEBUG
3177 int st;
3178
3179 fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
3180 if (cnfa->bos[0] != COLORLESS) {
3181 fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
3182 }
3183 if (cnfa->bos[1] != COLORLESS) {
3184 fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
3185 }
3186 if (cnfa->eos[0] != COLORLESS) {
3187 fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
3188 }
3189 if (cnfa->eos[1] != COLORLESS) {
3190 fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
3191 }
3192 if (cnfa->flags&HASLACONS) {
3193 fprintf(f, ", haslacons");
3194 }
3195 fprintf(f, "\n");
3196 for (st = 0; st < cnfa->nstates; st++) {
3197 dumpcstate(st, cnfa->states[st], cnfa, f);
3198 }
3199 fflush(f);
3200 #endif
3201 }
3202
3203 #ifdef REG_DEBUG /* subordinates of dumpcnfa */
3204 /*
3205 ^ #ifdef REG_DEBUG
3206 */
3207
3208 /*
3209 - dumpcstate - dump a compacted-NFA state in human-readable form
3210 ^ static VOID dumpcstate(int, struct carc *, struct cnfa *, FILE *);
3211 */
3212 static void
dumpcstate(int st,struct carc * ca,struct cnfa * cnfa,FILE * f)3213 dumpcstate(
3214 int st,
3215 struct carc *ca,
3216 struct cnfa *cnfa,
3217 FILE *f)
3218 {
3219 int i;
3220 int pos;
3221
3222 fprintf(f, "%d%s", st, (ca[0].co) ? ":" : ".");
3223 pos = 1;
3224 for (i = 1; ca[i].co != COLORLESS; i++) {
3225 if (ca[i].co < cnfa->ncolors) {
3226 fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to);
3227 } else {
3228 fprintf(f, "\t:%ld:->%d", (long) ca[i].co-cnfa->ncolors,ca[i].to);
3229 }
3230 if (pos == 5) {
3231 fprintf(f, "\n");
3232 pos = 1;
3233 } else {
3234 pos++;
3235 }
3236 }
3237 if (i == 1 || pos != 1) {
3238 fprintf(f, "\n");
3239 }
3240 fflush(f);
3241 }
3242
3243 /*
3244 ^ #endif
3245 */
3246 #endif /* ifdef REG_DEBUG */
3247
3248 /*
3249 * Local Variables:
3250 * mode: c
3251 * c-basic-offset: 4
3252 * fill-column: 78
3253 * End:
3254 */
3255