1 /*
2  * NFA utilities.
3  * This file is #included by regcomp.c.
4  *
5  * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6  *
7  * Development of this software was funded, in part, by Cray Research Inc.,
8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9  * Corporation, none of whom are responsible for the results. The author
10  * thanks all of them.
11  *
12  * Redistribution and use in source and binary forms -- with or without
13  * modification -- are permitted for any purpose, provided that
14  * redistributions in source form retain this entire copyright notice and
15  * indicate the origin and nature of any modifications.
16  *
17  * I'd appreciate being given credit for this package in the documentation of
18  * software which uses it, but that is not a requirement.
19  *
20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * One or two things that technically ought to be in here are actually in
32  * color.c, thanks to some incestuous relationships in the color chains.
33  */
34 
35 #define	NISERR()	VISERR(nfa->v)
36 #define	NERR(e)		VERR(nfa->v, (e))
37 #define STACK_TOO_DEEP(x) (0)
38 #define CANCEL_REQUESTED(x) (0)
39 #define REG_CANCEL 777
40 
41 /*
42  - newnfa - set up an NFA
43  ^ static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
44  */
45 static struct nfa *		/* the NFA, or NULL */
newnfa(struct vars * v,struct colormap * cm,struct nfa * parent)46 newnfa(
47     struct vars *v,
48     struct colormap *cm,
49     struct nfa *parent)		/* NULL if primary NFA */
50 {
51     struct nfa *nfa;
52 
53     nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
54     if (nfa == NULL) {
55 	ERR(REG_ESPACE);
56 	return NULL;
57     }
58 
59     nfa->states = NULL;
60     nfa->slast = NULL;
61     nfa->free = NULL;
62     nfa->nstates = 0;
63     nfa->cm = cm;
64     nfa->v = v;
65     nfa->size = 0;
66     nfa->bos[0] = nfa->bos[1] = COLORLESS;
67     nfa->eos[0] = nfa->eos[1] = COLORLESS;
68     nfa->parent = parent;	/* Precedes newfstate so parent is valid. */
69     nfa->post = newfstate(nfa, '@');	/* number 0 */
70     nfa->pre = newfstate(nfa, '>');	/* number 1 */
71 
72     nfa->init = newstate(nfa);	/* May become invalid later. */
73     nfa->final = newstate(nfa);
74     if (ISERR()) {
75 	freenfa(nfa);
76 	return NULL;
77     }
78     rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
79     newarc(nfa, '^', 1, nfa->pre, nfa->init);
80     newarc(nfa, '^', 0, nfa->pre, nfa->init);
81     rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
82     newarc(nfa, '$', 1, nfa->final, nfa->post);
83     newarc(nfa, '$', 0, nfa->final, nfa->post);
84 
85     if (ISERR()) {
86 	freenfa(nfa);
87 	return NULL;
88     }
89     return nfa;
90 }
91 
92 /*
93  - TooManyStates - checks if the max states exceeds the compile-time value
94  ^ static int TooManyStates(struct nfa *);
95  */
96 static int
TooManyStates(struct nfa * nfa)97 TooManyStates(
98     struct nfa *nfa)
99 {
100     struct nfa *parent = nfa->parent;
101     size_t sz = nfa->size;
102 
103     while (parent != NULL) {
104 	sz = parent->size;
105 	parent = parent->parent;
106     }
107     if (sz > REG_MAX_STATES) {
108 	return 1;
109     }
110     return 0;
111 }
112 
113 /*
114  - IncrementSize - increases the tracked size of the NFA and its parents.
115  ^ static void IncrementSize(struct nfa *);
116  */
117 static void
IncrementSize(struct nfa * nfa)118 IncrementSize(
119     struct nfa *nfa)
120 {
121     struct nfa *parent = nfa->parent;
122 
123     nfa->size++;
124     while (parent != NULL) {
125 	parent->size++;
126 	parent = parent->parent;
127     }
128 }
129 
130 /*
131  - DecrementSize - increases the tracked size of the NFA and its parents.
132  ^ static void DecrementSize(struct nfa *);
133  */
134 static void
DecrementSize(struct nfa * nfa)135 DecrementSize(
136     struct nfa *nfa)
137 {
138     struct nfa *parent = nfa->parent;
139 
140     nfa->size--;
141     while (parent != NULL) {
142 	parent->size--;
143 	parent = parent->parent;
144     }
145 }
146 
147 /*
148  - freenfa - free an entire NFA
149  ^ static VOID freenfa(struct nfa *);
150  */
151 static void
freenfa(struct nfa * nfa)152 freenfa(
153     struct nfa *nfa)
154 {
155     struct state *s;
156 
157     while ((s = nfa->states) != NULL) {
158 	s->nins = s->nouts = 0;	/* don't worry about arcs */
159 	freestate(nfa, s);
160     }
161     while ((s = nfa->free) != NULL) {
162 	nfa->free = s->next;
163 	destroystate(nfa, s);
164     }
165 
166     nfa->slast = NULL;
167     nfa->nstates = -1;
168     nfa->pre = NULL;
169     nfa->post = NULL;
170     FREE(nfa);
171 }
172 
173 /*
174  - newstate - allocate an NFA state, with zero flag value
175  ^ static struct state *newstate(struct nfa *);
176  */
177 static struct state *		/* NULL on error */
newstate(struct nfa * nfa)178 newstate(
179     struct nfa *nfa)
180 {
181     struct state *s;
182 
183     if (TooManyStates(nfa)) {
184 	/* XXX: add specific error for this */
185 	NERR(REG_ETOOBIG);
186 	return NULL;
187     }
188     if (nfa->free != NULL) {
189 	s = nfa->free;
190 	nfa->free = s->next;
191     } else {
192 	s = (struct state *) MALLOC(sizeof(struct state));
193 	if (s == NULL) {
194 	    NERR(REG_ESPACE);
195 	    return NULL;
196 	}
197 	s->oas.next = NULL;
198 	s->free = NULL;
199 	s->noas = 0;
200     }
201 
202     assert(nfa->nstates >= 0);
203     s->no = nfa->nstates++;
204     s->flag = 0;
205     if (nfa->states == NULL) {
206 	nfa->states = s;
207     }
208     s->nins = 0;
209     s->ins = NULL;
210     s->nouts = 0;
211     s->outs = NULL;
212     s->tmp = NULL;
213     s->next = NULL;
214     if (nfa->slast != NULL) {
215 	assert(nfa->slast->next == NULL);
216 	nfa->slast->next = s;
217     }
218     s->prev = nfa->slast;
219     nfa->slast = s;
220 
221     /*
222      * Track the current size and the parent size.
223      */
224 
225     IncrementSize(nfa);
226     return s;
227 }
228 
229 /*
230  - newfstate - allocate an NFA state with a specified flag value
231  ^ static struct state *newfstate(struct nfa *, int flag);
232  */
233 static struct state *		/* NULL on error */
newfstate(struct nfa * nfa,int flag)234 newfstate(
235     struct nfa *nfa,
236     int flag)
237 {
238     struct state *s;
239 
240     s = newstate(nfa);
241     if (s != NULL) {
242 	s->flag = (char) flag;
243     }
244     return s;
245 }
246 
247 /*
248  - dropstate - delete a state's inarcs and outarcs and free it
249  ^ static VOID dropstate(struct nfa *, struct state *);
250  */
251 static void
dropstate(struct nfa * nfa,struct state * s)252 dropstate(
253     struct nfa *nfa,
254     struct state *s)
255 {
256     struct arc *a;
257 
258     while ((a = s->ins) != NULL) {
259 	freearc(nfa, a);
260     }
261     while ((a = s->outs) != NULL) {
262 	freearc(nfa, a);
263     }
264     freestate(nfa, s);
265 }
266 
267 /*
268  - freestate - free a state, which has no in-arcs or out-arcs
269  ^ static VOID freestate(struct nfa *, struct state *);
270  */
271 static void
freestate(struct nfa * nfa,struct state * s)272 freestate(
273     struct nfa *nfa,
274     struct state *s)
275 {
276     assert(s != NULL);
277     assert(s->nins == 0 && s->nouts == 0);
278 
279     s->no = FREESTATE;
280     s->flag = 0;
281     if (s->next != NULL) {
282 	s->next->prev = s->prev;
283     } else {
284 	assert(s == nfa->slast);
285 	nfa->slast = s->prev;
286     }
287     if (s->prev != NULL) {
288 	s->prev->next = s->next;
289     } else {
290 	assert(s == nfa->states);
291 	nfa->states = s->next;
292     }
293     s->prev = NULL;
294     s->next = nfa->free;	/* don't delete it, put it on the free list */
295     nfa->free = s;
296     DecrementSize(nfa);
297 }
298 
299 /*
300  - destroystate - really get rid of an already-freed state
301  ^ static VOID destroystate(struct nfa *, struct state *);
302  */
303 static void
destroystate(struct nfa * nfa,struct state * s)304 destroystate(
305     struct nfa *nfa,
306     struct state *s)
307 {
308     struct arcbatch *ab;
309     struct arcbatch *abnext;
310 
311     assert(s->no == FREESTATE);
312     for (ab=s->oas.next ; ab!=NULL ; ab=abnext) {
313 	abnext = ab->next;
314 	FREE(ab);
315     }
316     s->ins = NULL;
317     s->outs = NULL;
318     s->next = NULL;
319     FREE(s);
320 }
321 
322 /*
323  - newarc - set up a new arc within an NFA
324  ^ static VOID newarc(struct nfa *, int, pcolor, struct state *,
325  ^	struct state *);
326  */
327 /*
328  * This function checks to make sure that no duplicate arcs are created.
329  * In general we never want duplicates.
330  */
331 static void
newarc(struct nfa * nfa,int t,pcolor co,struct state * from,struct state * to)332 newarc(
333     struct nfa *nfa,
334     int t,
335     pcolor co,
336     struct state *from,
337     struct state *to)
338 {
339     struct arc *a;
340 
341     assert(from != NULL && to != NULL);
342 
343     /* check for duplicate arc, using whichever chain is shorter */
344     if (from->nouts <= to->nins) {
345 	for (a = from->outs; a != NULL; a = a->outchain) {
346 	    if (a->to == to && a->co == co && a->type == t) {
347 		return;
348 	    }
349 	}
350     } else {
351 	for (a = to->ins; a != NULL; a = a->inchain) {
352 	    if (a->from == from && a->co == co && a->type == t) {
353 		return;
354 	    }
355 	}
356     }
357 
358     /* no dup, so create the arc */
359     createarc(nfa, t, co, from, to);
360 }
361 
362 /*
363  * createarc - create a new arc within an NFA
364  *
365  * This function must *only* be used after verifying that there is no existing
366  * identical arc (same type/color/from/to).
367  */
368 static void
createarc(struct nfa * nfa,int t,pcolor co,struct state * from,struct state * to)369 createarc(
370     struct nfa * nfa,
371     int t,
372     pcolor co,
373     struct state * from,
374     struct state * to)
375 {
376     struct arc *a;
377 
378     /* the arc is physically allocated within its from-state */
379     a = allocarc(nfa, from);
380     if (NISERR()) {
381 	return;
382     }
383     assert(a != NULL);
384 
385     a->type = t;
386     a->co = (color) co;
387     a->to = to;
388     a->from = from;
389 
390     /*
391      * Put the new arc on the beginning, not the end, of the chains; it's
392      * simpler here, and freearc() is the same cost either way.  See also the
393      * logic in moveins() and its cohorts, as well as fixempties().
394      */
395     a->inchain = to->ins;
396     a->inchainRev = NULL;
397     if (to->ins) {
398 	to->ins->inchainRev = a;
399     }
400     to->ins = a;
401     a->outchain = from->outs;
402     a->outchainRev = NULL;
403     if (from->outs) {
404 	from->outs->outchainRev = a;
405     }
406     from->outs = a;
407 
408     from->nouts++;
409     to->nins++;
410 
411     if (COLORED(a) && nfa->parent == NULL) {
412 	colorchain(nfa->cm, a);
413     }
414 }
415 
416 /*
417  - allocarc - allocate a new out-arc within a state
418  ^ static struct arc *allocarc(struct nfa *, struct state *);
419  */
420 static struct arc *		/* NULL for failure */
allocarc(struct nfa * nfa,struct state * s)421 allocarc(
422     struct nfa *nfa,
423     struct state *s)
424 {
425     struct arc *a;
426 
427     /*
428      * Shortcut
429      */
430 
431     if (s->free == NULL && s->noas < ABSIZE) {
432 	a = &s->oas.a[s->noas];
433 	s->noas++;
434 	return a;
435     }
436 
437     /*
438      * if none at hand, get more
439      */
440 
441     if (s->free == NULL) {
442 	struct arcbatch *newAb = (struct arcbatch *)
443 		MALLOC(sizeof(struct arcbatch));
444 	int i;
445 
446 	if (newAb == NULL) {
447 	    NERR(REG_ESPACE);
448 	    return NULL;
449 	}
450 	newAb->next = s->oas.next;
451 	s->oas.next = newAb;
452 
453 	for (i=0 ; i<ABSIZE ; i++) {
454 	    newAb->a[i].type = 0;
455 	    newAb->a[i].freechain = &newAb->a[i+1];
456 	}
457 	newAb->a[ABSIZE-1].freechain = NULL;
458 	s->free = &newAb->a[0];
459     }
460     assert(s->free != NULL);
461 
462     a = s->free;
463     s->free = a->freechain;
464     return a;
465 }
466 
467 /*
468  - freearc - free an arc
469  ^ static VOID freearc(struct nfa *, struct arc *);
470  */
471 static void
freearc(struct nfa * nfa,struct arc * victim)472 freearc(
473     struct nfa *nfa,
474     struct arc *victim)
475 {
476     struct state *from = victim->from;
477     struct state *to = victim->to;
478     struct arc *predecessor;
479 
480     assert(victim->type != 0);
481 
482     /*
483      * Take it off color chain if necessary.
484      */
485 
486     if (COLORED(victim) && nfa->parent == NULL) {
487 	uncolorchain(nfa->cm, victim);
488     }
489 
490     /*
491      * Take it off source's out-chain.
492      */
493 
494     assert(from != NULL);
495     predecessor = victim->outchainRev;
496     if (predecessor == NULL) {
497 	assert(from->outs == victim);
498 	from->outs = victim->outchain;
499     } else {
500 	assert(predecessor->outchain == victim);
501 	predecessor->outchain = victim->outchain;
502     }
503     if (victim->outchain != NULL) {
504 	assert(victim->outchain->outchainRev == victim);
505 	victim->outchain->outchainRev = predecessor;
506     }
507     from->nouts--;
508 
509     /*
510      * Take it off target's in-chain.
511      */
512 
513     assert(to != NULL);
514     predecessor = victim->inchainRev;
515     if (predecessor == NULL) {
516 	assert(to->ins == victim);
517 	to->ins = victim->inchain;
518     } else {
519 	assert(predecessor->inchain == victim);
520 	predecessor->inchain = victim->inchain;
521     }
522     if (victim->inchain != NULL) {
523 	assert(victim->inchain->inchainRev == victim);
524 	victim->inchain->inchainRev = predecessor;
525     }
526     to->nins--;
527 
528     /*
529      * Clean up and place on from-state's free list.
530      */
531 
532     victim->type = 0;
533     victim->from = NULL;	/* precautions... */
534     victim->to = NULL;
535     victim->inchain = NULL;
536     victim->inchainRev = NULL;
537     victim->outchain = NULL;
538     victim->outchainRev = NULL;
539     victim->freechain = from->free;
540     from->free = victim;
541 }
542 
543 /*
544  * changearctarget - flip an arc to have a different to state
545  *
546  * Caller must have verified that there is no pre-existing duplicate arc.
547  *
548  * Note that because we store arcs in their from state, we can't easily have
549  * a similar changearcsource function.
550  */
551 static void
changearctarget(struct arc * a,struct state * newto)552 changearctarget(struct arc * a, struct state * newto)
553 {
554     struct state *oldto = a->to;
555     struct arc *predecessor;
556 
557     assert(oldto != newto);
558 
559     /* take it off old target's in-chain */
560     assert(oldto != NULL);
561     predecessor = a->inchainRev;
562     if (predecessor == NULL) {
563 	assert(oldto->ins == a);
564 	oldto->ins = a->inchain;
565     } else {
566 	assert(predecessor->inchain == a);
567 	predecessor->inchain = a->inchain;
568     }
569     if (a->inchain != NULL) {
570 	assert(a->inchain->inchainRev == a);
571 	a->inchain->inchainRev = predecessor;
572     }
573     oldto->nins--;
574 
575     a->to = newto;
576 
577     /* prepend it to new target's in-chain */
578     a->inchain = newto->ins;
579     a->inchainRev = NULL;
580     if (newto->ins) {
581 	newto->ins->inchainRev = a;
582     }
583     newto->ins = a;
584     newto->nins++;
585 }
586 
587 /*
588  - hasnonemptyout - Does state have a non-EMPTY out arc?
589  ^ static int hasnonemptyout(struct state *);
590  */
591 static int
hasnonemptyout(struct state * s)592 hasnonemptyout(
593     struct state *s)
594 {
595     struct arc *a;
596 
597     for (a = s->outs; a != NULL; a = a->outchain) {
598 	if (a->type != EMPTY) {
599 	    return 1;
600 	}
601     }
602     return 0;
603 }
604 
605 /*
606  - findarc - find arc, if any, from given source with given type and color
607  * If there is more than one such arc, the result is random.
608  ^ static struct arc *findarc(struct state *, int, pcolor);
609  */
610 static struct arc *
findarc(struct state * s,int type,pcolor co)611 findarc(
612     struct state *s,
613     int type,
614     pcolor co)
615 {
616     struct arc *a;
617 
618     for (a=s->outs ; a!=NULL ; a=a->outchain) {
619 	if (a->type == type && a->co == co) {
620 	    return a;
621 	}
622     }
623     return NULL;
624 }
625 
626 /*
627  - cparc - allocate a new arc within an NFA, copying details from old one
628  ^ static VOID cparc(struct nfa *, struct arc *, struct state *,
629  ^ 	struct state *);
630  */
631 static void
cparc(struct nfa * nfa,struct arc * oa,struct state * from,struct state * to)632 cparc(
633     struct nfa *nfa,
634     struct arc *oa,
635     struct state *from,
636     struct state *to)
637 {
638     newarc(nfa, oa->type, oa->co, from, to);
639 }
640 
641 /*
642  * sortins - sort the in arcs of a state by from/color/type
643  */
644 static void
sortins(struct nfa * nfa,struct state * s)645 sortins(
646     struct nfa * nfa,
647     struct state * s)
648 {
649     struct arc **sortarray;
650     struct arc *a;
651     int n = s->nins;
652     int i;
653 
654     if (n <= 1) {
655 	return;		/* nothing to do */
656     }
657     /* make an array of arc pointers ... */
658     sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
659     if (sortarray == NULL) {
660 	NERR(REG_ESPACE);
661 	return;
662     }
663     i = 0;
664     for (a = s->ins; a != NULL; a = a->inchain) {
665 	sortarray[i++] = a;
666     }
667     assert(i == n);
668     /* ... sort the array */
669     qsort(sortarray, n, sizeof(struct arc *), sortins_cmp);
670     /* ... and rebuild arc list in order */
671     /* it seems worth special-casing first and last items to simplify loop */
672     a = sortarray[0];
673     s->ins = a;
674     a->inchain = sortarray[1];
675     a->inchainRev = NULL;
676     for (i = 1; i < n - 1; i++) {
677 	a = sortarray[i];
678 	a->inchain = sortarray[i + 1];
679 	a->inchainRev = sortarray[i - 1];
680     }
681     a = sortarray[i];
682     a->inchain = NULL;
683     a->inchainRev = sortarray[i - 1];
684     FREE(sortarray);
685 }
686 
687 static int
sortins_cmp(const void * a,const void * b)688 sortins_cmp(
689     const void *a,
690     const void *b)
691 {
692     const struct arc *aa = *((const struct arc * const *) a);
693     const struct arc *bb = *((const struct arc * const *) b);
694 
695     /* we check the fields in the order they are most likely to be different */
696     if (aa->from->no < bb->from->no) {
697 	return -1;
698     }
699     if (aa->from->no > bb->from->no) {
700  	return 1;
701     }
702     if (aa->co < bb->co) {
703  	return -1;
704     }
705     if (aa->co > bb->co) {
706  	return 1;
707     }
708     if (aa->type < bb->type) {
709  	return -1;
710     }
711     if (aa->type > bb->type) {
712  	return 1;
713     }
714     return 0;
715 }
716 
717 /*
718  * sortouts - sort the out arcs of a state by to/color/type
719  */
720 static void
sortouts(struct nfa * nfa,struct state * s)721 sortouts(
722     struct nfa * nfa,
723     struct state * s)
724 {
725     struct arc **sortarray;
726     struct arc *a;
727     int	n = s->nouts;
728     int	i;
729 
730     if (n <= 1) {
731 	return;					/* nothing to do */
732     }
733     /* make an array of arc pointers ... */
734     sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
735     if (sortarray == NULL) {
736 	NERR(REG_ESPACE);
737 	return;
738     }
739     i = 0;
740     for (a = s->outs; a != NULL; a = a->outchain) {
741 	sortarray[i++] = a;
742     }
743     assert(i == n);
744     /* ... sort the array */
745     qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp);
746     /* ... and rebuild arc list in order */
747     /* it seems worth special-casing first and last items to simplify loop */
748     a = sortarray[0];
749     s->outs = a;
750     a->outchain = sortarray[1];
751     a->outchainRev = NULL;
752     for (i = 1; i < n - 1; i++) {
753 	a = sortarray[i];
754 	a->outchain = sortarray[i + 1];
755 	a->outchainRev = sortarray[i - 1];
756     }
757     a = sortarray[i];
758     a->outchain = NULL;
759     a->outchainRev = sortarray[i - 1];
760     FREE(sortarray);
761 }
762 
763 static int
sortouts_cmp(const void * a,const void * b)764 sortouts_cmp(
765     const void *a,
766     const void *b)
767 {
768     const struct arc *aa = *((const struct arc * const *) a);
769     const struct arc *bb = *((const struct arc * const *) b);
770 
771     /* we check the fields in the order they are most likely to be different */
772     if (aa->to->no < bb->to->no) {
773 	return -1;
774     }
775     if (aa->to->no > bb->to->no) {
776 	return 1;
777     }
778     if (aa->co < bb->co) {
779 	return -1;
780     }
781     if (aa->co > bb->co) {
782 	return 1;
783     }
784     if (aa->type < bb->type) {
785 	return -1;
786     }
787     if (aa->type > bb->type) {
788 	return 1;
789     }
790     return 0;
791 }
792 
793 /*
794  * Common decision logic about whether to use arc-by-arc operations or
795  * sort/merge.  If there's just a few source arcs we cannot recoup the
796  * cost of sorting the destination arc list, no matter how large it is.
797  * Otherwise, limit the number of arc-by-arc comparisons to about 1000
798  * (a somewhat arbitrary choice, but the breakeven point would probably
799  * be machine dependent anyway).
800  */
801 #define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
802 	((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32))
803 
804 /*
805  - moveins - move all in arcs of a state to another state
806  * You might think this could be done better by just updating the
807  * existing arcs, and you would be right if it weren't for the need
808  * for duplicate suppression, which makes it easier to just make new
809  * ones to exploit the suppression built into newarc.
810  *
811  * However, if we have a whole lot of arcs to deal with, retail duplicate
812  * checks become too slow.  In that case we proceed by sorting and merging
813  * the arc lists, and then we can indeed just update the arcs in-place.
814  *
815  ^ static VOID moveins(struct nfa *, struct state *, struct state *);
816  */
817 static void
moveins(struct nfa * nfa,struct state * oldState,struct state * newState)818 moveins(
819     struct nfa *nfa,
820     struct state *oldState,
821     struct state *newState)
822 {
823     assert(oldState != newState);
824 
825     if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) {
826 	/* With not too many arcs, just do them one at a time */
827 	struct arc *a;
828 
829 	while ((a = oldState->ins) != NULL) {
830 	    cparc(nfa, a, a->from, newState);
831 	    freearc(nfa, a);
832 	}
833     } else {
834 	/*
835 	 * With many arcs, use a sort-merge approach.  Note changearctarget()
836 	 * will put the arc onto the front of newState's chain, so it does not
837 	 * break our walk through the sorted part of the chain.
838 	 */
839 	struct arc *oa;
840 	struct arc *na;
841 
842 	/*
843 	 * Because we bypass newarc() in this code path, we'd better include a
844 	 * cancel check.
845 	 */
846 	if (CANCEL_REQUESTED(nfa->v->re)) {
847 	    NERR(REG_CANCEL);
848 	    return;
849 	}
850 
851 	sortins(nfa, oldState);
852 	sortins(nfa, newState);
853 	if (NISERR()) {
854 	    return;		/* might have failed to sort */
855 	}
856 	oa = oldState->ins;
857 	na = newState->ins;
858 	while (oa != NULL && na != NULL) {
859 	    struct arc *a = oa;
860 
861 	    switch (sortins_cmp(&oa, &na)) {
862 		case -1:
863 		    /* newState does not have anything matching oa */
864 		    oa = oa->inchain;
865 
866 		    /*
867 		     * Rather than doing createarc+freearc, we can just unlink
868 		     * and relink the existing arc struct.
869 		     */
870 		    changearctarget(a, newState);
871 		    break;
872 		case 0:
873 		    /* match, advance in both lists */
874 		    oa = oa->inchain;
875 		    na = na->inchain;
876 		    /* ... and drop duplicate arc from oldState */
877 		    freearc(nfa, a);
878 		    break;
879 		case +1:
880 		    /* advance only na; oa might have a match later */
881 		    na = na->inchain;
882 		    break;
883 		default:
884 		    assert(NOTREACHED);
885 	    }
886 	}
887 	while (oa != NULL) {
888 	    /* newState does not have anything matching oa */
889 	    struct arc *a = oa;
890 
891 	    oa = oa->inchain;
892 	    changearctarget(a, newState);
893 	}
894     }
895 
896     assert(oldState->nins == 0);
897     assert(oldState->ins == NULL);
898 }
899 
900 /*
901  - copyins - copy in arcs of a state to another state
902  ^ static VOID copyins(struct nfa *, struct state *, struct state *, int);
903  */
904 static void
copyins(struct nfa * nfa,struct state * oldState,struct state * newState)905 copyins(
906     struct nfa *nfa,
907     struct state *oldState,
908     struct state *newState)
909 {
910     assert(oldState != newState);
911 
912     if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) {
913 	/* With not too many arcs, just do them one at a time */
914 	struct arc *a;
915 
916 	for (a = oldState->ins; a != NULL; a = a->inchain) {
917 	    cparc(nfa, a, a->from, newState);
918 	}
919     } else {
920 	/*
921 	 * With many arcs, use a sort-merge approach.  Note that createarc()
922 	 * will put new arcs onto the front of newState's chain, so it does
923 	 * not break our walk through the sorted part of the chain.
924 	 */
925 	struct arc *oa;
926 	struct arc *na;
927 
928 	/*
929 	 * Because we bypass newarc() in this code path, we'd better include a
930 	 * cancel check.
931 	 */
932 	if (CANCEL_REQUESTED(nfa->v->re)) {
933 	    NERR(REG_CANCEL);
934 	    return;
935 	}
936 
937 	sortins(nfa, oldState);
938 	sortins(nfa, newState);
939 	if (NISERR()) {
940 	    return;		/* might have failed to sort */
941 	}
942 	oa = oldState->ins;
943 	na = newState->ins;
944 	while (oa != NULL && na != NULL) {
945 	    struct arc *a = oa;
946 
947 	    switch (sortins_cmp(&oa, &na)) {
948 		case -1:
949 		    /* newState does not have anything matching oa */
950 		    oa = oa->inchain;
951 		    createarc(nfa, a->type, a->co, a->from, newState);
952 		    break;
953 		case 0:
954 		    /* match, advance in both lists */
955 		    oa = oa->inchain;
956 		    na = na->inchain;
957 		    break;
958 		case +1:
959 		    /* advance only na; oa might have a match later */
960 		    na = na->inchain;
961 		    break;
962 		default:
963 		    assert(NOTREACHED);
964 	    }
965 	}
966 	while (oa != NULL) {
967 	    /* newState does not have anything matching oa */
968 	    struct arc *a = oa;
969 
970 	    oa = oa->inchain;
971 	    createarc(nfa, a->type, a->co, a->from, newState);
972 	}
973     }
974 }
975 
976 /*
977  * mergeins - merge a list of inarcs into a state
978  *
979  * This is much like copyins, but the source arcs are listed in an array,
980  * and are not guaranteed unique.  It's okay to clobber the array contents.
981  */
982 static void
mergeins(struct nfa * nfa,struct state * s,struct arc ** arcarray,int arccount)983 mergeins(
984     struct nfa * nfa,
985     struct state * s,
986     struct arc ** arcarray,
987     int arccount)
988 {
989     struct arc *na;
990     int	i;
991     int	j;
992 
993     if (arccount <= 0) {
994 	return;
995     }
996 
997     /*
998      * Because we bypass newarc() in this code path, we'd better include a
999      * cancel check.
1000      */
1001     if (CANCEL_REQUESTED(nfa->v->re)) {
1002 	NERR(REG_CANCEL);
1003 	return;
1004     }
1005 
1006     /* Sort existing inarcs as well as proposed new ones */
1007     sortins(nfa, s);
1008     if (NISERR()) {
1009 	return;			/* might have failed to sort */
1010     }
1011 
1012     qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
1013 
1014     /*
1015      * arcarray very likely includes dups, so we must eliminate them.  (This
1016      * could be folded into the next loop, but it's not worth the trouble.)
1017      */
1018     j = 0;
1019     for (i = 1; i < arccount; i++) {
1020 	switch (sortins_cmp(&arcarray[j], &arcarray[i])) {
1021 	    case -1:
1022 		/* non-dup */
1023 		arcarray[++j] = arcarray[i];
1024 		break;
1025 	    case 0:
1026 		/* dup */
1027 		break;
1028 	    default:
1029 		/* trouble */
1030 		assert(NOTREACHED);
1031 	}
1032     }
1033     arccount = j + 1;
1034 
1035     /*
1036      * Now merge into s' inchain.  Note that createarc() will put new arcs
1037      * onto the front of s's chain, so it does not break our walk through the
1038      * sorted part of the chain.
1039      */
1040     i = 0;
1041     na = s->ins;
1042     while (i < arccount && na != NULL) {
1043 	struct arc *a = arcarray[i];
1044 
1045 	switch (sortins_cmp(&a, &na)) {
1046 	    case -1:
1047 		/* s does not have anything matching a */
1048 		createarc(nfa, a->type, a->co, a->from, s);
1049 		i++;
1050 		break;
1051 	    case 0:
1052 		/* match, advance in both lists */
1053 		i++;
1054 		na = na->inchain;
1055 		break;
1056 	    case +1:
1057 		/* advance only na; array might have a match later */
1058 		na = na->inchain;
1059 		break;
1060 	    default:
1061 		assert(NOTREACHED);
1062 	}
1063     }
1064     while (i < arccount) {
1065 	/* s does not have anything matching a */
1066 	struct arc *a = arcarray[i];
1067 
1068 	createarc(nfa, a->type, a->co, a->from, s);
1069 	i++;
1070     }
1071 }
1072 
1073 /*
1074  - moveouts - move all out arcs of a state to another state
1075  ^ static VOID moveouts(struct nfa *, struct state *, struct state *);
1076  */
1077 static void
moveouts(struct nfa * nfa,struct state * oldState,struct state * newState)1078 moveouts(
1079     struct nfa *nfa,
1080     struct state *oldState,
1081     struct state *newState)
1082 {
1083     assert(oldState != newState);
1084 
1085     if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) {
1086 	/* With not too many arcs, just do them one at a time */
1087 	struct arc *a;
1088 
1089 	while ((a = oldState->outs) != NULL) {
1090 	    cparc(nfa, a, newState, a->to);
1091 	    freearc(nfa, a);
1092 	}
1093     } else {
1094 	/*
1095 	 * With many arcs, use a sort-merge approach.  Note that createarc()
1096 	 * will put new arcs onto the front of newState's chain, so it does
1097 	 * not break our walk through the sorted part of the chain.
1098 	 */
1099 	struct arc *oa;
1100 	struct arc *na;
1101 
1102 	/*
1103 	 * Because we bypass newarc() in this code path, we'd better include a
1104 	 * cancel check.
1105 	 */
1106 	if (CANCEL_REQUESTED(nfa->v->re)) {
1107 	    NERR(REG_CANCEL);
1108 	    return;
1109 	}
1110 
1111 	sortouts(nfa, oldState);
1112 	sortouts(nfa, newState);
1113 	if (NISERR()) {
1114 	    return;	/* might have failed to sort */
1115 	}
1116 	oa = oldState->outs;
1117 	na = newState->outs;
1118 	while (oa != NULL && na != NULL) {
1119 	    struct arc *a = oa;
1120 
1121 	    switch (sortouts_cmp(&oa, &na)) {
1122 		case -1:
1123 		    /* newState does not have anything matching oa */
1124 		    oa = oa->outchain;
1125 		    createarc(nfa, a->type, a->co, newState, a->to);
1126 		    freearc(nfa, a);
1127 		    break;
1128 		case 0:
1129 		    /* match, advance in both lists */
1130 		    oa = oa->outchain;
1131 		    na = na->outchain;
1132 		    /* ... and drop duplicate arc from oldState */
1133 		    freearc(nfa, a);
1134 		    break;
1135 		case +1:
1136 		    /* advance only na; oa might have a match later */
1137 		    na = na->outchain;
1138 		    break;
1139 		default:
1140 		    assert(NOTREACHED);
1141 	    }
1142 	}
1143 	while (oa != NULL) {
1144 	    /* newState does not have anything matching oa */
1145 	    struct arc *a = oa;
1146 
1147 	    oa = oa->outchain;
1148 	    createarc(nfa, a->type, a->co, newState, a->to);
1149 	    freearc(nfa, a);
1150 	}
1151     }
1152 
1153     assert(oldState->nouts == 0);
1154     assert(oldState->outs == NULL);
1155 }
1156 
1157 /*
1158  - copyouts - copy out arcs of a state to another state
1159  ^ static VOID copyouts(struct nfa *, struct state *, struct state *, int);
1160  */
1161 static void
copyouts(struct nfa * nfa,struct state * oldState,struct state * newState)1162 copyouts(
1163     struct nfa *nfa,
1164     struct state *oldState,
1165     struct state *newState)
1166 {
1167     assert(oldState != newState);
1168 
1169     if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) {
1170 	/* With not too many arcs, just do them one at a time */
1171 	struct arc *a;
1172 
1173 	for (a = oldState->outs; a != NULL; a = a->outchain) {
1174 	    cparc(nfa, a, newState, a->to);
1175 	}
1176     } else {
1177  	/*
1178 	 * With many arcs, use a sort-merge approach.  Note that createarc()
1179 	 * will put new arcs onto the front of newState's chain, so it does
1180 	 * not break our walk through the sorted part of the chain.
1181 	 */
1182 	struct arc *oa;
1183 	struct arc *na;
1184 
1185 	/*
1186 	 * Because we bypass newarc() in this code path, we'd better include a
1187 	 * cancel check.
1188 	 */
1189 	if (CANCEL_REQUESTED(nfa->v->re)) {
1190 	    NERR(REG_CANCEL);
1191 	    return;
1192 	}
1193 
1194 	sortouts(nfa, oldState);
1195 	sortouts(nfa, newState);
1196 	if (NISERR()) {
1197 	    return;		/* might have failed to sort */
1198 	}
1199 	oa = oldState->outs;
1200 	na = newState->outs;
1201 	while (oa != NULL && na != NULL) {
1202 	    struct arc *a = oa;
1203 
1204 	    switch (sortouts_cmp(&oa, &na)) {
1205 		case -1:
1206 		    /* newState does not have anything matching oa */
1207 		    oa = oa->outchain;
1208 		    createarc(nfa, a->type, a->co, newState, a->to);
1209 		    break;
1210 		case 0:
1211 		    /* match, advance in both lists */
1212 		    oa = oa->outchain;
1213 		    na = na->outchain;
1214 		    break;
1215 		case +1:
1216 		    /* advance only na; oa might have a match later */
1217 		    na = na->outchain;
1218 		    break;
1219 		default:
1220 		    assert(NOTREACHED);
1221 	    }
1222 	}
1223 	while (oa != NULL) {
1224 	    /* newState does not have anything matching oa */
1225 	    struct arc *a = oa;
1226 
1227 	    oa = oa->outchain;
1228 	    createarc(nfa, a->type, a->co, newState, a->to);
1229 	}
1230     }
1231 }
1232 
1233 /*
1234  - cloneouts - copy out arcs of a state to another state pair, modifying type
1235  ^ static VOID cloneouts(struct nfa *, struct state *, struct state *,
1236  ^ 	struct state *, int);
1237  */
1238 static void
cloneouts(struct nfa * nfa,struct state * old,struct state * from,struct state * to,int type)1239 cloneouts(
1240     struct nfa *nfa,
1241     struct state *old,
1242     struct state *from,
1243     struct state *to,
1244     int type)
1245 {
1246     struct arc *a;
1247 
1248     assert(old != from);
1249 
1250     for (a=old->outs ; a!=NULL ; a=a->outchain) {
1251 	newarc(nfa, type, a->co, from, to);
1252     }
1253 }
1254 
1255 /*
1256  - delsub - delete a sub-NFA, updating subre pointers if necessary
1257  * This uses a recursive traversal of the sub-NFA, marking already-seen
1258  * states using their tmp pointer.
1259  ^ static VOID delsub(struct nfa *, struct state *, struct state *);
1260  */
1261 static void
delsub(struct nfa * nfa,struct state * lp,struct state * rp)1262 delsub(
1263     struct nfa *nfa,
1264     struct state *lp,		/* the sub-NFA goes from here... */
1265     struct state *rp)		/* ...to here, *not* inclusive */
1266 {
1267     assert(lp != rp);
1268 
1269     rp->tmp = rp;		/* mark end */
1270 
1271     deltraverse(nfa, lp, lp);
1272     assert(lp->nouts == 0 && rp->nins == 0);	/* did the job */
1273     assert(lp->no != FREESTATE && rp->no != FREESTATE);	/* no more */
1274 
1275     rp->tmp = NULL;		/* unmark end */
1276     lp->tmp = NULL;		/* and begin, marked by deltraverse */
1277 }
1278 
1279 /*
1280  - deltraverse - the recursive heart of delsub
1281  * This routine's basic job is to destroy all out-arcs of the state.
1282  ^ static VOID deltraverse(struct nfa *, struct state *, struct state *);
1283  */
1284 static void
deltraverse(struct nfa * nfa,struct state * leftend,struct state * s)1285 deltraverse(
1286     struct nfa *nfa,
1287     struct state *leftend,
1288     struct state *s)
1289 {
1290     struct arc *a;
1291     struct state *to;
1292 
1293     if (s->nouts == 0) {
1294 	return;			/* nothing to do */
1295     }
1296     if (s->tmp != NULL) {
1297 	return;			/* already in progress */
1298     }
1299 
1300     s->tmp = s;			/* mark as in progress */
1301 
1302     while ((a = s->outs) != NULL) {
1303 	to = a->to;
1304 	deltraverse(nfa, leftend, to);
1305 	assert(to->nouts == 0 || to->tmp != NULL);
1306 	freearc(nfa, a);
1307 	if (to->nins == 0 && to->tmp == NULL) {
1308 	    assert(to->nouts == 0);
1309 	    freestate(nfa, to);
1310 	}
1311     }
1312 
1313     assert(s->no != FREESTATE);	/* we're still here */
1314     assert(s == leftend || s->nins != 0);	/* and still reachable */
1315     assert(s->nouts == 0);	/* but have no outarcs */
1316 
1317     s->tmp = NULL;		/* we're done here */
1318 }
1319 
1320 /*
1321  - dupnfa - duplicate sub-NFA
1322  * Another recursive traversal, this time using tmp to point to duplicates as
1323  * well as mark already-seen states. (You knew there was a reason why it's a
1324  * state pointer, didn't you? :-))
1325  ^ static VOID dupnfa(struct nfa *, struct state *, struct state *,
1326  ^ 	struct state *, struct state *);
1327  */
1328 static void
dupnfa(struct nfa * nfa,struct state * start,struct state * stop,struct state * from,struct state * to)1329 dupnfa(
1330     struct nfa *nfa,
1331     struct state *start,	/* duplicate of subNFA starting here */
1332     struct state *stop,		/* and stopping here */
1333     struct state *from,		/* stringing duplicate from here */
1334     struct state *to)		/* to here */
1335 {
1336     if (start == stop) {
1337 	newarc(nfa, EMPTY, 0, from, to);
1338 	return;
1339     }
1340 
1341     stop->tmp = to;
1342     duptraverse(nfa, start, from);
1343     /* done, except for clearing out the tmp pointers */
1344 
1345     stop->tmp = NULL;
1346     cleartraverse(nfa, start);
1347 }
1348 
1349 /*
1350  - duptraverse - recursive heart of dupnfa
1351  ^ static VOID duptraverse(struct nfa *, struct state *, struct state *);
1352  */
1353 static void
duptraverse(struct nfa * nfa,struct state * s,struct state * stmp)1354 duptraverse(
1355     struct nfa *nfa,
1356     struct state *s,
1357     struct state *stmp)		/* s's duplicate, or NULL */
1358 {
1359     struct arc *a;
1360 
1361     if (s->tmp != NULL) {
1362 	return;			/* already done */
1363     }
1364 
1365     s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
1366     if (s->tmp == NULL) {
1367 	assert(NISERR());
1368 	return;
1369     }
1370 
1371     for (a=s->outs ; a!=NULL && !NISERR() ; a=a->outchain) {
1372 	duptraverse(nfa, a->to, NULL);
1373 	if (NISERR()) {
1374 	    break;
1375 	}
1376 	assert(a->to->tmp != NULL);
1377 	cparc(nfa, a, s->tmp, a->to->tmp);
1378     }
1379 }
1380 
1381 /*
1382  - cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
1383  ^ static VOID cleartraverse(struct nfa *, struct state *);
1384  */
1385 static void
cleartraverse(struct nfa * nfa,struct state * s)1386 cleartraverse(
1387     struct nfa *nfa,
1388     struct state *s)
1389 {
1390     struct arc *a;
1391 
1392     if (s->tmp == NULL) {
1393 	return;
1394     }
1395     s->tmp = NULL;
1396 
1397     for (a=s->outs ; a!=NULL ; a=a->outchain) {
1398 	cleartraverse(nfa, a->to);
1399     }
1400 }
1401 
1402 /*
1403  - specialcolors - fill in special colors for an NFA
1404  ^ static VOID specialcolors(struct nfa *);
1405  */
1406 static void
specialcolors(struct nfa * nfa)1407 specialcolors(
1408     struct nfa *nfa)
1409 {
1410     /*
1411      * False colors for BOS, BOL, EOS, EOL
1412      */
1413 
1414     if (nfa->parent == NULL) {
1415 	nfa->bos[0] = pseudocolor(nfa->cm);
1416 	nfa->bos[1] = pseudocolor(nfa->cm);
1417 	nfa->eos[0] = pseudocolor(nfa->cm);
1418 	nfa->eos[1] = pseudocolor(nfa->cm);
1419     } else {
1420 	assert(nfa->parent->bos[0] != COLORLESS);
1421 	nfa->bos[0] = nfa->parent->bos[0];
1422 	assert(nfa->parent->bos[1] != COLORLESS);
1423 	nfa->bos[1] = nfa->parent->bos[1];
1424 	assert(nfa->parent->eos[0] != COLORLESS);
1425 	nfa->eos[0] = nfa->parent->eos[0];
1426 	assert(nfa->parent->eos[1] != COLORLESS);
1427 	nfa->eos[1] = nfa->parent->eos[1];
1428     }
1429 }
1430 
1431 /*
1432  - optimize - optimize an NFA
1433  ^ static long optimize(struct nfa *, FILE *);
1434  */
1435 
1436  /*
1437   * The main goal of this function is not so much "optimization" (though it
1438   * does try to get rid of useless NFA states) as reducing the NFA to a form
1439   * the regex executor can handle.  The executor, and indeed the cNFA format
1440   * that is its input, can only handle PLAIN and LACON arcs.  The output of
1441   * the regex parser also includes EMPTY (do-nothing) arcs, as well as
1442   * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here.
1443   * We first get rid of EMPTY arcs and then deal with the constraint arcs.
1444   * The hardest part of either job is to get rid of circular loops of the
1445   * target arc type.  We would have to do that in any case, though, as such a
1446   * loop would otherwise allow the executor to cycle through the loop endlessly
1447   * without making any progress in the input string.
1448   */
1449 static long			/* re_info bits */
optimize(struct nfa * nfa,FILE * f)1450 optimize(
1451     struct nfa *nfa,
1452     FILE *f)			/* for debug output; NULL none */
1453 {
1454     int verbose = (f != NULL) ? 1 : 0;
1455 
1456     if (verbose) {
1457 	fprintf(f, "\ninitial cleanup:\n");
1458     }
1459     cleanup(nfa);		/* may simplify situation */
1460     if (verbose) {
1461 	dumpnfa(nfa, f);
1462     }
1463     if (verbose) {
1464 	fprintf(f, "\nempties:\n");
1465     }
1466     fixempties(nfa, f);		/* get rid of EMPTY arcs */
1467     if (verbose) {
1468 	fprintf(f, "\nconstraints:\n");
1469     }
1470     fixconstraintloops(nfa, f);	/* get rid of constraint loops */
1471     pullback(nfa, f);		/* pull back constraints backward */
1472     pushfwd(nfa, f);		/* push fwd constraints forward */
1473     if (verbose) {
1474 	fprintf(f, "\nfinal cleanup:\n");
1475     }
1476     cleanup(nfa);		/* final tidying */
1477 #ifdef REG_DEBUG
1478     if (verbose) {
1479 	dumpnfa(nfa, f);
1480     }
1481 #endif
1482     return analyze(nfa);	/* and analysis */
1483 }
1484 
1485 /*
1486  - pullback - pull back constraints backward to eliminate them
1487  ^ static VOID pullback(struct nfa *, FILE *);
1488  */
1489 static void
pullback(struct nfa * nfa,FILE * f)1490 pullback(
1491     struct nfa *nfa,
1492     FILE *f)			/* for debug output; NULL none */
1493 {
1494     struct state *s;
1495     struct state *nexts;
1496     struct arc *a;
1497     struct arc *nexta;
1498     struct state *intermediates;
1499     int progress;
1500 
1501     /*
1502      * Find and pull until there are no more.
1503      */
1504 
1505     do {
1506 	progress = 0;
1507 	for (s=nfa->states ; s!=NULL && !NISERR() ; s=nexts) {
1508 	    nexts = s->next;
1509 	    intermediates = NULL;
1510 	    for (a=s->outs ; a!=NULL && !NISERR() ; a=nexta) {
1511 		nexta = a->outchain;
1512 		if (a->type == '^' || a->type == BEHIND) {
1513 		    if (pull(nfa, a, &intermediates)) {
1514 			progress = 1;
1515 		    }
1516 		}
1517 		assert(nexta == NULL || s->no != FREESTATE);
1518 	    }
1519 	    /* clear tmp fields of intermediate states created here */
1520 	    while (intermediates != NULL) {
1521 		struct state *ns = intermediates->tmp;
1522 
1523 		intermediates->tmp = NULL;
1524 		intermediates = ns;
1525 	    }
1526 	    /* if s is now useless, get rid of it */
1527 	    if ((s->nins == 0 || s->nouts == 0) && !s->flag) {
1528 		dropstate(nfa, s);
1529 	    }
1530 	}
1531 	if (progress && f != NULL) {
1532 	    dumpnfa(nfa, f);
1533 	}
1534     } while (progress && !NISERR());
1535     if (NISERR()) {
1536 	return;
1537     }
1538 
1539     /*
1540      * Any ^ constraints we were able to pull to the start state can now be
1541      * replaced by PLAIN arcs referencing the BOS or BOL colors.  There should
1542      * be no other ^ or BEHIND arcs left in the NFA, though we do not check
1543      * that here (compact() will fail if so).
1544      */
1545     for (a=nfa->pre->outs ; a!=NULL ; a=nexta) {
1546 	nexta = a->outchain;
1547 	if (a->type == '^') {
1548 	    assert(a->co == 0 || a->co == 1);
1549 	    newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
1550 	    freearc(nfa, a);
1551 	}
1552     }
1553 }
1554 
1555 /*
1556  - pull - pull a back constraint backward past its source state
1557  *
1558  * Returns 1 if successful (which it always is unless the source is the
1559  * start state or we have an internal error), 0 if nothing happened.
1560  *
1561  * A significant property of this function is that it deletes no pre-existing
1562  * states, and no outarcs of the constraint's from state other than the given
1563  * constraint arc.  This makes the loops in pullback() safe, at the cost that
1564  * we may leave useless states behind.  Therefore, we leave it to pullback()
1565  * to delete such states.
1566  *
1567  * If the from state has multiple back-constraint outarcs, and/or multiple
1568  * compatible constraint inarcs, we only need to create one new intermediate
1569  * state per combination of predecessor and successor states.  *intermediates
1570  * points to a list of such intermediate states for this from state (chained
1571  * through their tmp fields).
1572  ^ static int pull(struct nfa *, struct arc *);
1573  */
1574 static int
pull(struct nfa * nfa,struct arc * con,struct state ** intermediates)1575 pull(
1576     struct nfa *nfa,
1577     struct arc *con,
1578     struct state **intermediates)
1579 {
1580     struct state *from = con->from;
1581     struct state *to = con->to;
1582     struct arc *a;
1583     struct arc *nexta;
1584     struct state *s;
1585 
1586     assert(from != to);		/* should have gotten rid of this earlier */
1587     if (from->flag) {		/* can't pull back beyond start */
1588 	return 0;
1589     }
1590     if (from->nins == 0) {	/* unreachable */
1591 	freearc(nfa, con);
1592 	return 1;
1593     }
1594 
1595     /*
1596      * First, clone from state if necessary to avoid other outarcs.  This may
1597      * seem wasteful, but it simplifies the logic, and we'll get rid of the
1598      * clone state again at the bottom.
1599      */
1600 
1601     if (from->nouts > 1) {
1602 	s = newstate(nfa);
1603 	if (NISERR()) {
1604 	    return 0;
1605 	}
1606 	copyins(nfa, from, s);	/* duplicate inarcs */
1607 	cparc(nfa, con, s, to);		/* move constraint arc */
1608 	freearc(nfa, con);
1609 	if (NISERR()) {
1610 	    return 0;
1611 	}
1612 	from = s;
1613 	con = from->outs;
1614     }
1615     assert(from->nouts == 1);
1616 
1617     /*
1618      * Propagate the constraint into the from state's inarcs.
1619      */
1620 
1621     for (a=from->ins ; a!=NULL && !NISERR(); a=nexta) {
1622 	nexta = a->inchain;
1623 	switch (combine(con, a)) {
1624 	case INCOMPATIBLE:	/* destroy the arc */
1625 	    freearc(nfa, a);
1626 	    break;
1627 	case SATISFIED:		/* no action needed */
1628 	    break;
1629 	case COMPATIBLE:	/* swap the two arcs, more or less */
1630 	    /* need an intermediate state, but might have one already */
1631 	    for (s = *intermediates; s != NULL; s = s->tmp) {
1632 		assert(s->nins > 0 && s->nouts > 0);
1633 		if (s->ins->from == a->from && s->outs->to == to) {
1634 		    break;
1635 		}
1636 	    }
1637 	    if (s == NULL) {
1638 		s = newstate(nfa);
1639 		if (NISERR()) {
1640 		    return 0;
1641 		}
1642 		s->tmp = *intermediates;
1643 		*intermediates = s;
1644 	    }
1645   	    cparc(nfa, con, a->from, s);
1646 	    cparc(nfa, a, s, to);
1647  	    freearc(nfa, a);
1648   	    break;
1649 	default:
1650 	    assert(NOTREACHED);
1651 	    break;
1652 	}
1653     }
1654 
1655     /*
1656      * Remaining inarcs, if any, incorporate the constraint.
1657      */
1658 
1659     moveins(nfa, from, to);
1660     freearc(nfa, con);
1661     /* from state is now useless, but we leave it to pullback() to clean up */
1662     return 1;
1663 }
1664 
1665 /*
1666  - pushfwd - push forward constraints forward to eliminate them
1667  ^ static VOID pushfwd(struct nfa *, FILE *);
1668  */
1669 static void
pushfwd(struct nfa * nfa,FILE * f)1670 pushfwd(
1671     struct nfa *nfa,
1672     FILE *f)			/* for debug output; NULL none */
1673 {
1674     struct state *s;
1675     struct state *nexts;
1676     struct arc *a;
1677     struct arc *nexta;
1678     struct state *intermediates;
1679     int progress;
1680 
1681     /*
1682      * Find and push until there are no more.
1683      */
1684 
1685     do {
1686 	progress = 0;
1687 	for (s=nfa->states ; s!=NULL && !NISERR() ; s=nexts) {
1688 	    nexts = s->next;
1689 	    intermediates = NULL;
1690 	    for (a = s->ins; a != NULL && !NISERR(); a = nexta) {
1691 		nexta = a->inchain;
1692 		if (a->type == '$' || a->type == AHEAD) {
1693 		    if (push(nfa, a, &intermediates)) {
1694 			progress = 1;
1695 		    }
1696 		}
1697 	    }
1698 	    /* clear tmp fields of intermediate states created here */
1699 	    while (intermediates != NULL) {
1700 		struct state *ns = intermediates->tmp;
1701 
1702 		intermediates->tmp = NULL;
1703 		intermediates = ns;
1704 	    }
1705 	    /* if s is now useless, get rid of it */
1706 	    if ((s->nins == 0 || s->nouts == 0) && !s->flag) {
1707 		dropstate(nfa, s);
1708 	    }
1709 	}
1710 	if (progress && f != NULL) {
1711 	    dumpnfa(nfa, f);
1712 	}
1713     } while (progress && !NISERR());
1714     if (NISERR()) {
1715 	return;
1716     }
1717 
1718     /*
1719      * Any $ constraints we were able to push to the post state can now be
1720      * replaced by PLAIN arcs referencing the EOS or EOL colors.  There should
1721      * be no other $ or AHEAD arcs left in the NFA, though we do not check
1722      * that here (compact() will fail if so).
1723      */
1724     for (a = nfa->post->ins; a != NULL; a = nexta) {
1725 	nexta = a->inchain;
1726 	if (a->type == '$') {
1727 	    assert(a->co == 0 || a->co == 1);
1728 	    newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
1729 	    freearc(nfa, a);
1730 	}
1731     }
1732 }
1733 
1734 /*
1735  - push - push a forward constraint forward past its destination state
1736  *
1737  * Returns 1 if successful (which it always is unless the destination is the
1738  * post state or we have an internal error), 0 if nothing happened.
1739  *
1740  * A significant property of this function is that it deletes no pre-existing
1741  * states, and no inarcs of the constraint's to state other than the given
1742  * constraint arc.  This makes the loops in pushfwd() safe, at the cost that
1743  * we may leave useless states behind.  Therefore, we leave it to pushfwd()
1744  * to delete such states.
1745  *
1746  * If the to state has multiple forward-constraint inarcs, and/or multiple
1747  * compatible constraint outarcs, we only need to create one new intermediate
1748  * state per combination of predecessor and successor states.  *intermediates
1749  * points to a list of such intermediate states for this to state (chained
1750  * through their tmp fields).
1751  ^ static int push(struct nfa *, struct arc *);
1752  */
1753 static int
push(struct nfa * nfa,struct arc * con,struct state ** intermediates)1754 push(
1755     struct nfa *nfa,
1756     struct arc *con,
1757     struct state **intermediates)
1758 {
1759     struct state *from = con->from;
1760     struct state *to = con->to;
1761     struct arc *a;
1762     struct arc *nexta;
1763     struct state *s;
1764 
1765     assert(to != from);		/* should have gotten rid of this earlier */
1766     if (to->flag) {		/* can't push forward beyond end */
1767 	return 0;
1768     }
1769     if (to->nouts == 0) {	/* dead end */
1770 	freearc(nfa, con);
1771 	return 1;
1772     }
1773 
1774     /*
1775      * First, clone to state if necessary to avoid other inarcs.  This may
1776      * seem wasteful, but it simplifies the logic, and we'll get rid of the
1777      * clone state again at the bottom.
1778      */
1779 
1780     if (to->nins > 1) {
1781 	s = newstate(nfa);
1782 	if (NISERR()) {
1783 	    return 0;
1784 	}
1785 	copyouts(nfa, to, s);		/* duplicate outarcs */
1786 	cparc(nfa, con, from, s);	/* move constraint arc */
1787 	freearc(nfa, con);
1788 	if (NISERR()) {
1789 	    return 0;
1790 	}
1791 	to = s;
1792 	con = to->ins;
1793     }
1794     assert(to->nins == 1);
1795 
1796     /*
1797      * Propagate the constraint into the to state's outarcs.
1798      */
1799 
1800     for (a = to->outs; a != NULL && !NISERR(); a = nexta) {
1801 	nexta = a->outchain;
1802 	switch (combine(con, a)) {
1803 	case INCOMPATIBLE:	/* destroy the arc */
1804 	    freearc(nfa, a);
1805 	    break;
1806 	case SATISFIED:		/* no action needed */
1807 	    break;
1808 	case COMPATIBLE:	/* swap the two arcs, more or less */
1809 	    /* need an intermediate state, but might have one already */
1810 	    for (s = *intermediates; s != NULL; s = s->tmp) {
1811 		assert(s->nins > 0 && s->nouts > 0);
1812 		if (s->ins->from == from && s->outs->to == a->to) {
1813 		    break;
1814 		}
1815 	    }
1816 	    if (s == NULL) {
1817 		s = newstate(nfa);
1818 		if (NISERR()) {
1819 		    return 0;
1820 		}
1821 		s->tmp = *intermediates;
1822 		*intermediates = s;
1823 	    }
1824 	    cparc(nfa, con, s, a->to);
1825   	    cparc(nfa, a, from, s);
1826   	    freearc(nfa, a);
1827   	    break;
1828 	default:
1829 	    assert(NOTREACHED);
1830 	    break;
1831 	}
1832     }
1833 
1834     /*
1835      * Remaining outarcs, if any, incorporate the constraint.
1836      */
1837 
1838     moveouts(nfa, to, from);
1839     freearc(nfa, con);
1840     /* to state is now useless, but we leave it to pushfwd() to clean up */
1841     return 1;
1842 }
1843 
1844 /*
1845  - combine - constraint lands on an arc, what happens?
1846  ^ #def	INCOMPATIBLE	1	// destroys arc
1847  ^ #def	SATISFIED	2	// constraint satisfied
1848  ^ #def	COMPATIBLE	3	// compatible but not satisfied yet
1849  ^ static int combine(struct arc *, struct arc *);
1850  */
1851 static int
combine(struct arc * con,struct arc * a)1852 combine(
1853     struct arc *con,
1854     struct arc *a)
1855 {
1856 #define CA(ct,at)	(((ct)<<CHAR_BIT) | (at))
1857 
1858     switch (CA(con->type, a->type)) {
1859     case CA('^', PLAIN):	/* newlines are handled separately */
1860     case CA('$', PLAIN):
1861 	return INCOMPATIBLE;
1862 	break;
1863     case CA(AHEAD, PLAIN):	/* color constraints meet colors */
1864     case CA(BEHIND, PLAIN):
1865 	if (con->co == a->co) {
1866 	    return SATISFIED;
1867 	}
1868 	return INCOMPATIBLE;
1869 	break;
1870     case CA('^', '^'):		/* collision, similar constraints */
1871     case CA('$', '$'):
1872     case CA(AHEAD, AHEAD):
1873     case CA(BEHIND, BEHIND):
1874 	if (con->co == a->co) {	/* true duplication */
1875 	    return SATISFIED;
1876 	}
1877 	return INCOMPATIBLE;
1878 	break;
1879     case CA('^', BEHIND):	/* collision, dissimilar constraints */
1880     case CA(BEHIND, '^'):
1881     case CA('$', AHEAD):
1882     case CA(AHEAD, '$'):
1883 	return INCOMPATIBLE;
1884 	break;
1885     case CA('^', '$'):		/* constraints passing each other */
1886     case CA('^', AHEAD):
1887     case CA(BEHIND, '$'):
1888     case CA(BEHIND, AHEAD):
1889     case CA('$', '^'):
1890     case CA('$', BEHIND):
1891     case CA(AHEAD, '^'):
1892     case CA(AHEAD, BEHIND):
1893     case CA('^', LACON):
1894     case CA(BEHIND, LACON):
1895     case CA('$', LACON):
1896     case CA(AHEAD, LACON):
1897 	return COMPATIBLE;
1898 	break;
1899     }
1900     assert(NOTREACHED);
1901     return INCOMPATIBLE;	/* for benefit of blind compilers */
1902 }
1903 
1904 /*
1905  - fixempties - get rid of EMPTY arcs
1906  ^ static VOID fixempties(struct nfa *, FILE *);
1907  */
1908 static void
fixempties(struct nfa * nfa,FILE * f)1909 fixempties(
1910     struct nfa *nfa,
1911     FILE *f)			/* for debug output; NULL none */
1912 {
1913     struct state *s;
1914     struct state *s2;
1915     struct state *nexts;
1916     struct arc *a;
1917     struct arc *nexta;
1918     int totalinarcs;
1919     struct arc **inarcsorig;
1920     struct arc **arcarray;
1921     int arccount;
1922     int prevnins;
1923     int nskip;
1924 
1925     /*
1926      * First, get rid of any states whose sole out-arc is an EMPTY,
1927      * since they're basically just aliases for their successor.  The
1928      * parsing algorithm creates enough of these that it's worth
1929      * special-casing this.
1930      */
1931     for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
1932 	nexts = s->next;
1933 	if (s->flag || s->nouts != 1) {
1934 	    continue;
1935 	}
1936 	a = s->outs;
1937 	assert(a != NULL && a->outchain == NULL);
1938 	if (a->type != EMPTY) {
1939 	    continue;
1940 	}
1941 	if (s != a->to) {
1942 	    moveins(nfa, s, a->to);
1943 	}
1944 	dropstate(nfa, s);
1945     }
1946 
1947     /*
1948      * Similarly, get rid of any state with a single EMPTY in-arc, by
1949      * folding it into its predecessor.
1950      */
1951     for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
1952 	nexts = s->next;
1953 	/* Ensure tmp fields are clear for next step */
1954 	assert(s->tmp == NULL);
1955 	if (s->flag || s->nins != 1) {
1956 	    continue;
1957 	}
1958 	a = s->ins;
1959 	assert(a != NULL && a->inchain == NULL);
1960 	if (a->type != EMPTY) {
1961 	    continue;
1962 	}
1963 	if (s != a->from) {
1964 	    moveouts(nfa, s, a->from);
1965 	}
1966 	dropstate(nfa, s);
1967     }
1968 
1969     if (NISERR()) {
1970 	return;
1971     }
1972 
1973     /*
1974      * For each remaining NFA state, find all other states from which it is
1975      * reachable by a chain of one or more EMPTY arcs.  Then generate new arcs
1976      * that eliminate the need for each such chain.
1977      *
1978      * We could replace a chain of EMPTY arcs that leads from a "from" state
1979      * to a "to" state either by pushing non-EMPTY arcs forward (linking
1980      * directly from "from"'s predecessors to "to") or by pulling them back
1981      * (linking directly from "from" to "to"'s successors).  We choose to
1982      * always do the former; this choice is somewhat arbitrary, but the
1983      * approach below requires that we uniformly do one or the other.
1984      *
1985      * Suppose we have a chain of N successive EMPTY arcs (where N can easily
1986      * approach the size of the NFA).  All of the intermediate states must
1987      * have additional inarcs and outarcs, else they'd have been removed by
1988      * the steps above.  Assuming their inarcs are mostly not empties, we will
1989      * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
1990      * state in the chain must be duplicated to lead to all its successor
1991      * states as well.  So there is no hope of doing less than O(N^2) work;
1992      * however, we should endeavor to keep the big-O cost from being even
1993      * worse than that, which it can easily become without care.  In
1994      * particular, suppose we were to copy all S1's inarcs forward to S2, and
1995      * then also to S3, and then later we consider pushing S2's inarcs forward
1996      * to S3.  If we include the arcs already copied from S1 in that, we'd be
1997      * doing O(N^3) work.  (The duplicate-arc elimination built into newarc()
1998      * and its cohorts would get rid of the extra arcs, but not without cost.)
1999      *
2000      * We can avoid this cost by treating only arcs that existed at the start
2001      * of this phase as candidates to be pushed forward.  To identify those,
2002      * we remember the first inarc each state had to start with.  We rely on
2003      * the fact that newarc() and friends put new arcs on the front of their
2004      * to-states' inchains, and that this phase never deletes arcs, so that
2005      * the original arcs must be the last arcs in their to-states' inchains.
2006      *
2007      * So the process here is that, for each state in the NFA, we gather up
2008      * all non-EMPTY inarcs of states that can reach the target state via
2009      * EMPTY arcs.  We then sort, de-duplicate, and merge these arcs into the
2010      * target state's inchain.  (We can safely use sort-merge for this as long
2011      * as we update each state's original-arcs pointer after we add arcs to
2012      * it; the sort step of mergeins probably changed the order of the old
2013      * arcs.)
2014      *
2015      * Another refinement worth making is that, because we only add non-EMPTY
2016      * arcs during this phase, and all added arcs have the same from-state as
2017      * the non-EMPTY arc they were cloned from, we know ahead of time that any
2018      * states having only EMPTY outarcs will be useless for lack of outarcs
2019      * after we drop the EMPTY arcs.  (They cannot gain non-EMPTY outarcs if
2020      * they had none to start with.)  So we need not bother to update the
2021      * inchains of such states at all.
2022      */
2023 
2024     /* Remember the states' first original inarcs */
2025     /* ... and while at it, count how many old inarcs there are altogether */
2026     inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
2027     if (inarcsorig == NULL) {
2028 	NERR(REG_ESPACE);
2029 	return;
2030     }
2031     totalinarcs = 0;
2032     for (s = nfa->states; s != NULL; s = s->next) {
2033 	inarcsorig[s->no] = s->ins;
2034 	totalinarcs += s->nins;
2035     }
2036 
2037     /*
2038      * Create a workspace for accumulating the inarcs to be added to the
2039      * current target state.  totalinarcs is probably a considerable
2040      * overestimate of the space needed, but the NFA is unlikely to be large
2041      * enough at this point to make it worth being smarter.
2042      */
2043     arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
2044     if (arcarray == NULL) {
2045 	NERR(REG_ESPACE);
2046 	FREE(inarcsorig);
2047 	return;
2048     }
2049 
2050     /* And iterate over the target states */
2051     for (s = nfa->states; s != NULL && !NISERR(); s = s->next) {
2052 	/* Ignore target states without non-EMPTY outarcs, per note above */
2053 	if (!s->flag && !hasnonemptyout(s)) {
2054 	    continue;
2055 	}
2056 
2057 	/* Find predecessor states and accumulate their original inarcs */
2058 	arccount = 0;
2059 	for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts) {
2060 	    /* Add s2's original inarcs to arcarray[], but ignore empties */
2061 	    for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain) {
2062 		if (a->type != EMPTY) {
2063 		    arcarray[arccount++] = a;
2064 		}
2065 	    }
2066 
2067   	    /* Reset the tmp fields as we walk back */
2068   	    nexts = s2->tmp;
2069   	    s2->tmp = NULL;
2070   	}
2071   	s->tmp = NULL;
2072 	assert(arccount <= totalinarcs);
2073 
2074 	/* Remember how many original inarcs this state has */
2075 	prevnins = s->nins;
2076 
2077 	/* Add non-duplicate inarcs to target state */
2078 	mergeins(nfa, s, arcarray, arccount);
2079 
2080 	/* Now we must update the state's inarcsorig pointer */
2081 	nskip = s->nins - prevnins;
2082 	a = s->ins;
2083 	while (nskip-- > 0) {
2084 	    a = a->inchain;
2085 	}
2086 	inarcsorig[s->no] = a;
2087     }
2088 
2089     FREE(arcarray);
2090     FREE(inarcsorig);
2091 
2092     if (NISERR()) {
2093 	return;
2094     }
2095 
2096     /*
2097      * Remove all the EMPTY arcs, since we don't need them anymore.
2098      */
2099     for (s = nfa->states; s != NULL; s = s->next) {
2100 	for (a = s->outs; a != NULL; a = nexta) {
2101 	    nexta = a->outchain;
2102 	    if (a->type == EMPTY) {
2103 		freearc(nfa, a);
2104 	    }
2105 	}
2106     }
2107 
2108     /*
2109      * And remove any states that have become useless.  (This cleanup is
2110      * not very thorough, and would be even less so if we tried to
2111      * combine it with the previous step; but cleanup() will take care
2112      * of anything we miss.)
2113      */
2114     for (s = nfa->states; s != NULL; s = nexts) {
2115 	nexts = s->next;
2116 	if ((s->nins == 0 || s->nouts == 0) && !s->flag) {
2117 	    dropstate(nfa, s);
2118 	}
2119     }
2120 
2121     if (f != NULL) {
2122 	dumpnfa(nfa, f);
2123     }
2124 }
2125 
2126 /*
2127  - emptyreachable - recursively find all states that can reach s by EMPTY arcs
2128  * The return value is the last such state found.  Its tmp field links back
2129  * to the next-to-last such state, and so on back to s, so that all these
2130  * states can be located without searching the whole NFA.
2131  *
2132  * Since this is only used in fixempties(), we pass in the inarcsorig[] array
2133  * maintained by that function.  This lets us skip over all new inarcs, which
2134  * are certainly not EMPTY arcs.
2135  *
2136  * The maximum recursion depth here is equal to the length of the longest
2137  * loop-free chain of EMPTY arcs, which is surely no more than the size of
2138  * the NFA, and in practice will be less than that.
2139  ^ static struct state *emptyreachable(struct state *, struct state *);
2140  */
2141 static struct state *
emptyreachable(struct nfa * nfa,struct state * s,struct state * lastfound,struct arc ** inarcsorig)2142 emptyreachable(
2143     struct nfa *nfa,
2144     struct state *s,
2145     struct state *lastfound,
2146     struct arc **inarcsorig)
2147 {
2148     struct arc *a;
2149 
2150     s->tmp = lastfound;
2151     lastfound = s;
2152     for (a = inarcsorig[s->no]; a != NULL; a = a->inchain) {
2153 	if (a->type == EMPTY && a->from->tmp == NULL) {
2154 	    lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig);
2155 	}
2156     }
2157     return lastfound;
2158 }
2159 
2160 /*
2161  * isconstraintarc - detect whether an arc is of a constraint type
2162  */
2163 static inline int
isconstraintarc(struct arc * a)2164 isconstraintarc(struct arc * a)
2165 {
2166     switch (a->type)
2167     {
2168 	case '^':
2169 	case '$':
2170 	case BEHIND:
2171 	case AHEAD:
2172 	case LACON:
2173 	    return 1;
2174     }
2175     return 0;
2176 }
2177 
2178 /*
2179  * hasconstraintout - does state have a constraint out arc?
2180  */
2181 static int
hasconstraintout(struct state * s)2182 hasconstraintout(struct state * s)
2183 {
2184     struct arc *a;
2185 
2186     for (a = s->outs; a != NULL; a = a->outchain) {
2187 	if (isconstraintarc(a)) {
2188 	    return 1;
2189 	}
2190     }
2191     return 0;
2192 }
2193 
2194 /*
2195  * fixconstraintloops - get rid of loops containing only constraint arcs
2196  *
2197  * A loop of states that contains only constraint arcs is useless, since
2198  * passing around the loop represents no forward progress.  Moreover, it
2199  * would cause infinite looping in pullback/pushfwd, so we need to get rid
2200  * of such loops before doing that.
2201  */
2202 static void
fixconstraintloops(struct nfa * nfa,FILE * f)2203 fixconstraintloops(
2204     struct nfa * nfa,
2205     FILE *f)		/* for debug output; NULL none */
2206 {
2207     struct state *s;
2208     struct state *nexts;
2209     struct arc *a;
2210     struct arc *nexta;
2211     int hasconstraints;
2212 
2213     /*
2214      * In the trivial case of a state that loops to itself, we can just drop
2215      * the constraint arc altogether.  This is worth special-casing because
2216      * such loops are far more common than loops containing multiple states.
2217      * While we're at it, note whether any constraint arcs survive.
2218      */
2219     hasconstraints = 0;
2220     for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
2221 	nexts = s->next;
2222 	/* while we're at it, ensure tmp fields are clear for next step */
2223 	assert(s->tmp == NULL);
2224 	for (a = s->outs; a != NULL && !NISERR(); a = nexta) {
2225 	    nexta = a->outchain;
2226 	    if (isconstraintarc(a)) {
2227 		if (a->to == s) {
2228 		    freearc(nfa, a);
2229 		} else {
2230 		    hasconstraints = 1;
2231  		}
2232 	    }
2233 	}
2234  	/* If we removed all the outarcs, the state is useless. */
2235  	if (s->nouts == 0 && !s->flag) {
2236  	    dropstate(nfa, s);
2237 	}
2238     }
2239 
2240     /* Nothing to do if no remaining constraint arcs */
2241     if (NISERR() || !hasconstraints) {
2242 	return;
2243     }
2244 
2245     /*
2246      * Starting from each remaining NFA state, search outwards for a
2247      * constraint loop.  If we find a loop, break the loop, then start the
2248      * search over.  (We could possibly retain some state from the first scan,
2249      * but it would complicate things greatly, and multi-state constraint
2250      * loops are rare enough that it's not worth optimizing the case.)
2251      */
2252   restart:
2253     for (s = nfa->states; s != NULL && !NISERR(); s = s->next) {
2254 	if (findconstraintloop(nfa, s)) {
2255 	    goto restart;
2256 	}
2257     }
2258 
2259     if (NISERR()) {
2260 	return;
2261     }
2262 
2263     /*
2264      * Now remove any states that have become useless.  (This cleanup is not
2265      * very thorough, and would be even less so if we tried to combine it with
2266      * the previous step; but cleanup() will take care of anything we miss.)
2267      *
2268      * Because findconstraintloop intentionally doesn't reset all tmp fields,
2269      * we have to clear them after it's done.  This is a convenient place to
2270      * do that, too.
2271      */
2272     for (s = nfa->states; s != NULL; s = nexts) {
2273 	nexts = s->next;
2274 	s->tmp = NULL;
2275 	if ((s->nins == 0 || s->nouts == 0) && !s->flag) {
2276 	    dropstate(nfa, s);
2277 	}
2278     }
2279 
2280     if (f != NULL) {
2281  	dumpnfa(nfa, f);
2282     }
2283 }
2284 
2285 /*
2286  * findconstraintloop - recursively find a loop of constraint arcs
2287  *
2288  * If we find a loop, break it by calling breakconstraintloop(), then
2289  * return 1; otherwise return 0.
2290  *
2291  * State tmp fields are guaranteed all NULL on a success return, because
2292  * breakconstraintloop does that.  After a failure return, any state that
2293  * is known not to be part of a loop is marked with s->tmp == s; this allows
2294  * us not to have to re-prove that fact on later calls.  (This convention is
2295  * workable because we already eliminated single-state loops.)
2296  *
2297  * Note that the found loop doesn't necessarily include the first state we
2298  * are called on.  Any loop reachable from that state will do.
2299  *
2300  * The maximum recursion depth here is one more than the length of the longest
2301  * loop-free chain of constraint arcs, which is surely no more than the size
2302  * of the NFA, and in practice will be a lot less than that.
2303  */
2304 static int
findconstraintloop(struct nfa * nfa,struct state * s)2305 findconstraintloop(struct nfa * nfa, struct state * s)
2306 {
2307     struct arc *a;
2308 
2309     /* Since this is recursive, it could be driven to stack overflow */
2310     if (STACK_TOO_DEEP(nfa->v->re)) {
2311 	NERR(REG_ETOOBIG);
2312 	return 1;		/* to exit as quickly as possible */
2313     }
2314 
2315     if (s->tmp != NULL) {
2316 	/* Already proven uninteresting? */
2317 	if (s->tmp == s) {
2318 	    return 0;
2319 	}
2320 	/* Found a loop involving s */
2321 	breakconstraintloop(nfa, s);
2322 	/* The tmp fields have been cleaned up by breakconstraintloop */
2323 	return 1;
2324     }
2325     for (a = s->outs; a != NULL; a = a->outchain) {
2326 	if (isconstraintarc(a)) {
2327 	    struct state *sto = a->to;
2328 
2329 	    assert(sto != s);
2330 	    s->tmp = sto;
2331 	    if (findconstraintloop(nfa, sto)) {
2332 		return 1;
2333 	    }
2334 	}
2335     }
2336 
2337     /*
2338      * If we get here, no constraint loop exists leading out from s.  Mark it
2339      * with s->tmp == s so we need not rediscover that fact again later.
2340      */
2341     s->tmp = s;
2342     return 0;
2343 }
2344 
2345 /*
2346  * breakconstraintloop - break a loop of constraint arcs
2347  *
2348  * sinitial is any one member state of the loop.  Each loop member's tmp
2349  * field links to its successor within the loop.  (Note that this function
2350  * will reset all the tmp fields to NULL.)
2351  *
2352  * We can break the loop by, for any one state S1 in the loop, cloning its
2353  * loop successor state S2 (and possibly following states), and then moving
2354  * all S1->S2 constraint arcs to point to the cloned S2.  The cloned S2 should
2355  * copy any non-constraint outarcs of S2.  Constraint outarcs should be
2356  * dropped if they point back to S1, else they need to be copied as arcs to
2357  * similarly cloned states S3, S4, etc.  In general, each cloned state copies
2358  * non-constraint outarcs, drops constraint outarcs that would lead to itself
2359  * or any earlier cloned state, and sends other constraint outarcs to newly
2360  * cloned states.  No cloned state will have any inarcs that aren't constraint
2361  * arcs or do not lead from S1 or earlier-cloned states.  It's okay to drop
2362  * constraint back-arcs since they would not take us to any state we've not
2363  * already been in; therefore, no new constraint loop is created.  In this way
2364  * we generate a modified NFA that can still represent every useful state
2365  * sequence, but not sequences that represent state loops with no consumption
2366  * of input data.  Note that the set of cloned states will certainly include
2367  * all of the loop member states other than S1, and it may also include
2368  * non-loop states that are reachable from S2 via constraint arcs.  This is
2369  * important because there is no guarantee that findconstraintloop found a
2370  * maximal loop (and searching for one would be NP-hard, so don't try).
2371  * Frequently the "non-loop states" are actually part of a larger loop that
2372  * we didn't notice, and indeed there may be several overlapping loops.
2373  * This technique ensures convergence in such cases, while considering only
2374  * the originally-found loop does not.
2375  *
2376  * If there is only one S1->S2 constraint arc, then that constraint is
2377  * certainly satisfied when we enter any of the clone states.  This means that
2378  * in the common case where many of the constraint arcs are identically
2379  * labeled, we can merge together clone states linked by a similarly-labeled
2380  * constraint: if we can get to the first one we can certainly get to the
2381  * second, so there's no need to distinguish.  This greatly reduces the number
2382  * of new states needed, so we preferentially break the given loop at a state
2383  * pair where this is true.
2384  *
2385  * Furthermore, it's fairly common to find that a cloned successor state has
2386  * no outarcs, especially if we're a bit aggressive about removing unnecessary
2387  * outarcs.  If that happens, then there is simply not any interesting state
2388  * that can be reached through the predecessor's loop arcs, which means we can
2389  * break the loop just by removing those loop arcs, with no new states added.
2390  */
2391 static void
breakconstraintloop(struct nfa * nfa,struct state * sinitial)2392 breakconstraintloop(struct nfa * nfa, struct state * sinitial)
2393 {
2394     struct state *s;
2395     struct state *shead;
2396     struct state *stail;
2397     struct state *sclone;
2398     struct state *nexts;
2399     struct arc *refarc;
2400     struct arc *a;
2401     struct arc *nexta;
2402 
2403     /*
2404      * Start by identifying which loop step we want to break at.
2405      * Preferentially this is one with only one constraint arc.  (XXX are
2406      * there any other secondary heuristics we want to use here?)  Set refarc
2407      * to point to the selected lone constraint arc, if there is one.
2408      */
2409     refarc = NULL;
2410     s = sinitial;
2411     do {
2412 	nexts = s->tmp;
2413 	assert(nexts != s);	/* should not see any one-element loops */
2414 	if (refarc == NULL) {
2415 	    int narcs = 0;
2416 
2417 	    for (a = s->outs; a != NULL; a = a->outchain) {
2418 		if (a->to == nexts && isconstraintarc(a)) {
2419 		    refarc = a;
2420 		    narcs++;
2421 		}
2422 	    }
2423 	    assert(narcs > 0);
2424 	    if (narcs > 1) {
2425 		refarc = NULL;	/* multiple constraint arcs here, no good */
2426 	    }
2427 	}
2428 	s = nexts;
2429     } while (s != sinitial);
2430 
2431     if (refarc) {
2432 	/* break at the refarc */
2433 	shead = refarc->from;
2434 	stail = refarc->to;
2435 	assert(stail == shead->tmp);
2436     } else {
2437 	/* for lack of a better idea, break after sinitial */
2438 	shead = sinitial;
2439 	stail = sinitial->tmp;
2440     }
2441 
2442     /*
2443      * Reset the tmp fields so that we can use them for local storage in
2444      * clonesuccessorstates.  (findconstraintloop won't mind, since it's just
2445      * going to abandon its search anyway.)
2446      */
2447     for (s = nfa->states; s != NULL; s = s->next) {
2448 	s->tmp = NULL;
2449     }
2450 
2451     /*
2452      * Recursively build clone state(s) as needed.
2453      */
2454     sclone = newstate(nfa);
2455     if (sclone == NULL) {
2456 	assert(NISERR());
2457 	return;
2458     }
2459 
2460     clonesuccessorstates(nfa, stail, sclone, shead, refarc,
2461 	    NULL, NULL, nfa->nstates);
2462 
2463     if (NISERR()) {
2464 	return;
2465     }
2466 
2467     /*
2468      * It's possible that sclone has no outarcs at all, in which case it's
2469      * useless.  (We don't try extremely hard to get rid of useless states
2470      * here, but this is an easy and fairly common case.)
2471      */
2472     if (sclone->nouts == 0) {
2473 	freestate(nfa, sclone);
2474 	sclone = NULL;
2475     }
2476 
2477     /*
2478      * Move shead's constraint-loop arcs to point to sclone, or just drop them
2479      * if we discovered we don't need sclone.
2480      */
2481     for (a = shead->outs; a != NULL; a = nexta) {
2482 	nexta = a->outchain;
2483 	if (a->to == stail && isconstraintarc(a)) {
2484 	    if (sclone) {
2485 		cparc(nfa, a, shead, sclone);
2486 	    }
2487 	    freearc(nfa, a);
2488 	    if (NISERR()) {
2489 		break;
2490 	    }
2491 	}
2492     }
2493 }
2494 
2495 /*
2496  * clonesuccessorstates - create a tree of constraint-arc successor states
2497  *
2498  * ssource is the state to be cloned, and sclone is the state to copy its
2499  * outarcs into.  sclone's inarcs, if any, should already be set up.
2500  *
2501  * spredecessor is the original predecessor state that we are trying to build
2502  * successors for (it may not be the immediate predecessor of ssource).
2503  * refarc, if not NULL, is the original constraint arc that is known to have
2504  * been traversed out of spredecessor to reach the successor(s).
2505  *
2506  * For each cloned successor state, we transiently create a "donemap" that is
2507  * a boolean array showing which source states we've already visited for this
2508  * clone state.  This prevents infinite recursion as well as useless repeat
2509  * visits to the same state subtree (which can add up fast, since typical NFAs
2510  * have multiple redundant arc pathways).  Each donemap is a char array
2511  * indexed by state number.  The donemaps are all of the same size "nstates",
2512  * which is nfa->nstates as of the start of the recursion.  This is enough to
2513  * have entries for all pre-existing states, but *not* entries for clone
2514  * states created during the recursion.  That's okay since we have no need to
2515  * mark those.
2516  *
2517  * curdonemap is NULL when recursing to a new sclone state, or sclone's
2518  * donemap when we are recursing without having created a new state (which we
2519  * do when we decide we can merge a successor state into the current clone
2520  * state).  outerdonemap is NULL at the top level and otherwise the parent
2521  * clone state's donemap.
2522  *
2523  * The successor states we create and fill here form a strict tree structure,
2524  * with each state having exactly one predecessor, except that the toplevel
2525  * state has no inarcs as yet (breakconstraintloop will add its inarcs from
2526  * spredecessor after we're done).  Thus, we can examine sclone's inarcs back
2527  * to the root, plus refarc if any, to identify the set of constraints already
2528  * known valid at the current point.  This allows us to avoid generating extra
2529  * successor states.
2530  */
2531 static void
clonesuccessorstates(struct nfa * nfa,struct state * ssource,struct state * sclone,struct state * spredecessor,struct arc * refarc,char * curdonemap,char * outerdonemap,int nstates)2532 clonesuccessorstates(
2533     struct nfa * nfa,
2534     struct state * ssource,
2535     struct state * sclone,
2536     struct state * spredecessor,
2537     struct arc * refarc,
2538     char *curdonemap,
2539     char *outerdonemap,
2540     int nstates)
2541 {
2542     char *donemap;
2543     struct arc *a;
2544 
2545     /* Since this is recursive, it could be driven to stack overflow */
2546     if (STACK_TOO_DEEP(nfa->v->re)) {
2547 	NERR(REG_ETOOBIG);
2548 	return;
2549     }
2550 
2551     /* If this state hasn't already got a donemap, create one */
2552     donemap = curdonemap;
2553     if (donemap == NULL) {
2554 	donemap = (char *) MALLOC(nstates * sizeof(char));
2555 	if (donemap == NULL) {
2556 	    NERR(REG_ESPACE);
2557 	    return;
2558 	}
2559 
2560 	if (outerdonemap != NULL) {
2561 	    /*
2562 	     * Not at outermost recursion level, so copy the outer level's
2563 	     * donemap; this ensures that we see states in process of being
2564 	     * visited at outer levels, or already merged into predecessor
2565 	     * states, as ones we shouldn't traverse back to.
2566 	     */
2567 	    memcpy(donemap, outerdonemap, nstates * sizeof(char));
2568 	} else {
2569 	    /* At outermost level, only spredecessor is off-limits */
2570 	    memset(donemap, 0, nstates * sizeof(char));
2571 	    assert(spredecessor->no < nstates);
2572 	    donemap[spredecessor->no] = 1;
2573 	}
2574     }
2575 
2576     /* Mark ssource as visited in the donemap */
2577     assert(ssource->no < nstates);
2578     assert(donemap[ssource->no] == 0);
2579     donemap[ssource->no] = 1;
2580 
2581     /*
2582      * We proceed by first cloning all of ssource's outarcs, creating new
2583      * clone states as needed but not doing more with them than that.  Then in
2584      * a second pass, recurse to process the child clone states.  This allows
2585      * us to have only one child clone state per reachable source state, even
2586      * when there are multiple outarcs leading to the same state.  Also, when
2587      * we do visit a child state, its set of inarcs is known exactly, which
2588      * makes it safe to apply the constraint-is-already-checked optimization.
2589      * Also, this ensures that we've merged all the states we can into the
2590      * current clone before we recurse to any children, thus possibly saving
2591      * them from making extra images of those states.
2592      *
2593      * While this function runs, child clone states of the current state are
2594      * marked by setting their tmp fields to point to the original state they
2595      * were cloned from.  This makes it possible to detect multiple outarcs
2596      * leading to the same state, and also makes it easy to distinguish clone
2597      * states from original states (which will have tmp == NULL).
2598      */
2599     for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain) {
2600 	struct state *sto = a->to;
2601 
2602 	/*
2603 	 * We do not consider cloning successor states that have no constraint
2604 	 * outarcs; just link to them as-is.  They cannot be part of a
2605 	 * constraint loop so there is no need to make copies.  In particular,
2606 	 * this rule keeps us from trying to clone the post state, which would
2607 	 * be a bad idea.
2608 	 */
2609 	if (isconstraintarc(a) && hasconstraintout(sto)) {
2610 	    struct state *prevclone;
2611 	    int canmerge;
2612 	    struct arc *a2;
2613 
2614 	    /*
2615 	     * Back-link constraint arcs must not be followed.  Nor is there a
2616 	     * need to revisit states previously merged into this clone.
2617 	     */
2618 	    assert(sto->no < nstates);
2619 	    if (donemap[sto->no] != 0) {
2620 		continue;
2621 	    }
2622 
2623 	    /*
2624 	     * Check whether we already have a child clone state for this
2625 	     * source state.
2626 	     */
2627 	    prevclone = NULL;
2628 	    for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain) {
2629 		if (a2->to->tmp == sto) {
2630 		    prevclone = a2->to;
2631 		    break;
2632 		}
2633 	    }
2634 
2635 	    /*
2636 	     * If this arc is labeled the same as refarc, or the same as any
2637 	     * arc we must have traversed to get to sclone, then no additional
2638 	     * constraints need to be met to get to sto, so we should just
2639 	     * merge its outarcs into sclone.
2640 	     */
2641 	    if (refarc && a->type == refarc->type && a->co == refarc->co) {
2642 		canmerge = 1;
2643 	    } else {
2644 		struct state *s;
2645 
2646 		canmerge = 0;
2647 		for (s = sclone; s->ins; s = s->ins->from) {
2648 		    if (s->nins == 1 &&
2649 			    a->type == s->ins->type && a->co == s->ins->co) {
2650 			canmerge = 1;
2651 			break;
2652 		    }
2653 		}
2654 	    }
2655 
2656 	    if (canmerge) {
2657 		/*
2658 		 * We can merge into sclone.  If we previously made a child
2659 		 * clone state, drop it; there's no need to visit it.  (This
2660 		 * can happen if ssource has multiple pathways to sto, and we
2661 		 * only just now found one that is provably a no-op.)
2662 		 */
2663 		if (prevclone) {
2664 		    dropstate(nfa, prevclone);	/* kills our outarc, too */
2665 		}
2666 
2667 		/* Recurse to merge sto's outarcs into sclone */
2668 		clonesuccessorstates(nfa, sto, sclone, spredecessor, refarc,
2669 			donemap, outerdonemap, nstates);
2670 		/* sto should now be marked as previously visited */
2671 		assert(NISERR() || donemap[sto->no] == 1);
2672 	    } else if (prevclone) {
2673 		/*
2674 		 * We already have a clone state for this successor, so just
2675 		 * make another arc to it.
2676 		 */
2677 		cparc(nfa, a, sclone, prevclone);
2678 	    } else {
2679 		/*
2680 		 * We need to create a new successor clone state.
2681 		 */
2682 		struct state *stoclone;
2683 
2684 		stoclone = newstate(nfa);
2685 		if (stoclone == NULL) {
2686 		    assert(NISERR());
2687 		    break;
2688 		}
2689 		/* Mark it as to what it's a clone of */
2690 		stoclone->tmp = sto;
2691 		/* ... and add the outarc leading to it */
2692 		cparc(nfa, a, sclone, stoclone);
2693 	    }
2694 	} else {
2695 	    /*
2696 	     * Non-constraint outarcs just get copied to sclone, as do outarcs
2697 	     * leading to states with no constraint outarc.
2698 	     */
2699 	    cparc(nfa, a, sclone, sto);
2700 	}
2701     }
2702 
2703     /*
2704      * If we are at outer level for this clone state, recurse to all its child
2705      * clone states, clearing their tmp fields as we go.  (If we're not
2706      * outermost for sclone, leave this to be done by the outer call level.)
2707      * Note that if we have multiple outarcs leading to the same clone state,
2708      * it will only be recursed-to once.
2709      */
2710     if (curdonemap == NULL) {
2711 	for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain) {
2712 	    struct state *stoclone = a->to;
2713 	    struct state *sto = stoclone->tmp;
2714 
2715 	    if (sto != NULL) {
2716 		stoclone->tmp = NULL;
2717 		clonesuccessorstates(nfa, sto, stoclone, spredecessor, refarc,
2718 			NULL, donemap, nstates);
2719 	    }
2720 	}
2721 
2722 	/* Don't forget to free sclone's donemap when done with it */
2723 	FREE(donemap);
2724     }
2725 }
2726 
2727 /*
2728  - cleanup - clean up NFA after optimizations
2729  ^ static VOID cleanup(struct nfa *);
2730  */
2731 static void
cleanup(struct nfa * nfa)2732 cleanup(
2733     struct nfa *nfa)
2734 {
2735     struct state *s;
2736     struct state *nexts;
2737     int n;
2738 
2739     /*
2740      * Clear out unreachable or dead-end states. Use pre to mark reachable,
2741      * then post to mark can-reach-post.
2742      */
2743 
2744     markreachable(nfa, nfa->pre, NULL, nfa->pre);
2745     markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
2746     for (s = nfa->states; s != NULL; s = nexts) {
2747 	nexts = s->next;
2748 	if (s->tmp != nfa->post && !s->flag) {
2749 	    dropstate(nfa, s);
2750 	}
2751     }
2752     assert(nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
2753     cleartraverse(nfa, nfa->pre);
2754     assert(nfa->post->nins == 0 || nfa->post->tmp == NULL);
2755     /* the nins==0 (final unreachable) case will be caught later */
2756 
2757     /*
2758      * Renumber surviving states.
2759      */
2760 
2761     n = 0;
2762     for (s = nfa->states; s != NULL; s = s->next) {
2763 	s->no = n++;
2764     }
2765     nfa->nstates = n;
2766 }
2767 
2768 /*
2769  - markreachable - recursive marking of reachable states
2770  ^ static VOID markreachable(struct nfa *, struct state *, struct state *,
2771  ^ 	struct state *);
2772  */
2773 static void
markreachable(struct nfa * nfa,struct state * s,struct state * okay,struct state * mark)2774 markreachable(
2775     struct nfa *nfa,
2776     struct state *s,
2777     struct state *okay,		/* consider only states with this mark */
2778     struct state *mark)		/* the value to mark with */
2779 {
2780     struct arc *a;
2781 
2782     if (s->tmp != okay) {
2783 	return;
2784     }
2785     s->tmp = mark;
2786 
2787     for (a = s->outs; a != NULL; a = a->outchain) {
2788 	markreachable(nfa, a->to, okay, mark);
2789     }
2790 }
2791 
2792 /*
2793  - markcanreach - recursive marking of states which can reach here
2794  ^ static VOID markcanreach(struct nfa *, struct state *, struct state *,
2795  ^ 	struct state *);
2796  */
2797 static void
markcanreach(struct nfa * nfa,struct state * s,struct state * okay,struct state * mark)2798 markcanreach(
2799     struct nfa *nfa,
2800     struct state *s,
2801     struct state *okay,		/* consider only states with this mark */
2802     struct state *mark)		/* the value to mark with */
2803 {
2804     struct arc *a;
2805 
2806     if (s->tmp != okay) {
2807 	return;
2808     }
2809     s->tmp = mark;
2810 
2811     for (a = s->ins; a != NULL; a = a->inchain) {
2812 	markcanreach(nfa, a->from, okay, mark);
2813     }
2814 }
2815 
2816 /*
2817  - analyze - ascertain potentially-useful facts about an optimized NFA
2818  ^ static long analyze(struct nfa *);
2819  */
2820 static long			/* re_info bits to be ORed in */
analyze(struct nfa * nfa)2821 analyze(
2822     struct nfa *nfa)
2823 {
2824     struct arc *a;
2825     struct arc *aa;
2826 
2827     if (nfa->pre->outs == NULL) {
2828 	return REG_UIMPOSSIBLE;
2829     }
2830     for (a = nfa->pre->outs; a != NULL; a = a->outchain) {
2831 	for (aa = a->to->outs; aa != NULL; aa = aa->outchain) {
2832 	    if (aa->to == nfa->post) {
2833 		return REG_UEMPTYMATCH;
2834 	    }
2835 	}
2836     }
2837     return 0;
2838 }
2839 
2840 /*
2841  - compact - construct the compact representation of an NFA
2842  ^ static VOID compact(struct nfa *, struct cnfa *);
2843  */
2844 static void
compact(struct nfa * nfa,struct cnfa * cnfa)2845 compact(
2846     struct nfa *nfa,
2847     struct cnfa *cnfa)
2848 {
2849     struct state *s;
2850     struct arc *a;
2851     size_t nstates;
2852     size_t narcs;
2853     struct carc *ca;
2854     struct carc *first;
2855 
2856     assert(!NISERR());
2857 
2858     nstates = 0;
2859     narcs = 0;
2860     for (s = nfa->states; s != NULL; s = s->next) {
2861 	nstates++;
2862 	narcs += 1 + s->nouts + 1;
2863 	/* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
2864     }
2865 
2866     cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
2867     cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
2868     if (cnfa->states == NULL || cnfa->arcs == NULL) {
2869 	if (cnfa->states != NULL) {
2870 	    FREE(cnfa->states);
2871 	}
2872 	if (cnfa->arcs != NULL) {
2873 	    FREE(cnfa->arcs);
2874 	}
2875 	NERR(REG_ESPACE);
2876 	return;
2877     }
2878     cnfa->nstates = nstates;
2879     cnfa->pre = nfa->pre->no;
2880     cnfa->post = nfa->post->no;
2881     cnfa->bos[0] = nfa->bos[0];
2882     cnfa->bos[1] = nfa->bos[1];
2883     cnfa->eos[0] = nfa->eos[0];
2884     cnfa->eos[1] = nfa->eos[1];
2885     cnfa->ncolors = maxcolor(nfa->cm) + 1;
2886     cnfa->flags = 0;
2887 
2888     ca = cnfa->arcs;
2889     for (s = nfa->states; s != NULL; s = s->next) {
2890 	assert((size_t) s->no < nstates);
2891 	cnfa->states[s->no] = ca;
2892 	ca->co = 0;		/* clear and skip flags "arc" */
2893 	ca++;
2894 	first = ca;
2895 	for (a = s->outs; a != NULL; a = a->outchain) {
2896 	    switch (a->type) {
2897 	    case PLAIN:
2898 		ca->co = a->co;
2899 		ca->to = a->to->no;
2900 		ca++;
2901 		break;
2902 	    case LACON:
2903 		assert(s->no != cnfa->pre);
2904 		ca->co = (color) (cnfa->ncolors + a->co);
2905 		ca->to = a->to->no;
2906 		ca++;
2907 		cnfa->flags |= HASLACONS;
2908 		break;
2909 	    default:
2910 		NERR(REG_ASSERT);
2911 		break;
2912 	    }
2913 	}
2914 	carcsort(first, ca - first);
2915 	ca->co = COLORLESS;
2916 	ca->to = 0;
2917 	ca++;
2918     }
2919     assert(ca == &cnfa->arcs[narcs]);
2920     assert(cnfa->nstates != 0);
2921 
2922     /*
2923      * Mark no-progress states.
2924      */
2925 
2926     for (a = nfa->pre->outs; a != NULL; a = a->outchain) {
2927 	cnfa->states[a->to->no]->co = 1;
2928     }
2929     cnfa->states[nfa->pre->no]->co = 1;
2930 }
2931 
2932 /*
2933  - carcsort - sort compacted-NFA arcs by color
2934  ^ static VOID carcsort(struct carc *, struct carc *);
2935  */
2936 static void
carcsort(struct carc * first,size_t n)2937 carcsort(
2938     struct carc *first,
2939     size_t n)
2940 {
2941     if (n > 1) {
2942 	qsort(first, n, sizeof(struct carc), carc_cmp);
2943     }
2944 }
2945 
2946 static int
carc_cmp(const void * a,const void * b)2947 carc_cmp(
2948     const void *a,
2949     const void *b)
2950 {
2951     const struct carc *aa = (const struct carc *) a;
2952     const struct carc *bb = (const struct carc *) b;
2953 
2954     if (aa->co < bb->co) {
2955 	return -1;
2956     }
2957     if (aa->co > bb->co) {
2958 	return +1;
2959     }
2960     if (aa->to < bb->to) {
2961 	return -1;
2962     }
2963     if (aa->to > bb->to) {
2964 	return +1;
2965     }
2966     return 0;
2967 }
2968 
2969 /*
2970  - freecnfa - free a compacted NFA
2971  ^ static VOID freecnfa(struct cnfa *);
2972  */
2973 static void
freecnfa(struct cnfa * cnfa)2974 freecnfa(
2975     struct cnfa *cnfa)
2976 {
2977     assert(cnfa->nstates != 0);	/* not empty already */
2978     cnfa->nstates = 0;
2979     FREE(cnfa->states);
2980     FREE(cnfa->arcs);
2981 }
2982 
2983 /*
2984  - dumpnfa - dump an NFA in human-readable form
2985  ^ static VOID dumpnfa(struct nfa *, FILE *);
2986  */
2987 static void
dumpnfa(struct nfa * nfa,FILE * f)2988 dumpnfa(
2989     struct nfa *nfa,
2990     FILE *f)
2991 {
2992 #ifdef REG_DEBUG
2993     struct state *s;
2994     int nstates = 0;
2995     int narcs = 0;
2996 
2997     fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
2998     if (nfa->bos[0] != COLORLESS) {
2999 	fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
3000     }
3001     if (nfa->bos[1] != COLORLESS) {
3002 	fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
3003     }
3004     if (nfa->eos[0] != COLORLESS) {
3005 	fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
3006     }
3007     if (nfa->eos[1] != COLORLESS) {
3008 	fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
3009     }
3010     fprintf(f, "\n");
3011     for (s = nfa->states; s != NULL; s = s->next) {
3012 	dumpstate(s, f);
3013 	nstates++;
3014 	narcs += s->nouts;
3015     }
3016     fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
3017     if (nfa->parent == NULL) {
3018 	dumpcolors(nfa->cm, f);
3019     }
3020     fflush(f);
3021 #endif
3022 }
3023 
3024 #ifdef REG_DEBUG		/* subordinates of dumpnfa */
3025 /*
3026  ^ #ifdef REG_DEBUG
3027  */
3028 
3029 /*
3030  - dumpstate - dump an NFA state in human-readable form
3031  ^ static VOID dumpstate(struct state *, FILE *);
3032  */
3033 static void
dumpstate(struct state * s,FILE * f)3034 dumpstate(
3035     struct state *s,
3036     FILE *f)
3037 {
3038     struct arc *a;
3039 
3040     fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
3041 	    (s->flag) ? s->flag : '.');
3042     if (s->prev != NULL && s->prev->next != s) {
3043 	fprintf(f, "\tstate chain bad\n");
3044     }
3045     if (s->nouts == 0) {
3046 	fprintf(f, "\tno out arcs\n");
3047     } else {
3048 	dumparcs(s, f);
3049     }
3050     fflush(f);
3051     for (a = s->ins; a != NULL; a = a->inchain) {
3052 	if (a->to != s) {
3053 	    fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
3054 		    a->from->no, a->to->no, s->no);
3055 	}
3056     }
3057 }
3058 
3059 /*
3060  - dumparcs - dump out-arcs in human-readable form
3061  ^ static VOID dumparcs(struct state *, FILE *);
3062  */
3063 static void
dumparcs(struct state * s,FILE * f)3064 dumparcs(
3065     struct state *s,
3066     FILE *f)
3067 {
3068     int pos;
3069     struct arc *a;
3070 
3071     /* printing oldest arcs first is usually clearer */
3072     a = s->outs;
3073     assert(a != NULL);
3074     while (a->outchain != NULL) {
3075 	a = a->outchain;
3076     }
3077     pos = 1;
3078     do {
3079 	dumparc(a, s, f);
3080 	if (pos == 5) {
3081 	    fprintf(f, "\n");
3082 	    pos = 1;
3083 	} else {
3084 	    pos++;
3085 	}
3086 	a = a->outchainRev;
3087     } while (a != NULL);
3088     if (pos != 1) {
3089 	fprintf(f, "\n");
3090     }
3091 }
3092 
3093 /*
3094  - dumparc - dump one outarc in readable form, including prefixing tab
3095  ^ static VOID dumparc(struct arc *, struct state *, FILE *);
3096  */
3097 static void
dumparc(struct arc * a,struct state * s,FILE * f)3098 dumparc(
3099     struct arc *a,
3100     struct state *s,
3101     FILE *f)
3102 {
3103     struct arc *aa;
3104     struct arcbatch *ab;
3105 
3106     fprintf(f, "\t");
3107     switch (a->type) {
3108     case PLAIN:
3109 	fprintf(f, "[%ld]", (long) a->co);
3110 	break;
3111     case AHEAD:
3112 	fprintf(f, ">%ld>", (long) a->co);
3113 	break;
3114     case BEHIND:
3115 	fprintf(f, "<%ld<", (long) a->co);
3116 	break;
3117     case LACON:
3118 	fprintf(f, ":%ld:", (long) a->co);
3119 	break;
3120     case '^':
3121     case '$':
3122 	fprintf(f, "%c%d", a->type, (int) a->co);
3123 	break;
3124     case EMPTY:
3125 	break;
3126     default:
3127 	fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
3128 	break;
3129     }
3130     if (a->from != s) {
3131 	fprintf(f, "?%d?", a->from->no);
3132     }
3133     for (ab = &a->from->oas; ab != NULL; ab = ab->next) {
3134 	for (aa = &ab->a[0]; aa < &ab->a[ABSIZE]; aa++) {
3135 	    if (aa == a) {
3136 		break;		/* NOTE BREAK OUT */
3137 	    }
3138 	}
3139 	if (aa < &ab->a[ABSIZE]) {	/* propagate break */
3140 	    break;		/* NOTE BREAK OUT */
3141 	}
3142     }
3143     if (ab == NULL) {
3144 	fprintf(f, "?!?");	/* not in allocated space */
3145     }
3146     fprintf(f, "->");
3147     if (a->to == NULL) {
3148 	fprintf(f, "NULL");
3149 	return;
3150     }
3151     fprintf(f, "%d", a->to->no);
3152     for (aa = a->to->ins; aa != NULL; aa = aa->inchain) {
3153 	if (aa == a) {
3154 	    break;		/* NOTE BREAK OUT */
3155 	}
3156     }
3157     if (aa == NULL) {
3158 	fprintf(f, "?!?");	/* missing from in-chain */
3159     }
3160 }
3161 
3162 /*
3163  ^ #endif
3164  */
3165 #endif				/* ifdef REG_DEBUG */
3166 
3167 /*
3168  - dumpcnfa - dump a compacted NFA in human-readable form
3169  ^ static VOID dumpcnfa(struct cnfa *, FILE *);
3170  */
3171 static void
dumpcnfa(struct cnfa * cnfa,FILE * f)3172 dumpcnfa(
3173     struct cnfa *cnfa,
3174     FILE *f)
3175 {
3176 #ifdef REG_DEBUG
3177     int st;
3178 
3179     fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
3180     if (cnfa->bos[0] != COLORLESS) {
3181 	fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
3182     }
3183     if (cnfa->bos[1] != COLORLESS) {
3184 	fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
3185     }
3186     if (cnfa->eos[0] != COLORLESS) {
3187 	fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
3188     }
3189     if (cnfa->eos[1] != COLORLESS) {
3190 	fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
3191     }
3192     if (cnfa->flags&HASLACONS) {
3193 	fprintf(f, ", haslacons");
3194     }
3195     fprintf(f, "\n");
3196     for (st = 0; st < cnfa->nstates; st++) {
3197 	dumpcstate(st, cnfa->states[st], cnfa, f);
3198     }
3199     fflush(f);
3200 #endif
3201 }
3202 
3203 #ifdef REG_DEBUG		/* subordinates of dumpcnfa */
3204 /*
3205  ^ #ifdef REG_DEBUG
3206  */
3207 
3208 /*
3209  - dumpcstate - dump a compacted-NFA state in human-readable form
3210  ^ static VOID dumpcstate(int, struct carc *, struct cnfa *, FILE *);
3211  */
3212 static void
dumpcstate(int st,struct carc * ca,struct cnfa * cnfa,FILE * f)3213 dumpcstate(
3214     int st,
3215     struct carc *ca,
3216     struct cnfa *cnfa,
3217     FILE *f)
3218 {
3219     int i;
3220     int pos;
3221 
3222     fprintf(f, "%d%s", st, (ca[0].co) ? ":" : ".");
3223     pos = 1;
3224     for (i = 1; ca[i].co != COLORLESS; i++) {
3225 	if (ca[i].co < cnfa->ncolors) {
3226 	    fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to);
3227 	} else {
3228 	    fprintf(f, "\t:%ld:->%d", (long) ca[i].co-cnfa->ncolors,ca[i].to);
3229 	}
3230 	if (pos == 5) {
3231 	    fprintf(f, "\n");
3232 	    pos = 1;
3233 	} else {
3234 	    pos++;
3235 	}
3236     }
3237     if (i == 1 || pos != 1) {
3238 	fprintf(f, "\n");
3239     }
3240     fflush(f);
3241 }
3242 
3243 /*
3244  ^ #endif
3245  */
3246 #endif				/* ifdef REG_DEBUG */
3247 
3248 /*
3249  * Local Variables:
3250  * mode: c
3251  * c-basic-offset: 4
3252  * fill-column: 78
3253  * End:
3254  */
3255