1 /*
2  * NFA utilities.
3  * This file is #included by regcomp.c.
4  *
5  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
6  *
7  * Development of this software was funded, in part, by Cray Research Inc.,
8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9  * Corporation, none of whom are responsible for the results.  The author
10  * thanks all of them.
11  *
12  * Redistribution and use in source and binary forms -- with or without
13  * modification -- are permitted for any purpose, provided that
14  * redistributions in source form retain this entire copyright notice and
15  * indicate the origin and nature of any modifications.
16  *
17  * I'd appreciate being given credit for this package in the documentation
18  * of software which uses it, but that is not a requirement.
19  *
20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * src/backend/regex/regc_nfa.c
32  *
33  *
34  * One or two things that technically ought to be in here
35  * are actually in color.c, thanks to some incestuous relationships in
36  * the color chains.
37  */
38 
39 #define NISERR()	VISERR(nfa->v)
40 #define NERR(e)		VERR(nfa->v, (e))
41 
42 
43 /*
44  * newnfa - set up an NFA
45  */
46 static struct nfa *				/* the NFA, or NULL */
newnfa(struct vars * v,struct colormap * cm,struct nfa * parent)47 newnfa(struct vars * v,
48 	   struct colormap * cm,
49 	   struct nfa * parent)		/* NULL if primary NFA */
50 {
51 	struct nfa *nfa;
52 
53 	nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
54 	if (nfa == NULL)
55 	{
56 		ERR(REG_ESPACE);
57 		return NULL;
58 	}
59 
60 	nfa->states = NULL;
61 	nfa->slast = NULL;
62 	nfa->free = NULL;
63 	nfa->nstates = 0;
64 	nfa->cm = cm;
65 	nfa->v = v;
66 	nfa->bos[0] = nfa->bos[1] = COLORLESS;
67 	nfa->eos[0] = nfa->eos[1] = COLORLESS;
68 	nfa->parent = parent;		/* Precedes newfstate so parent is valid. */
69 	nfa->post = newfstate(nfa, '@');	/* number 0 */
70 	nfa->pre = newfstate(nfa, '>');		/* number 1 */
71 
72 	nfa->init = newstate(nfa);	/* may become invalid later */
73 	nfa->final = newstate(nfa);
74 	if (ISERR())
75 	{
76 		freenfa(nfa);
77 		return NULL;
78 	}
79 	rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
80 	newarc(nfa, '^', 1, nfa->pre, nfa->init);
81 	newarc(nfa, '^', 0, nfa->pre, nfa->init);
82 	rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
83 	newarc(nfa, '$', 1, nfa->final, nfa->post);
84 	newarc(nfa, '$', 0, nfa->final, nfa->post);
85 
86 	if (ISERR())
87 	{
88 		freenfa(nfa);
89 		return NULL;
90 	}
91 	return nfa;
92 }
93 
94 /*
95  * freenfa - free an entire NFA
96  */
97 static void
freenfa(struct nfa * nfa)98 freenfa(struct nfa * nfa)
99 {
100 	struct state *s;
101 
102 	while ((s = nfa->states) != NULL)
103 	{
104 		s->nins = s->nouts = 0; /* don't worry about arcs */
105 		freestate(nfa, s);
106 	}
107 	while ((s = nfa->free) != NULL)
108 	{
109 		nfa->free = s->next;
110 		destroystate(nfa, s);
111 	}
112 
113 	nfa->slast = NULL;
114 	nfa->nstates = -1;
115 	nfa->pre = NULL;
116 	nfa->post = NULL;
117 	FREE(nfa);
118 }
119 
120 /*
121  * newstate - allocate an NFA state, with zero flag value
122  */
123 static struct state *			/* NULL on error */
newstate(struct nfa * nfa)124 newstate(struct nfa * nfa)
125 {
126 	struct state *s;
127 
128 	/*
129 	 * This is a handy place to check for operation cancel during regex
130 	 * compilation, since no code path will go very long without making a new
131 	 * state or arc.
132 	 */
133 	if (CANCEL_REQUESTED(nfa->v->re))
134 	{
135 		NERR(REG_CANCEL);
136 		return NULL;
137 	}
138 
139 	if (nfa->free != NULL)
140 	{
141 		s = nfa->free;
142 		nfa->free = s->next;
143 	}
144 	else
145 	{
146 		if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
147 		{
148 			NERR(REG_ETOOBIG);
149 			return NULL;
150 		}
151 		s = (struct state *) MALLOC(sizeof(struct state));
152 		if (s == NULL)
153 		{
154 			NERR(REG_ESPACE);
155 			return NULL;
156 		}
157 		nfa->v->spaceused += sizeof(struct state);
158 		s->oas.next = NULL;
159 		s->free = NULL;
160 		s->noas = 0;
161 	}
162 
163 	assert(nfa->nstates >= 0);
164 	s->no = nfa->nstates++;
165 	s->flag = 0;
166 	if (nfa->states == NULL)
167 		nfa->states = s;
168 	s->nins = 0;
169 	s->ins = NULL;
170 	s->nouts = 0;
171 	s->outs = NULL;
172 	s->tmp = NULL;
173 	s->next = NULL;
174 	if (nfa->slast != NULL)
175 	{
176 		assert(nfa->slast->next == NULL);
177 		nfa->slast->next = s;
178 	}
179 	s->prev = nfa->slast;
180 	nfa->slast = s;
181 	return s;
182 }
183 
184 /*
185  * newfstate - allocate an NFA state with a specified flag value
186  */
187 static struct state *			/* NULL on error */
newfstate(struct nfa * nfa,int flag)188 newfstate(struct nfa * nfa, int flag)
189 {
190 	struct state *s;
191 
192 	s = newstate(nfa);
193 	if (s != NULL)
194 		s->flag = (char) flag;
195 	return s;
196 }
197 
198 /*
199  * dropstate - delete a state's inarcs and outarcs and free it
200  */
201 static void
dropstate(struct nfa * nfa,struct state * s)202 dropstate(struct nfa * nfa,
203 		  struct state * s)
204 {
205 	struct arc *a;
206 
207 	while ((a = s->ins) != NULL)
208 		freearc(nfa, a);
209 	while ((a = s->outs) != NULL)
210 		freearc(nfa, a);
211 	freestate(nfa, s);
212 }
213 
214 /*
215  * freestate - free a state, which has no in-arcs or out-arcs
216  */
217 static void
freestate(struct nfa * nfa,struct state * s)218 freestate(struct nfa * nfa,
219 		  struct state * s)
220 {
221 	assert(s != NULL);
222 	assert(s->nins == 0 && s->nouts == 0);
223 
224 	s->no = FREESTATE;
225 	s->flag = 0;
226 	if (s->next != NULL)
227 		s->next->prev = s->prev;
228 	else
229 	{
230 		assert(s == nfa->slast);
231 		nfa->slast = s->prev;
232 	}
233 	if (s->prev != NULL)
234 		s->prev->next = s->next;
235 	else
236 	{
237 		assert(s == nfa->states);
238 		nfa->states = s->next;
239 	}
240 	s->prev = NULL;
241 	s->next = nfa->free;		/* don't delete it, put it on the free list */
242 	nfa->free = s;
243 }
244 
245 /*
246  * destroystate - really get rid of an already-freed state
247  */
248 static void
destroystate(struct nfa * nfa,struct state * s)249 destroystate(struct nfa * nfa,
250 			 struct state * s)
251 {
252 	struct arcbatch *ab;
253 	struct arcbatch *abnext;
254 
255 	assert(s->no == FREESTATE);
256 	for (ab = s->oas.next; ab != NULL; ab = abnext)
257 	{
258 		abnext = ab->next;
259 		FREE(ab);
260 		nfa->v->spaceused -= sizeof(struct arcbatch);
261 	}
262 	s->ins = NULL;
263 	s->outs = NULL;
264 	s->next = NULL;
265 	FREE(s);
266 	nfa->v->spaceused -= sizeof(struct state);
267 }
268 
269 /*
270  * newarc - set up a new arc within an NFA
271  *
272  * This function checks to make sure that no duplicate arcs are created.
273  * In general we never want duplicates.
274  */
275 static void
newarc(struct nfa * nfa,int t,pcolor co,struct state * from,struct state * to)276 newarc(struct nfa * nfa,
277 	   int t,
278 	   pcolor co,
279 	   struct state * from,
280 	   struct state * to)
281 {
282 	struct arc *a;
283 
284 	assert(from != NULL && to != NULL);
285 
286 	/*
287 	 * This is a handy place to check for operation cancel during regex
288 	 * compilation, since no code path will go very long without making a new
289 	 * state or arc.
290 	 */
291 	if (CANCEL_REQUESTED(nfa->v->re))
292 	{
293 		NERR(REG_CANCEL);
294 		return;
295 	}
296 
297 	/* check for duplicate arc, using whichever chain is shorter */
298 	if (from->nouts <= to->nins)
299 	{
300 		for (a = from->outs; a != NULL; a = a->outchain)
301 			if (a->to == to && a->co == co && a->type == t)
302 				return;
303 	}
304 	else
305 	{
306 		for (a = to->ins; a != NULL; a = a->inchain)
307 			if (a->from == from && a->co == co && a->type == t)
308 				return;
309 	}
310 
311 	/* no dup, so create the arc */
312 	createarc(nfa, t, co, from, to);
313 }
314 
315 /*
316  * createarc - create a new arc within an NFA
317  *
318  * This function must *only* be used after verifying that there is no existing
319  * identical arc (same type/color/from/to).
320  */
321 static void
createarc(struct nfa * nfa,int t,pcolor co,struct state * from,struct state * to)322 createarc(struct nfa * nfa,
323 		  int t,
324 		  pcolor co,
325 		  struct state * from,
326 		  struct state * to)
327 {
328 	struct arc *a;
329 
330 	/* the arc is physically allocated within its from-state */
331 	a = allocarc(nfa, from);
332 	if (NISERR())
333 		return;
334 	assert(a != NULL);
335 
336 	a->type = t;
337 	a->co = (color) co;
338 	a->to = to;
339 	a->from = from;
340 
341 	/*
342 	 * Put the new arc on the beginning, not the end, of the chains; it's
343 	 * simpler here, and freearc() is the same cost either way.  See also the
344 	 * logic in moveins() and its cohorts, as well as fixempties().
345 	 */
346 	a->inchain = to->ins;
347 	a->inchainRev = NULL;
348 	if (to->ins)
349 		to->ins->inchainRev = a;
350 	to->ins = a;
351 	a->outchain = from->outs;
352 	a->outchainRev = NULL;
353 	if (from->outs)
354 		from->outs->outchainRev = a;
355 	from->outs = a;
356 
357 	from->nouts++;
358 	to->nins++;
359 
360 	if (COLORED(a) && nfa->parent == NULL)
361 		colorchain(nfa->cm, a);
362 }
363 
364 /*
365  * allocarc - allocate a new out-arc within a state
366  */
367 static struct arc *				/* NULL for failure */
allocarc(struct nfa * nfa,struct state * s)368 allocarc(struct nfa * nfa,
369 		 struct state * s)
370 {
371 	struct arc *a;
372 
373 	/* shortcut */
374 	if (s->free == NULL && s->noas < ABSIZE)
375 	{
376 		a = &s->oas.a[s->noas];
377 		s->noas++;
378 		return a;
379 	}
380 
381 	/* if none at hand, get more */
382 	if (s->free == NULL)
383 	{
384 		struct arcbatch *newAb;
385 		int			i;
386 
387 		if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
388 		{
389 			NERR(REG_ETOOBIG);
390 			return NULL;
391 		}
392 		newAb = (struct arcbatch *) MALLOC(sizeof(struct arcbatch));
393 		if (newAb == NULL)
394 		{
395 			NERR(REG_ESPACE);
396 			return NULL;
397 		}
398 		nfa->v->spaceused += sizeof(struct arcbatch);
399 		newAb->next = s->oas.next;
400 		s->oas.next = newAb;
401 
402 		for (i = 0; i < ABSIZE; i++)
403 		{
404 			newAb->a[i].type = 0;
405 			newAb->a[i].freechain = &newAb->a[i + 1];
406 		}
407 		newAb->a[ABSIZE - 1].freechain = NULL;
408 		s->free = &newAb->a[0];
409 	}
410 	assert(s->free != NULL);
411 
412 	a = s->free;
413 	s->free = a->freechain;
414 	return a;
415 }
416 
417 /*
418  * freearc - free an arc
419  */
420 static void
freearc(struct nfa * nfa,struct arc * victim)421 freearc(struct nfa * nfa,
422 		struct arc * victim)
423 {
424 	struct state *from = victim->from;
425 	struct state *to = victim->to;
426 	struct arc *predecessor;
427 
428 	assert(victim->type != 0);
429 
430 	/* take it off color chain if necessary */
431 	if (COLORED(victim) && nfa->parent == NULL)
432 		uncolorchain(nfa->cm, victim);
433 
434 	/* take it off source's out-chain */
435 	assert(from != NULL);
436 	predecessor = victim->outchainRev;
437 	if (predecessor == NULL)
438 	{
439 		assert(from->outs == victim);
440 		from->outs = victim->outchain;
441 	}
442 	else
443 	{
444 		assert(predecessor->outchain == victim);
445 		predecessor->outchain = victim->outchain;
446 	}
447 	if (victim->outchain != NULL)
448 	{
449 		assert(victim->outchain->outchainRev == victim);
450 		victim->outchain->outchainRev = predecessor;
451 	}
452 	from->nouts--;
453 
454 	/* take it off target's in-chain */
455 	assert(to != NULL);
456 	predecessor = victim->inchainRev;
457 	if (predecessor == NULL)
458 	{
459 		assert(to->ins == victim);
460 		to->ins = victim->inchain;
461 	}
462 	else
463 	{
464 		assert(predecessor->inchain == victim);
465 		predecessor->inchain = victim->inchain;
466 	}
467 	if (victim->inchain != NULL)
468 	{
469 		assert(victim->inchain->inchainRev == victim);
470 		victim->inchain->inchainRev = predecessor;
471 	}
472 	to->nins--;
473 
474 	/* clean up and place on from-state's free list */
475 	victim->type = 0;
476 	victim->from = NULL;		/* precautions... */
477 	victim->to = NULL;
478 	victim->inchain = NULL;
479 	victim->inchainRev = NULL;
480 	victim->outchain = NULL;
481 	victim->outchainRev = NULL;
482 	victim->freechain = from->free;
483 	from->free = victim;
484 }
485 
486 /*
487  * changearctarget - flip an arc to have a different to state
488  *
489  * Caller must have verified that there is no pre-existing duplicate arc.
490  *
491  * Note that because we store arcs in their from state, we can't easily have
492  * a similar changearcsource function.
493  */
494 static void
changearctarget(struct arc * a,struct state * newto)495 changearctarget(struct arc * a, struct state * newto)
496 {
497 	struct state *oldto = a->to;
498 	struct arc *predecessor;
499 
500 	assert(oldto != newto);
501 
502 	/* take it off old target's in-chain */
503 	assert(oldto != NULL);
504 	predecessor = a->inchainRev;
505 	if (predecessor == NULL)
506 	{
507 		assert(oldto->ins == a);
508 		oldto->ins = a->inchain;
509 	}
510 	else
511 	{
512 		assert(predecessor->inchain == a);
513 		predecessor->inchain = a->inchain;
514 	}
515 	if (a->inchain != NULL)
516 	{
517 		assert(a->inchain->inchainRev == a);
518 		a->inchain->inchainRev = predecessor;
519 	}
520 	oldto->nins--;
521 
522 	a->to = newto;
523 
524 	/* prepend it to new target's in-chain */
525 	a->inchain = newto->ins;
526 	a->inchainRev = NULL;
527 	if (newto->ins)
528 		newto->ins->inchainRev = a;
529 	newto->ins = a;
530 	newto->nins++;
531 }
532 
533 /*
534  * hasnonemptyout - Does state have a non-EMPTY out arc?
535  */
536 static int
hasnonemptyout(struct state * s)537 hasnonemptyout(struct state * s)
538 {
539 	struct arc *a;
540 
541 	for (a = s->outs; a != NULL; a = a->outchain)
542 	{
543 		if (a->type != EMPTY)
544 			return 1;
545 	}
546 	return 0;
547 }
548 
549 /*
550  * findarc - find arc, if any, from given source with given type and color
551  * If there is more than one such arc, the result is random.
552  */
553 static struct arc *
findarc(struct state * s,int type,pcolor co)554 findarc(struct state * s,
555 		int type,
556 		pcolor co)
557 {
558 	struct arc *a;
559 
560 	for (a = s->outs; a != NULL; a = a->outchain)
561 		if (a->type == type && a->co == co)
562 			return a;
563 	return NULL;
564 }
565 
566 /*
567  * cparc - allocate a new arc within an NFA, copying details from old one
568  */
569 static void
cparc(struct nfa * nfa,struct arc * oa,struct state * from,struct state * to)570 cparc(struct nfa * nfa,
571 	  struct arc * oa,
572 	  struct state * from,
573 	  struct state * to)
574 {
575 	newarc(nfa, oa->type, oa->co, from, to);
576 }
577 
578 /*
579  * sortins - sort the in arcs of a state by from/color/type
580  */
581 static void
sortins(struct nfa * nfa,struct state * s)582 sortins(struct nfa * nfa,
583 		struct state * s)
584 {
585 	struct arc **sortarray;
586 	struct arc *a;
587 	int			n = s->nins;
588 	int			i;
589 
590 	if (n <= 1)
591 		return;					/* nothing to do */
592 	/* make an array of arc pointers ... */
593 	sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
594 	if (sortarray == NULL)
595 	{
596 		NERR(REG_ESPACE);
597 		return;
598 	}
599 	i = 0;
600 	for (a = s->ins; a != NULL; a = a->inchain)
601 		sortarray[i++] = a;
602 	assert(i == n);
603 	/* ... sort the array */
604 	qsort(sortarray, n, sizeof(struct arc *), sortins_cmp);
605 	/* ... and rebuild arc list in order */
606 	/* it seems worth special-casing first and last items to simplify loop */
607 	a = sortarray[0];
608 	s->ins = a;
609 	a->inchain = sortarray[1];
610 	a->inchainRev = NULL;
611 	for (i = 1; i < n - 1; i++)
612 	{
613 		a = sortarray[i];
614 		a->inchain = sortarray[i + 1];
615 		a->inchainRev = sortarray[i - 1];
616 	}
617 	a = sortarray[i];
618 	a->inchain = NULL;
619 	a->inchainRev = sortarray[i - 1];
620 	FREE(sortarray);
621 }
622 
623 static int
sortins_cmp(const void * a,const void * b)624 sortins_cmp(const void *a, const void *b)
625 {
626 	const struct arc *aa = *((const struct arc * const *) a);
627 	const struct arc *bb = *((const struct arc * const *) b);
628 
629 	/* we check the fields in the order they are most likely to be different */
630 	if (aa->from->no < bb->from->no)
631 		return -1;
632 	if (aa->from->no > bb->from->no)
633 		return 1;
634 	if (aa->co < bb->co)
635 		return -1;
636 	if (aa->co > bb->co)
637 		return 1;
638 	if (aa->type < bb->type)
639 		return -1;
640 	if (aa->type > bb->type)
641 		return 1;
642 	return 0;
643 }
644 
645 /*
646  * sortouts - sort the out arcs of a state by to/color/type
647  */
648 static void
sortouts(struct nfa * nfa,struct state * s)649 sortouts(struct nfa * nfa,
650 		 struct state * s)
651 {
652 	struct arc **sortarray;
653 	struct arc *a;
654 	int			n = s->nouts;
655 	int			i;
656 
657 	if (n <= 1)
658 		return;					/* nothing to do */
659 	/* make an array of arc pointers ... */
660 	sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
661 	if (sortarray == NULL)
662 	{
663 		NERR(REG_ESPACE);
664 		return;
665 	}
666 	i = 0;
667 	for (a = s->outs; a != NULL; a = a->outchain)
668 		sortarray[i++] = a;
669 	assert(i == n);
670 	/* ... sort the array */
671 	qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp);
672 	/* ... and rebuild arc list in order */
673 	/* it seems worth special-casing first and last items to simplify loop */
674 	a = sortarray[0];
675 	s->outs = a;
676 	a->outchain = sortarray[1];
677 	a->outchainRev = NULL;
678 	for (i = 1; i < n - 1; i++)
679 	{
680 		a = sortarray[i];
681 		a->outchain = sortarray[i + 1];
682 		a->outchainRev = sortarray[i - 1];
683 	}
684 	a = sortarray[i];
685 	a->outchain = NULL;
686 	a->outchainRev = sortarray[i - 1];
687 	FREE(sortarray);
688 }
689 
690 static int
sortouts_cmp(const void * a,const void * b)691 sortouts_cmp(const void *a, const void *b)
692 {
693 	const struct arc *aa = *((const struct arc * const *) a);
694 	const struct arc *bb = *((const struct arc * const *) b);
695 
696 	/* we check the fields in the order they are most likely to be different */
697 	if (aa->to->no < bb->to->no)
698 		return -1;
699 	if (aa->to->no > bb->to->no)
700 		return 1;
701 	if (aa->co < bb->co)
702 		return -1;
703 	if (aa->co > bb->co)
704 		return 1;
705 	if (aa->type < bb->type)
706 		return -1;
707 	if (aa->type > bb->type)
708 		return 1;
709 	return 0;
710 }
711 
712 /*
713  * Common decision logic about whether to use arc-by-arc operations or
714  * sort/merge.  If there's just a few source arcs we cannot recoup the
715  * cost of sorting the destination arc list, no matter how large it is.
716  * Otherwise, limit the number of arc-by-arc comparisons to about 1000
717  * (a somewhat arbitrary choice, but the breakeven point would probably
718  * be machine dependent anyway).
719  */
720 #define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
721 	((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32))
722 
723 /*
724  * moveins - move all in arcs of a state to another state
725  *
726  * You might think this could be done better by just updating the
727  * existing arcs, and you would be right if it weren't for the need
728  * for duplicate suppression, which makes it easier to just make new
729  * ones to exploit the suppression built into newarc.
730  *
731  * However, if we have a whole lot of arcs to deal with, retail duplicate
732  * checks become too slow.  In that case we proceed by sorting and merging
733  * the arc lists, and then we can indeed just update the arcs in-place.
734  */
735 static void
moveins(struct nfa * nfa,struct state * oldState,struct state * newState)736 moveins(struct nfa * nfa,
737 		struct state * oldState,
738 		struct state * newState)
739 {
740 	assert(oldState != newState);
741 
742 	if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
743 	{
744 		/* With not too many arcs, just do them one at a time */
745 		struct arc *a;
746 
747 		while ((a = oldState->ins) != NULL)
748 		{
749 			cparc(nfa, a, a->from, newState);
750 			freearc(nfa, a);
751 		}
752 	}
753 	else
754 	{
755 		/*
756 		 * With many arcs, use a sort-merge approach.  Note changearctarget()
757 		 * will put the arc onto the front of newState's chain, so it does not
758 		 * break our walk through the sorted part of the chain.
759 		 */
760 		struct arc *oa;
761 		struct arc *na;
762 
763 		/*
764 		 * Because we bypass newarc() in this code path, we'd better include a
765 		 * cancel check.
766 		 */
767 		if (CANCEL_REQUESTED(nfa->v->re))
768 		{
769 			NERR(REG_CANCEL);
770 			return;
771 		}
772 
773 		sortins(nfa, oldState);
774 		sortins(nfa, newState);
775 		if (NISERR())
776 			return;				/* might have failed to sort */
777 		oa = oldState->ins;
778 		na = newState->ins;
779 		while (oa != NULL && na != NULL)
780 		{
781 			struct arc *a = oa;
782 
783 			switch (sortins_cmp(&oa, &na))
784 			{
785 				case -1:
786 					/* newState does not have anything matching oa */
787 					oa = oa->inchain;
788 
789 					/*
790 					 * Rather than doing createarc+freearc, we can just unlink
791 					 * and relink the existing arc struct.
792 					 */
793 					changearctarget(a, newState);
794 					break;
795 				case 0:
796 					/* match, advance in both lists */
797 					oa = oa->inchain;
798 					na = na->inchain;
799 					/* ... and drop duplicate arc from oldState */
800 					freearc(nfa, a);
801 					break;
802 				case +1:
803 					/* advance only na; oa might have a match later */
804 					na = na->inchain;
805 					break;
806 				default:
807 					assert(NOTREACHED);
808 			}
809 		}
810 		while (oa != NULL)
811 		{
812 			/* newState does not have anything matching oa */
813 			struct arc *a = oa;
814 
815 			oa = oa->inchain;
816 			changearctarget(a, newState);
817 		}
818 	}
819 
820 	assert(oldState->nins == 0);
821 	assert(oldState->ins == NULL);
822 }
823 
824 /*
825  * copyins - copy in arcs of a state to another state
826  */
827 static void
copyins(struct nfa * nfa,struct state * oldState,struct state * newState)828 copyins(struct nfa * nfa,
829 		struct state * oldState,
830 		struct state * newState)
831 {
832 	assert(oldState != newState);
833 
834 	if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
835 	{
836 		/* With not too many arcs, just do them one at a time */
837 		struct arc *a;
838 
839 		for (a = oldState->ins; a != NULL; a = a->inchain)
840 			cparc(nfa, a, a->from, newState);
841 	}
842 	else
843 	{
844 		/*
845 		 * With many arcs, use a sort-merge approach.  Note that createarc()
846 		 * will put new arcs onto the front of newState's chain, so it does
847 		 * not break our walk through the sorted part of the chain.
848 		 */
849 		struct arc *oa;
850 		struct arc *na;
851 
852 		/*
853 		 * Because we bypass newarc() in this code path, we'd better include a
854 		 * cancel check.
855 		 */
856 		if (CANCEL_REQUESTED(nfa->v->re))
857 		{
858 			NERR(REG_CANCEL);
859 			return;
860 		}
861 
862 		sortins(nfa, oldState);
863 		sortins(nfa, newState);
864 		if (NISERR())
865 			return;				/* might have failed to sort */
866 		oa = oldState->ins;
867 		na = newState->ins;
868 		while (oa != NULL && na != NULL)
869 		{
870 			struct arc *a = oa;
871 
872 			switch (sortins_cmp(&oa, &na))
873 			{
874 				case -1:
875 					/* newState does not have anything matching oa */
876 					oa = oa->inchain;
877 					createarc(nfa, a->type, a->co, a->from, newState);
878 					break;
879 				case 0:
880 					/* match, advance in both lists */
881 					oa = oa->inchain;
882 					na = na->inchain;
883 					break;
884 				case +1:
885 					/* advance only na; oa might have a match later */
886 					na = na->inchain;
887 					break;
888 				default:
889 					assert(NOTREACHED);
890 			}
891 		}
892 		while (oa != NULL)
893 		{
894 			/* newState does not have anything matching oa */
895 			struct arc *a = oa;
896 
897 			oa = oa->inchain;
898 			createarc(nfa, a->type, a->co, a->from, newState);
899 		}
900 	}
901 }
902 
903 /*
904  * mergeins - merge a list of inarcs into a state
905  *
906  * This is much like copyins, but the source arcs are listed in an array,
907  * and are not guaranteed unique.  It's okay to clobber the array contents.
908  */
909 static void
mergeins(struct nfa * nfa,struct state * s,struct arc ** arcarray,int arccount)910 mergeins(struct nfa * nfa,
911 		 struct state * s,
912 		 struct arc ** arcarray,
913 		 int arccount)
914 {
915 	struct arc *na;
916 	int			i;
917 	int			j;
918 
919 	if (arccount <= 0)
920 		return;
921 
922 	/*
923 	 * Because we bypass newarc() in this code path, we'd better include a
924 	 * cancel check.
925 	 */
926 	if (CANCEL_REQUESTED(nfa->v->re))
927 	{
928 		NERR(REG_CANCEL);
929 		return;
930 	}
931 
932 	/* Sort existing inarcs as well as proposed new ones */
933 	sortins(nfa, s);
934 	if (NISERR())
935 		return;					/* might have failed to sort */
936 
937 	qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
938 
939 	/*
940 	 * arcarray very likely includes dups, so we must eliminate them.  (This
941 	 * could be folded into the next loop, but it's not worth the trouble.)
942 	 */
943 	j = 0;
944 	for (i = 1; i < arccount; i++)
945 	{
946 		switch (sortins_cmp(&arcarray[j], &arcarray[i]))
947 		{
948 			case -1:
949 				/* non-dup */
950 				arcarray[++j] = arcarray[i];
951 				break;
952 			case 0:
953 				/* dup */
954 				break;
955 			default:
956 				/* trouble */
957 				assert(NOTREACHED);
958 		}
959 	}
960 	arccount = j + 1;
961 
962 	/*
963 	 * Now merge into s' inchain.  Note that createarc() will put new arcs
964 	 * onto the front of s's chain, so it does not break our walk through the
965 	 * sorted part of the chain.
966 	 */
967 	i = 0;
968 	na = s->ins;
969 	while (i < arccount && na != NULL)
970 	{
971 		struct arc *a = arcarray[i];
972 
973 		switch (sortins_cmp(&a, &na))
974 		{
975 			case -1:
976 				/* s does not have anything matching a */
977 				createarc(nfa, a->type, a->co, a->from, s);
978 				i++;
979 				break;
980 			case 0:
981 				/* match, advance in both lists */
982 				i++;
983 				na = na->inchain;
984 				break;
985 			case +1:
986 				/* advance only na; array might have a match later */
987 				na = na->inchain;
988 				break;
989 			default:
990 				assert(NOTREACHED);
991 		}
992 	}
993 	while (i < arccount)
994 	{
995 		/* s does not have anything matching a */
996 		struct arc *a = arcarray[i];
997 
998 		createarc(nfa, a->type, a->co, a->from, s);
999 		i++;
1000 	}
1001 }
1002 
1003 /*
1004  * moveouts - move all out arcs of a state to another state
1005  */
1006 static void
moveouts(struct nfa * nfa,struct state * oldState,struct state * newState)1007 moveouts(struct nfa * nfa,
1008 		 struct state * oldState,
1009 		 struct state * newState)
1010 {
1011 	assert(oldState != newState);
1012 
1013 	if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1014 	{
1015 		/* With not too many arcs, just do them one at a time */
1016 		struct arc *a;
1017 
1018 		while ((a = oldState->outs) != NULL)
1019 		{
1020 			cparc(nfa, a, newState, a->to);
1021 			freearc(nfa, a);
1022 		}
1023 	}
1024 	else
1025 	{
1026 		/*
1027 		 * With many arcs, use a sort-merge approach.  Note that createarc()
1028 		 * will put new arcs onto the front of newState's chain, so it does
1029 		 * not break our walk through the sorted part of the chain.
1030 		 */
1031 		struct arc *oa;
1032 		struct arc *na;
1033 
1034 		/*
1035 		 * Because we bypass newarc() in this code path, we'd better include a
1036 		 * cancel check.
1037 		 */
1038 		if (CANCEL_REQUESTED(nfa->v->re))
1039 		{
1040 			NERR(REG_CANCEL);
1041 			return;
1042 		}
1043 
1044 		sortouts(nfa, oldState);
1045 		sortouts(nfa, newState);
1046 		if (NISERR())
1047 			return;				/* might have failed to sort */
1048 		oa = oldState->outs;
1049 		na = newState->outs;
1050 		while (oa != NULL && na != NULL)
1051 		{
1052 			struct arc *a = oa;
1053 
1054 			switch (sortouts_cmp(&oa, &na))
1055 			{
1056 				case -1:
1057 					/* newState does not have anything matching oa */
1058 					oa = oa->outchain;
1059 					createarc(nfa, a->type, a->co, newState, a->to);
1060 					freearc(nfa, a);
1061 					break;
1062 				case 0:
1063 					/* match, advance in both lists */
1064 					oa = oa->outchain;
1065 					na = na->outchain;
1066 					/* ... and drop duplicate arc from oldState */
1067 					freearc(nfa, a);
1068 					break;
1069 				case +1:
1070 					/* advance only na; oa might have a match later */
1071 					na = na->outchain;
1072 					break;
1073 				default:
1074 					assert(NOTREACHED);
1075 			}
1076 		}
1077 		while (oa != NULL)
1078 		{
1079 			/* newState does not have anything matching oa */
1080 			struct arc *a = oa;
1081 
1082 			oa = oa->outchain;
1083 			createarc(nfa, a->type, a->co, newState, a->to);
1084 			freearc(nfa, a);
1085 		}
1086 	}
1087 
1088 	assert(oldState->nouts == 0);
1089 	assert(oldState->outs == NULL);
1090 }
1091 
1092 /*
1093  * copyouts - copy out arcs of a state to another state
1094  */
1095 static void
copyouts(struct nfa * nfa,struct state * oldState,struct state * newState)1096 copyouts(struct nfa * nfa,
1097 		 struct state * oldState,
1098 		 struct state * newState)
1099 {
1100 	assert(oldState != newState);
1101 
1102 	if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1103 	{
1104 		/* With not too many arcs, just do them one at a time */
1105 		struct arc *a;
1106 
1107 		for (a = oldState->outs; a != NULL; a = a->outchain)
1108 			cparc(nfa, a, newState, a->to);
1109 	}
1110 	else
1111 	{
1112 		/*
1113 		 * With many arcs, use a sort-merge approach.  Note that createarc()
1114 		 * will put new arcs onto the front of newState's chain, so it does
1115 		 * not break our walk through the sorted part of the chain.
1116 		 */
1117 		struct arc *oa;
1118 		struct arc *na;
1119 
1120 		/*
1121 		 * Because we bypass newarc() in this code path, we'd better include a
1122 		 * cancel check.
1123 		 */
1124 		if (CANCEL_REQUESTED(nfa->v->re))
1125 		{
1126 			NERR(REG_CANCEL);
1127 			return;
1128 		}
1129 
1130 		sortouts(nfa, oldState);
1131 		sortouts(nfa, newState);
1132 		if (NISERR())
1133 			return;				/* might have failed to sort */
1134 		oa = oldState->outs;
1135 		na = newState->outs;
1136 		while (oa != NULL && na != NULL)
1137 		{
1138 			struct arc *a = oa;
1139 
1140 			switch (sortouts_cmp(&oa, &na))
1141 			{
1142 				case -1:
1143 					/* newState does not have anything matching oa */
1144 					oa = oa->outchain;
1145 					createarc(nfa, a->type, a->co, newState, a->to);
1146 					break;
1147 				case 0:
1148 					/* match, advance in both lists */
1149 					oa = oa->outchain;
1150 					na = na->outchain;
1151 					break;
1152 				case +1:
1153 					/* advance only na; oa might have a match later */
1154 					na = na->outchain;
1155 					break;
1156 				default:
1157 					assert(NOTREACHED);
1158 			}
1159 		}
1160 		while (oa != NULL)
1161 		{
1162 			/* newState does not have anything matching oa */
1163 			struct arc *a = oa;
1164 
1165 			oa = oa->outchain;
1166 			createarc(nfa, a->type, a->co, newState, a->to);
1167 		}
1168 	}
1169 }
1170 
1171 /*
1172  * cloneouts - copy out arcs of a state to another state pair, modifying type
1173  */
1174 static void
cloneouts(struct nfa * nfa,struct state * old,struct state * from,struct state * to,int type)1175 cloneouts(struct nfa * nfa,
1176 		  struct state * old,
1177 		  struct state * from,
1178 		  struct state * to,
1179 		  int type)
1180 {
1181 	struct arc *a;
1182 
1183 	assert(old != from);
1184 
1185 	for (a = old->outs; a != NULL; a = a->outchain)
1186 		newarc(nfa, type, a->co, from, to);
1187 }
1188 
1189 /*
1190  * delsub - delete a sub-NFA, updating subre pointers if necessary
1191  *
1192  * This uses a recursive traversal of the sub-NFA, marking already-seen
1193  * states using their tmp pointer.
1194  */
1195 static void
delsub(struct nfa * nfa,struct state * lp,struct state * rp)1196 delsub(struct nfa * nfa,
1197 	   struct state * lp,		/* the sub-NFA goes from here... */
1198 	   struct state * rp)		/* ...to here, *not* inclusive */
1199 {
1200 	assert(lp != rp);
1201 
1202 	rp->tmp = rp;				/* mark end */
1203 
1204 	deltraverse(nfa, lp, lp);
1205 	if (NISERR())
1206 		return;					/* asserts might not hold after failure */
1207 	assert(lp->nouts == 0 && rp->nins == 0);	/* did the job */
1208 	assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
1209 
1210 	rp->tmp = NULL;				/* unmark end */
1211 	lp->tmp = NULL;				/* and begin, marked by deltraverse */
1212 }
1213 
1214 /*
1215  * deltraverse - the recursive heart of delsub
1216  * This routine's basic job is to destroy all out-arcs of the state.
1217  */
1218 static void
deltraverse(struct nfa * nfa,struct state * leftend,struct state * s)1219 deltraverse(struct nfa * nfa,
1220 			struct state * leftend,
1221 			struct state * s)
1222 {
1223 	struct arc *a;
1224 	struct state *to;
1225 
1226 	/* Since this is recursive, it could be driven to stack overflow */
1227 	if (STACK_TOO_DEEP(nfa->v->re))
1228 	{
1229 		NERR(REG_ETOOBIG);
1230 		return;
1231 	}
1232 
1233 	if (s->nouts == 0)
1234 		return;					/* nothing to do */
1235 	if (s->tmp != NULL)
1236 		return;					/* already in progress */
1237 
1238 	s->tmp = s;					/* mark as in progress */
1239 
1240 	while ((a = s->outs) != NULL)
1241 	{
1242 		to = a->to;
1243 		deltraverse(nfa, leftend, to);
1244 		if (NISERR())
1245 			return;				/* asserts might not hold after failure */
1246 		assert(to->nouts == 0 || to->tmp != NULL);
1247 		freearc(nfa, a);
1248 		if (to->nins == 0 && to->tmp == NULL)
1249 		{
1250 			assert(to->nouts == 0);
1251 			freestate(nfa, to);
1252 		}
1253 	}
1254 
1255 	assert(s->no != FREESTATE); /* we're still here */
1256 	assert(s == leftend || s->nins != 0);		/* and still reachable */
1257 	assert(s->nouts == 0);		/* but have no outarcs */
1258 
1259 	s->tmp = NULL;				/* we're done here */
1260 }
1261 
1262 /*
1263  * dupnfa - duplicate sub-NFA
1264  *
1265  * Another recursive traversal, this time using tmp to point to duplicates
1266  * as well as mark already-seen states.  (You knew there was a reason why
1267  * it's a state pointer, didn't you? :-))
1268  */
1269 static void
dupnfa(struct nfa * nfa,struct state * start,struct state * stop,struct state * from,struct state * to)1270 dupnfa(struct nfa * nfa,
1271 	   struct state * start,	/* duplicate of subNFA starting here */
1272 	   struct state * stop,		/* and stopping here */
1273 	   struct state * from,		/* stringing duplicate from here */
1274 	   struct state * to)		/* to here */
1275 {
1276 	if (start == stop)
1277 	{
1278 		newarc(nfa, EMPTY, 0, from, to);
1279 		return;
1280 	}
1281 
1282 	stop->tmp = to;
1283 	duptraverse(nfa, start, from);
1284 	/* done, except for clearing out the tmp pointers */
1285 
1286 	stop->tmp = NULL;
1287 	cleartraverse(nfa, start);
1288 }
1289 
1290 /*
1291  * duptraverse - recursive heart of dupnfa
1292  */
1293 static void
duptraverse(struct nfa * nfa,struct state * s,struct state * stmp)1294 duptraverse(struct nfa * nfa,
1295 			struct state * s,
1296 			struct state * stmp)	/* s's duplicate, or NULL */
1297 {
1298 	struct arc *a;
1299 
1300 	/* Since this is recursive, it could be driven to stack overflow */
1301 	if (STACK_TOO_DEEP(nfa->v->re))
1302 	{
1303 		NERR(REG_ETOOBIG);
1304 		return;
1305 	}
1306 
1307 	if (s->tmp != NULL)
1308 		return;					/* already done */
1309 
1310 	s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
1311 	if (s->tmp == NULL)
1312 	{
1313 		assert(NISERR());
1314 		return;
1315 	}
1316 
1317 	for (a = s->outs; a != NULL && !NISERR(); a = a->outchain)
1318 	{
1319 		duptraverse(nfa, a->to, (struct state *) NULL);
1320 		if (NISERR())
1321 			break;
1322 		assert(a->to->tmp != NULL);
1323 		cparc(nfa, a, s->tmp, a->to->tmp);
1324 	}
1325 }
1326 
1327 /*
1328  * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
1329  */
1330 static void
cleartraverse(struct nfa * nfa,struct state * s)1331 cleartraverse(struct nfa * nfa,
1332 			  struct state * s)
1333 {
1334 	struct arc *a;
1335 
1336 	/* Since this is recursive, it could be driven to stack overflow */
1337 	if (STACK_TOO_DEEP(nfa->v->re))
1338 	{
1339 		NERR(REG_ETOOBIG);
1340 		return;
1341 	}
1342 
1343 	if (s->tmp == NULL)
1344 		return;
1345 	s->tmp = NULL;
1346 
1347 	for (a = s->outs; a != NULL; a = a->outchain)
1348 		cleartraverse(nfa, a->to);
1349 }
1350 
1351 /*
1352  * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
1353  *
1354  * If traversing from s1 to s2 requires a single PLAIN match (possibly of any
1355  * of a set of colors), return a state whose outarc list contains only PLAIN
1356  * arcs of those color(s).  Otherwise return NULL.
1357  *
1358  * This is used before optimizing the NFA, so there may be EMPTY arcs, which
1359  * we should ignore; the possibility of an EMPTY is why the result state could
1360  * be different from s1.
1361  *
1362  * It's worth troubling to handle multiple parallel PLAIN arcs here because a
1363  * bracket construct such as [abc] might yield either one or several parallel
1364  * PLAIN arcs depending on earlier atoms in the expression.  We'd rather that
1365  * that implementation detail not create user-visible performance differences.
1366  */
1367 static struct state *
single_color_transition(struct state * s1,struct state * s2)1368 single_color_transition(struct state * s1, struct state * s2)
1369 {
1370 	struct arc *a;
1371 
1372 	/* Ignore leading EMPTY arc, if any */
1373 	if (s1->nouts == 1 && s1->outs->type == EMPTY)
1374 		s1 = s1->outs->to;
1375 	/* Likewise for any trailing EMPTY arc */
1376 	if (s2->nins == 1 && s2->ins->type == EMPTY)
1377 		s2 = s2->ins->from;
1378 	/* Perhaps we could have a single-state loop in between, if so reject */
1379 	if (s1 == s2)
1380 		return NULL;
1381 	/* s1 must have at least one outarc... */
1382 	if (s1->outs == NULL)
1383 		return NULL;
1384 	/* ... and they must all be PLAIN arcs to s2 */
1385 	for (a = s1->outs; a != NULL; a = a->outchain)
1386 	{
1387 		if (a->type != PLAIN || a->to != s2)
1388 			return NULL;
1389 	}
1390 	/* OK, return s1 as the possessor of the relevant outarcs */
1391 	return s1;
1392 }
1393 
1394 /*
1395  * specialcolors - fill in special colors for an NFA
1396  */
1397 static void
specialcolors(struct nfa * nfa)1398 specialcolors(struct nfa * nfa)
1399 {
1400 	/* false colors for BOS, BOL, EOS, EOL */
1401 	if (nfa->parent == NULL)
1402 	{
1403 		nfa->bos[0] = pseudocolor(nfa->cm);
1404 		nfa->bos[1] = pseudocolor(nfa->cm);
1405 		nfa->eos[0] = pseudocolor(nfa->cm);
1406 		nfa->eos[1] = pseudocolor(nfa->cm);
1407 	}
1408 	else
1409 	{
1410 		assert(nfa->parent->bos[0] != COLORLESS);
1411 		nfa->bos[0] = nfa->parent->bos[0];
1412 		assert(nfa->parent->bos[1] != COLORLESS);
1413 		nfa->bos[1] = nfa->parent->bos[1];
1414 		assert(nfa->parent->eos[0] != COLORLESS);
1415 		nfa->eos[0] = nfa->parent->eos[0];
1416 		assert(nfa->parent->eos[1] != COLORLESS);
1417 		nfa->eos[1] = nfa->parent->eos[1];
1418 	}
1419 }
1420 
1421 /*
1422  * optimize - optimize an NFA
1423  *
1424  * The main goal of this function is not so much "optimization" (though it
1425  * does try to get rid of useless NFA states) as reducing the NFA to a form
1426  * the regex executor can handle.  The executor, and indeed the cNFA format
1427  * that is its input, can only handle PLAIN and LACON arcs.  The output of
1428  * the regex parser also includes EMPTY (do-nothing) arcs, as well as
1429  * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here.
1430  * We first get rid of EMPTY arcs and then deal with the constraint arcs.
1431  * The hardest part of either job is to get rid of circular loops of the
1432  * target arc type.  We would have to do that in any case, though, as such a
1433  * loop would otherwise allow the executor to cycle through the loop endlessly
1434  * without making any progress in the input string.
1435  */
1436 static long						/* re_info bits */
optimize(struct nfa * nfa,FILE * f)1437 optimize(struct nfa * nfa,
1438 		 FILE *f)				/* for debug output; NULL none */
1439 {
1440 #ifdef REG_DEBUG
1441 	int			verbose = (f != NULL) ? 1 : 0;
1442 
1443 	if (verbose)
1444 		fprintf(f, "\ninitial cleanup:\n");
1445 #endif
1446 	cleanup(nfa);				/* may simplify situation */
1447 #ifdef REG_DEBUG
1448 	if (verbose)
1449 		dumpnfa(nfa, f);
1450 	if (verbose)
1451 		fprintf(f, "\nempties:\n");
1452 #endif
1453 	fixempties(nfa, f);			/* get rid of EMPTY arcs */
1454 #ifdef REG_DEBUG
1455 	if (verbose)
1456 		fprintf(f, "\nconstraints:\n");
1457 #endif
1458 	fixconstraintloops(nfa, f); /* get rid of constraint loops */
1459 	pullback(nfa, f);			/* pull back constraints backward */
1460 	pushfwd(nfa, f);			/* push fwd constraints forward */
1461 #ifdef REG_DEBUG
1462 	if (verbose)
1463 		fprintf(f, "\nfinal cleanup:\n");
1464 #endif
1465 	cleanup(nfa);				/* final tidying */
1466 #ifdef REG_DEBUG
1467 	if (verbose)
1468 		dumpnfa(nfa, f);
1469 #endif
1470 	return analyze(nfa);		/* and analysis */
1471 }
1472 
1473 /*
1474  * pullback - pull back constraints backward to eliminate them
1475  */
1476 static void
pullback(struct nfa * nfa,FILE * f)1477 pullback(struct nfa * nfa,
1478 		 FILE *f)				/* for debug output; NULL none */
1479 {
1480 	struct state *s;
1481 	struct state *nexts;
1482 	struct arc *a;
1483 	struct arc *nexta;
1484 	struct state *intermediates;
1485 	int			progress;
1486 
1487 	/* find and pull until there are no more */
1488 	do
1489 	{
1490 		progress = 0;
1491 		for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1492 		{
1493 			nexts = s->next;
1494 			intermediates = NULL;
1495 			for (a = s->outs; a != NULL && !NISERR(); a = nexta)
1496 			{
1497 				nexta = a->outchain;
1498 				if (a->type == '^' || a->type == BEHIND)
1499 					if (pull(nfa, a, &intermediates))
1500 						progress = 1;
1501 			}
1502 			/* clear tmp fields of intermediate states created here */
1503 			while (intermediates != NULL)
1504 			{
1505 				struct state *ns = intermediates->tmp;
1506 
1507 				intermediates->tmp = NULL;
1508 				intermediates = ns;
1509 			}
1510 			/* if s is now useless, get rid of it */
1511 			if ((s->nins == 0 || s->nouts == 0) && !s->flag)
1512 				dropstate(nfa, s);
1513 		}
1514 		if (progress && f != NULL)
1515 			dumpnfa(nfa, f);
1516 	} while (progress && !NISERR());
1517 	if (NISERR())
1518 		return;
1519 
1520 	/*
1521 	 * Any ^ constraints we were able to pull to the start state can now be
1522 	 * replaced by PLAIN arcs referencing the BOS or BOL colors.  There should
1523 	 * be no other ^ or BEHIND arcs left in the NFA, though we do not check
1524 	 * that here (compact() will fail if so).
1525 	 */
1526 	for (a = nfa->pre->outs; a != NULL; a = nexta)
1527 	{
1528 		nexta = a->outchain;
1529 		if (a->type == '^')
1530 		{
1531 			assert(a->co == 0 || a->co == 1);
1532 			newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
1533 			freearc(nfa, a);
1534 		}
1535 	}
1536 }
1537 
1538 /*
1539  * pull - pull a back constraint backward past its source state
1540  *
1541  * Returns 1 if successful (which it always is unless the source is the
1542  * start state or we have an internal error), 0 if nothing happened.
1543  *
1544  * A significant property of this function is that it deletes no pre-existing
1545  * states, and no outarcs of the constraint's from state other than the given
1546  * constraint arc.  This makes the loops in pullback() safe, at the cost that
1547  * we may leave useless states behind.  Therefore, we leave it to pullback()
1548  * to delete such states.
1549  *
1550  * If the from state has multiple back-constraint outarcs, and/or multiple
1551  * compatible constraint inarcs, we only need to create one new intermediate
1552  * state per combination of predecessor and successor states.  *intermediates
1553  * points to a list of such intermediate states for this from state (chained
1554  * through their tmp fields).
1555  */
1556 static int
pull(struct nfa * nfa,struct arc * con,struct state ** intermediates)1557 pull(struct nfa * nfa,
1558 	 struct arc * con,
1559 	 struct state ** intermediates)
1560 {
1561 	struct state *from = con->from;
1562 	struct state *to = con->to;
1563 	struct arc *a;
1564 	struct arc *nexta;
1565 	struct state *s;
1566 
1567 	assert(from != to);			/* should have gotten rid of this earlier */
1568 	if (from->flag)				/* can't pull back beyond start */
1569 		return 0;
1570 	if (from->nins == 0)
1571 	{							/* unreachable */
1572 		freearc(nfa, con);
1573 		return 1;
1574 	}
1575 
1576 	/*
1577 	 * First, clone from state if necessary to avoid other outarcs.  This may
1578 	 * seem wasteful, but it simplifies the logic, and we'll get rid of the
1579 	 * clone state again at the bottom.
1580 	 */
1581 	if (from->nouts > 1)
1582 	{
1583 		s = newstate(nfa);
1584 		if (NISERR())
1585 			return 0;
1586 		copyins(nfa, from, s);	/* duplicate inarcs */
1587 		cparc(nfa, con, s, to); /* move constraint arc */
1588 		freearc(nfa, con);
1589 		if (NISERR())
1590 			return 0;
1591 		from = s;
1592 		con = from->outs;
1593 	}
1594 	assert(from->nouts == 1);
1595 
1596 	/* propagate the constraint into the from state's inarcs */
1597 	for (a = from->ins; a != NULL && !NISERR(); a = nexta)
1598 	{
1599 		nexta = a->inchain;
1600 		switch (combine(con, a))
1601 		{
1602 			case INCOMPATIBLE:	/* destroy the arc */
1603 				freearc(nfa, a);
1604 				break;
1605 			case SATISFIED:		/* no action needed */
1606 				break;
1607 			case COMPATIBLE:	/* swap the two arcs, more or less */
1608 				/* need an intermediate state, but might have one already */
1609 				for (s = *intermediates; s != NULL; s = s->tmp)
1610 				{
1611 					assert(s->nins > 0 && s->nouts > 0);
1612 					if (s->ins->from == a->from && s->outs->to == to)
1613 						break;
1614 				}
1615 				if (s == NULL)
1616 				{
1617 					s = newstate(nfa);
1618 					if (NISERR())
1619 						return 0;
1620 					s->tmp = *intermediates;
1621 					*intermediates = s;
1622 				}
1623 				cparc(nfa, con, a->from, s);
1624 				cparc(nfa, a, s, to);
1625 				freearc(nfa, a);
1626 				break;
1627 			default:
1628 				assert(NOTREACHED);
1629 				break;
1630 		}
1631 	}
1632 
1633 	/* remaining inarcs, if any, incorporate the constraint */
1634 	moveins(nfa, from, to);
1635 	freearc(nfa, con);
1636 	/* from state is now useless, but we leave it to pullback() to clean up */
1637 	return 1;
1638 }
1639 
1640 /*
1641  * pushfwd - push forward constraints forward to eliminate them
1642  */
1643 static void
pushfwd(struct nfa * nfa,FILE * f)1644 pushfwd(struct nfa * nfa,
1645 		FILE *f)				/* for debug output; NULL none */
1646 {
1647 	struct state *s;
1648 	struct state *nexts;
1649 	struct arc *a;
1650 	struct arc *nexta;
1651 	struct state *intermediates;
1652 	int			progress;
1653 
1654 	/* find and push until there are no more */
1655 	do
1656 	{
1657 		progress = 0;
1658 		for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1659 		{
1660 			nexts = s->next;
1661 			intermediates = NULL;
1662 			for (a = s->ins; a != NULL && !NISERR(); a = nexta)
1663 			{
1664 				nexta = a->inchain;
1665 				if (a->type == '$' || a->type == AHEAD)
1666 					if (push(nfa, a, &intermediates))
1667 						progress = 1;
1668 			}
1669 			/* clear tmp fields of intermediate states created here */
1670 			while (intermediates != NULL)
1671 			{
1672 				struct state *ns = intermediates->tmp;
1673 
1674 				intermediates->tmp = NULL;
1675 				intermediates = ns;
1676 			}
1677 			/* if s is now useless, get rid of it */
1678 			if ((s->nins == 0 || s->nouts == 0) && !s->flag)
1679 				dropstate(nfa, s);
1680 		}
1681 		if (progress && f != NULL)
1682 			dumpnfa(nfa, f);
1683 	} while (progress && !NISERR());
1684 	if (NISERR())
1685 		return;
1686 
1687 	/*
1688 	 * Any $ constraints we were able to push to the post state can now be
1689 	 * replaced by PLAIN arcs referencing the EOS or EOL colors.  There should
1690 	 * be no other $ or AHEAD arcs left in the NFA, though we do not check
1691 	 * that here (compact() will fail if so).
1692 	 */
1693 	for (a = nfa->post->ins; a != NULL; a = nexta)
1694 	{
1695 		nexta = a->inchain;
1696 		if (a->type == '$')
1697 		{
1698 			assert(a->co == 0 || a->co == 1);
1699 			newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
1700 			freearc(nfa, a);
1701 		}
1702 	}
1703 }
1704 
1705 /*
1706  * push - push a forward constraint forward past its destination state
1707  *
1708  * Returns 1 if successful (which it always is unless the destination is the
1709  * post state or we have an internal error), 0 if nothing happened.
1710  *
1711  * A significant property of this function is that it deletes no pre-existing
1712  * states, and no inarcs of the constraint's to state other than the given
1713  * constraint arc.  This makes the loops in pushfwd() safe, at the cost that
1714  * we may leave useless states behind.  Therefore, we leave it to pushfwd()
1715  * to delete such states.
1716  *
1717  * If the to state has multiple forward-constraint inarcs, and/or multiple
1718  * compatible constraint outarcs, we only need to create one new intermediate
1719  * state per combination of predecessor and successor states.  *intermediates
1720  * points to a list of such intermediate states for this to state (chained
1721  * through their tmp fields).
1722  */
1723 static int
push(struct nfa * nfa,struct arc * con,struct state ** intermediates)1724 push(struct nfa * nfa,
1725 	 struct arc * con,
1726 	 struct state ** intermediates)
1727 {
1728 	struct state *from = con->from;
1729 	struct state *to = con->to;
1730 	struct arc *a;
1731 	struct arc *nexta;
1732 	struct state *s;
1733 
1734 	assert(to != from);			/* should have gotten rid of this earlier */
1735 	if (to->flag)				/* can't push forward beyond end */
1736 		return 0;
1737 	if (to->nouts == 0)
1738 	{							/* dead end */
1739 		freearc(nfa, con);
1740 		return 1;
1741 	}
1742 
1743 	/*
1744 	 * First, clone to state if necessary to avoid other inarcs.  This may
1745 	 * seem wasteful, but it simplifies the logic, and we'll get rid of the
1746 	 * clone state again at the bottom.
1747 	 */
1748 	if (to->nins > 1)
1749 	{
1750 		s = newstate(nfa);
1751 		if (NISERR())
1752 			return 0;
1753 		copyouts(nfa, to, s);	/* duplicate outarcs */
1754 		cparc(nfa, con, from, s);		/* move constraint arc */
1755 		freearc(nfa, con);
1756 		if (NISERR())
1757 			return 0;
1758 		to = s;
1759 		con = to->ins;
1760 	}
1761 	assert(to->nins == 1);
1762 
1763 	/* propagate the constraint into the to state's outarcs */
1764 	for (a = to->outs; a != NULL && !NISERR(); a = nexta)
1765 	{
1766 		nexta = a->outchain;
1767 		switch (combine(con, a))
1768 		{
1769 			case INCOMPATIBLE:	/* destroy the arc */
1770 				freearc(nfa, a);
1771 				break;
1772 			case SATISFIED:		/* no action needed */
1773 				break;
1774 			case COMPATIBLE:	/* swap the two arcs, more or less */
1775 				/* need an intermediate state, but might have one already */
1776 				for (s = *intermediates; s != NULL; s = s->tmp)
1777 				{
1778 					assert(s->nins > 0 && s->nouts > 0);
1779 					if (s->ins->from == from && s->outs->to == a->to)
1780 						break;
1781 				}
1782 				if (s == NULL)
1783 				{
1784 					s = newstate(nfa);
1785 					if (NISERR())
1786 						return 0;
1787 					s->tmp = *intermediates;
1788 					*intermediates = s;
1789 				}
1790 				cparc(nfa, con, s, a->to);
1791 				cparc(nfa, a, from, s);
1792 				freearc(nfa, a);
1793 				break;
1794 			default:
1795 				assert(NOTREACHED);
1796 				break;
1797 		}
1798 	}
1799 
1800 	/* remaining outarcs, if any, incorporate the constraint */
1801 	moveouts(nfa, to, from);
1802 	freearc(nfa, con);
1803 	/* to state is now useless, but we leave it to pushfwd() to clean up */
1804 	return 1;
1805 }
1806 
1807 /*
1808  * combine - constraint lands on an arc, what happens?
1809  *
1810  * #def INCOMPATIBLE	1	// destroys arc
1811  * #def SATISFIED		2	// constraint satisfied
1812  * #def COMPATIBLE		3	// compatible but not satisfied yet
1813  */
1814 static int
combine(struct arc * con,struct arc * a)1815 combine(struct arc * con,
1816 		struct arc * a)
1817 {
1818 #define  CA(ct,at)	 (((ct)<<CHAR_BIT) | (at))
1819 
1820 	switch (CA(con->type, a->type))
1821 	{
1822 		case CA('^', PLAIN):	/* newlines are handled separately */
1823 		case CA('$', PLAIN):
1824 			return INCOMPATIBLE;
1825 			break;
1826 		case CA(AHEAD, PLAIN):	/* color constraints meet colors */
1827 		case CA(BEHIND, PLAIN):
1828 			if (con->co == a->co)
1829 				return SATISFIED;
1830 			return INCOMPATIBLE;
1831 			break;
1832 		case CA('^', '^'):		/* collision, similar constraints */
1833 		case CA('$', '$'):
1834 		case CA(AHEAD, AHEAD):
1835 		case CA(BEHIND, BEHIND):
1836 			if (con->co == a->co)		/* true duplication */
1837 				return SATISFIED;
1838 			return INCOMPATIBLE;
1839 			break;
1840 		case CA('^', BEHIND):	/* collision, dissimilar constraints */
1841 		case CA(BEHIND, '^'):
1842 		case CA('$', AHEAD):
1843 		case CA(AHEAD, '$'):
1844 			return INCOMPATIBLE;
1845 			break;
1846 		case CA('^', '$'):		/* constraints passing each other */
1847 		case CA('^', AHEAD):
1848 		case CA(BEHIND, '$'):
1849 		case CA(BEHIND, AHEAD):
1850 		case CA('$', '^'):
1851 		case CA('$', BEHIND):
1852 		case CA(AHEAD, '^'):
1853 		case CA(AHEAD, BEHIND):
1854 		case CA('^', LACON):
1855 		case CA(BEHIND, LACON):
1856 		case CA('$', LACON):
1857 		case CA(AHEAD, LACON):
1858 			return COMPATIBLE;
1859 			break;
1860 	}
1861 	assert(NOTREACHED);
1862 	return INCOMPATIBLE;		/* for benefit of blind compilers */
1863 }
1864 
1865 /*
1866  * fixempties - get rid of EMPTY arcs
1867  */
1868 static void
fixempties(struct nfa * nfa,FILE * f)1869 fixempties(struct nfa * nfa,
1870 		   FILE *f)				/* for debug output; NULL none */
1871 {
1872 	struct state *s;
1873 	struct state *s2;
1874 	struct state *nexts;
1875 	struct arc *a;
1876 	struct arc *nexta;
1877 	int			totalinarcs;
1878 	struct arc **inarcsorig;
1879 	struct arc **arcarray;
1880 	int			arccount;
1881 	int			prevnins;
1882 	int			nskip;
1883 
1884 	/*
1885 	 * First, get rid of any states whose sole out-arc is an EMPTY, since
1886 	 * they're basically just aliases for their successor.  The parsing
1887 	 * algorithm creates enough of these that it's worth special-casing this.
1888 	 */
1889 	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1890 	{
1891 		nexts = s->next;
1892 		if (s->flag || s->nouts != 1)
1893 			continue;
1894 		a = s->outs;
1895 		assert(a != NULL && a->outchain == NULL);
1896 		if (a->type != EMPTY)
1897 			continue;
1898 		if (s != a->to)
1899 			moveins(nfa, s, a->to);
1900 		dropstate(nfa, s);
1901 	}
1902 
1903 	/*
1904 	 * Similarly, get rid of any state with a single EMPTY in-arc, by folding
1905 	 * it into its predecessor.
1906 	 */
1907 	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1908 	{
1909 		nexts = s->next;
1910 		/* while we're at it, ensure tmp fields are clear for next step */
1911 		assert(s->tmp == NULL);
1912 		if (s->flag || s->nins != 1)
1913 			continue;
1914 		a = s->ins;
1915 		assert(a != NULL && a->inchain == NULL);
1916 		if (a->type != EMPTY)
1917 			continue;
1918 		if (s != a->from)
1919 			moveouts(nfa, s, a->from);
1920 		dropstate(nfa, s);
1921 	}
1922 
1923 	if (NISERR())
1924 		return;
1925 
1926 	/*
1927 	 * For each remaining NFA state, find all other states from which it is
1928 	 * reachable by a chain of one or more EMPTY arcs.  Then generate new arcs
1929 	 * that eliminate the need for each such chain.
1930 	 *
1931 	 * We could replace a chain of EMPTY arcs that leads from a "from" state
1932 	 * to a "to" state either by pushing non-EMPTY arcs forward (linking
1933 	 * directly from "from"'s predecessors to "to") or by pulling them back
1934 	 * (linking directly from "from" to "to"'s successors).  We choose to
1935 	 * always do the former; this choice is somewhat arbitrary, but the
1936 	 * approach below requires that we uniformly do one or the other.
1937 	 *
1938 	 * Suppose we have a chain of N successive EMPTY arcs (where N can easily
1939 	 * approach the size of the NFA).  All of the intermediate states must
1940 	 * have additional inarcs and outarcs, else they'd have been removed by
1941 	 * the steps above.  Assuming their inarcs are mostly not empties, we will
1942 	 * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
1943 	 * state in the chain must be duplicated to lead to all its successor
1944 	 * states as well.  So there is no hope of doing less than O(N^2) work;
1945 	 * however, we should endeavor to keep the big-O cost from being even
1946 	 * worse than that, which it can easily become without care.  In
1947 	 * particular, suppose we were to copy all S1's inarcs forward to S2, and
1948 	 * then also to S3, and then later we consider pushing S2's inarcs forward
1949 	 * to S3.  If we include the arcs already copied from S1 in that, we'd be
1950 	 * doing O(N^3) work.  (The duplicate-arc elimination built into newarc()
1951 	 * and its cohorts would get rid of the extra arcs, but not without cost.)
1952 	 *
1953 	 * We can avoid this cost by treating only arcs that existed at the start
1954 	 * of this phase as candidates to be pushed forward.  To identify those,
1955 	 * we remember the first inarc each state had to start with.  We rely on
1956 	 * the fact that newarc() and friends put new arcs on the front of their
1957 	 * to-states' inchains, and that this phase never deletes arcs, so that
1958 	 * the original arcs must be the last arcs in their to-states' inchains.
1959 	 *
1960 	 * So the process here is that, for each state in the NFA, we gather up
1961 	 * all non-EMPTY inarcs of states that can reach the target state via
1962 	 * EMPTY arcs.  We then sort, de-duplicate, and merge these arcs into the
1963 	 * target state's inchain.  (We can safely use sort-merge for this as long
1964 	 * as we update each state's original-arcs pointer after we add arcs to
1965 	 * it; the sort step of mergeins probably changed the order of the old
1966 	 * arcs.)
1967 	 *
1968 	 * Another refinement worth making is that, because we only add non-EMPTY
1969 	 * arcs during this phase, and all added arcs have the same from-state as
1970 	 * the non-EMPTY arc they were cloned from, we know ahead of time that any
1971 	 * states having only EMPTY outarcs will be useless for lack of outarcs
1972 	 * after we drop the EMPTY arcs.  (They cannot gain non-EMPTY outarcs if
1973 	 * they had none to start with.)  So we need not bother to update the
1974 	 * inchains of such states at all.
1975 	 */
1976 
1977 	/* Remember the states' first original inarcs */
1978 	/* ... and while at it, count how many old inarcs there are altogether */
1979 	inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
1980 	if (inarcsorig == NULL)
1981 	{
1982 		NERR(REG_ESPACE);
1983 		return;
1984 	}
1985 	totalinarcs = 0;
1986 	for (s = nfa->states; s != NULL; s = s->next)
1987 	{
1988 		inarcsorig[s->no] = s->ins;
1989 		totalinarcs += s->nins;
1990 	}
1991 
1992 	/*
1993 	 * Create a workspace for accumulating the inarcs to be added to the
1994 	 * current target state.  totalinarcs is probably a considerable
1995 	 * overestimate of the space needed, but the NFA is unlikely to be large
1996 	 * enough at this point to make it worth being smarter.
1997 	 */
1998 	arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
1999 	if (arcarray == NULL)
2000 	{
2001 		NERR(REG_ESPACE);
2002 		FREE(inarcsorig);
2003 		return;
2004 	}
2005 
2006 	/* And iterate over the target states */
2007 	for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
2008 	{
2009 		/* Ignore target states without non-EMPTY outarcs, per note above */
2010 		if (!s->flag && !hasnonemptyout(s))
2011 			continue;
2012 
2013 		/* Find predecessor states and accumulate their original inarcs */
2014 		arccount = 0;
2015 		for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts)
2016 		{
2017 			/* Add s2's original inarcs to arcarray[], but ignore empties */
2018 			for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain)
2019 			{
2020 				if (a->type != EMPTY)
2021 					arcarray[arccount++] = a;
2022 			}
2023 
2024 			/* Reset the tmp fields as we walk back */
2025 			nexts = s2->tmp;
2026 			s2->tmp = NULL;
2027 		}
2028 		s->tmp = NULL;
2029 		assert(arccount <= totalinarcs);
2030 
2031 		/* Remember how many original inarcs this state has */
2032 		prevnins = s->nins;
2033 
2034 		/* Add non-duplicate inarcs to target state */
2035 		mergeins(nfa, s, arcarray, arccount);
2036 
2037 		/* Now we must update the state's inarcsorig pointer */
2038 		nskip = s->nins - prevnins;
2039 		a = s->ins;
2040 		while (nskip-- > 0)
2041 			a = a->inchain;
2042 		inarcsorig[s->no] = a;
2043 	}
2044 
2045 	FREE(arcarray);
2046 	FREE(inarcsorig);
2047 
2048 	if (NISERR())
2049 		return;
2050 
2051 	/*
2052 	 * Now remove all the EMPTY arcs, since we don't need them anymore.
2053 	 */
2054 	for (s = nfa->states; s != NULL; s = s->next)
2055 	{
2056 		for (a = s->outs; a != NULL; a = nexta)
2057 		{
2058 			nexta = a->outchain;
2059 			if (a->type == EMPTY)
2060 				freearc(nfa, a);
2061 		}
2062 	}
2063 
2064 	/*
2065 	 * And remove any states that have become useless.  (This cleanup is not
2066 	 * very thorough, and would be even less so if we tried to combine it with
2067 	 * the previous step; but cleanup() will take care of anything we miss.)
2068 	 */
2069 	for (s = nfa->states; s != NULL; s = nexts)
2070 	{
2071 		nexts = s->next;
2072 		if ((s->nins == 0 || s->nouts == 0) && !s->flag)
2073 			dropstate(nfa, s);
2074 	}
2075 
2076 	if (f != NULL)
2077 		dumpnfa(nfa, f);
2078 }
2079 
2080 /*
2081  * emptyreachable - recursively find all states that can reach s by EMPTY arcs
2082  *
2083  * The return value is the last such state found.  Its tmp field links back
2084  * to the next-to-last such state, and so on back to s, so that all these
2085  * states can be located without searching the whole NFA.
2086  *
2087  * Since this is only used in fixempties(), we pass in the inarcsorig[] array
2088  * maintained by that function.  This lets us skip over all new inarcs, which
2089  * are certainly not EMPTY arcs.
2090  *
2091  * The maximum recursion depth here is equal to the length of the longest
2092  * loop-free chain of EMPTY arcs, which is surely no more than the size of
2093  * the NFA ... but that could still be enough to cause trouble.
2094  */
2095 static struct state *
emptyreachable(struct nfa * nfa,struct state * s,struct state * lastfound,struct arc ** inarcsorig)2096 emptyreachable(struct nfa * nfa,
2097 			   struct state * s,
2098 			   struct state * lastfound,
2099 			   struct arc ** inarcsorig)
2100 {
2101 	struct arc *a;
2102 
2103 	/* Since this is recursive, it could be driven to stack overflow */
2104 	if (STACK_TOO_DEEP(nfa->v->re))
2105 	{
2106 		NERR(REG_ETOOBIG);
2107 		return lastfound;
2108 	}
2109 
2110 	s->tmp = lastfound;
2111 	lastfound = s;
2112 	for (a = inarcsorig[s->no]; a != NULL; a = a->inchain)
2113 	{
2114 		if (a->type == EMPTY && a->from->tmp == NULL)
2115 			lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig);
2116 	}
2117 	return lastfound;
2118 }
2119 
2120 /*
2121  * isconstraintarc - detect whether an arc is of a constraint type
2122  */
2123 static inline int
isconstraintarc(struct arc * a)2124 isconstraintarc(struct arc * a)
2125 {
2126 	switch (a->type)
2127 	{
2128 		case '^':
2129 		case '$':
2130 		case BEHIND:
2131 		case AHEAD:
2132 		case LACON:
2133 			return 1;
2134 	}
2135 	return 0;
2136 }
2137 
2138 /*
2139  * hasconstraintout - does state have a constraint out arc?
2140  */
2141 static int
hasconstraintout(struct state * s)2142 hasconstraintout(struct state * s)
2143 {
2144 	struct arc *a;
2145 
2146 	for (a = s->outs; a != NULL; a = a->outchain)
2147 	{
2148 		if (isconstraintarc(a))
2149 			return 1;
2150 	}
2151 	return 0;
2152 }
2153 
2154 /*
2155  * fixconstraintloops - get rid of loops containing only constraint arcs
2156  *
2157  * A loop of states that contains only constraint arcs is useless, since
2158  * passing around the loop represents no forward progress.  Moreover, it
2159  * would cause infinite looping in pullback/pushfwd, so we need to get rid
2160  * of such loops before doing that.
2161  */
2162 static void
fixconstraintloops(struct nfa * nfa,FILE * f)2163 fixconstraintloops(struct nfa * nfa,
2164 				   FILE *f)		/* for debug output; NULL none */
2165 {
2166 	struct state *s;
2167 	struct state *nexts;
2168 	struct arc *a;
2169 	struct arc *nexta;
2170 	int			hasconstraints;
2171 
2172 	/*
2173 	 * In the trivial case of a state that loops to itself, we can just drop
2174 	 * the constraint arc altogether.  This is worth special-casing because
2175 	 * such loops are far more common than loops containing multiple states.
2176 	 * While we're at it, note whether any constraint arcs survive.
2177 	 */
2178 	hasconstraints = 0;
2179 	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2180 	{
2181 		nexts = s->next;
2182 		/* while we're at it, ensure tmp fields are clear for next step */
2183 		assert(s->tmp == NULL);
2184 		for (a = s->outs; a != NULL && !NISERR(); a = nexta)
2185 		{
2186 			nexta = a->outchain;
2187 			if (isconstraintarc(a))
2188 			{
2189 				if (a->to == s)
2190 					freearc(nfa, a);
2191 				else
2192 					hasconstraints = 1;
2193 			}
2194 		}
2195 		/* If we removed all the outarcs, the state is useless. */
2196 		if (s->nouts == 0 && !s->flag)
2197 			dropstate(nfa, s);
2198 	}
2199 
2200 	/* Nothing to do if no remaining constraint arcs */
2201 	if (NISERR() || !hasconstraints)
2202 		return;
2203 
2204 	/*
2205 	 * Starting from each remaining NFA state, search outwards for a
2206 	 * constraint loop.  If we find a loop, break the loop, then start the
2207 	 * search over.  (We could possibly retain some state from the first scan,
2208 	 * but it would complicate things greatly, and multi-state constraint
2209 	 * loops are rare enough that it's not worth optimizing the case.)
2210 	 */
2211 restart:
2212 	for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
2213 	{
2214 		if (findconstraintloop(nfa, s))
2215 			goto restart;
2216 	}
2217 
2218 	if (NISERR())
2219 		return;
2220 
2221 	/*
2222 	 * Now remove any states that have become useless.  (This cleanup is not
2223 	 * very thorough, and would be even less so if we tried to combine it with
2224 	 * the previous step; but cleanup() will take care of anything we miss.)
2225 	 *
2226 	 * Because findconstraintloop intentionally doesn't reset all tmp fields,
2227 	 * we have to clear them after it's done.  This is a convenient place to
2228 	 * do that, too.
2229 	 */
2230 	for (s = nfa->states; s != NULL; s = nexts)
2231 	{
2232 		nexts = s->next;
2233 		s->tmp = NULL;
2234 		if ((s->nins == 0 || s->nouts == 0) && !s->flag)
2235 			dropstate(nfa, s);
2236 	}
2237 
2238 	if (f != NULL)
2239 		dumpnfa(nfa, f);
2240 }
2241 
2242 /*
2243  * findconstraintloop - recursively find a loop of constraint arcs
2244  *
2245  * If we find a loop, break it by calling breakconstraintloop(), then
2246  * return 1; otherwise return 0.
2247  *
2248  * State tmp fields are guaranteed all NULL on a success return, because
2249  * breakconstraintloop does that.  After a failure return, any state that
2250  * is known not to be part of a loop is marked with s->tmp == s; this allows
2251  * us not to have to re-prove that fact on later calls.  (This convention is
2252  * workable because we already eliminated single-state loops.)
2253  *
2254  * Note that the found loop doesn't necessarily include the first state we
2255  * are called on.  Any loop reachable from that state will do.
2256  *
2257  * The maximum recursion depth here is one more than the length of the longest
2258  * loop-free chain of constraint arcs, which is surely no more than the size
2259  * of the NFA ... but that could still be enough to cause trouble.
2260  */
2261 static int
findconstraintloop(struct nfa * nfa,struct state * s)2262 findconstraintloop(struct nfa * nfa, struct state * s)
2263 {
2264 	struct arc *a;
2265 
2266 	/* Since this is recursive, it could be driven to stack overflow */
2267 	if (STACK_TOO_DEEP(nfa->v->re))
2268 	{
2269 		NERR(REG_ETOOBIG);
2270 		return 1;				/* to exit as quickly as possible */
2271 	}
2272 
2273 	if (s->tmp != NULL)
2274 	{
2275 		/* Already proven uninteresting? */
2276 		if (s->tmp == s)
2277 			return 0;
2278 		/* Found a loop involving s */
2279 		breakconstraintloop(nfa, s);
2280 		/* The tmp fields have been cleaned up by breakconstraintloop */
2281 		return 1;
2282 	}
2283 	for (a = s->outs; a != NULL; a = a->outchain)
2284 	{
2285 		if (isconstraintarc(a))
2286 		{
2287 			struct state *sto = a->to;
2288 
2289 			assert(sto != s);
2290 			s->tmp = sto;
2291 			if (findconstraintloop(nfa, sto))
2292 				return 1;
2293 		}
2294 	}
2295 
2296 	/*
2297 	 * If we get here, no constraint loop exists leading out from s.  Mark it
2298 	 * with s->tmp == s so we need not rediscover that fact again later.
2299 	 */
2300 	s->tmp = s;
2301 	return 0;
2302 }
2303 
2304 /*
2305  * breakconstraintloop - break a loop of constraint arcs
2306  *
2307  * sinitial is any one member state of the loop.  Each loop member's tmp
2308  * field links to its successor within the loop.  (Note that this function
2309  * will reset all the tmp fields to NULL.)
2310  *
2311  * We can break the loop by, for any one state S1 in the loop, cloning its
2312  * loop successor state S2 (and possibly following states), and then moving
2313  * all S1->S2 constraint arcs to point to the cloned S2.  The cloned S2 should
2314  * copy any non-constraint outarcs of S2.  Constraint outarcs should be
2315  * dropped if they point back to S1, else they need to be copied as arcs to
2316  * similarly cloned states S3, S4, etc.  In general, each cloned state copies
2317  * non-constraint outarcs, drops constraint outarcs that would lead to itself
2318  * or any earlier cloned state, and sends other constraint outarcs to newly
2319  * cloned states.  No cloned state will have any inarcs that aren't constraint
2320  * arcs or do not lead from S1 or earlier-cloned states.  It's okay to drop
2321  * constraint back-arcs since they would not take us to any state we've not
2322  * already been in; therefore, no new constraint loop is created.  In this way
2323  * we generate a modified NFA that can still represent every useful state
2324  * sequence, but not sequences that represent state loops with no consumption
2325  * of input data.  Note that the set of cloned states will certainly include
2326  * all of the loop member states other than S1, and it may also include
2327  * non-loop states that are reachable from S2 via constraint arcs.  This is
2328  * important because there is no guarantee that findconstraintloop found a
2329  * maximal loop (and searching for one would be NP-hard, so don't try).
2330  * Frequently the "non-loop states" are actually part of a larger loop that
2331  * we didn't notice, and indeed there may be several overlapping loops.
2332  * This technique ensures convergence in such cases, while considering only
2333  * the originally-found loop does not.
2334  *
2335  * If there is only one S1->S2 constraint arc, then that constraint is
2336  * certainly satisfied when we enter any of the clone states.  This means that
2337  * in the common case where many of the constraint arcs are identically
2338  * labeled, we can merge together clone states linked by a similarly-labeled
2339  * constraint: if we can get to the first one we can certainly get to the
2340  * second, so there's no need to distinguish.  This greatly reduces the number
2341  * of new states needed, so we preferentially break the given loop at a state
2342  * pair where this is true.
2343  *
2344  * Furthermore, it's fairly common to find that a cloned successor state has
2345  * no outarcs, especially if we're a bit aggressive about removing unnecessary
2346  * outarcs.  If that happens, then there is simply not any interesting state
2347  * that can be reached through the predecessor's loop arcs, which means we can
2348  * break the loop just by removing those loop arcs, with no new states added.
2349  */
2350 static void
breakconstraintloop(struct nfa * nfa,struct state * sinitial)2351 breakconstraintloop(struct nfa * nfa, struct state * sinitial)
2352 {
2353 	struct state *s;
2354 	struct state *shead;
2355 	struct state *stail;
2356 	struct state *sclone;
2357 	struct state *nexts;
2358 	struct arc *refarc;
2359 	struct arc *a;
2360 	struct arc *nexta;
2361 
2362 	/*
2363 	 * Start by identifying which loop step we want to break at.
2364 	 * Preferentially this is one with only one constraint arc.  (XXX are
2365 	 * there any other secondary heuristics we want to use here?)  Set refarc
2366 	 * to point to the selected lone constraint arc, if there is one.
2367 	 */
2368 	refarc = NULL;
2369 	s = sinitial;
2370 	do
2371 	{
2372 		nexts = s->tmp;
2373 		assert(nexts != s);		/* should not see any one-element loops */
2374 		if (refarc == NULL)
2375 		{
2376 			int			narcs = 0;
2377 
2378 			for (a = s->outs; a != NULL; a = a->outchain)
2379 			{
2380 				if (a->to == nexts && isconstraintarc(a))
2381 				{
2382 					refarc = a;
2383 					narcs++;
2384 				}
2385 			}
2386 			assert(narcs > 0);
2387 			if (narcs > 1)
2388 				refarc = NULL;	/* multiple constraint arcs here, no good */
2389 		}
2390 		s = nexts;
2391 	} while (s != sinitial);
2392 
2393 	if (refarc)
2394 	{
2395 		/* break at the refarc */
2396 		shead = refarc->from;
2397 		stail = refarc->to;
2398 		assert(stail == shead->tmp);
2399 	}
2400 	else
2401 	{
2402 		/* for lack of a better idea, break after sinitial */
2403 		shead = sinitial;
2404 		stail = sinitial->tmp;
2405 	}
2406 
2407 	/*
2408 	 * Reset the tmp fields so that we can use them for local storage in
2409 	 * clonesuccessorstates.  (findconstraintloop won't mind, since it's just
2410 	 * going to abandon its search anyway.)
2411 	 */
2412 	for (s = nfa->states; s != NULL; s = s->next)
2413 		s->tmp = NULL;
2414 
2415 	/*
2416 	 * Recursively build clone state(s) as needed.
2417 	 */
2418 	sclone = newstate(nfa);
2419 	if (sclone == NULL)
2420 	{
2421 		assert(NISERR());
2422 		return;
2423 	}
2424 
2425 	clonesuccessorstates(nfa, stail, sclone, shead, refarc,
2426 						 NULL, NULL, nfa->nstates);
2427 
2428 	if (NISERR())
2429 		return;
2430 
2431 	/*
2432 	 * It's possible that sclone has no outarcs at all, in which case it's
2433 	 * useless.  (We don't try extremely hard to get rid of useless states
2434 	 * here, but this is an easy and fairly common case.)
2435 	 */
2436 	if (sclone->nouts == 0)
2437 	{
2438 		freestate(nfa, sclone);
2439 		sclone = NULL;
2440 	}
2441 
2442 	/*
2443 	 * Move shead's constraint-loop arcs to point to sclone, or just drop them
2444 	 * if we discovered we don't need sclone.
2445 	 */
2446 	for (a = shead->outs; a != NULL; a = nexta)
2447 	{
2448 		nexta = a->outchain;
2449 		if (a->to == stail && isconstraintarc(a))
2450 		{
2451 			if (sclone)
2452 				cparc(nfa, a, shead, sclone);
2453 			freearc(nfa, a);
2454 			if (NISERR())
2455 				break;
2456 		}
2457 	}
2458 }
2459 
2460 /*
2461  * clonesuccessorstates - create a tree of constraint-arc successor states
2462  *
2463  * ssource is the state to be cloned, and sclone is the state to copy its
2464  * outarcs into.  sclone's inarcs, if any, should already be set up.
2465  *
2466  * spredecessor is the original predecessor state that we are trying to build
2467  * successors for (it may not be the immediate predecessor of ssource).
2468  * refarc, if not NULL, is the original constraint arc that is known to have
2469  * been traversed out of spredecessor to reach the successor(s).
2470  *
2471  * For each cloned successor state, we transiently create a "donemap" that is
2472  * a boolean array showing which source states we've already visited for this
2473  * clone state.  This prevents infinite recursion as well as useless repeat
2474  * visits to the same state subtree (which can add up fast, since typical NFAs
2475  * have multiple redundant arc pathways).  Each donemap is a char array
2476  * indexed by state number.  The donemaps are all of the same size "nstates",
2477  * which is nfa->nstates as of the start of the recursion.  This is enough to
2478  * have entries for all pre-existing states, but *not* entries for clone
2479  * states created during the recursion.  That's okay since we have no need to
2480  * mark those.
2481  *
2482  * curdonemap is NULL when recursing to a new sclone state, or sclone's
2483  * donemap when we are recursing without having created a new state (which we
2484  * do when we decide we can merge a successor state into the current clone
2485  * state).  outerdonemap is NULL at the top level and otherwise the parent
2486  * clone state's donemap.
2487  *
2488  * The successor states we create and fill here form a strict tree structure,
2489  * with each state having exactly one predecessor, except that the toplevel
2490  * state has no inarcs as yet (breakconstraintloop will add its inarcs from
2491  * spredecessor after we're done).  Thus, we can examine sclone's inarcs back
2492  * to the root, plus refarc if any, to identify the set of constraints already
2493  * known valid at the current point.  This allows us to avoid generating extra
2494  * successor states.
2495  */
2496 static void
clonesuccessorstates(struct nfa * nfa,struct state * ssource,struct state * sclone,struct state * spredecessor,struct arc * refarc,char * curdonemap,char * outerdonemap,int nstates)2497 clonesuccessorstates(struct nfa * nfa,
2498 					 struct state * ssource,
2499 					 struct state * sclone,
2500 					 struct state * spredecessor,
2501 					 struct arc * refarc,
2502 					 char *curdonemap,
2503 					 char *outerdonemap,
2504 					 int nstates)
2505 {
2506 	char	   *donemap;
2507 	struct arc *a;
2508 
2509 	/* Since this is recursive, it could be driven to stack overflow */
2510 	if (STACK_TOO_DEEP(nfa->v->re))
2511 	{
2512 		NERR(REG_ETOOBIG);
2513 		return;
2514 	}
2515 
2516 	/* If this state hasn't already got a donemap, create one */
2517 	donemap = curdonemap;
2518 	if (donemap == NULL)
2519 	{
2520 		donemap = (char *) MALLOC(nstates * sizeof(char));
2521 		if (donemap == NULL)
2522 		{
2523 			NERR(REG_ESPACE);
2524 			return;
2525 		}
2526 
2527 		if (outerdonemap != NULL)
2528 		{
2529 			/*
2530 			 * Not at outermost recursion level, so copy the outer level's
2531 			 * donemap; this ensures that we see states in process of being
2532 			 * visited at outer levels, or already merged into predecessor
2533 			 * states, as ones we shouldn't traverse back to.
2534 			 */
2535 			memcpy(donemap, outerdonemap, nstates * sizeof(char));
2536 		}
2537 		else
2538 		{
2539 			/* At outermost level, only spredecessor is off-limits */
2540 			memset(donemap, 0, nstates * sizeof(char));
2541 			assert(spredecessor->no < nstates);
2542 			donemap[spredecessor->no] = 1;
2543 		}
2544 	}
2545 
2546 	/* Mark ssource as visited in the donemap */
2547 	assert(ssource->no < nstates);
2548 	assert(donemap[ssource->no] == 0);
2549 	donemap[ssource->no] = 1;
2550 
2551 	/*
2552 	 * We proceed by first cloning all of ssource's outarcs, creating new
2553 	 * clone states as needed but not doing more with them than that.  Then in
2554 	 * a second pass, recurse to process the child clone states.  This allows
2555 	 * us to have only one child clone state per reachable source state, even
2556 	 * when there are multiple outarcs leading to the same state.  Also, when
2557 	 * we do visit a child state, its set of inarcs is known exactly, which
2558 	 * makes it safe to apply the constraint-is-already-checked optimization.
2559 	 * Also, this ensures that we've merged all the states we can into the
2560 	 * current clone before we recurse to any children, thus possibly saving
2561 	 * them from making extra images of those states.
2562 	 *
2563 	 * While this function runs, child clone states of the current state are
2564 	 * marked by setting their tmp fields to point to the original state they
2565 	 * were cloned from.  This makes it possible to detect multiple outarcs
2566 	 * leading to the same state, and also makes it easy to distinguish clone
2567 	 * states from original states (which will have tmp == NULL).
2568 	 */
2569 	for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain)
2570 	{
2571 		struct state *sto = a->to;
2572 
2573 		/*
2574 		 * We do not consider cloning successor states that have no constraint
2575 		 * outarcs; just link to them as-is.  They cannot be part of a
2576 		 * constraint loop so there is no need to make copies.  In particular,
2577 		 * this rule keeps us from trying to clone the post state, which would
2578 		 * be a bad idea.
2579 		 */
2580 		if (isconstraintarc(a) && hasconstraintout(sto))
2581 		{
2582 			struct state *prevclone;
2583 			int			canmerge;
2584 			struct arc *a2;
2585 
2586 			/*
2587 			 * Back-link constraint arcs must not be followed.  Nor is there a
2588 			 * need to revisit states previously merged into this clone.
2589 			 */
2590 			assert(sto->no < nstates);
2591 			if (donemap[sto->no] != 0)
2592 				continue;
2593 
2594 			/*
2595 			 * Check whether we already have a child clone state for this
2596 			 * source state.
2597 			 */
2598 			prevclone = NULL;
2599 			for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain)
2600 			{
2601 				if (a2->to->tmp == sto)
2602 				{
2603 					prevclone = a2->to;
2604 					break;
2605 				}
2606 			}
2607 
2608 			/*
2609 			 * If this arc is labeled the same as refarc, or the same as any
2610 			 * arc we must have traversed to get to sclone, then no additional
2611 			 * constraints need to be met to get to sto, so we should just
2612 			 * merge its outarcs into sclone.
2613 			 */
2614 			if (refarc && a->type == refarc->type && a->co == refarc->co)
2615 				canmerge = 1;
2616 			else
2617 			{
2618 				struct state *s;
2619 
2620 				canmerge = 0;
2621 				for (s = sclone; s->ins; s = s->ins->from)
2622 				{
2623 					if (s->nins == 1 &&
2624 						a->type == s->ins->type && a->co == s->ins->co)
2625 					{
2626 						canmerge = 1;
2627 						break;
2628 					}
2629 				}
2630 			}
2631 
2632 			if (canmerge)
2633 			{
2634 				/*
2635 				 * We can merge into sclone.  If we previously made a child
2636 				 * clone state, drop it; there's no need to visit it.  (This
2637 				 * can happen if ssource has multiple pathways to sto, and we
2638 				 * only just now found one that is provably a no-op.)
2639 				 */
2640 				if (prevclone)
2641 					dropstate(nfa, prevclone);	/* kills our outarc, too */
2642 
2643 				/* Recurse to merge sto's outarcs into sclone */
2644 				clonesuccessorstates(nfa,
2645 									 sto,
2646 									 sclone,
2647 									 spredecessor,
2648 									 refarc,
2649 									 donemap,
2650 									 outerdonemap,
2651 									 nstates);
2652 				/* sto should now be marked as previously visited */
2653 				assert(NISERR() || donemap[sto->no] == 1);
2654 			}
2655 			else if (prevclone)
2656 			{
2657 				/*
2658 				 * We already have a clone state for this successor, so just
2659 				 * make another arc to it.
2660 				 */
2661 				cparc(nfa, a, sclone, prevclone);
2662 			}
2663 			else
2664 			{
2665 				/*
2666 				 * We need to create a new successor clone state.
2667 				 */
2668 				struct state *stoclone;
2669 
2670 				stoclone = newstate(nfa);
2671 				if (stoclone == NULL)
2672 				{
2673 					assert(NISERR());
2674 					break;
2675 				}
2676 				/* Mark it as to what it's a clone of */
2677 				stoclone->tmp = sto;
2678 				/* ... and add the outarc leading to it */
2679 				cparc(nfa, a, sclone, stoclone);
2680 			}
2681 		}
2682 		else
2683 		{
2684 			/*
2685 			 * Non-constraint outarcs just get copied to sclone, as do outarcs
2686 			 * leading to states with no constraint outarc.
2687 			 */
2688 			cparc(nfa, a, sclone, sto);
2689 		}
2690 	}
2691 
2692 	/*
2693 	 * If we are at outer level for this clone state, recurse to all its child
2694 	 * clone states, clearing their tmp fields as we go.  (If we're not
2695 	 * outermost for sclone, leave this to be done by the outer call level.)
2696 	 * Note that if we have multiple outarcs leading to the same clone state,
2697 	 * it will only be recursed-to once.
2698 	 */
2699 	if (curdonemap == NULL)
2700 	{
2701 		for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain)
2702 		{
2703 			struct state *stoclone = a->to;
2704 			struct state *sto = stoclone->tmp;
2705 
2706 			if (sto != NULL)
2707 			{
2708 				stoclone->tmp = NULL;
2709 				clonesuccessorstates(nfa,
2710 									 sto,
2711 									 stoclone,
2712 									 spredecessor,
2713 									 refarc,
2714 									 NULL,
2715 									 donemap,
2716 									 nstates);
2717 			}
2718 		}
2719 
2720 		/* Don't forget to free sclone's donemap when done with it */
2721 		FREE(donemap);
2722 	}
2723 }
2724 
2725 /*
2726  * cleanup - clean up NFA after optimizations
2727  */
2728 static void
cleanup(struct nfa * nfa)2729 cleanup(struct nfa * nfa)
2730 {
2731 	struct state *s;
2732 	struct state *nexts;
2733 	int			n;
2734 
2735 	if (NISERR())
2736 		return;
2737 
2738 	/* clear out unreachable or dead-end states */
2739 	/* use pre to mark reachable, then post to mark can-reach-post */
2740 	markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
2741 	markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
2742 	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2743 	{
2744 		nexts = s->next;
2745 		if (s->tmp != nfa->post && !s->flag)
2746 			dropstate(nfa, s);
2747 	}
2748 	assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
2749 	cleartraverse(nfa, nfa->pre);
2750 	assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL);
2751 	/* the nins==0 (final unreachable) case will be caught later */
2752 
2753 	/* renumber surviving states */
2754 	n = 0;
2755 	for (s = nfa->states; s != NULL; s = s->next)
2756 		s->no = n++;
2757 	nfa->nstates = n;
2758 }
2759 
2760 /*
2761  * markreachable - recursive marking of reachable states
2762  */
2763 static void
markreachable(struct nfa * nfa,struct state * s,struct state * okay,struct state * mark)2764 markreachable(struct nfa * nfa,
2765 			  struct state * s,
2766 			  struct state * okay,		/* consider only states with this mark */
2767 			  struct state * mark)		/* the value to mark with */
2768 {
2769 	struct arc *a;
2770 
2771 	/* Since this is recursive, it could be driven to stack overflow */
2772 	if (STACK_TOO_DEEP(nfa->v->re))
2773 	{
2774 		NERR(REG_ETOOBIG);
2775 		return;
2776 	}
2777 
2778 	if (s->tmp != okay)
2779 		return;
2780 	s->tmp = mark;
2781 
2782 	for (a = s->outs; a != NULL; a = a->outchain)
2783 		markreachable(nfa, a->to, okay, mark);
2784 }
2785 
2786 /*
2787  * markcanreach - recursive marking of states which can reach here
2788  */
2789 static void
markcanreach(struct nfa * nfa,struct state * s,struct state * okay,struct state * mark)2790 markcanreach(struct nfa * nfa,
2791 			 struct state * s,
2792 			 struct state * okay,		/* consider only states with this mark */
2793 			 struct state * mark)		/* the value to mark with */
2794 {
2795 	struct arc *a;
2796 
2797 	/* Since this is recursive, it could be driven to stack overflow */
2798 	if (STACK_TOO_DEEP(nfa->v->re))
2799 	{
2800 		NERR(REG_ETOOBIG);
2801 		return;
2802 	}
2803 
2804 	if (s->tmp != okay)
2805 		return;
2806 	s->tmp = mark;
2807 
2808 	for (a = s->ins; a != NULL; a = a->inchain)
2809 		markcanreach(nfa, a->from, okay, mark);
2810 }
2811 
2812 /*
2813  * analyze - ascertain potentially-useful facts about an optimized NFA
2814  */
2815 static long						/* re_info bits to be ORed in */
analyze(struct nfa * nfa)2816 analyze(struct nfa * nfa)
2817 {
2818 	struct arc *a;
2819 	struct arc *aa;
2820 
2821 	if (NISERR())
2822 		return 0;
2823 
2824 	if (nfa->pre->outs == NULL)
2825 		return REG_UIMPOSSIBLE;
2826 	for (a = nfa->pre->outs; a != NULL; a = a->outchain)
2827 		for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
2828 			if (aa->to == nfa->post)
2829 				return REG_UEMPTYMATCH;
2830 	return 0;
2831 }
2832 
2833 /*
2834  * compact - construct the compact representation of an NFA
2835  */
2836 static void
compact(struct nfa * nfa,struct cnfa * cnfa)2837 compact(struct nfa * nfa,
2838 		struct cnfa * cnfa)
2839 {
2840 	struct state *s;
2841 	struct arc *a;
2842 	size_t		nstates;
2843 	size_t		narcs;
2844 	struct carc *ca;
2845 	struct carc *first;
2846 
2847 	assert(!NISERR());
2848 
2849 	nstates = 0;
2850 	narcs = 0;
2851 	for (s = nfa->states; s != NULL; s = s->next)
2852 	{
2853 		nstates++;
2854 		narcs += s->nouts + 1;	/* need one extra for endmarker */
2855 	}
2856 
2857 	cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
2858 	cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
2859 	cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
2860 	if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
2861 	{
2862 		if (cnfa->stflags != NULL)
2863 			FREE(cnfa->stflags);
2864 		if (cnfa->states != NULL)
2865 			FREE(cnfa->states);
2866 		if (cnfa->arcs != NULL)
2867 			FREE(cnfa->arcs);
2868 		NERR(REG_ESPACE);
2869 		return;
2870 	}
2871 	cnfa->nstates = nstates;
2872 	cnfa->pre = nfa->pre->no;
2873 	cnfa->post = nfa->post->no;
2874 	cnfa->bos[0] = nfa->bos[0];
2875 	cnfa->bos[1] = nfa->bos[1];
2876 	cnfa->eos[0] = nfa->eos[0];
2877 	cnfa->eos[1] = nfa->eos[1];
2878 	cnfa->ncolors = maxcolor(nfa->cm) + 1;
2879 	cnfa->flags = 0;
2880 
2881 	ca = cnfa->arcs;
2882 	for (s = nfa->states; s != NULL; s = s->next)
2883 	{
2884 		assert((size_t) s->no < nstates);
2885 		cnfa->stflags[s->no] = 0;
2886 		cnfa->states[s->no] = ca;
2887 		first = ca;
2888 		for (a = s->outs; a != NULL; a = a->outchain)
2889 			switch (a->type)
2890 			{
2891 				case PLAIN:
2892 					ca->co = a->co;
2893 					ca->to = a->to->no;
2894 					ca++;
2895 					break;
2896 				case LACON:
2897 					assert(s->no != cnfa->pre);
2898 					ca->co = (color) (cnfa->ncolors + a->co);
2899 					ca->to = a->to->no;
2900 					ca++;
2901 					cnfa->flags |= HASLACONS;
2902 					break;
2903 				default:
2904 					NERR(REG_ASSERT);
2905 					break;
2906 			}
2907 		carcsort(first, ca - first);
2908 		ca->co = COLORLESS;
2909 		ca->to = 0;
2910 		ca++;
2911 	}
2912 	assert(ca == &cnfa->arcs[narcs]);
2913 	assert(cnfa->nstates != 0);
2914 
2915 	/* mark no-progress states */
2916 	for (a = nfa->pre->outs; a != NULL; a = a->outchain)
2917 		cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
2918 	cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
2919 }
2920 
2921 /*
2922  * carcsort - sort compacted-NFA arcs by color
2923  */
2924 static void
carcsort(struct carc * first,size_t n)2925 carcsort(struct carc * first, size_t n)
2926 {
2927 	if (n > 1)
2928 		qsort(first, n, sizeof(struct carc), carc_cmp);
2929 }
2930 
2931 static int
carc_cmp(const void * a,const void * b)2932 carc_cmp(const void *a, const void *b)
2933 {
2934 	const struct carc *aa = (const struct carc *) a;
2935 	const struct carc *bb = (const struct carc *) b;
2936 
2937 	if (aa->co < bb->co)
2938 		return -1;
2939 	if (aa->co > bb->co)
2940 		return +1;
2941 	if (aa->to < bb->to)
2942 		return -1;
2943 	if (aa->to > bb->to)
2944 		return +1;
2945 	return 0;
2946 }
2947 
2948 /*
2949  * freecnfa - free a compacted NFA
2950  */
2951 static void
freecnfa(struct cnfa * cnfa)2952 freecnfa(struct cnfa * cnfa)
2953 {
2954 	assert(cnfa->nstates != 0); /* not empty already */
2955 	cnfa->nstates = 0;
2956 	FREE(cnfa->stflags);
2957 	FREE(cnfa->states);
2958 	FREE(cnfa->arcs);
2959 }
2960 
2961 /*
2962  * dumpnfa - dump an NFA in human-readable form
2963  */
2964 static void
dumpnfa(struct nfa * nfa,FILE * f)2965 dumpnfa(struct nfa * nfa,
2966 		FILE *f)
2967 {
2968 #ifdef REG_DEBUG
2969 	struct state *s;
2970 	int			nstates = 0;
2971 	int			narcs = 0;
2972 
2973 	fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
2974 	if (nfa->bos[0] != COLORLESS)
2975 		fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
2976 	if (nfa->bos[1] != COLORLESS)
2977 		fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
2978 	if (nfa->eos[0] != COLORLESS)
2979 		fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
2980 	if (nfa->eos[1] != COLORLESS)
2981 		fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
2982 	fprintf(f, "\n");
2983 	for (s = nfa->states; s != NULL; s = s->next)
2984 	{
2985 		dumpstate(s, f);
2986 		nstates++;
2987 		narcs += s->nouts;
2988 	}
2989 	fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
2990 	if (nfa->parent == NULL)
2991 		dumpcolors(nfa->cm, f);
2992 	fflush(f);
2993 #endif
2994 }
2995 
2996 #ifdef REG_DEBUG				/* subordinates of dumpnfa */
2997 
2998 /*
2999  * dumpstate - dump an NFA state in human-readable form
3000  */
3001 static void
dumpstate(struct state * s,FILE * f)3002 dumpstate(struct state * s,
3003 		  FILE *f)
3004 {
3005 	struct arc *a;
3006 
3007 	fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
3008 			(s->flag) ? s->flag : '.');
3009 	if (s->prev != NULL && s->prev->next != s)
3010 		fprintf(f, "\tstate chain bad\n");
3011 	if (s->nouts == 0)
3012 		fprintf(f, "\tno out arcs\n");
3013 	else
3014 		dumparcs(s, f);
3015 	fflush(f);
3016 	for (a = s->ins; a != NULL; a = a->inchain)
3017 	{
3018 		if (a->to != s)
3019 			fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
3020 					a->from->no, a->to->no, s->no);
3021 	}
3022 }
3023 
3024 /*
3025  * dumparcs - dump out-arcs in human-readable form
3026  */
3027 static void
dumparcs(struct state * s,FILE * f)3028 dumparcs(struct state * s,
3029 		 FILE *f)
3030 {
3031 	int			pos;
3032 	struct arc *a;
3033 
3034 	/* printing oldest arcs first is usually clearer */
3035 	a = s->outs;
3036 	assert(a != NULL);
3037 	while (a->outchain != NULL)
3038 		a = a->outchain;
3039 	pos = 1;
3040 	do
3041 	{
3042 		dumparc(a, s, f);
3043 		if (pos == 5)
3044 		{
3045 			fprintf(f, "\n");
3046 			pos = 1;
3047 		}
3048 		else
3049 			pos++;
3050 		a = a->outchainRev;
3051 	} while (a != NULL);
3052 	if (pos != 1)
3053 		fprintf(f, "\n");
3054 }
3055 
3056 /*
3057  * dumparc - dump one outarc in readable form, including prefixing tab
3058  */
3059 static void
dumparc(struct arc * a,struct state * s,FILE * f)3060 dumparc(struct arc * a,
3061 		struct state * s,
3062 		FILE *f)
3063 {
3064 	struct arc *aa;
3065 	struct arcbatch *ab;
3066 
3067 	fprintf(f, "\t");
3068 	switch (a->type)
3069 	{
3070 		case PLAIN:
3071 			fprintf(f, "[%ld]", (long) a->co);
3072 			break;
3073 		case AHEAD:
3074 			fprintf(f, ">%ld>", (long) a->co);
3075 			break;
3076 		case BEHIND:
3077 			fprintf(f, "<%ld<", (long) a->co);
3078 			break;
3079 		case LACON:
3080 			fprintf(f, ":%ld:", (long) a->co);
3081 			break;
3082 		case '^':
3083 		case '$':
3084 			fprintf(f, "%c%d", a->type, (int) a->co);
3085 			break;
3086 		case EMPTY:
3087 			break;
3088 		default:
3089 			fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
3090 			break;
3091 	}
3092 	if (a->from != s)
3093 		fprintf(f, "?%d?", a->from->no);
3094 	for (ab = &a->from->oas; ab != NULL; ab = ab->next)
3095 	{
3096 		for (aa = &ab->a[0]; aa < &ab->a[ABSIZE]; aa++)
3097 			if (aa == a)
3098 				break;			/* NOTE BREAK OUT */
3099 		if (aa < &ab->a[ABSIZE])	/* propagate break */
3100 			break;				/* NOTE BREAK OUT */
3101 	}
3102 	if (ab == NULL)
3103 		fprintf(f, "?!?");		/* not in allocated space */
3104 	fprintf(f, "->");
3105 	if (a->to == NULL)
3106 	{
3107 		fprintf(f, "NULL");
3108 		return;
3109 	}
3110 	fprintf(f, "%d", a->to->no);
3111 	for (aa = a->to->ins; aa != NULL; aa = aa->inchain)
3112 		if (aa == a)
3113 			break;				/* NOTE BREAK OUT */
3114 	if (aa == NULL)
3115 		fprintf(f, "?!?");		/* missing from in-chain */
3116 }
3117 #endif   /* REG_DEBUG */
3118 
3119 /*
3120  * dumpcnfa - dump a compacted NFA in human-readable form
3121  */
3122 #ifdef REG_DEBUG
3123 static void
dumpcnfa(struct cnfa * cnfa,FILE * f)3124 dumpcnfa(struct cnfa * cnfa,
3125 		 FILE *f)
3126 {
3127 	int			st;
3128 
3129 	fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
3130 	if (cnfa->bos[0] != COLORLESS)
3131 		fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
3132 	if (cnfa->bos[1] != COLORLESS)
3133 		fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
3134 	if (cnfa->eos[0] != COLORLESS)
3135 		fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
3136 	if (cnfa->eos[1] != COLORLESS)
3137 		fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
3138 	if (cnfa->flags & HASLACONS)
3139 		fprintf(f, ", haslacons");
3140 	fprintf(f, "\n");
3141 	for (st = 0; st < cnfa->nstates; st++)
3142 		dumpcstate(st, cnfa, f);
3143 	fflush(f);
3144 }
3145 #endif
3146 
3147 #ifdef REG_DEBUG				/* subordinates of dumpcnfa */
3148 
3149 /*
3150  * dumpcstate - dump a compacted-NFA state in human-readable form
3151  */
3152 static void
dumpcstate(int st,struct cnfa * cnfa,FILE * f)3153 dumpcstate(int st,
3154 		   struct cnfa * cnfa,
3155 		   FILE *f)
3156 {
3157 	struct carc *ca;
3158 	int			pos;
3159 
3160 	fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
3161 	pos = 1;
3162 	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
3163 	{
3164 		if (ca->co < cnfa->ncolors)
3165 			fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
3166 		else
3167 			fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
3168 		if (pos == 5)
3169 		{
3170 			fprintf(f, "\n");
3171 			pos = 1;
3172 		}
3173 		else
3174 			pos++;
3175 	}
3176 	if (ca == cnfa->states[st] || pos != 1)
3177 		fprintf(f, "\n");
3178 	fflush(f);
3179 }
3180 
3181 #endif   /* REG_DEBUG */
3182