1 /*
2  * NFA utilities.
3  * This file is #included by regcomp.c.
4  *
5  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
6  *
7  * Development of this software was funded, in part, by Cray Research Inc.,
8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9  * Corporation, none of whom are responsible for the results.  The author
10  * thanks all of them.
11  *
12  * Redistribution and use in source and binary forms -- with or without
13  * modification -- are permitted for any purpose, provided that
14  * redistributions in source form retain this entire copyright notice and
15  * indicate the origin and nature of any modifications.
16  *
17  * I'd appreciate being given credit for this package in the documentation
18  * of software which uses it, but that is not a requirement.
19  *
20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * src/backend/regex/regc_nfa.c
32  *
33  *
34  * One or two things that technically ought to be in here
35  * are actually in color.c, thanks to some incestuous relationships in
36  * the color chains.
37  */
38 
39 #define NISERR()	VISERR(nfa->v)
40 #define NERR(e)		VERR(nfa->v, (e))
41 
42 
43 /*
44  * newnfa - set up an NFA
45  */
46 static struct nfa *				/* the NFA, or NULL */
newnfa(struct vars * v,struct colormap * cm,struct nfa * parent)47 newnfa(struct vars *v,
48 	   struct colormap *cm,
49 	   struct nfa *parent)		/* NULL if primary NFA */
50 {
51 	struct nfa *nfa;
52 
53 	nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
54 	if (nfa == NULL)
55 	{
56 		ERR(REG_ESPACE);
57 		return NULL;
58 	}
59 
60 	/* Make the NFA minimally valid, so freenfa() will behave sanely */
61 	nfa->states = NULL;
62 	nfa->slast = NULL;
63 	nfa->freestates = NULL;
64 	nfa->freearcs = NULL;
65 	nfa->lastsb = NULL;
66 	nfa->lastab = NULL;
67 	nfa->lastsbused = 0;
68 	nfa->lastabused = 0;
69 	nfa->nstates = 0;
70 	nfa->cm = cm;
71 	nfa->v = v;
72 	nfa->bos[0] = nfa->bos[1] = COLORLESS;
73 	nfa->eos[0] = nfa->eos[1] = COLORLESS;
74 	nfa->flags = 0;
75 	nfa->minmatchall = nfa->maxmatchall = -1;
76 	nfa->parent = parent;		/* Precedes newfstate so parent is valid. */
77 
78 	/* Create required infrastructure */
79 	nfa->post = newfstate(nfa, '@');	/* number 0 */
80 	nfa->pre = newfstate(nfa, '>'); /* number 1 */
81 	nfa->init = newstate(nfa);	/* may become invalid later */
82 	nfa->final = newstate(nfa);
83 	if (ISERR())
84 	{
85 		freenfa(nfa);
86 		return NULL;
87 	}
88 	rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
89 	newarc(nfa, '^', 1, nfa->pre, nfa->init);
90 	newarc(nfa, '^', 0, nfa->pre, nfa->init);
91 	rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
92 	newarc(nfa, '$', 1, nfa->final, nfa->post);
93 	newarc(nfa, '$', 0, nfa->final, nfa->post);
94 
95 	if (ISERR())
96 	{
97 		freenfa(nfa);
98 		return NULL;
99 	}
100 	return nfa;
101 }
102 
103 /*
104  * freenfa - free an entire NFA
105  */
106 static void
freenfa(struct nfa * nfa)107 freenfa(struct nfa *nfa)
108 {
109 	struct statebatch *sb;
110 	struct statebatch *sbnext;
111 	struct arcbatch *ab;
112 	struct arcbatch *abnext;
113 
114 	for (sb = nfa->lastsb; sb != NULL; sb = sbnext)
115 	{
116 		sbnext = sb->next;
117 		nfa->v->spaceused -= STATEBATCHSIZE(sb->nstates);
118 		FREE(sb);
119 	}
120 	nfa->lastsb = NULL;
121 	for (ab = nfa->lastab; ab != NULL; ab = abnext)
122 	{
123 		abnext = ab->next;
124 		nfa->v->spaceused -= ARCBATCHSIZE(ab->narcs);
125 		FREE(ab);
126 	}
127 	nfa->lastab = NULL;
128 
129 	nfa->nstates = -1;
130 	FREE(nfa);
131 }
132 
133 /*
134  * newstate - allocate an NFA state, with zero flag value
135  */
136 static struct state *			/* NULL on error */
newstate(struct nfa * nfa)137 newstate(struct nfa *nfa)
138 {
139 	struct state *s;
140 
141 	/*
142 	 * This is a handy place to check for operation cancel during regex
143 	 * compilation, since no code path will go very long without making a new
144 	 * state or arc.
145 	 */
146 	if (CANCEL_REQUESTED(nfa->v->re))
147 	{
148 		NERR(REG_CANCEL);
149 		return NULL;
150 	}
151 
152 	/* first, recycle anything that's on the freelist */
153 	if (nfa->freestates != NULL)
154 	{
155 		s = nfa->freestates;
156 		nfa->freestates = s->next;
157 	}
158 	/* otherwise, is there anything left in the last statebatch? */
159 	else if (nfa->lastsb != NULL && nfa->lastsbused < nfa->lastsb->nstates)
160 	{
161 		s = &nfa->lastsb->s[nfa->lastsbused++];
162 	}
163 	/* otherwise, need to allocate a new statebatch */
164 	else
165 	{
166 		struct statebatch *newSb;
167 		size_t		nstates;
168 
169 		if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
170 		{
171 			NERR(REG_ETOOBIG);
172 			return NULL;
173 		}
174 		nstates = (nfa->lastsb != NULL) ? nfa->lastsb->nstates * 2 : FIRSTSBSIZE;
175 		if (nstates > MAXSBSIZE)
176 			nstates = MAXSBSIZE;
177 		newSb = (struct statebatch *) MALLOC(STATEBATCHSIZE(nstates));
178 		if (newSb == NULL)
179 		{
180 			NERR(REG_ESPACE);
181 			return NULL;
182 		}
183 		nfa->v->spaceused += STATEBATCHSIZE(nstates);
184 		newSb->nstates = nstates;
185 		newSb->next = nfa->lastsb;
186 		nfa->lastsb = newSb;
187 		nfa->lastsbused = 1;
188 		s = &newSb->s[0];
189 	}
190 
191 	assert(nfa->nstates >= 0);
192 	s->no = nfa->nstates++;
193 	s->flag = 0;
194 	if (nfa->states == NULL)
195 		nfa->states = s;
196 	s->nins = 0;
197 	s->ins = NULL;
198 	s->nouts = 0;
199 	s->outs = NULL;
200 	s->tmp = NULL;
201 	s->next = NULL;
202 	if (nfa->slast != NULL)
203 	{
204 		assert(nfa->slast->next == NULL);
205 		nfa->slast->next = s;
206 	}
207 	s->prev = nfa->slast;
208 	nfa->slast = s;
209 	return s;
210 }
211 
212 /*
213  * newfstate - allocate an NFA state with a specified flag value
214  */
215 static struct state *			/* NULL on error */
newfstate(struct nfa * nfa,int flag)216 newfstate(struct nfa *nfa, int flag)
217 {
218 	struct state *s;
219 
220 	s = newstate(nfa);
221 	if (s != NULL)
222 		s->flag = (char) flag;
223 	return s;
224 }
225 
226 /*
227  * dropstate - delete a state's inarcs and outarcs and free it
228  */
229 static void
dropstate(struct nfa * nfa,struct state * s)230 dropstate(struct nfa *nfa,
231 		  struct state *s)
232 {
233 	struct arc *a;
234 
235 	while ((a = s->ins) != NULL)
236 		freearc(nfa, a);
237 	while ((a = s->outs) != NULL)
238 		freearc(nfa, a);
239 	freestate(nfa, s);
240 }
241 
242 /*
243  * freestate - free a state, which has no in-arcs or out-arcs
244  */
245 static void
freestate(struct nfa * nfa,struct state * s)246 freestate(struct nfa *nfa,
247 		  struct state *s)
248 {
249 	assert(s != NULL);
250 	assert(s->nins == 0 && s->nouts == 0);
251 
252 	s->no = FREESTATE;
253 	s->flag = 0;
254 	if (s->next != NULL)
255 		s->next->prev = s->prev;
256 	else
257 	{
258 		assert(s == nfa->slast);
259 		nfa->slast = s->prev;
260 	}
261 	if (s->prev != NULL)
262 		s->prev->next = s->next;
263 	else
264 	{
265 		assert(s == nfa->states);
266 		nfa->states = s->next;
267 	}
268 	s->prev = NULL;
269 	s->next = nfa->freestates;	/* don't delete it, put it on the free list */
270 	nfa->freestates = s;
271 }
272 
273 /*
274  * newarc - set up a new arc within an NFA
275  *
276  * This function checks to make sure that no duplicate arcs are created.
277  * In general we never want duplicates.
278  *
279  * However: in principle, a RAINBOW arc is redundant with any plain arc
280  * (unless that arc is for a pseudocolor).  But we don't try to recognize
281  * that redundancy, either here or in allied operations such as moveins().
282  * The pseudocolor consideration makes that more costly than it seems worth.
283  */
284 static void
newarc(struct nfa * nfa,int t,color co,struct state * from,struct state * to)285 newarc(struct nfa *nfa,
286 	   int t,
287 	   color co,
288 	   struct state *from,
289 	   struct state *to)
290 {
291 	struct arc *a;
292 
293 	assert(from != NULL && to != NULL);
294 
295 	/*
296 	 * This is a handy place to check for operation cancel during regex
297 	 * compilation, since no code path will go very long without making a new
298 	 * state or arc.
299 	 */
300 	if (CANCEL_REQUESTED(nfa->v->re))
301 	{
302 		NERR(REG_CANCEL);
303 		return;
304 	}
305 
306 	/* check for duplicate arc, using whichever chain is shorter */
307 	if (from->nouts <= to->nins)
308 	{
309 		for (a = from->outs; a != NULL; a = a->outchain)
310 			if (a->to == to && a->co == co && a->type == t)
311 				return;
312 	}
313 	else
314 	{
315 		for (a = to->ins; a != NULL; a = a->inchain)
316 			if (a->from == from && a->co == co && a->type == t)
317 				return;
318 	}
319 
320 	/* no dup, so create the arc */
321 	createarc(nfa, t, co, from, to);
322 }
323 
324 /*
325  * createarc - create a new arc within an NFA
326  *
327  * This function must *only* be used after verifying that there is no existing
328  * identical arc (same type/color/from/to).
329  */
330 static void
createarc(struct nfa * nfa,int t,color co,struct state * from,struct state * to)331 createarc(struct nfa *nfa,
332 		  int t,
333 		  color co,
334 		  struct state *from,
335 		  struct state *to)
336 {
337 	struct arc *a;
338 
339 	a = allocarc(nfa);
340 	if (NISERR())
341 		return;
342 	assert(a != NULL);
343 
344 	a->type = t;
345 	a->co = co;
346 	a->to = to;
347 	a->from = from;
348 
349 	/*
350 	 * Put the new arc on the beginning, not the end, of the chains; it's
351 	 * simpler here, and freearc() is the same cost either way.  See also the
352 	 * logic in moveins() and its cohorts, as well as fixempties().
353 	 */
354 	a->inchain = to->ins;
355 	a->inchainRev = NULL;
356 	if (to->ins)
357 		to->ins->inchainRev = a;
358 	to->ins = a;
359 	a->outchain = from->outs;
360 	a->outchainRev = NULL;
361 	if (from->outs)
362 		from->outs->outchainRev = a;
363 	from->outs = a;
364 
365 	from->nouts++;
366 	to->nins++;
367 
368 	if (COLORED(a) && nfa->parent == NULL)
369 		colorchain(nfa->cm, a);
370 }
371 
372 /*
373  * allocarc - allocate a new arc within an NFA
374  */
375 static struct arc *				/* NULL for failure */
allocarc(struct nfa * nfa)376 allocarc(struct nfa *nfa)
377 {
378 	struct arc *a;
379 
380 	/* first, recycle anything that's on the freelist */
381 	if (nfa->freearcs != NULL)
382 	{
383 		a = nfa->freearcs;
384 		nfa->freearcs = a->freechain;
385 	}
386 	/* otherwise, is there anything left in the last arcbatch? */
387 	else if (nfa->lastab != NULL && nfa->lastabused < nfa->lastab->narcs)
388 	{
389 		a = &nfa->lastab->a[nfa->lastabused++];
390 	}
391 	/* otherwise, need to allocate a new arcbatch */
392 	else
393 	{
394 		struct arcbatch *newAb;
395 		size_t		narcs;
396 
397 		if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
398 		{
399 			NERR(REG_ETOOBIG);
400 			return NULL;
401 		}
402 		narcs = (nfa->lastab != NULL) ? nfa->lastab->narcs * 2 : FIRSTABSIZE;
403 		if (narcs > MAXABSIZE)
404 			narcs = MAXABSIZE;
405 		newAb = (struct arcbatch *) MALLOC(ARCBATCHSIZE(narcs));
406 		if (newAb == NULL)
407 		{
408 			NERR(REG_ESPACE);
409 			return NULL;
410 		}
411 		nfa->v->spaceused += ARCBATCHSIZE(narcs);
412 		newAb->narcs = narcs;
413 		newAb->next = nfa->lastab;
414 		nfa->lastab = newAb;
415 		nfa->lastabused = 1;
416 		a = &newAb->a[0];
417 	}
418 
419 	return a;
420 }
421 
422 /*
423  * freearc - free an arc
424  */
425 static void
freearc(struct nfa * nfa,struct arc * victim)426 freearc(struct nfa *nfa,
427 		struct arc *victim)
428 {
429 	struct state *from = victim->from;
430 	struct state *to = victim->to;
431 	struct arc *predecessor;
432 
433 	assert(victim->type != 0);
434 
435 	/* take it off color chain if necessary */
436 	if (COLORED(victim) && nfa->parent == NULL)
437 		uncolorchain(nfa->cm, victim);
438 
439 	/* take it off source's out-chain */
440 	assert(from != NULL);
441 	predecessor = victim->outchainRev;
442 	if (predecessor == NULL)
443 	{
444 		assert(from->outs == victim);
445 		from->outs = victim->outchain;
446 	}
447 	else
448 	{
449 		assert(predecessor->outchain == victim);
450 		predecessor->outchain = victim->outchain;
451 	}
452 	if (victim->outchain != NULL)
453 	{
454 		assert(victim->outchain->outchainRev == victim);
455 		victim->outchain->outchainRev = predecessor;
456 	}
457 	from->nouts--;
458 
459 	/* take it off target's in-chain */
460 	assert(to != NULL);
461 	predecessor = victim->inchainRev;
462 	if (predecessor == NULL)
463 	{
464 		assert(to->ins == victim);
465 		to->ins = victim->inchain;
466 	}
467 	else
468 	{
469 		assert(predecessor->inchain == victim);
470 		predecessor->inchain = victim->inchain;
471 	}
472 	if (victim->inchain != NULL)
473 	{
474 		assert(victim->inchain->inchainRev == victim);
475 		victim->inchain->inchainRev = predecessor;
476 	}
477 	to->nins--;
478 
479 	/* clean up and place on NFA's free list */
480 	victim->type = 0;
481 	victim->from = NULL;		/* precautions... */
482 	victim->to = NULL;
483 	victim->inchain = NULL;
484 	victim->inchainRev = NULL;
485 	victim->outchain = NULL;
486 	victim->outchainRev = NULL;
487 	victim->freechain = nfa->freearcs;
488 	nfa->freearcs = victim;
489 }
490 
491 /*
492  * changearcsource - flip an arc to have a different from state
493  *
494  * Caller must have verified that there is no pre-existing duplicate arc.
495  */
496 static void
changearcsource(struct arc * a,struct state * newfrom)497 changearcsource(struct arc *a, struct state *newfrom)
498 {
499 	struct state *oldfrom = a->from;
500 	struct arc *predecessor;
501 
502 	assert(oldfrom != newfrom);
503 
504 	/* take it off old source's out-chain */
505 	assert(oldfrom != NULL);
506 	predecessor = a->outchainRev;
507 	if (predecessor == NULL)
508 	{
509 		assert(oldfrom->outs == a);
510 		oldfrom->outs = a->outchain;
511 	}
512 	else
513 	{
514 		assert(predecessor->outchain == a);
515 		predecessor->outchain = a->outchain;
516 	}
517 	if (a->outchain != NULL)
518 	{
519 		assert(a->outchain->outchainRev == a);
520 		a->outchain->outchainRev = predecessor;
521 	}
522 	oldfrom->nouts--;
523 
524 	a->from = newfrom;
525 
526 	/* prepend it to new source's out-chain */
527 	a->outchain = newfrom->outs;
528 	a->outchainRev = NULL;
529 	if (newfrom->outs)
530 		newfrom->outs->outchainRev = a;
531 	newfrom->outs = a;
532 	newfrom->nouts++;
533 }
534 
535 /*
536  * changearctarget - flip an arc to have a different to state
537  *
538  * Caller must have verified that there is no pre-existing duplicate arc.
539  */
540 static void
changearctarget(struct arc * a,struct state * newto)541 changearctarget(struct arc *a, struct state *newto)
542 {
543 	struct state *oldto = a->to;
544 	struct arc *predecessor;
545 
546 	assert(oldto != newto);
547 
548 	/* take it off old target's in-chain */
549 	assert(oldto != NULL);
550 	predecessor = a->inchainRev;
551 	if (predecessor == NULL)
552 	{
553 		assert(oldto->ins == a);
554 		oldto->ins = a->inchain;
555 	}
556 	else
557 	{
558 		assert(predecessor->inchain == a);
559 		predecessor->inchain = a->inchain;
560 	}
561 	if (a->inchain != NULL)
562 	{
563 		assert(a->inchain->inchainRev == a);
564 		a->inchain->inchainRev = predecessor;
565 	}
566 	oldto->nins--;
567 
568 	a->to = newto;
569 
570 	/* prepend it to new target's in-chain */
571 	a->inchain = newto->ins;
572 	a->inchainRev = NULL;
573 	if (newto->ins)
574 		newto->ins->inchainRev = a;
575 	newto->ins = a;
576 	newto->nins++;
577 }
578 
579 /*
580  * hasnonemptyout - Does state have a non-EMPTY out arc?
581  */
582 static int
hasnonemptyout(struct state * s)583 hasnonemptyout(struct state *s)
584 {
585 	struct arc *a;
586 
587 	for (a = s->outs; a != NULL; a = a->outchain)
588 	{
589 		if (a->type != EMPTY)
590 			return 1;
591 	}
592 	return 0;
593 }
594 
595 /*
596  * findarc - find arc, if any, from given source with given type and color
597  * If there is more than one such arc, the result is random.
598  */
599 static struct arc *
findarc(struct state * s,int type,color co)600 findarc(struct state *s,
601 		int type,
602 		color co)
603 {
604 	struct arc *a;
605 
606 	for (a = s->outs; a != NULL; a = a->outchain)
607 		if (a->type == type && a->co == co)
608 			return a;
609 	return NULL;
610 }
611 
612 /*
613  * cparc - allocate a new arc within an NFA, copying details from old one
614  */
615 static void
cparc(struct nfa * nfa,struct arc * oa,struct state * from,struct state * to)616 cparc(struct nfa *nfa,
617 	  struct arc *oa,
618 	  struct state *from,
619 	  struct state *to)
620 {
621 	newarc(nfa, oa->type, oa->co, from, to);
622 }
623 
624 /*
625  * sortins - sort the in arcs of a state by from/color/type
626  */
627 static void
sortins(struct nfa * nfa,struct state * s)628 sortins(struct nfa *nfa,
629 		struct state *s)
630 {
631 	struct arc **sortarray;
632 	struct arc *a;
633 	int			n = s->nins;
634 	int			i;
635 
636 	if (n <= 1)
637 		return;					/* nothing to do */
638 	/* make an array of arc pointers ... */
639 	sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
640 	if (sortarray == NULL)
641 	{
642 		NERR(REG_ESPACE);
643 		return;
644 	}
645 	i = 0;
646 	for (a = s->ins; a != NULL; a = a->inchain)
647 		sortarray[i++] = a;
648 	assert(i == n);
649 	/* ... sort the array */
650 	qsort(sortarray, n, sizeof(struct arc *), sortins_cmp);
651 	/* ... and rebuild arc list in order */
652 	/* it seems worth special-casing first and last items to simplify loop */
653 	a = sortarray[0];
654 	s->ins = a;
655 	a->inchain = sortarray[1];
656 	a->inchainRev = NULL;
657 	for (i = 1; i < n - 1; i++)
658 	{
659 		a = sortarray[i];
660 		a->inchain = sortarray[i + 1];
661 		a->inchainRev = sortarray[i - 1];
662 	}
663 	a = sortarray[i];
664 	a->inchain = NULL;
665 	a->inchainRev = sortarray[i - 1];
666 	FREE(sortarray);
667 }
668 
669 static int
sortins_cmp(const void * a,const void * b)670 sortins_cmp(const void *a, const void *b)
671 {
672 	const struct arc *aa = *((const struct arc *const *) a);
673 	const struct arc *bb = *((const struct arc *const *) b);
674 
675 	/* we check the fields in the order they are most likely to be different */
676 	if (aa->from->no < bb->from->no)
677 		return -1;
678 	if (aa->from->no > bb->from->no)
679 		return 1;
680 	if (aa->co < bb->co)
681 		return -1;
682 	if (aa->co > bb->co)
683 		return 1;
684 	if (aa->type < bb->type)
685 		return -1;
686 	if (aa->type > bb->type)
687 		return 1;
688 	return 0;
689 }
690 
691 /*
692  * sortouts - sort the out arcs of a state by to/color/type
693  */
694 static void
sortouts(struct nfa * nfa,struct state * s)695 sortouts(struct nfa *nfa,
696 		 struct state *s)
697 {
698 	struct arc **sortarray;
699 	struct arc *a;
700 	int			n = s->nouts;
701 	int			i;
702 
703 	if (n <= 1)
704 		return;					/* nothing to do */
705 	/* make an array of arc pointers ... */
706 	sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
707 	if (sortarray == NULL)
708 	{
709 		NERR(REG_ESPACE);
710 		return;
711 	}
712 	i = 0;
713 	for (a = s->outs; a != NULL; a = a->outchain)
714 		sortarray[i++] = a;
715 	assert(i == n);
716 	/* ... sort the array */
717 	qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp);
718 	/* ... and rebuild arc list in order */
719 	/* it seems worth special-casing first and last items to simplify loop */
720 	a = sortarray[0];
721 	s->outs = a;
722 	a->outchain = sortarray[1];
723 	a->outchainRev = NULL;
724 	for (i = 1; i < n - 1; i++)
725 	{
726 		a = sortarray[i];
727 		a->outchain = sortarray[i + 1];
728 		a->outchainRev = sortarray[i - 1];
729 	}
730 	a = sortarray[i];
731 	a->outchain = NULL;
732 	a->outchainRev = sortarray[i - 1];
733 	FREE(sortarray);
734 }
735 
736 static int
sortouts_cmp(const void * a,const void * b)737 sortouts_cmp(const void *a, const void *b)
738 {
739 	const struct arc *aa = *((const struct arc *const *) a);
740 	const struct arc *bb = *((const struct arc *const *) b);
741 
742 	/* we check the fields in the order they are most likely to be different */
743 	if (aa->to->no < bb->to->no)
744 		return -1;
745 	if (aa->to->no > bb->to->no)
746 		return 1;
747 	if (aa->co < bb->co)
748 		return -1;
749 	if (aa->co > bb->co)
750 		return 1;
751 	if (aa->type < bb->type)
752 		return -1;
753 	if (aa->type > bb->type)
754 		return 1;
755 	return 0;
756 }
757 
758 /*
759  * Common decision logic about whether to use arc-by-arc operations or
760  * sort/merge.  If there's just a few source arcs we cannot recoup the
761  * cost of sorting the destination arc list, no matter how large it is.
762  * Otherwise, limit the number of arc-by-arc comparisons to about 1000
763  * (a somewhat arbitrary choice, but the breakeven point would probably
764  * be machine dependent anyway).
765  */
766 #define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
767 	((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32))
768 
769 /*
770  * moveins - move all in arcs of a state to another state
771  *
772  * You might think this could be done better by just updating the
773  * existing arcs, and you would be right if it weren't for the need
774  * for duplicate suppression, which makes it easier to just make new
775  * ones to exploit the suppression built into newarc.
776  *
777  * However, if we have a whole lot of arcs to deal with, retail duplicate
778  * checks become too slow.  In that case we proceed by sorting and merging
779  * the arc lists, and then we can indeed just update the arcs in-place.
780  */
781 static void
moveins(struct nfa * nfa,struct state * oldState,struct state * newState)782 moveins(struct nfa *nfa,
783 		struct state *oldState,
784 		struct state *newState)
785 {
786 	assert(oldState != newState);
787 
788 	if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
789 	{
790 		/* With not too many arcs, just do them one at a time */
791 		struct arc *a;
792 
793 		while ((a = oldState->ins) != NULL)
794 		{
795 			cparc(nfa, a, a->from, newState);
796 			freearc(nfa, a);
797 		}
798 	}
799 	else
800 	{
801 		/*
802 		 * With many arcs, use a sort-merge approach.  Note changearctarget()
803 		 * will put the arc onto the front of newState's chain, so it does not
804 		 * break our walk through the sorted part of the chain.
805 		 */
806 		struct arc *oa;
807 		struct arc *na;
808 
809 		/*
810 		 * Because we bypass newarc() in this code path, we'd better include a
811 		 * cancel check.
812 		 */
813 		if (CANCEL_REQUESTED(nfa->v->re))
814 		{
815 			NERR(REG_CANCEL);
816 			return;
817 		}
818 
819 		sortins(nfa, oldState);
820 		sortins(nfa, newState);
821 		if (NISERR())
822 			return;				/* might have failed to sort */
823 		oa = oldState->ins;
824 		na = newState->ins;
825 		while (oa != NULL && na != NULL)
826 		{
827 			struct arc *a = oa;
828 
829 			switch (sortins_cmp(&oa, &na))
830 			{
831 				case -1:
832 					/* newState does not have anything matching oa */
833 					oa = oa->inchain;
834 
835 					/*
836 					 * Rather than doing createarc+freearc, we can just unlink
837 					 * and relink the existing arc struct.
838 					 */
839 					changearctarget(a, newState);
840 					break;
841 				case 0:
842 					/* match, advance in both lists */
843 					oa = oa->inchain;
844 					na = na->inchain;
845 					/* ... and drop duplicate arc from oldState */
846 					freearc(nfa, a);
847 					break;
848 				case +1:
849 					/* advance only na; oa might have a match later */
850 					na = na->inchain;
851 					break;
852 				default:
853 					assert(NOTREACHED);
854 			}
855 		}
856 		while (oa != NULL)
857 		{
858 			/* newState does not have anything matching oa */
859 			struct arc *a = oa;
860 
861 			oa = oa->inchain;
862 			changearctarget(a, newState);
863 		}
864 	}
865 
866 	assert(oldState->nins == 0);
867 	assert(oldState->ins == NULL);
868 }
869 
870 /*
871  * copyins - copy in arcs of a state to another state
872  */
873 static void
copyins(struct nfa * nfa,struct state * oldState,struct state * newState)874 copyins(struct nfa *nfa,
875 		struct state *oldState,
876 		struct state *newState)
877 {
878 	assert(oldState != newState);
879 
880 	if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
881 	{
882 		/* With not too many arcs, just do them one at a time */
883 		struct arc *a;
884 
885 		for (a = oldState->ins; a != NULL; a = a->inchain)
886 			cparc(nfa, a, a->from, newState);
887 	}
888 	else
889 	{
890 		/*
891 		 * With many arcs, use a sort-merge approach.  Note that createarc()
892 		 * will put new arcs onto the front of newState's chain, so it does
893 		 * not break our walk through the sorted part of the chain.
894 		 */
895 		struct arc *oa;
896 		struct arc *na;
897 
898 		/*
899 		 * Because we bypass newarc() in this code path, we'd better include a
900 		 * cancel check.
901 		 */
902 		if (CANCEL_REQUESTED(nfa->v->re))
903 		{
904 			NERR(REG_CANCEL);
905 			return;
906 		}
907 
908 		sortins(nfa, oldState);
909 		sortins(nfa, newState);
910 		if (NISERR())
911 			return;				/* might have failed to sort */
912 		oa = oldState->ins;
913 		na = newState->ins;
914 		while (oa != NULL && na != NULL)
915 		{
916 			struct arc *a = oa;
917 
918 			switch (sortins_cmp(&oa, &na))
919 			{
920 				case -1:
921 					/* newState does not have anything matching oa */
922 					oa = oa->inchain;
923 					createarc(nfa, a->type, a->co, a->from, newState);
924 					break;
925 				case 0:
926 					/* match, advance in both lists */
927 					oa = oa->inchain;
928 					na = na->inchain;
929 					break;
930 				case +1:
931 					/* advance only na; oa might have a match later */
932 					na = na->inchain;
933 					break;
934 				default:
935 					assert(NOTREACHED);
936 			}
937 		}
938 		while (oa != NULL)
939 		{
940 			/* newState does not have anything matching oa */
941 			struct arc *a = oa;
942 
943 			oa = oa->inchain;
944 			createarc(nfa, a->type, a->co, a->from, newState);
945 		}
946 	}
947 }
948 
949 /*
950  * mergeins - merge a list of inarcs into a state
951  *
952  * This is much like copyins, but the source arcs are listed in an array,
953  * and are not guaranteed unique.  It's okay to clobber the array contents.
954  */
955 static void
mergeins(struct nfa * nfa,struct state * s,struct arc ** arcarray,int arccount)956 mergeins(struct nfa *nfa,
957 		 struct state *s,
958 		 struct arc **arcarray,
959 		 int arccount)
960 {
961 	struct arc *na;
962 	int			i;
963 	int			j;
964 
965 	if (arccount <= 0)
966 		return;
967 
968 	/*
969 	 * Because we bypass newarc() in this code path, we'd better include a
970 	 * cancel check.
971 	 */
972 	if (CANCEL_REQUESTED(nfa->v->re))
973 	{
974 		NERR(REG_CANCEL);
975 		return;
976 	}
977 
978 	/* Sort existing inarcs as well as proposed new ones */
979 	sortins(nfa, s);
980 	if (NISERR())
981 		return;					/* might have failed to sort */
982 
983 	qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
984 
985 	/*
986 	 * arcarray very likely includes dups, so we must eliminate them.  (This
987 	 * could be folded into the next loop, but it's not worth the trouble.)
988 	 */
989 	j = 0;
990 	for (i = 1; i < arccount; i++)
991 	{
992 		switch (sortins_cmp(&arcarray[j], &arcarray[i]))
993 		{
994 			case -1:
995 				/* non-dup */
996 				arcarray[++j] = arcarray[i];
997 				break;
998 			case 0:
999 				/* dup */
1000 				break;
1001 			default:
1002 				/* trouble */
1003 				assert(NOTREACHED);
1004 		}
1005 	}
1006 	arccount = j + 1;
1007 
1008 	/*
1009 	 * Now merge into s' inchain.  Note that createarc() will put new arcs
1010 	 * onto the front of s's chain, so it does not break our walk through the
1011 	 * sorted part of the chain.
1012 	 */
1013 	i = 0;
1014 	na = s->ins;
1015 	while (i < arccount && na != NULL)
1016 	{
1017 		struct arc *a = arcarray[i];
1018 
1019 		switch (sortins_cmp(&a, &na))
1020 		{
1021 			case -1:
1022 				/* s does not have anything matching a */
1023 				createarc(nfa, a->type, a->co, a->from, s);
1024 				i++;
1025 				break;
1026 			case 0:
1027 				/* match, advance in both lists */
1028 				i++;
1029 				na = na->inchain;
1030 				break;
1031 			case +1:
1032 				/* advance only na; array might have a match later */
1033 				na = na->inchain;
1034 				break;
1035 			default:
1036 				assert(NOTREACHED);
1037 		}
1038 	}
1039 	while (i < arccount)
1040 	{
1041 		/* s does not have anything matching a */
1042 		struct arc *a = arcarray[i];
1043 
1044 		createarc(nfa, a->type, a->co, a->from, s);
1045 		i++;
1046 	}
1047 }
1048 
1049 /*
1050  * moveouts - move all out arcs of a state to another state
1051  *
1052  * See comments for moveins()
1053  */
1054 static void
moveouts(struct nfa * nfa,struct state * oldState,struct state * newState)1055 moveouts(struct nfa *nfa,
1056 		 struct state *oldState,
1057 		 struct state *newState)
1058 {
1059 	assert(oldState != newState);
1060 
1061 	if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1062 	{
1063 		/* With not too many arcs, just do them one at a time */
1064 		struct arc *a;
1065 
1066 		while ((a = oldState->outs) != NULL)
1067 		{
1068 			cparc(nfa, a, newState, a->to);
1069 			freearc(nfa, a);
1070 		}
1071 	}
1072 	else
1073 	{
1074 		/*
1075 		 * With many arcs, use a sort-merge approach.  Note changearcsource()
1076 		 * will put the arc onto the front of newState's chain, so it does not
1077 		 * break our walk through the sorted part of the chain.
1078 		 */
1079 		struct arc *oa;
1080 		struct arc *na;
1081 
1082 		/*
1083 		 * Because we bypass newarc() in this code path, we'd better include a
1084 		 * cancel check.
1085 		 */
1086 		if (CANCEL_REQUESTED(nfa->v->re))
1087 		{
1088 			NERR(REG_CANCEL);
1089 			return;
1090 		}
1091 
1092 		sortouts(nfa, oldState);
1093 		sortouts(nfa, newState);
1094 		if (NISERR())
1095 			return;				/* might have failed to sort */
1096 		oa = oldState->outs;
1097 		na = newState->outs;
1098 		while (oa != NULL && na != NULL)
1099 		{
1100 			struct arc *a = oa;
1101 
1102 			switch (sortouts_cmp(&oa, &na))
1103 			{
1104 				case -1:
1105 					/* newState does not have anything matching oa */
1106 					oa = oa->outchain;
1107 
1108 					/*
1109 					 * Rather than doing createarc+freearc, we can just unlink
1110 					 * and relink the existing arc struct.
1111 					 */
1112 					changearcsource(a, newState);
1113 					break;
1114 				case 0:
1115 					/* match, advance in both lists */
1116 					oa = oa->outchain;
1117 					na = na->outchain;
1118 					/* ... and drop duplicate arc from oldState */
1119 					freearc(nfa, a);
1120 					break;
1121 				case +1:
1122 					/* advance only na; oa might have a match later */
1123 					na = na->outchain;
1124 					break;
1125 				default:
1126 					assert(NOTREACHED);
1127 			}
1128 		}
1129 		while (oa != NULL)
1130 		{
1131 			/* newState does not have anything matching oa */
1132 			struct arc *a = oa;
1133 
1134 			oa = oa->outchain;
1135 			changearcsource(a, newState);
1136 		}
1137 	}
1138 
1139 	assert(oldState->nouts == 0);
1140 	assert(oldState->outs == NULL);
1141 }
1142 
1143 /*
1144  * copyouts - copy out arcs of a state to another state
1145  */
1146 static void
copyouts(struct nfa * nfa,struct state * oldState,struct state * newState)1147 copyouts(struct nfa *nfa,
1148 		 struct state *oldState,
1149 		 struct state *newState)
1150 {
1151 	assert(oldState != newState);
1152 
1153 	if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1154 	{
1155 		/* With not too many arcs, just do them one at a time */
1156 		struct arc *a;
1157 
1158 		for (a = oldState->outs; a != NULL; a = a->outchain)
1159 			cparc(nfa, a, newState, a->to);
1160 	}
1161 	else
1162 	{
1163 		/*
1164 		 * With many arcs, use a sort-merge approach.  Note that createarc()
1165 		 * will put new arcs onto the front of newState's chain, so it does
1166 		 * not break our walk through the sorted part of the chain.
1167 		 */
1168 		struct arc *oa;
1169 		struct arc *na;
1170 
1171 		/*
1172 		 * Because we bypass newarc() in this code path, we'd better include a
1173 		 * cancel check.
1174 		 */
1175 		if (CANCEL_REQUESTED(nfa->v->re))
1176 		{
1177 			NERR(REG_CANCEL);
1178 			return;
1179 		}
1180 
1181 		sortouts(nfa, oldState);
1182 		sortouts(nfa, newState);
1183 		if (NISERR())
1184 			return;				/* might have failed to sort */
1185 		oa = oldState->outs;
1186 		na = newState->outs;
1187 		while (oa != NULL && na != NULL)
1188 		{
1189 			struct arc *a = oa;
1190 
1191 			switch (sortouts_cmp(&oa, &na))
1192 			{
1193 				case -1:
1194 					/* newState does not have anything matching oa */
1195 					oa = oa->outchain;
1196 					createarc(nfa, a->type, a->co, newState, a->to);
1197 					break;
1198 				case 0:
1199 					/* match, advance in both lists */
1200 					oa = oa->outchain;
1201 					na = na->outchain;
1202 					break;
1203 				case +1:
1204 					/* advance only na; oa might have a match later */
1205 					na = na->outchain;
1206 					break;
1207 				default:
1208 					assert(NOTREACHED);
1209 			}
1210 		}
1211 		while (oa != NULL)
1212 		{
1213 			/* newState does not have anything matching oa */
1214 			struct arc *a = oa;
1215 
1216 			oa = oa->outchain;
1217 			createarc(nfa, a->type, a->co, newState, a->to);
1218 		}
1219 	}
1220 }
1221 
1222 /*
1223  * cloneouts - copy out arcs of a state to another state pair, modifying type
1224  *
1225  * This is only used to convert PLAIN arcs to AHEAD/BEHIND arcs, which share
1226  * the same interpretation of "co".  It wouldn't be sensible with LACONs.
1227  */
1228 static void
cloneouts(struct nfa * nfa,struct state * old,struct state * from,struct state * to,int type)1229 cloneouts(struct nfa *nfa,
1230 		  struct state *old,
1231 		  struct state *from,
1232 		  struct state *to,
1233 		  int type)
1234 {
1235 	struct arc *a;
1236 
1237 	assert(old != from);
1238 	assert(type == AHEAD || type == BEHIND);
1239 
1240 	for (a = old->outs; a != NULL; a = a->outchain)
1241 	{
1242 		assert(a->type == PLAIN);
1243 		newarc(nfa, type, a->co, from, to);
1244 	}
1245 }
1246 
1247 /*
1248  * delsub - delete a sub-NFA, updating subre pointers if necessary
1249  *
1250  * This uses a recursive traversal of the sub-NFA, marking already-seen
1251  * states using their tmp pointer.
1252  */
1253 static void
delsub(struct nfa * nfa,struct state * lp,struct state * rp)1254 delsub(struct nfa *nfa,
1255 	   struct state *lp,		/* the sub-NFA goes from here... */
1256 	   struct state *rp)		/* ...to here, *not* inclusive */
1257 {
1258 	assert(lp != rp);
1259 
1260 	rp->tmp = rp;				/* mark end */
1261 
1262 	deltraverse(nfa, lp, lp);
1263 	if (NISERR())
1264 		return;					/* asserts might not hold after failure */
1265 	assert(lp->nouts == 0 && rp->nins == 0);	/* did the job */
1266 	assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
1267 
1268 	rp->tmp = NULL;				/* unmark end */
1269 	lp->tmp = NULL;				/* and begin, marked by deltraverse */
1270 }
1271 
1272 /*
1273  * deltraverse - the recursive heart of delsub
1274  * This routine's basic job is to destroy all out-arcs of the state.
1275  */
1276 static void
deltraverse(struct nfa * nfa,struct state * leftend,struct state * s)1277 deltraverse(struct nfa *nfa,
1278 			struct state *leftend,
1279 			struct state *s)
1280 {
1281 	struct arc *a;
1282 	struct state *to;
1283 
1284 	/* Since this is recursive, it could be driven to stack overflow */
1285 	if (STACK_TOO_DEEP(nfa->v->re))
1286 	{
1287 		NERR(REG_ETOOBIG);
1288 		return;
1289 	}
1290 
1291 	if (s->nouts == 0)
1292 		return;					/* nothing to do */
1293 	if (s->tmp != NULL)
1294 		return;					/* already in progress */
1295 
1296 	s->tmp = s;					/* mark as in progress */
1297 
1298 	while ((a = s->outs) != NULL)
1299 	{
1300 		to = a->to;
1301 		deltraverse(nfa, leftend, to);
1302 		if (NISERR())
1303 			return;				/* asserts might not hold after failure */
1304 		assert(to->nouts == 0 || to->tmp != NULL);
1305 		freearc(nfa, a);
1306 		if (to->nins == 0 && to->tmp == NULL)
1307 		{
1308 			assert(to->nouts == 0);
1309 			freestate(nfa, to);
1310 		}
1311 	}
1312 
1313 	assert(s->no != FREESTATE); /* we're still here */
1314 	assert(s == leftend || s->nins != 0);	/* and still reachable */
1315 	assert(s->nouts == 0);		/* but have no outarcs */
1316 
1317 	s->tmp = NULL;				/* we're done here */
1318 }
1319 
1320 /*
1321  * dupnfa - duplicate sub-NFA
1322  *
1323  * Another recursive traversal, this time using tmp to point to duplicates
1324  * as well as mark already-seen states.  (You knew there was a reason why
1325  * it's a state pointer, didn't you? :-))
1326  */
1327 static void
dupnfa(struct nfa * nfa,struct state * start,struct state * stop,struct state * from,struct state * to)1328 dupnfa(struct nfa *nfa,
1329 	   struct state *start,		/* duplicate of subNFA starting here */
1330 	   struct state *stop,		/* and stopping here */
1331 	   struct state *from,		/* stringing duplicate from here */
1332 	   struct state *to)		/* to here */
1333 {
1334 	if (start == stop)
1335 	{
1336 		newarc(nfa, EMPTY, 0, from, to);
1337 		return;
1338 	}
1339 
1340 	stop->tmp = to;
1341 	duptraverse(nfa, start, from);
1342 	/* done, except for clearing out the tmp pointers */
1343 
1344 	stop->tmp = NULL;
1345 	cleartraverse(nfa, start);
1346 }
1347 
1348 /*
1349  * duptraverse - recursive heart of dupnfa
1350  */
1351 static void
duptraverse(struct nfa * nfa,struct state * s,struct state * stmp)1352 duptraverse(struct nfa *nfa,
1353 			struct state *s,
1354 			struct state *stmp) /* s's duplicate, or NULL */
1355 {
1356 	struct arc *a;
1357 
1358 	/* Since this is recursive, it could be driven to stack overflow */
1359 	if (STACK_TOO_DEEP(nfa->v->re))
1360 	{
1361 		NERR(REG_ETOOBIG);
1362 		return;
1363 	}
1364 
1365 	if (s->tmp != NULL)
1366 		return;					/* already done */
1367 
1368 	s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
1369 	if (s->tmp == NULL)
1370 	{
1371 		assert(NISERR());
1372 		return;
1373 	}
1374 
1375 	for (a = s->outs; a != NULL && !NISERR(); a = a->outchain)
1376 	{
1377 		duptraverse(nfa, a->to, (struct state *) NULL);
1378 		if (NISERR())
1379 			break;
1380 		assert(a->to->tmp != NULL);
1381 		cparc(nfa, a, s->tmp, a->to->tmp);
1382 	}
1383 }
1384 
1385 /*
1386  * removeconstraints - remove any constraints in an NFA
1387  *
1388  * Constraint arcs are replaced by empty arcs, essentially treating all
1389  * constraints as automatically satisfied.
1390  */
1391 static void
removeconstraints(struct nfa * nfa,struct state * start,struct state * stop)1392 removeconstraints(struct nfa *nfa,
1393 				  struct state *start,	/* process subNFA starting here */
1394 				  struct state *stop)	/* and stopping here */
1395 {
1396 	if (start == stop)
1397 		return;
1398 
1399 	stop->tmp = stop;
1400 	removetraverse(nfa, start);
1401 	/* done, except for clearing out the tmp pointers */
1402 
1403 	stop->tmp = NULL;
1404 	cleartraverse(nfa, start);
1405 }
1406 
1407 /*
1408  * removetraverse - recursive heart of removeconstraints
1409  */
1410 static void
removetraverse(struct nfa * nfa,struct state * s)1411 removetraverse(struct nfa *nfa,
1412 			   struct state *s)
1413 {
1414 	struct arc *a;
1415 	struct arc *oa;
1416 
1417 	/* Since this is recursive, it could be driven to stack overflow */
1418 	if (STACK_TOO_DEEP(nfa->v->re))
1419 	{
1420 		NERR(REG_ETOOBIG);
1421 		return;
1422 	}
1423 
1424 	if (s->tmp != NULL)
1425 		return;					/* already done */
1426 
1427 	s->tmp = s;
1428 	for (a = s->outs; a != NULL && !NISERR(); a = oa)
1429 	{
1430 		removetraverse(nfa, a->to);
1431 		if (NISERR())
1432 			break;
1433 		oa = a->outchain;
1434 		switch (a->type)
1435 		{
1436 			case PLAIN:
1437 			case EMPTY:
1438 				/* nothing to do */
1439 				break;
1440 			case AHEAD:
1441 			case BEHIND:
1442 			case '^':
1443 			case '$':
1444 			case LACON:
1445 				/* replace it */
1446 				newarc(nfa, EMPTY, 0, s, a->to);
1447 				freearc(nfa, a);
1448 				break;
1449 			default:
1450 				NERR(REG_ASSERT);
1451 				break;
1452 		}
1453 	}
1454 }
1455 
1456 /*
1457  * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
1458  */
1459 static void
cleartraverse(struct nfa * nfa,struct state * s)1460 cleartraverse(struct nfa *nfa,
1461 			  struct state *s)
1462 {
1463 	struct arc *a;
1464 
1465 	/* Since this is recursive, it could be driven to stack overflow */
1466 	if (STACK_TOO_DEEP(nfa->v->re))
1467 	{
1468 		NERR(REG_ETOOBIG);
1469 		return;
1470 	}
1471 
1472 	if (s->tmp == NULL)
1473 		return;
1474 	s->tmp = NULL;
1475 
1476 	for (a = s->outs; a != NULL; a = a->outchain)
1477 		cleartraverse(nfa, a->to);
1478 }
1479 
1480 /*
1481  * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
1482  *
1483  * If traversing from s1 to s2 requires a single PLAIN match (possibly of any
1484  * of a set of colors), return a state whose outarc list contains only PLAIN
1485  * arcs of those color(s).  Otherwise return NULL.
1486  *
1487  * This is used before optimizing the NFA, so there may be EMPTY arcs, which
1488  * we should ignore; the possibility of an EMPTY is why the result state could
1489  * be different from s1.
1490  *
1491  * It's worth troubling to handle multiple parallel PLAIN arcs here because a
1492  * bracket construct such as [abc] might yield either one or several parallel
1493  * PLAIN arcs depending on earlier atoms in the expression.  We'd rather that
1494  * that implementation detail not create user-visible performance differences.
1495  */
1496 static struct state *
single_color_transition(struct state * s1,struct state * s2)1497 single_color_transition(struct state *s1, struct state *s2)
1498 {
1499 	struct arc *a;
1500 
1501 	/* Ignore leading EMPTY arc, if any */
1502 	if (s1->nouts == 1 && s1->outs->type == EMPTY)
1503 		s1 = s1->outs->to;
1504 	/* Likewise for any trailing EMPTY arc */
1505 	if (s2->nins == 1 && s2->ins->type == EMPTY)
1506 		s2 = s2->ins->from;
1507 	/* Perhaps we could have a single-state loop in between, if so reject */
1508 	if (s1 == s2)
1509 		return NULL;
1510 	/* s1 must have at least one outarc... */
1511 	if (s1->outs == NULL)
1512 		return NULL;
1513 	/* ... and they must all be PLAIN arcs to s2 */
1514 	for (a = s1->outs; a != NULL; a = a->outchain)
1515 	{
1516 		if (a->type != PLAIN || a->to != s2)
1517 			return NULL;
1518 	}
1519 	/* OK, return s1 as the possessor of the relevant outarcs */
1520 	return s1;
1521 }
1522 
1523 /*
1524  * specialcolors - fill in special colors for an NFA
1525  */
1526 static void
specialcolors(struct nfa * nfa)1527 specialcolors(struct nfa *nfa)
1528 {
1529 	/* false colors for BOS, BOL, EOS, EOL */
1530 	if (nfa->parent == NULL)
1531 	{
1532 		nfa->bos[0] = pseudocolor(nfa->cm);
1533 		nfa->bos[1] = pseudocolor(nfa->cm);
1534 		nfa->eos[0] = pseudocolor(nfa->cm);
1535 		nfa->eos[1] = pseudocolor(nfa->cm);
1536 	}
1537 	else
1538 	{
1539 		assert(nfa->parent->bos[0] != COLORLESS);
1540 		nfa->bos[0] = nfa->parent->bos[0];
1541 		assert(nfa->parent->bos[1] != COLORLESS);
1542 		nfa->bos[1] = nfa->parent->bos[1];
1543 		assert(nfa->parent->eos[0] != COLORLESS);
1544 		nfa->eos[0] = nfa->parent->eos[0];
1545 		assert(nfa->parent->eos[1] != COLORLESS);
1546 		nfa->eos[1] = nfa->parent->eos[1];
1547 	}
1548 }
1549 
1550 /*
1551  * optimize - optimize an NFA
1552  *
1553  * The main goal of this function is not so much "optimization" (though it
1554  * does try to get rid of useless NFA states) as reducing the NFA to a form
1555  * the regex executor can handle.  The executor, and indeed the cNFA format
1556  * that is its input, can only handle PLAIN and LACON arcs.  The output of
1557  * the regex parser also includes EMPTY (do-nothing) arcs, as well as
1558  * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here.
1559  * We first get rid of EMPTY arcs and then deal with the constraint arcs.
1560  * The hardest part of either job is to get rid of circular loops of the
1561  * target arc type.  We would have to do that in any case, though, as such a
1562  * loop would otherwise allow the executor to cycle through the loop endlessly
1563  * without making any progress in the input string.
1564  */
1565 static long						/* re_info bits */
optimize(struct nfa * nfa,FILE * f)1566 optimize(struct nfa *nfa,
1567 		 FILE *f)				/* for debug output; NULL none */
1568 {
1569 #ifdef REG_DEBUG
1570 	int			verbose = (f != NULL) ? 1 : 0;
1571 
1572 	if (verbose)
1573 		fprintf(f, "\ninitial cleanup:\n");
1574 #endif
1575 	cleanup(nfa);				/* may simplify situation */
1576 #ifdef REG_DEBUG
1577 	if (verbose)
1578 		dumpnfa(nfa, f);
1579 	if (verbose)
1580 		fprintf(f, "\nempties:\n");
1581 #endif
1582 	fixempties(nfa, f);			/* get rid of EMPTY arcs */
1583 #ifdef REG_DEBUG
1584 	if (verbose)
1585 		fprintf(f, "\nconstraints:\n");
1586 #endif
1587 	fixconstraintloops(nfa, f); /* get rid of constraint loops */
1588 	pullback(nfa, f);			/* pull back constraints backward */
1589 	pushfwd(nfa, f);			/* push fwd constraints forward */
1590 #ifdef REG_DEBUG
1591 	if (verbose)
1592 		fprintf(f, "\nfinal cleanup:\n");
1593 #endif
1594 	cleanup(nfa);				/* final tidying */
1595 #ifdef REG_DEBUG
1596 	if (verbose)
1597 		dumpnfa(nfa, f);
1598 #endif
1599 	return analyze(nfa);		/* and analysis */
1600 }
1601 
1602 /*
1603  * pullback - pull back constraints backward to eliminate them
1604  */
1605 static void
pullback(struct nfa * nfa,FILE * f)1606 pullback(struct nfa *nfa,
1607 		 FILE *f)				/* for debug output; NULL none */
1608 {
1609 	struct state *s;
1610 	struct state *nexts;
1611 	struct arc *a;
1612 	struct arc *nexta;
1613 	struct state *intermediates;
1614 	int			progress;
1615 
1616 	/* find and pull until there are no more */
1617 	do
1618 	{
1619 		progress = 0;
1620 		for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1621 		{
1622 			nexts = s->next;
1623 			intermediates = NULL;
1624 			for (a = s->outs; a != NULL && !NISERR(); a = nexta)
1625 			{
1626 				nexta = a->outchain;
1627 				if (a->type == '^' || a->type == BEHIND)
1628 					if (pull(nfa, a, &intermediates))
1629 						progress = 1;
1630 			}
1631 			/* clear tmp fields of intermediate states created here */
1632 			while (intermediates != NULL)
1633 			{
1634 				struct state *ns = intermediates->tmp;
1635 
1636 				intermediates->tmp = NULL;
1637 				intermediates = ns;
1638 			}
1639 			/* if s is now useless, get rid of it */
1640 			if ((s->nins == 0 || s->nouts == 0) && !s->flag)
1641 				dropstate(nfa, s);
1642 		}
1643 		if (progress && f != NULL)
1644 			dumpnfa(nfa, f);
1645 	} while (progress && !NISERR());
1646 	if (NISERR())
1647 		return;
1648 
1649 	/*
1650 	 * Any ^ constraints we were able to pull to the start state can now be
1651 	 * replaced by PLAIN arcs referencing the BOS or BOL colors.  There should
1652 	 * be no other ^ or BEHIND arcs left in the NFA, though we do not check
1653 	 * that here (compact() will fail if so).
1654 	 */
1655 	for (a = nfa->pre->outs; a != NULL; a = nexta)
1656 	{
1657 		nexta = a->outchain;
1658 		if (a->type == '^')
1659 		{
1660 			assert(a->co == 0 || a->co == 1);
1661 			newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
1662 			freearc(nfa, a);
1663 		}
1664 	}
1665 }
1666 
1667 /*
1668  * pull - pull a back constraint backward past its source state
1669  *
1670  * Returns 1 if successful (which it always is unless the source is the
1671  * start state or we have an internal error), 0 if nothing happened.
1672  *
1673  * A significant property of this function is that it deletes no pre-existing
1674  * states, and no outarcs of the constraint's from state other than the given
1675  * constraint arc.  This makes the loops in pullback() safe, at the cost that
1676  * we may leave useless states behind.  Therefore, we leave it to pullback()
1677  * to delete such states.
1678  *
1679  * If the from state has multiple back-constraint outarcs, and/or multiple
1680  * compatible constraint inarcs, we only need to create one new intermediate
1681  * state per combination of predecessor and successor states.  *intermediates
1682  * points to a list of such intermediate states for this from state (chained
1683  * through their tmp fields).
1684  */
1685 static int
pull(struct nfa * nfa,struct arc * con,struct state ** intermediates)1686 pull(struct nfa *nfa,
1687 	 struct arc *con,
1688 	 struct state **intermediates)
1689 {
1690 	struct state *from = con->from;
1691 	struct state *to = con->to;
1692 	struct arc *a;
1693 	struct arc *nexta;
1694 	struct state *s;
1695 
1696 	assert(from != to);			/* should have gotten rid of this earlier */
1697 	if (from->flag)				/* can't pull back beyond start */
1698 		return 0;
1699 	if (from->nins == 0)
1700 	{							/* unreachable */
1701 		freearc(nfa, con);
1702 		return 1;
1703 	}
1704 
1705 	/*
1706 	 * First, clone from state if necessary to avoid other outarcs.  This may
1707 	 * seem wasteful, but it simplifies the logic, and we'll get rid of the
1708 	 * clone state again at the bottom.
1709 	 */
1710 	if (from->nouts > 1)
1711 	{
1712 		s = newstate(nfa);
1713 		if (NISERR())
1714 			return 0;
1715 		copyins(nfa, from, s);	/* duplicate inarcs */
1716 		cparc(nfa, con, s, to); /* move constraint arc */
1717 		freearc(nfa, con);
1718 		if (NISERR())
1719 			return 0;
1720 		from = s;
1721 		con = from->outs;
1722 	}
1723 	assert(from->nouts == 1);
1724 
1725 	/* propagate the constraint into the from state's inarcs */
1726 	for (a = from->ins; a != NULL && !NISERR(); a = nexta)
1727 	{
1728 		nexta = a->inchain;
1729 		switch (combine(nfa, con, a))
1730 		{
1731 			case INCOMPATIBLE:	/* destroy the arc */
1732 				freearc(nfa, a);
1733 				break;
1734 			case SATISFIED:		/* no action needed */
1735 				break;
1736 			case COMPATIBLE:	/* swap the two arcs, more or less */
1737 				/* need an intermediate state, but might have one already */
1738 				for (s = *intermediates; s != NULL; s = s->tmp)
1739 				{
1740 					assert(s->nins > 0 && s->nouts > 0);
1741 					if (s->ins->from == a->from && s->outs->to == to)
1742 						break;
1743 				}
1744 				if (s == NULL)
1745 				{
1746 					s = newstate(nfa);
1747 					if (NISERR())
1748 						return 0;
1749 					s->tmp = *intermediates;
1750 					*intermediates = s;
1751 				}
1752 				cparc(nfa, con, a->from, s);
1753 				cparc(nfa, a, s, to);
1754 				freearc(nfa, a);
1755 				break;
1756 			case REPLACEARC:	/* replace arc's color */
1757 				newarc(nfa, a->type, con->co, a->from, to);
1758 				freearc(nfa, a);
1759 				break;
1760 			default:
1761 				assert(NOTREACHED);
1762 				break;
1763 		}
1764 	}
1765 
1766 	/* remaining inarcs, if any, incorporate the constraint */
1767 	moveins(nfa, from, to);
1768 	freearc(nfa, con);
1769 	/* from state is now useless, but we leave it to pullback() to clean up */
1770 	return 1;
1771 }
1772 
1773 /*
1774  * pushfwd - push forward constraints forward to eliminate them
1775  */
1776 static void
pushfwd(struct nfa * nfa,FILE * f)1777 pushfwd(struct nfa *nfa,
1778 		FILE *f)				/* for debug output; NULL none */
1779 {
1780 	struct state *s;
1781 	struct state *nexts;
1782 	struct arc *a;
1783 	struct arc *nexta;
1784 	struct state *intermediates;
1785 	int			progress;
1786 
1787 	/* find and push until there are no more */
1788 	do
1789 	{
1790 		progress = 0;
1791 		for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1792 		{
1793 			nexts = s->next;
1794 			intermediates = NULL;
1795 			for (a = s->ins; a != NULL && !NISERR(); a = nexta)
1796 			{
1797 				nexta = a->inchain;
1798 				if (a->type == '$' || a->type == AHEAD)
1799 					if (push(nfa, a, &intermediates))
1800 						progress = 1;
1801 			}
1802 			/* clear tmp fields of intermediate states created here */
1803 			while (intermediates != NULL)
1804 			{
1805 				struct state *ns = intermediates->tmp;
1806 
1807 				intermediates->tmp = NULL;
1808 				intermediates = ns;
1809 			}
1810 			/* if s is now useless, get rid of it */
1811 			if ((s->nins == 0 || s->nouts == 0) && !s->flag)
1812 				dropstate(nfa, s);
1813 		}
1814 		if (progress && f != NULL)
1815 			dumpnfa(nfa, f);
1816 	} while (progress && !NISERR());
1817 	if (NISERR())
1818 		return;
1819 
1820 	/*
1821 	 * Any $ constraints we were able to push to the post state can now be
1822 	 * replaced by PLAIN arcs referencing the EOS or EOL colors.  There should
1823 	 * be no other $ or AHEAD arcs left in the NFA, though we do not check
1824 	 * that here (compact() will fail if so).
1825 	 */
1826 	for (a = nfa->post->ins; a != NULL; a = nexta)
1827 	{
1828 		nexta = a->inchain;
1829 		if (a->type == '$')
1830 		{
1831 			assert(a->co == 0 || a->co == 1);
1832 			newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
1833 			freearc(nfa, a);
1834 		}
1835 	}
1836 }
1837 
1838 /*
1839  * push - push a forward constraint forward past its destination state
1840  *
1841  * Returns 1 if successful (which it always is unless the destination is the
1842  * post state or we have an internal error), 0 if nothing happened.
1843  *
1844  * A significant property of this function is that it deletes no pre-existing
1845  * states, and no inarcs of the constraint's to state other than the given
1846  * constraint arc.  This makes the loops in pushfwd() safe, at the cost that
1847  * we may leave useless states behind.  Therefore, we leave it to pushfwd()
1848  * to delete such states.
1849  *
1850  * If the to state has multiple forward-constraint inarcs, and/or multiple
1851  * compatible constraint outarcs, we only need to create one new intermediate
1852  * state per combination of predecessor and successor states.  *intermediates
1853  * points to a list of such intermediate states for this to state (chained
1854  * through their tmp fields).
1855  */
1856 static int
push(struct nfa * nfa,struct arc * con,struct state ** intermediates)1857 push(struct nfa *nfa,
1858 	 struct arc *con,
1859 	 struct state **intermediates)
1860 {
1861 	struct state *from = con->from;
1862 	struct state *to = con->to;
1863 	struct arc *a;
1864 	struct arc *nexta;
1865 	struct state *s;
1866 
1867 	assert(to != from);			/* should have gotten rid of this earlier */
1868 	if (to->flag)				/* can't push forward beyond end */
1869 		return 0;
1870 	if (to->nouts == 0)
1871 	{							/* dead end */
1872 		freearc(nfa, con);
1873 		return 1;
1874 	}
1875 
1876 	/*
1877 	 * First, clone to state if necessary to avoid other inarcs.  This may
1878 	 * seem wasteful, but it simplifies the logic, and we'll get rid of the
1879 	 * clone state again at the bottom.
1880 	 */
1881 	if (to->nins > 1)
1882 	{
1883 		s = newstate(nfa);
1884 		if (NISERR())
1885 			return 0;
1886 		copyouts(nfa, to, s);	/* duplicate outarcs */
1887 		cparc(nfa, con, from, s);	/* move constraint arc */
1888 		freearc(nfa, con);
1889 		if (NISERR())
1890 			return 0;
1891 		to = s;
1892 		con = to->ins;
1893 	}
1894 	assert(to->nins == 1);
1895 
1896 	/* propagate the constraint into the to state's outarcs */
1897 	for (a = to->outs; a != NULL && !NISERR(); a = nexta)
1898 	{
1899 		nexta = a->outchain;
1900 		switch (combine(nfa, con, a))
1901 		{
1902 			case INCOMPATIBLE:	/* destroy the arc */
1903 				freearc(nfa, a);
1904 				break;
1905 			case SATISFIED:		/* no action needed */
1906 				break;
1907 			case COMPATIBLE:	/* swap the two arcs, more or less */
1908 				/* need an intermediate state, but might have one already */
1909 				for (s = *intermediates; s != NULL; s = s->tmp)
1910 				{
1911 					assert(s->nins > 0 && s->nouts > 0);
1912 					if (s->ins->from == from && s->outs->to == a->to)
1913 						break;
1914 				}
1915 				if (s == NULL)
1916 				{
1917 					s = newstate(nfa);
1918 					if (NISERR())
1919 						return 0;
1920 					s->tmp = *intermediates;
1921 					*intermediates = s;
1922 				}
1923 				cparc(nfa, con, s, a->to);
1924 				cparc(nfa, a, from, s);
1925 				freearc(nfa, a);
1926 				break;
1927 			case REPLACEARC:	/* replace arc's color */
1928 				newarc(nfa, a->type, con->co, from, a->to);
1929 				freearc(nfa, a);
1930 				break;
1931 			default:
1932 				assert(NOTREACHED);
1933 				break;
1934 		}
1935 	}
1936 
1937 	/* remaining outarcs, if any, incorporate the constraint */
1938 	moveouts(nfa, to, from);
1939 	freearc(nfa, con);
1940 	/* to state is now useless, but we leave it to pushfwd() to clean up */
1941 	return 1;
1942 }
1943 
1944 /*
1945  * combine - constraint lands on an arc, what happens?
1946  *
1947  * #def INCOMPATIBLE	1	// destroys arc
1948  * #def SATISFIED		2	// constraint satisfied
1949  * #def COMPATIBLE		3	// compatible but not satisfied yet
1950  * #def REPLACEARC		4	// replace arc's color with constraint color
1951  */
1952 static int
combine(struct nfa * nfa,struct arc * con,struct arc * a)1953 combine(struct nfa *nfa,
1954 		struct arc *con,
1955 		struct arc *a)
1956 {
1957 #define  CA(ct,at)	 (((ct)<<CHAR_BIT) | (at))
1958 
1959 	switch (CA(con->type, a->type))
1960 	{
1961 		case CA('^', PLAIN):	/* newlines are handled separately */
1962 		case CA('$', PLAIN):
1963 			return INCOMPATIBLE;
1964 			break;
1965 		case CA(AHEAD, PLAIN):	/* color constraints meet colors */
1966 		case CA(BEHIND, PLAIN):
1967 			if (con->co == a->co)
1968 				return SATISFIED;
1969 			if (con->co == RAINBOW)
1970 			{
1971 				/* con is satisfied unless arc's color is a pseudocolor */
1972 				if (!(nfa->cm->cd[a->co].flags & PSEUDO))
1973 					return SATISFIED;
1974 			}
1975 			else if (a->co == RAINBOW)
1976 			{
1977 				/* con is incompatible if it's for a pseudocolor */
1978 				if (nfa->cm->cd[con->co].flags & PSEUDO)
1979 					return INCOMPATIBLE;
1980 				/* otherwise, constraint constrains arc to be only its color */
1981 				return REPLACEARC;
1982 			}
1983 			return INCOMPATIBLE;
1984 			break;
1985 		case CA('^', '^'):		/* collision, similar constraints */
1986 		case CA('$', '$'):
1987 			if (con->co == a->co)	/* true duplication */
1988 				return SATISFIED;
1989 			return INCOMPATIBLE;
1990 			break;
1991 		case CA(AHEAD, AHEAD):	/* collision, similar constraints */
1992 		case CA(BEHIND, BEHIND):
1993 			if (con->co == a->co)	/* true duplication */
1994 				return SATISFIED;
1995 			if (con->co == RAINBOW)
1996 			{
1997 				/* con is satisfied unless arc's color is a pseudocolor */
1998 				if (!(nfa->cm->cd[a->co].flags & PSEUDO))
1999 					return SATISFIED;
2000 			}
2001 			else if (a->co == RAINBOW)
2002 			{
2003 				/* con is incompatible if it's for a pseudocolor */
2004 				if (nfa->cm->cd[con->co].flags & PSEUDO)
2005 					return INCOMPATIBLE;
2006 				/* otherwise, constraint constrains arc to be only its color */
2007 				return REPLACEARC;
2008 			}
2009 			return INCOMPATIBLE;
2010 			break;
2011 		case CA('^', BEHIND):	/* collision, dissimilar constraints */
2012 		case CA(BEHIND, '^'):
2013 		case CA('$', AHEAD):
2014 		case CA(AHEAD, '$'):
2015 			return INCOMPATIBLE;
2016 			break;
2017 		case CA('^', '$'):		/* constraints passing each other */
2018 		case CA('^', AHEAD):
2019 		case CA(BEHIND, '$'):
2020 		case CA(BEHIND, AHEAD):
2021 		case CA('$', '^'):
2022 		case CA('$', BEHIND):
2023 		case CA(AHEAD, '^'):
2024 		case CA(AHEAD, BEHIND):
2025 		case CA('^', LACON):
2026 		case CA(BEHIND, LACON):
2027 		case CA('$', LACON):
2028 		case CA(AHEAD, LACON):
2029 			return COMPATIBLE;
2030 			break;
2031 	}
2032 	assert(NOTREACHED);
2033 	return INCOMPATIBLE;		/* for benefit of blind compilers */
2034 }
2035 
2036 /*
2037  * fixempties - get rid of EMPTY arcs
2038  */
2039 static void
fixempties(struct nfa * nfa,FILE * f)2040 fixempties(struct nfa *nfa,
2041 		   FILE *f)				/* for debug output; NULL none */
2042 {
2043 	struct state *s;
2044 	struct state *s2;
2045 	struct state *nexts;
2046 	struct arc *a;
2047 	struct arc *nexta;
2048 	int			totalinarcs;
2049 	struct arc **inarcsorig;
2050 	struct arc **arcarray;
2051 	int			arccount;
2052 	int			prevnins;
2053 	int			nskip;
2054 
2055 	/*
2056 	 * First, get rid of any states whose sole out-arc is an EMPTY, since
2057 	 * they're basically just aliases for their successor.  The parsing
2058 	 * algorithm creates enough of these that it's worth special-casing this.
2059 	 */
2060 	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2061 	{
2062 		nexts = s->next;
2063 		if (s->flag || s->nouts != 1)
2064 			continue;
2065 		a = s->outs;
2066 		assert(a != NULL && a->outchain == NULL);
2067 		if (a->type != EMPTY)
2068 			continue;
2069 		if (s != a->to)
2070 			moveins(nfa, s, a->to);
2071 		dropstate(nfa, s);
2072 	}
2073 
2074 	/*
2075 	 * Similarly, get rid of any state with a single EMPTY in-arc, by folding
2076 	 * it into its predecessor.
2077 	 */
2078 	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2079 	{
2080 		nexts = s->next;
2081 		/* while we're at it, ensure tmp fields are clear for next step */
2082 		assert(s->tmp == NULL);
2083 		if (s->flag || s->nins != 1)
2084 			continue;
2085 		a = s->ins;
2086 		assert(a != NULL && a->inchain == NULL);
2087 		if (a->type != EMPTY)
2088 			continue;
2089 		if (s != a->from)
2090 			moveouts(nfa, s, a->from);
2091 		dropstate(nfa, s);
2092 	}
2093 
2094 	if (NISERR())
2095 		return;
2096 
2097 	/*
2098 	 * For each remaining NFA state, find all other states from which it is
2099 	 * reachable by a chain of one or more EMPTY arcs.  Then generate new arcs
2100 	 * that eliminate the need for each such chain.
2101 	 *
2102 	 * We could replace a chain of EMPTY arcs that leads from a "from" state
2103 	 * to a "to" state either by pushing non-EMPTY arcs forward (linking
2104 	 * directly from "from"'s predecessors to "to") or by pulling them back
2105 	 * (linking directly from "from" to "to"'s successors).  We choose to
2106 	 * always do the former; this choice is somewhat arbitrary, but the
2107 	 * approach below requires that we uniformly do one or the other.
2108 	 *
2109 	 * Suppose we have a chain of N successive EMPTY arcs (where N can easily
2110 	 * approach the size of the NFA).  All of the intermediate states must
2111 	 * have additional inarcs and outarcs, else they'd have been removed by
2112 	 * the steps above.  Assuming their inarcs are mostly not empties, we will
2113 	 * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
2114 	 * state in the chain must be duplicated to lead to all its successor
2115 	 * states as well.  So there is no hope of doing less than O(N^2) work;
2116 	 * however, we should endeavor to keep the big-O cost from being even
2117 	 * worse than that, which it can easily become without care.  In
2118 	 * particular, suppose we were to copy all S1's inarcs forward to S2, and
2119 	 * then also to S3, and then later we consider pushing S2's inarcs forward
2120 	 * to S3.  If we include the arcs already copied from S1 in that, we'd be
2121 	 * doing O(N^3) work.  (The duplicate-arc elimination built into newarc()
2122 	 * and its cohorts would get rid of the extra arcs, but not without cost.)
2123 	 *
2124 	 * We can avoid this cost by treating only arcs that existed at the start
2125 	 * of this phase as candidates to be pushed forward.  To identify those,
2126 	 * we remember the first inarc each state had to start with.  We rely on
2127 	 * the fact that newarc() and friends put new arcs on the front of their
2128 	 * to-states' inchains, and that this phase never deletes arcs, so that
2129 	 * the original arcs must be the last arcs in their to-states' inchains.
2130 	 *
2131 	 * So the process here is that, for each state in the NFA, we gather up
2132 	 * all non-EMPTY inarcs of states that can reach the target state via
2133 	 * EMPTY arcs.  We then sort, de-duplicate, and merge these arcs into the
2134 	 * target state's inchain.  (We can safely use sort-merge for this as long
2135 	 * as we update each state's original-arcs pointer after we add arcs to
2136 	 * it; the sort step of mergeins probably changed the order of the old
2137 	 * arcs.)
2138 	 *
2139 	 * Another refinement worth making is that, because we only add non-EMPTY
2140 	 * arcs during this phase, and all added arcs have the same from-state as
2141 	 * the non-EMPTY arc they were cloned from, we know ahead of time that any
2142 	 * states having only EMPTY outarcs will be useless for lack of outarcs
2143 	 * after we drop the EMPTY arcs.  (They cannot gain non-EMPTY outarcs if
2144 	 * they had none to start with.)  So we need not bother to update the
2145 	 * inchains of such states at all.
2146 	 */
2147 
2148 	/* Remember the states' first original inarcs */
2149 	/* ... and while at it, count how many old inarcs there are altogether */
2150 	inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
2151 	if (inarcsorig == NULL)
2152 	{
2153 		NERR(REG_ESPACE);
2154 		return;
2155 	}
2156 	totalinarcs = 0;
2157 	for (s = nfa->states; s != NULL; s = s->next)
2158 	{
2159 		inarcsorig[s->no] = s->ins;
2160 		totalinarcs += s->nins;
2161 	}
2162 
2163 	/*
2164 	 * Create a workspace for accumulating the inarcs to be added to the
2165 	 * current target state.  totalinarcs is probably a considerable
2166 	 * overestimate of the space needed, but the NFA is unlikely to be large
2167 	 * enough at this point to make it worth being smarter.
2168 	 */
2169 	arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
2170 	if (arcarray == NULL)
2171 	{
2172 		NERR(REG_ESPACE);
2173 		FREE(inarcsorig);
2174 		return;
2175 	}
2176 
2177 	/* And iterate over the target states */
2178 	for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
2179 	{
2180 		/* Ignore target states without non-EMPTY outarcs, per note above */
2181 		if (!s->flag && !hasnonemptyout(s))
2182 			continue;
2183 
2184 		/* Find predecessor states and accumulate their original inarcs */
2185 		arccount = 0;
2186 		for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts)
2187 		{
2188 			/* Add s2's original inarcs to arcarray[], but ignore empties */
2189 			for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain)
2190 			{
2191 				if (a->type != EMPTY)
2192 					arcarray[arccount++] = a;
2193 			}
2194 
2195 			/* Reset the tmp fields as we walk back */
2196 			nexts = s2->tmp;
2197 			s2->tmp = NULL;
2198 		}
2199 		s->tmp = NULL;
2200 		assert(arccount <= totalinarcs);
2201 
2202 		/* Remember how many original inarcs this state has */
2203 		prevnins = s->nins;
2204 
2205 		/* Add non-duplicate inarcs to target state */
2206 		mergeins(nfa, s, arcarray, arccount);
2207 
2208 		/* Now we must update the state's inarcsorig pointer */
2209 		nskip = s->nins - prevnins;
2210 		a = s->ins;
2211 		while (nskip-- > 0)
2212 			a = a->inchain;
2213 		inarcsorig[s->no] = a;
2214 	}
2215 
2216 	FREE(arcarray);
2217 	FREE(inarcsorig);
2218 
2219 	if (NISERR())
2220 		return;
2221 
2222 	/*
2223 	 * Now remove all the EMPTY arcs, since we don't need them anymore.
2224 	 */
2225 	for (s = nfa->states; s != NULL; s = s->next)
2226 	{
2227 		for (a = s->outs; a != NULL; a = nexta)
2228 		{
2229 			nexta = a->outchain;
2230 			if (a->type == EMPTY)
2231 				freearc(nfa, a);
2232 		}
2233 	}
2234 
2235 	/*
2236 	 * And remove any states that have become useless.  (This cleanup is not
2237 	 * very thorough, and would be even less so if we tried to combine it with
2238 	 * the previous step; but cleanup() will take care of anything we miss.)
2239 	 */
2240 	for (s = nfa->states; s != NULL; s = nexts)
2241 	{
2242 		nexts = s->next;
2243 		if ((s->nins == 0 || s->nouts == 0) && !s->flag)
2244 			dropstate(nfa, s);
2245 	}
2246 
2247 	if (f != NULL)
2248 		dumpnfa(nfa, f);
2249 }
2250 
2251 /*
2252  * emptyreachable - recursively find all states that can reach s by EMPTY arcs
2253  *
2254  * The return value is the last such state found.  Its tmp field links back
2255  * to the next-to-last such state, and so on back to s, so that all these
2256  * states can be located without searching the whole NFA.
2257  *
2258  * Since this is only used in fixempties(), we pass in the inarcsorig[] array
2259  * maintained by that function.  This lets us skip over all new inarcs, which
2260  * are certainly not EMPTY arcs.
2261  *
2262  * The maximum recursion depth here is equal to the length of the longest
2263  * loop-free chain of EMPTY arcs, which is surely no more than the size of
2264  * the NFA ... but that could still be enough to cause trouble.
2265  */
2266 static struct state *
emptyreachable(struct nfa * nfa,struct state * s,struct state * lastfound,struct arc ** inarcsorig)2267 emptyreachable(struct nfa *nfa,
2268 			   struct state *s,
2269 			   struct state *lastfound,
2270 			   struct arc **inarcsorig)
2271 {
2272 	struct arc *a;
2273 
2274 	/* Since this is recursive, it could be driven to stack overflow */
2275 	if (STACK_TOO_DEEP(nfa->v->re))
2276 	{
2277 		NERR(REG_ETOOBIG);
2278 		return lastfound;
2279 	}
2280 
2281 	s->tmp = lastfound;
2282 	lastfound = s;
2283 	for (a = inarcsorig[s->no]; a != NULL; a = a->inchain)
2284 	{
2285 		if (a->type == EMPTY && a->from->tmp == NULL)
2286 			lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig);
2287 	}
2288 	return lastfound;
2289 }
2290 
2291 /*
2292  * isconstraintarc - detect whether an arc is of a constraint type
2293  */
2294 static inline int
isconstraintarc(struct arc * a)2295 isconstraintarc(struct arc *a)
2296 {
2297 	switch (a->type)
2298 	{
2299 		case '^':
2300 		case '$':
2301 		case BEHIND:
2302 		case AHEAD:
2303 		case LACON:
2304 			return 1;
2305 	}
2306 	return 0;
2307 }
2308 
2309 /*
2310  * hasconstraintout - does state have a constraint out arc?
2311  */
2312 static int
hasconstraintout(struct state * s)2313 hasconstraintout(struct state *s)
2314 {
2315 	struct arc *a;
2316 
2317 	for (a = s->outs; a != NULL; a = a->outchain)
2318 	{
2319 		if (isconstraintarc(a))
2320 			return 1;
2321 	}
2322 	return 0;
2323 }
2324 
2325 /*
2326  * fixconstraintloops - get rid of loops containing only constraint arcs
2327  *
2328  * A loop of states that contains only constraint arcs is useless, since
2329  * passing around the loop represents no forward progress.  Moreover, it
2330  * would cause infinite looping in pullback/pushfwd, so we need to get rid
2331  * of such loops before doing that.
2332  */
2333 static void
fixconstraintloops(struct nfa * nfa,FILE * f)2334 fixconstraintloops(struct nfa *nfa,
2335 				   FILE *f)		/* for debug output; NULL none */
2336 {
2337 	struct state *s;
2338 	struct state *nexts;
2339 	struct arc *a;
2340 	struct arc *nexta;
2341 	int			hasconstraints;
2342 
2343 	/*
2344 	 * In the trivial case of a state that loops to itself, we can just drop
2345 	 * the constraint arc altogether.  This is worth special-casing because
2346 	 * such loops are far more common than loops containing multiple states.
2347 	 * While we're at it, note whether any constraint arcs survive.
2348 	 */
2349 	hasconstraints = 0;
2350 	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2351 	{
2352 		nexts = s->next;
2353 		/* while we're at it, ensure tmp fields are clear for next step */
2354 		assert(s->tmp == NULL);
2355 		for (a = s->outs; a != NULL && !NISERR(); a = nexta)
2356 		{
2357 			nexta = a->outchain;
2358 			if (isconstraintarc(a))
2359 			{
2360 				if (a->to == s)
2361 					freearc(nfa, a);
2362 				else
2363 					hasconstraints = 1;
2364 			}
2365 		}
2366 		/* If we removed all the outarcs, the state is useless. */
2367 		if (s->nouts == 0 && !s->flag)
2368 			dropstate(nfa, s);
2369 	}
2370 
2371 	/* Nothing to do if no remaining constraint arcs */
2372 	if (NISERR() || !hasconstraints)
2373 		return;
2374 
2375 	/*
2376 	 * Starting from each remaining NFA state, search outwards for a
2377 	 * constraint loop.  If we find a loop, break the loop, then start the
2378 	 * search over.  (We could possibly retain some state from the first scan,
2379 	 * but it would complicate things greatly, and multi-state constraint
2380 	 * loops are rare enough that it's not worth optimizing the case.)
2381 	 */
2382 restart:
2383 	for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
2384 	{
2385 		if (findconstraintloop(nfa, s))
2386 			goto restart;
2387 	}
2388 
2389 	if (NISERR())
2390 		return;
2391 
2392 	/*
2393 	 * Now remove any states that have become useless.  (This cleanup is not
2394 	 * very thorough, and would be even less so if we tried to combine it with
2395 	 * the previous step; but cleanup() will take care of anything we miss.)
2396 	 *
2397 	 * Because findconstraintloop intentionally doesn't reset all tmp fields,
2398 	 * we have to clear them after it's done.  This is a convenient place to
2399 	 * do that, too.
2400 	 */
2401 	for (s = nfa->states; s != NULL; s = nexts)
2402 	{
2403 		nexts = s->next;
2404 		s->tmp = NULL;
2405 		if ((s->nins == 0 || s->nouts == 0) && !s->flag)
2406 			dropstate(nfa, s);
2407 	}
2408 
2409 	if (f != NULL)
2410 		dumpnfa(nfa, f);
2411 }
2412 
2413 /*
2414  * findconstraintloop - recursively find a loop of constraint arcs
2415  *
2416  * If we find a loop, break it by calling breakconstraintloop(), then
2417  * return 1; otherwise return 0.
2418  *
2419  * State tmp fields are guaranteed all NULL on a success return, because
2420  * breakconstraintloop does that.  After a failure return, any state that
2421  * is known not to be part of a loop is marked with s->tmp == s; this allows
2422  * us not to have to re-prove that fact on later calls.  (This convention is
2423  * workable because we already eliminated single-state loops.)
2424  *
2425  * Note that the found loop doesn't necessarily include the first state we
2426  * are called on.  Any loop reachable from that state will do.
2427  *
2428  * The maximum recursion depth here is one more than the length of the longest
2429  * loop-free chain of constraint arcs, which is surely no more than the size
2430  * of the NFA ... but that could still be enough to cause trouble.
2431  */
2432 static int
findconstraintloop(struct nfa * nfa,struct state * s)2433 findconstraintloop(struct nfa *nfa, struct state *s)
2434 {
2435 	struct arc *a;
2436 
2437 	/* Since this is recursive, it could be driven to stack overflow */
2438 	if (STACK_TOO_DEEP(nfa->v->re))
2439 	{
2440 		NERR(REG_ETOOBIG);
2441 		return 1;				/* to exit as quickly as possible */
2442 	}
2443 
2444 	if (s->tmp != NULL)
2445 	{
2446 		/* Already proven uninteresting? */
2447 		if (s->tmp == s)
2448 			return 0;
2449 		/* Found a loop involving s */
2450 		breakconstraintloop(nfa, s);
2451 		/* The tmp fields have been cleaned up by breakconstraintloop */
2452 		return 1;
2453 	}
2454 	for (a = s->outs; a != NULL; a = a->outchain)
2455 	{
2456 		if (isconstraintarc(a))
2457 		{
2458 			struct state *sto = a->to;
2459 
2460 			assert(sto != s);
2461 			s->tmp = sto;
2462 			if (findconstraintloop(nfa, sto))
2463 				return 1;
2464 		}
2465 	}
2466 
2467 	/*
2468 	 * If we get here, no constraint loop exists leading out from s.  Mark it
2469 	 * with s->tmp == s so we need not rediscover that fact again later.
2470 	 */
2471 	s->tmp = s;
2472 	return 0;
2473 }
2474 
2475 /*
2476  * breakconstraintloop - break a loop of constraint arcs
2477  *
2478  * sinitial is any one member state of the loop.  Each loop member's tmp
2479  * field links to its successor within the loop.  (Note that this function
2480  * will reset all the tmp fields to NULL.)
2481  *
2482  * We can break the loop by, for any one state S1 in the loop, cloning its
2483  * loop successor state S2 (and possibly following states), and then moving
2484  * all S1->S2 constraint arcs to point to the cloned S2.  The cloned S2 should
2485  * copy any non-constraint outarcs of S2.  Constraint outarcs should be
2486  * dropped if they point back to S1, else they need to be copied as arcs to
2487  * similarly cloned states S3, S4, etc.  In general, each cloned state copies
2488  * non-constraint outarcs, drops constraint outarcs that would lead to itself
2489  * or any earlier cloned state, and sends other constraint outarcs to newly
2490  * cloned states.  No cloned state will have any inarcs that aren't constraint
2491  * arcs or do not lead from S1 or earlier-cloned states.  It's okay to drop
2492  * constraint back-arcs since they would not take us to any state we've not
2493  * already been in; therefore, no new constraint loop is created.  In this way
2494  * we generate a modified NFA that can still represent every useful state
2495  * sequence, but not sequences that represent state loops with no consumption
2496  * of input data.  Note that the set of cloned states will certainly include
2497  * all of the loop member states other than S1, and it may also include
2498  * non-loop states that are reachable from S2 via constraint arcs.  This is
2499  * important because there is no guarantee that findconstraintloop found a
2500  * maximal loop (and searching for one would be NP-hard, so don't try).
2501  * Frequently the "non-loop states" are actually part of a larger loop that
2502  * we didn't notice, and indeed there may be several overlapping loops.
2503  * This technique ensures convergence in such cases, while considering only
2504  * the originally-found loop does not.
2505  *
2506  * If there is only one S1->S2 constraint arc, then that constraint is
2507  * certainly satisfied when we enter any of the clone states.  This means that
2508  * in the common case where many of the constraint arcs are identically
2509  * labeled, we can merge together clone states linked by a similarly-labeled
2510  * constraint: if we can get to the first one we can certainly get to the
2511  * second, so there's no need to distinguish.  This greatly reduces the number
2512  * of new states needed, so we preferentially break the given loop at a state
2513  * pair where this is true.
2514  *
2515  * Furthermore, it's fairly common to find that a cloned successor state has
2516  * no outarcs, especially if we're a bit aggressive about removing unnecessary
2517  * outarcs.  If that happens, then there is simply not any interesting state
2518  * that can be reached through the predecessor's loop arcs, which means we can
2519  * break the loop just by removing those loop arcs, with no new states added.
2520  */
2521 static void
breakconstraintloop(struct nfa * nfa,struct state * sinitial)2522 breakconstraintloop(struct nfa *nfa, struct state *sinitial)
2523 {
2524 	struct state *s;
2525 	struct state *shead;
2526 	struct state *stail;
2527 	struct state *sclone;
2528 	struct state *nexts;
2529 	struct arc *refarc;
2530 	struct arc *a;
2531 	struct arc *nexta;
2532 
2533 	/*
2534 	 * Start by identifying which loop step we want to break at.
2535 	 * Preferentially this is one with only one constraint arc.  (XXX are
2536 	 * there any other secondary heuristics we want to use here?)  Set refarc
2537 	 * to point to the selected lone constraint arc, if there is one.
2538 	 */
2539 	refarc = NULL;
2540 	s = sinitial;
2541 	do
2542 	{
2543 		nexts = s->tmp;
2544 		assert(nexts != s);		/* should not see any one-element loops */
2545 		if (refarc == NULL)
2546 		{
2547 			int			narcs = 0;
2548 
2549 			for (a = s->outs; a != NULL; a = a->outchain)
2550 			{
2551 				if (a->to == nexts && isconstraintarc(a))
2552 				{
2553 					refarc = a;
2554 					narcs++;
2555 				}
2556 			}
2557 			assert(narcs > 0);
2558 			if (narcs > 1)
2559 				refarc = NULL;	/* multiple constraint arcs here, no good */
2560 		}
2561 		s = nexts;
2562 	} while (s != sinitial);
2563 
2564 	if (refarc)
2565 	{
2566 		/* break at the refarc */
2567 		shead = refarc->from;
2568 		stail = refarc->to;
2569 		assert(stail == shead->tmp);
2570 	}
2571 	else
2572 	{
2573 		/* for lack of a better idea, break after sinitial */
2574 		shead = sinitial;
2575 		stail = sinitial->tmp;
2576 	}
2577 
2578 	/*
2579 	 * Reset the tmp fields so that we can use them for local storage in
2580 	 * clonesuccessorstates.  (findconstraintloop won't mind, since it's just
2581 	 * going to abandon its search anyway.)
2582 	 */
2583 	for (s = nfa->states; s != NULL; s = s->next)
2584 		s->tmp = NULL;
2585 
2586 	/*
2587 	 * Recursively build clone state(s) as needed.
2588 	 */
2589 	sclone = newstate(nfa);
2590 	if (sclone == NULL)
2591 	{
2592 		assert(NISERR());
2593 		return;
2594 	}
2595 
2596 	clonesuccessorstates(nfa, stail, sclone, shead, refarc,
2597 						 NULL, NULL, nfa->nstates);
2598 
2599 	if (NISERR())
2600 		return;
2601 
2602 	/*
2603 	 * It's possible that sclone has no outarcs at all, in which case it's
2604 	 * useless.  (We don't try extremely hard to get rid of useless states
2605 	 * here, but this is an easy and fairly common case.)
2606 	 */
2607 	if (sclone->nouts == 0)
2608 	{
2609 		freestate(nfa, sclone);
2610 		sclone = NULL;
2611 	}
2612 
2613 	/*
2614 	 * Move shead's constraint-loop arcs to point to sclone, or just drop them
2615 	 * if we discovered we don't need sclone.
2616 	 */
2617 	for (a = shead->outs; a != NULL; a = nexta)
2618 	{
2619 		nexta = a->outchain;
2620 		if (a->to == stail && isconstraintarc(a))
2621 		{
2622 			if (sclone)
2623 				cparc(nfa, a, shead, sclone);
2624 			freearc(nfa, a);
2625 			if (NISERR())
2626 				break;
2627 		}
2628 	}
2629 }
2630 
2631 /*
2632  * clonesuccessorstates - create a tree of constraint-arc successor states
2633  *
2634  * ssource is the state to be cloned, and sclone is the state to copy its
2635  * outarcs into.  sclone's inarcs, if any, should already be set up.
2636  *
2637  * spredecessor is the original predecessor state that we are trying to build
2638  * successors for (it may not be the immediate predecessor of ssource).
2639  * refarc, if not NULL, is the original constraint arc that is known to have
2640  * been traversed out of spredecessor to reach the successor(s).
2641  *
2642  * For each cloned successor state, we transiently create a "donemap" that is
2643  * a boolean array showing which source states we've already visited for this
2644  * clone state.  This prevents infinite recursion as well as useless repeat
2645  * visits to the same state subtree (which can add up fast, since typical NFAs
2646  * have multiple redundant arc pathways).  Each donemap is a char array
2647  * indexed by state number.  The donemaps are all of the same size "nstates",
2648  * which is nfa->nstates as of the start of the recursion.  This is enough to
2649  * have entries for all pre-existing states, but *not* entries for clone
2650  * states created during the recursion.  That's okay since we have no need to
2651  * mark those.
2652  *
2653  * curdonemap is NULL when recursing to a new sclone state, or sclone's
2654  * donemap when we are recursing without having created a new state (which we
2655  * do when we decide we can merge a successor state into the current clone
2656  * state).  outerdonemap is NULL at the top level and otherwise the parent
2657  * clone state's donemap.
2658  *
2659  * The successor states we create and fill here form a strict tree structure,
2660  * with each state having exactly one predecessor, except that the toplevel
2661  * state has no inarcs as yet (breakconstraintloop will add its inarcs from
2662  * spredecessor after we're done).  Thus, we can examine sclone's inarcs back
2663  * to the root, plus refarc if any, to identify the set of constraints already
2664  * known valid at the current point.  This allows us to avoid generating extra
2665  * successor states.
2666  */
2667 static void
clonesuccessorstates(struct nfa * nfa,struct state * ssource,struct state * sclone,struct state * spredecessor,struct arc * refarc,char * curdonemap,char * outerdonemap,int nstates)2668 clonesuccessorstates(struct nfa *nfa,
2669 					 struct state *ssource,
2670 					 struct state *sclone,
2671 					 struct state *spredecessor,
2672 					 struct arc *refarc,
2673 					 char *curdonemap,
2674 					 char *outerdonemap,
2675 					 int nstates)
2676 {
2677 	char	   *donemap;
2678 	struct arc *a;
2679 
2680 	/* Since this is recursive, it could be driven to stack overflow */
2681 	if (STACK_TOO_DEEP(nfa->v->re))
2682 	{
2683 		NERR(REG_ETOOBIG);
2684 		return;
2685 	}
2686 
2687 	/* If this state hasn't already got a donemap, create one */
2688 	donemap = curdonemap;
2689 	if (donemap == NULL)
2690 	{
2691 		donemap = (char *) MALLOC(nstates * sizeof(char));
2692 		if (donemap == NULL)
2693 		{
2694 			NERR(REG_ESPACE);
2695 			return;
2696 		}
2697 
2698 		if (outerdonemap != NULL)
2699 		{
2700 			/*
2701 			 * Not at outermost recursion level, so copy the outer level's
2702 			 * donemap; this ensures that we see states in process of being
2703 			 * visited at outer levels, or already merged into predecessor
2704 			 * states, as ones we shouldn't traverse back to.
2705 			 */
2706 			memcpy(donemap, outerdonemap, nstates * sizeof(char));
2707 		}
2708 		else
2709 		{
2710 			/* At outermost level, only spredecessor is off-limits */
2711 			memset(donemap, 0, nstates * sizeof(char));
2712 			assert(spredecessor->no < nstates);
2713 			donemap[spredecessor->no] = 1;
2714 		}
2715 	}
2716 
2717 	/* Mark ssource as visited in the donemap */
2718 	assert(ssource->no < nstates);
2719 	assert(donemap[ssource->no] == 0);
2720 	donemap[ssource->no] = 1;
2721 
2722 	/*
2723 	 * We proceed by first cloning all of ssource's outarcs, creating new
2724 	 * clone states as needed but not doing more with them than that.  Then in
2725 	 * a second pass, recurse to process the child clone states.  This allows
2726 	 * us to have only one child clone state per reachable source state, even
2727 	 * when there are multiple outarcs leading to the same state.  Also, when
2728 	 * we do visit a child state, its set of inarcs is known exactly, which
2729 	 * makes it safe to apply the constraint-is-already-checked optimization.
2730 	 * Also, this ensures that we've merged all the states we can into the
2731 	 * current clone before we recurse to any children, thus possibly saving
2732 	 * them from making extra images of those states.
2733 	 *
2734 	 * While this function runs, child clone states of the current state are
2735 	 * marked by setting their tmp fields to point to the original state they
2736 	 * were cloned from.  This makes it possible to detect multiple outarcs
2737 	 * leading to the same state, and also makes it easy to distinguish clone
2738 	 * states from original states (which will have tmp == NULL).
2739 	 */
2740 	for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain)
2741 	{
2742 		struct state *sto = a->to;
2743 
2744 		/*
2745 		 * We do not consider cloning successor states that have no constraint
2746 		 * outarcs; just link to them as-is.  They cannot be part of a
2747 		 * constraint loop so there is no need to make copies.  In particular,
2748 		 * this rule keeps us from trying to clone the post state, which would
2749 		 * be a bad idea.
2750 		 */
2751 		if (isconstraintarc(a) && hasconstraintout(sto))
2752 		{
2753 			struct state *prevclone;
2754 			int			canmerge;
2755 			struct arc *a2;
2756 
2757 			/*
2758 			 * Back-link constraint arcs must not be followed.  Nor is there a
2759 			 * need to revisit states previously merged into this clone.
2760 			 */
2761 			assert(sto->no < nstates);
2762 			if (donemap[sto->no] != 0)
2763 				continue;
2764 
2765 			/*
2766 			 * Check whether we already have a child clone state for this
2767 			 * source state.
2768 			 */
2769 			prevclone = NULL;
2770 			for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain)
2771 			{
2772 				if (a2->to->tmp == sto)
2773 				{
2774 					prevclone = a2->to;
2775 					break;
2776 				}
2777 			}
2778 
2779 			/*
2780 			 * If this arc is labeled the same as refarc, or the same as any
2781 			 * arc we must have traversed to get to sclone, then no additional
2782 			 * constraints need to be met to get to sto, so we should just
2783 			 * merge its outarcs into sclone.
2784 			 */
2785 			if (refarc && a->type == refarc->type && a->co == refarc->co)
2786 				canmerge = 1;
2787 			else
2788 			{
2789 				struct state *s;
2790 
2791 				canmerge = 0;
2792 				for (s = sclone; s->ins; s = s->ins->from)
2793 				{
2794 					if (s->nins == 1 &&
2795 						a->type == s->ins->type && a->co == s->ins->co)
2796 					{
2797 						canmerge = 1;
2798 						break;
2799 					}
2800 				}
2801 			}
2802 
2803 			if (canmerge)
2804 			{
2805 				/*
2806 				 * We can merge into sclone.  If we previously made a child
2807 				 * clone state, drop it; there's no need to visit it.  (This
2808 				 * can happen if ssource has multiple pathways to sto, and we
2809 				 * only just now found one that is provably a no-op.)
2810 				 */
2811 				if (prevclone)
2812 					dropstate(nfa, prevclone);	/* kills our outarc, too */
2813 
2814 				/* Recurse to merge sto's outarcs into sclone */
2815 				clonesuccessorstates(nfa,
2816 									 sto,
2817 									 sclone,
2818 									 spredecessor,
2819 									 refarc,
2820 									 donemap,
2821 									 outerdonemap,
2822 									 nstates);
2823 				/* sto should now be marked as previously visited */
2824 				assert(NISERR() || donemap[sto->no] == 1);
2825 			}
2826 			else if (prevclone)
2827 			{
2828 				/*
2829 				 * We already have a clone state for this successor, so just
2830 				 * make another arc to it.
2831 				 */
2832 				cparc(nfa, a, sclone, prevclone);
2833 			}
2834 			else
2835 			{
2836 				/*
2837 				 * We need to create a new successor clone state.
2838 				 */
2839 				struct state *stoclone;
2840 
2841 				stoclone = newstate(nfa);
2842 				if (stoclone == NULL)
2843 				{
2844 					assert(NISERR());
2845 					break;
2846 				}
2847 				/* Mark it as to what it's a clone of */
2848 				stoclone->tmp = sto;
2849 				/* ... and add the outarc leading to it */
2850 				cparc(nfa, a, sclone, stoclone);
2851 			}
2852 		}
2853 		else
2854 		{
2855 			/*
2856 			 * Non-constraint outarcs just get copied to sclone, as do outarcs
2857 			 * leading to states with no constraint outarc.
2858 			 */
2859 			cparc(nfa, a, sclone, sto);
2860 		}
2861 	}
2862 
2863 	/*
2864 	 * If we are at outer level for this clone state, recurse to all its child
2865 	 * clone states, clearing their tmp fields as we go.  (If we're not
2866 	 * outermost for sclone, leave this to be done by the outer call level.)
2867 	 * Note that if we have multiple outarcs leading to the same clone state,
2868 	 * it will only be recursed-to once.
2869 	 */
2870 	if (curdonemap == NULL)
2871 	{
2872 		for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain)
2873 		{
2874 			struct state *stoclone = a->to;
2875 			struct state *sto = stoclone->tmp;
2876 
2877 			if (sto != NULL)
2878 			{
2879 				stoclone->tmp = NULL;
2880 				clonesuccessorstates(nfa,
2881 									 sto,
2882 									 stoclone,
2883 									 spredecessor,
2884 									 refarc,
2885 									 NULL,
2886 									 donemap,
2887 									 nstates);
2888 			}
2889 		}
2890 
2891 		/* Don't forget to free sclone's donemap when done with it */
2892 		FREE(donemap);
2893 	}
2894 }
2895 
2896 /*
2897  * cleanup - clean up NFA after optimizations
2898  */
2899 static void
cleanup(struct nfa * nfa)2900 cleanup(struct nfa *nfa)
2901 {
2902 	struct state *s;
2903 	struct state *nexts;
2904 	int			n;
2905 
2906 	if (NISERR())
2907 		return;
2908 
2909 	/* clear out unreachable or dead-end states */
2910 	/* use pre to mark reachable, then post to mark can-reach-post */
2911 	markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
2912 	markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
2913 	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2914 	{
2915 		nexts = s->next;
2916 		if (s->tmp != nfa->post && !s->flag)
2917 			dropstate(nfa, s);
2918 	}
2919 	assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
2920 	cleartraverse(nfa, nfa->pre);
2921 	assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL);
2922 	/* the nins==0 (final unreachable) case will be caught later */
2923 
2924 	/* renumber surviving states */
2925 	n = 0;
2926 	for (s = nfa->states; s != NULL; s = s->next)
2927 		s->no = n++;
2928 	nfa->nstates = n;
2929 }
2930 
2931 /*
2932  * markreachable - recursive marking of reachable states
2933  */
2934 static void
markreachable(struct nfa * nfa,struct state * s,struct state * okay,struct state * mark)2935 markreachable(struct nfa *nfa,
2936 			  struct state *s,
2937 			  struct state *okay,	/* consider only states with this mark */
2938 			  struct state *mark)	/* the value to mark with */
2939 {
2940 	struct arc *a;
2941 
2942 	/* Since this is recursive, it could be driven to stack overflow */
2943 	if (STACK_TOO_DEEP(nfa->v->re))
2944 	{
2945 		NERR(REG_ETOOBIG);
2946 		return;
2947 	}
2948 
2949 	if (s->tmp != okay)
2950 		return;
2951 	s->tmp = mark;
2952 
2953 	for (a = s->outs; a != NULL; a = a->outchain)
2954 		markreachable(nfa, a->to, okay, mark);
2955 }
2956 
2957 /*
2958  * markcanreach - recursive marking of states which can reach here
2959  */
2960 static void
markcanreach(struct nfa * nfa,struct state * s,struct state * okay,struct state * mark)2961 markcanreach(struct nfa *nfa,
2962 			 struct state *s,
2963 			 struct state *okay,	/* consider only states with this mark */
2964 			 struct state *mark)	/* the value to mark with */
2965 {
2966 	struct arc *a;
2967 
2968 	/* Since this is recursive, it could be driven to stack overflow */
2969 	if (STACK_TOO_DEEP(nfa->v->re))
2970 	{
2971 		NERR(REG_ETOOBIG);
2972 		return;
2973 	}
2974 
2975 	if (s->tmp != okay)
2976 		return;
2977 	s->tmp = mark;
2978 
2979 	for (a = s->ins; a != NULL; a = a->inchain)
2980 		markcanreach(nfa, a->from, okay, mark);
2981 }
2982 
2983 /*
2984  * analyze - ascertain potentially-useful facts about an optimized NFA
2985  */
2986 static long						/* re_info bits to be ORed in */
analyze(struct nfa * nfa)2987 analyze(struct nfa *nfa)
2988 {
2989 	struct arc *a;
2990 	struct arc *aa;
2991 
2992 	if (NISERR())
2993 		return 0;
2994 
2995 	/* Detect whether NFA can't match anything */
2996 	if (nfa->pre->outs == NULL)
2997 		return REG_UIMPOSSIBLE;
2998 
2999 	/* Detect whether NFA matches all strings (possibly with length bounds) */
3000 	checkmatchall(nfa);
3001 
3002 	/* Detect whether NFA can possibly match a zero-length string */
3003 	for (a = nfa->pre->outs; a != NULL; a = a->outchain)
3004 		for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
3005 			if (aa->to == nfa->post)
3006 				return REG_UEMPTYMATCH;
3007 	return 0;
3008 }
3009 
3010 /*
3011  * checkmatchall - does the NFA represent no more than a string length test?
3012  *
3013  * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
3014  * to begin with) and set the MATCHALL bit in nfa->flags.
3015  *
3016  * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
3017  * for pseudocolors (i.e., BOS/BOL/EOS/EOL).  We must be able to reach the
3018  * post state via RAINBOW arcs, and if there are any loops in the graph, they
3019  * must be loop-to-self arcs, ensuring that each loop iteration consumes
3020  * exactly one character.  (Longer loops are problematic because they create
3021  * non-consecutive possible match lengths; we have no good way to represent
3022  * that situation for lengths beyond the DUPINF limit.)
3023  *
3024  * Pseudocolor arcs complicate things a little.  We know that they can only
3025  * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
3026  * EOS/EOL).  There, they must exactly replicate the parallel RAINBOW arcs,
3027  * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
3028  * and BOL outarcs to state 2, and no others.  Missing or extra pseudocolor
3029  * arcs can occur, meaning that the NFA involves some constraint on the
3030  * adjacent characters, which makes it not a matchall NFA.
3031  */
3032 static void
checkmatchall(struct nfa * nfa)3033 checkmatchall(struct nfa *nfa)
3034 {
3035 	bool	  **haspaths;
3036 	struct state *s;
3037 	int			i;
3038 
3039 	/*
3040 	 * If there are too many states, don't bother trying to detect matchall.
3041 	 * This limit serves to bound the time and memory we could consume below.
3042 	 * Note that even if the graph is all-RAINBOW, if there are significantly
3043 	 * more than DUPINF states then it's likely that there are paths of length
3044 	 * more than DUPINF, which would force us to fail anyhow.  In practice,
3045 	 * plausible ways of writing a matchall regex with maximum finite path
3046 	 * length K tend not to have very many more than K states.
3047 	 */
3048 	if (nfa->nstates > DUPINF * 2)
3049 		return;
3050 
3051 	/*
3052 	 * First, scan all the states to verify that only RAINBOW arcs appear,
3053 	 * plus pseudocolor arcs adjacent to the pre and post states.  This lets
3054 	 * us quickly eliminate most cases that aren't matchall NFAs.
3055 	 */
3056 	for (s = nfa->states; s != NULL; s = s->next)
3057 	{
3058 		struct arc *a;
3059 
3060 		for (a = s->outs; a != NULL; a = a->outchain)
3061 		{
3062 			if (a->type != PLAIN)
3063 				return;			/* any LACONs make it non-matchall */
3064 			if (a->co != RAINBOW)
3065 			{
3066 				if (nfa->cm->cd[a->co].flags & PSEUDO)
3067 				{
3068 					/*
3069 					 * Pseudocolor arc: verify it's in a valid place (this
3070 					 * seems quite unlikely to fail, but let's be sure).
3071 					 */
3072 					if (s == nfa->pre &&
3073 						(a->co == nfa->bos[0] || a->co == nfa->bos[1]))
3074 						 /* okay BOS/BOL arc */ ;
3075 					else if (a->to == nfa->post &&
3076 							 (a->co == nfa->eos[0] || a->co == nfa->eos[1]))
3077 						 /* okay EOS/EOL arc */ ;
3078 					else
3079 						return; /* unexpected pseudocolor arc */
3080 					/* We'll check these arcs some more below. */
3081 				}
3082 				else
3083 					return;		/* any other color makes it non-matchall */
3084 			}
3085 		}
3086 		/* Also, assert that the tmp fields are available for use. */
3087 		assert(s->tmp == NULL);
3088 	}
3089 
3090 	/*
3091 	 * The next cheapest check we can make is to verify that the BOS/BOL
3092 	 * outarcs of the pre state reach the same states as its RAINBOW outarcs.
3093 	 * If they don't, the NFA expresses some constraints on the character
3094 	 * before the matched string, making it non-matchall.  Likewise, the
3095 	 * EOS/EOL inarcs of the post state must match its RAINBOW inarcs.
3096 	 */
3097 	if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) ||
3098 		!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) ||
3099 		!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
3100 		!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]))
3101 		return;
3102 
3103 	/*
3104 	 * Initialize an array of path-length arrays, in which
3105 	 * checkmatchall_recurse will return per-state results.  This lets us
3106 	 * memo-ize the recursive search and avoid exponential time consumption.
3107 	 */
3108 	haspaths = (bool **) MALLOC(nfa->nstates * sizeof(bool *));
3109 	if (haspaths == NULL)
3110 		return;					/* fail quietly */
3111 	memset(haspaths, 0, nfa->nstates * sizeof(bool *));
3112 
3113 	/*
3114 	 * Recursively search the graph for all-RAINBOW paths to the "post" state,
3115 	 * starting at the "pre" state, and computing the lengths of the paths.
3116 	 * (Given the preceding checks, there should be at least one such path.
3117 	 * However we could get back a false result anyway, in case there are
3118 	 * multi-state loops, paths exceeding DUPINF+1 length, or non-algorithmic
3119 	 * failures such as ENOMEM.)
3120 	 */
3121 	if (checkmatchall_recurse(nfa, nfa->pre, haspaths))
3122 	{
3123 		/* The useful result is the path length array for the pre state */
3124 		bool	   *haspath = haspaths[nfa->pre->no];
3125 		int			minmatch,
3126 					maxmatch,
3127 					morematch;
3128 
3129 		assert(haspath != NULL);
3130 
3131 		/*
3132 		 * haspath[] now represents the set of possible path lengths; but we
3133 		 * want to reduce that to a min and max value, because it doesn't seem
3134 		 * worth complicating regexec.c to deal with nonconsecutive possible
3135 		 * match lengths.  Find min and max of first run of lengths, then
3136 		 * verify there are no nonconsecutive lengths.
3137 		 */
3138 		for (minmatch = 0; minmatch <= DUPINF + 1; minmatch++)
3139 		{
3140 			if (haspath[minmatch])
3141 				break;
3142 		}
3143 		assert(minmatch <= DUPINF + 1); /* else checkmatchall_recurse lied */
3144 		for (maxmatch = minmatch; maxmatch < DUPINF + 1; maxmatch++)
3145 		{
3146 			if (!haspath[maxmatch + 1])
3147 				break;
3148 		}
3149 		for (morematch = maxmatch + 1; morematch <= DUPINF + 1; morematch++)
3150 		{
3151 			if (haspath[morematch])
3152 			{
3153 				haspath = NULL; /* fail, there are nonconsecutive lengths */
3154 				break;
3155 			}
3156 		}
3157 
3158 		if (haspath != NULL)
3159 		{
3160 			/*
3161 			 * Success, so record the info.  Here we have a fine point: the
3162 			 * path length from the pre state includes the pre-to-initial
3163 			 * transition, so it's one more than the actually matched string
3164 			 * length.  (We avoided counting the final-to-post transition
3165 			 * within checkmatchall_recurse, but not this one.)  This is why
3166 			 * checkmatchall_recurse allows one more level of path length than
3167 			 * might seem necessary.  This decrement also takes care of
3168 			 * converting checkmatchall_recurse's definition of "infinity" as
3169 			 * "DUPINF+1" to our normal representation as "DUPINF".
3170 			 */
3171 			assert(minmatch > 0);	/* else pre and post states were adjacent */
3172 			nfa->minmatchall = minmatch - 1;
3173 			nfa->maxmatchall = maxmatch - 1;
3174 			nfa->flags |= MATCHALL;
3175 		}
3176 	}
3177 
3178 	/* Clean up */
3179 	for (i = 0; i < nfa->nstates; i++)
3180 	{
3181 		if (haspaths[i] != NULL)
3182 			FREE(haspaths[i]);
3183 	}
3184 	FREE(haspaths);
3185 }
3186 
3187 /*
3188  * checkmatchall_recurse - recursive search for checkmatchall
3189  *
3190  * s is the state to be examined in this recursion level.
3191  * haspaths[] is an array of per-state exit path length arrays.
3192  *
3193  * We return true if the search was performed successfully, false if
3194  * we had to fail because of multi-state loops or other internal reasons.
3195  * (Because "dead" states that can't reach the post state have been
3196  * eliminated, and we already verified that only RAINBOW and matching
3197  * pseudocolor arcs exist, every state should have RAINBOW path(s) to
3198  * the post state.  Hence we take a false result from recursive calls
3199  * as meaning that we'd better fail altogether, not just that that
3200  * particular state can't reach the post state.)
3201  *
3202  * On success, we store a malloc'd result array in haspaths[s->no],
3203  * showing the possible path lengths from s to the post state.
3204  * Each state's haspath[] array is of length DUPINF+2.  The entries from
3205  * k = 0 to DUPINF are true if there is an all-RAINBOW path of length k
3206  * from this state to the string end.  haspath[DUPINF+1] is true if all
3207  * path lengths >= DUPINF+1 are possible.  (Situations that cannot be
3208  * represented under these rules cause failure.)
3209  *
3210  * checkmatchall is responsible for eventually freeing the haspath[] arrays.
3211  */
3212 static bool
checkmatchall_recurse(struct nfa * nfa,struct state * s,bool ** haspaths)3213 checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths)
3214 {
3215 	bool		result = false;
3216 	bool		foundloop = false;
3217 	bool	   *haspath;
3218 	struct arc *a;
3219 
3220 	/*
3221 	 * Since this is recursive, it could be driven to stack overflow.  But we
3222 	 * need not treat that as a hard failure; just deem the NFA non-matchall.
3223 	 */
3224 	if (STACK_TOO_DEEP(nfa->v->re))
3225 		return false;
3226 
3227 	/* In case the search takes a long time, check for cancel */
3228 	if (CANCEL_REQUESTED(nfa->v->re))
3229 	{
3230 		NERR(REG_CANCEL);
3231 		return false;
3232 	}
3233 
3234 	/* Create a haspath array for this state */
3235 	haspath = (bool *) MALLOC((DUPINF + 2) * sizeof(bool));
3236 	if (haspath == NULL)
3237 		return false;			/* again, treat as non-matchall */
3238 	memset(haspath, 0, (DUPINF + 2) * sizeof(bool));
3239 
3240 	/* Mark this state as being visited */
3241 	assert(s->tmp == NULL);
3242 	s->tmp = s;
3243 
3244 	for (a = s->outs; a != NULL; a = a->outchain)
3245 	{
3246 		if (a->co != RAINBOW)
3247 			continue;			/* ignore pseudocolor arcs */
3248 		if (a->to == nfa->post)
3249 		{
3250 			/* We found an all-RAINBOW path to the post state */
3251 			result = true;
3252 
3253 			/*
3254 			 * Mark this state as being zero steps away from the string end
3255 			 * (the transition to the post state isn't counted).
3256 			 */
3257 			haspath[0] = true;
3258 		}
3259 		else if (a->to == s)
3260 		{
3261 			/* We found a cycle of length 1, which we'll deal with below. */
3262 			foundloop = true;
3263 		}
3264 		else if (a->to->tmp != NULL)
3265 		{
3266 			/* It's busy, so we found a cycle of length > 1, so fail. */
3267 			result = false;
3268 			break;
3269 		}
3270 		else
3271 		{
3272 			/* Consider paths forward through this to-state. */
3273 			bool	   *nexthaspath;
3274 			int			i;
3275 
3276 			/* If to-state was not already visited, recurse */
3277 			if (haspaths[a->to->no] == NULL)
3278 			{
3279 				result = checkmatchall_recurse(nfa, a->to, haspaths);
3280 				/* Fail if any recursive path fails */
3281 				if (!result)
3282 					break;
3283 			}
3284 			else
3285 			{
3286 				/* The previous visit must have found path(s) to the end */
3287 				result = true;
3288 			}
3289 			assert(a->to->tmp == NULL);
3290 			nexthaspath = haspaths[a->to->no];
3291 			assert(nexthaspath != NULL);
3292 
3293 			/*
3294 			 * Now, for every path of length i from a->to to the string end,
3295 			 * there is a path of length i + 1 from s to the string end.
3296 			 */
3297 			if (nexthaspath[DUPINF] != nexthaspath[DUPINF + 1])
3298 			{
3299 				/*
3300 				 * a->to has a path of length exactly DUPINF, but not longer;
3301 				 * or it has paths of all lengths > DUPINF but not one of
3302 				 * exactly that length.  In either case, we cannot represent
3303 				 * the possible path lengths from s correctly, so fail.
3304 				 */
3305 				result = false;
3306 				break;
3307 			}
3308 			/* Merge knowledge of these path lengths into what we have */
3309 			for (i = 0; i < DUPINF; i++)
3310 				haspath[i + 1] |= nexthaspath[i];
3311 			/* Infinity + 1 is still infinity */
3312 			haspath[DUPINF + 1] |= nexthaspath[DUPINF + 1];
3313 		}
3314 	}
3315 
3316 	if (result && foundloop)
3317 	{
3318 		/*
3319 		 * If there is a length-1 loop at this state, then find the shortest
3320 		 * known path length to the end.  The loop means that every larger
3321 		 * path length is possible, too.  (It doesn't matter whether any of
3322 		 * the longer lengths were already known possible.)
3323 		 */
3324 		int			i;
3325 
3326 		for (i = 0; i <= DUPINF; i++)
3327 		{
3328 			if (haspath[i])
3329 				break;
3330 		}
3331 		for (i++; i <= DUPINF + 1; i++)
3332 			haspath[i] = true;
3333 	}
3334 
3335 	/* Report out the completed path length map */
3336 	assert(s->no < nfa->nstates);
3337 	assert(haspaths[s->no] == NULL);
3338 	haspaths[s->no] = haspath;
3339 
3340 	/* Mark state no longer busy */
3341 	s->tmp = NULL;
3342 
3343 	return result;
3344 }
3345 
3346 /*
3347  * check_out_colors_match - subroutine for checkmatchall
3348  *
3349  * Check whether the set of states reachable from s by arcs of color co1
3350  * is equivalent to the set reachable by arcs of color co2.
3351  * checkmatchall already verified that all of the NFA's arcs are PLAIN,
3352  * so we need not examine arc types here.
3353  */
3354 static bool
check_out_colors_match(struct state * s,color co1,color co2)3355 check_out_colors_match(struct state *s, color co1, color co2)
3356 {
3357 	bool		result = true;
3358 	struct arc *a;
3359 
3360 	/*
3361 	 * To do this in linear time, we assume that the NFA contains no duplicate
3362 	 * arcs.  Run through the out-arcs, marking states reachable by arcs of
3363 	 * color co1.  Run through again, un-marking states reachable by arcs of
3364 	 * color co2; if we see a not-marked state, we know this co2 arc is
3365 	 * unmatched.  Then run through again, checking for still-marked states,
3366 	 * and in any case leaving all the tmp fields reset to NULL.
3367 	 */
3368 	for (a = s->outs; a != NULL; a = a->outchain)
3369 	{
3370 		if (a->co == co1)
3371 		{
3372 			assert(a->to->tmp == NULL);
3373 			a->to->tmp = a->to;
3374 		}
3375 	}
3376 	for (a = s->outs; a != NULL; a = a->outchain)
3377 	{
3378 		if (a->co == co2)
3379 		{
3380 			if (a->to->tmp != NULL)
3381 				a->to->tmp = NULL;
3382 			else
3383 				result = false; /* unmatched co2 arc */
3384 		}
3385 	}
3386 	for (a = s->outs; a != NULL; a = a->outchain)
3387 	{
3388 		if (a->co == co1)
3389 		{
3390 			if (a->to->tmp != NULL)
3391 			{
3392 				result = false; /* unmatched co1 arc */
3393 				a->to->tmp = NULL;
3394 			}
3395 		}
3396 	}
3397 	return result;
3398 }
3399 
3400 /*
3401  * check_in_colors_match - subroutine for checkmatchall
3402  *
3403  * Check whether the set of states that can reach s by arcs of color co1
3404  * is equivalent to the set that can reach s by arcs of color co2.
3405  * checkmatchall already verified that all of the NFA's arcs are PLAIN,
3406  * so we need not examine arc types here.
3407  */
3408 static bool
check_in_colors_match(struct state * s,color co1,color co2)3409 check_in_colors_match(struct state *s, color co1, color co2)
3410 {
3411 	bool		result = true;
3412 	struct arc *a;
3413 
3414 	/*
3415 	 * Identical algorithm to check_out_colors_match, except examine the
3416 	 * from-states of s' inarcs.
3417 	 */
3418 	for (a = s->ins; a != NULL; a = a->inchain)
3419 	{
3420 		if (a->co == co1)
3421 		{
3422 			assert(a->from->tmp == NULL);
3423 			a->from->tmp = a->from;
3424 		}
3425 	}
3426 	for (a = s->ins; a != NULL; a = a->inchain)
3427 	{
3428 		if (a->co == co2)
3429 		{
3430 			if (a->from->tmp != NULL)
3431 				a->from->tmp = NULL;
3432 			else
3433 				result = false; /* unmatched co2 arc */
3434 		}
3435 	}
3436 	for (a = s->ins; a != NULL; a = a->inchain)
3437 	{
3438 		if (a->co == co1)
3439 		{
3440 			if (a->from->tmp != NULL)
3441 			{
3442 				result = false; /* unmatched co1 arc */
3443 				a->from->tmp = NULL;
3444 			}
3445 		}
3446 	}
3447 	return result;
3448 }
3449 
3450 /*
3451  * compact - construct the compact representation of an NFA
3452  */
3453 static void
compact(struct nfa * nfa,struct cnfa * cnfa)3454 compact(struct nfa *nfa,
3455 		struct cnfa *cnfa)
3456 {
3457 	struct state *s;
3458 	struct arc *a;
3459 	size_t		nstates;
3460 	size_t		narcs;
3461 	struct carc *ca;
3462 	struct carc *first;
3463 
3464 	assert(!NISERR());
3465 
3466 	nstates = 0;
3467 	narcs = 0;
3468 	for (s = nfa->states; s != NULL; s = s->next)
3469 	{
3470 		nstates++;
3471 		narcs += s->nouts + 1;	/* need one extra for endmarker */
3472 	}
3473 
3474 	cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
3475 	cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
3476 	cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
3477 	if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
3478 	{
3479 		if (cnfa->stflags != NULL)
3480 			FREE(cnfa->stflags);
3481 		if (cnfa->states != NULL)
3482 			FREE(cnfa->states);
3483 		if (cnfa->arcs != NULL)
3484 			FREE(cnfa->arcs);
3485 		NERR(REG_ESPACE);
3486 		return;
3487 	}
3488 	cnfa->nstates = nstates;
3489 	cnfa->pre = nfa->pre->no;
3490 	cnfa->post = nfa->post->no;
3491 	cnfa->bos[0] = nfa->bos[0];
3492 	cnfa->bos[1] = nfa->bos[1];
3493 	cnfa->eos[0] = nfa->eos[0];
3494 	cnfa->eos[1] = nfa->eos[1];
3495 	cnfa->ncolors = maxcolor(nfa->cm) + 1;
3496 	cnfa->flags = nfa->flags;
3497 	cnfa->minmatchall = nfa->minmatchall;
3498 	cnfa->maxmatchall = nfa->maxmatchall;
3499 
3500 	ca = cnfa->arcs;
3501 	for (s = nfa->states; s != NULL; s = s->next)
3502 	{
3503 		assert((size_t) s->no < nstates);
3504 		cnfa->stflags[s->no] = 0;
3505 		cnfa->states[s->no] = ca;
3506 		first = ca;
3507 		for (a = s->outs; a != NULL; a = a->outchain)
3508 			switch (a->type)
3509 			{
3510 				case PLAIN:
3511 					ca->co = a->co;
3512 					ca->to = a->to->no;
3513 					ca++;
3514 					break;
3515 				case LACON:
3516 					assert(s->no != cnfa->pre);
3517 					assert(a->co >= 0);
3518 					ca->co = (color) (cnfa->ncolors + a->co);
3519 					ca->to = a->to->no;
3520 					ca++;
3521 					cnfa->flags |= HASLACONS;
3522 					break;
3523 				default:
3524 					NERR(REG_ASSERT);
3525 					return;
3526 			}
3527 		carcsort(first, ca - first);
3528 		ca->co = COLORLESS;
3529 		ca->to = 0;
3530 		ca++;
3531 	}
3532 	assert(ca == &cnfa->arcs[narcs]);
3533 	assert(cnfa->nstates != 0);
3534 
3535 	/* mark no-progress states */
3536 	for (a = nfa->pre->outs; a != NULL; a = a->outchain)
3537 		cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
3538 	cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
3539 }
3540 
3541 /*
3542  * carcsort - sort compacted-NFA arcs by color
3543  */
3544 static void
carcsort(struct carc * first,size_t n)3545 carcsort(struct carc *first, size_t n)
3546 {
3547 	if (n > 1)
3548 		qsort(first, n, sizeof(struct carc), carc_cmp);
3549 }
3550 
3551 static int
carc_cmp(const void * a,const void * b)3552 carc_cmp(const void *a, const void *b)
3553 {
3554 	const struct carc *aa = (const struct carc *) a;
3555 	const struct carc *bb = (const struct carc *) b;
3556 
3557 	if (aa->co < bb->co)
3558 		return -1;
3559 	if (aa->co > bb->co)
3560 		return +1;
3561 	if (aa->to < bb->to)
3562 		return -1;
3563 	if (aa->to > bb->to)
3564 		return +1;
3565 	return 0;
3566 }
3567 
3568 /*
3569  * freecnfa - free a compacted NFA
3570  */
3571 static void
freecnfa(struct cnfa * cnfa)3572 freecnfa(struct cnfa *cnfa)
3573 {
3574 	assert(!NULLCNFA(*cnfa));	/* not empty already */
3575 	FREE(cnfa->stflags);
3576 	FREE(cnfa->states);
3577 	FREE(cnfa->arcs);
3578 	ZAPCNFA(*cnfa);
3579 }
3580 
3581 /*
3582  * dumpnfa - dump an NFA in human-readable form
3583  */
3584 static void
dumpnfa(struct nfa * nfa,FILE * f)3585 dumpnfa(struct nfa *nfa,
3586 		FILE *f)
3587 {
3588 #ifdef REG_DEBUG
3589 	struct state *s;
3590 	int			nstates = 0;
3591 	int			narcs = 0;
3592 
3593 	fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
3594 	if (nfa->bos[0] != COLORLESS)
3595 		fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
3596 	if (nfa->bos[1] != COLORLESS)
3597 		fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
3598 	if (nfa->eos[0] != COLORLESS)
3599 		fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
3600 	if (nfa->eos[1] != COLORLESS)
3601 		fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
3602 	if (nfa->flags & HASLACONS)
3603 		fprintf(f, ", haslacons");
3604 	if (nfa->flags & MATCHALL)
3605 	{
3606 		fprintf(f, ", minmatchall %d", nfa->minmatchall);
3607 		if (nfa->maxmatchall == DUPINF)
3608 			fprintf(f, ", maxmatchall inf");
3609 		else
3610 			fprintf(f, ", maxmatchall %d", nfa->maxmatchall);
3611 	}
3612 	fprintf(f, "\n");
3613 	for (s = nfa->states; s != NULL; s = s->next)
3614 	{
3615 		dumpstate(s, f);
3616 		nstates++;
3617 		narcs += s->nouts;
3618 	}
3619 	fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
3620 	if (nfa->parent == NULL)
3621 		dumpcolors(nfa->cm, f);
3622 	fflush(f);
3623 #endif
3624 }
3625 
3626 #ifdef REG_DEBUG				/* subordinates of dumpnfa */
3627 
3628 /*
3629  * dumpstate - dump an NFA state in human-readable form
3630  */
3631 static void
dumpstate(struct state * s,FILE * f)3632 dumpstate(struct state *s,
3633 		  FILE *f)
3634 {
3635 	struct arc *a;
3636 
3637 	fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
3638 			(s->flag) ? s->flag : '.');
3639 	if (s->prev != NULL && s->prev->next != s)
3640 		fprintf(f, "\tstate chain bad\n");
3641 	if (s->nouts == 0)
3642 		fprintf(f, "\tno out arcs\n");
3643 	else
3644 		dumparcs(s, f);
3645 	for (a = s->ins; a != NULL; a = a->inchain)
3646 	{
3647 		if (a->to != s)
3648 			fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
3649 					a->from->no, a->to->no, s->no);
3650 	}
3651 	fflush(f);
3652 }
3653 
3654 /*
3655  * dumparcs - dump out-arcs in human-readable form
3656  */
3657 static void
dumparcs(struct state * s,FILE * f)3658 dumparcs(struct state *s,
3659 		 FILE *f)
3660 {
3661 	int			pos;
3662 	struct arc *a;
3663 
3664 	/* printing oldest arcs first is usually clearer */
3665 	a = s->outs;
3666 	assert(a != NULL);
3667 	while (a->outchain != NULL)
3668 		a = a->outchain;
3669 	pos = 1;
3670 	do
3671 	{
3672 		dumparc(a, s, f);
3673 		if (pos == 5)
3674 		{
3675 			fprintf(f, "\n");
3676 			pos = 1;
3677 		}
3678 		else
3679 			pos++;
3680 		a = a->outchainRev;
3681 	} while (a != NULL);
3682 	if (pos != 1)
3683 		fprintf(f, "\n");
3684 }
3685 
3686 /*
3687  * dumparc - dump one outarc in readable form, including prefixing tab
3688  */
3689 static void
dumparc(struct arc * a,struct state * s,FILE * f)3690 dumparc(struct arc *a,
3691 		struct state *s,
3692 		FILE *f)
3693 {
3694 	struct arc *aa;
3695 
3696 	fprintf(f, "\t");
3697 	switch (a->type)
3698 	{
3699 		case PLAIN:
3700 			if (a->co == RAINBOW)
3701 				fprintf(f, "[*]");
3702 			else
3703 				fprintf(f, "[%ld]", (long) a->co);
3704 			break;
3705 		case AHEAD:
3706 			if (a->co == RAINBOW)
3707 				fprintf(f, ">*>");
3708 			else
3709 				fprintf(f, ">%ld>", (long) a->co);
3710 			break;
3711 		case BEHIND:
3712 			if (a->co == RAINBOW)
3713 				fprintf(f, "<*<");
3714 			else
3715 				fprintf(f, "<%ld<", (long) a->co);
3716 			break;
3717 		case LACON:
3718 			fprintf(f, ":%ld:", (long) a->co);
3719 			break;
3720 		case '^':
3721 		case '$':
3722 			fprintf(f, "%c%d", a->type, (int) a->co);
3723 			break;
3724 		case EMPTY:
3725 			break;
3726 		default:
3727 			fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
3728 			break;
3729 	}
3730 	if (a->from != s)
3731 		fprintf(f, "?%d?", a->from->no);
3732 	for (aa = a->from->outs; aa != NULL; aa = aa->outchain)
3733 		if (aa == a)
3734 			break;				/* NOTE BREAK OUT */
3735 	if (aa == NULL)
3736 		fprintf(f, "?!?");		/* missing from out-chain */
3737 	fprintf(f, "->");
3738 	if (a->to == NULL)
3739 	{
3740 		fprintf(f, "NULL");
3741 		return;
3742 	}
3743 	fprintf(f, "%d", a->to->no);
3744 	for (aa = a->to->ins; aa != NULL; aa = aa->inchain)
3745 		if (aa == a)
3746 			break;				/* NOTE BREAK OUT */
3747 	if (aa == NULL)
3748 		fprintf(f, "?!?");		/* missing from in-chain */
3749 }
3750 #endif							/* REG_DEBUG */
3751 
3752 /*
3753  * dumpcnfa - dump a compacted NFA in human-readable form
3754  */
3755 #ifdef REG_DEBUG
3756 static void
dumpcnfa(struct cnfa * cnfa,FILE * f)3757 dumpcnfa(struct cnfa *cnfa,
3758 		 FILE *f)
3759 {
3760 	int			st;
3761 
3762 	fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
3763 	if (cnfa->bos[0] != COLORLESS)
3764 		fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
3765 	if (cnfa->bos[1] != COLORLESS)
3766 		fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
3767 	if (cnfa->eos[0] != COLORLESS)
3768 		fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
3769 	if (cnfa->eos[1] != COLORLESS)
3770 		fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
3771 	if (cnfa->flags & HASLACONS)
3772 		fprintf(f, ", haslacons");
3773 	if (cnfa->flags & MATCHALL)
3774 	{
3775 		fprintf(f, ", minmatchall %d", cnfa->minmatchall);
3776 		if (cnfa->maxmatchall == DUPINF)
3777 			fprintf(f, ", maxmatchall inf");
3778 		else
3779 			fprintf(f, ", maxmatchall %d", cnfa->maxmatchall);
3780 	}
3781 	fprintf(f, "\n");
3782 	for (st = 0; st < cnfa->nstates; st++)
3783 		dumpcstate(st, cnfa, f);
3784 	fflush(f);
3785 }
3786 #endif
3787 
3788 #ifdef REG_DEBUG				/* subordinates of dumpcnfa */
3789 
3790 /*
3791  * dumpcstate - dump a compacted-NFA state in human-readable form
3792  */
3793 static void
dumpcstate(int st,struct cnfa * cnfa,FILE * f)3794 dumpcstate(int st,
3795 		   struct cnfa *cnfa,
3796 		   FILE *f)
3797 {
3798 	struct carc *ca;
3799 	int			pos;
3800 
3801 	fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
3802 	pos = 1;
3803 	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
3804 	{
3805 		if (ca->co == RAINBOW)
3806 			fprintf(f, "\t[*]->%d", ca->to);
3807 		else if (ca->co < cnfa->ncolors)
3808 			fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
3809 		else
3810 			fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
3811 		if (pos == 5)
3812 		{
3813 			fprintf(f, "\n");
3814 			pos = 1;
3815 		}
3816 		else
3817 			pos++;
3818 	}
3819 	if (ca == cnfa->states[st] || pos != 1)
3820 		fprintf(f, "\n");
3821 	fflush(f);
3822 }
3823 
3824 #endif							/* REG_DEBUG */
3825