1 /*
2  * DFA routines
3  * This file is #included by regexec.c.
4  *
5  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
6  *
7  * Development of this software was funded, in part, by Cray Research Inc.,
8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9  * Corporation, none of whom are responsible for the results.  The author
10  * thanks all of them.
11  *
12  * Redistribution and use in source and binary forms -- with or without
13  * modification -- are permitted for any purpose, provided that
14  * redistributions in source form retain this entire copyright notice and
15  * indicate the origin and nature of any modifications.
16  *
17  * I'd appreciate being given credit for this package in the documentation
18  * of software which uses it, but that is not a requirement.
19  *
20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * src/backend/regex/rege_dfa.c
32  *
33  */
34 
35 /*
36  * longest - longest-preferred matching engine
37  *
38  * On success, returns match endpoint address.  Returns NULL on no match.
39  * Internal errors also return NULL, with v->err set.
40  */
41 static chr *
longest(struct vars * v,struct dfa * d,chr * start,chr * stop,int * hitstopp)42 longest(struct vars *v,
43 		struct dfa *d,
44 		chr *start,				/* where the match should start */
45 		chr *stop,				/* match must end at or before here */
46 		int *hitstopp)			/* record whether hit v->stop, if non-NULL */
47 {
48 	chr		   *cp;
49 	chr		   *realstop = (stop == v->stop) ? stop : stop + 1;
50 	color		co;
51 	struct sset *css;
52 	struct sset *ss;
53 	chr		   *post;
54 	int			i;
55 	struct colormap *cm = d->cm;
56 
57 	/* prevent "uninitialized variable" warnings */
58 	if (hitstopp != NULL)
59 		*hitstopp = 0;
60 
61 	/* initialize */
62 	css = initialize(v, d, start);
63 	if (css == NULL)
64 		return NULL;
65 	cp = start;
66 
67 	/* startup */
68 	FDEBUG(("+++ startup +++\n"));
69 	if (cp == v->start)
70 	{
71 		co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
72 		FDEBUG(("color %ld\n", (long) co));
73 	}
74 	else
75 	{
76 		co = GETCOLOR(cm, *(cp - 1));
77 		FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
78 	}
79 	css = miss(v, d, css, co, cp, start);
80 	if (css == NULL)
81 		return NULL;
82 	css->lastseen = cp;
83 
84 	/*
85 	 * This is the main text-scanning loop.  It seems worth having two copies
86 	 * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
87 	 * builds, when you're not actively tracing.
88 	 */
89 #ifdef REG_DEBUG
90 	if (v->eflags & REG_FTRACE)
91 	{
92 		while (cp < realstop)
93 		{
94 			FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
95 			co = GETCOLOR(cm, *cp);
96 			FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
97 			ss = css->outs[co];
98 			if (ss == NULL)
99 			{
100 				ss = miss(v, d, css, co, cp + 1, start);
101 				if (ss == NULL)
102 					break;		/* NOTE BREAK OUT */
103 			}
104 			cp++;
105 			ss->lastseen = cp;
106 			css = ss;
107 		}
108 	}
109 	else
110 #endif
111 	{
112 		while (cp < realstop)
113 		{
114 			co = GETCOLOR(cm, *cp);
115 			ss = css->outs[co];
116 			if (ss == NULL)
117 			{
118 				ss = miss(v, d, css, co, cp + 1, start);
119 				if (ss == NULL)
120 					break;		/* NOTE BREAK OUT */
121 			}
122 			cp++;
123 			ss->lastseen = cp;
124 			css = ss;
125 		}
126 	}
127 
128 	if (ISERR())
129 		return NULL;
130 
131 	/* shutdown */
132 	FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
133 	if (cp == v->stop && stop == v->stop)
134 	{
135 		if (hitstopp != NULL)
136 			*hitstopp = 1;
137 		co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
138 		FDEBUG(("color %ld\n", (long) co));
139 		ss = miss(v, d, css, co, cp, start);
140 		if (ISERR())
141 			return NULL;
142 		/* special case:  match ended at eol? */
143 		if (ss != NULL && (ss->flags & POSTSTATE))
144 			return cp;
145 		else if (ss != NULL)
146 			ss->lastseen = cp;	/* to be tidy */
147 	}
148 
149 	/* find last match, if any */
150 	post = d->lastpost;
151 	for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
152 		if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
153 			(post == NULL || post < ss->lastseen))
154 			post = ss->lastseen;
155 	if (post != NULL)			/* found one */
156 		return post - 1;
157 
158 	return NULL;
159 }
160 
161 /*
162  * shortest - shortest-preferred matching engine
163  *
164  * On success, returns match endpoint address.  Returns NULL on no match.
165  * Internal errors also return NULL, with v->err set.
166  */
167 static chr *
shortest(struct vars * v,struct dfa * d,chr * start,chr * min,chr * max,chr ** coldp,int * hitstopp)168 shortest(struct vars *v,
169 		 struct dfa *d,
170 		 chr *start,			/* where the match should start */
171 		 chr *min,				/* match must end at or after here */
172 		 chr *max,				/* match must end at or before here */
173 		 chr **coldp,			/* store coldstart pointer here, if non-NULL */
174 		 int *hitstopp)			/* record whether hit v->stop, if non-NULL */
175 {
176 	chr		   *cp;
177 	chr		   *realmin = (min == v->stop) ? min : min + 1;
178 	chr		   *realmax = (max == v->stop) ? max : max + 1;
179 	color		co;
180 	struct sset *css;
181 	struct sset *ss;
182 	struct colormap *cm = d->cm;
183 
184 	/* prevent "uninitialized variable" warnings */
185 	if (coldp != NULL)
186 		*coldp = NULL;
187 	if (hitstopp != NULL)
188 		*hitstopp = 0;
189 
190 	/* initialize */
191 	css = initialize(v, d, start);
192 	if (css == NULL)
193 		return NULL;
194 	cp = start;
195 
196 	/* startup */
197 	FDEBUG(("--- startup ---\n"));
198 	if (cp == v->start)
199 	{
200 		co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
201 		FDEBUG(("color %ld\n", (long) co));
202 	}
203 	else
204 	{
205 		co = GETCOLOR(cm, *(cp - 1));
206 		FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
207 	}
208 	css = miss(v, d, css, co, cp, start);
209 	if (css == NULL)
210 		return NULL;
211 	css->lastseen = cp;
212 	ss = css;
213 
214 	/*
215 	 * This is the main text-scanning loop.  It seems worth having two copies
216 	 * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
217 	 * builds, when you're not actively tracing.
218 	 */
219 #ifdef REG_DEBUG
220 	if (v->eflags & REG_FTRACE)
221 	{
222 		while (cp < realmax)
223 		{
224 			FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
225 			co = GETCOLOR(cm, *cp);
226 			FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
227 			ss = css->outs[co];
228 			if (ss == NULL)
229 			{
230 				ss = miss(v, d, css, co, cp + 1, start);
231 				if (ss == NULL)
232 					break;		/* NOTE BREAK OUT */
233 			}
234 			cp++;
235 			ss->lastseen = cp;
236 			css = ss;
237 			if ((ss->flags & POSTSTATE) && cp >= realmin)
238 				break;			/* NOTE BREAK OUT */
239 		}
240 	}
241 	else
242 #endif
243 	{
244 		while (cp < realmax)
245 		{
246 			co = GETCOLOR(cm, *cp);
247 			ss = css->outs[co];
248 			if (ss == NULL)
249 			{
250 				ss = miss(v, d, css, co, cp + 1, start);
251 				if (ss == NULL)
252 					break;		/* NOTE BREAK OUT */
253 			}
254 			cp++;
255 			ss->lastseen = cp;
256 			css = ss;
257 			if ((ss->flags & POSTSTATE) && cp >= realmin)
258 				break;			/* NOTE BREAK OUT */
259 		}
260 	}
261 
262 	if (ss == NULL)
263 		return NULL;
264 
265 	if (coldp != NULL)			/* report last no-progress state set, if any */
266 		*coldp = lastcold(v, d);
267 
268 	if ((ss->flags & POSTSTATE) && cp > min)
269 	{
270 		assert(cp >= realmin);
271 		cp--;
272 	}
273 	else if (cp == v->stop && max == v->stop)
274 	{
275 		co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
276 		FDEBUG(("color %ld\n", (long) co));
277 		ss = miss(v, d, css, co, cp, start);
278 		/* match might have ended at eol */
279 		if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
280 			*hitstopp = 1;
281 	}
282 
283 	if (ss == NULL || !(ss->flags & POSTSTATE))
284 		return NULL;
285 
286 	return cp;
287 }
288 
289 /*
290  * matchuntil - incremental matching engine
291  *
292  * This is meant for use with a search-style NFA (that is, the pattern is
293  * known to act as though it had a leading .*).  We determine whether a
294  * match exists starting at v->start and ending at probe.  Multiple calls
295  * require only O(N) time not O(N^2) so long as the probe values are
296  * nondecreasing.  *lastcss and *lastcp must be initialized to NULL before
297  * starting a series of calls.
298  *
299  * Returns 1 if a match exists, 0 if not.
300  * Internal errors also return 0, with v->err set.
301  */
302 static int
matchuntil(struct vars * v,struct dfa * d,chr * probe,struct sset ** lastcss,chr ** lastcp)303 matchuntil(struct vars *v,
304 		   struct dfa *d,
305 		   chr *probe,			/* we want to know if a match ends here */
306 		   struct sset **lastcss,	/* state storage across calls */
307 		   chr **lastcp)		/* state storage across calls */
308 {
309 	chr		   *cp = *lastcp;
310 	color		co;
311 	struct sset *css = *lastcss;
312 	struct sset *ss;
313 	struct colormap *cm = d->cm;
314 
315 	/* initialize and startup, or restart, if necessary */
316 	if (cp == NULL || cp > probe)
317 	{
318 		cp = v->start;
319 		css = initialize(v, d, cp);
320 		if (css == NULL)
321 			return 0;
322 
323 		FDEBUG((">>> startup >>>\n"));
324 		co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
325 		FDEBUG(("color %ld\n", (long) co));
326 
327 		css = miss(v, d, css, co, cp, v->start);
328 		if (css == NULL)
329 			return 0;
330 		css->lastseen = cp;
331 	}
332 	else if (css == NULL)
333 	{
334 		/* we previously found that no match is possible beyond *lastcp */
335 		return 0;
336 	}
337 	ss = css;
338 
339 	/*
340 	 * This is the main text-scanning loop.  It seems worth having two copies
341 	 * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
342 	 * builds, when you're not actively tracing.
343 	 */
344 #ifdef REG_DEBUG
345 	if (v->eflags & REG_FTRACE)
346 	{
347 		while (cp < probe)
348 		{
349 			FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
350 			co = GETCOLOR(cm, *cp);
351 			FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
352 			ss = css->outs[co];
353 			if (ss == NULL)
354 			{
355 				ss = miss(v, d, css, co, cp + 1, v->start);
356 				if (ss == NULL)
357 					break;		/* NOTE BREAK OUT */
358 			}
359 			cp++;
360 			ss->lastseen = cp;
361 			css = ss;
362 		}
363 	}
364 	else
365 #endif
366 	{
367 		while (cp < probe)
368 		{
369 			co = GETCOLOR(cm, *cp);
370 			ss = css->outs[co];
371 			if (ss == NULL)
372 			{
373 				ss = miss(v, d, css, co, cp + 1, v->start);
374 				if (ss == NULL)
375 					break;		/* NOTE BREAK OUT */
376 			}
377 			cp++;
378 			ss->lastseen = cp;
379 			css = ss;
380 		}
381 	}
382 
383 	*lastcss = ss;
384 	*lastcp = cp;
385 
386 	if (ss == NULL)
387 		return 0;				/* impossible match, or internal error */
388 
389 	/* We need to process one more chr, or the EOS symbol, to check match */
390 	if (cp < v->stop)
391 	{
392 		FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
393 		co = GETCOLOR(cm, *cp);
394 		FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
395 		ss = css->outs[co];
396 		if (ss == NULL)
397 			ss = miss(v, d, css, co, cp + 1, v->start);
398 	}
399 	else
400 	{
401 		assert(cp == v->stop);
402 		co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
403 		FDEBUG(("color %ld\n", (long) co));
404 		ss = miss(v, d, css, co, cp, v->start);
405 	}
406 
407 	if (ss == NULL || !(ss->flags & POSTSTATE))
408 		return 0;
409 
410 	return 1;
411 }
412 
413 /*
414  * lastcold - determine last point at which no progress had been made
415  */
416 static chr *					/* endpoint, or NULL */
lastcold(struct vars * v,struct dfa * d)417 lastcold(struct vars *v,
418 		 struct dfa *d)
419 {
420 	struct sset *ss;
421 	chr		   *nopr;
422 	int			i;
423 
424 	nopr = d->lastnopr;
425 	if (nopr == NULL)
426 		nopr = v->start;
427 	for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
428 		if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
429 			nopr = ss->lastseen;
430 	return nopr;
431 }
432 
433 /*
434  * newdfa - set up a fresh DFA
435  */
436 static struct dfa *
newdfa(struct vars * v,struct cnfa * cnfa,struct colormap * cm,struct smalldfa * sml)437 newdfa(struct vars *v,
438 	   struct cnfa *cnfa,
439 	   struct colormap *cm,
440 	   struct smalldfa *sml)	/* preallocated space, may be NULL */
441 {
442 	struct dfa *d;
443 	size_t		nss = cnfa->nstates * 2;
444 	int			wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
445 	struct smalldfa *smallwas = sml;
446 
447 	assert(cnfa != NULL && cnfa->nstates != 0);
448 
449 	if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
450 	{
451 		assert(wordsper == 1);
452 		if (sml == NULL)
453 		{
454 			sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
455 			if (sml == NULL)
456 			{
457 				ERR(REG_ESPACE);
458 				return NULL;
459 			}
460 		}
461 		d = &sml->dfa;
462 		d->ssets = sml->ssets;
463 		d->statesarea = sml->statesarea;
464 		d->work = &d->statesarea[nss];
465 		d->outsarea = sml->outsarea;
466 		d->incarea = sml->incarea;
467 		d->cptsmalloced = 0;
468 		d->mallocarea = (smallwas == NULL) ? (char *) sml : NULL;
469 	}
470 	else
471 	{
472 		d = (struct dfa *) MALLOC(sizeof(struct dfa));
473 		if (d == NULL)
474 		{
475 			ERR(REG_ESPACE);
476 			return NULL;
477 		}
478 		d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
479 		d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
480 											sizeof(unsigned));
481 		d->work = &d->statesarea[nss * wordsper];
482 		d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
483 											  sizeof(struct sset *));
484 		d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
485 											sizeof(struct arcp));
486 		d->cptsmalloced = 1;
487 		d->mallocarea = (char *) d;
488 		if (d->ssets == NULL || d->statesarea == NULL ||
489 			d->outsarea == NULL || d->incarea == NULL)
490 		{
491 			freedfa(d);
492 			ERR(REG_ESPACE);
493 			return NULL;
494 		}
495 	}
496 
497 	d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
498 	d->nssused = 0;
499 	d->nstates = cnfa->nstates;
500 	d->ncolors = cnfa->ncolors;
501 	d->wordsper = wordsper;
502 	d->cnfa = cnfa;
503 	d->cm = cm;
504 	d->lastpost = NULL;
505 	d->lastnopr = NULL;
506 	d->search = d->ssets;
507 
508 	/* initialization of sset fields is done as needed */
509 
510 	return d;
511 }
512 
513 /*
514  * freedfa - free a DFA
515  */
516 static void
freedfa(struct dfa * d)517 freedfa(struct dfa *d)
518 {
519 	if (d->cptsmalloced)
520 	{
521 		if (d->ssets != NULL)
522 			FREE(d->ssets);
523 		if (d->statesarea != NULL)
524 			FREE(d->statesarea);
525 		if (d->outsarea != NULL)
526 			FREE(d->outsarea);
527 		if (d->incarea != NULL)
528 			FREE(d->incarea);
529 	}
530 
531 	if (d->mallocarea != NULL)
532 		FREE(d->mallocarea);
533 }
534 
535 /*
536  * hash - construct a hash code for a bitvector
537  *
538  * There are probably better ways, but they're more expensive.
539  */
540 static unsigned
hash(unsigned * uv,int n)541 hash(unsigned *uv,
542 	 int n)
543 {
544 	int			i;
545 	unsigned	h;
546 
547 	h = 0;
548 	for (i = 0; i < n; i++)
549 		h ^= uv[i];
550 	return h;
551 }
552 
553 /*
554  * initialize - hand-craft a cache entry for startup, otherwise get ready
555  */
556 static struct sset *
initialize(struct vars * v,struct dfa * d,chr * start)557 initialize(struct vars *v,
558 		   struct dfa *d,
559 		   chr *start)
560 {
561 	struct sset *ss;
562 	int			i;
563 
564 	/* is previous one still there? */
565 	if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
566 		ss = &d->ssets[0];
567 	else
568 	{							/* no, must (re)build it */
569 		ss = getvacant(v, d, start, start);
570 		if (ss == NULL)
571 			return NULL;
572 		for (i = 0; i < d->wordsper; i++)
573 			ss->states[i] = 0;
574 		BSET(ss->states, d->cnfa->pre);
575 		ss->hash = HASH(ss->states, d->wordsper);
576 		assert(d->cnfa->pre != d->cnfa->post);
577 		ss->flags = STARTER | LOCKED | NOPROGRESS;
578 		/* lastseen dealt with below */
579 	}
580 
581 	for (i = 0; i < d->nssused; i++)
582 		d->ssets[i].lastseen = NULL;
583 	ss->lastseen = start;		/* maybe untrue, but harmless */
584 	d->lastpost = NULL;
585 	d->lastnopr = NULL;
586 	return ss;
587 }
588 
589 /*
590  * miss - handle a stateset cache miss
591  *
592  * css is the current stateset, co is the color of the current input character,
593  * cp points to the character after that (which is where we may need to test
594  * LACONs).  start does not affect matching behavior but is needed for pickss'
595  * heuristics about which stateset cache entry to replace.
596  *
597  * Ordinarily, returns the address of the next stateset (the one that is
598  * valid after consuming the input character).  Returns NULL if no valid
599  * NFA states remain, ie we have a certain match failure.
600  * Internal errors also return NULL, with v->err set.
601  */
602 static struct sset *
miss(struct vars * v,struct dfa * d,struct sset * css,color co,chr * cp,chr * start)603 miss(struct vars *v,
604 	 struct dfa *d,
605 	 struct sset *css,
606 	 color co,
607 	 chr *cp,					/* next chr */
608 	 chr *start)				/* where the attempt got started */
609 {
610 	struct cnfa *cnfa = d->cnfa;
611 	int			i;
612 	unsigned	h;
613 	struct carc *ca;
614 	struct sset *p;
615 	int			ispost;
616 	int			noprogress;
617 	int			gotstate;
618 	int			dolacons;
619 	int			sawlacons;
620 
621 	/* for convenience, we can be called even if it might not be a miss */
622 	if (css->outs[co] != NULL)
623 	{
624 		FDEBUG(("hit\n"));
625 		return css->outs[co];
626 	}
627 	FDEBUG(("miss\n"));
628 
629 	/*
630 	 * Checking for operation cancel in the inner text search loop seems
631 	 * unduly expensive.  As a compromise, check during cache misses.
632 	 */
633 	if (CANCEL_REQUESTED(v->re))
634 	{
635 		ERR(REG_CANCEL);
636 		return NULL;
637 	}
638 
639 	/*
640 	 * What set of states would we end up in after consuming the co character?
641 	 * We first consider PLAIN arcs that consume the character, and then look
642 	 * to see what LACON arcs could be traversed after consuming it.
643 	 */
644 	for (i = 0; i < d->wordsper; i++)
645 		d->work[i] = 0;			/* build new stateset bitmap in d->work */
646 	ispost = 0;
647 	noprogress = 1;
648 	gotstate = 0;
649 	for (i = 0; i < d->nstates; i++)
650 		if (ISBSET(css->states, i))
651 			for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
652 				if (ca->co == co)
653 				{
654 					BSET(d->work, ca->to);
655 					gotstate = 1;
656 					if (ca->to == cnfa->post)
657 						ispost = 1;
658 					if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
659 						noprogress = 0;
660 					FDEBUG(("%d -> %d\n", i, ca->to));
661 				}
662 	if (!gotstate)
663 		return NULL;			/* character cannot reach any new state */
664 	dolacons = (cnfa->flags & HASLACONS);
665 	sawlacons = 0;
666 	/* outer loop handles transitive closure of reachable-by-LACON states */
667 	while (dolacons)
668 	{
669 		dolacons = 0;
670 		for (i = 0; i < d->nstates; i++)
671 			if (ISBSET(d->work, i))
672 				for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
673 				{
674 					if (ca->co < cnfa->ncolors)
675 						continue;	/* not a LACON arc */
676 					if (ISBSET(d->work, ca->to))
677 						continue;	/* arc would be a no-op anyway */
678 					sawlacons = 1;	/* this LACON affects our result */
679 					if (!lacon(v, cnfa, cp, ca->co))
680 					{
681 						if (ISERR())
682 							return NULL;
683 						continue;	/* LACON arc cannot be traversed */
684 					}
685 					if (ISERR())
686 						return NULL;
687 					BSET(d->work, ca->to);
688 					dolacons = 1;
689 					if (ca->to == cnfa->post)
690 						ispost = 1;
691 					if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
692 						noprogress = 0;
693 					FDEBUG(("%d :> %d\n", i, ca->to));
694 				}
695 	}
696 	h = HASH(d->work, d->wordsper);
697 
698 	/* Is this stateset already in the cache? */
699 	for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
700 		if (HIT(h, d->work, p, d->wordsper))
701 		{
702 			FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
703 			break;				/* NOTE BREAK OUT */
704 		}
705 	if (i == 0)
706 	{							/* nope, need a new cache entry */
707 		p = getvacant(v, d, cp, start);
708 		if (p == NULL)
709 			return NULL;
710 		assert(p != css);
711 		for (i = 0; i < d->wordsper; i++)
712 			p->states[i] = d->work[i];
713 		p->hash = h;
714 		p->flags = (ispost) ? POSTSTATE : 0;
715 		if (noprogress)
716 			p->flags |= NOPROGRESS;
717 		/* lastseen to be dealt with by caller */
718 	}
719 
720 	/*
721 	 * Link new stateset to old, unless a LACON affected the result, in which
722 	 * case we don't create the link.  That forces future transitions across
723 	 * this same arc (same prior stateset and character color) to come through
724 	 * miss() again, so that we can recheck the LACON(s), which might or might
725 	 * not pass since context will be different.
726 	 */
727 	if (!sawlacons)
728 	{
729 		FDEBUG(("c%d[%d]->c%d\n",
730 				(int) (css - d->ssets), co, (int) (p - d->ssets)));
731 		css->outs[co] = p;
732 		css->inchain[co] = p->ins;
733 		p->ins.ss = css;
734 		p->ins.co = co;
735 	}
736 	return p;
737 }
738 
739 /*
740  * lacon - lookaround-constraint checker for miss()
741  */
742 static int						/* predicate:  constraint satisfied? */
lacon(struct vars * v,struct cnfa * pcnfa,chr * cp,color co)743 lacon(struct vars *v,
744 	  struct cnfa *pcnfa,		/* parent cnfa */
745 	  chr *cp,
746 	  color co)					/* "color" of the lookaround constraint */
747 {
748 	int			n;
749 	struct subre *sub;
750 	struct dfa *d;
751 	chr		   *end;
752 	int			satisfied;
753 
754 	/* Since this is recursive, it could be driven to stack overflow */
755 	if (STACK_TOO_DEEP(v->re))
756 	{
757 		ERR(REG_ETOOBIG);
758 		return 0;
759 	}
760 
761 	n = co - pcnfa->ncolors;
762 	assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
763 	FDEBUG(("=== testing lacon %d\n", n));
764 	sub = &v->g->lacons[n];
765 	d = getladfa(v, n);
766 	if (d == NULL)
767 		return 0;
768 	if (LATYPE_IS_AHEAD(sub->subno))
769 	{
770 		/* used to use longest() here, but shortest() could be much cheaper */
771 		end = shortest(v, d, cp, cp, v->stop,
772 					   (chr **) NULL, (int *) NULL);
773 		satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL);
774 	}
775 	else
776 	{
777 		/*
778 		 * To avoid doing O(N^2) work when repeatedly testing a lookbehind
779 		 * constraint in an N-character string, we use matchuntil() which can
780 		 * cache the DFA state across calls.  We only need to restart if the
781 		 * probe point decreases, which is not common.  The NFA we're using is
782 		 * a search NFA, so it doesn't mind scanning over stuff before the
783 		 * nominal match.
784 		 */
785 		satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
786 		if (!LATYPE_IS_POS(sub->subno))
787 			satisfied = !satisfied;
788 	}
789 	FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
790 	return satisfied;
791 }
792 
793 /*
794  * getvacant - get a vacant state set
795  *
796  * This routine clears out the inarcs and outarcs, but does not otherwise
797  * clear the innards of the state set -- that's up to the caller.
798  */
799 static struct sset *
getvacant(struct vars * v,struct dfa * d,chr * cp,chr * start)800 getvacant(struct vars *v,
801 		  struct dfa *d,
802 		  chr *cp,
803 		  chr *start)
804 {
805 	int			i;
806 	struct sset *ss;
807 	struct sset *p;
808 	struct arcp ap;
809 	color		co;
810 
811 	ss = pickss(v, d, cp, start);
812 	if (ss == NULL)
813 		return NULL;
814 	assert(!(ss->flags & LOCKED));
815 
816 	/* clear out its inarcs, including self-referential ones */
817 	ap = ss->ins;
818 	while ((p = ap.ss) != NULL)
819 	{
820 		co = ap.co;
821 		FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
822 		p->outs[co] = NULL;
823 		ap = p->inchain[co];
824 		p->inchain[co].ss = NULL;	/* paranoia */
825 	}
826 	ss->ins.ss = NULL;
827 
828 	/* take it off the inarc chains of the ssets reached by its outarcs */
829 	for (i = 0; i < d->ncolors; i++)
830 	{
831 		p = ss->outs[i];
832 		assert(p != ss);		/* not self-referential */
833 		if (p == NULL)
834 			continue;			/* NOTE CONTINUE */
835 		FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
836 		if (p->ins.ss == ss && p->ins.co == i)
837 			p->ins = ss->inchain[i];
838 		else
839 		{
840 			struct arcp lastap = {NULL, 0};
841 
842 			assert(p->ins.ss != NULL);
843 			for (ap = p->ins; ap.ss != NULL &&
844 				 !(ap.ss == ss && ap.co == i);
845 				 ap = ap.ss->inchain[ap.co])
846 				lastap = ap;
847 			assert(ap.ss != NULL);
848 			lastap.ss->inchain[lastap.co] = ss->inchain[i];
849 		}
850 		ss->outs[i] = NULL;
851 		ss->inchain[i].ss = NULL;
852 	}
853 
854 	/* if ss was a success state, may need to remember location */
855 	if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
856 		(d->lastpost == NULL || d->lastpost < ss->lastseen))
857 		d->lastpost = ss->lastseen;
858 
859 	/* likewise for a no-progress state */
860 	if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
861 		(d->lastnopr == NULL || d->lastnopr < ss->lastseen))
862 		d->lastnopr = ss->lastseen;
863 
864 	return ss;
865 }
866 
867 /*
868  * pickss - pick the next stateset to be used
869  */
870 static struct sset *
pickss(struct vars * v,struct dfa * d,chr * cp,chr * start)871 pickss(struct vars *v,
872 	   struct dfa *d,
873 	   chr *cp,
874 	   chr *start)
875 {
876 	int			i;
877 	struct sset *ss;
878 	struct sset *end;
879 	chr		   *ancient;
880 
881 	/* shortcut for cases where cache isn't full */
882 	if (d->nssused < d->nssets)
883 	{
884 		i = d->nssused;
885 		d->nssused++;
886 		ss = &d->ssets[i];
887 		FDEBUG(("new c%d\n", i));
888 		/* set up innards */
889 		ss->states = &d->statesarea[i * d->wordsper];
890 		ss->flags = 0;
891 		ss->ins.ss = NULL;
892 		ss->ins.co = WHITE;		/* give it some value */
893 		ss->outs = &d->outsarea[i * d->ncolors];
894 		ss->inchain = &d->incarea[i * d->ncolors];
895 		for (i = 0; i < d->ncolors; i++)
896 		{
897 			ss->outs[i] = NULL;
898 			ss->inchain[i].ss = NULL;
899 		}
900 		return ss;
901 	}
902 
903 	/* look for oldest, or old enough anyway */
904 	if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
905 		ancient = cp - d->nssets * 2 / 3;
906 	else
907 		ancient = start;
908 	for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
909 		if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
910 			!(ss->flags & LOCKED))
911 		{
912 			d->search = ss + 1;
913 			FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
914 			return ss;
915 		}
916 	for (ss = d->ssets, end = d->search; ss < end; ss++)
917 		if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
918 			!(ss->flags & LOCKED))
919 		{
920 			d->search = ss + 1;
921 			FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
922 			return ss;
923 		}
924 
925 	/* nobody's old enough?!? -- something's really wrong */
926 	FDEBUG(("cannot find victim to replace!\n"));
927 	ERR(REG_ASSERT);
928 	return NULL;
929 }
930