1 /*
2  * DFA routines
3  * This file is #included by regexec.c.
4  *
5  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
6  *
7  * Development of this software was funded, in part, by Cray Research Inc.,
8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9  * Corporation, none of whom are responsible for the results.  The author
10  * thanks all of them.
11  *
12  * Redistribution and use in source and binary forms -- with or without
13  * modification -- are permitted for any purpose, provided that
14  * redistributions in source form retain this entire copyright notice and
15  * indicate the origin and nature of any modifications.
16  *
17  * I'd appreciate being given credit for this package in the documentation
18  * of software which uses it, but that is not a requirement.
19  *
20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * src/backend/regex/rege_dfa.c
32  *
33  */
34 
35 /*
36  * longest - longest-preferred matching engine
37  *
38  * On success, returns match endpoint address.  Returns NULL on no match.
39  * Internal errors also return NULL, with v->err set.
40  */
41 static chr *
longest(struct vars * v,struct dfa * d,chr * start,chr * stop,int * hitstopp)42 longest(struct vars *v,
43 		struct dfa *d,
44 		chr *start,				/* where the match should start */
45 		chr *stop,				/* match must end at or before here */
46 		int *hitstopp)			/* record whether hit v->stop, if non-NULL */
47 {
48 	chr		   *cp;
49 	chr		   *realstop = (stop == v->stop) ? stop : stop + 1;
50 	color		co;
51 	struct sset *css;
52 	struct sset *ss;
53 	chr		   *post;
54 	int			i;
55 	struct colormap *cm = d->cm;
56 
57 	/* prevent "uninitialized variable" warnings */
58 	if (hitstopp != NULL)
59 		*hitstopp = 0;
60 
61 	/* if this is a backref to a known string, just match against that */
62 	if (d->backno >= 0)
63 	{
64 		assert((size_t) d->backno < v->nmatch);
65 		if (v->pmatch[d->backno].rm_so >= 0)
66 		{
67 			cp = dfa_backref(v, d, start, start, stop, false);
68 			if (cp == v->stop && stop == v->stop && hitstopp != NULL)
69 				*hitstopp = 1;
70 			return cp;
71 		}
72 	}
73 
74 	/* fast path for matchall NFAs */
75 	if (d->cnfa->flags & MATCHALL)
76 	{
77 		size_t		nchr = stop - start;
78 		size_t		maxmatchall = d->cnfa->maxmatchall;
79 
80 		if (nchr < d->cnfa->minmatchall)
81 			return NULL;
82 		if (maxmatchall == DUPINF)
83 		{
84 			if (stop == v->stop && hitstopp != NULL)
85 				*hitstopp = 1;
86 		}
87 		else
88 		{
89 			if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
90 				*hitstopp = 1;
91 			if (nchr > maxmatchall)
92 				return start + maxmatchall;
93 		}
94 		return stop;
95 	}
96 
97 	/* initialize */
98 	css = initialize(v, d, start);
99 	if (css == NULL)
100 		return NULL;
101 	cp = start;
102 
103 	/* startup */
104 	FDEBUG(("+++ startup +++\n"));
105 	if (cp == v->start)
106 	{
107 		co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
108 		FDEBUG(("color %ld\n", (long) co));
109 	}
110 	else
111 	{
112 		co = GETCOLOR(cm, *(cp - 1));
113 		FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
114 	}
115 	css = miss(v, d, css, co, cp, start);
116 	if (css == NULL)
117 		return NULL;
118 	css->lastseen = cp;
119 
120 	/*
121 	 * This is the main text-scanning loop.  It seems worth having two copies
122 	 * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
123 	 * builds, when you're not actively tracing.
124 	 */
125 #ifdef REG_DEBUG
126 	if (v->eflags & REG_FTRACE)
127 	{
128 		while (cp < realstop)
129 		{
130 			FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
131 			co = GETCOLOR(cm, *cp);
132 			FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
133 			ss = css->outs[co];
134 			if (ss == NULL)
135 			{
136 				ss = miss(v, d, css, co, cp + 1, start);
137 				if (ss == NULL)
138 					break;		/* NOTE BREAK OUT */
139 			}
140 			cp++;
141 			ss->lastseen = cp;
142 			css = ss;
143 		}
144 	}
145 	else
146 #endif
147 	{
148 		while (cp < realstop)
149 		{
150 			co = GETCOLOR(cm, *cp);
151 			ss = css->outs[co];
152 			if (ss == NULL)
153 			{
154 				ss = miss(v, d, css, co, cp + 1, start);
155 				if (ss == NULL)
156 					break;		/* NOTE BREAK OUT */
157 			}
158 			cp++;
159 			ss->lastseen = cp;
160 			css = ss;
161 		}
162 	}
163 
164 	if (ISERR())
165 		return NULL;
166 
167 	/* shutdown */
168 	FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
169 	if (cp == v->stop && stop == v->stop)
170 	{
171 		if (hitstopp != NULL)
172 			*hitstopp = 1;
173 		co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
174 		FDEBUG(("color %ld\n", (long) co));
175 		ss = miss(v, d, css, co, cp, start);
176 		if (ISERR())
177 			return NULL;
178 		/* special case:  match ended at eol? */
179 		if (ss != NULL && (ss->flags & POSTSTATE))
180 			return cp;
181 		else if (ss != NULL)
182 			ss->lastseen = cp;	/* to be tidy */
183 	}
184 
185 	/* find last match, if any */
186 	post = d->lastpost;
187 	for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
188 		if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
189 			(post == NULL || post < ss->lastseen))
190 			post = ss->lastseen;
191 	if (post != NULL)			/* found one */
192 		return post - 1;
193 
194 	return NULL;
195 }
196 
197 /*
198  * shortest - shortest-preferred matching engine
199  *
200  * On success, returns match endpoint address.  Returns NULL on no match.
201  * Internal errors also return NULL, with v->err set.
202  */
203 static chr *
shortest(struct vars * v,struct dfa * d,chr * start,chr * min,chr * max,chr ** coldp,int * hitstopp)204 shortest(struct vars *v,
205 		 struct dfa *d,
206 		 chr *start,			/* where the match should start */
207 		 chr *min,				/* match must end at or after here */
208 		 chr *max,				/* match must end at or before here */
209 		 chr **coldp,			/* store coldstart pointer here, if non-NULL */
210 		 int *hitstopp)			/* record whether hit v->stop, if non-NULL */
211 {
212 	chr		   *cp;
213 	chr		   *realmin = (min == v->stop) ? min : min + 1;
214 	chr		   *realmax = (max == v->stop) ? max : max + 1;
215 	color		co;
216 	struct sset *css;
217 	struct sset *ss;
218 	struct colormap *cm = d->cm;
219 
220 	/* prevent "uninitialized variable" warnings */
221 	if (coldp != NULL)
222 		*coldp = NULL;
223 	if (hitstopp != NULL)
224 		*hitstopp = 0;
225 
226 	/* if this is a backref to a known string, just match against that */
227 	if (d->backno >= 0)
228 	{
229 		assert((size_t) d->backno < v->nmatch);
230 		if (v->pmatch[d->backno].rm_so >= 0)
231 		{
232 			cp = dfa_backref(v, d, start, min, max, true);
233 			if (cp != NULL && coldp != NULL)
234 				*coldp = start;
235 			/* there is no case where we should set *hitstopp */
236 			return cp;
237 		}
238 	}
239 
240 	/* fast path for matchall NFAs */
241 	if (d->cnfa->flags & MATCHALL)
242 	{
243 		size_t		nchr = min - start;
244 
245 		if (d->cnfa->maxmatchall != DUPINF &&
246 			nchr > d->cnfa->maxmatchall)
247 			return NULL;
248 		if ((max - start) < d->cnfa->minmatchall)
249 			return NULL;
250 		if (nchr < d->cnfa->minmatchall)
251 			min = start + d->cnfa->minmatchall;
252 		if (coldp != NULL)
253 			*coldp = start;
254 		/* there is no case where we should set *hitstopp */
255 		return min;
256 	}
257 
258 	/* initialize */
259 	css = initialize(v, d, start);
260 	if (css == NULL)
261 		return NULL;
262 	cp = start;
263 
264 	/* startup */
265 	FDEBUG(("--- startup ---\n"));
266 	if (cp == v->start)
267 	{
268 		co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
269 		FDEBUG(("color %ld\n", (long) co));
270 	}
271 	else
272 	{
273 		co = GETCOLOR(cm, *(cp - 1));
274 		FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
275 	}
276 	css = miss(v, d, css, co, cp, start);
277 	if (css == NULL)
278 		return NULL;
279 	css->lastseen = cp;
280 	ss = css;
281 
282 	/*
283 	 * This is the main text-scanning loop.  It seems worth having two copies
284 	 * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
285 	 * builds, when you're not actively tracing.
286 	 */
287 #ifdef REG_DEBUG
288 	if (v->eflags & REG_FTRACE)
289 	{
290 		while (cp < realmax)
291 		{
292 			FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
293 			co = GETCOLOR(cm, *cp);
294 			FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
295 			ss = css->outs[co];
296 			if (ss == NULL)
297 			{
298 				ss = miss(v, d, css, co, cp + 1, start);
299 				if (ss == NULL)
300 					break;		/* NOTE BREAK OUT */
301 			}
302 			cp++;
303 			ss->lastseen = cp;
304 			css = ss;
305 			if ((ss->flags & POSTSTATE) && cp >= realmin)
306 				break;			/* NOTE BREAK OUT */
307 		}
308 	}
309 	else
310 #endif
311 	{
312 		while (cp < realmax)
313 		{
314 			co = GETCOLOR(cm, *cp);
315 			ss = css->outs[co];
316 			if (ss == NULL)
317 			{
318 				ss = miss(v, d, css, co, cp + 1, start);
319 				if (ss == NULL)
320 					break;		/* NOTE BREAK OUT */
321 			}
322 			cp++;
323 			ss->lastseen = cp;
324 			css = ss;
325 			if ((ss->flags & POSTSTATE) && cp >= realmin)
326 				break;			/* NOTE BREAK OUT */
327 		}
328 	}
329 
330 	if (ss == NULL)
331 		return NULL;
332 
333 	if (coldp != NULL)			/* report last no-progress state set, if any */
334 		*coldp = lastcold(v, d);
335 
336 	if ((ss->flags & POSTSTATE) && cp > min)
337 	{
338 		assert(cp >= realmin);
339 		cp--;
340 	}
341 	else if (cp == v->stop && max == v->stop)
342 	{
343 		co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
344 		FDEBUG(("color %ld\n", (long) co));
345 		ss = miss(v, d, css, co, cp, start);
346 		/* match might have ended at eol */
347 		if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
348 			*hitstopp = 1;
349 	}
350 
351 	if (ss == NULL || !(ss->flags & POSTSTATE))
352 		return NULL;
353 
354 	return cp;
355 }
356 
357 /*
358  * matchuntil - incremental matching engine
359  *
360  * This is meant for use with a search-style NFA (that is, the pattern is
361  * known to act as though it had a leading .*).  We determine whether a
362  * match exists starting at v->start and ending at probe.  Multiple calls
363  * require only O(N) time not O(N^2) so long as the probe values are
364  * nondecreasing.  *lastcss and *lastcp must be initialized to NULL before
365  * starting a series of calls.
366  *
367  * Returns 1 if a match exists, 0 if not.
368  * Internal errors also return 0, with v->err set.
369  */
370 static int
matchuntil(struct vars * v,struct dfa * d,chr * probe,struct sset ** lastcss,chr ** lastcp)371 matchuntil(struct vars *v,
372 		   struct dfa *d,
373 		   chr *probe,			/* we want to know if a match ends here */
374 		   struct sset **lastcss,	/* state storage across calls */
375 		   chr **lastcp)		/* state storage across calls */
376 {
377 	chr		   *cp = *lastcp;
378 	color		co;
379 	struct sset *css = *lastcss;
380 	struct sset *ss;
381 	struct colormap *cm = d->cm;
382 
383 	/* fast path for matchall NFAs */
384 	if (d->cnfa->flags & MATCHALL)
385 	{
386 		size_t		nchr = probe - v->start;
387 
388 		if (nchr < d->cnfa->minmatchall)
389 			return 0;
390 		/* maxmatchall will always be infinity, cf. makesearch() */
391 		assert(d->cnfa->maxmatchall == DUPINF);
392 		return 1;
393 	}
394 
395 	/* initialize and startup, or restart, if necessary */
396 	if (cp == NULL || cp > probe)
397 	{
398 		cp = v->start;
399 		css = initialize(v, d, cp);
400 		if (css == NULL)
401 			return 0;
402 
403 		FDEBUG((">>> startup >>>\n"));
404 		co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
405 		FDEBUG(("color %ld\n", (long) co));
406 
407 		css = miss(v, d, css, co, cp, v->start);
408 		if (css == NULL)
409 			return 0;
410 		css->lastseen = cp;
411 	}
412 	else if (css == NULL)
413 	{
414 		/* we previously found that no match is possible beyond *lastcp */
415 		return 0;
416 	}
417 	ss = css;
418 
419 	/*
420 	 * This is the main text-scanning loop.  It seems worth having two copies
421 	 * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
422 	 * builds, when you're not actively tracing.
423 	 */
424 #ifdef REG_DEBUG
425 	if (v->eflags & REG_FTRACE)
426 	{
427 		while (cp < probe)
428 		{
429 			FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
430 			co = GETCOLOR(cm, *cp);
431 			FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
432 			ss = css->outs[co];
433 			if (ss == NULL)
434 			{
435 				ss = miss(v, d, css, co, cp + 1, v->start);
436 				if (ss == NULL)
437 					break;		/* NOTE BREAK OUT */
438 			}
439 			cp++;
440 			ss->lastseen = cp;
441 			css = ss;
442 		}
443 	}
444 	else
445 #endif
446 	{
447 		while (cp < probe)
448 		{
449 			co = GETCOLOR(cm, *cp);
450 			ss = css->outs[co];
451 			if (ss == NULL)
452 			{
453 				ss = miss(v, d, css, co, cp + 1, v->start);
454 				if (ss == NULL)
455 					break;		/* NOTE BREAK OUT */
456 			}
457 			cp++;
458 			ss->lastseen = cp;
459 			css = ss;
460 		}
461 	}
462 
463 	*lastcss = ss;
464 	*lastcp = cp;
465 
466 	if (ss == NULL)
467 		return 0;				/* impossible match, or internal error */
468 
469 	/* We need to process one more chr, or the EOS symbol, to check match */
470 	if (cp < v->stop)
471 	{
472 		FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
473 		co = GETCOLOR(cm, *cp);
474 		FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
475 		ss = css->outs[co];
476 		if (ss == NULL)
477 			ss = miss(v, d, css, co, cp + 1, v->start);
478 	}
479 	else
480 	{
481 		assert(cp == v->stop);
482 		co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
483 		FDEBUG(("color %ld\n", (long) co));
484 		ss = miss(v, d, css, co, cp, v->start);
485 	}
486 
487 	if (ss == NULL || !(ss->flags & POSTSTATE))
488 		return 0;
489 
490 	return 1;
491 }
492 
493 /*
494  * dfa_backref - find best match length for a known backref string
495  *
496  * When the backref's referent is already available, we can deliver an exact
497  * answer with considerably less work than running the backref node's NFA.
498  *
499  * Return match endpoint for longest or shortest valid repeated match,
500  * or NULL if there is no valid match.
501  *
502  * Should be in sync with cbrdissect(), although that has the different task
503  * of checking a match to a predetermined section of the string.
504  */
505 static chr *
dfa_backref(struct vars * v,struct dfa * d,chr * start,chr * min,chr * max,bool shortest)506 dfa_backref(struct vars *v,
507 			struct dfa *d,
508 			chr *start,			/* where the match should start */
509 			chr *min,			/* match must end at or after here */
510 			chr *max,			/* match must end at or before here */
511 			bool shortest)
512 {
513 	int			n = d->backno;
514 	int			backmin = d->backmin;
515 	int			backmax = d->backmax;
516 	size_t		numreps;
517 	size_t		minreps;
518 	size_t		maxreps;
519 	size_t		brlen;
520 	chr		   *brstring;
521 	chr		   *p;
522 
523 	/* get the backreferenced string (caller should have checked this) */
524 	if (v->pmatch[n].rm_so == -1)
525 		return NULL;
526 	brstring = v->start + v->pmatch[n].rm_so;
527 	brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
528 
529 	/* special-case zero-length backreference to avoid divide by zero */
530 	if (brlen == 0)
531 	{
532 		/*
533 		 * matches only a zero-length string, but any number of repetitions
534 		 * can be considered to be present
535 		 */
536 		if (min == start && backmin <= backmax)
537 			return start;
538 		return NULL;
539 	}
540 
541 	/*
542 	 * convert min and max into numbers of possible repetitions of the backref
543 	 * string, rounding appropriately
544 	 */
545 	if (min <= start)
546 		minreps = 0;
547 	else
548 		minreps = (min - start - 1) / brlen + 1;
549 	maxreps = (max - start) / brlen;
550 
551 	/* apply bounds, then see if there is any allowed match length */
552 	if (minreps < backmin)
553 		minreps = backmin;
554 	if (backmax != DUPINF && maxreps > backmax)
555 		maxreps = backmax;
556 	if (maxreps < minreps)
557 		return NULL;
558 
559 	/* quick exit if zero-repetitions match is valid and preferred */
560 	if (shortest && minreps == 0)
561 		return start;
562 
563 	/* okay, compare the actual string contents */
564 	p = start;
565 	numreps = 0;
566 	while (numreps < maxreps)
567 	{
568 		if ((*v->g->compare) (brstring, p, brlen) != 0)
569 			break;
570 		p += brlen;
571 		numreps++;
572 		if (shortest && numreps >= minreps)
573 			break;
574 	}
575 
576 	if (numreps >= minreps)
577 		return p;
578 	return NULL;
579 }
580 
581 /*
582  * lastcold - determine last point at which no progress had been made
583  */
584 static chr *					/* endpoint, or NULL */
lastcold(struct vars * v,struct dfa * d)585 lastcold(struct vars *v,
586 		 struct dfa *d)
587 {
588 	struct sset *ss;
589 	chr		   *nopr;
590 	int			i;
591 
592 	nopr = d->lastnopr;
593 	if (nopr == NULL)
594 		nopr = v->start;
595 	for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
596 		if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
597 			nopr = ss->lastseen;
598 	return nopr;
599 }
600 
601 /*
602  * newdfa - set up a fresh DFA
603  *
604  * Returns NULL (and sets v->err) on failure.
605  */
606 static struct dfa *
newdfa(struct vars * v,struct cnfa * cnfa,struct colormap * cm,struct smalldfa * sml)607 newdfa(struct vars *v,
608 	   struct cnfa *cnfa,
609 	   struct colormap *cm,
610 	   struct smalldfa *sml)	/* preallocated space, may be NULL */
611 {
612 	struct dfa *d;
613 	size_t		nss = cnfa->nstates * 2;
614 	int			wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
615 	bool		ismalloced = false;
616 
617 	assert(cnfa != NULL && cnfa->nstates != 0);
618 
619 	if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
620 	{
621 		assert(wordsper == 1);
622 		if (sml == NULL)
623 		{
624 			sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
625 			if (sml == NULL)
626 			{
627 				ERR(REG_ESPACE);
628 				return NULL;
629 			}
630 			ismalloced = true;
631 		}
632 		d = &sml->dfa;
633 		d->ssets = sml->ssets;
634 		d->statesarea = sml->statesarea;
635 		d->work = &d->statesarea[nss];
636 		d->outsarea = sml->outsarea;
637 		d->incarea = sml->incarea;
638 		d->ismalloced = ismalloced;
639 		d->arraysmalloced = false;	/* not separately allocated, anyway */
640 	}
641 	else
642 	{
643 		d = (struct dfa *) MALLOC(sizeof(struct dfa));
644 		if (d == NULL)
645 		{
646 			ERR(REG_ESPACE);
647 			return NULL;
648 		}
649 		d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
650 		d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
651 											sizeof(unsigned));
652 		d->work = &d->statesarea[nss * wordsper];
653 		d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
654 											  sizeof(struct sset *));
655 		d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
656 											sizeof(struct arcp));
657 		d->ismalloced = true;
658 		d->arraysmalloced = true;
659 		/* now freedfa() will behave sanely */
660 		if (d->ssets == NULL || d->statesarea == NULL ||
661 			d->outsarea == NULL || d->incarea == NULL)
662 		{
663 			freedfa(d);
664 			ERR(REG_ESPACE);
665 			return NULL;
666 		}
667 	}
668 
669 	d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
670 	d->nssused = 0;
671 	d->nstates = cnfa->nstates;
672 	d->ncolors = cnfa->ncolors;
673 	d->wordsper = wordsper;
674 	d->cnfa = cnfa;
675 	d->cm = cm;
676 	d->lastpost = NULL;
677 	d->lastnopr = NULL;
678 	d->search = d->ssets;
679 	d->backno = -1;				/* may be set by caller */
680 	d->backmin = d->backmax = 0;
681 
682 	/* initialization of sset fields is done as needed */
683 
684 	return d;
685 }
686 
687 /*
688  * freedfa - free a DFA
689  */
690 static void
freedfa(struct dfa * d)691 freedfa(struct dfa *d)
692 {
693 	if (d->arraysmalloced)
694 	{
695 		if (d->ssets != NULL)
696 			FREE(d->ssets);
697 		if (d->statesarea != NULL)
698 			FREE(d->statesarea);
699 		if (d->outsarea != NULL)
700 			FREE(d->outsarea);
701 		if (d->incarea != NULL)
702 			FREE(d->incarea);
703 	}
704 
705 	if (d->ismalloced)
706 		FREE(d);
707 }
708 
709 /*
710  * hash - construct a hash code for a bitvector
711  *
712  * There are probably better ways, but they're more expensive.
713  */
714 static unsigned
hash(unsigned * uv,int n)715 hash(unsigned *uv,
716 	 int n)
717 {
718 	int			i;
719 	unsigned	h;
720 
721 	h = 0;
722 	for (i = 0; i < n; i++)
723 		h ^= uv[i];
724 	return h;
725 }
726 
727 /*
728  * initialize - hand-craft a cache entry for startup, otherwise get ready
729  */
730 static struct sset *
initialize(struct vars * v,struct dfa * d,chr * start)731 initialize(struct vars *v,
732 		   struct dfa *d,
733 		   chr *start)
734 {
735 	struct sset *ss;
736 	int			i;
737 
738 	/* is previous one still there? */
739 	if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
740 		ss = &d->ssets[0];
741 	else
742 	{							/* no, must (re)build it */
743 		ss = getvacant(v, d, start, start);
744 		if (ss == NULL)
745 			return NULL;
746 		for (i = 0; i < d->wordsper; i++)
747 			ss->states[i] = 0;
748 		BSET(ss->states, d->cnfa->pre);
749 		ss->hash = HASH(ss->states, d->wordsper);
750 		assert(d->cnfa->pre != d->cnfa->post);
751 		ss->flags = STARTER | LOCKED | NOPROGRESS;
752 		/* lastseen dealt with below */
753 	}
754 
755 	for (i = 0; i < d->nssused; i++)
756 		d->ssets[i].lastseen = NULL;
757 	ss->lastseen = start;		/* maybe untrue, but harmless */
758 	d->lastpost = NULL;
759 	d->lastnopr = NULL;
760 	return ss;
761 }
762 
763 /*
764  * miss - handle a stateset cache miss
765  *
766  * css is the current stateset, co is the color of the current input character,
767  * cp points to the character after that (which is where we may need to test
768  * LACONs).  start does not affect matching behavior but is needed for pickss'
769  * heuristics about which stateset cache entry to replace.
770  *
771  * Ordinarily, returns the address of the next stateset (the one that is
772  * valid after consuming the input character).  Returns NULL if no valid
773  * NFA states remain, ie we have a certain match failure.
774  * Internal errors also return NULL, with v->err set.
775  */
776 static struct sset *
miss(struct vars * v,struct dfa * d,struct sset * css,color co,chr * cp,chr * start)777 miss(struct vars *v,
778 	 struct dfa *d,
779 	 struct sset *css,
780 	 color co,
781 	 chr *cp,					/* next chr */
782 	 chr *start)				/* where the attempt got started */
783 {
784 	struct cnfa *cnfa = d->cnfa;
785 	int			i;
786 	unsigned	h;
787 	struct carc *ca;
788 	struct sset *p;
789 	int			ispseudocolor;
790 	int			ispost;
791 	int			noprogress;
792 	int			gotstate;
793 	int			dolacons;
794 	int			sawlacons;
795 
796 	/* for convenience, we can be called even if it might not be a miss */
797 	if (css->outs[co] != NULL)
798 	{
799 		FDEBUG(("hit\n"));
800 		return css->outs[co];
801 	}
802 	FDEBUG(("miss\n"));
803 
804 	/*
805 	 * Checking for operation cancel in the inner text search loop seems
806 	 * unduly expensive.  As a compromise, check during cache misses.
807 	 */
808 	if (CANCEL_REQUESTED(v->re))
809 	{
810 		ERR(REG_CANCEL);
811 		return NULL;
812 	}
813 
814 	/*
815 	 * What set of states would we end up in after consuming the co character?
816 	 * We first consider PLAIN arcs that consume the character, and then look
817 	 * to see what LACON arcs could be traversed after consuming it.
818 	 */
819 	for (i = 0; i < d->wordsper; i++)
820 		d->work[i] = 0;			/* build new stateset bitmap in d->work */
821 	ispseudocolor = d->cm->cd[co].flags & PSEUDO;
822 	ispost = 0;
823 	noprogress = 1;
824 	gotstate = 0;
825 	for (i = 0; i < d->nstates; i++)
826 		if (ISBSET(css->states, i))
827 			for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
828 				if (ca->co == co ||
829 					(ca->co == RAINBOW && !ispseudocolor))
830 				{
831 					BSET(d->work, ca->to);
832 					gotstate = 1;
833 					if (ca->to == cnfa->post)
834 						ispost = 1;
835 					if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
836 						noprogress = 0;
837 					FDEBUG(("%d -> %d\n", i, ca->to));
838 				}
839 	if (!gotstate)
840 		return NULL;			/* character cannot reach any new state */
841 	dolacons = (cnfa->flags & HASLACONS);
842 	sawlacons = 0;
843 	/* outer loop handles transitive closure of reachable-by-LACON states */
844 	while (dolacons)
845 	{
846 		dolacons = 0;
847 		for (i = 0; i < d->nstates; i++)
848 			if (ISBSET(d->work, i))
849 				for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
850 				{
851 					if (ca->co < cnfa->ncolors)
852 						continue;	/* not a LACON arc */
853 					if (ISBSET(d->work, ca->to))
854 						continue;	/* arc would be a no-op anyway */
855 					sawlacons = 1;	/* this LACON affects our result */
856 					if (!lacon(v, cnfa, cp, ca->co))
857 					{
858 						if (ISERR())
859 							return NULL;
860 						continue;	/* LACON arc cannot be traversed */
861 					}
862 					if (ISERR())
863 						return NULL;
864 					BSET(d->work, ca->to);
865 					dolacons = 1;
866 					if (ca->to == cnfa->post)
867 						ispost = 1;
868 					if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
869 						noprogress = 0;
870 					FDEBUG(("%d :> %d\n", i, ca->to));
871 				}
872 	}
873 	h = HASH(d->work, d->wordsper);
874 
875 	/* Is this stateset already in the cache? */
876 	for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
877 		if (HIT(h, d->work, p, d->wordsper))
878 		{
879 			FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
880 			break;				/* NOTE BREAK OUT */
881 		}
882 	if (i == 0)
883 	{							/* nope, need a new cache entry */
884 		p = getvacant(v, d, cp, start);
885 		if (p == NULL)
886 			return NULL;
887 		assert(p != css);
888 		for (i = 0; i < d->wordsper; i++)
889 			p->states[i] = d->work[i];
890 		p->hash = h;
891 		p->flags = (ispost) ? POSTSTATE : 0;
892 		if (noprogress)
893 			p->flags |= NOPROGRESS;
894 		/* lastseen to be dealt with by caller */
895 	}
896 
897 	/*
898 	 * Link new stateset to old, unless a LACON affected the result, in which
899 	 * case we don't create the link.  That forces future transitions across
900 	 * this same arc (same prior stateset and character color) to come through
901 	 * miss() again, so that we can recheck the LACON(s), which might or might
902 	 * not pass since context will be different.
903 	 */
904 	if (!sawlacons)
905 	{
906 		FDEBUG(("c%d[%d]->c%d\n",
907 				(int) (css - d->ssets), co, (int) (p - d->ssets)));
908 		css->outs[co] = p;
909 		css->inchain[co] = p->ins;
910 		p->ins.ss = css;
911 		p->ins.co = co;
912 	}
913 	return p;
914 }
915 
916 /*
917  * lacon - lookaround-constraint checker for miss()
918  */
919 static int						/* predicate:  constraint satisfied? */
lacon(struct vars * v,struct cnfa * pcnfa,chr * cp,color co)920 lacon(struct vars *v,
921 	  struct cnfa *pcnfa,		/* parent cnfa */
922 	  chr *cp,
923 	  color co)					/* "color" of the lookaround constraint */
924 {
925 	int			n;
926 	struct subre *sub;
927 	struct dfa *d;
928 	chr		   *end;
929 	int			satisfied;
930 
931 	/* Since this is recursive, it could be driven to stack overflow */
932 	if (STACK_TOO_DEEP(v->re))
933 	{
934 		ERR(REG_ETOOBIG);
935 		return 0;
936 	}
937 
938 	n = co - pcnfa->ncolors;
939 	assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
940 	FDEBUG(("=== testing lacon %d\n", n));
941 	sub = &v->g->lacons[n];
942 	d = getladfa(v, n);
943 	if (d == NULL)
944 		return 0;
945 	if (LATYPE_IS_AHEAD(sub->latype))
946 	{
947 		/* used to use longest() here, but shortest() could be much cheaper */
948 		end = shortest(v, d, cp, cp, v->stop,
949 					   (chr **) NULL, (int *) NULL);
950 		satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL);
951 	}
952 	else
953 	{
954 		/*
955 		 * To avoid doing O(N^2) work when repeatedly testing a lookbehind
956 		 * constraint in an N-character string, we use matchuntil() which can
957 		 * cache the DFA state across calls.  We only need to restart if the
958 		 * probe point decreases, which is not common.  The NFA we're using is
959 		 * a search NFA, so it doesn't mind scanning over stuff before the
960 		 * nominal match.
961 		 */
962 		satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
963 		if (!LATYPE_IS_POS(sub->latype))
964 			satisfied = !satisfied;
965 	}
966 	FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
967 	return satisfied;
968 }
969 
970 /*
971  * getvacant - get a vacant state set
972  *
973  * This routine clears out the inarcs and outarcs, but does not otherwise
974  * clear the innards of the state set -- that's up to the caller.
975  */
976 static struct sset *
getvacant(struct vars * v,struct dfa * d,chr * cp,chr * start)977 getvacant(struct vars *v,
978 		  struct dfa *d,
979 		  chr *cp,
980 		  chr *start)
981 {
982 	int			i;
983 	struct sset *ss;
984 	struct sset *p;
985 	struct arcp ap;
986 	color		co;
987 
988 	ss = pickss(v, d, cp, start);
989 	if (ss == NULL)
990 		return NULL;
991 	assert(!(ss->flags & LOCKED));
992 
993 	/* clear out its inarcs, including self-referential ones */
994 	ap = ss->ins;
995 	while ((p = ap.ss) != NULL)
996 	{
997 		co = ap.co;
998 		FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
999 		p->outs[co] = NULL;
1000 		ap = p->inchain[co];
1001 		p->inchain[co].ss = NULL;	/* paranoia */
1002 	}
1003 	ss->ins.ss = NULL;
1004 
1005 	/* take it off the inarc chains of the ssets reached by its outarcs */
1006 	for (i = 0; i < d->ncolors; i++)
1007 	{
1008 		p = ss->outs[i];
1009 		assert(p != ss);		/* not self-referential */
1010 		if (p == NULL)
1011 			continue;			/* NOTE CONTINUE */
1012 		FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
1013 		if (p->ins.ss == ss && p->ins.co == i)
1014 			p->ins = ss->inchain[i];
1015 		else
1016 		{
1017 			struct arcp lastap = {NULL, 0};
1018 
1019 			assert(p->ins.ss != NULL);
1020 			for (ap = p->ins; ap.ss != NULL &&
1021 				 !(ap.ss == ss && ap.co == i);
1022 				 ap = ap.ss->inchain[ap.co])
1023 				lastap = ap;
1024 			assert(ap.ss != NULL);
1025 			lastap.ss->inchain[lastap.co] = ss->inchain[i];
1026 		}
1027 		ss->outs[i] = NULL;
1028 		ss->inchain[i].ss = NULL;
1029 	}
1030 
1031 	/* if ss was a success state, may need to remember location */
1032 	if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
1033 		(d->lastpost == NULL || d->lastpost < ss->lastseen))
1034 		d->lastpost = ss->lastseen;
1035 
1036 	/* likewise for a no-progress state */
1037 	if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
1038 		(d->lastnopr == NULL || d->lastnopr < ss->lastseen))
1039 		d->lastnopr = ss->lastseen;
1040 
1041 	return ss;
1042 }
1043 
1044 /*
1045  * pickss - pick the next stateset to be used
1046  */
1047 static struct sset *
pickss(struct vars * v,struct dfa * d,chr * cp,chr * start)1048 pickss(struct vars *v,
1049 	   struct dfa *d,
1050 	   chr *cp,
1051 	   chr *start)
1052 {
1053 	int			i;
1054 	struct sset *ss;
1055 	struct sset *end;
1056 	chr		   *ancient;
1057 
1058 	/* shortcut for cases where cache isn't full */
1059 	if (d->nssused < d->nssets)
1060 	{
1061 		i = d->nssused;
1062 		d->nssused++;
1063 		ss = &d->ssets[i];
1064 		FDEBUG(("new c%d\n", i));
1065 		/* set up innards */
1066 		ss->states = &d->statesarea[i * d->wordsper];
1067 		ss->flags = 0;
1068 		ss->ins.ss = NULL;
1069 		ss->ins.co = WHITE;		/* give it some value */
1070 		ss->outs = &d->outsarea[i * d->ncolors];
1071 		ss->inchain = &d->incarea[i * d->ncolors];
1072 		for (i = 0; i < d->ncolors; i++)
1073 		{
1074 			ss->outs[i] = NULL;
1075 			ss->inchain[i].ss = NULL;
1076 		}
1077 		return ss;
1078 	}
1079 
1080 	/* look for oldest, or old enough anyway */
1081 	if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
1082 		ancient = cp - d->nssets * 2 / 3;
1083 	else
1084 		ancient = start;
1085 	for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
1086 		if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
1087 			!(ss->flags & LOCKED))
1088 		{
1089 			d->search = ss + 1;
1090 			FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
1091 			return ss;
1092 		}
1093 	for (ss = d->ssets, end = d->search; ss < end; ss++)
1094 		if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
1095 			!(ss->flags & LOCKED))
1096 		{
1097 			d->search = ss + 1;
1098 			FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
1099 			return ss;
1100 		}
1101 
1102 	/* nobody's old enough?!? -- something's really wrong */
1103 	FDEBUG(("cannot find victim to replace!\n"));
1104 	ERR(REG_ASSERT);
1105 	return NULL;
1106 }
1107