xref: /freebsd/lib/libc/regex/regcomp.c (revision c697fb7f)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5  * Copyright (c) 1992, 1993, 1994
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Copyright (c) 2011 The FreeBSD Foundation
9  * All rights reserved.
10  * Portions of this software were developed by David Chisnall
11  * under sponsorship from the FreeBSD Foundation.
12  *
13  * This code is derived from software contributed to Berkeley by
14  * Henry Spencer.
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  * 1. Redistributions of source code must retain the above copyright
20  *    notice, this list of conditions and the following disclaimer.
21  * 2. Redistributions in binary form must reproduce the above copyright
22  *    notice, this list of conditions and the following disclaimer in the
23  *    documentation and/or other materials provided with the distribution.
24  * 3. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
41  */
42 
43 #if defined(LIBC_SCCS) && !defined(lint)
44 static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
45 #endif /* LIBC_SCCS and not lint */
46 #include <sys/cdefs.h>
47 __FBSDID("$FreeBSD$");
48 
49 #include <sys/types.h>
50 #include <stdio.h>
51 #include <string.h>
52 #include <ctype.h>
53 #include <limits.h>
54 #include <stdlib.h>
55 #include <regex.h>
56 #include <stdbool.h>
57 #include <wchar.h>
58 #include <wctype.h>
59 
60 #ifndef LIBREGEX
61 #include "collate.h"
62 #endif
63 
64 #include "utils.h"
65 #include "regex2.h"
66 
67 #include "cname.h"
68 
69 /*
70  * Branching context, used to keep track of branch state for all of the branch-
71  * aware functions. In addition to keeping track of branch positions for the
72  * p_branch_* functions, we use this to simplify some clumsiness in BREs for
73  * detection of whether ^ is acting as an anchor or being used erroneously and
74  * also for whether we're in a sub-expression or not.
75  */
76 struct branchc {
77 	sopno start;
78 	sopno back;
79 	sopno fwd;
80 
81 	int nbranch;
82 	int nchain;
83 	bool outer;
84 	bool terminate;
85 };
86 
87 /*
88  * parse structure, passed up and down to avoid global variables and
89  * other clumsinesses
90  */
91 struct parse {
92 	const char *next;	/* next character in RE */
93 	const char *end;	/* end of string (-> NUL normally) */
94 	int error;		/* has an error been seen? */
95 	sop *strip;		/* malloced strip */
96 	sopno ssize;		/* malloced strip size (allocated) */
97 	sopno slen;		/* malloced strip length (used) */
98 	int ncsalloc;		/* number of csets allocated */
99 	struct re_guts *g;
100 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
101 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
102 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
103 	bool allowbranch;	/* can this expression branch? */
104 	bool bre;		/* convenience; is this a BRE? */
105 	bool (*parse_expr)(struct parse *, struct branchc *);
106 	void (*pre_parse)(struct parse *, struct branchc *);
107 	void (*post_parse)(struct parse *, struct branchc *);
108 };
109 
110 /* ========= begin header generated by ./mkh ========= */
111 #ifdef __cplusplus
112 extern "C" {
113 #endif
114 
115 /* === regcomp.c === */
116 static bool p_ere_exp(struct parse *p, struct branchc *bc);
117 static void p_str(struct parse *p);
118 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
119 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
120 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
121 static bool p_branch_empty(struct parse *p, struct branchc *bc);
122 static bool p_branch_do(struct parse *p, struct branchc *bc);
123 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
124 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
125 static void p_re(struct parse *p, int end1, int end2);
126 static bool p_simp_re(struct parse *p, struct branchc *bc);
127 static int p_count(struct parse *p);
128 static void p_bracket(struct parse *p);
129 static int p_range_cmp(wchar_t c1, wchar_t c2);
130 static void p_b_term(struct parse *p, cset *cs);
131 static void p_b_cclass(struct parse *p, cset *cs);
132 static void p_b_eclass(struct parse *p, cset *cs);
133 static wint_t p_b_symbol(struct parse *p);
134 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
135 static wint_t othercase(wint_t ch);
136 static void bothcases(struct parse *p, wint_t ch);
137 static void ordinary(struct parse *p, wint_t ch);
138 static void nonnewline(struct parse *p);
139 static void repeat(struct parse *p, sopno start, int from, int to);
140 static int seterr(struct parse *p, int e);
141 static cset *allocset(struct parse *p);
142 static void freeset(struct parse *p, cset *cs);
143 static void CHadd(struct parse *p, cset *cs, wint_t ch);
144 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
145 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
146 static wint_t singleton(cset *cs);
147 static sopno dupl(struct parse *p, sopno start, sopno finish);
148 static void doemit(struct parse *p, sop op, size_t opnd);
149 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
150 static void dofwd(struct parse *p, sopno pos, sop value);
151 static int enlarge(struct parse *p, sopno size);
152 static void stripsnug(struct parse *p, struct re_guts *g);
153 static void findmust(struct parse *p, struct re_guts *g);
154 static int altoffset(sop *scan, int offset);
155 static void computejumps(struct parse *p, struct re_guts *g);
156 static void computematchjumps(struct parse *p, struct re_guts *g);
157 static sopno pluscount(struct parse *p, struct re_guts *g);
158 static wint_t wgetnext(struct parse *p);
159 
160 #ifdef __cplusplus
161 }
162 #endif
163 /* ========= end header generated by ./mkh ========= */
164 
165 static char nuls[10];		/* place to point scanner in event of error */
166 
167 /*
168  * macros for use with parse structure
169  * BEWARE:  these know that the parse structure is named `p' !!!
170  */
171 #define	PEEK()	(*p->next)
172 #define	PEEK2()	(*(p->next+1))
173 #define	MORE()	(p->next < p->end)
174 #define	MORE2()	(p->next+1 < p->end)
175 #define	SEE(c)	(MORE() && PEEK() == (c))
176 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
177 #define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
178 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
179 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
180 #define	NEXT()	(p->next++)
181 #define	NEXT2()	(p->next += 2)
182 #define	NEXTn(n)	(p->next += (n))
183 #define	GETNEXT()	(*p->next++)
184 #define	WGETNEXT()	wgetnext(p)
185 #define	SETERROR(e)	seterr(p, (e))
186 #define	REQUIRE(co, e)	((co) || SETERROR(e))
187 #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
188 #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
189 #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
190 #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
191 #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
192 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
193 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
194 #define	HERE()		(p->slen)
195 #define	THERE()		(p->slen - 1)
196 #define	THERETHERE()	(p->slen - 2)
197 #define	DROP(n)	(p->slen -= (n))
198 
199 /* Macro used by computejump()/computematchjump() */
200 #define MIN(a,b)	((a)<(b)?(a):(b))
201 
202 /*
203  - regcomp - interface for parser and compilation
204  = extern int regcomp(regex_t *, const char *, int);
205  = #define	REG_BASIC	0000
206  = #define	REG_EXTENDED	0001
207  = #define	REG_ICASE	0002
208  = #define	REG_NOSUB	0004
209  = #define	REG_NEWLINE	0010
210  = #define	REG_NOSPEC	0020
211  = #define	REG_PEND	0040
212  = #define	REG_DUMP	0200
213  */
214 int				/* 0 success, otherwise REG_something */
215 regcomp(regex_t * __restrict preg,
216 	const char * __restrict pattern,
217 	int cflags)
218 {
219 	struct parse pa;
220 	struct re_guts *g;
221 	struct parse *p = &pa;
222 	int i;
223 	size_t len;
224 	size_t maxlen;
225 #ifdef REDEBUG
226 #	define	GOODFLAGS(f)	(f)
227 #else
228 #	define	GOODFLAGS(f)	((f)&~REG_DUMP)
229 #endif
230 
231 	cflags = GOODFLAGS(cflags);
232 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
233 		return(REG_INVARG);
234 
235 	if (cflags&REG_PEND) {
236 		if (preg->re_endp < pattern)
237 			return(REG_INVARG);
238 		len = preg->re_endp - pattern;
239 	} else
240 		len = strlen(pattern);
241 
242 	/* do the mallocs early so failure handling is easy */
243 	g = (struct re_guts *)malloc(sizeof(struct re_guts));
244 	if (g == NULL)
245 		return(REG_ESPACE);
246 	/*
247 	 * Limit the pattern space to avoid a 32-bit overflow on buffer
248 	 * extension.  Also avoid any signed overflow in case of conversion
249 	 * so make the real limit based on a 31-bit overflow.
250 	 *
251 	 * Likely not applicable on 64-bit systems but handle the case
252 	 * generically (who are we to stop people from using ~715MB+
253 	 * patterns?).
254 	 */
255 	maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3;
256 	if (len >= maxlen) {
257 		free((char *)g);
258 		return(REG_ESPACE);
259 	}
260 	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
261 	assert(p->ssize >= len);
262 
263 	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
264 	p->slen = 0;
265 	if (p->strip == NULL) {
266 		free((char *)g);
267 		return(REG_ESPACE);
268 	}
269 
270 	/* set things up */
271 	p->g = g;
272 	p->next = pattern;	/* convenience; we do not modify it */
273 	p->end = p->next + len;
274 	p->error = 0;
275 	p->ncsalloc = 0;
276 	for (i = 0; i < NPAREN; i++) {
277 		p->pbegin[i] = 0;
278 		p->pend[i] = 0;
279 	}
280 	if (cflags & REG_EXTENDED) {
281 		p->allowbranch = true;
282 		p->bre = false;
283 		p->parse_expr = p_ere_exp;
284 		p->pre_parse = NULL;
285 		p->post_parse = NULL;
286 	} else {
287 		p->allowbranch = false;
288 		p->bre = true;
289 		p->parse_expr = p_simp_re;
290 		p->pre_parse = p_bre_pre_parse;
291 		p->post_parse = p_bre_post_parse;
292 	}
293 	g->sets = NULL;
294 	g->ncsets = 0;
295 	g->cflags = cflags;
296 	g->iflags = 0;
297 	g->nbol = 0;
298 	g->neol = 0;
299 	g->must = NULL;
300 	g->moffset = -1;
301 	g->charjump = NULL;
302 	g->matchjump = NULL;
303 	g->mlen = 0;
304 	g->nsub = 0;
305 	g->backrefs = 0;
306 
307 	/* do it */
308 	EMIT(OEND, 0);
309 	g->firststate = THERE();
310 	if (cflags & REG_NOSPEC)
311 		p_str(p);
312 	else
313 		p_re(p, OUT, OUT);
314 	EMIT(OEND, 0);
315 	g->laststate = THERE();
316 
317 	/* tidy up loose ends and fill things in */
318 	stripsnug(p, g);
319 	findmust(p, g);
320 	/* only use Boyer-Moore algorithm if the pattern is bigger
321 	 * than three characters
322 	 */
323 	if(g->mlen > 3) {
324 		computejumps(p, g);
325 		computematchjumps(p, g);
326 		if(g->matchjump == NULL && g->charjump != NULL) {
327 			free(g->charjump);
328 			g->charjump = NULL;
329 		}
330 	}
331 	g->nplus = pluscount(p, g);
332 	g->magic = MAGIC2;
333 	preg->re_nsub = g->nsub;
334 	preg->re_g = g;
335 	preg->re_magic = MAGIC1;
336 #ifndef REDEBUG
337 	/* not debugging, so can't rely on the assert() in regexec() */
338 	if (g->iflags&BAD)
339 		SETERROR(REG_ASSERT);
340 #endif
341 
342 	/* win or lose, we're done */
343 	if (p->error != 0)	/* lose */
344 		regfree(preg);
345 	return(p->error);
346 }
347 
348 /*
349  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
350  - return whether we should terminate or not
351  == static bool p_ere_exp(struct parse *p);
352  */
353 static bool
354 p_ere_exp(struct parse *p, struct branchc *bc)
355 {
356 	char c;
357 	wint_t wc;
358 	sopno pos;
359 	int count;
360 	int count2;
361 	sopno subno;
362 	int wascaret = 0;
363 
364 	(void)bc;
365 	assert(MORE());		/* caller should have ensured this */
366 	c = GETNEXT();
367 
368 	pos = HERE();
369 	switch (c) {
370 	case '(':
371 		(void)REQUIRE(MORE(), REG_EPAREN);
372 		p->g->nsub++;
373 		subno = p->g->nsub;
374 		if (subno < NPAREN)
375 			p->pbegin[subno] = HERE();
376 		EMIT(OLPAREN, subno);
377 		if (!SEE(')'))
378 			p_re(p, ')', IGN);
379 		if (subno < NPAREN) {
380 			p->pend[subno] = HERE();
381 			assert(p->pend[subno] != 0);
382 		}
383 		EMIT(ORPAREN, subno);
384 		(void)MUSTEAT(')', REG_EPAREN);
385 		break;
386 #ifndef POSIX_MISTAKE
387 	case ')':		/* happens only if no current unmatched ( */
388 		/*
389 		 * You may ask, why the ifndef?  Because I didn't notice
390 		 * this until slightly too late for 1003.2, and none of the
391 		 * other 1003.2 regular-expression reviewers noticed it at
392 		 * all.  So an unmatched ) is legal POSIX, at least until
393 		 * we can get it fixed.
394 		 */
395 		SETERROR(REG_EPAREN);
396 		break;
397 #endif
398 	case '^':
399 		EMIT(OBOL, 0);
400 		p->g->iflags |= USEBOL;
401 		p->g->nbol++;
402 		wascaret = 1;
403 		break;
404 	case '$':
405 		EMIT(OEOL, 0);
406 		p->g->iflags |= USEEOL;
407 		p->g->neol++;
408 		break;
409 	case '|':
410 		SETERROR(REG_EMPTY);
411 		break;
412 	case '*':
413 	case '+':
414 	case '?':
415 	case '{':
416 		SETERROR(REG_BADRPT);
417 		break;
418 	case '.':
419 		if (p->g->cflags&REG_NEWLINE)
420 			nonnewline(p);
421 		else
422 			EMIT(OANY, 0);
423 		break;
424 	case '[':
425 		p_bracket(p);
426 		break;
427 	case '\\':
428 		(void)REQUIRE(MORE(), REG_EESCAPE);
429 		wc = WGETNEXT();
430 		switch (wc) {
431 		case '<':
432 			EMIT(OBOW, 0);
433 			break;
434 		case '>':
435 			EMIT(OEOW, 0);
436 			break;
437 		default:
438 			ordinary(p, wc);
439 			break;
440 		}
441 		break;
442 	default:
443 		if (p->error != 0)
444 			return (false);
445 		p->next--;
446 		wc = WGETNEXT();
447 		ordinary(p, wc);
448 		break;
449 	}
450 
451 	if (!MORE())
452 		return (false);
453 	c = PEEK();
454 	/* we call { a repetition if followed by a digit */
455 	if (!( c == '*' || c == '+' || c == '?' || c == '{'))
456 		return (false);		/* no repetition, we're done */
457 	else if (c == '{')
458 		(void)REQUIRE(MORE2() && \
459 		    (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
460 	NEXT();
461 
462 	(void)REQUIRE(!wascaret, REG_BADRPT);
463 	switch (c) {
464 	case '*':	/* implemented as +? */
465 		/* this case does not require the (y|) trick, noKLUDGE */
466 		INSERT(OPLUS_, pos);
467 		ASTERN(O_PLUS, pos);
468 		INSERT(OQUEST_, pos);
469 		ASTERN(O_QUEST, pos);
470 		break;
471 	case '+':
472 		INSERT(OPLUS_, pos);
473 		ASTERN(O_PLUS, pos);
474 		break;
475 	case '?':
476 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
477 		INSERT(OCH_, pos);		/* offset slightly wrong */
478 		ASTERN(OOR1, pos);		/* this one's right */
479 		AHEAD(pos);			/* fix the OCH_ */
480 		EMIT(OOR2, 0);			/* offset very wrong... */
481 		AHEAD(THERE());			/* ...so fix it */
482 		ASTERN(O_CH, THERETHERE());
483 		break;
484 	case '{':
485 		count = p_count(p);
486 		if (EAT(',')) {
487 			if (isdigit((uch)PEEK())) {
488 				count2 = p_count(p);
489 				(void)REQUIRE(count <= count2, REG_BADBR);
490 			} else		/* single number with comma */
491 				count2 = INFINITY;
492 		} else		/* just a single number */
493 			count2 = count;
494 		repeat(p, pos, count, count2);
495 		if (!EAT('}')) {	/* error heuristics */
496 			while (MORE() && PEEK() != '}')
497 				NEXT();
498 			(void)REQUIRE(MORE(), REG_EBRACE);
499 			SETERROR(REG_BADBR);
500 		}
501 		break;
502 	}
503 
504 	if (!MORE())
505 		return (false);
506 	c = PEEK();
507 	if (!( c == '*' || c == '+' || c == '?' ||
508 				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
509 		return (false);
510 	SETERROR(REG_BADRPT);
511 	return (false);
512 }
513 
514 /*
515  - p_str - string (no metacharacters) "parser"
516  == static void p_str(struct parse *p);
517  */
518 static void
519 p_str(struct parse *p)
520 {
521 	(void)REQUIRE(MORE(), REG_EMPTY);
522 	while (MORE())
523 		ordinary(p, WGETNEXT());
524 }
525 
526 /*
527  * Eat consecutive branch delimiters for the kind of expression that we are
528  * parsing, return the number of delimiters that we ate.
529  */
530 static int
531 p_branch_eat_delim(struct parse *p, struct branchc *bc)
532 {
533 	int nskip;
534 
535 	(void)bc;
536 	nskip = 0;
537 	while (EAT('|'))
538 		++nskip;
539 	return (nskip);
540 }
541 
542 /*
543  * Insert necessary branch book-keeping operations. This emits a
544  * bogus 'next' offset, since we still have more to parse
545  */
546 static void
547 p_branch_ins_offset(struct parse *p, struct branchc *bc)
548 {
549 
550 	if (bc->nbranch == 0) {
551 		INSERT(OCH_, bc->start);	/* offset is wrong */
552 		bc->fwd = bc->start;
553 		bc->back = bc->start;
554 	}
555 
556 	ASTERN(OOR1, bc->back);
557 	bc->back = THERE();
558 	AHEAD(bc->fwd);			/* fix previous offset */
559 	bc->fwd = HERE();
560 	EMIT(OOR2, 0);			/* offset is very wrong */
561 	++bc->nbranch;
562 }
563 
564 /*
565  * Fix the offset of the tail branch, if we actually had any branches.
566  * This is to correct the bogus placeholder offset that we use.
567  */
568 static void
569 p_branch_fix_tail(struct parse *p, struct branchc *bc)
570 {
571 
572 	/* Fix bogus offset at the tail if we actually have branches */
573 	if (bc->nbranch > 0) {
574 		AHEAD(bc->fwd);
575 		ASTERN(O_CH, bc->back);
576 	}
577 }
578 
579 /*
580  * Signal to the parser that an empty branch has been encountered; this will,
581  * in the future, be used to allow for more permissive behavior with empty
582  * branches. The return value should indicate whether parsing may continue
583  * or not.
584  */
585 static bool
586 p_branch_empty(struct parse *p, struct branchc *bc)
587 {
588 
589 	(void)bc;
590 	SETERROR(REG_EMPTY);
591 	return (false);
592 }
593 
594 /*
595  * Take care of any branching requirements. This includes inserting the
596  * appropriate branching instructions as well as eating all of the branch
597  * delimiters until we either run out of pattern or need to parse more pattern.
598  */
599 static bool
600 p_branch_do(struct parse *p, struct branchc *bc)
601 {
602 	int ate = 0;
603 
604 	ate = p_branch_eat_delim(p, bc);
605 	if (ate == 0)
606 		return (false);
607 	else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
608 		/*
609 		 * Halt parsing only if we have an empty branch and p_branch_empty
610 		 * indicates that we must not continue. In the future, this will not
611 		 * necessarily be an error.
612 		 */
613 		return (false);
614 	p_branch_ins_offset(p, bc);
615 
616 	return (true);
617 }
618 
619 static void
620 p_bre_pre_parse(struct parse *p, struct branchc *bc)
621 {
622 
623 	(void) bc;
624 	/*
625 	 * Does not move cleanly into expression parser because of
626 	 * ordinary interpration of * at the beginning position of
627 	 * an expression.
628 	 */
629 	if (EAT('^')) {
630 		EMIT(OBOL, 0);
631 		p->g->iflags |= USEBOL;
632 		p->g->nbol++;
633 	}
634 }
635 
636 static void
637 p_bre_post_parse(struct parse *p, struct branchc *bc)
638 {
639 
640 	/* Expression is terminating due to EOL token */
641 	if (bc->terminate) {
642 		DROP(1);
643 		EMIT(OEOL, 0);
644 		p->g->iflags |= USEEOL;
645 		p->g->neol++;
646 	}
647 }
648 
649 /*
650  - p_re - Top level parser, concatenation and BRE anchoring
651  == static void p_re(struct parse *p, int end1, int end2);
652  * Giving end1 as OUT essentially eliminates the end1/end2 check.
653  *
654  * This implementation is a bit of a kludge, in that a trailing $ is first
655  * taken as an ordinary character and then revised to be an anchor.
656  * The amount of lookahead needed to avoid this kludge is excessive.
657  */
658 static void
659 p_re(struct parse *p,
660 	int end1,	/* first terminating character */
661 	int end2)	/* second terminating character; ignored for EREs */
662 {
663 	struct branchc bc;
664 
665 	bc.nbranch = 0;
666 	if (end1 == OUT && end2 == OUT)
667 		bc.outer = true;
668 	else
669 		bc.outer = false;
670 #define	SEEEND()	(!p->bre ? SEE(end1) : SEETWO(end1, end2))
671 	for (;;) {
672 		bc.start = HERE();
673 		bc.nchain = 0;
674 		bc.terminate = false;
675 		if (p->pre_parse != NULL)
676 			p->pre_parse(p, &bc);
677 		while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
678 			bc.terminate = p->parse_expr(p, &bc);
679 			++bc.nchain;
680 		}
681 		if (p->post_parse != NULL)
682 			p->post_parse(p, &bc);
683 		(void) REQUIRE(HERE() != bc.start, REG_EMPTY);
684 		if (!p->allowbranch)
685 			break;
686 		/*
687 		 * p_branch_do's return value indicates whether we should
688 		 * continue parsing or not. This is both for correctness and
689 		 * a slight optimization, because it will check if we've
690 		 * encountered an empty branch or the end of the string
691 		 * immediately following a branch delimiter.
692 		 */
693 		if (!p_branch_do(p, &bc))
694 			break;
695 	}
696 #undef SEE_END
697 	if (p->allowbranch)
698 		p_branch_fix_tail(p, &bc);
699 	assert(!MORE() || SEE(end1));
700 }
701 
702 /*
703  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
704  == static bool p_simp_re(struct parse *p, struct branchc *bc);
705  */
706 static bool			/* was the simple RE an unbackslashed $? */
707 p_simp_re(struct parse *p, struct branchc *bc)
708 {
709 	int c;
710 	int count;
711 	int count2;
712 	sopno pos;
713 	int i;
714 	wint_t wc;
715 	sopno subno;
716 #	define	BACKSL	(1<<CHAR_BIT)
717 
718 	pos = HERE();		/* repetition op, if any, covers from here */
719 
720 	assert(MORE());		/* caller should have ensured this */
721 	c = GETNEXT();
722 	if (c == '\\') {
723 		(void)REQUIRE(MORE(), REG_EESCAPE);
724 		c = BACKSL | GETNEXT();
725 	}
726 	switch (c) {
727 	case '.':
728 		if (p->g->cflags&REG_NEWLINE)
729 			nonnewline(p);
730 		else
731 			EMIT(OANY, 0);
732 		break;
733 	case '[':
734 		p_bracket(p);
735 		break;
736 	case BACKSL|'<':
737 		EMIT(OBOW, 0);
738 		break;
739 	case BACKSL|'>':
740 		EMIT(OEOW, 0);
741 		break;
742 	case BACKSL|'{':
743 		SETERROR(REG_BADRPT);
744 		break;
745 	case BACKSL|'(':
746 		p->g->nsub++;
747 		subno = p->g->nsub;
748 		if (subno < NPAREN)
749 			p->pbegin[subno] = HERE();
750 		EMIT(OLPAREN, subno);
751 		/* the MORE here is an error heuristic */
752 		if (MORE() && !SEETWO('\\', ')'))
753 			p_re(p, '\\', ')');
754 		if (subno < NPAREN) {
755 			p->pend[subno] = HERE();
756 			assert(p->pend[subno] != 0);
757 		}
758 		EMIT(ORPAREN, subno);
759 		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
760 		break;
761 	case BACKSL|')':	/* should not get here -- must be user */
762 		SETERROR(REG_EPAREN);
763 		break;
764 	case BACKSL|'1':
765 	case BACKSL|'2':
766 	case BACKSL|'3':
767 	case BACKSL|'4':
768 	case BACKSL|'5':
769 	case BACKSL|'6':
770 	case BACKSL|'7':
771 	case BACKSL|'8':
772 	case BACKSL|'9':
773 		i = (c&~BACKSL) - '0';
774 		assert(i < NPAREN);
775 		if (p->pend[i] != 0) {
776 			assert(i <= p->g->nsub);
777 			EMIT(OBACK_, i);
778 			assert(p->pbegin[i] != 0);
779 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
780 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
781 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
782 			EMIT(O_BACK, i);
783 		} else
784 			SETERROR(REG_ESUBREG);
785 		p->g->backrefs = 1;
786 		break;
787 	case '*':
788 		/*
789 		 * Ordinary if used as the first character beyond BOL anchor of
790 		 * a (sub-)expression, counts as a bad repetition operator if it
791 		 * appears otherwise.
792 		 */
793 		(void)REQUIRE(bc->nchain == 0, REG_BADRPT);
794 		/* FALLTHROUGH */
795 	default:
796 		if (p->error != 0)
797 			return (false);	/* Definitely not $... */
798 		p->next--;
799 		wc = WGETNEXT();
800 		ordinary(p, wc);
801 		break;
802 	}
803 
804 	if (EAT('*')) {		/* implemented as +? */
805 		/* this case does not require the (y|) trick, noKLUDGE */
806 		INSERT(OPLUS_, pos);
807 		ASTERN(O_PLUS, pos);
808 		INSERT(OQUEST_, pos);
809 		ASTERN(O_QUEST, pos);
810 	} else if (EATTWO('\\', '{')) {
811 		count = p_count(p);
812 		if (EAT(',')) {
813 			if (MORE() && isdigit((uch)PEEK())) {
814 				count2 = p_count(p);
815 				(void)REQUIRE(count <= count2, REG_BADBR);
816 			} else		/* single number with comma */
817 				count2 = INFINITY;
818 		} else		/* just a single number */
819 			count2 = count;
820 		repeat(p, pos, count, count2);
821 		if (!EATTWO('\\', '}')) {	/* error heuristics */
822 			while (MORE() && !SEETWO('\\', '}'))
823 				NEXT();
824 			(void)REQUIRE(MORE(), REG_EBRACE);
825 			SETERROR(REG_BADBR);
826 		}
827 	} else if (c == '$')     /* $ (but not \$) ends it */
828 		return (true);
829 
830 	return (false);
831 }
832 
833 /*
834  - p_count - parse a repetition count
835  == static int p_count(struct parse *p);
836  */
837 static int			/* the value */
838 p_count(struct parse *p)
839 {
840 	int count = 0;
841 	int ndigits = 0;
842 
843 	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
844 		count = count*10 + (GETNEXT() - '0');
845 		ndigits++;
846 	}
847 
848 	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
849 	return(count);
850 }
851 
852 /*
853  - p_bracket - parse a bracketed character list
854  == static void p_bracket(struct parse *p);
855  */
856 static void
857 p_bracket(struct parse *p)
858 {
859 	cset *cs;
860 	wint_t ch;
861 
862 	/* Dept of Truly Sickening Special-Case Kludges */
863 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
864 		EMIT(OBOW, 0);
865 		NEXTn(6);
866 		return;
867 	}
868 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
869 		EMIT(OEOW, 0);
870 		NEXTn(6);
871 		return;
872 	}
873 
874 	if ((cs = allocset(p)) == NULL)
875 		return;
876 
877 	if (p->g->cflags&REG_ICASE)
878 		cs->icase = 1;
879 	if (EAT('^'))
880 		cs->invert = 1;
881 	if (EAT(']'))
882 		CHadd(p, cs, ']');
883 	else if (EAT('-'))
884 		CHadd(p, cs, '-');
885 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
886 		p_b_term(p, cs);
887 	if (EAT('-'))
888 		CHadd(p, cs, '-');
889 	(void)MUSTEAT(']', REG_EBRACK);
890 
891 	if (p->error != 0)	/* don't mess things up further */
892 		return;
893 
894 	if (cs->invert && p->g->cflags&REG_NEWLINE)
895 		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
896 
897 	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
898 		ordinary(p, ch);
899 		freeset(p, cs);
900 	} else
901 		EMIT(OANYOF, (int)(cs - p->g->sets));
902 }
903 
904 static int
905 p_range_cmp(wchar_t c1, wchar_t c2)
906 {
907 #ifndef LIBREGEX
908 	return __wcollate_range_cmp(c1, c2);
909 #else
910 	/* Copied from libc/collate __wcollate_range_cmp */
911 	wchar_t s1[2], s2[2];
912 
913 	s1[0] = c1;
914 	s1[1] = L'\0';
915 	s2[0] = c2;
916 	s2[1] = L'\0';
917 	return (wcscoll(s1, s2));
918 #endif
919 }
920 
921 /*
922  - p_b_term - parse one term of a bracketed character list
923  == static void p_b_term(struct parse *p, cset *cs);
924  */
925 static void
926 p_b_term(struct parse *p, cset *cs)
927 {
928 	char c;
929 	wint_t start, finish;
930 	wint_t i;
931 #ifndef LIBREGEX
932 	struct xlocale_collate *table =
933 		(struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
934 #endif
935 	/* classify what we've got */
936 	switch ((MORE()) ? PEEK() : '\0') {
937 	case '[':
938 		c = (MORE2()) ? PEEK2() : '\0';
939 		break;
940 	case '-':
941 		SETERROR(REG_ERANGE);
942 		return;			/* NOTE RETURN */
943 	default:
944 		c = '\0';
945 		break;
946 	}
947 
948 	switch (c) {
949 	case ':':		/* character class */
950 		NEXT2();
951 		(void)REQUIRE(MORE(), REG_EBRACK);
952 		c = PEEK();
953 		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
954 		p_b_cclass(p, cs);
955 		(void)REQUIRE(MORE(), REG_EBRACK);
956 		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
957 		break;
958 	case '=':		/* equivalence class */
959 		NEXT2();
960 		(void)REQUIRE(MORE(), REG_EBRACK);
961 		c = PEEK();
962 		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
963 		p_b_eclass(p, cs);
964 		(void)REQUIRE(MORE(), REG_EBRACK);
965 		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
966 		break;
967 	default:		/* symbol, ordinary character, or range */
968 		start = p_b_symbol(p);
969 		if (SEE('-') && MORE2() && PEEK2() != ']') {
970 			/* range */
971 			NEXT();
972 			if (EAT('-'))
973 				finish = '-';
974 			else
975 				finish = p_b_symbol(p);
976 		} else
977 			finish = start;
978 		if (start == finish)
979 			CHadd(p, cs, start);
980 		else {
981 #ifndef LIBREGEX
982 			if (table->__collate_load_error || MB_CUR_MAX > 1) {
983 #else
984 			if (MB_CUR_MAX > 1) {
985 #endif
986 				(void)REQUIRE(start <= finish, REG_ERANGE);
987 				CHaddrange(p, cs, start, finish);
988 			} else {
989 				(void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
990 				for (i = 0; i <= UCHAR_MAX; i++) {
991 					if (p_range_cmp(start, i) <= 0 &&
992 					    p_range_cmp(i, finish) <= 0 )
993 						CHadd(p, cs, i);
994 				}
995 			}
996 		}
997 		break;
998 	}
999 }
1000 
1001 /*
1002  - p_b_cclass - parse a character-class name and deal with it
1003  == static void p_b_cclass(struct parse *p, cset *cs);
1004  */
1005 static void
1006 p_b_cclass(struct parse *p, cset *cs)
1007 {
1008 	const char *sp = p->next;
1009 	size_t len;
1010 	wctype_t wct;
1011 	char clname[16];
1012 
1013 	while (MORE() && isalpha((uch)PEEK()))
1014 		NEXT();
1015 	len = p->next - sp;
1016 	if (len >= sizeof(clname) - 1) {
1017 		SETERROR(REG_ECTYPE);
1018 		return;
1019 	}
1020 	memcpy(clname, sp, len);
1021 	clname[len] = '\0';
1022 	if ((wct = wctype(clname)) == 0) {
1023 		SETERROR(REG_ECTYPE);
1024 		return;
1025 	}
1026 	CHaddtype(p, cs, wct);
1027 }
1028 
1029 /*
1030  - p_b_eclass - parse an equivalence-class name and deal with it
1031  == static void p_b_eclass(struct parse *p, cset *cs);
1032  *
1033  * This implementation is incomplete. xxx
1034  */
1035 static void
1036 p_b_eclass(struct parse *p, cset *cs)
1037 {
1038 	wint_t c;
1039 
1040 	c = p_b_coll_elem(p, '=');
1041 	CHadd(p, cs, c);
1042 }
1043 
1044 /*
1045  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1046  == static wint_t p_b_symbol(struct parse *p);
1047  */
1048 static wint_t			/* value of symbol */
1049 p_b_symbol(struct parse *p)
1050 {
1051 	wint_t value;
1052 
1053 	(void)REQUIRE(MORE(), REG_EBRACK);
1054 	if (!EATTWO('[', '.'))
1055 		return(WGETNEXT());
1056 
1057 	/* collating symbol */
1058 	value = p_b_coll_elem(p, '.');
1059 	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1060 	return(value);
1061 }
1062 
1063 /*
1064  - p_b_coll_elem - parse a collating-element name and look it up
1065  == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1066  */
1067 static wint_t			/* value of collating element */
1068 p_b_coll_elem(struct parse *p,
1069 	wint_t endc)		/* name ended by endc,']' */
1070 {
1071 	const char *sp = p->next;
1072 	struct cname *cp;
1073 	mbstate_t mbs;
1074 	wchar_t wc;
1075 	size_t clen, len;
1076 
1077 	while (MORE() && !SEETWO(endc, ']'))
1078 		NEXT();
1079 	if (!MORE()) {
1080 		SETERROR(REG_EBRACK);
1081 		return(0);
1082 	}
1083 	len = p->next - sp;
1084 	for (cp = cnames; cp->name != NULL; cp++)
1085 		if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1086 			return(cp->code);	/* known name */
1087 	memset(&mbs, 0, sizeof(mbs));
1088 	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1089 		return (wc);			/* single character */
1090 	else if (clen == (size_t)-1 || clen == (size_t)-2)
1091 		SETERROR(REG_ILLSEQ);
1092 	else
1093 		SETERROR(REG_ECOLLATE);		/* neither */
1094 	return(0);
1095 }
1096 
1097 /*
1098  - othercase - return the case counterpart of an alphabetic
1099  == static wint_t othercase(wint_t ch);
1100  */
1101 static wint_t			/* if no counterpart, return ch */
1102 othercase(wint_t ch)
1103 {
1104 	assert(iswalpha(ch));
1105 	if (iswupper(ch))
1106 		return(towlower(ch));
1107 	else if (iswlower(ch))
1108 		return(towupper(ch));
1109 	else			/* peculiar, but could happen */
1110 		return(ch);
1111 }
1112 
1113 /*
1114  - bothcases - emit a dualcase version of a two-case character
1115  == static void bothcases(struct parse *p, wint_t ch);
1116  *
1117  * Boy, is this implementation ever a kludge...
1118  */
1119 static void
1120 bothcases(struct parse *p, wint_t ch)
1121 {
1122 	const char *oldnext = p->next;
1123 	const char *oldend = p->end;
1124 	char bracket[3 + MB_LEN_MAX];
1125 	size_t n;
1126 	mbstate_t mbs;
1127 
1128 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1129 	p->next = bracket;
1130 	memset(&mbs, 0, sizeof(mbs));
1131 	n = wcrtomb(bracket, ch, &mbs);
1132 	assert(n != (size_t)-1);
1133 	bracket[n] = ']';
1134 	bracket[n + 1] = '\0';
1135 	p->end = bracket+n+1;
1136 	p_bracket(p);
1137 	assert(p->next == p->end);
1138 	p->next = oldnext;
1139 	p->end = oldend;
1140 }
1141 
1142 /*
1143  - ordinary - emit an ordinary character
1144  == static void ordinary(struct parse *p, wint_t ch);
1145  */
1146 static void
1147 ordinary(struct parse *p, wint_t ch)
1148 {
1149 	cset *cs;
1150 
1151 	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1152 		bothcases(p, ch);
1153 	else if ((ch & OPDMASK) == ch)
1154 		EMIT(OCHAR, ch);
1155 	else {
1156 		/*
1157 		 * Kludge: character is too big to fit into an OCHAR operand.
1158 		 * Emit a singleton set.
1159 		 */
1160 		if ((cs = allocset(p)) == NULL)
1161 			return;
1162 		CHadd(p, cs, ch);
1163 		EMIT(OANYOF, (int)(cs - p->g->sets));
1164 	}
1165 }
1166 
1167 /*
1168  - nonnewline - emit REG_NEWLINE version of OANY
1169  == static void nonnewline(struct parse *p);
1170  *
1171  * Boy, is this implementation ever a kludge...
1172  */
1173 static void
1174 nonnewline(struct parse *p)
1175 {
1176 	const char *oldnext = p->next;
1177 	const char *oldend = p->end;
1178 	char bracket[4];
1179 
1180 	p->next = bracket;
1181 	p->end = bracket+3;
1182 	bracket[0] = '^';
1183 	bracket[1] = '\n';
1184 	bracket[2] = ']';
1185 	bracket[3] = '\0';
1186 	p_bracket(p);
1187 	assert(p->next == bracket+3);
1188 	p->next = oldnext;
1189 	p->end = oldend;
1190 }
1191 
1192 /*
1193  - repeat - generate code for a bounded repetition, recursively if needed
1194  == static void repeat(struct parse *p, sopno start, int from, int to);
1195  */
1196 static void
1197 repeat(struct parse *p,
1198 	sopno start,		/* operand from here to end of strip */
1199 	int from,		/* repeated from this number */
1200 	int to)			/* to this number of times (maybe INFINITY) */
1201 {
1202 	sopno finish = HERE();
1203 #	define	N	2
1204 #	define	INF	3
1205 #	define	REP(f, t)	((f)*8 + (t))
1206 #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1207 	sopno copy;
1208 
1209 	if (p->error != 0)	/* head off possible runaway recursion */
1210 		return;
1211 
1212 	assert(from <= to);
1213 
1214 	switch (REP(MAP(from), MAP(to))) {
1215 	case REP(0, 0):			/* must be user doing this */
1216 		DROP(finish-start);	/* drop the operand */
1217 		break;
1218 	case REP(0, 1):			/* as x{1,1}? */
1219 	case REP(0, N):			/* as x{1,n}? */
1220 	case REP(0, INF):		/* as x{1,}? */
1221 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1222 		INSERT(OCH_, start);		/* offset is wrong... */
1223 		repeat(p, start+1, 1, to);
1224 		ASTERN(OOR1, start);
1225 		AHEAD(start);			/* ... fix it */
1226 		EMIT(OOR2, 0);
1227 		AHEAD(THERE());
1228 		ASTERN(O_CH, THERETHERE());
1229 		break;
1230 	case REP(1, 1):			/* trivial case */
1231 		/* done */
1232 		break;
1233 	case REP(1, N):			/* as x?x{1,n-1} */
1234 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1235 		INSERT(OCH_, start);
1236 		ASTERN(OOR1, start);
1237 		AHEAD(start);
1238 		EMIT(OOR2, 0);			/* offset very wrong... */
1239 		AHEAD(THERE());			/* ...so fix it */
1240 		ASTERN(O_CH, THERETHERE());
1241 		copy = dupl(p, start+1, finish+1);
1242 		assert(copy == finish+4);
1243 		repeat(p, copy, 1, to-1);
1244 		break;
1245 	case REP(1, INF):		/* as x+ */
1246 		INSERT(OPLUS_, start);
1247 		ASTERN(O_PLUS, start);
1248 		break;
1249 	case REP(N, N):			/* as xx{m-1,n-1} */
1250 		copy = dupl(p, start, finish);
1251 		repeat(p, copy, from-1, to-1);
1252 		break;
1253 	case REP(N, INF):		/* as xx{n-1,INF} */
1254 		copy = dupl(p, start, finish);
1255 		repeat(p, copy, from-1, to);
1256 		break;
1257 	default:			/* "can't happen" */
1258 		SETERROR(REG_ASSERT);	/* just in case */
1259 		break;
1260 	}
1261 }
1262 
1263 /*
1264  - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1265  - character from the parse struct, signals a REG_ILLSEQ error if the
1266  - character can't be converted. Returns the number of bytes consumed.
1267  */
1268 static wint_t
1269 wgetnext(struct parse *p)
1270 {
1271 	mbstate_t mbs;
1272 	wchar_t wc;
1273 	size_t n;
1274 
1275 	memset(&mbs, 0, sizeof(mbs));
1276 	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1277 	if (n == (size_t)-1 || n == (size_t)-2) {
1278 		SETERROR(REG_ILLSEQ);
1279 		return (0);
1280 	}
1281 	if (n == 0)
1282 		n = 1;
1283 	p->next += n;
1284 	return (wc);
1285 }
1286 
1287 /*
1288  - seterr - set an error condition
1289  == static int seterr(struct parse *p, int e);
1290  */
1291 static int			/* useless but makes type checking happy */
1292 seterr(struct parse *p, int e)
1293 {
1294 	if (p->error == 0)	/* keep earliest error condition */
1295 		p->error = e;
1296 	p->next = nuls;		/* try to bring things to a halt */
1297 	p->end = nuls;
1298 	return(0);		/* make the return value well-defined */
1299 }
1300 
1301 /*
1302  - allocset - allocate a set of characters for []
1303  == static cset *allocset(struct parse *p);
1304  */
1305 static cset *
1306 allocset(struct parse *p)
1307 {
1308 	cset *cs, *ncs;
1309 
1310 	ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1311 	if (ncs == NULL) {
1312 		SETERROR(REG_ESPACE);
1313 		return (NULL);
1314 	}
1315 	p->g->sets = ncs;
1316 	cs = &p->g->sets[p->g->ncsets++];
1317 	memset(cs, 0, sizeof(*cs));
1318 
1319 	return(cs);
1320 }
1321 
1322 /*
1323  - freeset - free a now-unused set
1324  == static void freeset(struct parse *p, cset *cs);
1325  */
1326 static void
1327 freeset(struct parse *p, cset *cs)
1328 {
1329 	cset *top = &p->g->sets[p->g->ncsets];
1330 
1331 	free(cs->wides);
1332 	free(cs->ranges);
1333 	free(cs->types);
1334 	memset(cs, 0, sizeof(*cs));
1335 	if (cs == top-1)	/* recover only the easy case */
1336 		p->g->ncsets--;
1337 }
1338 
1339 /*
1340  - singleton - Determine whether a set contains only one character,
1341  - returning it if so, otherwise returning OUT.
1342  */
1343 static wint_t
1344 singleton(cset *cs)
1345 {
1346 	wint_t i, s, n;
1347 
1348 	for (i = n = 0; i < NC; i++)
1349 		if (CHIN(cs, i)) {
1350 			n++;
1351 			s = i;
1352 		}
1353 	if (n == 1)
1354 		return (s);
1355 	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1356 	    cs->icase == 0)
1357 		return (cs->wides[0]);
1358 	/* Don't bother handling the other cases. */
1359 	return (OUT);
1360 }
1361 
1362 /*
1363  - CHadd - add character to character set.
1364  */
1365 static void
1366 CHadd(struct parse *p, cset *cs, wint_t ch)
1367 {
1368 	wint_t nch, *newwides;
1369 	assert(ch >= 0);
1370 	if (ch < NC)
1371 		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1372 	else {
1373 		newwides = reallocarray(cs->wides, cs->nwides + 1,
1374 		    sizeof(*cs->wides));
1375 		if (newwides == NULL) {
1376 			SETERROR(REG_ESPACE);
1377 			return;
1378 		}
1379 		cs->wides = newwides;
1380 		cs->wides[cs->nwides++] = ch;
1381 	}
1382 	if (cs->icase) {
1383 		if ((nch = towlower(ch)) < NC)
1384 			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1385 		if ((nch = towupper(ch)) < NC)
1386 			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1387 	}
1388 }
1389 
1390 /*
1391  - CHaddrange - add all characters in the range [min,max] to a character set.
1392  */
1393 static void
1394 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1395 {
1396 	crange *newranges;
1397 
1398 	for (; min < NC && min <= max; min++)
1399 		CHadd(p, cs, min);
1400 	if (min >= max)
1401 		return;
1402 	newranges = reallocarray(cs->ranges, cs->nranges + 1,
1403 	    sizeof(*cs->ranges));
1404 	if (newranges == NULL) {
1405 		SETERROR(REG_ESPACE);
1406 		return;
1407 	}
1408 	cs->ranges = newranges;
1409 	cs->ranges[cs->nranges].min = min;
1410 	cs->ranges[cs->nranges].max = max;
1411 	cs->nranges++;
1412 }
1413 
1414 /*
1415  - CHaddtype - add all characters of a certain type to a character set.
1416  */
1417 static void
1418 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1419 {
1420 	wint_t i;
1421 	wctype_t *newtypes;
1422 
1423 	for (i = 0; i < NC; i++)
1424 		if (iswctype(i, wct))
1425 			CHadd(p, cs, i);
1426 	newtypes = reallocarray(cs->types, cs->ntypes + 1,
1427 	    sizeof(*cs->types));
1428 	if (newtypes == NULL) {
1429 		SETERROR(REG_ESPACE);
1430 		return;
1431 	}
1432 	cs->types = newtypes;
1433 	cs->types[cs->ntypes++] = wct;
1434 }
1435 
1436 /*
1437  - dupl - emit a duplicate of a bunch of sops
1438  == static sopno dupl(struct parse *p, sopno start, sopno finish);
1439  */
1440 static sopno			/* start of duplicate */
1441 dupl(struct parse *p,
1442 	sopno start,		/* from here */
1443 	sopno finish)		/* to this less one */
1444 {
1445 	sopno ret = HERE();
1446 	sopno len = finish - start;
1447 
1448 	assert(finish >= start);
1449 	if (len == 0)
1450 		return(ret);
1451 	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1452 		return(ret);
1453 	(void) memcpy((char *)(p->strip + p->slen),
1454 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1455 	p->slen += len;
1456 	return(ret);
1457 }
1458 
1459 /*
1460  - doemit - emit a strip operator
1461  == static void doemit(struct parse *p, sop op, size_t opnd);
1462  *
1463  * It might seem better to implement this as a macro with a function as
1464  * hard-case backup, but it's just too big and messy unless there are
1465  * some changes to the data structures.  Maybe later.
1466  */
1467 static void
1468 doemit(struct parse *p, sop op, size_t opnd)
1469 {
1470 	/* avoid making error situations worse */
1471 	if (p->error != 0)
1472 		return;
1473 
1474 	/* deal with oversize operands ("can't happen", more or less) */
1475 	assert(opnd < 1<<OPSHIFT);
1476 
1477 	/* deal with undersized strip */
1478 	if (p->slen >= p->ssize)
1479 		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1480 			return;
1481 
1482 	/* finally, it's all reduced to the easy case */
1483 	p->strip[p->slen++] = SOP(op, opnd);
1484 }
1485 
1486 /*
1487  - doinsert - insert a sop into the strip
1488  == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1489  */
1490 static void
1491 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1492 {
1493 	sopno sn;
1494 	sop s;
1495 	int i;
1496 
1497 	/* avoid making error situations worse */
1498 	if (p->error != 0)
1499 		return;
1500 
1501 	sn = HERE();
1502 	EMIT(op, opnd);		/* do checks, ensure space */
1503 	assert(HERE() == sn+1);
1504 	s = p->strip[sn];
1505 
1506 	/* adjust paren pointers */
1507 	assert(pos > 0);
1508 	for (i = 1; i < NPAREN; i++) {
1509 		if (p->pbegin[i] >= pos) {
1510 			p->pbegin[i]++;
1511 		}
1512 		if (p->pend[i] >= pos) {
1513 			p->pend[i]++;
1514 		}
1515 	}
1516 
1517 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1518 						(HERE()-pos-1)*sizeof(sop));
1519 	p->strip[pos] = s;
1520 }
1521 
1522 /*
1523  - dofwd - complete a forward reference
1524  == static void dofwd(struct parse *p, sopno pos, sop value);
1525  */
1526 static void
1527 dofwd(struct parse *p, sopno pos, sop value)
1528 {
1529 	/* avoid making error situations worse */
1530 	if (p->error != 0)
1531 		return;
1532 
1533 	assert(value < 1<<OPSHIFT);
1534 	p->strip[pos] = OP(p->strip[pos]) | value;
1535 }
1536 
1537 /*
1538  - enlarge - enlarge the strip
1539  == static int enlarge(struct parse *p, sopno size);
1540  */
1541 static int
1542 enlarge(struct parse *p, sopno size)
1543 {
1544 	sop *sp;
1545 
1546 	if (p->ssize >= size)
1547 		return 1;
1548 
1549 	sp = reallocarray(p->strip, size, sizeof(sop));
1550 	if (sp == NULL) {
1551 		SETERROR(REG_ESPACE);
1552 		return 0;
1553 	}
1554 	p->strip = sp;
1555 	p->ssize = size;
1556 	return 1;
1557 }
1558 
1559 /*
1560  - stripsnug - compact the strip
1561  == static void stripsnug(struct parse *p, struct re_guts *g);
1562  */
1563 static void
1564 stripsnug(struct parse *p, struct re_guts *g)
1565 {
1566 	g->nstates = p->slen;
1567 	g->strip = reallocarray((char *)p->strip, p->slen, sizeof(sop));
1568 	if (g->strip == NULL) {
1569 		SETERROR(REG_ESPACE);
1570 		g->strip = p->strip;
1571 	}
1572 }
1573 
1574 /*
1575  - findmust - fill in must and mlen with longest mandatory literal string
1576  == static void findmust(struct parse *p, struct re_guts *g);
1577  *
1578  * This algorithm could do fancy things like analyzing the operands of |
1579  * for common subsequences.  Someday.  This code is simple and finds most
1580  * of the interesting cases.
1581  *
1582  * Note that must and mlen got initialized during setup.
1583  */
1584 static void
1585 findmust(struct parse *p, struct re_guts *g)
1586 {
1587 	sop *scan;
1588 	sop *start = NULL;
1589 	sop *newstart = NULL;
1590 	sopno newlen;
1591 	sop s;
1592 	char *cp;
1593 	int offset;
1594 	char buf[MB_LEN_MAX];
1595 	size_t clen;
1596 	mbstate_t mbs;
1597 
1598 	/* avoid making error situations worse */
1599 	if (p->error != 0)
1600 		return;
1601 
1602 	/*
1603 	 * It's not generally safe to do a ``char'' substring search on
1604 	 * multibyte character strings, but it's safe for at least
1605 	 * UTF-8 (see RFC 3629).
1606 	 */
1607 	if (MB_CUR_MAX > 1 &&
1608 	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1609 		return;
1610 
1611 	/* find the longest OCHAR sequence in strip */
1612 	newlen = 0;
1613 	offset = 0;
1614 	g->moffset = 0;
1615 	scan = g->strip + 1;
1616 	do {
1617 		s = *scan++;
1618 		switch (OP(s)) {
1619 		case OCHAR:		/* sequence member */
1620 			if (newlen == 0) {		/* new sequence */
1621 				memset(&mbs, 0, sizeof(mbs));
1622 				newstart = scan - 1;
1623 			}
1624 			clen = wcrtomb(buf, OPND(s), &mbs);
1625 			if (clen == (size_t)-1)
1626 				goto toohard;
1627 			newlen += clen;
1628 			break;
1629 		case OPLUS_:		/* things that don't break one */
1630 		case OLPAREN:
1631 		case ORPAREN:
1632 			break;
1633 		case OQUEST_:		/* things that must be skipped */
1634 		case OCH_:
1635 			offset = altoffset(scan, offset);
1636 			scan--;
1637 			do {
1638 				scan += OPND(s);
1639 				s = *scan;
1640 				/* assert() interferes w debug printouts */
1641 				if (OP(s) != (sop)O_QUEST &&
1642 				    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1643 					g->iflags |= BAD;
1644 					return;
1645 				}
1646 			} while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1647 			/* FALLTHROUGH */
1648 		case OBOW:		/* things that break a sequence */
1649 		case OEOW:
1650 		case OBOL:
1651 		case OEOL:
1652 		case O_QUEST:
1653 		case O_CH:
1654 		case OEND:
1655 			if (newlen > (sopno)g->mlen) {		/* ends one */
1656 				start = newstart;
1657 				g->mlen = newlen;
1658 				if (offset > -1) {
1659 					g->moffset += offset;
1660 					offset = newlen;
1661 				} else
1662 					g->moffset = offset;
1663 			} else {
1664 				if (offset > -1)
1665 					offset += newlen;
1666 			}
1667 			newlen = 0;
1668 			break;
1669 		case OANY:
1670 			if (newlen > (sopno)g->mlen) {		/* ends one */
1671 				start = newstart;
1672 				g->mlen = newlen;
1673 				if (offset > -1) {
1674 					g->moffset += offset;
1675 					offset = newlen;
1676 				} else
1677 					g->moffset = offset;
1678 			} else {
1679 				if (offset > -1)
1680 					offset += newlen;
1681 			}
1682 			if (offset > -1)
1683 				offset++;
1684 			newlen = 0;
1685 			break;
1686 		case OANYOF:		/* may or may not invalidate offset */
1687 			/* First, everything as OANY */
1688 			if (newlen > (sopno)g->mlen) {		/* ends one */
1689 				start = newstart;
1690 				g->mlen = newlen;
1691 				if (offset > -1) {
1692 					g->moffset += offset;
1693 					offset = newlen;
1694 				} else
1695 					g->moffset = offset;
1696 			} else {
1697 				if (offset > -1)
1698 					offset += newlen;
1699 			}
1700 			if (offset > -1)
1701 				offset++;
1702 			newlen = 0;
1703 			break;
1704 		toohard:
1705 		default:
1706 			/* Anything here makes it impossible or too hard
1707 			 * to calculate the offset -- so we give up;
1708 			 * save the last known good offset, in case the
1709 			 * must sequence doesn't occur later.
1710 			 */
1711 			if (newlen > (sopno)g->mlen) {		/* ends one */
1712 				start = newstart;
1713 				g->mlen = newlen;
1714 				if (offset > -1)
1715 					g->moffset += offset;
1716 				else
1717 					g->moffset = offset;
1718 			}
1719 			offset = -1;
1720 			newlen = 0;
1721 			break;
1722 		}
1723 	} while (OP(s) != OEND);
1724 
1725 	if (g->mlen == 0) {		/* there isn't one */
1726 		g->moffset = -1;
1727 		return;
1728 	}
1729 
1730 	/* turn it into a character string */
1731 	g->must = malloc((size_t)g->mlen + 1);
1732 	if (g->must == NULL) {		/* argh; just forget it */
1733 		g->mlen = 0;
1734 		g->moffset = -1;
1735 		return;
1736 	}
1737 	cp = g->must;
1738 	scan = start;
1739 	memset(&mbs, 0, sizeof(mbs));
1740 	while (cp < g->must + g->mlen) {
1741 		while (OP(s = *scan++) != OCHAR)
1742 			continue;
1743 		clen = wcrtomb(cp, OPND(s), &mbs);
1744 		assert(clen != (size_t)-1);
1745 		cp += clen;
1746 	}
1747 	assert(cp == g->must + g->mlen);
1748 	*cp++ = '\0';		/* just on general principles */
1749 }
1750 
1751 /*
1752  - altoffset - choose biggest offset among multiple choices
1753  == static int altoffset(sop *scan, int offset);
1754  *
1755  * Compute, recursively if necessary, the largest offset among multiple
1756  * re paths.
1757  */
1758 static int
1759 altoffset(sop *scan, int offset)
1760 {
1761 	int largest;
1762 	int try;
1763 	sop s;
1764 
1765 	/* If we gave up already on offsets, return */
1766 	if (offset == -1)
1767 		return -1;
1768 
1769 	largest = 0;
1770 	try = 0;
1771 	s = *scan++;
1772 	while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
1773 		switch (OP(s)) {
1774 		case OOR1:
1775 			if (try > largest)
1776 				largest = try;
1777 			try = 0;
1778 			break;
1779 		case OQUEST_:
1780 		case OCH_:
1781 			try = altoffset(scan, try);
1782 			if (try == -1)
1783 				return -1;
1784 			scan--;
1785 			do {
1786 				scan += OPND(s);
1787 				s = *scan;
1788 				if (OP(s) != (sop)O_QUEST &&
1789 				    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
1790 					return -1;
1791 			} while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1792 			/* We must skip to the next position, or we'll
1793 			 * leave altoffset() too early.
1794 			 */
1795 			scan++;
1796 			break;
1797 		case OANYOF:
1798 		case OCHAR:
1799 		case OANY:
1800 			try++;
1801 		case OBOW:
1802 		case OEOW:
1803 		case OLPAREN:
1804 		case ORPAREN:
1805 		case OOR2:
1806 			break;
1807 		default:
1808 			try = -1;
1809 			break;
1810 		}
1811 		if (try == -1)
1812 			return -1;
1813 		s = *scan++;
1814 	}
1815 
1816 	if (try > largest)
1817 		largest = try;
1818 
1819 	return largest+offset;
1820 }
1821 
1822 /*
1823  - computejumps - compute char jumps for BM scan
1824  == static void computejumps(struct parse *p, struct re_guts *g);
1825  *
1826  * This algorithm assumes g->must exists and is has size greater than
1827  * zero. It's based on the algorithm found on Computer Algorithms by
1828  * Sara Baase.
1829  *
1830  * A char jump is the number of characters one needs to jump based on
1831  * the value of the character from the text that was mismatched.
1832  */
1833 static void
1834 computejumps(struct parse *p, struct re_guts *g)
1835 {
1836 	int ch;
1837 	int mindex;
1838 
1839 	/* Avoid making errors worse */
1840 	if (p->error != 0)
1841 		return;
1842 
1843 	g->charjump = (int *)malloc((NC_MAX + 1) * sizeof(int));
1844 	if (g->charjump == NULL)	/* Not a fatal error */
1845 		return;
1846 	/* Adjust for signed chars, if necessary */
1847 	g->charjump = &g->charjump[-(CHAR_MIN)];
1848 
1849 	/* If the character does not exist in the pattern, the jump
1850 	 * is equal to the number of characters in the pattern.
1851 	 */
1852 	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1853 		g->charjump[ch] = g->mlen;
1854 
1855 	/* If the character does exist, compute the jump that would
1856 	 * take us to the last character in the pattern equal to it
1857 	 * (notice that we match right to left, so that last character
1858 	 * is the first one that would be matched).
1859 	 */
1860 	for (mindex = 0; mindex < g->mlen; mindex++)
1861 		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1862 }
1863 
1864 /*
1865  - computematchjumps - compute match jumps for BM scan
1866  == static void computematchjumps(struct parse *p, struct re_guts *g);
1867  *
1868  * This algorithm assumes g->must exists and is has size greater than
1869  * zero. It's based on the algorithm found on Computer Algorithms by
1870  * Sara Baase.
1871  *
1872  * A match jump is the number of characters one needs to advance based
1873  * on the already-matched suffix.
1874  * Notice that all values here are minus (g->mlen-1), because of the way
1875  * the search algorithm works.
1876  */
1877 static void
1878 computematchjumps(struct parse *p, struct re_guts *g)
1879 {
1880 	int mindex;		/* General "must" iterator */
1881 	int suffix;		/* Keeps track of matching suffix */
1882 	int ssuffix;		/* Keeps track of suffixes' suffix */
1883 	int* pmatches;		/* pmatches[k] points to the next i
1884 				 * such that i+1...mlen is a substring
1885 				 * of k+1...k+mlen-i-1
1886 				 */
1887 
1888 	/* Avoid making errors worse */
1889 	if (p->error != 0)
1890 		return;
1891 
1892 	pmatches = (int*) malloc(g->mlen * sizeof(int));
1893 	if (pmatches == NULL) {
1894 		g->matchjump = NULL;
1895 		return;
1896 	}
1897 
1898 	g->matchjump = (int*) malloc(g->mlen * sizeof(int));
1899 	if (g->matchjump == NULL) {	/* Not a fatal error */
1900 		free(pmatches);
1901 		return;
1902 	}
1903 
1904 	/* Set maximum possible jump for each character in the pattern */
1905 	for (mindex = 0; mindex < g->mlen; mindex++)
1906 		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1907 
1908 	/* Compute pmatches[] */
1909 	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1910 	    mindex--, suffix--) {
1911 		pmatches[mindex] = suffix;
1912 
1913 		/* If a mismatch is found, interrupting the substring,
1914 		 * compute the matchjump for that position. If no
1915 		 * mismatch is found, then a text substring mismatched
1916 		 * against the suffix will also mismatch against the
1917 		 * substring.
1918 		 */
1919 		while (suffix < g->mlen
1920 		    && g->must[mindex] != g->must[suffix]) {
1921 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1922 			    g->mlen - mindex - 1);
1923 			suffix = pmatches[suffix];
1924 		}
1925 	}
1926 
1927 	/* Compute the matchjump up to the last substring found to jump
1928 	 * to the beginning of the largest must pattern prefix matching
1929 	 * it's own suffix.
1930 	 */
1931 	for (mindex = 0; mindex <= suffix; mindex++)
1932 		g->matchjump[mindex] = MIN(g->matchjump[mindex],
1933 		    g->mlen + suffix - mindex);
1934 
1935         ssuffix = pmatches[suffix];
1936         while (suffix < g->mlen) {
1937                 while (suffix <= ssuffix && suffix < g->mlen) {
1938                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
1939 			    g->mlen + ssuffix - suffix);
1940                         suffix++;
1941                 }
1942 		if (suffix < g->mlen)
1943                 	ssuffix = pmatches[ssuffix];
1944         }
1945 
1946 	free(pmatches);
1947 }
1948 
1949 /*
1950  - pluscount - count + nesting
1951  == static sopno pluscount(struct parse *p, struct re_guts *g);
1952  */
1953 static sopno			/* nesting depth */
1954 pluscount(struct parse *p, struct re_guts *g)
1955 {
1956 	sop *scan;
1957 	sop s;
1958 	sopno plusnest = 0;
1959 	sopno maxnest = 0;
1960 
1961 	if (p->error != 0)
1962 		return(0);	/* there may not be an OEND */
1963 
1964 	scan = g->strip + 1;
1965 	do {
1966 		s = *scan++;
1967 		switch (OP(s)) {
1968 		case OPLUS_:
1969 			plusnest++;
1970 			break;
1971 		case O_PLUS:
1972 			if (plusnest > maxnest)
1973 				maxnest = plusnest;
1974 			plusnest--;
1975 			break;
1976 		}
1977 	} while (OP(s) != OEND);
1978 	if (plusnest != 0)
1979 		g->iflags |= BAD;
1980 	return(maxnest);
1981 }
1982