1 /*-
2  * This code is derived from OpenBSD's libc/regex, original license follows:
3  *
4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5  * Copyright (c) 1992, 1993, 1994
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Henry Spencer.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
36  */
37 
38 #include <sys/types.h>
39 #include <stdio.h>
40 #include <string.h>
41 #include <ctype.h>
42 #include <limits.h>
43 #include <stdlib.h>
44 
45 #include "clamav.h"
46 #include "others.h"
47 #include "regex.h"
48 
49 #include "utils.h"
50 #include "regex2.h"
51 
52 #include "cclass.h"
53 #include "cname.h"
54 
55 /*
56  * parse structure, passed up and down to avoid global variables and
57  * other clumsinesses
58  */
59 struct parse {
60 	char *next;		/* next character in RE */
61 	char *end;		/* end of string (-> NUL normally) */
62 	int error;		/* has an error been seen? */
63 	sop *strip;		/* malloced strip */
64 	sopno ssize;		/* malloced strip size (allocated) */
65 	sopno slen;		/* malloced strip length (used) */
66 	int ncsalloc;		/* number of csets allocated */
67 	struct re_guts *g;
68 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
69 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
70 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
71 };
72 
73 static void p_ere(struct parse *, int);
74 static void p_ere_exp(struct parse *);
75 static void p_str(struct parse *);
76 static void p_bre(struct parse *, int, int);
77 static int p_simp_re(struct parse *, int);
78 static int p_count(struct parse *);
79 static void p_bracket(struct parse *);
80 static void p_b_term(struct parse *, cset *);
81 static void p_b_cclass(struct parse *, cset *);
82 static void p_b_eclass(struct parse *, cset *);
83 static char p_b_symbol(struct parse *);
84 static char p_b_coll_elem(struct parse *, int);
85 static char othercase(int);
86 static void bothcases(struct parse *, int);
87 static void ordinary(struct parse *, int);
88 static void nonnewline(struct parse *);
89 static void repeat(struct parse *, sopno, int, int);
90 static int seterr(struct parse *, int);
91 static cset *allocset(struct parse *);
92 static void freeset(struct parse *, cset *);
93 static int freezeset(struct parse *, cset *);
94 static int firstch(struct parse *, cset *);
95 static int nch(struct parse *, cset *);
96 static void mcadd(struct parse *, cset *, const char *);
97 static void mcinvert(struct parse *, cset *);
98 static void mccase(struct parse *, cset *);
99 static int isinsets(struct re_guts *, int);
100 static int samesets(struct re_guts *, int, int);
101 static void categorize(struct parse *, struct re_guts *);
102 static sopno dupl(struct parse *, sopno, sopno);
103 static void doemit(struct parse *, sop, size_t);
104 static void doinsert(struct parse *, sop, size_t, sopno);
105 static void dofwd(struct parse *, sopno, sop);
106 static void enlarge(struct parse *, sopno);
107 static void stripsnug(struct parse *, struct re_guts *);
108 static void findmust(struct parse *, struct re_guts *);
109 static sopno pluscount(struct parse *, struct re_guts *);
110 
111 static char nuls[10];		/* place to point scanner in event of error */
112 
113 /*
114  * macros for use with parse structure
115  * BEWARE:  these know that the parse structure is named `p' !!!
116  */
117 #define	PEEK()	(*p->next)
118 #define	PEEK2()	(*(p->next+1))
119 #define	MORE()	(p->next < p->end)
120 #define	MORE2()	(p->next+1 < p->end)
121 #define	SEE(c)	(MORE() && PEEK() == (c))
122 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
123 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
124 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
125 #define	NEXT()	(p->next++)
126 #define	NEXT2()	(p->next += 2)
127 #define	NEXTn(n)	(p->next += (n))
128 #define	GETNEXT()	(*p->next++)
129 #define	SETERROR(e)	seterr(p, (e))
130 #define	REQUIRE(co, e)	(void)((co) || SETERROR(e))
131 #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
132 #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
133 #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
134 #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
135 #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
136 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
137 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
138 #define	HERE()		(p->slen)
139 #define	THERE()		(p->slen - 1)
140 #define	THERETHERE()	(p->slen - 2)
141 #define	DROP(n)	(p->slen -= (n))
142 
143 #ifndef NDEBUG
144 static int never = 0;		/* for use in asserts; shuts lint up */
145 #else
146 #define	never	0		/* some <assert.h>s have bugs too */
147 #endif
148 
149 /*
150  - cli_regcomp_real - interface for parser and compilation
151  */
152 int				/* 0 success, otherwise REG_something */
cli_regcomp_real(regex_t * preg,const char * pattern,int cflags)153 cli_regcomp_real(regex_t *preg, const char *pattern, int cflags)
154 {
155 	struct parse pa;
156 	struct re_guts *g;
157 	struct parse *p = &pa;
158 	int i;
159 	size_t len;
160 	size_t maxlen;
161 #ifdef REDEBUG
162 #	define	GOODFLAGS(f)	(f)
163 #else
164 #	define	GOODFLAGS(f)	((f)&~REG_DUMP)
165 #endif
166 
167 	cflags = GOODFLAGS(cflags);
168 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
169 		return(REG_INVARG);
170 
171 	if (cflags&REG_PEND) {
172 		if (preg->re_endp < pattern)
173 			return(REG_INVARG);
174 		len = preg->re_endp - pattern;
175 	} else
176 		len = strlen((const char *)pattern);
177 
178 	/* do the mallocs early so failure handling is easy */
179 	g = (struct re_guts *)cli_malloc(sizeof(struct re_guts) +
180 							(NC-1)*sizeof(cat_t));
181 	if (g == NULL)
182 		return(REG_ESPACE);
183 	/* Patch for bb11264 submitted by the Debian team:                */
184 	/*
185 	 * Limit the pattern space to avoid a 32-bit overflow on buffer
186 	 * extension.  Also avoid any signed overflow in case of conversion
187 	 * so make the real limit based on a 31-bit overflow.
188 	 *
189 	 * Likely not applicable on 64-bit systems but handle the case
190 	 * generically (who are we to stop people from using ~715MB+
191 	 * patterns?).
192 	 */
193 	maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3;
194 	if (len >= maxlen) {
195 		free((char *)g);
196 		return(REG_ESPACE);
197 	}
198 	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
199 	assert((size_t)p->ssize >= len);
200 
201 	p->strip = (sop *)cli_calloc(p->ssize, sizeof(sop));
202 	p->slen = 0;
203 	if (p->strip == NULL) {
204 		free((char *)g);
205 		return(REG_ESPACE);
206 	}
207 
208 	/* set things up */
209 	p->g = g;
210 	p->next = (char *)pattern;	/* convenience; we do not modify it */
211 	p->end = p->next + len;
212 	p->error = 0;
213 	p->ncsalloc = 0;
214 	for (i = 0; i < NPAREN; i++) {
215 		p->pbegin[i] = 0;
216 		p->pend[i] = 0;
217 	}
218 	g->csetsize = NC;
219 	g->sets = NULL;
220 	g->setbits = NULL;
221 	g->ncsets = 0;
222 	g->cflags = cflags;
223 	g->iflags = 0;
224 	g->nbol = 0;
225 	g->neol = 0;
226 	g->must = NULL;
227 	g->mlen = 0;
228 	g->nsub = 0;
229 	g->ncategories = 1;	/* category 0 is "everything else" */
230 	g->categories = &g->catspace[-(CHAR_MIN)];
231 	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
232 	g->backrefs = 0;
233 
234 	/* do it */
235 	EMIT(OEND, 0);
236 	g->firststate = THERE();
237 	if (cflags&REG_EXTENDED)
238 		p_ere(p, OUT);
239 	else if (cflags&REG_NOSPEC)
240 		p_str(p);
241 	else
242 		p_bre(p, OUT, OUT);
243 	EMIT(OEND, 0);
244 	g->laststate = THERE();
245 
246 	/* tidy up loose ends and fill things in */
247 	categorize(p, g);
248 	stripsnug(p, g);
249 	findmust(p, g);
250 	g->nplus = pluscount(p, g);
251 	g->magic = MAGIC2;
252 	preg->re_nsub = g->nsub;
253 	preg->re_g = g;
254 	preg->re_magic = MAGIC1;
255 #ifndef REDEBUG
256 	/* not debugging, so can't rely on the assert() in cli_regexec() */
257 	if (g->iflags&REGEX_BAD)
258 		SETERROR(REG_ASSERT);
259 #endif
260 
261 	/* win or lose, we're done */
262 	if (p->error != 0)	/* lose */
263 		cli_regfree(preg);
264 	return(p->error);
265 }
266 
267 /*
268  - p_ere - ERE parser top level, concatenation and alternation
269  */
270 static void
p_ere(struct parse * p,int stop)271 p_ere(struct parse *p, int stop)	/* character this ERE should end at */
272 {
273 	char c;
274 	sopno prevback = 0;
275 	sopno prevfwd = 0;
276 	sopno conc;
277 	int first = 1;		/* is this the first alternative? */
278 
279 	for (;;) {
280 		/* do a bunch of concatenated expressions */
281 		conc = HERE();
282 		while (MORE() && (c = PEEK()) != '|' && c != stop)
283 			p_ere_exp(p);
284 		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
285 
286 		if (!EAT('|'))
287 			break;		/* NOTE BREAK OUT */
288 
289 		if (first) {
290 			INSERT(OCH_, conc);	/* offset is wrong */
291 			prevfwd = conc;
292 			prevback = conc;
293 			first = 0;
294 		}
295 		ASTERN(OOR1, prevback);
296 		prevback = THERE();
297 		AHEAD(prevfwd);			/* fix previous offset */
298 		prevfwd = HERE();
299 		EMIT(OOR2, 0);			/* offset is very wrong */
300 	}
301 
302 	if (!first) {		/* tail-end fixups */
303 		AHEAD(prevfwd);
304 		ASTERN(O_CH, prevback);
305 	}
306 
307 	assert(!MORE() || SEE(stop));
308 }
309 
310 /*
311  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
312  */
313 static void
p_ere_exp(struct parse * p)314 p_ere_exp(struct parse *p)
315 {
316 	char c;
317 	sopno pos;
318 	int count;
319 	int count2;
320 	sopno subno;
321 	int wascaret = 0;
322 
323 	assert(MORE());		/* caller should have ensured this */
324 	c = GETNEXT();
325 
326 	pos = HERE();
327 	switch (c) {
328 	case '(':
329 		REQUIRE(MORE(), REG_EPAREN);
330 		p->g->nsub++;
331 		subno = p->g->nsub;
332 		if (subno < NPAREN)
333 			p->pbegin[subno] = HERE();
334 		EMIT(OLPAREN, subno);
335 		if (!SEE(')'))
336 			p_ere(p, ')');
337 		if (subno < NPAREN) {
338 			p->pend[subno] = HERE();
339 			assert(p->pend[subno] != 0);
340 		}
341 		EMIT(ORPAREN, subno);
342 		MUSTEAT(')', REG_EPAREN);
343 		break;
344 #ifndef POSIX_MISTAKE
345 	case ')':		/* happens only if no current unmatched ( */
346 		/*
347 		 * You may ask, why the ifndef?  Because I didn't notice
348 		 * this until slightly too late for 1003.2, and none of the
349 		 * other 1003.2 regular-expression reviewers noticed it at
350 		 * all.  So an unmatched ) is legal POSIX, at least until
351 		 * we can get it fixed.
352 		 */
353 		SETERROR(REG_EPAREN);
354 		break;
355 #endif
356 	case '^':
357 		EMIT(OBOL, 0);
358 		p->g->iflags |= USEBOL;
359 		p->g->nbol++;
360 		wascaret = 1;
361 		break;
362 	case '$':
363 		EMIT(OEOL, 0);
364 		p->g->iflags |= USEEOL;
365 		p->g->neol++;
366 		break;
367 	case '|':
368 		SETERROR(REG_EMPTY);
369 		break;
370 	case '*':
371 	case '+':
372 	case '?':
373 		SETERROR(REG_BADRPT);
374 		break;
375 	case '.':
376 		if (p->g->cflags&REG_NEWLINE)
377 			nonnewline(p);
378 		else
379 			EMIT(OANY, 0);
380 		break;
381 	case '[':
382 		p_bracket(p);
383 		break;
384 	case '\\':
385 		REQUIRE(MORE(), REG_EESCAPE);
386 		c = GETNEXT();
387 		ordinary(p, c);
388 		break;
389 	case '{':		/* okay as ordinary except if digit follows */
390 		REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
391 		/* FALLTHROUGH */
392 	default:
393 		ordinary(p, c);
394 		break;
395 	}
396 
397 	if (!MORE())
398 		return;
399 	c = PEEK();
400 	/* we call { a repetition if followed by a digit */
401 	if (!( c == '*' || c == '+' || c == '?' ||
402 				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
403 		return;		/* no repetition, we're done */
404 	NEXT();
405 
406 	REQUIRE(!wascaret, REG_BADRPT);
407 	switch (c) {
408 	case '*':	/* implemented as +? */
409 		/* this case does not require the (y|) trick, noKLUDGE */
410 		INSERT(OPLUS_, pos);
411 		ASTERN(O_PLUS, pos);
412 		INSERT(OQUEST_, pos);
413 		ASTERN(O_QUEST, pos);
414 		break;
415 	case '+':
416 		INSERT(OPLUS_, pos);
417 		ASTERN(O_PLUS, pos);
418 		break;
419 	case '?':
420 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
421 		INSERT(OCH_, pos);		/* offset slightly wrong */
422 		ASTERN(OOR1, pos);		/* this one's right */
423 		AHEAD(pos);			/* fix the OCH_ */
424 		EMIT(OOR2, 0);			/* offset very wrong... */
425 		AHEAD(THERE());			/* ...so fix it */
426 		ASTERN(O_CH, THERETHERE());
427 		break;
428 	case '{':
429 		count = p_count(p);
430 		if (EAT(',')) {
431 			if (isdigit((uch)PEEK())) {
432 				count2 = p_count(p);
433 				REQUIRE(count <= count2, REG_BADBR);
434 			} else		/* single number with comma */
435 				count2 = REGEX_INFINITY;
436 		} else		/* just a single number */
437 			count2 = count;
438 		repeat(p, pos, count, count2);
439 		if (!EAT('}')) {	/* error heuristics */
440 			while (MORE() && PEEK() != '}')
441 				NEXT();
442 			REQUIRE(MORE(), REG_EBRACE);
443 			SETERROR(REG_BADBR);
444 		}
445 		break;
446 	}
447 
448 	if (!MORE())
449 		return;
450 	c = PEEK();
451 	if (!( c == '*' || c == '+' || c == '?' ||
452 				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
453 		return;
454 	SETERROR(REG_BADRPT);
455 }
456 
457 /*
458  - p_str - string (no metacharacters) "parser"
459  */
460 static void
p_str(struct parse * p)461 p_str(struct parse *p)
462 {
463 	REQUIRE(MORE(), REG_EMPTY);
464 	while (MORE())
465 		ordinary(p, GETNEXT());
466 }
467 
468 /*
469  - p_bre - BRE parser top level, anchoring and concatenation
470  * Giving end1 as OUT essentially eliminates the end1/end2 check.
471  *
472  * This implementation is a bit of a kludge, in that a trailing $ is first
473  * taken as an ordinary character and then revised to be an anchor.  The
474  * only undesirable side effect is that '$' gets included as a character
475  * category in such cases.  This is fairly harmless; not worth fixing.
476  * The amount of lookahead needed to avoid this kludge is excessive.
477  */
478 static void
p_bre(struct parse * p,int end1,int end2)479 p_bre(struct parse *p,
480     int end1,		/* first terminating character */
481     int end2)		/* second terminating character */
482 {
483 	sopno start = HERE();
484 	int first = 1;			/* first subexpression? */
485 	int wasdollar = 0;
486 
487 	if (EAT('^')) {
488 		EMIT(OBOL, 0);
489 		p->g->iflags |= USEBOL;
490 		p->g->nbol++;
491 	}
492 	while (MORE() && !SEETWO(end1, end2)) {
493 		wasdollar = p_simp_re(p, first);
494 		first = 0;
495 	}
496 	if (wasdollar) {	/* oops, that was a trailing anchor */
497 		DROP(1);
498 		EMIT(OEOL, 0);
499 		p->g->iflags |= USEEOL;
500 		p->g->neol++;
501 	}
502 
503 	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
504 }
505 
506 /*
507  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
508  */
509 static int			/* was the simple RE an unbackslashed $? */
p_simp_re(struct parse * p,int starordinary)510 p_simp_re(struct parse *p,
511     int starordinary)		/* is a leading * an ordinary character? */
512 {
513 	int c;
514 	int count;
515 	int count2;
516 	sopno pos;
517 	int i;
518 	sopno subno;
519 #	define	BACKSL	(1<<CHAR_BIT)
520 
521 	pos = HERE();		/* repetion op, if any, covers from here */
522 
523 	assert(MORE());		/* caller should have ensured this */
524 	c = GETNEXT();
525 	if (c == '\\') {
526 		REQUIRE(MORE(), REG_EESCAPE);
527 		c = BACKSL | GETNEXT();
528 	}
529 	switch (c) {
530 	case '.':
531 		if (p->g->cflags&REG_NEWLINE)
532 			nonnewline(p);
533 		else
534 			EMIT(OANY, 0);
535 		break;
536 	case '[':
537 		p_bracket(p);
538 		break;
539 	case BACKSL|'{':
540 		SETERROR(REG_BADRPT);
541 		break;
542 	case BACKSL|'(':
543 		p->g->nsub++;
544 		subno = p->g->nsub;
545 		if (subno < NPAREN)
546 			p->pbegin[subno] = HERE();
547 		EMIT(OLPAREN, subno);
548 		/* the MORE here is an error heuristic */
549 		if (MORE() && !SEETWO('\\', ')'))
550 			p_bre(p, '\\', ')');
551 		if (subno < NPAREN) {
552 			p->pend[subno] = HERE();
553 			assert(p->pend[subno] != 0);
554 		}
555 		EMIT(ORPAREN, subno);
556 		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
557 		break;
558 	case BACKSL|')':	/* should not get here -- must be user */
559 	case BACKSL|'}':
560 		SETERROR(REG_EPAREN);
561 		break;
562 	case BACKSL|'1':
563 	case BACKSL|'2':
564 	case BACKSL|'3':
565 	case BACKSL|'4':
566 	case BACKSL|'5':
567 	case BACKSL|'6':
568 	case BACKSL|'7':
569 	case BACKSL|'8':
570 	case BACKSL|'9':
571 		i = (c&~BACKSL) - '0';
572 		assert(i < NPAREN);
573 		if (p->pend[i] != 0) {
574 			assert((size_t)i <= p->g->nsub);
575 			EMIT(OBACK_, i);
576 			assert(p->pbegin[i] != 0);
577 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
578 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
579 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
580 			EMIT(O_BACK, i);
581 		} else
582 			SETERROR(REG_ESUBREG);
583 		p->g->backrefs = 1;
584 		break;
585 	case '*':
586 		REQUIRE(starordinary, REG_BADRPT);
587 		/* FALLTHROUGH */
588 	default:
589 		ordinary(p, (char)c);
590 		break;
591 	}
592 
593 	if (EAT('*')) {		/* implemented as +? */
594 		/* this case does not require the (y|) trick, noKLUDGE */
595 		INSERT(OPLUS_, pos);
596 		ASTERN(O_PLUS, pos);
597 		INSERT(OQUEST_, pos);
598 		ASTERN(O_QUEST, pos);
599 	} else if (EATTWO('\\', '{')) {
600 		count = p_count(p);
601 		if (EAT(',')) {
602 			if (MORE() && isdigit((uch)PEEK())) {
603 				count2 = p_count(p);
604 				REQUIRE(count <= count2, REG_BADBR);
605 			} else		/* single number with comma */
606 				count2 = REGEX_INFINITY;
607 		} else		/* just a single number */
608 			count2 = count;
609 		repeat(p, pos, count, count2);
610 		if (!EATTWO('\\', '}')) {	/* error heuristics */
611 			while (MORE() && !SEETWO('\\', '}'))
612 				NEXT();
613 			REQUIRE(MORE(), REG_EBRACE);
614 			SETERROR(REG_BADBR);
615 		}
616 	} else if (c == '$')	/* $ (but not \$) ends it */
617 		return(1);
618 
619 	return(0);
620 }
621 
622 /*
623  - p_count - parse a repetition count
624  */
625 static int			/* the value */
p_count(struct parse * p)626 p_count(struct parse *p)
627 {
628 	int count = 0;
629 	int ndigits = 0;
630 
631 	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
632 		count = count*10 + (GETNEXT() - '0');
633 		ndigits++;
634 	}
635 
636 	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
637 	return(count);
638 }
639 
640 /*
641  - p_bracket - parse a bracketed character list
642  *
643  * Note a significant property of this code:  if the allocset() did SETERROR,
644  * no set operations are done.
645  */
646 static void
p_bracket(struct parse * p)647 p_bracket(struct parse *p)
648 {
649 	cset *cs;
650 	int invert = 0;
651 
652 	/* Dept of Truly Sickening Special-Case Kludges */
653 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
654 		EMIT(OBOW, 0);
655 		NEXTn(6);
656 		return;
657 	}
658 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
659 		EMIT(OEOW, 0);
660 		NEXTn(6);
661 		return;
662 	}
663 
664 	if ((cs = allocset(p)) == NULL) {
665 		/* allocset did set error status in p */
666 		return;
667 	}
668 
669 	if (EAT('^'))
670 		invert++;	/* make note to invert set at end */
671 	if (EAT(']'))
672 		CHadd(cs, ']');
673 	else if (EAT('-'))
674 		CHadd(cs, '-');
675 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
676 		p_b_term(p, cs);
677 	if (EAT('-'))
678 		CHadd(cs, '-');
679 	MUSTEAT(']', REG_EBRACK);
680 
681 	if (p->error != 0) {	/* don't mess things up further */
682 		freeset(p, cs);
683 		return;
684 	}
685 
686 	if (p->g->cflags&REG_ICASE) {
687 		int i;
688 		int ci;
689 
690 		for (i = p->g->csetsize - 1; i >= 0; i--)
691 			if (CHIN(cs, i) && isalpha(i)) {
692 				ci = othercase(i);
693 				if (ci != i)
694 					CHadd(cs, ci);
695 			}
696 		if (cs->multis != NULL)
697 			mccase(p, cs);
698 	}
699 	if (invert) {
700 		int i;
701 
702 		for (i = p->g->csetsize - 1; i >= 0; i--)
703 			if (CHIN(cs, i))
704 				CHsub(cs, i);
705 			else
706 				CHadd(cs, i);
707 		if (p->g->cflags&REG_NEWLINE)
708 			CHsub(cs, '\n');
709 		if (cs->multis != NULL)
710 			mcinvert(p, cs);
711 	}
712 
713 	assert(cs->multis == NULL);		/* xxx */
714 
715 	if (nch(p, cs) == 1) {		/* optimize singleton sets */
716 		ordinary(p, firstch(p, cs));
717 		freeset(p, cs);
718 	} else
719 		EMIT(OANYOF, freezeset(p, cs));
720 }
721 
722 /*
723  - p_b_term - parse one term of a bracketed character list
724  */
725 static void
p_b_term(struct parse * p,cset * cs)726 p_b_term(struct parse *p, cset *cs)
727 {
728 	char c;
729 	char start, finish;
730 	int i;
731 
732 	/* classify what we've got */
733 	switch ((MORE()) ? PEEK() : '\0') {
734 	case '[':
735 		c = (MORE2()) ? PEEK2() : '\0';
736 		break;
737 	case '-':
738 		SETERROR(REG_ERANGE);
739 		return;			/* NOTE RETURN */
740 		break;
741 	default:
742 		c = '\0';
743 		break;
744 	}
745 
746 	switch (c) {
747 	case ':':		/* character class */
748 		NEXT2();
749 		REQUIRE(MORE(), REG_EBRACK);
750 		c = PEEK();
751 		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
752 		p_b_cclass(p, cs);
753 		REQUIRE(MORE(), REG_EBRACK);
754 		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
755 		break;
756 	case '=':		/* equivalence class */
757 		NEXT2();
758 		REQUIRE(MORE(), REG_EBRACK);
759 		c = PEEK();
760 		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
761 		p_b_eclass(p, cs);
762 		REQUIRE(MORE(), REG_EBRACK);
763 		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
764 		break;
765 	default:		/* symbol, ordinary character, or range */
766 /* xxx revision needed for multichar stuff */
767 		start = p_b_symbol(p);
768 		if (SEE('-') && MORE2() && PEEK2() != ']') {
769 			/* range */
770 			NEXT();
771 			if (EAT('-'))
772 				finish = '-';
773 			else
774 				finish = p_b_symbol(p);
775 		} else
776 			finish = start;
777 /* xxx what about signed chars here... */
778 		REQUIRE(start <= finish, REG_ERANGE);
779 		for (i = start; i <= finish; i++)
780 			CHadd(cs, i);
781 		break;
782 	}
783 }
784 
785 /*
786  - p_b_cclass - parse a character-class name and deal with it
787  */
788 static void
p_b_cclass(struct parse * p,cset * cs)789 p_b_cclass(struct parse *p, cset *cs)
790 {
791 	char *sp = p->next;
792 	struct cclass *cp;
793 	size_t len;
794 	const char *u;
795 	char c;
796 
797 	while (MORE() && isalpha(PEEK()))
798 		NEXT();
799 	len = p->next - sp;
800 	for (cp = cclasses; cp->name != NULL; cp++)
801 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
802 			break;
803 	if (cp->name == NULL) {
804 		/* oops, didn't find it */
805 		SETERROR(REG_ECTYPE);
806 		return;
807 	}
808 
809 	u = cp->chars;
810 	while ((c = *u++) != '\0')
811 		CHadd(cs, c);
812 	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
813 		MCadd(p, cs, u);
814 }
815 
816 /*
817  - p_b_eclass - parse an equivalence-class name and deal with it
818  *
819  * This implementation is incomplete. xxx
820  */
821 static void
p_b_eclass(struct parse * p,cset * cs)822 p_b_eclass(struct parse *p, cset *cs)
823 {
824 	char c;
825 
826 	c = p_b_coll_elem(p, '=');
827 	CHadd(cs, c);
828 }
829 
830 /*
831  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
832  */
833 static char			/* value of symbol */
p_b_symbol(struct parse * p)834 p_b_symbol(struct parse *p)
835 {
836 	char value;
837 
838 	REQUIRE(MORE(), REG_EBRACK);
839 	if (!EATTWO('[', '.'))
840 		return(GETNEXT());
841 
842 	/* collating symbol */
843 	value = p_b_coll_elem(p, '.');
844 	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
845 	return(value);
846 }
847 
848 /*
849  - p_b_coll_elem - parse a collating-element name and look it up
850  */
851 static char			/* value of collating element */
p_b_coll_elem(struct parse * p,int endc)852 p_b_coll_elem(struct parse *p,
853     int endc)			/* name ended by endc,']' */
854 {
855 	char *sp = p->next;
856 	struct cname *cp;
857 	int len;
858 
859 	while (MORE() && !SEETWO(endc, ']'))
860 		NEXT();
861 	if (!MORE()) {
862 		SETERROR(REG_EBRACK);
863 		return(0);
864 	}
865 	len = p->next - sp;
866 	for (cp = cnames; cp->name != NULL; cp++)
867 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
868 			return(cp->code);	/* known name */
869 	if (len == 1)
870 		return(*sp);	/* single character */
871 	SETERROR(REG_ECOLLATE);			/* neither */
872 	return(0);
873 }
874 
875 /*
876  - othercase - return the case counterpart of an alphabetic
877  */
878 static char			/* if no counterpart, return ch */
othercase(int ch)879 othercase(int ch)
880 {
881 	ch = (uch)ch;
882 	assert(isalpha(ch));
883 	if (isupper(ch))
884 		return ((uch)tolower(ch));
885 	else if (islower(ch))
886 		return ((uch)toupper(ch));
887 	else			/* peculiar, but could happen */
888 		return(ch);
889 }
890 
891 /*
892  - bothcases - emit a dualcase version of a two-case character
893  *
894  * Boy, is this implementation ever a kludge...
895  */
896 static void
bothcases(struct parse * p,int ch)897 bothcases(struct parse *p, int ch)
898 {
899 	char *oldnext = p->next;
900 	char *oldend = p->end;
901 	char bracket[3];
902 
903 	ch = (uch)ch;
904 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
905 	p->next = bracket;
906 	p->end = bracket+2;
907 	bracket[0] = ch;
908 	bracket[1] = ']';
909 	bracket[2] = '\0';
910 	p_bracket(p);
911 	assert(p->next == bracket+2);
912 	p->next = oldnext;
913 	p->end = oldend;
914 }
915 
916 /*
917  - ordinary - emit an ordinary character
918  */
919 static void
ordinary(struct parse * p,int ch)920 ordinary(struct parse *p, int ch)
921 {
922 	cat_t *cap = p->g->categories;
923 
924 	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
925 		bothcases(p, ch);
926 	else {
927 		EMIT(OCHAR, (uch)ch);
928 		if (cap[ch] == 0)
929 			cap[ch] = p->g->ncategories++;
930 	}
931 }
932 
933 /*
934  - nonnewline - emit REG_NEWLINE version of OANY
935  *
936  * Boy, is this implementation ever a kludge...
937  */
938 static void
nonnewline(struct parse * p)939 nonnewline(struct parse *p)
940 {
941 	char *oldnext = p->next;
942 	char *oldend = p->end;
943 	char bracket[4];
944 
945 	p->next = bracket;
946 	p->end = bracket+3;
947 	bracket[0] = '^';
948 	bracket[1] = '\n';
949 	bracket[2] = ']';
950 	bracket[3] = '\0';
951 	p_bracket(p);
952 	assert(p->next == bracket+3);
953 	p->next = oldnext;
954 	p->end = oldend;
955 }
956 
957 /*
958  - repeat - generate code for a bounded repetition, recursively if needed
959  */
960 static void
repeat(struct parse * p,sopno start,int from,int to)961 repeat(struct parse *p,
962     sopno start,		/* operand from here to end of strip */
963     int from,			/* repeated from this number */
964     int to)			/* to this number of times (maybe REGEX_INFINITY) */
965 {
966 	sopno finish = HERE();
967 #	define	N	2
968 #	define	INF	3
969 #	define	REP(f, t)	((f)*8 + (t))
970 #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == REGEX_INFINITY) ? INF : N)
971 	sopno copy;
972 
973 	if (p->error != 0)	/* head off possible runaway recursion */
974 		return;
975 
976 	assert(from <= to);
977 
978 	switch (REP(MAP(from), MAP(to))) {
979 	case REP(0, 0):			/* must be user doing this */
980 		DROP(finish-start);	/* drop the operand */
981 		break;
982 	case REP(0, 1):			/* as x{1,1}? */
983 	case REP(0, N):			/* as x{1,n}? */
984 	case REP(0, INF):		/* as x{1,}? */
985 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
986 		INSERT(OCH_, start);		/* offset is wrong... */
987 		repeat(p, start+1, 1, to);
988 		ASTERN(OOR1, start);
989 		AHEAD(start);			/* ... fix it */
990 		EMIT(OOR2, 0);
991 		AHEAD(THERE());
992 		ASTERN(O_CH, THERETHERE());
993 		break;
994 	case REP(1, 1):			/* trivial case */
995 		/* done */
996 		break;
997 	case REP(1, N):			/* as x?x{1,n-1} */
998 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
999 		INSERT(OCH_, start);
1000 		ASTERN(OOR1, start);
1001 		AHEAD(start);
1002 		EMIT(OOR2, 0);			/* offset very wrong... */
1003 		AHEAD(THERE());			/* ...so fix it */
1004 		ASTERN(O_CH, THERETHERE());
1005 		copy = dupl(p, start+1, finish+1);
1006 		assert(copy == finish+4);
1007 		repeat(p, copy, 1, to-1);
1008 		break;
1009 	case REP(1, INF):		/* as x+ */
1010 		INSERT(OPLUS_, start);
1011 		ASTERN(O_PLUS, start);
1012 		break;
1013 	case REP(N, N):			/* as xx{m-1,n-1} */
1014 		copy = dupl(p, start, finish);
1015 		repeat(p, copy, from-1, to-1);
1016 		break;
1017 	case REP(N, INF):		/* as xx{n-1,INF} */
1018 		copy = dupl(p, start, finish);
1019 		repeat(p, copy, from-1, to);
1020 		break;
1021 	default:			/* "can't happen" */
1022 		SETERROR(REG_ASSERT);	/* just in case */
1023 		break;
1024 	}
1025 }
1026 
1027 /*
1028  - seterr - set an error condition
1029  */
1030 static int			/* useless but makes type checking happy */
seterr(struct parse * p,int e)1031 seterr(struct parse *p, int e)
1032 {
1033 	if (p->error == 0)	/* keep earliest error condition */
1034 		p->error = e;
1035 	p->next = nuls;		/* try to bring things to a halt */
1036 	p->end = nuls;
1037 	return(0);		/* make the return value well-defined */
1038 }
1039 
1040 /*
1041  - allocset - allocate a set of characters for []
1042  */
1043 static cset *
allocset(struct parse * p)1044 allocset(struct parse *p)
1045 {
1046 	int no = p->g->ncsets++;
1047 	size_t nc;
1048 	size_t nbytes;
1049 	cset *cs;
1050 	size_t css = (size_t)p->g->csetsize;
1051 	int i;
1052 
1053 	if (no >= p->ncsalloc) {	/* need another column of space */
1054 		void *ptr;
1055 
1056 		p->ncsalloc += CHAR_BIT;
1057 		nc = p->ncsalloc;
1058 		assert(nc % CHAR_BIT == 0);
1059 		nbytes = nc / CHAR_BIT * css;
1060 
1061 		ptr = (cset *)cli_realloc((char *)p->g->sets, nc * sizeof(cset));
1062 		if (ptr == NULL)
1063 			goto nomem;
1064 		p->g->sets = ptr;
1065 
1066 		ptr = (uch *)cli_realloc((char *)p->g->setbits, nbytes);
1067 		if (ptr == NULL)
1068 			goto nomem;
1069 		p->g->setbits = ptr;
1070 
1071 		for (i = 0; i < no; i++)
1072 			p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1073 
1074 		(void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1075 	}
1076 	/* XXX should not happen */
1077 	if (p->g->sets == NULL || p->g->setbits == NULL)
1078 		goto nomem;
1079 
1080 	cs = &p->g->sets[no];
1081 	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1082 	cs->mask = 1 << ((no) % CHAR_BIT);
1083 	cs->hash = 0;
1084 	cs->smultis = 0;
1085 	cs->multis = NULL;
1086 
1087 	return(cs);
1088 nomem:
1089 	free(p->g->sets);
1090 	p->g->sets = NULL;
1091 	free(p->g->setbits);
1092 	p->g->setbits = NULL;
1093 
1094 	SETERROR(REG_ESPACE);
1095 	/* caller's responsibility not to do set ops */
1096 	return(NULL);
1097 }
1098 
1099 /*
1100  - freeset - free a now-unused set
1101  */
1102 static void
freeset(struct parse * p,cset * cs)1103 freeset(struct parse *p, cset *cs)
1104 {
1105 	size_t i;
1106 	cset *top = &p->g->sets[p->g->ncsets];
1107 	size_t css = (size_t)p->g->csetsize;
1108 
1109 	for (i = 0; i < css; i++)
1110 		CHsub(cs, i);
1111 	if (cs == top-1)	/* recover only the easy case */
1112 		p->g->ncsets--;
1113 }
1114 
1115 /*
1116  - freezeset - final processing on a set of characters
1117  *
1118  * The main task here is merging identical sets.  This is usually a waste
1119  * of time (although the hash code minimizes the overhead), but can win
1120  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1121  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1122  * the same value!
1123  */
1124 static int			/* set number */
freezeset(struct parse * p,cset * cs)1125 freezeset(struct parse *p, cset *cs)
1126 {
1127 	uch h = cs->hash;
1128 	size_t i;
1129 	cset *top = &p->g->sets[p->g->ncsets];
1130 	cset *cs2;
1131 	size_t css = (size_t)p->g->csetsize;
1132 
1133 	/* look for an earlier one which is the same */
1134 	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1135 		if (cs2->hash == h && cs2 != cs) {
1136 			/* maybe */
1137 			for (i = 0; i < css; i++)
1138 				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1139 					break;		/* no */
1140 			if (i == css)
1141 				break;			/* yes */
1142 		}
1143 
1144 	if (cs2 < top) {	/* found one */
1145 		freeset(p, cs);
1146 		cs = cs2;
1147 	}
1148 
1149 	return((int)(cs - p->g->sets));
1150 }
1151 
1152 /*
1153  - firstch - return first character in a set (which must have at least one)
1154  */
1155 static int			/* character; there is no "none" value */
firstch(struct parse * p,cset * cs)1156 firstch(struct parse *p, cset *cs)
1157 {
1158 	size_t i;
1159 	size_t css = (size_t)p->g->csetsize;
1160 
1161 	for (i = 0; i < css; i++)
1162 		if (CHIN(cs, i))
1163 			return((char)i);
1164 	assert(never);
1165 	return(0);		/* arbitrary */
1166 }
1167 
1168 /*
1169  - nch - number of characters in a set
1170  */
1171 static int
nch(struct parse * p,cset * cs)1172 nch(struct parse *p, cset *cs)
1173 {
1174 	size_t i;
1175 	size_t css = (size_t)p->g->csetsize;
1176 	int n = 0;
1177 
1178 	for (i = 0; i < css; i++)
1179 		if (CHIN(cs, i))
1180 			n++;
1181 	return(n);
1182 }
1183 
1184 /*
1185  - mcadd - add a collating element to a cset
1186  */
1187 static void
mcadd(struct parse * p,cset * cs,const char * cp)1188 mcadd( struct parse *p, cset *cs, const char *cp)
1189 {
1190 	size_t oldend = cs->smultis;
1191 	void *np;
1192 
1193 	cs->smultis += strlen(cp) + 1;
1194 	np = cli_realloc(cs->multis, cs->smultis);
1195 	if (np == NULL) {
1196 		if (cs->multis)
1197 			free(cs->multis);
1198 		cs->multis = NULL;
1199 		SETERROR(REG_ESPACE);
1200 		return;
1201 	}
1202 	cs->multis = np;
1203 
1204 	cli_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1205 }
1206 
1207 /*
1208  - mcinvert - invert the list of collating elements in a cset
1209  *
1210  * This would have to know the set of possibilities.  Implementation
1211  * is deferred.
1212  */
1213 /* ARGSUSED */
1214 static void
mcinvert(struct parse * p,cset * cs)1215 mcinvert(struct parse *p, cset *cs)
1216 {
1217     UNUSEDPARAM(p);
1218 	assert(cs->multis == NULL);	/* xxx */
1219 }
1220 
1221 /*
1222  - mccase - add case counterparts of the list of collating elements in a cset
1223  *
1224  * This would have to know the set of possibilities.  Implementation
1225  * is deferred.
1226  */
1227 /* ARGSUSED */
1228 static void
mccase(struct parse * p,cset * cs)1229 mccase(struct parse *p, cset *cs)
1230 {
1231     UNUSEDPARAM(p);
1232 	assert(cs->multis == NULL);	/* xxx */
1233 }
1234 
1235 /*
1236  - isinsets - is this character in any sets?
1237  */
1238 static int			/* predicate */
isinsets(struct re_guts * g,int c)1239 isinsets(struct re_guts *g, int c)
1240 {
1241 	uch *col;
1242 	int i;
1243 	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1244 	unsigned uc = (uch)c;
1245 
1246 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1247 		if (col[uc] != 0)
1248 			return(1);
1249 	return(0);
1250 }
1251 
1252 /*
1253  - samesets - are these two characters in exactly the same sets?
1254  */
1255 static int			/* predicate */
samesets(struct re_guts * g,int c1,int c2)1256 samesets(struct re_guts *g, int c1, int c2)
1257 {
1258 	uch *col;
1259 	int i;
1260 	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1261 	unsigned uc1 = (uch)c1;
1262 	unsigned uc2 = (uch)c2;
1263 
1264 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1265 		if (col[uc1] != col[uc2])
1266 			return(0);
1267 	return(1);
1268 }
1269 
1270 /*
1271  - categorize - sort out character categories
1272  */
1273 static void
categorize(struct parse * p,struct re_guts * g)1274 categorize(struct parse *p, struct re_guts *g)
1275 {
1276 	cat_t *cats = g->categories;
1277 	int c;
1278 	int c2;
1279 	cat_t cat;
1280 
1281 	/* avoid making error situations worse */
1282 	if (p->error != 0)
1283 		return;
1284 
1285 	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1286 		if (cats[c] == 0 && isinsets(g, c)) {
1287 			cat = g->ncategories++;
1288 			cats[c] = cat;
1289 			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1290 				if (cats[c2] == 0 && samesets(g, c, c2))
1291 					cats[c2] = cat;
1292 		}
1293 }
1294 
1295 /*
1296  - dupl - emit a duplicate of a bunch of sops
1297  */
1298 static sopno			/* start of duplicate */
dupl(struct parse * p,sopno start,sopno finish)1299 dupl(struct parse *p,
1300     sopno start,		/* from here */
1301     sopno finish)		/* to this less one */
1302 {
1303 	sopno ret = HERE();
1304 	sopno len = finish - start;
1305 
1306 	assert(finish >= start);
1307 	if (len == 0)
1308 		return(ret);
1309 	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1310 	assert(p->ssize >= p->slen + len);
1311 	(void) memmove((char *)(p->strip + p->slen),
1312 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1313 	p->slen += len;
1314 	return(ret);
1315 }
1316 
1317 /*
1318  - doemit - emit a strip operator
1319  *
1320  * It might seem better to implement this as a macro with a function as
1321  * hard-case backup, but it's just too big and messy unless there are
1322  * some changes to the data structures.  Maybe later.
1323  */
1324 static void
doemit(struct parse * p,sop op,size_t opnd)1325 doemit(struct parse *p, sop op, size_t opnd)
1326 {
1327 	/* avoid making error situations worse */
1328 	if (p->error != 0)
1329 		return;
1330 
1331 	/* deal with oversize operands ("can't happen", more or less) */
1332 	assert(opnd < 1<<OPSHIFT);
1333 
1334 	/* deal with undersized strip */
1335 	if (p->slen >= p->ssize)
1336 		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1337 	assert(p->slen < p->ssize);
1338 
1339 	/* finally, it's all reduced to the easy case */
1340 	p->strip[p->slen++] = SOP(op, opnd);
1341 }
1342 
1343 /*
1344  - doinsert - insert a sop into the strip
1345  */
1346 static void
doinsert(struct parse * p,sop op,size_t opnd,sopno pos)1347 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1348 {
1349 	sopno sn;
1350 	sop s;
1351 	int i;
1352 
1353 	/* avoid making error situations worse */
1354 	if (p->error != 0)
1355 		return;
1356 
1357 	sn = HERE();
1358 	EMIT(op, opnd);		/* do checks, ensure space */
1359 	assert(HERE() == sn+1);
1360 	s = p->strip[sn];
1361 
1362 	/* adjust paren pointers */
1363 	assert(pos > 0);
1364 	for (i = 1; i < NPAREN; i++) {
1365 		if (p->pbegin[i] >= pos) {
1366 			p->pbegin[i]++;
1367 		}
1368 		if (p->pend[i] >= pos) {
1369 			p->pend[i]++;
1370 		}
1371 	}
1372 
1373 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1374 						(HERE()-pos-1)*sizeof(sop));
1375 	p->strip[pos] = s;
1376 }
1377 
1378 /*
1379  - dofwd - complete a forward reference
1380  */
1381 static void
dofwd(struct parse * p,sopno pos,sop value)1382 dofwd(struct parse *p, sopno pos, sop value)
1383 {
1384 	/* avoid making error situations worse */
1385 	if (p->error != 0)
1386 		return;
1387 
1388 	assert(value < 1<<OPSHIFT);
1389 	p->strip[pos] = OP(p->strip[pos]) | value;
1390 }
1391 
1392 /*
1393  - enlarge - enlarge the strip
1394  */
1395 static void
enlarge(struct parse * p,sopno size)1396 enlarge(struct parse *p, sopno size)
1397 {
1398 	sop *sp;
1399 
1400 	if (p->ssize >= size)
1401 		return;
1402 
1403 	sp = (sop *)cli_realloc(p->strip, size*sizeof(sop));
1404 	if (sp == NULL) {
1405 		SETERROR(REG_ESPACE);
1406 		return;
1407 	}
1408 	p->strip = sp;
1409 	p->ssize = size;
1410 }
1411 
1412 /*
1413  - stripsnug - compact the strip
1414  */
1415 static void
stripsnug(struct parse * p,struct re_guts * g)1416 stripsnug(struct parse *p, struct re_guts *g)
1417 {
1418 	g->nstates = p->slen;
1419 	g->strip = (sop *)cli_realloc((char *)p->strip, p->slen * sizeof(sop));
1420 	if (g->strip == NULL) {
1421 		SETERROR(REG_ESPACE);
1422 		g->strip = p->strip;
1423 	}
1424 }
1425 
1426 /*
1427  - findmust - fill in must and mlen with longest mandatory literal string
1428  *
1429  * This algorithm could do fancy things like analyzing the operands of |
1430  * for common subsequences.  Someday.  This code is simple and finds most
1431  * of the interesting cases.
1432  *
1433  * Note that must and mlen got initialized during setup.
1434  */
1435 static void
findmust(struct parse * p,struct re_guts * g)1436 findmust(struct parse *p, struct re_guts *g)
1437 {
1438 	sop *scan;
1439 	sop *start = NULL;    /* start initialized in the default case, after that */
1440 	sop *newstart = NULL; /* newstart was initialized in the OCHAR case */
1441 	sopno newlen;
1442 	sop s;
1443 	char *cp;
1444 	sopno i;
1445 
1446 	/* avoid making error situations worse */
1447 	if (p->error != 0)
1448 		return;
1449 
1450 	/* find the longest OCHAR sequence in strip */
1451 	newlen = 0;
1452 	scan = g->strip + 1;
1453 	do {
1454 		s = *scan++;
1455 		switch (OP(s)) {
1456 		case OCHAR:		/* sequence member */
1457 			if (newlen == 0)		/* new sequence */
1458 				newstart = scan - 1;
1459 			newlen++;
1460 			break;
1461 		case OPLUS_:		/* things that don't break one */
1462 		case OLPAREN:
1463 		case ORPAREN:
1464 			break;
1465 		case OQUEST_:		/* things that must be skipped */
1466 		case OCH_:
1467 			scan--;
1468 			do {
1469 				scan += OPND(s);
1470 				s = *scan;
1471 				/* assert() interferes w debug printouts */
1472 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1473 							OP(s) != OOR2) {
1474 					g->iflags |= REGEX_BAD;
1475 					return;
1476 				}
1477 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1478 			/* fallthrough */
1479 		default:		/* things that break a sequence */
1480 			if (newlen > g->mlen) {		/* ends one */
1481 				start = newstart;
1482 				g->mlen = newlen;
1483 			}
1484 			newlen = 0;
1485 			break;
1486 		}
1487 	} while (OP(s) != OEND);
1488 
1489 	if (g->mlen == 0)		/* there isn't one */
1490 		return;
1491 	if (start == NULL) {		/* something went wrong */
1492 		g->mlen = 0;
1493 		return;
1494 	}
1495 
1496 	/* turn it into a character string */
1497 	g->must = cli_malloc((size_t)g->mlen + 1);
1498 	if (g->must == NULL) {		/* argh; just forget it */
1499 		g->mlen = 0;
1500 		return;
1501 	}
1502 	cp = g->must;
1503 	scan = start;
1504 	for (i = g->mlen; i > 0; i--) {
1505 		while (OP(s = *scan++) != OCHAR)
1506 			continue;
1507 		assert(cp < g->must + g->mlen);
1508 		*cp++ = (char)OPND(s);
1509 	}
1510 	assert(cp == g->must + g->mlen);
1511 	*cp++ = '\0';		/* just on general principles */
1512 }
1513 
1514 /*
1515  - pluscount - count + nesting
1516  */
1517 static sopno			/* nesting depth */
pluscount(struct parse * p,struct re_guts * g)1518 pluscount(struct parse *p, struct re_guts *g)
1519 {
1520 	sop *scan;
1521 	sop s;
1522 	sopno plusnest = 0;
1523 	sopno maxnest = 0;
1524 
1525 	if (p->error != 0)
1526 		return(0);	/* there may not be an OEND */
1527 
1528 	scan = g->strip + 1;
1529 	do {
1530 		s = *scan++;
1531 		switch (OP(s)) {
1532 		case OPLUS_:
1533 			plusnest++;
1534 			break;
1535 		case O_PLUS:
1536 			if (plusnest > maxnest)
1537 				maxnest = plusnest;
1538 			plusnest--;
1539 			break;
1540 		}
1541 	} while (OP(s) != OEND);
1542 	if (plusnest != 0)
1543 		g->iflags |= REGEX_BAD;
1544 	return(maxnest);
1545 }
1546