xref: /original-bsd/lib/libc/regex/regcomp.c (revision 860e07fc)
1 /*-
2  * Copyright (c) 1992 Henry Spencer.
3  * Copyright (c) 1992 The Regents of the University of California.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * Henry Spencer of the University of Toronto.
8  *
9  * %sccs.include.redist.c%
10  *
11  *	@(#)regcomp.c	5.1 (Berkeley) 08/06/92
12  */
13 
14 #if defined(LIBC_SCCS) && !defined(lint)
15 static char sccsid[] = "@(#)regcomp.c	5.1 (Berkeley) 08/06/92";
16 #endif /* LIBC_SCCS and not lint */
17 
18 #include <sys/types.h>
19 
20 #include <stdio.h>
21 #include <string.h>
22 #include <ctype.h>
23 #include <limits.h>
24 #include <stdlib.h>
25 #include <assert.h>
26 #include <regex.h>
27 
28 #include "utils.h"
29 #include "regex2.h"
30 
31 #include "cclass.h"
32 #include "cname.h"
33 
34 static uchar nuls[10];		/* place to point scanner in event of error */
35 
36 /*
37  * parse structure, passed up and down to avoid global variables and
38  * other clumsinesses
39  */
40 struct parse {
41 	uchar *next;		/* next character in RE */
42 	int error;		/* has an error been seen? */
43 	sop *strip;		/* malloced strip */
44 	sopno ssize;		/* malloced strip size (allocated) */
45 	sopno slen;		/* malloced strip length (used) */
46 	int ncsalloc;		/* number of csets allocated */
47 	struct re_guts *g;
48 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
49 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
50 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
51 };
52 
53 STATIC void	 doemit __P((struct parse *, sop, size_t));
54 STATIC void	 dofwd __P((struct parse *, sopno, sop));
55 STATIC void	 doinsert __P((struct parse *, sop, size_t, sopno));
56 STATIC sopno	 dupl __P((struct parse *, sopno, sopno));
57 STATIC void	 enlarge __P((struct parse *, sopno));
58 STATIC void	 mcadd __P((struct parse *, cset *, uchar *));
59 STATIC uchar	*mcfind __P((cset *, uchar *));
60 STATIC int	 mcin __P((struct parse *, cset *, uchar *));
61 STATIC void	 mcinvert __P((struct parse *, cset *));
62 STATIC void	 mcsub __P((struct parse *, cset *, uchar *));
63 STATIC int	 seterr __P((struct parse *, int));
64 
65 /*
66  * macros for use with parse structure
67  * BEWARE:  these know that the parse structure is named `p' !!!
68  */
69 #define	PEEK()	((uchar)*p->next)
70 #define	PEEK2()	((uchar)*(p->next+1))
71 #define	SEE(c)	(PEEK() == (c))
72 #define	SEETWO(a, b)	(PEEK() == (a) && PEEK2() == (b))
73 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
74 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
75 #define	NEXT()	(p->next++)
76 #define	NEXT2()	(p->next += 2)
77 #define	NEXTn(n)	(p->next += (n))
78 #define	GETNEXT()	((uchar)*p->next++)
79 #define	SETERROR(e)	seterr(p, (e))
80 #define	REQUIRE(co, e)	((co) || SETERROR(e))
81 #define	MUSTSEE(c, e)	(REQUIRE(PEEK() == (c), e))
82 #define	MUSTEAT(c, e)	(REQUIRE(GETNEXT() == (c), e))
83 #define	MUSTNOTSEE(c, e)	(REQUIRE(PEEK() != (c), e))
84 #define	EMIT(sop, sopnd)	doemit(p, sop, (size_t)(sopnd))
85 #define	INSERT(sop, pos)	doinsert(p, sop, HERE()-(pos)+1, pos)
86 #define	FWD(pos)		dofwd(p, pos, HERE()-(pos))
87 #define	BACK(sop, pos)		EMIT(sop, HERE()-pos)
88 #define	HERE()		(p->slen)
89 #define	THERE()		(p->slen - 1)
90 #define	DROP(n)	(p->slen -= (n))
91 
92 /*
93  - regcomp - interface for parser and compilation
94  */
95 int				/* 0 success, otherwise REG_something */
96 regcomp(preg, pattern, cflags)
97 regex_t *preg;
98 const char *pattern;
99 int cflags;
100 {
101 	struct parse pa;
102 	register struct re_guts *g;
103 	register struct parse *p = &pa;
104 	sopno nstates;
105 	register int i;
106 	STATIC void p_ere();
107 	STATIC void p_bre();
108 	STATIC void stripsnug();
109 	STATIC void categorize();
110 	STATIC void findmust();
111 	STATIC sopno pluscount();
112 
113 	/* do the mallocs early so failure handling is easy */
114 	/* the +NUC here is for the category table */
115 	g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC);
116 	if (g == NULL)
117 		return(REG_ESPACE);
118 	p->ssize = strlen(pattern)/2*3 + 1;
119 	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
120 	p->slen = 0;
121 	if (p->strip == NULL) {
122 		free((char *)g);
123 		return(REG_ESPACE);
124 	}
125 
126 	/* set things up */
127 	p->g = g;
128 	p->next = (uchar *)pattern;
129 	p->error = 0;
130 	p->ncsalloc = 0;
131 	for (i = 0; i < NPAREN; i++) {
132 		p->pbegin[i] = 0;
133 		p->pend[i] = 0;
134 	}
135 	g->csetsize = NUC;
136 	g->sets = NULL;
137 	g->setbits = NULL;
138 	g->ncsets = 0;
139 	g->cflags = cflags;
140 	g->iflags = 0;
141 	g->must = NULL;
142 	g->mlen = 0;
143 	g->nsub = 0;
144 	g->ncategories = 1;	/* category 0 is "everything else" */
145 	g->categories = (uchar *)g + sizeof(struct re_guts);
146 	(void) memset((char *)g->categories, 0, NUC);
147 	g->backrefs = 0;
148 	g->nplus = 0;
149 
150 	/* do it */
151 	EMIT(OEND, 0);
152 	g->firststate = THERE();
153 	if (cflags&REG_EXTENDED)
154 		p_ere(p, '\0');
155 	else
156 		p_bre(p, '\0', '\0');
157 	EMIT(OEND, 0);
158 	g->laststate = THERE();
159 
160 	/* tidy up loose ends and fill things in */
161 	categorize(p, g);
162 	stripsnug(p, g);
163 	findmust(p, g);
164 	g->nplus = pluscount(p, g);
165 	g->magic = MAGIC2;
166 	preg->re_nsub = g->nsub;
167 	preg->re_g = g;
168 	preg->re_magic = MAGIC1;
169 #ifdef NDEBUG
170 	/* not debugging, so can't rely on the assert() in regexec() */
171 	if (g->iflags&BAD)
172 		SETERROR(REG_ASSERT);
173 #endif
174 
175 	/* win or lose, we're done */
176 	if (p->error != 0)	/* lose */
177 		regfree(preg);
178 	return(p->error);
179 }
180 
181 /*
182  - p_ere - ERE parser top level, concatenation and alternation
183  */
184 static void
185 p_ere(p, stop)
186 register struct parse *p;
187 uchar stop;			/* character this ERE should end at */
188 {
189 	STATIC void p_ere_exp();
190 	register uchar c;
191 	register sopno prevback;
192 	register sopno prevfwd;
193 	register sopno conc;
194 	register int first = 1;		/* is this the first alternative? */
195 
196 	for (;;) {
197 		/* do a bunch of concatenated expressions */
198 		conc = HERE();
199 		while ((c = PEEK()) != '|' && c != stop && c != '\0')
200 			p_ere_exp(p);
201 		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
202 
203 		if (!EAT('|'))
204 			break;		/* NOTE BREAK OUT */
205 
206 		if (first) {
207 			INSERT(OCH_, conc);	/* offset is wrong */
208 			prevfwd = conc;
209 			prevback = conc;
210 			first = 0;
211 		}
212 		BACK(OOR1, prevback);
213 		prevback = THERE();
214 		FWD(prevfwd);			/* fix previous offset */
215 		prevfwd = HERE();
216 		EMIT(OOR2, 0);			/* offset is very wrong */
217 	}
218 
219 	if (!first) {		/* tail-end fixups */
220 		FWD(prevfwd);
221 		BACK(O_CH, prevback);
222 	}
223 
224 	assert(SEE(stop) || SEE('\0'));
225 }
226 
227 /*
228  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
229  */
230 static void
231 p_ere_exp(p)
232 register struct parse *p;
233 {
234 	STATIC int p_count();
235 	STATIC void p_bracket();
236 	STATIC void ordinary();
237 	STATIC void nonnewline();
238 	STATIC void repeat();
239 	register uchar c;
240 	register sopno pos;
241 	register sopno sub;
242 	register int count;
243 	register int count2;
244 	register sopno subno;
245 	int wascaret = 0;
246 	/* we call { a repetition if followed by a digit */
247 #	define	ISRPT(c1, c2)	(c1 == '*' || c1 == '+' || c1 == '?' || \
248 						(c1 == '{' && isdigit(c2)))
249 
250 	c = GETNEXT();
251 	assert(c != '\0');	/* caller should have ensured this */
252 
253 	pos = HERE();
254 	switch (c) {
255 	case '(':
256 		MUSTNOTSEE('\0', REG_EPAREN);
257 		p->g->nsub++;
258 		subno = p->g->nsub;
259 		if (subno < NPAREN)
260 			p->pbegin[subno] = HERE();
261 		EMIT(OLPAREN, subno);
262 		if (!SEE(')'))
263 			p_ere(p, ')');
264 		if (subno < NPAREN) {
265 			p->pend[subno] = HERE();
266 			assert(p->pend[subno] != 0);
267 		}
268 		EMIT(ORPAREN, subno);
269 		MUSTEAT(')', REG_EPAREN);
270 		break;
271 #ifndef POSIX_MISTAKE
272 	case ')':		/* happens only if no current unmatched ( */
273 		/*
274 		 * You may ask, why the ifndef?  Because I didn't notice
275 		 * this until slightly too late for 1003.2, and none of the
276 		 * other 1003.2 regular-expression reviewers noticed it at
277 		 * all.  So an unmatched ) is legal POSIX, at least until
278 		 * we can get it fixed.
279 		 */
280 		SETERROR(REG_EPAREN);
281 		break;
282 #endif
283 	case '^':
284 		EMIT(OBOL, 0);
285 		p->g->iflags |= USEBOL;
286 		wascaret = 1;
287 		break;
288 	case '$':
289 		EMIT(OEOL, 0);
290 		p->g->iflags |= USEEOL;
291 		break;
292 	case '|':
293 		SETERROR(REG_EMPTY);
294 		break;
295 	case '*':
296 	case '+':
297 	case '?':
298 		SETERROR(REG_BADRPT);
299 		break;
300 	case '.':
301 		if (p->g->cflags&REG_NEWLINE)
302 			nonnewline(p);
303 		else
304 			EMIT(OANY, 0);
305 		break;
306 	case '[':
307 		p_bracket(p);
308 		break;
309 	case '\\':
310 		c = GETNEXT();
311 		if (c == '^' || c == '.' || c == '[' || c == '$' ||
312 				c == '(' || c == ')' || c == '|' ||
313 				c == '*' || c == '+' || c == '?' ||
314 				c == '{' || c == '\\')
315 			ordinary(p, c);
316 		else
317 			SETERROR(REG_EESCAPE);
318 		break;
319 	case '{':		/* okay as ordinary except if digit follows */
320 		REQUIRE(!isdigit(PEEK()), REG_BADRPT);
321 		/* FALLTHROUGH */
322 	default:
323 		ordinary(p, c);
324 		break;
325 	}
326 
327 	c = PEEK();
328 	if (!ISRPT(c, PEEK2()))
329 		return;		/* no repetition, we're done */
330 	NEXT();
331 
332 	REQUIRE(!wascaret, REG_BADRPT);
333 	switch (c) {
334 	case '*':	/* implemented as +? */
335 		INSERT(OPLUS_, pos);
336 		BACK(O_PLUS, pos);
337 		INSERT(OQUEST_, pos);
338 		BACK(O_QUEST, pos);
339 		break;
340 	case '+':
341 		INSERT(OPLUS_, pos);
342 		BACK(O_PLUS, pos);
343 		break;
344 	case '?':
345 		INSERT(OQUEST_, pos);
346 		BACK(O_QUEST, pos);
347 		break;
348 	case '{':
349 		count = p_count(p);
350 		if (EAT(',')) {
351 			if (isdigit(PEEK())) {
352 				count2 = p_count(p);
353 				REQUIRE(count <= count2, REG_BADBR);
354 			} else		/* single number with comma */
355 				count2 = INFINITY;
356 		} else		/* just a single number */
357 			count2 = count;
358 		repeat(p, pos, count, count2);
359 		if (!EAT('}')) {	/* error heuristics */
360 			while ((c = PEEK()) != '\0' && c != '}')
361 				NEXT();
362 			if (c == '\0')
363 				SETERROR(REG_EBRACE);
364 			else
365 				SETERROR(REG_BADBR);
366 		}
367 		break;
368 	}
369 
370 	c = PEEK();
371 	REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT);
372 }
373 
374 /*
375  - p_bre - BRE parser top level, anchoring and concatenation
376  *
377  * Giving end1 as '\0' essentially eliminates the end1/end2 check.
378  */
379 static void
380 p_bre(p, end1, end2)
381 register struct parse *p;
382 register uchar end1;		/* first terminating character */
383 register uchar end2;		/* second terminating character */
384 {
385 	STATIC int p_simp_re();
386 	register sopno start = HERE();
387 	register int first = 1;			/* first subexpression? */
388 	register int wasdollar;
389 
390 	if (EAT('^')) {
391 		EMIT(OBOL, 0);
392 		p->g->iflags |= USEBOL;
393 	}
394 	while (!SEE('\0') && !SEETWO(end1, end2)) {
395 		wasdollar = p_simp_re(p, first);
396 		first = 0;
397 	}
398 	if (wasdollar) {	/* oops, that was a trailing anchor */
399 		DROP(1);
400 		EMIT(OEOL, 0);
401 		p->g->iflags |= USEEOL;
402 	}
403 
404 	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
405 }
406 
407 /*
408  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
409  */
410 static int			/* was the simple RE an unbackslashed $? */
411 p_simp_re(p, starordinary)
412 register struct parse *p;
413 int starordinary;		/* is a leading * an ordinary character? */
414 {
415 	STATIC int p_count();
416 	STATIC void p_bracket();
417 	STATIC void ordinary();
418 	STATIC void nonnewline();
419 	STATIC void repeat();
420 	register int c;
421 	register int count;
422 	register int count2;
423 	register sopno pos;
424 	register sopno sub;
425 	register int i;
426 	register sopno subno;
427 #	define	BACKSL	(1<<CHAR_BIT)
428 
429 	pos = HERE();		/* repetion op, if any, covers from here */
430 
431 	c = GETNEXT();
432 	assert(c != '\0');	/* caller should have ensured this */
433 	if (c == '\\')
434 		c = BACKSL | PEEK();
435 	switch (c) {
436 	case '.':
437 		if (p->g->cflags&REG_NEWLINE)
438 			nonnewline(p);
439 		else
440 			EMIT(OANY, 0);
441 		break;
442 	case '[':
443 		p_bracket(p);
444 		break;
445 	case BACKSL|'^':
446 	case BACKSL|'.':
447 	case BACKSL|'*':
448 	case BACKSL|'[':
449 	case BACKSL|'$':
450 	case BACKSL|'\\':
451 		ordinary(p, c&~BACKSL);
452 		NEXT();
453 		break;
454 	case BACKSL|'{':
455 		SETERROR(REG_BADRPT);
456 		break;
457 	case BACKSL|'(':
458 		NEXT();
459 		p->g->nsub++;
460 		subno = p->g->nsub;
461 		if (subno < NPAREN)
462 			p->pbegin[subno] = HERE();
463 		EMIT(OLPAREN, subno);
464 		/* the SEE here is an error heuristic */
465 		if (!SEE('\0') && !SEETWO('\\', ')'))
466 			p_bre(p, '\\', ')');
467 		if (subno < NPAREN) {
468 			p->pend[subno] = HERE();
469 			assert(p->pend[subno] != 0);
470 		}
471 		EMIT(ORPAREN, subno);
472 		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
473 		break;
474 	case BACKSL|')':	/* should not get here -- must be user */
475 	case BACKSL|'}':
476 		SETERROR(REG_EPAREN);
477 		break;
478 	case BACKSL|'1':
479 	case BACKSL|'2':
480 	case BACKSL|'3':
481 	case BACKSL|'4':
482 	case BACKSL|'5':
483 	case BACKSL|'6':
484 	case BACKSL|'7':
485 	case BACKSL|'8':
486 	case BACKSL|'9':
487 		i = (c&~BACKSL) - '0';
488 		assert(i < NPAREN);
489 		if (p->pend[i] != 0) {
490 			assert(i <= p->g->nsub);
491 			EMIT(OBACK_, i);
492 			assert(p->pbegin[i] != 0);
493 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
494 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
495 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
496 			EMIT(O_BACK, i);
497 		} else
498 			SETERROR(REG_ESUBREG);
499 		p->g->backrefs = 1;
500 		NEXT();
501 		break;
502 	case '*':
503 		REQUIRE(starordinary, REG_BADRPT);
504 		/* FALLTHROUGH */
505 	default:
506 		if (c & BACKSL) {
507 			SETERROR(REG_EESCAPE);
508 			c &= ~BACKSL;
509 		}
510 		ordinary(p, (uchar)c);
511 		break;
512 	}
513 
514 	if (EAT('*')) {		/* implemented as +? */
515 		INSERT(OPLUS_, pos);
516 		BACK(O_PLUS, pos);
517 		INSERT(OQUEST_, pos);
518 		BACK(O_QUEST, pos);
519 	} else if (EATTWO('\\', '{')) {
520 		count = p_count(p);
521 		if (EAT(',')) {
522 			if (isdigit(PEEK())) {
523 				count2 = p_count(p);
524 				REQUIRE(count <= count2, REG_BADBR);
525 			} else		/* single number with comma */
526 				count2 = INFINITY;
527 		} else		/* just a single number */
528 			count2 = count;
529 		repeat(p, pos, count, count2);
530 		if (!EATTWO('\\', '}')) {	/* error heuristics */
531 			while (!SEE('\0') && !SEETWO('\\', '}'))
532 				NEXT();
533 			if (SEE('\0'))
534 				SETERROR(REG_EBRACE);
535 			else
536 				SETERROR(REG_BADBR);
537 		}
538 	} else if (c == '$')	/* unbackslashed $ not followed by reptn */
539 		return(1);
540 
541 	return(0);
542 }
543 
544 /*
545  - p_count - parse a repetition count
546  */
547 static int			/* the value */
548 p_count(p)
549 register struct parse *p;
550 {
551 	register int count = 0;
552 	register int ndigits = 0;
553 
554 	while (isdigit(PEEK()) && count <= DUPMAX) {
555 		count = count*10 + (GETNEXT() - '0');
556 		ndigits++;
557 	}
558 
559 	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
560 	return(count);
561 }
562 
563 /*
564  - p_bracket - parse a bracketed character list
565  *
566  * Note a significant property of this code:  if the allocset() did SETERROR,
567  * no set operations are done.
568  */
569 static void
570 p_bracket(p)
571 register struct parse *p;
572 {
573 	STATIC void p_b_term();
574 	STATIC cset *allocset();
575 	STATIC int freezeset();
576 	register uchar c;
577 	register cset *cs = allocset(p);
578 	register int invert = 0;
579 
580 	if (EAT('^'))
581 		invert++;	/* make note to invert set at end */
582 	if (EAT(']'))
583 		CHadd(cs, ']');
584 	while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']'))
585 		p_b_term(p, cs);
586 	if (EAT('-'))
587 		CHadd(cs, '-');
588 	MUSTEAT(']', REG_EBRACK);
589 
590 	if (invert && p->error == 0) {
591 		register int i;
592 
593 		for (i = p->g->csetsize - 1; i >= 0; i--)
594 			if (CHIN(cs, i))
595 				CHsub(cs, i);
596 			else
597 				CHadd(cs, i);
598 		if (p->g->cflags&REG_NEWLINE)
599 			CHsub(cs, '\n');
600 		if (cs->multis != NULL)
601 			mcinvert(p, cs);
602 	}
603 	assert(cs->multis == NULL);		/* xxx */
604 	EMIT(OANYOF, freezeset(p, cs));
605 }
606 
607 /*
608  - p_b_term - parse one term of a bracketed character list
609  */
610 static void
611 p_b_term(p, cs)
612 register struct parse *p;
613 register cset *cs;
614 {
615 	STATIC uchar p_b_symbol();
616 	STATIC void p_b_cclass();
617 	STATIC void p_b_eclass();
618 	STATIC uchar othercase();
619 	register uchar c;
620 	register uchar start, finish;
621 	register int i;
622 
623 	/* classify what we've got */
624 	switch (PEEK()) {
625 	case '[':
626 		c = PEEK2();
627 		break;
628 	case '-':
629 		SETERROR(REG_ERANGE);
630 		return;			/* NOTE RETURN */
631 		break;
632 	default:
633 		c = '\0';
634 		break;
635 	}
636 
637 	switch (c) {
638 	case ':':		/* character class */
639 		NEXT2();
640 		c = PEEK();
641 		REQUIRE(c != '\0', REG_EBRACK);
642 		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
643 		p_b_cclass(p, cs);
644 		MUSTNOTSEE('\0', REG_EBRACK);
645 		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
646 		break;
647 	case '=':		/* equivalence class */
648 		NEXT2();
649 		c = PEEK();
650 		REQUIRE(c != '\0', REG_EBRACK);
651 		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
652 		p_b_eclass(p, cs);
653 		MUSTNOTSEE('\0', REG_EBRACK);
654 		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
655 		break;
656 	default:		/* symbol, ordinary character, or range */
657 /* xxx revision needed for multichar stuff */
658 		start = p_b_symbol(p);
659 		if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') {
660 			/* range */
661 			NEXT();
662 			if (EAT('-'))
663 				finish = '-';
664 			else
665 				finish = p_b_symbol(p);
666 		} else
667 			finish = start;
668 		REQUIRE(start <= finish, REG_ERANGE);
669 		for (i = start; i <= finish; i++) {
670 			CHadd(cs, i);
671 			if ((p->g->cflags&REG_ICASE) && isalpha(i)) {
672 				c = othercase((uchar)i);
673 				CHadd(cs, c);
674 			}
675 		}
676 		break;
677 	}
678 }
679 
680 /*
681  - p_b_cclass - parse a character-class name and deal with it
682  */
683 static void
684 p_b_cclass(p, cs)
685 register struct parse *p;
686 register cset *cs;
687 {
688 	register uchar *sb = p->next;
689 	register uchar *se = sb;
690 	register struct cclass *cp;
691 	register int len;
692 	register uchar *u;
693 	register uchar c;
694 
695 	while (isalpha(*se))
696 		se++;
697 	len = se - sb;
698 	NEXTn(len);
699 	for (cp = cclasses; cp->name != NULL; cp++)
700 		if (strncmp(cp->name, (char *)sb, len) == 0 &&
701 		    cp->name[len] == '\0')
702 			break;
703 	if (cp->name == NULL) {
704 		/* oops, didn't find it */
705 		SETERROR(REG_ECTYPE);
706 		return;
707 	}
708 
709 	u = (uchar *)cp->chars;
710 	while ((c = *u++) != '\0')
711 		CHadd(cs, c);
712 	for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1)
713 		MCadd(cs, u);
714 }
715 
716 /*
717  - p_b_eclass - parse an equivalence-class name and deal with it
718  *
719  * This implementation is incomplete. xxx
720  */
721 static void
722 p_b_eclass(p, cs)
723 register struct parse *p;
724 register cset *cs;
725 {
726 	register uchar c;
727 	STATIC uchar p_b_coll_elem();
728 
729 	c = p_b_coll_elem(p, '=');
730 	CHadd(cs, c);
731 }
732 
733 /*
734  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
735  */
736 static uchar			/* value of symbol */
737 p_b_symbol(p)
738 register struct parse *p;
739 {
740 	STATIC uchar p_b_coll_elem();
741 	register uchar value;
742 
743 	if (!EATTWO('[', '.')) {
744 		MUSTNOTSEE('\0', REG_EBRACK);
745 		return(GETNEXT());
746 	}
747 
748 	/* collating symbol */
749 	MUSTNOTSEE('\0', REG_EBRACK);
750 	value = p_b_coll_elem(p, '.');
751 	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
752 	return(value);
753 }
754 
755 /*
756  - p_b_coll_elem - parse a collating-element name and look it up
757  */
758 static uchar			/* value of collating element */
759 p_b_coll_elem(p, endc)
760 register struct parse *p;
761 uchar endc;			/* name ended by endc,']' */
762 {
763 	register uchar *sp = p->next;
764 	register struct cname *cp;
765 	register int len;
766 	register uchar c;
767 
768 	while ((c = PEEK()) != '\0' && !SEETWO(endc, ']'))
769 		NEXT();
770 	if (c == '\0') {
771 		SETERROR(REG_EBRACK);
772 		return(0);
773 	}
774 	len = p->next - sp;
775 	for (cp = cnames; cp->name != NULL; cp++)
776 		if (strncmp(cp->name, (char *)sp, len) == 0 &&
777 		    cp->name[len] == '\0')
778 			return(cp->code);	/* known name */
779 	if (len == 1)
780 		return(*sp);	/* single character */
781 	SETERROR(REG_ECOLLATE);			/* neither */
782 	return(0);
783 }
784 
785 /*
786  - othercase - return the case counterpart of an alphabetic
787  */
788 static uchar
789 othercase(ch)
790 uchar ch;
791 {
792 	assert(isalpha(ch));
793 	if (isupper(ch))
794 		return(tolower(ch));
795 	else if (islower(ch))
796 		return(toupper(ch));
797 	else			/* peculiar, but could happen */
798 		return(ch);
799 }
800 
801 /*
802  - bothcases - emit a dualcase version of a character
803  * Boy, is this implementation ever a kludge...
804  */
805 static void
806 bothcases(p, ch)
807 register struct parse *p;
808 uchar ch;
809 {
810 	register uchar *oldnext;
811 	uchar bracket[3];
812 
813 	oldnext = p->next;
814 	p->next = bracket;
815 	bracket[0] = ch;
816 	bracket[1] = ']';
817 	bracket[2] = '\0';
818 	p_bracket(p);
819 	assert(p->next == bracket+2);
820 	p->next = oldnext;
821 }
822 
823 /*
824  - ordinary - emit an ordinary character
825  */
826 static void
827 ordinary(p, ch)
828 register struct parse *p;
829 register uchar ch;
830 {
831 	register uchar *cap = p->g->categories;
832 
833 	if ((p->g->cflags&REG_ICASE) && isalpha(ch)) {
834 		bothcases(p, ch);
835 		return;
836 	}
837 
838 	EMIT(OCHAR, ch);
839 	if (cap[ch] == 0)
840 		cap[ch] = p->g->ncategories++;
841 }
842 
843 /*
844  - nonnewline - emit REG_NEWLINE version of OANY
845  * Boy, is this implementation ever a kludge...
846  */
847 static void
848 nonnewline(p)
849 register struct parse *p;
850 {
851 	register uchar *oldnext;
852 	uchar bracket[4];
853 
854 	oldnext = p->next;
855 	p->next = bracket;
856 	bracket[0] = '^';
857 	bracket[1] = '\n';
858 	bracket[2] = ']';
859 	bracket[3] = '\0';
860 	p_bracket(p);
861 	assert(p->next == bracket+3);
862 	p->next = oldnext;
863 }
864 
865 /*
866  - repeat - generate code for a bounded repetition, recursively if needed
867  */
868 static void
869 repeat(p, start, from, to)
870 register struct parse *p;
871 sopno start;			/* operand from here to end of strip */
872 int from;			/* repeated from this number */
873 int to;				/* to this number of times (maybe INFINITY) */
874 {
875 	register sopno finish = HERE();
876 #	define	N	2
877 #	define	INF	3
878 #	define	REP(f, t)	((f)*8 + (t))
879 #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
880 	register sopno copy;
881 
882 	if (p->error != 0)	/* head off possible runaway recursion */
883 		return;
884 
885 	assert(from <= to);
886 
887 	switch (REP(MAP(from), MAP(to))) {
888 	case REP(0, 0):			/* must be user doing this */
889 		DROP(finish-start);	/* drop the operand */
890 		break;
891 	case REP(0, 1):			/* as x{1,1}? */
892 	case REP(0, N):			/* as x{1,n}? */
893 	case REP(0, INF):		/* as x{1,}? */
894 		INSERT(OQUEST_, start);		/* offset is wrong... */
895 		repeat(p, start+1, 1, to);
896 		FWD(start);			/* ... fix it */
897 		BACK(O_QUEST, start);
898 		break;
899 	case REP(1, 1):			/* trivial case */
900 		/* done */
901 		break;
902 	case REP(1, N):			/* as x?x{1,n-1} */
903 		INSERT(OQUEST_, start);
904 		BACK(O_QUEST, start);
905 		copy = dupl(p, start+1, finish+1);
906 		assert(copy == finish+2);
907 		repeat(p, copy, 1, to-1);
908 		break;
909 	case REP(1, INF):		/* as x+ */
910 		INSERT(OPLUS_, start);
911 		BACK(O_PLUS, start);
912 		break;
913 	case REP(N, N):			/* as xx{m-1,n-1} */
914 		copy = dupl(p, start, finish);
915 		repeat(p, copy, from-1, to-1);
916 		break;
917 	case REP(N, INF):		/* as xx{n-1,INF} */
918 		copy = dupl(p, start, finish);
919 		repeat(p, copy, from-1, to);
920 		break;
921 	default:			/* "can't happen" */
922 		SETERROR(REG_ASSERT);	/* just in case */
923 		break;
924 	}
925 }
926 
927 /*
928  - seterr - set an error condition
929  */
930 static int			/* useless but makes type checking happy */
931 seterr(p, e)
932 register struct parse *p;
933 int e;
934 {
935 	if (p->error == 0)	/* keep earliest error condition */
936 		p->error = e;
937 	p->next = nuls;		/* try to bring things to a halt */
938 	return(0);		/* make the return value well-defined */
939 }
940 
941 /*
942  - allocset - allocate a set of characters for []
943  */
944 static cset *
945 allocset(p)
946 register struct parse *p;
947 {
948 	register int no = p->g->ncsets++;
949 	register size_t nc;
950 	register size_t nbytes;
951 	register cset *cs;
952 	register int i;
953 	register size_t css = (size_t)p->g->csetsize;
954 
955 	if (no >= p->ncsalloc) {	/* need another column of space */
956 		p->ncsalloc += CHAR_BIT;
957 		nc = p->ncsalloc;
958 		assert(nc % CHAR_BIT == 0);
959 		nbytes = nc / CHAR_BIT * css;
960 		if (p->g->sets == NULL)
961 			p->g->sets = (cset *)malloc(nc * sizeof(cset));
962 		else
963 			p->g->sets = (cset *)realloc((char *)p->g->sets,
964 							nc * sizeof(cset));
965 		if (p->g->setbits == NULL)
966 			p->g->setbits = (uchar *)malloc(nbytes);
967 		else
968 			p->g->setbits = (uchar *)realloc((char *)p->g->setbits,
969 								nbytes);
970 		if (p->g->sets != NULL && p->g->setbits != NULL)
971 			(void) memset((char *)p->g->setbits + (nbytes - css),
972 								0, css);
973 		else {
974 			no = 0;
975 			SETERROR(REG_ESPACE);
976 			/* caller's responsibility not to do set ops */
977 		}
978 	}
979 
980 	assert(p->g->sets != NULL);	/* xxx */
981 	cs = &p->g->sets[no];
982 	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
983 	cs->mask = 1 << ((no) % CHAR_BIT);
984 	cs->hash = 0;
985 	cs->smultis = 0;
986 	cs->multis = NULL;
987 
988 	return(cs);
989 }
990 
991 /*
992  - freezeset - final processing on a set of characters
993  *
994  * The main task here is merging identical sets.  This is usually a waste
995  * of time (although the hash code minimizes the overhead), but can win
996  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
997  * is done using addition rather than xor -- all ASCII [aA] sets xor to
998  * the same value!
999  */
1000 static int			/* set number */
1001 freezeset(p, cs)
1002 register struct parse *p;
1003 register cset *cs;
1004 {
1005 	register uchar h = cs->hash;
1006 	register int i;
1007 	register cset *top = &p->g->sets[p->g->ncsets];
1008 	register uchar c;
1009 	register cset *cs2;
1010 	register size_t css = (size_t)p->g->csetsize;
1011 
1012 	/* look for an earlier one which is the same */
1013 	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1014 		if (cs2->hash == h && cs2 != cs) {
1015 			/* maybe */
1016 			for (i = 0; i < css; i++)
1017 				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1018 					break;		/* no */
1019 			if (i == css)
1020 				break;			/* yes */
1021 		}
1022 
1023 	if (cs2 < top) {	/* found one */
1024 		assert(cs == top-1);
1025 		p->g->ncsets--;
1026 		for (i = 0; i < css; i++)
1027 			CHsub(cs, i);
1028 		cs = cs2;
1029 	}
1030 
1031 	return((int)(cs - p->g->sets));
1032 }
1033 
1034 /*
1035  - mcadd - add a collating element to a cset
1036  */
1037 static void
1038 mcadd(p, cs, cp)
1039 register struct parse *p;
1040 register cset *cs;
1041 register uchar *cp;
1042 {
1043 	register size_t oldend = cs->smultis;
1044 
1045 	cs->smultis += strlen((char *)cp) + 1;
1046 	if (cs->multis == NULL)
1047 		cs->multis = (uchar *)malloc(cs->smultis);
1048 	else
1049 		cs->multis = (uchar *)realloc(cs->multis, cs->smultis);
1050 	if (cs->multis == NULL) {
1051 		SETERROR(REG_ESPACE);
1052 		return;
1053 	}
1054 
1055 	(void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp);
1056 	cs->multis[cs->smultis - 1] = '\0';
1057 }
1058 
1059 /*
1060  - mcsub - subtract a collating element from a cset
1061  */
1062 static void
1063 mcsub(p, cs, cp)
1064 register struct parse *p;
1065 register cset *cs;
1066 register uchar *cp;
1067 {
1068 	register uchar *fp = mcfind(cs, cp);
1069 	register size_t len = strlen((char *)fp);
1070 
1071 	assert(p != NULL);
1072 	(void) memmove((char *)fp, (char *)(fp + len + 1),
1073 				cs->smultis - (fp + len + 1 - cs->multis));
1074 	cs->smultis -= len;
1075 
1076 	if (cs->smultis == 0) {
1077 		free((char *)cs->multis);
1078 		cs->multis = NULL;
1079 		return;
1080 	}
1081 
1082 	cs->multis = (uchar *)realloc(cs->multis, cs->smultis);
1083 	assert(cs->multis != NULL);
1084 }
1085 
1086 /*
1087  - mcin - is a collating element in a cset?
1088  */
1089 static int
1090 mcin(p, cs, cp)
1091 register struct parse *p;
1092 register cset *cs;
1093 register uchar *cp;
1094 {
1095 	return(mcfind(cs, cp) != NULL);
1096 }
1097 
1098 /*
1099  - mcfind - find a collating element in a cset
1100  */
1101 static uchar *
1102 mcfind(cs, cp)
1103 register cset *cs;
1104 register uchar *cp;
1105 {
1106 	register uchar *p;
1107 
1108 	if (cs->multis == NULL)
1109 		return(NULL);
1110 	for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1)
1111 		if (strcmp((char *)cp, (char *)p) == 0)
1112 			return(p);
1113 	return(NULL);
1114 }
1115 
1116 /*
1117  - mcinvert - invert the list of collating elements in a cset
1118  *
1119  * This would have to know the set of possibilities.  Implementation
1120  * is deferred.
1121  */
1122 static void
1123 mcinvert(p, cs)
1124 register struct parse *p;
1125 register cset *cs;
1126 {
1127 	assert(cs->multis == NULL);	/* xxx */
1128 }
1129 
1130 /*
1131  - isinsets - is this character in any sets?
1132  */
1133 static int			/* predicate */
1134 isinsets(g, c)
1135 register struct re_guts *g;
1136 uchar c;
1137 {
1138 	register uchar *col;
1139 	register int i;
1140 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1141 
1142 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1143 		if (col[c] != 0)
1144 			return(1);
1145 	return(0);
1146 }
1147 
1148 /*
1149  - samesets - are these two characters in exactly the same sets?
1150  */
1151 static int			/* predicate */
1152 samesets(g, c1, c2)
1153 register struct re_guts *g;
1154 register uchar c1;
1155 register uchar c2;
1156 {
1157 	register uchar *col;
1158 	register int i;
1159 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1160 
1161 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1162 		if (col[c1] != col[c2])
1163 			return(0);
1164 	return(1);
1165 }
1166 
1167 /*
1168  - categorize - sort out character categories
1169  */
1170 static void
1171 categorize(p, g)
1172 struct parse *p;
1173 register struct re_guts *g;
1174 {
1175 	register uchar *cats = g->categories;
1176 	register unsigned c;
1177 	register unsigned c2;
1178 	register uchar cat;
1179 
1180 	/* avoid making error situations worse */
1181 	if (p->error != 0)
1182 		return;
1183 
1184 	for (c = 0; c < g->csetsize; c++)
1185 		if (cats[c] == 0 && isinsets(g, c)) {
1186 			cat = g->ncategories++;
1187 			cats[c] = cat;
1188 			for (c2 = c+1; c2 < g->csetsize; c2++)
1189 				if (cats[c2] == 0 && samesets(g, c, c2))
1190 					cats[c2] = cat;
1191 		}
1192 }
1193 
1194 /*
1195  - dupl - emit a duplicate of a bunch of sops
1196  */
1197 static sopno			/* start of duplicate */
1198 dupl(p, start, finish)
1199 register struct parse *p;
1200 sopno start;			/* from here */
1201 sopno finish;			/* to this less one */
1202 {
1203 	register int i;
1204 	register sopno ret = HERE();
1205 	register sopno len = finish - start;
1206 
1207 	assert(finish >= start);
1208 	if (len == 0)
1209 		return(ret);
1210 	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1211 	assert(p->ssize >= p->slen + len);
1212 	(void) memcpy((char *)(p->strip + p->slen),
1213 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1214 	p->slen += len;
1215 	return(ret);
1216 }
1217 
1218 /*
1219  - doemit - emit a strip operator
1220  *
1221  * It might seem better to implement this as a macro with a function as
1222  * hard-case backup, but it's just too big and messy unless there are
1223  * some changes to the data structures.  Maybe later.
1224  */
1225 static void
1226 doemit(p, op, opnd)
1227 register struct parse *p;
1228 sop op;
1229 size_t opnd;
1230 {
1231 	/* avoid making error situations worse */
1232 	if (p->error != 0)
1233 		return;
1234 
1235 	/* deal with oversize operands ("can't happen", more or less) */
1236 	assert(opnd < 1<<OPSHIFT);
1237 
1238 	/* deal with undersized strip */
1239 	if (p->slen >= p->ssize)
1240 		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1241 	assert(p->slen < p->ssize);
1242 
1243 	/* finally, it's all reduced to the easy case */
1244 	p->strip[p->slen++] = SOP(op, opnd);
1245 }
1246 
1247 /*
1248  - doinsert - insert a sop into the strip
1249  */
1250 static void
1251 doinsert(p, op, opnd, pos)
1252 register struct parse *p;
1253 sop op;
1254 size_t opnd;
1255 sopno pos;
1256 {
1257 	register sopno sn;
1258 	register sop s;
1259 	register int i;
1260 
1261 	/* avoid making error situations worse */
1262 	if (p->error != 0)
1263 		return;
1264 
1265 	sn = HERE();
1266 	EMIT(op, opnd);		/* do checks, ensure space */
1267 	assert(HERE() == sn+1);
1268 	s = p->strip[sn];
1269 
1270 	/* adjust paren pointers */
1271 	assert(pos > 0);
1272 	for (i = 1; i < NPAREN; i++) {
1273 		if (p->pbegin[i] >= pos) {
1274 			p->pbegin[i]++;
1275 		}
1276 		if (p->pend[i] >= pos) {
1277 			p->pend[i]++;
1278 		}
1279 	}
1280 
1281 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1282 						(HERE()-pos-1)*sizeof(sop));
1283 	p->strip[pos] = s;
1284 }
1285 
1286 /*
1287  - dofwd - complete a forward reference
1288  */
1289 static void
1290 dofwd(p, pos, value)
1291 register struct parse *p;
1292 register sopno pos;
1293 sop value;
1294 {
1295 	/* avoid making error situations worse */
1296 	if (p->error != 0)
1297 		return;
1298 
1299 	assert(value < 1<<OPSHIFT);
1300 	p->strip[pos] = OP(p->strip[pos]) | value;
1301 }
1302 
1303 /*
1304  - enlarge - enlarge the strip
1305  */
1306 static void
1307 enlarge(p, size)
1308 register struct parse *p;
1309 register sopno size;
1310 {
1311 	register sop *sp;
1312 
1313 	if (p->ssize >= size)
1314 		return;
1315 
1316 	sp = (sop *)realloc(p->strip, size*sizeof(sop));
1317 	if (sp == NULL) {
1318 		SETERROR(REG_ESPACE);
1319 		return;
1320 	}
1321 	p->strip = sp;
1322 	p->ssize = size;
1323 }
1324 
1325 /*
1326  - stripsnug - compact the strip
1327  */
1328 static void
1329 stripsnug(p, g)
1330 register struct parse *p;
1331 register struct re_guts *g;
1332 {
1333 	g->nstates = p->slen;
1334 	g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop));
1335 	if (g->strip == NULL) {
1336 		SETERROR(REG_ESPACE);
1337 		g->strip = p->strip;
1338 	}
1339 }
1340 
1341 /*
1342  - findmust - fill in must and mlen with longest mandatory literal string
1343  *
1344  * This algorithm could do fancy things like analyzing the operands of |
1345  * for common subsequences.  Someday.  This code is simple and finds most
1346  * of the interesting cases.
1347  *
1348  * Note that must and mlen got initialized during setup.
1349  */
1350 STATIC void
1351 findmust(p, g)
1352 struct parse *p;
1353 register struct re_guts *g;
1354 {
1355 	register sop *scan;
1356 	sop *start;
1357 	register sop *newstart;
1358 	register sopno newlen;
1359 	register sop s;
1360 	register char *cp;
1361 	register sopno i;
1362 
1363 	/* avoid making error situations worse */
1364 	if (p->error != 0)
1365 		return;
1366 
1367 	/* find the longest OCHAR sequence in strip */
1368 	newlen = 0;
1369 	scan = g->strip + 1;
1370 	do {
1371 		s = *scan++;
1372 		switch (OP(s)) {
1373 		case OCHAR:		/* sequence member */
1374 			if (newlen == 0)		/* new sequence */
1375 				newstart = scan - 1;
1376 			newlen++;
1377 			break;
1378 		case OPLUS_:		/* things that don't break one */
1379 		case OLPAREN:
1380 		case ORPAREN:
1381 			break;
1382 		case OQUEST_:		/* things that must be skipped */
1383 		case OCH_:
1384 			scan--;
1385 			do {
1386 				scan += OPND(s);
1387 				s = *scan;
1388 				/* assert() interferes w debug printouts */
1389 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1390 							OP(s) != OOR2) {
1391 					g->iflags |= BAD;
1392 					return;
1393 				}
1394 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1395 			/* fallthrough */
1396 		default:		/* things that break a sequence */
1397 			if (newlen > g->mlen) {		/* ends one */
1398 				start = newstart;
1399 				g->mlen = newlen;
1400 			}
1401 			newlen = 0;
1402 			break;
1403 		}
1404 	} while (OP(s) != OEND);
1405 
1406 	if (g->mlen == 0)		/* there isn't one */
1407 		return;
1408 
1409 	/* turn it into a character string */
1410 	g->must = malloc((size_t)g->mlen + 1);
1411 	if (g->must == NULL) {		/* argh; just forget it */
1412 		g->mlen = 0;
1413 		return;
1414 	}
1415 	cp = g->must;
1416 	scan = start;
1417 	for (i = g->mlen; i > 0; i--) {
1418 		while (OP(s = *scan++) != OCHAR)
1419 			continue;
1420 		*cp++ = OPND(s);
1421 	}
1422 	*cp++ = '\0';		/* just on general principles */
1423 }
1424 
1425 /*
1426  - pluscount - count + nesting
1427  */
1428 STATIC sopno			/* nesting depth */
1429 pluscount(p, g)
1430 struct parse *p;
1431 register struct re_guts *g;
1432 {
1433 	register sop *scan;
1434 	register sop s;
1435 	register sopno plusnest = 0;
1436 	register sopno maxnest = 0;
1437 
1438 	if (p->error != 0)
1439 		return(0);	/* there may not be an OEND */
1440 
1441 	scan = g->strip + 1;
1442 	do {
1443 		s = *scan++;
1444 		switch (OP(s)) {
1445 		case OPLUS_:
1446 			plusnest++;
1447 			break;
1448 		case O_PLUS:
1449 			if (plusnest > maxnest)
1450 				maxnest = plusnest;
1451 			plusnest--;
1452 			break;
1453 		}
1454 	} while (OP(s) != OEND);
1455 	if (plusnest != 0)
1456 		g->iflags |= BAD;
1457 	return(maxnest);
1458 }
1459