xref: /original-bsd/lib/libc/regex/regcomp.c (revision cf2124ff)
1 /*-
2  * Copyright (c) 1992 Henry Spencer.
3  * Copyright (c) 1992 The Regents of the University of California.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * Henry Spencer of the University of Toronto.
8  *
9  * %sccs.include.redist.c%
10  *
11  *	@(#)regcomp.c	5.5 (Berkeley) 10/23/92
12  */
13 
14 #if defined(LIBC_SCCS) && !defined(lint)
15 static char sccsid[] = "@(#)regcomp.c	5.5 (Berkeley) 10/23/92";
16 #endif /* LIBC_SCCS and not lint */
17 
18 #include <sys/types.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include <ctype.h>
22 #include <limits.h>
23 #include <stdlib.h>
24 #include <regex.h>
25 
26 #include "utils.h"
27 #include "regex2.h"
28 
29 #include "cclass.h"
30 #include "cname.h"
31 
32 /*
33  * parse structure, passed up and down to avoid global variables and
34  * other clumsinesses
35  */
36 struct parse {
37 	uchar *next;		/* next character in RE */
38 	int error;		/* has an error been seen? */
39 	sop *strip;		/* malloced strip */
40 	sopno ssize;		/* malloced strip size (allocated) */
41 	sopno slen;		/* malloced strip length (used) */
42 	int ncsalloc;		/* number of csets allocated */
43 	struct re_guts *g;
44 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
45 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
46 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
47 };
48 
49 static uchar nuls[10];		/* place to point scanner in event of error */
50 
51 /*
52  * macros for use with parse structure
53  * BEWARE:  these know that the parse structure is named `p' !!!
54  */
55 #define	PEEK()	((uchar)*p->next)
56 #define	PEEK2()	((uchar)*(p->next+1))
57 #define	SEE(c)	(PEEK() == (c))
58 #define	SEETWO(a, b)	(PEEK() == (a) && PEEK2() == (b))
59 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
60 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
61 #define	NEXT()	(p->next++)
62 #define	NEXT2()	(p->next += 2)
63 #define	NEXTn(n)	(p->next += (n))
64 #define	GETNEXT()	((uchar)*p->next++)
65 #define	SETERROR(e)	seterr(p, (e))
66 #define	REQUIRE(co, e)	((co) || SETERROR(e))
67 #define	MUSTSEE(c, e)	(REQUIRE(PEEK() == (c), e))
68 #define	MUSTEAT(c, e)	(REQUIRE(GETNEXT() == (c), e))
69 #define	MUSTNOTSEE(c, e)	(REQUIRE(PEEK() != (c), e))
70 #define	EMIT(sop, sopnd)	doemit(p, sop, (size_t)(sopnd))
71 #define	INSERT(sop, pos)	doinsert(p, sop, HERE()-(pos)+1, pos)
72 #define	FWD(pos)		dofwd(p, pos, HERE()-(pos))
73 #define	BACK(sop, pos)		EMIT(sop, HERE()-pos)
74 #define	HERE()		(p->slen)
75 #define	THERE()		(p->slen - 1)
76 #define	DROP(n)	(p->slen -= (n))
77 
78 static cset	*allocset __P((struct parse *));
79 static void	 bothcases __P((struct parse *, u_int));
80 static void	 categorize __P((struct parse *, struct re_guts *));
81 static void	 doemit __P((struct parse *, sop, size_t));
82 static void	 dofwd __P((struct parse *, sopno, sop));
83 static void	 doinsert __P((struct parse *, sop, size_t, sopno));
84 static sopno	 dupl __P((struct parse *, sopno, sopno));
85 static void	 enlarge __P((struct parse *, sopno));
86 static void	 findmust __P((struct parse *, struct re_guts *));
87 static int	 freezeset __P((struct parse *, cset *));
88 static int	 isinsets __P((struct re_guts *, u_int));
89 static void	 mcadd __P((struct parse *, cset *, uchar *));
90 static uchar	*mcfind __P((cset *, u_int *));
91 static int	 mcin __P((struct parse *, cset *, u_int *));
92 static void	 mcinvert __P((struct parse *, cset *));
93 static void	 mcsub __P((struct parse *, cset *, u_int *));
94 static void	 nonnewline __P((struct parse *));
95 static void	 ordinary __P((struct parse *, u_int));
96 static uchar	 othercase __P((u_int));
97 static void	 p_b_cclass __P((struct parse *, cset *));
98 static uchar	 p_b_coll_elem __P((struct parse *, u_int));
99 static void	 p_b_eclass __P((struct parse *, cset *));
100 static uchar	 p_b_symbol __P((struct parse *));
101 static void	 p_b_term __P((struct parse *, cset *));
102 static void	 p_bracket __P((struct parse *));
103 static void	 p_bre  __P((struct parse *, u_int, u_int));
104 static int	 p_count __P((struct parse *));
105 static void	 p_ere  __P((struct parse *, u_int));
106 static void	 p_ere_exp  __P((struct parse *));
107 static int	 p_simp_re __P((struct parse *, int));
108 static sopno	 pluscount __P((struct parse *, struct re_guts *));
109 static void	 repeat __P((struct parse *, sopno, int, int));
110 static int	 samesets __P((struct re_guts *, u_int, u_int));
111 static int	 seterr __P((struct parse *, int));
112 static void	 stripsnug __P((struct parse *, struct re_guts *));
113 
114 /*
115  - regcomp - interface for parser and compilation
116  */
117 int				/* 0 success, otherwise REG_something */
118 regcomp(preg, pattern, cflags)
119 regex_t *preg;
120 const char *pattern;
121 int cflags;
122 {
123 	struct parse pa;
124 	register struct re_guts *g;
125 	register struct parse *p = &pa;
126 	register int i;
127 
128 	/* do the mallocs early so failure handling is easy */
129 	/* the +NUC here is for the category table */
130 	g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC);
131 	if (g == NULL)
132 		return(REG_ESPACE);
133 	p->ssize = strlen(pattern)/2*3 + 1;
134 	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
135 	p->slen = 0;
136 	if (p->strip == NULL) {
137 		free((char *)g);
138 		return(REG_ESPACE);
139 	}
140 
141 	/* set things up */
142 	p->g = g;
143 	p->next = (uchar *)pattern;
144 	p->error = 0;
145 	p->ncsalloc = 0;
146 	for (i = 0; i < NPAREN; i++) {
147 		p->pbegin[i] = 0;
148 		p->pend[i] = 0;
149 	}
150 	g->csetsize = NUC;
151 	g->sets = NULL;
152 	g->setbits = NULL;
153 	g->ncsets = 0;
154 	g->cflags = cflags;
155 	g->iflags = 0;
156 	g->must = NULL;
157 	g->mlen = 0;
158 	g->nsub = 0;
159 	g->ncategories = 1;	/* category 0 is "everything else" */
160 	g->categories = (uchar *)g + sizeof(struct re_guts);
161 	(void) memset((char *)g->categories, 0, NUC);
162 	g->backrefs = 0;
163 	g->nplus = 0;
164 
165 	/* do it */
166 	EMIT(OEND, 0);
167 	g->firststate = THERE();
168 	if (cflags&REG_EXTENDED)
169 		p_ere(p, '\0');
170 	else
171 		p_bre(p, '\0', '\0');
172 	EMIT(OEND, 0);
173 	g->laststate = THERE();
174 
175 	/* tidy up loose ends and fill things in */
176 	categorize(p, g);
177 	stripsnug(p, g);
178 	findmust(p, g);
179 	g->nplus = pluscount(p, g);
180 	g->magic = MAGIC2;
181 	preg->re_nsub = g->nsub;
182 	preg->re_g = g;
183 	preg->re_magic = MAGIC1;
184 #ifndef REDEBUG
185 	/* not debugging, so can't rely on the assert() in regexec() */
186 	if (g->iflags&BAD)
187 		SETERROR(REG_ASSERT);
188 #endif
189 
190 	/* win or lose, we're done */
191 	if (p->error != 0)	/* lose */
192 		regfree(preg);
193 	return(p->error);
194 }
195 
196 /*
197  - p_ere - ERE parser top level, concatenation and alternation
198  */
199 static void
200 p_ere(p, stop)
201 register struct parse *p;
202 u_int stop;			/* character this ERE should end at */
203 {
204 	register uchar c;
205 	register sopno prevback;
206 	register sopno prevfwd;
207 	register sopno conc;
208 	register int first = 1;		/* is this the first alternative? */
209 
210 	for (;;) {
211 		/* do a bunch of concatenated expressions */
212 		conc = HERE();
213 		while ((c = PEEK()) != '|' && c != stop && c != '\0')
214 			p_ere_exp(p);
215 		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
216 
217 		if (!EAT('|'))
218 			break;		/* NOTE BREAK OUT */
219 
220 		if (first) {
221 			INSERT(OCH_, conc);	/* offset is wrong */
222 			prevfwd = conc;
223 			prevback = conc;
224 			first = 0;
225 		}
226 		BACK(OOR1, prevback);
227 		prevback = THERE();
228 		FWD(prevfwd);			/* fix previous offset */
229 		prevfwd = HERE();
230 		EMIT(OOR2, 0);			/* offset is very wrong */
231 	}
232 
233 	if (!first) {		/* tail-end fixups */
234 		FWD(prevfwd);
235 		BACK(O_CH, prevback);
236 	}
237 
238 	assert(SEE(stop) || SEE('\0'));
239 }
240 
241 /*
242  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
243  */
244 static void
245 p_ere_exp(p)
246 register struct parse *p;
247 {
248 	register uchar c;
249 	register sopno pos;
250 	register int count;
251 	register int count2;
252 	register sopno subno;
253 	int wascaret = 0;
254 	/* we call { a repetition if followed by a digit */
255 #	define	ISRPT(c1, c2)	(c1 == '*' || c1 == '+' || c1 == '?' || \
256 						(c1 == '{' && isdigit(c2)))
257 
258 	c = GETNEXT();
259 	assert(c != '\0');	/* caller should have ensured this */
260 
261 	pos = HERE();
262 	switch (c) {
263 	case '(':
264 		MUSTNOTSEE('\0', REG_EPAREN);
265 		p->g->nsub++;
266 		subno = p->g->nsub;
267 		if (subno < NPAREN)
268 			p->pbegin[subno] = HERE();
269 		EMIT(OLPAREN, subno);
270 		if (!SEE(')'))
271 			p_ere(p, ')');
272 		if (subno < NPAREN) {
273 			p->pend[subno] = HERE();
274 			assert(p->pend[subno] != 0);
275 		}
276 		EMIT(ORPAREN, subno);
277 		MUSTEAT(')', REG_EPAREN);
278 		break;
279 #ifndef POSIX_MISTAKE
280 	case ')':		/* happens only if no current unmatched ( */
281 		/*
282 		 * You may ask, why the ifndef?  Because I didn't notice
283 		 * this until slightly too late for 1003.2, and none of the
284 		 * other 1003.2 regular-expression reviewers noticed it at
285 		 * all.  So an unmatched ) is legal POSIX, at least until
286 		 * we can get it fixed.
287 		 */
288 		SETERROR(REG_EPAREN);
289 		break;
290 #endif
291 	case '^':
292 		EMIT(OBOL, 0);
293 		p->g->iflags |= USEBOL;
294 		wascaret = 1;
295 		break;
296 	case '$':
297 		EMIT(OEOL, 0);
298 		p->g->iflags |= USEEOL;
299 		break;
300 	case '|':
301 		SETERROR(REG_EMPTY);
302 		break;
303 	case '*':
304 	case '+':
305 	case '?':
306 		SETERROR(REG_BADRPT);
307 		break;
308 	case '.':
309 		if (p->g->cflags&REG_NEWLINE)
310 			nonnewline(p);
311 		else
312 			EMIT(OANY, 0);
313 		break;
314 	case '[':
315 		p_bracket(p);
316 		break;
317 	case '\\':
318 		c = GETNEXT();
319 #ifdef xxx
320 		if (c == '^' || c == '.' || c == '[' || c == '$' ||
321 				c == '(' || c == ')' || c == '|' ||
322 				c == '*' || c == '+' || c == '?' ||
323 				c == '{' || c == '\\')
324 #else
325 		if (c != '\0')
326 #endif
327 			ordinary(p, c);
328 		else
329 			SETERROR(REG_EESCAPE);
330 		break;
331 	case '{':		/* okay as ordinary except if digit follows */
332 		REQUIRE(!isdigit(PEEK()), REG_BADRPT);
333 		/* FALLTHROUGH */
334 	default:
335 		ordinary(p, c);
336 		break;
337 	}
338 
339 	c = PEEK();
340 	if (!ISRPT(c, PEEK2()))
341 		return;		/* no repetition, we're done */
342 	NEXT();
343 
344 	REQUIRE(!wascaret, REG_BADRPT);
345 	switch (c) {
346 	case '*':	/* implemented as +? */
347 		INSERT(OPLUS_, pos);
348 		BACK(O_PLUS, pos);
349 		INSERT(OQUEST_, pos);
350 		BACK(O_QUEST, pos);
351 		break;
352 	case '+':
353 		INSERT(OPLUS_, pos);
354 		BACK(O_PLUS, pos);
355 		break;
356 	case '?':
357 		INSERT(OQUEST_, pos);
358 		BACK(O_QUEST, pos);
359 		break;
360 	case '{':
361 		count = p_count(p);
362 		if (EAT(',')) {
363 			if (isdigit(PEEK())) {
364 				count2 = p_count(p);
365 				REQUIRE(count <= count2, REG_BADBR);
366 			} else		/* single number with comma */
367 				count2 = INFINITY;
368 		} else		/* just a single number */
369 			count2 = count;
370 		repeat(p, pos, count, count2);
371 		if (!EAT('}')) {	/* error heuristics */
372 			while ((c = PEEK()) != '\0' && c != '}')
373 				NEXT();
374 			if (c == '\0')
375 				SETERROR(REG_EBRACE);
376 			else
377 				SETERROR(REG_BADBR);
378 		}
379 		break;
380 	}
381 
382 	c = PEEK();
383 	REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT);
384 }
385 
386 /*
387  - p_bre - BRE parser top level, anchoring and concatenation
388  * Giving end1 as '\0' essentially eliminates the end1/end2 check.
389  *
390  * This implementation is a bit of a kludge, in that a trailing $ is first
391  * taken as an ordinary character and then revised to be an anchor.  The
392  * only undesirable side effect is that '$' gets included as a character
393  * category in such cases.  This is fairly harmless; not worth fixing.
394  * The amount of lookahead needed to avoid this kludge is excessive,
395  * especially since things like "$*" appear to be legal. xxx
396  */
397 static void
398 p_bre(p, end1, end2)
399 register struct parse *p;
400 register u_int end1;		/* first terminating character */
401 register u_int end2;		/* second terminating character */
402 {
403 	register sopno start = HERE();
404 	register int first = 1;			/* first subexpression? */
405 	register int wasdollar = 0;
406 
407 	if (EAT('^')) {
408 		EMIT(OBOL, 0);
409 		p->g->iflags |= USEBOL;
410 	}
411 	while (!SEE('\0') && !SEETWO(end1, end2)) {
412 		wasdollar = p_simp_re(p, first);
413 		first = 0;
414 	}
415 	if (wasdollar) {	/* oops, that was a trailing anchor */
416 		DROP(1);
417 		EMIT(OEOL, 0);
418 		p->g->iflags |= USEEOL;
419 	}
420 
421 	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
422 }
423 
424 /*
425  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
426  */
427 static int			/* was the simple RE an unbackslashed $? */
428 p_simp_re(p, starordinary)
429 register struct parse *p;
430 int starordinary;		/* is a leading * an ordinary character? */
431 {
432 	register int c;
433 	register int count;
434 	register int count2;
435 	register sopno pos;
436 	register int i;
437 	register sopno subno;
438 #	define	BACKSL	(1<<CHAR_BIT)
439 
440 	pos = HERE();		/* repetion op, if any, covers from here */
441 
442 	c = GETNEXT();
443 	assert(c != '\0');	/* caller should have ensured this */
444 	if (c == '\\')
445 		c = BACKSL | PEEK();
446 	switch (c) {
447 	case '.':
448 		if (p->g->cflags&REG_NEWLINE)
449 			nonnewline(p);
450 		else
451 			EMIT(OANY, 0);
452 		break;
453 	case '[':
454 		p_bracket(p);
455 		break;
456 	case BACKSL|'{':
457 		SETERROR(REG_BADRPT);
458 		break;
459 	case BACKSL|'(':
460 		NEXT();
461 		p->g->nsub++;
462 		subno = p->g->nsub;
463 		if (subno < NPAREN)
464 			p->pbegin[subno] = HERE();
465 		EMIT(OLPAREN, subno);
466 		/* the SEE here is an error heuristic */
467 		if (!SEE('\0') && !SEETWO('\\', ')'))
468 			p_bre(p, '\\', ')');
469 		if (subno < NPAREN) {
470 			p->pend[subno] = HERE();
471 			assert(p->pend[subno] != 0);
472 		}
473 		EMIT(ORPAREN, subno);
474 		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
475 		break;
476 	case BACKSL|')':	/* should not get here -- must be user */
477 	case BACKSL|'}':
478 		SETERROR(REG_EPAREN);
479 		break;
480 	case BACKSL|'1':
481 	case BACKSL|'2':
482 	case BACKSL|'3':
483 	case BACKSL|'4':
484 	case BACKSL|'5':
485 	case BACKSL|'6':
486 	case BACKSL|'7':
487 	case BACKSL|'8':
488 	case BACKSL|'9':
489 		i = (c&~BACKSL) - '0';
490 		assert(i < NPAREN);
491 		if (p->pend[i] != 0) {
492 			assert(i <= p->g->nsub);
493 			EMIT(OBACK_, i);
494 			assert(p->pbegin[i] != 0);
495 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
496 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
497 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
498 			EMIT(O_BACK, i);
499 		} else
500 			SETERROR(REG_ESUBREG);
501 		p->g->backrefs = 1;
502 		NEXT();
503 		break;
504 	case BACKSL|'\0':
505 		SETERROR(REG_EESCAPE);
506 		break;
507 	case '*':
508 		REQUIRE(starordinary, REG_BADRPT);
509 		/* FALLTHROUGH */
510 	default:
511 		if (c & BACKSL)
512 			NEXT();
513 		ordinary(p, (uchar)(c &~ BACKSL));
514 		break;
515 	}
516 
517 	if (EAT('*')) {		/* implemented as +? */
518 		INSERT(OPLUS_, pos);
519 		BACK(O_PLUS, pos);
520 		INSERT(OQUEST_, pos);
521 		BACK(O_QUEST, pos);
522 	} else if (EATTWO('\\', '{')) {
523 		count = p_count(p);
524 		if (EAT(',')) {
525 			if (isdigit(PEEK())) {
526 				count2 = p_count(p);
527 				REQUIRE(count <= count2, REG_BADBR);
528 			} else		/* single number with comma */
529 				count2 = INFINITY;
530 		} else		/* just a single number */
531 			count2 = count;
532 		repeat(p, pos, count, count2);
533 		if (!EATTWO('\\', '}')) {	/* error heuristics */
534 			while (!SEE('\0') && !SEETWO('\\', '}'))
535 				NEXT();
536 			if (SEE('\0'))
537 				SETERROR(REG_EBRACE);
538 			else
539 				SETERROR(REG_BADBR);
540 		}
541 	} else if (c == '$')	/* unbackslashed $ not followed by reptn */
542 		return(1);
543 
544 	return(0);
545 }
546 
547 /*
548  - p_count - parse a repetition count
549  */
550 static int			/* the value */
551 p_count(p)
552 register struct parse *p;
553 {
554 	register int count = 0;
555 	register int ndigits = 0;
556 
557 	while (isdigit(PEEK()) && count <= DUPMAX) {
558 		count = count*10 + (GETNEXT() - '0');
559 		ndigits++;
560 	}
561 
562 	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
563 	return(count);
564 }
565 
566 /*
567  - p_bracket - parse a bracketed character list
568  *
569  * Note a significant property of this code:  if the allocset() did SETERROR,
570  * no set operations are done.
571  */
572 static void
573 p_bracket(p)
574 register struct parse *p;
575 {
576 	register uchar c;
577 	register cset *cs = allocset(p);
578 	register int invert = 0;
579 
580 	if (EAT('^'))
581 		invert++;	/* make note to invert set at end */
582 	if (EAT(']'))
583 		CHadd(cs, ']');
584 	else if (EAT('-'))
585 		CHadd(cs, '-');
586 	while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']'))
587 		p_b_term(p, cs);
588 	if (EAT('-'))
589 		CHadd(cs, '-');
590 	MUSTEAT(']', REG_EBRACK);
591 
592 	if (invert && p->error == 0) {
593 		register int i;
594 
595 		for (i = p->g->csetsize - 1; i >= 0; i--)
596 			if (CHIN(cs, i))
597 				CHsub(cs, i);
598 			else
599 				CHadd(cs, i);
600 		if (p->g->cflags&REG_NEWLINE)
601 			CHsub(cs, '\n');
602 		if (cs->multis != NULL)
603 			mcinvert(p, cs);
604 	}
605 	assert(cs->multis == NULL);		/* xxx */
606 	EMIT(OANYOF, freezeset(p, cs));
607 }
608 
609 /*
610  - p_b_term - parse one term of a bracketed character list
611  */
612 static void
613 p_b_term(p, cs)
614 register struct parse *p;
615 register cset *cs;
616 {
617 	register uchar c;
618 	register uchar start, finish;
619 	register int i;
620 
621 	/* classify what we've got */
622 	switch (PEEK()) {
623 	case '[':
624 		c = PEEK2();
625 		break;
626 	case '-':
627 		SETERROR(REG_ERANGE);
628 		return;			/* NOTE RETURN */
629 		break;
630 	default:
631 		c = '\0';
632 		break;
633 	}
634 
635 	switch (c) {
636 	case ':':		/* character class */
637 		NEXT2();
638 		c = PEEK();
639 		REQUIRE(c != '\0', REG_EBRACK);
640 		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
641 		p_b_cclass(p, cs);
642 		MUSTNOTSEE('\0', REG_EBRACK);
643 		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
644 		break;
645 	case '=':		/* equivalence class */
646 		NEXT2();
647 		c = PEEK();
648 		REQUIRE(c != '\0', REG_EBRACK);
649 		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
650 		p_b_eclass(p, cs);
651 		MUSTNOTSEE('\0', REG_EBRACK);
652 		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
653 		break;
654 	default:		/* symbol, ordinary character, or range */
655 /* xxx revision needed for multichar stuff */
656 		start = p_b_symbol(p);
657 		if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') {
658 			/* range */
659 			NEXT();
660 			if (EAT('-'))
661 				finish = '-';
662 			else
663 				finish = p_b_symbol(p);
664 		} else
665 			finish = start;
666 		REQUIRE(start <= finish, REG_ERANGE);
667 		for (i = start; i <= finish; i++) {
668 			CHadd(cs, i);
669 			if ((p->g->cflags&REG_ICASE) && isalpha(i)) {
670 				c = othercase((uchar)i);
671 				CHadd(cs, c);
672 			}
673 		}
674 		break;
675 	}
676 }
677 
678 /*
679  - p_b_cclass - parse a character-class name and deal with it
680  */
681 static void
682 p_b_cclass(p, cs)
683 register struct parse *p;
684 register cset *cs;
685 {
686 	register uchar *sb = p->next;
687 	register uchar *se = sb;
688 	register struct cclass *cp;
689 	register int len;
690 	register uchar *u;
691 	register uchar c;
692 
693 	while (isalpha(*se))
694 		se++;
695 	len = se - sb;
696 	NEXTn(len);
697 	for (cp = cclasses; cp->name != NULL; cp++)
698 		if (strncmp(cp->name, (char *)sb, len) == 0 &&
699 							cp->name[len] == '\0')
700 			break;
701 	if (cp->name == NULL) {
702 		/* oops, didn't find it */
703 		SETERROR(REG_ECTYPE);
704 		return;
705 	}
706 
707 	u = (uchar *)cp->chars;
708 	while ((c = *u++) != '\0')
709 		CHadd(cs, c);
710 	for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1)
711 		MCadd(cs, u);
712 }
713 
714 /*
715  - p_b_eclass - parse an equivalence-class name and deal with it
716  *
717  * This implementation is incomplete. xxx
718  */
719 static void
720 p_b_eclass(p, cs)
721 register struct parse *p;
722 register cset *cs;
723 {
724 	register uchar c;
725 
726 	c = p_b_coll_elem(p, '=');
727 	CHadd(cs, c);
728 }
729 
730 /*
731  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
732  */
733 static uchar			/* value of symbol */
734 p_b_symbol(p)
735 register struct parse *p;
736 {
737 	register uchar value;
738 
739 	if (!EATTWO('[', '.')) {
740 		MUSTNOTSEE('\0', REG_EBRACK);
741 		return(GETNEXT());
742 	}
743 
744 	/* collating symbol */
745 	MUSTNOTSEE('\0', REG_EBRACK);
746 	value = p_b_coll_elem(p, '.');
747 	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
748 	return(value);
749 }
750 
751 /*
752  - p_b_coll_elem - parse a collating-element name and look it up
753  */
754 static uchar			/* value of collating element */
755 p_b_coll_elem(p, endc)
756 register struct parse *p;
757 u_int endc;			/* name ended by endc,']' */
758 {
759 	register uchar *sp = p->next;
760 	register struct cname *cp;
761 	register int len;
762 	register uchar c;
763 
764 	while ((c = PEEK()) != '\0' && !SEETWO(endc, ']'))
765 		NEXT();
766 	if (c == '\0') {
767 		SETERROR(REG_EBRACK);
768 		return(0);
769 	}
770 	len = p->next - sp;
771 	for (cp = cnames; cp->name != NULL; cp++)
772 		if (strncmp(cp->name, (char *)sp, len) == 0 &&
773 							cp->name[len] == '\0')
774 			return(cp->code);	/* known name */
775 	if (len == 1)
776 		return(*sp);	/* single character */
777 	SETERROR(REG_ECOLLATE);			/* neither */
778 	return(0);
779 }
780 
781 /*
782  - othercase - return the case counterpart of an alphabetic
783  */
784 static uchar
785 othercase(ch)
786 u_int ch;
787 {
788 	assert(isalpha(ch));
789 	if (isupper(ch))
790 		return(tolower(ch));
791 	else if (islower(ch))
792 		return(toupper(ch));
793 	else			/* peculiar, but could happen */
794 		return(ch);
795 }
796 
797 /*
798  - bothcases - emit a dualcase version of a character
799  *
800  * Boy, is this implementation ever a kludge...
801  */
802 static void
803 bothcases(p, ch)
804 register struct parse *p;
805 u_int ch;
806 {
807 	register uchar *oldnext;
808 	uchar bracket[3];
809 
810 	oldnext = p->next;
811 	p->next = bracket;
812 	bracket[0] = ch;
813 	bracket[1] = ']';
814 	bracket[2] = '\0';
815 	p_bracket(p);
816 	assert(p->next == bracket+2);
817 	p->next = oldnext;
818 }
819 
820 /*
821  - ordinary - emit an ordinary character
822  */
823 static void
824 ordinary(p, ch)
825 register struct parse *p;
826 register u_int ch;
827 {
828 	register uchar *cap = p->g->categories;
829 
830 	if ((p->g->cflags&REG_ICASE) && isalpha(ch)) {
831 		bothcases(p, ch);
832 		return;
833 	}
834 
835 	EMIT(OCHAR, ch);
836 	if (cap[ch] == 0)
837 		cap[ch] = p->g->ncategories++;
838 }
839 
840 /*
841  - nonnewline - emit REG_NEWLINE version of OANY
842  *
843  * Boy, is this implementation ever a kludge...
844  */
845 static void
846 nonnewline(p)
847 register struct parse *p;
848 {
849 	register uchar *oldnext;
850 	uchar bracket[4];
851 
852 	oldnext = p->next;
853 	p->next = bracket;
854 	bracket[0] = '^';
855 	bracket[1] = '\n';
856 	bracket[2] = ']';
857 	bracket[3] = '\0';
858 	p_bracket(p);
859 	assert(p->next == bracket+3);
860 	p->next = oldnext;
861 }
862 
863 /*
864  - repeat - generate code for a bounded repetition, recursively if needed
865  */
866 static void
867 repeat(p, start, from, to)
868 register struct parse *p;
869 sopno start;			/* operand from here to end of strip */
870 int from;			/* repeated from this number */
871 int to;				/* to this number of times (maybe INFINITY) */
872 {
873 	register sopno finish = HERE();
874 #	define	N	2
875 #	define	INF	3
876 #	define	REP(f, t)	((f)*8 + (t))
877 #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
878 	register sopno copy;
879 
880 	if (p->error != 0)	/* head off possible runaway recursion */
881 		return;
882 
883 	assert(from <= to);
884 
885 	switch (REP(MAP(from), MAP(to))) {
886 	case REP(0, 0):			/* must be user doing this */
887 		DROP(finish-start);	/* drop the operand */
888 		break;
889 	case REP(0, 1):			/* as x{1,1}? */
890 	case REP(0, N):			/* as x{1,n}? */
891 	case REP(0, INF):		/* as x{1,}? */
892 		INSERT(OQUEST_, start);		/* offset is wrong... */
893 		repeat(p, start+1, 1, to);
894 		FWD(start);			/* ... fix it */
895 		BACK(O_QUEST, start);
896 		break;
897 	case REP(1, 1):			/* trivial case */
898 		/* done */
899 		break;
900 	case REP(1, N):			/* as x?x{1,n-1} */
901 		INSERT(OQUEST_, start);
902 		BACK(O_QUEST, start);
903 		copy = dupl(p, start+1, finish+1);
904 		assert(copy == finish+2);
905 		repeat(p, copy, 1, to-1);
906 		break;
907 	case REP(1, INF):		/* as x+ */
908 		INSERT(OPLUS_, start);
909 		BACK(O_PLUS, start);
910 		break;
911 	case REP(N, N):			/* as xx{m-1,n-1} */
912 		copy = dupl(p, start, finish);
913 		repeat(p, copy, from-1, to-1);
914 		break;
915 	case REP(N, INF):		/* as xx{n-1,INF} */
916 		copy = dupl(p, start, finish);
917 		repeat(p, copy, from-1, to);
918 		break;
919 	default:			/* "can't happen" */
920 		SETERROR(REG_ASSERT);	/* just in case */
921 		break;
922 	}
923 }
924 
925 /*
926  - seterr - set an error condition
927  */
928 static int			/* useless but makes type checking happy */
929 seterr(p, e)
930 register struct parse *p;
931 int e;
932 {
933 	if (p->error == 0)	/* keep earliest error condition */
934 		p->error = e;
935 	p->next = nuls;		/* try to bring things to a halt */
936 	return(0);		/* make the return value well-defined */
937 }
938 
939 /*
940  - allocset - allocate a set of characters for []
941  */
942 static cset *
943 allocset(p)
944 register struct parse *p;
945 {
946 	register int no = p->g->ncsets++;
947 	register size_t nc;
948 	register size_t nbytes;
949 	register cset *cs;
950 	register size_t css = (size_t)p->g->csetsize;
951 
952 	if (no >= p->ncsalloc) {	/* need another column of space */
953 		p->ncsalloc += CHAR_BIT;
954 		nc = p->ncsalloc;
955 		assert(nc % CHAR_BIT == 0);
956 		nbytes = nc / CHAR_BIT * css;
957 		if (p->g->sets == NULL)
958 			p->g->sets = (cset *)malloc(nc * sizeof(cset));
959 		else
960 			p->g->sets = (cset *)realloc((char *)p->g->sets,
961 							nc * sizeof(cset));
962 		if (p->g->setbits == NULL)
963 			p->g->setbits = (uchar *)malloc(nbytes);
964 		else
965 			p->g->setbits = (uchar *)realloc((char *)p->g->setbits,
966 								nbytes);
967 		if (p->g->sets != NULL && p->g->setbits != NULL)
968 			(void) memset((char *)p->g->setbits + (nbytes - css),
969 								0, css);
970 		else {
971 			no = 0;
972 			SETERROR(REG_ESPACE);
973 			/* caller's responsibility not to do set ops */
974 		}
975 	}
976 
977 	assert(p->g->sets != NULL);	/* xxx */
978 	cs = &p->g->sets[no];
979 	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
980 	cs->mask = 1 << ((no) % CHAR_BIT);
981 	cs->hash = 0;
982 	cs->smultis = 0;
983 	cs->multis = NULL;
984 
985 	return(cs);
986 }
987 
988 /*
989  - freezeset - final processing on a set of characters
990  *
991  * The main task here is merging identical sets.  This is usually a waste
992  * of time (although the hash code minimizes the overhead), but can win
993  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
994  * is done using addition rather than xor -- all ASCII [aA] sets xor to
995  * the same value!
996  */
997 static int			/* set number */
998 freezeset(p, cs)
999 register struct parse *p;
1000 register cset *cs;
1001 {
1002 	register uchar h = cs->hash;
1003 	register int i;
1004 	register cset *top = &p->g->sets[p->g->ncsets];
1005 	register cset *cs2;
1006 	register size_t css = (size_t)p->g->csetsize;
1007 
1008 	/* look for an earlier one which is the same */
1009 	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1010 		if (cs2->hash == h && cs2 != cs) {
1011 			/* maybe */
1012 			for (i = 0; i < css; i++)
1013 				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1014 					break;		/* no */
1015 			if (i == css)
1016 				break;			/* yes */
1017 		}
1018 
1019 	if (cs2 < top) {	/* found one */
1020 		assert(cs == top-1);
1021 		p->g->ncsets--;
1022 		for (i = 0; i < css; i++)
1023 			CHsub(cs, i);
1024 		cs = cs2;
1025 	}
1026 
1027 	return((int)(cs - p->g->sets));
1028 }
1029 
1030 /*
1031  - mcadd - add a collating element to a cset
1032  */
1033 static void
1034 mcadd(p, cs, cp)
1035 register struct parse *p;
1036 register cset *cs;
1037 register uchar *cp;
1038 {
1039 	register size_t oldend = cs->smultis;
1040 
1041 	cs->smultis += strlen((char *)cp) + 1;
1042 	if (cs->multis == NULL)
1043 		cs->multis = (uchar *)malloc(cs->smultis);
1044 	else
1045 		cs->multis = (uchar *)realloc(cs->multis, cs->smultis);
1046 	if (cs->multis == NULL) {
1047 		SETERROR(REG_ESPACE);
1048 		return;
1049 	}
1050 
1051 	(void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp);
1052 	cs->multis[cs->smultis - 1] = '\0';
1053 }
1054 
1055 /*
1056  - mcsub - subtract a collating element from a cset
1057  */
1058 static void
1059 mcsub(p, cs, cp)
1060 register struct parse *p;
1061 register cset *cs;
1062 register u_int *cp;
1063 {
1064 	register uchar *fp = mcfind(cs, cp);
1065 	register size_t len = strlen((char *)fp);
1066 
1067 	assert(p != NULL);
1068 	(void) memmove((char *)fp, (char *)(fp + len + 1),
1069 				cs->smultis - (fp + len + 1 - cs->multis));
1070 	cs->smultis -= len;
1071 
1072 	if (cs->smultis == 0) {
1073 		free((char *)cs->multis);
1074 		cs->multis = NULL;
1075 		return;
1076 	}
1077 
1078 	cs->multis = (uchar *)realloc(cs->multis, cs->smultis);
1079 	assert(cs->multis != NULL);
1080 }
1081 
1082 /*
1083  - mcin - is a collating element in a cset?
1084  */
1085 static int
1086 mcin(p, cs, cp)
1087 register struct parse *p;
1088 register cset *cs;
1089 register u_int *cp;
1090 {
1091 	return(mcfind(cs, cp) != NULL);
1092 }
1093 
1094 /*
1095  - mcfind - find a collating element in a cset
1096  */
1097 static uchar *
1098 mcfind(cs, cp)
1099 register cset *cs;
1100 register u_int *cp;
1101 {
1102 	register uchar *p;
1103 
1104 	if (cs->multis == NULL)
1105 		return(NULL);
1106 	for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1)
1107 		if (strcmp((char *)cp, (char *)p) == 0)
1108 			return(p);
1109 	return(NULL);
1110 }
1111 
1112 /*
1113  - mcinvert - invert the list of collating elements in a cset
1114  *
1115  * This would have to know the set of possibilities.  Implementation
1116  * is deferred.
1117  */
1118 static void
1119 mcinvert(p, cs)
1120 register struct parse *p;
1121 register cset *cs;
1122 {
1123 	assert(cs->multis == NULL);	/* xxx */
1124 }
1125 
1126 /*
1127  - isinsets - is this character in any sets?
1128  */
1129 static int			/* predicate */
1130 isinsets(g, c)
1131 register struct re_guts *g;
1132 u_int c;
1133 {
1134 	register uchar *col;
1135 	register int i;
1136 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1137 
1138 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1139 		if (col[c] != 0)
1140 			return(1);
1141 	return(0);
1142 }
1143 
1144 /*
1145  - samesets - are these two characters in exactly the same sets?
1146  */
1147 static int			/* predicate */
1148 samesets(g, c1, c2)
1149 register struct re_guts *g;
1150 register u_int c1;
1151 register u_int c2;
1152 {
1153 	register uchar *col;
1154 	register int i;
1155 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1156 
1157 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1158 		if (col[c1] != col[c2])
1159 			return(0);
1160 	return(1);
1161 }
1162 
1163 /*
1164  - categorize - sort out character categories
1165  */
1166 static void
1167 categorize(p, g)
1168 struct parse *p;
1169 register struct re_guts *g;
1170 {
1171 	register uchar *cats = g->categories;
1172 	register unsigned c;
1173 	register unsigned c2;
1174 	register uchar cat;
1175 
1176 	/* avoid making error situations worse */
1177 	if (p->error != 0)
1178 		return;
1179 
1180 	for (c = 0; c < g->csetsize; c++)
1181 		if (cats[c] == 0 && isinsets(g, c)) {
1182 			cat = g->ncategories++;
1183 			cats[c] = cat;
1184 			for (c2 = c+1; c2 < g->csetsize; c2++)
1185 				if (cats[c2] == 0 && samesets(g, c, c2))
1186 					cats[c2] = cat;
1187 		}
1188 }
1189 
1190 /*
1191  - dupl - emit a duplicate of a bunch of sops
1192  */
1193 static sopno			/* start of duplicate */
1194 dupl(p, start, finish)
1195 register struct parse *p;
1196 sopno start;			/* from here */
1197 sopno finish;			/* to this less one */
1198 {
1199 	register sopno ret = HERE();
1200 	register sopno len = finish - start;
1201 
1202 	assert(finish >= start);
1203 	if (len == 0)
1204 		return(ret);
1205 	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1206 	assert(p->ssize >= p->slen + len);
1207 	(void) memcpy((char *)(p->strip + p->slen),
1208 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1209 	p->slen += len;
1210 	return(ret);
1211 }
1212 
1213 /*
1214  - doemit - emit a strip operator
1215  *
1216  * It might seem better to implement this as a macro with a function as
1217  * hard-case backup, but it's just too big and messy unless there are
1218  * some changes to the data structures.  Maybe later.
1219  */
1220 static void
1221 doemit(p, op, opnd)
1222 register struct parse *p;
1223 sop op;
1224 size_t opnd;
1225 {
1226 	/* avoid making error situations worse */
1227 	if (p->error != 0)
1228 		return;
1229 
1230 	/* deal with oversize operands ("can't happen", more or less) */
1231 	assert(opnd < 1<<OPSHIFT);
1232 
1233 	/* deal with undersized strip */
1234 	if (p->slen >= p->ssize)
1235 		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1236 	assert(p->slen < p->ssize);
1237 
1238 	/* finally, it's all reduced to the easy case */
1239 	p->strip[p->slen++] = SOP(op, opnd);
1240 }
1241 
1242 /*
1243  - doinsert - insert a sop into the strip
1244  */
1245 static void
1246 doinsert(p, op, opnd, pos)
1247 register struct parse *p;
1248 sop op;
1249 size_t opnd;
1250 sopno pos;
1251 {
1252 	register sopno sn;
1253 	register sop s;
1254 	register int i;
1255 
1256 	/* avoid making error situations worse */
1257 	if (p->error != 0)
1258 		return;
1259 
1260 	sn = HERE();
1261 	EMIT(op, opnd);		/* do checks, ensure space */
1262 	assert(HERE() == sn+1);
1263 	s = p->strip[sn];
1264 
1265 	/* adjust paren pointers */
1266 	assert(pos > 0);
1267 	for (i = 1; i < NPAREN; i++) {
1268 		if (p->pbegin[i] >= pos) {
1269 			p->pbegin[i]++;
1270 		}
1271 		if (p->pend[i] >= pos) {
1272 			p->pend[i]++;
1273 		}
1274 	}
1275 
1276 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1277 						(HERE()-pos-1)*sizeof(sop));
1278 	p->strip[pos] = s;
1279 }
1280 
1281 /*
1282  - dofwd - complete a forward reference
1283  */
1284 static void
1285 dofwd(p, pos, value)
1286 register struct parse *p;
1287 register sopno pos;
1288 sop value;
1289 {
1290 	/* avoid making error situations worse */
1291 	if (p->error != 0)
1292 		return;
1293 
1294 	assert(value < 1<<OPSHIFT);
1295 	p->strip[pos] = OP(p->strip[pos]) | value;
1296 }
1297 
1298 /*
1299  - enlarge - enlarge the strip
1300  */
1301 static void
1302 enlarge(p, size)
1303 register struct parse *p;
1304 register sopno size;
1305 {
1306 	register sop *sp;
1307 
1308 	if (p->ssize >= size)
1309 		return;
1310 
1311 	sp = (sop *)realloc(p->strip, size*sizeof(sop));
1312 	if (sp == NULL) {
1313 		SETERROR(REG_ESPACE);
1314 		return;
1315 	}
1316 	p->strip = sp;
1317 	p->ssize = size;
1318 }
1319 
1320 /*
1321  - stripsnug - compact the strip
1322  */
1323 static void
1324 stripsnug(p, g)
1325 register struct parse *p;
1326 register struct re_guts *g;
1327 {
1328 	g->nstates = p->slen;
1329 	g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop));
1330 	if (g->strip == NULL) {
1331 		SETERROR(REG_ESPACE);
1332 		g->strip = p->strip;
1333 	}
1334 }
1335 
1336 /*
1337  - findmust - fill in must and mlen with longest mandatory literal string
1338  *
1339  * This algorithm could do fancy things like analyzing the operands of |
1340  * for common subsequences.  Someday.  This code is simple and finds most
1341  * of the interesting cases.
1342  *
1343  * Note that must and mlen got initialized during setup.
1344  */
1345 static void
1346 findmust(p, g)
1347 struct parse *p;
1348 register struct re_guts *g;
1349 {
1350 	register sop *scan;
1351 	sop *start;
1352 	register sop *newstart;
1353 	register sopno newlen;
1354 	register sop s;
1355 	register char *cp;
1356 	register sopno i;
1357 
1358 	/* avoid making error situations worse */
1359 	if (p->error != 0)
1360 		return;
1361 
1362 	/* find the longest OCHAR sequence in strip */
1363 	newlen = 0;
1364 	scan = g->strip + 1;
1365 	do {
1366 		s = *scan++;
1367 		switch (OP(s)) {
1368 		case OCHAR:		/* sequence member */
1369 			if (newlen == 0)		/* new sequence */
1370 				newstart = scan - 1;
1371 			newlen++;
1372 			break;
1373 		case OPLUS_:		/* things that don't break one */
1374 		case OLPAREN:
1375 		case ORPAREN:
1376 			break;
1377 		case OQUEST_:		/* things that must be skipped */
1378 		case OCH_:
1379 			scan--;
1380 			do {
1381 				scan += OPND(s);
1382 				s = *scan;
1383 				/* assert() interferes w debug printouts */
1384 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1385 							OP(s) != OOR2) {
1386 					g->iflags |= BAD;
1387 					return;
1388 				}
1389 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1390 			/* fallthrough */
1391 		default:		/* things that break a sequence */
1392 			if (newlen > g->mlen) {		/* ends one */
1393 				start = newstart;
1394 				g->mlen = newlen;
1395 			}
1396 			newlen = 0;
1397 			break;
1398 		}
1399 	} while (OP(s) != OEND);
1400 
1401 	if (g->mlen == 0)		/* there isn't one */
1402 		return;
1403 
1404 	/* turn it into a character string */
1405 	g->must = malloc((size_t)g->mlen + 1);
1406 	if (g->must == NULL) {		/* argh; just forget it */
1407 		g->mlen = 0;
1408 		return;
1409 	}
1410 	cp = g->must;
1411 	scan = start;
1412 	for (i = g->mlen; i > 0; i--) {
1413 		while (OP(s = *scan++) != OCHAR)
1414 			continue;
1415 		*cp++ = OPND(s);
1416 	}
1417 	*cp++ = '\0';		/* just on general principles */
1418 }
1419 
1420 /*
1421  - pluscount - count + nesting
1422  */
1423 static sopno			/* nesting depth */
1424 pluscount(p, g)
1425 struct parse *p;
1426 register struct re_guts *g;
1427 {
1428 	register sop *scan;
1429 	register sop s;
1430 	register sopno plusnest = 0;
1431 	register sopno maxnest = 0;
1432 
1433 	if (p->error != 0)
1434 		return(0);	/* there may not be an OEND */
1435 
1436 	scan = g->strip + 1;
1437 	do {
1438 		s = *scan++;
1439 		switch (OP(s)) {
1440 		case OPLUS_:
1441 			plusnest++;
1442 			break;
1443 		case O_PLUS:
1444 			if (plusnest > maxnest)
1445 				maxnest = plusnest;
1446 			plusnest--;
1447 			break;
1448 		}
1449 	} while (OP(s) != OEND);
1450 	if (plusnest != 0)
1451 		g->iflags |= BAD;
1452 	return(maxnest);
1453 }
1454