xref: /original-bsd/lib/libc/regex/regcomp.c (revision be1f24e8)
1 /*-
2  * Copyright (c) 1992 Henry Spencer.
3  * Copyright (c) 1992 The Regents of the University of California.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * Henry Spencer of the University of Toronto.
8  *
9  * %sccs.include.redist.c%
10  *
11  *	@(#)regcomp.c	5.4 (Berkeley) 10/01/92
12  */
13 
14 #if defined(LIBC_SCCS) && !defined(lint)
15 static char sccsid[] = "@(#)regcomp.c	5.4 (Berkeley) 10/01/92";
16 #endif /* LIBC_SCCS and not lint */
17 
18 #include <sys/types.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include <ctype.h>
22 #include <limits.h>
23 #include <stdlib.h>
24 #include <regex.h>
25 
26 #include "utils.h"
27 #include "regex2.h"
28 
29 #include "cclass.h"
30 #include "cname.h"
31 
32 /*
33  * parse structure, passed up and down to avoid global variables and
34  * other clumsinesses
35  */
36 struct parse {
37 	uchar *next;		/* next character in RE */
38 	int error;		/* has an error been seen? */
39 	sop *strip;		/* malloced strip */
40 	sopno ssize;		/* malloced strip size (allocated) */
41 	sopno slen;		/* malloced strip length (used) */
42 	int ncsalloc;		/* number of csets allocated */
43 	struct re_guts *g;
44 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
45 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
46 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
47 };
48 
49 static uchar nuls[10];		/* place to point scanner in event of error */
50 
51 /*
52  * macros for use with parse structure
53  * BEWARE:  these know that the parse structure is named `p' !!!
54  */
55 #define	PEEK()	((uchar)*p->next)
56 #define	PEEK2()	((uchar)*(p->next+1))
57 #define	SEE(c)	(PEEK() == (c))
58 #define	SEETWO(a, b)	(PEEK() == (a) && PEEK2() == (b))
59 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
60 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
61 #define	NEXT()	(p->next++)
62 #define	NEXT2()	(p->next += 2)
63 #define	NEXTn(n)	(p->next += (n))
64 #define	GETNEXT()	((uchar)*p->next++)
65 #define	SETERROR(e)	seterr(p, (e))
66 #define	REQUIRE(co, e)	((co) || SETERROR(e))
67 #define	MUSTSEE(c, e)	(REQUIRE(PEEK() == (c), e))
68 #define	MUSTEAT(c, e)	(REQUIRE(GETNEXT() == (c), e))
69 #define	MUSTNOTSEE(c, e)	(REQUIRE(PEEK() != (c), e))
70 #define	EMIT(sop, sopnd)	doemit(p, sop, (size_t)(sopnd))
71 #define	INSERT(sop, pos)	doinsert(p, sop, HERE()-(pos)+1, pos)
72 #define	FWD(pos)		dofwd(p, pos, HERE()-(pos))
73 #define	BACK(sop, pos)		EMIT(sop, HERE()-pos)
74 #define	HERE()		(p->slen)
75 #define	THERE()		(p->slen - 1)
76 #define	DROP(n)	(p->slen -= (n))
77 
78 static cset	*allocset __P((struct parse *));
79 static void	 bothcases __P((struct parse *, u_int));
80 static void	 categorize __P((struct parse *, struct re_guts *));
81 static void	 doemit __P((struct parse *, sop, size_t));
82 static void	 dofwd __P((struct parse *, sopno, sop));
83 static void	 doinsert __P((struct parse *, sop, size_t, sopno));
84 static sopno	 dupl __P((struct parse *, sopno, sopno));
85 static void	 enlarge __P((struct parse *, sopno));
86 static void	 findmust __P((struct parse *, struct re_guts *));
87 static int	 freezeset __P((struct parse *, cset *));
88 static int	 isinsets __P((struct re_guts *, u_int));
89 static void	 mcadd __P((struct parse *, cset *, uchar *));
90 static uchar	*mcfind __P((cset *, u_int *));
91 static int	 mcin __P((struct parse *, cset *, u_int *));
92 static void	 mcinvert __P((struct parse *, cset *));
93 static void	 mcsub __P((struct parse *, cset *, u_int *));
94 static void	 nonnewline __P((struct parse *));
95 static void	 ordinary __P((struct parse *, u_int));
96 static uchar	 othercase __P((u_int));
97 static void	 p_b_cclass __P((struct parse *, cset *));
98 static uchar	 p_b_coll_elem __P((struct parse *, u_int));
99 static void	 p_b_eclass __P((struct parse *, cset *));
100 static uchar	 p_b_symbol __P((struct parse *));
101 static void	 p_b_term __P((struct parse *, cset *));
102 static void	 p_bracket __P((struct parse *));
103 static void	 p_bre  __P((struct parse *, u_int, u_int));
104 static int	 p_count __P((struct parse *));
105 static void	 p_ere  __P((struct parse *, u_int));
106 static void	 p_ere_exp  __P((struct parse *));
107 static int	 p_simp_re __P((struct parse *, int));
108 static sopno	 pluscount __P((struct parse *, struct re_guts *));
109 static void	 repeat __P((struct parse *, sopno, int, int));
110 static int	 samesets __P((struct re_guts *, u_int, u_int));
111 static int	 seterr __P((struct parse *, int));
112 static void	 stripsnug __P((struct parse *, struct re_guts *));
113 
114 /*
115  - regcomp - interface for parser and compilation
116  */
117 int				/* 0 success, otherwise REG_something */
118 regcomp(preg, pattern, cflags)
119 regex_t *preg;
120 const char *pattern;
121 int cflags;
122 {
123 	struct parse pa;
124 	register struct re_guts *g;
125 	register struct parse *p = &pa;
126 	register int i;
127 
128 	/* do the mallocs early so failure handling is easy */
129 	/* the +NUC here is for the category table */
130 	g = (struct re_guts *)malloc(sizeof(struct re_guts) + NUC);
131 	if (g == NULL)
132 		return(REG_ESPACE);
133 	p->ssize = strlen(pattern)/2*3 + 1;
134 	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
135 	p->slen = 0;
136 	if (p->strip == NULL) {
137 		free((char *)g);
138 		return(REG_ESPACE);
139 	}
140 
141 	/* set things up */
142 	p->g = g;
143 	p->next = (uchar *)pattern;
144 	p->error = 0;
145 	p->ncsalloc = 0;
146 	for (i = 0; i < NPAREN; i++) {
147 		p->pbegin[i] = 0;
148 		p->pend[i] = 0;
149 	}
150 	g->csetsize = NUC;
151 	g->sets = NULL;
152 	g->setbits = NULL;
153 	g->ncsets = 0;
154 	g->cflags = cflags;
155 	g->iflags = 0;
156 	g->must = NULL;
157 	g->mlen = 0;
158 	g->nsub = 0;
159 	g->ncategories = 1;	/* category 0 is "everything else" */
160 	g->categories = (uchar *)g + sizeof(struct re_guts);
161 	(void) memset((char *)g->categories, 0, NUC);
162 	g->backrefs = 0;
163 	g->nplus = 0;
164 
165 	/* do it */
166 	EMIT(OEND, 0);
167 	g->firststate = THERE();
168 	if (cflags&REG_EXTENDED)
169 		p_ere(p, '\0');
170 	else
171 		p_bre(p, '\0', '\0');
172 	EMIT(OEND, 0);
173 	g->laststate = THERE();
174 
175 	/* tidy up loose ends and fill things in */
176 	categorize(p, g);
177 	stripsnug(p, g);
178 	findmust(p, g);
179 	g->nplus = pluscount(p, g);
180 	g->magic = MAGIC2;
181 	preg->re_nsub = g->nsub;
182 	preg->re_g = g;
183 	preg->re_magic = MAGIC1;
184 #ifndef REDEBUG
185 	/* not debugging, so can't rely on the assert() in regexec() */
186 	if (g->iflags&BAD)
187 		SETERROR(REG_ASSERT);
188 #endif
189 
190 	/* win or lose, we're done */
191 	if (p->error != 0)	/* lose */
192 		regfree(preg);
193 	return(p->error);
194 }
195 
196 /*
197  - p_ere - ERE parser top level, concatenation and alternation
198  */
199 static void
200 p_ere(p, stop)
201 register struct parse *p;
202 u_int stop;			/* character this ERE should end at */
203 {
204 	register uchar c;
205 	register sopno prevback;
206 	register sopno prevfwd;
207 	register sopno conc;
208 	register int first = 1;		/* is this the first alternative? */
209 
210 	for (;;) {
211 		/* do a bunch of concatenated expressions */
212 		conc = HERE();
213 		while ((c = PEEK()) != '|' && c != stop && c != '\0')
214 			p_ere_exp(p);
215 		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
216 
217 		if (!EAT('|'))
218 			break;		/* NOTE BREAK OUT */
219 
220 		if (first) {
221 			INSERT(OCH_, conc);	/* offset is wrong */
222 			prevfwd = conc;
223 			prevback = conc;
224 			first = 0;
225 		}
226 		BACK(OOR1, prevback);
227 		prevback = THERE();
228 		FWD(prevfwd);			/* fix previous offset */
229 		prevfwd = HERE();
230 		EMIT(OOR2, 0);			/* offset is very wrong */
231 	}
232 
233 	if (!first) {		/* tail-end fixups */
234 		FWD(prevfwd);
235 		BACK(O_CH, prevback);
236 	}
237 
238 	assert(SEE(stop) || SEE('\0'));
239 }
240 
241 /*
242  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
243  */
244 static void
245 p_ere_exp(p)
246 register struct parse *p;
247 {
248 	register uchar c;
249 	register sopno pos;
250 	register int count;
251 	register int count2;
252 	register sopno subno;
253 	int wascaret = 0;
254 	/* we call { a repetition if followed by a digit */
255 #	define	ISRPT(c1, c2)	(c1 == '*' || c1 == '+' || c1 == '?' || \
256 						(c1 == '{' && isdigit(c2)))
257 
258 	c = GETNEXT();
259 	assert(c != '\0');	/* caller should have ensured this */
260 
261 	pos = HERE();
262 	switch (c) {
263 	case '(':
264 		MUSTNOTSEE('\0', REG_EPAREN);
265 		p->g->nsub++;
266 		subno = p->g->nsub;
267 		if (subno < NPAREN)
268 			p->pbegin[subno] = HERE();
269 		EMIT(OLPAREN, subno);
270 		if (!SEE(')'))
271 			p_ere(p, ')');
272 		if (subno < NPAREN) {
273 			p->pend[subno] = HERE();
274 			assert(p->pend[subno] != 0);
275 		}
276 		EMIT(ORPAREN, subno);
277 		MUSTEAT(')', REG_EPAREN);
278 		break;
279 #ifndef POSIX_MISTAKE
280 	case ')':		/* happens only if no current unmatched ( */
281 		/*
282 		 * You may ask, why the ifndef?  Because I didn't notice
283 		 * this until slightly too late for 1003.2, and none of the
284 		 * other 1003.2 regular-expression reviewers noticed it at
285 		 * all.  So an unmatched ) is legal POSIX, at least until
286 		 * we can get it fixed.
287 		 */
288 		SETERROR(REG_EPAREN);
289 		break;
290 #endif
291 	case '^':
292 		EMIT(OBOL, 0);
293 		p->g->iflags |= USEBOL;
294 		wascaret = 1;
295 		break;
296 	case '$':
297 		EMIT(OEOL, 0);
298 		p->g->iflags |= USEEOL;
299 		break;
300 	case '|':
301 		SETERROR(REG_EMPTY);
302 		break;
303 	case '*':
304 	case '+':
305 	case '?':
306 		SETERROR(REG_BADRPT);
307 		break;
308 	case '.':
309 		if (p->g->cflags&REG_NEWLINE)
310 			nonnewline(p);
311 		else
312 			EMIT(OANY, 0);
313 		break;
314 	case '[':
315 		p_bracket(p);
316 		break;
317 	case '\\':
318 		c = GETNEXT();
319 #ifdef xxx
320 		if (c == '^' || c == '.' || c == '[' || c == '$' ||
321 				c == '(' || c == ')' || c == '|' ||
322 				c == '*' || c == '+' || c == '?' ||
323 				c == '{' || c == '\\')
324 #else
325 		if (c != '\0')
326 #endif
327 			ordinary(p, c);
328 		else
329 			SETERROR(REG_EESCAPE);
330 		break;
331 	case '{':		/* okay as ordinary except if digit follows */
332 		REQUIRE(!isdigit(PEEK()), REG_BADRPT);
333 		/* FALLTHROUGH */
334 	default:
335 		ordinary(p, c);
336 		break;
337 	}
338 
339 	c = PEEK();
340 	if (!ISRPT(c, PEEK2()))
341 		return;		/* no repetition, we're done */
342 	NEXT();
343 
344 	REQUIRE(!wascaret, REG_BADRPT);
345 	switch (c) {
346 	case '*':	/* implemented as +? */
347 		INSERT(OPLUS_, pos);
348 		BACK(O_PLUS, pos);
349 		INSERT(OQUEST_, pos);
350 		BACK(O_QUEST, pos);
351 		break;
352 	case '+':
353 		INSERT(OPLUS_, pos);
354 		BACK(O_PLUS, pos);
355 		break;
356 	case '?':
357 		INSERT(OQUEST_, pos);
358 		BACK(O_QUEST, pos);
359 		break;
360 	case '{':
361 		count = p_count(p);
362 		if (EAT(',')) {
363 			if (isdigit(PEEK())) {
364 				count2 = p_count(p);
365 				REQUIRE(count <= count2, REG_BADBR);
366 			} else		/* single number with comma */
367 				count2 = INFINITY;
368 		} else		/* just a single number */
369 			count2 = count;
370 		repeat(p, pos, count, count2);
371 		if (!EAT('}')) {	/* error heuristics */
372 			while ((c = PEEK()) != '\0' && c != '}')
373 				NEXT();
374 			if (c == '\0')
375 				SETERROR(REG_EBRACE);
376 			else
377 				SETERROR(REG_BADBR);
378 		}
379 		break;
380 	}
381 
382 	c = PEEK();
383 	REQUIRE(!ISRPT(c, PEEK2()), REG_BADRPT);
384 }
385 
386 /*
387  - p_bre - BRE parser top level, anchoring and concatenation
388  * Giving end1 as '\0' essentially eliminates the end1/end2 check.
389  *
390  * This implementation is a bit of a kludge, in that a trailing $ is first
391  * taken as an ordinary character and then revised to be an anchor.  The
392  * only undesirable side effect is that '$' gets included as a character
393  * category in such cases.  This is fairly harmless; not worth fixing.
394  * The amount of lookahead needed to avoid this kludge is excessive,
395  * especially since things like "$*" appear to be legal. xxx
396  */
397 static void
398 p_bre(p, end1, end2)
399 register struct parse *p;
400 register u_int end1;		/* first terminating character */
401 register u_int end2;		/* second terminating character */
402 {
403 	register sopno start = HERE();
404 	register int first = 1;			/* first subexpression? */
405 	register int wasdollar = 0;
406 
407 	if (EAT('^')) {
408 		EMIT(OBOL, 0);
409 		p->g->iflags |= USEBOL;
410 	}
411 	while (!SEE('\0') && !SEETWO(end1, end2)) {
412 		wasdollar = p_simp_re(p, first);
413 		first = 0;
414 	}
415 	if (wasdollar) {	/* oops, that was a trailing anchor */
416 		DROP(1);
417 		EMIT(OEOL, 0);
418 		p->g->iflags |= USEEOL;
419 	}
420 
421 	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
422 }
423 
424 /*
425  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
426  */
427 static int			/* was the simple RE an unbackslashed $? */
428 p_simp_re(p, starordinary)
429 register struct parse *p;
430 int starordinary;		/* is a leading * an ordinary character? */
431 {
432 	register int c;
433 	register int count;
434 	register int count2;
435 	register sopno pos;
436 	register int i;
437 	register sopno subno;
438 #	define	BACKSL	(1<<CHAR_BIT)
439 
440 	pos = HERE();		/* repetion op, if any, covers from here */
441 
442 	c = GETNEXT();
443 	assert(c != '\0');	/* caller should have ensured this */
444 	if (c == '\\')
445 		c = BACKSL | PEEK();
446 	switch (c) {
447 	case '.':
448 		if (p->g->cflags&REG_NEWLINE)
449 			nonnewline(p);
450 		else
451 			EMIT(OANY, 0);
452 		break;
453 	case '[':
454 		p_bracket(p);
455 		break;
456 	case BACKSL|'{':
457 		SETERROR(REG_BADRPT);
458 		break;
459 	case BACKSL|'(':
460 		NEXT();
461 		p->g->nsub++;
462 		subno = p->g->nsub;
463 		if (subno < NPAREN)
464 			p->pbegin[subno] = HERE();
465 		EMIT(OLPAREN, subno);
466 		/* the SEE here is an error heuristic */
467 		if (!SEE('\0') && !SEETWO('\\', ')'))
468 			p_bre(p, '\\', ')');
469 		if (subno < NPAREN) {
470 			p->pend[subno] = HERE();
471 			assert(p->pend[subno] != 0);
472 		}
473 		EMIT(ORPAREN, subno);
474 		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
475 		break;
476 	case BACKSL|')':	/* should not get here -- must be user */
477 	case BACKSL|'}':
478 		SETERROR(REG_EPAREN);
479 		break;
480 	case BACKSL|'1':
481 	case BACKSL|'2':
482 	case BACKSL|'3':
483 	case BACKSL|'4':
484 	case BACKSL|'5':
485 	case BACKSL|'6':
486 	case BACKSL|'7':
487 	case BACKSL|'8':
488 	case BACKSL|'9':
489 		i = (c&~BACKSL) - '0';
490 		assert(i < NPAREN);
491 		if (p->pend[i] != 0) {
492 			assert(i <= p->g->nsub);
493 			EMIT(OBACK_, i);
494 			assert(p->pbegin[i] != 0);
495 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
496 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
497 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
498 			EMIT(O_BACK, i);
499 		} else
500 			SETERROR(REG_ESUBREG);
501 		p->g->backrefs = 1;
502 		NEXT();
503 		break;
504 	case BACKSL|'\0':
505 		SETERROR(REG_EESCAPE);
506 		break;
507 	case '*':
508 		REQUIRE(starordinary, REG_BADRPT);
509 		/* FALLTHROUGH */
510 	default:
511 		if (c & BACKSL)
512 			NEXT();
513 		ordinary(p, (uchar)(c &~ BACKSL));
514 		break;
515 	}
516 
517 	if (EAT('*')) {		/* implemented as +? */
518 		INSERT(OPLUS_, pos);
519 		BACK(O_PLUS, pos);
520 		INSERT(OQUEST_, pos);
521 		BACK(O_QUEST, pos);
522 	} else if (EATTWO('\\', '{')) {
523 		count = p_count(p);
524 		if (EAT(',')) {
525 			if (isdigit(PEEK())) {
526 				count2 = p_count(p);
527 				REQUIRE(count <= count2, REG_BADBR);
528 			} else		/* single number with comma */
529 				count2 = INFINITY;
530 		} else		/* just a single number */
531 			count2 = count;
532 		repeat(p, pos, count, count2);
533 		if (!EATTWO('\\', '}')) {	/* error heuristics */
534 			while (!SEE('\0') && !SEETWO('\\', '}'))
535 				NEXT();
536 			if (SEE('\0'))
537 				SETERROR(REG_EBRACE);
538 			else
539 				SETERROR(REG_BADBR);
540 		}
541 	} else if (c == '$')	/* unbackslashed $ not followed by reptn */
542 		return(1);
543 
544 	return(0);
545 }
546 
547 /*
548  - p_count - parse a repetition count
549  */
550 static int			/* the value */
551 p_count(p)
552 register struct parse *p;
553 {
554 	register int count = 0;
555 	register int ndigits = 0;
556 
557 	while (isdigit(PEEK()) && count <= DUPMAX) {
558 		count = count*10 + (GETNEXT() - '0');
559 		ndigits++;
560 	}
561 
562 	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
563 	return(count);
564 }
565 
566 /*
567  - p_bracket - parse a bracketed character list
568  *
569  * Note a significant property of this code:  if the allocset() did SETERROR,
570  * no set operations are done.
571  */
572 static void
573 p_bracket(p)
574 register struct parse *p;
575 {
576 	register uchar c;
577 	register cset *cs = allocset(p);
578 	register int invert = 0;
579 
580 	if (EAT('^'))
581 		invert++;	/* make note to invert set at end */
582 	if (EAT(']'))
583 		CHadd(cs, ']');
584 	while ((c = PEEK()) != '\0' && c != ']' && !SEETWO('-', ']'))
585 		p_b_term(p, cs);
586 	if (EAT('-'))
587 		CHadd(cs, '-');
588 	MUSTEAT(']', REG_EBRACK);
589 
590 	if (invert && p->error == 0) {
591 		register int i;
592 
593 		for (i = p->g->csetsize - 1; i >= 0; i--)
594 			if (CHIN(cs, i))
595 				CHsub(cs, i);
596 			else
597 				CHadd(cs, i);
598 		if (p->g->cflags&REG_NEWLINE)
599 			CHsub(cs, '\n');
600 		if (cs->multis != NULL)
601 			mcinvert(p, cs);
602 	}
603 	assert(cs->multis == NULL);		/* xxx */
604 	EMIT(OANYOF, freezeset(p, cs));
605 }
606 
607 /*
608  - p_b_term - parse one term of a bracketed character list
609  */
610 static void
611 p_b_term(p, cs)
612 register struct parse *p;
613 register cset *cs;
614 {
615 	register uchar c;
616 	register uchar start, finish;
617 	register int i;
618 
619 	/* classify what we've got */
620 	switch (PEEK()) {
621 	case '[':
622 		c = PEEK2();
623 		break;
624 	case '-':
625 		SETERROR(REG_ERANGE);
626 		return;			/* NOTE RETURN */
627 		break;
628 	default:
629 		c = '\0';
630 		break;
631 	}
632 
633 	switch (c) {
634 	case ':':		/* character class */
635 		NEXT2();
636 		c = PEEK();
637 		REQUIRE(c != '\0', REG_EBRACK);
638 		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
639 		p_b_cclass(p, cs);
640 		MUSTNOTSEE('\0', REG_EBRACK);
641 		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
642 		break;
643 	case '=':		/* equivalence class */
644 		NEXT2();
645 		c = PEEK();
646 		REQUIRE(c != '\0', REG_EBRACK);
647 		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
648 		p_b_eclass(p, cs);
649 		MUSTNOTSEE('\0', REG_EBRACK);
650 		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
651 		break;
652 	default:		/* symbol, ordinary character, or range */
653 /* xxx revision needed for multichar stuff */
654 		start = p_b_symbol(p);
655 		if (PEEK() == '-' && (c = PEEK2()) != ']' && c != '\0') {
656 			/* range */
657 			NEXT();
658 			if (EAT('-'))
659 				finish = '-';
660 			else
661 				finish = p_b_symbol(p);
662 		} else
663 			finish = start;
664 		REQUIRE(start <= finish, REG_ERANGE);
665 		for (i = start; i <= finish; i++) {
666 			CHadd(cs, i);
667 			if ((p->g->cflags&REG_ICASE) && isalpha(i)) {
668 				c = othercase((uchar)i);
669 				CHadd(cs, c);
670 			}
671 		}
672 		break;
673 	}
674 }
675 
676 /*
677  - p_b_cclass - parse a character-class name and deal with it
678  */
679 static void
680 p_b_cclass(p, cs)
681 register struct parse *p;
682 register cset *cs;
683 {
684 	register uchar *sb = p->next;
685 	register uchar *se = sb;
686 	register struct cclass *cp;
687 	register int len;
688 	register uchar *u;
689 	register uchar c;
690 
691 	while (isalpha(*se))
692 		se++;
693 	len = se - sb;
694 	NEXTn(len);
695 	for (cp = cclasses; cp->name != NULL; cp++)
696 		if (strncmp(cp->name, (char *)sb, len) == 0 &&
697 							cp->name[len] == '\0')
698 			break;
699 	if (cp->name == NULL) {
700 		/* oops, didn't find it */
701 		SETERROR(REG_ECTYPE);
702 		return;
703 	}
704 
705 	u = (uchar *)cp->chars;
706 	while ((c = *u++) != '\0')
707 		CHadd(cs, c);
708 	for (u = (uchar *)cp->multis; *u != '\0'; u += strlen((char *)u) + 1)
709 		MCadd(cs, u);
710 }
711 
712 /*
713  - p_b_eclass - parse an equivalence-class name and deal with it
714  *
715  * This implementation is incomplete. xxx
716  */
717 static void
718 p_b_eclass(p, cs)
719 register struct parse *p;
720 register cset *cs;
721 {
722 	register uchar c;
723 
724 	c = p_b_coll_elem(p, '=');
725 	CHadd(cs, c);
726 }
727 
728 /*
729  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
730  */
731 static uchar			/* value of symbol */
732 p_b_symbol(p)
733 register struct parse *p;
734 {
735 	register uchar value;
736 
737 	if (!EATTWO('[', '.')) {
738 		MUSTNOTSEE('\0', REG_EBRACK);
739 		return(GETNEXT());
740 	}
741 
742 	/* collating symbol */
743 	MUSTNOTSEE('\0', REG_EBRACK);
744 	value = p_b_coll_elem(p, '.');
745 	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
746 	return(value);
747 }
748 
749 /*
750  - p_b_coll_elem - parse a collating-element name and look it up
751  */
752 static uchar			/* value of collating element */
753 p_b_coll_elem(p, endc)
754 register struct parse *p;
755 u_int endc;			/* name ended by endc,']' */
756 {
757 	register uchar *sp = p->next;
758 	register struct cname *cp;
759 	register int len;
760 	register uchar c;
761 
762 	while ((c = PEEK()) != '\0' && !SEETWO(endc, ']'))
763 		NEXT();
764 	if (c == '\0') {
765 		SETERROR(REG_EBRACK);
766 		return(0);
767 	}
768 	len = p->next - sp;
769 	for (cp = cnames; cp->name != NULL; cp++)
770 		if (strncmp(cp->name, (char *)sp, len) == 0 &&
771 							cp->name[len] == '\0')
772 			return(cp->code);	/* known name */
773 	if (len == 1)
774 		return(*sp);	/* single character */
775 	SETERROR(REG_ECOLLATE);			/* neither */
776 	return(0);
777 }
778 
779 /*
780  - othercase - return the case counterpart of an alphabetic
781  */
782 static uchar
783 othercase(ch)
784 u_int ch;
785 {
786 	assert(isalpha(ch));
787 	if (isupper(ch))
788 		return(tolower(ch));
789 	else if (islower(ch))
790 		return(toupper(ch));
791 	else			/* peculiar, but could happen */
792 		return(ch);
793 }
794 
795 /*
796  - bothcases - emit a dualcase version of a character
797  *
798  * Boy, is this implementation ever a kludge...
799  */
800 static void
801 bothcases(p, ch)
802 register struct parse *p;
803 u_int ch;
804 {
805 	register uchar *oldnext;
806 	uchar bracket[3];
807 
808 	oldnext = p->next;
809 	p->next = bracket;
810 	bracket[0] = ch;
811 	bracket[1] = ']';
812 	bracket[2] = '\0';
813 	p_bracket(p);
814 	assert(p->next == bracket+2);
815 	p->next = oldnext;
816 }
817 
818 /*
819  - ordinary - emit an ordinary character
820  */
821 static void
822 ordinary(p, ch)
823 register struct parse *p;
824 register u_int ch;
825 {
826 	register uchar *cap = p->g->categories;
827 
828 	if ((p->g->cflags&REG_ICASE) && isalpha(ch)) {
829 		bothcases(p, ch);
830 		return;
831 	}
832 
833 	EMIT(OCHAR, ch);
834 	if (cap[ch] == 0)
835 		cap[ch] = p->g->ncategories++;
836 }
837 
838 /*
839  - nonnewline - emit REG_NEWLINE version of OANY
840  *
841  * Boy, is this implementation ever a kludge...
842  */
843 static void
844 nonnewline(p)
845 register struct parse *p;
846 {
847 	register uchar *oldnext;
848 	uchar bracket[4];
849 
850 	oldnext = p->next;
851 	p->next = bracket;
852 	bracket[0] = '^';
853 	bracket[1] = '\n';
854 	bracket[2] = ']';
855 	bracket[3] = '\0';
856 	p_bracket(p);
857 	assert(p->next == bracket+3);
858 	p->next = oldnext;
859 }
860 
861 /*
862  - repeat - generate code for a bounded repetition, recursively if needed
863  */
864 static void
865 repeat(p, start, from, to)
866 register struct parse *p;
867 sopno start;			/* operand from here to end of strip */
868 int from;			/* repeated from this number */
869 int to;				/* to this number of times (maybe INFINITY) */
870 {
871 	register sopno finish = HERE();
872 #	define	N	2
873 #	define	INF	3
874 #	define	REP(f, t)	((f)*8 + (t))
875 #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
876 	register sopno copy;
877 
878 	if (p->error != 0)	/* head off possible runaway recursion */
879 		return;
880 
881 	assert(from <= to);
882 
883 	switch (REP(MAP(from), MAP(to))) {
884 	case REP(0, 0):			/* must be user doing this */
885 		DROP(finish-start);	/* drop the operand */
886 		break;
887 	case REP(0, 1):			/* as x{1,1}? */
888 	case REP(0, N):			/* as x{1,n}? */
889 	case REP(0, INF):		/* as x{1,}? */
890 		INSERT(OQUEST_, start);		/* offset is wrong... */
891 		repeat(p, start+1, 1, to);
892 		FWD(start);			/* ... fix it */
893 		BACK(O_QUEST, start);
894 		break;
895 	case REP(1, 1):			/* trivial case */
896 		/* done */
897 		break;
898 	case REP(1, N):			/* as x?x{1,n-1} */
899 		INSERT(OQUEST_, start);
900 		BACK(O_QUEST, start);
901 		copy = dupl(p, start+1, finish+1);
902 		assert(copy == finish+2);
903 		repeat(p, copy, 1, to-1);
904 		break;
905 	case REP(1, INF):		/* as x+ */
906 		INSERT(OPLUS_, start);
907 		BACK(O_PLUS, start);
908 		break;
909 	case REP(N, N):			/* as xx{m-1,n-1} */
910 		copy = dupl(p, start, finish);
911 		repeat(p, copy, from-1, to-1);
912 		break;
913 	case REP(N, INF):		/* as xx{n-1,INF} */
914 		copy = dupl(p, start, finish);
915 		repeat(p, copy, from-1, to);
916 		break;
917 	default:			/* "can't happen" */
918 		SETERROR(REG_ASSERT);	/* just in case */
919 		break;
920 	}
921 }
922 
923 /*
924  - seterr - set an error condition
925  */
926 static int			/* useless but makes type checking happy */
927 seterr(p, e)
928 register struct parse *p;
929 int e;
930 {
931 	if (p->error == 0)	/* keep earliest error condition */
932 		p->error = e;
933 	p->next = nuls;		/* try to bring things to a halt */
934 	return(0);		/* make the return value well-defined */
935 }
936 
937 /*
938  - allocset - allocate a set of characters for []
939  */
940 static cset *
941 allocset(p)
942 register struct parse *p;
943 {
944 	register int no = p->g->ncsets++;
945 	register size_t nc;
946 	register size_t nbytes;
947 	register cset *cs;
948 	register size_t css = (size_t)p->g->csetsize;
949 
950 	if (no >= p->ncsalloc) {	/* need another column of space */
951 		p->ncsalloc += CHAR_BIT;
952 		nc = p->ncsalloc;
953 		assert(nc % CHAR_BIT == 0);
954 		nbytes = nc / CHAR_BIT * css;
955 		if (p->g->sets == NULL)
956 			p->g->sets = (cset *)malloc(nc * sizeof(cset));
957 		else
958 			p->g->sets = (cset *)realloc((char *)p->g->sets,
959 							nc * sizeof(cset));
960 		if (p->g->setbits == NULL)
961 			p->g->setbits = (uchar *)malloc(nbytes);
962 		else
963 			p->g->setbits = (uchar *)realloc((char *)p->g->setbits,
964 								nbytes);
965 		if (p->g->sets != NULL && p->g->setbits != NULL)
966 			(void) memset((char *)p->g->setbits + (nbytes - css),
967 								0, css);
968 		else {
969 			no = 0;
970 			SETERROR(REG_ESPACE);
971 			/* caller's responsibility not to do set ops */
972 		}
973 	}
974 
975 	assert(p->g->sets != NULL);	/* xxx */
976 	cs = &p->g->sets[no];
977 	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
978 	cs->mask = 1 << ((no) % CHAR_BIT);
979 	cs->hash = 0;
980 	cs->smultis = 0;
981 	cs->multis = NULL;
982 
983 	return(cs);
984 }
985 
986 /*
987  - freezeset - final processing on a set of characters
988  *
989  * The main task here is merging identical sets.  This is usually a waste
990  * of time (although the hash code minimizes the overhead), but can win
991  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
992  * is done using addition rather than xor -- all ASCII [aA] sets xor to
993  * the same value!
994  */
995 static int			/* set number */
996 freezeset(p, cs)
997 register struct parse *p;
998 register cset *cs;
999 {
1000 	register uchar h = cs->hash;
1001 	register int i;
1002 	register cset *top = &p->g->sets[p->g->ncsets];
1003 	register cset *cs2;
1004 	register size_t css = (size_t)p->g->csetsize;
1005 
1006 	/* look for an earlier one which is the same */
1007 	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1008 		if (cs2->hash == h && cs2 != cs) {
1009 			/* maybe */
1010 			for (i = 0; i < css; i++)
1011 				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1012 					break;		/* no */
1013 			if (i == css)
1014 				break;			/* yes */
1015 		}
1016 
1017 	if (cs2 < top) {	/* found one */
1018 		assert(cs == top-1);
1019 		p->g->ncsets--;
1020 		for (i = 0; i < css; i++)
1021 			CHsub(cs, i);
1022 		cs = cs2;
1023 	}
1024 
1025 	return((int)(cs - p->g->sets));
1026 }
1027 
1028 /*
1029  - mcadd - add a collating element to a cset
1030  */
1031 static void
1032 mcadd(p, cs, cp)
1033 register struct parse *p;
1034 register cset *cs;
1035 register uchar *cp;
1036 {
1037 	register size_t oldend = cs->smultis;
1038 
1039 	cs->smultis += strlen((char *)cp) + 1;
1040 	if (cs->multis == NULL)
1041 		cs->multis = (uchar *)malloc(cs->smultis);
1042 	else
1043 		cs->multis = (uchar *)realloc(cs->multis, cs->smultis);
1044 	if (cs->multis == NULL) {
1045 		SETERROR(REG_ESPACE);
1046 		return;
1047 	}
1048 
1049 	(void) strcpy((char *)(cs->multis + oldend - 1), (char *)cp);
1050 	cs->multis[cs->smultis - 1] = '\0';
1051 }
1052 
1053 /*
1054  - mcsub - subtract a collating element from a cset
1055  */
1056 static void
1057 mcsub(p, cs, cp)
1058 register struct parse *p;
1059 register cset *cs;
1060 register u_int *cp;
1061 {
1062 	register uchar *fp = mcfind(cs, cp);
1063 	register size_t len = strlen((char *)fp);
1064 
1065 	assert(p != NULL);
1066 	(void) memmove((char *)fp, (char *)(fp + len + 1),
1067 				cs->smultis - (fp + len + 1 - cs->multis));
1068 	cs->smultis -= len;
1069 
1070 	if (cs->smultis == 0) {
1071 		free((char *)cs->multis);
1072 		cs->multis = NULL;
1073 		return;
1074 	}
1075 
1076 	cs->multis = (uchar *)realloc(cs->multis, cs->smultis);
1077 	assert(cs->multis != NULL);
1078 }
1079 
1080 /*
1081  - mcin - is a collating element in a cset?
1082  */
1083 static int
1084 mcin(p, cs, cp)
1085 register struct parse *p;
1086 register cset *cs;
1087 register u_int *cp;
1088 {
1089 	return(mcfind(cs, cp) != NULL);
1090 }
1091 
1092 /*
1093  - mcfind - find a collating element in a cset
1094  */
1095 static uchar *
1096 mcfind(cs, cp)
1097 register cset *cs;
1098 register u_int *cp;
1099 {
1100 	register uchar *p;
1101 
1102 	if (cs->multis == NULL)
1103 		return(NULL);
1104 	for (p = cs->multis; *p != '\0'; p += strlen((char *)p) + 1)
1105 		if (strcmp((char *)cp, (char *)p) == 0)
1106 			return(p);
1107 	return(NULL);
1108 }
1109 
1110 /*
1111  - mcinvert - invert the list of collating elements in a cset
1112  *
1113  * This would have to know the set of possibilities.  Implementation
1114  * is deferred.
1115  */
1116 static void
1117 mcinvert(p, cs)
1118 register struct parse *p;
1119 register cset *cs;
1120 {
1121 	assert(cs->multis == NULL);	/* xxx */
1122 }
1123 
1124 /*
1125  - isinsets - is this character in any sets?
1126  */
1127 static int			/* predicate */
1128 isinsets(g, c)
1129 register struct re_guts *g;
1130 u_int c;
1131 {
1132 	register uchar *col;
1133 	register int i;
1134 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1135 
1136 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1137 		if (col[c] != 0)
1138 			return(1);
1139 	return(0);
1140 }
1141 
1142 /*
1143  - samesets - are these two characters in exactly the same sets?
1144  */
1145 static int			/* predicate */
1146 samesets(g, c1, c2)
1147 register struct re_guts *g;
1148 register u_int c1;
1149 register u_int c2;
1150 {
1151 	register uchar *col;
1152 	register int i;
1153 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1154 
1155 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1156 		if (col[c1] != col[c2])
1157 			return(0);
1158 	return(1);
1159 }
1160 
1161 /*
1162  - categorize - sort out character categories
1163  */
1164 static void
1165 categorize(p, g)
1166 struct parse *p;
1167 register struct re_guts *g;
1168 {
1169 	register uchar *cats = g->categories;
1170 	register unsigned c;
1171 	register unsigned c2;
1172 	register uchar cat;
1173 
1174 	/* avoid making error situations worse */
1175 	if (p->error != 0)
1176 		return;
1177 
1178 	for (c = 0; c < g->csetsize; c++)
1179 		if (cats[c] == 0 && isinsets(g, c)) {
1180 			cat = g->ncategories++;
1181 			cats[c] = cat;
1182 			for (c2 = c+1; c2 < g->csetsize; c2++)
1183 				if (cats[c2] == 0 && samesets(g, c, c2))
1184 					cats[c2] = cat;
1185 		}
1186 }
1187 
1188 /*
1189  - dupl - emit a duplicate of a bunch of sops
1190  */
1191 static sopno			/* start of duplicate */
1192 dupl(p, start, finish)
1193 register struct parse *p;
1194 sopno start;			/* from here */
1195 sopno finish;			/* to this less one */
1196 {
1197 	register sopno ret = HERE();
1198 	register sopno len = finish - start;
1199 
1200 	assert(finish >= start);
1201 	if (len == 0)
1202 		return(ret);
1203 	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1204 	assert(p->ssize >= p->slen + len);
1205 	(void) memcpy((char *)(p->strip + p->slen),
1206 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1207 	p->slen += len;
1208 	return(ret);
1209 }
1210 
1211 /*
1212  - doemit - emit a strip operator
1213  *
1214  * It might seem better to implement this as a macro with a function as
1215  * hard-case backup, but it's just too big and messy unless there are
1216  * some changes to the data structures.  Maybe later.
1217  */
1218 static void
1219 doemit(p, op, opnd)
1220 register struct parse *p;
1221 sop op;
1222 size_t opnd;
1223 {
1224 	/* avoid making error situations worse */
1225 	if (p->error != 0)
1226 		return;
1227 
1228 	/* deal with oversize operands ("can't happen", more or less) */
1229 	assert(opnd < 1<<OPSHIFT);
1230 
1231 	/* deal with undersized strip */
1232 	if (p->slen >= p->ssize)
1233 		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1234 	assert(p->slen < p->ssize);
1235 
1236 	/* finally, it's all reduced to the easy case */
1237 	p->strip[p->slen++] = SOP(op, opnd);
1238 }
1239 
1240 /*
1241  - doinsert - insert a sop into the strip
1242  */
1243 static void
1244 doinsert(p, op, opnd, pos)
1245 register struct parse *p;
1246 sop op;
1247 size_t opnd;
1248 sopno pos;
1249 {
1250 	register sopno sn;
1251 	register sop s;
1252 	register int i;
1253 
1254 	/* avoid making error situations worse */
1255 	if (p->error != 0)
1256 		return;
1257 
1258 	sn = HERE();
1259 	EMIT(op, opnd);		/* do checks, ensure space */
1260 	assert(HERE() == sn+1);
1261 	s = p->strip[sn];
1262 
1263 	/* adjust paren pointers */
1264 	assert(pos > 0);
1265 	for (i = 1; i < NPAREN; i++) {
1266 		if (p->pbegin[i] >= pos) {
1267 			p->pbegin[i]++;
1268 		}
1269 		if (p->pend[i] >= pos) {
1270 			p->pend[i]++;
1271 		}
1272 	}
1273 
1274 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1275 						(HERE()-pos-1)*sizeof(sop));
1276 	p->strip[pos] = s;
1277 }
1278 
1279 /*
1280  - dofwd - complete a forward reference
1281  */
1282 static void
1283 dofwd(p, pos, value)
1284 register struct parse *p;
1285 register sopno pos;
1286 sop value;
1287 {
1288 	/* avoid making error situations worse */
1289 	if (p->error != 0)
1290 		return;
1291 
1292 	assert(value < 1<<OPSHIFT);
1293 	p->strip[pos] = OP(p->strip[pos]) | value;
1294 }
1295 
1296 /*
1297  - enlarge - enlarge the strip
1298  */
1299 static void
1300 enlarge(p, size)
1301 register struct parse *p;
1302 register sopno size;
1303 {
1304 	register sop *sp;
1305 
1306 	if (p->ssize >= size)
1307 		return;
1308 
1309 	sp = (sop *)realloc(p->strip, size*sizeof(sop));
1310 	if (sp == NULL) {
1311 		SETERROR(REG_ESPACE);
1312 		return;
1313 	}
1314 	p->strip = sp;
1315 	p->ssize = size;
1316 }
1317 
1318 /*
1319  - stripsnug - compact the strip
1320  */
1321 static void
1322 stripsnug(p, g)
1323 register struct parse *p;
1324 register struct re_guts *g;
1325 {
1326 	g->nstates = p->slen;
1327 	g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop));
1328 	if (g->strip == NULL) {
1329 		SETERROR(REG_ESPACE);
1330 		g->strip = p->strip;
1331 	}
1332 }
1333 
1334 /*
1335  - findmust - fill in must and mlen with longest mandatory literal string
1336  *
1337  * This algorithm could do fancy things like analyzing the operands of |
1338  * for common subsequences.  Someday.  This code is simple and finds most
1339  * of the interesting cases.
1340  *
1341  * Note that must and mlen got initialized during setup.
1342  */
1343 static void
1344 findmust(p, g)
1345 struct parse *p;
1346 register struct re_guts *g;
1347 {
1348 	register sop *scan;
1349 	sop *start;
1350 	register sop *newstart;
1351 	register sopno newlen;
1352 	register sop s;
1353 	register char *cp;
1354 	register sopno i;
1355 
1356 	/* avoid making error situations worse */
1357 	if (p->error != 0)
1358 		return;
1359 
1360 	/* find the longest OCHAR sequence in strip */
1361 	newlen = 0;
1362 	scan = g->strip + 1;
1363 	do {
1364 		s = *scan++;
1365 		switch (OP(s)) {
1366 		case OCHAR:		/* sequence member */
1367 			if (newlen == 0)		/* new sequence */
1368 				newstart = scan - 1;
1369 			newlen++;
1370 			break;
1371 		case OPLUS_:		/* things that don't break one */
1372 		case OLPAREN:
1373 		case ORPAREN:
1374 			break;
1375 		case OQUEST_:		/* things that must be skipped */
1376 		case OCH_:
1377 			scan--;
1378 			do {
1379 				scan += OPND(s);
1380 				s = *scan;
1381 				/* assert() interferes w debug printouts */
1382 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1383 							OP(s) != OOR2) {
1384 					g->iflags |= BAD;
1385 					return;
1386 				}
1387 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1388 			/* fallthrough */
1389 		default:		/* things that break a sequence */
1390 			if (newlen > g->mlen) {		/* ends one */
1391 				start = newstart;
1392 				g->mlen = newlen;
1393 			}
1394 			newlen = 0;
1395 			break;
1396 		}
1397 	} while (OP(s) != OEND);
1398 
1399 	if (g->mlen == 0)		/* there isn't one */
1400 		return;
1401 
1402 	/* turn it into a character string */
1403 	g->must = malloc((size_t)g->mlen + 1);
1404 	if (g->must == NULL) {		/* argh; just forget it */
1405 		g->mlen = 0;
1406 		return;
1407 	}
1408 	cp = g->must;
1409 	scan = start;
1410 	for (i = g->mlen; i > 0; i--) {
1411 		while (OP(s = *scan++) != OCHAR)
1412 			continue;
1413 		*cp++ = OPND(s);
1414 	}
1415 	*cp++ = '\0';		/* just on general principles */
1416 }
1417 
1418 /*
1419  - pluscount - count + nesting
1420  */
1421 static sopno			/* nesting depth */
1422 pluscount(p, g)
1423 struct parse *p;
1424 register struct re_guts *g;
1425 {
1426 	register sop *scan;
1427 	register sop s;
1428 	register sopno plusnest = 0;
1429 	register sopno maxnest = 0;
1430 
1431 	if (p->error != 0)
1432 		return(0);	/* there may not be an OEND */
1433 
1434 	scan = g->strip + 1;
1435 	do {
1436 		s = *scan++;
1437 		switch (OP(s)) {
1438 		case OPLUS_:
1439 			plusnest++;
1440 			break;
1441 		case O_PLUS:
1442 			if (plusnest > maxnest)
1443 				maxnest = plusnest;
1444 			plusnest--;
1445 			break;
1446 		}
1447 	} while (OP(s) != OEND);
1448 	if (plusnest != 0)
1449 		g->iflags |= BAD;
1450 	return(maxnest);
1451 }
1452