xref: /original-bsd/lib/libcompat/regexp/regexp.c (revision f33f0483)
1 /*
2  * regcomp and regexec -- regsub and regerror are elsewhere
3  *
4  *	Copyright (c) 1986 by University of Toronto.
5  *	Written by Henry Spencer.  Not derived from licensed software.
6  *
7  *	Permission is granted to anyone to use this software for any
8  *	purpose on any computer system, and to redistribute it freely,
9  *	subject to the following restrictions:
10  *
11  *	1. The author is not responsible for the consequences of use of
12  *		this software, no matter how awful, even if they arise
13  *		from defects in it.
14  *
15  *	2. The origin of this software must not be misrepresented, either
16  *		by explicit claim or by omission.
17  *
18  *	3. Altered versions must be plainly marked as such, and must not
19  *		be misrepresented as being the original software.
20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22  *** to assist in implementing egrep.
23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25  *** as in BSD grep and ex.
26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30  *
31  * Beware that some of this code is subtly aware of the way operator
32  * precedence is structured in regular expressions.  Serious changes in
33  * regular-expression syntax might require a total rethink.
34  */
35 #include <regexp.h>
36 #include <stdio.h>
37 #include <ctype.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include "regmagic.h"
41 
42 /*
43  * The "internal use only" fields in regexp.h are present to pass info from
44  * compile to execute that permits the execute phase to run lots faster on
45  * simple cases.  They are:
46  *
47  * regstart	char that must begin a match; '\0' if none obvious
48  * reganch	is the match anchored (at beginning-of-line only)?
49  * regmust	string (pointer into program) that match must include, or NULL
50  * regmlen	length of regmust string
51  *
52  * Regstart and reganch permit very fast decisions on suitable starting points
53  * for a match, cutting down the work a lot.  Regmust permits fast rejection
54  * of lines that cannot possibly match.  The regmust tests are costly enough
55  * that regcomp() supplies a regmust only if the r.e. contains something
56  * potentially expensive (at present, the only such thing detected is * or +
57  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
58  * supplied because the test in regexec() needs it and regcomp() is computing
59  * it anyway.
60  */
61 
62 /*
63  * Structure for regexp "program".  This is essentially a linear encoding
64  * of a nondeterministic finite-state machine (aka syntax charts or
65  * "railroad normal form" in parsing technology).  Each node is an opcode
66  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
67  * all nodes except BRANCH implement concatenation; a "next" pointer with
68  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
69  * have one of the subtle syntax dependencies:  an individual BRANCH (as
70  * opposed to a collection of them) is never concatenated with anything
71  * because of operator precedence.)  The operand of some types of node is
72  * a literal string; for others, it is a node leading into a sub-FSM.  In
73  * particular, the operand of a BRANCH node is the first node of the branch.
74  * (NB this is *not* a tree structure:  the tail of the branch connects
75  * to the thing following the set of BRANCHes.)  The opcodes are:
76  */
77 
78 /* definition	number	opnd?	meaning */
79 #define	END	0	/* no	End of program. */
80 #define	BOL	1	/* no	Match "" at beginning of line. */
81 #define	EOL	2	/* no	Match "" at end of line. */
82 #define	ANY	3	/* no	Match any one character. */
83 #define	ANYOF	4	/* str	Match any character in this string. */
84 #define	ANYBUT	5	/* str	Match any character not in this string. */
85 #define	BRANCH	6	/* node	Match this alternative, or the next... */
86 #define	BACK	7	/* no	Match "", "next" ptr points backward. */
87 #define	EXACTLY	8	/* str	Match this string. */
88 #define	NOTHING	9	/* no	Match empty string. */
89 #define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
90 #define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
91 #define	WORDA	12	/* no	Match "" at wordchar, where prev is nonword */
92 #define	WORDZ	13	/* no	Match "" at nonwordchar, where prev is word */
93 #define	OPEN	20	/* no	Mark this point in input as start of #n. */
94 			/*	OPEN+1 is number 1, etc. */
95 #define	CLOSE	30	/* no	Analogous to OPEN. */
96 
97 /*
98  * Opcode notes:
99  *
100  * BRANCH	The set of branches constituting a single choice are hooked
101  *		together with their "next" pointers, since precedence prevents
102  *		anything being concatenated to any individual branch.  The
103  *		"next" pointer of the last BRANCH in a choice points to the
104  *		thing following the whole choice.  This is also where the
105  *		final "next" pointer of each individual branch points; each
106  *		branch starts with the operand node of a BRANCH node.
107  *
108  * BACK		Normal "next" pointers all implicitly point forward; BACK
109  *		exists to make loop structures possible.
110  *
111  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
112  *		BRANCH structures using BACK.  Simple cases (one character
113  *		per match) are implemented with STAR and PLUS for speed
114  *		and to minimize recursive plunges.
115  *
116  * OPEN,CLOSE	...are numbered at compile time.
117  */
118 
119 /*
120  * A node is one char of opcode followed by two chars of "next" pointer.
121  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
122  * value is a positive offset from the opcode of the node containing it.
123  * An operand, if any, simply follows the node.  (Note that much of the
124  * code generation knows about this implicit relationship.)
125  *
126  * Using two bytes for the "next" pointer is vast overkill for most things,
127  * but allows patterns to get big without disasters.
128  */
129 #define	OP(p)	(*(p))
130 #define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
131 #define	OPERAND(p)	((p) + 3)
132 
133 /*
134  * See regmagic.h for one further detail of program structure.
135  */
136 
137 
138 /*
139  * Utility definitions.
140  */
141 #ifndef CHARBITS
142 #define	UCHARAT(p)	((int)*(unsigned char *)(p))
143 #else
144 #define	UCHARAT(p)	((int)*(p)&CHARBITS)
145 #endif
146 
147 #define	FAIL(m)	{ regerror(m); return(NULL); }
148 #define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
149 
150 /*
151  * Flags to be passed up and down.
152  */
153 #define	HASWIDTH	01	/* Known never to match null string. */
154 #define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
155 #define	SPSTART		04	/* Starts with * or +. */
156 #define	WORST		0	/* Worst case. */
157 
158 /*
159  * Global work variables for regcomp().
160  */
161 static char *regparse;		/* Input-scan pointer. */
162 static int regnpar;		/* () count. */
163 static char regdummy;
164 static char *regcode;		/* Code-emit pointer; &regdummy = don't. */
165 static long regsize;		/* Code size. */
166 
167 /*
168  * Forward declarations for regcomp()'s friends.
169  */
170 #ifndef STATIC
171 #define	STATIC	static
172 #endif
173 STATIC char *reg();
174 STATIC char *regbranch();
175 STATIC char *regpiece();
176 STATIC char *regatom();
177 STATIC char *regnode();
178 STATIC char *regnext();
179 STATIC void regc();
180 STATIC void reginsert();
181 STATIC void regtail();
182 STATIC void regoptail();
183 #ifdef STRCSPN
184 STATIC int strcspn();
185 #endif
186 
187 /*
188  - regcomp - compile a regular expression into internal code
189  *
190  * We can't allocate space until we know how big the compiled form will be,
191  * but we can't compile it (and thus know how big it is) until we've got a
192  * place to put the code.  So we cheat:  we compile it twice, once with code
193  * generation turned off and size counting turned on, and once "for real".
194  * This also means that we don't allocate space until we are sure that the
195  * thing really will compile successfully, and we never have to move the
196  * code and thus invalidate pointers into it.  (Note that it has to be in
197  * one piece because free() must be able to free it all.)
198  *
199  * Beware that the optimization-preparation code in here knows about some
200  * of the structure of the compiled regexp.
201  */
202 regexp *
regcomp(exp)203 regcomp(exp)
204 const char *exp;
205 {
206 	register regexp *r;
207 	register char *scan;
208 	register char *longest;
209 	register int len;
210 	int flags;
211 
212 	if (exp == NULL)
213 		FAIL("NULL argument");
214 
215 	/* First pass: determine size, legality. */
216 #ifdef notdef
217 	if (exp[0] == '.' && exp[1] == '*') exp += 2;  /* aid grep */
218 #endif
219 	regparse = (char *)exp;
220 	regnpar = 1;
221 	regsize = 0L;
222 	regcode = &regdummy;
223 	regc(MAGIC);
224 	if (reg(0, &flags) == NULL)
225 		return(NULL);
226 
227 	/* Small enough for pointer-storage convention? */
228 	if (regsize >= 32767L)		/* Probably could be 65535L. */
229 		FAIL("regexp too big");
230 
231 	/* Allocate space. */
232 	r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
233 	if (r == NULL)
234 		FAIL("out of space");
235 
236 	/* Second pass: emit code. */
237 	regparse = (char *)exp;
238 	regnpar = 1;
239 	regcode = r->program;
240 	regc(MAGIC);
241 	if (reg(0, &flags) == NULL)
242 		return(NULL);
243 
244 	/* Dig out information for optimizations. */
245 	r->regstart = '\0';	/* Worst-case defaults. */
246 	r->reganch = 0;
247 	r->regmust = NULL;
248 	r->regmlen = 0;
249 	scan = r->program+1;			/* First BRANCH. */
250 	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
251 		scan = OPERAND(scan);
252 
253 		/* Starting-point info. */
254 		if (OP(scan) == EXACTLY)
255 			r->regstart = *OPERAND(scan);
256 		else if (OP(scan) == BOL)
257 			r->reganch++;
258 
259 		/*
260 		 * If there's something expensive in the r.e., find the
261 		 * longest literal string that must appear and make it the
262 		 * regmust.  Resolve ties in favor of later strings, since
263 		 * the regstart check works with the beginning of the r.e.
264 		 * and avoiding duplication strengthens checking.  Not a
265 		 * strong reason, but sufficient in the absence of others.
266 		 */
267 		if (flags&SPSTART) {
268 			longest = NULL;
269 			len = 0;
270 			for (; scan != NULL; scan = regnext(scan))
271 				if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
272 					longest = OPERAND(scan);
273 					len = strlen(OPERAND(scan));
274 				}
275 			r->regmust = longest;
276 			r->regmlen = len;
277 		}
278 	}
279 
280 	return(r);
281 }
282 
283 /*
284  - reg - regular expression, i.e. main body or parenthesized thing
285  *
286  * Caller must absorb opening parenthesis.
287  *
288  * Combining parenthesis handling with the base level of regular expression
289  * is a trifle forced, but the need to tie the tails of the branches to what
290  * follows makes it hard to avoid.
291  */
292 static char *
reg(paren,flagp)293 reg(paren, flagp)
294 int paren;			/* Parenthesized? */
295 int *flagp;
296 {
297 	register char *ret;
298 	register char *br;
299 	register char *ender;
300 	register int parno;
301 	int flags;
302 
303 	*flagp = HASWIDTH;	/* Tentatively. */
304 
305 	/* Make an OPEN node, if parenthesized. */
306 	if (paren) {
307 		if (regnpar >= NSUBEXP)
308 			FAIL("too many ()");
309 		parno = regnpar;
310 		regnpar++;
311 		ret = regnode(OPEN+parno);
312 	} else
313 		ret = NULL;
314 
315 	/* Pick up the branches, linking them together. */
316 	br = regbranch(&flags);
317 	if (br == NULL)
318 		return(NULL);
319 	if (ret != NULL)
320 		regtail(ret, br);	/* OPEN -> first. */
321 	else
322 		ret = br;
323 	if (!(flags&HASWIDTH))
324 		*flagp &= ~HASWIDTH;
325 	*flagp |= flags&SPSTART;
326 	while (*regparse == '|' || *regparse == '\n') {
327 		regparse++;
328 		br = regbranch(&flags);
329 		if (br == NULL)
330 			return(NULL);
331 		regtail(ret, br);	/* BRANCH -> BRANCH. */
332 		if (!(flags&HASWIDTH))
333 			*flagp &= ~HASWIDTH;
334 		*flagp |= flags&SPSTART;
335 	}
336 
337 	/* Make a closing node, and hook it on the end. */
338 	ender = regnode((paren) ? CLOSE+parno : END);
339 	regtail(ret, ender);
340 
341 	/* Hook the tails of the branches to the closing node. */
342 	for (br = ret; br != NULL; br = regnext(br))
343 		regoptail(br, ender);
344 
345 	/* Check for proper termination. */
346 	if (paren && *regparse++ != ')') {
347 		FAIL("unmatched ()");
348 	} else if (!paren && *regparse != '\0') {
349 		if (*regparse == ')') {
350 			FAIL("unmatched ()");
351 		} else
352 			FAIL("junk on end");	/* "Can't happen". */
353 		/* NOTREACHED */
354 	}
355 
356 	return(ret);
357 }
358 
359 /*
360  - regbranch - one alternative of an | operator
361  *
362  * Implements the concatenation operator.
363  */
364 static char *
regbranch(flagp)365 regbranch(flagp)
366 int *flagp;
367 {
368 	register char *ret;
369 	register char *chain;
370 	register char *latest;
371 	int flags;
372 
373 	*flagp = WORST;		/* Tentatively. */
374 
375 	ret = regnode(BRANCH);
376 	chain = NULL;
377 	while (*regparse != '\0' && *regparse != ')' &&
378 	       *regparse != '\n' && *regparse != '|') {
379 		latest = regpiece(&flags);
380 		if (latest == NULL)
381 			return(NULL);
382 		*flagp |= flags&HASWIDTH;
383 		if (chain == NULL)	/* First piece. */
384 			*flagp |= flags&SPSTART;
385 		else
386 			regtail(chain, latest);
387 		chain = latest;
388 	}
389 	if (chain == NULL)	/* Loop ran zero times. */
390 		(void) regnode(NOTHING);
391 
392 	return(ret);
393 }
394 
395 /*
396  - regpiece - something followed by possible [*+?]
397  *
398  * Note that the branching code sequences used for ? and the general cases
399  * of * and + are somewhat optimized:  they use the same NOTHING node as
400  * both the endmarker for their branch list and the body of the last branch.
401  * It might seem that this node could be dispensed with entirely, but the
402  * endmarker role is not redundant.
403  */
404 static char *
regpiece(flagp)405 regpiece(flagp)
406 int *flagp;
407 {
408 	register char *ret;
409 	register char op;
410 	register char *next;
411 	int flags;
412 
413 	ret = regatom(&flags);
414 	if (ret == NULL)
415 		return(NULL);
416 
417 	op = *regparse;
418 	if (!ISMULT(op)) {
419 		*flagp = flags;
420 		return(ret);
421 	}
422 
423 	if (!(flags&HASWIDTH) && op != '?')
424 		FAIL("*+ operand could be empty");
425 	*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
426 
427 	if (op == '*' && (flags&SIMPLE))
428 		reginsert(STAR, ret);
429 	else if (op == '*') {
430 		/* Emit x* as (x&|), where & means "self". */
431 		reginsert(BRANCH, ret);			/* Either x */
432 		regoptail(ret, regnode(BACK));		/* and loop */
433 		regoptail(ret, ret);			/* back */
434 		regtail(ret, regnode(BRANCH));		/* or */
435 		regtail(ret, regnode(NOTHING));		/* null. */
436 	} else if (op == '+' && (flags&SIMPLE))
437 		reginsert(PLUS, ret);
438 	else if (op == '+') {
439 		/* Emit x+ as x(&|), where & means "self". */
440 		next = regnode(BRANCH);			/* Either */
441 		regtail(ret, next);
442 		regtail(regnode(BACK), ret);		/* loop back */
443 		regtail(next, regnode(BRANCH));		/* or */
444 		regtail(ret, regnode(NOTHING));		/* null. */
445 	} else if (op == '?') {
446 		/* Emit x? as (x|) */
447 		reginsert(BRANCH, ret);			/* Either x */
448 		regtail(ret, regnode(BRANCH));		/* or */
449 		next = regnode(NOTHING);		/* null. */
450 		regtail(ret, next);
451 		regoptail(ret, next);
452 	}
453 	regparse++;
454 	if (ISMULT(*regparse))
455 		FAIL("nested *?+");
456 
457 	return(ret);
458 }
459 
460 /*
461  - regatom - the lowest level
462  *
463  * Optimization:  gobbles an entire sequence of ordinary characters so that
464  * it can turn them into a single node, which is smaller to store and
465  * faster to run.  Backslashed characters are exceptions, each becoming a
466  * separate node; the code is simpler that way and it's not worth fixing.
467  */
468 static char *
regatom(flagp)469 regatom(flagp)
470 int *flagp;
471 {
472 	register char *ret;
473 	int flags;
474 
475 	*flagp = WORST;		/* Tentatively. */
476 
477 	switch (*regparse++) {
478 	/* FIXME: these chars only have meaning at beg/end of pat? */
479 	case '^':
480 		ret = regnode(BOL);
481 		break;
482 	case '$':
483 		ret = regnode(EOL);
484 		break;
485 	case '.':
486 		ret = regnode(ANY);
487 		*flagp |= HASWIDTH|SIMPLE;
488 		break;
489 	case '[': {
490 			register int class;
491 			register int classend;
492 
493 			if (*regparse == '^') {	/* Complement of range. */
494 				ret = regnode(ANYBUT);
495 				regparse++;
496 			} else
497 				ret = regnode(ANYOF);
498 			if (*regparse == ']' || *regparse == '-')
499 				regc(*regparse++);
500 			while (*regparse != '\0' && *regparse != ']') {
501 				if (*regparse == '-') {
502 					regparse++;
503 					if (*regparse == ']' || *regparse == '\0')
504 						regc('-');
505 					else {
506 						class = UCHARAT(regparse-2)+1;
507 						classend = UCHARAT(regparse);
508 						if (class > classend+1)
509 							FAIL("invalid [] range");
510 						for (; class <= classend; class++)
511 							regc(class);
512 						regparse++;
513 					}
514 				} else
515 					regc(*regparse++);
516 			}
517 			regc('\0');
518 			if (*regparse != ']')
519 				FAIL("unmatched []");
520 			regparse++;
521 			*flagp |= HASWIDTH|SIMPLE;
522 		}
523 		break;
524 	case '(':
525 		ret = reg(1, &flags);
526 		if (ret == NULL)
527 			return(NULL);
528 		*flagp |= flags&(HASWIDTH|SPSTART);
529 		break;
530 	case '\0':
531 	case '|':
532 	case '\n':
533 	case ')':
534 		FAIL("internal urp");	/* Supposed to be caught earlier. */
535 		break;
536 	case '?':
537 	case '+':
538 	case '*':
539 		FAIL("?+* follows nothing");
540 		break;
541 	case '\\':
542 		switch (*regparse++) {
543 		case '\0':
544 			FAIL("trailing \\");
545 			break;
546 		case '<':
547 			ret = regnode(WORDA);
548 			break;
549 		case '>':
550 			ret = regnode(WORDZ);
551 			break;
552 		/* FIXME: Someday handle \1, \2, ... */
553 		default:
554 			/* Handle general quoted chars in exact-match routine */
555 			goto de_fault;
556 		}
557 		break;
558 	de_fault:
559 	default:
560 		/*
561 		 * Encode a string of characters to be matched exactly.
562 		 *
563 		 * This is a bit tricky due to quoted chars and due to
564 		 * '*', '+', and '?' taking the SINGLE char previous
565 		 * as their operand.
566 		 *
567 		 * On entry, the char at regparse[-1] is going to go
568 		 * into the string, no matter what it is.  (It could be
569 		 * following a \ if we are entered from the '\' case.)
570 		 *
571 		 * Basic idea is to pick up a good char in  ch  and
572 		 * examine the next char.  If it's *+? then we twiddle.
573 		 * If it's \ then we frozzle.  If it's other magic char
574 		 * we push  ch  and terminate the string.  If none of the
575 		 * above, we push  ch  on the string and go around again.
576 		 *
577 		 *  regprev  is used to remember where "the current char"
578 		 * starts in the string, if due to a *+? we need to back
579 		 * up and put the current char in a separate, 1-char, string.
580 		 * When  regprev  is NULL,  ch  is the only char in the
581 		 * string; this is used in *+? handling, and in setting
582 		 * flags |= SIMPLE at the end.
583 		 */
584 		{
585 			char *regprev;
586 			register char ch;
587 
588 			regparse--;			/* Look at cur char */
589 			ret = regnode(EXACTLY);
590 			for ( regprev = 0 ; ; ) {
591 				ch = *regparse++;	/* Get current char */
592 				switch (*regparse) {	/* look at next one */
593 
594 				default:
595 					regc(ch);	/* Add cur to string */
596 					break;
597 
598 				case '.': case '[': case '(':
599 				case ')': case '|': case '\n':
600 				case '$': case '^':
601 				case '\0':
602 				/* FIXME, $ and ^ should not always be magic */
603 				magic:
604 					regc(ch);	/* dump cur char */
605 					goto done;	/* and we are done */
606 
607 				case '?': case '+': case '*':
608 					if (!regprev) 	/* If just ch in str, */
609 						goto magic;	/* use it */
610 					/* End mult-char string one early */
611 					regparse = regprev; /* Back up parse */
612 					goto done;
613 
614 				case '\\':
615 					regc(ch);	/* Cur char OK */
616 					switch (regparse[1]){ /* Look after \ */
617 					case '\0':
618 					case '<':
619 					case '>':
620 					/* FIXME: Someday handle \1, \2, ... */
621 						goto done; /* Not quoted */
622 					default:
623 						/* Backup point is \, scan							 * point is after it. */
624 						regprev = regparse;
625 						regparse++;
626 						continue;	/* NOT break; */
627 					}
628 				}
629 				regprev = regparse;	/* Set backup point */
630 			}
631 		done:
632 			regc('\0');
633 			*flagp |= HASWIDTH;
634 			if (!regprev)		/* One char? */
635 				*flagp |= SIMPLE;
636 		}
637 		break;
638 	}
639 
640 	return(ret);
641 }
642 
643 /*
644  - regnode - emit a node
645  */
646 static char *			/* Location. */
regnode(op)647 regnode(op)
648 char op;
649 {
650 	register char *ret;
651 	register char *ptr;
652 
653 	ret = regcode;
654 	if (ret == &regdummy) {
655 		regsize += 3;
656 		return(ret);
657 	}
658 
659 	ptr = ret;
660 	*ptr++ = op;
661 	*ptr++ = '\0';		/* Null "next" pointer. */
662 	*ptr++ = '\0';
663 	regcode = ptr;
664 
665 	return(ret);
666 }
667 
668 /*
669  - regc - emit (if appropriate) a byte of code
670  */
671 static void
regc(b)672 regc(b)
673 char b;
674 {
675 	if (regcode != &regdummy)
676 		*regcode++ = b;
677 	else
678 		regsize++;
679 }
680 
681 /*
682  - reginsert - insert an operator in front of already-emitted operand
683  *
684  * Means relocating the operand.
685  */
686 static void
reginsert(op,opnd)687 reginsert(op, opnd)
688 char op;
689 char *opnd;
690 {
691 	register char *src;
692 	register char *dst;
693 	register char *place;
694 
695 	if (regcode == &regdummy) {
696 		regsize += 3;
697 		return;
698 	}
699 
700 	src = regcode;
701 	regcode += 3;
702 	dst = regcode;
703 	while (src > opnd)
704 		*--dst = *--src;
705 
706 	place = opnd;		/* Op node, where operand used to be. */
707 	*place++ = op;
708 	*place++ = '\0';
709 	*place++ = '\0';
710 }
711 
712 /*
713  - regtail - set the next-pointer at the end of a node chain
714  */
715 static void
regtail(p,val)716 regtail(p, val)
717 char *p;
718 char *val;
719 {
720 	register char *scan;
721 	register char *temp;
722 	register int offset;
723 
724 	if (p == &regdummy)
725 		return;
726 
727 	/* Find last node. */
728 	scan = p;
729 	for (;;) {
730 		temp = regnext(scan);
731 		if (temp == NULL)
732 			break;
733 		scan = temp;
734 	}
735 
736 	if (OP(scan) == BACK)
737 		offset = scan - val;
738 	else
739 		offset = val - scan;
740 	*(scan+1) = (offset>>8)&0377;
741 	*(scan+2) = offset&0377;
742 }
743 
744 /*
745  - regoptail - regtail on operand of first argument; nop if operandless
746  */
747 static void
regoptail(p,val)748 regoptail(p, val)
749 char *p;
750 char *val;
751 {
752 	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
753 	if (p == NULL || p == &regdummy || OP(p) != BRANCH)
754 		return;
755 	regtail(OPERAND(p), val);
756 }
757 
758 /*
759  * regexec and friends
760  */
761 
762 /*
763  * Global work variables for regexec().
764  */
765 static char *reginput;		/* String-input pointer. */
766 static char *regbol;		/* Beginning of input, for ^ check. */
767 static char **regstartp;	/* Pointer to startp array. */
768 static char **regendp;		/* Ditto for endp. */
769 
770 /*
771  * Forwards.
772  */
773 STATIC int regtry();
774 STATIC int regmatch();
775 STATIC int regrepeat();
776 
777 #ifdef DEBUG
778 int regnarrate = 0;
779 void regdump();
780 STATIC char *regprop();
781 #endif
782 
783 /*
784  - regexec - match a regexp against a string
785  */
786 int
regexec(prog,string)787 regexec(prog, string)
788 register const regexp *prog;
789 register const char *string;
790 {
791 	register char *s;
792 	extern char *strchr();
793 
794 	/* Be paranoid... */
795 	if (prog == NULL || string == NULL) {
796 		regerror("NULL parameter");
797 		return(0);
798 	}
799 
800 	/* Check validity of program. */
801 	if (UCHARAT(prog->program) != MAGIC) {
802 		regerror("corrupted program");
803 		return(0);
804 	}
805 
806 	/* If there is a "must appear" string, look for it. */
807 	if (prog->regmust != NULL) {
808 		s = (char *)string;
809 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
810 			if (strncmp(s, prog->regmust, prog->regmlen) == 0)
811 				break;	/* Found it. */
812 			s++;
813 		}
814 		if (s == NULL)	/* Not present. */
815 			return(0);
816 	}
817 
818 	/* Mark beginning of line for ^ . */
819 	regbol = (char *)string;
820 
821 	/* Simplest case:  anchored match need be tried only once. */
822 	if (prog->reganch)
823 		return(regtry(prog, string));
824 
825 	/* Messy cases:  unanchored match. */
826 	s = (char *)string;
827 	if (prog->regstart != '\0')
828 		/* We know what char it must start with. */
829 		while ((s = strchr(s, prog->regstart)) != NULL) {
830 			if (regtry(prog, s))
831 				return(1);
832 			s++;
833 		}
834 	else
835 		/* We don't -- general case. */
836 		do {
837 			if (regtry(prog, s))
838 				return(1);
839 		} while (*s++ != '\0');
840 
841 	/* Failure. */
842 	return(0);
843 }
844 
845 /*
846  - regtry - try match at specific point
847  */
848 static int			/* 0 failure, 1 success */
regtry(prog,string)849 regtry(prog, string)
850 regexp *prog;
851 char *string;
852 {
853 	register int i;
854 	register char **sp;
855 	register char **ep;
856 
857 	reginput = string;
858 	regstartp = prog->startp;
859 	regendp = prog->endp;
860 
861 	sp = prog->startp;
862 	ep = prog->endp;
863 	for (i = NSUBEXP; i > 0; i--) {
864 		*sp++ = NULL;
865 		*ep++ = NULL;
866 	}
867 	if (regmatch(prog->program + 1)) {
868 		prog->startp[0] = string;
869 		prog->endp[0] = reginput;
870 		return(1);
871 	} else
872 		return(0);
873 }
874 
875 /*
876  - regmatch - main matching routine
877  *
878  * Conceptually the strategy is simple:  check to see whether the current
879  * node matches, call self recursively to see whether the rest matches,
880  * and then act accordingly.  In practice we make some effort to avoid
881  * recursion, in particular by going through "ordinary" nodes (that don't
882  * need to know whether the rest of the match failed) by a loop instead of
883  * by recursion.
884  */
885 static int			/* 0 failure, 1 success */
regmatch(prog)886 regmatch(prog)
887 char *prog;
888 {
889 	register char *scan;	/* Current node. */
890 	char *next;		/* Next node. */
891 	extern char *strchr();
892 
893 	scan = prog;
894 #ifdef DEBUG
895 	if (scan != NULL && regnarrate)
896 		fprintf(stderr, "%s(\n", regprop(scan));
897 #endif
898 	while (scan != NULL) {
899 #ifdef DEBUG
900 		if (regnarrate)
901 			fprintf(stderr, "%s...\n", regprop(scan));
902 #endif
903 		next = regnext(scan);
904 
905 		switch (OP(scan)) {
906 		case BOL:
907 			if (reginput != regbol)
908 				return(0);
909 			break;
910 		case EOL:
911 			if (*reginput != '\0')
912 				return(0);
913 			break;
914 		case WORDA:
915 			/* Must be looking at a letter, digit, or _ */
916 			if ((!isalnum(*reginput)) && *reginput != '_')
917 				return(0);
918 			/* Prev must be BOL or nonword */
919 			if (reginput > regbol &&
920 			    (isalnum(reginput[-1]) || reginput[-1] == '_'))
921 				return(0);
922 			break;
923 		case WORDZ:
924 			/* Must be looking at non letter, digit, or _ */
925 			if (isalnum(*reginput) || *reginput == '_')
926 				return(0);
927 			/* We don't care what the previous char was */
928 			break;
929 		case ANY:
930 			if (*reginput == '\0')
931 				return(0);
932 			reginput++;
933 			break;
934 		case EXACTLY: {
935 				register int len;
936 				register char *opnd;
937 
938 				opnd = OPERAND(scan);
939 				/* Inline the first character, for speed. */
940 				if (*opnd != *reginput)
941 					return(0);
942 				len = strlen(opnd);
943 				if (len > 1 && strncmp(opnd, reginput, len) != 0)
944 					return(0);
945 				reginput += len;
946 			}
947 			break;
948 		case ANYOF:
949  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
950 				return(0);
951 			reginput++;
952 			break;
953 		case ANYBUT:
954  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
955 				return(0);
956 			reginput++;
957 			break;
958 		case NOTHING:
959 			break;
960 		case BACK:
961 			break;
962 		case OPEN+1:
963 		case OPEN+2:
964 		case OPEN+3:
965 		case OPEN+4:
966 		case OPEN+5:
967 		case OPEN+6:
968 		case OPEN+7:
969 		case OPEN+8:
970 		case OPEN+9: {
971 				register int no;
972 				register char *save;
973 
974 				no = OP(scan) - OPEN;
975 				save = reginput;
976 
977 				if (regmatch(next)) {
978 					/*
979 					 * Don't set startp if some later
980 					 * invocation of the same parentheses
981 					 * already has.
982 					 */
983 					if (regstartp[no] == NULL)
984 						regstartp[no] = save;
985 					return(1);
986 				} else
987 					return(0);
988 			}
989 			break;
990 		case CLOSE+1:
991 		case CLOSE+2:
992 		case CLOSE+3:
993 		case CLOSE+4:
994 		case CLOSE+5:
995 		case CLOSE+6:
996 		case CLOSE+7:
997 		case CLOSE+8:
998 		case CLOSE+9: {
999 				register int no;
1000 				register char *save;
1001 
1002 				no = OP(scan) - CLOSE;
1003 				save = reginput;
1004 
1005 				if (regmatch(next)) {
1006 					/*
1007 					 * Don't set endp if some later
1008 					 * invocation of the same parentheses
1009 					 * already has.
1010 					 */
1011 					if (regendp[no] == NULL)
1012 						regendp[no] = save;
1013 					return(1);
1014 				} else
1015 					return(0);
1016 			}
1017 			break;
1018 		case BRANCH: {
1019 				register char *save;
1020 
1021 				if (OP(next) != BRANCH)		/* No choice. */
1022 					next = OPERAND(scan);	/* Avoid recursion. */
1023 				else {
1024 					do {
1025 						save = reginput;
1026 						if (regmatch(OPERAND(scan)))
1027 							return(1);
1028 						reginput = save;
1029 						scan = regnext(scan);
1030 					} while (scan != NULL && OP(scan) == BRANCH);
1031 					return(0);
1032 					/* NOTREACHED */
1033 				}
1034 			}
1035 			break;
1036 		case STAR:
1037 		case PLUS: {
1038 				register char nextch;
1039 				register int no;
1040 				register char *save;
1041 				register int min;
1042 
1043 				/*
1044 				 * Lookahead to avoid useless match attempts
1045 				 * when we know what character comes next.
1046 				 */
1047 				nextch = '\0';
1048 				if (OP(next) == EXACTLY)
1049 					nextch = *OPERAND(next);
1050 				min = (OP(scan) == STAR) ? 0 : 1;
1051 				save = reginput;
1052 				no = regrepeat(OPERAND(scan));
1053 				while (no >= min) {
1054 					/* If it could work, try it. */
1055 					if (nextch == '\0' || *reginput == nextch)
1056 						if (regmatch(next))
1057 							return(1);
1058 					/* Couldn't or didn't -- back up. */
1059 					no--;
1060 					reginput = save + no;
1061 				}
1062 				return(0);
1063 			}
1064 			break;
1065 		case END:
1066 			return(1);	/* Success! */
1067 			break;
1068 		default:
1069 			regerror("memory corruption");
1070 			return(0);
1071 			break;
1072 		}
1073 
1074 		scan = next;
1075 	}
1076 
1077 	/*
1078 	 * We get here only if there's trouble -- normally "case END" is
1079 	 * the terminating point.
1080 	 */
1081 	regerror("corrupted pointers");
1082 	return(0);
1083 }
1084 
1085 /*
1086  - regrepeat - repeatedly match something simple, report how many
1087  */
1088 static int
regrepeat(p)1089 regrepeat(p)
1090 char *p;
1091 {
1092 	register int count = 0;
1093 	register char *scan;
1094 	register char *opnd;
1095 
1096 	scan = reginput;
1097 	opnd = OPERAND(p);
1098 	switch (OP(p)) {
1099 	case ANY:
1100 		count = strlen(scan);
1101 		scan += count;
1102 		break;
1103 	case EXACTLY:
1104 		while (*opnd == *scan) {
1105 			count++;
1106 			scan++;
1107 		}
1108 		break;
1109 	case ANYOF:
1110 		while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1111 			count++;
1112 			scan++;
1113 		}
1114 		break;
1115 	case ANYBUT:
1116 		while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1117 			count++;
1118 			scan++;
1119 		}
1120 		break;
1121 	default:		/* Oh dear.  Called inappropriately. */
1122 		regerror("internal foulup");
1123 		count = 0;	/* Best compromise. */
1124 		break;
1125 	}
1126 	reginput = scan;
1127 
1128 	return(count);
1129 }
1130 
1131 /*
1132  - regnext - dig the "next" pointer out of a node
1133  */
1134 static char *
regnext(p)1135 regnext(p)
1136 register char *p;
1137 {
1138 	register int offset;
1139 
1140 	if (p == &regdummy)
1141 		return(NULL);
1142 
1143 	offset = NEXT(p);
1144 	if (offset == 0)
1145 		return(NULL);
1146 
1147 	if (OP(p) == BACK)
1148 		return(p-offset);
1149 	else
1150 		return(p+offset);
1151 }
1152 
1153 #ifdef DEBUG
1154 
1155 STATIC char *regprop();
1156 
1157 /*
1158  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1159  */
1160 void
regdump(r)1161 regdump(r)
1162 regexp *r;
1163 {
1164 	register char *s;
1165 	register char op = EXACTLY;	/* Arbitrary non-END op. */
1166 	register char *next;
1167 	extern char *strchr();
1168 
1169 
1170 	s = r->program + 1;
1171 	while (op != END) {	/* While that wasn't END last time... */
1172 		op = OP(s);
1173 		printf("%2d%s", s-r->program, regprop(s));	/* Where, what. */
1174 		next = regnext(s);
1175 		if (next == NULL)		/* Next ptr. */
1176 			printf("(0)");
1177 		else
1178 			printf("(%d)", (s-r->program)+(next-s));
1179 		s += 3;
1180 		if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1181 			/* Literal string, where present. */
1182 			while (*s != '\0') {
1183 				putchar(*s);
1184 				s++;
1185 			}
1186 			s++;
1187 		}
1188 		putchar('\n');
1189 	}
1190 
1191 	/* Header fields of interest. */
1192 	if (r->regstart != '\0')
1193 		printf("start `%c' ", r->regstart);
1194 	if (r->reganch)
1195 		printf("anchored ");
1196 	if (r->regmust != NULL)
1197 		printf("must have \"%s\"", r->regmust);
1198 	printf("\n");
1199 }
1200 
1201 /*
1202  - regprop - printable representation of opcode
1203  */
1204 static char *
regprop(op)1205 regprop(op)
1206 char *op;
1207 {
1208 	register char *p;
1209 	static char buf[50];
1210 
1211 	(void) strcpy(buf, ":");
1212 
1213 	switch (OP(op)) {
1214 	case BOL:
1215 		p = "BOL";
1216 		break;
1217 	case EOL:
1218 		p = "EOL";
1219 		break;
1220 	case ANY:
1221 		p = "ANY";
1222 		break;
1223 	case ANYOF:
1224 		p = "ANYOF";
1225 		break;
1226 	case ANYBUT:
1227 		p = "ANYBUT";
1228 		break;
1229 	case BRANCH:
1230 		p = "BRANCH";
1231 		break;
1232 	case EXACTLY:
1233 		p = "EXACTLY";
1234 		break;
1235 	case NOTHING:
1236 		p = "NOTHING";
1237 		break;
1238 	case BACK:
1239 		p = "BACK";
1240 		break;
1241 	case END:
1242 		p = "END";
1243 		break;
1244 	case OPEN+1:
1245 	case OPEN+2:
1246 	case OPEN+3:
1247 	case OPEN+4:
1248 	case OPEN+5:
1249 	case OPEN+6:
1250 	case OPEN+7:
1251 	case OPEN+8:
1252 	case OPEN+9:
1253 		sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1254 		p = NULL;
1255 		break;
1256 	case CLOSE+1:
1257 	case CLOSE+2:
1258 	case CLOSE+3:
1259 	case CLOSE+4:
1260 	case CLOSE+5:
1261 	case CLOSE+6:
1262 	case CLOSE+7:
1263 	case CLOSE+8:
1264 	case CLOSE+9:
1265 		sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1266 		p = NULL;
1267 		break;
1268 	case STAR:
1269 		p = "STAR";
1270 		break;
1271 	case PLUS:
1272 		p = "PLUS";
1273 		break;
1274 	case WORDA:
1275 		p = "WORDA";
1276 		break;
1277 	case WORDZ:
1278 		p = "WORDZ";
1279 		break;
1280 	default:
1281 		regerror("corrupted opcode");
1282 		break;
1283 	}
1284 	if (p != NULL)
1285 		(void) strcat(buf, p);
1286 	return(buf);
1287 }
1288 #endif
1289 
1290 /*
1291  * The following is provided for those people who do not have strcspn() in
1292  * their C libraries.  They should get off their butts and do something
1293  * about it; at least one public-domain implementation of those (highly
1294  * useful) string routines has been published on Usenet.
1295  */
1296 #ifdef STRCSPN
1297 /*
1298  * strcspn - find length of initial segment of s1 consisting entirely
1299  * of characters not from s2
1300  */
1301 
1302 static int
strcspn(s1,s2)1303 strcspn(s1, s2)
1304 char *s1;
1305 char *s2;
1306 {
1307 	register char *scan1;
1308 	register char *scan2;
1309 	register int count;
1310 
1311 	count = 0;
1312 	for (scan1 = s1; *scan1 != '\0'; scan1++) {
1313 		for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
1314 			if (*scan1 == *scan2++)
1315 				return(count);
1316 		count++;
1317 	}
1318 	return(count);
1319 }
1320 #endif
1321