1 /*
2  * reg_compile and reg_execute
3  *
4  *	Copyright (c) 1986 by University of Toronto.
5  *	Written by Henry Spencer.  Not derived from licensed software.
6  *
7  *	Permission is granted to anyone to use this software for any
8  *	purpose on any computer system, and to redistribute it freely,
9  *	subject to the following restrictions:
10  *
11  *	1. The author is not responsible for the consequences of use of
12  *		this software, no matter how awful, even if they arise
13  *		from defects in it.
14  *
15  *	2. The origin of this software must not be misrepresented, either
16  *		by explicit claim or by omission.
17  *
18  *	3. Altered versions must be plainly marked as such, and must not
19  *		be misrepresented as being the original software.
20  *
21  * Beware that some of this code is subtly aware of the way operator
22  * precedence is structured in regular expressions.  Serious changes in
23  * regular-expression syntax might require a total rethink.
24  */
25 #include <stdio.h>
26 #include <stdlib.h> /* for malloc() */
27 #include <string.h> /* for strlen() */
28 
29 #include "regexpr.h"
30 
31 /*
32  * The first byte of the regexp internal "program" is actually this magic
33  * number; the start node begins in the second byte.
34  */
35 #define	MAGIC	0234
36 
37 /*
38  * The "internal use only" fields in regexpr.h are present to pass info from
39  * compile to execute that permits the execute phase to run lots faster on
40  * simple cases.  They are:
41  *
42  * regstart	char that must begin a match; '\0' if none obvious
43  * reganch	is the match anchored (at beginning-of-line only)?
44  * regmust	string (pointer into program) that match must include, or NULL
45  * regmlen	length of regmust string
46  *
47  * Regstart and reganch permit very fast decisions on suitable starting points
48  * for a match, cutting down the work a lot.  Regmust permits fast rejection
49  * of lines that cannot possibly match.  The regmust tests are costly enough
50  * that reg_compile() supplies a regmust only if the r.e. contains something
51  * potentially expensive (at present, the only such thing detected is * or +
52  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
53  * supplied because the test in reg_execute() needs it and reg_compile() is
54  * computing it anyway.
55  */
56 
57 /*
58  * Structure for regexp "program".  This is essentially a linear encoding
59  * of a nondeterministic finite-state machine (aka syntax charts or
60  * "railroad normal form" in parsing technology).  Each node is an opcode
61  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
62  * all nodes except BRANCH implement concatenation; a "next" pointer with
63  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
64  * have one of the subtle syntax dependencies:  an individual BRANCH (as
65  * opposed to a collection of them) is never concatenated with anything
66  * because of operator precedence.)  The operand of some types of node is
67  * a literal string; for others, it is a node leading into a sub-FSM.  In
68  * particular, the operand of a BRANCH node is the first node of the branch.
69  * (NB this is *not* a tree structure:  the tail of the branch connects
70  * to the thing following the set of BRANCHes.)  The opcodes are:
71  */
72 
73 /* definition	number	opnd?	meaning */
74 #define	END	0	/* no	End of program. */
75 #define	BOL	1	/* no	Match "" at beginning of line. */
76 #define	EOL	2	/* no	Match "" at end of line. */
77 #define	ANY	3	/* no	Match any one character. */
78 #define	ANYOF	4	/* str	Match any character in this string. */
79 #define	ANYBUT	5	/* str	Match any character not in this string. */
80 #define	BRANCH	6	/* node	Match this alternative, or the next... */
81 #define	BACK	7	/* no	Match "", "next" ptr points backward. */
82 #define	EXACTLY	8	/* str	Match this string. */
83 #define	NOTHING	9	/* no	Match empty string. */
84 #define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
85 #define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
86 #define	OPEN	20	/* no	Mark this point in input as start of #n. */
87 			/*	OPEN+1 is number 1, etc. */
88 #define	CLOSE	30	/* no	Analogous to OPEN. */
89 
90 /*
91  * Opcode notes:
92  *
93  * BRANCH	The set of branches constituting a single choice are hooked
94  *		together with their "next" pointers, since precedence prevents
95  *		anything being concatenated to any individual branch.  The
96  *		"next" pointer of the last BRANCH in a choice points to the
97  *		thing following the whole choice.  This is also where the
98  *		final "next" pointer of each individual branch points; each
99  *		branch starts with the operand node of a BRANCH node.
100  *
101  * BACK		Normal "next" pointers all implicitly point forward; BACK
102  *		exists to make loop structures possible.
103  *
104  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
105  *		BRANCH structures using BACK.  Simple cases (one character
106  *		per match) are implemented with STAR and PLUS for speed
107  *		and to minimize recursive plunges.
108  *
109  * OPEN,CLOSE	...are numbered at compile time.
110  */
111 
112 /*
113  * A node is one char of opcode followed by two chars of "next" pointer.
114  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
115  * value is a positive offset from the opcode of the node containing it.
116  * An operand, if any, simply follows the node.  (Note that much of the
117  * code generation knows about this implicit relationship.)
118  *
119  * Using two bytes for the "next" pointer is vast overkill for most things,
120  * but allows patterns to get big without disasters.
121  */
122 #define	OP(p)	(*(p))
123 #define	NEXT(p)	(((*((p)+1)&0377)<<8) + *((p)+2)&0377)
124 #define	OPERAND(p)	((p) + 3)
125 
126 /*
127  * See regmagic.h for one further detail of program structure.
128  */
129 
130 
131 /*
132  * Utility definitions.
133  */
134 #ifndef CHARBITS
135 #define	UCHARAT(p)	((int)*(unsigned char *)(p))
136 #else
137 #define	UCHARAT(p)	((int)*(p)&CHARBITS)
138 #endif
139 
140 #define	FAIL(m)	{ reg_error(m); return(NULL); }
141 #define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
142 #define	META	"^$.[()|?+*\\"
143 
144 /*
145  * Flags to be passed up and down.
146  */
147 #define	HASWIDTH	01	/* Known never to match null string. */
148 #define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
149 #define	SPSTART		04	/* Starts with * or +. */
150 #define	WORST		0	/* Worst case. */
151 
152 /*
153  * Global work variables for reg_compile().
154  */
155 static char *regparse;		/* Input-scan pointer. */
156 static int regnpar;		/* () count. */
157 static char regdummy;
158 static char *regcode;		/* Code-emit pointer; &regdummy = don't. */
159 static long regsize;		/* Code size. */
160 
161 /*
162  * Forward declarations for reg_compile()'s friends.
163  */
164 #ifndef STATIC
165 #define	STATIC	static
166 #endif
167 STATIC char *reg(int, int *);
168 STATIC char *regbranch(int *);
169 STATIC char *regpiece(int *);
170 STATIC char *regatom(int *);
171 STATIC char *regnode(char);
172 STATIC char *regnext(char *);
173 STATIC void regc(char);
174 STATIC void reginsert(char, char *);
175 STATIC void regtail(char *, char *);
176 STATIC void regoptail(char *, char *);
177 #ifdef STRCSPN
178 STATIC int strcspn(char *, char *);
179 #endif
180 
181 /*
182  - reg_compile - compile a regular expression into internal code
183  *
184  * We can't allocate space until we know how big the compiled form will be,
185  * but we can't compile it (and thus know how big it is) until we've got a
186  * place to put the code.  So we cheat:  we compile it twice, once with code
187  * generation turned off and size counting turned on, and once "for real".
188  * This also means that we don't allocate space until we are sure that the
189  * thing really will compile successfully, and we never have to move the
190  * code and thus invalidate pointers into it.  (Note that it has to be in
191  * one piece because free() must be able to free it all.)
192  *
193  * Beware that the optimization-preparation code in here knows about some
194  * of the structure of the compiled regexp.
195  */
reg_compile(char * exp)196 regexpr *reg_compile(char *exp)
197 {
198 	register regexpr *r;
199 	register char *scan;
200 	register char *longest;
201 	register int len;
202 	int flags;
203 #if !defined(__DragonFly__)
204 	extern char *malloc();
205 #endif
206 
207 	if (exp == NULL)
208 		FAIL("NULL argument");
209 
210 	/* First pass: determine size, legality. */
211 	regparse = exp;
212 	regnpar = 1;
213 	regsize = 0L;
214 	regcode = &regdummy;
215 	regc(MAGIC);
216 	if (reg(0, &flags) == NULL)
217 		return(NULL);
218 
219 	/* Small enough for pointer-storage convention? */
220 	if (regsize >= 32767L)		/* Probably could be 65535L. */
221 		FAIL("regexpr too big");
222 
223 	/* Allocate space. */
224 	r = (regexpr *)malloc(sizeof(regexpr) + (unsigned)regsize);
225 	if (r == NULL)
226 		FAIL("out of space");
227 
228 	/* Second pass: emit code. */
229 	regparse = exp;
230 	regnpar = 1;
231 	regcode = r->program;
232 	regc(MAGIC);
233 	if (reg(0, &flags) == NULL)
234 		return(NULL);
235 
236 	/* Dig out information for optimizations. */
237 	r->regstart = '\0';	/* Worst-case defaults. */
238 	r->reganch = 0;
239 	r->regmust = NULL;
240 	r->regmlen = 0;
241 	scan = r->program+1;			/* First BRANCH. */
242 	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
243 		scan = OPERAND(scan);
244 
245 		/* Starting-point info. */
246 		if (OP(scan) == EXACTLY)
247 			r->regstart = *OPERAND(scan);
248 		else if (OP(scan) == BOL)
249 			r->reganch++;
250 
251 		/*
252 		 * If there's something expensive in the r.e., find the
253 		 * longest literal string that must appear and make it the
254 		 * regmust.  Resolve ties in favor of later strings, since
255 		 * the regstart check works with the beginning of the r.e.
256 		 * and avoiding duplication strengthens checking.  Not a
257 		 * strong reason, but sufficient in the absence of others.
258 		 */
259 		if (flags&SPSTART) {
260 			longest = NULL;
261 			len = 0;
262 			for (; scan != NULL; scan = regnext(scan))
263 				if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
264 					longest = OPERAND(scan);
265 					len = strlen(OPERAND(scan));
266 				}
267 			r->regmust = longest;
268 			r->regmlen = len;
269 		}
270 	}
271 
272 	return(r);
273 }
274 
275 /*
276  - reg - regular expression, i.e. main body or parenthesized thing
277  *
278  * Caller must absorb opening parenthesis.
279  *
280  * Combining parenthesis handling with the base level of regular expression
281  * is a trifle forced, but the need to tie the tails of the branches to what
282  * follows makes it hard to avoid.
283  */
reg(int paren,int * flagp)284 static char *reg(int paren, int *flagp)
285 {
286 	register char *ret;
287 	register char *br;
288 	register char *ender;
289 	register int parno;
290 	int flags;
291 
292 	*flagp = HASWIDTH;	/* Tentatively. */
293 
294 	/* Make an OPEN node, if parenthesized. */
295 	if (paren) {
296 		if (regnpar >= NSUBEXP)
297 			FAIL("too many ()");
298 		parno = regnpar;
299 		regnpar++;
300 		ret = regnode(OPEN+parno);
301 	} else
302 		ret = NULL;
303 
304 	/* Pick up the branches, linking them together. */
305 	br = regbranch(&flags);
306 	if (br == NULL)
307 		return(NULL);
308 	if (ret != NULL)
309 		regtail(ret, br);	/* OPEN -> first. */
310 	else
311 		ret = br;
312 	if (!(flags&HASWIDTH))
313 		*flagp &= ~HASWIDTH;
314 	*flagp |= flags&SPSTART;
315 	while (*regparse == '|') {
316 		regparse++;
317 		br = regbranch(&flags);
318 		if (br == NULL)
319 			return(NULL);
320 		regtail(ret, br);	/* BRANCH -> BRANCH. */
321 		if (!(flags&HASWIDTH))
322 			*flagp &= ~HASWIDTH;
323 		*flagp |= flags&SPSTART;
324 	}
325 
326 	/* Make a closing node, and hook it on the end. */
327 	ender = regnode((paren) ? CLOSE+parno : END);
328 	regtail(ret, ender);
329 
330 	/* Hook the tails of the branches to the closing node. */
331 	for (br = ret; br != NULL; br = regnext(br))
332 		regoptail(br, ender);
333 
334 	/* Check for proper termination. */
335 	if (paren && *regparse++ != ')') {
336 		FAIL("unmatched ()");
337 	} else if (!paren && *regparse != '\0') {
338 		if (*regparse == ')') {
339 			FAIL("unmatched ()");
340 		} else
341 			FAIL("junk on end");	/* "Can't happen". */
342 		/* NOTREACHED */
343 	}
344 
345 	return(ret);
346 }
347 
348 /*
349  - regbranch - one alternative of an | operator
350  *
351  * Implements the concatenation operator.
352  */
regbranch(int * flagp)353 static char *regbranch(int *flagp)
354 {
355 	register char *ret;
356 	register char *chain;
357 	register char *latest;
358 	int flags;
359 
360 	*flagp = WORST;		/* Tentatively. */
361 
362 	ret = regnode(BRANCH);
363 	chain = NULL;
364 	while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
365 		latest = regpiece(&flags);
366 		if (latest == NULL)
367 			return(NULL);
368 		*flagp |= flags&HASWIDTH;
369 		if (chain == NULL)	/* First piece. */
370 			*flagp |= flags&SPSTART;
371 		else
372 			regtail(chain, latest);
373 		chain = latest;
374 	}
375 	if (chain == NULL)	/* Loop ran zero times. */
376 		(void) regnode(NOTHING);
377 
378 	return(ret);
379 }
380 
381 /*
382  - regpiece - something followed by possible [*+?]
383  *
384  * Note that the branching code sequences used for ? and the general cases
385  * of * and + are somewhat optimized:  they use the same NOTHING node as
386  * both the endmarker for their branch list and the body of the last branch.
387  * It might seem that this node could be dispensed with entirely, but the
388  * endmarker role is not redundant.
389  */
regpiece(int * flagp)390 static char *regpiece(int *flagp)
391 {
392 	register char *ret;
393 	register char op;
394 	register char *next;
395 	int flags;
396 
397 	ret = regatom(&flags);
398 	if (ret == NULL)
399 		return(NULL);
400 
401 	op = *regparse;
402 	if (!ISMULT(op)) {
403 		*flagp = flags;
404 		return(ret);
405 	}
406 
407 	if (!(flags&HASWIDTH) && op != '?')
408 		FAIL("*+ operand could be empty");
409 	*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
410 
411 	if (op == '*' && (flags&SIMPLE))
412 		reginsert(STAR, ret);
413 	else if (op == '*') {
414 		/* Emit x* as (x&|), where & means "self". */
415 		reginsert(BRANCH, ret);			/* Either x */
416 		regoptail(ret, regnode(BACK));		/* and loop */
417 		regoptail(ret, ret);			/* back */
418 		regtail(ret, regnode(BRANCH));		/* or */
419 		regtail(ret, regnode(NOTHING));		/* null. */
420 	} else if (op == '+' && (flags&SIMPLE))
421 		reginsert(PLUS, ret);
422 	else if (op == '+') {
423 		/* Emit x+ as x(&|), where & means "self". */
424 		next = regnode(BRANCH);			/* Either */
425 		regtail(ret, next);
426 		regtail(regnode(BACK), ret);		/* loop back */
427 		regtail(next, regnode(BRANCH));		/* or */
428 		regtail(ret, regnode(NOTHING));		/* null. */
429 	} else if (op == '?') {
430 		/* Emit x? as (x|) */
431 		reginsert(BRANCH, ret);			/* Either x */
432 		regtail(ret, regnode(BRANCH));		/* or */
433 		next = regnode(NOTHING);		/* null. */
434 		regtail(ret, next);
435 		regoptail(ret, next);
436 	}
437 	regparse++;
438 	if (ISMULT(*regparse))
439 		FAIL("nested *?+");
440 
441 	return(ret);
442 }
443 
444 /*
445  - regatom - the lowest level
446  *
447  * Optimization:  gobbles an entire sequence of ordinary characters so that
448  * it can turn them into a single node, which is smaller to store and
449  * faster to run.  Backslashed characters are exceptions, each becoming a
450  * separate node; the code is simpler that way and it's not worth fixing.
451  */
regatom(int * flagp)452 static char *regatom(int *flagp)
453 {
454 	register char *ret;
455 	int flags;
456 
457 	*flagp = WORST;		/* Tentatively. */
458 
459 	switch (*regparse++) {
460 	case '^':
461 		ret = regnode(BOL);
462 		break;
463 	case '$':
464 		ret = regnode(EOL);
465 		break;
466 	case '.':
467 		ret = regnode(ANY);
468 		*flagp |= HASWIDTH|SIMPLE;
469 		break;
470 	case '[': {
471 			register int class;
472 			register int classend;
473 
474 			if (*regparse == '^') {	/* Complement of range. */
475 				ret = regnode(ANYBUT);
476 				regparse++;
477 			} else
478 				ret = regnode(ANYOF);
479 			if (*regparse == ']' || *regparse == '-')
480 				regc(*regparse++);
481 			while (*regparse != '\0' && *regparse != ']') {
482 				if (*regparse == '-') {
483 					regparse++;
484 					if (*regparse == ']' || *regparse == '\0')
485 						regc('-');
486 					else {
487 						class = UCHARAT(regparse-2)+1;
488 						classend = UCHARAT(regparse);
489 						if (class > classend+1)
490 							FAIL("invalid [] range");
491 						for (; class <= classend; class++)
492 							regc(class);
493 						regparse++;
494 					}
495 				} else
496 					regc(*regparse++);
497 			}
498 			regc('\0');
499 			if (*regparse != ']')
500 				FAIL("unmatched []");
501 			regparse++;
502 			*flagp |= HASWIDTH|SIMPLE;
503 		}
504 		break;
505 	case '(':
506 		ret = reg(1, &flags);
507 		if (ret == NULL)
508 			return(NULL);
509 		*flagp |= flags&(HASWIDTH|SPSTART);
510 		break;
511 	case '\0':
512 	case '|':
513 	case ')':
514 		FAIL("internal urp");	/* Supposed to be caught earlier. */
515 		break;
516 	case '?':
517 	case '+':
518 	case '*':
519 		FAIL("?+* follows nothing");
520 		break;
521 	case '\\':
522 		if (*regparse == '\0')
523 			FAIL("trailing \\");
524 		ret = regnode(EXACTLY);
525 		regc(*regparse++);
526 		regc('\0');
527 		*flagp |= HASWIDTH|SIMPLE;
528 		break;
529 	default: {
530 			register int len;
531 			register char ender;
532 
533 			regparse--;
534 			len = strcspn(regparse, META);
535 			if (len <= 0)
536 				FAIL("internal disaster");
537 			ender = *(regparse+len);
538 			if (len > 1 && ISMULT(ender))
539 				len--;		/* Back off clear of ?+* operand. */
540 			*flagp |= HASWIDTH;
541 			if (len == 1)
542 				*flagp |= SIMPLE;
543 			ret = regnode(EXACTLY);
544 			while (len > 0) {
545 				regc(*regparse++);
546 				len--;
547 			}
548 			regc('\0');
549 		}
550 		break;
551 	}
552 
553 	return(ret);
554 }
555 
556 /*
557  - regnode - emit a node
558  */
regnode(char op)559 static char *regnode(char op)
560 {
561 	register char *ret;
562 	register char *ptr;
563 
564 	ret = regcode;
565 	if (ret == &regdummy) {
566 		regsize += 3;
567 		return(ret);
568 	}
569 
570 	ptr = ret;
571 	*ptr++ = op;
572 	*ptr++ = '\0';		/* Null "next" pointer. */
573 	*ptr++ = '\0';
574 	regcode = ptr;
575 
576 	return(ret);
577 }
578 
579 /*
580  - regc - emit (if appropriate) a byte of code
581  */
regc(char b)582 static void regc(char b)
583 {
584 	if (regcode != &regdummy)
585 		*regcode++ = b;
586 	else
587 		regsize++;
588 }
589 
590 /*
591  - reginsert - insert an operator in front of already-emitted operand
592  *
593  * Means relocating the operand.
594  */
reginsert(char op,char * opnd)595 static void reginsert(char op, char *opnd)
596 {
597 	register char *src;
598 	register char *dst;
599 	register char *place;
600 
601 	if (regcode == &regdummy) {
602 		regsize += 3;
603 		return;
604 	}
605 
606 	src = regcode;
607 	regcode += 3;
608 	dst = regcode;
609 	while (src > opnd)
610 		*--dst = *--src;
611 
612 	place = opnd;		/* Op node, where operand used to be. */
613 	*place++ = op;
614 	*place++ = '\0';
615 	*place++ = '\0';
616 }
617 
618 /*
619  - regtail - set the next-pointer at the end of a node chain
620  */
regtail(char * p,char * val)621 static void regtail(char *p, char *val)
622 {
623 	register char *scan;
624 	register char *temp;
625 	register int offset;
626 
627 	if (p == &regdummy)
628 		return;
629 
630 	/* Find last node. */
631 	scan = p;
632 	for (;;) {
633 		temp = regnext(scan);
634 		if (temp == NULL)
635 			break;
636 		scan = temp;
637 	}
638 
639 	if (OP(scan) == BACK)
640 		offset = scan - val;
641 	else
642 		offset = val - scan;
643 	*(scan+1) = (offset>>8)&0377;
644 	*(scan+2) = offset&0377;
645 }
646 
647 /*
648  - regoptail - regtail on operand of first argument; nop if operandless
649  */
regoptail(char * p,char * val)650 static void regoptail(char *p, char *val)
651 {
652 	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
653 	if (p == NULL || p == &regdummy || OP(p) != BRANCH)
654 		return;
655 	regtail(OPERAND(p), val);
656 }
657 
658 /*
659  * reg_execute and friends
660  */
661 
662 /*
663  * Global work variables for reg_execute().
664  */
665 static char *reginput;		/* String-input pointer. */
666 static char *regbol;		/* Beginning of input, for ^ check. */
667 static char **regstartp;	/* Pointer to startp array. */
668 static char **regendp;		/* Ditto for endp. */
669 
670 /*
671  * Forwards.
672  */
673 STATIC int regtry(regexpr *, char *);
674 STATIC int regmatch(char *);
675 STATIC int regrepeat(char *);
676 
677 #ifdef DEBUG
678 int regnarrate = 0;
679 void regdump(regexpr *);
680 STATIC char *regprop(char *);
681 #endif
682 
683 /*
684  - reg_execute - match a regexp against a string
685  */
reg_execute(register regexpr * prog,register char * string)686 int reg_execute(register regexpr *prog, register char *string)
687 {
688 	register char *s;
689 	extern char *strchr();
690 
691 	/* Be paranoid... */
692 	if (prog == NULL || string == NULL) {
693 		reg_error("NULL parameter");
694 		return(-1);
695 	}
696 
697 	/* Check validity of program. */
698 	if (UCHARAT(prog->program) != MAGIC) {
699 		reg_error("corrupted program");
700 		return(-1);
701 	}
702 
703 	/* If there is a "must appear" string, look for it. */
704 	if (prog->regmust != NULL) {
705 		s = string;
706 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
707 			if (strncmp(s, prog->regmust, prog->regmlen) == 0)
708 				break;	/* Found it. */
709 			s++;
710 		}
711 		if (s == NULL)	/* Not present. */
712 			return(0);
713 	}
714 
715 	/* Mark beginning of line for ^ . */
716 	regbol = string;
717 
718 	/* Simplest case:  anchored match need be tried only once. */
719 	if (prog->reganch)
720 		return(regtry(prog, string));
721 
722 	/* Messy cases:  unanchored match. */
723 	s = string;
724 	if (prog->regstart != '\0')
725 		/* We know what char it must start with. */
726 		while ((s = strchr(s, prog->regstart)) != NULL) {
727 			if (regtry(prog, s))
728 				return(1);
729 			s++;
730 		}
731 	else
732 		/* We don't -- general case. */
733 		do {
734 			if (regtry(prog, s))
735 				return(1);
736 		} while (*s++ != '\0');
737 
738 	/* Failure. */
739 	return(0);
740 }
741 
742 /*
743  - regtry - try match at specific point
744  */
regtry(regexpr * prog,char * string)745 static int regtry(regexpr *prog, char *string)
746 {
747 	register int i;
748 	register char **sp;
749 	register char **ep;
750 
751 	reginput = string;
752 	regstartp = prog->startp;
753 	regendp = prog->endp;
754 
755 	sp = prog->startp;
756 	ep = prog->endp;
757 	for (i = NSUBEXP; i > 0; i--) {
758 		*sp++ = NULL;
759 		*ep++ = NULL;
760 	}
761 	if (regmatch(prog->program + 1)) {
762 		prog->startp[0] = string;
763 		prog->endp[0] = reginput;
764 		return(1);
765 	} else
766 		return(0);
767 }
768 
769 /*
770  - regmatch - main matching routine
771  *
772  * Conceptually the strategy is simple:  check to see whether the current
773  * node matches, call self recursively to see whether the rest matches,
774  * and then act accordingly.  In practice we make some effort to avoid
775  * recursion, in particular by going through "ordinary" nodes (that don't
776  * need to know whether the rest of the match failed) by a loop instead of
777  * by recursion.
778  */
regmatch(char * prog)779 static int regmatch(char *prog)
780 {
781 	register char *scan;	/* Current node. */
782 	char *next;		/* Next node. */
783 	extern char *strchr();
784 
785 	scan = prog;
786 #ifdef DEBUG
787 	if (scan != NULL && regnarrate)
788 		fprintf(stderr, "%s(\n", regprop(scan));
789 #endif
790 	while (scan != NULL) {
791 #ifdef DEBUG
792 		if (regnarrate)
793 			fprintf(stderr, "%s...\n", regprop(scan));
794 #endif
795 		next = regnext(scan);
796 
797 		switch (OP(scan)) {
798 		case BOL:
799 			if (reginput != regbol)
800 				return(0);
801 			break;
802 		case EOL:
803 			if (*reginput != '\0')
804 				return(0);
805 			break;
806 		case ANY:
807 			if (*reginput == '\0')
808 				return(0);
809 			reginput++;
810 			break;
811 		case EXACTLY: {
812 				register int len;
813 				register char *opnd;
814 
815 				opnd = OPERAND(scan);
816 				/* Inline the first character, for speed. */
817 				if (*opnd != *reginput)
818 					return(0);
819 				len = strlen(opnd);
820 				if (len > 1 && strncmp(opnd, reginput, len) != 0)
821 					return(0);
822 				reginput += len;
823 			}
824 			break;
825 		case ANYOF:
826                         if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
827 				return(0);
828 
829 			reginput++;
830 			break;
831 		case ANYBUT:
832                         if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
833 				return(0);
834 			reginput++;
835 			break;
836 		case NOTHING:
837 			break;
838 		case BACK:
839 			break;
840 		case OPEN+1:
841 		case OPEN+2:
842 		case OPEN+3:
843 		case OPEN+4:
844 		case OPEN+5:
845 		case OPEN+6:
846 		case OPEN+7:
847 		case OPEN+8:
848 		case OPEN+9: {
849 				register int no;
850 				register char *save;
851 
852 				no = OP(scan) - OPEN;
853 				save = reginput;
854 
855 				if (regmatch(next)) {
856 					/*
857 					 * Don't set startp if some later
858 					 * invocation of the same parentheses
859 					 * already has.
860 					 */
861 					if (regstartp[no] == NULL)
862 						regstartp[no] = save;
863 					return(1);
864 				} else
865 					return(0);
866 			}
867 			break;
868 		case CLOSE+1:
869 		case CLOSE+2:
870 		case CLOSE+3:
871 		case CLOSE+4:
872 		case CLOSE+5:
873 		case CLOSE+6:
874 		case CLOSE+7:
875 		case CLOSE+8:
876 		case CLOSE+9: {
877 				register int no;
878 				register char *save;
879 
880 				no = OP(scan) - CLOSE;
881 				save = reginput;
882 
883 				if (regmatch(next)) {
884 					/*
885 					 * Don't set endp if some later
886 					 * invocation of the same parentheses
887 					 * already has.
888 					 */
889 					if (regendp[no] == NULL)
890 						regendp[no] = save;
891 					return(1);
892 				} else
893 					return(0);
894 			}
895 			break;
896 		case BRANCH: {
897 				register char *save;
898 
899 				if (OP(next) != BRANCH)		/* No choice. */
900 					next = OPERAND(scan);	/* Avoid recursion. */
901 				else {
902 					do {
903 						save = reginput;
904 						if (regmatch(OPERAND(scan)))
905 							return(1);
906 						reginput = save;
907 						scan = regnext(scan);
908 					} while (scan != NULL && OP(scan) == BRANCH);
909 					return(0);
910 					/* NOTREACHED */
911 				}
912 			}
913 			break;
914 		case STAR:
915 		case PLUS: {
916 				register char nextch;
917 				register int no;
918 				register char *save;
919 				register int min;
920 
921 				/*
922 				 * Lookahead to avoid useless match attempts
923 				 * when we know what character comes next.
924 				 */
925 				nextch = '\0';
926 				if (OP(next) == EXACTLY)
927 					nextch = *OPERAND(next);
928 				min = (OP(scan) == STAR) ? 0 : 1;
929 				save = reginput;
930 				no = regrepeat(OPERAND(scan));
931 				while (no >= min) {
932 					/* If it could work, try it. */
933 					if (nextch == '\0' || *reginput == nextch)
934 						if (regmatch(next))
935 							return(1);
936 					/* Couldn't or didn't -- back up. */
937 					no--;
938 					reginput = save + no;
939 				}
940 				return(0);
941 			}
942 			break;
943 		case END:
944 			return(1);	/* Success! */
945 			break;
946 		default:
947 			reg_error("memory corruption");
948 			return(0);
949 			break;
950 		}
951 
952 		scan = next;
953 	}
954 
955 	/*
956 	 * We get here only if there's trouble -- normally "case END" is
957 	 * the terminating point.
958 	 */
959 	reg_error("corrupted pointers");
960 	return(0);
961 }
962 
963 /*
964  - regrepeat - repeatedly match something simple, report how many
965  */
regrepeat(char * p)966 static int regrepeat(char *p)
967 {
968 	register int count = 0;
969 	register char *scan;
970 	register char *opnd;
971         extern char *strchr();
972 
973 	scan = reginput;
974 	opnd = OPERAND(p);
975 	switch (OP(p)) {
976 	case ANY:
977 		count = strlen(scan);
978 		scan += count;
979 		break;
980 	case EXACTLY:
981 		while (*opnd == *scan) {
982 			count++;
983 			scan++;
984 		}
985 		break;
986 	case ANYOF:
987 		while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
988 			count++;
989 			scan++;
990 		}
991 		break;
992 	case ANYBUT:
993 		while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
994 			count++;
995 			scan++;
996 		}
997 		break;
998 	default:		/* Oh dear.  Called inappropriately. */
999 		reg_error("internal foulup");
1000 		count = 0;	/* Best compromise. */
1001 		break;
1002 	}
1003 	reginput = scan;
1004 
1005 	return(count);
1006 }
1007 
1008 /*
1009  - regnext - dig the "next" pointer out of a node
1010  */
regnext(register char * p)1011 static char * regnext(register char *p)
1012 {
1013 	register int offset;
1014 
1015 	if (p == &regdummy)
1016 		return(NULL);
1017 
1018 	offset = NEXT(p);
1019 	if (offset == 0)
1020 		return(NULL);
1021 
1022 	if (OP(p) == BACK)
1023 		return(p-offset);
1024 	else
1025 		return(p+offset);
1026 }
1027 
1028 /*
1029  - reg_substitute - perform substitutions after a regexp match
1030  */
reg_substitute(regexpr * prog,char * source,char * dest)1031 void reg_substitute(regexpr *prog, char *source, char *dest)
1032 {
1033 	register char *src;
1034 	register char *dst;
1035 	register char c;
1036 	register int no;
1037 	register int len;
1038 	extern char *strncpy();
1039 
1040 	if (prog == NULL || source == NULL || dest == NULL) {
1041 		reg_error("NULL parm to reg_substitute");
1042 		return;
1043 	}
1044 	if (UCHARAT(prog->program) != MAGIC) {
1045 		reg_error("damaged regexpr fed to reg_substitute");
1046 		return;
1047 	}
1048 
1049 	src = source;
1050 	dst = dest;
1051 	while ((c = *src++) != '\0') {
1052 		if (c == '&')
1053 			no = 0;
1054 		else if (c == '\\' && '0' <= *src && *src <= '9')
1055 			no = *src++ - '0';
1056 		else
1057 			no = -1;
1058 
1059 		if (no < 0) {	/* Ordinary character. */
1060                         if (c == '\\' && (*src == '\\' || *src == '&'))
1061                                c = *src++;
1062 			*dst++ = c;
1063 		} else if (prog->startp[no] != NULL && prog->endp[no] != NULL) {
1064 			len = prog->endp[no] - prog->startp[no];
1065 			(void) strncpy(dst, prog->startp[no], len);
1066 			dst += len;
1067                         if (len != 0 && *(dst-1) == '\0') {
1068 				reg_error("damaged match string");
1069 				return;
1070 			}
1071 		}
1072 	}
1073 	*dst++ = '\0';
1074 }
1075 
reg_error(char * s)1076 void reg_error(char *s)
1077 {
1078         return;
1079 }
1080 
1081 #ifdef DEBUG
1082 
1083 STATIC char *regprop(char *);
1084 
1085 /*
1086  - regdump - dump a regexpr onto stdout in vaguely comprehensible form
1087  */
regdump(regexpr * r)1088 void regdump(regexpr *r)
1089 {
1090 	register char *s;
1091 	register char op = EXACTLY;	/* Arbitrary non-END op. */
1092 	register char *next;
1093 	extern char *strchr();
1094 
1095 
1096 	s = r->program + 1;
1097 	while (op != END) {	/* While that wasn't END last time... */
1098 		op = OP(s);
1099 		printf("%2d%s", s-r->program, regprop(s));	/* Where, what. */
1100 		next = regnext(s);
1101 		if (next == NULL)		/* Next ptr. */
1102 			printf("(0)");
1103 		else
1104 			printf("(%d)", (s-r->program)+(next-s));
1105 		s += 3;
1106 		if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1107 			/* Literal string, where present. */
1108 			while (*s != '\0') {
1109 				putchar(*s);
1110 				s++;
1111 			}
1112 			s++;
1113 		}
1114 		putchar('\n');
1115 	}
1116 
1117 	/* Header fields of interest. */
1118 	if (r->regstart != '\0')
1119 		printf("start `%c' ", r->regstart);
1120 	if (r->reganch)
1121 		printf("anchored ");
1122 	if (r->regmust != NULL)
1123 		printf("must have \"%s\"", r->regmust);
1124 	printf("\n");
1125 }
1126 
1127 /*
1128  - regprop - printable representation of opcode
1129  */
regprop(char * op)1130 static char *regprop(char *op)
1131 {
1132 	register char *p;
1133 	static char buf[50];
1134 
1135 	(void) strcpy(buf, ":");
1136 
1137 	switch (OP(op)) {
1138 	case BOL:
1139 		p = "BOL";
1140 		break;
1141 	case EOL:
1142 		p = "EOL";
1143 		break;
1144 	case ANY:
1145 		p = "ANY";
1146 		break;
1147 	case ANYOF:
1148 		p = "ANYOF";
1149 		break;
1150 	case ANYBUT:
1151 		p = "ANYBUT";
1152 		break;
1153 	case BRANCH:
1154 		p = "BRANCH";
1155 		break;
1156 	case EXACTLY:
1157 		p = "EXACTLY";
1158 		break;
1159 	case NOTHING:
1160 		p = "NOTHING";
1161 		break;
1162 	case BACK:
1163 		p = "BACK";
1164 		break;
1165 	case END:
1166 		p = "END";
1167 		break;
1168 	case OPEN+1:
1169 	case OPEN+2:
1170 	case OPEN+3:
1171 	case OPEN+4:
1172 	case OPEN+5:
1173 	case OPEN+6:
1174 	case OPEN+7:
1175 	case OPEN+8:
1176 	case OPEN+9:
1177 		sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1178 		p = NULL;
1179 		break;
1180 	case CLOSE+1:
1181 	case CLOSE+2:
1182 	case CLOSE+3:
1183 	case CLOSE+4:
1184 	case CLOSE+5:
1185 	case CLOSE+6:
1186 	case CLOSE+7:
1187 	case CLOSE+8:
1188 	case CLOSE+9:
1189 		sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1190 		p = NULL;
1191 		break;
1192 	case STAR:
1193 		p = "STAR";
1194 		break;
1195 	case PLUS:
1196 		p = "PLUS";
1197 		break;
1198 	default:
1199 		reg_error("corrupted opcode");
1200 		break;
1201 	}
1202 	if (p != NULL)
1203 		(void) strcat(buf, p);
1204 	return(buf);
1205 }
1206 #endif
1207 
1208 /*
1209  * The following is provided for those people who do not have strcspn() in
1210  * their C libraries.  They should get off their butts and do something
1211  * about it; at least one public-domain implementation of those (highly
1212  * useful) string routines has been published on Usenet.
1213  */
1214 #ifdef STRCSPN
1215 /*
1216  * strcspn - find length of initial segment of s1 consisting entirely
1217  * of characters not from s2
1218  */
1219 
strcspn(char * s1,char * s2)1220 static int strcspn(char *s1, char *s2)
1221 {
1222 	register char *scan1;
1223 	register char *scan2;
1224 	register int count;
1225 
1226 	count = 0;
1227 	for (scan1 = s1; *scan1 != '\0'; scan1++) {
1228 		for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
1229 			if (*scan1 == *scan2++)
1230 				return(count);
1231 		count++;
1232 	}
1233 	return(count);
1234 }
1235 #endif
1236