1 /*
2  * @(#)regexp.c	1.3 of 18 April 87
3  *
4  *	Copyright (c) 1986 by University of Toronto.
5  *	Written by Henry Spencer.  Not derived from licensed software.
6  *
7  *	Permission is granted to anyone to use this software for any
8  *	purpose on any computer system, and to redistribute it freely,
9  *	subject to the following restrictions:
10  *
11  *	1. The author is not responsible for the consequences of use of
12  *		this software, no matter how awful, even if they arise
13  *		from defects in it.
14  *
15  *	2. The origin of this software must not be misrepresented, either
16  *		by explicit claim or by omission.
17  *
18  *	3. Altered versions must be plainly marked as such, and must not
19  *		be misrepresented as being the original software.
20  *
21  * Beware that some of this code is subtly aware of the way operator
22  * precedence is structured in regular expressions.  Serious changes in
23  * regular-expression syntax might require a total rethink.
24  */
25 
26 /*
27  * NOTICE: THIS CODE HAS BEEN MODIFIED TO FIT THE NN ENVIRONMENT.
28  *
29  * modified by Kim Fabricius Storm
30  * modified by Michael T. Pins
31  */
32 
33 #include <stdio.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <ctype.h>
37 #include "config.h"
38 #include "global.h"
39 #include "regexp.h"
40 #include "nn_term.h"
41 
42 /*
43  * The "internal use only" fields in regexp.h are present to pass info from
44  * compile to execute that permits the execute phase to run lots faster on
45  * simple cases.  They are:
46  *
47  * regstart	char that must begin a match; '\0' if none obvious
48  * reganch	is the match anchored (at beginning-of-line only)?
49  * regmust	string (pointer into program) that match must include, or NULL
50  * regmlen	length of regmust string
51  *
52  * Regstart and reganch permit very fast decisions on suitable starting points
53  * for a match, cutting down the work a lot.  Regmust permits fast rejection
54  * of lines that cannot possibly match.  The regmust tests are costly enough
55  * that regcomp() supplies a regmust only if the r.e. contains something
56  * potentially expensive (at present, the only such thing detected is * or +
57  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
58  * supplied because the test in regexec() needs it and regcomp() is computing
59  * it anyway.
60  */
61 
62 /*
63  * Structure for regexp "program".  This is essentially a linear encoding
64  * of a nondeterministic finite-state machine (aka syntax charts or
65  * "railroad normal form" in parsing technology).  Each node is an opcode
66  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
67  * all nodes except BRANCH implement concatenation; a "next" pointer with
68  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
69  * have one of the subtle syntax dependencies:  an individual BRANCH (as
70  * opposed to a collection of them) is never concatenated with anything
71  * because of operator precedence.)  The operand of some types of node is
72  * a literal string; for others, it is a node leading into a sub-FSM.  In
73  * particular, the operand of a BRANCH node is the first node of the branch.
74  * (NB this is *not* a tree structure:  the tail of the branch connects
75  * to the thing following the set of BRANCHes.)  The opcodes are:
76  */
77 
78 /* definition	number	opnd?	meaning */
79 #define	END	0		/* no	End of program. */
80 #define	BOL	1		/* no	Match "" at beginning of line. */
81 #define	EOL	2		/* no	Match "" at end of line. */
82 #define	ANY	3		/* no	Match any one character. */
83 #define	ANYOF	4		/* str	Match any character in this string. */
84 #define	ANYBUT	5		/* str	Match any character not in this
85 				 * string. */
86 #define	BRANCH	6		/* node	Match this alternative, or the
87 				 * next... */
88 #define	BACK	7		/* no	Match "", "next" ptr points backward. */
89 #define	EXACTLY	8		/* str	Match this string. */
90 #define	NOTHING	9		/* no	Match empty string. */
91 #define	STAR	10		/* node	Match this (simple) thing 0 or more
92 				 * times. */
93 #define	PLUS	11		/* node	Match this (simple) thing 1 or more
94 				 * times. */
95 #define W_START 12		/* no	Match beginning of word */
96 #define W_END	13		/* no	Match end of word */
97 #define	OPEN	20		/* no	Mark this point in input as start of
98 				 * #n. */
99  /* OPEN+1 is number 1, etc. */
100 #define	CLOSE	30		/* no	Analogous to OPEN. */
101 
102 /*
103  * Opcode notes:
104  *
105  * BRANCH	The set of branches constituting a single choice are hooked
106  *		together with their "next" pointers, since precedence prevents
107  *		anything being concatenated to any individual branch.  The
108  *		"next" pointer of the last BRANCH in a choice points to the
109  *		thing following the whole choice.  This is also where the
110  *		final "next" pointer of each individual branch points; each
111  *		branch starts with the operand node of a BRANCH node.
112  *
113  * BACK		Normal "next" pointers all implicitly point forward; BACK
114  *		exists to make loop structures possible.
115  *
116  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
117  *		BRANCH structures using BACK.  Simple cases (one character
118  *		per match) are implemented with STAR and PLUS for speed
119  *		and to minimize recursive plunges.
120  *
121  * OPEN,CLOSE	...are numbered at compile time.
122  */
123 
124 /*
125  * A node is one char of opcode followed by two chars of "next" pointer.
126  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
127  * value is a positive offset from the opcode of the node containing it.
128  * An operand, if any, simply follows the node.  (Note that much of the
129  * code generation knows about this implicit relationship.)
130  *
131  * Using two bytes for the "next" pointer is vast overkill for most things,
132  * but allows patterns to get big without disasters.
133  */
134 #define	OP(p)	(*(p))
135 #define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
136 #define	OPERAND(p)	((p) + 3)
137 
138 
139 /*
140  * Utility definitions.
141  */
142 
143 #ifndef CHARBITS
144 #define	UCHARAT(p)	((int)*(unsigned char *)(p))
145 #else
146 #define	UCHARAT(p)	((int)*(p)&CHARBITS)
147 #endif
148 
149 #define	FAIL(m)	{ regerror(m); return(NULL); }
150 #define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
151 #define	META	"^$.[()|?+*\\"
152 
153 /*
154  * Flags to be passed up and down.
155  */
156 #define	HASWIDTH	01	/* Known never to match null string. */
157 #define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
158 #define	SPSTART		04	/* Starts with * or +. */
159 #define	WORST		0	/* Worst case. */
160 
161 /*
162  * Global work variables for regcomp().
163  */
164 static char    *regparse;	/* Input-scan pointer. */
165 static int      regnpar;	/* () count. */
166 static char     regdummy;
167 static char    *regcode;	/* Code-emit pointer; &regdummy = don't. */
168 static long     regsize;	/* Code size. */
169 
170 /*
171  * Forward declarations for regcomp()'s friends.
172  */
173 
174 #ifndef STATIC
175 #define	STATIC	static
176 #endif
177 
178 STATIC char    *reg(int, int *);
179 STATIC char    *regbranch(int *);
180 STATIC char    *regpiece(int *);
181 STATIC char    *regatom(int *);
182 STATIC char    *regnode(char);
183 STATIC char    *regnext(register char *);
184 STATIC void     regc(char);
185 STATIC void     reginsert(char, char *);
186 STATIC void     regtail(char *, char *);
187 STATIC void     regoptail(char *, char *);
188 STATIC void     regerror(char *);
189 
190 #ifdef STRCSPN
191 STATIC int      strcspn(char *, char *);
192 #endif
193 
194 /*
195  - regcomp - compile a regular expression into internal code
196  *
197  * We can't allocate space until we know how big the compiled form will be,
198  * but we can't compile it (and thus know how big it is) until we've got a
199  * place to put the code.  So we cheat:  we compile it twice, once with code
200  * generation turned off and size counting turned on, and once "for real".
201  * This also means that we don't allocate space until we are sure that the
202  * thing really will compile successfully, and we never have to move the
203  * code and thus invalidate pointers into it.  (Note that it has to be in
204  * one piece because free() must be able to free it all.)
205  *
206  * Beware that the optimization-preparation code in here knows about some
207  * of the structure of the compiled regexp.
208  */
209 regexp         *
regcomp(char * exp)210 regcomp(char *exp)
211 {
212     register regexp *r;
213     register char  *scan;
214     register char  *longest;
215     register int    len;
216     int             flags;
217 
218     if (exp == NULL)
219 	FAIL("NULL argument");
220 
221     /* First pass: determine size, legality. */
222     regparse = exp;
223     regnpar = 1;
224     regsize = 0L;
225     regcode = &regdummy;
226     regc((char) MAGIC);
227     if (reg(0, &flags) == NULL)
228 	return (NULL);
229 
230     /* Small enough for pointer-storage convention? */
231     if (regsize >= 32767L)	/* Probably could be 65535L. */
232 	FAIL("regexp too big");
233 
234     /* Allocate space. */
235     r = (regexp *) malloc(sizeof(regexp) + (unsigned) regsize);
236     if (r == NULL)
237 	FAIL("out of space");
238 
239     /* Second pass: emit code. */
240     regparse = exp;
241     regnpar = 1;
242     regcode = r->program;
243     regc((char) MAGIC);
244     if (reg(0, &flags) == NULL)
245 	return (NULL);
246 
247     /* Dig out information for optimizations. */
248     r->regstart = '\0';		/* Worst-case defaults. */
249     r->reganch = 0;
250     r->regmust = NULL;
251     r->regmlen = 0;
252     scan = r->program + 1;	/* First BRANCH. */
253     if (OP(regnext(scan)) == END) {	/* Only one top-level choice. */
254 	scan = OPERAND(scan);
255 
256 	/* Starting-point info. */
257 	if (OP(scan) == EXACTLY)
258 	    r->regstart = *OPERAND(scan);
259 	else if (OP(scan) == BOL)
260 	    r->reganch++;
261 
262 	/*
263 	 * If there's something expensive in the r.e., find the longest
264 	 * literal string that must appear and make it the regmust.  Resolve
265 	 * ties in favor of later strings, since the regstart check works
266 	 * with the beginning of the r.e. and avoiding duplication
267 	 * strengthens checking.  Not a strong reason, but sufficient in the
268 	 * absence of others.
269 	 */
270 	if (flags & SPSTART) {
271 	    longest = NULL;
272 	    len = 0;
273 	    for (; scan != NULL; scan = regnext(scan))
274 		if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
275 		    longest = OPERAND(scan);
276 		    len = strlen(OPERAND(scan));
277 		}
278 	    r->regmust = longest;
279 	    r->regmlen = len;
280 	}
281     }
282     return (r);
283 }
284 
285 /*
286  - reg - regular expression, i.e. main body or parenthesized thing
287  *
288  * Caller must absorb opening parenthesis.
289  *
290  * Combining parenthesis handling with the base level of regular expression
291  * is a trifle forced, but the need to tie the tails of the branches to what
292  * follows makes it hard to avoid.
293  *
294  * paren: Parenthesized?
295  */
296 static char    *
reg(int paren,int * flagp)297 reg(int paren, int *flagp)
298 {
299     register char  *ret;
300     register char  *br;
301     register char  *ender;
302     register int    parno = 0;
303     int             flags;
304 
305     *flagp = HASWIDTH;		/* Tentatively. */
306 
307     /* Make an OPEN node, if parenthesized. */
308     if (paren) {
309 	if (regnpar >= NSUBEXP)
310 	    FAIL("too many ()");
311 	parno = regnpar;
312 	regnpar++;
313 	ret = regnode(OPEN + parno);
314     } else
315 	ret = NULL;
316 
317     /* Pick up the branches, linking them together. */
318     br = regbranch(&flags);
319     if (br == NULL)
320 	return (NULL);
321     if (ret != NULL)
322 	regtail(ret, br);	/* OPEN -> first. */
323     else
324 	ret = br;
325     if (!(flags & HASWIDTH))
326 	*flagp &= ~HASWIDTH;
327     *flagp |= flags & SPSTART;
328     while (*regparse == '|') {
329 	regparse++;
330 	br = regbranch(&flags);
331 	if (br == NULL)
332 	    return (NULL);
333 	regtail(ret, br);	/* BRANCH -> BRANCH. */
334 	if (!(flags & HASWIDTH))
335 	    *flagp &= ~HASWIDTH;
336 	*flagp |= flags & SPSTART;
337     }
338 
339     /* Make a closing node, and hook it on the end. */
340     ender = regnode((paren) ? CLOSE + parno : END);
341     regtail(ret, ender);
342 
343     /* Hook the tails of the branches to the closing node. */
344     for (br = ret; br != NULL; br = regnext(br))
345 	regoptail(br, ender);
346 
347     /* Check for proper termination. */
348     if (paren && *regparse++ != ')') {
349 	FAIL("unmatched ()");
350     } else if (!paren && *regparse != '\0') {
351 	if (*regparse == ')') {
352 	    FAIL("unmatched ()");
353 	} else
354 	    FAIL("junk on end");/* "Can't happen". */
355 	/* NOTREACHED */
356     }
357     return (ret);
358 }
359 
360 /*
361  - regbranch - one alternative of an | operator
362  *
363  * Implements the concatenation operator.
364  */
365 static char    *
regbranch(int * flagp)366 regbranch(int *flagp)
367 {
368     register char  *ret;
369     register char  *chain;
370     register char  *latest;
371     int             flags;
372 
373     *flagp = WORST;		/* Tentatively. */
374 
375     ret = regnode(BRANCH);
376     chain = NULL;
377     while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
378 	latest = regpiece(&flags);
379 	if (latest == NULL)
380 	    return (NULL);
381 	*flagp |= flags & HASWIDTH;
382 	if (chain == NULL)	/* First piece. */
383 	    *flagp |= flags & SPSTART;
384 	else
385 	    regtail(chain, latest);
386 	chain = latest;
387     }
388     if (chain == NULL)		/* Loop ran zero times. */
389 	(void) regnode(NOTHING);
390 
391     return (ret);
392 }
393 
394 /*
395  - regpiece - something followed by possible [*+?]
396  *
397  * Note that the branching code sequences used for ? and the general cases
398  * of * and + are somewhat optimized:  they use the same NOTHING node as
399  * both the endmarker for their branch list and the body of the last branch.
400  * It might seem that this node could be dispensed with entirely, but the
401  * endmarker role is not redundant.
402  */
403 static char    *
regpiece(int * flagp)404 regpiece(int *flagp)
405 {
406     register char  *ret;
407     register char   op;
408     register char  *next;
409     int             flags;
410 
411     ret = regatom(&flags);
412     if (ret == NULL)
413 	return (NULL);
414 
415     op = *regparse;
416     if (!ISMULT(op)) {
417 	*flagp = flags;
418 	return (ret);
419     }
420     if (!(flags & HASWIDTH) && op != '?')
421 	FAIL("*+ operand could be empty");
422     *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
423 
424     if (op == '*' && (flags & SIMPLE))
425 	reginsert(STAR, ret);
426     else if (op == '*') {
427 	/* Emit x* as (x&|), where & means "self". */
428 	reginsert(BRANCH, ret);	/* Either x */
429 	regoptail(ret, regnode(BACK));	/* and loop */
430 	regoptail(ret, ret);	/* back */
431 	regtail(ret, regnode(BRANCH));	/* or */
432 	regtail(ret, regnode(NOTHING));	/* null. */
433     } else if (op == '+' && (flags & SIMPLE))
434 	reginsert(PLUS, ret);
435     else if (op == '+') {
436 	/* Emit x+ as x(&|), where & means "self". */
437 	next = regnode(BRANCH);	/* Either */
438 	regtail(ret, next);
439 	regtail(regnode(BACK), ret);	/* loop back */
440 	regtail(next, regnode(BRANCH));	/* or */
441 	regtail(ret, regnode(NOTHING));	/* null. */
442     } else if (op == '?') {
443 	/* Emit x? as (x|) */
444 	reginsert(BRANCH, ret);	/* Either x */
445 	regtail(ret, regnode(BRANCH));	/* or */
446 	next = regnode(NOTHING);/* null. */
447 	regtail(ret, next);
448 	regoptail(ret, next);
449     }
450     regparse++;
451     if (ISMULT(*regparse))
452 	FAIL("nested *?+");
453 
454     return (ret);
455 }
456 
457 /*
458  - regatom - the lowest level
459  *
460  * Optimization:  gobbles an entire sequence of ordinary characters so that
461  * it can turn them into a single node, which is smaller to store and
462  * faster to run.  Backslashed characters are exceptions, each becoming a
463  * separate node; the code is simpler that way and it's not worth fixing.
464  */
465 static char    *
regatom(int * flagp)466 regatom(int *flagp)
467 {
468     register char  *ret;
469     int             flags;
470 
471     *flagp = WORST;		/* Tentatively. */
472 
473     switch (*regparse++) {
474 	case '^':
475 	    ret = regnode(BOL);
476 	    break;
477 	case '$':
478 	    ret = regnode(EOL);
479 	    break;
480 	case '.':
481 	    ret = regnode(ANY);
482 	    *flagp |= HASWIDTH | SIMPLE;
483 	    break;
484 	case '[':{
485 		register int    class;
486 		register int    classend;
487 
488 		if (*regparse == '^') {	/* Complement of range. */
489 		    ret = regnode(ANYBUT);
490 		    regparse++;
491 		} else
492 		    ret = regnode(ANYOF);
493 		if (*regparse == ']' || *regparse == '-')
494 		    regc(*regparse++);
495 		while (*regparse != '\0' && *regparse != ']') {
496 		    if (*regparse == '-') {
497 			regparse++;
498 			if (*regparse == ']' || *regparse == '\0')
499 			    regc('-');
500 			else {
501 			    class = UCHARAT(regparse - 2) + 1;
502 			    classend = UCHARAT(regparse);
503 			    if (class > classend + 1)
504 				FAIL("invalid [] range");
505 			    for (; class <= classend; class++)
506 				regc(class);
507 			    regparse++;
508 			}
509 		    } else
510 			regc(*regparse++);
511 		}
512 		regc('\0');
513 		if (*regparse != ']')
514 		    FAIL("unmatched []");
515 		regparse++;
516 		*flagp |= HASWIDTH | SIMPLE;
517 	    }
518 	    break;
519 	case '(':
520 	    ret = reg(1, &flags);
521 	    if (ret == NULL)
522 		return (NULL);
523 	    *flagp |= flags & (HASWIDTH | SPSTART);
524 	    break;
525 	case '\0':
526 	case '|':
527 	case ')':
528 	    FAIL("internal urp");	/* Supposed to be caught earlier. */
529 	    break;
530 	case '?':
531 	case '+':
532 	case '*':
533 	    FAIL("?+* follows nothing");
534 	    break;
535 	case '\\':
536 	    switch (*regparse++) {
537 		case '\0':
538 		    FAIL("trailing \\");
539 		    break;	/* for lint, we can't actually get here */
540 		case '<':
541 		    ret = regnode(W_START);
542 		    break;
543 		case '>':
544 		    ret = regnode(W_END);
545 		    break;
546 		default:
547 		    ret = regnode(EXACTLY);
548 		    regc(*regparse);
549 		    regc('\0');
550 		    *flagp |= HASWIDTH | SIMPLE;
551 		    break;
552 	    }
553 	    break;
554 	default:{
555 		register int    len;
556 		register char   ender;
557 
558 		regparse--;
559 		len = strcspn(regparse, META);
560 		if (len <= 0)
561 		    FAIL("internal disaster");
562 		ender = *(regparse + len);
563 		if (len > 1 && ISMULT(ender))
564 		    len--;	/* Back off clear of ?+* operand. */
565 		*flagp |= HASWIDTH;
566 		if (len == 1)
567 		    *flagp |= SIMPLE;
568 		ret = regnode(EXACTLY);
569 		while (len > 0) {
570 		    regc(*regparse++);
571 		    len--;
572 		}
573 		regc('\0');
574 	    }
575 	    break;
576     }
577 
578     return (ret);
579 }
580 
581 /*
582  - regnode - emit a node
583  */
584 static char    *		/* Location. */
regnode(char op)585 regnode(char op)
586 {
587     register char  *ret;
588     register char  *ptr;
589 
590     ret = regcode;
591     if (ret == &regdummy) {
592 	regsize += 3;
593 	return (ret);
594     }
595     ptr = ret;
596     *ptr++ = op;
597     *ptr++ = '\0';		/* Null "next" pointer. */
598     *ptr++ = '\0';
599     regcode = ptr;
600 
601     return (ret);
602 }
603 
604 /*
605  - regc - emit (if appropriate) a byte of code
606  */
607 static void
regc(char b)608 regc(char b)
609 {
610     if (regcode != &regdummy)
611 	*regcode++ = b;
612     else
613 	regsize++;
614 }
615 
616 /*
617  - reginsert - insert an operator in front of already-emitted operand
618  *
619  * Means relocating the operand.
620  */
621 static void
reginsert(char op,char * opnd)622 reginsert(char op, char *opnd)
623 {
624     register char  *src;
625     register char  *dst;
626     register char  *place;
627 
628     if (regcode == &regdummy) {
629 	regsize += 3;
630 	return;
631     }
632     src = regcode;
633     regcode += 3;
634     dst = regcode;
635     while (src > opnd)
636 	*--dst = *--src;
637 
638     place = opnd;		/* Op node, where operand used to be. */
639     *place++ = op;
640     *place++ = '\0';
641     *place++ = '\0';
642 }
643 
644 /*
645  - regtail - set the next-pointer at the end of a node chain
646  */
647 static void
regtail(char * p,char * val)648 regtail(char *p, char *val)
649 {
650     register char  *scan;
651     register char  *temp;
652     register int    offset;
653 
654     if (p == &regdummy)
655 	return;
656 
657     /* Find last node. */
658     scan = p;
659     for (;;) {
660 	temp = regnext(scan);
661 	if (temp == NULL)
662 	    break;
663 	scan = temp;
664     }
665 
666     if (OP(scan) == BACK)
667 	offset = scan - val;
668     else
669 	offset = val - scan;
670     *(scan + 1) = (offset >> 8) & 0377;
671     *(scan + 2) = offset & 0377;
672 }
673 
674 /*
675  - regoptail - regtail on operand of first argument; nop if operandless
676  */
677 static void
regoptail(char * p,char * val)678 regoptail(char *p, char *val)
679 {
680     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
681     if (p == NULL || p == &regdummy || OP(p) != BRANCH)
682 	return;
683     regtail(OPERAND(p), val);
684 }
685 
686 /*
687  * regexec and friends
688  */
689 
690 /*
691  * Global work variables for regexec().
692  */
693 static char    *reginput;	/* String-input pointer. */
694 static char    *regbol;		/* Beginning of input, for ^ check. */
695 static char   **regstartp;	/* Pointer to startp array. */
696 static char   **regendp;	/* Ditto for endp. */
697 
698 /*
699  * Forwards.
700  */
701 STATIC int      regtry(regexp *, char *);
702 STATIC int      regmatch(char *);
703 STATIC int      regrepeat(char *);
704 
705 #ifdef DEBUG
706 int             regnarrate = 0;
707 void            regdump(regexp *);
708 STATIC char    *regprop(char *);
709 #endif
710 
711 /*
712  - regexec - match a regexp against a string
713  */
714 int
regexec(register regexp * prog,register char * string)715 regexec(register regexp * prog, register char *string)
716 {
717     register char  *s;
718 
719     /* Be paranoid... */
720     if (prog == NULL || string == NULL) {
721 	regerror("NULL parameter");
722 	return (0);
723     }
724     /* Check validity of program. */
725     if (UCHARAT(prog->program) != MAGIC) {
726 	regerror("corrupted program");
727 	return (0);
728     }
729     /* If there is a "must appear" string, look for it. */
730     if (prog->regmust != NULL) {
731 	s = string;
732 	while ((s = strchr(s, prog->regmust[0])) != NULL) {
733 	    if (strncmp(s, prog->regmust, prog->regmlen) == 0)
734 		break;		/* Found it. */
735 	    s++;
736 	}
737 	if (s == NULL)		/* Not present. */
738 	    return (0);
739     }
740     /* Mark beginning of line for ^ . */
741     regbol = string;
742 
743     /* Simplest case:  anchored match need be tried only once. */
744     if (prog->reganch)
745 	return (regtry(prog, string));
746 
747     /* Messy cases:  unanchored match. */
748     s = string;
749     if (prog->regstart != '\0')
750 	/* We know what char it must start with. */
751 	while ((s = strchr(s, prog->regstart)) != NULL) {
752 	    if (regtry(prog, s))
753 		return (1);
754 	    s++;
755 	}
756     else
757 	/* We don't -- general case. */
758 	do {
759 	    if (regtry(prog, s))
760 		return (1);
761 	} while (*s++ != '\0');
762 
763     /* Failure. */
764     return (0);
765 }
766 
767 /*
768  - regtry - try match at specific point
769  */
770 static int			/* 0 failure, 1 success */
regtry(regexp * prog,char * string)771 regtry(regexp * prog, char *string)
772 {
773     register int    i;
774     register char **sp;
775     register char **ep;
776 
777     reginput = string;
778     regstartp = prog->startp;
779     regendp = prog->endp;
780 
781     sp = prog->startp;
782     ep = prog->endp;
783     for (i = NSUBEXP; i > 0; i--) {
784 	*sp++ = NULL;
785 	*ep++ = NULL;
786     }
787     if (regmatch(prog->program + 1)) {
788 	prog->startp[0] = string;
789 	prog->endp[0] = reginput;
790 	return (1);
791     } else
792 	return (0);
793 }
794 
795 /*
796  - regmatch - main matching routine
797  *
798  * Conceptually the strategy is simple:  check to see whether the current
799  * node matches, call self recursively to see whether the rest matches,
800  * and then act accordingly.  In practice we make some effort to avoid
801  * recursion, in particular by going through "ordinary" nodes (that don't
802  * need to know whether the rest of the match failed) by a loop instead of
803  * by recursion.
804  */
805 static int			/* 0 failure, 1 success */
regmatch(char * prog)806 regmatch(char *prog)
807 {
808     register char  *scan;	/* Current node. */
809     char           *next;	/* Next node. */
810 
811     scan = prog;
812 
813 #ifdef DEBUG
814     if (scan != NULL && regnarrate)
815 	fprintf(stderr, "%s(\n", regprop(scan));
816 #endif
817 
818     while (scan != NULL) {
819 
820 #ifdef DEBUG
821 	if (regnarrate)
822 	    fprintf(stderr, "%s...\n", regprop(scan));
823 #endif
824 
825 	next = regnext(scan);
826 
827 	switch (OP(scan)) {
828 	    case BOL:
829 		if (reginput != regbol)
830 		    return (0);
831 		break;
832 	    case EOL:
833 		if (*reginput != '\0')
834 		    return (0);
835 		break;
836 	    case W_START:
837 		if (*reginput == '\0' || isspace(*reginput))
838 		    return (0);
839 		if (reginput != regbol && !isspace(*(reginput - 1)))
840 		    return (0);
841 		break;
842 	    case W_END:
843 		if (*reginput != '\0' && !isspace(*reginput))
844 		    return (0);
845 		if (reginput == regbol || isspace(*(reginput - 1)))
846 		    return (0);
847 		break;
848 	    case ANY:
849 		if (*reginput == '\0')
850 		    return (0);
851 		reginput++;
852 		break;
853 	    case EXACTLY:{
854 		    register int    len;
855 		    register char  *opnd;
856 
857 		    opnd = OPERAND(scan);
858 		    /* Inline the first character, for speed. */
859 		    if (*opnd != *reginput)
860 			return (0);
861 		    len = strlen(opnd);
862 		    if (len > 1 && strncmp(opnd, reginput, len) != 0)
863 			return (0);
864 		    reginput += len;
865 		}
866 		break;
867 	    case ANYOF:
868 		if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
869 		    return (0);
870 		reginput++;
871 		break;
872 	    case ANYBUT:
873 		if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
874 		    return (0);
875 		reginput++;
876 		break;
877 	    case NOTHING:
878 		break;
879 	    case BACK:
880 		break;
881 	    case OPEN + 1:
882 	    case OPEN + 2:
883 	    case OPEN + 3:
884 	    case OPEN + 4:
885 	    case OPEN + 5:
886 	    case OPEN + 6:
887 	    case OPEN + 7:
888 	    case OPEN + 8:
889 	    case OPEN + 9:{
890 		    register int    no;
891 		    register char  *save;
892 
893 		    no = OP(scan) - OPEN;
894 		    save = reginput;
895 
896 		    if (regmatch(next)) {
897 
898 			/*
899 			 * Don't set startp if some later invocation of the
900 			 * same parentheses already has.
901 			 */
902 			if (regstartp[no] == NULL)
903 			    regstartp[no] = save;
904 			return (1);
905 		    } else
906 			return (0);
907 		}
908 		break;
909 	    case CLOSE + 1:
910 	    case CLOSE + 2:
911 	    case CLOSE + 3:
912 	    case CLOSE + 4:
913 	    case CLOSE + 5:
914 	    case CLOSE + 6:
915 	    case CLOSE + 7:
916 	    case CLOSE + 8:
917 	    case CLOSE + 9:{
918 		    register int    no;
919 		    register char  *save;
920 
921 		    no = OP(scan) - CLOSE;
922 		    save = reginput;
923 
924 		    if (regmatch(next)) {
925 
926 			/*
927 			 * Don't set endp if some later invocation of the
928 			 * same parentheses already has.
929 			 */
930 			if (regendp[no] == NULL)
931 			    regendp[no] = save;
932 			return (1);
933 		    } else
934 			return (0);
935 		}
936 		break;
937 	    case BRANCH:{
938 		    register char  *save;
939 
940 		    if (OP(next) != BRANCH)	/* No choice. */
941 			next = OPERAND(scan);	/* Avoid recursion. */
942 		    else {
943 			do {
944 			    save = reginput;
945 			    if (regmatch(OPERAND(scan)))
946 				return (1);
947 			    reginput = save;
948 			    scan = regnext(scan);
949 			} while (scan != NULL && OP(scan) == BRANCH);
950 			return (0);
951 			/* NOTREACHED */
952 		    }
953 		}
954 		break;
955 	    case STAR:
956 	    case PLUS:{
957 		    register char   nextch;
958 		    register int    no;
959 		    register char  *save;
960 		    register int    min;
961 
962 		    /*
963 		     * Lookahead to avoid useless match attempts when we know
964 		     * what character comes next.
965 		     */
966 		    nextch = '\0';
967 		    if (OP(next) == EXACTLY)
968 			nextch = *OPERAND(next);
969 		    min = (OP(scan) == STAR) ? 0 : 1;
970 		    save = reginput;
971 		    no = regrepeat(OPERAND(scan));
972 		    while (no >= min) {
973 			/* If it could work, try it. */
974 			if (nextch == '\0' || *reginput == nextch)
975 			    if (regmatch(next))
976 				return (1);
977 			/* Couldn't or didn't -- back up. */
978 			no--;
979 			reginput = save + no;
980 		    }
981 		    return (0);
982 		}
983 		break;
984 	    case END:
985 		return (1);	/* Success! */
986 		break;
987 	    default:
988 		regerror("memory corruption");
989 		return (0);
990 		break;
991 	}
992 
993 	scan = next;
994     }
995 
996     /*
997      * We get here only if there's trouble -- normally "case END" is the
998      * terminating point.
999      */
1000     regerror("corrupted pointers");
1001     return (0);
1002 }
1003 
1004 /*
1005  - regrepeat - repeatedly match something simple, report how many
1006  */
1007 static int
regrepeat(char * p)1008 regrepeat(char *p)
1009 {
1010     register int    count = 0;
1011     register char  *scan;
1012     register char  *opnd;
1013 
1014     scan = reginput;
1015     opnd = OPERAND(p);
1016     switch (OP(p)) {
1017 	case ANY:
1018 	    count = strlen(scan);
1019 	    scan += count;
1020 	    break;
1021 	case EXACTLY:
1022 	    while (*opnd == *scan) {
1023 		count++;
1024 		scan++;
1025 	    }
1026 	    break;
1027 	case ANYOF:
1028 	    while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1029 		count++;
1030 		scan++;
1031 	    }
1032 	    break;
1033 	case ANYBUT:
1034 	    while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1035 		count++;
1036 		scan++;
1037 	    }
1038 	    break;
1039 	default:		/* Oh dear.  Called inappropriately. */
1040 	    regerror("internal foulup");
1041 	    count = 0;		/* Best compromise. */
1042 	    break;
1043     }
1044     reginput = scan;
1045 
1046     return (count);
1047 }
1048 
1049 /*
1050  - regnext - dig the "next" pointer out of a node
1051  */
1052 static char    *
regnext(register char * p)1053 regnext(register char *p)
1054 {
1055     register int    offset;
1056 
1057     if (p == &regdummy)
1058 	return (NULL);
1059 
1060     offset = NEXT(p);
1061     if (offset == 0)
1062 	return (NULL);
1063 
1064     if (OP(p) == BACK)
1065 	return (p - offset);
1066     else
1067 	return (p + offset);
1068 }
1069 
1070 #ifdef DEBUG
1071 STATIC char    *regprop(char *);
1072 
1073 /*
1074  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1075  */
1076 void
regdump(regexp * r)1077 regdump(regexp * r)
1078 {
1079     register char  *s;
1080     register char   op = EXACTLY;	/* Arbitrary non-END op. */
1081     register char  *next;
1082 
1083 
1084     s = r->program + 1;
1085     while (op != END) {		/* While that wasn't END last time... */
1086 	op = OP(s);
1087 	printf("%2d%s", s - r->program, regprop(s));	/* Where, what. */
1088 	next = regnext(s);
1089 	if (next == NULL)	/* Next ptr. */
1090 	    printf("(0)");
1091 	else
1092 	    printf("(%d)", (s - r->program) + (next - s));
1093 	s += 3;
1094 	if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1095 	    /* Literal string, where present. */
1096 	    while (*s != '\0') {
1097 		putchar(*s);
1098 		s++;
1099 	    }
1100 	    s++;
1101 	}
1102 	putchar('\n');
1103     }
1104 
1105     /* Header fields of interest. */
1106     if (r->regstart != '\0')
1107 	printf("start `%c' ", r->regstart);
1108     if (r->reganch)
1109 	printf("anchored ");
1110     if (r->regmust != NULL)
1111 	printf("must have \"%s\"", r->regmust);
1112     printf("\n");
1113 }
1114 
1115 /*
1116  - regprop - printable representation of opcode
1117  */
1118 static char    *
regprop(char * op)1119 regprop(char *op)
1120 {
1121     register char  *p;
1122     static char     buf[50];
1123 
1124     (void) strcpy(buf, ":");
1125 
1126     switch (OP(op)) {
1127 	case BOL:
1128 	    p = "BOL";
1129 	    break;
1130 	case EOL:
1131 	    p = "EOL";
1132 	    break;
1133 	case W_START:
1134 	    p = "WSTART";
1135 	    break;
1136 	case W_END:
1137 	    p = "WEND";
1138 	    break;
1139 	case ANY:
1140 	    p = "ANY";
1141 	    break;
1142 	case ANYOF:
1143 	    p = "ANYOF";
1144 	    break;
1145 	case ANYBUT:
1146 	    p = "ANYBUT";
1147 	    break;
1148 	case BRANCH:
1149 	    p = "BRANCH";
1150 	    break;
1151 	case EXACTLY:
1152 	    p = "EXACTLY";
1153 	    break;
1154 	case NOTHING:
1155 	    p = "NOTHING";
1156 	    break;
1157 	case BACK:
1158 	    p = "BACK";
1159 	    break;
1160 	case END:
1161 	    p = "END";
1162 	    break;
1163 	case OPEN + 1:
1164 	case OPEN + 2:
1165 	case OPEN + 3:
1166 	case OPEN + 4:
1167 	case OPEN + 5:
1168 	case OPEN + 6:
1169 	case OPEN + 7:
1170 	case OPEN + 8:
1171 	case OPEN + 9:
1172 	    sprintf(buf + strlen(buf), "OPEN%d", OP(op) - OPEN);
1173 	    p = NULL;
1174 	    break;
1175 	case CLOSE + 1:
1176 	case CLOSE + 2:
1177 	case CLOSE + 3:
1178 	case CLOSE + 4:
1179 	case CLOSE + 5:
1180 	case CLOSE + 6:
1181 	case CLOSE + 7:
1182 	case CLOSE + 8:
1183 	case CLOSE + 9:
1184 	    sprintf(buf + strlen(buf), "CLOSE%d", OP(op) - CLOSE);
1185 	    p = NULL;
1186 	    break;
1187 	case STAR:
1188 	    p = "STAR";
1189 	    break;
1190 	case PLUS:
1191 	    p = "PLUS";
1192 	    break;
1193 	default:
1194 	    regerror("corrupted opcode");
1195 	    break;
1196     }
1197     if (p != NULL)
1198 	(void) strcat(buf, p);
1199     return (buf);
1200 }
1201 
1202 #endif
1203 
1204 /*
1205  * The following is provided for those people who do not have strcspn() in
1206  * their C libraries.  They should get off their butts and do something
1207  * about it; at least one public-domain implementation of those (highly
1208  * useful) string routines has been published on Usenet.
1209  */
1210 
1211 #ifdef STRCSPN
1212 /*
1213  * strcspn - find length of initial segment of s1 consisting entirely
1214  * of characters not from s2
1215  */
1216 
1217 static int
strcspn(char * s1,char * s2)1218 strcspn(char *s1, char *s2)
1219 {
1220     register char  *scan1;
1221     register char  *scan2;
1222     register int    count;
1223 
1224     count = 0;
1225     for (scan1 = s1; *scan1 != '\0'; scan1++) {
1226 	for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
1227 	    if (*scan1 == *scan2++)
1228 		return (count);
1229 	count++;
1230     }
1231     return (count);
1232 }
1233 
1234 #endif
1235 
1236 
1237 void
regerror(char * s)1238 regerror(char *s)
1239 {
1240 
1241     msg("regexp: %s", s);
1242     /* NOTREACHED */
1243 }
1244