1 /*
2  *
3  * regexp.c - regular expression matching
4  *
5  * DESCRIPTION
6  *
7  *	Underneath the reformatting and comment blocks which were added to
8  *	make it consistent with the rest of the code, you will find a
9  *	modified version of Henry Specer's regular expression library.
10  *	Henry's functions were modified to provide the minimal regular
11  *	expression matching, as required by P1003.  Henry's code was
12  *	copyrighted, and copy of the copyright message and restrictions
13  *	are provided, verbatim, below:
14  *
15  *	Copyright (c) 1986 by University of Toronto.
16  *	Written by Henry Spencer.  Not derived from licensed software.
17  *
18  *	Permission is granted to anyone to use this software for any
19  *	purpose on any computer system, and to redistribute it freely,
20  *	subject to the following restrictions:
21  *
22  *	1. The author is not responsible for the consequences of use of
23  *         this software, no matter how awful, even if they arise
24  *	   from defects in it.
25  *
26  *	2. The origin of this software must not be misrepresented, either
27  *	   by explicit claim or by omission.
28  *
29  *	3. Altered versions must be plainly marked as such, and must not
30  *	   be misrepresented as being the original software.
31  *
32  *
33  * This version modified by Ian Phillipps to return pointer to terminating
34  * NUL on substitution string. [ Temp mail address ex-igp@camcon.co.uk ]
35  *
36  *	Altered by amylaar to support excompatible option and the
37  *      operators \< and >\ . ( 7.Sep. 1991 )
38  *
39  * regsub altered by amylaar to take an additional parameter specifying
40  * maximum number of bytes that can be written to the memory region
41  * pointed to by dest
42  *
43  * Also altered by Fredrik Hubinette to handle the + operator and
44  * eight-bit chars. Mars 22 1996
45  *
46  *
47  * 	Beware that some of this code is subtly aware of the way operator
48  * 	precedence is structured in regular expressions.  Serious changes in
49  * 	regular-expression syntax might require a total rethink.
50  *
51  * AUTHORS
52  *
53  *     Mark H. Colburn, NAPS International (mark@jhereg.mn.org)
54  *     Henry Spencer, University of Torronto (henry@utzoo.edu)
55  *
56  * Sponsored by The USENIX Association for public distribution.
57  *
58  */
59 
60 /* Headers */
61 #include "my_global.h"
62 #include <ctype.h>
63 #include "regexp.h"
64 #ifdef	__WIN__
65 #include <string.h>
66 #else
67 #include "memory.h"
68 #include "error.h"
69 #endif
70 
71 /*
72  * The "internal use only" fields in regexp.h are present to pass info from
73  * compile to execute that permits the execute phase to run lots faster on
74  * simple cases.  They are:
75  *
76  * regstart	char that must begin a match; '\0' if none obvious
77  * reganch	is the match anchored (at beginning-of-line only)?
78  * regmust	string (pointer into program) that match must include, or NULL
79  * regmlen	length of regmust string
80  *
81  * Regstart and reganch permit very fast decisions on suitable starting points
82  * for a match, cutting down the work a lot.  Regmust permits fast rejection
83  * of lines that cannot possibly match.  The regmust tests are costly enough
84  * that regcomp() supplies a regmust only if the r.e. contains something
85  * potentially expensive (at present, the only such thing detected is * or +
86  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
87  * supplied because the test in regexec() needs it and regcomp() is computing
88  * it anyway.
89  */
90 
91 /*
92  * Structure for regexp "program".  This is essentially a linear encoding
93  * of a nondeterministic finite-state machine (aka syntax charts or
94  * "railroad normal form" in parsing technology).  Each node is an opcode
95  * plus a "nxt" pointer, possibly plus an operand.  "Nxt" pointers of
96  * all nodes except BRANCH implement concatenation; a "nxt" pointer with
97  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
98  * have one of the subtle syntax dependencies:  an individual BRANCH (as
99  * opposed to a collection of them) is never concatenated with anything
100  * because of operator precedence.)  The operand of some types of node is
101  * a literal string; for others, it is a node leading into a sub-FSM.  In
102  * particular, the operand of a BRANCH node is the first node of the branch.
103  * (NB this is *not* a tree structure:  the tail of the branch connects
104  * to the thing following the set of BRANCHes.)  The opcodes are:
105  */
106 
107 /* definition	number	opnd?	meaning */
108 #define	END	0		/* no	End of program. */
109 #define	BOL	1		/* no	Match "" at beginning of line. */
110 #define	EOL	2		/* no	Match "" at end of line. */
111 #define	ANY	3		/* no	Match any one character. */
112 #define	ANYOF	4		/* str	Match any character in this string. */
113 #define	ANYBUT	5		/* str	Match any character not in this
114 				 * string. */
115 #define	BRANCH	6		/* node	Match this alternative, or the
116 				 * nxt... */
117 #define	BACK	7		/* no	Match "", "nxt" ptr points backward. */
118 #define	EXACTLY	8		/* str	Match this string. */
119 #define	NOTHING	9		/* no	Match empty string. */
120 #define	STAR	10		/* node	Match this (simple) thing 0 or more
121 				 * times. */
122 #define WORDSTART 11		/* node matching a start of a word          */
123 #define WORDEND 12		/* node matching an end of a word           */
124 #define	OPEN	20		/* no	Mark this point in input as start of
125 				 * #n. */
126  /* OPEN+1 is number 1, etc. */
127 #define	CLOSE	30		/* no	Analogous to OPEN. */
128 
129 /*
130  * Opcode notes:
131  *
132  * BRANCH	The set of branches constituting a single choice are hooked
133  *		together with their "nxt" pointers, since precedence prevents
134  *		anything being concatenated to any individual branch.  The
135  *		"nxt" pointer of the last BRANCH in a choice points to the
136  *		thing following the whole choice.  This is also where the
137  *		final "nxt" pointer of each individual branch points; each
138  *		branch starts with the operand node of a BRANCH node.
139  *
140  * BACK		Normal "nxt" pointers all implicitly point forward; BACK
141  *		exists to make loop structures possible.
142  *
143  * STAR		complex '*', are implemented as circular BRANCH structures
144  *		using BACK.  Simple cases (one character per match) are
145  *		implemented with STAR for speed and to minimize recursive
146  *		plunges.
147  *
148  * OPEN,CLOSE	...are numbered at compile time.
149  */
150 
151 /*
152  * A node is one char of opcode followed by two chars of "nxt" pointer.
153  * "Nxt" pointers are stored as two 8-bit pieces, high order first.  The
154  * value is a positive offset from the opcode of the node containing it.
155  * An operand, if any, simply follows the node.  (Note that much of the
156  * code generation knows about this implicit relationship.)
157  *
158  * Using two bytes for the "nxt" pointer is vast overkill for most things,
159  * but allows patterns to get big without disasters.
160  */
161 #define	OP(p)	(*(p))
162 #define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
163 #define	OPERAND(p)	((p) + 3)
164 
165 /*
166  * The first byte of the regexp internal "program" is actually this magic
167  * number; the start node begins in the second byte.
168  */
169 #define	MAGIC	0234
170 
171 /*
172  * Utility definitions.
173  */
174 
175 #ifdef	__WIN__
176 #define error(X,Y) fprintf(stderr, X, Y)
177 #endif
178 #define regerror(X) error("Regexp: %s\n",X);
179 #define SPECIAL 0x100
180 #define LBRAC	('('|SPECIAL)
181 #define RBRAC	(')'|SPECIAL)
182 #define ASTERIX	('*'|SPECIAL)
183 #define PLUS	('+'|SPECIAL)
184 #define OR_OP	('|'|SPECIAL)
185 #define DOLLAR	('$'|SPECIAL)
186 #define DOT	('.'|SPECIAL)
187 #define CARET	('^'|SPECIAL)
188 #define LSQBRAC ('['|SPECIAL)
189 #define RSQBRAC (']'|SPECIAL)
190 #define LSHBRAC ('<'|SPECIAL)
191 #define RSHBRAC ('>'|SPECIAL)
192 #define	FAIL(m)	{ regerror(m); return(NULL); }
193 #define	ISMULT(c)	((c) == ASTERIX || (c)==PLUS)
194 #define	META	"^$.[()|*+\\"
195 #ifndef CHARBITS
196 #define CHARBITS	0xff
197 #define	UCHARAT(p)	((int)*(unsigned char *)(p))
198 #else
199 #define	UCHARAT(p)	((int)*(p)&CHARBITS)
200 #endif
201 #define ISWORDPART(c) ( isalnum(c) || (c) == '_' )
202 
203 /*
204  * Flags to be passed up and down.
205  */
206 #define	HASWIDTH	01	/* Known never to match null string. */
207 #define	SIMPLE		02	/* Simple enough to be STAR operand. */
208 #define	SPSTART		04	/* Starts with * */
209 #define	WORST		0	/* Worst case. */
210 #ifdef __WIN__
211 #define	STRCHR(A,B)	strchr(A,B)
212 #endif
213 
214 /*
215  * Global work variables for regcomp().
216  */
217 static short   *regparse;	/* Input-scan pointer. */
218 static int      regnpar;	/* () count. */
219 static char     regdummy;
220 static char    *regcode;	/* Code-emit pointer; &regdummy = don't. */
221 static long     regsize;	/* Code size. */
222 
223 /*
224  * Forward declarations for regcomp()'s friends.
225  */
226 #ifndef STATIC
227 #define	STATIC	static
228 #endif
229 STATIC char    *reg();
230 STATIC char    *regbranch();
231 STATIC char    *regpiece();
232 STATIC char    *regatom();
233 STATIC char    *regnode();
234 STATIC char    *regnext();
235 STATIC void     regc();
236 STATIC void     reginsert();
237 STATIC void     regtail();
238 STATIC void     regoptail();
239 
240 /*
241  - regcomp - compile a regular expression into internal code
242  *
243  * We can't allocate space until we know how big the compiled form will be,
244  * but we can't compile it (and thus know how big it is) until we've got a
245  * place to put the code.  So we cheat:  we compile it twice, once with code
246  * generation turned off and size counting turned on, and once "for real".
247  * This also means that we don't allocate space until we are sure that the
248  * thing really will compile successfully, and we never have to move the
249  * code and thus invalidate pointers into it.  (Note that it has to be in
250  * one piece because free() must be able to free it all.)
251  *
252  * Beware that the optimization-preparation code in here knows about some
253  * of the structure of the compiled regexp.
254  */
regcomp(exp,excompat)255 regexp *regcomp(exp,excompat)
256 char   *exp;
257 int		excompat;	/* \( \) operators like in unix ex */
258 {
259     register regexp *r;
260     register char  *scan;
261     register char  *longest;
262     register int    len;
263     int             flags;
264     short	   *exp2,*dest,c;
265 
266     if (exp == (char *)NULL)
267 	FAIL("NULL argument");
268 
269     exp2=(short*)malloc( (strlen(exp)+1) * (sizeof(short[8])/sizeof(char[8])) );
270     for ( scan=exp,dest=exp2;( c= UCHARAT(scan++)); ) {
271 	switch (c) {
272 	    case '(':
273 	    case ')':
274 		*dest++ = excompat ? c : c | SPECIAL;
275 		break;
276 	    case '.':
277 	    case '*':
278 	    case '+':
279 	    case '|':
280 	    case '$':
281 	    case '^':
282 	    case '[':
283 	    case ']':
284 		*dest++ =  c | SPECIAL;
285 		break;
286 	    case '\\':
287 		switch ( c = *scan++ ) {
288 		    case '(':
289 		    case ')':
290 			*dest++ = excompat ? c | SPECIAL : c;
291 			break;
292 		    case '<':
293 		    case '>':
294 			*dest++ = c | SPECIAL;
295 			break;
296 		    case '{':
297 		    case '}':
298 			FAIL("sorry, unimplemented operator");
299 		    case 'b': *dest++ = '\b'; break;
300 		    case 't': *dest++ = '\t'; break;
301 		    case 'r': *dest++ = '\r'; break;
302 		    default:
303 			*dest++ = c;
304 		}
305 		break;
306 	    default:
307 		*dest++ = c;
308 	}
309     }
310     *dest=0;
311     /* First pass: determine size, legality. */
312     regparse = exp2;
313     regnpar = 1;
314     regsize = 0L;
315     regcode = &regdummy;
316     regc(MAGIC);
317     if (reg(0, &flags) == (char *)NULL)
318 	return ((regexp *)NULL);
319 
320     /* Small enough for pointer-storage convention? */
321     if (regsize >= 32767L)	/* Probably could be 65535L. */
322 	FAIL("regexp too big");
323 
324     /* Allocate space. */
325     r = (regexp *) malloc(sizeof(regexp) + (unsigned) regsize);
326     if (r == (regexp *) NULL)
327 	FAIL("out of space");
328     memset(r, 0, sizeof(regexp) + (unsigned)regsize);
329 
330     /* Second pass: emit code. */
331     regparse = exp2;
332     regnpar = 1;
333     regcode = r->program;
334     regc(MAGIC);
335     if (reg(0, &flags) == NULL)
336 	return ((regexp *) NULL);
337 
338     /* Dig out information for optimizations. */
339     r->regstart = '\0';		/* Worst-case defaults. */
340     r->reganch = 0;
341     r->regmust = NULL;
342     r->regmlen = 0;
343     scan = r->program + 1;	/* First BRANCH. */
344     if (OP(regnext(scan)) == END) {	/* Only one top-level choice. */
345 	scan = OPERAND(scan);
346 
347 	/* Starting-point info. */
348 	if (OP(scan) == EXACTLY)
349 	    r->regstart = *OPERAND(scan);
350 	else if (OP(scan) == BOL)
351 	    r->reganch++;
352 
353 	/*
354 	 * If there's something expensive in the r.e., find the longest
355 	 * literal string that must appear and make it the regmust.  Resolve
356 	 * ties in favor of later strings, since the regstart check works
357 	 * with the beginning of the r.e. and avoiding duplication
358 	 * strengthens checking.  Not a strong reason, but sufficient in the
359 	 * absence of others.
360 	 */
361 	if (flags & SPSTART) {
362 	    longest = NULL;
363 	    len = 0;
364 	    for (; scan != NULL; scan = regnext(scan))
365 		if (OP(scan) == EXACTLY &&
366 		    (int)strlen(OPERAND(scan)) >= len) {
367 		    longest = OPERAND(scan);
368 		    len = strlen(OPERAND(scan));
369 		}
370 	    r->regmust = longest;
371 	    r->regmlen = len;
372 	}
373     }
374     free((char*)exp2);
375     return (r);
376 }
377 
378 /*
379  - reg - regular expression, i.e. main body or parenthesized thing
380  *
381  * Caller must absorb opening parenthesis.
382  *
383  * Combining parenthesis handling with the base level of regular expression
384  * is a trifle forced, but the need to tie the tails of the branches to what
385  * follows makes it hard to avoid.
386  */
reg(paren,flagp)387 static char *reg(paren, flagp)
388 int             paren;		/* Parenthesized? */
389 int            *flagp;
390 {
391     register char  *ret;
392     register char  *br;
393     register char  *ender;
394     register int    parno=0; /* make gcc happy */
395     int             flags;
396 
397     *flagp = HASWIDTH;		/* Tentatively. */
398 
399     /* Make an OPEN node, if parenthesized. */
400     if (paren) {
401 	if (regnpar >= NSUBEXP)
402 	    FAIL("too many ()");
403 	parno = regnpar;
404 	regnpar++;
405 	ret = regnode(OPEN + parno);
406     } else
407 	ret = (char *)NULL;
408 
409     /* Pick up the branches, linking them together. */
410     br = regbranch(&flags);
411     if (br == (char *)NULL)
412 	return ((char *)NULL);
413     if (ret != (char *)NULL)
414 	regtail(ret, br);	/* OPEN -> first. */
415     else
416 	ret = br;
417     if (!(flags & HASWIDTH))
418 	*flagp &= ~HASWIDTH;
419     *flagp |= flags & SPSTART;
420     while (*regparse == OR_OP) {
421 	regparse++;
422 	br = regbranch(&flags);
423 	if (br == (char *)NULL)
424 	    return ((char *)NULL);
425 	regtail(ret, br);	/* BRANCH -> BRANCH. */
426 	if (!(flags & HASWIDTH))
427 	    *flagp &= ~HASWIDTH;
428 	*flagp |= flags & SPSTART;
429     }
430 
431     /* Make a closing node, and hook it on the end. */
432     ender = regnode((paren) ? CLOSE + parno : END);
433     regtail(ret, ender);
434 
435     /* Hook the tails of the branches to the closing node. */
436     for (br = ret; br != (char *)NULL; br = regnext(br))
437 	regoptail(br, ender);
438 
439     /* Check for proper termination. */
440     if (paren && *regparse++ != RBRAC) {
441 	FAIL("unmatched ()");
442     } else if (!paren && *regparse != '\0') {
443 	if (*regparse == RBRAC) {
444 	    FAIL("unmatched ()");
445 	} else
446 	    FAIL("junk on end");/* "Can't happen". */
447 	/* NOTREACHED */
448     }
449     return (ret);
450 }
451 
452 /*
453  - regbranch - one alternative of an | operator
454  *
455  * Implements the concatenation operator.
456  */
regbranch(flagp)457 static char  *regbranch(flagp)
458 int            *flagp;
459 {
460     register char  *ret;
461     register char  *chain;
462     register char  *latest;
463     int             flags;
464 
465     *flagp = WORST;		/* Tentatively. */
466 
467     ret = regnode(BRANCH);
468     chain = (char *)NULL;
469     while (*regparse != '\0' && *regparse != OR_OP && *regparse != RBRAC) {
470 	latest = regpiece(&flags);
471 	if (latest == (char *)NULL)
472 	    return ((char *)NULL);
473 	*flagp |= flags & HASWIDTH;
474 	if (chain == (char *)NULL)	/* First piece. */
475 	    *flagp |= flags & SPSTART;
476 	else
477 	    regtail(chain, latest);
478 	chain = latest;
479     }
480     if (chain == (char *)NULL)		/* Loop ran zero times. */
481 	regnode(NOTHING);
482 
483     return (ret);
484 }
485 
486 /*
487  - regpiece - something followed by possible [*]
488  *
489  * Note that the branching code sequence used for * is somewhat optimized:
490  * they use the same NOTHING node as both the endmarker for their branch
491  * list and the body of the last branch.  It might seem that this node could
492  * be dispensed with entirely, but the endmarker role is not redundant.
493  */
regpiece(flagp)494 static char *regpiece(flagp)
495 int            *flagp;
496 {
497   register char  *ret;
498   register short  op;
499   /* register char  *nxt; */
500   int             flags;
501 
502   ret = regatom(&flags);
503   if (ret == (char *)NULL)
504     return ((char *)NULL);
505 
506   op = *regparse;
507   if (!ISMULT(op)) {
508     *flagp = flags;
509     return (ret);
510   }
511   if (!(flags & HASWIDTH))
512     FAIL("* or + operand could be empty");
513   *flagp = (WORST | SPSTART);
514 
515   if(op == ASTERIX)
516   {
517     if (flags & SIMPLE)
518     {
519       reginsert(STAR, ret);
520     }
521     else
522     {
523       /* Emit x* as (x&|), where & means "self". */
524       reginsert(BRANCH, ret);	/* Either x */
525       regoptail(ret, regnode(BACK)); /* and loop */
526       regoptail(ret, ret);	/* back */
527       regtail(ret, regnode(BRANCH)); /* or */
528       regtail(ret, regnode(NOTHING)); /* null. */
529     }
530   }
531   else if(op == PLUS)
532   {
533     /*  Emit a+ as (a&) where & means "self" /Fredrik Hubinette */
534     char *tmp;
535     tmp=regnode(BACK);
536     reginsert(BRANCH, tmp);
537     regtail(ret, tmp);
538     regoptail(tmp, ret);
539     regtail(ret, regnode(BRANCH));
540     regtail(ret, regnode(NOTHING));
541   }
542 
543   regparse++;
544   if (ISMULT(*regparse))
545     FAIL("nested * or +");
546 
547   return (ret);
548 }
549 
550 
551 /*
552  - regatom - the lowest level
553  *
554  * Optimization:  gobbles an entire sequence of ordinary characters so that
555  * it can turn them into a single node, which is smaller to store and
556  * faster to run.
557  */
regatom(flagp)558 static char *regatom(flagp)
559 int            *flagp;
560 {
561     register char  *ret;
562     int             flags;
563 
564     *flagp = WORST;		/* Tentatively. */
565 
566     switch (*regparse++) {
567     case CARET:
568 	ret = regnode(BOL);
569 	break;
570     case DOLLAR:
571 	ret = regnode(EOL);
572 	break;
573     case DOT:
574 	ret = regnode(ANY);
575 	*flagp |= HASWIDTH | SIMPLE;
576 	break;
577     case LSHBRAC:
578 	ret = regnode(WORDSTART);
579 	break;
580     case RSHBRAC:
581 	ret = regnode(WORDEND);
582 	break;
583     case LSQBRAC:{
584 	    register int    class;
585 	    register int    classend;
586 
587 	    if (*regparse == CARET) {	/* Complement of range. */
588 		ret = regnode(ANYBUT);
589 		regparse++;
590 	    } else
591 		ret = regnode(ANYOF);
592 	    if (*regparse == RSQBRAC || *regparse == '-')
593 		regc(*regparse++);
594 	    while (*regparse != '\0' && *regparse != RSQBRAC) {
595 		if (*regparse == '-') {
596 		    regparse++;
597 		    if (*regparse == RSQBRAC || *regparse == '\0')
598 			regc('-');
599 		    else {
600 			class = (CHARBITS & *(regparse - 2)) + 1;
601 			classend = (CHARBITS & *(regparse));
602 			if (class > classend + 1)
603 			    FAIL("invalid [] range");
604 			for (; class <= classend; class++)
605 			    regc(class);
606 			regparse++;
607 		    }
608 		} else
609 		    regc(*regparse++);
610 	    }
611 	    regc('\0');
612 	    if (*regparse != RSQBRAC)
613 		FAIL("unmatched []");
614 	    regparse++;
615 	    *flagp |= HASWIDTH | SIMPLE;
616 	}
617 	break;
618     case LBRAC:
619 	ret = reg(1, &flags);
620 	if (ret == (char *)NULL)
621 	    return ((char *)NULL);
622 	*flagp |= flags & (HASWIDTH | SPSTART);
623 	break;
624     case '\0':
625     case OR_OP:
626     case RBRAC:
627 	FAIL("internal urp");	/* Supposed to be caught earlier. */
628 
629     case ASTERIX:
630 	FAIL("* follows nothing\n");
631 
632     default:{
633 	    register int    len;
634 	    register short  ender;
635 
636 	    regparse--;
637 	    for (len=0; regparse[len] &&
638 	        !(regparse[len]&SPECIAL) && regparse[len] != RSQBRAC; len++) ;
639 	    if (len <= 0)
640 	    {
641 	      FAIL("internal disaster");
642 	    }
643 	    ender = *(regparse + len);
644 	    if (len > 1 && ISMULT(ender))
645 		len--;		/* Back off clear of * operand. */
646 	    *flagp |= HASWIDTH;
647 	    if (len == 1)
648 		*flagp |= SIMPLE;
649 	    ret = regnode(EXACTLY);
650 	    while (len > 0) {
651 		regc(*regparse++);
652 		len--;
653 	    }
654 	    regc('\0');
655 	}
656 	break;
657     }
658 
659     return (ret);
660 }
661 
662 /*
663  - regnode - emit a node
664  */
regnode(op)665 static char *regnode(op)
666 char            op;
667 {
668     register char  *ret;
669     register char  *ptr;
670 
671     ret = regcode;
672     if (ret == &regdummy) {
673 	regsize += 3;
674 	return (ret);
675     }
676     ptr = ret;
677     *ptr++ = op;
678     *ptr++ = '\0';		/* Null "nxt" pointer. */
679     *ptr++ = '\0';
680     regcode = ptr;
681 
682     return (ret);
683 }
684 
685 /*
686  - regc - emit (if appropriate) a byte of code
687  */
regc(b)688 static void regc(b)
689 char            b;
690 {
691     if (regcode != &regdummy)
692 	*regcode++ = b;
693     else
694 	regsize++;
695 }
696 
697 /*
698  - reginsert - insert an operator in front of already-emitted operand
699  *
700  * Means relocating the operand.
701  */
reginsert(op,opnd)702 static void reginsert(op, opnd)
703 char            op;
704 char           *opnd;
705 {
706     register char  *src;
707     register char  *dst;
708     register char  *place;
709 
710     if (regcode == &regdummy) {
711 	regsize += 3;
712 	return;
713     }
714     src = regcode;
715     regcode += 3;
716     dst = regcode;
717     while (src > opnd)
718 	*--dst = *--src;
719 
720     place = opnd;		/* Op node, where operand used to be. */
721     *place++ = op;
722     *place++ = '\0';
723     *place++ = '\0';
724 }
725 
726 /*
727  - regtail - set the next-pointer at the end of a node chain
728  */
regtail(p,val)729 static void regtail(p, val)
730 char           *p;
731 char           *val;
732 {
733     register char  *scan;
734     register char  *temp;
735     register int    offset;
736 
737     if (p == &regdummy)
738 	return;
739 
740     /* Find last node. */
741     scan = p;
742     for (;;) {
743 	temp = regnext(scan);
744 	if (temp == (char *)NULL)
745 	    break;
746 	scan = temp;
747     }
748 
749     if (OP(scan) == BACK)
750 	offset = scan - val;
751     else
752 	offset = val - scan;
753     *(scan + 1) = (offset >> 8) & 0377;
754     *(scan + 2) = offset & 0377;
755 }
756 
757 /*
758  - regoptail - regtail on operand of first argument; nop if operandless
759  */
regoptail(p,val)760 static void regoptail(p, val)
761 char           *p;
762 char           *val;
763 {
764     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
765     if (p == (char *)NULL || p == &regdummy || OP(p) != BRANCH)
766 	return;
767     regtail(OPERAND(p), val);
768 }
769 
770 /*
771  * regexec and friends
772  */
773 
774 /*
775  * Global work variables for regexec().
776  */
777 static char    *reginput;	/* String-input pointer. */
778 static char    *regbol;		/* Beginning of input, for ^ check. */
779 static char   **regstartp;	/* Pointer to startp array. */
780 static char   **regendp;	/* Ditto for endp. */
781 
782 /*
783  * Forwards.
784  */
785 STATIC int      regtry();
786 STATIC int      regmatch();
787 STATIC int      regrepeat();
788 
789 #ifdef DEBUG
790 int             regnarrate = 0;
791 void            regdump();
792 STATIC char    *regprop();
793 #endif
794 
795 /*
796  - regexec - match a regexp against a string
797  */
regexec(prog,string)798 int regexec(prog, string)
799 register regexp *prog;
800 register char  *string;
801 {
802     register char  *s;
803 
804     /* Be paranoid... */
805     if (prog == (regexp *)NULL || string == (char *)NULL) {
806 	regerror("NULL parameter");
807 	return (0);
808     }
809     /* Check validity of program. */
810     if (UCHARAT(prog->program) != MAGIC) {
811 	regerror("corrupted program");
812 	return (0);
813     }
814     /* If there is a "must appear" string, look for it. */
815     if (prog->regmust != (char *)NULL) {
816 	s = string;
817 	while ((s = STRCHR(s, prog->regmust[0])) != (char *)NULL) {
818 	    if (strncmp(s, prog->regmust, prog->regmlen) == 0)
819 		break;		/* Found it. */
820 	    s++;
821 	}
822 	if (s == (char *)NULL)		/* Not present. */
823 	    return (0);
824     }
825     /* Mark beginning of line for ^ . */
826     regbol = string;
827 
828     /* Simplest case:  anchored match need be tried only once. */
829     if (prog->reganch)
830 	return (regtry(prog, string));
831 
832     /* Messy cases:  unanchored match. */
833     s = string;
834     if (prog->regstart != '\0')
835 	/* We know what char it must start with. */
836 	while ((s = STRCHR(s, prog->regstart)) != (char *)NULL) {
837 	    if (regtry(prog, s))
838 		return (1);
839 	    s++;
840 	}
841     else
842 	/* We don't -- general case. */
843 	do {
844 	    if (regtry(prog, s))
845 		return (1);
846 	} while (*s++ != '\0');
847 
848     /* Failure. */
849     return (0);
850 }
851 
852 /*
853  - regtry - try match at specific point
854  */
855 
regtry(regexp * prog,char * string)856 static int regtry(regexp *prog, char *string)
857 {
858     register int    i;
859     register char **sp;
860     register char **ep;
861 
862     reginput = string;
863     regstartp = prog->startp;
864     regendp = prog->endp;
865 
866     sp = prog->startp;
867     ep = prog->endp;
868     for (i = NSUBEXP; i > 0; i--) {
869 	*sp++ = (char *)NULL;
870 	*ep++ = (char *)NULL;
871     }
872     if (regmatch(prog->program + 1)) {
873 	prog->startp[0] = string;
874 	prog->endp[0] = reginput;
875 	return (1);
876     } else
877 	return (0);
878 }
879 
880 /*
881  - regmatch - main matching routine
882  *
883  * Conceptually the strategy is simple:  check to see whether the current
884  * node matches, call self recursively to see whether the rest matches,
885  * and then act accordingly.  In practice we make some effort to avoid
886  * recursion, in particular by going through "ordinary" nodes (that don't
887  * need to know whether the rest of the match failed) by a loop instead of
888  * by recursion.
889  */
890 
regmatch(char * prog)891 static int regmatch(char *prog)
892 {
893     register char  *scan;	/* Current node. */
894     char           *nxt;	/* nxt node. */
895 
896     scan = prog;
897 #ifdef DEBUG
898     if (scan != (char *)NULL && regnarrate)
899 	fprintf(stderr, "%s(\n", regprop(scan));
900 #endif
901     while (scan != (char *)NULL) {
902 #ifdef DEBUG
903 	if (regnarrate)
904 	    fprintf(stderr, "%s...\n", regprop(scan));
905 #endif
906 	nxt = regnext(scan);
907 
908 	switch (OP(scan)) {
909 	case BOL:
910 	    if (reginput != regbol)
911 		return (0);
912 	    break;
913 	case EOL:
914 	    if (*reginput != '\0')
915 		return (0);
916 	    break;
917 	case ANY:
918 	    if (*reginput == '\0')
919 		return (0);
920 	    reginput++;
921 	    break;
922 	case WORDSTART:
923 	    if (reginput == regbol)
924 		break;
925 	    if (*reginput == '\0' ||
926 	       ISWORDPART( *(reginput-1) ) || !ISWORDPART( *reginput ) )
927 		return (0);
928 	    break;
929 	case WORDEND:
930 	    if (*reginput == '\0')
931 		break;
932 	    if ( reginput == regbol ||
933 	       !ISWORDPART( *(reginput-1) ) || ISWORDPART( *reginput ) )
934 		return (0);
935 	    break;
936 	case EXACTLY:{
937 		register int    len;
938 		register char  *opnd;
939 
940 		opnd = OPERAND(scan);
941 		/* Inline the first character, for speed. */
942 		if (*opnd != *reginput)
943 		    return (0);
944 		len = strlen(opnd);
945 		if (len > 1 && strncmp(opnd, reginput, len) != 0)
946 		    return (0);
947 		reginput += len;
948 	    }
949 	    break;
950 	case ANYOF:
951 	    if (*reginput == '\0' ||
952 		 STRCHR(OPERAND(scan), *reginput) == (char *)NULL)
953 		return (0);
954 	    reginput++;
955 	    break;
956 	case ANYBUT:
957 	    if (*reginput == '\0' ||
958 		 STRCHR(OPERAND(scan), *reginput) != (char *)NULL)
959 		return (0);
960 	    reginput++;
961 	    break;
962 	case NOTHING:
963 	    break;
964 	case BACK:
965 	    break;
966 	case OPEN + 1:
967 	case OPEN + 2:
968 	case OPEN + 3:
969 	case OPEN + 4:
970 	case OPEN + 5:
971 	case OPEN + 6:
972 	case OPEN + 7:
973 	case OPEN + 8:
974 	case OPEN + 9:{
975 		register int    no;
976 		register char  *save;
977 
978 		no = OP(scan) - OPEN;
979 		save = reginput;
980 
981 		if (regmatch(nxt)) {
982 		    /*
983 		     * Don't set startp if some later invocation of the same
984 		     * parentheses already has.
985 		     */
986 		    if (regstartp[no] == (char *)NULL)
987 			regstartp[no] = save;
988 		    return (1);
989 		} else
990 		    return (0);
991 	    }
992 
993 	case CLOSE + 1:
994 	case CLOSE + 2:
995 	case CLOSE + 3:
996 	case CLOSE + 4:
997 	case CLOSE + 5:
998 	case CLOSE + 6:
999 	case CLOSE + 7:
1000 	case CLOSE + 8:
1001 	case CLOSE + 9:{
1002 		register int    no;
1003 		register char  *save;
1004 
1005 		no = OP(scan) - CLOSE;
1006 		save = reginput;
1007 
1008 		if (regmatch(nxt)) {
1009 		    /*
1010 		     * Don't set endp if some later invocation of the same
1011 		     * parentheses already has.
1012 		     */
1013 		    if (regendp[no] == (char *)NULL)
1014 			regendp[no] = save;
1015 		    return (1);
1016 		} else
1017 		    return (0);
1018 	    }
1019 
1020 	case BRANCH:{
1021 		register char  *save;
1022 
1023 		if (OP(nxt) != BRANCH)	/* No choice. */
1024 		    nxt = OPERAND(scan);	/* Avoid recursion. */
1025 		else {
1026 		    do {
1027 			save = reginput;
1028 			if (regmatch(OPERAND(scan)))
1029 			    return (1);
1030 			reginput = save;
1031 			scan = regnext(scan);
1032 		    } while (scan != (char *)NULL && OP(scan) == BRANCH);
1033 		    return (0);
1034 		    /* NOTREACHED */
1035 		}
1036 	    }
1037 	    break;
1038 	case STAR:{
1039 		register char   nextch;
1040 		register int    no;
1041 		register char  *save;
1042 		register int    minimum;
1043 
1044 		/*
1045 		 * Lookahead to avoid useless match attempts when we know
1046 		 * what character comes next.
1047 		 */
1048 		nextch = '\0';
1049 		if (OP(nxt) == EXACTLY)
1050 		    nextch = *OPERAND(nxt);
1051 		minimum = (OP(scan) == STAR) ? 0 : 1;
1052 		save = reginput;
1053 		no = regrepeat(OPERAND(scan));
1054 		while (no >= minimum) {
1055 		    /* If it could work, try it. */
1056 		    if (nextch == '\0' || *reginput == nextch)
1057 			if (regmatch(nxt))
1058 			    return (1);
1059 		    /* Couldn't or didn't -- back up. */
1060 		    no--;
1061 		    reginput = save + no;
1062 		}
1063 		return (0);
1064 	    }
1065 
1066 	case END:
1067 	    return (1);		/* Success! */
1068 
1069 	default:
1070 	    regerror("memory corruption");
1071 	    return (0);
1072 
1073 	}
1074 
1075 	scan = nxt;
1076     }
1077 
1078     /*
1079      * We get here only if there's trouble -- normally "case END" is the
1080      * terminating point.
1081      */
1082     regerror("corrupted pointers");
1083     return (0);
1084 }
1085 
1086 /*
1087  - regrepeat - repeatedly match something simple, report how many
1088  */
1089 
regrepeat(char * p)1090 static int regrepeat(char *p)
1091 {
1092     register int    count = 0;
1093     register char  *scan;
1094     register char  *opnd;
1095 
1096     scan = reginput;
1097     opnd = OPERAND(p);
1098     switch (OP(p)) {
1099     case ANY:
1100 	count = strlen(scan);
1101 	scan += count;
1102 	break;
1103     case EXACTLY:
1104 	while (*opnd == *scan) {
1105 	    count++;
1106 	    scan++;
1107 	}
1108 	break;
1109     case ANYOF:
1110 	while (*scan != '\0' && STRCHR(opnd, *scan) != (char *)NULL) {
1111 	    count++;
1112 	    scan++;
1113 	}
1114 	break;
1115     case ANYBUT:
1116 	while (*scan != '\0' && STRCHR(opnd, *scan) == (char *)NULL) {
1117 	    count++;
1118 	    scan++;
1119 	}
1120 	break;
1121     default:			/* Oh dear.  Called inappropriately. */
1122 	regerror("internal foulup");
1123 	count = 0;		/* Best compromise. */
1124 	break;
1125     }
1126     reginput = scan;
1127 
1128     return (count);
1129 }
1130 
1131 
1132 /*
1133  - regnext - dig the "nxt" pointer out of a node
1134  */
1135 
regnext(register char * p)1136 static char *regnext(register char *p)
1137 {
1138     register int    offset;
1139 
1140     if (p == &regdummy)
1141 	return ((char *)NULL);
1142 
1143     offset = NEXT(p);
1144     if (offset == 0)
1145 	return ((char *)NULL);
1146 
1147     if (OP(p) == BACK)
1148 	return (p - offset);
1149     else
1150 	return (p + offset);
1151 }
1152 
1153 #ifdef DEBUG
1154 
1155 STATIC char    *regprop();
1156 
1157 /*
1158  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1159  */
regdump(regexp * r)1160 void regdump(regexp *r)
1161 {
1162     register char  *s;
1163     register char   op = EXACTLY;	/* Arbitrary non-END op. */
1164     register char  *nxt;
1165 
1166     s = r->program + 1;
1167     while (op != END) {		/* While that wasn't END last time... */
1168 	op = OP(s);
1169 	printf("%2ld%s", (long)(s - r->program), regprop(s));	/* Where, what. */
1170 	nxt = regnext(s);
1171 	if (nxt == (char *)NULL)	/* nxt ptr. */
1172 	    printf("(0)");
1173 	else
1174 	    printf("(%ld)", (long)( (s - r->program) + (nxt - s)));
1175 	s += 3;
1176 	if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1177 	    /* Literal string, where present. */
1178 	    while (*s != '\0') {
1179 		putchar(*s);
1180 		s++;
1181 	    }
1182 	    s++;
1183 	}
1184 	putchar('\n');
1185     }
1186 
1187     /* Header fields of interest. */
1188     if (r->regstart != '\0')
1189 	printf("start `%c' ", r->regstart);
1190     if (r->reganch)
1191 	printf("anchored ");
1192     if (r->regmust != (char *)NULL)
1193 	printf("must have \"%s\"", r->regmust);
1194     printf("\n");
1195 }
1196 
1197 /*
1198  - regprop - printable representation of opcode
1199  */
1200 
regprop(char * op)1201 static char *regprop(char *op)
1202 {
1203     register char  *p;
1204     static char     buf[50];
1205 
1206     strcpy(buf, ":");
1207 
1208     switch (OP(op)) {
1209     case BOL:
1210 	p = "BOL";
1211 	break;
1212     case EOL:
1213 	p = "EOL";
1214 	break;
1215     case ANY:
1216 	p = "ANY";
1217 	break;
1218     case ANYOF:
1219 	p = "ANYOF";
1220 	break;
1221     case ANYBUT:
1222 	p = "ANYBUT";
1223 	break;
1224     case BRANCH:
1225 	p = "BRANCH";
1226 	break;
1227     case EXACTLY:
1228 	p = "EXACTLY";
1229 	break;
1230     case NOTHING:
1231 	p = "NOTHING";
1232 	break;
1233     case BACK:
1234 	p = "BACK";
1235 	break;
1236     case END:
1237 	p = "END";
1238 	break;
1239     case OPEN + 1:
1240     case OPEN + 2:
1241     case OPEN + 3:
1242     case OPEN + 4:
1243     case OPEN + 5:
1244     case OPEN + 6:
1245     case OPEN + 7:
1246     case OPEN + 8:
1247     case OPEN + 9:
1248 	sprintf(buf + strlen(buf), "OPEN%d", OP(op) - OPEN);
1249 	p = (char *)NULL;
1250 	break;
1251     case CLOSE + 1:
1252     case CLOSE + 2:
1253     case CLOSE + 3:
1254     case CLOSE + 4:
1255     case CLOSE + 5:
1256     case CLOSE + 6:
1257     case CLOSE + 7:
1258     case CLOSE + 8:
1259     case CLOSE + 9:
1260 	sprintf(buf + strlen(buf), "CLOSE%d", OP(op) - CLOSE);
1261 	p = (char *)NULL;
1262 	break;
1263     case STAR:
1264 	p = "STAR";
1265 	break;
1266     default:
1267 	regerror("corrupted opcode");
1268 	p=(char *)NULL;
1269 	break;
1270     }
1271     if (p != (char *)NULL)
1272 	strcat(buf, p);
1273     return (buf);
1274 }
1275 #endif
1276 
1277 /*
1278  - regsub - perform substitutions after a regexp match
1279  */
1280 
regsub(regexp * prog,char * source,char * dest,int n)1281 char *regsub(regexp *prog, char *source, char *dest, int n)
1282 {
1283     register char  *src;
1284     register char  *dst;
1285     register char   c;
1286     register int    no;
1287     register int    len;
1288     extern char    *strncpy();
1289 
1290     if (prog == (regexp *)NULL ||
1291 	source == (char *)NULL || dest == (char *)NULL) {
1292 	regerror("NULL parm to regsub");
1293 	return NULL;
1294     }
1295     if (UCHARAT(prog->program) != MAGIC) {
1296 	regerror("damaged regexp fed to regsub");
1297 	return NULL;
1298     }
1299     src = source;
1300     dst = dest;
1301     while ((c = *src++) != '\0') {
1302 	if (c == '&')
1303 	    no = 0;
1304 	else if (c == '\\' && '0' <= *src && *src <= '9')
1305 	    no = *src++ - '0';
1306 	else
1307 	    no = -1;
1308 
1309 	if (no < 0) {		/* Ordinary character. */
1310 	    if (c == '\\' && (*src == '\\' || *src == '&'))
1311 		c = *src++;
1312 	    if (--n < 0) {				/* amylaar */
1313 		regerror("line too long");
1314 		return NULL;
1315 	    }
1316 	    *dst++ = c;
1317 	} else if (prog->startp[no] != (char *)NULL &&
1318 		   prog->endp[no] != (char *)NULL) {
1319 	    len = prog->endp[no] - prog->startp[no];
1320 	    if ( (n-=len) < 0 ) {		/* amylaar */
1321 		regerror("line too long");
1322 		return NULL;
1323 	    }
1324 	    strncpy(dst, prog->startp[no], len);
1325 	    dst += len;
1326 	    if (len != 0 && *(dst - 1) == '\0') {	/* strncpy hit NUL. */
1327 		regerror("damaged match string");
1328 		return NULL;
1329 	    }
1330 	}
1331     }
1332     if (--n < 0) {			/* amylaar */
1333     	regerror("line too long");
1334     	return NULL;
1335     }
1336     *dst = '\0';
1337     return dst;
1338 }
1339 
1340 
1341 #if 0	/* Use the local regerror() in ed.c */
1342 
1343 void regerror(char *s)
1344 {
1345     fprintf(stderr, "regexp(3): %s", s);
1346     exit(1);
1347 }
1348 #endif /* 0 */
1349