1 /*
2  * regcomp and regexec -- regsub and regerror are elsewhere
3  *
4  *  Copyright (c) 1986 by University of Toronto.
5  *  Written by Henry Spencer.  Not derived from licensed software.
6  *
7  *  Permission is granted to anyone to use this software for any
8  *  purpose on any computer system, and to redistribute it freely,
9  *  subject to the following restrictions:
10  *
11  *  1. The author is not responsible for the consequences of use of
12  *      this software, no matter how awful, even if they arise
13  *      from defects in it.
14  *
15  *  2. The origin of this software must not be misrepresented, either
16  *      by explicit claim or by omission.
17  *
18  *  3. Altered versions must be plainly marked as such, and must not
19  *      be misrepresented as being the original software.
20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22  *** to assist in implementing egrep.
23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25  *** as in BSD grep and ex.
26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
32  *** was moved into regexp.h, and the include of regexp.h now uses "'s
33  *** to avoid conflicting with the system regexp.h.  Const, bless its
34  *** soul, was removed so it can compile everywhere.  The declaration
35  *** of strchr() was in conflict on AIX, so it was removed (as it is
36  *** happily defined in string.h).
37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39  *
40  * Beware that some of this code is subtly aware of the way operator precedence
41  * is structured in regular expressions. Serious changes in regular-expression
42  * syntax might require a total rethink.
43  */
44 
45 
46 #include "jam.h"
47 #include "regexp.h"
48 #include "output.h"
49 
50 #include <stdio.h>
51 #include <ctype.h>
52 #ifndef ultrix
53 # include <stdlib.h>
54 #endif
55 #include <string.h>
56 
57 
58 /*
59  * The "internal use only" fields in regexp.h are present to pass info from
60  * compile to execute that permits the execute phase to run lots faster on
61  * simple cases. They are:
62  :
63  *  regstart char that must begin a match; '\0' if none obvious.
64  *  reganch  is the match anchored (at beginning-of-line only)?
65  *  regmust  string (pointer into program) that match must include, or NULL.
66  *  regmlen  length of regmust string.
67  *
68  * Regstart and reganch permit very fast decisions on suitable starting points
69  * for a match, cutting down the work a lot.  Regmust permits fast rejection of
70  * lines that cannot possibly match.  The regmust tests are costly enough that
71  * regcomp() supplies a regmust only if the r.e. contains something potentially
72  * expensive (at present, the only such thing detected is * or + at the start of
73  * the r.e., which can involve a lot of backup). Regmlen is supplied because the
74  * test in regexec() needs it and regcomp() is computing it anyway.
75  */
76 
77 /*
78  * Structure for regexp "program". This is essentially a linear encoding of a
79  * nondeterministic finite-state machine (aka syntax charts or "railroad normal
80  * form" in parsing technology). Each node is an opcode plus a "next" pointer,
81  * possibly plus an operand. "Next" pointers of all nodes except BRANCH
82  * implement concatenation; a "next" pointer with a BRANCH on both ends of it is
83  * connecting two alternatives. [Here we have one of the subtle syntax
84  * dependencies: an individual BRANCH, as opposed to a collection of them, is
85  * never concatenated with anything because of operator precedence.] The operand
86  * of some types of node is a literal string; for others, it is a node leading
87  * into a sub-FSM. In particular, the operand of a BRANCH node is the first node
88  * of the branch. [NB this is *not* a tree structure: the tail of the branch
89  * connects to the thing following the set of BRANCHes.] The opcodes are:
90  */
91 
92 /* definition   number  opnd?   meaning */
93 #define END 0   /* no   End of program. */
94 #define BOL 1   /* no   Match "" at beginning of line. */
95 #define EOL 2   /* no   Match "" at end of line. */
96 #define ANY 3   /* no   Match any one character. */
97 #define ANYOF   4   /* str  Match any character in this string. */
98 #define ANYBUT  5   /* str  Match any character not in this string. */
99 #define BRANCH  6   /* node Match this alternative, or the next... */
100 #define BACK    7   /* no   Match "", "next" ptr points backward. */
101 #define EXACTLY 8   /* str  Match this string. */
102 #define NOTHING 9   /* no   Match empty string. */
103 #define STAR    10  /* node Match this (simple) thing 0 or more times. */
104 #define PLUS    11  /* node Match this (simple) thing 1 or more times. */
105 #define WORDA   12  /* no   Match "" at wordchar, where prev is nonword */
106 #define WORDZ   13  /* no   Match "" at nonwordchar, where prev is word */
107 #define OPEN    20  /* no   Mark this point in input as start of #n. */
108             /*  OPEN+1 is number 1, etc. */
109 #define CLOSE   30  /* no   Analogous to OPEN. */
110 
111 
112 /*
113  * Opcode notes:
114  *
115  * BRANCH   The set of branches constituting a single choice are hooked
116  *      together with their "next" pointers, since precedence prevents
117  *      anything being concatenated to any individual branch.  The
118  *      "next" pointer of the last BRANCH in a choice points to the
119  *      thing following the whole choice.  This is also where the
120  *      final "next" pointer of each individual branch points; each
121  *      branch starts with the operand node of a BRANCH node.
122  *
123  * BACK     Normal "next" pointers all implicitly point forward; BACK
124  *      exists to make loop structures possible.
125  *
126  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
127  *      BRANCH structures using BACK.  Simple cases (one character
128  *      per match) are implemented with STAR and PLUS for speed
129  *      and to minimize recursive plunges.
130  *
131  * OPEN,CLOSE   ...are numbered at compile time.
132  */
133 
134 /*
135  * A node is one char of opcode followed by two chars of "next" pointer.
136  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
137  * value is a positive offset from the opcode of the node containing it.
138  * An operand, if any, simply follows the node.  (Note that much of the
139  * code generation knows about this implicit relationship.)
140  *
141  * Using two bytes for the "next" pointer is vast overkill for most things,
142  * but allows patterns to get big without disasters.
143  */
144 #define OP(p)   (*(p))
145 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
146 #define OPERAND(p)  ((p) + 3)
147 
148 /*
149  * See regmagic.h for one further detail of program structure.
150  */
151 
152 
153 /*
154  * Utility definitions.
155  */
156 #ifndef CHARBITS
157 #define UCHARAT(p)  ((int)*(const unsigned char *)(p))
158 #else
159 #define UCHARAT(p)  ((int)*(p)&CHARBITS)
160 #endif
161 
162 #define FAIL(m) { regerror(m); return(NULL); }
163 #define ISMULT(c)   ((c) == '*' || (c) == '+' || (c) == '?')
164 
165 /*
166  * Flags to be passed up and down.
167  */
168 #define HASWIDTH    01  /* Known never to match null string. */
169 #define SIMPLE      02  /* Simple enough to be STAR/PLUS operand. */
170 #define SPSTART     04  /* Starts with * or +. */
171 #define WORST       0   /* Worst case. */
172 
173 /*
174  * Global work variables for regcomp().
175  */
176 static char *regparse;      /* Input-scan pointer. */
177 static int regnpar;     /* () count. */
178 static char regdummy;
179 static char *regcode;       /* Code-emit pointer; &regdummy = don't. */
180 static long regsize;        /* Code size. */
181 
182 /*
183  * Forward declarations for regcomp()'s friends.
184  */
185 #ifndef STATIC
186 #define STATIC  static
187 #endif
188 STATIC char *reg( int paren, int *flagp );
189 STATIC char *regbranch( int *flagp );
190 STATIC char *regpiece( int *flagp );
191 STATIC char *regatom( int *flagp );
192 STATIC char *regnode( int op );
193 STATIC char *regnext( char *p );
194 STATIC void regc( int b );
195 STATIC void reginsert( char op, char *opnd );
196 STATIC void regtail( char *p, char *val );
197 STATIC void regoptail( char *p, char *val );
198 #ifdef STRCSPN
199 STATIC int strcspn();
200 #endif
201 
202 /*
203  - regcomp - compile a regular expression into internal code
204  *
205  * We can't allocate space until we know how big the compiled form will be,
206  * but we can't compile it (and thus know how big it is) until we've got a
207  * place to put the code.  So we cheat:  we compile it twice, once with code
208  * generation turned off and size counting turned on, and once "for real".
209  * This also means that we don't allocate space until we are sure that the
210  * thing really will compile successfully, and we never have to move the
211  * code and thus invalidate pointers into it.  (Note that it has to be in
212  * one piece because free() must be able to free it all.)
213  *
214  * Beware that the optimization-preparation code in here knows about some
215  * of the structure of the compiled regexp.
216  */
217 regexp *
regcomp(const char * exp)218 regcomp( const char *exp )
219 {
220     regexp *r;
221     char *scan;
222     char *longest;
223     unsigned len;
224     int flags;
225 
226     if (exp == NULL)
227         FAIL("NULL argument");
228 
229     /* First pass: determine size, legality. */
230 #ifdef notdef
231     if (exp[0] == '.' && exp[1] == '*') exp += 2;  /* aid grep */
232 #endif
233     regparse = (char *)exp;
234     regnpar = 1;
235     regsize = 0L;
236     regcode = &regdummy;
237     regc(MAGIC);
238     if (reg(0, &flags) == NULL)
239         return(NULL);
240 
241     /* Small enough for pointer-storage convention? */
242     if (regsize >= 32767L)      /* Probably could be 65535L. */
243         FAIL("regexp too big");
244 
245     /* Allocate space. */
246     r = (regexp *)BJAM_MALLOC(sizeof(regexp) + (unsigned)regsize);
247     if (r == NULL)
248         FAIL("out of space");
249 
250     /* Second pass: emit code. */
251     regparse = (char *)exp;
252     regnpar = 1;
253     regcode = r->program;
254     regc(MAGIC);
255     if (reg(0, &flags) == NULL)
256         return(NULL);
257 
258     /* Dig out information for optimizations. */
259     r->regstart = '\0'; /* Worst-case defaults. */
260     r->reganch = 0;
261     r->regmust = NULL;
262     r->regmlen = 0;
263     scan = r->program+1;            /* First BRANCH. */
264     if (OP(regnext(scan)) == END) {     /* Only one top-level choice. */
265         scan = OPERAND(scan);
266 
267         /* Starting-point info. */
268         if (OP(scan) == EXACTLY)
269             r->regstart = *OPERAND(scan);
270         else if (OP(scan) == BOL)
271             r->reganch++;
272 
273         /*
274          * If there's something expensive in the r.e., find the
275          * longest literal string that must appear and make it the
276          * regmust.  Resolve ties in favor of later strings, since
277          * the regstart check works with the beginning of the r.e.
278          * and avoiding duplication strengthens checking.  Not a
279          * strong reason, but sufficient in the absence of others.
280          */
281         if (flags&SPSTART) {
282             longest = NULL;
283             len = 0;
284             for (; scan != NULL; scan = regnext(scan))
285                 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
286                     longest = OPERAND(scan);
287                     len = strlen(OPERAND(scan));
288                 }
289             r->regmust = longest;
290             r->regmlen = len;
291         }
292     }
293 
294     return(r);
295 }
296 
297 /*
298  - reg - regular expression, i.e. main body or parenthesized thing
299  *
300  * Caller must absorb opening parenthesis.
301  *
302  * Combining parenthesis handling with the base level of regular expression
303  * is a trifle forced, but the need to tie the tails of the branches to what
304  * follows makes it hard to avoid.
305  */
306 static char *
reg(int paren,int * flagp)307 reg(
308     int paren,          /* Parenthesized? */
309     int *flagp )
310 {
311     char *ret;
312     char *br;
313     char *ender;
314     int parno = 0;
315     int flags;
316 
317     *flagp = HASWIDTH;  /* Tentatively. */
318 
319     /* Make an OPEN node, if parenthesized. */
320     if (paren) {
321         if (regnpar >= NSUBEXP)
322             FAIL("too many ()");
323         parno = regnpar;
324         regnpar++;
325         ret = regnode(OPEN+parno);
326     } else
327         ret = NULL;
328 
329     /* Pick up the branches, linking them together. */
330     br = regbranch(&flags);
331     if (br == NULL)
332         return(NULL);
333     if (ret != NULL)
334         regtail(ret, br);   /* OPEN -> first. */
335     else
336         ret = br;
337     if (!(flags&HASWIDTH))
338         *flagp &= ~HASWIDTH;
339     *flagp |= flags&SPSTART;
340     while (*regparse == '|' || *regparse == '\n') {
341         regparse++;
342         br = regbranch(&flags);
343         if (br == NULL)
344             return(NULL);
345         regtail(ret, br);   /* BRANCH -> BRANCH. */
346         if (!(flags&HASWIDTH))
347             *flagp &= ~HASWIDTH;
348         *flagp |= flags&SPSTART;
349     }
350 
351     /* Make a closing node, and hook it on the end. */
352     ender = regnode((paren) ? CLOSE+parno : END);
353     regtail(ret, ender);
354 
355     /* Hook the tails of the branches to the closing node. */
356     for (br = ret; br != NULL; br = regnext(br))
357         regoptail(br, ender);
358 
359     /* Check for proper termination. */
360     if (paren && *regparse++ != ')') {
361         FAIL("unmatched ()");
362     } else if (!paren && *regparse != '\0') {
363         if (*regparse == ')') {
364             FAIL("unmatched ()");
365         } else
366             FAIL("junk on end");    /* "Can't happen". */
367         /* NOTREACHED */
368     }
369 
370     return(ret);
371 }
372 
373 /*
374  - regbranch - one alternative of an | operator
375  *
376  * Implements the concatenation operator.
377  */
378 static char *
regbranch(int * flagp)379 regbranch( int *flagp )
380 {
381     char *ret;
382     char *chain;
383     char *latest;
384     int flags;
385 
386     *flagp = WORST;     /* Tentatively. */
387 
388     ret = regnode(BRANCH);
389     chain = NULL;
390     while (*regparse != '\0' && *regparse != ')' &&
391            *regparse != '\n' && *regparse != '|') {
392         latest = regpiece(&flags);
393         if (latest == NULL)
394             return(NULL);
395         *flagp |= flags&HASWIDTH;
396         if (chain == NULL)  /* First piece. */
397             *flagp |= flags&SPSTART;
398         else
399             regtail(chain, latest);
400         chain = latest;
401     }
402     if (chain == NULL)  /* Loop ran zero times. */
403         (void) regnode(NOTHING);
404 
405     return(ret);
406 }
407 
408 /*
409  - regpiece - something followed by possible [*+?]
410  *
411  * Note that the branching code sequences used for ? and the general cases
412  * of * and + are somewhat optimized:  they use the same NOTHING node as
413  * both the endmarker for their branch list and the body of the last branch.
414  * It might seem that this node could be dispensed with entirely, but the
415  * endmarker role is not redundant.
416  */
417 static char *
regpiece(int * flagp)418 regpiece( int *flagp )
419 {
420     char *ret;
421     char op;
422     char *next;
423     int flags;
424 
425     ret = regatom(&flags);
426     if (ret == NULL)
427         return(NULL);
428 
429     op = *regparse;
430     if (!ISMULT(op)) {
431         *flagp = flags;
432         return(ret);
433     }
434 
435     if (!(flags&HASWIDTH) && op != '?')
436         FAIL("*+ operand could be empty");
437     *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
438 
439     if (op == '*' && (flags&SIMPLE))
440         reginsert(STAR, ret);
441     else if (op == '*') {
442         /* Emit x* as (x&|), where & means "self". */
443         reginsert(BRANCH, ret);         /* Either x */
444         regoptail(ret, regnode(BACK));      /* and loop */
445         regoptail(ret, ret);            /* back */
446         regtail(ret, regnode(BRANCH));      /* or */
447         regtail(ret, regnode(NOTHING));     /* null. */
448     } else if (op == '+' && (flags&SIMPLE))
449         reginsert(PLUS, ret);
450     else if (op == '+') {
451         /* Emit x+ as x(&|), where & means "self". */
452         next = regnode(BRANCH);         /* Either */
453         regtail(ret, next);
454         regtail(regnode(BACK), ret);        /* loop back */
455         regtail(next, regnode(BRANCH));     /* or */
456         regtail(ret, regnode(NOTHING));     /* null. */
457     } else if (op == '?') {
458         /* Emit x? as (x|) */
459         reginsert(BRANCH, ret);         /* Either x */
460         regtail(ret, regnode(BRANCH));      /* or */
461         next = regnode(NOTHING);        /* null. */
462         regtail(ret, next);
463         regoptail(ret, next);
464     }
465     regparse++;
466     if (ISMULT(*regparse))
467         FAIL("nested *?+");
468 
469     return(ret);
470 }
471 
472 /*
473  - regatom - the lowest level
474  *
475  * Optimization:  gobbles an entire sequence of ordinary characters so that
476  * it can turn them into a single node, which is smaller to store and
477  * faster to run.  Backslashed characters are exceptions, each becoming a
478  * separate node; the code is simpler that way and it's not worth fixing.
479  */
480 static char *
regatom(int * flagp)481 regatom( int *flagp )
482 {
483     char *ret;
484     int flags;
485 
486     *flagp = WORST;     /* Tentatively. */
487 
488     switch (*regparse++) {
489     /* FIXME: these chars only have meaning at beg/end of pat? */
490     case '^':
491         ret = regnode(BOL);
492         break;
493     case '$':
494         ret = regnode(EOL);
495         break;
496     case '.':
497         ret = regnode(ANY);
498         *flagp |= HASWIDTH|SIMPLE;
499         break;
500     case '[': {
501             int classr;
502             int classend;
503 
504             if (*regparse == '^') { /* Complement of range. */
505                 ret = regnode(ANYBUT);
506                 regparse++;
507             } else
508                 ret = regnode(ANYOF);
509             if (*regparse == ']' || *regparse == '-')
510                 regc(*regparse++);
511             while (*regparse != '\0' && *regparse != ']') {
512                 if (*regparse == '-') {
513                     regparse++;
514                     if (*regparse == ']' || *regparse == '\0')
515                         regc('-');
516                     else {
517                         classr = UCHARAT(regparse-2)+1;
518                         classend = UCHARAT(regparse);
519                         if (classr > classend+1)
520                             FAIL("invalid [] range");
521                         for (; classr <= classend; classr++)
522                             regc(classr);
523                         regparse++;
524                     }
525                 } else
526                     regc(*regparse++);
527             }
528             regc('\0');
529             if (*regparse != ']')
530                 FAIL("unmatched []");
531             regparse++;
532             *flagp |= HASWIDTH|SIMPLE;
533         }
534         break;
535     case '(':
536         ret = reg(1, &flags);
537         if (ret == NULL)
538             return(NULL);
539         *flagp |= flags&(HASWIDTH|SPSTART);
540         break;
541     case '\0':
542     case '|':
543     case '\n':
544     case ')':
545         FAIL("internal urp");   /* Supposed to be caught earlier. */
546         break;
547     case '?':
548     case '+':
549     case '*':
550         FAIL("?+* follows nothing");
551         break;
552     case '\\':
553         switch (*regparse++) {
554         case '\0':
555             FAIL("trailing \\");
556             break;
557         case '<':
558             ret = regnode(WORDA);
559             break;
560         case '>':
561             ret = regnode(WORDZ);
562             break;
563         /* FIXME: Someday handle \1, \2, ... */
564         default:
565             /* Handle general quoted chars in exact-match routine */
566             goto de_fault;
567         }
568         break;
569     de_fault:
570     default:
571         /*
572          * Encode a string of characters to be matched exactly.
573          *
574          * This is a bit tricky due to quoted chars and due to
575          * '*', '+', and '?' taking the SINGLE char previous
576          * as their operand.
577          *
578          * On entry, the char at regparse[-1] is going to go
579          * into the string, no matter what it is.  (It could be
580          * following a \ if we are entered from the '\' case.)
581          *
582          * Basic idea is to pick up a good char in  ch  and
583          * examine the next char.  If it's *+? then we twiddle.
584          * If it's \ then we frozzle.  If it's other magic char
585          * we push  ch  and terminate the string.  If none of the
586          * above, we push  ch  on the string and go around again.
587          *
588          *  regprev  is used to remember where "the current char"
589          * starts in the string, if due to a *+? we need to back
590          * up and put the current char in a separate, 1-char, string.
591          * When  regprev  is NULL,  ch  is the only char in the
592          * string; this is used in *+? handling, and in setting
593          * flags |= SIMPLE at the end.
594          */
595         {
596             char *regprev;
597             char ch;
598 
599             regparse--;         /* Look at cur char */
600             ret = regnode(EXACTLY);
601             for ( regprev = 0 ; ; ) {
602                 ch = *regparse++;   /* Get current char */
603                 switch (*regparse) {    /* look at next one */
604 
605                 default:
606                     regc(ch);   /* Add cur to string */
607                     break;
608 
609                 case '.': case '[': case '(':
610                 case ')': case '|': case '\n':
611                 case '$': case '^':
612                 case '\0':
613                 /* FIXME, $ and ^ should not always be magic */
614                 magic:
615                     regc(ch);   /* dump cur char */
616                     goto done;  /* and we are done */
617 
618                 case '?': case '+': case '*':
619                     if (!regprev)   /* If just ch in str, */
620                         goto magic; /* use it */
621                     /* End mult-char string one early */
622                     regparse = regprev; /* Back up parse */
623                     goto done;
624 
625                 case '\\':
626                     regc(ch);   /* Cur char OK */
627                     switch (regparse[1]){ /* Look after \ */
628                     case '\0':
629                     case '<':
630                     case '>':
631                     /* FIXME: Someday handle \1, \2, ... */
632                         goto done; /* Not quoted */
633                     default:
634                         /* Backup point is \, scan                           * point is after it. */
635                         regprev = regparse;
636                         regparse++;
637                         continue;   /* NOT break; */
638                     }
639                 }
640                 regprev = regparse; /* Set backup point */
641             }
642         done:
643             regc('\0');
644             *flagp |= HASWIDTH;
645             if (!regprev)       /* One char? */
646                 *flagp |= SIMPLE;
647         }
648         break;
649     }
650 
651     return(ret);
652 }
653 
654 /*
655  - regnode - emit a node
656  */
657 static char *           /* Location. */
regnode(int op)658 regnode( int op )
659 {
660     char *ret;
661     char *ptr;
662 
663     ret = regcode;
664     if (ret == &regdummy) {
665         regsize += 3;
666         return(ret);
667     }
668 
669     ptr = ret;
670     *ptr++ = op;
671     *ptr++ = '\0';      /* Null "next" pointer. */
672     *ptr++ = '\0';
673     regcode = ptr;
674 
675     return(ret);
676 }
677 
678 /*
679  - regc - emit (if appropriate) a byte of code
680  */
681 static void
regc(int b)682 regc( int b )
683 {
684     if (regcode != &regdummy)
685         *regcode++ = b;
686     else
687         regsize++;
688 }
689 
690 /*
691  - reginsert - insert an operator in front of already-emitted operand
692  *
693  * Means relocating the operand.
694  */
695 static void
reginsert(char op,char * opnd)696 reginsert(
697     char op,
698     char *opnd )
699 {
700     char *src;
701     char *dst;
702     char *place;
703 
704     if (regcode == &regdummy) {
705         regsize += 3;
706         return;
707     }
708 
709     src = regcode;
710     regcode += 3;
711     dst = regcode;
712     while (src > opnd)
713         *--dst = *--src;
714 
715     place = opnd;       /* Op node, where operand used to be. */
716     *place++ = op;
717     *place++ = '\0';
718     *place++ = '\0';
719 }
720 
721 /*
722  - regtail - set the next-pointer at the end of a node chain
723  */
724 static void
regtail(char * p,char * val)725 regtail(
726     char *p,
727     char *val )
728 {
729     char *scan;
730     char *temp;
731     int offset;
732 
733     if (p == &regdummy)
734         return;
735 
736     /* Find last node. */
737     scan = p;
738     for (;;) {
739         temp = regnext(scan);
740         if (temp == NULL)
741             break;
742         scan = temp;
743     }
744 
745     if (OP(scan) == BACK)
746         offset = scan - val;
747     else
748         offset = val - scan;
749     *(scan+1) = (offset>>8)&0377;
750     *(scan+2) = offset&0377;
751 }
752 
753 /*
754  - regoptail - regtail on operand of first argument; nop if operandless
755  */
756 
757 static void
regoptail(char * p,char * val)758 regoptail(
759     char *p,
760     char *val )
761 {
762     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
763     if (p == NULL || p == &regdummy || OP(p) != BRANCH)
764         return;
765     regtail(OPERAND(p), val);
766 }
767 
768 /*
769  * regexec and friends
770  */
771 
772 /*
773  * Global work variables for regexec().
774  */
775 static const char *reginput;      /* String-input pointer. */
776 static const char *regbol;        /* Beginning of input, for ^ check. */
777 static const char **regstartp;    /* Pointer to startp array. */
778 static const char **regendp;      /* Ditto for endp. */
779 
780 /*
781  * Forwards.
782  */
783 STATIC int regtry( regexp *prog, const char *string );
784 STATIC int regmatch( char *prog );
785 STATIC int regrepeat( char *p );
786 
787 #ifdef DEBUG
788 int regnarrate = 0;
789 void regdump();
790 STATIC char *regprop();
791 #endif
792 
793 /*
794  - regexec - match a regexp against a string
795  */
796 int
regexec(regexp * prog,const char * string)797 regexec(
798     regexp *prog,
799     const char *string )
800 {
801     char *s;
802 
803     /* Be paranoid... */
804     if (prog == NULL || string == NULL) {
805         regerror("NULL parameter");
806         return(0);
807     }
808 
809     /* Check validity of program. */
810     if (UCHARAT(prog->program) != MAGIC) {
811         regerror("corrupted program");
812         return(0);
813     }
814 
815     /* If there is a "must appear" string, look for it. */
816     if ( prog->regmust != NULL )
817     {
818         s = (char *)string;
819         while ( ( s = strchr( s, prog->regmust[ 0 ] ) ) != NULL )
820         {
821             if ( !strncmp( s, prog->regmust, prog->regmlen ) )
822                 break;  /* Found it. */
823             ++s;
824         }
825         if ( s == NULL )  /* Not present. */
826             return 0;
827     }
828 
829     /* Mark beginning of line for ^ . */
830     regbol = (char *)string;
831 
832     /* Simplest case:  anchored match need be tried only once. */
833     if ( prog->reganch )
834         return regtry( prog, string );
835 
836     /* Messy cases:  unanchored match. */
837     s = (char *)string;
838     if (prog->regstart != '\0')
839         /* We know what char it must start with. */
840         while ((s = strchr(s, prog->regstart)) != NULL) {
841             if (regtry(prog, s))
842                 return(1);
843             s++;
844         }
845     else
846         /* We do not -- general case. */
847         do {
848             if ( regtry( prog, s ) )
849                 return( 1 );
850         } while ( *s++ != '\0' );
851 
852     /* Failure. */
853     return 0;
854 }
855 
856 
857 /*
858  * regtry() - try match at specific point.
859  */
860 
861 static int          /* 0 failure, 1 success */
regtry(regexp * prog,const char * string)862 regtry(
863     regexp *prog,
864     const char *string )
865 {
866     int i;
867     const char * * sp;
868     const char * * ep;
869 
870     reginput = string;
871     regstartp = prog->startp;
872     regendp = prog->endp;
873 
874     sp = prog->startp;
875     ep = prog->endp;
876     for ( i = NSUBEXP; i > 0; --i )
877     {
878         *sp++ = NULL;
879         *ep++ = NULL;
880     }
881     if ( regmatch( prog->program + 1 ) )
882     {
883         prog->startp[ 0 ] = string;
884         prog->endp[ 0 ] = reginput;
885         return 1;
886     }
887     else
888         return 0;
889 }
890 
891 
892 /*
893  * regmatch() - main matching routine.
894  *
895  * Conceptually the strategy is simple: check to see whether the current node
896  * matches, call self recursively to see whether the rest matches, and then act
897  * accordingly. In practice we make some effort to avoid recursion, in
898  * particular by going through "ordinary" nodes (that do not need to know
899  * whether the rest of the match failed) by a loop instead of by recursion.
900  */
901 
902 static int          /* 0 failure, 1 success */
regmatch(char * prog)903 regmatch( char * prog )
904 {
905     char * scan;  /* Current node. */
906     char * next;  /* Next node. */
907 
908     scan = prog;
909 #ifdef DEBUG
910     if (scan != NULL && regnarrate)
911         err_printf("%s(\n", regprop(scan));
912 #endif
913     while (scan != NULL) {
914 #ifdef DEBUG
915         if (regnarrate)
916             err_printf("%s...\n", regprop(scan));
917 #endif
918         next = regnext(scan);
919 
920         switch (OP(scan)) {
921         case BOL:
922             if (reginput != regbol)
923                 return(0);
924             break;
925         case EOL:
926             if (*reginput != '\0')
927                 return(0);
928             break;
929         case WORDA:
930             /* Must be looking at a letter, digit, or _ */
931             if ((!isalnum(*reginput)) && *reginput != '_')
932                 return(0);
933             /* Prev must be BOL or nonword */
934             if (reginput > regbol &&
935                 (isalnum(reginput[-1]) || reginput[-1] == '_'))
936                 return(0);
937             break;
938         case WORDZ:
939             /* Must be looking at non letter, digit, or _ */
940             if (isalnum(*reginput) || *reginput == '_')
941                 return(0);
942             /* We don't care what the previous char was */
943             break;
944         case ANY:
945             if (*reginput == '\0')
946                 return(0);
947             reginput++;
948             break;
949         case EXACTLY: {
950                 int len;
951                 char *opnd;
952 
953                 opnd = OPERAND(scan);
954                 /* Inline the first character, for speed. */
955                 if (*opnd != *reginput)
956                     return(0);
957                 len = strlen(opnd);
958                 if (len > 1 && strncmp(opnd, reginput, len) != 0)
959                     return(0);
960                 reginput += len;
961             }
962             break;
963         case ANYOF:
964             if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
965                 return(0);
966             reginput++;
967             break;
968         case ANYBUT:
969             if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
970                 return(0);
971             reginput++;
972             break;
973         case NOTHING:
974             break;
975         case BACK:
976             break;
977         case OPEN+1:
978         case OPEN+2:
979         case OPEN+3:
980         case OPEN+4:
981         case OPEN+5:
982         case OPEN+6:
983         case OPEN+7:
984         case OPEN+8:
985         case OPEN+9: {
986                 int no;
987                 const char *save;
988 
989                 no = OP(scan) - OPEN;
990                 save = reginput;
991 
992                 if (regmatch(next)) {
993                     /*
994                      * Don't set startp if some later
995                      * invocation of the same parentheses
996                      * already has.
997                      */
998                     if (regstartp[no] == NULL)
999                         regstartp[no] = save;
1000                     return(1);
1001                 } else
1002                     return(0);
1003             }
1004             break;
1005         case CLOSE+1:
1006         case CLOSE+2:
1007         case CLOSE+3:
1008         case CLOSE+4:
1009         case CLOSE+5:
1010         case CLOSE+6:
1011         case CLOSE+7:
1012         case CLOSE+8:
1013         case CLOSE+9: {
1014                 int no;
1015                 const char *save;
1016 
1017                 no = OP(scan) - CLOSE;
1018                 save = reginput;
1019 
1020                 if (regmatch(next)) {
1021                     /*
1022                      * Don't set endp if some later
1023                      * invocation of the same parentheses
1024                      * already has.
1025                      */
1026                     if (regendp[no] == NULL)
1027                         regendp[no] = save;
1028                     return(1);
1029                 } else
1030                     return(0);
1031             }
1032             break;
1033         case BRANCH: {
1034                 const char *save;
1035 
1036                 if (OP(next) != BRANCH)     /* No choice. */
1037                     next = OPERAND(scan);   /* Avoid recursion. */
1038                 else {
1039                     do {
1040                         save = reginput;
1041                         if (regmatch(OPERAND(scan)))
1042                             return(1);
1043                         reginput = save;
1044                         scan = regnext(scan);
1045                     } while (scan != NULL && OP(scan) == BRANCH);
1046                     return(0);
1047                     /* NOTREACHED */
1048                 }
1049             }
1050             break;
1051         case STAR:
1052         case PLUS: {
1053                 char nextch;
1054                 int no;
1055                 const char *save;
1056                 int min;
1057 
1058                 /*
1059                  * Lookahead to avoid useless match attempts
1060                  * when we know what character comes next.
1061                  */
1062                 nextch = '\0';
1063                 if (OP(next) == EXACTLY)
1064                     nextch = *OPERAND(next);
1065                 min = (OP(scan) == STAR) ? 0 : 1;
1066                 save = reginput;
1067                 no = regrepeat(OPERAND(scan));
1068                 while (no >= min) {
1069                     /* If it could work, try it. */
1070                     if (nextch == '\0' || *reginput == nextch)
1071                         if (regmatch(next))
1072                             return(1);
1073                     /* Couldn't or didn't -- back up. */
1074                     no--;
1075                     reginput = save + no;
1076                 }
1077                 return(0);
1078             }
1079             break;
1080         case END:
1081             return(1);  /* Success! */
1082             break;
1083         default:
1084             regerror("memory corruption");
1085             return(0);
1086             break;
1087         }
1088 
1089         scan = next;
1090     }
1091 
1092     /*
1093      * We get here only if there's trouble -- normally "case END" is
1094      * the terminating point.
1095      */
1096     regerror("corrupted pointers");
1097     return(0);
1098 }
1099 
1100 /*
1101  - regrepeat - repeatedly match something simple, report how many
1102  */
1103 static int
regrepeat(char * p)1104 regrepeat( char *p )
1105 {
1106     int count = 0;
1107     const char *scan;
1108     char *opnd;
1109 
1110     scan = reginput;
1111     opnd = OPERAND(p);
1112     switch (OP(p)) {
1113     case ANY:
1114         count = strlen(scan);
1115         scan += count;
1116         break;
1117     case EXACTLY:
1118         while (*opnd == *scan) {
1119             count++;
1120             scan++;
1121         }
1122         break;
1123     case ANYOF:
1124         while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1125             count++;
1126             scan++;
1127         }
1128         break;
1129     case ANYBUT:
1130         while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1131             count++;
1132             scan++;
1133         }
1134         break;
1135     default:        /* Oh dear.  Called inappropriately. */
1136         regerror("internal foulup");
1137         count = 0;  /* Best compromise. */
1138         break;
1139     }
1140     reginput = scan;
1141 
1142     return(count);
1143 }
1144 
1145 /*
1146  - regnext - dig the "next" pointer out of a node
1147  */
1148 static char *
regnext(char * p)1149 regnext( char *p )
1150 {
1151     int offset;
1152 
1153     if (p == &regdummy)
1154         return(NULL);
1155 
1156     offset = NEXT(p);
1157     if (offset == 0)
1158         return(NULL);
1159 
1160     if (OP(p) == BACK)
1161         return(p-offset);
1162     else
1163         return(p+offset);
1164 }
1165 
1166 #ifdef DEBUG
1167 
1168 STATIC char *regprop();
1169 
1170 /*
1171  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1172  */
1173 void
regdump(regexp * r)1174 regdump( regexp *r )
1175 {
1176     char *s;
1177     char op = EXACTLY; /* Arbitrary non-END op. */
1178     char *next;
1179 
1180 
1181     s = r->program + 1;
1182     while (op != END) { /* While that wasn't END last time... */
1183         op = OP(s);
1184         out_printf("%2d%s", s-r->program, regprop(s));  /* Where, what. */
1185         next = regnext(s);
1186         if (next == NULL)       /* Next ptr. */
1187             out_printf("(0)");
1188         else
1189             out_printf("(%d)", (s-r->program)+(next-s));
1190         s += 3;
1191         if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1192             /* Literal string, where present. */
1193             while (*s != '\0') {
1194                 out_putc(*s);
1195                 s++;
1196             }
1197             s++;
1198         }
1199         out_putc('\n');
1200     }
1201 
1202     /* Header fields of interest. */
1203     if (r->regstart != '\0')
1204         out_printf("start `%c' ", r->regstart);
1205     if (r->reganch)
1206         out_printf("anchored ");
1207     if (r->regmust != NULL)
1208         out_printf("must have \"%s\"", r->regmust);
1209     out_printf("\n");
1210 }
1211 
1212 /*
1213  - regprop - printable representation of opcode
1214  */
1215 static char *
regprop(char * op)1216 regprop( char *op )
1217 {
1218     char *p;
1219     static char buf[50];
1220 
1221     (void) strcpy(buf, ":");
1222 
1223     switch (OP(op)) {
1224     case BOL:
1225         p = "BOL";
1226         break;
1227     case EOL:
1228         p = "EOL";
1229         break;
1230     case ANY:
1231         p = "ANY";
1232         break;
1233     case ANYOF:
1234         p = "ANYOF";
1235         break;
1236     case ANYBUT:
1237         p = "ANYBUT";
1238         break;
1239     case BRANCH:
1240         p = "BRANCH";
1241         break;
1242     case EXACTLY:
1243         p = "EXACTLY";
1244         break;
1245     case NOTHING:
1246         p = "NOTHING";
1247         break;
1248     case BACK:
1249         p = "BACK";
1250         break;
1251     case END:
1252         p = "END";
1253         break;
1254     case OPEN+1:
1255     case OPEN+2:
1256     case OPEN+3:
1257     case OPEN+4:
1258     case OPEN+5:
1259     case OPEN+6:
1260     case OPEN+7:
1261     case OPEN+8:
1262     case OPEN+9:
1263         sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1264         p = NULL;
1265         break;
1266     case CLOSE+1:
1267     case CLOSE+2:
1268     case CLOSE+3:
1269     case CLOSE+4:
1270     case CLOSE+5:
1271     case CLOSE+6:
1272     case CLOSE+7:
1273     case CLOSE+8:
1274     case CLOSE+9:
1275         sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1276         p = NULL;
1277         break;
1278     case STAR:
1279         p = "STAR";
1280         break;
1281     case PLUS:
1282         p = "PLUS";
1283         break;
1284     case WORDA:
1285         p = "WORDA";
1286         break;
1287     case WORDZ:
1288         p = "WORDZ";
1289         break;
1290     default:
1291         regerror("corrupted opcode");
1292         break;
1293     }
1294     if (p != NULL)
1295         (void) strcat(buf, p);
1296     return(buf);
1297 }
1298 #endif
1299 
1300 /*
1301  * The following is provided for those people who do not have strcspn() in
1302  * their C libraries.  They should get off their butts and do something
1303  * about it; at least one public-domain implementation of those (highly
1304  * useful) string routines has been published on Usenet.
1305  */
1306 #ifdef STRCSPN
1307 /*
1308  * strcspn - find length of initial segment of s1 consisting entirely
1309  * of characters not from s2
1310  */
1311 
1312 static int
strcspn(char * s1,char * s2)1313 strcspn(
1314     char *s1,
1315     char *s2 )
1316 {
1317     char *scan1;
1318     char *scan2;
1319     int count;
1320 
1321     count = 0;
1322     for (scan1 = s1; *scan1 != '\0'; scan1++) {
1323         for (scan2 = s2; *scan2 != '\0';)   /* ++ moved down. */
1324             if (*scan1 == *scan2++)
1325                 return(count);
1326         count++;
1327     }
1328     return(count);
1329 }
1330 #endif
1331