1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
4
5 /*
6 * evr_regcomp and evr_regexec -- evr_regsub and evr_regerror are elsewhere
7 *
8 * Copyright (c) 1986 by University of Toronto.
9 * Written by Henry Spencer. Not derived from licensed software.
10 *
11 * Permission is granted to anyone to use this software for any
12 * purpose on any computer system, and to redistribute it freely,
13 * subject to the following restrictions:
14 *
15 * 1. The author is not responsible for the consequences of use of
16 * this software, no matter how awful, even if they arise
17 * from defects in it.
18 *
19 * 2. The origin of this software must not be misrepresented, either
20 * by explicit claim or by omission.
21 *
22 * 3. Altered versions must be plainly marked as such, and must not
23 * be misrepresented as being the original software.
24 *
25 * Beware that some of this code is subtly aware of the way operator
26 * precedence is structured in regular expressions. Serious changes in
27 * regular-expression syntax might require a total rethink.
28 *
29 * 1/18/2006 -- [ET] Renamed functions to prevent name clashes with
30 * other libraries.
31 */
32 #include <stdio.h>
33 #include <string.h>
34
35 #ifdef HAVE_MALLOC_H
36 #include <malloc.h> /* added 8/28/2001 -- [ET] */
37 #endif
38
39 #include "regexp.h"
40 #include "regmagic.h"
41
42 /*
43 * The "internal use only" fields in regexp.h are present to pass info from
44 * compile to execute that permits the execute phase to run lots faster on
45 * simple cases. They are:
46 *
47 * regstart char that must begin a match; '\0' if none obvious
48 * reganch is the match anchored (at beginning-of-line only)?
49 * regmust string (pointer into program) that match must include, or NULL
50 * regmlen length of regmust string
51 *
52 * Regstart and reganch permit very fast decisions on suitable starting points
53 * for a match, cutting down the work a lot. Regmust permits fast rejection
54 * of lines that cannot possibly match. The regmust tests are costly enough
55 * that evr_regcomp() supplies a regmust only if the r.e. contains something
56 * potentially expensive (at present, the only such thing detected is * or +
57 * at the start of the r.e., which can involve a lot of backup). Regmlen is
58 * supplied because the test in evr_regexec() needs it and evr_regcomp() is computing
59 * it anyway.
60 */
61
62 /*
63 * Structure for regexp "program". This is essentially a linear encoding
64 * of a nondeterministic finite-state machine (aka syntax charts or
65 * "railroad normal form" in parsing technology). Each node is an opcode
66 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
67 * all nodes except BRANCH implement concatenation; a "next" pointer with
68 * a BRANCH on both ends of it is connecting two alternatives. (Here we
69 * have one of the subtle syntax dependencies: an individual BRANCH (as
70 * opposed to a collection of them) is never concatenated with anything
71 * because of operator precedence.) The operand of some types of node is
72 * a literal string; for others, it is a node leading into a sub-FSM. In
73 * particular, the operand of a BRANCH node is the first node of the branch.
74 * (NB this is *not* a tree structure: the tail of the branch connects
75 * to the thing following the set of BRANCHes.) The opcodes are:
76 */
77
78 /* definition number opnd? meaning */
79 #define END 0 /* no End of program. */
80 #define BOL 1 /* no Match "" at beginning of line. */
81 #define EOL 2 /* no Match "" at end of line. */
82 #define ANY 3 /* no Match any one character. */
83 #define ANYOF 4 /* str Match any character in this string. */
84 #define ANYBUT 5 /* str Match any character not in this string. */
85 #define BRANCH 6 /* node Match this alternative, or the next... */
86 #define BACK 7 /* no Match "", "next" ptr points backward. */
87 #define EXACTLY 8 /* str Match this string. */
88 #define NOTHING 9 /* no Match empty string. */
89 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
90 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
91 #define OPEN 20 /* no Mark this point in input as start of #n. */
92 /* OPEN+1 is number 1, etc. */
93 #define CLOSE 30 /* no Analogous to OPEN. */
94
95 /*
96 * Opcode notes:
97 *
98 * BRANCH The set of branches constituting a single choice are hooked
99 * together with their "next" pointers, since precedence prevents
100 * anything being concatenated to any individual branch. The
101 * "next" pointer of the last BRANCH in a choice points to the
102 * thing following the whole choice. This is also where the
103 * final "next" pointer of each individual branch points; each
104 * branch starts with the operand node of a BRANCH node.
105 *
106 * BACK Normal "next" pointers all implicitly point forward; BACK
107 * exists to make loop structures possible.
108 *
109 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
110 * BRANCH structures using BACK. Simple cases (one character
111 * per match) are implemented with STAR and PLUS for speed
112 * and to minimize recursive plunges.
113 *
114 * OPEN,CLOSE ...are numbered at compile time.
115 */
116
117 /*
118 * A node is one char of opcode followed by two chars of "next" pointer.
119 * "Next" pointers are stored as two 8-bit pieces, high order first. The
120 * value is a positive offset from the opcode of the node containing it.
121 * An operand, if any, simply follows the node. (Note that much of the
122 * code generation knows about this implicit relationship.)
123 *
124 * Using two bytes for the "next" pointer is vast overkill for most things,
125 * but allows patterns to get big without disasters.
126 */
127 #define OP(p) (*(p))
128 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
129 #define OPERAND(p) ((p) + 3)
130
131 /*
132 * See regmagic.h for one further detail of program structure.
133 */
134
135
136 /*
137 * Utility definitions.
138 */
139 #ifndef CHARBITS
140 #define UCHARAT(p) ((int)*(unsigned char *)(p))
141 #else
142 #define UCHARAT(p) ((int)*(p)&CHARBITS)
143 #endif
144
145 #define FAIL(m) { evr_regerror(m); return(NULL); }
146 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
147 #define META "^$.[()|?+*\\"
148
149 /*
150 * Flags to be passed up and down.
151 */
152 #define HASWIDTH 01 /* Known never to match null string. */
153 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
154 #define SPSTART 04 /* Starts with * or +. */
155 #define WORST 0 /* Worst case. */
156
157 /*
158 * Global work variables for evr_regcomp().
159 */
160 static char *regparse; /* Input-scan pointer. */
161 static int regnpar; /* () count. */
162 static char regdummy;
163 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
164 static int regsize; /* Code size. */
165
166 /*
167 * Forward declarations for evr_regcomp()'s friends.
168 */
169 #ifndef STATIC
170 #define STATIC static
171 #endif
172 STATIC char *reg(int paren, int *flagp);
173 STATIC char *regbranch(int *flagp);
174 STATIC char *regpiece(int *flagp);
175 STATIC char *regatom(int *flagp);
176 STATIC char *regnode(char op);
177 STATIC char *regnext(register char *p);
178 STATIC void regc(char b);
179 STATIC void reginsert(char op, char *opnd);
180 STATIC void regtail(char *p, char *val);
181 STATIC void regoptail(char *p, char *val);
182 #ifdef STRCSPN
183 STATIC int strcspn(char *s1, char *s2);
184 #endif
185
186 /*
187 - evr_regcomp - compile a regular expression into internal code
188 *
189 * We can't allocate space until we know how big the compiled form will be,
190 * but we can't compile it (and thus know how big it is) until we've got a
191 * place to put the code. So we cheat: we compile it twice, once with code
192 * generation turned off and size counting turned on, and once "for real".
193 * This also means that we don't allocate space until we are sure that the
194 * thing really will compile successfully, and we never have to move the
195 * code and thus invalidate pointers into it. (Note that it has to be in
196 * one piece because free() must be able to free it all.)
197 *
198 * Beware that the optimization-preparation code in here knows about some
199 * of the structure of the compiled regexp.
200 */
201 regexp *
evr_regcomp(char * exp)202 evr_regcomp(char *exp)
203 {
204 register regexp *r;
205 register char *scan;
206 register char *longest;
207 register int len;
208 int flags;
209 extern void *malloc(size_t size);
210
211 if (exp == NULL)
212 FAIL("NULL argument");
213
214 /* First pass: determine size, legality. */
215 regparse = exp;
216 regnpar = 1;
217 regsize = 0L;
218 regcode = ®dummy;
219 regc(MAGIC);
220 if (reg(0, &flags) == NULL)
221 return(NULL);
222
223 /* Small enough for pointer-storage convention? */
224 if (regsize >= 32767L) /* Probably could be 65535L. */
225 FAIL("regexp too big");
226
227 /* Allocate space. */
228 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
229 if (r == NULL)
230 FAIL("out of space");
231
232 /* Second pass: emit code. */
233 regparse = exp;
234 regnpar = 1;
235 regcode = r->program;
236 regc(MAGIC);
237 if (reg(0, &flags) == NULL)
238 return(NULL);
239
240 /* Dig out information for optimizations. */
241 r->regstart = '\0'; /* Worst-case defaults. */
242 r->reganch = 0;
243 r->regmust = NULL;
244 r->regmlen = 0;
245 scan = r->program+1; /* First BRANCH. */
246 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
247 scan = OPERAND(scan);
248
249 /* Starting-point info. */
250 if (OP(scan) == EXACTLY)
251 r->regstart = *OPERAND(scan);
252 else if (OP(scan) == BOL)
253 r->reganch++;
254
255 /*
256 * If there's something expensive in the r.e., find the
257 * longest literal string that must appear and make it the
258 * regmust. Resolve ties in favor of later strings, since
259 * the regstart check works with the beginning of the r.e.
260 * and avoiding duplication strengthens checking. Not a
261 * strong reason, but sufficient in the absence of others.
262 */
263 if (flags&SPSTART) {
264 longest = NULL;
265 len = 0;
266 for (; scan != NULL; scan = regnext(scan))
267 if (OP(scan) == EXACTLY && (int)strlen(OPERAND(scan)) >= len) {
268 longest = OPERAND(scan);
269 len = strlen(OPERAND(scan));
270 }
271 r->regmust = longest;
272 r->regmlen = len;
273 }
274 }
275
276 return(r);
277 }
278
279 /*
280 - reg - regular expression, i.e. main body or parenthesized thing
281 *
282 * Caller must absorb opening parenthesis.
283 *
284 * Combining parenthesis handling with the base level of regular expression
285 * is a trifle forced, but the need to tie the tails of the branches to what
286 * follows makes it hard to avoid.
287 */
288 static char *
reg(int paren,int * flagp)289 reg(int paren, /* Parenthesized? */
290 int *flagp)
291 {
292 register char *ret;
293 register char *br;
294 register char *ender;
295 register int parno = 0;
296 int flags;
297
298 *flagp = HASWIDTH; /* Tentatively. */
299
300 /* Make an OPEN node, if parenthesized. */
301 if (paren) {
302 if (regnpar >= NSUBEXP)
303 FAIL("too many ()");
304 parno = regnpar;
305 regnpar++;
306 ret = regnode(OPEN+parno);
307 } else
308 ret = NULL;
309
310 /* Pick up the branches, linking them together. */
311 br = regbranch(&flags);
312 if (br == NULL)
313 return(NULL);
314 if (ret != NULL)
315 regtail(ret, br); /* OPEN -> first. */
316 else
317 ret = br;
318 if (!(flags&HASWIDTH))
319 *flagp &= ~HASWIDTH;
320 *flagp |= flags&SPSTART;
321 while (*regparse == '|') {
322 regparse++;
323 br = regbranch(&flags);
324 if (br == NULL)
325 return(NULL);
326 regtail(ret, br); /* BRANCH -> BRANCH. */
327 if (!(flags&HASWIDTH))
328 *flagp &= ~HASWIDTH;
329 *flagp |= flags&SPSTART;
330 }
331
332 /* Make a closing node, and hook it on the end. */
333 ender = regnode((paren) ? CLOSE+parno : END);
334 regtail(ret, ender);
335
336 /* Hook the tails of the branches to the closing node. */
337 for (br = ret; br != NULL; br = regnext(br))
338 regoptail(br, ender);
339
340 /* Check for proper termination. */
341 if (paren && *regparse++ != ')') {
342 FAIL("unmatched ()");
343 } else if (!paren && *regparse != '\0') {
344 if (*regparse == ')') {
345 FAIL("unmatched ()");
346 } else
347 FAIL("junk on end"); /* "Can't happen". */
348 /* NOTREACHED */
349 }
350
351 return(ret);
352 }
353
354 /*
355 - regbranch - one alternative of an | operator
356 *
357 * Implements the concatenation operator.
358 */
359 static char *
regbranch(int * flagp)360 regbranch(int *flagp)
361 {
362 register char *ret;
363 register char *chain;
364 register char *latest;
365 int flags;
366
367 *flagp = WORST; /* Tentatively. */
368
369 ret = regnode(BRANCH);
370 chain = NULL;
371 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
372 latest = regpiece(&flags);
373 if (latest == NULL)
374 return(NULL);
375 *flagp |= flags&HASWIDTH;
376 if (chain == NULL) /* First piece. */
377 *flagp |= flags&SPSTART;
378 else
379 regtail(chain, latest);
380 chain = latest;
381 }
382 if (chain == NULL) /* Loop ran zero times. */
383 (void) regnode(NOTHING);
384
385 return(ret);
386 }
387
388 /*
389 - regpiece - something followed by possible [*+?]
390 *
391 * Note that the branching code sequences used for ? and the general cases
392 * of * and + are somewhat optimized: they use the same NOTHING node as
393 * both the endmarker for their branch list and the body of the last branch.
394 * It might seem that this node could be dispensed with entirely, but the
395 * endmarker role is not redundant.
396 */
397 static char *
regpiece(int * flagp)398 regpiece(int *flagp)
399 {
400 register char *ret;
401 register char op;
402 register char *next;
403 int flags;
404
405 ret = regatom(&flags);
406 if (ret == NULL)
407 return(NULL);
408
409 op = *regparse;
410 if (!ISMULT(op)) {
411 *flagp = flags;
412 return(ret);
413 }
414
415 if (!(flags&HASWIDTH) && op != '?')
416 FAIL("*+ operand could be empty");
417 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
418
419 if (op == '*' && (flags&SIMPLE))
420 reginsert(STAR, ret);
421 else if (op == '*') {
422 /* Emit x* as (x&|), where & means "self". */
423 reginsert(BRANCH, ret); /* Either x */
424 regoptail(ret, regnode(BACK)); /* and loop */
425 regoptail(ret, ret); /* back */
426 regtail(ret, regnode(BRANCH)); /* or */
427 regtail(ret, regnode(NOTHING)); /* null. */
428 } else if (op == '+' && (flags&SIMPLE))
429 reginsert(PLUS, ret);
430 else if (op == '+') {
431 /* Emit x+ as x(&|), where & means "self". */
432 next = regnode(BRANCH); /* Either */
433 regtail(ret, next);
434 regtail(regnode(BACK), ret); /* loop back */
435 regtail(next, regnode(BRANCH)); /* or */
436 regtail(ret, regnode(NOTHING)); /* null. */
437 } else if (op == '?') {
438 /* Emit x? as (x|) */
439 reginsert(BRANCH, ret); /* Either x */
440 regtail(ret, regnode(BRANCH)); /* or */
441 next = regnode(NOTHING); /* null. */
442 regtail(ret, next);
443 regoptail(ret, next);
444 }
445 regparse++;
446 if (ISMULT(*regparse))
447 FAIL("nested *?+");
448
449 return(ret);
450 }
451
452 /*
453 - regatom - the lowest level
454 *
455 * Optimization: gobbles an entire sequence of ordinary characters so that
456 * it can turn them into a single node, which is smaller to store and
457 * faster to run. Backslashed characters are exceptions, each becoming a
458 * separate node; the code is simpler that way and it's not worth fixing.
459 */
460 static char *
regatom(int * flagp)461 regatom(int *flagp)
462 {
463 register char *ret;
464 int flags;
465
466 *flagp = WORST; /* Tentatively. */
467
468 switch (*regparse++) {
469 case '^':
470 ret = regnode(BOL);
471 break;
472 case '$':
473 ret = regnode(EOL);
474 break;
475 case '.':
476 ret = regnode(ANY);
477 *flagp |= HASWIDTH|SIMPLE;
478 break;
479 case '[': {
480 register int class;
481 register int classend;
482
483 if (*regparse == '^') { /* Complement of range. */
484 ret = regnode(ANYBUT);
485 regparse++;
486 } else
487 ret = regnode(ANYOF);
488 if (*regparse == ']' || *regparse == '-')
489 regc(*regparse++);
490 while (*regparse != '\0' && *regparse != ']') {
491 if (*regparse == '-') {
492 regparse++;
493 if (*regparse == ']' || *regparse == '\0')
494 regc('-');
495 else {
496 class = UCHARAT(regparse-2)+1;
497 classend = UCHARAT(regparse);
498 if (class > classend+1)
499 FAIL("invalid [] range");
500 for (; class <= classend; class++)
501 regc(class);
502 regparse++;
503 }
504 } else
505 regc(*regparse++);
506 }
507 regc('\0');
508 if (*regparse != ']')
509 FAIL("unmatched []");
510 regparse++;
511 *flagp |= HASWIDTH|SIMPLE;
512 }
513 break;
514 case '(':
515 ret = reg(1, &flags);
516 if (ret == NULL)
517 return(NULL);
518 *flagp |= flags&(HASWIDTH|SPSTART);
519 break;
520 case '\0':
521 case '|':
522 case ')':
523 FAIL("internal urp"); /* Supposed to be caught earlier. */
524 break;
525 case '?':
526 case '+':
527 case '*':
528 FAIL("?+* follows nothing");
529 break;
530 case '\\':
531 if (*regparse == '\0')
532 FAIL("trailing \\");
533 ret = regnode(EXACTLY);
534 regc(*regparse++);
535 regc('\0');
536 *flagp |= HASWIDTH|SIMPLE;
537 break;
538 default: {
539 register int len;
540 register char ender;
541
542 regparse--;
543 len = strcspn(regparse, META);
544 if (len <= 0)
545 FAIL("internal disaster");
546 ender = *(regparse+len);
547 if (len > 1 && ISMULT(ender))
548 len--; /* Back off clear of ?+* operand. */
549 *flagp |= HASWIDTH;
550 if (len == 1)
551 *flagp |= SIMPLE;
552 ret = regnode(EXACTLY);
553 while (len > 0) {
554 regc(*regparse++);
555 len--;
556 }
557 regc('\0');
558 }
559 break;
560 }
561
562 return(ret);
563 }
564
565 /*
566 - regnode - emit a node
567 */
568 static char * /* Location. */
regnode(char op)569 regnode(char op)
570 {
571 register char *ret;
572 register char *ptr;
573
574 ret = regcode;
575 if (ret == ®dummy) {
576 regsize += 3;
577 return(ret);
578 }
579
580 ptr = ret;
581 *ptr++ = op;
582 *ptr++ = '\0'; /* Null "next" pointer. */
583 *ptr++ = '\0';
584 regcode = ptr;
585
586 return(ret);
587 }
588
589 /*
590 - regc - emit (if appropriate) a byte of code
591 */
592 static void
regc(char b)593 regc(char b)
594 {
595 if (regcode != ®dummy)
596 *regcode++ = b;
597 else
598 regsize++;
599 }
600
601 /*
602 - reginsert - insert an operator in front of already-emitted operand
603 *
604 * Means relocating the operand.
605 */
606 static void
reginsert(char op,char * opnd)607 reginsert(char op, char *opnd)
608 {
609 register char *src;
610 register char *dst;
611 register char *place;
612
613 if (regcode == ®dummy) {
614 regsize += 3;
615 return;
616 }
617
618 src = regcode;
619 regcode += 3;
620 dst = regcode;
621 while (src > opnd)
622 *--dst = *--src;
623
624 place = opnd; /* Op node, where operand used to be. */
625 *place++ = op;
626 *place++ = '\0';
627 *place++ = '\0';
628 }
629
630 /*
631 - regtail - set the next-pointer at the end of a node chain
632 */
633 static void
regtail(char * p,char * val)634 regtail(char *p, char *val)
635 {
636 register char *scan;
637 register char *temp;
638 register int offset;
639
640 if (p == ®dummy)
641 return;
642
643 /* Find last node. */
644 scan = p;
645 for (;;) {
646 temp = regnext(scan);
647 if (temp == NULL)
648 break;
649 scan = temp;
650 }
651
652 if (OP(scan) == BACK)
653 offset = scan - val;
654 else
655 offset = val - scan;
656 *(scan+1) = (offset>>8)&0377;
657 *(scan+2) = offset&0377;
658 }
659
660 /*
661 - regoptail - regtail on operand of first argument; nop if operandless
662 */
663 static void
regoptail(char * p,char * val)664 regoptail(char *p, char *val)
665 {
666 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
667 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
668 return;
669 regtail(OPERAND(p), val);
670 }
671
672 /*
673 * evr_regexec and friends
674 */
675
676 /*
677 * Global work variables for evr_regexec().
678 */
679 static char *reginput; /* String-input pointer. */
680 static char *regbol; /* Beginning of input, for ^ check. */
681 static char **regstartp; /* Pointer to startp array. */
682 static char **regendp; /* Ditto for endp. */
683
684 /*
685 * Forwards.
686 */
687 STATIC int regtry(regexp *prog, char *string);
688 STATIC int regmatch(char *prog);
689 STATIC int regrepeat(char *p);
690
691 #ifdef DEBUG
692 int regnarrate = 0;
693 void regdump(regexp *r);
694 STATIC char *regprop(char *op);
695 #endif
696
697 /*
698 - evr_regexec - match a regexp against a string
699 */
700 int
evr_regexec(register regexp * prog,register char * string)701 evr_regexec(register regexp *prog, register char *string)
702 {
703 register char *s;
704
705 /* Be paranoid... */
706 if (prog == NULL || string == NULL) {
707 evr_regerror("NULL parameter");
708 return(0);
709 }
710
711 /* Check validity of program. */
712 if (UCHARAT(prog->program) != MAGIC) {
713 evr_regerror("corrupted program");
714 return(0);
715 }
716
717 /* If there is a "must appear" string, look for it. */
718 if (prog->regmust != NULL) {
719 s = string;
720 while ((s = strchr(s, prog->regmust[0])) != NULL) {
721 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
722 break; /* Found it. */
723 s++;
724 }
725 if (s == NULL) /* Not present. */
726 return(0);
727 }
728
729 /* Mark beginning of line for ^ . */
730 regbol = string;
731
732 /* Simplest case: anchored match need be tried only once. */
733 if (prog->reganch)
734 return(regtry(prog, string));
735
736 /* Messy cases: unanchored match. */
737 s = string;
738 if (prog->regstart != '\0')
739 /* We know what char it must start with. */
740 while ((s = strchr(s, prog->regstart)) != NULL) {
741 if (regtry(prog, s))
742 return(1);
743 s++;
744 }
745 else
746 /* We don't -- general case. */
747 do {
748 if (regtry(prog, s))
749 return(1);
750 } while (*s++ != '\0');
751
752 /* Failure. */
753 return(0);
754 }
755
756 /*
757 - regtry - try match at specific point
758 */
759 static int /* 0 failure, 1 success */
regtry(regexp * prog,char * string)760 regtry(regexp *prog, char *string)
761 {
762 register int i;
763 register char **sp;
764 register char **ep;
765
766 reginput = string;
767 regstartp = prog->startp;
768 regendp = prog->endp;
769
770 sp = prog->startp;
771 ep = prog->endp;
772 for (i = NSUBEXP; i > 0; i--) {
773 *sp++ = NULL;
774 *ep++ = NULL;
775 }
776 if (regmatch(prog->program + 1)) {
777 prog->startp[0] = string;
778 prog->endp[0] = reginput;
779 return(1);
780 } else
781 return(0);
782 }
783
784 /*
785 - regmatch - main matching routine
786 *
787 * Conceptually the strategy is simple: check to see whether the current
788 * node matches, call self recursively to see whether the rest matches,
789 * and then act accordingly. In practice we make some effort to avoid
790 * recursion, in particular by going through "ordinary" nodes (that don't
791 * need to know whether the rest of the match failed) by a loop instead of
792 * by recursion.
793 */
794 static int /* 0 failure, 1 success */
regmatch(char * prog)795 regmatch(char *prog)
796 {
797 register char *scan; /* Current node. */
798 char *next; /* Next node. */
799
800 scan = prog;
801 #ifdef DEBUG
802 if (scan != NULL && regnarrate)
803 fprintf(stderr, "%s(\n", regprop(scan));
804 #endif
805 while (scan != NULL) {
806 #ifdef DEBUG
807 if (regnarrate)
808 fprintf(stderr, "%s...\n", regprop(scan));
809 #endif
810 next = regnext(scan);
811
812 switch (OP(scan)) {
813 case BOL:
814 if (reginput != regbol)
815 return(0);
816 break;
817 case EOL:
818 if (*reginput != '\0')
819 return(0);
820 break;
821 case ANY:
822 if (*reginput == '\0')
823 return(0);
824 reginput++;
825 break;
826 case EXACTLY: {
827 register int len;
828 register char *opnd;
829
830 opnd = OPERAND(scan);
831 /* Inline the first character, for speed. */
832 if (*opnd != *reginput)
833 return(0);
834 len = strlen(opnd);
835 if (len > 1 && strncmp(opnd, reginput, len) != 0)
836 return(0);
837 reginput += len;
838 }
839 break;
840 case ANYOF:
841 if (strchr(OPERAND(scan), *reginput) == NULL)
842 return(0);
843 reginput++;
844 break;
845 case ANYBUT:
846 if (strchr(OPERAND(scan), *reginput) != NULL)
847 return(0);
848 reginput++;
849 break;
850 case NOTHING:
851 break;
852 case BACK:
853 break;
854 case OPEN+1:
855 case OPEN+2:
856 case OPEN+3:
857 case OPEN+4:
858 case OPEN+5:
859 case OPEN+6:
860 case OPEN+7:
861 case OPEN+8:
862 case OPEN+9: {
863 register int no;
864 register char *save;
865
866 no = OP(scan) - OPEN;
867 save = reginput;
868
869 if (regmatch(next)) {
870 /*
871 * Don't set startp if some later
872 * invocation of the same parentheses
873 * already has.
874 */
875 if (regstartp[no] == NULL)
876 regstartp[no] = save;
877 return(1);
878 } else
879 return(0);
880 }
881 break;
882 case CLOSE+1:
883 case CLOSE+2:
884 case CLOSE+3:
885 case CLOSE+4:
886 case CLOSE+5:
887 case CLOSE+6:
888 case CLOSE+7:
889 case CLOSE+8:
890 case CLOSE+9: {
891 register int no;
892 register char *save;
893
894 no = OP(scan) - CLOSE;
895 save = reginput;
896
897 if (regmatch(next)) {
898 /*
899 * Don't set endp if some later
900 * invocation of the same parentheses
901 * already has.
902 */
903 if (regendp[no] == NULL)
904 regendp[no] = save;
905 return(1);
906 } else
907 return(0);
908 }
909 break;
910 case BRANCH: {
911 register char *save;
912
913 if (OP(next) != BRANCH) /* No choice. */
914 next = OPERAND(scan); /* Avoid recursion. */
915 else {
916 do {
917 save = reginput;
918 if (regmatch(OPERAND(scan)))
919 return(1);
920 reginput = save;
921 scan = regnext(scan);
922 } while (scan != NULL && OP(scan) == BRANCH);
923 return(0);
924 /* NOTREACHED */
925 }
926 }
927 break;
928 case STAR:
929 case PLUS: {
930 register char nextch;
931 register int no;
932 register char *save;
933 register int min;
934
935 /*
936 * Lookahead to avoid useless match attempts
937 * when we know what character comes next.
938 */
939 nextch = '\0';
940 if (OP(next) == EXACTLY)
941 nextch = *OPERAND(next);
942 min = (OP(scan) == STAR) ? 0 : 1;
943 save = reginput;
944 no = regrepeat(OPERAND(scan));
945 while (no >= min) {
946 /* If it could work, try it. */
947 if (nextch == '\0' || *reginput == nextch)
948 if (regmatch(next))
949 return(1);
950 /* Couldn't or didn't -- back up. */
951 no--;
952 reginput = save + no;
953 }
954 return(0);
955 }
956 break;
957 case END:
958 return(1); /* Success! */
959 break;
960 default:
961 evr_regerror("memory corruption");
962 return(0);
963 break;
964 }
965
966 scan = next;
967 }
968
969 /*
970 * We get here only if there's trouble -- normally "case END" is
971 * the terminating point.
972 */
973 evr_regerror("corrupted pointers");
974 return(0);
975 }
976
977 /*
978 - regrepeat - repeatedly match something simple, report how many
979 */
980 static int
regrepeat(char * p)981 regrepeat(char *p)
982 {
983 register int count = 0;
984 register char *scan;
985 register char *opnd;
986
987 scan = reginput;
988 opnd = OPERAND(p);
989 switch (OP(p)) {
990 case ANY:
991 count = strlen(scan);
992 scan += count;
993 break;
994 case EXACTLY:
995 while (*opnd == *scan) {
996 count++;
997 scan++;
998 }
999 break;
1000 case ANYOF:
1001 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1002 count++;
1003 scan++;
1004 }
1005 break;
1006 case ANYBUT:
1007 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1008 count++;
1009 scan++;
1010 }
1011 break;
1012 default: /* Oh dear. Called inappropriately. */
1013 evr_regerror("internal foulup");
1014 count = 0; /* Best compromise. */
1015 break;
1016 }
1017 reginput = scan;
1018
1019 return(count);
1020 }
1021
1022 /*
1023 - regnext - dig the "next" pointer out of a node
1024 */
1025 static char *
regnext(register char * p)1026 regnext(register char *p)
1027 {
1028 register int offset;
1029
1030 if (p == ®dummy)
1031 return(NULL);
1032
1033 offset = NEXT(p);
1034 if (offset == 0)
1035 return(NULL);
1036
1037 if (OP(p) == BACK)
1038 return(p-offset);
1039 else
1040 return(p+offset);
1041 }
1042
1043 #ifdef DEBUG
1044
1045 /*
1046 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1047 */
1048 void
regdump(regexp * r)1049 regdump(regexp *r)
1050 {
1051 register char *s;
1052 register char op = EXACTLY; /* Arbitrary non-END op. */
1053 register char *next;
1054
1055 s = r->program + 1;
1056 while (op != END) { /* While that wasn't END last time... */
1057 op = OP(s);
1058 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1059 next = regnext(s);
1060 if (next == NULL) /* Next ptr. */
1061 printf("(0)");
1062 else
1063 printf("(%d)", (s-r->program)+(next-s));
1064 s += 3;
1065 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1066 /* Literal string, where present. */
1067 while (*s != '\0') {
1068 putchar(*s);
1069 s++;
1070 }
1071 s++;
1072 }
1073 putchar('\n');
1074 }
1075
1076 /* Header fields of interest. */
1077 if (r->regstart != '\0')
1078 printf("start `%c' ", r->regstart);
1079 if (r->reganch)
1080 printf("anchored ");
1081 if (r->regmust != NULL)
1082 printf("must have \"%s\"", r->regmust);
1083 printf("\n");
1084 }
1085
1086 /*
1087 - regprop - printable representation of opcode
1088 */
1089 static char *
regprop(char * op)1090 regprop(char *op)
1091 {
1092 register char *p;
1093 static char buf[50];
1094
1095 (void) strcpy(buf, ":");
1096
1097 switch (OP(op)) {
1098 case BOL:
1099 p = "BOL";
1100 break;
1101 case EOL:
1102 p = "EOL";
1103 break;
1104 case ANY:
1105 p = "ANY";
1106 break;
1107 case ANYOF:
1108 p = "ANYOF";
1109 break;
1110 case ANYBUT:
1111 p = "ANYBUT";
1112 break;
1113 case BRANCH:
1114 p = "BRANCH";
1115 break;
1116 case EXACTLY:
1117 p = "EXACTLY";
1118 break;
1119 case NOTHING:
1120 p = "NOTHING";
1121 break;
1122 case BACK:
1123 p = "BACK";
1124 break;
1125 case END:
1126 p = "END";
1127 break;
1128 case OPEN+1:
1129 case OPEN+2:
1130 case OPEN+3:
1131 case OPEN+4:
1132 case OPEN+5:
1133 case OPEN+6:
1134 case OPEN+7:
1135 case OPEN+8:
1136 case OPEN+9:
1137 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1138 p = NULL;
1139 break;
1140 case CLOSE+1:
1141 case CLOSE+2:
1142 case CLOSE+3:
1143 case CLOSE+4:
1144 case CLOSE+5:
1145 case CLOSE+6:
1146 case CLOSE+7:
1147 case CLOSE+8:
1148 case CLOSE+9:
1149 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1150 p = NULL;
1151 break;
1152 case STAR:
1153 p = "STAR";
1154 break;
1155 case PLUS:
1156 p = "PLUS";
1157 break;
1158 default:
1159 evr_regerror("corrupted opcode");
1160 break;
1161 }
1162 if (p != NULL)
1163 (void) strcat(buf, p);
1164 return(buf);
1165 }
1166 #endif
1167
1168 /*
1169 * The following is provided for those people who do not have strcspn() in
1170 * their C libraries. They should get off their butts and do something
1171 * about it; at least one public-domain implementation of those (highly
1172 * useful) string routines has been published on Usenet.
1173 */
1174 #ifdef STRCSPN
1175 /*
1176 * strcspn - find length of initial segment of s1 consisting entirely
1177 * of characters not from s2
1178 */
1179
1180 static int
strcspn(char * s1,char * s2)1181 strcspn(char *s1, char *s2)
1182 {
1183 register char *scan1;
1184 register char *scan2;
1185 register int count;
1186
1187 count = 0;
1188 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1189 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1190 if (*scan1 == *scan2++)
1191 return(count);
1192 count++;
1193 }
1194 return(count);
1195 }
1196 #endif
1197