1 /*
2 * @(#)regexp.c 1.3 of 18 April 87
3 *
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
6 *
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
10 *
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
14 *
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
17 *
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *
21 * Beware that some of this code is subtly aware of the way operator
22 * precedence is structured in regular expressions. Serious changes in
23 * regular-expression syntax might require a total rethink.
24 */
25
26 /*
27 * NOTICE: THIS CODE HAS BEEN MODIFIED TO FIT THE NN ENVIRONMENT.
28 *
29 * modified by Kim Fabricius Storm
30 * modified by Michael T. Pins
31 */
32
33 #include <stdio.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <ctype.h>
37 #include "config.h"
38 #include "global.h"
39 #include "regexp.h"
40 #include "nn_term.h"
41
42 /*
43 * The "internal use only" fields in regexp.h are present to pass info from
44 * compile to execute that permits the execute phase to run lots faster on
45 * simple cases. They are:
46 *
47 * regstart char that must begin a match; '\0' if none obvious
48 * reganch is the match anchored (at beginning-of-line only)?
49 * regmust string (pointer into program) that match must include, or NULL
50 * regmlen length of regmust string
51 *
52 * Regstart and reganch permit very fast decisions on suitable starting points
53 * for a match, cutting down the work a lot. Regmust permits fast rejection
54 * of lines that cannot possibly match. The regmust tests are costly enough
55 * that regcomp() supplies a regmust only if the r.e. contains something
56 * potentially expensive (at present, the only such thing detected is * or +
57 * at the start of the r.e., which can involve a lot of backup). Regmlen is
58 * supplied because the test in regexec() needs it and regcomp() is computing
59 * it anyway.
60 */
61
62 /*
63 * Structure for regexp "program". This is essentially a linear encoding
64 * of a nondeterministic finite-state machine (aka syntax charts or
65 * "railroad normal form" in parsing technology). Each node is an opcode
66 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
67 * all nodes except BRANCH implement concatenation; a "next" pointer with
68 * a BRANCH on both ends of it is connecting two alternatives. (Here we
69 * have one of the subtle syntax dependencies: an individual BRANCH (as
70 * opposed to a collection of them) is never concatenated with anything
71 * because of operator precedence.) The operand of some types of node is
72 * a literal string; for others, it is a node leading into a sub-FSM. In
73 * particular, the operand of a BRANCH node is the first node of the branch.
74 * (NB this is *not* a tree structure: the tail of the branch connects
75 * to the thing following the set of BRANCHes.) The opcodes are:
76 */
77
78 /* definition number opnd? meaning */
79 #define END 0 /* no End of program. */
80 #define BOL 1 /* no Match "" at beginning of line. */
81 #define EOL 2 /* no Match "" at end of line. */
82 #define ANY 3 /* no Match any one character. */
83 #define ANYOF 4 /* str Match any character in this string. */
84 #define ANYBUT 5 /* str Match any character not in this
85 * string. */
86 #define BRANCH 6 /* node Match this alternative, or the
87 * next... */
88 #define BACK 7 /* no Match "", "next" ptr points backward. */
89 #define EXACTLY 8 /* str Match this string. */
90 #define NOTHING 9 /* no Match empty string. */
91 #define STAR 10 /* node Match this (simple) thing 0 or more
92 * times. */
93 #define PLUS 11 /* node Match this (simple) thing 1 or more
94 * times. */
95 #define W_START 12 /* no Match beginning of word */
96 #define W_END 13 /* no Match end of word */
97 #define OPEN 20 /* no Mark this point in input as start of
98 * #n. */
99 /* OPEN+1 is number 1, etc. */
100 #define CLOSE 30 /* no Analogous to OPEN. */
101
102 /*
103 * Opcode notes:
104 *
105 * BRANCH The set of branches constituting a single choice are hooked
106 * together with their "next" pointers, since precedence prevents
107 * anything being concatenated to any individual branch. The
108 * "next" pointer of the last BRANCH in a choice points to the
109 * thing following the whole choice. This is also where the
110 * final "next" pointer of each individual branch points; each
111 * branch starts with the operand node of a BRANCH node.
112 *
113 * BACK Normal "next" pointers all implicitly point forward; BACK
114 * exists to make loop structures possible.
115 *
116 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
117 * BRANCH structures using BACK. Simple cases (one character
118 * per match) are implemented with STAR and PLUS for speed
119 * and to minimize recursive plunges.
120 *
121 * OPEN,CLOSE ...are numbered at compile time.
122 */
123
124 /*
125 * A node is one char of opcode followed by two chars of "next" pointer.
126 * "Next" pointers are stored as two 8-bit pieces, high order first. The
127 * value is a positive offset from the opcode of the node containing it.
128 * An operand, if any, simply follows the node. (Note that much of the
129 * code generation knows about this implicit relationship.)
130 *
131 * Using two bytes for the "next" pointer is vast overkill for most things,
132 * but allows patterns to get big without disasters.
133 */
134 #define OP(p) (*(p))
135 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
136 #define OPERAND(p) ((p) + 3)
137
138
139 /*
140 * Utility definitions.
141 */
142
143 #ifndef CHARBITS
144 #define UCHARAT(p) ((int)*(unsigned char *)(p))
145 #else
146 #define UCHARAT(p) ((int)*(p)&CHARBITS)
147 #endif
148
149 #define FAIL(m) { regerror(m); return(NULL); }
150 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
151 #define META "^$.[()|?+*\\"
152
153 /*
154 * Flags to be passed up and down.
155 */
156 #define HASWIDTH 01 /* Known never to match null string. */
157 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
158 #define SPSTART 04 /* Starts with * or +. */
159 #define WORST 0 /* Worst case. */
160
161 /*
162 * Global work variables for regcomp().
163 */
164 static char *regparse; /* Input-scan pointer. */
165 static int regnpar; /* () count. */
166 static char regdummy;
167 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
168 static long regsize; /* Code size. */
169
170 /*
171 * Forward declarations for regcomp()'s friends.
172 */
173
174 #ifndef STATIC
175 #define STATIC static
176 #endif
177
178 STATIC char *reg(int, int *);
179 STATIC char *regbranch(int *);
180 STATIC char *regpiece(int *);
181 STATIC char *regatom(int *);
182 STATIC char *regnode(char);
183 STATIC char *regnext(register char *);
184 STATIC void regc(char);
185 STATIC void reginsert(char, char *);
186 STATIC void regtail(char *, char *);
187 STATIC void regoptail(char *, char *);
188 STATIC void regerror(char *);
189
190 #ifdef STRCSPN
191 STATIC int strcspn(char *, char *);
192 #endif
193
194 /*
195 - regcomp - compile a regular expression into internal code
196 *
197 * We can't allocate space until we know how big the compiled form will be,
198 * but we can't compile it (and thus know how big it is) until we've got a
199 * place to put the code. So we cheat: we compile it twice, once with code
200 * generation turned off and size counting turned on, and once "for real".
201 * This also means that we don't allocate space until we are sure that the
202 * thing really will compile successfully, and we never have to move the
203 * code and thus invalidate pointers into it. (Note that it has to be in
204 * one piece because free() must be able to free it all.)
205 *
206 * Beware that the optimization-preparation code in here knows about some
207 * of the structure of the compiled regexp.
208 */
209 regexp *
regcomp(char * exp)210 regcomp(char *exp)
211 {
212 register regexp *r;
213 register char *scan;
214 register char *longest;
215 register int len;
216 int flags;
217
218 if (exp == NULL)
219 FAIL("NULL argument");
220
221 /* First pass: determine size, legality. */
222 regparse = exp;
223 regnpar = 1;
224 regsize = 0L;
225 regcode = ®dummy;
226 regc((char) MAGIC);
227 if (reg(0, &flags) == NULL)
228 return (NULL);
229
230 /* Small enough for pointer-storage convention? */
231 if (regsize >= 32767L) /* Probably could be 65535L. */
232 FAIL("regexp too big");
233
234 /* Allocate space. */
235 r = (regexp *) malloc(sizeof(regexp) + (unsigned) regsize);
236 if (r == NULL)
237 FAIL("out of space");
238
239 /* Second pass: emit code. */
240 regparse = exp;
241 regnpar = 1;
242 regcode = r->program;
243 regc((char) MAGIC);
244 if (reg(0, &flags) == NULL)
245 return (NULL);
246
247 /* Dig out information for optimizations. */
248 r->regstart = '\0'; /* Worst-case defaults. */
249 r->reganch = 0;
250 r->regmust = NULL;
251 r->regmlen = 0;
252 scan = r->program + 1; /* First BRANCH. */
253 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
254 scan = OPERAND(scan);
255
256 /* Starting-point info. */
257 if (OP(scan) == EXACTLY)
258 r->regstart = *OPERAND(scan);
259 else if (OP(scan) == BOL)
260 r->reganch++;
261
262 /*
263 * If there's something expensive in the r.e., find the longest
264 * literal string that must appear and make it the regmust. Resolve
265 * ties in favor of later strings, since the regstart check works
266 * with the beginning of the r.e. and avoiding duplication
267 * strengthens checking. Not a strong reason, but sufficient in the
268 * absence of others.
269 */
270 if (flags & SPSTART) {
271 longest = NULL;
272 len = 0;
273 for (; scan != NULL; scan = regnext(scan))
274 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
275 longest = OPERAND(scan);
276 len = strlen(OPERAND(scan));
277 }
278 r->regmust = longest;
279 r->regmlen = len;
280 }
281 }
282 return (r);
283 }
284
285 /*
286 - reg - regular expression, i.e. main body or parenthesized thing
287 *
288 * Caller must absorb opening parenthesis.
289 *
290 * Combining parenthesis handling with the base level of regular expression
291 * is a trifle forced, but the need to tie the tails of the branches to what
292 * follows makes it hard to avoid.
293 *
294 * paren: Parenthesized?
295 */
296 static char *
reg(int paren,int * flagp)297 reg(int paren, int *flagp)
298 {
299 register char *ret;
300 register char *br;
301 register char *ender;
302 register int parno = 0;
303 int flags;
304
305 *flagp = HASWIDTH; /* Tentatively. */
306
307 /* Make an OPEN node, if parenthesized. */
308 if (paren) {
309 if (regnpar >= NSUBEXP)
310 FAIL("too many ()");
311 parno = regnpar;
312 regnpar++;
313 ret = regnode(OPEN + parno);
314 } else
315 ret = NULL;
316
317 /* Pick up the branches, linking them together. */
318 br = regbranch(&flags);
319 if (br == NULL)
320 return (NULL);
321 if (ret != NULL)
322 regtail(ret, br); /* OPEN -> first. */
323 else
324 ret = br;
325 if (!(flags & HASWIDTH))
326 *flagp &= ~HASWIDTH;
327 *flagp |= flags & SPSTART;
328 while (*regparse == '|') {
329 regparse++;
330 br = regbranch(&flags);
331 if (br == NULL)
332 return (NULL);
333 regtail(ret, br); /* BRANCH -> BRANCH. */
334 if (!(flags & HASWIDTH))
335 *flagp &= ~HASWIDTH;
336 *flagp |= flags & SPSTART;
337 }
338
339 /* Make a closing node, and hook it on the end. */
340 ender = regnode((paren) ? CLOSE + parno : END);
341 regtail(ret, ender);
342
343 /* Hook the tails of the branches to the closing node. */
344 for (br = ret; br != NULL; br = regnext(br))
345 regoptail(br, ender);
346
347 /* Check for proper termination. */
348 if (paren && *regparse++ != ')') {
349 FAIL("unmatched ()");
350 } else if (!paren && *regparse != '\0') {
351 if (*regparse == ')') {
352 FAIL("unmatched ()");
353 } else
354 FAIL("junk on end");/* "Can't happen". */
355 /* NOTREACHED */
356 }
357 return (ret);
358 }
359
360 /*
361 - regbranch - one alternative of an | operator
362 *
363 * Implements the concatenation operator.
364 */
365 static char *
regbranch(int * flagp)366 regbranch(int *flagp)
367 {
368 register char *ret;
369 register char *chain;
370 register char *latest;
371 int flags;
372
373 *flagp = WORST; /* Tentatively. */
374
375 ret = regnode(BRANCH);
376 chain = NULL;
377 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
378 latest = regpiece(&flags);
379 if (latest == NULL)
380 return (NULL);
381 *flagp |= flags & HASWIDTH;
382 if (chain == NULL) /* First piece. */
383 *flagp |= flags & SPSTART;
384 else
385 regtail(chain, latest);
386 chain = latest;
387 }
388 if (chain == NULL) /* Loop ran zero times. */
389 (void) regnode(NOTHING);
390
391 return (ret);
392 }
393
394 /*
395 - regpiece - something followed by possible [*+?]
396 *
397 * Note that the branching code sequences used for ? and the general cases
398 * of * and + are somewhat optimized: they use the same NOTHING node as
399 * both the endmarker for their branch list and the body of the last branch.
400 * It might seem that this node could be dispensed with entirely, but the
401 * endmarker role is not redundant.
402 */
403 static char *
regpiece(int * flagp)404 regpiece(int *flagp)
405 {
406 register char *ret;
407 register char op;
408 register char *next;
409 int flags;
410
411 ret = regatom(&flags);
412 if (ret == NULL)
413 return (NULL);
414
415 op = *regparse;
416 if (!ISMULT(op)) {
417 *flagp = flags;
418 return (ret);
419 }
420 if (!(flags & HASWIDTH) && op != '?')
421 FAIL("*+ operand could be empty");
422 *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
423
424 if (op == '*' && (flags & SIMPLE))
425 reginsert(STAR, ret);
426 else if (op == '*') {
427 /* Emit x* as (x&|), where & means "self". */
428 reginsert(BRANCH, ret); /* Either x */
429 regoptail(ret, regnode(BACK)); /* and loop */
430 regoptail(ret, ret); /* back */
431 regtail(ret, regnode(BRANCH)); /* or */
432 regtail(ret, regnode(NOTHING)); /* null. */
433 } else if (op == '+' && (flags & SIMPLE))
434 reginsert(PLUS, ret);
435 else if (op == '+') {
436 /* Emit x+ as x(&|), where & means "self". */
437 next = regnode(BRANCH); /* Either */
438 regtail(ret, next);
439 regtail(regnode(BACK), ret); /* loop back */
440 regtail(next, regnode(BRANCH)); /* or */
441 regtail(ret, regnode(NOTHING)); /* null. */
442 } else if (op == '?') {
443 /* Emit x? as (x|) */
444 reginsert(BRANCH, ret); /* Either x */
445 regtail(ret, regnode(BRANCH)); /* or */
446 next = regnode(NOTHING);/* null. */
447 regtail(ret, next);
448 regoptail(ret, next);
449 }
450 regparse++;
451 if (ISMULT(*regparse))
452 FAIL("nested *?+");
453
454 return (ret);
455 }
456
457 /*
458 - regatom - the lowest level
459 *
460 * Optimization: gobbles an entire sequence of ordinary characters so that
461 * it can turn them into a single node, which is smaller to store and
462 * faster to run. Backslashed characters are exceptions, each becoming a
463 * separate node; the code is simpler that way and it's not worth fixing.
464 */
465 static char *
regatom(int * flagp)466 regatom(int *flagp)
467 {
468 register char *ret;
469 int flags;
470
471 *flagp = WORST; /* Tentatively. */
472
473 switch (*regparse++) {
474 case '^':
475 ret = regnode(BOL);
476 break;
477 case '$':
478 ret = regnode(EOL);
479 break;
480 case '.':
481 ret = regnode(ANY);
482 *flagp |= HASWIDTH | SIMPLE;
483 break;
484 case '[':{
485 register int class;
486 register int classend;
487
488 if (*regparse == '^') { /* Complement of range. */
489 ret = regnode(ANYBUT);
490 regparse++;
491 } else
492 ret = regnode(ANYOF);
493 if (*regparse == ']' || *regparse == '-')
494 regc(*regparse++);
495 while (*regparse != '\0' && *regparse != ']') {
496 if (*regparse == '-') {
497 regparse++;
498 if (*regparse == ']' || *regparse == '\0')
499 regc('-');
500 else {
501 class = UCHARAT(regparse - 2) + 1;
502 classend = UCHARAT(regparse);
503 if (class > classend + 1)
504 FAIL("invalid [] range");
505 for (; class <= classend; class++)
506 regc(class);
507 regparse++;
508 }
509 } else
510 regc(*regparse++);
511 }
512 regc('\0');
513 if (*regparse != ']')
514 FAIL("unmatched []");
515 regparse++;
516 *flagp |= HASWIDTH | SIMPLE;
517 }
518 break;
519 case '(':
520 ret = reg(1, &flags);
521 if (ret == NULL)
522 return (NULL);
523 *flagp |= flags & (HASWIDTH | SPSTART);
524 break;
525 case '\0':
526 case '|':
527 case ')':
528 FAIL("internal urp"); /* Supposed to be caught earlier. */
529 break;
530 case '?':
531 case '+':
532 case '*':
533 FAIL("?+* follows nothing");
534 break;
535 case '\\':
536 switch (*regparse++) {
537 case '\0':
538 FAIL("trailing \\");
539 break; /* for lint, we can't actually get here */
540 case '<':
541 ret = regnode(W_START);
542 break;
543 case '>':
544 ret = regnode(W_END);
545 break;
546 default:
547 ret = regnode(EXACTLY);
548 regc(*regparse);
549 regc('\0');
550 *flagp |= HASWIDTH | SIMPLE;
551 break;
552 }
553 break;
554 default:{
555 register int len;
556 register char ender;
557
558 regparse--;
559 len = strcspn(regparse, META);
560 if (len <= 0)
561 FAIL("internal disaster");
562 ender = *(regparse + len);
563 if (len > 1 && ISMULT(ender))
564 len--; /* Back off clear of ?+* operand. */
565 *flagp |= HASWIDTH;
566 if (len == 1)
567 *flagp |= SIMPLE;
568 ret = regnode(EXACTLY);
569 while (len > 0) {
570 regc(*regparse++);
571 len--;
572 }
573 regc('\0');
574 }
575 break;
576 }
577
578 return (ret);
579 }
580
581 /*
582 - regnode - emit a node
583 */
584 static char * /* Location. */
regnode(char op)585 regnode(char op)
586 {
587 register char *ret;
588 register char *ptr;
589
590 ret = regcode;
591 if (ret == ®dummy) {
592 regsize += 3;
593 return (ret);
594 }
595 ptr = ret;
596 *ptr++ = op;
597 *ptr++ = '\0'; /* Null "next" pointer. */
598 *ptr++ = '\0';
599 regcode = ptr;
600
601 return (ret);
602 }
603
604 /*
605 - regc - emit (if appropriate) a byte of code
606 */
607 static void
regc(char b)608 regc(char b)
609 {
610 if (regcode != ®dummy)
611 *regcode++ = b;
612 else
613 regsize++;
614 }
615
616 /*
617 - reginsert - insert an operator in front of already-emitted operand
618 *
619 * Means relocating the operand.
620 */
621 static void
reginsert(char op,char * opnd)622 reginsert(char op, char *opnd)
623 {
624 register char *src;
625 register char *dst;
626 register char *place;
627
628 if (regcode == ®dummy) {
629 regsize += 3;
630 return;
631 }
632 src = regcode;
633 regcode += 3;
634 dst = regcode;
635 while (src > opnd)
636 *--dst = *--src;
637
638 place = opnd; /* Op node, where operand used to be. */
639 *place++ = op;
640 *place++ = '\0';
641 *place++ = '\0';
642 }
643
644 /*
645 - regtail - set the next-pointer at the end of a node chain
646 */
647 static void
regtail(char * p,char * val)648 regtail(char *p, char *val)
649 {
650 register char *scan;
651 register char *temp;
652 register int offset;
653
654 if (p == ®dummy)
655 return;
656
657 /* Find last node. */
658 scan = p;
659 for (;;) {
660 temp = regnext(scan);
661 if (temp == NULL)
662 break;
663 scan = temp;
664 }
665
666 if (OP(scan) == BACK)
667 offset = scan - val;
668 else
669 offset = val - scan;
670 *(scan + 1) = (offset >> 8) & 0377;
671 *(scan + 2) = offset & 0377;
672 }
673
674 /*
675 - regoptail - regtail on operand of first argument; nop if operandless
676 */
677 static void
regoptail(char * p,char * val)678 regoptail(char *p, char *val)
679 {
680 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
681 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
682 return;
683 regtail(OPERAND(p), val);
684 }
685
686 /*
687 * regexec and friends
688 */
689
690 /*
691 * Global work variables for regexec().
692 */
693 static char *reginput; /* String-input pointer. */
694 static char *regbol; /* Beginning of input, for ^ check. */
695 static char **regstartp; /* Pointer to startp array. */
696 static char **regendp; /* Ditto for endp. */
697
698 /*
699 * Forwards.
700 */
701 STATIC int regtry(regexp *, char *);
702 STATIC int regmatch(char *);
703 STATIC int regrepeat(char *);
704
705 #ifdef DEBUG
706 int regnarrate = 0;
707 void regdump(regexp *);
708 STATIC char *regprop(char *);
709 #endif
710
711 /*
712 - regexec - match a regexp against a string
713 */
714 int
regexec(register regexp * prog,register char * string)715 regexec(register regexp * prog, register char *string)
716 {
717 register char *s;
718
719 /* Be paranoid... */
720 if (prog == NULL || string == NULL) {
721 regerror("NULL parameter");
722 return (0);
723 }
724 /* Check validity of program. */
725 if (UCHARAT(prog->program) != MAGIC) {
726 regerror("corrupted program");
727 return (0);
728 }
729 /* If there is a "must appear" string, look for it. */
730 if (prog->regmust != NULL) {
731 s = string;
732 while ((s = strchr(s, prog->regmust[0])) != NULL) {
733 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
734 break; /* Found it. */
735 s++;
736 }
737 if (s == NULL) /* Not present. */
738 return (0);
739 }
740 /* Mark beginning of line for ^ . */
741 regbol = string;
742
743 /* Simplest case: anchored match need be tried only once. */
744 if (prog->reganch)
745 return (regtry(prog, string));
746
747 /* Messy cases: unanchored match. */
748 s = string;
749 if (prog->regstart != '\0')
750 /* We know what char it must start with. */
751 while ((s = strchr(s, prog->regstart)) != NULL) {
752 if (regtry(prog, s))
753 return (1);
754 s++;
755 }
756 else
757 /* We don't -- general case. */
758 do {
759 if (regtry(prog, s))
760 return (1);
761 } while (*s++ != '\0');
762
763 /* Failure. */
764 return (0);
765 }
766
767 /*
768 - regtry - try match at specific point
769 */
770 static int /* 0 failure, 1 success */
regtry(regexp * prog,char * string)771 regtry(regexp * prog, char *string)
772 {
773 register int i;
774 register char **sp;
775 register char **ep;
776
777 reginput = string;
778 regstartp = prog->startp;
779 regendp = prog->endp;
780
781 sp = prog->startp;
782 ep = prog->endp;
783 for (i = NSUBEXP; i > 0; i--) {
784 *sp++ = NULL;
785 *ep++ = NULL;
786 }
787 if (regmatch(prog->program + 1)) {
788 prog->startp[0] = string;
789 prog->endp[0] = reginput;
790 return (1);
791 } else
792 return (0);
793 }
794
795 /*
796 - regmatch - main matching routine
797 *
798 * Conceptually the strategy is simple: check to see whether the current
799 * node matches, call self recursively to see whether the rest matches,
800 * and then act accordingly. In practice we make some effort to avoid
801 * recursion, in particular by going through "ordinary" nodes (that don't
802 * need to know whether the rest of the match failed) by a loop instead of
803 * by recursion.
804 */
805 static int /* 0 failure, 1 success */
regmatch(char * prog)806 regmatch(char *prog)
807 {
808 register char *scan; /* Current node. */
809 char *next; /* Next node. */
810
811 scan = prog;
812
813 #ifdef DEBUG
814 if (scan != NULL && regnarrate)
815 fprintf(stderr, "%s(\n", regprop(scan));
816 #endif
817
818 while (scan != NULL) {
819
820 #ifdef DEBUG
821 if (regnarrate)
822 fprintf(stderr, "%s...\n", regprop(scan));
823 #endif
824
825 next = regnext(scan);
826
827 switch (OP(scan)) {
828 case BOL:
829 if (reginput != regbol)
830 return (0);
831 break;
832 case EOL:
833 if (*reginput != '\0')
834 return (0);
835 break;
836 case W_START:
837 if (*reginput == '\0' || isspace(*reginput))
838 return (0);
839 if (reginput != regbol && !isspace(*(reginput - 1)))
840 return (0);
841 break;
842 case W_END:
843 if (*reginput != '\0' && !isspace(*reginput))
844 return (0);
845 if (reginput == regbol || isspace(*(reginput - 1)))
846 return (0);
847 break;
848 case ANY:
849 if (*reginput == '\0')
850 return (0);
851 reginput++;
852 break;
853 case EXACTLY:{
854 register int len;
855 register char *opnd;
856
857 opnd = OPERAND(scan);
858 /* Inline the first character, for speed. */
859 if (*opnd != *reginput)
860 return (0);
861 len = strlen(opnd);
862 if (len > 1 && strncmp(opnd, reginput, len) != 0)
863 return (0);
864 reginput += len;
865 }
866 break;
867 case ANYOF:
868 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
869 return (0);
870 reginput++;
871 break;
872 case ANYBUT:
873 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
874 return (0);
875 reginput++;
876 break;
877 case NOTHING:
878 break;
879 case BACK:
880 break;
881 case OPEN + 1:
882 case OPEN + 2:
883 case OPEN + 3:
884 case OPEN + 4:
885 case OPEN + 5:
886 case OPEN + 6:
887 case OPEN + 7:
888 case OPEN + 8:
889 case OPEN + 9:{
890 register int no;
891 register char *save;
892
893 no = OP(scan) - OPEN;
894 save = reginput;
895
896 if (regmatch(next)) {
897
898 /*
899 * Don't set startp if some later invocation of the
900 * same parentheses already has.
901 */
902 if (regstartp[no] == NULL)
903 regstartp[no] = save;
904 return (1);
905 } else
906 return (0);
907 }
908 break;
909 case CLOSE + 1:
910 case CLOSE + 2:
911 case CLOSE + 3:
912 case CLOSE + 4:
913 case CLOSE + 5:
914 case CLOSE + 6:
915 case CLOSE + 7:
916 case CLOSE + 8:
917 case CLOSE + 9:{
918 register int no;
919 register char *save;
920
921 no = OP(scan) - CLOSE;
922 save = reginput;
923
924 if (regmatch(next)) {
925
926 /*
927 * Don't set endp if some later invocation of the
928 * same parentheses already has.
929 */
930 if (regendp[no] == NULL)
931 regendp[no] = save;
932 return (1);
933 } else
934 return (0);
935 }
936 break;
937 case BRANCH:{
938 register char *save;
939
940 if (OP(next) != BRANCH) /* No choice. */
941 next = OPERAND(scan); /* Avoid recursion. */
942 else {
943 do {
944 save = reginput;
945 if (regmatch(OPERAND(scan)))
946 return (1);
947 reginput = save;
948 scan = regnext(scan);
949 } while (scan != NULL && OP(scan) == BRANCH);
950 return (0);
951 /* NOTREACHED */
952 }
953 }
954 break;
955 case STAR:
956 case PLUS:{
957 register char nextch;
958 register int no;
959 register char *save;
960 register int min;
961
962 /*
963 * Lookahead to avoid useless match attempts when we know
964 * what character comes next.
965 */
966 nextch = '\0';
967 if (OP(next) == EXACTLY)
968 nextch = *OPERAND(next);
969 min = (OP(scan) == STAR) ? 0 : 1;
970 save = reginput;
971 no = regrepeat(OPERAND(scan));
972 while (no >= min) {
973 /* If it could work, try it. */
974 if (nextch == '\0' || *reginput == nextch)
975 if (regmatch(next))
976 return (1);
977 /* Couldn't or didn't -- back up. */
978 no--;
979 reginput = save + no;
980 }
981 return (0);
982 }
983 break;
984 case END:
985 return (1); /* Success! */
986 break;
987 default:
988 regerror("memory corruption");
989 return (0);
990 break;
991 }
992
993 scan = next;
994 }
995
996 /*
997 * We get here only if there's trouble -- normally "case END" is the
998 * terminating point.
999 */
1000 regerror("corrupted pointers");
1001 return (0);
1002 }
1003
1004 /*
1005 - regrepeat - repeatedly match something simple, report how many
1006 */
1007 static int
regrepeat(char * p)1008 regrepeat(char *p)
1009 {
1010 register int count = 0;
1011 register char *scan;
1012 register char *opnd;
1013
1014 scan = reginput;
1015 opnd = OPERAND(p);
1016 switch (OP(p)) {
1017 case ANY:
1018 count = strlen(scan);
1019 scan += count;
1020 break;
1021 case EXACTLY:
1022 while (*opnd == *scan) {
1023 count++;
1024 scan++;
1025 }
1026 break;
1027 case ANYOF:
1028 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1029 count++;
1030 scan++;
1031 }
1032 break;
1033 case ANYBUT:
1034 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1035 count++;
1036 scan++;
1037 }
1038 break;
1039 default: /* Oh dear. Called inappropriately. */
1040 regerror("internal foulup");
1041 count = 0; /* Best compromise. */
1042 break;
1043 }
1044 reginput = scan;
1045
1046 return (count);
1047 }
1048
1049 /*
1050 - regnext - dig the "next" pointer out of a node
1051 */
1052 static char *
regnext(register char * p)1053 regnext(register char *p)
1054 {
1055 register int offset;
1056
1057 if (p == ®dummy)
1058 return (NULL);
1059
1060 offset = NEXT(p);
1061 if (offset == 0)
1062 return (NULL);
1063
1064 if (OP(p) == BACK)
1065 return (p - offset);
1066 else
1067 return (p + offset);
1068 }
1069
1070 #ifdef DEBUG
1071 STATIC char *regprop(char *);
1072
1073 /*
1074 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1075 */
1076 void
regdump(regexp * r)1077 regdump(regexp * r)
1078 {
1079 register char *s;
1080 register char op = EXACTLY; /* Arbitrary non-END op. */
1081 register char *next;
1082
1083
1084 s = r->program + 1;
1085 while (op != END) { /* While that wasn't END last time... */
1086 op = OP(s);
1087 printf("%2d%s", s - r->program, regprop(s)); /* Where, what. */
1088 next = regnext(s);
1089 if (next == NULL) /* Next ptr. */
1090 printf("(0)");
1091 else
1092 printf("(%d)", (s - r->program) + (next - s));
1093 s += 3;
1094 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1095 /* Literal string, where present. */
1096 while (*s != '\0') {
1097 putchar(*s);
1098 s++;
1099 }
1100 s++;
1101 }
1102 putchar('\n');
1103 }
1104
1105 /* Header fields of interest. */
1106 if (r->regstart != '\0')
1107 printf("start `%c' ", r->regstart);
1108 if (r->reganch)
1109 printf("anchored ");
1110 if (r->regmust != NULL)
1111 printf("must have \"%s\"", r->regmust);
1112 printf("\n");
1113 }
1114
1115 /*
1116 - regprop - printable representation of opcode
1117 */
1118 static char *
regprop(char * op)1119 regprop(char *op)
1120 {
1121 register char *p;
1122 static char buf[50];
1123
1124 (void) strcpy(buf, ":");
1125
1126 switch (OP(op)) {
1127 case BOL:
1128 p = "BOL";
1129 break;
1130 case EOL:
1131 p = "EOL";
1132 break;
1133 case W_START:
1134 p = "WSTART";
1135 break;
1136 case W_END:
1137 p = "WEND";
1138 break;
1139 case ANY:
1140 p = "ANY";
1141 break;
1142 case ANYOF:
1143 p = "ANYOF";
1144 break;
1145 case ANYBUT:
1146 p = "ANYBUT";
1147 break;
1148 case BRANCH:
1149 p = "BRANCH";
1150 break;
1151 case EXACTLY:
1152 p = "EXACTLY";
1153 break;
1154 case NOTHING:
1155 p = "NOTHING";
1156 break;
1157 case BACK:
1158 p = "BACK";
1159 break;
1160 case END:
1161 p = "END";
1162 break;
1163 case OPEN + 1:
1164 case OPEN + 2:
1165 case OPEN + 3:
1166 case OPEN + 4:
1167 case OPEN + 5:
1168 case OPEN + 6:
1169 case OPEN + 7:
1170 case OPEN + 8:
1171 case OPEN + 9:
1172 sprintf(buf + strlen(buf), "OPEN%d", OP(op) - OPEN);
1173 p = NULL;
1174 break;
1175 case CLOSE + 1:
1176 case CLOSE + 2:
1177 case CLOSE + 3:
1178 case CLOSE + 4:
1179 case CLOSE + 5:
1180 case CLOSE + 6:
1181 case CLOSE + 7:
1182 case CLOSE + 8:
1183 case CLOSE + 9:
1184 sprintf(buf + strlen(buf), "CLOSE%d", OP(op) - CLOSE);
1185 p = NULL;
1186 break;
1187 case STAR:
1188 p = "STAR";
1189 break;
1190 case PLUS:
1191 p = "PLUS";
1192 break;
1193 default:
1194 regerror("corrupted opcode");
1195 break;
1196 }
1197 if (p != NULL)
1198 (void) strcat(buf, p);
1199 return (buf);
1200 }
1201
1202 #endif
1203
1204 /*
1205 * The following is provided for those people who do not have strcspn() in
1206 * their C libraries. They should get off their butts and do something
1207 * about it; at least one public-domain implementation of those (highly
1208 * useful) string routines has been published on Usenet.
1209 */
1210
1211 #ifdef STRCSPN
1212 /*
1213 * strcspn - find length of initial segment of s1 consisting entirely
1214 * of characters not from s2
1215 */
1216
1217 static int
strcspn(char * s1,char * s2)1218 strcspn(char *s1, char *s2)
1219 {
1220 register char *scan1;
1221 register char *scan2;
1222 register int count;
1223
1224 count = 0;
1225 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1226 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1227 if (*scan1 == *scan2++)
1228 return (count);
1229 count++;
1230 }
1231 return (count);
1232 }
1233
1234 #endif
1235
1236
1237 void
regerror(char * s)1238 regerror(char *s)
1239 {
1240
1241 msg("regexp: %s", s);
1242 /* NOTREACHED */
1243 }
1244