1 /*
2 * regcomp and regexec -- regsub and regerror are elsewhere
3 *
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
6 *
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
10 *
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
14 *
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
17 *
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30 *
31 * Beware that some of this code is subtly aware of the way operator
32 * precedence is structured in regular expressions. Serious changes in
33 * regular-expression syntax might require a total rethink.
34 */
35 #include <regexp.h>
36 #include <stdio.h>
37 #include <ctype.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include "regmagic.h"
41
42 /*
43 * The "internal use only" fields in regexp.h are present to pass info from
44 * compile to execute that permits the execute phase to run lots faster on
45 * simple cases. They are:
46 *
47 * regstart char that must begin a match; '\0' if none obvious
48 * reganch is the match anchored (at beginning-of-line only)?
49 * regmust string (pointer into program) that match must include, or NULL
50 * regmlen length of regmust string
51 *
52 * Regstart and reganch permit very fast decisions on suitable starting points
53 * for a match, cutting down the work a lot. Regmust permits fast rejection
54 * of lines that cannot possibly match. The regmust tests are costly enough
55 * that regcomp() supplies a regmust only if the r.e. contains something
56 * potentially expensive (at present, the only such thing detected is * or +
57 * at the start of the r.e., which can involve a lot of backup). Regmlen is
58 * supplied because the test in regexec() needs it and regcomp() is computing
59 * it anyway.
60 */
61
62 /*
63 * Structure for regexp "program". This is essentially a linear encoding
64 * of a nondeterministic finite-state machine (aka syntax charts or
65 * "railroad normal form" in parsing technology). Each node is an opcode
66 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
67 * all nodes except BRANCH implement concatenation; a "next" pointer with
68 * a BRANCH on both ends of it is connecting two alternatives. (Here we
69 * have one of the subtle syntax dependencies: an individual BRANCH (as
70 * opposed to a collection of them) is never concatenated with anything
71 * because of operator precedence.) The operand of some types of node is
72 * a literal string; for others, it is a node leading into a sub-FSM. In
73 * particular, the operand of a BRANCH node is the first node of the branch.
74 * (NB this is *not* a tree structure: the tail of the branch connects
75 * to the thing following the set of BRANCHes.) The opcodes are:
76 */
77
78 /* definition number opnd? meaning */
79 #define END 0 /* no End of program. */
80 #define BOL 1 /* no Match "" at beginning of line. */
81 #define EOL 2 /* no Match "" at end of line. */
82 #define ANY 3 /* no Match any one character. */
83 #define ANYOF 4 /* str Match any character in this string. */
84 #define ANYBUT 5 /* str Match any character not in this string. */
85 #define BRANCH 6 /* node Match this alternative, or the next... */
86 #define BACK 7 /* no Match "", "next" ptr points backward. */
87 #define EXACTLY 8 /* str Match this string. */
88 #define NOTHING 9 /* no Match empty string. */
89 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
90 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
91 #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */
92 #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */
93 #define OPEN 20 /* no Mark this point in input as start of #n. */
94 /* OPEN+1 is number 1, etc. */
95 #define CLOSE 30 /* no Analogous to OPEN. */
96
97 /*
98 * Opcode notes:
99 *
100 * BRANCH The set of branches constituting a single choice are hooked
101 * together with their "next" pointers, since precedence prevents
102 * anything being concatenated to any individual branch. The
103 * "next" pointer of the last BRANCH in a choice points to the
104 * thing following the whole choice. This is also where the
105 * final "next" pointer of each individual branch points; each
106 * branch starts with the operand node of a BRANCH node.
107 *
108 * BACK Normal "next" pointers all implicitly point forward; BACK
109 * exists to make loop structures possible.
110 *
111 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
112 * BRANCH structures using BACK. Simple cases (one character
113 * per match) are implemented with STAR and PLUS for speed
114 * and to minimize recursive plunges.
115 *
116 * OPEN,CLOSE ...are numbered at compile time.
117 */
118
119 /*
120 * A node is one char of opcode followed by two chars of "next" pointer.
121 * "Next" pointers are stored as two 8-bit pieces, high order first. The
122 * value is a positive offset from the opcode of the node containing it.
123 * An operand, if any, simply follows the node. (Note that much of the
124 * code generation knows about this implicit relationship.)
125 *
126 * Using two bytes for the "next" pointer is vast overkill for most things,
127 * but allows patterns to get big without disasters.
128 */
129 #define OP(p) (*(p))
130 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
131 #define OPERAND(p) ((p) + 3)
132
133 /*
134 * See regmagic.h for one further detail of program structure.
135 */
136
137
138 /*
139 * Utility definitions.
140 */
141 #ifndef CHARBITS
142 #define UCHARAT(p) ((int)*(unsigned char *)(p))
143 #else
144 #define UCHARAT(p) ((int)*(p)&CHARBITS)
145 #endif
146
147 #define FAIL(m) { regerror(m); return(NULL); }
148 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
149
150 /*
151 * Flags to be passed up and down.
152 */
153 #define HASWIDTH 01 /* Known never to match null string. */
154 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
155 #define SPSTART 04 /* Starts with * or +. */
156 #define WORST 0 /* Worst case. */
157
158 /*
159 * Global work variables for regcomp().
160 */
161 static char *regparse; /* Input-scan pointer. */
162 static int regnpar; /* () count. */
163 static char regdummy;
164 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
165 static long regsize; /* Code size. */
166
167 /*
168 * Forward declarations for regcomp()'s friends.
169 */
170 #ifndef STATIC
171 #define STATIC static
172 #endif
173 STATIC char *reg();
174 STATIC char *regbranch();
175 STATIC char *regpiece();
176 STATIC char *regatom();
177 STATIC char *regnode();
178 STATIC char *regnext();
179 STATIC void regc();
180 STATIC void reginsert();
181 STATIC void regtail();
182 STATIC void regoptail();
183 #ifdef STRCSPN
184 STATIC int strcspn();
185 #endif
186
187 /*
188 - regcomp - compile a regular expression into internal code
189 *
190 * We can't allocate space until we know how big the compiled form will be,
191 * but we can't compile it (and thus know how big it is) until we've got a
192 * place to put the code. So we cheat: we compile it twice, once with code
193 * generation turned off and size counting turned on, and once "for real".
194 * This also means that we don't allocate space until we are sure that the
195 * thing really will compile successfully, and we never have to move the
196 * code and thus invalidate pointers into it. (Note that it has to be in
197 * one piece because free() must be able to free it all.)
198 *
199 * Beware that the optimization-preparation code in here knows about some
200 * of the structure of the compiled regexp.
201 */
202 regexp *
regcomp(exp)203 regcomp(exp)
204 const char *exp;
205 {
206 register regexp *r;
207 register char *scan;
208 register char *longest;
209 register int len;
210 int flags;
211
212 if (exp == NULL)
213 FAIL("NULL argument");
214
215 /* First pass: determine size, legality. */
216 #ifdef notdef
217 if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */
218 #endif
219 regparse = (char *)exp;
220 regnpar = 1;
221 regsize = 0L;
222 regcode = ®dummy;
223 regc(MAGIC);
224 if (reg(0, &flags) == NULL)
225 return(NULL);
226
227 /* Small enough for pointer-storage convention? */
228 if (regsize >= 32767L) /* Probably could be 65535L. */
229 FAIL("regexp too big");
230
231 /* Allocate space. */
232 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
233 if (r == NULL)
234 FAIL("out of space");
235
236 /* Second pass: emit code. */
237 regparse = (char *)exp;
238 regnpar = 1;
239 regcode = r->program;
240 regc(MAGIC);
241 if (reg(0, &flags) == NULL)
242 return(NULL);
243
244 /* Dig out information for optimizations. */
245 r->regstart = '\0'; /* Worst-case defaults. */
246 r->reganch = 0;
247 r->regmust = NULL;
248 r->regmlen = 0;
249 scan = r->program+1; /* First BRANCH. */
250 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
251 scan = OPERAND(scan);
252
253 /* Starting-point info. */
254 if (OP(scan) == EXACTLY)
255 r->regstart = *OPERAND(scan);
256 else if (OP(scan) == BOL)
257 r->reganch++;
258
259 /*
260 * If there's something expensive in the r.e., find the
261 * longest literal string that must appear and make it the
262 * regmust. Resolve ties in favor of later strings, since
263 * the regstart check works with the beginning of the r.e.
264 * and avoiding duplication strengthens checking. Not a
265 * strong reason, but sufficient in the absence of others.
266 */
267 if (flags&SPSTART) {
268 longest = NULL;
269 len = 0;
270 for (; scan != NULL; scan = regnext(scan))
271 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
272 longest = OPERAND(scan);
273 len = strlen(OPERAND(scan));
274 }
275 r->regmust = longest;
276 r->regmlen = len;
277 }
278 }
279
280 return(r);
281 }
282
283 /*
284 - reg - regular expression, i.e. main body or parenthesized thing
285 *
286 * Caller must absorb opening parenthesis.
287 *
288 * Combining parenthesis handling with the base level of regular expression
289 * is a trifle forced, but the need to tie the tails of the branches to what
290 * follows makes it hard to avoid.
291 */
292 static char *
reg(paren,flagp)293 reg(paren, flagp)
294 int paren; /* Parenthesized? */
295 int *flagp;
296 {
297 register char *ret;
298 register char *br;
299 register char *ender;
300 register int parno;
301 int flags;
302
303 *flagp = HASWIDTH; /* Tentatively. */
304
305 /* Make an OPEN node, if parenthesized. */
306 if (paren) {
307 if (regnpar >= NSUBEXP)
308 FAIL("too many ()");
309 parno = regnpar;
310 regnpar++;
311 ret = regnode(OPEN+parno);
312 } else
313 ret = NULL;
314
315 /* Pick up the branches, linking them together. */
316 br = regbranch(&flags);
317 if (br == NULL)
318 return(NULL);
319 if (ret != NULL)
320 regtail(ret, br); /* OPEN -> first. */
321 else
322 ret = br;
323 if (!(flags&HASWIDTH))
324 *flagp &= ~HASWIDTH;
325 *flagp |= flags&SPSTART;
326 while (*regparse == '|' || *regparse == '\n') {
327 regparse++;
328 br = regbranch(&flags);
329 if (br == NULL)
330 return(NULL);
331 regtail(ret, br); /* BRANCH -> BRANCH. */
332 if (!(flags&HASWIDTH))
333 *flagp &= ~HASWIDTH;
334 *flagp |= flags&SPSTART;
335 }
336
337 /* Make a closing node, and hook it on the end. */
338 ender = regnode((paren) ? CLOSE+parno : END);
339 regtail(ret, ender);
340
341 /* Hook the tails of the branches to the closing node. */
342 for (br = ret; br != NULL; br = regnext(br))
343 regoptail(br, ender);
344
345 /* Check for proper termination. */
346 if (paren && *regparse++ != ')') {
347 FAIL("unmatched ()");
348 } else if (!paren && *regparse != '\0') {
349 if (*regparse == ')') {
350 FAIL("unmatched ()");
351 } else
352 FAIL("junk on end"); /* "Can't happen". */
353 /* NOTREACHED */
354 }
355
356 return(ret);
357 }
358
359 /*
360 - regbranch - one alternative of an | operator
361 *
362 * Implements the concatenation operator.
363 */
364 static char *
regbranch(flagp)365 regbranch(flagp)
366 int *flagp;
367 {
368 register char *ret;
369 register char *chain;
370 register char *latest;
371 int flags;
372
373 *flagp = WORST; /* Tentatively. */
374
375 ret = regnode(BRANCH);
376 chain = NULL;
377 while (*regparse != '\0' && *regparse != ')' &&
378 *regparse != '\n' && *regparse != '|') {
379 latest = regpiece(&flags);
380 if (latest == NULL)
381 return(NULL);
382 *flagp |= flags&HASWIDTH;
383 if (chain == NULL) /* First piece. */
384 *flagp |= flags&SPSTART;
385 else
386 regtail(chain, latest);
387 chain = latest;
388 }
389 if (chain == NULL) /* Loop ran zero times. */
390 (void) regnode(NOTHING);
391
392 return(ret);
393 }
394
395 /*
396 - regpiece - something followed by possible [*+?]
397 *
398 * Note that the branching code sequences used for ? and the general cases
399 * of * and + are somewhat optimized: they use the same NOTHING node as
400 * both the endmarker for their branch list and the body of the last branch.
401 * It might seem that this node could be dispensed with entirely, but the
402 * endmarker role is not redundant.
403 */
404 static char *
regpiece(flagp)405 regpiece(flagp)
406 int *flagp;
407 {
408 register char *ret;
409 register char op;
410 register char *next;
411 int flags;
412
413 ret = regatom(&flags);
414 if (ret == NULL)
415 return(NULL);
416
417 op = *regparse;
418 if (!ISMULT(op)) {
419 *flagp = flags;
420 return(ret);
421 }
422
423 if (!(flags&HASWIDTH) && op != '?')
424 FAIL("*+ operand could be empty");
425 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
426
427 if (op == '*' && (flags&SIMPLE))
428 reginsert(STAR, ret);
429 else if (op == '*') {
430 /* Emit x* as (x&|), where & means "self". */
431 reginsert(BRANCH, ret); /* Either x */
432 regoptail(ret, regnode(BACK)); /* and loop */
433 regoptail(ret, ret); /* back */
434 regtail(ret, regnode(BRANCH)); /* or */
435 regtail(ret, regnode(NOTHING)); /* null. */
436 } else if (op == '+' && (flags&SIMPLE))
437 reginsert(PLUS, ret);
438 else if (op == '+') {
439 /* Emit x+ as x(&|), where & means "self". */
440 next = regnode(BRANCH); /* Either */
441 regtail(ret, next);
442 regtail(regnode(BACK), ret); /* loop back */
443 regtail(next, regnode(BRANCH)); /* or */
444 regtail(ret, regnode(NOTHING)); /* null. */
445 } else if (op == '?') {
446 /* Emit x? as (x|) */
447 reginsert(BRANCH, ret); /* Either x */
448 regtail(ret, regnode(BRANCH)); /* or */
449 next = regnode(NOTHING); /* null. */
450 regtail(ret, next);
451 regoptail(ret, next);
452 }
453 regparse++;
454 if (ISMULT(*regparse))
455 FAIL("nested *?+");
456
457 return(ret);
458 }
459
460 /*
461 - regatom - the lowest level
462 *
463 * Optimization: gobbles an entire sequence of ordinary characters so that
464 * it can turn them into a single node, which is smaller to store and
465 * faster to run. Backslashed characters are exceptions, each becoming a
466 * separate node; the code is simpler that way and it's not worth fixing.
467 */
468 static char *
regatom(flagp)469 regatom(flagp)
470 int *flagp;
471 {
472 register char *ret;
473 int flags;
474
475 *flagp = WORST; /* Tentatively. */
476
477 switch (*regparse++) {
478 /* FIXME: these chars only have meaning at beg/end of pat? */
479 case '^':
480 ret = regnode(BOL);
481 break;
482 case '$':
483 ret = regnode(EOL);
484 break;
485 case '.':
486 ret = regnode(ANY);
487 *flagp |= HASWIDTH|SIMPLE;
488 break;
489 case '[': {
490 register int class;
491 register int classend;
492
493 if (*regparse == '^') { /* Complement of range. */
494 ret = regnode(ANYBUT);
495 regparse++;
496 } else
497 ret = regnode(ANYOF);
498 if (*regparse == ']' || *regparse == '-')
499 regc(*regparse++);
500 while (*regparse != '\0' && *regparse != ']') {
501 if (*regparse == '-') {
502 regparse++;
503 if (*regparse == ']' || *regparse == '\0')
504 regc('-');
505 else {
506 class = UCHARAT(regparse-2)+1;
507 classend = UCHARAT(regparse);
508 if (class > classend+1)
509 FAIL("invalid [] range");
510 for (; class <= classend; class++)
511 regc(class);
512 regparse++;
513 }
514 } else
515 regc(*regparse++);
516 }
517 regc('\0');
518 if (*regparse != ']')
519 FAIL("unmatched []");
520 regparse++;
521 *flagp |= HASWIDTH|SIMPLE;
522 }
523 break;
524 case '(':
525 ret = reg(1, &flags);
526 if (ret == NULL)
527 return(NULL);
528 *flagp |= flags&(HASWIDTH|SPSTART);
529 break;
530 case '\0':
531 case '|':
532 case '\n':
533 case ')':
534 FAIL("internal urp"); /* Supposed to be caught earlier. */
535 break;
536 case '?':
537 case '+':
538 case '*':
539 FAIL("?+* follows nothing");
540 break;
541 case '\\':
542 switch (*regparse++) {
543 case '\0':
544 FAIL("trailing \\");
545 break;
546 case '<':
547 ret = regnode(WORDA);
548 break;
549 case '>':
550 ret = regnode(WORDZ);
551 break;
552 /* FIXME: Someday handle \1, \2, ... */
553 default:
554 /* Handle general quoted chars in exact-match routine */
555 goto de_fault;
556 }
557 break;
558 de_fault:
559 default:
560 /*
561 * Encode a string of characters to be matched exactly.
562 *
563 * This is a bit tricky due to quoted chars and due to
564 * '*', '+', and '?' taking the SINGLE char previous
565 * as their operand.
566 *
567 * On entry, the char at regparse[-1] is going to go
568 * into the string, no matter what it is. (It could be
569 * following a \ if we are entered from the '\' case.)
570 *
571 * Basic idea is to pick up a good char in ch and
572 * examine the next char. If it's *+? then we twiddle.
573 * If it's \ then we frozzle. If it's other magic char
574 * we push ch and terminate the string. If none of the
575 * above, we push ch on the string and go around again.
576 *
577 * regprev is used to remember where "the current char"
578 * starts in the string, if due to a *+? we need to back
579 * up and put the current char in a separate, 1-char, string.
580 * When regprev is NULL, ch is the only char in the
581 * string; this is used in *+? handling, and in setting
582 * flags |= SIMPLE at the end.
583 */
584 {
585 char *regprev;
586 register char ch;
587
588 regparse--; /* Look at cur char */
589 ret = regnode(EXACTLY);
590 for ( regprev = 0 ; ; ) {
591 ch = *regparse++; /* Get current char */
592 switch (*regparse) { /* look at next one */
593
594 default:
595 regc(ch); /* Add cur to string */
596 break;
597
598 case '.': case '[': case '(':
599 case ')': case '|': case '\n':
600 case '$': case '^':
601 case '\0':
602 /* FIXME, $ and ^ should not always be magic */
603 magic:
604 regc(ch); /* dump cur char */
605 goto done; /* and we are done */
606
607 case '?': case '+': case '*':
608 if (!regprev) /* If just ch in str, */
609 goto magic; /* use it */
610 /* End mult-char string one early */
611 regparse = regprev; /* Back up parse */
612 goto done;
613
614 case '\\':
615 regc(ch); /* Cur char OK */
616 switch (regparse[1]){ /* Look after \ */
617 case '\0':
618 case '<':
619 case '>':
620 /* FIXME: Someday handle \1, \2, ... */
621 goto done; /* Not quoted */
622 default:
623 /* Backup point is \, scan * point is after it. */
624 regprev = regparse;
625 regparse++;
626 continue; /* NOT break; */
627 }
628 }
629 regprev = regparse; /* Set backup point */
630 }
631 done:
632 regc('\0');
633 *flagp |= HASWIDTH;
634 if (!regprev) /* One char? */
635 *flagp |= SIMPLE;
636 }
637 break;
638 }
639
640 return(ret);
641 }
642
643 /*
644 - regnode - emit a node
645 */
646 static char * /* Location. */
regnode(op)647 regnode(op)
648 char op;
649 {
650 register char *ret;
651 register char *ptr;
652
653 ret = regcode;
654 if (ret == ®dummy) {
655 regsize += 3;
656 return(ret);
657 }
658
659 ptr = ret;
660 *ptr++ = op;
661 *ptr++ = '\0'; /* Null "next" pointer. */
662 *ptr++ = '\0';
663 regcode = ptr;
664
665 return(ret);
666 }
667
668 /*
669 - regc - emit (if appropriate) a byte of code
670 */
671 static void
regc(b)672 regc(b)
673 char b;
674 {
675 if (regcode != ®dummy)
676 *regcode++ = b;
677 else
678 regsize++;
679 }
680
681 /*
682 - reginsert - insert an operator in front of already-emitted operand
683 *
684 * Means relocating the operand.
685 */
686 static void
reginsert(op,opnd)687 reginsert(op, opnd)
688 char op;
689 char *opnd;
690 {
691 register char *src;
692 register char *dst;
693 register char *place;
694
695 if (regcode == ®dummy) {
696 regsize += 3;
697 return;
698 }
699
700 src = regcode;
701 regcode += 3;
702 dst = regcode;
703 while (src > opnd)
704 *--dst = *--src;
705
706 place = opnd; /* Op node, where operand used to be. */
707 *place++ = op;
708 *place++ = '\0';
709 *place++ = '\0';
710 }
711
712 /*
713 - regtail - set the next-pointer at the end of a node chain
714 */
715 static void
regtail(p,val)716 regtail(p, val)
717 char *p;
718 char *val;
719 {
720 register char *scan;
721 register char *temp;
722 register int offset;
723
724 if (p == ®dummy)
725 return;
726
727 /* Find last node. */
728 scan = p;
729 for (;;) {
730 temp = regnext(scan);
731 if (temp == NULL)
732 break;
733 scan = temp;
734 }
735
736 if (OP(scan) == BACK)
737 offset = scan - val;
738 else
739 offset = val - scan;
740 *(scan+1) = (offset>>8)&0377;
741 *(scan+2) = offset&0377;
742 }
743
744 /*
745 - regoptail - regtail on operand of first argument; nop if operandless
746 */
747 static void
regoptail(p,val)748 regoptail(p, val)
749 char *p;
750 char *val;
751 {
752 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
753 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
754 return;
755 regtail(OPERAND(p), val);
756 }
757
758 /*
759 * regexec and friends
760 */
761
762 /*
763 * Global work variables for regexec().
764 */
765 static char *reginput; /* String-input pointer. */
766 static char *regbol; /* Beginning of input, for ^ check. */
767 static char **regstartp; /* Pointer to startp array. */
768 static char **regendp; /* Ditto for endp. */
769
770 /*
771 * Forwards.
772 */
773 STATIC int regtry();
774 STATIC int regmatch();
775 STATIC int regrepeat();
776
777 #ifdef DEBUG
778 int regnarrate = 0;
779 void regdump();
780 STATIC char *regprop();
781 #endif
782
783 /*
784 - regexec - match a regexp against a string
785 */
786 int
regexec(prog,string)787 regexec(prog, string)
788 register const regexp *prog;
789 register const char *string;
790 {
791 register char *s;
792 extern char *strchr();
793
794 /* Be paranoid... */
795 if (prog == NULL || string == NULL) {
796 regerror("NULL parameter");
797 return(0);
798 }
799
800 /* Check validity of program. */
801 if (UCHARAT(prog->program) != MAGIC) {
802 regerror("corrupted program");
803 return(0);
804 }
805
806 /* If there is a "must appear" string, look for it. */
807 if (prog->regmust != NULL) {
808 s = (char *)string;
809 while ((s = strchr(s, prog->regmust[0])) != NULL) {
810 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
811 break; /* Found it. */
812 s++;
813 }
814 if (s == NULL) /* Not present. */
815 return(0);
816 }
817
818 /* Mark beginning of line for ^ . */
819 regbol = (char *)string;
820
821 /* Simplest case: anchored match need be tried only once. */
822 if (prog->reganch)
823 return(regtry(prog, string));
824
825 /* Messy cases: unanchored match. */
826 s = (char *)string;
827 if (prog->regstart != '\0')
828 /* We know what char it must start with. */
829 while ((s = strchr(s, prog->regstart)) != NULL) {
830 if (regtry(prog, s))
831 return(1);
832 s++;
833 }
834 else
835 /* We don't -- general case. */
836 do {
837 if (regtry(prog, s))
838 return(1);
839 } while (*s++ != '\0');
840
841 /* Failure. */
842 return(0);
843 }
844
845 /*
846 - regtry - try match at specific point
847 */
848 static int /* 0 failure, 1 success */
regtry(prog,string)849 regtry(prog, string)
850 regexp *prog;
851 char *string;
852 {
853 register int i;
854 register char **sp;
855 register char **ep;
856
857 reginput = string;
858 regstartp = prog->startp;
859 regendp = prog->endp;
860
861 sp = prog->startp;
862 ep = prog->endp;
863 for (i = NSUBEXP; i > 0; i--) {
864 *sp++ = NULL;
865 *ep++ = NULL;
866 }
867 if (regmatch(prog->program + 1)) {
868 prog->startp[0] = string;
869 prog->endp[0] = reginput;
870 return(1);
871 } else
872 return(0);
873 }
874
875 /*
876 - regmatch - main matching routine
877 *
878 * Conceptually the strategy is simple: check to see whether the current
879 * node matches, call self recursively to see whether the rest matches,
880 * and then act accordingly. In practice we make some effort to avoid
881 * recursion, in particular by going through "ordinary" nodes (that don't
882 * need to know whether the rest of the match failed) by a loop instead of
883 * by recursion.
884 */
885 static int /* 0 failure, 1 success */
regmatch(prog)886 regmatch(prog)
887 char *prog;
888 {
889 register char *scan; /* Current node. */
890 char *next; /* Next node. */
891 extern char *strchr();
892
893 scan = prog;
894 #ifdef DEBUG
895 if (scan != NULL && regnarrate)
896 fprintf(stderr, "%s(\n", regprop(scan));
897 #endif
898 while (scan != NULL) {
899 #ifdef DEBUG
900 if (regnarrate)
901 fprintf(stderr, "%s...\n", regprop(scan));
902 #endif
903 next = regnext(scan);
904
905 switch (OP(scan)) {
906 case BOL:
907 if (reginput != regbol)
908 return(0);
909 break;
910 case EOL:
911 if (*reginput != '\0')
912 return(0);
913 break;
914 case WORDA:
915 /* Must be looking at a letter, digit, or _ */
916 if ((!isalnum(*reginput)) && *reginput != '_')
917 return(0);
918 /* Prev must be BOL or nonword */
919 if (reginput > regbol &&
920 (isalnum(reginput[-1]) || reginput[-1] == '_'))
921 return(0);
922 break;
923 case WORDZ:
924 /* Must be looking at non letter, digit, or _ */
925 if (isalnum(*reginput) || *reginput == '_')
926 return(0);
927 /* We don't care what the previous char was */
928 break;
929 case ANY:
930 if (*reginput == '\0')
931 return(0);
932 reginput++;
933 break;
934 case EXACTLY: {
935 register int len;
936 register char *opnd;
937
938 opnd = OPERAND(scan);
939 /* Inline the first character, for speed. */
940 if (*opnd != *reginput)
941 return(0);
942 len = strlen(opnd);
943 if (len > 1 && strncmp(opnd, reginput, len) != 0)
944 return(0);
945 reginput += len;
946 }
947 break;
948 case ANYOF:
949 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
950 return(0);
951 reginput++;
952 break;
953 case ANYBUT:
954 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
955 return(0);
956 reginput++;
957 break;
958 case NOTHING:
959 break;
960 case BACK:
961 break;
962 case OPEN+1:
963 case OPEN+2:
964 case OPEN+3:
965 case OPEN+4:
966 case OPEN+5:
967 case OPEN+6:
968 case OPEN+7:
969 case OPEN+8:
970 case OPEN+9: {
971 register int no;
972 register char *save;
973
974 no = OP(scan) - OPEN;
975 save = reginput;
976
977 if (regmatch(next)) {
978 /*
979 * Don't set startp if some later
980 * invocation of the same parentheses
981 * already has.
982 */
983 if (regstartp[no] == NULL)
984 regstartp[no] = save;
985 return(1);
986 } else
987 return(0);
988 }
989 break;
990 case CLOSE+1:
991 case CLOSE+2:
992 case CLOSE+3:
993 case CLOSE+4:
994 case CLOSE+5:
995 case CLOSE+6:
996 case CLOSE+7:
997 case CLOSE+8:
998 case CLOSE+9: {
999 register int no;
1000 register char *save;
1001
1002 no = OP(scan) - CLOSE;
1003 save = reginput;
1004
1005 if (regmatch(next)) {
1006 /*
1007 * Don't set endp if some later
1008 * invocation of the same parentheses
1009 * already has.
1010 */
1011 if (regendp[no] == NULL)
1012 regendp[no] = save;
1013 return(1);
1014 } else
1015 return(0);
1016 }
1017 break;
1018 case BRANCH: {
1019 register char *save;
1020
1021 if (OP(next) != BRANCH) /* No choice. */
1022 next = OPERAND(scan); /* Avoid recursion. */
1023 else {
1024 do {
1025 save = reginput;
1026 if (regmatch(OPERAND(scan)))
1027 return(1);
1028 reginput = save;
1029 scan = regnext(scan);
1030 } while (scan != NULL && OP(scan) == BRANCH);
1031 return(0);
1032 /* NOTREACHED */
1033 }
1034 }
1035 break;
1036 case STAR:
1037 case PLUS: {
1038 register char nextch;
1039 register int no;
1040 register char *save;
1041 register int min;
1042
1043 /*
1044 * Lookahead to avoid useless match attempts
1045 * when we know what character comes next.
1046 */
1047 nextch = '\0';
1048 if (OP(next) == EXACTLY)
1049 nextch = *OPERAND(next);
1050 min = (OP(scan) == STAR) ? 0 : 1;
1051 save = reginput;
1052 no = regrepeat(OPERAND(scan));
1053 while (no >= min) {
1054 /* If it could work, try it. */
1055 if (nextch == '\0' || *reginput == nextch)
1056 if (regmatch(next))
1057 return(1);
1058 /* Couldn't or didn't -- back up. */
1059 no--;
1060 reginput = save + no;
1061 }
1062 return(0);
1063 }
1064 break;
1065 case END:
1066 return(1); /* Success! */
1067 break;
1068 default:
1069 regerror("memory corruption");
1070 return(0);
1071 break;
1072 }
1073
1074 scan = next;
1075 }
1076
1077 /*
1078 * We get here only if there's trouble -- normally "case END" is
1079 * the terminating point.
1080 */
1081 regerror("corrupted pointers");
1082 return(0);
1083 }
1084
1085 /*
1086 - regrepeat - repeatedly match something simple, report how many
1087 */
1088 static int
regrepeat(p)1089 regrepeat(p)
1090 char *p;
1091 {
1092 register int count = 0;
1093 register char *scan;
1094 register char *opnd;
1095
1096 scan = reginput;
1097 opnd = OPERAND(p);
1098 switch (OP(p)) {
1099 case ANY:
1100 count = strlen(scan);
1101 scan += count;
1102 break;
1103 case EXACTLY:
1104 while (*opnd == *scan) {
1105 count++;
1106 scan++;
1107 }
1108 break;
1109 case ANYOF:
1110 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1111 count++;
1112 scan++;
1113 }
1114 break;
1115 case ANYBUT:
1116 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1117 count++;
1118 scan++;
1119 }
1120 break;
1121 default: /* Oh dear. Called inappropriately. */
1122 regerror("internal foulup");
1123 count = 0; /* Best compromise. */
1124 break;
1125 }
1126 reginput = scan;
1127
1128 return(count);
1129 }
1130
1131 /*
1132 - regnext - dig the "next" pointer out of a node
1133 */
1134 static char *
regnext(p)1135 regnext(p)
1136 register char *p;
1137 {
1138 register int offset;
1139
1140 if (p == ®dummy)
1141 return(NULL);
1142
1143 offset = NEXT(p);
1144 if (offset == 0)
1145 return(NULL);
1146
1147 if (OP(p) == BACK)
1148 return(p-offset);
1149 else
1150 return(p+offset);
1151 }
1152
1153 #ifdef DEBUG
1154
1155 STATIC char *regprop();
1156
1157 /*
1158 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1159 */
1160 void
regdump(r)1161 regdump(r)
1162 regexp *r;
1163 {
1164 register char *s;
1165 register char op = EXACTLY; /* Arbitrary non-END op. */
1166 register char *next;
1167 extern char *strchr();
1168
1169
1170 s = r->program + 1;
1171 while (op != END) { /* While that wasn't END last time... */
1172 op = OP(s);
1173 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1174 next = regnext(s);
1175 if (next == NULL) /* Next ptr. */
1176 printf("(0)");
1177 else
1178 printf("(%d)", (s-r->program)+(next-s));
1179 s += 3;
1180 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1181 /* Literal string, where present. */
1182 while (*s != '\0') {
1183 putchar(*s);
1184 s++;
1185 }
1186 s++;
1187 }
1188 putchar('\n');
1189 }
1190
1191 /* Header fields of interest. */
1192 if (r->regstart != '\0')
1193 printf("start `%c' ", r->regstart);
1194 if (r->reganch)
1195 printf("anchored ");
1196 if (r->regmust != NULL)
1197 printf("must have \"%s\"", r->regmust);
1198 printf("\n");
1199 }
1200
1201 /*
1202 - regprop - printable representation of opcode
1203 */
1204 static char *
regprop(op)1205 regprop(op)
1206 char *op;
1207 {
1208 register char *p;
1209 static char buf[50];
1210
1211 (void) strcpy(buf, ":");
1212
1213 switch (OP(op)) {
1214 case BOL:
1215 p = "BOL";
1216 break;
1217 case EOL:
1218 p = "EOL";
1219 break;
1220 case ANY:
1221 p = "ANY";
1222 break;
1223 case ANYOF:
1224 p = "ANYOF";
1225 break;
1226 case ANYBUT:
1227 p = "ANYBUT";
1228 break;
1229 case BRANCH:
1230 p = "BRANCH";
1231 break;
1232 case EXACTLY:
1233 p = "EXACTLY";
1234 break;
1235 case NOTHING:
1236 p = "NOTHING";
1237 break;
1238 case BACK:
1239 p = "BACK";
1240 break;
1241 case END:
1242 p = "END";
1243 break;
1244 case OPEN+1:
1245 case OPEN+2:
1246 case OPEN+3:
1247 case OPEN+4:
1248 case OPEN+5:
1249 case OPEN+6:
1250 case OPEN+7:
1251 case OPEN+8:
1252 case OPEN+9:
1253 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1254 p = NULL;
1255 break;
1256 case CLOSE+1:
1257 case CLOSE+2:
1258 case CLOSE+3:
1259 case CLOSE+4:
1260 case CLOSE+5:
1261 case CLOSE+6:
1262 case CLOSE+7:
1263 case CLOSE+8:
1264 case CLOSE+9:
1265 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1266 p = NULL;
1267 break;
1268 case STAR:
1269 p = "STAR";
1270 break;
1271 case PLUS:
1272 p = "PLUS";
1273 break;
1274 case WORDA:
1275 p = "WORDA";
1276 break;
1277 case WORDZ:
1278 p = "WORDZ";
1279 break;
1280 default:
1281 regerror("corrupted opcode");
1282 break;
1283 }
1284 if (p != NULL)
1285 (void) strcat(buf, p);
1286 return(buf);
1287 }
1288 #endif
1289
1290 /*
1291 * The following is provided for those people who do not have strcspn() in
1292 * their C libraries. They should get off their butts and do something
1293 * about it; at least one public-domain implementation of those (highly
1294 * useful) string routines has been published on Usenet.
1295 */
1296 #ifdef STRCSPN
1297 /*
1298 * strcspn - find length of initial segment of s1 consisting entirely
1299 * of characters not from s2
1300 */
1301
1302 static int
strcspn(s1,s2)1303 strcspn(s1, s2)
1304 char *s1;
1305 char *s2;
1306 {
1307 register char *scan1;
1308 register char *scan2;
1309 register int count;
1310
1311 count = 0;
1312 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1313 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1314 if (*scan1 == *scan2++)
1315 return(count);
1316 count++;
1317 }
1318 return(count);
1319 }
1320 #endif
1321