1 /*
2 * reg_compile and reg_execute
3 *
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
6 *
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
10 *
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
14 *
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
17 *
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *
21 * Beware that some of this code is subtly aware of the way operator
22 * precedence is structured in regular expressions. Serious changes in
23 * regular-expression syntax might require a total rethink.
24 */
25 #include <stdio.h>
26 #include <stdlib.h> /* for malloc() */
27 #include <string.h> /* for strlen() */
28
29 #include "regexpr.h"
30
31 /*
32 * The first byte of the regexp internal "program" is actually this magic
33 * number; the start node begins in the second byte.
34 */
35 #define MAGIC 0234
36
37 /*
38 * The "internal use only" fields in regexpr.h are present to pass info from
39 * compile to execute that permits the execute phase to run lots faster on
40 * simple cases. They are:
41 *
42 * regstart char that must begin a match; '\0' if none obvious
43 * reganch is the match anchored (at beginning-of-line only)?
44 * regmust string (pointer into program) that match must include, or NULL
45 * regmlen length of regmust string
46 *
47 * Regstart and reganch permit very fast decisions on suitable starting points
48 * for a match, cutting down the work a lot. Regmust permits fast rejection
49 * of lines that cannot possibly match. The regmust tests are costly enough
50 * that reg_compile() supplies a regmust only if the r.e. contains something
51 * potentially expensive (at present, the only such thing detected is * or +
52 * at the start of the r.e., which can involve a lot of backup). Regmlen is
53 * supplied because the test in reg_execute() needs it and reg_compile() is
54 * computing it anyway.
55 */
56
57 /*
58 * Structure for regexp "program". This is essentially a linear encoding
59 * of a nondeterministic finite-state machine (aka syntax charts or
60 * "railroad normal form" in parsing technology). Each node is an opcode
61 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
62 * all nodes except BRANCH implement concatenation; a "next" pointer with
63 * a BRANCH on both ends of it is connecting two alternatives. (Here we
64 * have one of the subtle syntax dependencies: an individual BRANCH (as
65 * opposed to a collection of them) is never concatenated with anything
66 * because of operator precedence.) The operand of some types of node is
67 * a literal string; for others, it is a node leading into a sub-FSM. In
68 * particular, the operand of a BRANCH node is the first node of the branch.
69 * (NB this is *not* a tree structure: the tail of the branch connects
70 * to the thing following the set of BRANCHes.) The opcodes are:
71 */
72
73 /* definition number opnd? meaning */
74 #define END 0 /* no End of program. */
75 #define BOL 1 /* no Match "" at beginning of line. */
76 #define EOL 2 /* no Match "" at end of line. */
77 #define ANY 3 /* no Match any one character. */
78 #define ANYOF 4 /* str Match any character in this string. */
79 #define ANYBUT 5 /* str Match any character not in this string. */
80 #define BRANCH 6 /* node Match this alternative, or the next... */
81 #define BACK 7 /* no Match "", "next" ptr points backward. */
82 #define EXACTLY 8 /* str Match this string. */
83 #define NOTHING 9 /* no Match empty string. */
84 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
85 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
86 #define OPEN 20 /* no Mark this point in input as start of #n. */
87 /* OPEN+1 is number 1, etc. */
88 #define CLOSE 30 /* no Analogous to OPEN. */
89
90 /*
91 * Opcode notes:
92 *
93 * BRANCH The set of branches constituting a single choice are hooked
94 * together with their "next" pointers, since precedence prevents
95 * anything being concatenated to any individual branch. The
96 * "next" pointer of the last BRANCH in a choice points to the
97 * thing following the whole choice. This is also where the
98 * final "next" pointer of each individual branch points; each
99 * branch starts with the operand node of a BRANCH node.
100 *
101 * BACK Normal "next" pointers all implicitly point forward; BACK
102 * exists to make loop structures possible.
103 *
104 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
105 * BRANCH structures using BACK. Simple cases (one character
106 * per match) are implemented with STAR and PLUS for speed
107 * and to minimize recursive plunges.
108 *
109 * OPEN,CLOSE ...are numbered at compile time.
110 */
111
112 /*
113 * A node is one char of opcode followed by two chars of "next" pointer.
114 * "Next" pointers are stored as two 8-bit pieces, high order first. The
115 * value is a positive offset from the opcode of the node containing it.
116 * An operand, if any, simply follows the node. (Note that much of the
117 * code generation knows about this implicit relationship.)
118 *
119 * Using two bytes for the "next" pointer is vast overkill for most things,
120 * but allows patterns to get big without disasters.
121 */
122 #define OP(p) (*(p))
123 #define NEXT(p) (((*((p)+1)&0377)<<8) + *((p)+2)&0377)
124 #define OPERAND(p) ((p) + 3)
125
126 /*
127 * See regmagic.h for one further detail of program structure.
128 */
129
130
131 /*
132 * Utility definitions.
133 */
134 #ifndef CHARBITS
135 #define UCHARAT(p) ((int)*(unsigned char *)(p))
136 #else
137 #define UCHARAT(p) ((int)*(p)&CHARBITS)
138 #endif
139
140 #define FAIL(m) { reg_error(m); return(NULL); }
141 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
142 #define META "^$.[()|?+*\\"
143
144 /*
145 * Flags to be passed up and down.
146 */
147 #define HASWIDTH 01 /* Known never to match null string. */
148 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
149 #define SPSTART 04 /* Starts with * or +. */
150 #define WORST 0 /* Worst case. */
151
152 /*
153 * Global work variables for reg_compile().
154 */
155 static char *regparse; /* Input-scan pointer. */
156 static int regnpar; /* () count. */
157 static char regdummy;
158 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
159 static long regsize; /* Code size. */
160
161 /*
162 * Forward declarations for reg_compile()'s friends.
163 */
164 #ifndef STATIC
165 #define STATIC static
166 #endif
167 STATIC char *reg(int, int *);
168 STATIC char *regbranch(int *);
169 STATIC char *regpiece(int *);
170 STATIC char *regatom(int *);
171 STATIC char *regnode(char);
172 STATIC char *regnext(char *);
173 STATIC void regc(char);
174 STATIC void reginsert(char, char *);
175 STATIC void regtail(char *, char *);
176 STATIC void regoptail(char *, char *);
177 #ifdef STRCSPN
178 STATIC int strcspn(char *, char *);
179 #endif
180
181 /*
182 - reg_compile - compile a regular expression into internal code
183 *
184 * We can't allocate space until we know how big the compiled form will be,
185 * but we can't compile it (and thus know how big it is) until we've got a
186 * place to put the code. So we cheat: we compile it twice, once with code
187 * generation turned off and size counting turned on, and once "for real".
188 * This also means that we don't allocate space until we are sure that the
189 * thing really will compile successfully, and we never have to move the
190 * code and thus invalidate pointers into it. (Note that it has to be in
191 * one piece because free() must be able to free it all.)
192 *
193 * Beware that the optimization-preparation code in here knows about some
194 * of the structure of the compiled regexp.
195 */
reg_compile(char * exp)196 regexpr *reg_compile(char *exp)
197 {
198 register regexpr *r;
199 register char *scan;
200 register char *longest;
201 register int len;
202 int flags;
203 #if !defined(__DragonFly__)
204 extern char *malloc();
205 #endif
206
207 if (exp == NULL)
208 FAIL("NULL argument");
209
210 /* First pass: determine size, legality. */
211 regparse = exp;
212 regnpar = 1;
213 regsize = 0L;
214 regcode = ®dummy;
215 regc(MAGIC);
216 if (reg(0, &flags) == NULL)
217 return(NULL);
218
219 /* Small enough for pointer-storage convention? */
220 if (regsize >= 32767L) /* Probably could be 65535L. */
221 FAIL("regexpr too big");
222
223 /* Allocate space. */
224 r = (regexpr *)malloc(sizeof(regexpr) + (unsigned)regsize);
225 if (r == NULL)
226 FAIL("out of space");
227
228 /* Second pass: emit code. */
229 regparse = exp;
230 regnpar = 1;
231 regcode = r->program;
232 regc(MAGIC);
233 if (reg(0, &flags) == NULL)
234 return(NULL);
235
236 /* Dig out information for optimizations. */
237 r->regstart = '\0'; /* Worst-case defaults. */
238 r->reganch = 0;
239 r->regmust = NULL;
240 r->regmlen = 0;
241 scan = r->program+1; /* First BRANCH. */
242 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
243 scan = OPERAND(scan);
244
245 /* Starting-point info. */
246 if (OP(scan) == EXACTLY)
247 r->regstart = *OPERAND(scan);
248 else if (OP(scan) == BOL)
249 r->reganch++;
250
251 /*
252 * If there's something expensive in the r.e., find the
253 * longest literal string that must appear and make it the
254 * regmust. Resolve ties in favor of later strings, since
255 * the regstart check works with the beginning of the r.e.
256 * and avoiding duplication strengthens checking. Not a
257 * strong reason, but sufficient in the absence of others.
258 */
259 if (flags&SPSTART) {
260 longest = NULL;
261 len = 0;
262 for (; scan != NULL; scan = regnext(scan))
263 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
264 longest = OPERAND(scan);
265 len = strlen(OPERAND(scan));
266 }
267 r->regmust = longest;
268 r->regmlen = len;
269 }
270 }
271
272 return(r);
273 }
274
275 /*
276 - reg - regular expression, i.e. main body or parenthesized thing
277 *
278 * Caller must absorb opening parenthesis.
279 *
280 * Combining parenthesis handling with the base level of regular expression
281 * is a trifle forced, but the need to tie the tails of the branches to what
282 * follows makes it hard to avoid.
283 */
reg(int paren,int * flagp)284 static char *reg(int paren, int *flagp)
285 {
286 register char *ret;
287 register char *br;
288 register char *ender;
289 register int parno;
290 int flags;
291
292 *flagp = HASWIDTH; /* Tentatively. */
293
294 /* Make an OPEN node, if parenthesized. */
295 if (paren) {
296 if (regnpar >= NSUBEXP)
297 FAIL("too many ()");
298 parno = regnpar;
299 regnpar++;
300 ret = regnode(OPEN+parno);
301 } else
302 ret = NULL;
303
304 /* Pick up the branches, linking them together. */
305 br = regbranch(&flags);
306 if (br == NULL)
307 return(NULL);
308 if (ret != NULL)
309 regtail(ret, br); /* OPEN -> first. */
310 else
311 ret = br;
312 if (!(flags&HASWIDTH))
313 *flagp &= ~HASWIDTH;
314 *flagp |= flags&SPSTART;
315 while (*regparse == '|') {
316 regparse++;
317 br = regbranch(&flags);
318 if (br == NULL)
319 return(NULL);
320 regtail(ret, br); /* BRANCH -> BRANCH. */
321 if (!(flags&HASWIDTH))
322 *flagp &= ~HASWIDTH;
323 *flagp |= flags&SPSTART;
324 }
325
326 /* Make a closing node, and hook it on the end. */
327 ender = regnode((paren) ? CLOSE+parno : END);
328 regtail(ret, ender);
329
330 /* Hook the tails of the branches to the closing node. */
331 for (br = ret; br != NULL; br = regnext(br))
332 regoptail(br, ender);
333
334 /* Check for proper termination. */
335 if (paren && *regparse++ != ')') {
336 FAIL("unmatched ()");
337 } else if (!paren && *regparse != '\0') {
338 if (*regparse == ')') {
339 FAIL("unmatched ()");
340 } else
341 FAIL("junk on end"); /* "Can't happen". */
342 /* NOTREACHED */
343 }
344
345 return(ret);
346 }
347
348 /*
349 - regbranch - one alternative of an | operator
350 *
351 * Implements the concatenation operator.
352 */
regbranch(int * flagp)353 static char *regbranch(int *flagp)
354 {
355 register char *ret;
356 register char *chain;
357 register char *latest;
358 int flags;
359
360 *flagp = WORST; /* Tentatively. */
361
362 ret = regnode(BRANCH);
363 chain = NULL;
364 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
365 latest = regpiece(&flags);
366 if (latest == NULL)
367 return(NULL);
368 *flagp |= flags&HASWIDTH;
369 if (chain == NULL) /* First piece. */
370 *flagp |= flags&SPSTART;
371 else
372 regtail(chain, latest);
373 chain = latest;
374 }
375 if (chain == NULL) /* Loop ran zero times. */
376 (void) regnode(NOTHING);
377
378 return(ret);
379 }
380
381 /*
382 - regpiece - something followed by possible [*+?]
383 *
384 * Note that the branching code sequences used for ? and the general cases
385 * of * and + are somewhat optimized: they use the same NOTHING node as
386 * both the endmarker for their branch list and the body of the last branch.
387 * It might seem that this node could be dispensed with entirely, but the
388 * endmarker role is not redundant.
389 */
regpiece(int * flagp)390 static char *regpiece(int *flagp)
391 {
392 register char *ret;
393 register char op;
394 register char *next;
395 int flags;
396
397 ret = regatom(&flags);
398 if (ret == NULL)
399 return(NULL);
400
401 op = *regparse;
402 if (!ISMULT(op)) {
403 *flagp = flags;
404 return(ret);
405 }
406
407 if (!(flags&HASWIDTH) && op != '?')
408 FAIL("*+ operand could be empty");
409 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
410
411 if (op == '*' && (flags&SIMPLE))
412 reginsert(STAR, ret);
413 else if (op == '*') {
414 /* Emit x* as (x&|), where & means "self". */
415 reginsert(BRANCH, ret); /* Either x */
416 regoptail(ret, regnode(BACK)); /* and loop */
417 regoptail(ret, ret); /* back */
418 regtail(ret, regnode(BRANCH)); /* or */
419 regtail(ret, regnode(NOTHING)); /* null. */
420 } else if (op == '+' && (flags&SIMPLE))
421 reginsert(PLUS, ret);
422 else if (op == '+') {
423 /* Emit x+ as x(&|), where & means "self". */
424 next = regnode(BRANCH); /* Either */
425 regtail(ret, next);
426 regtail(regnode(BACK), ret); /* loop back */
427 regtail(next, regnode(BRANCH)); /* or */
428 regtail(ret, regnode(NOTHING)); /* null. */
429 } else if (op == '?') {
430 /* Emit x? as (x|) */
431 reginsert(BRANCH, ret); /* Either x */
432 regtail(ret, regnode(BRANCH)); /* or */
433 next = regnode(NOTHING); /* null. */
434 regtail(ret, next);
435 regoptail(ret, next);
436 }
437 regparse++;
438 if (ISMULT(*regparse))
439 FAIL("nested *?+");
440
441 return(ret);
442 }
443
444 /*
445 - regatom - the lowest level
446 *
447 * Optimization: gobbles an entire sequence of ordinary characters so that
448 * it can turn them into a single node, which is smaller to store and
449 * faster to run. Backslashed characters are exceptions, each becoming a
450 * separate node; the code is simpler that way and it's not worth fixing.
451 */
regatom(int * flagp)452 static char *regatom(int *flagp)
453 {
454 register char *ret;
455 int flags;
456
457 *flagp = WORST; /* Tentatively. */
458
459 switch (*regparse++) {
460 case '^':
461 ret = regnode(BOL);
462 break;
463 case '$':
464 ret = regnode(EOL);
465 break;
466 case '.':
467 ret = regnode(ANY);
468 *flagp |= HASWIDTH|SIMPLE;
469 break;
470 case '[': {
471 register int class;
472 register int classend;
473
474 if (*regparse == '^') { /* Complement of range. */
475 ret = regnode(ANYBUT);
476 regparse++;
477 } else
478 ret = regnode(ANYOF);
479 if (*regparse == ']' || *regparse == '-')
480 regc(*regparse++);
481 while (*regparse != '\0' && *regparse != ']') {
482 if (*regparse == '-') {
483 regparse++;
484 if (*regparse == ']' || *regparse == '\0')
485 regc('-');
486 else {
487 class = UCHARAT(regparse-2)+1;
488 classend = UCHARAT(regparse);
489 if (class > classend+1)
490 FAIL("invalid [] range");
491 for (; class <= classend; class++)
492 regc(class);
493 regparse++;
494 }
495 } else
496 regc(*regparse++);
497 }
498 regc('\0');
499 if (*regparse != ']')
500 FAIL("unmatched []");
501 regparse++;
502 *flagp |= HASWIDTH|SIMPLE;
503 }
504 break;
505 case '(':
506 ret = reg(1, &flags);
507 if (ret == NULL)
508 return(NULL);
509 *flagp |= flags&(HASWIDTH|SPSTART);
510 break;
511 case '\0':
512 case '|':
513 case ')':
514 FAIL("internal urp"); /* Supposed to be caught earlier. */
515 break;
516 case '?':
517 case '+':
518 case '*':
519 FAIL("?+* follows nothing");
520 break;
521 case '\\':
522 if (*regparse == '\0')
523 FAIL("trailing \\");
524 ret = regnode(EXACTLY);
525 regc(*regparse++);
526 regc('\0');
527 *flagp |= HASWIDTH|SIMPLE;
528 break;
529 default: {
530 register int len;
531 register char ender;
532
533 regparse--;
534 len = strcspn(regparse, META);
535 if (len <= 0)
536 FAIL("internal disaster");
537 ender = *(regparse+len);
538 if (len > 1 && ISMULT(ender))
539 len--; /* Back off clear of ?+* operand. */
540 *flagp |= HASWIDTH;
541 if (len == 1)
542 *flagp |= SIMPLE;
543 ret = regnode(EXACTLY);
544 while (len > 0) {
545 regc(*regparse++);
546 len--;
547 }
548 regc('\0');
549 }
550 break;
551 }
552
553 return(ret);
554 }
555
556 /*
557 - regnode - emit a node
558 */
regnode(char op)559 static char *regnode(char op)
560 {
561 register char *ret;
562 register char *ptr;
563
564 ret = regcode;
565 if (ret == ®dummy) {
566 regsize += 3;
567 return(ret);
568 }
569
570 ptr = ret;
571 *ptr++ = op;
572 *ptr++ = '\0'; /* Null "next" pointer. */
573 *ptr++ = '\0';
574 regcode = ptr;
575
576 return(ret);
577 }
578
579 /*
580 - regc - emit (if appropriate) a byte of code
581 */
regc(char b)582 static void regc(char b)
583 {
584 if (regcode != ®dummy)
585 *regcode++ = b;
586 else
587 regsize++;
588 }
589
590 /*
591 - reginsert - insert an operator in front of already-emitted operand
592 *
593 * Means relocating the operand.
594 */
reginsert(char op,char * opnd)595 static void reginsert(char op, char *opnd)
596 {
597 register char *src;
598 register char *dst;
599 register char *place;
600
601 if (regcode == ®dummy) {
602 regsize += 3;
603 return;
604 }
605
606 src = regcode;
607 regcode += 3;
608 dst = regcode;
609 while (src > opnd)
610 *--dst = *--src;
611
612 place = opnd; /* Op node, where operand used to be. */
613 *place++ = op;
614 *place++ = '\0';
615 *place++ = '\0';
616 }
617
618 /*
619 - regtail - set the next-pointer at the end of a node chain
620 */
regtail(char * p,char * val)621 static void regtail(char *p, char *val)
622 {
623 register char *scan;
624 register char *temp;
625 register int offset;
626
627 if (p == ®dummy)
628 return;
629
630 /* Find last node. */
631 scan = p;
632 for (;;) {
633 temp = regnext(scan);
634 if (temp == NULL)
635 break;
636 scan = temp;
637 }
638
639 if (OP(scan) == BACK)
640 offset = scan - val;
641 else
642 offset = val - scan;
643 *(scan+1) = (offset>>8)&0377;
644 *(scan+2) = offset&0377;
645 }
646
647 /*
648 - regoptail - regtail on operand of first argument; nop if operandless
649 */
regoptail(char * p,char * val)650 static void regoptail(char *p, char *val)
651 {
652 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
653 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
654 return;
655 regtail(OPERAND(p), val);
656 }
657
658 /*
659 * reg_execute and friends
660 */
661
662 /*
663 * Global work variables for reg_execute().
664 */
665 static char *reginput; /* String-input pointer. */
666 static char *regbol; /* Beginning of input, for ^ check. */
667 static char **regstartp; /* Pointer to startp array. */
668 static char **regendp; /* Ditto for endp. */
669
670 /*
671 * Forwards.
672 */
673 STATIC int regtry(regexpr *, char *);
674 STATIC int regmatch(char *);
675 STATIC int regrepeat(char *);
676
677 #ifdef DEBUG
678 int regnarrate = 0;
679 void regdump(regexpr *);
680 STATIC char *regprop(char *);
681 #endif
682
683 /*
684 - reg_execute - match a regexp against a string
685 */
reg_execute(register regexpr * prog,register char * string)686 int reg_execute(register regexpr *prog, register char *string)
687 {
688 register char *s;
689 extern char *strchr();
690
691 /* Be paranoid... */
692 if (prog == NULL || string == NULL) {
693 reg_error("NULL parameter");
694 return(-1);
695 }
696
697 /* Check validity of program. */
698 if (UCHARAT(prog->program) != MAGIC) {
699 reg_error("corrupted program");
700 return(-1);
701 }
702
703 /* If there is a "must appear" string, look for it. */
704 if (prog->regmust != NULL) {
705 s = string;
706 while ((s = strchr(s, prog->regmust[0])) != NULL) {
707 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
708 break; /* Found it. */
709 s++;
710 }
711 if (s == NULL) /* Not present. */
712 return(0);
713 }
714
715 /* Mark beginning of line for ^ . */
716 regbol = string;
717
718 /* Simplest case: anchored match need be tried only once. */
719 if (prog->reganch)
720 return(regtry(prog, string));
721
722 /* Messy cases: unanchored match. */
723 s = string;
724 if (prog->regstart != '\0')
725 /* We know what char it must start with. */
726 while ((s = strchr(s, prog->regstart)) != NULL) {
727 if (regtry(prog, s))
728 return(1);
729 s++;
730 }
731 else
732 /* We don't -- general case. */
733 do {
734 if (regtry(prog, s))
735 return(1);
736 } while (*s++ != '\0');
737
738 /* Failure. */
739 return(0);
740 }
741
742 /*
743 - regtry - try match at specific point
744 */
regtry(regexpr * prog,char * string)745 static int regtry(regexpr *prog, char *string)
746 {
747 register int i;
748 register char **sp;
749 register char **ep;
750
751 reginput = string;
752 regstartp = prog->startp;
753 regendp = prog->endp;
754
755 sp = prog->startp;
756 ep = prog->endp;
757 for (i = NSUBEXP; i > 0; i--) {
758 *sp++ = NULL;
759 *ep++ = NULL;
760 }
761 if (regmatch(prog->program + 1)) {
762 prog->startp[0] = string;
763 prog->endp[0] = reginput;
764 return(1);
765 } else
766 return(0);
767 }
768
769 /*
770 - regmatch - main matching routine
771 *
772 * Conceptually the strategy is simple: check to see whether the current
773 * node matches, call self recursively to see whether the rest matches,
774 * and then act accordingly. In practice we make some effort to avoid
775 * recursion, in particular by going through "ordinary" nodes (that don't
776 * need to know whether the rest of the match failed) by a loop instead of
777 * by recursion.
778 */
regmatch(char * prog)779 static int regmatch(char *prog)
780 {
781 register char *scan; /* Current node. */
782 char *next; /* Next node. */
783 extern char *strchr();
784
785 scan = prog;
786 #ifdef DEBUG
787 if (scan != NULL && regnarrate)
788 fprintf(stderr, "%s(\n", regprop(scan));
789 #endif
790 while (scan != NULL) {
791 #ifdef DEBUG
792 if (regnarrate)
793 fprintf(stderr, "%s...\n", regprop(scan));
794 #endif
795 next = regnext(scan);
796
797 switch (OP(scan)) {
798 case BOL:
799 if (reginput != regbol)
800 return(0);
801 break;
802 case EOL:
803 if (*reginput != '\0')
804 return(0);
805 break;
806 case ANY:
807 if (*reginput == '\0')
808 return(0);
809 reginput++;
810 break;
811 case EXACTLY: {
812 register int len;
813 register char *opnd;
814
815 opnd = OPERAND(scan);
816 /* Inline the first character, for speed. */
817 if (*opnd != *reginput)
818 return(0);
819 len = strlen(opnd);
820 if (len > 1 && strncmp(opnd, reginput, len) != 0)
821 return(0);
822 reginput += len;
823 }
824 break;
825 case ANYOF:
826 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
827 return(0);
828
829 reginput++;
830 break;
831 case ANYBUT:
832 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
833 return(0);
834 reginput++;
835 break;
836 case NOTHING:
837 break;
838 case BACK:
839 break;
840 case OPEN+1:
841 case OPEN+2:
842 case OPEN+3:
843 case OPEN+4:
844 case OPEN+5:
845 case OPEN+6:
846 case OPEN+7:
847 case OPEN+8:
848 case OPEN+9: {
849 register int no;
850 register char *save;
851
852 no = OP(scan) - OPEN;
853 save = reginput;
854
855 if (regmatch(next)) {
856 /*
857 * Don't set startp if some later
858 * invocation of the same parentheses
859 * already has.
860 */
861 if (regstartp[no] == NULL)
862 regstartp[no] = save;
863 return(1);
864 } else
865 return(0);
866 }
867 break;
868 case CLOSE+1:
869 case CLOSE+2:
870 case CLOSE+3:
871 case CLOSE+4:
872 case CLOSE+5:
873 case CLOSE+6:
874 case CLOSE+7:
875 case CLOSE+8:
876 case CLOSE+9: {
877 register int no;
878 register char *save;
879
880 no = OP(scan) - CLOSE;
881 save = reginput;
882
883 if (regmatch(next)) {
884 /*
885 * Don't set endp if some later
886 * invocation of the same parentheses
887 * already has.
888 */
889 if (regendp[no] == NULL)
890 regendp[no] = save;
891 return(1);
892 } else
893 return(0);
894 }
895 break;
896 case BRANCH: {
897 register char *save;
898
899 if (OP(next) != BRANCH) /* No choice. */
900 next = OPERAND(scan); /* Avoid recursion. */
901 else {
902 do {
903 save = reginput;
904 if (regmatch(OPERAND(scan)))
905 return(1);
906 reginput = save;
907 scan = regnext(scan);
908 } while (scan != NULL && OP(scan) == BRANCH);
909 return(0);
910 /* NOTREACHED */
911 }
912 }
913 break;
914 case STAR:
915 case PLUS: {
916 register char nextch;
917 register int no;
918 register char *save;
919 register int min;
920
921 /*
922 * Lookahead to avoid useless match attempts
923 * when we know what character comes next.
924 */
925 nextch = '\0';
926 if (OP(next) == EXACTLY)
927 nextch = *OPERAND(next);
928 min = (OP(scan) == STAR) ? 0 : 1;
929 save = reginput;
930 no = regrepeat(OPERAND(scan));
931 while (no >= min) {
932 /* If it could work, try it. */
933 if (nextch == '\0' || *reginput == nextch)
934 if (regmatch(next))
935 return(1);
936 /* Couldn't or didn't -- back up. */
937 no--;
938 reginput = save + no;
939 }
940 return(0);
941 }
942 break;
943 case END:
944 return(1); /* Success! */
945 break;
946 default:
947 reg_error("memory corruption");
948 return(0);
949 break;
950 }
951
952 scan = next;
953 }
954
955 /*
956 * We get here only if there's trouble -- normally "case END" is
957 * the terminating point.
958 */
959 reg_error("corrupted pointers");
960 return(0);
961 }
962
963 /*
964 - regrepeat - repeatedly match something simple, report how many
965 */
regrepeat(char * p)966 static int regrepeat(char *p)
967 {
968 register int count = 0;
969 register char *scan;
970 register char *opnd;
971 extern char *strchr();
972
973 scan = reginput;
974 opnd = OPERAND(p);
975 switch (OP(p)) {
976 case ANY:
977 count = strlen(scan);
978 scan += count;
979 break;
980 case EXACTLY:
981 while (*opnd == *scan) {
982 count++;
983 scan++;
984 }
985 break;
986 case ANYOF:
987 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
988 count++;
989 scan++;
990 }
991 break;
992 case ANYBUT:
993 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
994 count++;
995 scan++;
996 }
997 break;
998 default: /* Oh dear. Called inappropriately. */
999 reg_error("internal foulup");
1000 count = 0; /* Best compromise. */
1001 break;
1002 }
1003 reginput = scan;
1004
1005 return(count);
1006 }
1007
1008 /*
1009 - regnext - dig the "next" pointer out of a node
1010 */
regnext(register char * p)1011 static char * regnext(register char *p)
1012 {
1013 register int offset;
1014
1015 if (p == ®dummy)
1016 return(NULL);
1017
1018 offset = NEXT(p);
1019 if (offset == 0)
1020 return(NULL);
1021
1022 if (OP(p) == BACK)
1023 return(p-offset);
1024 else
1025 return(p+offset);
1026 }
1027
1028 /*
1029 - reg_substitute - perform substitutions after a regexp match
1030 */
reg_substitute(regexpr * prog,char * source,char * dest)1031 void reg_substitute(regexpr *prog, char *source, char *dest)
1032 {
1033 register char *src;
1034 register char *dst;
1035 register char c;
1036 register int no;
1037 register int len;
1038 extern char *strncpy();
1039
1040 if (prog == NULL || source == NULL || dest == NULL) {
1041 reg_error("NULL parm to reg_substitute");
1042 return;
1043 }
1044 if (UCHARAT(prog->program) != MAGIC) {
1045 reg_error("damaged regexpr fed to reg_substitute");
1046 return;
1047 }
1048
1049 src = source;
1050 dst = dest;
1051 while ((c = *src++) != '\0') {
1052 if (c == '&')
1053 no = 0;
1054 else if (c == '\\' && '0' <= *src && *src <= '9')
1055 no = *src++ - '0';
1056 else
1057 no = -1;
1058
1059 if (no < 0) { /* Ordinary character. */
1060 if (c == '\\' && (*src == '\\' || *src == '&'))
1061 c = *src++;
1062 *dst++ = c;
1063 } else if (prog->startp[no] != NULL && prog->endp[no] != NULL) {
1064 len = prog->endp[no] - prog->startp[no];
1065 (void) strncpy(dst, prog->startp[no], len);
1066 dst += len;
1067 if (len != 0 && *(dst-1) == '\0') {
1068 reg_error("damaged match string");
1069 return;
1070 }
1071 }
1072 }
1073 *dst++ = '\0';
1074 }
1075
reg_error(char * s)1076 void reg_error(char *s)
1077 {
1078 return;
1079 }
1080
1081 #ifdef DEBUG
1082
1083 STATIC char *regprop(char *);
1084
1085 /*
1086 - regdump - dump a regexpr onto stdout in vaguely comprehensible form
1087 */
regdump(regexpr * r)1088 void regdump(regexpr *r)
1089 {
1090 register char *s;
1091 register char op = EXACTLY; /* Arbitrary non-END op. */
1092 register char *next;
1093 extern char *strchr();
1094
1095
1096 s = r->program + 1;
1097 while (op != END) { /* While that wasn't END last time... */
1098 op = OP(s);
1099 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1100 next = regnext(s);
1101 if (next == NULL) /* Next ptr. */
1102 printf("(0)");
1103 else
1104 printf("(%d)", (s-r->program)+(next-s));
1105 s += 3;
1106 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1107 /* Literal string, where present. */
1108 while (*s != '\0') {
1109 putchar(*s);
1110 s++;
1111 }
1112 s++;
1113 }
1114 putchar('\n');
1115 }
1116
1117 /* Header fields of interest. */
1118 if (r->regstart != '\0')
1119 printf("start `%c' ", r->regstart);
1120 if (r->reganch)
1121 printf("anchored ");
1122 if (r->regmust != NULL)
1123 printf("must have \"%s\"", r->regmust);
1124 printf("\n");
1125 }
1126
1127 /*
1128 - regprop - printable representation of opcode
1129 */
regprop(char * op)1130 static char *regprop(char *op)
1131 {
1132 register char *p;
1133 static char buf[50];
1134
1135 (void) strcpy(buf, ":");
1136
1137 switch (OP(op)) {
1138 case BOL:
1139 p = "BOL";
1140 break;
1141 case EOL:
1142 p = "EOL";
1143 break;
1144 case ANY:
1145 p = "ANY";
1146 break;
1147 case ANYOF:
1148 p = "ANYOF";
1149 break;
1150 case ANYBUT:
1151 p = "ANYBUT";
1152 break;
1153 case BRANCH:
1154 p = "BRANCH";
1155 break;
1156 case EXACTLY:
1157 p = "EXACTLY";
1158 break;
1159 case NOTHING:
1160 p = "NOTHING";
1161 break;
1162 case BACK:
1163 p = "BACK";
1164 break;
1165 case END:
1166 p = "END";
1167 break;
1168 case OPEN+1:
1169 case OPEN+2:
1170 case OPEN+3:
1171 case OPEN+4:
1172 case OPEN+5:
1173 case OPEN+6:
1174 case OPEN+7:
1175 case OPEN+8:
1176 case OPEN+9:
1177 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1178 p = NULL;
1179 break;
1180 case CLOSE+1:
1181 case CLOSE+2:
1182 case CLOSE+3:
1183 case CLOSE+4:
1184 case CLOSE+5:
1185 case CLOSE+6:
1186 case CLOSE+7:
1187 case CLOSE+8:
1188 case CLOSE+9:
1189 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1190 p = NULL;
1191 break;
1192 case STAR:
1193 p = "STAR";
1194 break;
1195 case PLUS:
1196 p = "PLUS";
1197 break;
1198 default:
1199 reg_error("corrupted opcode");
1200 break;
1201 }
1202 if (p != NULL)
1203 (void) strcat(buf, p);
1204 return(buf);
1205 }
1206 #endif
1207
1208 /*
1209 * The following is provided for those people who do not have strcspn() in
1210 * their C libraries. They should get off their butts and do something
1211 * about it; at least one public-domain implementation of those (highly
1212 * useful) string routines has been published on Usenet.
1213 */
1214 #ifdef STRCSPN
1215 /*
1216 * strcspn - find length of initial segment of s1 consisting entirely
1217 * of characters not from s2
1218 */
1219
strcspn(char * s1,char * s2)1220 static int strcspn(char *s1, char *s2)
1221 {
1222 register char *scan1;
1223 register char *scan2;
1224 register int count;
1225
1226 count = 0;
1227 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1228 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1229 if (*scan1 == *scan2++)
1230 return(count);
1231 count++;
1232 }
1233 return(count);
1234 }
1235 #endif
1236