1 /*
2 * regcomp and regexec -- regsub and regerror are elsewhere
3 *
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
6 *
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
10 *
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
14 *
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
17 *
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *
21 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
22 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
23 *** to assist in implementing egrep.
24 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
25 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
26 *** as in BSD grep and ex.
27 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
28 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
29 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
30 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
31 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
32 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
33 *** was moved into regexp.h, and the include of regexp.h now uses "'s
34 *** to avoid conflicting with the system regexp.h. Const, bless its
35 *** soul, was removed so it can compile everywhere. The declaration
36 *** of strchr() was in conflict on AIX, so it was removed (as it is
37 *** happily defined in string.h).
38 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
39 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
40 *
41 * Beware that some of this code is subtly aware of the way operator
42 * precedence is structured in regular expressions. Serious changes in
43 * regular-expression syntax might require a total rethink.
44 */
45
46 #include "config.h"
47
48 #ifndef HAVE_REGEX
49
50 #include "regexp.h"
51 #include <stdio.h>
52 #include <ctype.h>
53 #ifndef ultrix
54 #include <stdlib.h>
55 #endif
56 #include <string.h>
57
58 /*
59 * The "internal use only" fields in regexp.h are present to pass info from
60 * compile to execute that permits the execute phase to run lots faster on
61 * simple cases. They are:
62 *
63 * regstart char that must begin a match; '\0' if none obvious
64 * reganch is the match anchored (at beginning-of-line only)?
65 * regmust string (pointer into program) that match must include, or NULL
66 * regmlen length of regmust string
67 *
68 * Regstart and reganch permit very fast decisions on suitable starting points
69 * for a match, cutting down the work a lot. Regmust permits fast rejection
70 * of lines that cannot possibly match. The regmust tests are costly enough
71 * that regcomp() supplies a regmust only if the r.e. contains something
72 * potentially expensive (at present, the only such thing detected is * or +
73 * at the start of the r.e., which can involve a lot of backup). Regmlen is
74 * supplied because the test in regexec() needs it and regcomp() is computing
75 * it anyway.
76 */
77
78 /*
79 * Structure for regexp "program". This is essentially a linear encoding
80 * of a nondeterministic finite-state machine (aka syntax charts or
81 * "railroad normal form" in parsing technology). Each node is an opcode
82 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
83 * all nodes except BRANCH implement concatenation; a "next" pointer with
84 * a BRANCH on both ends of it is connecting two alternatives. (Here we
85 * have one of the subtle syntax dependencies: an individual BRANCH (as
86 * opposed to a collection of them) is never concatenated with anything
87 * because of operator precedence.) The operand of some types of node is
88 * a literal string; for others, it is a node leading into a sub-FSM. In
89 * particular, the operand of a BRANCH node is the first node of the branch.
90 * (NB this is *not* a tree structure: the tail of the branch connects
91 * to the thing following the set of BRANCHes.) The opcodes are:
92 */
93
94 /* definition number opnd? meaning */
95 #define END 0 /* no End of program. */
96 #define BOL 1 /* no Match "" at beginning of line. */
97 #define EOL 2 /* no Match "" at end of line. */
98 #define ANY 3 /* no Match any one character. */
99 #define ANYOF 4 /* str Match any character in this string. */
100 #define ANYBUT 5 /* str Match any character not in this string. */
101 #define BRANCH 6 /* node Match this alternative, or the next... */
102 #define BACK 7 /* no Match "", "next" ptr points backward. */
103 #define EXACTLY 8 /* str Match this string. */
104 #define NOTHING 9 /* no Match empty string. */
105 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
106 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
107 #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */
108 #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */
109 #define OPEN 20 /* no Mark this point in input as start of #n. */
110 /* OPEN+1 is number 1, etc. */
111 #define CLOSE 30 /* no Analogous to OPEN. */
112
113 /*
114 * Opcode notes:
115 *
116 * BRANCH The set of branches constituting a single choice are hooked
117 * together with their "next" pointers, since precedence prevents
118 * anything being concatenated to any individual branch. The
119 * "next" pointer of the last BRANCH in a choice points to the
120 * thing following the whole choice. This is also where the
121 * final "next" pointer of each individual branch points; each
122 * branch starts with the operand node of a BRANCH node.
123 *
124 * BACK Normal "next" pointers all implicitly point forward; BACK
125 * exists to make loop structures possible.
126 *
127 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
128 * BRANCH structures using BACK. Simple cases (one character
129 * per match) are implemented with STAR and PLUS for speed
130 * and to minimize recursive plunges.
131 *
132 * OPEN,CLOSE ...are numbered at compile time.
133 */
134
135 /*
136 * A node is one char of opcode followed by two chars of "next" pointer.
137 * "Next" pointers are stored as two 8-bit pieces, high order first. The
138 * value is a positive offset from the opcode of the node containing it.
139 * An operand, if any, simply follows the node. (Note that much of the
140 * code generation knows about this implicit relationship.)
141 *
142 * Using two bytes for the "next" pointer is vast overkill for most things,
143 * but allows patterns to get big without disasters.
144 */
145 #define OP(p) (*(p))
146 /* NOLINTNEXTLINE(whitespace/operators) */
147 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
148 #define OPERAND(p) ((p) + 3)
149
150 /*
151 * See regmagic.h for one further detail of program structure.
152 */
153
154 /*
155 * Utility definitions.
156 */
157 #ifndef CHARBITS
158 #define UCHARAT(p) ((int)*(unsigned char *)(p))
159 #else
160 #define UCHARAT(p) ((int)*(p)&CHARBITS)
161 #endif
162
163 #define FAIL(m) { regerror(m); return(NULL); }
164 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
165
166 /*
167 * Flags to be passed up and down.
168 */
169 #define HASWIDTH 01 /* Known never to match null string. */
170 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
171 #define SPSTART 04 /* Starts with * or +. */
172 #define WORST 0 /* Worst case. */
173
174 /*
175 * Global work variables for regcomp().
176 */
177 static char *regparse; /* Input-scan pointer. */
178 static int regnpar; /* () count. */
179 static char regdummy;
180 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
181 /* NOLINTNEXTLINE(runtime/int) */
182 static long regsize; /* Code size. */
183
184 /*
185 * Forward declarations for regcomp()'s friends.
186 */
187 #ifndef STATIC
188 #define STATIC static
189 #endif
190 STATIC char *reg(int paren, int *flagp);
191 STATIC char *regbranch(int *flagp);
192 STATIC char *regpiece(int *flagp);
193 STATIC char *regatom(int *flagp);
194 STATIC char *regnode(int op);
195 STATIC char *regnext(register char *p);
196 STATIC void regc(int b);
197 STATIC void reginsert(char op, char *opnd);
198 STATIC void regtail(char *p, char *val);
199 STATIC void regoptail(char *p, char *val);
200 #ifdef WANT_STRCSPN
201 STATIC int strcspn(char *s1, char *s2);
202 #endif
203
204 /*
205 - regcomp - compile a regular expression into internal code
206 *
207 * We can't allocate space until we know how big the compiled form will be,
208 * but we can't compile it (and thus know how big it is) until we've got a
209 * place to put the code. So we cheat: we compile it twice, once with code
210 * generation turned off and size counting turned on, and once "for real".
211 * This also means that we don't allocate space until we are sure that the
212 * thing really will compile successfully, and we never have to move the
213 * code and thus invalidate pointers into it. (Note that it has to be in
214 * one piece because free() must be able to free it all.)
215 *
216 * Beware that the optimization-preparation code in here knows about some
217 * of the structure of the compiled regexp.
218 */
regcomp(char * exp)219 regexp *regcomp(char *exp)
220 {
221 register regexp *r;
222 register char *scan;
223 register char *longest;
224 /* cppcheck-suppress variableScope */
225 register unsigned len;
226 int flags;
227
228 if (exp == NULL)
229 FAIL("NULL argument");
230
231 /* First pass: determine size, legality. */
232 #ifdef notdef
233 if (exp[0] == '.' && exp[1] == '*')
234 exp += 2; /* aid grep */
235 #endif
236 regparse = (char *)exp;
237 regnpar = 1;
238 regsize = 0L;
239 regcode = ®dummy;
240 regc(MAGIC);
241 if (reg(0, &flags) == NULL)
242 return (NULL);
243
244 /* Small enough for pointer-storage convention? */
245 if (regsize >= 32767L) /* Probably could be 65535L. */
246 FAIL("regexp too big");
247
248 /* Allocate space. */
249 r = (regexp *) malloc(sizeof(regexp) + (unsigned)regsize);
250 if (r == NULL)
251 FAIL("out of space");
252
253 /* Second pass: emit code. */
254 regparse = (char *)exp;
255 regnpar = 1;
256 regcode = r->program;
257 regc(MAGIC);
258 if (reg(0, &flags) == NULL)
259 /* cppcheck-suppress memleak Memory leak: r */
260 return (NULL);
261
262 /* Dig out information for optimizations. */
263 r->regstart = '\0'; /* Worst-case defaults. */
264 r->reganch = 0;
265 r->regmust = NULL;
266 r->regmlen = 0;
267 scan = r->program + 1; /* First BRANCH. */
268 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
269 scan = OPERAND(scan);
270
271 /* Starting-point info. */
272 if (OP(scan) == EXACTLY)
273 r->regstart = *OPERAND(scan);
274 else if (OP(scan) == BOL)
275 r->reganch++;
276
277 /*
278 * If there's something expensive in the r.e., find the
279 * longest literal string that must appear and make it the
280 * regmust. Resolve ties in favor of later strings, since
281 * the regstart check works with the beginning of the r.e.
282 * and avoiding duplication strengthens checking. Not a
283 * strong reason, but sufficient in the absence of others.
284 */
285 if (flags & SPSTART) {
286 longest = NULL;
287 len = 0;
288 for (; scan != NULL; scan = regnext(scan))
289 /* Flawfinder: ignore (strlen) */
290 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
291 longest = OPERAND(scan);
292 /* Flawfinder: ignore (strlen) */
293 len = strlen(OPERAND(scan));
294 }
295 r->regmust = longest;
296 r->regmlen = len;
297 }
298 }
299
300 return (r);
301 }
302
303 /*
304 - reg - regular expression, i.e. main body or parenthesized thing
305 *
306 * Caller must absorb opening parenthesis.
307 *
308 * Combining parenthesis handling with the base level of regular expression
309 * is a trifle forced, but the need to tie the tails of the branches to what
310 * follows makes it hard to avoid.
311 */
reg(int paren,int * flagp)312 static char *reg(int paren, /* Parenthesized? */
313 int *flagp)
314 {
315 register char *ret;
316 register char *br;
317 register char *ender;
318 register int parno = 0;
319 int flags;
320
321 *flagp = HASWIDTH; /* Tentatively. */
322
323 /* Make an OPEN node, if parenthesized. */
324 if (paren) {
325 if (regnpar >= NSUBEXP)
326 FAIL("too many ()");
327 parno = regnpar;
328 regnpar++;
329 ret = regnode(OPEN + parno);
330 } else
331 ret = NULL;
332
333 /* Pick up the branches, linking them together. */
334 br = regbranch(&flags);
335 if (br == NULL)
336 return (NULL);
337 if (ret != NULL)
338 regtail(ret, br); /* OPEN -> first. */
339 else
340 ret = br;
341 if (!(flags & HASWIDTH))
342 *flagp &= ~HASWIDTH;
343 *flagp |= flags & SPSTART;
344 while (*regparse == '|' || *regparse == '\n') {
345 regparse++;
346 br = regbranch(&flags);
347 if (br == NULL)
348 return (NULL);
349 regtail(ret, br); /* BRANCH -> BRANCH. */
350 if (!(flags & HASWIDTH))
351 *flagp &= ~HASWIDTH;
352 *flagp |= flags & SPSTART;
353 }
354
355 /* Make a closing node, and hook it on the end. */
356 ender = regnode((paren) ? CLOSE + parno : END);
357 regtail(ret, ender);
358
359 /* Hook the tails of the branches to the closing node. */
360 for (br = ret; br != NULL; br = regnext(br))
361 regoptail(br, ender);
362
363 /* Check for proper termination. */
364 if (paren && *regparse++ != ')') {
365 FAIL("unmatched ()");
366 } else if (!paren && *regparse != '\0') {
367 if (*regparse == ')') {
368 FAIL("unmatched ()");
369 } else
370 FAIL("junk on end"); /* "Can't happen". */
371 /* NOTREACHED */
372 }
373
374 return (ret);
375 }
376
377 /*
378 - regbranch - one alternative of an | operator
379 *
380 * Implements the concatenation operator.
381 */
regbranch(int * flagp)382 static char *regbranch(int *flagp)
383 {
384 register char *ret;
385 register char *chain;
386 register char *latest;
387 int flags;
388
389 *flagp = WORST; /* Tentatively. */
390
391 ret = regnode(BRANCH);
392 chain = NULL;
393 while (*regparse != '\0' && *regparse != ')' &&
394 *regparse != '\n' && *regparse != '|') {
395 latest = regpiece(&flags);
396 if (latest == NULL)
397 return (NULL);
398 *flagp |= flags & HASWIDTH;
399 if (chain == NULL) /* First piece. */
400 *flagp |= flags & SPSTART;
401 else
402 regtail(chain, latest);
403 chain = latest;
404 }
405 if (chain == NULL) /* Loop ran zero times. */
406 (void)regnode(NOTHING);
407
408 return (ret);
409 }
410
411 /*
412 - regpiece - something followed by possible [*+?]
413 *
414 * Note that the branching code sequences used for ? and the general cases
415 * of * and + are somewhat optimized: they use the same NOTHING node as
416 * both the endmarker for their branch list and the body of the last branch.
417 * It might seem that this node could be dispensed with entirely, but the
418 * endmarker role is not redundant.
419 */
regpiece(int * flagp)420 static char *regpiece(int *flagp)
421 {
422 register char *ret;
423 register char op;
424 register char *next;
425 int flags;
426
427 ret = regatom(&flags);
428 if (ret == NULL)
429 return (NULL);
430
431 op = *regparse;
432 if (!ISMULT(op)) {
433 *flagp = flags;
434 return (ret);
435 }
436
437 if (!(flags & HASWIDTH) && op != '?')
438 FAIL("*+ operand could be empty");
439 *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
440
441 if (op == '*' && (flags & SIMPLE))
442 reginsert(STAR, ret);
443 else if (op == '*') {
444 /* Emit x* as (x&|), where & means "self". */
445 reginsert(BRANCH, ret); /* Either x */
446 regoptail(ret, regnode(BACK)); /* and loop */
447 regoptail(ret, ret); /* back */
448 regtail(ret, regnode(BRANCH)); /* or */
449 regtail(ret, regnode(NOTHING)); /* null. */
450 } else if (op == '+' && (flags & SIMPLE))
451 reginsert(PLUS, ret);
452 else if (op == '+') {
453 /* Emit x+ as x(&|), where & means "self". */
454 next = regnode(BRANCH); /* Either */
455 regtail(ret, next);
456 regtail(regnode(BACK), ret); /* loop back */
457 regtail(next, regnode(BRANCH)); /* or */
458 regtail(ret, regnode(NOTHING)); /* null. */
459 } else if (op == '?') {
460 /* Emit x? as (x|) */
461 reginsert(BRANCH, ret); /* Either x */
462 regtail(ret, regnode(BRANCH)); /* or */
463 next = regnode(NOTHING); /* null. */
464 regtail(ret, next);
465 regoptail(ret, next);
466 }
467 regparse++;
468 if (ISMULT(*regparse))
469 FAIL("nested *?+");
470
471 return (ret);
472 }
473
474 /*
475 - regatom - the lowest level
476 *
477 * Optimization: gobbles an entire sequence of ordinary characters so that
478 * it can turn them into a single node, which is smaller to store and
479 * faster to run. Backslashed characters are exceptions, each becoming a
480 * separate node; the code is simpler that way and it's not worth fixing.
481 */
regatom(int * flagp)482 static char *regatom(int *flagp)
483 {
484 register char *ret;
485 int flags;
486
487 *flagp = WORST; /* Tentatively. */
488
489 switch (*regparse++) {
490 /* FIXME: these chars only have meaning at beg/end of pat? */
491 case '^':
492 ret = regnode(BOL);
493 break;
494 case '$':
495 ret = regnode(EOL);
496 break;
497 case '.':
498 ret = regnode(ANY);
499 *flagp |= HASWIDTH | SIMPLE;
500 break;
501 case '[':{
502 register int classr;
503 register int classend;
504
505 if (*regparse == '^') { /* Complement of range. */
506 ret = regnode(ANYBUT);
507 regparse++;
508 } else
509 ret = regnode(ANYOF);
510 if (*regparse == ']' || *regparse == '-')
511 regc(*regparse++);
512 while (*regparse != '\0' && *regparse != ']') {
513 if (*regparse == '-') {
514 regparse++;
515 if (*regparse == ']' || *regparse == '\0')
516 regc('-');
517 else {
518 classr = UCHARAT(regparse - 2) + 1;
519 classend = UCHARAT(regparse);
520 if (classr > classend + 1)
521 FAIL("invalid [] range");
522 for (; classr <= classend; classr++)
523 regc(classr);
524 regparse++;
525 }
526 } else
527 regc(*regparse++);
528 }
529 regc('\0');
530 if (*regparse != ']')
531 FAIL("unmatched []");
532 regparse++;
533 *flagp |= HASWIDTH | SIMPLE;
534 }
535 break;
536 case '(':
537 ret = reg(1, &flags);
538 if (ret == NULL)
539 return (NULL);
540 *flagp |= flags & (HASWIDTH | SPSTART);
541 break;
542 case '\0':
543 case '|':
544 case '\n':
545 case ')':
546 FAIL("internal urp"); /* Supposed to be caught earlier. */
547 break;
548 case '?':
549 case '+':
550 case '*':
551 FAIL("?+* follows nothing");
552 break;
553 case '\\':
554 switch (*regparse++) {
555 case '\0':
556 FAIL("trailing \\");
557 break;
558 case '<':
559 ret = regnode(WORDA);
560 break;
561 case '>':
562 ret = regnode(WORDZ);
563 break;
564 /* FIXME: Someday handle \1, \2, ... */
565 default:
566 /* Handle general quoted chars in exact-match routine */
567 goto de_fault;
568 }
569 break;
570 de_fault:
571 default:
572 /*
573 * Encode a string of characters to be matched exactly.
574 *
575 * This is a bit tricky due to quoted chars and due to
576 * '*', '+', and '?' taking the SINGLE char previous
577 * as their operand.
578 *
579 * On entry, the char at regparse[-1] is going to go
580 * into the string, no matter what it is. (It could be
581 * following a \ if we are entered from the '\' case.)
582 *
583 * Basic idea is to pick up a good char in ch and
584 * examine the next char. If it's *+? then we twiddle.
585 * If it's \ then we frozzle. If it's other magic char
586 * we push ch and terminate the string. If none of the
587 * above, we push ch on the string and go around again.
588 *
589 * regprev is used to remember where "the current char"
590 * starts in the string, if due to a *+? we need to back
591 * up and put the current char in a separate, 1-char, string.
592 * When regprev is NULL, ch is the only char in the
593 * string; this is used in *+? handling, and in setting
594 * flags |= SIMPLE at the end.
595 */
596 {
597 char *regprev;
598 register char ch;
599
600 regparse--; /* Look at cur char */
601 ret = regnode(EXACTLY);
602 for (regprev = 0;;) {
603 ch = *regparse++; /* Get current char */
604 switch (*regparse) { /* look at next one */
605
606 default:
607 regc(ch); /* Add cur to string */
608 break;
609
610 case '.':
611 case '[':
612 case '(':
613 case ')':
614 case '|':
615 case '\n':
616 case '$':
617 case '^':
618 case '\0':
619 /* FIXME, $ and ^ should not always be magic */
620 magic:
621 regc(ch); /* dump cur char */
622 goto done; /* and we are done */
623
624 case '?':
625 case '+':
626 case '*':
627 if (!regprev) /* If just ch in str, */
628 goto magic; /* use it */
629 /* End mult-char string one early */
630 regparse = regprev; /* Back up parse */
631 goto done;
632
633 case '\\':
634 regc(ch); /* Cur char OK */
635 switch (regparse[1]) { /* Look after \ */
636 case '\0':
637 case '<':
638 case '>':
639 /* FIXME: Someday handle \1, \2, ... */
640 goto done; /* Not quoted */
641 default:
642 /* NOLINTNEXTLINE(whitespace/line_length) */
643 /* Backup point is \, scan * point is after it. */
644 regprev = regparse;
645 regparse++;
646 continue; /* NOT break; */
647 }
648 }
649 regprev = regparse; /* Set backup point */
650 }
651 done:
652 regc('\0');
653 *flagp |= HASWIDTH;
654 if (!regprev) /* One char? */
655 *flagp |= SIMPLE;
656 }
657 break;
658 }
659
660 return (ret);
661 }
662
663 /*
664 - regnode - emit a node
665 */
666 static char * /* Location. */
regnode(int op)667 regnode(int op)
668 {
669 register char *ret;
670 register char *ptr;
671
672 ret = regcode;
673 if (ret == ®dummy) {
674 regsize += 3;
675 return (ret);
676 }
677
678 ptr = ret;
679 *ptr++ = op;
680 *ptr++ = '\0'; /* Null "next" pointer. */
681 *ptr++ = '\0';
682 regcode = ptr;
683
684 return (ret);
685 }
686
687 /*
688 - regc - emit (if appropriate) a byte of code
689 */
regc(int b)690 static void regc(int b)
691 {
692 if (regcode != ®dummy)
693 *regcode++ = b;
694 else
695 regsize++;
696 }
697
698 /*
699 - reginsert - insert an operator in front of already-emitted operand
700 *
701 * Means relocating the operand.
702 */
reginsert(char op,char * opnd)703 static void reginsert(char op, char *opnd)
704 {
705 register char *src;
706 register char *dst;
707 register char *place;
708
709 if (regcode == ®dummy) {
710 regsize += 3;
711 return;
712 }
713
714 src = regcode;
715 regcode += 3;
716 dst = regcode;
717 while (src > opnd)
718 *--dst = *--src;
719
720 place = opnd; /* Op node, where operand used to be. */
721 *place++ = op;
722 *place++ = '\0';
723 *place++ = '\0';
724 }
725
726 /*
727 - regtail - set the next-pointer at the end of a node chain
728 */
regtail(char * p,char * val)729 static void regtail(char *p, char *val)
730 {
731 register char *scan;
732 register char *temp;
733 register int offset;
734
735 if (p == ®dummy)
736 return;
737
738 /* Find last node. */
739 scan = p;
740 for (;;) {
741 temp = regnext(scan);
742 if (temp == NULL)
743 break;
744 scan = temp;
745 }
746
747 if (OP(scan) == BACK)
748 offset = scan - val;
749 else
750 offset = val - scan;
751 *(scan + 1) = (offset >> 8) & 0377;
752 *(scan + 2) = offset & 0377;
753 }
754
755 /*
756 - regoptail - regtail on operand of first argument; nop if operandless
757 */
758
regoptail(char * p,char * val)759 static void regoptail(char *p, char *val)
760 {
761 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
762 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
763 return;
764 regtail(OPERAND(p), val);
765 }
766
767 /*
768 * regexec and friends
769 */
770
771 /*
772 * Global work variables for regexec().
773 */
774 static char *reginput; /* String-input pointer. */
775 static char *regbol; /* Beginning of input, for ^ check. */
776 static char **regstartp; /* Pointer to startp array. */
777 static char **regendp; /* Ditto for endp. */
778
779 /*
780 * Forwards.
781 */
782 STATIC int regtry(regexp * prog, char *string);
783 STATIC int regmatch(char *prog);
784 STATIC int regrepeat(char *p);
785
786 #ifdef WANT_REGDUMP
787 int regnarrate = 0;
788 STATIC char *regprop(regexp *r);
789 STATIC char *regprop(char *op);
790 #endif
791
792 /*
793 - regexec - match a regexp against a string
794 */
regexec(register regexp * prog,register char * string)795 int regexec(register regexp * prog, register char *string)
796 {
797 register char *s;
798
799 /* Be paranoid... */
800 if (prog == NULL || string == NULL) {
801 regerror("NULL parameter");
802 return (0);
803 }
804
805 /* Check validity of program. */
806 if (UCHARAT(prog->program) != MAGIC) {
807 regerror("corrupted program");
808 return (0);
809 }
810
811 /* If there is a "must appear" string, look for it. */
812 if (prog->regmust != NULL) {
813 s = (char *)string;
814 while ((s = strchr(s, prog->regmust[0])) != NULL) {
815 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
816 break; /* Found it. */
817 s++;
818 }
819 if (s == NULL) /* Not present. */
820 return (0);
821 }
822
823 /* Mark beginning of line for ^ . */
824 regbol = (char *)string;
825
826 /* Simplest case: anchored match need be tried only once. */
827 if (prog->reganch)
828 return (regtry(prog, string));
829
830 /* Messy cases: unanchored match. */
831 s = (char *)string;
832 if (prog->regstart != '\0')
833 /* We know what char it must start with. */
834 while ((s = strchr(s, prog->regstart)) != NULL) {
835 if (regtry(prog, s))
836 return (1);
837 s++;
838 } else
839 /* We don't -- general case. */
840 do {
841 if (regtry(prog, s))
842 return (1);
843 } while (*s++ != '\0');
844
845 /* Failure. */
846 return (0);
847 }
848
849 /*
850 - regtry - try match at specific point
851 */
regtry(regexp * prog,char * string)852 static int /* 0 failure, 1 success */ regtry(regexp * prog, char *string)
853 {
854 register int i;
855 register char **sp;
856 register char **ep;
857
858 reginput = string;
859 regstartp = prog->startp;
860 regendp = prog->endp;
861
862 sp = prog->startp;
863 ep = prog->endp;
864 for (i = NSUBEXP; i > 0; i--) {
865 *sp++ = NULL;
866 *ep++ = NULL;
867 }
868 if (regmatch(prog->program + 1)) {
869 prog->startp[0] = string;
870 prog->endp[0] = reginput;
871 return (1);
872 } else
873 return (0);
874 }
875
876 /*
877 - regmatch - main matching routine
878 *
879 * Conceptually the strategy is simple: check to see whether the current
880 * node matches, call self recursively to see whether the rest matches,
881 * and then act accordingly. In practice we make some effort to avoid
882 * recursion, in particular by going through "ordinary" nodes (that don't
883 * need to know whether the rest of the match failed) by a loop instead of
884 * by recursion.
885 */
regmatch(char * prog)886 static int /* 0 failure, 1 success */ regmatch(char *prog)
887 {
888 register char *scan; /* Current node. */
889 char *next; /* Next node. */
890
891 scan = prog;
892 #ifdef WANT_REGDUMP
893 if (scan != NULL && regnarrate)
894 fprintf(stderr, "%s(\n", regprop(scan));
895 #endif
896 while (scan != NULL) {
897 #ifdef WANT_REGDUMP
898 if (regnarrate)
899 fprintf(stderr, "%s...\n", regprop(scan));
900 #endif
901 next = regnext(scan);
902
903 switch (OP(scan)) {
904 case BOL:
905 if (reginput != regbol)
906 return (0);
907 break;
908 case EOL:
909 if (*reginput != '\0')
910 return (0);
911 break;
912 case WORDA:
913 /* Must be looking at a letter, digit, or _ */
914 if ((!isalnum((int)*reginput)) && *reginput != '_')
915 return (0);
916 /* Prev must be BOL or nonword */
917 if (reginput > regbol &&
918 (isalnum((int)reginput[-1]) || reginput[-1] == '_'))
919 return (0);
920 break;
921 case WORDZ:
922 /* Must be looking at non letter, digit, or _ */
923 if (isalnum((int)*reginput) || *reginput == '_')
924 return (0);
925 /* We don't care what the previous char was */
926 break;
927 case ANY:
928 if (*reginput == '\0')
929 return (0);
930 reginput++;
931 break;
932 case EXACTLY:{
933 register int len;
934 register char *opnd;
935
936 opnd = OPERAND(scan);
937 /* Inline the first character, for speed. */
938 if (*opnd != *reginput)
939 return (0);
940 /* Flawfinder: ignore (strlen) */
941 len = strlen(opnd);
942 if (len > 1 && strncmp(opnd, reginput, len) != 0)
943 return (0);
944 reginput += len;
945 }
946 break;
947 case ANYOF:
948 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
949 return (0);
950 reginput++;
951 break;
952 case ANYBUT:
953 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
954 return (0);
955 reginput++;
956 break;
957 case NOTHING:
958 break;
959 case BACK:
960 break;
961 case OPEN + 1:
962 case OPEN + 2:
963 case OPEN + 3:
964 case OPEN + 4:
965 case OPEN + 5:
966 case OPEN + 6:
967 case OPEN + 7:
968 case OPEN + 8:
969 case OPEN + 9:{
970 register int no;
971 register char *save;
972
973 no = OP(scan) - OPEN;
974 save = reginput;
975
976 if (regmatch(next)) {
977 /*
978 * Don't set startp if some later
979 * invocation of the same parentheses
980 * already has.
981 */
982 if (regstartp[no] == NULL)
983 regstartp[no] = save;
984 return (1);
985 } else
986 return (0);
987 }
988 break;
989 case CLOSE + 1:
990 case CLOSE + 2:
991 case CLOSE + 3:
992 case CLOSE + 4:
993 case CLOSE + 5:
994 case CLOSE + 6:
995 case CLOSE + 7:
996 case CLOSE + 8:
997 case CLOSE + 9:{
998 register int no;
999 register char *save;
1000
1001 no = OP(scan) - CLOSE;
1002 save = reginput;
1003
1004 if (regmatch(next)) {
1005 /*
1006 * Don't set endp if some later
1007 * invocation of the same parentheses
1008 * already has.
1009 */
1010 if (regendp[no] == NULL)
1011 regendp[no] = save;
1012 return (1);
1013 } else
1014 return (0);
1015 }
1016 break;
1017 case BRANCH:{
1018 register char *save;
1019
1020 if (OP(next) != BRANCH) /* No choice. */
1021 next = OPERAND(scan); /* Avoid recursion. */
1022 else {
1023 do {
1024 save = reginput;
1025 if (regmatch(OPERAND(scan)))
1026 return (1);
1027 reginput = save;
1028 scan = regnext(scan);
1029 } while (scan != NULL && OP(scan) == BRANCH);
1030 return (0);
1031 /* NOTREACHED */
1032 }
1033 }
1034 break;
1035 case STAR:
1036 case PLUS:{
1037 register char nextch;
1038 register int no;
1039 register char *save;
1040 register int min;
1041
1042 /*
1043 * Lookahead to avoid useless match attempts
1044 * when we know what character comes next.
1045 */
1046 nextch = '\0';
1047 if (OP(next) == EXACTLY)
1048 nextch = *OPERAND(next);
1049 min = (OP(scan) == STAR) ? 0 : 1;
1050 save = reginput;
1051 no = regrepeat(OPERAND(scan));
1052 while (no >= min) {
1053 /* If it could work, try it. */
1054 if (nextch == '\0' || *reginput == nextch)
1055 if (regmatch(next))
1056 return (1);
1057 /* Couldn't or didn't -- back up. */
1058 no--;
1059 reginput = save + no;
1060 }
1061 return (0);
1062 }
1063 break;
1064 case END:
1065 return (1); /* Success! */
1066 break;
1067 default:
1068 regerror("memory corruption");
1069 return (0);
1070 break;
1071 }
1072
1073 scan = next;
1074 }
1075
1076 /*
1077 * We get here only if there's trouble -- normally "case END" is
1078 * the terminating point.
1079 */
1080 regerror("corrupted pointers");
1081 return (0);
1082 }
1083
1084 /*
1085 - regrepeat - repeatedly match something simple, report how many
1086 */
regrepeat(char * p)1087 static int regrepeat(char *p)
1088 {
1089 register int count = 0;
1090 register char *scan;
1091 register char *opnd;
1092
1093 scan = reginput;
1094 opnd = OPERAND(p);
1095 switch (OP(p)) {
1096 case ANY:
1097 /* Flawfinder: ignore (strlen) */
1098 count = strlen(scan);
1099 scan += count;
1100 break;
1101 case EXACTLY:
1102 while (*opnd == *scan) {
1103 count++;
1104 scan++;
1105 }
1106 break;
1107 case ANYOF:
1108 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1109 count++;
1110 scan++;
1111 }
1112 break;
1113 case ANYBUT:
1114 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1115 count++;
1116 scan++;
1117 }
1118 break;
1119 default: /* Oh dear. Called inappropriately. */
1120 regerror("internal foulup");
1121 count = 0; /* Best compromise. */
1122 break;
1123 }
1124 reginput = scan;
1125
1126 return (count);
1127 }
1128
1129 /*
1130 - regnext - dig the "next" pointer out of a node
1131 */
regnext(register char * p)1132 static char *regnext(register char *p)
1133 {
1134 register int offset;
1135
1136 if (p == ®dummy)
1137 return (NULL);
1138
1139 offset = NEXT(p);
1140 if (offset == 0)
1141 return (NULL);
1142
1143 if (OP(p) == BACK)
1144 return (p - offset);
1145 else
1146 return (p + offset);
1147 }
1148
1149 #ifdef WANT_REGDUMP
1150
1151 /*
1152 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1153 */
regdump(regexp * r)1154 void regdump(regexp * r)
1155 {
1156 register char *s;
1157 register char op = EXACTLY; /* Arbitrary non-END op. */
1158 register char *next;
1159
1160 s = r->program + 1;
1161 while (op != END) { /* While that wasn't END last time... */
1162 op = OP(s);
1163 printf("%2d%s", s - r->program, regprop(s)); /* Where, what. */
1164 next = regnext(s);
1165 if (next == NULL) /* Next ptr. */
1166 printf("(0)");
1167 else
1168 printf("(%d)", (s - r->program) + (next - s));
1169 s += 3;
1170 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1171 /* Literal string, where present. */
1172 while (*s != '\0') {
1173 putchar(*s);
1174 s++;
1175 }
1176 s++;
1177 }
1178 putchar('\n');
1179 }
1180
1181 /* Header fields of interest. */
1182 if (r->regstart != '\0')
1183 printf("start `%c' ", r->regstart);
1184 if (r->reganch)
1185 printf("anchored ");
1186 if (r->regmust != NULL)
1187 printf("must have \"%s\"", r->regmust);
1188 printf("\n");
1189 }
1190
1191 /*
1192 - regprop - printable representation of opcode
1193 */
regprop(char * op)1194 static char *regprop(char *op)
1195 {
1196 register char *p = NULL;
1197 /* Flawfinder: ignore (char) */
1198 static char buf[50];
1199
1200 /* Flawfinder: fix (strcpy) */
1201 (void)strlcpy(buf, ":", sizeof(buf));
1202
1203 switch (OP(op)) {
1204 case BOL:
1205 p = "BOL";
1206 break;
1207 case EOL:
1208 p = "EOL";
1209 break;
1210 case ANY:
1211 p = "ANY";
1212 break;
1213 case ANYOF:
1214 p = "ANYOF";
1215 break;
1216 case ANYBUT:
1217 p = "ANYBUT";
1218 break;
1219 case BRANCH:
1220 p = "BRANCH";
1221 break;
1222 case EXACTLY:
1223 p = "EXACTLY";
1224 break;
1225 case NOTHING:
1226 p = "NOTHING";
1227 break;
1228 case BACK:
1229 p = "BACK";
1230 break;
1231 case END:
1232 p = "END";
1233 break;
1234 case OPEN + 1:
1235 case OPEN + 2:
1236 case OPEN + 3:
1237 case OPEN + 4:
1238 case OPEN + 5:
1239 case OPEN + 6:
1240 case OPEN + 7:
1241 case OPEN + 8:
1242 case OPEN + 9:
1243 /* Flawfinder: fix (strlen) */
1244 len = strlen(buf);
1245 /* Flawfinder: fix (sprintf) */
1246 snprintf(buf + len, sizeof(buf) - len, "OPEN%d", OP(op) - OPEN);
1247 p = NULL;
1248 break;
1249 case CLOSE + 1:
1250 case CLOSE + 2:
1251 case CLOSE + 3:
1252 case CLOSE + 4:
1253 case CLOSE + 5:
1254 case CLOSE + 6:
1255 case CLOSE + 7:
1256 case CLOSE + 8:
1257 case CLOSE + 9:
1258 /* Flawfinder: fix (strlen) */
1259 len = strlen(buf);
1260 /* Flawfinder: fix (sprintf) */
1261 snprintf(buf + len, sizeof(buf) - len, "CLOSE%d", OP(op) - CLOSE);
1262 p = NULL;
1263 break;
1264 case STAR:
1265 p = "STAR";
1266 break;
1267 case PLUS:
1268 p = "PLUS";
1269 break;
1270 case WORDA:
1271 p = "WORDA";
1272 break;
1273 case WORDZ:
1274 p = "WORDZ";
1275 break;
1276 default:
1277 regerror("corrupted opcode");
1278 break;
1279 }
1280 if (p != NULL)
1281 /* Flawfinder: fix (strcat) */
1282 strlcat(buf, p, sizeof(buf));
1283 return (buf);
1284 }
1285 #endif /* WANT_REGDUMP */
1286
1287 /*
1288 * The following is provided for those people who do not have strcspn() in
1289 * their C libraries. They should get off their butts and do something
1290 * about it; at least one public-domain implementation of those (highly
1291 * useful) string routines has been published on Usenet.
1292 */
1293 #ifdef WANT_STRCSPN
1294 /*
1295 * strcspn - find length of initial segment of s1 consisting entirely
1296 * of characters not from s2
1297 */
1298
strcspn(char * s1,char * s2)1299 static int strcspn(char *s1, char *s2)
1300 {
1301 register char *scan1;
1302 register char *scan2;
1303 register int count;
1304
1305 count = 0;
1306 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1307 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1308 if (*scan1 == *scan2++)
1309 return (count);
1310 count++;
1311 }
1312 return (count);
1313 }
1314 #endif /* WANT_STRCSPN */
1315
1316 #ifdef WANT_REGERROR
regerror(char * s)1317 void regerror(char *s)
1318 {
1319 fprintf(stderr, "regerror: %s\n", s);
1320 }
1321 #endif /* WANT_REGERROR */
1322
1323 #ifdef WANT_REGSUB
1324 /*
1325 * regsub
1326 *
1327 * Copyright (c) 1986 by University of Toronto.
1328 * Written by Henry Spencer. Not derived from licensed software.
1329 *
1330 * Permission is granted to anyone to use this software for any
1331 * purpose on any computer system, and to redistribute it freely,
1332 * subject to the following restrictions:
1333 *
1334 * 1. The author is not responsible for the consequences of use of
1335 * this software, no matter how awful, even if they arise
1336 * from defects in it.
1337 *
1338 * 2. The origin of this software must not be misrepresented, either
1339 * by explicit claim or by omission.
1340 *
1341 * 3. Altered versions must be plainly marked as such, and must not
1342 * be misrepresented as being the original software.
1343 */
1344
1345 #ifndef CHARBITS
1346 #define UCHARAT(p) ((int)*(unsigned char *)(p))
1347 #else
1348 #define UCHARAT(p) ((int)*(p)&CHARBITS)
1349 #endif
1350
1351 /*
1352 - regsub - perform substitutions after a regexp match
1353 */
regsub(prog,source,dest)1354 void regsub(prog, source, dest)
1355 regexp *prog;
1356 char *source;
1357 char *dest;
1358 {
1359 register char *src;
1360 register char *dst;
1361 register char c;
1362 register int no;
1363 register int len;
1364
1365 if (prog == NULL || source == NULL || dest == NULL) {
1366 regerror("NULL parm to regsub");
1367 return;
1368 }
1369 if (UCHARAT(prog->program) != MAGIC) {
1370 regerror("damaged regexp fed to regsub");
1371 return;
1372 }
1373
1374 src = source;
1375 dst = dest;
1376 while ((c = *src++) != '\0') {
1377 if (c == '&')
1378 no = 0;
1379 else if (c == '\\' && '0' <= *src && *src <= '9')
1380 no = *src++ - '0';
1381 else
1382 no = -1;
1383 if (no < 0) { /* Ordinary character. */
1384 if (c == '\\' && (*src == '\\' || *src == '&'))
1385 c = *src++;
1386 *dst++ = c;
1387 } else if (prog->startp[no] != NULL && prog->endp[no] != NULL &&
1388 prog->endp[no] > prog->startp[no]) {
1389 len = prog->endp[no] - prog->startp[no];
1390 /* Flawfinder: ignore (strncpy) */
1391 strncpy(dst, prog->startp[no], len);
1392 dst += len;
1393 if (len != 0 && *(dst - 1) == '\0') { /* strncpy hit NUL. */
1394 regerror("damaged match string");
1395 return;
1396 }
1397 }
1398 }
1399 *dst++ = '\0';
1400 }
1401 #endif /* WANT_REGSUB */
1402
1403 #endif /* HAVE_REGEX */
1404