1 static const char rcsid[] = "$Id: s_regex.c,v 1.10 2004/04/30 17:00:58 will Exp $";
2 /*
3 * regex - Regular expression pattern matching
4 * and replacement
5 *
6 *
7 * By: Ozan S. Yigit (oz)
8 * Dept. of Computer Science
9 * York University
10 *
11 *
12 * These routines are the PUBLIC DOMAIN equivalents
13 * of regex routines as found in 4.nBSD UN*X, with minor
14 * extensions.
15 *
16 * These routines are derived from various implementations
17 * found in software tools books, and Conroy's grep. They
18 * are NOT derived from licensed/restricted software.
19 * For more interesting/academic/complicated implementations,
20 * see Henry Spencer's regexp routines, or GNU Emacs pattern
21 * matching module.
22 *
23 * ** NAMES CHANGED TO BE PREFIXED WITH s_, TO AVOID CONFLICTS **
24 * ** -Will Deich, Feb 1997 **
25 *
26 * Routines:
27 * s_re_comp: compile a regular expression into
28 * a DFA.
29 *
30 * char *s_re_comp(s)
31 * char *s;
32 *
33 * s_re_exec: execute the DFA to match a pattern.
34 *
35 * int s_re_exec(s)
36 * char *s;
37 *
38 * s_re_modw change s_re_exec's understanding of what
39 * a "word" looks like (for \< and \>)
40 * by adding into the hidden word-character
41 * table.
42 *
43 * void s_re_modw(s)
44 * char *s;
45 *
46 * s_re_subs: substitute the matched portions in
47 * a new string.
48 *
49 * int s_re_subs(src, dst)
50 * char *src;
51 * char *dst;
52 *
53 * s_re_fail: failure routine for s_re_exec.
54 *
55 * void s_re_fail(msg, op)
56 * char *msg;
57 * char op;
58 *
59 * Regular Expressions:
60 *
61 * [1] char matches itself, unless it is a special
62 * character (metachar): . \ [ ] * + ^ $
63 *
64 * [2] . matches any character.
65 *
66 * [3] \ matches the character following it, except
67 * when followed by a left or right round bracket,
68 * a digit 1 to 9 or a left or right angle bracket.
69 * (see [7], [8] and [9])
70 * It is used as an escape character for all
71 * other meta-characters, and itself. When used
72 * in a set ([4]), it is treated as an ordinary
73 * character.
74 *
75 * [4] [set] matches one of the characters in the set.
76 * If the first character in the set is "^",
77 * it matches a character NOT in the set. A
78 * shorthand S-E is used to specify a set of
79 * characters S upto E, inclusive. The special
80 * characters "]" and "-" have no special
81 * meaning if they appear as the first chars
82 * in the set.
83 * examples: match:
84 *
85 * [a-z] any lowercase alpha
86 *
87 * [^]-] any char except ] and -
88 *
89 * [^A-Z] any char except uppercase
90 * alpha
91 *
92 * [a-zA-Z] any alpha
93 *
94 * [5] * any regular expression form [1] to [4], followed by
95 * closure char (*) matches zero or more matches of
96 * that form.
97 *
98 * [6] + same as [5], except it matches one or more.
99 *
100 * [7] a regular expression in the form [1] to [10], enclosed
101 * as \(form\) matches what form matches. The enclosure
102 * creates a set of tags, used for [8] and for
103 * pattern substution. The tagged forms are numbered
104 * starting from 1.
105 *
106 * [8] a \ followed by a digit 1 to 9 matches whatever a
107 * previously tagged regular expression ([7]) matched.
108 *
109 * [9] \< a regular expression starting with a \< construct
110 * \> and/or ending with a \> construct, restricts the
111 * pattern matching to the beginning of a word, and/or
112 * the end of a word. A word is defined to be a character
113 * string beginning and/or ending with the characters
114 * A-Z a-z 0-9 and _. It must also be preceded and/or
115 * followed by any character outside those mentioned.
116 *
117 * [10] a composite regular expression xy where x and y
118 * are in the form [1] to [10] matches the longest
119 * match of x followed by a match for y.
120 *
121 * [11] ^ a regular expression starting with a ^ character
122 * $ and/or ending with a $ character, restricts the
123 * pattern matching to the beginning of the line,
124 * or the end of line. [anchors] Elsewhere in the
125 * pattern, ^ and $ are treated as ordinary characters.
126 *
127 *
128 * Acknowledgements:
129 *
130 * HCR's Hugh Redelmeier has been most helpful in various
131 * stages of development. He convinced me to include BOW
132 * and EOW constructs, originally invented by Rob Pike at
133 * the University of Toronto.
134 *
135 * References:
136 * Software tools Kernighan & Plauger
137 * Software tools in Pascal Kernighan & Plauger
138 * Grep [rsx-11 C dist] David Conroy
139 * ed - text editor Un*x Programmer's Manual
140 * Advanced editing on Un*x B. W. Kernighan
141 * RegExp routines Henry Spencer
142 *
143 * Notes:
144 *
145 * This implementation uses a bit-set representation for character
146 * classes for speed and compactness. Each character is represented
147 * by one bit in a 128-bit block. Thus, CCL or NCL always takes a
148 * constant 16 bytes in the internal dfa, and s_re_exec does a single
149 * bit comparison to locate the character in the set.
150 *
151 * Examples:
152 *
153 * pattern: foo*.*
154 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
155 * matches: fo foo fooo foobar fobar foxx ...
156 *
157 * pattern: fo[ob]a[rz]
158 * compile: CHR f CHR o CCL 2 o b CHR a CCL bitset END
159 * matches: fobar fooar fobaz fooaz
160 *
161 * pattern: foo\\+
162 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
163 * matches: foo\ foo\\ foo\\\ ...
164 *
165 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
166 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
167 * matches: foo1foo foo2foo foo3foo
168 *
169 * pattern: \(fo.*\)-\1
170 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
171 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
172 *
173 */
174
175 #define MAXDFA 1024
176 #define MAXTAG 10
177
178 #define OKP 1
179 #define NOP 0
180
181 #define CHR 1
182 #define ANY 2
183 #define CCL 3
184 #define NCL 4
185 #define BOL 5
186 #define EOL 6
187 #define BOT 7
188 #define EOT 8
189 #define BOW 9
190 #define EOW 10
191 #define REF 11
192 #define CLO 12
193
194 #define END 0
195
196 /*
197 * The following defines are not meant
198 * to be changeable. They are for readibility
199 * only.
200 *
201 */
202 #define MAXCHR 128
203 #define CHRBIT 8
204 #define BITBLK MAXCHR/CHRBIT
205 #define BLKIND 0170
206 #define BITIND 07
207
208 #define ASCIIB 0177
209
210 typedef unsigned char CHAR;
211
212 static int tagstk[MAXTAG]; /* subpat tag stack..*/
213 static CHAR dfa[MAXDFA]; /* automaton.. */
214 static int sta = NOP; /* status of lastpat */
215
216 static CHAR bittab[BITBLK]; /* bit table for CCL */
217
218 static void
chset(c)219 chset(c)
220 register CHAR c;
221 {
222 bittab[((int) ((c)&BLKIND))>>3] |= 1<<((c)& (unsigned) BITIND);
223 }
224
225 #define badpat(x) return(*dfa = END, x)
226 #define store(x) *mp++ = x
227
228 char *
s_re_comp(pat)229 s_re_comp(pat)
230 char *pat;
231 {
232 register char *p; /* pattern pointer */
233 register CHAR *mp=dfa; /* dfa pointer */
234 register CHAR *lp; /* saved pointer.. */
235 register CHAR *sp=dfa; /* another one.. */
236
237 register int tagi = 0; /* tag stack index */
238 register int tagc = 1; /* actual tag count */
239
240 register int n;
241 int c1, c2;
242
243 if (!pat || !*pat) {
244 if (sta)
245 return(0);
246 else
247 badpat("No previous regular expression");
248 }
249 sta = NOP;
250
251 for (p = pat; *p; p++) {
252 lp = mp;
253 switch(*p) {
254
255 case '.': /* match any char.. */
256 store(ANY);
257 break;
258
259 case '^': /* match beginning.. */
260 if (p == pat)
261 store(BOL);
262 else {
263 store(CHR);
264 store(*p);
265 }
266 break;
267
268 case '$': /* match endofline.. */
269 if (!*(p+1))
270 store(EOL);
271 else {
272 store(CHR);
273 store(*p);
274 }
275 break;
276
277 case '[': /* match char class..*/
278
279 if (*++p == '^') {
280 store(NCL);
281 p++;
282 }
283 else
284 store(CCL);
285
286 if (*p == '-') /* real dash */
287 chset(*p++);
288 if (*p == ']') /* real brac */
289 chset(*p++);
290 while (*p && *p != ']') {
291 if (*p == '-' && *(p+1) && *(p+1) != ']') {
292 p++;
293 c1 = *(p-2) + 1;
294 c2 = *p++;
295 while (c1 <= c2)
296 chset(c1++);
297 }
298 #ifdef EXTEND
299 else if (*p == '\\' && *(p+1)) {
300 p++;
301 chset(*p++);
302 }
303 #endif
304 else
305 chset(*p++);
306 }
307 if (!*p)
308 badpat("Missing ]");
309
310 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
311 store(bittab[n]);
312
313 break;
314
315 case '*': /* match 0 or more.. */
316 case '+': /* match 1 or more.. */
317 if (p == pat)
318 badpat("Empty closure");
319 lp = sp; /* previous opcode */
320 if (*lp == CLO) /* equivalence.. */
321 break;
322 switch(*lp) {
323
324 case BOL:
325 case BOT:
326 case EOT:
327 case BOW:
328 case EOW:
329 case REF:
330 badpat("Illegal closure");
331 default:
332 break;
333 }
334
335 if (*p == '+')
336 for (sp = mp; lp < sp; lp++)
337 store(*lp);
338
339 store(END);
340 store(END);
341 sp = mp;
342 while (--mp > lp)
343 *mp = mp[-1];
344 store(CLO);
345 mp = sp;
346 break;
347
348 case '\\': /* tags, backrefs .. */
349 switch(*++p) {
350
351 case '(':
352 if (tagc < MAXTAG) {
353 tagstk[++tagi] = tagc;
354 store(BOT);
355 store(tagc++);
356 }
357 else
358 badpat("Too many \\(\\) pairs");
359 break;
360 case ')':
361 if (*sp == BOT)
362 badpat("Null pattern inside \\(\\)");
363 if (tagi > 0) {
364 store(EOT);
365 store(tagstk[tagi--]);
366 }
367 else
368 badpat("Unmatched \\)");
369 break;
370 case '<':
371 store(BOW);
372 break;
373 case '>':
374 if (*sp == BOW)
375 badpat("Null pattern inside \\<\\>");
376 store(EOW);
377 break;
378 case '1':
379 case '2':
380 case '3':
381 case '4':
382 case '5':
383 case '6':
384 case '7':
385 case '8':
386 case '9':
387 n = *p-'0';
388 if (tagi > 0 && tagstk[tagi] == n)
389 badpat("Cyclical reference");
390 if (tagc > n) {
391 store(REF);
392 store(n);
393 }
394 else
395 badpat("Undetermined reference");
396 break;
397 #ifdef EXTEND
398 case 'b':
399 store(CHR);
400 store('\b');
401 break;
402 case 'n':
403 store(CHR);
404 store('\n');
405 break;
406 case 'f':
407 store(CHR);
408 store('\f');
409 break;
410 case 'r':
411 store(CHR);
412 store('\r');
413 break;
414 case 't':
415 store(CHR);
416 store('\t');
417 break;
418 #endif
419 default:
420 store(CHR);
421 store(*p);
422 }
423 break;
424
425 default : /* an ordinary char */
426 store(CHR);
427 store(*p);
428 break;
429 }
430 sp = lp;
431 }
432 if (tagi > 0)
433 badpat("Unmatched \\(");
434 store(END);
435 sta = OKP;
436 return(0);
437 }
438
439
440 static char *bol;
441 static char *bopat[MAXTAG];
442 static char *eopat[MAXTAG];
443 static char *pmatch();
444
445 /*
446 * s_re_exec:
447 * execute dfa to find a match.
448 *
449 * special cases: (dfa[0])
450 * BOL
451 * Match only once, starting from the
452 * beginning.
453 * CHR
454 * First locate the character without
455 * calling pmatch, and if found, call
456 * pmatch for the remaining string.
457 * END
458 * s_re_comp failed, poor luser did not
459 * check for it. Fail fast.
460 *
461 * If a match is found, bopat[0] and eopat[0] are set
462 * to the beginning and the end of the matched fragment,
463 * respectively.
464 *
465 */
466
467 int
s_re_exec(lp)468 s_re_exec(lp)
469 register char *lp;
470 {
471 register char c;
472 register char *ep = 0;
473 register CHAR *ap = dfa;
474
475 bol = lp;
476
477 bopat[0] = 0;
478 bopat[1] = 0;
479 bopat[2] = 0;
480 bopat[3] = 0;
481 bopat[4] = 0;
482 bopat[5] = 0;
483 bopat[6] = 0;
484 bopat[7] = 0;
485 bopat[8] = 0;
486 bopat[9] = 0;
487
488 switch(*ap) {
489
490 case BOL: /* anchored: match from BOL only */
491 ep = pmatch(lp,ap);
492 break;
493 case CHR: /* ordinary char: locate it fast */
494 c = *(ap+1);
495 while (*lp && *lp != c)
496 lp++;
497 if (!*lp) /* if EOS, fail, else fall thru. */
498 return(0);
499 default: /* regular matching all the way. */
500 while (*lp) {
501 if ((ep = pmatch(lp,ap)))
502 break;
503 lp++;
504 }
505 break;
506 case END: /* munged automaton. fail always */
507 return(0);
508 }
509 if (!ep)
510 return(0);
511
512 bopat[0] = lp;
513 eopat[0] = ep;
514 return(1);
515 }
516
517 /*
518 * pmatch:
519 * internal routine for the hard part
520 *
521 * This code is mostly snarfed from an early
522 * grep written by David Conroy. The backref and
523 * tag stuff, and various other mods are by oZ.
524 *
525 * special cases: (dfa[n], dfa[n+1])
526 * CLO ANY
527 * We KNOW ".*" will match ANYTHING
528 * upto the end of line. Thus, go to
529 * the end of line straight, without
530 * calling pmatch recursively. As in
531 * the other closure cases, the remaining
532 * pattern must be matched by moving
533 * backwards on the string recursively,
534 * to find a match for xy (x is ".*" and
535 * y is the remaining pattern) where
536 * the match satisfies the LONGEST match
537 * for x followed by a match for y.
538 * CLO CHR
539 * We can again scan the string forward
540 * for the single char without recursion,
541 * and at the point of failure, we execute
542 * the remaining dfa recursively, as
543 * described above.
544 *
545 * At the end of a successful match, bopat[n] and eopat[n]
546 * are set to the beginning and end of subpatterns matched
547 * by tagged expressions (n = 1 to 9).
548 *
549 */
550
551 extern void s_re_fail();
552
553 /*
554 * character classification table for word boundary
555 * operators BOW and EOW. the reason for not using
556 * ctype macros is that we can let the user add into
557 * our own table. see s_re_modw. This table is not in
558 * the bitset form, since we may wish to extend it
559 * in the future for other character classifications.
560 *
561 * TRUE for 0-9 A-Z a-z _
562 */
563 static char chrtyp[MAXCHR] = {
564 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
565 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
566 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
567 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
568 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
569 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
570 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
571 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
572 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
573 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
574 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
575 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
576 1, 1, 1, 0, 0, 0, 0, 0
577 };
578
579 #define inascii(x) (0177&(x))
580 #define iswordc(x) chrtyp[inascii(x)]
581 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & (1<<((y)&BITIND)))
582
583 /*
584 * skip values for CLO XXX to skip past the closure
585 *
586 */
587
588 #define ANYSKIP 2 /* CLO ANY END ... */
589 #define CHRSKIP 3 /* CLO CHR chr END ... */
590 #define CCLSKIP 18 /* CLO CCL 16bytes END ... */
591
592 static char *
pmatch(lp,ap)593 pmatch(lp, ap)
594 register char *lp;
595 register CHAR *ap;
596 {
597 register char *e; /* extra pointer for CLO */
598 register char *bp; /* beginning of subpat.. */
599 register char *ep; /* ending of subpat.. */
600 register int op, c, n;
601 char *are; /* to save the line ptr. */
602
603 while ((op = *ap++) != END)
604 switch(op) {
605
606 case CHR:
607 if ((unsigned char) *lp++ != *ap++)
608 return(0);
609 break;
610 case ANY:
611 if (!*lp++)
612 return(0);
613 break;
614 case CCL:
615 c = *lp++;
616 if (!isinset(ap,c))
617 return(0);
618 ap += BITBLK;
619 break;
620 case NCL:
621 c = *lp++;
622 if (isinset(ap,c))
623 return(0);
624 ap += BITBLK;
625 break;
626 case BOL:
627 if (lp != bol)
628 return(0);
629 break;
630 case EOL:
631 if (*lp)
632 return(0);
633 break;
634 case BOT:
635 bopat[*ap++] = lp;
636 break;
637 case EOT:
638 eopat[*ap++] = lp;
639 break;
640 case BOW:
641 if (!(lp!=bol && iswordc(lp[-1])) && iswordc(*lp))
642 break;
643 return(0);
644 case EOW:
645 if ((lp!=bol && iswordc(lp[-1])) && !iswordc(*lp))
646 break;
647 return(0);
648 case REF:
649 n = *ap++;
650 bp = bopat[n];
651 ep = eopat[n];
652 while (bp < ep)
653 if (*bp++ != *lp++)
654 return(0);
655 break;
656 case CLO:
657 are = lp;
658 switch(*ap) {
659
660 case ANY:
661 while (*lp)
662 lp++;
663 n = ANYSKIP;
664 break;
665 case CHR:
666 c = *(ap+1);
667 while (*lp && c == *lp)
668 lp++;
669 n = CHRSKIP;
670 break;
671 case CCL:
672 case NCL:
673 while (*lp && (e = pmatch(lp, ap)))
674 lp = e;
675 n = CCLSKIP;
676 break;
677 default:
678 s_re_fail("closure: bad dfa.", *ap);
679 return(0);
680 }
681
682 ap += n;
683
684 while (lp >= are) {
685 if ((e = pmatch(lp, ap)))
686 return(e);
687 --lp;
688 }
689 return(0);
690 default:
691 s_re_fail("s_re_exec: bad dfa.", op);
692 return(0);
693 }
694 return(lp);
695 }
696
697 /*
698 * s_re_modw:
699 * add new characters into the word table to
700 * change the s_re_exec's understanding of what
701 * a word should look like. Note that we only
702 * accept additions into the word definition.
703 *
704 * If the string parameter is 0 or null string,
705 * the table is reset back to the default, which
706 * contains A-Z a-z 0-9 _. [We use the compact
707 * bitset representation for the default table]
708 *
709 */
710
711 static unsigned char deftab[16] = {
712 0, 0, 0, 0, 0, 0, 0377, 003, 0376, 0377, 0377, 0207,
713 0376, 0377, 0377, 007
714 };
715
716 void
s_re_modw(s)717 s_re_modw(s)
718 register char *s;
719 {
720 register int i;
721
722 if (!s || !*s) {
723 for (i = 0; i < MAXCHR; i++)
724 if (!isinset(deftab,i))
725 iswordc(i) = 0;
726 }
727 else
728 while(*s)
729 iswordc(*s++) = 1;
730 }
731
732 /*
733 * s_re_subs:
734 * substitute the matched portions of the src in
735 * dst.
736 *
737 * & substitute the entire matched pattern.
738 *
739 * \digit substitute a subpattern, with the given
740 * tag number. Tags are numbered from 1 to
741 * 9. If the particular tagged subpattern
742 * does not exist, null is substituted.
743 *
744 */
745 int
s_re_subs(src,dst)746 s_re_subs(src, dst)
747 register char *src;
748 register char *dst;
749 {
750 register char c;
751 register int pin;
752 register char *bp;
753 register char *ep;
754
755 if (!*src || !bopat[0])
756 return(0);
757
758 while ((c = *src++)) {
759 switch(c) {
760
761 case '&':
762 pin = 0;
763 break;
764
765 case '\\':
766 c = *src++;
767 if (c >= '0' && c <= '9') {
768 pin = c - '0';
769 break;
770 }
771
772 default:
773 *dst++ = c;
774 continue;
775 }
776
777 if ((bp = bopat[pin]) && (ep = eopat[pin])) {
778 while (*bp && bp < ep)
779 *dst++ = *bp++;
780 if (bp < ep)
781 return(0);
782 }
783 }
784 *dst = (char) 0;
785 return(1);
786 }
787
788 #ifdef DEBUG
789 /*
790 * symbolic - produce a symbolic dump of the
791 * dfa
792 */
symbolic(s)793 symbolic(s)
794 char *s;
795 {
796 printf("pattern: %s\n", s);
797 printf("dfacode:\n");
798 dfadump(dfa);
799 }
800
801 static
dfadump(ap)802 dfadump(ap)
803 CHAR *ap;
804 {
805 register int n;
806
807 while (*ap != END)
808 switch(*ap++) {
809 case CLO:
810 printf("CLOSURE");
811 dfadump(ap);
812 switch(*ap) {
813 case CHR:
814 n = CHRSKIP;
815 break;
816 case ANY:
817 n = ANYSKIP;
818 break;
819 case CCL:
820 case NCL:
821 n = CCLSKIP;
822 break;
823 }
824 ap += n;
825 break;
826 case CHR:
827 printf("\tCHR %c\n",*ap++);
828 break;
829 case ANY:
830 printf("\tANY .\n");
831 break;
832 case BOL:
833 printf("\tBOL -\n");
834 break;
835 case EOL:
836 printf("\tEOL -\n");
837 break;
838 case BOT:
839 printf("BOT: %d\n",*ap++);
840 break;
841 case EOT:
842 printf("EOT: %d\n",*ap++);
843 break;
844 case BOW:
845 printf("BOW\n");
846 break;
847 case EOW:
848 printf("EOW\n");
849 break;
850 case REF:
851 printf("REF: %d\n",*ap++);
852 break;
853 case CCL:
854 printf("\tCCL [");
855 for (n = 0; n < MAXCHR; n++)
856 if (isinset(ap,(CHAR)n))
857 printf("%c",n);
858 printf("]\n");
859 ap += BITBLK;
860 break;
861 case NCL:
862 printf("\tNCL [");
863 for (n = 0; n < MAXCHR; n++)
864 if (isinset(ap,(CHAR)n))
865 printf("%c",n);
866 printf("]\n");
867 ap += BITBLK;
868 break;
869 default:
870 printf("bad dfa. opcode %o\n", ap[-1]);
871 exit(1);
872 break;
873 }
874 }
875 #endif
876