1 #ifndef lint
2 static char sccsid[] = "@(#)regexp.c 1.2 (LBL) 12/4/85";
3 static char rcsid[] =
4 "$Id: regexp.c,v 1.3 1999/05/27 16:17:43 mike Exp $";
5 #endif
6
7 /*
8 * Copyright %%\copyright%% 1980 The Regents of the University of California.
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 */
39
40 /*
41 * Regular expression matching routines for lgrind/tgrind/tfontedpr.
42 *
43 * These routines were written by Dave Presotto (I think) for vgrind.
44 * Minor mods & attempts to improve performance by Van Jacobson
45 * (van@@lbl-rtsg) and Chris Torek (chris@@maryland).
46 *
47 * Modifications.
48 * --------------
49 * Sep 91 George V Reilly Fixed up some bogus uses of @NIL@ and @NULL@.
50 * 30 Mar 85 Van & Chris Changed @expmatch()@ to return pointer to
51 * start of what was matched in addition to
52 * pointer to match end. Several changes to
53 * improve performance (too numerous to mention).
54 * 11 Dec 84 Dave Presotto Written.
55 */
56
57
58 #include <stdio.h>
59 #include <stdlib.h>
60 #include <ctype.h>
61 #include <string.h>
62 #include "regexp.h"
63
64 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
65
66 extern char *l_id;
67 static void expconv(void); /* forward declaration */
68
69 int (*re_strncmp)(const char *, const char *, size_t);
70 /* function used by @expmatch()@ to compare
71 * strings. The caller should make it point to
72 * @strncmp()@ if case is significant &
73 * @lc_strncmp()@ otherwise.
74 */
75
76
77 /* @lc_strncmp()@ --- like @strncmp()@ except that we convert the
78 * first string to lower case before comparing.
79 */
lc_strncmp(register const char * s1,register const char * s2,register size_t len)80 int lc_strncmp(register const char *s1, register const char *s2, register size_t len)
81 {
82 while (len-- > 0)
83 if (*s2 - makelower(*s1))
84 return 1;
85 else
86 s2++, s1++;
87
88 return 0;
89 }
90
91
92 /* The following routine converts an irregular expression to
93 * internal format.
94 *
95 * Either meta symbols (%|\a|%, %|\d|%, or %|\p|%) or character strings
96 * or operations (alternation or parenthesizing) can be specified.
97 * Each starts with a descriptor byte. The descriptor byte
98 * has @STR@ set for strings, @META@ set for meta symbols,
99 * and @OPER@ set for operations. The descriptor byte can also have
100 * the @OPT@ bit set if the object defined is optional. Also @ALT@
101 * can be set to indicate an alternation.
102 *
103 * For metasymbols, the byte following the descriptor byte identities
104 * the meta symbol (containing an ASCII @'a'@, @'d'@, @'p'@, @'|'@,
105 * or @'('@). For strings, the byte after the descriptor is a
106 * character count for the string:
107 *@
108 * meta symbols := descriptor
109 * symbol
110 *
111 * strings := descriptor
112 * character count
113 * the string
114 *
115 * operations := descriptor
116 * symbol
117 * character count@
118 */
119
120
121 /*
122 * handy macros for accessing parts of match blocks
123 */
124 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
125 #define MNEXT(A) (A+2) /* character following a metasymbol block */
126
127 #define OSYM(A) (*(A+1)) /* symbol in an operation block */
128 #define OCNT(A) (*(A+2)) /* character count */
129 #define ONEXT(A) (A+3) /* next character after the operation */
130 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
131
132 #define SCNT(A) (*(A+1)) /* byte count of a string */
133 #define SSTR(A) (A+2) /* address of the string */
134 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
135
136
137 /*
138 * bit flags in the descriptor
139 */
140 #define OPT 1
141 #define STR 2
142 #define META 4
143 #define ALT 8
144 #define OPER 16
145
146 char *ure; /* pointer current position in unconverted exp */
147 char *ccre; /* pointer to current position in converted exp */
148
149
150
151 /*
152 * Convert an irregular expression to the internal form
153 */
convexp(char * re)154 char *convexp(char *re)
155 /* re unconverted irregular expression */
156 {
157 register char *cre; /* pointer to converted regular expression */
158
159 /* allocate room for the converted expression */
160 if (re == NULL)
161 return NULL;
162 if (*re == '\0')
163 return NULL;
164 cre = (char*)malloc(4 * strlen(re) + 3);
165 ccre = cre;
166 ure = re;
167
168 /* start the conversion with a %|\a|% */
169 *cre = META | OPT;
170 MSYM(cre) = 'a';
171 ccre = MNEXT(cre);
172
173 /* start the conversion (it's recursive) */
174 expconv();
175 *ccre = 0;
176 return cre;
177 }
178
179
180 /*
181 * Actually do the conversion
182 */
expconv(void)183 static void expconv(void)
184 {
185 register char *cs; /* pointer to current symbol in converted exp */
186 register char c; /* character being processed */
187 register char *acs; /* pointer to last alternate */
188 register int temp;
189
190 /* let the conversion begin */
191 acs = NULL;
192 cs = NULL;
193 while (*ure != '\0') {
194 switch (c = *ure++) {
195
196 case '\\':
197 switch (c = *ure++) {
198
199 /* escaped characters are just characters */
200 default:
201 if (cs == NULL || (*cs & STR) == 0) {
202 cs = ccre;
203 *cs = STR;
204 SCNT(cs) = 1;
205 ccre += 2;
206 } else
207 SCNT(cs)++;
208 *ccre++ = c;
209 break;
210
211 /* normal(?) metacharacters */
212 case 'a':
213 case 'd':
214 case 'e':
215 case 'p':
216 if (acs != NULL && acs != cs) {
217 do {
218 temp = OCNT(acs);
219 OCNT(acs) = ccre - acs;
220 acs -= temp;
221 } while (temp != 0);
222 acs = NULL;
223 }
224 cs = ccre;
225 *cs = META;
226 MSYM(cs) = c;
227 ccre = MNEXT(cs);
228 break;
229 }
230 break;
231
232 /* just put the symbol in */
233 case '^':
234 case '$':
235 if (acs != NULL && acs != cs) {
236 do {
237 temp = OCNT(acs);
238 OCNT(acs) = ccre - acs;
239 acs -= temp;
240 } while (temp != 0);
241 acs = NULL;
242 }
243 cs = ccre;
244 *cs = META;
245 MSYM(cs) = c;
246 ccre = MNEXT(cs);
247 break;
248
249 /* mark the last match sequence as optional */
250 case '?':
251 if (cs)
252 *cs = *cs | OPT;
253 break;
254
255 /* recurse and define a subexpression */
256 case '(':
257 if (acs != NULL && acs != cs) {
258 do {
259 temp = OCNT(acs);
260 OCNT(acs) = ccre - acs;
261 acs -= temp;
262 } while (temp != 0);
263 acs = NULL;
264 }
265 cs = ccre;
266 *cs = OPER;
267 OSYM(cs) = '(';
268 ccre = ONEXT(cs);
269 expconv();
270 OCNT(cs) = ccre - cs; /* offset to next symbol */
271 break;
272
273 /* return from a recursion */
274 case ')':
275 if (acs != NULL) {
276 do {
277 temp = OCNT(acs);
278 OCNT(acs) = ccre - acs;
279 acs -= temp;
280 } while (temp != 0);
281 acs = NULL;
282 }
283 cs = ccre;
284 *cs = META;
285 MSYM(cs) = c;
286 ccre = MNEXT(cs);
287 return;
288
289 /* Mark the last match sequence as having an alternate. */
290 /* The third byte will contain an offset to jump over the */
291 /* alternate match in case the first did not fail */
292 case '|':
293 if (acs != NULL && acs != cs)
294 OCNT(ccre) = ccre - acs; /* make a back pointer */
295 else
296 OCNT(ccre) = 0;
297 *cs |= ALT;
298 cs = ccre;
299 *cs = OPER;
300 OSYM(cs) = '|';
301 ccre = ONEXT(cs);
302 acs = cs; /* remember that the pointer is to be filled */
303 break;
304
305 /* if it's not a metasymbol, just build a character string */
306 default:
307 if (cs == NULL || (*cs & STR) == 0) {
308 cs = ccre;
309 *cs = STR;
310 SCNT(cs) = 1;
311 ccre = SSTR(cs);
312 } else
313 SCNT(cs)++;
314 *ccre++ = c;
315 break;
316 }
317 }
318 if (acs != NULL) {
319 do {
320 temp = OCNT(acs);
321 OCNT(acs) = ccre - acs;
322 acs -= temp;
323 } while (temp != 0);
324 acs = NULL;
325 }
326 return;
327 } /* end of @expconv()@ */
328
329
330 /*
331 * The following routine recognises an irregular expression
332 * with the following special characters:
333 *
334 * %|\?|% means last match was optional
335 * %|\a|% matches any number of characters
336 * %|\d|% matches any number of spaces and tabs
337 * %|\p|% matches any number of alphanumeric characters.
338 * The characters matched will be copied into
339 * the area pointed to by @mstring@.
340 * %|\||% alternation
341 * %|\( \)|% grouping used mostly for alternation and
342 * optionality
343 *
344 * The irregular expression must be translated to internal form
345 * prior to calling this routine
346 *
347 * The value returned is the pointer to the last character matched.
348 * If @strtptr@ is non-@NULL@, a pointer to the start of the string matched
349 * (excluding %|\a|% matches) will be returned at @*strtptr@.
350 * If @mstring@ is non-@NULL@, the string matched by a %|\p|% will be copied
351 * into @mstring@.
352 */
353
354 boolean _escaped; /* true if we are currently @_escaped@ */
355 char *_sstart; /* start of string. Set in @putScp()@. */
356
357
expmatch(register char * s,register char * re,char ** strtptr,char * mstring)358 char *expmatch(
359 register char *s, /* string to check for a match in */
360 register char *re, /* a converted irregular expression */
361 char **strtptr, /* where to put ptr to start of match */
362 char *mstring) /* where to put whatever matches a %|\p|% */
363 {
364 register char *cs; /* the current symbol */
365 register char *ptr, *s1; /* temporary pointer */
366 register char c; /* temporary */
367 boolean matched; /* a temporary @boolean@ */
368
369 /* initial conditions */
370 if (strtptr)
371 *strtptr = '\0';
372 if (re == NULL)
373 return NULL;
374 cs = re;
375 matched = FALSE;
376
377 /* loop till expression string is exhausted (or at least pretty tired) */
378 while (*cs) {
379 switch (*cs & (OPER | STR | META)) {
380
381 /* try to match a string */
382 case STR:
383 matched = !((*re_strncmp)(s, SSTR(cs), SCNT(cs)));
384 if (matched) {
385
386 /* hoorah: it matches */
387 s += SCNT(cs);
388 cs = SNEXT(cs);
389 } else if (*cs & ALT) {
390
391 /* alternation: skip to next expression */
392 cs = SNEXT(cs);
393 } else if (*cs & OPT) {
394
395 /* the match is optional */
396 cs = SNEXT(cs);
397 matched = 1; /* indicate a successful match */
398 } else {
399
400 /* no match: error return */
401 return NULL;
402 }
403 break;
404
405 /* an operator: do something fancy */
406 case OPER:
407 switch (OSYM(cs)) {
408
409 /* this is an alternation */
410 case '|':
411 if (matched)
412
413 /* last thing in the alternation was a match: skip ahead */
414 cs = OPTR(cs);
415 else
416
417 /* no match: keep trying */
418 cs = ONEXT(cs);
419 break;
420
421 /* this is a grouping: recurse */
422 case '(':
423 ptr = expmatch(s, ONEXT(cs), strtptr, mstring);
424 if (ptr != NULL) {
425
426 /* the subexpression matched */
427 matched = 1;
428 s = ptr;
429 } else if (*cs & ALT) {
430
431 /* alternation: skip to next expression */
432 matched = 0;
433 } else if (*cs & OPT) {
434
435 /* the match is optional */
436 matched = 1; /* indicate a successful match */
437 } else {
438
439 /* no match: error return */
440 return NULL;
441 }
442 cs = OPTR(cs);
443 break;
444 }
445 break;
446
447 /* try to match a metasymbol */
448 case META:
449 switch (MSYM(cs)) {
450
451 /* try to match anything and remember what was matched */
452 case 'p':
453 /*
454 * This is really the same as trying the match the
455 * remaining parts of the expression to any subset
456 * of the string.
457 */
458 s1 = s;
459 do {
460 ptr = expmatch(s1, MNEXT(cs), strtptr, mstring);
461 if (ptr != NULL && s1 != s) {
462
463 /* we have a match: remember the match */
464 if (mstring) {
465 strncpy(mstring, s, (size_t)(s1 - s));
466 mstring[(int)(s1 - s)] = '\0';
467 }
468 return ptr;
469
470 } else if (ptr != NULL && (*cs & OPT)) {
471
472 /* %|\p|% was optional, so no match is ok */
473 return ptr;
474
475 } else if (ptr != NULL) {
476
477 /* not optional and we still matched */
478 return NULL;
479 }
480 if (!isalnum(*s1) && !strchr(l_id, *s1))
481 return NULL;
482 if (*s1 == '\\')
483 _escaped = _escaped ? FALSE : TRUE;
484 else
485 _escaped = FALSE;
486 } while (*s1++);
487 return NULL;
488
489 /* try to match anything */
490 case 'a':
491 /*
492 * This is really the same as trying the match the
493 * remaining parts of the expression to any subset
494 * of the string.
495 */
496 s1 = s;
497 do {
498 /*
499 * Hack for an important special case: if the next thing
500 * in the pattern is a string, just gobble characters until
501 * we find something that matches that string (this saves
502 * the cost of a recursive call on @expmatch()@ while scanning
503 * for the start of comments or strings). Since many
504 * patterns end with a string, we also treat that as a
505 * special case.
506 */
507 if(*(ptr = MNEXT(cs)) == STR) {
508 c = *SSTR(ptr);
509 while(*s1 && *s1 != c)
510 s1++;
511
512 if (*s1 == 0)
513 return NULL;
514
515 if (SNEXT(ptr) == 0 && (s1 != s || *cs & OPT)) {
516 /* next item is a string, it's the last item and
517 * the %|\a|% match is ok --- just loop to try & match
518 * the string.
519 */
520 if (strtptr)
521 *strtptr = s1;
522
523 cs = ptr;
524 s = s1;
525 break;
526 }
527 }
528 ptr = expmatch(s1, MNEXT(cs), strtptr, mstring);
529 if (ptr != NULL && (s1 != s || *cs & OPT)) {
530
531 /* we have a match */
532 if (strtptr)
533 *strtptr = s1;
534
535 return ptr;
536
537 } else if (ptr != NULL) {
538
539 /* not optional and we still matched */
540 return NULL;
541 }
542 if (*s1 == '\\')
543 _escaped = _escaped ? FALSE : TRUE;
544 else
545 _escaped = FALSE;
546 } while (*s1++);
547 return NULL;
548
549 /* fail if we are currently @_escaped@ */
550 case 'e':
551 if (_escaped)
552 return NULL;
553 cs = MNEXT(cs);
554 break;
555
556 /* match any number of tabs and spaces */
557 case 'd':
558 ptr = s;
559 while (*s == ' ' || *s == '\t')
560 s++;
561
562 if (s != ptr || s == _sstart) {
563
564 /* match: be happy */
565 matched = 1;
566 cs = MNEXT(cs);
567 } else if (*s == '\n' || *s == '\0') {
568
569 /* match: be happy */
570 matched = 1;
571 cs = MNEXT(cs);
572 } else if (*cs & ALT) {
573
574 /* try the next part */
575 matched = 0;
576 cs = MNEXT(cs);
577 } else if (*cs & OPT) {
578
579 /* doesn't matter */
580 matched = 1;
581 cs = MNEXT(cs);
582 } else
583
584 /* no match: error return */
585 return NULL;
586
587 break;
588
589 /* check for end of line */
590 case '$':
591 if (*s == '\0' || *s == '\n') {
592
593 /* match: be happy */
594 /* s++; was here
595 * removed; now $ is left in stream to be matched again, which
596 * is not bad (EOL can end several things at once. Also it
597 * leaves things like CE on the same line.
598 */
599 matched = 1;
600 cs = MNEXT(cs);
601 } else if (*cs & ALT) {
602
603 /* try the next part */
604 matched = 0;
605 cs = MNEXT(cs);
606 } else if (*cs & OPT) {
607
608 /* doesn't matter */
609 matched = 1;
610 cs = MNEXT(cs);
611 } else
612
613 /* no match: error return */
614 return NULL;
615 break;
616
617 /* check for start of line */
618 case '^':
619 if (s == _sstart) {
620
621 /* match: be happy */
622 matched = 1;
623 cs = MNEXT(cs);
624 } else if (*cs & ALT) {
625
626 /* try the next part */
627 matched = 0;
628 cs = MNEXT(cs);
629 } else if (*cs & OPT) {
630
631 /* doesn't matter */
632 matched = 1;
633 cs = MNEXT(cs);
634 } else
635
636 /* no match: error return */
637 return NULL;
638 break;
639
640 /* end of a subexpression: return success */
641 case ')':
642 return s;
643 }
644 break;
645 }
646 }
647 return s;
648 }
649