1 #ifndef lint
2 static char sccsid[] = "@(#)regexp.c     1.2 (LBL) 12/4/85";
3 static char rcsid[] =
4    "$Id: regexp.c,v 1.3 1999/05/27 16:17:43 mike Exp $";
5 #endif
6 
7 /*
8  * Copyright %%\copyright%% 1980 The Regents of the University of California.
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  */
39 
40 /*
41  * Regular expression matching routines for lgrind/tgrind/tfontedpr.
42  *
43  * These routines were written by Dave Presotto (I think) for vgrind.
44  * Minor mods & attempts to improve performance by Van Jacobson
45  * (van@@lbl-rtsg) and Chris Torek (chris@@maryland).
46  *
47  * Modifications.
48  * --------------
49  *    Sep 91	George V Reilly	Fixed up some bogus uses of @NIL@ and @NULL@.
50  * 30 Mar 85	Van & Chris	Changed @expmatch()@ to return pointer to
51  *				start of what was matched in addition to
52  *				pointer to match end.  Several changes to
53  *				improve performance (too numerous to mention).
54  * 11 Dec 84	Dave Presotto	Written.
55  */
56 
57 
58 #include <stdio.h>
59 #include <stdlib.h>
60 #include <ctype.h>
61 #include <string.h>
62 #include "regexp.h"
63 
64 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
65 
66 extern char *l_id;
67 static void expconv(void);	 /* forward declaration */
68 
69 int    (*re_strncmp)(const char *, const char *, size_t);
70 			/* function used by @expmatch()@ to compare
71 			* strings.  The caller should make it point to
72 			* @strncmp()@ if case is significant &
73 			* @lc_strncmp()@ otherwise.
74 			*/
75 
76 
77 /*  @lc_strncmp()@ ---	like @strncmp()@ except that we convert the
78  *			first string to lower case before comparing.
79  */
lc_strncmp(register const char * s1,register const char * s2,register size_t len)80 int lc_strncmp(register const char *s1, register const char *s2, register size_t len)
81 {
82    while (len-- > 0)
83       if (*s2 - makelower(*s1))
84 	 return 1;
85       else
86 	 s2++, s1++;
87 
88    return 0;
89 }
90 
91 
92 /*  The following routine converts an irregular expression to
93  *  internal format.
94  *
95  *  Either meta symbols (%|\a|%, %|\d|%, or %|\p|%) or character strings
96  *  or operations (alternation or parenthesizing) can be specified.
97  *  Each starts with a descriptor byte.  The descriptor byte
98  *  has @STR@ set for strings, @META@ set for meta symbols,
99  *  and @OPER@ set for operations.  The descriptor byte can also have
100  *  the @OPT@ bit set if the object defined is optional.  Also @ALT@
101  *  can be set to indicate an alternation.
102  *
103  *  For metasymbols, the byte following the descriptor byte identities
104  *  the meta symbol (containing an ASCII @'a'@, @'d'@, @'p'@, @'|'@,
105  *  or @'('@).  For strings, the byte after the descriptor is a
106  *  character count for the string:
107  *@
108  *	meta symbols :=	descriptor
109  *			symbol
110  *
111  *	strings :=	descriptor
112  *			character count
113  *			the string
114  *
115  *	operations :=	descriptor
116  *			symbol
117  *			character count@
118  */
119 
120 
121 /*
122  *  handy macros for accessing parts of match blocks
123  */
124 #define MSYM(A)	 (*(A+1))	/* symbol in a meta symbol block */
125 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
126 
127 #define OSYM(A)	 (*(A+1))	/* symbol in an operation block */
128 #define OCNT(A)	 (*(A+2))	/* character count */
129 #define ONEXT(A) (A+3)		/* next character after the operation */
130 #define OPTR(A)	 (A+*(A+2))	/* place pointed to by the operator */
131 
132 #define SCNT(A)	 (*(A+1))	/* byte count of a string */
133 #define SSTR(A)	 (A+2)		/* address of the string */
134 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
135 
136 
137 /*
138  *  bit flags in the descriptor
139  */
140 #define OPT	 1
141 #define STR	 2
142 #define META	 4
143 #define ALT	 8
144 #define OPER	16
145 
146 char *ure;		/* pointer current position in unconverted exp */
147 char *ccre;		/* pointer to current position in converted exp */
148 
149 
150 
151 /*
152  * Convert an irregular expression to the internal form
153  */
convexp(char * re)154 char *convexp(char *re)
155 /* re	 unconverted irregular expression */
156 {
157    register char *cre;	/* pointer to converted regular expression */
158 
159    /* allocate room for the converted expression */
160    if (re == NULL)
161       return NULL;
162    if (*re == '\0')
163       return NULL;
164    cre = (char*)malloc(4 * strlen(re) + 3);
165    ccre = cre;
166    ure = re;
167 
168    /* start the conversion with a %|\a|% */
169    *cre = META | OPT;
170    MSYM(cre) = 'a';
171    ccre = MNEXT(cre);
172 
173    /* start the conversion (it's recursive) */
174    expconv();
175    *ccre = 0;
176    return cre;
177 }
178 
179 
180 /*
181  * Actually do the conversion
182  */
expconv(void)183 static void expconv(void)
184 {
185    register char *cs;	/* pointer to current symbol in converted exp */
186    register char c;	/* character being processed */
187    register char *acs;	/* pointer to last alternate */
188    register int temp;
189 
190    /* let the conversion begin */
191    acs = NULL;
192    cs = NULL;
193    while (*ure != '\0') {
194 	  switch (c = *ure++) {
195 
196 	  case '\\':
197 	 switch (c = *ure++) {
198 
199 		/* escaped characters are just characters */
200 	 default:
201 		if (cs == NULL || (*cs & STR) == 0) {
202 		   cs = ccre;
203 		   *cs = STR;
204 		   SCNT(cs) = 1;
205 		   ccre += 2;
206 		} else
207 		   SCNT(cs)++;
208 		*ccre++ = c;
209 		break;
210 
211 		/* normal(?) metacharacters */
212 	 case 'a':
213 	 case 'd':
214 	 case 'e':
215 	 case 'p':
216 		if (acs != NULL && acs != cs) {
217 		   do {
218 		  temp = OCNT(acs);
219 		  OCNT(acs) = ccre - acs;
220 		  acs -= temp;
221 		   } while (temp != 0);
222 		   acs = NULL;
223 		}
224 		cs = ccre;
225 		*cs = META;
226 		MSYM(cs) = c;
227 	    ccre = MNEXT(cs);
228 	    break;
229 	 }
230 	 break;
231 
232 	 /* just put the symbol in */
233       case '^':
234       case '$':
235 	 if (acs != NULL && acs != cs) {
236 	    do {
237 	       temp = OCNT(acs);
238 	       OCNT(acs) = ccre - acs;
239 	       acs -= temp;
240 	    } while (temp != 0);
241 	    acs = NULL;
242 	 }
243 	 cs = ccre;
244 	 *cs = META;
245 	 MSYM(cs) = c;
246 	 ccre = MNEXT(cs);
247 	 break;
248 
249 	 /* mark the last match sequence as optional */
250       case '?':
251 	 if (cs)
252 		*cs = *cs | OPT;
253 	 break;
254 
255 	 /* recurse and define a subexpression */
256       case '(':
257 	 if (acs != NULL && acs != cs) {
258 	    do {
259 	       temp = OCNT(acs);
260 	       OCNT(acs) = ccre - acs;
261 	       acs -= temp;
262 	    } while (temp != 0);
263 	    acs = NULL;
264 	 }
265 	 cs = ccre;
266 	 *cs = OPER;
267 	 OSYM(cs) = '(';
268 	 ccre = ONEXT(cs);
269 	 expconv();
270 	 OCNT(cs) = ccre - cs;		/* offset to next symbol */
271 	 break;
272 
273 	 /* return from a recursion */
274       case ')':
275 	 if (acs != NULL) {
276 	    do {
277 	       temp = OCNT(acs);
278 	       OCNT(acs) = ccre - acs;
279 	       acs -= temp;
280 	    } while (temp != 0);
281 	    acs = NULL;
282 	 }
283 	 cs = ccre;
284 	 *cs = META;
285 	 MSYM(cs) = c;
286 	 ccre = MNEXT(cs);
287 	 return;
288 
289 	 /* Mark the last match sequence as having an alternate. */
290 	 /* The third byte will contain an offset to jump over the */
291 	 /* alternate match in case the first did not fail */
292       case '|':
293 	 if (acs != NULL && acs != cs)
294 	    OCNT(ccre) = ccre - acs;	/* make a back pointer */
295 	 else
296 	    OCNT(ccre) = 0;
297 	 *cs |= ALT;
298 	 cs = ccre;
299 	 *cs = OPER;
300 	 OSYM(cs) = '|';
301 	 ccre = ONEXT(cs);
302 	 acs = cs;	/* remember that the pointer is to be filled */
303 	 break;
304 
305 	 /* if it's not a metasymbol, just build a character string */
306       default:
307 	 if (cs == NULL || (*cs & STR) == 0) {
308 	    cs = ccre;
309 	    *cs = STR;
310 	    SCNT(cs) = 1;
311 	    ccre = SSTR(cs);
312 	 } else
313 	    SCNT(cs)++;
314 	 *ccre++ = c;
315 	 break;
316       }
317    }
318    if (acs != NULL) {
319       do {
320 	 temp = OCNT(acs);
321 	 OCNT(acs) = ccre - acs;
322 	 acs -= temp;
323       } while (temp != 0);
324       acs = NULL;
325    }
326    return;
327 } /* end of @expconv()@ */
328 
329 
330 /*
331  *  The following routine recognises an irregular expression
332  *  with the following special characters:
333  *
334  *	%|\?|%		means last match was optional
335  *	%|\a|%		matches any number of characters
336  *	%|\d|%		matches any number of spaces and tabs
337  *	%|\p|%		matches any number of alphanumeric characters.
338  *			The characters matched will be copied into
339  *			the area pointed to by @mstring@.
340  *	%|\||%		alternation
341  *	%|\( \)|%	grouping used mostly for alternation and
342  *			optionality
343  *
344  *  The irregular expression must be translated to internal form
345  *  prior to calling this routine
346  *
347  *  The value returned is the pointer to the last character matched.
348  *  If @strtptr@ is non-@NULL@, a pointer to the start of the string matched
349  *  (excluding %|\a|% matches) will be returned at @*strtptr@.
350  *  If @mstring@ is non-@NULL@, the string matched by a %|\p|% will be copied
351  *  into @mstring@.
352  */
353 
354 boolean _escaped;		/* true if we are currently @_escaped@ */
355 char *_sstart;			/* start of string.  Set in @putScp()@. */
356 
357 
expmatch(register char * s,register char * re,char ** strtptr,char * mstring)358 char *expmatch(
359    register char *s,		/* string to check for a match in */
360    register char *re,		/* a converted irregular expression */
361    char **strtptr,		/* where to put ptr to start of match */
362    char *mstring)		/* where to put whatever matches a %|\p|% */
363 {
364    register char *cs;		/* the current symbol */
365    register char *ptr, *s1;	/* temporary pointer */
366    register char c;		/* temporary */
367    boolean matched;		/* a temporary @boolean@ */
368 
369    /* initial conditions */
370    if (strtptr)
371       *strtptr = '\0';
372    if (re == NULL)
373       return NULL;
374    cs = re;
375    matched = FALSE;
376 
377    /* loop till expression string is exhausted (or at least pretty tired) */
378    while (*cs) {
379       switch (*cs & (OPER | STR | META)) {
380 
381 	 /* try to match a string */
382       case STR:
383 	 matched = !((*re_strncmp)(s, SSTR(cs), SCNT(cs)));
384 	 if (matched) {
385 
386 	    /* hoorah: it matches */
387 	    s += SCNT(cs);
388 	    cs = SNEXT(cs);
389 	 } else if (*cs & ALT) {
390 
391 	    /* alternation: skip to next expression */
392 	    cs = SNEXT(cs);
393 	 } else if (*cs & OPT) {
394 
395 	    /* the match is optional */
396 	    cs = SNEXT(cs);
397 	    matched = 1;		/* indicate a successful match */
398 	 } else {
399 
400 	    /* no match: error return */
401 	    return NULL;
402 	 }
403 	 break;
404 
405 	 /* an operator: do something fancy */
406       case OPER:
407 	 switch (OSYM(cs)) {
408 
409 	    /* this is an alternation */
410 	 case '|':
411 	    if (matched)
412 
413 	       /* last thing in the alternation was a match: skip ahead */
414 	       cs = OPTR(cs);
415 	    else
416 
417 	       /* no match: keep trying */
418 	       cs = ONEXT(cs);
419 	    break;
420 
421 	    /* this is a grouping: recurse */
422 	 case '(':
423 	    ptr = expmatch(s, ONEXT(cs), strtptr, mstring);
424 	    if (ptr != NULL) {
425 
426 	       /* the subexpression matched */
427 	       matched = 1;
428 	       s = ptr;
429 	    } else if (*cs & ALT) {
430 
431 	       /* alternation: skip to next expression */
432 	       matched = 0;
433 	    } else if (*cs & OPT) {
434 
435 	       /* the match is optional */
436 	       matched = 1;	/* indicate a successful match */
437 	    } else {
438 
439 	       /* no match: error return */
440 	       return NULL;
441 	    }
442 	    cs = OPTR(cs);
443 	    break;
444 	 }
445 	 break;
446 
447 	 /* try to match a metasymbol */
448       case META:
449 	 switch (MSYM(cs)) {
450 
451 	    /* try to match anything and remember what was matched */
452 	 case 'p':
453 	    /*
454 	     *  This is really the same as trying the match the
455 	     *  remaining parts of the expression to any subset
456 	     *  of the string.
457 	     */
458 	    s1 = s;
459 	    do {
460 	       ptr = expmatch(s1, MNEXT(cs), strtptr, mstring);
461 	       if (ptr != NULL && s1 != s) {
462 
463 		  /* we have a match: remember the match */
464 		  if (mstring) {
465 		     strncpy(mstring, s, (size_t)(s1 - s));
466 		     mstring[(int)(s1 - s)] = '\0';
467 		  }
468 		  return ptr;
469 
470 	       } else if (ptr != NULL && (*cs & OPT)) {
471 
472 		  /* %|\p|% was optional, so no match is ok */
473 		  return ptr;
474 
475 	       } else if (ptr != NULL) {
476 
477 		  /* not optional and we still matched */
478 		  return NULL;
479 	       }
480 	       if (!isalnum(*s1) && !strchr(l_id, *s1))
481 		  return NULL;
482 	       if (*s1 == '\\')
483 		  _escaped = _escaped ? FALSE : TRUE;
484 	       else
485 		  _escaped = FALSE;
486 	    } while (*s1++);
487 	    return NULL;
488 
489 	    /* try to match anything */
490 	 case 'a':
491 	    /*
492 	     *  This is really the same as trying the match the
493 	     *  remaining parts of the expression to any subset
494 	     *  of the string.
495 	     */
496 	    s1 = s;
497 	    do {
498 	       /*
499 		* Hack for an important special case: if the next thing
500 		* in the pattern is a string, just gobble characters until
501 		* we find something that matches that string (this saves
502 		* the cost of a recursive call on @expmatch()@ while scanning
503 		* for the start of comments or strings).  Since many
504 		* patterns end with a string, we also treat that as a
505 		* special case.
506 		*/
507 	       if(*(ptr = MNEXT(cs)) == STR) {
508 		  c = *SSTR(ptr);
509 		  while(*s1 && *s1 != c)
510 		     s1++;
511 
512 		  if (*s1 == 0)
513 		     return NULL;
514 
515 		  if (SNEXT(ptr) == 0 && (s1 != s || *cs & OPT)) {
516 		     /* next item is a string, it's the last item and
517 		      * the %|\a|% match is ok --- just loop to try & match
518 		      * the string.
519 		      */
520 		     if (strtptr)
521 			*strtptr = s1;
522 
523 		     cs = ptr;
524 		     s = s1;
525 		     break;
526 		  }
527 	       }
528 	       ptr = expmatch(s1, MNEXT(cs), strtptr, mstring);
529 	       if (ptr != NULL && (s1 != s || *cs & OPT)) {
530 
531 		  /* we have a match */
532 		  if (strtptr)
533 		     *strtptr = s1;
534 
535 		  return ptr;
536 
537 	       } else if (ptr != NULL) {
538 
539 		  /* not optional and we still matched */
540 		  return NULL;
541 	       }
542 	       if (*s1 == '\\')
543 		  _escaped = _escaped ? FALSE : TRUE;
544 	       else
545 		  _escaped = FALSE;
546 	    } while (*s1++);
547 	    return NULL;
548 
549 	    /* fail if we are currently @_escaped@ */
550 	 case 'e':
551 	    if (_escaped)
552 	       return NULL;
553 	    cs = MNEXT(cs);
554 	    break;
555 
556 	    /* match any number of tabs and spaces */
557 	 case 'd':
558 	    ptr = s;
559 	    while (*s == ' ' || *s == '\t')
560 	       s++;
561 
562 	    if (s != ptr || s == _sstart) {
563 
564 	       /* match: be happy */
565 	       matched = 1;
566 	       cs = MNEXT(cs);
567 	    } else if (*s == '\n' || *s == '\0') {
568 
569 	       /* match: be happy */
570 	       matched = 1;
571 	       cs = MNEXT(cs);
572 	    } else if (*cs & ALT) {
573 
574 	       /* try the next part */
575 	       matched = 0;
576 	       cs = MNEXT(cs);
577 	    } else if (*cs & OPT) {
578 
579 	       /* doesn't matter */
580 	       matched = 1;
581 	       cs = MNEXT(cs);
582 	    } else
583 
584 	       /* no match: error return */
585 	       return NULL;
586 
587 	    break;
588 
589 	    /* check for end of line */
590 	 case '$':
591 	    if (*s == '\0' || *s == '\n') {
592 
593 	       /* match: be happy */
594 	       /* s++; was here
595 		* removed; now $ is left in stream to be matched again, which
596 		* is not bad (EOL can end several things at once. Also it
597 		* leaves things like CE on the same line.
598 		*/
599 	       matched = 1;
600 	       cs = MNEXT(cs);
601 	    } else if (*cs & ALT) {
602 
603 	       /* try the next part */
604 	       matched = 0;
605 	       cs = MNEXT(cs);
606 	    } else if (*cs & OPT) {
607 
608 	       /* doesn't matter */
609 	       matched = 1;
610 	       cs = MNEXT(cs);
611 	    } else
612 
613 	       /* no match: error return */
614 	       return NULL;
615 	    break;
616 
617 	    /* check for start of line */
618 	 case '^':
619 	    if (s == _sstart) {
620 
621 	       /* match: be happy */
622 	       matched = 1;
623 	       cs = MNEXT(cs);
624 	    } else if (*cs & ALT) {
625 
626 	       /* try the next part */
627 	       matched = 0;
628 	       cs = MNEXT(cs);
629 	    } else if (*cs & OPT) {
630 
631 	       /* doesn't matter */
632 	       matched = 1;
633 	       cs = MNEXT(cs);
634 	    } else
635 
636 	       /* no match: error return */
637 	       return NULL;
638 	    break;
639 
640 	    /* end of a subexpression: return success */
641 	 case ')':
642 	    return s;
643 	 }
644 	 break;
645       }
646    }
647    return s;
648 }
649