xref: /freebsd/usr.bin/vgrind/regexp.c (revision 7bd6fde3)
1 /*
2  * Copyright (c) 1980, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. All advertising materials mentioning features or use of this software
15  *    must display the following acknowledgement:
16  *	This product includes software developed by the University of
17  *	California, Berkeley and its contributors.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/cdefs.h>
36 
37 __FBSDID("$FreeBSD$");
38 
39 #ifndef lint
40 static const char copyright[] =
41 "@(#) Copyright (c) 1980, 1993\n\
42 	The Regents of the University of California.  All rights reserved.\n";
43 #endif
44 
45 #ifndef lint
46 static const char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
47 #endif
48 
49 #include <ctype.h>
50 #include <stdlib.h>
51 #include <string.h>
52 
53 #include "extern.h"
54 
55 #define FALSE	0
56 #define TRUE	!(FALSE)
57 #define NIL	0
58 
59 static void	expconv(void);
60 
61 boolean	 _escaped;	/* true if we are currently _escaped */
62 char	*s_start;	/* start of string */
63 boolean	 l_onecase;	/* true if upper and lower equivalent */
64 
65 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
66 
67 /*  STRNCMP -	like strncmp except that we convert the
68  *	 	first string to lower case before comparing
69  *		if l_onecase is set.
70  */
71 
72 int
73 STRNCMP(s1, s2, len)
74 	register char *s1,*s2;
75 	register int len;
76 {
77 	if (l_onecase) {
78 	    do
79 		if (*s2 - makelower(*s1))
80 			return (*s2 - makelower(*s1));
81 		else {
82 			s2++;
83 			s1++;
84 		}
85 	    while (--len);
86 	} else {
87 	    do
88 		if (*s2 - *s1)
89 			return (*s2 - *s1);
90 		else {
91 			s2++;
92 			s1++;
93 		}
94 	    while (--len);
95 	}
96 	return(0);
97 }
98 
99 /*	The following routine converts an irregular expression to
100  *	internal format.
101  *
102  *	Either meta symbols (\a \d or \p) or character strings or
103  *	operations ( alternation or perenthesizing ) can be
104  *	specified.  Each starts with a descriptor byte.  The descriptor
105  *	byte has STR set for strings, META set for meta symbols
106  *	and OPER set for operations.
107  *	The descriptor byte can also have the OPT bit set if the object
108  *	defined is optional.  Also ALT can be set to indicate an alternation.
109  *
110  *	For metasymbols the byte following the descriptor byte identities
111  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
112  *	strings the byte after the descriptor is a character count for
113  *	the string:
114  *
115  *		meta symbols := descriptor
116  *				symbol
117  *
118  *		strings :=	descriptor
119  *				character count
120  *				the string
121  *
122  *		operatins :=	descriptor
123  *				symbol
124  *				character count
125  */
126 
127 /*
128  *  handy macros for accessing parts of match blocks
129  */
130 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
131 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
132 
133 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
134 #define OCNT(A) (*(A+2))	/* character count */
135 #define ONEXT(A) (A+3)		/* next character after the operation */
136 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
137 
138 #define SCNT(A) (*(A+1))	/* byte count of a string */
139 #define SSTR(A) (A+2)		/* address of the string */
140 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
141 
142 /*
143  *  bit flags in the descriptor
144  */
145 #define OPT 1
146 #define STR 2
147 #define META 4
148 #define ALT 8
149 #define OPER 16
150 
151 static char *ccre;	/* pointer to current position in converted exp*/
152 static char *ure;	/* pointer current position in unconverted exp */
153 
154 char *
155 convexp(re)
156     char *re;		/* unconverted irregular expression */
157 {
158     register char *cre;		/* pointer to converted regular expression */
159 
160     /* allocate room for the converted expression */
161     if (re == NIL)
162 	return (NIL);
163     if (*re == '\0')
164 	return (NIL);
165     cre = malloc (4 * strlen(re) + 3);
166     ccre = cre;
167     ure = re;
168 
169     /* start the conversion with a \a */
170     *cre = META | OPT;
171     MSYM(cre) = 'a';
172     ccre = MNEXT(cre);
173 
174     /* start the conversion (its recursive) */
175     expconv ();
176     *ccre = 0;
177     return (cre);
178 }
179 
180 static void
181 expconv()
182 {
183     register char *cs;		/* pointer to current symbol in converted exp */
184     register char c;		/* character being processed */
185     register char *acs;		/* pinter to last alternate */
186     register int temp;
187 
188     /* let the conversion begin */
189     acs = NIL;
190     cs = NIL;
191     while (*ure != NIL) {
192 	switch (c = *ure++) {
193 
194 	case '\\':
195 	    switch (c = *ure++) {
196 
197 	    /* escaped characters are just characters */
198 	    default:
199 		if (cs == NIL || (*cs & STR) == 0) {
200 		    cs = ccre;
201 		    *cs = STR;
202 		    SCNT(cs) = 1;
203 		    ccre += 2;
204 		} else
205 		    SCNT(cs)++;
206 		*ccre++ = c;
207 		break;
208 
209 	    /* normal(?) metacharacters */
210 	    case 'a':
211 	    case 'd':
212 	    case 'e':
213 	    case 'p':
214 		if (acs != NIL && acs != cs) {
215 		    do {
216 			temp = OCNT(acs);
217 			OCNT(acs) = ccre - acs;
218 			acs -= temp;
219 		    } while (temp != 0);
220 		    acs = NIL;
221 		}
222 		cs = ccre;
223 		*cs = META;
224 		MSYM(cs) = c;
225 		ccre = MNEXT(cs);
226 		break;
227 	    }
228 	    break;
229 
230 	/* just put the symbol in */
231 	case '^':
232 	case '$':
233 	    if (acs != NIL && acs != cs) {
234 		do {
235 		    temp = OCNT(acs);
236 		    OCNT(acs) = ccre - acs;
237 		    acs -= temp;
238 		} while (temp != 0);
239 		acs = NIL;
240 	    }
241 	    cs = ccre;
242 	    *cs = META;
243 	    MSYM(cs) = c;
244 	    ccre = MNEXT(cs);
245 	    break;
246 
247 	/* mark the last match sequence as optional */
248 	case '?':
249 	    if (cs)
250 	    	*cs = *cs | OPT;
251 	    break;
252 
253 	/* recurse and define a subexpression */
254 	case '(':
255 	    if (acs != NIL && acs != cs) {
256 		do {
257 		    temp = OCNT(acs);
258 		    OCNT(acs) = ccre - acs;
259 		    acs -= temp;
260 		} while (temp != 0);
261 		acs = NIL;
262 	    }
263 	    cs = ccre;
264 	    *cs = OPER;
265 	    OSYM(cs) = '(';
266 	    ccre = ONEXT(cs);
267 	    expconv ();
268 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
269 	    break;
270 
271 	/* return from a recursion */
272 	case ')':
273 	    if (acs != NIL) {
274 		do {
275 		    temp = OCNT(acs);
276 		    OCNT(acs) = ccre - acs;
277 		    acs -= temp;
278 		} while (temp != 0);
279 		acs = NIL;
280 	    }
281 	    cs = ccre;
282 	    *cs = META;
283 	    MSYM(cs) = c;
284 	    ccre = MNEXT(cs);
285 	    return;
286 
287 	/* mark the last match sequence as having an alternate */
288 	/* the third byte will contain an offset to jump over the */
289 	/* alternate match in case the first did not fail */
290 	case '|':
291 	    if (acs != NIL && acs != cs)
292 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
293 	    else
294 		OCNT(ccre) = 0;
295 	    *cs |= ALT;
296 	    cs = ccre;
297 	    *cs = OPER;
298 	    OSYM(cs) = '|';
299 	    ccre = ONEXT(cs);
300 	    acs = cs;	/* remember that the pointer is to be filles */
301 	    break;
302 
303 	/* if its not a metasymbol just build a scharacter string */
304 	default:
305 	    if (cs == NIL || (*cs & STR) == 0) {
306 		cs = ccre;
307 		*cs = STR;
308 		SCNT(cs) = 1;
309 		ccre = SSTR(cs);
310 	    } else
311 		SCNT(cs)++;
312 	    *ccre++ = c;
313 	    break;
314 	}
315     }
316     if (acs != NIL) {
317 	do {
318 	    temp = OCNT(acs);
319 	    OCNT(acs) = ccre - acs;
320 	    acs -= temp;
321 	} while (temp != 0);
322 	acs = NIL;
323     }
324     return;
325 }
326 /* end of convertre */
327 
328 
329 /*
330  *	The following routine recognises an irregular expresion
331  *	with the following special characters:
332  *
333  *		\?	-	means last match was optional
334  *		\a	-	matches any number of characters
335  *		\d	-	matches any number of spaces and tabs
336  *		\p	-	matches any number of alphanumeric
337  *				characters. The
338  *				characters matched will be copied into
339  *				the area pointed to by 'name'.
340  *		\|	-	alternation
341  *		\( \)	-	grouping used mostly for alternation and
342  *				optionality
343  *
344  *	The irregular expression must be translated to internal form
345  *	prior to calling this routine
346  *
347  *	The value returned is the pointer to the first non \a
348  *	character matched.
349  */
350 
351 char *
352 expmatch (s, re, mstring)
353     register char *s;		/* string to check for a match in */
354     register char *re;		/* a converted irregular expression */
355     register char *mstring;	/* where to put whatever matches a \p */
356 {
357     register char *cs;		/* the current symbol */
358     register char *ptr,*s1;	/* temporary pointer */
359     boolean matched;		/* a temporary boolean */
360 
361     /* initial conditions */
362     if (re == NIL)
363 	return (NIL);
364     cs = re;
365     matched = FALSE;
366 
367     /* loop till expression string is exhausted (or at least pretty tired) */
368     while (*cs) {
369 	switch (*cs & (OPER | STR | META)) {
370 
371 	/* try to match a string */
372 	case STR:
373 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
374 	    if (matched) {
375 
376 		/* hoorah it matches */
377 		s += SCNT(cs);
378 		cs = SNEXT(cs);
379 	    } else if (*cs & ALT) {
380 
381 		/* alternation, skip to next expression */
382 		cs = SNEXT(cs);
383 	    } else if (*cs & OPT) {
384 
385 		/* the match is optional */
386 		cs = SNEXT(cs);
387 		matched = 1;		/* indicate a successful match */
388 	    } else {
389 
390 		/* no match, error return */
391 		return (NIL);
392 	    }
393 	    break;
394 
395 	/* an operator, do something fancy */
396 	case OPER:
397 	    switch (OSYM(cs)) {
398 
399 	    /* this is an alternation */
400 	    case '|':
401 		if (matched)
402 
403 		    /* last thing in the alternation was a match, skip ahead */
404 		    cs = OPTR(cs);
405 		else
406 
407 		    /* no match, keep trying */
408 		    cs = ONEXT(cs);
409 		break;
410 
411 	    /* this is a grouping, recurse */
412 	    case '(':
413 		ptr = expmatch (s, ONEXT(cs), mstring);
414 		if (ptr != NIL) {
415 
416 		    /* the subexpression matched */
417 		    matched = 1;
418 		    s = ptr;
419 		} else if (*cs & ALT) {
420 
421 		    /* alternation, skip to next expression */
422 		    matched = 0;
423 		} else if (*cs & OPT) {
424 
425 		    /* the match is optional */
426 		    matched = 1;	/* indicate a successful match */
427 		} else {
428 
429 		    /* no match, error return */
430 		    return (NIL);
431 		}
432 		cs = OPTR(cs);
433 		break;
434 	    }
435 	    break;
436 
437 	/* try to match a metasymbol */
438 	case META:
439 	    switch (MSYM(cs)) {
440 
441 	    /* try to match anything and remember what was matched */
442 	    case 'p':
443 		/*
444 		 *  This is really the same as trying the match the
445 		 *  remaining parts of the expression to any subset
446 		 *  of the string.
447 		 */
448 		s1 = s;
449 		do {
450 		    ptr = expmatch (s1, MNEXT(cs), mstring);
451 		    if (ptr != NIL && s1 != s) {
452 
453 			/* we have a match, remember the match */
454 			strncpy (mstring, s, s1 - s);
455 			mstring[s1 - s] = '\0';
456 			return (ptr);
457 		    } else if (ptr != NIL && (*cs & OPT)) {
458 
459 			/* it was aoptional so no match is ok */
460 			return (ptr);
461 		    } else if (ptr != NIL) {
462 
463 			/* not optional and we still matched */
464 			return (NIL);
465 		    }
466 		    if (!(isalnum(*s1) || *s1 == '_' ||
467 			  /* C++ destructor */
468 			  *s1 == '~' ||
469 			  /* C++ scope operator */
470 			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
471 			   (s1++, TRUE))))
472 			return (NIL);
473 		    if (*s1 == '\\')
474 			_escaped = _escaped ? FALSE : TRUE;
475 		    else
476 			_escaped = FALSE;
477 		} while (*s1++);
478 		return (NIL);
479 
480 	    /* try to match anything */
481 	    case 'a':
482 		/*
483 		 *  This is really the same as trying the match the
484 		 *  remaining parts of the expression to any subset
485 		 *  of the string.
486 		 */
487 		s1 = s;
488 		do {
489 		    ptr = expmatch (s1, MNEXT(cs), mstring);
490 		    if (ptr != NIL && s1 != s) {
491 
492 			/* we have a match */
493 			return (ptr);
494 		    } else if (ptr != NIL && (*cs & OPT)) {
495 
496 			/* it was aoptional so no match is ok */
497 			return (ptr);
498 		    } else if (ptr != NIL) {
499 
500 			/* not optional and we still matched */
501 			return (NIL);
502 		    }
503 		    if (*s1 == '\\')
504 			_escaped = _escaped ? FALSE : TRUE;
505 		    else
506 			_escaped = FALSE;
507 		} while (*s1++);
508 		return (NIL);
509 
510 	    /* fail if we are currently _escaped */
511 	    case 'e':
512 		if (_escaped)
513 		    return(NIL);
514 		cs = MNEXT(cs);
515 		break;
516 
517 	    /* match any number of tabs and spaces */
518 	    case 'd':
519 		ptr = s;
520 		while (*s == ' ' || *s == '\t')
521 		    s++;
522 		if (s != ptr || s == s_start) {
523 
524 		    /* match, be happy */
525 		    matched = 1;
526 		    cs = MNEXT(cs);
527 		} else if (*s == '\n' || *s == '\0') {
528 
529 		    /* match, be happy */
530 		    matched = 1;
531 		    cs = MNEXT(cs);
532 		} else if (*cs & ALT) {
533 
534 		    /* try the next part */
535 		    matched = 0;
536 		    cs = MNEXT(cs);
537 		} else if (*cs & OPT) {
538 
539 		    /* doesn't matter */
540 		    matched = 1;
541 		    cs = MNEXT(cs);
542 		} else
543 
544 		    /* no match, error return */
545 		    return (NIL);
546 		break;
547 
548 	    /* check for end of line */
549 	    case '$':
550 		if (*s == '\0' || *s == '\n') {
551 
552 		    /* match, be happy */
553 		    s++;
554 		    matched = 1;
555 		    cs = MNEXT(cs);
556 		} else if (*cs & ALT) {
557 
558 		    /* try the next part */
559 		    matched = 0;
560 		    cs = MNEXT(cs);
561 		} else if (*cs & OPT) {
562 
563 		    /* doesn't matter */
564 		    matched = 1;
565 		    cs = MNEXT(cs);
566 		} else
567 
568 		    /* no match, error return */
569 		    return (NIL);
570 		break;
571 
572 	    /* check for start of line */
573 	    case '^':
574 		if (s == s_start) {
575 
576 		    /* match, be happy */
577 		    matched = 1;
578 		    cs = MNEXT(cs);
579 		} else if (*cs & ALT) {
580 
581 		    /* try the next part */
582 		    matched = 0;
583 		    cs = MNEXT(cs);
584 		} else if (*cs & OPT) {
585 
586 		    /* doesn't matter */
587 		    matched = 1;
588 		    cs = MNEXT(cs);
589 		} else
590 
591 		    /* no match, error return */
592 		    return (NIL);
593 		break;
594 
595 	    /* end of a subexpression, return success */
596 	    case ')':
597 		return (s);
598 	    }
599 	    break;
600 	}
601     }
602     return (s);
603 }
604