xref: /netbsd/usr.bin/vgrind/regexp.c (revision 6550d01e)
1 /*	$NetBSD: regexp.c,v 1.11 2008/07/21 14:19:27 lukem Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 #ifndef lint
35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\
36  The Regents of the University of California.  All rights reserved.");
37 #endif /* not lint */
38 
39 #ifndef lint
40 #if 0
41 static char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
42 #endif
43 __RCSID("$NetBSD: regexp.c,v 1.11 2008/07/21 14:19:27 lukem Exp $");
44 #endif /* not lint */
45 
46 #include <assert.h>
47 #include <ctype.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include "extern.h"
51 
52 #define FALSE	0
53 #define TRUE	!(FALSE)
54 #define NIL	0
55 
56 static void	expconv __P((void));
57 
58 boolean	 x_escaped;	/* true if we are currently x_escaped */
59 char	*x_start;	/* start of string */
60 boolean	 l_onecase;	/* true if upper and lower equivalent */
61 
62 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
63 
64 /*  STRNCMP -	like strncmp except that we convert the
65  *	 	first string to lower case before comparing
66  *		if l_onecase is set.
67  */
68 
69 int
70 STRNCMP(s1, s2, len)
71 	char *s1,*s2;
72 	int len;
73 {
74 	if (l_onecase) {
75 	    do
76 		if (*s2 - makelower(*s1))
77 			return (*s2 - makelower(*s1));
78 		else {
79 			s2++;
80 			s1++;
81 		}
82 	    while (--len);
83 	} else {
84 	    do
85 		if (*s2 - *s1)
86 			return (*s2 - *s1);
87 		else {
88 			s2++;
89 			s1++;
90 		}
91 	    while (--len);
92 	}
93 	return(0);
94 }
95 
96 /*	The following routine converts an irregular expression to
97  *	internal format.
98  *
99  *	Either meta symbols (\a \d or \p) or character strings or
100  *	operations ( alternation or perenthesizing ) can be
101  *	specified.  Each starts with a descriptor byte.  The descriptor
102  *	byte has STR set for strings, META set for meta symbols
103  *	and OPER set for operations.
104  *	The descriptor byte can also have the OPT bit set if the object
105  *	defined is optional.  Also ALT can be set to indicate an alternation.
106  *
107  *	For metasymbols the byte following the descriptor byte identities
108  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
109  *	strings the byte after the descriptor is a character count for
110  *	the string:
111  *
112  *		meta symbols := descriptor
113  *				symbol
114  *
115  *		strings :=	descriptor
116  *				character count
117  *				the string
118  *
119  *		operatins :=	descriptor
120  *				symbol
121  *				character count
122  */
123 
124 /*
125  *  handy macros for accessing parts of match blocks
126  */
127 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
128 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
129 
130 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
131 #define OCNT(A) (*(A+2))	/* character count */
132 #define ONEXT(A) (A+3)		/* next character after the operation */
133 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
134 
135 #define SCNT(A) (*(A+1))	/* byte count of a string */
136 #define SSTR(A) (A+2)		/* address of the string */
137 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
138 
139 /*
140  *  bit flags in the descriptor
141  */
142 #define OPT 1
143 #define STR 2
144 #define META 4
145 #define ALT 8
146 #define OPER 16
147 
148 static char *ccre;	/* pointer to current position in converted exp*/
149 static char *ure;	/* pointer current position in unconverted exp */
150 
151 char *
152 convexp(re)
153     char *re;		/* unconverted irregular expression */
154 {
155     char *cre;		/* pointer to converted regular expression */
156 
157     /* allocate room for the converted expression */
158     if (re == NIL)
159 	return (NIL);
160     if (*re == '\0')
161 	return (NIL);
162     cre = malloc(4 * strlen(re) + 3);
163     ccre = cre;
164     ure = re;
165 
166     /* start the conversion with a \a */
167     *cre = META | OPT;
168     MSYM(cre) = 'a';
169     ccre = MNEXT(cre);
170 
171     /* start the conversion (its recursive) */
172     expconv();
173     *ccre = 0;
174     return (cre);
175 }
176 
177 static void
178 expconv()
179 {
180     char *cs;		/* pointer to current symbol in converted exp */
181     char c;		/* character being processed */
182     char *acs;		/* pinter to last alternate */
183     int temp;
184 
185     /* let the conversion begin */
186     acs = NIL;
187     cs = NIL;
188     while (*ure != NIL) {
189 	switch (c = *ure++) {
190 
191 	case '\\':
192 	    switch (c = *ure++) {
193 
194 	    /* escaped characters are just characters */
195 	    default:
196 		if (cs == NIL || (*cs & STR) == 0) {
197 		    cs = ccre;
198 		    *cs = STR;
199 		    SCNT(cs) = 1;
200 		    ccre += 2;
201 		} else
202 		    SCNT(cs)++;
203 		*ccre++ = c;
204 		break;
205 
206 	    /* normal(?) metacharacters */
207 	    case 'a':
208 	    case 'd':
209 	    case 'e':
210 	    case 'p':
211 		if (acs != NIL && acs != cs) {
212 		    do {
213 			temp = OCNT(acs);
214 			OCNT(acs) = ccre - acs;
215 			acs -= temp;
216 		    } while (temp != 0);
217 		    acs = NIL;
218 		}
219 		cs = ccre;
220 		*cs = META;
221 		MSYM(cs) = c;
222 		ccre = MNEXT(cs);
223 		break;
224 	    }
225 	    break;
226 
227 	/* just put the symbol in */
228 	case '^':
229 	case '$':
230 	    if (acs != NIL && acs != cs) {
231 		do {
232 		    temp = OCNT(acs);
233 		    OCNT(acs) = ccre - acs;
234 		    acs -= temp;
235 		} while (temp != 0);
236 		acs = NIL;
237 	    }
238 	    cs = ccre;
239 	    *cs = META;
240 	    MSYM(cs) = c;
241 	    ccre = MNEXT(cs);
242 	    break;
243 
244 	/* mark the last match sequence as optional */
245 	case '?':
246 	    if (cs)
247 	    	*cs = *cs | OPT;
248 	    break;
249 
250 	/* recurse and define a subexpression */
251 	case '(':
252 	    if (acs != NIL && acs != cs) {
253 		do {
254 		    temp = OCNT(acs);
255 		    OCNT(acs) = ccre - acs;
256 		    acs -= temp;
257 		} while (temp != 0);
258 		acs = NIL;
259 	    }
260 	    cs = ccre;
261 	    *cs = OPER;
262 	    OSYM(cs) = '(';
263 	    ccre = ONEXT(cs);
264 	    expconv();
265 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
266 	    break;
267 
268 	/* reurn from a recursion */
269 	case ')':
270 	    if (acs != NIL) {
271 		do {
272 		    temp = OCNT(acs);
273 		    OCNT(acs) = ccre - acs;
274 		    acs -= temp;
275 		} while (temp != 0);
276 		acs = NIL;
277 	    }
278 	    cs = ccre;
279 	    *cs = META;
280 	    MSYM(cs) = c;
281 	    ccre = MNEXT(cs);
282 	    return;
283 
284 	/* mark the last match sequence as having an alternate */
285 	/* the third byte will contain an offset to jump over the */
286 	/* alternate match in case the first did not fail */
287 	case '|':
288 	    if (acs != NIL && acs != cs)
289 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
290 	    else
291 		OCNT(ccre) = 0;
292 	    assert(cs != NULL);
293 	    *cs |= ALT;
294 	    cs = ccre;
295 	    *cs = OPER;
296 	    OSYM(cs) = '|';
297 	    ccre = ONEXT(cs);
298 	    acs = cs;	/* remember that the pointer is to be filles */
299 	    break;
300 
301 	/* if its not a metasymbol just build a scharacter string */
302 	default:
303 	    if (cs == NIL || (*cs & STR) == 0) {
304 		cs = ccre;
305 		*cs = STR;
306 		SCNT(cs) = 1;
307 		ccre = SSTR(cs);
308 	    } else
309 		SCNT(cs)++;
310 	    *ccre++ = c;
311 	    break;
312 	}
313     }
314     if (acs != NIL) {
315 	do {
316 	    temp = OCNT(acs);
317 	    OCNT(acs) = ccre - acs;
318 	    acs -= temp;
319 	} while (temp != 0);
320 	acs = NIL;
321     }
322     return;
323 }
324 /* end of convertre */
325 
326 
327 /*
328  *	The following routine recognises an irregular expresion
329  *	with the following special characters:
330  *
331  *		\?	-	means last match was optional
332  *		\a	-	matches any number of characters
333  *		\d	-	matches any number of spaces and tabs
334  *		\p	-	matches any number of alphanumeric
335  *				characters. The
336  *				characters matched will be copied into
337  *				the area pointed to by 'name'.
338  *		\|	-	alternation
339  *		\( \)	-	grouping used mostly for alternation and
340  *				optionality
341  *
342  *	The irregular expression must be translated to internal form
343  *	prior to calling this routine
344  *
345  *	The value returned is the pointer to the first non \a
346  *	character matched.
347  */
348 
349 char *
350 expmatch(s, re, mstring)
351     char *s;		/* string to check for a match in */
352     char *re;		/* a converted irregular expression */
353     char *mstring;	/* where to put whatever matches a \p */
354 {
355     char *cs;		/* the current symbol */
356     char *ptr,*s1;	/* temporary pointer */
357     boolean matched;	/* a temporary boolean */
358 
359     /* initial conditions */
360     if (re == NIL)
361 	return (NIL);
362     cs = re;
363     matched = FALSE;
364 
365     /* loop till expression string is exhausted (or at least pretty tired) */
366     while (*cs) {
367 	switch (*cs & (OPER | STR | META)) {
368 
369 	/* try to match a string */
370 	case STR:
371 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
372 	    if (matched) {
373 
374 		/* hoorah it matches */
375 		s += SCNT(cs);
376 		cs = SNEXT(cs);
377 	    } else if (*cs & ALT) {
378 
379 		/* alternation, skip to next expression */
380 		cs = SNEXT(cs);
381 	    } else if (*cs & OPT) {
382 
383 		/* the match is optional */
384 		cs = SNEXT(cs);
385 		matched = 1;		/* indicate a successful match */
386 	    } else {
387 
388 		/* no match, error return */
389 		return (NIL);
390 	    }
391 	    break;
392 
393 	/* an operator, do something fancy */
394 	case OPER:
395 	    switch (OSYM(cs)) {
396 
397 	    /* this is an alternation */
398 	    case '|':
399 		if (matched)
400 
401 		    /* last thing in the alternation was a match, skip ahead */
402 		    cs = OPTR(cs);
403 		else
404 
405 		    /* no match, keep trying */
406 		    cs = ONEXT(cs);
407 		break;
408 
409 	    /* this is a grouping, recurse */
410 	    case '(':
411 		ptr = expmatch(s, ONEXT(cs), mstring);
412 		if (ptr != NIL) {
413 
414 		    /* the subexpression matched */
415 		    matched = 1;
416 		    s = ptr;
417 		} else if (*cs & ALT) {
418 
419 		    /* alternation, skip to next expression */
420 		    matched = 0;
421 		} else if (*cs & OPT) {
422 
423 		    /* the match is optional */
424 		    matched = 1;	/* indicate a successful match */
425 		} else {
426 
427 		    /* no match, error return */
428 		    return (NIL);
429 		}
430 		cs = OPTR(cs);
431 		break;
432 	    }
433 	    break;
434 
435 	/* try to match a metasymbol */
436 	case META:
437 	    switch (MSYM(cs)) {
438 
439 	    /* try to match anything and remember what was matched */
440 	    case 'p':
441 		/*
442 		 *  This is really the same as trying the match the
443 		 *  remaining parts of the expression to any subset
444 		 *  of the string.
445 		 */
446 		s1 = s;
447 		do {
448 		    ptr = expmatch(s1, MNEXT(cs), mstring);
449 		    if (ptr != NIL && s1 != s) {
450 
451 			/* we have a match, remember the match */
452 			strncpy(mstring, s, s1 - s);
453 			mstring[s1 - s] = '\0';
454 			return (ptr);
455 		    } else if (ptr != NIL && (*cs & OPT)) {
456 
457 			/* it was aoptional so no match is ok */
458 			return (ptr);
459 		    } else if (ptr != NIL) {
460 
461 			/* not optional and we still matched */
462 			return (NIL);
463 		    }
464 		    if (!isalnum((unsigned char)*s1) && *s1 != '_')
465 			return (NIL);
466 		    if (*s1 == '\\')
467 			x_escaped = x_escaped ? FALSE : TRUE;
468 		    else
469 			x_escaped = FALSE;
470 		} while (*s1++);
471 		return (NIL);
472 
473 	    /* try to match anything */
474 	    case 'a':
475 		/*
476 		 *  This is really the same as trying the match the
477 		 *  remaining parts of the expression to any subset
478 		 *  of the string.
479 		 */
480 		s1 = s;
481 		do {
482 		    ptr = expmatch(s1, MNEXT(cs), mstring);
483 		    if (ptr != NIL && s1 != s) {
484 
485 			/* we have a match */
486 			return (ptr);
487 		    } else if (ptr != NIL && (*cs & OPT)) {
488 
489 			/* it was aoptional so no match is ok */
490 			return (ptr);
491 		    } else if (ptr != NIL) {
492 
493 			/* not optional and we still matched */
494 			return (NIL);
495 		    }
496 		    if (*s1 == '\\')
497 			x_escaped = x_escaped ? FALSE : TRUE;
498 		    else
499 			x_escaped = FALSE;
500 		} while (*s1++);
501 		return (NIL);
502 
503 	    /* fail if we are currently x_escaped */
504 	    case 'e':
505 		if (x_escaped)
506 		    return(NIL);
507 		cs = MNEXT(cs);
508 		break;
509 
510 	    /* match any number of tabs and spaces */
511 	    case 'd':
512 		ptr = s;
513 		while (*s == ' ' || *s == '\t')
514 		    s++;
515 		if (s != ptr || s == x_start) {
516 
517 		    /* match, be happy */
518 		    matched = 1;
519 		    cs = MNEXT(cs);
520 		} else if (*s == '\n' || *s == '\0') {
521 
522 		    /* match, be happy */
523 		    matched = 1;
524 		    cs = MNEXT(cs);
525 		} else if (*cs & ALT) {
526 
527 		    /* try the next part */
528 		    matched = 0;
529 		    cs = MNEXT(cs);
530 		} else if (*cs & OPT) {
531 
532 		    /* doesn't matter */
533 		    matched = 1;
534 		    cs = MNEXT(cs);
535 		} else
536 
537 		    /* no match, error return */
538 		    return (NIL);
539 		break;
540 
541 	    /* check for end of line */
542 	    case '$':
543 		if (*s == '\0' || *s == '\n') {
544 
545 		    /* match, be happy */
546 		    s++;
547 		    matched = 1;
548 		    cs = MNEXT(cs);
549 		} else if (*cs & ALT) {
550 
551 		    /* try the next part */
552 		    matched = 0;
553 		    cs = MNEXT(cs);
554 		} else if (*cs & OPT) {
555 
556 		    /* doesn't matter */
557 		    matched = 1;
558 		    cs = MNEXT(cs);
559 		} else
560 
561 		    /* no match, error return */
562 		    return (NIL);
563 		break;
564 
565 	    /* check for start of line */
566 	    case '^':
567 		if (s == x_start) {
568 
569 		    /* match, be happy */
570 		    matched = 1;
571 		    cs = MNEXT(cs);
572 		} else if (*cs & ALT) {
573 
574 		    /* try the next part */
575 		    matched = 0;
576 		    cs = MNEXT(cs);
577 		} else if (*cs & OPT) {
578 
579 		    /* doesn't matter */
580 		    matched = 1;
581 		    cs = MNEXT(cs);
582 		} else
583 
584 		    /* no match, error return */
585 		    return (NIL);
586 		break;
587 
588 	    /* end of a subexpression, return success */
589 	    case ')':
590 		return (s);
591 	    }
592 	    break;
593 	}
594     }
595     return (s);
596 }
597