xref: /original-bsd/usr.bin/vgrind/regexp.c (revision 860e07fc)
1 /*
2  * Copyright (c) 1980 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 #ifndef lint
35 static char sccsid[] = "@(#)regexp.c	5.4 (Berkeley) 8/3/92";
36 #endif /* not lint */
37 
38 #include <ctype.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include "extern.h"
42 
43 #define FALSE	0
44 #define TRUE	!(FALSE)
45 #define NIL	0
46 
47 static void	expconv __P((void));
48 
49 boolean	 _escaped;	/* true if we are currently _escaped */
50 char	*_start;	/* start of string */
51 boolean	 l_onecase;	/* true if upper and lower equivalent */
52 
53 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
54 
55 /*  STRNCMP -	like strncmp except that we convert the
56  *	 	first string to lower case before comparing
57  *		if l_onecase is set.
58  */
59 
60 int
61 STRNCMP(s1, s2, len)
62 	register char *s1,*s2;
63 	register int len;
64 {
65 	if (l_onecase) {
66 	    do
67 		if (*s2 - makelower(*s1))
68 			return (*s2 - makelower(*s1));
69 		else {
70 			s2++;
71 			s1++;
72 		}
73 	    while (--len);
74 	} else {
75 	    do
76 		if (*s2 - *s1)
77 			return (*s2 - *s1);
78 		else {
79 			s2++;
80 			s1++;
81 		}
82 	    while (--len);
83 	}
84 	return(0);
85 }
86 
87 /*	The following routine converts an irregular expression to
88  *	internal format.
89  *
90  *	Either meta symbols (\a \d or \p) or character strings or
91  *	operations ( alternation or perenthesizing ) can be
92  *	specified.  Each starts with a descriptor byte.  The descriptor
93  *	byte has STR set for strings, META set for meta symbols
94  *	and OPER set for operations.
95  *	The descriptor byte can also have the OPT bit set if the object
96  *	defined is optional.  Also ALT can be set to indicate an alternation.
97  *
98  *	For metasymbols the byte following the descriptor byte identities
99  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
100  *	strings the byte after the descriptor is a character count for
101  *	the string:
102  *
103  *		meta symbols := descriptor
104  *				symbol
105  *
106  *		strings :=	descriptor
107  *				character count
108  *				the string
109  *
110  *		operatins :=	descriptor
111  *				symbol
112  *				character count
113  */
114 
115 /*
116  *  handy macros for accessing parts of match blocks
117  */
118 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
119 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
120 
121 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
122 #define OCNT(A) (*(A+2))	/* character count */
123 #define ONEXT(A) (A+3)		/* next character after the operation */
124 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
125 
126 #define SCNT(A) (*(A+1))	/* byte count of a string */
127 #define SSTR(A) (A+2)		/* address of the string */
128 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
129 
130 /*
131  *  bit flags in the descriptor
132  */
133 #define OPT 1
134 #define STR 2
135 #define META 4
136 #define ALT 8
137 #define OPER 16
138 
139 static char *ccre;	/* pointer to current position in converted exp*/
140 static char *ure;	/* pointer current position in unconverted exp */
141 
142 char *
143 convexp(re)
144     char *re;		/* unconverted irregular expression */
145 {
146     register char *cre;		/* pointer to converted regular expression */
147 
148     /* allocate room for the converted expression */
149     if (re == NIL)
150 	return (NIL);
151     if (*re == '\0')
152 	return (NIL);
153     cre = malloc (4 * strlen(re) + 3);
154     ccre = cre;
155     ure = re;
156 
157     /* start the conversion with a \a */
158     *cre = META | OPT;
159     MSYM(cre) = 'a';
160     ccre = MNEXT(cre);
161 
162     /* start the conversion (its recursive) */
163     expconv ();
164     *ccre = 0;
165     return (cre);
166 }
167 
168 static void
169 expconv()
170 {
171     register char *cs;		/* pointer to current symbol in converted exp */
172     register char c;		/* character being processed */
173     register char *acs;		/* pinter to last alternate */
174     register int temp;
175 
176     /* let the conversion begin */
177     acs = NIL;
178     cs = NIL;
179     while (*ure != NIL) {
180 	switch (c = *ure++) {
181 
182 	case '\\':
183 	    switch (c = *ure++) {
184 
185 	    /* escaped characters are just characters */
186 	    default:
187 		if (cs == NIL || (*cs & STR) == 0) {
188 		    cs = ccre;
189 		    *cs = STR;
190 		    SCNT(cs) = 1;
191 		    ccre += 2;
192 		} else
193 		    SCNT(cs)++;
194 		*ccre++ = c;
195 		break;
196 
197 	    /* normal(?) metacharacters */
198 	    case 'a':
199 	    case 'd':
200 	    case 'e':
201 	    case 'p':
202 		if (acs != NIL && acs != cs) {
203 		    do {
204 			temp = OCNT(acs);
205 			OCNT(acs) = ccre - acs;
206 			acs -= temp;
207 		    } while (temp != 0);
208 		    acs = NIL;
209 		}
210 		cs = ccre;
211 		*cs = META;
212 		MSYM(cs) = c;
213 		ccre = MNEXT(cs);
214 		break;
215 	    }
216 	    break;
217 
218 	/* just put the symbol in */
219 	case '^':
220 	case '$':
221 	    if (acs != NIL && acs != cs) {
222 		do {
223 		    temp = OCNT(acs);
224 		    OCNT(acs) = ccre - acs;
225 		    acs -= temp;
226 		} while (temp != 0);
227 		acs = NIL;
228 	    }
229 	    cs = ccre;
230 	    *cs = META;
231 	    MSYM(cs) = c;
232 	    ccre = MNEXT(cs);
233 	    break;
234 
235 	/* mark the last match sequence as optional */
236 	case '?':
237 	    if (cs)
238 	    	*cs = *cs | OPT;
239 	    break;
240 
241 	/* recurse and define a subexpression */
242 	case '(':
243 	    if (acs != NIL && acs != cs) {
244 		do {
245 		    temp = OCNT(acs);
246 		    OCNT(acs) = ccre - acs;
247 		    acs -= temp;
248 		} while (temp != 0);
249 		acs = NIL;
250 	    }
251 	    cs = ccre;
252 	    *cs = OPER;
253 	    OSYM(cs) = '(';
254 	    ccre = ONEXT(cs);
255 	    expconv ();
256 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
257 	    break;
258 
259 	/* reurn from a recursion */
260 	case ')':
261 	    if (acs != NIL) {
262 		do {
263 		    temp = OCNT(acs);
264 		    OCNT(acs) = ccre - acs;
265 		    acs -= temp;
266 		} while (temp != 0);
267 		acs = NIL;
268 	    }
269 	    cs = ccre;
270 	    *cs = META;
271 	    MSYM(cs) = c;
272 	    ccre = MNEXT(cs);
273 	    return;
274 
275 	/* mark the last match sequence as having an alternate */
276 	/* the third byte will contain an offset to jump over the */
277 	/* alternate match in case the first did not fail */
278 	case '|':
279 	    if (acs != NIL && acs != cs)
280 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
281 	    else
282 		OCNT(ccre) = 0;
283 	    *cs |= ALT;
284 	    cs = ccre;
285 	    *cs = OPER;
286 	    OSYM(cs) = '|';
287 	    ccre = ONEXT(cs);
288 	    acs = cs;	/* remember that the pointer is to be filles */
289 	    break;
290 
291 	/* if its not a metasymbol just build a scharacter string */
292 	default:
293 	    if (cs == NIL || (*cs & STR) == 0) {
294 		cs = ccre;
295 		*cs = STR;
296 		SCNT(cs) = 1;
297 		ccre = SSTR(cs);
298 	    } else
299 		SCNT(cs)++;
300 	    *ccre++ = c;
301 	    break;
302 	}
303     }
304     if (acs != NIL) {
305 	do {
306 	    temp = OCNT(acs);
307 	    OCNT(acs) = ccre - acs;
308 	    acs -= temp;
309 	} while (temp != 0);
310 	acs = NIL;
311     }
312     return;
313 }
314 /* end of convertre */
315 
316 
317 /*
318  *	The following routine recognises an irregular expresion
319  *	with the following special characters:
320  *
321  *		\?	-	means last match was optional
322  *		\a	-	matches any number of characters
323  *		\d	-	matches any number of spaces and tabs
324  *		\p	-	matches any number of alphanumeric
325  *				characters. The
326  *				characters matched will be copied into
327  *				the area pointed to by 'name'.
328  *		\|	-	alternation
329  *		\( \)	-	grouping used mostly for alternation and
330  *				optionality
331  *
332  *	The irregular expression must be translated to internal form
333  *	prior to calling this routine
334  *
335  *	The value returned is the pointer to the first non \a
336  *	character matched.
337  */
338 
339 char *
340 expmatch (s, re, mstring)
341     register char *s;		/* string to check for a match in */
342     register char *re;		/* a converted irregular expression */
343     register char *mstring;	/* where to put whatever matches a \p */
344 {
345     register char *cs;		/* the current symbol */
346     register char *ptr,*s1;	/* temporary pointer */
347     boolean matched;		/* a temporary boolean */
348 
349     /* initial conditions */
350     if (re == NIL)
351 	return (NIL);
352     cs = re;
353     matched = FALSE;
354 
355     /* loop till expression string is exhausted (or at least pretty tired) */
356     while (*cs) {
357 	switch (*cs & (OPER | STR | META)) {
358 
359 	/* try to match a string */
360 	case STR:
361 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
362 	    if (matched) {
363 
364 		/* hoorah it matches */
365 		s += SCNT(cs);
366 		cs = SNEXT(cs);
367 	    } else if (*cs & ALT) {
368 
369 		/* alternation, skip to next expression */
370 		cs = SNEXT(cs);
371 	    } else if (*cs & OPT) {
372 
373 		/* the match is optional */
374 		cs = SNEXT(cs);
375 		matched = 1;		/* indicate a successful match */
376 	    } else {
377 
378 		/* no match, error return */
379 		return (NIL);
380 	    }
381 	    break;
382 
383 	/* an operator, do something fancy */
384 	case OPER:
385 	    switch (OSYM(cs)) {
386 
387 	    /* this is an alternation */
388 	    case '|':
389 		if (matched)
390 
391 		    /* last thing in the alternation was a match, skip ahead */
392 		    cs = OPTR(cs);
393 		else
394 
395 		    /* no match, keep trying */
396 		    cs = ONEXT(cs);
397 		break;
398 
399 	    /* this is a grouping, recurse */
400 	    case '(':
401 		ptr = expmatch (s, ONEXT(cs), mstring);
402 		if (ptr != NIL) {
403 
404 		    /* the subexpression matched */
405 		    matched = 1;
406 		    s = ptr;
407 		} else if (*cs & ALT) {
408 
409 		    /* alternation, skip to next expression */
410 		    matched = 0;
411 		} else if (*cs & OPT) {
412 
413 		    /* the match is optional */
414 		    matched = 1;	/* indicate a successful match */
415 		} else {
416 
417 		    /* no match, error return */
418 		    return (NIL);
419 		}
420 		cs = OPTR(cs);
421 		break;
422 	    }
423 	    break;
424 
425 	/* try to match a metasymbol */
426 	case META:
427 	    switch (MSYM(cs)) {
428 
429 	    /* try to match anything and remember what was matched */
430 	    case 'p':
431 		/*
432 		 *  This is really the same as trying the match the
433 		 *  remaining parts of the expression to any subset
434 		 *  of the string.
435 		 */
436 		s1 = s;
437 		do {
438 		    ptr = expmatch (s1, MNEXT(cs), mstring);
439 		    if (ptr != NIL && s1 != s) {
440 
441 			/* we have a match, remember the match */
442 			strncpy (mstring, s, s1 - s);
443 			mstring[s1 - s] = '\0';
444 			return (ptr);
445 		    } else if (ptr != NIL && (*cs & OPT)) {
446 
447 			/* it was aoptional so no match is ok */
448 			return (ptr);
449 		    } else if (ptr != NIL) {
450 
451 			/* not optional and we still matched */
452 			return (NIL);
453 		    }
454 		    if (!isalnum(*s1) && *s1 != '_')
455 			return (NIL);
456 		    if (*s1 == '\\')
457 			_escaped = _escaped ? FALSE : TRUE;
458 		    else
459 			_escaped = FALSE;
460 		} while (*s1++);
461 		return (NIL);
462 
463 	    /* try to match anything */
464 	    case 'a':
465 		/*
466 		 *  This is really the same as trying the match the
467 		 *  remaining parts of the expression to any subset
468 		 *  of the string.
469 		 */
470 		s1 = s;
471 		do {
472 		    ptr = expmatch (s1, MNEXT(cs), mstring);
473 		    if (ptr != NIL && s1 != s) {
474 
475 			/* we have a match */
476 			return (ptr);
477 		    } else if (ptr != NIL && (*cs & OPT)) {
478 
479 			/* it was aoptional so no match is ok */
480 			return (ptr);
481 		    } else if (ptr != NIL) {
482 
483 			/* not optional and we still matched */
484 			return (NIL);
485 		    }
486 		    if (*s1 == '\\')
487 			_escaped = _escaped ? FALSE : TRUE;
488 		    else
489 			_escaped = FALSE;
490 		} while (*s1++);
491 		return (NIL);
492 
493 	    /* fail if we are currently _escaped */
494 	    case 'e':
495 		if (_escaped)
496 		    return(NIL);
497 		cs = MNEXT(cs);
498 		break;
499 
500 	    /* match any number of tabs and spaces */
501 	    case 'd':
502 		ptr = s;
503 		while (*s == ' ' || *s == '\t')
504 		    s++;
505 		if (s != ptr || s == _start) {
506 
507 		    /* match, be happy */
508 		    matched = 1;
509 		    cs = MNEXT(cs);
510 		} else if (*s == '\n' || *s == '\0') {
511 
512 		    /* match, be happy */
513 		    matched = 1;
514 		    cs = MNEXT(cs);
515 		} else if (*cs & ALT) {
516 
517 		    /* try the next part */
518 		    matched = 0;
519 		    cs = MNEXT(cs);
520 		} else if (*cs & OPT) {
521 
522 		    /* doesn't matter */
523 		    matched = 1;
524 		    cs = MNEXT(cs);
525 		} else
526 
527 		    /* no match, error return */
528 		    return (NIL);
529 		break;
530 
531 	    /* check for end of line */
532 	    case '$':
533 		if (*s == '\0' || *s == '\n') {
534 
535 		    /* match, be happy */
536 		    s++;
537 		    matched = 1;
538 		    cs = MNEXT(cs);
539 		} else if (*cs & ALT) {
540 
541 		    /* try the next part */
542 		    matched = 0;
543 		    cs = MNEXT(cs);
544 		} else if (*cs & OPT) {
545 
546 		    /* doesn't matter */
547 		    matched = 1;
548 		    cs = MNEXT(cs);
549 		} else
550 
551 		    /* no match, error return */
552 		    return (NIL);
553 		break;
554 
555 	    /* check for start of line */
556 	    case '^':
557 		if (s == _start) {
558 
559 		    /* match, be happy */
560 		    matched = 1;
561 		    cs = MNEXT(cs);
562 		} else if (*cs & ALT) {
563 
564 		    /* try the next part */
565 		    matched = 0;
566 		    cs = MNEXT(cs);
567 		} else if (*cs & OPT) {
568 
569 		    /* doesn't matter */
570 		    matched = 1;
571 		    cs = MNEXT(cs);
572 		} else
573 
574 		    /* no match, error return */
575 		    return (NIL);
576 		break;
577 
578 	    /* end of a subexpression, return success */
579 	    case ')':
580 		return (s);
581 	    }
582 	    break;
583 	}
584     }
585     return (s);
586 }
587