xref: /original-bsd/usr.bin/vgrind/regexp.c (revision c3e32dec)
1 /*
2  * Copyright (c) 1980, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  *
6  * %sccs.include.redist.c%
7  */
8 
9 #ifndef lint
10 static char copyright[] =
11 "@(#) Copyright (c) 1980, 1993\n\
12 	The Regents of the University of California.  All rights reserved.\n";
13 #endif /* not lint */
14 
15 #ifndef lint
16 static char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 06/06/93";
17 #endif /* not lint */
18 
19 #include <ctype.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include "extern.h"
23 
24 #define FALSE	0
25 #define TRUE	!(FALSE)
26 #define NIL	0
27 
28 static void	expconv __P((void));
29 
30 boolean	 _escaped;	/* true if we are currently _escaped */
31 char	*_start;	/* start of string */
32 boolean	 l_onecase;	/* true if upper and lower equivalent */
33 
34 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
35 
36 /*  STRNCMP -	like strncmp except that we convert the
37  *	 	first string to lower case before comparing
38  *		if l_onecase is set.
39  */
40 
41 int
42 STRNCMP(s1, s2, len)
43 	register char *s1,*s2;
44 	register int len;
45 {
46 	if (l_onecase) {
47 	    do
48 		if (*s2 - makelower(*s1))
49 			return (*s2 - makelower(*s1));
50 		else {
51 			s2++;
52 			s1++;
53 		}
54 	    while (--len);
55 	} else {
56 	    do
57 		if (*s2 - *s1)
58 			return (*s2 - *s1);
59 		else {
60 			s2++;
61 			s1++;
62 		}
63 	    while (--len);
64 	}
65 	return(0);
66 }
67 
68 /*	The following routine converts an irregular expression to
69  *	internal format.
70  *
71  *	Either meta symbols (\a \d or \p) or character strings or
72  *	operations ( alternation or perenthesizing ) can be
73  *	specified.  Each starts with a descriptor byte.  The descriptor
74  *	byte has STR set for strings, META set for meta symbols
75  *	and OPER set for operations.
76  *	The descriptor byte can also have the OPT bit set if the object
77  *	defined is optional.  Also ALT can be set to indicate an alternation.
78  *
79  *	For metasymbols the byte following the descriptor byte identities
80  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
81  *	strings the byte after the descriptor is a character count for
82  *	the string:
83  *
84  *		meta symbols := descriptor
85  *				symbol
86  *
87  *		strings :=	descriptor
88  *				character count
89  *				the string
90  *
91  *		operatins :=	descriptor
92  *				symbol
93  *				character count
94  */
95 
96 /*
97  *  handy macros for accessing parts of match blocks
98  */
99 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
100 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
101 
102 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
103 #define OCNT(A) (*(A+2))	/* character count */
104 #define ONEXT(A) (A+3)		/* next character after the operation */
105 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
106 
107 #define SCNT(A) (*(A+1))	/* byte count of a string */
108 #define SSTR(A) (A+2)		/* address of the string */
109 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
110 
111 /*
112  *  bit flags in the descriptor
113  */
114 #define OPT 1
115 #define STR 2
116 #define META 4
117 #define ALT 8
118 #define OPER 16
119 
120 static char *ccre;	/* pointer to current position in converted exp*/
121 static char *ure;	/* pointer current position in unconverted exp */
122 
123 char *
124 convexp(re)
125     char *re;		/* unconverted irregular expression */
126 {
127     register char *cre;		/* pointer to converted regular expression */
128 
129     /* allocate room for the converted expression */
130     if (re == NIL)
131 	return (NIL);
132     if (*re == '\0')
133 	return (NIL);
134     cre = malloc (4 * strlen(re) + 3);
135     ccre = cre;
136     ure = re;
137 
138     /* start the conversion with a \a */
139     *cre = META | OPT;
140     MSYM(cre) = 'a';
141     ccre = MNEXT(cre);
142 
143     /* start the conversion (its recursive) */
144     expconv ();
145     *ccre = 0;
146     return (cre);
147 }
148 
149 static void
150 expconv()
151 {
152     register char *cs;		/* pointer to current symbol in converted exp */
153     register char c;		/* character being processed */
154     register char *acs;		/* pinter to last alternate */
155     register int temp;
156 
157     /* let the conversion begin */
158     acs = NIL;
159     cs = NIL;
160     while (*ure != NIL) {
161 	switch (c = *ure++) {
162 
163 	case '\\':
164 	    switch (c = *ure++) {
165 
166 	    /* escaped characters are just characters */
167 	    default:
168 		if (cs == NIL || (*cs & STR) == 0) {
169 		    cs = ccre;
170 		    *cs = STR;
171 		    SCNT(cs) = 1;
172 		    ccre += 2;
173 		} else
174 		    SCNT(cs)++;
175 		*ccre++ = c;
176 		break;
177 
178 	    /* normal(?) metacharacters */
179 	    case 'a':
180 	    case 'd':
181 	    case 'e':
182 	    case 'p':
183 		if (acs != NIL && acs != cs) {
184 		    do {
185 			temp = OCNT(acs);
186 			OCNT(acs) = ccre - acs;
187 			acs -= temp;
188 		    } while (temp != 0);
189 		    acs = NIL;
190 		}
191 		cs = ccre;
192 		*cs = META;
193 		MSYM(cs) = c;
194 		ccre = MNEXT(cs);
195 		break;
196 	    }
197 	    break;
198 
199 	/* just put the symbol in */
200 	case '^':
201 	case '$':
202 	    if (acs != NIL && acs != cs) {
203 		do {
204 		    temp = OCNT(acs);
205 		    OCNT(acs) = ccre - acs;
206 		    acs -= temp;
207 		} while (temp != 0);
208 		acs = NIL;
209 	    }
210 	    cs = ccre;
211 	    *cs = META;
212 	    MSYM(cs) = c;
213 	    ccre = MNEXT(cs);
214 	    break;
215 
216 	/* mark the last match sequence as optional */
217 	case '?':
218 	    if (cs)
219 	    	*cs = *cs | OPT;
220 	    break;
221 
222 	/* recurse and define a subexpression */
223 	case '(':
224 	    if (acs != NIL && acs != cs) {
225 		do {
226 		    temp = OCNT(acs);
227 		    OCNT(acs) = ccre - acs;
228 		    acs -= temp;
229 		} while (temp != 0);
230 		acs = NIL;
231 	    }
232 	    cs = ccre;
233 	    *cs = OPER;
234 	    OSYM(cs) = '(';
235 	    ccre = ONEXT(cs);
236 	    expconv ();
237 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
238 	    break;
239 
240 	/* reurn from a recursion */
241 	case ')':
242 	    if (acs != NIL) {
243 		do {
244 		    temp = OCNT(acs);
245 		    OCNT(acs) = ccre - acs;
246 		    acs -= temp;
247 		} while (temp != 0);
248 		acs = NIL;
249 	    }
250 	    cs = ccre;
251 	    *cs = META;
252 	    MSYM(cs) = c;
253 	    ccre = MNEXT(cs);
254 	    return;
255 
256 	/* mark the last match sequence as having an alternate */
257 	/* the third byte will contain an offset to jump over the */
258 	/* alternate match in case the first did not fail */
259 	case '|':
260 	    if (acs != NIL && acs != cs)
261 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
262 	    else
263 		OCNT(ccre) = 0;
264 	    *cs |= ALT;
265 	    cs = ccre;
266 	    *cs = OPER;
267 	    OSYM(cs) = '|';
268 	    ccre = ONEXT(cs);
269 	    acs = cs;	/* remember that the pointer is to be filles */
270 	    break;
271 
272 	/* if its not a metasymbol just build a scharacter string */
273 	default:
274 	    if (cs == NIL || (*cs & STR) == 0) {
275 		cs = ccre;
276 		*cs = STR;
277 		SCNT(cs) = 1;
278 		ccre = SSTR(cs);
279 	    } else
280 		SCNT(cs)++;
281 	    *ccre++ = c;
282 	    break;
283 	}
284     }
285     if (acs != NIL) {
286 	do {
287 	    temp = OCNT(acs);
288 	    OCNT(acs) = ccre - acs;
289 	    acs -= temp;
290 	} while (temp != 0);
291 	acs = NIL;
292     }
293     return;
294 }
295 /* end of convertre */
296 
297 
298 /*
299  *	The following routine recognises an irregular expresion
300  *	with the following special characters:
301  *
302  *		\?	-	means last match was optional
303  *		\a	-	matches any number of characters
304  *		\d	-	matches any number of spaces and tabs
305  *		\p	-	matches any number of alphanumeric
306  *				characters. The
307  *				characters matched will be copied into
308  *				the area pointed to by 'name'.
309  *		\|	-	alternation
310  *		\( \)	-	grouping used mostly for alternation and
311  *				optionality
312  *
313  *	The irregular expression must be translated to internal form
314  *	prior to calling this routine
315  *
316  *	The value returned is the pointer to the first non \a
317  *	character matched.
318  */
319 
320 char *
321 expmatch (s, re, mstring)
322     register char *s;		/* string to check for a match in */
323     register char *re;		/* a converted irregular expression */
324     register char *mstring;	/* where to put whatever matches a \p */
325 {
326     register char *cs;		/* the current symbol */
327     register char *ptr,*s1;	/* temporary pointer */
328     boolean matched;		/* a temporary boolean */
329 
330     /* initial conditions */
331     if (re == NIL)
332 	return (NIL);
333     cs = re;
334     matched = FALSE;
335 
336     /* loop till expression string is exhausted (or at least pretty tired) */
337     while (*cs) {
338 	switch (*cs & (OPER | STR | META)) {
339 
340 	/* try to match a string */
341 	case STR:
342 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
343 	    if (matched) {
344 
345 		/* hoorah it matches */
346 		s += SCNT(cs);
347 		cs = SNEXT(cs);
348 	    } else if (*cs & ALT) {
349 
350 		/* alternation, skip to next expression */
351 		cs = SNEXT(cs);
352 	    } else if (*cs & OPT) {
353 
354 		/* the match is optional */
355 		cs = SNEXT(cs);
356 		matched = 1;		/* indicate a successful match */
357 	    } else {
358 
359 		/* no match, error return */
360 		return (NIL);
361 	    }
362 	    break;
363 
364 	/* an operator, do something fancy */
365 	case OPER:
366 	    switch (OSYM(cs)) {
367 
368 	    /* this is an alternation */
369 	    case '|':
370 		if (matched)
371 
372 		    /* last thing in the alternation was a match, skip ahead */
373 		    cs = OPTR(cs);
374 		else
375 
376 		    /* no match, keep trying */
377 		    cs = ONEXT(cs);
378 		break;
379 
380 	    /* this is a grouping, recurse */
381 	    case '(':
382 		ptr = expmatch (s, ONEXT(cs), mstring);
383 		if (ptr != NIL) {
384 
385 		    /* the subexpression matched */
386 		    matched = 1;
387 		    s = ptr;
388 		} else if (*cs & ALT) {
389 
390 		    /* alternation, skip to next expression */
391 		    matched = 0;
392 		} else if (*cs & OPT) {
393 
394 		    /* the match is optional */
395 		    matched = 1;	/* indicate a successful match */
396 		} else {
397 
398 		    /* no match, error return */
399 		    return (NIL);
400 		}
401 		cs = OPTR(cs);
402 		break;
403 	    }
404 	    break;
405 
406 	/* try to match a metasymbol */
407 	case META:
408 	    switch (MSYM(cs)) {
409 
410 	    /* try to match anything and remember what was matched */
411 	    case 'p':
412 		/*
413 		 *  This is really the same as trying the match the
414 		 *  remaining parts of the expression to any subset
415 		 *  of the string.
416 		 */
417 		s1 = s;
418 		do {
419 		    ptr = expmatch (s1, MNEXT(cs), mstring);
420 		    if (ptr != NIL && s1 != s) {
421 
422 			/* we have a match, remember the match */
423 			strncpy (mstring, s, s1 - s);
424 			mstring[s1 - s] = '\0';
425 			return (ptr);
426 		    } else if (ptr != NIL && (*cs & OPT)) {
427 
428 			/* it was aoptional so no match is ok */
429 			return (ptr);
430 		    } else if (ptr != NIL) {
431 
432 			/* not optional and we still matched */
433 			return (NIL);
434 		    }
435 		    if (!isalnum(*s1) && *s1 != '_')
436 			return (NIL);
437 		    if (*s1 == '\\')
438 			_escaped = _escaped ? FALSE : TRUE;
439 		    else
440 			_escaped = FALSE;
441 		} while (*s1++);
442 		return (NIL);
443 
444 	    /* try to match anything */
445 	    case 'a':
446 		/*
447 		 *  This is really the same as trying the match the
448 		 *  remaining parts of the expression to any subset
449 		 *  of the string.
450 		 */
451 		s1 = s;
452 		do {
453 		    ptr = expmatch (s1, MNEXT(cs), mstring);
454 		    if (ptr != NIL && s1 != s) {
455 
456 			/* we have a match */
457 			return (ptr);
458 		    } else if (ptr != NIL && (*cs & OPT)) {
459 
460 			/* it was aoptional so no match is ok */
461 			return (ptr);
462 		    } else if (ptr != NIL) {
463 
464 			/* not optional and we still matched */
465 			return (NIL);
466 		    }
467 		    if (*s1 == '\\')
468 			_escaped = _escaped ? FALSE : TRUE;
469 		    else
470 			_escaped = FALSE;
471 		} while (*s1++);
472 		return (NIL);
473 
474 	    /* fail if we are currently _escaped */
475 	    case 'e':
476 		if (_escaped)
477 		    return(NIL);
478 		cs = MNEXT(cs);
479 		break;
480 
481 	    /* match any number of tabs and spaces */
482 	    case 'd':
483 		ptr = s;
484 		while (*s == ' ' || *s == '\t')
485 		    s++;
486 		if (s != ptr || s == _start) {
487 
488 		    /* match, be happy */
489 		    matched = 1;
490 		    cs = MNEXT(cs);
491 		} else if (*s == '\n' || *s == '\0') {
492 
493 		    /* match, be happy */
494 		    matched = 1;
495 		    cs = MNEXT(cs);
496 		} else if (*cs & ALT) {
497 
498 		    /* try the next part */
499 		    matched = 0;
500 		    cs = MNEXT(cs);
501 		} else if (*cs & OPT) {
502 
503 		    /* doesn't matter */
504 		    matched = 1;
505 		    cs = MNEXT(cs);
506 		} else
507 
508 		    /* no match, error return */
509 		    return (NIL);
510 		break;
511 
512 	    /* check for end of line */
513 	    case '$':
514 		if (*s == '\0' || *s == '\n') {
515 
516 		    /* match, be happy */
517 		    s++;
518 		    matched = 1;
519 		    cs = MNEXT(cs);
520 		} else if (*cs & ALT) {
521 
522 		    /* try the next part */
523 		    matched = 0;
524 		    cs = MNEXT(cs);
525 		} else if (*cs & OPT) {
526 
527 		    /* doesn't matter */
528 		    matched = 1;
529 		    cs = MNEXT(cs);
530 		} else
531 
532 		    /* no match, error return */
533 		    return (NIL);
534 		break;
535 
536 	    /* check for start of line */
537 	    case '^':
538 		if (s == _start) {
539 
540 		    /* match, be happy */
541 		    matched = 1;
542 		    cs = MNEXT(cs);
543 		} else if (*cs & ALT) {
544 
545 		    /* try the next part */
546 		    matched = 0;
547 		    cs = MNEXT(cs);
548 		} else if (*cs & OPT) {
549 
550 		    /* doesn't matter */
551 		    matched = 1;
552 		    cs = MNEXT(cs);
553 		} else
554 
555 		    /* no match, error return */
556 		    return (NIL);
557 		break;
558 
559 	    /* end of a subexpression, return success */
560 	    case ')':
561 		return (s);
562 	    }
563 	    break;
564 	}
565     }
566     return (s);
567 }
568