xref: /original-bsd/usr.bin/vgrind/regexp.c (revision e1db577d)
1 /*
2  * Copyright (c) 1980 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * %sccs.include.redist.c%
6  */
7 
8 #ifndef lint
9 static char sccsid[] = "@(#)regexp.c	5.3 (Berkeley) 06/01/90";
10 #endif /* not lint */
11 
12 #include <ctype.h>
13 
14 typedef int	boolean;
15 #define TRUE	1
16 #define FALSE	0
17 #define NIL	0
18 
19 boolean l_onecase;	/* true if upper and lower equivalent */
20 
21 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
22 
23 /*  STRNCMP -	like strncmp except that we convert the
24  *	 	first string to lower case before comparing
25  *		if l_onecase is set.
26  */
27 
28 STRNCMP(s1, s2, len)
29 	register char *s1,*s2;
30 	register int len;
31 {
32 	if (l_onecase) {
33 	    do
34 		if (*s2 - makelower(*s1))
35 			return (*s2 - makelower(*s1));
36 		else {
37 			s2++;
38 			s1++;
39 		}
40 	    while (--len);
41 	} else {
42 	    do
43 		if (*s2 - *s1)
44 			return (*s2 - *s1);
45 		else {
46 			s2++;
47 			s1++;
48 		}
49 	    while (--len);
50 	}
51 	return(0);
52 }
53 
54 /*	The following routine converts an irregular expression to
55  *	internal format.
56  *
57  *	Either meta symbols (\a \d or \p) or character strings or
58  *	operations ( alternation or perenthesizing ) can be
59  *	specified.  Each starts with a descriptor byte.  The descriptor
60  *	byte has STR set for strings, META set for meta symbols
61  *	and OPER set for operations.
62  *	The descriptor byte can also have the OPT bit set if the object
63  *	defined is optional.  Also ALT can be set to indicate an alternation.
64  *
65  *	For metasymbols the byte following the descriptor byte identities
66  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
67  *	strings the byte after the descriptor is a character count for
68  *	the string:
69  *
70  *		meta symbols := descriptor
71  *				symbol
72  *
73  *		strings :=	descriptor
74  *				character count
75  *				the string
76  *
77  *		operatins :=	descriptor
78  *				symbol
79  *				character count
80  */
81 
82 /*
83  *  handy macros for accessing parts of match blocks
84  */
85 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
86 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
87 
88 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
89 #define OCNT(A) (*(A+2))	/* character count */
90 #define ONEXT(A) (A+3)		/* next character after the operation */
91 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
92 
93 #define SCNT(A) (*(A+1))	/* byte count of a string */
94 #define SSTR(A) (A+2)		/* address of the string */
95 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
96 
97 /*
98  *  bit flags in the descriptor
99  */
100 #define OPT 1
101 #define STR 2
102 #define META 4
103 #define ALT 8
104 #define OPER 16
105 
106 char *ure;		/* pointer current position in unconverted exp */
107 char *ccre;		/* pointer to current position in converted exp*/
108 char *malloc();
109 
110 char *
111 convexp(re)
112     char *re;		/* unconverted irregular expression */
113 {
114     register char *cre;		/* pointer to converted regular expression */
115 
116     /* allocate room for the converted expression */
117     if (re == NIL)
118 	return (NIL);
119     if (*re == '\0')
120 	return (NIL);
121     cre = malloc (4 * strlen(re) + 3);
122     ccre = cre;
123     ure = re;
124 
125     /* start the conversion with a \a */
126     *cre = META | OPT;
127     MSYM(cre) = 'a';
128     ccre = MNEXT(cre);
129 
130     /* start the conversion (its recursive) */
131     expconv ();
132     *ccre = 0;
133     return (cre);
134 }
135 
136 expconv()
137 {
138     register char *cs;		/* pointer to current symbol in converted exp */
139     register char c;		/* character being processed */
140     register char *acs;		/* pinter to last alternate */
141     register int temp;
142 
143     /* let the conversion begin */
144     acs = NIL;
145     cs = NIL;
146     while (*ure != NIL) {
147 	switch (c = *ure++) {
148 
149 	case '\\':
150 	    switch (c = *ure++) {
151 
152 	    /* escaped characters are just characters */
153 	    default:
154 		if (cs == NIL || (*cs & STR) == 0) {
155 		    cs = ccre;
156 		    *cs = STR;
157 		    SCNT(cs) = 1;
158 		    ccre += 2;
159 		} else
160 		    SCNT(cs)++;
161 		*ccre++ = c;
162 		break;
163 
164 	    /* normal(?) metacharacters */
165 	    case 'a':
166 	    case 'd':
167 	    case 'e':
168 	    case 'p':
169 		if (acs != NIL && acs != cs) {
170 		    do {
171 			temp = OCNT(acs);
172 			OCNT(acs) = ccre - acs;
173 			acs -= temp;
174 		    } while (temp != 0);
175 		    acs = NIL;
176 		}
177 		cs = ccre;
178 		*cs = META;
179 		MSYM(cs) = c;
180 		ccre = MNEXT(cs);
181 		break;
182 	    }
183 	    break;
184 
185 	/* just put the symbol in */
186 	case '^':
187 	case '$':
188 	    if (acs != NIL && acs != cs) {
189 		do {
190 		    temp = OCNT(acs);
191 		    OCNT(acs) = ccre - acs;
192 		    acs -= temp;
193 		} while (temp != 0);
194 		acs = NIL;
195 	    }
196 	    cs = ccre;
197 	    *cs = META;
198 	    MSYM(cs) = c;
199 	    ccre = MNEXT(cs);
200 	    break;
201 
202 	/* mark the last match sequence as optional */
203 	case '?':
204 	    if (cs)
205 	    	*cs = *cs | OPT;
206 	    break;
207 
208 	/* recurse and define a subexpression */
209 	case '(':
210 	    if (acs != NIL && acs != cs) {
211 		do {
212 		    temp = OCNT(acs);
213 		    OCNT(acs) = ccre - acs;
214 		    acs -= temp;
215 		} while (temp != 0);
216 		acs = NIL;
217 	    }
218 	    cs = ccre;
219 	    *cs = OPER;
220 	    OSYM(cs) = '(';
221 	    ccre = ONEXT(cs);
222 	    expconv ();
223 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
224 	    break;
225 
226 	/* return from a recursion */
227 	case ')':
228 	    if (acs != NIL) {
229 		do {
230 		    temp = OCNT(acs);
231 		    OCNT(acs) = ccre - acs;
232 		    acs -= temp;
233 		} while (temp != 0);
234 		acs = NIL;
235 	    }
236 	    cs = ccre;
237 	    *cs = META;
238 	    MSYM(cs) = c;
239 	    ccre = MNEXT(cs);
240 	    return;
241 
242 	/* mark the last match sequence as having an alternate */
243 	/* the third byte will contain an offset to jump over the */
244 	/* alternate match in case the first did not fail */
245 	case '|':
246 	    if (acs != NIL && acs != cs)
247 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
248 	    else
249 		OCNT(ccre) = 0;
250 	    *cs |= ALT;
251 	    cs = ccre;
252 	    *cs = OPER;
253 	    OSYM(cs) = '|';
254 	    ccre = ONEXT(cs);
255 	    acs = cs;	/* remember that the pointer is to be filles */
256 	    break;
257 
258 	/* if its not a metasymbol just build a scharacter string */
259 	default:
260 	    if (cs == NIL || (*cs & STR) == 0) {
261 		cs = ccre;
262 		*cs = STR;
263 		SCNT(cs) = 1;
264 		ccre = SSTR(cs);
265 	    } else
266 		SCNT(cs)++;
267 	    *ccre++ = c;
268 	    break;
269 	}
270     }
271     if (acs != NIL) {
272 	do {
273 	    temp = OCNT(acs);
274 	    OCNT(acs) = ccre - acs;
275 	    acs -= temp;
276 	} while (temp != 0);
277 	acs = NIL;
278     }
279     return;
280 }
281 /* end of convertre */
282 
283 
284 /*
285  *	The following routine recognises an irregular expresion
286  *	with the following special characters:
287  *
288  *		\?	-	means last match was optional
289  *		\a	-	matches any number of characters
290  *		\d	-	matches any number of spaces and tabs
291  *		\p	-	matches any number of alphanumeric
292  *				characters. The
293  *				characters matched will be copied into
294  *				the area pointed to by 'name'.
295  *		\|	-	alternation
296  *		\( \)	-	grouping used mostly for alternation and
297  *				optionality
298  *
299  *	The irregular expression must be translated to internal form
300  *	prior to calling this routine
301  *
302  *	The value returned is the pointer to the first non \a
303  *	character matched.
304  */
305 
306 boolean _escaped;		/* true if we are currently _escaped */
307 char *_start;			/* start of string */
308 
309 char *
310 expmatch (s, re, mstring)
311     register char *s;		/* string to check for a match in */
312     register char *re;		/* a converted irregular expression */
313     register char *mstring;	/* where to put whatever matches a \p */
314 {
315     register char *cs;		/* the current symbol */
316     register char *ptr,*s1;	/* temporary pointer */
317     boolean matched;		/* a temporary boolean */
318 
319     /* initial conditions */
320     if (re == NIL)
321 	return (NIL);
322     cs = re;
323     matched = FALSE;
324 
325     /* loop till expression string is exhausted (or at least pretty tired) */
326     while (*cs) {
327 	switch (*cs & (OPER | STR | META)) {
328 
329 	/* try to match a string */
330 	case STR:
331 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
332 	    if (matched) {
333 
334 		/* hoorah it matches */
335 		s += SCNT(cs);
336 		cs = SNEXT(cs);
337 	    } else if (*cs & ALT) {
338 
339 		/* alternation, skip to next expression */
340 		cs = SNEXT(cs);
341 	    } else if (*cs & OPT) {
342 
343 		/* the match is optional */
344 		cs = SNEXT(cs);
345 		matched = 1;		/* indicate a successful match */
346 	    } else {
347 
348 		/* no match, error return */
349 		return (NIL);
350 	    }
351 	    break;
352 
353 	/* an operator, do something fancy */
354 	case OPER:
355 	    switch (OSYM(cs)) {
356 
357 	    /* this is an alternation */
358 	    case '|':
359 		if (matched)
360 
361 		    /* last thing in the alternation was a match, skip ahead */
362 		    cs = OPTR(cs);
363 		else
364 
365 		    /* no match, keep trying */
366 		    cs = ONEXT(cs);
367 		break;
368 
369 	    /* this is a grouping, recurse */
370 	    case '(':
371 		ptr = expmatch (s, ONEXT(cs), mstring);
372 		if (ptr != NIL) {
373 
374 		    /* the subexpression matched */
375 		    matched = 1;
376 		    s = ptr;
377 		} else if (*cs & ALT) {
378 
379 		    /* alternation, skip to next expression */
380 		    matched = 0;
381 		} else if (*cs & OPT) {
382 
383 		    /* the match is optional */
384 		    matched = 1;	/* indicate a successful match */
385 		} else {
386 
387 		    /* no match, error return */
388 		    return (NIL);
389 		}
390 		cs = OPTR(cs);
391 		break;
392 	    }
393 	    break;
394 
395 	/* try to match a metasymbol */
396 	case META:
397 	    switch (MSYM(cs)) {
398 
399 	    /* try to match anything and remember what was matched */
400 	    case 'p':
401 		/*
402 		 *  This is really the same as trying the match the
403 		 *  remaining parts of the expression to any subset
404 		 *  of the string.
405 		 */
406 		s1 = s;
407 		do {
408 		    ptr = expmatch (s1, MNEXT(cs), mstring);
409 		    if (ptr != NIL && s1 != s) {
410 
411 			/* we have a match, remember the match */
412 			strncpy (mstring, s, s1 - s);
413 			mstring[s1 - s] = '\0';
414 			return (ptr);
415 		    } else if (ptr != NIL && (*cs & OPT)) {
416 
417 			/* it was aoptional so no match is ok */
418 			return (ptr);
419 		    } else if (ptr != NIL) {
420 
421 			/* not optional and we still matched */
422 			return (NIL);
423 		    }
424 		    if (!isalnum(*s1) && *s1 != '_')
425 			return (NIL);
426 		    if (*s1 == '\\')
427 			_escaped = _escaped ? FALSE : TRUE;
428 		    else
429 			_escaped = FALSE;
430 		} while (*s1++);
431 		return (NIL);
432 
433 	    /* try to match anything */
434 	    case 'a':
435 		/*
436 		 *  This is really the same as trying the match the
437 		 *  remaining parts of the expression to any subset
438 		 *  of the string.
439 		 */
440 		s1 = s;
441 		do {
442 		    ptr = expmatch (s1, MNEXT(cs), mstring);
443 		    if (ptr != NIL && s1 != s) {
444 
445 			/* we have a match */
446 			return (ptr);
447 		    } else if (ptr != NIL && (*cs & OPT)) {
448 
449 			/* it was aoptional so no match is ok */
450 			return (ptr);
451 		    } else if (ptr != NIL) {
452 
453 			/* not optional and we still matched */
454 			return (NIL);
455 		    }
456 		    if (*s1 == '\\')
457 			_escaped = _escaped ? FALSE : TRUE;
458 		    else
459 			_escaped = FALSE;
460 		} while (*s1++);
461 		return (NIL);
462 
463 	    /* fail if we are currently _escaped */
464 	    case 'e':
465 		if (_escaped)
466 		    return(NIL);
467 		cs = MNEXT(cs);
468 		break;
469 
470 	    /* match any number of tabs and spaces */
471 	    case 'd':
472 		ptr = s;
473 		while (*s == ' ' || *s == '\t')
474 		    s++;
475 		if (s != ptr || s == _start) {
476 
477 		    /* match, be happy */
478 		    matched = 1;
479 		    cs = MNEXT(cs);
480 		} else if (*s == '\n' || *s == '\0') {
481 
482 		    /* match, be happy */
483 		    matched = 1;
484 		    cs = MNEXT(cs);
485 		} else if (*cs & ALT) {
486 
487 		    /* try the next part */
488 		    matched = 0;
489 		    cs = MNEXT(cs);
490 		} else if (*cs & OPT) {
491 
492 		    /* doesn't matter */
493 		    matched = 1;
494 		    cs = MNEXT(cs);
495 		} else
496 
497 		    /* no match, error return */
498 		    return (NIL);
499 		break;
500 
501 	    /* check for end of line */
502 	    case '$':
503 		if (*s == '\0' || *s == '\n') {
504 
505 		    /* match, be happy */
506 		    s++;
507 		    matched = 1;
508 		    cs = MNEXT(cs);
509 		} else if (*cs & ALT) {
510 
511 		    /* try the next part */
512 		    matched = 0;
513 		    cs = MNEXT(cs);
514 		} else if (*cs & OPT) {
515 
516 		    /* doesn't matter */
517 		    matched = 1;
518 		    cs = MNEXT(cs);
519 		} else
520 
521 		    /* no match, error return */
522 		    return (NIL);
523 		break;
524 
525 	    /* check for start of line */
526 	    case '^':
527 		if (s == _start) {
528 
529 		    /* match, be happy */
530 		    matched = 1;
531 		    cs = MNEXT(cs);
532 		} else if (*cs & ALT) {
533 
534 		    /* try the next part */
535 		    matched = 0;
536 		    cs = MNEXT(cs);
537 		} else if (*cs & OPT) {
538 
539 		    /* doesn't matter */
540 		    matched = 1;
541 		    cs = MNEXT(cs);
542 		} else
543 
544 		    /* no match, error return */
545 		    return (NIL);
546 		break;
547 
548 	    /* end of a subexpression, return success */
549 	    case ')':
550 		return (s);
551 	    }
552 	    break;
553 	}
554     }
555     return (s);
556 }
557