xref: /original-bsd/usr.bin/vgrind/regexp.c (revision da818fbb)
1 /*
2  * Copyright (c) 1980 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms are permitted
6  * provided that the above copyright notice and this paragraph are
7  * duplicated in all such forms and that any documentation,
8  * advertising materials, and other materials related to such
9  * distribution and use acknowledge that the software was developed
10  * by the University of California, Berkeley.  The name of the
11  * University may not be used to endorse or promote products derived
12  * from this software without specific prior written permission.
13  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
14  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
15  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
16  */
17 
18 #ifndef lint
19 static char sccsid[] = "@(#)regexp.c	5.2 (Berkeley) 10/25/88";
20 #endif /* not lint */
21 
22 #include <ctype.h>
23 
24 typedef int	boolean;
25 #define TRUE	1
26 #define FALSE	0
27 #define NIL	0
28 
29 boolean l_onecase;	/* true if upper and lower equivalent */
30 
31 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
32 
33 /*  STRNCMP -	like strncmp except that we convert the
34  *	 	first string to lower case before comparing
35  *		if l_onecase is set.
36  */
37 
38 STRNCMP(s1, s2, len)
39 	register char *s1,*s2;
40 	register int len;
41 {
42 	if (l_onecase) {
43 	    do
44 		if (*s2 - makelower(*s1))
45 			return (*s2 - makelower(*s1));
46 		else {
47 			s2++;
48 			s1++;
49 		}
50 	    while (--len);
51 	} else {
52 	    do
53 		if (*s2 - *s1)
54 			return (*s2 - *s1);
55 		else {
56 			s2++;
57 			s1++;
58 		}
59 	    while (--len);
60 	}
61 	return(0);
62 }
63 
64 /*	The following routine converts an irregular expression to
65  *	internal format.
66  *
67  *	Either meta symbols (\a \d or \p) or character strings or
68  *	operations ( alternation or perenthesizing ) can be
69  *	specified.  Each starts with a descriptor byte.  The descriptor
70  *	byte has STR set for strings, META set for meta symbols
71  *	and OPER set for operations.
72  *	The descriptor byte can also have the OPT bit set if the object
73  *	defined is optional.  Also ALT can be set to indicate an alternation.
74  *
75  *	For metasymbols the byte following the descriptor byte identities
76  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
77  *	strings the byte after the descriptor is a character count for
78  *	the string:
79  *
80  *		meta symbols := descriptor
81  *				symbol
82  *
83  *		strings :=	descriptor
84  *				character count
85  *				the string
86  *
87  *		operatins :=	descriptor
88  *				symbol
89  *				character count
90  */
91 
92 /*
93  *  handy macros for accessing parts of match blocks
94  */
95 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
96 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
97 
98 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
99 #define OCNT(A) (*(A+2))	/* character count */
100 #define ONEXT(A) (A+3)		/* next character after the operation */
101 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
102 
103 #define SCNT(A) (*(A+1))	/* byte count of a string */
104 #define SSTR(A) (A+2)		/* address of the string */
105 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
106 
107 /*
108  *  bit flags in the descriptor
109  */
110 #define OPT 1
111 #define STR 2
112 #define META 4
113 #define ALT 8
114 #define OPER 16
115 
116 char *ure;		/* pointer current position in unconverted exp */
117 char *ccre;		/* pointer to current position in converted exp*/
118 char *malloc();
119 
120 char *
121 convexp(re)
122     char *re;		/* unconverted irregular expression */
123 {
124     register char *cre;		/* pointer to converted regular expression */
125 
126     /* allocate room for the converted expression */
127     if (re == NIL)
128 	return (NIL);
129     if (*re == '\0')
130 	return (NIL);
131     cre = malloc (4 * strlen(re) + 3);
132     ccre = cre;
133     ure = re;
134 
135     /* start the conversion with a \a */
136     *cre = META | OPT;
137     MSYM(cre) = 'a';
138     ccre = MNEXT(cre);
139 
140     /* start the conversion (its recursive) */
141     expconv ();
142     *ccre = 0;
143     return (cre);
144 }
145 
146 expconv()
147 {
148     register char *cs;		/* pointer to current symbol in converted exp */
149     register char c;		/* character being processed */
150     register char *acs;		/* pinter to last alternate */
151     register int temp;
152 
153     /* let the conversion begin */
154     acs = NIL;
155     cs = NIL;
156     while (*ure != NIL) {
157 	switch (c = *ure++) {
158 
159 	case '\\':
160 	    switch (c = *ure++) {
161 
162 	    /* escaped characters are just characters */
163 	    default:
164 		if (cs == NIL || (*cs & STR) == 0) {
165 		    cs = ccre;
166 		    *cs = STR;
167 		    SCNT(cs) = 1;
168 		    ccre += 2;
169 		} else
170 		    SCNT(cs)++;
171 		*ccre++ = c;
172 		break;
173 
174 	    /* normal(?) metacharacters */
175 	    case 'a':
176 	    case 'd':
177 	    case 'e':
178 	    case 'p':
179 		if (acs != NIL && acs != cs) {
180 		    do {
181 			temp = OCNT(acs);
182 			OCNT(acs) = ccre - acs;
183 			acs -= temp;
184 		    } while (temp != 0);
185 		    acs = NIL;
186 		}
187 		cs = ccre;
188 		*cs = META;
189 		MSYM(cs) = c;
190 		ccre = MNEXT(cs);
191 		break;
192 	    }
193 	    break;
194 
195 	/* just put the symbol in */
196 	case '^':
197 	case '$':
198 	    if (acs != NIL && acs != cs) {
199 		do {
200 		    temp = OCNT(acs);
201 		    OCNT(acs) = ccre - acs;
202 		    acs -= temp;
203 		} while (temp != 0);
204 		acs = NIL;
205 	    }
206 	    cs = ccre;
207 	    *cs = META;
208 	    MSYM(cs) = c;
209 	    ccre = MNEXT(cs);
210 	    break;
211 
212 	/* mark the last match sequence as optional */
213 	case '?':
214 	    if (cs)
215 	    	*cs = *cs | OPT;
216 	    break;
217 
218 	/* recurse and define a subexpression */
219 	case '(':
220 	    if (acs != NIL && acs != cs) {
221 		do {
222 		    temp = OCNT(acs);
223 		    OCNT(acs) = ccre - acs;
224 		    acs -= temp;
225 		} while (temp != 0);
226 		acs = NIL;
227 	    }
228 	    cs = ccre;
229 	    *cs = OPER;
230 	    OSYM(cs) = '(';
231 	    ccre = ONEXT(cs);
232 	    expconv ();
233 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
234 	    break;
235 
236 	/* return from a recursion */
237 	case ')':
238 	    if (acs != NIL) {
239 		do {
240 		    temp = OCNT(acs);
241 		    OCNT(acs) = ccre - acs;
242 		    acs -= temp;
243 		} while (temp != 0);
244 		acs = NIL;
245 	    }
246 	    cs = ccre;
247 	    *cs = META;
248 	    MSYM(cs) = c;
249 	    ccre = MNEXT(cs);
250 	    return;
251 
252 	/* mark the last match sequence as having an alternate */
253 	/* the third byte will contain an offset to jump over the */
254 	/* alternate match in case the first did not fail */
255 	case '|':
256 	    if (acs != NIL && acs != cs)
257 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
258 	    else
259 		OCNT(ccre) = 0;
260 	    *cs |= ALT;
261 	    cs = ccre;
262 	    *cs = OPER;
263 	    OSYM(cs) = '|';
264 	    ccre = ONEXT(cs);
265 	    acs = cs;	/* remember that the pointer is to be filles */
266 	    break;
267 
268 	/* if its not a metasymbol just build a scharacter string */
269 	default:
270 	    if (cs == NIL || (*cs & STR) == 0) {
271 		cs = ccre;
272 		*cs = STR;
273 		SCNT(cs) = 1;
274 		ccre = SSTR(cs);
275 	    } else
276 		SCNT(cs)++;
277 	    *ccre++ = c;
278 	    break;
279 	}
280     }
281     if (acs != NIL) {
282 	do {
283 	    temp = OCNT(acs);
284 	    OCNT(acs) = ccre - acs;
285 	    acs -= temp;
286 	} while (temp != 0);
287 	acs = NIL;
288     }
289     return;
290 }
291 /* end of convertre */
292 
293 
294 /*
295  *	The following routine recognises an irregular expresion
296  *	with the following special characters:
297  *
298  *		\?	-	means last match was optional
299  *		\a	-	matches any number of characters
300  *		\d	-	matches any number of spaces and tabs
301  *		\p	-	matches any number of alphanumeric
302  *				characters. The
303  *				characters matched will be copied into
304  *				the area pointed to by 'name'.
305  *		\|	-	alternation
306  *		\( \)	-	grouping used mostly for alternation and
307  *				optionality
308  *
309  *	The irregular expression must be translated to internal form
310  *	prior to calling this routine
311  *
312  *	The value returned is the pointer to the first non \a
313  *	character matched.
314  */
315 
316 boolean _escaped;		/* true if we are currently _escaped */
317 char *_start;			/* start of string */
318 
319 char *
320 expmatch (s, re, mstring)
321     register char *s;		/* string to check for a match in */
322     register char *re;		/* a converted irregular expression */
323     register char *mstring;	/* where to put whatever matches a \p */
324 {
325     register char *cs;		/* the current symbol */
326     register char *ptr,*s1;	/* temporary pointer */
327     boolean matched;		/* a temporary boolean */
328 
329     /* initial conditions */
330     if (re == NIL)
331 	return (NIL);
332     cs = re;
333     matched = FALSE;
334 
335     /* loop till expression string is exhausted (or at least pretty tired) */
336     while (*cs) {
337 	switch (*cs & (OPER | STR | META)) {
338 
339 	/* try to match a string */
340 	case STR:
341 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
342 	    if (matched) {
343 
344 		/* hoorah it matches */
345 		s += SCNT(cs);
346 		cs = SNEXT(cs);
347 	    } else if (*cs & ALT) {
348 
349 		/* alternation, skip to next expression */
350 		cs = SNEXT(cs);
351 	    } else if (*cs & OPT) {
352 
353 		/* the match is optional */
354 		cs = SNEXT(cs);
355 		matched = 1;		/* indicate a successful match */
356 	    } else {
357 
358 		/* no match, error return */
359 		return (NIL);
360 	    }
361 	    break;
362 
363 	/* an operator, do something fancy */
364 	case OPER:
365 	    switch (OSYM(cs)) {
366 
367 	    /* this is an alternation */
368 	    case '|':
369 		if (matched)
370 
371 		    /* last thing in the alternation was a match, skip ahead */
372 		    cs = OPTR(cs);
373 		else
374 
375 		    /* no match, keep trying */
376 		    cs = ONEXT(cs);
377 		break;
378 
379 	    /* this is a grouping, recurse */
380 	    case '(':
381 		ptr = expmatch (s, ONEXT(cs), mstring);
382 		if (ptr != NIL) {
383 
384 		    /* the subexpression matched */
385 		    matched = 1;
386 		    s = ptr;
387 		} else if (*cs & ALT) {
388 
389 		    /* alternation, skip to next expression */
390 		    matched = 0;
391 		} else if (*cs & OPT) {
392 
393 		    /* the match is optional */
394 		    matched = 1;	/* indicate a successful match */
395 		} else {
396 
397 		    /* no match, error return */
398 		    return (NIL);
399 		}
400 		cs = OPTR(cs);
401 		break;
402 	    }
403 	    break;
404 
405 	/* try to match a metasymbol */
406 	case META:
407 	    switch (MSYM(cs)) {
408 
409 	    /* try to match anything and remember what was matched */
410 	    case 'p':
411 		/*
412 		 *  This is really the same as trying the match the
413 		 *  remaining parts of the expression to any subset
414 		 *  of the string.
415 		 */
416 		s1 = s;
417 		do {
418 		    ptr = expmatch (s1, MNEXT(cs), mstring);
419 		    if (ptr != NIL && s1 != s) {
420 
421 			/* we have a match, remember the match */
422 			strncpy (mstring, s, s1 - s);
423 			mstring[s1 - s] = '\0';
424 			return (ptr);
425 		    } else if (ptr != NIL && (*cs & OPT)) {
426 
427 			/* it was aoptional so no match is ok */
428 			return (ptr);
429 		    } else if (ptr != NIL) {
430 
431 			/* not optional and we still matched */
432 			return (NIL);
433 		    }
434 		    if (!isalnum(*s1) && *s1 != '_')
435 			return (NIL);
436 		    if (*s1 == '\\')
437 			_escaped = _escaped ? FALSE : TRUE;
438 		    else
439 			_escaped = FALSE;
440 		} while (*s1++);
441 		return (NIL);
442 
443 	    /* try to match anything */
444 	    case 'a':
445 		/*
446 		 *  This is really the same as trying the match the
447 		 *  remaining parts of the expression to any subset
448 		 *  of the string.
449 		 */
450 		s1 = s;
451 		do {
452 		    ptr = expmatch (s1, MNEXT(cs), mstring);
453 		    if (ptr != NIL && s1 != s) {
454 
455 			/* we have a match */
456 			return (ptr);
457 		    } else if (ptr != NIL && (*cs & OPT)) {
458 
459 			/* it was aoptional so no match is ok */
460 			return (ptr);
461 		    } else if (ptr != NIL) {
462 
463 			/* not optional and we still matched */
464 			return (NIL);
465 		    }
466 		    if (*s1 == '\\')
467 			_escaped = _escaped ? FALSE : TRUE;
468 		    else
469 			_escaped = FALSE;
470 		} while (*s1++);
471 		return (NIL);
472 
473 	    /* fail if we are currently _escaped */
474 	    case 'e':
475 		if (_escaped)
476 		    return(NIL);
477 		cs = MNEXT(cs);
478 		break;
479 
480 	    /* match any number of tabs and spaces */
481 	    case 'd':
482 		ptr = s;
483 		while (*s == ' ' || *s == '\t')
484 		    s++;
485 		if (s != ptr || s == _start) {
486 
487 		    /* match, be happy */
488 		    matched = 1;
489 		    cs = MNEXT(cs);
490 		} else if (*s == '\n' || *s == '\0') {
491 
492 		    /* match, be happy */
493 		    matched = 1;
494 		    cs = MNEXT(cs);
495 		} else if (*cs & ALT) {
496 
497 		    /* try the next part */
498 		    matched = 0;
499 		    cs = MNEXT(cs);
500 		} else if (*cs & OPT) {
501 
502 		    /* doesn't matter */
503 		    matched = 1;
504 		    cs = MNEXT(cs);
505 		} else
506 
507 		    /* no match, error return */
508 		    return (NIL);
509 		break;
510 
511 	    /* check for end of line */
512 	    case '$':
513 		if (*s == '\0' || *s == '\n') {
514 
515 		    /* match, be happy */
516 		    s++;
517 		    matched = 1;
518 		    cs = MNEXT(cs);
519 		} else if (*cs & ALT) {
520 
521 		    /* try the next part */
522 		    matched = 0;
523 		    cs = MNEXT(cs);
524 		} else if (*cs & OPT) {
525 
526 		    /* doesn't matter */
527 		    matched = 1;
528 		    cs = MNEXT(cs);
529 		} else
530 
531 		    /* no match, error return */
532 		    return (NIL);
533 		break;
534 
535 	    /* check for start of line */
536 	    case '^':
537 		if (s == _start) {
538 
539 		    /* match, be happy */
540 		    matched = 1;
541 		    cs = MNEXT(cs);
542 		} else if (*cs & ALT) {
543 
544 		    /* try the next part */
545 		    matched = 0;
546 		    cs = MNEXT(cs);
547 		} else if (*cs & OPT) {
548 
549 		    /* doesn't matter */
550 		    matched = 1;
551 		    cs = MNEXT(cs);
552 		} else
553 
554 		    /* no match, error return */
555 		    return (NIL);
556 		break;
557 
558 	    /* end of a subexpression, return success */
559 	    case ')':
560 		return (s);
561 	    }
562 	    break;
563 	}
564     }
565     return (s);
566 }
567