xref: /original-bsd/usr.bin/vgrind/regexp.c (revision 26e4b21d)
1 static char sccsid[] = "@(#)regexp.c	4.1	(Berkeley)	10/19/82";
2 
3 #include <ctype.h>
4 
5 typedef int	boolean;
6 #define TRUE	1
7 #define FALSE	0
8 #define NIL	0
9 
10 boolean l_onecase;	/* true if upper and lower equivalent */
11 
12 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
13 
14 /*  STRNCMP -	like strncmp except that we convert the
15  *	 	first string to lower case before comparing
16  *		if l_onecase is set.
17  */
18 
19 STRNCMP(s1, s2, len)
20 	register char *s1,*s2;
21 	register int len;
22 {
23 	if (l_onecase) {
24 	    do
25 		if (*s2 - makelower(*s1))
26 			return (*s2 - makelower(*s1));
27 		else {
28 			s2++;
29 			s1++;
30 		}
31 	    while (--len);
32 	} else {
33 	    do
34 		if (*s2 - *s1)
35 			return (*s2 - *s1);
36 		else {
37 			s2++;
38 			s1++;
39 		}
40 	    while (--len);
41 	}
42 	return(0);
43 }
44 
45 /*	The following routine converts an irregular expression to
46  *	internal format.
47  *
48  *	Either meta symbols (\a \d or \p) or character strings or
49  *	operations ( alternation or perenthesizing ) can be
50  *	specified.  Each starts with a descriptor byte.  The descriptor
51  *	byte has STR set for strings, META set for meta symbols
52  *	and OPER set for operations.
53  *	The descriptor byte can also have the OPT bit set if the object
54  *	defined is optional.  Also ALT can be set to indicate an alternation.
55  *
56  *	For metasymbols the byte following the descriptor byte identities
57  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
58  *	strings the byte after the descriptor is a character count for
59  *	the string:
60  *
61  *		meta symbols := descriptor
62  *				symbol
63  *
64  *		strings :=	descriptor
65  *				character count
66  *				the string
67  *
68  *		operatins :=	descriptor
69  *				symbol
70  *				character count
71  */
72 
73 /*
74  *  handy macros for accessing parts of match blocks
75  */
76 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
77 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
78 
79 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
80 #define OCNT(A) (*(A+2))	/* character count */
81 #define ONEXT(A) (A+3)		/* next character after the operation */
82 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
83 
84 #define SCNT(A) (*(A+1))	/* byte count of a string */
85 #define SSTR(A) (A+2)		/* address of the string */
86 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
87 
88 /*
89  *  bit flags in the descriptor
90  */
91 #define OPT 1
92 #define STR 2
93 #define META 4
94 #define ALT 8
95 #define OPER 16
96 
97 char *ure;		/* pointer current position in unconverted exp */
98 char *ccre;		/* pointer to current position in converted exp*/
99 char *malloc();
100 
101 char *
102 convexp(re)
103     char *re;		/* unconverted irregular expression */
104 {
105     register char *cre;		/* pointer to converted regular expression */
106 
107     /* allocate room for the converted expression */
108     if (re == NIL)
109 	return (NIL);
110     if (*re == '\0')
111 	return (NIL);
112     cre = malloc (4 * strlen(re) + 3);
113     ccre = cre;
114     ure = re;
115 
116     /* start the conversion with a \a */
117     *cre = META | OPT;
118     MSYM(cre) = 'a';
119     ccre = MNEXT(cre);
120 
121     /* start the conversion (its recursive) */
122     expconv ();
123     *ccre = 0;
124     return (cre);
125 }
126 
127 expconv()
128 {
129     register char *cs;		/* pointer to current symbol in converted exp */
130     register char c;		/* character being processed */
131     register char *acs;		/* pinter to last alternate */
132     register int temp;
133 
134     /* let the conversion begin */
135     acs = NIL;
136     while (*ure != NIL) {
137 	switch (c = *ure++) {
138 
139 	case '\\':
140 	    switch (c = *ure++) {
141 
142 	    /* escaped characters are just characters */
143 	    default:
144 		if ((*cs & STR) == 0) {
145 		    cs = ccre;
146 		    *cs = STR;
147 		    SCNT(cs) = 1;
148 		    ccre += 2;
149 		} else
150 		    SCNT(cs)++;
151 		*ccre++ = c;
152 		break;
153 
154 	    /* normal(?) metacharacters */
155 	    case 'a':
156 	    case 'd':
157 	    case 'e':
158 	    case 'p':
159 		if (acs != NIL && acs != cs) {
160 		    do {
161 			temp = OCNT(acs);
162 			OCNT(acs) = ccre - acs;
163 			acs -= temp;
164 		    } while (temp != 0);
165 		    acs = NIL;
166 		}
167 		cs = ccre;
168 		*cs = META;
169 		MSYM(cs) = c;
170 		ccre = MNEXT(cs);
171 		break;
172 	    }
173 	    break;
174 
175 	/* just put the symbol in */
176 	case '^':
177 	case '$':
178 	    if (acs != NIL && acs != cs) {
179 		do {
180 		    temp = OCNT(acs);
181 		    OCNT(acs) = ccre - acs;
182 		    acs -= temp;
183 		} while (temp != 0);
184 		acs = NIL;
185 	    }
186 	    cs = ccre;
187 	    *cs = META;
188 	    MSYM(cs) = c;
189 	    ccre = MNEXT(cs);
190 	    break;
191 
192 	/* mark the last match sequence as optional */
193 	case '?':
194 	    *cs = *cs | OPT;
195 	    break;
196 
197 	/* recurse and define a subexpression */
198 	case '(':
199 	    if (acs != NIL && acs != cs) {
200 		do {
201 		    temp = OCNT(acs);
202 		    OCNT(acs) = ccre - acs;
203 		    acs -= temp;
204 		} while (temp != 0);
205 		acs = NIL;
206 	    }
207 	    cs = ccre;
208 	    *cs = OPER;
209 	    OSYM(cs) = '(';
210 	    ccre = ONEXT(cs);
211 	    expconv ();
212 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
213 	    break;
214 
215 	/* return from a recursion */
216 	case ')':
217 	    if (acs != NIL) {
218 		do {
219 		    temp = OCNT(acs);
220 		    OCNT(acs) = ccre - acs;
221 		    acs -= temp;
222 		} while (temp != 0);
223 		acs = NIL;
224 	    }
225 	    cs = ccre;
226 	    *cs = META;
227 	    MSYM(cs) = c;
228 	    ccre = MNEXT(cs);
229 	    return;
230 
231 	/* mark the last match sequence as having an alternate */
232 	/* the third byte will contain an offset to jump over the */
233 	/* alternate match in case the first did not fail */
234 	case '|':
235 	    if (acs != NIL && acs != cs)
236 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
237 	    else
238 		OCNT(ccre) = 0;
239 	    *cs |= ALT;
240 	    cs = ccre;
241 	    *cs = OPER;
242 	    OSYM(cs) = '|';
243 	    ccre = ONEXT(cs);
244 	    acs = cs;	/* remember that the pointer is to be filles */
245 	    break;
246 
247 	/* if its not a metasymbol just build a scharacter string */
248 	default:
249 	    if ((*cs & STR) == 0) {
250 		cs = ccre;
251 		*cs = STR;
252 		SCNT(cs) = 1;
253 		ccre = SSTR(cs);
254 	    } else
255 		SCNT(cs)++;
256 	    *ccre++ = c;
257 	    break;
258 	}
259     }
260     if (acs != NIL) {
261 	do {
262 	    temp = OCNT(acs);
263 	    OCNT(acs) = ccre - acs;
264 	    acs -= temp;
265 	} while (temp != 0);
266 	acs = NIL;
267     }
268     return;
269 }
270 /* end of convertre */
271 
272 
273 /*
274  *	The following routine recognises an irregular expresion
275  *	with the following special characters:
276  *
277  *		\?	-	means last match was optional
278  *		\a	-	matches any number of characters
279  *		\d	-	matches any number of spaces and tabs
280  *		\p	-	matches any number of alphanumeric
281  *				characters. The
282  *				characters matched will be copied into
283  *				the area pointed to by 'name'.
284  *		\|	-	alternation
285  *		\( \)	-	grouping used mostly for alternation and
286  *				optionality
287  *
288  *	The irregular expression must be translated to internal form
289  *	prior to calling this routine
290  *
291  *	The value returned is the pointer to the first non \a
292  *	character matched.
293  */
294 
295 boolean _escaped;		/* true if we are currently _escaped */
296 char *_start;			/* start of string */
297 
298 char *
299 expmatch (s, re, mstring)
300     register char *s;		/* string to check for a match in */
301     register char *re;		/* a converted irregular expression */
302     register char *mstring;	/* where to put whatever matches a \p */
303 {
304     register char *cs;		/* the current symbol */
305     register char *ptr,*s1;	/* temporary pointer */
306     boolean matched;		/* a temporary boolean */
307 
308     /* initial conditions */
309     if (re == NIL)
310 	return (NIL);
311     cs = re;
312     matched = FALSE;
313 
314     /* loop till expression string is exhausted (or at least pretty tired) */
315     while (*cs) {
316 	switch (*cs & (OPER | STR | META)) {
317 
318 	/* try to match a string */
319 	case STR:
320 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
321 	    if (matched) {
322 
323 		/* hoorah it matches */
324 		s += SCNT(cs);
325 		cs = SNEXT(cs);
326 	    } else if (*cs & ALT) {
327 
328 		/* alternation, skip to next expression */
329 		cs = SNEXT(cs);
330 	    } else if (*cs & OPT) {
331 
332 		/* the match is optional */
333 		cs = SNEXT(cs);
334 		matched = 1;		/* indicate a successful match */
335 	    } else {
336 
337 		/* no match, error return */
338 		return (NIL);
339 	    }
340 	    break;
341 
342 	/* an operator, do something fancy */
343 	case OPER:
344 	    switch (OSYM(cs)) {
345 
346 	    /* this is an alternation */
347 	    case '|':
348 		if (matched)
349 
350 		    /* last thing in the alternation was a match, skip ahead */
351 		    cs = OPTR(cs);
352 		else
353 
354 		    /* no match, keep trying */
355 		    cs = ONEXT(cs);
356 		break;
357 
358 	    /* this is a grouping, recurse */
359 	    case '(':
360 		ptr = expmatch (s, ONEXT(cs), mstring);
361 		if (ptr != NIL) {
362 
363 		    /* the subexpression matched */
364 		    matched = 1;
365 		    s = ptr;
366 		} else if (*cs & ALT) {
367 
368 		    /* alternation, skip to next expression */
369 		    matched = 0;
370 		} else if (*cs & OPT) {
371 
372 		    /* the match is optional */
373 		    matched = 1;	/* indicate a successful match */
374 		} else {
375 
376 		    /* no match, error return */
377 		    return (NIL);
378 		}
379 		cs = OPTR(cs);
380 		break;
381 	    }
382 	    break;
383 
384 	/* try to match a metasymbol */
385 	case META:
386 	    switch (MSYM(cs)) {
387 
388 	    /* try to match anything and remember what was matched */
389 	    case 'p':
390 		/*
391 		 *  This is really the same as trying the match the
392 		 *  remaining parts of the expression to any subset
393 		 *  of the string.
394 		 */
395 		s1 = s;
396 		do {
397 		    ptr = expmatch (s1, MNEXT(cs), mstring);
398 		    if (ptr != NIL && s1 != s) {
399 
400 			/* we have a match, remember the match */
401 			strncpy (mstring, s, s1 - s);
402 			mstring[s1 - s] = '\0';
403 			return (ptr);
404 		    } else if (ptr != NIL && (*cs & OPT)) {
405 
406 			/* it was aoptional so no match is ok */
407 			return (ptr);
408 		    } else if (ptr != NIL) {
409 
410 			/* not optional and we still matched */
411 			return (NIL);
412 		    }
413 		    if (!isalnum(*s1) && *s1 != '_')
414 			return (NIL);
415 		    if (*s1 == '\\')
416 			_escaped = _escaped ? FALSE : TRUE;
417 		    else
418 			_escaped = FALSE;
419 		} while (*s1++);
420 		return (NIL);
421 
422 	    /* try to match anything */
423 	    case 'a':
424 		/*
425 		 *  This is really the same as trying the match the
426 		 *  remaining parts of the expression to any subset
427 		 *  of the string.
428 		 */
429 		s1 = s;
430 		do {
431 		    ptr = expmatch (s1, MNEXT(cs), mstring);
432 		    if (ptr != NIL && s1 != s) {
433 
434 			/* we have a match */
435 			return (ptr);
436 		    } else if (ptr != NIL && (*cs & OPT)) {
437 
438 			/* it was aoptional so no match is ok */
439 			return (ptr);
440 		    } else if (ptr != NIL) {
441 
442 			/* not optional and we still matched */
443 			return (NIL);
444 		    }
445 		    if (*s1 == '\\')
446 			_escaped = _escaped ? FALSE : TRUE;
447 		    else
448 			_escaped = FALSE;
449 		} while (*s1++);
450 		return (NIL);
451 
452 	    /* fail if we are currently _escaped */
453 	    case 'e':
454 		if (_escaped)
455 		    return(NIL);
456 		cs = MNEXT(cs);
457 		break;
458 
459 	    /* match any number of tabs and spaces */
460 	    case 'd':
461 		ptr = s;
462 		while (*s == ' ' || *s == '\t')
463 		    s++;
464 		if (s != ptr || s == _start) {
465 
466 		    /* match, be happy */
467 		    matched = 1;
468 		    cs = MNEXT(cs);
469 		} else if (*s == '\n' || *s == '\0') {
470 
471 		    /* match, be happy */
472 		    matched = 1;
473 		    cs = MNEXT(cs);
474 		} else if (*cs & ALT) {
475 
476 		    /* try the next part */
477 		    matched = 0;
478 		    cs = MNEXT(cs);
479 		} else if (*cs & OPT) {
480 
481 		    /* doesn't matter */
482 		    matched = 1;
483 		    cs = MNEXT(cs);
484 		} else
485 
486 		    /* no match, error return */
487 		    return (NIL);
488 		break;
489 
490 	    /* check for end of line */
491 	    case '$':
492 		if (*s == '\0' || *s == '\n') {
493 
494 		    /* match, be happy */
495 		    s++;
496 		    matched = 1;
497 		    cs = MNEXT(cs);
498 		} else if (*cs & ALT) {
499 
500 		    /* try the next part */
501 		    matched = 0;
502 		    cs = MNEXT(cs);
503 		} else if (*cs & OPT) {
504 
505 		    /* doesn't matter */
506 		    matched = 1;
507 		    cs = MNEXT(cs);
508 		} else
509 
510 		    /* no match, error return */
511 		    return (NIL);
512 		break;
513 
514 	    /* check for start of line */
515 	    case '^':
516 		if (s == _start) {
517 
518 		    /* match, be happy */
519 		    matched = 1;
520 		    cs = MNEXT(cs);
521 		} else if (*cs & ALT) {
522 
523 		    /* try the next part */
524 		    matched = 0;
525 		    cs = MNEXT(cs);
526 		} else if (*cs & OPT) {
527 
528 		    /* doesn't matter */
529 		    matched = 1;
530 		    cs = MNEXT(cs);
531 		} else
532 
533 		    /* no match, error return */
534 		    return (NIL);
535 		break;
536 
537 	    /* end of a subexpression, return success */
538 	    case ')':
539 		return (s);
540 	    }
541 	    break;
542 	}
543     }
544     return (s);
545 }
546