xref: /freebsd/usr.bin/vgrind/regexp.c (revision bdcbfde3)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 
34 #ifndef lint
35 static const char copyright[] =
36 "@(#) Copyright (c) 1980, 1993\n\
37 	The Regents of the University of California.  All rights reserved.\n";
38 #endif
39 
40 
41 #include <ctype.h>
42 #include <stdlib.h>
43 #include <stdbool.h>
44 #include <string.h>
45 
46 #include "extern.h"
47 
48 static void	expconv(void);
49 
50 bool	 _escaped;	/* true if we are currently x_escaped */
51 char	*s_start;	/* start of string */
52 bool	 l_onecase;	/* true if upper and lower equivalent */
53 
54 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
55 
56 /*  STRNCMP -	like strncmp except that we convert the
57  *	 	first string to lower case before comparing
58  *		if l_onecase is set.
59  */
60 
61 int
62 STRNCMP(register char *s1, register char *s2, register int len)
63 {
64 	if (l_onecase) {
65 	    do
66 		if (*s2 - makelower(*s1))
67 			return (*s2 - makelower(*s1));
68 		else {
69 			s2++;
70 			s1++;
71 		}
72 	    while (--len);
73 	} else {
74 	    do
75 		if (*s2 - *s1)
76 			return (*s2 - *s1);
77 		else {
78 			s2++;
79 			s1++;
80 		}
81 	    while (--len);
82 	}
83 	return(0);
84 }
85 
86 /*	The following routine converts an irregular expression to
87  *	internal format.
88  *
89  *	Either meta symbols (\a \d or \p) or character strings or
90  *	operations ( alternation or parenthesizing ) can be
91  *	specified.  Each starts with a descriptor byte.  The descriptor
92  *	byte has STR set for strings, META set for meta symbols
93  *	and OPER set for operations.
94  *	The descriptor byte can also have the OPT bit set if the object
95  *	defined is optional.  Also ALT can be set to indicate an alternation.
96  *
97  *	For metasymbols the byte following the descriptor byte identities
98  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
99  *	strings the byte after the descriptor is a character count for
100  *	the string:
101  *
102  *		meta symbols := descriptor
103  *				symbol
104  *
105  *		strings :=	descriptor
106  *				character count
107  *				the string
108  *
109  *		operations :=	descriptor
110  *				symbol
111  *				character count
112  */
113 
114 /*
115  *  handy macros for accessing parts of match blocks
116  */
117 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
118 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
119 
120 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
121 #define OCNT(A) (*(A+2))	/* character count */
122 #define ONEXT(A) (A+3)		/* next character after the operation */
123 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
124 
125 #define SCNT(A) (*(A+1))	/* byte count of a string */
126 #define SSTR(A) (A+2)		/* address of the string */
127 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
128 
129 /*
130  *  bit flags in the descriptor
131  */
132 #define OPT 1
133 #define STR 2
134 #define META 4
135 #define ALT 8
136 #define OPER 16
137 
138 static char *ccre;	/* pointer to current position in converted exp*/
139 static char *ure;	/* pointer current position in unconverted exp */
140 
141 /* re: unconverted irregular expression */
142 char *
143 convexp(char *re)
144 {
145     register char *cre;		/* pointer to converted regular expression */
146 
147     /* allocate room for the converted expression */
148     if (re == NULL)
149 	return (NULL);
150     if (*re == '\0')
151 	return (NULL);
152     cre = malloc(4 * strlen(re) + 3);
153     ccre = cre;
154     ure = re;
155 
156     /* start the conversion with a \a */
157     *cre = META | OPT;
158     MSYM(cre) = 'a';
159     ccre = MNEXT(cre);
160 
161     /* start the conversion (its recursive) */
162     expconv ();
163     *ccre = 0;
164     return (cre);
165 }
166 
167 static void
168 expconv()
169 {
170     register char *cs;		/* pointer to current symbol in converted exp */
171     register char c;		/* character being processed */
172     register char *acs;		/* pinter to last alternate */
173     register int temp;
174 
175     /* let the conversion begin */
176     acs = NULL;
177     cs = NULL;
178     while (*ure) {
179 	switch (c = *ure++) {
180 
181 	case '\\':
182 	    switch (c = *ure++) {
183 
184 	    /* escaped characters are just characters */
185 	    default:
186 		if (cs == NULL || (*cs & STR) == 0) {
187 		    cs = ccre;
188 		    *cs = STR;
189 		    SCNT(cs) = 1;
190 		    ccre += 2;
191 		} else
192 		    SCNT(cs)++;
193 		*ccre++ = c;
194 		break;
195 
196 	    /* normal(?) metacharacters */
197 	    case 'a':
198 	    case 'd':
199 	    case 'e':
200 	    case 'p':
201 		if (acs != NULL && acs != cs) {
202 		    do {
203 			temp = OCNT(acs);
204 			OCNT(acs) = ccre - acs;
205 			acs -= temp;
206 		    } while (temp != 0);
207 		    acs = NULL;
208 		}
209 		cs = ccre;
210 		*cs = META;
211 		MSYM(cs) = c;
212 		ccre = MNEXT(cs);
213 		break;
214 	    }
215 	    break;
216 
217 	/* just put the symbol in */
218 	case '^':
219 	case '$':
220 	    if (acs != NULL && acs != cs) {
221 		do {
222 		    temp = OCNT(acs);
223 		    OCNT(acs) = ccre - acs;
224 		    acs -= temp;
225 		} while (temp != 0);
226 		acs = NULL;
227 	    }
228 	    cs = ccre;
229 	    *cs = META;
230 	    MSYM(cs) = c;
231 	    ccre = MNEXT(cs);
232 	    break;
233 
234 	/* mark the last match sequence as optional */
235 	case '?':
236 	    if (cs)
237 	    	*cs = *cs | OPT;
238 	    break;
239 
240 	/* recurse and define a subexpression */
241 	case '(':
242 	    if (acs != NULL && acs != cs) {
243 		do {
244 		    temp = OCNT(acs);
245 		    OCNT(acs) = ccre - acs;
246 		    acs -= temp;
247 		} while (temp != 0);
248 		acs = NULL;
249 	    }
250 	    cs = ccre;
251 	    *cs = OPER;
252 	    OSYM(cs) = '(';
253 	    ccre = ONEXT(cs);
254 	    expconv();
255 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
256 	    break;
257 
258 	/* return from a recursion */
259 	case ')':
260 	    if (acs != NULL) {
261 		do {
262 		    temp = OCNT(acs);
263 		    OCNT(acs) = ccre - acs;
264 		    acs -= temp;
265 		} while (temp != 0);
266 		acs = NULL;
267 	    }
268 	    cs = ccre;
269 	    *cs = META;
270 	    MSYM(cs) = c;
271 	    ccre = MNEXT(cs);
272 	    return;
273 
274 	/* mark the last match sequence as having an alternate */
275 	/* the third byte will contain an offset to jump over the */
276 	/* alternate match in case the first did not fail */
277 	case '|':
278 	    if (acs != NULL && acs != cs)
279 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
280 	    else
281 		OCNT(ccre) = 0;
282 	    *cs |= ALT;
283 	    cs = ccre;
284 	    *cs = OPER;
285 	    OSYM(cs) = '|';
286 	    ccre = ONEXT(cs);
287 	    acs = cs;	/* remember that the pointer is to be filles */
288 	    break;
289 
290 	/* if its not a metasymbol just build a scharacter string */
291 	default:
292 	    if (cs == NULL || (*cs & STR) == 0) {
293 		cs = ccre;
294 		*cs = STR;
295 		SCNT(cs) = 1;
296 		ccre = SSTR(cs);
297 	    } else
298 		SCNT(cs)++;
299 	    *ccre++ = c;
300 	    break;
301 	}
302     }
303     if (acs != NULL) {
304 	do {
305 	    temp = OCNT(acs);
306 	    OCNT(acs) = ccre - acs;
307 	    acs -= temp;
308 	} while (temp != 0);
309 	acs = NULL;
310     }
311     return;
312 }
313 /* end of convertre */
314 
315 
316 /*
317  *	The following routine recognises an irregular expression
318  *	with the following special characters:
319  *
320  *		\?	-	means last match was optional
321  *		\a	-	matches any number of characters
322  *		\d	-	matches any number of spaces and tabs
323  *		\p	-	matches any number of alphanumeric
324  *				characters. The
325  *				characters matched will be copied into
326  *				the area pointed to by 'name'.
327  *		\|	-	alternation
328  *		\( \)	-	grouping used mostly for alternation and
329  *				optionality
330  *
331  *	The irregular expression must be translated to internal form
332  *	prior to calling this routine
333  *
334  *	The value returned is the pointer to the first non \a
335  *	character matched.
336  */
337 
338 /*
339  *  s: string to check for a match in
340  *  re: a converted irregular expression
341  *  mstring: where to put whatever matches a \p
342  */
343 char *
344 expmatch (register char *s, register char *re, register char *mstring)
345 {
346     register char *cs;		/* the current symbol */
347     register char *ptr,*s1;	/* temporary pointer */
348     bool matched;	/* a temporary bool */
349 
350     /* initial conditions */
351     if (re == NULL)
352 	return (NULL);
353     cs = re;
354     matched = false;
355 
356     /* loop till expression string is exhausted (or at least pretty tired) */
357     while (*cs) {
358 	switch (*cs & (OPER | STR | META)) {
359 
360 	/* try to match a string */
361 	case STR:
362 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
363 	    if (matched) {
364 
365 		/* hoorah it matches */
366 		s += SCNT(cs);
367 		cs = SNEXT(cs);
368 	    } else if (*cs & ALT) {
369 
370 		/* alternation, skip to next expression */
371 		cs = SNEXT(cs);
372 	    } else if (*cs & OPT) {
373 
374 		/* the match is optional */
375 		cs = SNEXT(cs);
376 		matched = 1;		/* indicate a successful match */
377 	    } else {
378 
379 		/* no match, error return */
380 		return (NULL);
381 	    }
382 	    break;
383 
384 	/* an operator, do something fancy */
385 	case OPER:
386 	    switch (OSYM(cs)) {
387 
388 	    /* this is an alternation */
389 	    case '|':
390 		if (matched)
391 
392 		    /* last thing in the alternation was a match, skip ahead */
393 		    cs = OPTR(cs);
394 		else
395 
396 		    /* no match, keep trying */
397 		    cs = ONEXT(cs);
398 		break;
399 
400 	    /* this is a grouping, recurse */
401 	    case '(':
402 		ptr = expmatch(s, ONEXT(cs), mstring);
403 		if (ptr != NULL) {
404 
405 		    /* the subexpression matched */
406 		    matched = 1;
407 		    s = ptr;
408 		} else if (*cs & ALT) {
409 
410 		    /* alternation, skip to next expression */
411 		    matched = 0;
412 		} else if (*cs & OPT) {
413 
414 		    /* the match is optional */
415 		    matched = 1;	/* indicate a successful match */
416 		} else {
417 
418 		    /* no match, error return */
419 		    return (NULL);
420 		}
421 		cs = OPTR(cs);
422 		break;
423 	    }
424 	    break;
425 
426 	/* try to match a metasymbol */
427 	case META:
428 	    switch (MSYM(cs)) {
429 
430 	    /* try to match anything and remember what was matched */
431 	    case 'p':
432 		/*
433 		 *  This is really the same as trying the match the
434 		 *  remaining parts of the expression to any subset
435 		 *  of the string.
436 		 */
437 		s1 = s;
438 		do {
439 		    ptr = expmatch(s1, MNEXT(cs), mstring);
440 		    if (ptr != NULL && s1 != s) {
441 
442 			/* we have a match, remember the match */
443 			strncpy (mstring, s, s1 - s);
444 			mstring[s1 - s] = '\0';
445 			return (ptr);
446 		    } else if (ptr != NULL && (*cs & OPT)) {
447 
448 			/* it was aoptional so no match is ok */
449 			return (ptr);
450 		    } else if (ptr != NULL) {
451 
452 			/* not optional and we still matched */
453 			return (NULL);
454 		    }
455 		    if (!(isalnum(*s1) || *s1 == '_' ||
456 			  /* C++ destructor */
457 			  *s1 == '~' ||
458 			  /* C++ scope operator */
459 			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
460 			   (s1++, true))))
461 			return (NULL);
462 		    if (*s1 == '\\')
463 			_escaped = _escaped ? false : true;
464 		    else
465 			_escaped = false;
466 		} while (*s1++);
467 		return (NULL);
468 
469 	    /* try to match anything */
470 	    case 'a':
471 		/*
472 		 *  This is really the same as trying the match the
473 		 *  remaining parts of the expression to any subset
474 		 *  of the string.
475 		 */
476 		s1 = s;
477 		do {
478 		    ptr = expmatch(s1, MNEXT(cs), mstring);
479 		    if (ptr != NULL && s1 != s) {
480 
481 			/* we have a match */
482 			return (ptr);
483 		    } else if (ptr != NULL && (*cs & OPT)) {
484 
485 			/* it was aoptional so no match is ok */
486 			return (ptr);
487 		    } else if (ptr != NULL) {
488 
489 			/* not optional and we still matched */
490 			return (NULL);
491 		    }
492 		    if (*s1 == '\\')
493 			_escaped = _escaped ? false : true;
494 		    else
495 			_escaped = false;
496 		} while (*s1++);
497 		return (NULL);
498 
499 	    /* fail if we are currently _escaped */
500 	    case 'e':
501 		if (_escaped)
502 		    return(NULL);
503 		cs = MNEXT(cs);
504 		break;
505 
506 	    /* match any number of tabs and spaces */
507 	    case 'd':
508 		ptr = s;
509 		while (*s == ' ' || *s == '\t')
510 		    s++;
511 		if (s != ptr || s == s_start) {
512 
513 		    /* match, be happy */
514 		    matched = 1;
515 		    cs = MNEXT(cs);
516 		} else if (*s == '\n' || *s == '\0') {
517 
518 		    /* match, be happy */
519 		    matched = 1;
520 		    cs = MNEXT(cs);
521 		} else if (*cs & ALT) {
522 
523 		    /* try the next part */
524 		    matched = 0;
525 		    cs = MNEXT(cs);
526 		} else if (*cs & OPT) {
527 
528 		    /* doesn't matter */
529 		    matched = 1;
530 		    cs = MNEXT(cs);
531 		} else
532 
533 		    /* no match, error return */
534 		    return (NULL);
535 		break;
536 
537 	    /* check for end of line */
538 	    case '$':
539 		if (*s == '\0' || *s == '\n') {
540 
541 		    /* match, be happy */
542 		    s++;
543 		    matched = 1;
544 		    cs = MNEXT(cs);
545 		} else if (*cs & ALT) {
546 
547 		    /* try the next part */
548 		    matched = 0;
549 		    cs = MNEXT(cs);
550 		} else if (*cs & OPT) {
551 
552 		    /* doesn't matter */
553 		    matched = 1;
554 		    cs = MNEXT(cs);
555 		} else
556 
557 		    /* no match, error return */
558 		    return (NULL);
559 		break;
560 
561 	    /* check for start of line */
562 	    case '^':
563 		if (s == s_start) {
564 
565 		    /* match, be happy */
566 		    matched = 1;
567 		    cs = MNEXT(cs);
568 		} else if (*cs & ALT) {
569 
570 		    /* try the next part */
571 		    matched = 0;
572 		    cs = MNEXT(cs);
573 		} else if (*cs & OPT) {
574 
575 		    /* doesn't matter */
576 		    matched = 1;
577 		    cs = MNEXT(cs);
578 		} else
579 
580 		    /* no match, error return */
581 		    return (NULL);
582 		break;
583 
584 	    /* end of a subexpression, return success */
585 	    case ')':
586 		return (s);
587 	    }
588 	    break;
589 	}
590     }
591     return (s);
592 }
593