xref: /original-bsd/old/regexp/regexp.c (revision 2301fdfb)
1 #include <ctype.h>
2 
3 typedef int	boolean;
4 #define TRUE	1
5 #define FALSE	0
6 #define NIL	0
7 
8 boolean l_onecase;	/* true if upper and lower equivalent */
9 
10 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
11 
12 /*  STRNCMP -	like strncmp except that we convert the
13  *	 	first string to lower case before comparing
14  *		if l_onecase is set.
15  */
16 
17 STRNCMP(s1, s2, len)
18 	register char *s1,*s2;
19 	register int len;
20 {
21 	if (l_onecase) {
22 	    do
23 		if (*s2 - makelower(*s1))
24 			return (*s2 - makelower(*s1));
25 		else {
26 			s2++;
27 			s1++;
28 		}
29 	    while (--len);
30 	} else {
31 	    do
32 		if (*s2 - *s1)
33 			return (*s2 - *s1);
34 		else {
35 			s2++;
36 			s1++;
37 		}
38 	    while (--len);
39 	}
40 	return(0);
41 }
42 
43 /*	The following routine converts an irregular expression to
44  *	internal format.
45  *
46  *	Either meta symbols (\a \d or \p) or character strings or
47  *	operations ( alternation or perenthesizing ) can be
48  *	specified.  Each starts with a descriptor byte.  The descriptor
49  *	byte has STR set for strings, META set for meta symbols
50  *	and OPER set for operations.
51  *	The descriptor byte can also have the OPT bit set if the object
52  *	defined is optional.  Also ALT can be set to indicate an alternation.
53  *
54  *	For metasymbols the byte following the descriptor byte identities
55  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
56  *	strings the byte after the descriptor is a character count for
57  *	the string:
58  *
59  *		meta symbols := descriptor
60  *				symbol
61  *
62  *		strings :=	descriptor
63  *				character count
64  *				the string
65  *
66  *		operatins :=	descriptor
67  *				symbol
68  *				character count
69  */
70 
71 /*
72  *  handy macros for accessing parts of match blocks
73  */
74 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
75 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
76 
77 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
78 #define OCNT(A) (*(A+2))	/* character count */
79 #define ONEXT(A) (A+3)		/* next character after the operation */
80 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
81 
82 #define SCNT(A) (*(A+1))	/* byte count of a string */
83 #define SSTR(A) (A+2)		/* address of the string */
84 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
85 
86 /*
87  *  bit flags in the descriptor
88  */
89 #define OPT 1
90 #define STR 2
91 #define META 4
92 #define ALT 8
93 #define OPER 16
94 
95 char *ure;		/* pointer current position in unconverted exp */
96 char *ccre;		/* pointer to current position in converted exp*/
97 char *malloc();
98 
99 char *
100 convexp(re)
101     char *re;		/* unconverted irregular expression */
102 {
103     register char *cre;		/* pointer to converted regular expression */
104 
105     /* allocate room for the converted expression */
106     if (re == NIL)
107 	return (NIL);
108     if (*re == '\0')
109 	return (NIL);
110     cre = malloc (4 * strlen(re) + 3);
111     ccre = cre;
112     ure = re;
113 
114     /* start the conversion with a \a */
115     *cre = META | OPT;
116     MSYM(cre) = 'a';
117     ccre = MNEXT(cre);
118 
119     /* start the conversion (its recursive) */
120     expconv ();
121     *ccre = 0;
122     return (cre);
123 }
124 
125 expconv()
126 {
127     register char *cs;		/* pointer to current symbol in converted exp */
128     register char c;		/* character being processed */
129     register char *acs;		/* pinter to last alternate */
130     register int temp;
131 
132     /* let the conversion begin */
133     acs = NIL;
134     while (*ure != NIL) {
135 	switch (c = *ure++) {
136 
137 	case '\\':
138 	    switch (c = *ure++) {
139 
140 	    /* escaped characters are just characters */
141 	    default:
142 		if ((*cs & STR) == 0) {
143 		    cs = ccre;
144 		    *cs = STR;
145 		    SCNT(cs) = 1;
146 		    ccre += 2;
147 		} else
148 		    SCNT(cs)++;
149 		*ccre++ = c;
150 		break;
151 
152 	    /* normal(?) metacharacters */
153 	    case 'a':
154 	    case 'd':
155 	    case 'e':
156 	    case 'p':
157 		if (acs != NIL && acs != cs) {
158 		    do {
159 			temp = OCNT(acs);
160 			OCNT(acs) = ccre - acs;
161 			acs -= temp;
162 		    } while (temp != 0);
163 		    acs = NIL;
164 		}
165 		cs = ccre;
166 		*cs = META;
167 		MSYM(cs) = c;
168 		ccre = MNEXT(cs);
169 		break;
170 	    }
171 	    break;
172 
173 	/* just put the symbol in */
174 	case '^':
175 	case '$':
176 	    if (acs != NIL && acs != cs) {
177 		do {
178 		    temp = OCNT(acs);
179 		    OCNT(acs) = ccre - acs;
180 		    acs -= temp;
181 		} while (temp != 0);
182 		acs = NIL;
183 	    }
184 	    cs = ccre;
185 	    *cs = META;
186 	    MSYM(cs) = c;
187 	    ccre = MNEXT(cs);
188 	    break;
189 
190 	/* mark the last match sequence as optional */
191 	case '?':
192 	    *cs = *cs | OPT;
193 	    break;
194 
195 	/* recurse and define a subexpression */
196 	case '(':
197 	    if (acs != NIL && acs != cs) {
198 		do {
199 		    temp = OCNT(acs);
200 		    OCNT(acs) = ccre - acs;
201 		    acs -= temp;
202 		} while (temp != 0);
203 		acs = NIL;
204 	    }
205 	    cs = ccre;
206 	    *cs = OPER;
207 	    OSYM(cs) = '(';
208 	    ccre = ONEXT(cs);
209 	    expconv ();
210 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
211 	    break;
212 
213 	/* return from a recursion */
214 	case ')':
215 	    if (acs != NIL) {
216 		do {
217 		    temp = OCNT(acs);
218 		    OCNT(acs) = ccre - acs;
219 		    acs -= temp;
220 		} while (temp != 0);
221 		acs = NIL;
222 	    }
223 	    cs = ccre;
224 	    *cs = META;
225 	    MSYM(cs) = c;
226 	    ccre = MNEXT(cs);
227 	    return;
228 
229 	/* mark the last match sequence as having an alternate */
230 	/* the third byte will contain an offset to jump over the */
231 	/* alternate match in case the first did not fail */
232 	case '|':
233 	    if (acs != NIL && acs != cs)
234 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
235 	    else
236 		OCNT(ccre) = 0;
237 	    *cs |= ALT;
238 	    cs = ccre;
239 	    *cs = OPER;
240 	    OSYM(cs) = '|';
241 	    ccre = ONEXT(cs);
242 	    acs = cs;	/* remember that the pointer is to be filles */
243 	    break;
244 
245 	/* if its not a metasymbol just build a scharacter string */
246 	default:
247 	    if ((*cs & STR) == 0) {
248 		cs = ccre;
249 		*cs = STR;
250 		SCNT(cs) = 1;
251 		ccre = SSTR(cs);
252 	    } else
253 		SCNT(cs)++;
254 	    *ccre++ = c;
255 	    break;
256 	}
257     }
258     if (acs != NIL) {
259 	do {
260 	    temp = OCNT(acs);
261 	    OCNT(acs) = ccre - acs;
262 	    acs -= temp;
263 	} while (temp != 0);
264 	acs = NIL;
265     }
266     return;
267 }
268 /* end of convertre */
269 
270 
271 /*
272  *	The following routine recognises an irregular expresion
273  *	with the following special characters:
274  *
275  *		\?	-	means last match was optional
276  *		\a	-	matches any number of characters
277  *		\d	-	matches any number of spaces and tabs
278  *		\p	-	matches any number of alphanumeric
279  *				characters. The
280  *				characters matched will be copied into
281  *				the area pointed to by 'name'.
282  *		\|	-	alternation
283  *		\( \)	-	grouping used mostly for alternation and
284  *				optionality
285  *
286  *	The irregular expression must be translated to internal form
287  *	prior to calling this routine
288  *
289  *	The value returned is the pointer to the first non \a
290  *	character matched.
291  */
292 
293 boolean _escaped;		/* true if we are currently _escaped */
294 char *_start;			/* start of string */
295 
296 char *
297 expmatch (s, re, mstring)
298     register char *s;		/* string to check for a match in */
299     register char *re;		/* a converted irregular expression */
300     register char *mstring;	/* where to put whatever matches a \p */
301 {
302     register char *cs;		/* the current symbol */
303     register char *ptr,*s1;	/* temporary pointer */
304     boolean matched;		/* a temporary boolean */
305 
306     /* initial conditions */
307     if (re == NIL)
308 	return (NIL);
309     cs = re;
310     matched = FALSE;
311 
312     /* loop till expression string is exhausted (or at least pretty tired) */
313     while (*cs) {
314 	switch (*cs & (OPER | STR | META)) {
315 
316 	/* try to match a string */
317 	case STR:
318 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
319 	    if (matched) {
320 
321 		/* hoorah it matches */
322 		s += SCNT(cs);
323 		cs = SNEXT(cs);
324 	    } else if (*cs & ALT) {
325 
326 		/* alternation, skip to next expression */
327 		cs = SNEXT(cs);
328 	    } else if (*cs & OPT) {
329 
330 		/* the match is optional */
331 		cs = SNEXT(cs);
332 		matched = 1;		/* indicate a successful match */
333 	    } else {
334 
335 		/* no match, error return */
336 		return (NIL);
337 	    }
338 	    break;
339 
340 	/* an operator, do something fancy */
341 	case OPER:
342 	    switch (OSYM(cs)) {
343 
344 	    /* this is an alternation */
345 	    case '|':
346 		if (matched)
347 
348 		    /* last thing in the alternation was a match, skip ahead */
349 		    cs = OPTR(cs);
350 		else
351 
352 		    /* no match, keep trying */
353 		    cs = ONEXT(cs);
354 		break;
355 
356 	    /* this is a grouping, recurse */
357 	    case '(':
358 		ptr = expmatch (s, ONEXT(cs), mstring);
359 		if (ptr != NIL) {
360 
361 		    /* the subexpression matched */
362 		    matched = 1;
363 		    s = ptr;
364 		} else if (*cs & ALT) {
365 
366 		    /* alternation, skip to next expression */
367 		    matched = 0;
368 		} else if (*cs & OPT) {
369 
370 		    /* the match is optional */
371 		    matched = 1;	/* indicate a successful match */
372 		} else {
373 
374 		    /* no match, error return */
375 		    return (NIL);
376 		}
377 		cs = OPTR(cs);
378 		break;
379 	    }
380 	    break;
381 
382 	/* try to match a metasymbol */
383 	case META:
384 	    switch (MSYM(cs)) {
385 
386 	    /* try to match anything and remember what was matched */
387 	    case 'p':
388 		/*
389 		 *  This is really the same as trying the match the
390 		 *  remaining parts of the expression to any subset
391 		 *  of the string.
392 		 */
393 		s1 = s;
394 		do {
395 		    ptr = expmatch (s1, MNEXT(cs), mstring);
396 		    if (ptr != NIL && s1 != s) {
397 
398 			/* we have a match, remember the match */
399 			strncpy (mstring, s, s1 - s);
400 			mstring[s1 - s] = '\0';
401 			return (ptr);
402 		    } else if (ptr != NIL && (*cs & OPT)) {
403 
404 			/* it was aoptional so no match is ok */
405 			return (ptr);
406 		    } else if (ptr != NIL) {
407 
408 			/* not optional and we still matched */
409 			return (NIL);
410 		    }
411 		    if (!isalnum(*s1) && *s1 != '_')
412 			return (NIL);
413 		    if (*s1 == '\\')
414 			_escaped = _escaped ? FALSE : TRUE;
415 		    else
416 			_escaped = FALSE;
417 		} while (*s1++);
418 		return (NIL);
419 
420 	    /* try to match anything */
421 	    case 'a':
422 		/*
423 		 *  This is really the same as trying the match the
424 		 *  remaining parts of the expression to any subset
425 		 *  of the string.
426 		 */
427 		s1 = s;
428 		do {
429 		    ptr = expmatch (s1, MNEXT(cs), mstring);
430 		    if (ptr != NIL && s1 != s) {
431 
432 			/* we have a match */
433 			return (ptr);
434 		    } else if (ptr != NIL && (*cs & OPT)) {
435 
436 			/* it was aoptional so no match is ok */
437 			return (ptr);
438 		    } else if (ptr != NIL) {
439 
440 			/* not optional and we still matched */
441 			return (NIL);
442 		    }
443 		    if (*s1 == '\\')
444 			_escaped = _escaped ? FALSE : TRUE;
445 		    else
446 			_escaped = FALSE;
447 		} while (*s1++);
448 		return (NIL);
449 
450 	    /* fail if we are currently _escaped */
451 	    case 'e':
452 		if (_escaped)
453 		    return(NIL);
454 		cs = MNEXT(cs);
455 		break;
456 
457 	    /* match any number of tabs and spaces */
458 	    case 'd':
459 		ptr = s;
460 		while (*s == ' ' || *s == '\t')
461 		    s++;
462 		if (s != ptr || s == _start) {
463 
464 		    /* match, be happy */
465 		    matched = 1;
466 		    cs = MNEXT(cs);
467 		} else if (*s == '\n' || *s == '\0') {
468 
469 		    /* match, be happy */
470 		    matched = 1;
471 		    cs = MNEXT(cs);
472 		} else if (*cs & ALT) {
473 
474 		    /* try the next part */
475 		    matched = 0;
476 		    cs = MNEXT(cs);
477 		} else if (*cs & OPT) {
478 
479 		    /* doesn't matter */
480 		    matched = 1;
481 		    cs = MNEXT(cs);
482 		} else
483 
484 		    /* no match, error return */
485 		    return (NIL);
486 		break;
487 
488 	    /* check for end of line */
489 	    case '$':
490 		if (*s == '\0' || *s == '\n') {
491 
492 		    /* match, be happy */
493 		    s++;
494 		    matched = 1;
495 		    cs = MNEXT(cs);
496 		} else if (*cs & ALT) {
497 
498 		    /* try the next part */
499 		    matched = 0;
500 		    cs = MNEXT(cs);
501 		} else if (*cs & OPT) {
502 
503 		    /* doesn't matter */
504 		    matched = 1;
505 		    cs = MNEXT(cs);
506 		} else
507 
508 		    /* no match, error return */
509 		    return (NIL);
510 		break;
511 
512 	    /* check for start of line */
513 	    case '^':
514 		if (s == _start) {
515 
516 		    /* match, be happy */
517 		    matched = 1;
518 		    cs = MNEXT(cs);
519 		} else if (*cs & ALT) {
520 
521 		    /* try the next part */
522 		    matched = 0;
523 		    cs = MNEXT(cs);
524 		} else if (*cs & OPT) {
525 
526 		    /* doesn't matter */
527 		    matched = 1;
528 		    cs = MNEXT(cs);
529 		} else
530 
531 		    /* no match, error return */
532 		    return (NIL);
533 		break;
534 
535 	    /* end of a subexpression, return success */
536 	    case ')':
537 		return (s);
538 	    }
539 	    break;
540 	}
541     }
542     return (s);
543 }
544