1 /*
2  * Copyright (c) 1992-1998 Michael A. Cooper.
3  * This software may be freely used and distributed provided it is not
4  * sold for profit or used in part or in whole for commercial gain
5  * without prior written agreement, and the author is credited
6  * appropriately.
7  */
8 /*
9  * Copyright (c) 1983 Regents of the University of California.
10  * All rights reserved.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 #if !defined(lint)
42 static char RCSid[] =
43 "$Id: regex.c,v 6.3 1998/11/10 04:14:28 mcooper Exp $";
44 
45 static char sccsid[] = "@(#)regex.c	5.2 (Berkeley) 3/9/86";
46 
47 static char copyright[] =
48 "Copyright (c) 1992-1998 Michael A. Cooper.\n\
49 @(#) Copyright (c) 1983 Regents of the University of California.\n\
50  All rights reserved.\n";
51 #endif /* not lint */
52 
53 /*
54  * routines to do regular expression matching
55  *
56  * Entry points:
57  *
58  *	re_comp(s)
59  *		char *s;
60  *	 ... returns 0 if the string s was compiled successfully,
61  *		     a pointer to an error message otherwise.
62  *	     If passed 0 or a null string returns without changing
63  *           the currently compiled re (see note 11 below).
64  *
65  *	re_exec(s)
66  *		char *s;
67  *	 ... returns 1 if the string s matches the last compiled regular
68  *		       expression,
69  *		     0 if the string s failed to match the last compiled
70  *		       regular expression, and
71  *		    -1 if the compiled regular expression was invalid
72  *		       (indicating an internal error).
73  *
74  * The strings passed to both re_comp and re_exec may have trailing or
75  * embedded newline characters; they are terminated by nulls.
76  *
77  * The identity of the author of these routines is lost in antiquity;
78  * this is essentially the same as the re code in the original V6 ed.
79  *
80  * The regular expressions recognized are described below. This description
81  * is essentially the same as that for ed.
82  *
83  *	A regular expression specifies a set of strings of characters.
84  *	A member of this set of strings is said to be matched by
85  *	the regular expression.  In the following specification for
86  *	regular expressions the word `character' means any character but NUL.
87  *
88  *	1.  Any character except a special character matches itself.
89  *	    Special characters are the regular expression delimiter plus
90  *	    \ [ . and sometimes ^ * $.
91  *	2.  A . matches any character.
92  *	3.  A \ followed by any character except a digit or ( )
93  *	    matches that character.
94  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
95  *	    character in (or not in) s. In s, \ has no special meaning,
96  *	    and ] may only appear as the first letter. A substring
97  *	    a-b, with a and b in ascending ASCII order, stands for
98  *	    the inclusive range of ASCII characters.
99  *	5.  A regular expression of form 1-4 followed by * matches a
100  *	    sequence of 0 or more matches of the regular expression.
101  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
102  *	    matches what x matches.
103  *	7.  A \ followed by a digit n matches a copy of the string that the
104  *	    bracketed regular expression beginning with the nth \( matched.
105  *	8.  A regular expression of form 1-8, x, followed by a regular
106  *	    expression of form 1-7, y matches a match for x followed by
107  *	    a match for y, with the x match being as long as possible
108  *	    while still permitting a y match.
109  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
110  *	    by $), is constrained to matches that begin at the left
111  *	    (or end at the right) end of a line.
112  *	10. A regular expression of form 1-9 picks out the longest among
113  *	    the leftmost matches in a line.
114  *	11. An empty regular expression stands for a copy of the last
115  *	    regular expression encountered.
116  */
117 
118 /*
119  * constants for re's
120  */
121 #define	CBRA	1
122 #define	CCHR	2
123 #define	CDOT	4
124 #define	CCL	6
125 #define	NCCL	8
126 #define	CDOL	10
127 #define	CEOF	11
128 #define	CKET	12
129 #define	CBACK	18
130 
131 #define	CSTAR	01
132 
133 #define	ESIZE	512
134 #define	NBRA	9
135 
136 static	char	expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
137 static	char	circf;
138 static	int	advance();
139 
140 /*
141  * compile the regular expression argument into a dfa
142  */
143 char *
re_comp(sp)144 re_comp(sp)
145 	register char	*sp;
146 {
147 	register int	c;
148 	register char	*ep = expbuf;
149 	int	cclcnt, numbra = 0;
150 	char	*lastep = 0;
151 	char	bracket[NBRA];
152 	char	*bracketp = &bracket[0];
153 	static	char	*retoolong = "Regular expression too long";
154 
155 #define	comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
156 
157 	if (sp == 0 || *sp == '\0') {
158 		if (*ep == 0)
159 			return("No previous regular expression");
160 		return(0);
161 	}
162 	if (*sp == '^') {
163 		circf = 1;
164 		sp++;
165 	}
166 	else
167 		circf = 0;
168 	for (;;) {
169 		if (ep >= &expbuf[ESIZE])
170 			comerr(retoolong);
171 		if ((c = *sp++) == '\0') {
172 			if (bracketp != bracket)
173 				comerr("unmatched \\(");
174 			*ep++ = CEOF;
175 			*ep++ = 0;
176 			return(0);
177 		}
178 		if (c != '*')
179 			lastep = ep;
180 		switch (c) {
181 
182 		case '.':
183 			*ep++ = CDOT;
184 			continue;
185 
186 		case '*':
187 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
188 				goto defchar;
189 			*lastep |= CSTAR;
190 			continue;
191 
192 		case '$':
193 			if (*sp != '\0')
194 				goto defchar;
195 			*ep++ = CDOL;
196 			continue;
197 
198 		case '[':
199 			*ep++ = CCL;
200 			*ep++ = 0;
201 			cclcnt = 1;
202 			if ((c = *sp++) == '^') {
203 				c = *sp++;
204 				ep[-2] = NCCL;
205 			}
206 			do {
207 				if (c == '\0')
208 					comerr("missing ]");
209 				if (c == '-' && ep [-1] != 0) {
210 					if ((c = *sp++) == ']') {
211 						*ep++ = '-';
212 						cclcnt++;
213 						break;
214 					}
215 					while (ep[-1] < c) {
216 						*ep = ep[-1] + 1;
217 						ep++;
218 						cclcnt++;
219 						if (ep >= &expbuf[ESIZE])
220 							comerr(retoolong);
221 					}
222 				}
223 				*ep++ = c;
224 				cclcnt++;
225 				if (ep >= &expbuf[ESIZE])
226 					comerr(retoolong);
227 			} while ((c = *sp++) != ']');
228 			lastep[1] = cclcnt;
229 			continue;
230 
231 		case '\\':
232 			if ((c = *sp++) == '(') {
233 				if (numbra >= NBRA)
234 					comerr("too many \\(\\) pairs");
235 				*bracketp++ = numbra;
236 				*ep++ = CBRA;
237 				*ep++ = numbra++;
238 				continue;
239 			}
240 			if (c == ')') {
241 				if (bracketp <= bracket)
242 					comerr("unmatched \\)");
243 				*ep++ = CKET;
244 				*ep++ = *--bracketp;
245 				continue;
246 			}
247 			if (c >= '1' && c < ('1' + NBRA)) {
248 				*ep++ = CBACK;
249 				*ep++ = c - '1';
250 				continue;
251 			}
252 			*ep++ = CCHR;
253 			*ep++ = c;
254 			continue;
255 
256 		defchar:
257 		default:
258 			*ep++ = CCHR;
259 			*ep++ = c;
260 		}
261 	}
262 }
263 
264 /*
265  * match the argument string against the compiled re
266  */
267 int
re_exec(p1)268 re_exec(p1)
269 	register char	*p1;
270 {
271 	register char	*p2 = expbuf;
272 	register int	c;
273 	int	rv;
274 
275 	for (c = 0; c < NBRA; c++) {
276 		braslist[c] = 0;
277 		braelist[c] = 0;
278 	}
279 	if (circf)
280 		return((advance(p1, p2)));
281 	/*
282 	 * fast check for first character
283 	 */
284 	if (*p2 == CCHR) {
285 		c = p2[1];
286 		do {
287 			if (*p1 != c)
288 				continue;
289 			if (rv = advance(p1, p2))
290 				return(rv);
291 		} while (*p1++);
292 		return(0);
293 	}
294 	/*
295 	 * regular algorithm
296 	 */
297 	do
298 		if (rv = advance(p1, p2))
299 			return(rv);
300 	while (*p1++);
301 	return(0);
302 }
303 
304 /*
305  * try to match the next thing in the dfa
306  */
307 static	int
advance(lp,ep)308 advance(lp, ep)
309 	register char	*lp, *ep;
310 {
311 	register char	*curlp;
312 	int	ct, i;
313 	int	rv;
314 
315 	for (;;)
316 		switch (*ep++) {
317 
318 		case CCHR:
319 			if (*ep++ == *lp++)
320 				continue;
321 			return(0);
322 
323 		case CDOT:
324 			if (*lp++)
325 				continue;
326 			return(0);
327 
328 		case CDOL:
329 			if (*lp == '\0')
330 				continue;
331 			return(0);
332 
333 		case CEOF:
334 			return(1);
335 
336 		case CCL:
337 			if (cclass(ep, *lp++, 1)) {
338 				ep += *ep;
339 				continue;
340 			}
341 			return(0);
342 
343 		case NCCL:
344 			if (cclass(ep, *lp++, 0)) {
345 				ep += *ep;
346 				continue;
347 			}
348 			return(0);
349 
350 		case CBRA:
351 			braslist[*ep++] = lp;
352 			continue;
353 
354 		case CKET:
355 			braelist[*ep++] = lp;
356 			continue;
357 
358 		case CBACK:
359 			if (braelist[i = *ep++] == 0)
360 				return(-1);
361 			if (backref(i, lp)) {
362 				lp += braelist[i] - braslist[i];
363 				continue;
364 			}
365 			return(0);
366 
367 		case CBACK|CSTAR:
368 			if (braelist[i = *ep++] == 0)
369 				return(-1);
370 			curlp = lp;
371 			ct = braelist[i] - braslist[i];
372 			while (backref(i, lp))
373 				lp += ct;
374 			while (lp >= curlp) {
375 				if (rv = advance(lp, ep))
376 					return(rv);
377 				lp -= ct;
378 			}
379 			continue;
380 
381 		case CDOT|CSTAR:
382 			curlp = lp;
383 			while (*lp++)
384 				;
385 			goto star;
386 
387 		case CCHR|CSTAR:
388 			curlp = lp;
389 			while (*lp++ == *ep)
390 				;
391 			ep++;
392 			goto star;
393 
394 		case CCL|CSTAR:
395 		case NCCL|CSTAR:
396 			curlp = lp;
397 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
398 				;
399 			ep += *ep;
400 			goto star;
401 
402 		star:
403 			do {
404 				lp--;
405 				if (rv = advance(lp, ep))
406 					return(rv);
407 			} while (lp > curlp);
408 			return(0);
409 
410 		default:
411 			return(-1);
412 		}
413 }
414 
backref(i,lp)415 backref(i, lp)
416 	register int	i;
417 	register char	*lp;
418 {
419 	register char	*bp;
420 
421 	bp = braslist[i];
422 	while (*bp++ == *lp++)
423 		if (bp >= braelist[i])
424 			return(1);
425 	return(0);
426 }
427 
428 int
cclass(set,c,af)429 cclass(set, c, af)
430 	register char	*set, c;
431 	int	af;
432 {
433 	register int	n;
434 
435 	if (c == 0)
436 		return(0);
437 	n = *set++;
438 	while (--n)
439 		if (*set++ == c)
440 			return(af);
441 	return(! af);
442 }
443