xref: /original-bsd/lib/libcompat/4.3/regex.c (revision c3e32dec)
1 /*-
2  * Copyright (c) 1980, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * %sccs.include.proprietary.c%
6  */
7 
8 #if defined(LIBC_SCCS) && !defined(lint)
9 static char sccsid[] = "@(#)regex.c	8.1 (Berkeley) 06/04/93";
10 #endif /* LIBC_SCCS and not lint */
11 
12 #include <unistd.h>
13 
14 static int advance();
15 static int backref();
16 static int cclass();
17 
18 /*
19  * routines to do regular expression matching
20  *
21  * Entry points:
22  *
23  *	re_comp(s)
24  *		char *s;
25  *	 ... returns 0 if the string s was compiled successfully,
26  *		     a pointer to an error message otherwise.
27  *	     If passed 0 or a null string returns without changing
28  *           the currently compiled re (see note 11 below).
29  *
30  *	re_exec(s)
31  *		char *s;
32  *	 ... returns 1 if the string s matches the last compiled regular
33  *		       expression,
34  *		     0 if the string s failed to match the last compiled
35  *		       regular expression, and
36  *		    -1 if the compiled regular expression was invalid
37  *		       (indicating an internal error).
38  *
39  * The strings passed to both re_comp and re_exec may have trailing or
40  * embedded newline characters; they are terminated by nulls.
41  *
42  * The identity of the author of these routines is lost in antiquity;
43  * this is essentially the same as the re code in the original V6 ed.
44  *
45  * The regular expressions recognized are described below. This description
46  * is essentially the same as that for ed.
47  *
48  *	A regular expression specifies a set of strings of characters.
49  *	A member of this set of strings is said to be matched by
50  *	the regular expression.  In the following specification for
51  *	regular expressions the word `character' means any character but NUL.
52  *
53  *	1.  Any character except a special character matches itself.
54  *	    Special characters are the regular expression delimiter plus
55  *	    \ [ . and sometimes ^ * $.
56  *	2.  A . matches any character.
57  *	3.  A \ followed by any character except a digit or ( )
58  *	    matches that character.
59  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
60  *	    character in (or not in) s. In s, \ has no special meaning,
61  *	    and ] may only appear as the first letter. A substring
62  *	    a-b, with a and b in ascending ASCII order, stands for
63  *	    the inclusive range of ASCII characters.
64  *	5.  A regular expression of form 1-4 followed by * matches a
65  *	    sequence of 0 or more matches of the regular expression.
66  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
67  *	    matches what x matches.
68  *	7.  A \ followed by a digit n matches a copy of the string that the
69  *	    bracketed regular expression beginning with the nth \( matched.
70  *	8.  A regular expression of form 1-8, x, followed by a regular
71  *	    expression of form 1-7, y matches a match for x followed by
72  *	    a match for y, with the x match being as long as possible
73  *	    while still permitting a y match.
74  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
75  *	    by $), is constrained to matches that begin at the left
76  *	    (or end at the right) end of a line.
77  *	10. A regular expression of form 1-9 picks out the longest among
78  *	    the leftmost matches in a line.
79  *	11. An empty regular expression stands for a copy of the last
80  *	    regular expression encountered.
81  */
82 
83 /*
84  * constants for re's
85  */
86 #define	CBRA	1
87 #define	CCHR	2
88 #define	CDOT	4
89 #define	CCL	6
90 #define	NCCL	8
91 #define	CDOL	10
92 #define	CEOF	11
93 #define	CKET	12
94 #define	CBACK	18
95 
96 #define	CSTAR	01
97 
98 #define	ESIZE	512
99 #define	NBRA	9
100 
101 static	char	expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
102 static	char	circf;
103 
104 /*
105  * compile the regular expression argument into a dfa
106  */
107 char *
108 re_comp(sp)
109 	register const char	*sp;
110 {
111 	register int	c;
112 	register char	*ep = expbuf;
113 	int	cclcnt, numbra = 0;
114 	char	*lastep = 0;
115 	char	bracket[NBRA];
116 	char	*bracketp = &bracket[0];
117 	static	char	*retoolong = "Regular expression too long";
118 
119 #define	comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
120 
121 	if (sp == 0 || *sp == '\0') {
122 		if (*ep == 0)
123 			return("No previous regular expression");
124 		return(0);
125 	}
126 	if (*sp == '^') {
127 		circf = 1;
128 		sp++;
129 	}
130 	else
131 		circf = 0;
132 	for (;;) {
133 		if (ep >= &expbuf[ESIZE])
134 			comerr(retoolong);
135 		if ((c = *sp++) == '\0') {
136 			if (bracketp != bracket)
137 				comerr("unmatched \\(");
138 			*ep++ = CEOF;
139 			*ep++ = 0;
140 			return(0);
141 		}
142 		if (c != '*')
143 			lastep = ep;
144 		switch (c) {
145 
146 		case '.':
147 			*ep++ = CDOT;
148 			continue;
149 
150 		case '*':
151 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
152 				goto defchar;
153 			*lastep |= CSTAR;
154 			continue;
155 
156 		case '$':
157 			if (*sp != '\0')
158 				goto defchar;
159 			*ep++ = CDOL;
160 			continue;
161 
162 		case '[':
163 			*ep++ = CCL;
164 			*ep++ = 0;
165 			cclcnt = 1;
166 			if ((c = *sp++) == '^') {
167 				c = *sp++;
168 				ep[-2] = NCCL;
169 			}
170 			do {
171 				if (c == '\0')
172 					comerr("missing ]");
173 				if (c == '-' && ep [-1] != 0) {
174 					if ((c = *sp++) == ']') {
175 						*ep++ = '-';
176 						cclcnt++;
177 						break;
178 					}
179 					while (ep[-1] < c) {
180 						*ep = ep[-1] + 1;
181 						ep++;
182 						cclcnt++;
183 						if (ep >= &expbuf[ESIZE])
184 							comerr(retoolong);
185 					}
186 				}
187 				*ep++ = c;
188 				cclcnt++;
189 				if (ep >= &expbuf[ESIZE])
190 					comerr(retoolong);
191 			} while ((c = *sp++) != ']');
192 			lastep[1] = cclcnt;
193 			continue;
194 
195 		case '\\':
196 			if ((c = *sp++) == '(') {
197 				if (numbra >= NBRA)
198 					comerr("too many \\(\\) pairs");
199 				*bracketp++ = numbra;
200 				*ep++ = CBRA;
201 				*ep++ = numbra++;
202 				continue;
203 			}
204 			if (c == ')') {
205 				if (bracketp <= bracket)
206 					comerr("unmatched \\)");
207 				*ep++ = CKET;
208 				*ep++ = *--bracketp;
209 				continue;
210 			}
211 			if (c >= '1' && c < ('1' + NBRA)) {
212 				*ep++ = CBACK;
213 				*ep++ = c - '1';
214 				continue;
215 			}
216 			*ep++ = CCHR;
217 			*ep++ = c;
218 			continue;
219 
220 		defchar:
221 		default:
222 			*ep++ = CCHR;
223 			*ep++ = c;
224 		}
225 	}
226 }
227 
228 /*
229  * match the argument string against the compiled re
230  */
231 int
232 re_exec(p1)
233 	register const char	*p1;
234 {
235 	register char	*p2 = expbuf;
236 	register int	c;
237 	int	rv;
238 
239 	for (c = 0; c < NBRA; c++) {
240 		braslist[c] = 0;
241 		braelist[c] = 0;
242 	}
243 	if (circf)
244 		return((advance(p1, p2)));
245 	/*
246 	 * fast check for first character
247 	 */
248 	if (*p2 == CCHR) {
249 		c = p2[1];
250 		do {
251 			if (*p1 != c)
252 				continue;
253 			if (rv = advance(p1, p2))
254 				return(rv);
255 		} while (*p1++);
256 		return(0);
257 	}
258 	/*
259 	 * regular algorithm
260 	 */
261 	do
262 		if (rv = advance(p1, p2))
263 			return(rv);
264 	while (*p1++);
265 	return(0);
266 }
267 
268 /*
269  * try to match the next thing in the dfa
270  */
271 static	int
272 advance(lp, ep)
273 	register char	*lp, *ep;
274 {
275 	register char	*curlp;
276 	int	ct, i;
277 	int	rv;
278 
279 	for (;;)
280 		switch (*ep++) {
281 
282 		case CCHR:
283 			if (*ep++ == *lp++)
284 				continue;
285 			return(0);
286 
287 		case CDOT:
288 			if (*lp++)
289 				continue;
290 			return(0);
291 
292 		case CDOL:
293 			if (*lp == '\0')
294 				continue;
295 			return(0);
296 
297 		case CEOF:
298 			return(1);
299 
300 		case CCL:
301 			if (cclass(ep, *lp++, 1)) {
302 				ep += *ep;
303 				continue;
304 			}
305 			return(0);
306 
307 		case NCCL:
308 			if (cclass(ep, *lp++, 0)) {
309 				ep += *ep;
310 				continue;
311 			}
312 			return(0);
313 
314 		case CBRA:
315 			braslist[*ep++] = lp;
316 			continue;
317 
318 		case CKET:
319 			braelist[*ep++] = lp;
320 			continue;
321 
322 		case CBACK:
323 			if (braelist[i = *ep++] == 0)
324 				return(-1);
325 			if (backref(i, lp)) {
326 				lp += braelist[i] - braslist[i];
327 				continue;
328 			}
329 			return(0);
330 
331 		case CBACK|CSTAR:
332 			if (braelist[i = *ep++] == 0)
333 				return(-1);
334 			curlp = lp;
335 			ct = braelist[i] - braslist[i];
336 			while (backref(i, lp))
337 				lp += ct;
338 			while (lp >= curlp) {
339 				if (rv = advance(lp, ep))
340 					return(rv);
341 				lp -= ct;
342 			}
343 			continue;
344 
345 		case CDOT|CSTAR:
346 			curlp = lp;
347 			while (*lp++)
348 				;
349 			goto star;
350 
351 		case CCHR|CSTAR:
352 			curlp = lp;
353 			while (*lp++ == *ep)
354 				;
355 			ep++;
356 			goto star;
357 
358 		case CCL|CSTAR:
359 		case NCCL|CSTAR:
360 			curlp = lp;
361 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
362 				;
363 			ep += *ep;
364 			goto star;
365 
366 		star:
367 			do {
368 				lp--;
369 				if (rv = advance(lp, ep))
370 					return(rv);
371 			} while (lp > curlp);
372 			return(0);
373 
374 		default:
375 			return(-1);
376 		}
377 }
378 
379 static
380 backref(i, lp)
381 	register int	i;
382 	register char	*lp;
383 {
384 	register char	*bp;
385 
386 	bp = braslist[i];
387 	while (*bp++ == *lp++)
388 		if (bp >= braelist[i])
389 			return(1);
390 	return(0);
391 }
392 
393 static int
394 cclass(set, c, af)
395 	register char	*set, c;
396 	int	af;
397 {
398 	register int	n;
399 
400 	if (c == 0)
401 		return(0);
402 	n = *set++;
403 	while (--n)
404 		if (*set++ == c)
405 			return(af);
406 	return(! af);
407 }
408