xref: /original-bsd/lib/libcompat/4.3/regex.c (revision 5b560bbe)
1 /*
2  * Copyright (c) 1980 Regents of the University of California.
3  * All rights reserved.  The Berkeley software License Agreement
4  * specifies the terms and conditions for redistribution.
5  */
6 
7 #if defined(LIBC_SCCS) && !defined(lint)
8 static char sccsid[] = "@(#)regex.c	5.4 (Berkeley) 02/23/91";
9 #endif LIBC_SCCS and not lint
10 
11 #include <unistd.h>
12 
13 static int advance();
14 static int backref();
15 static int cclass();
16 
17 /*
18  * routines to do regular expression matching
19  *
20  * Entry points:
21  *
22  *	re_comp(s)
23  *		char *s;
24  *	 ... returns 0 if the string s was compiled successfully,
25  *		     a pointer to an error message otherwise.
26  *	     If passed 0 or a null string returns without changing
27  *           the currently compiled re (see note 11 below).
28  *
29  *	re_exec(s)
30  *		char *s;
31  *	 ... returns 1 if the string s matches the last compiled regular
32  *		       expression,
33  *		     0 if the string s failed to match the last compiled
34  *		       regular expression, and
35  *		    -1 if the compiled regular expression was invalid
36  *		       (indicating an internal error).
37  *
38  * The strings passed to both re_comp and re_exec may have trailing or
39  * embedded newline characters; they are terminated by nulls.
40  *
41  * The identity of the author of these routines is lost in antiquity;
42  * this is essentially the same as the re code in the original V6 ed.
43  *
44  * The regular expressions recognized are described below. This description
45  * is essentially the same as that for ed.
46  *
47  *	A regular expression specifies a set of strings of characters.
48  *	A member of this set of strings is said to be matched by
49  *	the regular expression.  In the following specification for
50  *	regular expressions the word `character' means any character but NUL.
51  *
52  *	1.  Any character except a special character matches itself.
53  *	    Special characters are the regular expression delimiter plus
54  *	    \ [ . and sometimes ^ * $.
55  *	2.  A . matches any character.
56  *	3.  A \ followed by any character except a digit or ( )
57  *	    matches that character.
58  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
59  *	    character in (or not in) s. In s, \ has no special meaning,
60  *	    and ] may only appear as the first letter. A substring
61  *	    a-b, with a and b in ascending ASCII order, stands for
62  *	    the inclusive range of ASCII characters.
63  *	5.  A regular expression of form 1-4 followed by * matches a
64  *	    sequence of 0 or more matches of the regular expression.
65  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
66  *	    matches what x matches.
67  *	7.  A \ followed by a digit n matches a copy of the string that the
68  *	    bracketed regular expression beginning with the nth \( matched.
69  *	8.  A regular expression of form 1-8, x, followed by a regular
70  *	    expression of form 1-7, y matches a match for x followed by
71  *	    a match for y, with the x match being as long as possible
72  *	    while still permitting a y match.
73  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
74  *	    by $), is constrained to matches that begin at the left
75  *	    (or end at the right) end of a line.
76  *	10. A regular expression of form 1-9 picks out the longest among
77  *	    the leftmost matches in a line.
78  *	11. An empty regular expression stands for a copy of the last
79  *	    regular expression encountered.
80  */
81 
82 /*
83  * constants for re's
84  */
85 #define	CBRA	1
86 #define	CCHR	2
87 #define	CDOT	4
88 #define	CCL	6
89 #define	NCCL	8
90 #define	CDOL	10
91 #define	CEOF	11
92 #define	CKET	12
93 #define	CBACK	18
94 
95 #define	CSTAR	01
96 
97 #define	ESIZE	512
98 #define	NBRA	9
99 
100 static	char	expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
101 static	char	circf;
102 
103 /*
104  * compile the regular expression argument into a dfa
105  */
106 char *
107 re_comp(sp)
108 	register const char	*sp;
109 {
110 	register int	c;
111 	register char	*ep = expbuf;
112 	int	cclcnt, numbra = 0;
113 	char	*lastep = 0;
114 	char	bracket[NBRA];
115 	char	*bracketp = &bracket[0];
116 	static	char	*retoolong = "Regular expression too long";
117 
118 #define	comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
119 
120 	if (sp == 0 || *sp == '\0') {
121 		if (*ep == 0)
122 			return("No previous regular expression");
123 		return(0);
124 	}
125 	if (*sp == '^') {
126 		circf = 1;
127 		sp++;
128 	}
129 	else
130 		circf = 0;
131 	for (;;) {
132 		if (ep >= &expbuf[ESIZE])
133 			comerr(retoolong);
134 		if ((c = *sp++) == '\0') {
135 			if (bracketp != bracket)
136 				comerr("unmatched \\(");
137 			*ep++ = CEOF;
138 			*ep++ = 0;
139 			return(0);
140 		}
141 		if (c != '*')
142 			lastep = ep;
143 		switch (c) {
144 
145 		case '.':
146 			*ep++ = CDOT;
147 			continue;
148 
149 		case '*':
150 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
151 				goto defchar;
152 			*lastep |= CSTAR;
153 			continue;
154 
155 		case '$':
156 			if (*sp != '\0')
157 				goto defchar;
158 			*ep++ = CDOL;
159 			continue;
160 
161 		case '[':
162 			*ep++ = CCL;
163 			*ep++ = 0;
164 			cclcnt = 1;
165 			if ((c = *sp++) == '^') {
166 				c = *sp++;
167 				ep[-2] = NCCL;
168 			}
169 			do {
170 				if (c == '\0')
171 					comerr("missing ]");
172 				if (c == '-' && ep [-1] != 0) {
173 					if ((c = *sp++) == ']') {
174 						*ep++ = '-';
175 						cclcnt++;
176 						break;
177 					}
178 					while (ep[-1] < c) {
179 						*ep = ep[-1] + 1;
180 						ep++;
181 						cclcnt++;
182 						if (ep >= &expbuf[ESIZE])
183 							comerr(retoolong);
184 					}
185 				}
186 				*ep++ = c;
187 				cclcnt++;
188 				if (ep >= &expbuf[ESIZE])
189 					comerr(retoolong);
190 			} while ((c = *sp++) != ']');
191 			lastep[1] = cclcnt;
192 			continue;
193 
194 		case '\\':
195 			if ((c = *sp++) == '(') {
196 				if (numbra >= NBRA)
197 					comerr("too many \\(\\) pairs");
198 				*bracketp++ = numbra;
199 				*ep++ = CBRA;
200 				*ep++ = numbra++;
201 				continue;
202 			}
203 			if (c == ')') {
204 				if (bracketp <= bracket)
205 					comerr("unmatched \\)");
206 				*ep++ = CKET;
207 				*ep++ = *--bracketp;
208 				continue;
209 			}
210 			if (c >= '1' && c < ('1' + NBRA)) {
211 				*ep++ = CBACK;
212 				*ep++ = c - '1';
213 				continue;
214 			}
215 			*ep++ = CCHR;
216 			*ep++ = c;
217 			continue;
218 
219 		defchar:
220 		default:
221 			*ep++ = CCHR;
222 			*ep++ = c;
223 		}
224 	}
225 }
226 
227 /*
228  * match the argument string against the compiled re
229  */
230 int
231 re_exec(p1)
232 	register const char	*p1;
233 {
234 	register char	*p2 = expbuf;
235 	register int	c;
236 	int	rv;
237 
238 	for (c = 0; c < NBRA; c++) {
239 		braslist[c] = 0;
240 		braelist[c] = 0;
241 	}
242 	if (circf)
243 		return((advance(p1, p2)));
244 	/*
245 	 * fast check for first character
246 	 */
247 	if (*p2 == CCHR) {
248 		c = p2[1];
249 		do {
250 			if (*p1 != c)
251 				continue;
252 			if (rv = advance(p1, p2))
253 				return(rv);
254 		} while (*p1++);
255 		return(0);
256 	}
257 	/*
258 	 * regular algorithm
259 	 */
260 	do
261 		if (rv = advance(p1, p2))
262 			return(rv);
263 	while (*p1++);
264 	return(0);
265 }
266 
267 /*
268  * try to match the next thing in the dfa
269  */
270 static	int
271 advance(lp, ep)
272 	register char	*lp, *ep;
273 {
274 	register char	*curlp;
275 	int	ct, i;
276 	int	rv;
277 
278 	for (;;)
279 		switch (*ep++) {
280 
281 		case CCHR:
282 			if (*ep++ == *lp++)
283 				continue;
284 			return(0);
285 
286 		case CDOT:
287 			if (*lp++)
288 				continue;
289 			return(0);
290 
291 		case CDOL:
292 			if (*lp == '\0')
293 				continue;
294 			return(0);
295 
296 		case CEOF:
297 			return(1);
298 
299 		case CCL:
300 			if (cclass(ep, *lp++, 1)) {
301 				ep += *ep;
302 				continue;
303 			}
304 			return(0);
305 
306 		case NCCL:
307 			if (cclass(ep, *lp++, 0)) {
308 				ep += *ep;
309 				continue;
310 			}
311 			return(0);
312 
313 		case CBRA:
314 			braslist[*ep++] = lp;
315 			continue;
316 
317 		case CKET:
318 			braelist[*ep++] = lp;
319 			continue;
320 
321 		case CBACK:
322 			if (braelist[i = *ep++] == 0)
323 				return(-1);
324 			if (backref(i, lp)) {
325 				lp += braelist[i] - braslist[i];
326 				continue;
327 			}
328 			return(0);
329 
330 		case CBACK|CSTAR:
331 			if (braelist[i = *ep++] == 0)
332 				return(-1);
333 			curlp = lp;
334 			ct = braelist[i] - braslist[i];
335 			while (backref(i, lp))
336 				lp += ct;
337 			while (lp >= curlp) {
338 				if (rv = advance(lp, ep))
339 					return(rv);
340 				lp -= ct;
341 			}
342 			continue;
343 
344 		case CDOT|CSTAR:
345 			curlp = lp;
346 			while (*lp++)
347 				;
348 			goto star;
349 
350 		case CCHR|CSTAR:
351 			curlp = lp;
352 			while (*lp++ == *ep)
353 				;
354 			ep++;
355 			goto star;
356 
357 		case CCL|CSTAR:
358 		case NCCL|CSTAR:
359 			curlp = lp;
360 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
361 				;
362 			ep += *ep;
363 			goto star;
364 
365 		star:
366 			do {
367 				lp--;
368 				if (rv = advance(lp, ep))
369 					return(rv);
370 			} while (lp > curlp);
371 			return(0);
372 
373 		default:
374 			return(-1);
375 		}
376 }
377 
378 static
379 backref(i, lp)
380 	register int	i;
381 	register char	*lp;
382 {
383 	register char	*bp;
384 
385 	bp = braslist[i];
386 	while (*bp++ == *lp++)
387 		if (bp >= braelist[i])
388 			return(1);
389 	return(0);
390 }
391 
392 static int
393 cclass(set, c, af)
394 	register char	*set, c;
395 	int	af;
396 {
397 	register int	n;
398 
399 	if (c == 0)
400 		return(0);
401 	n = *set++;
402 	while (--n)
403 		if (*set++ == c)
404 			return(af);
405 	return(! af);
406 }
407