xref: /original-bsd/lib/libcompat/4.3/regex.c (revision 61b6c03f)
1 /*
2  * Copyright (c) 1980 Regents of the University of California.
3  * All rights reserved.  The Berkeley software License Agreement
4  * specifies the terms and conditions for redistribution.
5  */
6 
7 #if defined(LIBC_SCCS) && !defined(lint)
8 static char sccsid[] = "@(#)regex.c	5.2 (Berkeley) 03/09/86";
9 #endif LIBC_SCCS and not lint
10 
11 #
12 
13 /*
14  * routines to do regular expression matching
15  *
16  * Entry points:
17  *
18  *	re_comp(s)
19  *		char *s;
20  *	 ... returns 0 if the string s was compiled successfully,
21  *		     a pointer to an error message otherwise.
22  *	     If passed 0 or a null string returns without changing
23  *           the currently compiled re (see note 11 below).
24  *
25  *	re_exec(s)
26  *		char *s;
27  *	 ... returns 1 if the string s matches the last compiled regular
28  *		       expression,
29  *		     0 if the string s failed to match the last compiled
30  *		       regular expression, and
31  *		    -1 if the compiled regular expression was invalid
32  *		       (indicating an internal error).
33  *
34  * The strings passed to both re_comp and re_exec may have trailing or
35  * embedded newline characters; they are terminated by nulls.
36  *
37  * The identity of the author of these routines is lost in antiquity;
38  * this is essentially the same as the re code in the original V6 ed.
39  *
40  * The regular expressions recognized are described below. This description
41  * is essentially the same as that for ed.
42  *
43  *	A regular expression specifies a set of strings of characters.
44  *	A member of this set of strings is said to be matched by
45  *	the regular expression.  In the following specification for
46  *	regular expressions the word `character' means any character but NUL.
47  *
48  *	1.  Any character except a special character matches itself.
49  *	    Special characters are the regular expression delimiter plus
50  *	    \ [ . and sometimes ^ * $.
51  *	2.  A . matches any character.
52  *	3.  A \ followed by any character except a digit or ( )
53  *	    matches that character.
54  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
55  *	    character in (or not in) s. In s, \ has no special meaning,
56  *	    and ] may only appear as the first letter. A substring
57  *	    a-b, with a and b in ascending ASCII order, stands for
58  *	    the inclusive range of ASCII characters.
59  *	5.  A regular expression of form 1-4 followed by * matches a
60  *	    sequence of 0 or more matches of the regular expression.
61  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
62  *	    matches what x matches.
63  *	7.  A \ followed by a digit n matches a copy of the string that the
64  *	    bracketed regular expression beginning with the nth \( matched.
65  *	8.  A regular expression of form 1-8, x, followed by a regular
66  *	    expression of form 1-7, y matches a match for x followed by
67  *	    a match for y, with the x match being as long as possible
68  *	    while still permitting a y match.
69  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
70  *	    by $), is constrained to matches that begin at the left
71  *	    (or end at the right) end of a line.
72  *	10. A regular expression of form 1-9 picks out the longest among
73  *	    the leftmost matches in a line.
74  *	11. An empty regular expression stands for a copy of the last
75  *	    regular expression encountered.
76  */
77 
78 /*
79  * constants for re's
80  */
81 #define	CBRA	1
82 #define	CCHR	2
83 #define	CDOT	4
84 #define	CCL	6
85 #define	NCCL	8
86 #define	CDOL	10
87 #define	CEOF	11
88 #define	CKET	12
89 #define	CBACK	18
90 
91 #define	CSTAR	01
92 
93 #define	ESIZE	512
94 #define	NBRA	9
95 
96 static	char	expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
97 static	char	circf;
98 
99 /*
100  * compile the regular expression argument into a dfa
101  */
102 char *
103 re_comp(sp)
104 	register char	*sp;
105 {
106 	register int	c;
107 	register char	*ep = expbuf;
108 	int	cclcnt, numbra = 0;
109 	char	*lastep = 0;
110 	char	bracket[NBRA];
111 	char	*bracketp = &bracket[0];
112 	static	char	*retoolong = "Regular expression too long";
113 
114 #define	comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
115 
116 	if (sp == 0 || *sp == '\0') {
117 		if (*ep == 0)
118 			return("No previous regular expression");
119 		return(0);
120 	}
121 	if (*sp == '^') {
122 		circf = 1;
123 		sp++;
124 	}
125 	else
126 		circf = 0;
127 	for (;;) {
128 		if (ep >= &expbuf[ESIZE])
129 			comerr(retoolong);
130 		if ((c = *sp++) == '\0') {
131 			if (bracketp != bracket)
132 				comerr("unmatched \\(");
133 			*ep++ = CEOF;
134 			*ep++ = 0;
135 			return(0);
136 		}
137 		if (c != '*')
138 			lastep = ep;
139 		switch (c) {
140 
141 		case '.':
142 			*ep++ = CDOT;
143 			continue;
144 
145 		case '*':
146 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
147 				goto defchar;
148 			*lastep |= CSTAR;
149 			continue;
150 
151 		case '$':
152 			if (*sp != '\0')
153 				goto defchar;
154 			*ep++ = CDOL;
155 			continue;
156 
157 		case '[':
158 			*ep++ = CCL;
159 			*ep++ = 0;
160 			cclcnt = 1;
161 			if ((c = *sp++) == '^') {
162 				c = *sp++;
163 				ep[-2] = NCCL;
164 			}
165 			do {
166 				if (c == '\0')
167 					comerr("missing ]");
168 				if (c == '-' && ep [-1] != 0) {
169 					if ((c = *sp++) == ']') {
170 						*ep++ = '-';
171 						cclcnt++;
172 						break;
173 					}
174 					while (ep[-1] < c) {
175 						*ep = ep[-1] + 1;
176 						ep++;
177 						cclcnt++;
178 						if (ep >= &expbuf[ESIZE])
179 							comerr(retoolong);
180 					}
181 				}
182 				*ep++ = c;
183 				cclcnt++;
184 				if (ep >= &expbuf[ESIZE])
185 					comerr(retoolong);
186 			} while ((c = *sp++) != ']');
187 			lastep[1] = cclcnt;
188 			continue;
189 
190 		case '\\':
191 			if ((c = *sp++) == '(') {
192 				if (numbra >= NBRA)
193 					comerr("too many \\(\\) pairs");
194 				*bracketp++ = numbra;
195 				*ep++ = CBRA;
196 				*ep++ = numbra++;
197 				continue;
198 			}
199 			if (c == ')') {
200 				if (bracketp <= bracket)
201 					comerr("unmatched \\)");
202 				*ep++ = CKET;
203 				*ep++ = *--bracketp;
204 				continue;
205 			}
206 			if (c >= '1' && c < ('1' + NBRA)) {
207 				*ep++ = CBACK;
208 				*ep++ = c - '1';
209 				continue;
210 			}
211 			*ep++ = CCHR;
212 			*ep++ = c;
213 			continue;
214 
215 		defchar:
216 		default:
217 			*ep++ = CCHR;
218 			*ep++ = c;
219 		}
220 	}
221 }
222 
223 /*
224  * match the argument string against the compiled re
225  */
226 int
227 re_exec(p1)
228 	register char	*p1;
229 {
230 	register char	*p2 = expbuf;
231 	register int	c;
232 	int	rv;
233 
234 	for (c = 0; c < NBRA; c++) {
235 		braslist[c] = 0;
236 		braelist[c] = 0;
237 	}
238 	if (circf)
239 		return((advance(p1, p2)));
240 	/*
241 	 * fast check for first character
242 	 */
243 	if (*p2 == CCHR) {
244 		c = p2[1];
245 		do {
246 			if (*p1 != c)
247 				continue;
248 			if (rv = advance(p1, p2))
249 				return(rv);
250 		} while (*p1++);
251 		return(0);
252 	}
253 	/*
254 	 * regular algorithm
255 	 */
256 	do
257 		if (rv = advance(p1, p2))
258 			return(rv);
259 	while (*p1++);
260 	return(0);
261 }
262 
263 /*
264  * try to match the next thing in the dfa
265  */
266 static	int
267 advance(lp, ep)
268 	register char	*lp, *ep;
269 {
270 	register char	*curlp;
271 	int	ct, i;
272 	int	rv;
273 
274 	for (;;)
275 		switch (*ep++) {
276 
277 		case CCHR:
278 			if (*ep++ == *lp++)
279 				continue;
280 			return(0);
281 
282 		case CDOT:
283 			if (*lp++)
284 				continue;
285 			return(0);
286 
287 		case CDOL:
288 			if (*lp == '\0')
289 				continue;
290 			return(0);
291 
292 		case CEOF:
293 			return(1);
294 
295 		case CCL:
296 			if (cclass(ep, *lp++, 1)) {
297 				ep += *ep;
298 				continue;
299 			}
300 			return(0);
301 
302 		case NCCL:
303 			if (cclass(ep, *lp++, 0)) {
304 				ep += *ep;
305 				continue;
306 			}
307 			return(0);
308 
309 		case CBRA:
310 			braslist[*ep++] = lp;
311 			continue;
312 
313 		case CKET:
314 			braelist[*ep++] = lp;
315 			continue;
316 
317 		case CBACK:
318 			if (braelist[i = *ep++] == 0)
319 				return(-1);
320 			if (backref(i, lp)) {
321 				lp += braelist[i] - braslist[i];
322 				continue;
323 			}
324 			return(0);
325 
326 		case CBACK|CSTAR:
327 			if (braelist[i = *ep++] == 0)
328 				return(-1);
329 			curlp = lp;
330 			ct = braelist[i] - braslist[i];
331 			while (backref(i, lp))
332 				lp += ct;
333 			while (lp >= curlp) {
334 				if (rv = advance(lp, ep))
335 					return(rv);
336 				lp -= ct;
337 			}
338 			continue;
339 
340 		case CDOT|CSTAR:
341 			curlp = lp;
342 			while (*lp++)
343 				;
344 			goto star;
345 
346 		case CCHR|CSTAR:
347 			curlp = lp;
348 			while (*lp++ == *ep)
349 				;
350 			ep++;
351 			goto star;
352 
353 		case CCL|CSTAR:
354 		case NCCL|CSTAR:
355 			curlp = lp;
356 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
357 				;
358 			ep += *ep;
359 			goto star;
360 
361 		star:
362 			do {
363 				lp--;
364 				if (rv = advance(lp, ep))
365 					return(rv);
366 			} while (lp > curlp);
367 			return(0);
368 
369 		default:
370 			return(-1);
371 		}
372 }
373 
374 backref(i, lp)
375 	register int	i;
376 	register char	*lp;
377 {
378 	register char	*bp;
379 
380 	bp = braslist[i];
381 	while (*bp++ == *lp++)
382 		if (bp >= braelist[i])
383 			return(1);
384 	return(0);
385 }
386 
387 int
388 cclass(set, c, af)
389 	register char	*set, c;
390 	int	af;
391 {
392 	register int	n;
393 
394 	if (c == 0)
395 		return(0);
396 	n = *set++;
397 	while (--n)
398 		if (*set++ == c)
399 			return(af);
400 	return(! af);
401 }
402