xref: /original-bsd/lib/libcompat/4.3/regex.c (revision bac5051c)
1 /* @(#)regex.c	4.1 (Berkeley) 12/21/80 */
2 #
3 
4 /*
5  * routines to do regular expression matching
6  *
7  * Entry points:
8  *
9  *	re_comp(s)
10  *		char *s;
11  *	 ... returns 0 if the string s was compiled successfully,
12  *		     a pointer to an error message otherwise.
13  *	     If passed 0 or a null string returns without changing
14  *           the currently compiled re (see note 11 below).
15  *
16  *	re_exec(s)
17  *		char *s;
18  *	 ... returns 1 if the string s matches the last compiled regular
19  *		       expression,
20  *		     0 if the string s failed to match the last compiled
21  *		       regular expression, and
22  *		    -1 if the compiled regular expression was invalid
23  *		       (indicating an internal error).
24  *
25  * The strings passed to both re_comp and re_exec may have trailing or
26  * embedded newline characters; they are terminated by nulls.
27  *
28  * The identity of the author of these routines is lost in antiquity;
29  * this is essentially the same as the re code in the original V6 ed.
30  *
31  * The regular expressions recognized are described below. This description
32  * is essentially the same as that for ed.
33  *
34  *	A regular expression specifies a set of strings of characters.
35  *	A member of this set of strings is said to be matched by
36  *	the regular expression.  In the following specification for
37  *	regular expressions the word `character' means any character but NUL.
38  *
39  *	1.  Any character except a special character matches itself.
40  *	    Special characters are the regular expression delimiter plus
41  *	    \ [ . and sometimes ^ * $.
42  *	2.  A . matches any character.
43  *	3.  A \ followed by any character except a digit or ( )
44  *	    matches that character.
45  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
46  *	    character in (or not in) s. In s, \ has no special meaning,
47  *	    and ] may only appear as the first letter. A substring
48  *	    a-b, with a and b in ascending ASCII order, stands for
49  *	    the inclusive range of ASCII characters.
50  *	5.  A regular expression of form 1-4 followed by * matches a
51  *	    sequence of 0 or more matches of the regular expression.
52  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
53  *	    matches what x matches.
54  *	7.  A \ followed by a digit n matches a copy of the string that the
55  *	    bracketed regular expression beginning with the nth \( matched.
56  *	8.  A regular expression of form 1-8, x, followed by a regular
57  *	    expression of form 1-7, y matches a match for x followed by
58  *	    a match for y, with the x match being as long as possible
59  *	    while still permitting a y match.
60  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
61  *	    by $), is constrained to matches that begin at the left
62  *	    (or end at the right) end of a line.
63  *	10. A regular expression of form 1-9 picks out the longest among
64  *	    the leftmost matches in a line.
65  *	11. An empty regular expression stands for a copy of the last
66  *	    regular expression encountered.
67  */
68 
69 /*
70  * constants for re's
71  */
72 #define	CBRA	1
73 #define	CCHR	2
74 #define	CDOT	4
75 #define	CCL	6
76 #define	NCCL	8
77 #define	CDOL	10
78 #define	CEOF	11
79 #define	CKET	12
80 #define	CBACK	18
81 
82 #define	CSTAR	01
83 
84 #define	ESIZE	512
85 #define	NBRA	9
86 
87 static	char	expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
88 static	char	circf;
89 
90 /*
91  * compile the regular expression argument into a dfa
92  */
93 char *
94 re_comp(sp)
95 	register char	*sp;
96 {
97 	register int	c;
98 	register char	*ep = expbuf;
99 	int	cclcnt, numbra = 0;
100 	char	*lastep = 0;
101 	char	bracket[NBRA];
102 	char	*bracketp = &bracket[0];
103 	static	char	*retoolong = "Regular expression too long";
104 
105 #define	comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
106 
107 	if (sp == 0 || *sp == '\0') {
108 		if (*ep == 0)
109 			return("No previous regular expression");
110 		return(0);
111 	}
112 	if (*sp == '^') {
113 		circf = 1;
114 		sp++;
115 	}
116 	else
117 		circf = 0;
118 	for (;;) {
119 		if (ep >= &expbuf[ESIZE])
120 			comerr(retoolong);
121 		if ((c = *sp++) == '\0') {
122 			if (bracketp != bracket)
123 				comerr("unmatched \\(");
124 			*ep++ = CEOF;
125 			*ep++ = 0;
126 			return(0);
127 		}
128 		if (c != '*')
129 			lastep = ep;
130 		switch (c) {
131 
132 		case '.':
133 			*ep++ = CDOT;
134 			continue;
135 
136 		case '*':
137 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
138 				goto defchar;
139 			*lastep |= CSTAR;
140 			continue;
141 
142 		case '$':
143 			if (*sp != '\0')
144 				goto defchar;
145 			*ep++ = CDOL;
146 			continue;
147 
148 		case '[':
149 			*ep++ = CCL;
150 			*ep++ = 0;
151 			cclcnt = 1;
152 			if ((c = *sp++) == '^') {
153 				c = *sp++;
154 				ep[-2] = NCCL;
155 			}
156 			do {
157 				if (c == '\0')
158 					comerr("missing ]");
159 				if (c == '-' && ep [-1] != 0) {
160 					if ((c = *sp++) == ']') {
161 						*ep++ = '-';
162 						cclcnt++;
163 						break;
164 					}
165 					while (ep[-1] < c) {
166 						*ep = ep[-1] + 1;
167 						ep++;
168 						cclcnt++;
169 						if (ep >= &expbuf[ESIZE])
170 							comerr(retoolong);
171 					}
172 				}
173 				*ep++ = c;
174 				cclcnt++;
175 				if (ep >= &expbuf[ESIZE])
176 					comerr(retoolong);
177 			} while ((c = *sp++) != ']');
178 			lastep[1] = cclcnt;
179 			continue;
180 
181 		case '\\':
182 			if ((c = *sp++) == '(') {
183 				if (numbra >= NBRA)
184 					comerr("too many \\(\\) pairs");
185 				*bracketp++ = numbra;
186 				*ep++ = CBRA;
187 				*ep++ = numbra++;
188 				continue;
189 			}
190 			if (c == ')') {
191 				if (bracketp <= bracket)
192 					comerr("unmatched \\)");
193 				*ep++ = CKET;
194 				*ep++ = *--bracketp;
195 				continue;
196 			}
197 			if (c >= '1' && c < ('1' + NBRA)) {
198 				*ep++ = CBACK;
199 				*ep++ = c - '1';
200 				continue;
201 			}
202 			*ep++ = CCHR;
203 			*ep++ = c;
204 			continue;
205 
206 		defchar:
207 		default:
208 			*ep++ = CCHR;
209 			*ep++ = c;
210 		}
211 	}
212 }
213 
214 /*
215  * match the argument string against the compiled re
216  */
217 int
218 re_exec(p1)
219 	register char	*p1;
220 {
221 	register char	*p2 = expbuf;
222 	register int	c;
223 	int	rv;
224 
225 	for (c = 0; c < NBRA; c++) {
226 		braslist[c] = 0;
227 		braelist[c] = 0;
228 	}
229 	if (circf)
230 		return((advance(p1, p2)));
231 	/*
232 	 * fast check for first character
233 	 */
234 	if (*p2 == CCHR) {
235 		c = p2[1];
236 		do {
237 			if (*p1 != c)
238 				continue;
239 			if (rv = advance(p1, p2))
240 				return(rv);
241 		} while (*p1++);
242 		return(0);
243 	}
244 	/*
245 	 * regular algorithm
246 	 */
247 	do
248 		if (rv = advance(p1, p2))
249 			return(rv);
250 	while (*p1++);
251 	return(0);
252 }
253 
254 /*
255  * try to match the next thing in the dfa
256  */
257 static	int
258 advance(lp, ep)
259 	register char	*lp, *ep;
260 {
261 	register char	*curlp;
262 	int	ct, i;
263 	int	rv;
264 
265 	for (;;)
266 		switch (*ep++) {
267 
268 		case CCHR:
269 			if (*ep++ == *lp++)
270 				continue;
271 			return(0);
272 
273 		case CDOT:
274 			if (*lp++)
275 				continue;
276 			return(0);
277 
278 		case CDOL:
279 			if (*lp == '\0')
280 				continue;
281 			return(0);
282 
283 		case CEOF:
284 			return(1);
285 
286 		case CCL:
287 			if (cclass(ep, *lp++, 1)) {
288 				ep += *ep;
289 				continue;
290 			}
291 			return(0);
292 
293 		case NCCL:
294 			if (cclass(ep, *lp++, 0)) {
295 				ep += *ep;
296 				continue;
297 			}
298 			return(0);
299 
300 		case CBRA:
301 			braslist[*ep++] = lp;
302 			continue;
303 
304 		case CKET:
305 			braelist[*ep++] = lp;
306 			continue;
307 
308 		case CBACK:
309 			if (braelist[i = *ep++] == 0)
310 				return(-1);
311 			if (backref(i, lp)) {
312 				lp += braelist[i] - braslist[i];
313 				continue;
314 			}
315 			return(0);
316 
317 		case CBACK|CSTAR:
318 			if (braelist[i = *ep++] == 0)
319 				return(-1);
320 			curlp = lp;
321 			ct = braelist[i] - braslist[i];
322 			while (backref(i, lp))
323 				lp += ct;
324 			while (lp >= curlp) {
325 				if (rv = advance(lp, ep))
326 					return(rv);
327 				lp -= ct;
328 			}
329 			continue;
330 
331 		case CDOT|CSTAR:
332 			curlp = lp;
333 			while (*lp++)
334 				;
335 			goto star;
336 
337 		case CCHR|CSTAR:
338 			curlp = lp;
339 			while (*lp++ == *ep)
340 				;
341 			ep++;
342 			goto star;
343 
344 		case CCL|CSTAR:
345 		case NCCL|CSTAR:
346 			curlp = lp;
347 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
348 				;
349 			ep += *ep;
350 			goto star;
351 
352 		star:
353 			do {
354 				lp--;
355 				if (rv = advance(lp, ep))
356 					return(rv);
357 			} while (lp > curlp);
358 			return(0);
359 
360 		default:
361 			return(-1);
362 		}
363 }
364 
365 backref(i, lp)
366 	register int	i;
367 	register char	*lp;
368 {
369 	register char	*bp;
370 
371 	bp = braslist[i];
372 	while (*bp++ == *lp++)
373 		if (bp >= braelist[i])
374 			return(1);
375 	return(0);
376 }
377 
378 int
379 cclass(set, c, af)
380 	register char	*set, c;
381 	int	af;
382 {
383 	register int	n;
384 
385 	if (c == 0)
386 		return(0);
387 	n = *set++;
388 	while (--n)
389 		if (*set++ == c)
390 			return(af);
391 	return(! af);
392 }
393