xref: /netbsd/games/quiz/rxp.c (revision c4a72b64)
1 /*	$NetBSD: rxp.c,v 1.10 2002/12/06 01:54:55 thorpej Exp $	*/
2 
3 /*-
4  * Copyright (c) 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Jim R. Oldroyd at The Instruction Set and Keith Gabryelski at
9  * Commodore Business Machines.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  */
39 
40 #include <sys/cdefs.h>
41 #ifndef lint
42 #if 0
43 static char sccsid[] = "@(#)rxp.c	8.1 (Berkeley) 5/31/93";
44 #else
45 __RCSID("$NetBSD: rxp.c,v 1.10 2002/12/06 01:54:55 thorpej Exp $");
46 #endif
47 #endif /* not lint */
48 
49 /*
50  * regular expression parser
51  *
52  * external functions and return values are:
53  * rxp_compile(s)
54  *	TRUE	success
55  *	FALSE	parse failure; error message will be in char rxperr[]
56  * metas are:
57  *	{...}	optional pattern, equialent to [...|]
58  *	|	alternate pattern
59  *	[...]	pattern delimiters
60  *
61  * rxp_match(s)
62  *	TRUE	string s matches compiled pattern
63  *	FALSE	match failure or regexp error
64  *
65  * rxp_expand()
66  *	char *	reverse-engineered regular expression string
67  *	NULL	regexp error
68  */
69 
70 #include <stdio.h>
71 #include <stdlib.h>
72 #include <ctype.h>
73 #include "quiz.h"
74 					/* regexp tokens,	arg */
75 #define	LIT	(-1)			/* literal character,	char */
76 #define	SOT	(-2)			/* start text anchor,	- */
77 #define	EOT	(-3)			/* end text anchor,	- */
78 #define	GRP_S	(-4)			/* start alternate grp,	ptr_to_end */
79 #define	GRP_E	(-5)			/* end group,		- */
80 #define	ALT_S	(-6)			/* alternate starts,	ptr_to_next */
81 #define	ALT_E	(-7)			/* alternate ends,	- */
82 #define	END	(-8)			/* end of regexp,	- */
83 
84 typedef short Rxp_t;			/* type for regexp tokens */
85 
86 static Rxp_t rxpbuf[RXP_LINE_SZ];	/* compiled regular expression buffer */
87 char rxperr[128];			/* parser error message */
88 
89 static int	 rxp__compile __P((const char *, int));
90 static char	*rxp__expand __P((int));
91 static int	 rxp__match __P((const char *, int, Rxp_t *, Rxp_t *, const char *));
92 
93 int
94 rxp_compile(s)
95 	const char *	s;
96 {
97 	return (rxp__compile(s, TRUE));
98 }
99 
100 static int
101 rxp__compile(s, first)
102 	const char *s;
103 	int first;
104 {
105 	static Rxp_t *rp;
106 	static const char *sp;
107 	Rxp_t *grp_ptr;
108 	Rxp_t *alt_ptr;
109 	int esc, err;
110 
111 	esc = 0;
112 	if (first) {
113 		rp = rxpbuf;
114 		sp = s;
115 		*rp++ = SOT;	/* auto-anchor: pat is really ^pat$ */
116 		*rp++ = GRP_S;	/* auto-group: ^pat$ is really ^[pat]$ */
117 		*rp++ = 0;
118 	}
119 	*rp++ = ALT_S;
120 	alt_ptr = rp;
121 	*rp++ = 0;
122 	for (; *sp; ++sp) {
123 		if (rp - rxpbuf >= RXP_LINE_SZ - 4) {
124 			(void)snprintf(rxperr, sizeof(rxperr),
125 			    "regular expression too long %s", s);
126 			return (FALSE);
127 		}
128 		if (*sp == ':' && !esc)
129 			break;
130 		if (esc) {
131 			*rp++ = LIT;
132 			*rp++ = *sp;
133 			esc = 0;
134 		}
135 		else switch (*sp) {
136 		case '\\':
137 			esc = 1;
138 			break;
139 		case '{':
140 		case '[':
141 			*rp++ = GRP_S;
142 			grp_ptr = rp;
143 			*rp++ = 0;
144 			sp++;
145 			if ((err = rxp__compile(s, FALSE)) != TRUE)
146 				return (err);
147 			*rp++ = GRP_E;
148 			*grp_ptr = rp - rxpbuf;
149 			break;
150 		case '}':
151 		case ']':
152 		case '|':
153 			*rp++ = ALT_E;
154 			*alt_ptr = rp - rxpbuf;
155 			if (*sp != ']') {
156 				*rp++ = ALT_S;
157 				alt_ptr = rp;
158 				*rp++ = 0;
159 			}
160 			if (*sp != '|') {
161 				if (*sp != ']') {
162 					*rp++ = ALT_E;
163 					*alt_ptr = rp - rxpbuf;
164 				}
165 				if (first) {
166 					(void)snprintf(rxperr, sizeof(rxperr),
167 					    "unmatched alternator in regexp %s",
168 					     s);
169 					return (FALSE);
170 				}
171 				return (TRUE);
172 			}
173 			break;
174 		default:
175 			*rp++ = LIT;
176 			*rp++ = *sp;
177 			esc = 0;
178 			break;
179 		}
180 	}
181 	if (!first) {
182 		(void)snprintf(rxperr, sizeof(rxperr),
183 		    "unmatched alternator in regexp %s", s);
184 		return (FALSE);
185 	}
186 	*rp++ = ALT_E;
187 	*alt_ptr = rp - rxpbuf;
188 	*rp++ = GRP_E;
189 	*(rxpbuf + 2) = rp - rxpbuf;
190 	*rp++ = EOT;
191 	*rp = END;
192 	return (TRUE);
193 }
194 
195 /*
196  * match string against compiled regular expression
197  */
198 int
199 rxp_match(s)
200 	const char *	s;
201 {
202 	return (rxp__match(s, TRUE, NULL, NULL, NULL));
203 }
204 
205 static int
206 rxp__match(s, first, j_succ, j_fail, sp_fail)
207 	const char *s;
208 	int first;
209 	Rxp_t *j_succ;		/* jump here on successful alt match */
210 	Rxp_t *j_fail;		/* jump here on failed match */
211 	const char *sp_fail;		/* reset sp to here on failed match */
212 {
213 	static Rxp_t *rp;
214 	static const char *sp;
215 	int ch;
216 	Rxp_t *grp_end = NULL;
217 
218 	if (first) {
219 		rp = rxpbuf;
220 		sp = s;
221 	}
222 	while (rp < rxpbuf + RXP_LINE_SZ && *rp != END)
223 		switch(*rp) {
224 		case LIT:
225 			rp++;
226 			ch = isascii(*rp) && isupper(*rp) ? tolower(*rp) : *rp;
227 			if (ch != *sp++) {
228 				rp = j_fail;
229 				sp = sp_fail;
230 				return (FALSE);
231 			}
232 			rp++;
233 			break;
234 		case SOT:
235 			if (sp != s)
236 				return (FALSE);
237 			rp++;
238 			break;
239 		case EOT:
240 			if (*sp != 0)
241 				return (FALSE);
242 			rp++;
243 			break;
244 		case GRP_S:
245 			rp++;
246 			grp_end = rxpbuf + *rp++;
247 			break;
248 		case ALT_S:
249 			rp++;
250 			rxp__match(sp, FALSE, grp_end, rxpbuf + *rp++, sp);
251 			break;
252 		case ALT_E:
253 			rp = j_succ;
254 			return (TRUE);
255 		case GRP_E:
256 			rp = j_fail;
257 			sp = sp_fail;
258 			return (FALSE);
259 		default:
260 			abort();
261 		}
262 	return (*rp != END ? FALSE : TRUE);
263 }
264 
265 /*
266  * Reverse engineer the regular expression, by picking first of all alternates.
267  */
268 char *
269 rxp_expand()
270 {
271 	return (rxp__expand(TRUE));
272 }
273 
274 static char *
275 rxp__expand(first)
276 	int first;
277 {
278 	static char buf[RXP_LINE_SZ/2];
279 	static Rxp_t *rp;
280 	static char *bp;
281 	Rxp_t *grp_ptr;
282 	char *err;
283 
284 	if (first) {
285 		rp = rxpbuf;
286 		bp = buf;
287 	}
288 	while (rp < rxpbuf + RXP_LINE_SZ && *rp != END)
289 		switch(*rp) {
290 		case LIT:
291 			rp++;
292 			*bp++ = *rp++;
293 			break;
294 		case GRP_S:
295 			rp++;
296 			grp_ptr = rxpbuf + *rp;
297 			rp++;
298 			if ((err = rxp__expand(FALSE)) == NULL)
299 				return (err);
300 			rp = grp_ptr;
301 			break;
302 		case ALT_E:
303 			return (buf);
304 		case ALT_S:
305 			rp++;
306 			/* FALLTHROUGH */
307 		case SOT:
308 		case EOT:
309 		case GRP_E:
310 			rp++;
311 			break;
312 		default:
313 			return (NULL);
314 		}
315 	if (first) {
316 		if (*rp != END)
317 			return (NULL);
318 		*bp = '\0';
319 	}
320 	return (buf);
321 }
322