1 /* $NetBSD: rxp.c,v 1.10 2002/12/06 01:54:55 thorpej Exp $ */ 2 3 /*- 4 * Copyright (c) 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Jim R. Oldroyd at The Instruction Set and Keith Gabryelski at 9 * Commodore Business Machines. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 */ 39 40 #include <sys/cdefs.h> 41 #ifndef lint 42 #if 0 43 static char sccsid[] = "@(#)rxp.c 8.1 (Berkeley) 5/31/93"; 44 #else 45 __RCSID("$NetBSD: rxp.c,v 1.10 2002/12/06 01:54:55 thorpej Exp $"); 46 #endif 47 #endif /* not lint */ 48 49 /* 50 * regular expression parser 51 * 52 * external functions and return values are: 53 * rxp_compile(s) 54 * TRUE success 55 * FALSE parse failure; error message will be in char rxperr[] 56 * metas are: 57 * {...} optional pattern, equialent to [...|] 58 * | alternate pattern 59 * [...] pattern delimiters 60 * 61 * rxp_match(s) 62 * TRUE string s matches compiled pattern 63 * FALSE match failure or regexp error 64 * 65 * rxp_expand() 66 * char * reverse-engineered regular expression string 67 * NULL regexp error 68 */ 69 70 #include <stdio.h> 71 #include <stdlib.h> 72 #include <ctype.h> 73 #include "quiz.h" 74 /* regexp tokens, arg */ 75 #define LIT (-1) /* literal character, char */ 76 #define SOT (-2) /* start text anchor, - */ 77 #define EOT (-3) /* end text anchor, - */ 78 #define GRP_S (-4) /* start alternate grp, ptr_to_end */ 79 #define GRP_E (-5) /* end group, - */ 80 #define ALT_S (-6) /* alternate starts, ptr_to_next */ 81 #define ALT_E (-7) /* alternate ends, - */ 82 #define END (-8) /* end of regexp, - */ 83 84 typedef short Rxp_t; /* type for regexp tokens */ 85 86 static Rxp_t rxpbuf[RXP_LINE_SZ]; /* compiled regular expression buffer */ 87 char rxperr[128]; /* parser error message */ 88 89 static int rxp__compile __P((const char *, int)); 90 static char *rxp__expand __P((int)); 91 static int rxp__match __P((const char *, int, Rxp_t *, Rxp_t *, const char *)); 92 93 int 94 rxp_compile(s) 95 const char * s; 96 { 97 return (rxp__compile(s, TRUE)); 98 } 99 100 static int 101 rxp__compile(s, first) 102 const char *s; 103 int first; 104 { 105 static Rxp_t *rp; 106 static const char *sp; 107 Rxp_t *grp_ptr; 108 Rxp_t *alt_ptr; 109 int esc, err; 110 111 esc = 0; 112 if (first) { 113 rp = rxpbuf; 114 sp = s; 115 *rp++ = SOT; /* auto-anchor: pat is really ^pat$ */ 116 *rp++ = GRP_S; /* auto-group: ^pat$ is really ^[pat]$ */ 117 *rp++ = 0; 118 } 119 *rp++ = ALT_S; 120 alt_ptr = rp; 121 *rp++ = 0; 122 for (; *sp; ++sp) { 123 if (rp - rxpbuf >= RXP_LINE_SZ - 4) { 124 (void)snprintf(rxperr, sizeof(rxperr), 125 "regular expression too long %s", s); 126 return (FALSE); 127 } 128 if (*sp == ':' && !esc) 129 break; 130 if (esc) { 131 *rp++ = LIT; 132 *rp++ = *sp; 133 esc = 0; 134 } 135 else switch (*sp) { 136 case '\\': 137 esc = 1; 138 break; 139 case '{': 140 case '[': 141 *rp++ = GRP_S; 142 grp_ptr = rp; 143 *rp++ = 0; 144 sp++; 145 if ((err = rxp__compile(s, FALSE)) != TRUE) 146 return (err); 147 *rp++ = GRP_E; 148 *grp_ptr = rp - rxpbuf; 149 break; 150 case '}': 151 case ']': 152 case '|': 153 *rp++ = ALT_E; 154 *alt_ptr = rp - rxpbuf; 155 if (*sp != ']') { 156 *rp++ = ALT_S; 157 alt_ptr = rp; 158 *rp++ = 0; 159 } 160 if (*sp != '|') { 161 if (*sp != ']') { 162 *rp++ = ALT_E; 163 *alt_ptr = rp - rxpbuf; 164 } 165 if (first) { 166 (void)snprintf(rxperr, sizeof(rxperr), 167 "unmatched alternator in regexp %s", 168 s); 169 return (FALSE); 170 } 171 return (TRUE); 172 } 173 break; 174 default: 175 *rp++ = LIT; 176 *rp++ = *sp; 177 esc = 0; 178 break; 179 } 180 } 181 if (!first) { 182 (void)snprintf(rxperr, sizeof(rxperr), 183 "unmatched alternator in regexp %s", s); 184 return (FALSE); 185 } 186 *rp++ = ALT_E; 187 *alt_ptr = rp - rxpbuf; 188 *rp++ = GRP_E; 189 *(rxpbuf + 2) = rp - rxpbuf; 190 *rp++ = EOT; 191 *rp = END; 192 return (TRUE); 193 } 194 195 /* 196 * match string against compiled regular expression 197 */ 198 int 199 rxp_match(s) 200 const char * s; 201 { 202 return (rxp__match(s, TRUE, NULL, NULL, NULL)); 203 } 204 205 static int 206 rxp__match(s, first, j_succ, j_fail, sp_fail) 207 const char *s; 208 int first; 209 Rxp_t *j_succ; /* jump here on successful alt match */ 210 Rxp_t *j_fail; /* jump here on failed match */ 211 const char *sp_fail; /* reset sp to here on failed match */ 212 { 213 static Rxp_t *rp; 214 static const char *sp; 215 int ch; 216 Rxp_t *grp_end = NULL; 217 218 if (first) { 219 rp = rxpbuf; 220 sp = s; 221 } 222 while (rp < rxpbuf + RXP_LINE_SZ && *rp != END) 223 switch(*rp) { 224 case LIT: 225 rp++; 226 ch = isascii(*rp) && isupper(*rp) ? tolower(*rp) : *rp; 227 if (ch != *sp++) { 228 rp = j_fail; 229 sp = sp_fail; 230 return (FALSE); 231 } 232 rp++; 233 break; 234 case SOT: 235 if (sp != s) 236 return (FALSE); 237 rp++; 238 break; 239 case EOT: 240 if (*sp != 0) 241 return (FALSE); 242 rp++; 243 break; 244 case GRP_S: 245 rp++; 246 grp_end = rxpbuf + *rp++; 247 break; 248 case ALT_S: 249 rp++; 250 rxp__match(sp, FALSE, grp_end, rxpbuf + *rp++, sp); 251 break; 252 case ALT_E: 253 rp = j_succ; 254 return (TRUE); 255 case GRP_E: 256 rp = j_fail; 257 sp = sp_fail; 258 return (FALSE); 259 default: 260 abort(); 261 } 262 return (*rp != END ? FALSE : TRUE); 263 } 264 265 /* 266 * Reverse engineer the regular expression, by picking first of all alternates. 267 */ 268 char * 269 rxp_expand() 270 { 271 return (rxp__expand(TRUE)); 272 } 273 274 static char * 275 rxp__expand(first) 276 int first; 277 { 278 static char buf[RXP_LINE_SZ/2]; 279 static Rxp_t *rp; 280 static char *bp; 281 Rxp_t *grp_ptr; 282 char *err; 283 284 if (first) { 285 rp = rxpbuf; 286 bp = buf; 287 } 288 while (rp < rxpbuf + RXP_LINE_SZ && *rp != END) 289 switch(*rp) { 290 case LIT: 291 rp++; 292 *bp++ = *rp++; 293 break; 294 case GRP_S: 295 rp++; 296 grp_ptr = rxpbuf + *rp; 297 rp++; 298 if ((err = rxp__expand(FALSE)) == NULL) 299 return (err); 300 rp = grp_ptr; 301 break; 302 case ALT_E: 303 return (buf); 304 case ALT_S: 305 rp++; 306 /* FALLTHROUGH */ 307 case SOT: 308 case EOT: 309 case GRP_E: 310 rp++; 311 break; 312 default: 313 return (NULL); 314 } 315 if (first) { 316 if (*rp != END) 317 return (NULL); 318 *bp = '\0'; 319 } 320 return (buf); 321 } 322