1 /*- 2 * Copyright (c) 1980, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * %sccs.include.proprietary.c% 6 */ 7 8 #if defined(LIBC_SCCS) && !defined(lint) 9 static char sccsid[] = "@(#)regex.c 8.1 (Berkeley) 06/04/93"; 10 #endif /* LIBC_SCCS and not lint */ 11 12 #include <unistd.h> 13 14 static int advance(); 15 static int backref(); 16 static int cclass(); 17 18 /* 19 * routines to do regular expression matching 20 * 21 * Entry points: 22 * 23 * re_comp(s) 24 * char *s; 25 * ... returns 0 if the string s was compiled successfully, 26 * a pointer to an error message otherwise. 27 * If passed 0 or a null string returns without changing 28 * the currently compiled re (see note 11 below). 29 * 30 * re_exec(s) 31 * char *s; 32 * ... returns 1 if the string s matches the last compiled regular 33 * expression, 34 * 0 if the string s failed to match the last compiled 35 * regular expression, and 36 * -1 if the compiled regular expression was invalid 37 * (indicating an internal error). 38 * 39 * The strings passed to both re_comp and re_exec may have trailing or 40 * embedded newline characters; they are terminated by nulls. 41 * 42 * The identity of the author of these routines is lost in antiquity; 43 * this is essentially the same as the re code in the original V6 ed. 44 * 45 * The regular expressions recognized are described below. This description 46 * is essentially the same as that for ed. 47 * 48 * A regular expression specifies a set of strings of characters. 49 * A member of this set of strings is said to be matched by 50 * the regular expression. In the following specification for 51 * regular expressions the word `character' means any character but NUL. 52 * 53 * 1. Any character except a special character matches itself. 54 * Special characters are the regular expression delimiter plus 55 * \ [ . and sometimes ^ * $. 56 * 2. A . matches any character. 57 * 3. A \ followed by any character except a digit or ( ) 58 * matches that character. 59 * 4. A nonempty string s bracketed [s] (or [^s]) matches any 60 * character in (or not in) s. In s, \ has no special meaning, 61 * and ] may only appear as the first letter. A substring 62 * a-b, with a and b in ascending ASCII order, stands for 63 * the inclusive range of ASCII characters. 64 * 5. A regular expression of form 1-4 followed by * matches a 65 * sequence of 0 or more matches of the regular expression. 66 * 6. A regular expression, x, of form 1-8, bracketed \(x\) 67 * matches what x matches. 68 * 7. A \ followed by a digit n matches a copy of the string that the 69 * bracketed regular expression beginning with the nth \( matched. 70 * 8. A regular expression of form 1-8, x, followed by a regular 71 * expression of form 1-7, y matches a match for x followed by 72 * a match for y, with the x match being as long as possible 73 * while still permitting a y match. 74 * 9. A regular expression of form 1-8 preceded by ^ (or followed 75 * by $), is constrained to matches that begin at the left 76 * (or end at the right) end of a line. 77 * 10. A regular expression of form 1-9 picks out the longest among 78 * the leftmost matches in a line. 79 * 11. An empty regular expression stands for a copy of the last 80 * regular expression encountered. 81 */ 82 83 /* 84 * constants for re's 85 */ 86 #define CBRA 1 87 #define CCHR 2 88 #define CDOT 4 89 #define CCL 6 90 #define NCCL 8 91 #define CDOL 10 92 #define CEOF 11 93 #define CKET 12 94 #define CBACK 18 95 96 #define CSTAR 01 97 98 #define ESIZE 512 99 #define NBRA 9 100 101 static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; 102 static char circf; 103 104 /* 105 * compile the regular expression argument into a dfa 106 */ 107 char * 108 re_comp(sp) 109 register const char *sp; 110 { 111 register int c; 112 register char *ep = expbuf; 113 int cclcnt, numbra = 0; 114 char *lastep = 0; 115 char bracket[NBRA]; 116 char *bracketp = &bracket[0]; 117 static char *retoolong = "Regular expression too long"; 118 119 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } 120 121 if (sp == 0 || *sp == '\0') { 122 if (*ep == 0) 123 return("No previous regular expression"); 124 return(0); 125 } 126 if (*sp == '^') { 127 circf = 1; 128 sp++; 129 } 130 else 131 circf = 0; 132 for (;;) { 133 if (ep >= &expbuf[ESIZE]) 134 comerr(retoolong); 135 if ((c = *sp++) == '\0') { 136 if (bracketp != bracket) 137 comerr("unmatched \\("); 138 *ep++ = CEOF; 139 *ep++ = 0; 140 return(0); 141 } 142 if (c != '*') 143 lastep = ep; 144 switch (c) { 145 146 case '.': 147 *ep++ = CDOT; 148 continue; 149 150 case '*': 151 if (lastep == 0 || *lastep == CBRA || *lastep == CKET) 152 goto defchar; 153 *lastep |= CSTAR; 154 continue; 155 156 case '$': 157 if (*sp != '\0') 158 goto defchar; 159 *ep++ = CDOL; 160 continue; 161 162 case '[': 163 *ep++ = CCL; 164 *ep++ = 0; 165 cclcnt = 1; 166 if ((c = *sp++) == '^') { 167 c = *sp++; 168 ep[-2] = NCCL; 169 } 170 do { 171 if (c == '\0') 172 comerr("missing ]"); 173 if (c == '-' && ep [-1] != 0) { 174 if ((c = *sp++) == ']') { 175 *ep++ = '-'; 176 cclcnt++; 177 break; 178 } 179 while (ep[-1] < c) { 180 *ep = ep[-1] + 1; 181 ep++; 182 cclcnt++; 183 if (ep >= &expbuf[ESIZE]) 184 comerr(retoolong); 185 } 186 } 187 *ep++ = c; 188 cclcnt++; 189 if (ep >= &expbuf[ESIZE]) 190 comerr(retoolong); 191 } while ((c = *sp++) != ']'); 192 lastep[1] = cclcnt; 193 continue; 194 195 case '\\': 196 if ((c = *sp++) == '(') { 197 if (numbra >= NBRA) 198 comerr("too many \\(\\) pairs"); 199 *bracketp++ = numbra; 200 *ep++ = CBRA; 201 *ep++ = numbra++; 202 continue; 203 } 204 if (c == ')') { 205 if (bracketp <= bracket) 206 comerr("unmatched \\)"); 207 *ep++ = CKET; 208 *ep++ = *--bracketp; 209 continue; 210 } 211 if (c >= '1' && c < ('1' + NBRA)) { 212 *ep++ = CBACK; 213 *ep++ = c - '1'; 214 continue; 215 } 216 *ep++ = CCHR; 217 *ep++ = c; 218 continue; 219 220 defchar: 221 default: 222 *ep++ = CCHR; 223 *ep++ = c; 224 } 225 } 226 } 227 228 /* 229 * match the argument string against the compiled re 230 */ 231 int 232 re_exec(p1) 233 register const char *p1; 234 { 235 register char *p2 = expbuf; 236 register int c; 237 int rv; 238 239 for (c = 0; c < NBRA; c++) { 240 braslist[c] = 0; 241 braelist[c] = 0; 242 } 243 if (circf) 244 return((advance(p1, p2))); 245 /* 246 * fast check for first character 247 */ 248 if (*p2 == CCHR) { 249 c = p2[1]; 250 do { 251 if (*p1 != c) 252 continue; 253 if (rv = advance(p1, p2)) 254 return(rv); 255 } while (*p1++); 256 return(0); 257 } 258 /* 259 * regular algorithm 260 */ 261 do 262 if (rv = advance(p1, p2)) 263 return(rv); 264 while (*p1++); 265 return(0); 266 } 267 268 /* 269 * try to match the next thing in the dfa 270 */ 271 static int 272 advance(lp, ep) 273 register char *lp, *ep; 274 { 275 register char *curlp; 276 int ct, i; 277 int rv; 278 279 for (;;) 280 switch (*ep++) { 281 282 case CCHR: 283 if (*ep++ == *lp++) 284 continue; 285 return(0); 286 287 case CDOT: 288 if (*lp++) 289 continue; 290 return(0); 291 292 case CDOL: 293 if (*lp == '\0') 294 continue; 295 return(0); 296 297 case CEOF: 298 return(1); 299 300 case CCL: 301 if (cclass(ep, *lp++, 1)) { 302 ep += *ep; 303 continue; 304 } 305 return(0); 306 307 case NCCL: 308 if (cclass(ep, *lp++, 0)) { 309 ep += *ep; 310 continue; 311 } 312 return(0); 313 314 case CBRA: 315 braslist[*ep++] = lp; 316 continue; 317 318 case CKET: 319 braelist[*ep++] = lp; 320 continue; 321 322 case CBACK: 323 if (braelist[i = *ep++] == 0) 324 return(-1); 325 if (backref(i, lp)) { 326 lp += braelist[i] - braslist[i]; 327 continue; 328 } 329 return(0); 330 331 case CBACK|CSTAR: 332 if (braelist[i = *ep++] == 0) 333 return(-1); 334 curlp = lp; 335 ct = braelist[i] - braslist[i]; 336 while (backref(i, lp)) 337 lp += ct; 338 while (lp >= curlp) { 339 if (rv = advance(lp, ep)) 340 return(rv); 341 lp -= ct; 342 } 343 continue; 344 345 case CDOT|CSTAR: 346 curlp = lp; 347 while (*lp++) 348 ; 349 goto star; 350 351 case CCHR|CSTAR: 352 curlp = lp; 353 while (*lp++ == *ep) 354 ; 355 ep++; 356 goto star; 357 358 case CCL|CSTAR: 359 case NCCL|CSTAR: 360 curlp = lp; 361 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 362 ; 363 ep += *ep; 364 goto star; 365 366 star: 367 do { 368 lp--; 369 if (rv = advance(lp, ep)) 370 return(rv); 371 } while (lp > curlp); 372 return(0); 373 374 default: 375 return(-1); 376 } 377 } 378 379 static 380 backref(i, lp) 381 register int i; 382 register char *lp; 383 { 384 register char *bp; 385 386 bp = braslist[i]; 387 while (*bp++ == *lp++) 388 if (bp >= braelist[i]) 389 return(1); 390 return(0); 391 } 392 393 static int 394 cclass(set, c, af) 395 register char *set, c; 396 int af; 397 { 398 register int n; 399 400 if (c == 0) 401 return(0); 402 n = *set++; 403 while (--n) 404 if (*set++ == c) 405 return(af); 406 return(! af); 407 } 408