1 /* 2 * Copyright (c) 1980 Regents of the University of California. 3 * All rights reserved. The Berkeley software License Agreement 4 * specifies the terms and conditions for redistribution. 5 */ 6 7 #if defined(LIBC_SCCS) && !defined(lint) 8 static char sccsid[] = "@(#)regex.c 5.4 (Berkeley) 02/23/91"; 9 #endif LIBC_SCCS and not lint 10 11 #include <unistd.h> 12 13 static int advance(); 14 static int backref(); 15 static int cclass(); 16 17 /* 18 * routines to do regular expression matching 19 * 20 * Entry points: 21 * 22 * re_comp(s) 23 * char *s; 24 * ... returns 0 if the string s was compiled successfully, 25 * a pointer to an error message otherwise. 26 * If passed 0 or a null string returns without changing 27 * the currently compiled re (see note 11 below). 28 * 29 * re_exec(s) 30 * char *s; 31 * ... returns 1 if the string s matches the last compiled regular 32 * expression, 33 * 0 if the string s failed to match the last compiled 34 * regular expression, and 35 * -1 if the compiled regular expression was invalid 36 * (indicating an internal error). 37 * 38 * The strings passed to both re_comp and re_exec may have trailing or 39 * embedded newline characters; they are terminated by nulls. 40 * 41 * The identity of the author of these routines is lost in antiquity; 42 * this is essentially the same as the re code in the original V6 ed. 43 * 44 * The regular expressions recognized are described below. This description 45 * is essentially the same as that for ed. 46 * 47 * A regular expression specifies a set of strings of characters. 48 * A member of this set of strings is said to be matched by 49 * the regular expression. In the following specification for 50 * regular expressions the word `character' means any character but NUL. 51 * 52 * 1. Any character except a special character matches itself. 53 * Special characters are the regular expression delimiter plus 54 * \ [ . and sometimes ^ * $. 55 * 2. A . matches any character. 56 * 3. A \ followed by any character except a digit or ( ) 57 * matches that character. 58 * 4. A nonempty string s bracketed [s] (or [^s]) matches any 59 * character in (or not in) s. In s, \ has no special meaning, 60 * and ] may only appear as the first letter. A substring 61 * a-b, with a and b in ascending ASCII order, stands for 62 * the inclusive range of ASCII characters. 63 * 5. A regular expression of form 1-4 followed by * matches a 64 * sequence of 0 or more matches of the regular expression. 65 * 6. A regular expression, x, of form 1-8, bracketed \(x\) 66 * matches what x matches. 67 * 7. A \ followed by a digit n matches a copy of the string that the 68 * bracketed regular expression beginning with the nth \( matched. 69 * 8. A regular expression of form 1-8, x, followed by a regular 70 * expression of form 1-7, y matches a match for x followed by 71 * a match for y, with the x match being as long as possible 72 * while still permitting a y match. 73 * 9. A regular expression of form 1-8 preceded by ^ (or followed 74 * by $), is constrained to matches that begin at the left 75 * (or end at the right) end of a line. 76 * 10. A regular expression of form 1-9 picks out the longest among 77 * the leftmost matches in a line. 78 * 11. An empty regular expression stands for a copy of the last 79 * regular expression encountered. 80 */ 81 82 /* 83 * constants for re's 84 */ 85 #define CBRA 1 86 #define CCHR 2 87 #define CDOT 4 88 #define CCL 6 89 #define NCCL 8 90 #define CDOL 10 91 #define CEOF 11 92 #define CKET 12 93 #define CBACK 18 94 95 #define CSTAR 01 96 97 #define ESIZE 512 98 #define NBRA 9 99 100 static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; 101 static char circf; 102 103 /* 104 * compile the regular expression argument into a dfa 105 */ 106 char * 107 re_comp(sp) 108 register const char *sp; 109 { 110 register int c; 111 register char *ep = expbuf; 112 int cclcnt, numbra = 0; 113 char *lastep = 0; 114 char bracket[NBRA]; 115 char *bracketp = &bracket[0]; 116 static char *retoolong = "Regular expression too long"; 117 118 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } 119 120 if (sp == 0 || *sp == '\0') { 121 if (*ep == 0) 122 return("No previous regular expression"); 123 return(0); 124 } 125 if (*sp == '^') { 126 circf = 1; 127 sp++; 128 } 129 else 130 circf = 0; 131 for (;;) { 132 if (ep >= &expbuf[ESIZE]) 133 comerr(retoolong); 134 if ((c = *sp++) == '\0') { 135 if (bracketp != bracket) 136 comerr("unmatched \\("); 137 *ep++ = CEOF; 138 *ep++ = 0; 139 return(0); 140 } 141 if (c != '*') 142 lastep = ep; 143 switch (c) { 144 145 case '.': 146 *ep++ = CDOT; 147 continue; 148 149 case '*': 150 if (lastep == 0 || *lastep == CBRA || *lastep == CKET) 151 goto defchar; 152 *lastep |= CSTAR; 153 continue; 154 155 case '$': 156 if (*sp != '\0') 157 goto defchar; 158 *ep++ = CDOL; 159 continue; 160 161 case '[': 162 *ep++ = CCL; 163 *ep++ = 0; 164 cclcnt = 1; 165 if ((c = *sp++) == '^') { 166 c = *sp++; 167 ep[-2] = NCCL; 168 } 169 do { 170 if (c == '\0') 171 comerr("missing ]"); 172 if (c == '-' && ep [-1] != 0) { 173 if ((c = *sp++) == ']') { 174 *ep++ = '-'; 175 cclcnt++; 176 break; 177 } 178 while (ep[-1] < c) { 179 *ep = ep[-1] + 1; 180 ep++; 181 cclcnt++; 182 if (ep >= &expbuf[ESIZE]) 183 comerr(retoolong); 184 } 185 } 186 *ep++ = c; 187 cclcnt++; 188 if (ep >= &expbuf[ESIZE]) 189 comerr(retoolong); 190 } while ((c = *sp++) != ']'); 191 lastep[1] = cclcnt; 192 continue; 193 194 case '\\': 195 if ((c = *sp++) == '(') { 196 if (numbra >= NBRA) 197 comerr("too many \\(\\) pairs"); 198 *bracketp++ = numbra; 199 *ep++ = CBRA; 200 *ep++ = numbra++; 201 continue; 202 } 203 if (c == ')') { 204 if (bracketp <= bracket) 205 comerr("unmatched \\)"); 206 *ep++ = CKET; 207 *ep++ = *--bracketp; 208 continue; 209 } 210 if (c >= '1' && c < ('1' + NBRA)) { 211 *ep++ = CBACK; 212 *ep++ = c - '1'; 213 continue; 214 } 215 *ep++ = CCHR; 216 *ep++ = c; 217 continue; 218 219 defchar: 220 default: 221 *ep++ = CCHR; 222 *ep++ = c; 223 } 224 } 225 } 226 227 /* 228 * match the argument string against the compiled re 229 */ 230 int 231 re_exec(p1) 232 register const char *p1; 233 { 234 register char *p2 = expbuf; 235 register int c; 236 int rv; 237 238 for (c = 0; c < NBRA; c++) { 239 braslist[c] = 0; 240 braelist[c] = 0; 241 } 242 if (circf) 243 return((advance(p1, p2))); 244 /* 245 * fast check for first character 246 */ 247 if (*p2 == CCHR) { 248 c = p2[1]; 249 do { 250 if (*p1 != c) 251 continue; 252 if (rv = advance(p1, p2)) 253 return(rv); 254 } while (*p1++); 255 return(0); 256 } 257 /* 258 * regular algorithm 259 */ 260 do 261 if (rv = advance(p1, p2)) 262 return(rv); 263 while (*p1++); 264 return(0); 265 } 266 267 /* 268 * try to match the next thing in the dfa 269 */ 270 static int 271 advance(lp, ep) 272 register char *lp, *ep; 273 { 274 register char *curlp; 275 int ct, i; 276 int rv; 277 278 for (;;) 279 switch (*ep++) { 280 281 case CCHR: 282 if (*ep++ == *lp++) 283 continue; 284 return(0); 285 286 case CDOT: 287 if (*lp++) 288 continue; 289 return(0); 290 291 case CDOL: 292 if (*lp == '\0') 293 continue; 294 return(0); 295 296 case CEOF: 297 return(1); 298 299 case CCL: 300 if (cclass(ep, *lp++, 1)) { 301 ep += *ep; 302 continue; 303 } 304 return(0); 305 306 case NCCL: 307 if (cclass(ep, *lp++, 0)) { 308 ep += *ep; 309 continue; 310 } 311 return(0); 312 313 case CBRA: 314 braslist[*ep++] = lp; 315 continue; 316 317 case CKET: 318 braelist[*ep++] = lp; 319 continue; 320 321 case CBACK: 322 if (braelist[i = *ep++] == 0) 323 return(-1); 324 if (backref(i, lp)) { 325 lp += braelist[i] - braslist[i]; 326 continue; 327 } 328 return(0); 329 330 case CBACK|CSTAR: 331 if (braelist[i = *ep++] == 0) 332 return(-1); 333 curlp = lp; 334 ct = braelist[i] - braslist[i]; 335 while (backref(i, lp)) 336 lp += ct; 337 while (lp >= curlp) { 338 if (rv = advance(lp, ep)) 339 return(rv); 340 lp -= ct; 341 } 342 continue; 343 344 case CDOT|CSTAR: 345 curlp = lp; 346 while (*lp++) 347 ; 348 goto star; 349 350 case CCHR|CSTAR: 351 curlp = lp; 352 while (*lp++ == *ep) 353 ; 354 ep++; 355 goto star; 356 357 case CCL|CSTAR: 358 case NCCL|CSTAR: 359 curlp = lp; 360 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 361 ; 362 ep += *ep; 363 goto star; 364 365 star: 366 do { 367 lp--; 368 if (rv = advance(lp, ep)) 369 return(rv); 370 } while (lp > curlp); 371 return(0); 372 373 default: 374 return(-1); 375 } 376 } 377 378 static 379 backref(i, lp) 380 register int i; 381 register char *lp; 382 { 383 register char *bp; 384 385 bp = braslist[i]; 386 while (*bp++ == *lp++) 387 if (bp >= braelist[i]) 388 return(1); 389 return(0); 390 } 391 392 static int 393 cclass(set, c, af) 394 register char *set, c; 395 int af; 396 { 397 register int n; 398 399 if (c == 0) 400 return(0); 401 n = *set++; 402 while (--n) 403 if (*set++ == c) 404 return(af); 405 return(! af); 406 } 407