1 /* @(#)regex.c 4.1 (Berkeley) 12/21/80 */ 2 # 3 4 /* 5 * routines to do regular expression matching 6 * 7 * Entry points: 8 * 9 * re_comp(s) 10 * char *s; 11 * ... returns 0 if the string s was compiled successfully, 12 * a pointer to an error message otherwise. 13 * If passed 0 or a null string returns without changing 14 * the currently compiled re (see note 11 below). 15 * 16 * re_exec(s) 17 * char *s; 18 * ... returns 1 if the string s matches the last compiled regular 19 * expression, 20 * 0 if the string s failed to match the last compiled 21 * regular expression, and 22 * -1 if the compiled regular expression was invalid 23 * (indicating an internal error). 24 * 25 * The strings passed to both re_comp and re_exec may have trailing or 26 * embedded newline characters; they are terminated by nulls. 27 * 28 * The identity of the author of these routines is lost in antiquity; 29 * this is essentially the same as the re code in the original V6 ed. 30 * 31 * The regular expressions recognized are described below. This description 32 * is essentially the same as that for ed. 33 * 34 * A regular expression specifies a set of strings of characters. 35 * A member of this set of strings is said to be matched by 36 * the regular expression. In the following specification for 37 * regular expressions the word `character' means any character but NUL. 38 * 39 * 1. Any character except a special character matches itself. 40 * Special characters are the regular expression delimiter plus 41 * \ [ . and sometimes ^ * $. 42 * 2. A . matches any character. 43 * 3. A \ followed by any character except a digit or ( ) 44 * matches that character. 45 * 4. A nonempty string s bracketed [s] (or [^s]) matches any 46 * character in (or not in) s. In s, \ has no special meaning, 47 * and ] may only appear as the first letter. A substring 48 * a-b, with a and b in ascending ASCII order, stands for 49 * the inclusive range of ASCII characters. 50 * 5. A regular expression of form 1-4 followed by * matches a 51 * sequence of 0 or more matches of the regular expression. 52 * 6. A regular expression, x, of form 1-8, bracketed \(x\) 53 * matches what x matches. 54 * 7. A \ followed by a digit n matches a copy of the string that the 55 * bracketed regular expression beginning with the nth \( matched. 56 * 8. A regular expression of form 1-8, x, followed by a regular 57 * expression of form 1-7, y matches a match for x followed by 58 * a match for y, with the x match being as long as possible 59 * while still permitting a y match. 60 * 9. A regular expression of form 1-8 preceded by ^ (or followed 61 * by $), is constrained to matches that begin at the left 62 * (or end at the right) end of a line. 63 * 10. A regular expression of form 1-9 picks out the longest among 64 * the leftmost matches in a line. 65 * 11. An empty regular expression stands for a copy of the last 66 * regular expression encountered. 67 */ 68 69 /* 70 * constants for re's 71 */ 72 #define CBRA 1 73 #define CCHR 2 74 #define CDOT 4 75 #define CCL 6 76 #define NCCL 8 77 #define CDOL 10 78 #define CEOF 11 79 #define CKET 12 80 #define CBACK 18 81 82 #define CSTAR 01 83 84 #define ESIZE 512 85 #define NBRA 9 86 87 static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; 88 static char circf; 89 90 /* 91 * compile the regular expression argument into a dfa 92 */ 93 char * 94 re_comp(sp) 95 register char *sp; 96 { 97 register int c; 98 register char *ep = expbuf; 99 int cclcnt, numbra = 0; 100 char *lastep = 0; 101 char bracket[NBRA]; 102 char *bracketp = &bracket[0]; 103 static char *retoolong = "Regular expression too long"; 104 105 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } 106 107 if (sp == 0 || *sp == '\0') { 108 if (*ep == 0) 109 return("No previous regular expression"); 110 return(0); 111 } 112 if (*sp == '^') { 113 circf = 1; 114 sp++; 115 } 116 else 117 circf = 0; 118 for (;;) { 119 if (ep >= &expbuf[ESIZE]) 120 comerr(retoolong); 121 if ((c = *sp++) == '\0') { 122 if (bracketp != bracket) 123 comerr("unmatched \\("); 124 *ep++ = CEOF; 125 *ep++ = 0; 126 return(0); 127 } 128 if (c != '*') 129 lastep = ep; 130 switch (c) { 131 132 case '.': 133 *ep++ = CDOT; 134 continue; 135 136 case '*': 137 if (lastep == 0 || *lastep == CBRA || *lastep == CKET) 138 goto defchar; 139 *lastep |= CSTAR; 140 continue; 141 142 case '$': 143 if (*sp != '\0') 144 goto defchar; 145 *ep++ = CDOL; 146 continue; 147 148 case '[': 149 *ep++ = CCL; 150 *ep++ = 0; 151 cclcnt = 1; 152 if ((c = *sp++) == '^') { 153 c = *sp++; 154 ep[-2] = NCCL; 155 } 156 do { 157 if (c == '\0') 158 comerr("missing ]"); 159 if (c == '-' && ep [-1] != 0) { 160 if ((c = *sp++) == ']') { 161 *ep++ = '-'; 162 cclcnt++; 163 break; 164 } 165 while (ep[-1] < c) { 166 *ep = ep[-1] + 1; 167 ep++; 168 cclcnt++; 169 if (ep >= &expbuf[ESIZE]) 170 comerr(retoolong); 171 } 172 } 173 *ep++ = c; 174 cclcnt++; 175 if (ep >= &expbuf[ESIZE]) 176 comerr(retoolong); 177 } while ((c = *sp++) != ']'); 178 lastep[1] = cclcnt; 179 continue; 180 181 case '\\': 182 if ((c = *sp++) == '(') { 183 if (numbra >= NBRA) 184 comerr("too many \\(\\) pairs"); 185 *bracketp++ = numbra; 186 *ep++ = CBRA; 187 *ep++ = numbra++; 188 continue; 189 } 190 if (c == ')') { 191 if (bracketp <= bracket) 192 comerr("unmatched \\)"); 193 *ep++ = CKET; 194 *ep++ = *--bracketp; 195 continue; 196 } 197 if (c >= '1' && c < ('1' + NBRA)) { 198 *ep++ = CBACK; 199 *ep++ = c - '1'; 200 continue; 201 } 202 *ep++ = CCHR; 203 *ep++ = c; 204 continue; 205 206 defchar: 207 default: 208 *ep++ = CCHR; 209 *ep++ = c; 210 } 211 } 212 } 213 214 /* 215 * match the argument string against the compiled re 216 */ 217 int 218 re_exec(p1) 219 register char *p1; 220 { 221 register char *p2 = expbuf; 222 register int c; 223 int rv; 224 225 for (c = 0; c < NBRA; c++) { 226 braslist[c] = 0; 227 braelist[c] = 0; 228 } 229 if (circf) 230 return((advance(p1, p2))); 231 /* 232 * fast check for first character 233 */ 234 if (*p2 == CCHR) { 235 c = p2[1]; 236 do { 237 if (*p1 != c) 238 continue; 239 if (rv = advance(p1, p2)) 240 return(rv); 241 } while (*p1++); 242 return(0); 243 } 244 /* 245 * regular algorithm 246 */ 247 do 248 if (rv = advance(p1, p2)) 249 return(rv); 250 while (*p1++); 251 return(0); 252 } 253 254 /* 255 * try to match the next thing in the dfa 256 */ 257 static int 258 advance(lp, ep) 259 register char *lp, *ep; 260 { 261 register char *curlp; 262 int ct, i; 263 int rv; 264 265 for (;;) 266 switch (*ep++) { 267 268 case CCHR: 269 if (*ep++ == *lp++) 270 continue; 271 return(0); 272 273 case CDOT: 274 if (*lp++) 275 continue; 276 return(0); 277 278 case CDOL: 279 if (*lp == '\0') 280 continue; 281 return(0); 282 283 case CEOF: 284 return(1); 285 286 case CCL: 287 if (cclass(ep, *lp++, 1)) { 288 ep += *ep; 289 continue; 290 } 291 return(0); 292 293 case NCCL: 294 if (cclass(ep, *lp++, 0)) { 295 ep += *ep; 296 continue; 297 } 298 return(0); 299 300 case CBRA: 301 braslist[*ep++] = lp; 302 continue; 303 304 case CKET: 305 braelist[*ep++] = lp; 306 continue; 307 308 case CBACK: 309 if (braelist[i = *ep++] == 0) 310 return(-1); 311 if (backref(i, lp)) { 312 lp += braelist[i] - braslist[i]; 313 continue; 314 } 315 return(0); 316 317 case CBACK|CSTAR: 318 if (braelist[i = *ep++] == 0) 319 return(-1); 320 curlp = lp; 321 ct = braelist[i] - braslist[i]; 322 while (backref(i, lp)) 323 lp += ct; 324 while (lp >= curlp) { 325 if (rv = advance(lp, ep)) 326 return(rv); 327 lp -= ct; 328 } 329 continue; 330 331 case CDOT|CSTAR: 332 curlp = lp; 333 while (*lp++) 334 ; 335 goto star; 336 337 case CCHR|CSTAR: 338 curlp = lp; 339 while (*lp++ == *ep) 340 ; 341 ep++; 342 goto star; 343 344 case CCL|CSTAR: 345 case NCCL|CSTAR: 346 curlp = lp; 347 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 348 ; 349 ep += *ep; 350 goto star; 351 352 star: 353 do { 354 lp--; 355 if (rv = advance(lp, ep)) 356 return(rv); 357 } while (lp > curlp); 358 return(0); 359 360 default: 361 return(-1); 362 } 363 } 364 365 backref(i, lp) 366 register int i; 367 register char *lp; 368 { 369 register char *bp; 370 371 bp = braslist[i]; 372 while (*bp++ == *lp++) 373 if (bp >= braelist[i]) 374 return(1); 375 return(0); 376 } 377 378 int 379 cclass(set, c, af) 380 register char *set, c; 381 int af; 382 { 383 register int n; 384 385 if (c == 0) 386 return(0); 387 n = *set++; 388 while (--n) 389 if (*set++ == c) 390 return(af); 391 return(! af); 392 } 393