1 /* 2 * Copyright (c) 1980 Regents of the University of California. 3 * All rights reserved. The Berkeley software License Agreement 4 * specifies the terms and conditions for redistribution. 5 */ 6 7 #if defined(LIBC_SCCS) && !defined(lint) 8 static char sccsid[] = "@(#)regex.c 5.2 (Berkeley) 03/09/86"; 9 #endif LIBC_SCCS and not lint 10 11 # 12 13 /* 14 * routines to do regular expression matching 15 * 16 * Entry points: 17 * 18 * re_comp(s) 19 * char *s; 20 * ... returns 0 if the string s was compiled successfully, 21 * a pointer to an error message otherwise. 22 * If passed 0 or a null string returns without changing 23 * the currently compiled re (see note 11 below). 24 * 25 * re_exec(s) 26 * char *s; 27 * ... returns 1 if the string s matches the last compiled regular 28 * expression, 29 * 0 if the string s failed to match the last compiled 30 * regular expression, and 31 * -1 if the compiled regular expression was invalid 32 * (indicating an internal error). 33 * 34 * The strings passed to both re_comp and re_exec may have trailing or 35 * embedded newline characters; they are terminated by nulls. 36 * 37 * The identity of the author of these routines is lost in antiquity; 38 * this is essentially the same as the re code in the original V6 ed. 39 * 40 * The regular expressions recognized are described below. This description 41 * is essentially the same as that for ed. 42 * 43 * A regular expression specifies a set of strings of characters. 44 * A member of this set of strings is said to be matched by 45 * the regular expression. In the following specification for 46 * regular expressions the word `character' means any character but NUL. 47 * 48 * 1. Any character except a special character matches itself. 49 * Special characters are the regular expression delimiter plus 50 * \ [ . and sometimes ^ * $. 51 * 2. A . matches any character. 52 * 3. A \ followed by any character except a digit or ( ) 53 * matches that character. 54 * 4. A nonempty string s bracketed [s] (or [^s]) matches any 55 * character in (or not in) s. In s, \ has no special meaning, 56 * and ] may only appear as the first letter. A substring 57 * a-b, with a and b in ascending ASCII order, stands for 58 * the inclusive range of ASCII characters. 59 * 5. A regular expression of form 1-4 followed by * matches a 60 * sequence of 0 or more matches of the regular expression. 61 * 6. A regular expression, x, of form 1-8, bracketed \(x\) 62 * matches what x matches. 63 * 7. A \ followed by a digit n matches a copy of the string that the 64 * bracketed regular expression beginning with the nth \( matched. 65 * 8. A regular expression of form 1-8, x, followed by a regular 66 * expression of form 1-7, y matches a match for x followed by 67 * a match for y, with the x match being as long as possible 68 * while still permitting a y match. 69 * 9. A regular expression of form 1-8 preceded by ^ (or followed 70 * by $), is constrained to matches that begin at the left 71 * (or end at the right) end of a line. 72 * 10. A regular expression of form 1-9 picks out the longest among 73 * the leftmost matches in a line. 74 * 11. An empty regular expression stands for a copy of the last 75 * regular expression encountered. 76 */ 77 78 /* 79 * constants for re's 80 */ 81 #define CBRA 1 82 #define CCHR 2 83 #define CDOT 4 84 #define CCL 6 85 #define NCCL 8 86 #define CDOL 10 87 #define CEOF 11 88 #define CKET 12 89 #define CBACK 18 90 91 #define CSTAR 01 92 93 #define ESIZE 512 94 #define NBRA 9 95 96 static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; 97 static char circf; 98 99 /* 100 * compile the regular expression argument into a dfa 101 */ 102 char * 103 re_comp(sp) 104 register char *sp; 105 { 106 register int c; 107 register char *ep = expbuf; 108 int cclcnt, numbra = 0; 109 char *lastep = 0; 110 char bracket[NBRA]; 111 char *bracketp = &bracket[0]; 112 static char *retoolong = "Regular expression too long"; 113 114 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } 115 116 if (sp == 0 || *sp == '\0') { 117 if (*ep == 0) 118 return("No previous regular expression"); 119 return(0); 120 } 121 if (*sp == '^') { 122 circf = 1; 123 sp++; 124 } 125 else 126 circf = 0; 127 for (;;) { 128 if (ep >= &expbuf[ESIZE]) 129 comerr(retoolong); 130 if ((c = *sp++) == '\0') { 131 if (bracketp != bracket) 132 comerr("unmatched \\("); 133 *ep++ = CEOF; 134 *ep++ = 0; 135 return(0); 136 } 137 if (c != '*') 138 lastep = ep; 139 switch (c) { 140 141 case '.': 142 *ep++ = CDOT; 143 continue; 144 145 case '*': 146 if (lastep == 0 || *lastep == CBRA || *lastep == CKET) 147 goto defchar; 148 *lastep |= CSTAR; 149 continue; 150 151 case '$': 152 if (*sp != '\0') 153 goto defchar; 154 *ep++ = CDOL; 155 continue; 156 157 case '[': 158 *ep++ = CCL; 159 *ep++ = 0; 160 cclcnt = 1; 161 if ((c = *sp++) == '^') { 162 c = *sp++; 163 ep[-2] = NCCL; 164 } 165 do { 166 if (c == '\0') 167 comerr("missing ]"); 168 if (c == '-' && ep [-1] != 0) { 169 if ((c = *sp++) == ']') { 170 *ep++ = '-'; 171 cclcnt++; 172 break; 173 } 174 while (ep[-1] < c) { 175 *ep = ep[-1] + 1; 176 ep++; 177 cclcnt++; 178 if (ep >= &expbuf[ESIZE]) 179 comerr(retoolong); 180 } 181 } 182 *ep++ = c; 183 cclcnt++; 184 if (ep >= &expbuf[ESIZE]) 185 comerr(retoolong); 186 } while ((c = *sp++) != ']'); 187 lastep[1] = cclcnt; 188 continue; 189 190 case '\\': 191 if ((c = *sp++) == '(') { 192 if (numbra >= NBRA) 193 comerr("too many \\(\\) pairs"); 194 *bracketp++ = numbra; 195 *ep++ = CBRA; 196 *ep++ = numbra++; 197 continue; 198 } 199 if (c == ')') { 200 if (bracketp <= bracket) 201 comerr("unmatched \\)"); 202 *ep++ = CKET; 203 *ep++ = *--bracketp; 204 continue; 205 } 206 if (c >= '1' && c < ('1' + NBRA)) { 207 *ep++ = CBACK; 208 *ep++ = c - '1'; 209 continue; 210 } 211 *ep++ = CCHR; 212 *ep++ = c; 213 continue; 214 215 defchar: 216 default: 217 *ep++ = CCHR; 218 *ep++ = c; 219 } 220 } 221 } 222 223 /* 224 * match the argument string against the compiled re 225 */ 226 int 227 re_exec(p1) 228 register char *p1; 229 { 230 register char *p2 = expbuf; 231 register int c; 232 int rv; 233 234 for (c = 0; c < NBRA; c++) { 235 braslist[c] = 0; 236 braelist[c] = 0; 237 } 238 if (circf) 239 return((advance(p1, p2))); 240 /* 241 * fast check for first character 242 */ 243 if (*p2 == CCHR) { 244 c = p2[1]; 245 do { 246 if (*p1 != c) 247 continue; 248 if (rv = advance(p1, p2)) 249 return(rv); 250 } while (*p1++); 251 return(0); 252 } 253 /* 254 * regular algorithm 255 */ 256 do 257 if (rv = advance(p1, p2)) 258 return(rv); 259 while (*p1++); 260 return(0); 261 } 262 263 /* 264 * try to match the next thing in the dfa 265 */ 266 static int 267 advance(lp, ep) 268 register char *lp, *ep; 269 { 270 register char *curlp; 271 int ct, i; 272 int rv; 273 274 for (;;) 275 switch (*ep++) { 276 277 case CCHR: 278 if (*ep++ == *lp++) 279 continue; 280 return(0); 281 282 case CDOT: 283 if (*lp++) 284 continue; 285 return(0); 286 287 case CDOL: 288 if (*lp == '\0') 289 continue; 290 return(0); 291 292 case CEOF: 293 return(1); 294 295 case CCL: 296 if (cclass(ep, *lp++, 1)) { 297 ep += *ep; 298 continue; 299 } 300 return(0); 301 302 case NCCL: 303 if (cclass(ep, *lp++, 0)) { 304 ep += *ep; 305 continue; 306 } 307 return(0); 308 309 case CBRA: 310 braslist[*ep++] = lp; 311 continue; 312 313 case CKET: 314 braelist[*ep++] = lp; 315 continue; 316 317 case CBACK: 318 if (braelist[i = *ep++] == 0) 319 return(-1); 320 if (backref(i, lp)) { 321 lp += braelist[i] - braslist[i]; 322 continue; 323 } 324 return(0); 325 326 case CBACK|CSTAR: 327 if (braelist[i = *ep++] == 0) 328 return(-1); 329 curlp = lp; 330 ct = braelist[i] - braslist[i]; 331 while (backref(i, lp)) 332 lp += ct; 333 while (lp >= curlp) { 334 if (rv = advance(lp, ep)) 335 return(rv); 336 lp -= ct; 337 } 338 continue; 339 340 case CDOT|CSTAR: 341 curlp = lp; 342 while (*lp++) 343 ; 344 goto star; 345 346 case CCHR|CSTAR: 347 curlp = lp; 348 while (*lp++ == *ep) 349 ; 350 ep++; 351 goto star; 352 353 case CCL|CSTAR: 354 case NCCL|CSTAR: 355 curlp = lp; 356 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 357 ; 358 ep += *ep; 359 goto star; 360 361 star: 362 do { 363 lp--; 364 if (rv = advance(lp, ep)) 365 return(rv); 366 } while (lp > curlp); 367 return(0); 368 369 default: 370 return(-1); 371 } 372 } 373 374 backref(i, lp) 375 register int i; 376 register char *lp; 377 { 378 register char *bp; 379 380 bp = braslist[i]; 381 while (*bp++ == *lp++) 382 if (bp >= braelist[i]) 383 return(1); 384 return(0); 385 } 386 387 int 388 cclass(set, c, af) 389 register char *set, c; 390 int af; 391 { 392 register int n; 393 394 if (c == 0) 395 return(0); 396 n = *set++; 397 while (--n) 398 if (*set++ == c) 399 return(af); 400 return(! af); 401 } 402