1 static char sccsid[] = "@(#)regexp.c 4.1 (Berkeley) 10/19/82"; 2 3 #include <ctype.h> 4 5 typedef int boolean; 6 #define TRUE 1 7 #define FALSE 0 8 #define NIL 0 9 10 boolean l_onecase; /* true if upper and lower equivalent */ 11 12 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 13 14 /* STRNCMP - like strncmp except that we convert the 15 * first string to lower case before comparing 16 * if l_onecase is set. 17 */ 18 19 STRNCMP(s1, s2, len) 20 register char *s1,*s2; 21 register int len; 22 { 23 if (l_onecase) { 24 do 25 if (*s2 - makelower(*s1)) 26 return (*s2 - makelower(*s1)); 27 else { 28 s2++; 29 s1++; 30 } 31 while (--len); 32 } else { 33 do 34 if (*s2 - *s1) 35 return (*s2 - *s1); 36 else { 37 s2++; 38 s1++; 39 } 40 while (--len); 41 } 42 return(0); 43 } 44 45 /* The following routine converts an irregular expression to 46 * internal format. 47 * 48 * Either meta symbols (\a \d or \p) or character strings or 49 * operations ( alternation or perenthesizing ) can be 50 * specified. Each starts with a descriptor byte. The descriptor 51 * byte has STR set for strings, META set for meta symbols 52 * and OPER set for operations. 53 * The descriptor byte can also have the OPT bit set if the object 54 * defined is optional. Also ALT can be set to indicate an alternation. 55 * 56 * For metasymbols the byte following the descriptor byte identities 57 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 58 * strings the byte after the descriptor is a character count for 59 * the string: 60 * 61 * meta symbols := descriptor 62 * symbol 63 * 64 * strings := descriptor 65 * character count 66 * the string 67 * 68 * operatins := descriptor 69 * symbol 70 * character count 71 */ 72 73 /* 74 * handy macros for accessing parts of match blocks 75 */ 76 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 77 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 78 79 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 80 #define OCNT(A) (*(A+2)) /* character count */ 81 #define ONEXT(A) (A+3) /* next character after the operation */ 82 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 83 84 #define SCNT(A) (*(A+1)) /* byte count of a string */ 85 #define SSTR(A) (A+2) /* address of the string */ 86 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 87 88 /* 89 * bit flags in the descriptor 90 */ 91 #define OPT 1 92 #define STR 2 93 #define META 4 94 #define ALT 8 95 #define OPER 16 96 97 char *ure; /* pointer current position in unconverted exp */ 98 char *ccre; /* pointer to current position in converted exp*/ 99 char *malloc(); 100 101 char * 102 convexp(re) 103 char *re; /* unconverted irregular expression */ 104 { 105 register char *cre; /* pointer to converted regular expression */ 106 107 /* allocate room for the converted expression */ 108 if (re == NIL) 109 return (NIL); 110 if (*re == '\0') 111 return (NIL); 112 cre = malloc (4 * strlen(re) + 3); 113 ccre = cre; 114 ure = re; 115 116 /* start the conversion with a \a */ 117 *cre = META | OPT; 118 MSYM(cre) = 'a'; 119 ccre = MNEXT(cre); 120 121 /* start the conversion (its recursive) */ 122 expconv (); 123 *ccre = 0; 124 return (cre); 125 } 126 127 expconv() 128 { 129 register char *cs; /* pointer to current symbol in converted exp */ 130 register char c; /* character being processed */ 131 register char *acs; /* pinter to last alternate */ 132 register int temp; 133 134 /* let the conversion begin */ 135 acs = NIL; 136 while (*ure != NIL) { 137 switch (c = *ure++) { 138 139 case '\\': 140 switch (c = *ure++) { 141 142 /* escaped characters are just characters */ 143 default: 144 if ((*cs & STR) == 0) { 145 cs = ccre; 146 *cs = STR; 147 SCNT(cs) = 1; 148 ccre += 2; 149 } else 150 SCNT(cs)++; 151 *ccre++ = c; 152 break; 153 154 /* normal(?) metacharacters */ 155 case 'a': 156 case 'd': 157 case 'e': 158 case 'p': 159 if (acs != NIL && acs != cs) { 160 do { 161 temp = OCNT(acs); 162 OCNT(acs) = ccre - acs; 163 acs -= temp; 164 } while (temp != 0); 165 acs = NIL; 166 } 167 cs = ccre; 168 *cs = META; 169 MSYM(cs) = c; 170 ccre = MNEXT(cs); 171 break; 172 } 173 break; 174 175 /* just put the symbol in */ 176 case '^': 177 case '$': 178 if (acs != NIL && acs != cs) { 179 do { 180 temp = OCNT(acs); 181 OCNT(acs) = ccre - acs; 182 acs -= temp; 183 } while (temp != 0); 184 acs = NIL; 185 } 186 cs = ccre; 187 *cs = META; 188 MSYM(cs) = c; 189 ccre = MNEXT(cs); 190 break; 191 192 /* mark the last match sequence as optional */ 193 case '?': 194 *cs = *cs | OPT; 195 break; 196 197 /* recurse and define a subexpression */ 198 case '(': 199 if (acs != NIL && acs != cs) { 200 do { 201 temp = OCNT(acs); 202 OCNT(acs) = ccre - acs; 203 acs -= temp; 204 } while (temp != 0); 205 acs = NIL; 206 } 207 cs = ccre; 208 *cs = OPER; 209 OSYM(cs) = '('; 210 ccre = ONEXT(cs); 211 expconv (); 212 OCNT(cs) = ccre - cs; /* offset to next symbol */ 213 break; 214 215 /* return from a recursion */ 216 case ')': 217 if (acs != NIL) { 218 do { 219 temp = OCNT(acs); 220 OCNT(acs) = ccre - acs; 221 acs -= temp; 222 } while (temp != 0); 223 acs = NIL; 224 } 225 cs = ccre; 226 *cs = META; 227 MSYM(cs) = c; 228 ccre = MNEXT(cs); 229 return; 230 231 /* mark the last match sequence as having an alternate */ 232 /* the third byte will contain an offset to jump over the */ 233 /* alternate match in case the first did not fail */ 234 case '|': 235 if (acs != NIL && acs != cs) 236 OCNT(ccre) = ccre - acs; /* make a back pointer */ 237 else 238 OCNT(ccre) = 0; 239 *cs |= ALT; 240 cs = ccre; 241 *cs = OPER; 242 OSYM(cs) = '|'; 243 ccre = ONEXT(cs); 244 acs = cs; /* remember that the pointer is to be filles */ 245 break; 246 247 /* if its not a metasymbol just build a scharacter string */ 248 default: 249 if ((*cs & STR) == 0) { 250 cs = ccre; 251 *cs = STR; 252 SCNT(cs) = 1; 253 ccre = SSTR(cs); 254 } else 255 SCNT(cs)++; 256 *ccre++ = c; 257 break; 258 } 259 } 260 if (acs != NIL) { 261 do { 262 temp = OCNT(acs); 263 OCNT(acs) = ccre - acs; 264 acs -= temp; 265 } while (temp != 0); 266 acs = NIL; 267 } 268 return; 269 } 270 /* end of convertre */ 271 272 273 /* 274 * The following routine recognises an irregular expresion 275 * with the following special characters: 276 * 277 * \? - means last match was optional 278 * \a - matches any number of characters 279 * \d - matches any number of spaces and tabs 280 * \p - matches any number of alphanumeric 281 * characters. The 282 * characters matched will be copied into 283 * the area pointed to by 'name'. 284 * \| - alternation 285 * \( \) - grouping used mostly for alternation and 286 * optionality 287 * 288 * The irregular expression must be translated to internal form 289 * prior to calling this routine 290 * 291 * The value returned is the pointer to the first non \a 292 * character matched. 293 */ 294 295 boolean _escaped; /* true if we are currently _escaped */ 296 char *_start; /* start of string */ 297 298 char * 299 expmatch (s, re, mstring) 300 register char *s; /* string to check for a match in */ 301 register char *re; /* a converted irregular expression */ 302 register char *mstring; /* where to put whatever matches a \p */ 303 { 304 register char *cs; /* the current symbol */ 305 register char *ptr,*s1; /* temporary pointer */ 306 boolean matched; /* a temporary boolean */ 307 308 /* initial conditions */ 309 if (re == NIL) 310 return (NIL); 311 cs = re; 312 matched = FALSE; 313 314 /* loop till expression string is exhausted (or at least pretty tired) */ 315 while (*cs) { 316 switch (*cs & (OPER | STR | META)) { 317 318 /* try to match a string */ 319 case STR: 320 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 321 if (matched) { 322 323 /* hoorah it matches */ 324 s += SCNT(cs); 325 cs = SNEXT(cs); 326 } else if (*cs & ALT) { 327 328 /* alternation, skip to next expression */ 329 cs = SNEXT(cs); 330 } else if (*cs & OPT) { 331 332 /* the match is optional */ 333 cs = SNEXT(cs); 334 matched = 1; /* indicate a successful match */ 335 } else { 336 337 /* no match, error return */ 338 return (NIL); 339 } 340 break; 341 342 /* an operator, do something fancy */ 343 case OPER: 344 switch (OSYM(cs)) { 345 346 /* this is an alternation */ 347 case '|': 348 if (matched) 349 350 /* last thing in the alternation was a match, skip ahead */ 351 cs = OPTR(cs); 352 else 353 354 /* no match, keep trying */ 355 cs = ONEXT(cs); 356 break; 357 358 /* this is a grouping, recurse */ 359 case '(': 360 ptr = expmatch (s, ONEXT(cs), mstring); 361 if (ptr != NIL) { 362 363 /* the subexpression matched */ 364 matched = 1; 365 s = ptr; 366 } else if (*cs & ALT) { 367 368 /* alternation, skip to next expression */ 369 matched = 0; 370 } else if (*cs & OPT) { 371 372 /* the match is optional */ 373 matched = 1; /* indicate a successful match */ 374 } else { 375 376 /* no match, error return */ 377 return (NIL); 378 } 379 cs = OPTR(cs); 380 break; 381 } 382 break; 383 384 /* try to match a metasymbol */ 385 case META: 386 switch (MSYM(cs)) { 387 388 /* try to match anything and remember what was matched */ 389 case 'p': 390 /* 391 * This is really the same as trying the match the 392 * remaining parts of the expression to any subset 393 * of the string. 394 */ 395 s1 = s; 396 do { 397 ptr = expmatch (s1, MNEXT(cs), mstring); 398 if (ptr != NIL && s1 != s) { 399 400 /* we have a match, remember the match */ 401 strncpy (mstring, s, s1 - s); 402 mstring[s1 - s] = '\0'; 403 return (ptr); 404 } else if (ptr != NIL && (*cs & OPT)) { 405 406 /* it was aoptional so no match is ok */ 407 return (ptr); 408 } else if (ptr != NIL) { 409 410 /* not optional and we still matched */ 411 return (NIL); 412 } 413 if (!isalnum(*s1) && *s1 != '_') 414 return (NIL); 415 if (*s1 == '\\') 416 _escaped = _escaped ? FALSE : TRUE; 417 else 418 _escaped = FALSE; 419 } while (*s1++); 420 return (NIL); 421 422 /* try to match anything */ 423 case 'a': 424 /* 425 * This is really the same as trying the match the 426 * remaining parts of the expression to any subset 427 * of the string. 428 */ 429 s1 = s; 430 do { 431 ptr = expmatch (s1, MNEXT(cs), mstring); 432 if (ptr != NIL && s1 != s) { 433 434 /* we have a match */ 435 return (ptr); 436 } else if (ptr != NIL && (*cs & OPT)) { 437 438 /* it was aoptional so no match is ok */ 439 return (ptr); 440 } else if (ptr != NIL) { 441 442 /* not optional and we still matched */ 443 return (NIL); 444 } 445 if (*s1 == '\\') 446 _escaped = _escaped ? FALSE : TRUE; 447 else 448 _escaped = FALSE; 449 } while (*s1++); 450 return (NIL); 451 452 /* fail if we are currently _escaped */ 453 case 'e': 454 if (_escaped) 455 return(NIL); 456 cs = MNEXT(cs); 457 break; 458 459 /* match any number of tabs and spaces */ 460 case 'd': 461 ptr = s; 462 while (*s == ' ' || *s == '\t') 463 s++; 464 if (s != ptr || s == _start) { 465 466 /* match, be happy */ 467 matched = 1; 468 cs = MNEXT(cs); 469 } else if (*s == '\n' || *s == '\0') { 470 471 /* match, be happy */ 472 matched = 1; 473 cs = MNEXT(cs); 474 } else if (*cs & ALT) { 475 476 /* try the next part */ 477 matched = 0; 478 cs = MNEXT(cs); 479 } else if (*cs & OPT) { 480 481 /* doesn't matter */ 482 matched = 1; 483 cs = MNEXT(cs); 484 } else 485 486 /* no match, error return */ 487 return (NIL); 488 break; 489 490 /* check for end of line */ 491 case '$': 492 if (*s == '\0' || *s == '\n') { 493 494 /* match, be happy */ 495 s++; 496 matched = 1; 497 cs = MNEXT(cs); 498 } else if (*cs & ALT) { 499 500 /* try the next part */ 501 matched = 0; 502 cs = MNEXT(cs); 503 } else if (*cs & OPT) { 504 505 /* doesn't matter */ 506 matched = 1; 507 cs = MNEXT(cs); 508 } else 509 510 /* no match, error return */ 511 return (NIL); 512 break; 513 514 /* check for start of line */ 515 case '^': 516 if (s == _start) { 517 518 /* match, be happy */ 519 matched = 1; 520 cs = MNEXT(cs); 521 } else if (*cs & ALT) { 522 523 /* try the next part */ 524 matched = 0; 525 cs = MNEXT(cs); 526 } else if (*cs & OPT) { 527 528 /* doesn't matter */ 529 matched = 1; 530 cs = MNEXT(cs); 531 } else 532 533 /* no match, error return */ 534 return (NIL); 535 break; 536 537 /* end of a subexpression, return success */ 538 case ')': 539 return (s); 540 } 541 break; 542 } 543 } 544 return (s); 545 } 546