1 /* 2 * Copyright (c) 1980 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * %sccs.include.redist.c% 6 */ 7 8 #ifndef lint 9 static char sccsid[] = "@(#)regexp.c 5.3 (Berkeley) 06/01/90"; 10 #endif /* not lint */ 11 12 #include <ctype.h> 13 14 typedef int boolean; 15 #define TRUE 1 16 #define FALSE 0 17 #define NIL 0 18 19 boolean l_onecase; /* true if upper and lower equivalent */ 20 21 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 22 23 /* STRNCMP - like strncmp except that we convert the 24 * first string to lower case before comparing 25 * if l_onecase is set. 26 */ 27 28 STRNCMP(s1, s2, len) 29 register char *s1,*s2; 30 register int len; 31 { 32 if (l_onecase) { 33 do 34 if (*s2 - makelower(*s1)) 35 return (*s2 - makelower(*s1)); 36 else { 37 s2++; 38 s1++; 39 } 40 while (--len); 41 } else { 42 do 43 if (*s2 - *s1) 44 return (*s2 - *s1); 45 else { 46 s2++; 47 s1++; 48 } 49 while (--len); 50 } 51 return(0); 52 } 53 54 /* The following routine converts an irregular expression to 55 * internal format. 56 * 57 * Either meta symbols (\a \d or \p) or character strings or 58 * operations ( alternation or perenthesizing ) can be 59 * specified. Each starts with a descriptor byte. The descriptor 60 * byte has STR set for strings, META set for meta symbols 61 * and OPER set for operations. 62 * The descriptor byte can also have the OPT bit set if the object 63 * defined is optional. Also ALT can be set to indicate an alternation. 64 * 65 * For metasymbols the byte following the descriptor byte identities 66 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 67 * strings the byte after the descriptor is a character count for 68 * the string: 69 * 70 * meta symbols := descriptor 71 * symbol 72 * 73 * strings := descriptor 74 * character count 75 * the string 76 * 77 * operatins := descriptor 78 * symbol 79 * character count 80 */ 81 82 /* 83 * handy macros for accessing parts of match blocks 84 */ 85 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 86 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 87 88 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 89 #define OCNT(A) (*(A+2)) /* character count */ 90 #define ONEXT(A) (A+3) /* next character after the operation */ 91 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 92 93 #define SCNT(A) (*(A+1)) /* byte count of a string */ 94 #define SSTR(A) (A+2) /* address of the string */ 95 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 96 97 /* 98 * bit flags in the descriptor 99 */ 100 #define OPT 1 101 #define STR 2 102 #define META 4 103 #define ALT 8 104 #define OPER 16 105 106 char *ure; /* pointer current position in unconverted exp */ 107 char *ccre; /* pointer to current position in converted exp*/ 108 char *malloc(); 109 110 char * 111 convexp(re) 112 char *re; /* unconverted irregular expression */ 113 { 114 register char *cre; /* pointer to converted regular expression */ 115 116 /* allocate room for the converted expression */ 117 if (re == NIL) 118 return (NIL); 119 if (*re == '\0') 120 return (NIL); 121 cre = malloc (4 * strlen(re) + 3); 122 ccre = cre; 123 ure = re; 124 125 /* start the conversion with a \a */ 126 *cre = META | OPT; 127 MSYM(cre) = 'a'; 128 ccre = MNEXT(cre); 129 130 /* start the conversion (its recursive) */ 131 expconv (); 132 *ccre = 0; 133 return (cre); 134 } 135 136 expconv() 137 { 138 register char *cs; /* pointer to current symbol in converted exp */ 139 register char c; /* character being processed */ 140 register char *acs; /* pinter to last alternate */ 141 register int temp; 142 143 /* let the conversion begin */ 144 acs = NIL; 145 cs = NIL; 146 while (*ure != NIL) { 147 switch (c = *ure++) { 148 149 case '\\': 150 switch (c = *ure++) { 151 152 /* escaped characters are just characters */ 153 default: 154 if (cs == NIL || (*cs & STR) == 0) { 155 cs = ccre; 156 *cs = STR; 157 SCNT(cs) = 1; 158 ccre += 2; 159 } else 160 SCNT(cs)++; 161 *ccre++ = c; 162 break; 163 164 /* normal(?) metacharacters */ 165 case 'a': 166 case 'd': 167 case 'e': 168 case 'p': 169 if (acs != NIL && acs != cs) { 170 do { 171 temp = OCNT(acs); 172 OCNT(acs) = ccre - acs; 173 acs -= temp; 174 } while (temp != 0); 175 acs = NIL; 176 } 177 cs = ccre; 178 *cs = META; 179 MSYM(cs) = c; 180 ccre = MNEXT(cs); 181 break; 182 } 183 break; 184 185 /* just put the symbol in */ 186 case '^': 187 case '$': 188 if (acs != NIL && acs != cs) { 189 do { 190 temp = OCNT(acs); 191 OCNT(acs) = ccre - acs; 192 acs -= temp; 193 } while (temp != 0); 194 acs = NIL; 195 } 196 cs = ccre; 197 *cs = META; 198 MSYM(cs) = c; 199 ccre = MNEXT(cs); 200 break; 201 202 /* mark the last match sequence as optional */ 203 case '?': 204 if (cs) 205 *cs = *cs | OPT; 206 break; 207 208 /* recurse and define a subexpression */ 209 case '(': 210 if (acs != NIL && acs != cs) { 211 do { 212 temp = OCNT(acs); 213 OCNT(acs) = ccre - acs; 214 acs -= temp; 215 } while (temp != 0); 216 acs = NIL; 217 } 218 cs = ccre; 219 *cs = OPER; 220 OSYM(cs) = '('; 221 ccre = ONEXT(cs); 222 expconv (); 223 OCNT(cs) = ccre - cs; /* offset to next symbol */ 224 break; 225 226 /* return from a recursion */ 227 case ')': 228 if (acs != NIL) { 229 do { 230 temp = OCNT(acs); 231 OCNT(acs) = ccre - acs; 232 acs -= temp; 233 } while (temp != 0); 234 acs = NIL; 235 } 236 cs = ccre; 237 *cs = META; 238 MSYM(cs) = c; 239 ccre = MNEXT(cs); 240 return; 241 242 /* mark the last match sequence as having an alternate */ 243 /* the third byte will contain an offset to jump over the */ 244 /* alternate match in case the first did not fail */ 245 case '|': 246 if (acs != NIL && acs != cs) 247 OCNT(ccre) = ccre - acs; /* make a back pointer */ 248 else 249 OCNT(ccre) = 0; 250 *cs |= ALT; 251 cs = ccre; 252 *cs = OPER; 253 OSYM(cs) = '|'; 254 ccre = ONEXT(cs); 255 acs = cs; /* remember that the pointer is to be filles */ 256 break; 257 258 /* if its not a metasymbol just build a scharacter string */ 259 default: 260 if (cs == NIL || (*cs & STR) == 0) { 261 cs = ccre; 262 *cs = STR; 263 SCNT(cs) = 1; 264 ccre = SSTR(cs); 265 } else 266 SCNT(cs)++; 267 *ccre++ = c; 268 break; 269 } 270 } 271 if (acs != NIL) { 272 do { 273 temp = OCNT(acs); 274 OCNT(acs) = ccre - acs; 275 acs -= temp; 276 } while (temp != 0); 277 acs = NIL; 278 } 279 return; 280 } 281 /* end of convertre */ 282 283 284 /* 285 * The following routine recognises an irregular expresion 286 * with the following special characters: 287 * 288 * \? - means last match was optional 289 * \a - matches any number of characters 290 * \d - matches any number of spaces and tabs 291 * \p - matches any number of alphanumeric 292 * characters. The 293 * characters matched will be copied into 294 * the area pointed to by 'name'. 295 * \| - alternation 296 * \( \) - grouping used mostly for alternation and 297 * optionality 298 * 299 * The irregular expression must be translated to internal form 300 * prior to calling this routine 301 * 302 * The value returned is the pointer to the first non \a 303 * character matched. 304 */ 305 306 boolean _escaped; /* true if we are currently _escaped */ 307 char *_start; /* start of string */ 308 309 char * 310 expmatch (s, re, mstring) 311 register char *s; /* string to check for a match in */ 312 register char *re; /* a converted irregular expression */ 313 register char *mstring; /* where to put whatever matches a \p */ 314 { 315 register char *cs; /* the current symbol */ 316 register char *ptr,*s1; /* temporary pointer */ 317 boolean matched; /* a temporary boolean */ 318 319 /* initial conditions */ 320 if (re == NIL) 321 return (NIL); 322 cs = re; 323 matched = FALSE; 324 325 /* loop till expression string is exhausted (or at least pretty tired) */ 326 while (*cs) { 327 switch (*cs & (OPER | STR | META)) { 328 329 /* try to match a string */ 330 case STR: 331 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 332 if (matched) { 333 334 /* hoorah it matches */ 335 s += SCNT(cs); 336 cs = SNEXT(cs); 337 } else if (*cs & ALT) { 338 339 /* alternation, skip to next expression */ 340 cs = SNEXT(cs); 341 } else if (*cs & OPT) { 342 343 /* the match is optional */ 344 cs = SNEXT(cs); 345 matched = 1; /* indicate a successful match */ 346 } else { 347 348 /* no match, error return */ 349 return (NIL); 350 } 351 break; 352 353 /* an operator, do something fancy */ 354 case OPER: 355 switch (OSYM(cs)) { 356 357 /* this is an alternation */ 358 case '|': 359 if (matched) 360 361 /* last thing in the alternation was a match, skip ahead */ 362 cs = OPTR(cs); 363 else 364 365 /* no match, keep trying */ 366 cs = ONEXT(cs); 367 break; 368 369 /* this is a grouping, recurse */ 370 case '(': 371 ptr = expmatch (s, ONEXT(cs), mstring); 372 if (ptr != NIL) { 373 374 /* the subexpression matched */ 375 matched = 1; 376 s = ptr; 377 } else if (*cs & ALT) { 378 379 /* alternation, skip to next expression */ 380 matched = 0; 381 } else if (*cs & OPT) { 382 383 /* the match is optional */ 384 matched = 1; /* indicate a successful match */ 385 } else { 386 387 /* no match, error return */ 388 return (NIL); 389 } 390 cs = OPTR(cs); 391 break; 392 } 393 break; 394 395 /* try to match a metasymbol */ 396 case META: 397 switch (MSYM(cs)) { 398 399 /* try to match anything and remember what was matched */ 400 case 'p': 401 /* 402 * This is really the same as trying the match the 403 * remaining parts of the expression to any subset 404 * of the string. 405 */ 406 s1 = s; 407 do { 408 ptr = expmatch (s1, MNEXT(cs), mstring); 409 if (ptr != NIL && s1 != s) { 410 411 /* we have a match, remember the match */ 412 strncpy (mstring, s, s1 - s); 413 mstring[s1 - s] = '\0'; 414 return (ptr); 415 } else if (ptr != NIL && (*cs & OPT)) { 416 417 /* it was aoptional so no match is ok */ 418 return (ptr); 419 } else if (ptr != NIL) { 420 421 /* not optional and we still matched */ 422 return (NIL); 423 } 424 if (!isalnum(*s1) && *s1 != '_') 425 return (NIL); 426 if (*s1 == '\\') 427 _escaped = _escaped ? FALSE : TRUE; 428 else 429 _escaped = FALSE; 430 } while (*s1++); 431 return (NIL); 432 433 /* try to match anything */ 434 case 'a': 435 /* 436 * This is really the same as trying the match the 437 * remaining parts of the expression to any subset 438 * of the string. 439 */ 440 s1 = s; 441 do { 442 ptr = expmatch (s1, MNEXT(cs), mstring); 443 if (ptr != NIL && s1 != s) { 444 445 /* we have a match */ 446 return (ptr); 447 } else if (ptr != NIL && (*cs & OPT)) { 448 449 /* it was aoptional so no match is ok */ 450 return (ptr); 451 } else if (ptr != NIL) { 452 453 /* not optional and we still matched */ 454 return (NIL); 455 } 456 if (*s1 == '\\') 457 _escaped = _escaped ? FALSE : TRUE; 458 else 459 _escaped = FALSE; 460 } while (*s1++); 461 return (NIL); 462 463 /* fail if we are currently _escaped */ 464 case 'e': 465 if (_escaped) 466 return(NIL); 467 cs = MNEXT(cs); 468 break; 469 470 /* match any number of tabs and spaces */ 471 case 'd': 472 ptr = s; 473 while (*s == ' ' || *s == '\t') 474 s++; 475 if (s != ptr || s == _start) { 476 477 /* match, be happy */ 478 matched = 1; 479 cs = MNEXT(cs); 480 } else if (*s == '\n' || *s == '\0') { 481 482 /* match, be happy */ 483 matched = 1; 484 cs = MNEXT(cs); 485 } else if (*cs & ALT) { 486 487 /* try the next part */ 488 matched = 0; 489 cs = MNEXT(cs); 490 } else if (*cs & OPT) { 491 492 /* doesn't matter */ 493 matched = 1; 494 cs = MNEXT(cs); 495 } else 496 497 /* no match, error return */ 498 return (NIL); 499 break; 500 501 /* check for end of line */ 502 case '$': 503 if (*s == '\0' || *s == '\n') { 504 505 /* match, be happy */ 506 s++; 507 matched = 1; 508 cs = MNEXT(cs); 509 } else if (*cs & ALT) { 510 511 /* try the next part */ 512 matched = 0; 513 cs = MNEXT(cs); 514 } else if (*cs & OPT) { 515 516 /* doesn't matter */ 517 matched = 1; 518 cs = MNEXT(cs); 519 } else 520 521 /* no match, error return */ 522 return (NIL); 523 break; 524 525 /* check for start of line */ 526 case '^': 527 if (s == _start) { 528 529 /* match, be happy */ 530 matched = 1; 531 cs = MNEXT(cs); 532 } else if (*cs & ALT) { 533 534 /* try the next part */ 535 matched = 0; 536 cs = MNEXT(cs); 537 } else if (*cs & OPT) { 538 539 /* doesn't matter */ 540 matched = 1; 541 cs = MNEXT(cs); 542 } else 543 544 /* no match, error return */ 545 return (NIL); 546 break; 547 548 /* end of a subexpression, return success */ 549 case ')': 550 return (s); 551 } 552 break; 553 } 554 } 555 return (s); 556 } 557