1 /* 2 * Copyright (c) 1980, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * 6 * %sccs.include.redist.c% 7 */ 8 9 #ifndef lint 10 static char copyright[] = 11 "@(#) Copyright (c) 1980, 1993\n\ 12 The Regents of the University of California. All rights reserved.\n"; 13 #endif /* not lint */ 14 15 #ifndef lint 16 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 06/06/93"; 17 #endif /* not lint */ 18 19 #include <ctype.h> 20 #include <stdlib.h> 21 #include <string.h> 22 #include "extern.h" 23 24 #define FALSE 0 25 #define TRUE !(FALSE) 26 #define NIL 0 27 28 static void expconv __P((void)); 29 30 boolean _escaped; /* true if we are currently _escaped */ 31 char *_start; /* start of string */ 32 boolean l_onecase; /* true if upper and lower equivalent */ 33 34 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 35 36 /* STRNCMP - like strncmp except that we convert the 37 * first string to lower case before comparing 38 * if l_onecase is set. 39 */ 40 41 int 42 STRNCMP(s1, s2, len) 43 register char *s1,*s2; 44 register int len; 45 { 46 if (l_onecase) { 47 do 48 if (*s2 - makelower(*s1)) 49 return (*s2 - makelower(*s1)); 50 else { 51 s2++; 52 s1++; 53 } 54 while (--len); 55 } else { 56 do 57 if (*s2 - *s1) 58 return (*s2 - *s1); 59 else { 60 s2++; 61 s1++; 62 } 63 while (--len); 64 } 65 return(0); 66 } 67 68 /* The following routine converts an irregular expression to 69 * internal format. 70 * 71 * Either meta symbols (\a \d or \p) or character strings or 72 * operations ( alternation or perenthesizing ) can be 73 * specified. Each starts with a descriptor byte. The descriptor 74 * byte has STR set for strings, META set for meta symbols 75 * and OPER set for operations. 76 * The descriptor byte can also have the OPT bit set if the object 77 * defined is optional. Also ALT can be set to indicate an alternation. 78 * 79 * For metasymbols the byte following the descriptor byte identities 80 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 81 * strings the byte after the descriptor is a character count for 82 * the string: 83 * 84 * meta symbols := descriptor 85 * symbol 86 * 87 * strings := descriptor 88 * character count 89 * the string 90 * 91 * operatins := descriptor 92 * symbol 93 * character count 94 */ 95 96 /* 97 * handy macros for accessing parts of match blocks 98 */ 99 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 100 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 101 102 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 103 #define OCNT(A) (*(A+2)) /* character count */ 104 #define ONEXT(A) (A+3) /* next character after the operation */ 105 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 106 107 #define SCNT(A) (*(A+1)) /* byte count of a string */ 108 #define SSTR(A) (A+2) /* address of the string */ 109 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 110 111 /* 112 * bit flags in the descriptor 113 */ 114 #define OPT 1 115 #define STR 2 116 #define META 4 117 #define ALT 8 118 #define OPER 16 119 120 static char *ccre; /* pointer to current position in converted exp*/ 121 static char *ure; /* pointer current position in unconverted exp */ 122 123 char * 124 convexp(re) 125 char *re; /* unconverted irregular expression */ 126 { 127 register char *cre; /* pointer to converted regular expression */ 128 129 /* allocate room for the converted expression */ 130 if (re == NIL) 131 return (NIL); 132 if (*re == '\0') 133 return (NIL); 134 cre = malloc (4 * strlen(re) + 3); 135 ccre = cre; 136 ure = re; 137 138 /* start the conversion with a \a */ 139 *cre = META | OPT; 140 MSYM(cre) = 'a'; 141 ccre = MNEXT(cre); 142 143 /* start the conversion (its recursive) */ 144 expconv (); 145 *ccre = 0; 146 return (cre); 147 } 148 149 static void 150 expconv() 151 { 152 register char *cs; /* pointer to current symbol in converted exp */ 153 register char c; /* character being processed */ 154 register char *acs; /* pinter to last alternate */ 155 register int temp; 156 157 /* let the conversion begin */ 158 acs = NIL; 159 cs = NIL; 160 while (*ure != NIL) { 161 switch (c = *ure++) { 162 163 case '\\': 164 switch (c = *ure++) { 165 166 /* escaped characters are just characters */ 167 default: 168 if (cs == NIL || (*cs & STR) == 0) { 169 cs = ccre; 170 *cs = STR; 171 SCNT(cs) = 1; 172 ccre += 2; 173 } else 174 SCNT(cs)++; 175 *ccre++ = c; 176 break; 177 178 /* normal(?) metacharacters */ 179 case 'a': 180 case 'd': 181 case 'e': 182 case 'p': 183 if (acs != NIL && acs != cs) { 184 do { 185 temp = OCNT(acs); 186 OCNT(acs) = ccre - acs; 187 acs -= temp; 188 } while (temp != 0); 189 acs = NIL; 190 } 191 cs = ccre; 192 *cs = META; 193 MSYM(cs) = c; 194 ccre = MNEXT(cs); 195 break; 196 } 197 break; 198 199 /* just put the symbol in */ 200 case '^': 201 case '$': 202 if (acs != NIL && acs != cs) { 203 do { 204 temp = OCNT(acs); 205 OCNT(acs) = ccre - acs; 206 acs -= temp; 207 } while (temp != 0); 208 acs = NIL; 209 } 210 cs = ccre; 211 *cs = META; 212 MSYM(cs) = c; 213 ccre = MNEXT(cs); 214 break; 215 216 /* mark the last match sequence as optional */ 217 case '?': 218 if (cs) 219 *cs = *cs | OPT; 220 break; 221 222 /* recurse and define a subexpression */ 223 case '(': 224 if (acs != NIL && acs != cs) { 225 do { 226 temp = OCNT(acs); 227 OCNT(acs) = ccre - acs; 228 acs -= temp; 229 } while (temp != 0); 230 acs = NIL; 231 } 232 cs = ccre; 233 *cs = OPER; 234 OSYM(cs) = '('; 235 ccre = ONEXT(cs); 236 expconv (); 237 OCNT(cs) = ccre - cs; /* offset to next symbol */ 238 break; 239 240 /* reurn from a recursion */ 241 case ')': 242 if (acs != NIL) { 243 do { 244 temp = OCNT(acs); 245 OCNT(acs) = ccre - acs; 246 acs -= temp; 247 } while (temp != 0); 248 acs = NIL; 249 } 250 cs = ccre; 251 *cs = META; 252 MSYM(cs) = c; 253 ccre = MNEXT(cs); 254 return; 255 256 /* mark the last match sequence as having an alternate */ 257 /* the third byte will contain an offset to jump over the */ 258 /* alternate match in case the first did not fail */ 259 case '|': 260 if (acs != NIL && acs != cs) 261 OCNT(ccre) = ccre - acs; /* make a back pointer */ 262 else 263 OCNT(ccre) = 0; 264 *cs |= ALT; 265 cs = ccre; 266 *cs = OPER; 267 OSYM(cs) = '|'; 268 ccre = ONEXT(cs); 269 acs = cs; /* remember that the pointer is to be filles */ 270 break; 271 272 /* if its not a metasymbol just build a scharacter string */ 273 default: 274 if (cs == NIL || (*cs & STR) == 0) { 275 cs = ccre; 276 *cs = STR; 277 SCNT(cs) = 1; 278 ccre = SSTR(cs); 279 } else 280 SCNT(cs)++; 281 *ccre++ = c; 282 break; 283 } 284 } 285 if (acs != NIL) { 286 do { 287 temp = OCNT(acs); 288 OCNT(acs) = ccre - acs; 289 acs -= temp; 290 } while (temp != 0); 291 acs = NIL; 292 } 293 return; 294 } 295 /* end of convertre */ 296 297 298 /* 299 * The following routine recognises an irregular expresion 300 * with the following special characters: 301 * 302 * \? - means last match was optional 303 * \a - matches any number of characters 304 * \d - matches any number of spaces and tabs 305 * \p - matches any number of alphanumeric 306 * characters. The 307 * characters matched will be copied into 308 * the area pointed to by 'name'. 309 * \| - alternation 310 * \( \) - grouping used mostly for alternation and 311 * optionality 312 * 313 * The irregular expression must be translated to internal form 314 * prior to calling this routine 315 * 316 * The value returned is the pointer to the first non \a 317 * character matched. 318 */ 319 320 char * 321 expmatch (s, re, mstring) 322 register char *s; /* string to check for a match in */ 323 register char *re; /* a converted irregular expression */ 324 register char *mstring; /* where to put whatever matches a \p */ 325 { 326 register char *cs; /* the current symbol */ 327 register char *ptr,*s1; /* temporary pointer */ 328 boolean matched; /* a temporary boolean */ 329 330 /* initial conditions */ 331 if (re == NIL) 332 return (NIL); 333 cs = re; 334 matched = FALSE; 335 336 /* loop till expression string is exhausted (or at least pretty tired) */ 337 while (*cs) { 338 switch (*cs & (OPER | STR | META)) { 339 340 /* try to match a string */ 341 case STR: 342 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 343 if (matched) { 344 345 /* hoorah it matches */ 346 s += SCNT(cs); 347 cs = SNEXT(cs); 348 } else if (*cs & ALT) { 349 350 /* alternation, skip to next expression */ 351 cs = SNEXT(cs); 352 } else if (*cs & OPT) { 353 354 /* the match is optional */ 355 cs = SNEXT(cs); 356 matched = 1; /* indicate a successful match */ 357 } else { 358 359 /* no match, error return */ 360 return (NIL); 361 } 362 break; 363 364 /* an operator, do something fancy */ 365 case OPER: 366 switch (OSYM(cs)) { 367 368 /* this is an alternation */ 369 case '|': 370 if (matched) 371 372 /* last thing in the alternation was a match, skip ahead */ 373 cs = OPTR(cs); 374 else 375 376 /* no match, keep trying */ 377 cs = ONEXT(cs); 378 break; 379 380 /* this is a grouping, recurse */ 381 case '(': 382 ptr = expmatch (s, ONEXT(cs), mstring); 383 if (ptr != NIL) { 384 385 /* the subexpression matched */ 386 matched = 1; 387 s = ptr; 388 } else if (*cs & ALT) { 389 390 /* alternation, skip to next expression */ 391 matched = 0; 392 } else if (*cs & OPT) { 393 394 /* the match is optional */ 395 matched = 1; /* indicate a successful match */ 396 } else { 397 398 /* no match, error return */ 399 return (NIL); 400 } 401 cs = OPTR(cs); 402 break; 403 } 404 break; 405 406 /* try to match a metasymbol */ 407 case META: 408 switch (MSYM(cs)) { 409 410 /* try to match anything and remember what was matched */ 411 case 'p': 412 /* 413 * This is really the same as trying the match the 414 * remaining parts of the expression to any subset 415 * of the string. 416 */ 417 s1 = s; 418 do { 419 ptr = expmatch (s1, MNEXT(cs), mstring); 420 if (ptr != NIL && s1 != s) { 421 422 /* we have a match, remember the match */ 423 strncpy (mstring, s, s1 - s); 424 mstring[s1 - s] = '\0'; 425 return (ptr); 426 } else if (ptr != NIL && (*cs & OPT)) { 427 428 /* it was aoptional so no match is ok */ 429 return (ptr); 430 } else if (ptr != NIL) { 431 432 /* not optional and we still matched */ 433 return (NIL); 434 } 435 if (!isalnum(*s1) && *s1 != '_') 436 return (NIL); 437 if (*s1 == '\\') 438 _escaped = _escaped ? FALSE : TRUE; 439 else 440 _escaped = FALSE; 441 } while (*s1++); 442 return (NIL); 443 444 /* try to match anything */ 445 case 'a': 446 /* 447 * This is really the same as trying the match the 448 * remaining parts of the expression to any subset 449 * of the string. 450 */ 451 s1 = s; 452 do { 453 ptr = expmatch (s1, MNEXT(cs), mstring); 454 if (ptr != NIL && s1 != s) { 455 456 /* we have a match */ 457 return (ptr); 458 } else if (ptr != NIL && (*cs & OPT)) { 459 460 /* it was aoptional so no match is ok */ 461 return (ptr); 462 } else if (ptr != NIL) { 463 464 /* not optional and we still matched */ 465 return (NIL); 466 } 467 if (*s1 == '\\') 468 _escaped = _escaped ? FALSE : TRUE; 469 else 470 _escaped = FALSE; 471 } while (*s1++); 472 return (NIL); 473 474 /* fail if we are currently _escaped */ 475 case 'e': 476 if (_escaped) 477 return(NIL); 478 cs = MNEXT(cs); 479 break; 480 481 /* match any number of tabs and spaces */ 482 case 'd': 483 ptr = s; 484 while (*s == ' ' || *s == '\t') 485 s++; 486 if (s != ptr || s == _start) { 487 488 /* match, be happy */ 489 matched = 1; 490 cs = MNEXT(cs); 491 } else if (*s == '\n' || *s == '\0') { 492 493 /* match, be happy */ 494 matched = 1; 495 cs = MNEXT(cs); 496 } else if (*cs & ALT) { 497 498 /* try the next part */ 499 matched = 0; 500 cs = MNEXT(cs); 501 } else if (*cs & OPT) { 502 503 /* doesn't matter */ 504 matched = 1; 505 cs = MNEXT(cs); 506 } else 507 508 /* no match, error return */ 509 return (NIL); 510 break; 511 512 /* check for end of line */ 513 case '$': 514 if (*s == '\0' || *s == '\n') { 515 516 /* match, be happy */ 517 s++; 518 matched = 1; 519 cs = MNEXT(cs); 520 } else if (*cs & ALT) { 521 522 /* try the next part */ 523 matched = 0; 524 cs = MNEXT(cs); 525 } else if (*cs & OPT) { 526 527 /* doesn't matter */ 528 matched = 1; 529 cs = MNEXT(cs); 530 } else 531 532 /* no match, error return */ 533 return (NIL); 534 break; 535 536 /* check for start of line */ 537 case '^': 538 if (s == _start) { 539 540 /* match, be happy */ 541 matched = 1; 542 cs = MNEXT(cs); 543 } else if (*cs & ALT) { 544 545 /* try the next part */ 546 matched = 0; 547 cs = MNEXT(cs); 548 } else if (*cs & OPT) { 549 550 /* doesn't matter */ 551 matched = 1; 552 cs = MNEXT(cs); 553 } else 554 555 /* no match, error return */ 556 return (NIL); 557 break; 558 559 /* end of a subexpression, return success */ 560 case ')': 561 return (s); 562 } 563 break; 564 } 565 } 566 return (s); 567 } 568