1 /* 2 * Copyright (c) 1980 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms are permitted 6 * provided that the above copyright notice and this paragraph are 7 * duplicated in all such forms and that any documentation, 8 * advertising materials, and other materials related to such 9 * distribution and use acknowledge that the software was developed 10 * by the University of California, Berkeley. The name of the 11 * University may not be used to endorse or promote products derived 12 * from this software without specific prior written permission. 13 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 14 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 16 */ 17 18 #ifndef lint 19 static char sccsid[] = "@(#)regexp.c 5.2 (Berkeley) 10/25/88"; 20 #endif /* not lint */ 21 22 #include <ctype.h> 23 24 typedef int boolean; 25 #define TRUE 1 26 #define FALSE 0 27 #define NIL 0 28 29 boolean l_onecase; /* true if upper and lower equivalent */ 30 31 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 32 33 /* STRNCMP - like strncmp except that we convert the 34 * first string to lower case before comparing 35 * if l_onecase is set. 36 */ 37 38 STRNCMP(s1, s2, len) 39 register char *s1,*s2; 40 register int len; 41 { 42 if (l_onecase) { 43 do 44 if (*s2 - makelower(*s1)) 45 return (*s2 - makelower(*s1)); 46 else { 47 s2++; 48 s1++; 49 } 50 while (--len); 51 } else { 52 do 53 if (*s2 - *s1) 54 return (*s2 - *s1); 55 else { 56 s2++; 57 s1++; 58 } 59 while (--len); 60 } 61 return(0); 62 } 63 64 /* The following routine converts an irregular expression to 65 * internal format. 66 * 67 * Either meta symbols (\a \d or \p) or character strings or 68 * operations ( alternation or perenthesizing ) can be 69 * specified. Each starts with a descriptor byte. The descriptor 70 * byte has STR set for strings, META set for meta symbols 71 * and OPER set for operations. 72 * The descriptor byte can also have the OPT bit set if the object 73 * defined is optional. Also ALT can be set to indicate an alternation. 74 * 75 * For metasymbols the byte following the descriptor byte identities 76 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 77 * strings the byte after the descriptor is a character count for 78 * the string: 79 * 80 * meta symbols := descriptor 81 * symbol 82 * 83 * strings := descriptor 84 * character count 85 * the string 86 * 87 * operatins := descriptor 88 * symbol 89 * character count 90 */ 91 92 /* 93 * handy macros for accessing parts of match blocks 94 */ 95 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 96 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 97 98 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 99 #define OCNT(A) (*(A+2)) /* character count */ 100 #define ONEXT(A) (A+3) /* next character after the operation */ 101 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 102 103 #define SCNT(A) (*(A+1)) /* byte count of a string */ 104 #define SSTR(A) (A+2) /* address of the string */ 105 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 106 107 /* 108 * bit flags in the descriptor 109 */ 110 #define OPT 1 111 #define STR 2 112 #define META 4 113 #define ALT 8 114 #define OPER 16 115 116 char *ure; /* pointer current position in unconverted exp */ 117 char *ccre; /* pointer to current position in converted exp*/ 118 char *malloc(); 119 120 char * 121 convexp(re) 122 char *re; /* unconverted irregular expression */ 123 { 124 register char *cre; /* pointer to converted regular expression */ 125 126 /* allocate room for the converted expression */ 127 if (re == NIL) 128 return (NIL); 129 if (*re == '\0') 130 return (NIL); 131 cre = malloc (4 * strlen(re) + 3); 132 ccre = cre; 133 ure = re; 134 135 /* start the conversion with a \a */ 136 *cre = META | OPT; 137 MSYM(cre) = 'a'; 138 ccre = MNEXT(cre); 139 140 /* start the conversion (its recursive) */ 141 expconv (); 142 *ccre = 0; 143 return (cre); 144 } 145 146 expconv() 147 { 148 register char *cs; /* pointer to current symbol in converted exp */ 149 register char c; /* character being processed */ 150 register char *acs; /* pinter to last alternate */ 151 register int temp; 152 153 /* let the conversion begin */ 154 acs = NIL; 155 cs = NIL; 156 while (*ure != NIL) { 157 switch (c = *ure++) { 158 159 case '\\': 160 switch (c = *ure++) { 161 162 /* escaped characters are just characters */ 163 default: 164 if (cs == NIL || (*cs & STR) == 0) { 165 cs = ccre; 166 *cs = STR; 167 SCNT(cs) = 1; 168 ccre += 2; 169 } else 170 SCNT(cs)++; 171 *ccre++ = c; 172 break; 173 174 /* normal(?) metacharacters */ 175 case 'a': 176 case 'd': 177 case 'e': 178 case 'p': 179 if (acs != NIL && acs != cs) { 180 do { 181 temp = OCNT(acs); 182 OCNT(acs) = ccre - acs; 183 acs -= temp; 184 } while (temp != 0); 185 acs = NIL; 186 } 187 cs = ccre; 188 *cs = META; 189 MSYM(cs) = c; 190 ccre = MNEXT(cs); 191 break; 192 } 193 break; 194 195 /* just put the symbol in */ 196 case '^': 197 case '$': 198 if (acs != NIL && acs != cs) { 199 do { 200 temp = OCNT(acs); 201 OCNT(acs) = ccre - acs; 202 acs -= temp; 203 } while (temp != 0); 204 acs = NIL; 205 } 206 cs = ccre; 207 *cs = META; 208 MSYM(cs) = c; 209 ccre = MNEXT(cs); 210 break; 211 212 /* mark the last match sequence as optional */ 213 case '?': 214 if (cs) 215 *cs = *cs | OPT; 216 break; 217 218 /* recurse and define a subexpression */ 219 case '(': 220 if (acs != NIL && acs != cs) { 221 do { 222 temp = OCNT(acs); 223 OCNT(acs) = ccre - acs; 224 acs -= temp; 225 } while (temp != 0); 226 acs = NIL; 227 } 228 cs = ccre; 229 *cs = OPER; 230 OSYM(cs) = '('; 231 ccre = ONEXT(cs); 232 expconv (); 233 OCNT(cs) = ccre - cs; /* offset to next symbol */ 234 break; 235 236 /* return from a recursion */ 237 case ')': 238 if (acs != NIL) { 239 do { 240 temp = OCNT(acs); 241 OCNT(acs) = ccre - acs; 242 acs -= temp; 243 } while (temp != 0); 244 acs = NIL; 245 } 246 cs = ccre; 247 *cs = META; 248 MSYM(cs) = c; 249 ccre = MNEXT(cs); 250 return; 251 252 /* mark the last match sequence as having an alternate */ 253 /* the third byte will contain an offset to jump over the */ 254 /* alternate match in case the first did not fail */ 255 case '|': 256 if (acs != NIL && acs != cs) 257 OCNT(ccre) = ccre - acs; /* make a back pointer */ 258 else 259 OCNT(ccre) = 0; 260 *cs |= ALT; 261 cs = ccre; 262 *cs = OPER; 263 OSYM(cs) = '|'; 264 ccre = ONEXT(cs); 265 acs = cs; /* remember that the pointer is to be filles */ 266 break; 267 268 /* if its not a metasymbol just build a scharacter string */ 269 default: 270 if (cs == NIL || (*cs & STR) == 0) { 271 cs = ccre; 272 *cs = STR; 273 SCNT(cs) = 1; 274 ccre = SSTR(cs); 275 } else 276 SCNT(cs)++; 277 *ccre++ = c; 278 break; 279 } 280 } 281 if (acs != NIL) { 282 do { 283 temp = OCNT(acs); 284 OCNT(acs) = ccre - acs; 285 acs -= temp; 286 } while (temp != 0); 287 acs = NIL; 288 } 289 return; 290 } 291 /* end of convertre */ 292 293 294 /* 295 * The following routine recognises an irregular expresion 296 * with the following special characters: 297 * 298 * \? - means last match was optional 299 * \a - matches any number of characters 300 * \d - matches any number of spaces and tabs 301 * \p - matches any number of alphanumeric 302 * characters. The 303 * characters matched will be copied into 304 * the area pointed to by 'name'. 305 * \| - alternation 306 * \( \) - grouping used mostly for alternation and 307 * optionality 308 * 309 * The irregular expression must be translated to internal form 310 * prior to calling this routine 311 * 312 * The value returned is the pointer to the first non \a 313 * character matched. 314 */ 315 316 boolean _escaped; /* true if we are currently _escaped */ 317 char *_start; /* start of string */ 318 319 char * 320 expmatch (s, re, mstring) 321 register char *s; /* string to check for a match in */ 322 register char *re; /* a converted irregular expression */ 323 register char *mstring; /* where to put whatever matches a \p */ 324 { 325 register char *cs; /* the current symbol */ 326 register char *ptr,*s1; /* temporary pointer */ 327 boolean matched; /* a temporary boolean */ 328 329 /* initial conditions */ 330 if (re == NIL) 331 return (NIL); 332 cs = re; 333 matched = FALSE; 334 335 /* loop till expression string is exhausted (or at least pretty tired) */ 336 while (*cs) { 337 switch (*cs & (OPER | STR | META)) { 338 339 /* try to match a string */ 340 case STR: 341 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 342 if (matched) { 343 344 /* hoorah it matches */ 345 s += SCNT(cs); 346 cs = SNEXT(cs); 347 } else if (*cs & ALT) { 348 349 /* alternation, skip to next expression */ 350 cs = SNEXT(cs); 351 } else if (*cs & OPT) { 352 353 /* the match is optional */ 354 cs = SNEXT(cs); 355 matched = 1; /* indicate a successful match */ 356 } else { 357 358 /* no match, error return */ 359 return (NIL); 360 } 361 break; 362 363 /* an operator, do something fancy */ 364 case OPER: 365 switch (OSYM(cs)) { 366 367 /* this is an alternation */ 368 case '|': 369 if (matched) 370 371 /* last thing in the alternation was a match, skip ahead */ 372 cs = OPTR(cs); 373 else 374 375 /* no match, keep trying */ 376 cs = ONEXT(cs); 377 break; 378 379 /* this is a grouping, recurse */ 380 case '(': 381 ptr = expmatch (s, ONEXT(cs), mstring); 382 if (ptr != NIL) { 383 384 /* the subexpression matched */ 385 matched = 1; 386 s = ptr; 387 } else if (*cs & ALT) { 388 389 /* alternation, skip to next expression */ 390 matched = 0; 391 } else if (*cs & OPT) { 392 393 /* the match is optional */ 394 matched = 1; /* indicate a successful match */ 395 } else { 396 397 /* no match, error return */ 398 return (NIL); 399 } 400 cs = OPTR(cs); 401 break; 402 } 403 break; 404 405 /* try to match a metasymbol */ 406 case META: 407 switch (MSYM(cs)) { 408 409 /* try to match anything and remember what was matched */ 410 case 'p': 411 /* 412 * This is really the same as trying the match the 413 * remaining parts of the expression to any subset 414 * of the string. 415 */ 416 s1 = s; 417 do { 418 ptr = expmatch (s1, MNEXT(cs), mstring); 419 if (ptr != NIL && s1 != s) { 420 421 /* we have a match, remember the match */ 422 strncpy (mstring, s, s1 - s); 423 mstring[s1 - s] = '\0'; 424 return (ptr); 425 } else if (ptr != NIL && (*cs & OPT)) { 426 427 /* it was aoptional so no match is ok */ 428 return (ptr); 429 } else if (ptr != NIL) { 430 431 /* not optional and we still matched */ 432 return (NIL); 433 } 434 if (!isalnum(*s1) && *s1 != '_') 435 return (NIL); 436 if (*s1 == '\\') 437 _escaped = _escaped ? FALSE : TRUE; 438 else 439 _escaped = FALSE; 440 } while (*s1++); 441 return (NIL); 442 443 /* try to match anything */ 444 case 'a': 445 /* 446 * This is really the same as trying the match the 447 * remaining parts of the expression to any subset 448 * of the string. 449 */ 450 s1 = s; 451 do { 452 ptr = expmatch (s1, MNEXT(cs), mstring); 453 if (ptr != NIL && s1 != s) { 454 455 /* we have a match */ 456 return (ptr); 457 } else if (ptr != NIL && (*cs & OPT)) { 458 459 /* it was aoptional so no match is ok */ 460 return (ptr); 461 } else if (ptr != NIL) { 462 463 /* not optional and we still matched */ 464 return (NIL); 465 } 466 if (*s1 == '\\') 467 _escaped = _escaped ? FALSE : TRUE; 468 else 469 _escaped = FALSE; 470 } while (*s1++); 471 return (NIL); 472 473 /* fail if we are currently _escaped */ 474 case 'e': 475 if (_escaped) 476 return(NIL); 477 cs = MNEXT(cs); 478 break; 479 480 /* match any number of tabs and spaces */ 481 case 'd': 482 ptr = s; 483 while (*s == ' ' || *s == '\t') 484 s++; 485 if (s != ptr || s == _start) { 486 487 /* match, be happy */ 488 matched = 1; 489 cs = MNEXT(cs); 490 } else if (*s == '\n' || *s == '\0') { 491 492 /* match, be happy */ 493 matched = 1; 494 cs = MNEXT(cs); 495 } else if (*cs & ALT) { 496 497 /* try the next part */ 498 matched = 0; 499 cs = MNEXT(cs); 500 } else if (*cs & OPT) { 501 502 /* doesn't matter */ 503 matched = 1; 504 cs = MNEXT(cs); 505 } else 506 507 /* no match, error return */ 508 return (NIL); 509 break; 510 511 /* check for end of line */ 512 case '$': 513 if (*s == '\0' || *s == '\n') { 514 515 /* match, be happy */ 516 s++; 517 matched = 1; 518 cs = MNEXT(cs); 519 } else if (*cs & ALT) { 520 521 /* try the next part */ 522 matched = 0; 523 cs = MNEXT(cs); 524 } else if (*cs & OPT) { 525 526 /* doesn't matter */ 527 matched = 1; 528 cs = MNEXT(cs); 529 } else 530 531 /* no match, error return */ 532 return (NIL); 533 break; 534 535 /* check for start of line */ 536 case '^': 537 if (s == _start) { 538 539 /* match, be happy */ 540 matched = 1; 541 cs = MNEXT(cs); 542 } else if (*cs & ALT) { 543 544 /* try the next part */ 545 matched = 0; 546 cs = MNEXT(cs); 547 } else if (*cs & OPT) { 548 549 /* doesn't matter */ 550 matched = 1; 551 cs = MNEXT(cs); 552 } else 553 554 /* no match, error return */ 555 return (NIL); 556 break; 557 558 /* end of a subexpression, return success */ 559 case ')': 560 return (s); 561 } 562 break; 563 } 564 } 565 return (s); 566 } 567