1 /* 2 * Copyright (c) 1980 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 #ifndef lint 35 static char sccsid[] = "@(#)regexp.c 5.4 (Berkeley) 8/3/92"; 36 #endif /* not lint */ 37 38 #include <ctype.h> 39 #include <stdlib.h> 40 #include <string.h> 41 #include "extern.h" 42 43 #define FALSE 0 44 #define TRUE !(FALSE) 45 #define NIL 0 46 47 static void expconv __P((void)); 48 49 boolean _escaped; /* true if we are currently _escaped */ 50 char *_start; /* start of string */ 51 boolean l_onecase; /* true if upper and lower equivalent */ 52 53 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 54 55 /* STRNCMP - like strncmp except that we convert the 56 * first string to lower case before comparing 57 * if l_onecase is set. 58 */ 59 60 int 61 STRNCMP(s1, s2, len) 62 register char *s1,*s2; 63 register int len; 64 { 65 if (l_onecase) { 66 do 67 if (*s2 - makelower(*s1)) 68 return (*s2 - makelower(*s1)); 69 else { 70 s2++; 71 s1++; 72 } 73 while (--len); 74 } else { 75 do 76 if (*s2 - *s1) 77 return (*s2 - *s1); 78 else { 79 s2++; 80 s1++; 81 } 82 while (--len); 83 } 84 return(0); 85 } 86 87 /* The following routine converts an irregular expression to 88 * internal format. 89 * 90 * Either meta symbols (\a \d or \p) or character strings or 91 * operations ( alternation or perenthesizing ) can be 92 * specified. Each starts with a descriptor byte. The descriptor 93 * byte has STR set for strings, META set for meta symbols 94 * and OPER set for operations. 95 * The descriptor byte can also have the OPT bit set if the object 96 * defined is optional. Also ALT can be set to indicate an alternation. 97 * 98 * For metasymbols the byte following the descriptor byte identities 99 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 100 * strings the byte after the descriptor is a character count for 101 * the string: 102 * 103 * meta symbols := descriptor 104 * symbol 105 * 106 * strings := descriptor 107 * character count 108 * the string 109 * 110 * operatins := descriptor 111 * symbol 112 * character count 113 */ 114 115 /* 116 * handy macros for accessing parts of match blocks 117 */ 118 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 119 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 120 121 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 122 #define OCNT(A) (*(A+2)) /* character count */ 123 #define ONEXT(A) (A+3) /* next character after the operation */ 124 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 125 126 #define SCNT(A) (*(A+1)) /* byte count of a string */ 127 #define SSTR(A) (A+2) /* address of the string */ 128 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 129 130 /* 131 * bit flags in the descriptor 132 */ 133 #define OPT 1 134 #define STR 2 135 #define META 4 136 #define ALT 8 137 #define OPER 16 138 139 static char *ccre; /* pointer to current position in converted exp*/ 140 static char *ure; /* pointer current position in unconverted exp */ 141 142 char * 143 convexp(re) 144 char *re; /* unconverted irregular expression */ 145 { 146 register char *cre; /* pointer to converted regular expression */ 147 148 /* allocate room for the converted expression */ 149 if (re == NIL) 150 return (NIL); 151 if (*re == '\0') 152 return (NIL); 153 cre = malloc (4 * strlen(re) + 3); 154 ccre = cre; 155 ure = re; 156 157 /* start the conversion with a \a */ 158 *cre = META | OPT; 159 MSYM(cre) = 'a'; 160 ccre = MNEXT(cre); 161 162 /* start the conversion (its recursive) */ 163 expconv (); 164 *ccre = 0; 165 return (cre); 166 } 167 168 static void 169 expconv() 170 { 171 register char *cs; /* pointer to current symbol in converted exp */ 172 register char c; /* character being processed */ 173 register char *acs; /* pinter to last alternate */ 174 register int temp; 175 176 /* let the conversion begin */ 177 acs = NIL; 178 cs = NIL; 179 while (*ure != NIL) { 180 switch (c = *ure++) { 181 182 case '\\': 183 switch (c = *ure++) { 184 185 /* escaped characters are just characters */ 186 default: 187 if (cs == NIL || (*cs & STR) == 0) { 188 cs = ccre; 189 *cs = STR; 190 SCNT(cs) = 1; 191 ccre += 2; 192 } else 193 SCNT(cs)++; 194 *ccre++ = c; 195 break; 196 197 /* normal(?) metacharacters */ 198 case 'a': 199 case 'd': 200 case 'e': 201 case 'p': 202 if (acs != NIL && acs != cs) { 203 do { 204 temp = OCNT(acs); 205 OCNT(acs) = ccre - acs; 206 acs -= temp; 207 } while (temp != 0); 208 acs = NIL; 209 } 210 cs = ccre; 211 *cs = META; 212 MSYM(cs) = c; 213 ccre = MNEXT(cs); 214 break; 215 } 216 break; 217 218 /* just put the symbol in */ 219 case '^': 220 case '$': 221 if (acs != NIL && acs != cs) { 222 do { 223 temp = OCNT(acs); 224 OCNT(acs) = ccre - acs; 225 acs -= temp; 226 } while (temp != 0); 227 acs = NIL; 228 } 229 cs = ccre; 230 *cs = META; 231 MSYM(cs) = c; 232 ccre = MNEXT(cs); 233 break; 234 235 /* mark the last match sequence as optional */ 236 case '?': 237 if (cs) 238 *cs = *cs | OPT; 239 break; 240 241 /* recurse and define a subexpression */ 242 case '(': 243 if (acs != NIL && acs != cs) { 244 do { 245 temp = OCNT(acs); 246 OCNT(acs) = ccre - acs; 247 acs -= temp; 248 } while (temp != 0); 249 acs = NIL; 250 } 251 cs = ccre; 252 *cs = OPER; 253 OSYM(cs) = '('; 254 ccre = ONEXT(cs); 255 expconv (); 256 OCNT(cs) = ccre - cs; /* offset to next symbol */ 257 break; 258 259 /* reurn from a recursion */ 260 case ')': 261 if (acs != NIL) { 262 do { 263 temp = OCNT(acs); 264 OCNT(acs) = ccre - acs; 265 acs -= temp; 266 } while (temp != 0); 267 acs = NIL; 268 } 269 cs = ccre; 270 *cs = META; 271 MSYM(cs) = c; 272 ccre = MNEXT(cs); 273 return; 274 275 /* mark the last match sequence as having an alternate */ 276 /* the third byte will contain an offset to jump over the */ 277 /* alternate match in case the first did not fail */ 278 case '|': 279 if (acs != NIL && acs != cs) 280 OCNT(ccre) = ccre - acs; /* make a back pointer */ 281 else 282 OCNT(ccre) = 0; 283 *cs |= ALT; 284 cs = ccre; 285 *cs = OPER; 286 OSYM(cs) = '|'; 287 ccre = ONEXT(cs); 288 acs = cs; /* remember that the pointer is to be filles */ 289 break; 290 291 /* if its not a metasymbol just build a scharacter string */ 292 default: 293 if (cs == NIL || (*cs & STR) == 0) { 294 cs = ccre; 295 *cs = STR; 296 SCNT(cs) = 1; 297 ccre = SSTR(cs); 298 } else 299 SCNT(cs)++; 300 *ccre++ = c; 301 break; 302 } 303 } 304 if (acs != NIL) { 305 do { 306 temp = OCNT(acs); 307 OCNT(acs) = ccre - acs; 308 acs -= temp; 309 } while (temp != 0); 310 acs = NIL; 311 } 312 return; 313 } 314 /* end of convertre */ 315 316 317 /* 318 * The following routine recognises an irregular expresion 319 * with the following special characters: 320 * 321 * \? - means last match was optional 322 * \a - matches any number of characters 323 * \d - matches any number of spaces and tabs 324 * \p - matches any number of alphanumeric 325 * characters. The 326 * characters matched will be copied into 327 * the area pointed to by 'name'. 328 * \| - alternation 329 * \( \) - grouping used mostly for alternation and 330 * optionality 331 * 332 * The irregular expression must be translated to internal form 333 * prior to calling this routine 334 * 335 * The value returned is the pointer to the first non \a 336 * character matched. 337 */ 338 339 char * 340 expmatch (s, re, mstring) 341 register char *s; /* string to check for a match in */ 342 register char *re; /* a converted irregular expression */ 343 register char *mstring; /* where to put whatever matches a \p */ 344 { 345 register char *cs; /* the current symbol */ 346 register char *ptr,*s1; /* temporary pointer */ 347 boolean matched; /* a temporary boolean */ 348 349 /* initial conditions */ 350 if (re == NIL) 351 return (NIL); 352 cs = re; 353 matched = FALSE; 354 355 /* loop till expression string is exhausted (or at least pretty tired) */ 356 while (*cs) { 357 switch (*cs & (OPER | STR | META)) { 358 359 /* try to match a string */ 360 case STR: 361 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 362 if (matched) { 363 364 /* hoorah it matches */ 365 s += SCNT(cs); 366 cs = SNEXT(cs); 367 } else if (*cs & ALT) { 368 369 /* alternation, skip to next expression */ 370 cs = SNEXT(cs); 371 } else if (*cs & OPT) { 372 373 /* the match is optional */ 374 cs = SNEXT(cs); 375 matched = 1; /* indicate a successful match */ 376 } else { 377 378 /* no match, error return */ 379 return (NIL); 380 } 381 break; 382 383 /* an operator, do something fancy */ 384 case OPER: 385 switch (OSYM(cs)) { 386 387 /* this is an alternation */ 388 case '|': 389 if (matched) 390 391 /* last thing in the alternation was a match, skip ahead */ 392 cs = OPTR(cs); 393 else 394 395 /* no match, keep trying */ 396 cs = ONEXT(cs); 397 break; 398 399 /* this is a grouping, recurse */ 400 case '(': 401 ptr = expmatch (s, ONEXT(cs), mstring); 402 if (ptr != NIL) { 403 404 /* the subexpression matched */ 405 matched = 1; 406 s = ptr; 407 } else if (*cs & ALT) { 408 409 /* alternation, skip to next expression */ 410 matched = 0; 411 } else if (*cs & OPT) { 412 413 /* the match is optional */ 414 matched = 1; /* indicate a successful match */ 415 } else { 416 417 /* no match, error return */ 418 return (NIL); 419 } 420 cs = OPTR(cs); 421 break; 422 } 423 break; 424 425 /* try to match a metasymbol */ 426 case META: 427 switch (MSYM(cs)) { 428 429 /* try to match anything and remember what was matched */ 430 case 'p': 431 /* 432 * This is really the same as trying the match the 433 * remaining parts of the expression to any subset 434 * of the string. 435 */ 436 s1 = s; 437 do { 438 ptr = expmatch (s1, MNEXT(cs), mstring); 439 if (ptr != NIL && s1 != s) { 440 441 /* we have a match, remember the match */ 442 strncpy (mstring, s, s1 - s); 443 mstring[s1 - s] = '\0'; 444 return (ptr); 445 } else if (ptr != NIL && (*cs & OPT)) { 446 447 /* it was aoptional so no match is ok */ 448 return (ptr); 449 } else if (ptr != NIL) { 450 451 /* not optional and we still matched */ 452 return (NIL); 453 } 454 if (!isalnum(*s1) && *s1 != '_') 455 return (NIL); 456 if (*s1 == '\\') 457 _escaped = _escaped ? FALSE : TRUE; 458 else 459 _escaped = FALSE; 460 } while (*s1++); 461 return (NIL); 462 463 /* try to match anything */ 464 case 'a': 465 /* 466 * This is really the same as trying the match the 467 * remaining parts of the expression to any subset 468 * of the string. 469 */ 470 s1 = s; 471 do { 472 ptr = expmatch (s1, MNEXT(cs), mstring); 473 if (ptr != NIL && s1 != s) { 474 475 /* we have a match */ 476 return (ptr); 477 } else if (ptr != NIL && (*cs & OPT)) { 478 479 /* it was aoptional so no match is ok */ 480 return (ptr); 481 } else if (ptr != NIL) { 482 483 /* not optional and we still matched */ 484 return (NIL); 485 } 486 if (*s1 == '\\') 487 _escaped = _escaped ? FALSE : TRUE; 488 else 489 _escaped = FALSE; 490 } while (*s1++); 491 return (NIL); 492 493 /* fail if we are currently _escaped */ 494 case 'e': 495 if (_escaped) 496 return(NIL); 497 cs = MNEXT(cs); 498 break; 499 500 /* match any number of tabs and spaces */ 501 case 'd': 502 ptr = s; 503 while (*s == ' ' || *s == '\t') 504 s++; 505 if (s != ptr || s == _start) { 506 507 /* match, be happy */ 508 matched = 1; 509 cs = MNEXT(cs); 510 } else if (*s == '\n' || *s == '\0') { 511 512 /* match, be happy */ 513 matched = 1; 514 cs = MNEXT(cs); 515 } else if (*cs & ALT) { 516 517 /* try the next part */ 518 matched = 0; 519 cs = MNEXT(cs); 520 } else if (*cs & OPT) { 521 522 /* doesn't matter */ 523 matched = 1; 524 cs = MNEXT(cs); 525 } else 526 527 /* no match, error return */ 528 return (NIL); 529 break; 530 531 /* check for end of line */ 532 case '$': 533 if (*s == '\0' || *s == '\n') { 534 535 /* match, be happy */ 536 s++; 537 matched = 1; 538 cs = MNEXT(cs); 539 } else if (*cs & ALT) { 540 541 /* try the next part */ 542 matched = 0; 543 cs = MNEXT(cs); 544 } else if (*cs & OPT) { 545 546 /* doesn't matter */ 547 matched = 1; 548 cs = MNEXT(cs); 549 } else 550 551 /* no match, error return */ 552 return (NIL); 553 break; 554 555 /* check for start of line */ 556 case '^': 557 if (s == _start) { 558 559 /* match, be happy */ 560 matched = 1; 561 cs = MNEXT(cs); 562 } else if (*cs & ALT) { 563 564 /* try the next part */ 565 matched = 0; 566 cs = MNEXT(cs); 567 } else if (*cs & OPT) { 568 569 /* doesn't matter */ 570 matched = 1; 571 cs = MNEXT(cs); 572 } else 573 574 /* no match, error return */ 575 return (NIL); 576 break; 577 578 /* end of a subexpression, return success */ 579 case ')': 580 return (s); 581 } 582 break; 583 } 584 } 585 return (s); 586 } 587