1 /* $NetBSD: regexp.c,v 1.11 2008/07/21 14:19:27 lukem Exp $ */ 2 3 /* 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 #include <sys/cdefs.h> 34 #ifndef lint 35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\ 36 The Regents of the University of California. All rights reserved."); 37 #endif /* not lint */ 38 39 #ifndef lint 40 #if 0 41 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 42 #endif 43 __RCSID("$NetBSD: regexp.c,v 1.11 2008/07/21 14:19:27 lukem Exp $"); 44 #endif /* not lint */ 45 46 #include <assert.h> 47 #include <ctype.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #include "extern.h" 51 52 #define FALSE 0 53 #define TRUE !(FALSE) 54 #define NIL 0 55 56 static void expconv __P((void)); 57 58 boolean x_escaped; /* true if we are currently x_escaped */ 59 char *x_start; /* start of string */ 60 boolean l_onecase; /* true if upper and lower equivalent */ 61 62 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c)) 63 64 /* STRNCMP - like strncmp except that we convert the 65 * first string to lower case before comparing 66 * if l_onecase is set. 67 */ 68 69 int 70 STRNCMP(s1, s2, len) 71 char *s1,*s2; 72 int len; 73 { 74 if (l_onecase) { 75 do 76 if (*s2 - makelower(*s1)) 77 return (*s2 - makelower(*s1)); 78 else { 79 s2++; 80 s1++; 81 } 82 while (--len); 83 } else { 84 do 85 if (*s2 - *s1) 86 return (*s2 - *s1); 87 else { 88 s2++; 89 s1++; 90 } 91 while (--len); 92 } 93 return(0); 94 } 95 96 /* The following routine converts an irregular expression to 97 * internal format. 98 * 99 * Either meta symbols (\a \d or \p) or character strings or 100 * operations ( alternation or perenthesizing ) can be 101 * specified. Each starts with a descriptor byte. The descriptor 102 * byte has STR set for strings, META set for meta symbols 103 * and OPER set for operations. 104 * The descriptor byte can also have the OPT bit set if the object 105 * defined is optional. Also ALT can be set to indicate an alternation. 106 * 107 * For metasymbols the byte following the descriptor byte identities 108 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 109 * strings the byte after the descriptor is a character count for 110 * the string: 111 * 112 * meta symbols := descriptor 113 * symbol 114 * 115 * strings := descriptor 116 * character count 117 * the string 118 * 119 * operatins := descriptor 120 * symbol 121 * character count 122 */ 123 124 /* 125 * handy macros for accessing parts of match blocks 126 */ 127 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 128 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 129 130 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 131 #define OCNT(A) (*(A+2)) /* character count */ 132 #define ONEXT(A) (A+3) /* next character after the operation */ 133 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 134 135 #define SCNT(A) (*(A+1)) /* byte count of a string */ 136 #define SSTR(A) (A+2) /* address of the string */ 137 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 138 139 /* 140 * bit flags in the descriptor 141 */ 142 #define OPT 1 143 #define STR 2 144 #define META 4 145 #define ALT 8 146 #define OPER 16 147 148 static char *ccre; /* pointer to current position in converted exp*/ 149 static char *ure; /* pointer current position in unconverted exp */ 150 151 char * 152 convexp(re) 153 char *re; /* unconverted irregular expression */ 154 { 155 char *cre; /* pointer to converted regular expression */ 156 157 /* allocate room for the converted expression */ 158 if (re == NIL) 159 return (NIL); 160 if (*re == '\0') 161 return (NIL); 162 cre = malloc(4 * strlen(re) + 3); 163 ccre = cre; 164 ure = re; 165 166 /* start the conversion with a \a */ 167 *cre = META | OPT; 168 MSYM(cre) = 'a'; 169 ccre = MNEXT(cre); 170 171 /* start the conversion (its recursive) */ 172 expconv(); 173 *ccre = 0; 174 return (cre); 175 } 176 177 static void 178 expconv() 179 { 180 char *cs; /* pointer to current symbol in converted exp */ 181 char c; /* character being processed */ 182 char *acs; /* pinter to last alternate */ 183 int temp; 184 185 /* let the conversion begin */ 186 acs = NIL; 187 cs = NIL; 188 while (*ure != NIL) { 189 switch (c = *ure++) { 190 191 case '\\': 192 switch (c = *ure++) { 193 194 /* escaped characters are just characters */ 195 default: 196 if (cs == NIL || (*cs & STR) == 0) { 197 cs = ccre; 198 *cs = STR; 199 SCNT(cs) = 1; 200 ccre += 2; 201 } else 202 SCNT(cs)++; 203 *ccre++ = c; 204 break; 205 206 /* normal(?) metacharacters */ 207 case 'a': 208 case 'd': 209 case 'e': 210 case 'p': 211 if (acs != NIL && acs != cs) { 212 do { 213 temp = OCNT(acs); 214 OCNT(acs) = ccre - acs; 215 acs -= temp; 216 } while (temp != 0); 217 acs = NIL; 218 } 219 cs = ccre; 220 *cs = META; 221 MSYM(cs) = c; 222 ccre = MNEXT(cs); 223 break; 224 } 225 break; 226 227 /* just put the symbol in */ 228 case '^': 229 case '$': 230 if (acs != NIL && acs != cs) { 231 do { 232 temp = OCNT(acs); 233 OCNT(acs) = ccre - acs; 234 acs -= temp; 235 } while (temp != 0); 236 acs = NIL; 237 } 238 cs = ccre; 239 *cs = META; 240 MSYM(cs) = c; 241 ccre = MNEXT(cs); 242 break; 243 244 /* mark the last match sequence as optional */ 245 case '?': 246 if (cs) 247 *cs = *cs | OPT; 248 break; 249 250 /* recurse and define a subexpression */ 251 case '(': 252 if (acs != NIL && acs != cs) { 253 do { 254 temp = OCNT(acs); 255 OCNT(acs) = ccre - acs; 256 acs -= temp; 257 } while (temp != 0); 258 acs = NIL; 259 } 260 cs = ccre; 261 *cs = OPER; 262 OSYM(cs) = '('; 263 ccre = ONEXT(cs); 264 expconv(); 265 OCNT(cs) = ccre - cs; /* offset to next symbol */ 266 break; 267 268 /* reurn from a recursion */ 269 case ')': 270 if (acs != NIL) { 271 do { 272 temp = OCNT(acs); 273 OCNT(acs) = ccre - acs; 274 acs -= temp; 275 } while (temp != 0); 276 acs = NIL; 277 } 278 cs = ccre; 279 *cs = META; 280 MSYM(cs) = c; 281 ccre = MNEXT(cs); 282 return; 283 284 /* mark the last match sequence as having an alternate */ 285 /* the third byte will contain an offset to jump over the */ 286 /* alternate match in case the first did not fail */ 287 case '|': 288 if (acs != NIL && acs != cs) 289 OCNT(ccre) = ccre - acs; /* make a back pointer */ 290 else 291 OCNT(ccre) = 0; 292 assert(cs != NULL); 293 *cs |= ALT; 294 cs = ccre; 295 *cs = OPER; 296 OSYM(cs) = '|'; 297 ccre = ONEXT(cs); 298 acs = cs; /* remember that the pointer is to be filles */ 299 break; 300 301 /* if its not a metasymbol just build a scharacter string */ 302 default: 303 if (cs == NIL || (*cs & STR) == 0) { 304 cs = ccre; 305 *cs = STR; 306 SCNT(cs) = 1; 307 ccre = SSTR(cs); 308 } else 309 SCNT(cs)++; 310 *ccre++ = c; 311 break; 312 } 313 } 314 if (acs != NIL) { 315 do { 316 temp = OCNT(acs); 317 OCNT(acs) = ccre - acs; 318 acs -= temp; 319 } while (temp != 0); 320 acs = NIL; 321 } 322 return; 323 } 324 /* end of convertre */ 325 326 327 /* 328 * The following routine recognises an irregular expresion 329 * with the following special characters: 330 * 331 * \? - means last match was optional 332 * \a - matches any number of characters 333 * \d - matches any number of spaces and tabs 334 * \p - matches any number of alphanumeric 335 * characters. The 336 * characters matched will be copied into 337 * the area pointed to by 'name'. 338 * \| - alternation 339 * \( \) - grouping used mostly for alternation and 340 * optionality 341 * 342 * The irregular expression must be translated to internal form 343 * prior to calling this routine 344 * 345 * The value returned is the pointer to the first non \a 346 * character matched. 347 */ 348 349 char * 350 expmatch(s, re, mstring) 351 char *s; /* string to check for a match in */ 352 char *re; /* a converted irregular expression */ 353 char *mstring; /* where to put whatever matches a \p */ 354 { 355 char *cs; /* the current symbol */ 356 char *ptr,*s1; /* temporary pointer */ 357 boolean matched; /* a temporary boolean */ 358 359 /* initial conditions */ 360 if (re == NIL) 361 return (NIL); 362 cs = re; 363 matched = FALSE; 364 365 /* loop till expression string is exhausted (or at least pretty tired) */ 366 while (*cs) { 367 switch (*cs & (OPER | STR | META)) { 368 369 /* try to match a string */ 370 case STR: 371 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 372 if (matched) { 373 374 /* hoorah it matches */ 375 s += SCNT(cs); 376 cs = SNEXT(cs); 377 } else if (*cs & ALT) { 378 379 /* alternation, skip to next expression */ 380 cs = SNEXT(cs); 381 } else if (*cs & OPT) { 382 383 /* the match is optional */ 384 cs = SNEXT(cs); 385 matched = 1; /* indicate a successful match */ 386 } else { 387 388 /* no match, error return */ 389 return (NIL); 390 } 391 break; 392 393 /* an operator, do something fancy */ 394 case OPER: 395 switch (OSYM(cs)) { 396 397 /* this is an alternation */ 398 case '|': 399 if (matched) 400 401 /* last thing in the alternation was a match, skip ahead */ 402 cs = OPTR(cs); 403 else 404 405 /* no match, keep trying */ 406 cs = ONEXT(cs); 407 break; 408 409 /* this is a grouping, recurse */ 410 case '(': 411 ptr = expmatch(s, ONEXT(cs), mstring); 412 if (ptr != NIL) { 413 414 /* the subexpression matched */ 415 matched = 1; 416 s = ptr; 417 } else if (*cs & ALT) { 418 419 /* alternation, skip to next expression */ 420 matched = 0; 421 } else if (*cs & OPT) { 422 423 /* the match is optional */ 424 matched = 1; /* indicate a successful match */ 425 } else { 426 427 /* no match, error return */ 428 return (NIL); 429 } 430 cs = OPTR(cs); 431 break; 432 } 433 break; 434 435 /* try to match a metasymbol */ 436 case META: 437 switch (MSYM(cs)) { 438 439 /* try to match anything and remember what was matched */ 440 case 'p': 441 /* 442 * This is really the same as trying the match the 443 * remaining parts of the expression to any subset 444 * of the string. 445 */ 446 s1 = s; 447 do { 448 ptr = expmatch(s1, MNEXT(cs), mstring); 449 if (ptr != NIL && s1 != s) { 450 451 /* we have a match, remember the match */ 452 strncpy(mstring, s, s1 - s); 453 mstring[s1 - s] = '\0'; 454 return (ptr); 455 } else if (ptr != NIL && (*cs & OPT)) { 456 457 /* it was aoptional so no match is ok */ 458 return (ptr); 459 } else if (ptr != NIL) { 460 461 /* not optional and we still matched */ 462 return (NIL); 463 } 464 if (!isalnum((unsigned char)*s1) && *s1 != '_') 465 return (NIL); 466 if (*s1 == '\\') 467 x_escaped = x_escaped ? FALSE : TRUE; 468 else 469 x_escaped = FALSE; 470 } while (*s1++); 471 return (NIL); 472 473 /* try to match anything */ 474 case 'a': 475 /* 476 * This is really the same as trying the match the 477 * remaining parts of the expression to any subset 478 * of the string. 479 */ 480 s1 = s; 481 do { 482 ptr = expmatch(s1, MNEXT(cs), mstring); 483 if (ptr != NIL && s1 != s) { 484 485 /* we have a match */ 486 return (ptr); 487 } else if (ptr != NIL && (*cs & OPT)) { 488 489 /* it was aoptional so no match is ok */ 490 return (ptr); 491 } else if (ptr != NIL) { 492 493 /* not optional and we still matched */ 494 return (NIL); 495 } 496 if (*s1 == '\\') 497 x_escaped = x_escaped ? FALSE : TRUE; 498 else 499 x_escaped = FALSE; 500 } while (*s1++); 501 return (NIL); 502 503 /* fail if we are currently x_escaped */ 504 case 'e': 505 if (x_escaped) 506 return(NIL); 507 cs = MNEXT(cs); 508 break; 509 510 /* match any number of tabs and spaces */ 511 case 'd': 512 ptr = s; 513 while (*s == ' ' || *s == '\t') 514 s++; 515 if (s != ptr || s == x_start) { 516 517 /* match, be happy */ 518 matched = 1; 519 cs = MNEXT(cs); 520 } else if (*s == '\n' || *s == '\0') { 521 522 /* match, be happy */ 523 matched = 1; 524 cs = MNEXT(cs); 525 } else if (*cs & ALT) { 526 527 /* try the next part */ 528 matched = 0; 529 cs = MNEXT(cs); 530 } else if (*cs & OPT) { 531 532 /* doesn't matter */ 533 matched = 1; 534 cs = MNEXT(cs); 535 } else 536 537 /* no match, error return */ 538 return (NIL); 539 break; 540 541 /* check for end of line */ 542 case '$': 543 if (*s == '\0' || *s == '\n') { 544 545 /* match, be happy */ 546 s++; 547 matched = 1; 548 cs = MNEXT(cs); 549 } else if (*cs & ALT) { 550 551 /* try the next part */ 552 matched = 0; 553 cs = MNEXT(cs); 554 } else if (*cs & OPT) { 555 556 /* doesn't matter */ 557 matched = 1; 558 cs = MNEXT(cs); 559 } else 560 561 /* no match, error return */ 562 return (NIL); 563 break; 564 565 /* check for start of line */ 566 case '^': 567 if (s == x_start) { 568 569 /* match, be happy */ 570 matched = 1; 571 cs = MNEXT(cs); 572 } else if (*cs & ALT) { 573 574 /* try the next part */ 575 matched = 0; 576 cs = MNEXT(cs); 577 } else if (*cs & OPT) { 578 579 /* doesn't matter */ 580 matched = 1; 581 cs = MNEXT(cs); 582 } else 583 584 /* no match, error return */ 585 return (NIL); 586 break; 587 588 /* end of a subexpression, return success */ 589 case ')': 590 return (s); 591 } 592 break; 593 } 594 } 595 return (s); 596 } 597