1 /* $OpenBSD: patterns.c,v 1.5 2016/02/14 18:20:59 semarie Exp $ */ 2 3 /* 4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org> 5 * Copyright (C) 1994-2015 Lua.org, PUC-Rio. 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining 8 * a copy of this software and associated documentation files (the 9 * "Software"), to deal in the Software without restriction, including 10 * without limitation the rights to use, copy, modify, merge, publish, 11 * distribute, sublicense, and/or sell copies of the Software, and to 12 * permit persons to whom the Software is furnished to do so, subject to 13 * the following conditions: 14 * 15 * The above copyright notice and this permission notice shall be 16 * included in all copies or substantial portions of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 21 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 */ 26 27 /* 28 * Derived from Lua 5.3.1: 29 * $Id: patterns.c,v 1.5 2016/02/14 18:20:59 semarie Exp $ 30 * Standard library for string operations and pattern-matching 31 */ 32 33 #include <sys/types.h> 34 #include <ctype.h> 35 #include <errno.h> 36 #include <stddef.h> 37 #include <stdlib.h> 38 #include <string.h> 39 40 #include "patterns.h" 41 42 #define uchar(c) ((unsigned char)(c)) /* macro to 'unsign' a char */ 43 #define CAP_UNFINISHED (-1) 44 #define CAP_POSITION (-2) 45 #define L_ESC '%' 46 #define SPECIALS "^$*+?.([%-" 47 48 struct match_state { 49 int matchdepth; /* control for recursive depth (to avoid C 50 * stack overflow) */ 51 int repetitioncounter; /* control the repetition items */ 52 int maxcaptures; /* configured capture limit */ 53 const char *src_init; /* init of source string */ 54 const char *src_end; /* end ('\0') of source string */ 55 const char *p_end; /* end ('\0') of pattern */ 56 const char *error; /* should be NULL */ 57 int level; /* total number of captures (finished or 58 * unfinished) */ 59 struct { 60 const char *init; 61 ptrdiff_t len; 62 } capture[MAXCAPTURES]; 63 }; 64 65 /* recursive function */ 66 static const char *match(struct match_state *, const char *, const char *); 67 68 static int 69 match_error(struct match_state *ms, const char *error) 70 { 71 ms->error = ms->error == NULL ? error : ms->error; 72 return (-1); 73 } 74 75 static int 76 check_capture(struct match_state *ms, int l) 77 { 78 l -= '1'; 79 if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED) 80 return match_error(ms, "invalid capture index"); 81 return (l); 82 } 83 84 static int 85 capture_to_close(struct match_state *ms) 86 { 87 int level = ms->level; 88 for (level--; level >= 0; level--) 89 if (ms->capture[level].len == CAP_UNFINISHED) 90 return (level); 91 return match_error(ms, "invalid pattern capture"); 92 } 93 94 static const char * 95 classend(struct match_state *ms, const char *p) 96 { 97 switch (*p++) { 98 case L_ESC: 99 if (p == ms->p_end) 100 match_error(ms, 101 "malformed pattern (ends with '%')"); 102 return p + 1; 103 case '[': 104 if (*p == '^') 105 p++; 106 do { 107 /* look for a ']' */ 108 if (p == ms->p_end) { 109 match_error(ms, 110 "malformed pattern (missing ']')"); 111 break; 112 } 113 if (*(p++) == L_ESC && p < ms->p_end) { 114 /* skip escapes (e.g. '%]') */ 115 p++; 116 } 117 } while (*p != ']'); 118 return p + 1; 119 default: 120 return p; 121 } 122 } 123 124 static int 125 match_class(int c, int cl) 126 { 127 int res; 128 switch (tolower(cl)) { 129 case 'a': 130 res = isalpha(c); 131 break; 132 case 'c': 133 res = iscntrl(c); 134 break; 135 case 'd': 136 res = isdigit(c); 137 break; 138 case 'g': 139 res = isgraph(c); 140 break; 141 case 'l': 142 res = islower(c); 143 break; 144 case 'p': 145 res = ispunct(c); 146 break; 147 case 's': 148 res = isspace(c); 149 break; 150 case 'u': 151 res = isupper(c); 152 break; 153 case 'w': 154 res = isalnum(c); 155 break; 156 case 'x': 157 res = isxdigit(c); 158 break; 159 default: 160 return (cl == c); 161 } 162 return (islower(cl) ? res : !res); 163 } 164 165 static int 166 matchbracketclass(int c, const char *p, const char *ec) 167 { 168 int sig = 1; 169 if (*(p + 1) == '^') { 170 sig = 0; 171 /* skip the '^' */ 172 p++; 173 } 174 while (++p < ec) { 175 if (*p == L_ESC) { 176 p++; 177 if (match_class(c, uchar(*p))) 178 return sig; 179 } else if ((*(p + 1) == '-') && (p + 2 < ec)) { 180 p += 2; 181 if (uchar(*(p - 2)) <= c && c <= uchar(*p)) 182 return sig; 183 } else if (uchar(*p) == c) 184 return sig; 185 } 186 return !sig; 187 } 188 189 static int 190 singlematch(struct match_state *ms, const char *s, const char *p, 191 const char *ep) 192 { 193 if (s >= ms->src_end) 194 return 0; 195 else { 196 int c = uchar(*s); 197 switch (*p) { 198 case '.': 199 /* matches any char */ 200 return (1); 201 case L_ESC: 202 return match_class(c, uchar(*(p + 1))); 203 case '[': 204 return matchbracketclass(c, p, ep - 1); 205 default: 206 return (uchar(*p) == c); 207 } 208 } 209 } 210 211 static const char * 212 matchbalance(struct match_state *ms, const char *s, const char *p) 213 { 214 if (p >= ms->p_end - 1) { 215 match_error(ms, 216 "malformed pattern (missing arguments to '%b')"); 217 return (NULL); 218 } 219 if (*s != *p) 220 return (NULL); 221 else { 222 int b = *p; 223 int e = *(p + 1); 224 int cont = 1; 225 while (++s < ms->src_end) { 226 if (*s == e) { 227 if (--cont == 0) 228 return s + 1; 229 } else if (*s == b) 230 cont++; 231 } 232 } 233 234 /* string ends out of balance */ 235 return (NULL); 236 } 237 238 static const char * 239 max_expand(struct match_state *ms, const char *s, const char *p, const char *ep) 240 { 241 ptrdiff_t i = 0; 242 /* counts maximum expand for item */ 243 while (singlematch(ms, s + i, p, ep)) 244 i++; 245 /* keeps trying to match with the maximum repetitions */ 246 while (i >= 0) { 247 const char *res = match(ms, (s + i), ep + 1); 248 if (res) 249 return res; 250 /* else didn't match; reduce 1 repetition to try again */ 251 i--; 252 } 253 return NULL; 254 } 255 256 static const char * 257 min_expand(struct match_state *ms, const char *s, const char *p, const char *ep) 258 { 259 for (;;) { 260 const char *res = match(ms, s, ep + 1); 261 if (res != NULL) 262 return res; 263 else if (singlematch(ms, s, p, ep)) 264 s++; /* try with one more repetition */ 265 else 266 return NULL; 267 } 268 } 269 270 static const char * 271 start_capture(struct match_state *ms, const char *s, const char *p, int what) 272 { 273 const char *res; 274 275 int level = ms->level; 276 if (level >= ms->maxcaptures) { 277 match_error(ms, "too many captures"); 278 return (NULL); 279 } 280 ms->capture[level].init = s; 281 ms->capture[level].len = what; 282 ms->level = level + 1; 283 /* undo capture if match failed */ 284 if ((res = match(ms, s, p)) == NULL) 285 ms->level--; 286 return res; 287 } 288 289 static const char * 290 end_capture(struct match_state *ms, const char *s, const char *p) 291 { 292 int l = capture_to_close(ms); 293 const char *res; 294 if (l == -1) 295 return NULL; 296 /* close capture */ 297 ms->capture[l].len = s - ms->capture[l].init; 298 /* undo capture if match failed */ 299 if ((res = match(ms, s, p)) == NULL) 300 ms->capture[l].len = CAP_UNFINISHED; 301 return res; 302 } 303 304 static const char * 305 match_capture(struct match_state *ms, const char *s, int l) 306 { 307 size_t len; 308 l = check_capture(ms, l); 309 if (l == -1) 310 return NULL; 311 len = ms->capture[l].len; 312 if ((size_t) (ms->src_end - s) >= len && 313 memcmp(ms->capture[l].init, s, len) == 0) 314 return s + len; 315 else 316 return NULL; 317 } 318 319 static const char * 320 match(struct match_state *ms, const char *s, const char *p) 321 { 322 const char *ep, *res; 323 char previous; 324 325 if (ms->matchdepth-- == 0) { 326 match_error(ms, "pattern too complex"); 327 return (NULL); 328 } 329 330 /* using goto's to optimize tail recursion */ 331 init: 332 /* end of pattern? */ 333 if (p != ms->p_end) { 334 switch (*p) { 335 case '(': 336 /* start capture */ 337 if (*(p + 1) == ')') 338 /* position capture? */ 339 s = start_capture(ms, s, p + 2, CAP_POSITION); 340 else 341 s = start_capture(ms, s, p + 1, CAP_UNFINISHED); 342 break; 343 case ')': 344 /* end capture */ 345 s = end_capture(ms, s, p + 1); 346 break; 347 case '$': 348 /* is the '$' the last char in pattern? */ 349 if ((p + 1) != ms->p_end) { 350 /* no; go to default */ 351 goto dflt; 352 } 353 /* check end of string */ 354 s = (s == ms->src_end) ? s : NULL; 355 break; 356 case L_ESC: 357 /* escaped sequences not in the format class[*+?-]? */ 358 switch (*(p + 1)) { 359 case 'b': 360 /* balanced string? */ 361 s = matchbalance(ms, s, p + 2); 362 if (s != NULL) { 363 p += 4; 364 /* return match(ms, s, p + 4); */ 365 goto init; 366 } /* else fail (s == NULL) */ 367 break; 368 case 'f': 369 /* frontier? */ 370 p += 2; 371 if (*p != '[') { 372 match_error(ms, "missing '['" 373 " after '%f' in pattern"); 374 break; 375 } 376 /* points to what is next */ 377 ep = classend(ms, p); 378 if (ms->error != NULL) 379 break; 380 previous = 381 (s == ms->src_init) ? '\0' : *(s - 1); 382 if (!matchbracketclass(uchar(previous), 383 p, ep - 1) && 384 matchbracketclass(uchar(*s), 385 p, ep - 1)) { 386 p = ep; 387 /* return match(ms, s, ep); */ 388 goto init; 389 } 390 /* match failed */ 391 s = NULL; 392 break; 393 case '0': 394 case '1': 395 case '2': 396 case '3': 397 case '4': 398 case '5': 399 case '6': 400 case '7': 401 case '8': 402 case '9': 403 /* capture results (%0-%9)? */ 404 s = match_capture(ms, s, uchar(*(p + 1))); 405 if (s != NULL) { 406 p += 2; 407 /* return match(ms, s, p + 2) */ 408 goto init; 409 } 410 break; 411 default: 412 goto dflt; 413 } 414 break; 415 default: 416 417 /* pattern class plus optional suffix */ 418 dflt: 419 /* points to optional suffix */ 420 ep = classend(ms, p); 421 if (ms->error != NULL) 422 break; 423 424 /* does not match at least once? */ 425 if (!singlematch(ms, s, p, ep)) { 426 if (ms->repetitioncounter-- == 0) { 427 match_error(ms, "max repetition items"); 428 s = NULL; /* fail */ 429 /* accept empty? */ 430 } else if 431 (*ep == '*' || *ep == '?' || *ep == '-') { 432 p = ep + 1; 433 /* return match(ms, s, ep + 1); */ 434 goto init; 435 } else { 436 /* '+' or no suffix */ 437 s = NULL; /* fail */ 438 } 439 } else { 440 /* matched once */ 441 /* handle optional suffix */ 442 switch (*ep) { 443 case '?': 444 /* optional */ 445 if ((res = 446 match(ms, s + 1, ep + 1)) != NULL) 447 s = res; 448 else { 449 /* 450 * else return 451 * match(ms, s, ep + 1); 452 */ 453 p = ep + 1; 454 goto init; 455 } 456 break; 457 case '+': 458 /* 1 or more repetitions */ 459 s++; /* 1 match already done */ 460 /* FALLTHROUGH */ 461 case '*': 462 /* 0 or more repetitions */ 463 s = max_expand(ms, s, p, ep); 464 break; 465 case '-': 466 /* 0 or more repetitions (minimum) */ 467 s = min_expand(ms, s, p, ep); 468 break; 469 default: 470 /* no suffix */ 471 s++; 472 p = ep; 473 /* return match(ms, s + 1, ep); */ 474 goto init; 475 } 476 } 477 break; 478 } 479 } 480 ms->matchdepth++; 481 return s; 482 } 483 484 static const char * 485 lmemfind(const char *s1, size_t l1, 486 const char *s2, size_t l2) 487 { 488 const char *init; 489 490 if (l2 == 0) { 491 /* empty strings are everywhere */ 492 return (s1); 493 } else if (l2 > l1) { 494 /* avoids a negative 'l1' */ 495 return (NULL); 496 } else { 497 /* 498 * to search for a '*s2' inside 's1' 499 * - 1st char will be checked by 'memchr' 500 * - 's2' cannot be found after that 501 */ 502 l2--; 503 l1 = l1 - l2; 504 while (l1 > 0 && 505 (init = (const char *)memchr(s1, *s2, l1)) != NULL) { 506 /* 1st char is already checked */ 507 init++; 508 if (memcmp(init, s2 + 1, l2) == 0) 509 return init - 1; 510 else { 511 /* correct 'l1' and 's1' to try again */ 512 l1 -= init - s1; 513 s1 = init; 514 } 515 } 516 /* not found */ 517 return (NULL); 518 } 519 } 520 521 static int 522 push_onecapture(struct match_state *ms, int i, const char *s, 523 const char *e, struct str_find *sm) 524 { 525 if (i >= ms->level) { 526 if (i == 0 || ms->level == 0) { 527 /* add whole match */ 528 sm->sm_so = (off_t)(s - ms->src_init); 529 sm->sm_eo = (off_t)(e - s) + sm->sm_so; 530 } else 531 return match_error(ms, "invalid capture index"); 532 } else { 533 ptrdiff_t l = ms->capture[i].len; 534 if (l == CAP_UNFINISHED) 535 return match_error(ms, "unfinished capture"); 536 sm->sm_so = ms->capture[i].init - ms->src_init; 537 sm->sm_eo = sm->sm_so + l; 538 } 539 sm->sm_eo = sm->sm_eo < sm->sm_so ? sm->sm_so : sm->sm_eo; 540 return (0); 541 } 542 543 static int 544 push_captures(struct match_state *ms, const char *s, const char *e, 545 struct str_find *sm, size_t nsm) 546 { 547 unsigned int i; 548 unsigned int nlevels = (ms->level <= 0 && s) ? 1 : ms->level; 549 550 if (nlevels > nsm) 551 nlevels = nsm; 552 for (i = 0; i < nlevels; i++) 553 if (push_onecapture(ms, i, s, e, sm + i) == -1) 554 break; 555 556 /* number of strings pushed */ 557 return (nlevels); 558 } 559 560 /* check whether pattern has no special characters */ 561 static int 562 nospecials(const char *p, size_t l) 563 { 564 size_t upto = 0; 565 566 do { 567 if (strpbrk(p + upto, SPECIALS)) { 568 /* pattern has a special character */ 569 return 0; 570 } 571 /* may have more after \0 */ 572 upto += strlen(p + upto) + 1; 573 } while (upto <= l); 574 575 /* no special chars found */ 576 return (1); 577 } 578 579 static int 580 str_find_aux(struct match_state *ms, const char *pattern, const char *string, 581 struct str_find *sm, size_t nsm, off_t init) 582 { 583 size_t ls = strlen(string); 584 size_t lp = strlen(pattern); 585 const char *s = string; 586 const char *p = pattern; 587 const char *s1, *s2; 588 int anchor, i; 589 590 if (init < 0) 591 init = 0; 592 else if (init > (off_t)ls) 593 return match_error(ms, "starting after string's end"); 594 s1 = s + init; 595 596 if (nospecials(p, lp)) { 597 /* do a plain search */ 598 s2 = lmemfind(s1, ls - (size_t)init, p, lp); 599 if (s2 != NULL) { 600 i = 0; 601 sm[i].sm_so = 0; 602 sm[i].sm_eo = ls; 603 if (nsm > 1) { 604 i++; 605 sm[i].sm_so = s2 - s; 606 sm[i].sm_eo = (s2 - s) + lp; 607 } 608 return (i + 1); 609 } 610 return (0); 611 } 612 613 anchor = (*p == '^'); 614 if (anchor) { 615 p++; 616 lp--; /* skip anchor character */ 617 } 618 ms->maxcaptures = (nsm > MAXCAPTURES ? MAXCAPTURES : nsm) - 1; 619 ms->matchdepth = MAXCCALLS; 620 ms->repetitioncounter = MAXREPETITION; 621 ms->src_init = s; 622 ms->src_end = s + ls; 623 ms->p_end = p + lp; 624 do { 625 const char *res; 626 ms->level = 0; 627 if ((res = match(ms, s1, p)) != NULL) { 628 sm->sm_so = 0; 629 sm->sm_eo = ls; 630 return push_captures(ms, s1, res, sm + 1, nsm - 1) + 1; 631 632 } else if (ms->error != NULL) { 633 return 0; 634 } 635 } while (s1++ < ms->src_end && !anchor); 636 637 return 0; 638 } 639 640 int 641 str_find(const char *string, const char *pattern, struct str_find *sm, 642 size_t nsm, const char **errstr) 643 { 644 struct match_state ms; 645 int ret; 646 647 memset(&ms, 0, sizeof(ms)); 648 memset(sm, 0, nsm * sizeof(*sm)); 649 650 ret = str_find_aux(&ms, pattern, string, sm, nsm, 0); 651 if (ms.error != NULL) { 652 /* Return 0 on error and store the error string */ 653 *errstr = ms.error; 654 ret = 0; 655 } else 656 *errstr = NULL; 657 658 return (ret); 659 } 660 661 int 662 str_match(const char *string, const char *pattern, struct str_match *m, 663 const char **errstr) 664 { 665 struct str_find sm[MAXCAPTURES]; 666 struct match_state ms; 667 int ret, i; 668 size_t len, nsm; 669 670 nsm = MAXCAPTURES; 671 memset(&ms, 0, sizeof(ms)); 672 memset(sm, 0, sizeof(sm)); 673 memset(m, 0, sizeof(*m)); 674 675 ret = str_find_aux(&ms, pattern, string, sm, nsm, 0); 676 if (ret <= 0 || ms.error != NULL) { 677 /* Return -1 on error and store the error string */ 678 *errstr = ms.error; 679 return (-1); 680 } 681 682 if ((m->sm_match = calloc(ret, sizeof(char *))) == NULL) { 683 *errstr = strerror(errno); 684 return (-1); 685 } 686 m->sm_nmatch = ret; 687 688 for (i = 0; i < ret; i++) { 689 if (sm[i].sm_so > sm[i].sm_eo) 690 continue; 691 len = sm[i].sm_eo - sm[i].sm_so; 692 if ((m->sm_match[i] = strndup(string + 693 sm[i].sm_so, len)) == NULL) { 694 *errstr = strerror(errno); 695 str_match_free(m); 696 return (-1); 697 } 698 } 699 700 *errstr = NULL; 701 return (0); 702 } 703 704 void 705 str_match_free(struct str_match *m) 706 { 707 unsigned int i = 0; 708 for (i = 0; i < m->sm_nmatch; i++) 709 free(m->sm_match[i]); 710 free(m->sm_match); 711 m->sm_match = NULL; 712 m->sm_nmatch = 0; 713 } 714