1 /******************************************************************************\ 2 * Copyright (c) 2016, Robert van Engelen, Genivia Inc. All rights reserved. * 3 * * 4 * Redistribution and use in source and binary forms, with or without * 5 * modification, are permitted provided that the following conditions are met: * 6 * * 7 * (1) Redistributions of source code must retain the above copyright notice, * 8 * this list of conditions and the following disclaimer. * 9 * * 10 * (2) Redistributions in binary form must reproduce the above copyright * 11 * notice, this list of conditions and the following disclaimer in the * 12 * documentation and/or other materials provided with the distribution. * 13 * * 14 * (3) The name of the author may not be used to endorse or promote products * 15 * derived from this software without specific prior written permission. * 16 * * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * 18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * 19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO * 20 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * 21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * 22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * 23 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * 24 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * 25 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * 26 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * 27 \******************************************************************************/ 28 29 /** 30 @file fuzzymatcher.h 31 @brief RE/flex fuzzy matcher engine 32 @author Robert van Engelen - engelen@genivia.com 33 @copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved. 34 @copyright (c) BSD-3 License - see LICENSE.txt 35 */ 36 37 #ifndef REFLEX_FUZZYMATCHER_H 38 #define REFLEX_FUZZYMATCHER_H 39 40 #include <reflex/matcher.h> 41 #include <reflex/pattern.h> 42 43 namespace reflex { 44 45 /// RE/flex fuzzy matcher engine class, implements reflex::Matcher fuzzy pattern matching interface with scan, find, split functors and iterators. 46 /** More info TODO */ 47 class FuzzyMatcher : public Matcher { 48 public: 49 /// Optional flags for the max parameter to constrain fuzzy matching, otherwise no constraints 50 static const uint16_t INS = 0x1000; ///< fuzzy match allows character insertions 51 static const uint16_t DEL = 0x2000; ///< fuzzy match allows character deletions 52 static const uint16_t SUB = 0x4000; ///< character substitutions count as one edit, not two (insert+delete) 53 /// Default constructor. FuzzyMatcher()54 FuzzyMatcher() 55 : 56 Matcher(), 57 max_(1), 58 err_(0), 59 ins_(true), 60 del_(true), 61 sub_(true) 62 { 63 bpt_.resize(max_); 64 } 65 /// Construct matcher engine from a pattern or a string regex, and an input character sequence. 66 template<typename P> /// @tparam <P> a reflex::Pattern or a string regex 67 FuzzyMatcher( 68 const P *pattern, ///< points to a reflex::Pattern or a string regex for this matcher 69 const Input& input = Input(), ///< input character sequence for this matcher 70 const char *opt = NULL) ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*` 71 : Matcher(pattern,input,opt)72 Matcher(pattern, input, opt), 73 max_(1), 74 err_(0), 75 ins_(true), 76 del_(true), 77 sub_(true) 78 { 79 bpt_.resize(max_); 80 } 81 /// Construct matcher engine from a pattern or a string regex, and an input character sequence. 82 template<typename P> /// @tparam <P> a reflex::Pattern or a string regex 83 FuzzyMatcher( 84 const P *pattern, ///< points to a reflex::Pattern or a string regex for this matcher 85 uint16_t max, ///< max errors 86 const Input& input = Input(), ///< input character sequence for this matcher 87 const char *opt = NULL) ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*` 88 : Matcher(pattern,input,opt)89 Matcher(pattern, input, opt), 90 max_(static_cast<uint8_t>(max)), 91 err_(0), 92 ins_(max <= 0xFF || (max & INS)), 93 del_(max <= 0xFF || (max & DEL)), 94 sub_(max <= 0xFF || (max & SUB)) 95 { 96 bpt_.resize(max_); 97 } 98 /// Construct matcher engine from a pattern or a string regex, and an input character sequence. 99 template<typename P> /// @tparam <P> a reflex::Pattern or a string regex 100 FuzzyMatcher( 101 const P& pattern, ///< a reflex::Pattern or a string regex for this matcher 102 const Input& input = Input(), ///< input character sequence for this matcher 103 const char *opt = NULL) ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*` 104 : Matcher(pattern,input,opt)105 Matcher(pattern, input, opt), 106 max_(1), 107 err_(0), 108 ins_(true), 109 del_(true), 110 sub_(true) 111 { 112 bpt_.resize(max_); 113 } 114 /// Construct matcher engine from a pattern or a string regex, and an input character sequence. 115 template<typename P> /// @tparam <P> a reflex::Pattern or a string regex 116 FuzzyMatcher( 117 const P& pattern, ///< a reflex::Pattern or a string regex for this matcher 118 uint16_t max, ///< max errors 119 const Input& input = Input(), ///< input character sequence for this matcher 120 const char *opt = NULL) ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*` 121 : Matcher(pattern,input,opt)122 Matcher(pattern, input, opt), 123 max_(static_cast<uint8_t>(max)), 124 err_(0), 125 ins_(max <= 0xFF || (max & INS)), 126 del_(max <= 0xFF || (max & DEL)), 127 sub_(max <= 0xFF || (max & SUB)) 128 { 129 bpt_.resize(max_); 130 } 131 /// Copy constructor. FuzzyMatcher(const FuzzyMatcher & matcher)132 FuzzyMatcher(const FuzzyMatcher& matcher) ///< matcher to copy with pattern (pattern may be shared) 133 : 134 Matcher(matcher), 135 max_(matcher.max_), 136 err_(0), 137 ins_(true), 138 del_(true), 139 sub_(true) 140 { 141 DBGLOG("FuzzyMatcher::FuzzyMatcher(matcher)"); 142 bpt_.resize(max_); 143 } 144 /// Assign a matcher. 145 FuzzyMatcher& operator=(const FuzzyMatcher& matcher) ///< matcher to copy 146 { 147 Matcher::operator=(matcher); 148 max_ = matcher.max_; 149 err_ = 0; 150 ins_ = matcher.ins_; 151 del_ = matcher.del_; 152 sub_ = matcher.sub_; 153 return *this; 154 } 155 /// Polymorphic cloning. clone()156 virtual FuzzyMatcher *clone() 157 { 158 return new FuzzyMatcher(*this); 159 } 160 /// Returns the number of edits made for the match, edits() <= max, not guaranteed to be the minimum edit distance. edits()161 uint8_t edits() 162 /// @returns 0 to max edit distance 163 const 164 { 165 return err_; 166 } 167 protected: 168 /// Save state to restore fuzzy matcher state after a second pass 169 struct SaveState { SaveStateSaveState170 SaveState(size_t ded) 171 : 172 use(false), 173 loc(0), 174 cap(0), 175 txt(0), 176 cur(0), 177 pos(0), 178 ded(ded), 179 mrk(false), 180 err(0) 181 { } 182 bool use; 183 size_t loc; 184 size_t cap; 185 size_t txt; 186 size_t cur; 187 size_t pos; 188 size_t ded; 189 bool mrk; 190 uint8_t err; 191 }; 192 /// Backtrack point. 193 struct BacktrackPoint { BacktrackPointBacktrackPoint194 BacktrackPoint() 195 : 196 pc0(NULL), 197 pc1(NULL), 198 len(0), 199 err(0), 200 alt(true), 201 sub(true) 202 { } 203 const Pattern::Opcode *pc0; ///< start of opcode 204 const Pattern::Opcode *pc1; ///< pointer to opcode to rerun on backtracking 205 size_t len; ///< length of string matched so far 206 uint8_t err; ///< to restore errors 207 bool alt; ///< true if alternating between pattern char substitution and insertion, otherwise insertion only 208 bool sub; ///< flag alternates between pattern char substitution (true) and insertion (false) 209 }; 210 /// Set backtrack point. 211 void point(BacktrackPoint& bpt, const Pattern::Opcode *pc, bool alternate = true, bool eof = false) 212 { 213 // advance to the first goto opcode 214 while (!Pattern::is_opcode_goto(*pc)) 215 ++pc; 216 bpt.pc0 = pc; 217 bpt.pc1 = pc; 218 bpt.len = pos_ - (txt_ - buf_) - !eof; 219 bpt.err = err_; 220 bpt.alt = sub_ && alternate; 221 bpt.sub = bpt.alt; 222 } 223 /// backtrack on a backtrack point to insert or substitute a pattern char, restoring current text char matched and errors. backtrack(BacktrackPoint & bpt,int & c1)224 const Pattern::Opcode *backtrack(BacktrackPoint& bpt, int& c1) 225 { 226 // no more alternatives 227 if (bpt.pc1 == NULL) 228 return NULL; 229 // done when no more goto opcodes on characters remain 230 if (!Pattern::is_opcode_goto(*bpt.pc1) || Pattern::is_meta(Pattern::lo_of(*bpt.pc1))) 231 return bpt.pc1 = NULL; 232 Pattern::Index jump = Pattern::index_of(*bpt.pc1); 233 // last opcode is a HALT? 234 if (jump == Pattern::Const::HALT) 235 { 236 if (!Pattern::is_opcode_goto(*bpt.pc0) || (Pattern::lo_of(*bpt.pc0) & 0xC0) != 0xC0) 237 return bpt.pc1 = NULL; 238 // loop over UTF-8 multibytes, checking linear case only (i.e. one wide char or a short range) 239 for (int i = 0; i < 3; ++i) 240 { 241 jump = Pattern::index_of(*bpt.pc0); 242 if (jump == Pattern::Const::HALT) 243 return bpt.pc1 = NULL; 244 if (jump == Pattern::Const::LONG) 245 jump = Pattern::long_index_of(bpt.pc0[1]); 246 const Pattern::Opcode *pc0 = pat_->opc_ + jump; 247 const Pattern::Opcode *pc1 = pc0; 248 while (!Pattern::is_opcode_goto(*pc1)) 249 ++pc1; 250 if (!Pattern::is_opcode_goto(*pc1) || Pattern::is_meta(Pattern::lo_of(*pc1)) || (Pattern::lo_of(*pc1) & 0x80) != 0x80) 251 break; 252 bpt.pc0 = pc0; 253 bpt.pc1 = pc1; 254 } 255 jump = Pattern::index_of(*bpt.pc1); 256 bpt.sub = bpt.alt; 257 DBGLOG("Multibyte jump to %u", jump); 258 } 259 if (jump == Pattern::Const::LONG) 260 jump = Pattern::long_index_of(bpt.pc1[1]); 261 // restore errors 262 err_ = bpt.err; 263 // restore pos in the input 264 pos_ = (txt_ - buf_) + bpt.len; 265 // set c1 to previous char before pos, to eventually set c0 in match(method) 266 if (pos_ > 0) 267 c1 = static_cast<unsigned char>(buf_[pos_ - 1]); 268 else 269 c1 = got_; 270 // substitute or insert a pattern char in the text? 271 if (bpt.sub) 272 { 273 DBGLOG("Substitute, jump to %u at pos %zu", jump, pos_); 274 // skip UTF-8 multibytes 275 int c = get(); 276 if (c >= 0xC0) 277 { 278 int n = (c >= 0xE0) + (c >= 0xF0); 279 while (n-- >= 0) 280 if ((c = get()) == EOF) 281 break; 282 } 283 bpt.sub = false; 284 bpt.pc1 += !bpt.alt; 285 } 286 else 287 { 288 DBGLOG("Insert, jump to %u at pos %zu", jump, pos_); 289 bpt.sub = bpt.alt; 290 ++bpt.pc1; 291 } 292 return pat_->opc_ + jump; 293 } 294 /// Returns true if input fuzzy-matched the pattern using method Const::SCAN, Const::FIND, Const::SPLIT, or Const::MATCH. match(Method method)295 virtual size_t match(Method method) ///< Const::SCAN, Const::FIND, Const::SPLIT, or Const::MATCH 296 /// @returns nonzero if input matched the pattern 297 { 298 DBGLOG("BEGIN FuzzyMatcher::match()"); 299 reset_text(); 300 SaveState sst(ded_); 301 len_ = 0; // split text length starts with 0 302 anc_ = false; // no word boundary anchor found and applied 303 scan: 304 txt_ = buf_ + cur_; 305 #if !defined(WITH_NO_INDENT) 306 mrk_ = false; 307 ind_ = pos_; // ind scans input in buf[] in newline() up to pos - 1 308 col_ = 0; // count columns for indent matching 309 #endif 310 find: 311 int c1 = got_; 312 bool bol = at_bol(); // at begin of line? 313 #if !defined(WITH_NO_INDENT) 314 redo: 315 #endif 316 lap_.resize(0); 317 cap_ = 0; 318 bool nul = method == Const::MATCH; 319 if (pat_->opc_ != NULL) 320 { 321 err_ = 0; 322 uint8_t stack = 0; 323 const Pattern::Opcode *pc = pat_->opc_; 324 while (true) 325 { 326 const Pattern::Opcode *pc0; 327 while (true) 328 { 329 Pattern::Opcode opcode = *pc; 330 Pattern::Index jump; 331 DBGLOG("Fetch: code[%zu] = 0x%08X", pc - pat_->opc_, opcode); 332 pc0 = pc; 333 if (!Pattern::is_opcode_goto(opcode)) 334 { 335 switch (opcode >> 24) 336 { 337 case 0xFE: // TAKE 338 cap_ = Pattern::long_index_of(opcode); 339 cur_ = pos_; 340 ++pc; 341 DBGLOG("Take: cap = %zu", cap_); 342 continue; 343 case 0xFD: // REDO 344 cap_ = Const::REDO; 345 DBGLOG("Redo"); 346 cur_ = pos_; 347 ++pc; 348 continue; 349 case 0xFC: // TAIL 350 { 351 Pattern::Lookahead la = Pattern::lookahead_of(opcode); 352 DBGLOG("Tail: %u", la); 353 if (lap_.size() > la && lap_[la] >= 0) 354 cur_ = txt_ - buf_ + static_cast<size_t>(lap_[la]); // mind the (new) gap 355 ++pc; 356 continue; 357 } 358 case 0xFB: // HEAD 359 { 360 Pattern::Lookahead la = Pattern::lookahead_of(opcode); 361 DBGLOG("Head: lookahead[%u] = %zu", la, pos_ - (txt_ - buf_)); 362 if (lap_.size() <= la) 363 lap_.resize(la + 1, -1); 364 lap_[la] = static_cast<int>(pos_ - (txt_ - buf_)); // mind the gap 365 ++pc; 366 continue; 367 } 368 #if !defined(WITH_NO_INDENT) 369 case Pattern::META_DED - Pattern::META_MIN: 370 if (ded_ > 0) 371 { 372 jump = Pattern::index_of(opcode); 373 if (jump == Pattern::Const::LONG) 374 jump = Pattern::long_index_of(pc[1]); 375 DBGLOG("Dedent ded = %zu", ded_); // unconditional dedent matching \j 376 nul = true; 377 pc = pat_->opc_ + jump; 378 continue; 379 } 380 #endif 381 } 382 if (c1 == EOF) 383 break; 384 int c0 = c1; 385 c1 = get(); 386 DBGLOG("Get: c1 = %d", c1); 387 // where to jump back to (backtrack on meta transitions) 388 Pattern::Index back = Pattern::Const::IMAX; 389 // to jump to longest sequence of matching metas 390 jump = Pattern::Const::IMAX; 391 while (true) 392 { 393 if ((jump == Pattern::Const::IMAX || back == Pattern::Const::IMAX) && !Pattern::is_opcode_goto(opcode)) 394 { 395 // we no longer have to pass through all if jump and back are set 396 switch (opcode >> 24) 397 { 398 case 0xFE: // TAKE 399 cap_ = Pattern::long_index_of(opcode); 400 cur_ = pos_; 401 if (c1 != EOF) 402 --cur_; // must unget one char 403 opcode = *++pc; 404 DBGLOG("Take: cap = %zu", cap_); 405 continue; 406 case 0xFD: // REDO 407 cap_ = Const::REDO; 408 DBGLOG("Redo"); 409 cur_ = pos_; 410 if (c1 != EOF) 411 --cur_; // must unget one char 412 opcode = *++pc; 413 continue; 414 case 0xFC: // TAIL 415 { 416 Pattern::Lookahead la = Pattern::lookahead_of(opcode); 417 DBGLOG("Tail: %u", la); 418 if (lap_.size() > la && lap_[la] >= 0) 419 cur_ = txt_ - buf_ + static_cast<size_t>(lap_[la]); // mind the (new) gap 420 opcode = *++pc; 421 continue; 422 } 423 case 0xFB: // HEAD 424 opcode = *++pc; 425 continue; 426 #if !defined(WITH_NO_INDENT) 427 case Pattern::META_DED - Pattern::META_MIN: 428 DBGLOG("DED? %d", c1); 429 if (jump == Pattern::Const::IMAX && back == Pattern::Const::IMAX && bol && dedent()) 430 { 431 jump = Pattern::index_of(opcode); 432 if (jump == Pattern::Const::LONG) 433 jump = Pattern::long_index_of(*++pc); 434 } 435 opcode = *++pc; 436 continue; 437 case Pattern::META_IND - Pattern::META_MIN: 438 DBGLOG("IND? %d", c1); 439 if (jump == Pattern::Const::IMAX && back == Pattern::Const::IMAX && bol && indent()) 440 { 441 jump = Pattern::index_of(opcode); 442 if (jump == Pattern::Const::LONG) 443 jump = Pattern::long_index_of(*++pc); 444 } 445 opcode = *++pc; 446 continue; 447 case Pattern::META_UND - Pattern::META_MIN: 448 DBGLOG("UND"); 449 if (mrk_) 450 { 451 jump = Pattern::index_of(opcode); 452 if (jump == Pattern::Const::LONG) 453 jump = Pattern::long_index_of(*++pc); 454 } 455 mrk_ = false; 456 ded_ = 0; 457 opcode = *++pc; 458 continue; 459 #endif 460 case Pattern::META_EOB - Pattern::META_MIN: 461 DBGLOG("EOB? %d", c1); 462 if (jump == Pattern::Const::IMAX && c1 == EOF) 463 { 464 jump = Pattern::index_of(opcode); 465 if (jump == Pattern::Const::LONG) 466 jump = Pattern::long_index_of(*++pc); 467 } 468 opcode = *++pc; 469 continue; 470 case Pattern::META_BOB - Pattern::META_MIN: 471 DBGLOG("BOB? %d", at_bob()); 472 if (jump == Pattern::Const::IMAX && at_bob()) 473 { 474 jump = Pattern::index_of(opcode); 475 if (jump == Pattern::Const::LONG) 476 jump = Pattern::long_index_of(*++pc); 477 } 478 opcode = *++pc; 479 continue; 480 case Pattern::META_EOL - Pattern::META_MIN: 481 DBGLOG("EOL? %d", c1); 482 anc_ = true; 483 if (jump == Pattern::Const::IMAX && (c1 == EOF || c1 == '\n' || (c1 == '\r' && peek() == '\n'))) 484 { 485 jump = Pattern::index_of(opcode); 486 if (jump == Pattern::Const::LONG) 487 jump = Pattern::long_index_of(*++pc); 488 } 489 opcode = *++pc; 490 continue; 491 case Pattern::META_BOL - Pattern::META_MIN: 492 DBGLOG("BOL? %d", bol); 493 anc_ = true; 494 if (jump == Pattern::Const::IMAX && bol) 495 { 496 jump = Pattern::index_of(opcode); 497 if (jump == Pattern::Const::LONG) 498 jump = Pattern::long_index_of(*++pc); 499 } 500 opcode = *++pc; 501 continue; 502 case Pattern::META_EWE - Pattern::META_MIN: 503 DBGLOG("EWE? %d %d %d", c0, c1, isword(c0) && !isword(c1)); 504 anc_ = true; 505 if (jump == Pattern::Const::IMAX && (isword(c0) || opt_.W) && !isword(c1)) 506 { 507 jump = Pattern::index_of(opcode); 508 if (jump == Pattern::Const::LONG) 509 jump = Pattern::long_index_of(*++pc); 510 } 511 opcode = *++pc; 512 continue; 513 case Pattern::META_BWE - Pattern::META_MIN: 514 DBGLOG("BWE? %d %d %d", c0, c1, !isword(c0) && isword(c1)); 515 anc_ = true; 516 if (jump == Pattern::Const::IMAX && !isword(c0) && isword(c1)) 517 { 518 jump = Pattern::index_of(opcode); 519 if (jump == Pattern::Const::LONG) 520 jump = Pattern::long_index_of(*++pc); 521 } 522 opcode = *++pc; 523 continue; 524 case Pattern::META_EWB - Pattern::META_MIN: 525 DBGLOG("EWB? %d", at_eow()); 526 anc_ = true; 527 if (jump == Pattern::Const::IMAX && isword(got_) && 528 !isword(static_cast<unsigned char>(method == Const::SPLIT ? txt_[len_] : *txt_))) 529 { 530 jump = Pattern::index_of(opcode); 531 if (jump == Pattern::Const::LONG) 532 jump = Pattern::long_index_of(*++pc); 533 } 534 opcode = *++pc; 535 continue; 536 case Pattern::META_BWB - Pattern::META_MIN: 537 DBGLOG("BWB? %d", at_bow()); 538 anc_ = true; 539 if (jump == Pattern::Const::IMAX && !isword(got_) && 540 (opt_.W || isword(static_cast<unsigned char>(method == Const::SPLIT ? txt_[len_] : *txt_)))) 541 { 542 jump = Pattern::index_of(opcode); 543 if (jump == Pattern::Const::LONG) 544 jump = Pattern::long_index_of(*++pc); 545 } 546 opcode = *++pc; 547 continue; 548 case Pattern::META_NWE - Pattern::META_MIN: 549 DBGLOG("NWE? %d %d %d", c0, c1, isword(c0) == isword(c1)); 550 anc_ = true; 551 if (jump == Pattern::Const::IMAX && isword(c0) == isword(c1)) 552 { 553 jump = Pattern::index_of(opcode); 554 if (jump == Pattern::Const::LONG) 555 jump = Pattern::long_index_of(*++pc); 556 } 557 opcode = *++pc; 558 continue; 559 case Pattern::META_NWB - Pattern::META_MIN: 560 DBGLOG("NWB? %d %d", at_bow(), at_eow()); 561 anc_ = true; 562 if (jump == Pattern::Const::IMAX && 563 isword(got_) == isword(static_cast<unsigned char>(txt_[len_]))) 564 { 565 jump = Pattern::index_of(opcode); 566 if (jump == Pattern::Const::LONG) 567 jump = Pattern::long_index_of(*++pc); 568 } 569 opcode = *++pc; 570 continue; 571 case 0xFF: // LONG 572 opcode = *++pc; 573 continue; 574 } 575 } 576 if (jump == Pattern::Const::IMAX) 577 { 578 if (back != Pattern::Const::IMAX) 579 { 580 pc = pat_->opc_ + back; 581 opcode = *pc; 582 } 583 break; 584 } 585 DBGLOG("Backtrack: pc = %u", jump); 586 if (back == Pattern::Const::IMAX) 587 back = static_cast<Pattern::Index>(pc - pat_->opc_); 588 pc = pat_->opc_ + jump; 589 opcode = *pc; 590 jump = Pattern::Const::IMAX; 591 } 592 if (c1 == EOF) 593 break; 594 } 595 else 596 { 597 if (Pattern::is_opcode_halt(opcode)) 598 break; 599 if (c1 == EOF) 600 break; 601 c1 = get(); 602 DBGLOG("Get: c1 = %d", c1); 603 if (c1 == EOF) 604 break; 605 } 606 { 607 Pattern::Opcode lo = c1 << 24; 608 Pattern::Opcode hi = lo | 0x00FFFFFF; 609 unrolled: 610 if (hi < opcode || lo > (opcode << 8)) 611 { 612 opcode = *++pc; 613 if (hi < opcode || lo > (opcode << 8)) 614 { 615 opcode = *++pc; 616 if (hi < opcode || lo > (opcode << 8)) 617 { 618 opcode = *++pc; 619 if (hi < opcode || lo > (opcode << 8)) 620 { 621 opcode = *++pc; 622 if (hi < opcode || lo > (opcode << 8)) 623 { 624 opcode = *++pc; 625 if (hi < opcode || lo > (opcode << 8)) 626 { 627 opcode = *++pc; 628 if (hi < opcode || lo > (opcode << 8)) 629 { 630 opcode = *++pc; 631 if (hi < opcode || lo > (opcode << 8)) 632 { 633 opcode = *++pc; 634 goto unrolled; 635 } 636 } 637 } 638 } 639 } 640 } 641 } 642 } 643 } 644 jump = Pattern::index_of(opcode); 645 if (jump == 0) 646 { 647 // loop back to start state after only one char matched (one transition) but w/o full match, then optimize 648 if (cap_ == 0 && pos_ == cur_ + 1 && method == Const::FIND) 649 cur_ = pos_; // set cur_ to move forward from cur_ + 1 with FIND advance() 650 } 651 else if (jump >= Pattern::Const::LONG) 652 { 653 if (jump == Pattern::Const::HALT) 654 break; 655 jump = Pattern::long_index_of(pc[1]); 656 } 657 pc = pat_->opc_ + jump; 658 } 659 // exit fuzzy loop if nothing consumed 660 if (pos_ == static_cast<size_t>(txt_ + len_ - buf_)) 661 break; 662 // match, i.e. cap_ > 0? 663 if (method == Const::MATCH) 664 { 665 // exit fuzzy loop if fuzzy match succeeds till end of input 666 if (cap_ > 0) 667 { 668 if (c1 == EOF) 669 break; 670 while (err_ < max_) 671 { 672 c1 = get(); 673 if (c1 == EOF) 674 break; 675 // skip one (multibyte) char 676 if (c1 >= 0xC0) 677 { 678 int n = (c1 >= 0xE0) + (c1 >= 0xF0); 679 while (n-- >= 0) 680 if ((c1 = get()) == EOF) 681 break; 682 } 683 ++err_; 684 } 685 if (at_end()) 686 { 687 DBGLOG("match pos = %zu", pos_); 688 set_current(pos_); 689 break; 690 } 691 } 692 } 693 else 694 { 695 // exit fuzzy loop if match or first char mismatched 696 if (cap_ > 0 || pos_ == static_cast<size_t>(txt_ + len_ - buf_ + 1)) 697 break; 698 } 699 // no match, use fuzzy matching with max error 700 if (c1 == '\0' || c1 == '\n' || c1 == EOF) 701 { 702 // do not try to fuzzy match NUL, LF, or EOF 703 if (err_ < max_ && del_) 704 { 705 ++err_; 706 // set backtrack point to insert pattern char only, not substitute, if pc0 os a different point than the last 707 if (stack == 0 || bpt_[stack - 1].pc0 != pc0) 708 { 709 point(bpt_[stack++], pc0, false, c1 == EOF); 710 DBGLOG("point[%u] at %zu EOF", stack - 1, pc0 - pat_->opc_); 711 } 712 } 713 pc = NULL; 714 while (stack > 0 && pc == NULL) 715 { 716 pc = backtrack(bpt_[stack - 1], c1); 717 if (pc == NULL) 718 --stack; 719 } 720 // exhausted all backtracking points? 721 if (pc == NULL) 722 break; 723 } 724 else 725 { 726 if (err_ < max_) 727 { 728 ++err_; 729 if (del_ || sub_) 730 { 731 // set backtrack point if pc0 is a different point than the last 732 if (stack == 0 || bpt_[stack - 1].pc0 != pc0) 733 { 734 point(bpt_[stack++], pc0); 735 DBGLOG("point[%u] at %zu pos %zu", stack - 1, pc0 - pat_->opc_, pos_ - 1); 736 } 737 } 738 if (ins_) 739 { 740 // try pattern char deletion (text insertion): skip one (multibyte) char then rerun opcode at pc0 741 if (c1 >= 0xC0) 742 { 743 int n = (c1 >= 0xE0) + (c1 >= 0xF0); 744 while (n-- >= 0) 745 if ((c1 = get()) == EOF) 746 break; 747 } 748 pc = pc0; 749 DBGLOG("delete %c at pos %zu", c1, pos_ - 1); 750 } 751 } 752 else 753 { 754 // try insertion or substitution of pattern char 755 pc = NULL; 756 while (stack > 0 && pc == NULL) 757 { 758 pc = backtrack(bpt_[stack - 1], c1); 759 if (pc == NULL) 760 --stack; 761 } 762 // exhausted all backtracking points? 763 if (pc == NULL) 764 break; 765 } 766 } 767 } 768 } 769 // if fuzzy matched with errors then perform a second pass ahead of this match to check for an exact match 770 if (cap_ > 0 && err_ > 0 && !sst.use && (method == Const::FIND || method == Const::SPLIT)) 771 { 772 // this part is based on advance() in matcher.cpp, limited to advancing ahead till the one of the first pattern char(s) match excluding \n 773 size_t loc = txt_ - buf_ + 1; 774 const char *s = buf_ + loc; 775 const char *e = static_cast<const char*>(std::memchr(s, '\n', cur_ - loc)); 776 if (e == NULL) 777 e = buf_ + cur_; 778 if (pat_->len_ == 0) 779 { 780 if (pat_->min_ > 0) 781 { 782 const Pattern::Pred *pma = pat_->pma_; 783 while (s < e && (pma[static_cast<uint8_t>(*s)] & 0xc0) == 0xc0) 784 ++s; 785 if (s < e) 786 { 787 loc = s - buf_; 788 sst.use = true; 789 sst.loc = loc; 790 sst.cap = cap_; 791 sst.txt = txt_ - buf_; 792 sst.cur = cur_; 793 sst.pos = pos_; 794 size_t tmp = ded_; 795 ded_ = sst.ded; 796 sst.ded = tmp; 797 sst.mrk = mrk_; 798 sst.err = err_; 799 set_current(loc); 800 goto scan; 801 } 802 } 803 } 804 else if (s < e) 805 { 806 s = static_cast<const char*>(std::memchr(s, *pat_->pre_, e - s)); 807 if (s != NULL) 808 { 809 loc = s - buf_; 810 sst.use = true; 811 sst.loc = loc; 812 sst.cap = cap_; 813 sst.txt = txt_ - buf_; 814 sst.cur = cur_; 815 sst.pos = pos_; 816 size_t tmp = ded_; 817 ded_ = sst.ded; 818 sst.ded = tmp; 819 sst.mrk = mrk_; 820 sst.err = err_; 821 set_current(loc); 822 goto scan; 823 } 824 } 825 } 826 else if (sst.use && (cap_ == 0 || err_ >= sst.err)) 827 { 828 // if the buffer was shifted then cur_, pos_ and txt_ are no longer at the same location in the buffer, we must adjust for this 829 size_t loc = txt_ - buf_; 830 size_t shift = sst.loc - loc; 831 cap_ = sst.cap; 832 cur_ = sst.cur - shift; 833 pos_ = sst.pos - shift; 834 ded_ = sst.ded; 835 mrk_ = sst.mrk; 836 err_ = sst.err; 837 txt_ = buf_ + sst.txt - shift; 838 } 839 else if (sst.use && cap_ > 0 && method == Const::SPLIT) 840 { 841 size_t loc = txt_ - buf_; 842 size_t shift = sst.loc - loc; 843 len_ = loc - sst.txt + shift; 844 } 845 #if !defined(WITH_NO_INDENT) 846 if (mrk_ && cap_ != Const::REDO) 847 { 848 if (col_ > 0 && (tab_.empty() || tab_.back() < col_)) 849 { 850 DBGLOG("Set new stop: tab_[%zu] = %zu", tab_.size(), col_); 851 tab_.push_back(col_); 852 } 853 else if (!tab_.empty() && tab_.back() > col_) 854 { 855 size_t n; 856 for (n = tab_.size() - 1; n > 0; --n) 857 if (tab_.at(n - 1) <= col_) 858 break; 859 ded_ += tab_.size() - n; 860 DBGLOG("Dedents: ded = %zu tab_ = %zu", ded_, tab_.size()); 861 tab_.resize(n); 862 // adjust stop when indents are not aligned (Python would give an error) 863 if (n > 0) 864 tab_.back() = col_; 865 } 866 } 867 if (ded_ > 0) 868 { 869 DBGLOG("Dedents: ded = %zu", ded_); 870 if (col_ == 0 && bol) 871 { 872 ded_ += tab_.size(); 873 tab_.resize(0); 874 DBGLOG("Rescan for pending dedents: ded = %zu", ded_); 875 pos_ = ind_; 876 // avoid looping, match \j exactly 877 bol = false; 878 goto redo; 879 } 880 --ded_; 881 } 882 #endif 883 if (method == Const::SPLIT) 884 { 885 DBGLOG("Split: len = %zu cap = %zu cur = %zu pos = %zu end = %zu txt-buf = %zu eob = %d got = %d", len_, cap_, cur_, pos_, end_, txt_-buf_, (int)eof_, got_); 886 if (cap_ == 0 || (cur_ == static_cast<size_t>(txt_ - buf_) && !at_bob())) 887 { 888 if (!hit_end() && (txt_ + len_ < buf_ + end_ || peek() != EOF)) 889 { 890 ++len_; 891 DBGLOG("Split continue: len = %zu", len_); 892 set_current(++cur_); 893 goto find; 894 } 895 if (got_ != Const::EOB) 896 cap_ = Const::EMPTY; 897 else 898 cap_ = 0; 899 set_current(end_); 900 got_ = Const::EOB; 901 DBGLOG("Split at eof: cap = %zu txt = '%s' len = %zu", cap_, std::string(txt_, len_).c_str(), len_); 902 DBGLOG("END FuzzyMatcher::match()"); 903 return cap_; 904 } 905 if (cur_ == 0 && at_bob() && at_end()) 906 { 907 cap_ = Const::EMPTY; 908 got_ = Const::EOB; 909 } 910 else 911 { 912 set_current(cur_); 913 } 914 DBGLOG("Split: txt = '%s' len = %zu", std::string(txt_, len_).c_str(), len_); 915 DBGLOG("END FuzzyMatcher::match()"); 916 return cap_; 917 } 918 if (cap_ == 0) 919 { 920 if (method == Const::FIND) 921 { 922 if (!at_end()) 923 { 924 if (anc_) 925 { 926 cur_ = txt_ - buf_; // reset current to pattern start when a word boundary was encountered 927 anc_ = false; 928 } 929 // fuzzy search with find() can safely advance on a single prefix char of the regex 930 if (pos_ > cur_) 931 { 932 // this part is based on advance() in matcher.cpp, limited to advancing ahead till the one of the first pattern char(s) match 933 size_t loc = cur_ + 1; 934 if (pat_->len_ == 0) 935 { 936 if (pat_->min_ > 0) 937 { 938 const Pattern::Pred *pma = pat_->pma_; 939 while (true) 940 { 941 const char *s = buf_ + loc; 942 const char *e = buf_ + end_; 943 while (s < e && (pma[static_cast<uint8_t>(*s)] & 0xc0) == 0xc0) 944 ++s; 945 if (s < e) 946 { 947 loc = s - buf_; 948 set_current(loc); 949 goto scan; 950 } 951 loc = e - buf_; 952 set_current_match(loc - 1); 953 peek_more(); 954 loc = cur_ + 1; 955 if (loc >= end_) 956 break; 957 } 958 } 959 } 960 else 961 { 962 while (true) 963 { 964 const char *s = buf_ + loc; 965 const char *e = buf_ + end_; 966 s = static_cast<const char*>(std::memchr(s, *pat_->pre_, e - s)); 967 if (s != NULL) 968 { 969 loc = s - buf_; 970 set_current(loc); 971 goto scan; 972 } 973 loc = e - buf_; 974 set_current_match(loc - 1); 975 peek_more(); 976 loc = cur_ + 1; 977 if (loc + pat_->len_ > end_) 978 break; 979 } 980 } 981 } 982 txt_ = buf_ + cur_; 983 } 984 } 985 else 986 { 987 // SCAN and MATCH: no match: backup to begin of unmatched text to report as error 988 cur_ = txt_ - buf_; 989 } 990 } 991 len_ = cur_ - (txt_ - buf_); 992 if (len_ == 0 && !nul) 993 { 994 DBGLOG("Empty or no match cur = %zu pos = %zu end = %zu", cur_, pos_, end_); 995 pos_ = cur_; 996 if (at_end()) 997 { 998 set_current(cur_); 999 DBGLOG("Reject empty match at EOF"); 1000 cap_ = 0; 1001 } 1002 else if (method == Const::FIND) 1003 { 1004 DBGLOG("Reject empty match and continue?"); 1005 // skip one char to keep searching 1006 set_current(++cur_); 1007 // allow FIND with "N" to match an empty line, with ^$ etc. 1008 if (cap_ == 0 || !opt_.N || (!bol && (c1 == '\n' || (c1 == '\r' && peek() == '\n')))) 1009 goto scan; 1010 DBGLOG("Accept empty match"); 1011 } 1012 else 1013 { 1014 set_current(cur_); 1015 DBGLOG("Reject empty match"); 1016 cap_ = 0; 1017 } 1018 } 1019 else if (len_ == 0 && cur_ == end_) 1020 { 1021 DBGLOG("Hit end: got = %d", got_); 1022 if (cap_ == Const::REDO && !opt_.A) 1023 cap_ = 0; 1024 } 1025 else 1026 { 1027 set_current(cur_); 1028 if (len_ > 0 && cap_ == Const::REDO && !opt_.A) 1029 { 1030 DBGLOG("Ignore accept and continue: len = %zu", len_); 1031 len_ = 0; 1032 if (method != Const::MATCH) 1033 goto scan; 1034 cap_ = 0; 1035 } 1036 } 1037 DBGLOG("Return: cap = %zu txt = '%s' len = %zu pos = %zu got = %d", cap_, std::string(txt_, len_).c_str(), len_, pos_, got_); 1038 DBGLOG("END match()"); 1039 return cap_; 1040 } 1041 std::vector<BacktrackPoint> bpt_; ///< vector of backtrack points, max_ size 1042 uint8_t max_; ///< max errors 1043 uint8_t err_; ///< accumulated edit distance (not guaranteed minimal) 1044 bool ins_; ///< fuzzy match inserted chars (extra chars) 1045 bool del_; ///< fuzzy match deleted chars (missing chars) 1046 bool sub_; ///< fuzzy match substituted chars 1047 }; 1048 1049 } // namespace reflex 1050 1051 #endif 1052