1 /******************************************************************************\
2 * Copyright (c) 2016, Robert van Engelen, Genivia Inc. All rights reserved.    *
3 *                                                                              *
4 * Redistribution and use in source and binary forms, with or without           *
5 * modification, are permitted provided that the following conditions are met:  *
6 *                                                                              *
7 *   (1) Redistributions of source code must retain the above copyright notice, *
8 *       this list of conditions and the following disclaimer.                  *
9 *                                                                              *
10 *   (2) Redistributions in binary form must reproduce the above copyright      *
11 *       notice, this list of conditions and the following disclaimer in the    *
12 *       documentation and/or other materials provided with the distribution.   *
13 *                                                                              *
14 *   (3) The name of the author may not be used to endorse or promote products  *
15 *       derived from this software without specific prior written permission.  *
16 *                                                                              *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED *
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF         *
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO   *
20 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,       *
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;  *
23 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,     *
24 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR      *
25 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF       *
26 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                   *
27 \******************************************************************************/
28 
29 /**
30 @file      fuzzymatcher.h
31 @brief     RE/flex fuzzy matcher engine
32 @author    Robert van Engelen - engelen@genivia.com
33 @copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved.
34 @copyright (c) BSD-3 License - see LICENSE.txt
35 */
36 
37 #ifndef REFLEX_FUZZYMATCHER_H
38 #define REFLEX_FUZZYMATCHER_H
39 
40 #include <reflex/matcher.h>
41 #include <reflex/pattern.h>
42 
43 namespace reflex {
44 
45 /// RE/flex fuzzy matcher engine class, implements reflex::Matcher fuzzy pattern matching interface with scan, find, split functors and iterators.
46 /** More info TODO */
47 class FuzzyMatcher : public Matcher {
48  public:
49   /// Optional flags for the max parameter to constrain fuzzy matching, otherwise no constraints
50   static const uint16_t INS = 0x1000; ///< fuzzy match allows character insertions
51   static const uint16_t DEL = 0x2000; ///< fuzzy match allows character deletions
52   static const uint16_t SUB = 0x4000; ///< character substitutions count as one edit, not two (insert+delete)
53   /// Default constructor.
FuzzyMatcher()54   FuzzyMatcher()
55     :
56       Matcher(),
57       max_(1),
58       err_(0),
59       ins_(true),
60       del_(true),
61       sub_(true)
62   {
63     bpt_.resize(max_);
64   }
65   /// Construct matcher engine from a pattern or a string regex, and an input character sequence.
66   template<typename P> /// @tparam <P> a reflex::Pattern or a string regex
67   FuzzyMatcher(
68       const P     *pattern,         ///< points to a reflex::Pattern or a string regex for this matcher
69       const Input& input = Input(), ///< input character sequence for this matcher
70       const char  *opt = NULL)      ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
71     :
Matcher(pattern,input,opt)72       Matcher(pattern, input, opt),
73       max_(1),
74       err_(0),
75       ins_(true),
76       del_(true),
77       sub_(true)
78   {
79     bpt_.resize(max_);
80   }
81   /// Construct matcher engine from a pattern or a string regex, and an input character sequence.
82   template<typename P> /// @tparam <P> a reflex::Pattern or a string regex
83   FuzzyMatcher(
84       const P     *pattern,         ///< points to a reflex::Pattern or a string regex for this matcher
85       uint16_t     max,             ///< max errors
86       const Input& input = Input(), ///< input character sequence for this matcher
87       const char  *opt = NULL)      ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
88     :
Matcher(pattern,input,opt)89       Matcher(pattern, input, opt),
90       max_(static_cast<uint8_t>(max)),
91       err_(0),
92       ins_(max <= 0xFF || (max & INS)),
93       del_(max <= 0xFF || (max & DEL)),
94       sub_(max <= 0xFF || (max & SUB))
95   {
96     bpt_.resize(max_);
97   }
98   /// Construct matcher engine from a pattern or a string regex, and an input character sequence.
99   template<typename P> /// @tparam <P> a reflex::Pattern or a string regex
100   FuzzyMatcher(
101       const P&     pattern,         ///< a reflex::Pattern or a string regex for this matcher
102       const Input& input = Input(), ///< input character sequence for this matcher
103       const char  *opt = NULL)      ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
104     :
Matcher(pattern,input,opt)105       Matcher(pattern, input, opt),
106       max_(1),
107       err_(0),
108       ins_(true),
109       del_(true),
110       sub_(true)
111   {
112     bpt_.resize(max_);
113   }
114   /// Construct matcher engine from a pattern or a string regex, and an input character sequence.
115   template<typename P> /// @tparam <P> a reflex::Pattern or a string regex
116   FuzzyMatcher(
117       const P&     pattern,         ///< a reflex::Pattern or a string regex for this matcher
118       uint16_t     max,             ///< max errors
119       const Input& input = Input(), ///< input character sequence for this matcher
120       const char  *opt = NULL)      ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
121     :
Matcher(pattern,input,opt)122       Matcher(pattern, input, opt),
123       max_(static_cast<uint8_t>(max)),
124       err_(0),
125       ins_(max <= 0xFF || (max & INS)),
126       del_(max <= 0xFF || (max & DEL)),
127       sub_(max <= 0xFF || (max & SUB))
128   {
129     bpt_.resize(max_);
130   }
131   /// Copy constructor.
FuzzyMatcher(const FuzzyMatcher & matcher)132   FuzzyMatcher(const FuzzyMatcher& matcher) ///< matcher to copy with pattern (pattern may be shared)
133     :
134       Matcher(matcher),
135       max_(matcher.max_),
136       err_(0),
137       ins_(true),
138       del_(true),
139       sub_(true)
140   {
141     DBGLOG("FuzzyMatcher::FuzzyMatcher(matcher)");
142     bpt_.resize(max_);
143   }
144   /// Assign a matcher.
145   FuzzyMatcher& operator=(const FuzzyMatcher& matcher) ///< matcher to copy
146   {
147     Matcher::operator=(matcher);
148     max_ = matcher.max_;
149     err_ = 0;
150     ins_ = matcher.ins_;
151     del_ = matcher.del_;
152     sub_ = matcher.sub_;
153     return *this;
154   }
155   /// Polymorphic cloning.
clone()156   virtual FuzzyMatcher *clone()
157   {
158     return new FuzzyMatcher(*this);
159   }
160   /// Returns the number of edits made for the match, edits() <= max, not guaranteed to be the minimum edit distance.
edits()161   uint8_t edits()
162     /// @returns 0 to max edit distance
163     const
164   {
165     return err_;
166   }
167  protected:
168   /// Save state to restore fuzzy matcher state after a second pass
169   struct SaveState {
SaveStateSaveState170     SaveState(size_t ded)
171       :
172         use(false),
173         loc(0),
174         cap(0),
175         txt(0),
176         cur(0),
177         pos(0),
178         ded(ded),
179         mrk(false),
180         err(0)
181     { }
182     bool    use;
183     size_t  loc;
184     size_t  cap;
185     size_t  txt;
186     size_t  cur;
187     size_t  pos;
188     size_t  ded;
189     bool    mrk;
190     uint8_t err;
191   };
192   /// Backtrack point.
193   struct BacktrackPoint {
BacktrackPointBacktrackPoint194     BacktrackPoint()
195       :
196         pc0(NULL),
197         pc1(NULL),
198         len(0),
199         err(0),
200         alt(true),
201         sub(true)
202     { }
203     const Pattern::Opcode *pc0; ///< start of opcode
204     const Pattern::Opcode *pc1; ///< pointer to opcode to rerun on backtracking
205     size_t                 len; ///< length of string matched so far
206     uint8_t                err; ///< to restore errors
207     bool                   alt; ///< true if alternating between pattern char substitution and insertion, otherwise insertion only
208     bool                   sub; ///< flag alternates between pattern char substitution (true) and insertion (false)
209   };
210   /// Set backtrack point.
211   void point(BacktrackPoint& bpt, const Pattern::Opcode *pc, bool alternate = true, bool eof = false)
212   {
213     // advance to the first goto opcode
214     while (!Pattern::is_opcode_goto(*pc))
215       ++pc;
216     bpt.pc0 = pc;
217     bpt.pc1 = pc;
218     bpt.len = pos_ - (txt_ - buf_) - !eof;
219     bpt.err = err_;
220     bpt.alt = sub_ && alternate;
221     bpt.sub = bpt.alt;
222   }
223   /// backtrack on a backtrack point to insert or substitute a pattern char, restoring current text char matched and errors.
backtrack(BacktrackPoint & bpt,int & c1)224   const Pattern::Opcode *backtrack(BacktrackPoint& bpt, int& c1)
225   {
226     // no more alternatives
227     if (bpt.pc1 == NULL)
228       return NULL;
229     // done when no more goto opcodes on characters remain
230     if (!Pattern::is_opcode_goto(*bpt.pc1) || Pattern::is_meta(Pattern::lo_of(*bpt.pc1)))
231       return bpt.pc1 = NULL;
232     Pattern::Index jump = Pattern::index_of(*bpt.pc1);
233     // last opcode is a HALT?
234     if (jump == Pattern::Const::HALT)
235     {
236       if (!Pattern::is_opcode_goto(*bpt.pc0) || (Pattern::lo_of(*bpt.pc0) & 0xC0) != 0xC0)
237         return bpt.pc1 = NULL;
238       // loop over UTF-8 multibytes, checking linear case only (i.e. one wide char or a short range)
239       for (int i = 0; i < 3; ++i)
240       {
241         jump = Pattern::index_of(*bpt.pc0);
242         if (jump == Pattern::Const::HALT)
243           return bpt.pc1 = NULL;
244         if (jump == Pattern::Const::LONG)
245           jump = Pattern::long_index_of(bpt.pc0[1]);
246         const Pattern::Opcode *pc0 = pat_->opc_ + jump;
247         const Pattern::Opcode *pc1 = pc0;
248         while (!Pattern::is_opcode_goto(*pc1))
249           ++pc1;
250         if (!Pattern::is_opcode_goto(*pc1) || Pattern::is_meta(Pattern::lo_of(*pc1)) || (Pattern::lo_of(*pc1) & 0x80) != 0x80)
251           break;
252         bpt.pc0 = pc0;
253         bpt.pc1 = pc1;
254       }
255       jump = Pattern::index_of(*bpt.pc1);
256       bpt.sub = bpt.alt;
257       DBGLOG("Multibyte jump to %u", jump);
258     }
259     if (jump == Pattern::Const::LONG)
260       jump = Pattern::long_index_of(bpt.pc1[1]);
261     // restore errors
262     err_ = bpt.err;
263     // restore pos in the input
264     pos_ = (txt_ - buf_) + bpt.len;
265     // set c1 to previous char before pos, to eventually set c0 in match(method)
266     if (pos_ > 0)
267       c1 = static_cast<unsigned char>(buf_[pos_ - 1]);
268     else
269       c1 = got_;
270     // substitute or insert a pattern char in the text?
271     if (bpt.sub)
272     {
273       DBGLOG("Substitute, jump to %u at pos %zu", jump, pos_);
274       // skip UTF-8 multibytes
275       int c = get();
276       if (c >= 0xC0)
277       {
278         int n = (c >= 0xE0) + (c >= 0xF0);
279         while (n-- >= 0)
280           if ((c = get()) == EOF)
281             break;
282       }
283       bpt.sub = false;
284       bpt.pc1 += !bpt.alt;
285     }
286     else
287     {
288       DBGLOG("Insert, jump to %u at pos %zu", jump, pos_);
289       bpt.sub = bpt.alt;
290       ++bpt.pc1;
291     }
292     return pat_->opc_ + jump;
293   }
294   /// Returns true if input fuzzy-matched the pattern using method Const::SCAN, Const::FIND, Const::SPLIT, or Const::MATCH.
match(Method method)295   virtual size_t match(Method method) ///< Const::SCAN, Const::FIND, Const::SPLIT, or Const::MATCH
296     /// @returns nonzero if input matched the pattern
297   {
298     DBGLOG("BEGIN FuzzyMatcher::match()");
299     reset_text();
300     SaveState sst(ded_);
301     len_ = 0; // split text length starts with 0
302     anc_ = false; // no word boundary anchor found and applied
303 scan:
304     txt_ = buf_ + cur_;
305 #if !defined(WITH_NO_INDENT)
306     mrk_ = false;
307     ind_ = pos_; // ind scans input in buf[] in newline() up to pos - 1
308     col_ = 0; // count columns for indent matching
309 #endif
310 find:
311     int c1 = got_;
312     bool bol = at_bol(); // at begin of line?
313 #if !defined(WITH_NO_INDENT)
314 redo:
315 #endif
316     lap_.resize(0);
317     cap_ = 0;
318     bool nul = method == Const::MATCH;
319     if (pat_->opc_ != NULL)
320     {
321       err_ = 0;
322       uint8_t stack = 0;
323       const Pattern::Opcode *pc = pat_->opc_;
324       while (true)
325       {
326         const Pattern::Opcode *pc0;
327         while (true)
328         {
329           Pattern::Opcode opcode = *pc;
330           Pattern::Index jump;
331           DBGLOG("Fetch: code[%zu] = 0x%08X", pc - pat_->opc_, opcode);
332           pc0 = pc;
333           if (!Pattern::is_opcode_goto(opcode))
334           {
335             switch (opcode >> 24)
336             {
337               case 0xFE: // TAKE
338                 cap_ = Pattern::long_index_of(opcode);
339                 cur_ = pos_;
340                 ++pc;
341                 DBGLOG("Take: cap = %zu", cap_);
342                 continue;
343               case 0xFD: // REDO
344                 cap_ = Const::REDO;
345                 DBGLOG("Redo");
346                 cur_ = pos_;
347                 ++pc;
348                 continue;
349               case 0xFC: // TAIL
350                 {
351                   Pattern::Lookahead la = Pattern::lookahead_of(opcode);
352                   DBGLOG("Tail: %u", la);
353                   if (lap_.size() > la && lap_[la] >= 0)
354                     cur_ = txt_ - buf_ + static_cast<size_t>(lap_[la]); // mind the (new) gap
355                   ++pc;
356                   continue;
357                 }
358               case 0xFB: // HEAD
359                 {
360                   Pattern::Lookahead la = Pattern::lookahead_of(opcode);
361                   DBGLOG("Head: lookahead[%u] = %zu", la, pos_ - (txt_ - buf_));
362                   if (lap_.size() <= la)
363                     lap_.resize(la + 1, -1);
364                   lap_[la] = static_cast<int>(pos_ - (txt_ - buf_)); // mind the gap
365                   ++pc;
366                   continue;
367                 }
368 #if !defined(WITH_NO_INDENT)
369               case Pattern::META_DED - Pattern::META_MIN:
370                 if (ded_ > 0)
371                 {
372                   jump = Pattern::index_of(opcode);
373                   if (jump == Pattern::Const::LONG)
374                     jump = Pattern::long_index_of(pc[1]);
375                   DBGLOG("Dedent ded = %zu", ded_); // unconditional dedent matching \j
376                   nul = true;
377                   pc = pat_->opc_ + jump;
378                   continue;
379                 }
380 #endif
381             }
382             if (c1 == EOF)
383               break;
384             int c0 = c1;
385             c1 = get();
386             DBGLOG("Get: c1 = %d", c1);
387             // where to jump back to (backtrack on meta transitions)
388             Pattern::Index back = Pattern::Const::IMAX;
389             // to jump to longest sequence of matching metas
390             jump = Pattern::Const::IMAX;
391             while (true)
392             {
393               if ((jump == Pattern::Const::IMAX || back == Pattern::Const::IMAX) && !Pattern::is_opcode_goto(opcode))
394               {
395                 // we no longer have to pass through all if jump and back are set
396                 switch (opcode >> 24)
397                 {
398                   case 0xFE: // TAKE
399                     cap_ = Pattern::long_index_of(opcode);
400                     cur_ = pos_;
401                     if (c1 != EOF)
402                       --cur_; // must unget one char
403                     opcode = *++pc;
404                     DBGLOG("Take: cap = %zu", cap_);
405                     continue;
406                   case 0xFD: // REDO
407                     cap_ = Const::REDO;
408                     DBGLOG("Redo");
409                     cur_ = pos_;
410                     if (c1 != EOF)
411                       --cur_; // must unget one char
412                     opcode = *++pc;
413                     continue;
414                   case 0xFC: // TAIL
415                     {
416                       Pattern::Lookahead la = Pattern::lookahead_of(opcode);
417                       DBGLOG("Tail: %u", la);
418                       if (lap_.size() > la && lap_[la] >= 0)
419                         cur_ = txt_ - buf_ + static_cast<size_t>(lap_[la]); // mind the (new) gap
420                       opcode = *++pc;
421                       continue;
422                     }
423                   case 0xFB: // HEAD
424                     opcode = *++pc;
425                     continue;
426 #if !defined(WITH_NO_INDENT)
427                   case Pattern::META_DED - Pattern::META_MIN:
428                     DBGLOG("DED? %d", c1);
429                     if (jump == Pattern::Const::IMAX && back == Pattern::Const::IMAX && bol && dedent())
430                     {
431                       jump = Pattern::index_of(opcode);
432                       if (jump == Pattern::Const::LONG)
433                         jump = Pattern::long_index_of(*++pc);
434                     }
435                     opcode = *++pc;
436                     continue;
437                   case Pattern::META_IND - Pattern::META_MIN:
438                     DBGLOG("IND? %d", c1);
439                     if (jump == Pattern::Const::IMAX && back == Pattern::Const::IMAX && bol && indent())
440                     {
441                       jump = Pattern::index_of(opcode);
442                       if (jump == Pattern::Const::LONG)
443                         jump = Pattern::long_index_of(*++pc);
444                     }
445                     opcode = *++pc;
446                     continue;
447                   case Pattern::META_UND - Pattern::META_MIN:
448                     DBGLOG("UND");
449                     if (mrk_)
450                     {
451                       jump = Pattern::index_of(opcode);
452                       if (jump == Pattern::Const::LONG)
453                         jump = Pattern::long_index_of(*++pc);
454                     }
455                     mrk_ = false;
456                     ded_ = 0;
457                     opcode = *++pc;
458                     continue;
459 #endif
460                   case Pattern::META_EOB - Pattern::META_MIN:
461                     DBGLOG("EOB? %d", c1);
462                     if (jump == Pattern::Const::IMAX && c1 == EOF)
463                     {
464                       jump = Pattern::index_of(opcode);
465                       if (jump == Pattern::Const::LONG)
466                         jump = Pattern::long_index_of(*++pc);
467                     }
468                     opcode = *++pc;
469                     continue;
470                   case Pattern::META_BOB - Pattern::META_MIN:
471                     DBGLOG("BOB? %d", at_bob());
472                     if (jump == Pattern::Const::IMAX && at_bob())
473                     {
474                       jump = Pattern::index_of(opcode);
475                       if (jump == Pattern::Const::LONG)
476                         jump = Pattern::long_index_of(*++pc);
477                     }
478                     opcode = *++pc;
479                     continue;
480                   case Pattern::META_EOL - Pattern::META_MIN:
481                     DBGLOG("EOL? %d", c1);
482                     anc_ = true;
483                     if (jump == Pattern::Const::IMAX && (c1 == EOF || c1 == '\n' || (c1 == '\r' && peek() == '\n')))
484                     {
485                       jump = Pattern::index_of(opcode);
486                       if (jump == Pattern::Const::LONG)
487                         jump = Pattern::long_index_of(*++pc);
488                     }
489                     opcode = *++pc;
490                     continue;
491                   case Pattern::META_BOL - Pattern::META_MIN:
492                     DBGLOG("BOL? %d", bol);
493                     anc_ = true;
494                     if (jump == Pattern::Const::IMAX && bol)
495                     {
496                       jump = Pattern::index_of(opcode);
497                       if (jump == Pattern::Const::LONG)
498                         jump = Pattern::long_index_of(*++pc);
499                     }
500                     opcode = *++pc;
501                     continue;
502                   case Pattern::META_EWE - Pattern::META_MIN:
503                     DBGLOG("EWE? %d %d %d", c0, c1, isword(c0) && !isword(c1));
504                     anc_ = true;
505                     if (jump == Pattern::Const::IMAX && (isword(c0) || opt_.W) && !isword(c1))
506                     {
507                       jump = Pattern::index_of(opcode);
508                       if (jump == Pattern::Const::LONG)
509                         jump = Pattern::long_index_of(*++pc);
510                     }
511                     opcode = *++pc;
512                     continue;
513                   case Pattern::META_BWE - Pattern::META_MIN:
514                     DBGLOG("BWE? %d %d %d", c0, c1, !isword(c0) && isword(c1));
515                     anc_ = true;
516                     if (jump == Pattern::Const::IMAX && !isword(c0) && isword(c1))
517                     {
518                       jump = Pattern::index_of(opcode);
519                       if (jump == Pattern::Const::LONG)
520                         jump = Pattern::long_index_of(*++pc);
521                     }
522                     opcode = *++pc;
523                     continue;
524                   case Pattern::META_EWB - Pattern::META_MIN:
525                     DBGLOG("EWB? %d", at_eow());
526                     anc_ = true;
527                     if (jump == Pattern::Const::IMAX && isword(got_) &&
528                         !isword(static_cast<unsigned char>(method == Const::SPLIT ? txt_[len_] : *txt_)))
529                     {
530                       jump = Pattern::index_of(opcode);
531                       if (jump == Pattern::Const::LONG)
532                         jump = Pattern::long_index_of(*++pc);
533                     }
534                     opcode = *++pc;
535                     continue;
536                   case Pattern::META_BWB - Pattern::META_MIN:
537                     DBGLOG("BWB? %d", at_bow());
538                     anc_ = true;
539                     if (jump == Pattern::Const::IMAX && !isword(got_) &&
540                         (opt_.W || isword(static_cast<unsigned char>(method == Const::SPLIT ? txt_[len_] : *txt_))))
541                     {
542                       jump = Pattern::index_of(opcode);
543                       if (jump == Pattern::Const::LONG)
544                         jump = Pattern::long_index_of(*++pc);
545                     }
546                     opcode = *++pc;
547                     continue;
548                   case Pattern::META_NWE - Pattern::META_MIN:
549                     DBGLOG("NWE? %d %d %d", c0, c1, isword(c0) == isword(c1));
550                     anc_ = true;
551                     if (jump == Pattern::Const::IMAX && isword(c0) == isword(c1))
552                     {
553                       jump = Pattern::index_of(opcode);
554                       if (jump == Pattern::Const::LONG)
555                         jump = Pattern::long_index_of(*++pc);
556                     }
557                     opcode = *++pc;
558                     continue;
559                   case Pattern::META_NWB - Pattern::META_MIN:
560                     DBGLOG("NWB? %d %d", at_bow(), at_eow());
561                     anc_ = true;
562                     if (jump == Pattern::Const::IMAX &&
563                         isword(got_) == isword(static_cast<unsigned char>(txt_[len_])))
564                     {
565                       jump = Pattern::index_of(opcode);
566                       if (jump == Pattern::Const::LONG)
567                         jump = Pattern::long_index_of(*++pc);
568                     }
569                     opcode = *++pc;
570                     continue;
571                   case 0xFF: // LONG
572                     opcode = *++pc;
573                     continue;
574                 }
575               }
576               if (jump == Pattern::Const::IMAX)
577               {
578                 if (back != Pattern::Const::IMAX)
579                 {
580                   pc = pat_->opc_ + back;
581                   opcode = *pc;
582                 }
583                 break;
584               }
585               DBGLOG("Backtrack: pc = %u", jump);
586               if (back == Pattern::Const::IMAX)
587                 back = static_cast<Pattern::Index>(pc - pat_->opc_);
588               pc = pat_->opc_ + jump;
589               opcode = *pc;
590               jump = Pattern::Const::IMAX;
591             }
592             if (c1 == EOF)
593               break;
594           }
595           else
596           {
597             if (Pattern::is_opcode_halt(opcode))
598               break;
599             if (c1 == EOF)
600               break;
601             c1 = get();
602             DBGLOG("Get: c1 = %d", c1);
603             if (c1 == EOF)
604               break;
605           }
606           {
607             Pattern::Opcode lo = c1 << 24;
608             Pattern::Opcode hi = lo | 0x00FFFFFF;
609 unrolled:
610             if (hi < opcode || lo > (opcode << 8))
611             {
612               opcode = *++pc;
613               if (hi < opcode || lo > (opcode << 8))
614               {
615                 opcode = *++pc;
616                 if (hi < opcode || lo > (opcode << 8))
617                 {
618                   opcode = *++pc;
619                   if (hi < opcode || lo > (opcode << 8))
620                   {
621                     opcode = *++pc;
622                     if (hi < opcode || lo > (opcode << 8))
623                     {
624                       opcode = *++pc;
625                       if (hi < opcode || lo > (opcode << 8))
626                       {
627                         opcode = *++pc;
628                         if (hi < opcode || lo > (opcode << 8))
629                         {
630                           opcode = *++pc;
631                           if (hi < opcode || lo > (opcode << 8))
632                           {
633                             opcode = *++pc;
634                             goto unrolled;
635                           }
636                         }
637                       }
638                     }
639                   }
640                 }
641               }
642             }
643           }
644           jump = Pattern::index_of(opcode);
645           if (jump == 0)
646           {
647             // loop back to start state after only one char matched (one transition) but w/o full match, then optimize
648             if (cap_ == 0 && pos_ == cur_ + 1 && method == Const::FIND)
649               cur_ = pos_; // set cur_ to move forward from cur_ + 1 with FIND advance()
650           }
651           else if (jump >= Pattern::Const::LONG)
652           {
653             if (jump == Pattern::Const::HALT)
654               break;
655             jump = Pattern::long_index_of(pc[1]);
656           }
657           pc = pat_->opc_ + jump;
658         }
659         // exit fuzzy loop if nothing consumed
660         if (pos_ == static_cast<size_t>(txt_ + len_ - buf_))
661           break;
662         // match, i.e. cap_ > 0?
663         if (method == Const::MATCH)
664         {
665           // exit fuzzy loop if fuzzy match succeeds till end of input
666           if (cap_ > 0)
667           {
668             if (c1 == EOF)
669               break;
670             while (err_ < max_)
671             {
672               c1 = get();
673               if (c1 == EOF)
674                 break;
675               // skip one (multibyte) char
676               if (c1 >= 0xC0)
677               {
678                 int n = (c1 >= 0xE0) + (c1 >= 0xF0);
679                 while (n-- >= 0)
680                   if ((c1 = get()) == EOF)
681                     break;
682               }
683               ++err_;
684             }
685             if (at_end())
686             {
687               DBGLOG("match pos = %zu", pos_);
688               set_current(pos_);
689               break;
690             }
691           }
692         }
693         else
694         {
695           // exit fuzzy loop if match or first char mismatched
696           if (cap_ > 0 || pos_ == static_cast<size_t>(txt_ + len_ - buf_ + 1))
697             break;
698         }
699         // no match, use fuzzy matching with max error
700         if (c1 == '\0' || c1 == '\n' || c1 == EOF)
701         {
702           // do not try to fuzzy match NUL, LF, or EOF
703           if (err_ < max_ && del_)
704           {
705             ++err_;
706             // set backtrack point to insert pattern char only, not substitute, if pc0 os a different point than the last
707             if (stack == 0 || bpt_[stack - 1].pc0 != pc0)
708             {
709               point(bpt_[stack++], pc0, false, c1 == EOF);
710               DBGLOG("point[%u] at %zu EOF", stack - 1, pc0 - pat_->opc_);
711             }
712           }
713           pc = NULL;
714           while (stack > 0 && pc == NULL)
715           {
716             pc = backtrack(bpt_[stack - 1], c1);
717             if (pc == NULL)
718               --stack;
719           }
720           // exhausted all backtracking points?
721           if (pc == NULL)
722             break;
723         }
724         else
725         {
726           if (err_ < max_)
727           {
728             ++err_;
729             if (del_ || sub_)
730             {
731               // set backtrack point if pc0 is a different point than the last
732               if (stack == 0 || bpt_[stack - 1].pc0 != pc0)
733               {
734                 point(bpt_[stack++], pc0);
735                 DBGLOG("point[%u] at %zu pos %zu", stack - 1, pc0 - pat_->opc_, pos_ - 1);
736               }
737             }
738             if (ins_)
739             {
740               // try pattern char deletion (text insertion): skip one (multibyte) char then rerun opcode at pc0
741               if (c1 >= 0xC0)
742               {
743                 int n = (c1 >= 0xE0) + (c1 >= 0xF0);
744                 while (n-- >= 0)
745                   if ((c1 = get()) == EOF)
746                     break;
747               }
748               pc = pc0;
749               DBGLOG("delete %c at pos %zu", c1, pos_ - 1);
750             }
751           }
752           else
753           {
754             // try insertion or substitution of pattern char
755             pc = NULL;
756             while (stack > 0 && pc == NULL)
757             {
758               pc = backtrack(bpt_[stack - 1], c1);
759               if (pc == NULL)
760                 --stack;
761             }
762             // exhausted all backtracking points?
763             if (pc == NULL)
764               break;
765           }
766         }
767       }
768     }
769     // if fuzzy matched with errors then perform a second pass ahead of this match to check for an exact match
770     if (cap_ > 0 && err_ > 0 && !sst.use && (method == Const::FIND || method == Const::SPLIT))
771     {
772       // this part is based on advance() in matcher.cpp, limited to advancing ahead till the one of the first pattern char(s) match excluding \n
773       size_t loc = txt_ - buf_ + 1;
774       const char *s = buf_ + loc;
775       const char *e = static_cast<const char*>(std::memchr(s, '\n', cur_ - loc));
776       if (e == NULL)
777         e = buf_ + cur_;
778       if (pat_->len_ == 0)
779       {
780         if (pat_->min_ > 0)
781         {
782           const Pattern::Pred *pma = pat_->pma_;
783           while (s < e && (pma[static_cast<uint8_t>(*s)] & 0xc0) == 0xc0)
784             ++s;
785           if (s < e)
786           {
787             loc = s - buf_;
788             sst.use = true;
789             sst.loc = loc;
790             sst.cap = cap_;
791             sst.txt = txt_ - buf_;
792             sst.cur = cur_;
793             sst.pos = pos_;
794             size_t tmp = ded_;
795             ded_ = sst.ded;
796             sst.ded = tmp;
797             sst.mrk = mrk_;
798             sst.err = err_;
799             set_current(loc);
800             goto scan;
801           }
802         }
803       }
804       else if (s < e)
805       {
806         s = static_cast<const char*>(std::memchr(s, *pat_->pre_, e - s));
807         if (s != NULL)
808         {
809           loc = s - buf_;
810           sst.use = true;
811           sst.loc = loc;
812           sst.cap = cap_;
813           sst.txt = txt_ - buf_;
814           sst.cur = cur_;
815           sst.pos = pos_;
816           size_t tmp = ded_;
817           ded_ = sst.ded;
818           sst.ded = tmp;
819           sst.mrk = mrk_;
820           sst.err = err_;
821           set_current(loc);
822           goto scan;
823         }
824       }
825     }
826     else if (sst.use && (cap_ == 0 || err_ >= sst.err))
827     {
828       // if the buffer was shifted then cur_, pos_ and txt_ are no longer at the same location in the buffer, we must adjust for this
829       size_t loc = txt_ - buf_;
830       size_t shift = sst.loc - loc;
831       cap_ = sst.cap;
832       cur_ = sst.cur - shift;
833       pos_ = sst.pos - shift;
834       ded_ = sst.ded;
835       mrk_ = sst.mrk;
836       err_ = sst.err;
837       txt_ = buf_ + sst.txt - shift;
838     }
839     else if (sst.use && cap_ > 0 && method == Const::SPLIT)
840     {
841       size_t loc = txt_ - buf_;
842       size_t shift = sst.loc - loc;
843       len_ = loc - sst.txt + shift;
844     }
845 #if !defined(WITH_NO_INDENT)
846     if (mrk_ && cap_ != Const::REDO)
847     {
848       if (col_ > 0 && (tab_.empty() || tab_.back() < col_))
849       {
850         DBGLOG("Set new stop: tab_[%zu] = %zu", tab_.size(), col_);
851         tab_.push_back(col_);
852       }
853       else if (!tab_.empty() && tab_.back() > col_)
854       {
855         size_t n;
856         for (n = tab_.size() - 1; n > 0; --n)
857           if (tab_.at(n - 1) <= col_)
858             break;
859         ded_ += tab_.size() - n;
860         DBGLOG("Dedents: ded = %zu tab_ = %zu", ded_, tab_.size());
861         tab_.resize(n);
862         // adjust stop when indents are not aligned (Python would give an error)
863         if (n > 0)
864           tab_.back() = col_;
865       }
866     }
867     if (ded_ > 0)
868     {
869       DBGLOG("Dedents: ded = %zu", ded_);
870       if (col_ == 0 && bol)
871       {
872         ded_ += tab_.size();
873         tab_.resize(0);
874         DBGLOG("Rescan for pending dedents: ded = %zu", ded_);
875         pos_ = ind_;
876         // avoid looping, match \j exactly
877         bol = false;
878         goto redo;
879       }
880       --ded_;
881     }
882 #endif
883     if (method == Const::SPLIT)
884     {
885       DBGLOG("Split: len = %zu cap = %zu cur = %zu pos = %zu end = %zu txt-buf = %zu eob = %d got = %d", len_, cap_, cur_, pos_, end_, txt_-buf_, (int)eof_, got_);
886       if (cap_ == 0 || (cur_ == static_cast<size_t>(txt_ - buf_) && !at_bob()))
887       {
888         if (!hit_end() && (txt_ + len_ < buf_ + end_ || peek() != EOF))
889         {
890           ++len_;
891           DBGLOG("Split continue: len = %zu", len_);
892           set_current(++cur_);
893           goto find;
894         }
895         if (got_ != Const::EOB)
896           cap_ = Const::EMPTY;
897         else
898           cap_ = 0;
899         set_current(end_);
900         got_ = Const::EOB;
901         DBGLOG("Split at eof: cap = %zu txt = '%s' len = %zu", cap_, std::string(txt_, len_).c_str(), len_);
902         DBGLOG("END FuzzyMatcher::match()");
903         return cap_;
904       }
905       if (cur_ == 0 && at_bob() && at_end())
906       {
907         cap_ = Const::EMPTY;
908         got_ = Const::EOB;
909       }
910       else
911       {
912         set_current(cur_);
913       }
914       DBGLOG("Split: txt = '%s' len = %zu", std::string(txt_, len_).c_str(), len_);
915       DBGLOG("END FuzzyMatcher::match()");
916       return cap_;
917     }
918     if (cap_ == 0)
919     {
920       if (method == Const::FIND)
921       {
922         if (!at_end())
923         {
924           if (anc_)
925           {
926             cur_ = txt_ - buf_; // reset current to pattern start when a word boundary was encountered
927             anc_ = false;
928           }
929           // fuzzy search with find() can safely advance on a single prefix char of the regex
930           if (pos_ > cur_)
931           {
932             // this part is based on advance() in matcher.cpp, limited to advancing ahead till the one of the first pattern char(s) match
933             size_t loc = cur_ + 1;
934             if (pat_->len_ == 0)
935             {
936               if (pat_->min_ > 0)
937               {
938                 const Pattern::Pred *pma = pat_->pma_;
939                 while (true)
940                 {
941                   const char *s = buf_ + loc;
942                   const char *e = buf_ + end_;
943                   while (s < e && (pma[static_cast<uint8_t>(*s)] & 0xc0) == 0xc0)
944                     ++s;
945                   if (s < e)
946                   {
947                     loc = s - buf_;
948                     set_current(loc);
949                     goto scan;
950                   }
951                   loc = e - buf_;
952                   set_current_match(loc - 1);
953                   peek_more();
954                   loc = cur_ + 1;
955                   if (loc >= end_)
956                     break;
957                 }
958               }
959             }
960             else
961             {
962               while (true)
963               {
964                 const char *s = buf_ + loc;
965                 const char *e = buf_ + end_;
966                 s = static_cast<const char*>(std::memchr(s, *pat_->pre_, e - s));
967                 if (s != NULL)
968                 {
969                   loc = s - buf_;
970                   set_current(loc);
971                   goto scan;
972                 }
973                 loc = e - buf_;
974                 set_current_match(loc - 1);
975                 peek_more();
976                 loc = cur_ + 1;
977                 if (loc + pat_->len_ > end_)
978                   break;
979               }
980             }
981           }
982           txt_ = buf_ + cur_;
983         }
984       }
985       else
986       {
987         // SCAN and MATCH: no match: backup to begin of unmatched text to report as error
988         cur_ = txt_ - buf_;
989       }
990     }
991     len_ = cur_ - (txt_ - buf_);
992     if (len_ == 0 && !nul)
993     {
994       DBGLOG("Empty or no match cur = %zu pos = %zu end = %zu", cur_, pos_, end_);
995       pos_ = cur_;
996       if (at_end())
997       {
998         set_current(cur_);
999         DBGLOG("Reject empty match at EOF");
1000         cap_ = 0;
1001       }
1002       else if (method == Const::FIND)
1003       {
1004         DBGLOG("Reject empty match and continue?");
1005         // skip one char to keep searching
1006         set_current(++cur_);
1007         // allow FIND with "N" to match an empty line, with ^$ etc.
1008         if (cap_ == 0 || !opt_.N || (!bol && (c1 == '\n' || (c1 == '\r' && peek() == '\n'))))
1009           goto scan;
1010         DBGLOG("Accept empty match");
1011       }
1012       else
1013       {
1014         set_current(cur_);
1015         DBGLOG("Reject empty match");
1016         cap_ = 0;
1017       }
1018     }
1019     else if (len_ == 0 && cur_ == end_)
1020     {
1021       DBGLOG("Hit end: got = %d", got_);
1022       if (cap_ == Const::REDO && !opt_.A)
1023         cap_ = 0;
1024     }
1025     else
1026     {
1027       set_current(cur_);
1028       if (len_ > 0 && cap_ == Const::REDO && !opt_.A)
1029       {
1030         DBGLOG("Ignore accept and continue: len = %zu", len_);
1031         len_ = 0;
1032         if (method != Const::MATCH)
1033           goto scan;
1034         cap_ = 0;
1035       }
1036     }
1037     DBGLOG("Return: cap = %zu txt = '%s' len = %zu pos = %zu got = %d", cap_, std::string(txt_, len_).c_str(), len_, pos_, got_);
1038     DBGLOG("END match()");
1039     return cap_;
1040   }
1041   std::vector<BacktrackPoint> bpt_; ///< vector of backtrack points, max_ size
1042   uint8_t max_;                     ///< max errors
1043   uint8_t err_;                     ///< accumulated edit distance (not guaranteed minimal)
1044   bool ins_;                        ///< fuzzy match inserted chars (extra chars)
1045   bool del_;                        ///< fuzzy match deleted chars (missing chars)
1046   bool sub_;                        ///< fuzzy match substituted chars
1047 };
1048 
1049 } // namespace reflex
1050 
1051 #endif
1052