1 /******************************************************************************\
2 * Copyright (c) 2016, Robert van Engelen, Genivia Inc. All rights reserved.    *
3 *                                                                              *
4 * Redistribution and use in source and binary forms, with or without           *
5 * modification, are permitted provided that the following conditions are met:  *
6 *                                                                              *
7 *   (1) Redistributions of source code must retain the above copyright notice, *
8 *       this list of conditions and the following disclaimer.                  *
9 *                                                                              *
10 *   (2) Redistributions in binary form must reproduce the above copyright      *
11 *       notice, this list of conditions and the following disclaimer in the    *
12 *       documentation and/or other materials provided with the distribution.   *
13 *                                                                              *
14 *   (3) The name of the author may not be used to endorse or promote products  *
15 *       derived from this software without specific prior written permission.  *
16 *                                                                              *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED *
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF         *
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO   *
20 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,       *
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;  *
23 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,     *
24 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR      *
25 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF       *
26 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                   *
27 \******************************************************************************/
28 
29 /**
30 @file      absmatcher.h
31 @brief     RE/flex abstract matcher base class and pattern matcher class
32 @author    Robert van Engelen - engelen@genivia.com
33 @copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved.
34 @copyright (c) BSD-3 License - see LICENSE.txt
35 */
36 
37 #ifndef REFLEX_ABSMATCHER_H
38 #define REFLEX_ABSMATCHER_H
39 
40 /// This compile-time option may speed up buffer reallocation with realloc() instead of new and delete.
41 #define WITH_REALLOC
42 
43 /// This compile-time option speeds up matching, but slows input().
44 #define WITH_FAST_GET
45 
46 /// This compile-time option adds span(), line(), wline(), bol(), eol()
47 #define WITH_SPAN
48 
#include <reflex/convert.h>
#include <reflex/debug.h>
#include <reflex/input.h>
#include <reflex/traits.h>
#include <reflex/simd.h>
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <iterator>
57 
58 namespace reflex {
59 
60 /// Check ASCII word-like character `[A-Za-z0-9_]`, permitting the character range 0..303 (0x12F) and EOF.
inline int isword(int c) ///< Character to check
  /// @returns nonzero if argument c is in `[A-Za-z0-9_]`, zero otherwise
{
  // std::isalnum() only accepts values representable as unsigned char (or EOF),
  // so the argument is truncated to unsigned char before classification
  const int alnum = std::isalnum(static_cast<unsigned char>(c));
  // the underscore test is done on the untruncated value
  const int underscore = (c == '_');
  return alnum | underscore;
}
66 
67 /// The abstract matcher base class template defines an interface for all pattern matcher engines.
68 /**
69 The buffer expands when matches do not fit.  The buffer size is initially BUFSZ.
70 
71 ```
72       _________________
73      |  |    |    |    |
74 buf_=|  |text|rest|free|
75      |__|____|____|____|
76         ^    ^    ^    ^
77         cur_ pos_ end_ max_
78 
79 buf_ // points to buffered input, buffer may grow to fit long matches
80 cur_ // current position in buf_ while matching text, cur_ = pos_ afterwards, can be changed by more()
81 pos_ // position in buf_ to start the next match
82 end_ // position in buf_ that is free to fill with more input
83 max_ // allocated size of buf_, must ensure that max_ > end_ for text() to add a final \0
84 txt_ // points to the match, will be 0-terminated when text() or rest() are called
85 len_ // length of the match
86 chr_ // char located at txt_[len_] when txt_[len_] is set to \0 by text(), is \0 otherwise
87 got_ // buf_[cur_-1] or txt_[-1] character before this match (assigned before each match), initially Const::BOB
88 eof_ // true if no more data can/should be fetched to fill the buffer
89 ```
90 */
91 class AbstractMatcher {
92  protected:
93   typedef int Method; ///< a method is one of Const::SCAN, Const::FIND, Const::SPLIT, Const::MATCH
94  public:
95   /// AbstractMatcher::Const common constants.
96   struct Const {
97     static const Method SCAN  = 0;          ///< AbstractMatcher::match method is to scan input (tokenizer)
98     static const Method FIND  = 1;          ///< AbstractMatcher::match method is to find pattern in input
99     static const Method SPLIT = 2;          ///< AbstractMatcher::match method is to split input at pattern matches
100     static const Method MATCH = 3;          ///< AbstractMatcher::match method is to match the entire input
101     static const int NUL      = '\0';       ///< NUL string terminator
102     static const int UNK      = 256;        ///< unknown/undefined character meta-char marker
103     static const int BOB      = 257;        ///< begin of buffer meta-char marker
104     static const int EOB      = EOF;        ///< end of buffer meta-char marker
105 #ifndef REFLEX_BUFSZ
106     static const size_t BUFSZ = (64*1024);  ///< initial buffer size, at least 4096 bytes
107 #else
108     static const size_t BUFSZ = REFLEX_BUFSZ;
109 #endif
110     static const size_t BLOCK = 4096;       ///< minimum remaining unused space in the buffer, to prevent excessive shifting
111     static const size_t REDO  = 0x7FFFFFFF; ///< reflex::Matcher::accept() returns "redo" with reflex::Matcher option "A"
112     static const size_t EMPTY = 0xFFFFFFFF; ///< accept() returns "empty" last split at end of input
113   };
114   /// Context returned by before() and after()
struct Context {
  /// Construct an empty context: no buffer, zero length, nothing shifted out.
  Context() : buf(NULL), len(0), num(0) { }
  /// Construct a context from a buffer pointer, its length, and the count of bytes shifted out.
  Context(const char *buf, size_t len, size_t num) : buf(buf), len(len), num(num) { }
  const char *buf; ///< pointer to buffer
  size_t      len; ///< length of buffered context
  size_t      num; ///< number of bytes shifted out so far, when buffer shifted
};
132   /// Event handler functor base class to invoke when the buffer contents are shifted out, e.g. for logging the data searched.
133   struct Handler { virtual void operator()(AbstractMatcher&, const char*, size_t, size_t) = 0; };
134  protected:
135   /// AbstractMatcher::Options for matcher engines.
struct Option {
  /// Default options: A, N and W are switched off and the tab size is 8.
  Option() : A(false), N(false), W(false), T(8) { }
  bool A; ///< accept any/all (?^X) negative patterns as Const::REDO accept index codes
  bool N; ///< nullable, find may return empty match (N/A to scan, split, matches)
  bool W; ///< half-check for "whole words", check only left of \< and right of \> for non-word character
  char T; ///< tab size, must be a power of 2, default is 8, for column count and indent \i, \j, and \k
};
149   /// AbstractMatcher::Iterator class for scanning, searching, and splitting input character sequences.
150   template<typename T> /// @tparam <T> AbstractMatcher or const AbstractMatcher
151   class Iterator : public std::iterator<std::input_iterator_tag,T> {
152     friend class AbstractMatcher;
153     friend class Iterator<typename reflex::TypeOp<T>::ConstType>;
154     friend class Iterator<typename reflex::TypeOp<T>::NonConstType>;
155    public:
156     /// Construct an AbstractMatcher::Iterator such that Iterator() == AbstractMatcher::Operation(*this, method).end().
Iterator()157     Iterator()
158       :
159         matcher_(NULL),
160         method_()
161     { }
162     /// Copy constructor.
Iterator(const Iterator<typename reflex::TypeOp<T>::NonConstType> & it)163     Iterator(const Iterator<typename reflex::TypeOp<T>::NonConstType>& it)
164       :
165         matcher_(it.matcher_),
166         method_(it.method_)
167     { }
168     /// AbstractMatcher::Iterator dereference.
169     T& operator*() const
170       /// @returns (const) reference to the iterator's matcher
171     {
172       return *matcher_;
173     }
174     /// AbstractMatcher::Iterator pointer.
175     T* operator->() const
176       /// @returns (const) pointer to the iterator's matcher
177     {
178       return matcher_;
179     }
180     /// AbstractMatcher::Iterator equality.
181     bool operator==(const Iterator<typename reflex::TypeOp<T>::ConstType>& rhs) const
182       /// @returns true if iterator equals RHS
183     {
184       return matcher_ == rhs.matcher_;
185     }
186     /// AbstractMatcher::Iterator inequality.
187     bool operator!=(const Iterator<typename reflex::TypeOp<T>::ConstType>& rhs) const
188       /// @returns true if iterator does not equal RHS
189     {
190       return matcher_ != rhs.matcher_;
191     }
192     /// AbstractMatcher::Iterator preincrement.
193     Iterator& operator++()
194       /// @returns reference to this iterator
195     {
196       if (matcher_->match(method_) == 0)
197         matcher_ = NULL;
198       return *this;
199     }
200     /// AbstractMatcher::Iterator postincrement.
201     Iterator operator++(int)
202       /// @returns iterator to current match
203     {
204       Iterator it = *this;
205       operator++();
206       return it;
207     }
208     /// Construct an AbstractMatcher::Iterator to scan, search, or split an input character sequence.
Iterator(AbstractMatcher * matcher,Method method)209     Iterator(
210         AbstractMatcher *matcher, ///< iterate over pattern matches with this matcher
211         Method           method)  ///< match using method Const::SCAN, Const::FIND, or Const::SPLIT
212       :
213         matcher_(matcher),
214         method_(method)
215     {
216       if (matcher_ && matcher_->match(method_) == 0)
217         matcher_ = NULL;
218     }
219    private:
220     AbstractMatcher *matcher_; ///< the matcher used by this iterator
221     Method           method_;  ///< the method for pattern matching by this iterator's matcher
222   };
223  public:
224   typedef AbstractMatcher::Iterator<AbstractMatcher>       iterator;       ///< std::input_iterator for scanning, searching, and splitting input character sequences
225   typedef AbstractMatcher::Iterator<const AbstractMatcher> const_iterator; ///< std::input_iterator for scanning, searching, and splitting input character sequences
226   /// AbstractMatcher::Operation functor to match input to a pattern, also provides a (const) AbstractMatcher::iterator to iterate over matches.
227   class Operation {
228    public:
229     /// Construct an AbstractMatcher::Operation functor to scan, search, or split an input character sequence.
Operation(AbstractMatcher * matcher,Method method)230     Operation(
231         AbstractMatcher *matcher, ///< use this matcher for this functor
232         Method           method)  ///< match using method Const::SCAN, Const::FIND, or Const::SPLIT
233       :
234         matcher_(matcher),
235         method_(method)
236     { }
init(AbstractMatcher * matcher,Method method)237     void init(
238         AbstractMatcher *matcher, ///< use this matcher for this functor
239         Method           method)  ///< match using method Const::SCAN, Const::FIND, or Const::SPLIT
240     {
241       matcher_ = matcher;
242       method_ = method;
243     }
244     /// AbstractMatcher::Operation() matches input to a pattern using method Const::SCAN, Const::FIND, or Const::SPLIT.
operator()245     size_t operator()() const
246       /// @returns value of accept() >= 1 for match or 0 for end of matches
247     {
248       return matcher_->match(method_);
249     }
250     /// AbstractMatcher::Operation.begin() returns a std::input_iterator to the start of the matches.
begin()251     iterator begin() const
252       /// @returns input iterator
253     {
254       return iterator(matcher_, method_);
255     }
256     /// AbstractMatcher::Operation.end() returns a std::input_iterator to the end of matches.
end()257     iterator end() const
258       /// @returns input iterator
259     {
260       return iterator();
261     }
262     /// AbstractMatcher::Operation.cbegin() returns a const std::input_iterator to the start of the matches.
cbegin()263     const_iterator cbegin() const
264       /// @returns input const_iterator
265     {
266       return const_iterator(matcher_, method_);
267     }
268     /// AbstractMatcher::Operation.cend() returns a const std::input_iterator to the end of matches.
cend()269     const_iterator cend() const
270       /// @returns input const_iterator
271     {
272       return const_iterator();
273     }
274    private:
275     AbstractMatcher *matcher_; ///< the matcher used by this functor
276     Method           method_;  ///< the method for pattern matching by this functor's matcher
277   };
278   /// Construct a base abstract matcher.
AbstractMatcher(const Input & input,const char * opt)279   AbstractMatcher(
280       const Input& input, ///< input character sequence for this matcher
281       const char  *opt)   ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
282     :
283       scan(this, Const::SCAN),
284       find(this, Const::FIND),
285       split(this, Const::SPLIT)
286   {
287     in = input;
288     init(opt);
289   }
290   /// Construct a base abstract matcher.
AbstractMatcher(const Input & input,const Option & opt)291   AbstractMatcher(
292       const Input&  input, ///< input character sequence for this matcher
293       const Option& opt)   ///< options
294     :
295       scan(this, Const::SCAN),
296       find(this, Const::FIND),
297       split(this, Const::SPLIT)
298   {
299     in = input;
300     init();
301     opt_ = opt;
302   }
303   /// Delete abstract matcher, deletes this matcher's internal buffer.
virtual ~AbstractMatcher()
{
  DBGLOG("AbstractMatcher::~AbstractMatcher()");
  // nothing to release when this matcher does not own its buffer
  if (!own_)
    return;
  // release the buffer with the deallocator that matches the build configuration
#if defined(WITH_REALLOC)
#if (defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(__BORLANDC__)) && !defined(__CYGWIN__)
  _aligned_free(static_cast<void*>(buf_));
#else
  std::free(static_cast<void*>(buf_));
#endif
#else
  delete[] buf_;
#endif
}
320   /// Polymorphic cloning.
/// Pure virtual: every concrete matcher must implement clone().
/// @returns pointer to an AbstractMatcher; NOTE(review): presumably a newly allocated copy owned by the caller, confirm against the implementations
virtual AbstractMatcher *clone() = 0;
322   /// Reset this matcher's state to the initial state and set options (when provided).
323   virtual void reset(const char *opt = NULL)
324   {
325     DBGLOG("AbstractMatcher::reset(%s)", opt ? opt : "(null)");
326     if (opt)
327     {
328       opt_.A = false; // when true: accept any/all (?^X) negative patterns as Const::REDO accept index codes
329       opt_.N = false; // when true: find may return empty match (N/A to scan, split, matches)
330       opt_.W = false; // when true: half-check for "whole words", check only left of \< and right of \> for non-word character
331       opt_.T = 8;     // tab size 1, 2, 4, or 8
332       if (opt)
333       {
334         for (const char *s = opt; *s != '\0'; ++s)
335         {
336           switch (*s)
337           {
338             case 'A':
339               opt_.A = true;
340               break;
341             case 'N':
342               opt_.N = true;
343               break;
344             case 'W':
345               opt_.W = true;
346               break;
347             case 'T':
348               opt_.T = isdigit(*(s += (s[1] == '=') + 1)) ? static_cast<char>(*s - '0') : 0;
349               break;
350           }
351         }
352       }
353     }
354     if (!own_)
355     {
356       max_ = Const::BUFSZ;
357 #if defined(WITH_REALLOC)
358 #if (defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(__BORLANDC__)) && !defined(__CYGWIN__)
359       buf_ = static_cast<char*>(_aligned_malloc(max_, 4096));
360       if (buf_ == NULL)
361         throw std::bad_alloc();
362 #else
363       buf_ = NULL;
364       if (posix_memalign(reinterpret_cast<void**>(&buf_), 4096, max_) != 0)
365         throw std::bad_alloc();
366 #endif
367 #else
368       buf_ = new char[max_];
369 #endif
370     }
371     buf_[0] = '\0';
372     txt_ = buf_;
373     len_ = 0;
374     cap_ = 0;
375     cur_ = 0;
376     pos_ = 0;
377     end_ = 0;
378     ind_ = 0;
379     blk_ = 0;
380     got_ = Const::BOB;
381     chr_ = '\0';
382 #if defined(WITH_SPAN)
383     bol_ = buf_;
384     evh_ = NULL;
385 #endif
386     lpb_ = buf_;
387     lno_ = 1;
388 #if defined(WITH_SPAN)
389     cpb_ = buf_;
390 #endif
391     cno_ = 0;
392     num_ = 0;
393     own_ = true;
394     eof_ = false;
395     mat_ = false;
396   }
397   /// Set buffer block size for reading: use 0 (or omit argument) to buffer all input in which case returns true if all the data could be read and false if a read error occurred.
398   bool buffer(size_t blk = 0) ///< new block size between 1 and Const::BLOCK, or 0 to buffer all input (default)
399     /// @returns true when successful to buffer all input when n=0
400   {
401     if (blk > Const::BLOCK)
402       blk = Const::BLOCK;
403     DBGLOG("AbstractMatcher::buffer(%zu)", blk);
404     blk_ = blk;
405     if (blk > 0 || eof_ || in.eof())
406       return true;
407     size_t n = in.size(); // get the (rest of the) data size, which is 0 if unknown (e.g. reading input from a TTY or a pipe)
408     if (n > 0)
409     {
410       (void)grow(n + 1); // now attempt to fetch all (remaining) data to store in the buffer, +1 for a final \0
411       end_ += get(buf_, n);
412     }
413     while (in.good()) // there is more to get while good(), e.g. via wrap()
414     {
415       (void)grow();
416       end_ += get(buf_ + end_, max_ - end_);
417     }
418     if (end_ == max_)
419       (void)grow(1); // make sure we have room for a final \0
420     return in.eof();
421   }
422 #if defined(WITH_SPAN)
423   /// Set event handler functor to invoke when the buffer contents are shifted out, e.g. for logging the data searched.
set_handler(Handler * handler)424   void set_handler(Handler *handler)
425   {
426     evh_ = handler;
427   }
428   /// Get the buffered context before the matching line.
before()429   inline Context before()
430   {
431     (void)lineno();
432     return Context(buf_, bol_ - buf_, num_);
433   }
434   /// Get the buffered context after EOF is reached.
after()435   inline Context after()
436   {
437     if (hit_end())
438     {
439       (void)lineno();
440       // if there is no \n at the end of input: increase line count by one to compensate
441       if (bol_ < txt_)
442         ++lno_;
443       return Context(buf_, end_, num_);
444     }
445     return Context(buf_, 0, num_);
446   }
447 #endif
448   /// Set interactive input with buffer size of 1 to read data bytewise which is very slow.
interactive()449   void interactive()
450     /// @note Use this method before any matching is done and before any input is read since the last time input was (re)set.
451   {
452     DBGLOG("AbstractMatcher::interactive()");
453     (void)buffer(1);
454   }
455   /// Flush the buffer's remaining content.
flush()456   void flush()
457   {
458     DBGLOG("AbstractMatcher::flush()");
459     pos_ = end_;
460   }
/// Returns more input data directly from the source (method can be overridden, as by reflex::FlexLexer::get(s, n) for example that invokes reflex::FlexLexer::LexerInput(s, n)).
get(char * s,size_t n)462   virtual size_t get(
463       /// @returns the nonzero number of (less or equal to n) 8-bit characters added to buffer s from the current input, or zero when EOF
464       char  *s, ///< points to the string buffer to fill with input
465       size_t n) ///< size of buffer pointed to by s
466   {
467     return in.get(s, n);
468   }
469   /// Returns true if wrapping of input after EOF is supported.
wrap()470   virtual bool wrap()
471     /// @returns true if input was succesfully wrapped
472   {
473     return false;
474   }
475   /// Set the input character sequence for this matcher and reset/restart the matcher.
input(const Input & input)476   virtual AbstractMatcher& input(const Input& input) ///< input character sequence for this matcher
477     /// @returns this matcher
478   {
479     DBGLOG("AbstractMatcher::input()");
480     in = input;
481     reset();
482     return *this;
483   }
484   /// Set the buffer base containing 0-terminated character data to scan in place (data may be modified), reset/restart the matcher.
buffer(char * base,size_t size)485   AbstractMatcher& buffer(
486       char *base,  ///< base of the buffer containing 0-terminated character data
487       size_t size) ///< nonzero size of the buffer
488     /// @returns this matcher
489   {
490     if (size > 0)
491     {
492       if (own_)
493       {
494 #if defined(WITH_REALLOC)
495 #if (defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(__BORLANDC__)) && !defined(__CYGWIN__)
496         _aligned_free(static_cast<void*>(buf_));
497 #else
498         std::free(static_cast<void*>(buf_));
499 #endif
500 #else
501         delete[] buf_;
502 #endif
503       }
504       buf_ = base;
505       txt_ = buf_;
506       len_ = 0;
507       cap_ = 0;
508       cur_ = 0;
509       pos_ = 0;
510       end_ = size - 1;
511       max_ = size;
512       ind_ = 0;
513       blk_ = 0;
514       got_ = Const::BOB;
515       chr_ = '\0';
516 #if defined(WITH_SPAN)
517       bol_ = buf_;
518       evh_ = NULL;
519 #endif
520       lpb_ = buf_;
521       lno_ = 1;
522 #if defined(WITH_SPAN)
523       cpb_ = buf_;
524 #endif
525       cno_ = 0;
526       num_ = 0;
527       own_ = false;
528       eof_ = true;
529       mat_ = false;
530     }
531     return *this;
532   }
533 
534   /// Returns nonzero capture index (i.e. true) if the entire input matches this matcher's pattern (and internally caches the true/false result to permit repeat invocations).
matches()535   inline size_t matches()
536     /// @returns nonzero capture index (i.e. true) if the entire input matched this matcher's pattern, zero (i.e. false) otherwise
537   {
538     if (!mat_ && at_bob())
539       mat_ = match(Const::MATCH) && at_end();
540     return mat_;
541   }
542   /// Returns a positive integer (true) indicating the capture index of the matched text in the pattern or zero (false) for a mismatch.
accept()543   inline size_t accept() const
544     /// @returns nonzero capture index of the match in the pattern, which may be matcher dependent, or zero for a mismatch, or Const::EMPTY for the empty last split
545   {
546     return cap_;
547   }
548   /// Returns pointer to the begin of the matched text (non-0-terminated), a constant-time operation, use with end() or use size() for text end/length.
begin()549   inline const char *begin() const
550     /// @returns const char* pointer to the matched text in the buffer
551   {
552     return txt_;
553   }
554   /// Returns pointer to the exclusive end of the matched text, a constant-time operation.
end()555   inline const char *end() const
556     /// @returns const char* pointer to the exclusive end of the matched text in the buffer
557   {
558     return txt_ + len_;
559   }
560   /// Returns 0-terminated string of the text matched, does not include matched \0s, this is a constant-time operation.
text()561   inline const char *text()
562     /// @returns 0-terminated const char* string with text matched
563   {
564     if (chr_ == '\0')
565     {
566       chr_ = txt_[len_];
567       txt_[len_] = '\0';
568     }
569     return txt_;
570   }
571   /// Returns the text matched as a string, a copy of text(), may include matched \0s.
str()572   inline std::string str() const
573     /// @returns string with text matched
574   {
575     return std::string(txt_, len_);
576   }
577   /// Returns the match as a wide string, converted from UTF-8 text(), may include matched \0s.
wstr()578   inline std::wstring wstr() const
579     /// @returns wide string with text matched
580   {
581     return wcs(txt_, len_);
582   }
583   /// Returns the length of the matched text in number of bytes, including matched \0s, a constant-time operation.
size()584   inline size_t size() const
585     /// @returns match size in bytes
586   {
587     return len_;
588   }
589   /// Returns the length of the matched text in number of wide characters.
wsize()590   inline size_t wsize() const
591     /// @returns the length of the match in number of wide (multibyte UTF-8) characters
592   {
593     size_t n = 0;
594     const char *e = txt_ + len_;
595     for (const char *s = txt_; s < e; ++s)
596       n += (*s & 0xC0) != 0x80;
597     return n;
598   }
599   /// Returns the first 8-bit character of the text matched.
chr()600   inline int chr() const
601     /// @returns 8-bit char
602   {
603     return *txt_;
604   }
605   /// Returns the first wide character of the text matched.
wchr()606   inline int wchr() const
607     /// @returns wide char (UTF-8 converted to Unicode)
608   {
609     return utf8(txt_);
610   }
611   /// Set or change the starting line number of the last match.
lineno(size_t n)612   inline void lineno(size_t n) ///< new line number
613   {
614     if (lpb_ < txt_)
615       (void)lineno(); // update lno_ and bol_ (or cno_) before overriding lno_
616     lno_ = n;
617   }
618   /// Updates and returns the starting line number of the match in the input character sequence.
lineno()619   inline size_t lineno()
620     /// @returns line number
621   {
622 #if defined(WITH_SPAN)
623     if (lpb_ < txt_)
624     {
625       const char *s = lpb_;
626       const char *t = txt_;
627       size_t n = 0;
628 #if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))
629       if (s + 63 > t && have_HW_AVX512BW())
630       {
631         n += simd_nlcount_avx512bw(s, t);
632       }
633       else if (s + 31 > t && have_HW_AVX2())
634       {
635         n += simd_nlcount_avx2(s, t);
636       }
637       else if (have_HW_SSE2())
638       {
639         __m128i vlcn = _mm_set1_epi8('\n');
640         while (s + 15 <= t)
641         {
642           __m128i vlcm = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
643           __m128i vlceq = _mm_cmpeq_epi8(vlcm, vlcn);
644           uint32_t mask = _mm_movemask_epi8(vlceq);
645           n += popcount(mask);
646           s += 16;
647         }
648       }
649 #elif defined(HAVE_AVX2)
650       if (s + 31 > t && have_HW_AVX2())
651       {
652         n += simd_nlcount_avx2(s, t);
653       }
654       else if (have_HW_SSE2())
655       {
656         __m128i vlcn = _mm_set1_epi8('\n');
657         while (s + 15 <= t)
658         {
659           __m128i vlcm = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
660           __m128i vlceq = _mm_cmpeq_epi8(vlcm, vlcn);
661           uint32_t mask = _mm_movemask_epi8(vlceq);
662           n += popcount(mask);
663           s += 16;
664         }
665       }
666 #elif defined(HAVE_SSE2)
667       if (have_HW_SSE2())
668       {
669         __m128i vlcn = _mm_set1_epi8('\n');
670         while (s + 15 <= t)
671         {
672           __m128i vlcm = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
673           __m128i vlceq = _mm_cmpeq_epi8(vlcm, vlcn);
674           uint32_t mask = _mm_movemask_epi8(vlceq);
675           n += popcount(mask);
676           s += 16;
677         }
678       }
679 #elif defined(HAVE_NEON)
680       {
681         // ARM AArch64/NEON SIMD optimized loop? - no code that runs faster than the code below?
682       }
683 #endif
684       uint32_t n0 = 0, n1 = 0, n2 = 0, n3 = 0;
685       // clang/gcc 4-way auto-vectorizable loop
686       while (s + 3 < t)
687       {
688         n0 += s[0] == '\n';
689         n1 += s[1] == '\n';
690         n2 += s[2] == '\n';
691         n3 += s[3] == '\n';
692         s += 4;
693       }
694       n += n0 + n1 + n2 + n3;
695       // epilogue
696       if (s < t)
697       {
698         n += *s == '\n';
699         if (++s < t)
700         {
701           n += *s == '\n';
702           if (++s < t)
703             n += *s == '\n';
704         }
705       }
706       // if newlines are detected, then find begin of the last line to adjust bol
707       if (n > 0)
708       {
709         lno_ += n;
710         s = lpb_;
711         // clang/gcc 4-way auto-vectorizable loop
712         while (t - 4 >= s)
713         {
714           if ((t[-1] == '\n') | (t[-2] == '\n') | (t[-3] == '\n') | (t[-4] == '\n'))
715             break;
716           t -= 4;
717         }
718         // epilogue
719         if (--t >= s && *t != '\n')
720           if (--t >= s && *t != '\n')
721             if (--t >= s && *t != '\n')
722               --t;
723         bol_ = t + 1;
724         cpb_ = bol_;
725         cno_ = 0;
726       }
727       lpb_ = txt_;
728     }
729 #else
730     size_t n = lno_;
731     size_t k = cno_;
732     const char *s = lpb_;
733     const char *e = txt_;
734     while (s < e)
735     {
736       if (*s == '\n')
737       {
738         ++n;
739         k = 0;
740       }
741       else if (*s == '\t')
742       {
743         // count tab spacing
744         k += 1 + (~k & (opt_.T - 1));
745       }
746       else
747       {
748         // count column offset in UTF-8 chars
749         k += ((*s & 0xC0) != 0x80);
750       }
751       ++s;
752     }
753     lpb_ = e;
754     lno_ = n;
755     cno_ = k;
756 #endif
757     return lno_;
758   }
759   /// Returns the number of lines that the match spans.
lines()760   inline size_t lines()
761     /// @returns number of lines
762   {
763     size_t n = 1;
764     const char *e = txt_ + len_;
765     for (const char *s = txt_; s < e; ++s)
766       n += (*s == '\n');
767     return n;
768   }
769   /// Returns the inclusive ending line number of the match in the input character sequence.
lineno_end()770   inline size_t lineno_end()
771     /// @returns line number
772   {
773     return lineno() + lines() - 1;
774   }
775   /// Updates and returns the starting column number of the matched text, taking tab spacing into account and counting wide characters as one character each
columno()776   inline size_t columno()
777     /// @returns column number
778   {
779     (void)lineno();
780 #if defined(WITH_SPAN)
781     const char *s = cpb_;
782     const char *e = txt_;
783     size_t k = cno_;
784     size_t m = opt_.T - 1;
785     while (s < e)
786     {
787       if (*s == '\t')
788         k += 1 + (~k & m); // count tab spacing
789       else
790         k += ((*s & 0xC0) != 0x80); // count column offset in UTF-8 chars
791       ++s;
792     }
793     cpb_ = txt_;
794     cno_ = k;
795 #endif
796     return cno_;
797   }
798   /// Returns the number of columns of the matched text, taking tab spacing into account and counting wide characters as one character each.
columns()799   inline size_t columns()
800     /// @returns number of columns
801   {
802     // count columns in tabs and UTF-8 chars
803 #if defined(WITH_SPAN)
804     const char *s = txt_;
805     const char *e = txt_ + len_;
806     size_t n = columno();
807     size_t k = n;
808     while (s < e)
809     {
810       if (*s == '\t')
811         k += 1 + (~k & (opt_.T - 1)); // count tab spacing
812       else if (*s != '\r' && *s != '\n')
813         k += ((*s & 0xC0) != 0x80); // count column offset in UTF-8 chars
814       ++s;
815     }
816     return k - n;
817 #else
818     size_t n = cno_;
819     size_t m = 0;
820     const char *s;
821     const char *t = buf_;
822     for (s = txt_ + len_ - 1; s >= t; --s)
823     {
824       if (*s == '\n')
825       {
826         n = 0;
827         break;
828       }
829     }
830     t = txt_;
831     const char *e = txt_ + len_;
832     for (++s; s < e; ++s)
833     {
834       if (s == t)
835         m = n;
836       if (*s == '\t')
837         n += 1 + (~n & (opt_.T - 1));
838       else
839         n += (*s & 0xC0) != 0x80;
840     }
841     return n - m;
842 #endif
843   }
844 #if defined(WITH_SPAN)
845   /// Returns the inclusive ending column number of the matched text on the ending matching line, taking tab spacing into account and counting wide characters as one character each
columno_end()846   inline size_t columno_end()
847     /// @returns column number
848   {
849     if (len_ == 0)
850       return columno();
851     (void)lineno();
852     const char *e = txt_ + len_;
853     const char *s = e;
854     const char *b = bol_;
855     while (--s >= b)
856       if (*s == '\n')
857         break;
858     size_t k = 0;
859     while (++s < e)
860     {
861       if (*s == '\t')
862         k += 1 + (~k & (opt_.T - 1));
863       else
864         k += (*s & 0xC0) != 0x80;
865     }
866     return k > 0 ? k - 1 : 0;
867   }
868 #endif
869   /// Returns std::pair<size_t,std::string>(accept(), str()), useful for tokenizing input into containers of pairs.
pair()870   inline std::pair<size_t,std::string> pair() const
871     /// @returns std::pair<size_t,std::string>(accept(), str())
872   {
873     return std::pair<size_t,std::string>(accept(), str());
874   }
875   /// Returns std::pair<size_t,std::wstring>(accept(), wstr()), useful for tokenizing input into containers of pairs.
wpair()876   inline std::pair<size_t,std::wstring> wpair() const
877     /// @returns std::pair<size_t,std::wstring>(accept(), wstr())
878   {
879     return std::pair<size_t,std::wstring>(accept(), wstr());
880   }
881   /// Returns the position of the first character of the match in the input character sequence, a constant-time operation.
first()882   inline size_t first() const
883     /// @returns position in the input character sequence
884   {
885     return num_ + txt_ - buf_;
886   }
887   /// Returns the exclusive position of the last character of the match in the input character sequence, a constant-time operation.
last()888   inline size_t last() const
889     /// @returns position in the input character sequence
890   {
891     return first() + size();
892   }
893   /// Returns true if this matcher is at the start of a buffer to read an input character sequence. Use reset() to restart reading new input.
at_bob()894   inline bool at_bob() const
895     /// @returns true if at the begin of an input sequence
896   {
897     return got_ == Const::BOB;
898   }
899   /// Set/reset the begin of a buffer state.
set_bob(bool bob)900   inline void set_bob(bool bob) ///< if true: set begin of buffer state
901   {
902     if (bob)
903       got_ = Const::BOB;
904     else if (got_ == Const::BOB)
905       got_ = Const::UNK;
906   }
907   /// Returns true if this matcher has no more input to read from the input character sequence.
at_end()908   inline bool at_end()
909     /// @returns true if at end of input and a read attempt will produce EOF
910   {
911     return pos_ >= end_ && (eof_ || peek() == EOF);
912   }
913   /// Returns true if this matcher hit the end of the input character sequence.
hit_end()914   inline bool hit_end() const
915     /// @returns true if EOF was hit (and possibly more input would have changed the result), false otherwise (but next read attempt may return EOF immediately)
916   {
917     return pos_ >= end_ && eof_;
918   }
919   /// Set and force the end of input state.
set_end(bool eof)920   inline void set_end(bool eof)
921   {
922     if (eof)
923       flush();
924     if (own_)
925       eof_ = eof;
926   }
927   /// Returns true if this matcher reached the begin of a new line.
at_bol()928   inline bool at_bol() const
929     /// @returns true if at begin of a new line
930   {
931     return got_ == Const::BOB || got_ == '\n';
932   }
933   /// Set/reset the begin of a new line state.
set_bol(bool bol)934   inline void set_bol(bool bol) ///< if true: set begin of a new line state
935   {
936     if (bol)
937       got_ = '\n';
938     else if (got_ == '\n')
939       got_ = Const::UNK;
940   }
941   /// Returns true if this matcher matched text that begins a word.
at_bow()942   inline bool at_bow()
943     /// @returns true if this matcher matched text that begins a word
944   {
945     return !isword(got_) && isword(txt_ < buf_ + end_ ? static_cast<unsigned char>(*txt_) : peek_more());
946   }
947   /// Returns true if this matcher matched text that ends a word.
at_eow()948   inline bool at_eow()
949     /// @returns true if this matcher matched text that ends a word
950   {
951     return isword(got_) && !isword(txt_ < buf_ + end_ ? static_cast<unsigned char>(*txt_) : peek_more());
952   }
953   /// Returns the next 8-bit character (unsigned char 0..255 or EOF) from the input character sequence, while preserving the current text() match (but pointer returned by text() may change; warning: does not preserve the yytext string pointer when options --flex and --bison are used).
input()954   int input()
955     /// @returns the next character (unsigned char 0..255) from input or EOF (-1)
956   {
957     DBGLOG("AbstractMatcher::input() pos = %zu end = %zu", pos_, end_);
958     if (pos_ < end_)
959     {
960       if (chr_ != '\0' && buf_ + pos_ == txt_ + len_)
961         got_ = chr_;
962       else
963         got_ = static_cast<unsigned char>(buf_[pos_]);
964       ++pos_;
965     }
966     else
967     {
968 #if defined(WITH_FAST_GET)
969       got_ = get_more();
970 #else
971       got_ = get();
972 #endif
973     }
974     cur_ = pos_;
975     return got_;
976   }
977   /// Returns the next wide character (unsigned 0..U+10FFFF or EOF) from the input character sequence, while preserving the current text() match (but pointer returned by text() may change; warning: does not preserve the yytext string pointer when options --flex and --bison are used).
winput()978   int winput()
979     /// @returns the next wide character (unsigned 0..U+10FFFF) or EOF (-1)
980   {
981     DBGLOG("AbstractMatcher::winput()");
982     char tmp[8] = { 0 }, *s = tmp;
983     int c;
984     if ((c = input()) == EOF)
985       return EOF;
986     if (static_cast<unsigned char>(*s++ = c) >= 0x80)
987     {
988       while (((++*s = get()) & 0xC0) == 0x80)
989         continue;
990       got_ = static_cast<unsigned char>(buf_[cur_ = --pos_]);
991     }
992     return utf8(tmp);
993   }
994   /// Put back one character (8-bit) on the input character sequence for matching, DANGER: invalidates the previous text() pointer and match info, unput is not honored when matching in-place using buffer(base, size) and nothing has been read yet.
unput(char c)995   void unput(char c) ///< 8-bit character to put back
996   {
997     DBGLOG("AbstractMatcher::unput()");
998     reset_text();
999     if (pos_ > 0)
1000     {
1001       --pos_;
1002     }
1003     else if (own_)
1004     {
1005       txt_ = buf_;
1006       len_ = 0;
1007       if (end_ + 1 >= max_)
1008         (void)grow();
1009       std::memmove(buf_ + 1, buf_, end_);
1010       ++end_;
1011     }
1012     buf_[pos_] = c;
1013     cur_ = pos_;
1014   }
1015   /// Put back one (wide) character on the input character sequence for matching, DANGER: invalidates the previous text() pointer and match info, unput is not honored when matching in-place using buffer(base, size) and nothing has been read yet.
wunput(int c)1016   void wunput(int c) ///< character to put back
1017   {
1018     DBGLOG("AbstractMatcher::wunput()");
1019     char tmp[8];
1020     size_t n = utf8(c, tmp);
1021     if (pos_ >= n)
1022     {
1023       pos_ -= n;
1024     }
1025     else if (own_)
1026     {
1027       txt_ = buf_;
1028       len_ = 0;
1029       if (end_ + n >= max_)
1030         (void)grow();
1031       std::memmove(buf_ + n, buf_, end_);
1032       end_ += n;
1033     }
1034     std::memcpy(&buf_[pos_], tmp, n);
1035     cur_ = pos_;
1036   }
1037   /// Peek at the next character available for reading from the current input source.
peek()1038   inline int peek()
1039     /// @returns the character (unsigned char 0..255) or EOF (-1)
1040   {
1041     DBGLOG("AbstractMatcher::peek()");
1042 #if defined(WITH_FAST_GET)
1043     return pos_ < end_ ? static_cast<unsigned char>(buf_[pos_]) : peek_more();
1044 #else
1045     if (pos_ < end_)
1046       return static_cast<unsigned char>(buf_[pos_]);
1047     if (eof_)
1048       return EOF;
1049     while (true)
1050     {
1051       if (end_ + blk_ + 1 >= max_)
1052         (void)grow();
1053       end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1054       if (pos_ < end_)
1055         return static_cast<unsigned char>(buf_[pos_]);
1056       DBGLOGN("peek(): EOF");
1057       if (!wrap())
1058       {
1059         eof_ = true;
1060         return EOF;
1061       }
1062     }
1063 #endif
1064   }
1065 #if defined(WITH_SPAN)
1066   /// Returns pointer to the begin of the line in the buffer containing the matched text.
bol()1067   inline const char *bol()
1068     /// @returns pointer to the begin of line
1069   {
1070     (void)lineno();
1071     return bol_;
1072   }
1073   /// Returns pointer to the end of the line (last char + 1) in the buffer containing the matched text, DANGER: invalidates previous bol() and text() pointers, use eol() before bol(), text(), begin(), and end() when those are used.
1074   inline const char *eol(bool inclusive = false) ///< true if inclusive, i.e. point after \n
1075     /// @returns pointer to the end of line
1076   {
1077     if (chr_ == '\n' || (txt_ + len_ < buf_ + end_ && txt_[len_] == '\n'))
1078       return txt_ + len_ + inclusive;
1079     size_t loc = pos_;
1080     while (true)
1081     {
1082       if (loc < end_)
1083       {
1084         char *s = static_cast<char*>(std::memchr(buf_ + loc, '\n', end_ - loc));
1085         if (s != NULL)
1086           return s + inclusive;
1087       }
1088       if (eof_)
1089         break;
1090       (void)grow();
1091       loc = end_;
1092       end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1093       if (loc >= end_ && !wrap())
1094       {
1095         eof_ = true;
1096         break;
1097       }
1098     }
1099     return buf_ + end_;
1100   }
1101   /// Returns the number of bytes in the buffer available to search from the current begin()/text() position.
avail()1102   size_t avail()
1103   {
1104     if (peek() == EOF)
1105       return 0;
1106     return end_ - (txt_ - buf_);
1107   }
1108   /// Returns the byte offset of the match from the start of the line.
border()1109   size_t border()
1110     /// @returns border offset
1111   {
1112     return txt_ - bol();
1113   }
  /// Enlarge the match to span the entire line of input (excluding \n), return text(). The match start is moved back to the begin of the line and the match end is moved forward to the end of the line as found by eol().
  const char *span()
    /// @returns const char* span of text for the entire line
  {
    DBGLOG("AbstractMatcher::span()");
    (void)lineno(); // same call that bol() makes before bol_ is used
    // extend the match backward to the begin of the line
    len_ += txt_ - bol_;
    txt_ = const_cast<char*>(bol_); // requires ugly cast
    // the character that follows the match is a newline: the match already ends at the end of the line
    if (chr_ == '\n')
      return txt_;
    // restore the character at the end of the match before searching for the end of the line
    reset_text();
    const char *e = eol();
    // continue matching after the end of the line and extend the match forward to it
    set_current(e - buf_);
    len_ = e - bol_;
    return text();
  }
1130   /// Returns the line of input (excluding \n) as a string containing the matched text as a substring.
line()1131   std::string line()
1132     /// @returns matching line as a string
1133   {
1134     DBGLOG("AbstractMatcher::line()");
1135     reset_text();
1136     const char *e = eol(); // warning: must call eol() before bol()
1137     const char *b = bol();
1138     return std::string(b, e - b);
1139   }
1140   /// Returns the line of input (excluding \n) as a wide string containing the matched text as a substring.
wline()1141   std::wstring wline()
1142     /// @returns matching line as a wide string
1143   {
1144     DBGLOG("AbstractMatcher::wline()");
1145     reset_text();
1146     const char *e = eol(); // warning: must call eol() before bol()
1147     const char *b = bol();
1148     while (b < e && (*b & 0xC0) == 0x80) // make sure we advance forward to valid UTF-8
1149       ++b;
1150     return wcs(b, e - b);
1151   }
1152 #endif
1153   /// Skip input until the specified ASCII character is consumed and return true, or EOF is reached and return false.
skip(char c)1154   bool skip(char c) ///< ASCII character to skip to
1155     /// @returns true if skipped to c, false if EOF is reached
1156   {
1157     DBGLOG("AbstractMatcher::skip()");
1158     reset_text();
1159     len_ = 0;
1160     while (true)
1161     {
1162       txt_ = static_cast<char*>(std::memchr(buf_ + pos_, c, end_ - pos_));
1163       if (txt_ != NULL)
1164       {
1165         ++txt_;
1166         set_current(txt_ - buf_);
1167         return true;
1168       }
1169       if (eof_)
1170         break;
1171       pos_ = cur_ = end_;
1172       txt_ = buf_ + end_;
1173       (void)grow();
1174       end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1175       if (pos_ >= end_ && !wrap())
1176       {
1177         eof_ = true;
1178         break;
1179       }
1180     }
1181     set_current(end_);
1182     return false;
1183   }
1184   /// Skip input until the specified Unicode character is consumed and return true, or EOF is reached and return false.
skip(wchar_t c)1185   bool skip(wchar_t c) ///< Unicode character to skip to
1186     /// @returns true if skipped to c, false if EOF is reached
1187   {
1188     char s[8];
1189     size_t n = utf8(c, s);
1190     s[n] = '\0';
1191     return skip(s);
1192   }
1193   /// Skip input until the specified literal UTF-8 string is consumed and return true, or EOF is reached and return false.
skip(const char * s)1194   bool skip(const char *s) ///< literal UTF-8 string to skip to
1195     /// @returns true if skipped to c, false if EOF is reached
1196   {
1197     if (s == NULL || s[0] == '\0')
1198       return true;
1199     if (s[1] == '\0')
1200       return skip(s[0]);
1201     while (skip(s[0]))
1202     {
1203       const char *t = s + 1;
1204       while (true)
1205       {
1206         if (*t == '\0')
1207         {
1208           set_current(pos_);
1209           return true;
1210         }
1211         int c = get();
1212         if (c == EOF)
1213           return false;
1214         if (c != static_cast<unsigned char>(*t))
1215           break;
1216         ++t;
1217       }
1218       pos_ = txt_ - buf_;
1219     }
1220     return false;
1221   }
1222   /// Fetch the rest of the input as text, useful for searching/splitting up to n times after which the rest is needed.
rest()1223   const char *rest()
1224     /// @returns const char* string of the remaining input (wrapped with more input when AbstractMatcher::wrap is defined)
1225   {
1226     DBGLOG("AbstractMatcher::rest()");
1227     reset_text();
1228     cur_ = pos_;
1229     txt_ = buf_ + cur_;
1230     while (!eof_)
1231     {
1232       (void)grow();
1233       pos_ = end_;
1234       end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1235       if (pos_ >= end_ && !wrap())
1236         eof_ = true;
1237     }
1238     len_ = end_ - cur_;
1239     pos_ = cur_ = end_;
1240     DBGLOGN("rest() length = %zu", len_);
1241     return text();
1242   }
1243   /// Append the next match to the currently matched text returned by AbstractMatcher::text, when the next match found is adjacent to the current match.
more()1244   void more()
1245   {
1246     cur_ = txt_ - buf_;
1247   }
1248   /// Truncate the AbstractMatcher::text length of the match to n characters in length and reposition for next match.
less(size_t n)1249   void less(size_t n) ///< truncated string length
1250   {
1251     if (n < len_)
1252     {
1253       DBGCHK(pos_ < max_);
1254       reset_text();
1255       pos_ = txt_ - buf_ + n;
1256       DBGCHK(pos_ < max_);
1257       len_ = n;
1258       cur_ = pos_;
1259     }
1260   }
1261   /// Cast this matcher to positive integer indicating the nonzero capture index of the matched text in the pattern, same as AbstractMatcher::accept.
size_t()1262   operator size_t() const
1263     /// @returns nonzero capture index of a match, which may be matcher dependent, or zero for a mismatch
1264   {
1265     return accept();
1266   }
1267   /// Cast this matcher to a std::string of the text matched by this matcher.
string()1268   operator std::string() const
1269     /// @returns std::string with matched text
1270   {
1271     return str();
1272   }
1273   /// Cast this matcher to a std::wstring of the text matched by this matcher.
wstring()1274   operator std::wstring() const
1275     /// @returns std::wstring converted to UCS from the 0-terminated matched UTF-8 text
1276   {
1277     return wstr();
1278   }
1279   /// Cast the match to std::pair<size_t,std::wstring>(accept(), wstr()), useful for tokenization into containers.
1280   operator std::pair<size_t,std::string>() const
1281     /// @returns std::pair<size_t,std::wstring>(accept(), wstr())
1282   {
1283     return pair();
1284   }
1285   /// Returns true if matched text is equal to a string, useful for std::algorithm.
1286   bool operator==(const char *rhs) ///< rhs string to compare to
1287     /// @returns true if matched text is equal to rhs string
1288     const
1289   {
1290     return std::strncmp(rhs, txt_, len_) == 0 && rhs[len_] == '\0';
1291   }
1292   /// Returns true if matched text is equalt to a string, useful for std::algorithm.
1293   bool operator==(const std::string& rhs) ///< rhs string to compare to
1294     /// @returns true if matched text is equal to rhs string
1295     const
1296   {
1297     return rhs.size() == len_ && rhs.compare(0, std::string::npos, txt_, len_) == 0;
1298   }
1299   /// Returns true if capture index is equal to a given size_t value, useful for std::algorithm.
1300   bool operator==(size_t rhs) ///< capture index to compare accept() to
1301     /// @returns true if capture index is equal to rhs
1302     const
1303   {
1304     return accept() == rhs;
1305   }
1306   /// Returns true if capture index is equal to a given int value, useful for std::algorithm.
1307   bool operator==(int rhs) ///< capture index to compare accept() to
1308     /// @returns true if capture index is equal to rhs
1309     const
1310   {
1311     return static_cast<int>(accept()) == rhs;
1312   }
1313   /// Returns true if matched text is not equal to a string, useful for std::algorithm.
1314   bool operator!=(const char *rhs) ///< rhs string to compare to
1315     /// @returns true if matched text is not equal to rhs string
1316     const
1317   {
1318     return std::strncmp(rhs, txt_, len_) != 0 || rhs[len_] != '\0'; // if static checkers complain here, they are wrong
1319   }
1320   /// Returns true if matched text is not equal to a string, useful for std::algorithm.
1321   bool operator!=(const std::string& rhs) ///< rhs string to compare to
1322     /// @returns true if matched text is not equal to rhs string
1323     const
1324   {
1325     return rhs.size() > len_ || rhs.compare(0, std::string::npos, txt_, len_) != 0;
1326   }
1327   /// Returns true if capture index is not equal to a given size_t value, useful for std::algorithm.
1328   bool operator!=(size_t rhs) ///< capture index to compare accept() to
1329     /// @returns true if capture index is not equal to rhs
1330     const
1331   {
1332     return accept() != rhs;
1333   }
1334   /// Returns true if capture index is not equal to a given int value, useful for std::algorithm.
1335   bool operator!=(int rhs) ///< capture index to compare accept() to
1336     /// @returns true if capture index is not equal to rhs
1337     const
1338   {
1339     return static_cast<int>(accept()) != rhs;
1340   }
  /// Returns captured text as a std::pair<const char*,size_t> with string pointer (non-0-terminated) and length; pure virtual, implemented by the matcher engines.
  virtual std::pair<const char*,size_t> operator[](size_t n) ///< capture index, where 0 refers to the entire match
    /// @returns std::pair of string pointer and length in the captured text, where [0] returns std::pair(begin(), size())
    const = 0;
  /// Returns the group capture identifier containing the group capture index >0 and name (or NULL) of a named group capture, or (1,NULL) by default; pure virtual, implemented by the matcher engines.
  virtual std::pair<size_t,const char*> group_id()
    /// @returns a pair of size_t (group capture index) and string (group name or NULL)
    = 0;
  /// Returns the next group capture identifier containing the group capture index >0 and name (or NULL) of a named group capture, or (0,NULL) when no more groups matched; pure virtual, implemented by the matcher engines.
  virtual std::pair<size_t,const char*> group_next_id()
    /// @returns a pair of size_t (group capture index) and string (group name or NULL)
    = 0;
1353   /// Set tab size 1, 2, 4, or 8
tabs(char n)1354   void tabs(char n) ///< tab size 1, 2, 4, or 8
1355   {
1356     opt_.T = n & 0xf;
1357   }
1358   /// Returns current tab size 1, 2, 4, or 8.
tabs()1359   char tabs()
1360   {
1361     return opt_.T;
1362   }
  Operation scan;  ///< functor to scan input (to tokenize input), bound to method Const::SCAN
  Operation find;  ///< functor to search input, bound to method Const::FIND
  Operation split; ///< functor to split input, bound to method Const::SPLIT
  Input in;        ///< input character sequence being matched by this matcher
1367  protected:
1368   /// Initialize the base abstract matcher at construction.
1369   virtual void init(const char *opt = NULL) ///< options
1370   {
1371     DBGLOG("AbstractMatcher::init(%s)", opt ? opt : "");
1372     own_ = false; // require allocation of a buffer
1373     reset(opt);
1374   }
  /// The abstract match operation implemented by pattern matching engines derived from AbstractMatcher; pure virtual.
  virtual size_t match(Method method) ///< match method Const::SCAN, Const::FIND, Const::SPLIT, or Const::MATCH
    /// @returns nonzero when input matched the pattern using method Const::SCAN, Const::FIND, Const::SPLIT, or Const::MATCH
    = 0;
  /// Shift or expand the internal buffer when it is too small to accommodate more input, where the buffer size is doubled when needed, change cur_, pos_, end_, max_, ind_, buf_, bol_, lpb_, and txt_. Buffered input that precedes the current line (WITH_SPAN) or the current match (otherwise) is discarded by moving the remaining input to the start of the buffer, and num_ is increased by the number of bytes discarded.
  inline bool grow(size_t need = Const::BLOCK) ///< optional needed space = Const::BLOCK size by default
    /// @returns true if buffer was shifted or enlarged
  {
    // enough free space is available already (need bytes plus one extra byte)
    if (max_ - end_ >= need + 1)
      return false;
#if defined(WITH_SPAN)
    // update the line bookkeeping before buffer contents are shifted out
    (void)lineno();
    cno_ = 0;
    if (bol_ + Const::BUFSZ - buf_ < txt_ - bol_ && evh_ == NULL)
    {
      // this line is very long, so shift all the way to the match instead of to the begin of the last line
      DBGLOG("Line in buffer is too long to shift, moving bol position to text match position");
      (void)columno();
      bol_ = txt_;
    }
    // number of bytes in the buffer that precede the begin of the line
    size_t gap = bol_ - buf_;
    if (gap > 0)
    {
      // let the event handler, when set, see the bytes that are about to be shifted out
      if (evh_ != NULL)
        (*evh_)(*this, buf_, gap, num_);
      // all positions and pointers into the buffer move down by gap
      cur_ -= gap;
      ind_ -= gap;
      pos_ -= gap;
      end_ -= gap;
      txt_ -= gap;
      bol_ -= gap;
      lpb_ -= gap;
      num_ += gap;
      std::memmove(buf_, buf_ + gap, end_);
    }
    if (max_ - end_ >= need)
    {
      DBGLOG("Shift buffer to close gap of %zu bytes", gap);
    }
    else
    {
      // shifting did not free enough space: double the buffer size until need bytes fit
      size_t newmax = end_ + need;
      while (max_ < newmax)
        max_ *= 2;
      DBGLOG("Expand buffer to %zu bytes", max_);
#if defined(WITH_REALLOC)
#if (defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(__BORLANDC__)) && !defined(__CYGWIN__)
      char *newbuf = static_cast<char*>(_aligned_realloc(static_cast<void*>(buf_), max_, 4096));
#else
      char *newbuf = static_cast<char*>(std::realloc(static_cast<void*>(buf_), max_));
#endif
      if (newbuf == NULL)
        throw std::bad_alloc();
#else
      char *newbuf = new char[max_];
      std::memcpy(newbuf, buf_, end_);
      delete[] buf_;
#endif
      // rebase the pointers into the old buffer onto the new buffer
      txt_ = newbuf + (txt_ - buf_);
      lpb_ = newbuf + (lpb_ - buf_);
      buf_ = newbuf;
    }
    // the begin of line and the column pointer are now at the start of the buffer
    bol_ = buf_;
    cpb_ = buf_;
#else
    // number of bytes in the buffer that precede the matched text
    size_t gap = txt_ - buf_;
    if (max_ - end_ + gap >= need)
    {
      // discarding the bytes before the match frees enough space
      DBGLOG("Shift buffer to close gap of %zu bytes", gap);
      (void)lineno();
      cur_ -= gap;
      ind_ -= gap;
      pos_ -= gap;
      end_ -= gap;
      num_ += gap;
      if (end_ > 0)
        std::memmove(buf_, txt_, end_);
      txt_ = buf_;
      lpb_ = buf_;
    }
    else
    {
      // double the buffer size until the input kept plus need bytes fit
      size_t newmax = end_ - gap + need;
      size_t oldmax = max_;
      while (max_ < newmax)
        max_ *= 2;
      if (oldmax < max_)
      {
        DBGLOG("Expand buffer from %zu to %zu bytes", oldmax, max_);
        (void)lineno();
        cur_ -= gap;
        ind_ -= gap;
        pos_ -= gap;
        end_ -= gap;
        num_ += gap;
#if defined(WITH_REALLOC)
        // move the input to keep to the start of the buffer before reallocating
        std::memmove(buf_, txt_, end_);
#if (defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(__BORLANDC__)) && !defined(__CYGWIN__)
        char *newbuf = static_cast<char*>(_aligned_realloc(static_cast<void*>(buf_), max_, 4096));
#else
        char *newbuf = static_cast<char*>(std::realloc(static_cast<void*>(buf_), max_));
#endif
        if (newbuf == NULL)
          throw std::bad_alloc();
#else
        // copy only the input to keep, starting at the match, into the new buffer
        char *newbuf = new char[max_];
        std::memcpy(newbuf, txt_, end_);
        delete[] buf_;
#endif
        buf_ = newbuf;
        txt_ = buf_;
        lpb_ = buf_;
      }
    }
#endif
    return true;
  }
1492   /// Returns the next character read from the current input source.
get()1493   inline int get()
1494     /// @returns the character read (unsigned char 0..255) or EOF (-1)
1495   {
1496     DBGLOG("AbstractMatcher::get()");
1497 #if defined(WITH_FAST_GET)
1498     return pos_ < end_ ? static_cast<unsigned char>(buf_[pos_++]) : get_more();
1499 #else
1500     if (pos_ < end_)
1501       return static_cast<unsigned char>(buf_[pos_++]);
1502     if (eof_)
1503       return EOF;
1504     while (true)
1505     {
1506       if (end_ + blk_ + 1 >= max_)
1507         (void)grow();
1508       end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1509       if (pos_ < end_)
1510         return static_cast<unsigned char>(buf_[pos_++]);
1511       DBGLOGN("get(): EOF");
1512       if (!wrap())
1513       {
1514         eof_ = true;
1515         return EOF;
1516       }
1517     }
1518 #endif
1519   }
1520   /// Reset the matched text by removing the terminating \0, which is needed to search for a new match.
reset_text()1521   inline void reset_text()
1522   {
1523     if (chr_ != '\0')
1524     {
1525       txt_[len_] = chr_;
1526       chr_ = '\0';
1527     }
1528   }
1529   /// Set the current position in the buffer for the next match.
set_current(size_t loc)1530   inline void set_current(size_t loc) ///< new location in buffer
1531   {
1532     DBGCHK(loc <= end_);
1533     pos_ = cur_ = loc;
1534 #if defined(WITH_SPAN)
1535     got_ = loc > 0 ? static_cast<unsigned char>(buf_[loc - 1]) : '\n';
1536 #else
1537     got_ = loc > 0 ? static_cast<unsigned char>(buf_[loc - 1]) : Const::UNK;
1538 #endif
1539   }
1540   /// Set the current match position in the buffer.
set_current_match(size_t loc)1541   inline void set_current_match(size_t loc) ///< new location in buffer
1542   {
1543     set_current(loc);
1544     txt_ = buf_ + cur_;
1545   }
1546   /// Get the next character and grow the buffer to make more room if necessary.
get_more()1547   inline int get_more()
1548     /// @returns the character read (unsigned char 0..255) or EOF (-1)
1549   {
1550     DBGLOG("AbstractMatcher::get_more()");
1551     if (eof_)
1552       return EOF;
1553     while (true)
1554     {
1555       if (end_ + blk_ + 1 >= max_)
1556         (void)grow();
1557       end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1558       if (pos_ < end_)
1559         return static_cast<unsigned char>(buf_[pos_++]);
1560       DBGLOGN("get_more(): EOF");
1561       if (!wrap())
1562       {
1563         eof_ = true;
1564         return EOF;
1565       }
1566     }
1567   }
1568   /// Peek at the next character and grow the buffer to make more room if necessary.
peek_more()1569   inline int peek_more()
1570     /// @returns the character (unsigned char 0..255) or EOF (-1)
1571   {
1572     DBGLOG("AbstractMatcher::peek_more()");
1573     if (eof_)
1574       return EOF;
1575     while (true)
1576     {
1577       if (end_ + blk_ + 1 >= max_)
1578         (void)grow();
1579       end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1580       if (pos_ < end_)
1581         return static_cast<unsigned char>(buf_[pos_]);
1582       DBGLOGN("peek_more(): EOF");
1583       if (!wrap())
1584       {
1585         eof_ = true;
1586         return EOF;
1587       }
1588     }
1589   }
  Option      opt_; ///< options for matcher engines
  char       *buf_; ///< input character sequence buffer
  char       *txt_; ///< points to the matched text in buffer AbstractMatcher::buf_
  size_t      len_; ///< size of the matched text
  size_t      cap_; ///< nonzero capture index of an accepted match or zero
  size_t      cur_; ///< next position in AbstractMatcher::buf_ to assign to AbstractMatcher::txt_
  size_t      pos_; ///< position in AbstractMatcher::buf_ after AbstractMatcher::txt_, the next character to read
  size_t      end_; ///< ending position of the input buffered in AbstractMatcher::buf_
  size_t      max_; ///< total buffer size and max position + 1 to fill
  size_t      ind_; ///< current indent position
  size_t      blk_; ///< block size for block-based input reading, as set by AbstractMatcher::buffer, zero to read as much as fits in the buffer
  int         got_; ///< last unsigned character we looked at (to determine anchors and boundaries)
  int         chr_; ///< the character located at AbstractMatcher::txt_[AbstractMatcher::len_], saved while it is replaced by \0, or \0 when nothing is saved
#if defined(WITH_SPAN)
  const char *bol_; ///< begin of line pointer in buffer
  Handler    *evh_; ///< event handler functor to invoke when buffer contents are shifted out
#endif
  const char *lpb_; ///< line pointer in buffer, updated when counting line numbers with lineno()
  size_t      lno_; ///< line number count (cached)
#if defined(WITH_SPAN)
  const char *cpb_; ///< column pointer in buffer, updated when counting column numbers with columno()
#endif
  size_t      cno_; ///< column number count (cached)
  size_t      num_; ///< number of input bytes shifted out of the buffer so far, i.e. the position of AbstractMatcher::buf_[0] in the input
  bool        own_; ///< true if AbstractMatcher::buf_ was allocated and should be deleted
  bool        eof_; ///< input has reached EOF
  bool        mat_; ///< true if AbstractMatcher::matches() was successful
1617 };
1618 
1619 /// The pattern matcher class template extends abstract matcher base class.
1620 template<typename P> /// @tparam <P> pattern class to instantiate a matcher
1621 class PatternMatcher : public AbstractMatcher {
1622  public:
1623   typedef P Pattern; ///< pattern class of this matcher, a typedef of the PatternMatcher template parameter
  /// Copy constructor, the underlying pattern object is shared (not deep copied): the new matcher uses the input and options of the given matcher and does not own the pattern.
  PatternMatcher(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
    :
      AbstractMatcher(matcher.in, matcher.opt_),
      pat_(matcher.pat_),
      own_(false) // the pattern stays owned by whoever owned it before
  {
    DBGLOG("PatternMatcher::PatternMatcher(matcher)");
  }
1633   /// Delete matcher, deletes pattern when owned
~PatternMatcher()1634   virtual ~PatternMatcher()
1635   {
1636     DBGLOG("PatternMatcher::~PatternMatcher()");
1637     if (own_ && pat_ != NULL)
1638       delete pat_;
1639   }
1640   /// Assign a matcher, the underlying pattern object is shared (not deep copied).
1641   PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
1642   {
1643     scan.init(this, Const::SCAN);
1644     find.init(this, Const::FIND);
1645     split.init(this, Const::SPLIT);
1646     in = matcher.in;
1647     reset();
1648     opt_ = matcher.opt_;
1649     pat_ = matcher.pat_,
1650     own_ = false;
1651     return *this;
1652   }
1653   /// Set the pattern to use with this matcher as a shared pointer to another matcher pattern.
pattern(const PatternMatcher & matcher)1654   virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher
1655     /// @returns this matcher
1656   {
1657     opt_ = matcher.opt_;
1658     return this->pattern(matcher.pattern());
1659   }
1660   /// Set the pattern to use with this matcher (the given pattern is shared and must be persistent).
pattern(const Pattern & pattern)1661   virtual PatternMatcher& pattern(const Pattern& pattern) ///< pattern object for this matcher
1662     /// @returns this matcher
1663   {
1664     DBGLOG("PatternMatcher::pattern()");
1665     if (pat_ != &pattern)
1666     {
1667       if (own_ && pat_ != NULL)
1668         delete pat_;
1669       pat_ = &pattern;
1670       own_ = false;
1671     }
1672     return *this;
1673   }
1674   /// Set the pattern to use with this matcher (the given pattern is shared and must be persistent).
pattern(const Pattern * pattern)1675   virtual PatternMatcher& pattern(const Pattern *pattern) ///< pattern object for this matcher
1676     /// @returns this matcher
1677   {
1678     DBGLOG("PatternMatcher::pattern()");
1679     if (pat_ != pattern)
1680     {
1681       if (own_ && pat_ != NULL)
1682         delete pat_;
1683       pat_ = pattern;
1684       own_ = false;
1685     }
1686     return *this;
1687   }
1688   /// Set the pattern from a regex string to use with this matcher.
pattern(const char * pattern)1689   virtual PatternMatcher& pattern(const char *pattern) ///< regex string to instantiate internal pattern object
1690     /// @returns this matcher
1691   {
1692     DBGLOG("PatternMatcher::pattern(\"%s\")", pattern);
1693     if (own_ && pat_ != NULL)
1694       delete pat_;
1695     pat_ = new Pattern(pattern);
1696     own_ = true;
1697     return *this;
1698   }
1699   /// Set the pattern from a regex string to use with this matcher.
pattern(const std::string & pattern)1700   virtual PatternMatcher& pattern(const std::string& pattern) ///< regex string to instantiate internal pattern object
1701     /// @returns this matcher
1702   {
1703     DBGLOG("PatternMatcher::pattern(\"%s\")", pattern.c_str());
1704     if (own_ && pat_ != NULL)
1705       delete pat_;
1706     pat_ = new Pattern(pattern);
1707     own_ = true;
1708     return *this;
1709   }
1710   /// Returns true if this matcher has a pattern.
has_pattern()1711   bool has_pattern() const
1712     /// @returns true if this matcher has a pattern
1713   {
1714     return pat_ != NULL;
1715   }
1716   /// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete).
own_pattern()1717   bool own_pattern() const
1718     /// @returns true if this matcher has its own pattern
1719   {
1720     return own_ && pat_ != NULL;
1721   }
1722   /// Returns a reference to the pattern object associated with this matcher.
pattern()1723   const Pattern& pattern() const
1724     /// @returns reference to pattern object
1725   {
1726     ASSERT(pat_ != NULL);
1727     return *pat_;
1728   }
1729  protected:
1730   /// Construct a base abstract matcher from a pointer to a persistent pattern object (that is shared with this class) and an input character sequence.
1731   PatternMatcher(
1732       const Pattern *pattern = NULL,  ///< points to pattern object for this matcher
1733       const Input&   input = Input(), ///< input character sequence for this matcher
1734       const char    *opt = NULL)      ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1735     :
AbstractMatcher(input,opt)1736       AbstractMatcher(input, opt),
1737       pat_(pattern),
1738       own_(false)
1739   { }
1740   /// Construct a base abstract matcher from a persistent pattern object (that is shared with this class) and an input character sequence.
1741   PatternMatcher(
1742       const Pattern& pattern,         ///< pattern object for this matcher
1743       const Input&   input = Input(), ///< input character sequence for this matcher
1744       const char    *opt = NULL)      ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1745     :
AbstractMatcher(input,opt)1746       AbstractMatcher(input, opt),
1747       pat_(&pattern),
1748       own_(false)
1749   { }
1750   /// Construct a base abstract matcher from a regex pattern string and an input character sequence.
1751   PatternMatcher(
1752       const char  *pattern,         ///< regex string instantiates pattern object for this matcher
1753       const Input& input = Input(), ///< input character sequence for this matcher
1754       const char  *opt = NULL)      ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1755     :
AbstractMatcher(input,opt)1756       AbstractMatcher(input, opt),
1757       pat_(new Pattern(pattern)),
1758       own_(true)
1759   { }
1760   /// Construct a base abstract matcher from a regex pattern string and an input character sequence.
1761   PatternMatcher(
1762       const std::string& pattern,         ///< regex string instantiates pattern object for this matcher
1763       const Input&       input = Input(), ///< input character sequence for this matcher
1764       const char        *opt = NULL)      ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1765     :
AbstractMatcher(input,opt)1766       AbstractMatcher(input, opt),
1767       pat_(new Pattern(pattern)),
1768       own_(true)
1769   { }
1770   const Pattern *pat_; ///< points to the pattern object used by the matcher
1771   bool           own_; ///< true if PatternMatcher::pat_ was allocated and should be deleted
1772 };
1773 
1774 /// A specialization of the pattern matcher class template for std::string, extends abstract matcher base class.
1775 template<>
1776 class PatternMatcher<std::string> : public AbstractMatcher {
1777  public:
1778   typedef std::string Pattern; ///< pattern class of this matcher
1779   /// Copy constructor, the underlying pattern string is copied.
PatternMatcher(const PatternMatcher & matcher)1780   PatternMatcher(const PatternMatcher& matcher) ///< matcher with pattern to copy and use
1781     :
1782       AbstractMatcher(matcher.in, matcher.opt_),
1783       pat_(matcher.pat_ != NULL ? new Pattern(*matcher.pat_) : NULL),
1784       own_(matcher.pat_ != NULL)
1785   { }
1786   /// Delete matcher, deletes pattern when owned
~PatternMatcher()1787   virtual ~PatternMatcher()
1788   {
1789     DBGLOG("PatternMatcher::~PatternMatcher()");
1790     if (own_ && pat_ != NULL)
1791       delete pat_;
1792   }
1793   /// Assign a matcher, the underlying pattern string is shared (not deep copied).
1794   PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
1795   {
1796     scan.init(this, Const::SCAN);
1797     find.init(this, Const::FIND);
1798     split.init(this, Const::SPLIT);
1799     in = matcher.in;
1800     reset();
1801     opt_ = matcher.opt_;
1802     pat_ = matcher.pat_,
1803     own_ = false;
1804     return *this;
1805   }
1806   /// Set the pattern to use with this matcher as a shared pointer to another matcher pattern.
pattern(const PatternMatcher & matcher)1807   virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher
1808     /// @returns this matcher
1809   {
1810     opt_ = matcher.opt_;
1811     return this->pattern(matcher.pattern());
1812   }
1813   /// Set the pattern to use with this matcher (the given pattern is shared and must be persistent).
pattern(const Pattern * pattern)1814   virtual PatternMatcher& pattern(const Pattern *pattern) ///< pattern string for this matcher
1815     /// @returns this matcher
1816   {
1817     DBGLOG("Patternatcher::pattern()");
1818     if (pat_ != pattern)
1819     {
1820       if (own_ && pat_ != NULL)
1821         delete pat_;
1822       pat_ = pattern;
1823       own_ = false;
1824     }
1825     return *this;
1826   }
1827   /// Set the pattern from a regex string to use with this matcher.
pattern(const char * pattern)1828   virtual PatternMatcher& pattern(const char *pattern) ///< regex string to instantiate internal pattern string
1829     /// @returns this matcher
1830   {
1831     DBGLOG("Patternatcher::pattern(\"%s\")", pattern);
1832     if (own_ && pat_ != NULL)
1833       delete pat_;
1834     pat_ = new Pattern(pattern);
1835     own_ = true;
1836     return *this;
1837   }
1838   /// Set the pattern from a regex string to use with this matcher.
pattern(const std::string & pattern)1839   virtual PatternMatcher& pattern(const std::string& pattern) ///< regex string to instantiate internal pattern string
1840     /// @returns this matcher
1841   {
1842     DBGLOG("Patternatcher::pattern(\"%s\")", pattern.c_str());
1843     if (own_ && pat_ != NULL)
1844       delete pat_;
1845     pat_ = new Pattern(pattern);
1846     own_ = true;
1847     return *this;
1848   }
1849   /// Returns true if this matcher has a pattern.
has_pattern()1850   bool has_pattern() const
1851     /// @returns true if this matcher has a pattern
1852   {
1853     return pat_ != NULL;
1854   }
1855   /// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete).
own_pattern()1856   bool own_pattern() const
1857     /// @returns true if this matcher has its own pattern
1858   {
1859     return own_ && pat_ != NULL;
1860   }
1861   /// Returns a reference to the pattern string associated with this matcher.
pattern()1862   const Pattern& pattern() const
1863     /// @returns reference to pattern string
1864   {
1865     ASSERT(pat_ != NULL);
1866     return *pat_;
1867   }
1868  protected:
1869   /// Construct a base abstract matcher from a pointer to a persistent pattern string (that is shared with this class) and an input character sequence.
1870   PatternMatcher(
1871       const Pattern *pattern = NULL,  ///< points to pattern string for this matcher
1872       const Input&   input = Input(), ///< input character sequence for this matcher
1873       const char    *opt = NULL)      ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1874     :
AbstractMatcher(input,opt)1875       AbstractMatcher(input, opt),
1876       pat_(pattern),
1877       own_(false)
1878   { }
1879   /// Construct a base abstract matcher from a regex pattern string and an input character sequence.
1880   PatternMatcher(
1881       const char  *pattern,         ///< regex string instantiates pattern string for this matcher
1882       const Input& input = Input(), ///< input character sequence for this matcher
1883       const char  *opt = NULL)      ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1884     :
AbstractMatcher(input,opt)1885       AbstractMatcher(input, opt),
1886       pat_(new Pattern(pattern)),
1887       own_(true)
1888   { }
1889   /// Construct a base abstract matcher from a regex pattern string and an input character sequence.
1890   PatternMatcher(
1891       const std::string& pattern,         ///< regex string instantiates pattern string for this matcher
1892       const Input&       input = Input(), ///< input character sequence for this matcher
1893       const char        *opt = NULL)      ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1894     :
AbstractMatcher(input,opt)1895       AbstractMatcher(input, opt),
1896       pat_(new Pattern(pattern)),
1897       own_(true)
1898   { }
1899   const Pattern *pat_; ///< points to the pattern string used by the matcher
1900   bool           own_; ///< true if PatternMatcher::pat_ was allocated and should be deleted
1901 };
1902 
1903 } // namespace reflex
1904 
1905 /// Write matched text to a stream.
1906 inline std::ostream& operator<<(std::ostream& os, const reflex::AbstractMatcher& matcher)
1907 {
1908   os.write(matcher.begin(), matcher.size());
1909   return os;
1910 }
1911 
1912 /// Read stream and store all content in the matcher's buffer.
1913 inline std::istream& operator>>(std::istream& is, reflex::AbstractMatcher& matcher)
1914 {
1915   matcher.input(is).buffer();
1916   return is;
1917 }
1918 
1919 #endif
1920