1 /******************************************************************************\
2 * Copyright (c) 2016, Robert van Engelen, Genivia Inc. All rights reserved. *
3 * *
4 * Redistribution and use in source and binary forms, with or without *
5 * modification, are permitted provided that the following conditions are met: *
6 * *
7 * (1) Redistributions of source code must retain the above copyright notice, *
8 * this list of conditions and the following disclaimer. *
9 * *
10 * (2) Redistributions in binary form must reproduce the above copyright *
11 * notice, this list of conditions and the following disclaimer in the *
12 * documentation and/or other materials provided with the distribution. *
13 * *
14 * (3) The name of the author may not be used to endorse or promote products *
15 * derived from this software without specific prior written permission. *
16 * *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED *
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF *
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO *
20 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; *
23 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, *
24 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR *
25 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF *
26 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
27 \******************************************************************************/
28
29 /**
30 @file absmatcher.h
31 @brief RE/flex abstract matcher base class and pattern matcher class
32 @author Robert van Engelen - engelen@genivia.com
33 @copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved.
34 @copyright (c) BSD-3 License - see LICENSE.txt
35 */
36
37 #ifndef REFLEX_ABSMATCHER_H
38 #define REFLEX_ABSMATCHER_H
39
40 /// This compile-time option may speed up buffer reallocation with realloc() instead of new and delete.
41 #define WITH_REALLOC
42
43 /// This compile-time option speeds up matching, but slows input().
44 #define WITH_FAST_GET
45
46 /// This compile-time option adds span(), line(), wline(), bol(), eol()
47 #define WITH_SPAN
48
#include <reflex/convert.h>
#include <reflex/debug.h>
#include <reflex/input.h>
#include <reflex/traits.h>
#include <reflex/simd.h>
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <iterator>
57
58 namespace reflex {
59
60 /// Check ASCII word-like character `[A-Za-z0-9_]`, permitting the character range 0..303 (0x12F) and EOF.
inline int isword(int c) ///< Character to check
  /// @returns nonzero if argument c is in `[A-Za-z0-9_]`, zero otherwise
{
  // the argument is narrowed to unsigned char before classification, which is why meta values
  // such as EOF, Const::UNK (256) and Const::BOB (257) can be passed without tripping isalnum()
  const int alnum = std::isalnum(static_cast<unsigned char>(c));
  // the underscore is compared on the full int value, not on the narrowed byte
  const int underscore = (c == '_');
  return alnum | underscore;
}
66
67 /// The abstract matcher base class template defines an interface for all pattern matcher engines.
68 /**
69 The buffer expands when matches do not fit. The buffer size is initially BUFSZ.
70
71 ```
72 _________________
73 | | | | |
74 buf_=| |text|rest|free|
75 |__|____|____|____|
76 ^ ^ ^ ^
77 cur_ pos_ end_ max_
78
79 buf_ // points to buffered input, buffer may grow to fit long matches
80 cur_ // current position in buf_ while matching text, cur_ = pos_ afterwards, can be changed by more()
81 pos_ // position in buf_ to start the next match
82 end_ // position in buf_ that is free to fill with more input
83 max_ // allocated size of buf_, must ensure that max_ > end_ for text() to add a final \0
84 txt_ // points to the match, will be 0-terminated when text() or rest() are called
85 len_ // length of the match
86 chr_ // char located at txt_[len_] when txt_[len_] is set to \0 by text(), is \0 otherwise
87 got_ // buf_[cur_-1] or txt_[-1] character before this match (assigned before each match), initially Const::BOB
88 eof_ // true if no more data can/should be fetched to fill the buffer
89 ```
90 */
91 class AbstractMatcher {
92 protected:
93 typedef int Method; ///< a method is one of Const::SCAN, Const::FIND, Const::SPLIT, Const::MATCH
94 public:
95 /// AbstractMatcher::Const common constants.
struct Const {
  // match methods, passed as the Method argument of match()
  static const Method SCAN = 0; ///< AbstractMatcher::match method is to scan input (tokenizer)
  static const Method FIND = 1; ///< AbstractMatcher::match method is to find pattern in input
  static const Method SPLIT = 2; ///< AbstractMatcher::match method is to split input at pattern matches
  static const Method MATCH = 3; ///< AbstractMatcher::match method is to match the entire input
  // character and meta-character codes; UNK and BOB lie just above the 8-bit character range
  static const int NUL = '\0'; ///< NUL string terminator
  static const int UNK = 256; ///< unknown/undefined character meta-char marker
  static const int BOB = 257; ///< begin of buffer meta-char marker
  static const int EOB = EOF; ///< end of buffer meta-char marker
  // the initial buffer size can be overridden at compile time by defining REFLEX_BUFSZ
#ifndef REFLEX_BUFSZ
  static const size_t BUFSZ = (64*1024); ///< initial buffer size, at least 4096 bytes
#else
  static const size_t BUFSZ = REFLEX_BUFSZ;
#endif
  static const size_t BLOCK = 4096; ///< minimum remaining unused space in the buffer, to prevent excessive shifting
  // special accept() codes
  static const size_t REDO = 0x7FFFFFFF; ///< reflex::Matcher::accept() returns "redo" with reflex::Matcher option "A"
  static const size_t EMPTY = 0xFFFFFFFF; ///< accept() returns "empty" last split at end of input
};
114 /// Context returned by before() and after()
struct Context {
  /// Construct an empty context: no buffer, zero length, nothing shifted out.
  Context() : buf(NULL), len(0), num(0) { }
  /// Construct a context that refers to len bytes located at buf, with num bytes shifted out so far.
  Context(const char *buf, size_t len, size_t num) : buf(buf), len(len), num(num) { }
  const char *buf; ///< pointer to buffer
  size_t len;      ///< length of buffered context
  size_t num;      ///< number of bytes shifted out so far, when buffer shifted
};
132 /// Event handler functor base class to invoke when the buffer contents are shifted out, e.g. for logging the data searched.
133 struct Handler { virtual void operator()(AbstractMatcher&, const char*, size_t, size_t) = 0; };
134 protected:
135 /// AbstractMatcher::Options for matcher engines.
struct Option {
  /// Construct the default options: all flags off and a tab size of 8.
  Option() : A(false), N(false), W(false), T(8) { }
  bool A; ///< accept any/all (?^X) negative patterns as Const::REDO accept index codes
  bool N; ///< nullable, find may return empty match (N/A to scan, split, matches)
  bool W; ///< half-check for "whole words", check only left of \< and right of \> for non-word character
  char T; ///< tab size, must be a power of 2, default is 8, for column count and indent \i, \j, and \k
};
149 /// AbstractMatcher::Iterator class for scanning, searching, and splitting input character sequences.
150 template<typename T> /// @tparam <T> AbstractMatcher or const AbstractMatcher
151 class Iterator : public std::iterator<std::input_iterator_tag,T> {
152 friend class AbstractMatcher;
153 friend class Iterator<typename reflex::TypeOp<T>::ConstType>;
154 friend class Iterator<typename reflex::TypeOp<T>::NonConstType>;
155 public:
156 /// Construct an AbstractMatcher::Iterator such that Iterator() == AbstractMatcher::Operation(*this, method).end().
Iterator()157 Iterator()
158 :
159 matcher_(NULL),
160 method_()
161 { }
162 /// Copy constructor.
Iterator(const Iterator<typename reflex::TypeOp<T>::NonConstType> & it)163 Iterator(const Iterator<typename reflex::TypeOp<T>::NonConstType>& it)
164 :
165 matcher_(it.matcher_),
166 method_(it.method_)
167 { }
168 /// AbstractMatcher::Iterator dereference.
169 T& operator*() const
170 /// @returns (const) reference to the iterator's matcher
171 {
172 return *matcher_;
173 }
174 /// AbstractMatcher::Iterator pointer.
175 T* operator->() const
176 /// @returns (const) pointer to the iterator's matcher
177 {
178 return matcher_;
179 }
180 /// AbstractMatcher::Iterator equality.
181 bool operator==(const Iterator<typename reflex::TypeOp<T>::ConstType>& rhs) const
182 /// @returns true if iterator equals RHS
183 {
184 return matcher_ == rhs.matcher_;
185 }
186 /// AbstractMatcher::Iterator inequality.
187 bool operator!=(const Iterator<typename reflex::TypeOp<T>::ConstType>& rhs) const
188 /// @returns true if iterator does not equal RHS
189 {
190 return matcher_ != rhs.matcher_;
191 }
192 /// AbstractMatcher::Iterator preincrement.
193 Iterator& operator++()
194 /// @returns reference to this iterator
195 {
196 if (matcher_->match(method_) == 0)
197 matcher_ = NULL;
198 return *this;
199 }
200 /// AbstractMatcher::Iterator postincrement.
201 Iterator operator++(int)
202 /// @returns iterator to current match
203 {
204 Iterator it = *this;
205 operator++();
206 return it;
207 }
208 /// Construct an AbstractMatcher::Iterator to scan, search, or split an input character sequence.
Iterator(AbstractMatcher * matcher,Method method)209 Iterator(
210 AbstractMatcher *matcher, ///< iterate over pattern matches with this matcher
211 Method method) ///< match using method Const::SCAN, Const::FIND, or Const::SPLIT
212 :
213 matcher_(matcher),
214 method_(method)
215 {
216 if (matcher_ && matcher_->match(method_) == 0)
217 matcher_ = NULL;
218 }
219 private:
220 AbstractMatcher *matcher_; ///< the matcher used by this iterator
221 Method method_; ///< the method for pattern matching by this iterator's matcher
222 };
223 public:
224 typedef AbstractMatcher::Iterator<AbstractMatcher> iterator; ///< std::input_iterator for scanning, searching, and splitting input character sequences
225 typedef AbstractMatcher::Iterator<const AbstractMatcher> const_iterator; ///< std::input_iterator for scanning, searching, and splitting input character sequences
226 /// AbstractMatcher::Operation functor to match input to a pattern, also provides a (const) AbstractMatcher::iterator to iterate over matches.
227 class Operation {
228 public:
229 /// Construct an AbstractMatcher::Operation functor to scan, search, or split an input character sequence.
Operation(AbstractMatcher * matcher,Method method)230 Operation(
231 AbstractMatcher *matcher, ///< use this matcher for this functor
232 Method method) ///< match using method Const::SCAN, Const::FIND, or Const::SPLIT
233 :
234 matcher_(matcher),
235 method_(method)
236 { }
init(AbstractMatcher * matcher,Method method)237 void init(
238 AbstractMatcher *matcher, ///< use this matcher for this functor
239 Method method) ///< match using method Const::SCAN, Const::FIND, or Const::SPLIT
240 {
241 matcher_ = matcher;
242 method_ = method;
243 }
244 /// AbstractMatcher::Operation() matches input to a pattern using method Const::SCAN, Const::FIND, or Const::SPLIT.
operator()245 size_t operator()() const
246 /// @returns value of accept() >= 1 for match or 0 for end of matches
247 {
248 return matcher_->match(method_);
249 }
250 /// AbstractMatcher::Operation.begin() returns a std::input_iterator to the start of the matches.
begin()251 iterator begin() const
252 /// @returns input iterator
253 {
254 return iterator(matcher_, method_);
255 }
256 /// AbstractMatcher::Operation.end() returns a std::input_iterator to the end of matches.
end()257 iterator end() const
258 /// @returns input iterator
259 {
260 return iterator();
261 }
262 /// AbstractMatcher::Operation.cbegin() returns a const std::input_iterator to the start of the matches.
cbegin()263 const_iterator cbegin() const
264 /// @returns input const_iterator
265 {
266 return const_iterator(matcher_, method_);
267 }
268 /// AbstractMatcher::Operation.cend() returns a const std::input_iterator to the end of matches.
cend()269 const_iterator cend() const
270 /// @returns input const_iterator
271 {
272 return const_iterator();
273 }
274 private:
275 AbstractMatcher *matcher_; ///< the matcher used by this functor
276 Method method_; ///< the method for pattern matching by this functor's matcher
277 };
278 /// Construct a base abstract matcher.
AbstractMatcher(
    const Input& input, ///< input character sequence for this matcher
    const char *opt)    ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
  :
    // bind the three public functors to this matcher, one per match method
    scan(this, Const::SCAN),
    find(this, Const::FIND),
    split(this, Const::SPLIT)
{
  in = input;
  // NOTE(review): init() is defined outside this part of the file; presumably it sets up the
  // matcher state and applies the option string - confirm
  init(opt);
}
290 /// Construct a base abstract matcher.
AbstractMatcher(
    const Input& input, ///< input character sequence for this matcher
    const Option& opt)  ///< options
  :
    // bind the three public functors to this matcher, one per match method
    scan(this, Const::SCAN),
    find(this, Const::FIND),
    split(this, Const::SPLIT)
{
  in = input;
  // NOTE(review): init() is defined outside this part of the file; it is called without an option string here
  init();
  // the given options are copied after init(), so they are the ones in effect afterwards
  opt_ = opt;
}
303 /// Delete abstract matcher, deletes this matcher's internal buffer.
virtual ~AbstractMatcher()
{
  DBGLOG("AbstractMatcher::~AbstractMatcher()");
  // a buffer assigned with buffer(base, size) is not owned by this matcher and is left alone
  if (!own_)
    return;
  // release the buffer with the counterpart of the allocator used by reset()
#if defined(WITH_REALLOC)
#if (defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(__BORLANDC__)) && !defined(__CYGWIN__)
  _aligned_free(static_cast<void*>(buf_));
#else
  std::free(static_cast<void*>(buf_));
#endif
#else
  delete[] buf_;
#endif
}
320 /// Polymorphic cloning.
321 virtual AbstractMatcher *clone() = 0;
322 /// Reset this matcher's state to the initial state and set options (when provided).
323 virtual void reset(const char *opt = NULL)
324 {
325 DBGLOG("AbstractMatcher::reset(%s)", opt ? opt : "(null)");
326 if (opt)
327 {
328 opt_.A = false; // when true: accept any/all (?^X) negative patterns as Const::REDO accept index codes
329 opt_.N = false; // when true: find may return empty match (N/A to scan, split, matches)
330 opt_.W = false; // when true: half-check for "whole words", check only left of \< and right of \> for non-word character
331 opt_.T = 8; // tab size 1, 2, 4, or 8
332 if (opt)
333 {
334 for (const char *s = opt; *s != '\0'; ++s)
335 {
336 switch (*s)
337 {
338 case 'A':
339 opt_.A = true;
340 break;
341 case 'N':
342 opt_.N = true;
343 break;
344 case 'W':
345 opt_.W = true;
346 break;
347 case 'T':
348 opt_.T = isdigit(*(s += (s[1] == '=') + 1)) ? static_cast<char>(*s - '0') : 0;
349 break;
350 }
351 }
352 }
353 }
354 if (!own_)
355 {
356 max_ = Const::BUFSZ;
357 #if defined(WITH_REALLOC)
358 #if (defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(__BORLANDC__)) && !defined(__CYGWIN__)
359 buf_ = static_cast<char*>(_aligned_malloc(max_, 4096));
360 if (buf_ == NULL)
361 throw std::bad_alloc();
362 #else
363 buf_ = NULL;
364 if (posix_memalign(reinterpret_cast<void**>(&buf_), 4096, max_) != 0)
365 throw std::bad_alloc();
366 #endif
367 #else
368 buf_ = new char[max_];
369 #endif
370 }
371 buf_[0] = '\0';
372 txt_ = buf_;
373 len_ = 0;
374 cap_ = 0;
375 cur_ = 0;
376 pos_ = 0;
377 end_ = 0;
378 ind_ = 0;
379 blk_ = 0;
380 got_ = Const::BOB;
381 chr_ = '\0';
382 #if defined(WITH_SPAN)
383 bol_ = buf_;
384 evh_ = NULL;
385 #endif
386 lpb_ = buf_;
387 lno_ = 1;
388 #if defined(WITH_SPAN)
389 cpb_ = buf_;
390 #endif
391 cno_ = 0;
392 num_ = 0;
393 own_ = true;
394 eof_ = false;
395 mat_ = false;
396 }
397 /// Set buffer block size for reading: use 0 (or omit argument) to buffer all input in which case returns true if all the data could be read and false if a read error occurred.
398 bool buffer(size_t blk = 0) ///< new block size between 1 and Const::BLOCK, or 0 to buffer all input (default)
399 /// @returns true when successful to buffer all input when n=0
400 {
401 if (blk > Const::BLOCK)
402 blk = Const::BLOCK;
403 DBGLOG("AbstractMatcher::buffer(%zu)", blk);
404 blk_ = blk;
405 if (blk > 0 || eof_ || in.eof())
406 return true;
407 size_t n = in.size(); // get the (rest of the) data size, which is 0 if unknown (e.g. reading input from a TTY or a pipe)
408 if (n > 0)
409 {
410 (void)grow(n + 1); // now attempt to fetch all (remaining) data to store in the buffer, +1 for a final \0
411 end_ += get(buf_, n);
412 }
413 while (in.good()) // there is more to get while good(), e.g. via wrap()
414 {
415 (void)grow();
416 end_ += get(buf_ + end_, max_ - end_);
417 }
418 if (end_ == max_)
419 (void)grow(1); // make sure we have room for a final \0
420 return in.eof();
421 }
422 #if defined(WITH_SPAN)
423 /// Set event handler functor to invoke when the buffer contents are shifted out, e.g. for logging the data searched.
void set_handler(Handler *handler) ///< the handler to register, or NULL for none
{
  // only the pointer is stored; note that reset() and buffer(base, size) set evh_ back to NULL
  evh_ = handler;
}
428 /// Get the buffered context before the matching line.
before()429 inline Context before()
430 {
431 (void)lineno();
432 return Context(buf_, bol_ - buf_, num_);
433 }
434 /// Get the buffered context after EOF is reached.
after()435 inline Context after()
436 {
437 if (hit_end())
438 {
439 (void)lineno();
440 // if there is no \n at the end of input: increase line count by one to compensate
441 if (bol_ < txt_)
442 ++lno_;
443 return Context(buf_, end_, num_);
444 }
445 return Context(buf_, 0, num_);
446 }
447 #endif
448 /// Set interactive input with buffer size of 1 to read data bytewise which is very slow.
interactive()449 void interactive()
450 /// @note Use this method before any matching is done and before any input is read since the last time input was (re)set.
451 {
452 DBGLOG("AbstractMatcher::interactive()");
453 (void)buffer(1);
454 }
455 /// Flush the buffer's remaining content.
void flush()
{
  DBGLOG("AbstractMatcher::flush()");
  // moving the next-match position to the end of the buffered data discards what is left in the buffer
  pos_ = end_;
}
/// Returns more input data directly from the source (method can be overridden, as by reflex::FlexLexer::get(s, n) for example that invokes reflex::FlexLexer::LexerInput(s, n)).
get(char * s,size_t n)462 virtual size_t get(
463 /// @returns the nonzero number of (less or equal to n) 8-bit characters added to buffer s from the current input, or zero when EOF
464 char *s, ///< points to the string buffer to fill with input
465 size_t n) ///< size of buffer pointed to by s
466 {
467 return in.get(s, n);
468 }
469 /// Returns true if wrapping of input after EOF is supported.
virtual bool wrap()
  /// @returns true if input was successfully wrapped
{
  // the base implementation never wraps; being virtual, the method can be overridden to do so
  return false;
}
475 /// Set the input character sequence for this matcher and reset/restart the matcher.
input(const Input & input)476 virtual AbstractMatcher& input(const Input& input) ///< input character sequence for this matcher
477 /// @returns this matcher
478 {
479 DBGLOG("AbstractMatcher::input()");
480 in = input;
481 reset();
482 return *this;
483 }
484 /// Set the buffer base containing 0-terminated character data to scan in place (data may be modified), reset/restart the matcher.
buffer(char * base,size_t size)485 AbstractMatcher& buffer(
486 char *base, ///< base of the buffer containing 0-terminated character data
487 size_t size) ///< nonzero size of the buffer
488 /// @returns this matcher
489 {
490 if (size > 0)
491 {
492 if (own_)
493 {
494 #if defined(WITH_REALLOC)
495 #if (defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(__BORLANDC__)) && !defined(__CYGWIN__)
496 _aligned_free(static_cast<void*>(buf_));
497 #else
498 std::free(static_cast<void*>(buf_));
499 #endif
500 #else
501 delete[] buf_;
502 #endif
503 }
504 buf_ = base;
505 txt_ = buf_;
506 len_ = 0;
507 cap_ = 0;
508 cur_ = 0;
509 pos_ = 0;
510 end_ = size - 1;
511 max_ = size;
512 ind_ = 0;
513 blk_ = 0;
514 got_ = Const::BOB;
515 chr_ = '\0';
516 #if defined(WITH_SPAN)
517 bol_ = buf_;
518 evh_ = NULL;
519 #endif
520 lpb_ = buf_;
521 lno_ = 1;
522 #if defined(WITH_SPAN)
523 cpb_ = buf_;
524 #endif
525 cno_ = 0;
526 num_ = 0;
527 own_ = false;
528 eof_ = true;
529 mat_ = false;
530 }
531 return *this;
532 }
533
534 /// Returns nonzero capture index (i.e. true) if the entire input matches this matcher's pattern (and internally caches the true/false result to permit repeat invocations).
matches()535 inline size_t matches()
536 /// @returns nonzero capture index (i.e. true) if the entire input matched this matcher's pattern, zero (i.e. false) otherwise
537 {
538 if (!mat_ && at_bob())
539 mat_ = match(Const::MATCH) && at_end();
540 return mat_;
541 }
542 /// Returns a positive integer (true) indicating the capture index of the matched text in the pattern or zero (false) for a mismatch.
inline size_t accept() const
  /// @returns nonzero capture index of the match in the pattern, which may be matcher dependent, or zero for a mismatch, or Const::EMPTY for the empty last split
{
  // cap_ is zero after reset() and buffer(base, size)
  // NOTE(review): presumably assigned by the match() implementation of the derived matcher - confirm
  return cap_;
}
548 /// Returns pointer to the begin of the matched text (non-0-terminated), a constant-time operation, use with end() or use size() for text end/length.
inline const char *begin() const
  /// @returns const char* pointer to the matched text in the buffer
{
  // txt_ points into the buffer; the text is not 0-terminated unless text() was called
  return txt_;
}
554 /// Returns pointer to the exclusive end of the matched text, a constant-time operation.
end()555 inline const char *end() const
556 /// @returns const char* pointer to the exclusive end of the matched text in the buffer
557 {
558 return txt_ + len_;
559 }
560 /// Returns 0-terminated string of the text matched, does not include matched \0s, this is a constant-time operation.
text()561 inline const char *text()
562 /// @returns 0-terminated const char* string with text matched
563 {
564 if (chr_ == '\0')
565 {
566 chr_ = txt_[len_];
567 txt_[len_] = '\0';
568 }
569 return txt_;
570 }
571 /// Returns the text matched as a string, a copy of text(), may include matched \0s.
str()572 inline std::string str() const
573 /// @returns string with text matched
574 {
575 return std::string(txt_, len_);
576 }
577 /// Returns the match as a wide string, converted from UTF-8 text(), may include matched \0s.
wstr()578 inline std::wstring wstr() const
579 /// @returns wide string with text matched
580 {
581 return wcs(txt_, len_);
582 }
583 /// Returns the length of the matched text in number of bytes, including matched \0s, a constant-time operation.
inline size_t size() const
  /// @returns match size in bytes
{
  // len_ counts bytes, not (wide) characters; see wsize() for the latter
  return len_;
}
589 /// Returns the length of the matched text in number of wide characters.
wsize()590 inline size_t wsize() const
591 /// @returns the length of the match in number of wide (multibyte UTF-8) characters
592 {
593 size_t n = 0;
594 const char *e = txt_ + len_;
595 for (const char *s = txt_; s < e; ++s)
596 n += (*s & 0xC0) != 0x80;
597 return n;
598 }
599 /// Returns the first 8-bit character of the text matched.
inline int chr() const
  /// @returns 8-bit char
{
  // NOTE(review): *txt_ is a plain char converted to int, so where char is signed a byte >= 0x80 yields a negative value - confirm callers expect that
  return *txt_;
}
605 /// Returns the first wide character of the text matched.
wchr()606 inline int wchr() const
607 /// @returns wide char (UTF-8 converted to Unicode)
608 {
609 return utf8(txt_);
610 }
611 /// Set or change the starting line number of the last match.
lineno(size_t n)612 inline void lineno(size_t n) ///< new line number
613 {
614 if (lpb_ < txt_)
615 (void)lineno(); // update lno_ and bol_ (or cno_) before overriding lno_
616 lno_ = n;
617 }
618 /// Updates and returns the starting line number of the match in the input character sequence.
lineno()619 inline size_t lineno()
620 /// @returns line number
621 {
622 #if defined(WITH_SPAN)
623 if (lpb_ < txt_)
624 {
625 const char *s = lpb_;
626 const char *t = txt_;
627 size_t n = 0;
628 #if defined(HAVE_AVX512BW) && (!defined(_MSC_VER) || defined(_WIN64))
629 if (s + 63 > t && have_HW_AVX512BW())
630 {
631 n += simd_nlcount_avx512bw(s, t);
632 }
633 else if (s + 31 > t && have_HW_AVX2())
634 {
635 n += simd_nlcount_avx2(s, t);
636 }
637 else if (have_HW_SSE2())
638 {
639 __m128i vlcn = _mm_set1_epi8('\n');
640 while (s + 15 <= t)
641 {
642 __m128i vlcm = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
643 __m128i vlceq = _mm_cmpeq_epi8(vlcm, vlcn);
644 uint32_t mask = _mm_movemask_epi8(vlceq);
645 n += popcount(mask);
646 s += 16;
647 }
648 }
649 #elif defined(HAVE_AVX2)
650 if (s + 31 > t && have_HW_AVX2())
651 {
652 n += simd_nlcount_avx2(s, t);
653 }
654 else if (have_HW_SSE2())
655 {
656 __m128i vlcn = _mm_set1_epi8('\n');
657 while (s + 15 <= t)
658 {
659 __m128i vlcm = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
660 __m128i vlceq = _mm_cmpeq_epi8(vlcm, vlcn);
661 uint32_t mask = _mm_movemask_epi8(vlceq);
662 n += popcount(mask);
663 s += 16;
664 }
665 }
666 #elif defined(HAVE_SSE2)
667 if (have_HW_SSE2())
668 {
669 __m128i vlcn = _mm_set1_epi8('\n');
670 while (s + 15 <= t)
671 {
672 __m128i vlcm = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
673 __m128i vlceq = _mm_cmpeq_epi8(vlcm, vlcn);
674 uint32_t mask = _mm_movemask_epi8(vlceq);
675 n += popcount(mask);
676 s += 16;
677 }
678 }
679 #elif defined(HAVE_NEON)
680 {
681 // ARM AArch64/NEON SIMD optimized loop? - no code that runs faster than the code below?
682 }
683 #endif
684 uint32_t n0 = 0, n1 = 0, n2 = 0, n3 = 0;
685 // clang/gcc 4-way auto-vectorizable loop
686 while (s + 3 < t)
687 {
688 n0 += s[0] == '\n';
689 n1 += s[1] == '\n';
690 n2 += s[2] == '\n';
691 n3 += s[3] == '\n';
692 s += 4;
693 }
694 n += n0 + n1 + n2 + n3;
695 // epilogue
696 if (s < t)
697 {
698 n += *s == '\n';
699 if (++s < t)
700 {
701 n += *s == '\n';
702 if (++s < t)
703 n += *s == '\n';
704 }
705 }
706 // if newlines are detected, then find begin of the last line to adjust bol
707 if (n > 0)
708 {
709 lno_ += n;
710 s = lpb_;
711 // clang/gcc 4-way auto-vectorizable loop
712 while (t - 4 >= s)
713 {
714 if ((t[-1] == '\n') | (t[-2] == '\n') | (t[-3] == '\n') | (t[-4] == '\n'))
715 break;
716 t -= 4;
717 }
718 // epilogue
719 if (--t >= s && *t != '\n')
720 if (--t >= s && *t != '\n')
721 if (--t >= s && *t != '\n')
722 --t;
723 bol_ = t + 1;
724 cpb_ = bol_;
725 cno_ = 0;
726 }
727 lpb_ = txt_;
728 }
729 #else
730 size_t n = lno_;
731 size_t k = cno_;
732 const char *s = lpb_;
733 const char *e = txt_;
734 while (s < e)
735 {
736 if (*s == '\n')
737 {
738 ++n;
739 k = 0;
740 }
741 else if (*s == '\t')
742 {
743 // count tab spacing
744 k += 1 + (~k & (opt_.T - 1));
745 }
746 else
747 {
748 // count column offset in UTF-8 chars
749 k += ((*s & 0xC0) != 0x80);
750 }
751 ++s;
752 }
753 lpb_ = e;
754 lno_ = n;
755 cno_ = k;
756 #endif
757 return lno_;
758 }
759 /// Returns the number of lines that the match spans.
lines()760 inline size_t lines()
761 /// @returns number of lines
762 {
763 size_t n = 1;
764 const char *e = txt_ + len_;
765 for (const char *s = txt_; s < e; ++s)
766 n += (*s == '\n');
767 return n;
768 }
769 /// Returns the inclusive ending line number of the match in the input character sequence.
lineno_end()770 inline size_t lineno_end()
771 /// @returns line number
772 {
773 return lineno() + lines() - 1;
774 }
775 /// Updates and returns the starting column number of the matched text, taking tab spacing into account and counting wide characters as one character each
columno()776 inline size_t columno()
777 /// @returns column number
778 {
779 (void)lineno();
780 #if defined(WITH_SPAN)
781 const char *s = cpb_;
782 const char *e = txt_;
783 size_t k = cno_;
784 size_t m = opt_.T - 1;
785 while (s < e)
786 {
787 if (*s == '\t')
788 k += 1 + (~k & m); // count tab spacing
789 else
790 k += ((*s & 0xC0) != 0x80); // count column offset in UTF-8 chars
791 ++s;
792 }
793 cpb_ = txt_;
794 cno_ = k;
795 #endif
796 return cno_;
797 }
798 /// Returns the number of columns of the matched text, taking tab spacing into account and counting wide characters as one character each.
columns()799 inline size_t columns()
800 /// @returns number of columns
801 {
802 // count columns in tabs and UTF-8 chars
803 #if defined(WITH_SPAN)
804 const char *s = txt_;
805 const char *e = txt_ + len_;
806 size_t n = columno();
807 size_t k = n;
808 while (s < e)
809 {
810 if (*s == '\t')
811 k += 1 + (~k & (opt_.T - 1)); // count tab spacing
812 else if (*s != '\r' && *s != '\n')
813 k += ((*s & 0xC0) != 0x80); // count column offset in UTF-8 chars
814 ++s;
815 }
816 return k - n;
817 #else
818 size_t n = cno_;
819 size_t m = 0;
820 const char *s;
821 const char *t = buf_;
822 for (s = txt_ + len_ - 1; s >= t; --s)
823 {
824 if (*s == '\n')
825 {
826 n = 0;
827 break;
828 }
829 }
830 t = txt_;
831 const char *e = txt_ + len_;
832 for (++s; s < e; ++s)
833 {
834 if (s == t)
835 m = n;
836 if (*s == '\t')
837 n += 1 + (~n & (opt_.T - 1));
838 else
839 n += (*s & 0xC0) != 0x80;
840 }
841 return n - m;
842 #endif
843 }
844 #if defined(WITH_SPAN)
845 /// Returns the inclusive ending column number of the matched text on the ending matching line, taking tab spacing into account and counting wide characters as one character each
columno_end()846 inline size_t columno_end()
847 /// @returns column number
848 {
849 if (len_ == 0)
850 return columno();
851 (void)lineno();
852 const char *e = txt_ + len_;
853 const char *s = e;
854 const char *b = bol_;
855 while (--s >= b)
856 if (*s == '\n')
857 break;
858 size_t k = 0;
859 while (++s < e)
860 {
861 if (*s == '\t')
862 k += 1 + (~k & (opt_.T - 1));
863 else
864 k += (*s & 0xC0) != 0x80;
865 }
866 return k > 0 ? k - 1 : 0;
867 }
868 #endif
869 /// Returns std::pair<size_t,std::string>(accept(), str()), useful for tokenizing input into containers of pairs.
pair()870 inline std::pair<size_t,std::string> pair() const
871 /// @returns std::pair<size_t,std::string>(accept(), str())
872 {
873 return std::pair<size_t,std::string>(accept(), str());
874 }
875 /// Returns std::pair<size_t,std::wstring>(accept(), wstr()), useful for tokenizing input into containers of pairs.
wpair()876 inline std::pair<size_t,std::wstring> wpair() const
877 /// @returns std::pair<size_t,std::wstring>(accept(), wstr())
878 {
879 return std::pair<size_t,std::wstring>(accept(), wstr());
880 }
881 /// Returns the position of the first character of the match in the input character sequence, a constant-time operation.
first()882 inline size_t first() const
883 /// @returns position in the input character sequence
884 {
885 return num_ + txt_ - buf_;
886 }
887 /// Returns the exclusive position of the last character of the match in the input character sequence, a constant-time operation.
last()888 inline size_t last() const
889 /// @returns position in the input character sequence
890 {
891 return first() + size();
892 }
893 /// Returns true if this matcher is at the start of a buffer to read an input character sequence. Use reset() to restart reading new input.
inline bool at_bob() const
  /// @returns true if at the begin of an input sequence
{
  // got_ holds the Const::BOB marker after reset() and buffer(base, size), and after set_bob(true)
  return got_ == Const::BOB;
}
899 /// Set/reset the begin of a buffer state.
set_bob(bool bob)900 inline void set_bob(bool bob) ///< if true: set begin of buffer state
901 {
902 if (bob)
903 got_ = Const::BOB;
904 else if (got_ == Const::BOB)
905 got_ = Const::UNK;
906 }
907 /// Returns true if this matcher has no more input to read from the input character sequence.
at_end()908 inline bool at_end()
909 /// @returns true if at end of input and a read attempt will produce EOF
910 {
911 return pos_ >= end_ && (eof_ || peek() == EOF);
912 }
913 /// Returns true if this matcher hit the end of the input character sequence.
hit_end()914 inline bool hit_end() const
915 /// @returns true if EOF was hit (and possibly more input would have changed the result), false otherwise (but next read attempt may return EOF immediately)
916 {
917 return pos_ >= end_ && eof_;
918 }
919 /// Set and force the end of input state.
set_end(bool eof)920 inline void set_end(bool eof)
921 {
922 if (eof)
923 flush();
924 if (own_)
925 eof_ = eof;
926 }
927 /// Returns true if this matcher reached the begin of a new line.
at_bol()928 inline bool at_bol() const
929 /// @returns true if at begin of a new line
930 {
931 return got_ == Const::BOB || got_ == '\n';
932 }
933 /// Set/reset the begin of a new line state.
set_bol(bool bol)934 inline void set_bol(bool bol) ///< if true: set begin of a new line state
935 {
936 if (bol)
937 got_ = '\n';
938 else if (got_ == '\n')
939 got_ = Const::UNK;
940 }
941 /// Returns true if this matcher matched text that begins a word.
at_bow()942 inline bool at_bow()
943 /// @returns true if this matcher matched text that begins a word
944 {
945 return !isword(got_) && isword(txt_ < buf_ + end_ ? static_cast<unsigned char>(*txt_) : peek_more());
946 }
947 /// Returns true if this matcher matched text that ends a word.
at_eow()948 inline bool at_eow()
949 /// @returns true if this matcher matched text that ends a word
950 {
951 return isword(got_) && !isword(txt_ < buf_ + end_ ? static_cast<unsigned char>(*txt_) : peek_more());
952 }
953 /// Returns the next 8-bit character (unsigned char 0..255 or EOF) from the input character sequence, while preserving the current text() match (but pointer returned by text() may change; warning: does not preserve the yytext string pointer when options --flex and --bison are used).
input()954 int input()
955 /// @returns the next character (unsigned char 0..255) from input or EOF (-1)
956 {
957 DBGLOG("AbstractMatcher::input() pos = %zu end = %zu", pos_, end_);
958 if (pos_ < end_)
959 {
960 if (chr_ != '\0' && buf_ + pos_ == txt_ + len_)
961 got_ = chr_;
962 else
963 got_ = static_cast<unsigned char>(buf_[pos_]);
964 ++pos_;
965 }
966 else
967 {
968 #if defined(WITH_FAST_GET)
969 got_ = get_more();
970 #else
971 got_ = get();
972 #endif
973 }
974 cur_ = pos_;
975 return got_;
976 }
977 /// Returns the next wide character (unsigned 0..U+10FFFF or EOF) from the input character sequence, while preserving the current text() match (but pointer returned by text() may change; warning: does not preserve the yytext string pointer when options --flex and --bison are used).
winput()978 int winput()
979 /// @returns the next wide character (unsigned 0..U+10FFFF) or EOF (-1)
980 {
981 DBGLOG("AbstractMatcher::winput()");
982 char tmp[8] = { 0 }, *s = tmp;
983 int c;
984 if ((c = input()) == EOF)
985 return EOF;
986 if (static_cast<unsigned char>(*s++ = c) >= 0x80)
987 {
988 while (((++*s = get()) & 0xC0) == 0x80)
989 continue;
990 got_ = static_cast<unsigned char>(buf_[cur_ = --pos_]);
991 }
992 return utf8(tmp);
993 }
994 /// Put back one character (8-bit) on the input character sequence for matching, DANGER: invalidates the previous text() pointer and match info, unput is not honored when matching in-place using buffer(base, size) and nothing has been read yet.
unput(char c)995 void unput(char c) ///< 8-bit character to put back
996 {
997 DBGLOG("AbstractMatcher::unput()");
998 reset_text();
999 if (pos_ > 0)
1000 {
1001 --pos_;
1002 }
1003 else if (own_)
1004 {
1005 txt_ = buf_;
1006 len_ = 0;
1007 if (end_ + 1 >= max_)
1008 (void)grow();
1009 std::memmove(buf_ + 1, buf_, end_);
1010 ++end_;
1011 }
1012 buf_[pos_] = c;
1013 cur_ = pos_;
1014 }
1015 /// Put back one (wide) character on the input character sequence for matching, DANGER: invalidates the previous text() pointer and match info, unput is not honored when matching in-place using buffer(base, size) and nothing has been read yet.
wunput(int c)1016 void wunput(int c) ///< character to put back
1017 {
1018 DBGLOG("AbstractMatcher::wunput()");
1019 char tmp[8];
1020 size_t n = utf8(c, tmp);
1021 if (pos_ >= n)
1022 {
1023 pos_ -= n;
1024 }
1025 else if (own_)
1026 {
1027 txt_ = buf_;
1028 len_ = 0;
1029 if (end_ + n >= max_)
1030 (void)grow();
1031 std::memmove(buf_ + n, buf_, end_);
1032 end_ += n;
1033 }
1034 std::memcpy(&buf_[pos_], tmp, n);
1035 cur_ = pos_;
1036 }
1037 /// Peek at the next character available for reading from the current input source.
peek()1038 inline int peek()
1039 /// @returns the character (unsigned char 0..255) or EOF (-1)
1040 {
1041 DBGLOG("AbstractMatcher::peek()");
1042 #if defined(WITH_FAST_GET)
1043 return pos_ < end_ ? static_cast<unsigned char>(buf_[pos_]) : peek_more();
1044 #else
1045 if (pos_ < end_)
1046 return static_cast<unsigned char>(buf_[pos_]);
1047 if (eof_)
1048 return EOF;
1049 while (true)
1050 {
1051 if (end_ + blk_ + 1 >= max_)
1052 (void)grow();
1053 end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1054 if (pos_ < end_)
1055 return static_cast<unsigned char>(buf_[pos_]);
1056 DBGLOGN("peek(): EOF");
1057 if (!wrap())
1058 {
1059 eof_ = true;
1060 return EOF;
1061 }
1062 }
1063 #endif
1064 }
1065 #if defined(WITH_SPAN)
1066 /// Returns pointer to the begin of the line in the buffer containing the matched text.
bol()1067 inline const char *bol()
1068 /// @returns pointer to the begin of line
1069 {
1070 (void)lineno();
1071 return bol_;
1072 }
1073 /// Returns pointer to the end of the line (last char + 1) in the buffer containing the matched text, DANGER: invalidates previous bol() and text() pointers, use eol() before bol(), text(), begin(), and end() when those are used.
1074 inline const char *eol(bool inclusive = false) ///< true if inclusive, i.e. point after \n
1075 /// @returns pointer to the end of line
1076 {
1077 if (chr_ == '\n' || (txt_ + len_ < buf_ + end_ && txt_[len_] == '\n'))
1078 return txt_ + len_ + inclusive;
1079 size_t loc = pos_;
1080 while (true)
1081 {
1082 if (loc < end_)
1083 {
1084 char *s = static_cast<char*>(std::memchr(buf_ + loc, '\n', end_ - loc));
1085 if (s != NULL)
1086 return s + inclusive;
1087 }
1088 if (eof_)
1089 break;
1090 (void)grow();
1091 loc = end_;
1092 end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1093 if (loc >= end_ && !wrap())
1094 {
1095 eof_ = true;
1096 break;
1097 }
1098 }
1099 return buf_ + end_;
1100 }
1101 /// Returns the number of bytes in the buffer available to search from the current begin()/text() position.
avail()1102 size_t avail()
1103 {
1104 if (peek() == EOF)
1105 return 0;
1106 return end_ - (txt_ - buf_);
1107 }
1108 /// Returns the byte offset of the match from the start of the line.
border()1109 size_t border()
1110 /// @returns border offset
1111 {
1112 return txt_ - bol();
1113 }
1114 /// Enlarge the match to span the entire line of input (excluding \n), return text().
span()1115 const char *span()
1116 /// @returns const char* span of text for the entire line
1117 {
1118 DBGLOG("AbstractMatcher::span()");
1119 (void)lineno();
1120 len_ += txt_ - bol_;
1121 txt_ = const_cast<char*>(bol_); // requires ugly cast
1122 if (chr_ == '\n')
1123 return txt_;
1124 reset_text();
1125 const char *e = eol();
1126 set_current(e - buf_);
1127 len_ = e - bol_;
1128 return text();
1129 }
1130 /// Returns the line of input (excluding \n) as a string containing the matched text as a substring.
line()1131 std::string line()
1132 /// @returns matching line as a string
1133 {
1134 DBGLOG("AbstractMatcher::line()");
1135 reset_text();
1136 const char *e = eol(); // warning: must call eol() before bol()
1137 const char *b = bol();
1138 return std::string(b, e - b);
1139 }
1140 /// Returns the line of input (excluding \n) as a wide string containing the matched text as a substring.
wline()1141 std::wstring wline()
1142 /// @returns matching line as a wide string
1143 {
1144 DBGLOG("AbstractMatcher::wline()");
1145 reset_text();
1146 const char *e = eol(); // warning: must call eol() before bol()
1147 const char *b = bol();
1148 while (b < e && (*b & 0xC0) == 0x80) // make sure we advance forward to valid UTF-8
1149 ++b;
1150 return wcs(b, e - b);
1151 }
1152 #endif
1153 /// Skip input until the specified ASCII character is consumed and return true, or EOF is reached and return false.
skip(char c)1154 bool skip(char c) ///< ASCII character to skip to
1155 /// @returns true if skipped to c, false if EOF is reached
1156 {
1157 DBGLOG("AbstractMatcher::skip()");
1158 reset_text();
1159 len_ = 0;
1160 while (true)
1161 {
1162 txt_ = static_cast<char*>(std::memchr(buf_ + pos_, c, end_ - pos_));
1163 if (txt_ != NULL)
1164 {
1165 ++txt_;
1166 set_current(txt_ - buf_);
1167 return true;
1168 }
1169 if (eof_)
1170 break;
1171 pos_ = cur_ = end_;
1172 txt_ = buf_ + end_;
1173 (void)grow();
1174 end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1175 if (pos_ >= end_ && !wrap())
1176 {
1177 eof_ = true;
1178 break;
1179 }
1180 }
1181 set_current(end_);
1182 return false;
1183 }
1184 /// Skip input until the specified Unicode character is consumed and return true, or EOF is reached and return false.
skip(wchar_t c)1185 bool skip(wchar_t c) ///< Unicode character to skip to
1186 /// @returns true if skipped to c, false if EOF is reached
1187 {
1188 char s[8];
1189 size_t n = utf8(c, s);
1190 s[n] = '\0';
1191 return skip(s);
1192 }
1193 /// Skip input until the specified literal UTF-8 string is consumed and return true, or EOF is reached and return false.
skip(const char * s)1194 bool skip(const char *s) ///< literal UTF-8 string to skip to
1195 /// @returns true if skipped to c, false if EOF is reached
1196 {
1197 if (s == NULL || s[0] == '\0')
1198 return true;
1199 if (s[1] == '\0')
1200 return skip(s[0]);
1201 while (skip(s[0]))
1202 {
1203 const char *t = s + 1;
1204 while (true)
1205 {
1206 if (*t == '\0')
1207 {
1208 set_current(pos_);
1209 return true;
1210 }
1211 int c = get();
1212 if (c == EOF)
1213 return false;
1214 if (c != static_cast<unsigned char>(*t))
1215 break;
1216 ++t;
1217 }
1218 pos_ = txt_ - buf_;
1219 }
1220 return false;
1221 }
1222 /// Fetch the rest of the input as text, useful for searching/splitting up to n times after which the rest is needed.
rest()1223 const char *rest()
1224 /// @returns const char* string of the remaining input (wrapped with more input when AbstractMatcher::wrap is defined)
1225 {
1226 DBGLOG("AbstractMatcher::rest()");
1227 reset_text();
1228 cur_ = pos_;
1229 txt_ = buf_ + cur_;
1230 while (!eof_)
1231 {
1232 (void)grow();
1233 pos_ = end_;
1234 end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1235 if (pos_ >= end_ && !wrap())
1236 eof_ = true;
1237 }
1238 len_ = end_ - cur_;
1239 pos_ = cur_ = end_;
1240 DBGLOGN("rest() length = %zu", len_);
1241 return text();
1242 }
1243 /// Append the next match to the currently matched text returned by AbstractMatcher::text, when the next match found is adjacent to the current match.
more()1244 void more()
1245 {
1246 cur_ = txt_ - buf_;
1247 }
1248 /// Truncate the AbstractMatcher::text length of the match to n characters in length and reposition for next match.
less(size_t n)1249 void less(size_t n) ///< truncated string length
1250 {
1251 if (n < len_)
1252 {
1253 DBGCHK(pos_ < max_);
1254 reset_text();
1255 pos_ = txt_ - buf_ + n;
1256 DBGCHK(pos_ < max_);
1257 len_ = n;
1258 cur_ = pos_;
1259 }
1260 }
1261 /// Cast this matcher to positive integer indicating the nonzero capture index of the matched text in the pattern, same as AbstractMatcher::accept.
size_t()1262 operator size_t() const
1263 /// @returns nonzero capture index of a match, which may be matcher dependent, or zero for a mismatch
1264 {
1265 return accept();
1266 }
1267 /// Cast this matcher to a std::string of the text matched by this matcher.
string()1268 operator std::string() const
1269 /// @returns std::string with matched text
1270 {
1271 return str();
1272 }
1273 /// Cast this matcher to a std::wstring of the text matched by this matcher.
wstring()1274 operator std::wstring() const
1275 /// @returns std::wstring converted to UCS from the 0-terminated matched UTF-8 text
1276 {
1277 return wstr();
1278 }
1279 /// Cast the match to std::pair<size_t,std::wstring>(accept(), wstr()), useful for tokenization into containers.
1280 operator std::pair<size_t,std::string>() const
1281 /// @returns std::pair<size_t,std::wstring>(accept(), wstr())
1282 {
1283 return pair();
1284 }
1285 /// Returns true if matched text is equal to a string, useful for std::algorithm.
1286 bool operator==(const char *rhs) ///< rhs string to compare to
1287 /// @returns true if matched text is equal to rhs string
1288 const
1289 {
1290 return std::strncmp(rhs, txt_, len_) == 0 && rhs[len_] == '\0';
1291 }
1292 /// Returns true if matched text is equalt to a string, useful for std::algorithm.
1293 bool operator==(const std::string& rhs) ///< rhs string to compare to
1294 /// @returns true if matched text is equal to rhs string
1295 const
1296 {
1297 return rhs.size() == len_ && rhs.compare(0, std::string::npos, txt_, len_) == 0;
1298 }
1299 /// Returns true if capture index is equal to a given size_t value, useful for std::algorithm.
1300 bool operator==(size_t rhs) ///< capture index to compare accept() to
1301 /// @returns true if capture index is equal to rhs
1302 const
1303 {
1304 return accept() == rhs;
1305 }
1306 /// Returns true if capture index is equal to a given int value, useful for std::algorithm.
1307 bool operator==(int rhs) ///< capture index to compare accept() to
1308 /// @returns true if capture index is equal to rhs
1309 const
1310 {
1311 return static_cast<int>(accept()) == rhs;
1312 }
1313 /// Returns true if matched text is not equal to a string, useful for std::algorithm.
1314 bool operator!=(const char *rhs) ///< rhs string to compare to
1315 /// @returns true if matched text is not equal to rhs string
1316 const
1317 {
1318 return std::strncmp(rhs, txt_, len_) != 0 || rhs[len_] != '\0'; // if static checkers complain here, they are wrong
1319 }
1320 /// Returns true if matched text is not equal to a string, useful for std::algorithm.
1321 bool operator!=(const std::string& rhs) ///< rhs string to compare to
1322 /// @returns true if matched text is not equal to rhs string
1323 const
1324 {
1325 return rhs.size() > len_ || rhs.compare(0, std::string::npos, txt_, len_) != 0;
1326 }
1327 /// Returns true if capture index is not equal to a given size_t value, useful for std::algorithm.
1328 bool operator!=(size_t rhs) ///< capture index to compare accept() to
1329 /// @returns true if capture index is not equal to rhs
1330 const
1331 {
1332 return accept() != rhs;
1333 }
1334 /// Returns true if capture index is not equal to a given int value, useful for std::algorithm.
1335 bool operator!=(int rhs) ///< capture index to compare accept() to
1336 /// @returns true if capture index is not equal to rhs
1337 const
1338 {
1339 return static_cast<int>(accept()) != rhs;
1340 }
1341 /// Returns captured text as a std::pair<const char*,size_t> with string pointer (non-0-terminated) and length.
1342 virtual std::pair<const char*,size_t> operator[](size_t n)
1343 /// @returns std::pair of string pointer and length in the captured text, where [0] returns std::pair(begin(), size())
1344 const = 0;
1345 /// Returns the group capture identifier containing the group capture index >0 and name (or NULL) of a named group capture, or (1,NULL) by default
1346 virtual std::pair<size_t,const char*> group_id()
1347 /// @returns a pair of size_t and string
1348 = 0;
1349 /// Returns the next group capture identifier containing the group capture index >0 and name (or NULL) of a named group capture, or (0,NULL) when no more groups matched
1350 virtual std::pair<size_t,const char*> group_next_id()
1351 /// @returns a pair of size_t and string
1352 = 0;
1353 /// Set tab size 1, 2, 4, or 8
tabs(char n)1354 void tabs(char n) ///< tab size 1, 2, 4, or 8
1355 {
1356 opt_.T = n & 0xf;
1357 }
1358 /// Returns current tab size 1, 2, 4, or 8.
tabs()1359 char tabs()
1360 {
1361 return opt_.T;
1362 }
1363 Operation scan; ///< functor to scan input (to tokenize input)
1364 Operation find; ///< functor to search input
1365 Operation split; ///< functor to split input
1366 Input in; ///< input character sequence being matched by this matcher
1367 protected:
1368 /// Initialize the base abstract matcher at construction.
1369 virtual void init(const char *opt = NULL) ///< options
1370 {
1371 DBGLOG("AbstractMatcher::init(%s)", opt ? opt : "");
1372 own_ = false; // require allocation of a buffer
1373 reset(opt);
1374 }
1375 /// The abstract match operation implemented by pattern matching engines derived from AbstractMatcher.
1376 virtual size_t match(Method method)
1377 /// @returns nonzero when input matched the pattern using method Const::SCAN, Const::FIND, Const::SPLIT, or Const::MATCH
1378 = 0;
1379 /// Shift or expand the internal buffer when it is too small to accommodate more input, where the buffer size is doubled when needed, change cur_, pos_, end_, max_, ind_, buf_, bol_, lpb_, and txt_. Pointers into the buffer obtained before the call are invalid when this returns true.
1380 inline bool grow(size_t need = Const::BLOCK) ///< optional needed space = Const::BLOCK size by default
1381 /// @returns true if buffer was shifted or enlarged
1382 {
// nothing to do when at least need + 1 bytes are free behind the buffered input
1383 if (max_ - end_ >= need + 1)
1384 return false;
1385 #if defined(WITH_SPAN)
// WITH_SPAN: the buffer is kept from the begin of the current line (bol_), everything before it is shifted out
1386 (void)lineno();
1387 cno_ = 0;
1388 if (bol_ + Const::BUFSZ - buf_ < txt_ - bol_ && evh_ == NULL)
1389 {
1390 // this line is very long, so shift all the way to the match instead of to the begin of the last line
1391 DBGLOG("Line in buffer is too long to shift, moving bol position to text match position");
1392 (void)columno();
1393 bol_ = txt_;
1394 }
// gap is the number of bytes before bol_ that are removed from the front of the buffer
1395 size_t gap = bol_ - buf_;
1396 if (gap > 0)
1397 {
// the event handler (when set) is invoked with the bytes that are about to be shifted out
1398 if (evh_ != NULL)
1399 (*evh_)(*this, buf_, gap, num_);
// all positions and pointers into the buffer move down by gap, num_ accounts for the removed bytes
1400 cur_ -= gap;
1401 ind_ -= gap;
1402 pos_ -= gap;
1403 end_ -= gap;
1404 txt_ -= gap;
1405 bol_ -= gap;
1406 lpb_ -= gap;
1407 num_ += gap;
1408 std::memmove(buf_, buf_ + gap, end_);
1409 }
1410 if (max_ - end_ >= need)
1411 {
1412 DBGLOG("Shift buffer to close gap of %zu bytes", gap);
1413 }
1414 else
1415 {
// shifting did not free enough space: double max_ until end_ + need fits
// NOTE(review): the doubling loop assumes max_ > 0 - TODO confirm against the buffer setup code
1416 size_t newmax = end_ + need;
1417 while (max_ < newmax)
1418 max_ *= 2;
1419 DBGLOG("Expand buffer to %zu bytes", max_);
1420 #if defined(WITH_REALLOC)
1421 #if (defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(__BORLANDC__)) && !defined(__CYGWIN__)
1422 char *newbuf = static_cast<char*>(_aligned_realloc(static_cast<void*>(buf_), max_, 4096));
1423 #else
1424 char *newbuf = static_cast<char*>(std::realloc(static_cast<void*>(buf_), max_));
1425 #endif
// NOTE(review): max_ has already been enlarged when this throws, while buf_ still refers to the old allocation
1426 if (newbuf == NULL)
1427 throw std::bad_alloc();
1428 #else
1429 char *newbuf = new char[max_];
1430 std::memcpy(newbuf, buf_, end_);
1431 delete[] buf_;
1432 #endif
// rebase the pointers into the old buffer onto the new buffer
1433 txt_ = newbuf + (txt_ - buf_);
1434 lpb_ = newbuf + (lpb_ - buf_);
1435 buf_ = newbuf;
1436 }
// the begin of line and the column pointer are now at the start of the buffer
1437 bol_ = buf_;
1438 cpb_ = buf_;
1439 #else
// without WITH_SPAN: everything before the matched text (txt_) can be removed from the front of the buffer
1440 size_t gap = txt_ - buf_;
1441 if (max_ - end_ + gap >= need)
1442 {
// removing the gap frees enough space: shift the buffer contents down to the start
1443 DBGLOG("Shift buffer to close gap of %zu bytes", gap);
1444 (void)lineno();
1445 cur_ -= gap;
1446 ind_ -= gap;
1447 pos_ -= gap;
1448 end_ -= gap;
1449 num_ += gap;
1450 if (end_ > 0)
1451 std::memmove(buf_, txt_, end_);
1452 txt_ = buf_;
1453 lpb_ = buf_;
1454 }
1455 else
1456 {
// not enough even without the gap: double max_ until the remaining input plus need fits
// NOTE(review): the doubling loop assumes max_ > 0 - TODO confirm against the buffer setup code
1457 size_t newmax = end_ - gap + need;
1458 size_t oldmax = max_;
1459 while (max_ < newmax)
1460 max_ *= 2;
// the buffer is only shifted and reallocated when max_ actually increased
1461 if (oldmax < max_)
1462 {
1463 DBGLOG("Expand buffer from %zu to %zu bytes", oldmax, max_);
1464 (void)lineno();
1465 cur_ -= gap;
1466 ind_ -= gap;
1467 pos_ -= gap;
1468 end_ -= gap;
1469 num_ += gap;
1470 #if defined(WITH_REALLOC)
// close the gap in place first, then enlarge the allocation
1471 std::memmove(buf_, txt_, end_);
1472 #if (defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(__BORLANDC__)) && !defined(__CYGWIN__)
1473 char *newbuf = static_cast<char*>(_aligned_realloc(static_cast<void*>(buf_), max_, 4096));
1474 #else
1475 char *newbuf = static_cast<char*>(std::realloc(static_cast<void*>(buf_), max_));
1476 #endif
1477 if (newbuf == NULL)
1478 throw std::bad_alloc();
1479 #else
// copy the input from the matched text onward into the new buffer, which closes the gap at the same time
1480 char *newbuf = new char[max_];
1481 std::memcpy(newbuf, txt_, end_);
1482 delete[] buf_;
1483 #endif
1484 buf_ = newbuf;
1485 txt_ = buf_;
1486 lpb_ = buf_;
1487 }
1488 }
1489 #endif
1490 return true;
1491 }
1492 /// Returns the next character read from the current input source.
get()1493 inline int get()
1494 /// @returns the character read (unsigned char 0..255) or EOF (-1)
1495 {
1496 DBGLOG("AbstractMatcher::get()");
1497 #if defined(WITH_FAST_GET)
1498 return pos_ < end_ ? static_cast<unsigned char>(buf_[pos_++]) : get_more();
1499 #else
1500 if (pos_ < end_)
1501 return static_cast<unsigned char>(buf_[pos_++]);
1502 if (eof_)
1503 return EOF;
1504 while (true)
1505 {
1506 if (end_ + blk_ + 1 >= max_)
1507 (void)grow();
1508 end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1509 if (pos_ < end_)
1510 return static_cast<unsigned char>(buf_[pos_++]);
1511 DBGLOGN("get(): EOF");
1512 if (!wrap())
1513 {
1514 eof_ = true;
1515 return EOF;
1516 }
1517 }
1518 #endif
1519 }
1520 /// Reset the matched text by removing the terminating \0, which is needed to search for a new match.
reset_text()1521 inline void reset_text()
1522 {
1523 if (chr_ != '\0')
1524 {
1525 txt_[len_] = chr_;
1526 chr_ = '\0';
1527 }
1528 }
1529 /// Set the current position in the buffer for the next match.
set_current(size_t loc)1530 inline void set_current(size_t loc) ///< new location in buffer
1531 {
1532 DBGCHK(loc <= end_);
1533 pos_ = cur_ = loc;
1534 #if defined(WITH_SPAN)
1535 got_ = loc > 0 ? static_cast<unsigned char>(buf_[loc - 1]) : '\n';
1536 #else
1537 got_ = loc > 0 ? static_cast<unsigned char>(buf_[loc - 1]) : Const::UNK;
1538 #endif
1539 }
1540 /// Set the current match position in the buffer.
set_current_match(size_t loc)1541 inline void set_current_match(size_t loc) ///< new location in buffer
1542 {
1543 set_current(loc);
1544 txt_ = buf_ + cur_;
1545 }
1546 /// Get the next character and grow the buffer to make more room if necessary.
get_more()1547 inline int get_more()
1548 /// @returns the character read (unsigned char 0..255) or EOF (-1)
1549 {
1550 DBGLOG("AbstractMatcher::get_more()");
1551 if (eof_)
1552 return EOF;
1553 while (true)
1554 {
1555 if (end_ + blk_ + 1 >= max_)
1556 (void)grow();
1557 end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1558 if (pos_ < end_)
1559 return static_cast<unsigned char>(buf_[pos_++]);
1560 DBGLOGN("get_more(): EOF");
1561 if (!wrap())
1562 {
1563 eof_ = true;
1564 return EOF;
1565 }
1566 }
1567 }
1568 /// Peek at the next character and grow the buffer to make more room if necessary.
peek_more()1569 inline int peek_more()
1570 /// @returns the character (unsigned char 0..255) or EOF (-1)
1571 {
1572 DBGLOG("AbstractMatcher::peek_more()");
1573 if (eof_)
1574 return EOF;
1575 while (true)
1576 {
1577 if (end_ + blk_ + 1 >= max_)
1578 (void)grow();
1579 end_ += get(buf_ + end_, blk_ > 0 ? blk_ : max_ - end_ - 1);
1580 if (pos_ < end_)
1581 return static_cast<unsigned char>(buf_[pos_]);
1582 DBGLOGN("peek_more(): EOF");
1583 if (!wrap())
1584 {
1585 eof_ = true;
1586 return EOF;
1587 }
1588 }
1589 }
1590 Option opt_; ///< options for matcher engines
1591 char *buf_; ///< input character sequence buffer
1592 char *txt_; ///< points to the matched text in buffer AbstractMatcher::buf_
1593 size_t len_; ///< size of the matched text
1594 size_t cap_; ///< nonzero capture index of an accepted match or zero
1595 size_t cur_; ///< next position in AbstractMatcher::buf_ to assign to AbstractMatcher::txt_
1596 size_t pos_; ///< position in AbstractMatcher::buf_ after AbstractMatcher::txt_
1597 size_t end_; ///< ending position of the input buffered in AbstractMatcher::buf_
1598 size_t max_; ///< total buffer size and max position + 1 to fill
1599 size_t ind_; ///< current indent position
1600 size_t blk_; ///< block size for block-based input reading, as set by AbstractMatcher::buffer
1601 int got_; ///< last unsigned character we looked at (to determine anchors and boundaries)
1602 int chr_; ///< the character located at AbstractMatcher::txt_[AbstractMatcher::len_]
1603 #if defined(WITH_SPAN)
1604 const char *bol_; ///< begin of line pointer in buffer
1605 Handler *evh_; ///< event handler functor to invoke when buffer contents are shifted out
1606 #endif
1607 const char *lpb_; ///< line pointer in buffer, updated when counting line numbers with lineno()
1608 size_t lno_; ///< line number count (cached)
1609 #if defined(WITH_SPAN)
1610 const char *cpb_; ///< column pointer in buffer, updated when counting column numbers with columno()
1611 #endif
1612 size_t cno_; ///< column number count (cached)
1613 size_t num_; ///< character count of the input till bol_
1614 bool own_; ///< true if AbstractMatcher::buf_ was allocated and should be deleted
1615 bool eof_; ///< input has reached EOF
1616 bool mat_; ///< true if AbstractMatcher::matches() was successful
1617 };
1618
1619 /// The pattern matcher class template extends abstract matcher base class.
1620 template<typename P> /// @tparam <P> pattern class to instantiate a matcher
1621 class PatternMatcher : public AbstractMatcher {
1622 public:
1623 typedef P Pattern; ///< pattern class of this matcher, a typedef of the PatternMatcher template parameter
1624 /// Copy constructor, the underlying pattern object is shared (not deep copied).
PatternMatcher(const PatternMatcher & matcher)1625 PatternMatcher(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
1626 :
1627 AbstractMatcher(matcher.in, matcher.opt_),
1628 pat_(matcher.pat_),
1629 own_(false)
1630 {
1631 DBGLOG("PatternMatcher::PatternMatcher(matcher)");
1632 }
1633 /// Delete matcher, deletes pattern when owned
~PatternMatcher()1634 virtual ~PatternMatcher()
1635 {
1636 DBGLOG("PatternMatcher::~PatternMatcher()");
1637 if (own_ && pat_ != NULL)
1638 delete pat_;
1639 }
1640 /// Assign a matcher, the underlying pattern object is shared (not deep copied).
1641 PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
1642 {
1643 scan.init(this, Const::SCAN);
1644 find.init(this, Const::FIND);
1645 split.init(this, Const::SPLIT);
1646 in = matcher.in;
1647 reset();
1648 opt_ = matcher.opt_;
1649 pat_ = matcher.pat_,
1650 own_ = false;
1651 return *this;
1652 }
1653 /// Set the pattern to use with this matcher as a shared pointer to another matcher pattern.
pattern(const PatternMatcher & matcher)1654 virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher
1655 /// @returns this matcher
1656 {
1657 opt_ = matcher.opt_;
1658 return this->pattern(matcher.pattern());
1659 }
1660 /// Set the pattern to use with this matcher (the given pattern is shared and must be persistent).
pattern(const Pattern & pattern)1661 virtual PatternMatcher& pattern(const Pattern& pattern) ///< pattern object for this matcher
1662 /// @returns this matcher
1663 {
1664 DBGLOG("PatternMatcher::pattern()");
1665 if (pat_ != &pattern)
1666 {
1667 if (own_ && pat_ != NULL)
1668 delete pat_;
1669 pat_ = &pattern;
1670 own_ = false;
1671 }
1672 return *this;
1673 }
1674 /// Set the pattern to use with this matcher (the given pattern is shared and must be persistent).
pattern(const Pattern * pattern)1675 virtual PatternMatcher& pattern(const Pattern *pattern) ///< pattern object for this matcher
1676 /// @returns this matcher
1677 {
1678 DBGLOG("PatternMatcher::pattern()");
1679 if (pat_ != pattern)
1680 {
1681 if (own_ && pat_ != NULL)
1682 delete pat_;
1683 pat_ = pattern;
1684 own_ = false;
1685 }
1686 return *this;
1687 }
1688 /// Set the pattern from a regex string to use with this matcher.
pattern(const char * pattern)1689 virtual PatternMatcher& pattern(const char *pattern) ///< regex string to instantiate internal pattern object
1690 /// @returns this matcher
1691 {
1692 DBGLOG("PatternMatcher::pattern(\"%s\")", pattern);
1693 if (own_ && pat_ != NULL)
1694 delete pat_;
1695 pat_ = new Pattern(pattern);
1696 own_ = true;
1697 return *this;
1698 }
1699 /// Set the pattern from a regex string to use with this matcher.
pattern(const std::string & pattern)1700 virtual PatternMatcher& pattern(const std::string& pattern) ///< regex string to instantiate internal pattern object
1701 /// @returns this matcher
1702 {
1703 DBGLOG("PatternMatcher::pattern(\"%s\")", pattern.c_str());
1704 if (own_ && pat_ != NULL)
1705 delete pat_;
1706 pat_ = new Pattern(pattern);
1707 own_ = true;
1708 return *this;
1709 }
1710 /// Returns true if this matcher has a pattern.
has_pattern()1711 bool has_pattern() const
1712 /// @returns true if this matcher has a pattern
1713 {
1714 return pat_ != NULL;
1715 }
1716 /// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete).
own_pattern()1717 bool own_pattern() const
1718 /// @returns true if this matcher has its own pattern
1719 {
1720 return own_ && pat_ != NULL;
1721 }
1722 /// Returns a reference to the pattern object associated with this matcher.
pattern()1723 const Pattern& pattern() const
1724 /// @returns reference to pattern object
1725 {
1726 ASSERT(pat_ != NULL);
1727 return *pat_;
1728 }
1729 protected:
1730 /// Construct a base abstract matcher from a pointer to a persistent pattern object (that is shared with this class) and an input character sequence.
1731 PatternMatcher(
1732 const Pattern *pattern = NULL, ///< points to pattern object for this matcher
1733 const Input& input = Input(), ///< input character sequence for this matcher
1734 const char *opt = NULL) ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1735 :
AbstractMatcher(input,opt)1736 AbstractMatcher(input, opt),
1737 pat_(pattern),
1738 own_(false)
1739 { }
1740 /// Construct a base abstract matcher from a persistent pattern object (that is shared with this class) and an input character sequence.
1741 PatternMatcher(
1742 const Pattern& pattern, ///< pattern object for this matcher
1743 const Input& input = Input(), ///< input character sequence for this matcher
1744 const char *opt = NULL) ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1745 :
AbstractMatcher(input,opt)1746 AbstractMatcher(input, opt),
1747 pat_(&pattern),
1748 own_(false)
1749 { }
1750 /// Construct a base abstract matcher from a regex pattern string and an input character sequence.
1751 PatternMatcher(
1752 const char *pattern, ///< regex string instantiates pattern object for this matcher
1753 const Input& input = Input(), ///< input character sequence for this matcher
1754 const char *opt = NULL) ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1755 :
AbstractMatcher(input,opt)1756 AbstractMatcher(input, opt),
1757 pat_(new Pattern(pattern)),
1758 own_(true)
1759 { }
1760 /// Construct a base abstract matcher from a regex pattern string and an input character sequence.
1761 PatternMatcher(
1762 const std::string& pattern, ///< regex string instantiates pattern object for this matcher
1763 const Input& input = Input(), ///< input character sequence for this matcher
1764 const char *opt = NULL) ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1765 :
AbstractMatcher(input,opt)1766 AbstractMatcher(input, opt),
1767 pat_(new Pattern(pattern)),
1768 own_(true)
1769 { }
1770 const Pattern *pat_; ///< points to the pattern object used by the matcher
1771 bool own_; ///< true if PatternMatcher::pat_ was allocated and should be deleted
1772 };
1773
1774 /// A specialization of the pattern matcher class template for std::string, extends abstract matcher base class.
1775 template<>
1776 class PatternMatcher<std::string> : public AbstractMatcher {
1777 public:
1778 typedef std::string Pattern; ///< pattern class of this matcher
1779 /// Copy constructor, the underlying pattern string is copied.
PatternMatcher(const PatternMatcher & matcher)1780 PatternMatcher(const PatternMatcher& matcher) ///< matcher with pattern to copy and use
1781 :
1782 AbstractMatcher(matcher.in, matcher.opt_),
1783 pat_(matcher.pat_ != NULL ? new Pattern(*matcher.pat_) : NULL),
1784 own_(matcher.pat_ != NULL)
1785 { }
1786 /// Delete matcher, deletes pattern when owned
~PatternMatcher()1787 virtual ~PatternMatcher()
1788 {
1789 DBGLOG("PatternMatcher::~PatternMatcher()");
1790 if (own_ && pat_ != NULL)
1791 delete pat_;
1792 }
1793 /// Assign a matcher, the underlying pattern string is shared (not deep copied).
1794 PatternMatcher& operator=(const PatternMatcher& matcher) ///< matcher with pattern to use (pattern may be shared)
1795 {
1796 scan.init(this, Const::SCAN);
1797 find.init(this, Const::FIND);
1798 split.init(this, Const::SPLIT);
1799 in = matcher.in;
1800 reset();
1801 opt_ = matcher.opt_;
1802 pat_ = matcher.pat_,
1803 own_ = false;
1804 return *this;
1805 }
1806 /// Set the pattern to use with this matcher as a shared pointer to another matcher pattern.
pattern(const PatternMatcher & matcher)1807 virtual PatternMatcher& pattern(const PatternMatcher& matcher) ///< the other matcher
1808 /// @returns this matcher
1809 {
1810 opt_ = matcher.opt_;
1811 return this->pattern(matcher.pattern());
1812 }
1813 /// Set the pattern to use with this matcher (the given pattern is shared and must be persistent).
pattern(const Pattern * pattern)1814 virtual PatternMatcher& pattern(const Pattern *pattern) ///< pattern string for this matcher
1815 /// @returns this matcher
1816 {
1817 DBGLOG("Patternatcher::pattern()");
1818 if (pat_ != pattern)
1819 {
1820 if (own_ && pat_ != NULL)
1821 delete pat_;
1822 pat_ = pattern;
1823 own_ = false;
1824 }
1825 return *this;
1826 }
1827 /// Set the pattern from a regex string to use with this matcher.
pattern(const char * pattern)1828 virtual PatternMatcher& pattern(const char *pattern) ///< regex string to instantiate internal pattern string
1829 /// @returns this matcher
1830 {
1831 DBGLOG("Patternatcher::pattern(\"%s\")", pattern);
1832 if (own_ && pat_ != NULL)
1833 delete pat_;
1834 pat_ = new Pattern(pattern);
1835 own_ = true;
1836 return *this;
1837 }
1838 /// Set the pattern from a regex string to use with this matcher.
pattern(const std::string & pattern)1839 virtual PatternMatcher& pattern(const std::string& pattern) ///< regex string to instantiate internal pattern string
1840 /// @returns this matcher
1841 {
1842 DBGLOG("Patternatcher::pattern(\"%s\")", pattern.c_str());
1843 if (own_ && pat_ != NULL)
1844 delete pat_;
1845 pat_ = new Pattern(pattern);
1846 own_ = true;
1847 return *this;
1848 }
1849 /// Returns true if this matcher has a pattern.
has_pattern()1850 bool has_pattern() const
1851 /// @returns true if this matcher has a pattern
1852 {
1853 return pat_ != NULL;
1854 }
1855 /// Returns true if this matcher has its own pattern not received from another matcher (responsible to delete).
own_pattern()1856 bool own_pattern() const
1857 /// @returns true if this matcher has its own pattern
1858 {
1859 return own_ && pat_ != NULL;
1860 }
1861 /// Returns a reference to the pattern string associated with this matcher.
pattern()1862 const Pattern& pattern() const
1863 /// @returns reference to pattern string
1864 {
1865 ASSERT(pat_ != NULL);
1866 return *pat_;
1867 }
1868 protected:
1869 /// Construct a base abstract matcher from a pointer to a persistent pattern string (that is shared with this class) and an input character sequence.
1870 PatternMatcher(
1871 const Pattern *pattern = NULL, ///< points to pattern string for this matcher
1872 const Input& input = Input(), ///< input character sequence for this matcher
1873 const char *opt = NULL) ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1874 :
AbstractMatcher(input,opt)1875 AbstractMatcher(input, opt),
1876 pat_(pattern),
1877 own_(false)
1878 { }
1879 /// Construct a base abstract matcher from a regex pattern string and an input character sequence.
1880 PatternMatcher(
1881 const char *pattern, ///< regex string instantiates pattern string for this matcher
1882 const Input& input = Input(), ///< input character sequence for this matcher
1883 const char *opt = NULL) ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1884 :
AbstractMatcher(input,opt)1885 AbstractMatcher(input, opt),
1886 pat_(new Pattern(pattern)),
1887 own_(true)
1888 { }
1889 /// Construct a base abstract matcher from a regex pattern string and an input character sequence.
1890 PatternMatcher(
1891 const std::string& pattern, ///< regex string instantiates pattern string for this matcher
1892 const Input& input = Input(), ///< input character sequence for this matcher
1893 const char *opt = NULL) ///< option string of the form `(A|N|T(=[[:digit:]])?|;)*`
1894 :
AbstractMatcher(input,opt)1895 AbstractMatcher(input, opt),
1896 pat_(new Pattern(pattern)),
1897 own_(true)
1898 { }
1899 const Pattern *pat_; ///< points to the pattern string used by the matcher
1900 bool own_; ///< true if PatternMatcher::pat_ was allocated and should be deleted
1901 };
1902
1903 } // namespace reflex
1904
1905 /// Write matched text to a stream.
1906 inline std::ostream& operator<<(std::ostream& os, const reflex::AbstractMatcher& matcher)
1907 {
1908 os.write(matcher.begin(), matcher.size());
1909 return os;
1910 }
1911
1912 /// Read stream and store all content in the matcher's buffer.
1913 inline std::istream& operator>>(std::istream& is, reflex::AbstractMatcher& matcher)
1914 {
1915 matcher.input(is).buffer();
1916 return is;
1917 }
1918
1919 #endif
1920