1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression interface RE2.
6 //
7 // Originally the PCRE C++ wrapper, but adapted to use
8 // the new automata-based regular expression engines.
9
10 #include "re2/re2.h"
11
12 #include <assert.h>
13 #include <ctype.h>
14 #include <errno.h>
15 #ifdef _MSC_VER
16 #include <intrin.h>
17 #endif
18 #include <stdint.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <algorithm>
22 #include <atomic>
23 #include <iterator>
24 #include <mutex>
25 #include <string>
26 #include <utility>
27 #include <vector>
28
29 #include "util/util.h"
30 #include "util/logging.h"
31 #include "util/strutil.h"
32 #include "util/utf.h"
33 #include "re2/prog.h"
34 #include "re2/regexp.h"
35 #include "re2/sparse_array.h"
36
37 namespace re2 {
38
39 // Maximum number of args we can set
40 static const int kMaxArgs = 16;
41 static const int kVecSize = 1+kMaxArgs;
42
43 const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
44
Options(RE2::CannedOptions opt)45 RE2::Options::Options(RE2::CannedOptions opt)
46 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
47 posix_syntax_(opt == RE2::POSIX),
48 longest_match_(opt == RE2::POSIX),
49 log_errors_(opt != RE2::Quiet),
50 max_mem_(kDefaultMaxMem),
51 literal_(false),
52 never_nl_(false),
53 dot_nl_(false),
54 never_capture_(false),
55 case_sensitive_(true),
56 perl_classes_(false),
57 word_boundary_(false),
58 one_line_(false) {
59 }
60
61 // static empty objects for use as const references.
62 // To avoid global constructors, allocated in RE2::Init().
63 static const std::string* empty_string;
64 static const std::map<std::string, int>* empty_named_groups;
65 static const std::map<int, std::string>* empty_group_names;
66
67 // Converts from Regexp error code to RE2 error code.
68 // Maybe some day they will diverge. In any event, this
69 // hides the existence of Regexp from RE2 users.
RegexpErrorToRE2(re2::RegexpStatusCode code)70 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
71 switch (code) {
72 case re2::kRegexpSuccess:
73 return RE2::NoError;
74 case re2::kRegexpInternalError:
75 return RE2::ErrorInternal;
76 case re2::kRegexpBadEscape:
77 return RE2::ErrorBadEscape;
78 case re2::kRegexpBadCharClass:
79 return RE2::ErrorBadCharClass;
80 case re2::kRegexpBadCharRange:
81 return RE2::ErrorBadCharRange;
82 case re2::kRegexpMissingBracket:
83 return RE2::ErrorMissingBracket;
84 case re2::kRegexpMissingParen:
85 return RE2::ErrorMissingParen;
86 case re2::kRegexpUnexpectedParen:
87 return RE2::ErrorUnexpectedParen;
88 case re2::kRegexpTrailingBackslash:
89 return RE2::ErrorTrailingBackslash;
90 case re2::kRegexpRepeatArgument:
91 return RE2::ErrorRepeatArgument;
92 case re2::kRegexpRepeatSize:
93 return RE2::ErrorRepeatSize;
94 case re2::kRegexpRepeatOp:
95 return RE2::ErrorRepeatOp;
96 case re2::kRegexpBadPerlOp:
97 return RE2::ErrorBadPerlOp;
98 case re2::kRegexpBadUTF8:
99 return RE2::ErrorBadUTF8;
100 case re2::kRegexpBadNamedCapture:
101 return RE2::ErrorBadNamedCapture;
102 }
103 return RE2::ErrorInternal;
104 }
105
trunc(const StringPiece & pattern)106 static std::string trunc(const StringPiece& pattern) {
107 if (pattern.size() < 100)
108 return std::string(pattern);
109 return std::string(pattern.substr(0, 100)) + "...";
110 }
111
112
RE2(const char * pattern)113 RE2::RE2(const char* pattern) {
114 Init(pattern, DefaultOptions);
115 }
116
RE2(const std::string & pattern)117 RE2::RE2(const std::string& pattern) {
118 Init(pattern, DefaultOptions);
119 }
120
RE2(const StringPiece & pattern)121 RE2::RE2(const StringPiece& pattern) {
122 Init(pattern, DefaultOptions);
123 }
124
RE2(const StringPiece & pattern,const Options & options)125 RE2::RE2(const StringPiece& pattern, const Options& options) {
126 Init(pattern, options);
127 }
128
ParseFlags() const129 int RE2::Options::ParseFlags() const {
130 int flags = Regexp::ClassNL;
131 switch (encoding()) {
132 default:
133 if (log_errors())
134 LOG(ERROR) << "Unknown encoding " << encoding();
135 break;
136 case RE2::Options::EncodingUTF8:
137 break;
138 case RE2::Options::EncodingLatin1:
139 flags |= Regexp::Latin1;
140 break;
141 }
142
143 if (!posix_syntax())
144 flags |= Regexp::LikePerl;
145
146 if (literal())
147 flags |= Regexp::Literal;
148
149 if (never_nl())
150 flags |= Regexp::NeverNL;
151
152 if (dot_nl())
153 flags |= Regexp::DotNL;
154
155 if (never_capture())
156 flags |= Regexp::NeverCapture;
157
158 if (!case_sensitive())
159 flags |= Regexp::FoldCase;
160
161 if (perl_classes())
162 flags |= Regexp::PerlClasses;
163
164 if (word_boundary())
165 flags |= Regexp::PerlB;
166
167 if (one_line())
168 flags |= Regexp::OneLine;
169
170 return flags;
171 }
172
Init(const StringPiece & pattern,const Options & options)173 void RE2::Init(const StringPiece& pattern, const Options& options) {
174 static std::once_flag empty_once;
175 std::call_once(empty_once, []() {
176 empty_string = new std::string;
177 empty_named_groups = new std::map<std::string, int>;
178 empty_group_names = new std::map<int, std::string>;
179 });
180
181 pattern_.assign(pattern.data(), pattern.size());
182 options_.Copy(options);
183 entire_regexp_ = NULL;
184 error_ = empty_string;
185 error_code_ = NoError;
186 error_arg_.clear();
187 prefix_.clear();
188 prefix_foldcase_ = false;
189 suffix_regexp_ = NULL;
190 prog_ = NULL;
191 num_captures_ = -1;
192 is_one_pass_ = false;
193
194 rprog_ = NULL;
195 named_groups_ = NULL;
196 group_names_ = NULL;
197
198 RegexpStatus status;
199 entire_regexp_ = Regexp::Parse(
200 pattern_,
201 static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
202 &status);
203 if (entire_regexp_ == NULL) {
204 if (options_.log_errors()) {
205 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
206 << status.Text();
207 }
208 error_ = new std::string(status.Text());
209 error_code_ = RegexpErrorToRE2(status.code());
210 error_arg_ = std::string(status.error_arg());
211 return;
212 }
213
214 re2::Regexp* suffix;
215 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
216 suffix_regexp_ = suffix;
217 else
218 suffix_regexp_ = entire_regexp_->Incref();
219
220 // Two thirds of the memory goes to the forward Prog,
221 // one third to the reverse prog, because the forward
222 // Prog has two DFAs but the reverse prog has one.
223 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
224 if (prog_ == NULL) {
225 if (options_.log_errors())
226 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
227 error_ = new std::string("pattern too large - compile failed");
228 error_code_ = RE2::ErrorPatternTooLarge;
229 return;
230 }
231
232 // We used to compute this lazily, but it's used during the
233 // typical control flow for a match call, so we now compute
234 // it eagerly, which avoids the overhead of std::once_flag.
235 num_captures_ = suffix_regexp_->NumCaptures();
236
237 // Could delay this until the first match call that
238 // cares about submatch information, but the one-pass
239 // machine's memory gets cut from the DFA memory budget,
240 // and that is harder to do if the DFA has already
241 // been built.
242 is_one_pass_ = prog_->IsOnePass();
243 }
244
245 // Returns rprog_, computing it if needed.
ReverseProg() const246 re2::Prog* RE2::ReverseProg() const {
247 std::call_once(rprog_once_, [](const RE2* re) {
248 re->rprog_ =
249 re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
250 if (re->rprog_ == NULL) {
251 if (re->options_.log_errors())
252 LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
253 // We no longer touch error_ and error_code_ because failing to compile
254 // the reverse Prog is not a showstopper: falling back to NFA execution
255 // is fine. More importantly, an RE2 object is supposed to be logically
256 // immutable: whatever ok() would have returned after Init() completed,
257 // it should continue to return that no matter what ReverseProg() does.
258 }
259 }, this);
260 return rprog_;
261 }
262
~RE2()263 RE2::~RE2() {
264 if (suffix_regexp_)
265 suffix_regexp_->Decref();
266 if (entire_regexp_)
267 entire_regexp_->Decref();
268 delete prog_;
269 delete rprog_;
270 if (error_ != empty_string)
271 delete error_;
272 if (named_groups_ != NULL && named_groups_ != empty_named_groups)
273 delete named_groups_;
274 if (group_names_ != NULL && group_names_ != empty_group_names)
275 delete group_names_;
276 }
277
ProgramSize() const278 int RE2::ProgramSize() const {
279 if (prog_ == NULL)
280 return -1;
281 return prog_->size();
282 }
283
ReverseProgramSize() const284 int RE2::ReverseProgramSize() const {
285 if (prog_ == NULL)
286 return -1;
287 Prog* prog = ReverseProg();
288 if (prog == NULL)
289 return -1;
290 return prog->size();
291 }
292
293 // Finds the most significant non-zero bit in n.
FindMSBSet(uint32_t n)294 static int FindMSBSet(uint32_t n) {
295 DCHECK_NE(n, 0);
296 #if defined(__GNUC__)
297 return 31 ^ __builtin_clz(n);
298 #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
299 unsigned long c;
300 _BitScanReverse(&c, n);
301 return static_cast<int>(c);
302 #else
303 int c = 0;
304 for (int shift = 1 << 4; shift != 0; shift >>= 1) {
305 uint32_t word = n >> shift;
306 if (word != 0) {
307 n = word;
308 c += shift;
309 }
310 }
311 return c;
312 #endif
313 }
314
Fanout(Prog * prog,std::vector<int> * histogram)315 static int Fanout(Prog* prog, std::vector<int>* histogram) {
316 SparseArray<int> fanout(prog->size());
317 prog->Fanout(&fanout);
318 int data[32] = {};
319 int size = 0;
320 for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
321 if (i->value() == 0)
322 continue;
323 uint32_t value = i->value();
324 int bucket = FindMSBSet(value);
325 bucket += value & (value-1) ? 1 : 0;
326 ++data[bucket];
327 size = std::max(size, bucket+1);
328 }
329 if (histogram != NULL)
330 histogram->assign(data, data+size);
331 return size-1;
332 }
333
ProgramFanout(std::vector<int> * histogram) const334 int RE2::ProgramFanout(std::vector<int>* histogram) const {
335 if (prog_ == NULL)
336 return -1;
337 return Fanout(prog_, histogram);
338 }
339
ReverseProgramFanout(std::vector<int> * histogram) const340 int RE2::ReverseProgramFanout(std::vector<int>* histogram) const {
341 if (prog_ == NULL)
342 return -1;
343 Prog* prog = ReverseProg();
344 if (prog == NULL)
345 return -1;
346 return Fanout(prog, histogram);
347 }
348
349 // Returns named_groups_, computing it if needed.
NamedCapturingGroups() const350 const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
351 std::call_once(named_groups_once_, [](const RE2* re) {
352 if (re->suffix_regexp_ != NULL)
353 re->named_groups_ = re->suffix_regexp_->NamedCaptures();
354 if (re->named_groups_ == NULL)
355 re->named_groups_ = empty_named_groups;
356 }, this);
357 return *named_groups_;
358 }
359
360 // Returns group_names_, computing it if needed.
CapturingGroupNames() const361 const std::map<int, std::string>& RE2::CapturingGroupNames() const {
362 std::call_once(group_names_once_, [](const RE2* re) {
363 if (re->suffix_regexp_ != NULL)
364 re->group_names_ = re->suffix_regexp_->CaptureNames();
365 if (re->group_names_ == NULL)
366 re->group_names_ = empty_group_names;
367 }, this);
368 return *group_names_;
369 }
370
371 /***** Convenience interfaces *****/
372
FullMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)373 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
374 const Arg* const args[], int n) {
375 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
376 }
377
PartialMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)378 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
379 const Arg* const args[], int n) {
380 return re.DoMatch(text, UNANCHORED, NULL, args, n);
381 }
382
ConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)383 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
384 const Arg* const args[], int n) {
385 size_t consumed;
386 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
387 input->remove_prefix(consumed);
388 return true;
389 } else {
390 return false;
391 }
392 }
393
FindAndConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)394 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
395 const Arg* const args[], int n) {
396 size_t consumed;
397 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
398 input->remove_prefix(consumed);
399 return true;
400 } else {
401 return false;
402 }
403 }
404
Replace(std::string * str,const RE2 & re,const StringPiece & rewrite)405 bool RE2::Replace(std::string* str,
406 const RE2& re,
407 const StringPiece& rewrite) {
408 StringPiece vec[kVecSize];
409 int nvec = 1 + MaxSubmatch(rewrite);
410 if (nvec > 1 + re.NumberOfCapturingGroups())
411 return false;
412 if (nvec > static_cast<int>(arraysize(vec)))
413 return false;
414 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
415 return false;
416
417 std::string s;
418 if (!re.Rewrite(&s, rewrite, vec, nvec))
419 return false;
420
421 assert(vec[0].data() >= str->data());
422 assert(vec[0].data() + vec[0].size() <= str->data() + str->size());
423 str->replace(vec[0].data() - str->data(), vec[0].size(), s);
424 return true;
425 }
426
GlobalReplace(std::string * str,const RE2 & re,const StringPiece & rewrite)427 int RE2::GlobalReplace(std::string* str,
428 const RE2& re,
429 const StringPiece& rewrite) {
430 StringPiece vec[kVecSize];
431 int nvec = 1 + MaxSubmatch(rewrite);
432 if (nvec > 1 + re.NumberOfCapturingGroups())
433 return false;
434 if (nvec > static_cast<int>(arraysize(vec)))
435 return false;
436
437 const char* p = str->data();
438 const char* ep = p + str->size();
439 const char* lastend = NULL;
440 std::string out;
441 int count = 0;
442 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
443 // Iterate just once when fuzzing. Otherwise, we easily get bogged down
444 // and coverage is unlikely to improve despite significant expense.
445 while (p == str->data()) {
446 #else
447 while (p <= ep) {
448 #endif
449 if (!re.Match(*str, static_cast<size_t>(p - str->data()),
450 str->size(), UNANCHORED, vec, nvec))
451 break;
452 if (p < vec[0].data())
453 out.append(p, vec[0].data() - p);
454 if (vec[0].data() == lastend && vec[0].empty()) {
455 // Disallow empty match at end of last match: skip ahead.
456 //
457 // fullrune() takes int, not ptrdiff_t. However, it just looks
458 // at the leading byte and treats any length >= 4 the same.
459 if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
460 fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) {
461 // re is in UTF-8 mode and there is enough left of str
462 // to allow us to advance by up to UTFmax bytes.
463 Rune r;
464 int n = chartorune(&r, p);
465 // Some copies of chartorune have a bug that accepts
466 // encodings of values in (10FFFF, 1FFFFF] as valid.
467 if (r > Runemax) {
468 n = 1;
469 r = Runeerror;
470 }
471 if (!(n == 1 && r == Runeerror)) { // no decoding error
472 out.append(p, n);
473 p += n;
474 continue;
475 }
476 }
477 // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
478 // we fell through from above and the GIGO principle applies.
479 if (p < ep)
480 out.append(p, 1);
481 p++;
482 continue;
483 }
484 re.Rewrite(&out, rewrite, vec, nvec);
485 p = vec[0].data() + vec[0].size();
486 lastend = p;
487 count++;
488 }
489
490 if (count == 0)
491 return 0;
492
493 if (p < ep)
494 out.append(p, ep - p);
495 using std::swap;
496 swap(out, *str);
497 return count;
498 }
499
500 bool RE2::Extract(const StringPiece& text,
501 const RE2& re,
502 const StringPiece& rewrite,
503 std::string* out) {
504 StringPiece vec[kVecSize];
505 int nvec = 1 + MaxSubmatch(rewrite);
506 if (nvec > 1 + re.NumberOfCapturingGroups())
507 return false;
508 if (nvec > static_cast<int>(arraysize(vec)))
509 return false;
510 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
511 return false;
512
513 out->clear();
514 return re.Rewrite(out, rewrite, vec, nvec);
515 }
516
517 std::string RE2::QuoteMeta(const StringPiece& unquoted) {
518 std::string result;
519 result.reserve(unquoted.size() << 1);
520
521 // Escape any ascii character not in [A-Za-z_0-9].
522 //
523 // Note that it's legal to escape a character even if it has no
524 // special meaning in a regular expression -- so this function does
525 // that. (This also makes it identical to the perl function of the
526 // same name except for the null-character special case;
527 // see `perldoc -f quotemeta`.)
528 for (size_t ii = 0; ii < unquoted.size(); ++ii) {
529 // Note that using 'isalnum' here raises the benchmark time from
530 // 32ns to 58ns:
531 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
532 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
533 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
534 unquoted[ii] != '_' &&
535 // If this is the part of a UTF8 or Latin1 character, we need
536 // to copy this byte without escaping. Experimentally this is
537 // what works correctly with the regexp library.
538 !(unquoted[ii] & 128)) {
539 if (unquoted[ii] == '\0') { // Special handling for null chars.
540 // Note that this special handling is not strictly required for RE2,
541 // but this quoting is required for other regexp libraries such as
542 // PCRE.
543 // Can't use "\\0" since the next character might be a digit.
544 result += "\\x00";
545 continue;
546 }
547 result += '\\';
548 }
549 result += unquoted[ii];
550 }
551
552 return result;
553 }
554
555 bool RE2::PossibleMatchRange(std::string* min, std::string* max,
556 int maxlen) const {
557 if (prog_ == NULL)
558 return false;
559
560 int n = static_cast<int>(prefix_.size());
561 if (n > maxlen)
562 n = maxlen;
563
564 // Determine initial min max from prefix_ literal.
565 *min = prefix_.substr(0, n);
566 *max = prefix_.substr(0, n);
567 if (prefix_foldcase_) {
568 // prefix is ASCII lowercase; change *min to uppercase.
569 for (int i = 0; i < n; i++) {
570 char& c = (*min)[i];
571 if ('a' <= c && c <= 'z')
572 c += 'A' - 'a';
573 }
574 }
575
576 // Add to prefix min max using PossibleMatchRange on regexp.
577 std::string dmin, dmax;
578 maxlen -= n;
579 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
580 min->append(dmin);
581 max->append(dmax);
582 } else if (!max->empty()) {
583 // prog_->PossibleMatchRange has failed us,
584 // but we still have useful information from prefix_.
585 // Round up *max to allow any possible suffix.
586 PrefixSuccessor(max);
587 } else {
588 // Nothing useful.
589 *min = "";
590 *max = "";
591 return false;
592 }
593
594 return true;
595 }
596
597 // Avoid possible locale nonsense in standard strcasecmp.
598 // The string a is known to be all lowercase.
599 static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
600 const char* ae = a + len;
601
602 for (; a < ae; a++, b++) {
603 uint8_t x = *a;
604 uint8_t y = *b;
605 if ('A' <= y && y <= 'Z')
606 y += 'a' - 'A';
607 if (x != y)
608 return x - y;
609 }
610 return 0;
611 }
612
613
614 /***** Actual matching and rewriting code *****/
615
616 bool RE2::Match(const StringPiece& text,
617 size_t startpos,
618 size_t endpos,
619 Anchor re_anchor,
620 StringPiece* submatch,
621 int nsubmatch) const {
622 if (!ok()) {
623 if (options_.log_errors())
624 LOG(ERROR) << "Invalid RE2: " << *error_;
625 return false;
626 }
627
628 if (startpos > endpos || endpos > text.size()) {
629 if (options_.log_errors())
630 LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
631 << "startpos: " << startpos << ", "
632 << "endpos: " << endpos << ", "
633 << "text size: " << text.size() << "]";
634 return false;
635 }
636
637 StringPiece subtext = text;
638 subtext.remove_prefix(startpos);
639 subtext.remove_suffix(text.size() - endpos);
640
641 // Use DFAs to find exact location of match, filter out non-matches.
642
643 // Don't ask for the location if we won't use it.
644 // SearchDFA can do extra optimizations in that case.
645 StringPiece match;
646 StringPiece* matchp = &match;
647 if (nsubmatch == 0)
648 matchp = NULL;
649
650 int ncap = 1 + NumberOfCapturingGroups();
651 if (ncap > nsubmatch)
652 ncap = nsubmatch;
653
654 // If the regexp is anchored explicitly, must not be in middle of text.
655 if (prog_->anchor_start() && startpos != 0)
656 return false;
657 if (prog_->anchor_end() && endpos != text.size())
658 return false;
659
660 // If the regexp is anchored explicitly, update re_anchor
661 // so that we can potentially fall into a faster case below.
662 if (prog_->anchor_start() && prog_->anchor_end())
663 re_anchor = ANCHOR_BOTH;
664 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
665 re_anchor = ANCHOR_START;
666
667 // Check for the required prefix, if any.
668 size_t prefixlen = 0;
669 if (!prefix_.empty()) {
670 if (startpos != 0)
671 return false;
672 prefixlen = prefix_.size();
673 if (prefixlen > subtext.size())
674 return false;
675 if (prefix_foldcase_) {
676 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
677 return false;
678 } else {
679 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
680 return false;
681 }
682 subtext.remove_prefix(prefixlen);
683 // If there is a required prefix, the anchor must be at least ANCHOR_START.
684 if (re_anchor != ANCHOR_BOTH)
685 re_anchor = ANCHOR_START;
686 }
687
688 Prog::Anchor anchor = Prog::kUnanchored;
689 Prog::MatchKind kind = Prog::kFirstMatch;
690 if (options_.longest_match())
691 kind = Prog::kLongestMatch;
692
693 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
694
695 // BitState allocates a bitmap of size prog_->list_count() * text.size().
696 // It also allocates a stack of 3-word structures which could potentially
697 // grow as large as prog_->list_count() * text.size(), but in practice is
698 // much smaller.
699 const int kMaxBitStateBitmapSize = 256*1024; // bitmap size <= max (bits)
700 bool can_bit_state = prog_->CanBitState();
701 size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count();
702
703 #ifdef RE2_HAVE_THREAD_LOCAL
704 hooks::context = this;
705 #endif
706 bool dfa_failed = false;
707 bool skipped_test = false;
708 switch (re_anchor) {
709 default:
710 LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor;
711 return false;
712
713 case UNANCHORED: {
714 if (prog_->anchor_end()) {
715 // This is a very special case: we don't need the forward DFA because
716 // we already know where the match must end! Instead, the reverse DFA
717 // can say whether there is a match and (optionally) where it starts.
718 Prog* prog = ReverseProg();
719 if (prog == NULL) {
720 // Fall back to NFA below.
721 skipped_test = true;
722 break;
723 }
724 if (!prog->SearchDFA(subtext, text, Prog::kAnchored,
725 Prog::kLongestMatch, matchp, &dfa_failed, NULL)) {
726 if (dfa_failed) {
727 if (options_.log_errors())
728 LOG(ERROR) << "DFA out of memory: "
729 << "pattern length " << pattern_.size() << ", "
730 << "program size " << prog->size() << ", "
731 << "list count " << prog->list_count() << ", "
732 << "bytemap range " << prog->bytemap_range();
733 // Fall back to NFA below.
734 skipped_test = true;
735 break;
736 }
737 return false;
738 }
739 if (matchp == NULL) // Matched. Don't care where.
740 return true;
741 break;
742 }
743
744 if (!prog_->SearchDFA(subtext, text, anchor, kind,
745 matchp, &dfa_failed, NULL)) {
746 if (dfa_failed) {
747 if (options_.log_errors())
748 LOG(ERROR) << "DFA out of memory: "
749 << "pattern length " << pattern_.size() << ", "
750 << "program size " << prog_->size() << ", "
751 << "list count " << prog_->list_count() << ", "
752 << "bytemap range " << prog_->bytemap_range();
753 // Fall back to NFA below.
754 skipped_test = true;
755 break;
756 }
757 return false;
758 }
759 if (matchp == NULL) // Matched. Don't care where.
760 return true;
761 // SearchDFA set match.end() but didn't know where the
762 // match started. Run the regexp backward from match.end()
763 // to find the longest possible match -- that's where it started.
764 Prog* prog = ReverseProg();
765 if (prog == NULL) {
766 // Fall back to NFA below.
767 skipped_test = true;
768 break;
769 }
770 if (!prog->SearchDFA(match, text, Prog::kAnchored,
771 Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
772 if (dfa_failed) {
773 if (options_.log_errors())
774 LOG(ERROR) << "DFA out of memory: "
775 << "pattern length " << pattern_.size() << ", "
776 << "program size " << prog->size() << ", "
777 << "list count " << prog->list_count() << ", "
778 << "bytemap range " << prog->bytemap_range();
779 // Fall back to NFA below.
780 skipped_test = true;
781 break;
782 }
783 if (options_.log_errors())
784 LOG(ERROR) << "SearchDFA inconsistency";
785 return false;
786 }
787 break;
788 }
789
790 case ANCHOR_BOTH:
791 case ANCHOR_START:
792 if (re_anchor == ANCHOR_BOTH)
793 kind = Prog::kFullMatch;
794 anchor = Prog::kAnchored;
795
796 // If only a small amount of text and need submatch
797 // information anyway and we're going to use OnePass or BitState
798 // to get it, we might as well not even bother with the DFA:
799 // OnePass or BitState will be fast enough.
800 // On tiny texts, OnePass outruns even the DFA, and
801 // it doesn't have the shared state and occasional mutex that
802 // the DFA does.
803 if (can_one_pass && text.size() <= 4096 &&
804 (ncap > 1 || text.size() <= 16)) {
805 skipped_test = true;
806 break;
807 }
808 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
809 skipped_test = true;
810 break;
811 }
812 if (!prog_->SearchDFA(subtext, text, anchor, kind,
813 &match, &dfa_failed, NULL)) {
814 if (dfa_failed) {
815 if (options_.log_errors())
816 LOG(ERROR) << "DFA out of memory: "
817 << "pattern length " << pattern_.size() << ", "
818 << "program size " << prog_->size() << ", "
819 << "list count " << prog_->list_count() << ", "
820 << "bytemap range " << prog_->bytemap_range();
821 // Fall back to NFA below.
822 skipped_test = true;
823 break;
824 }
825 return false;
826 }
827 break;
828 }
829
830 if (!skipped_test && ncap <= 1) {
831 // We know exactly where it matches. That's enough.
832 if (ncap == 1)
833 submatch[0] = match;
834 } else {
835 StringPiece subtext1;
836 if (skipped_test) {
837 // DFA ran out of memory or was skipped:
838 // need to search in entire original text.
839 subtext1 = subtext;
840 } else {
841 // DFA found the exact match location:
842 // let NFA run an anchored, full match search
843 // to find submatch locations.
844 subtext1 = match;
845 anchor = Prog::kAnchored;
846 kind = Prog::kFullMatch;
847 }
848
849 if (can_one_pass && anchor != Prog::kUnanchored) {
850 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
851 if (!skipped_test && options_.log_errors())
852 LOG(ERROR) << "SearchOnePass inconsistency";
853 return false;
854 }
855 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
856 if (!prog_->SearchBitState(subtext1, text, anchor,
857 kind, submatch, ncap)) {
858 if (!skipped_test && options_.log_errors())
859 LOG(ERROR) << "SearchBitState inconsistency";
860 return false;
861 }
862 } else {
863 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
864 if (!skipped_test && options_.log_errors())
865 LOG(ERROR) << "SearchNFA inconsistency";
866 return false;
867 }
868 }
869 }
870
871 // Adjust overall match for required prefix that we stripped off.
872 if (prefixlen > 0 && nsubmatch > 0)
873 submatch[0] = StringPiece(submatch[0].data() - prefixlen,
874 submatch[0].size() + prefixlen);
875
876 // Zero submatches that don't exist in the regexp.
877 for (int i = ncap; i < nsubmatch; i++)
878 submatch[i] = StringPiece();
879 return true;
880 }
881
882 // Internal matcher - like Match() but takes Args not StringPieces.
883 bool RE2::DoMatch(const StringPiece& text,
884 Anchor re_anchor,
885 size_t* consumed,
886 const Arg* const* args,
887 int n) const {
888 if (!ok()) {
889 if (options_.log_errors())
890 LOG(ERROR) << "Invalid RE2: " << *error_;
891 return false;
892 }
893
894 if (NumberOfCapturingGroups() < n) {
895 // RE has fewer capturing groups than number of Arg pointers passed in.
896 return false;
897 }
898
899 // Count number of capture groups needed.
900 int nvec;
901 if (n == 0 && consumed == NULL)
902 nvec = 0;
903 else
904 nvec = n+1;
905
906 StringPiece* vec;
907 StringPiece stkvec[kVecSize];
908 StringPiece* heapvec = NULL;
909
910 if (nvec <= static_cast<int>(arraysize(stkvec))) {
911 vec = stkvec;
912 } else {
913 vec = new StringPiece[nvec];
914 heapvec = vec;
915 }
916
917 if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
918 delete[] heapvec;
919 return false;
920 }
921
922 if (consumed != NULL)
923 *consumed = static_cast<size_t>(vec[0].end() - text.begin());
924
925 if (n == 0 || args == NULL) {
926 // We are not interested in results
927 delete[] heapvec;
928 return true;
929 }
930
931 // If we got here, we must have matched the whole pattern.
932 for (int i = 0; i < n; i++) {
933 const StringPiece& s = vec[i+1];
934 if (!args[i]->Parse(s.data(), s.size())) {
935 // TODO: Should we indicate what the error was?
936 delete[] heapvec;
937 return false;
938 }
939 }
940
941 delete[] heapvec;
942 return true;
943 }
944
945 // Checks that the rewrite string is well-formed with respect to this
946 // regular expression.
947 bool RE2::CheckRewriteString(const StringPiece& rewrite,
948 std::string* error) const {
949 int max_token = -1;
950 for (const char *s = rewrite.data(), *end = s + rewrite.size();
951 s < end; s++) {
952 int c = *s;
953 if (c != '\\') {
954 continue;
955 }
956 if (++s == end) {
957 *error = "Rewrite schema error: '\\' not allowed at end.";
958 return false;
959 }
960 c = *s;
961 if (c == '\\') {
962 continue;
963 }
964 if (!isdigit(c)) {
965 *error = "Rewrite schema error: "
966 "'\\' must be followed by a digit or '\\'.";
967 return false;
968 }
969 int n = (c - '0');
970 if (max_token < n) {
971 max_token = n;
972 }
973 }
974
975 if (max_token > NumberOfCapturingGroups()) {
976 *error = StringPrintf(
977 "Rewrite schema requests %d matches, but the regexp only has %d "
978 "parenthesized subexpressions.",
979 max_token, NumberOfCapturingGroups());
980 return false;
981 }
982 return true;
983 }
984
985 // Returns the maximum submatch needed for the rewrite to be done by Replace().
986 // E.g. if rewrite == "foo \\2,\\1", returns 2.
987 int RE2::MaxSubmatch(const StringPiece& rewrite) {
988 int max = 0;
989 for (const char *s = rewrite.data(), *end = s + rewrite.size();
990 s < end; s++) {
991 if (*s == '\\') {
992 s++;
993 int c = (s < end) ? *s : -1;
994 if (isdigit(c)) {
995 int n = (c - '0');
996 if (n > max)
997 max = n;
998 }
999 }
1000 }
1001 return max;
1002 }
1003
1004 // Append the "rewrite" string, with backslash subsitutions from "vec",
1005 // to string "out".
1006 bool RE2::Rewrite(std::string* out,
1007 const StringPiece& rewrite,
1008 const StringPiece* vec,
1009 int veclen) const {
1010 for (const char *s = rewrite.data(), *end = s + rewrite.size();
1011 s < end; s++) {
1012 if (*s != '\\') {
1013 out->push_back(*s);
1014 continue;
1015 }
1016 s++;
1017 int c = (s < end) ? *s : -1;
1018 if (isdigit(c)) {
1019 int n = (c - '0');
1020 if (n >= veclen) {
1021 if (options_.log_errors()) {
1022 LOG(ERROR) << "invalid substitution \\" << n
1023 << " from " << veclen << " groups";
1024 }
1025 return false;
1026 }
1027 StringPiece snip = vec[n];
1028 if (!snip.empty())
1029 out->append(snip.data(), snip.size());
1030 } else if (c == '\\') {
1031 out->push_back('\\');
1032 } else {
1033 if (options_.log_errors())
1034 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
1035 return false;
1036 }
1037 }
1038 return true;
1039 }
1040
1041 /***** Parsers for various types *****/
1042
1043 namespace re2_internal {
1044
1045 template <>
1046 bool Parse(const char* str, size_t n, void* dest) {
1047 // We fail if somebody asked us to store into a non-NULL void* pointer
1048 return (dest == NULL);
1049 }
1050
1051 template <>
1052 bool Parse(const char* str, size_t n, std::string* dest) {
1053 if (dest == NULL) return true;
1054 dest->assign(str, n);
1055 return true;
1056 }
1057
1058 template <>
1059 bool Parse(const char* str, size_t n, StringPiece* dest) {
1060 if (dest == NULL) return true;
1061 *dest = StringPiece(str, n);
1062 return true;
1063 }
1064
1065 template <>
1066 bool Parse(const char* str, size_t n, char* dest) {
1067 if (n != 1) return false;
1068 if (dest == NULL) return true;
1069 *dest = str[0];
1070 return true;
1071 }
1072
1073 template <>
1074 bool Parse(const char* str, size_t n, signed char* dest) {
1075 if (n != 1) return false;
1076 if (dest == NULL) return true;
1077 *dest = str[0];
1078 return true;
1079 }
1080
1081 template <>
1082 bool Parse(const char* str, size_t n, unsigned char* dest) {
1083 if (n != 1) return false;
1084 if (dest == NULL) return true;
1085 *dest = str[0];
1086 return true;
1087 }
1088
1089 // Largest number spec that we are willing to parse
1090 static const int kMaxNumberLength = 32;
1091
1092 // REQUIRES "buf" must have length at least nbuf.
1093 // Copies "str" into "buf" and null-terminates.
1094 // Overwrites *np with the new length.
1095 static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
1096 size_t* np, bool accept_spaces) {
1097 size_t n = *np;
1098 if (n == 0) return "";
1099 if (n > 0 && isspace(*str)) {
1100 // We are less forgiving than the strtoxxx() routines and do not
1101 // allow leading spaces. We do allow leading spaces for floats.
1102 if (!accept_spaces) {
1103 return "";
1104 }
1105 while (n > 0 && isspace(*str)) {
1106 n--;
1107 str++;
1108 }
1109 }
1110
1111 // Although buf has a fixed maximum size, we can still handle
1112 // arbitrarily large integers correctly by omitting leading zeros.
1113 // (Numbers that are still too long will be out of range.)
1114 // Before deciding whether str is too long,
1115 // remove leading zeros with s/000+/00/.
1116 // Leaving the leading two zeros in place means that
1117 // we don't change 0000x123 (invalid) into 0x123 (valid).
1118 // Skip over leading - before replacing.
1119 bool neg = false;
1120 if (n >= 1 && str[0] == '-') {
1121 neg = true;
1122 n--;
1123 str++;
1124 }
1125
1126 if (n >= 3 && str[0] == '0' && str[1] == '0') {
1127 while (n >= 3 && str[2] == '0') {
1128 n--;
1129 str++;
1130 }
1131 }
1132
1133 if (neg) { // make room in buf for -
1134 n++;
1135 str--;
1136 }
1137
1138 if (n > nbuf-1) return "";
1139
1140 memmove(buf, str, n);
1141 if (neg) {
1142 buf[0] = '-';
1143 }
1144 buf[n] = '\0';
1145 *np = n;
1146 return buf;
1147 }
1148
1149 template <>
1150 bool Parse(const char* str, size_t n, float* dest) {
1151 if (n == 0) return false;
1152 static const int kMaxLength = 200;
1153 char buf[kMaxLength+1];
1154 str = TerminateNumber(buf, sizeof buf, str, &n, true);
1155 char* end;
1156 errno = 0;
1157 float r = strtof(str, &end);
1158 if (end != str + n) return false; // Leftover junk
1159 if (errno) return false;
1160 if (dest == NULL) return true;
1161 *dest = r;
1162 return true;
1163 }
1164
1165 template <>
1166 bool Parse(const char* str, size_t n, double* dest) {
1167 if (n == 0) return false;
1168 static const int kMaxLength = 200;
1169 char buf[kMaxLength+1];
1170 str = TerminateNumber(buf, sizeof buf, str, &n, true);
1171 char* end;
1172 errno = 0;
1173 double r = strtod(str, &end);
1174 if (end != str + n) return false; // Leftover junk
1175 if (errno) return false;
1176 if (dest == NULL) return true;
1177 *dest = r;
1178 return true;
1179 }
1180
1181 template <>
1182 bool Parse(const char* str, size_t n, long* dest, int radix) {
1183 if (n == 0) return false;
1184 char buf[kMaxNumberLength+1];
1185 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1186 char* end;
1187 errno = 0;
1188 long r = strtol(str, &end, radix);
1189 if (end != str + n) return false; // Leftover junk
1190 if (errno) return false;
1191 if (dest == NULL) return true;
1192 *dest = r;
1193 return true;
1194 }
1195
1196 template <>
1197 bool Parse(const char* str, size_t n, unsigned long* dest, int radix) {
1198 if (n == 0) return false;
1199 char buf[kMaxNumberLength+1];
1200 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1201 if (str[0] == '-') {
1202 // strtoul() will silently accept negative numbers and parse
1203 // them. This module is more strict and treats them as errors.
1204 return false;
1205 }
1206
1207 char* end;
1208 errno = 0;
1209 unsigned long r = strtoul(str, &end, radix);
1210 if (end != str + n) return false; // Leftover junk
1211 if (errno) return false;
1212 if (dest == NULL) return true;
1213 *dest = r;
1214 return true;
1215 }
1216
1217 template <>
1218 bool Parse(const char* str, size_t n, short* dest, int radix) {
1219 long r;
1220 if (!Parse(str, n, &r, radix)) return false; // Could not parse
1221 if ((short)r != r) return false; // Out of range
1222 if (dest == NULL) return true;
1223 *dest = (short)r;
1224 return true;
1225 }
1226
1227 template <>
1228 bool Parse(const char* str, size_t n, unsigned short* dest, int radix) {
1229 unsigned long r;
1230 if (!Parse(str, n, &r, radix)) return false; // Could not parse
1231 if ((unsigned short)r != r) return false; // Out of range
1232 if (dest == NULL) return true;
1233 *dest = (unsigned short)r;
1234 return true;
1235 }
1236
1237 template <>
1238 bool Parse(const char* str, size_t n, int* dest, int radix) {
1239 long r;
1240 if (!Parse(str, n, &r, radix)) return false; // Could not parse
1241 if ((int)r != r) return false; // Out of range
1242 if (dest == NULL) return true;
1243 *dest = (int)r;
1244 return true;
1245 }
1246
1247 template <>
1248 bool Parse(const char* str, size_t n, unsigned int* dest, int radix) {
1249 unsigned long r;
1250 if (!Parse(str, n, &r, radix)) return false; // Could not parse
1251 if ((unsigned int)r != r) return false; // Out of range
1252 if (dest == NULL) return true;
1253 *dest = (unsigned int)r;
1254 return true;
1255 }
1256
1257 template <>
1258 bool Parse(const char* str, size_t n, long long* dest, int radix) {
1259 if (n == 0) return false;
1260 char buf[kMaxNumberLength+1];
1261 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1262 char* end;
1263 errno = 0;
1264 long long r = strtoll(str, &end, radix);
1265 if (end != str + n) return false; // Leftover junk
1266 if (errno) return false;
1267 if (dest == NULL) return true;
1268 *dest = r;
1269 return true;
1270 }
1271
1272 template <>
1273 bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) {
1274 if (n == 0) return false;
1275 char buf[kMaxNumberLength+1];
1276 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1277 if (str[0] == '-') {
1278 // strtoull() will silently accept negative numbers and parse
1279 // them. This module is more strict and treats them as errors.
1280 return false;
1281 }
1282 char* end;
1283 errno = 0;
1284 unsigned long long r = strtoull(str, &end, radix);
1285 if (end != str + n) return false; // Leftover junk
1286 if (errno) return false;
1287 if (dest == NULL) return true;
1288 *dest = r;
1289 return true;
1290 }
1291
1292 } // namespace re2_internal
1293
1294 namespace hooks {
1295
1296 #ifdef RE2_HAVE_THREAD_LOCAL
1297 thread_local const RE2* context = NULL;
1298 #endif
1299
1300 template <typename T>
1301 union Hook {
1302 void Store(T* cb) { cb_.store(cb, std::memory_order_release); }
1303 T* Load() const { return cb_.load(std::memory_order_acquire); }
1304
1305 #if !defined(__clang__) && defined(_MSC_VER)
1306 // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent,
1307 // this is a gross hack to make std::atomic<T*> constant-initialized on MSVC.
1308 static_assert(ATOMIC_POINTER_LOCK_FREE == 2,
1309 "std::atomic<T*> must be always lock-free");
1310 T* cb_for_constinit_;
1311 #endif
1312
1313 std::atomic<T*> cb_;
1314 };
1315
1316 template <typename T>
1317 static void DoNothing(const T&) {}
1318
1319 #define DEFINE_HOOK(type, name) \
1320 static Hook<type##Callback> name##_hook = {{&DoNothing<type>}}; \
1321 void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \
1322 type##Callback* Get##type##Hook() { return name##_hook.Load(); }
1323
1324 DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset)
1325 DEFINE_HOOK(DFASearchFailure, dfa_search_failure)
1326
1327 #undef DEFINE_HOOK
1328
1329 } // namespace hooks
1330
1331 } // namespace re2
1332