1 // Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression interface RE2.
6 //
7 // Originally the PCRE C++ wrapper, but adapted to use
8 // the new automata-based regular expression engines.
9 
10 #include "re2/re2.h"
11 
12 #include <assert.h>
13 #include <ctype.h>
14 #include <errno.h>
15 #ifdef _MSC_VER
16 #include <intrin.h>
17 #endif
18 #include <stdint.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <algorithm>
22 #include <atomic>
23 #include <iterator>
24 #include <mutex>
25 #include <string>
26 #include <utility>
27 #include <vector>
28 
29 #include "util/util.h"
30 #include "util/logging.h"
31 #include "util/strutil.h"
32 #include "util/utf.h"
33 #include "re2/prog.h"
34 #include "re2/regexp.h"
35 #include "re2/sparse_array.h"
36 
37 namespace re2 {
38 
39 // Maximum number of args we can set
40 static const int kMaxArgs = 16;
41 static const int kVecSize = 1+kMaxArgs;
42 
43 const int RE2::Options::kDefaultMaxMem;  // initialized in re2.h
44 
Options(RE2::CannedOptions opt)45 RE2::Options::Options(RE2::CannedOptions opt)
46   : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
47     posix_syntax_(opt == RE2::POSIX),
48     longest_match_(opt == RE2::POSIX),
49     log_errors_(opt != RE2::Quiet),
50     max_mem_(kDefaultMaxMem),
51     literal_(false),
52     never_nl_(false),
53     dot_nl_(false),
54     never_capture_(false),
55     case_sensitive_(true),
56     perl_classes_(false),
57     word_boundary_(false),
58     one_line_(false) {
59 }
60 
61 // static empty objects for use as const references.
62 // To avoid global constructors, allocated in RE2::Init().
63 static const std::string* empty_string;
64 static const std::map<std::string, int>* empty_named_groups;
65 static const std::map<int, std::string>* empty_group_names;
66 
67 // Converts from Regexp error code to RE2 error code.
68 // Maybe some day they will diverge.  In any event, this
69 // hides the existence of Regexp from RE2 users.
RegexpErrorToRE2(re2::RegexpStatusCode code)70 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
71   switch (code) {
72     case re2::kRegexpSuccess:
73       return RE2::NoError;
74     case re2::kRegexpInternalError:
75       return RE2::ErrorInternal;
76     case re2::kRegexpBadEscape:
77       return RE2::ErrorBadEscape;
78     case re2::kRegexpBadCharClass:
79       return RE2::ErrorBadCharClass;
80     case re2::kRegexpBadCharRange:
81       return RE2::ErrorBadCharRange;
82     case re2::kRegexpMissingBracket:
83       return RE2::ErrorMissingBracket;
84     case re2::kRegexpMissingParen:
85       return RE2::ErrorMissingParen;
86     case re2::kRegexpUnexpectedParen:
87       return RE2::ErrorUnexpectedParen;
88     case re2::kRegexpTrailingBackslash:
89       return RE2::ErrorTrailingBackslash;
90     case re2::kRegexpRepeatArgument:
91       return RE2::ErrorRepeatArgument;
92     case re2::kRegexpRepeatSize:
93       return RE2::ErrorRepeatSize;
94     case re2::kRegexpRepeatOp:
95       return RE2::ErrorRepeatOp;
96     case re2::kRegexpBadPerlOp:
97       return RE2::ErrorBadPerlOp;
98     case re2::kRegexpBadUTF8:
99       return RE2::ErrorBadUTF8;
100     case re2::kRegexpBadNamedCapture:
101       return RE2::ErrorBadNamedCapture;
102   }
103   return RE2::ErrorInternal;
104 }
105 
trunc(const StringPiece & pattern)106 static std::string trunc(const StringPiece& pattern) {
107   if (pattern.size() < 100)
108     return std::string(pattern);
109   return std::string(pattern.substr(0, 100)) + "...";
110 }
111 
112 
RE2(const char * pattern)113 RE2::RE2(const char* pattern) {
114   Init(pattern, DefaultOptions);
115 }
116 
RE2(const std::string & pattern)117 RE2::RE2(const std::string& pattern) {
118   Init(pattern, DefaultOptions);
119 }
120 
RE2(const StringPiece & pattern)121 RE2::RE2(const StringPiece& pattern) {
122   Init(pattern, DefaultOptions);
123 }
124 
RE2(const StringPiece & pattern,const Options & options)125 RE2::RE2(const StringPiece& pattern, const Options& options) {
126   Init(pattern, options);
127 }
128 
ParseFlags() const129 int RE2::Options::ParseFlags() const {
130   int flags = Regexp::ClassNL;
131   switch (encoding()) {
132     default:
133       if (log_errors())
134         LOG(ERROR) << "Unknown encoding " << encoding();
135       break;
136     case RE2::Options::EncodingUTF8:
137       break;
138     case RE2::Options::EncodingLatin1:
139       flags |= Regexp::Latin1;
140       break;
141   }
142 
143   if (!posix_syntax())
144     flags |= Regexp::LikePerl;
145 
146   if (literal())
147     flags |= Regexp::Literal;
148 
149   if (never_nl())
150     flags |= Regexp::NeverNL;
151 
152   if (dot_nl())
153     flags |= Regexp::DotNL;
154 
155   if (never_capture())
156     flags |= Regexp::NeverCapture;
157 
158   if (!case_sensitive())
159     flags |= Regexp::FoldCase;
160 
161   if (perl_classes())
162     flags |= Regexp::PerlClasses;
163 
164   if (word_boundary())
165     flags |= Regexp::PerlB;
166 
167   if (one_line())
168     flags |= Regexp::OneLine;
169 
170   return flags;
171 }
172 
Init(const StringPiece & pattern,const Options & options)173 void RE2::Init(const StringPiece& pattern, const Options& options) {
174   static std::once_flag empty_once;
175   std::call_once(empty_once, []() {
176     empty_string = new std::string;
177     empty_named_groups = new std::map<std::string, int>;
178     empty_group_names = new std::map<int, std::string>;
179   });
180 
181   pattern_.assign(pattern.data(), pattern.size());
182   options_.Copy(options);
183   entire_regexp_ = NULL;
184   error_ = empty_string;
185   error_code_ = NoError;
186   error_arg_.clear();
187   prefix_.clear();
188   prefix_foldcase_ = false;
189   suffix_regexp_ = NULL;
190   prog_ = NULL;
191   num_captures_ = -1;
192   is_one_pass_ = false;
193 
194   rprog_ = NULL;
195   named_groups_ = NULL;
196   group_names_ = NULL;
197 
198   RegexpStatus status;
199   entire_regexp_ = Regexp::Parse(
200     pattern_,
201     static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
202     &status);
203   if (entire_regexp_ == NULL) {
204     if (options_.log_errors()) {
205       LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
206                  << status.Text();
207     }
208     error_ = new std::string(status.Text());
209     error_code_ = RegexpErrorToRE2(status.code());
210     error_arg_ = std::string(status.error_arg());
211     return;
212   }
213 
214   re2::Regexp* suffix;
215   if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
216     suffix_regexp_ = suffix;
217   else
218     suffix_regexp_ = entire_regexp_->Incref();
219 
220   // Two thirds of the memory goes to the forward Prog,
221   // one third to the reverse prog, because the forward
222   // Prog has two DFAs but the reverse prog has one.
223   prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
224   if (prog_ == NULL) {
225     if (options_.log_errors())
226       LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
227     error_ = new std::string("pattern too large - compile failed");
228     error_code_ = RE2::ErrorPatternTooLarge;
229     return;
230   }
231 
232   // We used to compute this lazily, but it's used during the
233   // typical control flow for a match call, so we now compute
234   // it eagerly, which avoids the overhead of std::once_flag.
235   num_captures_ = suffix_regexp_->NumCaptures();
236 
237   // Could delay this until the first match call that
238   // cares about submatch information, but the one-pass
239   // machine's memory gets cut from the DFA memory budget,
240   // and that is harder to do if the DFA has already
241   // been built.
242   is_one_pass_ = prog_->IsOnePass();
243 }
244 
245 // Returns rprog_, computing it if needed.
ReverseProg() const246 re2::Prog* RE2::ReverseProg() const {
247   std::call_once(rprog_once_, [](const RE2* re) {
248     re->rprog_ =
249         re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
250     if (re->rprog_ == NULL) {
251       if (re->options_.log_errors())
252         LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
253       // We no longer touch error_ and error_code_ because failing to compile
254       // the reverse Prog is not a showstopper: falling back to NFA execution
255       // is fine. More importantly, an RE2 object is supposed to be logically
256       // immutable: whatever ok() would have returned after Init() completed,
257       // it should continue to return that no matter what ReverseProg() does.
258     }
259   }, this);
260   return rprog_;
261 }
262 
~RE2()263 RE2::~RE2() {
264   if (suffix_regexp_)
265     suffix_regexp_->Decref();
266   if (entire_regexp_)
267     entire_regexp_->Decref();
268   delete prog_;
269   delete rprog_;
270   if (error_ != empty_string)
271     delete error_;
272   if (named_groups_ != NULL && named_groups_ != empty_named_groups)
273     delete named_groups_;
274   if (group_names_ != NULL &&  group_names_ != empty_group_names)
275     delete group_names_;
276 }
277 
ProgramSize() const278 int RE2::ProgramSize() const {
279   if (prog_ == NULL)
280     return -1;
281   return prog_->size();
282 }
283 
ReverseProgramSize() const284 int RE2::ReverseProgramSize() const {
285   if (prog_ == NULL)
286     return -1;
287   Prog* prog = ReverseProg();
288   if (prog == NULL)
289     return -1;
290   return prog->size();
291 }
292 
293 // Finds the most significant non-zero bit in n.
FindMSBSet(uint32_t n)294 static int FindMSBSet(uint32_t n) {
295   DCHECK_NE(n, 0);
296 #if defined(__GNUC__)
297   return 31 ^ __builtin_clz(n);
298 #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
299   unsigned long c;
300   _BitScanReverse(&c, n);
301   return static_cast<int>(c);
302 #else
303   int c = 0;
304   for (int shift = 1 << 4; shift != 0; shift >>= 1) {
305     uint32_t word = n >> shift;
306     if (word != 0) {
307       n = word;
308       c += shift;
309     }
310   }
311   return c;
312 #endif
313 }
314 
Fanout(Prog * prog,std::vector<int> * histogram)315 static int Fanout(Prog* prog, std::vector<int>* histogram) {
316   SparseArray<int> fanout(prog->size());
317   prog->Fanout(&fanout);
318   int data[32] = {};
319   int size = 0;
320   for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
321     if (i->value() == 0)
322       continue;
323     uint32_t value = i->value();
324     int bucket = FindMSBSet(value);
325     bucket += value & (value-1) ? 1 : 0;
326     ++data[bucket];
327     size = std::max(size, bucket+1);
328   }
329   if (histogram != NULL)
330     histogram->assign(data, data+size);
331   return size-1;
332 }
333 
ProgramFanout(std::vector<int> * histogram) const334 int RE2::ProgramFanout(std::vector<int>* histogram) const {
335   if (prog_ == NULL)
336     return -1;
337   return Fanout(prog_, histogram);
338 }
339 
ReverseProgramFanout(std::vector<int> * histogram) const340 int RE2::ReverseProgramFanout(std::vector<int>* histogram) const {
341   if (prog_ == NULL)
342     return -1;
343   Prog* prog = ReverseProg();
344   if (prog == NULL)
345     return -1;
346   return Fanout(prog, histogram);
347 }
348 
349 // Returns named_groups_, computing it if needed.
NamedCapturingGroups() const350 const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
351   std::call_once(named_groups_once_, [](const RE2* re) {
352     if (re->suffix_regexp_ != NULL)
353       re->named_groups_ = re->suffix_regexp_->NamedCaptures();
354     if (re->named_groups_ == NULL)
355       re->named_groups_ = empty_named_groups;
356   }, this);
357   return *named_groups_;
358 }
359 
360 // Returns group_names_, computing it if needed.
CapturingGroupNames() const361 const std::map<int, std::string>& RE2::CapturingGroupNames() const {
362   std::call_once(group_names_once_, [](const RE2* re) {
363     if (re->suffix_regexp_ != NULL)
364       re->group_names_ = re->suffix_regexp_->CaptureNames();
365     if (re->group_names_ == NULL)
366       re->group_names_ = empty_group_names;
367   }, this);
368   return *group_names_;
369 }
370 
371 /***** Convenience interfaces *****/
372 
FullMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)373 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
374                      const Arg* const args[], int n) {
375   return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
376 }
377 
PartialMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)378 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
379                         const Arg* const args[], int n) {
380   return re.DoMatch(text, UNANCHORED, NULL, args, n);
381 }
382 
ConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)383 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
384                    const Arg* const args[], int n) {
385   size_t consumed;
386   if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
387     input->remove_prefix(consumed);
388     return true;
389   } else {
390     return false;
391   }
392 }
393 
FindAndConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)394 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
395                           const Arg* const args[], int n) {
396   size_t consumed;
397   if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
398     input->remove_prefix(consumed);
399     return true;
400   } else {
401     return false;
402   }
403 }
404 
Replace(std::string * str,const RE2 & re,const StringPiece & rewrite)405 bool RE2::Replace(std::string* str,
406                   const RE2& re,
407                   const StringPiece& rewrite) {
408   StringPiece vec[kVecSize];
409   int nvec = 1 + MaxSubmatch(rewrite);
410   if (nvec > 1 + re.NumberOfCapturingGroups())
411     return false;
412   if (nvec > static_cast<int>(arraysize(vec)))
413     return false;
414   if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
415     return false;
416 
417   std::string s;
418   if (!re.Rewrite(&s, rewrite, vec, nvec))
419     return false;
420 
421   assert(vec[0].data() >= str->data());
422   assert(vec[0].data() + vec[0].size() <= str->data() + str->size());
423   str->replace(vec[0].data() - str->data(), vec[0].size(), s);
424   return true;
425 }
426 
GlobalReplace(std::string * str,const RE2 & re,const StringPiece & rewrite)427 int RE2::GlobalReplace(std::string* str,
428                        const RE2& re,
429                        const StringPiece& rewrite) {
430   StringPiece vec[kVecSize];
431   int nvec = 1 + MaxSubmatch(rewrite);
432   if (nvec > 1 + re.NumberOfCapturingGroups())
433     return false;
434   if (nvec > static_cast<int>(arraysize(vec)))
435     return false;
436 
437   const char* p = str->data();
438   const char* ep = p + str->size();
439   const char* lastend = NULL;
440   std::string out;
441   int count = 0;
442 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
443   // Iterate just once when fuzzing. Otherwise, we easily get bogged down
444   // and coverage is unlikely to improve despite significant expense.
445   while (p == str->data()) {
446 #else
447   while (p <= ep) {
448 #endif
449     if (!re.Match(*str, static_cast<size_t>(p - str->data()),
450                   str->size(), UNANCHORED, vec, nvec))
451       break;
452     if (p < vec[0].data())
453       out.append(p, vec[0].data() - p);
454     if (vec[0].data() == lastend && vec[0].empty()) {
455       // Disallow empty match at end of last match: skip ahead.
456       //
457       // fullrune() takes int, not ptrdiff_t. However, it just looks
458       // at the leading byte and treats any length >= 4 the same.
459       if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
460           fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) {
461         // re is in UTF-8 mode and there is enough left of str
462         // to allow us to advance by up to UTFmax bytes.
463         Rune r;
464         int n = chartorune(&r, p);
465         // Some copies of chartorune have a bug that accepts
466         // encodings of values in (10FFFF, 1FFFFF] as valid.
467         if (r > Runemax) {
468           n = 1;
469           r = Runeerror;
470         }
471         if (!(n == 1 && r == Runeerror)) {  // no decoding error
472           out.append(p, n);
473           p += n;
474           continue;
475         }
476       }
477       // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
478       // we fell through from above and the GIGO principle applies.
479       if (p < ep)
480         out.append(p, 1);
481       p++;
482       continue;
483     }
484     re.Rewrite(&out, rewrite, vec, nvec);
485     p = vec[0].data() + vec[0].size();
486     lastend = p;
487     count++;
488   }
489 
490   if (count == 0)
491     return 0;
492 
493   if (p < ep)
494     out.append(p, ep - p);
495   using std::swap;
496   swap(out, *str);
497   return count;
498 }
499 
500 bool RE2::Extract(const StringPiece& text,
501                   const RE2& re,
502                   const StringPiece& rewrite,
503                   std::string* out) {
504   StringPiece vec[kVecSize];
505   int nvec = 1 + MaxSubmatch(rewrite);
506   if (nvec > 1 + re.NumberOfCapturingGroups())
507     return false;
508   if (nvec > static_cast<int>(arraysize(vec)))
509     return false;
510   if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
511     return false;
512 
513   out->clear();
514   return re.Rewrite(out, rewrite, vec, nvec);
515 }
516 
517 std::string RE2::QuoteMeta(const StringPiece& unquoted) {
518   std::string result;
519   result.reserve(unquoted.size() << 1);
520 
521   // Escape any ascii character not in [A-Za-z_0-9].
522   //
523   // Note that it's legal to escape a character even if it has no
524   // special meaning in a regular expression -- so this function does
525   // that.  (This also makes it identical to the perl function of the
526   // same name except for the null-character special case;
527   // see `perldoc -f quotemeta`.)
528   for (size_t ii = 0; ii < unquoted.size(); ++ii) {
529     // Note that using 'isalnum' here raises the benchmark time from
530     // 32ns to 58ns:
531     if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
532         (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
533         (unquoted[ii] < '0' || unquoted[ii] > '9') &&
534         unquoted[ii] != '_' &&
535         // If this is the part of a UTF8 or Latin1 character, we need
536         // to copy this byte without escaping.  Experimentally this is
537         // what works correctly with the regexp library.
538         !(unquoted[ii] & 128)) {
539       if (unquoted[ii] == '\0') {  // Special handling for null chars.
540         // Note that this special handling is not strictly required for RE2,
541         // but this quoting is required for other regexp libraries such as
542         // PCRE.
543         // Can't use "\\0" since the next character might be a digit.
544         result += "\\x00";
545         continue;
546       }
547       result += '\\';
548     }
549     result += unquoted[ii];
550   }
551 
552   return result;
553 }
554 
555 bool RE2::PossibleMatchRange(std::string* min, std::string* max,
556                              int maxlen) const {
557   if (prog_ == NULL)
558     return false;
559 
560   int n = static_cast<int>(prefix_.size());
561   if (n > maxlen)
562     n = maxlen;
563 
564   // Determine initial min max from prefix_ literal.
565   *min = prefix_.substr(0, n);
566   *max = prefix_.substr(0, n);
567   if (prefix_foldcase_) {
568     // prefix is ASCII lowercase; change *min to uppercase.
569     for (int i = 0; i < n; i++) {
570       char& c = (*min)[i];
571       if ('a' <= c && c <= 'z')
572         c += 'A' - 'a';
573     }
574   }
575 
576   // Add to prefix min max using PossibleMatchRange on regexp.
577   std::string dmin, dmax;
578   maxlen -= n;
579   if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
580     min->append(dmin);
581     max->append(dmax);
582   } else if (!max->empty()) {
583     // prog_->PossibleMatchRange has failed us,
584     // but we still have useful information from prefix_.
585     // Round up *max to allow any possible suffix.
586     PrefixSuccessor(max);
587   } else {
588     // Nothing useful.
589     *min = "";
590     *max = "";
591     return false;
592   }
593 
594   return true;
595 }
596 
597 // Avoid possible locale nonsense in standard strcasecmp.
598 // The string a is known to be all lowercase.
599 static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
600   const char* ae = a + len;
601 
602   for (; a < ae; a++, b++) {
603     uint8_t x = *a;
604     uint8_t y = *b;
605     if ('A' <= y && y <= 'Z')
606       y += 'a' - 'A';
607     if (x != y)
608       return x - y;
609   }
610   return 0;
611 }
612 
613 
614 /***** Actual matching and rewriting code *****/
615 
616 bool RE2::Match(const StringPiece& text,
617                 size_t startpos,
618                 size_t endpos,
619                 Anchor re_anchor,
620                 StringPiece* submatch,
621                 int nsubmatch) const {
622   if (!ok()) {
623     if (options_.log_errors())
624       LOG(ERROR) << "Invalid RE2: " << *error_;
625     return false;
626   }
627 
628   if (startpos > endpos || endpos > text.size()) {
629     if (options_.log_errors())
630       LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
631                  << "startpos: " << startpos << ", "
632                  << "endpos: " << endpos << ", "
633                  << "text size: " << text.size() << "]";
634     return false;
635   }
636 
637   StringPiece subtext = text;
638   subtext.remove_prefix(startpos);
639   subtext.remove_suffix(text.size() - endpos);
640 
641   // Use DFAs to find exact location of match, filter out non-matches.
642 
643   // Don't ask for the location if we won't use it.
644   // SearchDFA can do extra optimizations in that case.
645   StringPiece match;
646   StringPiece* matchp = &match;
647   if (nsubmatch == 0)
648     matchp = NULL;
649 
650   int ncap = 1 + NumberOfCapturingGroups();
651   if (ncap > nsubmatch)
652     ncap = nsubmatch;
653 
654   // If the regexp is anchored explicitly, must not be in middle of text.
655   if (prog_->anchor_start() && startpos != 0)
656     return false;
657   if (prog_->anchor_end() && endpos != text.size())
658     return false;
659 
660   // If the regexp is anchored explicitly, update re_anchor
661   // so that we can potentially fall into a faster case below.
662   if (prog_->anchor_start() && prog_->anchor_end())
663     re_anchor = ANCHOR_BOTH;
664   else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
665     re_anchor = ANCHOR_START;
666 
667   // Check for the required prefix, if any.
668   size_t prefixlen = 0;
669   if (!prefix_.empty()) {
670     if (startpos != 0)
671       return false;
672     prefixlen = prefix_.size();
673     if (prefixlen > subtext.size())
674       return false;
675     if (prefix_foldcase_) {
676       if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
677         return false;
678     } else {
679       if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
680         return false;
681     }
682     subtext.remove_prefix(prefixlen);
683     // If there is a required prefix, the anchor must be at least ANCHOR_START.
684     if (re_anchor != ANCHOR_BOTH)
685       re_anchor = ANCHOR_START;
686   }
687 
688   Prog::Anchor anchor = Prog::kUnanchored;
689   Prog::MatchKind kind = Prog::kFirstMatch;
690   if (options_.longest_match())
691     kind = Prog::kLongestMatch;
692 
693   bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
694 
695   // BitState allocates a bitmap of size prog_->list_count() * text.size().
696   // It also allocates a stack of 3-word structures which could potentially
697   // grow as large as prog_->list_count() * text.size(), but in practice is
698   // much smaller.
699   const int kMaxBitStateBitmapSize = 256*1024;  // bitmap size <= max (bits)
700   bool can_bit_state = prog_->CanBitState();
701   size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count();
702 
703 #ifdef RE2_HAVE_THREAD_LOCAL
704   hooks::context = this;
705 #endif
706   bool dfa_failed = false;
707   bool skipped_test = false;
708   switch (re_anchor) {
709     default:
710       LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor;
711       return false;
712 
713     case UNANCHORED: {
714       if (prog_->anchor_end()) {
715         // This is a very special case: we don't need the forward DFA because
716         // we already know where the match must end! Instead, the reverse DFA
717         // can say whether there is a match and (optionally) where it starts.
718         Prog* prog = ReverseProg();
719         if (prog == NULL) {
720           // Fall back to NFA below.
721           skipped_test = true;
722           break;
723         }
724         if (!prog->SearchDFA(subtext, text, Prog::kAnchored,
725                              Prog::kLongestMatch, matchp, &dfa_failed, NULL)) {
726           if (dfa_failed) {
727             if (options_.log_errors())
728               LOG(ERROR) << "DFA out of memory: "
729                          << "pattern length " << pattern_.size() << ", "
730                          << "program size " << prog->size() << ", "
731                          << "list count " << prog->list_count() << ", "
732                          << "bytemap range " << prog->bytemap_range();
733             // Fall back to NFA below.
734             skipped_test = true;
735             break;
736           }
737           return false;
738         }
739         if (matchp == NULL)  // Matched.  Don't care where.
740           return true;
741         break;
742       }
743 
744       if (!prog_->SearchDFA(subtext, text, anchor, kind,
745                             matchp, &dfa_failed, NULL)) {
746         if (dfa_failed) {
747           if (options_.log_errors())
748             LOG(ERROR) << "DFA out of memory: "
749                        << "pattern length " << pattern_.size() << ", "
750                        << "program size " << prog_->size() << ", "
751                        << "list count " << prog_->list_count() << ", "
752                        << "bytemap range " << prog_->bytemap_range();
753           // Fall back to NFA below.
754           skipped_test = true;
755           break;
756         }
757         return false;
758       }
759       if (matchp == NULL)  // Matched.  Don't care where.
760         return true;
761       // SearchDFA set match.end() but didn't know where the
762       // match started.  Run the regexp backward from match.end()
763       // to find the longest possible match -- that's where it started.
764       Prog* prog = ReverseProg();
765       if (prog == NULL) {
766         // Fall back to NFA below.
767         skipped_test = true;
768         break;
769       }
770       if (!prog->SearchDFA(match, text, Prog::kAnchored,
771                            Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
772         if (dfa_failed) {
773           if (options_.log_errors())
774             LOG(ERROR) << "DFA out of memory: "
775                        << "pattern length " << pattern_.size() << ", "
776                        << "program size " << prog->size() << ", "
777                        << "list count " << prog->list_count() << ", "
778                        << "bytemap range " << prog->bytemap_range();
779           // Fall back to NFA below.
780           skipped_test = true;
781           break;
782         }
783         if (options_.log_errors())
784           LOG(ERROR) << "SearchDFA inconsistency";
785         return false;
786       }
787       break;
788     }
789 
790     case ANCHOR_BOTH:
791     case ANCHOR_START:
792       if (re_anchor == ANCHOR_BOTH)
793         kind = Prog::kFullMatch;
794       anchor = Prog::kAnchored;
795 
796       // If only a small amount of text and need submatch
797       // information anyway and we're going to use OnePass or BitState
798       // to get it, we might as well not even bother with the DFA:
799       // OnePass or BitState will be fast enough.
800       // On tiny texts, OnePass outruns even the DFA, and
801       // it doesn't have the shared state and occasional mutex that
802       // the DFA does.
803       if (can_one_pass && text.size() <= 4096 &&
804           (ncap > 1 || text.size() <= 16)) {
805         skipped_test = true;
806         break;
807       }
808       if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
809         skipped_test = true;
810         break;
811       }
812       if (!prog_->SearchDFA(subtext, text, anchor, kind,
813                             &match, &dfa_failed, NULL)) {
814         if (dfa_failed) {
815           if (options_.log_errors())
816             LOG(ERROR) << "DFA out of memory: "
817                        << "pattern length " << pattern_.size() << ", "
818                        << "program size " << prog_->size() << ", "
819                        << "list count " << prog_->list_count() << ", "
820                        << "bytemap range " << prog_->bytemap_range();
821           // Fall back to NFA below.
822           skipped_test = true;
823           break;
824         }
825         return false;
826       }
827       break;
828   }
829 
830   if (!skipped_test && ncap <= 1) {
831     // We know exactly where it matches.  That's enough.
832     if (ncap == 1)
833       submatch[0] = match;
834   } else {
835     StringPiece subtext1;
836     if (skipped_test) {
837       // DFA ran out of memory or was skipped:
838       // need to search in entire original text.
839       subtext1 = subtext;
840     } else {
841       // DFA found the exact match location:
842       // let NFA run an anchored, full match search
843       // to find submatch locations.
844       subtext1 = match;
845       anchor = Prog::kAnchored;
846       kind = Prog::kFullMatch;
847     }
848 
849     if (can_one_pass && anchor != Prog::kUnanchored) {
850       if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
851         if (!skipped_test && options_.log_errors())
852           LOG(ERROR) << "SearchOnePass inconsistency";
853         return false;
854       }
855     } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
856       if (!prog_->SearchBitState(subtext1, text, anchor,
857                                  kind, submatch, ncap)) {
858         if (!skipped_test && options_.log_errors())
859           LOG(ERROR) << "SearchBitState inconsistency";
860         return false;
861       }
862     } else {
863       if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
864         if (!skipped_test && options_.log_errors())
865           LOG(ERROR) << "SearchNFA inconsistency";
866         return false;
867       }
868     }
869   }
870 
871   // Adjust overall match for required prefix that we stripped off.
872   if (prefixlen > 0 && nsubmatch > 0)
873     submatch[0] = StringPiece(submatch[0].data() - prefixlen,
874                               submatch[0].size() + prefixlen);
875 
876   // Zero submatches that don't exist in the regexp.
877   for (int i = ncap; i < nsubmatch; i++)
878     submatch[i] = StringPiece();
879   return true;
880 }
881 
882 // Internal matcher - like Match() but takes Args not StringPieces.
883 bool RE2::DoMatch(const StringPiece& text,
884                   Anchor re_anchor,
885                   size_t* consumed,
886                   const Arg* const* args,
887                   int n) const {
888   if (!ok()) {
889     if (options_.log_errors())
890       LOG(ERROR) << "Invalid RE2: " << *error_;
891     return false;
892   }
893 
894   if (NumberOfCapturingGroups() < n) {
895     // RE has fewer capturing groups than number of Arg pointers passed in.
896     return false;
897   }
898 
899   // Count number of capture groups needed.
900   int nvec;
901   if (n == 0 && consumed == NULL)
902     nvec = 0;
903   else
904     nvec = n+1;
905 
906   StringPiece* vec;
907   StringPiece stkvec[kVecSize];
908   StringPiece* heapvec = NULL;
909 
910   if (nvec <= static_cast<int>(arraysize(stkvec))) {
911     vec = stkvec;
912   } else {
913     vec = new StringPiece[nvec];
914     heapvec = vec;
915   }
916 
917   if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
918     delete[] heapvec;
919     return false;
920   }
921 
922   if (consumed != NULL)
923     *consumed = static_cast<size_t>(vec[0].end() - text.begin());
924 
925   if (n == 0 || args == NULL) {
926     // We are not interested in results
927     delete[] heapvec;
928     return true;
929   }
930 
931   // If we got here, we must have matched the whole pattern.
932   for (int i = 0; i < n; i++) {
933     const StringPiece& s = vec[i+1];
934     if (!args[i]->Parse(s.data(), s.size())) {
935       // TODO: Should we indicate what the error was?
936       delete[] heapvec;
937       return false;
938     }
939   }
940 
941   delete[] heapvec;
942   return true;
943 }
944 
945 // Checks that the rewrite string is well-formed with respect to this
946 // regular expression.
947 bool RE2::CheckRewriteString(const StringPiece& rewrite,
948                              std::string* error) const {
949   int max_token = -1;
950   for (const char *s = rewrite.data(), *end = s + rewrite.size();
951        s < end; s++) {
952     int c = *s;
953     if (c != '\\') {
954       continue;
955     }
956     if (++s == end) {
957       *error = "Rewrite schema error: '\\' not allowed at end.";
958       return false;
959     }
960     c = *s;
961     if (c == '\\') {
962       continue;
963     }
964     if (!isdigit(c)) {
965       *error = "Rewrite schema error: "
966                "'\\' must be followed by a digit or '\\'.";
967       return false;
968     }
969     int n = (c - '0');
970     if (max_token < n) {
971       max_token = n;
972     }
973   }
974 
975   if (max_token > NumberOfCapturingGroups()) {
976     *error = StringPrintf(
977         "Rewrite schema requests %d matches, but the regexp only has %d "
978         "parenthesized subexpressions.",
979         max_token, NumberOfCapturingGroups());
980     return false;
981   }
982   return true;
983 }
984 
985 // Returns the maximum submatch needed for the rewrite to be done by Replace().
986 // E.g. if rewrite == "foo \\2,\\1", returns 2.
987 int RE2::MaxSubmatch(const StringPiece& rewrite) {
988   int max = 0;
989   for (const char *s = rewrite.data(), *end = s + rewrite.size();
990        s < end; s++) {
991     if (*s == '\\') {
992       s++;
993       int c = (s < end) ? *s : -1;
994       if (isdigit(c)) {
995         int n = (c - '0');
996         if (n > max)
997           max = n;
998       }
999     }
1000   }
1001   return max;
1002 }
1003 
1004 // Append the "rewrite" string, with backslash subsitutions from "vec",
1005 // to string "out".
1006 bool RE2::Rewrite(std::string* out,
1007                   const StringPiece& rewrite,
1008                   const StringPiece* vec,
1009                   int veclen) const {
1010   for (const char *s = rewrite.data(), *end = s + rewrite.size();
1011        s < end; s++) {
1012     if (*s != '\\') {
1013       out->push_back(*s);
1014       continue;
1015     }
1016     s++;
1017     int c = (s < end) ? *s : -1;
1018     if (isdigit(c)) {
1019       int n = (c - '0');
1020       if (n >= veclen) {
1021         if (options_.log_errors()) {
1022           LOG(ERROR) << "invalid substitution \\" << n
1023                      << " from " << veclen << " groups";
1024         }
1025         return false;
1026       }
1027       StringPiece snip = vec[n];
1028       if (!snip.empty())
1029         out->append(snip.data(), snip.size());
1030     } else if (c == '\\') {
1031       out->push_back('\\');
1032     } else {
1033       if (options_.log_errors())
1034         LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
1035       return false;
1036     }
1037   }
1038   return true;
1039 }
1040 
1041 /***** Parsers for various types *****/
1042 
1043 namespace re2_internal {
1044 
1045 template <>
1046 bool Parse(const char* str, size_t n, void* dest) {
1047   // We fail if somebody asked us to store into a non-NULL void* pointer
1048   return (dest == NULL);
1049 }
1050 
1051 template <>
1052 bool Parse(const char* str, size_t n, std::string* dest) {
1053   if (dest == NULL) return true;
1054   dest->assign(str, n);
1055   return true;
1056 }
1057 
1058 template <>
1059 bool Parse(const char* str, size_t n, StringPiece* dest) {
1060   if (dest == NULL) return true;
1061   *dest = StringPiece(str, n);
1062   return true;
1063 }
1064 
1065 template <>
1066 bool Parse(const char* str, size_t n, char* dest) {
1067   if (n != 1) return false;
1068   if (dest == NULL) return true;
1069   *dest = str[0];
1070   return true;
1071 }
1072 
1073 template <>
1074 bool Parse(const char* str, size_t n, signed char* dest) {
1075   if (n != 1) return false;
1076   if (dest == NULL) return true;
1077   *dest = str[0];
1078   return true;
1079 }
1080 
1081 template <>
1082 bool Parse(const char* str, size_t n, unsigned char* dest) {
1083   if (n != 1) return false;
1084   if (dest == NULL) return true;
1085   *dest = str[0];
1086   return true;
1087 }
1088 
1089 // Largest number spec that we are willing to parse
1090 static const int kMaxNumberLength = 32;
1091 
1092 // REQUIRES "buf" must have length at least nbuf.
1093 // Copies "str" into "buf" and null-terminates.
1094 // Overwrites *np with the new length.
1095 static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
1096                                    size_t* np, bool accept_spaces) {
1097   size_t n = *np;
1098   if (n == 0) return "";
1099   if (n > 0 && isspace(*str)) {
1100     // We are less forgiving than the strtoxxx() routines and do not
1101     // allow leading spaces. We do allow leading spaces for floats.
1102     if (!accept_spaces) {
1103       return "";
1104     }
1105     while (n > 0 && isspace(*str)) {
1106       n--;
1107       str++;
1108     }
1109   }
1110 
1111   // Although buf has a fixed maximum size, we can still handle
1112   // arbitrarily large integers correctly by omitting leading zeros.
1113   // (Numbers that are still too long will be out of range.)
1114   // Before deciding whether str is too long,
1115   // remove leading zeros with s/000+/00/.
1116   // Leaving the leading two zeros in place means that
1117   // we don't change 0000x123 (invalid) into 0x123 (valid).
1118   // Skip over leading - before replacing.
1119   bool neg = false;
1120   if (n >= 1 && str[0] == '-') {
1121     neg = true;
1122     n--;
1123     str++;
1124   }
1125 
1126   if (n >= 3 && str[0] == '0' && str[1] == '0') {
1127     while (n >= 3 && str[2] == '0') {
1128       n--;
1129       str++;
1130     }
1131   }
1132 
1133   if (neg) {  // make room in buf for -
1134     n++;
1135     str--;
1136   }
1137 
1138   if (n > nbuf-1) return "";
1139 
1140   memmove(buf, str, n);
1141   if (neg) {
1142     buf[0] = '-';
1143   }
1144   buf[n] = '\0';
1145   *np = n;
1146   return buf;
1147 }
1148 
1149 template <>
1150 bool Parse(const char* str, size_t n, float* dest) {
1151   if (n == 0) return false;
1152   static const int kMaxLength = 200;
1153   char buf[kMaxLength+1];
1154   str = TerminateNumber(buf, sizeof buf, str, &n, true);
1155   char* end;
1156   errno = 0;
1157   float r = strtof(str, &end);
1158   if (end != str + n) return false;   // Leftover junk
1159   if (errno) return false;
1160   if (dest == NULL) return true;
1161   *dest = r;
1162   return true;
1163 }
1164 
1165 template <>
1166 bool Parse(const char* str, size_t n, double* dest) {
1167   if (n == 0) return false;
1168   static const int kMaxLength = 200;
1169   char buf[kMaxLength+1];
1170   str = TerminateNumber(buf, sizeof buf, str, &n, true);
1171   char* end;
1172   errno = 0;
1173   double r = strtod(str, &end);
1174   if (end != str + n) return false;   // Leftover junk
1175   if (errno) return false;
1176   if (dest == NULL) return true;
1177   *dest = r;
1178   return true;
1179 }
1180 
1181 template <>
1182 bool Parse(const char* str, size_t n, long* dest, int radix) {
1183   if (n == 0) return false;
1184   char buf[kMaxNumberLength+1];
1185   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1186   char* end;
1187   errno = 0;
1188   long r = strtol(str, &end, radix);
1189   if (end != str + n) return false;   // Leftover junk
1190   if (errno) return false;
1191   if (dest == NULL) return true;
1192   *dest = r;
1193   return true;
1194 }
1195 
1196 template <>
1197 bool Parse(const char* str, size_t n, unsigned long* dest, int radix) {
1198   if (n == 0) return false;
1199   char buf[kMaxNumberLength+1];
1200   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1201   if (str[0] == '-') {
1202     // strtoul() will silently accept negative numbers and parse
1203     // them.  This module is more strict and treats them as errors.
1204     return false;
1205   }
1206 
1207   char* end;
1208   errno = 0;
1209   unsigned long r = strtoul(str, &end, radix);
1210   if (end != str + n) return false;   // Leftover junk
1211   if (errno) return false;
1212   if (dest == NULL) return true;
1213   *dest = r;
1214   return true;
1215 }
1216 
1217 template <>
1218 bool Parse(const char* str, size_t n, short* dest, int radix) {
1219   long r;
1220   if (!Parse(str, n, &r, radix)) return false;  // Could not parse
1221   if ((short)r != r) return false;              // Out of range
1222   if (dest == NULL) return true;
1223   *dest = (short)r;
1224   return true;
1225 }
1226 
1227 template <>
1228 bool Parse(const char* str, size_t n, unsigned short* dest, int radix) {
1229   unsigned long r;
1230   if (!Parse(str, n, &r, radix)) return false;  // Could not parse
1231   if ((unsigned short)r != r) return false;     // Out of range
1232   if (dest == NULL) return true;
1233   *dest = (unsigned short)r;
1234   return true;
1235 }
1236 
1237 template <>
1238 bool Parse(const char* str, size_t n, int* dest, int radix) {
1239   long r;
1240   if (!Parse(str, n, &r, radix)) return false;  // Could not parse
1241   if ((int)r != r) return false;                // Out of range
1242   if (dest == NULL) return true;
1243   *dest = (int)r;
1244   return true;
1245 }
1246 
1247 template <>
1248 bool Parse(const char* str, size_t n, unsigned int* dest, int radix) {
1249   unsigned long r;
1250   if (!Parse(str, n, &r, radix)) return false;  // Could not parse
1251   if ((unsigned int)r != r) return false;       // Out of range
1252   if (dest == NULL) return true;
1253   *dest = (unsigned int)r;
1254   return true;
1255 }
1256 
1257 template <>
1258 bool Parse(const char* str, size_t n, long long* dest, int radix) {
1259   if (n == 0) return false;
1260   char buf[kMaxNumberLength+1];
1261   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1262   char* end;
1263   errno = 0;
1264   long long r = strtoll(str, &end, radix);
1265   if (end != str + n) return false;   // Leftover junk
1266   if (errno) return false;
1267   if (dest == NULL) return true;
1268   *dest = r;
1269   return true;
1270 }
1271 
1272 template <>
1273 bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) {
1274   if (n == 0) return false;
1275   char buf[kMaxNumberLength+1];
1276   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1277   if (str[0] == '-') {
1278     // strtoull() will silently accept negative numbers and parse
1279     // them.  This module is more strict and treats them as errors.
1280     return false;
1281   }
1282   char* end;
1283   errno = 0;
1284   unsigned long long r = strtoull(str, &end, radix);
1285   if (end != str + n) return false;   // Leftover junk
1286   if (errno) return false;
1287   if (dest == NULL) return true;
1288   *dest = r;
1289   return true;
1290 }
1291 
1292 }  // namespace re2_internal
1293 
1294 namespace hooks {
1295 
1296 #ifdef RE2_HAVE_THREAD_LOCAL
1297 thread_local const RE2* context = NULL;
1298 #endif
1299 
1300 template <typename T>
1301 union Hook {
1302   void Store(T* cb) { cb_.store(cb, std::memory_order_release); }
1303   T* Load() const { return cb_.load(std::memory_order_acquire); }
1304 
1305 #if !defined(__clang__) && defined(_MSC_VER)
1306   // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent,
1307   // this is a gross hack to make std::atomic<T*> constant-initialized on MSVC.
1308   static_assert(ATOMIC_POINTER_LOCK_FREE == 2,
1309                 "std::atomic<T*> must be always lock-free");
1310   T* cb_for_constinit_;
1311 #endif
1312 
1313   std::atomic<T*> cb_;
1314 };
1315 
1316 template <typename T>
1317 static void DoNothing(const T&) {}
1318 
1319 #define DEFINE_HOOK(type, name)                                       \
1320   static Hook<type##Callback> name##_hook = {{&DoNothing<type>}};     \
1321   void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \
1322   type##Callback* Get##type##Hook() { return name##_hook.Load(); }
1323 
1324 DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset)
1325 DEFINE_HOOK(DFASearchFailure, dfa_search_failure)
1326 
1327 #undef DEFINE_HOOK
1328 
1329 }  // namespace hooks
1330 
1331 }  // namespace re2
1332