1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression engine tester -- test all the implementations against each other.
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <string.h>
10 #include <string>
11 
12 #include "util/util.h"
13 #include "util/flags.h"
14 #include "util/logging.h"
15 #include "util/strutil.h"
16 #include "re2/testing/tester.h"
17 #include "re2/prog.h"
18 #include "re2/re2.h"
19 #include "re2/regexp.h"
20 
21 DEFINE_FLAG(bool, dump_prog, false, "dump regexp program");
22 DEFINE_FLAG(bool, log_okay, false, "log successful runs");
23 DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program");
24 
25 DEFINE_FLAG(int, max_regexp_failures, 100,
26             "maximum number of regexp test failures (-1 = unlimited)");
27 
28 DEFINE_FLAG(std::string, regexp_engines, "",
29             "pattern to select regexp engines to test");
30 
31 namespace re2 {
32 
33 enum {
34   kMaxSubmatch = 1+16,  // $0...$16
35 };
36 
37 const char* engine_names[kEngineMax] = {
38   "Backtrack",
39   "NFA",
40   "DFA",
41   "DFA1",
42   "OnePass",
43   "BitState",
44   "RE2",
45   "RE2a",
46   "RE2b",
47   "PCRE",
48 };
49 
50 // Returns the name of the engine.
EngineName(Engine e)51 static const char* EngineName(Engine e) {
52   CHECK_GE(e, 0);
53   CHECK_LT(e, arraysize(engine_names));
54   CHECK(engine_names[e] != NULL);
55   return engine_names[e];
56 }
57 
58 // Returns bit mask of engines to use.
Engines()59 static uint32_t Engines() {
60   static bool did_parse = false;
61   static uint32_t cached_engines = 0;
62 
63   if (did_parse)
64     return cached_engines;
65 
66   if (GetFlag(FLAGS_regexp_engines).empty()) {
67     cached_engines = ~0;
68   } else {
69     for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
70       if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos)
71         cached_engines |= 1<<i;
72   }
73 
74   if (cached_engines == 0)
75     LOG(INFO) << "Warning: no engines enabled.";
76   if (!UsingPCRE)
77     cached_engines &= ~(1<<kEnginePCRE);
78   for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
79     if (cached_engines & (1<<i))
80       LOG(INFO) << EngineName(i) << " enabled";
81   }
82 
83   did_parse = true;
84   return cached_engines;
85 }
86 
87 // The result of running a match.
88 struct TestInstance::Result {
Resultre2::TestInstance::Result89   Result()
90       : skipped(false),
91         matched(false),
92         untrusted(false),
93         have_submatch(false),
94         have_submatch0(false) {
95     ClearSubmatch();
96   }
97 
ClearSubmatchre2::TestInstance::Result98   void ClearSubmatch() {
99     for (int i = 0; i < kMaxSubmatch; i++)
100       submatch[i] = StringPiece();
101   }
102 
103   bool skipped;         // test skipped: wasn't applicable
104   bool matched;         // found a match
105   bool untrusted;       // don't really trust the answer
106   bool have_submatch;   // computed all submatch info
107   bool have_submatch0;  // computed just submatch[0]
108   StringPiece submatch[kMaxSubmatch];
109 };
110 
111 typedef TestInstance::Result Result;
112 
113 // Formats a single capture range s in text in the form (a,b)
114 // where a and b are the starting and ending offsets of s in text.
FormatCapture(const StringPiece & text,const StringPiece & s)115 static std::string FormatCapture(const StringPiece& text,
116                                  const StringPiece& s) {
117   if (s.data() == NULL)
118     return "(?,?)";
119   return StringPrintf("(%td,%td)",
120                       s.begin() - text.begin(),
121                       s.end() - text.begin());
122 }
123 
124 // Returns whether text contains non-ASCII (>= 0x80) bytes.
NonASCII(const StringPiece & text)125 static bool NonASCII(const StringPiece& text) {
126   for (size_t i = 0; i < text.size(); i++)
127     if ((uint8_t)text[i] >= 0x80)
128       return true;
129   return false;
130 }
131 
132 // Returns string representation of match kind.
FormatKind(Prog::MatchKind kind)133 static std::string FormatKind(Prog::MatchKind kind) {
134   switch (kind) {
135     case Prog::kFullMatch:
136       return "full match";
137     case Prog::kLongestMatch:
138       return "longest match";
139     case Prog::kFirstMatch:
140       return "first match";
141     case Prog::kManyMatch:
142       return "many match";
143   }
144   return "???";
145 }
146 
147 // Returns string representation of anchor kind.
FormatAnchor(Prog::Anchor anchor)148 static std::string FormatAnchor(Prog::Anchor anchor) {
149   switch (anchor) {
150     case Prog::kAnchored:
151       return "anchored";
152     case Prog::kUnanchored:
153       return "unanchored";
154   }
155   return "???";
156 }
157 
158 struct ParseMode {
159   Regexp::ParseFlags parse_flags;
160   std::string desc;
161 };
162 
163 static const Regexp::ParseFlags single_line =
164   Regexp::LikePerl;
165 static const Regexp::ParseFlags multi_line =
166   static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
167 
168 static ParseMode parse_modes[] = {
169   { single_line,                   "single-line"          },
170   { single_line|Regexp::Latin1,    "single-line, latin1"  },
171   { multi_line,                    "multiline"            },
172   { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
173   { multi_line|Regexp::Latin1,     "multiline, latin1"    },
174 };
175 
FormatMode(Regexp::ParseFlags flags)176 static std::string FormatMode(Regexp::ParseFlags flags) {
177   for (size_t i = 0; i < arraysize(parse_modes); i++)
178     if (parse_modes[i].parse_flags == flags)
179       return parse_modes[i].desc;
180   return StringPrintf("%#x", static_cast<uint32_t>(flags));
181 }
182 
183 // Constructs and saves all the matching engines that
184 // will be required for the given tests.
TestInstance(const StringPiece & regexp_str,Prog::MatchKind kind,Regexp::ParseFlags flags)185 TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
186                            Regexp::ParseFlags flags)
187   : regexp_str_(regexp_str),
188     kind_(kind),
189     flags_(flags),
190     error_(false),
191     regexp_(NULL),
192     num_captures_(0),
193     prog_(NULL),
194     rprog_(NULL),
195     re_(NULL),
196     re2_(NULL) {
197 
198   VLOG(1) << CEscape(regexp_str);
199 
200   // Compile regexp to prog.
201   // Always required - needed for backtracking (reference implementation).
202   RegexpStatus status;
203   regexp_ = Regexp::Parse(regexp_str, flags, &status);
204   if (regexp_ == NULL) {
205     LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
206               << " mode: " << FormatMode(flags);
207     error_ = true;
208     return;
209   }
210   num_captures_ = regexp_->NumCaptures();
211   prog_ = regexp_->CompileToProg(0);
212   if (prog_ == NULL) {
213     LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
214     error_ = true;
215     return;
216   }
217   if (GetFlag(FLAGS_dump_prog)) {
218     LOG(INFO) << "Prog for "
219               << " regexp "
220               << CEscape(regexp_str_)
221               << " (" << FormatKind(kind_)
222               << ", " << FormatMode(flags_)
223               << ")\n"
224               << prog_->Dump();
225   }
226 
227   // Compile regexp to reversed prog.  Only needed for DFA engines.
228   if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
229     rprog_ = regexp_->CompileToReverseProg(0);
230     if (rprog_ == NULL) {
231       LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
232       error_ = true;
233       return;
234     }
235     if (GetFlag(FLAGS_dump_rprog))
236       LOG(INFO) << rprog_->Dump();
237   }
238 
239   // Create re string that will be used for RE and RE2.
240   std::string re = std::string(regexp_str);
241   // Accomodate flags.
242   // Regexp::Latin1 will be accomodated below.
243   if (!(flags & Regexp::OneLine))
244     re = "(?m)" + re;
245   if (flags & Regexp::NonGreedy)
246     re = "(?U)" + re;
247   if (flags & Regexp::DotNL)
248     re = "(?s)" + re;
249 
250   // Compile regexp to RE2.
251   if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
252     RE2::Options options;
253     if (flags & Regexp::Latin1)
254       options.set_encoding(RE2::Options::EncodingLatin1);
255     if (kind_ == Prog::kLongestMatch)
256       options.set_longest_match(true);
257     re2_ = new RE2(re, options);
258     if (!re2_->error().empty()) {
259       LOG(INFO) << "Cannot RE2: " << CEscape(re);
260       error_ = true;
261       return;
262     }
263   }
264 
265   // Compile regexp to RE.
266   // PCRE as exposed by the RE interface isn't always usable.
267   // 1. It disagrees about handling of empty-string reptitions
268   //    like matching (a*)* against "b".  PCRE treats the (a*) as
269   //    occurring once, while we treat it as occurring not at all.
270   // 2. It treats $ as this weird thing meaning end of string
271   //    or before the \n at the end of the string.
272   // 3. It doesn't implement POSIX leftmost-longest matching.
273   // 4. It lets \s match vertical tab.
274   // MimicsPCRE() detects 1 and 2.
275   if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
276       kind_ != Prog::kLongestMatch) {
277     PCRE_Options o;
278     o.set_option(PCRE::UTF8);
279     if (flags & Regexp::Latin1)
280       o.set_option(PCRE::None);
281     // PCRE has interface bug keeping us from finding $0, so
282     // add one more layer of parens.
283     re_ = new PCRE("("+re+")", o);
284     if (!re_->error().empty()) {
285       LOG(INFO) << "Cannot PCRE: " << CEscape(re);
286       error_ = true;
287       return;
288     }
289   }
290 }
291 
~TestInstance()292 TestInstance::~TestInstance() {
293   if (regexp_)
294     regexp_->Decref();
295   delete prog_;
296   delete rprog_;
297   delete re_;
298   delete re2_;
299 }
300 
301 // Runs a single search using the named engine type.
302 // This interface hides all the irregularities of the various
303 // engine interfaces from the rest of this file.
RunSearch(Engine type,const StringPiece & orig_text,const StringPiece & orig_context,Prog::Anchor anchor,Result * result)304 void TestInstance::RunSearch(Engine type,
305                              const StringPiece& orig_text,
306                              const StringPiece& orig_context,
307                              Prog::Anchor anchor,
308                              Result* result) {
309   if (regexp_ == NULL) {
310     result->skipped = true;
311     return;
312   }
313   int nsubmatch = 1 + num_captures_;  // NumCaptures doesn't count $0
314   if (nsubmatch > kMaxSubmatch)
315     nsubmatch = kMaxSubmatch;
316 
317   StringPiece text = orig_text;
318   StringPiece context = orig_context;
319 
320   switch (type) {
321     default:
322       LOG(FATAL) << "Bad RunSearch type: " << (int)type;
323 
324     case kEngineBacktrack:
325       if (prog_ == NULL) {
326         result->skipped = true;
327         break;
328       }
329       result->matched =
330         prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
331                                      result->submatch, nsubmatch);
332       result->have_submatch = true;
333       break;
334 
335     case kEngineNFA:
336       if (prog_ == NULL) {
337         result->skipped = true;
338         break;
339       }
340       result->matched =
341         prog_->SearchNFA(text, context, anchor, kind_,
342                         result->submatch, nsubmatch);
343       result->have_submatch = true;
344       break;
345 
346     case kEngineDFA:
347       if (prog_ == NULL) {
348         result->skipped = true;
349         break;
350       }
351       result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
352                                          &result->skipped, NULL);
353       break;
354 
355     case kEngineDFA1:
356       if (prog_ == NULL || rprog_ == NULL) {
357         result->skipped = true;
358         break;
359       }
360       result->matched =
361         prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
362                          &result->skipped, NULL);
363       // If anchored, no need for second run,
364       // but do it anyway to find more bugs.
365       if (result->matched) {
366         if (!rprog_->SearchDFA(result->submatch[0], context,
367                                Prog::kAnchored, Prog::kLongestMatch,
368                                result->submatch,
369                                &result->skipped, NULL)) {
370           LOG(ERROR) << "Reverse DFA inconsistency: "
371                      << CEscape(regexp_str_)
372                      << " on " << CEscape(text);
373           result->matched = false;
374         }
375       }
376       result->have_submatch0 = true;
377       break;
378 
379     case kEngineOnePass:
380       if (prog_ == NULL ||
381           !prog_->IsOnePass() ||
382           anchor == Prog::kUnanchored ||
383           nsubmatch > Prog::kMaxOnePassCapture) {
384         result->skipped = true;
385         break;
386       }
387       result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
388                                       result->submatch, nsubmatch);
389       result->have_submatch = true;
390       break;
391 
392     case kEngineBitState:
393       if (prog_ == NULL ||
394           !prog_->CanBitState()) {
395         result->skipped = true;
396         break;
397       }
398       result->matched = prog_->SearchBitState(text, context, anchor, kind_,
399                                               result->submatch, nsubmatch);
400       result->have_submatch = true;
401       break;
402 
403     case kEngineRE2:
404     case kEngineRE2a:
405     case kEngineRE2b: {
406       if (!re2_ || text.end() != context.end()) {
407         result->skipped = true;
408         break;
409       }
410 
411       RE2::Anchor re_anchor;
412       if (anchor == Prog::kAnchored)
413         re_anchor = RE2::ANCHOR_START;
414       else
415         re_anchor = RE2::UNANCHORED;
416       if (kind_ == Prog::kFullMatch)
417         re_anchor = RE2::ANCHOR_BOTH;
418 
419       result->matched = re2_->Match(
420           context,
421           static_cast<size_t>(text.begin() - context.begin()),
422           static_cast<size_t>(text.end() - context.begin()),
423           re_anchor,
424           result->submatch,
425           nsubmatch);
426       result->have_submatch = nsubmatch > 0;
427       break;
428     }
429 
430     case kEnginePCRE: {
431       if (!re_ || text.begin() != context.begin() ||
432           text.end() != context.end()) {
433         result->skipped = true;
434         break;
435       }
436 
437       // In Perl/PCRE, \v matches any character considered vertical
438       // whitespace, not just vertical tab. Regexp::MimicsPCRE() is
439       // unable to handle all cases of this, unfortunately, so just
440       // catch them here. :(
441       if (regexp_str_.find("\\v") != StringPiece::npos &&
442           (text.find('\n') != StringPiece::npos ||
443            text.find('\f') != StringPiece::npos ||
444            text.find('\r') != StringPiece::npos)) {
445         result->skipped = true;
446         break;
447       }
448 
449       // PCRE 8.34 or so started allowing vertical tab to match \s,
450       // following a change made in Perl 5.18. RE2 does not.
451       if ((regexp_str_.find("\\s") != StringPiece::npos ||
452            regexp_str_.find("\\S") != StringPiece::npos) &&
453           text.find('\v') != StringPiece::npos) {
454         result->skipped = true;
455         break;
456       }
457 
458       const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
459       PCRE::Arg *a = new PCRE::Arg[nsubmatch];
460       for (int i = 0; i < nsubmatch; i++) {
461         a[i] = PCRE::Arg(&result->submatch[i]);
462         argptr[i] = &a[i];
463       }
464       size_t consumed;
465       PCRE::Anchor pcre_anchor;
466       if (anchor == Prog::kAnchored)
467         pcre_anchor = PCRE::ANCHOR_START;
468       else
469         pcre_anchor = PCRE::UNANCHORED;
470       if (kind_ == Prog::kFullMatch)
471         pcre_anchor = PCRE::ANCHOR_BOTH;
472       re_->ClearHitLimit();
473       result->matched =
474         re_->DoMatch(text,
475                      pcre_anchor,
476                      &consumed,
477                      argptr, nsubmatch);
478       if (re_->HitLimit()) {
479         result->untrusted = true;
480         delete[] argptr;
481         delete[] a;
482         break;
483       }
484       result->have_submatch = true;
485       delete[] argptr;
486       delete[] a;
487       break;
488     }
489   }
490 
491   if (!result->matched)
492     result->ClearSubmatch();
493 }
494 
495 // Checks whether r is okay given that correct is the right answer.
496 // Specifically, r's answers have to match (but it doesn't have to
497 // claim to have all the answers).
ResultOkay(const Result & r,const Result & correct)498 static bool ResultOkay(const Result& r, const Result& correct) {
499   if (r.skipped)
500     return true;
501   if (r.matched != correct.matched)
502     return false;
503   if (r.have_submatch || r.have_submatch0) {
504     for (int i = 0; i < kMaxSubmatch; i++) {
505       if (correct.submatch[i].data() != r.submatch[i].data() ||
506           correct.submatch[i].size() != r.submatch[i].size())
507         return false;
508       if (!r.have_submatch)
509         break;
510     }
511   }
512   return true;
513 }
514 
515 // Runs a single test.
RunCase(const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)516 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
517                            Prog::Anchor anchor) {
518   // Backtracking is the gold standard.
519   Result correct;
520   RunSearch(kEngineBacktrack, text, context, anchor, &correct);
521   if (correct.skipped) {
522     if (regexp_ == NULL)
523       return true;
524     LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
525                << " " << FormatMode(flags_);
526     return false;
527   }
528   VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
529           << " text " << CEscape(text)
530           << " (" << FormatKind(kind_)
531           << ", " << FormatAnchor(anchor)
532           << ", " << FormatMode(flags_)
533           << ")";
534 
535   // Compare the others.
536   bool all_okay = true;
537   for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
538     if (!(Engines() & (1<<i)))
539       continue;
540 
541     Result r;
542     RunSearch(i, text, context, anchor, &r);
543     if (ResultOkay(r, correct)) {
544       if (GetFlag(FLAGS_log_okay))
545         LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
546       continue;
547     }
548 
549     // We disagree with PCRE on the meaning of some Unicode matches.
550     // In particular, we treat non-ASCII UTF-8 as non-word characters.
551     // We also treat "empty" character sets like [^\w\W] as being
552     // impossible to match, while PCRE apparently excludes some code
553     // points (e.g., 0x0080) from both \w and \W.
554     if (i == kEnginePCRE && NonASCII(text))
555       continue;
556 
557     if (!r.untrusted)
558       all_okay = false;
559 
560     LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
561              context, anchor);
562     if (r.matched != correct.matched) {
563       if (r.matched) {
564         LOG(INFO) << "   Should not match (but does).";
565       } else {
566         LOG(INFO) << "   Should match (but does not).";
567         continue;
568       }
569     }
570     for (int i = 0; i < 1+num_captures_; i++) {
571       if (r.submatch[i].data() != correct.submatch[i].data() ||
572           r.submatch[i].size() != correct.submatch[i].size()) {
573         LOG(INFO) <<
574           StringPrintf("   $%d: should be %s is %s",
575                        i,
576                        FormatCapture(text, correct.submatch[i]).c_str(),
577                        FormatCapture(text, r.submatch[i]).c_str());
578       } else {
579         LOG(INFO) <<
580           StringPrintf("   $%d: %s ok", i,
581                        FormatCapture(text, r.submatch[i]).c_str());
582       }
583     }
584   }
585 
586   if (!all_okay) {
587     // This will be initialised once (after flags have been initialised)
588     // and that is desirable because we want to enforce a global limit.
589     static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures);
590     if (max_regexp_failures > 0 && --max_regexp_failures == 0)
591       LOG(QFATAL) << "Too many regexp failures.";
592   }
593 
594   return all_okay;
595 }
596 
LogMatch(const char * prefix,Engine e,const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)597 void TestInstance::LogMatch(const char* prefix, Engine e,
598                             const StringPiece& text, const StringPiece& context,
599                             Prog::Anchor anchor) {
600   LOG(INFO) << prefix
601     << EngineName(e)
602     << " regexp "
603     << CEscape(regexp_str_)
604     << " "
605     << CEscape(regexp_->ToString())
606     << " text "
607     << CEscape(text)
608     << " ("
609     << text.begin() - context.begin()
610     << ","
611     << text.end() - context.begin()
612     << ") of context "
613     << CEscape(context)
614     << " (" << FormatKind(kind_)
615     << ", " << FormatAnchor(anchor)
616     << ", " << FormatMode(flags_)
617     << ")";
618 }
619 
620 static Prog::MatchKind kinds[] = {
621   Prog::kFirstMatch,
622   Prog::kLongestMatch,
623   Prog::kFullMatch,
624 };
625 
626 // Test all possible match kinds and parse modes.
Tester(const StringPiece & regexp)627 Tester::Tester(const StringPiece& regexp) {
628   error_ = false;
629   for (size_t i = 0; i < arraysize(kinds); i++) {
630     for (size_t j = 0; j < arraysize(parse_modes); j++) {
631       TestInstance* t = new TestInstance(regexp, kinds[i],
632                                          parse_modes[j].parse_flags);
633       error_ |= t->error();
634       v_.push_back(t);
635     }
636   }
637 }
638 
~Tester()639 Tester::~Tester() {
640   for (size_t i = 0; i < v_.size(); i++)
641     delete v_[i];
642 }
643 
TestCase(const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)644 bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
645                          Prog::Anchor anchor) {
646   bool okay = true;
647   for (size_t i = 0; i < v_.size(); i++)
648     okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
649   return okay;
650 }
651 
652 static Prog::Anchor anchors[] = {
653   Prog::kAnchored,
654   Prog::kUnanchored
655 };
656 
TestInput(const StringPiece & text)657 bool Tester::TestInput(const StringPiece& text) {
658   bool okay = TestInputInContext(text, text);
659   if (!text.empty()) {
660     StringPiece sp;
661     sp = text;
662     sp.remove_prefix(1);
663     okay &= TestInputInContext(sp, text);
664     sp = text;
665     sp.remove_suffix(1);
666     okay &= TestInputInContext(sp, text);
667   }
668   return okay;
669 }
670 
TestInputInContext(const StringPiece & text,const StringPiece & context)671 bool Tester::TestInputInContext(const StringPiece& text,
672                                 const StringPiece& context) {
673   bool okay = true;
674   for (size_t i = 0; i < arraysize(anchors); i++)
675     okay &= TestCase(text, context, anchors[i]);
676   return okay;
677 }
678 
TestRegexpOnText(const StringPiece & regexp,const StringPiece & text)679 bool TestRegexpOnText(const StringPiece& regexp,
680                       const StringPiece& text) {
681   Tester t(regexp);
682   return t.TestInput(text);
683 }
684 
685 }  // namespace re2
686