1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression engine tester -- test all the implementations against each other.
6
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <string.h>
10 #include <string>
11
12 #include "util/util.h"
13 #include "util/flags.h"
14 #include "util/logging.h"
15 #include "util/strutil.h"
16 #include "re2/testing/tester.h"
17 #include "re2/prog.h"
18 #include "re2/re2.h"
19 #include "re2/regexp.h"
20
21 DEFINE_FLAG(bool, dump_prog, false, "dump regexp program");
22 DEFINE_FLAG(bool, log_okay, false, "log successful runs");
23 DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program");
24
25 DEFINE_FLAG(int, max_regexp_failures, 100,
26 "maximum number of regexp test failures (-1 = unlimited)");
27
28 DEFINE_FLAG(std::string, regexp_engines, "",
29 "pattern to select regexp engines to test");
30
31 namespace re2 {
32
33 enum {
34 kMaxSubmatch = 1+16, // $0...$16
35 };
36
37 const char* engine_names[kEngineMax] = {
38 "Backtrack",
39 "NFA",
40 "DFA",
41 "DFA1",
42 "OnePass",
43 "BitState",
44 "RE2",
45 "RE2a",
46 "RE2b",
47 "PCRE",
48 };
49
50 // Returns the name of the engine.
EngineName(Engine e)51 static const char* EngineName(Engine e) {
52 CHECK_GE(e, 0);
53 CHECK_LT(e, arraysize(engine_names));
54 CHECK(engine_names[e] != NULL);
55 return engine_names[e];
56 }
57
58 // Returns bit mask of engines to use.
Engines()59 static uint32_t Engines() {
60 static bool did_parse = false;
61 static uint32_t cached_engines = 0;
62
63 if (did_parse)
64 return cached_engines;
65
66 if (GetFlag(FLAGS_regexp_engines).empty()) {
67 cached_engines = ~0;
68 } else {
69 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
70 if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos)
71 cached_engines |= 1<<i;
72 }
73
74 if (cached_engines == 0)
75 LOG(INFO) << "Warning: no engines enabled.";
76 if (!UsingPCRE)
77 cached_engines &= ~(1<<kEnginePCRE);
78 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
79 if (cached_engines & (1<<i))
80 LOG(INFO) << EngineName(i) << " enabled";
81 }
82
83 did_parse = true;
84 return cached_engines;
85 }
86
87 // The result of running a match.
88 struct TestInstance::Result {
Resultre2::TestInstance::Result89 Result()
90 : skipped(false),
91 matched(false),
92 untrusted(false),
93 have_submatch(false),
94 have_submatch0(false) {
95 ClearSubmatch();
96 }
97
ClearSubmatchre2::TestInstance::Result98 void ClearSubmatch() {
99 for (int i = 0; i < kMaxSubmatch; i++)
100 submatch[i] = StringPiece();
101 }
102
103 bool skipped; // test skipped: wasn't applicable
104 bool matched; // found a match
105 bool untrusted; // don't really trust the answer
106 bool have_submatch; // computed all submatch info
107 bool have_submatch0; // computed just submatch[0]
108 StringPiece submatch[kMaxSubmatch];
109 };
110
111 typedef TestInstance::Result Result;
112
113 // Formats a single capture range s in text in the form (a,b)
114 // where a and b are the starting and ending offsets of s in text.
FormatCapture(const StringPiece & text,const StringPiece & s)115 static std::string FormatCapture(const StringPiece& text,
116 const StringPiece& s) {
117 if (s.data() == NULL)
118 return "(?,?)";
119 return StringPrintf("(%td,%td)",
120 s.begin() - text.begin(),
121 s.end() - text.begin());
122 }
123
124 // Returns whether text contains non-ASCII (>= 0x80) bytes.
NonASCII(const StringPiece & text)125 static bool NonASCII(const StringPiece& text) {
126 for (size_t i = 0; i < text.size(); i++)
127 if ((uint8_t)text[i] >= 0x80)
128 return true;
129 return false;
130 }
131
132 // Returns string representation of match kind.
FormatKind(Prog::MatchKind kind)133 static std::string FormatKind(Prog::MatchKind kind) {
134 switch (kind) {
135 case Prog::kFullMatch:
136 return "full match";
137 case Prog::kLongestMatch:
138 return "longest match";
139 case Prog::kFirstMatch:
140 return "first match";
141 case Prog::kManyMatch:
142 return "many match";
143 }
144 return "???";
145 }
146
147 // Returns string representation of anchor kind.
FormatAnchor(Prog::Anchor anchor)148 static std::string FormatAnchor(Prog::Anchor anchor) {
149 switch (anchor) {
150 case Prog::kAnchored:
151 return "anchored";
152 case Prog::kUnanchored:
153 return "unanchored";
154 }
155 return "???";
156 }
157
158 struct ParseMode {
159 Regexp::ParseFlags parse_flags;
160 std::string desc;
161 };
162
163 static const Regexp::ParseFlags single_line =
164 Regexp::LikePerl;
165 static const Regexp::ParseFlags multi_line =
166 static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
167
168 static ParseMode parse_modes[] = {
169 { single_line, "single-line" },
170 { single_line|Regexp::Latin1, "single-line, latin1" },
171 { multi_line, "multiline" },
172 { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
173 { multi_line|Regexp::Latin1, "multiline, latin1" },
174 };
175
FormatMode(Regexp::ParseFlags flags)176 static std::string FormatMode(Regexp::ParseFlags flags) {
177 for (size_t i = 0; i < arraysize(parse_modes); i++)
178 if (parse_modes[i].parse_flags == flags)
179 return parse_modes[i].desc;
180 return StringPrintf("%#x", static_cast<uint32_t>(flags));
181 }
182
183 // Constructs and saves all the matching engines that
184 // will be required for the given tests.
TestInstance(const StringPiece & regexp_str,Prog::MatchKind kind,Regexp::ParseFlags flags)185 TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
186 Regexp::ParseFlags flags)
187 : regexp_str_(regexp_str),
188 kind_(kind),
189 flags_(flags),
190 error_(false),
191 regexp_(NULL),
192 num_captures_(0),
193 prog_(NULL),
194 rprog_(NULL),
195 re_(NULL),
196 re2_(NULL) {
197
198 VLOG(1) << CEscape(regexp_str);
199
200 // Compile regexp to prog.
201 // Always required - needed for backtracking (reference implementation).
202 RegexpStatus status;
203 regexp_ = Regexp::Parse(regexp_str, flags, &status);
204 if (regexp_ == NULL) {
205 LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
206 << " mode: " << FormatMode(flags);
207 error_ = true;
208 return;
209 }
210 num_captures_ = regexp_->NumCaptures();
211 prog_ = regexp_->CompileToProg(0);
212 if (prog_ == NULL) {
213 LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
214 error_ = true;
215 return;
216 }
217 if (GetFlag(FLAGS_dump_prog)) {
218 LOG(INFO) << "Prog for "
219 << " regexp "
220 << CEscape(regexp_str_)
221 << " (" << FormatKind(kind_)
222 << ", " << FormatMode(flags_)
223 << ")\n"
224 << prog_->Dump();
225 }
226
227 // Compile regexp to reversed prog. Only needed for DFA engines.
228 if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
229 rprog_ = regexp_->CompileToReverseProg(0);
230 if (rprog_ == NULL) {
231 LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
232 error_ = true;
233 return;
234 }
235 if (GetFlag(FLAGS_dump_rprog))
236 LOG(INFO) << rprog_->Dump();
237 }
238
239 // Create re string that will be used for RE and RE2.
240 std::string re = std::string(regexp_str);
241 // Accomodate flags.
242 // Regexp::Latin1 will be accomodated below.
243 if (!(flags & Regexp::OneLine))
244 re = "(?m)" + re;
245 if (flags & Regexp::NonGreedy)
246 re = "(?U)" + re;
247 if (flags & Regexp::DotNL)
248 re = "(?s)" + re;
249
250 // Compile regexp to RE2.
251 if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
252 RE2::Options options;
253 if (flags & Regexp::Latin1)
254 options.set_encoding(RE2::Options::EncodingLatin1);
255 if (kind_ == Prog::kLongestMatch)
256 options.set_longest_match(true);
257 re2_ = new RE2(re, options);
258 if (!re2_->error().empty()) {
259 LOG(INFO) << "Cannot RE2: " << CEscape(re);
260 error_ = true;
261 return;
262 }
263 }
264
265 // Compile regexp to RE.
266 // PCRE as exposed by the RE interface isn't always usable.
267 // 1. It disagrees about handling of empty-string reptitions
268 // like matching (a*)* against "b". PCRE treats the (a*) as
269 // occurring once, while we treat it as occurring not at all.
270 // 2. It treats $ as this weird thing meaning end of string
271 // or before the \n at the end of the string.
272 // 3. It doesn't implement POSIX leftmost-longest matching.
273 // 4. It lets \s match vertical tab.
274 // MimicsPCRE() detects 1 and 2.
275 if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
276 kind_ != Prog::kLongestMatch) {
277 PCRE_Options o;
278 o.set_option(PCRE::UTF8);
279 if (flags & Regexp::Latin1)
280 o.set_option(PCRE::None);
281 // PCRE has interface bug keeping us from finding $0, so
282 // add one more layer of parens.
283 re_ = new PCRE("("+re+")", o);
284 if (!re_->error().empty()) {
285 LOG(INFO) << "Cannot PCRE: " << CEscape(re);
286 error_ = true;
287 return;
288 }
289 }
290 }
291
~TestInstance()292 TestInstance::~TestInstance() {
293 if (regexp_)
294 regexp_->Decref();
295 delete prog_;
296 delete rprog_;
297 delete re_;
298 delete re2_;
299 }
300
301 // Runs a single search using the named engine type.
302 // This interface hides all the irregularities of the various
303 // engine interfaces from the rest of this file.
RunSearch(Engine type,const StringPiece & orig_text,const StringPiece & orig_context,Prog::Anchor anchor,Result * result)304 void TestInstance::RunSearch(Engine type,
305 const StringPiece& orig_text,
306 const StringPiece& orig_context,
307 Prog::Anchor anchor,
308 Result* result) {
309 if (regexp_ == NULL) {
310 result->skipped = true;
311 return;
312 }
313 int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
314 if (nsubmatch > kMaxSubmatch)
315 nsubmatch = kMaxSubmatch;
316
317 StringPiece text = orig_text;
318 StringPiece context = orig_context;
319
320 switch (type) {
321 default:
322 LOG(FATAL) << "Bad RunSearch type: " << (int)type;
323
324 case kEngineBacktrack:
325 if (prog_ == NULL) {
326 result->skipped = true;
327 break;
328 }
329 result->matched =
330 prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
331 result->submatch, nsubmatch);
332 result->have_submatch = true;
333 break;
334
335 case kEngineNFA:
336 if (prog_ == NULL) {
337 result->skipped = true;
338 break;
339 }
340 result->matched =
341 prog_->SearchNFA(text, context, anchor, kind_,
342 result->submatch, nsubmatch);
343 result->have_submatch = true;
344 break;
345
346 case kEngineDFA:
347 if (prog_ == NULL) {
348 result->skipped = true;
349 break;
350 }
351 result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
352 &result->skipped, NULL);
353 break;
354
355 case kEngineDFA1:
356 if (prog_ == NULL || rprog_ == NULL) {
357 result->skipped = true;
358 break;
359 }
360 result->matched =
361 prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
362 &result->skipped, NULL);
363 // If anchored, no need for second run,
364 // but do it anyway to find more bugs.
365 if (result->matched) {
366 if (!rprog_->SearchDFA(result->submatch[0], context,
367 Prog::kAnchored, Prog::kLongestMatch,
368 result->submatch,
369 &result->skipped, NULL)) {
370 LOG(ERROR) << "Reverse DFA inconsistency: "
371 << CEscape(regexp_str_)
372 << " on " << CEscape(text);
373 result->matched = false;
374 }
375 }
376 result->have_submatch0 = true;
377 break;
378
379 case kEngineOnePass:
380 if (prog_ == NULL ||
381 !prog_->IsOnePass() ||
382 anchor == Prog::kUnanchored ||
383 nsubmatch > Prog::kMaxOnePassCapture) {
384 result->skipped = true;
385 break;
386 }
387 result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
388 result->submatch, nsubmatch);
389 result->have_submatch = true;
390 break;
391
392 case kEngineBitState:
393 if (prog_ == NULL ||
394 !prog_->CanBitState()) {
395 result->skipped = true;
396 break;
397 }
398 result->matched = prog_->SearchBitState(text, context, anchor, kind_,
399 result->submatch, nsubmatch);
400 result->have_submatch = true;
401 break;
402
403 case kEngineRE2:
404 case kEngineRE2a:
405 case kEngineRE2b: {
406 if (!re2_ || text.end() != context.end()) {
407 result->skipped = true;
408 break;
409 }
410
411 RE2::Anchor re_anchor;
412 if (anchor == Prog::kAnchored)
413 re_anchor = RE2::ANCHOR_START;
414 else
415 re_anchor = RE2::UNANCHORED;
416 if (kind_ == Prog::kFullMatch)
417 re_anchor = RE2::ANCHOR_BOTH;
418
419 result->matched = re2_->Match(
420 context,
421 static_cast<size_t>(text.begin() - context.begin()),
422 static_cast<size_t>(text.end() - context.begin()),
423 re_anchor,
424 result->submatch,
425 nsubmatch);
426 result->have_submatch = nsubmatch > 0;
427 break;
428 }
429
430 case kEnginePCRE: {
431 if (!re_ || text.begin() != context.begin() ||
432 text.end() != context.end()) {
433 result->skipped = true;
434 break;
435 }
436
437 // In Perl/PCRE, \v matches any character considered vertical
438 // whitespace, not just vertical tab. Regexp::MimicsPCRE() is
439 // unable to handle all cases of this, unfortunately, so just
440 // catch them here. :(
441 if (regexp_str_.find("\\v") != StringPiece::npos &&
442 (text.find('\n') != StringPiece::npos ||
443 text.find('\f') != StringPiece::npos ||
444 text.find('\r') != StringPiece::npos)) {
445 result->skipped = true;
446 break;
447 }
448
449 // PCRE 8.34 or so started allowing vertical tab to match \s,
450 // following a change made in Perl 5.18. RE2 does not.
451 if ((regexp_str_.find("\\s") != StringPiece::npos ||
452 regexp_str_.find("\\S") != StringPiece::npos) &&
453 text.find('\v') != StringPiece::npos) {
454 result->skipped = true;
455 break;
456 }
457
458 const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
459 PCRE::Arg *a = new PCRE::Arg[nsubmatch];
460 for (int i = 0; i < nsubmatch; i++) {
461 a[i] = PCRE::Arg(&result->submatch[i]);
462 argptr[i] = &a[i];
463 }
464 size_t consumed;
465 PCRE::Anchor pcre_anchor;
466 if (anchor == Prog::kAnchored)
467 pcre_anchor = PCRE::ANCHOR_START;
468 else
469 pcre_anchor = PCRE::UNANCHORED;
470 if (kind_ == Prog::kFullMatch)
471 pcre_anchor = PCRE::ANCHOR_BOTH;
472 re_->ClearHitLimit();
473 result->matched =
474 re_->DoMatch(text,
475 pcre_anchor,
476 &consumed,
477 argptr, nsubmatch);
478 if (re_->HitLimit()) {
479 result->untrusted = true;
480 delete[] argptr;
481 delete[] a;
482 break;
483 }
484 result->have_submatch = true;
485 delete[] argptr;
486 delete[] a;
487 break;
488 }
489 }
490
491 if (!result->matched)
492 result->ClearSubmatch();
493 }
494
495 // Checks whether r is okay given that correct is the right answer.
496 // Specifically, r's answers have to match (but it doesn't have to
497 // claim to have all the answers).
ResultOkay(const Result & r,const Result & correct)498 static bool ResultOkay(const Result& r, const Result& correct) {
499 if (r.skipped)
500 return true;
501 if (r.matched != correct.matched)
502 return false;
503 if (r.have_submatch || r.have_submatch0) {
504 for (int i = 0; i < kMaxSubmatch; i++) {
505 if (correct.submatch[i].data() != r.submatch[i].data() ||
506 correct.submatch[i].size() != r.submatch[i].size())
507 return false;
508 if (!r.have_submatch)
509 break;
510 }
511 }
512 return true;
513 }
514
515 // Runs a single test.
RunCase(const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)516 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
517 Prog::Anchor anchor) {
518 // Backtracking is the gold standard.
519 Result correct;
520 RunSearch(kEngineBacktrack, text, context, anchor, &correct);
521 if (correct.skipped) {
522 if (regexp_ == NULL)
523 return true;
524 LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
525 << " " << FormatMode(flags_);
526 return false;
527 }
528 VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
529 << " text " << CEscape(text)
530 << " (" << FormatKind(kind_)
531 << ", " << FormatAnchor(anchor)
532 << ", " << FormatMode(flags_)
533 << ")";
534
535 // Compare the others.
536 bool all_okay = true;
537 for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
538 if (!(Engines() & (1<<i)))
539 continue;
540
541 Result r;
542 RunSearch(i, text, context, anchor, &r);
543 if (ResultOkay(r, correct)) {
544 if (GetFlag(FLAGS_log_okay))
545 LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
546 continue;
547 }
548
549 // We disagree with PCRE on the meaning of some Unicode matches.
550 // In particular, we treat non-ASCII UTF-8 as non-word characters.
551 // We also treat "empty" character sets like [^\w\W] as being
552 // impossible to match, while PCRE apparently excludes some code
553 // points (e.g., 0x0080) from both \w and \W.
554 if (i == kEnginePCRE && NonASCII(text))
555 continue;
556
557 if (!r.untrusted)
558 all_okay = false;
559
560 LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
561 context, anchor);
562 if (r.matched != correct.matched) {
563 if (r.matched) {
564 LOG(INFO) << " Should not match (but does).";
565 } else {
566 LOG(INFO) << " Should match (but does not).";
567 continue;
568 }
569 }
570 for (int i = 0; i < 1+num_captures_; i++) {
571 if (r.submatch[i].data() != correct.submatch[i].data() ||
572 r.submatch[i].size() != correct.submatch[i].size()) {
573 LOG(INFO) <<
574 StringPrintf(" $%d: should be %s is %s",
575 i,
576 FormatCapture(text, correct.submatch[i]).c_str(),
577 FormatCapture(text, r.submatch[i]).c_str());
578 } else {
579 LOG(INFO) <<
580 StringPrintf(" $%d: %s ok", i,
581 FormatCapture(text, r.submatch[i]).c_str());
582 }
583 }
584 }
585
586 if (!all_okay) {
587 // This will be initialised once (after flags have been initialised)
588 // and that is desirable because we want to enforce a global limit.
589 static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures);
590 if (max_regexp_failures > 0 && --max_regexp_failures == 0)
591 LOG(QFATAL) << "Too many regexp failures.";
592 }
593
594 return all_okay;
595 }
596
LogMatch(const char * prefix,Engine e,const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)597 void TestInstance::LogMatch(const char* prefix, Engine e,
598 const StringPiece& text, const StringPiece& context,
599 Prog::Anchor anchor) {
600 LOG(INFO) << prefix
601 << EngineName(e)
602 << " regexp "
603 << CEscape(regexp_str_)
604 << " "
605 << CEscape(regexp_->ToString())
606 << " text "
607 << CEscape(text)
608 << " ("
609 << text.begin() - context.begin()
610 << ","
611 << text.end() - context.begin()
612 << ") of context "
613 << CEscape(context)
614 << " (" << FormatKind(kind_)
615 << ", " << FormatAnchor(anchor)
616 << ", " << FormatMode(flags_)
617 << ")";
618 }
619
620 static Prog::MatchKind kinds[] = {
621 Prog::kFirstMatch,
622 Prog::kLongestMatch,
623 Prog::kFullMatch,
624 };
625
626 // Test all possible match kinds and parse modes.
Tester(const StringPiece & regexp)627 Tester::Tester(const StringPiece& regexp) {
628 error_ = false;
629 for (size_t i = 0; i < arraysize(kinds); i++) {
630 for (size_t j = 0; j < arraysize(parse_modes); j++) {
631 TestInstance* t = new TestInstance(regexp, kinds[i],
632 parse_modes[j].parse_flags);
633 error_ |= t->error();
634 v_.push_back(t);
635 }
636 }
637 }
638
~Tester()639 Tester::~Tester() {
640 for (size_t i = 0; i < v_.size(); i++)
641 delete v_[i];
642 }
643
TestCase(const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)644 bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
645 Prog::Anchor anchor) {
646 bool okay = true;
647 for (size_t i = 0; i < v_.size(); i++)
648 okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
649 return okay;
650 }
651
652 static Prog::Anchor anchors[] = {
653 Prog::kAnchored,
654 Prog::kUnanchored
655 };
656
TestInput(const StringPiece & text)657 bool Tester::TestInput(const StringPiece& text) {
658 bool okay = TestInputInContext(text, text);
659 if (!text.empty()) {
660 StringPiece sp;
661 sp = text;
662 sp.remove_prefix(1);
663 okay &= TestInputInContext(sp, text);
664 sp = text;
665 sp.remove_suffix(1);
666 okay &= TestInputInContext(sp, text);
667 }
668 return okay;
669 }
670
TestInputInContext(const StringPiece & text,const StringPiece & context)671 bool Tester::TestInputInContext(const StringPiece& text,
672 const StringPiece& context) {
673 bool okay = true;
674 for (size_t i = 0; i < arraysize(anchors); i++)
675 okay &= TestCase(text, context, anchors[i]);
676 return okay;
677 }
678
TestRegexpOnText(const StringPiece & regexp,const StringPiece & text)679 bool TestRegexpOnText(const StringPiece& regexp,
680 const StringPiece& text) {
681 Tester t(regexp);
682 return t.TestInput(text);
683 }
684
685 } // namespace re2
686