1 // Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <stdint.h>
6 #include <string>
7 #include <thread>
8 #include <vector>
9 
10 #include "util/test.h"
11 #include "util/flags.h"
12 #include "util/logging.h"
13 #include "util/malloc_counter.h"
14 #include "util/strutil.h"
15 #include "re2/prog.h"
16 #include "re2/re2.h"
17 #include "re2/regexp.h"
18 #include "re2/testing/regexp_generator.h"
19 #include "re2/testing/string_generator.h"
20 
21 static const bool UsingMallocCounter = false;
22 
23 DEFINE_FLAG(int, size, 8, "log2(number of DFA nodes)");
24 DEFINE_FLAG(int, repeat, 2, "Repetition count.");
25 DEFINE_FLAG(int, threads, 4, "number of threads");
26 
27 namespace re2 {
28 
29 static int state_cache_resets = 0;
30 static int search_failures = 0;
31 
32 struct SetHooks {
SetHooksre2::SetHooks33   SetHooks() {
34     hooks::SetDFAStateCacheResetHook([](const hooks::DFAStateCacheReset&) {
35       ++state_cache_resets;
36     });
37     hooks::SetDFASearchFailureHook([](const hooks::DFASearchFailure&) {
38       ++search_failures;
39     });
40   }
41 } set_hooks;
42 
43 // Check that multithreaded access to DFA class works.
44 
45 // Helper function: builds entire DFA for prog.
DoBuild(Prog * prog)46 static void DoBuild(Prog* prog) {
47   ASSERT_TRUE(prog->BuildEntireDFA(Prog::kFirstMatch, nullptr));
48 }
49 
TEST(Multithreaded,BuildEntireDFA)50 TEST(Multithreaded, BuildEntireDFA) {
51   // Create regexp with 2^FLAGS_size states in DFA.
52   std::string s = "a";
53   for (int i = 0; i < GetFlag(FLAGS_size); i++)
54     s += "[ab]";
55   s += "b";
56   Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
57   ASSERT_TRUE(re != NULL);
58 
59   // Check that single-threaded code works.
60   {
61     Prog* prog = re->CompileToProg(0);
62     ASSERT_TRUE(prog != NULL);
63 
64     std::thread t(DoBuild, prog);
65     t.join();
66 
67     delete prog;
68   }
69 
70   // Build the DFA simultaneously in a bunch of threads.
71   for (int i = 0; i < GetFlag(FLAGS_repeat); i++) {
72     Prog* prog = re->CompileToProg(0);
73     ASSERT_TRUE(prog != NULL);
74 
75     std::vector<std::thread> threads;
76     for (int j = 0; j < GetFlag(FLAGS_threads); j++)
77       threads.emplace_back(DoBuild, prog);
78     for (int j = 0; j < GetFlag(FLAGS_threads); j++)
79       threads[j].join();
80 
81     // One more compile, to make sure everything is okay.
82     prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
83     delete prog;
84   }
85 
86   re->Decref();
87 }
88 
89 // Check that DFA size requirements are followed.
90 // BuildEntireDFA will, like SearchDFA, stop building out
91 // the DFA once the memory limits are reached.
TEST(SingleThreaded,BuildEntireDFA)92 TEST(SingleThreaded, BuildEntireDFA) {
93   // Create regexp with 2^30 states in DFA.
94   Regexp* re = Regexp::Parse("a[ab]{30}b", Regexp::LikePerl, NULL);
95   ASSERT_TRUE(re != NULL);
96 
97   for (int i = 17; i < 24; i++) {
98     int64_t limit = int64_t{1}<<i;
99     int64_t usage;
100     //int64_t progusage, dfamem;
101     {
102       testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
103       Prog* prog = re->CompileToProg(limit);
104       ASSERT_TRUE(prog != NULL);
105       //progusage = m.HeapGrowth();
106       //dfamem = prog->dfa_mem();
107       prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
108       prog->BuildEntireDFA(Prog::kLongestMatch, nullptr);
109       usage = m.HeapGrowth();
110       delete prog;
111     }
112     if (UsingMallocCounter) {
113       //LOG(INFO) << "limit " << limit << ", "
114       //          << "prog usage " << progusage << ", "
115       //          << "DFA budget " << dfamem << ", "
116       //          << "total " << usage;
117       // Tolerate +/- 10%.
118       ASSERT_GT(usage, limit*9/10);
119       ASSERT_LT(usage, limit*11/10);
120     }
121   }
122   re->Decref();
123 }
124 
125 // Test that the DFA gets the right result even if it runs
126 // out of memory during a search.  The regular expression
127 // 0[01]{n}$ matches a binary string of 0s and 1s only if
128 // the (n+1)th-to-last character is a 0.  Matching this in
129 // a single forward pass (as done by the DFA) requires
130 // keeping one bit for each of the last n+1 characters
131 // (whether each was a 0), or 2^(n+1) possible states.
132 // If we run this regexp to search in a string that contains
133 // every possible n-character binary string as a substring,
134 // then it will have to run through at least 2^n states.
135 // States are big data structures -- certainly more than 1 byte --
136 // so if the DFA can search correctly while staying within a
137 // 2^n byte limit, it must be handling out-of-memory conditions
138 // gracefully.
TEST(SingleThreaded,SearchDFA)139 TEST(SingleThreaded, SearchDFA) {
140   // The De Bruijn string is the worst case input for this regexp.
141   // By default, the DFA will notice that it is flushing its cache
142   // too frequently and will bail out early, so that RE2 can use the
143   // NFA implementation instead.  (The DFA loses its speed advantage
144   // if it can't get a good cache hit rate.)
145   // Tell the DFA to trudge along instead.
146   Prog::TEST_dfa_should_bail_when_slow(false);
147   state_cache_resets = 0;
148   search_failures = 0;
149 
150   // Choice of n is mostly arbitrary, except that:
151   //   * making n too big makes the test run for too long.
152   //   * making n too small makes the DFA refuse to run,
153   //     because it has so little memory compared to the program size.
154   // Empirically, n = 18 is a good compromise between the two.
155   const int n = 18;
156 
157   Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
158                              Regexp::LikePerl, NULL);
159   ASSERT_TRUE(re != NULL);
160 
161   // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
162   // which is not a match for 0[01]{n}$.  Adding one more 0 is a match.
163   std::string no_match = DeBruijnString(n);
164   std::string match = no_match + "0";
165 
166   int64_t usage;
167   int64_t peak_usage;
168   {
169     testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
170     Prog* prog = re->CompileToProg(1<<n);
171     ASSERT_TRUE(prog != NULL);
172     for (int i = 0; i < 10; i++) {
173       bool matched = false;
174       bool failed = false;
175       matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
176                                 Prog::kFirstMatch, NULL, &failed, NULL);
177       ASSERT_FALSE(failed);
178       ASSERT_TRUE(matched);
179       matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
180                                 Prog::kFirstMatch, NULL, &failed, NULL);
181       ASSERT_FALSE(failed);
182       ASSERT_FALSE(matched);
183     }
184     usage = m.HeapGrowth();
185     peak_usage = m.PeakHeapGrowth();
186     delete prog;
187   }
188   if (UsingMallocCounter) {
189     //LOG(INFO) << "usage " << usage << ", "
190     //          << "peak usage " << peak_usage;
191     ASSERT_LT(usage, 1<<n);
192     ASSERT_LT(peak_usage, 1<<n);
193   }
194   re->Decref();
195 
196   // Reset to original behaviour.
197   Prog::TEST_dfa_should_bail_when_slow(true);
198   ASSERT_GT(state_cache_resets, 0);
199   ASSERT_EQ(search_failures, 0);
200 }
201 
202 // Helper function: searches for match, which should match,
203 // and no_match, which should not.
DoSearch(Prog * prog,const StringPiece & match,const StringPiece & no_match)204 static void DoSearch(Prog* prog, const StringPiece& match,
205                      const StringPiece& no_match) {
206   for (int i = 0; i < 2; i++) {
207     bool matched = false;
208     bool failed = false;
209     matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
210                               Prog::kFirstMatch, NULL, &failed, NULL);
211     ASSERT_FALSE(failed);
212     ASSERT_TRUE(matched);
213     matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
214                               Prog::kFirstMatch, NULL, &failed, NULL);
215     ASSERT_FALSE(failed);
216     ASSERT_FALSE(matched);
217   }
218 }
219 
TEST(Multithreaded,SearchDFA)220 TEST(Multithreaded, SearchDFA) {
221   Prog::TEST_dfa_should_bail_when_slow(false);
222   state_cache_resets = 0;
223   search_failures = 0;
224 
225   // Same as single-threaded test above.
226   const int n = 18;
227   Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
228                              Regexp::LikePerl, NULL);
229   ASSERT_TRUE(re != NULL);
230   std::string no_match = DeBruijnString(n);
231   std::string match = no_match + "0";
232 
233   // Check that single-threaded code works.
234   {
235     Prog* prog = re->CompileToProg(1<<n);
236     ASSERT_TRUE(prog != NULL);
237 
238     std::thread t(DoSearch, prog, match, no_match);
239     t.join();
240 
241     delete prog;
242   }
243 
244   // Run the search simultaneously in a bunch of threads.
245   // Reuse same flags for Multithreaded.BuildDFA above.
246   for (int i = 0; i < GetFlag(FLAGS_repeat); i++) {
247     Prog* prog = re->CompileToProg(1<<n);
248     ASSERT_TRUE(prog != NULL);
249 
250     std::vector<std::thread> threads;
251     for (int j = 0; j < GetFlag(FLAGS_threads); j++)
252       threads.emplace_back(DoSearch, prog, match, no_match);
253     for (int j = 0; j < GetFlag(FLAGS_threads); j++)
254       threads[j].join();
255 
256     delete prog;
257   }
258 
259   re->Decref();
260 
261   // Reset to original behaviour.
262   Prog::TEST_dfa_should_bail_when_slow(true);
263   ASSERT_GT(state_cache_resets, 0);
264   ASSERT_EQ(search_failures, 0);
265 }
266 
267 struct ReverseTest {
268   const char* regexp;
269   const char* text;
270   bool match;
271 };
272 
273 // Test that reverse DFA handles anchored/unanchored correctly.
274 // It's in the DFA interface but not used by RE2.
275 ReverseTest reverse_tests[] = {
276   { "\\A(a|b)", "abc", true },
277   { "(a|b)\\z", "cba", true },
278   { "\\A(a|b)", "cba", false },
279   { "(a|b)\\z", "abc", false },
280 };
281 
TEST(DFA,ReverseMatch)282 TEST(DFA, ReverseMatch) {
283   int nfail = 0;
284   for (size_t i = 0; i < arraysize(reverse_tests); i++) {
285     const ReverseTest& t = reverse_tests[i];
286     Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
287     ASSERT_TRUE(re != NULL);
288     Prog* prog = re->CompileToReverseProg(0);
289     ASSERT_TRUE(prog != NULL);
290     bool failed = false;
291     bool matched = prog->SearchDFA(t.text, StringPiece(), Prog::kUnanchored,
292                                    Prog::kFirstMatch, NULL, &failed, NULL);
293     if (matched != t.match) {
294       LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
295       nfail++;
296     }
297     delete prog;
298     re->Decref();
299   }
300   EXPECT_EQ(nfail, 0);
301 }
302 
303 struct CallbackTest {
304   const char* regexp;
305   const char* dump;
306 };
307 
308 // Test that DFA::BuildAllStates() builds the expected DFA states
309 // and issues the expected callbacks. These test cases reflect the
310 // very compact encoding of the callbacks, but that also makes them
311 // very difficult to understand, so let's work through "\\Aa\\z".
312 // There are three slots per DFA state because the bytemap has two
313 // equivalence classes and there is a third slot for kByteEndText:
314 //   0: all bytes that are not 'a'
315 //   1: the byte 'a'
316 //   2: kByteEndText
317 // -1 means that there is no transition from that DFA state to any
318 // other DFA state for that slot. The valid transitions are thus:
319 //   state 0 --slot 1--> state 1
320 //   state 1 --slot 2--> state 2
321 // The double brackets indicate that state 2 is a matching state.
322 // Putting it together, this means that the DFA must consume the
323 // byte 'a' and then hit end of text. Q.E.D.
324 CallbackTest callback_tests[] = {
325   { "\\Aa\\z", "[-1,1,-1] [-1,-1,2] [[-1,-1,-1]]" },
326   { "\\Aab\\z", "[-1,1,-1,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
327   { "\\Aa*b\\z", "[-1,0,1,-1] [-1,-1,-1,2] [[-1,-1,-1,-1]]" },
328   { "\\Aa+b\\z", "[-1,1,-1,-1] [-1,1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
329   { "\\Aa?b\\z", "[-1,1,2,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
330   { "\\Aa\\C*\\z", "[-1,1,-1] [1,1,2] [[-1,-1,-1]]" },
331   { "\\Aa\\C*", "[-1,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
332   { "a\\C*", "[0,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
333   { "\\C*", "[1,2] [[1,1]] [[-1,-1]]" },
334   { "a", "[0,1,-1] [2,2,2] [[-1,-1,-1]]"} ,
335 };
336 
TEST(DFA,Callback)337 TEST(DFA, Callback) {
338   int nfail = 0;
339   for (size_t i = 0; i < arraysize(callback_tests); i++) {
340     const CallbackTest& t = callback_tests[i];
341     Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
342     ASSERT_TRUE(re != NULL);
343     Prog* prog = re->CompileToProg(0);
344     ASSERT_TRUE(prog != NULL);
345     std::string dump;
346     prog->BuildEntireDFA(Prog::kLongestMatch, [&](const int* next, bool match) {
347       ASSERT_TRUE(next != NULL);
348       if (!dump.empty())
349         dump += " ";
350       dump += match ? "[[" : "[";
351       for (int b = 0; b < prog->bytemap_range() + 1; b++)
352         dump += StringPrintf("%d,", next[b]);
353       dump.pop_back();
354       dump += match ? "]]" : "]";
355     });
356     if (dump != t.dump) {
357       LOG(ERROR) << t.regexp << " bytemap:\n" << prog->DumpByteMap();
358       LOG(ERROR) << t.regexp << " dump:\ngot " << dump << "\nwant " << t.dump;
359       nfail++;
360     }
361     delete prog;
362     re->Decref();
363   }
364   EXPECT_EQ(nfail, 0);
365 }
366 
367 }  // namespace re2
368