1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Test prog.cc, compile.cc
6 
7 #include <string>
8 
9 #include "util/test.h"
10 #include "util/logging.h"
11 #include "re2/regexp.h"
12 #include "re2/prog.h"
13 
14 namespace re2 {
15 
16 // Simple input/output tests checking that
17 // the regexp compiles to the expected code.
18 // These are just to sanity check the basic implementation.
19 // The real confidence tests happen by testing the NFA/DFA
20 // that run the compiled code.
21 
22 struct Test {
23   const char* regexp;
24   const char* code;
25 };
26 
27 static Test tests[] = {
28   { "a",
29     "3. byte [61-61] 0 -> 4\n"
30     "4. match! 0\n" },
31   { "ab",
32     "3. byte [61-61] 0 -> 4\n"
33     "4. byte [62-62] 0 -> 5\n"
34     "5. match! 0\n" },
35   { "a|c",
36     "3+ byte [61-61] 0 -> 5\n"
37     "4. byte [63-63] 0 -> 5\n"
38     "5. match! 0\n" },
39   { "a|b",
40     "3. byte [61-62] 0 -> 4\n"
41     "4. match! 0\n" },
42   { "[ab]",
43     "3. byte [61-62] 0 -> 4\n"
44     "4. match! 0\n" },
45   { "a+",
46     "3. byte [61-61] 0 -> 4\n"
47     "4+ nop -> 3\n"
48     "5. match! 0\n" },
49   { "a+?",
50     "3. byte [61-61] 0 -> 4\n"
51     "4+ match! 0\n"
52     "5. nop -> 3\n" },
53   { "a*",
54     "3+ byte [61-61] 1 -> 3\n"
55     "4. match! 0\n" },
56   { "a*?",
57     "3+ match! 0\n"
58     "4. byte [61-61] 0 -> 3\n" },
59   { "a?",
60     "3+ byte [61-61] 1 -> 5\n"
61     "4. nop -> 5\n"
62     "5. match! 0\n" },
63   { "a??",
64     "3+ nop -> 5\n"
65     "4. byte [61-61] 0 -> 5\n"
66     "5. match! 0\n" },
67   { "a{4}",
68     "3. byte [61-61] 0 -> 4\n"
69     "4. byte [61-61] 0 -> 5\n"
70     "5. byte [61-61] 0 -> 6\n"
71     "6. byte [61-61] 0 -> 7\n"
72     "7. match! 0\n" },
73   { "(a)",
74     "3. capture 2 -> 4\n"
75     "4. byte [61-61] 0 -> 5\n"
76     "5. capture 3 -> 6\n"
77     "6. match! 0\n" },
78   { "(?:a)",
79     "3. byte [61-61] 0 -> 4\n"
80     "4. match! 0\n" },
81   { "",
82     "3. match! 0\n" },
83   { ".",
84     "3+ byte [00-09] 0 -> 5\n"
85     "4. byte [0b-ff] 0 -> 5\n"
86     "5. match! 0\n" },
87   { "[^ab]",
88     "3+ byte [00-09] 0 -> 6\n"
89     "4+ byte [0b-60] 0 -> 6\n"
90     "5. byte [63-ff] 0 -> 6\n"
91     "6. match! 0\n" },
92   { "[Aa]",
93     "3. byte/i [61-61] 0 -> 4\n"
94     "4. match! 0\n" },
95   { "\\C+",
96     "3. byte [00-ff] 0 -> 4\n"
97     "4+ altmatch -> 5 | 6\n"
98     "5+ nop -> 3\n"
99     "6. match! 0\n" },
100   { "\\C*",
101     "3+ altmatch -> 4 | 5\n"
102     "4+ byte [00-ff] 1 -> 3\n"
103     "5. match! 0\n" },
104   { "\\C?",
105     "3+ byte [00-ff] 1 -> 5\n"
106     "4. nop -> 5\n"
107     "5. match! 0\n" },
108   // Issue 20992936
109   { "[[-`]",
110     "3. byte [5b-60] 0 -> 4\n"
111     "4. match! 0\n" },
112 };
113 
TEST(TestRegexpCompileToProg,Simple)114 TEST(TestRegexpCompileToProg, Simple) {
115   int failed = 0;
116   for (size_t i = 0; i < arraysize(tests); i++) {
117     const re2::Test& t = tests[i];
118     Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
119     if (re == NULL) {
120       LOG(ERROR) << "Cannot parse: " << t.regexp;
121       failed++;
122       continue;
123     }
124     Prog* prog = re->CompileToProg(0);
125     if (prog == NULL) {
126       LOG(ERROR) << "Cannot compile: " << t.regexp;
127       re->Decref();
128       failed++;
129       continue;
130     }
131     ASSERT_TRUE(re->CompileToProg(1) == NULL);
132     std::string s = prog->Dump();
133     if (s != t.code) {
134       LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
135       LOG(ERROR) << "Want:\n" << t.code;
136       LOG(ERROR) << "Got:\n" << s;
137       failed++;
138     }
139     delete prog;
140     re->Decref();
141   }
142   EXPECT_EQ(failed, 0);
143 }
144 
DumpByteMap(StringPiece pattern,Regexp::ParseFlags flags,std::string * bytemap)145 static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
146                         std::string* bytemap) {
147   Regexp* re = Regexp::Parse(pattern, flags, NULL);
148   EXPECT_TRUE(re != NULL);
149 
150   {
151     Prog* prog = re->CompileToProg(0);
152     EXPECT_TRUE(prog != NULL);
153     *bytemap = prog->DumpByteMap();
154     delete prog;
155   }
156 
157   {
158     Prog* prog = re->CompileToReverseProg(0);
159     EXPECT_TRUE(prog != NULL);
160     EXPECT_EQ(*bytemap, prog->DumpByteMap());
161     delete prog;
162   }
163 
164   re->Decref();
165 }
166 
TEST(TestCompile,Latin1Ranges)167 TEST(TestCompile, Latin1Ranges) {
168   // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
169 
170   std::string bytemap;
171 
172   DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
173   EXPECT_EQ("[00-09] -> 0\n"
174             "[0a-0a] -> 1\n"
175             "[0b-ff] -> 0\n",
176             bytemap);
177 }
178 
TEST(TestCompile,OtherByteMapTests)179 TEST(TestCompile, OtherByteMapTests) {
180   std::string bytemap;
181 
182   // Test that "absent" ranges are mapped to the same byte class.
183   DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
184   EXPECT_EQ("[00-2f] -> 0\n"
185             "[30-39] -> 1\n"
186             "[3a-40] -> 0\n"
187             "[41-46] -> 1\n"
188             "[47-60] -> 0\n"
189             "[61-66] -> 1\n"
190             "[67-ff] -> 0\n",
191             bytemap);
192 
193   // Test the byte classes for \b.
194   DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
195   EXPECT_EQ("[00-2f] -> 0\n"
196             "[30-39] -> 1\n"
197             "[3a-40] -> 0\n"
198             "[41-5a] -> 1\n"
199             "[5b-5e] -> 0\n"
200             "[5f-5f] -> 1\n"
201             "[60-60] -> 0\n"
202             "[61-7a] -> 1\n"
203             "[7b-ff] -> 0\n",
204             bytemap);
205 
206   // Bug in the ASCII case-folding optimization created too many byte classes.
207   DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
208   EXPECT_EQ("[00-5e] -> 0\n"
209             "[5f-5f] -> 1\n"
210             "[60-ff] -> 0\n",
211             bytemap);
212 }
213 
TEST(TestCompile,UTF8Ranges)214 TEST(TestCompile, UTF8Ranges) {
215   // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
216   // Once, erroneously split between 0x3f and 0x40 because it is
217   // a 6-bit boundary.
218 
219   std::string bytemap;
220 
221   DumpByteMap(".", Regexp::PerlX, &bytemap);
222   EXPECT_EQ("[00-09] -> 0\n"
223             "[0a-0a] -> 1\n"
224             "[0b-7f] -> 0\n"
225             "[80-bf] -> 2\n"
226             "[c0-c1] -> 1\n"
227             "[c2-df] -> 3\n"
228             "[e0-ef] -> 4\n"
229             "[f0-f4] -> 5\n"
230             "[f5-ff] -> 1\n",
231             bytemap);
232 }
233 
TEST(TestCompile,InsufficientMemory)234 TEST(TestCompile, InsufficientMemory) {
235   Regexp* re = Regexp::Parse(
236       "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
237       Regexp::LikePerl, NULL);
238   EXPECT_TRUE(re != NULL);
239   Prog* prog = re->CompileToProg(850);
240   // If the memory budget has been exhausted, compilation should fail
241   // and return NULL instead of trying to do anything with NoMatch().
242   EXPECT_TRUE(prog == NULL);
243   re->Decref();
244 }
245 
Dump(StringPiece pattern,Regexp::ParseFlags flags,std::string * forward,std::string * reverse)246 static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
247                  std::string* forward, std::string* reverse) {
248   Regexp* re = Regexp::Parse(pattern, flags, NULL);
249   EXPECT_TRUE(re != NULL);
250 
251   if (forward != NULL) {
252     Prog* prog = re->CompileToProg(0);
253     EXPECT_TRUE(prog != NULL);
254     *forward = prog->Dump();
255     delete prog;
256   }
257 
258   if (reverse != NULL) {
259     Prog* prog = re->CompileToReverseProg(0);
260     EXPECT_TRUE(prog != NULL);
261     *reverse = prog->Dump();
262     delete prog;
263   }
264 
265   re->Decref();
266 }
267 
TEST(TestCompile,Bug26705922)268 TEST(TestCompile, Bug26705922) {
269   // Bug in the compiler caused inefficient bytecode to be generated for Unicode
270   // groups: common suffixes were cached, but common prefixes were not factored.
271 
272   std::string forward, reverse;
273 
274   Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
275   EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
276             "4. byte [90-90] 0 -> 5\n"
277             "5. byte [80-80] 0 -> 6\n"
278             "6+ byte [80-80] 0 -> 8\n"
279             "7. byte [90-90] 0 -> 8\n"
280             "8. match! 0\n",
281             forward);
282   EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
283             "4. byte [90-90] 0 -> 5\n"
284             "5. byte [80-80] 0 -> 6\n"
285             "6. byte [90-90] 0 -> 7\n"
286             "7. byte [f0-f0] 0 -> 8\n"
287             "8. match! 0\n",
288             reverse);
289 
290   Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
291   EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
292             "4. byte [f0-f0] 0 -> 8\n"
293             "5. byte [80-bf] 0 -> 6\n"
294             "6. byte [80-bf] 0 -> 7\n"
295             "7. match! 0\n"
296             "8. byte [90-90] 0 -> 5\n",
297             forward);
298   EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
299             "4. byte [80-bf] 0 -> 5\n"
300             "5+ byte [e8-ef] 0 -> 7\n"
301             "6. byte [90-90] 0 -> 8\n"
302             "7. match! 0\n"
303             "8. byte [f0-f0] 0 -> 7\n",
304             reverse);
305 
306   Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
307   EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n"
308             "4+ byte [e0-ef] 0 -> 8\n"
309             "5. byte [f0-f4] 0 -> 9\n"
310             "6. byte [80-bf] 0 -> 7\n"
311             "7. match! 0\n"
312             "8. byte [80-bf] 0 -> 6\n"
313             "9. byte [80-bf] 0 -> 8\n",
314             forward);
315   EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
316             "4+ byte [c2-df] 0 -> 6\n"
317             "5. byte [80-bf] 0 -> 7\n"
318             "6. match! 0\n"
319             "7+ byte [e0-ef] 0 -> 6\n"
320             "8. byte [80-bf] 0 -> 9\n"
321             "9. byte [f0-f4] 0 -> 6\n",
322             reverse);
323 }
324 
TEST(TestCompile,Bug35237384)325 TEST(TestCompile, Bug35237384) {
326   // Bug in the compiler caused inefficient bytecode to be generated for
327   // nested nullable subexpressions.
328 
329   std::string forward;
330 
331   Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
332   EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
333             "4. nop -> 5\n"
334             "5+ byte [61-61] 1 -> 5\n"
335             "6. nop -> 7\n"
336             "7+ byte [61-61] 1 -> 7\n"
337             "8. match! 0\n",
338             forward);
339 
340   Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
341   EXPECT_EQ("3+ nop -> 6\n"
342             "4+ nop -> 8\n"
343             "5. nop -> 21\n"
344             "6+ byte [61-61] 1 -> 6\n"
345             "7. nop -> 3\n"
346             "8+ byte [62-62] 1 -> 8\n"
347             "9. nop -> 3\n"
348             "10+ byte [61-61] 1 -> 10\n"
349             "11. nop -> 21\n"
350             "12+ byte [62-62] 1 -> 12\n"
351             "13. nop -> 21\n"
352             "14+ byte [61-61] 1 -> 14\n"
353             "15. nop -> 18\n"
354             "16+ byte [62-62] 1 -> 16\n"
355             "17. nop -> 18\n"
356             "18+ nop -> 14\n"
357             "19+ nop -> 16\n"
358             "20. match! 0\n"
359             "21+ nop -> 10\n"
360             "22+ nop -> 12\n"
361             "23. nop -> 18\n",
362       forward);
363 
364   Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
365   EXPECT_EQ("3+ nop -> 36\n"
366             "4+ nop -> 31\n"
367             "5. nop -> 33\n"
368             "6+ byte [00-09] 0 -> 8\n"
369             "7. byte [0b-ff] 0 -> 8\n"
370             "8+ nop -> 6\n"
371             "9+ nop -> 29\n"
372             "10. nop -> 28\n"
373             "11+ byte [00-09] 0 -> 13\n"
374             "12. byte [0b-ff] 0 -> 13\n"
375             "13+ nop -> 11\n"
376             "14+ nop -> 26\n"
377             "15. nop -> 28\n"
378             "16+ byte [00-09] 0 -> 18\n"
379             "17. byte [0b-ff] 0 -> 18\n"
380             "18+ nop -> 16\n"
381             "19+ nop -> 36\n"
382             "20. nop -> 33\n"
383             "21+ byte [00-09] 0 -> 23\n"
384             "22. byte [0b-ff] 0 -> 23\n"
385             "23+ nop -> 21\n"
386             "24+ nop -> 31\n"
387             "25. nop -> 33\n"
388             "26+ nop -> 28\n"
389             "27. byte [53-53] 0 -> 11\n"
390             "28. match! 0\n"
391             "29+ nop -> 28\n"
392             "30. byte [53-53] 0 -> 6\n"
393             "31+ nop -> 33\n"
394             "32. byte [53-53] 0 -> 21\n"
395             "33+ nop -> 29\n"
396             "34+ nop -> 26\n"
397             "35. nop -> 28\n"
398             "36+ nop -> 33\n"
399             "37. byte [53-53] 0 -> 16\n",
400       forward);
401 }
402 
403 }  // namespace re2
404