1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression generator: generates all possible
6 // regular expressions within parameters (see regexp_generator.h for details).
7 
8 // The regexp generator first generates a sequence of commands in a simple
9 // postfix language.  Each command in the language is a string,
10 // like "a" or "%s*" or "%s|%s".
11 //
12 // To evaluate a command, enough arguments are popped from the value stack to
13 // plug into the %s slots.  Then the result is pushed onto the stack.
14 // For example, the command sequence
15 //      a b %s%s c
16 // results in the stack
17 //      ab c
18 //
19 // GeneratePostfix generates all possible command sequences.
20 // Then RunPostfix turns each sequence into a regular expression
21 // and passes the regexp to HandleRegexp.
22 
23 #include <stddef.h>
24 #include <stdint.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <memory>
28 #include <stack>
29 #include <string>
30 #include <vector>
31 
32 #include "util/test.h"
33 #include "util/logging.h"
34 #include "util/strutil.h"
35 #include "util/utf.h"
36 #include "re2/testing/regexp_generator.h"
37 
38 namespace re2 {
39 
40 // Returns a vector of the egrep regexp operators.
EgrepOps()41 const std::vector<std::string>& RegexpGenerator::EgrepOps() {
42   static const char *ops[] = {
43     "%s%s",
44     "%s|%s",
45     "%s*",
46     "%s+",
47     "%s?",
48     "%s\\C*",
49   };
50   static std::vector<std::string> v(ops, ops + arraysize(ops));
51   return v;
52 }
53 
RegexpGenerator(int maxatoms,int maxops,const std::vector<std::string> & atoms,const std::vector<std::string> & ops)54 RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
55                                  const std::vector<std::string>& atoms,
56                                  const std::vector<std::string>& ops)
57     : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) {
58   // Degenerate case.
59   if (atoms_.empty())
60     maxatoms_ = 0;
61   if (ops_.empty())
62     maxops_ = 0;
63 }
64 
65 // Generates all possible regular expressions (within the parameters),
66 // calling HandleRegexp for each one.
Generate()67 void RegexpGenerator::Generate() {
68   std::vector<std::string> postfix;
69   GeneratePostfix(&postfix, 0, 0, 0);
70 }
71 
72 // Generates random regular expressions, calling HandleRegexp for each one.
GenerateRandom(int32_t seed,int n)73 void RegexpGenerator::GenerateRandom(int32_t seed, int n) {
74   rng_.seed(seed);
75 
76   for (int i = 0; i < n; i++) {
77     std::vector<std::string> postfix;
78     GenerateRandomPostfix(&postfix, 0, 0, 0);
79   }
80 }
81 
82 // Counts and returns the number of occurrences of "%s" in s.
CountArgs(const std::string & s)83 static int CountArgs(const std::string& s) {
84   const char *p = s.c_str();
85   int n = 0;
86   while ((p = strstr(p, "%s")) != NULL) {
87     p += 2;
88     n++;
89   }
90   return n;
91 }
92 
93 // Generates all possible postfix command sequences.
94 // Each sequence is handed off to RunPostfix to generate a regular expression.
95 // The arguments are:
96 //   post:  the current postfix sequence
97 //   nstk:  the number of elements that would be on the stack after executing
98 //          the sequence
99 //   ops:   the number of operators used in the sequence
100 //   atoms: the number of atoms used in the sequence
101 // For example, if post were ["a", "b", "%s%s", "c"],
102 // then nstk = 2, ops = 1, atoms = 3.
103 //
104 // The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
105 //
GeneratePostfix(std::vector<std::string> * post,int nstk,int ops,int atoms)106 void RegexpGenerator::GeneratePostfix(std::vector<std::string>* post,
107                                       int nstk, int ops, int atoms) {
108   if (nstk == 1)
109     RunPostfix(*post);
110 
111   // Early out: if used too many operators or can't
112   // get back down to a single expression on the stack
113   // using binary operators, give up.
114   if (ops + nstk - 1 > maxops_)
115     return;
116 
117   // Add atoms if there is room.
118   if (atoms < maxatoms_) {
119     for (size_t i = 0; i < atoms_.size(); i++) {
120       post->push_back(atoms_[i]);
121       GeneratePostfix(post, nstk + 1, ops, atoms + 1);
122       post->pop_back();
123     }
124   }
125 
126   // Add operators if there are enough arguments.
127   if (ops < maxops_) {
128     for (size_t i = 0; i < ops_.size(); i++) {
129       const std::string& fmt = ops_[i];
130       int nargs = CountArgs(fmt);
131       if (nargs <= nstk) {
132         post->push_back(fmt);
133         GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
134         post->pop_back();
135       }
136     }
137   }
138 }
139 
140 // Generates a random postfix command sequence.
141 // Stops and returns true once a single sequence has been generated.
GenerateRandomPostfix(std::vector<std::string> * post,int nstk,int ops,int atoms)142 bool RegexpGenerator::GenerateRandomPostfix(std::vector<std::string>* post,
143                                             int nstk, int ops, int atoms) {
144   std::uniform_int_distribution<int> random_stop(0, maxatoms_ - atoms);
145   std::uniform_int_distribution<int> random_bit(0, 1);
146   std::uniform_int_distribution<int> random_ops_index(
147       0, static_cast<int>(ops_.size()) - 1);
148   std::uniform_int_distribution<int> random_atoms_index(
149       0, static_cast<int>(atoms_.size()) - 1);
150 
151   for (;;) {
152     // Stop if we get to a single element, but only sometimes.
153     if (nstk == 1 && random_stop(rng_) == 0) {
154       RunPostfix(*post);
155       return true;
156     }
157 
158     // Early out: if used too many operators or can't
159     // get back down to a single expression on the stack
160     // using binary operators, give up.
161     if (ops + nstk - 1 > maxops_)
162       return false;
163 
164     // Add operators if there are enough arguments.
165     if (ops < maxops_ && random_bit(rng_) == 0) {
166       const std::string& fmt = ops_[random_ops_index(rng_)];
167       int nargs = CountArgs(fmt);
168       if (nargs <= nstk) {
169         post->push_back(fmt);
170         bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
171                                          ops + 1, atoms);
172         post->pop_back();
173         if (ret)
174           return true;
175       }
176     }
177 
178     // Add atoms if there is room.
179     if (atoms < maxatoms_ && random_bit(rng_) == 0) {
180       post->push_back(atoms_[random_atoms_index(rng_)]);
181       bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
182       post->pop_back();
183       if (ret)
184         return true;
185     }
186   }
187 }
188 
189 // Interprets the postfix command sequence to create a regular expression
190 // passed to HandleRegexp.  The results of operators like %s|%s are wrapped
191 // in (?: ) to avoid needing to maintain a precedence table.
RunPostfix(const std::vector<std::string> & post)192 void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) {
193   std::stack<std::string> regexps;
194   for (size_t i = 0; i < post.size(); i++) {
195     switch (CountArgs(post[i])) {
196       default:
197         LOG(FATAL) << "Bad operator: " << post[i];
198       case 0:
199         regexps.push(post[i]);
200         break;
201       case 1: {
202         std::string a = regexps.top();
203         regexps.pop();
204         regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
205         break;
206       }
207       case 2: {
208         std::string b = regexps.top();
209         regexps.pop();
210         std::string a = regexps.top();
211         regexps.pop();
212         regexps.push("(?:" +
213                      StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
214                      ")");
215         break;
216       }
217     }
218   }
219 
220   if (regexps.size() != 1) {
221     // Internal error - should never happen.
222     printf("Bad regexp program:\n");
223     for (size_t i = 0; i < post.size(); i++) {
224       printf("  %s\n", CEscape(post[i]).c_str());
225     }
226     printf("Stack after running program:\n");
227     while (!regexps.empty()) {
228       printf("  %s\n", CEscape(regexps.top()).c_str());
229       regexps.pop();
230     }
231     LOG(FATAL) << "Bad regexp program.";
232   }
233 
234   HandleRegexp(regexps.top());
235   HandleRegexp("^(?:" + regexps.top() + ")$");
236   HandleRegexp("^(?:" + regexps.top() + ")");
237   HandleRegexp("(?:" + regexps.top() + ")$");
238 }
239 
240 // Split s into an vector of strings, one for each UTF-8 character.
Explode(const StringPiece & s)241 std::vector<std::string> Explode(const StringPiece& s) {
242   std::vector<std::string> v;
243 
244   for (const char *q = s.data(); q < s.data() + s.size(); ) {
245     const char* p = q;
246     Rune r;
247     q += chartorune(&r, q);
248     v.push_back(std::string(p, q - p));
249   }
250 
251   return v;
252 }
253 
254 // Split string everywhere a substring is found, returning
255 // vector of pieces.
Split(const StringPiece & sep,const StringPiece & s)256 std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) {
257   std::vector<std::string> v;
258 
259   if (sep.empty())
260     return Explode(s);
261 
262   const char *p = s.data();
263   for (const char *q = s.data(); q + sep.size() <= s.data() + s.size(); q++) {
264     if (StringPiece(q, sep.size()) == sep) {
265       v.push_back(std::string(p, q - p));
266       p = q + sep.size();
267       q = p - 1;  // -1 for ++ in loop
268       continue;
269     }
270   }
271   if (p < s.data() + s.size())
272     v.push_back(std::string(p, s.data() + s.size() - p));
273   return v;
274 }
275 
276 }  // namespace re2
277