1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include "re2/prefilter_tree.h"
6 
7 #include <stddef.h>
8 #include <algorithm>
9 #include <map>
10 #include <memory>
11 #include <set>
12 #include <string>
13 #include <utility>
14 #include <vector>
15 
16 #include "util/util.h"
17 #include "util/logging.h"
18 #include "util/strutil.h"
19 #include "re2/prefilter.h"
20 #include "re2/re2.h"
21 
22 namespace re2 {
23 
24 static const bool ExtraDebug = false;
25 
PrefilterTree()26 PrefilterTree::PrefilterTree()
27     : compiled_(false),
28       min_atom_len_(3) {
29 }
30 
PrefilterTree(int min_atom_len)31 PrefilterTree::PrefilterTree(int min_atom_len)
32     : compiled_(false),
33       min_atom_len_(min_atom_len) {
34 }
35 
~PrefilterTree()36 PrefilterTree::~PrefilterTree() {
37   for (size_t i = 0; i < prefilter_vec_.size(); i++)
38     delete prefilter_vec_[i];
39 
40   for (size_t i = 0; i < entries_.size(); i++)
41     delete entries_[i].parents;
42 }
43 
Add(Prefilter * prefilter)44 void PrefilterTree::Add(Prefilter* prefilter) {
45   if (compiled_) {
46     LOG(DFATAL) << "Add called after Compile.";
47     return;
48   }
49   if (prefilter != NULL && !KeepNode(prefilter)) {
50     delete prefilter;
51     prefilter = NULL;
52   }
53 
54   prefilter_vec_.push_back(prefilter);
55 }
56 
Compile(std::vector<std::string> * atom_vec)57 void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
58   if (compiled_) {
59     LOG(DFATAL) << "Compile called already.";
60     return;
61   }
62 
63   // Some legacy users of PrefilterTree call Compile() before
64   // adding any regexps and expect Compile() to have no effect.
65   if (prefilter_vec_.empty())
66     return;
67 
68   compiled_ = true;
69 
70   // TODO(junyer): Use std::unordered_set<Prefilter*> instead?
71   NodeMap nodes;
72   AssignUniqueIds(&nodes, atom_vec);
73 
74   // Identify nodes that are too common among prefilters and are
75   // triggering too many parents. Then get rid of them if possible.
76   // Note that getting rid of a prefilter node simply means they are
77   // no longer necessary for their parent to trigger; that is, we do
78   // not miss out on any regexps triggering by getting rid of a
79   // prefilter node.
80   for (size_t i = 0; i < entries_.size(); i++) {
81     StdIntMap* parents = entries_[i].parents;
82     if (parents->size() > 8) {
83       // This one triggers too many things. If all the parents are AND
84       // nodes and have other things guarding them, then get rid of
85       // this trigger. TODO(vsri): Adjust the threshold appropriately,
86       // make it a function of total number of nodes?
87       bool have_other_guard = true;
88       for (StdIntMap::iterator it = parents->begin();
89            it != parents->end(); ++it) {
90         have_other_guard = have_other_guard &&
91             (entries_[it->first].propagate_up_at_count > 1);
92       }
93 
94       if (have_other_guard) {
95         for (StdIntMap::iterator it = parents->begin();
96              it != parents->end(); ++it)
97           entries_[it->first].propagate_up_at_count -= 1;
98 
99         parents->clear();  // Forget the parents
100       }
101     }
102   }
103 
104   if (ExtraDebug)
105     PrintDebugInfo(&nodes);
106 }
107 
CanonicalNode(NodeMap * nodes,Prefilter * node)108 Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
109   std::string node_string = NodeString(node);
110   NodeMap::iterator iter = nodes->find(node_string);
111   if (iter == nodes->end())
112     return NULL;
113   return (*iter).second;
114 }
115 
NodeString(Prefilter * node) const116 std::string PrefilterTree::NodeString(Prefilter* node) const {
117   // Adding the operation disambiguates AND/OR/atom nodes.
118   std::string s = StringPrintf("%d", node->op()) + ":";
119   if (node->op() == Prefilter::ATOM) {
120     s += node->atom();
121   } else {
122     for (size_t i = 0; i < node->subs()->size(); i++) {
123       if (i > 0)
124         s += ',';
125       s += StringPrintf("%d", (*node->subs())[i]->unique_id());
126     }
127   }
128   return s;
129 }
130 
KeepNode(Prefilter * node) const131 bool PrefilterTree::KeepNode(Prefilter* node) const {
132   if (node == NULL)
133     return false;
134 
135   switch (node->op()) {
136     default:
137       LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
138       return false;
139 
140     case Prefilter::ALL:
141     case Prefilter::NONE:
142       return false;
143 
144     case Prefilter::ATOM:
145       return node->atom().size() >= static_cast<size_t>(min_atom_len_);
146 
147     case Prefilter::AND: {
148       int j = 0;
149       std::vector<Prefilter*>* subs = node->subs();
150       for (size_t i = 0; i < subs->size(); i++)
151         if (KeepNode((*subs)[i]))
152           (*subs)[j++] = (*subs)[i];
153         else
154           delete (*subs)[i];
155 
156       subs->resize(j);
157       return j > 0;
158     }
159 
160     case Prefilter::OR:
161       for (size_t i = 0; i < node->subs()->size(); i++)
162         if (!KeepNode((*node->subs())[i]))
163           return false;
164       return true;
165   }
166 }
167 
AssignUniqueIds(NodeMap * nodes,std::vector<std::string> * atom_vec)168 void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
169                                     std::vector<std::string>* atom_vec) {
170   atom_vec->clear();
171 
172   // Build vector of all filter nodes, sorted topologically
173   // from top to bottom in v.
174   std::vector<Prefilter*> v;
175 
176   // Add the top level nodes of each regexp prefilter.
177   for (size_t i = 0; i < prefilter_vec_.size(); i++) {
178     Prefilter* f = prefilter_vec_[i];
179     if (f == NULL)
180       unfiltered_.push_back(static_cast<int>(i));
181 
182     // We push NULL also on to v, so that we maintain the
183     // mapping of index==regexpid for level=0 prefilter nodes.
184     v.push_back(f);
185   }
186 
187   // Now add all the descendant nodes.
188   for (size_t i = 0; i < v.size(); i++) {
189     Prefilter* f = v[i];
190     if (f == NULL)
191       continue;
192     if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
193       const std::vector<Prefilter*>& subs = *f->subs();
194       for (size_t j = 0; j < subs.size(); j++)
195         v.push_back(subs[j]);
196     }
197   }
198 
199   // Identify unique nodes.
200   int unique_id = 0;
201   for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
202     Prefilter *node = v[i];
203     if (node == NULL)
204       continue;
205     node->set_unique_id(-1);
206     Prefilter* canonical = CanonicalNode(nodes, node);
207     if (canonical == NULL) {
208       // Any further nodes that have the same node string
209       // will find this node as the canonical node.
210       nodes->emplace(NodeString(node), node);
211       if (node->op() == Prefilter::ATOM) {
212         atom_vec->push_back(node->atom());
213         atom_index_to_id_.push_back(unique_id);
214       }
215       node->set_unique_id(unique_id++);
216     } else {
217       node->set_unique_id(canonical->unique_id());
218     }
219   }
220   entries_.resize(nodes->size());
221 
222   // Create parent StdIntMap for the entries.
223   for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
224     Prefilter* prefilter = v[i];
225     if (prefilter == NULL)
226       continue;
227 
228     if (CanonicalNode(nodes, prefilter) != prefilter)
229       continue;
230 
231     Entry* entry = &entries_[prefilter->unique_id()];
232     entry->parents = new StdIntMap();
233   }
234 
235   // Fill the entries.
236   for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
237     Prefilter* prefilter = v[i];
238     if (prefilter == NULL)
239       continue;
240 
241     if (CanonicalNode(nodes, prefilter) != prefilter)
242       continue;
243 
244     Entry* entry = &entries_[prefilter->unique_id()];
245 
246     switch (prefilter->op()) {
247       default:
248       case Prefilter::ALL:
249         LOG(DFATAL) << "Unexpected op: " << prefilter->op();
250         return;
251 
252       case Prefilter::ATOM:
253         entry->propagate_up_at_count = 1;
254         break;
255 
256       case Prefilter::OR:
257       case Prefilter::AND: {
258         std::set<int> uniq_child;
259         for (size_t j = 0; j < prefilter->subs()->size(); j++) {
260           Prefilter* child = (*prefilter->subs())[j];
261           Prefilter* canonical = CanonicalNode(nodes, child);
262           if (canonical == NULL) {
263             LOG(DFATAL) << "Null canonical node";
264             return;
265           }
266           int child_id = canonical->unique_id();
267           uniq_child.insert(child_id);
268           // To the child, we want to add to parent indices.
269           Entry* child_entry = &entries_[child_id];
270           if (child_entry->parents->find(prefilter->unique_id()) ==
271               child_entry->parents->end()) {
272             (*child_entry->parents)[prefilter->unique_id()] = 1;
273           }
274         }
275         entry->propagate_up_at_count = prefilter->op() == Prefilter::AND
276                                            ? static_cast<int>(uniq_child.size())
277                                            : 1;
278 
279         break;
280       }
281     }
282   }
283 
284   // For top level nodes, populate regexp id.
285   for (size_t i = 0; i < prefilter_vec_.size(); i++) {
286     if (prefilter_vec_[i] == NULL)
287       continue;
288     int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
289     DCHECK_LE(0, id);
290     Entry* entry = &entries_[id];
291     entry->regexps.push_back(static_cast<int>(i));
292   }
293 }
294 
295 // Functions for triggering during search.
RegexpsGivenStrings(const std::vector<int> & matched_atoms,std::vector<int> * regexps) const296 void PrefilterTree::RegexpsGivenStrings(
297     const std::vector<int>& matched_atoms,
298     std::vector<int>* regexps) const {
299   regexps->clear();
300   if (!compiled_) {
301     // Some legacy users of PrefilterTree call Compile() before
302     // adding any regexps and expect Compile() to have no effect.
303     // This kludge is a counterpart to that kludge.
304     if (prefilter_vec_.empty())
305       return;
306 
307     LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
308     for (size_t i = 0; i < prefilter_vec_.size(); i++)
309       regexps->push_back(static_cast<int>(i));
310   } else {
311     IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
312     std::vector<int> matched_atom_ids;
313     for (size_t j = 0; j < matched_atoms.size(); j++)
314       matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
315     PropagateMatch(matched_atom_ids, &regexps_map);
316     for (IntMap::iterator it = regexps_map.begin();
317          it != regexps_map.end();
318          ++it)
319       regexps->push_back(it->index());
320 
321     regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
322   }
323   std::sort(regexps->begin(), regexps->end());
324 }
325 
PropagateMatch(const std::vector<int> & atom_ids,IntMap * regexps) const326 void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
327                                    IntMap* regexps) const {
328   IntMap count(static_cast<int>(entries_.size()));
329   IntMap work(static_cast<int>(entries_.size()));
330   for (size_t i = 0; i < atom_ids.size(); i++)
331     work.set(atom_ids[i], 1);
332   for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
333     const Entry& entry = entries_[it->index()];
334     // Record regexps triggered.
335     for (size_t i = 0; i < entry.regexps.size(); i++)
336       regexps->set(entry.regexps[i], 1);
337     int c;
338     // Pass trigger up to parents.
339     for (StdIntMap::iterator it = entry.parents->begin();
340          it != entry.parents->end();
341          ++it) {
342       int j = it->first;
343       const Entry& parent = entries_[j];
344       // Delay until all the children have succeeded.
345       if (parent.propagate_up_at_count > 1) {
346         if (count.has_index(j)) {
347           c = count.get_existing(j) + 1;
348           count.set_existing(j, c);
349         } else {
350           c = 1;
351           count.set_new(j, c);
352         }
353         if (c < parent.propagate_up_at_count)
354           continue;
355       }
356       // Trigger the parent.
357       work.set(j, 1);
358     }
359   }
360 }
361 
362 // Debugging help.
PrintPrefilter(int regexpid)363 void PrefilterTree::PrintPrefilter(int regexpid) {
364   LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
365 }
366 
PrintDebugInfo(NodeMap * nodes)367 void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
368   LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
369   LOG(ERROR) << "#Unique Nodes: " << entries_.size();
370 
371   for (size_t i = 0; i < entries_.size(); i++) {
372     StdIntMap* parents = entries_[i].parents;
373     const std::vector<int>& regexps = entries_[i].regexps;
374     LOG(ERROR) << "EntryId: " << i
375                << " N: " << parents->size() << " R: " << regexps.size();
376     for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
377       LOG(ERROR) << it->first;
378   }
379   LOG(ERROR) << "Map:";
380   for (NodeMap::const_iterator iter = nodes->begin();
381        iter != nodes->end(); ++iter)
382     LOG(ERROR) << "NodeId: " << (*iter).second->unique_id()
383                << " Str: " << (*iter).first;
384 }
385 
DebugNodeString(Prefilter * node) const386 std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
387   std::string node_string = "";
388   if (node->op() == Prefilter::ATOM) {
389     DCHECK(!node->atom().empty());
390     node_string += node->atom();
391   } else {
392     // Adding the operation disambiguates AND and OR nodes.
393     node_string +=  node->op() == Prefilter::AND ? "AND" : "OR";
394     node_string += "(";
395     for (size_t i = 0; i < node->subs()->size(); i++) {
396       if (i > 0)
397         node_string += ',';
398       node_string += StringPrintf("%d", (*node->subs())[i]->unique_id());
399       node_string += ":";
400       node_string += DebugNodeString((*node->subs())[i]);
401     }
402     node_string += ")";
403   }
404   return node_string;
405 }
406 
407 }  // namespace re2
408