1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include "re2/prefilter.h"
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <string>
10 #include <vector>
11 
12 #include "util/util.h"
13 #include "util/logging.h"
14 #include "util/strutil.h"
15 #include "util/utf.h"
16 #include "re2/re2.h"
17 #include "re2/unicode_casefold.h"
18 #include "re2/walker-inl.h"
19 
20 namespace re2 {
21 
22 static const bool ExtraDebug = false;
23 
24 typedef std::set<std::string>::iterator SSIter;
25 typedef std::set<std::string>::const_iterator ConstSSIter;
26 
27 // Initializes a Prefilter, allocating subs_ as necessary.
Prefilter(Op op)28 Prefilter::Prefilter(Op op) {
29   op_ = op;
30   subs_ = NULL;
31   if (op_ == AND || op_ == OR)
32     subs_ = new std::vector<Prefilter*>;
33 }
34 
35 // Destroys a Prefilter.
~Prefilter()36 Prefilter::~Prefilter() {
37   if (subs_) {
38     for (size_t i = 0; i < subs_->size(); i++)
39       delete (*subs_)[i];
40     delete subs_;
41     subs_ = NULL;
42   }
43 }
44 
45 // Simplify if the node is an empty Or or And.
Simplify()46 Prefilter* Prefilter::Simplify() {
47   if (op_ != AND && op_ != OR) {
48     return this;
49   }
50 
51   // Nothing left in the AND/OR.
52   if (subs_->empty()) {
53     if (op_ == AND)
54       op_ = ALL;  // AND of nothing is true
55     else
56       op_ = NONE;  // OR of nothing is false
57 
58     return this;
59   }
60 
61   // Just one subnode: throw away wrapper.
62   if (subs_->size() == 1) {
63     Prefilter* a = (*subs_)[0];
64     subs_->clear();
65     delete this;
66     return a->Simplify();
67   }
68 
69   return this;
70 }
71 
72 // Combines two Prefilters together to create an "op" (AND or OR).
73 // The passed Prefilters will be part of the returned Prefilter or deleted.
74 // Does lots of work to avoid creating unnecessarily complicated structures.
AndOr(Op op,Prefilter * a,Prefilter * b)75 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
76   // If a, b can be rewritten as op, do so.
77   a = a->Simplify();
78   b = b->Simplify();
79 
80   // Canonicalize: a->op <= b->op.
81   if (a->op() > b->op()) {
82     Prefilter* t = a;
83     a = b;
84     b = t;
85   }
86 
87   // Trivial cases.
88   //    ALL AND b = b
89   //    NONE OR b = b
90   //    ALL OR b   = ALL
91   //    NONE AND b = NONE
92   // Don't need to look at b, because of canonicalization above.
93   // ALL and NONE are smallest opcodes.
94   if (a->op() == ALL || a->op() == NONE) {
95     if ((a->op() == ALL && op == AND) ||
96         (a->op() == NONE && op == OR)) {
97       delete a;
98       return b;
99     } else {
100       delete b;
101       return a;
102     }
103   }
104 
105   // If a and b match op, merge their contents.
106   if (a->op() == op && b->op() == op) {
107     for (size_t i = 0; i < b->subs()->size(); i++) {
108       Prefilter* bb = (*b->subs())[i];
109       a->subs()->push_back(bb);
110     }
111     b->subs()->clear();
112     delete b;
113     return a;
114   }
115 
116   // If a already has the same op as the op that is under construction
117   // add in b (similarly if b already has the same op, add in a).
118   if (b->op() == op) {
119     Prefilter* t = a;
120     a = b;
121     b = t;
122   }
123   if (a->op() == op) {
124     a->subs()->push_back(b);
125     return a;
126   }
127 
128   // Otherwise just return the op.
129   Prefilter* c = new Prefilter(op);
130   c->subs()->push_back(a);
131   c->subs()->push_back(b);
132   return c;
133 }
134 
And(Prefilter * a,Prefilter * b)135 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
136   return AndOr(AND, a, b);
137 }
138 
Or(Prefilter * a,Prefilter * b)139 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
140   return AndOr(OR, a, b);
141 }
142 
SimplifyStringSet(std::set<std::string> * ss)143 static void SimplifyStringSet(std::set<std::string>* ss) {
144   // Now make sure that the strings aren't redundant.  For example, if
145   // we know "ab" is a required string, then it doesn't help at all to
146   // know that "abc" is also a required string, so delete "abc". This
147   // is because, when we are performing a string search to filter
148   // regexps, matching "ab" will already allow this regexp to be a
149   // candidate for match, so further matching "abc" is redundant.
150   // Note that we must ignore "" because find() would find it at the
151   // start of everything and thus we would end up erasing everything.
152   for (SSIter i = ss->begin(); i != ss->end(); ++i) {
153     if (i->empty())
154       continue;
155     SSIter j = i;
156     ++j;
157     while (j != ss->end()) {
158       if (j->find(*i) != std::string::npos) {
159         j = ss->erase(j);
160         continue;
161       }
162       ++j;
163     }
164   }
165 }
166 
OrStrings(std::set<std::string> * ss)167 Prefilter* Prefilter::OrStrings(std::set<std::string>* ss) {
168   Prefilter* or_prefilter = new Prefilter(NONE);
169   SimplifyStringSet(ss);
170   for (SSIter i = ss->begin(); i != ss->end(); ++i)
171     or_prefilter = Or(or_prefilter, FromString(*i));
172   return or_prefilter;
173 }
174 
ToLowerRune(Rune r)175 static Rune ToLowerRune(Rune r) {
176   if (r < Runeself) {
177     if ('A' <= r && r <= 'Z')
178       r += 'a' - 'A';
179     return r;
180   }
181 
182   const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
183   if (f == NULL || r < f->lo)
184     return r;
185   return ApplyFold(f, r);
186 }
187 
ToLowerRuneLatin1(Rune r)188 static Rune ToLowerRuneLatin1(Rune r) {
189   if ('A' <= r && r <= 'Z')
190     r += 'a' - 'A';
191   return r;
192 }
193 
FromString(const std::string & str)194 Prefilter* Prefilter::FromString(const std::string& str) {
195   Prefilter* m = new Prefilter(Prefilter::ATOM);
196   m->atom_ = str;
197   return m;
198 }
199 
200 // Information about a regexp used during computation of Prefilter.
201 // Can be thought of as information about the set of strings matching
202 // the given regular expression.
203 class Prefilter::Info {
204  public:
205   Info();
206   ~Info();
207 
208   // More constructors.  They delete their Info* arguments.
209   static Info* Alt(Info* a, Info* b);
210   static Info* Concat(Info* a, Info* b);
211   static Info* And(Info* a, Info* b);
212   static Info* Star(Info* a);
213   static Info* Plus(Info* a);
214   static Info* Quest(Info* a);
215   static Info* EmptyString();
216   static Info* NoMatch();
217   static Info* AnyCharOrAnyByte();
218   static Info* CClass(CharClass* cc, bool latin1);
219   static Info* Literal(Rune r);
220   static Info* LiteralLatin1(Rune r);
221   static Info* AnyMatch();
222 
223   // Format Info as a string.
224   std::string ToString();
225 
226   // Caller takes ownership of the Prefilter.
227   Prefilter* TakeMatch();
228 
exact()229   std::set<std::string>& exact() { return exact_; }
230 
is_exact() const231   bool is_exact() const { return is_exact_; }
232 
233   class Walker;
234 
235  private:
236   std::set<std::string> exact_;
237 
238   // When is_exact_ is true, the strings that match
239   // are placed in exact_. When it is no longer an exact
240   // set of strings that match this RE, then is_exact_
241   // is false and the match_ contains the required match
242   // criteria.
243   bool is_exact_;
244 
245   // Accumulated Prefilter query that any
246   // match for this regexp is guaranteed to match.
247   Prefilter* match_;
248 };
249 
250 
Info()251 Prefilter::Info::Info()
252   : is_exact_(false),
253     match_(NULL) {
254 }
255 
~Info()256 Prefilter::Info::~Info() {
257   delete match_;
258 }
259 
TakeMatch()260 Prefilter* Prefilter::Info::TakeMatch() {
261   if (is_exact_) {
262     match_ = Prefilter::OrStrings(&exact_);
263     is_exact_ = false;
264   }
265   Prefilter* m = match_;
266   match_ = NULL;
267   return m;
268 }
269 
270 // Format a Info in string form.
ToString()271 std::string Prefilter::Info::ToString() {
272   if (is_exact_) {
273     int n = 0;
274     std::string s;
275     for (SSIter i = exact_.begin(); i != exact_.end(); ++i) {
276       if (n++ > 0)
277         s += ",";
278       s += *i;
279     }
280     return s;
281   }
282 
283   if (match_)
284     return match_->DebugString();
285 
286   return "";
287 }
288 
289 // Add the strings from src to dst.
CopyIn(const std::set<std::string> & src,std::set<std::string> * dst)290 static void CopyIn(const std::set<std::string>& src,
291                    std::set<std::string>* dst) {
292   for (ConstSSIter i = src.begin(); i != src.end(); ++i)
293     dst->insert(*i);
294 }
295 
296 // Add the cross-product of a and b to dst.
297 // (For each string i in a and j in b, add i+j.)
CrossProduct(const std::set<std::string> & a,const std::set<std::string> & b,std::set<std::string> * dst)298 static void CrossProduct(const std::set<std::string>& a,
299                          const std::set<std::string>& b,
300                          std::set<std::string>* dst) {
301   for (ConstSSIter i = a.begin(); i != a.end(); ++i)
302     for (ConstSSIter j = b.begin(); j != b.end(); ++j)
303       dst->insert(*i + *j);
304 }
305 
306 // Concats a and b. Requires that both are exact sets.
307 // Forms an exact set that is a crossproduct of a and b.
Concat(Info * a,Info * b)308 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
309   if (a == NULL)
310     return b;
311   DCHECK(a->is_exact_);
312   DCHECK(b && b->is_exact_);
313   Info *ab = new Info();
314 
315   CrossProduct(a->exact_, b->exact_, &ab->exact_);
316   ab->is_exact_ = true;
317 
318   delete a;
319   delete b;
320   return ab;
321 }
322 
323 // Constructs an inexact Info for ab given a and b.
324 // Used only when a or b is not exact or when the
325 // exact cross product is likely to be too big.
And(Info * a,Info * b)326 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
327   if (a == NULL)
328     return b;
329   if (b == NULL)
330     return a;
331 
332   Info *ab = new Info();
333 
334   ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
335   ab->is_exact_ = false;
336   delete a;
337   delete b;
338   return ab;
339 }
340 
341 // Constructs Info for a|b given a and b.
Alt(Info * a,Info * b)342 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
343   Info *ab = new Info();
344 
345   if (a->is_exact_ && b->is_exact_) {
346     CopyIn(a->exact_, &ab->exact_);
347     CopyIn(b->exact_, &ab->exact_);
348     ab->is_exact_ = true;
349   } else {
350     // Either a or b has is_exact_ = false. If the other
351     // one has is_exact_ = true, we move it to match_ and
352     // then create a OR of a,b. The resulting Info has
353     // is_exact_ = false.
354     ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
355     ab->is_exact_ = false;
356   }
357 
358   delete a;
359   delete b;
360   return ab;
361 }
362 
363 // Constructs Info for a? given a.
Quest(Info * a)364 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
365   Info *ab = new Info();
366 
367   ab->is_exact_ = false;
368   ab->match_ = new Prefilter(ALL);
369   delete a;
370   return ab;
371 }
372 
373 // Constructs Info for a* given a.
374 // Same as a? -- not much to do.
Star(Info * a)375 Prefilter::Info* Prefilter::Info::Star(Info *a) {
376   return Quest(a);
377 }
378 
379 // Constructs Info for a+ given a. If a was exact set, it isn't
380 // anymore.
Plus(Info * a)381 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
382   Info *ab = new Info();
383 
384   ab->match_ = a->TakeMatch();
385   ab->is_exact_ = false;
386 
387   delete a;
388   return ab;
389 }
390 
RuneToString(Rune r)391 static std::string RuneToString(Rune r) {
392   char buf[UTFmax];
393   int n = runetochar(buf, &r);
394   return std::string(buf, n);
395 }
396 
RuneToStringLatin1(Rune r)397 static std::string RuneToStringLatin1(Rune r) {
398   char c = r & 0xff;
399   return std::string(&c, 1);
400 }
401 
402 // Constructs Info for literal rune.
Literal(Rune r)403 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
404   Info* info = new Info();
405   info->exact_.insert(RuneToString(ToLowerRune(r)));
406   info->is_exact_ = true;
407   return info;
408 }
409 
410 // Constructs Info for literal rune for Latin1 encoded string.
LiteralLatin1(Rune r)411 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
412   Info* info = new Info();
413   info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
414   info->is_exact_ = true;
415   return info;
416 }
417 
418 // Constructs Info for dot (any character) or \C (any byte).
AnyCharOrAnyByte()419 Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
420   Prefilter::Info* info = new Prefilter::Info();
421   info->match_ = new Prefilter(ALL);
422   return info;
423 }
424 
425 // Constructs Prefilter::Info for no possible match.
NoMatch()426 Prefilter::Info* Prefilter::Info::NoMatch() {
427   Prefilter::Info* info = new Prefilter::Info();
428   info->match_ = new Prefilter(NONE);
429   return info;
430 }
431 
432 // Constructs Prefilter::Info for any possible match.
433 // This Prefilter::Info is valid for any regular expression,
434 // since it makes no assertions whatsoever about the
435 // strings being matched.
AnyMatch()436 Prefilter::Info* Prefilter::Info::AnyMatch() {
437   Prefilter::Info *info = new Prefilter::Info();
438   info->match_ = new Prefilter(ALL);
439   return info;
440 }
441 
442 // Constructs Prefilter::Info for just the empty string.
EmptyString()443 Prefilter::Info* Prefilter::Info::EmptyString() {
444   Prefilter::Info* info = new Prefilter::Info();
445   info->is_exact_ = true;
446   info->exact_.insert("");
447   return info;
448 }
449 
450 // Constructs Prefilter::Info for a character class.
451 typedef CharClass::iterator CCIter;
CClass(CharClass * cc,bool latin1)452 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
453                                          bool latin1) {
454   if (ExtraDebug) {
455     LOG(ERROR) << "CharClassInfo:";
456     for (CCIter i = cc->begin(); i != cc->end(); ++i)
457       LOG(ERROR) << "  " << i->lo << "-" << i->hi;
458   }
459 
460   // If the class is too large, it's okay to overestimate.
461   if (cc->size() > 10)
462     return AnyCharOrAnyByte();
463 
464   Prefilter::Info *a = new Prefilter::Info();
465   for (CCIter i = cc->begin(); i != cc->end(); ++i)
466     for (Rune r = i->lo; r <= i->hi; r++) {
467       if (latin1) {
468         a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
469       } else {
470         a->exact_.insert(RuneToString(ToLowerRune(r)));
471       }
472     }
473 
474 
475   a->is_exact_ = true;
476 
477   if (ExtraDebug)
478     LOG(ERROR) << " = " << a->ToString();
479 
480   return a;
481 }
482 
483 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
484  public:
Walker(bool latin1)485   Walker(bool latin1) : latin1_(latin1) {}
486 
487   virtual Info* PostVisit(
488       Regexp* re, Info* parent_arg,
489       Info* pre_arg,
490       Info** child_args, int nchild_args);
491 
492   virtual Info* ShortVisit(
493       Regexp* re,
494       Info* parent_arg);
495 
latin1()496   bool latin1() { return latin1_; }
497  private:
498   bool latin1_;
499 
500   Walker(const Walker&) = delete;
501   Walker& operator=(const Walker&) = delete;
502 };
503 
BuildInfo(Regexp * re)504 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
505   if (ExtraDebug)
506     LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
507 
508   bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
509   Prefilter::Info::Walker w(latin1);
510   Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
511 
512   if (w.stopped_early()) {
513     delete info;
514     return NULL;
515   }
516 
517   return info;
518 }
519 
ShortVisit(Regexp * re,Prefilter::Info * parent_arg)520 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
521     Regexp* re, Prefilter::Info* parent_arg) {
522   return AnyMatch();
523 }
524 
525 // Constructs the Prefilter::Info for the given regular expression.
526 // Assumes re is simplified.
PostVisit(Regexp * re,Prefilter::Info * parent_arg,Prefilter::Info * pre_arg,Prefilter::Info ** child_args,int nchild_args)527 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
528     Regexp* re, Prefilter::Info* parent_arg,
529     Prefilter::Info* pre_arg, Prefilter::Info** child_args,
530     int nchild_args) {
531   Prefilter::Info *info;
532   switch (re->op()) {
533     default:
534     case kRegexpRepeat:
535       LOG(DFATAL) << "Bad regexp op " << re->op();
536       info = EmptyString();
537       break;
538 
539     case kRegexpNoMatch:
540       info = NoMatch();
541       break;
542 
543     // These ops match the empty string:
544     case kRegexpEmptyMatch:      // anywhere
545     case kRegexpBeginLine:       // at beginning of line
546     case kRegexpEndLine:         // at end of line
547     case kRegexpBeginText:       // at beginning of text
548     case kRegexpEndText:         // at end of text
549     case kRegexpWordBoundary:    // at word boundary
550     case kRegexpNoWordBoundary:  // not at word boundary
551       info = EmptyString();
552       break;
553 
554     case kRegexpLiteral:
555       if (latin1()) {
556         info = LiteralLatin1(re->rune());
557       }
558       else {
559         info = Literal(re->rune());
560       }
561       break;
562 
563     case kRegexpLiteralString:
564       if (re->nrunes() == 0) {
565         info = NoMatch();
566         break;
567       }
568       if (latin1()) {
569         info = LiteralLatin1(re->runes()[0]);
570         for (int i = 1; i < re->nrunes(); i++) {
571           info = Concat(info, LiteralLatin1(re->runes()[i]));
572         }
573       } else {
574         info = Literal(re->runes()[0]);
575         for (int i = 1; i < re->nrunes(); i++) {
576           info = Concat(info, Literal(re->runes()[i]));
577         }
578       }
579       break;
580 
581     case kRegexpConcat: {
582       // Accumulate in info.
583       // Exact is concat of recent contiguous exact nodes.
584       info = NULL;
585       Info* exact = NULL;
586       for (int i = 0; i < nchild_args; i++) {
587         Info* ci = child_args[i];  // child info
588         if (!ci->is_exact() ||
589             (exact && ci->exact().size() * exact->exact().size() > 16)) {
590           // Exact run is over.
591           info = And(info, exact);
592           exact = NULL;
593           // Add this child's info.
594           info = And(info, ci);
595         } else {
596           // Append to exact run.
597           exact = Concat(exact, ci);
598         }
599       }
600       info = And(info, exact);
601     }
602       break;
603 
604     case kRegexpAlternate:
605       info = child_args[0];
606       for (int i = 1; i < nchild_args; i++)
607         info = Alt(info, child_args[i]);
608       break;
609 
610     case kRegexpStar:
611       info = Star(child_args[0]);
612       break;
613 
614     case kRegexpQuest:
615       info = Quest(child_args[0]);
616       break;
617 
618     case kRegexpPlus:
619       info = Plus(child_args[0]);
620       break;
621 
622     case kRegexpAnyChar:
623     case kRegexpAnyByte:
624       // Claim nothing, except that it's not empty.
625       info = AnyCharOrAnyByte();
626       break;
627 
628     case kRegexpCharClass:
629       info = CClass(re->cc(), latin1());
630       break;
631 
632     case kRegexpCapture:
633       // These don't affect the set of matching strings.
634       info = child_args[0];
635       break;
636   }
637 
638   if (ExtraDebug)
639     LOG(ERROR) << "BuildInfo " << re->ToString()
640                << ": " << (info ? info->ToString() : "");
641 
642   return info;
643 }
644 
645 
FromRegexp(Regexp * re)646 Prefilter* Prefilter::FromRegexp(Regexp* re) {
647   if (re == NULL)
648     return NULL;
649 
650   Regexp* simple = re->Simplify();
651   Prefilter::Info *info = BuildInfo(simple);
652 
653   simple->Decref();
654   if (info == NULL)
655     return NULL;
656 
657   Prefilter* m = info->TakeMatch();
658 
659   delete info;
660   return m;
661 }
662 
DebugString() const663 std::string Prefilter::DebugString() const {
664   switch (op_) {
665     default:
666       LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
667       return StringPrintf("op%d", op_);
668     case NONE:
669       return "*no-matches*";
670     case ATOM:
671       return atom_;
672     case ALL:
673       return "";
674     case AND: {
675       std::string s = "";
676       for (size_t i = 0; i < subs_->size(); i++) {
677         if (i > 0)
678           s += " ";
679         Prefilter* sub = (*subs_)[i];
680         s += sub ? sub->DebugString() : "<nil>";
681       }
682       return s;
683     }
684     case OR: {
685       std::string s = "(";
686       for (size_t i = 0; i < subs_->size(); i++) {
687         if (i > 0)
688           s += "|";
689         Prefilter* sub = (*subs_)[i];
690         s += sub ? sub->DebugString() : "<nil>";
691       }
692       s += ")";
693       return s;
694     }
695   }
696 }
697 
FromRE2(const RE2 * re2)698 Prefilter* Prefilter::FromRE2(const RE2* re2) {
699   if (re2 == NULL)
700     return NULL;
701 
702   Regexp* regexp = re2->Regexp();
703   if (regexp == NULL)
704     return NULL;
705 
706   return FromRegexp(regexp);
707 }
708 
709 
710 }  // namespace re2
711