1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression representation.
6 // Tested by parse_test.cc
7 
8 #include "re2/regexp.h"
9 
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <string.h>
13 #include <algorithm>
14 #include <map>
15 #include <mutex>
16 #include <string>
17 #include <vector>
18 
19 #include "util/util.h"
20 #include "util/logging.h"
21 #include "util/mutex.h"
22 #include "util/utf.h"
23 #include "re2/pod_array.h"
24 #include "re2/stringpiece.h"
25 #include "re2/walker-inl.h"
26 
27 namespace re2 {
28 
29 // Constructor.  Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)30 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
31   : op_(static_cast<uint8_t>(op)),
32     simple_(false),
33     parse_flags_(static_cast<uint16_t>(parse_flags)),
34     ref_(1),
35     nsub_(0),
36     down_(NULL) {
37   subone_ = NULL;
38   memset(the_union_, 0, sizeof the_union_);
39 }
40 
41 // Destructor.  Assumes already cleaned up children.
42 // Private: use Decref() instead of delete to destroy Regexps.
43 // Can't call Decref on the sub-Regexps here because
44 // that could cause arbitrarily deep recursion, so
45 // required Decref() to have handled them for us.
~Regexp()46 Regexp::~Regexp() {
47   if (nsub_ > 0)
48     LOG(DFATAL) << "Regexp not destroyed.";
49 
50   switch (op_) {
51     default:
52       break;
53     case kRegexpCapture:
54       delete name_;
55       break;
56     case kRegexpLiteralString:
57       delete[] runes_;
58       break;
59     case kRegexpCharClass:
60       if (cc_)
61         cc_->Delete();
62       delete ccb_;
63       break;
64   }
65 }
66 
67 // If it's possible to destroy this regexp without recurring,
68 // do so and return true.  Else return false.
QuickDestroy()69 bool Regexp::QuickDestroy() {
70   if (nsub_ == 0) {
71     delete this;
72     return true;
73   }
74   return false;
75 }
76 
77 // Lazily allocated.
78 static Mutex* ref_mutex;
79 static std::map<Regexp*, int>* ref_map;
80 
Ref()81 int Regexp::Ref() {
82   if (ref_ < kMaxRef)
83     return ref_;
84 
85   MutexLock l(ref_mutex);
86   return (*ref_map)[this];
87 }
88 
89 // Increments reference count, returns object as convenience.
Incref()90 Regexp* Regexp::Incref() {
91   if (ref_ >= kMaxRef-1) {
92     static std::once_flag ref_once;
93     std::call_once(ref_once, []() {
94       ref_mutex = new Mutex;
95       ref_map = new std::map<Regexp*, int>;
96     });
97 
98     // Store ref count in overflow map.
99     MutexLock l(ref_mutex);
100     if (ref_ == kMaxRef) {
101       // already overflowed
102       (*ref_map)[this]++;
103     } else {
104       // overflowing now
105       (*ref_map)[this] = kMaxRef;
106       ref_ = kMaxRef;
107     }
108     return this;
109   }
110 
111   ref_++;
112   return this;
113 }
114 
115 // Decrements reference count and deletes this object if count reaches 0.
Decref()116 void Regexp::Decref() {
117   if (ref_ == kMaxRef) {
118     // Ref count is stored in overflow map.
119     MutexLock l(ref_mutex);
120     int r = (*ref_map)[this] - 1;
121     if (r < kMaxRef) {
122       ref_ = static_cast<uint16_t>(r);
123       ref_map->erase(this);
124     } else {
125       (*ref_map)[this] = r;
126     }
127     return;
128   }
129   ref_--;
130   if (ref_ == 0)
131     Destroy();
132 }
133 
134 // Deletes this object; ref count has count reached 0.
Destroy()135 void Regexp::Destroy() {
136   if (QuickDestroy())
137     return;
138 
139   // Handle recursive Destroy with explicit stack
140   // to avoid arbitrarily deep recursion on process stack [sigh].
141   down_ = NULL;
142   Regexp* stack = this;
143   while (stack != NULL) {
144     Regexp* re = stack;
145     stack = re->down_;
146     if (re->ref_ != 0)
147       LOG(DFATAL) << "Bad reference count " << re->ref_;
148     if (re->nsub_ > 0) {
149       Regexp** subs = re->sub();
150       for (int i = 0; i < re->nsub_; i++) {
151         Regexp* sub = subs[i];
152         if (sub == NULL)
153           continue;
154         if (sub->ref_ == kMaxRef)
155           sub->Decref();
156         else
157           --sub->ref_;
158         if (sub->ref_ == 0 && !sub->QuickDestroy()) {
159           sub->down_ = stack;
160           stack = sub;
161         }
162       }
163       if (re->nsub_ > 1)
164         delete[] subs;
165       re->nsub_ = 0;
166     }
167     delete re;
168   }
169 }
170 
AddRuneToString(Rune r)171 void Regexp::AddRuneToString(Rune r) {
172   DCHECK(op_ == kRegexpLiteralString);
173   if (nrunes_ == 0) {
174     // start with 8
175     runes_ = new Rune[8];
176   } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
177     // double on powers of two
178     Rune *old = runes_;
179     runes_ = new Rune[nrunes_ * 2];
180     for (int i = 0; i < nrunes_; i++)
181       runes_[i] = old[i];
182     delete[] old;
183   }
184 
185   runes_[nrunes_++] = r;
186 }
187 
HaveMatch(int match_id,ParseFlags flags)188 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
189   Regexp* re = new Regexp(kRegexpHaveMatch, flags);
190   re->match_id_ = match_id;
191   return re;
192 }
193 
StarPlusOrQuest(RegexpOp op,Regexp * sub,ParseFlags flags)194 Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
195   // Squash **, ++ and ??.
196   if (op == sub->op() && flags == sub->parse_flags())
197     return sub;
198 
199   // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
200   // op is Star/Plus/Quest, we just have to check that sub->op() is too.
201   if ((sub->op() == kRegexpStar ||
202        sub->op() == kRegexpPlus ||
203        sub->op() == kRegexpQuest) &&
204       flags == sub->parse_flags()) {
205     // If sub is Star, no need to rewrite it.
206     if (sub->op() == kRegexpStar)
207       return sub;
208 
209     // Rewrite sub to Star.
210     Regexp* re = new Regexp(kRegexpStar, flags);
211     re->AllocSub(1);
212     re->sub()[0] = sub->sub()[0]->Incref();
213     sub->Decref();  // We didn't consume the reference after all.
214     return re;
215   }
216 
217   Regexp* re = new Regexp(op, flags);
218   re->AllocSub(1);
219   re->sub()[0] = sub;
220   return re;
221 }
222 
Plus(Regexp * sub,ParseFlags flags)223 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
224   return StarPlusOrQuest(kRegexpPlus, sub, flags);
225 }
226 
Star(Regexp * sub,ParseFlags flags)227 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
228   return StarPlusOrQuest(kRegexpStar, sub, flags);
229 }
230 
Quest(Regexp * sub,ParseFlags flags)231 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
232   return StarPlusOrQuest(kRegexpQuest, sub, flags);
233 }
234 
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)235 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
236                                   ParseFlags flags, bool can_factor) {
237   if (nsub == 1)
238     return sub[0];
239 
240   if (nsub == 0) {
241     if (op == kRegexpAlternate)
242       return new Regexp(kRegexpNoMatch, flags);
243     else
244       return new Regexp(kRegexpEmptyMatch, flags);
245   }
246 
247   PODArray<Regexp*> subcopy;
248   if (op == kRegexpAlternate && can_factor) {
249     // Going to edit sub; make a copy so we don't step on caller.
250     subcopy = PODArray<Regexp*>(nsub);
251     memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
252     sub = subcopy.data();
253     nsub = FactorAlternation(sub, nsub, flags);
254     if (nsub == 1) {
255       Regexp* re = sub[0];
256       return re;
257     }
258   }
259 
260   if (nsub > kMaxNsub) {
261     // Too many subexpressions to fit in a single Regexp.
262     // Make a two-level tree.  Two levels gets us to 65535^2.
263     int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
264     Regexp* re = new Regexp(op, flags);
265     re->AllocSub(nbigsub);
266     Regexp** subs = re->sub();
267     for (int i = 0; i < nbigsub - 1; i++)
268       subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
269     subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
270                                           nsub - (nbigsub-1)*kMaxNsub, flags,
271                                           false);
272     return re;
273   }
274 
275   Regexp* re = new Regexp(op, flags);
276   re->AllocSub(nsub);
277   Regexp** subs = re->sub();
278   for (int i = 0; i < nsub; i++)
279     subs[i] = sub[i];
280   return re;
281 }
282 
Concat(Regexp ** sub,int nsub,ParseFlags flags)283 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
284   return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
285 }
286 
Alternate(Regexp ** sub,int nsub,ParseFlags flags)287 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
288   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
289 }
290 
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)291 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
292   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
293 }
294 
Capture(Regexp * sub,ParseFlags flags,int cap)295 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
296   Regexp* re = new Regexp(kRegexpCapture, flags);
297   re->AllocSub(1);
298   re->sub()[0] = sub;
299   re->cap_ = cap;
300   return re;
301 }
302 
Repeat(Regexp * sub,ParseFlags flags,int min,int max)303 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
304   Regexp* re = new Regexp(kRegexpRepeat, flags);
305   re->AllocSub(1);
306   re->sub()[0] = sub;
307   re->min_ = min;
308   re->max_ = max;
309   return re;
310 }
311 
NewLiteral(Rune rune,ParseFlags flags)312 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
313   Regexp* re = new Regexp(kRegexpLiteral, flags);
314   re->rune_ = rune;
315   return re;
316 }
317 
LiteralString(Rune * runes,int nrunes,ParseFlags flags)318 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
319   if (nrunes <= 0)
320     return new Regexp(kRegexpEmptyMatch, flags);
321   if (nrunes == 1)
322     return NewLiteral(runes[0], flags);
323   Regexp* re = new Regexp(kRegexpLiteralString, flags);
324   for (int i = 0; i < nrunes; i++)
325     re->AddRuneToString(runes[i]);
326   return re;
327 }
328 
NewCharClass(CharClass * cc,ParseFlags flags)329 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
330   Regexp* re = new Regexp(kRegexpCharClass, flags);
331   re->cc_ = cc;
332   return re;
333 }
334 
Swap(Regexp * that)335 void Regexp::Swap(Regexp* that) {
336   // Regexp is not trivially copyable, so we cannot freely copy it with
337   // memmove(3), but swapping objects like so is safe for our purposes.
338   char tmp[sizeof *this];
339   void* vthis = reinterpret_cast<void*>(this);
340   void* vthat = reinterpret_cast<void*>(that);
341   memmove(tmp, vthis, sizeof *this);
342   memmove(vthis, vthat, sizeof *this);
343   memmove(vthat, tmp, sizeof *this);
344 }
345 
346 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)347 static bool TopEqual(Regexp* a, Regexp* b) {
348   if (a->op() != b->op())
349     return false;
350 
351   switch (a->op()) {
352     case kRegexpNoMatch:
353     case kRegexpEmptyMatch:
354     case kRegexpAnyChar:
355     case kRegexpAnyByte:
356     case kRegexpBeginLine:
357     case kRegexpEndLine:
358     case kRegexpWordBoundary:
359     case kRegexpNoWordBoundary:
360     case kRegexpBeginText:
361       return true;
362 
363     case kRegexpEndText:
364       // The parse flags remember whether it's \z or (?-m:$),
365       // which matters when testing against PCRE.
366       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
367 
368     case kRegexpLiteral:
369       return a->rune() == b->rune() &&
370              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
371 
372     case kRegexpLiteralString:
373       return a->nrunes() == b->nrunes() &&
374              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
375              memcmp(a->runes(), b->runes(),
376                     a->nrunes() * sizeof a->runes()[0]) == 0;
377 
378     case kRegexpAlternate:
379     case kRegexpConcat:
380       return a->nsub() == b->nsub();
381 
382     case kRegexpStar:
383     case kRegexpPlus:
384     case kRegexpQuest:
385       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
386 
387     case kRegexpRepeat:
388       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
389              a->min() == b->min() &&
390              a->max() == b->max();
391 
392     case kRegexpCapture:
393       return a->cap() == b->cap() && a->name() == b->name();
394 
395     case kRegexpHaveMatch:
396       return a->match_id() == b->match_id();
397 
398     case kRegexpCharClass: {
399       CharClass* acc = a->cc();
400       CharClass* bcc = b->cc();
401       return acc->size() == bcc->size() &&
402              acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
403              memcmp(acc->begin(), bcc->begin(),
404                     (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
405     }
406   }
407 
408   LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
409   return 0;
410 }
411 
Equal(Regexp * a,Regexp * b)412 bool Regexp::Equal(Regexp* a, Regexp* b) {
413   if (a == NULL || b == NULL)
414     return a == b;
415 
416   if (!TopEqual(a, b))
417     return false;
418 
419   // Fast path:
420   // return without allocating vector if there are no subregexps.
421   switch (a->op()) {
422     case kRegexpAlternate:
423     case kRegexpConcat:
424     case kRegexpStar:
425     case kRegexpPlus:
426     case kRegexpQuest:
427     case kRegexpRepeat:
428     case kRegexpCapture:
429       break;
430 
431     default:
432       return true;
433   }
434 
435   // Committed to doing real work.
436   // The stack (vector) has pairs of regexps waiting to
437   // be compared.  The regexps are only equal if
438   // all the pairs end up being equal.
439   std::vector<Regexp*> stk;
440 
441   for (;;) {
442     // Invariant: TopEqual(a, b) == true.
443     Regexp* a2;
444     Regexp* b2;
445     switch (a->op()) {
446       default:
447         break;
448       case kRegexpAlternate:
449       case kRegexpConcat:
450         for (int i = 0; i < a->nsub(); i++) {
451           a2 = a->sub()[i];
452           b2 = b->sub()[i];
453           if (!TopEqual(a2, b2))
454             return false;
455           stk.push_back(a2);
456           stk.push_back(b2);
457         }
458         break;
459 
460       case kRegexpStar:
461       case kRegexpPlus:
462       case kRegexpQuest:
463       case kRegexpRepeat:
464       case kRegexpCapture:
465         a2 = a->sub()[0];
466         b2 = b->sub()[0];
467         if (!TopEqual(a2, b2))
468           return false;
469         // Really:
470         //   stk.push_back(a2);
471         //   stk.push_back(b2);
472         //   break;
473         // but faster to assign directly and loop.
474         a = a2;
475         b = b2;
476         continue;
477     }
478 
479     size_t n = stk.size();
480     if (n == 0)
481       break;
482 
483     DCHECK_GE(n, 2);
484     a = stk[n-2];
485     b = stk[n-1];
486     stk.resize(n-2);
487   }
488 
489   return true;
490 }
491 
492 // Keep in sync with enum RegexpStatusCode in regexp.h
493 static const char *kErrorStrings[] = {
494   "no error",
495   "unexpected error",
496   "invalid escape sequence",
497   "invalid character class",
498   "invalid character class range",
499   "missing ]",
500   "missing )",
501   "unexpected )",
502   "trailing \\",
503   "no argument for repetition operator",
504   "invalid repetition size",
505   "bad repetition operator",
506   "invalid perl operator",
507   "invalid UTF-8",
508   "invalid named capture group",
509 };
510 
CodeText(enum RegexpStatusCode code)511 std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
512   if (code < 0 || code >= arraysize(kErrorStrings))
513     code = kRegexpInternalError;
514   return kErrorStrings[code];
515 }
516 
Text() const517 std::string RegexpStatus::Text() const {
518   if (error_arg_.empty())
519     return CodeText(code_);
520   std::string s;
521   s.append(CodeText(code_));
522   s.append(": ");
523   s.append(error_arg_.data(), error_arg_.size());
524   return s;
525 }
526 
Copy(const RegexpStatus & status)527 void RegexpStatus::Copy(const RegexpStatus& status) {
528   code_ = status.code_;
529   error_arg_ = status.error_arg_;
530 }
531 
532 typedef int Ignored;  // Walker<void> doesn't exist
533 
534 // Walker subclass to count capturing parens in regexp.
535 class NumCapturesWalker : public Regexp::Walker<Ignored> {
536  public:
NumCapturesWalker()537   NumCapturesWalker() : ncapture_(0) {}
ncapture()538   int ncapture() { return ncapture_; }
539 
PreVisit(Regexp * re,Ignored ignored,bool * stop)540   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
541     if (re->op() == kRegexpCapture)
542       ncapture_++;
543     return ignored;
544   }
545 
ShortVisit(Regexp * re,Ignored ignored)546   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
547     // Should never be called: we use Walk(), not WalkExponential().
548 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
549     LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
550 #endif
551     return ignored;
552   }
553 
554  private:
555   int ncapture_;
556 
557   NumCapturesWalker(const NumCapturesWalker&) = delete;
558   NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
559 };
560 
NumCaptures()561 int Regexp::NumCaptures() {
562   NumCapturesWalker w;
563   w.Walk(this, 0);
564   return w.ncapture();
565 }
566 
567 // Walker class to build map of named capture groups and their indices.
568 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
569  public:
NamedCapturesWalker()570   NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()571   ~NamedCapturesWalker() { delete map_; }
572 
TakeMap()573   std::map<std::string, int>* TakeMap() {
574     std::map<std::string, int>* m = map_;
575     map_ = NULL;
576     return m;
577   }
578 
PreVisit(Regexp * re,Ignored ignored,bool * stop)579   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
580     if (re->op() == kRegexpCapture && re->name() != NULL) {
581       // Allocate map once we find a name.
582       if (map_ == NULL)
583         map_ = new std::map<std::string, int>;
584 
585       // Record first occurrence of each name.
586       // (The rule is that if you have the same name
587       // multiple times, only the leftmost one counts.)
588       if (map_->find(*re->name()) == map_->end())
589         (*map_)[*re->name()] = re->cap();
590     }
591     return ignored;
592   }
593 
ShortVisit(Regexp * re,Ignored ignored)594   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
595     // Should never be called: we use Walk(), not WalkExponential().
596 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
597     LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
598 #endif
599     return ignored;
600   }
601 
602  private:
603   std::map<std::string, int>* map_;
604 
605   NamedCapturesWalker(const NamedCapturesWalker&) = delete;
606   NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
607 };
608 
NamedCaptures()609 std::map<std::string, int>* Regexp::NamedCaptures() {
610   NamedCapturesWalker w;
611   w.Walk(this, 0);
612   return w.TakeMap();
613 }
614 
615 // Walker class to build map from capture group indices to their names.
616 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
617  public:
CaptureNamesWalker()618   CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()619   ~CaptureNamesWalker() { delete map_; }
620 
TakeMap()621   std::map<int, std::string>* TakeMap() {
622     std::map<int, std::string>* m = map_;
623     map_ = NULL;
624     return m;
625   }
626 
PreVisit(Regexp * re,Ignored ignored,bool * stop)627   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
628     if (re->op() == kRegexpCapture && re->name() != NULL) {
629       // Allocate map once we find a name.
630       if (map_ == NULL)
631         map_ = new std::map<int, std::string>;
632 
633       (*map_)[re->cap()] = *re->name();
634     }
635     return ignored;
636   }
637 
ShortVisit(Regexp * re,Ignored ignored)638   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
639     // Should never be called: we use Walk(), not WalkExponential().
640 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
641     LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
642 #endif
643     return ignored;
644   }
645 
646  private:
647   std::map<int, std::string>* map_;
648 
649   CaptureNamesWalker(const CaptureNamesWalker&) = delete;
650   CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
651 };
652 
CaptureNames()653 std::map<int, std::string>* Regexp::CaptureNames() {
654   CaptureNamesWalker w;
655   w.Walk(this, 0);
656   return w.TakeMap();
657 }
658 
ConvertRunesToBytes(bool latin1,Rune * runes,int nrunes,std::string * bytes)659 void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
660                          std::string* bytes) {
661   if (latin1) {
662     bytes->resize(nrunes);
663     for (int i = 0; i < nrunes; i++)
664       (*bytes)[i] = static_cast<char>(runes[i]);
665   } else {
666     bytes->resize(nrunes * UTFmax);  // worst case
667     char* p = &(*bytes)[0];
668     for (int i = 0; i < nrunes; i++)
669       p += runetochar(p, &runes[i]);
670     bytes->resize(p - &(*bytes)[0]);
671     bytes->shrink_to_fit();
672   }
673 }
674 
675 // Determines whether regexp matches must be anchored
676 // with a fixed string prefix.  If so, returns the prefix and
677 // the regexp that remains after the prefix.  The prefix might
678 // be ASCII case-insensitive.
RequiredPrefix(std::string * prefix,bool * foldcase,Regexp ** suffix)679 bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
680                             Regexp** suffix) {
681   prefix->clear();
682   *foldcase = false;
683   *suffix = NULL;
684 
685   // No need for a walker: the regexp must be of the form
686   // 1. some number of ^ anchors
687   // 2. a literal char or string
688   // 3. the rest
689   if (op_ != kRegexpConcat)
690     return false;
691   int i = 0;
692   while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
693     i++;
694   if (i == 0 || i >= nsub_)
695     return false;
696   Regexp* re = sub()[i];
697   if (re->op_ != kRegexpLiteral &&
698       re->op_ != kRegexpLiteralString)
699     return false;
700   i++;
701   if (i < nsub_) {
702     for (int j = i; j < nsub_; j++)
703       sub()[j]->Incref();
704     *suffix = Concat(sub() + i, nsub_ - i, parse_flags());
705   } else {
706     *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
707   }
708 
709   bool latin1 = (re->parse_flags() & Latin1) != 0;
710   Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
711   int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
712   ConvertRunesToBytes(latin1, runes, nrunes, prefix);
713   *foldcase = (re->parse_flags() & FoldCase) != 0;
714   return true;
715 }
716 
717 // Determines whether regexp matches must be unanchored
718 // with a fixed string prefix.  If so, returns the prefix.
719 // The prefix might be ASCII case-insensitive.
RequiredPrefixForAccel(std::string * prefix,bool * foldcase)720 bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
721   prefix->clear();
722   *foldcase = false;
723 
724   // No need for a walker: the regexp must either begin with or be
725   // a literal char or string. We "see through" capturing groups,
726   // but make no effort to glue multiple prefix fragments together.
727   Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
728   while (re->op_ == kRegexpCapture) {
729     re = re->sub()[0];
730     if (re->op_ == kRegexpConcat && re->nsub_ > 0)
731       re = re->sub()[0];
732   }
733   if (re->op_ != kRegexpLiteral &&
734       re->op_ != kRegexpLiteralString)
735     return false;
736 
737   bool latin1 = (re->parse_flags() & Latin1) != 0;
738   Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
739   int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
740   ConvertRunesToBytes(latin1, runes, nrunes, prefix);
741   *foldcase = (re->parse_flags() & FoldCase) != 0;
742   return true;
743 }
744 
745 // Character class builder is a balanced binary tree (STL set)
746 // containing non-overlapping, non-abutting RuneRanges.
747 // The less-than operator used in the tree treats two
748 // ranges as equal if they overlap at all, so that
749 // lookups for a particular Rune are possible.
750 
CharClassBuilder()751 CharClassBuilder::CharClassBuilder() {
752   nrunes_ = 0;
753   upper_ = 0;
754   lower_ = 0;
755 }
756 
757 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)758 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
759   if (hi < lo)
760     return false;
761 
762   if (lo <= 'z' && hi >= 'A') {
763     // Overlaps some alpha, maybe not all.
764     // Update bitmaps telling which ASCII letters are in the set.
765     Rune lo1 = std::max<Rune>(lo, 'A');
766     Rune hi1 = std::min<Rune>(hi, 'Z');
767     if (lo1 <= hi1)
768       upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
769 
770     lo1 = std::max<Rune>(lo, 'a');
771     hi1 = std::min<Rune>(hi, 'z');
772     if (lo1 <= hi1)
773       lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
774   }
775 
776   {  // Check whether lo, hi is already in the class.
777     iterator it = ranges_.find(RuneRange(lo, lo));
778     if (it != end() && it->lo <= lo && hi <= it->hi)
779       return false;
780   }
781 
782   // Look for a range abutting lo on the left.
783   // If it exists, take it out and increase our range.
784   if (lo > 0) {
785     iterator it = ranges_.find(RuneRange(lo-1, lo-1));
786     if (it != end()) {
787       lo = it->lo;
788       if (it->hi > hi)
789         hi = it->hi;
790       nrunes_ -= it->hi - it->lo + 1;
791       ranges_.erase(it);
792     }
793   }
794 
795   // Look for a range abutting hi on the right.
796   // If it exists, take it out and increase our range.
797   if (hi < Runemax) {
798     iterator it = ranges_.find(RuneRange(hi+1, hi+1));
799     if (it != end()) {
800       hi = it->hi;
801       nrunes_ -= it->hi - it->lo + 1;
802       ranges_.erase(it);
803     }
804   }
805 
806   // Look for ranges between lo and hi.  Take them out.
807   // This is only safe because the set has no overlapping ranges.
808   // We've already removed any ranges abutting lo and hi, so
809   // any that overlap [lo, hi] must be contained within it.
810   for (;;) {
811     iterator it = ranges_.find(RuneRange(lo, hi));
812     if (it == end())
813       break;
814     nrunes_ -= it->hi - it->lo + 1;
815     ranges_.erase(it);
816   }
817 
818   // Finally, add [lo, hi].
819   nrunes_ += hi - lo + 1;
820   ranges_.insert(RuneRange(lo, hi));
821   return true;
822 }
823 
AddCharClass(CharClassBuilder * cc)824 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
825   for (iterator it = cc->begin(); it != cc->end(); ++it)
826     AddRange(it->lo, it->hi);
827 }
828 
Contains(Rune r)829 bool CharClassBuilder::Contains(Rune r) {
830   return ranges_.find(RuneRange(r, r)) != end();
831 }
832 
833 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()834 bool CharClassBuilder::FoldsASCII() {
835   return ((upper_ ^ lower_) & AlphaMask) == 0;
836 }
837 
Copy()838 CharClassBuilder* CharClassBuilder::Copy() {
839   CharClassBuilder* cc = new CharClassBuilder;
840   for (iterator it = begin(); it != end(); ++it)
841     cc->ranges_.insert(RuneRange(it->lo, it->hi));
842   cc->upper_ = upper_;
843   cc->lower_ = lower_;
844   cc->nrunes_ = nrunes_;
845   return cc;
846 }
847 
848 
849 
RemoveAbove(Rune r)850 void CharClassBuilder::RemoveAbove(Rune r) {
851   if (r >= Runemax)
852     return;
853 
854   if (r < 'z') {
855     if (r < 'a')
856       lower_ = 0;
857     else
858       lower_ &= AlphaMask >> ('z' - r);
859   }
860 
861   if (r < 'Z') {
862     if (r < 'A')
863       upper_ = 0;
864     else
865       upper_ &= AlphaMask >> ('Z' - r);
866   }
867 
868   for (;;) {
869 
870     iterator it = ranges_.find(RuneRange(r + 1, Runemax));
871     if (it == end())
872       break;
873     RuneRange rr = *it;
874     ranges_.erase(it);
875     nrunes_ -= rr.hi - rr.lo + 1;
876     if (rr.lo <= r) {
877       rr.hi = r;
878       ranges_.insert(rr);
879       nrunes_ += rr.hi - rr.lo + 1;
880     }
881   }
882 }
883 
Negate()884 void CharClassBuilder::Negate() {
885   // Build up negation and then copy in.
886   // Could edit ranges in place, but C++ won't let me.
887   std::vector<RuneRange> v;
888   v.reserve(ranges_.size() + 1);
889 
890   // In negation, first range begins at 0, unless
891   // the current class begins at 0.
892   iterator it = begin();
893   if (it == end()) {
894     v.push_back(RuneRange(0, Runemax));
895   } else {
896     int nextlo = 0;
897     if (it->lo == 0) {
898       nextlo = it->hi + 1;
899       ++it;
900     }
901     for (; it != end(); ++it) {
902       v.push_back(RuneRange(nextlo, it->lo - 1));
903       nextlo = it->hi + 1;
904     }
905     if (nextlo <= Runemax)
906       v.push_back(RuneRange(nextlo, Runemax));
907   }
908 
909   ranges_.clear();
910   for (size_t i = 0; i < v.size(); i++)
911     ranges_.insert(v[i]);
912 
913   upper_ = AlphaMask & ~upper_;
914   lower_ = AlphaMask & ~lower_;
915   nrunes_ = Runemax+1 - nrunes_;
916 }
917 
918 // Character class is a sorted list of ranges.
919 // The ranges are allocated in the same block as the header,
920 // necessitating a special allocator and Delete method.
921 
New(size_t maxranges)922 CharClass* CharClass::New(size_t maxranges) {
923   CharClass* cc;
924   uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
925   cc = reinterpret_cast<CharClass*>(data);
926   cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
927   cc->nranges_ = 0;
928   cc->folds_ascii_ = false;
929   cc->nrunes_ = 0;
930   return cc;
931 }
932 
Delete()933 void CharClass::Delete() {
934   uint8_t* data = reinterpret_cast<uint8_t*>(this);
935   delete[] data;
936 }
937 
Negate()938 CharClass* CharClass::Negate() {
939   CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1));
940   cc->folds_ascii_ = folds_ascii_;
941   cc->nrunes_ = Runemax + 1 - nrunes_;
942   int n = 0;
943   int nextlo = 0;
944   for (CharClass::iterator it = begin(); it != end(); ++it) {
945     if (it->lo == nextlo) {
946       nextlo = it->hi + 1;
947     } else {
948       cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
949       nextlo = it->hi + 1;
950     }
951   }
952   if (nextlo <= Runemax)
953     cc->ranges_[n++] = RuneRange(nextlo, Runemax);
954   cc->nranges_ = n;
955   return cc;
956 }
957 
Contains(Rune r) const958 bool CharClass::Contains(Rune r) const {
959   RuneRange* rr = ranges_;
960   int n = nranges_;
961   while (n > 0) {
962     int m = n/2;
963     if (rr[m].hi < r) {
964       rr += m+1;
965       n -= m+1;
966     } else if (r < rr[m].lo) {
967       n = m;
968     } else {  // rr[m].lo <= r && r <= rr[m].hi
969       return true;
970     }
971   }
972   return false;
973 }
974 
GetCharClass()975 CharClass* CharClassBuilder::GetCharClass() {
976   CharClass* cc = CharClass::New(ranges_.size());
977   int n = 0;
978   for (iterator it = begin(); it != end(); ++it)
979     cc->ranges_[n++] = *it;
980   cc->nranges_ = n;
981   DCHECK_LE(n, static_cast<int>(ranges_.size()));
982   cc->nrunes_ = nrunes_;
983   cc->folds_ascii_ = FoldsASCII();
984   return cc;
985 }
986 
987 }  // namespace re2
988