1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression representation.
6 // Tested by parse_test.cc
7
8 #include "re2/regexp.h"
9
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <string.h>
13 #include <algorithm>
14 #include <map>
15 #include <mutex>
16 #include <string>
17 #include <vector>
18
19 #include "util/util.h"
20 #include "util/logging.h"
21 #include "util/mutex.h"
22 #include "util/utf.h"
23 #include "re2/pod_array.h"
24 #include "re2/stringpiece.h"
25 #include "re2/walker-inl.h"
26
27 namespace re2 {
28
29 // Constructor. Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)30 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
31 : op_(static_cast<uint8_t>(op)),
32 simple_(false),
33 parse_flags_(static_cast<uint16_t>(parse_flags)),
34 ref_(1),
35 nsub_(0),
36 down_(NULL) {
37 subone_ = NULL;
38 memset(the_union_, 0, sizeof the_union_);
39 }
40
41 // Destructor. Assumes already cleaned up children.
42 // Private: use Decref() instead of delete to destroy Regexps.
43 // Can't call Decref on the sub-Regexps here because
44 // that could cause arbitrarily deep recursion, so
45 // required Decref() to have handled them for us.
~Regexp()46 Regexp::~Regexp() {
47 if (nsub_ > 0)
48 LOG(DFATAL) << "Regexp not destroyed.";
49
50 switch (op_) {
51 default:
52 break;
53 case kRegexpCapture:
54 delete name_;
55 break;
56 case kRegexpLiteralString:
57 delete[] runes_;
58 break;
59 case kRegexpCharClass:
60 if (cc_)
61 cc_->Delete();
62 delete ccb_;
63 break;
64 }
65 }
66
67 // If it's possible to destroy this regexp without recurring,
68 // do so and return true. Else return false.
QuickDestroy()69 bool Regexp::QuickDestroy() {
70 if (nsub_ == 0) {
71 delete this;
72 return true;
73 }
74 return false;
75 }
76
77 // Lazily allocated.
78 static Mutex* ref_mutex;
79 static std::map<Regexp*, int>* ref_map;
80
Ref()81 int Regexp::Ref() {
82 if (ref_ < kMaxRef)
83 return ref_;
84
85 MutexLock l(ref_mutex);
86 return (*ref_map)[this];
87 }
88
89 // Increments reference count, returns object as convenience.
Incref()90 Regexp* Regexp::Incref() {
91 if (ref_ >= kMaxRef-1) {
92 static std::once_flag ref_once;
93 std::call_once(ref_once, []() {
94 ref_mutex = new Mutex;
95 ref_map = new std::map<Regexp*, int>;
96 });
97
98 // Store ref count in overflow map.
99 MutexLock l(ref_mutex);
100 if (ref_ == kMaxRef) {
101 // already overflowed
102 (*ref_map)[this]++;
103 } else {
104 // overflowing now
105 (*ref_map)[this] = kMaxRef;
106 ref_ = kMaxRef;
107 }
108 return this;
109 }
110
111 ref_++;
112 return this;
113 }
114
115 // Decrements reference count and deletes this object if count reaches 0.
Decref()116 void Regexp::Decref() {
117 if (ref_ == kMaxRef) {
118 // Ref count is stored in overflow map.
119 MutexLock l(ref_mutex);
120 int r = (*ref_map)[this] - 1;
121 if (r < kMaxRef) {
122 ref_ = static_cast<uint16_t>(r);
123 ref_map->erase(this);
124 } else {
125 (*ref_map)[this] = r;
126 }
127 return;
128 }
129 ref_--;
130 if (ref_ == 0)
131 Destroy();
132 }
133
134 // Deletes this object; ref count has count reached 0.
Destroy()135 void Regexp::Destroy() {
136 if (QuickDestroy())
137 return;
138
139 // Handle recursive Destroy with explicit stack
140 // to avoid arbitrarily deep recursion on process stack [sigh].
141 down_ = NULL;
142 Regexp* stack = this;
143 while (stack != NULL) {
144 Regexp* re = stack;
145 stack = re->down_;
146 if (re->ref_ != 0)
147 LOG(DFATAL) << "Bad reference count " << re->ref_;
148 if (re->nsub_ > 0) {
149 Regexp** subs = re->sub();
150 for (int i = 0; i < re->nsub_; i++) {
151 Regexp* sub = subs[i];
152 if (sub == NULL)
153 continue;
154 if (sub->ref_ == kMaxRef)
155 sub->Decref();
156 else
157 --sub->ref_;
158 if (sub->ref_ == 0 && !sub->QuickDestroy()) {
159 sub->down_ = stack;
160 stack = sub;
161 }
162 }
163 if (re->nsub_ > 1)
164 delete[] subs;
165 re->nsub_ = 0;
166 }
167 delete re;
168 }
169 }
170
AddRuneToString(Rune r)171 void Regexp::AddRuneToString(Rune r) {
172 DCHECK(op_ == kRegexpLiteralString);
173 if (nrunes_ == 0) {
174 // start with 8
175 runes_ = new Rune[8];
176 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
177 // double on powers of two
178 Rune *old = runes_;
179 runes_ = new Rune[nrunes_ * 2];
180 for (int i = 0; i < nrunes_; i++)
181 runes_[i] = old[i];
182 delete[] old;
183 }
184
185 runes_[nrunes_++] = r;
186 }
187
HaveMatch(int match_id,ParseFlags flags)188 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
189 Regexp* re = new Regexp(kRegexpHaveMatch, flags);
190 re->match_id_ = match_id;
191 return re;
192 }
193
StarPlusOrQuest(RegexpOp op,Regexp * sub,ParseFlags flags)194 Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
195 // Squash **, ++ and ??.
196 if (op == sub->op() && flags == sub->parse_flags())
197 return sub;
198
199 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
200 // op is Star/Plus/Quest, we just have to check that sub->op() is too.
201 if ((sub->op() == kRegexpStar ||
202 sub->op() == kRegexpPlus ||
203 sub->op() == kRegexpQuest) &&
204 flags == sub->parse_flags()) {
205 // If sub is Star, no need to rewrite it.
206 if (sub->op() == kRegexpStar)
207 return sub;
208
209 // Rewrite sub to Star.
210 Regexp* re = new Regexp(kRegexpStar, flags);
211 re->AllocSub(1);
212 re->sub()[0] = sub->sub()[0]->Incref();
213 sub->Decref(); // We didn't consume the reference after all.
214 return re;
215 }
216
217 Regexp* re = new Regexp(op, flags);
218 re->AllocSub(1);
219 re->sub()[0] = sub;
220 return re;
221 }
222
Plus(Regexp * sub,ParseFlags flags)223 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
224 return StarPlusOrQuest(kRegexpPlus, sub, flags);
225 }
226
Star(Regexp * sub,ParseFlags flags)227 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
228 return StarPlusOrQuest(kRegexpStar, sub, flags);
229 }
230
Quest(Regexp * sub,ParseFlags flags)231 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
232 return StarPlusOrQuest(kRegexpQuest, sub, flags);
233 }
234
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)235 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
236 ParseFlags flags, bool can_factor) {
237 if (nsub == 1)
238 return sub[0];
239
240 if (nsub == 0) {
241 if (op == kRegexpAlternate)
242 return new Regexp(kRegexpNoMatch, flags);
243 else
244 return new Regexp(kRegexpEmptyMatch, flags);
245 }
246
247 PODArray<Regexp*> subcopy;
248 if (op == kRegexpAlternate && can_factor) {
249 // Going to edit sub; make a copy so we don't step on caller.
250 subcopy = PODArray<Regexp*>(nsub);
251 memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
252 sub = subcopy.data();
253 nsub = FactorAlternation(sub, nsub, flags);
254 if (nsub == 1) {
255 Regexp* re = sub[0];
256 return re;
257 }
258 }
259
260 if (nsub > kMaxNsub) {
261 // Too many subexpressions to fit in a single Regexp.
262 // Make a two-level tree. Two levels gets us to 65535^2.
263 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
264 Regexp* re = new Regexp(op, flags);
265 re->AllocSub(nbigsub);
266 Regexp** subs = re->sub();
267 for (int i = 0; i < nbigsub - 1; i++)
268 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
269 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
270 nsub - (nbigsub-1)*kMaxNsub, flags,
271 false);
272 return re;
273 }
274
275 Regexp* re = new Regexp(op, flags);
276 re->AllocSub(nsub);
277 Regexp** subs = re->sub();
278 for (int i = 0; i < nsub; i++)
279 subs[i] = sub[i];
280 return re;
281 }
282
Concat(Regexp ** sub,int nsub,ParseFlags flags)283 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
284 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
285 }
286
Alternate(Regexp ** sub,int nsub,ParseFlags flags)287 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
288 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
289 }
290
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)291 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
292 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
293 }
294
Capture(Regexp * sub,ParseFlags flags,int cap)295 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
296 Regexp* re = new Regexp(kRegexpCapture, flags);
297 re->AllocSub(1);
298 re->sub()[0] = sub;
299 re->cap_ = cap;
300 return re;
301 }
302
Repeat(Regexp * sub,ParseFlags flags,int min,int max)303 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
304 Regexp* re = new Regexp(kRegexpRepeat, flags);
305 re->AllocSub(1);
306 re->sub()[0] = sub;
307 re->min_ = min;
308 re->max_ = max;
309 return re;
310 }
311
NewLiteral(Rune rune,ParseFlags flags)312 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
313 Regexp* re = new Regexp(kRegexpLiteral, flags);
314 re->rune_ = rune;
315 return re;
316 }
317
LiteralString(Rune * runes,int nrunes,ParseFlags flags)318 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
319 if (nrunes <= 0)
320 return new Regexp(kRegexpEmptyMatch, flags);
321 if (nrunes == 1)
322 return NewLiteral(runes[0], flags);
323 Regexp* re = new Regexp(kRegexpLiteralString, flags);
324 for (int i = 0; i < nrunes; i++)
325 re->AddRuneToString(runes[i]);
326 return re;
327 }
328
NewCharClass(CharClass * cc,ParseFlags flags)329 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
330 Regexp* re = new Regexp(kRegexpCharClass, flags);
331 re->cc_ = cc;
332 return re;
333 }
334
Swap(Regexp * that)335 void Regexp::Swap(Regexp* that) {
336 // Regexp is not trivially copyable, so we cannot freely copy it with
337 // memmove(3), but swapping objects like so is safe for our purposes.
338 char tmp[sizeof *this];
339 void* vthis = reinterpret_cast<void*>(this);
340 void* vthat = reinterpret_cast<void*>(that);
341 memmove(tmp, vthis, sizeof *this);
342 memmove(vthis, vthat, sizeof *this);
343 memmove(vthat, tmp, sizeof *this);
344 }
345
346 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)347 static bool TopEqual(Regexp* a, Regexp* b) {
348 if (a->op() != b->op())
349 return false;
350
351 switch (a->op()) {
352 case kRegexpNoMatch:
353 case kRegexpEmptyMatch:
354 case kRegexpAnyChar:
355 case kRegexpAnyByte:
356 case kRegexpBeginLine:
357 case kRegexpEndLine:
358 case kRegexpWordBoundary:
359 case kRegexpNoWordBoundary:
360 case kRegexpBeginText:
361 return true;
362
363 case kRegexpEndText:
364 // The parse flags remember whether it's \z or (?-m:$),
365 // which matters when testing against PCRE.
366 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
367
368 case kRegexpLiteral:
369 return a->rune() == b->rune() &&
370 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
371
372 case kRegexpLiteralString:
373 return a->nrunes() == b->nrunes() &&
374 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
375 memcmp(a->runes(), b->runes(),
376 a->nrunes() * sizeof a->runes()[0]) == 0;
377
378 case kRegexpAlternate:
379 case kRegexpConcat:
380 return a->nsub() == b->nsub();
381
382 case kRegexpStar:
383 case kRegexpPlus:
384 case kRegexpQuest:
385 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
386
387 case kRegexpRepeat:
388 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
389 a->min() == b->min() &&
390 a->max() == b->max();
391
392 case kRegexpCapture:
393 return a->cap() == b->cap() && a->name() == b->name();
394
395 case kRegexpHaveMatch:
396 return a->match_id() == b->match_id();
397
398 case kRegexpCharClass: {
399 CharClass* acc = a->cc();
400 CharClass* bcc = b->cc();
401 return acc->size() == bcc->size() &&
402 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
403 memcmp(acc->begin(), bcc->begin(),
404 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
405 }
406 }
407
408 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
409 return 0;
410 }
411
Equal(Regexp * a,Regexp * b)412 bool Regexp::Equal(Regexp* a, Regexp* b) {
413 if (a == NULL || b == NULL)
414 return a == b;
415
416 if (!TopEqual(a, b))
417 return false;
418
419 // Fast path:
420 // return without allocating vector if there are no subregexps.
421 switch (a->op()) {
422 case kRegexpAlternate:
423 case kRegexpConcat:
424 case kRegexpStar:
425 case kRegexpPlus:
426 case kRegexpQuest:
427 case kRegexpRepeat:
428 case kRegexpCapture:
429 break;
430
431 default:
432 return true;
433 }
434
435 // Committed to doing real work.
436 // The stack (vector) has pairs of regexps waiting to
437 // be compared. The regexps are only equal if
438 // all the pairs end up being equal.
439 std::vector<Regexp*> stk;
440
441 for (;;) {
442 // Invariant: TopEqual(a, b) == true.
443 Regexp* a2;
444 Regexp* b2;
445 switch (a->op()) {
446 default:
447 break;
448 case kRegexpAlternate:
449 case kRegexpConcat:
450 for (int i = 0; i < a->nsub(); i++) {
451 a2 = a->sub()[i];
452 b2 = b->sub()[i];
453 if (!TopEqual(a2, b2))
454 return false;
455 stk.push_back(a2);
456 stk.push_back(b2);
457 }
458 break;
459
460 case kRegexpStar:
461 case kRegexpPlus:
462 case kRegexpQuest:
463 case kRegexpRepeat:
464 case kRegexpCapture:
465 a2 = a->sub()[0];
466 b2 = b->sub()[0];
467 if (!TopEqual(a2, b2))
468 return false;
469 // Really:
470 // stk.push_back(a2);
471 // stk.push_back(b2);
472 // break;
473 // but faster to assign directly and loop.
474 a = a2;
475 b = b2;
476 continue;
477 }
478
479 size_t n = stk.size();
480 if (n == 0)
481 break;
482
483 DCHECK_GE(n, 2);
484 a = stk[n-2];
485 b = stk[n-1];
486 stk.resize(n-2);
487 }
488
489 return true;
490 }
491
492 // Keep in sync with enum RegexpStatusCode in regexp.h
493 static const char *kErrorStrings[] = {
494 "no error",
495 "unexpected error",
496 "invalid escape sequence",
497 "invalid character class",
498 "invalid character class range",
499 "missing ]",
500 "missing )",
501 "unexpected )",
502 "trailing \\",
503 "no argument for repetition operator",
504 "invalid repetition size",
505 "bad repetition operator",
506 "invalid perl operator",
507 "invalid UTF-8",
508 "invalid named capture group",
509 };
510
CodeText(enum RegexpStatusCode code)511 std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
512 if (code < 0 || code >= arraysize(kErrorStrings))
513 code = kRegexpInternalError;
514 return kErrorStrings[code];
515 }
516
Text() const517 std::string RegexpStatus::Text() const {
518 if (error_arg_.empty())
519 return CodeText(code_);
520 std::string s;
521 s.append(CodeText(code_));
522 s.append(": ");
523 s.append(error_arg_.data(), error_arg_.size());
524 return s;
525 }
526
Copy(const RegexpStatus & status)527 void RegexpStatus::Copy(const RegexpStatus& status) {
528 code_ = status.code_;
529 error_arg_ = status.error_arg_;
530 }
531
532 typedef int Ignored; // Walker<void> doesn't exist
533
534 // Walker subclass to count capturing parens in regexp.
535 class NumCapturesWalker : public Regexp::Walker<Ignored> {
536 public:
NumCapturesWalker()537 NumCapturesWalker() : ncapture_(0) {}
ncapture()538 int ncapture() { return ncapture_; }
539
PreVisit(Regexp * re,Ignored ignored,bool * stop)540 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
541 if (re->op() == kRegexpCapture)
542 ncapture_++;
543 return ignored;
544 }
545
ShortVisit(Regexp * re,Ignored ignored)546 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
547 // Should never be called: we use Walk(), not WalkExponential().
548 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
549 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
550 #endif
551 return ignored;
552 }
553
554 private:
555 int ncapture_;
556
557 NumCapturesWalker(const NumCapturesWalker&) = delete;
558 NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
559 };
560
NumCaptures()561 int Regexp::NumCaptures() {
562 NumCapturesWalker w;
563 w.Walk(this, 0);
564 return w.ncapture();
565 }
566
567 // Walker class to build map of named capture groups and their indices.
568 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
569 public:
NamedCapturesWalker()570 NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()571 ~NamedCapturesWalker() { delete map_; }
572
TakeMap()573 std::map<std::string, int>* TakeMap() {
574 std::map<std::string, int>* m = map_;
575 map_ = NULL;
576 return m;
577 }
578
PreVisit(Regexp * re,Ignored ignored,bool * stop)579 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
580 if (re->op() == kRegexpCapture && re->name() != NULL) {
581 // Allocate map once we find a name.
582 if (map_ == NULL)
583 map_ = new std::map<std::string, int>;
584
585 // Record first occurrence of each name.
586 // (The rule is that if you have the same name
587 // multiple times, only the leftmost one counts.)
588 if (map_->find(*re->name()) == map_->end())
589 (*map_)[*re->name()] = re->cap();
590 }
591 return ignored;
592 }
593
ShortVisit(Regexp * re,Ignored ignored)594 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
595 // Should never be called: we use Walk(), not WalkExponential().
596 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
597 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
598 #endif
599 return ignored;
600 }
601
602 private:
603 std::map<std::string, int>* map_;
604
605 NamedCapturesWalker(const NamedCapturesWalker&) = delete;
606 NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
607 };
608
NamedCaptures()609 std::map<std::string, int>* Regexp::NamedCaptures() {
610 NamedCapturesWalker w;
611 w.Walk(this, 0);
612 return w.TakeMap();
613 }
614
615 // Walker class to build map from capture group indices to their names.
616 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
617 public:
CaptureNamesWalker()618 CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()619 ~CaptureNamesWalker() { delete map_; }
620
TakeMap()621 std::map<int, std::string>* TakeMap() {
622 std::map<int, std::string>* m = map_;
623 map_ = NULL;
624 return m;
625 }
626
PreVisit(Regexp * re,Ignored ignored,bool * stop)627 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
628 if (re->op() == kRegexpCapture && re->name() != NULL) {
629 // Allocate map once we find a name.
630 if (map_ == NULL)
631 map_ = new std::map<int, std::string>;
632
633 (*map_)[re->cap()] = *re->name();
634 }
635 return ignored;
636 }
637
ShortVisit(Regexp * re,Ignored ignored)638 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
639 // Should never be called: we use Walk(), not WalkExponential().
640 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
641 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
642 #endif
643 return ignored;
644 }
645
646 private:
647 std::map<int, std::string>* map_;
648
649 CaptureNamesWalker(const CaptureNamesWalker&) = delete;
650 CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
651 };
652
CaptureNames()653 std::map<int, std::string>* Regexp::CaptureNames() {
654 CaptureNamesWalker w;
655 w.Walk(this, 0);
656 return w.TakeMap();
657 }
658
ConvertRunesToBytes(bool latin1,Rune * runes,int nrunes,std::string * bytes)659 void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
660 std::string* bytes) {
661 if (latin1) {
662 bytes->resize(nrunes);
663 for (int i = 0; i < nrunes; i++)
664 (*bytes)[i] = static_cast<char>(runes[i]);
665 } else {
666 bytes->resize(nrunes * UTFmax); // worst case
667 char* p = &(*bytes)[0];
668 for (int i = 0; i < nrunes; i++)
669 p += runetochar(p, &runes[i]);
670 bytes->resize(p - &(*bytes)[0]);
671 bytes->shrink_to_fit();
672 }
673 }
674
675 // Determines whether regexp matches must be anchored
676 // with a fixed string prefix. If so, returns the prefix and
677 // the regexp that remains after the prefix. The prefix might
678 // be ASCII case-insensitive.
RequiredPrefix(std::string * prefix,bool * foldcase,Regexp ** suffix)679 bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
680 Regexp** suffix) {
681 prefix->clear();
682 *foldcase = false;
683 *suffix = NULL;
684
685 // No need for a walker: the regexp must be of the form
686 // 1. some number of ^ anchors
687 // 2. a literal char or string
688 // 3. the rest
689 if (op_ != kRegexpConcat)
690 return false;
691 int i = 0;
692 while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
693 i++;
694 if (i == 0 || i >= nsub_)
695 return false;
696 Regexp* re = sub()[i];
697 if (re->op_ != kRegexpLiteral &&
698 re->op_ != kRegexpLiteralString)
699 return false;
700 i++;
701 if (i < nsub_) {
702 for (int j = i; j < nsub_; j++)
703 sub()[j]->Incref();
704 *suffix = Concat(sub() + i, nsub_ - i, parse_flags());
705 } else {
706 *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
707 }
708
709 bool latin1 = (re->parse_flags() & Latin1) != 0;
710 Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
711 int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
712 ConvertRunesToBytes(latin1, runes, nrunes, prefix);
713 *foldcase = (re->parse_flags() & FoldCase) != 0;
714 return true;
715 }
716
717 // Determines whether regexp matches must be unanchored
718 // with a fixed string prefix. If so, returns the prefix.
719 // The prefix might be ASCII case-insensitive.
RequiredPrefixForAccel(std::string * prefix,bool * foldcase)720 bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
721 prefix->clear();
722 *foldcase = false;
723
724 // No need for a walker: the regexp must either begin with or be
725 // a literal char or string. We "see through" capturing groups,
726 // but make no effort to glue multiple prefix fragments together.
727 Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
728 while (re->op_ == kRegexpCapture) {
729 re = re->sub()[0];
730 if (re->op_ == kRegexpConcat && re->nsub_ > 0)
731 re = re->sub()[0];
732 }
733 if (re->op_ != kRegexpLiteral &&
734 re->op_ != kRegexpLiteralString)
735 return false;
736
737 bool latin1 = (re->parse_flags() & Latin1) != 0;
738 Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
739 int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
740 ConvertRunesToBytes(latin1, runes, nrunes, prefix);
741 *foldcase = (re->parse_flags() & FoldCase) != 0;
742 return true;
743 }
744
745 // Character class builder is a balanced binary tree (STL set)
746 // containing non-overlapping, non-abutting RuneRanges.
747 // The less-than operator used in the tree treats two
748 // ranges as equal if they overlap at all, so that
749 // lookups for a particular Rune are possible.
750
CharClassBuilder()751 CharClassBuilder::CharClassBuilder() {
752 nrunes_ = 0;
753 upper_ = 0;
754 lower_ = 0;
755 }
756
757 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)758 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
759 if (hi < lo)
760 return false;
761
762 if (lo <= 'z' && hi >= 'A') {
763 // Overlaps some alpha, maybe not all.
764 // Update bitmaps telling which ASCII letters are in the set.
765 Rune lo1 = std::max<Rune>(lo, 'A');
766 Rune hi1 = std::min<Rune>(hi, 'Z');
767 if (lo1 <= hi1)
768 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
769
770 lo1 = std::max<Rune>(lo, 'a');
771 hi1 = std::min<Rune>(hi, 'z');
772 if (lo1 <= hi1)
773 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
774 }
775
776 { // Check whether lo, hi is already in the class.
777 iterator it = ranges_.find(RuneRange(lo, lo));
778 if (it != end() && it->lo <= lo && hi <= it->hi)
779 return false;
780 }
781
782 // Look for a range abutting lo on the left.
783 // If it exists, take it out and increase our range.
784 if (lo > 0) {
785 iterator it = ranges_.find(RuneRange(lo-1, lo-1));
786 if (it != end()) {
787 lo = it->lo;
788 if (it->hi > hi)
789 hi = it->hi;
790 nrunes_ -= it->hi - it->lo + 1;
791 ranges_.erase(it);
792 }
793 }
794
795 // Look for a range abutting hi on the right.
796 // If it exists, take it out and increase our range.
797 if (hi < Runemax) {
798 iterator it = ranges_.find(RuneRange(hi+1, hi+1));
799 if (it != end()) {
800 hi = it->hi;
801 nrunes_ -= it->hi - it->lo + 1;
802 ranges_.erase(it);
803 }
804 }
805
806 // Look for ranges between lo and hi. Take them out.
807 // This is only safe because the set has no overlapping ranges.
808 // We've already removed any ranges abutting lo and hi, so
809 // any that overlap [lo, hi] must be contained within it.
810 for (;;) {
811 iterator it = ranges_.find(RuneRange(lo, hi));
812 if (it == end())
813 break;
814 nrunes_ -= it->hi - it->lo + 1;
815 ranges_.erase(it);
816 }
817
818 // Finally, add [lo, hi].
819 nrunes_ += hi - lo + 1;
820 ranges_.insert(RuneRange(lo, hi));
821 return true;
822 }
823
AddCharClass(CharClassBuilder * cc)824 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
825 for (iterator it = cc->begin(); it != cc->end(); ++it)
826 AddRange(it->lo, it->hi);
827 }
828
Contains(Rune r)829 bool CharClassBuilder::Contains(Rune r) {
830 return ranges_.find(RuneRange(r, r)) != end();
831 }
832
833 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()834 bool CharClassBuilder::FoldsASCII() {
835 return ((upper_ ^ lower_) & AlphaMask) == 0;
836 }
837
Copy()838 CharClassBuilder* CharClassBuilder::Copy() {
839 CharClassBuilder* cc = new CharClassBuilder;
840 for (iterator it = begin(); it != end(); ++it)
841 cc->ranges_.insert(RuneRange(it->lo, it->hi));
842 cc->upper_ = upper_;
843 cc->lower_ = lower_;
844 cc->nrunes_ = nrunes_;
845 return cc;
846 }
847
848
849
RemoveAbove(Rune r)850 void CharClassBuilder::RemoveAbove(Rune r) {
851 if (r >= Runemax)
852 return;
853
854 if (r < 'z') {
855 if (r < 'a')
856 lower_ = 0;
857 else
858 lower_ &= AlphaMask >> ('z' - r);
859 }
860
861 if (r < 'Z') {
862 if (r < 'A')
863 upper_ = 0;
864 else
865 upper_ &= AlphaMask >> ('Z' - r);
866 }
867
868 for (;;) {
869
870 iterator it = ranges_.find(RuneRange(r + 1, Runemax));
871 if (it == end())
872 break;
873 RuneRange rr = *it;
874 ranges_.erase(it);
875 nrunes_ -= rr.hi - rr.lo + 1;
876 if (rr.lo <= r) {
877 rr.hi = r;
878 ranges_.insert(rr);
879 nrunes_ += rr.hi - rr.lo + 1;
880 }
881 }
882 }
883
Negate()884 void CharClassBuilder::Negate() {
885 // Build up negation and then copy in.
886 // Could edit ranges in place, but C++ won't let me.
887 std::vector<RuneRange> v;
888 v.reserve(ranges_.size() + 1);
889
890 // In negation, first range begins at 0, unless
891 // the current class begins at 0.
892 iterator it = begin();
893 if (it == end()) {
894 v.push_back(RuneRange(0, Runemax));
895 } else {
896 int nextlo = 0;
897 if (it->lo == 0) {
898 nextlo = it->hi + 1;
899 ++it;
900 }
901 for (; it != end(); ++it) {
902 v.push_back(RuneRange(nextlo, it->lo - 1));
903 nextlo = it->hi + 1;
904 }
905 if (nextlo <= Runemax)
906 v.push_back(RuneRange(nextlo, Runemax));
907 }
908
909 ranges_.clear();
910 for (size_t i = 0; i < v.size(); i++)
911 ranges_.insert(v[i]);
912
913 upper_ = AlphaMask & ~upper_;
914 lower_ = AlphaMask & ~lower_;
915 nrunes_ = Runemax+1 - nrunes_;
916 }
917
918 // Character class is a sorted list of ranges.
919 // The ranges are allocated in the same block as the header,
920 // necessitating a special allocator and Delete method.
921
New(size_t maxranges)922 CharClass* CharClass::New(size_t maxranges) {
923 CharClass* cc;
924 uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
925 cc = reinterpret_cast<CharClass*>(data);
926 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
927 cc->nranges_ = 0;
928 cc->folds_ascii_ = false;
929 cc->nrunes_ = 0;
930 return cc;
931 }
932
Delete()933 void CharClass::Delete() {
934 uint8_t* data = reinterpret_cast<uint8_t*>(this);
935 delete[] data;
936 }
937
Negate()938 CharClass* CharClass::Negate() {
939 CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1));
940 cc->folds_ascii_ = folds_ascii_;
941 cc->nrunes_ = Runemax + 1 - nrunes_;
942 int n = 0;
943 int nextlo = 0;
944 for (CharClass::iterator it = begin(); it != end(); ++it) {
945 if (it->lo == nextlo) {
946 nextlo = it->hi + 1;
947 } else {
948 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
949 nextlo = it->hi + 1;
950 }
951 }
952 if (nextlo <= Runemax)
953 cc->ranges_[n++] = RuneRange(nextlo, Runemax);
954 cc->nranges_ = n;
955 return cc;
956 }
957
Contains(Rune r) const958 bool CharClass::Contains(Rune r) const {
959 RuneRange* rr = ranges_;
960 int n = nranges_;
961 while (n > 0) {
962 int m = n/2;
963 if (rr[m].hi < r) {
964 rr += m+1;
965 n -= m+1;
966 } else if (r < rr[m].lo) {
967 n = m;
968 } else { // rr[m].lo <= r && r <= rr[m].hi
969 return true;
970 }
971 }
972 return false;
973 }
974
GetCharClass()975 CharClass* CharClassBuilder::GetCharClass() {
976 CharClass* cc = CharClass::New(ranges_.size());
977 int n = 0;
978 for (iterator it = begin(); it != end(); ++it)
979 cc->ranges_[n++] = *it;
980 cc->nranges_ = n;
981 DCHECK_LE(n, static_cast<int>(ranges_.size()));
982 cc->nrunes_ = nrunes_;
983 cc->folds_ascii_ = FoldsASCII();
984 return cc;
985 }
986
987 } // namespace re2
988