1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use std::cell::RefCell;
12 use std::collections::HashMap;
13 use std::cmp;
14 use std::sync::Arc;
15 
16 use thread_local::CachedThreadLocal;
17 use syntax::{Expr, ExprBuilder, Literals};
18 
19 use backtrack;
20 use compile::Compiler;
21 use dfa;
22 use error::Error;
23 use input::{ByteInput, CharInput};
24 use literals::LiteralSearcher;
25 use pikevm;
26 use prog::Program;
27 use re_builder::RegexOptions;
28 use re_bytes;
29 use re_set;
30 use re_trait::{RegularExpression, Slot, Locations, as_slots};
31 use re_unicode;
32 use utf8::next_utf8;
33 
34 /// Exec manages the execution of a regular expression.
35 ///
36 /// In particular, this manages the various compiled forms of a single regular
37 /// expression and the choice of which matching engine to use to execute a
38 /// regular expression.
39 pub struct Exec {
40     /// All read only state.
41     ro: Arc<ExecReadOnly>,
42     /// Caches for the various matching engines.
43     cache: CachedThreadLocal<ProgramCache>,
44 }
45 
46 /// ExecNoSync is like Exec, except it embeds a reference to a cache. This
47 /// means it is no longer Sync, but we can now avoid the overhead of
48 /// synchronization to fetch the cache.
49 #[derive(Debug)]
50 pub struct ExecNoSync<'c> {
51     /// All read only state.
52     ro: &'c Arc<ExecReadOnly>,
53     /// Caches for the various matching engines.
54     cache: &'c ProgramCache,
55 }
56 
57 /// ExecNoSyncStr is like ExecNoSync, but matches on &str instead of &[u8].
58 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
59 
60 /// ExecReadOnly comprises all read only state for a regex. Namely, all such
61 /// state is determined at compile time and never changes during search.
62 #[derive(Debug)]
63 struct ExecReadOnly {
64     /// The original regular expressions given by the caller to compile.
65     res: Vec<String>,
66     /// A compiled program that is used in the NFA simulation and backtracking.
67     /// It can be byte-based or Unicode codepoint based.
68     ///
69     /// N.B. It is not possibly to make this byte-based from the public API.
70     /// It is only used for testing byte based programs in the NFA simulations.
71     nfa: Program,
72     /// A compiled byte based program for DFA execution. This is only used
73     /// if a DFA can be executed. (Currently, only word boundary assertions are
74     /// not supported.) Note that this program contains an embedded `.*?`
75     /// preceding the first capture group, unless the regex is anchored at the
76     /// beginning.
77     dfa: Program,
78     /// The same as above, except the program is reversed (and there is no
79     /// preceding `.*?`). This is used by the DFA to find the starting location
80     /// of matches.
81     dfa_reverse: Program,
82     /// A set of suffix literals extracted from the regex.
83     ///
84     /// Prefix literals are stored on the `Program`, since they are used inside
85     /// the matching engines.
86     suffixes: LiteralSearcher,
87     /// match_type encodes as much upfront knowledge about how we're going to
88     /// execute a search as possible.
89     match_type: MatchType,
90 }
91 
92 /// Facilitates the construction of an executor by exposing various knobs
93 /// to control how a regex is executed and what kinds of resources it's
94 /// permitted to use.
95 pub struct ExecBuilder {
96     options: RegexOptions,
97     match_type: Option<MatchType>,
98     bytes: bool,
99     only_utf8: bool,
100 }
101 
102 /// Parsed represents a set of parsed regular expressions and their detected
103 /// literals.
104 struct Parsed {
105     exprs: Vec<Expr>,
106     prefixes: Literals,
107     suffixes: Literals,
108     bytes: bool,
109 }
110 
111 impl ExecBuilder {
112     /// Create a regex execution builder.
113     ///
114     /// This uses default settings for everything except the regex itself,
115     /// which must be provided. Further knobs can be set by calling methods,
116     /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self117     pub fn new(re: &str) -> Self {
118         Self::new_many(&[re])
119     }
120 
121     /// Like new, but compiles the union of the given regular expressions.
122     ///
123     /// Note that when compiling 2 or more regular expressions, capture groups
124     /// are completely unsupported. (This means both `find` and `captures`
125     /// wont work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item=S>126     pub fn new_many<I, S>(res: I) -> Self
127             where S: AsRef<str>, I: IntoIterator<Item=S> {
128         let mut opts = RegexOptions::default();
129         opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
130         Self::new_options(opts)
131     }
132 
133     /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self134     pub fn new_options(opts: RegexOptions) -> Self {
135         ExecBuilder {
136             options: opts,
137             match_type: None,
138             bytes: false,
139             only_utf8: true,
140         }
141     }
142 
143     /// Set the matching engine to be automatically determined.
144     ///
145     /// This is the default state and will apply whatever optimizations are
146     /// possible, such as running a DFA.
147     ///
148     /// This overrides whatever was previously set via the `nfa` or
149     /// `bounded_backtracking` methods.
automatic(mut self) -> Self150     pub fn automatic(mut self) -> Self {
151         self.match_type = None;
152         self
153     }
154 
155     /// Sets the matching engine to use the NFA algorithm no matter what
156     /// optimizations are possible.
157     ///
158     /// This overrides whatever was previously set via the `automatic` or
159     /// `bounded_backtracking` methods.
nfa(mut self) -> Self160     pub fn nfa(mut self) -> Self {
161         self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
162         self
163     }
164 
165     /// Sets the matching engine to use a bounded backtracking engine no
166     /// matter what optimizations are possible.
167     ///
168     /// One must use this with care, since the bounded backtracking engine
169     /// uses memory proportion to `len(regex) * len(text)`.
170     ///
171     /// This overrides whatever was previously set via the `automatic` or
172     /// `nfa` methods.
bounded_backtracking(mut self) -> Self173     pub fn bounded_backtracking(mut self) -> Self {
174         self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
175         self
176     }
177 
178     /// Compiles byte based programs for use with the NFA matching engines.
179     ///
180     /// By default, the NFA engines match on Unicode scalar values. They can
181     /// be made to use byte based programs instead. In general, the byte based
182     /// programs are slower because of a less efficient encoding of character
183     /// classes.
184     ///
185     /// Note that this does not impact DFA matching engines, which always
186     /// execute on bytes.
bytes(mut self, yes: bool) -> Self187     pub fn bytes(mut self, yes: bool) -> Self {
188         self.bytes = yes;
189         self
190     }
191 
192     /// When disabled, the program compiled may match arbitrary bytes.
193     ///
194     /// When enabled (the default), all compiled programs exclusively match
195     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self196     pub fn only_utf8(mut self, yes: bool) -> Self {
197         self.only_utf8 = yes;
198         self
199     }
200 
201     /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self202     pub fn unicode(mut self, yes: bool) -> Self {
203         self.options.unicode = yes;
204         self
205     }
206 
207     /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>208     fn parse(&self) -> Result<Parsed, Error> {
209         let mut exprs = Vec::with_capacity(self.options.pats.len());
210         let mut prefixes = Some(Literals::empty());
211         let mut suffixes = Some(Literals::empty());
212         let mut bytes = false;
213         let is_set = self.options.pats.len() > 1;
214         // If we're compiling a regex set and that set has any anchored
215         // expressions, then disable all literal optimizations.
216         for pat in &self.options.pats {
217             let parser =
218                 ExprBuilder::new()
219                     .case_insensitive(self.options.case_insensitive)
220                     .multi_line(self.options.multi_line)
221                     .dot_matches_new_line(self.options.dot_matches_new_line)
222                     .swap_greed(self.options.swap_greed)
223                     .ignore_whitespace(self.options.ignore_whitespace)
224                     .unicode(self.options.unicode)
225                     .allow_bytes(!self.only_utf8);
226             let expr = try!(parser.parse(pat));
227             bytes = bytes || expr.has_bytes();
228 
229             if !expr.is_anchored_start() && expr.has_anchored_start() {
230                 // Partial anchors unfortunately make it hard to use prefixes,
231                 // so disable them.
232                 prefixes = None;
233             } else if is_set && expr.is_anchored_start() {
234                 // Regex sets with anchors do not go well with literal
235                 // optimizations.
236                 prefixes = None;
237             }
238             prefixes = prefixes.and_then(|mut prefixes| {
239                 if !prefixes.union_prefixes(&expr) {
240                     None
241                 } else {
242                     Some(prefixes)
243                 }
244             });
245 
246             if !expr.is_anchored_end() && expr.has_anchored_end() {
247                 // Partial anchors unfortunately make it hard to use suffixes,
248                 // so disable them.
249                 suffixes = None;
250             } else if is_set && expr.is_anchored_end() {
251                 // Regex sets with anchors do not go well with literal
252                 // optimizations.
253                 prefixes = None;
254             }
255             suffixes = suffixes.and_then(|mut suffixes| {
256                 if !suffixes.union_suffixes(&expr) {
257                     None
258                 } else {
259                     Some(suffixes)
260                 }
261             });
262             exprs.push(expr);
263         }
264         Ok(Parsed {
265             exprs: exprs,
266             prefixes: prefixes.unwrap_or(Literals::empty()),
267             suffixes: suffixes.unwrap_or(Literals::empty()),
268             bytes: bytes,
269         })
270     }
271 
272     /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>273     pub fn build(self) -> Result<Exec, Error> {
274         // Special case when we have no patterns to compile.
275         // This can happen when compiling a regex set.
276         if self.options.pats.is_empty() {
277             let ro = Arc::new(ExecReadOnly {
278                 res: vec![],
279                 nfa: Program::new(),
280                 dfa: Program::new(),
281                 dfa_reverse: Program::new(),
282                 suffixes: LiteralSearcher::empty(),
283                 match_type: MatchType::Nothing,
284             });
285             return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() });
286         }
287         let parsed = try!(self.parse());
288         let mut nfa = try!(
289             Compiler::new()
290                      .size_limit(self.options.size_limit)
291                      .bytes(self.bytes || parsed.bytes)
292                      .only_utf8(self.only_utf8)
293                      .compile(&parsed.exprs));
294         let mut dfa = try!(
295             Compiler::new()
296                      .size_limit(self.options.size_limit)
297                      .dfa(true)
298                      .only_utf8(self.only_utf8)
299                      .compile(&parsed.exprs));
300         let mut dfa_reverse = try!(
301             Compiler::new()
302                      .size_limit(self.options.size_limit)
303                      .dfa(true)
304                      .only_utf8(self.only_utf8)
305                      .reverse(true)
306                      .compile(&parsed.exprs));
307 
308         let prefixes = parsed.prefixes.unambiguous_prefixes();
309         let suffixes = parsed.suffixes.unambiguous_suffixes();
310         nfa.prefixes = LiteralSearcher::prefixes(prefixes);
311         dfa.prefixes = nfa.prefixes.clone();
312         dfa.dfa_size_limit = self.options.dfa_size_limit;
313         dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
314 
315         let mut ro = ExecReadOnly {
316             res: self.options.pats,
317             nfa: nfa,
318             dfa: dfa,
319             dfa_reverse: dfa_reverse,
320             suffixes: LiteralSearcher::suffixes(suffixes),
321             match_type: MatchType::Nothing,
322         };
323         ro.match_type = ro.choose_match_type(self.match_type);
324 
325         let ro = Arc::new(ro);
326         Ok(Exec { ro: ro, cache: CachedThreadLocal::new() })
327     }
328 }
329 
330 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
331     type Text = str;
332 
slots_len(&self) -> usize333     fn slots_len(&self) -> usize { self.0.slots_len() }
334 
next_after_empty(&self, text: &str, i: usize) -> usize335     fn next_after_empty(&self, text: &str, i: usize) -> usize {
336         next_utf8(text.as_bytes(), i)
337     }
338 
339     #[inline(always)] // reduces constant overhead
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>340     fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
341         self.0.shortest_match_at(text.as_bytes(), start)
342     }
343 
344     #[inline(always)] // reduces constant overhead
is_match_at(&self, text: &str, start: usize) -> bool345     fn is_match_at(&self, text: &str, start: usize) -> bool {
346         self.0.is_match_at(text.as_bytes(), start)
347     }
348 
349     #[inline(always)] // reduces constant overhead
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>350     fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
351         self.0.find_at(text.as_bytes(), start)
352     }
353 
354     #[inline(always)] // reduces constant overhead
read_captures_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>355     fn read_captures_at(
356         &self,
357         locs: &mut Locations,
358         text: &str,
359         start: usize,
360     ) -> Option<(usize, usize)> {
361         self.0.read_captures_at(locs, text.as_bytes(), start)
362     }
363 }
364 
365 impl<'c> RegularExpression for ExecNoSync<'c> {
366     type Text = [u8];
367 
368     /// Returns the number of capture slots in the regular expression. (There
369     /// are two slots for every capture group, corresponding to possibly empty
370     /// start and end locations of the capture.)
slots_len(&self) -> usize371     fn slots_len(&self) -> usize {
372         self.ro.nfa.captures.len() * 2
373     }
374 
next_after_empty(&self, _text: &[u8], i: usize) -> usize375     fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
376         i + 1
377     }
378 
379     /// Returns the end of a match location, possibly occurring before the
380     /// end location of the correct leftmost-first match.
381     #[inline(always)] // reduces constant overhead
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>382     fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
383         if !self.is_anchor_end_match(text) {
384             return None;
385         }
386         match self.ro.match_type {
387             MatchType::Literal(ty) => {
388                 self.find_literals(ty, text, start).map(|(_, e)| e)
389             }
390             MatchType::Dfa | MatchType::DfaMany => {
391                 match self.shortest_dfa(text, start) {
392                     dfa::Result::Match(end) => Some(end),
393                     dfa::Result::NoMatch(_) => None,
394                     dfa::Result::Quit => self.shortest_nfa(text, start),
395                 }
396             }
397             MatchType::DfaAnchoredReverse => {
398                 match dfa::Fsm::reverse(
399                     &self.ro.dfa_reverse,
400                     &self.cache,
401                     true,
402                     &text[start..],
403                     text.len(),
404                 ) {
405                     dfa::Result::Match(_) => Some(text.len()),
406                     dfa::Result::NoMatch(_) => None,
407                     dfa::Result::Quit => self.shortest_nfa(text, start),
408                 }
409             }
410             MatchType::DfaSuffix => {
411                 match self.shortest_dfa_reverse_suffix(text, start) {
412                     dfa::Result::Match(e) => Some(e),
413                     dfa::Result::NoMatch(_) => None,
414                     dfa::Result::Quit => self.shortest_nfa(text, start),
415                 }
416             }
417             MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
418             MatchType::Nothing => None,
419         }
420     }
421 
422     /// Returns true if and only if the regex matches text.
423     ///
424     /// For single regular expressions, this is equivalent to calling
425     /// shortest_match(...).is_some().
426     #[inline(always)] // reduces constant overhead
is_match_at(&self, text: &[u8], start: usize) -> bool427     fn is_match_at(&self, text: &[u8], start: usize) -> bool {
428         if !self.is_anchor_end_match(text) {
429             return false;
430         }
431         // We need to do this dance because shortest_match relies on the NFA
432         // filling in captures[1], but a RegexSet has no captures. In other
433         // words, a RegexSet can't (currently) use shortest_match. ---AG
434         match self.ro.match_type {
435             MatchType::Literal(ty) => {
436                 self.find_literals(ty, text, start).is_some()
437             }
438             MatchType::Dfa | MatchType::DfaMany => {
439                 match self.shortest_dfa(text, start) {
440                     dfa::Result::Match(_) => true,
441                     dfa::Result::NoMatch(_) => false,
442                     dfa::Result::Quit => self.match_nfa(text, start),
443                 }
444             }
445             MatchType::DfaAnchoredReverse => {
446                 match dfa::Fsm::reverse(
447                     &self.ro.dfa_reverse,
448                     &self.cache,
449                     true,
450                     &text[start..],
451                     text.len(),
452                 ) {
453                     dfa::Result::Match(_) => true,
454                     dfa::Result::NoMatch(_) => false,
455                     dfa::Result::Quit => self.match_nfa(text, start),
456                 }
457             }
458             MatchType::DfaSuffix => {
459                 match self.shortest_dfa_reverse_suffix(text, start) {
460                     dfa::Result::Match(_) => true,
461                     dfa::Result::NoMatch(_) => false,
462                     dfa::Result::Quit => self.match_nfa(text, start),
463                 }
464             }
465             MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
466             MatchType::Nothing => false,
467         }
468     }
469 
470     /// Finds the start and end location of the leftmost-first match, starting
471     /// at the given location.
472     #[inline(always)] // reduces constant overhead
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>473     fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
474         if !self.is_anchor_end_match(text) {
475             return None;
476         }
477         match self.ro.match_type {
478             MatchType::Literal(ty) => {
479                 self.find_literals(ty, text, start)
480             }
481             MatchType::Dfa => {
482                 match self.find_dfa_forward(text, start) {
483                     dfa::Result::Match((s, e)) => Some((s, e)),
484                     dfa::Result::NoMatch(_) => None,
485                     dfa::Result::Quit => {
486                         self.find_nfa(MatchNfaType::Auto, text, start)
487                     }
488                 }
489             }
490             MatchType::DfaAnchoredReverse => {
491                 match self.find_dfa_anchored_reverse(text, start) {
492                     dfa::Result::Match((s, e)) => Some((s, e)),
493                     dfa::Result::NoMatch(_) => None,
494                     dfa::Result::Quit => {
495                         self.find_nfa(MatchNfaType::Auto, text, start)
496                     }
497                 }
498             }
499             MatchType::DfaSuffix => {
500                 match self.find_dfa_reverse_suffix(text, start) {
501                     dfa::Result::Match((s, e)) => Some((s, e)),
502                     dfa::Result::NoMatch(_) => None,
503                     dfa::Result::Quit => {
504                         self.find_nfa(MatchNfaType::Auto, text, start)
505                     }
506                 }
507             }
508             MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
509             MatchType::Nothing => None,
510             MatchType::DfaMany => {
511                 unreachable!("BUG: RegexSet cannot be used with find")
512             }
513         }
514     }
515 
516     /// Finds the start and end location of the leftmost-first match and also
517     /// fills in all matching capture groups.
518     ///
519     /// The number of capture slots given should be equal to the total number
520     /// of capture slots in the compiled program.
521     ///
522     /// Note that the first two slots always correspond to the start and end
523     /// locations of the overall match.
read_captures_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>524     fn read_captures_at(
525         &self,
526         locs: &mut Locations,
527         text: &[u8],
528         start: usize,
529     ) -> Option<(usize, usize)> {
530         let slots = as_slots(locs);
531         for slot in slots.iter_mut() {
532             *slot = None;
533         }
534         // If the caller unnecessarily uses this, then we try to save them
535         // from themselves.
536         match slots.len() {
537             0 => return self.find_at(text, start),
538             2 => {
539                 return self.find_at(text, start).map(|(s, e)| {
540                     slots[0] = Some(s);
541                     slots[1] = Some(e);
542                     (s, e)
543                 });
544             }
545             _ => {} // fallthrough
546         }
547         if !self.is_anchor_end_match(text) {
548             return None;
549         }
550         match self.ro.match_type {
551             MatchType::Literal(ty) => {
552                 self.find_literals(ty, text, start).and_then(|(s, e)| {
553                     self.captures_nfa_with_match(slots, text, s, e)
554                 })
555             }
556             MatchType::Dfa => {
557                 match self.find_dfa_forward(text, start) {
558                     dfa::Result::Match((s, e)) => {
559                         self.captures_nfa_with_match(slots, text, s, e)
560                     }
561                     dfa::Result::NoMatch(_) => None,
562                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
563                 }
564             }
565             MatchType::DfaAnchoredReverse => {
566                 match self.find_dfa_anchored_reverse(text, start) {
567                     dfa::Result::Match((s, e)) => {
568                         self.captures_nfa_with_match(slots, text, s, e)
569                     }
570                     dfa::Result::NoMatch(_) => None,
571                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
572                 }
573             }
574             MatchType::DfaSuffix => {
575                 match self.find_dfa_reverse_suffix(text, start) {
576                     dfa::Result::Match((s, e)) => {
577                         self.captures_nfa_with_match(slots, text, s, e)
578                     }
579                     dfa::Result::NoMatch(_) => None,
580                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
581                 }
582             }
583             MatchType::Nfa(ty) => {
584                 self.captures_nfa_type(ty, slots, text, start)
585             }
586             MatchType::Nothing => None,
587             MatchType::DfaMany => {
588                 unreachable!("BUG: RegexSet cannot be used with captures")
589             }
590         }
591     }
592 }
593 
594 impl<'c> ExecNoSync<'c> {
595     /// Finds the leftmost-first match using only literal search.
596     #[inline(always)] // reduces constant overhead
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>597     fn find_literals(
598         &self,
599         ty: MatchLiteralType,
600         text: &[u8],
601         start: usize,
602     ) -> Option<(usize, usize)> {
603         use self::MatchLiteralType::*;
604         match ty {
605             Unanchored => {
606                 let lits = &self.ro.nfa.prefixes;
607                 lits.find(&text[start..])
608                     .map(|(s, e)| (start + s, start + e))
609             }
610             AnchoredStart => {
611                 let lits = &self.ro.nfa.prefixes;
612                 lits.find_start(&text[start..])
613                     .map(|(s, e)| (start + s, start + e))
614             }
615             AnchoredEnd => {
616                 let lits = &self.ro.suffixes;
617                 lits.find_end(&text[start..])
618                     .map(|(s, e)| (start + s, start + e))
619             }
620         }
621     }
622 
623     /// Finds the leftmost-first match (start and end) using only the DFA.
624     ///
625     /// If the result returned indicates that the DFA quit, then another
626     /// matching engine should be used.
627     #[inline(always)] // reduces constant overhead
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>628     fn find_dfa_forward(
629         &self,
630         text: &[u8],
631         start: usize,
632     ) -> dfa::Result<(usize, usize)> {
633         use dfa::Result::*;
634         let end = match dfa::Fsm::forward(
635             &self.ro.dfa,
636             &self.cache,
637             false,
638             text,
639             start,
640         ) {
641             NoMatch(i) => return NoMatch(i),
642             Quit => return Quit,
643             Match(end) if start == end => return Match((start, start)),
644             Match(end) => end,
645         };
646         // Now run the DFA in reverse to find the start of the match.
647         match dfa::Fsm::reverse(
648             &self.ro.dfa_reverse,
649             &self.cache,
650             false,
651             &text[start..],
652             end - start,
653         ) {
654             Match(s) => Match((start + s, end)),
655             NoMatch(i) => NoMatch(i),
656             Quit => Quit,
657         }
658     }
659 
660     /// Finds the leftmost-first match (start and end) using only the DFA,
661     /// but assumes the regex is anchored at the end and therefore starts at
662     /// the end of the regex and matches in reverse.
663     ///
664     /// If the result returned indicates that the DFA quit, then another
665     /// matching engine should be used.
666     #[inline(always)] // reduces constant overhead
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>667     fn find_dfa_anchored_reverse(
668         &self,
669         text: &[u8],
670         start: usize,
671     ) -> dfa::Result<(usize, usize)> {
672         use dfa::Result::*;
673         match dfa::Fsm::reverse(
674             &self.ro.dfa_reverse,
675             &self.cache,
676             false,
677             &text[start..],
678             text.len() - start,
679         ) {
680             Match(s) => Match((start + s, text.len())),
681             NoMatch(i) => NoMatch(i),
682             Quit => Quit,
683         }
684     }
685 
686     /// Finds the end of the shortest match using only the DFA.
687     #[inline(always)] // reduces constant overhead
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>688     fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
689         dfa::Fsm::forward(&self.ro.dfa, &self.cache, true, text, start)
690     }
691 
692     /// Finds the end of the shortest match using only the DFA by scanning for
693     /// suffix literals.
694     ///
695     #[inline(always)] // reduces constant overhead
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>696     fn shortest_dfa_reverse_suffix(
697         &self,
698         text: &[u8],
699         start: usize,
700     ) -> dfa::Result<usize> {
701         match self.exec_dfa_reverse_suffix(text, start) {
702             None => self.shortest_dfa(text, start),
703             Some(r) => r.map(|(_, end)| end),
704         }
705     }
706 
707     /// Finds the end of the shortest match using only the DFA by scanning for
708     /// suffix literals. It also reports the start of the match.
709     ///
710     /// Note that if None is returned, then the optimization gave up to avoid
711     /// worst case quadratic behavior. A forward scanning DFA should be tried
712     /// next.
713     ///
714     /// If a match is returned and the full leftmost-first match is desired,
715     /// then a forward scan starting from the beginning of the match must be
716     /// done.
717     ///
718     /// If the result returned indicates that the DFA quit, then another
719     /// matching engine should be used.
720     #[inline(always)] // reduces constant overhead
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>721     fn exec_dfa_reverse_suffix(
722         &self,
723         text: &[u8],
724         original_start: usize,
725     ) -> Option<dfa::Result<(usize, usize)>> {
726         use dfa::Result::*;
727 
728         let lcs = self.ro.suffixes.lcs();
729         debug_assert!(lcs.len() >= 1);
730         let mut start = original_start;
731         let mut end = start;
732         while end <= text.len() {
733             start = end;
734             end = end + match lcs.find(&text[end..]) {
735                 None => return Some(NoMatch(text.len())),
736                 Some(start) => start + lcs.len(),
737             };
738             match dfa::Fsm::reverse(
739                 &self.ro.dfa_reverse,
740                 &self.cache,
741                 false,
742                 &text[start..end],
743                 end - start,
744             ) {
745                 Match(0) | NoMatch(0) => return None,
746                 Match(s) => return Some(Match((s + start, end))),
747                 NoMatch(_) => continue,
748                 Quit => return Some(Quit),
749             };
750         }
751         Some(NoMatch(text.len()))
752     }
753 
754     /// Finds the leftmost-first match (start and end) using only the DFA
755     /// by scanning for suffix literals.
756     ///
757     /// If the result returned indicates that the DFA quit, then another
758     /// matching engine should be used.
759     #[inline(always)] // reduces constant overhead
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>760     fn find_dfa_reverse_suffix(
761         &self,
762         text: &[u8],
763         start: usize,
764     ) -> dfa::Result<(usize, usize)> {
765         use dfa::Result::*;
766 
767         let match_start = match self.exec_dfa_reverse_suffix(text, start) {
768             None => return self.find_dfa_forward(text, start),
769             Some(Match((start, _))) => start,
770             Some(r) => return r,
771         };
772         // At this point, we've found a match. The only way to quit now
773         // without a match is if the DFA gives up (seems unlikely).
774         //
775         // Now run the DFA forwards to find the proper end of the match.
776         // (The suffix literal match can only indicate the earliest
777         // possible end location, which may appear before the end of the
778         // leftmost-first match.)
779         match dfa::Fsm::forward(
780             &self.ro.dfa,
781             &self.cache,
782             false,
783             text,
784             match_start,
785         ) {
786             NoMatch(_) => panic!("BUG: reverse match implies forward match"),
787             Quit => Quit,
788             Match(e) => Match((match_start, e)),
789         }
790     }
791 
792     /// Executes the NFA engine to return whether there is a match or not.
793     ///
794     /// Ideally, we could use shortest_nfa(...).is_some() and get the same
795     /// performance characteristics, but regex sets don't have captures, which
796     /// shortest_nfa depends on.
match_nfa( &self, text: &[u8], start: usize, ) -> bool797     fn match_nfa(
798         &self,
799         text: &[u8],
800         start: usize,
801     ) -> bool {
802         self.match_nfa_type(MatchNfaType::Auto, text, start)
803     }
804 
805     /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool806     fn match_nfa_type(
807         &self,
808         ty: MatchNfaType,
809         text: &[u8],
810         start: usize,
811     ) -> bool {
812         self.exec_nfa(ty, &mut [false], &mut [], true, text, start)
813     }
814 
815     /// Finds the shortest match using an NFA.
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>816     fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
817         self.shortest_nfa_type(MatchNfaType::Auto, text, start)
818     }
819 
820     /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>821     fn shortest_nfa_type(
822         &self,
823         ty: MatchNfaType,
824         text: &[u8],
825         start: usize,
826     ) -> Option<usize> {
827         let mut slots = [None, None];
828         if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) {
829             slots[1]
830         } else {
831             None
832         }
833     }
834 
835     /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>836     fn find_nfa(
837         &self,
838         ty: MatchNfaType,
839         text: &[u8],
840         start: usize,
841     ) -> Option<(usize, usize)> {
842         let mut slots = [None, None];
843         if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) {
844             match (slots[0], slots[1]) {
845                 (Some(s), Some(e)) => Some((s, e)),
846                 _ => None,
847             }
848         } else {
849             None
850         }
851     }
852 
853     /// Like find_nfa, but fills in captures and restricts the search space
854     /// using previously found match information.
855     ///
856     /// `slots` should have length equal to `2 * nfa.captures.len()`.
captures_nfa_with_match( &self, slots: &mut [Slot], text: &[u8], match_start: usize, match_end: usize, ) -> Option<(usize, usize)>857     fn captures_nfa_with_match(
858         &self,
859         slots: &mut [Slot],
860         text: &[u8],
861         match_start: usize,
862         match_end: usize,
863     ) -> Option<(usize, usize)> {
864         // We can't use match_end directly, because we may need to examine one
865         // "character" after the end of a match for lookahead operators. We
866         // need to move two characters beyond the end, since some look-around
867         // operations may falsely assume a premature end of text otherwise.
868         let e = cmp::min(
869             next_utf8(text, next_utf8(text, match_end)), text.len());
870         self.captures_nfa(slots, &text[..e], match_start)
871     }
872 
873     /// Like find_nfa, but fills in captures.
874     ///
875     /// `slots` should have length equal to `2 * nfa.captures.len()`.
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>876     fn captures_nfa(
877         &self,
878         slots: &mut [Slot],
879         text: &[u8],
880         start: usize,
881     ) -> Option<(usize, usize)> {
882         self.captures_nfa_type(MatchNfaType::Auto, slots, text, start)
883     }
884 
885     /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>886     fn captures_nfa_type(
887         &self,
888         ty: MatchNfaType,
889         slots: &mut [Slot],
890         text: &[u8],
891         start: usize,
892     ) -> Option<(usize, usize)> {
893         if self.exec_nfa(ty, &mut [false], slots, false, text, start) {
894             match (slots[0], slots[1]) {
895                 (Some(s), Some(e)) => Some((s, e)),
896                 _ => None,
897             }
898         } else {
899             None
900         }
901     }
902 
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, ) -> bool903     fn exec_nfa(
904         &self,
905         mut ty: MatchNfaType,
906         matches: &mut [bool],
907         slots: &mut [Slot],
908         quit_after_match: bool,
909         text: &[u8],
910         start: usize,
911     ) -> bool {
912         use self::MatchNfaType::*;
913         if let Auto = ty {
914             if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
915                 ty = Backtrack;
916             } else {
917                 ty = PikeVM;
918             }
919         }
920         match ty {
921             Auto => unreachable!(),
922             Backtrack => self.exec_backtrack(matches, slots, text, start),
923             PikeVM => {
924                 self.exec_pikevm(
925                     matches, slots, quit_after_match, text, start)
926             }
927         }
928     }
929 
930     /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, ) -> bool931     fn exec_pikevm(
932         &self,
933         matches: &mut [bool],
934         slots: &mut [Slot],
935         quit_after_match: bool,
936         text: &[u8],
937         start: usize,
938     ) -> bool {
939         if self.ro.nfa.uses_bytes() {
940             pikevm::Fsm::exec(
941                 &self.ro.nfa,
942                 &self.cache,
943                 matches,
944                 slots,
945                 quit_after_match,
946                 ByteInput::new(text, self.ro.nfa.only_utf8),
947                 start)
948         } else {
949             pikevm::Fsm::exec(
950                 &self.ro.nfa,
951                 &self.cache,
952                 matches,
953                 slots,
954                 quit_after_match,
955                 CharInput::new(text),
956                 start)
957         }
958     }
959 
960     /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, ) -> bool961     fn exec_backtrack(
962         &self,
963         matches: &mut [bool],
964         slots: &mut [Slot],
965         text: &[u8],
966         start: usize,
967     ) -> bool {
968         if self.ro.nfa.uses_bytes() {
969             backtrack::Bounded::exec(
970                 &self.ro.nfa,
971                 &self.cache,
972                 matches,
973                 slots,
974                 ByteInput::new(text, self.ro.nfa.only_utf8),
975                 start)
976         } else {
977             backtrack::Bounded::exec(
978                 &self.ro.nfa,
979                 &self.cache,
980                 matches,
981                 slots,
982                 CharInput::new(text),
983                 start)
984         }
985     }
986 
987     /// Finds which regular expressions match the given text.
988     ///
989     /// `matches` should have length equal to the number of regexes being
990     /// searched.
991     ///
992     /// This is only useful when one wants to know which regexes in a set
993     /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool994     pub fn many_matches_at(
995         &self,
996         matches: &mut [bool],
997         text: &[u8],
998         start: usize,
999     ) -> bool {
1000         use self::MatchType::*;
1001         if !self.is_anchor_end_match(text) {
1002             return false;
1003         }
1004         match self.ro.match_type {
1005             Literal(ty) => {
1006                 debug_assert!(matches.len() == 1);
1007                 matches[0] = self.find_literals(ty, text, start).is_some();
1008                 matches[0]
1009             }
1010             Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => {
1011                 match dfa::Fsm::forward_many(
1012                     &self.ro.dfa,
1013                     &self.cache,
1014                     matches,
1015                     text,
1016                     start,
1017                 ) {
1018                     dfa::Result::Match(_) => true,
1019                     dfa::Result::NoMatch(_) => false,
1020                     dfa::Result::Quit => {
1021                         self.exec_nfa(
1022                             MatchNfaType::Auto,
1023                             matches,
1024                             &mut [],
1025                             false,
1026                             text,
1027                             start)
1028                     }
1029                 }
1030             }
1031             Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start),
1032             Nothing => false,
1033         }
1034     }
1035 
1036     #[inline(always)] // reduces constant overhead
is_anchor_end_match(&self, text: &[u8]) -> bool1037     fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1038         // Only do this check if the haystack is big (>1MB).
1039         if text.len() > (1<<20) && self.ro.nfa.is_anchored_end {
1040             let lcs = self.ro.suffixes.lcs();
1041             if lcs.len() >= 1 && !lcs.is_suffix(text) {
1042                 return false;
1043             }
1044         }
1045         true
1046     }
1047 
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1048     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1049         &self.ro.nfa.capture_name_idx
1050     }
1051 }
1052 
1053 impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1054     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1055         self.0.capture_name_idx()
1056     }
1057 }
1058 
1059 impl Exec {
1060     /// Get a searcher that isn't Sync.
1061     #[inline(always)] // reduces constant overhead
searcher(&self) -> ExecNoSync1062     pub fn searcher(&self) -> ExecNoSync {
1063         let create = || Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)));
1064         ExecNoSync {
1065             ro: &self.ro, // a clone is too expensive here! (and not needed)
1066             cache: self.cache.get_or(create),
1067         }
1068     }
1069 
1070     /// Get a searcher that isn't Sync and can match on &str.
1071     #[inline(always)] // reduces constant overhead
searcher_str(&self) -> ExecNoSyncStr1072     pub fn searcher_str(&self) -> ExecNoSyncStr {
1073         ExecNoSyncStr(self.searcher())
1074     }
1075 
1076     /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1077     pub fn into_regex(self) -> re_unicode::Regex {
1078         re_unicode::Regex::from(self)
1079     }
1080 
1081     /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1082     pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1083         re_set::unicode::RegexSet::from(self)
1084     }
1085 
1086     /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1087     pub fn into_byte_regex(self) -> re_bytes::Regex {
1088         re_bytes::Regex::from(self)
1089     }
1090 
1091     /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1092     pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1093         re_set::bytes::RegexSet::from(self)
1094     }
1095 
1096     /// The original regular expressions given by the caller that were
1097     /// compiled.
regex_strings(&self) -> &[String]1098     pub fn regex_strings(&self) -> &[String] {
1099         &self.ro.res
1100     }
1101 
1102     /// Return a slice of capture names.
1103     ///
1104     /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1105     pub fn capture_names(&self) -> &[Option<String>] {
1106         &self.ro.nfa.captures
1107     }
1108 
1109     /// Return a reference to named groups mapping (from group name to
1110     /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1111     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1112         &self.ro.nfa.capture_name_idx
1113     }
1114 }
1115 
1116 impl Clone for Exec {
clone(&self) -> Exec1117     fn clone(&self) -> Exec {
1118         Exec {
1119             ro: self.ro.clone(),
1120             cache: CachedThreadLocal::new(),
1121         }
1122     }
1123 }
1124 
1125 impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1126     fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1127         use self::MatchType::*;
1128         if let Some(Nfa(_)) = hint {
1129             return hint.unwrap();
1130         }
1131         // If the NFA is empty, then we'll never match anything.
1132         if self.nfa.insts.is_empty() {
1133             return Nothing;
1134         }
1135         // If our set of prefixes is complete, then we can use it to find
1136         // a match in lieu of a regex engine. This doesn't quite work well in
1137         // the presence of multiple regexes, so only do it when there's one.
1138         //
1139         // TODO(burntsushi): Also, don't try to match literals if the regex is
1140         // partially anchored. We could technically do it, but we'd need to
1141         // create two sets of literals: all of them and then the subset that
1142         // aren't anchored. We would then only search for all of them when at
1143         // the beginning of the input and use the subset in all other cases.
1144         if self.res.len() == 1 {
1145             if self.nfa.prefixes.complete() {
1146                 return if self.nfa.is_anchored_start {
1147                     Literal(MatchLiteralType::AnchoredStart)
1148                 } else {
1149                     Literal(MatchLiteralType::Unanchored)
1150                 };
1151             }
1152             if self.suffixes.complete() {
1153                 return if self.nfa.is_anchored_end {
1154                     Literal(MatchLiteralType::AnchoredEnd)
1155                 } else {
1156                     // This case shouldn't happen. When the regex isn't
1157                     // anchored, then complete prefixes should imply complete
1158                     // suffixes.
1159                     Literal(MatchLiteralType::Unanchored)
1160                 };
1161             }
1162         }
1163         // If we can execute the DFA, then we totally should.
1164         if dfa::can_exec(&self.dfa) {
1165             // Regex sets require a slightly specialized path.
1166             if self.res.len() >= 2 {
1167                 return DfaMany;
1168             }
1169             // If the regex is anchored at the end but not the start, then
1170             // just match in reverse from the end of the haystack.
1171             if !self.nfa.is_anchored_start && self.nfa.is_anchored_end {
1172                 return DfaAnchoredReverse;
1173             }
1174             // If there's a longish suffix literal, then it might be faster
1175             // to look for that first.
1176             if self.should_suffix_scan() {
1177                 return DfaSuffix;
1178             }
1179             // Fall back to your garden variety forward searching lazy DFA.
1180             return Dfa;
1181         }
1182         // We're so totally hosed.
1183         Nfa(MatchNfaType::Auto)
1184     }
1185 
1186     /// Returns true if the program is amenable to suffix scanning.
1187     ///
1188     /// When this is true, as a heuristic, we assume it is OK to quickly scan
1189     /// for suffix literals and then do a *reverse* DFA match from any matches
1190     /// produced by the literal scan. (And then followed by a forward DFA
1191     /// search, since the previously found suffix literal maybe not actually be
1192     /// the end of a match.)
1193     ///
1194     /// This is a bit of a specialized optimization, but can result in pretty
1195     /// big performance wins if 1) there are no prefix literals and 2) the
1196     /// suffix literals are pretty rare in the text. (1) is obviously easy to
1197     /// account for but (2) is harder. As a proxy, we assume that longer
1198     /// strings are generally rarer, so we only enable this optimization when
1199     /// we have a meaty suffix.
should_suffix_scan(&self) -> bool1200     fn should_suffix_scan(&self) -> bool {
1201         if self.suffixes.is_empty() {
1202             return false;
1203         }
1204         let lcs_len = self.suffixes.lcs().char_len();
1205         lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1206     }
1207 }
1208 
1209 #[derive(Clone, Copy, Debug)]
1210 enum MatchType {
1211     /// A single or multiple literal search. This is only used when the regex
1212     /// can be decomposed into unambiguous literal search.
1213     Literal(MatchLiteralType),
1214     /// A normal DFA search.
1215     Dfa,
1216     /// A reverse DFA search starting from the end of a haystack.
1217     DfaAnchoredReverse,
1218     /// A reverse DFA search with suffix literal scanning.
1219     DfaSuffix,
1220     /// Use the DFA on two or more regular expressions.
1221     DfaMany,
1222     /// An NFA variant.
1223     Nfa(MatchNfaType),
1224     /// No match is ever possible, so don't ever try to search.
1225     Nothing,
1226 }
1227 
1228 #[derive(Clone, Copy, Debug)]
1229 enum MatchLiteralType {
1230     /// Match literals anywhere in text.
1231     Unanchored,
1232     /// Match literals only at the start of text.
1233     AnchoredStart,
1234     /// Match literals only at the end of text.
1235     AnchoredEnd,
1236 }
1237 
1238 #[derive(Clone, Copy, Debug)]
1239 enum MatchNfaType {
1240     /// Choose between Backtrack and PikeVM.
1241     Auto,
1242     /// NFA bounded backtracking.
1243     ///
1244     /// (This is only set by tests, since it never makes sense to always want
1245     /// backtracking.)
1246     Backtrack,
1247     /// The Pike VM.
1248     ///
1249     /// (This is only set by tests, since it never makes sense to always want
1250     /// the Pike VM.)
1251     PikeVM,
1252 }
1253 
1254 /// ProgramCache maintains reusable allocations for each matching engine
1255 /// available to a particular program.
1256 pub type ProgramCache = RefCell<ProgramCacheInner>;
1257 
1258 #[derive(Clone, Debug)]
1259 pub struct ProgramCacheInner {
1260     pub pikevm: pikevm::Cache,
1261     pub backtrack: backtrack::Cache,
1262     pub dfa: dfa::Cache,
1263     pub dfa_reverse: dfa::Cache,
1264 }
1265 
1266 impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1267     fn new(ro: &ExecReadOnly) -> Self {
1268         ProgramCacheInner {
1269             pikevm: pikevm::Cache::new(&ro.nfa),
1270             backtrack: backtrack::Cache::new(&ro.nfa),
1271             dfa: dfa::Cache::new(&ro.dfa),
1272             dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1273         }
1274     }
1275 }
1276