1 use std::cell::RefCell;
2 use std::collections::HashMap;
3 use std::sync::Arc;
4 
5 #[cfg(feature = "perf-literal")]
6 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
7 use syntax::hir::literal::Literals;
8 use syntax::hir::Hir;
9 use syntax::ParserBuilder;
10 
11 use backtrack;
12 use cache::{Cached, CachedGuard};
13 use compile::Compiler;
14 #[cfg(feature = "perf-dfa")]
15 use dfa;
16 use error::Error;
17 use input::{ByteInput, CharInput};
18 use literal::LiteralSearcher;
19 use pikevm;
20 use prog::Program;
21 use re_builder::RegexOptions;
22 use re_bytes;
23 use re_set;
24 use re_trait::{Locations, RegularExpression, Slot};
25 use re_unicode;
26 use utf8::next_utf8;
27 
28 /// `Exec` manages the execution of a regular expression.
29 ///
30 /// In particular, this manages the various compiled forms of a single regular
31 /// expression and the choice of which matching engine to use to execute a
32 /// regular expression.
33 #[derive(Debug)]
34 pub struct Exec {
35     /// All read only state.
36     ro: Arc<ExecReadOnly>,
37     /// Caches for the various matching engines.
38     cache: Cached<ProgramCache>,
39 }
40 
41 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
42 /// means it is no longer Sync, but we can now avoid the overhead of
43 /// synchronization to fetch the cache.
44 #[derive(Debug)]
45 pub struct ExecNoSync<'c> {
46     /// All read only state.
47     ro: &'c Arc<ExecReadOnly>,
48     /// Caches for the various matching engines.
49     cache: CachedGuard<'c, ProgramCache>,
50 }
51 
52 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
53 #[derive(Debug)]
54 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
55 
56 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
57 /// state is determined at compile time and never changes during search.
58 #[derive(Debug)]
59 struct ExecReadOnly {
60     /// The original regular expressions given by the caller to compile.
61     res: Vec<String>,
62     /// A compiled program that is used in the NFA simulation and backtracking.
63     /// It can be byte-based or Unicode codepoint based.
64     ///
65     /// N.B. It is not possibly to make this byte-based from the public API.
66     /// It is only used for testing byte based programs in the NFA simulations.
67     nfa: Program,
68     /// A compiled byte based program for DFA execution. This is only used
69     /// if a DFA can be executed. (Currently, only word boundary assertions are
70     /// not supported.) Note that this program contains an embedded `.*?`
71     /// preceding the first capture group, unless the regex is anchored at the
72     /// beginning.
73     dfa: Program,
74     /// The same as above, except the program is reversed (and there is no
75     /// preceding `.*?`). This is used by the DFA to find the starting location
76     /// of matches.
77     dfa_reverse: Program,
78     /// A set of suffix literals extracted from the regex.
79     ///
80     /// Prefix literals are stored on the `Program`, since they are used inside
81     /// the matching engines.
82     suffixes: LiteralSearcher,
83     /// An Aho-Corasick automaton with leftmost-first match semantics.
84     ///
85     /// This is only set when the entire regex is a simple unanchored
86     /// alternation of literals. We could probably use it more circumstances,
87     /// but this is already hacky enough in this architecture.
88     ///
89     /// N.B. We use u32 as a state ID representation under the assumption that
90     /// if we were to exhaust the ID space, we probably would have long
91     /// surpassed the compilation size limit.
92     #[cfg(feature = "perf-literal")]
93     ac: Option<AhoCorasick<u32>>,
94     /// match_type encodes as much upfront knowledge about how we're going to
95     /// execute a search as possible.
96     match_type: MatchType,
97 }
98 
99 /// Facilitates the construction of an executor by exposing various knobs
100 /// to control how a regex is executed and what kinds of resources it's
101 /// permitted to use.
102 // `ExecBuilder` is only public via the `internal` module, so avoid deriving
103 // `Debug`.
104 #[allow(missing_debug_implementations)]
105 pub struct ExecBuilder {
106     options: RegexOptions,
107     match_type: Option<MatchType>,
108     bytes: bool,
109     only_utf8: bool,
110 }
111 
112 /// Parsed represents a set of parsed regular expressions and their detected
113 /// literals.
114 struct Parsed {
115     exprs: Vec<Hir>,
116     prefixes: Literals,
117     suffixes: Literals,
118     bytes: bool,
119 }
120 
121 impl ExecBuilder {
122     /// Create a regex execution builder.
123     ///
124     /// This uses default settings for everything except the regex itself,
125     /// which must be provided. Further knobs can be set by calling methods,
126     /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self127     pub fn new(re: &str) -> Self {
128         Self::new_many(&[re])
129     }
130 
131     /// Like new, but compiles the union of the given regular expressions.
132     ///
133     /// Note that when compiling 2 or more regular expressions, capture groups
134     /// are completely unsupported. (This means both `find` and `captures`
135     /// wont work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item = S>,136     pub fn new_many<I, S>(res: I) -> Self
137     where
138         S: AsRef<str>,
139         I: IntoIterator<Item = S>,
140     {
141         let mut opts = RegexOptions::default();
142         opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
143         Self::new_options(opts)
144     }
145 
146     /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self147     pub fn new_options(opts: RegexOptions) -> Self {
148         ExecBuilder {
149             options: opts,
150             match_type: None,
151             bytes: false,
152             only_utf8: true,
153         }
154     }
155 
156     /// Set the matching engine to be automatically determined.
157     ///
158     /// This is the default state and will apply whatever optimizations are
159     /// possible, such as running a DFA.
160     ///
161     /// This overrides whatever was previously set via the `nfa` or
162     /// `bounded_backtracking` methods.
automatic(mut self) -> Self163     pub fn automatic(mut self) -> Self {
164         self.match_type = None;
165         self
166     }
167 
168     /// Sets the matching engine to use the NFA algorithm no matter what
169     /// optimizations are possible.
170     ///
171     /// This overrides whatever was previously set via the `automatic` or
172     /// `bounded_backtracking` methods.
nfa(mut self) -> Self173     pub fn nfa(mut self) -> Self {
174         self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
175         self
176     }
177 
178     /// Sets the matching engine to use a bounded backtracking engine no
179     /// matter what optimizations are possible.
180     ///
181     /// One must use this with care, since the bounded backtracking engine
182     /// uses memory proportion to `len(regex) * len(text)`.
183     ///
184     /// This overrides whatever was previously set via the `automatic` or
185     /// `nfa` methods.
bounded_backtracking(mut self) -> Self186     pub fn bounded_backtracking(mut self) -> Self {
187         self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
188         self
189     }
190 
191     /// Compiles byte based programs for use with the NFA matching engines.
192     ///
193     /// By default, the NFA engines match on Unicode scalar values. They can
194     /// be made to use byte based programs instead. In general, the byte based
195     /// programs are slower because of a less efficient encoding of character
196     /// classes.
197     ///
198     /// Note that this does not impact DFA matching engines, which always
199     /// execute on bytes.
bytes(mut self, yes: bool) -> Self200     pub fn bytes(mut self, yes: bool) -> Self {
201         self.bytes = yes;
202         self
203     }
204 
205     /// When disabled, the program compiled may match arbitrary bytes.
206     ///
207     /// When enabled (the default), all compiled programs exclusively match
208     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self209     pub fn only_utf8(mut self, yes: bool) -> Self {
210         self.only_utf8 = yes;
211         self
212     }
213 
214     /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self215     pub fn unicode(mut self, yes: bool) -> Self {
216         self.options.unicode = yes;
217         self
218     }
219 
220     /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>221     fn parse(&self) -> Result<Parsed, Error> {
222         let mut exprs = Vec::with_capacity(self.options.pats.len());
223         let mut prefixes = Some(Literals::empty());
224         let mut suffixes = Some(Literals::empty());
225         let mut bytes = false;
226         let is_set = self.options.pats.len() > 1;
227         // If we're compiling a regex set and that set has any anchored
228         // expressions, then disable all literal optimizations.
229         for pat in &self.options.pats {
230             let mut parser = ParserBuilder::new()
231                 .octal(self.options.octal)
232                 .case_insensitive(self.options.case_insensitive)
233                 .multi_line(self.options.multi_line)
234                 .dot_matches_new_line(self.options.dot_matches_new_line)
235                 .swap_greed(self.options.swap_greed)
236                 .ignore_whitespace(self.options.ignore_whitespace)
237                 .unicode(self.options.unicode)
238                 .allow_invalid_utf8(!self.only_utf8)
239                 .nest_limit(self.options.nest_limit)
240                 .build();
241             let expr =
242                 parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
243             bytes = bytes || !expr.is_always_utf8();
244 
245             if cfg!(feature = "perf-literal") {
246                 if !expr.is_anchored_start() && expr.is_any_anchored_start() {
247                     // Partial anchors unfortunately make it hard to use
248                     // prefixes, so disable them.
249                     prefixes = None;
250                 } else if is_set && expr.is_anchored_start() {
251                     // Regex sets with anchors do not go well with literal
252                     // optimizations.
253                     prefixes = None;
254                 }
255                 prefixes = prefixes.and_then(|mut prefixes| {
256                     if !prefixes.union_prefixes(&expr) {
257                         None
258                     } else {
259                         Some(prefixes)
260                     }
261                 });
262 
263                 if !expr.is_anchored_end() && expr.is_any_anchored_end() {
264                     // Partial anchors unfortunately make it hard to use
265                     // suffixes, so disable them.
266                     suffixes = None;
267                 } else if is_set && expr.is_anchored_end() {
268                     // Regex sets with anchors do not go well with literal
269                     // optimizations.
270                     suffixes = None;
271                 }
272                 suffixes = suffixes.and_then(|mut suffixes| {
273                     if !suffixes.union_suffixes(&expr) {
274                         None
275                     } else {
276                         Some(suffixes)
277                     }
278                 });
279             }
280             exprs.push(expr);
281         }
282         Ok(Parsed {
283             exprs: exprs,
284             prefixes: prefixes.unwrap_or_else(Literals::empty),
285             suffixes: suffixes.unwrap_or_else(Literals::empty),
286             bytes: bytes,
287         })
288     }
289 
290     /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>291     pub fn build(self) -> Result<Exec, Error> {
292         // Special case when we have no patterns to compile.
293         // This can happen when compiling a regex set.
294         if self.options.pats.is_empty() {
295             let ro = Arc::new(ExecReadOnly {
296                 res: vec![],
297                 nfa: Program::new(),
298                 dfa: Program::new(),
299                 dfa_reverse: Program::new(),
300                 suffixes: LiteralSearcher::empty(),
301                 #[cfg(feature = "perf-literal")]
302                 ac: None,
303                 match_type: MatchType::Nothing,
304             });
305             return Ok(Exec { ro: ro, cache: Cached::new() });
306         }
307         let parsed = self.parse()?;
308         let mut nfa = Compiler::new()
309             .size_limit(self.options.size_limit)
310             .bytes(self.bytes || parsed.bytes)
311             .only_utf8(self.only_utf8)
312             .compile(&parsed.exprs)?;
313         let mut dfa = Compiler::new()
314             .size_limit(self.options.size_limit)
315             .dfa(true)
316             .only_utf8(self.only_utf8)
317             .compile(&parsed.exprs)?;
318         let mut dfa_reverse = Compiler::new()
319             .size_limit(self.options.size_limit)
320             .dfa(true)
321             .only_utf8(self.only_utf8)
322             .reverse(true)
323             .compile(&parsed.exprs)?;
324 
325         #[cfg(feature = "perf-literal")]
326         let ac = self.build_aho_corasick(&parsed);
327         nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
328         dfa.prefixes = nfa.prefixes.clone();
329         dfa.dfa_size_limit = self.options.dfa_size_limit;
330         dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
331 
332         let mut ro = ExecReadOnly {
333             res: self.options.pats,
334             nfa: nfa,
335             dfa: dfa,
336             dfa_reverse: dfa_reverse,
337             suffixes: LiteralSearcher::suffixes(parsed.suffixes),
338             #[cfg(feature = "perf-literal")]
339             ac: ac,
340             match_type: MatchType::Nothing,
341         };
342         ro.match_type = ro.choose_match_type(self.match_type);
343 
344         let ro = Arc::new(ro);
345         Ok(Exec { ro: ro, cache: Cached::new() })
346     }
347 
348     #[cfg(feature = "perf-literal")]
build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>>349     fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
350         if parsed.exprs.len() != 1 {
351             return None;
352         }
353         let lits = match alternation_literals(&parsed.exprs[0]) {
354             None => return None,
355             Some(lits) => lits,
356         };
357         // If we have a small number of literals, then let Teddy handle
358         // things (see literal/mod.rs).
359         if lits.len() <= 32 {
360             return None;
361         }
362         Some(
363             AhoCorasickBuilder::new()
364                 .match_kind(MatchKind::LeftmostFirst)
365                 .auto_configure(&lits)
366                 // We always want this to reduce size, regardless
367                 // of what auto-configure does.
368                 .byte_classes(true)
369                 .build_with_size::<u32, _, _>(&lits)
370                 // This should never happen because we'd long exceed the
371                 // compilation limit for regexes first.
372                 .expect("AC automaton too big"),
373         )
374     }
375 }
376 
377 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
378     type Text = str;
379 
slots_len(&self) -> usize380     fn slots_len(&self) -> usize {
381         self.0.slots_len()
382     }
383 
next_after_empty(&self, text: &str, i: usize) -> usize384     fn next_after_empty(&self, text: &str, i: usize) -> usize {
385         next_utf8(text.as_bytes(), i)
386     }
387 
388     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>389     fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
390         self.0.shortest_match_at(text.as_bytes(), start)
391     }
392 
393     #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &str, start: usize) -> bool394     fn is_match_at(&self, text: &str, start: usize) -> bool {
395         self.0.is_match_at(text.as_bytes(), start)
396     }
397 
398     #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>399     fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
400         self.0.find_at(text.as_bytes(), start)
401     }
402 
403     #[cfg_attr(feature = "perf-inline", inline(always))]
captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>404     fn captures_read_at(
405         &self,
406         locs: &mut Locations,
407         text: &str,
408         start: usize,
409     ) -> Option<(usize, usize)> {
410         self.0.captures_read_at(locs, text.as_bytes(), start)
411     }
412 }
413 
414 impl<'c> RegularExpression for ExecNoSync<'c> {
415     type Text = [u8];
416 
417     /// Returns the number of capture slots in the regular expression. (There
418     /// are two slots for every capture group, corresponding to possibly empty
419     /// start and end locations of the capture.)
slots_len(&self) -> usize420     fn slots_len(&self) -> usize {
421         self.ro.nfa.captures.len() * 2
422     }
423 
next_after_empty(&self, _text: &[u8], i: usize) -> usize424     fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
425         i + 1
426     }
427 
428     /// Returns the end of a match location, possibly occurring before the
429     /// end location of the correct leftmost-first match.
430     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>431     fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
432         if !self.is_anchor_end_match(text) {
433             return None;
434         }
435         match self.ro.match_type {
436             #[cfg(feature = "perf-literal")]
437             MatchType::Literal(ty) => {
438                 self.find_literals(ty, text, start).map(|(_, e)| e)
439             }
440             #[cfg(feature = "perf-dfa")]
441             MatchType::Dfa | MatchType::DfaMany => {
442                 match self.shortest_dfa(text, start) {
443                     dfa::Result::Match(end) => Some(end),
444                     dfa::Result::NoMatch(_) => None,
445                     dfa::Result::Quit => self.shortest_nfa(text, start),
446                 }
447             }
448             #[cfg(feature = "perf-dfa")]
449             MatchType::DfaAnchoredReverse => {
450                 match dfa::Fsm::reverse(
451                     &self.ro.dfa_reverse,
452                     self.cache.value(),
453                     true,
454                     &text[start..],
455                     text.len(),
456                 ) {
457                     dfa::Result::Match(_) => Some(text.len()),
458                     dfa::Result::NoMatch(_) => None,
459                     dfa::Result::Quit => self.shortest_nfa(text, start),
460                 }
461             }
462             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
463             MatchType::DfaSuffix => {
464                 match self.shortest_dfa_reverse_suffix(text, start) {
465                     dfa::Result::Match(e) => Some(e),
466                     dfa::Result::NoMatch(_) => None,
467                     dfa::Result::Quit => self.shortest_nfa(text, start),
468                 }
469             }
470             MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
471             MatchType::Nothing => None,
472         }
473     }
474 
475     /// Returns true if and only if the regex matches text.
476     ///
477     /// For single regular expressions, this is equivalent to calling
478     /// shortest_match(...).is_some().
479     #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &[u8], start: usize) -> bool480     fn is_match_at(&self, text: &[u8], start: usize) -> bool {
481         if !self.is_anchor_end_match(text) {
482             return false;
483         }
484         // We need to do this dance because shortest_match relies on the NFA
485         // filling in captures[1], but a RegexSet has no captures. In other
486         // words, a RegexSet can't (currently) use shortest_match. ---AG
487         match self.ro.match_type {
488             #[cfg(feature = "perf-literal")]
489             MatchType::Literal(ty) => {
490                 self.find_literals(ty, text, start).is_some()
491             }
492             #[cfg(feature = "perf-dfa")]
493             MatchType::Dfa | MatchType::DfaMany => {
494                 match self.shortest_dfa(text, start) {
495                     dfa::Result::Match(_) => true,
496                     dfa::Result::NoMatch(_) => false,
497                     dfa::Result::Quit => self.match_nfa(text, start),
498                 }
499             }
500             #[cfg(feature = "perf-dfa")]
501             MatchType::DfaAnchoredReverse => {
502                 match dfa::Fsm::reverse(
503                     &self.ro.dfa_reverse,
504                     self.cache.value(),
505                     true,
506                     &text[start..],
507                     text.len(),
508                 ) {
509                     dfa::Result::Match(_) => true,
510                     dfa::Result::NoMatch(_) => false,
511                     dfa::Result::Quit => self.match_nfa(text, start),
512                 }
513             }
514             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
515             MatchType::DfaSuffix => {
516                 match self.shortest_dfa_reverse_suffix(text, start) {
517                     dfa::Result::Match(_) => true,
518                     dfa::Result::NoMatch(_) => false,
519                     dfa::Result::Quit => self.match_nfa(text, start),
520                 }
521             }
522             MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
523             MatchType::Nothing => false,
524         }
525     }
526 
527     /// Finds the start and end location of the leftmost-first match, starting
528     /// at the given location.
529     #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>530     fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
531         if !self.is_anchor_end_match(text) {
532             return None;
533         }
534         match self.ro.match_type {
535             #[cfg(feature = "perf-literal")]
536             MatchType::Literal(ty) => self.find_literals(ty, text, start),
537             #[cfg(feature = "perf-dfa")]
538             MatchType::Dfa => match self.find_dfa_forward(text, start) {
539                 dfa::Result::Match((s, e)) => Some((s, e)),
540                 dfa::Result::NoMatch(_) => None,
541                 dfa::Result::Quit => {
542                     self.find_nfa(MatchNfaType::Auto, text, start)
543                 }
544             },
545             #[cfg(feature = "perf-dfa")]
546             MatchType::DfaAnchoredReverse => {
547                 match self.find_dfa_anchored_reverse(text, start) {
548                     dfa::Result::Match((s, e)) => Some((s, e)),
549                     dfa::Result::NoMatch(_) => None,
550                     dfa::Result::Quit => {
551                         self.find_nfa(MatchNfaType::Auto, text, start)
552                     }
553                 }
554             }
555             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
556             MatchType::DfaSuffix => {
557                 match self.find_dfa_reverse_suffix(text, start) {
558                     dfa::Result::Match((s, e)) => Some((s, e)),
559                     dfa::Result::NoMatch(_) => None,
560                     dfa::Result::Quit => {
561                         self.find_nfa(MatchNfaType::Auto, text, start)
562                     }
563                 }
564             }
565             MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
566             MatchType::Nothing => None,
567             #[cfg(feature = "perf-dfa")]
568             MatchType::DfaMany => {
569                 unreachable!("BUG: RegexSet cannot be used with find")
570             }
571         }
572     }
573 
574     /// Finds the start and end location of the leftmost-first match and also
575     /// fills in all matching capture groups.
576     ///
577     /// The number of capture slots given should be equal to the total number
578     /// of capture slots in the compiled program.
579     ///
580     /// Note that the first two slots always correspond to the start and end
581     /// locations of the overall match.
captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>582     fn captures_read_at(
583         &self,
584         locs: &mut Locations,
585         text: &[u8],
586         start: usize,
587     ) -> Option<(usize, usize)> {
588         let slots = locs.as_slots();
589         for slot in slots.iter_mut() {
590             *slot = None;
591         }
592         // If the caller unnecessarily uses this, then we try to save them
593         // from themselves.
594         match slots.len() {
595             0 => return self.find_at(text, start),
596             2 => {
597                 return self.find_at(text, start).map(|(s, e)| {
598                     slots[0] = Some(s);
599                     slots[1] = Some(e);
600                     (s, e)
601                 });
602             }
603             _ => {} // fallthrough
604         }
605         if !self.is_anchor_end_match(text) {
606             return None;
607         }
608         match self.ro.match_type {
609             #[cfg(feature = "perf-literal")]
610             MatchType::Literal(ty) => {
611                 self.find_literals(ty, text, start).and_then(|(s, e)| {
612                     self.captures_nfa_type(
613                         MatchNfaType::Auto,
614                         slots,
615                         text,
616                         s,
617                         e,
618                     )
619                 })
620             }
621             #[cfg(feature = "perf-dfa")]
622             MatchType::Dfa => {
623                 if self.ro.nfa.is_anchored_start {
624                     self.captures_nfa(slots, text, start)
625                 } else {
626                     match self.find_dfa_forward(text, start) {
627                         dfa::Result::Match((s, e)) => self.captures_nfa_type(
628                             MatchNfaType::Auto,
629                             slots,
630                             text,
631                             s,
632                             e,
633                         ),
634                         dfa::Result::NoMatch(_) => None,
635                         dfa::Result::Quit => {
636                             self.captures_nfa(slots, text, start)
637                         }
638                     }
639                 }
640             }
641             #[cfg(feature = "perf-dfa")]
642             MatchType::DfaAnchoredReverse => {
643                 match self.find_dfa_anchored_reverse(text, start) {
644                     dfa::Result::Match((s, e)) => self.captures_nfa_type(
645                         MatchNfaType::Auto,
646                         slots,
647                         text,
648                         s,
649                         e,
650                     ),
651                     dfa::Result::NoMatch(_) => None,
652                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
653                 }
654             }
655             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
656             MatchType::DfaSuffix => {
657                 match self.find_dfa_reverse_suffix(text, start) {
658                     dfa::Result::Match((s, e)) => self.captures_nfa_type(
659                         MatchNfaType::Auto,
660                         slots,
661                         text,
662                         s,
663                         e,
664                     ),
665                     dfa::Result::NoMatch(_) => None,
666                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
667                 }
668             }
669             MatchType::Nfa(ty) => {
670                 self.captures_nfa_type(ty, slots, text, start, text.len())
671             }
672             MatchType::Nothing => None,
673             #[cfg(feature = "perf-dfa")]
674             MatchType::DfaMany => {
675                 unreachable!("BUG: RegexSet cannot be used with captures")
676             }
677         }
678     }
679 }
680 
681 impl<'c> ExecNoSync<'c> {
682     /// Finds the leftmost-first match using only literal search.
683     #[cfg(feature = "perf-literal")]
684     #[cfg_attr(feature = "perf-inline", inline(always))]
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>685     fn find_literals(
686         &self,
687         ty: MatchLiteralType,
688         text: &[u8],
689         start: usize,
690     ) -> Option<(usize, usize)> {
691         use self::MatchLiteralType::*;
692         match ty {
693             Unanchored => {
694                 let lits = &self.ro.nfa.prefixes;
695                 lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
696             }
697             AnchoredStart => {
698                 let lits = &self.ro.nfa.prefixes;
699                 if start == 0 || !self.ro.nfa.is_anchored_start {
700                     lits.find_start(&text[start..])
701                         .map(|(s, e)| (start + s, start + e))
702                 } else {
703                     None
704                 }
705             }
706             AnchoredEnd => {
707                 let lits = &self.ro.suffixes;
708                 lits.find_end(&text[start..])
709                     .map(|(s, e)| (start + s, start + e))
710             }
711             AhoCorasick => self
712                 .ro
713                 .ac
714                 .as_ref()
715                 .unwrap()
716                 .find(&text[start..])
717                 .map(|m| (start + m.start(), start + m.end())),
718         }
719     }
720 
721     /// Finds the leftmost-first match (start and end) using only the DFA.
722     ///
723     /// If the result returned indicates that the DFA quit, then another
724     /// matching engine should be used.
725     #[cfg(feature = "perf-dfa")]
726     #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>727     fn find_dfa_forward(
728         &self,
729         text: &[u8],
730         start: usize,
731     ) -> dfa::Result<(usize, usize)> {
732         use dfa::Result::*;
733         let end = match dfa::Fsm::forward(
734             &self.ro.dfa,
735             self.cache.value(),
736             false,
737             text,
738             start,
739         ) {
740             NoMatch(i) => return NoMatch(i),
741             Quit => return Quit,
742             Match(end) if start == end => return Match((start, start)),
743             Match(end) => end,
744         };
745         // Now run the DFA in reverse to find the start of the match.
746         match dfa::Fsm::reverse(
747             &self.ro.dfa_reverse,
748             self.cache.value(),
749             false,
750             &text[start..],
751             end - start,
752         ) {
753             Match(s) => Match((start + s, end)),
754             NoMatch(i) => NoMatch(i),
755             Quit => Quit,
756         }
757     }
758 
759     /// Finds the leftmost-first match (start and end) using only the DFA,
760     /// but assumes the regex is anchored at the end and therefore starts at
761     /// the end of the regex and matches in reverse.
762     ///
763     /// If the result returned indicates that the DFA quit, then another
764     /// matching engine should be used.
765     #[cfg(feature = "perf-dfa")]
766     #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>767     fn find_dfa_anchored_reverse(
768         &self,
769         text: &[u8],
770         start: usize,
771     ) -> dfa::Result<(usize, usize)> {
772         use dfa::Result::*;
773         match dfa::Fsm::reverse(
774             &self.ro.dfa_reverse,
775             self.cache.value(),
776             false,
777             &text[start..],
778             text.len() - start,
779         ) {
780             Match(s) => Match((start + s, text.len())),
781             NoMatch(i) => NoMatch(i),
782             Quit => Quit,
783         }
784     }
785 
786     /// Finds the end of the shortest match using only the DFA.
787     #[cfg(feature = "perf-dfa")]
788     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>789     fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
790         dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
791     }
792 
793     /// Finds the end of the shortest match using only the DFA by scanning for
794     /// suffix literals.
795     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
796     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>797     fn shortest_dfa_reverse_suffix(
798         &self,
799         text: &[u8],
800         start: usize,
801     ) -> dfa::Result<usize> {
802         match self.exec_dfa_reverse_suffix(text, start) {
803             None => self.shortest_dfa(text, start),
804             Some(r) => r.map(|(_, end)| end),
805         }
806     }
807 
808     /// Finds the end of the shortest match using only the DFA by scanning for
809     /// suffix literals. It also reports the start of the match.
810     ///
811     /// Note that if None is returned, then the optimization gave up to avoid
812     /// worst case quadratic behavior. A forward scanning DFA should be tried
813     /// next.
814     ///
815     /// If a match is returned and the full leftmost-first match is desired,
816     /// then a forward scan starting from the beginning of the match must be
817     /// done.
818     ///
819     /// If the result returned indicates that the DFA quit, then another
820     /// matching engine should be used.
821     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
822     #[cfg_attr(feature = "perf-inline", inline(always))]
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>823     fn exec_dfa_reverse_suffix(
824         &self,
825         text: &[u8],
826         original_start: usize,
827     ) -> Option<dfa::Result<(usize, usize)>> {
828         use dfa::Result::*;
829 
830         let lcs = self.ro.suffixes.lcs();
831         debug_assert!(lcs.len() >= 1);
832         let mut start = original_start;
833         let mut end = start;
834         let mut last_literal = start;
835         while end <= text.len() {
836             last_literal += match lcs.find(&text[last_literal..]) {
837                 None => return Some(NoMatch(text.len())),
838                 Some(i) => i,
839             };
840             end = last_literal + lcs.len();
841             match dfa::Fsm::reverse(
842                 &self.ro.dfa_reverse,
843                 self.cache.value(),
844                 false,
845                 &text[start..end],
846                 end - start,
847             ) {
848                 Match(0) | NoMatch(0) => return None,
849                 Match(i) => return Some(Match((start + i, end))),
850                 NoMatch(i) => {
851                     start += i;
852                     last_literal += 1;
853                     continue;
854                 }
855                 Quit => return Some(Quit),
856             };
857         }
858         Some(NoMatch(text.len()))
859     }
860 
861     /// Finds the leftmost-first match (start and end) using only the DFA
862     /// by scanning for suffix literals.
863     ///
864     /// If the result returned indicates that the DFA quit, then another
865     /// matching engine should be used.
866     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
867     #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>868     fn find_dfa_reverse_suffix(
869         &self,
870         text: &[u8],
871         start: usize,
872     ) -> dfa::Result<(usize, usize)> {
873         use dfa::Result::*;
874 
875         let match_start = match self.exec_dfa_reverse_suffix(text, start) {
876             None => return self.find_dfa_forward(text, start),
877             Some(Match((start, _))) => start,
878             Some(r) => return r,
879         };
880         // At this point, we've found a match. The only way to quit now
881         // without a match is if the DFA gives up (seems unlikely).
882         //
883         // Now run the DFA forwards to find the proper end of the match.
884         // (The suffix literal match can only indicate the earliest
885         // possible end location, which may appear before the end of the
886         // leftmost-first match.)
887         match dfa::Fsm::forward(
888             &self.ro.dfa,
889             self.cache.value(),
890             false,
891             text,
892             match_start,
893         ) {
894             NoMatch(_) => panic!("BUG: reverse match implies forward match"),
895             Quit => Quit,
896             Match(e) => Match((match_start, e)),
897         }
898     }
899 
900     /// Executes the NFA engine to return whether there is a match or not.
901     ///
902     /// Ideally, we could use shortest_nfa(...).is_some() and get the same
903     /// performance characteristics, but regex sets don't have captures, which
904     /// shortest_nfa depends on.
905     #[cfg(feature = "perf-dfa")]
match_nfa(&self, text: &[u8], start: usize) -> bool906     fn match_nfa(&self, text: &[u8], start: usize) -> bool {
907         self.match_nfa_type(MatchNfaType::Auto, text, start)
908     }
909 
910     /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool911     fn match_nfa_type(
912         &self,
913         ty: MatchNfaType,
914         text: &[u8],
915         start: usize,
916     ) -> bool {
917         self.exec_nfa(
918             ty,
919             &mut [false],
920             &mut [],
921             true,
922             false,
923             text,
924             start,
925             text.len(),
926         )
927     }
928 
929     /// Finds the shortest match using an NFA.
930     #[cfg(feature = "perf-dfa")]
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>931     fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
932         self.shortest_nfa_type(MatchNfaType::Auto, text, start)
933     }
934 
935     /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>936     fn shortest_nfa_type(
937         &self,
938         ty: MatchNfaType,
939         text: &[u8],
940         start: usize,
941     ) -> Option<usize> {
942         let mut slots = [None, None];
943         if self.exec_nfa(
944             ty,
945             &mut [false],
946             &mut slots,
947             true,
948             true,
949             text,
950             start,
951             text.len(),
952         ) {
953             slots[1]
954         } else {
955             None
956         }
957     }
958 
959     /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>960     fn find_nfa(
961         &self,
962         ty: MatchNfaType,
963         text: &[u8],
964         start: usize,
965     ) -> Option<(usize, usize)> {
966         let mut slots = [None, None];
967         if self.exec_nfa(
968             ty,
969             &mut [false],
970             &mut slots,
971             false,
972             false,
973             text,
974             start,
975             text.len(),
976         ) {
977             match (slots[0], slots[1]) {
978                 (Some(s), Some(e)) => Some((s, e)),
979                 _ => None,
980             }
981         } else {
982             None
983         }
984     }
985 
986     /// Like find_nfa, but fills in captures.
987     ///
988     /// `slots` should have length equal to `2 * nfa.captures.len()`.
989     #[cfg(feature = "perf-dfa")]
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>990     fn captures_nfa(
991         &self,
992         slots: &mut [Slot],
993         text: &[u8],
994         start: usize,
995     ) -> Option<(usize, usize)> {
996         self.captures_nfa_type(
997             MatchNfaType::Auto,
998             slots,
999             text,
1000             start,
1001             text.len(),
1002         )
1003     }
1004 
1005     /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> Option<(usize, usize)>1006     fn captures_nfa_type(
1007         &self,
1008         ty: MatchNfaType,
1009         slots: &mut [Slot],
1010         text: &[u8],
1011         start: usize,
1012         end: usize,
1013     ) -> Option<(usize, usize)> {
1014         if self.exec_nfa(
1015             ty,
1016             &mut [false],
1017             slots,
1018             false,
1019             false,
1020             text,
1021             start,
1022             end,
1023         ) {
1024             match (slots[0], slots[1]) {
1025                 (Some(s), Some(e)) => Some((s, e)),
1026                 _ => None,
1027             }
1028         } else {
1029             None
1030         }
1031     }
1032 
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, quit_after_match_with_pos: bool, text: &[u8], start: usize, end: usize, ) -> bool1033     fn exec_nfa(
1034         &self,
1035         mut ty: MatchNfaType,
1036         matches: &mut [bool],
1037         slots: &mut [Slot],
1038         quit_after_match: bool,
1039         quit_after_match_with_pos: bool,
1040         text: &[u8],
1041         start: usize,
1042         end: usize,
1043     ) -> bool {
1044         use self::MatchNfaType::*;
1045         if let Auto = ty {
1046             if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1047                 ty = Backtrack;
1048             } else {
1049                 ty = PikeVM;
1050             }
1051         }
1052         // The backtracker can't return the shortest match position as it is
1053         // implemented today. So if someone calls `shortest_match` and we need
1054         // to run an NFA, then use the PikeVM.
1055         if quit_after_match_with_pos || ty == PikeVM {
1056             self.exec_pikevm(
1057                 matches,
1058                 slots,
1059                 quit_after_match,
1060                 text,
1061                 start,
1062                 end,
1063             )
1064         } else {
1065             self.exec_backtrack(matches, slots, text, start, end)
1066         }
1067     }
1068 
1069     /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool1070     fn exec_pikevm(
1071         &self,
1072         matches: &mut [bool],
1073         slots: &mut [Slot],
1074         quit_after_match: bool,
1075         text: &[u8],
1076         start: usize,
1077         end: usize,
1078     ) -> bool {
1079         if self.ro.nfa.uses_bytes() {
1080             pikevm::Fsm::exec(
1081                 &self.ro.nfa,
1082                 self.cache.value(),
1083                 matches,
1084                 slots,
1085                 quit_after_match,
1086                 ByteInput::new(text, self.ro.nfa.only_utf8),
1087                 start,
1088                 end,
1089             )
1090         } else {
1091             pikevm::Fsm::exec(
1092                 &self.ro.nfa,
1093                 self.cache.value(),
1094                 matches,
1095                 slots,
1096                 quit_after_match,
1097                 CharInput::new(text),
1098                 start,
1099                 end,
1100             )
1101         }
1102     }
1103 
1104     /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> bool1105     fn exec_backtrack(
1106         &self,
1107         matches: &mut [bool],
1108         slots: &mut [Slot],
1109         text: &[u8],
1110         start: usize,
1111         end: usize,
1112     ) -> bool {
1113         if self.ro.nfa.uses_bytes() {
1114             backtrack::Bounded::exec(
1115                 &self.ro.nfa,
1116                 self.cache.value(),
1117                 matches,
1118                 slots,
1119                 ByteInput::new(text, self.ro.nfa.only_utf8),
1120                 start,
1121                 end,
1122             )
1123         } else {
1124             backtrack::Bounded::exec(
1125                 &self.ro.nfa,
1126                 self.cache.value(),
1127                 matches,
1128                 slots,
1129                 CharInput::new(text),
1130                 start,
1131                 end,
1132             )
1133         }
1134     }
1135 
1136     /// Finds which regular expressions match the given text.
1137     ///
1138     /// `matches` should have length equal to the number of regexes being
1139     /// searched.
1140     ///
1141     /// This is only useful when one wants to know which regexes in a set
1142     /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1143     pub fn many_matches_at(
1144         &self,
1145         matches: &mut [bool],
1146         text: &[u8],
1147         start: usize,
1148     ) -> bool {
1149         use self::MatchType::*;
1150         if !self.is_anchor_end_match(text) {
1151             return false;
1152         }
1153         match self.ro.match_type {
1154             #[cfg(feature = "perf-literal")]
1155             Literal(ty) => {
1156                 debug_assert_eq!(matches.len(), 1);
1157                 matches[0] = self.find_literals(ty, text, start).is_some();
1158                 matches[0]
1159             }
1160             #[cfg(feature = "perf-dfa")]
1161             Dfa | DfaAnchoredReverse | DfaMany => {
1162                 match dfa::Fsm::forward_many(
1163                     &self.ro.dfa,
1164                     self.cache.value(),
1165                     matches,
1166                     text,
1167                     start,
1168                 ) {
1169                     dfa::Result::Match(_) => true,
1170                     dfa::Result::NoMatch(_) => false,
1171                     dfa::Result::Quit => self.exec_nfa(
1172                         MatchNfaType::Auto,
1173                         matches,
1174                         &mut [],
1175                         false,
1176                         false,
1177                         text,
1178                         start,
1179                         text.len(),
1180                     ),
1181                 }
1182             }
1183             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1184             DfaSuffix => {
1185                 match dfa::Fsm::forward_many(
1186                     &self.ro.dfa,
1187                     self.cache.value(),
1188                     matches,
1189                     text,
1190                     start,
1191                 ) {
1192                     dfa::Result::Match(_) => true,
1193                     dfa::Result::NoMatch(_) => false,
1194                     dfa::Result::Quit => self.exec_nfa(
1195                         MatchNfaType::Auto,
1196                         matches,
1197                         &mut [],
1198                         false,
1199                         false,
1200                         text,
1201                         start,
1202                         text.len(),
1203                     ),
1204                 }
1205             }
1206             Nfa(ty) => self.exec_nfa(
1207                 ty,
1208                 matches,
1209                 &mut [],
1210                 false,
1211                 false,
1212                 text,
1213                 start,
1214                 text.len(),
1215             ),
1216             Nothing => false,
1217         }
1218     }
1219 
1220     #[cfg_attr(feature = "perf-inline", inline(always))]
is_anchor_end_match(&self, text: &[u8]) -> bool1221     fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1222         #[cfg(not(feature = "perf-literal"))]
1223         fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1224             true
1225         }
1226 
1227         #[cfg(feature = "perf-literal")]
1228         fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1229             // Only do this check if the haystack is big (>1MB).
1230             if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1231                 let lcs = ro.suffixes.lcs();
1232                 if lcs.len() >= 1 && !lcs.is_suffix(text) {
1233                     return false;
1234                 }
1235             }
1236             true
1237         }
1238 
1239         imp(&self.ro, text)
1240     }
1241 
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1242     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1243         &self.ro.nfa.capture_name_idx
1244     }
1245 }
1246 
1247 impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1248     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1249         self.0.capture_name_idx()
1250     }
1251 }
1252 
1253 impl Exec {
1254     /// Get a searcher that isn't Sync.
1255     #[cfg_attr(feature = "perf-inline", inline(always))]
searcher(&self) -> ExecNoSync1256     pub fn searcher(&self) -> ExecNoSync {
1257         let create = || RefCell::new(ProgramCacheInner::new(&self.ro));
1258         ExecNoSync {
1259             ro: &self.ro, // a clone is too expensive here! (and not needed)
1260             cache: self.cache.get_or(create),
1261         }
1262     }
1263 
1264     /// Get a searcher that isn't Sync and can match on &str.
1265     #[cfg_attr(feature = "perf-inline", inline(always))]
searcher_str(&self) -> ExecNoSyncStr1266     pub fn searcher_str(&self) -> ExecNoSyncStr {
1267         ExecNoSyncStr(self.searcher())
1268     }
1269 
1270     /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1271     pub fn into_regex(self) -> re_unicode::Regex {
1272         re_unicode::Regex::from(self)
1273     }
1274 
1275     /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1276     pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1277         re_set::unicode::RegexSet::from(self)
1278     }
1279 
1280     /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1281     pub fn into_byte_regex(self) -> re_bytes::Regex {
1282         re_bytes::Regex::from(self)
1283     }
1284 
1285     /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1286     pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1287         re_set::bytes::RegexSet::from(self)
1288     }
1289 
1290     /// The original regular expressions given by the caller that were
1291     /// compiled.
regex_strings(&self) -> &[String]1292     pub fn regex_strings(&self) -> &[String] {
1293         &self.ro.res
1294     }
1295 
1296     /// Return a slice of capture names.
1297     ///
1298     /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1299     pub fn capture_names(&self) -> &[Option<String>] {
1300         &self.ro.nfa.captures
1301     }
1302 
1303     /// Return a reference to named groups mapping (from group name to
1304     /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1305     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1306         &self.ro.nfa.capture_name_idx
1307     }
1308 }
1309 
1310 impl Clone for Exec {
clone(&self) -> Exec1311     fn clone(&self) -> Exec {
1312         Exec { ro: self.ro.clone(), cache: Cached::new() }
1313     }
1314 }
1315 
1316 impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1317     fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1318         if let Some(MatchType::Nfa(_)) = hint {
1319             return hint.unwrap();
1320         }
1321         // If the NFA is empty, then we'll never match anything.
1322         if self.nfa.insts.is_empty() {
1323             return MatchType::Nothing;
1324         }
1325         if let Some(literalty) = self.choose_literal_match_type() {
1326             return literalty;
1327         }
1328         if let Some(dfaty) = self.choose_dfa_match_type() {
1329             return dfaty;
1330         }
1331         // We're so totally hosed.
1332         MatchType::Nfa(MatchNfaType::Auto)
1333     }
1334 
1335     /// If a plain literal scan can be used, then a corresponding literal
1336     /// search type is returned.
choose_literal_match_type(&self) -> Option<MatchType>1337     fn choose_literal_match_type(&self) -> Option<MatchType> {
1338         #[cfg(not(feature = "perf-literal"))]
1339         fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1340             None
1341         }
1342 
1343         #[cfg(feature = "perf-literal")]
1344         fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1345             // If our set of prefixes is complete, then we can use it to find
1346             // a match in lieu of a regex engine. This doesn't quite work well
1347             // in the presence of multiple regexes, so only do it when there's
1348             // one.
1349             //
1350             // TODO(burntsushi): Also, don't try to match literals if the regex
1351             // is partially anchored. We could technically do it, but we'd need
1352             // to create two sets of literals: all of them and then the subset
1353             // that aren't anchored. We would then only search for all of them
1354             // when at the beginning of the input and use the subset in all
1355             // other cases.
1356             if ro.res.len() != 1 {
1357                 return None;
1358             }
1359             if ro.ac.is_some() {
1360                 return Some(MatchType::Literal(
1361                     MatchLiteralType::AhoCorasick,
1362                 ));
1363             }
1364             if ro.nfa.prefixes.complete() {
1365                 return if ro.nfa.is_anchored_start {
1366                     Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1367                 } else {
1368                     Some(MatchType::Literal(MatchLiteralType::Unanchored))
1369                 };
1370             }
1371             if ro.suffixes.complete() {
1372                 return if ro.nfa.is_anchored_end {
1373                     Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1374                 } else {
1375                     // This case shouldn't happen. When the regex isn't
1376                     // anchored, then complete prefixes should imply complete
1377                     // suffixes.
1378                     Some(MatchType::Literal(MatchLiteralType::Unanchored))
1379                 };
1380             }
1381             None
1382         }
1383 
1384         imp(self)
1385     }
1386 
1387     /// If a DFA scan can be used, then choose the appropriate DFA strategy.
choose_dfa_match_type(&self) -> Option<MatchType>1388     fn choose_dfa_match_type(&self) -> Option<MatchType> {
1389         #[cfg(not(feature = "perf-dfa"))]
1390         fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1391             None
1392         }
1393 
1394         #[cfg(feature = "perf-dfa")]
1395         fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1396             if !dfa::can_exec(&ro.dfa) {
1397                 return None;
1398             }
1399             // Regex sets require a slightly specialized path.
1400             if ro.res.len() >= 2 {
1401                 return Some(MatchType::DfaMany);
1402             }
1403             // If the regex is anchored at the end but not the start, then
1404             // just match in reverse from the end of the haystack.
1405             if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1406                 return Some(MatchType::DfaAnchoredReverse);
1407             }
1408             #[cfg(feature = "perf-literal")]
1409             {
1410                 // If there's a longish suffix literal, then it might be faster
1411                 // to look for that first.
1412                 if ro.should_suffix_scan() {
1413                     return Some(MatchType::DfaSuffix);
1414                 }
1415             }
1416             // Fall back to your garden variety forward searching lazy DFA.
1417             Some(MatchType::Dfa)
1418         }
1419 
1420         imp(self)
1421     }
1422 
1423     /// Returns true if the program is amenable to suffix scanning.
1424     ///
1425     /// When this is true, as a heuristic, we assume it is OK to quickly scan
1426     /// for suffix literals and then do a *reverse* DFA match from any matches
1427     /// produced by the literal scan. (And then followed by a forward DFA
1428     /// search, since the previously found suffix literal maybe not actually be
1429     /// the end of a match.)
1430     ///
1431     /// This is a bit of a specialized optimization, but can result in pretty
1432     /// big performance wins if 1) there are no prefix literals and 2) the
1433     /// suffix literals are pretty rare in the text. (1) is obviously easy to
1434     /// account for but (2) is harder. As a proxy, we assume that longer
1435     /// strings are generally rarer, so we only enable this optimization when
1436     /// we have a meaty suffix.
1437     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
should_suffix_scan(&self) -> bool1438     fn should_suffix_scan(&self) -> bool {
1439         if self.suffixes.is_empty() {
1440             return false;
1441         }
1442         let lcs_len = self.suffixes.lcs().char_len();
1443         lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1444     }
1445 }
1446 
1447 #[derive(Clone, Copy, Debug)]
1448 enum MatchType {
1449     /// A single or multiple literal search. This is only used when the regex
1450     /// can be decomposed into a literal search.
1451     #[cfg(feature = "perf-literal")]
1452     Literal(MatchLiteralType),
1453     /// A normal DFA search.
1454     #[cfg(feature = "perf-dfa")]
1455     Dfa,
1456     /// A reverse DFA search starting from the end of a haystack.
1457     #[cfg(feature = "perf-dfa")]
1458     DfaAnchoredReverse,
1459     /// A reverse DFA search with suffix literal scanning.
1460     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1461     DfaSuffix,
1462     /// Use the DFA on two or more regular expressions.
1463     #[cfg(feature = "perf-dfa")]
1464     DfaMany,
1465     /// An NFA variant.
1466     Nfa(MatchNfaType),
1467     /// No match is ever possible, so don't ever try to search.
1468     Nothing,
1469 }
1470 
1471 #[derive(Clone, Copy, Debug)]
1472 #[cfg(feature = "perf-literal")]
1473 enum MatchLiteralType {
1474     /// Match literals anywhere in text.
1475     Unanchored,
1476     /// Match literals only at the start of text.
1477     AnchoredStart,
1478     /// Match literals only at the end of text.
1479     AnchoredEnd,
1480     /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1481     /// ExecReadOnly.
1482     AhoCorasick,
1483 }
1484 
1485 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1486 enum MatchNfaType {
1487     /// Choose between Backtrack and PikeVM.
1488     Auto,
1489     /// NFA bounded backtracking.
1490     ///
1491     /// (This is only set by tests, since it never makes sense to always want
1492     /// backtracking.)
1493     Backtrack,
1494     /// The Pike VM.
1495     ///
1496     /// (This is only set by tests, since it never makes sense to always want
1497     /// the Pike VM.)
1498     PikeVM,
1499 }
1500 
1501 /// `ProgramCache` maintains reusable allocations for each matching engine
1502 /// available to a particular program.
1503 pub type ProgramCache = RefCell<ProgramCacheInner>;
1504 
1505 #[derive(Debug)]
1506 pub struct ProgramCacheInner {
1507     pub pikevm: pikevm::Cache,
1508     pub backtrack: backtrack::Cache,
1509     #[cfg(feature = "perf-dfa")]
1510     pub dfa: dfa::Cache,
1511     #[cfg(feature = "perf-dfa")]
1512     pub dfa_reverse: dfa::Cache,
1513 }
1514 
1515 impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1516     fn new(ro: &ExecReadOnly) -> Self {
1517         ProgramCacheInner {
1518             pikevm: pikevm::Cache::new(&ro.nfa),
1519             backtrack: backtrack::Cache::new(&ro.nfa),
1520             #[cfg(feature = "perf-dfa")]
1521             dfa: dfa::Cache::new(&ro.dfa),
1522             #[cfg(feature = "perf-dfa")]
1523             dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1524         }
1525     }
1526 }
1527 
1528 /// Alternation literals checks if the given HIR is a simple alternation of
1529 /// literals, and if so, returns them. Otherwise, this returns None.
1530 #[cfg(feature = "perf-literal")]
alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>>1531 fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1532     use syntax::hir::{HirKind, Literal};
1533 
1534     // This is pretty hacky, but basically, if `is_alternation_literal` is
1535     // true, then we can make several assumptions about the structure of our
1536     // HIR. This is what justifies the `unreachable!` statements below.
1537     //
1538     // This code should be refactored once we overhaul this crate's
1539     // optimization pipeline, because this is a terribly inflexible way to go
1540     // about things.
1541 
1542     if !expr.is_alternation_literal() {
1543         return None;
1544     }
1545     let alts = match *expr.kind() {
1546         HirKind::Alternation(ref alts) => alts,
1547         _ => return None, // one literal isn't worth it
1548     };
1549 
1550     let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
1551         Literal::Unicode(c) => {
1552             let mut buf = [0; 4];
1553             dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1554         }
1555         Literal::Byte(b) => {
1556             dst.push(b);
1557         }
1558     };
1559 
1560     let mut lits = vec![];
1561     for alt in alts {
1562         let mut lit = vec![];
1563         match *alt.kind() {
1564             HirKind::Literal(ref x) => extendlit(x, &mut lit),
1565             HirKind::Concat(ref exprs) => {
1566                 for e in exprs {
1567                     match *e.kind() {
1568                         HirKind::Literal(ref x) => extendlit(x, &mut lit),
1569                         _ => unreachable!("expected literal, got {:?}", e),
1570                     }
1571                 }
1572             }
1573             _ => unreachable!("expected literal or concat, got {:?}", alt),
1574         }
1575         lits.push(lit);
1576     }
1577     Some(lits)
1578 }
1579 
1580 #[cfg(test)]
1581 mod test {
1582     #[test]
uppercut_s_backtracking_bytes_default_bytes_mismatch()1583     fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1584         use internal::ExecBuilder;
1585 
1586         let backtrack_bytes_re = ExecBuilder::new("^S")
1587             .bounded_backtracking()
1588             .only_utf8(false)
1589             .build()
1590             .map(|exec| exec.into_byte_regex())
1591             .map_err(|err| format!("{}", err))
1592             .unwrap();
1593 
1594         let default_bytes_re = ExecBuilder::new("^S")
1595             .only_utf8(false)
1596             .build()
1597             .map(|exec| exec.into_byte_regex())
1598             .map_err(|err| format!("{}", err))
1599             .unwrap();
1600 
1601         let input = vec![83, 83];
1602 
1603         let s1 = backtrack_bytes_re.split(&input);
1604         let s2 = default_bytes_re.split(&input);
1605         for (chunk1, chunk2) in s1.zip(s2) {
1606             assert_eq!(chunk1, chunk2);
1607         }
1608     }
1609 
1610     #[test]
unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1611     fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1612         use internal::ExecBuilder;
1613 
1614         let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1615             .bounded_backtracking()
1616             .bytes(true)
1617             .build()
1618             .map(|exec| exec.into_regex())
1619             .map_err(|err| format!("{}", err))
1620             .unwrap();
1621 
1622         let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1623             .bytes(true)
1624             .build()
1625             .map(|exec| exec.into_regex())
1626             .map_err(|err| format!("{}", err))
1627             .unwrap();
1628 
1629         let input = "**";
1630 
1631         let s1 = backtrack_bytes_re.split(input);
1632         let s2 = default_bytes_re.split(input);
1633         for (chunk1, chunk2) in s1.zip(s2) {
1634             assert_eq!(chunk1, chunk2);
1635         }
1636     }
1637 }
1638