1 use std::cell::RefCell;
2 use std::collections::HashMap;
3 use std::sync::Arc;
4 
5 #[cfg(feature = "perf-literal")]
6 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
7 use syntax::hir::literal::Literals;
8 use syntax::hir::Hir;
9 use syntax::ParserBuilder;
10 
11 use backtrack;
12 use cache::{Cached, CachedGuard};
13 use compile::Compiler;
14 #[cfg(feature = "perf-dfa")]
15 use dfa;
16 use error::Error;
17 use input::{ByteInput, CharInput};
18 use literal::LiteralSearcher;
19 use pikevm;
20 use prog::Program;
21 use re_builder::RegexOptions;
22 use re_bytes;
23 use re_set;
24 use re_trait::{Locations, RegularExpression, Slot};
25 use re_unicode;
26 use utf8::next_utf8;
27 
28 /// `Exec` manages the execution of a regular expression.
29 ///
30 /// In particular, this manages the various compiled forms of a single regular
31 /// expression and the choice of which matching engine to use to execute a
32 /// regular expression.
33 pub struct Exec {
34     /// All read only state.
35     ro: Arc<ExecReadOnly>,
36     /// Caches for the various matching engines.
37     cache: Cached<ProgramCache>,
38 }
39 
40 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
41 /// means it is no longer Sync, but we can now avoid the overhead of
42 /// synchronization to fetch the cache.
43 #[derive(Debug)]
44 pub struct ExecNoSync<'c> {
45     /// All read only state.
46     ro: &'c Arc<ExecReadOnly>,
47     /// Caches for the various matching engines.
48     cache: CachedGuard<'c, ProgramCache>,
49 }
50 
51 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
52 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
53 
54 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
55 /// state is determined at compile time and never changes during search.
56 #[derive(Debug)]
57 struct ExecReadOnly {
58     /// The original regular expressions given by the caller to compile.
59     res: Vec<String>,
60     /// A compiled program that is used in the NFA simulation and backtracking.
61     /// It can be byte-based or Unicode codepoint based.
62     ///
63     /// N.B. It is not possibly to make this byte-based from the public API.
64     /// It is only used for testing byte based programs in the NFA simulations.
65     nfa: Program,
66     /// A compiled byte based program for DFA execution. This is only used
67     /// if a DFA can be executed. (Currently, only word boundary assertions are
68     /// not supported.) Note that this program contains an embedded `.*?`
69     /// preceding the first capture group, unless the regex is anchored at the
70     /// beginning.
71     dfa: Program,
72     /// The same as above, except the program is reversed (and there is no
73     /// preceding `.*?`). This is used by the DFA to find the starting location
74     /// of matches.
75     dfa_reverse: Program,
76     /// A set of suffix literals extracted from the regex.
77     ///
78     /// Prefix literals are stored on the `Program`, since they are used inside
79     /// the matching engines.
80     suffixes: LiteralSearcher,
81     /// An Aho-Corasick automaton with leftmost-first match semantics.
82     ///
83     /// This is only set when the entire regex is a simple unanchored
84     /// alternation of literals. We could probably use it more circumstances,
85     /// but this is already hacky enough in this architecture.
86     ///
87     /// N.B. We use u32 as a state ID representation under the assumption that
88     /// if we were to exhaust the ID space, we probably would have long
89     /// surpassed the compilation size limit.
90     #[cfg(feature = "perf-literal")]
91     ac: Option<AhoCorasick<u32>>,
92     /// match_type encodes as much upfront knowledge about how we're going to
93     /// execute a search as possible.
94     match_type: MatchType,
95 }
96 
97 /// Facilitates the construction of an executor by exposing various knobs
98 /// to control how a regex is executed and what kinds of resources it's
99 /// permitted to use.
100 pub struct ExecBuilder {
101     options: RegexOptions,
102     match_type: Option<MatchType>,
103     bytes: bool,
104     only_utf8: bool,
105 }
106 
107 /// Parsed represents a set of parsed regular expressions and their detected
108 /// literals.
109 struct Parsed {
110     exprs: Vec<Hir>,
111     prefixes: Literals,
112     suffixes: Literals,
113     bytes: bool,
114 }
115 
116 impl ExecBuilder {
117     /// Create a regex execution builder.
118     ///
119     /// This uses default settings for everything except the regex itself,
120     /// which must be provided. Further knobs can be set by calling methods,
121     /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self122     pub fn new(re: &str) -> Self {
123         Self::new_many(&[re])
124     }
125 
126     /// Like new, but compiles the union of the given regular expressions.
127     ///
128     /// Note that when compiling 2 or more regular expressions, capture groups
129     /// are completely unsupported. (This means both `find` and `captures`
130     /// wont work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item = S>,131     pub fn new_many<I, S>(res: I) -> Self
132     where
133         S: AsRef<str>,
134         I: IntoIterator<Item = S>,
135     {
136         let mut opts = RegexOptions::default();
137         opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
138         Self::new_options(opts)
139     }
140 
141     /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self142     pub fn new_options(opts: RegexOptions) -> Self {
143         ExecBuilder {
144             options: opts,
145             match_type: None,
146             bytes: false,
147             only_utf8: true,
148         }
149     }
150 
151     /// Set the matching engine to be automatically determined.
152     ///
153     /// This is the default state and will apply whatever optimizations are
154     /// possible, such as running a DFA.
155     ///
156     /// This overrides whatever was previously set via the `nfa` or
157     /// `bounded_backtracking` methods.
automatic(mut self) -> Self158     pub fn automatic(mut self) -> Self {
159         self.match_type = None;
160         self
161     }
162 
163     /// Sets the matching engine to use the NFA algorithm no matter what
164     /// optimizations are possible.
165     ///
166     /// This overrides whatever was previously set via the `automatic` or
167     /// `bounded_backtracking` methods.
nfa(mut self) -> Self168     pub fn nfa(mut self) -> Self {
169         self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
170         self
171     }
172 
173     /// Sets the matching engine to use a bounded backtracking engine no
174     /// matter what optimizations are possible.
175     ///
176     /// One must use this with care, since the bounded backtracking engine
177     /// uses memory proportion to `len(regex) * len(text)`.
178     ///
179     /// This overrides whatever was previously set via the `automatic` or
180     /// `nfa` methods.
bounded_backtracking(mut self) -> Self181     pub fn bounded_backtracking(mut self) -> Self {
182         self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
183         self
184     }
185 
186     /// Compiles byte based programs for use with the NFA matching engines.
187     ///
188     /// By default, the NFA engines match on Unicode scalar values. They can
189     /// be made to use byte based programs instead. In general, the byte based
190     /// programs are slower because of a less efficient encoding of character
191     /// classes.
192     ///
193     /// Note that this does not impact DFA matching engines, which always
194     /// execute on bytes.
bytes(mut self, yes: bool) -> Self195     pub fn bytes(mut self, yes: bool) -> Self {
196         self.bytes = yes;
197         self
198     }
199 
200     /// When disabled, the program compiled may match arbitrary bytes.
201     ///
202     /// When enabled (the default), all compiled programs exclusively match
203     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self204     pub fn only_utf8(mut self, yes: bool) -> Self {
205         self.only_utf8 = yes;
206         self
207     }
208 
209     /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self210     pub fn unicode(mut self, yes: bool) -> Self {
211         self.options.unicode = yes;
212         self
213     }
214 
215     /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>216     fn parse(&self) -> Result<Parsed, Error> {
217         let mut exprs = Vec::with_capacity(self.options.pats.len());
218         let mut prefixes = Some(Literals::empty());
219         let mut suffixes = Some(Literals::empty());
220         let mut bytes = false;
221         let is_set = self.options.pats.len() > 1;
222         // If we're compiling a regex set and that set has any anchored
223         // expressions, then disable all literal optimizations.
224         for pat in &self.options.pats {
225             let mut parser = ParserBuilder::new()
226                 .octal(self.options.octal)
227                 .case_insensitive(self.options.case_insensitive)
228                 .multi_line(self.options.multi_line)
229                 .dot_matches_new_line(self.options.dot_matches_new_line)
230                 .swap_greed(self.options.swap_greed)
231                 .ignore_whitespace(self.options.ignore_whitespace)
232                 .unicode(self.options.unicode)
233                 .allow_invalid_utf8(!self.only_utf8)
234                 .nest_limit(self.options.nest_limit)
235                 .build();
236             let expr =
237                 parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
238             bytes = bytes || !expr.is_always_utf8();
239 
240             if cfg!(feature = "perf-literal") {
241                 if !expr.is_anchored_start() && expr.is_any_anchored_start() {
242                     // Partial anchors unfortunately make it hard to use
243                     // prefixes, so disable them.
244                     prefixes = None;
245                 } else if is_set && expr.is_anchored_start() {
246                     // Regex sets with anchors do not go well with literal
247                     // optimizations.
248                     prefixes = None;
249                 }
250                 prefixes = prefixes.and_then(|mut prefixes| {
251                     if !prefixes.union_prefixes(&expr) {
252                         None
253                     } else {
254                         Some(prefixes)
255                     }
256                 });
257 
258                 if !expr.is_anchored_end() && expr.is_any_anchored_end() {
259                     // Partial anchors unfortunately make it hard to use
260                     // suffixes, so disable them.
261                     suffixes = None;
262                 } else if is_set && expr.is_anchored_end() {
263                     // Regex sets with anchors do not go well with literal
264                     // optimizations.
265                     suffixes = None;
266                 }
267                 suffixes = suffixes.and_then(|mut suffixes| {
268                     if !suffixes.union_suffixes(&expr) {
269                         None
270                     } else {
271                         Some(suffixes)
272                     }
273                 });
274             }
275             exprs.push(expr);
276         }
277         Ok(Parsed {
278             exprs: exprs,
279             prefixes: prefixes.unwrap_or_else(Literals::empty),
280             suffixes: suffixes.unwrap_or_else(Literals::empty),
281             bytes: bytes,
282         })
283     }
284 
285     /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>286     pub fn build(self) -> Result<Exec, Error> {
287         // Special case when we have no patterns to compile.
288         // This can happen when compiling a regex set.
289         if self.options.pats.is_empty() {
290             let ro = Arc::new(ExecReadOnly {
291                 res: vec![],
292                 nfa: Program::new(),
293                 dfa: Program::new(),
294                 dfa_reverse: Program::new(),
295                 suffixes: LiteralSearcher::empty(),
296                 #[cfg(feature = "perf-literal")]
297                 ac: None,
298                 match_type: MatchType::Nothing,
299             });
300             return Ok(Exec { ro: ro, cache: Cached::new() });
301         }
302         let parsed = self.parse()?;
303         let mut nfa = Compiler::new()
304             .size_limit(self.options.size_limit)
305             .bytes(self.bytes || parsed.bytes)
306             .only_utf8(self.only_utf8)
307             .compile(&parsed.exprs)?;
308         let mut dfa = Compiler::new()
309             .size_limit(self.options.size_limit)
310             .dfa(true)
311             .only_utf8(self.only_utf8)
312             .compile(&parsed.exprs)?;
313         let mut dfa_reverse = Compiler::new()
314             .size_limit(self.options.size_limit)
315             .dfa(true)
316             .only_utf8(self.only_utf8)
317             .reverse(true)
318             .compile(&parsed.exprs)?;
319 
320         #[cfg(feature = "perf-literal")]
321         let ac = self.build_aho_corasick(&parsed);
322         nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
323         dfa.prefixes = nfa.prefixes.clone();
324         dfa.dfa_size_limit = self.options.dfa_size_limit;
325         dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
326 
327         let mut ro = ExecReadOnly {
328             res: self.options.pats,
329             nfa: nfa,
330             dfa: dfa,
331             dfa_reverse: dfa_reverse,
332             suffixes: LiteralSearcher::suffixes(parsed.suffixes),
333             #[cfg(feature = "perf-literal")]
334             ac: ac,
335             match_type: MatchType::Nothing,
336         };
337         ro.match_type = ro.choose_match_type(self.match_type);
338 
339         let ro = Arc::new(ro);
340         Ok(Exec { ro: ro, cache: Cached::new() })
341     }
342 
343     #[cfg(feature = "perf-literal")]
build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>>344     fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
345         if parsed.exprs.len() != 1 {
346             return None;
347         }
348         let lits = match alternation_literals(&parsed.exprs[0]) {
349             None => return None,
350             Some(lits) => lits,
351         };
352         // If we have a small number of literals, then let Teddy handle
353         // things (see literal/mod.rs).
354         if lits.len() <= 32 {
355             return None;
356         }
357         Some(
358             AhoCorasickBuilder::new()
359                 .match_kind(MatchKind::LeftmostFirst)
360                 .auto_configure(&lits)
361                 // We always want this to reduce size, regardless
362                 // of what auto-configure does.
363                 .byte_classes(true)
364                 .build_with_size::<u32, _, _>(&lits)
365                 // This should never happen because we'd long exceed the
366                 // compilation limit for regexes first.
367                 .expect("AC automaton too big"),
368         )
369     }
370 }
371 
372 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
373     type Text = str;
374 
slots_len(&self) -> usize375     fn slots_len(&self) -> usize {
376         self.0.slots_len()
377     }
378 
next_after_empty(&self, text: &str, i: usize) -> usize379     fn next_after_empty(&self, text: &str, i: usize) -> usize {
380         next_utf8(text.as_bytes(), i)
381     }
382 
383     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>384     fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
385         self.0.shortest_match_at(text.as_bytes(), start)
386     }
387 
388     #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &str, start: usize) -> bool389     fn is_match_at(&self, text: &str, start: usize) -> bool {
390         self.0.is_match_at(text.as_bytes(), start)
391     }
392 
393     #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>394     fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
395         self.0.find_at(text.as_bytes(), start)
396     }
397 
398     #[cfg_attr(feature = "perf-inline", inline(always))]
captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>399     fn captures_read_at(
400         &self,
401         locs: &mut Locations,
402         text: &str,
403         start: usize,
404     ) -> Option<(usize, usize)> {
405         self.0.captures_read_at(locs, text.as_bytes(), start)
406     }
407 }
408 
409 impl<'c> RegularExpression for ExecNoSync<'c> {
410     type Text = [u8];
411 
412     /// Returns the number of capture slots in the regular expression. (There
413     /// are two slots for every capture group, corresponding to possibly empty
414     /// start and end locations of the capture.)
slots_len(&self) -> usize415     fn slots_len(&self) -> usize {
416         self.ro.nfa.captures.len() * 2
417     }
418 
next_after_empty(&self, _text: &[u8], i: usize) -> usize419     fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
420         i + 1
421     }
422 
423     /// Returns the end of a match location, possibly occurring before the
424     /// end location of the correct leftmost-first match.
425     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>426     fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
427         if !self.is_anchor_end_match(text) {
428             return None;
429         }
430         match self.ro.match_type {
431             #[cfg(feature = "perf-literal")]
432             MatchType::Literal(ty) => {
433                 self.find_literals(ty, text, start).map(|(_, e)| e)
434             }
435             #[cfg(feature = "perf-dfa")]
436             MatchType::Dfa | MatchType::DfaMany => {
437                 match self.shortest_dfa(text, start) {
438                     dfa::Result::Match(end) => Some(end),
439                     dfa::Result::NoMatch(_) => None,
440                     dfa::Result::Quit => self.shortest_nfa(text, start),
441                 }
442             }
443             #[cfg(feature = "perf-dfa")]
444             MatchType::DfaAnchoredReverse => {
445                 match dfa::Fsm::reverse(
446                     &self.ro.dfa_reverse,
447                     self.cache.value(),
448                     true,
449                     &text[start..],
450                     text.len(),
451                 ) {
452                     dfa::Result::Match(_) => Some(text.len()),
453                     dfa::Result::NoMatch(_) => None,
454                     dfa::Result::Quit => self.shortest_nfa(text, start),
455                 }
456             }
457             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
458             MatchType::DfaSuffix => {
459                 match self.shortest_dfa_reverse_suffix(text, start) {
460                     dfa::Result::Match(e) => Some(e),
461                     dfa::Result::NoMatch(_) => None,
462                     dfa::Result::Quit => self.shortest_nfa(text, start),
463                 }
464             }
465             MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
466             MatchType::Nothing => None,
467         }
468     }
469 
470     /// Returns true if and only if the regex matches text.
471     ///
472     /// For single regular expressions, this is equivalent to calling
473     /// shortest_match(...).is_some().
474     #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &[u8], start: usize) -> bool475     fn is_match_at(&self, text: &[u8], start: usize) -> bool {
476         if !self.is_anchor_end_match(text) {
477             return false;
478         }
479         // We need to do this dance because shortest_match relies on the NFA
480         // filling in captures[1], but a RegexSet has no captures. In other
481         // words, a RegexSet can't (currently) use shortest_match. ---AG
482         match self.ro.match_type {
483             #[cfg(feature = "perf-literal")]
484             MatchType::Literal(ty) => {
485                 self.find_literals(ty, text, start).is_some()
486             }
487             #[cfg(feature = "perf-dfa")]
488             MatchType::Dfa | MatchType::DfaMany => {
489                 match self.shortest_dfa(text, start) {
490                     dfa::Result::Match(_) => true,
491                     dfa::Result::NoMatch(_) => false,
492                     dfa::Result::Quit => self.match_nfa(text, start),
493                 }
494             }
495             #[cfg(feature = "perf-dfa")]
496             MatchType::DfaAnchoredReverse => {
497                 match dfa::Fsm::reverse(
498                     &self.ro.dfa_reverse,
499                     self.cache.value(),
500                     true,
501                     &text[start..],
502                     text.len(),
503                 ) {
504                     dfa::Result::Match(_) => true,
505                     dfa::Result::NoMatch(_) => false,
506                     dfa::Result::Quit => self.match_nfa(text, start),
507                 }
508             }
509             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
510             MatchType::DfaSuffix => {
511                 match self.shortest_dfa_reverse_suffix(text, start) {
512                     dfa::Result::Match(_) => true,
513                     dfa::Result::NoMatch(_) => false,
514                     dfa::Result::Quit => self.match_nfa(text, start),
515                 }
516             }
517             MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
518             MatchType::Nothing => false,
519         }
520     }
521 
522     /// Finds the start and end location of the leftmost-first match, starting
523     /// at the given location.
524     #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>525     fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
526         if !self.is_anchor_end_match(text) {
527             return None;
528         }
529         match self.ro.match_type {
530             #[cfg(feature = "perf-literal")]
531             MatchType::Literal(ty) => self.find_literals(ty, text, start),
532             #[cfg(feature = "perf-dfa")]
533             MatchType::Dfa => match self.find_dfa_forward(text, start) {
534                 dfa::Result::Match((s, e)) => Some((s, e)),
535                 dfa::Result::NoMatch(_) => None,
536                 dfa::Result::Quit => {
537                     self.find_nfa(MatchNfaType::Auto, text, start)
538                 }
539             },
540             #[cfg(feature = "perf-dfa")]
541             MatchType::DfaAnchoredReverse => {
542                 match self.find_dfa_anchored_reverse(text, start) {
543                     dfa::Result::Match((s, e)) => Some((s, e)),
544                     dfa::Result::NoMatch(_) => None,
545                     dfa::Result::Quit => {
546                         self.find_nfa(MatchNfaType::Auto, text, start)
547                     }
548                 }
549             }
550             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
551             MatchType::DfaSuffix => {
552                 match self.find_dfa_reverse_suffix(text, start) {
553                     dfa::Result::Match((s, e)) => Some((s, e)),
554                     dfa::Result::NoMatch(_) => None,
555                     dfa::Result::Quit => {
556                         self.find_nfa(MatchNfaType::Auto, text, start)
557                     }
558                 }
559             }
560             MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
561             MatchType::Nothing => None,
562             #[cfg(feature = "perf-dfa")]
563             MatchType::DfaMany => {
564                 unreachable!("BUG: RegexSet cannot be used with find")
565             }
566         }
567     }
568 
569     /// Finds the start and end location of the leftmost-first match and also
570     /// fills in all matching capture groups.
571     ///
572     /// The number of capture slots given should be equal to the total number
573     /// of capture slots in the compiled program.
574     ///
575     /// Note that the first two slots always correspond to the start and end
576     /// locations of the overall match.
captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>577     fn captures_read_at(
578         &self,
579         locs: &mut Locations,
580         text: &[u8],
581         start: usize,
582     ) -> Option<(usize, usize)> {
583         let slots = locs.as_slots();
584         for slot in slots.iter_mut() {
585             *slot = None;
586         }
587         // If the caller unnecessarily uses this, then we try to save them
588         // from themselves.
589         match slots.len() {
590             0 => return self.find_at(text, start),
591             2 => {
592                 return self.find_at(text, start).map(|(s, e)| {
593                     slots[0] = Some(s);
594                     slots[1] = Some(e);
595                     (s, e)
596                 });
597             }
598             _ => {} // fallthrough
599         }
600         if !self.is_anchor_end_match(text) {
601             return None;
602         }
603         match self.ro.match_type {
604             #[cfg(feature = "perf-literal")]
605             MatchType::Literal(ty) => {
606                 self.find_literals(ty, text, start).and_then(|(s, e)| {
607                     self.captures_nfa_type(
608                         MatchNfaType::Auto,
609                         slots,
610                         text,
611                         s,
612                         e,
613                     )
614                 })
615             }
616             #[cfg(feature = "perf-dfa")]
617             MatchType::Dfa => {
618                 if self.ro.nfa.is_anchored_start {
619                     self.captures_nfa(slots, text, start)
620                 } else {
621                     match self.find_dfa_forward(text, start) {
622                         dfa::Result::Match((s, e)) => self.captures_nfa_type(
623                             MatchNfaType::Auto,
624                             slots,
625                             text,
626                             s,
627                             e,
628                         ),
629                         dfa::Result::NoMatch(_) => None,
630                         dfa::Result::Quit => {
631                             self.captures_nfa(slots, text, start)
632                         }
633                     }
634                 }
635             }
636             #[cfg(feature = "perf-dfa")]
637             MatchType::DfaAnchoredReverse => {
638                 match self.find_dfa_anchored_reverse(text, start) {
639                     dfa::Result::Match((s, e)) => self.captures_nfa_type(
640                         MatchNfaType::Auto,
641                         slots,
642                         text,
643                         s,
644                         e,
645                     ),
646                     dfa::Result::NoMatch(_) => None,
647                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
648                 }
649             }
650             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
651             MatchType::DfaSuffix => {
652                 match self.find_dfa_reverse_suffix(text, start) {
653                     dfa::Result::Match((s, e)) => self.captures_nfa_type(
654                         MatchNfaType::Auto,
655                         slots,
656                         text,
657                         s,
658                         e,
659                     ),
660                     dfa::Result::NoMatch(_) => None,
661                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
662                 }
663             }
664             MatchType::Nfa(ty) => {
665                 self.captures_nfa_type(ty, slots, text, start, text.len())
666             }
667             MatchType::Nothing => None,
668             #[cfg(feature = "perf-dfa")]
669             MatchType::DfaMany => {
670                 unreachable!("BUG: RegexSet cannot be used with captures")
671             }
672         }
673     }
674 }
675 
676 impl<'c> ExecNoSync<'c> {
677     /// Finds the leftmost-first match using only literal search.
678     #[cfg(feature = "perf-literal")]
679     #[cfg_attr(feature = "perf-inline", inline(always))]
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>680     fn find_literals(
681         &self,
682         ty: MatchLiteralType,
683         text: &[u8],
684         start: usize,
685     ) -> Option<(usize, usize)> {
686         use self::MatchLiteralType::*;
687         match ty {
688             Unanchored => {
689                 let lits = &self.ro.nfa.prefixes;
690                 lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
691             }
692             AnchoredStart => {
693                 let lits = &self.ro.nfa.prefixes;
694                 if start == 0 || !self.ro.nfa.is_anchored_start {
695                     lits.find_start(&text[start..])
696                         .map(|(s, e)| (start + s, start + e))
697                 } else {
698                     None
699                 }
700             }
701             AnchoredEnd => {
702                 let lits = &self.ro.suffixes;
703                 lits.find_end(&text[start..])
704                     .map(|(s, e)| (start + s, start + e))
705             }
706             AhoCorasick => self
707                 .ro
708                 .ac
709                 .as_ref()
710                 .unwrap()
711                 .find(&text[start..])
712                 .map(|m| (start + m.start(), start + m.end())),
713         }
714     }
715 
716     /// Finds the leftmost-first match (start and end) using only the DFA.
717     ///
718     /// If the result returned indicates that the DFA quit, then another
719     /// matching engine should be used.
720     #[cfg(feature = "perf-dfa")]
721     #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>722     fn find_dfa_forward(
723         &self,
724         text: &[u8],
725         start: usize,
726     ) -> dfa::Result<(usize, usize)> {
727         use dfa::Result::*;
728         let end = match dfa::Fsm::forward(
729             &self.ro.dfa,
730             self.cache.value(),
731             false,
732             text,
733             start,
734         ) {
735             NoMatch(i) => return NoMatch(i),
736             Quit => return Quit,
737             Match(end) if start == end => return Match((start, start)),
738             Match(end) => end,
739         };
740         // Now run the DFA in reverse to find the start of the match.
741         match dfa::Fsm::reverse(
742             &self.ro.dfa_reverse,
743             self.cache.value(),
744             false,
745             &text[start..],
746             end - start,
747         ) {
748             Match(s) => Match((start + s, end)),
749             NoMatch(i) => NoMatch(i),
750             Quit => Quit,
751         }
752     }
753 
754     /// Finds the leftmost-first match (start and end) using only the DFA,
755     /// but assumes the regex is anchored at the end and therefore starts at
756     /// the end of the regex and matches in reverse.
757     ///
758     /// If the result returned indicates that the DFA quit, then another
759     /// matching engine should be used.
760     #[cfg(feature = "perf-dfa")]
761     #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>762     fn find_dfa_anchored_reverse(
763         &self,
764         text: &[u8],
765         start: usize,
766     ) -> dfa::Result<(usize, usize)> {
767         use dfa::Result::*;
768         match dfa::Fsm::reverse(
769             &self.ro.dfa_reverse,
770             self.cache.value(),
771             false,
772             &text[start..],
773             text.len() - start,
774         ) {
775             Match(s) => Match((start + s, text.len())),
776             NoMatch(i) => NoMatch(i),
777             Quit => Quit,
778         }
779     }
780 
781     /// Finds the end of the shortest match using only the DFA.
782     #[cfg(feature = "perf-dfa")]
783     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>784     fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
785         dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
786     }
787 
788     /// Finds the end of the shortest match using only the DFA by scanning for
789     /// suffix literals.
790     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
791     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>792     fn shortest_dfa_reverse_suffix(
793         &self,
794         text: &[u8],
795         start: usize,
796     ) -> dfa::Result<usize> {
797         match self.exec_dfa_reverse_suffix(text, start) {
798             None => self.shortest_dfa(text, start),
799             Some(r) => r.map(|(_, end)| end),
800         }
801     }
802 
803     /// Finds the end of the shortest match using only the DFA by scanning for
804     /// suffix literals. It also reports the start of the match.
805     ///
806     /// Note that if None is returned, then the optimization gave up to avoid
807     /// worst case quadratic behavior. A forward scanning DFA should be tried
808     /// next.
809     ///
810     /// If a match is returned and the full leftmost-first match is desired,
811     /// then a forward scan starting from the beginning of the match must be
812     /// done.
813     ///
814     /// If the result returned indicates that the DFA quit, then another
815     /// matching engine should be used.
816     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
817     #[cfg_attr(feature = "perf-inline", inline(always))]
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>818     fn exec_dfa_reverse_suffix(
819         &self,
820         text: &[u8],
821         original_start: usize,
822     ) -> Option<dfa::Result<(usize, usize)>> {
823         use dfa::Result::*;
824 
825         let lcs = self.ro.suffixes.lcs();
826         debug_assert!(lcs.len() >= 1);
827         let mut start = original_start;
828         let mut end = start;
829         let mut last_literal = start;
830         while end <= text.len() {
831             last_literal += match lcs.find(&text[last_literal..]) {
832                 None => return Some(NoMatch(text.len())),
833                 Some(i) => i,
834             };
835             end = last_literal + lcs.len();
836             match dfa::Fsm::reverse(
837                 &self.ro.dfa_reverse,
838                 self.cache.value(),
839                 false,
840                 &text[start..end],
841                 end - start,
842             ) {
843                 Match(0) | NoMatch(0) => return None,
844                 Match(i) => return Some(Match((start + i, end))),
845                 NoMatch(i) => {
846                     start += i;
847                     last_literal += 1;
848                     continue;
849                 }
850                 Quit => return Some(Quit),
851             };
852         }
853         Some(NoMatch(text.len()))
854     }
855 
856     /// Finds the leftmost-first match (start and end) using only the DFA
857     /// by scanning for suffix literals.
858     ///
859     /// If the result returned indicates that the DFA quit, then another
860     /// matching engine should be used.
861     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
862     #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>863     fn find_dfa_reverse_suffix(
864         &self,
865         text: &[u8],
866         start: usize,
867     ) -> dfa::Result<(usize, usize)> {
868         use dfa::Result::*;
869 
870         let match_start = match self.exec_dfa_reverse_suffix(text, start) {
871             None => return self.find_dfa_forward(text, start),
872             Some(Match((start, _))) => start,
873             Some(r) => return r,
874         };
875         // At this point, we've found a match. The only way to quit now
876         // without a match is if the DFA gives up (seems unlikely).
877         //
878         // Now run the DFA forwards to find the proper end of the match.
879         // (The suffix literal match can only indicate the earliest
880         // possible end location, which may appear before the end of the
881         // leftmost-first match.)
882         match dfa::Fsm::forward(
883             &self.ro.dfa,
884             self.cache.value(),
885             false,
886             text,
887             match_start,
888         ) {
889             NoMatch(_) => panic!("BUG: reverse match implies forward match"),
890             Quit => Quit,
891             Match(e) => Match((match_start, e)),
892         }
893     }
894 
895     /// Executes the NFA engine to return whether there is a match or not.
896     ///
897     /// Ideally, we could use shortest_nfa(...).is_some() and get the same
898     /// performance characteristics, but regex sets don't have captures, which
899     /// shortest_nfa depends on.
900     #[cfg(feature = "perf-dfa")]
match_nfa(&self, text: &[u8], start: usize) -> bool901     fn match_nfa(&self, text: &[u8], start: usize) -> bool {
902         self.match_nfa_type(MatchNfaType::Auto, text, start)
903     }
904 
905     /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool906     fn match_nfa_type(
907         &self,
908         ty: MatchNfaType,
909         text: &[u8],
910         start: usize,
911     ) -> bool {
912         self.exec_nfa(
913             ty,
914             &mut [false],
915             &mut [],
916             true,
917             false,
918             text,
919             start,
920             text.len(),
921         )
922     }
923 
924     /// Finds the shortest match using an NFA.
925     #[cfg(feature = "perf-dfa")]
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>926     fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
927         self.shortest_nfa_type(MatchNfaType::Auto, text, start)
928     }
929 
930     /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>931     fn shortest_nfa_type(
932         &self,
933         ty: MatchNfaType,
934         text: &[u8],
935         start: usize,
936     ) -> Option<usize> {
937         let mut slots = [None, None];
938         if self.exec_nfa(
939             ty,
940             &mut [false],
941             &mut slots,
942             true,
943             true,
944             text,
945             start,
946             text.len(),
947         ) {
948             slots[1]
949         } else {
950             None
951         }
952     }
953 
954     /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>955     fn find_nfa(
956         &self,
957         ty: MatchNfaType,
958         text: &[u8],
959         start: usize,
960     ) -> Option<(usize, usize)> {
961         let mut slots = [None, None];
962         if self.exec_nfa(
963             ty,
964             &mut [false],
965             &mut slots,
966             false,
967             false,
968             text,
969             start,
970             text.len(),
971         ) {
972             match (slots[0], slots[1]) {
973                 (Some(s), Some(e)) => Some((s, e)),
974                 _ => None,
975             }
976         } else {
977             None
978         }
979     }
980 
981     /// Like find_nfa, but fills in captures.
982     ///
983     /// `slots` should have length equal to `2 * nfa.captures.len()`.
984     #[cfg(feature = "perf-dfa")]
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>985     fn captures_nfa(
986         &self,
987         slots: &mut [Slot],
988         text: &[u8],
989         start: usize,
990     ) -> Option<(usize, usize)> {
991         self.captures_nfa_type(
992             MatchNfaType::Auto,
993             slots,
994             text,
995             start,
996             text.len(),
997         )
998     }
999 
1000     /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> Option<(usize, usize)>1001     fn captures_nfa_type(
1002         &self,
1003         ty: MatchNfaType,
1004         slots: &mut [Slot],
1005         text: &[u8],
1006         start: usize,
1007         end: usize,
1008     ) -> Option<(usize, usize)> {
1009         if self.exec_nfa(
1010             ty,
1011             &mut [false],
1012             slots,
1013             false,
1014             false,
1015             text,
1016             start,
1017             end,
1018         ) {
1019             match (slots[0], slots[1]) {
1020                 (Some(s), Some(e)) => Some((s, e)),
1021                 _ => None,
1022             }
1023         } else {
1024             None
1025         }
1026     }
1027 
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, quit_after_match_with_pos: bool, text: &[u8], start: usize, end: usize, ) -> bool1028     fn exec_nfa(
1029         &self,
1030         mut ty: MatchNfaType,
1031         matches: &mut [bool],
1032         slots: &mut [Slot],
1033         quit_after_match: bool,
1034         quit_after_match_with_pos: bool,
1035         text: &[u8],
1036         start: usize,
1037         end: usize,
1038     ) -> bool {
1039         use self::MatchNfaType::*;
1040         if let Auto = ty {
1041             if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1042                 ty = Backtrack;
1043             } else {
1044                 ty = PikeVM;
1045             }
1046         }
1047         // The backtracker can't return the shortest match position as it is
1048         // implemented today. So if someone calls `shortest_match` and we need
1049         // to run an NFA, then use the PikeVM.
1050         if quit_after_match_with_pos || ty == PikeVM {
1051             self.exec_pikevm(
1052                 matches,
1053                 slots,
1054                 quit_after_match,
1055                 text,
1056                 start,
1057                 end,
1058             )
1059         } else {
1060             self.exec_backtrack(matches, slots, text, start, end)
1061         }
1062     }
1063 
1064     /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool1065     fn exec_pikevm(
1066         &self,
1067         matches: &mut [bool],
1068         slots: &mut [Slot],
1069         quit_after_match: bool,
1070         text: &[u8],
1071         start: usize,
1072         end: usize,
1073     ) -> bool {
1074         if self.ro.nfa.uses_bytes() {
1075             pikevm::Fsm::exec(
1076                 &self.ro.nfa,
1077                 self.cache.value(),
1078                 matches,
1079                 slots,
1080                 quit_after_match,
1081                 ByteInput::new(text, self.ro.nfa.only_utf8),
1082                 start,
1083                 end,
1084             )
1085         } else {
1086             pikevm::Fsm::exec(
1087                 &self.ro.nfa,
1088                 self.cache.value(),
1089                 matches,
1090                 slots,
1091                 quit_after_match,
1092                 CharInput::new(text),
1093                 start,
1094                 end,
1095             )
1096         }
1097     }
1098 
1099     /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> bool1100     fn exec_backtrack(
1101         &self,
1102         matches: &mut [bool],
1103         slots: &mut [Slot],
1104         text: &[u8],
1105         start: usize,
1106         end: usize,
1107     ) -> bool {
1108         if self.ro.nfa.uses_bytes() {
1109             backtrack::Bounded::exec(
1110                 &self.ro.nfa,
1111                 self.cache.value(),
1112                 matches,
1113                 slots,
1114                 ByteInput::new(text, self.ro.nfa.only_utf8),
1115                 start,
1116                 end,
1117             )
1118         } else {
1119             backtrack::Bounded::exec(
1120                 &self.ro.nfa,
1121                 self.cache.value(),
1122                 matches,
1123                 slots,
1124                 CharInput::new(text),
1125                 start,
1126                 end,
1127             )
1128         }
1129     }
1130 
1131     /// Finds which regular expressions match the given text.
1132     ///
1133     /// `matches` should have length equal to the number of regexes being
1134     /// searched.
1135     ///
1136     /// This is only useful when one wants to know which regexes in a set
1137     /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1138     pub fn many_matches_at(
1139         &self,
1140         matches: &mut [bool],
1141         text: &[u8],
1142         start: usize,
1143     ) -> bool {
1144         use self::MatchType::*;
1145         if !self.is_anchor_end_match(text) {
1146             return false;
1147         }
1148         match self.ro.match_type {
1149             #[cfg(feature = "perf-literal")]
1150             Literal(ty) => {
1151                 debug_assert_eq!(matches.len(), 1);
1152                 matches[0] = self.find_literals(ty, text, start).is_some();
1153                 matches[0]
1154             }
1155             #[cfg(feature = "perf-dfa")]
1156             Dfa | DfaAnchoredReverse | DfaMany => {
1157                 match dfa::Fsm::forward_many(
1158                     &self.ro.dfa,
1159                     self.cache.value(),
1160                     matches,
1161                     text,
1162                     start,
1163                 ) {
1164                     dfa::Result::Match(_) => true,
1165                     dfa::Result::NoMatch(_) => false,
1166                     dfa::Result::Quit => self.exec_nfa(
1167                         MatchNfaType::Auto,
1168                         matches,
1169                         &mut [],
1170                         false,
1171                         false,
1172                         text,
1173                         start,
1174                         text.len(),
1175                     ),
1176                 }
1177             }
1178             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1179             DfaSuffix => {
1180                 match dfa::Fsm::forward_many(
1181                     &self.ro.dfa,
1182                     self.cache.value(),
1183                     matches,
1184                     text,
1185                     start,
1186                 ) {
1187                     dfa::Result::Match(_) => true,
1188                     dfa::Result::NoMatch(_) => false,
1189                     dfa::Result::Quit => self.exec_nfa(
1190                         MatchNfaType::Auto,
1191                         matches,
1192                         &mut [],
1193                         false,
1194                         false,
1195                         text,
1196                         start,
1197                         text.len(),
1198                     ),
1199                 }
1200             }
1201             Nfa(ty) => self.exec_nfa(
1202                 ty,
1203                 matches,
1204                 &mut [],
1205                 false,
1206                 false,
1207                 text,
1208                 start,
1209                 text.len(),
1210             ),
1211             Nothing => false,
1212         }
1213     }
1214 
1215     #[cfg_attr(feature = "perf-inline", inline(always))]
is_anchor_end_match(&self, text: &[u8]) -> bool1216     fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1217         #[cfg(not(feature = "perf-literal"))]
1218         fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1219             true
1220         }
1221 
1222         #[cfg(feature = "perf-literal")]
1223         fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1224             // Only do this check if the haystack is big (>1MB).
1225             if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1226                 let lcs = ro.suffixes.lcs();
1227                 if lcs.len() >= 1 && !lcs.is_suffix(text) {
1228                     return false;
1229                 }
1230             }
1231             true
1232         }
1233 
1234         imp(&self.ro, text)
1235     }
1236 
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1237     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1238         &self.ro.nfa.capture_name_idx
1239     }
1240 }
1241 
1242 impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1243     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1244         self.0.capture_name_idx()
1245     }
1246 }
1247 
1248 impl Exec {
1249     /// Get a searcher that isn't Sync.
1250     #[cfg_attr(feature = "perf-inline", inline(always))]
searcher(&self) -> ExecNoSync1251     pub fn searcher(&self) -> ExecNoSync {
1252         let create = || RefCell::new(ProgramCacheInner::new(&self.ro));
1253         ExecNoSync {
1254             ro: &self.ro, // a clone is too expensive here! (and not needed)
1255             cache: self.cache.get_or(create),
1256         }
1257     }
1258 
1259     /// Get a searcher that isn't Sync and can match on &str.
1260     #[cfg_attr(feature = "perf-inline", inline(always))]
searcher_str(&self) -> ExecNoSyncStr1261     pub fn searcher_str(&self) -> ExecNoSyncStr {
1262         ExecNoSyncStr(self.searcher())
1263     }
1264 
1265     /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1266     pub fn into_regex(self) -> re_unicode::Regex {
1267         re_unicode::Regex::from(self)
1268     }
1269 
1270     /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1271     pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1272         re_set::unicode::RegexSet::from(self)
1273     }
1274 
1275     /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1276     pub fn into_byte_regex(self) -> re_bytes::Regex {
1277         re_bytes::Regex::from(self)
1278     }
1279 
1280     /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1281     pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1282         re_set::bytes::RegexSet::from(self)
1283     }
1284 
1285     /// The original regular expressions given by the caller that were
1286     /// compiled.
regex_strings(&self) -> &[String]1287     pub fn regex_strings(&self) -> &[String] {
1288         &self.ro.res
1289     }
1290 
1291     /// Return a slice of capture names.
1292     ///
1293     /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1294     pub fn capture_names(&self) -> &[Option<String>] {
1295         &self.ro.nfa.captures
1296     }
1297 
1298     /// Return a reference to named groups mapping (from group name to
1299     /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1300     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1301         &self.ro.nfa.capture_name_idx
1302     }
1303 }
1304 
1305 impl Clone for Exec {
clone(&self) -> Exec1306     fn clone(&self) -> Exec {
1307         Exec { ro: self.ro.clone(), cache: Cached::new() }
1308     }
1309 }
1310 
1311 impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1312     fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1313         if let Some(MatchType::Nfa(_)) = hint {
1314             return hint.unwrap();
1315         }
1316         // If the NFA is empty, then we'll never match anything.
1317         if self.nfa.insts.is_empty() {
1318             return MatchType::Nothing;
1319         }
1320         if let Some(literalty) = self.choose_literal_match_type() {
1321             return literalty;
1322         }
1323         if let Some(dfaty) = self.choose_dfa_match_type() {
1324             return dfaty;
1325         }
1326         // We're so totally hosed.
1327         MatchType::Nfa(MatchNfaType::Auto)
1328     }
1329 
1330     /// If a plain literal scan can be used, then a corresponding literal
1331     /// search type is returned.
choose_literal_match_type(&self) -> Option<MatchType>1332     fn choose_literal_match_type(&self) -> Option<MatchType> {
1333         #[cfg(not(feature = "perf-literal"))]
1334         fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1335             None
1336         }
1337 
1338         #[cfg(feature = "perf-literal")]
1339         fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1340             // If our set of prefixes is complete, then we can use it to find
1341             // a match in lieu of a regex engine. This doesn't quite work well
1342             // in the presence of multiple regexes, so only do it when there's
1343             // one.
1344             //
1345             // TODO(burntsushi): Also, don't try to match literals if the regex
1346             // is partially anchored. We could technically do it, but we'd need
1347             // to create two sets of literals: all of them and then the subset
1348             // that aren't anchored. We would then only search for all of them
1349             // when at the beginning of the input and use the subset in all
1350             // other cases.
1351             if ro.res.len() != 1 {
1352                 return None;
1353             }
1354             if ro.ac.is_some() {
1355                 return Some(MatchType::Literal(
1356                     MatchLiteralType::AhoCorasick,
1357                 ));
1358             }
1359             if ro.nfa.prefixes.complete() {
1360                 return if ro.nfa.is_anchored_start {
1361                     Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1362                 } else {
1363                     Some(MatchType::Literal(MatchLiteralType::Unanchored))
1364                 };
1365             }
1366             if ro.suffixes.complete() {
1367                 return if ro.nfa.is_anchored_end {
1368                     Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1369                 } else {
1370                     // This case shouldn't happen. When the regex isn't
1371                     // anchored, then complete prefixes should imply complete
1372                     // suffixes.
1373                     Some(MatchType::Literal(MatchLiteralType::Unanchored))
1374                 };
1375             }
1376             None
1377         }
1378 
1379         imp(self)
1380     }
1381 
1382     /// If a DFA scan can be used, then choose the appropriate DFA strategy.
choose_dfa_match_type(&self) -> Option<MatchType>1383     fn choose_dfa_match_type(&self) -> Option<MatchType> {
1384         #[cfg(not(feature = "perf-dfa"))]
1385         fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1386             None
1387         }
1388 
1389         #[cfg(feature = "perf-dfa")]
1390         fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1391             if !dfa::can_exec(&ro.dfa) {
1392                 return None;
1393             }
1394             // Regex sets require a slightly specialized path.
1395             if ro.res.len() >= 2 {
1396                 return Some(MatchType::DfaMany);
1397             }
1398             // If the regex is anchored at the end but not the start, then
1399             // just match in reverse from the end of the haystack.
1400             if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1401                 return Some(MatchType::DfaAnchoredReverse);
1402             }
1403             #[cfg(feature = "perf-literal")]
1404             {
1405                 // If there's a longish suffix literal, then it might be faster
1406                 // to look for that first.
1407                 if ro.should_suffix_scan() {
1408                     return Some(MatchType::DfaSuffix);
1409                 }
1410             }
1411             // Fall back to your garden variety forward searching lazy DFA.
1412             Some(MatchType::Dfa)
1413         }
1414 
1415         imp(self)
1416     }
1417 
1418     /// Returns true if the program is amenable to suffix scanning.
1419     ///
1420     /// When this is true, as a heuristic, we assume it is OK to quickly scan
1421     /// for suffix literals and then do a *reverse* DFA match from any matches
1422     /// produced by the literal scan. (And then followed by a forward DFA
1423     /// search, since the previously found suffix literal maybe not actually be
1424     /// the end of a match.)
1425     ///
1426     /// This is a bit of a specialized optimization, but can result in pretty
1427     /// big performance wins if 1) there are no prefix literals and 2) the
1428     /// suffix literals are pretty rare in the text. (1) is obviously easy to
1429     /// account for but (2) is harder. As a proxy, we assume that longer
1430     /// strings are generally rarer, so we only enable this optimization when
1431     /// we have a meaty suffix.
1432     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
should_suffix_scan(&self) -> bool1433     fn should_suffix_scan(&self) -> bool {
1434         if self.suffixes.is_empty() {
1435             return false;
1436         }
1437         let lcs_len = self.suffixes.lcs().char_len();
1438         lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1439     }
1440 }
1441 
1442 #[derive(Clone, Copy, Debug)]
1443 enum MatchType {
1444     /// A single or multiple literal search. This is only used when the regex
1445     /// can be decomposed into a literal search.
1446     #[cfg(feature = "perf-literal")]
1447     Literal(MatchLiteralType),
1448     /// A normal DFA search.
1449     #[cfg(feature = "perf-dfa")]
1450     Dfa,
1451     /// A reverse DFA search starting from the end of a haystack.
1452     #[cfg(feature = "perf-dfa")]
1453     DfaAnchoredReverse,
1454     /// A reverse DFA search with suffix literal scanning.
1455     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1456     DfaSuffix,
1457     /// Use the DFA on two or more regular expressions.
1458     #[cfg(feature = "perf-dfa")]
1459     DfaMany,
1460     /// An NFA variant.
1461     Nfa(MatchNfaType),
1462     /// No match is ever possible, so don't ever try to search.
1463     Nothing,
1464 }
1465 
1466 #[derive(Clone, Copy, Debug)]
1467 #[cfg(feature = "perf-literal")]
1468 enum MatchLiteralType {
1469     /// Match literals anywhere in text.
1470     Unanchored,
1471     /// Match literals only at the start of text.
1472     AnchoredStart,
1473     /// Match literals only at the end of text.
1474     AnchoredEnd,
1475     /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1476     /// ExecReadOnly.
1477     AhoCorasick,
1478 }
1479 
1480 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1481 enum MatchNfaType {
1482     /// Choose between Backtrack and PikeVM.
1483     Auto,
1484     /// NFA bounded backtracking.
1485     ///
1486     /// (This is only set by tests, since it never makes sense to always want
1487     /// backtracking.)
1488     Backtrack,
1489     /// The Pike VM.
1490     ///
1491     /// (This is only set by tests, since it never makes sense to always want
1492     /// the Pike VM.)
1493     PikeVM,
1494 }
1495 
1496 /// `ProgramCache` maintains reusable allocations for each matching engine
1497 /// available to a particular program.
1498 pub type ProgramCache = RefCell<ProgramCacheInner>;
1499 
1500 #[derive(Debug)]
1501 pub struct ProgramCacheInner {
1502     pub pikevm: pikevm::Cache,
1503     pub backtrack: backtrack::Cache,
1504     #[cfg(feature = "perf-dfa")]
1505     pub dfa: dfa::Cache,
1506     #[cfg(feature = "perf-dfa")]
1507     pub dfa_reverse: dfa::Cache,
1508 }
1509 
1510 impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1511     fn new(ro: &ExecReadOnly) -> Self {
1512         ProgramCacheInner {
1513             pikevm: pikevm::Cache::new(&ro.nfa),
1514             backtrack: backtrack::Cache::new(&ro.nfa),
1515             #[cfg(feature = "perf-dfa")]
1516             dfa: dfa::Cache::new(&ro.dfa),
1517             #[cfg(feature = "perf-dfa")]
1518             dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1519         }
1520     }
1521 }
1522 
1523 /// Alternation literals checks if the given HIR is a simple alternation of
1524 /// literals, and if so, returns them. Otherwise, this returns None.
1525 #[cfg(feature = "perf-literal")]
alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>>1526 fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1527     use syntax::hir::{HirKind, Literal};
1528 
1529     // This is pretty hacky, but basically, if `is_alternation_literal` is
1530     // true, then we can make several assumptions about the structure of our
1531     // HIR. This is what justifies the `unreachable!` statements below.
1532     //
1533     // This code should be refactored once we overhaul this crate's
1534     // optimization pipeline, because this is a terribly inflexible way to go
1535     // about things.
1536 
1537     if !expr.is_alternation_literal() {
1538         return None;
1539     }
1540     let alts = match *expr.kind() {
1541         HirKind::Alternation(ref alts) => alts,
1542         _ => return None, // one literal isn't worth it
1543     };
1544 
1545     let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
1546         Literal::Unicode(c) => {
1547             let mut buf = [0; 4];
1548             dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1549         }
1550         Literal::Byte(b) => {
1551             dst.push(b);
1552         }
1553     };
1554 
1555     let mut lits = vec![];
1556     for alt in alts {
1557         let mut lit = vec![];
1558         match *alt.kind() {
1559             HirKind::Literal(ref x) => extendlit(x, &mut lit),
1560             HirKind::Concat(ref exprs) => {
1561                 for e in exprs {
1562                     match *e.kind() {
1563                         HirKind::Literal(ref x) => extendlit(x, &mut lit),
1564                         _ => unreachable!("expected literal, got {:?}", e),
1565                     }
1566                 }
1567             }
1568             _ => unreachable!("expected literal or concat, got {:?}", alt),
1569         }
1570         lits.push(lit);
1571     }
1572     Some(lits)
1573 }
1574 
1575 #[cfg(test)]
1576 mod test {
1577     #[test]
uppercut_s_backtracking_bytes_default_bytes_mismatch()1578     fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1579         use internal::ExecBuilder;
1580 
1581         let backtrack_bytes_re = ExecBuilder::new("^S")
1582             .bounded_backtracking()
1583             .only_utf8(false)
1584             .build()
1585             .map(|exec| exec.into_byte_regex())
1586             .map_err(|err| format!("{}", err))
1587             .unwrap();
1588 
1589         let default_bytes_re = ExecBuilder::new("^S")
1590             .only_utf8(false)
1591             .build()
1592             .map(|exec| exec.into_byte_regex())
1593             .map_err(|err| format!("{}", err))
1594             .unwrap();
1595 
1596         let input = vec![83, 83];
1597 
1598         let s1 = backtrack_bytes_re.split(&input);
1599         let s2 = default_bytes_re.split(&input);
1600         for (chunk1, chunk2) in s1.zip(s2) {
1601             assert_eq!(chunk1, chunk2);
1602         }
1603     }
1604 
1605     #[test]
unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1606     fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1607         use internal::ExecBuilder;
1608 
1609         let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1610             .bounded_backtracking()
1611             .bytes(true)
1612             .build()
1613             .map(|exec| exec.into_regex())
1614             .map_err(|err| format!("{}", err))
1615             .unwrap();
1616 
1617         let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1618             .bytes(true)
1619             .build()
1620             .map(|exec| exec.into_regex())
1621             .map_err(|err| format!("{}", err))
1622             .unwrap();
1623 
1624         let input = "**";
1625 
1626         let s1 = backtrack_bytes_re.split(input);
1627         let s2 = default_bytes_re.split(input);
1628         for (chunk1, chunk2) in s1.zip(s2) {
1629             assert_eq!(chunk1, chunk2);
1630         }
1631     }
1632 }
1633