1 use std::cell::RefCell;
2 use std::collections::HashMap;
3 use std::panic::AssertUnwindSafe;
4 use std::sync::Arc;
5 
6 #[cfg(feature = "perf-literal")]
7 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
8 use regex_syntax::hir::literal::Literals;
9 use regex_syntax::hir::Hir;
10 use regex_syntax::ParserBuilder;
11 
12 use crate::backtrack;
13 use crate::compile::Compiler;
14 #[cfg(feature = "perf-dfa")]
15 use crate::dfa;
16 use crate::error::Error;
17 use crate::input::{ByteInput, CharInput};
18 use crate::literal::LiteralSearcher;
19 use crate::pikevm;
20 use crate::pool::{Pool, PoolGuard};
21 use crate::prog::Program;
22 use crate::re_builder::RegexOptions;
23 use crate::re_bytes;
24 use crate::re_set;
25 use crate::re_trait::{Locations, RegularExpression, Slot};
26 use crate::re_unicode;
27 use crate::utf8::next_utf8;
28 
29 /// `Exec` manages the execution of a regular expression.
30 ///
31 /// In particular, this manages the various compiled forms of a single regular
32 /// expression and the choice of which matching engine to use to execute a
33 /// regular expression.
34 #[derive(Debug)]
35 pub struct Exec {
36     /// All read only state.
37     ro: Arc<ExecReadOnly>,
38     /// A pool of reusable values for the various matching engines.
39     ///
40     /// Note that boxing this value is not strictly necessary, but it is an
41     /// easy way to ensure that T does not bloat the stack sized used by a pool
42     /// in the case where T is big. And this turns out to be the case at the
43     /// time of writing for regex's use of this pool. At the time of writing,
44     /// the size of a Regex on the stack is 856 bytes. Boxing this value
45     /// reduces that size to 16 bytes.
46     pool: Box<Pool<ProgramCache>>,
47 }
48 
49 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
50 /// means it is no longer Sync, but we can now avoid the overhead of
51 /// synchronization to fetch the cache.
52 #[derive(Debug)]
53 pub struct ExecNoSync<'c> {
54     /// All read only state.
55     ro: &'c Arc<ExecReadOnly>,
56     /// Caches for the various matching engines.
57     cache: PoolGuard<'c, ProgramCache>,
58 }
59 
60 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
61 #[derive(Debug)]
62 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
63 
64 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
65 /// state is determined at compile time and never changes during search.
66 #[derive(Debug)]
67 struct ExecReadOnly {
68     /// The original regular expressions given by the caller to compile.
69     res: Vec<String>,
70     /// A compiled program that is used in the NFA simulation and backtracking.
71     /// It can be byte-based or Unicode codepoint based.
72     ///
73     /// N.B. It is not possibly to make this byte-based from the public API.
74     /// It is only used for testing byte based programs in the NFA simulations.
75     nfa: Program,
76     /// A compiled byte based program for DFA execution. This is only used
77     /// if a DFA can be executed. (Currently, only word boundary assertions are
78     /// not supported.) Note that this program contains an embedded `.*?`
79     /// preceding the first capture group, unless the regex is anchored at the
80     /// beginning.
81     dfa: Program,
82     /// The same as above, except the program is reversed (and there is no
83     /// preceding `.*?`). This is used by the DFA to find the starting location
84     /// of matches.
85     dfa_reverse: Program,
86     /// A set of suffix literals extracted from the regex.
87     ///
88     /// Prefix literals are stored on the `Program`, since they are used inside
89     /// the matching engines.
90     suffixes: LiteralSearcher,
91     /// An Aho-Corasick automaton with leftmost-first match semantics.
92     ///
93     /// This is only set when the entire regex is a simple unanchored
94     /// alternation of literals. We could probably use it more circumstances,
95     /// but this is already hacky enough in this architecture.
96     ///
97     /// N.B. We use u32 as a state ID representation under the assumption that
98     /// if we were to exhaust the ID space, we probably would have long
99     /// surpassed the compilation size limit.
100     #[cfg(feature = "perf-literal")]
101     ac: Option<AhoCorasick<u32>>,
102     /// match_type encodes as much upfront knowledge about how we're going to
103     /// execute a search as possible.
104     match_type: MatchType,
105 }
106 
107 /// Facilitates the construction of an executor by exposing various knobs
108 /// to control how a regex is executed and what kinds of resources it's
109 /// permitted to use.
110 // `ExecBuilder` is only public via the `internal` module, so avoid deriving
111 // `Debug`.
112 #[allow(missing_debug_implementations)]
113 pub struct ExecBuilder {
114     options: RegexOptions,
115     match_type: Option<MatchType>,
116     bytes: bool,
117     only_utf8: bool,
118 }
119 
120 /// Parsed represents a set of parsed regular expressions and their detected
121 /// literals.
122 struct Parsed {
123     exprs: Vec<Hir>,
124     prefixes: Literals,
125     suffixes: Literals,
126     bytes: bool,
127 }
128 
129 impl ExecBuilder {
130     /// Create a regex execution builder.
131     ///
132     /// This uses default settings for everything except the regex itself,
133     /// which must be provided. Further knobs can be set by calling methods,
134     /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self135     pub fn new(re: &str) -> Self {
136         Self::new_many(&[re])
137     }
138 
139     /// Like new, but compiles the union of the given regular expressions.
140     ///
141     /// Note that when compiling 2 or more regular expressions, capture groups
142     /// are completely unsupported. (This means both `find` and `captures`
143     /// won't work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item = S>,144     pub fn new_many<I, S>(res: I) -> Self
145     where
146         S: AsRef<str>,
147         I: IntoIterator<Item = S>,
148     {
149         let mut opts = RegexOptions::default();
150         opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
151         Self::new_options(opts)
152     }
153 
154     /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self155     pub fn new_options(opts: RegexOptions) -> Self {
156         ExecBuilder {
157             options: opts,
158             match_type: None,
159             bytes: false,
160             only_utf8: true,
161         }
162     }
163 
164     /// Set the matching engine to be automatically determined.
165     ///
166     /// This is the default state and will apply whatever optimizations are
167     /// possible, such as running a DFA.
168     ///
169     /// This overrides whatever was previously set via the `nfa` or
170     /// `bounded_backtracking` methods.
automatic(mut self) -> Self171     pub fn automatic(mut self) -> Self {
172         self.match_type = None;
173         self
174     }
175 
176     /// Sets the matching engine to use the NFA algorithm no matter what
177     /// optimizations are possible.
178     ///
179     /// This overrides whatever was previously set via the `automatic` or
180     /// `bounded_backtracking` methods.
nfa(mut self) -> Self181     pub fn nfa(mut self) -> Self {
182         self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
183         self
184     }
185 
186     /// Sets the matching engine to use a bounded backtracking engine no
187     /// matter what optimizations are possible.
188     ///
189     /// One must use this with care, since the bounded backtracking engine
190     /// uses memory proportion to `len(regex) * len(text)`.
191     ///
192     /// This overrides whatever was previously set via the `automatic` or
193     /// `nfa` methods.
bounded_backtracking(mut self) -> Self194     pub fn bounded_backtracking(mut self) -> Self {
195         self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
196         self
197     }
198 
199     /// Compiles byte based programs for use with the NFA matching engines.
200     ///
201     /// By default, the NFA engines match on Unicode scalar values. They can
202     /// be made to use byte based programs instead. In general, the byte based
203     /// programs are slower because of a less efficient encoding of character
204     /// classes.
205     ///
206     /// Note that this does not impact DFA matching engines, which always
207     /// execute on bytes.
bytes(mut self, yes: bool) -> Self208     pub fn bytes(mut self, yes: bool) -> Self {
209         self.bytes = yes;
210         self
211     }
212 
213     /// When disabled, the program compiled may match arbitrary bytes.
214     ///
215     /// When enabled (the default), all compiled programs exclusively match
216     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self217     pub fn only_utf8(mut self, yes: bool) -> Self {
218         self.only_utf8 = yes;
219         self
220     }
221 
222     /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self223     pub fn unicode(mut self, yes: bool) -> Self {
224         self.options.unicode = yes;
225         self
226     }
227 
228     /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>229     fn parse(&self) -> Result<Parsed, Error> {
230         let mut exprs = Vec::with_capacity(self.options.pats.len());
231         let mut prefixes = Some(Literals::empty());
232         let mut suffixes = Some(Literals::empty());
233         let mut bytes = false;
234         let is_set = self.options.pats.len() > 1;
235         // If we're compiling a regex set and that set has any anchored
236         // expressions, then disable all literal optimizations.
237         for pat in &self.options.pats {
238             let mut parser = ParserBuilder::new()
239                 .octal(self.options.octal)
240                 .case_insensitive(self.options.case_insensitive)
241                 .multi_line(self.options.multi_line)
242                 .dot_matches_new_line(self.options.dot_matches_new_line)
243                 .swap_greed(self.options.swap_greed)
244                 .ignore_whitespace(self.options.ignore_whitespace)
245                 .unicode(self.options.unicode)
246                 .allow_invalid_utf8(!self.only_utf8)
247                 .nest_limit(self.options.nest_limit)
248                 .build();
249             let expr =
250                 parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
251             bytes = bytes || !expr.is_always_utf8();
252 
253             if cfg!(feature = "perf-literal") {
254                 if !expr.is_anchored_start() && expr.is_any_anchored_start() {
255                     // Partial anchors unfortunately make it hard to use
256                     // prefixes, so disable them.
257                     prefixes = None;
258                 } else if is_set && expr.is_anchored_start() {
259                     // Regex sets with anchors do not go well with literal
260                     // optimizations.
261                     prefixes = None;
262                 }
263                 prefixes = prefixes.and_then(|mut prefixes| {
264                     if !prefixes.union_prefixes(&expr) {
265                         None
266                     } else {
267                         Some(prefixes)
268                     }
269                 });
270 
271                 if !expr.is_anchored_end() && expr.is_any_anchored_end() {
272                     // Partial anchors unfortunately make it hard to use
273                     // suffixes, so disable them.
274                     suffixes = None;
275                 } else if is_set && expr.is_anchored_end() {
276                     // Regex sets with anchors do not go well with literal
277                     // optimizations.
278                     suffixes = None;
279                 }
280                 suffixes = suffixes.and_then(|mut suffixes| {
281                     if !suffixes.union_suffixes(&expr) {
282                         None
283                     } else {
284                         Some(suffixes)
285                     }
286                 });
287             }
288             exprs.push(expr);
289         }
290         Ok(Parsed {
291             exprs: exprs,
292             prefixes: prefixes.unwrap_or_else(Literals::empty),
293             suffixes: suffixes.unwrap_or_else(Literals::empty),
294             bytes: bytes,
295         })
296     }
297 
298     /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>299     pub fn build(self) -> Result<Exec, Error> {
300         // Special case when we have no patterns to compile.
301         // This can happen when compiling a regex set.
302         if self.options.pats.is_empty() {
303             let ro = Arc::new(ExecReadOnly {
304                 res: vec![],
305                 nfa: Program::new(),
306                 dfa: Program::new(),
307                 dfa_reverse: Program::new(),
308                 suffixes: LiteralSearcher::empty(),
309                 #[cfg(feature = "perf-literal")]
310                 ac: None,
311                 match_type: MatchType::Nothing,
312             });
313             let pool = ExecReadOnly::new_pool(&ro);
314             return Ok(Exec { ro: ro, pool });
315         }
316         let parsed = self.parse()?;
317         let mut nfa = Compiler::new()
318             .size_limit(self.options.size_limit)
319             .bytes(self.bytes || parsed.bytes)
320             .only_utf8(self.only_utf8)
321             .compile(&parsed.exprs)?;
322         let mut dfa = Compiler::new()
323             .size_limit(self.options.size_limit)
324             .dfa(true)
325             .only_utf8(self.only_utf8)
326             .compile(&parsed.exprs)?;
327         let mut dfa_reverse = Compiler::new()
328             .size_limit(self.options.size_limit)
329             .dfa(true)
330             .only_utf8(self.only_utf8)
331             .reverse(true)
332             .compile(&parsed.exprs)?;
333 
334         #[cfg(feature = "perf-literal")]
335         let ac = self.build_aho_corasick(&parsed);
336         nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
337         dfa.prefixes = nfa.prefixes.clone();
338         dfa.dfa_size_limit = self.options.dfa_size_limit;
339         dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
340 
341         let mut ro = ExecReadOnly {
342             res: self.options.pats,
343             nfa: nfa,
344             dfa: dfa,
345             dfa_reverse: dfa_reverse,
346             suffixes: LiteralSearcher::suffixes(parsed.suffixes),
347             #[cfg(feature = "perf-literal")]
348             ac: ac,
349             match_type: MatchType::Nothing,
350         };
351         ro.match_type = ro.choose_match_type(self.match_type);
352 
353         let ro = Arc::new(ro);
354         let pool = ExecReadOnly::new_pool(&ro);
355         Ok(Exec { ro, pool })
356     }
357 
358     #[cfg(feature = "perf-literal")]
build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>>359     fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
360         if parsed.exprs.len() != 1 {
361             return None;
362         }
363         let lits = match alternation_literals(&parsed.exprs[0]) {
364             None => return None,
365             Some(lits) => lits,
366         };
367         // If we have a small number of literals, then let Teddy handle
368         // things (see literal/mod.rs).
369         if lits.len() <= 32 {
370             return None;
371         }
372         Some(
373             AhoCorasickBuilder::new()
374                 .match_kind(MatchKind::LeftmostFirst)
375                 .auto_configure(&lits)
376                 .build_with_size::<u32, _, _>(&lits)
377                 // This should never happen because we'd long exceed the
378                 // compilation limit for regexes first.
379                 .expect("AC automaton too big"),
380         )
381     }
382 }
383 
384 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
385     type Text = str;
386 
slots_len(&self) -> usize387     fn slots_len(&self) -> usize {
388         self.0.slots_len()
389     }
390 
next_after_empty(&self, text: &str, i: usize) -> usize391     fn next_after_empty(&self, text: &str, i: usize) -> usize {
392         next_utf8(text.as_bytes(), i)
393     }
394 
395     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>396     fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
397         self.0.shortest_match_at(text.as_bytes(), start)
398     }
399 
400     #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &str, start: usize) -> bool401     fn is_match_at(&self, text: &str, start: usize) -> bool {
402         self.0.is_match_at(text.as_bytes(), start)
403     }
404 
405     #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>406     fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
407         self.0.find_at(text.as_bytes(), start)
408     }
409 
410     #[cfg_attr(feature = "perf-inline", inline(always))]
captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>411     fn captures_read_at(
412         &self,
413         locs: &mut Locations,
414         text: &str,
415         start: usize,
416     ) -> Option<(usize, usize)> {
417         self.0.captures_read_at(locs, text.as_bytes(), start)
418     }
419 }
420 
421 impl<'c> RegularExpression for ExecNoSync<'c> {
422     type Text = [u8];
423 
424     /// Returns the number of capture slots in the regular expression. (There
425     /// are two slots for every capture group, corresponding to possibly empty
426     /// start and end locations of the capture.)
slots_len(&self) -> usize427     fn slots_len(&self) -> usize {
428         self.ro.nfa.captures.len() * 2
429     }
430 
next_after_empty(&self, _text: &[u8], i: usize) -> usize431     fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
432         i + 1
433     }
434 
435     /// Returns the end of a match location, possibly occurring before the
436     /// end location of the correct leftmost-first match.
437     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>438     fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
439         if !self.is_anchor_end_match(text) {
440             return None;
441         }
442         match self.ro.match_type {
443             #[cfg(feature = "perf-literal")]
444             MatchType::Literal(ty) => {
445                 self.find_literals(ty, text, start).map(|(_, e)| e)
446             }
447             #[cfg(feature = "perf-dfa")]
448             MatchType::Dfa | MatchType::DfaMany => {
449                 match self.shortest_dfa(text, start) {
450                     dfa::Result::Match(end) => Some(end),
451                     dfa::Result::NoMatch(_) => None,
452                     dfa::Result::Quit => self.shortest_nfa(text, start),
453                 }
454             }
455             #[cfg(feature = "perf-dfa")]
456             MatchType::DfaAnchoredReverse => {
457                 match dfa::Fsm::reverse(
458                     &self.ro.dfa_reverse,
459                     self.cache.value(),
460                     true,
461                     &text[start..],
462                     text.len(),
463                 ) {
464                     dfa::Result::Match(_) => Some(text.len()),
465                     dfa::Result::NoMatch(_) => None,
466                     dfa::Result::Quit => self.shortest_nfa(text, start),
467                 }
468             }
469             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
470             MatchType::DfaSuffix => {
471                 match self.shortest_dfa_reverse_suffix(text, start) {
472                     dfa::Result::Match(e) => Some(e),
473                     dfa::Result::NoMatch(_) => None,
474                     dfa::Result::Quit => self.shortest_nfa(text, start),
475                 }
476             }
477             MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
478             MatchType::Nothing => None,
479         }
480     }
481 
482     /// Returns true if and only if the regex matches text.
483     ///
484     /// For single regular expressions, this is equivalent to calling
485     /// shortest_match(...).is_some().
486     #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &[u8], start: usize) -> bool487     fn is_match_at(&self, text: &[u8], start: usize) -> bool {
488         if !self.is_anchor_end_match(text) {
489             return false;
490         }
491         // We need to do this dance because shortest_match relies on the NFA
492         // filling in captures[1], but a RegexSet has no captures. In other
493         // words, a RegexSet can't (currently) use shortest_match. ---AG
494         match self.ro.match_type {
495             #[cfg(feature = "perf-literal")]
496             MatchType::Literal(ty) => {
497                 self.find_literals(ty, text, start).is_some()
498             }
499             #[cfg(feature = "perf-dfa")]
500             MatchType::Dfa | MatchType::DfaMany => {
501                 match self.shortest_dfa(text, start) {
502                     dfa::Result::Match(_) => true,
503                     dfa::Result::NoMatch(_) => false,
504                     dfa::Result::Quit => self.match_nfa(text, start),
505                 }
506             }
507             #[cfg(feature = "perf-dfa")]
508             MatchType::DfaAnchoredReverse => {
509                 match dfa::Fsm::reverse(
510                     &self.ro.dfa_reverse,
511                     self.cache.value(),
512                     true,
513                     &text[start..],
514                     text.len(),
515                 ) {
516                     dfa::Result::Match(_) => true,
517                     dfa::Result::NoMatch(_) => false,
518                     dfa::Result::Quit => self.match_nfa(text, start),
519                 }
520             }
521             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
522             MatchType::DfaSuffix => {
523                 match self.shortest_dfa_reverse_suffix(text, start) {
524                     dfa::Result::Match(_) => true,
525                     dfa::Result::NoMatch(_) => false,
526                     dfa::Result::Quit => self.match_nfa(text, start),
527                 }
528             }
529             MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
530             MatchType::Nothing => false,
531         }
532     }
533 
534     /// Finds the start and end location of the leftmost-first match, starting
535     /// at the given location.
536     #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>537     fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
538         if !self.is_anchor_end_match(text) {
539             return None;
540         }
541         match self.ro.match_type {
542             #[cfg(feature = "perf-literal")]
543             MatchType::Literal(ty) => self.find_literals(ty, text, start),
544             #[cfg(feature = "perf-dfa")]
545             MatchType::Dfa => match self.find_dfa_forward(text, start) {
546                 dfa::Result::Match((s, e)) => Some((s, e)),
547                 dfa::Result::NoMatch(_) => None,
548                 dfa::Result::Quit => {
549                     self.find_nfa(MatchNfaType::Auto, text, start)
550                 }
551             },
552             #[cfg(feature = "perf-dfa")]
553             MatchType::DfaAnchoredReverse => {
554                 match self.find_dfa_anchored_reverse(text, start) {
555                     dfa::Result::Match((s, e)) => Some((s, e)),
556                     dfa::Result::NoMatch(_) => None,
557                     dfa::Result::Quit => {
558                         self.find_nfa(MatchNfaType::Auto, text, start)
559                     }
560                 }
561             }
562             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
563             MatchType::DfaSuffix => {
564                 match self.find_dfa_reverse_suffix(text, start) {
565                     dfa::Result::Match((s, e)) => Some((s, e)),
566                     dfa::Result::NoMatch(_) => None,
567                     dfa::Result::Quit => {
568                         self.find_nfa(MatchNfaType::Auto, text, start)
569                     }
570                 }
571             }
572             MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
573             MatchType::Nothing => None,
574             #[cfg(feature = "perf-dfa")]
575             MatchType::DfaMany => {
576                 unreachable!("BUG: RegexSet cannot be used with find")
577             }
578         }
579     }
580 
581     /// Finds the start and end location of the leftmost-first match and also
582     /// fills in all matching capture groups.
583     ///
584     /// The number of capture slots given should be equal to the total number
585     /// of capture slots in the compiled program.
586     ///
587     /// Note that the first two slots always correspond to the start and end
588     /// locations of the overall match.
captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>589     fn captures_read_at(
590         &self,
591         locs: &mut Locations,
592         text: &[u8],
593         start: usize,
594     ) -> Option<(usize, usize)> {
595         let slots = locs.as_slots();
596         for slot in slots.iter_mut() {
597             *slot = None;
598         }
599         // If the caller unnecessarily uses this, then we try to save them
600         // from themselves.
601         match slots.len() {
602             0 => return self.find_at(text, start),
603             2 => {
604                 return self.find_at(text, start).map(|(s, e)| {
605                     slots[0] = Some(s);
606                     slots[1] = Some(e);
607                     (s, e)
608                 });
609             }
610             _ => {} // fallthrough
611         }
612         if !self.is_anchor_end_match(text) {
613             return None;
614         }
615         match self.ro.match_type {
616             #[cfg(feature = "perf-literal")]
617             MatchType::Literal(ty) => {
618                 self.find_literals(ty, text, start).and_then(|(s, e)| {
619                     self.captures_nfa_type(
620                         MatchNfaType::Auto,
621                         slots,
622                         text,
623                         s,
624                         e,
625                     )
626                 })
627             }
628             #[cfg(feature = "perf-dfa")]
629             MatchType::Dfa => {
630                 if self.ro.nfa.is_anchored_start {
631                     self.captures_nfa(slots, text, start)
632                 } else {
633                     match self.find_dfa_forward(text, start) {
634                         dfa::Result::Match((s, e)) => self.captures_nfa_type(
635                             MatchNfaType::Auto,
636                             slots,
637                             text,
638                             s,
639                             e,
640                         ),
641                         dfa::Result::NoMatch(_) => None,
642                         dfa::Result::Quit => {
643                             self.captures_nfa(slots, text, start)
644                         }
645                     }
646                 }
647             }
648             #[cfg(feature = "perf-dfa")]
649             MatchType::DfaAnchoredReverse => {
650                 match self.find_dfa_anchored_reverse(text, start) {
651                     dfa::Result::Match((s, e)) => self.captures_nfa_type(
652                         MatchNfaType::Auto,
653                         slots,
654                         text,
655                         s,
656                         e,
657                     ),
658                     dfa::Result::NoMatch(_) => None,
659                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
660                 }
661             }
662             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
663             MatchType::DfaSuffix => {
664                 match self.find_dfa_reverse_suffix(text, start) {
665                     dfa::Result::Match((s, e)) => self.captures_nfa_type(
666                         MatchNfaType::Auto,
667                         slots,
668                         text,
669                         s,
670                         e,
671                     ),
672                     dfa::Result::NoMatch(_) => None,
673                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
674                 }
675             }
676             MatchType::Nfa(ty) => {
677                 self.captures_nfa_type(ty, slots, text, start, text.len())
678             }
679             MatchType::Nothing => None,
680             #[cfg(feature = "perf-dfa")]
681             MatchType::DfaMany => {
682                 unreachable!("BUG: RegexSet cannot be used with captures")
683             }
684         }
685     }
686 }
687 
688 impl<'c> ExecNoSync<'c> {
689     /// Finds the leftmost-first match using only literal search.
690     #[cfg(feature = "perf-literal")]
691     #[cfg_attr(feature = "perf-inline", inline(always))]
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>692     fn find_literals(
693         &self,
694         ty: MatchLiteralType,
695         text: &[u8],
696         start: usize,
697     ) -> Option<(usize, usize)> {
698         use self::MatchLiteralType::*;
699         match ty {
700             Unanchored => {
701                 let lits = &self.ro.nfa.prefixes;
702                 lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
703             }
704             AnchoredStart => {
705                 let lits = &self.ro.nfa.prefixes;
706                 if start == 0 || !self.ro.nfa.is_anchored_start {
707                     lits.find_start(&text[start..])
708                         .map(|(s, e)| (start + s, start + e))
709                 } else {
710                     None
711                 }
712             }
713             AnchoredEnd => {
714                 let lits = &self.ro.suffixes;
715                 lits.find_end(&text[start..])
716                     .map(|(s, e)| (start + s, start + e))
717             }
718             AhoCorasick => self
719                 .ro
720                 .ac
721                 .as_ref()
722                 .unwrap()
723                 .find(&text[start..])
724                 .map(|m| (start + m.start(), start + m.end())),
725         }
726     }
727 
728     /// Finds the leftmost-first match (start and end) using only the DFA.
729     ///
730     /// If the result returned indicates that the DFA quit, then another
731     /// matching engine should be used.
732     #[cfg(feature = "perf-dfa")]
733     #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>734     fn find_dfa_forward(
735         &self,
736         text: &[u8],
737         start: usize,
738     ) -> dfa::Result<(usize, usize)> {
739         use crate::dfa::Result::*;
740         let end = match dfa::Fsm::forward(
741             &self.ro.dfa,
742             self.cache.value(),
743             false,
744             text,
745             start,
746         ) {
747             NoMatch(i) => return NoMatch(i),
748             Quit => return Quit,
749             Match(end) if start == end => return Match((start, start)),
750             Match(end) => end,
751         };
752         // Now run the DFA in reverse to find the start of the match.
753         match dfa::Fsm::reverse(
754             &self.ro.dfa_reverse,
755             self.cache.value(),
756             false,
757             &text[start..],
758             end - start,
759         ) {
760             Match(s) => Match((start + s, end)),
761             NoMatch(i) => NoMatch(i),
762             Quit => Quit,
763         }
764     }
765 
766     /// Finds the leftmost-first match (start and end) using only the DFA,
767     /// but assumes the regex is anchored at the end and therefore starts at
768     /// the end of the regex and matches in reverse.
769     ///
770     /// If the result returned indicates that the DFA quit, then another
771     /// matching engine should be used.
772     #[cfg(feature = "perf-dfa")]
773     #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>774     fn find_dfa_anchored_reverse(
775         &self,
776         text: &[u8],
777         start: usize,
778     ) -> dfa::Result<(usize, usize)> {
779         use crate::dfa::Result::*;
780         match dfa::Fsm::reverse(
781             &self.ro.dfa_reverse,
782             self.cache.value(),
783             false,
784             &text[start..],
785             text.len() - start,
786         ) {
787             Match(s) => Match((start + s, text.len())),
788             NoMatch(i) => NoMatch(i),
789             Quit => Quit,
790         }
791     }
792 
793     /// Finds the end of the shortest match using only the DFA.
794     #[cfg(feature = "perf-dfa")]
795     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>796     fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
797         dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
798     }
799 
800     /// Finds the end of the shortest match using only the DFA by scanning for
801     /// suffix literals.
802     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
803     #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>804     fn shortest_dfa_reverse_suffix(
805         &self,
806         text: &[u8],
807         start: usize,
808     ) -> dfa::Result<usize> {
809         match self.exec_dfa_reverse_suffix(text, start) {
810             None => self.shortest_dfa(text, start),
811             Some(r) => r.map(|(_, end)| end),
812         }
813     }
814 
815     /// Finds the end of the shortest match using only the DFA by scanning for
816     /// suffix literals. It also reports the start of the match.
817     ///
818     /// Note that if None is returned, then the optimization gave up to avoid
819     /// worst case quadratic behavior. A forward scanning DFA should be tried
820     /// next.
821     ///
822     /// If a match is returned and the full leftmost-first match is desired,
823     /// then a forward scan starting from the beginning of the match must be
824     /// done.
825     ///
826     /// If the result returned indicates that the DFA quit, then another
827     /// matching engine should be used.
828     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
829     #[cfg_attr(feature = "perf-inline", inline(always))]
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>830     fn exec_dfa_reverse_suffix(
831         &self,
832         text: &[u8],
833         original_start: usize,
834     ) -> Option<dfa::Result<(usize, usize)>> {
835         use crate::dfa::Result::*;
836 
837         let lcs = self.ro.suffixes.lcs();
838         debug_assert!(lcs.len() >= 1);
839         let mut start = original_start;
840         let mut end = start;
841         let mut last_literal = start;
842         while end <= text.len() {
843             last_literal += match lcs.find(&text[last_literal..]) {
844                 None => return Some(NoMatch(text.len())),
845                 Some(i) => i,
846             };
847             end = last_literal + lcs.len();
848             match dfa::Fsm::reverse(
849                 &self.ro.dfa_reverse,
850                 self.cache.value(),
851                 false,
852                 &text[start..end],
853                 end - start,
854             ) {
855                 Match(0) | NoMatch(0) => return None,
856                 Match(i) => return Some(Match((start + i, end))),
857                 NoMatch(i) => {
858                     start += i;
859                     last_literal += 1;
860                     continue;
861                 }
862                 Quit => return Some(Quit),
863             };
864         }
865         Some(NoMatch(text.len()))
866     }
867 
868     /// Finds the leftmost-first match (start and end) using only the DFA
869     /// by scanning for suffix literals.
870     ///
871     /// If the result returned indicates that the DFA quit, then another
872     /// matching engine should be used.
873     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
874     #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>875     fn find_dfa_reverse_suffix(
876         &self,
877         text: &[u8],
878         start: usize,
879     ) -> dfa::Result<(usize, usize)> {
880         use crate::dfa::Result::*;
881 
882         let match_start = match self.exec_dfa_reverse_suffix(text, start) {
883             None => return self.find_dfa_forward(text, start),
884             Some(Match((start, _))) => start,
885             Some(r) => return r,
886         };
887         // At this point, we've found a match. The only way to quit now
888         // without a match is if the DFA gives up (seems unlikely).
889         //
890         // Now run the DFA forwards to find the proper end of the match.
891         // (The suffix literal match can only indicate the earliest
892         // possible end location, which may appear before the end of the
893         // leftmost-first match.)
894         match dfa::Fsm::forward(
895             &self.ro.dfa,
896             self.cache.value(),
897             false,
898             text,
899             match_start,
900         ) {
901             NoMatch(_) => panic!("BUG: reverse match implies forward match"),
902             Quit => Quit,
903             Match(e) => Match((match_start, e)),
904         }
905     }
906 
907     /// Executes the NFA engine to return whether there is a match or not.
908     ///
909     /// Ideally, we could use shortest_nfa(...).is_some() and get the same
910     /// performance characteristics, but regex sets don't have captures, which
911     /// shortest_nfa depends on.
912     #[cfg(feature = "perf-dfa")]
match_nfa(&self, text: &[u8], start: usize) -> bool913     fn match_nfa(&self, text: &[u8], start: usize) -> bool {
914         self.match_nfa_type(MatchNfaType::Auto, text, start)
915     }
916 
917     /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool918     fn match_nfa_type(
919         &self,
920         ty: MatchNfaType,
921         text: &[u8],
922         start: usize,
923     ) -> bool {
924         self.exec_nfa(
925             ty,
926             &mut [false],
927             &mut [],
928             true,
929             false,
930             text,
931             start,
932             text.len(),
933         )
934     }
935 
936     /// Finds the shortest match using an NFA.
937     #[cfg(feature = "perf-dfa")]
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>938     fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
939         self.shortest_nfa_type(MatchNfaType::Auto, text, start)
940     }
941 
942     /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>943     fn shortest_nfa_type(
944         &self,
945         ty: MatchNfaType,
946         text: &[u8],
947         start: usize,
948     ) -> Option<usize> {
949         let mut slots = [None, None];
950         if self.exec_nfa(
951             ty,
952             &mut [false],
953             &mut slots,
954             true,
955             true,
956             text,
957             start,
958             text.len(),
959         ) {
960             slots[1]
961         } else {
962             None
963         }
964     }
965 
966     /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>967     fn find_nfa(
968         &self,
969         ty: MatchNfaType,
970         text: &[u8],
971         start: usize,
972     ) -> Option<(usize, usize)> {
973         let mut slots = [None, None];
974         if self.exec_nfa(
975             ty,
976             &mut [false],
977             &mut slots,
978             false,
979             false,
980             text,
981             start,
982             text.len(),
983         ) {
984             match (slots[0], slots[1]) {
985                 (Some(s), Some(e)) => Some((s, e)),
986                 _ => None,
987             }
988         } else {
989             None
990         }
991     }
992 
993     /// Like find_nfa, but fills in captures.
994     ///
995     /// `slots` should have length equal to `2 * nfa.captures.len()`.
996     #[cfg(feature = "perf-dfa")]
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>997     fn captures_nfa(
998         &self,
999         slots: &mut [Slot],
1000         text: &[u8],
1001         start: usize,
1002     ) -> Option<(usize, usize)> {
1003         self.captures_nfa_type(
1004             MatchNfaType::Auto,
1005             slots,
1006             text,
1007             start,
1008             text.len(),
1009         )
1010     }
1011 
1012     /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> Option<(usize, usize)>1013     fn captures_nfa_type(
1014         &self,
1015         ty: MatchNfaType,
1016         slots: &mut [Slot],
1017         text: &[u8],
1018         start: usize,
1019         end: usize,
1020     ) -> Option<(usize, usize)> {
1021         if self.exec_nfa(
1022             ty,
1023             &mut [false],
1024             slots,
1025             false,
1026             false,
1027             text,
1028             start,
1029             end,
1030         ) {
1031             match (slots[0], slots[1]) {
1032                 (Some(s), Some(e)) => Some((s, e)),
1033                 _ => None,
1034             }
1035         } else {
1036             None
1037         }
1038     }
1039 
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, quit_after_match_with_pos: bool, text: &[u8], start: usize, end: usize, ) -> bool1040     fn exec_nfa(
1041         &self,
1042         mut ty: MatchNfaType,
1043         matches: &mut [bool],
1044         slots: &mut [Slot],
1045         quit_after_match: bool,
1046         quit_after_match_with_pos: bool,
1047         text: &[u8],
1048         start: usize,
1049         end: usize,
1050     ) -> bool {
1051         use self::MatchNfaType::*;
1052         if let Auto = ty {
1053             if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1054                 ty = Backtrack;
1055             } else {
1056                 ty = PikeVM;
1057             }
1058         }
1059         // The backtracker can't return the shortest match position as it is
1060         // implemented today. So if someone calls `shortest_match` and we need
1061         // to run an NFA, then use the PikeVM.
1062         if quit_after_match_with_pos || ty == PikeVM {
1063             self.exec_pikevm(
1064                 matches,
1065                 slots,
1066                 quit_after_match,
1067                 text,
1068                 start,
1069                 end,
1070             )
1071         } else {
1072             self.exec_backtrack(matches, slots, text, start, end)
1073         }
1074     }
1075 
1076     /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool1077     fn exec_pikevm(
1078         &self,
1079         matches: &mut [bool],
1080         slots: &mut [Slot],
1081         quit_after_match: bool,
1082         text: &[u8],
1083         start: usize,
1084         end: usize,
1085     ) -> bool {
1086         if self.ro.nfa.uses_bytes() {
1087             pikevm::Fsm::exec(
1088                 &self.ro.nfa,
1089                 self.cache.value(),
1090                 matches,
1091                 slots,
1092                 quit_after_match,
1093                 ByteInput::new(text, self.ro.nfa.only_utf8),
1094                 start,
1095                 end,
1096             )
1097         } else {
1098             pikevm::Fsm::exec(
1099                 &self.ro.nfa,
1100                 self.cache.value(),
1101                 matches,
1102                 slots,
1103                 quit_after_match,
1104                 CharInput::new(text),
1105                 start,
1106                 end,
1107             )
1108         }
1109     }
1110 
1111     /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> bool1112     fn exec_backtrack(
1113         &self,
1114         matches: &mut [bool],
1115         slots: &mut [Slot],
1116         text: &[u8],
1117         start: usize,
1118         end: usize,
1119     ) -> bool {
1120         if self.ro.nfa.uses_bytes() {
1121             backtrack::Bounded::exec(
1122                 &self.ro.nfa,
1123                 self.cache.value(),
1124                 matches,
1125                 slots,
1126                 ByteInput::new(text, self.ro.nfa.only_utf8),
1127                 start,
1128                 end,
1129             )
1130         } else {
1131             backtrack::Bounded::exec(
1132                 &self.ro.nfa,
1133                 self.cache.value(),
1134                 matches,
1135                 slots,
1136                 CharInput::new(text),
1137                 start,
1138                 end,
1139             )
1140         }
1141     }
1142 
1143     /// Finds which regular expressions match the given text.
1144     ///
1145     /// `matches` should have length equal to the number of regexes being
1146     /// searched.
1147     ///
1148     /// This is only useful when one wants to know which regexes in a set
1149     /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1150     pub fn many_matches_at(
1151         &self,
1152         matches: &mut [bool],
1153         text: &[u8],
1154         start: usize,
1155     ) -> bool {
1156         use self::MatchType::*;
1157         if !self.is_anchor_end_match(text) {
1158             return false;
1159         }
1160         match self.ro.match_type {
1161             #[cfg(feature = "perf-literal")]
1162             Literal(ty) => {
1163                 debug_assert_eq!(matches.len(), 1);
1164                 matches[0] = self.find_literals(ty, text, start).is_some();
1165                 matches[0]
1166             }
1167             #[cfg(feature = "perf-dfa")]
1168             Dfa | DfaAnchoredReverse | DfaMany => {
1169                 match dfa::Fsm::forward_many(
1170                     &self.ro.dfa,
1171                     self.cache.value(),
1172                     matches,
1173                     text,
1174                     start,
1175                 ) {
1176                     dfa::Result::Match(_) => true,
1177                     dfa::Result::NoMatch(_) => false,
1178                     dfa::Result::Quit => self.exec_nfa(
1179                         MatchNfaType::Auto,
1180                         matches,
1181                         &mut [],
1182                         false,
1183                         false,
1184                         text,
1185                         start,
1186                         text.len(),
1187                     ),
1188                 }
1189             }
1190             #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1191             DfaSuffix => {
1192                 match dfa::Fsm::forward_many(
1193                     &self.ro.dfa,
1194                     self.cache.value(),
1195                     matches,
1196                     text,
1197                     start,
1198                 ) {
1199                     dfa::Result::Match(_) => true,
1200                     dfa::Result::NoMatch(_) => false,
1201                     dfa::Result::Quit => self.exec_nfa(
1202                         MatchNfaType::Auto,
1203                         matches,
1204                         &mut [],
1205                         false,
1206                         false,
1207                         text,
1208                         start,
1209                         text.len(),
1210                     ),
1211                 }
1212             }
1213             Nfa(ty) => self.exec_nfa(
1214                 ty,
1215                 matches,
1216                 &mut [],
1217                 false,
1218                 false,
1219                 text,
1220                 start,
1221                 text.len(),
1222             ),
1223             Nothing => false,
1224         }
1225     }
1226 
1227     #[cfg_attr(feature = "perf-inline", inline(always))]
is_anchor_end_match(&self, text: &[u8]) -> bool1228     fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1229         #[cfg(not(feature = "perf-literal"))]
1230         fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1231             true
1232         }
1233 
1234         #[cfg(feature = "perf-literal")]
1235         fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1236             // Only do this check if the haystack is big (>1MB).
1237             if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1238                 let lcs = ro.suffixes.lcs();
1239                 if lcs.len() >= 1 && !lcs.is_suffix(text) {
1240                     return false;
1241                 }
1242             }
1243             true
1244         }
1245 
1246         imp(&self.ro, text)
1247     }
1248 
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1249     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1250         &self.ro.nfa.capture_name_idx
1251     }
1252 }
1253 
1254 impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1255     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1256         self.0.capture_name_idx()
1257     }
1258 }
1259 
1260 impl Exec {
1261     /// Get a searcher that isn't Sync.
1262     #[cfg_attr(feature = "perf-inline", inline(always))]
searcher(&self) -> ExecNoSync<'_>1263     pub fn searcher(&self) -> ExecNoSync<'_> {
1264         ExecNoSync {
1265             ro: &self.ro, // a clone is too expensive here! (and not needed)
1266             cache: self.pool.get(),
1267         }
1268     }
1269 
1270     /// Get a searcher that isn't Sync and can match on &str.
1271     #[cfg_attr(feature = "perf-inline", inline(always))]
searcher_str(&self) -> ExecNoSyncStr<'_>1272     pub fn searcher_str(&self) -> ExecNoSyncStr<'_> {
1273         ExecNoSyncStr(self.searcher())
1274     }
1275 
1276     /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1277     pub fn into_regex(self) -> re_unicode::Regex {
1278         re_unicode::Regex::from(self)
1279     }
1280 
1281     /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1282     pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1283         re_set::unicode::RegexSet::from(self)
1284     }
1285 
1286     /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1287     pub fn into_byte_regex(self) -> re_bytes::Regex {
1288         re_bytes::Regex::from(self)
1289     }
1290 
1291     /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1292     pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1293         re_set::bytes::RegexSet::from(self)
1294     }
1295 
1296     /// The original regular expressions given by the caller that were
1297     /// compiled.
regex_strings(&self) -> &[String]1298     pub fn regex_strings(&self) -> &[String] {
1299         &self.ro.res
1300     }
1301 
1302     /// Return a slice of capture names.
1303     ///
1304     /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1305     pub fn capture_names(&self) -> &[Option<String>] {
1306         &self.ro.nfa.captures
1307     }
1308 
1309     /// Return a reference to named groups mapping (from group name to
1310     /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1311     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1312         &self.ro.nfa.capture_name_idx
1313     }
1314 }
1315 
1316 impl Clone for Exec {
clone(&self) -> Exec1317     fn clone(&self) -> Exec {
1318         let pool = ExecReadOnly::new_pool(&self.ro);
1319         Exec { ro: self.ro.clone(), pool }
1320     }
1321 }
1322 
1323 impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1324     fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1325         if let Some(MatchType::Nfa(_)) = hint {
1326             return hint.unwrap();
1327         }
1328         // If the NFA is empty, then we'll never match anything.
1329         if self.nfa.insts.is_empty() {
1330             return MatchType::Nothing;
1331         }
1332         if let Some(literalty) = self.choose_literal_match_type() {
1333             return literalty;
1334         }
1335         if let Some(dfaty) = self.choose_dfa_match_type() {
1336             return dfaty;
1337         }
1338         // We're so totally hosed.
1339         MatchType::Nfa(MatchNfaType::Auto)
1340     }
1341 
1342     /// If a plain literal scan can be used, then a corresponding literal
1343     /// search type is returned.
choose_literal_match_type(&self) -> Option<MatchType>1344     fn choose_literal_match_type(&self) -> Option<MatchType> {
1345         #[cfg(not(feature = "perf-literal"))]
1346         fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1347             None
1348         }
1349 
1350         #[cfg(feature = "perf-literal")]
1351         fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1352             // If our set of prefixes is complete, then we can use it to find
1353             // a match in lieu of a regex engine. This doesn't quite work well
1354             // in the presence of multiple regexes, so only do it when there's
1355             // one.
1356             //
1357             // TODO(burntsushi): Also, don't try to match literals if the regex
1358             // is partially anchored. We could technically do it, but we'd need
1359             // to create two sets of literals: all of them and then the subset
1360             // that aren't anchored. We would then only search for all of them
1361             // when at the beginning of the input and use the subset in all
1362             // other cases.
1363             if ro.res.len() != 1 {
1364                 return None;
1365             }
1366             if ro.ac.is_some() {
1367                 return Some(MatchType::Literal(
1368                     MatchLiteralType::AhoCorasick,
1369                 ));
1370             }
1371             if ro.nfa.prefixes.complete() {
1372                 return if ro.nfa.is_anchored_start {
1373                     Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1374                 } else {
1375                     Some(MatchType::Literal(MatchLiteralType::Unanchored))
1376                 };
1377             }
1378             if ro.suffixes.complete() {
1379                 return if ro.nfa.is_anchored_end {
1380                     Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1381                 } else {
1382                     // This case shouldn't happen. When the regex isn't
1383                     // anchored, then complete prefixes should imply complete
1384                     // suffixes.
1385                     Some(MatchType::Literal(MatchLiteralType::Unanchored))
1386                 };
1387             }
1388             None
1389         }
1390 
1391         imp(self)
1392     }
1393 
1394     /// If a DFA scan can be used, then choose the appropriate DFA strategy.
choose_dfa_match_type(&self) -> Option<MatchType>1395     fn choose_dfa_match_type(&self) -> Option<MatchType> {
1396         #[cfg(not(feature = "perf-dfa"))]
1397         fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1398             None
1399         }
1400 
1401         #[cfg(feature = "perf-dfa")]
1402         fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1403             if !dfa::can_exec(&ro.dfa) {
1404                 return None;
1405             }
1406             // Regex sets require a slightly specialized path.
1407             if ro.res.len() >= 2 {
1408                 return Some(MatchType::DfaMany);
1409             }
1410             // If the regex is anchored at the end but not the start, then
1411             // just match in reverse from the end of the haystack.
1412             if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1413                 return Some(MatchType::DfaAnchoredReverse);
1414             }
1415             #[cfg(feature = "perf-literal")]
1416             {
1417                 // If there's a longish suffix literal, then it might be faster
1418                 // to look for that first.
1419                 if ro.should_suffix_scan() {
1420                     return Some(MatchType::DfaSuffix);
1421                 }
1422             }
1423             // Fall back to your garden variety forward searching lazy DFA.
1424             Some(MatchType::Dfa)
1425         }
1426 
1427         imp(self)
1428     }
1429 
1430     /// Returns true if the program is amenable to suffix scanning.
1431     ///
1432     /// When this is true, as a heuristic, we assume it is OK to quickly scan
1433     /// for suffix literals and then do a *reverse* DFA match from any matches
1434     /// produced by the literal scan. (And then followed by a forward DFA
1435     /// search, since the previously found suffix literal maybe not actually be
1436     /// the end of a match.)
1437     ///
1438     /// This is a bit of a specialized optimization, but can result in pretty
1439     /// big performance wins if 1) there are no prefix literals and 2) the
1440     /// suffix literals are pretty rare in the text. (1) is obviously easy to
1441     /// account for but (2) is harder. As a proxy, we assume that longer
1442     /// strings are generally rarer, so we only enable this optimization when
1443     /// we have a meaty suffix.
1444     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
should_suffix_scan(&self) -> bool1445     fn should_suffix_scan(&self) -> bool {
1446         if self.suffixes.is_empty() {
1447             return false;
1448         }
1449         let lcs_len = self.suffixes.lcs().char_len();
1450         lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1451     }
1452 
new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>>1453     fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
1454         let ro = ro.clone();
1455         Box::new(Pool::new(Box::new(move || {
1456             AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
1457         })))
1458     }
1459 }
1460 
1461 #[derive(Clone, Copy, Debug)]
1462 enum MatchType {
1463     /// A single or multiple literal search. This is only used when the regex
1464     /// can be decomposed into a literal search.
1465     #[cfg(feature = "perf-literal")]
1466     Literal(MatchLiteralType),
1467     /// A normal DFA search.
1468     #[cfg(feature = "perf-dfa")]
1469     Dfa,
1470     /// A reverse DFA search starting from the end of a haystack.
1471     #[cfg(feature = "perf-dfa")]
1472     DfaAnchoredReverse,
1473     /// A reverse DFA search with suffix literal scanning.
1474     #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1475     DfaSuffix,
1476     /// Use the DFA on two or more regular expressions.
1477     #[cfg(feature = "perf-dfa")]
1478     DfaMany,
1479     /// An NFA variant.
1480     Nfa(MatchNfaType),
1481     /// No match is ever possible, so don't ever try to search.
1482     Nothing,
1483 }
1484 
1485 #[derive(Clone, Copy, Debug)]
1486 #[cfg(feature = "perf-literal")]
1487 enum MatchLiteralType {
1488     /// Match literals anywhere in text.
1489     Unanchored,
1490     /// Match literals only at the start of text.
1491     AnchoredStart,
1492     /// Match literals only at the end of text.
1493     AnchoredEnd,
1494     /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1495     /// ExecReadOnly.
1496     AhoCorasick,
1497 }
1498 
1499 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1500 enum MatchNfaType {
1501     /// Choose between Backtrack and PikeVM.
1502     Auto,
1503     /// NFA bounded backtracking.
1504     ///
1505     /// (This is only set by tests, since it never makes sense to always want
1506     /// backtracking.)
1507     Backtrack,
1508     /// The Pike VM.
1509     ///
1510     /// (This is only set by tests, since it never makes sense to always want
1511     /// the Pike VM.)
1512     PikeVM,
1513 }
1514 
1515 /// `ProgramCache` maintains reusable allocations for each matching engine
1516 /// available to a particular program.
1517 ///
1518 /// We declare this as unwind safe since it's a cache that's only used for
1519 /// performance purposes. If a panic occurs, it is (or should be) always safe
1520 /// to continue using the same regex object.
1521 pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
1522 
1523 #[derive(Debug)]
1524 pub struct ProgramCacheInner {
1525     pub pikevm: pikevm::Cache,
1526     pub backtrack: backtrack::Cache,
1527     #[cfg(feature = "perf-dfa")]
1528     pub dfa: dfa::Cache,
1529     #[cfg(feature = "perf-dfa")]
1530     pub dfa_reverse: dfa::Cache,
1531 }
1532 
1533 impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1534     fn new(ro: &ExecReadOnly) -> Self {
1535         ProgramCacheInner {
1536             pikevm: pikevm::Cache::new(&ro.nfa),
1537             backtrack: backtrack::Cache::new(&ro.nfa),
1538             #[cfg(feature = "perf-dfa")]
1539             dfa: dfa::Cache::new(&ro.dfa),
1540             #[cfg(feature = "perf-dfa")]
1541             dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1542         }
1543     }
1544 }
1545 
1546 /// Alternation literals checks if the given HIR is a simple alternation of
1547 /// literals, and if so, returns them. Otherwise, this returns None.
1548 #[cfg(feature = "perf-literal")]
alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>>1549 fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1550     use regex_syntax::hir::{HirKind, Literal};
1551 
1552     // This is pretty hacky, but basically, if `is_alternation_literal` is
1553     // true, then we can make several assumptions about the structure of our
1554     // HIR. This is what justifies the `unreachable!` statements below.
1555     //
1556     // This code should be refactored once we overhaul this crate's
1557     // optimization pipeline, because this is a terribly inflexible way to go
1558     // about things.
1559 
1560     if !expr.is_alternation_literal() {
1561         return None;
1562     }
1563     let alts = match *expr.kind() {
1564         HirKind::Alternation(ref alts) => alts,
1565         _ => return None, // one literal isn't worth it
1566     };
1567 
1568     let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
1569         Literal::Unicode(c) => {
1570             let mut buf = [0; 4];
1571             dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1572         }
1573         Literal::Byte(b) => {
1574             dst.push(b);
1575         }
1576     };
1577 
1578     let mut lits = vec![];
1579     for alt in alts {
1580         let mut lit = vec![];
1581         match *alt.kind() {
1582             HirKind::Literal(ref x) => extendlit(x, &mut lit),
1583             HirKind::Concat(ref exprs) => {
1584                 for e in exprs {
1585                     match *e.kind() {
1586                         HirKind::Literal(ref x) => extendlit(x, &mut lit),
1587                         _ => unreachable!("expected literal, got {:?}", e),
1588                     }
1589                 }
1590             }
1591             _ => unreachable!("expected literal or concat, got {:?}", alt),
1592         }
1593         lits.push(lit);
1594     }
1595     Some(lits)
1596 }
1597 
1598 #[cfg(test)]
1599 mod test {
1600     #[test]
uppercut_s_backtracking_bytes_default_bytes_mismatch()1601     fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1602         use crate::internal::ExecBuilder;
1603 
1604         let backtrack_bytes_re = ExecBuilder::new("^S")
1605             .bounded_backtracking()
1606             .only_utf8(false)
1607             .build()
1608             .map(|exec| exec.into_byte_regex())
1609             .map_err(|err| format!("{}", err))
1610             .unwrap();
1611 
1612         let default_bytes_re = ExecBuilder::new("^S")
1613             .only_utf8(false)
1614             .build()
1615             .map(|exec| exec.into_byte_regex())
1616             .map_err(|err| format!("{}", err))
1617             .unwrap();
1618 
1619         let input = vec![83, 83];
1620 
1621         let s1 = backtrack_bytes_re.split(&input);
1622         let s2 = default_bytes_re.split(&input);
1623         for (chunk1, chunk2) in s1.zip(s2) {
1624             assert_eq!(chunk1, chunk2);
1625         }
1626     }
1627 
1628     #[test]
unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1629     fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1630         use crate::internal::ExecBuilder;
1631 
1632         let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1633             .bounded_backtracking()
1634             .bytes(true)
1635             .build()
1636             .map(|exec| exec.into_regex())
1637             .map_err(|err| format!("{}", err))
1638             .unwrap();
1639 
1640         let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1641             .bytes(true)
1642             .build()
1643             .map(|exec| exec.into_regex())
1644             .map_err(|err| format!("{}", err))
1645             .unwrap();
1646 
1647         let input = "**";
1648 
1649         let s1 = backtrack_bytes_re.split(input);
1650         let s2 = default_bytes_re.split(input);
1651         for (chunk1, chunk2) in s1.zip(s2) {
1652             assert_eq!(chunk1, chunk2);
1653         }
1654     }
1655 }
1656