1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use std::cell::RefCell;
12 use std::collections::HashMap;
13 use std::sync::Arc;
14 
15 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
16 use thread_local::CachedThreadLocal;
17 use syntax::ParserBuilder;
18 use syntax::hir::Hir;
19 use syntax::hir::literal::Literals;
20 
21 use backtrack;
22 use compile::Compiler;
23 use dfa;
24 use error::Error;
25 use input::{ByteInput, CharInput};
26 use literal::LiteralSearcher;
27 use pikevm;
28 use prog::Program;
29 use re_builder::RegexOptions;
30 use re_bytes;
31 use re_set;
32 use re_trait::{RegularExpression, Slot, Locations};
33 use re_unicode;
34 use utf8::next_utf8;
35 
36 /// `Exec` manages the execution of a regular expression.
37 ///
38 /// In particular, this manages the various compiled forms of a single regular
39 /// expression and the choice of which matching engine to use to execute a
40 /// regular expression.
41 pub struct Exec {
42     /// All read only state.
43     ro: Arc<ExecReadOnly>,
44     /// Caches for the various matching engines.
45     cache: CachedThreadLocal<ProgramCache>,
46 }
47 
48 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
49 /// means it is no longer Sync, but we can now avoid the overhead of
50 /// synchronization to fetch the cache.
51 #[derive(Debug)]
52 pub struct ExecNoSync<'c> {
53     /// All read only state.
54     ro: &'c Arc<ExecReadOnly>,
55     /// Caches for the various matching engines.
56     cache: &'c ProgramCache,
57 }
58 
59 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
60 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
61 
62 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
63 /// state is determined at compile time and never changes during search.
64 #[derive(Debug)]
65 struct ExecReadOnly {
66     /// The original regular expressions given by the caller to compile.
67     res: Vec<String>,
68     /// A compiled program that is used in the NFA simulation and backtracking.
69     /// It can be byte-based or Unicode codepoint based.
70     ///
71     /// N.B. It is not possibly to make this byte-based from the public API.
72     /// It is only used for testing byte based programs in the NFA simulations.
73     nfa: Program,
74     /// A compiled byte based program for DFA execution. This is only used
75     /// if a DFA can be executed. (Currently, only word boundary assertions are
76     /// not supported.) Note that this program contains an embedded `.*?`
77     /// preceding the first capture group, unless the regex is anchored at the
78     /// beginning.
79     dfa: Program,
80     /// The same as above, except the program is reversed (and there is no
81     /// preceding `.*?`). This is used by the DFA to find the starting location
82     /// of matches.
83     dfa_reverse: Program,
84     /// A set of suffix literals extracted from the regex.
85     ///
86     /// Prefix literals are stored on the `Program`, since they are used inside
87     /// the matching engines.
88     suffixes: LiteralSearcher,
89     /// An Aho-Corasick automaton with leftmost-first match semantics.
90     ///
91     /// This is only set when the entire regex is a simple unanchored
92     /// alternation of literals. We could probably use it more circumstances,
93     /// but this is already hacky enough in this architecture.
94     ///
95     /// N.B. We use u32 as a state ID representation under the assumption that
96     /// if we were to exhaust the ID space, we probably would have long
97     /// surpassed the compilation size limit.
98     ac: Option<AhoCorasick<u32>>,
99     /// match_type encodes as much upfront knowledge about how we're going to
100     /// execute a search as possible.
101     match_type: MatchType,
102 }
103 
104 /// Facilitates the construction of an executor by exposing various knobs
105 /// to control how a regex is executed and what kinds of resources it's
106 /// permitted to use.
107 pub struct ExecBuilder {
108     options: RegexOptions,
109     match_type: Option<MatchType>,
110     bytes: bool,
111     only_utf8: bool,
112 }
113 
114 /// Parsed represents a set of parsed regular expressions and their detected
115 /// literals.
116 struct Parsed {
117     exprs: Vec<Hir>,
118     prefixes: Literals,
119     suffixes: Literals,
120     bytes: bool,
121 }
122 
123 impl ExecBuilder {
124     /// Create a regex execution builder.
125     ///
126     /// This uses default settings for everything except the regex itself,
127     /// which must be provided. Further knobs can be set by calling methods,
128     /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self129     pub fn new(re: &str) -> Self {
130         Self::new_many(&[re])
131     }
132 
133     /// Like new, but compiles the union of the given regular expressions.
134     ///
135     /// Note that when compiling 2 or more regular expressions, capture groups
136     /// are completely unsupported. (This means both `find` and `captures`
137     /// wont work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item=S>138     pub fn new_many<I, S>(res: I) -> Self
139             where S: AsRef<str>, I: IntoIterator<Item=S> {
140         let mut opts = RegexOptions::default();
141         opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
142         Self::new_options(opts)
143     }
144 
145     /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self146     pub fn new_options(opts: RegexOptions) -> Self {
147         ExecBuilder {
148             options: opts,
149             match_type: None,
150             bytes: false,
151             only_utf8: true,
152         }
153     }
154 
155     /// Set the matching engine to be automatically determined.
156     ///
157     /// This is the default state and will apply whatever optimizations are
158     /// possible, such as running a DFA.
159     ///
160     /// This overrides whatever was previously set via the `nfa` or
161     /// `bounded_backtracking` methods.
automatic(mut self) -> Self162     pub fn automatic(mut self) -> Self {
163         self.match_type = None;
164         self
165     }
166 
167     /// Sets the matching engine to use the NFA algorithm no matter what
168     /// optimizations are possible.
169     ///
170     /// This overrides whatever was previously set via the `automatic` or
171     /// `bounded_backtracking` methods.
nfa(mut self) -> Self172     pub fn nfa(mut self) -> Self {
173         self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
174         self
175     }
176 
177     /// Sets the matching engine to use a bounded backtracking engine no
178     /// matter what optimizations are possible.
179     ///
180     /// One must use this with care, since the bounded backtracking engine
181     /// uses memory proportion to `len(regex) * len(text)`.
182     ///
183     /// This overrides whatever was previously set via the `automatic` or
184     /// `nfa` methods.
bounded_backtracking(mut self) -> Self185     pub fn bounded_backtracking(mut self) -> Self {
186         self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
187         self
188     }
189 
190     /// Compiles byte based programs for use with the NFA matching engines.
191     ///
192     /// By default, the NFA engines match on Unicode scalar values. They can
193     /// be made to use byte based programs instead. In general, the byte based
194     /// programs are slower because of a less efficient encoding of character
195     /// classes.
196     ///
197     /// Note that this does not impact DFA matching engines, which always
198     /// execute on bytes.
bytes(mut self, yes: bool) -> Self199     pub fn bytes(mut self, yes: bool) -> Self {
200         self.bytes = yes;
201         self
202     }
203 
204     /// When disabled, the program compiled may match arbitrary bytes.
205     ///
206     /// When enabled (the default), all compiled programs exclusively match
207     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self208     pub fn only_utf8(mut self, yes: bool) -> Self {
209         self.only_utf8 = yes;
210         self
211     }
212 
213     /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self214     pub fn unicode(mut self, yes: bool) -> Self {
215         self.options.unicode = yes;
216         self
217     }
218 
219     /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>220     fn parse(&self) -> Result<Parsed, Error> {
221         let mut exprs = Vec::with_capacity(self.options.pats.len());
222         let mut prefixes = Some(Literals::empty());
223         let mut suffixes = Some(Literals::empty());
224         let mut bytes = false;
225         let is_set = self.options.pats.len() > 1;
226         // If we're compiling a regex set and that set has any anchored
227         // expressions, then disable all literal optimizations.
228         for pat in &self.options.pats {
229             let mut parser =
230                 ParserBuilder::new()
231                     .octal(self.options.octal)
232                     .case_insensitive(self.options.case_insensitive)
233                     .multi_line(self.options.multi_line)
234                     .dot_matches_new_line(self.options.dot_matches_new_line)
235                     .swap_greed(self.options.swap_greed)
236                     .ignore_whitespace(self.options.ignore_whitespace)
237                     .unicode(self.options.unicode)
238                     .allow_invalid_utf8(!self.only_utf8)
239                     .nest_limit(self.options.nest_limit)
240                     .build();
241             let expr = parser
242                 .parse(pat)
243                 .map_err(|e| Error::Syntax(e.to_string()))?;
244             bytes = bytes || !expr.is_always_utf8();
245 
246             if !expr.is_anchored_start() && expr.is_any_anchored_start() {
247                 // Partial anchors unfortunately make it hard to use prefixes,
248                 // so disable them.
249                 prefixes = None;
250             } else if is_set && expr.is_anchored_start() {
251                 // Regex sets with anchors do not go well with literal
252                 // optimizations.
253                 prefixes = None;
254             }
255             prefixes = prefixes.and_then(|mut prefixes| {
256                 if !prefixes.union_prefixes(&expr) {
257                     None
258                 } else {
259                     Some(prefixes)
260                 }
261             });
262 
263             if !expr.is_anchored_end() && expr.is_any_anchored_end() {
264                 // Partial anchors unfortunately make it hard to use suffixes,
265                 // so disable them.
266                 suffixes = None;
267             } else if is_set && expr.is_anchored_end() {
268                 // Regex sets with anchors do not go well with literal
269                 // optimizations.
270                 suffixes = None;
271             }
272             suffixes = suffixes.and_then(|mut suffixes| {
273                 if !suffixes.union_suffixes(&expr) {
274                     None
275                 } else {
276                     Some(suffixes)
277                 }
278             });
279             exprs.push(expr);
280         }
281         Ok(Parsed {
282             exprs: exprs,
283             prefixes: prefixes.unwrap_or_else(Literals::empty),
284             suffixes: suffixes.unwrap_or_else(Literals::empty),
285             bytes: bytes,
286         })
287     }
288 
289     /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>290     pub fn build(self) -> Result<Exec, Error> {
291         // Special case when we have no patterns to compile.
292         // This can happen when compiling a regex set.
293         if self.options.pats.is_empty() {
294             let ro = Arc::new(ExecReadOnly {
295                 res: vec![],
296                 nfa: Program::new(),
297                 dfa: Program::new(),
298                 dfa_reverse: Program::new(),
299                 suffixes: LiteralSearcher::empty(),
300                 ac: None,
301                 match_type: MatchType::Nothing,
302             });
303             return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() });
304         }
305         let parsed = self.parse()?;
306         let mut nfa =
307             Compiler::new()
308                      .size_limit(self.options.size_limit)
309                      .bytes(self.bytes || parsed.bytes)
310                      .only_utf8(self.only_utf8)
311                      .compile(&parsed.exprs)?;
312         let mut dfa =
313             Compiler::new()
314                      .size_limit(self.options.size_limit)
315                      .dfa(true)
316                      .only_utf8(self.only_utf8)
317                      .compile(&parsed.exprs)?;
318         let mut dfa_reverse =
319             Compiler::new()
320                      .size_limit(self.options.size_limit)
321                      .dfa(true)
322                      .only_utf8(self.only_utf8)
323                      .reverse(true)
324                      .compile(&parsed.exprs)?;
325 
326         nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
327         dfa.prefixes = nfa.prefixes.clone();
328         dfa.dfa_size_limit = self.options.dfa_size_limit;
329         dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
330 
331         let mut ac = None;
332         if parsed.exprs.len() == 1 {
333             if let Some(lits) = alternation_literals(&parsed.exprs[0]) {
334                 // If we have a small number of literals, then let Teddy
335                 // handle things (see literal/mod.rs).
336                 if lits.len() > 32 {
337                     let fsm = AhoCorasickBuilder::new()
338                         .match_kind(MatchKind::LeftmostFirst)
339                         .auto_configure(&lits)
340                         // We always want this to reduce size, regardless of
341                         // what auto-configure does.
342                         .byte_classes(true)
343                         .build_with_size::<u32, _, _>(&lits)
344                         .expect("AC automaton too big");
345                     ac = Some(fsm);
346                 }
347             }
348         }
349 
350         let mut ro = ExecReadOnly {
351             res: self.options.pats,
352             nfa: nfa,
353             dfa: dfa,
354             dfa_reverse: dfa_reverse,
355             suffixes: LiteralSearcher::suffixes(parsed.suffixes),
356             ac: ac,
357             match_type: MatchType::Nothing,
358         };
359         ro.match_type = ro.choose_match_type(self.match_type);
360 
361         let ro = Arc::new(ro);
362         Ok(Exec { ro: ro, cache: CachedThreadLocal::new() })
363     }
364 }
365 
366 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
367     type Text = str;
368 
slots_len(&self) -> usize369     fn slots_len(&self) -> usize { self.0.slots_len() }
370 
next_after_empty(&self, text: &str, i: usize) -> usize371     fn next_after_empty(&self, text: &str, i: usize) -> usize {
372         next_utf8(text.as_bytes(), i)
373     }
374 
375     #[inline(always)] // reduces constant overhead
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>376     fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
377         self.0.shortest_match_at(text.as_bytes(), start)
378     }
379 
380     #[inline(always)] // reduces constant overhead
is_match_at(&self, text: &str, start: usize) -> bool381     fn is_match_at(&self, text: &str, start: usize) -> bool {
382         self.0.is_match_at(text.as_bytes(), start)
383     }
384 
385     #[inline(always)] // reduces constant overhead
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>386     fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
387         self.0.find_at(text.as_bytes(), start)
388     }
389 
390     #[inline(always)] // reduces constant overhead
captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>391     fn captures_read_at(
392         &self,
393         locs: &mut Locations,
394         text: &str,
395         start: usize,
396     ) -> Option<(usize, usize)> {
397         self.0.captures_read_at(locs, text.as_bytes(), start)
398     }
399 }
400 
401 impl<'c> RegularExpression for ExecNoSync<'c> {
402     type Text = [u8];
403 
404     /// Returns the number of capture slots in the regular expression. (There
405     /// are two slots for every capture group, corresponding to possibly empty
406     /// start and end locations of the capture.)
slots_len(&self) -> usize407     fn slots_len(&self) -> usize {
408         self.ro.nfa.captures.len() * 2
409     }
410 
next_after_empty(&self, _text: &[u8], i: usize) -> usize411     fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
412         i + 1
413     }
414 
415     /// Returns the end of a match location, possibly occurring before the
416     /// end location of the correct leftmost-first match.
417     #[inline(always)] // reduces constant overhead
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>418     fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
419         if !self.is_anchor_end_match(text) {
420             return None;
421         }
422         match self.ro.match_type {
423             MatchType::Literal(ty) => {
424                 self.find_literals(ty, text, start).map(|(_, e)| e)
425             }
426             MatchType::Dfa | MatchType::DfaMany => {
427                 match self.shortest_dfa(text, start) {
428                     dfa::Result::Match(end) => Some(end),
429                     dfa::Result::NoMatch(_) => None,
430                     dfa::Result::Quit => self.shortest_nfa(text, start),
431                 }
432             }
433             MatchType::DfaAnchoredReverse => {
434                 match dfa::Fsm::reverse(
435                     &self.ro.dfa_reverse,
436                     self.cache,
437                     true,
438                     &text[start..],
439                     text.len(),
440                 ) {
441                     dfa::Result::Match(_) => Some(text.len()),
442                     dfa::Result::NoMatch(_) => None,
443                     dfa::Result::Quit => self.shortest_nfa(text, start),
444                 }
445             }
446             MatchType::DfaSuffix => {
447                 match self.shortest_dfa_reverse_suffix(text, start) {
448                     dfa::Result::Match(e) => Some(e),
449                     dfa::Result::NoMatch(_) => None,
450                     dfa::Result::Quit => self.shortest_nfa(text, start),
451                 }
452             }
453             MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
454             MatchType::Nothing => None,
455         }
456     }
457 
458     /// Returns true if and only if the regex matches text.
459     ///
460     /// For single regular expressions, this is equivalent to calling
461     /// shortest_match(...).is_some().
462     #[inline(always)] // reduces constant overhead
is_match_at(&self, text: &[u8], start: usize) -> bool463     fn is_match_at(&self, text: &[u8], start: usize) -> bool {
464         if !self.is_anchor_end_match(text) {
465             return false;
466         }
467         // We need to do this dance because shortest_match relies on the NFA
468         // filling in captures[1], but a RegexSet has no captures. In other
469         // words, a RegexSet can't (currently) use shortest_match. ---AG
470         match self.ro.match_type {
471             MatchType::Literal(ty) => {
472                 self.find_literals(ty, text, start).is_some()
473             }
474             MatchType::Dfa | MatchType::DfaMany => {
475                 match self.shortest_dfa(text, start) {
476                     dfa::Result::Match(_) => true,
477                     dfa::Result::NoMatch(_) => false,
478                     dfa::Result::Quit => self.match_nfa(text, start),
479                 }
480             }
481             MatchType::DfaAnchoredReverse => {
482                 match dfa::Fsm::reverse(
483                     &self.ro.dfa_reverse,
484                     self.cache,
485                     true,
486                     &text[start..],
487                     text.len(),
488                 ) {
489                     dfa::Result::Match(_) => true,
490                     dfa::Result::NoMatch(_) => false,
491                     dfa::Result::Quit => self.match_nfa(text, start),
492                 }
493             }
494             MatchType::DfaSuffix => {
495                 match self.shortest_dfa_reverse_suffix(text, start) {
496                     dfa::Result::Match(_) => true,
497                     dfa::Result::NoMatch(_) => false,
498                     dfa::Result::Quit => self.match_nfa(text, start),
499                 }
500             }
501             MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
502             MatchType::Nothing => false,
503         }
504     }
505 
506     /// Finds the start and end location of the leftmost-first match, starting
507     /// at the given location.
508     #[inline(always)] // reduces constant overhead
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>509     fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
510         if !self.is_anchor_end_match(text) {
511             return None;
512         }
513         match self.ro.match_type {
514             MatchType::Literal(ty) => {
515                 self.find_literals(ty, text, start)
516             }
517             MatchType::Dfa => {
518                 match self.find_dfa_forward(text, start) {
519                     dfa::Result::Match((s, e)) => Some((s, e)),
520                     dfa::Result::NoMatch(_) => None,
521                     dfa::Result::Quit => {
522                         self.find_nfa(MatchNfaType::Auto, text, start)
523                     }
524                 }
525             }
526             MatchType::DfaAnchoredReverse => {
527                 match self.find_dfa_anchored_reverse(text, start) {
528                     dfa::Result::Match((s, e)) => Some((s, e)),
529                     dfa::Result::NoMatch(_) => None,
530                     dfa::Result::Quit => {
531                         self.find_nfa(MatchNfaType::Auto, text, start)
532                     }
533                 }
534             }
535             MatchType::DfaSuffix => {
536                 match self.find_dfa_reverse_suffix(text, start) {
537                     dfa::Result::Match((s, e)) => Some((s, e)),
538                     dfa::Result::NoMatch(_) => None,
539                     dfa::Result::Quit => {
540                         self.find_nfa(MatchNfaType::Auto, text, start)
541                     }
542                 }
543             }
544             MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
545             MatchType::Nothing => None,
546             MatchType::DfaMany => {
547                 unreachable!("BUG: RegexSet cannot be used with find")
548             }
549         }
550     }
551 
552     /// Finds the start and end location of the leftmost-first match and also
553     /// fills in all matching capture groups.
554     ///
555     /// The number of capture slots given should be equal to the total number
556     /// of capture slots in the compiled program.
557     ///
558     /// Note that the first two slots always correspond to the start and end
559     /// locations of the overall match.
captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>560     fn captures_read_at(
561         &self,
562         locs: &mut Locations,
563         text: &[u8],
564         start: usize,
565     ) -> Option<(usize, usize)> {
566         let slots = locs.as_slots();
567         for slot in slots.iter_mut() {
568             *slot = None;
569         }
570         // If the caller unnecessarily uses this, then we try to save them
571         // from themselves.
572         match slots.len() {
573             0 => return self.find_at(text, start),
574             2 => {
575                 return self.find_at(text, start).map(|(s, e)| {
576                     slots[0] = Some(s);
577                     slots[1] = Some(e);
578                     (s, e)
579                 });
580             }
581             _ => {} // fallthrough
582         }
583         if !self.is_anchor_end_match(text) {
584             return None;
585         }
586         match self.ro.match_type {
587             MatchType::Literal(ty) => {
588                 self.find_literals(ty, text, start).and_then(|(s, e)| {
589                     self.captures_nfa_type(
590                         MatchNfaType::Auto, slots, text, s, e)
591                 })
592             }
593             MatchType::Dfa => {
594                 if self.ro.nfa.is_anchored_start {
595                     self.captures_nfa(slots, text, start)
596                 } else {
597                     match self.find_dfa_forward(text, start) {
598                         dfa::Result::Match((s, e)) => {
599                             self.captures_nfa_type(
600                                 MatchNfaType::Auto, slots, text, s, e)
601                         }
602                         dfa::Result::NoMatch(_) => None,
603                         dfa::Result::Quit => {
604                             self.captures_nfa(slots, text, start)
605                         }
606                     }
607                 }
608             }
609             MatchType::DfaAnchoredReverse => {
610                 match self.find_dfa_anchored_reverse(text, start) {
611                     dfa::Result::Match((s, e)) => {
612                         self.captures_nfa_type(
613                             MatchNfaType::Auto, slots, text, s, e)
614                     }
615                     dfa::Result::NoMatch(_) => None,
616                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
617                 }
618             }
619             MatchType::DfaSuffix => {
620                 match self.find_dfa_reverse_suffix(text, start) {
621                     dfa::Result::Match((s, e)) => {
622                         self.captures_nfa_type(
623                             MatchNfaType::Auto, slots, text, s, e)
624                     }
625                     dfa::Result::NoMatch(_) => None,
626                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
627                 }
628             }
629             MatchType::Nfa(ty) => {
630                 self.captures_nfa_type(ty, slots, text, start, text.len())
631             }
632             MatchType::Nothing => None,
633             MatchType::DfaMany => {
634                 unreachable!("BUG: RegexSet cannot be used with captures")
635             }
636         }
637     }
638 }
639 
640 impl<'c> ExecNoSync<'c> {
641     /// Finds the leftmost-first match using only literal search.
642     #[inline(always)] // reduces constant overhead
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>643     fn find_literals(
644         &self,
645         ty: MatchLiteralType,
646         text: &[u8],
647         start: usize,
648     ) -> Option<(usize, usize)> {
649         use self::MatchLiteralType::*;
650         match ty {
651             Unanchored => {
652                 let lits = &self.ro.nfa.prefixes;
653                 lits.find(&text[start..])
654                     .map(|(s, e)| (start + s, start + e))
655             }
656             AnchoredStart => {
657                 let lits = &self.ro.nfa.prefixes;
658                 if !self.ro.nfa.is_anchored_start
659                     || (self.ro.nfa.is_anchored_start && start == 0) {
660                     lits.find_start(&text[start..])
661                         .map(|(s, e)| (start + s, start + e))
662                 } else {
663                     None
664                 }
665             }
666             AnchoredEnd => {
667                 let lits = &self.ro.suffixes;
668                 lits.find_end(&text[start..])
669                     .map(|(s, e)| (start + s, start + e))
670             }
671             AhoCorasick => {
672                 self.ro.ac.as_ref().unwrap()
673                     .find(&text[start..])
674                     .map(|m| (start + m.start(), start + m.end()))
675             }
676         }
677     }
678 
679     /// Finds the leftmost-first match (start and end) using only the DFA.
680     ///
681     /// If the result returned indicates that the DFA quit, then another
682     /// matching engine should be used.
683     #[inline(always)] // reduces constant overhead
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>684     fn find_dfa_forward(
685         &self,
686         text: &[u8],
687         start: usize,
688     ) -> dfa::Result<(usize, usize)> {
689         use dfa::Result::*;
690         let end = match dfa::Fsm::forward(
691             &self.ro.dfa,
692             self.cache,
693             false,
694             text,
695             start,
696         ) {
697             NoMatch(i) => return NoMatch(i),
698             Quit => return Quit,
699             Match(end) if start == end => return Match((start, start)),
700             Match(end) => end,
701         };
702         // Now run the DFA in reverse to find the start of the match.
703         match dfa::Fsm::reverse(
704             &self.ro.dfa_reverse,
705             self.cache,
706             false,
707             &text[start..],
708             end - start,
709         ) {
710             Match(s) => Match((start + s, end)),
711             NoMatch(i) => NoMatch(i),
712             Quit => Quit,
713         }
714     }
715 
716     /// Finds the leftmost-first match (start and end) using only the DFA,
717     /// but assumes the regex is anchored at the end and therefore starts at
718     /// the end of the regex and matches in reverse.
719     ///
720     /// If the result returned indicates that the DFA quit, then another
721     /// matching engine should be used.
722     #[inline(always)] // reduces constant overhead
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>723     fn find_dfa_anchored_reverse(
724         &self,
725         text: &[u8],
726         start: usize,
727     ) -> dfa::Result<(usize, usize)> {
728         use dfa::Result::*;
729         match dfa::Fsm::reverse(
730             &self.ro.dfa_reverse,
731             self.cache,
732             false,
733             &text[start..],
734             text.len() - start,
735         ) {
736             Match(s) => Match((start + s, text.len())),
737             NoMatch(i) => NoMatch(i),
738             Quit => Quit,
739         }
740     }
741 
742     /// Finds the end of the shortest match using only the DFA.
743     #[inline(always)] // reduces constant overhead
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>744     fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
745         dfa::Fsm::forward(&self.ro.dfa, self.cache, true, text, start)
746     }
747 
748     /// Finds the end of the shortest match using only the DFA by scanning for
749     /// suffix literals.
750     ///
751     #[inline(always)] // reduces constant overhead
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>752     fn shortest_dfa_reverse_suffix(
753         &self,
754         text: &[u8],
755         start: usize,
756     ) -> dfa::Result<usize> {
757         match self.exec_dfa_reverse_suffix(text, start) {
758             None => self.shortest_dfa(text, start),
759             Some(r) => r.map(|(_, end)| end),
760         }
761     }
762 
763     /// Finds the end of the shortest match using only the DFA by scanning for
764     /// suffix literals. It also reports the start of the match.
765     ///
766     /// Note that if None is returned, then the optimization gave up to avoid
767     /// worst case quadratic behavior. A forward scanning DFA should be tried
768     /// next.
769     ///
770     /// If a match is returned and the full leftmost-first match is desired,
771     /// then a forward scan starting from the beginning of the match must be
772     /// done.
773     ///
774     /// If the result returned indicates that the DFA quit, then another
775     /// matching engine should be used.
776     #[inline(always)] // reduces constant overhead
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>777     fn exec_dfa_reverse_suffix(
778         &self,
779         text: &[u8],
780         original_start: usize,
781     ) -> Option<dfa::Result<(usize, usize)>> {
782         use dfa::Result::*;
783 
784         let lcs = self.ro.suffixes.lcs();
785         debug_assert!(lcs.len() >= 1);
786         let mut start = original_start;
787         let mut end = start;
788         let mut last_literal = start;
789         while end <= text.len() {
790             last_literal += match lcs.find(&text[last_literal..]) {
791                 None => return Some(NoMatch(text.len())),
792                 Some(i) => i,
793             };
794             end = last_literal + lcs.len();
795             match dfa::Fsm::reverse(
796                 &self.ro.dfa_reverse,
797                 self.cache,
798                 false,
799                 &text[start..end],
800                 end - start,
801             ) {
802                 Match(0) | NoMatch(0) => return None,
803                 Match(i) => return Some(Match((start + i, end))),
804                 NoMatch(i) => {
805                     start += i;
806                     last_literal += 1;
807                     continue;
808                 }
809                 Quit => return Some(Quit),
810             };
811         }
812         Some(NoMatch(text.len()))
813     }
814 
815     /// Finds the leftmost-first match (start and end) using only the DFA
816     /// by scanning for suffix literals.
817     ///
818     /// If the result returned indicates that the DFA quit, then another
819     /// matching engine should be used.
820     #[inline(always)] // reduces constant overhead
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>821     fn find_dfa_reverse_suffix(
822         &self,
823         text: &[u8],
824         start: usize,
825     ) -> dfa::Result<(usize, usize)> {
826         use dfa::Result::*;
827 
828         let match_start = match self.exec_dfa_reverse_suffix(text, start) {
829             None => return self.find_dfa_forward(text, start),
830             Some(Match((start, _))) => start,
831             Some(r) => return r,
832         };
833         // At this point, we've found a match. The only way to quit now
834         // without a match is if the DFA gives up (seems unlikely).
835         //
836         // Now run the DFA forwards to find the proper end of the match.
837         // (The suffix literal match can only indicate the earliest
838         // possible end location, which may appear before the end of the
839         // leftmost-first match.)
840         match dfa::Fsm::forward(
841             &self.ro.dfa,
842             self.cache,
843             false,
844             text,
845             match_start,
846         ) {
847             NoMatch(_) => panic!("BUG: reverse match implies forward match"),
848             Quit => Quit,
849             Match(e) => Match((match_start, e)),
850         }
851     }
852 
853     /// Executes the NFA engine to return whether there is a match or not.
854     ///
855     /// Ideally, we could use shortest_nfa(...).is_some() and get the same
856     /// performance characteristics, but regex sets don't have captures, which
857     /// shortest_nfa depends on.
match_nfa( &self, text: &[u8], start: usize, ) -> bool858     fn match_nfa(
859         &self,
860         text: &[u8],
861         start: usize,
862     ) -> bool {
863         self.match_nfa_type(MatchNfaType::Auto, text, start)
864     }
865 
866     /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool867     fn match_nfa_type(
868         &self,
869         ty: MatchNfaType,
870         text: &[u8],
871         start: usize,
872     ) -> bool {
873         self.exec_nfa(ty, &mut [false], &mut [], true, text, start, text.len())
874     }
875 
876     /// Finds the shortest match using an NFA.
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>877     fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
878         self.shortest_nfa_type(MatchNfaType::Auto, text, start)
879     }
880 
881     /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>882     fn shortest_nfa_type(
883         &self,
884         ty: MatchNfaType,
885         text: &[u8],
886         start: usize,
887     ) -> Option<usize> {
888         let mut slots = [None, None];
889         if self.exec_nfa(
890             ty,
891             &mut [false],
892             &mut slots,
893             true,
894             text,
895             start,
896             text.len()
897         ) {
898             slots[1]
899         } else {
900             None
901         }
902     }
903 
904     /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>905     fn find_nfa(
906         &self,
907         ty: MatchNfaType,
908         text: &[u8],
909         start: usize,
910     ) -> Option<(usize, usize)> {
911         let mut slots = [None, None];
912         if self.exec_nfa(
913             ty,
914             &mut [false],
915             &mut slots,
916             false,
917             text,
918             start,
919             text.len()
920         ) {
921             match (slots[0], slots[1]) {
922                 (Some(s), Some(e)) => Some((s, e)),
923                 _ => None,
924             }
925         } else {
926             None
927         }
928     }
929 
930     /// Like find_nfa, but fills in captures.
931     ///
932     /// `slots` should have length equal to `2 * nfa.captures.len()`.
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>933     fn captures_nfa(
934         &self,
935         slots: &mut [Slot],
936         text: &[u8],
937         start: usize,
938     ) -> Option<(usize, usize)> {
939         self.captures_nfa_type(
940             MatchNfaType::Auto, slots, text, start, text.len())
941     }
942 
943     /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> Option<(usize, usize)>944     fn captures_nfa_type(
945         &self,
946         ty: MatchNfaType,
947         slots: &mut [Slot],
948         text: &[u8],
949         start: usize,
950         end: usize,
951     ) -> Option<(usize, usize)> {
952         if self.exec_nfa(ty, &mut [false], slots, false, text, start, end) {
953             match (slots[0], slots[1]) {
954                 (Some(s), Some(e)) => Some((s, e)),
955                 _ => None,
956             }
957         } else {
958             None
959         }
960     }
961 
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool962     fn exec_nfa(
963         &self,
964         mut ty: MatchNfaType,
965         matches: &mut [bool],
966         slots: &mut [Slot],
967         quit_after_match: bool,
968         text: &[u8],
969         start: usize,
970         end: usize,
971     ) -> bool {
972         use self::MatchNfaType::*;
973         if let Auto = ty {
974             if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
975                 ty = Backtrack;
976             } else {
977                 ty = PikeVM;
978             }
979         }
980         match ty {
981             Auto => unreachable!(),
982             Backtrack => self.exec_backtrack(matches, slots, text, start, end),
983             PikeVM => {
984                 self.exec_pikevm(
985                     matches, slots, quit_after_match, text, start, end)
986             }
987         }
988     }
989 
990     /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool991     fn exec_pikevm(
992         &self,
993         matches: &mut [bool],
994         slots: &mut [Slot],
995         quit_after_match: bool,
996         text: &[u8],
997         start: usize,
998         end: usize,
999     ) -> bool {
1000         if self.ro.nfa.uses_bytes() {
1001             pikevm::Fsm::exec(
1002                 &self.ro.nfa,
1003                 self.cache,
1004                 matches,
1005                 slots,
1006                 quit_after_match,
1007                 ByteInput::new(text, self.ro.nfa.only_utf8),
1008                 start,
1009                 end)
1010         } else {
1011             pikevm::Fsm::exec(
1012                 &self.ro.nfa,
1013                 self.cache,
1014                 matches,
1015                 slots,
1016                 quit_after_match,
1017                 CharInput::new(text),
1018                 start,
1019                 end)
1020         }
1021     }
1022 
1023     /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> bool1024     fn exec_backtrack(
1025         &self,
1026         matches: &mut [bool],
1027         slots: &mut [Slot],
1028         text: &[u8],
1029         start: usize,
1030         end: usize,
1031     ) -> bool {
1032         if self.ro.nfa.uses_bytes() {
1033             backtrack::Bounded::exec(
1034                 &self.ro.nfa,
1035                 self.cache,
1036                 matches,
1037                 slots,
1038                 ByteInput::new(text, self.ro.nfa.only_utf8),
1039                 start,
1040                 end)
1041         } else {
1042             backtrack::Bounded::exec(
1043                 &self.ro.nfa,
1044                 self.cache,
1045                 matches,
1046                 slots,
1047                 CharInput::new(text),
1048                 start,
1049                 end)
1050         }
1051     }
1052 
1053     /// Finds which regular expressions match the given text.
1054     ///
1055     /// `matches` should have length equal to the number of regexes being
1056     /// searched.
1057     ///
1058     /// This is only useful when one wants to know which regexes in a set
1059     /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1060     pub fn many_matches_at(
1061         &self,
1062         matches: &mut [bool],
1063         text: &[u8],
1064         start: usize,
1065     ) -> bool {
1066         use self::MatchType::*;
1067         if !self.is_anchor_end_match(text) {
1068             return false;
1069         }
1070         match self.ro.match_type {
1071             Literal(ty) => {
1072                 debug_assert_eq!(matches.len(), 1);
1073                 matches[0] = self.find_literals(ty, text, start).is_some();
1074                 matches[0]
1075             }
1076             Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => {
1077                 match dfa::Fsm::forward_many(
1078                     &self.ro.dfa,
1079                     self.cache,
1080                     matches,
1081                     text,
1082                     start,
1083                 ) {
1084                     dfa::Result::Match(_) => true,
1085                     dfa::Result::NoMatch(_) => false,
1086                     dfa::Result::Quit => {
1087                         self.exec_nfa(
1088                             MatchNfaType::Auto,
1089                             matches,
1090                             &mut [],
1091                             false,
1092                             text,
1093                             start,
1094                             text.len())
1095                     }
1096                 }
1097             }
1098             Nfa(ty) => {
1099                 self.exec_nfa(
1100                     ty, matches, &mut [], false, text, start, text.len())
1101             }
1102             Nothing => false,
1103         }
1104     }
1105 
1106     #[inline(always)] // reduces constant overhead
is_anchor_end_match(&self, text: &[u8]) -> bool1107     fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1108         // Only do this check if the haystack is big (>1MB).
1109         if text.len() > (1<<20) && self.ro.nfa.is_anchored_end {
1110             let lcs = self.ro.suffixes.lcs();
1111             if lcs.len() >= 1 && !lcs.is_suffix(text) {
1112                 return false;
1113             }
1114         }
1115         true
1116     }
1117 
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1118     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1119         &self.ro.nfa.capture_name_idx
1120     }
1121 }
1122 
1123 impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1124     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1125         self.0.capture_name_idx()
1126     }
1127 }
1128 
1129 impl Exec {
1130     /// Get a searcher that isn't Sync.
1131     #[inline(always)] // reduces constant overhead
searcher(&self) -> ExecNoSync1132     pub fn searcher(&self) -> ExecNoSync {
1133         let create = || {
1134             Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)))
1135         };
1136         ExecNoSync {
1137             ro: &self.ro, // a clone is too expensive here! (and not needed)
1138             cache: self.cache.get_or(create),
1139         }
1140     }
1141 
1142     /// Get a searcher that isn't Sync and can match on &str.
1143     #[inline(always)] // reduces constant overhead
searcher_str(&self) -> ExecNoSyncStr1144     pub fn searcher_str(&self) -> ExecNoSyncStr {
1145         ExecNoSyncStr(self.searcher())
1146     }
1147 
1148     /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1149     pub fn into_regex(self) -> re_unicode::Regex {
1150         re_unicode::Regex::from(self)
1151     }
1152 
1153     /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1154     pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1155         re_set::unicode::RegexSet::from(self)
1156     }
1157 
1158     /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1159     pub fn into_byte_regex(self) -> re_bytes::Regex {
1160         re_bytes::Regex::from(self)
1161     }
1162 
1163     /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1164     pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1165         re_set::bytes::RegexSet::from(self)
1166     }
1167 
1168     /// The original regular expressions given by the caller that were
1169     /// compiled.
regex_strings(&self) -> &[String]1170     pub fn regex_strings(&self) -> &[String] {
1171         &self.ro.res
1172     }
1173 
1174     /// Return a slice of capture names.
1175     ///
1176     /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1177     pub fn capture_names(&self) -> &[Option<String>] {
1178         &self.ro.nfa.captures
1179     }
1180 
1181     /// Return a reference to named groups mapping (from group name to
1182     /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1183     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1184         &self.ro.nfa.capture_name_idx
1185     }
1186 }
1187 
1188 impl Clone for Exec {
clone(&self) -> Exec1189     fn clone(&self) -> Exec {
1190         Exec {
1191             ro: self.ro.clone(),
1192             cache: CachedThreadLocal::new(),
1193         }
1194     }
1195 }
1196 
1197 impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1198     fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1199         use self::MatchType::*;
1200         if let Some(Nfa(_)) = hint {
1201             return hint.unwrap();
1202         }
1203         // If the NFA is empty, then we'll never match anything.
1204         if self.nfa.insts.is_empty() {
1205             return Nothing;
1206         }
1207         // If our set of prefixes is complete, then we can use it to find
1208         // a match in lieu of a regex engine. This doesn't quite work well in
1209         // the presence of multiple regexes, so only do it when there's one.
1210         //
1211         // TODO(burntsushi): Also, don't try to match literals if the regex is
1212         // partially anchored. We could technically do it, but we'd need to
1213         // create two sets of literals: all of them and then the subset that
1214         // aren't anchored. We would then only search for all of them when at
1215         // the beginning of the input and use the subset in all other cases.
1216         if self.res.len() == 1 {
1217             if self.ac.is_some() {
1218                 return Literal(MatchLiteralType::AhoCorasick);
1219             }
1220             if self.nfa.prefixes.complete() {
1221                 return if self.nfa.is_anchored_start {
1222                     Literal(MatchLiteralType::AnchoredStart)
1223                 } else {
1224                     Literal(MatchLiteralType::Unanchored)
1225                 };
1226             }
1227             if self.suffixes.complete() {
1228                 return if self.nfa.is_anchored_end {
1229                     Literal(MatchLiteralType::AnchoredEnd)
1230                 } else {
1231                     // This case shouldn't happen. When the regex isn't
1232                     // anchored, then complete prefixes should imply complete
1233                     // suffixes.
1234                     Literal(MatchLiteralType::Unanchored)
1235                 };
1236             }
1237         }
1238         // If we can execute the DFA, then we totally should.
1239         if dfa::can_exec(&self.dfa) {
1240             // Regex sets require a slightly specialized path.
1241             if self.res.len() >= 2 {
1242                 return DfaMany;
1243             }
1244             // If the regex is anchored at the end but not the start, then
1245             // just match in reverse from the end of the haystack.
1246             if !self.nfa.is_anchored_start && self.nfa.is_anchored_end {
1247                 return DfaAnchoredReverse;
1248             }
1249             // If there's a longish suffix literal, then it might be faster
1250             // to look for that first.
1251             if self.should_suffix_scan() {
1252                 return DfaSuffix;
1253             }
1254             // Fall back to your garden variety forward searching lazy DFA.
1255             return Dfa;
1256         }
1257         // We're so totally hosed.
1258         Nfa(MatchNfaType::Auto)
1259     }
1260 
1261     /// Returns true if the program is amenable to suffix scanning.
1262     ///
1263     /// When this is true, as a heuristic, we assume it is OK to quickly scan
1264     /// for suffix literals and then do a *reverse* DFA match from any matches
1265     /// produced by the literal scan. (And then followed by a forward DFA
1266     /// search, since the previously found suffix literal maybe not actually be
1267     /// the end of a match.)
1268     ///
1269     /// This is a bit of a specialized optimization, but can result in pretty
1270     /// big performance wins if 1) there are no prefix literals and 2) the
1271     /// suffix literals are pretty rare in the text. (1) is obviously easy to
1272     /// account for but (2) is harder. As a proxy, we assume that longer
1273     /// strings are generally rarer, so we only enable this optimization when
1274     /// we have a meaty suffix.
should_suffix_scan(&self) -> bool1275     fn should_suffix_scan(&self) -> bool {
1276         if self.suffixes.is_empty() {
1277             return false;
1278         }
1279         let lcs_len = self.suffixes.lcs().char_len();
1280         lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1281     }
1282 }
1283 
1284 #[derive(Clone, Copy, Debug)]
1285 enum MatchType {
1286     /// A single or multiple literal search. This is only used when the regex
1287     /// can be decomposed into a literal search.
1288     Literal(MatchLiteralType),
1289     /// A normal DFA search.
1290     Dfa,
1291     /// A reverse DFA search starting from the end of a haystack.
1292     DfaAnchoredReverse,
1293     /// A reverse DFA search with suffix literal scanning.
1294     DfaSuffix,
1295     /// Use the DFA on two or more regular expressions.
1296     DfaMany,
1297     /// An NFA variant.
1298     Nfa(MatchNfaType),
1299     /// No match is ever possible, so don't ever try to search.
1300     Nothing,
1301 }
1302 
1303 #[derive(Clone, Copy, Debug)]
1304 enum MatchLiteralType {
1305     /// Match literals anywhere in text.
1306     Unanchored,
1307     /// Match literals only at the start of text.
1308     AnchoredStart,
1309     /// Match literals only at the end of text.
1310     AnchoredEnd,
1311     /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1312     /// ExecReadOnly.
1313     AhoCorasick,
1314 }
1315 
1316 #[derive(Clone, Copy, Debug)]
1317 enum MatchNfaType {
1318     /// Choose between Backtrack and PikeVM.
1319     Auto,
1320     /// NFA bounded backtracking.
1321     ///
1322     /// (This is only set by tests, since it never makes sense to always want
1323     /// backtracking.)
1324     Backtrack,
1325     /// The Pike VM.
1326     ///
1327     /// (This is only set by tests, since it never makes sense to always want
1328     /// the Pike VM.)
1329     PikeVM,
1330 }
1331 
1332 /// `ProgramCache` maintains reusable allocations for each matching engine
1333 /// available to a particular program.
1334 pub type ProgramCache = RefCell<ProgramCacheInner>;
1335 
1336 #[derive(Debug)]
1337 pub struct ProgramCacheInner {
1338     pub pikevm: pikevm::Cache,
1339     pub backtrack: backtrack::Cache,
1340     pub dfa: dfa::Cache,
1341     pub dfa_reverse: dfa::Cache,
1342 }
1343 
1344 impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1345     fn new(ro: &ExecReadOnly) -> Self {
1346         ProgramCacheInner {
1347             pikevm: pikevm::Cache::new(&ro.nfa),
1348             backtrack: backtrack::Cache::new(&ro.nfa),
1349             dfa: dfa::Cache::new(&ro.dfa),
1350             dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1351         }
1352     }
1353 }
1354 
1355 /// Alternation literals checks if the given HIR is a simple alternation of
1356 /// literals, and if so, returns them. Otherwise, this returns None.
alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>>1357 fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1358     use syntax::hir::{HirKind, Literal};
1359 
1360     // This is pretty hacky, but basically, if `is_alternation_literal` is
1361     // true, then we can make several assumptions about the structure of our
1362     // HIR. This is what justifies the `unreachable!` statements below.
1363     //
1364     // This code should be refactored once we overhaul this crate's
1365     // optimization pipeline, because this is a terribly inflexible way to go
1366     // about things.
1367 
1368     if !expr.is_alternation_literal() {
1369         return None;
1370     }
1371     let alts = match *expr.kind() {
1372         HirKind::Alternation(ref alts) => alts,
1373         _ => return None, // one literal isn't worth it
1374     };
1375 
1376     let extendlit = |lit: &Literal, dst: &mut Vec<u8>| {
1377         match *lit {
1378             Literal::Unicode(c) => {
1379                 let mut buf = [0; 4];
1380                 dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1381             }
1382             Literal::Byte(b) => {
1383                 dst.push(b);
1384             }
1385         }
1386     };
1387 
1388     let mut lits = vec![];
1389     for alt in alts {
1390         let mut lit = vec![];
1391         match *alt.kind() {
1392             HirKind::Literal(ref x) => extendlit(x, &mut lit),
1393             HirKind::Concat(ref exprs) => {
1394                 for e in exprs {
1395                     match *e.kind() {
1396                         HirKind::Literal(ref x) => extendlit(x, &mut lit),
1397                         _ => unreachable!("expected literal, got {:?}", e),
1398                     }
1399                 }
1400             }
1401             _ => unreachable!("expected literal or concat, got {:?}", alt),
1402         }
1403         lits.push(lit);
1404     }
1405     Some(lits)
1406 }
1407 
1408 #[cfg(test)]
1409 mod test {
1410     #[test]
uppercut_s_backtracking_bytes_default_bytes_mismatch()1411     fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1412         use internal::ExecBuilder;
1413 
1414         let backtrack_bytes_re = ExecBuilder::new("^S")
1415             .bounded_backtracking()
1416             .only_utf8(false)
1417             .build()
1418             .map(|exec| exec.into_byte_regex())
1419             .map_err(|err| format!("{}", err))
1420             .unwrap();
1421 
1422         let default_bytes_re = ExecBuilder::new("^S")
1423             .only_utf8(false)
1424             .build()
1425             .map(|exec| exec.into_byte_regex())
1426             .map_err(|err| format!("{}", err))
1427             .unwrap();
1428 
1429         let input = vec![83, 83];
1430 
1431         let s1 = backtrack_bytes_re.split(&input);
1432         let s2 = default_bytes_re.split(&input);
1433         for (chunk1, chunk2) in s1.zip(s2) {
1434             assert_eq!(chunk1, chunk2);
1435         }
1436     }
1437 
1438     #[test]
unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1439     fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1440         use internal::ExecBuilder;
1441 
1442         let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1443             .bounded_backtracking()
1444             .bytes(true)
1445             .build()
1446             .map(|exec| exec.into_regex())
1447             .map_err(|err| format!("{}", err))
1448             .unwrap();
1449 
1450         let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1451             .bytes(true)
1452             .build()
1453             .map(|exec| exec.into_regex())
1454             .map_err(|err| format!("{}", err))
1455             .unwrap();
1456 
1457         let input = "**";
1458 
1459         let s1 = backtrack_bytes_re.split(input);
1460         let s2 = default_bytes_re.split(input);
1461         for (chunk1, chunk2) in s1.zip(s2) {
1462             assert_eq!(chunk1, chunk2);
1463         }
1464     }
1465 }
1466