1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use std::cell::RefCell;
12 use std::collections::HashMap;
13 use std::cmp;
14 use std::sync::Arc;
15 
16 use thread_local::CachedThreadLocal;
17 use syntax::ParserBuilder;
18 use syntax::hir::Hir;
19 use syntax::hir::literal::Literals;
20 
21 use backtrack;
22 use compile::Compiler;
23 use dfa;
24 use error::Error;
25 use input::{ByteInput, CharInput};
26 use literal::LiteralSearcher;
27 use pikevm;
28 use prog::Program;
29 use re_builder::RegexOptions;
30 use re_bytes;
31 use re_set;
32 use re_trait::{RegularExpression, Slot, Locations, as_slots};
33 use re_unicode;
34 use utf8::next_utf8;
35 
36 /// `Exec` manages the execution of a regular expression.
37 ///
38 /// In particular, this manages the various compiled forms of a single regular
39 /// expression and the choice of which matching engine to use to execute a
40 /// regular expression.
41 pub struct Exec {
42     /// All read only state.
43     ro: Arc<ExecReadOnly>,
44     /// Caches for the various matching engines.
45     cache: CachedThreadLocal<ProgramCache>,
46 }
47 
48 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
49 /// means it is no longer Sync, but we can now avoid the overhead of
50 /// synchronization to fetch the cache.
51 #[derive(Debug)]
52 pub struct ExecNoSync<'c> {
53     /// All read only state.
54     ro: &'c Arc<ExecReadOnly>,
55     /// Caches for the various matching engines.
56     cache: &'c ProgramCache,
57 }
58 
59 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
60 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
61 
62 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
63 /// state is determined at compile time and never changes during search.
64 #[derive(Debug)]
65 struct ExecReadOnly {
66     /// The original regular expressions given by the caller to compile.
67     res: Vec<String>,
68     /// A compiled program that is used in the NFA simulation and backtracking.
69     /// It can be byte-based or Unicode codepoint based.
70     ///
71     /// N.B. It is not possibly to make this byte-based from the public API.
72     /// It is only used for testing byte based programs in the NFA simulations.
73     nfa: Program,
74     /// A compiled byte based program for DFA execution. This is only used
75     /// if a DFA can be executed. (Currently, only word boundary assertions are
76     /// not supported.) Note that this program contains an embedded `.*?`
77     /// preceding the first capture group, unless the regex is anchored at the
78     /// beginning.
79     dfa: Program,
80     /// The same as above, except the program is reversed (and there is no
81     /// preceding `.*?`). This is used by the DFA to find the starting location
82     /// of matches.
83     dfa_reverse: Program,
84     /// A set of suffix literals extracted from the regex.
85     ///
86     /// Prefix literals are stored on the `Program`, since they are used inside
87     /// the matching engines.
88     suffixes: LiteralSearcher,
89     /// match_type encodes as much upfront knowledge about how we're going to
90     /// execute a search as possible.
91     match_type: MatchType,
92 }
93 
94 /// Facilitates the construction of an executor by exposing various knobs
95 /// to control how a regex is executed and what kinds of resources it's
96 /// permitted to use.
97 pub struct ExecBuilder {
98     options: RegexOptions,
99     match_type: Option<MatchType>,
100     bytes: bool,
101     only_utf8: bool,
102 }
103 
104 /// Parsed represents a set of parsed regular expressions and their detected
105 /// literals.
106 struct Parsed {
107     exprs: Vec<Hir>,
108     prefixes: Literals,
109     suffixes: Literals,
110     bytes: bool,
111 }
112 
113 impl ExecBuilder {
114     /// Create a regex execution builder.
115     ///
116     /// This uses default settings for everything except the regex itself,
117     /// which must be provided. Further knobs can be set by calling methods,
118     /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self119     pub fn new(re: &str) -> Self {
120         Self::new_many(&[re])
121     }
122 
123     /// Like new, but compiles the union of the given regular expressions.
124     ///
125     /// Note that when compiling 2 or more regular expressions, capture groups
126     /// are completely unsupported. (This means both `find` and `captures`
127     /// wont work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item=S>128     pub fn new_many<I, S>(res: I) -> Self
129             where S: AsRef<str>, I: IntoIterator<Item=S> {
130         let mut opts = RegexOptions::default();
131         opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
132         Self::new_options(opts)
133     }
134 
135     /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self136     pub fn new_options(opts: RegexOptions) -> Self {
137         ExecBuilder {
138             options: opts,
139             match_type: None,
140             bytes: false,
141             only_utf8: true,
142         }
143     }
144 
145     /// Set the matching engine to be automatically determined.
146     ///
147     /// This is the default state and will apply whatever optimizations are
148     /// possible, such as running a DFA.
149     ///
150     /// This overrides whatever was previously set via the `nfa` or
151     /// `bounded_backtracking` methods.
automatic(mut self) -> Self152     pub fn automatic(mut self) -> Self {
153         self.match_type = None;
154         self
155     }
156 
157     /// Sets the matching engine to use the NFA algorithm no matter what
158     /// optimizations are possible.
159     ///
160     /// This overrides whatever was previously set via the `automatic` or
161     /// `bounded_backtracking` methods.
nfa(mut self) -> Self162     pub fn nfa(mut self) -> Self {
163         self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
164         self
165     }
166 
167     /// Sets the matching engine to use a bounded backtracking engine no
168     /// matter what optimizations are possible.
169     ///
170     /// One must use this with care, since the bounded backtracking engine
171     /// uses memory proportion to `len(regex) * len(text)`.
172     ///
173     /// This overrides whatever was previously set via the `automatic` or
174     /// `nfa` methods.
bounded_backtracking(mut self) -> Self175     pub fn bounded_backtracking(mut self) -> Self {
176         self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
177         self
178     }
179 
180     /// Compiles byte based programs for use with the NFA matching engines.
181     ///
182     /// By default, the NFA engines match on Unicode scalar values. They can
183     /// be made to use byte based programs instead. In general, the byte based
184     /// programs are slower because of a less efficient encoding of character
185     /// classes.
186     ///
187     /// Note that this does not impact DFA matching engines, which always
188     /// execute on bytes.
bytes(mut self, yes: bool) -> Self189     pub fn bytes(mut self, yes: bool) -> Self {
190         self.bytes = yes;
191         self
192     }
193 
194     /// When disabled, the program compiled may match arbitrary bytes.
195     ///
196     /// When enabled (the default), all compiled programs exclusively match
197     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self198     pub fn only_utf8(mut self, yes: bool) -> Self {
199         self.only_utf8 = yes;
200         self
201     }
202 
203     /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self204     pub fn unicode(mut self, yes: bool) -> Self {
205         self.options.unicode = yes;
206         self
207     }
208 
209     /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>210     fn parse(&self) -> Result<Parsed, Error> {
211         let mut exprs = Vec::with_capacity(self.options.pats.len());
212         let mut prefixes = Some(Literals::empty());
213         let mut suffixes = Some(Literals::empty());
214         let mut bytes = false;
215         let is_set = self.options.pats.len() > 1;
216         // If we're compiling a regex set and that set has any anchored
217         // expressions, then disable all literal optimizations.
218         for pat in &self.options.pats {
219             let mut parser =
220                 ParserBuilder::new()
221                     .octal(self.options.octal)
222                     .case_insensitive(self.options.case_insensitive)
223                     .multi_line(self.options.multi_line)
224                     .dot_matches_new_line(self.options.dot_matches_new_line)
225                     .swap_greed(self.options.swap_greed)
226                     .ignore_whitespace(self.options.ignore_whitespace)
227                     .unicode(self.options.unicode)
228                     .allow_invalid_utf8(!self.only_utf8)
229                     .nest_limit(self.options.nest_limit)
230                     .build();
231             let expr = parser
232                 .parse(pat)
233                 .map_err(|e| Error::Syntax(e.to_string()))?;
234             bytes = bytes || !expr.is_always_utf8();
235 
236             if !expr.is_anchored_start() && expr.is_any_anchored_start() {
237                 // Partial anchors unfortunately make it hard to use prefixes,
238                 // so disable them.
239                 prefixes = None;
240             } else if is_set && expr.is_anchored_start() {
241                 // Regex sets with anchors do not go well with literal
242                 // optimizations.
243                 prefixes = None;
244             }
245             prefixes = prefixes.and_then(|mut prefixes| {
246                 if !prefixes.union_prefixes(&expr) {
247                     None
248                 } else {
249                     Some(prefixes)
250                 }
251             });
252 
253             if !expr.is_anchored_end() && expr.is_any_anchored_end() {
254                 // Partial anchors unfortunately make it hard to use suffixes,
255                 // so disable them.
256                 suffixes = None;
257             } else if is_set && expr.is_anchored_end() {
258                 // Regex sets with anchors do not go well with literal
259                 // optimizations.
260                 suffixes = None;
261             }
262             suffixes = suffixes.and_then(|mut suffixes| {
263                 if !suffixes.union_suffixes(&expr) {
264                     None
265                 } else {
266                     Some(suffixes)
267                 }
268             });
269             exprs.push(expr);
270         }
271         Ok(Parsed {
272             exprs: exprs,
273             prefixes: prefixes.unwrap_or_else(Literals::empty),
274             suffixes: suffixes.unwrap_or_else(Literals::empty),
275             bytes: bytes,
276         })
277     }
278 
279     /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>280     pub fn build(self) -> Result<Exec, Error> {
281         // Special case when we have no patterns to compile.
282         // This can happen when compiling a regex set.
283         if self.options.pats.is_empty() {
284             let ro = Arc::new(ExecReadOnly {
285                 res: vec![],
286                 nfa: Program::new(),
287                 dfa: Program::new(),
288                 dfa_reverse: Program::new(),
289                 suffixes: LiteralSearcher::empty(),
290                 match_type: MatchType::Nothing,
291             });
292             return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() });
293         }
294         let parsed = self.parse()?;
295         let mut nfa =
296             Compiler::new()
297                      .size_limit(self.options.size_limit)
298                      .bytes(self.bytes || parsed.bytes)
299                      .only_utf8(self.only_utf8)
300                      .compile(&parsed.exprs)?;
301         let mut dfa =
302             Compiler::new()
303                      .size_limit(self.options.size_limit)
304                      .dfa(true)
305                      .only_utf8(self.only_utf8)
306                      .compile(&parsed.exprs)?;
307         let mut dfa_reverse =
308             Compiler::new()
309                      .size_limit(self.options.size_limit)
310                      .dfa(true)
311                      .only_utf8(self.only_utf8)
312                      .reverse(true)
313                      .compile(&parsed.exprs)?;
314 
315         let prefixes = parsed.prefixes.unambiguous_prefixes();
316         let suffixes = parsed.suffixes.unambiguous_suffixes();
317         nfa.prefixes = LiteralSearcher::prefixes(prefixes);
318         dfa.prefixes = nfa.prefixes.clone();
319         dfa.dfa_size_limit = self.options.dfa_size_limit;
320         dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
321 
322         let mut ro = ExecReadOnly {
323             res: self.options.pats,
324             nfa: nfa,
325             dfa: dfa,
326             dfa_reverse: dfa_reverse,
327             suffixes: LiteralSearcher::suffixes(suffixes),
328             match_type: MatchType::Nothing,
329         };
330         ro.match_type = ro.choose_match_type(self.match_type);
331 
332         let ro = Arc::new(ro);
333         Ok(Exec { ro: ro, cache: CachedThreadLocal::new() })
334     }
335 }
336 
337 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
338     type Text = str;
339 
slots_len(&self) -> usize340     fn slots_len(&self) -> usize { self.0.slots_len() }
341 
next_after_empty(&self, text: &str, i: usize) -> usize342     fn next_after_empty(&self, text: &str, i: usize) -> usize {
343         next_utf8(text.as_bytes(), i)
344     }
345 
346     #[inline(always)] // reduces constant overhead
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>347     fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
348         self.0.shortest_match_at(text.as_bytes(), start)
349     }
350 
351     #[inline(always)] // reduces constant overhead
is_match_at(&self, text: &str, start: usize) -> bool352     fn is_match_at(&self, text: &str, start: usize) -> bool {
353         self.0.is_match_at(text.as_bytes(), start)
354     }
355 
356     #[inline(always)] // reduces constant overhead
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>357     fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
358         self.0.find_at(text.as_bytes(), start)
359     }
360 
361     #[inline(always)] // reduces constant overhead
read_captures_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>362     fn read_captures_at(
363         &self,
364         locs: &mut Locations,
365         text: &str,
366         start: usize,
367     ) -> Option<(usize, usize)> {
368         self.0.read_captures_at(locs, text.as_bytes(), start)
369     }
370 }
371 
372 impl<'c> RegularExpression for ExecNoSync<'c> {
373     type Text = [u8];
374 
375     /// Returns the number of capture slots in the regular expression. (There
376     /// are two slots for every capture group, corresponding to possibly empty
377     /// start and end locations of the capture.)
slots_len(&self) -> usize378     fn slots_len(&self) -> usize {
379         self.ro.nfa.captures.len() * 2
380     }
381 
next_after_empty(&self, _text: &[u8], i: usize) -> usize382     fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
383         i + 1
384     }
385 
386     /// Returns the end of a match location, possibly occurring before the
387     /// end location of the correct leftmost-first match.
388     #[inline(always)] // reduces constant overhead
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>389     fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
390         if !self.is_anchor_end_match(text) {
391             return None;
392         }
393         match self.ro.match_type {
394             MatchType::Literal(ty) => {
395                 self.find_literals(ty, text, start).map(|(_, e)| e)
396             }
397             MatchType::Dfa | MatchType::DfaMany => {
398                 match self.shortest_dfa(text, start) {
399                     dfa::Result::Match(end) => Some(end),
400                     dfa::Result::NoMatch(_) => None,
401                     dfa::Result::Quit => self.shortest_nfa(text, start),
402                 }
403             }
404             MatchType::DfaAnchoredReverse => {
405                 match dfa::Fsm::reverse(
406                     &self.ro.dfa_reverse,
407                     self.cache,
408                     true,
409                     &text[start..],
410                     text.len(),
411                 ) {
412                     dfa::Result::Match(_) => Some(text.len()),
413                     dfa::Result::NoMatch(_) => None,
414                     dfa::Result::Quit => self.shortest_nfa(text, start),
415                 }
416             }
417             MatchType::DfaSuffix => {
418                 match self.shortest_dfa_reverse_suffix(text, start) {
419                     dfa::Result::Match(e) => Some(e),
420                     dfa::Result::NoMatch(_) => None,
421                     dfa::Result::Quit => self.shortest_nfa(text, start),
422                 }
423             }
424             MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
425             MatchType::Nothing => None,
426         }
427     }
428 
429     /// Returns true if and only if the regex matches text.
430     ///
431     /// For single regular expressions, this is equivalent to calling
432     /// shortest_match(...).is_some().
433     #[inline(always)] // reduces constant overhead
is_match_at(&self, text: &[u8], start: usize) -> bool434     fn is_match_at(&self, text: &[u8], start: usize) -> bool {
435         if !self.is_anchor_end_match(text) {
436             return false;
437         }
438         // We need to do this dance because shortest_match relies on the NFA
439         // filling in captures[1], but a RegexSet has no captures. In other
440         // words, a RegexSet can't (currently) use shortest_match. ---AG
441         match self.ro.match_type {
442             MatchType::Literal(ty) => {
443                 self.find_literals(ty, text, start).is_some()
444             }
445             MatchType::Dfa | MatchType::DfaMany => {
446                 match self.shortest_dfa(text, start) {
447                     dfa::Result::Match(_) => true,
448                     dfa::Result::NoMatch(_) => false,
449                     dfa::Result::Quit => self.match_nfa(text, start),
450                 }
451             }
452             MatchType::DfaAnchoredReverse => {
453                 match dfa::Fsm::reverse(
454                     &self.ro.dfa_reverse,
455                     self.cache,
456                     true,
457                     &text[start..],
458                     text.len(),
459                 ) {
460                     dfa::Result::Match(_) => true,
461                     dfa::Result::NoMatch(_) => false,
462                     dfa::Result::Quit => self.match_nfa(text, start),
463                 }
464             }
465             MatchType::DfaSuffix => {
466                 match self.shortest_dfa_reverse_suffix(text, start) {
467                     dfa::Result::Match(_) => true,
468                     dfa::Result::NoMatch(_) => false,
469                     dfa::Result::Quit => self.match_nfa(text, start),
470                 }
471             }
472             MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
473             MatchType::Nothing => false,
474         }
475     }
476 
477     /// Finds the start and end location of the leftmost-first match, starting
478     /// at the given location.
479     #[inline(always)] // reduces constant overhead
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>480     fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
481         if !self.is_anchor_end_match(text) {
482             return None;
483         }
484         match self.ro.match_type {
485             MatchType::Literal(ty) => {
486                 self.find_literals(ty, text, start)
487             }
488             MatchType::Dfa => {
489                 match self.find_dfa_forward(text, start) {
490                     dfa::Result::Match((s, e)) => Some((s, e)),
491                     dfa::Result::NoMatch(_) => None,
492                     dfa::Result::Quit => {
493                         self.find_nfa(MatchNfaType::Auto, text, start)
494                     }
495                 }
496             }
497             MatchType::DfaAnchoredReverse => {
498                 match self.find_dfa_anchored_reverse(text, start) {
499                     dfa::Result::Match((s, e)) => Some((s, e)),
500                     dfa::Result::NoMatch(_) => None,
501                     dfa::Result::Quit => {
502                         self.find_nfa(MatchNfaType::Auto, text, start)
503                     }
504                 }
505             }
506             MatchType::DfaSuffix => {
507                 match self.find_dfa_reverse_suffix(text, start) {
508                     dfa::Result::Match((s, e)) => Some((s, e)),
509                     dfa::Result::NoMatch(_) => None,
510                     dfa::Result::Quit => {
511                         self.find_nfa(MatchNfaType::Auto, text, start)
512                     }
513                 }
514             }
515             MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
516             MatchType::Nothing => None,
517             MatchType::DfaMany => {
518                 unreachable!("BUG: RegexSet cannot be used with find")
519             }
520         }
521     }
522 
523     /// Finds the start and end location of the leftmost-first match and also
524     /// fills in all matching capture groups.
525     ///
526     /// The number of capture slots given should be equal to the total number
527     /// of capture slots in the compiled program.
528     ///
529     /// Note that the first two slots always correspond to the start and end
530     /// locations of the overall match.
read_captures_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>531     fn read_captures_at(
532         &self,
533         locs: &mut Locations,
534         text: &[u8],
535         start: usize,
536     ) -> Option<(usize, usize)> {
537         let slots = as_slots(locs);
538         for slot in slots.iter_mut() {
539             *slot = None;
540         }
541         // If the caller unnecessarily uses this, then we try to save them
542         // from themselves.
543         match slots.len() {
544             0 => return self.find_at(text, start),
545             2 => {
546                 return self.find_at(text, start).map(|(s, e)| {
547                     slots[0] = Some(s);
548                     slots[1] = Some(e);
549                     (s, e)
550                 });
551             }
552             _ => {} // fallthrough
553         }
554         if !self.is_anchor_end_match(text) {
555             return None;
556         }
557         match self.ro.match_type {
558             MatchType::Literal(ty) => {
559                 self.find_literals(ty, text, start).and_then(|(s, e)| {
560                     self.captures_nfa_with_match(slots, text, s, e)
561                 })
562             }
563             MatchType::Dfa => {
564                 if self.ro.nfa.is_anchored_start {
565                     self.captures_nfa(slots, text, start)
566                 } else {
567                     match self.find_dfa_forward(text, start) {
568                         dfa::Result::Match((s, e)) => {
569                             self.captures_nfa_with_match(slots, text, s, e)
570                         }
571                         dfa::Result::NoMatch(_) => None,
572                         dfa::Result::Quit => self.captures_nfa(slots, text, start),
573                     }
574                 }
575             }
576             MatchType::DfaAnchoredReverse => {
577                 match self.find_dfa_anchored_reverse(text, start) {
578                     dfa::Result::Match((s, e)) => {
579                         self.captures_nfa_with_match(slots, text, s, e)
580                     }
581                     dfa::Result::NoMatch(_) => None,
582                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
583                 }
584             }
585             MatchType::DfaSuffix => {
586                 match self.find_dfa_reverse_suffix(text, start) {
587                     dfa::Result::Match((s, e)) => {
588                         self.captures_nfa_with_match(slots, text, s, e)
589                     }
590                     dfa::Result::NoMatch(_) => None,
591                     dfa::Result::Quit => self.captures_nfa(slots, text, start),
592                 }
593             }
594             MatchType::Nfa(ty) => {
595                 self.captures_nfa_type(ty, slots, text, start)
596             }
597             MatchType::Nothing => None,
598             MatchType::DfaMany => {
599                 unreachable!("BUG: RegexSet cannot be used with captures")
600             }
601         }
602     }
603 }
604 
605 impl<'c> ExecNoSync<'c> {
606     /// Finds the leftmost-first match using only literal search.
607     #[inline(always)] // reduces constant overhead
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>608     fn find_literals(
609         &self,
610         ty: MatchLiteralType,
611         text: &[u8],
612         start: usize,
613     ) -> Option<(usize, usize)> {
614         use self::MatchLiteralType::*;
615         match ty {
616             Unanchored => {
617                 let lits = &self.ro.nfa.prefixes;
618                 lits.find(&text[start..])
619                     .map(|(s, e)| (start + s, start + e))
620             }
621             AnchoredStart => {
622                 let lits = &self.ro.nfa.prefixes;
623                 if !self.ro.nfa.is_anchored_start
624                     || (self.ro.nfa.is_anchored_start && start == 0) {
625                     lits.find_start(&text[start..])
626                         .map(|(s, e)| (start + s, start + e))
627                 } else {
628                     None
629                 }
630             }
631             AnchoredEnd => {
632                 let lits = &self.ro.suffixes;
633                 lits.find_end(&text[start..])
634                     .map(|(s, e)| (start + s, start + e))
635             }
636         }
637     }
638 
639     /// Finds the leftmost-first match (start and end) using only the DFA.
640     ///
641     /// If the result returned indicates that the DFA quit, then another
642     /// matching engine should be used.
643     #[inline(always)] // reduces constant overhead
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>644     fn find_dfa_forward(
645         &self,
646         text: &[u8],
647         start: usize,
648     ) -> dfa::Result<(usize, usize)> {
649         use dfa::Result::*;
650         let end = match dfa::Fsm::forward(
651             &self.ro.dfa,
652             self.cache,
653             false,
654             text,
655             start,
656         ) {
657             NoMatch(i) => return NoMatch(i),
658             Quit => return Quit,
659             Match(end) if start == end => return Match((start, start)),
660             Match(end) => end,
661         };
662         // Now run the DFA in reverse to find the start of the match.
663         match dfa::Fsm::reverse(
664             &self.ro.dfa_reverse,
665             self.cache,
666             false,
667             &text[start..],
668             end - start,
669         ) {
670             Match(s) => Match((start + s, end)),
671             NoMatch(i) => NoMatch(i),
672             Quit => Quit,
673         }
674     }
675 
676     /// Finds the leftmost-first match (start and end) using only the DFA,
677     /// but assumes the regex is anchored at the end and therefore starts at
678     /// the end of the regex and matches in reverse.
679     ///
680     /// If the result returned indicates that the DFA quit, then another
681     /// matching engine should be used.
682     #[inline(always)] // reduces constant overhead
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>683     fn find_dfa_anchored_reverse(
684         &self,
685         text: &[u8],
686         start: usize,
687     ) -> dfa::Result<(usize, usize)> {
688         use dfa::Result::*;
689         match dfa::Fsm::reverse(
690             &self.ro.dfa_reverse,
691             self.cache,
692             false,
693             &text[start..],
694             text.len() - start,
695         ) {
696             Match(s) => Match((start + s, text.len())),
697             NoMatch(i) => NoMatch(i),
698             Quit => Quit,
699         }
700     }
701 
702     /// Finds the end of the shortest match using only the DFA.
703     #[inline(always)] // reduces constant overhead
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>704     fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
705         dfa::Fsm::forward(&self.ro.dfa, self.cache, true, text, start)
706     }
707 
708     /// Finds the end of the shortest match using only the DFA by scanning for
709     /// suffix literals.
710     ///
711     #[inline(always)] // reduces constant overhead
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>712     fn shortest_dfa_reverse_suffix(
713         &self,
714         text: &[u8],
715         start: usize,
716     ) -> dfa::Result<usize> {
717         match self.exec_dfa_reverse_suffix(text, start) {
718             None => self.shortest_dfa(text, start),
719             Some(r) => r.map(|(_, end)| end),
720         }
721     }
722 
723     /// Finds the end of the shortest match using only the DFA by scanning for
724     /// suffix literals. It also reports the start of the match.
725     ///
726     /// Note that if None is returned, then the optimization gave up to avoid
727     /// worst case quadratic behavior. A forward scanning DFA should be tried
728     /// next.
729     ///
730     /// If a match is returned and the full leftmost-first match is desired,
731     /// then a forward scan starting from the beginning of the match must be
732     /// done.
733     ///
734     /// If the result returned indicates that the DFA quit, then another
735     /// matching engine should be used.
736     #[inline(always)] // reduces constant overhead
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>737     fn exec_dfa_reverse_suffix(
738         &self,
739         text: &[u8],
740         original_start: usize,
741     ) -> Option<dfa::Result<(usize, usize)>> {
742         use dfa::Result::*;
743 
744         let lcs = self.ro.suffixes.lcs();
745         debug_assert!(lcs.len() >= 1);
746         let mut start = original_start;
747         let mut end = start;
748         while end <= text.len() {
749             start = end;
750             end += match lcs.find(&text[end..]) {
751                 None => return Some(NoMatch(text.len())),
752                 Some(start) => start + lcs.len(),
753             };
754             match dfa::Fsm::reverse(
755                 &self.ro.dfa_reverse,
756                 self.cache,
757                 false,
758                 &text[start..end],
759                 end - start,
760             ) {
761                 Match(0) | NoMatch(0) => return None,
762                 Match(s) => return Some(Match((s + start, end))),
763                 NoMatch(_) => continue,
764                 Quit => return Some(Quit),
765             };
766         }
767         Some(NoMatch(text.len()))
768     }
769 
770     /// Finds the leftmost-first match (start and end) using only the DFA
771     /// by scanning for suffix literals.
772     ///
773     /// If the result returned indicates that the DFA quit, then another
774     /// matching engine should be used.
775     #[inline(always)] // reduces constant overhead
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>776     fn find_dfa_reverse_suffix(
777         &self,
778         text: &[u8],
779         start: usize,
780     ) -> dfa::Result<(usize, usize)> {
781         use dfa::Result::*;
782 
783         let match_start = match self.exec_dfa_reverse_suffix(text, start) {
784             None => return self.find_dfa_forward(text, start),
785             Some(Match((start, _))) => start,
786             Some(r) => return r,
787         };
788         // At this point, we've found a match. The only way to quit now
789         // without a match is if the DFA gives up (seems unlikely).
790         //
791         // Now run the DFA forwards to find the proper end of the match.
792         // (The suffix literal match can only indicate the earliest
793         // possible end location, which may appear before the end of the
794         // leftmost-first match.)
795         match dfa::Fsm::forward(
796             &self.ro.dfa,
797             self.cache,
798             false,
799             text,
800             match_start,
801         ) {
802             NoMatch(_) => panic!("BUG: reverse match implies forward match"),
803             Quit => Quit,
804             Match(e) => Match((match_start, e)),
805         }
806     }
807 
808     /// Executes the NFA engine to return whether there is a match or not.
809     ///
810     /// Ideally, we could use shortest_nfa(...).is_some() and get the same
811     /// performance characteristics, but regex sets don't have captures, which
812     /// shortest_nfa depends on.
match_nfa( &self, text: &[u8], start: usize, ) -> bool813     fn match_nfa(
814         &self,
815         text: &[u8],
816         start: usize,
817     ) -> bool {
818         self.match_nfa_type(MatchNfaType::Auto, text, start)
819     }
820 
821     /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool822     fn match_nfa_type(
823         &self,
824         ty: MatchNfaType,
825         text: &[u8],
826         start: usize,
827     ) -> bool {
828         self.exec_nfa(ty, &mut [false], &mut [], true, text, start)
829     }
830 
831     /// Finds the shortest match using an NFA.
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>832     fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
833         self.shortest_nfa_type(MatchNfaType::Auto, text, start)
834     }
835 
836     /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>837     fn shortest_nfa_type(
838         &self,
839         ty: MatchNfaType,
840         text: &[u8],
841         start: usize,
842     ) -> Option<usize> {
843         let mut slots = [None, None];
844         if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) {
845             slots[1]
846         } else {
847             None
848         }
849     }
850 
851     /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>852     fn find_nfa(
853         &self,
854         ty: MatchNfaType,
855         text: &[u8],
856         start: usize,
857     ) -> Option<(usize, usize)> {
858         let mut slots = [None, None];
859         if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) {
860             match (slots[0], slots[1]) {
861                 (Some(s), Some(e)) => Some((s, e)),
862                 _ => None,
863             }
864         } else {
865             None
866         }
867     }
868 
869     /// Like find_nfa, but fills in captures and restricts the search space
870     /// using previously found match information.
871     ///
872     /// `slots` should have length equal to `2 * nfa.captures.len()`.
captures_nfa_with_match( &self, slots: &mut [Slot], text: &[u8], match_start: usize, match_end: usize, ) -> Option<(usize, usize)>873     fn captures_nfa_with_match(
874         &self,
875         slots: &mut [Slot],
876         text: &[u8],
877         match_start: usize,
878         match_end: usize,
879     ) -> Option<(usize, usize)> {
880         // We can't use match_end directly, because we may need to examine one
881         // "character" after the end of a match for lookahead operators. We
882         // need to move two characters beyond the end, since some look-around
883         // operations may falsely assume a premature end of text otherwise.
884         let e = cmp::min(
885             next_utf8(text, next_utf8(text, match_end)), text.len());
886         self.captures_nfa(slots, &text[..e], match_start)
887     }
888 
889     /// Like find_nfa, but fills in captures.
890     ///
891     /// `slots` should have length equal to `2 * nfa.captures.len()`.
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>892     fn captures_nfa(
893         &self,
894         slots: &mut [Slot],
895         text: &[u8],
896         start: usize,
897     ) -> Option<(usize, usize)> {
898         self.captures_nfa_type(MatchNfaType::Auto, slots, text, start)
899     }
900 
901     /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>902     fn captures_nfa_type(
903         &self,
904         ty: MatchNfaType,
905         slots: &mut [Slot],
906         text: &[u8],
907         start: usize,
908     ) -> Option<(usize, usize)> {
909         if self.exec_nfa(ty, &mut [false], slots, false, text, start) {
910             match (slots[0], slots[1]) {
911                 (Some(s), Some(e)) => Some((s, e)),
912                 _ => None,
913             }
914         } else {
915             None
916         }
917     }
918 
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, ) -> bool919     fn exec_nfa(
920         &self,
921         mut ty: MatchNfaType,
922         matches: &mut [bool],
923         slots: &mut [Slot],
924         quit_after_match: bool,
925         text: &[u8],
926         start: usize,
927     ) -> bool {
928         use self::MatchNfaType::*;
929         if let Auto = ty {
930             if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
931                 ty = Backtrack;
932             } else {
933                 ty = PikeVM;
934             }
935         }
936         match ty {
937             Auto => unreachable!(),
938             Backtrack => self.exec_backtrack(matches, slots, text, start),
939             PikeVM => {
940                 self.exec_pikevm(
941                     matches, slots, quit_after_match, text, start)
942             }
943         }
944     }
945 
946     /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, ) -> bool947     fn exec_pikevm(
948         &self,
949         matches: &mut [bool],
950         slots: &mut [Slot],
951         quit_after_match: bool,
952         text: &[u8],
953         start: usize,
954     ) -> bool {
955         if self.ro.nfa.uses_bytes() {
956             pikevm::Fsm::exec(
957                 &self.ro.nfa,
958                 self.cache,
959                 matches,
960                 slots,
961                 quit_after_match,
962                 ByteInput::new(text, self.ro.nfa.only_utf8),
963                 start)
964         } else {
965             pikevm::Fsm::exec(
966                 &self.ro.nfa,
967                 self.cache,
968                 matches,
969                 slots,
970                 quit_after_match,
971                 CharInput::new(text),
972                 start)
973         }
974     }
975 
976     /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, ) -> bool977     fn exec_backtrack(
978         &self,
979         matches: &mut [bool],
980         slots: &mut [Slot],
981         text: &[u8],
982         start: usize,
983     ) -> bool {
984         if self.ro.nfa.uses_bytes() {
985             backtrack::Bounded::exec(
986                 &self.ro.nfa,
987                 self.cache,
988                 matches,
989                 slots,
990                 ByteInput::new(text, self.ro.nfa.only_utf8),
991                 start)
992         } else {
993             backtrack::Bounded::exec(
994                 &self.ro.nfa,
995                 self.cache,
996                 matches,
997                 slots,
998                 CharInput::new(text),
999                 start)
1000         }
1001     }
1002 
1003     /// Finds which regular expressions match the given text.
1004     ///
1005     /// `matches` should have length equal to the number of regexes being
1006     /// searched.
1007     ///
1008     /// This is only useful when one wants to know which regexes in a set
1009     /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1010     pub fn many_matches_at(
1011         &self,
1012         matches: &mut [bool],
1013         text: &[u8],
1014         start: usize,
1015     ) -> bool {
1016         use self::MatchType::*;
1017         if !self.is_anchor_end_match(text) {
1018             return false;
1019         }
1020         match self.ro.match_type {
1021             Literal(ty) => {
1022                 debug_assert_eq!(matches.len(), 1);
1023                 matches[0] = self.find_literals(ty, text, start).is_some();
1024                 matches[0]
1025             }
1026             Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => {
1027                 match dfa::Fsm::forward_many(
1028                     &self.ro.dfa,
1029                     self.cache,
1030                     matches,
1031                     text,
1032                     start,
1033                 ) {
1034                     dfa::Result::Match(_) => true,
1035                     dfa::Result::NoMatch(_) => false,
1036                     dfa::Result::Quit => {
1037                         self.exec_nfa(
1038                             MatchNfaType::Auto,
1039                             matches,
1040                             &mut [],
1041                             false,
1042                             text,
1043                             start)
1044                     }
1045                 }
1046             }
1047             Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start),
1048             Nothing => false,
1049         }
1050     }
1051 
1052     #[inline(always)] // reduces constant overhead
is_anchor_end_match(&self, text: &[u8]) -> bool1053     fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1054         // Only do this check if the haystack is big (>1MB).
1055         if text.len() > (1<<20) && self.ro.nfa.is_anchored_end {
1056             let lcs = self.ro.suffixes.lcs();
1057             if lcs.len() >= 1 && !lcs.is_suffix(text) {
1058                 return false;
1059             }
1060         }
1061         true
1062     }
1063 
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1064     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1065         &self.ro.nfa.capture_name_idx
1066     }
1067 }
1068 
1069 impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1070     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1071         self.0.capture_name_idx()
1072     }
1073 }
1074 
1075 impl Exec {
1076     /// Get a searcher that isn't Sync.
1077     #[inline(always)] // reduces constant overhead
searcher(&self) -> ExecNoSync1078     pub fn searcher(&self) -> ExecNoSync {
1079         let create = || Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)));
1080         ExecNoSync {
1081             ro: &self.ro, // a clone is too expensive here! (and not needed)
1082             cache: self.cache.get_or(create),
1083         }
1084     }
1085 
1086     /// Get a searcher that isn't Sync and can match on &str.
1087     #[inline(always)] // reduces constant overhead
searcher_str(&self) -> ExecNoSyncStr1088     pub fn searcher_str(&self) -> ExecNoSyncStr {
1089         ExecNoSyncStr(self.searcher())
1090     }
1091 
1092     /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1093     pub fn into_regex(self) -> re_unicode::Regex {
1094         re_unicode::Regex::from(self)
1095     }
1096 
1097     /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1098     pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1099         re_set::unicode::RegexSet::from(self)
1100     }
1101 
1102     /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1103     pub fn into_byte_regex(self) -> re_bytes::Regex {
1104         re_bytes::Regex::from(self)
1105     }
1106 
1107     /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1108     pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1109         re_set::bytes::RegexSet::from(self)
1110     }
1111 
1112     /// The original regular expressions given by the caller that were
1113     /// compiled.
regex_strings(&self) -> &[String]1114     pub fn regex_strings(&self) -> &[String] {
1115         &self.ro.res
1116     }
1117 
1118     /// Return a slice of capture names.
1119     ///
1120     /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1121     pub fn capture_names(&self) -> &[Option<String>] {
1122         &self.ro.nfa.captures
1123     }
1124 
1125     /// Return a reference to named groups mapping (from group name to
1126     /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1127     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1128         &self.ro.nfa.capture_name_idx
1129     }
1130 }
1131 
1132 impl Clone for Exec {
clone(&self) -> Exec1133     fn clone(&self) -> Exec {
1134         Exec {
1135             ro: self.ro.clone(),
1136             cache: CachedThreadLocal::new(),
1137         }
1138     }
1139 }
1140 
1141 impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1142     fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1143         use self::MatchType::*;
1144         if let Some(Nfa(_)) = hint {
1145             return hint.unwrap();
1146         }
1147         // If the NFA is empty, then we'll never match anything.
1148         if self.nfa.insts.is_empty() {
1149             return Nothing;
1150         }
1151         // If our set of prefixes is complete, then we can use it to find
1152         // a match in lieu of a regex engine. This doesn't quite work well in
1153         // the presence of multiple regexes, so only do it when there's one.
1154         //
1155         // TODO(burntsushi): Also, don't try to match literals if the regex is
1156         // partially anchored. We could technically do it, but we'd need to
1157         // create two sets of literals: all of them and then the subset that
1158         // aren't anchored. We would then only search for all of them when at
1159         // the beginning of the input and use the subset in all other cases.
1160         if self.res.len() == 1 {
1161             if self.nfa.prefixes.complete() {
1162                 return if self.nfa.is_anchored_start {
1163                     Literal(MatchLiteralType::AnchoredStart)
1164                 } else {
1165                     Literal(MatchLiteralType::Unanchored)
1166                 };
1167             }
1168             if self.suffixes.complete() {
1169                 return if self.nfa.is_anchored_end {
1170                     Literal(MatchLiteralType::AnchoredEnd)
1171                 } else {
1172                     // This case shouldn't happen. When the regex isn't
1173                     // anchored, then complete prefixes should imply complete
1174                     // suffixes.
1175                     Literal(MatchLiteralType::Unanchored)
1176                 };
1177             }
1178         }
1179         // If we can execute the DFA, then we totally should.
1180         if dfa::can_exec(&self.dfa) {
1181             // Regex sets require a slightly specialized path.
1182             if self.res.len() >= 2 {
1183                 return DfaMany;
1184             }
1185             // If the regex is anchored at the end but not the start, then
1186             // just match in reverse from the end of the haystack.
1187             if !self.nfa.is_anchored_start && self.nfa.is_anchored_end {
1188                 return DfaAnchoredReverse;
1189             }
1190             // If there's a longish suffix literal, then it might be faster
1191             // to look for that first.
1192             if self.should_suffix_scan() {
1193                 return DfaSuffix;
1194             }
1195             // Fall back to your garden variety forward searching lazy DFA.
1196             return Dfa;
1197         }
1198         // We're so totally hosed.
1199         Nfa(MatchNfaType::Auto)
1200     }
1201 
1202     /// Returns true if the program is amenable to suffix scanning.
1203     ///
1204     /// When this is true, as a heuristic, we assume it is OK to quickly scan
1205     /// for suffix literals and then do a *reverse* DFA match from any matches
1206     /// produced by the literal scan. (And then followed by a forward DFA
1207     /// search, since the previously found suffix literal maybe not actually be
1208     /// the end of a match.)
1209     ///
1210     /// This is a bit of a specialized optimization, but can result in pretty
1211     /// big performance wins if 1) there are no prefix literals and 2) the
1212     /// suffix literals are pretty rare in the text. (1) is obviously easy to
1213     /// account for but (2) is harder. As a proxy, we assume that longer
1214     /// strings are generally rarer, so we only enable this optimization when
1215     /// we have a meaty suffix.
should_suffix_scan(&self) -> bool1216     fn should_suffix_scan(&self) -> bool {
1217         if self.suffixes.is_empty() {
1218             return false;
1219         }
1220         let lcs_len = self.suffixes.lcs().char_len();
1221         lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1222     }
1223 }
1224 
1225 #[derive(Clone, Copy, Debug)]
1226 enum MatchType {
1227     /// A single or multiple literal search. This is only used when the regex
1228     /// can be decomposed into unambiguous literal search.
1229     Literal(MatchLiteralType),
1230     /// A normal DFA search.
1231     Dfa,
1232     /// A reverse DFA search starting from the end of a haystack.
1233     DfaAnchoredReverse,
1234     /// A reverse DFA search with suffix literal scanning.
1235     DfaSuffix,
1236     /// Use the DFA on two or more regular expressions.
1237     DfaMany,
1238     /// An NFA variant.
1239     Nfa(MatchNfaType),
1240     /// No match is ever possible, so don't ever try to search.
1241     Nothing,
1242 }
1243 
1244 #[derive(Clone, Copy, Debug)]
1245 enum MatchLiteralType {
1246     /// Match literals anywhere in text.
1247     Unanchored,
1248     /// Match literals only at the start of text.
1249     AnchoredStart,
1250     /// Match literals only at the end of text.
1251     AnchoredEnd,
1252 }
1253 
1254 #[derive(Clone, Copy, Debug)]
1255 enum MatchNfaType {
1256     /// Choose between Backtrack and PikeVM.
1257     Auto,
1258     /// NFA bounded backtracking.
1259     ///
1260     /// (This is only set by tests, since it never makes sense to always want
1261     /// backtracking.)
1262     Backtrack,
1263     /// The Pike VM.
1264     ///
1265     /// (This is only set by tests, since it never makes sense to always want
1266     /// the Pike VM.)
1267     PikeVM,
1268 }
1269 
1270 /// `ProgramCache` maintains reusable allocations for each matching engine
1271 /// available to a particular program.
1272 pub type ProgramCache = RefCell<ProgramCacheInner>;
1273 
1274 #[derive(Clone, Debug)]
1275 pub struct ProgramCacheInner {
1276     pub pikevm: pikevm::Cache,
1277     pub backtrack: backtrack::Cache,
1278     pub dfa: dfa::Cache,
1279     pub dfa_reverse: dfa::Cache,
1280 }
1281 
1282 impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1283     fn new(ro: &ExecReadOnly) -> Self {
1284         ProgramCacheInner {
1285             pikevm: pikevm::Cache::new(&ro.nfa),
1286             backtrack: backtrack::Cache::new(&ro.nfa),
1287             dfa: dfa::Cache::new(&ro.dfa),
1288             dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1289         }
1290     }
1291 }
1292 
1293 #[cfg(test)]
1294 mod test {
1295     #[test]
uppercut_s_backtracking_bytes_default_bytes_mismatch()1296     fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1297         use internal::ExecBuilder;
1298 
1299         let backtrack_bytes_re = ExecBuilder::new("^S")
1300             .bounded_backtracking()
1301             .only_utf8(false)
1302             .build()
1303             .map(|exec| exec.into_byte_regex())
1304             .map_err(|err| format!("{}", err))
1305             .unwrap();
1306 
1307         let default_bytes_re = ExecBuilder::new("^S")
1308             .only_utf8(false)
1309             .build()
1310             .map(|exec| exec.into_byte_regex())
1311             .map_err(|err| format!("{}", err))
1312             .unwrap();
1313 
1314         let input = vec![83, 83];
1315 
1316         let s1 = backtrack_bytes_re.split(&input);
1317         let s2 = default_bytes_re.split(&input);
1318         for (chunk1, chunk2) in s1.zip(s2) {
1319             assert_eq!(chunk1, chunk2);
1320         }
1321     }
1322 
1323     #[test]
unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1324     fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1325         use internal::ExecBuilder;
1326 
1327         let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1328             .bounded_backtracking()
1329             .bytes(true)
1330             .build()
1331             .map(|exec| exec.into_regex())
1332             .map_err(|err| format!("{}", err))
1333             .unwrap();
1334 
1335         let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1336             .bytes(true)
1337             .build()
1338             .map(|exec| exec.into_regex())
1339             .map_err(|err| format!("{}", err))
1340             .unwrap();
1341 
1342         let input = "**";
1343 
1344         let s1 = backtrack_bytes_re.split(input);
1345         let s2 = default_bytes_re.split(input);
1346         for (chunk1, chunk2) in s1.zip(s2) {
1347             assert_eq!(chunk1, chunk2);
1348         }
1349     }
1350 }
1351