1 use std::cell::RefCell;
2 use std::collections::HashMap;
3 use std::sync::Arc;
4
5 #[cfg(feature = "perf-literal")]
6 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
7 use syntax::hir::literal::Literals;
8 use syntax::hir::Hir;
9 use syntax::ParserBuilder;
10
11 use backtrack;
12 use cache::{Cached, CachedGuard};
13 use compile::Compiler;
14 #[cfg(feature = "perf-dfa")]
15 use dfa;
16 use error::Error;
17 use input::{ByteInput, CharInput};
18 use literal::LiteralSearcher;
19 use pikevm;
20 use prog::Program;
21 use re_builder::RegexOptions;
22 use re_bytes;
23 use re_set;
24 use re_trait::{Locations, RegularExpression, Slot};
25 use re_unicode;
26 use utf8::next_utf8;
27
28 /// `Exec` manages the execution of a regular expression.
29 ///
30 /// In particular, this manages the various compiled forms of a single regular
31 /// expression and the choice of which matching engine to use to execute a
32 /// regular expression.
33 pub struct Exec {
34 /// All read only state.
35 ro: Arc<ExecReadOnly>,
36 /// Caches for the various matching engines.
37 cache: Cached<ProgramCache>,
38 }
39
40 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
41 /// means it is no longer Sync, but we can now avoid the overhead of
42 /// synchronization to fetch the cache.
43 #[derive(Debug)]
44 pub struct ExecNoSync<'c> {
45 /// All read only state.
46 ro: &'c Arc<ExecReadOnly>,
47 /// Caches for the various matching engines.
48 cache: CachedGuard<'c, ProgramCache>,
49 }
50
51 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
52 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
53
54 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
55 /// state is determined at compile time and never changes during search.
56 #[derive(Debug)]
57 struct ExecReadOnly {
58 /// The original regular expressions given by the caller to compile.
59 res: Vec<String>,
60 /// A compiled program that is used in the NFA simulation and backtracking.
61 /// It can be byte-based or Unicode codepoint based.
62 ///
63 /// N.B. It is not possibly to make this byte-based from the public API.
64 /// It is only used for testing byte based programs in the NFA simulations.
65 nfa: Program,
66 /// A compiled byte based program for DFA execution. This is only used
67 /// if a DFA can be executed. (Currently, only word boundary assertions are
68 /// not supported.) Note that this program contains an embedded `.*?`
69 /// preceding the first capture group, unless the regex is anchored at the
70 /// beginning.
71 dfa: Program,
72 /// The same as above, except the program is reversed (and there is no
73 /// preceding `.*?`). This is used by the DFA to find the starting location
74 /// of matches.
75 dfa_reverse: Program,
76 /// A set of suffix literals extracted from the regex.
77 ///
78 /// Prefix literals are stored on the `Program`, since they are used inside
79 /// the matching engines.
80 suffixes: LiteralSearcher,
81 /// An Aho-Corasick automaton with leftmost-first match semantics.
82 ///
83 /// This is only set when the entire regex is a simple unanchored
84 /// alternation of literals. We could probably use it more circumstances,
85 /// but this is already hacky enough in this architecture.
86 ///
87 /// N.B. We use u32 as a state ID representation under the assumption that
88 /// if we were to exhaust the ID space, we probably would have long
89 /// surpassed the compilation size limit.
90 #[cfg(feature = "perf-literal")]
91 ac: Option<AhoCorasick<u32>>,
92 /// match_type encodes as much upfront knowledge about how we're going to
93 /// execute a search as possible.
94 match_type: MatchType,
95 }
96
97 /// Facilitates the construction of an executor by exposing various knobs
98 /// to control how a regex is executed and what kinds of resources it's
99 /// permitted to use.
100 pub struct ExecBuilder {
101 options: RegexOptions,
102 match_type: Option<MatchType>,
103 bytes: bool,
104 only_utf8: bool,
105 }
106
107 /// Parsed represents a set of parsed regular expressions and their detected
108 /// literals.
109 struct Parsed {
110 exprs: Vec<Hir>,
111 prefixes: Literals,
112 suffixes: Literals,
113 bytes: bool,
114 }
115
116 impl ExecBuilder {
117 /// Create a regex execution builder.
118 ///
119 /// This uses default settings for everything except the regex itself,
120 /// which must be provided. Further knobs can be set by calling methods,
121 /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self122 pub fn new(re: &str) -> Self {
123 Self::new_many(&[re])
124 }
125
126 /// Like new, but compiles the union of the given regular expressions.
127 ///
128 /// Note that when compiling 2 or more regular expressions, capture groups
129 /// are completely unsupported. (This means both `find` and `captures`
130 /// wont work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item = S>,131 pub fn new_many<I, S>(res: I) -> Self
132 where
133 S: AsRef<str>,
134 I: IntoIterator<Item = S>,
135 {
136 let mut opts = RegexOptions::default();
137 opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
138 Self::new_options(opts)
139 }
140
141 /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self142 pub fn new_options(opts: RegexOptions) -> Self {
143 ExecBuilder {
144 options: opts,
145 match_type: None,
146 bytes: false,
147 only_utf8: true,
148 }
149 }
150
151 /// Set the matching engine to be automatically determined.
152 ///
153 /// This is the default state and will apply whatever optimizations are
154 /// possible, such as running a DFA.
155 ///
156 /// This overrides whatever was previously set via the `nfa` or
157 /// `bounded_backtracking` methods.
automatic(mut self) -> Self158 pub fn automatic(mut self) -> Self {
159 self.match_type = None;
160 self
161 }
162
163 /// Sets the matching engine to use the NFA algorithm no matter what
164 /// optimizations are possible.
165 ///
166 /// This overrides whatever was previously set via the `automatic` or
167 /// `bounded_backtracking` methods.
nfa(mut self) -> Self168 pub fn nfa(mut self) -> Self {
169 self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
170 self
171 }
172
173 /// Sets the matching engine to use a bounded backtracking engine no
174 /// matter what optimizations are possible.
175 ///
176 /// One must use this with care, since the bounded backtracking engine
177 /// uses memory proportion to `len(regex) * len(text)`.
178 ///
179 /// This overrides whatever was previously set via the `automatic` or
180 /// `nfa` methods.
bounded_backtracking(mut self) -> Self181 pub fn bounded_backtracking(mut self) -> Self {
182 self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
183 self
184 }
185
186 /// Compiles byte based programs for use with the NFA matching engines.
187 ///
188 /// By default, the NFA engines match on Unicode scalar values. They can
189 /// be made to use byte based programs instead. In general, the byte based
190 /// programs are slower because of a less efficient encoding of character
191 /// classes.
192 ///
193 /// Note that this does not impact DFA matching engines, which always
194 /// execute on bytes.
bytes(mut self, yes: bool) -> Self195 pub fn bytes(mut self, yes: bool) -> Self {
196 self.bytes = yes;
197 self
198 }
199
200 /// When disabled, the program compiled may match arbitrary bytes.
201 ///
202 /// When enabled (the default), all compiled programs exclusively match
203 /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self204 pub fn only_utf8(mut self, yes: bool) -> Self {
205 self.only_utf8 = yes;
206 self
207 }
208
209 /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self210 pub fn unicode(mut self, yes: bool) -> Self {
211 self.options.unicode = yes;
212 self
213 }
214
215 /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>216 fn parse(&self) -> Result<Parsed, Error> {
217 let mut exprs = Vec::with_capacity(self.options.pats.len());
218 let mut prefixes = Some(Literals::empty());
219 let mut suffixes = Some(Literals::empty());
220 let mut bytes = false;
221 let is_set = self.options.pats.len() > 1;
222 // If we're compiling a regex set and that set has any anchored
223 // expressions, then disable all literal optimizations.
224 for pat in &self.options.pats {
225 let mut parser = ParserBuilder::new()
226 .octal(self.options.octal)
227 .case_insensitive(self.options.case_insensitive)
228 .multi_line(self.options.multi_line)
229 .dot_matches_new_line(self.options.dot_matches_new_line)
230 .swap_greed(self.options.swap_greed)
231 .ignore_whitespace(self.options.ignore_whitespace)
232 .unicode(self.options.unicode)
233 .allow_invalid_utf8(!self.only_utf8)
234 .nest_limit(self.options.nest_limit)
235 .build();
236 let expr =
237 parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
238 bytes = bytes || !expr.is_always_utf8();
239
240 if cfg!(feature = "perf-literal") {
241 if !expr.is_anchored_start() && expr.is_any_anchored_start() {
242 // Partial anchors unfortunately make it hard to use
243 // prefixes, so disable them.
244 prefixes = None;
245 } else if is_set && expr.is_anchored_start() {
246 // Regex sets with anchors do not go well with literal
247 // optimizations.
248 prefixes = None;
249 }
250 prefixes = prefixes.and_then(|mut prefixes| {
251 if !prefixes.union_prefixes(&expr) {
252 None
253 } else {
254 Some(prefixes)
255 }
256 });
257
258 if !expr.is_anchored_end() && expr.is_any_anchored_end() {
259 // Partial anchors unfortunately make it hard to use
260 // suffixes, so disable them.
261 suffixes = None;
262 } else if is_set && expr.is_anchored_end() {
263 // Regex sets with anchors do not go well with literal
264 // optimizations.
265 suffixes = None;
266 }
267 suffixes = suffixes.and_then(|mut suffixes| {
268 if !suffixes.union_suffixes(&expr) {
269 None
270 } else {
271 Some(suffixes)
272 }
273 });
274 }
275 exprs.push(expr);
276 }
277 Ok(Parsed {
278 exprs: exprs,
279 prefixes: prefixes.unwrap_or_else(Literals::empty),
280 suffixes: suffixes.unwrap_or_else(Literals::empty),
281 bytes: bytes,
282 })
283 }
284
285 /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>286 pub fn build(self) -> Result<Exec, Error> {
287 // Special case when we have no patterns to compile.
288 // This can happen when compiling a regex set.
289 if self.options.pats.is_empty() {
290 let ro = Arc::new(ExecReadOnly {
291 res: vec![],
292 nfa: Program::new(),
293 dfa: Program::new(),
294 dfa_reverse: Program::new(),
295 suffixes: LiteralSearcher::empty(),
296 #[cfg(feature = "perf-literal")]
297 ac: None,
298 match_type: MatchType::Nothing,
299 });
300 return Ok(Exec { ro: ro, cache: Cached::new() });
301 }
302 let parsed = self.parse()?;
303 let mut nfa = Compiler::new()
304 .size_limit(self.options.size_limit)
305 .bytes(self.bytes || parsed.bytes)
306 .only_utf8(self.only_utf8)
307 .compile(&parsed.exprs)?;
308 let mut dfa = Compiler::new()
309 .size_limit(self.options.size_limit)
310 .dfa(true)
311 .only_utf8(self.only_utf8)
312 .compile(&parsed.exprs)?;
313 let mut dfa_reverse = Compiler::new()
314 .size_limit(self.options.size_limit)
315 .dfa(true)
316 .only_utf8(self.only_utf8)
317 .reverse(true)
318 .compile(&parsed.exprs)?;
319
320 #[cfg(feature = "perf-literal")]
321 let ac = self.build_aho_corasick(&parsed);
322 nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
323 dfa.prefixes = nfa.prefixes.clone();
324 dfa.dfa_size_limit = self.options.dfa_size_limit;
325 dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
326
327 let mut ro = ExecReadOnly {
328 res: self.options.pats,
329 nfa: nfa,
330 dfa: dfa,
331 dfa_reverse: dfa_reverse,
332 suffixes: LiteralSearcher::suffixes(parsed.suffixes),
333 #[cfg(feature = "perf-literal")]
334 ac: ac,
335 match_type: MatchType::Nothing,
336 };
337 ro.match_type = ro.choose_match_type(self.match_type);
338
339 let ro = Arc::new(ro);
340 Ok(Exec { ro: ro, cache: Cached::new() })
341 }
342
343 #[cfg(feature = "perf-literal")]
build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>>344 fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
345 if parsed.exprs.len() != 1 {
346 return None;
347 }
348 let lits = match alternation_literals(&parsed.exprs[0]) {
349 None => return None,
350 Some(lits) => lits,
351 };
352 // If we have a small number of literals, then let Teddy handle
353 // things (see literal/mod.rs).
354 if lits.len() <= 32 {
355 return None;
356 }
357 Some(
358 AhoCorasickBuilder::new()
359 .match_kind(MatchKind::LeftmostFirst)
360 .auto_configure(&lits)
361 // We always want this to reduce size, regardless
362 // of what auto-configure does.
363 .byte_classes(true)
364 .build_with_size::<u32, _, _>(&lits)
365 // This should never happen because we'd long exceed the
366 // compilation limit for regexes first.
367 .expect("AC automaton too big"),
368 )
369 }
370 }
371
372 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
373 type Text = str;
374
slots_len(&self) -> usize375 fn slots_len(&self) -> usize {
376 self.0.slots_len()
377 }
378
next_after_empty(&self, text: &str, i: usize) -> usize379 fn next_after_empty(&self, text: &str, i: usize) -> usize {
380 next_utf8(text.as_bytes(), i)
381 }
382
383 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>384 fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
385 self.0.shortest_match_at(text.as_bytes(), start)
386 }
387
388 #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &str, start: usize) -> bool389 fn is_match_at(&self, text: &str, start: usize) -> bool {
390 self.0.is_match_at(text.as_bytes(), start)
391 }
392
393 #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>394 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
395 self.0.find_at(text.as_bytes(), start)
396 }
397
398 #[cfg_attr(feature = "perf-inline", inline(always))]
captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>399 fn captures_read_at(
400 &self,
401 locs: &mut Locations,
402 text: &str,
403 start: usize,
404 ) -> Option<(usize, usize)> {
405 self.0.captures_read_at(locs, text.as_bytes(), start)
406 }
407 }
408
409 impl<'c> RegularExpression for ExecNoSync<'c> {
410 type Text = [u8];
411
412 /// Returns the number of capture slots in the regular expression. (There
413 /// are two slots for every capture group, corresponding to possibly empty
414 /// start and end locations of the capture.)
slots_len(&self) -> usize415 fn slots_len(&self) -> usize {
416 self.ro.nfa.captures.len() * 2
417 }
418
next_after_empty(&self, _text: &[u8], i: usize) -> usize419 fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
420 i + 1
421 }
422
423 /// Returns the end of a match location, possibly occurring before the
424 /// end location of the correct leftmost-first match.
425 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>426 fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
427 if !self.is_anchor_end_match(text) {
428 return None;
429 }
430 match self.ro.match_type {
431 #[cfg(feature = "perf-literal")]
432 MatchType::Literal(ty) => {
433 self.find_literals(ty, text, start).map(|(_, e)| e)
434 }
435 #[cfg(feature = "perf-dfa")]
436 MatchType::Dfa | MatchType::DfaMany => {
437 match self.shortest_dfa(text, start) {
438 dfa::Result::Match(end) => Some(end),
439 dfa::Result::NoMatch(_) => None,
440 dfa::Result::Quit => self.shortest_nfa(text, start),
441 }
442 }
443 #[cfg(feature = "perf-dfa")]
444 MatchType::DfaAnchoredReverse => {
445 match dfa::Fsm::reverse(
446 &self.ro.dfa_reverse,
447 self.cache.value(),
448 true,
449 &text[start..],
450 text.len(),
451 ) {
452 dfa::Result::Match(_) => Some(text.len()),
453 dfa::Result::NoMatch(_) => None,
454 dfa::Result::Quit => self.shortest_nfa(text, start),
455 }
456 }
457 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
458 MatchType::DfaSuffix => {
459 match self.shortest_dfa_reverse_suffix(text, start) {
460 dfa::Result::Match(e) => Some(e),
461 dfa::Result::NoMatch(_) => None,
462 dfa::Result::Quit => self.shortest_nfa(text, start),
463 }
464 }
465 MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
466 MatchType::Nothing => None,
467 }
468 }
469
470 /// Returns true if and only if the regex matches text.
471 ///
472 /// For single regular expressions, this is equivalent to calling
473 /// shortest_match(...).is_some().
474 #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &[u8], start: usize) -> bool475 fn is_match_at(&self, text: &[u8], start: usize) -> bool {
476 if !self.is_anchor_end_match(text) {
477 return false;
478 }
479 // We need to do this dance because shortest_match relies on the NFA
480 // filling in captures[1], but a RegexSet has no captures. In other
481 // words, a RegexSet can't (currently) use shortest_match. ---AG
482 match self.ro.match_type {
483 #[cfg(feature = "perf-literal")]
484 MatchType::Literal(ty) => {
485 self.find_literals(ty, text, start).is_some()
486 }
487 #[cfg(feature = "perf-dfa")]
488 MatchType::Dfa | MatchType::DfaMany => {
489 match self.shortest_dfa(text, start) {
490 dfa::Result::Match(_) => true,
491 dfa::Result::NoMatch(_) => false,
492 dfa::Result::Quit => self.match_nfa(text, start),
493 }
494 }
495 #[cfg(feature = "perf-dfa")]
496 MatchType::DfaAnchoredReverse => {
497 match dfa::Fsm::reverse(
498 &self.ro.dfa_reverse,
499 self.cache.value(),
500 true,
501 &text[start..],
502 text.len(),
503 ) {
504 dfa::Result::Match(_) => true,
505 dfa::Result::NoMatch(_) => false,
506 dfa::Result::Quit => self.match_nfa(text, start),
507 }
508 }
509 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
510 MatchType::DfaSuffix => {
511 match self.shortest_dfa_reverse_suffix(text, start) {
512 dfa::Result::Match(_) => true,
513 dfa::Result::NoMatch(_) => false,
514 dfa::Result::Quit => self.match_nfa(text, start),
515 }
516 }
517 MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
518 MatchType::Nothing => false,
519 }
520 }
521
522 /// Finds the start and end location of the leftmost-first match, starting
523 /// at the given location.
524 #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>525 fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
526 if !self.is_anchor_end_match(text) {
527 return None;
528 }
529 match self.ro.match_type {
530 #[cfg(feature = "perf-literal")]
531 MatchType::Literal(ty) => self.find_literals(ty, text, start),
532 #[cfg(feature = "perf-dfa")]
533 MatchType::Dfa => match self.find_dfa_forward(text, start) {
534 dfa::Result::Match((s, e)) => Some((s, e)),
535 dfa::Result::NoMatch(_) => None,
536 dfa::Result::Quit => {
537 self.find_nfa(MatchNfaType::Auto, text, start)
538 }
539 },
540 #[cfg(feature = "perf-dfa")]
541 MatchType::DfaAnchoredReverse => {
542 match self.find_dfa_anchored_reverse(text, start) {
543 dfa::Result::Match((s, e)) => Some((s, e)),
544 dfa::Result::NoMatch(_) => None,
545 dfa::Result::Quit => {
546 self.find_nfa(MatchNfaType::Auto, text, start)
547 }
548 }
549 }
550 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
551 MatchType::DfaSuffix => {
552 match self.find_dfa_reverse_suffix(text, start) {
553 dfa::Result::Match((s, e)) => Some((s, e)),
554 dfa::Result::NoMatch(_) => None,
555 dfa::Result::Quit => {
556 self.find_nfa(MatchNfaType::Auto, text, start)
557 }
558 }
559 }
560 MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
561 MatchType::Nothing => None,
562 #[cfg(feature = "perf-dfa")]
563 MatchType::DfaMany => {
564 unreachable!("BUG: RegexSet cannot be used with find")
565 }
566 }
567 }
568
569 /// Finds the start and end location of the leftmost-first match and also
570 /// fills in all matching capture groups.
571 ///
572 /// The number of capture slots given should be equal to the total number
573 /// of capture slots in the compiled program.
574 ///
575 /// Note that the first two slots always correspond to the start and end
576 /// locations of the overall match.
captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>577 fn captures_read_at(
578 &self,
579 locs: &mut Locations,
580 text: &[u8],
581 start: usize,
582 ) -> Option<(usize, usize)> {
583 let slots = locs.as_slots();
584 for slot in slots.iter_mut() {
585 *slot = None;
586 }
587 // If the caller unnecessarily uses this, then we try to save them
588 // from themselves.
589 match slots.len() {
590 0 => return self.find_at(text, start),
591 2 => {
592 return self.find_at(text, start).map(|(s, e)| {
593 slots[0] = Some(s);
594 slots[1] = Some(e);
595 (s, e)
596 });
597 }
598 _ => {} // fallthrough
599 }
600 if !self.is_anchor_end_match(text) {
601 return None;
602 }
603 match self.ro.match_type {
604 #[cfg(feature = "perf-literal")]
605 MatchType::Literal(ty) => {
606 self.find_literals(ty, text, start).and_then(|(s, e)| {
607 self.captures_nfa_type(
608 MatchNfaType::Auto,
609 slots,
610 text,
611 s,
612 e,
613 )
614 })
615 }
616 #[cfg(feature = "perf-dfa")]
617 MatchType::Dfa => {
618 if self.ro.nfa.is_anchored_start {
619 self.captures_nfa(slots, text, start)
620 } else {
621 match self.find_dfa_forward(text, start) {
622 dfa::Result::Match((s, e)) => self.captures_nfa_type(
623 MatchNfaType::Auto,
624 slots,
625 text,
626 s,
627 e,
628 ),
629 dfa::Result::NoMatch(_) => None,
630 dfa::Result::Quit => {
631 self.captures_nfa(slots, text, start)
632 }
633 }
634 }
635 }
636 #[cfg(feature = "perf-dfa")]
637 MatchType::DfaAnchoredReverse => {
638 match self.find_dfa_anchored_reverse(text, start) {
639 dfa::Result::Match((s, e)) => self.captures_nfa_type(
640 MatchNfaType::Auto,
641 slots,
642 text,
643 s,
644 e,
645 ),
646 dfa::Result::NoMatch(_) => None,
647 dfa::Result::Quit => self.captures_nfa(slots, text, start),
648 }
649 }
650 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
651 MatchType::DfaSuffix => {
652 match self.find_dfa_reverse_suffix(text, start) {
653 dfa::Result::Match((s, e)) => self.captures_nfa_type(
654 MatchNfaType::Auto,
655 slots,
656 text,
657 s,
658 e,
659 ),
660 dfa::Result::NoMatch(_) => None,
661 dfa::Result::Quit => self.captures_nfa(slots, text, start),
662 }
663 }
664 MatchType::Nfa(ty) => {
665 self.captures_nfa_type(ty, slots, text, start, text.len())
666 }
667 MatchType::Nothing => None,
668 #[cfg(feature = "perf-dfa")]
669 MatchType::DfaMany => {
670 unreachable!("BUG: RegexSet cannot be used with captures")
671 }
672 }
673 }
674 }
675
676 impl<'c> ExecNoSync<'c> {
677 /// Finds the leftmost-first match using only literal search.
678 #[cfg(feature = "perf-literal")]
679 #[cfg_attr(feature = "perf-inline", inline(always))]
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>680 fn find_literals(
681 &self,
682 ty: MatchLiteralType,
683 text: &[u8],
684 start: usize,
685 ) -> Option<(usize, usize)> {
686 use self::MatchLiteralType::*;
687 match ty {
688 Unanchored => {
689 let lits = &self.ro.nfa.prefixes;
690 lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
691 }
692 AnchoredStart => {
693 let lits = &self.ro.nfa.prefixes;
694 if start == 0 || !self.ro.nfa.is_anchored_start {
695 lits.find_start(&text[start..])
696 .map(|(s, e)| (start + s, start + e))
697 } else {
698 None
699 }
700 }
701 AnchoredEnd => {
702 let lits = &self.ro.suffixes;
703 lits.find_end(&text[start..])
704 .map(|(s, e)| (start + s, start + e))
705 }
706 AhoCorasick => self
707 .ro
708 .ac
709 .as_ref()
710 .unwrap()
711 .find(&text[start..])
712 .map(|m| (start + m.start(), start + m.end())),
713 }
714 }
715
716 /// Finds the leftmost-first match (start and end) using only the DFA.
717 ///
718 /// If the result returned indicates that the DFA quit, then another
719 /// matching engine should be used.
720 #[cfg(feature = "perf-dfa")]
721 #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>722 fn find_dfa_forward(
723 &self,
724 text: &[u8],
725 start: usize,
726 ) -> dfa::Result<(usize, usize)> {
727 use dfa::Result::*;
728 let end = match dfa::Fsm::forward(
729 &self.ro.dfa,
730 self.cache.value(),
731 false,
732 text,
733 start,
734 ) {
735 NoMatch(i) => return NoMatch(i),
736 Quit => return Quit,
737 Match(end) if start == end => return Match((start, start)),
738 Match(end) => end,
739 };
740 // Now run the DFA in reverse to find the start of the match.
741 match dfa::Fsm::reverse(
742 &self.ro.dfa_reverse,
743 self.cache.value(),
744 false,
745 &text[start..],
746 end - start,
747 ) {
748 Match(s) => Match((start + s, end)),
749 NoMatch(i) => NoMatch(i),
750 Quit => Quit,
751 }
752 }
753
754 /// Finds the leftmost-first match (start and end) using only the DFA,
755 /// but assumes the regex is anchored at the end and therefore starts at
756 /// the end of the regex and matches in reverse.
757 ///
758 /// If the result returned indicates that the DFA quit, then another
759 /// matching engine should be used.
760 #[cfg(feature = "perf-dfa")]
761 #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>762 fn find_dfa_anchored_reverse(
763 &self,
764 text: &[u8],
765 start: usize,
766 ) -> dfa::Result<(usize, usize)> {
767 use dfa::Result::*;
768 match dfa::Fsm::reverse(
769 &self.ro.dfa_reverse,
770 self.cache.value(),
771 false,
772 &text[start..],
773 text.len() - start,
774 ) {
775 Match(s) => Match((start + s, text.len())),
776 NoMatch(i) => NoMatch(i),
777 Quit => Quit,
778 }
779 }
780
781 /// Finds the end of the shortest match using only the DFA.
782 #[cfg(feature = "perf-dfa")]
783 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>784 fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
785 dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
786 }
787
788 /// Finds the end of the shortest match using only the DFA by scanning for
789 /// suffix literals.
790 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
791 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>792 fn shortest_dfa_reverse_suffix(
793 &self,
794 text: &[u8],
795 start: usize,
796 ) -> dfa::Result<usize> {
797 match self.exec_dfa_reverse_suffix(text, start) {
798 None => self.shortest_dfa(text, start),
799 Some(r) => r.map(|(_, end)| end),
800 }
801 }
802
803 /// Finds the end of the shortest match using only the DFA by scanning for
804 /// suffix literals. It also reports the start of the match.
805 ///
806 /// Note that if None is returned, then the optimization gave up to avoid
807 /// worst case quadratic behavior. A forward scanning DFA should be tried
808 /// next.
809 ///
810 /// If a match is returned and the full leftmost-first match is desired,
811 /// then a forward scan starting from the beginning of the match must be
812 /// done.
813 ///
814 /// If the result returned indicates that the DFA quit, then another
815 /// matching engine should be used.
816 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
817 #[cfg_attr(feature = "perf-inline", inline(always))]
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>818 fn exec_dfa_reverse_suffix(
819 &self,
820 text: &[u8],
821 original_start: usize,
822 ) -> Option<dfa::Result<(usize, usize)>> {
823 use dfa::Result::*;
824
825 let lcs = self.ro.suffixes.lcs();
826 debug_assert!(lcs.len() >= 1);
827 let mut start = original_start;
828 let mut end = start;
829 let mut last_literal = start;
830 while end <= text.len() {
831 last_literal += match lcs.find(&text[last_literal..]) {
832 None => return Some(NoMatch(text.len())),
833 Some(i) => i,
834 };
835 end = last_literal + lcs.len();
836 match dfa::Fsm::reverse(
837 &self.ro.dfa_reverse,
838 self.cache.value(),
839 false,
840 &text[start..end],
841 end - start,
842 ) {
843 Match(0) | NoMatch(0) => return None,
844 Match(i) => return Some(Match((start + i, end))),
845 NoMatch(i) => {
846 start += i;
847 last_literal += 1;
848 continue;
849 }
850 Quit => return Some(Quit),
851 };
852 }
853 Some(NoMatch(text.len()))
854 }
855
856 /// Finds the leftmost-first match (start and end) using only the DFA
857 /// by scanning for suffix literals.
858 ///
859 /// If the result returned indicates that the DFA quit, then another
860 /// matching engine should be used.
861 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
862 #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>863 fn find_dfa_reverse_suffix(
864 &self,
865 text: &[u8],
866 start: usize,
867 ) -> dfa::Result<(usize, usize)> {
868 use dfa::Result::*;
869
870 let match_start = match self.exec_dfa_reverse_suffix(text, start) {
871 None => return self.find_dfa_forward(text, start),
872 Some(Match((start, _))) => start,
873 Some(r) => return r,
874 };
875 // At this point, we've found a match. The only way to quit now
876 // without a match is if the DFA gives up (seems unlikely).
877 //
878 // Now run the DFA forwards to find the proper end of the match.
879 // (The suffix literal match can only indicate the earliest
880 // possible end location, which may appear before the end of the
881 // leftmost-first match.)
882 match dfa::Fsm::forward(
883 &self.ro.dfa,
884 self.cache.value(),
885 false,
886 text,
887 match_start,
888 ) {
889 NoMatch(_) => panic!("BUG: reverse match implies forward match"),
890 Quit => Quit,
891 Match(e) => Match((match_start, e)),
892 }
893 }
894
895 /// Executes the NFA engine to return whether there is a match or not.
896 ///
897 /// Ideally, we could use shortest_nfa(...).is_some() and get the same
898 /// performance characteristics, but regex sets don't have captures, which
899 /// shortest_nfa depends on.
900 #[cfg(feature = "perf-dfa")]
match_nfa(&self, text: &[u8], start: usize) -> bool901 fn match_nfa(&self, text: &[u8], start: usize) -> bool {
902 self.match_nfa_type(MatchNfaType::Auto, text, start)
903 }
904
905 /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool906 fn match_nfa_type(
907 &self,
908 ty: MatchNfaType,
909 text: &[u8],
910 start: usize,
911 ) -> bool {
912 self.exec_nfa(
913 ty,
914 &mut [false],
915 &mut [],
916 true,
917 false,
918 text,
919 start,
920 text.len(),
921 )
922 }
923
924 /// Finds the shortest match using an NFA.
925 #[cfg(feature = "perf-dfa")]
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>926 fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
927 self.shortest_nfa_type(MatchNfaType::Auto, text, start)
928 }
929
930 /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>931 fn shortest_nfa_type(
932 &self,
933 ty: MatchNfaType,
934 text: &[u8],
935 start: usize,
936 ) -> Option<usize> {
937 let mut slots = [None, None];
938 if self.exec_nfa(
939 ty,
940 &mut [false],
941 &mut slots,
942 true,
943 true,
944 text,
945 start,
946 text.len(),
947 ) {
948 slots[1]
949 } else {
950 None
951 }
952 }
953
954 /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>955 fn find_nfa(
956 &self,
957 ty: MatchNfaType,
958 text: &[u8],
959 start: usize,
960 ) -> Option<(usize, usize)> {
961 let mut slots = [None, None];
962 if self.exec_nfa(
963 ty,
964 &mut [false],
965 &mut slots,
966 false,
967 false,
968 text,
969 start,
970 text.len(),
971 ) {
972 match (slots[0], slots[1]) {
973 (Some(s), Some(e)) => Some((s, e)),
974 _ => None,
975 }
976 } else {
977 None
978 }
979 }
980
981 /// Like find_nfa, but fills in captures.
982 ///
983 /// `slots` should have length equal to `2 * nfa.captures.len()`.
984 #[cfg(feature = "perf-dfa")]
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>985 fn captures_nfa(
986 &self,
987 slots: &mut [Slot],
988 text: &[u8],
989 start: usize,
990 ) -> Option<(usize, usize)> {
991 self.captures_nfa_type(
992 MatchNfaType::Auto,
993 slots,
994 text,
995 start,
996 text.len(),
997 )
998 }
999
1000 /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> Option<(usize, usize)>1001 fn captures_nfa_type(
1002 &self,
1003 ty: MatchNfaType,
1004 slots: &mut [Slot],
1005 text: &[u8],
1006 start: usize,
1007 end: usize,
1008 ) -> Option<(usize, usize)> {
1009 if self.exec_nfa(
1010 ty,
1011 &mut [false],
1012 slots,
1013 false,
1014 false,
1015 text,
1016 start,
1017 end,
1018 ) {
1019 match (slots[0], slots[1]) {
1020 (Some(s), Some(e)) => Some((s, e)),
1021 _ => None,
1022 }
1023 } else {
1024 None
1025 }
1026 }
1027
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, quit_after_match_with_pos: bool, text: &[u8], start: usize, end: usize, ) -> bool1028 fn exec_nfa(
1029 &self,
1030 mut ty: MatchNfaType,
1031 matches: &mut [bool],
1032 slots: &mut [Slot],
1033 quit_after_match: bool,
1034 quit_after_match_with_pos: bool,
1035 text: &[u8],
1036 start: usize,
1037 end: usize,
1038 ) -> bool {
1039 use self::MatchNfaType::*;
1040 if let Auto = ty {
1041 if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1042 ty = Backtrack;
1043 } else {
1044 ty = PikeVM;
1045 }
1046 }
1047 // The backtracker can't return the shortest match position as it is
1048 // implemented today. So if someone calls `shortest_match` and we need
1049 // to run an NFA, then use the PikeVM.
1050 if quit_after_match_with_pos || ty == PikeVM {
1051 self.exec_pikevm(
1052 matches,
1053 slots,
1054 quit_after_match,
1055 text,
1056 start,
1057 end,
1058 )
1059 } else {
1060 self.exec_backtrack(matches, slots, text, start, end)
1061 }
1062 }
1063
1064 /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool1065 fn exec_pikevm(
1066 &self,
1067 matches: &mut [bool],
1068 slots: &mut [Slot],
1069 quit_after_match: bool,
1070 text: &[u8],
1071 start: usize,
1072 end: usize,
1073 ) -> bool {
1074 if self.ro.nfa.uses_bytes() {
1075 pikevm::Fsm::exec(
1076 &self.ro.nfa,
1077 self.cache.value(),
1078 matches,
1079 slots,
1080 quit_after_match,
1081 ByteInput::new(text, self.ro.nfa.only_utf8),
1082 start,
1083 end,
1084 )
1085 } else {
1086 pikevm::Fsm::exec(
1087 &self.ro.nfa,
1088 self.cache.value(),
1089 matches,
1090 slots,
1091 quit_after_match,
1092 CharInput::new(text),
1093 start,
1094 end,
1095 )
1096 }
1097 }
1098
1099 /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> bool1100 fn exec_backtrack(
1101 &self,
1102 matches: &mut [bool],
1103 slots: &mut [Slot],
1104 text: &[u8],
1105 start: usize,
1106 end: usize,
1107 ) -> bool {
1108 if self.ro.nfa.uses_bytes() {
1109 backtrack::Bounded::exec(
1110 &self.ro.nfa,
1111 self.cache.value(),
1112 matches,
1113 slots,
1114 ByteInput::new(text, self.ro.nfa.only_utf8),
1115 start,
1116 end,
1117 )
1118 } else {
1119 backtrack::Bounded::exec(
1120 &self.ro.nfa,
1121 self.cache.value(),
1122 matches,
1123 slots,
1124 CharInput::new(text),
1125 start,
1126 end,
1127 )
1128 }
1129 }
1130
1131 /// Finds which regular expressions match the given text.
1132 ///
1133 /// `matches` should have length equal to the number of regexes being
1134 /// searched.
1135 ///
1136 /// This is only useful when one wants to know which regexes in a set
1137 /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1138 pub fn many_matches_at(
1139 &self,
1140 matches: &mut [bool],
1141 text: &[u8],
1142 start: usize,
1143 ) -> bool {
1144 use self::MatchType::*;
1145 if !self.is_anchor_end_match(text) {
1146 return false;
1147 }
1148 match self.ro.match_type {
1149 #[cfg(feature = "perf-literal")]
1150 Literal(ty) => {
1151 debug_assert_eq!(matches.len(), 1);
1152 matches[0] = self.find_literals(ty, text, start).is_some();
1153 matches[0]
1154 }
1155 #[cfg(feature = "perf-dfa")]
1156 Dfa | DfaAnchoredReverse | DfaMany => {
1157 match dfa::Fsm::forward_many(
1158 &self.ro.dfa,
1159 self.cache.value(),
1160 matches,
1161 text,
1162 start,
1163 ) {
1164 dfa::Result::Match(_) => true,
1165 dfa::Result::NoMatch(_) => false,
1166 dfa::Result::Quit => self.exec_nfa(
1167 MatchNfaType::Auto,
1168 matches,
1169 &mut [],
1170 false,
1171 false,
1172 text,
1173 start,
1174 text.len(),
1175 ),
1176 }
1177 }
1178 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1179 DfaSuffix => {
1180 match dfa::Fsm::forward_many(
1181 &self.ro.dfa,
1182 self.cache.value(),
1183 matches,
1184 text,
1185 start,
1186 ) {
1187 dfa::Result::Match(_) => true,
1188 dfa::Result::NoMatch(_) => false,
1189 dfa::Result::Quit => self.exec_nfa(
1190 MatchNfaType::Auto,
1191 matches,
1192 &mut [],
1193 false,
1194 false,
1195 text,
1196 start,
1197 text.len(),
1198 ),
1199 }
1200 }
1201 Nfa(ty) => self.exec_nfa(
1202 ty,
1203 matches,
1204 &mut [],
1205 false,
1206 false,
1207 text,
1208 start,
1209 text.len(),
1210 ),
1211 Nothing => false,
1212 }
1213 }
1214
1215 #[cfg_attr(feature = "perf-inline", inline(always))]
is_anchor_end_match(&self, text: &[u8]) -> bool1216 fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1217 #[cfg(not(feature = "perf-literal"))]
1218 fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1219 true
1220 }
1221
1222 #[cfg(feature = "perf-literal")]
1223 fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1224 // Only do this check if the haystack is big (>1MB).
1225 if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1226 let lcs = ro.suffixes.lcs();
1227 if lcs.len() >= 1 && !lcs.is_suffix(text) {
1228 return false;
1229 }
1230 }
1231 true
1232 }
1233
1234 imp(&self.ro, text)
1235 }
1236
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1237 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1238 &self.ro.nfa.capture_name_idx
1239 }
1240 }
1241
1242 impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1243 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1244 self.0.capture_name_idx()
1245 }
1246 }
1247
1248 impl Exec {
1249 /// Get a searcher that isn't Sync.
1250 #[cfg_attr(feature = "perf-inline", inline(always))]
searcher(&self) -> ExecNoSync1251 pub fn searcher(&self) -> ExecNoSync {
1252 let create = || RefCell::new(ProgramCacheInner::new(&self.ro));
1253 ExecNoSync {
1254 ro: &self.ro, // a clone is too expensive here! (and not needed)
1255 cache: self.cache.get_or(create),
1256 }
1257 }
1258
1259 /// Get a searcher that isn't Sync and can match on &str.
1260 #[cfg_attr(feature = "perf-inline", inline(always))]
searcher_str(&self) -> ExecNoSyncStr1261 pub fn searcher_str(&self) -> ExecNoSyncStr {
1262 ExecNoSyncStr(self.searcher())
1263 }
1264
1265 /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1266 pub fn into_regex(self) -> re_unicode::Regex {
1267 re_unicode::Regex::from(self)
1268 }
1269
1270 /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1271 pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1272 re_set::unicode::RegexSet::from(self)
1273 }
1274
1275 /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1276 pub fn into_byte_regex(self) -> re_bytes::Regex {
1277 re_bytes::Regex::from(self)
1278 }
1279
1280 /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1281 pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1282 re_set::bytes::RegexSet::from(self)
1283 }
1284
1285 /// The original regular expressions given by the caller that were
1286 /// compiled.
regex_strings(&self) -> &[String]1287 pub fn regex_strings(&self) -> &[String] {
1288 &self.ro.res
1289 }
1290
1291 /// Return a slice of capture names.
1292 ///
1293 /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1294 pub fn capture_names(&self) -> &[Option<String>] {
1295 &self.ro.nfa.captures
1296 }
1297
1298 /// Return a reference to named groups mapping (from group name to
1299 /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1300 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1301 &self.ro.nfa.capture_name_idx
1302 }
1303 }
1304
1305 impl Clone for Exec {
clone(&self) -> Exec1306 fn clone(&self) -> Exec {
1307 Exec { ro: self.ro.clone(), cache: Cached::new() }
1308 }
1309 }
1310
1311 impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1312 fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1313 if let Some(MatchType::Nfa(_)) = hint {
1314 return hint.unwrap();
1315 }
1316 // If the NFA is empty, then we'll never match anything.
1317 if self.nfa.insts.is_empty() {
1318 return MatchType::Nothing;
1319 }
1320 if let Some(literalty) = self.choose_literal_match_type() {
1321 return literalty;
1322 }
1323 if let Some(dfaty) = self.choose_dfa_match_type() {
1324 return dfaty;
1325 }
1326 // We're so totally hosed.
1327 MatchType::Nfa(MatchNfaType::Auto)
1328 }
1329
1330 /// If a plain literal scan can be used, then a corresponding literal
1331 /// search type is returned.
choose_literal_match_type(&self) -> Option<MatchType>1332 fn choose_literal_match_type(&self) -> Option<MatchType> {
1333 #[cfg(not(feature = "perf-literal"))]
1334 fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1335 None
1336 }
1337
1338 #[cfg(feature = "perf-literal")]
1339 fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1340 // If our set of prefixes is complete, then we can use it to find
1341 // a match in lieu of a regex engine. This doesn't quite work well
1342 // in the presence of multiple regexes, so only do it when there's
1343 // one.
1344 //
1345 // TODO(burntsushi): Also, don't try to match literals if the regex
1346 // is partially anchored. We could technically do it, but we'd need
1347 // to create two sets of literals: all of them and then the subset
1348 // that aren't anchored. We would then only search for all of them
1349 // when at the beginning of the input and use the subset in all
1350 // other cases.
1351 if ro.res.len() != 1 {
1352 return None;
1353 }
1354 if ro.ac.is_some() {
1355 return Some(MatchType::Literal(
1356 MatchLiteralType::AhoCorasick,
1357 ));
1358 }
1359 if ro.nfa.prefixes.complete() {
1360 return if ro.nfa.is_anchored_start {
1361 Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1362 } else {
1363 Some(MatchType::Literal(MatchLiteralType::Unanchored))
1364 };
1365 }
1366 if ro.suffixes.complete() {
1367 return if ro.nfa.is_anchored_end {
1368 Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1369 } else {
1370 // This case shouldn't happen. When the regex isn't
1371 // anchored, then complete prefixes should imply complete
1372 // suffixes.
1373 Some(MatchType::Literal(MatchLiteralType::Unanchored))
1374 };
1375 }
1376 None
1377 }
1378
1379 imp(self)
1380 }
1381
1382 /// If a DFA scan can be used, then choose the appropriate DFA strategy.
choose_dfa_match_type(&self) -> Option<MatchType>1383 fn choose_dfa_match_type(&self) -> Option<MatchType> {
1384 #[cfg(not(feature = "perf-dfa"))]
1385 fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1386 None
1387 }
1388
1389 #[cfg(feature = "perf-dfa")]
1390 fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1391 if !dfa::can_exec(&ro.dfa) {
1392 return None;
1393 }
1394 // Regex sets require a slightly specialized path.
1395 if ro.res.len() >= 2 {
1396 return Some(MatchType::DfaMany);
1397 }
1398 // If the regex is anchored at the end but not the start, then
1399 // just match in reverse from the end of the haystack.
1400 if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1401 return Some(MatchType::DfaAnchoredReverse);
1402 }
1403 #[cfg(feature = "perf-literal")]
1404 {
1405 // If there's a longish suffix literal, then it might be faster
1406 // to look for that first.
1407 if ro.should_suffix_scan() {
1408 return Some(MatchType::DfaSuffix);
1409 }
1410 }
1411 // Fall back to your garden variety forward searching lazy DFA.
1412 Some(MatchType::Dfa)
1413 }
1414
1415 imp(self)
1416 }
1417
1418 /// Returns true if the program is amenable to suffix scanning.
1419 ///
1420 /// When this is true, as a heuristic, we assume it is OK to quickly scan
1421 /// for suffix literals and then do a *reverse* DFA match from any matches
1422 /// produced by the literal scan. (And then followed by a forward DFA
1423 /// search, since the previously found suffix literal maybe not actually be
1424 /// the end of a match.)
1425 ///
1426 /// This is a bit of a specialized optimization, but can result in pretty
1427 /// big performance wins if 1) there are no prefix literals and 2) the
1428 /// suffix literals are pretty rare in the text. (1) is obviously easy to
1429 /// account for but (2) is harder. As a proxy, we assume that longer
1430 /// strings are generally rarer, so we only enable this optimization when
1431 /// we have a meaty suffix.
1432 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
should_suffix_scan(&self) -> bool1433 fn should_suffix_scan(&self) -> bool {
1434 if self.suffixes.is_empty() {
1435 return false;
1436 }
1437 let lcs_len = self.suffixes.lcs().char_len();
1438 lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1439 }
1440 }
1441
1442 #[derive(Clone, Copy, Debug)]
1443 enum MatchType {
1444 /// A single or multiple literal search. This is only used when the regex
1445 /// can be decomposed into a literal search.
1446 #[cfg(feature = "perf-literal")]
1447 Literal(MatchLiteralType),
1448 /// A normal DFA search.
1449 #[cfg(feature = "perf-dfa")]
1450 Dfa,
1451 /// A reverse DFA search starting from the end of a haystack.
1452 #[cfg(feature = "perf-dfa")]
1453 DfaAnchoredReverse,
1454 /// A reverse DFA search with suffix literal scanning.
1455 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1456 DfaSuffix,
1457 /// Use the DFA on two or more regular expressions.
1458 #[cfg(feature = "perf-dfa")]
1459 DfaMany,
1460 /// An NFA variant.
1461 Nfa(MatchNfaType),
1462 /// No match is ever possible, so don't ever try to search.
1463 Nothing,
1464 }
1465
1466 #[derive(Clone, Copy, Debug)]
1467 #[cfg(feature = "perf-literal")]
1468 enum MatchLiteralType {
1469 /// Match literals anywhere in text.
1470 Unanchored,
1471 /// Match literals only at the start of text.
1472 AnchoredStart,
1473 /// Match literals only at the end of text.
1474 AnchoredEnd,
1475 /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1476 /// ExecReadOnly.
1477 AhoCorasick,
1478 }
1479
1480 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1481 enum MatchNfaType {
1482 /// Choose between Backtrack and PikeVM.
1483 Auto,
1484 /// NFA bounded backtracking.
1485 ///
1486 /// (This is only set by tests, since it never makes sense to always want
1487 /// backtracking.)
1488 Backtrack,
1489 /// The Pike VM.
1490 ///
1491 /// (This is only set by tests, since it never makes sense to always want
1492 /// the Pike VM.)
1493 PikeVM,
1494 }
1495
1496 /// `ProgramCache` maintains reusable allocations for each matching engine
1497 /// available to a particular program.
1498 pub type ProgramCache = RefCell<ProgramCacheInner>;
1499
1500 #[derive(Debug)]
1501 pub struct ProgramCacheInner {
1502 pub pikevm: pikevm::Cache,
1503 pub backtrack: backtrack::Cache,
1504 #[cfg(feature = "perf-dfa")]
1505 pub dfa: dfa::Cache,
1506 #[cfg(feature = "perf-dfa")]
1507 pub dfa_reverse: dfa::Cache,
1508 }
1509
1510 impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1511 fn new(ro: &ExecReadOnly) -> Self {
1512 ProgramCacheInner {
1513 pikevm: pikevm::Cache::new(&ro.nfa),
1514 backtrack: backtrack::Cache::new(&ro.nfa),
1515 #[cfg(feature = "perf-dfa")]
1516 dfa: dfa::Cache::new(&ro.dfa),
1517 #[cfg(feature = "perf-dfa")]
1518 dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1519 }
1520 }
1521 }
1522
1523 /// Alternation literals checks if the given HIR is a simple alternation of
1524 /// literals, and if so, returns them. Otherwise, this returns None.
1525 #[cfg(feature = "perf-literal")]
alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>>1526 fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1527 use syntax::hir::{HirKind, Literal};
1528
1529 // This is pretty hacky, but basically, if `is_alternation_literal` is
1530 // true, then we can make several assumptions about the structure of our
1531 // HIR. This is what justifies the `unreachable!` statements below.
1532 //
1533 // This code should be refactored once we overhaul this crate's
1534 // optimization pipeline, because this is a terribly inflexible way to go
1535 // about things.
1536
1537 if !expr.is_alternation_literal() {
1538 return None;
1539 }
1540 let alts = match *expr.kind() {
1541 HirKind::Alternation(ref alts) => alts,
1542 _ => return None, // one literal isn't worth it
1543 };
1544
1545 let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
1546 Literal::Unicode(c) => {
1547 let mut buf = [0; 4];
1548 dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1549 }
1550 Literal::Byte(b) => {
1551 dst.push(b);
1552 }
1553 };
1554
1555 let mut lits = vec![];
1556 for alt in alts {
1557 let mut lit = vec![];
1558 match *alt.kind() {
1559 HirKind::Literal(ref x) => extendlit(x, &mut lit),
1560 HirKind::Concat(ref exprs) => {
1561 for e in exprs {
1562 match *e.kind() {
1563 HirKind::Literal(ref x) => extendlit(x, &mut lit),
1564 _ => unreachable!("expected literal, got {:?}", e),
1565 }
1566 }
1567 }
1568 _ => unreachable!("expected literal or concat, got {:?}", alt),
1569 }
1570 lits.push(lit);
1571 }
1572 Some(lits)
1573 }
1574
1575 #[cfg(test)]
1576 mod test {
1577 #[test]
uppercut_s_backtracking_bytes_default_bytes_mismatch()1578 fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1579 use internal::ExecBuilder;
1580
1581 let backtrack_bytes_re = ExecBuilder::new("^S")
1582 .bounded_backtracking()
1583 .only_utf8(false)
1584 .build()
1585 .map(|exec| exec.into_byte_regex())
1586 .map_err(|err| format!("{}", err))
1587 .unwrap();
1588
1589 let default_bytes_re = ExecBuilder::new("^S")
1590 .only_utf8(false)
1591 .build()
1592 .map(|exec| exec.into_byte_regex())
1593 .map_err(|err| format!("{}", err))
1594 .unwrap();
1595
1596 let input = vec![83, 83];
1597
1598 let s1 = backtrack_bytes_re.split(&input);
1599 let s2 = default_bytes_re.split(&input);
1600 for (chunk1, chunk2) in s1.zip(s2) {
1601 assert_eq!(chunk1, chunk2);
1602 }
1603 }
1604
1605 #[test]
unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1606 fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1607 use internal::ExecBuilder;
1608
1609 let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1610 .bounded_backtracking()
1611 .bytes(true)
1612 .build()
1613 .map(|exec| exec.into_regex())
1614 .map_err(|err| format!("{}", err))
1615 .unwrap();
1616
1617 let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1618 .bytes(true)
1619 .build()
1620 .map(|exec| exec.into_regex())
1621 .map_err(|err| format!("{}", err))
1622 .unwrap();
1623
1624 let input = "**";
1625
1626 let s1 = backtrack_bytes_re.split(input);
1627 let s2 = default_bytes_re.split(input);
1628 for (chunk1, chunk2) in s1.zip(s2) {
1629 assert_eq!(chunk1, chunk2);
1630 }
1631 }
1632 }
1633