1 use std::cell::RefCell;
2 use std::collections::HashMap;
3 use std::sync::Arc;
4
5 #[cfg(feature = "perf-literal")]
6 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
7 use syntax::hir::literal::Literals;
8 use syntax::hir::Hir;
9 use syntax::ParserBuilder;
10
11 use backtrack;
12 use cache::{Cached, CachedGuard};
13 use compile::Compiler;
14 #[cfg(feature = "perf-dfa")]
15 use dfa;
16 use error::Error;
17 use input::{ByteInput, CharInput};
18 use literal::LiteralSearcher;
19 use pikevm;
20 use prog::Program;
21 use re_builder::RegexOptions;
22 use re_bytes;
23 use re_set;
24 use re_trait::{Locations, RegularExpression, Slot};
25 use re_unicode;
26 use utf8::next_utf8;
27
28 /// `Exec` manages the execution of a regular expression.
29 ///
30 /// In particular, this manages the various compiled forms of a single regular
31 /// expression and the choice of which matching engine to use to execute a
32 /// regular expression.
33 #[derive(Debug)]
34 pub struct Exec {
35 /// All read only state.
36 ro: Arc<ExecReadOnly>,
37 /// Caches for the various matching engines.
38 cache: Cached<ProgramCache>,
39 }
40
41 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
42 /// means it is no longer Sync, but we can now avoid the overhead of
43 /// synchronization to fetch the cache.
44 #[derive(Debug)]
45 pub struct ExecNoSync<'c> {
46 /// All read only state.
47 ro: &'c Arc<ExecReadOnly>,
48 /// Caches for the various matching engines.
49 cache: CachedGuard<'c, ProgramCache>,
50 }
51
52 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
53 #[derive(Debug)]
54 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
55
56 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
57 /// state is determined at compile time and never changes during search.
58 #[derive(Debug)]
59 struct ExecReadOnly {
60 /// The original regular expressions given by the caller to compile.
61 res: Vec<String>,
62 /// A compiled program that is used in the NFA simulation and backtracking.
63 /// It can be byte-based or Unicode codepoint based.
64 ///
65 /// N.B. It is not possibly to make this byte-based from the public API.
66 /// It is only used for testing byte based programs in the NFA simulations.
67 nfa: Program,
68 /// A compiled byte based program for DFA execution. This is only used
69 /// if a DFA can be executed. (Currently, only word boundary assertions are
70 /// not supported.) Note that this program contains an embedded `.*?`
71 /// preceding the first capture group, unless the regex is anchored at the
72 /// beginning.
73 dfa: Program,
74 /// The same as above, except the program is reversed (and there is no
75 /// preceding `.*?`). This is used by the DFA to find the starting location
76 /// of matches.
77 dfa_reverse: Program,
78 /// A set of suffix literals extracted from the regex.
79 ///
80 /// Prefix literals are stored on the `Program`, since they are used inside
81 /// the matching engines.
82 suffixes: LiteralSearcher,
83 /// An Aho-Corasick automaton with leftmost-first match semantics.
84 ///
85 /// This is only set when the entire regex is a simple unanchored
86 /// alternation of literals. We could probably use it more circumstances,
87 /// but this is already hacky enough in this architecture.
88 ///
89 /// N.B. We use u32 as a state ID representation under the assumption that
90 /// if we were to exhaust the ID space, we probably would have long
91 /// surpassed the compilation size limit.
92 #[cfg(feature = "perf-literal")]
93 ac: Option<AhoCorasick<u32>>,
94 /// match_type encodes as much upfront knowledge about how we're going to
95 /// execute a search as possible.
96 match_type: MatchType,
97 }
98
99 /// Facilitates the construction of an executor by exposing various knobs
100 /// to control how a regex is executed and what kinds of resources it's
101 /// permitted to use.
102 // `ExecBuilder` is only public via the `internal` module, so avoid deriving
103 // `Debug`.
104 #[allow(missing_debug_implementations)]
105 pub struct ExecBuilder {
106 options: RegexOptions,
107 match_type: Option<MatchType>,
108 bytes: bool,
109 only_utf8: bool,
110 }
111
112 /// Parsed represents a set of parsed regular expressions and their detected
113 /// literals.
114 struct Parsed {
115 exprs: Vec<Hir>,
116 prefixes: Literals,
117 suffixes: Literals,
118 bytes: bool,
119 }
120
121 impl ExecBuilder {
122 /// Create a regex execution builder.
123 ///
124 /// This uses default settings for everything except the regex itself,
125 /// which must be provided. Further knobs can be set by calling methods,
126 /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self127 pub fn new(re: &str) -> Self {
128 Self::new_many(&[re])
129 }
130
131 /// Like new, but compiles the union of the given regular expressions.
132 ///
133 /// Note that when compiling 2 or more regular expressions, capture groups
134 /// are completely unsupported. (This means both `find` and `captures`
135 /// wont work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item = S>,136 pub fn new_many<I, S>(res: I) -> Self
137 where
138 S: AsRef<str>,
139 I: IntoIterator<Item = S>,
140 {
141 let mut opts = RegexOptions::default();
142 opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
143 Self::new_options(opts)
144 }
145
146 /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self147 pub fn new_options(opts: RegexOptions) -> Self {
148 ExecBuilder {
149 options: opts,
150 match_type: None,
151 bytes: false,
152 only_utf8: true,
153 }
154 }
155
156 /// Set the matching engine to be automatically determined.
157 ///
158 /// This is the default state and will apply whatever optimizations are
159 /// possible, such as running a DFA.
160 ///
161 /// This overrides whatever was previously set via the `nfa` or
162 /// `bounded_backtracking` methods.
automatic(mut self) -> Self163 pub fn automatic(mut self) -> Self {
164 self.match_type = None;
165 self
166 }
167
168 /// Sets the matching engine to use the NFA algorithm no matter what
169 /// optimizations are possible.
170 ///
171 /// This overrides whatever was previously set via the `automatic` or
172 /// `bounded_backtracking` methods.
nfa(mut self) -> Self173 pub fn nfa(mut self) -> Self {
174 self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
175 self
176 }
177
178 /// Sets the matching engine to use a bounded backtracking engine no
179 /// matter what optimizations are possible.
180 ///
181 /// One must use this with care, since the bounded backtracking engine
182 /// uses memory proportion to `len(regex) * len(text)`.
183 ///
184 /// This overrides whatever was previously set via the `automatic` or
185 /// `nfa` methods.
bounded_backtracking(mut self) -> Self186 pub fn bounded_backtracking(mut self) -> Self {
187 self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
188 self
189 }
190
191 /// Compiles byte based programs for use with the NFA matching engines.
192 ///
193 /// By default, the NFA engines match on Unicode scalar values. They can
194 /// be made to use byte based programs instead. In general, the byte based
195 /// programs are slower because of a less efficient encoding of character
196 /// classes.
197 ///
198 /// Note that this does not impact DFA matching engines, which always
199 /// execute on bytes.
bytes(mut self, yes: bool) -> Self200 pub fn bytes(mut self, yes: bool) -> Self {
201 self.bytes = yes;
202 self
203 }
204
205 /// When disabled, the program compiled may match arbitrary bytes.
206 ///
207 /// When enabled (the default), all compiled programs exclusively match
208 /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self209 pub fn only_utf8(mut self, yes: bool) -> Self {
210 self.only_utf8 = yes;
211 self
212 }
213
214 /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self215 pub fn unicode(mut self, yes: bool) -> Self {
216 self.options.unicode = yes;
217 self
218 }
219
220 /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>221 fn parse(&self) -> Result<Parsed, Error> {
222 let mut exprs = Vec::with_capacity(self.options.pats.len());
223 let mut prefixes = Some(Literals::empty());
224 let mut suffixes = Some(Literals::empty());
225 let mut bytes = false;
226 let is_set = self.options.pats.len() > 1;
227 // If we're compiling a regex set and that set has any anchored
228 // expressions, then disable all literal optimizations.
229 for pat in &self.options.pats {
230 let mut parser = ParserBuilder::new()
231 .octal(self.options.octal)
232 .case_insensitive(self.options.case_insensitive)
233 .multi_line(self.options.multi_line)
234 .dot_matches_new_line(self.options.dot_matches_new_line)
235 .swap_greed(self.options.swap_greed)
236 .ignore_whitespace(self.options.ignore_whitespace)
237 .unicode(self.options.unicode)
238 .allow_invalid_utf8(!self.only_utf8)
239 .nest_limit(self.options.nest_limit)
240 .build();
241 let expr =
242 parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
243 bytes = bytes || !expr.is_always_utf8();
244
245 if cfg!(feature = "perf-literal") {
246 if !expr.is_anchored_start() && expr.is_any_anchored_start() {
247 // Partial anchors unfortunately make it hard to use
248 // prefixes, so disable them.
249 prefixes = None;
250 } else if is_set && expr.is_anchored_start() {
251 // Regex sets with anchors do not go well with literal
252 // optimizations.
253 prefixes = None;
254 }
255 prefixes = prefixes.and_then(|mut prefixes| {
256 if !prefixes.union_prefixes(&expr) {
257 None
258 } else {
259 Some(prefixes)
260 }
261 });
262
263 if !expr.is_anchored_end() && expr.is_any_anchored_end() {
264 // Partial anchors unfortunately make it hard to use
265 // suffixes, so disable them.
266 suffixes = None;
267 } else if is_set && expr.is_anchored_end() {
268 // Regex sets with anchors do not go well with literal
269 // optimizations.
270 suffixes = None;
271 }
272 suffixes = suffixes.and_then(|mut suffixes| {
273 if !suffixes.union_suffixes(&expr) {
274 None
275 } else {
276 Some(suffixes)
277 }
278 });
279 }
280 exprs.push(expr);
281 }
282 Ok(Parsed {
283 exprs: exprs,
284 prefixes: prefixes.unwrap_or_else(Literals::empty),
285 suffixes: suffixes.unwrap_or_else(Literals::empty),
286 bytes: bytes,
287 })
288 }
289
290 /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>291 pub fn build(self) -> Result<Exec, Error> {
292 // Special case when we have no patterns to compile.
293 // This can happen when compiling a regex set.
294 if self.options.pats.is_empty() {
295 let ro = Arc::new(ExecReadOnly {
296 res: vec![],
297 nfa: Program::new(),
298 dfa: Program::new(),
299 dfa_reverse: Program::new(),
300 suffixes: LiteralSearcher::empty(),
301 #[cfg(feature = "perf-literal")]
302 ac: None,
303 match_type: MatchType::Nothing,
304 });
305 return Ok(Exec { ro: ro, cache: Cached::new() });
306 }
307 let parsed = self.parse()?;
308 let mut nfa = Compiler::new()
309 .size_limit(self.options.size_limit)
310 .bytes(self.bytes || parsed.bytes)
311 .only_utf8(self.only_utf8)
312 .compile(&parsed.exprs)?;
313 let mut dfa = Compiler::new()
314 .size_limit(self.options.size_limit)
315 .dfa(true)
316 .only_utf8(self.only_utf8)
317 .compile(&parsed.exprs)?;
318 let mut dfa_reverse = Compiler::new()
319 .size_limit(self.options.size_limit)
320 .dfa(true)
321 .only_utf8(self.only_utf8)
322 .reverse(true)
323 .compile(&parsed.exprs)?;
324
325 #[cfg(feature = "perf-literal")]
326 let ac = self.build_aho_corasick(&parsed);
327 nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
328 dfa.prefixes = nfa.prefixes.clone();
329 dfa.dfa_size_limit = self.options.dfa_size_limit;
330 dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
331
332 let mut ro = ExecReadOnly {
333 res: self.options.pats,
334 nfa: nfa,
335 dfa: dfa,
336 dfa_reverse: dfa_reverse,
337 suffixes: LiteralSearcher::suffixes(parsed.suffixes),
338 #[cfg(feature = "perf-literal")]
339 ac: ac,
340 match_type: MatchType::Nothing,
341 };
342 ro.match_type = ro.choose_match_type(self.match_type);
343
344 let ro = Arc::new(ro);
345 Ok(Exec { ro: ro, cache: Cached::new() })
346 }
347
348 #[cfg(feature = "perf-literal")]
build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>>349 fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
350 if parsed.exprs.len() != 1 {
351 return None;
352 }
353 let lits = match alternation_literals(&parsed.exprs[0]) {
354 None => return None,
355 Some(lits) => lits,
356 };
357 // If we have a small number of literals, then let Teddy handle
358 // things (see literal/mod.rs).
359 if lits.len() <= 32 {
360 return None;
361 }
362 Some(
363 AhoCorasickBuilder::new()
364 .match_kind(MatchKind::LeftmostFirst)
365 .auto_configure(&lits)
366 // We always want this to reduce size, regardless
367 // of what auto-configure does.
368 .byte_classes(true)
369 .build_with_size::<u32, _, _>(&lits)
370 // This should never happen because we'd long exceed the
371 // compilation limit for regexes first.
372 .expect("AC automaton too big"),
373 )
374 }
375 }
376
377 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
378 type Text = str;
379
slots_len(&self) -> usize380 fn slots_len(&self) -> usize {
381 self.0.slots_len()
382 }
383
next_after_empty(&self, text: &str, i: usize) -> usize384 fn next_after_empty(&self, text: &str, i: usize) -> usize {
385 next_utf8(text.as_bytes(), i)
386 }
387
388 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>389 fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
390 self.0.shortest_match_at(text.as_bytes(), start)
391 }
392
393 #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &str, start: usize) -> bool394 fn is_match_at(&self, text: &str, start: usize) -> bool {
395 self.0.is_match_at(text.as_bytes(), start)
396 }
397
398 #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>399 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
400 self.0.find_at(text.as_bytes(), start)
401 }
402
403 #[cfg_attr(feature = "perf-inline", inline(always))]
captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>404 fn captures_read_at(
405 &self,
406 locs: &mut Locations,
407 text: &str,
408 start: usize,
409 ) -> Option<(usize, usize)> {
410 self.0.captures_read_at(locs, text.as_bytes(), start)
411 }
412 }
413
414 impl<'c> RegularExpression for ExecNoSync<'c> {
415 type Text = [u8];
416
417 /// Returns the number of capture slots in the regular expression. (There
418 /// are two slots for every capture group, corresponding to possibly empty
419 /// start and end locations of the capture.)
slots_len(&self) -> usize420 fn slots_len(&self) -> usize {
421 self.ro.nfa.captures.len() * 2
422 }
423
next_after_empty(&self, _text: &[u8], i: usize) -> usize424 fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
425 i + 1
426 }
427
428 /// Returns the end of a match location, possibly occurring before the
429 /// end location of the correct leftmost-first match.
430 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>431 fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
432 if !self.is_anchor_end_match(text) {
433 return None;
434 }
435 match self.ro.match_type {
436 #[cfg(feature = "perf-literal")]
437 MatchType::Literal(ty) => {
438 self.find_literals(ty, text, start).map(|(_, e)| e)
439 }
440 #[cfg(feature = "perf-dfa")]
441 MatchType::Dfa | MatchType::DfaMany => {
442 match self.shortest_dfa(text, start) {
443 dfa::Result::Match(end) => Some(end),
444 dfa::Result::NoMatch(_) => None,
445 dfa::Result::Quit => self.shortest_nfa(text, start),
446 }
447 }
448 #[cfg(feature = "perf-dfa")]
449 MatchType::DfaAnchoredReverse => {
450 match dfa::Fsm::reverse(
451 &self.ro.dfa_reverse,
452 self.cache.value(),
453 true,
454 &text[start..],
455 text.len(),
456 ) {
457 dfa::Result::Match(_) => Some(text.len()),
458 dfa::Result::NoMatch(_) => None,
459 dfa::Result::Quit => self.shortest_nfa(text, start),
460 }
461 }
462 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
463 MatchType::DfaSuffix => {
464 match self.shortest_dfa_reverse_suffix(text, start) {
465 dfa::Result::Match(e) => Some(e),
466 dfa::Result::NoMatch(_) => None,
467 dfa::Result::Quit => self.shortest_nfa(text, start),
468 }
469 }
470 MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
471 MatchType::Nothing => None,
472 }
473 }
474
475 /// Returns true if and only if the regex matches text.
476 ///
477 /// For single regular expressions, this is equivalent to calling
478 /// shortest_match(...).is_some().
479 #[cfg_attr(feature = "perf-inline", inline(always))]
is_match_at(&self, text: &[u8], start: usize) -> bool480 fn is_match_at(&self, text: &[u8], start: usize) -> bool {
481 if !self.is_anchor_end_match(text) {
482 return false;
483 }
484 // We need to do this dance because shortest_match relies on the NFA
485 // filling in captures[1], but a RegexSet has no captures. In other
486 // words, a RegexSet can't (currently) use shortest_match. ---AG
487 match self.ro.match_type {
488 #[cfg(feature = "perf-literal")]
489 MatchType::Literal(ty) => {
490 self.find_literals(ty, text, start).is_some()
491 }
492 #[cfg(feature = "perf-dfa")]
493 MatchType::Dfa | MatchType::DfaMany => {
494 match self.shortest_dfa(text, start) {
495 dfa::Result::Match(_) => true,
496 dfa::Result::NoMatch(_) => false,
497 dfa::Result::Quit => self.match_nfa(text, start),
498 }
499 }
500 #[cfg(feature = "perf-dfa")]
501 MatchType::DfaAnchoredReverse => {
502 match dfa::Fsm::reverse(
503 &self.ro.dfa_reverse,
504 self.cache.value(),
505 true,
506 &text[start..],
507 text.len(),
508 ) {
509 dfa::Result::Match(_) => true,
510 dfa::Result::NoMatch(_) => false,
511 dfa::Result::Quit => self.match_nfa(text, start),
512 }
513 }
514 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
515 MatchType::DfaSuffix => {
516 match self.shortest_dfa_reverse_suffix(text, start) {
517 dfa::Result::Match(_) => true,
518 dfa::Result::NoMatch(_) => false,
519 dfa::Result::Quit => self.match_nfa(text, start),
520 }
521 }
522 MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
523 MatchType::Nothing => false,
524 }
525 }
526
527 /// Finds the start and end location of the leftmost-first match, starting
528 /// at the given location.
529 #[cfg_attr(feature = "perf-inline", inline(always))]
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>530 fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
531 if !self.is_anchor_end_match(text) {
532 return None;
533 }
534 match self.ro.match_type {
535 #[cfg(feature = "perf-literal")]
536 MatchType::Literal(ty) => self.find_literals(ty, text, start),
537 #[cfg(feature = "perf-dfa")]
538 MatchType::Dfa => match self.find_dfa_forward(text, start) {
539 dfa::Result::Match((s, e)) => Some((s, e)),
540 dfa::Result::NoMatch(_) => None,
541 dfa::Result::Quit => {
542 self.find_nfa(MatchNfaType::Auto, text, start)
543 }
544 },
545 #[cfg(feature = "perf-dfa")]
546 MatchType::DfaAnchoredReverse => {
547 match self.find_dfa_anchored_reverse(text, start) {
548 dfa::Result::Match((s, e)) => Some((s, e)),
549 dfa::Result::NoMatch(_) => None,
550 dfa::Result::Quit => {
551 self.find_nfa(MatchNfaType::Auto, text, start)
552 }
553 }
554 }
555 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
556 MatchType::DfaSuffix => {
557 match self.find_dfa_reverse_suffix(text, start) {
558 dfa::Result::Match((s, e)) => Some((s, e)),
559 dfa::Result::NoMatch(_) => None,
560 dfa::Result::Quit => {
561 self.find_nfa(MatchNfaType::Auto, text, start)
562 }
563 }
564 }
565 MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
566 MatchType::Nothing => None,
567 #[cfg(feature = "perf-dfa")]
568 MatchType::DfaMany => {
569 unreachable!("BUG: RegexSet cannot be used with find")
570 }
571 }
572 }
573
574 /// Finds the start and end location of the leftmost-first match and also
575 /// fills in all matching capture groups.
576 ///
577 /// The number of capture slots given should be equal to the total number
578 /// of capture slots in the compiled program.
579 ///
580 /// Note that the first two slots always correspond to the start and end
581 /// locations of the overall match.
captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>582 fn captures_read_at(
583 &self,
584 locs: &mut Locations,
585 text: &[u8],
586 start: usize,
587 ) -> Option<(usize, usize)> {
588 let slots = locs.as_slots();
589 for slot in slots.iter_mut() {
590 *slot = None;
591 }
592 // If the caller unnecessarily uses this, then we try to save them
593 // from themselves.
594 match slots.len() {
595 0 => return self.find_at(text, start),
596 2 => {
597 return self.find_at(text, start).map(|(s, e)| {
598 slots[0] = Some(s);
599 slots[1] = Some(e);
600 (s, e)
601 });
602 }
603 _ => {} // fallthrough
604 }
605 if !self.is_anchor_end_match(text) {
606 return None;
607 }
608 match self.ro.match_type {
609 #[cfg(feature = "perf-literal")]
610 MatchType::Literal(ty) => {
611 self.find_literals(ty, text, start).and_then(|(s, e)| {
612 self.captures_nfa_type(
613 MatchNfaType::Auto,
614 slots,
615 text,
616 s,
617 e,
618 )
619 })
620 }
621 #[cfg(feature = "perf-dfa")]
622 MatchType::Dfa => {
623 if self.ro.nfa.is_anchored_start {
624 self.captures_nfa(slots, text, start)
625 } else {
626 match self.find_dfa_forward(text, start) {
627 dfa::Result::Match((s, e)) => self.captures_nfa_type(
628 MatchNfaType::Auto,
629 slots,
630 text,
631 s,
632 e,
633 ),
634 dfa::Result::NoMatch(_) => None,
635 dfa::Result::Quit => {
636 self.captures_nfa(slots, text, start)
637 }
638 }
639 }
640 }
641 #[cfg(feature = "perf-dfa")]
642 MatchType::DfaAnchoredReverse => {
643 match self.find_dfa_anchored_reverse(text, start) {
644 dfa::Result::Match((s, e)) => self.captures_nfa_type(
645 MatchNfaType::Auto,
646 slots,
647 text,
648 s,
649 e,
650 ),
651 dfa::Result::NoMatch(_) => None,
652 dfa::Result::Quit => self.captures_nfa(slots, text, start),
653 }
654 }
655 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
656 MatchType::DfaSuffix => {
657 match self.find_dfa_reverse_suffix(text, start) {
658 dfa::Result::Match((s, e)) => self.captures_nfa_type(
659 MatchNfaType::Auto,
660 slots,
661 text,
662 s,
663 e,
664 ),
665 dfa::Result::NoMatch(_) => None,
666 dfa::Result::Quit => self.captures_nfa(slots, text, start),
667 }
668 }
669 MatchType::Nfa(ty) => {
670 self.captures_nfa_type(ty, slots, text, start, text.len())
671 }
672 MatchType::Nothing => None,
673 #[cfg(feature = "perf-dfa")]
674 MatchType::DfaMany => {
675 unreachable!("BUG: RegexSet cannot be used with captures")
676 }
677 }
678 }
679 }
680
681 impl<'c> ExecNoSync<'c> {
682 /// Finds the leftmost-first match using only literal search.
683 #[cfg(feature = "perf-literal")]
684 #[cfg_attr(feature = "perf-inline", inline(always))]
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>685 fn find_literals(
686 &self,
687 ty: MatchLiteralType,
688 text: &[u8],
689 start: usize,
690 ) -> Option<(usize, usize)> {
691 use self::MatchLiteralType::*;
692 match ty {
693 Unanchored => {
694 let lits = &self.ro.nfa.prefixes;
695 lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
696 }
697 AnchoredStart => {
698 let lits = &self.ro.nfa.prefixes;
699 if start == 0 || !self.ro.nfa.is_anchored_start {
700 lits.find_start(&text[start..])
701 .map(|(s, e)| (start + s, start + e))
702 } else {
703 None
704 }
705 }
706 AnchoredEnd => {
707 let lits = &self.ro.suffixes;
708 lits.find_end(&text[start..])
709 .map(|(s, e)| (start + s, start + e))
710 }
711 AhoCorasick => self
712 .ro
713 .ac
714 .as_ref()
715 .unwrap()
716 .find(&text[start..])
717 .map(|m| (start + m.start(), start + m.end())),
718 }
719 }
720
721 /// Finds the leftmost-first match (start and end) using only the DFA.
722 ///
723 /// If the result returned indicates that the DFA quit, then another
724 /// matching engine should be used.
725 #[cfg(feature = "perf-dfa")]
726 #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>727 fn find_dfa_forward(
728 &self,
729 text: &[u8],
730 start: usize,
731 ) -> dfa::Result<(usize, usize)> {
732 use dfa::Result::*;
733 let end = match dfa::Fsm::forward(
734 &self.ro.dfa,
735 self.cache.value(),
736 false,
737 text,
738 start,
739 ) {
740 NoMatch(i) => return NoMatch(i),
741 Quit => return Quit,
742 Match(end) if start == end => return Match((start, start)),
743 Match(end) => end,
744 };
745 // Now run the DFA in reverse to find the start of the match.
746 match dfa::Fsm::reverse(
747 &self.ro.dfa_reverse,
748 self.cache.value(),
749 false,
750 &text[start..],
751 end - start,
752 ) {
753 Match(s) => Match((start + s, end)),
754 NoMatch(i) => NoMatch(i),
755 Quit => Quit,
756 }
757 }
758
759 /// Finds the leftmost-first match (start and end) using only the DFA,
760 /// but assumes the regex is anchored at the end and therefore starts at
761 /// the end of the regex and matches in reverse.
762 ///
763 /// If the result returned indicates that the DFA quit, then another
764 /// matching engine should be used.
765 #[cfg(feature = "perf-dfa")]
766 #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>767 fn find_dfa_anchored_reverse(
768 &self,
769 text: &[u8],
770 start: usize,
771 ) -> dfa::Result<(usize, usize)> {
772 use dfa::Result::*;
773 match dfa::Fsm::reverse(
774 &self.ro.dfa_reverse,
775 self.cache.value(),
776 false,
777 &text[start..],
778 text.len() - start,
779 ) {
780 Match(s) => Match((start + s, text.len())),
781 NoMatch(i) => NoMatch(i),
782 Quit => Quit,
783 }
784 }
785
786 /// Finds the end of the shortest match using only the DFA.
787 #[cfg(feature = "perf-dfa")]
788 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>789 fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
790 dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
791 }
792
793 /// Finds the end of the shortest match using only the DFA by scanning for
794 /// suffix literals.
795 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
796 #[cfg_attr(feature = "perf-inline", inline(always))]
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>797 fn shortest_dfa_reverse_suffix(
798 &self,
799 text: &[u8],
800 start: usize,
801 ) -> dfa::Result<usize> {
802 match self.exec_dfa_reverse_suffix(text, start) {
803 None => self.shortest_dfa(text, start),
804 Some(r) => r.map(|(_, end)| end),
805 }
806 }
807
808 /// Finds the end of the shortest match using only the DFA by scanning for
809 /// suffix literals. It also reports the start of the match.
810 ///
811 /// Note that if None is returned, then the optimization gave up to avoid
812 /// worst case quadratic behavior. A forward scanning DFA should be tried
813 /// next.
814 ///
815 /// If a match is returned and the full leftmost-first match is desired,
816 /// then a forward scan starting from the beginning of the match must be
817 /// done.
818 ///
819 /// If the result returned indicates that the DFA quit, then another
820 /// matching engine should be used.
821 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
822 #[cfg_attr(feature = "perf-inline", inline(always))]
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>823 fn exec_dfa_reverse_suffix(
824 &self,
825 text: &[u8],
826 original_start: usize,
827 ) -> Option<dfa::Result<(usize, usize)>> {
828 use dfa::Result::*;
829
830 let lcs = self.ro.suffixes.lcs();
831 debug_assert!(lcs.len() >= 1);
832 let mut start = original_start;
833 let mut end = start;
834 let mut last_literal = start;
835 while end <= text.len() {
836 last_literal += match lcs.find(&text[last_literal..]) {
837 None => return Some(NoMatch(text.len())),
838 Some(i) => i,
839 };
840 end = last_literal + lcs.len();
841 match dfa::Fsm::reverse(
842 &self.ro.dfa_reverse,
843 self.cache.value(),
844 false,
845 &text[start..end],
846 end - start,
847 ) {
848 Match(0) | NoMatch(0) => return None,
849 Match(i) => return Some(Match((start + i, end))),
850 NoMatch(i) => {
851 start += i;
852 last_literal += 1;
853 continue;
854 }
855 Quit => return Some(Quit),
856 };
857 }
858 Some(NoMatch(text.len()))
859 }
860
861 /// Finds the leftmost-first match (start and end) using only the DFA
862 /// by scanning for suffix literals.
863 ///
864 /// If the result returned indicates that the DFA quit, then another
865 /// matching engine should be used.
866 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
867 #[cfg_attr(feature = "perf-inline", inline(always))]
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>868 fn find_dfa_reverse_suffix(
869 &self,
870 text: &[u8],
871 start: usize,
872 ) -> dfa::Result<(usize, usize)> {
873 use dfa::Result::*;
874
875 let match_start = match self.exec_dfa_reverse_suffix(text, start) {
876 None => return self.find_dfa_forward(text, start),
877 Some(Match((start, _))) => start,
878 Some(r) => return r,
879 };
880 // At this point, we've found a match. The only way to quit now
881 // without a match is if the DFA gives up (seems unlikely).
882 //
883 // Now run the DFA forwards to find the proper end of the match.
884 // (The suffix literal match can only indicate the earliest
885 // possible end location, which may appear before the end of the
886 // leftmost-first match.)
887 match dfa::Fsm::forward(
888 &self.ro.dfa,
889 self.cache.value(),
890 false,
891 text,
892 match_start,
893 ) {
894 NoMatch(_) => panic!("BUG: reverse match implies forward match"),
895 Quit => Quit,
896 Match(e) => Match((match_start, e)),
897 }
898 }
899
900 /// Executes the NFA engine to return whether there is a match or not.
901 ///
902 /// Ideally, we could use shortest_nfa(...).is_some() and get the same
903 /// performance characteristics, but regex sets don't have captures, which
904 /// shortest_nfa depends on.
905 #[cfg(feature = "perf-dfa")]
match_nfa(&self, text: &[u8], start: usize) -> bool906 fn match_nfa(&self, text: &[u8], start: usize) -> bool {
907 self.match_nfa_type(MatchNfaType::Auto, text, start)
908 }
909
910 /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool911 fn match_nfa_type(
912 &self,
913 ty: MatchNfaType,
914 text: &[u8],
915 start: usize,
916 ) -> bool {
917 self.exec_nfa(
918 ty,
919 &mut [false],
920 &mut [],
921 true,
922 false,
923 text,
924 start,
925 text.len(),
926 )
927 }
928
929 /// Finds the shortest match using an NFA.
930 #[cfg(feature = "perf-dfa")]
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>931 fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
932 self.shortest_nfa_type(MatchNfaType::Auto, text, start)
933 }
934
935 /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>936 fn shortest_nfa_type(
937 &self,
938 ty: MatchNfaType,
939 text: &[u8],
940 start: usize,
941 ) -> Option<usize> {
942 let mut slots = [None, None];
943 if self.exec_nfa(
944 ty,
945 &mut [false],
946 &mut slots,
947 true,
948 true,
949 text,
950 start,
951 text.len(),
952 ) {
953 slots[1]
954 } else {
955 None
956 }
957 }
958
959 /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>960 fn find_nfa(
961 &self,
962 ty: MatchNfaType,
963 text: &[u8],
964 start: usize,
965 ) -> Option<(usize, usize)> {
966 let mut slots = [None, None];
967 if self.exec_nfa(
968 ty,
969 &mut [false],
970 &mut slots,
971 false,
972 false,
973 text,
974 start,
975 text.len(),
976 ) {
977 match (slots[0], slots[1]) {
978 (Some(s), Some(e)) => Some((s, e)),
979 _ => None,
980 }
981 } else {
982 None
983 }
984 }
985
986 /// Like find_nfa, but fills in captures.
987 ///
988 /// `slots` should have length equal to `2 * nfa.captures.len()`.
989 #[cfg(feature = "perf-dfa")]
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>990 fn captures_nfa(
991 &self,
992 slots: &mut [Slot],
993 text: &[u8],
994 start: usize,
995 ) -> Option<(usize, usize)> {
996 self.captures_nfa_type(
997 MatchNfaType::Auto,
998 slots,
999 text,
1000 start,
1001 text.len(),
1002 )
1003 }
1004
1005 /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> Option<(usize, usize)>1006 fn captures_nfa_type(
1007 &self,
1008 ty: MatchNfaType,
1009 slots: &mut [Slot],
1010 text: &[u8],
1011 start: usize,
1012 end: usize,
1013 ) -> Option<(usize, usize)> {
1014 if self.exec_nfa(
1015 ty,
1016 &mut [false],
1017 slots,
1018 false,
1019 false,
1020 text,
1021 start,
1022 end,
1023 ) {
1024 match (slots[0], slots[1]) {
1025 (Some(s), Some(e)) => Some((s, e)),
1026 _ => None,
1027 }
1028 } else {
1029 None
1030 }
1031 }
1032
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, quit_after_match_with_pos: bool, text: &[u8], start: usize, end: usize, ) -> bool1033 fn exec_nfa(
1034 &self,
1035 mut ty: MatchNfaType,
1036 matches: &mut [bool],
1037 slots: &mut [Slot],
1038 quit_after_match: bool,
1039 quit_after_match_with_pos: bool,
1040 text: &[u8],
1041 start: usize,
1042 end: usize,
1043 ) -> bool {
1044 use self::MatchNfaType::*;
1045 if let Auto = ty {
1046 if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1047 ty = Backtrack;
1048 } else {
1049 ty = PikeVM;
1050 }
1051 }
1052 // The backtracker can't return the shortest match position as it is
1053 // implemented today. So if someone calls `shortest_match` and we need
1054 // to run an NFA, then use the PikeVM.
1055 if quit_after_match_with_pos || ty == PikeVM {
1056 self.exec_pikevm(
1057 matches,
1058 slots,
1059 quit_after_match,
1060 text,
1061 start,
1062 end,
1063 )
1064 } else {
1065 self.exec_backtrack(matches, slots, text, start, end)
1066 }
1067 }
1068
1069 /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool1070 fn exec_pikevm(
1071 &self,
1072 matches: &mut [bool],
1073 slots: &mut [Slot],
1074 quit_after_match: bool,
1075 text: &[u8],
1076 start: usize,
1077 end: usize,
1078 ) -> bool {
1079 if self.ro.nfa.uses_bytes() {
1080 pikevm::Fsm::exec(
1081 &self.ro.nfa,
1082 self.cache.value(),
1083 matches,
1084 slots,
1085 quit_after_match,
1086 ByteInput::new(text, self.ro.nfa.only_utf8),
1087 start,
1088 end,
1089 )
1090 } else {
1091 pikevm::Fsm::exec(
1092 &self.ro.nfa,
1093 self.cache.value(),
1094 matches,
1095 slots,
1096 quit_after_match,
1097 CharInput::new(text),
1098 start,
1099 end,
1100 )
1101 }
1102 }
1103
1104 /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> bool1105 fn exec_backtrack(
1106 &self,
1107 matches: &mut [bool],
1108 slots: &mut [Slot],
1109 text: &[u8],
1110 start: usize,
1111 end: usize,
1112 ) -> bool {
1113 if self.ro.nfa.uses_bytes() {
1114 backtrack::Bounded::exec(
1115 &self.ro.nfa,
1116 self.cache.value(),
1117 matches,
1118 slots,
1119 ByteInput::new(text, self.ro.nfa.only_utf8),
1120 start,
1121 end,
1122 )
1123 } else {
1124 backtrack::Bounded::exec(
1125 &self.ro.nfa,
1126 self.cache.value(),
1127 matches,
1128 slots,
1129 CharInput::new(text),
1130 start,
1131 end,
1132 )
1133 }
1134 }
1135
1136 /// Finds which regular expressions match the given text.
1137 ///
1138 /// `matches` should have length equal to the number of regexes being
1139 /// searched.
1140 ///
1141 /// This is only useful when one wants to know which regexes in a set
1142 /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1143 pub fn many_matches_at(
1144 &self,
1145 matches: &mut [bool],
1146 text: &[u8],
1147 start: usize,
1148 ) -> bool {
1149 use self::MatchType::*;
1150 if !self.is_anchor_end_match(text) {
1151 return false;
1152 }
1153 match self.ro.match_type {
1154 #[cfg(feature = "perf-literal")]
1155 Literal(ty) => {
1156 debug_assert_eq!(matches.len(), 1);
1157 matches[0] = self.find_literals(ty, text, start).is_some();
1158 matches[0]
1159 }
1160 #[cfg(feature = "perf-dfa")]
1161 Dfa | DfaAnchoredReverse | DfaMany => {
1162 match dfa::Fsm::forward_many(
1163 &self.ro.dfa,
1164 self.cache.value(),
1165 matches,
1166 text,
1167 start,
1168 ) {
1169 dfa::Result::Match(_) => true,
1170 dfa::Result::NoMatch(_) => false,
1171 dfa::Result::Quit => self.exec_nfa(
1172 MatchNfaType::Auto,
1173 matches,
1174 &mut [],
1175 false,
1176 false,
1177 text,
1178 start,
1179 text.len(),
1180 ),
1181 }
1182 }
1183 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1184 DfaSuffix => {
1185 match dfa::Fsm::forward_many(
1186 &self.ro.dfa,
1187 self.cache.value(),
1188 matches,
1189 text,
1190 start,
1191 ) {
1192 dfa::Result::Match(_) => true,
1193 dfa::Result::NoMatch(_) => false,
1194 dfa::Result::Quit => self.exec_nfa(
1195 MatchNfaType::Auto,
1196 matches,
1197 &mut [],
1198 false,
1199 false,
1200 text,
1201 start,
1202 text.len(),
1203 ),
1204 }
1205 }
1206 Nfa(ty) => self.exec_nfa(
1207 ty,
1208 matches,
1209 &mut [],
1210 false,
1211 false,
1212 text,
1213 start,
1214 text.len(),
1215 ),
1216 Nothing => false,
1217 }
1218 }
1219
1220 #[cfg_attr(feature = "perf-inline", inline(always))]
is_anchor_end_match(&self, text: &[u8]) -> bool1221 fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1222 #[cfg(not(feature = "perf-literal"))]
1223 fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1224 true
1225 }
1226
1227 #[cfg(feature = "perf-literal")]
1228 fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1229 // Only do this check if the haystack is big (>1MB).
1230 if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1231 let lcs = ro.suffixes.lcs();
1232 if lcs.len() >= 1 && !lcs.is_suffix(text) {
1233 return false;
1234 }
1235 }
1236 true
1237 }
1238
1239 imp(&self.ro, text)
1240 }
1241
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1242 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1243 &self.ro.nfa.capture_name_idx
1244 }
1245 }
1246
1247 impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1248 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1249 self.0.capture_name_idx()
1250 }
1251 }
1252
1253 impl Exec {
1254 /// Get a searcher that isn't Sync.
1255 #[cfg_attr(feature = "perf-inline", inline(always))]
searcher(&self) -> ExecNoSync1256 pub fn searcher(&self) -> ExecNoSync {
1257 let create = || RefCell::new(ProgramCacheInner::new(&self.ro));
1258 ExecNoSync {
1259 ro: &self.ro, // a clone is too expensive here! (and not needed)
1260 cache: self.cache.get_or(create),
1261 }
1262 }
1263
1264 /// Get a searcher that isn't Sync and can match on &str.
1265 #[cfg_attr(feature = "perf-inline", inline(always))]
searcher_str(&self) -> ExecNoSyncStr1266 pub fn searcher_str(&self) -> ExecNoSyncStr {
1267 ExecNoSyncStr(self.searcher())
1268 }
1269
1270 /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1271 pub fn into_regex(self) -> re_unicode::Regex {
1272 re_unicode::Regex::from(self)
1273 }
1274
1275 /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1276 pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1277 re_set::unicode::RegexSet::from(self)
1278 }
1279
1280 /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1281 pub fn into_byte_regex(self) -> re_bytes::Regex {
1282 re_bytes::Regex::from(self)
1283 }
1284
1285 /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1286 pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1287 re_set::bytes::RegexSet::from(self)
1288 }
1289
1290 /// The original regular expressions given by the caller that were
1291 /// compiled.
regex_strings(&self) -> &[String]1292 pub fn regex_strings(&self) -> &[String] {
1293 &self.ro.res
1294 }
1295
1296 /// Return a slice of capture names.
1297 ///
1298 /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1299 pub fn capture_names(&self) -> &[Option<String>] {
1300 &self.ro.nfa.captures
1301 }
1302
1303 /// Return a reference to named groups mapping (from group name to
1304 /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1305 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1306 &self.ro.nfa.capture_name_idx
1307 }
1308 }
1309
1310 impl Clone for Exec {
clone(&self) -> Exec1311 fn clone(&self) -> Exec {
1312 Exec { ro: self.ro.clone(), cache: Cached::new() }
1313 }
1314 }
1315
1316 impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1317 fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1318 if let Some(MatchType::Nfa(_)) = hint {
1319 return hint.unwrap();
1320 }
1321 // If the NFA is empty, then we'll never match anything.
1322 if self.nfa.insts.is_empty() {
1323 return MatchType::Nothing;
1324 }
1325 if let Some(literalty) = self.choose_literal_match_type() {
1326 return literalty;
1327 }
1328 if let Some(dfaty) = self.choose_dfa_match_type() {
1329 return dfaty;
1330 }
1331 // We're so totally hosed.
1332 MatchType::Nfa(MatchNfaType::Auto)
1333 }
1334
1335 /// If a plain literal scan can be used, then a corresponding literal
1336 /// search type is returned.
choose_literal_match_type(&self) -> Option<MatchType>1337 fn choose_literal_match_type(&self) -> Option<MatchType> {
1338 #[cfg(not(feature = "perf-literal"))]
1339 fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1340 None
1341 }
1342
1343 #[cfg(feature = "perf-literal")]
1344 fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1345 // If our set of prefixes is complete, then we can use it to find
1346 // a match in lieu of a regex engine. This doesn't quite work well
1347 // in the presence of multiple regexes, so only do it when there's
1348 // one.
1349 //
1350 // TODO(burntsushi): Also, don't try to match literals if the regex
1351 // is partially anchored. We could technically do it, but we'd need
1352 // to create two sets of literals: all of them and then the subset
1353 // that aren't anchored. We would then only search for all of them
1354 // when at the beginning of the input and use the subset in all
1355 // other cases.
1356 if ro.res.len() != 1 {
1357 return None;
1358 }
1359 if ro.ac.is_some() {
1360 return Some(MatchType::Literal(
1361 MatchLiteralType::AhoCorasick,
1362 ));
1363 }
1364 if ro.nfa.prefixes.complete() {
1365 return if ro.nfa.is_anchored_start {
1366 Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1367 } else {
1368 Some(MatchType::Literal(MatchLiteralType::Unanchored))
1369 };
1370 }
1371 if ro.suffixes.complete() {
1372 return if ro.nfa.is_anchored_end {
1373 Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1374 } else {
1375 // This case shouldn't happen. When the regex isn't
1376 // anchored, then complete prefixes should imply complete
1377 // suffixes.
1378 Some(MatchType::Literal(MatchLiteralType::Unanchored))
1379 };
1380 }
1381 None
1382 }
1383
1384 imp(self)
1385 }
1386
1387 /// If a DFA scan can be used, then choose the appropriate DFA strategy.
choose_dfa_match_type(&self) -> Option<MatchType>1388 fn choose_dfa_match_type(&self) -> Option<MatchType> {
1389 #[cfg(not(feature = "perf-dfa"))]
1390 fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1391 None
1392 }
1393
1394 #[cfg(feature = "perf-dfa")]
1395 fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1396 if !dfa::can_exec(&ro.dfa) {
1397 return None;
1398 }
1399 // Regex sets require a slightly specialized path.
1400 if ro.res.len() >= 2 {
1401 return Some(MatchType::DfaMany);
1402 }
1403 // If the regex is anchored at the end but not the start, then
1404 // just match in reverse from the end of the haystack.
1405 if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1406 return Some(MatchType::DfaAnchoredReverse);
1407 }
1408 #[cfg(feature = "perf-literal")]
1409 {
1410 // If there's a longish suffix literal, then it might be faster
1411 // to look for that first.
1412 if ro.should_suffix_scan() {
1413 return Some(MatchType::DfaSuffix);
1414 }
1415 }
1416 // Fall back to your garden variety forward searching lazy DFA.
1417 Some(MatchType::Dfa)
1418 }
1419
1420 imp(self)
1421 }
1422
1423 /// Returns true if the program is amenable to suffix scanning.
1424 ///
1425 /// When this is true, as a heuristic, we assume it is OK to quickly scan
1426 /// for suffix literals and then do a *reverse* DFA match from any matches
1427 /// produced by the literal scan. (And then followed by a forward DFA
1428 /// search, since the previously found suffix literal maybe not actually be
1429 /// the end of a match.)
1430 ///
1431 /// This is a bit of a specialized optimization, but can result in pretty
1432 /// big performance wins if 1) there are no prefix literals and 2) the
1433 /// suffix literals are pretty rare in the text. (1) is obviously easy to
1434 /// account for but (2) is harder. As a proxy, we assume that longer
1435 /// strings are generally rarer, so we only enable this optimization when
1436 /// we have a meaty suffix.
1437 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
should_suffix_scan(&self) -> bool1438 fn should_suffix_scan(&self) -> bool {
1439 if self.suffixes.is_empty() {
1440 return false;
1441 }
1442 let lcs_len = self.suffixes.lcs().char_len();
1443 lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1444 }
1445 }
1446
1447 #[derive(Clone, Copy, Debug)]
1448 enum MatchType {
1449 /// A single or multiple literal search. This is only used when the regex
1450 /// can be decomposed into a literal search.
1451 #[cfg(feature = "perf-literal")]
1452 Literal(MatchLiteralType),
1453 /// A normal DFA search.
1454 #[cfg(feature = "perf-dfa")]
1455 Dfa,
1456 /// A reverse DFA search starting from the end of a haystack.
1457 #[cfg(feature = "perf-dfa")]
1458 DfaAnchoredReverse,
1459 /// A reverse DFA search with suffix literal scanning.
1460 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1461 DfaSuffix,
1462 /// Use the DFA on two or more regular expressions.
1463 #[cfg(feature = "perf-dfa")]
1464 DfaMany,
1465 /// An NFA variant.
1466 Nfa(MatchNfaType),
1467 /// No match is ever possible, so don't ever try to search.
1468 Nothing,
1469 }
1470
1471 #[derive(Clone, Copy, Debug)]
1472 #[cfg(feature = "perf-literal")]
1473 enum MatchLiteralType {
1474 /// Match literals anywhere in text.
1475 Unanchored,
1476 /// Match literals only at the start of text.
1477 AnchoredStart,
1478 /// Match literals only at the end of text.
1479 AnchoredEnd,
1480 /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1481 /// ExecReadOnly.
1482 AhoCorasick,
1483 }
1484
1485 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1486 enum MatchNfaType {
1487 /// Choose between Backtrack and PikeVM.
1488 Auto,
1489 /// NFA bounded backtracking.
1490 ///
1491 /// (This is only set by tests, since it never makes sense to always want
1492 /// backtracking.)
1493 Backtrack,
1494 /// The Pike VM.
1495 ///
1496 /// (This is only set by tests, since it never makes sense to always want
1497 /// the Pike VM.)
1498 PikeVM,
1499 }
1500
1501 /// `ProgramCache` maintains reusable allocations for each matching engine
1502 /// available to a particular program.
1503 pub type ProgramCache = RefCell<ProgramCacheInner>;
1504
1505 #[derive(Debug)]
1506 pub struct ProgramCacheInner {
1507 pub pikevm: pikevm::Cache,
1508 pub backtrack: backtrack::Cache,
1509 #[cfg(feature = "perf-dfa")]
1510 pub dfa: dfa::Cache,
1511 #[cfg(feature = "perf-dfa")]
1512 pub dfa_reverse: dfa::Cache,
1513 }
1514
1515 impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1516 fn new(ro: &ExecReadOnly) -> Self {
1517 ProgramCacheInner {
1518 pikevm: pikevm::Cache::new(&ro.nfa),
1519 backtrack: backtrack::Cache::new(&ro.nfa),
1520 #[cfg(feature = "perf-dfa")]
1521 dfa: dfa::Cache::new(&ro.dfa),
1522 #[cfg(feature = "perf-dfa")]
1523 dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1524 }
1525 }
1526 }
1527
1528 /// Alternation literals checks if the given HIR is a simple alternation of
1529 /// literals, and if so, returns them. Otherwise, this returns None.
1530 #[cfg(feature = "perf-literal")]
alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>>1531 fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1532 use syntax::hir::{HirKind, Literal};
1533
1534 // This is pretty hacky, but basically, if `is_alternation_literal` is
1535 // true, then we can make several assumptions about the structure of our
1536 // HIR. This is what justifies the `unreachable!` statements below.
1537 //
1538 // This code should be refactored once we overhaul this crate's
1539 // optimization pipeline, because this is a terribly inflexible way to go
1540 // about things.
1541
1542 if !expr.is_alternation_literal() {
1543 return None;
1544 }
1545 let alts = match *expr.kind() {
1546 HirKind::Alternation(ref alts) => alts,
1547 _ => return None, // one literal isn't worth it
1548 };
1549
1550 let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
1551 Literal::Unicode(c) => {
1552 let mut buf = [0; 4];
1553 dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1554 }
1555 Literal::Byte(b) => {
1556 dst.push(b);
1557 }
1558 };
1559
1560 let mut lits = vec![];
1561 for alt in alts {
1562 let mut lit = vec![];
1563 match *alt.kind() {
1564 HirKind::Literal(ref x) => extendlit(x, &mut lit),
1565 HirKind::Concat(ref exprs) => {
1566 for e in exprs {
1567 match *e.kind() {
1568 HirKind::Literal(ref x) => extendlit(x, &mut lit),
1569 _ => unreachable!("expected literal, got {:?}", e),
1570 }
1571 }
1572 }
1573 _ => unreachable!("expected literal or concat, got {:?}", alt),
1574 }
1575 lits.push(lit);
1576 }
1577 Some(lits)
1578 }
1579
1580 #[cfg(test)]
1581 mod test {
1582 #[test]
uppercut_s_backtracking_bytes_default_bytes_mismatch()1583 fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1584 use internal::ExecBuilder;
1585
1586 let backtrack_bytes_re = ExecBuilder::new("^S")
1587 .bounded_backtracking()
1588 .only_utf8(false)
1589 .build()
1590 .map(|exec| exec.into_byte_regex())
1591 .map_err(|err| format!("{}", err))
1592 .unwrap();
1593
1594 let default_bytes_re = ExecBuilder::new("^S")
1595 .only_utf8(false)
1596 .build()
1597 .map(|exec| exec.into_byte_regex())
1598 .map_err(|err| format!("{}", err))
1599 .unwrap();
1600
1601 let input = vec![83, 83];
1602
1603 let s1 = backtrack_bytes_re.split(&input);
1604 let s2 = default_bytes_re.split(&input);
1605 for (chunk1, chunk2) in s1.zip(s2) {
1606 assert_eq!(chunk1, chunk2);
1607 }
1608 }
1609
1610 #[test]
unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1611 fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1612 use internal::ExecBuilder;
1613
1614 let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1615 .bounded_backtracking()
1616 .bytes(true)
1617 .build()
1618 .map(|exec| exec.into_regex())
1619 .map_err(|err| format!("{}", err))
1620 .unwrap();
1621
1622 let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1623 .bytes(true)
1624 .build()
1625 .map(|exec| exec.into_regex())
1626 .map_err(|err| format!("{}", err))
1627 .unwrap();
1628
1629 let input = "**";
1630
1631 let s1 = backtrack_bytes_re.split(input);
1632 let s2 = default_bytes_re.split(input);
1633 for (chunk1, chunk2) in s1.zip(s2) {
1634 assert_eq!(chunk1, chunk2);
1635 }
1636 }
1637 }
1638