1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use std::cell::RefCell;
12 use std::collections::HashMap;
13 use std::sync::Arc;
14
15 use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
16 use thread_local::CachedThreadLocal;
17 use syntax::ParserBuilder;
18 use syntax::hir::Hir;
19 use syntax::hir::literal::Literals;
20
21 use backtrack;
22 use compile::Compiler;
23 use dfa;
24 use error::Error;
25 use input::{ByteInput, CharInput};
26 use literal::LiteralSearcher;
27 use pikevm;
28 use prog::Program;
29 use re_builder::RegexOptions;
30 use re_bytes;
31 use re_set;
32 use re_trait::{RegularExpression, Slot, Locations};
33 use re_unicode;
34 use utf8::next_utf8;
35
36 /// `Exec` manages the execution of a regular expression.
37 ///
38 /// In particular, this manages the various compiled forms of a single regular
39 /// expression and the choice of which matching engine to use to execute a
40 /// regular expression.
41 pub struct Exec {
42 /// All read only state.
43 ro: Arc<ExecReadOnly>,
44 /// Caches for the various matching engines.
45 cache: CachedThreadLocal<ProgramCache>,
46 }
47
48 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
49 /// means it is no longer Sync, but we can now avoid the overhead of
50 /// synchronization to fetch the cache.
51 #[derive(Debug)]
52 pub struct ExecNoSync<'c> {
53 /// All read only state.
54 ro: &'c Arc<ExecReadOnly>,
55 /// Caches for the various matching engines.
56 cache: &'c ProgramCache,
57 }
58
59 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
60 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
61
62 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
63 /// state is determined at compile time and never changes during search.
64 #[derive(Debug)]
65 struct ExecReadOnly {
66 /// The original regular expressions given by the caller to compile.
67 res: Vec<String>,
68 /// A compiled program that is used in the NFA simulation and backtracking.
69 /// It can be byte-based or Unicode codepoint based.
70 ///
71 /// N.B. It is not possibly to make this byte-based from the public API.
72 /// It is only used for testing byte based programs in the NFA simulations.
73 nfa: Program,
74 /// A compiled byte based program for DFA execution. This is only used
75 /// if a DFA can be executed. (Currently, only word boundary assertions are
76 /// not supported.) Note that this program contains an embedded `.*?`
77 /// preceding the first capture group, unless the regex is anchored at the
78 /// beginning.
79 dfa: Program,
80 /// The same as above, except the program is reversed (and there is no
81 /// preceding `.*?`). This is used by the DFA to find the starting location
82 /// of matches.
83 dfa_reverse: Program,
84 /// A set of suffix literals extracted from the regex.
85 ///
86 /// Prefix literals are stored on the `Program`, since they are used inside
87 /// the matching engines.
88 suffixes: LiteralSearcher,
89 /// An Aho-Corasick automaton with leftmost-first match semantics.
90 ///
91 /// This is only set when the entire regex is a simple unanchored
92 /// alternation of literals. We could probably use it more circumstances,
93 /// but this is already hacky enough in this architecture.
94 ///
95 /// N.B. We use u32 as a state ID representation under the assumption that
96 /// if we were to exhaust the ID space, we probably would have long
97 /// surpassed the compilation size limit.
98 ac: Option<AhoCorasick<u32>>,
99 /// match_type encodes as much upfront knowledge about how we're going to
100 /// execute a search as possible.
101 match_type: MatchType,
102 }
103
104 /// Facilitates the construction of an executor by exposing various knobs
105 /// to control how a regex is executed and what kinds of resources it's
106 /// permitted to use.
107 pub struct ExecBuilder {
108 options: RegexOptions,
109 match_type: Option<MatchType>,
110 bytes: bool,
111 only_utf8: bool,
112 }
113
114 /// Parsed represents a set of parsed regular expressions and their detected
115 /// literals.
116 struct Parsed {
117 exprs: Vec<Hir>,
118 prefixes: Literals,
119 suffixes: Literals,
120 bytes: bool,
121 }
122
123 impl ExecBuilder {
124 /// Create a regex execution builder.
125 ///
126 /// This uses default settings for everything except the regex itself,
127 /// which must be provided. Further knobs can be set by calling methods,
128 /// and then finally, `build` to actually create the executor.
new(re: &str) -> Self129 pub fn new(re: &str) -> Self {
130 Self::new_many(&[re])
131 }
132
133 /// Like new, but compiles the union of the given regular expressions.
134 ///
135 /// Note that when compiling 2 or more regular expressions, capture groups
136 /// are completely unsupported. (This means both `find` and `captures`
137 /// wont work.)
new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item=S>138 pub fn new_many<I, S>(res: I) -> Self
139 where S: AsRef<str>, I: IntoIterator<Item=S> {
140 let mut opts = RegexOptions::default();
141 opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
142 Self::new_options(opts)
143 }
144
145 /// Create a regex execution builder.
new_options(opts: RegexOptions) -> Self146 pub fn new_options(opts: RegexOptions) -> Self {
147 ExecBuilder {
148 options: opts,
149 match_type: None,
150 bytes: false,
151 only_utf8: true,
152 }
153 }
154
155 /// Set the matching engine to be automatically determined.
156 ///
157 /// This is the default state and will apply whatever optimizations are
158 /// possible, such as running a DFA.
159 ///
160 /// This overrides whatever was previously set via the `nfa` or
161 /// `bounded_backtracking` methods.
automatic(mut self) -> Self162 pub fn automatic(mut self) -> Self {
163 self.match_type = None;
164 self
165 }
166
167 /// Sets the matching engine to use the NFA algorithm no matter what
168 /// optimizations are possible.
169 ///
170 /// This overrides whatever was previously set via the `automatic` or
171 /// `bounded_backtracking` methods.
nfa(mut self) -> Self172 pub fn nfa(mut self) -> Self {
173 self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
174 self
175 }
176
177 /// Sets the matching engine to use a bounded backtracking engine no
178 /// matter what optimizations are possible.
179 ///
180 /// One must use this with care, since the bounded backtracking engine
181 /// uses memory proportion to `len(regex) * len(text)`.
182 ///
183 /// This overrides whatever was previously set via the `automatic` or
184 /// `nfa` methods.
bounded_backtracking(mut self) -> Self185 pub fn bounded_backtracking(mut self) -> Self {
186 self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
187 self
188 }
189
190 /// Compiles byte based programs for use with the NFA matching engines.
191 ///
192 /// By default, the NFA engines match on Unicode scalar values. They can
193 /// be made to use byte based programs instead. In general, the byte based
194 /// programs are slower because of a less efficient encoding of character
195 /// classes.
196 ///
197 /// Note that this does not impact DFA matching engines, which always
198 /// execute on bytes.
bytes(mut self, yes: bool) -> Self199 pub fn bytes(mut self, yes: bool) -> Self {
200 self.bytes = yes;
201 self
202 }
203
204 /// When disabled, the program compiled may match arbitrary bytes.
205 ///
206 /// When enabled (the default), all compiled programs exclusively match
207 /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self208 pub fn only_utf8(mut self, yes: bool) -> Self {
209 self.only_utf8 = yes;
210 self
211 }
212
213 /// Set the Unicode flag.
unicode(mut self, yes: bool) -> Self214 pub fn unicode(mut self, yes: bool) -> Self {
215 self.options.unicode = yes;
216 self
217 }
218
219 /// Parse the current set of patterns into their AST and extract literals.
parse(&self) -> Result<Parsed, Error>220 fn parse(&self) -> Result<Parsed, Error> {
221 let mut exprs = Vec::with_capacity(self.options.pats.len());
222 let mut prefixes = Some(Literals::empty());
223 let mut suffixes = Some(Literals::empty());
224 let mut bytes = false;
225 let is_set = self.options.pats.len() > 1;
226 // If we're compiling a regex set and that set has any anchored
227 // expressions, then disable all literal optimizations.
228 for pat in &self.options.pats {
229 let mut parser =
230 ParserBuilder::new()
231 .octal(self.options.octal)
232 .case_insensitive(self.options.case_insensitive)
233 .multi_line(self.options.multi_line)
234 .dot_matches_new_line(self.options.dot_matches_new_line)
235 .swap_greed(self.options.swap_greed)
236 .ignore_whitespace(self.options.ignore_whitespace)
237 .unicode(self.options.unicode)
238 .allow_invalid_utf8(!self.only_utf8)
239 .nest_limit(self.options.nest_limit)
240 .build();
241 let expr = parser
242 .parse(pat)
243 .map_err(|e| Error::Syntax(e.to_string()))?;
244 bytes = bytes || !expr.is_always_utf8();
245
246 if !expr.is_anchored_start() && expr.is_any_anchored_start() {
247 // Partial anchors unfortunately make it hard to use prefixes,
248 // so disable them.
249 prefixes = None;
250 } else if is_set && expr.is_anchored_start() {
251 // Regex sets with anchors do not go well with literal
252 // optimizations.
253 prefixes = None;
254 }
255 prefixes = prefixes.and_then(|mut prefixes| {
256 if !prefixes.union_prefixes(&expr) {
257 None
258 } else {
259 Some(prefixes)
260 }
261 });
262
263 if !expr.is_anchored_end() && expr.is_any_anchored_end() {
264 // Partial anchors unfortunately make it hard to use suffixes,
265 // so disable them.
266 suffixes = None;
267 } else if is_set && expr.is_anchored_end() {
268 // Regex sets with anchors do not go well with literal
269 // optimizations.
270 suffixes = None;
271 }
272 suffixes = suffixes.and_then(|mut suffixes| {
273 if !suffixes.union_suffixes(&expr) {
274 None
275 } else {
276 Some(suffixes)
277 }
278 });
279 exprs.push(expr);
280 }
281 Ok(Parsed {
282 exprs: exprs,
283 prefixes: prefixes.unwrap_or_else(Literals::empty),
284 suffixes: suffixes.unwrap_or_else(Literals::empty),
285 bytes: bytes,
286 })
287 }
288
289 /// Build an executor that can run a regular expression.
build(self) -> Result<Exec, Error>290 pub fn build(self) -> Result<Exec, Error> {
291 // Special case when we have no patterns to compile.
292 // This can happen when compiling a regex set.
293 if self.options.pats.is_empty() {
294 let ro = Arc::new(ExecReadOnly {
295 res: vec![],
296 nfa: Program::new(),
297 dfa: Program::new(),
298 dfa_reverse: Program::new(),
299 suffixes: LiteralSearcher::empty(),
300 ac: None,
301 match_type: MatchType::Nothing,
302 });
303 return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() });
304 }
305 let parsed = self.parse()?;
306 let mut nfa =
307 Compiler::new()
308 .size_limit(self.options.size_limit)
309 .bytes(self.bytes || parsed.bytes)
310 .only_utf8(self.only_utf8)
311 .compile(&parsed.exprs)?;
312 let mut dfa =
313 Compiler::new()
314 .size_limit(self.options.size_limit)
315 .dfa(true)
316 .only_utf8(self.only_utf8)
317 .compile(&parsed.exprs)?;
318 let mut dfa_reverse =
319 Compiler::new()
320 .size_limit(self.options.size_limit)
321 .dfa(true)
322 .only_utf8(self.only_utf8)
323 .reverse(true)
324 .compile(&parsed.exprs)?;
325
326 nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
327 dfa.prefixes = nfa.prefixes.clone();
328 dfa.dfa_size_limit = self.options.dfa_size_limit;
329 dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
330
331 let mut ac = None;
332 if parsed.exprs.len() == 1 {
333 if let Some(lits) = alternation_literals(&parsed.exprs[0]) {
334 // If we have a small number of literals, then let Teddy
335 // handle things (see literal/mod.rs).
336 if lits.len() > 32 {
337 let fsm = AhoCorasickBuilder::new()
338 .match_kind(MatchKind::LeftmostFirst)
339 .auto_configure(&lits)
340 // We always want this to reduce size, regardless of
341 // what auto-configure does.
342 .byte_classes(true)
343 .build_with_size::<u32, _, _>(&lits)
344 .expect("AC automaton too big");
345 ac = Some(fsm);
346 }
347 }
348 }
349
350 let mut ro = ExecReadOnly {
351 res: self.options.pats,
352 nfa: nfa,
353 dfa: dfa,
354 dfa_reverse: dfa_reverse,
355 suffixes: LiteralSearcher::suffixes(parsed.suffixes),
356 ac: ac,
357 match_type: MatchType::Nothing,
358 };
359 ro.match_type = ro.choose_match_type(self.match_type);
360
361 let ro = Arc::new(ro);
362 Ok(Exec { ro: ro, cache: CachedThreadLocal::new() })
363 }
364 }
365
366 impl<'c> RegularExpression for ExecNoSyncStr<'c> {
367 type Text = str;
368
slots_len(&self) -> usize369 fn slots_len(&self) -> usize { self.0.slots_len() }
370
next_after_empty(&self, text: &str, i: usize) -> usize371 fn next_after_empty(&self, text: &str, i: usize) -> usize {
372 next_utf8(text.as_bytes(), i)
373 }
374
375 #[inline(always)] // reduces constant overhead
shortest_match_at(&self, text: &str, start: usize) -> Option<usize>376 fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
377 self.0.shortest_match_at(text.as_bytes(), start)
378 }
379
380 #[inline(always)] // reduces constant overhead
is_match_at(&self, text: &str, start: usize) -> bool381 fn is_match_at(&self, text: &str, start: usize) -> bool {
382 self.0.is_match_at(text.as_bytes(), start)
383 }
384
385 #[inline(always)] // reduces constant overhead
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>386 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
387 self.0.find_at(text.as_bytes(), start)
388 }
389
390 #[inline(always)] // reduces constant overhead
captures_read_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>391 fn captures_read_at(
392 &self,
393 locs: &mut Locations,
394 text: &str,
395 start: usize,
396 ) -> Option<(usize, usize)> {
397 self.0.captures_read_at(locs, text.as_bytes(), start)
398 }
399 }
400
401 impl<'c> RegularExpression for ExecNoSync<'c> {
402 type Text = [u8];
403
404 /// Returns the number of capture slots in the regular expression. (There
405 /// are two slots for every capture group, corresponding to possibly empty
406 /// start and end locations of the capture.)
slots_len(&self) -> usize407 fn slots_len(&self) -> usize {
408 self.ro.nfa.captures.len() * 2
409 }
410
next_after_empty(&self, _text: &[u8], i: usize) -> usize411 fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
412 i + 1
413 }
414
415 /// Returns the end of a match location, possibly occurring before the
416 /// end location of the correct leftmost-first match.
417 #[inline(always)] // reduces constant overhead
shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>418 fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
419 if !self.is_anchor_end_match(text) {
420 return None;
421 }
422 match self.ro.match_type {
423 MatchType::Literal(ty) => {
424 self.find_literals(ty, text, start).map(|(_, e)| e)
425 }
426 MatchType::Dfa | MatchType::DfaMany => {
427 match self.shortest_dfa(text, start) {
428 dfa::Result::Match(end) => Some(end),
429 dfa::Result::NoMatch(_) => None,
430 dfa::Result::Quit => self.shortest_nfa(text, start),
431 }
432 }
433 MatchType::DfaAnchoredReverse => {
434 match dfa::Fsm::reverse(
435 &self.ro.dfa_reverse,
436 self.cache,
437 true,
438 &text[start..],
439 text.len(),
440 ) {
441 dfa::Result::Match(_) => Some(text.len()),
442 dfa::Result::NoMatch(_) => None,
443 dfa::Result::Quit => self.shortest_nfa(text, start),
444 }
445 }
446 MatchType::DfaSuffix => {
447 match self.shortest_dfa_reverse_suffix(text, start) {
448 dfa::Result::Match(e) => Some(e),
449 dfa::Result::NoMatch(_) => None,
450 dfa::Result::Quit => self.shortest_nfa(text, start),
451 }
452 }
453 MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
454 MatchType::Nothing => None,
455 }
456 }
457
458 /// Returns true if and only if the regex matches text.
459 ///
460 /// For single regular expressions, this is equivalent to calling
461 /// shortest_match(...).is_some().
462 #[inline(always)] // reduces constant overhead
is_match_at(&self, text: &[u8], start: usize) -> bool463 fn is_match_at(&self, text: &[u8], start: usize) -> bool {
464 if !self.is_anchor_end_match(text) {
465 return false;
466 }
467 // We need to do this dance because shortest_match relies on the NFA
468 // filling in captures[1], but a RegexSet has no captures. In other
469 // words, a RegexSet can't (currently) use shortest_match. ---AG
470 match self.ro.match_type {
471 MatchType::Literal(ty) => {
472 self.find_literals(ty, text, start).is_some()
473 }
474 MatchType::Dfa | MatchType::DfaMany => {
475 match self.shortest_dfa(text, start) {
476 dfa::Result::Match(_) => true,
477 dfa::Result::NoMatch(_) => false,
478 dfa::Result::Quit => self.match_nfa(text, start),
479 }
480 }
481 MatchType::DfaAnchoredReverse => {
482 match dfa::Fsm::reverse(
483 &self.ro.dfa_reverse,
484 self.cache,
485 true,
486 &text[start..],
487 text.len(),
488 ) {
489 dfa::Result::Match(_) => true,
490 dfa::Result::NoMatch(_) => false,
491 dfa::Result::Quit => self.match_nfa(text, start),
492 }
493 }
494 MatchType::DfaSuffix => {
495 match self.shortest_dfa_reverse_suffix(text, start) {
496 dfa::Result::Match(_) => true,
497 dfa::Result::NoMatch(_) => false,
498 dfa::Result::Quit => self.match_nfa(text, start),
499 }
500 }
501 MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
502 MatchType::Nothing => false,
503 }
504 }
505
506 /// Finds the start and end location of the leftmost-first match, starting
507 /// at the given location.
508 #[inline(always)] // reduces constant overhead
find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>509 fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
510 if !self.is_anchor_end_match(text) {
511 return None;
512 }
513 match self.ro.match_type {
514 MatchType::Literal(ty) => {
515 self.find_literals(ty, text, start)
516 }
517 MatchType::Dfa => {
518 match self.find_dfa_forward(text, start) {
519 dfa::Result::Match((s, e)) => Some((s, e)),
520 dfa::Result::NoMatch(_) => None,
521 dfa::Result::Quit => {
522 self.find_nfa(MatchNfaType::Auto, text, start)
523 }
524 }
525 }
526 MatchType::DfaAnchoredReverse => {
527 match self.find_dfa_anchored_reverse(text, start) {
528 dfa::Result::Match((s, e)) => Some((s, e)),
529 dfa::Result::NoMatch(_) => None,
530 dfa::Result::Quit => {
531 self.find_nfa(MatchNfaType::Auto, text, start)
532 }
533 }
534 }
535 MatchType::DfaSuffix => {
536 match self.find_dfa_reverse_suffix(text, start) {
537 dfa::Result::Match((s, e)) => Some((s, e)),
538 dfa::Result::NoMatch(_) => None,
539 dfa::Result::Quit => {
540 self.find_nfa(MatchNfaType::Auto, text, start)
541 }
542 }
543 }
544 MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
545 MatchType::Nothing => None,
546 MatchType::DfaMany => {
547 unreachable!("BUG: RegexSet cannot be used with find")
548 }
549 }
550 }
551
552 /// Finds the start and end location of the leftmost-first match and also
553 /// fills in all matching capture groups.
554 ///
555 /// The number of capture slots given should be equal to the total number
556 /// of capture slots in the compiled program.
557 ///
558 /// Note that the first two slots always correspond to the start and end
559 /// locations of the overall match.
captures_read_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>560 fn captures_read_at(
561 &self,
562 locs: &mut Locations,
563 text: &[u8],
564 start: usize,
565 ) -> Option<(usize, usize)> {
566 let slots = locs.as_slots();
567 for slot in slots.iter_mut() {
568 *slot = None;
569 }
570 // If the caller unnecessarily uses this, then we try to save them
571 // from themselves.
572 match slots.len() {
573 0 => return self.find_at(text, start),
574 2 => {
575 return self.find_at(text, start).map(|(s, e)| {
576 slots[0] = Some(s);
577 slots[1] = Some(e);
578 (s, e)
579 });
580 }
581 _ => {} // fallthrough
582 }
583 if !self.is_anchor_end_match(text) {
584 return None;
585 }
586 match self.ro.match_type {
587 MatchType::Literal(ty) => {
588 self.find_literals(ty, text, start).and_then(|(s, e)| {
589 self.captures_nfa_type(
590 MatchNfaType::Auto, slots, text, s, e)
591 })
592 }
593 MatchType::Dfa => {
594 if self.ro.nfa.is_anchored_start {
595 self.captures_nfa(slots, text, start)
596 } else {
597 match self.find_dfa_forward(text, start) {
598 dfa::Result::Match((s, e)) => {
599 self.captures_nfa_type(
600 MatchNfaType::Auto, slots, text, s, e)
601 }
602 dfa::Result::NoMatch(_) => None,
603 dfa::Result::Quit => {
604 self.captures_nfa(slots, text, start)
605 }
606 }
607 }
608 }
609 MatchType::DfaAnchoredReverse => {
610 match self.find_dfa_anchored_reverse(text, start) {
611 dfa::Result::Match((s, e)) => {
612 self.captures_nfa_type(
613 MatchNfaType::Auto, slots, text, s, e)
614 }
615 dfa::Result::NoMatch(_) => None,
616 dfa::Result::Quit => self.captures_nfa(slots, text, start),
617 }
618 }
619 MatchType::DfaSuffix => {
620 match self.find_dfa_reverse_suffix(text, start) {
621 dfa::Result::Match((s, e)) => {
622 self.captures_nfa_type(
623 MatchNfaType::Auto, slots, text, s, e)
624 }
625 dfa::Result::NoMatch(_) => None,
626 dfa::Result::Quit => self.captures_nfa(slots, text, start),
627 }
628 }
629 MatchType::Nfa(ty) => {
630 self.captures_nfa_type(ty, slots, text, start, text.len())
631 }
632 MatchType::Nothing => None,
633 MatchType::DfaMany => {
634 unreachable!("BUG: RegexSet cannot be used with captures")
635 }
636 }
637 }
638 }
639
640 impl<'c> ExecNoSync<'c> {
641 /// Finds the leftmost-first match using only literal search.
642 #[inline(always)] // reduces constant overhead
find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>643 fn find_literals(
644 &self,
645 ty: MatchLiteralType,
646 text: &[u8],
647 start: usize,
648 ) -> Option<(usize, usize)> {
649 use self::MatchLiteralType::*;
650 match ty {
651 Unanchored => {
652 let lits = &self.ro.nfa.prefixes;
653 lits.find(&text[start..])
654 .map(|(s, e)| (start + s, start + e))
655 }
656 AnchoredStart => {
657 let lits = &self.ro.nfa.prefixes;
658 if !self.ro.nfa.is_anchored_start
659 || (self.ro.nfa.is_anchored_start && start == 0) {
660 lits.find_start(&text[start..])
661 .map(|(s, e)| (start + s, start + e))
662 } else {
663 None
664 }
665 }
666 AnchoredEnd => {
667 let lits = &self.ro.suffixes;
668 lits.find_end(&text[start..])
669 .map(|(s, e)| (start + s, start + e))
670 }
671 AhoCorasick => {
672 self.ro.ac.as_ref().unwrap()
673 .find(&text[start..])
674 .map(|m| (start + m.start(), start + m.end()))
675 }
676 }
677 }
678
679 /// Finds the leftmost-first match (start and end) using only the DFA.
680 ///
681 /// If the result returned indicates that the DFA quit, then another
682 /// matching engine should be used.
683 #[inline(always)] // reduces constant overhead
find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>684 fn find_dfa_forward(
685 &self,
686 text: &[u8],
687 start: usize,
688 ) -> dfa::Result<(usize, usize)> {
689 use dfa::Result::*;
690 let end = match dfa::Fsm::forward(
691 &self.ro.dfa,
692 self.cache,
693 false,
694 text,
695 start,
696 ) {
697 NoMatch(i) => return NoMatch(i),
698 Quit => return Quit,
699 Match(end) if start == end => return Match((start, start)),
700 Match(end) => end,
701 };
702 // Now run the DFA in reverse to find the start of the match.
703 match dfa::Fsm::reverse(
704 &self.ro.dfa_reverse,
705 self.cache,
706 false,
707 &text[start..],
708 end - start,
709 ) {
710 Match(s) => Match((start + s, end)),
711 NoMatch(i) => NoMatch(i),
712 Quit => Quit,
713 }
714 }
715
716 /// Finds the leftmost-first match (start and end) using only the DFA,
717 /// but assumes the regex is anchored at the end and therefore starts at
718 /// the end of the regex and matches in reverse.
719 ///
720 /// If the result returned indicates that the DFA quit, then another
721 /// matching engine should be used.
722 #[inline(always)] // reduces constant overhead
find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>723 fn find_dfa_anchored_reverse(
724 &self,
725 text: &[u8],
726 start: usize,
727 ) -> dfa::Result<(usize, usize)> {
728 use dfa::Result::*;
729 match dfa::Fsm::reverse(
730 &self.ro.dfa_reverse,
731 self.cache,
732 false,
733 &text[start..],
734 text.len() - start,
735 ) {
736 Match(s) => Match((start + s, text.len())),
737 NoMatch(i) => NoMatch(i),
738 Quit => Quit,
739 }
740 }
741
742 /// Finds the end of the shortest match using only the DFA.
743 #[inline(always)] // reduces constant overhead
shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>744 fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
745 dfa::Fsm::forward(&self.ro.dfa, self.cache, true, text, start)
746 }
747
748 /// Finds the end of the shortest match using only the DFA by scanning for
749 /// suffix literals.
750 ///
751 #[inline(always)] // reduces constant overhead
shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>752 fn shortest_dfa_reverse_suffix(
753 &self,
754 text: &[u8],
755 start: usize,
756 ) -> dfa::Result<usize> {
757 match self.exec_dfa_reverse_suffix(text, start) {
758 None => self.shortest_dfa(text, start),
759 Some(r) => r.map(|(_, end)| end),
760 }
761 }
762
763 /// Finds the end of the shortest match using only the DFA by scanning for
764 /// suffix literals. It also reports the start of the match.
765 ///
766 /// Note that if None is returned, then the optimization gave up to avoid
767 /// worst case quadratic behavior. A forward scanning DFA should be tried
768 /// next.
769 ///
770 /// If a match is returned and the full leftmost-first match is desired,
771 /// then a forward scan starting from the beginning of the match must be
772 /// done.
773 ///
774 /// If the result returned indicates that the DFA quit, then another
775 /// matching engine should be used.
776 #[inline(always)] // reduces constant overhead
exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>777 fn exec_dfa_reverse_suffix(
778 &self,
779 text: &[u8],
780 original_start: usize,
781 ) -> Option<dfa::Result<(usize, usize)>> {
782 use dfa::Result::*;
783
784 let lcs = self.ro.suffixes.lcs();
785 debug_assert!(lcs.len() >= 1);
786 let mut start = original_start;
787 let mut end = start;
788 let mut last_literal = start;
789 while end <= text.len() {
790 last_literal += match lcs.find(&text[last_literal..]) {
791 None => return Some(NoMatch(text.len())),
792 Some(i) => i,
793 };
794 end = last_literal + lcs.len();
795 match dfa::Fsm::reverse(
796 &self.ro.dfa_reverse,
797 self.cache,
798 false,
799 &text[start..end],
800 end - start,
801 ) {
802 Match(0) | NoMatch(0) => return None,
803 Match(i) => return Some(Match((start + i, end))),
804 NoMatch(i) => {
805 start += i;
806 last_literal += 1;
807 continue;
808 }
809 Quit => return Some(Quit),
810 };
811 }
812 Some(NoMatch(text.len()))
813 }
814
815 /// Finds the leftmost-first match (start and end) using only the DFA
816 /// by scanning for suffix literals.
817 ///
818 /// If the result returned indicates that the DFA quit, then another
819 /// matching engine should be used.
820 #[inline(always)] // reduces constant overhead
find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>821 fn find_dfa_reverse_suffix(
822 &self,
823 text: &[u8],
824 start: usize,
825 ) -> dfa::Result<(usize, usize)> {
826 use dfa::Result::*;
827
828 let match_start = match self.exec_dfa_reverse_suffix(text, start) {
829 None => return self.find_dfa_forward(text, start),
830 Some(Match((start, _))) => start,
831 Some(r) => return r,
832 };
833 // At this point, we've found a match. The only way to quit now
834 // without a match is if the DFA gives up (seems unlikely).
835 //
836 // Now run the DFA forwards to find the proper end of the match.
837 // (The suffix literal match can only indicate the earliest
838 // possible end location, which may appear before the end of the
839 // leftmost-first match.)
840 match dfa::Fsm::forward(
841 &self.ro.dfa,
842 self.cache,
843 false,
844 text,
845 match_start,
846 ) {
847 NoMatch(_) => panic!("BUG: reverse match implies forward match"),
848 Quit => Quit,
849 Match(e) => Match((match_start, e)),
850 }
851 }
852
853 /// Executes the NFA engine to return whether there is a match or not.
854 ///
855 /// Ideally, we could use shortest_nfa(...).is_some() and get the same
856 /// performance characteristics, but regex sets don't have captures, which
857 /// shortest_nfa depends on.
match_nfa( &self, text: &[u8], start: usize, ) -> bool858 fn match_nfa(
859 &self,
860 text: &[u8],
861 start: usize,
862 ) -> bool {
863 self.match_nfa_type(MatchNfaType::Auto, text, start)
864 }
865
866 /// Like match_nfa, but allows specification of the type of NFA engine.
match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool867 fn match_nfa_type(
868 &self,
869 ty: MatchNfaType,
870 text: &[u8],
871 start: usize,
872 ) -> bool {
873 self.exec_nfa(ty, &mut [false], &mut [], true, text, start, text.len())
874 }
875
876 /// Finds the shortest match using an NFA.
shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>877 fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
878 self.shortest_nfa_type(MatchNfaType::Auto, text, start)
879 }
880
881 /// Like shortest_nfa, but allows specification of the type of NFA engine.
shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>882 fn shortest_nfa_type(
883 &self,
884 ty: MatchNfaType,
885 text: &[u8],
886 start: usize,
887 ) -> Option<usize> {
888 let mut slots = [None, None];
889 if self.exec_nfa(
890 ty,
891 &mut [false],
892 &mut slots,
893 true,
894 text,
895 start,
896 text.len()
897 ) {
898 slots[1]
899 } else {
900 None
901 }
902 }
903
904 /// Like find, but executes an NFA engine.
find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>905 fn find_nfa(
906 &self,
907 ty: MatchNfaType,
908 text: &[u8],
909 start: usize,
910 ) -> Option<(usize, usize)> {
911 let mut slots = [None, None];
912 if self.exec_nfa(
913 ty,
914 &mut [false],
915 &mut slots,
916 false,
917 text,
918 start,
919 text.len()
920 ) {
921 match (slots[0], slots[1]) {
922 (Some(s), Some(e)) => Some((s, e)),
923 _ => None,
924 }
925 } else {
926 None
927 }
928 }
929
930 /// Like find_nfa, but fills in captures.
931 ///
932 /// `slots` should have length equal to `2 * nfa.captures.len()`.
captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>933 fn captures_nfa(
934 &self,
935 slots: &mut [Slot],
936 text: &[u8],
937 start: usize,
938 ) -> Option<(usize, usize)> {
939 self.captures_nfa_type(
940 MatchNfaType::Auto, slots, text, start, text.len())
941 }
942
943 /// Like captures_nfa, but allows specification of type of NFA engine.
captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> Option<(usize, usize)>944 fn captures_nfa_type(
945 &self,
946 ty: MatchNfaType,
947 slots: &mut [Slot],
948 text: &[u8],
949 start: usize,
950 end: usize,
951 ) -> Option<(usize, usize)> {
952 if self.exec_nfa(ty, &mut [false], slots, false, text, start, end) {
953 match (slots[0], slots[1]) {
954 (Some(s), Some(e)) => Some((s, e)),
955 _ => None,
956 }
957 } else {
958 None
959 }
960 }
961
exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool962 fn exec_nfa(
963 &self,
964 mut ty: MatchNfaType,
965 matches: &mut [bool],
966 slots: &mut [Slot],
967 quit_after_match: bool,
968 text: &[u8],
969 start: usize,
970 end: usize,
971 ) -> bool {
972 use self::MatchNfaType::*;
973 if let Auto = ty {
974 if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
975 ty = Backtrack;
976 } else {
977 ty = PikeVM;
978 }
979 }
980 match ty {
981 Auto => unreachable!(),
982 Backtrack => self.exec_backtrack(matches, slots, text, start, end),
983 PikeVM => {
984 self.exec_pikevm(
985 matches, slots, quit_after_match, text, start, end)
986 }
987 }
988 }
989
990 /// Always run the NFA algorithm.
exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, end: usize, ) -> bool991 fn exec_pikevm(
992 &self,
993 matches: &mut [bool],
994 slots: &mut [Slot],
995 quit_after_match: bool,
996 text: &[u8],
997 start: usize,
998 end: usize,
999 ) -> bool {
1000 if self.ro.nfa.uses_bytes() {
1001 pikevm::Fsm::exec(
1002 &self.ro.nfa,
1003 self.cache,
1004 matches,
1005 slots,
1006 quit_after_match,
1007 ByteInput::new(text, self.ro.nfa.only_utf8),
1008 start,
1009 end)
1010 } else {
1011 pikevm::Fsm::exec(
1012 &self.ro.nfa,
1013 self.cache,
1014 matches,
1015 slots,
1016 quit_after_match,
1017 CharInput::new(text),
1018 start,
1019 end)
1020 }
1021 }
1022
1023 /// Always runs the NFA using bounded backtracking.
exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, end: usize, ) -> bool1024 fn exec_backtrack(
1025 &self,
1026 matches: &mut [bool],
1027 slots: &mut [Slot],
1028 text: &[u8],
1029 start: usize,
1030 end: usize,
1031 ) -> bool {
1032 if self.ro.nfa.uses_bytes() {
1033 backtrack::Bounded::exec(
1034 &self.ro.nfa,
1035 self.cache,
1036 matches,
1037 slots,
1038 ByteInput::new(text, self.ro.nfa.only_utf8),
1039 start,
1040 end)
1041 } else {
1042 backtrack::Bounded::exec(
1043 &self.ro.nfa,
1044 self.cache,
1045 matches,
1046 slots,
1047 CharInput::new(text),
1048 start,
1049 end)
1050 }
1051 }
1052
1053 /// Finds which regular expressions match the given text.
1054 ///
1055 /// `matches` should have length equal to the number of regexes being
1056 /// searched.
1057 ///
1058 /// This is only useful when one wants to know which regexes in a set
1059 /// match some text.
many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1060 pub fn many_matches_at(
1061 &self,
1062 matches: &mut [bool],
1063 text: &[u8],
1064 start: usize,
1065 ) -> bool {
1066 use self::MatchType::*;
1067 if !self.is_anchor_end_match(text) {
1068 return false;
1069 }
1070 match self.ro.match_type {
1071 Literal(ty) => {
1072 debug_assert_eq!(matches.len(), 1);
1073 matches[0] = self.find_literals(ty, text, start).is_some();
1074 matches[0]
1075 }
1076 Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => {
1077 match dfa::Fsm::forward_many(
1078 &self.ro.dfa,
1079 self.cache,
1080 matches,
1081 text,
1082 start,
1083 ) {
1084 dfa::Result::Match(_) => true,
1085 dfa::Result::NoMatch(_) => false,
1086 dfa::Result::Quit => {
1087 self.exec_nfa(
1088 MatchNfaType::Auto,
1089 matches,
1090 &mut [],
1091 false,
1092 text,
1093 start,
1094 text.len())
1095 }
1096 }
1097 }
1098 Nfa(ty) => {
1099 self.exec_nfa(
1100 ty, matches, &mut [], false, text, start, text.len())
1101 }
1102 Nothing => false,
1103 }
1104 }
1105
1106 #[inline(always)] // reduces constant overhead
is_anchor_end_match(&self, text: &[u8]) -> bool1107 fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1108 // Only do this check if the haystack is big (>1MB).
1109 if text.len() > (1<<20) && self.ro.nfa.is_anchored_end {
1110 let lcs = self.ro.suffixes.lcs();
1111 if lcs.len() >= 1 && !lcs.is_suffix(text) {
1112 return false;
1113 }
1114 }
1115 true
1116 }
1117
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1118 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1119 &self.ro.nfa.capture_name_idx
1120 }
1121 }
1122
1123 impl<'c> ExecNoSyncStr<'c> {
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1124 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1125 self.0.capture_name_idx()
1126 }
1127 }
1128
1129 impl Exec {
1130 /// Get a searcher that isn't Sync.
1131 #[inline(always)] // reduces constant overhead
searcher(&self) -> ExecNoSync1132 pub fn searcher(&self) -> ExecNoSync {
1133 let create = || {
1134 Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)))
1135 };
1136 ExecNoSync {
1137 ro: &self.ro, // a clone is too expensive here! (and not needed)
1138 cache: self.cache.get_or(create),
1139 }
1140 }
1141
1142 /// Get a searcher that isn't Sync and can match on &str.
1143 #[inline(always)] // reduces constant overhead
searcher_str(&self) -> ExecNoSyncStr1144 pub fn searcher_str(&self) -> ExecNoSyncStr {
1145 ExecNoSyncStr(self.searcher())
1146 }
1147
1148 /// Build a Regex from this executor.
into_regex(self) -> re_unicode::Regex1149 pub fn into_regex(self) -> re_unicode::Regex {
1150 re_unicode::Regex::from(self)
1151 }
1152
1153 /// Build a RegexSet from this executor.
into_regex_set(self) -> re_set::unicode::RegexSet1154 pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1155 re_set::unicode::RegexSet::from(self)
1156 }
1157
1158 /// Build a Regex from this executor that can match arbitrary bytes.
into_byte_regex(self) -> re_bytes::Regex1159 pub fn into_byte_regex(self) -> re_bytes::Regex {
1160 re_bytes::Regex::from(self)
1161 }
1162
1163 /// Build a RegexSet from this executor that can match arbitrary bytes.
into_byte_regex_set(self) -> re_set::bytes::RegexSet1164 pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1165 re_set::bytes::RegexSet::from(self)
1166 }
1167
1168 /// The original regular expressions given by the caller that were
1169 /// compiled.
regex_strings(&self) -> &[String]1170 pub fn regex_strings(&self) -> &[String] {
1171 &self.ro.res
1172 }
1173
1174 /// Return a slice of capture names.
1175 ///
1176 /// Any capture that isn't named is None.
capture_names(&self) -> &[Option<String>]1177 pub fn capture_names(&self) -> &[Option<String>] {
1178 &self.ro.nfa.captures
1179 }
1180
1181 /// Return a reference to named groups mapping (from group name to
1182 /// group position).
capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1183 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1184 &self.ro.nfa.capture_name_idx
1185 }
1186 }
1187
1188 impl Clone for Exec {
clone(&self) -> Exec1189 fn clone(&self) -> Exec {
1190 Exec {
1191 ro: self.ro.clone(),
1192 cache: CachedThreadLocal::new(),
1193 }
1194 }
1195 }
1196
1197 impl ExecReadOnly {
choose_match_type(&self, hint: Option<MatchType>) -> MatchType1198 fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1199 use self::MatchType::*;
1200 if let Some(Nfa(_)) = hint {
1201 return hint.unwrap();
1202 }
1203 // If the NFA is empty, then we'll never match anything.
1204 if self.nfa.insts.is_empty() {
1205 return Nothing;
1206 }
1207 // If our set of prefixes is complete, then we can use it to find
1208 // a match in lieu of a regex engine. This doesn't quite work well in
1209 // the presence of multiple regexes, so only do it when there's one.
1210 //
1211 // TODO(burntsushi): Also, don't try to match literals if the regex is
1212 // partially anchored. We could technically do it, but we'd need to
1213 // create two sets of literals: all of them and then the subset that
1214 // aren't anchored. We would then only search for all of them when at
1215 // the beginning of the input and use the subset in all other cases.
1216 if self.res.len() == 1 {
1217 if self.ac.is_some() {
1218 return Literal(MatchLiteralType::AhoCorasick);
1219 }
1220 if self.nfa.prefixes.complete() {
1221 return if self.nfa.is_anchored_start {
1222 Literal(MatchLiteralType::AnchoredStart)
1223 } else {
1224 Literal(MatchLiteralType::Unanchored)
1225 };
1226 }
1227 if self.suffixes.complete() {
1228 return if self.nfa.is_anchored_end {
1229 Literal(MatchLiteralType::AnchoredEnd)
1230 } else {
1231 // This case shouldn't happen. When the regex isn't
1232 // anchored, then complete prefixes should imply complete
1233 // suffixes.
1234 Literal(MatchLiteralType::Unanchored)
1235 };
1236 }
1237 }
1238 // If we can execute the DFA, then we totally should.
1239 if dfa::can_exec(&self.dfa) {
1240 // Regex sets require a slightly specialized path.
1241 if self.res.len() >= 2 {
1242 return DfaMany;
1243 }
1244 // If the regex is anchored at the end but not the start, then
1245 // just match in reverse from the end of the haystack.
1246 if !self.nfa.is_anchored_start && self.nfa.is_anchored_end {
1247 return DfaAnchoredReverse;
1248 }
1249 // If there's a longish suffix literal, then it might be faster
1250 // to look for that first.
1251 if self.should_suffix_scan() {
1252 return DfaSuffix;
1253 }
1254 // Fall back to your garden variety forward searching lazy DFA.
1255 return Dfa;
1256 }
1257 // We're so totally hosed.
1258 Nfa(MatchNfaType::Auto)
1259 }
1260
1261 /// Returns true if the program is amenable to suffix scanning.
1262 ///
1263 /// When this is true, as a heuristic, we assume it is OK to quickly scan
1264 /// for suffix literals and then do a *reverse* DFA match from any matches
1265 /// produced by the literal scan. (And then followed by a forward DFA
1266 /// search, since the previously found suffix literal maybe not actually be
1267 /// the end of a match.)
1268 ///
1269 /// This is a bit of a specialized optimization, but can result in pretty
1270 /// big performance wins if 1) there are no prefix literals and 2) the
1271 /// suffix literals are pretty rare in the text. (1) is obviously easy to
1272 /// account for but (2) is harder. As a proxy, we assume that longer
1273 /// strings are generally rarer, so we only enable this optimization when
1274 /// we have a meaty suffix.
should_suffix_scan(&self) -> bool1275 fn should_suffix_scan(&self) -> bool {
1276 if self.suffixes.is_empty() {
1277 return false;
1278 }
1279 let lcs_len = self.suffixes.lcs().char_len();
1280 lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1281 }
1282 }
1283
1284 #[derive(Clone, Copy, Debug)]
1285 enum MatchType {
1286 /// A single or multiple literal search. This is only used when the regex
1287 /// can be decomposed into a literal search.
1288 Literal(MatchLiteralType),
1289 /// A normal DFA search.
1290 Dfa,
1291 /// A reverse DFA search starting from the end of a haystack.
1292 DfaAnchoredReverse,
1293 /// A reverse DFA search with suffix literal scanning.
1294 DfaSuffix,
1295 /// Use the DFA on two or more regular expressions.
1296 DfaMany,
1297 /// An NFA variant.
1298 Nfa(MatchNfaType),
1299 /// No match is ever possible, so don't ever try to search.
1300 Nothing,
1301 }
1302
1303 #[derive(Clone, Copy, Debug)]
1304 enum MatchLiteralType {
1305 /// Match literals anywhere in text.
1306 Unanchored,
1307 /// Match literals only at the start of text.
1308 AnchoredStart,
1309 /// Match literals only at the end of text.
1310 AnchoredEnd,
1311 /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1312 /// ExecReadOnly.
1313 AhoCorasick,
1314 }
1315
1316 #[derive(Clone, Copy, Debug)]
1317 enum MatchNfaType {
1318 /// Choose between Backtrack and PikeVM.
1319 Auto,
1320 /// NFA bounded backtracking.
1321 ///
1322 /// (This is only set by tests, since it never makes sense to always want
1323 /// backtracking.)
1324 Backtrack,
1325 /// The Pike VM.
1326 ///
1327 /// (This is only set by tests, since it never makes sense to always want
1328 /// the Pike VM.)
1329 PikeVM,
1330 }
1331
1332 /// `ProgramCache` maintains reusable allocations for each matching engine
1333 /// available to a particular program.
1334 pub type ProgramCache = RefCell<ProgramCacheInner>;
1335
1336 #[derive(Debug)]
1337 pub struct ProgramCacheInner {
1338 pub pikevm: pikevm::Cache,
1339 pub backtrack: backtrack::Cache,
1340 pub dfa: dfa::Cache,
1341 pub dfa_reverse: dfa::Cache,
1342 }
1343
1344 impl ProgramCacheInner {
new(ro: &ExecReadOnly) -> Self1345 fn new(ro: &ExecReadOnly) -> Self {
1346 ProgramCacheInner {
1347 pikevm: pikevm::Cache::new(&ro.nfa),
1348 backtrack: backtrack::Cache::new(&ro.nfa),
1349 dfa: dfa::Cache::new(&ro.dfa),
1350 dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1351 }
1352 }
1353 }
1354
1355 /// Alternation literals checks if the given HIR is a simple alternation of
1356 /// literals, and if so, returns them. Otherwise, this returns None.
alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>>1357 fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1358 use syntax::hir::{HirKind, Literal};
1359
1360 // This is pretty hacky, but basically, if `is_alternation_literal` is
1361 // true, then we can make several assumptions about the structure of our
1362 // HIR. This is what justifies the `unreachable!` statements below.
1363 //
1364 // This code should be refactored once we overhaul this crate's
1365 // optimization pipeline, because this is a terribly inflexible way to go
1366 // about things.
1367
1368 if !expr.is_alternation_literal() {
1369 return None;
1370 }
1371 let alts = match *expr.kind() {
1372 HirKind::Alternation(ref alts) => alts,
1373 _ => return None, // one literal isn't worth it
1374 };
1375
1376 let extendlit = |lit: &Literal, dst: &mut Vec<u8>| {
1377 match *lit {
1378 Literal::Unicode(c) => {
1379 let mut buf = [0; 4];
1380 dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1381 }
1382 Literal::Byte(b) => {
1383 dst.push(b);
1384 }
1385 }
1386 };
1387
1388 let mut lits = vec![];
1389 for alt in alts {
1390 let mut lit = vec![];
1391 match *alt.kind() {
1392 HirKind::Literal(ref x) => extendlit(x, &mut lit),
1393 HirKind::Concat(ref exprs) => {
1394 for e in exprs {
1395 match *e.kind() {
1396 HirKind::Literal(ref x) => extendlit(x, &mut lit),
1397 _ => unreachable!("expected literal, got {:?}", e),
1398 }
1399 }
1400 }
1401 _ => unreachable!("expected literal or concat, got {:?}", alt),
1402 }
1403 lits.push(lit);
1404 }
1405 Some(lits)
1406 }
1407
1408 #[cfg(test)]
1409 mod test {
1410 #[test]
uppercut_s_backtracking_bytes_default_bytes_mismatch()1411 fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1412 use internal::ExecBuilder;
1413
1414 let backtrack_bytes_re = ExecBuilder::new("^S")
1415 .bounded_backtracking()
1416 .only_utf8(false)
1417 .build()
1418 .map(|exec| exec.into_byte_regex())
1419 .map_err(|err| format!("{}", err))
1420 .unwrap();
1421
1422 let default_bytes_re = ExecBuilder::new("^S")
1423 .only_utf8(false)
1424 .build()
1425 .map(|exec| exec.into_byte_regex())
1426 .map_err(|err| format!("{}", err))
1427 .unwrap();
1428
1429 let input = vec![83, 83];
1430
1431 let s1 = backtrack_bytes_re.split(&input);
1432 let s2 = default_bytes_re.split(&input);
1433 for (chunk1, chunk2) in s1.zip(s2) {
1434 assert_eq!(chunk1, chunk2);
1435 }
1436 }
1437
1438 #[test]
unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1439 fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1440 use internal::ExecBuilder;
1441
1442 let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1443 .bounded_backtracking()
1444 .bytes(true)
1445 .build()
1446 .map(|exec| exec.into_regex())
1447 .map_err(|err| format!("{}", err))
1448 .unwrap();
1449
1450 let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1451 .bytes(true)
1452 .build()
1453 .map(|exec| exec.into_regex())
1454 .map_err(|err| format!("{}", err))
1455 .unwrap();
1456
1457 let input = "**";
1458
1459 let s1 = backtrack_bytes_re.split(input);
1460 let s2 = default_bytes_re.split(input);
1461 for (chunk1, chunk2) in s1.zip(s2) {
1462 assert_eq!(chunk1, chunk2);
1463 }
1464 }
1465 }
1466