1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT 2 // file at the top-level directory of this distribution and at 3 // http://rust-lang.org/COPYRIGHT. 4 // 5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 8 // option. This file may not be copied, modified, or distributed 9 // except according to those terms. 10 11 use std::cell::RefCell; 12 use std::collections::HashMap; 13 use std::cmp; 14 use std::sync::Arc; 15 16 use thread_local::CachedThreadLocal; 17 use syntax::{Expr, ExprBuilder, Literals}; 18 19 use backtrack; 20 use compile::Compiler; 21 use dfa; 22 use error::Error; 23 use input::{ByteInput, CharInput}; 24 use literals::LiteralSearcher; 25 use pikevm; 26 use prog::Program; 27 use re_builder::RegexOptions; 28 use re_bytes; 29 use re_set; 30 use re_trait::{RegularExpression, Slot, Locations, as_slots}; 31 use re_unicode; 32 use utf8::next_utf8; 33 34 /// Exec manages the execution of a regular expression. 35 /// 36 /// In particular, this manages the various compiled forms of a single regular 37 /// expression and the choice of which matching engine to use to execute a 38 /// regular expression. 39 pub struct Exec { 40 /// All read only state. 41 ro: Arc<ExecReadOnly>, 42 /// Caches for the various matching engines. 43 cache: CachedThreadLocal<ProgramCache>, 44 } 45 46 /// ExecNoSync is like Exec, except it embeds a reference to a cache. This 47 /// means it is no longer Sync, but we can now avoid the overhead of 48 /// synchronization to fetch the cache. 49 #[derive(Debug)] 50 pub struct ExecNoSync<'c> { 51 /// All read only state. 52 ro: &'c Arc<ExecReadOnly>, 53 /// Caches for the various matching engines. 54 cache: &'c ProgramCache, 55 } 56 57 /// ExecNoSyncStr is like ExecNoSync, but matches on &str instead of &[u8]. 58 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); 59 60 /// ExecReadOnly comprises all read only state for a regex. Namely, all such 61 /// state is determined at compile time and never changes during search. 62 #[derive(Debug)] 63 struct ExecReadOnly { 64 /// The original regular expressions given by the caller to compile. 65 res: Vec<String>, 66 /// A compiled program that is used in the NFA simulation and backtracking. 67 /// It can be byte-based or Unicode codepoint based. 68 /// 69 /// N.B. It is not possibly to make this byte-based from the public API. 70 /// It is only used for testing byte based programs in the NFA simulations. 71 nfa: Program, 72 /// A compiled byte based program for DFA execution. This is only used 73 /// if a DFA can be executed. (Currently, only word boundary assertions are 74 /// not supported.) Note that this program contains an embedded `.*?` 75 /// preceding the first capture group, unless the regex is anchored at the 76 /// beginning. 77 dfa: Program, 78 /// The same as above, except the program is reversed (and there is no 79 /// preceding `.*?`). This is used by the DFA to find the starting location 80 /// of matches. 81 dfa_reverse: Program, 82 /// A set of suffix literals extracted from the regex. 83 /// 84 /// Prefix literals are stored on the `Program`, since they are used inside 85 /// the matching engines. 86 suffixes: LiteralSearcher, 87 /// match_type encodes as much upfront knowledge about how we're going to 88 /// execute a search as possible. 89 match_type: MatchType, 90 } 91 92 /// Facilitates the construction of an executor by exposing various knobs 93 /// to control how a regex is executed and what kinds of resources it's 94 /// permitted to use. 95 pub struct ExecBuilder { 96 options: RegexOptions, 97 match_type: Option<MatchType>, 98 bytes: bool, 99 only_utf8: bool, 100 } 101 102 /// Parsed represents a set of parsed regular expressions and their detected 103 /// literals. 104 struct Parsed { 105 exprs: Vec<Expr>, 106 prefixes: Literals, 107 suffixes: Literals, 108 bytes: bool, 109 } 110 111 impl ExecBuilder { 112 /// Create a regex execution builder. 113 /// 114 /// This uses default settings for everything except the regex itself, 115 /// which must be provided. Further knobs can be set by calling methods, 116 /// and then finally, `build` to actually create the executor. new(re: &str) -> Self117 pub fn new(re: &str) -> Self { 118 Self::new_many(&[re]) 119 } 120 121 /// Like new, but compiles the union of the given regular expressions. 122 /// 123 /// Note that when compiling 2 or more regular expressions, capture groups 124 /// are completely unsupported. (This means both `find` and `captures` 125 /// wont work.) new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item=S>126 pub fn new_many<I, S>(res: I) -> Self 127 where S: AsRef<str>, I: IntoIterator<Item=S> { 128 let mut opts = RegexOptions::default(); 129 opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); 130 Self::new_options(opts) 131 } 132 133 /// Create a regex execution builder. new_options(opts: RegexOptions) -> Self134 pub fn new_options(opts: RegexOptions) -> Self { 135 ExecBuilder { 136 options: opts, 137 match_type: None, 138 bytes: false, 139 only_utf8: true, 140 } 141 } 142 143 /// Set the matching engine to be automatically determined. 144 /// 145 /// This is the default state and will apply whatever optimizations are 146 /// possible, such as running a DFA. 147 /// 148 /// This overrides whatever was previously set via the `nfa` or 149 /// `bounded_backtracking` methods. automatic(mut self) -> Self150 pub fn automatic(mut self) -> Self { 151 self.match_type = None; 152 self 153 } 154 155 /// Sets the matching engine to use the NFA algorithm no matter what 156 /// optimizations are possible. 157 /// 158 /// This overrides whatever was previously set via the `automatic` or 159 /// `bounded_backtracking` methods. nfa(mut self) -> Self160 pub fn nfa(mut self) -> Self { 161 self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); 162 self 163 } 164 165 /// Sets the matching engine to use a bounded backtracking engine no 166 /// matter what optimizations are possible. 167 /// 168 /// One must use this with care, since the bounded backtracking engine 169 /// uses memory proportion to `len(regex) * len(text)`. 170 /// 171 /// This overrides whatever was previously set via the `automatic` or 172 /// `nfa` methods. bounded_backtracking(mut self) -> Self173 pub fn bounded_backtracking(mut self) -> Self { 174 self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); 175 self 176 } 177 178 /// Compiles byte based programs for use with the NFA matching engines. 179 /// 180 /// By default, the NFA engines match on Unicode scalar values. They can 181 /// be made to use byte based programs instead. In general, the byte based 182 /// programs are slower because of a less efficient encoding of character 183 /// classes. 184 /// 185 /// Note that this does not impact DFA matching engines, which always 186 /// execute on bytes. bytes(mut self, yes: bool) -> Self187 pub fn bytes(mut self, yes: bool) -> Self { 188 self.bytes = yes; 189 self 190 } 191 192 /// When disabled, the program compiled may match arbitrary bytes. 193 /// 194 /// When enabled (the default), all compiled programs exclusively match 195 /// valid UTF-8 bytes. only_utf8(mut self, yes: bool) -> Self196 pub fn only_utf8(mut self, yes: bool) -> Self { 197 self.only_utf8 = yes; 198 self 199 } 200 201 /// Set the Unicode flag. unicode(mut self, yes: bool) -> Self202 pub fn unicode(mut self, yes: bool) -> Self { 203 self.options.unicode = yes; 204 self 205 } 206 207 /// Parse the current set of patterns into their AST and extract literals. parse(&self) -> Result<Parsed, Error>208 fn parse(&self) -> Result<Parsed, Error> { 209 let mut exprs = Vec::with_capacity(self.options.pats.len()); 210 let mut prefixes = Some(Literals::empty()); 211 let mut suffixes = Some(Literals::empty()); 212 let mut bytes = false; 213 let is_set = self.options.pats.len() > 1; 214 // If we're compiling a regex set and that set has any anchored 215 // expressions, then disable all literal optimizations. 216 for pat in &self.options.pats { 217 let parser = 218 ExprBuilder::new() 219 .case_insensitive(self.options.case_insensitive) 220 .multi_line(self.options.multi_line) 221 .dot_matches_new_line(self.options.dot_matches_new_line) 222 .swap_greed(self.options.swap_greed) 223 .ignore_whitespace(self.options.ignore_whitespace) 224 .unicode(self.options.unicode) 225 .allow_bytes(!self.only_utf8); 226 let expr = try!(parser.parse(pat)); 227 bytes = bytes || expr.has_bytes(); 228 229 if !expr.is_anchored_start() && expr.has_anchored_start() { 230 // Partial anchors unfortunately make it hard to use prefixes, 231 // so disable them. 232 prefixes = None; 233 } else if is_set && expr.is_anchored_start() { 234 // Regex sets with anchors do not go well with literal 235 // optimizations. 236 prefixes = None; 237 } 238 prefixes = prefixes.and_then(|mut prefixes| { 239 if !prefixes.union_prefixes(&expr) { 240 None 241 } else { 242 Some(prefixes) 243 } 244 }); 245 246 if !expr.is_anchored_end() && expr.has_anchored_end() { 247 // Partial anchors unfortunately make it hard to use suffixes, 248 // so disable them. 249 suffixes = None; 250 } else if is_set && expr.is_anchored_end() { 251 // Regex sets with anchors do not go well with literal 252 // optimizations. 253 prefixes = None; 254 } 255 suffixes = suffixes.and_then(|mut suffixes| { 256 if !suffixes.union_suffixes(&expr) { 257 None 258 } else { 259 Some(suffixes) 260 } 261 }); 262 exprs.push(expr); 263 } 264 Ok(Parsed { 265 exprs: exprs, 266 prefixes: prefixes.unwrap_or(Literals::empty()), 267 suffixes: suffixes.unwrap_or(Literals::empty()), 268 bytes: bytes, 269 }) 270 } 271 272 /// Build an executor that can run a regular expression. build(self) -> Result<Exec, Error>273 pub fn build(self) -> Result<Exec, Error> { 274 // Special case when we have no patterns to compile. 275 // This can happen when compiling a regex set. 276 if self.options.pats.is_empty() { 277 let ro = Arc::new(ExecReadOnly { 278 res: vec![], 279 nfa: Program::new(), 280 dfa: Program::new(), 281 dfa_reverse: Program::new(), 282 suffixes: LiteralSearcher::empty(), 283 match_type: MatchType::Nothing, 284 }); 285 return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() }); 286 } 287 let parsed = try!(self.parse()); 288 let mut nfa = try!( 289 Compiler::new() 290 .size_limit(self.options.size_limit) 291 .bytes(self.bytes || parsed.bytes) 292 .only_utf8(self.only_utf8) 293 .compile(&parsed.exprs)); 294 let mut dfa = try!( 295 Compiler::new() 296 .size_limit(self.options.size_limit) 297 .dfa(true) 298 .only_utf8(self.only_utf8) 299 .compile(&parsed.exprs)); 300 let mut dfa_reverse = try!( 301 Compiler::new() 302 .size_limit(self.options.size_limit) 303 .dfa(true) 304 .only_utf8(self.only_utf8) 305 .reverse(true) 306 .compile(&parsed.exprs)); 307 308 let prefixes = parsed.prefixes.unambiguous_prefixes(); 309 let suffixes = parsed.suffixes.unambiguous_suffixes(); 310 nfa.prefixes = LiteralSearcher::prefixes(prefixes); 311 dfa.prefixes = nfa.prefixes.clone(); 312 dfa.dfa_size_limit = self.options.dfa_size_limit; 313 dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; 314 315 let mut ro = ExecReadOnly { 316 res: self.options.pats, 317 nfa: nfa, 318 dfa: dfa, 319 dfa_reverse: dfa_reverse, 320 suffixes: LiteralSearcher::suffixes(suffixes), 321 match_type: MatchType::Nothing, 322 }; 323 ro.match_type = ro.choose_match_type(self.match_type); 324 325 let ro = Arc::new(ro); 326 Ok(Exec { ro: ro, cache: CachedThreadLocal::new() }) 327 } 328 } 329 330 impl<'c> RegularExpression for ExecNoSyncStr<'c> { 331 type Text = str; 332 slots_len(&self) -> usize333 fn slots_len(&self) -> usize { self.0.slots_len() } 334 next_after_empty(&self, text: &str, i: usize) -> usize335 fn next_after_empty(&self, text: &str, i: usize) -> usize { 336 next_utf8(text.as_bytes(), i) 337 } 338 339 #[inline(always)] // reduces constant overhead shortest_match_at(&self, text: &str, start: usize) -> Option<usize>340 fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> { 341 self.0.shortest_match_at(text.as_bytes(), start) 342 } 343 344 #[inline(always)] // reduces constant overhead is_match_at(&self, text: &str, start: usize) -> bool345 fn is_match_at(&self, text: &str, start: usize) -> bool { 346 self.0.is_match_at(text.as_bytes(), start) 347 } 348 349 #[inline(always)] // reduces constant overhead find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>350 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { 351 self.0.find_at(text.as_bytes(), start) 352 } 353 354 #[inline(always)] // reduces constant overhead read_captures_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>355 fn read_captures_at( 356 &self, 357 locs: &mut Locations, 358 text: &str, 359 start: usize, 360 ) -> Option<(usize, usize)> { 361 self.0.read_captures_at(locs, text.as_bytes(), start) 362 } 363 } 364 365 impl<'c> RegularExpression for ExecNoSync<'c> { 366 type Text = [u8]; 367 368 /// Returns the number of capture slots in the regular expression. (There 369 /// are two slots for every capture group, corresponding to possibly empty 370 /// start and end locations of the capture.) slots_len(&self) -> usize371 fn slots_len(&self) -> usize { 372 self.ro.nfa.captures.len() * 2 373 } 374 next_after_empty(&self, _text: &[u8], i: usize) -> usize375 fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { 376 i + 1 377 } 378 379 /// Returns the end of a match location, possibly occurring before the 380 /// end location of the correct leftmost-first match. 381 #[inline(always)] // reduces constant overhead shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>382 fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> { 383 if !self.is_anchor_end_match(text) { 384 return None; 385 } 386 match self.ro.match_type { 387 MatchType::Literal(ty) => { 388 self.find_literals(ty, text, start).map(|(_, e)| e) 389 } 390 MatchType::Dfa | MatchType::DfaMany => { 391 match self.shortest_dfa(text, start) { 392 dfa::Result::Match(end) => Some(end), 393 dfa::Result::NoMatch(_) => None, 394 dfa::Result::Quit => self.shortest_nfa(text, start), 395 } 396 } 397 MatchType::DfaAnchoredReverse => { 398 match dfa::Fsm::reverse( 399 &self.ro.dfa_reverse, 400 &self.cache, 401 true, 402 &text[start..], 403 text.len(), 404 ) { 405 dfa::Result::Match(_) => Some(text.len()), 406 dfa::Result::NoMatch(_) => None, 407 dfa::Result::Quit => self.shortest_nfa(text, start), 408 } 409 } 410 MatchType::DfaSuffix => { 411 match self.shortest_dfa_reverse_suffix(text, start) { 412 dfa::Result::Match(e) => Some(e), 413 dfa::Result::NoMatch(_) => None, 414 dfa::Result::Quit => self.shortest_nfa(text, start), 415 } 416 } 417 MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), 418 MatchType::Nothing => None, 419 } 420 } 421 422 /// Returns true if and only if the regex matches text. 423 /// 424 /// For single regular expressions, this is equivalent to calling 425 /// shortest_match(...).is_some(). 426 #[inline(always)] // reduces constant overhead is_match_at(&self, text: &[u8], start: usize) -> bool427 fn is_match_at(&self, text: &[u8], start: usize) -> bool { 428 if !self.is_anchor_end_match(text) { 429 return false; 430 } 431 // We need to do this dance because shortest_match relies on the NFA 432 // filling in captures[1], but a RegexSet has no captures. In other 433 // words, a RegexSet can't (currently) use shortest_match. ---AG 434 match self.ro.match_type { 435 MatchType::Literal(ty) => { 436 self.find_literals(ty, text, start).is_some() 437 } 438 MatchType::Dfa | MatchType::DfaMany => { 439 match self.shortest_dfa(text, start) { 440 dfa::Result::Match(_) => true, 441 dfa::Result::NoMatch(_) => false, 442 dfa::Result::Quit => self.match_nfa(text, start), 443 } 444 } 445 MatchType::DfaAnchoredReverse => { 446 match dfa::Fsm::reverse( 447 &self.ro.dfa_reverse, 448 &self.cache, 449 true, 450 &text[start..], 451 text.len(), 452 ) { 453 dfa::Result::Match(_) => true, 454 dfa::Result::NoMatch(_) => false, 455 dfa::Result::Quit => self.match_nfa(text, start), 456 } 457 } 458 MatchType::DfaSuffix => { 459 match self.shortest_dfa_reverse_suffix(text, start) { 460 dfa::Result::Match(_) => true, 461 dfa::Result::NoMatch(_) => false, 462 dfa::Result::Quit => self.match_nfa(text, start), 463 } 464 } 465 MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), 466 MatchType::Nothing => false, 467 } 468 } 469 470 /// Finds the start and end location of the leftmost-first match, starting 471 /// at the given location. 472 #[inline(always)] // reduces constant overhead find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>473 fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { 474 if !self.is_anchor_end_match(text) { 475 return None; 476 } 477 match self.ro.match_type { 478 MatchType::Literal(ty) => { 479 self.find_literals(ty, text, start) 480 } 481 MatchType::Dfa => { 482 match self.find_dfa_forward(text, start) { 483 dfa::Result::Match((s, e)) => Some((s, e)), 484 dfa::Result::NoMatch(_) => None, 485 dfa::Result::Quit => { 486 self.find_nfa(MatchNfaType::Auto, text, start) 487 } 488 } 489 } 490 MatchType::DfaAnchoredReverse => { 491 match self.find_dfa_anchored_reverse(text, start) { 492 dfa::Result::Match((s, e)) => Some((s, e)), 493 dfa::Result::NoMatch(_) => None, 494 dfa::Result::Quit => { 495 self.find_nfa(MatchNfaType::Auto, text, start) 496 } 497 } 498 } 499 MatchType::DfaSuffix => { 500 match self.find_dfa_reverse_suffix(text, start) { 501 dfa::Result::Match((s, e)) => Some((s, e)), 502 dfa::Result::NoMatch(_) => None, 503 dfa::Result::Quit => { 504 self.find_nfa(MatchNfaType::Auto, text, start) 505 } 506 } 507 } 508 MatchType::Nfa(ty) => self.find_nfa(ty, text, start), 509 MatchType::Nothing => None, 510 MatchType::DfaMany => { 511 unreachable!("BUG: RegexSet cannot be used with find") 512 } 513 } 514 } 515 516 /// Finds the start and end location of the leftmost-first match and also 517 /// fills in all matching capture groups. 518 /// 519 /// The number of capture slots given should be equal to the total number 520 /// of capture slots in the compiled program. 521 /// 522 /// Note that the first two slots always correspond to the start and end 523 /// locations of the overall match. read_captures_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>524 fn read_captures_at( 525 &self, 526 locs: &mut Locations, 527 text: &[u8], 528 start: usize, 529 ) -> Option<(usize, usize)> { 530 let slots = as_slots(locs); 531 for slot in slots.iter_mut() { 532 *slot = None; 533 } 534 // If the caller unnecessarily uses this, then we try to save them 535 // from themselves. 536 match slots.len() { 537 0 => return self.find_at(text, start), 538 2 => { 539 return self.find_at(text, start).map(|(s, e)| { 540 slots[0] = Some(s); 541 slots[1] = Some(e); 542 (s, e) 543 }); 544 } 545 _ => {} // fallthrough 546 } 547 if !self.is_anchor_end_match(text) { 548 return None; 549 } 550 match self.ro.match_type { 551 MatchType::Literal(ty) => { 552 self.find_literals(ty, text, start).and_then(|(s, e)| { 553 self.captures_nfa_with_match(slots, text, s, e) 554 }) 555 } 556 MatchType::Dfa => { 557 match self.find_dfa_forward(text, start) { 558 dfa::Result::Match((s, e)) => { 559 self.captures_nfa_with_match(slots, text, s, e) 560 } 561 dfa::Result::NoMatch(_) => None, 562 dfa::Result::Quit => self.captures_nfa(slots, text, start), 563 } 564 } 565 MatchType::DfaAnchoredReverse => { 566 match self.find_dfa_anchored_reverse(text, start) { 567 dfa::Result::Match((s, e)) => { 568 self.captures_nfa_with_match(slots, text, s, e) 569 } 570 dfa::Result::NoMatch(_) => None, 571 dfa::Result::Quit => self.captures_nfa(slots, text, start), 572 } 573 } 574 MatchType::DfaSuffix => { 575 match self.find_dfa_reverse_suffix(text, start) { 576 dfa::Result::Match((s, e)) => { 577 self.captures_nfa_with_match(slots, text, s, e) 578 } 579 dfa::Result::NoMatch(_) => None, 580 dfa::Result::Quit => self.captures_nfa(slots, text, start), 581 } 582 } 583 MatchType::Nfa(ty) => { 584 self.captures_nfa_type(ty, slots, text, start) 585 } 586 MatchType::Nothing => None, 587 MatchType::DfaMany => { 588 unreachable!("BUG: RegexSet cannot be used with captures") 589 } 590 } 591 } 592 } 593 594 impl<'c> ExecNoSync<'c> { 595 /// Finds the leftmost-first match using only literal search. 596 #[inline(always)] // reduces constant overhead find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>597 fn find_literals( 598 &self, 599 ty: MatchLiteralType, 600 text: &[u8], 601 start: usize, 602 ) -> Option<(usize, usize)> { 603 use self::MatchLiteralType::*; 604 match ty { 605 Unanchored => { 606 let lits = &self.ro.nfa.prefixes; 607 lits.find(&text[start..]) 608 .map(|(s, e)| (start + s, start + e)) 609 } 610 AnchoredStart => { 611 let lits = &self.ro.nfa.prefixes; 612 lits.find_start(&text[start..]) 613 .map(|(s, e)| (start + s, start + e)) 614 } 615 AnchoredEnd => { 616 let lits = &self.ro.suffixes; 617 lits.find_end(&text[start..]) 618 .map(|(s, e)| (start + s, start + e)) 619 } 620 } 621 } 622 623 /// Finds the leftmost-first match (start and end) using only the DFA. 624 /// 625 /// If the result returned indicates that the DFA quit, then another 626 /// matching engine should be used. 627 #[inline(always)] // reduces constant overhead find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>628 fn find_dfa_forward( 629 &self, 630 text: &[u8], 631 start: usize, 632 ) -> dfa::Result<(usize, usize)> { 633 use dfa::Result::*; 634 let end = match dfa::Fsm::forward( 635 &self.ro.dfa, 636 &self.cache, 637 false, 638 text, 639 start, 640 ) { 641 NoMatch(i) => return NoMatch(i), 642 Quit => return Quit, 643 Match(end) if start == end => return Match((start, start)), 644 Match(end) => end, 645 }; 646 // Now run the DFA in reverse to find the start of the match. 647 match dfa::Fsm::reverse( 648 &self.ro.dfa_reverse, 649 &self.cache, 650 false, 651 &text[start..], 652 end - start, 653 ) { 654 Match(s) => Match((start + s, end)), 655 NoMatch(i) => NoMatch(i), 656 Quit => Quit, 657 } 658 } 659 660 /// Finds the leftmost-first match (start and end) using only the DFA, 661 /// but assumes the regex is anchored at the end and therefore starts at 662 /// the end of the regex and matches in reverse. 663 /// 664 /// If the result returned indicates that the DFA quit, then another 665 /// matching engine should be used. 666 #[inline(always)] // reduces constant overhead find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>667 fn find_dfa_anchored_reverse( 668 &self, 669 text: &[u8], 670 start: usize, 671 ) -> dfa::Result<(usize, usize)> { 672 use dfa::Result::*; 673 match dfa::Fsm::reverse( 674 &self.ro.dfa_reverse, 675 &self.cache, 676 false, 677 &text[start..], 678 text.len() - start, 679 ) { 680 Match(s) => Match((start + s, text.len())), 681 NoMatch(i) => NoMatch(i), 682 Quit => Quit, 683 } 684 } 685 686 /// Finds the end of the shortest match using only the DFA. 687 #[inline(always)] // reduces constant overhead shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>688 fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> { 689 dfa::Fsm::forward(&self.ro.dfa, &self.cache, true, text, start) 690 } 691 692 /// Finds the end of the shortest match using only the DFA by scanning for 693 /// suffix literals. 694 /// 695 #[inline(always)] // reduces constant overhead shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>696 fn shortest_dfa_reverse_suffix( 697 &self, 698 text: &[u8], 699 start: usize, 700 ) -> dfa::Result<usize> { 701 match self.exec_dfa_reverse_suffix(text, start) { 702 None => self.shortest_dfa(text, start), 703 Some(r) => r.map(|(_, end)| end), 704 } 705 } 706 707 /// Finds the end of the shortest match using only the DFA by scanning for 708 /// suffix literals. It also reports the start of the match. 709 /// 710 /// Note that if None is returned, then the optimization gave up to avoid 711 /// worst case quadratic behavior. A forward scanning DFA should be tried 712 /// next. 713 /// 714 /// If a match is returned and the full leftmost-first match is desired, 715 /// then a forward scan starting from the beginning of the match must be 716 /// done. 717 /// 718 /// If the result returned indicates that the DFA quit, then another 719 /// matching engine should be used. 720 #[inline(always)] // reduces constant overhead exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>721 fn exec_dfa_reverse_suffix( 722 &self, 723 text: &[u8], 724 original_start: usize, 725 ) -> Option<dfa::Result<(usize, usize)>> { 726 use dfa::Result::*; 727 728 let lcs = self.ro.suffixes.lcs(); 729 debug_assert!(lcs.len() >= 1); 730 let mut start = original_start; 731 let mut end = start; 732 while end <= text.len() { 733 start = end; 734 end = end + match lcs.find(&text[end..]) { 735 None => return Some(NoMatch(text.len())), 736 Some(start) => start + lcs.len(), 737 }; 738 match dfa::Fsm::reverse( 739 &self.ro.dfa_reverse, 740 &self.cache, 741 false, 742 &text[start..end], 743 end - start, 744 ) { 745 Match(0) | NoMatch(0) => return None, 746 Match(s) => return Some(Match((s + start, end))), 747 NoMatch(_) => continue, 748 Quit => return Some(Quit), 749 }; 750 } 751 Some(NoMatch(text.len())) 752 } 753 754 /// Finds the leftmost-first match (start and end) using only the DFA 755 /// by scanning for suffix literals. 756 /// 757 /// If the result returned indicates that the DFA quit, then another 758 /// matching engine should be used. 759 #[inline(always)] // reduces constant overhead find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>760 fn find_dfa_reverse_suffix( 761 &self, 762 text: &[u8], 763 start: usize, 764 ) -> dfa::Result<(usize, usize)> { 765 use dfa::Result::*; 766 767 let match_start = match self.exec_dfa_reverse_suffix(text, start) { 768 None => return self.find_dfa_forward(text, start), 769 Some(Match((start, _))) => start, 770 Some(r) => return r, 771 }; 772 // At this point, we've found a match. The only way to quit now 773 // without a match is if the DFA gives up (seems unlikely). 774 // 775 // Now run the DFA forwards to find the proper end of the match. 776 // (The suffix literal match can only indicate the earliest 777 // possible end location, which may appear before the end of the 778 // leftmost-first match.) 779 match dfa::Fsm::forward( 780 &self.ro.dfa, 781 &self.cache, 782 false, 783 text, 784 match_start, 785 ) { 786 NoMatch(_) => panic!("BUG: reverse match implies forward match"), 787 Quit => Quit, 788 Match(e) => Match((match_start, e)), 789 } 790 } 791 792 /// Executes the NFA engine to return whether there is a match or not. 793 /// 794 /// Ideally, we could use shortest_nfa(...).is_some() and get the same 795 /// performance characteristics, but regex sets don't have captures, which 796 /// shortest_nfa depends on. match_nfa( &self, text: &[u8], start: usize, ) -> bool797 fn match_nfa( 798 &self, 799 text: &[u8], 800 start: usize, 801 ) -> bool { 802 self.match_nfa_type(MatchNfaType::Auto, text, start) 803 } 804 805 /// Like match_nfa, but allows specification of the type of NFA engine. match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool806 fn match_nfa_type( 807 &self, 808 ty: MatchNfaType, 809 text: &[u8], 810 start: usize, 811 ) -> bool { 812 self.exec_nfa(ty, &mut [false], &mut [], true, text, start) 813 } 814 815 /// Finds the shortest match using an NFA. shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>816 fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> { 817 self.shortest_nfa_type(MatchNfaType::Auto, text, start) 818 } 819 820 /// Like shortest_nfa, but allows specification of the type of NFA engine. shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>821 fn shortest_nfa_type( 822 &self, 823 ty: MatchNfaType, 824 text: &[u8], 825 start: usize, 826 ) -> Option<usize> { 827 let mut slots = [None, None]; 828 if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) { 829 slots[1] 830 } else { 831 None 832 } 833 } 834 835 /// Like find, but executes an NFA engine. find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>836 fn find_nfa( 837 &self, 838 ty: MatchNfaType, 839 text: &[u8], 840 start: usize, 841 ) -> Option<(usize, usize)> { 842 let mut slots = [None, None]; 843 if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) { 844 match (slots[0], slots[1]) { 845 (Some(s), Some(e)) => Some((s, e)), 846 _ => None, 847 } 848 } else { 849 None 850 } 851 } 852 853 /// Like find_nfa, but fills in captures and restricts the search space 854 /// using previously found match information. 855 /// 856 /// `slots` should have length equal to `2 * nfa.captures.len()`. captures_nfa_with_match( &self, slots: &mut [Slot], text: &[u8], match_start: usize, match_end: usize, ) -> Option<(usize, usize)>857 fn captures_nfa_with_match( 858 &self, 859 slots: &mut [Slot], 860 text: &[u8], 861 match_start: usize, 862 match_end: usize, 863 ) -> Option<(usize, usize)> { 864 // We can't use match_end directly, because we may need to examine one 865 // "character" after the end of a match for lookahead operators. We 866 // need to move two characters beyond the end, since some look-around 867 // operations may falsely assume a premature end of text otherwise. 868 let e = cmp::min( 869 next_utf8(text, next_utf8(text, match_end)), text.len()); 870 self.captures_nfa(slots, &text[..e], match_start) 871 } 872 873 /// Like find_nfa, but fills in captures. 874 /// 875 /// `slots` should have length equal to `2 * nfa.captures.len()`. captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>876 fn captures_nfa( 877 &self, 878 slots: &mut [Slot], 879 text: &[u8], 880 start: usize, 881 ) -> Option<(usize, usize)> { 882 self.captures_nfa_type(MatchNfaType::Auto, slots, text, start) 883 } 884 885 /// Like captures_nfa, but allows specification of type of NFA engine. captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>886 fn captures_nfa_type( 887 &self, 888 ty: MatchNfaType, 889 slots: &mut [Slot], 890 text: &[u8], 891 start: usize, 892 ) -> Option<(usize, usize)> { 893 if self.exec_nfa(ty, &mut [false], slots, false, text, start) { 894 match (slots[0], slots[1]) { 895 (Some(s), Some(e)) => Some((s, e)), 896 _ => None, 897 } 898 } else { 899 None 900 } 901 } 902 exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, ) -> bool903 fn exec_nfa( 904 &self, 905 mut ty: MatchNfaType, 906 matches: &mut [bool], 907 slots: &mut [Slot], 908 quit_after_match: bool, 909 text: &[u8], 910 start: usize, 911 ) -> bool { 912 use self::MatchNfaType::*; 913 if let Auto = ty { 914 if backtrack::should_exec(self.ro.nfa.len(), text.len()) { 915 ty = Backtrack; 916 } else { 917 ty = PikeVM; 918 } 919 } 920 match ty { 921 Auto => unreachable!(), 922 Backtrack => self.exec_backtrack(matches, slots, text, start), 923 PikeVM => { 924 self.exec_pikevm( 925 matches, slots, quit_after_match, text, start) 926 } 927 } 928 } 929 930 /// Always run the NFA algorithm. exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, ) -> bool931 fn exec_pikevm( 932 &self, 933 matches: &mut [bool], 934 slots: &mut [Slot], 935 quit_after_match: bool, 936 text: &[u8], 937 start: usize, 938 ) -> bool { 939 if self.ro.nfa.uses_bytes() { 940 pikevm::Fsm::exec( 941 &self.ro.nfa, 942 &self.cache, 943 matches, 944 slots, 945 quit_after_match, 946 ByteInput::new(text, self.ro.nfa.only_utf8), 947 start) 948 } else { 949 pikevm::Fsm::exec( 950 &self.ro.nfa, 951 &self.cache, 952 matches, 953 slots, 954 quit_after_match, 955 CharInput::new(text), 956 start) 957 } 958 } 959 960 /// Always runs the NFA using bounded backtracking. exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, ) -> bool961 fn exec_backtrack( 962 &self, 963 matches: &mut [bool], 964 slots: &mut [Slot], 965 text: &[u8], 966 start: usize, 967 ) -> bool { 968 if self.ro.nfa.uses_bytes() { 969 backtrack::Bounded::exec( 970 &self.ro.nfa, 971 &self.cache, 972 matches, 973 slots, 974 ByteInput::new(text, self.ro.nfa.only_utf8), 975 start) 976 } else { 977 backtrack::Bounded::exec( 978 &self.ro.nfa, 979 &self.cache, 980 matches, 981 slots, 982 CharInput::new(text), 983 start) 984 } 985 } 986 987 /// Finds which regular expressions match the given text. 988 /// 989 /// `matches` should have length equal to the number of regexes being 990 /// searched. 991 /// 992 /// This is only useful when one wants to know which regexes in a set 993 /// match some text. many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool994 pub fn many_matches_at( 995 &self, 996 matches: &mut [bool], 997 text: &[u8], 998 start: usize, 999 ) -> bool { 1000 use self::MatchType::*; 1001 if !self.is_anchor_end_match(text) { 1002 return false; 1003 } 1004 match self.ro.match_type { 1005 Literal(ty) => { 1006 debug_assert!(matches.len() == 1); 1007 matches[0] = self.find_literals(ty, text, start).is_some(); 1008 matches[0] 1009 } 1010 Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => { 1011 match dfa::Fsm::forward_many( 1012 &self.ro.dfa, 1013 &self.cache, 1014 matches, 1015 text, 1016 start, 1017 ) { 1018 dfa::Result::Match(_) => true, 1019 dfa::Result::NoMatch(_) => false, 1020 dfa::Result::Quit => { 1021 self.exec_nfa( 1022 MatchNfaType::Auto, 1023 matches, 1024 &mut [], 1025 false, 1026 text, 1027 start) 1028 } 1029 } 1030 } 1031 Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start), 1032 Nothing => false, 1033 } 1034 } 1035 1036 #[inline(always)] // reduces constant overhead is_anchor_end_match(&self, text: &[u8]) -> bool1037 fn is_anchor_end_match(&self, text: &[u8]) -> bool { 1038 // Only do this check if the haystack is big (>1MB). 1039 if text.len() > (1<<20) && self.ro.nfa.is_anchored_end { 1040 let lcs = self.ro.suffixes.lcs(); 1041 if lcs.len() >= 1 && !lcs.is_suffix(text) { 1042 return false; 1043 } 1044 } 1045 true 1046 } 1047 capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1048 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1049 &self.ro.nfa.capture_name_idx 1050 } 1051 } 1052 1053 impl<'c> ExecNoSyncStr<'c> { capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1054 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1055 self.0.capture_name_idx() 1056 } 1057 } 1058 1059 impl Exec { 1060 /// Get a searcher that isn't Sync. 1061 #[inline(always)] // reduces constant overhead searcher(&self) -> ExecNoSync1062 pub fn searcher(&self) -> ExecNoSync { 1063 let create = || Box::new(RefCell::new(ProgramCacheInner::new(&self.ro))); 1064 ExecNoSync { 1065 ro: &self.ro, // a clone is too expensive here! (and not needed) 1066 cache: self.cache.get_or(create), 1067 } 1068 } 1069 1070 /// Get a searcher that isn't Sync and can match on &str. 1071 #[inline(always)] // reduces constant overhead searcher_str(&self) -> ExecNoSyncStr1072 pub fn searcher_str(&self) -> ExecNoSyncStr { 1073 ExecNoSyncStr(self.searcher()) 1074 } 1075 1076 /// Build a Regex from this executor. into_regex(self) -> re_unicode::Regex1077 pub fn into_regex(self) -> re_unicode::Regex { 1078 re_unicode::Regex::from(self) 1079 } 1080 1081 /// Build a RegexSet from this executor. into_regex_set(self) -> re_set::unicode::RegexSet1082 pub fn into_regex_set(self) -> re_set::unicode::RegexSet { 1083 re_set::unicode::RegexSet::from(self) 1084 } 1085 1086 /// Build a Regex from this executor that can match arbitrary bytes. into_byte_regex(self) -> re_bytes::Regex1087 pub fn into_byte_regex(self) -> re_bytes::Regex { 1088 re_bytes::Regex::from(self) 1089 } 1090 1091 /// Build a RegexSet from this executor that can match arbitrary bytes. into_byte_regex_set(self) -> re_set::bytes::RegexSet1092 pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { 1093 re_set::bytes::RegexSet::from(self) 1094 } 1095 1096 /// The original regular expressions given by the caller that were 1097 /// compiled. regex_strings(&self) -> &[String]1098 pub fn regex_strings(&self) -> &[String] { 1099 &self.ro.res 1100 } 1101 1102 /// Return a slice of capture names. 1103 /// 1104 /// Any capture that isn't named is None. capture_names(&self) -> &[Option<String>]1105 pub fn capture_names(&self) -> &[Option<String>] { 1106 &self.ro.nfa.captures 1107 } 1108 1109 /// Return a reference to named groups mapping (from group name to 1110 /// group position). capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1111 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1112 &self.ro.nfa.capture_name_idx 1113 } 1114 } 1115 1116 impl Clone for Exec { clone(&self) -> Exec1117 fn clone(&self) -> Exec { 1118 Exec { 1119 ro: self.ro.clone(), 1120 cache: CachedThreadLocal::new(), 1121 } 1122 } 1123 } 1124 1125 impl ExecReadOnly { choose_match_type(&self, hint: Option<MatchType>) -> MatchType1126 fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType { 1127 use self::MatchType::*; 1128 if let Some(Nfa(_)) = hint { 1129 return hint.unwrap(); 1130 } 1131 // If the NFA is empty, then we'll never match anything. 1132 if self.nfa.insts.is_empty() { 1133 return Nothing; 1134 } 1135 // If our set of prefixes is complete, then we can use it to find 1136 // a match in lieu of a regex engine. This doesn't quite work well in 1137 // the presence of multiple regexes, so only do it when there's one. 1138 // 1139 // TODO(burntsushi): Also, don't try to match literals if the regex is 1140 // partially anchored. We could technically do it, but we'd need to 1141 // create two sets of literals: all of them and then the subset that 1142 // aren't anchored. We would then only search for all of them when at 1143 // the beginning of the input and use the subset in all other cases. 1144 if self.res.len() == 1 { 1145 if self.nfa.prefixes.complete() { 1146 return if self.nfa.is_anchored_start { 1147 Literal(MatchLiteralType::AnchoredStart) 1148 } else { 1149 Literal(MatchLiteralType::Unanchored) 1150 }; 1151 } 1152 if self.suffixes.complete() { 1153 return if self.nfa.is_anchored_end { 1154 Literal(MatchLiteralType::AnchoredEnd) 1155 } else { 1156 // This case shouldn't happen. When the regex isn't 1157 // anchored, then complete prefixes should imply complete 1158 // suffixes. 1159 Literal(MatchLiteralType::Unanchored) 1160 }; 1161 } 1162 } 1163 // If we can execute the DFA, then we totally should. 1164 if dfa::can_exec(&self.dfa) { 1165 // Regex sets require a slightly specialized path. 1166 if self.res.len() >= 2 { 1167 return DfaMany; 1168 } 1169 // If the regex is anchored at the end but not the start, then 1170 // just match in reverse from the end of the haystack. 1171 if !self.nfa.is_anchored_start && self.nfa.is_anchored_end { 1172 return DfaAnchoredReverse; 1173 } 1174 // If there's a longish suffix literal, then it might be faster 1175 // to look for that first. 1176 if self.should_suffix_scan() { 1177 return DfaSuffix; 1178 } 1179 // Fall back to your garden variety forward searching lazy DFA. 1180 return Dfa; 1181 } 1182 // We're so totally hosed. 1183 Nfa(MatchNfaType::Auto) 1184 } 1185 1186 /// Returns true if the program is amenable to suffix scanning. 1187 /// 1188 /// When this is true, as a heuristic, we assume it is OK to quickly scan 1189 /// for suffix literals and then do a *reverse* DFA match from any matches 1190 /// produced by the literal scan. (And then followed by a forward DFA 1191 /// search, since the previously found suffix literal maybe not actually be 1192 /// the end of a match.) 1193 /// 1194 /// This is a bit of a specialized optimization, but can result in pretty 1195 /// big performance wins if 1) there are no prefix literals and 2) the 1196 /// suffix literals are pretty rare in the text. (1) is obviously easy to 1197 /// account for but (2) is harder. As a proxy, we assume that longer 1198 /// strings are generally rarer, so we only enable this optimization when 1199 /// we have a meaty suffix. should_suffix_scan(&self) -> bool1200 fn should_suffix_scan(&self) -> bool { 1201 if self.suffixes.is_empty() { 1202 return false; 1203 } 1204 let lcs_len = self.suffixes.lcs().char_len(); 1205 lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() 1206 } 1207 } 1208 1209 #[derive(Clone, Copy, Debug)] 1210 enum MatchType { 1211 /// A single or multiple literal search. This is only used when the regex 1212 /// can be decomposed into unambiguous literal search. 1213 Literal(MatchLiteralType), 1214 /// A normal DFA search. 1215 Dfa, 1216 /// A reverse DFA search starting from the end of a haystack. 1217 DfaAnchoredReverse, 1218 /// A reverse DFA search with suffix literal scanning. 1219 DfaSuffix, 1220 /// Use the DFA on two or more regular expressions. 1221 DfaMany, 1222 /// An NFA variant. 1223 Nfa(MatchNfaType), 1224 /// No match is ever possible, so don't ever try to search. 1225 Nothing, 1226 } 1227 1228 #[derive(Clone, Copy, Debug)] 1229 enum MatchLiteralType { 1230 /// Match literals anywhere in text. 1231 Unanchored, 1232 /// Match literals only at the start of text. 1233 AnchoredStart, 1234 /// Match literals only at the end of text. 1235 AnchoredEnd, 1236 } 1237 1238 #[derive(Clone, Copy, Debug)] 1239 enum MatchNfaType { 1240 /// Choose between Backtrack and PikeVM. 1241 Auto, 1242 /// NFA bounded backtracking. 1243 /// 1244 /// (This is only set by tests, since it never makes sense to always want 1245 /// backtracking.) 1246 Backtrack, 1247 /// The Pike VM. 1248 /// 1249 /// (This is only set by tests, since it never makes sense to always want 1250 /// the Pike VM.) 1251 PikeVM, 1252 } 1253 1254 /// ProgramCache maintains reusable allocations for each matching engine 1255 /// available to a particular program. 1256 pub type ProgramCache = RefCell<ProgramCacheInner>; 1257 1258 #[derive(Clone, Debug)] 1259 pub struct ProgramCacheInner { 1260 pub pikevm: pikevm::Cache, 1261 pub backtrack: backtrack::Cache, 1262 pub dfa: dfa::Cache, 1263 pub dfa_reverse: dfa::Cache, 1264 } 1265 1266 impl ProgramCacheInner { new(ro: &ExecReadOnly) -> Self1267 fn new(ro: &ExecReadOnly) -> Self { 1268 ProgramCacheInner { 1269 pikevm: pikevm::Cache::new(&ro.nfa), 1270 backtrack: backtrack::Cache::new(&ro.nfa), 1271 dfa: dfa::Cache::new(&ro.dfa), 1272 dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), 1273 } 1274 } 1275 } 1276