1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT 2 // file at the top-level directory of this distribution and at 3 // http://rust-lang.org/COPYRIGHT. 4 // 5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 8 // option. This file may not be copied, modified, or distributed 9 // except according to those terms. 10 11 use std::cell::RefCell; 12 use std::collections::HashMap; 13 use std::cmp; 14 use std::sync::Arc; 15 16 use thread_local::CachedThreadLocal; 17 use syntax::ParserBuilder; 18 use syntax::hir::Hir; 19 use syntax::hir::literal::Literals; 20 21 use backtrack; 22 use compile::Compiler; 23 use dfa; 24 use error::Error; 25 use input::{ByteInput, CharInput}; 26 use literal::LiteralSearcher; 27 use pikevm; 28 use prog::Program; 29 use re_builder::RegexOptions; 30 use re_bytes; 31 use re_set; 32 use re_trait::{RegularExpression, Slot, Locations, as_slots}; 33 use re_unicode; 34 use utf8::next_utf8; 35 36 /// `Exec` manages the execution of a regular expression. 37 /// 38 /// In particular, this manages the various compiled forms of a single regular 39 /// expression and the choice of which matching engine to use to execute a 40 /// regular expression. 41 pub struct Exec { 42 /// All read only state. 43 ro: Arc<ExecReadOnly>, 44 /// Caches for the various matching engines. 45 cache: CachedThreadLocal<ProgramCache>, 46 } 47 48 /// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This 49 /// means it is no longer Sync, but we can now avoid the overhead of 50 /// synchronization to fetch the cache. 51 #[derive(Debug)] 52 pub struct ExecNoSync<'c> { 53 /// All read only state. 54 ro: &'c Arc<ExecReadOnly>, 55 /// Caches for the various matching engines. 56 cache: &'c ProgramCache, 57 } 58 59 /// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. 60 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); 61 62 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such 63 /// state is determined at compile time and never changes during search. 64 #[derive(Debug)] 65 struct ExecReadOnly { 66 /// The original regular expressions given by the caller to compile. 67 res: Vec<String>, 68 /// A compiled program that is used in the NFA simulation and backtracking. 69 /// It can be byte-based or Unicode codepoint based. 70 /// 71 /// N.B. It is not possibly to make this byte-based from the public API. 72 /// It is only used for testing byte based programs in the NFA simulations. 73 nfa: Program, 74 /// A compiled byte based program for DFA execution. This is only used 75 /// if a DFA can be executed. (Currently, only word boundary assertions are 76 /// not supported.) Note that this program contains an embedded `.*?` 77 /// preceding the first capture group, unless the regex is anchored at the 78 /// beginning. 79 dfa: Program, 80 /// The same as above, except the program is reversed (and there is no 81 /// preceding `.*?`). This is used by the DFA to find the starting location 82 /// of matches. 83 dfa_reverse: Program, 84 /// A set of suffix literals extracted from the regex. 85 /// 86 /// Prefix literals are stored on the `Program`, since they are used inside 87 /// the matching engines. 88 suffixes: LiteralSearcher, 89 /// match_type encodes as much upfront knowledge about how we're going to 90 /// execute a search as possible. 91 match_type: MatchType, 92 } 93 94 /// Facilitates the construction of an executor by exposing various knobs 95 /// to control how a regex is executed and what kinds of resources it's 96 /// permitted to use. 97 pub struct ExecBuilder { 98 options: RegexOptions, 99 match_type: Option<MatchType>, 100 bytes: bool, 101 only_utf8: bool, 102 } 103 104 /// Parsed represents a set of parsed regular expressions and their detected 105 /// literals. 106 struct Parsed { 107 exprs: Vec<Hir>, 108 prefixes: Literals, 109 suffixes: Literals, 110 bytes: bool, 111 } 112 113 impl ExecBuilder { 114 /// Create a regex execution builder. 115 /// 116 /// This uses default settings for everything except the regex itself, 117 /// which must be provided. Further knobs can be set by calling methods, 118 /// and then finally, `build` to actually create the executor. new(re: &str) -> Self119 pub fn new(re: &str) -> Self { 120 Self::new_many(&[re]) 121 } 122 123 /// Like new, but compiles the union of the given regular expressions. 124 /// 125 /// Note that when compiling 2 or more regular expressions, capture groups 126 /// are completely unsupported. (This means both `find` and `captures` 127 /// wont work.) new_many<I, S>(res: I) -> Self where S: AsRef<str>, I: IntoIterator<Item=S>128 pub fn new_many<I, S>(res: I) -> Self 129 where S: AsRef<str>, I: IntoIterator<Item=S> { 130 let mut opts = RegexOptions::default(); 131 opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); 132 Self::new_options(opts) 133 } 134 135 /// Create a regex execution builder. new_options(opts: RegexOptions) -> Self136 pub fn new_options(opts: RegexOptions) -> Self { 137 ExecBuilder { 138 options: opts, 139 match_type: None, 140 bytes: false, 141 only_utf8: true, 142 } 143 } 144 145 /// Set the matching engine to be automatically determined. 146 /// 147 /// This is the default state and will apply whatever optimizations are 148 /// possible, such as running a DFA. 149 /// 150 /// This overrides whatever was previously set via the `nfa` or 151 /// `bounded_backtracking` methods. automatic(mut self) -> Self152 pub fn automatic(mut self) -> Self { 153 self.match_type = None; 154 self 155 } 156 157 /// Sets the matching engine to use the NFA algorithm no matter what 158 /// optimizations are possible. 159 /// 160 /// This overrides whatever was previously set via the `automatic` or 161 /// `bounded_backtracking` methods. nfa(mut self) -> Self162 pub fn nfa(mut self) -> Self { 163 self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); 164 self 165 } 166 167 /// Sets the matching engine to use a bounded backtracking engine no 168 /// matter what optimizations are possible. 169 /// 170 /// One must use this with care, since the bounded backtracking engine 171 /// uses memory proportion to `len(regex) * len(text)`. 172 /// 173 /// This overrides whatever was previously set via the `automatic` or 174 /// `nfa` methods. bounded_backtracking(mut self) -> Self175 pub fn bounded_backtracking(mut self) -> Self { 176 self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); 177 self 178 } 179 180 /// Compiles byte based programs for use with the NFA matching engines. 181 /// 182 /// By default, the NFA engines match on Unicode scalar values. They can 183 /// be made to use byte based programs instead. In general, the byte based 184 /// programs are slower because of a less efficient encoding of character 185 /// classes. 186 /// 187 /// Note that this does not impact DFA matching engines, which always 188 /// execute on bytes. bytes(mut self, yes: bool) -> Self189 pub fn bytes(mut self, yes: bool) -> Self { 190 self.bytes = yes; 191 self 192 } 193 194 /// When disabled, the program compiled may match arbitrary bytes. 195 /// 196 /// When enabled (the default), all compiled programs exclusively match 197 /// valid UTF-8 bytes. only_utf8(mut self, yes: bool) -> Self198 pub fn only_utf8(mut self, yes: bool) -> Self { 199 self.only_utf8 = yes; 200 self 201 } 202 203 /// Set the Unicode flag. unicode(mut self, yes: bool) -> Self204 pub fn unicode(mut self, yes: bool) -> Self { 205 self.options.unicode = yes; 206 self 207 } 208 209 /// Parse the current set of patterns into their AST and extract literals. parse(&self) -> Result<Parsed, Error>210 fn parse(&self) -> Result<Parsed, Error> { 211 let mut exprs = Vec::with_capacity(self.options.pats.len()); 212 let mut prefixes = Some(Literals::empty()); 213 let mut suffixes = Some(Literals::empty()); 214 let mut bytes = false; 215 let is_set = self.options.pats.len() > 1; 216 // If we're compiling a regex set and that set has any anchored 217 // expressions, then disable all literal optimizations. 218 for pat in &self.options.pats { 219 let mut parser = 220 ParserBuilder::new() 221 .octal(self.options.octal) 222 .case_insensitive(self.options.case_insensitive) 223 .multi_line(self.options.multi_line) 224 .dot_matches_new_line(self.options.dot_matches_new_line) 225 .swap_greed(self.options.swap_greed) 226 .ignore_whitespace(self.options.ignore_whitespace) 227 .unicode(self.options.unicode) 228 .allow_invalid_utf8(!self.only_utf8) 229 .nest_limit(self.options.nest_limit) 230 .build(); 231 let expr = parser 232 .parse(pat) 233 .map_err(|e| Error::Syntax(e.to_string()))?; 234 bytes = bytes || !expr.is_always_utf8(); 235 236 if !expr.is_anchored_start() && expr.is_any_anchored_start() { 237 // Partial anchors unfortunately make it hard to use prefixes, 238 // so disable them. 239 prefixes = None; 240 } else if is_set && expr.is_anchored_start() { 241 // Regex sets with anchors do not go well with literal 242 // optimizations. 243 prefixes = None; 244 } 245 prefixes = prefixes.and_then(|mut prefixes| { 246 if !prefixes.union_prefixes(&expr) { 247 None 248 } else { 249 Some(prefixes) 250 } 251 }); 252 253 if !expr.is_anchored_end() && expr.is_any_anchored_end() { 254 // Partial anchors unfortunately make it hard to use suffixes, 255 // so disable them. 256 suffixes = None; 257 } else if is_set && expr.is_anchored_end() { 258 // Regex sets with anchors do not go well with literal 259 // optimizations. 260 suffixes = None; 261 } 262 suffixes = suffixes.and_then(|mut suffixes| { 263 if !suffixes.union_suffixes(&expr) { 264 None 265 } else { 266 Some(suffixes) 267 } 268 }); 269 exprs.push(expr); 270 } 271 Ok(Parsed { 272 exprs: exprs, 273 prefixes: prefixes.unwrap_or_else(Literals::empty), 274 suffixes: suffixes.unwrap_or_else(Literals::empty), 275 bytes: bytes, 276 }) 277 } 278 279 /// Build an executor that can run a regular expression. build(self) -> Result<Exec, Error>280 pub fn build(self) -> Result<Exec, Error> { 281 // Special case when we have no patterns to compile. 282 // This can happen when compiling a regex set. 283 if self.options.pats.is_empty() { 284 let ro = Arc::new(ExecReadOnly { 285 res: vec![], 286 nfa: Program::new(), 287 dfa: Program::new(), 288 dfa_reverse: Program::new(), 289 suffixes: LiteralSearcher::empty(), 290 match_type: MatchType::Nothing, 291 }); 292 return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() }); 293 } 294 let parsed = self.parse()?; 295 let mut nfa = 296 Compiler::new() 297 .size_limit(self.options.size_limit) 298 .bytes(self.bytes || parsed.bytes) 299 .only_utf8(self.only_utf8) 300 .compile(&parsed.exprs)?; 301 let mut dfa = 302 Compiler::new() 303 .size_limit(self.options.size_limit) 304 .dfa(true) 305 .only_utf8(self.only_utf8) 306 .compile(&parsed.exprs)?; 307 let mut dfa_reverse = 308 Compiler::new() 309 .size_limit(self.options.size_limit) 310 .dfa(true) 311 .only_utf8(self.only_utf8) 312 .reverse(true) 313 .compile(&parsed.exprs)?; 314 315 let prefixes = parsed.prefixes.unambiguous_prefixes(); 316 let suffixes = parsed.suffixes.unambiguous_suffixes(); 317 nfa.prefixes = LiteralSearcher::prefixes(prefixes); 318 dfa.prefixes = nfa.prefixes.clone(); 319 dfa.dfa_size_limit = self.options.dfa_size_limit; 320 dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; 321 322 let mut ro = ExecReadOnly { 323 res: self.options.pats, 324 nfa: nfa, 325 dfa: dfa, 326 dfa_reverse: dfa_reverse, 327 suffixes: LiteralSearcher::suffixes(suffixes), 328 match_type: MatchType::Nothing, 329 }; 330 ro.match_type = ro.choose_match_type(self.match_type); 331 332 let ro = Arc::new(ro); 333 Ok(Exec { ro: ro, cache: CachedThreadLocal::new() }) 334 } 335 } 336 337 impl<'c> RegularExpression for ExecNoSyncStr<'c> { 338 type Text = str; 339 slots_len(&self) -> usize340 fn slots_len(&self) -> usize { self.0.slots_len() } 341 next_after_empty(&self, text: &str, i: usize) -> usize342 fn next_after_empty(&self, text: &str, i: usize) -> usize { 343 next_utf8(text.as_bytes(), i) 344 } 345 346 #[inline(always)] // reduces constant overhead shortest_match_at(&self, text: &str, start: usize) -> Option<usize>347 fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> { 348 self.0.shortest_match_at(text.as_bytes(), start) 349 } 350 351 #[inline(always)] // reduces constant overhead is_match_at(&self, text: &str, start: usize) -> bool352 fn is_match_at(&self, text: &str, start: usize) -> bool { 353 self.0.is_match_at(text.as_bytes(), start) 354 } 355 356 #[inline(always)] // reduces constant overhead find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>357 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { 358 self.0.find_at(text.as_bytes(), start) 359 } 360 361 #[inline(always)] // reduces constant overhead read_captures_at( &self, locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)>362 fn read_captures_at( 363 &self, 364 locs: &mut Locations, 365 text: &str, 366 start: usize, 367 ) -> Option<(usize, usize)> { 368 self.0.read_captures_at(locs, text.as_bytes(), start) 369 } 370 } 371 372 impl<'c> RegularExpression for ExecNoSync<'c> { 373 type Text = [u8]; 374 375 /// Returns the number of capture slots in the regular expression. (There 376 /// are two slots for every capture group, corresponding to possibly empty 377 /// start and end locations of the capture.) slots_len(&self) -> usize378 fn slots_len(&self) -> usize { 379 self.ro.nfa.captures.len() * 2 380 } 381 next_after_empty(&self, _text: &[u8], i: usize) -> usize382 fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { 383 i + 1 384 } 385 386 /// Returns the end of a match location, possibly occurring before the 387 /// end location of the correct leftmost-first match. 388 #[inline(always)] // reduces constant overhead shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize>389 fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> { 390 if !self.is_anchor_end_match(text) { 391 return None; 392 } 393 match self.ro.match_type { 394 MatchType::Literal(ty) => { 395 self.find_literals(ty, text, start).map(|(_, e)| e) 396 } 397 MatchType::Dfa | MatchType::DfaMany => { 398 match self.shortest_dfa(text, start) { 399 dfa::Result::Match(end) => Some(end), 400 dfa::Result::NoMatch(_) => None, 401 dfa::Result::Quit => self.shortest_nfa(text, start), 402 } 403 } 404 MatchType::DfaAnchoredReverse => { 405 match dfa::Fsm::reverse( 406 &self.ro.dfa_reverse, 407 self.cache, 408 true, 409 &text[start..], 410 text.len(), 411 ) { 412 dfa::Result::Match(_) => Some(text.len()), 413 dfa::Result::NoMatch(_) => None, 414 dfa::Result::Quit => self.shortest_nfa(text, start), 415 } 416 } 417 MatchType::DfaSuffix => { 418 match self.shortest_dfa_reverse_suffix(text, start) { 419 dfa::Result::Match(e) => Some(e), 420 dfa::Result::NoMatch(_) => None, 421 dfa::Result::Quit => self.shortest_nfa(text, start), 422 } 423 } 424 MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), 425 MatchType::Nothing => None, 426 } 427 } 428 429 /// Returns true if and only if the regex matches text. 430 /// 431 /// For single regular expressions, this is equivalent to calling 432 /// shortest_match(...).is_some(). 433 #[inline(always)] // reduces constant overhead is_match_at(&self, text: &[u8], start: usize) -> bool434 fn is_match_at(&self, text: &[u8], start: usize) -> bool { 435 if !self.is_anchor_end_match(text) { 436 return false; 437 } 438 // We need to do this dance because shortest_match relies on the NFA 439 // filling in captures[1], but a RegexSet has no captures. In other 440 // words, a RegexSet can't (currently) use shortest_match. ---AG 441 match self.ro.match_type { 442 MatchType::Literal(ty) => { 443 self.find_literals(ty, text, start).is_some() 444 } 445 MatchType::Dfa | MatchType::DfaMany => { 446 match self.shortest_dfa(text, start) { 447 dfa::Result::Match(_) => true, 448 dfa::Result::NoMatch(_) => false, 449 dfa::Result::Quit => self.match_nfa(text, start), 450 } 451 } 452 MatchType::DfaAnchoredReverse => { 453 match dfa::Fsm::reverse( 454 &self.ro.dfa_reverse, 455 self.cache, 456 true, 457 &text[start..], 458 text.len(), 459 ) { 460 dfa::Result::Match(_) => true, 461 dfa::Result::NoMatch(_) => false, 462 dfa::Result::Quit => self.match_nfa(text, start), 463 } 464 } 465 MatchType::DfaSuffix => { 466 match self.shortest_dfa_reverse_suffix(text, start) { 467 dfa::Result::Match(_) => true, 468 dfa::Result::NoMatch(_) => false, 469 dfa::Result::Quit => self.match_nfa(text, start), 470 } 471 } 472 MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), 473 MatchType::Nothing => false, 474 } 475 } 476 477 /// Finds the start and end location of the leftmost-first match, starting 478 /// at the given location. 479 #[inline(always)] // reduces constant overhead find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)>480 fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { 481 if !self.is_anchor_end_match(text) { 482 return None; 483 } 484 match self.ro.match_type { 485 MatchType::Literal(ty) => { 486 self.find_literals(ty, text, start) 487 } 488 MatchType::Dfa => { 489 match self.find_dfa_forward(text, start) { 490 dfa::Result::Match((s, e)) => Some((s, e)), 491 dfa::Result::NoMatch(_) => None, 492 dfa::Result::Quit => { 493 self.find_nfa(MatchNfaType::Auto, text, start) 494 } 495 } 496 } 497 MatchType::DfaAnchoredReverse => { 498 match self.find_dfa_anchored_reverse(text, start) { 499 dfa::Result::Match((s, e)) => Some((s, e)), 500 dfa::Result::NoMatch(_) => None, 501 dfa::Result::Quit => { 502 self.find_nfa(MatchNfaType::Auto, text, start) 503 } 504 } 505 } 506 MatchType::DfaSuffix => { 507 match self.find_dfa_reverse_suffix(text, start) { 508 dfa::Result::Match((s, e)) => Some((s, e)), 509 dfa::Result::NoMatch(_) => None, 510 dfa::Result::Quit => { 511 self.find_nfa(MatchNfaType::Auto, text, start) 512 } 513 } 514 } 515 MatchType::Nfa(ty) => self.find_nfa(ty, text, start), 516 MatchType::Nothing => None, 517 MatchType::DfaMany => { 518 unreachable!("BUG: RegexSet cannot be used with find") 519 } 520 } 521 } 522 523 /// Finds the start and end location of the leftmost-first match and also 524 /// fills in all matching capture groups. 525 /// 526 /// The number of capture slots given should be equal to the total number 527 /// of capture slots in the compiled program. 528 /// 529 /// Note that the first two slots always correspond to the start and end 530 /// locations of the overall match. read_captures_at( &self, locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)>531 fn read_captures_at( 532 &self, 533 locs: &mut Locations, 534 text: &[u8], 535 start: usize, 536 ) -> Option<(usize, usize)> { 537 let slots = as_slots(locs); 538 for slot in slots.iter_mut() { 539 *slot = None; 540 } 541 // If the caller unnecessarily uses this, then we try to save them 542 // from themselves. 543 match slots.len() { 544 0 => return self.find_at(text, start), 545 2 => { 546 return self.find_at(text, start).map(|(s, e)| { 547 slots[0] = Some(s); 548 slots[1] = Some(e); 549 (s, e) 550 }); 551 } 552 _ => {} // fallthrough 553 } 554 if !self.is_anchor_end_match(text) { 555 return None; 556 } 557 match self.ro.match_type { 558 MatchType::Literal(ty) => { 559 self.find_literals(ty, text, start).and_then(|(s, e)| { 560 self.captures_nfa_with_match(slots, text, s, e) 561 }) 562 } 563 MatchType::Dfa => { 564 if self.ro.nfa.is_anchored_start { 565 self.captures_nfa(slots, text, start) 566 } else { 567 match self.find_dfa_forward(text, start) { 568 dfa::Result::Match((s, e)) => { 569 self.captures_nfa_with_match(slots, text, s, e) 570 } 571 dfa::Result::NoMatch(_) => None, 572 dfa::Result::Quit => self.captures_nfa(slots, text, start), 573 } 574 } 575 } 576 MatchType::DfaAnchoredReverse => { 577 match self.find_dfa_anchored_reverse(text, start) { 578 dfa::Result::Match((s, e)) => { 579 self.captures_nfa_with_match(slots, text, s, e) 580 } 581 dfa::Result::NoMatch(_) => None, 582 dfa::Result::Quit => self.captures_nfa(slots, text, start), 583 } 584 } 585 MatchType::DfaSuffix => { 586 match self.find_dfa_reverse_suffix(text, start) { 587 dfa::Result::Match((s, e)) => { 588 self.captures_nfa_with_match(slots, text, s, e) 589 } 590 dfa::Result::NoMatch(_) => None, 591 dfa::Result::Quit => self.captures_nfa(slots, text, start), 592 } 593 } 594 MatchType::Nfa(ty) => { 595 self.captures_nfa_type(ty, slots, text, start) 596 } 597 MatchType::Nothing => None, 598 MatchType::DfaMany => { 599 unreachable!("BUG: RegexSet cannot be used with captures") 600 } 601 } 602 } 603 } 604 605 impl<'c> ExecNoSync<'c> { 606 /// Finds the leftmost-first match using only literal search. 607 #[inline(always)] // reduces constant overhead find_literals( &self, ty: MatchLiteralType, text: &[u8], start: usize, ) -> Option<(usize, usize)>608 fn find_literals( 609 &self, 610 ty: MatchLiteralType, 611 text: &[u8], 612 start: usize, 613 ) -> Option<(usize, usize)> { 614 use self::MatchLiteralType::*; 615 match ty { 616 Unanchored => { 617 let lits = &self.ro.nfa.prefixes; 618 lits.find(&text[start..]) 619 .map(|(s, e)| (start + s, start + e)) 620 } 621 AnchoredStart => { 622 let lits = &self.ro.nfa.prefixes; 623 if !self.ro.nfa.is_anchored_start 624 || (self.ro.nfa.is_anchored_start && start == 0) { 625 lits.find_start(&text[start..]) 626 .map(|(s, e)| (start + s, start + e)) 627 } else { 628 None 629 } 630 } 631 AnchoredEnd => { 632 let lits = &self.ro.suffixes; 633 lits.find_end(&text[start..]) 634 .map(|(s, e)| (start + s, start + e)) 635 } 636 } 637 } 638 639 /// Finds the leftmost-first match (start and end) using only the DFA. 640 /// 641 /// If the result returned indicates that the DFA quit, then another 642 /// matching engine should be used. 643 #[inline(always)] // reduces constant overhead find_dfa_forward( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>644 fn find_dfa_forward( 645 &self, 646 text: &[u8], 647 start: usize, 648 ) -> dfa::Result<(usize, usize)> { 649 use dfa::Result::*; 650 let end = match dfa::Fsm::forward( 651 &self.ro.dfa, 652 self.cache, 653 false, 654 text, 655 start, 656 ) { 657 NoMatch(i) => return NoMatch(i), 658 Quit => return Quit, 659 Match(end) if start == end => return Match((start, start)), 660 Match(end) => end, 661 }; 662 // Now run the DFA in reverse to find the start of the match. 663 match dfa::Fsm::reverse( 664 &self.ro.dfa_reverse, 665 self.cache, 666 false, 667 &text[start..], 668 end - start, 669 ) { 670 Match(s) => Match((start + s, end)), 671 NoMatch(i) => NoMatch(i), 672 Quit => Quit, 673 } 674 } 675 676 /// Finds the leftmost-first match (start and end) using only the DFA, 677 /// but assumes the regex is anchored at the end and therefore starts at 678 /// the end of the regex and matches in reverse. 679 /// 680 /// If the result returned indicates that the DFA quit, then another 681 /// matching engine should be used. 682 #[inline(always)] // reduces constant overhead find_dfa_anchored_reverse( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>683 fn find_dfa_anchored_reverse( 684 &self, 685 text: &[u8], 686 start: usize, 687 ) -> dfa::Result<(usize, usize)> { 688 use dfa::Result::*; 689 match dfa::Fsm::reverse( 690 &self.ro.dfa_reverse, 691 self.cache, 692 false, 693 &text[start..], 694 text.len() - start, 695 ) { 696 Match(s) => Match((start + s, text.len())), 697 NoMatch(i) => NoMatch(i), 698 Quit => Quit, 699 } 700 } 701 702 /// Finds the end of the shortest match using only the DFA. 703 #[inline(always)] // reduces constant overhead shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize>704 fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> { 705 dfa::Fsm::forward(&self.ro.dfa, self.cache, true, text, start) 706 } 707 708 /// Finds the end of the shortest match using only the DFA by scanning for 709 /// suffix literals. 710 /// 711 #[inline(always)] // reduces constant overhead shortest_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<usize>712 fn shortest_dfa_reverse_suffix( 713 &self, 714 text: &[u8], 715 start: usize, 716 ) -> dfa::Result<usize> { 717 match self.exec_dfa_reverse_suffix(text, start) { 718 None => self.shortest_dfa(text, start), 719 Some(r) => r.map(|(_, end)| end), 720 } 721 } 722 723 /// Finds the end of the shortest match using only the DFA by scanning for 724 /// suffix literals. It also reports the start of the match. 725 /// 726 /// Note that if None is returned, then the optimization gave up to avoid 727 /// worst case quadratic behavior. A forward scanning DFA should be tried 728 /// next. 729 /// 730 /// If a match is returned and the full leftmost-first match is desired, 731 /// then a forward scan starting from the beginning of the match must be 732 /// done. 733 /// 734 /// If the result returned indicates that the DFA quit, then another 735 /// matching engine should be used. 736 #[inline(always)] // reduces constant overhead exec_dfa_reverse_suffix( &self, text: &[u8], original_start: usize, ) -> Option<dfa::Result<(usize, usize)>>737 fn exec_dfa_reverse_suffix( 738 &self, 739 text: &[u8], 740 original_start: usize, 741 ) -> Option<dfa::Result<(usize, usize)>> { 742 use dfa::Result::*; 743 744 let lcs = self.ro.suffixes.lcs(); 745 debug_assert!(lcs.len() >= 1); 746 let mut start = original_start; 747 let mut end = start; 748 while end <= text.len() { 749 start = end; 750 end += match lcs.find(&text[end..]) { 751 None => return Some(NoMatch(text.len())), 752 Some(start) => start + lcs.len(), 753 }; 754 match dfa::Fsm::reverse( 755 &self.ro.dfa_reverse, 756 self.cache, 757 false, 758 &text[start..end], 759 end - start, 760 ) { 761 Match(0) | NoMatch(0) => return None, 762 Match(s) => return Some(Match((s + start, end))), 763 NoMatch(_) => continue, 764 Quit => return Some(Quit), 765 }; 766 } 767 Some(NoMatch(text.len())) 768 } 769 770 /// Finds the leftmost-first match (start and end) using only the DFA 771 /// by scanning for suffix literals. 772 /// 773 /// If the result returned indicates that the DFA quit, then another 774 /// matching engine should be used. 775 #[inline(always)] // reduces constant overhead find_dfa_reverse_suffix( &self, text: &[u8], start: usize, ) -> dfa::Result<(usize, usize)>776 fn find_dfa_reverse_suffix( 777 &self, 778 text: &[u8], 779 start: usize, 780 ) -> dfa::Result<(usize, usize)> { 781 use dfa::Result::*; 782 783 let match_start = match self.exec_dfa_reverse_suffix(text, start) { 784 None => return self.find_dfa_forward(text, start), 785 Some(Match((start, _))) => start, 786 Some(r) => return r, 787 }; 788 // At this point, we've found a match. The only way to quit now 789 // without a match is if the DFA gives up (seems unlikely). 790 // 791 // Now run the DFA forwards to find the proper end of the match. 792 // (The suffix literal match can only indicate the earliest 793 // possible end location, which may appear before the end of the 794 // leftmost-first match.) 795 match dfa::Fsm::forward( 796 &self.ro.dfa, 797 self.cache, 798 false, 799 text, 800 match_start, 801 ) { 802 NoMatch(_) => panic!("BUG: reverse match implies forward match"), 803 Quit => Quit, 804 Match(e) => Match((match_start, e)), 805 } 806 } 807 808 /// Executes the NFA engine to return whether there is a match or not. 809 /// 810 /// Ideally, we could use shortest_nfa(...).is_some() and get the same 811 /// performance characteristics, but regex sets don't have captures, which 812 /// shortest_nfa depends on. match_nfa( &self, text: &[u8], start: usize, ) -> bool813 fn match_nfa( 814 &self, 815 text: &[u8], 816 start: usize, 817 ) -> bool { 818 self.match_nfa_type(MatchNfaType::Auto, text, start) 819 } 820 821 /// Like match_nfa, but allows specification of the type of NFA engine. match_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> bool822 fn match_nfa_type( 823 &self, 824 ty: MatchNfaType, 825 text: &[u8], 826 start: usize, 827 ) -> bool { 828 self.exec_nfa(ty, &mut [false], &mut [], true, text, start) 829 } 830 831 /// Finds the shortest match using an NFA. shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize>832 fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> { 833 self.shortest_nfa_type(MatchNfaType::Auto, text, start) 834 } 835 836 /// Like shortest_nfa, but allows specification of the type of NFA engine. shortest_nfa_type( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<usize>837 fn shortest_nfa_type( 838 &self, 839 ty: MatchNfaType, 840 text: &[u8], 841 start: usize, 842 ) -> Option<usize> { 843 let mut slots = [None, None]; 844 if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) { 845 slots[1] 846 } else { 847 None 848 } 849 } 850 851 /// Like find, but executes an NFA engine. find_nfa( &self, ty: MatchNfaType, text: &[u8], start: usize, ) -> Option<(usize, usize)>852 fn find_nfa( 853 &self, 854 ty: MatchNfaType, 855 text: &[u8], 856 start: usize, 857 ) -> Option<(usize, usize)> { 858 let mut slots = [None, None]; 859 if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) { 860 match (slots[0], slots[1]) { 861 (Some(s), Some(e)) => Some((s, e)), 862 _ => None, 863 } 864 } else { 865 None 866 } 867 } 868 869 /// Like find_nfa, but fills in captures and restricts the search space 870 /// using previously found match information. 871 /// 872 /// `slots` should have length equal to `2 * nfa.captures.len()`. captures_nfa_with_match( &self, slots: &mut [Slot], text: &[u8], match_start: usize, match_end: usize, ) -> Option<(usize, usize)>873 fn captures_nfa_with_match( 874 &self, 875 slots: &mut [Slot], 876 text: &[u8], 877 match_start: usize, 878 match_end: usize, 879 ) -> Option<(usize, usize)> { 880 // We can't use match_end directly, because we may need to examine one 881 // "character" after the end of a match for lookahead operators. We 882 // need to move two characters beyond the end, since some look-around 883 // operations may falsely assume a premature end of text otherwise. 884 let e = cmp::min( 885 next_utf8(text, next_utf8(text, match_end)), text.len()); 886 self.captures_nfa(slots, &text[..e], match_start) 887 } 888 889 /// Like find_nfa, but fills in captures. 890 /// 891 /// `slots` should have length equal to `2 * nfa.captures.len()`. captures_nfa( &self, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>892 fn captures_nfa( 893 &self, 894 slots: &mut [Slot], 895 text: &[u8], 896 start: usize, 897 ) -> Option<(usize, usize)> { 898 self.captures_nfa_type(MatchNfaType::Auto, slots, text, start) 899 } 900 901 /// Like captures_nfa, but allows specification of type of NFA engine. captures_nfa_type( &self, ty: MatchNfaType, slots: &mut [Slot], text: &[u8], start: usize, ) -> Option<(usize, usize)>902 fn captures_nfa_type( 903 &self, 904 ty: MatchNfaType, 905 slots: &mut [Slot], 906 text: &[u8], 907 start: usize, 908 ) -> Option<(usize, usize)> { 909 if self.exec_nfa(ty, &mut [false], slots, false, text, start) { 910 match (slots[0], slots[1]) { 911 (Some(s), Some(e)) => Some((s, e)), 912 _ => None, 913 } 914 } else { 915 None 916 } 917 } 918 exec_nfa( &self, mut ty: MatchNfaType, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, ) -> bool919 fn exec_nfa( 920 &self, 921 mut ty: MatchNfaType, 922 matches: &mut [bool], 923 slots: &mut [Slot], 924 quit_after_match: bool, 925 text: &[u8], 926 start: usize, 927 ) -> bool { 928 use self::MatchNfaType::*; 929 if let Auto = ty { 930 if backtrack::should_exec(self.ro.nfa.len(), text.len()) { 931 ty = Backtrack; 932 } else { 933 ty = PikeVM; 934 } 935 } 936 match ty { 937 Auto => unreachable!(), 938 Backtrack => self.exec_backtrack(matches, slots, text, start), 939 PikeVM => { 940 self.exec_pikevm( 941 matches, slots, quit_after_match, text, start) 942 } 943 } 944 } 945 946 /// Always run the NFA algorithm. exec_pikevm( &self, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, text: &[u8], start: usize, ) -> bool947 fn exec_pikevm( 948 &self, 949 matches: &mut [bool], 950 slots: &mut [Slot], 951 quit_after_match: bool, 952 text: &[u8], 953 start: usize, 954 ) -> bool { 955 if self.ro.nfa.uses_bytes() { 956 pikevm::Fsm::exec( 957 &self.ro.nfa, 958 self.cache, 959 matches, 960 slots, 961 quit_after_match, 962 ByteInput::new(text, self.ro.nfa.only_utf8), 963 start) 964 } else { 965 pikevm::Fsm::exec( 966 &self.ro.nfa, 967 self.cache, 968 matches, 969 slots, 970 quit_after_match, 971 CharInput::new(text), 972 start) 973 } 974 } 975 976 /// Always runs the NFA using bounded backtracking. exec_backtrack( &self, matches: &mut [bool], slots: &mut [Slot], text: &[u8], start: usize, ) -> bool977 fn exec_backtrack( 978 &self, 979 matches: &mut [bool], 980 slots: &mut [Slot], 981 text: &[u8], 982 start: usize, 983 ) -> bool { 984 if self.ro.nfa.uses_bytes() { 985 backtrack::Bounded::exec( 986 &self.ro.nfa, 987 self.cache, 988 matches, 989 slots, 990 ByteInput::new(text, self.ro.nfa.only_utf8), 991 start) 992 } else { 993 backtrack::Bounded::exec( 994 &self.ro.nfa, 995 self.cache, 996 matches, 997 slots, 998 CharInput::new(text), 999 start) 1000 } 1001 } 1002 1003 /// Finds which regular expressions match the given text. 1004 /// 1005 /// `matches` should have length equal to the number of regexes being 1006 /// searched. 1007 /// 1008 /// This is only useful when one wants to know which regexes in a set 1009 /// match some text. many_matches_at( &self, matches: &mut [bool], text: &[u8], start: usize, ) -> bool1010 pub fn many_matches_at( 1011 &self, 1012 matches: &mut [bool], 1013 text: &[u8], 1014 start: usize, 1015 ) -> bool { 1016 use self::MatchType::*; 1017 if !self.is_anchor_end_match(text) { 1018 return false; 1019 } 1020 match self.ro.match_type { 1021 Literal(ty) => { 1022 debug_assert_eq!(matches.len(), 1); 1023 matches[0] = self.find_literals(ty, text, start).is_some(); 1024 matches[0] 1025 } 1026 Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => { 1027 match dfa::Fsm::forward_many( 1028 &self.ro.dfa, 1029 self.cache, 1030 matches, 1031 text, 1032 start, 1033 ) { 1034 dfa::Result::Match(_) => true, 1035 dfa::Result::NoMatch(_) => false, 1036 dfa::Result::Quit => { 1037 self.exec_nfa( 1038 MatchNfaType::Auto, 1039 matches, 1040 &mut [], 1041 false, 1042 text, 1043 start) 1044 } 1045 } 1046 } 1047 Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start), 1048 Nothing => false, 1049 } 1050 } 1051 1052 #[inline(always)] // reduces constant overhead is_anchor_end_match(&self, text: &[u8]) -> bool1053 fn is_anchor_end_match(&self, text: &[u8]) -> bool { 1054 // Only do this check if the haystack is big (>1MB). 1055 if text.len() > (1<<20) && self.ro.nfa.is_anchored_end { 1056 let lcs = self.ro.suffixes.lcs(); 1057 if lcs.len() >= 1 && !lcs.is_suffix(text) { 1058 return false; 1059 } 1060 } 1061 true 1062 } 1063 capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1064 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1065 &self.ro.nfa.capture_name_idx 1066 } 1067 } 1068 1069 impl<'c> ExecNoSyncStr<'c> { capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1070 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1071 self.0.capture_name_idx() 1072 } 1073 } 1074 1075 impl Exec { 1076 /// Get a searcher that isn't Sync. 1077 #[inline(always)] // reduces constant overhead searcher(&self) -> ExecNoSync1078 pub fn searcher(&self) -> ExecNoSync { 1079 let create = || Box::new(RefCell::new(ProgramCacheInner::new(&self.ro))); 1080 ExecNoSync { 1081 ro: &self.ro, // a clone is too expensive here! (and not needed) 1082 cache: self.cache.get_or(create), 1083 } 1084 } 1085 1086 /// Get a searcher that isn't Sync and can match on &str. 1087 #[inline(always)] // reduces constant overhead searcher_str(&self) -> ExecNoSyncStr1088 pub fn searcher_str(&self) -> ExecNoSyncStr { 1089 ExecNoSyncStr(self.searcher()) 1090 } 1091 1092 /// Build a Regex from this executor. into_regex(self) -> re_unicode::Regex1093 pub fn into_regex(self) -> re_unicode::Regex { 1094 re_unicode::Regex::from(self) 1095 } 1096 1097 /// Build a RegexSet from this executor. into_regex_set(self) -> re_set::unicode::RegexSet1098 pub fn into_regex_set(self) -> re_set::unicode::RegexSet { 1099 re_set::unicode::RegexSet::from(self) 1100 } 1101 1102 /// Build a Regex from this executor that can match arbitrary bytes. into_byte_regex(self) -> re_bytes::Regex1103 pub fn into_byte_regex(self) -> re_bytes::Regex { 1104 re_bytes::Regex::from(self) 1105 } 1106 1107 /// Build a RegexSet from this executor that can match arbitrary bytes. into_byte_regex_set(self) -> re_set::bytes::RegexSet1108 pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { 1109 re_set::bytes::RegexSet::from(self) 1110 } 1111 1112 /// The original regular expressions given by the caller that were 1113 /// compiled. regex_strings(&self) -> &[String]1114 pub fn regex_strings(&self) -> &[String] { 1115 &self.ro.res 1116 } 1117 1118 /// Return a slice of capture names. 1119 /// 1120 /// Any capture that isn't named is None. capture_names(&self) -> &[Option<String>]1121 pub fn capture_names(&self) -> &[Option<String>] { 1122 &self.ro.nfa.captures 1123 } 1124 1125 /// Return a reference to named groups mapping (from group name to 1126 /// group position). capture_name_idx(&self) -> &Arc<HashMap<String, usize>>1127 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1128 &self.ro.nfa.capture_name_idx 1129 } 1130 } 1131 1132 impl Clone for Exec { clone(&self) -> Exec1133 fn clone(&self) -> Exec { 1134 Exec { 1135 ro: self.ro.clone(), 1136 cache: CachedThreadLocal::new(), 1137 } 1138 } 1139 } 1140 1141 impl ExecReadOnly { choose_match_type(&self, hint: Option<MatchType>) -> MatchType1142 fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType { 1143 use self::MatchType::*; 1144 if let Some(Nfa(_)) = hint { 1145 return hint.unwrap(); 1146 } 1147 // If the NFA is empty, then we'll never match anything. 1148 if self.nfa.insts.is_empty() { 1149 return Nothing; 1150 } 1151 // If our set of prefixes is complete, then we can use it to find 1152 // a match in lieu of a regex engine. This doesn't quite work well in 1153 // the presence of multiple regexes, so only do it when there's one. 1154 // 1155 // TODO(burntsushi): Also, don't try to match literals if the regex is 1156 // partially anchored. We could technically do it, but we'd need to 1157 // create two sets of literals: all of them and then the subset that 1158 // aren't anchored. We would then only search for all of them when at 1159 // the beginning of the input and use the subset in all other cases. 1160 if self.res.len() == 1 { 1161 if self.nfa.prefixes.complete() { 1162 return if self.nfa.is_anchored_start { 1163 Literal(MatchLiteralType::AnchoredStart) 1164 } else { 1165 Literal(MatchLiteralType::Unanchored) 1166 }; 1167 } 1168 if self.suffixes.complete() { 1169 return if self.nfa.is_anchored_end { 1170 Literal(MatchLiteralType::AnchoredEnd) 1171 } else { 1172 // This case shouldn't happen. When the regex isn't 1173 // anchored, then complete prefixes should imply complete 1174 // suffixes. 1175 Literal(MatchLiteralType::Unanchored) 1176 }; 1177 } 1178 } 1179 // If we can execute the DFA, then we totally should. 1180 if dfa::can_exec(&self.dfa) { 1181 // Regex sets require a slightly specialized path. 1182 if self.res.len() >= 2 { 1183 return DfaMany; 1184 } 1185 // If the regex is anchored at the end but not the start, then 1186 // just match in reverse from the end of the haystack. 1187 if !self.nfa.is_anchored_start && self.nfa.is_anchored_end { 1188 return DfaAnchoredReverse; 1189 } 1190 // If there's a longish suffix literal, then it might be faster 1191 // to look for that first. 1192 if self.should_suffix_scan() { 1193 return DfaSuffix; 1194 } 1195 // Fall back to your garden variety forward searching lazy DFA. 1196 return Dfa; 1197 } 1198 // We're so totally hosed. 1199 Nfa(MatchNfaType::Auto) 1200 } 1201 1202 /// Returns true if the program is amenable to suffix scanning. 1203 /// 1204 /// When this is true, as a heuristic, we assume it is OK to quickly scan 1205 /// for suffix literals and then do a *reverse* DFA match from any matches 1206 /// produced by the literal scan. (And then followed by a forward DFA 1207 /// search, since the previously found suffix literal maybe not actually be 1208 /// the end of a match.) 1209 /// 1210 /// This is a bit of a specialized optimization, but can result in pretty 1211 /// big performance wins if 1) there are no prefix literals and 2) the 1212 /// suffix literals are pretty rare in the text. (1) is obviously easy to 1213 /// account for but (2) is harder. As a proxy, we assume that longer 1214 /// strings are generally rarer, so we only enable this optimization when 1215 /// we have a meaty suffix. should_suffix_scan(&self) -> bool1216 fn should_suffix_scan(&self) -> bool { 1217 if self.suffixes.is_empty() { 1218 return false; 1219 } 1220 let lcs_len = self.suffixes.lcs().char_len(); 1221 lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() 1222 } 1223 } 1224 1225 #[derive(Clone, Copy, Debug)] 1226 enum MatchType { 1227 /// A single or multiple literal search. This is only used when the regex 1228 /// can be decomposed into unambiguous literal search. 1229 Literal(MatchLiteralType), 1230 /// A normal DFA search. 1231 Dfa, 1232 /// A reverse DFA search starting from the end of a haystack. 1233 DfaAnchoredReverse, 1234 /// A reverse DFA search with suffix literal scanning. 1235 DfaSuffix, 1236 /// Use the DFA on two or more regular expressions. 1237 DfaMany, 1238 /// An NFA variant. 1239 Nfa(MatchNfaType), 1240 /// No match is ever possible, so don't ever try to search. 1241 Nothing, 1242 } 1243 1244 #[derive(Clone, Copy, Debug)] 1245 enum MatchLiteralType { 1246 /// Match literals anywhere in text. 1247 Unanchored, 1248 /// Match literals only at the start of text. 1249 AnchoredStart, 1250 /// Match literals only at the end of text. 1251 AnchoredEnd, 1252 } 1253 1254 #[derive(Clone, Copy, Debug)] 1255 enum MatchNfaType { 1256 /// Choose between Backtrack and PikeVM. 1257 Auto, 1258 /// NFA bounded backtracking. 1259 /// 1260 /// (This is only set by tests, since it never makes sense to always want 1261 /// backtracking.) 1262 Backtrack, 1263 /// The Pike VM. 1264 /// 1265 /// (This is only set by tests, since it never makes sense to always want 1266 /// the Pike VM.) 1267 PikeVM, 1268 } 1269 1270 /// `ProgramCache` maintains reusable allocations for each matching engine 1271 /// available to a particular program. 1272 pub type ProgramCache = RefCell<ProgramCacheInner>; 1273 1274 #[derive(Clone, Debug)] 1275 pub struct ProgramCacheInner { 1276 pub pikevm: pikevm::Cache, 1277 pub backtrack: backtrack::Cache, 1278 pub dfa: dfa::Cache, 1279 pub dfa_reverse: dfa::Cache, 1280 } 1281 1282 impl ProgramCacheInner { new(ro: &ExecReadOnly) -> Self1283 fn new(ro: &ExecReadOnly) -> Self { 1284 ProgramCacheInner { 1285 pikevm: pikevm::Cache::new(&ro.nfa), 1286 backtrack: backtrack::Cache::new(&ro.nfa), 1287 dfa: dfa::Cache::new(&ro.dfa), 1288 dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), 1289 } 1290 } 1291 } 1292 1293 #[cfg(test)] 1294 mod test { 1295 #[test] uppercut_s_backtracking_bytes_default_bytes_mismatch()1296 fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { 1297 use internal::ExecBuilder; 1298 1299 let backtrack_bytes_re = ExecBuilder::new("^S") 1300 .bounded_backtracking() 1301 .only_utf8(false) 1302 .build() 1303 .map(|exec| exec.into_byte_regex()) 1304 .map_err(|err| format!("{}", err)) 1305 .unwrap(); 1306 1307 let default_bytes_re = ExecBuilder::new("^S") 1308 .only_utf8(false) 1309 .build() 1310 .map(|exec| exec.into_byte_regex()) 1311 .map_err(|err| format!("{}", err)) 1312 .unwrap(); 1313 1314 let input = vec![83, 83]; 1315 1316 let s1 = backtrack_bytes_re.split(&input); 1317 let s2 = default_bytes_re.split(&input); 1318 for (chunk1, chunk2) in s1.zip(s2) { 1319 assert_eq!(chunk1, chunk2); 1320 } 1321 } 1322 1323 #[test] unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch()1324 fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { 1325 use internal::ExecBuilder; 1326 1327 let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)") 1328 .bounded_backtracking() 1329 .bytes(true) 1330 .build() 1331 .map(|exec| exec.into_regex()) 1332 .map_err(|err| format!("{}", err)) 1333 .unwrap(); 1334 1335 let default_bytes_re = ExecBuilder::new(r"^(?u:\*)") 1336 .bytes(true) 1337 .build() 1338 .map(|exec| exec.into_regex()) 1339 .map_err(|err| format!("{}", err)) 1340 .unwrap(); 1341 1342 let input = "**"; 1343 1344 let s1 = backtrack_bytes_re.split(input); 1345 let s2 = default_bytes_re.split(input); 1346 for (chunk1, chunk2) in s1.zip(s2) { 1347 assert_eq!(chunk1, chunk2); 1348 } 1349 } 1350 } 1351