1 // This module provides an NFA compiler using Thompson's construction 2 // algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA 3 // graph as output. The NFA graph is structured in a way that permits it to be 4 // executed by a virtual machine and also used to efficiently build a DFA. 5 // 6 // The compiler deals with a slightly expanded set of NFA states that notably 7 // includes an empty node that has exactly one epsilon transition to the next 8 // state. In other words, it's a "goto" instruction if one views Thompson's NFA 9 // as a set of bytecode instructions. These goto instructions are removed in 10 // a subsequent phase before returning the NFA to the caller. The purpose of 11 // these empty nodes is that they make the construction algorithm substantially 12 // simpler to implement. We remove them before returning to the caller because 13 // they can represent substantial overhead when traversing the NFA graph 14 // (either while searching using the NFA directly or while building a DFA). 15 // 16 // In the future, it would be nice to provide a Glushkov compiler as well, 17 // as it would work well as a bit-parallel NFA for smaller regexes. But 18 // the Thompson construction is one I'm more familiar with and seems more 19 // straight-forward to deal with when it comes to large Unicode character 20 // classes. 21 // 22 // Internally, the compiler uses interior mutability to improve composition 23 // in the face of the borrow checker. In particular, we'd really like to be 24 // able to write things like this: 25 // 26 // self.c_concat(exprs.iter().map(|e| self.c(e))) 27 // 28 // Which elegantly uses iterators to build up a sequence of compiled regex 29 // sub-expressions and then hands it off to the concatenating compiler 30 // routine. Without interior mutability, the borrow checker won't let us 31 // borrow `self` mutably both inside and outside the closure at the same 32 // time. 33 34 use std::cell::RefCell; 35 use std::mem; 36 37 use regex_syntax::hir::{self, Hir, HirKind}; 38 use regex_syntax::utf8::{Utf8Range, Utf8Sequences}; 39 40 use classes::ByteClassSet; 41 use error::{Error, Result}; 42 use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}; 43 use nfa::range_trie::RangeTrie; 44 use nfa::{State, StateID, Transition, NFA}; 45 46 /// Config knobs for the NFA compiler. See the builder's methods for more 47 /// docs on each one. 48 #[derive(Clone, Copy, Debug)] 49 struct Config { 50 anchored: bool, 51 allow_invalid_utf8: bool, 52 reverse: bool, 53 shrink: bool, 54 } 55 56 impl Default for Config { default() -> Config57 fn default() -> Config { 58 Config { 59 anchored: false, 60 allow_invalid_utf8: false, 61 reverse: false, 62 shrink: true, 63 } 64 } 65 } 66 67 /// A builder for compiling an NFA. 68 #[derive(Clone, Debug)] 69 pub struct Builder { 70 config: Config, 71 } 72 73 impl Builder { 74 /// Create a new NFA builder with its default configuration. new() -> Builder75 pub fn new() -> Builder { 76 Builder { config: Config::default() } 77 } 78 79 /// Compile the given high level intermediate representation of a regular 80 /// expression into an NFA. 81 /// 82 /// If there was a problem building the NFA, then an error is returned. 83 /// For example, if the regex uses unsupported features (such as zero-width 84 /// assertions), then an error is returned. build(&self, expr: &Hir) -> Result<NFA>85 pub fn build(&self, expr: &Hir) -> Result<NFA> { 86 let mut nfa = NFA::always_match(); 87 self.build_with(&mut Compiler::new(), &mut nfa, expr)?; 88 Ok(nfa) 89 } 90 91 /// Compile the given high level intermediate representation of a regular 92 /// expression into the NFA given using the given compiler. Callers may 93 /// prefer this over `build` if they would like to reuse allocations while 94 /// compiling many regular expressions. 95 /// 96 /// On success, the given NFA is completely overwritten with the NFA 97 /// produced by the compiler. 98 /// 99 /// If there was a problem building the NFA, then an error is returned. For 100 /// example, if the regex uses unsupported features (such as zero-width 101 /// assertions), then an error is returned. When an error is returned, 102 /// the contents of `nfa` are unspecified and should not be relied upon. 103 /// However, it can still be reused in subsequent calls to this method. build_with( &self, compiler: &mut Compiler, nfa: &mut NFA, expr: &Hir, ) -> Result<()>104 pub fn build_with( 105 &self, 106 compiler: &mut Compiler, 107 nfa: &mut NFA, 108 expr: &Hir, 109 ) -> Result<()> { 110 compiler.clear(); 111 compiler.configure(self.config); 112 compiler.compile(nfa, expr) 113 } 114 115 /// Set whether matching must be anchored at the beginning of the input. 116 /// 117 /// When enabled, a match must begin at the start of the input. When 118 /// disabled, the NFA will act as if the pattern started with a `.*?`, 119 /// which enables a match to appear anywhere. 120 /// 121 /// By default this is disabled. anchored(&mut self, yes: bool) -> &mut Builder122 pub fn anchored(&mut self, yes: bool) -> &mut Builder { 123 self.config.anchored = yes; 124 self 125 } 126 127 /// When enabled, the builder will permit the construction of an NFA that 128 /// may match invalid UTF-8. 129 /// 130 /// When disabled (the default), the builder is guaranteed to produce a 131 /// regex that will only ever match valid UTF-8 (otherwise, the builder 132 /// will return an error). allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder133 pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder { 134 self.config.allow_invalid_utf8 = yes; 135 self 136 } 137 138 /// Reverse the NFA. 139 /// 140 /// A NFA reversal is performed by reversing all of the concatenated 141 /// sub-expressions in the original pattern, recursively. The resulting 142 /// NFA can be used to match the pattern starting from the end of a string 143 /// instead of the beginning of a string. 144 /// 145 /// Reversing the NFA is useful for building a reverse DFA, which is most 146 /// useful for finding the start of a match. reverse(&mut self, yes: bool) -> &mut Builder147 pub fn reverse(&mut self, yes: bool) -> &mut Builder { 148 self.config.reverse = yes; 149 self 150 } 151 152 /// Apply best effort heuristics to shrink the NFA at the expense of more 153 /// time/memory. 154 /// 155 /// This is enabled by default. Generally speaking, if one is using an NFA 156 /// to compile DFA, then the extra time used to shrink the NFA will be 157 /// more than made up for during DFA construction (potentially by a lot). 158 /// In other words, enabling this can substantially decrease the overall 159 /// amount of time it takes to build a DFA. 160 /// 161 /// The only reason to disable this if you want to compile an NFA and start 162 /// using it as quickly as possible without needing to build a DFA. shrink(&mut self, yes: bool) -> &mut Builder163 pub fn shrink(&mut self, yes: bool) -> &mut Builder { 164 self.config.shrink = yes; 165 self 166 } 167 } 168 169 /// A compiler that converts a regex abstract syntax to an NFA via Thompson's 170 /// construction. Namely, this compiler permits epsilon transitions between 171 /// states. 172 /// 173 /// Users of this crate cannot use a compiler directly. Instead, all one can 174 /// do is create one and use it via the 175 /// [`Builder::build_with`](struct.Builder.html#method.build_with) 176 /// method. This permits callers to reuse compilers in order to amortize 177 /// allocations. 178 #[derive(Clone, Debug)] 179 pub struct Compiler { 180 /// The set of compiled NFA states. Once a state is compiled, it is 181 /// assigned a state ID equivalent to its index in this list. Subsequent 182 /// compilation can modify previous states by adding new transitions. 183 states: RefCell<Vec<CState>>, 184 /// The configuration from the builder. 185 config: Config, 186 /// State used for compiling character classes to UTF-8 byte automata. 187 /// State is not retained between character class compilations. This just 188 /// serves to amortize allocation to the extent possible. 189 utf8_state: RefCell<Utf8State>, 190 /// State used for arranging character classes in reverse into a trie. 191 trie_state: RefCell<RangeTrie>, 192 /// State used for caching common suffixes when compiling reverse UTF-8 193 /// automata (for Unicode character classes). 194 utf8_suffix: RefCell<Utf8SuffixMap>, 195 /// A map used to re-map state IDs when translating the compiler's internal 196 /// NFA state representation to the external NFA representation. 197 remap: RefCell<Vec<StateID>>, 198 /// A set of compiler internal state IDs that correspond to states that are 199 /// exclusively epsilon transitions, i.e., goto instructions, combined with 200 /// the state that they point to. This is used to record said states while 201 /// transforming the compiler's internal NFA representation to the external 202 /// form. 203 empties: RefCell<Vec<(StateID, StateID)>>, 204 } 205 206 /// A compiler intermediate state representation for an NFA that is only used 207 /// during compilation. Once compilation is done, `CState`s are converted to 208 /// `State`s, which have a much simpler representation. 209 #[derive(Clone, Debug, Eq, PartialEq)] 210 enum CState { 211 /// An empty state whose only purpose is to forward the automaton to 212 /// another state via en epsilon transition. These are useful during 213 /// compilation but are otherwise removed at the end. 214 Empty { next: StateID }, 215 /// A state that only transitions to `next` if the current input byte is 216 /// in the range `[start, end]` (inclusive on both ends). 217 Range { range: Transition }, 218 /// A state with possibly many transitions, represented in a sparse 219 /// fashion. Transitions are ordered lexicographically by input range. 220 /// As such, this may only be used when every transition has equal 221 /// priority. (In practice, this is only used for encoding large UTF-8 222 /// automata.) 223 Sparse { ranges: Vec<Transition> }, 224 /// An alternation such that there exists an epsilon transition to all 225 /// states in `alternates`, where matches found via earlier transitions 226 /// are preferred over later transitions. 227 Union { alternates: Vec<StateID> }, 228 /// An alternation such that there exists an epsilon transition to all 229 /// states in `alternates`, where matches found via later transitions 230 /// are preferred over earlier transitions. 231 /// 232 /// This "reverse" state exists for convenience during compilation that 233 /// permits easy construction of non-greedy combinations of NFA states. 234 /// At the end of compilation, Union and UnionReverse states are merged 235 /// into one Union type of state, where the latter has its epsilon 236 /// transitions reversed to reflect the priority inversion. 237 UnionReverse { alternates: Vec<StateID> }, 238 /// A match state. There is exactly one such occurrence of this state in 239 /// an NFA. 240 Match, 241 } 242 243 /// A value that represents the result of compiling a sub-expression of a 244 /// regex's HIR. Specifically, this represents a sub-graph of the NFA that 245 /// has an initial state at `start` and a final state at `end`. 246 #[derive(Clone, Copy, Debug)] 247 pub struct ThompsonRef { 248 start: StateID, 249 end: StateID, 250 } 251 252 impl Compiler { 253 /// Create a new compiler. new() -> Compiler254 pub fn new() -> Compiler { 255 Compiler { 256 states: RefCell::new(vec![]), 257 config: Config::default(), 258 utf8_state: RefCell::new(Utf8State::new()), 259 trie_state: RefCell::new(RangeTrie::new()), 260 utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), 261 remap: RefCell::new(vec![]), 262 empties: RefCell::new(vec![]), 263 } 264 } 265 266 /// Clear any memory used by this compiler such that it is ready to compile 267 /// a new regex. 268 /// 269 /// It is preferrable to reuse a compiler if possible in order to reuse 270 /// allocations. clear(&self)271 fn clear(&self) { 272 self.states.borrow_mut().clear(); 273 // We don't need to clear anything else since they are cleared on 274 // their own and only when they are used. 275 } 276 277 /// Configure this compiler from the builder's knobs. 278 /// 279 /// The compiler is always reconfigured by the builder before using it to 280 /// build an NFA. configure(&mut self, config: Config)281 fn configure(&mut self, config: Config) { 282 self.config = config; 283 } 284 285 /// Convert the current intermediate NFA to its final compiled form. compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()>286 fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> { 287 nfa.anchored = self.config.anchored; 288 289 let mut start = self.add_empty(); 290 if !nfa.anchored { 291 let compiled = if self.config.allow_invalid_utf8 { 292 self.c_unanchored_prefix_invalid_utf8()? 293 } else { 294 self.c_unanchored_prefix_valid_utf8()? 295 }; 296 self.patch(start, compiled.start); 297 start = compiled.end; 298 } 299 let compiled = self.c(&expr)?; 300 let match_id = self.add_match(); 301 self.patch(start, compiled.start); 302 self.patch(compiled.end, match_id); 303 self.finish(nfa); 304 Ok(()) 305 } 306 307 /// Finishes the compilation process and populates the provide NFA with 308 /// the final graph. finish(&self, nfa: &mut NFA)309 fn finish(&self, nfa: &mut NFA) { 310 let mut bstates = self.states.borrow_mut(); 311 let mut remap = self.remap.borrow_mut(); 312 remap.resize(bstates.len(), 0); 313 let mut empties = self.empties.borrow_mut(); 314 empties.clear(); 315 316 // We don't reuse allocations here becuase this is what we're 317 // returning. 318 nfa.states.clear(); 319 let mut byteset = ByteClassSet::new(); 320 321 // The idea here is to convert our intermediate states to their final 322 // form. The only real complexity here is the process of converting 323 // transitions, which are expressed in terms of state IDs. The new 324 // set of states will be smaller because of partial epsilon removal, 325 // so the state IDs will not be the same. 326 for (id, bstate) in bstates.iter_mut().enumerate() { 327 match *bstate { 328 CState::Empty { next } => { 329 // Since we're removing empty states, we need to handle 330 // them later since we don't yet know which new state this 331 // empty state will be mapped to. 332 empties.push((id, next)); 333 } 334 CState::Range { ref range } => { 335 remap[id] = nfa.states.len(); 336 byteset.set_range(range.start, range.end); 337 nfa.states.push(State::Range { range: range.clone() }); 338 } 339 CState::Sparse { ref mut ranges } => { 340 remap[id] = nfa.states.len(); 341 342 let ranges = mem::replace(ranges, vec![]); 343 for r in &ranges { 344 byteset.set_range(r.start, r.end); 345 } 346 nfa.states.push(State::Sparse { 347 ranges: ranges.into_boxed_slice(), 348 }); 349 } 350 CState::Union { ref mut alternates } => { 351 remap[id] = nfa.states.len(); 352 353 let alternates = mem::replace(alternates, vec![]); 354 nfa.states.push(State::Union { 355 alternates: alternates.into_boxed_slice(), 356 }); 357 } 358 CState::UnionReverse { ref mut alternates } => { 359 remap[id] = nfa.states.len(); 360 361 let mut alternates = mem::replace(alternates, vec![]); 362 alternates.reverse(); 363 nfa.states.push(State::Union { 364 alternates: alternates.into_boxed_slice(), 365 }); 366 } 367 CState::Match => { 368 remap[id] = nfa.states.len(); 369 nfa.states.push(State::Match); 370 } 371 } 372 } 373 for &(empty_id, mut empty_next) in empties.iter() { 374 // empty states can point to other empty states, forming a chain. 375 // So we must follow the chain until the end, which must end at 376 // a non-empty state, and therefore, a state that is correctly 377 // remapped. We are guaranteed to terminate because our compiler 378 // never builds a loop among empty states. 379 while let CState::Empty { next } = bstates[empty_next] { 380 empty_next = next; 381 } 382 remap[empty_id] = remap[empty_next]; 383 } 384 for state in &mut nfa.states { 385 state.remap(&remap); 386 } 387 // The compiler always begins the NFA at the first state. 388 nfa.start = remap[0]; 389 nfa.byte_classes = byteset.byte_classes(); 390 } 391 c(&self, expr: &Hir) -> Result<ThompsonRef>392 fn c(&self, expr: &Hir) -> Result<ThompsonRef> { 393 match *expr.kind() { 394 HirKind::Empty => { 395 let id = self.add_empty(); 396 Ok(ThompsonRef { start: id, end: id }) 397 } 398 HirKind::Literal(hir::Literal::Unicode(ch)) => { 399 let mut buf = [0; 4]; 400 let it = ch 401 .encode_utf8(&mut buf) 402 .as_bytes() 403 .iter() 404 .map(|&b| Ok(self.c_range(b, b))); 405 self.c_concat(it) 406 } 407 HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)), 408 HirKind::Class(hir::Class::Bytes(ref cls)) => { 409 self.c_byte_class(cls) 410 } 411 HirKind::Class(hir::Class::Unicode(ref cls)) => { 412 self.c_unicode_class(cls) 413 } 414 HirKind::Repetition(ref rep) => self.c_repetition(rep), 415 HirKind::Group(ref group) => self.c(&*group.hir), 416 HirKind::Concat(ref exprs) => { 417 self.c_concat(exprs.iter().map(|e| self.c(e))) 418 } 419 HirKind::Alternation(ref exprs) => { 420 self.c_alternation(exprs.iter().map(|e| self.c(e))) 421 } 422 HirKind::Anchor(_) => Err(Error::unsupported_anchor()), 423 HirKind::WordBoundary(_) => Err(Error::unsupported_word()), 424 } 425 } 426 c_concat<I>(&self, mut it: I) -> Result<ThompsonRef> where I: DoubleEndedIterator<Item = Result<ThompsonRef>>,427 fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef> 428 where 429 I: DoubleEndedIterator<Item = Result<ThompsonRef>>, 430 { 431 let first = 432 if self.config.reverse { it.next_back() } else { it.next() }; 433 let ThompsonRef { start, mut end } = match first { 434 Some(result) => result?, 435 None => return Ok(self.c_empty()), 436 }; 437 loop { 438 let next = 439 if self.config.reverse { it.next_back() } else { it.next() }; 440 let compiled = match next { 441 Some(result) => result?, 442 None => break, 443 }; 444 self.patch(end, compiled.start); 445 end = compiled.end; 446 } 447 Ok(ThompsonRef { start, end }) 448 } 449 c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef> where I: Iterator<Item = Result<ThompsonRef>>,450 fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef> 451 where 452 I: Iterator<Item = Result<ThompsonRef>>, 453 { 454 let first = it.next().expect("alternations must be non-empty")?; 455 let second = match it.next() { 456 None => return Ok(first), 457 Some(result) => result?, 458 }; 459 460 let union = self.add_union(); 461 let end = self.add_empty(); 462 self.patch(union, first.start); 463 self.patch(first.end, end); 464 self.patch(union, second.start); 465 self.patch(second.end, end); 466 for result in it { 467 let compiled = result?; 468 self.patch(union, compiled.start); 469 self.patch(compiled.end, end); 470 } 471 Ok(ThompsonRef { start: union, end }) 472 } 473 c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef>474 fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> { 475 match rep.kind { 476 hir::RepetitionKind::ZeroOrOne => { 477 self.c_zero_or_one(&rep.hir, rep.greedy) 478 } 479 hir::RepetitionKind::ZeroOrMore => { 480 self.c_at_least(&rep.hir, rep.greedy, 0) 481 } 482 hir::RepetitionKind::OneOrMore => { 483 self.c_at_least(&rep.hir, rep.greedy, 1) 484 } 485 hir::RepetitionKind::Range(ref rng) => match *rng { 486 hir::RepetitionRange::Exactly(count) => { 487 self.c_exactly(&rep.hir, count) 488 } 489 hir::RepetitionRange::AtLeast(m) => { 490 self.c_at_least(&rep.hir, rep.greedy, m) 491 } 492 hir::RepetitionRange::Bounded(min, max) => { 493 self.c_bounded(&rep.hir, rep.greedy, min, max) 494 } 495 }, 496 } 497 } 498 c_bounded( &self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> Result<ThompsonRef>499 fn c_bounded( 500 &self, 501 expr: &Hir, 502 greedy: bool, 503 min: u32, 504 max: u32, 505 ) -> Result<ThompsonRef> { 506 let prefix = self.c_exactly(expr, min)?; 507 if min == max { 508 return Ok(prefix); 509 } 510 511 // It is tempting here to compile the rest here as a concatenation 512 // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it 513 // were `aaa?a?a?`. The problem here is that it leads to this program: 514 // 515 // >000000: 61 => 01 516 // 000001: 61 => 02 517 // 000002: alt(03, 04) 518 // 000003: 61 => 04 519 // 000004: alt(05, 06) 520 // 000005: 61 => 06 521 // 000006: alt(07, 08) 522 // 000007: 61 => 08 523 // 000008: MATCH 524 // 525 // And effectively, once you hit state 2, the epsilon closure will 526 // include states 3, 5, 5, 6, 7 and 8, which is quite a bit. It is 527 // better to instead compile it like so: 528 // 529 // >000000: 61 => 01 530 // 000001: 61 => 02 531 // 000002: alt(03, 08) 532 // 000003: 61 => 04 533 // 000004: alt(05, 08) 534 // 000005: 61 => 06 535 // 000006: alt(07, 08) 536 // 000007: 61 => 08 537 // 000008: MATCH 538 // 539 // So that the epsilon closure of state 2 is now just 3 and 8. 540 let empty = self.add_empty(); 541 let mut prev_end = prefix.end; 542 for _ in min..max { 543 let union = if greedy { 544 self.add_union() 545 } else { 546 self.add_reverse_union() 547 }; 548 let compiled = self.c(expr)?; 549 self.patch(prev_end, union); 550 self.patch(union, compiled.start); 551 self.patch(union, empty); 552 prev_end = compiled.end; 553 } 554 self.patch(prev_end, empty); 555 Ok(ThompsonRef { start: prefix.start, end: empty }) 556 } 557 c_at_least( &self, expr: &Hir, greedy: bool, n: u32, ) -> Result<ThompsonRef>558 fn c_at_least( 559 &self, 560 expr: &Hir, 561 greedy: bool, 562 n: u32, 563 ) -> Result<ThompsonRef> { 564 if n == 0 { 565 let union = if greedy { 566 self.add_union() 567 } else { 568 self.add_reverse_union() 569 }; 570 let compiled = self.c(expr)?; 571 self.patch(union, compiled.start); 572 self.patch(compiled.end, union); 573 Ok(ThompsonRef { start: union, end: union }) 574 } else if n == 1 { 575 let compiled = self.c(expr)?; 576 let union = if greedy { 577 self.add_union() 578 } else { 579 self.add_reverse_union() 580 }; 581 self.patch(compiled.end, union); 582 self.patch(union, compiled.start); 583 Ok(ThompsonRef { start: compiled.start, end: union }) 584 } else { 585 let prefix = self.c_exactly(expr, n - 1)?; 586 let last = self.c(expr)?; 587 let union = if greedy { 588 self.add_union() 589 } else { 590 self.add_reverse_union() 591 }; 592 self.patch(prefix.end, last.start); 593 self.patch(last.end, union); 594 self.patch(union, last.start); 595 Ok(ThompsonRef { start: prefix.start, end: union }) 596 } 597 } 598 c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef>599 fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> { 600 let union = 601 if greedy { self.add_union() } else { self.add_reverse_union() }; 602 let compiled = self.c(expr)?; 603 let empty = self.add_empty(); 604 self.patch(union, compiled.start); 605 self.patch(union, empty); 606 self.patch(compiled.end, empty); 607 Ok(ThompsonRef { start: union, end: empty }) 608 } 609 c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef>610 fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> { 611 let it = (0..n).map(|_| self.c(expr)); 612 self.c_concat(it) 613 } 614 c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef>615 fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> { 616 let end = self.add_empty(); 617 let mut trans = Vec::with_capacity(cls.ranges().len()); 618 for r in cls.iter() { 619 trans.push(Transition { 620 start: r.start(), 621 end: r.end(), 622 next: end, 623 }); 624 } 625 Ok(ThompsonRef { start: self.add_sparse(trans), end }) 626 } 627 c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef>628 fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> { 629 // If all we have are ASCII ranges wrapped in a Unicode package, then 630 // there is zero reason to bring out the big guns. We can fit all ASCII 631 // ranges within a single sparse transition. 632 if cls.is_all_ascii() { 633 let end = self.add_empty(); 634 let mut trans = Vec::with_capacity(cls.ranges().len()); 635 for r in cls.iter() { 636 assert!(r.start() <= '\x7F'); 637 assert!(r.end() <= '\x7F'); 638 trans.push(Transition { 639 start: r.start() as u8, 640 end: r.end() as u8, 641 next: end, 642 }); 643 } 644 Ok(ThompsonRef { start: self.add_sparse(trans), end }) 645 } else if self.config.reverse { 646 if !self.config.shrink { 647 // When we don't want to spend the extra time shrinking, we 648 // compile the UTF-8 automaton in reverse using something like 649 // the "naive" approach, but will attempt to re-use common 650 // suffixes. 651 self.c_unicode_class_reverse_with_suffix(cls) 652 } else { 653 // When we want to shrink our NFA for reverse UTF-8 automata, 654 // we cannot feed UTF-8 sequences directly to the UTF-8 655 // compiler, since the UTF-8 compiler requires all sequences 656 // to be lexicographically sorted. Instead, we organize our 657 // sequences into a range trie, which can then output our 658 // sequences in the correct order. Unfortunately, building the 659 // range trie is fairly expensive (but not nearly as expensive 660 // as building a DFA). Hence the reason why the 'shrink' option 661 // exists, so that this path can be toggled off. 662 let mut trie = self.trie_state.borrow_mut(); 663 trie.clear(); 664 665 for rng in cls.iter() { 666 for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { 667 seq.reverse(); 668 trie.insert(seq.as_slice()); 669 } 670 } 671 let mut utf8_state = self.utf8_state.borrow_mut(); 672 let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); 673 trie.iter(|seq| { 674 utf8c.add(&seq); 675 }); 676 Ok(utf8c.finish()) 677 } 678 } else { 679 // In the forward direction, we always shrink our UTF-8 automata 680 // because we can stream it right into the UTF-8 compiler. There 681 // is almost no downside (in either memory or time) to using this 682 // approach. 683 let mut utf8_state = self.utf8_state.borrow_mut(); 684 let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); 685 for rng in cls.iter() { 686 for seq in Utf8Sequences::new(rng.start(), rng.end()) { 687 utf8c.add(seq.as_slice()); 688 } 689 } 690 Ok(utf8c.finish()) 691 } 692 693 // For reference, the code below is the "naive" version of compiling a 694 // UTF-8 automaton. It is deliciously simple (and works for both the 695 // forward and reverse cases), but will unfortunately produce very 696 // large NFAs. When compiling a forward automaton, the size difference 697 // can sometimes be an order of magnitude. For example, the '\w' regex 698 // will generate about ~3000 NFA states using the naive approach below, 699 // but only 283 states when using the approach above. This is because 700 // the approach above actually compiles a *minimal* (or near minimal, 701 // because of the bounded hashmap) UTF-8 automaton. 702 // 703 // The code below is kept as a reference point in order to make it 704 // easier to understand the higher level goal here. 705 /* 706 let it = cls 707 .iter() 708 .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) 709 .map(|seq| { 710 let it = seq 711 .as_slice() 712 .iter() 713 .map(|rng| Ok(self.c_range(rng.start, rng.end))); 714 self.c_concat(it) 715 }); 716 self.c_alternation(it); 717 */ 718 } 719 c_unicode_class_reverse_with_suffix( &self, cls: &hir::ClassUnicode, ) -> Result<ThompsonRef>720 fn c_unicode_class_reverse_with_suffix( 721 &self, 722 cls: &hir::ClassUnicode, 723 ) -> Result<ThompsonRef> { 724 // N.B. It would likely be better to cache common *prefixes* in the 725 // reverse direction, but it's not quite clear how to do that. The 726 // advantage of caching suffixes is that it does give us a win, and 727 // has a very small additional overhead. 728 let mut cache = self.utf8_suffix.borrow_mut(); 729 cache.clear(); 730 731 let union = self.add_union(); 732 let alt_end = self.add_empty(); 733 for urng in cls.iter() { 734 for seq in Utf8Sequences::new(urng.start(), urng.end()) { 735 let mut end = alt_end; 736 for brng in seq.as_slice() { 737 let key = Utf8SuffixKey { 738 from: end, 739 start: brng.start, 740 end: brng.end, 741 }; 742 let hash = cache.hash(&key); 743 if let Some(id) = cache.get(&key, hash) { 744 end = id; 745 continue; 746 } 747 748 let compiled = self.c_range(brng.start, brng.end); 749 self.patch(compiled.end, end); 750 end = compiled.start; 751 cache.set(key, hash, end); 752 } 753 self.patch(union, end); 754 } 755 } 756 Ok(ThompsonRef { start: union, end: alt_end }) 757 } 758 c_range(&self, start: u8, end: u8) -> ThompsonRef759 fn c_range(&self, start: u8, end: u8) -> ThompsonRef { 760 let id = self.add_range(start, end); 761 ThompsonRef { start: id, end: id } 762 } 763 c_empty(&self) -> ThompsonRef764 fn c_empty(&self) -> ThompsonRef { 765 let id = self.add_empty(); 766 ThompsonRef { start: id, end: id } 767 } 768 c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef>769 fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> { 770 self.c(&Hir::repetition(hir::Repetition { 771 kind: hir::RepetitionKind::ZeroOrMore, 772 greedy: false, 773 hir: Box::new(Hir::any(false)), 774 })) 775 } 776 c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef>777 fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> { 778 self.c(&Hir::repetition(hir::Repetition { 779 kind: hir::RepetitionKind::ZeroOrMore, 780 greedy: false, 781 hir: Box::new(Hir::any(true)), 782 })) 783 } 784 patch(&self, from: StateID, to: StateID)785 fn patch(&self, from: StateID, to: StateID) { 786 match self.states.borrow_mut()[from] { 787 CState::Empty { ref mut next } => { 788 *next = to; 789 } 790 CState::Range { ref mut range } => { 791 range.next = to; 792 } 793 CState::Sparse { .. } => { 794 panic!("cannot patch from a sparse NFA state") 795 } 796 CState::Union { ref mut alternates } => { 797 alternates.push(to); 798 } 799 CState::UnionReverse { ref mut alternates } => { 800 alternates.push(to); 801 } 802 CState::Match => {} 803 } 804 } 805 add_empty(&self) -> StateID806 fn add_empty(&self) -> StateID { 807 let id = self.states.borrow().len(); 808 self.states.borrow_mut().push(CState::Empty { next: 0 }); 809 id 810 } 811 add_range(&self, start: u8, end: u8) -> StateID812 fn add_range(&self, start: u8, end: u8) -> StateID { 813 let id = self.states.borrow().len(); 814 let trans = Transition { start, end, next: 0 }; 815 let state = CState::Range { range: trans }; 816 self.states.borrow_mut().push(state); 817 id 818 } 819 add_sparse(&self, ranges: Vec<Transition>) -> StateID820 fn add_sparse(&self, ranges: Vec<Transition>) -> StateID { 821 if ranges.len() == 1 { 822 let id = self.states.borrow().len(); 823 let state = CState::Range { range: ranges[0] }; 824 self.states.borrow_mut().push(state); 825 return id; 826 } 827 let id = self.states.borrow().len(); 828 let state = CState::Sparse { ranges }; 829 self.states.borrow_mut().push(state); 830 id 831 } 832 add_union(&self) -> StateID833 fn add_union(&self) -> StateID { 834 let id = self.states.borrow().len(); 835 let state = CState::Union { alternates: vec![] }; 836 self.states.borrow_mut().push(state); 837 id 838 } 839 add_reverse_union(&self) -> StateID840 fn add_reverse_union(&self) -> StateID { 841 let id = self.states.borrow().len(); 842 let state = CState::UnionReverse { alternates: vec![] }; 843 self.states.borrow_mut().push(state); 844 id 845 } 846 add_match(&self) -> StateID847 fn add_match(&self) -> StateID { 848 let id = self.states.borrow().len(); 849 self.states.borrow_mut().push(CState::Match); 850 id 851 } 852 } 853 854 #[derive(Debug)] 855 struct Utf8Compiler<'a> { 856 nfac: &'a Compiler, 857 state: &'a mut Utf8State, 858 target: StateID, 859 } 860 861 #[derive(Clone, Debug)] 862 struct Utf8State { 863 compiled: Utf8BoundedMap, 864 uncompiled: Vec<Utf8Node>, 865 } 866 867 #[derive(Clone, Debug)] 868 struct Utf8Node { 869 trans: Vec<Transition>, 870 last: Option<Utf8LastTransition>, 871 } 872 873 #[derive(Clone, Debug)] 874 struct Utf8LastTransition { 875 start: u8, 876 end: u8, 877 } 878 879 impl Utf8State { new() -> Utf8State880 fn new() -> Utf8State { 881 Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] } 882 } 883 clear(&mut self)884 fn clear(&mut self) { 885 self.compiled.clear(); 886 self.uncompiled.clear(); 887 } 888 } 889 890 impl<'a> Utf8Compiler<'a> { new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a>891 fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> { 892 let target = nfac.add_empty(); 893 state.clear(); 894 let mut utf8c = Utf8Compiler { nfac, state, target }; 895 utf8c.add_empty(); 896 utf8c 897 } 898 finish(&mut self) -> ThompsonRef899 fn finish(&mut self) -> ThompsonRef { 900 self.compile_from(0); 901 let node = self.pop_root(); 902 let start = self.compile(node); 903 ThompsonRef { start, end: self.target } 904 } 905 add(&mut self, ranges: &[Utf8Range])906 fn add(&mut self, ranges: &[Utf8Range]) { 907 let prefix_len = ranges 908 .iter() 909 .zip(&self.state.uncompiled) 910 .take_while(|&(range, node)| { 911 node.last.as_ref().map_or(false, |t| { 912 (t.start, t.end) == (range.start, range.end) 913 }) 914 }) 915 .count(); 916 assert!(prefix_len < ranges.len()); 917 self.compile_from(prefix_len); 918 self.add_suffix(&ranges[prefix_len..]); 919 } 920 compile_from(&mut self, from: usize)921 fn compile_from(&mut self, from: usize) { 922 let mut next = self.target; 923 while from + 1 < self.state.uncompiled.len() { 924 let node = self.pop_freeze(next); 925 next = self.compile(node); 926 } 927 self.top_last_freeze(next); 928 } 929 compile(&mut self, node: Vec<Transition>) -> StateID930 fn compile(&mut self, node: Vec<Transition>) -> StateID { 931 let hash = self.state.compiled.hash(&node); 932 if let Some(id) = self.state.compiled.get(&node, hash) { 933 return id; 934 } 935 let id = self.nfac.add_sparse(node.clone()); 936 self.state.compiled.set(node, hash, id); 937 id 938 } 939 add_suffix(&mut self, ranges: &[Utf8Range])940 fn add_suffix(&mut self, ranges: &[Utf8Range]) { 941 assert!(!ranges.is_empty()); 942 let last = self 943 .state 944 .uncompiled 945 .len() 946 .checked_sub(1) 947 .expect("non-empty nodes"); 948 assert!(self.state.uncompiled[last].last.is_none()); 949 self.state.uncompiled[last].last = Some(Utf8LastTransition { 950 start: ranges[0].start, 951 end: ranges[0].end, 952 }); 953 for r in &ranges[1..] { 954 self.state.uncompiled.push(Utf8Node { 955 trans: vec![], 956 last: Some(Utf8LastTransition { start: r.start, end: r.end }), 957 }); 958 } 959 } 960 add_empty(&mut self)961 fn add_empty(&mut self) { 962 self.state.uncompiled.push(Utf8Node { trans: vec![], last: None }); 963 } 964 pop_freeze(&mut self, next: StateID) -> Vec<Transition>965 fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> { 966 let mut uncompiled = self.state.uncompiled.pop().unwrap(); 967 uncompiled.set_last_transition(next); 968 uncompiled.trans 969 } 970 pop_root(&mut self) -> Vec<Transition>971 fn pop_root(&mut self) -> Vec<Transition> { 972 assert_eq!(self.state.uncompiled.len(), 1); 973 assert!(self.state.uncompiled[0].last.is_none()); 974 self.state.uncompiled.pop().expect("non-empty nodes").trans 975 } 976 top_last_freeze(&mut self, next: StateID)977 fn top_last_freeze(&mut self, next: StateID) { 978 let last = self 979 .state 980 .uncompiled 981 .len() 982 .checked_sub(1) 983 .expect("non-empty nodes"); 984 self.state.uncompiled[last].set_last_transition(next); 985 } 986 } 987 988 impl Utf8Node { set_last_transition(&mut self, next: StateID)989 fn set_last_transition(&mut self, next: StateID) { 990 if let Some(last) = self.last.take() { 991 self.trans.push(Transition { 992 start: last.start, 993 end: last.end, 994 next, 995 }); 996 } 997 } 998 } 999 1000 #[cfg(test)] 1001 mod tests { 1002 use regex_syntax::hir::Hir; 1003 use regex_syntax::ParserBuilder; 1004 1005 use super::{Builder, State, StateID, Transition, NFA}; 1006 parse(pattern: &str) -> Hir1007 fn parse(pattern: &str) -> Hir { 1008 ParserBuilder::new().build().parse(pattern).unwrap() 1009 } 1010 build(pattern: &str) -> NFA1011 fn build(pattern: &str) -> NFA { 1012 Builder::new().anchored(true).build(&parse(pattern)).unwrap() 1013 } 1014 s_byte(byte: u8, next: StateID) -> State1015 fn s_byte(byte: u8, next: StateID) -> State { 1016 let trans = Transition { start: byte, end: byte, next }; 1017 State::Range { range: trans } 1018 } 1019 s_range(start: u8, end: u8, next: StateID) -> State1020 fn s_range(start: u8, end: u8, next: StateID) -> State { 1021 let trans = Transition { start, end, next }; 1022 State::Range { range: trans } 1023 } 1024 s_sparse(ranges: &[(u8, u8, StateID)]) -> State1025 fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State { 1026 let ranges = ranges 1027 .iter() 1028 .map(|&(start, end, next)| Transition { start, end, next }) 1029 .collect(); 1030 State::Sparse { ranges } 1031 } 1032 s_union(alts: &[StateID]) -> State1033 fn s_union(alts: &[StateID]) -> State { 1034 State::Union { alternates: alts.to_vec().into_boxed_slice() } 1035 } 1036 s_match() -> State1037 fn s_match() -> State { 1038 State::Match 1039 } 1040 1041 #[test] errors()1042 fn errors() { 1043 // unsupported anchors 1044 assert!(Builder::new().build(&parse(r"^")).is_err()); 1045 assert!(Builder::new().build(&parse(r"$")).is_err()); 1046 assert!(Builder::new().build(&parse(r"\A")).is_err()); 1047 assert!(Builder::new().build(&parse(r"\z")).is_err()); 1048 1049 // unsupported word boundaries 1050 assert!(Builder::new().build(&parse(r"\b")).is_err()); 1051 assert!(Builder::new().build(&parse(r"\B")).is_err()); 1052 assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err()); 1053 } 1054 1055 // Test that building an unanchored NFA has an appropriate `.*?` prefix. 1056 #[test] compile_unanchored_prefix()1057 fn compile_unanchored_prefix() { 1058 // When the machine can only match valid UTF-8. 1059 let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap(); 1060 // There should be many states since the `.` in `.*?` matches any 1061 // Unicode scalar value. 1062 assert_eq!(11, nfa.len()); 1063 assert_eq!(nfa.states[10], s_match()); 1064 assert_eq!(nfa.states[9], s_byte(b'a', 10)); 1065 1066 // When the machine can match invalid UTF-8. 1067 let nfa = Builder::new() 1068 .anchored(false) 1069 .allow_invalid_utf8(true) 1070 .build(&parse(r"a")) 1071 .unwrap(); 1072 assert_eq!( 1073 nfa.states, 1074 &[ 1075 s_union(&[2, 1]), 1076 s_range(0, 255, 0), 1077 s_byte(b'a', 3), 1078 s_match(), 1079 ] 1080 ); 1081 } 1082 1083 #[test] compile_empty()1084 fn compile_empty() { 1085 assert_eq!(build("").states, &[s_match(),]); 1086 } 1087 1088 #[test] compile_literal()1089 fn compile_literal() { 1090 assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]); 1091 assert_eq!( 1092 build("ab").states, 1093 &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] 1094 ); 1095 assert_eq!( 1096 build("☃").states, 1097 &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),] 1098 ); 1099 1100 // Check that non-UTF-8 literals work. 1101 let hir = ParserBuilder::new() 1102 .allow_invalid_utf8(true) 1103 .build() 1104 .parse(r"(?-u)\xFF") 1105 .unwrap(); 1106 let nfa = Builder::new() 1107 .anchored(true) 1108 .allow_invalid_utf8(true) 1109 .build(&hir) 1110 .unwrap(); 1111 assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]); 1112 } 1113 1114 #[test] compile_class()1115 fn compile_class() { 1116 assert_eq!( 1117 build(r"[a-z]").states, 1118 &[s_range(b'a', b'z', 1), s_match(),] 1119 ); 1120 assert_eq!( 1121 build(r"[x-za-c]").states, 1122 &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()] 1123 ); 1124 assert_eq!( 1125 build(r"[\u03B1-\u03B4]").states, 1126 &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()] 1127 ); 1128 assert_eq!( 1129 build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states, 1130 &[ 1131 s_range(0xB1, 0xB4, 5), 1132 s_range(0x99, 0x9E, 5), 1133 s_byte(0xA4, 1), 1134 s_byte(0x9F, 2), 1135 s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), 1136 s_match(), 1137 ] 1138 ); 1139 assert_eq!( 1140 build(r"[a-z☃]").states, 1141 &[ 1142 s_byte(0x83, 3), 1143 s_byte(0x98, 0), 1144 s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]), 1145 s_match(), 1146 ] 1147 ); 1148 } 1149 1150 #[test] compile_repetition()1151 fn compile_repetition() { 1152 assert_eq!( 1153 build(r"a?").states, 1154 &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),] 1155 ); 1156 assert_eq!( 1157 build(r"a??").states, 1158 &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),] 1159 ); 1160 } 1161 1162 #[test] compile_group()1163 fn compile_group() { 1164 assert_eq!( 1165 build(r"ab+").states, 1166 &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),] 1167 ); 1168 assert_eq!( 1169 build(r"(ab)").states, 1170 &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] 1171 ); 1172 assert_eq!( 1173 build(r"(ab)+").states, 1174 &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),] 1175 ); 1176 } 1177 1178 #[test] compile_alternation()1179 fn compile_alternation() { 1180 assert_eq!( 1181 build(r"a|b").states, 1182 &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),] 1183 ); 1184 assert_eq!( 1185 build(r"|b").states, 1186 &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),] 1187 ); 1188 assert_eq!( 1189 build(r"a|").states, 1190 &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),] 1191 ); 1192 } 1193 } 1194