1 // This module provides an NFA compiler using Thompson's construction
2 // algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
3 // graph as output. The NFA graph is structured in a way that permits it to be
4 // executed by a virtual machine and also used to efficiently build a DFA.
5 //
6 // The compiler deals with a slightly expanded set of NFA states that notably
7 // includes an empty node that has exactly one epsilon transition to the next
8 // state. In other words, it's a "goto" instruction if one views Thompson's NFA
9 // as a set of bytecode instructions. These goto instructions are removed in
10 // a subsequent phase before returning the NFA to the caller. The purpose of
11 // these empty nodes is that they make the construction algorithm substantially
12 // simpler to implement. We remove them before returning to the caller because
13 // they can represent substantial overhead when traversing the NFA graph
14 // (either while searching using the NFA directly or while building a DFA).
15 //
16 // In the future, it would be nice to provide a Glushkov compiler as well,
17 // as it would work well as a bit-parallel NFA for smaller regexes. But
18 // the Thompson construction is one I'm more familiar with and seems more
19 // straight-forward to deal with when it comes to large Unicode character
20 // classes.
21 //
22 // Internally, the compiler uses interior mutability to improve composition
23 // in the face of the borrow checker. In particular, we'd really like to be
24 // able to write things like this:
25 //
26 //     self.c_concat(exprs.iter().map(|e| self.c(e)))
27 //
28 // Which elegantly uses iterators to build up a sequence of compiled regex
29 // sub-expressions and then hands it off to the concatenating compiler
30 // routine. Without interior mutability, the borrow checker won't let us
31 // borrow `self` mutably both inside and outside the closure at the same
32 // time.
33 
34 use std::cell::RefCell;
35 use std::mem;
36 
37 use regex_syntax::hir::{self, Hir, HirKind};
38 use regex_syntax::utf8::{Utf8Range, Utf8Sequences};
39 
40 use classes::ByteClassSet;
41 use error::{Error, Result};
42 use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap};
43 use nfa::range_trie::RangeTrie;
44 use nfa::{State, StateID, Transition, NFA};
45 
46 /// Config knobs for the NFA compiler. See the builder's methods for more
47 /// docs on each one.
48 #[derive(Clone, Copy, Debug)]
49 struct Config {
50     anchored: bool,
51     allow_invalid_utf8: bool,
52     reverse: bool,
53     shrink: bool,
54 }
55 
56 impl Default for Config {
default() -> Config57     fn default() -> Config {
58         Config {
59             anchored: false,
60             allow_invalid_utf8: false,
61             reverse: false,
62             shrink: true,
63         }
64     }
65 }
66 
67 /// A builder for compiling an NFA.
68 #[derive(Clone, Debug)]
69 pub struct Builder {
70     config: Config,
71 }
72 
73 impl Builder {
74     /// Create a new NFA builder with its default configuration.
new() -> Builder75     pub fn new() -> Builder {
76         Builder { config: Config::default() }
77     }
78 
79     /// Compile the given high level intermediate representation of a regular
80     /// expression into an NFA.
81     ///
82     /// If there was a problem building the NFA, then an error is returned.
83     /// For example, if the regex uses unsupported features (such as zero-width
84     /// assertions), then an error is returned.
build(&self, expr: &Hir) -> Result<NFA>85     pub fn build(&self, expr: &Hir) -> Result<NFA> {
86         let mut nfa = NFA::always_match();
87         self.build_with(&mut Compiler::new(), &mut nfa, expr)?;
88         Ok(nfa)
89     }
90 
91     /// Compile the given high level intermediate representation of a regular
92     /// expression into the NFA given using the given compiler. Callers may
93     /// prefer this over `build` if they would like to reuse allocations while
94     /// compiling many regular expressions.
95     ///
96     /// On success, the given NFA is completely overwritten with the NFA
97     /// produced by the compiler.
98     ///
99     /// If there was a problem building the NFA, then an error is returned. For
100     /// example, if the regex uses unsupported features (such as zero-width
101     /// assertions), then an error is returned. When an error is returned,
102     /// the contents of `nfa` are unspecified and should not be relied upon.
103     /// However, it can still be reused in subsequent calls to this method.
build_with( &self, compiler: &mut Compiler, nfa: &mut NFA, expr: &Hir, ) -> Result<()>104     pub fn build_with(
105         &self,
106         compiler: &mut Compiler,
107         nfa: &mut NFA,
108         expr: &Hir,
109     ) -> Result<()> {
110         compiler.clear();
111         compiler.configure(self.config);
112         compiler.compile(nfa, expr)
113     }
114 
115     /// Set whether matching must be anchored at the beginning of the input.
116     ///
117     /// When enabled, a match must begin at the start of the input. When
118     /// disabled, the NFA will act as if the pattern started with a `.*?`,
119     /// which enables a match to appear anywhere.
120     ///
121     /// By default this is disabled.
anchored(&mut self, yes: bool) -> &mut Builder122     pub fn anchored(&mut self, yes: bool) -> &mut Builder {
123         self.config.anchored = yes;
124         self
125     }
126 
127     /// When enabled, the builder will permit the construction of an NFA that
128     /// may match invalid UTF-8.
129     ///
130     /// When disabled (the default), the builder is guaranteed to produce a
131     /// regex that will only ever match valid UTF-8 (otherwise, the builder
132     /// will return an error).
allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder133     pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
134         self.config.allow_invalid_utf8 = yes;
135         self
136     }
137 
138     /// Reverse the NFA.
139     ///
140     /// A NFA reversal is performed by reversing all of the concatenated
141     /// sub-expressions in the original pattern, recursively. The resulting
142     /// NFA can be used to match the pattern starting from the end of a string
143     /// instead of the beginning of a string.
144     ///
145     /// Reversing the NFA is useful for building a reverse DFA, which is most
146     /// useful for finding the start of a match.
reverse(&mut self, yes: bool) -> &mut Builder147     pub fn reverse(&mut self, yes: bool) -> &mut Builder {
148         self.config.reverse = yes;
149         self
150     }
151 
152     /// Apply best effort heuristics to shrink the NFA at the expense of more
153     /// time/memory.
154     ///
155     /// This is enabled by default. Generally speaking, if one is using an NFA
156     /// to compile DFA, then the extra time used to shrink the NFA will be
157     /// more than made up for during DFA construction (potentially by a lot).
158     /// In other words, enabling this can substantially decrease the overall
159     /// amount of time it takes to build a DFA.
160     ///
161     /// The only reason to disable this if you want to compile an NFA and start
162     /// using it as quickly as possible without needing to build a DFA.
shrink(&mut self, yes: bool) -> &mut Builder163     pub fn shrink(&mut self, yes: bool) -> &mut Builder {
164         self.config.shrink = yes;
165         self
166     }
167 }
168 
169 /// A compiler that converts a regex abstract syntax to an NFA via Thompson's
170 /// construction. Namely, this compiler permits epsilon transitions between
171 /// states.
172 ///
173 /// Users of this crate cannot use a compiler directly. Instead, all one can
174 /// do is create one and use it via the
175 /// [`Builder::build_with`](struct.Builder.html#method.build_with)
176 /// method. This permits callers to reuse compilers in order to amortize
177 /// allocations.
178 #[derive(Clone, Debug)]
179 pub struct Compiler {
180     /// The set of compiled NFA states. Once a state is compiled, it is
181     /// assigned a state ID equivalent to its index in this list. Subsequent
182     /// compilation can modify previous states by adding new transitions.
183     states: RefCell<Vec<CState>>,
184     /// The configuration from the builder.
185     config: Config,
186     /// State used for compiling character classes to UTF-8 byte automata.
187     /// State is not retained between character class compilations. This just
188     /// serves to amortize allocation to the extent possible.
189     utf8_state: RefCell<Utf8State>,
190     /// State used for arranging character classes in reverse into a trie.
191     trie_state: RefCell<RangeTrie>,
192     /// State used for caching common suffixes when compiling reverse UTF-8
193     /// automata (for Unicode character classes).
194     utf8_suffix: RefCell<Utf8SuffixMap>,
195     /// A map used to re-map state IDs when translating the compiler's internal
196     /// NFA state representation to the external NFA representation.
197     remap: RefCell<Vec<StateID>>,
198     /// A set of compiler internal state IDs that correspond to states that are
199     /// exclusively epsilon transitions, i.e., goto instructions, combined with
200     /// the state that they point to. This is used to record said states while
201     /// transforming the compiler's internal NFA representation to the external
202     /// form.
203     empties: RefCell<Vec<(StateID, StateID)>>,
204 }
205 
206 /// A compiler intermediate state representation for an NFA that is only used
207 /// during compilation. Once compilation is done, `CState`s are converted to
208 /// `State`s, which have a much simpler representation.
209 #[derive(Clone, Debug, Eq, PartialEq)]
210 enum CState {
211     /// An empty state whose only purpose is to forward the automaton to
212     /// another state via en epsilon transition. These are useful during
213     /// compilation but are otherwise removed at the end.
214     Empty { next: StateID },
215     /// A state that only transitions to `next` if the current input byte is
216     /// in the range `[start, end]` (inclusive on both ends).
217     Range { range: Transition },
218     /// A state with possibly many transitions, represented in a sparse
219     /// fashion. Transitions are ordered lexicographically by input range.
220     /// As such, this may only be used when every transition has equal
221     /// priority. (In practice, this is only used for encoding large UTF-8
222     /// automata.)
223     Sparse { ranges: Vec<Transition> },
224     /// An alternation such that there exists an epsilon transition to all
225     /// states in `alternates`, where matches found via earlier transitions
226     /// are preferred over later transitions.
227     Union { alternates: Vec<StateID> },
228     /// An alternation such that there exists an epsilon transition to all
229     /// states in `alternates`, where matches found via later transitions
230     /// are preferred over earlier transitions.
231     ///
232     /// This "reverse" state exists for convenience during compilation that
233     /// permits easy construction of non-greedy combinations of NFA states.
234     /// At the end of compilation, Union and UnionReverse states are merged
235     /// into one Union type of state, where the latter has its epsilon
236     /// transitions reversed to reflect the priority inversion.
237     UnionReverse { alternates: Vec<StateID> },
238     /// A match state. There is exactly one such occurrence of this state in
239     /// an NFA.
240     Match,
241 }
242 
243 /// A value that represents the result of compiling a sub-expression of a
244 /// regex's HIR. Specifically, this represents a sub-graph of the NFA that
245 /// has an initial state at `start` and a final state at `end`.
246 #[derive(Clone, Copy, Debug)]
247 pub struct ThompsonRef {
248     start: StateID,
249     end: StateID,
250 }
251 
252 impl Compiler {
253     /// Create a new compiler.
new() -> Compiler254     pub fn new() -> Compiler {
255         Compiler {
256             states: RefCell::new(vec![]),
257             config: Config::default(),
258             utf8_state: RefCell::new(Utf8State::new()),
259             trie_state: RefCell::new(RangeTrie::new()),
260             utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
261             remap: RefCell::new(vec![]),
262             empties: RefCell::new(vec![]),
263         }
264     }
265 
266     /// Clear any memory used by this compiler such that it is ready to compile
267     /// a new regex.
268     ///
269     /// It is preferrable to reuse a compiler if possible in order to reuse
270     /// allocations.
clear(&self)271     fn clear(&self) {
272         self.states.borrow_mut().clear();
273         // We don't need to clear anything else since they are cleared on
274         // their own and only when they are used.
275     }
276 
277     /// Configure this compiler from the builder's knobs.
278     ///
279     /// The compiler is always reconfigured by the builder before using it to
280     /// build an NFA.
configure(&mut self, config: Config)281     fn configure(&mut self, config: Config) {
282         self.config = config;
283     }
284 
285     /// Convert the current intermediate NFA to its final compiled form.
compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()>286     fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> {
287         nfa.anchored = self.config.anchored;
288 
289         let mut start = self.add_empty();
290         if !nfa.anchored {
291             let compiled = if self.config.allow_invalid_utf8 {
292                 self.c_unanchored_prefix_invalid_utf8()?
293             } else {
294                 self.c_unanchored_prefix_valid_utf8()?
295             };
296             self.patch(start, compiled.start);
297             start = compiled.end;
298         }
299         let compiled = self.c(&expr)?;
300         let match_id = self.add_match();
301         self.patch(start, compiled.start);
302         self.patch(compiled.end, match_id);
303         self.finish(nfa);
304         Ok(())
305     }
306 
307     /// Finishes the compilation process and populates the provide NFA with
308     /// the final graph.
finish(&self, nfa: &mut NFA)309     fn finish(&self, nfa: &mut NFA) {
310         let mut bstates = self.states.borrow_mut();
311         let mut remap = self.remap.borrow_mut();
312         remap.resize(bstates.len(), 0);
313         let mut empties = self.empties.borrow_mut();
314         empties.clear();
315 
316         // We don't reuse allocations here becuase this is what we're
317         // returning.
318         nfa.states.clear();
319         let mut byteset = ByteClassSet::new();
320 
321         // The idea here is to convert our intermediate states to their final
322         // form. The only real complexity here is the process of converting
323         // transitions, which are expressed in terms of state IDs. The new
324         // set of states will be smaller because of partial epsilon removal,
325         // so the state IDs will not be the same.
326         for (id, bstate) in bstates.iter_mut().enumerate() {
327             match *bstate {
328                 CState::Empty { next } => {
329                     // Since we're removing empty states, we need to handle
330                     // them later since we don't yet know which new state this
331                     // empty state will be mapped to.
332                     empties.push((id, next));
333                 }
334                 CState::Range { ref range } => {
335                     remap[id] = nfa.states.len();
336                     byteset.set_range(range.start, range.end);
337                     nfa.states.push(State::Range { range: range.clone() });
338                 }
339                 CState::Sparse { ref mut ranges } => {
340                     remap[id] = nfa.states.len();
341 
342                     let ranges = mem::replace(ranges, vec![]);
343                     for r in &ranges {
344                         byteset.set_range(r.start, r.end);
345                     }
346                     nfa.states.push(State::Sparse {
347                         ranges: ranges.into_boxed_slice(),
348                     });
349                 }
350                 CState::Union { ref mut alternates } => {
351                     remap[id] = nfa.states.len();
352 
353                     let alternates = mem::replace(alternates, vec![]);
354                     nfa.states.push(State::Union {
355                         alternates: alternates.into_boxed_slice(),
356                     });
357                 }
358                 CState::UnionReverse { ref mut alternates } => {
359                     remap[id] = nfa.states.len();
360 
361                     let mut alternates = mem::replace(alternates, vec![]);
362                     alternates.reverse();
363                     nfa.states.push(State::Union {
364                         alternates: alternates.into_boxed_slice(),
365                     });
366                 }
367                 CState::Match => {
368                     remap[id] = nfa.states.len();
369                     nfa.states.push(State::Match);
370                 }
371             }
372         }
373         for &(empty_id, mut empty_next) in empties.iter() {
374             // empty states can point to other empty states, forming a chain.
375             // So we must follow the chain until the end, which must end at
376             // a non-empty state, and therefore, a state that is correctly
377             // remapped. We are guaranteed to terminate because our compiler
378             // never builds a loop among empty states.
379             while let CState::Empty { next } = bstates[empty_next] {
380                 empty_next = next;
381             }
382             remap[empty_id] = remap[empty_next];
383         }
384         for state in &mut nfa.states {
385             state.remap(&remap);
386         }
387         // The compiler always begins the NFA at the first state.
388         nfa.start = remap[0];
389         nfa.byte_classes = byteset.byte_classes();
390     }
391 
c(&self, expr: &Hir) -> Result<ThompsonRef>392     fn c(&self, expr: &Hir) -> Result<ThompsonRef> {
393         match *expr.kind() {
394             HirKind::Empty => {
395                 let id = self.add_empty();
396                 Ok(ThompsonRef { start: id, end: id })
397             }
398             HirKind::Literal(hir::Literal::Unicode(ch)) => {
399                 let mut buf = [0; 4];
400                 let it = ch
401                     .encode_utf8(&mut buf)
402                     .as_bytes()
403                     .iter()
404                     .map(|&b| Ok(self.c_range(b, b)));
405                 self.c_concat(it)
406             }
407             HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)),
408             HirKind::Class(hir::Class::Bytes(ref cls)) => {
409                 self.c_byte_class(cls)
410             }
411             HirKind::Class(hir::Class::Unicode(ref cls)) => {
412                 self.c_unicode_class(cls)
413             }
414             HirKind::Repetition(ref rep) => self.c_repetition(rep),
415             HirKind::Group(ref group) => self.c(&*group.hir),
416             HirKind::Concat(ref exprs) => {
417                 self.c_concat(exprs.iter().map(|e| self.c(e)))
418             }
419             HirKind::Alternation(ref exprs) => {
420                 self.c_alternation(exprs.iter().map(|e| self.c(e)))
421             }
422             HirKind::Anchor(_) => Err(Error::unsupported_anchor()),
423             HirKind::WordBoundary(_) => Err(Error::unsupported_word()),
424         }
425     }
426 
c_concat<I>(&self, mut it: I) -> Result<ThompsonRef> where I: DoubleEndedIterator<Item = Result<ThompsonRef>>,427     fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef>
428     where
429         I: DoubleEndedIterator<Item = Result<ThompsonRef>>,
430     {
431         let first =
432             if self.config.reverse { it.next_back() } else { it.next() };
433         let ThompsonRef { start, mut end } = match first {
434             Some(result) => result?,
435             None => return Ok(self.c_empty()),
436         };
437         loop {
438             let next =
439                 if self.config.reverse { it.next_back() } else { it.next() };
440             let compiled = match next {
441                 Some(result) => result?,
442                 None => break,
443             };
444             self.patch(end, compiled.start);
445             end = compiled.end;
446         }
447         Ok(ThompsonRef { start, end })
448     }
449 
c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef> where I: Iterator<Item = Result<ThompsonRef>>,450     fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef>
451     where
452         I: Iterator<Item = Result<ThompsonRef>>,
453     {
454         let first = it.next().expect("alternations must be non-empty")?;
455         let second = match it.next() {
456             None => return Ok(first),
457             Some(result) => result?,
458         };
459 
460         let union = self.add_union();
461         let end = self.add_empty();
462         self.patch(union, first.start);
463         self.patch(first.end, end);
464         self.patch(union, second.start);
465         self.patch(second.end, end);
466         for result in it {
467             let compiled = result?;
468             self.patch(union, compiled.start);
469             self.patch(compiled.end, end);
470         }
471         Ok(ThompsonRef { start: union, end })
472     }
473 
c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef>474     fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> {
475         match rep.kind {
476             hir::RepetitionKind::ZeroOrOne => {
477                 self.c_zero_or_one(&rep.hir, rep.greedy)
478             }
479             hir::RepetitionKind::ZeroOrMore => {
480                 self.c_at_least(&rep.hir, rep.greedy, 0)
481             }
482             hir::RepetitionKind::OneOrMore => {
483                 self.c_at_least(&rep.hir, rep.greedy, 1)
484             }
485             hir::RepetitionKind::Range(ref rng) => match *rng {
486                 hir::RepetitionRange::Exactly(count) => {
487                     self.c_exactly(&rep.hir, count)
488                 }
489                 hir::RepetitionRange::AtLeast(m) => {
490                     self.c_at_least(&rep.hir, rep.greedy, m)
491                 }
492                 hir::RepetitionRange::Bounded(min, max) => {
493                     self.c_bounded(&rep.hir, rep.greedy, min, max)
494                 }
495             },
496         }
497     }
498 
c_bounded( &self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> Result<ThompsonRef>499     fn c_bounded(
500         &self,
501         expr: &Hir,
502         greedy: bool,
503         min: u32,
504         max: u32,
505     ) -> Result<ThompsonRef> {
506         let prefix = self.c_exactly(expr, min)?;
507         if min == max {
508             return Ok(prefix);
509         }
510 
511         // It is tempting here to compile the rest here as a concatenation
512         // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
513         // were `aaa?a?a?`. The problem here is that it leads to this program:
514         //
515         //     >000000: 61 => 01
516         //     000001: 61 => 02
517         //     000002: alt(03, 04)
518         //     000003: 61 => 04
519         //     000004: alt(05, 06)
520         //     000005: 61 => 06
521         //     000006: alt(07, 08)
522         //     000007: 61 => 08
523         //     000008: MATCH
524         //
525         // And effectively, once you hit state 2, the epsilon closure will
526         // include states 3, 5, 5, 6, 7 and 8, which is quite a bit. It is
527         // better to instead compile it like so:
528         //
529         //     >000000: 61 => 01
530         //      000001: 61 => 02
531         //      000002: alt(03, 08)
532         //      000003: 61 => 04
533         //      000004: alt(05, 08)
534         //      000005: 61 => 06
535         //      000006: alt(07, 08)
536         //      000007: 61 => 08
537         //      000008: MATCH
538         //
539         // So that the epsilon closure of state 2 is now just 3 and 8.
540         let empty = self.add_empty();
541         let mut prev_end = prefix.end;
542         for _ in min..max {
543             let union = if greedy {
544                 self.add_union()
545             } else {
546                 self.add_reverse_union()
547             };
548             let compiled = self.c(expr)?;
549             self.patch(prev_end, union);
550             self.patch(union, compiled.start);
551             self.patch(union, empty);
552             prev_end = compiled.end;
553         }
554         self.patch(prev_end, empty);
555         Ok(ThompsonRef { start: prefix.start, end: empty })
556     }
557 
c_at_least( &self, expr: &Hir, greedy: bool, n: u32, ) -> Result<ThompsonRef>558     fn c_at_least(
559         &self,
560         expr: &Hir,
561         greedy: bool,
562         n: u32,
563     ) -> Result<ThompsonRef> {
564         if n == 0 {
565             let union = if greedy {
566                 self.add_union()
567             } else {
568                 self.add_reverse_union()
569             };
570             let compiled = self.c(expr)?;
571             self.patch(union, compiled.start);
572             self.patch(compiled.end, union);
573             Ok(ThompsonRef { start: union, end: union })
574         } else if n == 1 {
575             let compiled = self.c(expr)?;
576             let union = if greedy {
577                 self.add_union()
578             } else {
579                 self.add_reverse_union()
580             };
581             self.patch(compiled.end, union);
582             self.patch(union, compiled.start);
583             Ok(ThompsonRef { start: compiled.start, end: union })
584         } else {
585             let prefix = self.c_exactly(expr, n - 1)?;
586             let last = self.c(expr)?;
587             let union = if greedy {
588                 self.add_union()
589             } else {
590                 self.add_reverse_union()
591             };
592             self.patch(prefix.end, last.start);
593             self.patch(last.end, union);
594             self.patch(union, last.start);
595             Ok(ThompsonRef { start: prefix.start, end: union })
596         }
597     }
598 
c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef>599     fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> {
600         let union =
601             if greedy { self.add_union() } else { self.add_reverse_union() };
602         let compiled = self.c(expr)?;
603         let empty = self.add_empty();
604         self.patch(union, compiled.start);
605         self.patch(union, empty);
606         self.patch(compiled.end, empty);
607         Ok(ThompsonRef { start: union, end: empty })
608     }
609 
c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef>610     fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> {
611         let it = (0..n).map(|_| self.c(expr));
612         self.c_concat(it)
613     }
614 
c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef>615     fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> {
616         let end = self.add_empty();
617         let mut trans = Vec::with_capacity(cls.ranges().len());
618         for r in cls.iter() {
619             trans.push(Transition {
620                 start: r.start(),
621                 end: r.end(),
622                 next: end,
623             });
624         }
625         Ok(ThompsonRef { start: self.add_sparse(trans), end })
626     }
627 
c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef>628     fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> {
629         // If all we have are ASCII ranges wrapped in a Unicode package, then
630         // there is zero reason to bring out the big guns. We can fit all ASCII
631         // ranges within a single sparse transition.
632         if cls.is_all_ascii() {
633             let end = self.add_empty();
634             let mut trans = Vec::with_capacity(cls.ranges().len());
635             for r in cls.iter() {
636                 assert!(r.start() <= '\x7F');
637                 assert!(r.end() <= '\x7F');
638                 trans.push(Transition {
639                     start: r.start() as u8,
640                     end: r.end() as u8,
641                     next: end,
642                 });
643             }
644             Ok(ThompsonRef { start: self.add_sparse(trans), end })
645         } else if self.config.reverse {
646             if !self.config.shrink {
647                 // When we don't want to spend the extra time shrinking, we
648                 // compile the UTF-8 automaton in reverse using something like
649                 // the "naive" approach, but will attempt to re-use common
650                 // suffixes.
651                 self.c_unicode_class_reverse_with_suffix(cls)
652             } else {
653                 // When we want to shrink our NFA for reverse UTF-8 automata,
654                 // we cannot feed UTF-8 sequences directly to the UTF-8
655                 // compiler, since the UTF-8 compiler requires all sequences
656                 // to be lexicographically sorted. Instead, we organize our
657                 // sequences into a range trie, which can then output our
658                 // sequences in the correct order. Unfortunately, building the
659                 // range trie is fairly expensive (but not nearly as expensive
660                 // as building a DFA). Hence the reason why the 'shrink' option
661                 // exists, so that this path can be toggled off.
662                 let mut trie = self.trie_state.borrow_mut();
663                 trie.clear();
664 
665                 for rng in cls.iter() {
666                     for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
667                         seq.reverse();
668                         trie.insert(seq.as_slice());
669                     }
670                 }
671                 let mut utf8_state = self.utf8_state.borrow_mut();
672                 let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
673                 trie.iter(|seq| {
674                     utf8c.add(&seq);
675                 });
676                 Ok(utf8c.finish())
677             }
678         } else {
679             // In the forward direction, we always shrink our UTF-8 automata
680             // because we can stream it right into the UTF-8 compiler. There
681             // is almost no downside (in either memory or time) to using this
682             // approach.
683             let mut utf8_state = self.utf8_state.borrow_mut();
684             let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
685             for rng in cls.iter() {
686                 for seq in Utf8Sequences::new(rng.start(), rng.end()) {
687                     utf8c.add(seq.as_slice());
688                 }
689             }
690             Ok(utf8c.finish())
691         }
692 
693         // For reference, the code below is the "naive" version of compiling a
694         // UTF-8 automaton. It is deliciously simple (and works for both the
695         // forward and reverse cases), but will unfortunately produce very
696         // large NFAs. When compiling a forward automaton, the size difference
697         // can sometimes be an order of magnitude. For example, the '\w' regex
698         // will generate about ~3000 NFA states using the naive approach below,
699         // but only 283 states when using the approach above. This is because
700         // the approach above actually compiles a *minimal* (or near minimal,
701         // because of the bounded hashmap) UTF-8 automaton.
702         //
703         // The code below is kept as a reference point in order to make it
704         // easier to understand the higher level goal here.
705         /*
706         let it = cls
707             .iter()
708             .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
709             .map(|seq| {
710                 let it = seq
711                     .as_slice()
712                     .iter()
713                     .map(|rng| Ok(self.c_range(rng.start, rng.end)));
714                 self.c_concat(it)
715             });
716         self.c_alternation(it);
717         */
718     }
719 
c_unicode_class_reverse_with_suffix( &self, cls: &hir::ClassUnicode, ) -> Result<ThompsonRef>720     fn c_unicode_class_reverse_with_suffix(
721         &self,
722         cls: &hir::ClassUnicode,
723     ) -> Result<ThompsonRef> {
724         // N.B. It would likely be better to cache common *prefixes* in the
725         // reverse direction, but it's not quite clear how to do that. The
726         // advantage of caching suffixes is that it does give us a win, and
727         // has a very small additional overhead.
728         let mut cache = self.utf8_suffix.borrow_mut();
729         cache.clear();
730 
731         let union = self.add_union();
732         let alt_end = self.add_empty();
733         for urng in cls.iter() {
734             for seq in Utf8Sequences::new(urng.start(), urng.end()) {
735                 let mut end = alt_end;
736                 for brng in seq.as_slice() {
737                     let key = Utf8SuffixKey {
738                         from: end,
739                         start: brng.start,
740                         end: brng.end,
741                     };
742                     let hash = cache.hash(&key);
743                     if let Some(id) = cache.get(&key, hash) {
744                         end = id;
745                         continue;
746                     }
747 
748                     let compiled = self.c_range(brng.start, brng.end);
749                     self.patch(compiled.end, end);
750                     end = compiled.start;
751                     cache.set(key, hash, end);
752                 }
753                 self.patch(union, end);
754             }
755         }
756         Ok(ThompsonRef { start: union, end: alt_end })
757     }
758 
c_range(&self, start: u8, end: u8) -> ThompsonRef759     fn c_range(&self, start: u8, end: u8) -> ThompsonRef {
760         let id = self.add_range(start, end);
761         ThompsonRef { start: id, end: id }
762     }
763 
c_empty(&self) -> ThompsonRef764     fn c_empty(&self) -> ThompsonRef {
765         let id = self.add_empty();
766         ThompsonRef { start: id, end: id }
767     }
768 
c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef>769     fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> {
770         self.c(&Hir::repetition(hir::Repetition {
771             kind: hir::RepetitionKind::ZeroOrMore,
772             greedy: false,
773             hir: Box::new(Hir::any(false)),
774         }))
775     }
776 
c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef>777     fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> {
778         self.c(&Hir::repetition(hir::Repetition {
779             kind: hir::RepetitionKind::ZeroOrMore,
780             greedy: false,
781             hir: Box::new(Hir::any(true)),
782         }))
783     }
784 
patch(&self, from: StateID, to: StateID)785     fn patch(&self, from: StateID, to: StateID) {
786         match self.states.borrow_mut()[from] {
787             CState::Empty { ref mut next } => {
788                 *next = to;
789             }
790             CState::Range { ref mut range } => {
791                 range.next = to;
792             }
793             CState::Sparse { .. } => {
794                 panic!("cannot patch from a sparse NFA state")
795             }
796             CState::Union { ref mut alternates } => {
797                 alternates.push(to);
798             }
799             CState::UnionReverse { ref mut alternates } => {
800                 alternates.push(to);
801             }
802             CState::Match => {}
803         }
804     }
805 
add_empty(&self) -> StateID806     fn add_empty(&self) -> StateID {
807         let id = self.states.borrow().len();
808         self.states.borrow_mut().push(CState::Empty { next: 0 });
809         id
810     }
811 
add_range(&self, start: u8, end: u8) -> StateID812     fn add_range(&self, start: u8, end: u8) -> StateID {
813         let id = self.states.borrow().len();
814         let trans = Transition { start, end, next: 0 };
815         let state = CState::Range { range: trans };
816         self.states.borrow_mut().push(state);
817         id
818     }
819 
add_sparse(&self, ranges: Vec<Transition>) -> StateID820     fn add_sparse(&self, ranges: Vec<Transition>) -> StateID {
821         if ranges.len() == 1 {
822             let id = self.states.borrow().len();
823             let state = CState::Range { range: ranges[0] };
824             self.states.borrow_mut().push(state);
825             return id;
826         }
827         let id = self.states.borrow().len();
828         let state = CState::Sparse { ranges };
829         self.states.borrow_mut().push(state);
830         id
831     }
832 
add_union(&self) -> StateID833     fn add_union(&self) -> StateID {
834         let id = self.states.borrow().len();
835         let state = CState::Union { alternates: vec![] };
836         self.states.borrow_mut().push(state);
837         id
838     }
839 
add_reverse_union(&self) -> StateID840     fn add_reverse_union(&self) -> StateID {
841         let id = self.states.borrow().len();
842         let state = CState::UnionReverse { alternates: vec![] };
843         self.states.borrow_mut().push(state);
844         id
845     }
846 
add_match(&self) -> StateID847     fn add_match(&self) -> StateID {
848         let id = self.states.borrow().len();
849         self.states.borrow_mut().push(CState::Match);
850         id
851     }
852 }
853 
854 #[derive(Debug)]
855 struct Utf8Compiler<'a> {
856     nfac: &'a Compiler,
857     state: &'a mut Utf8State,
858     target: StateID,
859 }
860 
861 #[derive(Clone, Debug)]
862 struct Utf8State {
863     compiled: Utf8BoundedMap,
864     uncompiled: Vec<Utf8Node>,
865 }
866 
867 #[derive(Clone, Debug)]
868 struct Utf8Node {
869     trans: Vec<Transition>,
870     last: Option<Utf8LastTransition>,
871 }
872 
873 #[derive(Clone, Debug)]
874 struct Utf8LastTransition {
875     start: u8,
876     end: u8,
877 }
878 
879 impl Utf8State {
new() -> Utf8State880     fn new() -> Utf8State {
881         Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] }
882     }
883 
clear(&mut self)884     fn clear(&mut self) {
885         self.compiled.clear();
886         self.uncompiled.clear();
887     }
888 }
889 
890 impl<'a> Utf8Compiler<'a> {
new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a>891     fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> {
892         let target = nfac.add_empty();
893         state.clear();
894         let mut utf8c = Utf8Compiler { nfac, state, target };
895         utf8c.add_empty();
896         utf8c
897     }
898 
finish(&mut self) -> ThompsonRef899     fn finish(&mut self) -> ThompsonRef {
900         self.compile_from(0);
901         let node = self.pop_root();
902         let start = self.compile(node);
903         ThompsonRef { start, end: self.target }
904     }
905 
add(&mut self, ranges: &[Utf8Range])906     fn add(&mut self, ranges: &[Utf8Range]) {
907         let prefix_len = ranges
908             .iter()
909             .zip(&self.state.uncompiled)
910             .take_while(|&(range, node)| {
911                 node.last.as_ref().map_or(false, |t| {
912                     (t.start, t.end) == (range.start, range.end)
913                 })
914             })
915             .count();
916         assert!(prefix_len < ranges.len());
917         self.compile_from(prefix_len);
918         self.add_suffix(&ranges[prefix_len..]);
919     }
920 
compile_from(&mut self, from: usize)921     fn compile_from(&mut self, from: usize) {
922         let mut next = self.target;
923         while from + 1 < self.state.uncompiled.len() {
924             let node = self.pop_freeze(next);
925             next = self.compile(node);
926         }
927         self.top_last_freeze(next);
928     }
929 
compile(&mut self, node: Vec<Transition>) -> StateID930     fn compile(&mut self, node: Vec<Transition>) -> StateID {
931         let hash = self.state.compiled.hash(&node);
932         if let Some(id) = self.state.compiled.get(&node, hash) {
933             return id;
934         }
935         let id = self.nfac.add_sparse(node.clone());
936         self.state.compiled.set(node, hash, id);
937         id
938     }
939 
add_suffix(&mut self, ranges: &[Utf8Range])940     fn add_suffix(&mut self, ranges: &[Utf8Range]) {
941         assert!(!ranges.is_empty());
942         let last = self
943             .state
944             .uncompiled
945             .len()
946             .checked_sub(1)
947             .expect("non-empty nodes");
948         assert!(self.state.uncompiled[last].last.is_none());
949         self.state.uncompiled[last].last = Some(Utf8LastTransition {
950             start: ranges[0].start,
951             end: ranges[0].end,
952         });
953         for r in &ranges[1..] {
954             self.state.uncompiled.push(Utf8Node {
955                 trans: vec![],
956                 last: Some(Utf8LastTransition { start: r.start, end: r.end }),
957             });
958         }
959     }
960 
add_empty(&mut self)961     fn add_empty(&mut self) {
962         self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
963     }
964 
pop_freeze(&mut self, next: StateID) -> Vec<Transition>965     fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
966         let mut uncompiled = self.state.uncompiled.pop().unwrap();
967         uncompiled.set_last_transition(next);
968         uncompiled.trans
969     }
970 
pop_root(&mut self) -> Vec<Transition>971     fn pop_root(&mut self) -> Vec<Transition> {
972         assert_eq!(self.state.uncompiled.len(), 1);
973         assert!(self.state.uncompiled[0].last.is_none());
974         self.state.uncompiled.pop().expect("non-empty nodes").trans
975     }
976 
top_last_freeze(&mut self, next: StateID)977     fn top_last_freeze(&mut self, next: StateID) {
978         let last = self
979             .state
980             .uncompiled
981             .len()
982             .checked_sub(1)
983             .expect("non-empty nodes");
984         self.state.uncompiled[last].set_last_transition(next);
985     }
986 }
987 
988 impl Utf8Node {
set_last_transition(&mut self, next: StateID)989     fn set_last_transition(&mut self, next: StateID) {
990         if let Some(last) = self.last.take() {
991             self.trans.push(Transition {
992                 start: last.start,
993                 end: last.end,
994                 next,
995             });
996         }
997     }
998 }
999 
1000 #[cfg(test)]
1001 mod tests {
1002     use regex_syntax::hir::Hir;
1003     use regex_syntax::ParserBuilder;
1004 
1005     use super::{Builder, State, StateID, Transition, NFA};
1006 
parse(pattern: &str) -> Hir1007     fn parse(pattern: &str) -> Hir {
1008         ParserBuilder::new().build().parse(pattern).unwrap()
1009     }
1010 
build(pattern: &str) -> NFA1011     fn build(pattern: &str) -> NFA {
1012         Builder::new().anchored(true).build(&parse(pattern)).unwrap()
1013     }
1014 
s_byte(byte: u8, next: StateID) -> State1015     fn s_byte(byte: u8, next: StateID) -> State {
1016         let trans = Transition { start: byte, end: byte, next };
1017         State::Range { range: trans }
1018     }
1019 
s_range(start: u8, end: u8, next: StateID) -> State1020     fn s_range(start: u8, end: u8, next: StateID) -> State {
1021         let trans = Transition { start, end, next };
1022         State::Range { range: trans }
1023     }
1024 
s_sparse(ranges: &[(u8, u8, StateID)]) -> State1025     fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State {
1026         let ranges = ranges
1027             .iter()
1028             .map(|&(start, end, next)| Transition { start, end, next })
1029             .collect();
1030         State::Sparse { ranges }
1031     }
1032 
s_union(alts: &[StateID]) -> State1033     fn s_union(alts: &[StateID]) -> State {
1034         State::Union { alternates: alts.to_vec().into_boxed_slice() }
1035     }
1036 
s_match() -> State1037     fn s_match() -> State {
1038         State::Match
1039     }
1040 
1041     #[test]
errors()1042     fn errors() {
1043         // unsupported anchors
1044         assert!(Builder::new().build(&parse(r"^")).is_err());
1045         assert!(Builder::new().build(&parse(r"$")).is_err());
1046         assert!(Builder::new().build(&parse(r"\A")).is_err());
1047         assert!(Builder::new().build(&parse(r"\z")).is_err());
1048 
1049         // unsupported word boundaries
1050         assert!(Builder::new().build(&parse(r"\b")).is_err());
1051         assert!(Builder::new().build(&parse(r"\B")).is_err());
1052         assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err());
1053     }
1054 
1055     // Test that building an unanchored NFA has an appropriate `.*?` prefix.
1056     #[test]
compile_unanchored_prefix()1057     fn compile_unanchored_prefix() {
1058         // When the machine can only match valid UTF-8.
1059         let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap();
1060         // There should be many states since the `.` in `.*?` matches any
1061         // Unicode scalar value.
1062         assert_eq!(11, nfa.len());
1063         assert_eq!(nfa.states[10], s_match());
1064         assert_eq!(nfa.states[9], s_byte(b'a', 10));
1065 
1066         // When the machine can match invalid UTF-8.
1067         let nfa = Builder::new()
1068             .anchored(false)
1069             .allow_invalid_utf8(true)
1070             .build(&parse(r"a"))
1071             .unwrap();
1072         assert_eq!(
1073             nfa.states,
1074             &[
1075                 s_union(&[2, 1]),
1076                 s_range(0, 255, 0),
1077                 s_byte(b'a', 3),
1078                 s_match(),
1079             ]
1080         );
1081     }
1082 
1083     #[test]
compile_empty()1084     fn compile_empty() {
1085         assert_eq!(build("").states, &[s_match(),]);
1086     }
1087 
1088     #[test]
compile_literal()1089     fn compile_literal() {
1090         assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]);
1091         assert_eq!(
1092             build("ab").states,
1093             &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
1094         );
1095         assert_eq!(
1096             build("☃").states,
1097             &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),]
1098         );
1099 
1100         // Check that non-UTF-8 literals work.
1101         let hir = ParserBuilder::new()
1102             .allow_invalid_utf8(true)
1103             .build()
1104             .parse(r"(?-u)\xFF")
1105             .unwrap();
1106         let nfa = Builder::new()
1107             .anchored(true)
1108             .allow_invalid_utf8(true)
1109             .build(&hir)
1110             .unwrap();
1111         assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]);
1112     }
1113 
1114     #[test]
compile_class()1115     fn compile_class() {
1116         assert_eq!(
1117             build(r"[a-z]").states,
1118             &[s_range(b'a', b'z', 1), s_match(),]
1119         );
1120         assert_eq!(
1121             build(r"[x-za-c]").states,
1122             &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()]
1123         );
1124         assert_eq!(
1125             build(r"[\u03B1-\u03B4]").states,
1126             &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()]
1127         );
1128         assert_eq!(
1129             build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
1130             &[
1131                 s_range(0xB1, 0xB4, 5),
1132                 s_range(0x99, 0x9E, 5),
1133                 s_byte(0xA4, 1),
1134                 s_byte(0x9F, 2),
1135                 s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
1136                 s_match(),
1137             ]
1138         );
1139         assert_eq!(
1140             build(r"[a-z☃]").states,
1141             &[
1142                 s_byte(0x83, 3),
1143                 s_byte(0x98, 0),
1144                 s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
1145                 s_match(),
1146             ]
1147         );
1148     }
1149 
1150     #[test]
compile_repetition()1151     fn compile_repetition() {
1152         assert_eq!(
1153             build(r"a?").states,
1154             &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),]
1155         );
1156         assert_eq!(
1157             build(r"a??").states,
1158             &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),]
1159         );
1160     }
1161 
1162     #[test]
compile_group()1163     fn compile_group() {
1164         assert_eq!(
1165             build(r"ab+").states,
1166             &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),]
1167         );
1168         assert_eq!(
1169             build(r"(ab)").states,
1170             &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
1171         );
1172         assert_eq!(
1173             build(r"(ab)+").states,
1174             &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),]
1175         );
1176     }
1177 
1178     #[test]
compile_alternation()1179     fn compile_alternation() {
1180         assert_eq!(
1181             build(r"a|b").states,
1182             &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),]
1183         );
1184         assert_eq!(
1185             build(r"|b").states,
1186             &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),]
1187         );
1188         assert_eq!(
1189             build(r"a|").states,
1190             &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),]
1191         );
1192     }
1193 }
1194