1 use std::collections::HashMap;
2 use std::mem;
3 use std::rc::Rc;
4 
5 use dense;
6 use error::Result;
7 use nfa::{self, NFA};
8 use sparse_set::SparseSet;
9 use state_id::{StateID, dead_id};
10 
11 type DFARepr<S> = dense::Repr<Vec<S>, S>;
12 
13 /// A determinizer converts an NFA to a DFA.
14 ///
15 /// This determinizer follows the typical powerset construction, where each
16 /// DFA state is comprised of one or more NFA states. In the worst case, there
17 /// is one DFA state for every possible combination of NFA states. In practice,
18 /// this only happens in certain conditions, typically when there are bounded
19 /// repetitions.
20 ///
21 /// The type variable `S` refers to the chosen state identifier representation
22 /// used for the DFA.
23 ///
24 /// The lifetime variable `'a` refers to the lifetime of the NFA being
25 /// converted to a DFA.
26 #[derive(Debug)]
27 pub(crate) struct Determinizer<'a, S: StateID> {
28     /// The NFA we're converting into a DFA.
29     nfa: &'a NFA,
30     /// The DFA we're building.
31     dfa: DFARepr<S>,
32     /// Each DFA state being built is defined as an *ordered* set of NFA
33     /// states, along with a flag indicating whether the state is a match
34     /// state or not.
35     ///
36     /// This is never empty. The first state is always a dummy state such that
37     /// a state id == 0 corresponds to a dead state.
38     builder_states: Vec<Rc<State>>,
39     /// A cache of DFA states that already exist and can be easily looked up
40     /// via ordered sets of NFA states.
41     cache: HashMap<Rc<State>, S>,
42     /// Scratch space for a stack of NFA states to visit, for depth first
43     /// visiting without recursion.
44     stack: Vec<nfa::StateID>,
45     /// Scratch space for storing an ordered sequence of NFA states, for
46     /// amortizing allocation.
47     scratch_nfa_states: Vec<nfa::StateID>,
48     /// Whether to build a DFA that finds the longest possible match.
49     longest_match: bool,
50 }
51 
52 /// An intermediate representation for a DFA state during determinization.
53 #[derive(Debug, Eq, Hash, PartialEq)]
54 struct State {
55     /// Whether this state is a match state or not.
56     is_match: bool,
57     /// An ordered sequence of NFA states that make up this DFA state.
58     nfa_states: Vec<nfa::StateID>,
59 }
60 
61 impl<'a, S: StateID> Determinizer<'a, S> {
62     /// Create a new determinizer for converting the given NFA to a DFA.
new(nfa: &'a NFA) -> Determinizer<'a, S>63     pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
64         let dead = Rc::new(State::dead());
65         let mut cache = HashMap::default();
66         cache.insert(dead.clone(), dead_id());
67 
68         Determinizer {
69             nfa: nfa,
70             dfa: DFARepr::empty().anchored(nfa.is_anchored()),
71             builder_states: vec![dead],
72             cache: cache,
73             stack: vec![],
74             scratch_nfa_states: vec![],
75             longest_match: false,
76         }
77     }
78 
79     /// Instruct the determinizer to use equivalence classes as the transition
80     /// alphabet instead of all possible byte values.
with_byte_classes(mut self) -> Determinizer<'a, S>81     pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
82         let byte_classes = self.nfa.byte_classes().clone();
83         self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
84             .anchored(self.nfa.is_anchored());
85         self
86     }
87 
88     /// Instruct the determinizer to build a DFA that recognizes the longest
89     /// possible match instead of the leftmost first match. This is useful when
90     /// constructing reverse DFAs for finding the start of a match.
longest_match(mut self, yes: bool) -> Determinizer<'a, S>91     pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
92         self.longest_match = yes;
93         self
94     }
95 
96     /// Build the DFA. If there was a problem constructing the DFA (e.g., if
97     /// the chosen state identifier representation is too small), then an error
98     /// is returned.
build(mut self) -> Result<DFARepr<S>>99     pub fn build(mut self) -> Result<DFARepr<S>> {
100         let representative_bytes: Vec<u8> =
101             self.dfa.byte_classes().representatives().collect();
102         let mut sparse = self.new_sparse_set();
103         let mut uncompiled = vec![self.add_start(&mut sparse)?];
104         while let Some(dfa_id) = uncompiled.pop() {
105             for &b in &representative_bytes {
106                 let (next_dfa_id, is_new) = self.cached_state(
107                     dfa_id, b, &mut sparse,
108                 )?;
109                 self.dfa.add_transition(dfa_id, b, next_dfa_id);
110                 if is_new {
111                     uncompiled.push(next_dfa_id);
112                 }
113             }
114         }
115 
116         // At this point, we shuffle the matching states in the final DFA to
117         // the beginning. This permits a DFA's match loop to detect a match
118         // condition by merely inspecting the current state's identifier, and
119         // avoids the need for any additional auxiliary storage.
120         let is_match: Vec<bool> = self
121             .builder_states
122             .iter()
123             .map(|s| s.is_match)
124             .collect();
125         self.dfa.shuffle_match_states(&is_match);
126         Ok(self.dfa)
127     }
128 
129     /// Return the identifier for the next DFA state given an existing DFA
130     /// state and an input byte. If the next DFA state already exists, then
131     /// return its identifier from the cache. Otherwise, build the state, cache
132     /// it and return its identifier.
133     ///
134     /// The given sparse set is used for scratch space. It must have a capacity
135     /// equivalent to the total number of NFA states, but its contents are
136     /// otherwise unspecified.
137     ///
138     /// This routine returns a boolean indicating whether a new state was
139     /// built. If a new state is built, then the caller needs to add it to its
140     /// frontier of uncompiled DFA states to compute transitions for.
cached_state( &mut self, dfa_id: S, b: u8, sparse: &mut SparseSet, ) -> Result<(S, bool)>141     fn cached_state(
142         &mut self,
143         dfa_id: S,
144         b: u8,
145         sparse: &mut SparseSet,
146     ) -> Result<(S, bool)> {
147         sparse.clear();
148         // Compute the set of all reachable NFA states, including epsilons.
149         self.next(dfa_id, b, sparse);
150         // Build a candidate state and check if it has already been built.
151         let state = self.new_state(sparse);
152         if let Some(&cached_id) = self.cache.get(&state) {
153             // Since we have a cached state, put the constructed state's
154             // memory back into our scratch space, so that it can be reused.
155             mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
156             return Ok((cached_id, false));
157         }
158         // Nothing was in the cache, so add this state to the cache.
159         self.add_state(state).map(|s| (s, true))
160     }
161 
162     /// Compute the set of all eachable NFA states, including the full epsilon
163     /// closure, from a DFA state for a single byte of input.
next( &mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet, )164     fn next(
165         &mut self,
166         dfa_id: S,
167         b: u8,
168         next_nfa_states: &mut SparseSet,
169     ) {
170         next_nfa_states.clear();
171         for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
172             let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
173             match *self.nfa.state(nfa_id) {
174                 nfa::State::Union { .. } | nfa::State::Match => {}
175                 nfa::State::Range { start, end, next } => {
176                     if start <= b && b <= end {
177                         self.epsilon_closure(next, next_nfa_states);
178                     }
179                 }
180             }
181         }
182     }
183 
184     /// Compute the epsilon closure for the given NFA state.
epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet)185     fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
186         if !self.nfa.state(start).is_epsilon() {
187             set.insert(start);
188             return;
189         }
190 
191         self.stack.push(start);
192         while let Some(mut id) = self.stack.pop() {
193             loop {
194                 if set.contains(id) {
195                     break;
196                 }
197                 set.insert(id);
198                 match *self.nfa.state(id) {
199                     nfa::State::Range { .. } | nfa::State::Match => break,
200                     nfa::State::Union { ref alternates } => {
201                         id = match alternates.get(0) {
202                             None => break,
203                             Some(&id) => id,
204                         };
205                         self.stack.extend(alternates[1..].iter().rev());
206                     }
207                 }
208             }
209         }
210     }
211 
212     /// Compute the initial DFA state and return its identifier.
213     ///
214     /// The sparse set given is used for scratch space, and must have capacity
215     /// equal to the total number of NFA states. Its contents are unspecified.
add_start(&mut self, sparse: &mut SparseSet) -> Result<S>216     fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
217         sparse.clear();
218         self.epsilon_closure(self.nfa.start(), sparse);
219         let state = self.new_state(&sparse);
220         let id = self.add_state(state)?;
221         self.dfa.set_start_state(id);
222         Ok(id)
223     }
224 
225     /// Add the given state to the DFA and make it available in the cache.
226     ///
227     /// The state initially has no transitions. That is, it transitions to the
228     /// dead state for all possible inputs.
add_state(&mut self, state: State) -> Result<S>229     fn add_state(&mut self, state: State) -> Result<S> {
230         let id = self.dfa.add_empty_state()?;
231         let rstate = Rc::new(state);
232         self.builder_states.push(rstate.clone());
233         self.cache.insert(rstate, id);
234         Ok(id)
235     }
236 
237     /// Convert the given set of ordered NFA states to a DFA state.
new_state(&mut self, set: &SparseSet) -> State238     fn new_state(&mut self, set: &SparseSet) -> State {
239         let mut state = State {
240             is_match: false,
241             nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
242         };
243         state.nfa_states.clear();
244 
245         for &id in set {
246             match *self.nfa.state(id) {
247                 nfa::State::Range { .. } => {
248                     state.nfa_states.push(id);
249                 }
250                 nfa::State::Match => {
251                     state.is_match = true;
252                     if !self.longest_match {
253                         break;
254                     }
255                 }
256                 nfa::State::Union { .. } => {}
257             }
258         }
259         state
260     }
261 
262     /// Create a new sparse set with enough capacity to hold all NFA states.
new_sparse_set(&self) -> SparseSet263     fn new_sparse_set(&self) -> SparseSet {
264         SparseSet::new(self.nfa.len())
265     }
266 }
267 
268 impl State {
269     /// Create a new empty dead state.
dead() -> State270     fn dead() -> State {
271         State { nfa_states: vec![], is_match: false }
272     }
273 }
274