1 use std::collections::HashMap; 2 use std::mem; 3 use std::rc::Rc; 4 5 use dense; 6 use error::Result; 7 use nfa::{self, NFA}; 8 use sparse_set::SparseSet; 9 use state_id::{StateID, dead_id}; 10 11 type DFARepr<S> = dense::Repr<Vec<S>, S>; 12 13 /// A determinizer converts an NFA to a DFA. 14 /// 15 /// This determinizer follows the typical powerset construction, where each 16 /// DFA state is comprised of one or more NFA states. In the worst case, there 17 /// is one DFA state for every possible combination of NFA states. In practice, 18 /// this only happens in certain conditions, typically when there are bounded 19 /// repetitions. 20 /// 21 /// The type variable `S` refers to the chosen state identifier representation 22 /// used for the DFA. 23 /// 24 /// The lifetime variable `'a` refers to the lifetime of the NFA being 25 /// converted to a DFA. 26 #[derive(Debug)] 27 pub(crate) struct Determinizer<'a, S: StateID> { 28 /// The NFA we're converting into a DFA. 29 nfa: &'a NFA, 30 /// The DFA we're building. 31 dfa: DFARepr<S>, 32 /// Each DFA state being built is defined as an *ordered* set of NFA 33 /// states, along with a flag indicating whether the state is a match 34 /// state or not. 35 /// 36 /// This is never empty. The first state is always a dummy state such that 37 /// a state id == 0 corresponds to a dead state. 38 builder_states: Vec<Rc<State>>, 39 /// A cache of DFA states that already exist and can be easily looked up 40 /// via ordered sets of NFA states. 41 cache: HashMap<Rc<State>, S>, 42 /// Scratch space for a stack of NFA states to visit, for depth first 43 /// visiting without recursion. 44 stack: Vec<nfa::StateID>, 45 /// Scratch space for storing an ordered sequence of NFA states, for 46 /// amortizing allocation. 47 scratch_nfa_states: Vec<nfa::StateID>, 48 /// Whether to build a DFA that finds the longest possible match. 49 longest_match: bool, 50 } 51 52 /// An intermediate representation for a DFA state during determinization. 53 #[derive(Debug, Eq, Hash, PartialEq)] 54 struct State { 55 /// Whether this state is a match state or not. 56 is_match: bool, 57 /// An ordered sequence of NFA states that make up this DFA state. 58 nfa_states: Vec<nfa::StateID>, 59 } 60 61 impl<'a, S: StateID> Determinizer<'a, S> { 62 /// Create a new determinizer for converting the given NFA to a DFA. new(nfa: &'a NFA) -> Determinizer<'a, S>63 pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> { 64 let dead = Rc::new(State::dead()); 65 let mut cache = HashMap::default(); 66 cache.insert(dead.clone(), dead_id()); 67 68 Determinizer { 69 nfa: nfa, 70 dfa: DFARepr::empty().anchored(nfa.is_anchored()), 71 builder_states: vec![dead], 72 cache: cache, 73 stack: vec![], 74 scratch_nfa_states: vec![], 75 longest_match: false, 76 } 77 } 78 79 /// Instruct the determinizer to use equivalence classes as the transition 80 /// alphabet instead of all possible byte values. with_byte_classes(mut self) -> Determinizer<'a, S>81 pub fn with_byte_classes(mut self) -> Determinizer<'a, S> { 82 let byte_classes = self.nfa.byte_classes().clone(); 83 self.dfa = DFARepr::empty_with_byte_classes(byte_classes) 84 .anchored(self.nfa.is_anchored()); 85 self 86 } 87 88 /// Instruct the determinizer to build a DFA that recognizes the longest 89 /// possible match instead of the leftmost first match. This is useful when 90 /// constructing reverse DFAs for finding the start of a match. longest_match(mut self, yes: bool) -> Determinizer<'a, S>91 pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> { 92 self.longest_match = yes; 93 self 94 } 95 96 /// Build the DFA. If there was a problem constructing the DFA (e.g., if 97 /// the chosen state identifier representation is too small), then an error 98 /// is returned. build(mut self) -> Result<DFARepr<S>>99 pub fn build(mut self) -> Result<DFARepr<S>> { 100 let representative_bytes: Vec<u8> = 101 self.dfa.byte_classes().representatives().collect(); 102 let mut sparse = self.new_sparse_set(); 103 let mut uncompiled = vec![self.add_start(&mut sparse)?]; 104 while let Some(dfa_id) = uncompiled.pop() { 105 for &b in &representative_bytes { 106 let (next_dfa_id, is_new) = self.cached_state( 107 dfa_id, b, &mut sparse, 108 )?; 109 self.dfa.add_transition(dfa_id, b, next_dfa_id); 110 if is_new { 111 uncompiled.push(next_dfa_id); 112 } 113 } 114 } 115 116 // At this point, we shuffle the matching states in the final DFA to 117 // the beginning. This permits a DFA's match loop to detect a match 118 // condition by merely inspecting the current state's identifier, and 119 // avoids the need for any additional auxiliary storage. 120 let is_match: Vec<bool> = self 121 .builder_states 122 .iter() 123 .map(|s| s.is_match) 124 .collect(); 125 self.dfa.shuffle_match_states(&is_match); 126 Ok(self.dfa) 127 } 128 129 /// Return the identifier for the next DFA state given an existing DFA 130 /// state and an input byte. If the next DFA state already exists, then 131 /// return its identifier from the cache. Otherwise, build the state, cache 132 /// it and return its identifier. 133 /// 134 /// The given sparse set is used for scratch space. It must have a capacity 135 /// equivalent to the total number of NFA states, but its contents are 136 /// otherwise unspecified. 137 /// 138 /// This routine returns a boolean indicating whether a new state was 139 /// built. If a new state is built, then the caller needs to add it to its 140 /// frontier of uncompiled DFA states to compute transitions for. cached_state( &mut self, dfa_id: S, b: u8, sparse: &mut SparseSet, ) -> Result<(S, bool)>141 fn cached_state( 142 &mut self, 143 dfa_id: S, 144 b: u8, 145 sparse: &mut SparseSet, 146 ) -> Result<(S, bool)> { 147 sparse.clear(); 148 // Compute the set of all reachable NFA states, including epsilons. 149 self.next(dfa_id, b, sparse); 150 // Build a candidate state and check if it has already been built. 151 let state = self.new_state(sparse); 152 if let Some(&cached_id) = self.cache.get(&state) { 153 // Since we have a cached state, put the constructed state's 154 // memory back into our scratch space, so that it can be reused. 155 mem::replace(&mut self.scratch_nfa_states, state.nfa_states); 156 return Ok((cached_id, false)); 157 } 158 // Nothing was in the cache, so add this state to the cache. 159 self.add_state(state).map(|s| (s, true)) 160 } 161 162 /// Compute the set of all eachable NFA states, including the full epsilon 163 /// closure, from a DFA state for a single byte of input. next( &mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet, )164 fn next( 165 &mut self, 166 dfa_id: S, 167 b: u8, 168 next_nfa_states: &mut SparseSet, 169 ) { 170 next_nfa_states.clear(); 171 for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() { 172 let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i]; 173 match *self.nfa.state(nfa_id) { 174 nfa::State::Union { .. } | nfa::State::Match => {} 175 nfa::State::Range { start, end, next } => { 176 if start <= b && b <= end { 177 self.epsilon_closure(next, next_nfa_states); 178 } 179 } 180 } 181 } 182 } 183 184 /// Compute the epsilon closure for the given NFA state. epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet)185 fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) { 186 if !self.nfa.state(start).is_epsilon() { 187 set.insert(start); 188 return; 189 } 190 191 self.stack.push(start); 192 while let Some(mut id) = self.stack.pop() { 193 loop { 194 if set.contains(id) { 195 break; 196 } 197 set.insert(id); 198 match *self.nfa.state(id) { 199 nfa::State::Range { .. } | nfa::State::Match => break, 200 nfa::State::Union { ref alternates } => { 201 id = match alternates.get(0) { 202 None => break, 203 Some(&id) => id, 204 }; 205 self.stack.extend(alternates[1..].iter().rev()); 206 } 207 } 208 } 209 } 210 } 211 212 /// Compute the initial DFA state and return its identifier. 213 /// 214 /// The sparse set given is used for scratch space, and must have capacity 215 /// equal to the total number of NFA states. Its contents are unspecified. add_start(&mut self, sparse: &mut SparseSet) -> Result<S>216 fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> { 217 sparse.clear(); 218 self.epsilon_closure(self.nfa.start(), sparse); 219 let state = self.new_state(&sparse); 220 let id = self.add_state(state)?; 221 self.dfa.set_start_state(id); 222 Ok(id) 223 } 224 225 /// Add the given state to the DFA and make it available in the cache. 226 /// 227 /// The state initially has no transitions. That is, it transitions to the 228 /// dead state for all possible inputs. add_state(&mut self, state: State) -> Result<S>229 fn add_state(&mut self, state: State) -> Result<S> { 230 let id = self.dfa.add_empty_state()?; 231 let rstate = Rc::new(state); 232 self.builder_states.push(rstate.clone()); 233 self.cache.insert(rstate, id); 234 Ok(id) 235 } 236 237 /// Convert the given set of ordered NFA states to a DFA state. new_state(&mut self, set: &SparseSet) -> State238 fn new_state(&mut self, set: &SparseSet) -> State { 239 let mut state = State { 240 is_match: false, 241 nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]), 242 }; 243 state.nfa_states.clear(); 244 245 for &id in set { 246 match *self.nfa.state(id) { 247 nfa::State::Range { .. } => { 248 state.nfa_states.push(id); 249 } 250 nfa::State::Match => { 251 state.is_match = true; 252 if !self.longest_match { 253 break; 254 } 255 } 256 nfa::State::Union { .. } => {} 257 } 258 } 259 state 260 } 261 262 /// Create a new sparse set with enough capacity to hold all NFA states. new_sparse_set(&self) -> SparseSet263 fn new_sparse_set(&self) -> SparseSet { 264 SparseSet::new(self.nfa.len()) 265 } 266 } 267 268 impl State { 269 /// Create a new empty dead state. dead() -> State270 fn dead() -> State { 271 State { nfa_states: vec![], is_match: false } 272 } 273 } 274