1 //! Constructs a DFA which picks the longest matching regular
2 //! expression from the input.
3 
4 use collections::Set;
5 use kernel_set::{Kernel, KernelSet};
6 use lexer::nfa::{self, NFAConstructionError, NFAStateIndex, Test, NFA};
7 use lexer::re;
8 use std::fmt::{Debug, Display, Error, Formatter};
9 use std::rc::Rc;
10 
11 #[cfg(test)]
12 mod test;
13 
14 #[cfg(test)]
15 pub mod interpret;
16 
17 mod overlap;
18 
19 #[derive(Clone, Debug, PartialEq, Eq)]
20 pub struct DFA {
21     pub states: Vec<State>,
22 }
23 
24 #[derive(Copy, Clone, Debug, PartialOrd, Ord, PartialEq, Eq)]
25 pub struct Precedence(pub usize);
26 
27 #[derive(Debug)]
28 pub enum DFAConstructionError {
29     NFAConstructionError {
30         index: NFAIndex,
31         error: NFAConstructionError,
32     },
33 
34     /// Either of the two regexs listed could match, and they have equal
35     /// priority.
36     Ambiguity { match0: NFAIndex, match1: NFAIndex },
37 }
38 
build_dfa( regexs: &[re::Regex], precedences: &[Precedence], ) -> Result<DFA, DFAConstructionError>39 pub fn build_dfa(
40     regexs: &[re::Regex],
41     precedences: &[Precedence],
42 ) -> Result<DFA, DFAConstructionError> {
43     assert_eq!(regexs.len(), precedences.len());
44     let nfas = regexs
45         .iter()
46         .enumerate()
47         .map(|(i, r)| match NFA::from_re(r) {
48             Ok(nfa) => Ok(nfa),
49             Err(e) => Err(DFAConstructionError::NFAConstructionError {
50                 index: NFAIndex(i),
51                 error: e,
52             }),
53         })
54         .collect::<Result<Vec<_>, _>>()?;
55 
56     let builder = DFABuilder {
57         nfas: &nfas,
58         precedences: precedences.to_vec(),
59     };
60     let dfa = builder.build()?;
61     Ok(dfa)
62 }
63 
64 struct DFABuilder<'nfa> {
65     nfas: &'nfa [NFA],
66     precedences: Vec<Precedence>,
67 }
68 
69 #[derive(Clone, Debug, PartialEq, Eq)]
70 pub struct State {
71     item_set: DFAItemSet,
72     pub kind: Kind,
73     pub test_edges: Vec<(Test, DFAStateIndex)>,
74     pub other_edge: DFAStateIndex,
75 }
76 
77 #[derive(Clone, Debug, PartialEq, Eq)]
78 pub enum Kind {
79     Accepts(NFAIndex),
80     Reject,
81     Neither,
82 }
83 
84 #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
85 pub struct NFAIndex(usize);
86 
87 #[derive(Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
88 pub struct DFAStateIndex(usize);
89 
90 type DFAKernelSet = KernelSet<DFAItemSet>;
91 
92 #[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
93 struct DFAItemSet {
94     items: Rc<Vec<Item>>,
95 }
96 
97 #[derive(Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
98 struct Item {
99     // which regular expression?
100     nfa_index: NFAIndex,
101 
102     // what state within the NFA are we at?
103     nfa_state: NFAStateIndex,
104 }
105 
106 const START: DFAStateIndex = DFAStateIndex(0);
107 
108 impl<'nfa> DFABuilder<'nfa> {
build(&self) -> Result<DFA, DFAConstructionError>109     fn build(&self) -> Result<DFA, DFAConstructionError> {
110         let mut kernel_set = KernelSet::new();
111         let mut states = vec![];
112 
113         let start_state_index = self.start_state(&mut kernel_set);
114         assert_eq!(start_state_index, START);
115 
116         while let Some(item_set) = kernel_set.next() {
117             // collect all the specific tests we expect from any of
118             // the items in this state
119             let tests: Set<Test> = item_set
120                 .items
121                 .iter()
122                 .flat_map(|&item| {
123                     self.nfa(item)
124                         .edges::<Test>(item.nfa_state)
125                         .map(|edge| edge.label)
126                 })
127                 .collect();
128             let tests = overlap::remove_overlap(&tests);
129 
130             // if any NFA is in an accepting state, that makes this
131             // DFA state an accepting state
132             let mut all_accepts: Vec<(Precedence, NFAIndex)> = item_set
133                 .items
134                 .iter()
135                 .cloned()
136                 .filter(|&item| self.nfa(item).is_accepting_state(item.nfa_state))
137                 .map(|item| (self.precedences[item.nfa_index.0], item.nfa_index))
138                 .collect();
139 
140             // if all NFAs are in a rejecting state, that makes this
141             // DFA a rejecting state
142             let all_rejects: bool = item_set
143                 .items
144                 .iter()
145                 .all(|&item| self.nfa(item).is_rejecting_state(item.nfa_state));
146 
147             let kind = if all_rejects || item_set.items.is_empty() {
148                 Kind::Reject
149             } else if all_accepts.is_empty() {
150                 Kind::Neither
151             } else if all_accepts.len() == 1 {
152                 // accepts just one NFA, easy case
153                 Kind::Accepts(all_accepts[0].1)
154             } else {
155                 all_accepts.sort(); // sort regex with higher precedence, well, higher
156                 let (best_priority, best_nfa) = all_accepts[all_accepts.len() - 1];
157                 let (next_priority, next_nfa) = all_accepts[all_accepts.len() - 2];
158                 if best_priority == next_priority {
159                     return Err(DFAConstructionError::Ambiguity {
160                         match0: best_nfa,
161                         match1: next_nfa,
162                     });
163                 }
164                 Kind::Accepts(best_nfa)
165             };
166 
167             // for each specific test, find what happens if we see a
168             // character matching that test
169             let mut test_edges: Vec<(Test, DFAStateIndex)> = tests
170                 .iter()
171                 .map(|&test| {
172                     let items: Vec<_> = item_set
173                         .items
174                         .iter()
175                         .filter_map(|&item| self.accept_test(item, test))
176                         .collect();
177 
178                     // at least one of those items should accept this test
179                     assert!(!items.is_empty());
180 
181                     (test, kernel_set.add_state(self.transitive_closure(items)))
182                 })
183                 .collect();
184 
185             test_edges.sort();
186 
187             // Consider what there is some character that doesn't meet
188             // any of the tests. In this case, we can just ignore all
189             // the test edges for each of the items and just union all
190             // the "other" edges -- because if it were one of those
191             // test edges, then that transition is represented above.
192             let other_transitions: Vec<_> = item_set
193                 .items
194                 .iter()
195                 .filter_map(|&item| self.accept_other(item))
196                 .collect();
197 
198             // we never know the full set
199             assert!(item_set.items.is_empty() || !other_transitions.is_empty());
200 
201             let other_edge = kernel_set.add_state(self.transitive_closure(other_transitions));
202 
203             let state = State {
204                 item_set,
205                 kind,
206                 test_edges,
207                 other_edge,
208             };
209 
210             states.push(state);
211         }
212 
213         Ok(DFA { states })
214     }
215 
start_state(&self, kernel_set: &mut DFAKernelSet) -> DFAStateIndex216     fn start_state(&self, kernel_set: &mut DFAKernelSet) -> DFAStateIndex {
217         // starting state is at the beginning of all regular expressions
218         let items: Vec<_> = (0..self.nfas.len())
219             .map(|i| Item {
220                 nfa_index: NFAIndex(i),
221                 nfa_state: nfa::START,
222             })
223             .collect();
224         let item_set = self.transitive_closure(items);
225         kernel_set.add_state(item_set)
226     }
227 
accept_test(&self, item: Item, test: Test) -> Option<Item>228     fn accept_test(&self, item: Item, test: Test) -> Option<Item> {
229         let nfa = self.nfa(item);
230 
231         let matching_test = nfa
232             .edges::<Test>(item.nfa_state)
233             .filter(|edge| edge.label.intersects(test))
234             .map(|edge| item.to(edge.to));
235 
236         let matching_other = nfa
237             .edges::<nfa::Other>(item.nfa_state)
238             .map(|edge| item.to(edge.to));
239 
240         matching_test.chain(matching_other).next()
241     }
242 
accept_other(&self, item: Item) -> Option<Item>243     fn accept_other(&self, item: Item) -> Option<Item> {
244         let nfa = self.nfa(item);
245         nfa.edges::<nfa::Other>(item.nfa_state)
246             .map(|edge| item.to(edge.to))
247             .next()
248     }
249 
transitive_closure(&self, mut items: Vec<Item>) -> DFAItemSet250     fn transitive_closure(&self, mut items: Vec<Item>) -> DFAItemSet {
251         let mut observed: Set<Item> = items.iter().cloned().collect();
252 
253         let mut counter = 0;
254         while counter < items.len() {
255             let item = items[counter];
256             let derived_states = self
257                 .nfa(item)
258                 .edges::<nfa::Noop>(item.nfa_state)
259                 .map(|edge| item.to(edge.to))
260                 .filter(|&item| observed.insert(item));
261             items.extend(derived_states);
262             counter += 1;
263         }
264 
265         items.sort();
266         items.dedup();
267 
268         DFAItemSet {
269             items: Rc::new(items),
270         }
271     }
272 
nfa(&self, item: Item) -> &NFA273     fn nfa(&self, item: Item) -> &NFA {
274         &self.nfas[item.nfa_index.0]
275     }
276 }
277 
278 impl Kernel for DFAItemSet {
279     type Index = DFAStateIndex;
280 
index(c: usize) -> DFAStateIndex281     fn index(c: usize) -> DFAStateIndex {
282         DFAStateIndex(c)
283     }
284 }
285 
286 impl DFA {
state(&self, index: DFAStateIndex) -> &State287     fn state(&self, index: DFAStateIndex) -> &State {
288         &self.states[index.0]
289     }
290 }
291 
292 impl Item {
to(&self, s: NFAStateIndex) -> Item293     fn to(&self, s: NFAStateIndex) -> Item {
294         Item {
295             nfa_index: self.nfa_index,
296             nfa_state: s,
297         }
298     }
299 }
300 
301 impl Debug for DFAStateIndex {
fmt(&self, fmt: &mut Formatter) -> Result<(), Error>302     fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
303         write!(fmt, "DFA{}", self.0)
304     }
305 }
306 
307 impl Display for DFAStateIndex {
fmt(&self, fmt: &mut Formatter) -> Result<(), Error>308     fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
309         Debug::fmt(self, fmt)
310     }
311 }
312 
313 impl NFAIndex {
index(self) -> usize314     pub fn index(self) -> usize {
315         self.0
316     }
317 }
318 
319 impl DFAStateIndex {
index(self) -> usize320     pub fn index(self) -> usize {
321         self.0
322     }
323 }
324 
325 impl Debug for Item {
fmt(&self, fmt: &mut Formatter) -> Result<(), Error>326     fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
327         write!(fmt, "({:?}:{:?})", self.nfa_index, self.nfa_state)
328     }
329 }
330