1 //! Constructs a DFA which picks the longest matching regular
2 //! expression from the input.
3
4 use collections::Set;
5 use kernel_set::{Kernel, KernelSet};
6 use lexer::nfa::{self, NFAConstructionError, NFAStateIndex, Test, NFA};
7 use lexer::re;
8 use std::fmt::{Debug, Display, Error, Formatter};
9 use std::rc::Rc;
10
11 #[cfg(test)]
12 mod test;
13
14 #[cfg(test)]
15 pub mod interpret;
16
17 mod overlap;
18
19 #[derive(Clone, Debug, PartialEq, Eq)]
20 pub struct DFA {
21 pub states: Vec<State>,
22 }
23
24 #[derive(Copy, Clone, Debug, PartialOrd, Ord, PartialEq, Eq)]
25 pub struct Precedence(pub usize);
26
27 #[derive(Debug)]
28 pub enum DFAConstructionError {
29 NFAConstructionError {
30 index: NFAIndex,
31 error: NFAConstructionError,
32 },
33
34 /// Either of the two regexs listed could match, and they have equal
35 /// priority.
36 Ambiguity { match0: NFAIndex, match1: NFAIndex },
37 }
38
build_dfa( regexs: &[re::Regex], precedences: &[Precedence], ) -> Result<DFA, DFAConstructionError>39 pub fn build_dfa(
40 regexs: &[re::Regex],
41 precedences: &[Precedence],
42 ) -> Result<DFA, DFAConstructionError> {
43 assert_eq!(regexs.len(), precedences.len());
44 let nfas = regexs
45 .iter()
46 .enumerate()
47 .map(|(i, r)| match NFA::from_re(r) {
48 Ok(nfa) => Ok(nfa),
49 Err(e) => Err(DFAConstructionError::NFAConstructionError {
50 index: NFAIndex(i),
51 error: e,
52 }),
53 })
54 .collect::<Result<Vec<_>, _>>()?;
55
56 let builder = DFABuilder {
57 nfas: &nfas,
58 precedences: precedences.to_vec(),
59 };
60 let dfa = builder.build()?;
61 Ok(dfa)
62 }
63
64 struct DFABuilder<'nfa> {
65 nfas: &'nfa [NFA],
66 precedences: Vec<Precedence>,
67 }
68
69 #[derive(Clone, Debug, PartialEq, Eq)]
70 pub struct State {
71 item_set: DFAItemSet,
72 pub kind: Kind,
73 pub test_edges: Vec<(Test, DFAStateIndex)>,
74 pub other_edge: DFAStateIndex,
75 }
76
77 #[derive(Clone, Debug, PartialEq, Eq)]
78 pub enum Kind {
79 Accepts(NFAIndex),
80 Reject,
81 Neither,
82 }
83
84 #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
85 pub struct NFAIndex(usize);
86
87 #[derive(Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
88 pub struct DFAStateIndex(usize);
89
90 type DFAKernelSet = KernelSet<DFAItemSet>;
91
92 #[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
93 struct DFAItemSet {
94 items: Rc<Vec<Item>>,
95 }
96
97 #[derive(Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
98 struct Item {
99 // which regular expression?
100 nfa_index: NFAIndex,
101
102 // what state within the NFA are we at?
103 nfa_state: NFAStateIndex,
104 }
105
106 const START: DFAStateIndex = DFAStateIndex(0);
107
108 impl<'nfa> DFABuilder<'nfa> {
build(&self) -> Result<DFA, DFAConstructionError>109 fn build(&self) -> Result<DFA, DFAConstructionError> {
110 let mut kernel_set = KernelSet::new();
111 let mut states = vec![];
112
113 let start_state_index = self.start_state(&mut kernel_set);
114 assert_eq!(start_state_index, START);
115
116 while let Some(item_set) = kernel_set.next() {
117 // collect all the specific tests we expect from any of
118 // the items in this state
119 let tests: Set<Test> = item_set
120 .items
121 .iter()
122 .flat_map(|&item| {
123 self.nfa(item)
124 .edges::<Test>(item.nfa_state)
125 .map(|edge| edge.label)
126 })
127 .collect();
128 let tests = overlap::remove_overlap(&tests);
129
130 // if any NFA is in an accepting state, that makes this
131 // DFA state an accepting state
132 let mut all_accepts: Vec<(Precedence, NFAIndex)> = item_set
133 .items
134 .iter()
135 .cloned()
136 .filter(|&item| self.nfa(item).is_accepting_state(item.nfa_state))
137 .map(|item| (self.precedences[item.nfa_index.0], item.nfa_index))
138 .collect();
139
140 // if all NFAs are in a rejecting state, that makes this
141 // DFA a rejecting state
142 let all_rejects: bool = item_set
143 .items
144 .iter()
145 .all(|&item| self.nfa(item).is_rejecting_state(item.nfa_state));
146
147 let kind = if all_rejects || item_set.items.is_empty() {
148 Kind::Reject
149 } else if all_accepts.is_empty() {
150 Kind::Neither
151 } else if all_accepts.len() == 1 {
152 // accepts just one NFA, easy case
153 Kind::Accepts(all_accepts[0].1)
154 } else {
155 all_accepts.sort(); // sort regex with higher precedence, well, higher
156 let (best_priority, best_nfa) = all_accepts[all_accepts.len() - 1];
157 let (next_priority, next_nfa) = all_accepts[all_accepts.len() - 2];
158 if best_priority == next_priority {
159 return Err(DFAConstructionError::Ambiguity {
160 match0: best_nfa,
161 match1: next_nfa,
162 });
163 }
164 Kind::Accepts(best_nfa)
165 };
166
167 // for each specific test, find what happens if we see a
168 // character matching that test
169 let mut test_edges: Vec<(Test, DFAStateIndex)> = tests
170 .iter()
171 .map(|&test| {
172 let items: Vec<_> = item_set
173 .items
174 .iter()
175 .filter_map(|&item| self.accept_test(item, test))
176 .collect();
177
178 // at least one of those items should accept this test
179 assert!(!items.is_empty());
180
181 (test, kernel_set.add_state(self.transitive_closure(items)))
182 })
183 .collect();
184
185 test_edges.sort();
186
187 // Consider what there is some character that doesn't meet
188 // any of the tests. In this case, we can just ignore all
189 // the test edges for each of the items and just union all
190 // the "other" edges -- because if it were one of those
191 // test edges, then that transition is represented above.
192 let other_transitions: Vec<_> = item_set
193 .items
194 .iter()
195 .filter_map(|&item| self.accept_other(item))
196 .collect();
197
198 // we never know the full set
199 assert!(item_set.items.is_empty() || !other_transitions.is_empty());
200
201 let other_edge = kernel_set.add_state(self.transitive_closure(other_transitions));
202
203 let state = State {
204 item_set,
205 kind,
206 test_edges,
207 other_edge,
208 };
209
210 states.push(state);
211 }
212
213 Ok(DFA { states })
214 }
215
start_state(&self, kernel_set: &mut DFAKernelSet) -> DFAStateIndex216 fn start_state(&self, kernel_set: &mut DFAKernelSet) -> DFAStateIndex {
217 // starting state is at the beginning of all regular expressions
218 let items: Vec<_> = (0..self.nfas.len())
219 .map(|i| Item {
220 nfa_index: NFAIndex(i),
221 nfa_state: nfa::START,
222 })
223 .collect();
224 let item_set = self.transitive_closure(items);
225 kernel_set.add_state(item_set)
226 }
227
accept_test(&self, item: Item, test: Test) -> Option<Item>228 fn accept_test(&self, item: Item, test: Test) -> Option<Item> {
229 let nfa = self.nfa(item);
230
231 let matching_test = nfa
232 .edges::<Test>(item.nfa_state)
233 .filter(|edge| edge.label.intersects(test))
234 .map(|edge| item.to(edge.to));
235
236 let matching_other = nfa
237 .edges::<nfa::Other>(item.nfa_state)
238 .map(|edge| item.to(edge.to));
239
240 matching_test.chain(matching_other).next()
241 }
242
accept_other(&self, item: Item) -> Option<Item>243 fn accept_other(&self, item: Item) -> Option<Item> {
244 let nfa = self.nfa(item);
245 nfa.edges::<nfa::Other>(item.nfa_state)
246 .map(|edge| item.to(edge.to))
247 .next()
248 }
249
transitive_closure(&self, mut items: Vec<Item>) -> DFAItemSet250 fn transitive_closure(&self, mut items: Vec<Item>) -> DFAItemSet {
251 let mut observed: Set<Item> = items.iter().cloned().collect();
252
253 let mut counter = 0;
254 while counter < items.len() {
255 let item = items[counter];
256 let derived_states = self
257 .nfa(item)
258 .edges::<nfa::Noop>(item.nfa_state)
259 .map(|edge| item.to(edge.to))
260 .filter(|&item| observed.insert(item));
261 items.extend(derived_states);
262 counter += 1;
263 }
264
265 items.sort();
266 items.dedup();
267
268 DFAItemSet {
269 items: Rc::new(items),
270 }
271 }
272
nfa(&self, item: Item) -> &NFA273 fn nfa(&self, item: Item) -> &NFA {
274 &self.nfas[item.nfa_index.0]
275 }
276 }
277
278 impl Kernel for DFAItemSet {
279 type Index = DFAStateIndex;
280
index(c: usize) -> DFAStateIndex281 fn index(c: usize) -> DFAStateIndex {
282 DFAStateIndex(c)
283 }
284 }
285
286 impl DFA {
state(&self, index: DFAStateIndex) -> &State287 fn state(&self, index: DFAStateIndex) -> &State {
288 &self.states[index.0]
289 }
290 }
291
292 impl Item {
to(&self, s: NFAStateIndex) -> Item293 fn to(&self, s: NFAStateIndex) -> Item {
294 Item {
295 nfa_index: self.nfa_index,
296 nfa_state: s,
297 }
298 }
299 }
300
301 impl Debug for DFAStateIndex {
fmt(&self, fmt: &mut Formatter) -> Result<(), Error>302 fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
303 write!(fmt, "DFA{}", self.0)
304 }
305 }
306
307 impl Display for DFAStateIndex {
fmt(&self, fmt: &mut Formatter) -> Result<(), Error>308 fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
309 Debug::fmt(self, fmt)
310 }
311 }
312
313 impl NFAIndex {
index(self) -> usize314 pub fn index(self) -> usize {
315 self.0
316 }
317 }
318
319 impl DFAStateIndex {
index(self) -> usize320 pub fn index(self) -> usize {
321 self.0
322 }
323 }
324
325 impl Debug for Item {
fmt(&self, fmt: &mut Formatter) -> Result<(), Error>326 fn fmt(&self, fmt: &mut Formatter) -> Result<(), Error> {
327 write!(fmt, "({:?}:{:?})", self.nfa_index, self.nfa_state)
328 }
329 }
330