1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 // This module implements the Pike VM. That is, it guarantees linear time
12 // search of a regex on any text with memory use proportional to the size of
13 // the regex.
14 //
15 // It is equal in power to the backtracking engine in this crate, except the
16 // backtracking engine is typically faster on small regexes/texts at the
17 // expense of a bigger memory footprint.
18 //
19 // It can do more than the DFA can (specifically, record capture locations
20 // and execute Unicode word boundary assertions), but at a slower speed.
21 // Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding
22 // epsilon transitions. That is, the Pike VM engine can be in multiple states
23 // at once where as the DFA is only ever in one state at a time.
24 //
25 // Therefore, the Pike VM is generally treated as the fallback when the other
26 // matching engines either aren't feasible to run or are insufficient.
27 
28 use std::mem;
29 
30 use exec::ProgramCache;
31 use input::{Input, InputAt};
32 use prog::{Program, InstPtr};
33 use re_trait::Slot;
34 use sparse::SparseSet;
35 
36 /// An NFA simulation matching engine.
37 #[derive(Debug)]
38 pub struct Fsm<'r, I> {
39     /// The sequence of opcodes (among other things) that is actually executed.
40     ///
41     /// The program may be byte oriented or Unicode codepoint oriented.
42     prog: &'r Program,
43     /// An explicit stack used for following epsilon transitions. (This is
44     /// borrowed from the cache.)
45     stack: &'r mut Vec<FollowEpsilon>,
46     /// The input to search.
47     input: I,
48 }
49 
50 /// A cached allocation that can be reused on each execution.
51 #[derive(Clone, Debug)]
52 pub struct Cache {
53     /// A pair of ordered sets for tracking NFA states.
54     clist: Threads,
55     nlist: Threads,
56     /// An explicit stack used for following epsilon transitions.
57     stack: Vec<FollowEpsilon>,
58 }
59 
60 /// An ordered set of NFA states and their captures.
61 #[derive(Clone, Debug)]
62 struct Threads {
63     /// An ordered set of opcodes (each opcode is an NFA state).
64     set: SparseSet,
65     /// Captures for every NFA state.
66     ///
67     /// It is stored in row-major order, where the columns are the capture
68     /// slots and the rows are the states.
69     caps: Vec<Slot>,
70     /// The number of capture slots stored per thread. (Every capture has
71     /// two slots.)
72     slots_per_thread: usize,
73 }
74 
75 /// A representation of an explicit stack frame when following epsilon
76 /// transitions. This is used to avoid recursion.
77 #[derive(Clone, Debug)]
78 enum FollowEpsilon {
79     /// Follow transitions at the given instruction pointer.
80     IP(InstPtr),
81     /// Restore the capture slot with the given position in the input.
82     Capture { slot: usize, pos: Slot },
83 }
84 
85 impl Cache {
86     /// Create a new allocation used by the NFA machine to record execution
87     /// and captures.
new(_prog: &Program) -> Self88     pub fn new(_prog: &Program) -> Self {
89         Cache {
90             clist: Threads::new(),
91             nlist: Threads::new(),
92             stack: vec![],
93         }
94     }
95 }
96 
97 impl<'r, I: Input> Fsm<'r, I> {
98     /// Execute the NFA matching engine.
99     ///
100     /// If there's a match, `exec` returns `true` and populates the given
101     /// captures accordingly.
exec( prog: &'r Program, cache: &ProgramCache, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, input: I, start: usize, ) -> bool102     pub fn exec(
103         prog: &'r Program,
104         cache: &ProgramCache,
105         matches: &mut [bool],
106         slots: &mut [Slot],
107         quit_after_match: bool,
108         input: I,
109         start: usize,
110     ) -> bool {
111         let mut cache = cache.borrow_mut();
112         let cache = &mut cache.pikevm;
113         cache.clist.resize(prog.len(), prog.captures.len());
114         cache.nlist.resize(prog.len(), prog.captures.len());
115         let at = input.at(start);
116         Fsm {
117             prog: prog,
118             stack: &mut cache.stack,
119             input: input,
120         }.exec_(
121             &mut cache.clist,
122             &mut cache.nlist,
123             matches,
124             slots,
125             quit_after_match,
126             at,
127         )
128     }
129 
exec_( &mut self, mut clist: &mut Threads, mut nlist: &mut Threads, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, mut at: InputAt, ) -> bool130     fn exec_(
131         &mut self,
132         mut clist: &mut Threads,
133         mut nlist: &mut Threads,
134         matches: &mut [bool],
135         slots: &mut [Slot],
136         quit_after_match: bool,
137         mut at: InputAt,
138     ) -> bool {
139         let mut matched = false;
140         let mut all_matched = false;
141         clist.set.clear();
142         nlist.set.clear();
143 'LOOP:  loop {
144             if clist.set.is_empty() {
145                 // Three ways to bail out when our current set of threads is
146                 // empty.
147                 //
148                 // 1. We have a match---so we're done exploring any possible
149                 //    alternatives. Time to quit. (We can't do this if we're
150                 //    looking for matches for multiple regexes, unless we know
151                 //    they all matched.)
152                 //
153                 // 2. If the expression starts with a '^' we can terminate as
154                 //    soon as the last thread dies.
155                 if (matched && matches.len() <= 1)
156                     || all_matched
157                     || (!at.is_start() && self.prog.is_anchored_start) {
158                     break;
159                 }
160 
161                 // 3. If there's a literal prefix for the program, try to
162                 //    jump ahead quickly. If it can't be found, then we can
163                 //    bail out early.
164                 if !self.prog.prefixes.is_empty() {
165                     at = match self.input.prefix_at(&self.prog.prefixes, at) {
166                         None => break,
167                         Some(at) => at,
168                     };
169                 }
170             }
171 
172             // This simulates a preceding '.*?' for every regex by adding
173             // a state starting at the current position in the input for the
174             // beginning of the program only if we don't already have a match.
175             if clist.set.is_empty()
176                 || (!self.prog.is_anchored_start && !all_matched) {
177                 self.add(&mut clist, slots, 0, at);
178             }
179             // The previous call to "add" actually inspects the position just
180             // before the current character. For stepping through the machine,
181             // we can to look at the current character, so we advance the
182             // input.
183             let at_next = self.input.at(at.next_pos());
184             for i in 0..clist.set.len() {
185                 let ip = clist.set[i];
186                 if self.step(
187                     &mut nlist,
188                     matches,
189                     slots,
190                     clist.caps(ip),
191                     ip,
192                     at,
193                     at_next,
194                 ) {
195                     matched = true;
196                     all_matched = all_matched || matches.iter().all(|&b| b);
197                     if quit_after_match {
198                         // If we only care if a match occurs (not its
199                         // position), then we can quit right now.
200                         break 'LOOP;
201                     }
202                     if self.prog.matches.len() == 1 {
203                         // We don't need to check the rest of the threads
204                         // in this set because we've matched something
205                         // ("leftmost-first"). However, we still need to check
206                         // threads in the next set to support things like
207                         // greedy matching.
208                         //
209                         // This is only true on normal regexes. For regex sets,
210                         // we need to mush on to observe other matches.
211                         break;
212                     }
213                 }
214             }
215             if at.is_end() {
216                 break;
217             }
218             at = at_next;
219             mem::swap(clist, nlist);
220             nlist.set.clear();
221         }
222         matched
223     }
224 
225     /// Step through the input, one token (byte or codepoint) at a time.
226     ///
227     /// nlist is the set of states that will be processed on the next token
228     /// in the input.
229     ///
230     /// caps is the set of captures passed by the caller of the NFA. They are
231     /// written to only when a match state is visited.
232     ///
233     /// thread_caps is the set of captures set for the current NFA state, ip.
234     ///
235     /// at and at_next are the current and next positions in the input. at or
236     /// at_next may be EOF.
step( &mut self, nlist: &mut Threads, matches: &mut [bool], slots: &mut [Slot], thread_caps: &mut [Option<usize>], ip: usize, at: InputAt, at_next: InputAt, ) -> bool237     fn step(
238         &mut self,
239         nlist: &mut Threads,
240         matches: &mut [bool],
241         slots: &mut [Slot],
242         thread_caps: &mut [Option<usize>],
243         ip: usize,
244         at: InputAt,
245         at_next: InputAt,
246     ) -> bool {
247         use prog::Inst::*;
248         match self.prog[ip] {
249             Match(match_slot) => {
250                 if match_slot < matches.len() {
251                     matches[match_slot] = true;
252                 }
253                 for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
254                     *slot = *val;
255                 }
256                 true
257             }
258             Char(ref inst) => {
259                 if inst.c == at.char() {
260                     self.add(nlist, thread_caps, inst.goto, at_next);
261                 }
262                 false
263             }
264             Ranges(ref inst) => {
265                 if inst.matches(at.char()) {
266                     self.add(nlist, thread_caps, inst.goto, at_next);
267                 }
268                 false
269             }
270             Bytes(ref inst) => {
271                 if let Some(b) = at.byte() {
272                     if inst.matches(b) {
273                         self.add(nlist, thread_caps, inst.goto, at_next);
274                     }
275                 }
276                 false
277             }
278             EmptyLook(_) | Save(_) | Split(_) => false,
279         }
280     }
281 
282     /// Follows epsilon transitions and adds them for processing to nlist,
283     /// starting at and including ip.
add( &mut self, nlist: &mut Threads, thread_caps: &mut [Option<usize>], ip: usize, at: InputAt, )284     fn add(
285         &mut self,
286         nlist: &mut Threads,
287         thread_caps: &mut [Option<usize>],
288         ip: usize,
289         at: InputAt,
290     ) {
291         self.stack.push(FollowEpsilon::IP(ip));
292         while let Some(frame) = self.stack.pop() {
293             match frame {
294                 FollowEpsilon::IP(ip) => {
295                     self.add_step(nlist, thread_caps, ip, at);
296                 }
297                 FollowEpsilon::Capture { slot, pos } => {
298                     thread_caps[slot] = pos;
299                 }
300             }
301         }
302     }
303 
304     /// A helper function for add that avoids excessive pushing to the stack.
add_step( &mut self, nlist: &mut Threads, thread_caps: &mut [Option<usize>], mut ip: usize, at: InputAt, )305     fn add_step(
306         &mut self,
307         nlist: &mut Threads,
308         thread_caps: &mut [Option<usize>],
309         mut ip: usize,
310         at: InputAt,
311     ) {
312         // Instead of pushing and popping to the stack, we mutate ip as we
313         // traverse the set of states. We only push to the stack when we
314         // absolutely need recursion (restoring captures or following a
315         // branch).
316         use prog::Inst::*;
317         loop {
318             // Don't visit states we've already added.
319             if nlist.set.contains(ip) {
320                 return;
321             }
322             nlist.set.insert(ip);
323             match self.prog[ip] {
324                 EmptyLook(ref inst) => {
325                     if self.input.is_empty_match(at, inst) {
326                         ip = inst.goto;
327                     }
328                 }
329                 Save(ref inst) => {
330                     if inst.slot < thread_caps.len() {
331                         self.stack.push(FollowEpsilon::Capture {
332                             slot: inst.slot,
333                             pos: thread_caps[inst.slot],
334                         });
335                         thread_caps[inst.slot] = Some(at.pos());
336                     }
337                     ip = inst.goto;
338                 }
339                 Split(ref inst) => {
340                     self.stack.push(FollowEpsilon::IP(inst.goto2));
341                     ip = inst.goto1;
342                 }
343                 Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
344                     let t = &mut nlist.caps(ip);
345                     for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
346                         *slot = *val;
347                     }
348                     return;
349                 }
350             }
351         }
352     }
353 }
354 
355 impl Threads {
new() -> Self356     fn new() -> Self {
357         Threads {
358             set: SparseSet::new(0),
359             caps: vec![],
360             slots_per_thread: 0,
361         }
362     }
363 
resize(&mut self, num_insts: usize, ncaps: usize)364     fn resize(&mut self, num_insts: usize, ncaps: usize) {
365         if num_insts == self.set.capacity() {
366             return;
367         }
368         self.slots_per_thread = ncaps * 2;
369         self.set = SparseSet::new(num_insts);
370         self.caps = vec![None; self.slots_per_thread * num_insts];
371     }
372 
caps(&mut self, pc: usize) -> &mut [Option<usize>]373     fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
374         let i = pc * self.slots_per_thread;
375         &mut self.caps[i..i + self.slots_per_thread]
376     }
377 }
378