1 // This module implements the Pike VM. That is, it guarantees linear time
2 // search of a regex on any text with memory use proportional to the size of
3 // the regex.
4 //
5 // It is equal in power to the backtracking engine in this crate, except the
6 // backtracking engine is typically faster on small regexes/texts at the
7 // expense of a bigger memory footprint.
8 //
9 // It can do more than the DFA can (specifically, record capture locations
10 // and execute Unicode word boundary assertions), but at a slower speed.
11 // Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
12 // epsilon transitions. That is, the Pike VM engine can be in multiple states
13 // at once where as the DFA is only ever in one state at a time.
14 //
15 // Therefore, the Pike VM is generally treated as the fallback when the other
16 // matching engines either aren't feasible to run or are insufficient.
17 
18 use std::mem;
19 
20 use crate::exec::ProgramCache;
21 use crate::input::{Input, InputAt};
22 use crate::prog::{InstPtr, Program};
23 use crate::re_trait::Slot;
24 use crate::sparse::SparseSet;
25 
26 /// An NFA simulation matching engine.
27 #[derive(Debug)]
28 pub struct Fsm<'r, I> {
29     /// The sequence of opcodes (among other things) that is actually executed.
30     ///
31     /// The program may be byte oriented or Unicode codepoint oriented.
32     prog: &'r Program,
33     /// An explicit stack used for following epsilon transitions. (This is
34     /// borrowed from the cache.)
35     stack: &'r mut Vec<FollowEpsilon>,
36     /// The input to search.
37     input: I,
38 }
39 
40 /// A cached allocation that can be reused on each execution.
41 #[derive(Clone, Debug)]
42 pub struct Cache {
43     /// A pair of ordered sets for tracking NFA states.
44     clist: Threads,
45     nlist: Threads,
46     /// An explicit stack used for following epsilon transitions.
47     stack: Vec<FollowEpsilon>,
48 }
49 
50 /// An ordered set of NFA states and their captures.
51 #[derive(Clone, Debug)]
52 struct Threads {
53     /// An ordered set of opcodes (each opcode is an NFA state).
54     set: SparseSet,
55     /// Captures for every NFA state.
56     ///
57     /// It is stored in row-major order, where the columns are the capture
58     /// slots and the rows are the states.
59     caps: Vec<Slot>,
60     /// The number of capture slots stored per thread. (Every capture has
61     /// two slots.)
62     slots_per_thread: usize,
63 }
64 
65 /// A representation of an explicit stack frame when following epsilon
66 /// transitions. This is used to avoid recursion.
67 #[derive(Clone, Debug)]
68 enum FollowEpsilon {
69     /// Follow transitions at the given instruction pointer.
70     IP(InstPtr),
71     /// Restore the capture slot with the given position in the input.
72     Capture { slot: usize, pos: Slot },
73 }
74 
75 impl Cache {
76     /// Create a new allocation used by the NFA machine to record execution
77     /// and captures.
new(_prog: &Program) -> Self78     pub fn new(_prog: &Program) -> Self {
79         Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
80     }
81 }
82 
83 impl<'r, I: Input> Fsm<'r, I> {
84     /// Execute the NFA matching engine.
85     ///
86     /// If there's a match, `exec` returns `true` and populates the given
87     /// captures accordingly.
exec( prog: &'r Program, cache: &ProgramCache, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, input: I, start: usize, end: usize, ) -> bool88     pub fn exec(
89         prog: &'r Program,
90         cache: &ProgramCache,
91         matches: &mut [bool],
92         slots: &mut [Slot],
93         quit_after_match: bool,
94         input: I,
95         start: usize,
96         end: usize,
97     ) -> bool {
98         let mut cache = cache.borrow_mut();
99         let cache = &mut cache.pikevm;
100         cache.clist.resize(prog.len(), prog.captures.len());
101         cache.nlist.resize(prog.len(), prog.captures.len());
102         let at = input.at(start);
103         Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_(
104             &mut cache.clist,
105             &mut cache.nlist,
106             matches,
107             slots,
108             quit_after_match,
109             at,
110             end,
111         )
112     }
113 
exec_( &mut self, mut clist: &mut Threads, mut nlist: &mut Threads, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, mut at: InputAt, end: usize, ) -> bool114     fn exec_(
115         &mut self,
116         mut clist: &mut Threads,
117         mut nlist: &mut Threads,
118         matches: &mut [bool],
119         slots: &mut [Slot],
120         quit_after_match: bool,
121         mut at: InputAt,
122         end: usize,
123     ) -> bool {
124         let mut matched = false;
125         let mut all_matched = false;
126         clist.set.clear();
127         nlist.set.clear();
128         'LOOP: loop {
129             if clist.set.is_empty() {
130                 // Three ways to bail out when our current set of threads is
131                 // empty.
132                 //
133                 // 1. We have a match---so we're done exploring any possible
134                 //    alternatives. Time to quit. (We can't do this if we're
135                 //    looking for matches for multiple regexes, unless we know
136                 //    they all matched.)
137                 //
138                 // 2. If the expression starts with a '^' we can terminate as
139                 //    soon as the last thread dies.
140                 if (matched && matches.len() <= 1)
141                     || all_matched
142                     || (!at.is_start() && self.prog.is_anchored_start)
143                 {
144                     break;
145                 }
146 
147                 // 3. If there's a literal prefix for the program, try to
148                 //    jump ahead quickly. If it can't be found, then we can
149                 //    bail out early.
150                 if !self.prog.prefixes.is_empty() {
151                     at = match self.input.prefix_at(&self.prog.prefixes, at) {
152                         None => break,
153                         Some(at) => at,
154                     };
155                 }
156             }
157 
158             // This simulates a preceding '.*?' for every regex by adding
159             // a state starting at the current position in the input for the
160             // beginning of the program only if we don't already have a match.
161             if clist.set.is_empty()
162                 || (!self.prog.is_anchored_start && !all_matched)
163             {
164                 self.add(&mut clist, slots, 0, at);
165             }
166             // The previous call to "add" actually inspects the position just
167             // before the current character. For stepping through the machine,
168             // we can to look at the current character, so we advance the
169             // input.
170             let at_next = self.input.at(at.next_pos());
171             for i in 0..clist.set.len() {
172                 let ip = clist.set[i];
173                 if self.step(
174                     &mut nlist,
175                     matches,
176                     slots,
177                     clist.caps(ip),
178                     ip,
179                     at,
180                     at_next,
181                 ) {
182                     matched = true;
183                     all_matched = all_matched || matches.iter().all(|&b| b);
184                     if quit_after_match {
185                         // If we only care if a match occurs (not its
186                         // position), then we can quit right now.
187                         break 'LOOP;
188                     }
189                     if self.prog.matches.len() == 1 {
190                         // We don't need to check the rest of the threads
191                         // in this set because we've matched something
192                         // ("leftmost-first"). However, we still need to check
193                         // threads in the next set to support things like
194                         // greedy matching.
195                         //
196                         // This is only true on normal regexes. For regex sets,
197                         // we need to mush on to observe other matches.
198                         break;
199                     }
200                 }
201             }
202             if at.pos() >= end {
203                 break;
204             }
205             at = at_next;
206             mem::swap(clist, nlist);
207             nlist.set.clear();
208         }
209         matched
210     }
211 
212     /// Step through the input, one token (byte or codepoint) at a time.
213     ///
214     /// nlist is the set of states that will be processed on the next token
215     /// in the input.
216     ///
217     /// caps is the set of captures passed by the caller of the NFA. They are
218     /// written to only when a match state is visited.
219     ///
220     /// thread_caps is the set of captures set for the current NFA state, ip.
221     ///
222     /// at and at_next are the current and next positions in the input. at or
223     /// at_next may be EOF.
step( &mut self, nlist: &mut Threads, matches: &mut [bool], slots: &mut [Slot], thread_caps: &mut [Option<usize>], ip: usize, at: InputAt, at_next: InputAt, ) -> bool224     fn step(
225         &mut self,
226         nlist: &mut Threads,
227         matches: &mut [bool],
228         slots: &mut [Slot],
229         thread_caps: &mut [Option<usize>],
230         ip: usize,
231         at: InputAt,
232         at_next: InputAt,
233     ) -> bool {
234         use crate::prog::Inst::*;
235         match self.prog[ip] {
236             Match(match_slot) => {
237                 if match_slot < matches.len() {
238                     matches[match_slot] = true;
239                 }
240                 for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
241                     *slot = *val;
242                 }
243                 true
244             }
245             Char(ref inst) => {
246                 if inst.c == at.char() {
247                     self.add(nlist, thread_caps, inst.goto, at_next);
248                 }
249                 false
250             }
251             Ranges(ref inst) => {
252                 if inst.matches(at.char()) {
253                     self.add(nlist, thread_caps, inst.goto, at_next);
254                 }
255                 false
256             }
257             Bytes(ref inst) => {
258                 if let Some(b) = at.byte() {
259                     if inst.matches(b) {
260                         self.add(nlist, thread_caps, inst.goto, at_next);
261                     }
262                 }
263                 false
264             }
265             EmptyLook(_) | Save(_) | Split(_) => false,
266         }
267     }
268 
269     /// Follows epsilon transitions and adds them for processing to nlist,
270     /// starting at and including ip.
add( &mut self, nlist: &mut Threads, thread_caps: &mut [Option<usize>], ip: usize, at: InputAt, )271     fn add(
272         &mut self,
273         nlist: &mut Threads,
274         thread_caps: &mut [Option<usize>],
275         ip: usize,
276         at: InputAt,
277     ) {
278         self.stack.push(FollowEpsilon::IP(ip));
279         while let Some(frame) = self.stack.pop() {
280             match frame {
281                 FollowEpsilon::IP(ip) => {
282                     self.add_step(nlist, thread_caps, ip, at);
283                 }
284                 FollowEpsilon::Capture { slot, pos } => {
285                     thread_caps[slot] = pos;
286                 }
287             }
288         }
289     }
290 
291     /// A helper function for add that avoids excessive pushing to the stack.
add_step( &mut self, nlist: &mut Threads, thread_caps: &mut [Option<usize>], mut ip: usize, at: InputAt, )292     fn add_step(
293         &mut self,
294         nlist: &mut Threads,
295         thread_caps: &mut [Option<usize>],
296         mut ip: usize,
297         at: InputAt,
298     ) {
299         // Instead of pushing and popping to the stack, we mutate ip as we
300         // traverse the set of states. We only push to the stack when we
301         // absolutely need recursion (restoring captures or following a
302         // branch).
303         use crate::prog::Inst::*;
304         loop {
305             // Don't visit states we've already added.
306             if nlist.set.contains(ip) {
307                 return;
308             }
309             nlist.set.insert(ip);
310             match self.prog[ip] {
311                 EmptyLook(ref inst) => {
312                     if self.input.is_empty_match(at, inst) {
313                         ip = inst.goto;
314                     }
315                 }
316                 Save(ref inst) => {
317                     if inst.slot < thread_caps.len() {
318                         self.stack.push(FollowEpsilon::Capture {
319                             slot: inst.slot,
320                             pos: thread_caps[inst.slot],
321                         });
322                         thread_caps[inst.slot] = Some(at.pos());
323                     }
324                     ip = inst.goto;
325                 }
326                 Split(ref inst) => {
327                     self.stack.push(FollowEpsilon::IP(inst.goto2));
328                     ip = inst.goto1;
329                 }
330                 Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
331                     let t = &mut nlist.caps(ip);
332                     for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
333                         *slot = *val;
334                     }
335                     return;
336                 }
337             }
338         }
339     }
340 }
341 
342 impl Threads {
new() -> Self343     fn new() -> Self {
344         Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
345     }
346 
resize(&mut self, num_insts: usize, ncaps: usize)347     fn resize(&mut self, num_insts: usize, ncaps: usize) {
348         if num_insts == self.set.capacity() {
349             return;
350         }
351         self.slots_per_thread = ncaps * 2;
352         self.set = SparseSet::new(num_insts);
353         self.caps = vec![None; self.slots_per_thread * num_insts];
354     }
355 
caps(&mut self, pc: usize) -> &mut [Option<usize>]356     fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
357         let i = pc * self.slots_per_thread;
358         &mut self.caps[i..i + self.slots_per_thread]
359     }
360 }
361