1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT 2 // file at the top-level directory of this distribution and at 3 // http://rust-lang.org/COPYRIGHT. 4 // 5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 8 // option. This file may not be copied, modified, or distributed 9 // except according to those terms. 10 11 // This module implements the Pike VM. That is, it guarantees linear time 12 // search of a regex on any text with memory use proportional to the size of 13 // the regex. 14 // 15 // It is equal in power to the backtracking engine in this crate, except the 16 // backtracking engine is typically faster on small regexes/texts at the 17 // expense of a bigger memory footprint. 18 // 19 // It can do more than the DFA can (specifically, record capture locations 20 // and execute Unicode word boundary assertions), but at a slower speed. 21 // Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding 22 // epsilon transitions. That is, the Pike VM engine can be in multiple states 23 // at once where as the DFA is only ever in one state at a time. 24 // 25 // Therefore, the Pike VM is generally treated as the fallback when the other 26 // matching engines either aren't feasible to run or are insufficient. 27 28 use std::mem; 29 30 use exec::ProgramCache; 31 use input::{Input, InputAt}; 32 use prog::{Program, InstPtr}; 33 use re_trait::Slot; 34 use sparse::SparseSet; 35 36 /// An NFA simulation matching engine. 37 #[derive(Debug)] 38 pub struct Fsm<'r, I> { 39 /// The sequence of opcodes (among other things) that is actually executed. 40 /// 41 /// The program may be byte oriented or Unicode codepoint oriented. 42 prog: &'r Program, 43 /// An explicit stack used for following epsilon transitions. (This is 44 /// borrowed from the cache.) 45 stack: &'r mut Vec<FollowEpsilon>, 46 /// The input to search. 47 input: I, 48 } 49 50 /// A cached allocation that can be reused on each execution. 51 #[derive(Clone, Debug)] 52 pub struct Cache { 53 /// A pair of ordered sets for tracking NFA states. 54 clist: Threads, 55 nlist: Threads, 56 /// An explicit stack used for following epsilon transitions. 57 stack: Vec<FollowEpsilon>, 58 } 59 60 /// An ordered set of NFA states and their captures. 61 #[derive(Clone, Debug)] 62 struct Threads { 63 /// An ordered set of opcodes (each opcode is an NFA state). 64 set: SparseSet, 65 /// Captures for every NFA state. 66 /// 67 /// It is stored in row-major order, where the columns are the capture 68 /// slots and the rows are the states. 69 caps: Vec<Slot>, 70 /// The number of capture slots stored per thread. (Every capture has 71 /// two slots.) 72 slots_per_thread: usize, 73 } 74 75 /// A representation of an explicit stack frame when following epsilon 76 /// transitions. This is used to avoid recursion. 77 #[derive(Clone, Debug)] 78 enum FollowEpsilon { 79 /// Follow transitions at the given instruction pointer. 80 IP(InstPtr), 81 /// Restore the capture slot with the given position in the input. 82 Capture { slot: usize, pos: Slot }, 83 } 84 85 impl Cache { 86 /// Create a new allocation used by the NFA machine to record execution 87 /// and captures. new(_prog: &Program) -> Self88 pub fn new(_prog: &Program) -> Self { 89 Cache { 90 clist: Threads::new(), 91 nlist: Threads::new(), 92 stack: vec![], 93 } 94 } 95 } 96 97 impl<'r, I: Input> Fsm<'r, I> { 98 /// Execute the NFA matching engine. 99 /// 100 /// If there's a match, `exec` returns `true` and populates the given 101 /// captures accordingly. exec( prog: &'r Program, cache: &ProgramCache, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, input: I, start: usize, ) -> bool102 pub fn exec( 103 prog: &'r Program, 104 cache: &ProgramCache, 105 matches: &mut [bool], 106 slots: &mut [Slot], 107 quit_after_match: bool, 108 input: I, 109 start: usize, 110 ) -> bool { 111 let mut cache = cache.borrow_mut(); 112 let cache = &mut cache.pikevm; 113 cache.clist.resize(prog.len(), prog.captures.len()); 114 cache.nlist.resize(prog.len(), prog.captures.len()); 115 let at = input.at(start); 116 Fsm { 117 prog: prog, 118 stack: &mut cache.stack, 119 input: input, 120 }.exec_( 121 &mut cache.clist, 122 &mut cache.nlist, 123 matches, 124 slots, 125 quit_after_match, 126 at, 127 ) 128 } 129 exec_( &mut self, mut clist: &mut Threads, mut nlist: &mut Threads, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, mut at: InputAt, ) -> bool130 fn exec_( 131 &mut self, 132 mut clist: &mut Threads, 133 mut nlist: &mut Threads, 134 matches: &mut [bool], 135 slots: &mut [Slot], 136 quit_after_match: bool, 137 mut at: InputAt, 138 ) -> bool { 139 let mut matched = false; 140 let mut all_matched = false; 141 clist.set.clear(); 142 nlist.set.clear(); 143 'LOOP: loop { 144 if clist.set.is_empty() { 145 // Three ways to bail out when our current set of threads is 146 // empty. 147 // 148 // 1. We have a match---so we're done exploring any possible 149 // alternatives. Time to quit. (We can't do this if we're 150 // looking for matches for multiple regexes, unless we know 151 // they all matched.) 152 // 153 // 2. If the expression starts with a '^' we can terminate as 154 // soon as the last thread dies. 155 if (matched && matches.len() <= 1) 156 || all_matched 157 || (!at.is_start() && self.prog.is_anchored_start) { 158 break; 159 } 160 161 // 3. If there's a literal prefix for the program, try to 162 // jump ahead quickly. If it can't be found, then we can 163 // bail out early. 164 if !self.prog.prefixes.is_empty() { 165 at = match self.input.prefix_at(&self.prog.prefixes, at) { 166 None => break, 167 Some(at) => at, 168 }; 169 } 170 } 171 172 // This simulates a preceding '.*?' for every regex by adding 173 // a state starting at the current position in the input for the 174 // beginning of the program only if we don't already have a match. 175 if clist.set.is_empty() 176 || (!self.prog.is_anchored_start && !all_matched) { 177 self.add(&mut clist, slots, 0, at); 178 } 179 // The previous call to "add" actually inspects the position just 180 // before the current character. For stepping through the machine, 181 // we can to look at the current character, so we advance the 182 // input. 183 let at_next = self.input.at(at.next_pos()); 184 for i in 0..clist.set.len() { 185 let ip = clist.set[i]; 186 if self.step( 187 &mut nlist, 188 matches, 189 slots, 190 clist.caps(ip), 191 ip, 192 at, 193 at_next, 194 ) { 195 matched = true; 196 all_matched = all_matched || matches.iter().all(|&b| b); 197 if quit_after_match { 198 // If we only care if a match occurs (not its 199 // position), then we can quit right now. 200 break 'LOOP; 201 } 202 if self.prog.matches.len() == 1 { 203 // We don't need to check the rest of the threads 204 // in this set because we've matched something 205 // ("leftmost-first"). However, we still need to check 206 // threads in the next set to support things like 207 // greedy matching. 208 // 209 // This is only true on normal regexes. For regex sets, 210 // we need to mush on to observe other matches. 211 break; 212 } 213 } 214 } 215 if at.is_end() { 216 break; 217 } 218 at = at_next; 219 mem::swap(clist, nlist); 220 nlist.set.clear(); 221 } 222 matched 223 } 224 225 /// Step through the input, one token (byte or codepoint) at a time. 226 /// 227 /// nlist is the set of states that will be processed on the next token 228 /// in the input. 229 /// 230 /// caps is the set of captures passed by the caller of the NFA. They are 231 /// written to only when a match state is visited. 232 /// 233 /// thread_caps is the set of captures set for the current NFA state, ip. 234 /// 235 /// at and at_next are the current and next positions in the input. at or 236 /// at_next may be EOF. step( &mut self, nlist: &mut Threads, matches: &mut [bool], slots: &mut [Slot], thread_caps: &mut [Option<usize>], ip: usize, at: InputAt, at_next: InputAt, ) -> bool237 fn step( 238 &mut self, 239 nlist: &mut Threads, 240 matches: &mut [bool], 241 slots: &mut [Slot], 242 thread_caps: &mut [Option<usize>], 243 ip: usize, 244 at: InputAt, 245 at_next: InputAt, 246 ) -> bool { 247 use prog::Inst::*; 248 match self.prog[ip] { 249 Match(match_slot) => { 250 if match_slot < matches.len() { 251 matches[match_slot] = true; 252 } 253 for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { 254 *slot = *val; 255 } 256 true 257 } 258 Char(ref inst) => { 259 if inst.c == at.char() { 260 self.add(nlist, thread_caps, inst.goto, at_next); 261 } 262 false 263 } 264 Ranges(ref inst) => { 265 if inst.matches(at.char()) { 266 self.add(nlist, thread_caps, inst.goto, at_next); 267 } 268 false 269 } 270 Bytes(ref inst) => { 271 if let Some(b) = at.byte() { 272 if inst.matches(b) { 273 self.add(nlist, thread_caps, inst.goto, at_next); 274 } 275 } 276 false 277 } 278 EmptyLook(_) | Save(_) | Split(_) => false, 279 } 280 } 281 282 /// Follows epsilon transitions and adds them for processing to nlist, 283 /// starting at and including ip. add( &mut self, nlist: &mut Threads, thread_caps: &mut [Option<usize>], ip: usize, at: InputAt, )284 fn add( 285 &mut self, 286 nlist: &mut Threads, 287 thread_caps: &mut [Option<usize>], 288 ip: usize, 289 at: InputAt, 290 ) { 291 self.stack.push(FollowEpsilon::IP(ip)); 292 while let Some(frame) = self.stack.pop() { 293 match frame { 294 FollowEpsilon::IP(ip) => { 295 self.add_step(nlist, thread_caps, ip, at); 296 } 297 FollowEpsilon::Capture { slot, pos } => { 298 thread_caps[slot] = pos; 299 } 300 } 301 } 302 } 303 304 /// A helper function for add that avoids excessive pushing to the stack. add_step( &mut self, nlist: &mut Threads, thread_caps: &mut [Option<usize>], mut ip: usize, at: InputAt, )305 fn add_step( 306 &mut self, 307 nlist: &mut Threads, 308 thread_caps: &mut [Option<usize>], 309 mut ip: usize, 310 at: InputAt, 311 ) { 312 // Instead of pushing and popping to the stack, we mutate ip as we 313 // traverse the set of states. We only push to the stack when we 314 // absolutely need recursion (restoring captures or following a 315 // branch). 316 use prog::Inst::*; 317 loop { 318 // Don't visit states we've already added. 319 if nlist.set.contains(ip) { 320 return; 321 } 322 nlist.set.insert(ip); 323 match self.prog[ip] { 324 EmptyLook(ref inst) => { 325 if self.input.is_empty_match(at, inst) { 326 ip = inst.goto; 327 } 328 } 329 Save(ref inst) => { 330 if inst.slot < thread_caps.len() { 331 self.stack.push(FollowEpsilon::Capture { 332 slot: inst.slot, 333 pos: thread_caps[inst.slot], 334 }); 335 thread_caps[inst.slot] = Some(at.pos()); 336 } 337 ip = inst.goto; 338 } 339 Split(ref inst) => { 340 self.stack.push(FollowEpsilon::IP(inst.goto2)); 341 ip = inst.goto1; 342 } 343 Match(_) | Char(_) | Ranges(_) | Bytes(_) => { 344 let t = &mut nlist.caps(ip); 345 for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { 346 *slot = *val; 347 } 348 return; 349 } 350 } 351 } 352 } 353 } 354 355 impl Threads { new() -> Self356 fn new() -> Self { 357 Threads { 358 set: SparseSet::new(0), 359 caps: vec![], 360 slots_per_thread: 0, 361 } 362 } 363 resize(&mut self, num_insts: usize, ncaps: usize)364 fn resize(&mut self, num_insts: usize, ncaps: usize) { 365 if num_insts == self.set.capacity() { 366 return; 367 } 368 self.slots_per_thread = ncaps * 2; 369 self.set = SparseSet::new(num_insts); 370 self.caps = vec![None; self.slots_per_thread * num_insts]; 371 } 372 caps(&mut self, pc: usize) -> &mut [Option<usize>]373 fn caps(&mut self, pc: usize) -> &mut [Option<usize>] { 374 let i = pc * self.slots_per_thread; 375 &mut self.caps[i..i + self.slots_per_thread] 376 } 377 } 378