1 // Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use std::collections::HashMap;
12 use std::iter;
13 use std::result;
14 use std::sync::Arc;
15
16 use syntax::is_word_byte;
17 use syntax::hir::{self, Hir};
18 use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences};
19
20 use prog::{
21 Program, Inst, InstPtr, EmptyLook,
22 InstSave, InstSplit, InstEmptyLook, InstChar, InstRanges, InstBytes,
23 };
24
25 use Error;
26
27 type Result = result::Result<Patch, Error>;
28
29 #[derive(Debug)]
30 struct Patch {
31 hole: Hole,
32 entry: InstPtr,
33 }
34
35 /// A compiler translates a regular expression AST to a sequence of
36 /// instructions. The sequence of instructions represents an NFA.
37 pub struct Compiler {
38 insts: Vec<MaybeInst>,
39 compiled: Program,
40 capture_name_idx: HashMap<String, usize>,
41 num_exprs: usize,
42 size_limit: usize,
43 suffix_cache: SuffixCache,
44 utf8_seqs: Option<Utf8Sequences>,
45 byte_classes: ByteClassSet,
46 }
47
48 impl Compiler {
49 /// Create a new regular expression compiler.
50 ///
51 /// Various options can be set before calling `compile` on an expression.
new() -> Self52 pub fn new() -> Self {
53 Compiler {
54 insts: vec![],
55 compiled: Program::new(),
56 capture_name_idx: HashMap::new(),
57 num_exprs: 0,
58 size_limit: 10 * (1 << 20),
59 suffix_cache: SuffixCache::new(1000),
60 utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
61 byte_classes: ByteClassSet::new(),
62 }
63 }
64
65 /// The size of the resulting program is limited by size_limit. If
66 /// the program approximately exceeds the given size (in bytes), then
67 /// compilation will stop and return an error.
size_limit(mut self, size_limit: usize) -> Self68 pub fn size_limit(mut self, size_limit: usize) -> Self {
69 self.size_limit = size_limit;
70 self
71 }
72
73 /// If bytes is true, then the program is compiled as a byte based
74 /// automaton, which incorporates UTF-8 decoding into the machine. If it's
75 /// false, then the automaton is Unicode scalar value based, e.g., an
76 /// engine utilizing such an automaton is resposible for UTF-8 decoding.
77 ///
78 /// The specific invariant is that when returning a byte based machine,
79 /// the neither the `Char` nor `Ranges` instructions are produced.
80 /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
81 /// instruction is never produced.
82 ///
83 /// Note that `dfa(true)` implies `bytes(true)`.
bytes(mut self, yes: bool) -> Self84 pub fn bytes(mut self, yes: bool) -> Self {
85 self.compiled.is_bytes = yes;
86 self
87 }
88
89 /// When disabled, the program compiled may match arbitrary bytes.
90 ///
91 /// When enabled (the default), all compiled programs exclusively match
92 /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self93 pub fn only_utf8(mut self, yes: bool) -> Self {
94 self.compiled.only_utf8 = yes;
95 self
96 }
97
98 /// When set, the machine returned is suitable for use in the DFA matching
99 /// engine.
100 ///
101 /// In particular, this ensures that if the regex is not anchored in the
102 /// beginning, then a preceding `.*?` is included in the program. (The NFA
103 /// based engines handle the preceding `.*?` explicitly, which is difficult
104 /// or impossible in the DFA engine.)
dfa(mut self, yes: bool) -> Self105 pub fn dfa(mut self, yes: bool) -> Self {
106 self.compiled.is_dfa = yes;
107 self
108 }
109
110 /// When set, the machine returned is suitable for matching text in
111 /// reverse. In particular, all concatenations are flipped.
reverse(mut self, yes: bool) -> Self112 pub fn reverse(mut self, yes: bool) -> Self {
113 self.compiled.is_reverse = yes;
114 self
115 }
116
117 /// Compile a regular expression given its AST.
118 ///
119 /// The compiler is guaranteed to succeed unless the program exceeds the
120 /// specified size limit. If the size limit is exceeded, then compilation
121 /// stops and returns an error.
compile( mut self, exprs: &[Hir], ) -> result::Result<Program, Error>122 pub fn compile(
123 mut self,
124 exprs: &[Hir],
125 ) -> result::Result<Program, Error> {
126 debug_assert!(exprs.len() >= 1);
127 self.num_exprs = exprs.len();
128 if exprs.len() == 1 {
129 self.compile_one(&exprs[0])
130 } else {
131 self.compile_many(exprs)
132 }
133 }
134
compile_one(mut self, expr: &Hir) -> result::Result<Program, Error>135 fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
136 // If we're compiling a forward DFA and we aren't anchored, then
137 // add a `.*?` before the first capture group.
138 // Other matching engines handle this by baking the logic into the
139 // matching engine itself.
140 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
141 self.compiled.is_anchored_start = expr.is_anchored_start();
142 self.compiled.is_anchored_end = expr.is_anchored_end();
143 if self.compiled.needs_dotstar() {
144 dotstar_patch = self.c_dotstar()?;
145 self.compiled.start = dotstar_patch.entry;
146 }
147 self.compiled.captures = vec![None];
148 let patch = self.c_capture(0, expr)?;
149 if self.compiled.needs_dotstar() {
150 self.fill(dotstar_patch.hole, patch.entry);
151 } else {
152 self.compiled.start = patch.entry;
153 }
154 self.fill_to_next(patch.hole);
155 self.compiled.matches = vec![self.insts.len()];
156 self.push_compiled(Inst::Match(0));
157 self.compile_finish()
158 }
159
compile_many( mut self, exprs: &[Hir], ) -> result::Result<Program, Error>160 fn compile_many(
161 mut self,
162 exprs: &[Hir],
163 ) -> result::Result<Program, Error> {
164 debug_assert!(exprs.len() > 1);
165
166 self.compiled.is_anchored_start =
167 exprs.iter().all(|e| e.is_anchored_start());
168 self.compiled.is_anchored_end =
169 exprs.iter().all(|e| e.is_anchored_end());
170 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
171 if self.compiled.needs_dotstar() {
172 dotstar_patch = self.c_dotstar()?;
173 self.compiled.start = dotstar_patch.entry;
174 } else {
175 self.compiled.start = 0; // first instruction is always split
176 }
177 self.fill_to_next(dotstar_patch.hole);
178
179 let mut prev_hole = Hole::None;
180 for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
181 self.fill_to_next(prev_hole);
182 let split = self.push_split_hole();
183 let Patch { hole, entry } = self.c_capture(0, expr)?;
184 self.fill_to_next(hole);
185 self.compiled.matches.push(self.insts.len());
186 self.push_compiled(Inst::Match(i));
187 prev_hole = self.fill_split(split, Some(entry), None);
188 }
189 let i = exprs.len() - 1;
190 let Patch { hole, entry } = self.c_capture(0, &exprs[i])?;
191 self.fill(prev_hole, entry);
192 self.fill_to_next(hole);
193 self.compiled.matches.push(self.insts.len());
194 self.push_compiled(Inst::Match(i));
195 self.compile_finish()
196 }
197
compile_finish(mut self) -> result::Result<Program, Error>198 fn compile_finish(mut self) -> result::Result<Program, Error> {
199 self.compiled.insts =
200 self.insts.into_iter().map(|inst| inst.unwrap()).collect();
201 self.compiled.byte_classes = self.byte_classes.byte_classes();
202 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
203 Ok(self.compiled)
204 }
205
206 /// Compile expr into self.insts, returning a patch on success,
207 /// or an error if we run out of memory.
208 ///
209 /// All of the c_* methods of the compiler share the contract outlined
210 /// here.
211 ///
212 /// The main thing that a c_* method does is mutate `self.insts`
213 /// to add a list of mostly compiled instructions required to execute
214 /// the given expression. `self.insts` contains MaybeInsts rather than
215 /// Insts because there is some backpatching required.
216 ///
217 /// The `Patch` value returned by each c_* method provides metadata
218 /// about the compiled instructions emitted to `self.insts`. The
219 /// `entry` member of the patch refers to the first instruction
220 /// (the entry point), while the `hole` member contains zero or
221 /// more offsets to partial instructions that need to be backpatched.
222 /// The c_* routine can't know where its list of instructions are going to
223 /// jump to after execution, so it is up to the caller to patch
224 /// these jumps to point to the right place. So compiling some
225 /// expression, e, we would end up with a situation that looked like:
226 ///
227 /// ```text
228 /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
229 /// ^ ^ ^
230 /// | \ /
231 /// entry \ /
232 /// hole
233 /// ```
234 ///
235 /// To compile two expressions, e1 and e2, concatinated together we
236 /// would do:
237 ///
238 /// ```ignore
239 /// let patch1 = self.c(e1);
240 /// let patch2 = self.c(e2);
241 /// ```
242 ///
243 /// while leaves us with a situation that looks like
244 ///
245 /// ```text
246 /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
247 /// ^ ^ ^ ^
248 /// | | | |
249 /// entry1 hole1 entry2 hole2
250 /// ```
251 ///
252 /// Then to merge the two patches together into one we would backpatch
253 /// hole1 with entry2 and return a new patch that enters at entry1
254 /// and has hole2 for a hole. In fact, if you look at the c_concat
255 /// method you will see that it does exactly this, though it handles
256 /// a list of expressions rather than just the two that we use for
257 /// an example.
c(&mut self, expr: &Hir) -> Result258 fn c(&mut self, expr: &Hir) -> Result {
259 use prog;
260 use syntax::hir::HirKind::*;
261
262 self.check_size()?;
263 match *expr.kind() {
264 Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }),
265 Literal(hir::Literal::Unicode(c)) => {
266 self.c_literal(&[c])
267 }
268 Literal(hir::Literal::Byte(b)) => {
269 assert!(self.compiled.uses_bytes());
270 self.c_bytes(&[b])
271 }
272 Class(hir::Class::Unicode(ref cls)) => {
273 self.c_class(cls.ranges())
274 }
275 Class(hir::Class::Bytes(ref cls)) => {
276 if self.compiled.uses_bytes() {
277 self.c_class_bytes(cls.ranges())
278 } else {
279 assert!(cls.is_all_ascii());
280 let mut char_ranges = vec![];
281 for r in cls.iter() {
282 let (s, e) = (r.start() as char, r.end() as char);
283 char_ranges.push(hir::ClassUnicodeRange::new(s, e));
284 }
285 self.c_class(&char_ranges)
286 }
287 }
288 Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
289 self.byte_classes.set_range(b'\n', b'\n');
290 self.c_empty_look(prog::EmptyLook::EndLine)
291 }
292 Anchor(hir::Anchor::StartLine) => {
293 self.byte_classes.set_range(b'\n', b'\n');
294 self.c_empty_look(prog::EmptyLook::StartLine)
295 }
296 Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
297 self.byte_classes.set_range(b'\n', b'\n');
298 self.c_empty_look(prog::EmptyLook::StartLine)
299 }
300 Anchor(hir::Anchor::EndLine) => {
301 self.byte_classes.set_range(b'\n', b'\n');
302 self.c_empty_look(prog::EmptyLook::EndLine)
303 }
304 Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
305 self.c_empty_look(prog::EmptyLook::EndText)
306 }
307 Anchor(hir::Anchor::StartText) => {
308 self.c_empty_look(prog::EmptyLook::StartText)
309 }
310 Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
311 self.c_empty_look(prog::EmptyLook::StartText)
312 }
313 Anchor(hir::Anchor::EndText) => {
314 self.c_empty_look(prog::EmptyLook::EndText)
315 }
316 WordBoundary(hir::WordBoundary::Unicode) => {
317 self.compiled.has_unicode_word_boundary = true;
318 self.byte_classes.set_word_boundary();
319 self.c_empty_look(prog::EmptyLook::WordBoundary)
320 }
321 WordBoundary(hir::WordBoundary::UnicodeNegate) => {
322 self.compiled.has_unicode_word_boundary = true;
323 self.byte_classes.set_word_boundary();
324 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
325 }
326 WordBoundary(hir::WordBoundary::Ascii) => {
327 self.byte_classes.set_word_boundary();
328 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
329 }
330 WordBoundary(hir::WordBoundary::AsciiNegate) => {
331 self.byte_classes.set_word_boundary();
332 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
333 }
334 Group(ref g) => {
335 match g.kind {
336 hir::GroupKind::NonCapturing => self.c(&g.hir),
337 hir::GroupKind::CaptureIndex(index) => {
338 if index as usize >= self.compiled.captures.len() {
339 self.compiled.captures.push(None);
340 }
341 self.c_capture(2 * index as usize, &g.hir)
342 }
343 hir::GroupKind::CaptureName { index, ref name } => {
344 if index as usize >= self.compiled.captures.len() {
345 let n = name.to_string();
346 self.compiled.captures.push(Some(n.clone()));
347 self.capture_name_idx.insert(n, index as usize);
348 }
349 self.c_capture(2 * index as usize, &g.hir)
350 }
351 }
352 }
353 Concat(ref es) => {
354 if self.compiled.is_reverse {
355 self.c_concat(es.iter().rev())
356 } else {
357 self.c_concat(es)
358 }
359 }
360 Alternation(ref es) => self.c_alternate(&**es),
361 Repetition(ref rep) => self.c_repeat(rep),
362 }
363 }
364
c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result365 fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result {
366 if self.num_exprs > 1 || self.compiled.is_dfa {
367 // Don't ever compile Save instructions for regex sets because
368 // they are never used. They are also never used in DFA programs
369 // because DFAs can't handle captures.
370 self.c(expr)
371 } else {
372 let entry = self.insts.len();
373 let hole = self.push_hole(InstHole::Save { slot: first_slot });
374 let patch = self.c(expr)?;
375 self.fill(hole, patch.entry);
376 self.fill_to_next(patch.hole);
377 let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
378 Ok(Patch { hole: hole, entry: entry })
379 }
380 }
381
c_dotstar(&mut self) -> Result382 fn c_dotstar(&mut self) -> Result {
383 Ok(if !self.compiled.only_utf8() {
384 self.c(&Hir::repetition(hir::Repetition {
385 kind: hir::RepetitionKind::ZeroOrMore,
386 greedy: false,
387 hir: Box::new(Hir::any(true)),
388 }))?
389 } else {
390 self.c(&Hir::repetition(hir::Repetition {
391 kind: hir::RepetitionKind::ZeroOrMore,
392 greedy: false,
393 hir: Box::new(Hir::any(false)),
394 }))?
395 })
396 }
397
c_literal(&mut self, chars: &[char]) -> Result398 fn c_literal(&mut self, chars: &[char]) -> Result {
399 debug_assert!(!chars.is_empty());
400 let mut chars: Box<Iterator<Item=&char>> =
401 if self.compiled.is_reverse {
402 Box::new(chars.iter().rev())
403 } else {
404 Box::new(chars.iter())
405 };
406 let first = *chars.next().expect("non-empty literal");
407 let Patch { mut hole, entry } = self.c_char(first)?;
408 for &c in chars {
409 let p = self.c_char(c)?;
410 self.fill(hole, p.entry);
411 hole = p.hole;
412 }
413 Ok(Patch { hole: hole, entry: entry })
414 }
415
c_char(&mut self, c: char) -> Result416 fn c_char(&mut self, c: char) -> Result {
417 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
418 }
419
c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result420 fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result {
421 assert!(!ranges.is_empty());
422 if self.compiled.uses_bytes() {
423 CompileClass {
424 c: self,
425 ranges: ranges,
426 }.compile()
427 } else {
428 let ranges: Vec<(char, char)> =
429 ranges.iter().map(|r| (r.start(), r.end())).collect();
430 let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
431 self.push_hole(InstHole::Char { c: ranges[0].0 })
432 } else {
433 self.push_hole(InstHole::Ranges { ranges: ranges })
434 };
435 Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
436 }
437 }
438
c_bytes(&mut self, bytes: &[u8]) -> Result439 fn c_bytes(&mut self, bytes: &[u8]) -> Result {
440 debug_assert!(!bytes.is_empty());
441 let mut bytes: Box<Iterator<Item=&u8>> =
442 if self.compiled.is_reverse {
443 Box::new(bytes.iter().rev())
444 } else {
445 Box::new(bytes.iter())
446 };
447 let first = *bytes.next().expect("non-empty literal");
448 let Patch { mut hole, entry } = self.c_byte(first)?;
449 for &b in bytes {
450 let p = self.c_byte(b)?;
451 self.fill(hole, p.entry);
452 hole = p.hole;
453 }
454 Ok(Patch { hole: hole, entry: entry })
455 }
456
c_byte(&mut self, b: u8) -> Result457 fn c_byte(&mut self, b: u8) -> Result {
458 self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
459 }
460
c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result461 fn c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result {
462 debug_assert!(!ranges.is_empty());
463
464 let first_split_entry = self.insts.len();
465 let mut holes = vec![];
466 let mut prev_hole = Hole::None;
467 for r in &ranges[0..ranges.len() - 1] {
468 self.fill_to_next(prev_hole);
469 let split = self.push_split_hole();
470 let next = self.insts.len();
471 self.byte_classes.set_range(r.start(), r.end());
472 holes.push(self.push_hole(InstHole::Bytes {
473 start: r.start(), end: r.end(),
474 }));
475 prev_hole = self.fill_split(split, Some(next), None);
476 }
477 let next = self.insts.len();
478 let r = &ranges[ranges.len() - 1];
479 self.byte_classes.set_range(r.start(), r.end());
480 holes.push(self.push_hole(InstHole::Bytes {
481 start: r.start(), end: r.end(),
482 }));
483 self.fill(prev_hole, next);
484 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
485 }
486
c_empty_look(&mut self, look: EmptyLook) -> Result487 fn c_empty_look(&mut self, look: EmptyLook) -> Result {
488 let hole = self.push_hole(InstHole::EmptyLook { look: look });
489 Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
490 }
491
c_concat<'a, I>(&mut self, exprs: I) -> Result where I: IntoIterator<Item=&'a Hir>492 fn c_concat<'a, I>(&mut self, exprs: I) -> Result
493 where I: IntoIterator<Item=&'a Hir> {
494 let mut exprs = exprs.into_iter();
495 let first = match exprs.next() {
496 Some(expr) => expr,
497 None => {
498 return Ok(Patch { hole: Hole::None, entry: self.insts.len() })
499 }
500 };
501 let Patch { mut hole, entry } = self.c(first)?;
502 for e in exprs {
503 let p = self.c(e)?;
504 self.fill(hole, p.entry);
505 hole = p.hole;
506 }
507 Ok(Patch { hole: hole, entry: entry })
508 }
509
c_alternate(&mut self, exprs: &[Hir]) -> Result510 fn c_alternate(&mut self, exprs: &[Hir]) -> Result {
511 debug_assert!(
512 exprs.len() >= 2, "alternates must have at least 2 exprs");
513
514 // Initial entry point is always the first split.
515 let first_split_entry = self.insts.len();
516
517 // Save up all of the holes from each alternate. They will all get
518 // patched to point to the same location.
519 let mut holes = vec![];
520
521 let mut prev_hole = Hole::None;
522 for e in &exprs[0..exprs.len() - 1] {
523 self.fill_to_next(prev_hole);
524 let split = self.push_split_hole();
525 let prev_entry = self.insts.len();
526 let Patch { hole, entry } = self.c(e)?;
527 if prev_entry == self.insts.len() {
528 // TODO(burntsushi): It is kind of silly that we don't support
529 // empty-subexpressions in alternates, but it is supremely
530 // awkward to support them in the existing compiler
531 // infrastructure. This entire compiler needs to be thrown out
532 // anyway, so don't feel too bad.
533 return Err(Error::Syntax(
534 "alternations cannot currently contain \
535 empty sub-expressions".to_string()));
536 }
537 holes.push(hole);
538 prev_hole = self.fill_split(split, Some(entry), None);
539 }
540 let prev_entry = self.insts.len();
541 let Patch { hole, entry } = self.c(&exprs[exprs.len() - 1])?;
542 if prev_entry == self.insts.len() {
543 // TODO(burntsushi): See TODO above.
544 return Err(Error::Syntax(
545 "alternations cannot currently contain \
546 empty sub-expressions".to_string()));
547 }
548 holes.push(hole);
549 self.fill(prev_hole, entry);
550 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
551 }
552
c_repeat(&mut self, rep: &hir::Repetition) -> Result553 fn c_repeat(&mut self, rep: &hir::Repetition) -> Result {
554 use syntax::hir::RepetitionKind::*;
555 match rep.kind {
556 ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
557 ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
558 OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
559 Range(hir::RepetitionRange::Exactly(min_max)) => {
560 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
561 }
562 Range(hir::RepetitionRange::AtLeast(min)) => {
563 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
564 }
565 Range(hir::RepetitionRange::Bounded(min, max)) => {
566 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
567 }
568 }
569 }
570
c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result571 fn c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result {
572 let split_entry = self.insts.len();
573 let split = self.push_split_hole();
574 let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
575
576 let split_hole = if greedy {
577 self.fill_split(split, Some(entry_rep), None)
578 } else {
579 self.fill_split(split, None, Some(entry_rep))
580 };
581 let holes = vec![hole_rep, split_hole];
582 Ok(Patch { hole: Hole::Many(holes), entry: split_entry })
583 }
584
c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result585 fn c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
586 let split_entry = self.insts.len();
587 let split = self.push_split_hole();
588 let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
589
590 self.fill(hole_rep, split_entry);
591 let split_hole = if greedy {
592 self.fill_split(split, Some(entry_rep), None)
593 } else {
594 self.fill_split(split, None, Some(entry_rep))
595 };
596 Ok(Patch { hole: split_hole, entry: split_entry })
597 }
598
c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result599 fn c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
600 let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
601 self.fill_to_next(hole_rep);
602 let split = self.push_split_hole();
603
604 let split_hole = if greedy {
605 self.fill_split(split, Some(entry_rep), None)
606 } else {
607 self.fill_split(split, None, Some(entry_rep))
608 };
609 Ok(Patch { hole: split_hole, entry: entry_rep })
610 }
611
c_repeat_range_min_or_more( &mut self, expr: &Hir, greedy: bool, min: u32, ) -> Result612 fn c_repeat_range_min_or_more(
613 &mut self,
614 expr: &Hir,
615 greedy: bool,
616 min: u32,
617 ) -> Result {
618 let min = u32_to_usize(min);
619 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
620 let patch_rep = self.c_repeat_zero_or_more(expr, greedy)?;
621 self.fill(patch_concat.hole, patch_rep.entry);
622 Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry })
623 }
624
c_repeat_range( &mut self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> Result625 fn c_repeat_range(
626 &mut self,
627 expr: &Hir,
628 greedy: bool,
629 min: u32,
630 max: u32,
631 ) -> Result {
632 let (min, max) = (u32_to_usize(min), u32_to_usize(max));
633 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
634 let initial_entry = patch_concat.entry;
635 if min == max {
636 return Ok(patch_concat);
637 }
638 // It is much simpler to compile, e.g., `a{2,5}` as:
639 //
640 // aaa?a?a?
641 //
642 // But you end up with a sequence of instructions like this:
643 //
644 // 0: 'a'
645 // 1: 'a',
646 // 2: split(3, 4)
647 // 3: 'a'
648 // 4: split(5, 6)
649 // 5: 'a'
650 // 6: split(7, 8)
651 // 7: 'a'
652 // 8: MATCH
653 //
654 // This is *incredibly* inefficient because the splits end
655 // up forming a chain, which has to be resolved everything a
656 // transition is followed.
657 let mut holes = vec![];
658 let mut prev_hole = patch_concat.hole;
659 for _ in min..max {
660 self.fill_to_next(prev_hole);
661 let split = self.push_split_hole();
662 let Patch { hole, entry } = self.c(expr)?;
663 prev_hole = hole;
664 if greedy {
665 holes.push(self.fill_split(split, Some(entry), None));
666 } else {
667 holes.push(self.fill_split(split, None, Some(entry)));
668 }
669 }
670 holes.push(prev_hole);
671 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry })
672 }
673
fill(&mut self, hole: Hole, goto: InstPtr)674 fn fill(&mut self, hole: Hole, goto: InstPtr) {
675 match hole {
676 Hole::None => {}
677 Hole::One(pc) => {
678 self.insts[pc].fill(goto);
679 }
680 Hole::Many(holes) => {
681 for hole in holes {
682 self.fill(hole, goto);
683 }
684 }
685 }
686 }
687
fill_to_next(&mut self, hole: Hole)688 fn fill_to_next(&mut self, hole: Hole) {
689 let next = self.insts.len();
690 self.fill(hole, next);
691 }
692
fill_split( &mut self, hole: Hole, goto1: Option<InstPtr>, goto2: Option<InstPtr>, ) -> Hole693 fn fill_split(
694 &mut self,
695 hole: Hole,
696 goto1: Option<InstPtr>,
697 goto2: Option<InstPtr>,
698 ) -> Hole {
699 match hole {
700 Hole::None => Hole::None,
701 Hole::One(pc) => {
702 match (goto1, goto2) {
703 (Some(goto1), Some(goto2)) => {
704 self.insts[pc].fill_split(goto1, goto2);
705 Hole::None
706 }
707 (Some(goto1), None) => {
708 self.insts[pc].half_fill_split_goto1(goto1);
709 Hole::One(pc)
710 }
711 (None, Some(goto2)) => {
712 self.insts[pc].half_fill_split_goto2(goto2);
713 Hole::One(pc)
714 }
715 (None, None) => unreachable!("at least one of the split \
716 holes must be filled"),
717 }
718 }
719 Hole::Many(holes) => {
720 let mut new_holes = vec![];
721 for hole in holes {
722 new_holes.push(self.fill_split(hole, goto1, goto2));
723 }
724 if new_holes.is_empty() {
725 Hole::None
726 } else if new_holes.len() == 1 {
727 new_holes.pop().unwrap()
728 } else {
729 Hole::Many(new_holes)
730 }
731 }
732 }
733 }
734
push_compiled(&mut self, inst: Inst)735 fn push_compiled(&mut self, inst: Inst) {
736 self.insts.push(MaybeInst::Compiled(inst));
737 }
738
push_hole(&mut self, inst: InstHole) -> Hole739 fn push_hole(&mut self, inst: InstHole) -> Hole {
740 let hole = self.insts.len();
741 self.insts.push(MaybeInst::Uncompiled(inst));
742 Hole::One(hole)
743 }
744
push_split_hole(&mut self) -> Hole745 fn push_split_hole(&mut self) -> Hole {
746 let hole = self.insts.len();
747 self.insts.push(MaybeInst::Split);
748 Hole::One(hole)
749 }
750
check_size(&self) -> result::Result<(), Error>751 fn check_size(&self) -> result::Result<(), Error> {
752 use std::mem::size_of;
753
754 if self.insts.len() * size_of::<Inst>() > self.size_limit {
755 Err(Error::CompiledTooBig(self.size_limit))
756 } else {
757 Ok(())
758 }
759 }
760 }
761
762 #[derive(Debug)]
763 enum Hole {
764 None,
765 One(InstPtr),
766 Many(Vec<Hole>),
767 }
768
769 #[derive(Clone, Debug)]
770 enum MaybeInst {
771 Compiled(Inst),
772 Uncompiled(InstHole),
773 Split,
774 Split1(InstPtr),
775 Split2(InstPtr),
776 }
777
778 impl MaybeInst {
fill(&mut self, goto: InstPtr)779 fn fill(&mut self, goto: InstPtr) {
780 let filled = match *self {
781 MaybeInst::Uncompiled(ref inst) => inst.fill(goto),
782 MaybeInst::Split1(goto1) => {
783 Inst::Split(InstSplit { goto1: goto1, goto2: goto })
784 }
785 MaybeInst::Split2(goto2) => {
786 Inst::Split(InstSplit { goto1: goto, goto2: goto2 })
787 }
788 _ => unreachable!("not all instructions were compiled! \
789 found uncompiled instruction: {:?}", self),
790 };
791 *self = MaybeInst::Compiled(filled);
792 }
793
fill_split(&mut self, goto1: InstPtr, goto2: InstPtr)794 fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
795 let filled = match *self {
796 MaybeInst::Split => {
797 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
798 }
799 _ => unreachable!("must be called on Split instruction, \
800 instead it was called on: {:?}", self),
801 };
802 *self = MaybeInst::Compiled(filled);
803 }
804
half_fill_split_goto1(&mut self, goto1: InstPtr)805 fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
806 let half_filled = match *self {
807 MaybeInst::Split => goto1,
808 _ => unreachable!("must be called on Split instruction, \
809 instead it was called on: {:?}", self),
810 };
811 *self = MaybeInst::Split1(half_filled);
812 }
813
half_fill_split_goto2(&mut self, goto2: InstPtr)814 fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
815 let half_filled = match *self {
816 MaybeInst::Split => goto2,
817 _ => unreachable!("must be called on Split instruction, \
818 instead it was called on: {:?}", self),
819 };
820 *self = MaybeInst::Split2(half_filled);
821 }
822
unwrap(self) -> Inst823 fn unwrap(self) -> Inst {
824 match self {
825 MaybeInst::Compiled(inst) => inst,
826 _ => unreachable!("must be called on a compiled instruction, \
827 instead it was called on: {:?}", self),
828 }
829 }
830 }
831
832 #[derive(Clone, Debug)]
833 enum InstHole {
834 Save { slot: usize },
835 EmptyLook { look: EmptyLook },
836 Char { c: char },
837 Ranges { ranges: Vec<(char, char)> },
838 Bytes { start: u8, end: u8 },
839 }
840
841 impl InstHole {
fill(&self, goto: InstPtr) -> Inst842 fn fill(&self, goto: InstPtr) -> Inst {
843 match *self {
844 InstHole::Save { slot } => Inst::Save(InstSave {
845 goto: goto,
846 slot: slot,
847 }),
848 InstHole::EmptyLook { look } => Inst::EmptyLook(InstEmptyLook {
849 goto: goto,
850 look: look,
851 }),
852 InstHole::Char { c } => Inst::Char(InstChar {
853 goto: goto,
854 c: c,
855 }),
856 InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
857 goto: goto,
858 ranges: ranges.clone(),
859 }),
860 InstHole::Bytes { start, end } => Inst::Bytes(InstBytes {
861 goto: goto,
862 start: start,
863 end: end,
864 }),
865 }
866 }
867 }
868
869 struct CompileClass<'a, 'b> {
870 c: &'a mut Compiler,
871 ranges: &'b [hir::ClassUnicodeRange],
872 }
873
874 impl<'a, 'b> CompileClass<'a, 'b> {
compile(mut self) -> Result875 fn compile(mut self) -> Result {
876 let mut holes = vec![];
877 let mut initial_entry = None;
878 let mut last_split = Hole::None;
879 let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
880 self.c.suffix_cache.clear();
881
882 for (i, range) in self.ranges.iter().enumerate() {
883 let is_last_range = i + 1 == self.ranges.len();
884 utf8_seqs.reset(range.start(), range.end());
885 let mut it = (&mut utf8_seqs).peekable();
886 loop {
887 let utf8_seq = match it.next() {
888 None => break,
889 Some(utf8_seq) => utf8_seq,
890 };
891 if is_last_range && it.peek().is_none() {
892 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
893 holes.push(hole);
894 self.c.fill(last_split, entry);
895 last_split = Hole::None;
896 if initial_entry.is_none() {
897 initial_entry = Some(entry);
898 }
899 } else {
900 if initial_entry.is_none() {
901 initial_entry = Some(self.c.insts.len());
902 }
903 self.c.fill_to_next(last_split);
904 last_split = self.c.push_split_hole();
905 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
906 holes.push(hole);
907 last_split = self.c.fill_split(last_split, Some(entry), None);
908 }
909 }
910 }
911 self.c.utf8_seqs = Some(utf8_seqs);
912 Ok(Patch {
913 hole: Hole::Many(holes),
914 entry: initial_entry.unwrap(),
915 })
916 }
917
c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result918 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
919 if self.c.compiled.is_reverse {
920 self.c_utf8_seq_(seq)
921 } else {
922 self.c_utf8_seq_(seq.into_iter().rev())
923 }
924 }
925
c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result where I: IntoIterator<Item=&'r Utf8Range>926 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
927 where I: IntoIterator<Item=&'r Utf8Range> {
928 // The initial instruction for each UTF-8 sequence should be the same.
929 let mut from_inst = ::std::usize::MAX;
930 let mut last_hole = Hole::None;
931 for byte_range in seq {
932 let key = SuffixCacheKey {
933 from_inst: from_inst,
934 start: byte_range.start,
935 end: byte_range.end,
936 };
937 {
938 let pc = self.c.insts.len();
939 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
940 from_inst = cached_pc;
941 continue;
942 }
943 }
944 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
945 if from_inst == ::std::usize::MAX {
946 last_hole = self.c.push_hole(InstHole::Bytes {
947 start: byte_range.start,
948 end: byte_range.end,
949 });
950 } else {
951 self.c.push_compiled(Inst::Bytes(InstBytes {
952 goto: from_inst,
953 start: byte_range.start,
954 end: byte_range.end,
955 }));
956 }
957 from_inst = self.c.insts.len().checked_sub(1).unwrap();
958 debug_assert!(from_inst < ::std::usize::MAX);
959 }
960 debug_assert!(from_inst < ::std::usize::MAX);
961 Ok(Patch { hole: last_hole, entry: from_inst })
962 }
963 }
964
965 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
966 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
967 /// The set of byte ranges looks like this:
968 ///
969 /// [0-7F]
970 /// [C2-DF][80-BF]
971 /// [E0][A0-BF][80-BF]
972 /// [E1-EC][80-BF][80-BF]
973 /// [ED][80-9F][80-BF]
974 /// [EE-EF][80-BF][80-BF]
975 ///
976 /// Each line above translates to one alternate in the compiled regex program.
977 /// However, all but one of the alternates end in the same suffix, which is
978 /// a waste of an instruction. The suffix cache facilitates reusing them across
979 /// alternates.
980 ///
981 /// Note that a HashMap could be trivially used for this, but we don't need its
982 /// overhead. Some small bounded space (LRU style) is more than enough.
983 struct SuffixCache {
984 table: Vec<SuffixCacheEntry>,
985 // Every time the cache is cleared, we increment the version number instead
986 // of actually zeroing memory. Since we store a copy of the current version
987 // in every element, all we need to do is make sure to invalidate any stale
988 // entries upon access. This saves quite a bit of time!
989 version: usize,
990 }
991
992 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
993 struct SuffixCacheEntry {
994 key: SuffixCacheKey,
995 pc: InstPtr,
996 version: usize,
997 }
998
999 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1000 struct SuffixCacheKey {
1001 from_inst: InstPtr,
1002 start: u8,
1003 end: u8,
1004 }
1005
1006 impl SuffixCache {
new(size: usize) -> Self1007 fn new(size: usize) -> Self {
1008 SuffixCache {
1009 table: vec![SuffixCacheEntry::default(); size],
1010 version: 0,
1011 }
1012 }
1013
get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr>1014 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1015 let h = self.hash(&key);
1016 let e = self.table[h];
1017 if e.key == key && e.version == self.version {
1018 Some(e.pc)
1019 } else {
1020 self.table[h] = SuffixCacheEntry {
1021 key: key,
1022 pc: pc,
1023 version: self.version,
1024 };
1025 None
1026 }
1027 }
1028
clear(&mut self)1029 fn clear(&mut self) {
1030 self.version += 1;
1031 }
1032
hash(&self, suffix: &SuffixCacheKey) -> usize1033 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1034 // Basic FNV-1a hash as described:
1035 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1036 const FNV_PRIME: u64 = 1099511628211;
1037 let mut h = 14695981039346656037;
1038 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1039 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1040 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1041 (h as usize) % self.table.len()
1042 }
1043 }
1044
1045 struct ByteClassSet([bool; 256]);
1046
1047 impl ByteClassSet {
new() -> Self1048 fn new() -> Self {
1049 ByteClassSet([false; 256])
1050 }
1051
set_range(&mut self, start: u8, end: u8)1052 fn set_range(&mut self, start: u8, end: u8) {
1053 debug_assert!(start <= end);
1054 if start > 0 {
1055 self.0[start as usize - 1] = true;
1056 }
1057 self.0[end as usize] = true;
1058 }
1059
set_word_boundary(&mut self)1060 fn set_word_boundary(&mut self) {
1061 // We need to mark all ranges of bytes whose pairs result in
1062 // evaluating \b differently.
1063 let iswb = is_word_byte;
1064 let mut b1: u16 = 0;
1065 let mut b2: u16;
1066 while b1 <= 255 {
1067 b2 = b1 + 1;
1068 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1069 b2 += 1;
1070 }
1071 self.set_range(b1 as u8, (b2 - 1) as u8);
1072 b1 = b2;
1073 }
1074 }
1075
byte_classes(&self) -> Vec<u8>1076 fn byte_classes(&self) -> Vec<u8> {
1077 // N.B. If you're debugging the DFA, it's useful to simply return
1078 // `(0..256).collect()`, which effectively removes the byte classes
1079 // and makes the transitions easier to read.
1080 // (0usize..256).map(|x| x as u8).collect()
1081 let mut byte_classes = vec![0; 256];
1082 let mut class = 0u8;
1083 let mut i = 0;
1084 loop {
1085 byte_classes[i] = class as u8;
1086 if i >= 255 {
1087 break;
1088 }
1089 if self.0[i] {
1090 class = class.checked_add(1).unwrap();
1091 }
1092 i += 1;
1093 }
1094 byte_classes
1095 }
1096 }
1097
u32_to_usize(n: u32) -> usize1098 fn u32_to_usize(n: u32) -> usize {
1099 // In case usize is less than 32 bits, we need to guard against overflow.
1100 // On most platforms this compiles to nothing.
1101 // TODO Use `std::convert::TryFrom` once it's stable.
1102 if (n as u64) > (::std::usize::MAX as u64) {
1103 panic!("BUG: {} is too big to be pointer sized", n)
1104 }
1105 n as usize
1106 }
1107
1108 #[cfg(test)]
1109 mod tests {
1110 use super::ByteClassSet;
1111
1112 #[test]
byte_classes()1113 fn byte_classes() {
1114 let mut set = ByteClassSet::new();
1115 set.set_range(b'a', b'z');
1116 let classes = set.byte_classes();
1117 assert_eq!(classes[0], 0);
1118 assert_eq!(classes[1], 0);
1119 assert_eq!(classes[2], 0);
1120 assert_eq!(classes[b'a' as usize - 1], 0);
1121 assert_eq!(classes[b'a' as usize], 1);
1122 assert_eq!(classes[b'm' as usize], 1);
1123 assert_eq!(classes[b'z' as usize], 1);
1124 assert_eq!(classes[b'z' as usize + 1], 2);
1125 assert_eq!(classes[254], 2);
1126 assert_eq!(classes[255], 2);
1127
1128 let mut set = ByteClassSet::new();
1129 set.set_range(0, 2);
1130 set.set_range(4, 6);
1131 let classes = set.byte_classes();
1132 assert_eq!(classes[0], 0);
1133 assert_eq!(classes[1], 0);
1134 assert_eq!(classes[2], 0);
1135 assert_eq!(classes[3], 1);
1136 assert_eq!(classes[4], 2);
1137 assert_eq!(classes[5], 2);
1138 assert_eq!(classes[6], 2);
1139 assert_eq!(classes[7], 3);
1140 assert_eq!(classes[255], 3);
1141 }
1142
1143 #[test]
full_byte_classes()1144 fn full_byte_classes() {
1145 let mut set = ByteClassSet::new();
1146 for i in 0..256u16 {
1147 set.set_range(i as u8, i as u8);
1148 }
1149 assert_eq!(set.byte_classes().len(), 256);
1150 }
1151 }
1152