1 use std::collections::HashMap;
2 use std::iter;
3 use std::result;
4 use std::sync::Arc;
5 
6 use syntax::hir::{self, Hir};
7 use syntax::is_word_byte;
8 use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
9 
10 use prog::{
11     EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
12     InstSave, InstSplit, Program,
13 };
14 
15 use Error;
16 
17 type Result = result::Result<Patch, Error>;
18 
19 #[derive(Debug)]
20 struct Patch {
21     hole: Hole,
22     entry: InstPtr,
23 }
24 
25 /// A compiler translates a regular expression AST to a sequence of
26 /// instructions. The sequence of instructions represents an NFA.
27 pub struct Compiler {
28     insts: Vec<MaybeInst>,
29     compiled: Program,
30     capture_name_idx: HashMap<String, usize>,
31     num_exprs: usize,
32     size_limit: usize,
33     suffix_cache: SuffixCache,
34     utf8_seqs: Option<Utf8Sequences>,
35     byte_classes: ByteClassSet,
36 }
37 
38 impl Compiler {
39     /// Create a new regular expression compiler.
40     ///
41     /// Various options can be set before calling `compile` on an expression.
new() -> Self42     pub fn new() -> Self {
43         Compiler {
44             insts: vec![],
45             compiled: Program::new(),
46             capture_name_idx: HashMap::new(),
47             num_exprs: 0,
48             size_limit: 10 * (1 << 20),
49             suffix_cache: SuffixCache::new(1000),
50             utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
51             byte_classes: ByteClassSet::new(),
52         }
53     }
54 
55     /// The size of the resulting program is limited by size_limit. If
56     /// the program approximately exceeds the given size (in bytes), then
57     /// compilation will stop and return an error.
size_limit(mut self, size_limit: usize) -> Self58     pub fn size_limit(mut self, size_limit: usize) -> Self {
59         self.size_limit = size_limit;
60         self
61     }
62 
63     /// If bytes is true, then the program is compiled as a byte based
64     /// automaton, which incorporates UTF-8 decoding into the machine. If it's
65     /// false, then the automaton is Unicode scalar value based, e.g., an
66     /// engine utilizing such an automaton is responsible for UTF-8 decoding.
67     ///
68     /// The specific invariant is that when returning a byte based machine,
69     /// the neither the `Char` nor `Ranges` instructions are produced.
70     /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
71     /// instruction is never produced.
72     ///
73     /// Note that `dfa(true)` implies `bytes(true)`.
bytes(mut self, yes: bool) -> Self74     pub fn bytes(mut self, yes: bool) -> Self {
75         self.compiled.is_bytes = yes;
76         self
77     }
78 
79     /// When disabled, the program compiled may match arbitrary bytes.
80     ///
81     /// When enabled (the default), all compiled programs exclusively match
82     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self83     pub fn only_utf8(mut self, yes: bool) -> Self {
84         self.compiled.only_utf8 = yes;
85         self
86     }
87 
88     /// When set, the machine returned is suitable for use in the DFA matching
89     /// engine.
90     ///
91     /// In particular, this ensures that if the regex is not anchored in the
92     /// beginning, then a preceding `.*?` is included in the program. (The NFA
93     /// based engines handle the preceding `.*?` explicitly, which is difficult
94     /// or impossible in the DFA engine.)
dfa(mut self, yes: bool) -> Self95     pub fn dfa(mut self, yes: bool) -> Self {
96         self.compiled.is_dfa = yes;
97         self
98     }
99 
100     /// When set, the machine returned is suitable for matching text in
101     /// reverse. In particular, all concatenations are flipped.
reverse(mut self, yes: bool) -> Self102     pub fn reverse(mut self, yes: bool) -> Self {
103         self.compiled.is_reverse = yes;
104         self
105     }
106 
107     /// Compile a regular expression given its AST.
108     ///
109     /// The compiler is guaranteed to succeed unless the program exceeds the
110     /// specified size limit. If the size limit is exceeded, then compilation
111     /// stops and returns an error.
compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error>112     pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
113         debug_assert!(!exprs.is_empty());
114         self.num_exprs = exprs.len();
115         if exprs.len() == 1 {
116             self.compile_one(&exprs[0])
117         } else {
118             self.compile_many(exprs)
119         }
120     }
121 
compile_one(mut self, expr: &Hir) -> result::Result<Program, Error>122     fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
123         // If we're compiling a forward DFA and we aren't anchored, then
124         // add a `.*?` before the first capture group.
125         // Other matching engines handle this by baking the logic into the
126         // matching engine itself.
127         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
128         self.compiled.is_anchored_start = expr.is_anchored_start();
129         self.compiled.is_anchored_end = expr.is_anchored_end();
130         if self.compiled.needs_dotstar() {
131             dotstar_patch = self.c_dotstar()?;
132             self.compiled.start = dotstar_patch.entry;
133         }
134         self.compiled.captures = vec![None];
135         let patch = self.c_capture(0, expr)?;
136         if self.compiled.needs_dotstar() {
137             self.fill(dotstar_patch.hole, patch.entry);
138         } else {
139             self.compiled.start = patch.entry;
140         }
141         self.fill_to_next(patch.hole);
142         self.compiled.matches = vec![self.insts.len()];
143         self.push_compiled(Inst::Match(0));
144         self.compile_finish()
145     }
146 
compile_many( mut self, exprs: &[Hir], ) -> result::Result<Program, Error>147     fn compile_many(
148         mut self,
149         exprs: &[Hir],
150     ) -> result::Result<Program, Error> {
151         debug_assert!(exprs.len() > 1);
152 
153         self.compiled.is_anchored_start =
154             exprs.iter().all(|e| e.is_anchored_start());
155         self.compiled.is_anchored_end =
156             exprs.iter().all(|e| e.is_anchored_end());
157         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
158         if self.compiled.needs_dotstar() {
159             dotstar_patch = self.c_dotstar()?;
160             self.compiled.start = dotstar_patch.entry;
161         } else {
162             self.compiled.start = 0; // first instruction is always split
163         }
164         self.fill_to_next(dotstar_patch.hole);
165 
166         let mut prev_hole = Hole::None;
167         for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
168             self.fill_to_next(prev_hole);
169             let split = self.push_split_hole();
170             let Patch { hole, entry } = self.c_capture(0, expr)?;
171             self.fill_to_next(hole);
172             self.compiled.matches.push(self.insts.len());
173             self.push_compiled(Inst::Match(i));
174             prev_hole = self.fill_split(split, Some(entry), None);
175         }
176         let i = exprs.len() - 1;
177         let Patch { hole, entry } = self.c_capture(0, &exprs[i])?;
178         self.fill(prev_hole, entry);
179         self.fill_to_next(hole);
180         self.compiled.matches.push(self.insts.len());
181         self.push_compiled(Inst::Match(i));
182         self.compile_finish()
183     }
184 
compile_finish(mut self) -> result::Result<Program, Error>185     fn compile_finish(mut self) -> result::Result<Program, Error> {
186         self.compiled.insts =
187             self.insts.into_iter().map(|inst| inst.unwrap()).collect();
188         self.compiled.byte_classes = self.byte_classes.byte_classes();
189         self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
190         Ok(self.compiled)
191     }
192 
193     /// Compile expr into self.insts, returning a patch on success,
194     /// or an error if we run out of memory.
195     ///
196     /// All of the c_* methods of the compiler share the contract outlined
197     /// here.
198     ///
199     /// The main thing that a c_* method does is mutate `self.insts`
200     /// to add a list of mostly compiled instructions required to execute
201     /// the given expression. `self.insts` contains MaybeInsts rather than
202     /// Insts because there is some backpatching required.
203     ///
204     /// The `Patch` value returned by each c_* method provides metadata
205     /// about the compiled instructions emitted to `self.insts`. The
206     /// `entry` member of the patch refers to the first instruction
207     /// (the entry point), while the `hole` member contains zero or
208     /// more offsets to partial instructions that need to be backpatched.
209     /// The c_* routine can't know where its list of instructions are going to
210     /// jump to after execution, so it is up to the caller to patch
211     /// these jumps to point to the right place. So compiling some
212     /// expression, e, we would end up with a situation that looked like:
213     ///
214     /// ```text
215     /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
216     ///                     ^              ^             ^
217     ///                     |                \         /
218     ///                   entry                \     /
219     ///                                         hole
220     /// ```
221     ///
222     /// To compile two expressions, e1 and e2, concatinated together we
223     /// would do:
224     ///
225     /// ```ignore
226     /// let patch1 = self.c(e1);
227     /// let patch2 = self.c(e2);
228     /// ```
229     ///
230     /// while leaves us with a situation that looks like
231     ///
232     /// ```text
233     /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
234     ///                     ^        ^            ^        ^
235     ///                     |        |            |        |
236     ///                entry1        hole1   entry2        hole2
237     /// ```
238     ///
239     /// Then to merge the two patches together into one we would backpatch
240     /// hole1 with entry2 and return a new patch that enters at entry1
241     /// and has hole2 for a hole. In fact, if you look at the c_concat
242     /// method you will see that it does exactly this, though it handles
243     /// a list of expressions rather than just the two that we use for
244     /// an example.
c(&mut self, expr: &Hir) -> Result245     fn c(&mut self, expr: &Hir) -> Result {
246         use prog;
247         use syntax::hir::HirKind::*;
248 
249         self.check_size()?;
250         match *expr.kind() {
251             Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }),
252             Literal(hir::Literal::Unicode(c)) => self.c_char(c),
253             Literal(hir::Literal::Byte(b)) => {
254                 assert!(self.compiled.uses_bytes());
255                 self.c_byte(b)
256             }
257             Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
258             Class(hir::Class::Bytes(ref cls)) => {
259                 if self.compiled.uses_bytes() {
260                     self.c_class_bytes(cls.ranges())
261                 } else {
262                     assert!(cls.is_all_ascii());
263                     let mut char_ranges = vec![];
264                     for r in cls.iter() {
265                         let (s, e) = (r.start() as char, r.end() as char);
266                         char_ranges.push(hir::ClassUnicodeRange::new(s, e));
267                     }
268                     self.c_class(&char_ranges)
269                 }
270             }
271             Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
272                 self.byte_classes.set_range(b'\n', b'\n');
273                 self.c_empty_look(prog::EmptyLook::EndLine)
274             }
275             Anchor(hir::Anchor::StartLine) => {
276                 self.byte_classes.set_range(b'\n', b'\n');
277                 self.c_empty_look(prog::EmptyLook::StartLine)
278             }
279             Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
280                 self.byte_classes.set_range(b'\n', b'\n');
281                 self.c_empty_look(prog::EmptyLook::StartLine)
282             }
283             Anchor(hir::Anchor::EndLine) => {
284                 self.byte_classes.set_range(b'\n', b'\n');
285                 self.c_empty_look(prog::EmptyLook::EndLine)
286             }
287             Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
288                 self.c_empty_look(prog::EmptyLook::EndText)
289             }
290             Anchor(hir::Anchor::StartText) => {
291                 self.c_empty_look(prog::EmptyLook::StartText)
292             }
293             Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
294                 self.c_empty_look(prog::EmptyLook::StartText)
295             }
296             Anchor(hir::Anchor::EndText) => {
297                 self.c_empty_look(prog::EmptyLook::EndText)
298             }
299             WordBoundary(hir::WordBoundary::Unicode) => {
300                 if !cfg!(feature = "unicode-perl") {
301                     return Err(Error::Syntax(
302                         "Unicode word boundaries are unavailable when \
303                          the unicode-perl feature is disabled"
304                             .to_string(),
305                     ));
306                 }
307                 self.compiled.has_unicode_word_boundary = true;
308                 self.byte_classes.set_word_boundary();
309                 self.c_empty_look(prog::EmptyLook::WordBoundary)
310             }
311             WordBoundary(hir::WordBoundary::UnicodeNegate) => {
312                 if !cfg!(feature = "unicode-perl") {
313                     return Err(Error::Syntax(
314                         "Unicode word boundaries are unavailable when \
315                          the unicode-perl feature is disabled"
316                             .to_string(),
317                     ));
318                 }
319                 self.compiled.has_unicode_word_boundary = true;
320                 self.byte_classes.set_word_boundary();
321                 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
322             }
323             WordBoundary(hir::WordBoundary::Ascii) => {
324                 self.byte_classes.set_word_boundary();
325                 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
326             }
327             WordBoundary(hir::WordBoundary::AsciiNegate) => {
328                 self.byte_classes.set_word_boundary();
329                 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
330             }
331             Group(ref g) => match g.kind {
332                 hir::GroupKind::NonCapturing => self.c(&g.hir),
333                 hir::GroupKind::CaptureIndex(index) => {
334                     if index as usize >= self.compiled.captures.len() {
335                         self.compiled.captures.push(None);
336                     }
337                     self.c_capture(2 * index as usize, &g.hir)
338                 }
339                 hir::GroupKind::CaptureName { index, ref name } => {
340                     if index as usize >= self.compiled.captures.len() {
341                         let n = name.to_string();
342                         self.compiled.captures.push(Some(n.clone()));
343                         self.capture_name_idx.insert(n, index as usize);
344                     }
345                     self.c_capture(2 * index as usize, &g.hir)
346                 }
347             },
348             Concat(ref es) => {
349                 if self.compiled.is_reverse {
350                     self.c_concat(es.iter().rev())
351                 } else {
352                     self.c_concat(es)
353                 }
354             }
355             Alternation(ref es) => self.c_alternate(&**es),
356             Repetition(ref rep) => self.c_repeat(rep),
357         }
358     }
359 
c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result360     fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result {
361         if self.num_exprs > 1 || self.compiled.is_dfa {
362             // Don't ever compile Save instructions for regex sets because
363             // they are never used. They are also never used in DFA programs
364             // because DFAs can't handle captures.
365             self.c(expr)
366         } else {
367             let entry = self.insts.len();
368             let hole = self.push_hole(InstHole::Save { slot: first_slot });
369             let patch = self.c(expr)?;
370             self.fill(hole, patch.entry);
371             self.fill_to_next(patch.hole);
372             let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
373             Ok(Patch { hole: hole, entry: entry })
374         }
375     }
376 
c_dotstar(&mut self) -> Result377     fn c_dotstar(&mut self) -> Result {
378         Ok(if !self.compiled.only_utf8() {
379             self.c(&Hir::repetition(hir::Repetition {
380                 kind: hir::RepetitionKind::ZeroOrMore,
381                 greedy: false,
382                 hir: Box::new(Hir::any(true)),
383             }))?
384         } else {
385             self.c(&Hir::repetition(hir::Repetition {
386                 kind: hir::RepetitionKind::ZeroOrMore,
387                 greedy: false,
388                 hir: Box::new(Hir::any(false)),
389             }))?
390         })
391     }
392 
c_char(&mut self, c: char) -> Result393     fn c_char(&mut self, c: char) -> Result {
394         self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
395     }
396 
c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result397     fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result {
398         assert!(!ranges.is_empty());
399         if self.compiled.uses_bytes() {
400             CompileClass { c: self, ranges: ranges }.compile()
401         } else {
402             let ranges: Vec<(char, char)> =
403                 ranges.iter().map(|r| (r.start(), r.end())).collect();
404             let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
405                 self.push_hole(InstHole::Char { c: ranges[0].0 })
406             } else {
407                 self.push_hole(InstHole::Ranges { ranges: ranges })
408             };
409             Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
410         }
411     }
412 
c_byte(&mut self, b: u8) -> Result413     fn c_byte(&mut self, b: u8) -> Result {
414         self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
415     }
416 
c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result417     fn c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result {
418         debug_assert!(!ranges.is_empty());
419 
420         let first_split_entry = self.insts.len();
421         let mut holes = vec![];
422         let mut prev_hole = Hole::None;
423         for r in &ranges[0..ranges.len() - 1] {
424             self.fill_to_next(prev_hole);
425             let split = self.push_split_hole();
426             let next = self.insts.len();
427             self.byte_classes.set_range(r.start(), r.end());
428             holes.push(self.push_hole(InstHole::Bytes {
429                 start: r.start(),
430                 end: r.end(),
431             }));
432             prev_hole = self.fill_split(split, Some(next), None);
433         }
434         let next = self.insts.len();
435         let r = &ranges[ranges.len() - 1];
436         self.byte_classes.set_range(r.start(), r.end());
437         holes.push(
438             self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
439         );
440         self.fill(prev_hole, next);
441         Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
442     }
443 
c_empty_look(&mut self, look: EmptyLook) -> Result444     fn c_empty_look(&mut self, look: EmptyLook) -> Result {
445         let hole = self.push_hole(InstHole::EmptyLook { look: look });
446         Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
447     }
448 
c_concat<'a, I>(&mut self, exprs: I) -> Result where I: IntoIterator<Item = &'a Hir>,449     fn c_concat<'a, I>(&mut self, exprs: I) -> Result
450     where
451         I: IntoIterator<Item = &'a Hir>,
452     {
453         let mut exprs = exprs.into_iter();
454         let first = match exprs.next() {
455             Some(expr) => expr,
456             None => {
457                 return Ok(Patch { hole: Hole::None, entry: self.insts.len() })
458             }
459         };
460         let Patch { mut hole, entry } = self.c(first)?;
461         for e in exprs {
462             let p = self.c(e)?;
463             self.fill(hole, p.entry);
464             hole = p.hole;
465         }
466         Ok(Patch { hole: hole, entry: entry })
467     }
468 
c_alternate(&mut self, exprs: &[Hir]) -> Result469     fn c_alternate(&mut self, exprs: &[Hir]) -> Result {
470         debug_assert!(
471             exprs.len() >= 2,
472             "alternates must have at least 2 exprs"
473         );
474 
475         // Initial entry point is always the first split.
476         let first_split_entry = self.insts.len();
477 
478         // Save up all of the holes from each alternate. They will all get
479         // patched to point to the same location.
480         let mut holes = vec![];
481 
482         let mut prev_hole = Hole::None;
483         for e in &exprs[0..exprs.len() - 1] {
484             self.fill_to_next(prev_hole);
485             let split = self.push_split_hole();
486             let prev_entry = self.insts.len();
487             let Patch { hole, entry } = self.c(e)?;
488             if prev_entry == self.insts.len() {
489                 // TODO(burntsushi): It is kind of silly that we don't support
490                 // empty-subexpressions in alternates, but it is supremely
491                 // awkward to support them in the existing compiler
492                 // infrastructure. This entire compiler needs to be thrown out
493                 // anyway, so don't feel too bad.
494                 return Err(Error::Syntax(
495                     "alternations cannot currently contain \
496                      empty sub-expressions"
497                         .to_string(),
498                 ));
499             }
500             holes.push(hole);
501             prev_hole = self.fill_split(split, Some(entry), None);
502         }
503         let prev_entry = self.insts.len();
504         let Patch { hole, entry } = self.c(&exprs[exprs.len() - 1])?;
505         if prev_entry == self.insts.len() {
506             // TODO(burntsushi): See TODO above.
507             return Err(Error::Syntax(
508                 "alternations cannot currently contain \
509                  empty sub-expressions"
510                     .to_string(),
511             ));
512         }
513         holes.push(hole);
514         self.fill(prev_hole, entry);
515         Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
516     }
517 
c_repeat(&mut self, rep: &hir::Repetition) -> Result518     fn c_repeat(&mut self, rep: &hir::Repetition) -> Result {
519         use syntax::hir::RepetitionKind::*;
520         match rep.kind {
521             ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
522             ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
523             OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
524             Range(hir::RepetitionRange::Exactly(min_max)) => {
525                 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
526             }
527             Range(hir::RepetitionRange::AtLeast(min)) => {
528                 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
529             }
530             Range(hir::RepetitionRange::Bounded(min, max)) => {
531                 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
532             }
533         }
534     }
535 
c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result536     fn c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result {
537         let split_entry = self.insts.len();
538         let split = self.push_split_hole();
539         let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
540 
541         let split_hole = if greedy {
542             self.fill_split(split, Some(entry_rep), None)
543         } else {
544             self.fill_split(split, None, Some(entry_rep))
545         };
546         let holes = vec![hole_rep, split_hole];
547         Ok(Patch { hole: Hole::Many(holes), entry: split_entry })
548     }
549 
c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result550     fn c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
551         let split_entry = self.insts.len();
552         let split = self.push_split_hole();
553         let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
554 
555         self.fill(hole_rep, split_entry);
556         let split_hole = if greedy {
557             self.fill_split(split, Some(entry_rep), None)
558         } else {
559             self.fill_split(split, None, Some(entry_rep))
560         };
561         Ok(Patch { hole: split_hole, entry: split_entry })
562     }
563 
c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result564     fn c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
565         let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
566         self.fill_to_next(hole_rep);
567         let split = self.push_split_hole();
568 
569         let split_hole = if greedy {
570             self.fill_split(split, Some(entry_rep), None)
571         } else {
572             self.fill_split(split, None, Some(entry_rep))
573         };
574         Ok(Patch { hole: split_hole, entry: entry_rep })
575     }
576 
c_repeat_range_min_or_more( &mut self, expr: &Hir, greedy: bool, min: u32, ) -> Result577     fn c_repeat_range_min_or_more(
578         &mut self,
579         expr: &Hir,
580         greedy: bool,
581         min: u32,
582     ) -> Result {
583         let min = u32_to_usize(min);
584         let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
585         let patch_rep = self.c_repeat_zero_or_more(expr, greedy)?;
586         self.fill(patch_concat.hole, patch_rep.entry);
587         Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry })
588     }
589 
c_repeat_range( &mut self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> Result590     fn c_repeat_range(
591         &mut self,
592         expr: &Hir,
593         greedy: bool,
594         min: u32,
595         max: u32,
596     ) -> Result {
597         let (min, max) = (u32_to_usize(min), u32_to_usize(max));
598         let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
599         let initial_entry = patch_concat.entry;
600         if min == max {
601             return Ok(patch_concat);
602         }
603         // It is much simpler to compile, e.g., `a{2,5}` as:
604         //
605         //     aaa?a?a?
606         //
607         // But you end up with a sequence of instructions like this:
608         //
609         //     0: 'a'
610         //     1: 'a',
611         //     2: split(3, 4)
612         //     3: 'a'
613         //     4: split(5, 6)
614         //     5: 'a'
615         //     6: split(7, 8)
616         //     7: 'a'
617         //     8: MATCH
618         //
619         // This is *incredibly* inefficient because the splits end
620         // up forming a chain, which has to be resolved everything a
621         // transition is followed.
622         let mut holes = vec![];
623         let mut prev_hole = patch_concat.hole;
624         for _ in min..max {
625             self.fill_to_next(prev_hole);
626             let split = self.push_split_hole();
627             let Patch { hole, entry } = self.c(expr)?;
628             prev_hole = hole;
629             if greedy {
630                 holes.push(self.fill_split(split, Some(entry), None));
631             } else {
632                 holes.push(self.fill_split(split, None, Some(entry)));
633             }
634         }
635         holes.push(prev_hole);
636         Ok(Patch { hole: Hole::Many(holes), entry: initial_entry })
637     }
638 
fill(&mut self, hole: Hole, goto: InstPtr)639     fn fill(&mut self, hole: Hole, goto: InstPtr) {
640         match hole {
641             Hole::None => {}
642             Hole::One(pc) => {
643                 self.insts[pc].fill(goto);
644             }
645             Hole::Many(holes) => {
646                 for hole in holes {
647                     self.fill(hole, goto);
648                 }
649             }
650         }
651     }
652 
fill_to_next(&mut self, hole: Hole)653     fn fill_to_next(&mut self, hole: Hole) {
654         let next = self.insts.len();
655         self.fill(hole, next);
656     }
657 
fill_split( &mut self, hole: Hole, goto1: Option<InstPtr>, goto2: Option<InstPtr>, ) -> Hole658     fn fill_split(
659         &mut self,
660         hole: Hole,
661         goto1: Option<InstPtr>,
662         goto2: Option<InstPtr>,
663     ) -> Hole {
664         match hole {
665             Hole::None => Hole::None,
666             Hole::One(pc) => match (goto1, goto2) {
667                 (Some(goto1), Some(goto2)) => {
668                     self.insts[pc].fill_split(goto1, goto2);
669                     Hole::None
670                 }
671                 (Some(goto1), None) => {
672                     self.insts[pc].half_fill_split_goto1(goto1);
673                     Hole::One(pc)
674                 }
675                 (None, Some(goto2)) => {
676                     self.insts[pc].half_fill_split_goto2(goto2);
677                     Hole::One(pc)
678                 }
679                 (None, None) => unreachable!(
680                     "at least one of the split \
681                      holes must be filled"
682                 ),
683             },
684             Hole::Many(holes) => {
685                 let mut new_holes = vec![];
686                 for hole in holes {
687                     new_holes.push(self.fill_split(hole, goto1, goto2));
688                 }
689                 if new_holes.is_empty() {
690                     Hole::None
691                 } else if new_holes.len() == 1 {
692                     new_holes.pop().unwrap()
693                 } else {
694                     Hole::Many(new_holes)
695                 }
696             }
697         }
698     }
699 
push_compiled(&mut self, inst: Inst)700     fn push_compiled(&mut self, inst: Inst) {
701         self.insts.push(MaybeInst::Compiled(inst));
702     }
703 
push_hole(&mut self, inst: InstHole) -> Hole704     fn push_hole(&mut self, inst: InstHole) -> Hole {
705         let hole = self.insts.len();
706         self.insts.push(MaybeInst::Uncompiled(inst));
707         Hole::One(hole)
708     }
709 
push_split_hole(&mut self) -> Hole710     fn push_split_hole(&mut self) -> Hole {
711         let hole = self.insts.len();
712         self.insts.push(MaybeInst::Split);
713         Hole::One(hole)
714     }
715 
check_size(&self) -> result::Result<(), Error>716     fn check_size(&self) -> result::Result<(), Error> {
717         use std::mem::size_of;
718 
719         if self.insts.len() * size_of::<Inst>() > self.size_limit {
720             Err(Error::CompiledTooBig(self.size_limit))
721         } else {
722             Ok(())
723         }
724     }
725 }
726 
727 #[derive(Debug)]
728 enum Hole {
729     None,
730     One(InstPtr),
731     Many(Vec<Hole>),
732 }
733 
734 #[derive(Clone, Debug)]
735 enum MaybeInst {
736     Compiled(Inst),
737     Uncompiled(InstHole),
738     Split,
739     Split1(InstPtr),
740     Split2(InstPtr),
741 }
742 
743 impl MaybeInst {
fill(&mut self, goto: InstPtr)744     fn fill(&mut self, goto: InstPtr) {
745         let filled = match *self {
746             MaybeInst::Uncompiled(ref inst) => inst.fill(goto),
747             MaybeInst::Split1(goto1) => {
748                 Inst::Split(InstSplit { goto1: goto1, goto2: goto })
749             }
750             MaybeInst::Split2(goto2) => {
751                 Inst::Split(InstSplit { goto1: goto, goto2: goto2 })
752             }
753             _ => unreachable!(
754                 "not all instructions were compiled! \
755                  found uncompiled instruction: {:?}",
756                 self
757             ),
758         };
759         *self = MaybeInst::Compiled(filled);
760     }
761 
fill_split(&mut self, goto1: InstPtr, goto2: InstPtr)762     fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
763         let filled = match *self {
764             MaybeInst::Split => {
765                 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
766             }
767             _ => unreachable!(
768                 "must be called on Split instruction, \
769                  instead it was called on: {:?}",
770                 self
771             ),
772         };
773         *self = MaybeInst::Compiled(filled);
774     }
775 
half_fill_split_goto1(&mut self, goto1: InstPtr)776     fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
777         let half_filled = match *self {
778             MaybeInst::Split => goto1,
779             _ => unreachable!(
780                 "must be called on Split instruction, \
781                  instead it was called on: {:?}",
782                 self
783             ),
784         };
785         *self = MaybeInst::Split1(half_filled);
786     }
787 
half_fill_split_goto2(&mut self, goto2: InstPtr)788     fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
789         let half_filled = match *self {
790             MaybeInst::Split => goto2,
791             _ => unreachable!(
792                 "must be called on Split instruction, \
793                  instead it was called on: {:?}",
794                 self
795             ),
796         };
797         *self = MaybeInst::Split2(half_filled);
798     }
799 
unwrap(self) -> Inst800     fn unwrap(self) -> Inst {
801         match self {
802             MaybeInst::Compiled(inst) => inst,
803             _ => unreachable!(
804                 "must be called on a compiled instruction, \
805                  instead it was called on: {:?}",
806                 self
807             ),
808         }
809     }
810 }
811 
812 #[derive(Clone, Debug)]
813 enum InstHole {
814     Save { slot: usize },
815     EmptyLook { look: EmptyLook },
816     Char { c: char },
817     Ranges { ranges: Vec<(char, char)> },
818     Bytes { start: u8, end: u8 },
819 }
820 
821 impl InstHole {
fill(&self, goto: InstPtr) -> Inst822     fn fill(&self, goto: InstPtr) -> Inst {
823         match *self {
824             InstHole::Save { slot } => {
825                 Inst::Save(InstSave { goto: goto, slot: slot })
826             }
827             InstHole::EmptyLook { look } => {
828                 Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
829             }
830             InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
831             InstHole::Ranges { ref ranges } => {
832                 Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
833             }
834             InstHole::Bytes { start, end } => {
835                 Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
836             }
837         }
838     }
839 }
840 
841 struct CompileClass<'a, 'b> {
842     c: &'a mut Compiler,
843     ranges: &'b [hir::ClassUnicodeRange],
844 }
845 
846 impl<'a, 'b> CompileClass<'a, 'b> {
compile(mut self) -> Result847     fn compile(mut self) -> Result {
848         let mut holes = vec![];
849         let mut initial_entry = None;
850         let mut last_split = Hole::None;
851         let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
852         self.c.suffix_cache.clear();
853 
854         for (i, range) in self.ranges.iter().enumerate() {
855             let is_last_range = i + 1 == self.ranges.len();
856             utf8_seqs.reset(range.start(), range.end());
857             let mut it = (&mut utf8_seqs).peekable();
858             loop {
859                 let utf8_seq = match it.next() {
860                     None => break,
861                     Some(utf8_seq) => utf8_seq,
862                 };
863                 if is_last_range && it.peek().is_none() {
864                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
865                     holes.push(hole);
866                     self.c.fill(last_split, entry);
867                     last_split = Hole::None;
868                     if initial_entry.is_none() {
869                         initial_entry = Some(entry);
870                     }
871                 } else {
872                     if initial_entry.is_none() {
873                         initial_entry = Some(self.c.insts.len());
874                     }
875                     self.c.fill_to_next(last_split);
876                     last_split = self.c.push_split_hole();
877                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
878                     holes.push(hole);
879                     last_split =
880                         self.c.fill_split(last_split, Some(entry), None);
881                 }
882             }
883         }
884         self.c.utf8_seqs = Some(utf8_seqs);
885         Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
886     }
887 
c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result888     fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
889         if self.c.compiled.is_reverse {
890             self.c_utf8_seq_(seq)
891         } else {
892             self.c_utf8_seq_(seq.into_iter().rev())
893         }
894     }
895 
c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result where I: IntoIterator<Item = &'r Utf8Range>,896     fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
897     where
898         I: IntoIterator<Item = &'r Utf8Range>,
899     {
900         // The initial instruction for each UTF-8 sequence should be the same.
901         let mut from_inst = ::std::usize::MAX;
902         let mut last_hole = Hole::None;
903         for byte_range in seq {
904             let key = SuffixCacheKey {
905                 from_inst: from_inst,
906                 start: byte_range.start,
907                 end: byte_range.end,
908             };
909             {
910                 let pc = self.c.insts.len();
911                 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
912                     from_inst = cached_pc;
913                     continue;
914                 }
915             }
916             self.c.byte_classes.set_range(byte_range.start, byte_range.end);
917             if from_inst == ::std::usize::MAX {
918                 last_hole = self.c.push_hole(InstHole::Bytes {
919                     start: byte_range.start,
920                     end: byte_range.end,
921                 });
922             } else {
923                 self.c.push_compiled(Inst::Bytes(InstBytes {
924                     goto: from_inst,
925                     start: byte_range.start,
926                     end: byte_range.end,
927                 }));
928             }
929             from_inst = self.c.insts.len().checked_sub(1).unwrap();
930             debug_assert!(from_inst < ::std::usize::MAX);
931         }
932         debug_assert!(from_inst < ::std::usize::MAX);
933         Ok(Patch { hole: last_hole, entry: from_inst })
934     }
935 }
936 
937 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
938 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
939 /// The set of byte ranges looks like this:
940 ///
941 /// [0-7F]
942 /// [C2-DF][80-BF]
943 /// [E0][A0-BF][80-BF]
944 /// [E1-EC][80-BF][80-BF]
945 /// [ED][80-9F][80-BF]
946 /// [EE-EF][80-BF][80-BF]
947 ///
948 /// Each line above translates to one alternate in the compiled regex program.
949 /// However, all but one of the alternates end in the same suffix, which is
950 /// a waste of an instruction. The suffix cache facilitates reusing them across
951 /// alternates.
952 ///
953 /// Note that a HashMap could be trivially used for this, but we don't need its
954 /// overhead. Some small bounded space (LRU style) is more than enough.
955 ///
956 /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
957 /// except it uses hashes as original indices and then compares full keys for
958 /// validation against `dense` array.
959 struct SuffixCache {
960     sparse: Box<[usize]>,
961     dense: Vec<SuffixCacheEntry>,
962 }
963 
964 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
965 struct SuffixCacheEntry {
966     key: SuffixCacheKey,
967     pc: InstPtr,
968 }
969 
970 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
971 struct SuffixCacheKey {
972     from_inst: InstPtr,
973     start: u8,
974     end: u8,
975 }
976 
977 impl SuffixCache {
new(size: usize) -> Self978     fn new(size: usize) -> Self {
979         SuffixCache {
980             sparse: vec![0usize; size].into(),
981             dense: Vec::with_capacity(size),
982         }
983     }
984 
get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr>985     fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
986         let hash = self.hash(&key);
987         let pos = &mut self.sparse[hash];
988         if let Some(entry) = self.dense.get(*pos) {
989             if entry.key == key {
990                 return Some(entry.pc);
991             }
992         }
993         *pos = self.dense.len();
994         self.dense.push(SuffixCacheEntry { key: key, pc: pc });
995         None
996     }
997 
clear(&mut self)998     fn clear(&mut self) {
999         self.dense.clear();
1000     }
1001 
hash(&self, suffix: &SuffixCacheKey) -> usize1002     fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1003         // Basic FNV-1a hash as described:
1004         // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1005         const FNV_PRIME: u64 = 1099511628211;
1006         let mut h = 14695981039346656037;
1007         h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1008         h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1009         h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1010         (h as usize) % self.sparse.len()
1011     }
1012 }
1013 
1014 struct ByteClassSet([bool; 256]);
1015 
1016 impl ByteClassSet {
new() -> Self1017     fn new() -> Self {
1018         ByteClassSet([false; 256])
1019     }
1020 
set_range(&mut self, start: u8, end: u8)1021     fn set_range(&mut self, start: u8, end: u8) {
1022         debug_assert!(start <= end);
1023         if start > 0 {
1024             self.0[start as usize - 1] = true;
1025         }
1026         self.0[end as usize] = true;
1027     }
1028 
set_word_boundary(&mut self)1029     fn set_word_boundary(&mut self) {
1030         // We need to mark all ranges of bytes whose pairs result in
1031         // evaluating \b differently.
1032         let iswb = is_word_byte;
1033         let mut b1: u16 = 0;
1034         let mut b2: u16;
1035         while b1 <= 255 {
1036             b2 = b1 + 1;
1037             while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1038                 b2 += 1;
1039             }
1040             self.set_range(b1 as u8, (b2 - 1) as u8);
1041             b1 = b2;
1042         }
1043     }
1044 
byte_classes(&self) -> Vec<u8>1045     fn byte_classes(&self) -> Vec<u8> {
1046         // N.B. If you're debugging the DFA, it's useful to simply return
1047         // `(0..256).collect()`, which effectively removes the byte classes
1048         // and makes the transitions easier to read.
1049         // (0usize..256).map(|x| x as u8).collect()
1050         let mut byte_classes = vec![0; 256];
1051         let mut class = 0u8;
1052         let mut i = 0;
1053         loop {
1054             byte_classes[i] = class as u8;
1055             if i >= 255 {
1056                 break;
1057             }
1058             if self.0[i] {
1059                 class = class.checked_add(1).unwrap();
1060             }
1061             i += 1;
1062         }
1063         byte_classes
1064     }
1065 }
1066 
u32_to_usize(n: u32) -> usize1067 fn u32_to_usize(n: u32) -> usize {
1068     // In case usize is less than 32 bits, we need to guard against overflow.
1069     // On most platforms this compiles to nothing.
1070     // TODO Use `std::convert::TryFrom` once it's stable.
1071     if (n as u64) > (::std::usize::MAX as u64) {
1072         panic!("BUG: {} is too big to be pointer sized", n)
1073     }
1074     n as usize
1075 }
1076 
1077 #[cfg(test)]
1078 mod tests {
1079     use super::ByteClassSet;
1080 
1081     #[test]
byte_classes()1082     fn byte_classes() {
1083         let mut set = ByteClassSet::new();
1084         set.set_range(b'a', b'z');
1085         let classes = set.byte_classes();
1086         assert_eq!(classes[0], 0);
1087         assert_eq!(classes[1], 0);
1088         assert_eq!(classes[2], 0);
1089         assert_eq!(classes[b'a' as usize - 1], 0);
1090         assert_eq!(classes[b'a' as usize], 1);
1091         assert_eq!(classes[b'm' as usize], 1);
1092         assert_eq!(classes[b'z' as usize], 1);
1093         assert_eq!(classes[b'z' as usize + 1], 2);
1094         assert_eq!(classes[254], 2);
1095         assert_eq!(classes[255], 2);
1096 
1097         let mut set = ByteClassSet::new();
1098         set.set_range(0, 2);
1099         set.set_range(4, 6);
1100         let classes = set.byte_classes();
1101         assert_eq!(classes[0], 0);
1102         assert_eq!(classes[1], 0);
1103         assert_eq!(classes[2], 0);
1104         assert_eq!(classes[3], 1);
1105         assert_eq!(classes[4], 2);
1106         assert_eq!(classes[5], 2);
1107         assert_eq!(classes[6], 2);
1108         assert_eq!(classes[7], 3);
1109         assert_eq!(classes[255], 3);
1110     }
1111 
1112     #[test]
full_byte_classes()1113     fn full_byte_classes() {
1114         let mut set = ByteClassSet::new();
1115         for i in 0..256u16 {
1116             set.set_range(i as u8, i as u8);
1117         }
1118         assert_eq!(set.byte_classes().len(), 256);
1119     }
1120 }
1121