1 use std::collections::HashMap;
2 use std::iter;
3 use std::result;
4 use std::sync::Arc;
5 
6 use syntax::hir::{self, Hir};
7 use syntax::is_word_byte;
8 use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
9 
10 use prog::{
11     EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
12     InstSave, InstSplit, Program,
13 };
14 
15 use Error;
16 
17 type Result = result::Result<Patch, Error>;
18 
19 #[derive(Debug)]
20 struct Patch {
21     hole: Hole,
22     entry: InstPtr,
23 }
24 
25 /// A compiler translates a regular expression AST to a sequence of
26 /// instructions. The sequence of instructions represents an NFA.
27 pub struct Compiler {
28     insts: Vec<MaybeInst>,
29     compiled: Program,
30     capture_name_idx: HashMap<String, usize>,
31     num_exprs: usize,
32     size_limit: usize,
33     suffix_cache: SuffixCache,
34     utf8_seqs: Option<Utf8Sequences>,
35     byte_classes: ByteClassSet,
36 }
37 
38 impl Compiler {
39     /// Create a new regular expression compiler.
40     ///
41     /// Various options can be set before calling `compile` on an expression.
new() -> Self42     pub fn new() -> Self {
43         Compiler {
44             insts: vec![],
45             compiled: Program::new(),
46             capture_name_idx: HashMap::new(),
47             num_exprs: 0,
48             size_limit: 10 * (1 << 20),
49             suffix_cache: SuffixCache::new(1000),
50             utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
51             byte_classes: ByteClassSet::new(),
52         }
53     }
54 
55     /// The size of the resulting program is limited by size_limit. If
56     /// the program approximately exceeds the given size (in bytes), then
57     /// compilation will stop and return an error.
size_limit(mut self, size_limit: usize) -> Self58     pub fn size_limit(mut self, size_limit: usize) -> Self {
59         self.size_limit = size_limit;
60         self
61     }
62 
63     /// If bytes is true, then the program is compiled as a byte based
64     /// automaton, which incorporates UTF-8 decoding into the machine. If it's
65     /// false, then the automaton is Unicode scalar value based, e.g., an
66     /// engine utilizing such an automaton is responsible for UTF-8 decoding.
67     ///
68     /// The specific invariant is that when returning a byte based machine,
69     /// the neither the `Char` nor `Ranges` instructions are produced.
70     /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
71     /// instruction is never produced.
72     ///
73     /// Note that `dfa(true)` implies `bytes(true)`.
bytes(mut self, yes: bool) -> Self74     pub fn bytes(mut self, yes: bool) -> Self {
75         self.compiled.is_bytes = yes;
76         self
77     }
78 
79     /// When disabled, the program compiled may match arbitrary bytes.
80     ///
81     /// When enabled (the default), all compiled programs exclusively match
82     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self83     pub fn only_utf8(mut self, yes: bool) -> Self {
84         self.compiled.only_utf8 = yes;
85         self
86     }
87 
88     /// When set, the machine returned is suitable for use in the DFA matching
89     /// engine.
90     ///
91     /// In particular, this ensures that if the regex is not anchored in the
92     /// beginning, then a preceding `.*?` is included in the program. (The NFA
93     /// based engines handle the preceding `.*?` explicitly, which is difficult
94     /// or impossible in the DFA engine.)
dfa(mut self, yes: bool) -> Self95     pub fn dfa(mut self, yes: bool) -> Self {
96         self.compiled.is_dfa = yes;
97         self
98     }
99 
100     /// When set, the machine returned is suitable for matching text in
101     /// reverse. In particular, all concatenations are flipped.
reverse(mut self, yes: bool) -> Self102     pub fn reverse(mut self, yes: bool) -> Self {
103         self.compiled.is_reverse = yes;
104         self
105     }
106 
107     /// Compile a regular expression given its AST.
108     ///
109     /// The compiler is guaranteed to succeed unless the program exceeds the
110     /// specified size limit. If the size limit is exceeded, then compilation
111     /// stops and returns an error.
compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error>112     pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
113         debug_assert!(!exprs.is_empty());
114         self.num_exprs = exprs.len();
115         if exprs.len() == 1 {
116             self.compile_one(&exprs[0])
117         } else {
118             self.compile_many(exprs)
119         }
120     }
121 
compile_one(mut self, expr: &Hir) -> result::Result<Program, Error>122     fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
123         // If we're compiling a forward DFA and we aren't anchored, then
124         // add a `.*?` before the first capture group.
125         // Other matching engines handle this by baking the logic into the
126         // matching engine itself.
127         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
128         self.compiled.is_anchored_start = expr.is_anchored_start();
129         self.compiled.is_anchored_end = expr.is_anchored_end();
130         if self.compiled.needs_dotstar() {
131             dotstar_patch = self.c_dotstar()?;
132             self.compiled.start = dotstar_patch.entry;
133         }
134         self.compiled.captures = vec![None];
135         let patch = self.c_capture(0, expr)?;
136         if self.compiled.needs_dotstar() {
137             self.fill(dotstar_patch.hole, patch.entry);
138         } else {
139             self.compiled.start = patch.entry;
140         }
141         self.fill_to_next(patch.hole);
142         self.compiled.matches = vec![self.insts.len()];
143         self.push_compiled(Inst::Match(0));
144         self.compile_finish()
145     }
146 
compile_many( mut self, exprs: &[Hir], ) -> result::Result<Program, Error>147     fn compile_many(
148         mut self,
149         exprs: &[Hir],
150     ) -> result::Result<Program, Error> {
151         debug_assert!(exprs.len() > 1);
152 
153         self.compiled.is_anchored_start =
154             exprs.iter().all(|e| e.is_anchored_start());
155         self.compiled.is_anchored_end =
156             exprs.iter().all(|e| e.is_anchored_end());
157         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
158         if self.compiled.needs_dotstar() {
159             dotstar_patch = self.c_dotstar()?;
160             self.compiled.start = dotstar_patch.entry;
161         } else {
162             self.compiled.start = 0; // first instruction is always split
163         }
164         self.fill_to_next(dotstar_patch.hole);
165 
166         let mut prev_hole = Hole::None;
167         for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
168             self.fill_to_next(prev_hole);
169             let split = self.push_split_hole();
170             let Patch { hole, entry } = self.c_capture(0, expr)?;
171             self.fill_to_next(hole);
172             self.compiled.matches.push(self.insts.len());
173             self.push_compiled(Inst::Match(i));
174             prev_hole = self.fill_split(split, Some(entry), None);
175         }
176         let i = exprs.len() - 1;
177         let Patch { hole, entry } = self.c_capture(0, &exprs[i])?;
178         self.fill(prev_hole, entry);
179         self.fill_to_next(hole);
180         self.compiled.matches.push(self.insts.len());
181         self.push_compiled(Inst::Match(i));
182         self.compile_finish()
183     }
184 
compile_finish(mut self) -> result::Result<Program, Error>185     fn compile_finish(mut self) -> result::Result<Program, Error> {
186         self.compiled.insts =
187             self.insts.into_iter().map(|inst| inst.unwrap()).collect();
188         self.compiled.byte_classes = self.byte_classes.byte_classes();
189         self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
190         Ok(self.compiled)
191     }
192 
193     /// Compile expr into self.insts, returning a patch on success,
194     /// or an error if we run out of memory.
195     ///
196     /// All of the c_* methods of the compiler share the contract outlined
197     /// here.
198     ///
199     /// The main thing that a c_* method does is mutate `self.insts`
200     /// to add a list of mostly compiled instructions required to execute
201     /// the given expression. `self.insts` contains MaybeInsts rather than
202     /// Insts because there is some backpatching required.
203     ///
204     /// The `Patch` value returned by each c_* method provides metadata
205     /// about the compiled instructions emitted to `self.insts`. The
206     /// `entry` member of the patch refers to the first instruction
207     /// (the entry point), while the `hole` member contains zero or
208     /// more offsets to partial instructions that need to be backpatched.
209     /// The c_* routine can't know where its list of instructions are going to
210     /// jump to after execution, so it is up to the caller to patch
211     /// these jumps to point to the right place. So compiling some
212     /// expression, e, we would end up with a situation that looked like:
213     ///
214     /// ```text
215     /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
216     ///                     ^              ^             ^
217     ///                     |                \         /
218     ///                   entry                \     /
219     ///                                         hole
220     /// ```
221     ///
222     /// To compile two expressions, e1 and e2, concatinated together we
223     /// would do:
224     ///
225     /// ```ignore
226     /// let patch1 = self.c(e1);
227     /// let patch2 = self.c(e2);
228     /// ```
229     ///
230     /// while leaves us with a situation that looks like
231     ///
232     /// ```text
233     /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
234     ///                     ^        ^            ^        ^
235     ///                     |        |            |        |
236     ///                entry1        hole1   entry2        hole2
237     /// ```
238     ///
239     /// Then to merge the two patches together into one we would backpatch
240     /// hole1 with entry2 and return a new patch that enters at entry1
241     /// and has hole2 for a hole. In fact, if you look at the c_concat
242     /// method you will see that it does exactly this, though it handles
243     /// a list of expressions rather than just the two that we use for
244     /// an example.
c(&mut self, expr: &Hir) -> Result245     fn c(&mut self, expr: &Hir) -> Result {
246         use prog;
247         use syntax::hir::HirKind::*;
248 
249         self.check_size()?;
250         match *expr.kind() {
251             Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }),
252             Literal(hir::Literal::Unicode(c)) => self.c_char(c),
253             Literal(hir::Literal::Byte(b)) => {
254                 assert!(self.compiled.uses_bytes());
255                 self.c_byte(b)
256             }
257             Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
258             Class(hir::Class::Bytes(ref cls)) => {
259                 if self.compiled.uses_bytes() {
260                     self.c_class_bytes(cls.ranges())
261                 } else {
262                     assert!(cls.is_all_ascii());
263                     let mut char_ranges = vec![];
264                     for r in cls.iter() {
265                         let (s, e) = (r.start() as char, r.end() as char);
266                         char_ranges.push(hir::ClassUnicodeRange::new(s, e));
267                     }
268                     self.c_class(&char_ranges)
269                 }
270             }
271             Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
272                 self.byte_classes.set_range(b'\n', b'\n');
273                 self.c_empty_look(prog::EmptyLook::EndLine)
274             }
275             Anchor(hir::Anchor::StartLine) => {
276                 self.byte_classes.set_range(b'\n', b'\n');
277                 self.c_empty_look(prog::EmptyLook::StartLine)
278             }
279             Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
280                 self.byte_classes.set_range(b'\n', b'\n');
281                 self.c_empty_look(prog::EmptyLook::StartLine)
282             }
283             Anchor(hir::Anchor::EndLine) => {
284                 self.byte_classes.set_range(b'\n', b'\n');
285                 self.c_empty_look(prog::EmptyLook::EndLine)
286             }
287             Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
288                 self.c_empty_look(prog::EmptyLook::EndText)
289             }
290             Anchor(hir::Anchor::StartText) => {
291                 self.c_empty_look(prog::EmptyLook::StartText)
292             }
293             Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
294                 self.c_empty_look(prog::EmptyLook::StartText)
295             }
296             Anchor(hir::Anchor::EndText) => {
297                 self.c_empty_look(prog::EmptyLook::EndText)
298             }
299             WordBoundary(hir::WordBoundary::Unicode) => {
300                 if !cfg!(feature = "unicode-perl") {
301                     return Err(Error::Syntax(
302                         "Unicode word boundaries are unavailable when \
303                          the unicode-perl feature is disabled"
304                             .to_string(),
305                     ));
306                 }
307                 self.compiled.has_unicode_word_boundary = true;
308                 self.byte_classes.set_word_boundary();
309                 self.c_empty_look(prog::EmptyLook::WordBoundary)
310             }
311             WordBoundary(hir::WordBoundary::UnicodeNegate) => {
312                 if !cfg!(feature = "unicode-perl") {
313                     return Err(Error::Syntax(
314                         "Unicode word boundaries are unavailable when \
315                          the unicode-perl feature is disabled"
316                             .to_string(),
317                     ));
318                 }
319                 self.compiled.has_unicode_word_boundary = true;
320                 self.byte_classes.set_word_boundary();
321                 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
322             }
323             WordBoundary(hir::WordBoundary::Ascii) => {
324                 self.byte_classes.set_word_boundary();
325                 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
326             }
327             WordBoundary(hir::WordBoundary::AsciiNegate) => {
328                 self.byte_classes.set_word_boundary();
329                 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
330             }
331             Group(ref g) => match g.kind {
332                 hir::GroupKind::NonCapturing => self.c(&g.hir),
333                 hir::GroupKind::CaptureIndex(index) => {
334                     if index as usize >= self.compiled.captures.len() {
335                         self.compiled.captures.push(None);
336                     }
337                     self.c_capture(2 * index as usize, &g.hir)
338                 }
339                 hir::GroupKind::CaptureName { index, ref name } => {
340                     if index as usize >= self.compiled.captures.len() {
341                         let n = name.to_string();
342                         self.compiled.captures.push(Some(n.clone()));
343                         self.capture_name_idx.insert(n, index as usize);
344                     }
345                     self.c_capture(2 * index as usize, &g.hir)
346                 }
347             },
348             Concat(ref es) => {
349                 if self.compiled.is_reverse {
350                     self.c_concat(es.iter().rev())
351                 } else {
352                     self.c_concat(es)
353                 }
354             }
355             Alternation(ref es) => self.c_alternate(&**es),
356             Repetition(ref rep) => self.c_repeat(rep),
357         }
358     }
359 
c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result360     fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result {
361         if self.num_exprs > 1 || self.compiled.is_dfa {
362             // Don't ever compile Save instructions for regex sets because
363             // they are never used. They are also never used in DFA programs
364             // because DFAs can't handle captures.
365             self.c(expr)
366         } else {
367             let entry = self.insts.len();
368             let hole = self.push_hole(InstHole::Save { slot: first_slot });
369             let patch = self.c(expr)?;
370             self.fill(hole, patch.entry);
371             self.fill_to_next(patch.hole);
372             let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
373             Ok(Patch { hole: hole, entry: entry })
374         }
375     }
376 
c_dotstar(&mut self) -> Result377     fn c_dotstar(&mut self) -> Result {
378         Ok(if !self.compiled.only_utf8() {
379             self.c(&Hir::repetition(hir::Repetition {
380                 kind: hir::RepetitionKind::ZeroOrMore,
381                 greedy: false,
382                 hir: Box::new(Hir::any(true)),
383             }))?
384         } else {
385             self.c(&Hir::repetition(hir::Repetition {
386                 kind: hir::RepetitionKind::ZeroOrMore,
387                 greedy: false,
388                 hir: Box::new(Hir::any(false)),
389             }))?
390         })
391     }
392 
c_char(&mut self, c: char) -> Result393     fn c_char(&mut self, c: char) -> Result {
394         if self.compiled.uses_bytes() {
395             if c.is_ascii() {
396                 let b = c as u8;
397                 let hole =
398                     self.push_hole(InstHole::Bytes { start: b, end: b });
399                 self.byte_classes.set_range(b, b);
400                 Ok(Patch { hole, entry: self.insts.len() - 1 })
401             } else {
402                 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
403             }
404         } else {
405             let hole = self.push_hole(InstHole::Char { c: c });
406             Ok(Patch { hole, entry: self.insts.len() - 1 })
407         }
408     }
409 
c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result410     fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result {
411         assert!(!ranges.is_empty());
412         if self.compiled.uses_bytes() {
413             CompileClass { c: self, ranges: ranges }.compile()
414         } else {
415             let ranges: Vec<(char, char)> =
416                 ranges.iter().map(|r| (r.start(), r.end())).collect();
417             let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
418                 self.push_hole(InstHole::Char { c: ranges[0].0 })
419             } else {
420                 self.push_hole(InstHole::Ranges { ranges: ranges })
421             };
422             Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
423         }
424     }
425 
c_byte(&mut self, b: u8) -> Result426     fn c_byte(&mut self, b: u8) -> Result {
427         self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
428     }
429 
c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result430     fn c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result {
431         debug_assert!(!ranges.is_empty());
432 
433         let first_split_entry = self.insts.len();
434         let mut holes = vec![];
435         let mut prev_hole = Hole::None;
436         for r in &ranges[0..ranges.len() - 1] {
437             self.fill_to_next(prev_hole);
438             let split = self.push_split_hole();
439             let next = self.insts.len();
440             self.byte_classes.set_range(r.start(), r.end());
441             holes.push(self.push_hole(InstHole::Bytes {
442                 start: r.start(),
443                 end: r.end(),
444             }));
445             prev_hole = self.fill_split(split, Some(next), None);
446         }
447         let next = self.insts.len();
448         let r = &ranges[ranges.len() - 1];
449         self.byte_classes.set_range(r.start(), r.end());
450         holes.push(
451             self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
452         );
453         self.fill(prev_hole, next);
454         Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
455     }
456 
c_empty_look(&mut self, look: EmptyLook) -> Result457     fn c_empty_look(&mut self, look: EmptyLook) -> Result {
458         let hole = self.push_hole(InstHole::EmptyLook { look: look });
459         Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
460     }
461 
c_concat<'a, I>(&mut self, exprs: I) -> Result where I: IntoIterator<Item = &'a Hir>,462     fn c_concat<'a, I>(&mut self, exprs: I) -> Result
463     where
464         I: IntoIterator<Item = &'a Hir>,
465     {
466         let mut exprs = exprs.into_iter();
467         let first = match exprs.next() {
468             Some(expr) => expr,
469             None => {
470                 return Ok(Patch { hole: Hole::None, entry: self.insts.len() })
471             }
472         };
473         let Patch { mut hole, entry } = self.c(first)?;
474         for e in exprs {
475             let p = self.c(e)?;
476             self.fill(hole, p.entry);
477             hole = p.hole;
478         }
479         Ok(Patch { hole: hole, entry: entry })
480     }
481 
c_alternate(&mut self, exprs: &[Hir]) -> Result482     fn c_alternate(&mut self, exprs: &[Hir]) -> Result {
483         debug_assert!(
484             exprs.len() >= 2,
485             "alternates must have at least 2 exprs"
486         );
487 
488         // Initial entry point is always the first split.
489         let first_split_entry = self.insts.len();
490 
491         // Save up all of the holes from each alternate. They will all get
492         // patched to point to the same location.
493         let mut holes = vec![];
494 
495         let mut prev_hole = Hole::None;
496         for e in &exprs[0..exprs.len() - 1] {
497             self.fill_to_next(prev_hole);
498             let split = self.push_split_hole();
499             let prev_entry = self.insts.len();
500             let Patch { hole, entry } = self.c(e)?;
501             if prev_entry == self.insts.len() {
502                 // TODO(burntsushi): It is kind of silly that we don't support
503                 // empty-subexpressions in alternates, but it is supremely
504                 // awkward to support them in the existing compiler
505                 // infrastructure. This entire compiler needs to be thrown out
506                 // anyway, so don't feel too bad.
507                 return Err(Error::Syntax(
508                     "alternations cannot currently contain \
509                      empty sub-expressions"
510                         .to_string(),
511                 ));
512             }
513             holes.push(hole);
514             prev_hole = self.fill_split(split, Some(entry), None);
515         }
516         let prev_entry = self.insts.len();
517         let Patch { hole, entry } = self.c(&exprs[exprs.len() - 1])?;
518         if prev_entry == self.insts.len() {
519             // TODO(burntsushi): See TODO above.
520             return Err(Error::Syntax(
521                 "alternations cannot currently contain \
522                  empty sub-expressions"
523                     .to_string(),
524             ));
525         }
526         holes.push(hole);
527         self.fill(prev_hole, entry);
528         Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
529     }
530 
c_repeat(&mut self, rep: &hir::Repetition) -> Result531     fn c_repeat(&mut self, rep: &hir::Repetition) -> Result {
532         use syntax::hir::RepetitionKind::*;
533         match rep.kind {
534             ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
535             ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
536             OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
537             Range(hir::RepetitionRange::Exactly(min_max)) => {
538                 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
539             }
540             Range(hir::RepetitionRange::AtLeast(min)) => {
541                 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
542             }
543             Range(hir::RepetitionRange::Bounded(min, max)) => {
544                 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
545             }
546         }
547     }
548 
c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result549     fn c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result {
550         let split_entry = self.insts.len();
551         let split = self.push_split_hole();
552         let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
553 
554         let split_hole = if greedy {
555             self.fill_split(split, Some(entry_rep), None)
556         } else {
557             self.fill_split(split, None, Some(entry_rep))
558         };
559         let holes = vec![hole_rep, split_hole];
560         Ok(Patch { hole: Hole::Many(holes), entry: split_entry })
561     }
562 
c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result563     fn c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
564         let split_entry = self.insts.len();
565         let split = self.push_split_hole();
566         let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
567 
568         self.fill(hole_rep, split_entry);
569         let split_hole = if greedy {
570             self.fill_split(split, Some(entry_rep), None)
571         } else {
572             self.fill_split(split, None, Some(entry_rep))
573         };
574         Ok(Patch { hole: split_hole, entry: split_entry })
575     }
576 
c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result577     fn c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
578         let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
579         self.fill_to_next(hole_rep);
580         let split = self.push_split_hole();
581 
582         let split_hole = if greedy {
583             self.fill_split(split, Some(entry_rep), None)
584         } else {
585             self.fill_split(split, None, Some(entry_rep))
586         };
587         Ok(Patch { hole: split_hole, entry: entry_rep })
588     }
589 
c_repeat_range_min_or_more( &mut self, expr: &Hir, greedy: bool, min: u32, ) -> Result590     fn c_repeat_range_min_or_more(
591         &mut self,
592         expr: &Hir,
593         greedy: bool,
594         min: u32,
595     ) -> Result {
596         let min = u32_to_usize(min);
597         let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
598         let patch_rep = self.c_repeat_zero_or_more(expr, greedy)?;
599         self.fill(patch_concat.hole, patch_rep.entry);
600         Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry })
601     }
602 
c_repeat_range( &mut self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> Result603     fn c_repeat_range(
604         &mut self,
605         expr: &Hir,
606         greedy: bool,
607         min: u32,
608         max: u32,
609     ) -> Result {
610         let (min, max) = (u32_to_usize(min), u32_to_usize(max));
611         let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
612         let initial_entry = patch_concat.entry;
613         if min == max {
614             return Ok(patch_concat);
615         }
616         // It is much simpler to compile, e.g., `a{2,5}` as:
617         //
618         //     aaa?a?a?
619         //
620         // But you end up with a sequence of instructions like this:
621         //
622         //     0: 'a'
623         //     1: 'a',
624         //     2: split(3, 4)
625         //     3: 'a'
626         //     4: split(5, 6)
627         //     5: 'a'
628         //     6: split(7, 8)
629         //     7: 'a'
630         //     8: MATCH
631         //
632         // This is *incredibly* inefficient because the splits end
633         // up forming a chain, which has to be resolved everything a
634         // transition is followed.
635         let mut holes = vec![];
636         let mut prev_hole = patch_concat.hole;
637         for _ in min..max {
638             self.fill_to_next(prev_hole);
639             let split = self.push_split_hole();
640             let Patch { hole, entry } = self.c(expr)?;
641             prev_hole = hole;
642             if greedy {
643                 holes.push(self.fill_split(split, Some(entry), None));
644             } else {
645                 holes.push(self.fill_split(split, None, Some(entry)));
646             }
647         }
648         holes.push(prev_hole);
649         Ok(Patch { hole: Hole::Many(holes), entry: initial_entry })
650     }
651 
fill(&mut self, hole: Hole, goto: InstPtr)652     fn fill(&mut self, hole: Hole, goto: InstPtr) {
653         match hole {
654             Hole::None => {}
655             Hole::One(pc) => {
656                 self.insts[pc].fill(goto);
657             }
658             Hole::Many(holes) => {
659                 for hole in holes {
660                     self.fill(hole, goto);
661                 }
662             }
663         }
664     }
665 
fill_to_next(&mut self, hole: Hole)666     fn fill_to_next(&mut self, hole: Hole) {
667         let next = self.insts.len();
668         self.fill(hole, next);
669     }
670 
fill_split( &mut self, hole: Hole, goto1: Option<InstPtr>, goto2: Option<InstPtr>, ) -> Hole671     fn fill_split(
672         &mut self,
673         hole: Hole,
674         goto1: Option<InstPtr>,
675         goto2: Option<InstPtr>,
676     ) -> Hole {
677         match hole {
678             Hole::None => Hole::None,
679             Hole::One(pc) => match (goto1, goto2) {
680                 (Some(goto1), Some(goto2)) => {
681                     self.insts[pc].fill_split(goto1, goto2);
682                     Hole::None
683                 }
684                 (Some(goto1), None) => {
685                     self.insts[pc].half_fill_split_goto1(goto1);
686                     Hole::One(pc)
687                 }
688                 (None, Some(goto2)) => {
689                     self.insts[pc].half_fill_split_goto2(goto2);
690                     Hole::One(pc)
691                 }
692                 (None, None) => unreachable!(
693                     "at least one of the split \
694                      holes must be filled"
695                 ),
696             },
697             Hole::Many(holes) => {
698                 let mut new_holes = vec![];
699                 for hole in holes {
700                     new_holes.push(self.fill_split(hole, goto1, goto2));
701                 }
702                 if new_holes.is_empty() {
703                     Hole::None
704                 } else if new_holes.len() == 1 {
705                     new_holes.pop().unwrap()
706                 } else {
707                     Hole::Many(new_holes)
708                 }
709             }
710         }
711     }
712 
push_compiled(&mut self, inst: Inst)713     fn push_compiled(&mut self, inst: Inst) {
714         self.insts.push(MaybeInst::Compiled(inst));
715     }
716 
push_hole(&mut self, inst: InstHole) -> Hole717     fn push_hole(&mut self, inst: InstHole) -> Hole {
718         let hole = self.insts.len();
719         self.insts.push(MaybeInst::Uncompiled(inst));
720         Hole::One(hole)
721     }
722 
push_split_hole(&mut self) -> Hole723     fn push_split_hole(&mut self) -> Hole {
724         let hole = self.insts.len();
725         self.insts.push(MaybeInst::Split);
726         Hole::One(hole)
727     }
728 
check_size(&self) -> result::Result<(), Error>729     fn check_size(&self) -> result::Result<(), Error> {
730         use std::mem::size_of;
731 
732         if self.insts.len() * size_of::<Inst>() > self.size_limit {
733             Err(Error::CompiledTooBig(self.size_limit))
734         } else {
735             Ok(())
736         }
737     }
738 }
739 
740 #[derive(Debug)]
741 enum Hole {
742     None,
743     One(InstPtr),
744     Many(Vec<Hole>),
745 }
746 
747 #[derive(Clone, Debug)]
748 enum MaybeInst {
749     Compiled(Inst),
750     Uncompiled(InstHole),
751     Split,
752     Split1(InstPtr),
753     Split2(InstPtr),
754 }
755 
756 impl MaybeInst {
fill(&mut self, goto: InstPtr)757     fn fill(&mut self, goto: InstPtr) {
758         let filled = match *self {
759             MaybeInst::Uncompiled(ref inst) => inst.fill(goto),
760             MaybeInst::Split1(goto1) => {
761                 Inst::Split(InstSplit { goto1: goto1, goto2: goto })
762             }
763             MaybeInst::Split2(goto2) => {
764                 Inst::Split(InstSplit { goto1: goto, goto2: goto2 })
765             }
766             _ => unreachable!(
767                 "not all instructions were compiled! \
768                  found uncompiled instruction: {:?}",
769                 self
770             ),
771         };
772         *self = MaybeInst::Compiled(filled);
773     }
774 
fill_split(&mut self, goto1: InstPtr, goto2: InstPtr)775     fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
776         let filled = match *self {
777             MaybeInst::Split => {
778                 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
779             }
780             _ => unreachable!(
781                 "must be called on Split instruction, \
782                  instead it was called on: {:?}",
783                 self
784             ),
785         };
786         *self = MaybeInst::Compiled(filled);
787     }
788 
half_fill_split_goto1(&mut self, goto1: InstPtr)789     fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
790         let half_filled = match *self {
791             MaybeInst::Split => goto1,
792             _ => unreachable!(
793                 "must be called on Split instruction, \
794                  instead it was called on: {:?}",
795                 self
796             ),
797         };
798         *self = MaybeInst::Split1(half_filled);
799     }
800 
half_fill_split_goto2(&mut self, goto2: InstPtr)801     fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
802         let half_filled = match *self {
803             MaybeInst::Split => goto2,
804             _ => unreachable!(
805                 "must be called on Split instruction, \
806                  instead it was called on: {:?}",
807                 self
808             ),
809         };
810         *self = MaybeInst::Split2(half_filled);
811     }
812 
unwrap(self) -> Inst813     fn unwrap(self) -> Inst {
814         match self {
815             MaybeInst::Compiled(inst) => inst,
816             _ => unreachable!(
817                 "must be called on a compiled instruction, \
818                  instead it was called on: {:?}",
819                 self
820             ),
821         }
822     }
823 }
824 
825 #[derive(Clone, Debug)]
826 enum InstHole {
827     Save { slot: usize },
828     EmptyLook { look: EmptyLook },
829     Char { c: char },
830     Ranges { ranges: Vec<(char, char)> },
831     Bytes { start: u8, end: u8 },
832 }
833 
834 impl InstHole {
fill(&self, goto: InstPtr) -> Inst835     fn fill(&self, goto: InstPtr) -> Inst {
836         match *self {
837             InstHole::Save { slot } => {
838                 Inst::Save(InstSave { goto: goto, slot: slot })
839             }
840             InstHole::EmptyLook { look } => {
841                 Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
842             }
843             InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
844             InstHole::Ranges { ref ranges } => {
845                 Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
846             }
847             InstHole::Bytes { start, end } => {
848                 Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
849             }
850         }
851     }
852 }
853 
854 struct CompileClass<'a, 'b> {
855     c: &'a mut Compiler,
856     ranges: &'b [hir::ClassUnicodeRange],
857 }
858 
859 impl<'a, 'b> CompileClass<'a, 'b> {
compile(mut self) -> Result860     fn compile(mut self) -> Result {
861         let mut holes = vec![];
862         let mut initial_entry = None;
863         let mut last_split = Hole::None;
864         let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
865         self.c.suffix_cache.clear();
866 
867         for (i, range) in self.ranges.iter().enumerate() {
868             let is_last_range = i + 1 == self.ranges.len();
869             utf8_seqs.reset(range.start(), range.end());
870             let mut it = (&mut utf8_seqs).peekable();
871             loop {
872                 let utf8_seq = match it.next() {
873                     None => break,
874                     Some(utf8_seq) => utf8_seq,
875                 };
876                 if is_last_range && it.peek().is_none() {
877                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
878                     holes.push(hole);
879                     self.c.fill(last_split, entry);
880                     last_split = Hole::None;
881                     if initial_entry.is_none() {
882                         initial_entry = Some(entry);
883                     }
884                 } else {
885                     if initial_entry.is_none() {
886                         initial_entry = Some(self.c.insts.len());
887                     }
888                     self.c.fill_to_next(last_split);
889                     last_split = self.c.push_split_hole();
890                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
891                     holes.push(hole);
892                     last_split =
893                         self.c.fill_split(last_split, Some(entry), None);
894                 }
895             }
896         }
897         self.c.utf8_seqs = Some(utf8_seqs);
898         Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
899     }
900 
c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result901     fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
902         if self.c.compiled.is_reverse {
903             self.c_utf8_seq_(seq)
904         } else {
905             self.c_utf8_seq_(seq.into_iter().rev())
906         }
907     }
908 
c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result where I: IntoIterator<Item = &'r Utf8Range>,909     fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
910     where
911         I: IntoIterator<Item = &'r Utf8Range>,
912     {
913         // The initial instruction for each UTF-8 sequence should be the same.
914         let mut from_inst = ::std::usize::MAX;
915         let mut last_hole = Hole::None;
916         for byte_range in seq {
917             let key = SuffixCacheKey {
918                 from_inst: from_inst,
919                 start: byte_range.start,
920                 end: byte_range.end,
921             };
922             {
923                 let pc = self.c.insts.len();
924                 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
925                     from_inst = cached_pc;
926                     continue;
927                 }
928             }
929             self.c.byte_classes.set_range(byte_range.start, byte_range.end);
930             if from_inst == ::std::usize::MAX {
931                 last_hole = self.c.push_hole(InstHole::Bytes {
932                     start: byte_range.start,
933                     end: byte_range.end,
934                 });
935             } else {
936                 self.c.push_compiled(Inst::Bytes(InstBytes {
937                     goto: from_inst,
938                     start: byte_range.start,
939                     end: byte_range.end,
940                 }));
941             }
942             from_inst = self.c.insts.len().checked_sub(1).unwrap();
943             debug_assert!(from_inst < ::std::usize::MAX);
944         }
945         debug_assert!(from_inst < ::std::usize::MAX);
946         Ok(Patch { hole: last_hole, entry: from_inst })
947     }
948 }
949 
950 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
951 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
952 /// The set of byte ranges looks like this:
953 ///
954 /// [0-7F]
955 /// [C2-DF][80-BF]
956 /// [E0][A0-BF][80-BF]
957 /// [E1-EC][80-BF][80-BF]
958 /// [ED][80-9F][80-BF]
959 /// [EE-EF][80-BF][80-BF]
960 ///
961 /// Each line above translates to one alternate in the compiled regex program.
962 /// However, all but one of the alternates end in the same suffix, which is
963 /// a waste of an instruction. The suffix cache facilitates reusing them across
964 /// alternates.
965 ///
966 /// Note that a HashMap could be trivially used for this, but we don't need its
967 /// overhead. Some small bounded space (LRU style) is more than enough.
968 ///
969 /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
970 /// except it uses hashes as original indices and then compares full keys for
971 /// validation against `dense` array.
972 struct SuffixCache {
973     sparse: Box<[usize]>,
974     dense: Vec<SuffixCacheEntry>,
975 }
976 
977 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
978 struct SuffixCacheEntry {
979     key: SuffixCacheKey,
980     pc: InstPtr,
981 }
982 
983 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
984 struct SuffixCacheKey {
985     from_inst: InstPtr,
986     start: u8,
987     end: u8,
988 }
989 
990 impl SuffixCache {
new(size: usize) -> Self991     fn new(size: usize) -> Self {
992         SuffixCache {
993             sparse: vec![0usize; size].into(),
994             dense: Vec::with_capacity(size),
995         }
996     }
997 
get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr>998     fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
999         let hash = self.hash(&key);
1000         let pos = &mut self.sparse[hash];
1001         if let Some(entry) = self.dense.get(*pos) {
1002             if entry.key == key {
1003                 return Some(entry.pc);
1004             }
1005         }
1006         *pos = self.dense.len();
1007         self.dense.push(SuffixCacheEntry { key: key, pc: pc });
1008         None
1009     }
1010 
clear(&mut self)1011     fn clear(&mut self) {
1012         self.dense.clear();
1013     }
1014 
hash(&self, suffix: &SuffixCacheKey) -> usize1015     fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1016         // Basic FNV-1a hash as described:
1017         // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1018         const FNV_PRIME: u64 = 1099511628211;
1019         let mut h = 14695981039346656037;
1020         h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1021         h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1022         h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1023         (h as usize) % self.sparse.len()
1024     }
1025 }
1026 
1027 struct ByteClassSet([bool; 256]);
1028 
1029 impl ByteClassSet {
new() -> Self1030     fn new() -> Self {
1031         ByteClassSet([false; 256])
1032     }
1033 
set_range(&mut self, start: u8, end: u8)1034     fn set_range(&mut self, start: u8, end: u8) {
1035         debug_assert!(start <= end);
1036         if start > 0 {
1037             self.0[start as usize - 1] = true;
1038         }
1039         self.0[end as usize] = true;
1040     }
1041 
set_word_boundary(&mut self)1042     fn set_word_boundary(&mut self) {
1043         // We need to mark all ranges of bytes whose pairs result in
1044         // evaluating \b differently.
1045         let iswb = is_word_byte;
1046         let mut b1: u16 = 0;
1047         let mut b2: u16;
1048         while b1 <= 255 {
1049             b2 = b1 + 1;
1050             while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1051                 b2 += 1;
1052             }
1053             self.set_range(b1 as u8, (b2 - 1) as u8);
1054             b1 = b2;
1055         }
1056     }
1057 
byte_classes(&self) -> Vec<u8>1058     fn byte_classes(&self) -> Vec<u8> {
1059         // N.B. If you're debugging the DFA, it's useful to simply return
1060         // `(0..256).collect()`, which effectively removes the byte classes
1061         // and makes the transitions easier to read.
1062         // (0usize..256).map(|x| x as u8).collect()
1063         let mut byte_classes = vec![0; 256];
1064         let mut class = 0u8;
1065         let mut i = 0;
1066         loop {
1067             byte_classes[i] = class as u8;
1068             if i >= 255 {
1069                 break;
1070             }
1071             if self.0[i] {
1072                 class = class.checked_add(1).unwrap();
1073             }
1074             i += 1;
1075         }
1076         byte_classes
1077     }
1078 }
1079 
u32_to_usize(n: u32) -> usize1080 fn u32_to_usize(n: u32) -> usize {
1081     // In case usize is less than 32 bits, we need to guard against overflow.
1082     // On most platforms this compiles to nothing.
1083     // TODO Use `std::convert::TryFrom` once it's stable.
1084     if (n as u64) > (::std::usize::MAX as u64) {
1085         panic!("BUG: {} is too big to be pointer sized", n)
1086     }
1087     n as usize
1088 }
1089 
1090 #[cfg(test)]
1091 mod tests {
1092     use super::ByteClassSet;
1093 
1094     #[test]
byte_classes()1095     fn byte_classes() {
1096         let mut set = ByteClassSet::new();
1097         set.set_range(b'a', b'z');
1098         let classes = set.byte_classes();
1099         assert_eq!(classes[0], 0);
1100         assert_eq!(classes[1], 0);
1101         assert_eq!(classes[2], 0);
1102         assert_eq!(classes[b'a' as usize - 1], 0);
1103         assert_eq!(classes[b'a' as usize], 1);
1104         assert_eq!(classes[b'm' as usize], 1);
1105         assert_eq!(classes[b'z' as usize], 1);
1106         assert_eq!(classes[b'z' as usize + 1], 2);
1107         assert_eq!(classes[254], 2);
1108         assert_eq!(classes[255], 2);
1109 
1110         let mut set = ByteClassSet::new();
1111         set.set_range(0, 2);
1112         set.set_range(4, 6);
1113         let classes = set.byte_classes();
1114         assert_eq!(classes[0], 0);
1115         assert_eq!(classes[1], 0);
1116         assert_eq!(classes[2], 0);
1117         assert_eq!(classes[3], 1);
1118         assert_eq!(classes[4], 2);
1119         assert_eq!(classes[5], 2);
1120         assert_eq!(classes[6], 2);
1121         assert_eq!(classes[7], 3);
1122         assert_eq!(classes[255], 3);
1123     }
1124 
1125     #[test]
full_byte_classes()1126     fn full_byte_classes() {
1127         let mut set = ByteClassSet::new();
1128         for i in 0..256u16 {
1129             set.set_range(i as u8, i as u8);
1130         }
1131         assert_eq!(set.byte_classes().len(), 256);
1132     }
1133 }
1134