1 use std::collections::HashMap;
2 use std::fmt;
3 use std::iter;
4 use std::result;
5 use std::sync::Arc;
6 
7 use syntax::hir::{self, Hir};
8 use syntax::is_word_byte;
9 use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10 
11 use prog::{
12     EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13     InstSave, InstSplit, Program,
14 };
15 
16 use Error;
17 
18 type Result = result::Result<Patch, Error>;
19 type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20 
21 #[derive(Debug)]
22 struct Patch {
23     hole: Hole,
24     entry: InstPtr,
25 }
26 
27 /// A compiler translates a regular expression AST to a sequence of
28 /// instructions. The sequence of instructions represents an NFA.
29 // `Compiler` is only public via the `internal` module, so avoid deriving
30 // `Debug`.
31 #[allow(missing_debug_implementations)]
32 pub struct Compiler {
33     insts: Vec<MaybeInst>,
34     compiled: Program,
35     capture_name_idx: HashMap<String, usize>,
36     num_exprs: usize,
37     size_limit: usize,
38     suffix_cache: SuffixCache,
39     utf8_seqs: Option<Utf8Sequences>,
40     byte_classes: ByteClassSet,
41 }
42 
43 impl Compiler {
44     /// Create a new regular expression compiler.
45     ///
46     /// Various options can be set before calling `compile` on an expression.
new() -> Self47     pub fn new() -> Self {
48         Compiler {
49             insts: vec![],
50             compiled: Program::new(),
51             capture_name_idx: HashMap::new(),
52             num_exprs: 0,
53             size_limit: 10 * (1 << 20),
54             suffix_cache: SuffixCache::new(1000),
55             utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
56             byte_classes: ByteClassSet::new(),
57         }
58     }
59 
60     /// The size of the resulting program is limited by size_limit. If
61     /// the program approximately exceeds the given size (in bytes), then
62     /// compilation will stop and return an error.
size_limit(mut self, size_limit: usize) -> Self63     pub fn size_limit(mut self, size_limit: usize) -> Self {
64         self.size_limit = size_limit;
65         self
66     }
67 
68     /// If bytes is true, then the program is compiled as a byte based
69     /// automaton, which incorporates UTF-8 decoding into the machine. If it's
70     /// false, then the automaton is Unicode scalar value based, e.g., an
71     /// engine utilizing such an automaton is responsible for UTF-8 decoding.
72     ///
73     /// The specific invariant is that when returning a byte based machine,
74     /// the neither the `Char` nor `Ranges` instructions are produced.
75     /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
76     /// instruction is never produced.
77     ///
78     /// Note that `dfa(true)` implies `bytes(true)`.
bytes(mut self, yes: bool) -> Self79     pub fn bytes(mut self, yes: bool) -> Self {
80         self.compiled.is_bytes = yes;
81         self
82     }
83 
84     /// When disabled, the program compiled may match arbitrary bytes.
85     ///
86     /// When enabled (the default), all compiled programs exclusively match
87     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self88     pub fn only_utf8(mut self, yes: bool) -> Self {
89         self.compiled.only_utf8 = yes;
90         self
91     }
92 
93     /// When set, the machine returned is suitable for use in the DFA matching
94     /// engine.
95     ///
96     /// In particular, this ensures that if the regex is not anchored in the
97     /// beginning, then a preceding `.*?` is included in the program. (The NFA
98     /// based engines handle the preceding `.*?` explicitly, which is difficult
99     /// or impossible in the DFA engine.)
dfa(mut self, yes: bool) -> Self100     pub fn dfa(mut self, yes: bool) -> Self {
101         self.compiled.is_dfa = yes;
102         self
103     }
104 
105     /// When set, the machine returned is suitable for matching text in
106     /// reverse. In particular, all concatenations are flipped.
reverse(mut self, yes: bool) -> Self107     pub fn reverse(mut self, yes: bool) -> Self {
108         self.compiled.is_reverse = yes;
109         self
110     }
111 
112     /// Compile a regular expression given its AST.
113     ///
114     /// The compiler is guaranteed to succeed unless the program exceeds the
115     /// specified size limit. If the size limit is exceeded, then compilation
116     /// stops and returns an error.
compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error>117     pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
118         debug_assert!(!exprs.is_empty());
119         self.num_exprs = exprs.len();
120         if exprs.len() == 1 {
121             self.compile_one(&exprs[0])
122         } else {
123             self.compile_many(exprs)
124         }
125     }
126 
compile_one(mut self, expr: &Hir) -> result::Result<Program, Error>127     fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
128         // If we're compiling a forward DFA and we aren't anchored, then
129         // add a `.*?` before the first capture group.
130         // Other matching engines handle this by baking the logic into the
131         // matching engine itself.
132         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
133         self.compiled.is_anchored_start = expr.is_anchored_start();
134         self.compiled.is_anchored_end = expr.is_anchored_end();
135         if self.compiled.needs_dotstar() {
136             dotstar_patch = self.c_dotstar()?;
137             self.compiled.start = dotstar_patch.entry;
138         }
139         self.compiled.captures = vec![None];
140         let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
141         if self.compiled.needs_dotstar() {
142             self.fill(dotstar_patch.hole, patch.entry);
143         } else {
144             self.compiled.start = patch.entry;
145         }
146         self.fill_to_next(patch.hole);
147         self.compiled.matches = vec![self.insts.len()];
148         self.push_compiled(Inst::Match(0));
149         self.compile_finish()
150     }
151 
compile_many( mut self, exprs: &[Hir], ) -> result::Result<Program, Error>152     fn compile_many(
153         mut self,
154         exprs: &[Hir],
155     ) -> result::Result<Program, Error> {
156         debug_assert!(exprs.len() > 1);
157 
158         self.compiled.is_anchored_start =
159             exprs.iter().all(|e| e.is_anchored_start());
160         self.compiled.is_anchored_end =
161             exprs.iter().all(|e| e.is_anchored_end());
162         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
163         if self.compiled.needs_dotstar() {
164             dotstar_patch = self.c_dotstar()?;
165             self.compiled.start = dotstar_patch.entry;
166         } else {
167             self.compiled.start = 0; // first instruction is always split
168         }
169         self.fill_to_next(dotstar_patch.hole);
170 
171         let mut prev_hole = Hole::None;
172         for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
173             self.fill_to_next(prev_hole);
174             let split = self.push_split_hole();
175             let Patch { hole, entry } =
176                 self.c_capture(0, expr)?.unwrap_or(self.next_inst());
177             self.fill_to_next(hole);
178             self.compiled.matches.push(self.insts.len());
179             self.push_compiled(Inst::Match(i));
180             prev_hole = self.fill_split(split, Some(entry), None);
181         }
182         let i = exprs.len() - 1;
183         let Patch { hole, entry } =
184             self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
185         self.fill(prev_hole, entry);
186         self.fill_to_next(hole);
187         self.compiled.matches.push(self.insts.len());
188         self.push_compiled(Inst::Match(i));
189         self.compile_finish()
190     }
191 
compile_finish(mut self) -> result::Result<Program, Error>192     fn compile_finish(mut self) -> result::Result<Program, Error> {
193         self.compiled.insts =
194             self.insts.into_iter().map(|inst| inst.unwrap()).collect();
195         self.compiled.byte_classes = self.byte_classes.byte_classes();
196         self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
197         Ok(self.compiled)
198     }
199 
200     /// Compile expr into self.insts, returning a patch on success,
201     /// or an error if we run out of memory.
202     ///
203     /// All of the c_* methods of the compiler share the contract outlined
204     /// here.
205     ///
206     /// The main thing that a c_* method does is mutate `self.insts`
207     /// to add a list of mostly compiled instructions required to execute
208     /// the given expression. `self.insts` contains MaybeInsts rather than
209     /// Insts because there is some backpatching required.
210     ///
211     /// The `Patch` value returned by each c_* method provides metadata
212     /// about the compiled instructions emitted to `self.insts`. The
213     /// `entry` member of the patch refers to the first instruction
214     /// (the entry point), while the `hole` member contains zero or
215     /// more offsets to partial instructions that need to be backpatched.
216     /// The c_* routine can't know where its list of instructions are going to
217     /// jump to after execution, so it is up to the caller to patch
218     /// these jumps to point to the right place. So compiling some
219     /// expression, e, we would end up with a situation that looked like:
220     ///
221     /// ```text
222     /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
223     ///                     ^              ^             ^
224     ///                     |                \         /
225     ///                   entry                \     /
226     ///                                         hole
227     /// ```
228     ///
229     /// To compile two expressions, e1 and e2, concatenated together we
230     /// would do:
231     ///
232     /// ```ignore
233     /// let patch1 = self.c(e1);
234     /// let patch2 = self.c(e2);
235     /// ```
236     ///
237     /// while leaves us with a situation that looks like
238     ///
239     /// ```text
240     /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
241     ///                     ^        ^            ^        ^
242     ///                     |        |            |        |
243     ///                entry1        hole1   entry2        hole2
244     /// ```
245     ///
246     /// Then to merge the two patches together into one we would backpatch
247     /// hole1 with entry2 and return a new patch that enters at entry1
248     /// and has hole2 for a hole. In fact, if you look at the c_concat
249     /// method you will see that it does exactly this, though it handles
250     /// a list of expressions rather than just the two that we use for
251     /// an example.
252     ///
253     /// Ok(None) is returned when an expression is compiled to no
254     /// instruction, and so no patch.entry value makes sense.
c(&mut self, expr: &Hir) -> ResultOrEmpty255     fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
256         use prog;
257         use syntax::hir::HirKind::*;
258 
259         self.check_size()?;
260         match *expr.kind() {
261             Empty => Ok(None),
262             Literal(hir::Literal::Unicode(c)) => self.c_char(c),
263             Literal(hir::Literal::Byte(b)) => {
264                 assert!(self.compiled.uses_bytes());
265                 self.c_byte(b)
266             }
267             Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
268             Class(hir::Class::Bytes(ref cls)) => {
269                 if self.compiled.uses_bytes() {
270                     self.c_class_bytes(cls.ranges())
271                 } else {
272                     assert!(cls.is_all_ascii());
273                     let mut char_ranges = vec![];
274                     for r in cls.iter() {
275                         let (s, e) = (r.start() as char, r.end() as char);
276                         char_ranges.push(hir::ClassUnicodeRange::new(s, e));
277                     }
278                     self.c_class(&char_ranges)
279                 }
280             }
281             Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
282                 self.byte_classes.set_range(b'\n', b'\n');
283                 self.c_empty_look(prog::EmptyLook::EndLine)
284             }
285             Anchor(hir::Anchor::StartLine) => {
286                 self.byte_classes.set_range(b'\n', b'\n');
287                 self.c_empty_look(prog::EmptyLook::StartLine)
288             }
289             Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
290                 self.byte_classes.set_range(b'\n', b'\n');
291                 self.c_empty_look(prog::EmptyLook::StartLine)
292             }
293             Anchor(hir::Anchor::EndLine) => {
294                 self.byte_classes.set_range(b'\n', b'\n');
295                 self.c_empty_look(prog::EmptyLook::EndLine)
296             }
297             Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
298                 self.c_empty_look(prog::EmptyLook::EndText)
299             }
300             Anchor(hir::Anchor::StartText) => {
301                 self.c_empty_look(prog::EmptyLook::StartText)
302             }
303             Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
304                 self.c_empty_look(prog::EmptyLook::StartText)
305             }
306             Anchor(hir::Anchor::EndText) => {
307                 self.c_empty_look(prog::EmptyLook::EndText)
308             }
309             WordBoundary(hir::WordBoundary::Unicode) => {
310                 if !cfg!(feature = "unicode-perl") {
311                     return Err(Error::Syntax(
312                         "Unicode word boundaries are unavailable when \
313                          the unicode-perl feature is disabled"
314                             .to_string(),
315                     ));
316                 }
317                 self.compiled.has_unicode_word_boundary = true;
318                 self.byte_classes.set_word_boundary();
319                 self.c_empty_look(prog::EmptyLook::WordBoundary)
320             }
321             WordBoundary(hir::WordBoundary::UnicodeNegate) => {
322                 if !cfg!(feature = "unicode-perl") {
323                     return Err(Error::Syntax(
324                         "Unicode word boundaries are unavailable when \
325                          the unicode-perl feature is disabled"
326                             .to_string(),
327                     ));
328                 }
329                 self.compiled.has_unicode_word_boundary = true;
330                 self.byte_classes.set_word_boundary();
331                 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
332             }
333             WordBoundary(hir::WordBoundary::Ascii) => {
334                 self.byte_classes.set_word_boundary();
335                 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
336             }
337             WordBoundary(hir::WordBoundary::AsciiNegate) => {
338                 self.byte_classes.set_word_boundary();
339                 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
340             }
341             Group(ref g) => match g.kind {
342                 hir::GroupKind::NonCapturing => self.c(&g.hir),
343                 hir::GroupKind::CaptureIndex(index) => {
344                     if index as usize >= self.compiled.captures.len() {
345                         self.compiled.captures.push(None);
346                     }
347                     self.c_capture(2 * index as usize, &g.hir)
348                 }
349                 hir::GroupKind::CaptureName { index, ref name } => {
350                     if index as usize >= self.compiled.captures.len() {
351                         let n = name.to_string();
352                         self.compiled.captures.push(Some(n.clone()));
353                         self.capture_name_idx.insert(n, index as usize);
354                     }
355                     self.c_capture(2 * index as usize, &g.hir)
356                 }
357             },
358             Concat(ref es) => {
359                 if self.compiled.is_reverse {
360                     self.c_concat(es.iter().rev())
361                 } else {
362                     self.c_concat(es)
363                 }
364             }
365             Alternation(ref es) => self.c_alternate(&**es),
366             Repetition(ref rep) => self.c_repeat(rep),
367         }
368     }
369 
c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty370     fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
371         if self.num_exprs > 1 || self.compiled.is_dfa {
372             // Don't ever compile Save instructions for regex sets because
373             // they are never used. They are also never used in DFA programs
374             // because DFAs can't handle captures.
375             self.c(expr)
376         } else {
377             let entry = self.insts.len();
378             let hole = self.push_hole(InstHole::Save { slot: first_slot });
379             let patch = self.c(expr)?.unwrap_or(self.next_inst());
380             self.fill(hole, patch.entry);
381             self.fill_to_next(patch.hole);
382             let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
383             Ok(Some(Patch { hole: hole, entry: entry }))
384         }
385     }
386 
c_dotstar(&mut self) -> Result387     fn c_dotstar(&mut self) -> Result {
388         Ok(if !self.compiled.only_utf8() {
389             self.c(&Hir::repetition(hir::Repetition {
390                 kind: hir::RepetitionKind::ZeroOrMore,
391                 greedy: false,
392                 hir: Box::new(Hir::any(true)),
393             }))?
394             .unwrap()
395         } else {
396             self.c(&Hir::repetition(hir::Repetition {
397                 kind: hir::RepetitionKind::ZeroOrMore,
398                 greedy: false,
399                 hir: Box::new(Hir::any(false)),
400             }))?
401             .unwrap()
402         })
403     }
404 
c_char(&mut self, c: char) -> ResultOrEmpty405     fn c_char(&mut self, c: char) -> ResultOrEmpty {
406         if self.compiled.uses_bytes() {
407             if c.is_ascii() {
408                 let b = c as u8;
409                 let hole =
410                     self.push_hole(InstHole::Bytes { start: b, end: b });
411                 self.byte_classes.set_range(b, b);
412                 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
413             } else {
414                 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
415             }
416         } else {
417             let hole = self.push_hole(InstHole::Char { c: c });
418             Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
419         }
420     }
421 
c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty422     fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
423         assert!(!ranges.is_empty());
424         if self.compiled.uses_bytes() {
425             Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
426         } else {
427             let ranges: Vec<(char, char)> =
428                 ranges.iter().map(|r| (r.start(), r.end())).collect();
429             let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
430                 self.push_hole(InstHole::Char { c: ranges[0].0 })
431             } else {
432                 self.push_hole(InstHole::Ranges { ranges: ranges })
433             };
434             Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
435         }
436     }
437 
c_byte(&mut self, b: u8) -> ResultOrEmpty438     fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
439         self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
440     }
441 
c_class_bytes( &mut self, ranges: &[hir::ClassBytesRange], ) -> ResultOrEmpty442     fn c_class_bytes(
443         &mut self,
444         ranges: &[hir::ClassBytesRange],
445     ) -> ResultOrEmpty {
446         debug_assert!(!ranges.is_empty());
447 
448         let first_split_entry = self.insts.len();
449         let mut holes = vec![];
450         let mut prev_hole = Hole::None;
451         for r in &ranges[0..ranges.len() - 1] {
452             self.fill_to_next(prev_hole);
453             let split = self.push_split_hole();
454             let next = self.insts.len();
455             self.byte_classes.set_range(r.start(), r.end());
456             holes.push(self.push_hole(InstHole::Bytes {
457                 start: r.start(),
458                 end: r.end(),
459             }));
460             prev_hole = self.fill_split(split, Some(next), None);
461         }
462         let next = self.insts.len();
463         let r = &ranges[ranges.len() - 1];
464         self.byte_classes.set_range(r.start(), r.end());
465         holes.push(
466             self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
467         );
468         self.fill(prev_hole, next);
469         Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
470     }
471 
c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty472     fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
473         let hole = self.push_hole(InstHole::EmptyLook { look: look });
474         Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
475     }
476 
c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty where I: IntoIterator<Item = &'a Hir>,477     fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
478     where
479         I: IntoIterator<Item = &'a Hir>,
480     {
481         let mut exprs = exprs.into_iter();
482         let Patch { mut hole, entry } = loop {
483             match exprs.next() {
484                 None => return Ok(None),
485                 Some(e) => {
486                     if let Some(p) = self.c(e)? {
487                         break p;
488                     }
489                 }
490             }
491         };
492         for e in exprs {
493             if let Some(p) = self.c(e)? {
494                 self.fill(hole, p.entry);
495                 hole = p.hole;
496             }
497         }
498         Ok(Some(Patch { hole: hole, entry: entry }))
499     }
500 
c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty501     fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
502         debug_assert!(
503             exprs.len() >= 2,
504             "alternates must have at least 2 exprs"
505         );
506 
507         // Initial entry point is always the first split.
508         let first_split_entry = self.insts.len();
509 
510         // Save up all of the holes from each alternate. They will all get
511         // patched to point to the same location.
512         let mut holes = vec![];
513 
514         // true indicates that the hole is a split where we want to fill
515         // the second branch.
516         let mut prev_hole = (Hole::None, false);
517         for e in &exprs[0..exprs.len() - 1] {
518             if prev_hole.1 {
519                 let next = self.insts.len();
520                 self.fill_split(prev_hole.0, None, Some(next));
521             } else {
522                 self.fill_to_next(prev_hole.0);
523             }
524             let split = self.push_split_hole();
525             if let Some(Patch { hole, entry }) = self.c(e)? {
526                 holes.push(hole);
527                 prev_hole = (self.fill_split(split, Some(entry), None), false);
528             } else {
529                 let (split1, split2) = split.dup_one();
530                 holes.push(split1);
531                 prev_hole = (split2, true);
532             }
533         }
534         if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
535             holes.push(hole);
536             if prev_hole.1 {
537                 self.fill_split(prev_hole.0, None, Some(entry));
538             } else {
539                 self.fill(prev_hole.0, entry);
540             }
541         } else {
542             // We ignore prev_hole.1. When it's true, it means we have two
543             // empty branches both pushing prev_hole.0 into holes, so both
544             // branches will go to the same place anyway.
545             holes.push(prev_hole.0);
546         }
547         Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
548     }
549 
c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty550     fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
551         use syntax::hir::RepetitionKind::*;
552         match rep.kind {
553             ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
554             ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
555             OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
556             Range(hir::RepetitionRange::Exactly(min_max)) => {
557                 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
558             }
559             Range(hir::RepetitionRange::AtLeast(min)) => {
560                 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
561             }
562             Range(hir::RepetitionRange::Bounded(min, max)) => {
563                 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
564             }
565         }
566     }
567 
c_repeat_zero_or_one( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty568     fn c_repeat_zero_or_one(
569         &mut self,
570         expr: &Hir,
571         greedy: bool,
572     ) -> ResultOrEmpty {
573         let split_entry = self.insts.len();
574         let split = self.push_split_hole();
575         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
576             Some(p) => p,
577             None => return self.pop_split_hole(),
578         };
579         let split_hole = if greedy {
580             self.fill_split(split, Some(entry_rep), None)
581         } else {
582             self.fill_split(split, None, Some(entry_rep))
583         };
584         let holes = vec![hole_rep, split_hole];
585         Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
586     }
587 
c_repeat_zero_or_more( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty588     fn c_repeat_zero_or_more(
589         &mut self,
590         expr: &Hir,
591         greedy: bool,
592     ) -> ResultOrEmpty {
593         let split_entry = self.insts.len();
594         let split = self.push_split_hole();
595         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
596             Some(p) => p,
597             None => return self.pop_split_hole(),
598         };
599 
600         self.fill(hole_rep, split_entry);
601         let split_hole = if greedy {
602             self.fill_split(split, Some(entry_rep), None)
603         } else {
604             self.fill_split(split, None, Some(entry_rep))
605         };
606         Ok(Some(Patch { hole: split_hole, entry: split_entry }))
607     }
608 
c_repeat_one_or_more( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty609     fn c_repeat_one_or_more(
610         &mut self,
611         expr: &Hir,
612         greedy: bool,
613     ) -> ResultOrEmpty {
614         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
615             Some(p) => p,
616             None => return Ok(None),
617         };
618         self.fill_to_next(hole_rep);
619         let split = self.push_split_hole();
620 
621         let split_hole = if greedy {
622             self.fill_split(split, Some(entry_rep), None)
623         } else {
624             self.fill_split(split, None, Some(entry_rep))
625         };
626         Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
627     }
628 
c_repeat_range_min_or_more( &mut self, expr: &Hir, greedy: bool, min: u32, ) -> ResultOrEmpty629     fn c_repeat_range_min_or_more(
630         &mut self,
631         expr: &Hir,
632         greedy: bool,
633         min: u32,
634     ) -> ResultOrEmpty {
635         let min = u32_to_usize(min);
636         // Using next_inst() is ok, because we can't return it (concat would
637         // have to return Some(_) while c_repeat_range_min_or_more returns
638         // None).
639         let patch_concat = self
640             .c_concat(iter::repeat(expr).take(min))?
641             .unwrap_or(self.next_inst());
642         if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
643             self.fill(patch_concat.hole, patch_rep.entry);
644             Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
645         } else {
646             Ok(None)
647         }
648     }
649 
c_repeat_range( &mut self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> ResultOrEmpty650     fn c_repeat_range(
651         &mut self,
652         expr: &Hir,
653         greedy: bool,
654         min: u32,
655         max: u32,
656     ) -> ResultOrEmpty {
657         let (min, max) = (u32_to_usize(min), u32_to_usize(max));
658         debug_assert!(min <= max);
659         let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
660         if min == max {
661             return Ok(patch_concat);
662         }
663         // Same reasoning as in c_repeat_range_min_or_more (we know that min <
664         // max at this point).
665         let patch_concat = patch_concat.unwrap_or(self.next_inst());
666         let initial_entry = patch_concat.entry;
667         // It is much simpler to compile, e.g., `a{2,5}` as:
668         //
669         //     aaa?a?a?
670         //
671         // But you end up with a sequence of instructions like this:
672         //
673         //     0: 'a'
674         //     1: 'a',
675         //     2: split(3, 4)
676         //     3: 'a'
677         //     4: split(5, 6)
678         //     5: 'a'
679         //     6: split(7, 8)
680         //     7: 'a'
681         //     8: MATCH
682         //
683         // This is *incredibly* inefficient because the splits end
684         // up forming a chain, which has to be resolved everything a
685         // transition is followed.
686         let mut holes = vec![];
687         let mut prev_hole = patch_concat.hole;
688         for _ in min..max {
689             self.fill_to_next(prev_hole);
690             let split = self.push_split_hole();
691             let Patch { hole, entry } = match self.c(expr)? {
692                 Some(p) => p,
693                 None => return self.pop_split_hole(),
694             };
695             prev_hole = hole;
696             if greedy {
697                 holes.push(self.fill_split(split, Some(entry), None));
698             } else {
699                 holes.push(self.fill_split(split, None, Some(entry)));
700             }
701         }
702         holes.push(prev_hole);
703         Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
704     }
705 
706     /// Can be used as a default value for the c_* functions when the call to
707     /// c_function is followed by inserting at least one instruction that is
708     /// always executed after the ones written by the c* function.
next_inst(&self) -> Patch709     fn next_inst(&self) -> Patch {
710         Patch { hole: Hole::None, entry: self.insts.len() }
711     }
712 
fill(&mut self, hole: Hole, goto: InstPtr)713     fn fill(&mut self, hole: Hole, goto: InstPtr) {
714         match hole {
715             Hole::None => {}
716             Hole::One(pc) => {
717                 self.insts[pc].fill(goto);
718             }
719             Hole::Many(holes) => {
720                 for hole in holes {
721                     self.fill(hole, goto);
722                 }
723             }
724         }
725     }
726 
fill_to_next(&mut self, hole: Hole)727     fn fill_to_next(&mut self, hole: Hole) {
728         let next = self.insts.len();
729         self.fill(hole, next);
730     }
731 
fill_split( &mut self, hole: Hole, goto1: Option<InstPtr>, goto2: Option<InstPtr>, ) -> Hole732     fn fill_split(
733         &mut self,
734         hole: Hole,
735         goto1: Option<InstPtr>,
736         goto2: Option<InstPtr>,
737     ) -> Hole {
738         match hole {
739             Hole::None => Hole::None,
740             Hole::One(pc) => match (goto1, goto2) {
741                 (Some(goto1), Some(goto2)) => {
742                     self.insts[pc].fill_split(goto1, goto2);
743                     Hole::None
744                 }
745                 (Some(goto1), None) => {
746                     self.insts[pc].half_fill_split_goto1(goto1);
747                     Hole::One(pc)
748                 }
749                 (None, Some(goto2)) => {
750                     self.insts[pc].half_fill_split_goto2(goto2);
751                     Hole::One(pc)
752                 }
753                 (None, None) => unreachable!(
754                     "at least one of the split \
755                      holes must be filled"
756                 ),
757             },
758             Hole::Many(holes) => {
759                 let mut new_holes = vec![];
760                 for hole in holes {
761                     new_holes.push(self.fill_split(hole, goto1, goto2));
762                 }
763                 if new_holes.is_empty() {
764                     Hole::None
765                 } else if new_holes.len() == 1 {
766                     new_holes.pop().unwrap()
767                 } else {
768                     Hole::Many(new_holes)
769                 }
770             }
771         }
772     }
773 
push_compiled(&mut self, inst: Inst)774     fn push_compiled(&mut self, inst: Inst) {
775         self.insts.push(MaybeInst::Compiled(inst));
776     }
777 
push_hole(&mut self, inst: InstHole) -> Hole778     fn push_hole(&mut self, inst: InstHole) -> Hole {
779         let hole = self.insts.len();
780         self.insts.push(MaybeInst::Uncompiled(inst));
781         Hole::One(hole)
782     }
783 
push_split_hole(&mut self) -> Hole784     fn push_split_hole(&mut self) -> Hole {
785         let hole = self.insts.len();
786         self.insts.push(MaybeInst::Split);
787         Hole::One(hole)
788     }
789 
pop_split_hole(&mut self) -> ResultOrEmpty790     fn pop_split_hole(&mut self) -> ResultOrEmpty {
791         self.insts.pop();
792         Ok(None)
793     }
794 
check_size(&self) -> result::Result<(), Error>795     fn check_size(&self) -> result::Result<(), Error> {
796         use std::mem::size_of;
797 
798         if self.insts.len() * size_of::<Inst>() > self.size_limit {
799             Err(Error::CompiledTooBig(self.size_limit))
800         } else {
801             Ok(())
802         }
803     }
804 }
805 
806 #[derive(Debug)]
807 enum Hole {
808     None,
809     One(InstPtr),
810     Many(Vec<Hole>),
811 }
812 
813 impl Hole {
dup_one(self) -> (Self, Self)814     fn dup_one(self) -> (Self, Self) {
815         match self {
816             Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
817             Hole::None | Hole::Many(_) => {
818                 unreachable!("must be called on single hole")
819             }
820         }
821     }
822 }
823 
824 #[derive(Clone, Debug)]
825 enum MaybeInst {
826     Compiled(Inst),
827     Uncompiled(InstHole),
828     Split,
829     Split1(InstPtr),
830     Split2(InstPtr),
831 }
832 
833 impl MaybeInst {
fill(&mut self, goto: InstPtr)834     fn fill(&mut self, goto: InstPtr) {
835         let maybeinst = match *self {
836             MaybeInst::Split => MaybeInst::Split1(goto),
837             MaybeInst::Uncompiled(ref inst) => {
838                 MaybeInst::Compiled(inst.fill(goto))
839             }
840             MaybeInst::Split1(goto1) => {
841                 MaybeInst::Compiled(Inst::Split(InstSplit {
842                     goto1: goto1,
843                     goto2: goto,
844                 }))
845             }
846             MaybeInst::Split2(goto2) => {
847                 MaybeInst::Compiled(Inst::Split(InstSplit {
848                     goto1: goto,
849                     goto2: goto2,
850                 }))
851             }
852             _ => unreachable!(
853                 "not all instructions were compiled! \
854                  found uncompiled instruction: {:?}",
855                 self
856             ),
857         };
858         *self = maybeinst;
859     }
860 
fill_split(&mut self, goto1: InstPtr, goto2: InstPtr)861     fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
862         let filled = match *self {
863             MaybeInst::Split => {
864                 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
865             }
866             _ => unreachable!(
867                 "must be called on Split instruction, \
868                  instead it was called on: {:?}",
869                 self
870             ),
871         };
872         *self = MaybeInst::Compiled(filled);
873     }
874 
half_fill_split_goto1(&mut self, goto1: InstPtr)875     fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
876         let half_filled = match *self {
877             MaybeInst::Split => goto1,
878             _ => unreachable!(
879                 "must be called on Split instruction, \
880                  instead it was called on: {:?}",
881                 self
882             ),
883         };
884         *self = MaybeInst::Split1(half_filled);
885     }
886 
half_fill_split_goto2(&mut self, goto2: InstPtr)887     fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
888         let half_filled = match *self {
889             MaybeInst::Split => goto2,
890             _ => unreachable!(
891                 "must be called on Split instruction, \
892                  instead it was called on: {:?}",
893                 self
894             ),
895         };
896         *self = MaybeInst::Split2(half_filled);
897     }
898 
unwrap(self) -> Inst899     fn unwrap(self) -> Inst {
900         match self {
901             MaybeInst::Compiled(inst) => inst,
902             _ => unreachable!(
903                 "must be called on a compiled instruction, \
904                  instead it was called on: {:?}",
905                 self
906             ),
907         }
908     }
909 }
910 
911 #[derive(Clone, Debug)]
912 enum InstHole {
913     Save { slot: usize },
914     EmptyLook { look: EmptyLook },
915     Char { c: char },
916     Ranges { ranges: Vec<(char, char)> },
917     Bytes { start: u8, end: u8 },
918 }
919 
920 impl InstHole {
fill(&self, goto: InstPtr) -> Inst921     fn fill(&self, goto: InstPtr) -> Inst {
922         match *self {
923             InstHole::Save { slot } => {
924                 Inst::Save(InstSave { goto: goto, slot: slot })
925             }
926             InstHole::EmptyLook { look } => {
927                 Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
928             }
929             InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
930             InstHole::Ranges { ref ranges } => {
931                 Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
932             }
933             InstHole::Bytes { start, end } => {
934                 Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
935             }
936         }
937     }
938 }
939 
940 struct CompileClass<'a, 'b> {
941     c: &'a mut Compiler,
942     ranges: &'b [hir::ClassUnicodeRange],
943 }
944 
945 impl<'a, 'b> CompileClass<'a, 'b> {
compile(mut self) -> Result946     fn compile(mut self) -> Result {
947         let mut holes = vec![];
948         let mut initial_entry = None;
949         let mut last_split = Hole::None;
950         let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
951         self.c.suffix_cache.clear();
952 
953         for (i, range) in self.ranges.iter().enumerate() {
954             let is_last_range = i + 1 == self.ranges.len();
955             utf8_seqs.reset(range.start(), range.end());
956             let mut it = (&mut utf8_seqs).peekable();
957             loop {
958                 let utf8_seq = match it.next() {
959                     None => break,
960                     Some(utf8_seq) => utf8_seq,
961                 };
962                 if is_last_range && it.peek().is_none() {
963                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
964                     holes.push(hole);
965                     self.c.fill(last_split, entry);
966                     last_split = Hole::None;
967                     if initial_entry.is_none() {
968                         initial_entry = Some(entry);
969                     }
970                 } else {
971                     if initial_entry.is_none() {
972                         initial_entry = Some(self.c.insts.len());
973                     }
974                     self.c.fill_to_next(last_split);
975                     last_split = self.c.push_split_hole();
976                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
977                     holes.push(hole);
978                     last_split =
979                         self.c.fill_split(last_split, Some(entry), None);
980                 }
981             }
982         }
983         self.c.utf8_seqs = Some(utf8_seqs);
984         Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
985     }
986 
c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result987     fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
988         if self.c.compiled.is_reverse {
989             self.c_utf8_seq_(seq)
990         } else {
991             self.c_utf8_seq_(seq.into_iter().rev())
992         }
993     }
994 
c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result where I: IntoIterator<Item = &'r Utf8Range>,995     fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
996     where
997         I: IntoIterator<Item = &'r Utf8Range>,
998     {
999         // The initial instruction for each UTF-8 sequence should be the same.
1000         let mut from_inst = ::std::usize::MAX;
1001         let mut last_hole = Hole::None;
1002         for byte_range in seq {
1003             let key = SuffixCacheKey {
1004                 from_inst: from_inst,
1005                 start: byte_range.start,
1006                 end: byte_range.end,
1007             };
1008             {
1009                 let pc = self.c.insts.len();
1010                 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1011                     from_inst = cached_pc;
1012                     continue;
1013                 }
1014             }
1015             self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1016             if from_inst == ::std::usize::MAX {
1017                 last_hole = self.c.push_hole(InstHole::Bytes {
1018                     start: byte_range.start,
1019                     end: byte_range.end,
1020                 });
1021             } else {
1022                 self.c.push_compiled(Inst::Bytes(InstBytes {
1023                     goto: from_inst,
1024                     start: byte_range.start,
1025                     end: byte_range.end,
1026                 }));
1027             }
1028             from_inst = self.c.insts.len().checked_sub(1).unwrap();
1029             debug_assert!(from_inst < ::std::usize::MAX);
1030         }
1031         debug_assert!(from_inst < ::std::usize::MAX);
1032         Ok(Patch { hole: last_hole, entry: from_inst })
1033     }
1034 }
1035 
1036 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1037 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1038 /// The set of byte ranges looks like this:
1039 ///
1040 /// [0-7F]
1041 /// [C2-DF][80-BF]
1042 /// [E0][A0-BF][80-BF]
1043 /// [E1-EC][80-BF][80-BF]
1044 /// [ED][80-9F][80-BF]
1045 /// [EE-EF][80-BF][80-BF]
1046 ///
1047 /// Each line above translates to one alternate in the compiled regex program.
1048 /// However, all but one of the alternates end in the same suffix, which is
1049 /// a waste of an instruction. The suffix cache facilitates reusing them across
1050 /// alternates.
1051 ///
1052 /// Note that a HashMap could be trivially used for this, but we don't need its
1053 /// overhead. Some small bounded space (LRU style) is more than enough.
1054 ///
1055 /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1056 /// except it uses hashes as original indices and then compares full keys for
1057 /// validation against `dense` array.
1058 #[derive(Debug)]
1059 struct SuffixCache {
1060     sparse: Box<[usize]>,
1061     dense: Vec<SuffixCacheEntry>,
1062 }
1063 
1064 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1065 struct SuffixCacheEntry {
1066     key: SuffixCacheKey,
1067     pc: InstPtr,
1068 }
1069 
1070 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1071 struct SuffixCacheKey {
1072     from_inst: InstPtr,
1073     start: u8,
1074     end: u8,
1075 }
1076 
1077 impl SuffixCache {
new(size: usize) -> Self1078     fn new(size: usize) -> Self {
1079         SuffixCache {
1080             sparse: vec![0usize; size].into(),
1081             dense: Vec::with_capacity(size),
1082         }
1083     }
1084 
get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr>1085     fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1086         let hash = self.hash(&key);
1087         let pos = &mut self.sparse[hash];
1088         if let Some(entry) = self.dense.get(*pos) {
1089             if entry.key == key {
1090                 return Some(entry.pc);
1091             }
1092         }
1093         *pos = self.dense.len();
1094         self.dense.push(SuffixCacheEntry { key: key, pc: pc });
1095         None
1096     }
1097 
clear(&mut self)1098     fn clear(&mut self) {
1099         self.dense.clear();
1100     }
1101 
hash(&self, suffix: &SuffixCacheKey) -> usize1102     fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1103         // Basic FNV-1a hash as described:
1104         // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1105         const FNV_PRIME: u64 = 1099511628211;
1106         let mut h = 14695981039346656037;
1107         h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1108         h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1109         h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1110         (h as usize) % self.sparse.len()
1111     }
1112 }
1113 
1114 struct ByteClassSet([bool; 256]);
1115 
1116 impl ByteClassSet {
new() -> Self1117     fn new() -> Self {
1118         ByteClassSet([false; 256])
1119     }
1120 
set_range(&mut self, start: u8, end: u8)1121     fn set_range(&mut self, start: u8, end: u8) {
1122         debug_assert!(start <= end);
1123         if start > 0 {
1124             self.0[start as usize - 1] = true;
1125         }
1126         self.0[end as usize] = true;
1127     }
1128 
set_word_boundary(&mut self)1129     fn set_word_boundary(&mut self) {
1130         // We need to mark all ranges of bytes whose pairs result in
1131         // evaluating \b differently.
1132         let iswb = is_word_byte;
1133         let mut b1: u16 = 0;
1134         let mut b2: u16;
1135         while b1 <= 255 {
1136             b2 = b1 + 1;
1137             while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1138                 b2 += 1;
1139             }
1140             self.set_range(b1 as u8, (b2 - 1) as u8);
1141             b1 = b2;
1142         }
1143     }
1144 
byte_classes(&self) -> Vec<u8>1145     fn byte_classes(&self) -> Vec<u8> {
1146         // N.B. If you're debugging the DFA, it's useful to simply return
1147         // `(0..256).collect()`, which effectively removes the byte classes
1148         // and makes the transitions easier to read.
1149         // (0usize..256).map(|x| x as u8).collect()
1150         let mut byte_classes = vec![0; 256];
1151         let mut class = 0u8;
1152         let mut i = 0;
1153         loop {
1154             byte_classes[i] = class as u8;
1155             if i >= 255 {
1156                 break;
1157             }
1158             if self.0[i] {
1159                 class = class.checked_add(1).unwrap();
1160             }
1161             i += 1;
1162         }
1163         byte_classes
1164     }
1165 }
1166 
1167 impl fmt::Debug for ByteClassSet {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1168     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1169         f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
1170     }
1171 }
1172 
u32_to_usize(n: u32) -> usize1173 fn u32_to_usize(n: u32) -> usize {
1174     // In case usize is less than 32 bits, we need to guard against overflow.
1175     // On most platforms this compiles to nothing.
1176     // TODO Use `std::convert::TryFrom` once it's stable.
1177     if (n as u64) > (::std::usize::MAX as u64) {
1178         panic!("BUG: {} is too big to be pointer sized", n)
1179     }
1180     n as usize
1181 }
1182 
1183 #[cfg(test)]
1184 mod tests {
1185     use super::ByteClassSet;
1186 
1187     #[test]
byte_classes()1188     fn byte_classes() {
1189         let mut set = ByteClassSet::new();
1190         set.set_range(b'a', b'z');
1191         let classes = set.byte_classes();
1192         assert_eq!(classes[0], 0);
1193         assert_eq!(classes[1], 0);
1194         assert_eq!(classes[2], 0);
1195         assert_eq!(classes[b'a' as usize - 1], 0);
1196         assert_eq!(classes[b'a' as usize], 1);
1197         assert_eq!(classes[b'm' as usize], 1);
1198         assert_eq!(classes[b'z' as usize], 1);
1199         assert_eq!(classes[b'z' as usize + 1], 2);
1200         assert_eq!(classes[254], 2);
1201         assert_eq!(classes[255], 2);
1202 
1203         let mut set = ByteClassSet::new();
1204         set.set_range(0, 2);
1205         set.set_range(4, 6);
1206         let classes = set.byte_classes();
1207         assert_eq!(classes[0], 0);
1208         assert_eq!(classes[1], 0);
1209         assert_eq!(classes[2], 0);
1210         assert_eq!(classes[3], 1);
1211         assert_eq!(classes[4], 2);
1212         assert_eq!(classes[5], 2);
1213         assert_eq!(classes[6], 2);
1214         assert_eq!(classes[7], 3);
1215         assert_eq!(classes[255], 3);
1216     }
1217 
1218     #[test]
full_byte_classes()1219     fn full_byte_classes() {
1220         let mut set = ByteClassSet::new();
1221         for i in 0..256u16 {
1222             set.set_range(i as u8, i as u8);
1223         }
1224         assert_eq!(set.byte_classes().len(), 256);
1225     }
1226 }
1227