1 // Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use std::collections::HashMap;
12 use std::iter;
13 use std::result;
14 use std::sync::Arc;
15 
16 use syntax::is_word_byte;
17 use syntax::hir::{self, Hir};
18 use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences};
19 
20 use prog::{
21     Program, Inst, InstPtr, EmptyLook,
22     InstSave, InstSplit, InstEmptyLook, InstChar, InstRanges, InstBytes,
23 };
24 
25 use Error;
26 
27 type Result = result::Result<Patch, Error>;
28 
29 #[derive(Debug)]
30 struct Patch {
31     hole: Hole,
32     entry: InstPtr,
33 }
34 
35 /// A compiler translates a regular expression AST to a sequence of
36 /// instructions. The sequence of instructions represents an NFA.
37 pub struct Compiler {
38     insts: Vec<MaybeInst>,
39     compiled: Program,
40     capture_name_idx: HashMap<String, usize>,
41     num_exprs: usize,
42     size_limit: usize,
43     suffix_cache: SuffixCache,
44     utf8_seqs: Option<Utf8Sequences>,
45     byte_classes: ByteClassSet,
46 }
47 
48 impl Compiler {
49     /// Create a new regular expression compiler.
50     ///
51     /// Various options can be set before calling `compile` on an expression.
new() -> Self52     pub fn new() -> Self {
53         Compiler {
54             insts: vec![],
55             compiled: Program::new(),
56             capture_name_idx: HashMap::new(),
57             num_exprs: 0,
58             size_limit: 10 * (1 << 20),
59             suffix_cache: SuffixCache::new(1000),
60             utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
61             byte_classes: ByteClassSet::new(),
62         }
63     }
64 
65     /// The size of the resulting program is limited by size_limit. If
66     /// the program approximately exceeds the given size (in bytes), then
67     /// compilation will stop and return an error.
size_limit(mut self, size_limit: usize) -> Self68     pub fn size_limit(mut self, size_limit: usize) -> Self {
69         self.size_limit = size_limit;
70         self
71     }
72 
73     /// If bytes is true, then the program is compiled as a byte based
74     /// automaton, which incorporates UTF-8 decoding into the machine. If it's
75     /// false, then the automaton is Unicode scalar value based, e.g., an
76     /// engine utilizing such an automaton is resposible for UTF-8 decoding.
77     ///
78     /// The specific invariant is that when returning a byte based machine,
79     /// the neither the `Char` nor `Ranges` instructions are produced.
80     /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
81     /// instruction is never produced.
82     ///
83     /// Note that `dfa(true)` implies `bytes(true)`.
bytes(mut self, yes: bool) -> Self84     pub fn bytes(mut self, yes: bool) -> Self {
85         self.compiled.is_bytes = yes;
86         self
87     }
88 
89     /// When disabled, the program compiled may match arbitrary bytes.
90     ///
91     /// When enabled (the default), all compiled programs exclusively match
92     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self93     pub fn only_utf8(mut self, yes: bool) -> Self {
94         self.compiled.only_utf8 = yes;
95         self
96     }
97 
98     /// When set, the machine returned is suitable for use in the DFA matching
99     /// engine.
100     ///
101     /// In particular, this ensures that if the regex is not anchored in the
102     /// beginning, then a preceding `.*?` is included in the program. (The NFA
103     /// based engines handle the preceding `.*?` explicitly, which is difficult
104     /// or impossible in the DFA engine.)
dfa(mut self, yes: bool) -> Self105     pub fn dfa(mut self, yes: bool) -> Self {
106         self.compiled.is_dfa = yes;
107         self
108     }
109 
110     /// When set, the machine returned is suitable for matching text in
111     /// reverse. In particular, all concatenations are flipped.
reverse(mut self, yes: bool) -> Self112     pub fn reverse(mut self, yes: bool) -> Self {
113         self.compiled.is_reverse = yes;
114         self
115     }
116 
117     /// Compile a regular expression given its AST.
118     ///
119     /// The compiler is guaranteed to succeed unless the program exceeds the
120     /// specified size limit. If the size limit is exceeded, then compilation
121     /// stops and returns an error.
compile( mut self, exprs: &[Hir], ) -> result::Result<Program, Error>122     pub fn compile(
123         mut self,
124         exprs: &[Hir],
125     ) -> result::Result<Program, Error> {
126         debug_assert!(exprs.len() >= 1);
127         self.num_exprs = exprs.len();
128         if exprs.len() == 1 {
129             self.compile_one(&exprs[0])
130         } else {
131             self.compile_many(exprs)
132         }
133     }
134 
compile_one(mut self, expr: &Hir) -> result::Result<Program, Error>135     fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
136         // If we're compiling a forward DFA and we aren't anchored, then
137         // add a `.*?` before the first capture group.
138         // Other matching engines handle this by baking the logic into the
139         // matching engine itself.
140         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
141         self.compiled.is_anchored_start = expr.is_anchored_start();
142         self.compiled.is_anchored_end = expr.is_anchored_end();
143         if self.compiled.needs_dotstar() {
144             dotstar_patch = self.c_dotstar()?;
145             self.compiled.start = dotstar_patch.entry;
146         }
147         self.compiled.captures = vec![None];
148         let patch = self.c_capture(0, expr)?;
149         if self.compiled.needs_dotstar() {
150             self.fill(dotstar_patch.hole, patch.entry);
151         } else {
152             self.compiled.start = patch.entry;
153         }
154         self.fill_to_next(patch.hole);
155         self.compiled.matches = vec![self.insts.len()];
156         self.push_compiled(Inst::Match(0));
157         self.compile_finish()
158     }
159 
compile_many( mut self, exprs: &[Hir], ) -> result::Result<Program, Error>160     fn compile_many(
161         mut self,
162         exprs: &[Hir],
163     ) -> result::Result<Program, Error> {
164         debug_assert!(exprs.len() > 1);
165 
166         self.compiled.is_anchored_start =
167             exprs.iter().all(|e| e.is_anchored_start());
168         self.compiled.is_anchored_end =
169             exprs.iter().all(|e| e.is_anchored_end());
170         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
171         if self.compiled.needs_dotstar() {
172             dotstar_patch = self.c_dotstar()?;
173             self.compiled.start = dotstar_patch.entry;
174         } else {
175             self.compiled.start = 0; // first instruction is always split
176         }
177         self.fill_to_next(dotstar_patch.hole);
178 
179         let mut prev_hole = Hole::None;
180         for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
181             self.fill_to_next(prev_hole);
182             let split = self.push_split_hole();
183             let Patch { hole, entry } = self.c_capture(0, expr)?;
184             self.fill_to_next(hole);
185             self.compiled.matches.push(self.insts.len());
186             self.push_compiled(Inst::Match(i));
187             prev_hole = self.fill_split(split, Some(entry), None);
188         }
189         let i = exprs.len() - 1;
190         let Patch { hole, entry } = self.c_capture(0, &exprs[i])?;
191         self.fill(prev_hole, entry);
192         self.fill_to_next(hole);
193         self.compiled.matches.push(self.insts.len());
194         self.push_compiled(Inst::Match(i));
195         self.compile_finish()
196     }
197 
compile_finish(mut self) -> result::Result<Program, Error>198     fn compile_finish(mut self) -> result::Result<Program, Error> {
199         self.compiled.insts =
200             self.insts.into_iter().map(|inst| inst.unwrap()).collect();
201         self.compiled.byte_classes = self.byte_classes.byte_classes();
202         self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
203         Ok(self.compiled)
204     }
205 
206     /// Compile expr into self.insts, returning a patch on success,
207     /// or an error if we run out of memory.
208     ///
209     /// All of the c_* methods of the compiler share the contract outlined
210     /// here.
211     ///
212     /// The main thing that a c_* method does is mutate `self.insts`
213     /// to add a list of mostly compiled instructions required to execute
214     /// the given expression. `self.insts` contains MaybeInsts rather than
215     /// Insts because there is some backpatching required.
216     ///
217     /// The `Patch` value returned by each c_* method provides metadata
218     /// about the compiled instructions emitted to `self.insts`. The
219     /// `entry` member of the patch refers to the first instruction
220     /// (the entry point), while the `hole` member contains zero or
221     /// more offsets to partial instructions that need to be backpatched.
222     /// The c_* routine can't know where its list of instructions are going to
223     /// jump to after execution, so it is up to the caller to patch
224     /// these jumps to point to the right place. So compiling some
225     /// expression, e, we would end up with a situation that looked like:
226     ///
227     /// ```text
228     /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
229     ///                     ^              ^             ^
230     ///                     |                \         /
231     ///                   entry                \     /
232     ///                                         hole
233     /// ```
234     ///
235     /// To compile two expressions, e1 and e2, concatinated together we
236     /// would do:
237     ///
238     /// ```ignore
239     /// let patch1 = self.c(e1);
240     /// let patch2 = self.c(e2);
241     /// ```
242     ///
243     /// while leaves us with a situation that looks like
244     ///
245     /// ```text
246     /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
247     ///                     ^        ^            ^        ^
248     ///                     |        |            |        |
249     ///                entry1        hole1   entry2        hole2
250     /// ```
251     ///
252     /// Then to merge the two patches together into one we would backpatch
253     /// hole1 with entry2 and return a new patch that enters at entry1
254     /// and has hole2 for a hole. In fact, if you look at the c_concat
255     /// method you will see that it does exactly this, though it handles
256     /// a list of expressions rather than just the two that we use for
257     /// an example.
c(&mut self, expr: &Hir) -> Result258     fn c(&mut self, expr: &Hir) -> Result {
259         use prog;
260         use syntax::hir::HirKind::*;
261 
262         self.check_size()?;
263         match *expr.kind() {
264             Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }),
265             Literal(hir::Literal::Unicode(c)) => {
266                 self.c_literal(&[c])
267             }
268             Literal(hir::Literal::Byte(b)) => {
269                 assert!(self.compiled.uses_bytes());
270                 self.c_bytes(&[b])
271             }
272             Class(hir::Class::Unicode(ref cls)) => {
273                 self.c_class(cls.ranges())
274             }
275             Class(hir::Class::Bytes(ref cls)) => {
276                 if self.compiled.uses_bytes() {
277                     self.c_class_bytes(cls.ranges())
278                 } else {
279                     assert!(cls.is_all_ascii());
280                     let mut char_ranges = vec![];
281                     for r in cls.iter() {
282                         let (s, e) = (r.start() as char, r.end() as char);
283                         char_ranges.push(hir::ClassUnicodeRange::new(s, e));
284                     }
285                     self.c_class(&char_ranges)
286                 }
287             }
288             Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
289                 self.byte_classes.set_range(b'\n', b'\n');
290                 self.c_empty_look(prog::EmptyLook::EndLine)
291             }
292             Anchor(hir::Anchor::StartLine) => {
293                 self.byte_classes.set_range(b'\n', b'\n');
294                 self.c_empty_look(prog::EmptyLook::StartLine)
295             }
296             Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
297                 self.byte_classes.set_range(b'\n', b'\n');
298                 self.c_empty_look(prog::EmptyLook::StartLine)
299             }
300             Anchor(hir::Anchor::EndLine) => {
301                 self.byte_classes.set_range(b'\n', b'\n');
302                 self.c_empty_look(prog::EmptyLook::EndLine)
303             }
304             Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
305                 self.c_empty_look(prog::EmptyLook::EndText)
306             }
307             Anchor(hir::Anchor::StartText) => {
308                 self.c_empty_look(prog::EmptyLook::StartText)
309             }
310             Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
311                 self.c_empty_look(prog::EmptyLook::StartText)
312             }
313             Anchor(hir::Anchor::EndText) => {
314                 self.c_empty_look(prog::EmptyLook::EndText)
315             }
316             WordBoundary(hir::WordBoundary::Unicode) => {
317                 self.compiled.has_unicode_word_boundary = true;
318                 self.byte_classes.set_word_boundary();
319                 self.c_empty_look(prog::EmptyLook::WordBoundary)
320             }
321             WordBoundary(hir::WordBoundary::UnicodeNegate) => {
322                 self.compiled.has_unicode_word_boundary = true;
323                 self.byte_classes.set_word_boundary();
324                 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
325             }
326             WordBoundary(hir::WordBoundary::Ascii) => {
327                 self.byte_classes.set_word_boundary();
328                 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
329             }
330             WordBoundary(hir::WordBoundary::AsciiNegate) => {
331                 self.byte_classes.set_word_boundary();
332                 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
333             }
334             Group(ref g) => {
335                 match g.kind {
336                     hir::GroupKind::NonCapturing => self.c(&g.hir),
337                     hir::GroupKind::CaptureIndex(index) => {
338                         if index as usize >= self.compiled.captures.len() {
339                             self.compiled.captures.push(None);
340                         }
341                         self.c_capture(2 * index as usize, &g.hir)
342                     }
343                     hir::GroupKind::CaptureName { index, ref name } => {
344                         if index as usize >= self.compiled.captures.len() {
345                             let n = name.to_string();
346                             self.compiled.captures.push(Some(n.clone()));
347                             self.capture_name_idx.insert(n, index as usize);
348                         }
349                         self.c_capture(2 * index as usize, &g.hir)
350                     }
351                 }
352             }
353             Concat(ref es) => {
354                 if self.compiled.is_reverse {
355                     self.c_concat(es.iter().rev())
356                 } else {
357                     self.c_concat(es)
358                 }
359             }
360             Alternation(ref es) => self.c_alternate(&**es),
361             Repetition(ref rep) => self.c_repeat(rep),
362         }
363     }
364 
c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result365     fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result {
366         if self.num_exprs > 1 || self.compiled.is_dfa {
367             // Don't ever compile Save instructions for regex sets because
368             // they are never used. They are also never used in DFA programs
369             // because DFAs can't handle captures.
370             self.c(expr)
371         } else {
372             let entry = self.insts.len();
373             let hole = self.push_hole(InstHole::Save { slot: first_slot });
374             let patch = self.c(expr)?;
375             self.fill(hole, patch.entry);
376             self.fill_to_next(patch.hole);
377             let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
378             Ok(Patch { hole: hole, entry: entry })
379         }
380     }
381 
c_dotstar(&mut self) -> Result382     fn c_dotstar(&mut self) -> Result {
383         Ok(if !self.compiled.only_utf8() {
384             self.c(&Hir::repetition(hir::Repetition {
385                 kind: hir::RepetitionKind::ZeroOrMore,
386                 greedy: false,
387                 hir: Box::new(Hir::any(true)),
388             }))?
389         } else {
390             self.c(&Hir::repetition(hir::Repetition {
391                 kind: hir::RepetitionKind::ZeroOrMore,
392                 greedy: false,
393                 hir: Box::new(Hir::any(false)),
394             }))?
395         })
396     }
397 
c_literal(&mut self, chars: &[char]) -> Result398     fn c_literal(&mut self, chars: &[char]) -> Result {
399         debug_assert!(!chars.is_empty());
400         let mut chars: Box<Iterator<Item=&char>> =
401             if self.compiled.is_reverse {
402                 Box::new(chars.iter().rev())
403             } else {
404                 Box::new(chars.iter())
405             };
406         let first = *chars.next().expect("non-empty literal");
407         let Patch { mut hole, entry } = self.c_char(first)?;
408         for &c in chars {
409             let p = self.c_char(c)?;
410             self.fill(hole, p.entry);
411             hole = p.hole;
412         }
413         Ok(Patch { hole: hole, entry: entry })
414     }
415 
c_char(&mut self, c: char) -> Result416     fn c_char(&mut self, c: char) -> Result {
417         self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
418     }
419 
c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result420     fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result {
421         assert!(!ranges.is_empty());
422         if self.compiled.uses_bytes() {
423             CompileClass {
424                 c: self,
425                 ranges: ranges,
426             }.compile()
427         } else {
428             let ranges: Vec<(char, char)> =
429                 ranges.iter().map(|r| (r.start(), r.end())).collect();
430             let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
431                 self.push_hole(InstHole::Char { c: ranges[0].0 })
432             } else {
433                 self.push_hole(InstHole::Ranges { ranges: ranges })
434             };
435             Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
436         }
437     }
438 
c_bytes(&mut self, bytes: &[u8]) -> Result439     fn c_bytes(&mut self, bytes: &[u8]) -> Result {
440         debug_assert!(!bytes.is_empty());
441         let mut bytes: Box<Iterator<Item=&u8>> =
442             if self.compiled.is_reverse {
443                 Box::new(bytes.iter().rev())
444             } else {
445                 Box::new(bytes.iter())
446             };
447         let first = *bytes.next().expect("non-empty literal");
448         let Patch { mut hole, entry } = self.c_byte(first)?;
449         for &b in bytes {
450             let p = self.c_byte(b)?;
451             self.fill(hole, p.entry);
452             hole = p.hole;
453         }
454         Ok(Patch { hole: hole, entry: entry })
455     }
456 
c_byte(&mut self, b: u8) -> Result457     fn c_byte(&mut self, b: u8) -> Result {
458         self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
459     }
460 
c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result461     fn c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result {
462         debug_assert!(!ranges.is_empty());
463 
464         let first_split_entry = self.insts.len();
465         let mut holes = vec![];
466         let mut prev_hole = Hole::None;
467         for r in &ranges[0..ranges.len() - 1] {
468             self.fill_to_next(prev_hole);
469             let split = self.push_split_hole();
470             let next = self.insts.len();
471             self.byte_classes.set_range(r.start(), r.end());
472             holes.push(self.push_hole(InstHole::Bytes {
473                 start: r.start(), end: r.end(),
474             }));
475             prev_hole = self.fill_split(split, Some(next), None);
476         }
477         let next = self.insts.len();
478         let r = &ranges[ranges.len() - 1];
479         self.byte_classes.set_range(r.start(), r.end());
480         holes.push(self.push_hole(InstHole::Bytes {
481             start: r.start(), end: r.end(),
482         }));
483         self.fill(prev_hole, next);
484         Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
485     }
486 
c_empty_look(&mut self, look: EmptyLook) -> Result487     fn c_empty_look(&mut self, look: EmptyLook) -> Result {
488         let hole = self.push_hole(InstHole::EmptyLook { look: look });
489         Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
490     }
491 
c_concat<'a, I>(&mut self, exprs: I) -> Result where I: IntoIterator<Item=&'a Hir>492     fn c_concat<'a, I>(&mut self, exprs: I) -> Result
493             where I: IntoIterator<Item=&'a Hir> {
494         let mut exprs = exprs.into_iter();
495         let first = match exprs.next() {
496             Some(expr) => expr,
497             None => {
498                 return Ok(Patch { hole: Hole::None, entry: self.insts.len() })
499             }
500         };
501         let Patch { mut hole, entry } = self.c(first)?;
502         for e in exprs {
503             let p = self.c(e)?;
504             self.fill(hole, p.entry);
505             hole = p.hole;
506         }
507         Ok(Patch { hole: hole, entry: entry })
508     }
509 
c_alternate(&mut self, exprs: &[Hir]) -> Result510     fn c_alternate(&mut self, exprs: &[Hir]) -> Result {
511         debug_assert!(
512             exprs.len() >= 2, "alternates must have at least 2 exprs");
513 
514         // Initial entry point is always the first split.
515         let first_split_entry = self.insts.len();
516 
517         // Save up all of the holes from each alternate. They will all get
518         // patched to point to the same location.
519         let mut holes = vec![];
520 
521         let mut prev_hole = Hole::None;
522         for e in &exprs[0..exprs.len() - 1] {
523             self.fill_to_next(prev_hole);
524             let split = self.push_split_hole();
525             let prev_entry = self.insts.len();
526             let Patch { hole, entry } = self.c(e)?;
527             if prev_entry == self.insts.len() {
528                 // TODO(burntsushi): It is kind of silly that we don't support
529                 // empty-subexpressions in alternates, but it is supremely
530                 // awkward to support them in the existing compiler
531                 // infrastructure. This entire compiler needs to be thrown out
532                 // anyway, so don't feel too bad.
533                 return Err(Error::Syntax(
534                     "alternations cannot currently contain \
535                      empty sub-expressions".to_string()));
536             }
537             holes.push(hole);
538             prev_hole = self.fill_split(split, Some(entry), None);
539         }
540         let prev_entry = self.insts.len();
541         let Patch { hole, entry } = self.c(&exprs[exprs.len() - 1])?;
542         if prev_entry == self.insts.len() {
543             // TODO(burntsushi): See TODO above.
544             return Err(Error::Syntax(
545                 "alternations cannot currently contain \
546                  empty sub-expressions".to_string()));
547         }
548         holes.push(hole);
549         self.fill(prev_hole, entry);
550         Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
551     }
552 
c_repeat(&mut self, rep: &hir::Repetition) -> Result553     fn c_repeat(&mut self, rep: &hir::Repetition) -> Result {
554         use syntax::hir::RepetitionKind::*;
555         match rep.kind {
556             ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
557             ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
558             OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
559             Range(hir::RepetitionRange::Exactly(min_max)) => {
560                 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
561             }
562             Range(hir::RepetitionRange::AtLeast(min)) => {
563                 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
564             }
565             Range(hir::RepetitionRange::Bounded(min, max)) => {
566                 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
567             }
568         }
569     }
570 
c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result571     fn c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result {
572         let split_entry = self.insts.len();
573         let split = self.push_split_hole();
574         let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
575 
576         let split_hole = if greedy {
577             self.fill_split(split, Some(entry_rep), None)
578         } else {
579             self.fill_split(split, None, Some(entry_rep))
580         };
581         let holes = vec![hole_rep, split_hole];
582         Ok(Patch { hole: Hole::Many(holes), entry: split_entry })
583     }
584 
c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result585     fn c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
586         let split_entry = self.insts.len();
587         let split = self.push_split_hole();
588         let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
589 
590         self.fill(hole_rep, split_entry);
591         let split_hole = if greedy {
592             self.fill_split(split, Some(entry_rep), None)
593         } else {
594             self.fill_split(split, None, Some(entry_rep))
595         };
596         Ok(Patch { hole: split_hole, entry: split_entry })
597     }
598 
c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result599     fn c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
600         let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
601         self.fill_to_next(hole_rep);
602         let split = self.push_split_hole();
603 
604         let split_hole = if greedy {
605             self.fill_split(split, Some(entry_rep), None)
606         } else {
607             self.fill_split(split, None, Some(entry_rep))
608         };
609         Ok(Patch { hole: split_hole, entry: entry_rep })
610     }
611 
c_repeat_range_min_or_more( &mut self, expr: &Hir, greedy: bool, min: u32, ) -> Result612     fn c_repeat_range_min_or_more(
613         &mut self,
614         expr: &Hir,
615         greedy: bool,
616         min: u32,
617     ) -> Result {
618         let min = u32_to_usize(min);
619         let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
620         let patch_rep = self.c_repeat_zero_or_more(expr, greedy)?;
621         self.fill(patch_concat.hole, patch_rep.entry);
622         Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry })
623     }
624 
c_repeat_range( &mut self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> Result625     fn c_repeat_range(
626         &mut self,
627         expr: &Hir,
628         greedy: bool,
629         min: u32,
630         max: u32,
631     ) -> Result {
632         let (min, max) = (u32_to_usize(min), u32_to_usize(max));
633         let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
634         let initial_entry = patch_concat.entry;
635         if min == max {
636             return Ok(patch_concat);
637         }
638         // It is much simpler to compile, e.g., `a{2,5}` as:
639         //
640         //     aaa?a?a?
641         //
642         // But you end up with a sequence of instructions like this:
643         //
644         //     0: 'a'
645         //     1: 'a',
646         //     2: split(3, 4)
647         //     3: 'a'
648         //     4: split(5, 6)
649         //     5: 'a'
650         //     6: split(7, 8)
651         //     7: 'a'
652         //     8: MATCH
653         //
654         // This is *incredibly* inefficient because the splits end
655         // up forming a chain, which has to be resolved everything a
656         // transition is followed.
657         let mut holes = vec![];
658         let mut prev_hole = patch_concat.hole;
659         for _ in min..max {
660             self.fill_to_next(prev_hole);
661             let split = self.push_split_hole();
662             let Patch { hole, entry } = self.c(expr)?;
663             prev_hole = hole;
664             if greedy {
665                 holes.push(self.fill_split(split, Some(entry), None));
666             } else {
667                 holes.push(self.fill_split(split, None, Some(entry)));
668             }
669         }
670         holes.push(prev_hole);
671         Ok(Patch { hole: Hole::Many(holes), entry: initial_entry })
672     }
673 
fill(&mut self, hole: Hole, goto: InstPtr)674     fn fill(&mut self, hole: Hole, goto: InstPtr) {
675         match hole {
676             Hole::None => {}
677             Hole::One(pc) => {
678                 self.insts[pc].fill(goto);
679             }
680             Hole::Many(holes) => {
681                 for hole in holes {
682                     self.fill(hole, goto);
683                 }
684             }
685         }
686     }
687 
fill_to_next(&mut self, hole: Hole)688     fn fill_to_next(&mut self, hole: Hole) {
689         let next = self.insts.len();
690         self.fill(hole, next);
691     }
692 
fill_split( &mut self, hole: Hole, goto1: Option<InstPtr>, goto2: Option<InstPtr>, ) -> Hole693     fn fill_split(
694         &mut self,
695         hole: Hole,
696         goto1: Option<InstPtr>,
697         goto2: Option<InstPtr>,
698     ) -> Hole {
699         match hole {
700             Hole::None => Hole::None,
701             Hole::One(pc) => {
702                 match (goto1, goto2) {
703                     (Some(goto1), Some(goto2)) => {
704                         self.insts[pc].fill_split(goto1, goto2);
705                         Hole::None
706                     }
707                     (Some(goto1), None) => {
708                         self.insts[pc].half_fill_split_goto1(goto1);
709                         Hole::One(pc)
710                     }
711                     (None, Some(goto2)) => {
712                         self.insts[pc].half_fill_split_goto2(goto2);
713                         Hole::One(pc)
714                     }
715                     (None, None) => unreachable!("at least one of the split \
716                                                   holes must be filled"),
717                 }
718             }
719             Hole::Many(holes) => {
720                 let mut new_holes = vec![];
721                 for hole in holes {
722                     new_holes.push(self.fill_split(hole, goto1, goto2));
723                 }
724                 if new_holes.is_empty() {
725                     Hole::None
726                 } else if new_holes.len() == 1 {
727                     new_holes.pop().unwrap()
728                 } else {
729                     Hole::Many(new_holes)
730                 }
731             }
732         }
733     }
734 
push_compiled(&mut self, inst: Inst)735     fn push_compiled(&mut self, inst: Inst) {
736         self.insts.push(MaybeInst::Compiled(inst));
737     }
738 
push_hole(&mut self, inst: InstHole) -> Hole739     fn push_hole(&mut self, inst: InstHole) -> Hole {
740         let hole = self.insts.len();
741         self.insts.push(MaybeInst::Uncompiled(inst));
742         Hole::One(hole)
743     }
744 
push_split_hole(&mut self) -> Hole745     fn push_split_hole(&mut self) -> Hole {
746         let hole = self.insts.len();
747         self.insts.push(MaybeInst::Split);
748         Hole::One(hole)
749     }
750 
check_size(&self) -> result::Result<(), Error>751     fn check_size(&self) -> result::Result<(), Error> {
752         use std::mem::size_of;
753 
754         if self.insts.len() * size_of::<Inst>() > self.size_limit {
755             Err(Error::CompiledTooBig(self.size_limit))
756         } else {
757             Ok(())
758         }
759     }
760 }
761 
762 #[derive(Debug)]
763 enum Hole {
764     None,
765     One(InstPtr),
766     Many(Vec<Hole>),
767 }
768 
769 #[derive(Clone, Debug)]
770 enum MaybeInst {
771     Compiled(Inst),
772     Uncompiled(InstHole),
773     Split,
774     Split1(InstPtr),
775     Split2(InstPtr),
776 }
777 
778 impl MaybeInst {
fill(&mut self, goto: InstPtr)779     fn fill(&mut self, goto: InstPtr) {
780         let filled = match *self {
781             MaybeInst::Uncompiled(ref inst) => inst.fill(goto),
782             MaybeInst::Split1(goto1) => {
783                 Inst::Split(InstSplit { goto1: goto1, goto2: goto })
784             }
785             MaybeInst::Split2(goto2) => {
786                 Inst::Split(InstSplit { goto1: goto, goto2: goto2 })
787             }
788             _ => unreachable!("not all instructions were compiled! \
789                                found uncompiled instruction: {:?}", self),
790         };
791         *self = MaybeInst::Compiled(filled);
792     }
793 
fill_split(&mut self, goto1: InstPtr, goto2: InstPtr)794     fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
795         let filled = match *self {
796             MaybeInst::Split => {
797                 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
798             }
799             _ => unreachable!("must be called on Split instruction, \
800                                instead it was called on: {:?}", self),
801         };
802         *self = MaybeInst::Compiled(filled);
803     }
804 
half_fill_split_goto1(&mut self, goto1: InstPtr)805     fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
806         let half_filled = match *self {
807             MaybeInst::Split => goto1,
808             _ => unreachable!("must be called on Split instruction, \
809                                instead it was called on: {:?}", self),
810         };
811         *self = MaybeInst::Split1(half_filled);
812     }
813 
half_fill_split_goto2(&mut self, goto2: InstPtr)814     fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
815         let half_filled = match *self {
816             MaybeInst::Split => goto2,
817             _ => unreachable!("must be called on Split instruction, \
818                                instead it was called on: {:?}", self),
819         };
820         *self = MaybeInst::Split2(half_filled);
821     }
822 
unwrap(self) -> Inst823     fn unwrap(self) -> Inst {
824         match self {
825             MaybeInst::Compiled(inst) => inst,
826             _ => unreachable!("must be called on a compiled instruction, \
827                                instead it was called on: {:?}", self),
828         }
829     }
830 }
831 
832 #[derive(Clone, Debug)]
833 enum InstHole {
834     Save { slot: usize },
835     EmptyLook { look: EmptyLook },
836     Char { c: char },
837     Ranges { ranges: Vec<(char, char)> },
838     Bytes { start: u8, end: u8 },
839 }
840 
841 impl InstHole {
fill(&self, goto: InstPtr) -> Inst842     fn fill(&self, goto: InstPtr) -> Inst {
843         match *self {
844             InstHole::Save { slot } => Inst::Save(InstSave {
845                 goto: goto,
846                 slot: slot,
847             }),
848             InstHole::EmptyLook { look } => Inst::EmptyLook(InstEmptyLook {
849                 goto: goto,
850                 look: look,
851             }),
852             InstHole::Char { c } => Inst::Char(InstChar {
853                 goto: goto,
854                 c: c,
855             }),
856             InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
857                 goto: goto,
858                 ranges: ranges.clone(),
859             }),
860             InstHole::Bytes { start, end } => Inst::Bytes(InstBytes {
861                 goto: goto,
862                 start: start,
863                 end: end,
864             }),
865         }
866     }
867 }
868 
869 struct CompileClass<'a, 'b> {
870     c: &'a mut Compiler,
871     ranges: &'b [hir::ClassUnicodeRange],
872 }
873 
874 impl<'a, 'b> CompileClass<'a, 'b> {
compile(mut self) -> Result875     fn compile(mut self) -> Result {
876         let mut holes = vec![];
877         let mut initial_entry = None;
878         let mut last_split = Hole::None;
879         let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
880         self.c.suffix_cache.clear();
881 
882         for (i, range) in self.ranges.iter().enumerate() {
883             let is_last_range = i + 1 == self.ranges.len();
884             utf8_seqs.reset(range.start(), range.end());
885             let mut it = (&mut utf8_seqs).peekable();
886             loop {
887                 let utf8_seq = match it.next() {
888                     None => break,
889                     Some(utf8_seq) => utf8_seq,
890                 };
891                 if is_last_range && it.peek().is_none() {
892                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
893                     holes.push(hole);
894                     self.c.fill(last_split, entry);
895                     last_split = Hole::None;
896                     if initial_entry.is_none() {
897                         initial_entry = Some(entry);
898                     }
899                 } else {
900                     if initial_entry.is_none() {
901                         initial_entry = Some(self.c.insts.len());
902                     }
903                     self.c.fill_to_next(last_split);
904                     last_split = self.c.push_split_hole();
905                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
906                     holes.push(hole);
907                     last_split = self.c.fill_split(last_split, Some(entry), None);
908                 }
909             }
910         }
911         self.c.utf8_seqs = Some(utf8_seqs);
912         Ok(Patch {
913             hole: Hole::Many(holes),
914             entry: initial_entry.unwrap(),
915         })
916     }
917 
c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result918     fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
919         if self.c.compiled.is_reverse {
920             self.c_utf8_seq_(seq)
921         } else {
922             self.c_utf8_seq_(seq.into_iter().rev())
923         }
924     }
925 
c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result where I: IntoIterator<Item=&'r Utf8Range>926     fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
927             where I: IntoIterator<Item=&'r Utf8Range> {
928         // The initial instruction for each UTF-8 sequence should be the same.
929         let mut from_inst = ::std::usize::MAX;
930         let mut last_hole = Hole::None;
931         for byte_range in seq {
932             let key = SuffixCacheKey {
933                 from_inst: from_inst,
934                 start: byte_range.start,
935                 end: byte_range.end,
936             };
937             {
938                 let pc = self.c.insts.len();
939                 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
940                     from_inst = cached_pc;
941                     continue;
942                 }
943             }
944             self.c.byte_classes.set_range(byte_range.start, byte_range.end);
945             if from_inst == ::std::usize::MAX {
946                 last_hole = self.c.push_hole(InstHole::Bytes {
947                     start: byte_range.start,
948                     end: byte_range.end,
949                 });
950             } else {
951                 self.c.push_compiled(Inst::Bytes(InstBytes {
952                     goto: from_inst,
953                     start: byte_range.start,
954                     end: byte_range.end,
955                 }));
956             }
957             from_inst = self.c.insts.len().checked_sub(1).unwrap();
958             debug_assert!(from_inst < ::std::usize::MAX);
959         }
960         debug_assert!(from_inst < ::std::usize::MAX);
961         Ok(Patch { hole: last_hole, entry: from_inst })
962     }
963 }
964 
965 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
966 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
967 /// The set of byte ranges looks like this:
968 ///
969 /// [0-7F]
970 /// [C2-DF][80-BF]
971 /// [E0][A0-BF][80-BF]
972 /// [E1-EC][80-BF][80-BF]
973 /// [ED][80-9F][80-BF]
974 /// [EE-EF][80-BF][80-BF]
975 ///
976 /// Each line above translates to one alternate in the compiled regex program.
977 /// However, all but one of the alternates end in the same suffix, which is
978 /// a waste of an instruction. The suffix cache facilitates reusing them across
979 /// alternates.
980 ///
981 /// Note that a HashMap could be trivially used for this, but we don't need its
982 /// overhead. Some small bounded space (LRU style) is more than enough.
983 struct SuffixCache {
984     table: Vec<SuffixCacheEntry>,
985     // Every time the cache is cleared, we increment the version number instead
986     // of actually zeroing memory. Since we store a copy of the current version
987     // in every element, all we need to do is make sure to invalidate any stale
988     // entries upon access. This saves quite a bit of time!
989     version: usize,
990 }
991 
992 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
993 struct SuffixCacheEntry {
994     key: SuffixCacheKey,
995     pc: InstPtr,
996     version: usize,
997 }
998 
999 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1000 struct SuffixCacheKey {
1001     from_inst: InstPtr,
1002     start: u8,
1003     end: u8,
1004 }
1005 
1006 impl SuffixCache {
new(size: usize) -> Self1007     fn new(size: usize) -> Self {
1008         SuffixCache {
1009             table: vec![SuffixCacheEntry::default(); size],
1010             version: 0,
1011         }
1012     }
1013 
get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr>1014     fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1015         let h = self.hash(&key);
1016         let e = self.table[h];
1017         if e.key == key && e.version == self.version {
1018             Some(e.pc)
1019         } else {
1020             self.table[h] = SuffixCacheEntry {
1021                 key: key,
1022                 pc: pc,
1023                 version: self.version,
1024             };
1025             None
1026         }
1027     }
1028 
clear(&mut self)1029     fn clear(&mut self) {
1030         self.version += 1;
1031     }
1032 
hash(&self, suffix: &SuffixCacheKey) -> usize1033     fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1034         // Basic FNV-1a hash as described:
1035         // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1036         const FNV_PRIME: u64 = 1099511628211;
1037         let mut h = 14695981039346656037;
1038         h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1039         h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1040         h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1041         (h as usize) % self.table.len()
1042     }
1043 }
1044 
1045 struct ByteClassSet([bool; 256]);
1046 
1047 impl ByteClassSet {
new() -> Self1048     fn new() -> Self {
1049         ByteClassSet([false; 256])
1050     }
1051 
set_range(&mut self, start: u8, end: u8)1052     fn set_range(&mut self, start: u8, end: u8) {
1053         debug_assert!(start <= end);
1054         if start > 0 {
1055             self.0[start as usize - 1] = true;
1056         }
1057         self.0[end as usize] = true;
1058     }
1059 
set_word_boundary(&mut self)1060     fn set_word_boundary(&mut self) {
1061         // We need to mark all ranges of bytes whose pairs result in
1062         // evaluating \b differently.
1063         let iswb = is_word_byte;
1064         let mut b1: u16 = 0;
1065         let mut b2: u16;
1066         while b1 <= 255 {
1067             b2 = b1 + 1;
1068             while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1069                 b2 += 1;
1070             }
1071             self.set_range(b1 as u8, (b2 - 1) as u8);
1072             b1 = b2;
1073         }
1074     }
1075 
byte_classes(&self) -> Vec<u8>1076     fn byte_classes(&self) -> Vec<u8> {
1077         // N.B. If you're debugging the DFA, it's useful to simply return
1078         // `(0..256).collect()`, which effectively removes the byte classes
1079         // and makes the transitions easier to read.
1080         // (0usize..256).map(|x| x as u8).collect()
1081         let mut byte_classes = vec![0; 256];
1082         let mut class = 0u8;
1083         let mut i = 0;
1084         loop {
1085             byte_classes[i] = class as u8;
1086             if i >= 255 {
1087                 break;
1088             }
1089             if self.0[i] {
1090                 class = class.checked_add(1).unwrap();
1091             }
1092             i += 1;
1093         }
1094         byte_classes
1095     }
1096 }
1097 
u32_to_usize(n: u32) -> usize1098 fn u32_to_usize(n: u32) -> usize {
1099     // In case usize is less than 32 bits, we need to guard against overflow.
1100     // On most platforms this compiles to nothing.
1101     // TODO Use `std::convert::TryFrom` once it's stable.
1102     if (n as u64) > (::std::usize::MAX as u64) {
1103         panic!("BUG: {} is too big to be pointer sized", n)
1104     }
1105     n as usize
1106 }
1107 
1108 #[cfg(test)]
1109 mod tests {
1110     use super::ByteClassSet;
1111 
1112     #[test]
byte_classes()1113     fn byte_classes() {
1114         let mut set = ByteClassSet::new();
1115         set.set_range(b'a', b'z');
1116         let classes = set.byte_classes();
1117         assert_eq!(classes[0], 0);
1118         assert_eq!(classes[1], 0);
1119         assert_eq!(classes[2], 0);
1120         assert_eq!(classes[b'a' as usize - 1], 0);
1121         assert_eq!(classes[b'a' as usize], 1);
1122         assert_eq!(classes[b'm' as usize], 1);
1123         assert_eq!(classes[b'z' as usize], 1);
1124         assert_eq!(classes[b'z' as usize + 1], 2);
1125         assert_eq!(classes[254], 2);
1126         assert_eq!(classes[255], 2);
1127 
1128         let mut set = ByteClassSet::new();
1129         set.set_range(0, 2);
1130         set.set_range(4, 6);
1131         let classes = set.byte_classes();
1132         assert_eq!(classes[0], 0);
1133         assert_eq!(classes[1], 0);
1134         assert_eq!(classes[2], 0);
1135         assert_eq!(classes[3], 1);
1136         assert_eq!(classes[4], 2);
1137         assert_eq!(classes[5], 2);
1138         assert_eq!(classes[6], 2);
1139         assert_eq!(classes[7], 3);
1140         assert_eq!(classes[255], 3);
1141     }
1142 
1143     #[test]
full_byte_classes()1144     fn full_byte_classes() {
1145         let mut set = ByteClassSet::new();
1146         for i in 0..256u16 {
1147             set.set_range(i as u8, i as u8);
1148         }
1149         assert_eq!(set.byte_classes().len(), 256);
1150     }
1151 }
1152