1 use std::collections::HashMap;
2 use std::fmt;
3 use std::iter;
4 use std::result;
5 use std::sync::Arc;
6
7 use syntax::hir::{self, Hir};
8 use syntax::is_word_byte;
9 use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10
11 use prog::{
12 EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13 InstSave, InstSplit, Program,
14 };
15
16 use Error;
17
/// Shorthand for a compilation step that produces a `Patch` on success.
type Result = result::Result<Patch, Error>;
/// Like `Result`, but `Ok(None)` means the expression compiled to no
/// instructions at all, so there is no patch to report.
type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20
/// The result of compiling a (sub)expression into `Compiler::insts`.
#[derive(Debug)]
struct Patch {
    // Unfilled jump(s) out of the emitted instructions; the caller must
    // backpatch these to point at whatever comes next.
    hole: Hole,
    // Index of the first emitted instruction (the entry point).
    entry: InstPtr,
}
26
/// A compiler translates a regular expression AST to a sequence of
/// instructions. The sequence of instructions represents an NFA.
// `Compiler` is only public via the `internal` module, so avoid deriving
// `Debug`.
#[allow(missing_debug_implementations)]
pub struct Compiler {
    // Instructions under construction; holes are backpatched before the
    // final `Program` is assembled in `compile_finish`.
    insts: Vec<MaybeInst>,
    // The program being built, including its option flags.
    compiled: Program,
    // Maps capture group names to their capture index.
    capture_name_idx: HashMap<String, usize>,
    // Number of expressions being compiled (> 1 for regex sets).
    num_exprs: usize,
    // Approximate limit, in bytes, on the size of the compiled program.
    size_limit: usize,
    // Caches suffixes of compiled UTF-8 sequences so instructions can be
    // shared between them.
    suffix_cache: SuffixCache,
    // Reusable scratch space for generating UTF-8 byte sequences; taken
    // out (None) while a class is being compiled.
    utf8_seqs: Option<Utf8Sequences>,
    // Accumulates byte ranges seen, from which byte equivalence classes
    // are derived.
    byte_classes: ByteClassSet,
    // Extra bytes attributed to instructions with heap data (e.g.
    // `Ranges`), counted toward `size_limit` in `check_size`.
    extra_inst_bytes: usize,
}
43
44 impl Compiler {
    /// Create a new regular expression compiler.
    ///
    /// Various options can be set before calling `compile` on an expression.
    pub fn new() -> Self {
        Compiler {
            insts: vec![],
            compiled: Program::new(),
            capture_name_idx: HashMap::new(),
            num_exprs: 0,
            // Default size limit: 10 MB.
            size_limit: 10 * (1 << 20),
            suffix_cache: SuffixCache::new(1000),
            // Scratch space only; the range is reset before each use
            // (see CompileClass::compile).
            utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
            byte_classes: ByteClassSet::new(),
            extra_inst_bytes: 0,
        }
    }
61
62 /// The size of the resulting program is limited by size_limit. If
63 /// the program approximately exceeds the given size (in bytes), then
64 /// compilation will stop and return an error.
size_limit(mut self, size_limit: usize) -> Self65 pub fn size_limit(mut self, size_limit: usize) -> Self {
66 self.size_limit = size_limit;
67 self
68 }
69
    /// If bytes is true, then the program is compiled as a byte based
    /// automaton, which incorporates UTF-8 decoding into the machine. If it's
    /// false, then the automaton is Unicode scalar value based, e.g., an
    /// engine utilizing such an automaton is responsible for UTF-8 decoding.
    ///
    /// The specific invariant is that when returning a byte based machine,
    /// neither the `Char` nor `Ranges` instructions are produced.
    /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
    /// instruction is never produced.
    ///
    /// Note that `dfa(true)` implies `bytes(true)`.
    pub fn bytes(mut self, yes: bool) -> Self {
        self.compiled.is_bytes = yes;
        self
    }
85
86 /// When disabled, the program compiled may match arbitrary bytes.
87 ///
88 /// When enabled (the default), all compiled programs exclusively match
89 /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self90 pub fn only_utf8(mut self, yes: bool) -> Self {
91 self.compiled.only_utf8 = yes;
92 self
93 }
94
95 /// When set, the machine returned is suitable for use in the DFA matching
96 /// engine.
97 ///
98 /// In particular, this ensures that if the regex is not anchored in the
99 /// beginning, then a preceding `.*?` is included in the program. (The NFA
100 /// based engines handle the preceding `.*?` explicitly, which is difficult
101 /// or impossible in the DFA engine.)
dfa(mut self, yes: bool) -> Self102 pub fn dfa(mut self, yes: bool) -> Self {
103 self.compiled.is_dfa = yes;
104 self
105 }
106
107 /// When set, the machine returned is suitable for matching text in
108 /// reverse. In particular, all concatenations are flipped.
reverse(mut self, yes: bool) -> Self109 pub fn reverse(mut self, yes: bool) -> Self {
110 self.compiled.is_reverse = yes;
111 self
112 }
113
    /// Compile a regular expression given its AST.
    ///
    /// The compiler is guaranteed to succeed unless the program exceeds the
    /// specified size limit. If the size limit is exceeded, then compilation
    /// stops and returns an error.
    pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
        debug_assert!(!exprs.is_empty());
        self.num_exprs = exprs.len();
        if exprs.len() == 1 {
            // A single expression compiles to an ordinary regex program.
            self.compile_one(&exprs[0])
        } else {
            // Multiple expressions form a regex set.
            self.compile_many(exprs)
        }
    }
128
compile_one(mut self, expr: &Hir) -> result::Result<Program, Error>129 fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
130 // If we're compiling a forward DFA and we aren't anchored, then
131 // add a `.*?` before the first capture group.
132 // Other matching engines handle this by baking the logic into the
133 // matching engine itself.
134 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
135 self.compiled.is_anchored_start = expr.is_anchored_start();
136 self.compiled.is_anchored_end = expr.is_anchored_end();
137 if self.compiled.needs_dotstar() {
138 dotstar_patch = self.c_dotstar()?;
139 self.compiled.start = dotstar_patch.entry;
140 }
141 self.compiled.captures = vec![None];
142 let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
143 if self.compiled.needs_dotstar() {
144 self.fill(dotstar_patch.hole, patch.entry);
145 } else {
146 self.compiled.start = patch.entry;
147 }
148 self.fill_to_next(patch.hole);
149 self.compiled.matches = vec![self.insts.len()];
150 self.push_compiled(Inst::Match(0));
151 self.compile_finish()
152 }
153
compile_many( mut self, exprs: &[Hir], ) -> result::Result<Program, Error>154 fn compile_many(
155 mut self,
156 exprs: &[Hir],
157 ) -> result::Result<Program, Error> {
158 debug_assert!(exprs.len() > 1);
159
160 self.compiled.is_anchored_start =
161 exprs.iter().all(|e| e.is_anchored_start());
162 self.compiled.is_anchored_end =
163 exprs.iter().all(|e| e.is_anchored_end());
164 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
165 if self.compiled.needs_dotstar() {
166 dotstar_patch = self.c_dotstar()?;
167 self.compiled.start = dotstar_patch.entry;
168 } else {
169 self.compiled.start = 0; // first instruction is always split
170 }
171 self.fill_to_next(dotstar_patch.hole);
172
173 let mut prev_hole = Hole::None;
174 for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
175 self.fill_to_next(prev_hole);
176 let split = self.push_split_hole();
177 let Patch { hole, entry } =
178 self.c_capture(0, expr)?.unwrap_or(self.next_inst());
179 self.fill_to_next(hole);
180 self.compiled.matches.push(self.insts.len());
181 self.push_compiled(Inst::Match(i));
182 prev_hole = self.fill_split(split, Some(entry), None);
183 }
184 let i = exprs.len() - 1;
185 let Patch { hole, entry } =
186 self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
187 self.fill(prev_hole, entry);
188 self.fill_to_next(hole);
189 self.compiled.matches.push(self.insts.len());
190 self.push_compiled(Inst::Match(i));
191 self.compile_finish()
192 }
193
compile_finish(mut self) -> result::Result<Program, Error>194 fn compile_finish(mut self) -> result::Result<Program, Error> {
195 self.compiled.insts =
196 self.insts.into_iter().map(|inst| inst.unwrap()).collect();
197 self.compiled.byte_classes = self.byte_classes.byte_classes();
198 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
199 Ok(self.compiled)
200 }
201
    /// Compile expr into self.insts, returning a patch on success,
    /// or an error if we run out of memory.
    ///
    /// All of the c_* methods of the compiler share the contract outlined
    /// here.
    ///
    /// The main thing that a c_* method does is mutate `self.insts`
    /// to add a list of mostly compiled instructions required to execute
    /// the given expression. `self.insts` contains MaybeInsts rather than
    /// Insts because there is some backpatching required.
    ///
    /// The `Patch` value returned by each c_* method provides metadata
    /// about the compiled instructions emitted to `self.insts`. The
    /// `entry` member of the patch refers to the first instruction
    /// (the entry point), while the `hole` member contains zero or
    /// more offsets to partial instructions that need to be backpatched.
    /// The c_* routine can't know where its list of instructions are going to
    /// jump to after execution, so it is up to the caller to patch
    /// these jumps to point to the right place. So compiling some
    /// expression, e, we would end up with a situation that looked like:
    ///
    /// ```text
    /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
    ///                     ^              ^            ^
    ///                     |               \          /
    ///                     entry            \        /
    ///                                         hole
    /// ```
    ///
    /// To compile two expressions, e1 and e2, concatenated together we
    /// would do:
    ///
    /// ```ignore
    /// let patch1 = self.c(e1);
    /// let patch2 = self.c(e2);
    /// ```
    ///
    /// which leaves us with a situation that looks like
    ///
    /// ```text
    /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
    ///                     ^        ^            ^        ^
    ///                     |        |            |        |
    ///                     entry1   hole1        entry2   hole2
    /// ```
    ///
    /// Then to merge the two patches together into one we would backpatch
    /// hole1 with entry2 and return a new patch that enters at entry1
    /// and has hole2 for a hole. In fact, if you look at the c_concat
    /// method you will see that it does exactly this, though it handles
    /// a list of expressions rather than just the two that we use for
    /// an example.
    ///
    /// Ok(None) is returned when an expression is compiled to no
    /// instruction, and so no patch.entry value makes sense.
    fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
        use prog;
        use syntax::hir::HirKind::*;

        // Abort early if the program has already grown past the size limit.
        self.check_size()?;
        match *expr.kind() {
            Empty => Ok(None),
            Literal(hir::Literal::Unicode(c)) => self.c_char(c),
            Literal(hir::Literal::Byte(b)) => {
                assert!(self.compiled.uses_bytes());
                self.c_byte(b)
            }
            Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
            Class(hir::Class::Bytes(ref cls)) => {
                if self.compiled.uses_bytes() {
                    self.c_class_bytes(cls.ranges())
                } else {
                    // A scalar-value program can only handle a byte class
                    // if it is entirely ASCII; lift it to a char class.
                    assert!(cls.is_all_ascii());
                    let mut char_ranges = vec![];
                    for r in cls.iter() {
                        let (s, e) = (r.start() as char, r.end() as char);
                        char_ranges.push(hir::ClassUnicodeRange::new(s, e));
                    }
                    self.c_class(&char_ranges)
                }
            }
            // In reverse programs, start/end anchors trade places.
            Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::EndLine)
            }
            Anchor(hir::Anchor::StartLine) => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::StartLine)
            }
            Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::StartLine)
            }
            Anchor(hir::Anchor::EndLine) => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::EndLine)
            }
            Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
                self.c_empty_look(prog::EmptyLook::EndText)
            }
            Anchor(hir::Anchor::StartText) => {
                self.c_empty_look(prog::EmptyLook::StartText)
            }
            Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
                self.c_empty_look(prog::EmptyLook::StartText)
            }
            Anchor(hir::Anchor::EndText) => {
                self.c_empty_look(prog::EmptyLook::EndText)
            }
            WordBoundary(hir::WordBoundary::Unicode) => {
                if !cfg!(feature = "unicode-perl") {
                    return Err(Error::Syntax(
                        "Unicode word boundaries are unavailable when \
                         the unicode-perl feature is disabled"
                            .to_string(),
                    ));
                }
                self.compiled.has_unicode_word_boundary = true;
                self.byte_classes.set_word_boundary();
                self.c_empty_look(prog::EmptyLook::WordBoundary)
            }
            WordBoundary(hir::WordBoundary::UnicodeNegate) => {
                if !cfg!(feature = "unicode-perl") {
                    return Err(Error::Syntax(
                        "Unicode word boundaries are unavailable when \
                         the unicode-perl feature is disabled"
                            .to_string(),
                    ));
                }
                self.compiled.has_unicode_word_boundary = true;
                self.byte_classes.set_word_boundary();
                self.c_empty_look(prog::EmptyLook::NotWordBoundary)
            }
            WordBoundary(hir::WordBoundary::Ascii) => {
                self.byte_classes.set_word_boundary();
                self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
            }
            WordBoundary(hir::WordBoundary::AsciiNegate) => {
                self.byte_classes.set_word_boundary();
                self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
            }
            Group(ref g) => match g.kind {
                // Non-capturing groups are transparent to the compiler.
                hir::GroupKind::NonCapturing => self.c(&g.hir),
                hir::GroupKind::CaptureIndex(index) => {
                    if index as usize >= self.compiled.captures.len() {
                        self.compiled.captures.push(None);
                    }
                    // Each capture occupies two slots (open/close).
                    self.c_capture(2 * index as usize, &g.hir)
                }
                hir::GroupKind::CaptureName { index, ref name } => {
                    if index as usize >= self.compiled.captures.len() {
                        let n = name.to_string();
                        self.compiled.captures.push(Some(n.clone()));
                        self.capture_name_idx.insert(n, index as usize);
                    }
                    self.c_capture(2 * index as usize, &g.hir)
                }
            },
            Concat(ref es) => {
                // Reverse programs compile concatenations back-to-front.
                if self.compiled.is_reverse {
                    self.c_concat(es.iter().rev())
                } else {
                    self.c_concat(es)
                }
            }
            Alternation(ref es) => self.c_alternate(&**es),
            Repetition(ref rep) => self.c_repeat(rep),
        }
    }
371
c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty372 fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
373 if self.num_exprs > 1 || self.compiled.is_dfa {
374 // Don't ever compile Save instructions for regex sets because
375 // they are never used. They are also never used in DFA programs
376 // because DFAs can't handle captures.
377 self.c(expr)
378 } else {
379 let entry = self.insts.len();
380 let hole = self.push_hole(InstHole::Save { slot: first_slot });
381 let patch = self.c(expr)?.unwrap_or(self.next_inst());
382 self.fill(hole, patch.entry);
383 self.fill_to_next(patch.hole);
384 let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
385 Ok(Some(Patch { hole: hole, entry: entry }))
386 }
387 }
388
c_dotstar(&mut self) -> Result389 fn c_dotstar(&mut self) -> Result {
390 Ok(if !self.compiled.only_utf8() {
391 self.c(&Hir::repetition(hir::Repetition {
392 kind: hir::RepetitionKind::ZeroOrMore,
393 greedy: false,
394 hir: Box::new(Hir::any(true)),
395 }))?
396 .unwrap()
397 } else {
398 self.c(&Hir::repetition(hir::Repetition {
399 kind: hir::RepetitionKind::ZeroOrMore,
400 greedy: false,
401 hir: Box::new(Hir::any(false)),
402 }))?
403 .unwrap()
404 })
405 }
406
c_char(&mut self, c: char) -> ResultOrEmpty407 fn c_char(&mut self, c: char) -> ResultOrEmpty {
408 if self.compiled.uses_bytes() {
409 if c.is_ascii() {
410 let b = c as u8;
411 let hole =
412 self.push_hole(InstHole::Bytes { start: b, end: b });
413 self.byte_classes.set_range(b, b);
414 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
415 } else {
416 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
417 }
418 } else {
419 let hole = self.push_hole(InstHole::Char { c: c });
420 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
421 }
422 }
423
c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty424 fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
425 use std::mem::size_of;
426
427 assert!(!ranges.is_empty());
428 if self.compiled.uses_bytes() {
429 Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
430 } else {
431 let ranges: Vec<(char, char)> =
432 ranges.iter().map(|r| (r.start(), r.end())).collect();
433 let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
434 self.push_hole(InstHole::Char { c: ranges[0].0 })
435 } else {
436 self.extra_inst_bytes +=
437 ranges.len() * (size_of::<char>() * 2);
438 self.push_hole(InstHole::Ranges { ranges: ranges })
439 };
440 Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
441 }
442 }
443
c_byte(&mut self, b: u8) -> ResultOrEmpty444 fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
445 self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
446 }
447
c_class_bytes( &mut self, ranges: &[hir::ClassBytesRange], ) -> ResultOrEmpty448 fn c_class_bytes(
449 &mut self,
450 ranges: &[hir::ClassBytesRange],
451 ) -> ResultOrEmpty {
452 debug_assert!(!ranges.is_empty());
453
454 let first_split_entry = self.insts.len();
455 let mut holes = vec![];
456 let mut prev_hole = Hole::None;
457 for r in &ranges[0..ranges.len() - 1] {
458 self.fill_to_next(prev_hole);
459 let split = self.push_split_hole();
460 let next = self.insts.len();
461 self.byte_classes.set_range(r.start(), r.end());
462 holes.push(self.push_hole(InstHole::Bytes {
463 start: r.start(),
464 end: r.end(),
465 }));
466 prev_hole = self.fill_split(split, Some(next), None);
467 }
468 let next = self.insts.len();
469 let r = &ranges[ranges.len() - 1];
470 self.byte_classes.set_range(r.start(), r.end());
471 holes.push(
472 self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
473 );
474 self.fill(prev_hole, next);
475 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
476 }
477
c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty478 fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
479 let hole = self.push_hole(InstHole::EmptyLook { look: look });
480 Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
481 }
482
c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty where I: IntoIterator<Item = &'a Hir>,483 fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
484 where
485 I: IntoIterator<Item = &'a Hir>,
486 {
487 let mut exprs = exprs.into_iter();
488 let Patch { mut hole, entry } = loop {
489 match exprs.next() {
490 None => return Ok(None),
491 Some(e) => {
492 if let Some(p) = self.c(e)? {
493 break p;
494 }
495 }
496 }
497 };
498 for e in exprs {
499 if let Some(p) = self.c(e)? {
500 self.fill(hole, p.entry);
501 hole = p.hole;
502 }
503 }
504 Ok(Some(Patch { hole: hole, entry: entry }))
505 }
506
c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty507 fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
508 debug_assert!(
509 exprs.len() >= 2,
510 "alternates must have at least 2 exprs"
511 );
512
513 // Initial entry point is always the first split.
514 let first_split_entry = self.insts.len();
515
516 // Save up all of the holes from each alternate. They will all get
517 // patched to point to the same location.
518 let mut holes = vec![];
519
520 // true indicates that the hole is a split where we want to fill
521 // the second branch.
522 let mut prev_hole = (Hole::None, false);
523 for e in &exprs[0..exprs.len() - 1] {
524 if prev_hole.1 {
525 let next = self.insts.len();
526 self.fill_split(prev_hole.0, None, Some(next));
527 } else {
528 self.fill_to_next(prev_hole.0);
529 }
530 let split = self.push_split_hole();
531 if let Some(Patch { hole, entry }) = self.c(e)? {
532 holes.push(hole);
533 prev_hole = (self.fill_split(split, Some(entry), None), false);
534 } else {
535 let (split1, split2) = split.dup_one();
536 holes.push(split1);
537 prev_hole = (split2, true);
538 }
539 }
540 if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
541 holes.push(hole);
542 if prev_hole.1 {
543 self.fill_split(prev_hole.0, None, Some(entry));
544 } else {
545 self.fill(prev_hole.0, entry);
546 }
547 } else {
548 // We ignore prev_hole.1. When it's true, it means we have two
549 // empty branches both pushing prev_hole.0 into holes, so both
550 // branches will go to the same place anyway.
551 holes.push(prev_hole.0);
552 }
553 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
554 }
555
c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty556 fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
557 use syntax::hir::RepetitionKind::*;
558 match rep.kind {
559 ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
560 ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
561 OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
562 Range(hir::RepetitionRange::Exactly(min_max)) => {
563 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
564 }
565 Range(hir::RepetitionRange::AtLeast(min)) => {
566 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
567 }
568 Range(hir::RepetitionRange::Bounded(min, max)) => {
569 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
570 }
571 }
572 }
573
c_repeat_zero_or_one( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty574 fn c_repeat_zero_or_one(
575 &mut self,
576 expr: &Hir,
577 greedy: bool,
578 ) -> ResultOrEmpty {
579 let split_entry = self.insts.len();
580 let split = self.push_split_hole();
581 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
582 Some(p) => p,
583 None => return self.pop_split_hole(),
584 };
585 let split_hole = if greedy {
586 self.fill_split(split, Some(entry_rep), None)
587 } else {
588 self.fill_split(split, None, Some(entry_rep))
589 };
590 let holes = vec![hole_rep, split_hole];
591 Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
592 }
593
c_repeat_zero_or_more( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty594 fn c_repeat_zero_or_more(
595 &mut self,
596 expr: &Hir,
597 greedy: bool,
598 ) -> ResultOrEmpty {
599 let split_entry = self.insts.len();
600 let split = self.push_split_hole();
601 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
602 Some(p) => p,
603 None => return self.pop_split_hole(),
604 };
605
606 self.fill(hole_rep, split_entry);
607 let split_hole = if greedy {
608 self.fill_split(split, Some(entry_rep), None)
609 } else {
610 self.fill_split(split, None, Some(entry_rep))
611 };
612 Ok(Some(Patch { hole: split_hole, entry: split_entry }))
613 }
614
c_repeat_one_or_more( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty615 fn c_repeat_one_or_more(
616 &mut self,
617 expr: &Hir,
618 greedy: bool,
619 ) -> ResultOrEmpty {
620 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
621 Some(p) => p,
622 None => return Ok(None),
623 };
624 self.fill_to_next(hole_rep);
625 let split = self.push_split_hole();
626
627 let split_hole = if greedy {
628 self.fill_split(split, Some(entry_rep), None)
629 } else {
630 self.fill_split(split, None, Some(entry_rep))
631 };
632 Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
633 }
634
c_repeat_range_min_or_more( &mut self, expr: &Hir, greedy: bool, min: u32, ) -> ResultOrEmpty635 fn c_repeat_range_min_or_more(
636 &mut self,
637 expr: &Hir,
638 greedy: bool,
639 min: u32,
640 ) -> ResultOrEmpty {
641 let min = u32_to_usize(min);
642 // Using next_inst() is ok, because we can't return it (concat would
643 // have to return Some(_) while c_repeat_range_min_or_more returns
644 // None).
645 let patch_concat = self
646 .c_concat(iter::repeat(expr).take(min))?
647 .unwrap_or(self.next_inst());
648 if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
649 self.fill(patch_concat.hole, patch_rep.entry);
650 Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
651 } else {
652 Ok(None)
653 }
654 }
655
c_repeat_range( &mut self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> ResultOrEmpty656 fn c_repeat_range(
657 &mut self,
658 expr: &Hir,
659 greedy: bool,
660 min: u32,
661 max: u32,
662 ) -> ResultOrEmpty {
663 let (min, max) = (u32_to_usize(min), u32_to_usize(max));
664 debug_assert!(min <= max);
665 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
666 if min == max {
667 return Ok(patch_concat);
668 }
669 // Same reasoning as in c_repeat_range_min_or_more (we know that min <
670 // max at this point).
671 let patch_concat = patch_concat.unwrap_or(self.next_inst());
672 let initial_entry = patch_concat.entry;
673 // It is much simpler to compile, e.g., `a{2,5}` as:
674 //
675 // aaa?a?a?
676 //
677 // But you end up with a sequence of instructions like this:
678 //
679 // 0: 'a'
680 // 1: 'a',
681 // 2: split(3, 4)
682 // 3: 'a'
683 // 4: split(5, 6)
684 // 5: 'a'
685 // 6: split(7, 8)
686 // 7: 'a'
687 // 8: MATCH
688 //
689 // This is *incredibly* inefficient because the splits end
690 // up forming a chain, which has to be resolved everything a
691 // transition is followed.
692 let mut holes = vec![];
693 let mut prev_hole = patch_concat.hole;
694 for _ in min..max {
695 self.fill_to_next(prev_hole);
696 let split = self.push_split_hole();
697 let Patch { hole, entry } = match self.c(expr)? {
698 Some(p) => p,
699 None => return self.pop_split_hole(),
700 };
701 prev_hole = hole;
702 if greedy {
703 holes.push(self.fill_split(split, Some(entry), None));
704 } else {
705 holes.push(self.fill_split(split, None, Some(entry)));
706 }
707 }
708 holes.push(prev_hole);
709 Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
710 }
711
712 /// Can be used as a default value for the c_* functions when the call to
713 /// c_function is followed by inserting at least one instruction that is
714 /// always executed after the ones written by the c* function.
next_inst(&self) -> Patch715 fn next_inst(&self) -> Patch {
716 Patch { hole: Hole::None, entry: self.insts.len() }
717 }
718
fill(&mut self, hole: Hole, goto: InstPtr)719 fn fill(&mut self, hole: Hole, goto: InstPtr) {
720 match hole {
721 Hole::None => {}
722 Hole::One(pc) => {
723 self.insts[pc].fill(goto);
724 }
725 Hole::Many(holes) => {
726 for hole in holes {
727 self.fill(hole, goto);
728 }
729 }
730 }
731 }
732
fill_to_next(&mut self, hole: Hole)733 fn fill_to_next(&mut self, hole: Hole) {
734 let next = self.insts.len();
735 self.fill(hole, next);
736 }
737
fill_split( &mut self, hole: Hole, goto1: Option<InstPtr>, goto2: Option<InstPtr>, ) -> Hole738 fn fill_split(
739 &mut self,
740 hole: Hole,
741 goto1: Option<InstPtr>,
742 goto2: Option<InstPtr>,
743 ) -> Hole {
744 match hole {
745 Hole::None => Hole::None,
746 Hole::One(pc) => match (goto1, goto2) {
747 (Some(goto1), Some(goto2)) => {
748 self.insts[pc].fill_split(goto1, goto2);
749 Hole::None
750 }
751 (Some(goto1), None) => {
752 self.insts[pc].half_fill_split_goto1(goto1);
753 Hole::One(pc)
754 }
755 (None, Some(goto2)) => {
756 self.insts[pc].half_fill_split_goto2(goto2);
757 Hole::One(pc)
758 }
759 (None, None) => unreachable!(
760 "at least one of the split \
761 holes must be filled"
762 ),
763 },
764 Hole::Many(holes) => {
765 let mut new_holes = vec![];
766 for hole in holes {
767 new_holes.push(self.fill_split(hole, goto1, goto2));
768 }
769 if new_holes.is_empty() {
770 Hole::None
771 } else if new_holes.len() == 1 {
772 new_holes.pop().unwrap()
773 } else {
774 Hole::Many(new_holes)
775 }
776 }
777 }
778 }
779
push_compiled(&mut self, inst: Inst)780 fn push_compiled(&mut self, inst: Inst) {
781 self.insts.push(MaybeInst::Compiled(inst));
782 }
783
push_hole(&mut self, inst: InstHole) -> Hole784 fn push_hole(&mut self, inst: InstHole) -> Hole {
785 let hole = self.insts.len();
786 self.insts.push(MaybeInst::Uncompiled(inst));
787 Hole::One(hole)
788 }
789
push_split_hole(&mut self) -> Hole790 fn push_split_hole(&mut self) -> Hole {
791 let hole = self.insts.len();
792 self.insts.push(MaybeInst::Split);
793 Hole::One(hole)
794 }
795
pop_split_hole(&mut self) -> ResultOrEmpty796 fn pop_split_hole(&mut self) -> ResultOrEmpty {
797 self.insts.pop();
798 Ok(None)
799 }
800
check_size(&self) -> result::Result<(), Error>801 fn check_size(&self) -> result::Result<(), Error> {
802 use std::mem::size_of;
803
804 let size =
805 self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
806 if size > self.size_limit {
807 Err(Error::CompiledTooBig(self.size_limit))
808 } else {
809 Ok(())
810 }
811 }
812 }
813
/// A reference to zero or more not-yet-filled jump targets in the
/// instruction stream.
#[derive(Debug)]
enum Hole {
    // Nothing to backpatch.
    None,
    // A single instruction with an unfilled goto.
    One(InstPtr),
    // A collection of holes, all filled with the same target.
    Many(Vec<Hole>),
}
820
821 impl Hole {
dup_one(self) -> (Self, Self)822 fn dup_one(self) -> (Self, Self) {
823 match self {
824 Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
825 Hole::None | Hole::Many(_) => {
826 unreachable!("must be called on single hole")
827 }
828 }
829 }
830 }
831
/// An instruction in one of several states of completion during compilation.
#[derive(Clone, Debug)]
enum MaybeInst {
    // A fully resolved instruction.
    Compiled(Inst),
    // An instruction whose goto target is not yet known.
    Uncompiled(InstHole),
    // A split with neither branch filled.
    Split,
    // A split with only the first branch (goto1) filled.
    Split1(InstPtr),
    // A split with only the second branch (goto2) filled.
    Split2(InstPtr),
}
840
841 impl MaybeInst {
fill(&mut self, goto: InstPtr)842 fn fill(&mut self, goto: InstPtr) {
843 let maybeinst = match *self {
844 MaybeInst::Split => MaybeInst::Split1(goto),
845 MaybeInst::Uncompiled(ref inst) => {
846 MaybeInst::Compiled(inst.fill(goto))
847 }
848 MaybeInst::Split1(goto1) => {
849 MaybeInst::Compiled(Inst::Split(InstSplit {
850 goto1: goto1,
851 goto2: goto,
852 }))
853 }
854 MaybeInst::Split2(goto2) => {
855 MaybeInst::Compiled(Inst::Split(InstSplit {
856 goto1: goto,
857 goto2: goto2,
858 }))
859 }
860 _ => unreachable!(
861 "not all instructions were compiled! \
862 found uncompiled instruction: {:?}",
863 self
864 ),
865 };
866 *self = maybeinst;
867 }
868
fill_split(&mut self, goto1: InstPtr, goto2: InstPtr)869 fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
870 let filled = match *self {
871 MaybeInst::Split => {
872 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
873 }
874 _ => unreachable!(
875 "must be called on Split instruction, \
876 instead it was called on: {:?}",
877 self
878 ),
879 };
880 *self = MaybeInst::Compiled(filled);
881 }
882
half_fill_split_goto1(&mut self, goto1: InstPtr)883 fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
884 let half_filled = match *self {
885 MaybeInst::Split => goto1,
886 _ => unreachable!(
887 "must be called on Split instruction, \
888 instead it was called on: {:?}",
889 self
890 ),
891 };
892 *self = MaybeInst::Split1(half_filled);
893 }
894
half_fill_split_goto2(&mut self, goto2: InstPtr)895 fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
896 let half_filled = match *self {
897 MaybeInst::Split => goto2,
898 _ => unreachable!(
899 "must be called on Split instruction, \
900 instead it was called on: {:?}",
901 self
902 ),
903 };
904 *self = MaybeInst::Split2(half_filled);
905 }
906
unwrap(self) -> Inst907 fn unwrap(self) -> Inst {
908 match self {
909 MaybeInst::Compiled(inst) => inst,
910 _ => unreachable!(
911 "must be called on a compiled instruction, \
912 instead it was called on: {:?}",
913 self
914 ),
915 }
916 }
917 }
918
/// The payload of an instruction that is still waiting on its goto target;
/// `fill` turns it into the corresponding `Inst`.
#[derive(Clone, Debug)]
enum InstHole {
    // A capture-slot save point.
    Save { slot: usize },
    // A zero-width assertion.
    EmptyLook { look: EmptyLook },
    // A single Unicode scalar value.
    Char { c: char },
    // A set of inclusive scalar value ranges.
    Ranges { ranges: Vec<(char, char)> },
    // A single inclusive byte range.
    Bytes { start: u8, end: u8 },
}
927
impl InstHole {
    /// Complete this hole into a real instruction by supplying its `goto`.
    fn fill(&self, goto: InstPtr) -> Inst {
        match *self {
            InstHole::Save { slot } => {
                Inst::Save(InstSave { goto: goto, slot: slot })
            }
            InstHole::EmptyLook { look } => {
                Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
            }
            InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
            InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
                goto: goto,
                ranges: ranges.clone().into_boxed_slice(),
            }),
            InstHole::Bytes { start, end } => {
                Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
            }
        }
    }
}
948
/// Helper for compiling a Unicode scalar value class into a byte-based
/// program by expanding its ranges into UTF-8 byte sequences.
struct CompileClass<'a, 'b> {
    // The parent compiler that receives the emitted instructions.
    c: &'a mut Compiler,
    // The scalar value ranges to compile.
    ranges: &'b [hir::ClassUnicodeRange],
}
953
954 impl<'a, 'b> CompileClass<'a, 'b> {
compile(mut self) -> Result955 fn compile(mut self) -> Result {
956 let mut holes = vec![];
957 let mut initial_entry = None;
958 let mut last_split = Hole::None;
959 let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
960 self.c.suffix_cache.clear();
961
962 for (i, range) in self.ranges.iter().enumerate() {
963 let is_last_range = i + 1 == self.ranges.len();
964 utf8_seqs.reset(range.start(), range.end());
965 let mut it = (&mut utf8_seqs).peekable();
966 loop {
967 let utf8_seq = match it.next() {
968 None => break,
969 Some(utf8_seq) => utf8_seq,
970 };
971 if is_last_range && it.peek().is_none() {
972 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
973 holes.push(hole);
974 self.c.fill(last_split, entry);
975 last_split = Hole::None;
976 if initial_entry.is_none() {
977 initial_entry = Some(entry);
978 }
979 } else {
980 if initial_entry.is_none() {
981 initial_entry = Some(self.c.insts.len());
982 }
983 self.c.fill_to_next(last_split);
984 last_split = self.c.push_split_hole();
985 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
986 holes.push(hole);
987 last_split =
988 self.c.fill_split(last_split, Some(entry), None);
989 }
990 }
991 }
992 self.c.utf8_seqs = Some(utf8_seqs);
993 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
994 }
995
c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result996 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
997 if self.c.compiled.is_reverse {
998 self.c_utf8_seq_(seq)
999 } else {
1000 self.c_utf8_seq_(seq.into_iter().rev())
1001 }
1002 }
1003
c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result where I: IntoIterator<Item = &'r Utf8Range>,1004 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
1005 where
1006 I: IntoIterator<Item = &'r Utf8Range>,
1007 {
1008 // The initial instruction for each UTF-8 sequence should be the same.
1009 let mut from_inst = ::std::usize::MAX;
1010 let mut last_hole = Hole::None;
1011 for byte_range in seq {
1012 let key = SuffixCacheKey {
1013 from_inst: from_inst,
1014 start: byte_range.start,
1015 end: byte_range.end,
1016 };
1017 {
1018 let pc = self.c.insts.len();
1019 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1020 from_inst = cached_pc;
1021 continue;
1022 }
1023 }
1024 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1025 if from_inst == ::std::usize::MAX {
1026 last_hole = self.c.push_hole(InstHole::Bytes {
1027 start: byte_range.start,
1028 end: byte_range.end,
1029 });
1030 } else {
1031 self.c.push_compiled(Inst::Bytes(InstBytes {
1032 goto: from_inst,
1033 start: byte_range.start,
1034 end: byte_range.end,
1035 }));
1036 }
1037 from_inst = self.c.insts.len().checked_sub(1).unwrap();
1038 debug_assert!(from_inst < ::std::usize::MAX);
1039 }
1040 debug_assert!(from_inst < ::std::usize::MAX);
1041 Ok(Patch { hole: last_hole, entry: from_inst })
1042 }
1043 }
1044
/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
/// The set of byte ranges looks like this:
///
/// [0-7F]
/// [C2-DF][80-BF]
/// [E0][A0-BF][80-BF]
/// [E1-EC][80-BF][80-BF]
/// [ED][80-9F][80-BF]
/// [EE-EF][80-BF][80-BF]
///
/// Each line above translates to one alternate in the compiled regex program.
/// However, all but one of the alternates end in the same suffix, which is
/// a waste of an instruction. The suffix cache facilitates reusing them across
/// alternates.
///
/// Note that a HashMap could be trivially used for this, but we don't need its
/// overhead. Some small bounded space (LRU style) is more than enough.
///
/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
/// except it uses hashes as original indices and then compares full keys for
/// validation against `dense` array.
#[derive(Debug)]
struct SuffixCache {
    // Maps a key's hash to an index into `dense`. Indices may be stale;
    // `get` validates the full key before trusting them.
    sparse: Box<[usize]>,
    // The cached entries themselves, in insertion order.
    dense: Vec<SuffixCacheEntry>,
}
1072
/// A single cache entry: a key together with the program counter of the
/// instruction it maps to.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
struct SuffixCacheEntry {
    key: SuffixCacheKey,
    pc: InstPtr,
}
1078
/// The cache key for a shared suffix: a byte range plus the instruction it
/// transitions to.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
struct SuffixCacheKey {
    // The goto of the instruction; `usize::MAX` when there is none (the
    // sentinel used by `c_utf8_seq_` for the final byte range).
    from_inst: InstPtr,
    start: u8,
    end: u8,
}
1085
1086 impl SuffixCache {
new(size: usize) -> Self1087 fn new(size: usize) -> Self {
1088 SuffixCache {
1089 sparse: vec![0usize; size].into(),
1090 dense: Vec::with_capacity(size),
1091 }
1092 }
1093
get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr>1094 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1095 let hash = self.hash(&key);
1096 let pos = &mut self.sparse[hash];
1097 if let Some(entry) = self.dense.get(*pos) {
1098 if entry.key == key {
1099 return Some(entry.pc);
1100 }
1101 }
1102 *pos = self.dense.len();
1103 self.dense.push(SuffixCacheEntry { key: key, pc: pc });
1104 None
1105 }
1106
clear(&mut self)1107 fn clear(&mut self) {
1108 self.dense.clear();
1109 }
1110
hash(&self, suffix: &SuffixCacheKey) -> usize1111 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1112 // Basic FNV-1a hash as described:
1113 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1114 const FNV_PRIME: u64 = 1099511628211;
1115 let mut h = 14695981039346656037;
1116 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1117 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1118 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1119 (h as usize) % self.sparse.len()
1120 }
1121 }
1122
/// Tracks boundaries between byte classes: entry `b` is true when byte `b`
/// is the last byte of a class (i.e. `b` and `b + 1` must be distinguished).
struct ByteClassSet([bool; 256]);
1124
1125 impl ByteClassSet {
new() -> Self1126 fn new() -> Self {
1127 ByteClassSet([false; 256])
1128 }
1129
set_range(&mut self, start: u8, end: u8)1130 fn set_range(&mut self, start: u8, end: u8) {
1131 debug_assert!(start <= end);
1132 if start > 0 {
1133 self.0[start as usize - 1] = true;
1134 }
1135 self.0[end as usize] = true;
1136 }
1137
set_word_boundary(&mut self)1138 fn set_word_boundary(&mut self) {
1139 // We need to mark all ranges of bytes whose pairs result in
1140 // evaluating \b differently.
1141 let iswb = is_word_byte;
1142 let mut b1: u16 = 0;
1143 let mut b2: u16;
1144 while b1 <= 255 {
1145 b2 = b1 + 1;
1146 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1147 b2 += 1;
1148 }
1149 self.set_range(b1 as u8, (b2 - 1) as u8);
1150 b1 = b2;
1151 }
1152 }
1153
byte_classes(&self) -> Vec<u8>1154 fn byte_classes(&self) -> Vec<u8> {
1155 // N.B. If you're debugging the DFA, it's useful to simply return
1156 // `(0..256).collect()`, which effectively removes the byte classes
1157 // and makes the transitions easier to read.
1158 // (0usize..256).map(|x| x as u8).collect()
1159 let mut byte_classes = vec![0; 256];
1160 let mut class = 0u8;
1161 let mut i = 0;
1162 loop {
1163 byte_classes[i] = class as u8;
1164 if i >= 255 {
1165 break;
1166 }
1167 if self.0[i] {
1168 class = class.checked_add(1).unwrap();
1169 }
1170 i += 1;
1171 }
1172 byte_classes
1173 }
1174 }
1175
1176 impl fmt::Debug for ByteClassSet {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1177 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1178 f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
1179 }
1180 }
1181
/// Convert `n` to `usize`, panicking if it does not fit.
///
/// On targets where `usize` is at least 32 bits wide the guard is
/// unconditionally false and compiles away; it only matters on platforms
/// with a smaller pointer size.
// TODO Use `std::convert::TryFrom` once it's stable.
fn u32_to_usize(n: u32) -> usize {
    match (n as u64) <= (::std::usize::MAX as u64) {
        true => n as usize,
        false => panic!("BUG: {} is too big to be pointer sized", n),
    }
}
1191
#[cfg(test)]
mod tests {
    use super::ByteClassSet;

    #[test]
    fn byte_classes() {
        // A single range yields three classes: below it, inside it and
        // above it.
        let mut set = ByteClassSet::new();
        set.set_range(b'a', b'z');
        let classes = set.byte_classes();
        let expected = [
            (0usize, 0u8),
            (1, 0),
            (2, 0),
            (b'a' as usize - 1, 0),
            (b'a' as usize, 1),
            (b'm' as usize, 1),
            (b'z' as usize, 1),
            (b'z' as usize + 1, 2),
            (254, 2),
            (255, 2),
        ];
        for &(index, class) in expected.iter() {
            assert_eq!(classes[index], class);
        }

        // Two disjoint ranges yield four classes.
        let mut set = ByteClassSet::new();
        set.set_range(0, 2);
        set.set_range(4, 6);
        let classes = set.byte_classes();
        let expected = [
            (0usize, 0u8),
            (1, 0),
            (2, 0),
            (3, 1),
            (4, 2),
            (5, 2),
            (6, 2),
            (7, 3),
            (255, 3),
        ];
        for &(index, class) in expected.iter() {
            assert_eq!(classes[index], class);
        }
    }

    #[test]
    fn full_byte_classes() {
        // Marking every byte as its own range must still produce a table
        // with one entry per byte value.
        let mut set = ByteClassSet::new();
        for b in 0..256u16 {
            set.set_range(b as u8, b as u8);
        }
        assert_eq!(set.byte_classes().len(), 256);
    }
}
1236