1 use std::collections::HashMap;
2 use std::iter;
3 use std::result;
4 use std::sync::Arc;
5
6 use syntax::hir::{self, Hir};
7 use syntax::is_word_byte;
8 use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
9
10 use prog::{
11 EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
12 InstSave, InstSplit, Program,
13 };
14
15 use Error;
16
17 type Result = result::Result<Patch, Error>;
18
19 #[derive(Debug)]
20 struct Patch {
21 hole: Hole,
22 entry: InstPtr,
23 }
24
25 /// A compiler translates a regular expression AST to a sequence of
26 /// instructions. The sequence of instructions represents an NFA.
27 pub struct Compiler {
28 insts: Vec<MaybeInst>,
29 compiled: Program,
30 capture_name_idx: HashMap<String, usize>,
31 num_exprs: usize,
32 size_limit: usize,
33 suffix_cache: SuffixCache,
34 utf8_seqs: Option<Utf8Sequences>,
35 byte_classes: ByteClassSet,
36 }
37
38 impl Compiler {
39 /// Create a new regular expression compiler.
40 ///
41 /// Various options can be set before calling `compile` on an expression.
new() -> Self42 pub fn new() -> Self {
43 Compiler {
44 insts: vec![],
45 compiled: Program::new(),
46 capture_name_idx: HashMap::new(),
47 num_exprs: 0,
48 size_limit: 10 * (1 << 20),
49 suffix_cache: SuffixCache::new(1000),
50 utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
51 byte_classes: ByteClassSet::new(),
52 }
53 }
54
55 /// The size of the resulting program is limited by size_limit. If
56 /// the program approximately exceeds the given size (in bytes), then
57 /// compilation will stop and return an error.
size_limit(mut self, size_limit: usize) -> Self58 pub fn size_limit(mut self, size_limit: usize) -> Self {
59 self.size_limit = size_limit;
60 self
61 }
62
63 /// If bytes is true, then the program is compiled as a byte based
64 /// automaton, which incorporates UTF-8 decoding into the machine. If it's
65 /// false, then the automaton is Unicode scalar value based, e.g., an
66 /// engine utilizing such an automaton is responsible for UTF-8 decoding.
67 ///
68 /// The specific invariant is that when returning a byte based machine,
69 /// the neither the `Char` nor `Ranges` instructions are produced.
70 /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
71 /// instruction is never produced.
72 ///
73 /// Note that `dfa(true)` implies `bytes(true)`.
bytes(mut self, yes: bool) -> Self74 pub fn bytes(mut self, yes: bool) -> Self {
75 self.compiled.is_bytes = yes;
76 self
77 }
78
79 /// When disabled, the program compiled may match arbitrary bytes.
80 ///
81 /// When enabled (the default), all compiled programs exclusively match
82 /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self83 pub fn only_utf8(mut self, yes: bool) -> Self {
84 self.compiled.only_utf8 = yes;
85 self
86 }
87
88 /// When set, the machine returned is suitable for use in the DFA matching
89 /// engine.
90 ///
91 /// In particular, this ensures that if the regex is not anchored in the
92 /// beginning, then a preceding `.*?` is included in the program. (The NFA
93 /// based engines handle the preceding `.*?` explicitly, which is difficult
94 /// or impossible in the DFA engine.)
dfa(mut self, yes: bool) -> Self95 pub fn dfa(mut self, yes: bool) -> Self {
96 self.compiled.is_dfa = yes;
97 self
98 }
99
100 /// When set, the machine returned is suitable for matching text in
101 /// reverse. In particular, all concatenations are flipped.
reverse(mut self, yes: bool) -> Self102 pub fn reverse(mut self, yes: bool) -> Self {
103 self.compiled.is_reverse = yes;
104 self
105 }
106
107 /// Compile a regular expression given its AST.
108 ///
109 /// The compiler is guaranteed to succeed unless the program exceeds the
110 /// specified size limit. If the size limit is exceeded, then compilation
111 /// stops and returns an error.
compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error>112 pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
113 debug_assert!(!exprs.is_empty());
114 self.num_exprs = exprs.len();
115 if exprs.len() == 1 {
116 self.compile_one(&exprs[0])
117 } else {
118 self.compile_many(exprs)
119 }
120 }
121
compile_one(mut self, expr: &Hir) -> result::Result<Program, Error>122 fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
123 // If we're compiling a forward DFA and we aren't anchored, then
124 // add a `.*?` before the first capture group.
125 // Other matching engines handle this by baking the logic into the
126 // matching engine itself.
127 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
128 self.compiled.is_anchored_start = expr.is_anchored_start();
129 self.compiled.is_anchored_end = expr.is_anchored_end();
130 if self.compiled.needs_dotstar() {
131 dotstar_patch = self.c_dotstar()?;
132 self.compiled.start = dotstar_patch.entry;
133 }
134 self.compiled.captures = vec![None];
135 let patch = self.c_capture(0, expr)?;
136 if self.compiled.needs_dotstar() {
137 self.fill(dotstar_patch.hole, patch.entry);
138 } else {
139 self.compiled.start = patch.entry;
140 }
141 self.fill_to_next(patch.hole);
142 self.compiled.matches = vec![self.insts.len()];
143 self.push_compiled(Inst::Match(0));
144 self.compile_finish()
145 }
146
compile_many( mut self, exprs: &[Hir], ) -> result::Result<Program, Error>147 fn compile_many(
148 mut self,
149 exprs: &[Hir],
150 ) -> result::Result<Program, Error> {
151 debug_assert!(exprs.len() > 1);
152
153 self.compiled.is_anchored_start =
154 exprs.iter().all(|e| e.is_anchored_start());
155 self.compiled.is_anchored_end =
156 exprs.iter().all(|e| e.is_anchored_end());
157 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
158 if self.compiled.needs_dotstar() {
159 dotstar_patch = self.c_dotstar()?;
160 self.compiled.start = dotstar_patch.entry;
161 } else {
162 self.compiled.start = 0; // first instruction is always split
163 }
164 self.fill_to_next(dotstar_patch.hole);
165
166 let mut prev_hole = Hole::None;
167 for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
168 self.fill_to_next(prev_hole);
169 let split = self.push_split_hole();
170 let Patch { hole, entry } = self.c_capture(0, expr)?;
171 self.fill_to_next(hole);
172 self.compiled.matches.push(self.insts.len());
173 self.push_compiled(Inst::Match(i));
174 prev_hole = self.fill_split(split, Some(entry), None);
175 }
176 let i = exprs.len() - 1;
177 let Patch { hole, entry } = self.c_capture(0, &exprs[i])?;
178 self.fill(prev_hole, entry);
179 self.fill_to_next(hole);
180 self.compiled.matches.push(self.insts.len());
181 self.push_compiled(Inst::Match(i));
182 self.compile_finish()
183 }
184
compile_finish(mut self) -> result::Result<Program, Error>185 fn compile_finish(mut self) -> result::Result<Program, Error> {
186 self.compiled.insts =
187 self.insts.into_iter().map(|inst| inst.unwrap()).collect();
188 self.compiled.byte_classes = self.byte_classes.byte_classes();
189 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
190 Ok(self.compiled)
191 }
192
193 /// Compile expr into self.insts, returning a patch on success,
194 /// or an error if we run out of memory.
195 ///
196 /// All of the c_* methods of the compiler share the contract outlined
197 /// here.
198 ///
199 /// The main thing that a c_* method does is mutate `self.insts`
200 /// to add a list of mostly compiled instructions required to execute
201 /// the given expression. `self.insts` contains MaybeInsts rather than
202 /// Insts because there is some backpatching required.
203 ///
204 /// The `Patch` value returned by each c_* method provides metadata
205 /// about the compiled instructions emitted to `self.insts`. The
206 /// `entry` member of the patch refers to the first instruction
207 /// (the entry point), while the `hole` member contains zero or
208 /// more offsets to partial instructions that need to be backpatched.
209 /// The c_* routine can't know where its list of instructions are going to
210 /// jump to after execution, so it is up to the caller to patch
211 /// these jumps to point to the right place. So compiling some
212 /// expression, e, we would end up with a situation that looked like:
213 ///
214 /// ```text
215 /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
216 /// ^ ^ ^
217 /// | \ /
218 /// entry \ /
219 /// hole
220 /// ```
221 ///
222 /// To compile two expressions, e1 and e2, concatinated together we
223 /// would do:
224 ///
225 /// ```ignore
226 /// let patch1 = self.c(e1);
227 /// let patch2 = self.c(e2);
228 /// ```
229 ///
230 /// while leaves us with a situation that looks like
231 ///
232 /// ```text
233 /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
234 /// ^ ^ ^ ^
235 /// | | | |
236 /// entry1 hole1 entry2 hole2
237 /// ```
238 ///
239 /// Then to merge the two patches together into one we would backpatch
240 /// hole1 with entry2 and return a new patch that enters at entry1
241 /// and has hole2 for a hole. In fact, if you look at the c_concat
242 /// method you will see that it does exactly this, though it handles
243 /// a list of expressions rather than just the two that we use for
244 /// an example.
c(&mut self, expr: &Hir) -> Result245 fn c(&mut self, expr: &Hir) -> Result {
246 use prog;
247 use syntax::hir::HirKind::*;
248
249 self.check_size()?;
250 match *expr.kind() {
251 Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }),
252 Literal(hir::Literal::Unicode(c)) => self.c_char(c),
253 Literal(hir::Literal::Byte(b)) => {
254 assert!(self.compiled.uses_bytes());
255 self.c_byte(b)
256 }
257 Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
258 Class(hir::Class::Bytes(ref cls)) => {
259 if self.compiled.uses_bytes() {
260 self.c_class_bytes(cls.ranges())
261 } else {
262 assert!(cls.is_all_ascii());
263 let mut char_ranges = vec![];
264 for r in cls.iter() {
265 let (s, e) = (r.start() as char, r.end() as char);
266 char_ranges.push(hir::ClassUnicodeRange::new(s, e));
267 }
268 self.c_class(&char_ranges)
269 }
270 }
271 Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
272 self.byte_classes.set_range(b'\n', b'\n');
273 self.c_empty_look(prog::EmptyLook::EndLine)
274 }
275 Anchor(hir::Anchor::StartLine) => {
276 self.byte_classes.set_range(b'\n', b'\n');
277 self.c_empty_look(prog::EmptyLook::StartLine)
278 }
279 Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
280 self.byte_classes.set_range(b'\n', b'\n');
281 self.c_empty_look(prog::EmptyLook::StartLine)
282 }
283 Anchor(hir::Anchor::EndLine) => {
284 self.byte_classes.set_range(b'\n', b'\n');
285 self.c_empty_look(prog::EmptyLook::EndLine)
286 }
287 Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
288 self.c_empty_look(prog::EmptyLook::EndText)
289 }
290 Anchor(hir::Anchor::StartText) => {
291 self.c_empty_look(prog::EmptyLook::StartText)
292 }
293 Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
294 self.c_empty_look(prog::EmptyLook::StartText)
295 }
296 Anchor(hir::Anchor::EndText) => {
297 self.c_empty_look(prog::EmptyLook::EndText)
298 }
299 WordBoundary(hir::WordBoundary::Unicode) => {
300 if !cfg!(feature = "unicode-perl") {
301 return Err(Error::Syntax(
302 "Unicode word boundaries are unavailable when \
303 the unicode-perl feature is disabled"
304 .to_string(),
305 ));
306 }
307 self.compiled.has_unicode_word_boundary = true;
308 self.byte_classes.set_word_boundary();
309 self.c_empty_look(prog::EmptyLook::WordBoundary)
310 }
311 WordBoundary(hir::WordBoundary::UnicodeNegate) => {
312 if !cfg!(feature = "unicode-perl") {
313 return Err(Error::Syntax(
314 "Unicode word boundaries are unavailable when \
315 the unicode-perl feature is disabled"
316 .to_string(),
317 ));
318 }
319 self.compiled.has_unicode_word_boundary = true;
320 self.byte_classes.set_word_boundary();
321 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
322 }
323 WordBoundary(hir::WordBoundary::Ascii) => {
324 self.byte_classes.set_word_boundary();
325 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
326 }
327 WordBoundary(hir::WordBoundary::AsciiNegate) => {
328 self.byte_classes.set_word_boundary();
329 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
330 }
331 Group(ref g) => match g.kind {
332 hir::GroupKind::NonCapturing => self.c(&g.hir),
333 hir::GroupKind::CaptureIndex(index) => {
334 if index as usize >= self.compiled.captures.len() {
335 self.compiled.captures.push(None);
336 }
337 self.c_capture(2 * index as usize, &g.hir)
338 }
339 hir::GroupKind::CaptureName { index, ref name } => {
340 if index as usize >= self.compiled.captures.len() {
341 let n = name.to_string();
342 self.compiled.captures.push(Some(n.clone()));
343 self.capture_name_idx.insert(n, index as usize);
344 }
345 self.c_capture(2 * index as usize, &g.hir)
346 }
347 },
348 Concat(ref es) => {
349 if self.compiled.is_reverse {
350 self.c_concat(es.iter().rev())
351 } else {
352 self.c_concat(es)
353 }
354 }
355 Alternation(ref es) => self.c_alternate(&**es),
356 Repetition(ref rep) => self.c_repeat(rep),
357 }
358 }
359
c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result360 fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result {
361 if self.num_exprs > 1 || self.compiled.is_dfa {
362 // Don't ever compile Save instructions for regex sets because
363 // they are never used. They are also never used in DFA programs
364 // because DFAs can't handle captures.
365 self.c(expr)
366 } else {
367 let entry = self.insts.len();
368 let hole = self.push_hole(InstHole::Save { slot: first_slot });
369 let patch = self.c(expr)?;
370 self.fill(hole, patch.entry);
371 self.fill_to_next(patch.hole);
372 let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
373 Ok(Patch { hole: hole, entry: entry })
374 }
375 }
376
c_dotstar(&mut self) -> Result377 fn c_dotstar(&mut self) -> Result {
378 Ok(if !self.compiled.only_utf8() {
379 self.c(&Hir::repetition(hir::Repetition {
380 kind: hir::RepetitionKind::ZeroOrMore,
381 greedy: false,
382 hir: Box::new(Hir::any(true)),
383 }))?
384 } else {
385 self.c(&Hir::repetition(hir::Repetition {
386 kind: hir::RepetitionKind::ZeroOrMore,
387 greedy: false,
388 hir: Box::new(Hir::any(false)),
389 }))?
390 })
391 }
392
c_char(&mut self, c: char) -> Result393 fn c_char(&mut self, c: char) -> Result {
394 if self.compiled.uses_bytes() {
395 if c.is_ascii() {
396 let b = c as u8;
397 let hole =
398 self.push_hole(InstHole::Bytes { start: b, end: b });
399 self.byte_classes.set_range(b, b);
400 Ok(Patch { hole, entry: self.insts.len() - 1 })
401 } else {
402 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
403 }
404 } else {
405 let hole = self.push_hole(InstHole::Char { c: c });
406 Ok(Patch { hole, entry: self.insts.len() - 1 })
407 }
408 }
409
c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result410 fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result {
411 assert!(!ranges.is_empty());
412 if self.compiled.uses_bytes() {
413 CompileClass { c: self, ranges: ranges }.compile()
414 } else {
415 let ranges: Vec<(char, char)> =
416 ranges.iter().map(|r| (r.start(), r.end())).collect();
417 let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
418 self.push_hole(InstHole::Char { c: ranges[0].0 })
419 } else {
420 self.push_hole(InstHole::Ranges { ranges: ranges })
421 };
422 Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
423 }
424 }
425
c_byte(&mut self, b: u8) -> Result426 fn c_byte(&mut self, b: u8) -> Result {
427 self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
428 }
429
c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result430 fn c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result {
431 debug_assert!(!ranges.is_empty());
432
433 let first_split_entry = self.insts.len();
434 let mut holes = vec![];
435 let mut prev_hole = Hole::None;
436 for r in &ranges[0..ranges.len() - 1] {
437 self.fill_to_next(prev_hole);
438 let split = self.push_split_hole();
439 let next = self.insts.len();
440 self.byte_classes.set_range(r.start(), r.end());
441 holes.push(self.push_hole(InstHole::Bytes {
442 start: r.start(),
443 end: r.end(),
444 }));
445 prev_hole = self.fill_split(split, Some(next), None);
446 }
447 let next = self.insts.len();
448 let r = &ranges[ranges.len() - 1];
449 self.byte_classes.set_range(r.start(), r.end());
450 holes.push(
451 self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
452 );
453 self.fill(prev_hole, next);
454 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
455 }
456
c_empty_look(&mut self, look: EmptyLook) -> Result457 fn c_empty_look(&mut self, look: EmptyLook) -> Result {
458 let hole = self.push_hole(InstHole::EmptyLook { look: look });
459 Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
460 }
461
c_concat<'a, I>(&mut self, exprs: I) -> Result where I: IntoIterator<Item = &'a Hir>,462 fn c_concat<'a, I>(&mut self, exprs: I) -> Result
463 where
464 I: IntoIterator<Item = &'a Hir>,
465 {
466 let mut exprs = exprs.into_iter();
467 let first = match exprs.next() {
468 Some(expr) => expr,
469 None => {
470 return Ok(Patch { hole: Hole::None, entry: self.insts.len() })
471 }
472 };
473 let Patch { mut hole, entry } = self.c(first)?;
474 for e in exprs {
475 let p = self.c(e)?;
476 self.fill(hole, p.entry);
477 hole = p.hole;
478 }
479 Ok(Patch { hole: hole, entry: entry })
480 }
481
c_alternate(&mut self, exprs: &[Hir]) -> Result482 fn c_alternate(&mut self, exprs: &[Hir]) -> Result {
483 debug_assert!(
484 exprs.len() >= 2,
485 "alternates must have at least 2 exprs"
486 );
487
488 // Initial entry point is always the first split.
489 let first_split_entry = self.insts.len();
490
491 // Save up all of the holes from each alternate. They will all get
492 // patched to point to the same location.
493 let mut holes = vec![];
494
495 let mut prev_hole = Hole::None;
496 for e in &exprs[0..exprs.len() - 1] {
497 self.fill_to_next(prev_hole);
498 let split = self.push_split_hole();
499 let prev_entry = self.insts.len();
500 let Patch { hole, entry } = self.c(e)?;
501 if prev_entry == self.insts.len() {
502 // TODO(burntsushi): It is kind of silly that we don't support
503 // empty-subexpressions in alternates, but it is supremely
504 // awkward to support them in the existing compiler
505 // infrastructure. This entire compiler needs to be thrown out
506 // anyway, so don't feel too bad.
507 return Err(Error::Syntax(
508 "alternations cannot currently contain \
509 empty sub-expressions"
510 .to_string(),
511 ));
512 }
513 holes.push(hole);
514 prev_hole = self.fill_split(split, Some(entry), None);
515 }
516 let prev_entry = self.insts.len();
517 let Patch { hole, entry } = self.c(&exprs[exprs.len() - 1])?;
518 if prev_entry == self.insts.len() {
519 // TODO(burntsushi): See TODO above.
520 return Err(Error::Syntax(
521 "alternations cannot currently contain \
522 empty sub-expressions"
523 .to_string(),
524 ));
525 }
526 holes.push(hole);
527 self.fill(prev_hole, entry);
528 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
529 }
530
c_repeat(&mut self, rep: &hir::Repetition) -> Result531 fn c_repeat(&mut self, rep: &hir::Repetition) -> Result {
532 use syntax::hir::RepetitionKind::*;
533 match rep.kind {
534 ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
535 ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
536 OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
537 Range(hir::RepetitionRange::Exactly(min_max)) => {
538 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
539 }
540 Range(hir::RepetitionRange::AtLeast(min)) => {
541 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
542 }
543 Range(hir::RepetitionRange::Bounded(min, max)) => {
544 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
545 }
546 }
547 }
548
c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result549 fn c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result {
550 let split_entry = self.insts.len();
551 let split = self.push_split_hole();
552 let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
553
554 let split_hole = if greedy {
555 self.fill_split(split, Some(entry_rep), None)
556 } else {
557 self.fill_split(split, None, Some(entry_rep))
558 };
559 let holes = vec![hole_rep, split_hole];
560 Ok(Patch { hole: Hole::Many(holes), entry: split_entry })
561 }
562
c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result563 fn c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
564 let split_entry = self.insts.len();
565 let split = self.push_split_hole();
566 let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
567
568 self.fill(hole_rep, split_entry);
569 let split_hole = if greedy {
570 self.fill_split(split, Some(entry_rep), None)
571 } else {
572 self.fill_split(split, None, Some(entry_rep))
573 };
574 Ok(Patch { hole: split_hole, entry: split_entry })
575 }
576
c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result577 fn c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
578 let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
579 self.fill_to_next(hole_rep);
580 let split = self.push_split_hole();
581
582 let split_hole = if greedy {
583 self.fill_split(split, Some(entry_rep), None)
584 } else {
585 self.fill_split(split, None, Some(entry_rep))
586 };
587 Ok(Patch { hole: split_hole, entry: entry_rep })
588 }
589
c_repeat_range_min_or_more( &mut self, expr: &Hir, greedy: bool, min: u32, ) -> Result590 fn c_repeat_range_min_or_more(
591 &mut self,
592 expr: &Hir,
593 greedy: bool,
594 min: u32,
595 ) -> Result {
596 let min = u32_to_usize(min);
597 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
598 let patch_rep = self.c_repeat_zero_or_more(expr, greedy)?;
599 self.fill(patch_concat.hole, patch_rep.entry);
600 Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry })
601 }
602
c_repeat_range( &mut self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> Result603 fn c_repeat_range(
604 &mut self,
605 expr: &Hir,
606 greedy: bool,
607 min: u32,
608 max: u32,
609 ) -> Result {
610 let (min, max) = (u32_to_usize(min), u32_to_usize(max));
611 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
612 let initial_entry = patch_concat.entry;
613 if min == max {
614 return Ok(patch_concat);
615 }
616 // It is much simpler to compile, e.g., `a{2,5}` as:
617 //
618 // aaa?a?a?
619 //
620 // But you end up with a sequence of instructions like this:
621 //
622 // 0: 'a'
623 // 1: 'a',
624 // 2: split(3, 4)
625 // 3: 'a'
626 // 4: split(5, 6)
627 // 5: 'a'
628 // 6: split(7, 8)
629 // 7: 'a'
630 // 8: MATCH
631 //
632 // This is *incredibly* inefficient because the splits end
633 // up forming a chain, which has to be resolved everything a
634 // transition is followed.
635 let mut holes = vec![];
636 let mut prev_hole = patch_concat.hole;
637 for _ in min..max {
638 self.fill_to_next(prev_hole);
639 let split = self.push_split_hole();
640 let Patch { hole, entry } = self.c(expr)?;
641 prev_hole = hole;
642 if greedy {
643 holes.push(self.fill_split(split, Some(entry), None));
644 } else {
645 holes.push(self.fill_split(split, None, Some(entry)));
646 }
647 }
648 holes.push(prev_hole);
649 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry })
650 }
651
fill(&mut self, hole: Hole, goto: InstPtr)652 fn fill(&mut self, hole: Hole, goto: InstPtr) {
653 match hole {
654 Hole::None => {}
655 Hole::One(pc) => {
656 self.insts[pc].fill(goto);
657 }
658 Hole::Many(holes) => {
659 for hole in holes {
660 self.fill(hole, goto);
661 }
662 }
663 }
664 }
665
fill_to_next(&mut self, hole: Hole)666 fn fill_to_next(&mut self, hole: Hole) {
667 let next = self.insts.len();
668 self.fill(hole, next);
669 }
670
fill_split( &mut self, hole: Hole, goto1: Option<InstPtr>, goto2: Option<InstPtr>, ) -> Hole671 fn fill_split(
672 &mut self,
673 hole: Hole,
674 goto1: Option<InstPtr>,
675 goto2: Option<InstPtr>,
676 ) -> Hole {
677 match hole {
678 Hole::None => Hole::None,
679 Hole::One(pc) => match (goto1, goto2) {
680 (Some(goto1), Some(goto2)) => {
681 self.insts[pc].fill_split(goto1, goto2);
682 Hole::None
683 }
684 (Some(goto1), None) => {
685 self.insts[pc].half_fill_split_goto1(goto1);
686 Hole::One(pc)
687 }
688 (None, Some(goto2)) => {
689 self.insts[pc].half_fill_split_goto2(goto2);
690 Hole::One(pc)
691 }
692 (None, None) => unreachable!(
693 "at least one of the split \
694 holes must be filled"
695 ),
696 },
697 Hole::Many(holes) => {
698 let mut new_holes = vec![];
699 for hole in holes {
700 new_holes.push(self.fill_split(hole, goto1, goto2));
701 }
702 if new_holes.is_empty() {
703 Hole::None
704 } else if new_holes.len() == 1 {
705 new_holes.pop().unwrap()
706 } else {
707 Hole::Many(new_holes)
708 }
709 }
710 }
711 }
712
push_compiled(&mut self, inst: Inst)713 fn push_compiled(&mut self, inst: Inst) {
714 self.insts.push(MaybeInst::Compiled(inst));
715 }
716
push_hole(&mut self, inst: InstHole) -> Hole717 fn push_hole(&mut self, inst: InstHole) -> Hole {
718 let hole = self.insts.len();
719 self.insts.push(MaybeInst::Uncompiled(inst));
720 Hole::One(hole)
721 }
722
push_split_hole(&mut self) -> Hole723 fn push_split_hole(&mut self) -> Hole {
724 let hole = self.insts.len();
725 self.insts.push(MaybeInst::Split);
726 Hole::One(hole)
727 }
728
check_size(&self) -> result::Result<(), Error>729 fn check_size(&self) -> result::Result<(), Error> {
730 use std::mem::size_of;
731
732 if self.insts.len() * size_of::<Inst>() > self.size_limit {
733 Err(Error::CompiledTooBig(self.size_limit))
734 } else {
735 Ok(())
736 }
737 }
738 }
739
740 #[derive(Debug)]
741 enum Hole {
742 None,
743 One(InstPtr),
744 Many(Vec<Hole>),
745 }
746
747 #[derive(Clone, Debug)]
748 enum MaybeInst {
749 Compiled(Inst),
750 Uncompiled(InstHole),
751 Split,
752 Split1(InstPtr),
753 Split2(InstPtr),
754 }
755
756 impl MaybeInst {
fill(&mut self, goto: InstPtr)757 fn fill(&mut self, goto: InstPtr) {
758 let filled = match *self {
759 MaybeInst::Uncompiled(ref inst) => inst.fill(goto),
760 MaybeInst::Split1(goto1) => {
761 Inst::Split(InstSplit { goto1: goto1, goto2: goto })
762 }
763 MaybeInst::Split2(goto2) => {
764 Inst::Split(InstSplit { goto1: goto, goto2: goto2 })
765 }
766 _ => unreachable!(
767 "not all instructions were compiled! \
768 found uncompiled instruction: {:?}",
769 self
770 ),
771 };
772 *self = MaybeInst::Compiled(filled);
773 }
774
fill_split(&mut self, goto1: InstPtr, goto2: InstPtr)775 fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
776 let filled = match *self {
777 MaybeInst::Split => {
778 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
779 }
780 _ => unreachable!(
781 "must be called on Split instruction, \
782 instead it was called on: {:?}",
783 self
784 ),
785 };
786 *self = MaybeInst::Compiled(filled);
787 }
788
half_fill_split_goto1(&mut self, goto1: InstPtr)789 fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
790 let half_filled = match *self {
791 MaybeInst::Split => goto1,
792 _ => unreachable!(
793 "must be called on Split instruction, \
794 instead it was called on: {:?}",
795 self
796 ),
797 };
798 *self = MaybeInst::Split1(half_filled);
799 }
800
half_fill_split_goto2(&mut self, goto2: InstPtr)801 fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
802 let half_filled = match *self {
803 MaybeInst::Split => goto2,
804 _ => unreachable!(
805 "must be called on Split instruction, \
806 instead it was called on: {:?}",
807 self
808 ),
809 };
810 *self = MaybeInst::Split2(half_filled);
811 }
812
unwrap(self) -> Inst813 fn unwrap(self) -> Inst {
814 match self {
815 MaybeInst::Compiled(inst) => inst,
816 _ => unreachable!(
817 "must be called on a compiled instruction, \
818 instead it was called on: {:?}",
819 self
820 ),
821 }
822 }
823 }
824
825 #[derive(Clone, Debug)]
826 enum InstHole {
827 Save { slot: usize },
828 EmptyLook { look: EmptyLook },
829 Char { c: char },
830 Ranges { ranges: Vec<(char, char)> },
831 Bytes { start: u8, end: u8 },
832 }
833
834 impl InstHole {
fill(&self, goto: InstPtr) -> Inst835 fn fill(&self, goto: InstPtr) -> Inst {
836 match *self {
837 InstHole::Save { slot } => {
838 Inst::Save(InstSave { goto: goto, slot: slot })
839 }
840 InstHole::EmptyLook { look } => {
841 Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
842 }
843 InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
844 InstHole::Ranges { ref ranges } => {
845 Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
846 }
847 InstHole::Bytes { start, end } => {
848 Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
849 }
850 }
851 }
852 }
853
854 struct CompileClass<'a, 'b> {
855 c: &'a mut Compiler,
856 ranges: &'b [hir::ClassUnicodeRange],
857 }
858
859 impl<'a, 'b> CompileClass<'a, 'b> {
compile(mut self) -> Result860 fn compile(mut self) -> Result {
861 let mut holes = vec![];
862 let mut initial_entry = None;
863 let mut last_split = Hole::None;
864 let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
865 self.c.suffix_cache.clear();
866
867 for (i, range) in self.ranges.iter().enumerate() {
868 let is_last_range = i + 1 == self.ranges.len();
869 utf8_seqs.reset(range.start(), range.end());
870 let mut it = (&mut utf8_seqs).peekable();
871 loop {
872 let utf8_seq = match it.next() {
873 None => break,
874 Some(utf8_seq) => utf8_seq,
875 };
876 if is_last_range && it.peek().is_none() {
877 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
878 holes.push(hole);
879 self.c.fill(last_split, entry);
880 last_split = Hole::None;
881 if initial_entry.is_none() {
882 initial_entry = Some(entry);
883 }
884 } else {
885 if initial_entry.is_none() {
886 initial_entry = Some(self.c.insts.len());
887 }
888 self.c.fill_to_next(last_split);
889 last_split = self.c.push_split_hole();
890 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
891 holes.push(hole);
892 last_split =
893 self.c.fill_split(last_split, Some(entry), None);
894 }
895 }
896 }
897 self.c.utf8_seqs = Some(utf8_seqs);
898 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
899 }
900
c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result901 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
902 if self.c.compiled.is_reverse {
903 self.c_utf8_seq_(seq)
904 } else {
905 self.c_utf8_seq_(seq.into_iter().rev())
906 }
907 }
908
c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result where I: IntoIterator<Item = &'r Utf8Range>,909 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
910 where
911 I: IntoIterator<Item = &'r Utf8Range>,
912 {
913 // The initial instruction for each UTF-8 sequence should be the same.
914 let mut from_inst = ::std::usize::MAX;
915 let mut last_hole = Hole::None;
916 for byte_range in seq {
917 let key = SuffixCacheKey {
918 from_inst: from_inst,
919 start: byte_range.start,
920 end: byte_range.end,
921 };
922 {
923 let pc = self.c.insts.len();
924 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
925 from_inst = cached_pc;
926 continue;
927 }
928 }
929 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
930 if from_inst == ::std::usize::MAX {
931 last_hole = self.c.push_hole(InstHole::Bytes {
932 start: byte_range.start,
933 end: byte_range.end,
934 });
935 } else {
936 self.c.push_compiled(Inst::Bytes(InstBytes {
937 goto: from_inst,
938 start: byte_range.start,
939 end: byte_range.end,
940 }));
941 }
942 from_inst = self.c.insts.len().checked_sub(1).unwrap();
943 debug_assert!(from_inst < ::std::usize::MAX);
944 }
945 debug_assert!(from_inst < ::std::usize::MAX);
946 Ok(Patch { hole: last_hole, entry: from_inst })
947 }
948 }
949
950 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
951 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
952 /// The set of byte ranges looks like this:
953 ///
954 /// [0-7F]
955 /// [C2-DF][80-BF]
956 /// [E0][A0-BF][80-BF]
957 /// [E1-EC][80-BF][80-BF]
958 /// [ED][80-9F][80-BF]
959 /// [EE-EF][80-BF][80-BF]
960 ///
961 /// Each line above translates to one alternate in the compiled regex program.
962 /// However, all but one of the alternates end in the same suffix, which is
963 /// a waste of an instruction. The suffix cache facilitates reusing them across
964 /// alternates.
965 ///
966 /// Note that a HashMap could be trivially used for this, but we don't need its
967 /// overhead. Some small bounded space (LRU style) is more than enough.
968 ///
969 /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
970 /// except it uses hashes as original indices and then compares full keys for
971 /// validation against `dense` array.
972 struct SuffixCache {
973 sparse: Box<[usize]>,
974 dense: Vec<SuffixCacheEntry>,
975 }
976
977 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
978 struct SuffixCacheEntry {
979 key: SuffixCacheKey,
980 pc: InstPtr,
981 }
982
983 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
984 struct SuffixCacheKey {
985 from_inst: InstPtr,
986 start: u8,
987 end: u8,
988 }
989
990 impl SuffixCache {
new(size: usize) -> Self991 fn new(size: usize) -> Self {
992 SuffixCache {
993 sparse: vec![0usize; size].into(),
994 dense: Vec::with_capacity(size),
995 }
996 }
997
get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr>998 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
999 let hash = self.hash(&key);
1000 let pos = &mut self.sparse[hash];
1001 if let Some(entry) = self.dense.get(*pos) {
1002 if entry.key == key {
1003 return Some(entry.pc);
1004 }
1005 }
1006 *pos = self.dense.len();
1007 self.dense.push(SuffixCacheEntry { key: key, pc: pc });
1008 None
1009 }
1010
clear(&mut self)1011 fn clear(&mut self) {
1012 self.dense.clear();
1013 }
1014
hash(&self, suffix: &SuffixCacheKey) -> usize1015 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1016 // Basic FNV-1a hash as described:
1017 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1018 const FNV_PRIME: u64 = 1099511628211;
1019 let mut h = 14695981039346656037;
1020 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1021 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1022 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1023 (h as usize) % self.sparse.len()
1024 }
1025 }
1026
1027 struct ByteClassSet([bool; 256]);
1028
1029 impl ByteClassSet {
new() -> Self1030 fn new() -> Self {
1031 ByteClassSet([false; 256])
1032 }
1033
set_range(&mut self, start: u8, end: u8)1034 fn set_range(&mut self, start: u8, end: u8) {
1035 debug_assert!(start <= end);
1036 if start > 0 {
1037 self.0[start as usize - 1] = true;
1038 }
1039 self.0[end as usize] = true;
1040 }
1041
set_word_boundary(&mut self)1042 fn set_word_boundary(&mut self) {
1043 // We need to mark all ranges of bytes whose pairs result in
1044 // evaluating \b differently.
1045 let iswb = is_word_byte;
1046 let mut b1: u16 = 0;
1047 let mut b2: u16;
1048 while b1 <= 255 {
1049 b2 = b1 + 1;
1050 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1051 b2 += 1;
1052 }
1053 self.set_range(b1 as u8, (b2 - 1) as u8);
1054 b1 = b2;
1055 }
1056 }
1057
byte_classes(&self) -> Vec<u8>1058 fn byte_classes(&self) -> Vec<u8> {
1059 // N.B. If you're debugging the DFA, it's useful to simply return
1060 // `(0..256).collect()`, which effectively removes the byte classes
1061 // and makes the transitions easier to read.
1062 // (0usize..256).map(|x| x as u8).collect()
1063 let mut byte_classes = vec![0; 256];
1064 let mut class = 0u8;
1065 let mut i = 0;
1066 loop {
1067 byte_classes[i] = class as u8;
1068 if i >= 255 {
1069 break;
1070 }
1071 if self.0[i] {
1072 class = class.checked_add(1).unwrap();
1073 }
1074 i += 1;
1075 }
1076 byte_classes
1077 }
1078 }
1079
u32_to_usize(n: u32) -> usize1080 fn u32_to_usize(n: u32) -> usize {
1081 // In case usize is less than 32 bits, we need to guard against overflow.
1082 // On most platforms this compiles to nothing.
1083 // TODO Use `std::convert::TryFrom` once it's stable.
1084 if (n as u64) > (::std::usize::MAX as u64) {
1085 panic!("BUG: {} is too big to be pointer sized", n)
1086 }
1087 n as usize
1088 }
1089
1090 #[cfg(test)]
1091 mod tests {
1092 use super::ByteClassSet;
1093
1094 #[test]
byte_classes()1095 fn byte_classes() {
1096 let mut set = ByteClassSet::new();
1097 set.set_range(b'a', b'z');
1098 let classes = set.byte_classes();
1099 assert_eq!(classes[0], 0);
1100 assert_eq!(classes[1], 0);
1101 assert_eq!(classes[2], 0);
1102 assert_eq!(classes[b'a' as usize - 1], 0);
1103 assert_eq!(classes[b'a' as usize], 1);
1104 assert_eq!(classes[b'm' as usize], 1);
1105 assert_eq!(classes[b'z' as usize], 1);
1106 assert_eq!(classes[b'z' as usize + 1], 2);
1107 assert_eq!(classes[254], 2);
1108 assert_eq!(classes[255], 2);
1109
1110 let mut set = ByteClassSet::new();
1111 set.set_range(0, 2);
1112 set.set_range(4, 6);
1113 let classes = set.byte_classes();
1114 assert_eq!(classes[0], 0);
1115 assert_eq!(classes[1], 0);
1116 assert_eq!(classes[2], 0);
1117 assert_eq!(classes[3], 1);
1118 assert_eq!(classes[4], 2);
1119 assert_eq!(classes[5], 2);
1120 assert_eq!(classes[6], 2);
1121 assert_eq!(classes[7], 3);
1122 assert_eq!(classes[255], 3);
1123 }
1124
1125 #[test]
full_byte_classes()1126 fn full_byte_classes() {
1127 let mut set = ByteClassSet::new();
1128 for i in 0..256u16 {
1129 set.set_range(i as u8, i as u8);
1130 }
1131 assert_eq!(set.byte_classes().len(), 256);
1132 }
1133 }
1134