1 use std::collections::HashMap;
2 use std::fmt;
3 use std::iter;
4 use std::result;
5 use std::sync::Arc;
6 
7 use syntax::hir::{self, Hir};
8 use syntax::is_word_byte;
9 use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10 
11 use prog::{
12     EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13     InstSave, InstSplit, Program,
14 };
15 
16 use Error;
17 
18 type Result = result::Result<Patch, Error>;
19 type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20 
21 #[derive(Debug)]
22 struct Patch {
23     hole: Hole,
24     entry: InstPtr,
25 }
26 
27 /// A compiler translates a regular expression AST to a sequence of
28 /// instructions. The sequence of instructions represents an NFA.
29 // `Compiler` is only public via the `internal` module, so avoid deriving
30 // `Debug`.
31 #[allow(missing_debug_implementations)]
32 pub struct Compiler {
33     insts: Vec<MaybeInst>,
34     compiled: Program,
35     capture_name_idx: HashMap<String, usize>,
36     num_exprs: usize,
37     size_limit: usize,
38     suffix_cache: SuffixCache,
39     utf8_seqs: Option<Utf8Sequences>,
40     byte_classes: ByteClassSet,
41     extra_inst_bytes: usize,
42 }
43 
44 impl Compiler {
45     /// Create a new regular expression compiler.
46     ///
47     /// Various options can be set before calling `compile` on an expression.
new() -> Self48     pub fn new() -> Self {
49         Compiler {
50             insts: vec![],
51             compiled: Program::new(),
52             capture_name_idx: HashMap::new(),
53             num_exprs: 0,
54             size_limit: 10 * (1 << 20),
55             suffix_cache: SuffixCache::new(1000),
56             utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
57             byte_classes: ByteClassSet::new(),
58             extra_inst_bytes: 0,
59         }
60     }
61 
62     /// The size of the resulting program is limited by size_limit. If
63     /// the program approximately exceeds the given size (in bytes), then
64     /// compilation will stop and return an error.
size_limit(mut self, size_limit: usize) -> Self65     pub fn size_limit(mut self, size_limit: usize) -> Self {
66         self.size_limit = size_limit;
67         self
68     }
69 
70     /// If bytes is true, then the program is compiled as a byte based
71     /// automaton, which incorporates UTF-8 decoding into the machine. If it's
72     /// false, then the automaton is Unicode scalar value based, e.g., an
73     /// engine utilizing such an automaton is responsible for UTF-8 decoding.
74     ///
75     /// The specific invariant is that when returning a byte based machine,
76     /// the neither the `Char` nor `Ranges` instructions are produced.
77     /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
78     /// instruction is never produced.
79     ///
80     /// Note that `dfa(true)` implies `bytes(true)`.
bytes(mut self, yes: bool) -> Self81     pub fn bytes(mut self, yes: bool) -> Self {
82         self.compiled.is_bytes = yes;
83         self
84     }
85 
86     /// When disabled, the program compiled may match arbitrary bytes.
87     ///
88     /// When enabled (the default), all compiled programs exclusively match
89     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self90     pub fn only_utf8(mut self, yes: bool) -> Self {
91         self.compiled.only_utf8 = yes;
92         self
93     }
94 
95     /// When set, the machine returned is suitable for use in the DFA matching
96     /// engine.
97     ///
98     /// In particular, this ensures that if the regex is not anchored in the
99     /// beginning, then a preceding `.*?` is included in the program. (The NFA
100     /// based engines handle the preceding `.*?` explicitly, which is difficult
101     /// or impossible in the DFA engine.)
dfa(mut self, yes: bool) -> Self102     pub fn dfa(mut self, yes: bool) -> Self {
103         self.compiled.is_dfa = yes;
104         self
105     }
106 
107     /// When set, the machine returned is suitable for matching text in
108     /// reverse. In particular, all concatenations are flipped.
reverse(mut self, yes: bool) -> Self109     pub fn reverse(mut self, yes: bool) -> Self {
110         self.compiled.is_reverse = yes;
111         self
112     }
113 
114     /// Compile a regular expression given its AST.
115     ///
116     /// The compiler is guaranteed to succeed unless the program exceeds the
117     /// specified size limit. If the size limit is exceeded, then compilation
118     /// stops and returns an error.
compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error>119     pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
120         debug_assert!(!exprs.is_empty());
121         self.num_exprs = exprs.len();
122         if exprs.len() == 1 {
123             self.compile_one(&exprs[0])
124         } else {
125             self.compile_many(exprs)
126         }
127     }
128 
compile_one(mut self, expr: &Hir) -> result::Result<Program, Error>129     fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
130         // If we're compiling a forward DFA and we aren't anchored, then
131         // add a `.*?` before the first capture group.
132         // Other matching engines handle this by baking the logic into the
133         // matching engine itself.
134         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
135         self.compiled.is_anchored_start = expr.is_anchored_start();
136         self.compiled.is_anchored_end = expr.is_anchored_end();
137         if self.compiled.needs_dotstar() {
138             dotstar_patch = self.c_dotstar()?;
139             self.compiled.start = dotstar_patch.entry;
140         }
141         self.compiled.captures = vec![None];
142         let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
143         if self.compiled.needs_dotstar() {
144             self.fill(dotstar_patch.hole, patch.entry);
145         } else {
146             self.compiled.start = patch.entry;
147         }
148         self.fill_to_next(patch.hole);
149         self.compiled.matches = vec![self.insts.len()];
150         self.push_compiled(Inst::Match(0));
151         self.compile_finish()
152     }
153 
compile_many( mut self, exprs: &[Hir], ) -> result::Result<Program, Error>154     fn compile_many(
155         mut self,
156         exprs: &[Hir],
157     ) -> result::Result<Program, Error> {
158         debug_assert!(exprs.len() > 1);
159 
160         self.compiled.is_anchored_start =
161             exprs.iter().all(|e| e.is_anchored_start());
162         self.compiled.is_anchored_end =
163             exprs.iter().all(|e| e.is_anchored_end());
164         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
165         if self.compiled.needs_dotstar() {
166             dotstar_patch = self.c_dotstar()?;
167             self.compiled.start = dotstar_patch.entry;
168         } else {
169             self.compiled.start = 0; // first instruction is always split
170         }
171         self.fill_to_next(dotstar_patch.hole);
172 
173         let mut prev_hole = Hole::None;
174         for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
175             self.fill_to_next(prev_hole);
176             let split = self.push_split_hole();
177             let Patch { hole, entry } =
178                 self.c_capture(0, expr)?.unwrap_or(self.next_inst());
179             self.fill_to_next(hole);
180             self.compiled.matches.push(self.insts.len());
181             self.push_compiled(Inst::Match(i));
182             prev_hole = self.fill_split(split, Some(entry), None);
183         }
184         let i = exprs.len() - 1;
185         let Patch { hole, entry } =
186             self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
187         self.fill(prev_hole, entry);
188         self.fill_to_next(hole);
189         self.compiled.matches.push(self.insts.len());
190         self.push_compiled(Inst::Match(i));
191         self.compile_finish()
192     }
193 
compile_finish(mut self) -> result::Result<Program, Error>194     fn compile_finish(mut self) -> result::Result<Program, Error> {
195         self.compiled.insts =
196             self.insts.into_iter().map(|inst| inst.unwrap()).collect();
197         self.compiled.byte_classes = self.byte_classes.byte_classes();
198         self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
199         Ok(self.compiled)
200     }
201 
202     /// Compile expr into self.insts, returning a patch on success,
203     /// or an error if we run out of memory.
204     ///
205     /// All of the c_* methods of the compiler share the contract outlined
206     /// here.
207     ///
208     /// The main thing that a c_* method does is mutate `self.insts`
209     /// to add a list of mostly compiled instructions required to execute
210     /// the given expression. `self.insts` contains MaybeInsts rather than
211     /// Insts because there is some backpatching required.
212     ///
213     /// The `Patch` value returned by each c_* method provides metadata
214     /// about the compiled instructions emitted to `self.insts`. The
215     /// `entry` member of the patch refers to the first instruction
216     /// (the entry point), while the `hole` member contains zero or
217     /// more offsets to partial instructions that need to be backpatched.
218     /// The c_* routine can't know where its list of instructions are going to
219     /// jump to after execution, so it is up to the caller to patch
220     /// these jumps to point to the right place. So compiling some
221     /// expression, e, we would end up with a situation that looked like:
222     ///
223     /// ```text
224     /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
225     ///                     ^              ^             ^
226     ///                     |                \         /
227     ///                   entry                \     /
228     ///                                         hole
229     /// ```
230     ///
231     /// To compile two expressions, e1 and e2, concatenated together we
232     /// would do:
233     ///
234     /// ```ignore
235     /// let patch1 = self.c(e1);
236     /// let patch2 = self.c(e2);
237     /// ```
238     ///
239     /// while leaves us with a situation that looks like
240     ///
241     /// ```text
242     /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
243     ///                     ^        ^            ^        ^
244     ///                     |        |            |        |
245     ///                entry1        hole1   entry2        hole2
246     /// ```
247     ///
248     /// Then to merge the two patches together into one we would backpatch
249     /// hole1 with entry2 and return a new patch that enters at entry1
250     /// and has hole2 for a hole. In fact, if you look at the c_concat
251     /// method you will see that it does exactly this, though it handles
252     /// a list of expressions rather than just the two that we use for
253     /// an example.
254     ///
255     /// Ok(None) is returned when an expression is compiled to no
256     /// instruction, and so no patch.entry value makes sense.
c(&mut self, expr: &Hir) -> ResultOrEmpty257     fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
258         use prog;
259         use syntax::hir::HirKind::*;
260 
261         self.check_size()?;
262         match *expr.kind() {
263             Empty => Ok(None),
264             Literal(hir::Literal::Unicode(c)) => self.c_char(c),
265             Literal(hir::Literal::Byte(b)) => {
266                 assert!(self.compiled.uses_bytes());
267                 self.c_byte(b)
268             }
269             Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
270             Class(hir::Class::Bytes(ref cls)) => {
271                 if self.compiled.uses_bytes() {
272                     self.c_class_bytes(cls.ranges())
273                 } else {
274                     assert!(cls.is_all_ascii());
275                     let mut char_ranges = vec![];
276                     for r in cls.iter() {
277                         let (s, e) = (r.start() as char, r.end() as char);
278                         char_ranges.push(hir::ClassUnicodeRange::new(s, e));
279                     }
280                     self.c_class(&char_ranges)
281                 }
282             }
283             Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
284                 self.byte_classes.set_range(b'\n', b'\n');
285                 self.c_empty_look(prog::EmptyLook::EndLine)
286             }
287             Anchor(hir::Anchor::StartLine) => {
288                 self.byte_classes.set_range(b'\n', b'\n');
289                 self.c_empty_look(prog::EmptyLook::StartLine)
290             }
291             Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
292                 self.byte_classes.set_range(b'\n', b'\n');
293                 self.c_empty_look(prog::EmptyLook::StartLine)
294             }
295             Anchor(hir::Anchor::EndLine) => {
296                 self.byte_classes.set_range(b'\n', b'\n');
297                 self.c_empty_look(prog::EmptyLook::EndLine)
298             }
299             Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
300                 self.c_empty_look(prog::EmptyLook::EndText)
301             }
302             Anchor(hir::Anchor::StartText) => {
303                 self.c_empty_look(prog::EmptyLook::StartText)
304             }
305             Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
306                 self.c_empty_look(prog::EmptyLook::StartText)
307             }
308             Anchor(hir::Anchor::EndText) => {
309                 self.c_empty_look(prog::EmptyLook::EndText)
310             }
311             WordBoundary(hir::WordBoundary::Unicode) => {
312                 if !cfg!(feature = "unicode-perl") {
313                     return Err(Error::Syntax(
314                         "Unicode word boundaries are unavailable when \
315                          the unicode-perl feature is disabled"
316                             .to_string(),
317                     ));
318                 }
319                 self.compiled.has_unicode_word_boundary = true;
320                 self.byte_classes.set_word_boundary();
321                 self.c_empty_look(prog::EmptyLook::WordBoundary)
322             }
323             WordBoundary(hir::WordBoundary::UnicodeNegate) => {
324                 if !cfg!(feature = "unicode-perl") {
325                     return Err(Error::Syntax(
326                         "Unicode word boundaries are unavailable when \
327                          the unicode-perl feature is disabled"
328                             .to_string(),
329                     ));
330                 }
331                 self.compiled.has_unicode_word_boundary = true;
332                 self.byte_classes.set_word_boundary();
333                 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
334             }
335             WordBoundary(hir::WordBoundary::Ascii) => {
336                 self.byte_classes.set_word_boundary();
337                 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
338             }
339             WordBoundary(hir::WordBoundary::AsciiNegate) => {
340                 self.byte_classes.set_word_boundary();
341                 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
342             }
343             Group(ref g) => match g.kind {
344                 hir::GroupKind::NonCapturing => self.c(&g.hir),
345                 hir::GroupKind::CaptureIndex(index) => {
346                     if index as usize >= self.compiled.captures.len() {
347                         self.compiled.captures.push(None);
348                     }
349                     self.c_capture(2 * index as usize, &g.hir)
350                 }
351                 hir::GroupKind::CaptureName { index, ref name } => {
352                     if index as usize >= self.compiled.captures.len() {
353                         let n = name.to_string();
354                         self.compiled.captures.push(Some(n.clone()));
355                         self.capture_name_idx.insert(n, index as usize);
356                     }
357                     self.c_capture(2 * index as usize, &g.hir)
358                 }
359             },
360             Concat(ref es) => {
361                 if self.compiled.is_reverse {
362                     self.c_concat(es.iter().rev())
363                 } else {
364                     self.c_concat(es)
365                 }
366             }
367             Alternation(ref es) => self.c_alternate(&**es),
368             Repetition(ref rep) => self.c_repeat(rep),
369         }
370     }
371 
c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty372     fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
373         if self.num_exprs > 1 || self.compiled.is_dfa {
374             // Don't ever compile Save instructions for regex sets because
375             // they are never used. They are also never used in DFA programs
376             // because DFAs can't handle captures.
377             self.c(expr)
378         } else {
379             let entry = self.insts.len();
380             let hole = self.push_hole(InstHole::Save { slot: first_slot });
381             let patch = self.c(expr)?.unwrap_or(self.next_inst());
382             self.fill(hole, patch.entry);
383             self.fill_to_next(patch.hole);
384             let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
385             Ok(Some(Patch { hole: hole, entry: entry }))
386         }
387     }
388 
c_dotstar(&mut self) -> Result389     fn c_dotstar(&mut self) -> Result {
390         Ok(if !self.compiled.only_utf8() {
391             self.c(&Hir::repetition(hir::Repetition {
392                 kind: hir::RepetitionKind::ZeroOrMore,
393                 greedy: false,
394                 hir: Box::new(Hir::any(true)),
395             }))?
396             .unwrap()
397         } else {
398             self.c(&Hir::repetition(hir::Repetition {
399                 kind: hir::RepetitionKind::ZeroOrMore,
400                 greedy: false,
401                 hir: Box::new(Hir::any(false)),
402             }))?
403             .unwrap()
404         })
405     }
406 
c_char(&mut self, c: char) -> ResultOrEmpty407     fn c_char(&mut self, c: char) -> ResultOrEmpty {
408         if self.compiled.uses_bytes() {
409             if c.is_ascii() {
410                 let b = c as u8;
411                 let hole =
412                     self.push_hole(InstHole::Bytes { start: b, end: b });
413                 self.byte_classes.set_range(b, b);
414                 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
415             } else {
416                 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
417             }
418         } else {
419             let hole = self.push_hole(InstHole::Char { c: c });
420             Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
421         }
422     }
423 
c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty424     fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
425         use std::mem::size_of;
426 
427         assert!(!ranges.is_empty());
428         if self.compiled.uses_bytes() {
429             Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
430         } else {
431             let ranges: Vec<(char, char)> =
432                 ranges.iter().map(|r| (r.start(), r.end())).collect();
433             let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
434                 self.push_hole(InstHole::Char { c: ranges[0].0 })
435             } else {
436                 self.extra_inst_bytes +=
437                     ranges.len() * (size_of::<char>() * 2);
438                 self.push_hole(InstHole::Ranges { ranges: ranges })
439             };
440             Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
441         }
442     }
443 
c_byte(&mut self, b: u8) -> ResultOrEmpty444     fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
445         self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
446     }
447 
c_class_bytes( &mut self, ranges: &[hir::ClassBytesRange], ) -> ResultOrEmpty448     fn c_class_bytes(
449         &mut self,
450         ranges: &[hir::ClassBytesRange],
451     ) -> ResultOrEmpty {
452         debug_assert!(!ranges.is_empty());
453 
454         let first_split_entry = self.insts.len();
455         let mut holes = vec![];
456         let mut prev_hole = Hole::None;
457         for r in &ranges[0..ranges.len() - 1] {
458             self.fill_to_next(prev_hole);
459             let split = self.push_split_hole();
460             let next = self.insts.len();
461             self.byte_classes.set_range(r.start(), r.end());
462             holes.push(self.push_hole(InstHole::Bytes {
463                 start: r.start(),
464                 end: r.end(),
465             }));
466             prev_hole = self.fill_split(split, Some(next), None);
467         }
468         let next = self.insts.len();
469         let r = &ranges[ranges.len() - 1];
470         self.byte_classes.set_range(r.start(), r.end());
471         holes.push(
472             self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
473         );
474         self.fill(prev_hole, next);
475         Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
476     }
477 
c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty478     fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
479         let hole = self.push_hole(InstHole::EmptyLook { look: look });
480         Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
481     }
482 
c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty where I: IntoIterator<Item = &'a Hir>,483     fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
484     where
485         I: IntoIterator<Item = &'a Hir>,
486     {
487         let mut exprs = exprs.into_iter();
488         let Patch { mut hole, entry } = loop {
489             match exprs.next() {
490                 None => return Ok(None),
491                 Some(e) => {
492                     if let Some(p) = self.c(e)? {
493                         break p;
494                     }
495                 }
496             }
497         };
498         for e in exprs {
499             if let Some(p) = self.c(e)? {
500                 self.fill(hole, p.entry);
501                 hole = p.hole;
502             }
503         }
504         Ok(Some(Patch { hole: hole, entry: entry }))
505     }
506 
c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty507     fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
508         debug_assert!(
509             exprs.len() >= 2,
510             "alternates must have at least 2 exprs"
511         );
512 
513         // Initial entry point is always the first split.
514         let first_split_entry = self.insts.len();
515 
516         // Save up all of the holes from each alternate. They will all get
517         // patched to point to the same location.
518         let mut holes = vec![];
519 
520         // true indicates that the hole is a split where we want to fill
521         // the second branch.
522         let mut prev_hole = (Hole::None, false);
523         for e in &exprs[0..exprs.len() - 1] {
524             if prev_hole.1 {
525                 let next = self.insts.len();
526                 self.fill_split(prev_hole.0, None, Some(next));
527             } else {
528                 self.fill_to_next(prev_hole.0);
529             }
530             let split = self.push_split_hole();
531             if let Some(Patch { hole, entry }) = self.c(e)? {
532                 holes.push(hole);
533                 prev_hole = (self.fill_split(split, Some(entry), None), false);
534             } else {
535                 let (split1, split2) = split.dup_one();
536                 holes.push(split1);
537                 prev_hole = (split2, true);
538             }
539         }
540         if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
541             holes.push(hole);
542             if prev_hole.1 {
543                 self.fill_split(prev_hole.0, None, Some(entry));
544             } else {
545                 self.fill(prev_hole.0, entry);
546             }
547         } else {
548             // We ignore prev_hole.1. When it's true, it means we have two
549             // empty branches both pushing prev_hole.0 into holes, so both
550             // branches will go to the same place anyway.
551             holes.push(prev_hole.0);
552         }
553         Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
554     }
555 
c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty556     fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
557         use syntax::hir::RepetitionKind::*;
558         match rep.kind {
559             ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
560             ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
561             OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
562             Range(hir::RepetitionRange::Exactly(min_max)) => {
563                 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
564             }
565             Range(hir::RepetitionRange::AtLeast(min)) => {
566                 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
567             }
568             Range(hir::RepetitionRange::Bounded(min, max)) => {
569                 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
570             }
571         }
572     }
573 
c_repeat_zero_or_one( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty574     fn c_repeat_zero_or_one(
575         &mut self,
576         expr: &Hir,
577         greedy: bool,
578     ) -> ResultOrEmpty {
579         let split_entry = self.insts.len();
580         let split = self.push_split_hole();
581         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
582             Some(p) => p,
583             None => return self.pop_split_hole(),
584         };
585         let split_hole = if greedy {
586             self.fill_split(split, Some(entry_rep), None)
587         } else {
588             self.fill_split(split, None, Some(entry_rep))
589         };
590         let holes = vec![hole_rep, split_hole];
591         Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
592     }
593 
c_repeat_zero_or_more( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty594     fn c_repeat_zero_or_more(
595         &mut self,
596         expr: &Hir,
597         greedy: bool,
598     ) -> ResultOrEmpty {
599         let split_entry = self.insts.len();
600         let split = self.push_split_hole();
601         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
602             Some(p) => p,
603             None => return self.pop_split_hole(),
604         };
605 
606         self.fill(hole_rep, split_entry);
607         let split_hole = if greedy {
608             self.fill_split(split, Some(entry_rep), None)
609         } else {
610             self.fill_split(split, None, Some(entry_rep))
611         };
612         Ok(Some(Patch { hole: split_hole, entry: split_entry }))
613     }
614 
c_repeat_one_or_more( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty615     fn c_repeat_one_or_more(
616         &mut self,
617         expr: &Hir,
618         greedy: bool,
619     ) -> ResultOrEmpty {
620         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
621             Some(p) => p,
622             None => return Ok(None),
623         };
624         self.fill_to_next(hole_rep);
625         let split = self.push_split_hole();
626 
627         let split_hole = if greedy {
628             self.fill_split(split, Some(entry_rep), None)
629         } else {
630             self.fill_split(split, None, Some(entry_rep))
631         };
632         Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
633     }
634 
c_repeat_range_min_or_more( &mut self, expr: &Hir, greedy: bool, min: u32, ) -> ResultOrEmpty635     fn c_repeat_range_min_or_more(
636         &mut self,
637         expr: &Hir,
638         greedy: bool,
639         min: u32,
640     ) -> ResultOrEmpty {
641         let min = u32_to_usize(min);
642         // Using next_inst() is ok, because we can't return it (concat would
643         // have to return Some(_) while c_repeat_range_min_or_more returns
644         // None).
645         let patch_concat = self
646             .c_concat(iter::repeat(expr).take(min))?
647             .unwrap_or(self.next_inst());
648         if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
649             self.fill(patch_concat.hole, patch_rep.entry);
650             Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
651         } else {
652             Ok(None)
653         }
654     }
655 
c_repeat_range( &mut self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> ResultOrEmpty656     fn c_repeat_range(
657         &mut self,
658         expr: &Hir,
659         greedy: bool,
660         min: u32,
661         max: u32,
662     ) -> ResultOrEmpty {
663         let (min, max) = (u32_to_usize(min), u32_to_usize(max));
664         debug_assert!(min <= max);
665         let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
666         if min == max {
667             return Ok(patch_concat);
668         }
669         // Same reasoning as in c_repeat_range_min_or_more (we know that min <
670         // max at this point).
671         let patch_concat = patch_concat.unwrap_or(self.next_inst());
672         let initial_entry = patch_concat.entry;
673         // It is much simpler to compile, e.g., `a{2,5}` as:
674         //
675         //     aaa?a?a?
676         //
677         // But you end up with a sequence of instructions like this:
678         //
679         //     0: 'a'
680         //     1: 'a',
681         //     2: split(3, 4)
682         //     3: 'a'
683         //     4: split(5, 6)
684         //     5: 'a'
685         //     6: split(7, 8)
686         //     7: 'a'
687         //     8: MATCH
688         //
689         // This is *incredibly* inefficient because the splits end
690         // up forming a chain, which has to be resolved everything a
691         // transition is followed.
692         let mut holes = vec![];
693         let mut prev_hole = patch_concat.hole;
694         for _ in min..max {
695             self.fill_to_next(prev_hole);
696             let split = self.push_split_hole();
697             let Patch { hole, entry } = match self.c(expr)? {
698                 Some(p) => p,
699                 None => return self.pop_split_hole(),
700             };
701             prev_hole = hole;
702             if greedy {
703                 holes.push(self.fill_split(split, Some(entry), None));
704             } else {
705                 holes.push(self.fill_split(split, None, Some(entry)));
706             }
707         }
708         holes.push(prev_hole);
709         Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
710     }
711 
712     /// Can be used as a default value for the c_* functions when the call to
713     /// c_function is followed by inserting at least one instruction that is
714     /// always executed after the ones written by the c* function.
next_inst(&self) -> Patch715     fn next_inst(&self) -> Patch {
716         Patch { hole: Hole::None, entry: self.insts.len() }
717     }
718 
fill(&mut self, hole: Hole, goto: InstPtr)719     fn fill(&mut self, hole: Hole, goto: InstPtr) {
720         match hole {
721             Hole::None => {}
722             Hole::One(pc) => {
723                 self.insts[pc].fill(goto);
724             }
725             Hole::Many(holes) => {
726                 for hole in holes {
727                     self.fill(hole, goto);
728                 }
729             }
730         }
731     }
732 
fill_to_next(&mut self, hole: Hole)733     fn fill_to_next(&mut self, hole: Hole) {
734         let next = self.insts.len();
735         self.fill(hole, next);
736     }
737 
fill_split( &mut self, hole: Hole, goto1: Option<InstPtr>, goto2: Option<InstPtr>, ) -> Hole738     fn fill_split(
739         &mut self,
740         hole: Hole,
741         goto1: Option<InstPtr>,
742         goto2: Option<InstPtr>,
743     ) -> Hole {
744         match hole {
745             Hole::None => Hole::None,
746             Hole::One(pc) => match (goto1, goto2) {
747                 (Some(goto1), Some(goto2)) => {
748                     self.insts[pc].fill_split(goto1, goto2);
749                     Hole::None
750                 }
751                 (Some(goto1), None) => {
752                     self.insts[pc].half_fill_split_goto1(goto1);
753                     Hole::One(pc)
754                 }
755                 (None, Some(goto2)) => {
756                     self.insts[pc].half_fill_split_goto2(goto2);
757                     Hole::One(pc)
758                 }
759                 (None, None) => unreachable!(
760                     "at least one of the split \
761                      holes must be filled"
762                 ),
763             },
764             Hole::Many(holes) => {
765                 let mut new_holes = vec![];
766                 for hole in holes {
767                     new_holes.push(self.fill_split(hole, goto1, goto2));
768                 }
769                 if new_holes.is_empty() {
770                     Hole::None
771                 } else if new_holes.len() == 1 {
772                     new_holes.pop().unwrap()
773                 } else {
774                     Hole::Many(new_holes)
775                 }
776             }
777         }
778     }
779 
push_compiled(&mut self, inst: Inst)780     fn push_compiled(&mut self, inst: Inst) {
781         self.insts.push(MaybeInst::Compiled(inst));
782     }
783 
push_hole(&mut self, inst: InstHole) -> Hole784     fn push_hole(&mut self, inst: InstHole) -> Hole {
785         let hole = self.insts.len();
786         self.insts.push(MaybeInst::Uncompiled(inst));
787         Hole::One(hole)
788     }
789 
push_split_hole(&mut self) -> Hole790     fn push_split_hole(&mut self) -> Hole {
791         let hole = self.insts.len();
792         self.insts.push(MaybeInst::Split);
793         Hole::One(hole)
794     }
795 
pop_split_hole(&mut self) -> ResultOrEmpty796     fn pop_split_hole(&mut self) -> ResultOrEmpty {
797         self.insts.pop();
798         Ok(None)
799     }
800 
check_size(&self) -> result::Result<(), Error>801     fn check_size(&self) -> result::Result<(), Error> {
802         use std::mem::size_of;
803 
804         let size =
805             self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
806         if size > self.size_limit {
807             Err(Error::CompiledTooBig(self.size_limit))
808         } else {
809             Ok(())
810         }
811     }
812 }
813 
814 #[derive(Debug)]
815 enum Hole {
816     None,
817     One(InstPtr),
818     Many(Vec<Hole>),
819 }
820 
821 impl Hole {
dup_one(self) -> (Self, Self)822     fn dup_one(self) -> (Self, Self) {
823         match self {
824             Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
825             Hole::None | Hole::Many(_) => {
826                 unreachable!("must be called on single hole")
827             }
828         }
829     }
830 }
831 
832 #[derive(Clone, Debug)]
833 enum MaybeInst {
834     Compiled(Inst),
835     Uncompiled(InstHole),
836     Split,
837     Split1(InstPtr),
838     Split2(InstPtr),
839 }
840 
841 impl MaybeInst {
fill(&mut self, goto: InstPtr)842     fn fill(&mut self, goto: InstPtr) {
843         let maybeinst = match *self {
844             MaybeInst::Split => MaybeInst::Split1(goto),
845             MaybeInst::Uncompiled(ref inst) => {
846                 MaybeInst::Compiled(inst.fill(goto))
847             }
848             MaybeInst::Split1(goto1) => {
849                 MaybeInst::Compiled(Inst::Split(InstSplit {
850                     goto1: goto1,
851                     goto2: goto,
852                 }))
853             }
854             MaybeInst::Split2(goto2) => {
855                 MaybeInst::Compiled(Inst::Split(InstSplit {
856                     goto1: goto,
857                     goto2: goto2,
858                 }))
859             }
860             _ => unreachable!(
861                 "not all instructions were compiled! \
862                  found uncompiled instruction: {:?}",
863                 self
864             ),
865         };
866         *self = maybeinst;
867     }
868 
fill_split(&mut self, goto1: InstPtr, goto2: InstPtr)869     fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
870         let filled = match *self {
871             MaybeInst::Split => {
872                 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
873             }
874             _ => unreachable!(
875                 "must be called on Split instruction, \
876                  instead it was called on: {:?}",
877                 self
878             ),
879         };
880         *self = MaybeInst::Compiled(filled);
881     }
882 
half_fill_split_goto1(&mut self, goto1: InstPtr)883     fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
884         let half_filled = match *self {
885             MaybeInst::Split => goto1,
886             _ => unreachable!(
887                 "must be called on Split instruction, \
888                  instead it was called on: {:?}",
889                 self
890             ),
891         };
892         *self = MaybeInst::Split1(half_filled);
893     }
894 
half_fill_split_goto2(&mut self, goto2: InstPtr)895     fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
896         let half_filled = match *self {
897             MaybeInst::Split => goto2,
898             _ => unreachable!(
899                 "must be called on Split instruction, \
900                  instead it was called on: {:?}",
901                 self
902             ),
903         };
904         *self = MaybeInst::Split2(half_filled);
905     }
906 
unwrap(self) -> Inst907     fn unwrap(self) -> Inst {
908         match self {
909             MaybeInst::Compiled(inst) => inst,
910             _ => unreachable!(
911                 "must be called on a compiled instruction, \
912                  instead it was called on: {:?}",
913                 self
914             ),
915         }
916     }
917 }
918 
919 #[derive(Clone, Debug)]
920 enum InstHole {
921     Save { slot: usize },
922     EmptyLook { look: EmptyLook },
923     Char { c: char },
924     Ranges { ranges: Vec<(char, char)> },
925     Bytes { start: u8, end: u8 },
926 }
927 
928 impl InstHole {
fill(&self, goto: InstPtr) -> Inst929     fn fill(&self, goto: InstPtr) -> Inst {
930         match *self {
931             InstHole::Save { slot } => {
932                 Inst::Save(InstSave { goto: goto, slot: slot })
933             }
934             InstHole::EmptyLook { look } => {
935                 Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
936             }
937             InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
938             InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
939                 goto: goto,
940                 ranges: ranges.clone().into_boxed_slice(),
941             }),
942             InstHole::Bytes { start, end } => {
943                 Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
944             }
945         }
946     }
947 }
948 
949 struct CompileClass<'a, 'b> {
950     c: &'a mut Compiler,
951     ranges: &'b [hir::ClassUnicodeRange],
952 }
953 
954 impl<'a, 'b> CompileClass<'a, 'b> {
compile(mut self) -> Result955     fn compile(mut self) -> Result {
956         let mut holes = vec![];
957         let mut initial_entry = None;
958         let mut last_split = Hole::None;
959         let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
960         self.c.suffix_cache.clear();
961 
962         for (i, range) in self.ranges.iter().enumerate() {
963             let is_last_range = i + 1 == self.ranges.len();
964             utf8_seqs.reset(range.start(), range.end());
965             let mut it = (&mut utf8_seqs).peekable();
966             loop {
967                 let utf8_seq = match it.next() {
968                     None => break,
969                     Some(utf8_seq) => utf8_seq,
970                 };
971                 if is_last_range && it.peek().is_none() {
972                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
973                     holes.push(hole);
974                     self.c.fill(last_split, entry);
975                     last_split = Hole::None;
976                     if initial_entry.is_none() {
977                         initial_entry = Some(entry);
978                     }
979                 } else {
980                     if initial_entry.is_none() {
981                         initial_entry = Some(self.c.insts.len());
982                     }
983                     self.c.fill_to_next(last_split);
984                     last_split = self.c.push_split_hole();
985                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
986                     holes.push(hole);
987                     last_split =
988                         self.c.fill_split(last_split, Some(entry), None);
989                 }
990             }
991         }
992         self.c.utf8_seqs = Some(utf8_seqs);
993         Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
994     }
995 
c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result996     fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
997         if self.c.compiled.is_reverse {
998             self.c_utf8_seq_(seq)
999         } else {
1000             self.c_utf8_seq_(seq.into_iter().rev())
1001         }
1002     }
1003 
c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result where I: IntoIterator<Item = &'r Utf8Range>,1004     fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
1005     where
1006         I: IntoIterator<Item = &'r Utf8Range>,
1007     {
1008         // The initial instruction for each UTF-8 sequence should be the same.
1009         let mut from_inst = ::std::usize::MAX;
1010         let mut last_hole = Hole::None;
1011         for byte_range in seq {
1012             let key = SuffixCacheKey {
1013                 from_inst: from_inst,
1014                 start: byte_range.start,
1015                 end: byte_range.end,
1016             };
1017             {
1018                 let pc = self.c.insts.len();
1019                 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1020                     from_inst = cached_pc;
1021                     continue;
1022                 }
1023             }
1024             self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1025             if from_inst == ::std::usize::MAX {
1026                 last_hole = self.c.push_hole(InstHole::Bytes {
1027                     start: byte_range.start,
1028                     end: byte_range.end,
1029                 });
1030             } else {
1031                 self.c.push_compiled(Inst::Bytes(InstBytes {
1032                     goto: from_inst,
1033                     start: byte_range.start,
1034                     end: byte_range.end,
1035                 }));
1036             }
1037             from_inst = self.c.insts.len().checked_sub(1).unwrap();
1038             debug_assert!(from_inst < ::std::usize::MAX);
1039         }
1040         debug_assert!(from_inst < ::std::usize::MAX);
1041         Ok(Patch { hole: last_hole, entry: from_inst })
1042     }
1043 }
1044 
1045 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1046 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1047 /// The set of byte ranges looks like this:
1048 ///
1049 /// [0-7F]
1050 /// [C2-DF][80-BF]
1051 /// [E0][A0-BF][80-BF]
1052 /// [E1-EC][80-BF][80-BF]
1053 /// [ED][80-9F][80-BF]
1054 /// [EE-EF][80-BF][80-BF]
1055 ///
1056 /// Each line above translates to one alternate in the compiled regex program.
1057 /// However, all but one of the alternates end in the same suffix, which is
1058 /// a waste of an instruction. The suffix cache facilitates reusing them across
1059 /// alternates.
1060 ///
1061 /// Note that a HashMap could be trivially used for this, but we don't need its
1062 /// overhead. Some small bounded space (LRU style) is more than enough.
1063 ///
1064 /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1065 /// except it uses hashes as original indices and then compares full keys for
1066 /// validation against `dense` array.
1067 #[derive(Debug)]
1068 struct SuffixCache {
1069     sparse: Box<[usize]>,
1070     dense: Vec<SuffixCacheEntry>,
1071 }
1072 
1073 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1074 struct SuffixCacheEntry {
1075     key: SuffixCacheKey,
1076     pc: InstPtr,
1077 }
1078 
1079 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1080 struct SuffixCacheKey {
1081     from_inst: InstPtr,
1082     start: u8,
1083     end: u8,
1084 }
1085 
1086 impl SuffixCache {
new(size: usize) -> Self1087     fn new(size: usize) -> Self {
1088         SuffixCache {
1089             sparse: vec![0usize; size].into(),
1090             dense: Vec::with_capacity(size),
1091         }
1092     }
1093 
get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr>1094     fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1095         let hash = self.hash(&key);
1096         let pos = &mut self.sparse[hash];
1097         if let Some(entry) = self.dense.get(*pos) {
1098             if entry.key == key {
1099                 return Some(entry.pc);
1100             }
1101         }
1102         *pos = self.dense.len();
1103         self.dense.push(SuffixCacheEntry { key: key, pc: pc });
1104         None
1105     }
1106 
clear(&mut self)1107     fn clear(&mut self) {
1108         self.dense.clear();
1109     }
1110 
hash(&self, suffix: &SuffixCacheKey) -> usize1111     fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1112         // Basic FNV-1a hash as described:
1113         // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1114         const FNV_PRIME: u64 = 1099511628211;
1115         let mut h = 14695981039346656037;
1116         h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1117         h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1118         h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1119         (h as usize) % self.sparse.len()
1120     }
1121 }
1122 
1123 struct ByteClassSet([bool; 256]);
1124 
1125 impl ByteClassSet {
new() -> Self1126     fn new() -> Self {
1127         ByteClassSet([false; 256])
1128     }
1129 
set_range(&mut self, start: u8, end: u8)1130     fn set_range(&mut self, start: u8, end: u8) {
1131         debug_assert!(start <= end);
1132         if start > 0 {
1133             self.0[start as usize - 1] = true;
1134         }
1135         self.0[end as usize] = true;
1136     }
1137 
set_word_boundary(&mut self)1138     fn set_word_boundary(&mut self) {
1139         // We need to mark all ranges of bytes whose pairs result in
1140         // evaluating \b differently.
1141         let iswb = is_word_byte;
1142         let mut b1: u16 = 0;
1143         let mut b2: u16;
1144         while b1 <= 255 {
1145             b2 = b1 + 1;
1146             while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1147                 b2 += 1;
1148             }
1149             self.set_range(b1 as u8, (b2 - 1) as u8);
1150             b1 = b2;
1151         }
1152     }
1153 
byte_classes(&self) -> Vec<u8>1154     fn byte_classes(&self) -> Vec<u8> {
1155         // N.B. If you're debugging the DFA, it's useful to simply return
1156         // `(0..256).collect()`, which effectively removes the byte classes
1157         // and makes the transitions easier to read.
1158         // (0usize..256).map(|x| x as u8).collect()
1159         let mut byte_classes = vec![0; 256];
1160         let mut class = 0u8;
1161         let mut i = 0;
1162         loop {
1163             byte_classes[i] = class as u8;
1164             if i >= 255 {
1165                 break;
1166             }
1167             if self.0[i] {
1168                 class = class.checked_add(1).unwrap();
1169             }
1170             i += 1;
1171         }
1172         byte_classes
1173     }
1174 }
1175 
1176 impl fmt::Debug for ByteClassSet {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1177     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1178         f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
1179     }
1180 }
1181 
u32_to_usize(n: u32) -> usize1182 fn u32_to_usize(n: u32) -> usize {
1183     // In case usize is less than 32 bits, we need to guard against overflow.
1184     // On most platforms this compiles to nothing.
1185     // TODO Use `std::convert::TryFrom` once it's stable.
1186     if (n as u64) > (::std::usize::MAX as u64) {
1187         panic!("BUG: {} is too big to be pointer sized", n)
1188     }
1189     n as usize
1190 }
1191 
1192 #[cfg(test)]
1193 mod tests {
1194     use super::ByteClassSet;
1195 
1196     #[test]
byte_classes()1197     fn byte_classes() {
1198         let mut set = ByteClassSet::new();
1199         set.set_range(b'a', b'z');
1200         let classes = set.byte_classes();
1201         assert_eq!(classes[0], 0);
1202         assert_eq!(classes[1], 0);
1203         assert_eq!(classes[2], 0);
1204         assert_eq!(classes[b'a' as usize - 1], 0);
1205         assert_eq!(classes[b'a' as usize], 1);
1206         assert_eq!(classes[b'm' as usize], 1);
1207         assert_eq!(classes[b'z' as usize], 1);
1208         assert_eq!(classes[b'z' as usize + 1], 2);
1209         assert_eq!(classes[254], 2);
1210         assert_eq!(classes[255], 2);
1211 
1212         let mut set = ByteClassSet::new();
1213         set.set_range(0, 2);
1214         set.set_range(4, 6);
1215         let classes = set.byte_classes();
1216         assert_eq!(classes[0], 0);
1217         assert_eq!(classes[1], 0);
1218         assert_eq!(classes[2], 0);
1219         assert_eq!(classes[3], 1);
1220         assert_eq!(classes[4], 2);
1221         assert_eq!(classes[5], 2);
1222         assert_eq!(classes[6], 2);
1223         assert_eq!(classes[7], 3);
1224         assert_eq!(classes[255], 3);
1225     }
1226 
1227     #[test]
full_byte_classes()1228     fn full_byte_classes() {
1229         let mut set = ByteClassSet::new();
1230         for i in 0..256u16 {
1231             set.set_range(i as u8, i as u8);
1232         }
1233         assert_eq!(set.byte_classes().len(), 256);
1234     }
1235 }
1236