1 use std::collections::HashMap;
2 use std::fmt;
3 use std::iter;
4 use std::result;
5 use std::sync::Arc;
6 
7 use regex_syntax::hir::{self, Hir};
8 use regex_syntax::is_word_byte;
9 use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10 
11 use crate::prog::{
12     EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13     InstSave, InstSplit, Program,
14 };
15 
16 use crate::Error;
17 
18 type Result = result::Result<Patch, Error>;
19 type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20 
21 #[derive(Debug)]
22 struct Patch {
23     hole: Hole,
24     entry: InstPtr,
25 }
26 
27 /// A compiler translates a regular expression AST to a sequence of
28 /// instructions. The sequence of instructions represents an NFA.
29 // `Compiler` is only public via the `internal` module, so avoid deriving
30 // `Debug`.
31 #[allow(missing_debug_implementations)]
32 pub struct Compiler {
33     insts: Vec<MaybeInst>,
34     compiled: Program,
35     capture_name_idx: HashMap<String, usize>,
36     num_exprs: usize,
37     size_limit: usize,
38     suffix_cache: SuffixCache,
39     utf8_seqs: Option<Utf8Sequences>,
40     byte_classes: ByteClassSet,
41     // This keeps track of extra bytes allocated while compiling the regex
42     // program. Currently, this corresponds to two things. First is the heap
43     // memory allocated by Unicode character classes ('InstRanges'). Second is
44     // a "fake" amount of memory used by empty sub-expressions, so that enough
45     // empty sub-expressions will ultimately trigger the compiler to bail
46     // because of a size limit restriction. (That empty sub-expressions don't
47     // add to heap memory usage is more-or-less an implementation detail.) In
48     // the second case, if we don't bail, then an excessively large repetition
49     // on an empty sub-expression can result in the compiler using a very large
50     // amount of CPU time.
51     extra_inst_bytes: usize,
52 }
53 
54 impl Compiler {
55     /// Create a new regular expression compiler.
56     ///
57     /// Various options can be set before calling `compile` on an expression.
new() -> Self58     pub fn new() -> Self {
59         Compiler {
60             insts: vec![],
61             compiled: Program::new(),
62             capture_name_idx: HashMap::new(),
63             num_exprs: 0,
64             size_limit: 10 * (1 << 20),
65             suffix_cache: SuffixCache::new(1000),
66             utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
67             byte_classes: ByteClassSet::new(),
68             extra_inst_bytes: 0,
69         }
70     }
71 
72     /// The size of the resulting program is limited by size_limit. If
73     /// the program approximately exceeds the given size (in bytes), then
74     /// compilation will stop and return an error.
size_limit(mut self, size_limit: usize) -> Self75     pub fn size_limit(mut self, size_limit: usize) -> Self {
76         self.size_limit = size_limit;
77         self
78     }
79 
80     /// If bytes is true, then the program is compiled as a byte based
81     /// automaton, which incorporates UTF-8 decoding into the machine. If it's
82     /// false, then the automaton is Unicode scalar value based, e.g., an
83     /// engine utilizing such an automaton is responsible for UTF-8 decoding.
84     ///
85     /// The specific invariant is that when returning a byte based machine,
86     /// the neither the `Char` nor `Ranges` instructions are produced.
87     /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
88     /// instruction is never produced.
89     ///
90     /// Note that `dfa(true)` implies `bytes(true)`.
bytes(mut self, yes: bool) -> Self91     pub fn bytes(mut self, yes: bool) -> Self {
92         self.compiled.is_bytes = yes;
93         self
94     }
95 
96     /// When disabled, the program compiled may match arbitrary bytes.
97     ///
98     /// When enabled (the default), all compiled programs exclusively match
99     /// valid UTF-8 bytes.
only_utf8(mut self, yes: bool) -> Self100     pub fn only_utf8(mut self, yes: bool) -> Self {
101         self.compiled.only_utf8 = yes;
102         self
103     }
104 
105     /// When set, the machine returned is suitable for use in the DFA matching
106     /// engine.
107     ///
108     /// In particular, this ensures that if the regex is not anchored in the
109     /// beginning, then a preceding `.*?` is included in the program. (The NFA
110     /// based engines handle the preceding `.*?` explicitly, which is difficult
111     /// or impossible in the DFA engine.)
dfa(mut self, yes: bool) -> Self112     pub fn dfa(mut self, yes: bool) -> Self {
113         self.compiled.is_dfa = yes;
114         self
115     }
116 
117     /// When set, the machine returned is suitable for matching text in
118     /// reverse. In particular, all concatenations are flipped.
reverse(mut self, yes: bool) -> Self119     pub fn reverse(mut self, yes: bool) -> Self {
120         self.compiled.is_reverse = yes;
121         self
122     }
123 
124     /// Compile a regular expression given its AST.
125     ///
126     /// The compiler is guaranteed to succeed unless the program exceeds the
127     /// specified size limit. If the size limit is exceeded, then compilation
128     /// stops and returns an error.
compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error>129     pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
130         debug_assert!(!exprs.is_empty());
131         self.num_exprs = exprs.len();
132         if exprs.len() == 1 {
133             self.compile_one(&exprs[0])
134         } else {
135             self.compile_many(exprs)
136         }
137     }
138 
compile_one(mut self, expr: &Hir) -> result::Result<Program, Error>139     fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
140         // If we're compiling a forward DFA and we aren't anchored, then
141         // add a `.*?` before the first capture group.
142         // Other matching engines handle this by baking the logic into the
143         // matching engine itself.
144         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
145         self.compiled.is_anchored_start = expr.is_anchored_start();
146         self.compiled.is_anchored_end = expr.is_anchored_end();
147         if self.compiled.needs_dotstar() {
148             dotstar_patch = self.c_dotstar()?;
149             self.compiled.start = dotstar_patch.entry;
150         }
151         self.compiled.captures = vec![None];
152         let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
153         if self.compiled.needs_dotstar() {
154             self.fill(dotstar_patch.hole, patch.entry);
155         } else {
156             self.compiled.start = patch.entry;
157         }
158         self.fill_to_next(patch.hole);
159         self.compiled.matches = vec![self.insts.len()];
160         self.push_compiled(Inst::Match(0));
161         self.compile_finish()
162     }
163 
compile_many( mut self, exprs: &[Hir], ) -> result::Result<Program, Error>164     fn compile_many(
165         mut self,
166         exprs: &[Hir],
167     ) -> result::Result<Program, Error> {
168         debug_assert!(exprs.len() > 1);
169 
170         self.compiled.is_anchored_start =
171             exprs.iter().all(|e| e.is_anchored_start());
172         self.compiled.is_anchored_end =
173             exprs.iter().all(|e| e.is_anchored_end());
174         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
175         if self.compiled.needs_dotstar() {
176             dotstar_patch = self.c_dotstar()?;
177             self.compiled.start = dotstar_patch.entry;
178         } else {
179             self.compiled.start = 0; // first instruction is always split
180         }
181         self.fill_to_next(dotstar_patch.hole);
182 
183         let mut prev_hole = Hole::None;
184         for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
185             self.fill_to_next(prev_hole);
186             let split = self.push_split_hole();
187             let Patch { hole, entry } =
188                 self.c_capture(0, expr)?.unwrap_or(self.next_inst());
189             self.fill_to_next(hole);
190             self.compiled.matches.push(self.insts.len());
191             self.push_compiled(Inst::Match(i));
192             prev_hole = self.fill_split(split, Some(entry), None);
193         }
194         let i = exprs.len() - 1;
195         let Patch { hole, entry } =
196             self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
197         self.fill(prev_hole, entry);
198         self.fill_to_next(hole);
199         self.compiled.matches.push(self.insts.len());
200         self.push_compiled(Inst::Match(i));
201         self.compile_finish()
202     }
203 
compile_finish(mut self) -> result::Result<Program, Error>204     fn compile_finish(mut self) -> result::Result<Program, Error> {
205         self.compiled.insts =
206             self.insts.into_iter().map(|inst| inst.unwrap()).collect();
207         self.compiled.byte_classes = self.byte_classes.byte_classes();
208         self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
209         Ok(self.compiled)
210     }
211 
212     /// Compile expr into self.insts, returning a patch on success,
213     /// or an error if we run out of memory.
214     ///
215     /// All of the c_* methods of the compiler share the contract outlined
216     /// here.
217     ///
218     /// The main thing that a c_* method does is mutate `self.insts`
219     /// to add a list of mostly compiled instructions required to execute
220     /// the given expression. `self.insts` contains MaybeInsts rather than
221     /// Insts because there is some backpatching required.
222     ///
223     /// The `Patch` value returned by each c_* method provides metadata
224     /// about the compiled instructions emitted to `self.insts`. The
225     /// `entry` member of the patch refers to the first instruction
226     /// (the entry point), while the `hole` member contains zero or
227     /// more offsets to partial instructions that need to be backpatched.
228     /// The c_* routine can't know where its list of instructions are going to
229     /// jump to after execution, so it is up to the caller to patch
230     /// these jumps to point to the right place. So compiling some
231     /// expression, e, we would end up with a situation that looked like:
232     ///
233     /// ```text
234     /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
235     ///                     ^              ^             ^
236     ///                     |                \         /
237     ///                   entry                \     /
238     ///                                         hole
239     /// ```
240     ///
241     /// To compile two expressions, e1 and e2, concatenated together we
242     /// would do:
243     ///
244     /// ```ignore
245     /// let patch1 = self.c(e1);
246     /// let patch2 = self.c(e2);
247     /// ```
248     ///
249     /// while leaves us with a situation that looks like
250     ///
251     /// ```text
252     /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
253     ///                     ^        ^            ^        ^
254     ///                     |        |            |        |
255     ///                entry1        hole1   entry2        hole2
256     /// ```
257     ///
258     /// Then to merge the two patches together into one we would backpatch
259     /// hole1 with entry2 and return a new patch that enters at entry1
260     /// and has hole2 for a hole. In fact, if you look at the c_concat
261     /// method you will see that it does exactly this, though it handles
262     /// a list of expressions rather than just the two that we use for
263     /// an example.
264     ///
265     /// Ok(None) is returned when an expression is compiled to no
266     /// instruction, and so no patch.entry value makes sense.
c(&mut self, expr: &Hir) -> ResultOrEmpty267     fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
268         use crate::prog;
269         use regex_syntax::hir::HirKind::*;
270 
271         self.check_size()?;
272         match *expr.kind() {
273             Empty => self.c_empty(),
274             Literal(hir::Literal::Unicode(c)) => self.c_char(c),
275             Literal(hir::Literal::Byte(b)) => {
276                 assert!(self.compiled.uses_bytes());
277                 self.c_byte(b)
278             }
279             Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
280             Class(hir::Class::Bytes(ref cls)) => {
281                 if self.compiled.uses_bytes() {
282                     self.c_class_bytes(cls.ranges())
283                 } else {
284                     assert!(cls.is_all_ascii());
285                     let mut char_ranges = vec![];
286                     for r in cls.iter() {
287                         let (s, e) = (r.start() as char, r.end() as char);
288                         char_ranges.push(hir::ClassUnicodeRange::new(s, e));
289                     }
290                     self.c_class(&char_ranges)
291                 }
292             }
293             Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
294                 self.byte_classes.set_range(b'\n', b'\n');
295                 self.c_empty_look(prog::EmptyLook::EndLine)
296             }
297             Anchor(hir::Anchor::StartLine) => {
298                 self.byte_classes.set_range(b'\n', b'\n');
299                 self.c_empty_look(prog::EmptyLook::StartLine)
300             }
301             Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
302                 self.byte_classes.set_range(b'\n', b'\n');
303                 self.c_empty_look(prog::EmptyLook::StartLine)
304             }
305             Anchor(hir::Anchor::EndLine) => {
306                 self.byte_classes.set_range(b'\n', b'\n');
307                 self.c_empty_look(prog::EmptyLook::EndLine)
308             }
309             Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
310                 self.c_empty_look(prog::EmptyLook::EndText)
311             }
312             Anchor(hir::Anchor::StartText) => {
313                 self.c_empty_look(prog::EmptyLook::StartText)
314             }
315             Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
316                 self.c_empty_look(prog::EmptyLook::StartText)
317             }
318             Anchor(hir::Anchor::EndText) => {
319                 self.c_empty_look(prog::EmptyLook::EndText)
320             }
321             WordBoundary(hir::WordBoundary::Unicode) => {
322                 if !cfg!(feature = "unicode-perl") {
323                     return Err(Error::Syntax(
324                         "Unicode word boundaries are unavailable when \
325                          the unicode-perl feature is disabled"
326                             .to_string(),
327                     ));
328                 }
329                 self.compiled.has_unicode_word_boundary = true;
330                 self.byte_classes.set_word_boundary();
331                 // We also make sure that all ASCII bytes are in a different
332                 // class from non-ASCII bytes. Otherwise, it's possible for
333                 // ASCII bytes to get lumped into the same class as non-ASCII
334                 // bytes. This in turn may cause the lazy DFA to falsely start
335                 // when it sees an ASCII byte that maps to a byte class with
336                 // non-ASCII bytes. This ensures that never happens.
337                 self.byte_classes.set_range(0, 0x7F);
338                 self.c_empty_look(prog::EmptyLook::WordBoundary)
339             }
340             WordBoundary(hir::WordBoundary::UnicodeNegate) => {
341                 if !cfg!(feature = "unicode-perl") {
342                     return Err(Error::Syntax(
343                         "Unicode word boundaries are unavailable when \
344                          the unicode-perl feature is disabled"
345                             .to_string(),
346                     ));
347                 }
348                 self.compiled.has_unicode_word_boundary = true;
349                 self.byte_classes.set_word_boundary();
350                 // See comments above for why we set the ASCII range here.
351                 self.byte_classes.set_range(0, 0x7F);
352                 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
353             }
354             WordBoundary(hir::WordBoundary::Ascii) => {
355                 self.byte_classes.set_word_boundary();
356                 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
357             }
358             WordBoundary(hir::WordBoundary::AsciiNegate) => {
359                 self.byte_classes.set_word_boundary();
360                 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
361             }
362             Group(ref g) => match g.kind {
363                 hir::GroupKind::NonCapturing => self.c(&g.hir),
364                 hir::GroupKind::CaptureIndex(index) => {
365                     if index as usize >= self.compiled.captures.len() {
366                         self.compiled.captures.push(None);
367                     }
368                     self.c_capture(2 * index as usize, &g.hir)
369                 }
370                 hir::GroupKind::CaptureName { index, ref name } => {
371                     if index as usize >= self.compiled.captures.len() {
372                         let n = name.to_string();
373                         self.compiled.captures.push(Some(n.clone()));
374                         self.capture_name_idx.insert(n, index as usize);
375                     }
376                     self.c_capture(2 * index as usize, &g.hir)
377                 }
378             },
379             Concat(ref es) => {
380                 if self.compiled.is_reverse {
381                     self.c_concat(es.iter().rev())
382                 } else {
383                     self.c_concat(es)
384                 }
385             }
386             Alternation(ref es) => self.c_alternate(&**es),
387             Repetition(ref rep) => self.c_repeat(rep),
388         }
389     }
390 
c_empty(&mut self) -> ResultOrEmpty391     fn c_empty(&mut self) -> ResultOrEmpty {
392         // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
393         // See: CVE-2022-24713
394         //
395         // Since 'empty' sub-expressions don't increase the size of
396         // the actual compiled object, we "fake" an increase in its
397         // size so that our 'check_size_limit' routine will eventually
398         // stop compilation if there are too many empty sub-expressions
399         // (e.g., via a large repetition).
400         self.extra_inst_bytes += std::mem::size_of::<Inst>();
401         Ok(None)
402     }
403 
c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty404     fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
405         if self.num_exprs > 1 || self.compiled.is_dfa {
406             // Don't ever compile Save instructions for regex sets because
407             // they are never used. They are also never used in DFA programs
408             // because DFAs can't handle captures.
409             self.c(expr)
410         } else {
411             let entry = self.insts.len();
412             let hole = self.push_hole(InstHole::Save { slot: first_slot });
413             let patch = self.c(expr)?.unwrap_or(self.next_inst());
414             self.fill(hole, patch.entry);
415             self.fill_to_next(patch.hole);
416             let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
417             Ok(Some(Patch { hole: hole, entry: entry }))
418         }
419     }
420 
c_dotstar(&mut self) -> Result421     fn c_dotstar(&mut self) -> Result {
422         Ok(if !self.compiled.only_utf8() {
423             self.c(&Hir::repetition(hir::Repetition {
424                 kind: hir::RepetitionKind::ZeroOrMore,
425                 greedy: false,
426                 hir: Box::new(Hir::any(true)),
427             }))?
428             .unwrap()
429         } else {
430             self.c(&Hir::repetition(hir::Repetition {
431                 kind: hir::RepetitionKind::ZeroOrMore,
432                 greedy: false,
433                 hir: Box::new(Hir::any(false)),
434             }))?
435             .unwrap()
436         })
437     }
438 
c_char(&mut self, c: char) -> ResultOrEmpty439     fn c_char(&mut self, c: char) -> ResultOrEmpty {
440         if self.compiled.uses_bytes() {
441             if c.is_ascii() {
442                 let b = c as u8;
443                 let hole =
444                     self.push_hole(InstHole::Bytes { start: b, end: b });
445                 self.byte_classes.set_range(b, b);
446                 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
447             } else {
448                 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
449             }
450         } else {
451             let hole = self.push_hole(InstHole::Char { c: c });
452             Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
453         }
454     }
455 
c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty456     fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
457         use std::mem::size_of;
458 
459         assert!(!ranges.is_empty());
460         if self.compiled.uses_bytes() {
461             Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
462         } else {
463             let ranges: Vec<(char, char)> =
464                 ranges.iter().map(|r| (r.start(), r.end())).collect();
465             let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
466                 self.push_hole(InstHole::Char { c: ranges[0].0 })
467             } else {
468                 self.extra_inst_bytes +=
469                     ranges.len() * (size_of::<char>() * 2);
470                 self.push_hole(InstHole::Ranges { ranges: ranges })
471             };
472             Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
473         }
474     }
475 
c_byte(&mut self, b: u8) -> ResultOrEmpty476     fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
477         self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
478     }
479 
c_class_bytes( &mut self, ranges: &[hir::ClassBytesRange], ) -> ResultOrEmpty480     fn c_class_bytes(
481         &mut self,
482         ranges: &[hir::ClassBytesRange],
483     ) -> ResultOrEmpty {
484         debug_assert!(!ranges.is_empty());
485 
486         let first_split_entry = self.insts.len();
487         let mut holes = vec![];
488         let mut prev_hole = Hole::None;
489         for r in &ranges[0..ranges.len() - 1] {
490             self.fill_to_next(prev_hole);
491             let split = self.push_split_hole();
492             let next = self.insts.len();
493             self.byte_classes.set_range(r.start(), r.end());
494             holes.push(self.push_hole(InstHole::Bytes {
495                 start: r.start(),
496                 end: r.end(),
497             }));
498             prev_hole = self.fill_split(split, Some(next), None);
499         }
500         let next = self.insts.len();
501         let r = &ranges[ranges.len() - 1];
502         self.byte_classes.set_range(r.start(), r.end());
503         holes.push(
504             self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
505         );
506         self.fill(prev_hole, next);
507         Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
508     }
509 
c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty510     fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
511         let hole = self.push_hole(InstHole::EmptyLook { look: look });
512         Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
513     }
514 
c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty where I: IntoIterator<Item = &'a Hir>,515     fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
516     where
517         I: IntoIterator<Item = &'a Hir>,
518     {
519         let mut exprs = exprs.into_iter();
520         let Patch { mut hole, entry } = loop {
521             match exprs.next() {
522                 None => return self.c_empty(),
523                 Some(e) => {
524                     if let Some(p) = self.c(e)? {
525                         break p;
526                     }
527                 }
528             }
529         };
530         for e in exprs {
531             if let Some(p) = self.c(e)? {
532                 self.fill(hole, p.entry);
533                 hole = p.hole;
534             }
535         }
536         Ok(Some(Patch { hole: hole, entry: entry }))
537     }
538 
c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty539     fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
540         debug_assert!(
541             exprs.len() >= 2,
542             "alternates must have at least 2 exprs"
543         );
544 
545         // Initial entry point is always the first split.
546         let first_split_entry = self.insts.len();
547 
548         // Save up all of the holes from each alternate. They will all get
549         // patched to point to the same location.
550         let mut holes = vec![];
551 
552         // true indicates that the hole is a split where we want to fill
553         // the second branch.
554         let mut prev_hole = (Hole::None, false);
555         for e in &exprs[0..exprs.len() - 1] {
556             if prev_hole.1 {
557                 let next = self.insts.len();
558                 self.fill_split(prev_hole.0, None, Some(next));
559             } else {
560                 self.fill_to_next(prev_hole.0);
561             }
562             let split = self.push_split_hole();
563             if let Some(Patch { hole, entry }) = self.c(e)? {
564                 holes.push(hole);
565                 prev_hole = (self.fill_split(split, Some(entry), None), false);
566             } else {
567                 let (split1, split2) = split.dup_one();
568                 holes.push(split1);
569                 prev_hole = (split2, true);
570             }
571         }
572         if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
573             holes.push(hole);
574             if prev_hole.1 {
575                 self.fill_split(prev_hole.0, None, Some(entry));
576             } else {
577                 self.fill(prev_hole.0, entry);
578             }
579         } else {
580             // We ignore prev_hole.1. When it's true, it means we have two
581             // empty branches both pushing prev_hole.0 into holes, so both
582             // branches will go to the same place anyway.
583             holes.push(prev_hole.0);
584         }
585         Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
586     }
587 
c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty588     fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
589         use regex_syntax::hir::RepetitionKind::*;
590         match rep.kind {
591             ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
592             ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
593             OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
594             Range(hir::RepetitionRange::Exactly(min_max)) => {
595                 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
596             }
597             Range(hir::RepetitionRange::AtLeast(min)) => {
598                 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
599             }
600             Range(hir::RepetitionRange::Bounded(min, max)) => {
601                 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
602             }
603         }
604     }
605 
c_repeat_zero_or_one( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty606     fn c_repeat_zero_or_one(
607         &mut self,
608         expr: &Hir,
609         greedy: bool,
610     ) -> ResultOrEmpty {
611         let split_entry = self.insts.len();
612         let split = self.push_split_hole();
613         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
614             Some(p) => p,
615             None => return self.pop_split_hole(),
616         };
617         let split_hole = if greedy {
618             self.fill_split(split, Some(entry_rep), None)
619         } else {
620             self.fill_split(split, None, Some(entry_rep))
621         };
622         let holes = vec![hole_rep, split_hole];
623         Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
624     }
625 
c_repeat_zero_or_more( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty626     fn c_repeat_zero_or_more(
627         &mut self,
628         expr: &Hir,
629         greedy: bool,
630     ) -> ResultOrEmpty {
631         let split_entry = self.insts.len();
632         let split = self.push_split_hole();
633         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
634             Some(p) => p,
635             None => return self.pop_split_hole(),
636         };
637 
638         self.fill(hole_rep, split_entry);
639         let split_hole = if greedy {
640             self.fill_split(split, Some(entry_rep), None)
641         } else {
642             self.fill_split(split, None, Some(entry_rep))
643         };
644         Ok(Some(Patch { hole: split_hole, entry: split_entry }))
645     }
646 
c_repeat_one_or_more( &mut self, expr: &Hir, greedy: bool, ) -> ResultOrEmpty647     fn c_repeat_one_or_more(
648         &mut self,
649         expr: &Hir,
650         greedy: bool,
651     ) -> ResultOrEmpty {
652         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
653             Some(p) => p,
654             None => return Ok(None),
655         };
656         self.fill_to_next(hole_rep);
657         let split = self.push_split_hole();
658 
659         let split_hole = if greedy {
660             self.fill_split(split, Some(entry_rep), None)
661         } else {
662             self.fill_split(split, None, Some(entry_rep))
663         };
664         Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
665     }
666 
c_repeat_range_min_or_more( &mut self, expr: &Hir, greedy: bool, min: u32, ) -> ResultOrEmpty667     fn c_repeat_range_min_or_more(
668         &mut self,
669         expr: &Hir,
670         greedy: bool,
671         min: u32,
672     ) -> ResultOrEmpty {
673         let min = u32_to_usize(min);
674         // Using next_inst() is ok, because we can't return it (concat would
675         // have to return Some(_) while c_repeat_range_min_or_more returns
676         // None).
677         let patch_concat = self
678             .c_concat(iter::repeat(expr).take(min))?
679             .unwrap_or(self.next_inst());
680         if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
681             self.fill(patch_concat.hole, patch_rep.entry);
682             Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
683         } else {
684             Ok(None)
685         }
686     }
687 
c_repeat_range( &mut self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> ResultOrEmpty688     fn c_repeat_range(
689         &mut self,
690         expr: &Hir,
691         greedy: bool,
692         min: u32,
693         max: u32,
694     ) -> ResultOrEmpty {
695         let (min, max) = (u32_to_usize(min), u32_to_usize(max));
696         debug_assert!(min <= max);
697         let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
698         if min == max {
699             return Ok(patch_concat);
700         }
701         // Same reasoning as in c_repeat_range_min_or_more (we know that min <
702         // max at this point).
703         let patch_concat = patch_concat.unwrap_or(self.next_inst());
704         let initial_entry = patch_concat.entry;
705         // It is much simpler to compile, e.g., `a{2,5}` as:
706         //
707         //     aaa?a?a?
708         //
709         // But you end up with a sequence of instructions like this:
710         //
711         //     0: 'a'
712         //     1: 'a',
713         //     2: split(3, 4)
714         //     3: 'a'
715         //     4: split(5, 6)
716         //     5: 'a'
717         //     6: split(7, 8)
718         //     7: 'a'
719         //     8: MATCH
720         //
721         // This is *incredibly* inefficient because the splits end
722         // up forming a chain, which has to be resolved everything a
723         // transition is followed.
724         let mut holes = vec![];
725         let mut prev_hole = patch_concat.hole;
726         for _ in min..max {
727             self.fill_to_next(prev_hole);
728             let split = self.push_split_hole();
729             let Patch { hole, entry } = match self.c(expr)? {
730                 Some(p) => p,
731                 None => return self.pop_split_hole(),
732             };
733             prev_hole = hole;
734             if greedy {
735                 holes.push(self.fill_split(split, Some(entry), None));
736             } else {
737                 holes.push(self.fill_split(split, None, Some(entry)));
738             }
739         }
740         holes.push(prev_hole);
741         Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
742     }
743 
744     /// Can be used as a default value for the c_* functions when the call to
745     /// c_function is followed by inserting at least one instruction that is
746     /// always executed after the ones written by the c* function.
next_inst(&self) -> Patch747     fn next_inst(&self) -> Patch {
748         Patch { hole: Hole::None, entry: self.insts.len() }
749     }
750 
fill(&mut self, hole: Hole, goto: InstPtr)751     fn fill(&mut self, hole: Hole, goto: InstPtr) {
752         match hole {
753             Hole::None => {}
754             Hole::One(pc) => {
755                 self.insts[pc].fill(goto);
756             }
757             Hole::Many(holes) => {
758                 for hole in holes {
759                     self.fill(hole, goto);
760                 }
761             }
762         }
763     }
764 
fill_to_next(&mut self, hole: Hole)765     fn fill_to_next(&mut self, hole: Hole) {
766         let next = self.insts.len();
767         self.fill(hole, next);
768     }
769 
fill_split( &mut self, hole: Hole, goto1: Option<InstPtr>, goto2: Option<InstPtr>, ) -> Hole770     fn fill_split(
771         &mut self,
772         hole: Hole,
773         goto1: Option<InstPtr>,
774         goto2: Option<InstPtr>,
775     ) -> Hole {
776         match hole {
777             Hole::None => Hole::None,
778             Hole::One(pc) => match (goto1, goto2) {
779                 (Some(goto1), Some(goto2)) => {
780                     self.insts[pc].fill_split(goto1, goto2);
781                     Hole::None
782                 }
783                 (Some(goto1), None) => {
784                     self.insts[pc].half_fill_split_goto1(goto1);
785                     Hole::One(pc)
786                 }
787                 (None, Some(goto2)) => {
788                     self.insts[pc].half_fill_split_goto2(goto2);
789                     Hole::One(pc)
790                 }
791                 (None, None) => unreachable!(
792                     "at least one of the split \
793                      holes must be filled"
794                 ),
795             },
796             Hole::Many(holes) => {
797                 let mut new_holes = vec![];
798                 for hole in holes {
799                     new_holes.push(self.fill_split(hole, goto1, goto2));
800                 }
801                 if new_holes.is_empty() {
802                     Hole::None
803                 } else if new_holes.len() == 1 {
804                     new_holes.pop().unwrap()
805                 } else {
806                     Hole::Many(new_holes)
807                 }
808             }
809         }
810     }
811 
push_compiled(&mut self, inst: Inst)812     fn push_compiled(&mut self, inst: Inst) {
813         self.insts.push(MaybeInst::Compiled(inst));
814     }
815 
push_hole(&mut self, inst: InstHole) -> Hole816     fn push_hole(&mut self, inst: InstHole) -> Hole {
817         let hole = self.insts.len();
818         self.insts.push(MaybeInst::Uncompiled(inst));
819         Hole::One(hole)
820     }
821 
push_split_hole(&mut self) -> Hole822     fn push_split_hole(&mut self) -> Hole {
823         let hole = self.insts.len();
824         self.insts.push(MaybeInst::Split);
825         Hole::One(hole)
826     }
827 
pop_split_hole(&mut self) -> ResultOrEmpty828     fn pop_split_hole(&mut self) -> ResultOrEmpty {
829         self.insts.pop();
830         Ok(None)
831     }
832 
check_size(&self) -> result::Result<(), Error>833     fn check_size(&self) -> result::Result<(), Error> {
834         use std::mem::size_of;
835 
836         let size =
837             self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
838         if size > self.size_limit {
839             Err(Error::CompiledTooBig(self.size_limit))
840         } else {
841             Ok(())
842         }
843     }
844 }
845 
846 #[derive(Debug)]
847 enum Hole {
848     None,
849     One(InstPtr),
850     Many(Vec<Hole>),
851 }
852 
853 impl Hole {
dup_one(self) -> (Self, Self)854     fn dup_one(self) -> (Self, Self) {
855         match self {
856             Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
857             Hole::None | Hole::Many(_) => {
858                 unreachable!("must be called on single hole")
859             }
860         }
861     }
862 }
863 
864 #[derive(Clone, Debug)]
865 enum MaybeInst {
866     Compiled(Inst),
867     Uncompiled(InstHole),
868     Split,
869     Split1(InstPtr),
870     Split2(InstPtr),
871 }
872 
873 impl MaybeInst {
fill(&mut self, goto: InstPtr)874     fn fill(&mut self, goto: InstPtr) {
875         let maybeinst = match *self {
876             MaybeInst::Split => MaybeInst::Split1(goto),
877             MaybeInst::Uncompiled(ref inst) => {
878                 MaybeInst::Compiled(inst.fill(goto))
879             }
880             MaybeInst::Split1(goto1) => {
881                 MaybeInst::Compiled(Inst::Split(InstSplit {
882                     goto1: goto1,
883                     goto2: goto,
884                 }))
885             }
886             MaybeInst::Split2(goto2) => {
887                 MaybeInst::Compiled(Inst::Split(InstSplit {
888                     goto1: goto,
889                     goto2: goto2,
890                 }))
891             }
892             _ => unreachable!(
893                 "not all instructions were compiled! \
894                  found uncompiled instruction: {:?}",
895                 self
896             ),
897         };
898         *self = maybeinst;
899     }
900 
fill_split(&mut self, goto1: InstPtr, goto2: InstPtr)901     fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
902         let filled = match *self {
903             MaybeInst::Split => {
904                 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
905             }
906             _ => unreachable!(
907                 "must be called on Split instruction, \
908                  instead it was called on: {:?}",
909                 self
910             ),
911         };
912         *self = MaybeInst::Compiled(filled);
913     }
914 
half_fill_split_goto1(&mut self, goto1: InstPtr)915     fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
916         let half_filled = match *self {
917             MaybeInst::Split => goto1,
918             _ => unreachable!(
919                 "must be called on Split instruction, \
920                  instead it was called on: {:?}",
921                 self
922             ),
923         };
924         *self = MaybeInst::Split1(half_filled);
925     }
926 
half_fill_split_goto2(&mut self, goto2: InstPtr)927     fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
928         let half_filled = match *self {
929             MaybeInst::Split => goto2,
930             _ => unreachable!(
931                 "must be called on Split instruction, \
932                  instead it was called on: {:?}",
933                 self
934             ),
935         };
936         *self = MaybeInst::Split2(half_filled);
937     }
938 
unwrap(self) -> Inst939     fn unwrap(self) -> Inst {
940         match self {
941             MaybeInst::Compiled(inst) => inst,
942             _ => unreachable!(
943                 "must be called on a compiled instruction, \
944                  instead it was called on: {:?}",
945                 self
946             ),
947         }
948     }
949 }
950 
951 #[derive(Clone, Debug)]
952 enum InstHole {
953     Save { slot: usize },
954     EmptyLook { look: EmptyLook },
955     Char { c: char },
956     Ranges { ranges: Vec<(char, char)> },
957     Bytes { start: u8, end: u8 },
958 }
959 
960 impl InstHole {
fill(&self, goto: InstPtr) -> Inst961     fn fill(&self, goto: InstPtr) -> Inst {
962         match *self {
963             InstHole::Save { slot } => {
964                 Inst::Save(InstSave { goto: goto, slot: slot })
965             }
966             InstHole::EmptyLook { look } => {
967                 Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
968             }
969             InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
970             InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
971                 goto: goto,
972                 ranges: ranges.clone().into_boxed_slice(),
973             }),
974             InstHole::Bytes { start, end } => {
975                 Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
976             }
977         }
978     }
979 }
980 
981 struct CompileClass<'a, 'b> {
982     c: &'a mut Compiler,
983     ranges: &'b [hir::ClassUnicodeRange],
984 }
985 
986 impl<'a, 'b> CompileClass<'a, 'b> {
compile(mut self) -> Result987     fn compile(mut self) -> Result {
988         let mut holes = vec![];
989         let mut initial_entry = None;
990         let mut last_split = Hole::None;
991         let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
992         self.c.suffix_cache.clear();
993 
994         for (i, range) in self.ranges.iter().enumerate() {
995             let is_last_range = i + 1 == self.ranges.len();
996             utf8_seqs.reset(range.start(), range.end());
997             let mut it = (&mut utf8_seqs).peekable();
998             loop {
999                 let utf8_seq = match it.next() {
1000                     None => break,
1001                     Some(utf8_seq) => utf8_seq,
1002                 };
1003                 if is_last_range && it.peek().is_none() {
1004                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1005                     holes.push(hole);
1006                     self.c.fill(last_split, entry);
1007                     last_split = Hole::None;
1008                     if initial_entry.is_none() {
1009                         initial_entry = Some(entry);
1010                     }
1011                 } else {
1012                     if initial_entry.is_none() {
1013                         initial_entry = Some(self.c.insts.len());
1014                     }
1015                     self.c.fill_to_next(last_split);
1016                     last_split = self.c.push_split_hole();
1017                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1018                     holes.push(hole);
1019                     last_split =
1020                         self.c.fill_split(last_split, Some(entry), None);
1021                 }
1022             }
1023         }
1024         self.c.utf8_seqs = Some(utf8_seqs);
1025         Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
1026     }
1027 
c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result1028     fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
1029         if self.c.compiled.is_reverse {
1030             self.c_utf8_seq_(seq)
1031         } else {
1032             self.c_utf8_seq_(seq.into_iter().rev())
1033         }
1034     }
1035 
c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result where I: IntoIterator<Item = &'r Utf8Range>,1036     fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
1037     where
1038         I: IntoIterator<Item = &'r Utf8Range>,
1039     {
1040         // The initial instruction for each UTF-8 sequence should be the same.
1041         let mut from_inst = ::std::usize::MAX;
1042         let mut last_hole = Hole::None;
1043         for byte_range in seq {
1044             let key = SuffixCacheKey {
1045                 from_inst: from_inst,
1046                 start: byte_range.start,
1047                 end: byte_range.end,
1048             };
1049             {
1050                 let pc = self.c.insts.len();
1051                 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1052                     from_inst = cached_pc;
1053                     continue;
1054                 }
1055             }
1056             self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1057             if from_inst == ::std::usize::MAX {
1058                 last_hole = self.c.push_hole(InstHole::Bytes {
1059                     start: byte_range.start,
1060                     end: byte_range.end,
1061                 });
1062             } else {
1063                 self.c.push_compiled(Inst::Bytes(InstBytes {
1064                     goto: from_inst,
1065                     start: byte_range.start,
1066                     end: byte_range.end,
1067                 }));
1068             }
1069             from_inst = self.c.insts.len().checked_sub(1).unwrap();
1070             debug_assert!(from_inst < ::std::usize::MAX);
1071         }
1072         debug_assert!(from_inst < ::std::usize::MAX);
1073         Ok(Patch { hole: last_hole, entry: from_inst })
1074     }
1075 }
1076 
1077 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1078 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1079 /// The set of byte ranges looks like this:
1080 ///
1081 /// [0-7F]
1082 /// [C2-DF][80-BF]
1083 /// [E0][A0-BF][80-BF]
1084 /// [E1-EC][80-BF][80-BF]
1085 /// [ED][80-9F][80-BF]
1086 /// [EE-EF][80-BF][80-BF]
1087 ///
1088 /// Each line above translates to one alternate in the compiled regex program.
1089 /// However, all but one of the alternates end in the same suffix, which is
1090 /// a waste of an instruction. The suffix cache facilitates reusing them across
1091 /// alternates.
1092 ///
1093 /// Note that a HashMap could be trivially used for this, but we don't need its
1094 /// overhead. Some small bounded space (LRU style) is more than enough.
1095 ///
1096 /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1097 /// except it uses hashes as original indices and then compares full keys for
1098 /// validation against `dense` array.
1099 #[derive(Debug)]
1100 struct SuffixCache {
1101     sparse: Box<[usize]>,
1102     dense: Vec<SuffixCacheEntry>,
1103 }
1104 
1105 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1106 struct SuffixCacheEntry {
1107     key: SuffixCacheKey,
1108     pc: InstPtr,
1109 }
1110 
1111 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1112 struct SuffixCacheKey {
1113     from_inst: InstPtr,
1114     start: u8,
1115     end: u8,
1116 }
1117 
1118 impl SuffixCache {
new(size: usize) -> Self1119     fn new(size: usize) -> Self {
1120         SuffixCache {
1121             sparse: vec![0usize; size].into(),
1122             dense: Vec::with_capacity(size),
1123         }
1124     }
1125 
get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr>1126     fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1127         let hash = self.hash(&key);
1128         let pos = &mut self.sparse[hash];
1129         if let Some(entry) = self.dense.get(*pos) {
1130             if entry.key == key {
1131                 return Some(entry.pc);
1132             }
1133         }
1134         *pos = self.dense.len();
1135         self.dense.push(SuffixCacheEntry { key: key, pc: pc });
1136         None
1137     }
1138 
clear(&mut self)1139     fn clear(&mut self) {
1140         self.dense.clear();
1141     }
1142 
hash(&self, suffix: &SuffixCacheKey) -> usize1143     fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1144         // Basic FNV-1a hash as described:
1145         // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1146         const FNV_PRIME: u64 = 1099511628211;
1147         let mut h = 14695981039346656037;
1148         h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1149         h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1150         h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1151         (h as usize) % self.sparse.len()
1152     }
1153 }
1154 
1155 struct ByteClassSet([bool; 256]);
1156 
1157 impl ByteClassSet {
new() -> Self1158     fn new() -> Self {
1159         ByteClassSet([false; 256])
1160     }
1161 
set_range(&mut self, start: u8, end: u8)1162     fn set_range(&mut self, start: u8, end: u8) {
1163         debug_assert!(start <= end);
1164         if start > 0 {
1165             self.0[start as usize - 1] = true;
1166         }
1167         self.0[end as usize] = true;
1168     }
1169 
set_word_boundary(&mut self)1170     fn set_word_boundary(&mut self) {
1171         // We need to mark all ranges of bytes whose pairs result in
1172         // evaluating \b differently.
1173         let iswb = is_word_byte;
1174         let mut b1: u16 = 0;
1175         let mut b2: u16;
1176         while b1 <= 255 {
1177             b2 = b1 + 1;
1178             while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1179                 b2 += 1;
1180             }
1181             self.set_range(b1 as u8, (b2 - 1) as u8);
1182             b1 = b2;
1183         }
1184     }
1185 
byte_classes(&self) -> Vec<u8>1186     fn byte_classes(&self) -> Vec<u8> {
1187         // N.B. If you're debugging the DFA, it's useful to simply return
1188         // `(0..256).collect()`, which effectively removes the byte classes
1189         // and makes the transitions easier to read.
1190         // (0usize..256).map(|x| x as u8).collect()
1191         let mut byte_classes = vec![0; 256];
1192         let mut class = 0u8;
1193         let mut i = 0;
1194         loop {
1195             byte_classes[i] = class as u8;
1196             if i >= 255 {
1197                 break;
1198             }
1199             if self.0[i] {
1200                 class = class.checked_add(1).unwrap();
1201             }
1202             i += 1;
1203         }
1204         byte_classes
1205     }
1206 }
1207 
1208 impl fmt::Debug for ByteClassSet {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1209     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1210         f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
1211     }
1212 }
1213 
u32_to_usize(n: u32) -> usize1214 fn u32_to_usize(n: u32) -> usize {
1215     // In case usize is less than 32 bits, we need to guard against overflow.
1216     // On most platforms this compiles to nothing.
1217     // TODO Use `std::convert::TryFrom` once it's stable.
1218     if (n as u64) > (::std::usize::MAX as u64) {
1219         panic!("BUG: {} is too big to be pointer sized", n)
1220     }
1221     n as usize
1222 }
1223 
1224 #[cfg(test)]
1225 mod tests {
1226     use super::ByteClassSet;
1227 
1228     #[test]
byte_classes()1229     fn byte_classes() {
1230         let mut set = ByteClassSet::new();
1231         set.set_range(b'a', b'z');
1232         let classes = set.byte_classes();
1233         assert_eq!(classes[0], 0);
1234         assert_eq!(classes[1], 0);
1235         assert_eq!(classes[2], 0);
1236         assert_eq!(classes[b'a' as usize - 1], 0);
1237         assert_eq!(classes[b'a' as usize], 1);
1238         assert_eq!(classes[b'm' as usize], 1);
1239         assert_eq!(classes[b'z' as usize], 1);
1240         assert_eq!(classes[b'z' as usize + 1], 2);
1241         assert_eq!(classes[254], 2);
1242         assert_eq!(classes[255], 2);
1243 
1244         let mut set = ByteClassSet::new();
1245         set.set_range(0, 2);
1246         set.set_range(4, 6);
1247         let classes = set.byte_classes();
1248         assert_eq!(classes[0], 0);
1249         assert_eq!(classes[1], 0);
1250         assert_eq!(classes[2], 0);
1251         assert_eq!(classes[3], 1);
1252         assert_eq!(classes[4], 2);
1253         assert_eq!(classes[5], 2);
1254         assert_eq!(classes[6], 2);
1255         assert_eq!(classes[7], 3);
1256         assert_eq!(classes[255], 3);
1257     }
1258 
1259     #[test]
full_byte_classes()1260     fn full_byte_classes() {
1261         let mut set = ByteClassSet::new();
1262         for i in 0..256u16 {
1263             set.set_range(i as u8, i as u8);
1264         }
1265         assert_eq!(set.byte_classes().len(), 256);
1266     }
1267 }
1268