1 // Copyright 2018 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 /*!
12 Provides routines for extracting literal prefixes and suffixes from an `Hir`.
13 */
14 
15 use std::cmp;
16 use std::fmt;
17 use std::iter;
18 use std::mem;
19 use std::ops;
20 
21 use hir::{self, Hir, HirKind};
22 
23 /// A set of literal byte strings extracted from a regular expression.
24 ///
25 /// Every member of the set is a `Literal`, which is represented by a
26 /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
27 /// said to be either *complete* or *cut*. A complete literal means that
28 /// it extends until the beginning (or end) of the regular expression. In
29 /// some circumstances, this can be used to indicate a match in the regular
30 /// expression.
31 ///
32 /// A key aspect of literal extraction is knowing when to stop. It is not
33 /// feasible to blindly extract all literals from a regular expression, even if
34 /// there are finitely many. For example, the regular expression `[0-9]{10}`
35 /// has `10^10` distinct literals. For this reason, literal extraction is
36 /// bounded to some low number by default using heuristics, but the limits can
37 /// be tweaked.
38 ///
39 /// **WARNING**: Literal extraction uses stack space proportional to the size
40 /// of the `Hir` expression. At some point, this drawback will be eliminated.
41 /// To protect yourself, set a reasonable
42 /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
43 /// This is done for you by default.
44 #[derive(Clone, Eq, PartialEq)]
45 pub struct Literals {
46     lits: Vec<Literal>,
47     limit_size: usize,
48     limit_class: usize,
49 }
50 
51 /// A single member of a set of literals extracted from a regular expression.
52 ///
53 /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
54 /// and `Vec` operations are available.
55 #[derive(Clone, Eq, Ord)]
56 pub struct Literal {
57     v: Vec<u8>,
58     cut: bool,
59 }
60 
61 impl Literals {
62     /// Returns a new empty set of literals using default limits.
empty() -> Literals63     pub fn empty() -> Literals {
64         Literals {
65             lits: vec![],
66             limit_size: 250,
67             limit_class: 10,
68         }
69     }
70 
71     /// Returns a set of literal prefixes extracted from the given `Hir`.
prefixes(expr: &Hir) -> Literals72     pub fn prefixes(expr: &Hir) -> Literals {
73         let mut lits = Literals::empty();
74         lits.union_prefixes(expr);
75         lits
76     }
77 
78     /// Returns a set of literal suffixes extracted from the given `Hir`.
suffixes(expr: &Hir) -> Literals79     pub fn suffixes(expr: &Hir) -> Literals {
80         let mut lits = Literals::empty();
81         lits.union_suffixes(expr);
82         lits
83     }
84 
85     /// Get the approximate size limit (in bytes) of this set.
limit_size(&self) -> usize86     pub fn limit_size(&self) -> usize {
87         self.limit_size
88     }
89 
90     /// Set the approximate size limit (in bytes) of this set.
91     ///
92     /// If extracting a literal would put the set over this limit, then
93     /// extraction stops.
94     ///
95     /// The new limits will only apply to additions to this set. Existing
96     /// members remain unchanged, even if the set exceeds the new limit.
set_limit_size(&mut self, size: usize) -> &mut Literals97     pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
98         self.limit_size = size;
99         self
100     }
101 
102     /// Get the character class size limit for this set.
limit_class(&self) -> usize103     pub fn limit_class(&self) -> usize {
104         self.limit_class
105     }
106 
107     /// Limits the size of character(or byte) classes considered.
108     ///
109     /// A value of `0` prevents all character classes from being considered.
110     ///
111     /// This limit also applies to case insensitive literals, since each
112     /// character in the case insensitive literal is converted to a class, and
113     /// then case folded.
114     ///
115     /// The new limits will only apply to additions to this set. Existing
116     /// members remain unchanged, even if the set exceeds the new limit.
set_limit_class(&mut self, size: usize) -> &mut Literals117     pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
118         self.limit_class = size;
119         self
120     }
121 
122     /// Returns the set of literals as a slice. Its order is unspecified.
literals(&self) -> &[Literal]123     pub fn literals(&self) -> &[Literal] {
124         &self.lits
125     }
126 
127     /// Returns the length of the smallest literal.
128     ///
129     /// Returns None is there are no literals in the set.
min_len(&self) -> Option<usize>130     pub fn min_len(&self) -> Option<usize> {
131         let mut min = None;
132         for lit in &self.lits {
133             match min {
134                 None => min = Some(lit.len()),
135                 Some(m) if lit.len() < m => min = Some(lit.len()),
136                 _ => {}
137             }
138         }
139         min
140     }
141 
142     /// Returns true if all members in this set are complete.
all_complete(&self) -> bool143     pub fn all_complete(&self) -> bool {
144         !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
145     }
146 
147     /// Returns true if any member in this set is complete.
any_complete(&self) -> bool148     pub fn any_complete(&self) -> bool {
149         self.lits.iter().any(|lit| !lit.is_cut())
150     }
151 
152     /// Returns true if this set contains an empty literal.
contains_empty(&self) -> bool153     pub fn contains_empty(&self) -> bool {
154         self.lits.iter().any(|lit| lit.is_empty())
155     }
156 
157     /// Returns true if this set is empty or if all of its members is empty.
is_empty(&self) -> bool158     pub fn is_empty(&self) -> bool {
159         self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
160     }
161 
162     /// Returns a new empty set of literals using this set's limits.
to_empty(&self) -> Literals163     pub fn to_empty(&self) -> Literals {
164         let mut lits = Literals::empty();
165         lits.set_limit_size(self.limit_size)
166             .set_limit_class(self.limit_class);
167         lits
168     }
169 
170     /// Returns the longest common prefix of all members in this set.
longest_common_prefix(&self) -> &[u8]171     pub fn longest_common_prefix(&self) -> &[u8] {
172         if self.is_empty() {
173             return &[];
174         }
175         let lit0 = &*self.lits[0];
176         let mut len = lit0.len();
177         for lit in &self.lits[1..] {
178             len = cmp::min(
179                 len,
180                 lit.iter()
181                    .zip(lit0)
182                    .take_while(|&(a, b)| a == b)
183                    .count());
184         }
185         &self.lits[0][..len]
186     }
187 
188     /// Returns the longest common suffix of all members in this set.
longest_common_suffix(&self) -> &[u8]189     pub fn longest_common_suffix(&self) -> &[u8] {
190         if self.is_empty() {
191             return &[];
192         }
193         let lit0 = &*self.lits[0];
194         let mut len = lit0.len();
195         for lit in &self.lits[1..] {
196             len = cmp::min(
197                 len,
198                 lit.iter()
199                    .rev()
200                    .zip(lit0.iter().rev())
201                    .take_while(|&(a, b)| a == b)
202                    .count());
203         }
204         &self.lits[0][self.lits[0].len() - len..]
205     }
206 
207     /// Returns a new set of literals with the given number of bytes trimmed
208     /// from the suffix of each literal.
209     ///
210     /// If any literal would be cut out completely by trimming, then None is
211     /// returned.
212     ///
213     /// Any duplicates that are created as a result of this transformation are
214     /// removed.
trim_suffix(&self, num_bytes: usize) -> Option<Literals>215     pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
216         if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
217             return None;
218         }
219         let mut new = self.to_empty();
220         for mut lit in self.lits.iter().cloned() {
221             let new_len = lit.len() - num_bytes;
222             lit.truncate(new_len);
223             lit.cut();
224             new.lits.push(lit);
225         }
226         new.lits.sort();
227         new.lits.dedup();
228         Some(new)
229     }
230 
231     /// Returns a new set of prefixes of this set of literals that are
232     /// guaranteed to be unambiguous.
233     ///
234     /// Any substring match with a member of the set is returned is guaranteed
235     /// to never overlap with a substring match of another member of the set
236     /// at the same starting position.
237     ///
238     /// Given any two members of the returned set, neither is a substring of
239     /// the other.
unambiguous_prefixes(&self) -> Literals240     pub fn unambiguous_prefixes(&self) -> Literals {
241         if self.lits.is_empty() {
242             return self.to_empty();
243         }
244         let mut old: Vec<Literal> = self.lits.iter().cloned().collect();
245         let mut new = self.to_empty();
246     'OUTER:
247         while let Some(mut candidate) = old.pop() {
248             if candidate.is_empty() {
249                 continue;
250             }
251             if new.lits.is_empty() {
252                 new.lits.push(candidate);
253                 continue;
254             }
255             for lit2 in &mut new.lits {
256                 if lit2.is_empty() {
257                     continue;
258                 }
259                 if &candidate == lit2 {
260                     // If the literal is already in the set, then we can
261                     // just drop it. But make sure that cut literals are
262                     // infectious!
263                     candidate.cut = candidate.cut || lit2.cut;
264                     lit2.cut = candidate.cut;
265                     continue 'OUTER;
266                 }
267                 if candidate.len() < lit2.len() {
268                     if let Some(i) = position(&candidate, &lit2) {
269                         candidate.cut();
270                         let mut lit3 = lit2.clone();
271                         lit3.truncate(i);
272                         lit3.cut();
273                         old.push(lit3);
274                         lit2.clear();
275                     }
276                 } else {
277                     if let Some(i) = position(&lit2, &candidate) {
278                         lit2.cut();
279                         let mut new_candidate = candidate.clone();
280                         new_candidate.truncate(i);
281                         new_candidate.cut();
282                         old.push(new_candidate);
283                         candidate.clear();
284                     }
285                 }
286                 // Oops, the candidate is already represented in the set.
287                 if candidate.is_empty() {
288                     continue 'OUTER;
289                 }
290             }
291             new.lits.push(candidate);
292         }
293         new.lits.retain(|lit| !lit.is_empty());
294         new.lits.sort();
295         new.lits.dedup();
296         new
297     }
298 
299     /// Returns a new set of suffixes of this set of literals that are
300     /// guaranteed to be unambiguous.
301     ///
302     /// Any substring match with a member of the set is returned is guaranteed
303     /// to never overlap with a substring match of another member of the set
304     /// at the same ending position.
305     ///
306     /// Given any two members of the returned set, neither is a substring of
307     /// the other.
unambiguous_suffixes(&self) -> Literals308     pub fn unambiguous_suffixes(&self) -> Literals {
309         // This is a touch wasteful...
310         let mut lits = self.clone();
311         lits.reverse();
312         let mut unamb = lits.unambiguous_prefixes();
313         unamb.reverse();
314         unamb
315     }
316 
317     /// Unions the prefixes from the given expression to this set.
318     ///
319     /// If prefixes could not be added (for example, this set would exceed its
320     /// size limits or the set of prefixes from `expr` includes the empty
321     /// string), then false is returned.
322     ///
323     /// Note that prefix literals extracted from `expr` are said to be complete
324     /// if and only if the literal extends from the beginning of `expr` to the
325     /// end of `expr`.
union_prefixes(&mut self, expr: &Hir) -> bool326     pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
327         let mut lits = self.to_empty();
328         prefixes(expr, &mut lits);
329         !lits.is_empty() && !lits.contains_empty() && self.union(lits)
330     }
331 
332     /// Unions the suffixes from the given expression to this set.
333     ///
334     /// If suffixes could not be added (for example, this set would exceed its
335     /// size limits or the set of suffixes from `expr` includes the empty
336     /// string), then false is returned.
337     ///
338     /// Note that prefix literals extracted from `expr` are said to be complete
339     /// if and only if the literal extends from the end of `expr` to the
340     /// beginning of `expr`.
union_suffixes(&mut self, expr: &Hir) -> bool341     pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
342         let mut lits = self.to_empty();
343         suffixes(expr, &mut lits);
344         lits.reverse();
345         !lits.is_empty() && !lits.contains_empty() && self.union(lits)
346     }
347 
348     /// Unions this set with another set.
349     ///
350     /// If the union would cause the set to exceed its limits, then the union
351     /// is skipped and it returns false. Otherwise, if the union succeeds, it
352     /// returns true.
union(&mut self, lits: Literals) -> bool353     pub fn union(&mut self, lits: Literals) -> bool {
354         if self.num_bytes() + lits.num_bytes() > self.limit_size {
355             return false;
356         }
357         if lits.is_empty() {
358             self.lits.push(Literal::empty());
359         } else {
360             self.lits.extend(lits.lits);
361         }
362         true
363     }
364 
365     /// Extends this set with another set.
366     ///
367     /// The set of literals is extended via a cross product.
368     ///
369     /// If a cross product would cause this set to exceed its limits, then the
370     /// cross product is skipped and it returns false. Otherwise, if the cross
371     /// product succeeds, it returns true.
cross_product(&mut self, lits: &Literals) -> bool372     pub fn cross_product(&mut self, lits: &Literals) -> bool {
373         if lits.is_empty() {
374             return true;
375         }
376         // Check that we make sure we stay in our limits.
377         let mut size_after;
378         if self.is_empty() || !self.any_complete() {
379             size_after = self.num_bytes();
380             for lits_lit in lits.literals() {
381                 size_after += lits_lit.len();
382             }
383         } else {
384             size_after = self.lits.iter().fold(0, |accum, lit| {
385                 accum + if lit.is_cut() { lit.len() } else { 0 }
386             });
387             for lits_lit in lits.literals() {
388                 for self_lit in self.literals() {
389                     if !self_lit.is_cut() {
390                         size_after += self_lit.len() + lits_lit.len();
391                     }
392                 }
393             }
394         }
395         if size_after > self.limit_size {
396             return false;
397         }
398 
399         let mut base = self.remove_complete();
400         if base.is_empty() {
401             base = vec![Literal::empty()];
402         }
403         for lits_lit in lits.literals() {
404             for mut self_lit in base.clone() {
405                 self_lit.extend(&**lits_lit);
406                 self_lit.cut = lits_lit.cut;
407                 self.lits.push(self_lit);
408             }
409         }
410         true
411     }
412 
413     /// Extends each literal in this set with the bytes given.
414     ///
415     /// If the set is empty, then the given literal is added to the set.
416     ///
417     /// If adding any number of bytes to all members of this set causes a limit
418     /// to be exceeded, then no bytes are added and false is returned. If a
419     /// prefix of `bytes` can be fit into this set, then it is used and all
420     /// resulting literals are cut.
cross_add(&mut self, bytes: &[u8]) -> bool421     pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
422         // N.B. This could be implemented by simply calling cross_product with
423         // a literal set containing just `bytes`, but we can be smarter about
424         // taking shorter prefixes of `bytes` if they'll fit.
425         if bytes.is_empty() {
426             return true;
427         }
428         if self.lits.is_empty() {
429             let i = cmp::min(self.limit_size, bytes.len());
430             self.lits.push(Literal::new(bytes[..i].to_owned()));
431             self.lits[0].cut = i < bytes.len();
432             return !self.lits[0].is_cut();
433         }
434         let size = self.num_bytes();
435         if size + self.lits.len() >= self.limit_size {
436             return false;
437         }
438         let mut i = 1;
439         while size + (i * self.lits.len()) <= self.limit_size
440             && i < bytes.len() {
441             i += 1;
442         }
443         for lit in &mut self.lits {
444             if !lit.is_cut() {
445                 lit.extend(&bytes[..i]);
446                 if i < bytes.len() {
447                     lit.cut();
448                 }
449             }
450         }
451         true
452     }
453 
454     /// Adds the given literal to this set.
455     ///
456     /// Returns false if adding this literal would cause the class to be too
457     /// big.
add(&mut self, lit: Literal) -> bool458     pub fn add(&mut self, lit: Literal) -> bool {
459         if self.num_bytes() + lit.len() > self.limit_size {
460             return false;
461         }
462         self.lits.push(lit);
463         true
464     }
465 
466     /// Extends each literal in this set with the character class given.
467     ///
468     /// Returns false if the character class was too big to add.
add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool469     pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
470         self._add_char_class(cls, false)
471     }
472 
473     /// Extends each literal in this set with the character class given,
474     /// writing the bytes of each character in reverse.
475     ///
476     /// Returns false if the character class was too big to add.
add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool477     fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
478         self._add_char_class(cls, true)
479     }
480 
_add_char_class( &mut self, cls: &hir::ClassUnicode, reverse: bool, ) -> bool481     fn _add_char_class(
482         &mut self,
483         cls: &hir::ClassUnicode,
484         reverse: bool,
485     ) -> bool {
486         use std::char;
487 
488         if self.class_exceeds_limits(cls_char_count(cls)) {
489             return false;
490         }
491         let mut base = self.remove_complete();
492         if base.is_empty() {
493             base = vec![Literal::empty()];
494         }
495         for r in cls.iter() {
496             let (s, e) = (r.start as u32, r.end as u32 + 1);
497             for c in (s..e).filter_map(char::from_u32) {
498                 for mut lit in base.clone() {
499                     let mut bytes = c.to_string().into_bytes();
500                     if reverse {
501                         bytes.reverse();
502                     }
503                     lit.extend(&bytes);
504                     self.lits.push(lit);
505                 }
506             }
507         }
508         true
509     }
510 
511     /// Extends each literal in this set with the byte class given.
512     ///
513     /// Returns false if the byte class was too big to add.
add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool514     pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
515         if self.class_exceeds_limits(cls_byte_count(cls)) {
516             return false;
517         }
518         let mut base = self.remove_complete();
519         if base.is_empty() {
520             base = vec![Literal::empty()];
521         }
522         for r in cls.iter() {
523             let (s, e) = (r.start as u32, r.end as u32 + 1);
524             for b in (s..e).map(|b| b as u8) {
525                 for mut lit in base.clone() {
526                     lit.push(b);
527                     self.lits.push(lit);
528                 }
529             }
530         }
531         true
532     }
533 
534     /// Cuts every member of this set. When a member is cut, it can never
535     /// be extended.
cut(&mut self)536     pub fn cut(&mut self) {
537         for lit in &mut self.lits {
538             lit.cut();
539         }
540     }
541 
542     /// Reverses all members in place.
reverse(&mut self)543     pub fn reverse(&mut self) {
544         for lit in &mut self.lits {
545             lit.reverse();
546         }
547     }
548 
549     /// Clears this set of all members.
clear(&mut self)550     pub fn clear(&mut self) {
551         self.lits.clear();
552     }
553 
554     /// Pops all complete literals out of this set.
remove_complete(&mut self) -> Vec<Literal>555     fn remove_complete(&mut self) -> Vec<Literal> {
556         let mut base = vec![];
557         for lit in mem::replace(&mut self.lits, vec![]) {
558             if lit.is_cut() {
559                 self.lits.push(lit);
560             } else {
561                 base.push(lit);
562             }
563         }
564         base
565     }
566 
567     /// Returns the total number of bytes in this set.
num_bytes(&self) -> usize568     fn num_bytes(&self) -> usize {
569         self.lits.iter().fold(0, |accum, lit| accum + lit.len())
570     }
571 
572     /// Returns true if a character class with the given size would cause this
573     /// set to exceed its limits.
574     ///
575     /// The size given should correspond to the number of items in the class.
class_exceeds_limits(&self, size: usize) -> bool576     fn class_exceeds_limits(&self, size: usize) -> bool {
577         if size > self.limit_class {
578             return true;
579         }
580         // This is an approximation since codepoints in a char class can encode
581         // to 1-4 bytes.
582         let new_byte_count =
583             if self.lits.is_empty() {
584                 size
585             } else {
586                 self.lits
587                     .iter()
588                     .fold(0, |accum, lit| {
589                         accum + if lit.is_cut() {
590                             // If the literal is cut, then we'll never add
591                             // anything to it, so don't count it.
592                             0
593                         } else {
594                             (lit.len() + 1) * size
595                         }
596                     })
597             };
598         new_byte_count > self.limit_size
599     }
600 }
601 
prefixes(expr: &Hir, lits: &mut Literals)602 fn prefixes(expr: &Hir, lits: &mut Literals) {
603     match *expr.kind() {
604         HirKind::Literal(hir::Literal::Unicode(c)) => {
605             let mut buf = [0; 4];
606             lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
607         }
608         HirKind::Literal(hir::Literal::Byte(b)) => {
609             lits.cross_add(&[b]);
610         }
611         HirKind::Class(hir::Class::Unicode(ref cls)) => {
612             if !lits.add_char_class(cls) {
613                 lits.cut();
614             }
615         }
616         HirKind::Class(hir::Class::Bytes(ref cls)) => {
617             if !lits.add_byte_class(cls) {
618                 lits.cut();
619             }
620         }
621         HirKind::Group(hir::Group { ref hir, .. }) => {
622             prefixes(&**hir, lits);
623         }
624         HirKind::Repetition(ref x) => {
625             match x.kind {
626                 hir::RepetitionKind::ZeroOrOne => {
627                     repeat_zero_or_one_literals(&x.hir, lits, prefixes);
628                 }
629                 hir::RepetitionKind::ZeroOrMore => {
630                     repeat_zero_or_more_literals(&x.hir, lits, prefixes);
631                 }
632                 hir::RepetitionKind::OneOrMore => {
633                     repeat_one_or_more_literals(&x.hir, lits, prefixes);
634                 }
635                 hir::RepetitionKind::Range(ref rng) => {
636                     let (min, max) = match *rng {
637                         hir::RepetitionRange::Exactly(m) => {
638                             (m, Some(m))
639                         }
640                         hir::RepetitionRange::AtLeast(m) => {
641                             (m, None)
642                         }
643                         hir::RepetitionRange::Bounded(m, n) => {
644                             (m, Some(n))
645                         }
646                     };
647                     repeat_range_literals(
648                         &x.hir, min, max, x.greedy, lits, prefixes)
649                 }
650             }
651         }
652         HirKind::Concat(ref es) if es.is_empty() => {}
653         HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
654         HirKind::Concat(ref es) => {
655             for e in es {
656                 if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
657                     if !lits.is_empty() {
658                         lits.cut();
659                         break;
660                     }
661                     lits.add(Literal::empty());
662                     continue;
663                 }
664                 let mut lits2 = lits.to_empty();
665                 prefixes(e, &mut lits2);
666                 if !lits.cross_product(&lits2) || !lits2.any_complete() {
667                     // If this expression couldn't yield any literal that
668                     // could be extended, then we need to quit. Since we're
669                     // short-circuiting, we also need to freeze every member.
670                     lits.cut();
671                     break;
672                 }
673             }
674         }
675         HirKind::Alternation(ref es) => {
676             alternate_literals(es, lits, prefixes);
677         }
678         _ => lits.cut(),
679     }
680 }
681 
suffixes(expr: &Hir, lits: &mut Literals)682 fn suffixes(expr: &Hir, lits: &mut Literals) {
683     match *expr.kind() {
684         HirKind::Literal(hir::Literal::Unicode(c)) => {
685             let mut buf = [0u8; 4];
686             let i = c.encode_utf8(&mut buf).len();
687             let mut buf = &mut buf[..i];
688             buf.reverse();
689             lits.cross_add(buf);
690         }
691         HirKind::Literal(hir::Literal::Byte(b)) => {
692             lits.cross_add(&[b]);
693         }
694         HirKind::Class(hir::Class::Unicode(ref cls)) => {
695             if !lits.add_char_class_reverse(cls) {
696                 lits.cut();
697             }
698         }
699         HirKind::Class(hir::Class::Bytes(ref cls)) => {
700             if !lits.add_byte_class(cls) {
701                 lits.cut();
702             }
703         }
704         HirKind::Group(hir::Group { ref hir, .. }) => {
705             suffixes(&**hir, lits);
706         }
707         HirKind::Repetition(ref x) => {
708             match x.kind {
709                 hir::RepetitionKind::ZeroOrOne => {
710                     repeat_zero_or_one_literals(&x.hir, lits, suffixes);
711                 }
712                 hir::RepetitionKind::ZeroOrMore => {
713                     repeat_zero_or_more_literals(&x.hir, lits, suffixes);
714                 }
715                 hir::RepetitionKind::OneOrMore => {
716                     repeat_one_or_more_literals(&x.hir, lits, suffixes);
717                 }
718                 hir::RepetitionKind::Range(ref rng) => {
719                     let (min, max) = match *rng {
720                         hir::RepetitionRange::Exactly(m) => {
721                             (m, Some(m))
722                         }
723                         hir::RepetitionRange::AtLeast(m) => {
724                             (m, None)
725                         }
726                         hir::RepetitionRange::Bounded(m, n) => {
727                             (m, Some(n))
728                         }
729                     };
730                     repeat_range_literals(
731                         &x.hir, min, max, x.greedy, lits, suffixes)
732                 }
733             }
734         }
735         HirKind::Concat(ref es) if es.is_empty() => {}
736         HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
737         HirKind::Concat(ref es) => {
738             for e in es.iter().rev() {
739                 if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
740                     if !lits.is_empty() {
741                         lits.cut();
742                         break;
743                     }
744                     lits.add(Literal::empty());
745                     continue;
746                 }
747                 let mut lits2 = lits.to_empty();
748                 suffixes(e, &mut lits2);
749                 if !lits.cross_product(&lits2) || !lits2.any_complete() {
750                     // If this expression couldn't yield any literal that
751                     // could be extended, then we need to quit. Since we're
752                     // short-circuiting, we also need to freeze every member.
753                     lits.cut();
754                     break;
755                 }
756             }
757         }
758         HirKind::Alternation(ref es) => {
759             alternate_literals(es, lits, suffixes);
760         }
761         _ => lits.cut(),
762     }
763 }
764 
repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )765 fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
766     e: &Hir,
767     lits: &mut Literals,
768     mut f: F,
769 ) {
770     let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
771     lits3.set_limit_size(lits.limit_size() / 2);
772     f(e, &mut lits3);
773 
774     if lits3.is_empty() || !lits2.cross_product(&lits3) {
775         lits.cut();
776         return;
777     }
778     lits2.add(Literal::empty());
779     if !lits.union(lits2) {
780         lits.cut();
781     }
782 }
783 
repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )784 fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
785     e: &Hir,
786     lits: &mut Literals,
787     mut f: F,
788 ) {
789     let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
790     lits3.set_limit_size(lits.limit_size() / 2);
791     f(e, &mut lits3);
792 
793     if lits3.is_empty() || !lits2.cross_product(&lits3) {
794         lits.cut();
795         return;
796     }
797     lits2.cut();
798     lits2.add(Literal::empty());
799     if !lits.union(lits2) {
800         lits.cut();
801     }
802 }
803 
repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )804 fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
805     e: &Hir,
806     lits: &mut Literals,
807     mut f: F,
808 ) {
809     f(e, lits);
810     lits.cut();
811 }
812 
repeat_range_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, min: u32, max: Option<u32>, greedy: bool, lits: &mut Literals, mut f: F, )813 fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
814     e: &Hir,
815     min: u32,
816     max: Option<u32>,
817     greedy: bool,
818     lits: &mut Literals,
819     mut f: F,
820 ) {
821     if min == 0 {
822         // This is a bit conservative. If `max` is set, then we could
823         // treat this as a finite set of alternations. For now, we
824         // just treat it as `e*`.
825         f(&Hir::repetition(hir::Repetition {
826             kind: hir::RepetitionKind::ZeroOrMore,
827             greedy: greedy,
828             hir: Box::new(e.clone()),
829         }), lits);
830     } else {
831         if min > 0 {
832             let n = cmp::min(lits.limit_size, min as usize);
833             let es = iter::repeat(e.clone()).take(n).collect();
834             f(&Hir::concat(es), lits);
835             if n < min as usize || lits.contains_empty() {
836                 lits.cut();
837             }
838         }
839         if max.map_or(true, |max| min < max) {
840             lits.cut();
841         }
842     }
843 }
844 
alternate_literals<F: FnMut(&Hir, &mut Literals)>( es: &[Hir], lits: &mut Literals, mut f: F, )845 fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
846     es: &[Hir],
847     lits: &mut Literals,
848     mut f: F,
849 ) {
850     let mut lits2 = lits.to_empty();
851     for e in es {
852         let mut lits3 = lits.to_empty();
853         lits3.set_limit_size(lits.limit_size() / 5);
854         f(e, &mut lits3);
855         if lits3.is_empty() || !lits2.union(lits3) {
856             // If we couldn't find suffixes for *any* of the
857             // alternates, then the entire alternation has to be thrown
858             // away and any existing members must be frozen. Similarly,
859             // if the union couldn't complete, stop and freeze.
860             lits.cut();
861             return;
862         }
863     }
864     if !lits.cross_product(&lits2) {
865         lits.cut();
866     }
867 }
868 
869 impl fmt::Debug for Literals {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result870     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
871         f.debug_struct("Literals")
872          .field("lits", &self.lits)
873          .field("limit_size", &self.limit_size)
874          .field("limit_class", &self.limit_class)
875          .finish()
876     }
877 }
878 
879 impl Literal {
880     /// Returns a new complete literal with the bytes given.
new(bytes: Vec<u8>) -> Literal881     pub fn new(bytes: Vec<u8>) -> Literal {
882         Literal { v: bytes, cut: false }
883     }
884 
885     /// Returns a new complete empty literal.
empty() -> Literal886     pub fn empty() -> Literal {
887         Literal { v: vec![], cut: false }
888     }
889 
890     /// Returns true if this literal was "cut."
is_cut(&self) -> bool891     pub fn is_cut(&self) -> bool {
892         self.cut
893     }
894 
895     /// Cuts this literal.
cut(&mut self)896     pub fn cut(&mut self) {
897         self.cut = true;
898     }
899 }
900 
901 impl PartialEq for Literal {
eq(&self, other: &Literal) -> bool902     fn eq(&self, other: &Literal) -> bool {
903         self.v == other.v
904     }
905 }
906 
907 impl PartialOrd for Literal {
partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering>908     fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
909         self.v.partial_cmp(&other.v)
910     }
911 }
912 
913 impl fmt::Debug for Literal {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result914     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
915         if self.is_cut() {
916             write!(f, "Cut({})", escape_unicode(&self.v))
917         } else {
918             write!(f, "Complete({})", escape_unicode(&self.v))
919         }
920     }
921 }
922 
923 impl AsRef<[u8]> for Literal {
as_ref(&self) -> &[u8]924     fn as_ref(&self) -> &[u8] { &self.v }
925 }
926 
927 impl ops::Deref for Literal {
928     type Target = Vec<u8>;
deref(&self) -> &Vec<u8>929     fn deref(&self) -> &Vec<u8> { &self.v }
930 }
931 
932 impl ops::DerefMut for Literal {
deref_mut(&mut self) -> &mut Vec<u8>933     fn deref_mut(&mut self) -> &mut Vec<u8> { &mut self.v }
934 }
935 
position(needle: &[u8], mut haystack: &[u8]) -> Option<usize>936 fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
937     let mut i = 0;
938     while haystack.len() >= needle.len() {
939         if needle == &haystack[..needle.len()] {
940             return Some(i);
941         }
942         i += 1;
943         haystack = &haystack[1..];
944     }
945     None
946 }
947 
escape_unicode(bytes: &[u8]) -> String948 fn escape_unicode(bytes: &[u8]) -> String {
949     let show = match ::std::str::from_utf8(bytes) {
950         Ok(v) => v.to_string(),
951         Err(_) => escape_bytes(bytes),
952     };
953     let mut space_escaped = String::new();
954     for c in show.chars() {
955         if c.is_whitespace() {
956             let escaped = if c as u32 <= 0x7F {
957                 escape_byte(c as u8)
958             } else {
959                 if c as u32 <= 0xFFFF {
960                     format!(r"\u{{{:04x}}}", c as u32)
961                 } else {
962                     format!(r"\U{{{:08x}}}", c as u32)
963                 }
964             };
965             space_escaped.push_str(&escaped);
966         } else {
967             space_escaped.push(c);
968         }
969     }
970     space_escaped
971 }
972 
escape_bytes(bytes: &[u8]) -> String973 fn escape_bytes(bytes: &[u8]) -> String {
974     let mut s = String::new();
975     for &b in bytes {
976         s.push_str(&escape_byte(b));
977     }
978     s
979 }
980 
escape_byte(byte: u8) -> String981 fn escape_byte(byte: u8) -> String {
982     use std::ascii::escape_default;
983 
984     let escaped: Vec<u8> = escape_default(byte).collect();
985     String::from_utf8_lossy(&escaped).into_owned()
986 }
987 
cls_char_count(cls: &hir::ClassUnicode) -> usize988 fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
989     cls.iter()
990         .map(|&r| 1 + (r.end as u32) - (r.start as u32))
991         .sum::<u32>() as usize
992 }
993 
cls_byte_count(cls: &hir::ClassBytes) -> usize994 fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
995     cls.iter()
996         .map(|&r| 1 + (r.end as u32) - (r.start as u32))
997         .sum::<u32>() as usize
998 }
999 
1000 #[cfg(test)]
1001 mod tests {
1002     use std::fmt;
1003 
1004     use ParserBuilder;
1005     use hir::Hir;
1006     use super::{Literals, Literal, escape_bytes};
1007 
1008     // To make test failures easier to read.
1009     #[derive(Debug, Eq, PartialEq)]
1010     struct Bytes(Vec<ULiteral>);
1011     #[derive(Debug, Eq, PartialEq)]
1012     struct Unicode(Vec<ULiteral>);
1013 
escape_lits(blits: &[Literal]) -> Vec<ULiteral>1014     fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
1015         let mut ulits = vec![];
1016         for blit in blits {
1017             ulits.push(ULiteral {
1018                 v: escape_bytes(&blit),
1019                 cut: blit.is_cut(),
1020             });
1021         }
1022         ulits
1023     }
1024 
create_lits<I: IntoIterator<Item=Literal>>(it: I) -> Literals1025     fn create_lits<I: IntoIterator<Item=Literal>>(it: I) -> Literals {
1026         Literals {
1027             lits: it.into_iter().collect(),
1028             limit_size: 0,
1029             limit_class: 0,
1030         }
1031     }
1032 
1033     // Needs to be pub for 1.3?
1034     #[derive(Clone, Eq, PartialEq)]
1035     pub struct ULiteral {
1036         v: String,
1037         cut: bool,
1038     }
1039 
1040     impl ULiteral {
is_cut(&self) -> bool1041         fn is_cut(&self) -> bool { self.cut }
1042     }
1043 
1044     impl fmt::Debug for ULiteral {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1045         fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1046             if self.is_cut() {
1047                 write!(f, "Cut({})", self.v)
1048             } else {
1049                 write!(f, "Complete({})", self.v)
1050             }
1051         }
1052     }
1053 
1054     impl PartialEq<Literal> for ULiteral {
eq(&self, other: &Literal) -> bool1055         fn eq(&self, other: &Literal) -> bool {
1056             self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
1057         }
1058     }
1059 
1060     impl PartialEq<ULiteral> for Literal {
eq(&self, other: &ULiteral) -> bool1061         fn eq(&self, other: &ULiteral) -> bool {
1062             &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
1063         }
1064     }
1065 
1066     #[allow(non_snake_case)]
C(s: &'static str) -> ULiteral1067     fn C(s: &'static str) -> ULiteral {
1068         ULiteral { v: s.to_owned(), cut: true }
1069     }
1070     #[allow(non_snake_case)]
M(s: &'static str) -> ULiteral1071     fn M(s: &'static str) -> ULiteral {
1072         ULiteral { v: s.to_owned(), cut: false }
1073     }
1074 
prefixes(lits: &mut Literals, expr: &Hir)1075     fn prefixes(lits: &mut Literals, expr: &Hir) {
1076         lits.union_prefixes(expr);
1077     }
1078 
suffixes(lits: &mut Literals, expr: &Hir)1079     fn suffixes(lits: &mut Literals, expr: &Hir) {
1080         lits.union_suffixes(expr);
1081     }
1082 
1083     macro_rules! assert_lit_eq {
1084         ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
1085             let expected: Vec<ULiteral> = vec![$($expected_lit),*];
1086             let lits = $got_lits;
1087             assert_eq!(
1088                 $which(expected.clone()),
1089                 $which(escape_lits(lits.literals())));
1090             assert_eq!(
1091                 !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
1092                 lits.all_complete());
1093             assert_eq!(
1094                 expected.iter().any(|l| !l.is_cut()),
1095                 lits.any_complete());
1096         }};
1097     }
1098 
1099     macro_rules! test_lit {
1100         ($name:ident, $which:ident, $re:expr) => {
1101             test_lit!($name, $which, $re,);
1102         };
1103         ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1104             #[test]
1105             fn $name() {
1106                 let expr = ParserBuilder::new()
1107                     .build()
1108                     .parse($re)
1109                     .unwrap();
1110                 let lits = Literals::$which(&expr);
1111                 assert_lit_eq!(Unicode, lits, $($lit),*);
1112 
1113                 let expr = ParserBuilder::new()
1114                     .allow_invalid_utf8(true)
1115                     .unicode(false)
1116                     .build()
1117                     .parse($re)
1118                     .unwrap();
1119                 let lits = Literals::$which(&expr);
1120                 assert_lit_eq!(Bytes, lits, $($lit),*);
1121             }
1122         };
1123     }
1124 
1125     // ************************************************************************
1126     // Tests for prefix literal extraction.
1127     // ************************************************************************
1128 
1129     // Elementary tests.
1130     test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
1131     test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
1132     test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1133     test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1134     test_lit!(pfx_class1, prefixes, "[1-4]",
1135               M("1"), M("2"), M("3"), M("4"));
1136     test_lit!(pfx_class2, prefixes, "(?u)[☃Ⅰ]",
1137               M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
1138     test_lit!(pfx_class3, prefixes, "(?ui)[☃Ⅰ]",
1139               M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
1140               M("\\xe2\\x98\\x83"));
1141     test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a",
1142               M("A"), M("a"));
1143     test_lit!(pfx_one_lit_casei2, prefixes, "(?i)abc",
1144               M("ABC"), M("aBC"), M("AbC"), M("abC"),
1145               M("ABc"), M("aBc"), M("Abc"), M("abc"));
1146     test_lit!(pfx_group1, prefixes, "(a)", M("a"));
1147     test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
1148     test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
1149     test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
1150     test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
1151     test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
1152     test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
1153     test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
1154     test_lit!(pfx_rep_range1, prefixes, "a{0}");
1155     test_lit!(pfx_rep_range2, prefixes, "a{0,}");
1156     test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
1157     test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
1158     test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
1159     test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
1160     test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
1161 
1162     // Test regexes with concatenations.
1163     test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
1164     test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
1165     test_lit!(pfx_cat3, prefixes, "(?i)[ab]z",
1166               M("AZ"), M("BZ"), M("aZ"), M("bZ"),
1167               M("Az"), M("Bz"), M("az"), M("bz"));
1168     test_lit!(pfx_cat4, prefixes, "[ab][yz]",
1169               M("ay"), M("by"), M("az"), M("bz"));
1170     test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
1171     test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
1172     test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
1173     test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
1174     test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
1175     test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
1176     test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
1177     test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
1178     test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
1179     test_lit!(pfx_cat14, prefixes, "a^", C("a"));
1180     test_lit!(pfx_cat15, prefixes, "$a");
1181     test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
1182     test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
1183     test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
1184     test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
1185 
1186     // Test regexes with alternations.
1187     test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
1188     test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1189     test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1190     test_lit!(pfx_alt4, prefixes, "a|b*");
1191     test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
1192     test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
1193     test_lit!(pfx_alt7, prefixes, "(a|b)*c|(a|ab)*c",
1194               C("a"), C("b"), M("c"), C("a"), C("ab"), M("c"));
1195     test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
1196 
1197     // Test regexes with empty assertions.
1198     test_lit!(pfx_empty1, prefixes, "^a", M("a"));
1199     test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
1200     test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
1201     test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
1202 
1203     // Make sure some curious regexes have no prefixes.
1204     test_lit!(pfx_nothing1, prefixes, ".");
1205     test_lit!(pfx_nothing2, prefixes, "(?s).");
1206     test_lit!(pfx_nothing3, prefixes, "^");
1207     test_lit!(pfx_nothing4, prefixes, "$");
1208     test_lit!(pfx_nothing6, prefixes, "(?m)$");
1209     test_lit!(pfx_nothing7, prefixes, r"\b");
1210     test_lit!(pfx_nothing8, prefixes, r"\B");
1211 
1212     // Test a few regexes that defeat any prefix literal detection.
1213     test_lit!(pfx_defeated1, prefixes, ".a");
1214     test_lit!(pfx_defeated2, prefixes, "(?s).a");
1215     test_lit!(pfx_defeated3, prefixes, "a*b*c*");
1216     test_lit!(pfx_defeated4, prefixes, "a|.");
1217     test_lit!(pfx_defeated5, prefixes, ".|a");
1218     test_lit!(pfx_defeated6, prefixes, "a|^");
1219     test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
1220     test_lit!(pfx_defeated8, prefixes, "$a");
1221     test_lit!(pfx_defeated9, prefixes, "(?m)$a");
1222     test_lit!(pfx_defeated10, prefixes, r"\ba");
1223     test_lit!(pfx_defeated11, prefixes, r"\Ba");
1224     test_lit!(pfx_defeated12, prefixes, "^*a");
1225     test_lit!(pfx_defeated13, prefixes, "^+a");
1226 
1227     test_lit!(
1228         pfx_crazy1,
1229         prefixes,
1230         r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1231         C("Mo\\'am"), C("Mu\\'am"), C("Moam"), C("Muam"));
1232 
1233     // ************************************************************************
1234     // Tests for quiting prefix literal search.
1235     // ************************************************************************
1236 
1237     macro_rules! test_exhausted {
1238         ($name:ident, $which:ident, $re:expr) => {
1239             test_exhausted!($name, $which, $re,);
1240         };
1241         ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1242             #[test]
1243             fn $name() {
1244                 let expr = ParserBuilder::new()
1245                     .build()
1246                     .parse($re)
1247                     .unwrap();
1248                 let mut lits = Literals::empty();
1249                 lits.set_limit_size(20).set_limit_class(10);
1250                 $which(&mut lits, &expr);
1251                 assert_lit_eq!(Unicode, lits, $($lit),*);
1252 
1253                 let expr = ParserBuilder::new()
1254                     .allow_invalid_utf8(true)
1255                     .unicode(false)
1256                     .build()
1257                     .parse($re)
1258                     .unwrap();
1259                 let mut lits = Literals::empty();
1260                 lits.set_limit_size(20).set_limit_class(10);
1261                 $which(&mut lits, &expr);
1262                 assert_lit_eq!(Bytes, lits, $($lit),*);
1263             }
1264         };
1265     }
1266 
1267     // These test use a much lower limit than the default so that we can
1268     // write test cases of reasonable size.
1269     test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
1270     test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
1271     test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
1272     test_exhausted!(pfx_exhausted4, prefixes, "(?i)foobar",
1273                     C("FO"), C("fO"), C("Fo"), C("fo"));
1274     test_exhausted!(pfx_exhausted5, prefixes, "(?:ab){100}",
1275                     C("abababababababababab"));
1276     test_exhausted!(pfx_exhausted6, prefixes, "(?:(?:ab){100})*cd",
1277                     C("ababababab"), M("cd"));
1278     test_exhausted!(pfx_exhausted7, prefixes, "z(?:(?:ab){100})*cd",
1279                     C("zababababab"), M("zcd"));
1280     test_exhausted!(pfx_exhausted8, prefixes, "aaaaaaaaaaaaaaaaaaaaz",
1281                     C("aaaaaaaaaaaaaaaaaaaa"));
1282 
1283     // ************************************************************************
1284     // Tests for suffix literal extraction.
1285     // ************************************************************************
1286 
1287     // Elementary tests.
1288     test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
1289     test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
1290     test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1291     test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1292     test_lit!(sfx_class1, suffixes, "[1-4]",
1293               M("1"), M("2"), M("3"), M("4"));
1294     test_lit!(sfx_class2, suffixes, "(?u)[☃Ⅰ]",
1295               M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
1296     test_lit!(sfx_class3, suffixes, "(?ui)[☃Ⅰ]",
1297               M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
1298               M("\\xe2\\x98\\x83"));
1299     test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a",
1300               M("A"), M("a"));
1301     test_lit!(sfx_one_lit_casei2, suffixes, "(?i)abc",
1302               M("ABC"), M("ABc"), M("AbC"), M("Abc"),
1303               M("aBC"), M("aBc"), M("abC"), M("abc"));
1304     test_lit!(sfx_group1, suffixes, "(a)", M("a"));
1305     test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
1306     test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
1307     test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
1308     test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
1309     test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
1310     test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
1311     test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
1312     test_lit!(sfx_rep_range1, suffixes, "a{0}");
1313     test_lit!(sfx_rep_range2, suffixes, "a{0,}");
1314     test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
1315     test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
1316     test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
1317     test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
1318     test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
1319 
1320     // Test regexes with concatenations.
1321     test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
1322     test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
1323     test_lit!(sfx_cat3, suffixes, "(?i)[ab]z",
1324               M("AZ"), M("Az"), M("BZ"), M("Bz"),
1325               M("aZ"), M("az"), M("bZ"), M("bz"));
1326     test_lit!(sfx_cat4, suffixes, "[ab][yz]",
1327               M("ay"), M("az"), M("by"), M("bz"));
1328     test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
1329     test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
1330     test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
1331     test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
1332     test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
1333     test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
1334     test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
1335     test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
1336     test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
1337     test_lit!(sfx_cat14, suffixes, "a^");
1338     test_lit!(sfx_cat15, suffixes, "$a", C("a"));
1339     test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
1340     test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
1341     test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
1342     test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
1343 
1344     // Test regexes with alternations.
1345     test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
1346     test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1347     test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1348     test_lit!(sfx_alt4, suffixes, "a|b*");
1349     test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
1350     test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
1351     test_lit!(sfx_alt7, suffixes, "(a|b)*c|(a|ab)*c",
1352               C("ac"), C("bc"), M("c"), C("ac"), C("abc"), M("c"));
1353     test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
1354 
1355     // Test regexes with empty assertions.
1356     test_lit!(sfx_empty1, suffixes, "a$", M("a"));
1357     test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
1358 
1359     // Make sure some curious regexes have no suffixes.
1360     test_lit!(sfx_nothing1, suffixes, ".");
1361     test_lit!(sfx_nothing2, suffixes, "(?s).");
1362     test_lit!(sfx_nothing3, suffixes, "^");
1363     test_lit!(sfx_nothing4, suffixes, "$");
1364     test_lit!(sfx_nothing6, suffixes, "(?m)$");
1365     test_lit!(sfx_nothing7, suffixes, r"\b");
1366     test_lit!(sfx_nothing8, suffixes, r"\B");
1367 
1368     // Test a few regexes that defeat any suffix literal detection.
1369     test_lit!(sfx_defeated1, suffixes, "a.");
1370     test_lit!(sfx_defeated2, suffixes, "(?s)a.");
1371     test_lit!(sfx_defeated3, suffixes, "a*b*c*");
1372     test_lit!(sfx_defeated4, suffixes, "a|.");
1373     test_lit!(sfx_defeated5, suffixes, ".|a");
1374     test_lit!(sfx_defeated6, suffixes, "a|^");
1375     test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
1376     test_lit!(sfx_defeated8, suffixes, "a^");
1377     test_lit!(sfx_defeated9, suffixes, "(?m)a$");
1378     test_lit!(sfx_defeated10, suffixes, r"a\b");
1379     test_lit!(sfx_defeated11, suffixes, r"a\B");
1380     test_lit!(sfx_defeated12, suffixes, "a^*");
1381     test_lit!(sfx_defeated13, suffixes, "a^+");
1382 
1383     // These test use a much lower limit than the default so that we can
1384     // write test cases of reasonable size.
1385     test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
1386     test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
1387     test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
1388     test_exhausted!(sfx_exhausted4, suffixes, "(?i)foobar",
1389                     C("AR"), C("Ar"), C("aR"), C("ar"));
1390     test_exhausted!(sfx_exhausted5, suffixes, "(?:ab){100}",
1391                     C("abababababababababab"));
1392     test_exhausted!(sfx_exhausted6, suffixes, "cd(?:(?:ab){100})*",
1393                     C("ababababab"), M("cd"));
1394     test_exhausted!(sfx_exhausted7, suffixes, "cd(?:(?:ab){100})*z",
1395                     C("abababababz"), M("cdz"));
1396     test_exhausted!(sfx_exhausted8, suffixes, "zaaaaaaaaaaaaaaaaaaaa",
1397                     C("aaaaaaaaaaaaaaaaaaaa"));
1398 
1399     // ************************************************************************
1400     // Tests for generating unambiguous literal sets.
1401     // ************************************************************************
1402 
1403     macro_rules! test_unamb {
1404         ($name:ident, $given:expr, $expected:expr) => {
1405             #[test]
1406             fn $name() {
1407                 let given: Vec<Literal> =
1408                     $given
1409                     .into_iter()
1410                     .map(|ul| {
1411                         let cut = ul.is_cut();
1412                         Literal { v: ul.v.into_bytes(), cut: cut }
1413                     })
1414                     .collect();
1415                 let lits = create_lits(given);
1416                 let got = lits.unambiguous_prefixes();
1417                 assert_eq!($expected, escape_lits(got.literals()));
1418             }
1419         };
1420     }
1421 
1422     test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
1423     test_unamb!(unambiguous2,
1424                 vec![M("zaaaaaa"), M("aa")], vec![C("aa"), C("z")]);
1425     test_unamb!(unambiguous3,
1426                 vec![M("Sherlock"), M("Watson")],
1427                 vec![M("Sherlock"), M("Watson")]);
1428     test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
1429     test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
1430     test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
1431     test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
1432     test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
1433     test_unamb!(unambiguous9,
1434                 vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1435                 vec![C("a"), C("b"), C("c")]);
1436     test_unamb!(unambiguous10,
1437                 vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1438                 vec![C("Mo"), C("Mu")]);
1439     test_unamb!(unambiguous11,
1440                 vec![M("zazb"), M("azb")], vec![C("a"), C("z")]);
1441     test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
1442     test_unamb!(unambiguous13,
1443                 vec![M("ABCX"), M("CDAX"), M("BCX")],
1444                 vec![C("A"), C("BCX"), C("CD")]);
1445     test_unamb!(unambiguous14,
1446                 vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1447                 vec![M("DSX"), C("I"), C("MGX"), C("MV")]);
1448     test_unamb!(unambiguous15,
1449                 vec![M("IMG_"), M("MG_"), M("CIMG")],
1450                 vec![C("C"), C("I"), C("MG_")]);
1451 
1452 
1453     // ************************************************************************
1454     // Tests for suffix trimming.
1455     // ************************************************************************
1456     macro_rules! test_trim {
1457         ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
1458             #[test]
1459             fn $name() {
1460                 let given: Vec<Literal> =
1461                     $given
1462                     .into_iter()
1463                     .map(|ul| {
1464                         let cut = ul.is_cut();
1465                         Literal { v: ul.v.into_bytes(), cut: cut }
1466                     })
1467                     .collect();
1468                 let lits = create_lits(given);
1469                 let got = lits.trim_suffix($trim).unwrap();
1470                 assert_eq!($expected, escape_lits(got.literals()));
1471             }
1472         }
1473     }
1474 
1475     test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
1476     test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
1477     test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
1478     test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
1479 
1480     // ************************************************************************
1481     // Tests for longest common prefix.
1482     // ************************************************************************
1483 
1484     macro_rules! test_lcp {
1485         ($name:ident, $given:expr, $expected:expr) => {
1486             #[test]
1487             fn $name() {
1488                 let given: Vec<Literal> =
1489                     $given
1490                     .into_iter()
1491                     .map(|s: &str| Literal {
1492                         v: s.to_owned().into_bytes(),
1493                         cut: false,
1494                     })
1495                     .collect();
1496                 let lits = create_lits(given);
1497                 let got = lits.longest_common_prefix();
1498                 assert_eq!($expected, escape_bytes(got));
1499             }
1500         };
1501     }
1502 
1503     test_lcp!(lcp1, vec!["a"], "a");
1504     test_lcp!(lcp2, vec![], "");
1505     test_lcp!(lcp3, vec!["a", "b"], "");
1506     test_lcp!(lcp4, vec!["ab", "ab"], "ab");
1507     test_lcp!(lcp5, vec!["ab", "a"], "a");
1508     test_lcp!(lcp6, vec!["a", "ab"], "a");
1509     test_lcp!(lcp7, vec!["ab", "b"], "");
1510     test_lcp!(lcp8, vec!["b", "ab"], "");
1511     test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
1512     test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
1513     test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
1514     test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
1515 
1516     // ************************************************************************
1517     // Tests for longest common suffix.
1518     // ************************************************************************
1519 
1520     macro_rules! test_lcs {
1521         ($name:ident, $given:expr, $expected:expr) => {
1522             #[test]
1523             fn $name() {
1524                 let given: Vec<Literal> =
1525                     $given
1526                     .into_iter()
1527                     .map(|s: &str| Literal {
1528                         v: s.to_owned().into_bytes(),
1529                         cut: false,
1530                     })
1531                     .collect();
1532                 let lits = create_lits(given);
1533                 let got = lits.longest_common_suffix();
1534                 assert_eq!($expected, escape_bytes(got));
1535             }
1536         };
1537     }
1538 
1539     test_lcs!(lcs1, vec!["a"], "a");
1540     test_lcs!(lcs2, vec![], "");
1541     test_lcs!(lcs3, vec!["a", "b"], "");
1542     test_lcs!(lcs4, vec!["ab", "ab"], "ab");
1543     test_lcs!(lcs5, vec!["ab", "a"], "");
1544     test_lcs!(lcs6, vec!["a", "ab"], "");
1545     test_lcs!(lcs7, vec!["ab", "b"], "b");
1546     test_lcs!(lcs8, vec!["b", "ab"], "b");
1547     test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
1548     test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
1549     test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
1550     test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
1551 }
1552