1 // Copyright 2018 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 /*!
12 Provides routines for extracting literal prefixes and suffixes from an `Hir`.
13 */
14
15 use std::cmp;
16 use std::fmt;
17 use std::iter;
18 use std::mem;
19 use std::ops;
20
21 use hir::{self, Hir, HirKind};
22
23 /// A set of literal byte strings extracted from a regular expression.
24 ///
25 /// Every member of the set is a `Literal`, which is represented by a
26 /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
27 /// said to be either *complete* or *cut*. A complete literal means that
28 /// it extends until the beginning (or end) of the regular expression. In
29 /// some circumstances, this can be used to indicate a match in the regular
30 /// expression.
31 ///
32 /// A key aspect of literal extraction is knowing when to stop. It is not
33 /// feasible to blindly extract all literals from a regular expression, even if
34 /// there are finitely many. For example, the regular expression `[0-9]{10}`
35 /// has `10^10` distinct literals. For this reason, literal extraction is
36 /// bounded to some low number by default using heuristics, but the limits can
37 /// be tweaked.
38 ///
39 /// **WARNING**: Literal extraction uses stack space proportional to the size
40 /// of the `Hir` expression. At some point, this drawback will be eliminated.
41 /// To protect yourself, set a reasonable
42 /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
43 /// This is done for you by default.
44 #[derive(Clone, Eq, PartialEq)]
45 pub struct Literals {
46 lits: Vec<Literal>,
47 limit_size: usize,
48 limit_class: usize,
49 }
50
51 /// A single member of a set of literals extracted from a regular expression.
52 ///
53 /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
54 /// and `Vec` operations are available.
55 #[derive(Clone, Eq, Ord)]
56 pub struct Literal {
57 v: Vec<u8>,
58 cut: bool,
59 }
60
61 impl Literals {
62 /// Returns a new empty set of literals using default limits.
empty() -> Literals63 pub fn empty() -> Literals {
64 Literals {
65 lits: vec![],
66 limit_size: 250,
67 limit_class: 10,
68 }
69 }
70
71 /// Returns a set of literal prefixes extracted from the given `Hir`.
prefixes(expr: &Hir) -> Literals72 pub fn prefixes(expr: &Hir) -> Literals {
73 let mut lits = Literals::empty();
74 lits.union_prefixes(expr);
75 lits
76 }
77
78 /// Returns a set of literal suffixes extracted from the given `Hir`.
suffixes(expr: &Hir) -> Literals79 pub fn suffixes(expr: &Hir) -> Literals {
80 let mut lits = Literals::empty();
81 lits.union_suffixes(expr);
82 lits
83 }
84
85 /// Get the approximate size limit (in bytes) of this set.
limit_size(&self) -> usize86 pub fn limit_size(&self) -> usize {
87 self.limit_size
88 }
89
90 /// Set the approximate size limit (in bytes) of this set.
91 ///
92 /// If extracting a literal would put the set over this limit, then
93 /// extraction stops.
94 ///
95 /// The new limits will only apply to additions to this set. Existing
96 /// members remain unchanged, even if the set exceeds the new limit.
set_limit_size(&mut self, size: usize) -> &mut Literals97 pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
98 self.limit_size = size;
99 self
100 }
101
102 /// Get the character class size limit for this set.
limit_class(&self) -> usize103 pub fn limit_class(&self) -> usize {
104 self.limit_class
105 }
106
107 /// Limits the size of character(or byte) classes considered.
108 ///
109 /// A value of `0` prevents all character classes from being considered.
110 ///
111 /// This limit also applies to case insensitive literals, since each
112 /// character in the case insensitive literal is converted to a class, and
113 /// then case folded.
114 ///
115 /// The new limits will only apply to additions to this set. Existing
116 /// members remain unchanged, even if the set exceeds the new limit.
set_limit_class(&mut self, size: usize) -> &mut Literals117 pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
118 self.limit_class = size;
119 self
120 }
121
122 /// Returns the set of literals as a slice. Its order is unspecified.
literals(&self) -> &[Literal]123 pub fn literals(&self) -> &[Literal] {
124 &self.lits
125 }
126
127 /// Returns the length of the smallest literal.
128 ///
129 /// Returns None is there are no literals in the set.
min_len(&self) -> Option<usize>130 pub fn min_len(&self) -> Option<usize> {
131 let mut min = None;
132 for lit in &self.lits {
133 match min {
134 None => min = Some(lit.len()),
135 Some(m) if lit.len() < m => min = Some(lit.len()),
136 _ => {}
137 }
138 }
139 min
140 }
141
142 /// Returns true if all members in this set are complete.
all_complete(&self) -> bool143 pub fn all_complete(&self) -> bool {
144 !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
145 }
146
147 /// Returns true if any member in this set is complete.
any_complete(&self) -> bool148 pub fn any_complete(&self) -> bool {
149 self.lits.iter().any(|lit| !lit.is_cut())
150 }
151
152 /// Returns true if this set contains an empty literal.
contains_empty(&self) -> bool153 pub fn contains_empty(&self) -> bool {
154 self.lits.iter().any(|lit| lit.is_empty())
155 }
156
157 /// Returns true if this set is empty or if all of its members is empty.
is_empty(&self) -> bool158 pub fn is_empty(&self) -> bool {
159 self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
160 }
161
162 /// Returns a new empty set of literals using this set's limits.
to_empty(&self) -> Literals163 pub fn to_empty(&self) -> Literals {
164 let mut lits = Literals::empty();
165 lits.set_limit_size(self.limit_size)
166 .set_limit_class(self.limit_class);
167 lits
168 }
169
170 /// Returns the longest common prefix of all members in this set.
longest_common_prefix(&self) -> &[u8]171 pub fn longest_common_prefix(&self) -> &[u8] {
172 if self.is_empty() {
173 return &[];
174 }
175 let lit0 = &*self.lits[0];
176 let mut len = lit0.len();
177 for lit in &self.lits[1..] {
178 len = cmp::min(
179 len,
180 lit.iter()
181 .zip(lit0)
182 .take_while(|&(a, b)| a == b)
183 .count());
184 }
185 &self.lits[0][..len]
186 }
187
188 /// Returns the longest common suffix of all members in this set.
longest_common_suffix(&self) -> &[u8]189 pub fn longest_common_suffix(&self) -> &[u8] {
190 if self.is_empty() {
191 return &[];
192 }
193 let lit0 = &*self.lits[0];
194 let mut len = lit0.len();
195 for lit in &self.lits[1..] {
196 len = cmp::min(
197 len,
198 lit.iter()
199 .rev()
200 .zip(lit0.iter().rev())
201 .take_while(|&(a, b)| a == b)
202 .count());
203 }
204 &self.lits[0][self.lits[0].len() - len..]
205 }
206
207 /// Returns a new set of literals with the given number of bytes trimmed
208 /// from the suffix of each literal.
209 ///
210 /// If any literal would be cut out completely by trimming, then None is
211 /// returned.
212 ///
213 /// Any duplicates that are created as a result of this transformation are
214 /// removed.
trim_suffix(&self, num_bytes: usize) -> Option<Literals>215 pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
216 if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
217 return None;
218 }
219 let mut new = self.to_empty();
220 for mut lit in self.lits.iter().cloned() {
221 let new_len = lit.len() - num_bytes;
222 lit.truncate(new_len);
223 lit.cut();
224 new.lits.push(lit);
225 }
226 new.lits.sort();
227 new.lits.dedup();
228 Some(new)
229 }
230
231 /// Returns a new set of prefixes of this set of literals that are
232 /// guaranteed to be unambiguous.
233 ///
234 /// Any substring match with a member of the set is returned is guaranteed
235 /// to never overlap with a substring match of another member of the set
236 /// at the same starting position.
237 ///
238 /// Given any two members of the returned set, neither is a substring of
239 /// the other.
unambiguous_prefixes(&self) -> Literals240 pub fn unambiguous_prefixes(&self) -> Literals {
241 if self.lits.is_empty() {
242 return self.to_empty();
243 }
244 let mut old: Vec<Literal> = self.lits.iter().cloned().collect();
245 let mut new = self.to_empty();
246 'OUTER:
247 while let Some(mut candidate) = old.pop() {
248 if candidate.is_empty() {
249 continue;
250 }
251 if new.lits.is_empty() {
252 new.lits.push(candidate);
253 continue;
254 }
255 for lit2 in &mut new.lits {
256 if lit2.is_empty() {
257 continue;
258 }
259 if &candidate == lit2 {
260 // If the literal is already in the set, then we can
261 // just drop it. But make sure that cut literals are
262 // infectious!
263 candidate.cut = candidate.cut || lit2.cut;
264 lit2.cut = candidate.cut;
265 continue 'OUTER;
266 }
267 if candidate.len() < lit2.len() {
268 if let Some(i) = position(&candidate, &lit2) {
269 candidate.cut();
270 let mut lit3 = lit2.clone();
271 lit3.truncate(i);
272 lit3.cut();
273 old.push(lit3);
274 lit2.clear();
275 }
276 } else {
277 if let Some(i) = position(&lit2, &candidate) {
278 lit2.cut();
279 let mut new_candidate = candidate.clone();
280 new_candidate.truncate(i);
281 new_candidate.cut();
282 old.push(new_candidate);
283 candidate.clear();
284 }
285 }
286 // Oops, the candidate is already represented in the set.
287 if candidate.is_empty() {
288 continue 'OUTER;
289 }
290 }
291 new.lits.push(candidate);
292 }
293 new.lits.retain(|lit| !lit.is_empty());
294 new.lits.sort();
295 new.lits.dedup();
296 new
297 }
298
299 /// Returns a new set of suffixes of this set of literals that are
300 /// guaranteed to be unambiguous.
301 ///
302 /// Any substring match with a member of the set is returned is guaranteed
303 /// to never overlap with a substring match of another member of the set
304 /// at the same ending position.
305 ///
306 /// Given any two members of the returned set, neither is a substring of
307 /// the other.
unambiguous_suffixes(&self) -> Literals308 pub fn unambiguous_suffixes(&self) -> Literals {
309 // This is a touch wasteful...
310 let mut lits = self.clone();
311 lits.reverse();
312 let mut unamb = lits.unambiguous_prefixes();
313 unamb.reverse();
314 unamb
315 }
316
317 /// Unions the prefixes from the given expression to this set.
318 ///
319 /// If prefixes could not be added (for example, this set would exceed its
320 /// size limits or the set of prefixes from `expr` includes the empty
321 /// string), then false is returned.
322 ///
323 /// Note that prefix literals extracted from `expr` are said to be complete
324 /// if and only if the literal extends from the beginning of `expr` to the
325 /// end of `expr`.
union_prefixes(&mut self, expr: &Hir) -> bool326 pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
327 let mut lits = self.to_empty();
328 prefixes(expr, &mut lits);
329 !lits.is_empty() && !lits.contains_empty() && self.union(lits)
330 }
331
332 /// Unions the suffixes from the given expression to this set.
333 ///
334 /// If suffixes could not be added (for example, this set would exceed its
335 /// size limits or the set of suffixes from `expr` includes the empty
336 /// string), then false is returned.
337 ///
338 /// Note that prefix literals extracted from `expr` are said to be complete
339 /// if and only if the literal extends from the end of `expr` to the
340 /// beginning of `expr`.
union_suffixes(&mut self, expr: &Hir) -> bool341 pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
342 let mut lits = self.to_empty();
343 suffixes(expr, &mut lits);
344 lits.reverse();
345 !lits.is_empty() && !lits.contains_empty() && self.union(lits)
346 }
347
348 /// Unions this set with another set.
349 ///
350 /// If the union would cause the set to exceed its limits, then the union
351 /// is skipped and it returns false. Otherwise, if the union succeeds, it
352 /// returns true.
union(&mut self, lits: Literals) -> bool353 pub fn union(&mut self, lits: Literals) -> bool {
354 if self.num_bytes() + lits.num_bytes() > self.limit_size {
355 return false;
356 }
357 if lits.is_empty() {
358 self.lits.push(Literal::empty());
359 } else {
360 self.lits.extend(lits.lits);
361 }
362 true
363 }
364
365 /// Extends this set with another set.
366 ///
367 /// The set of literals is extended via a cross product.
368 ///
369 /// If a cross product would cause this set to exceed its limits, then the
370 /// cross product is skipped and it returns false. Otherwise, if the cross
371 /// product succeeds, it returns true.
cross_product(&mut self, lits: &Literals) -> bool372 pub fn cross_product(&mut self, lits: &Literals) -> bool {
373 if lits.is_empty() {
374 return true;
375 }
376 // Check that we make sure we stay in our limits.
377 let mut size_after;
378 if self.is_empty() || !self.any_complete() {
379 size_after = self.num_bytes();
380 for lits_lit in lits.literals() {
381 size_after += lits_lit.len();
382 }
383 } else {
384 size_after = self.lits.iter().fold(0, |accum, lit| {
385 accum + if lit.is_cut() { lit.len() } else { 0 }
386 });
387 for lits_lit in lits.literals() {
388 for self_lit in self.literals() {
389 if !self_lit.is_cut() {
390 size_after += self_lit.len() + lits_lit.len();
391 }
392 }
393 }
394 }
395 if size_after > self.limit_size {
396 return false;
397 }
398
399 let mut base = self.remove_complete();
400 if base.is_empty() {
401 base = vec![Literal::empty()];
402 }
403 for lits_lit in lits.literals() {
404 for mut self_lit in base.clone() {
405 self_lit.extend(&**lits_lit);
406 self_lit.cut = lits_lit.cut;
407 self.lits.push(self_lit);
408 }
409 }
410 true
411 }
412
413 /// Extends each literal in this set with the bytes given.
414 ///
415 /// If the set is empty, then the given literal is added to the set.
416 ///
417 /// If adding any number of bytes to all members of this set causes a limit
418 /// to be exceeded, then no bytes are added and false is returned. If a
419 /// prefix of `bytes` can be fit into this set, then it is used and all
420 /// resulting literals are cut.
cross_add(&mut self, bytes: &[u8]) -> bool421 pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
422 // N.B. This could be implemented by simply calling cross_product with
423 // a literal set containing just `bytes`, but we can be smarter about
424 // taking shorter prefixes of `bytes` if they'll fit.
425 if bytes.is_empty() {
426 return true;
427 }
428 if self.lits.is_empty() {
429 let i = cmp::min(self.limit_size, bytes.len());
430 self.lits.push(Literal::new(bytes[..i].to_owned()));
431 self.lits[0].cut = i < bytes.len();
432 return !self.lits[0].is_cut();
433 }
434 let size = self.num_bytes();
435 if size + self.lits.len() >= self.limit_size {
436 return false;
437 }
438 let mut i = 1;
439 while size + (i * self.lits.len()) <= self.limit_size
440 && i < bytes.len() {
441 i += 1;
442 }
443 for lit in &mut self.lits {
444 if !lit.is_cut() {
445 lit.extend(&bytes[..i]);
446 if i < bytes.len() {
447 lit.cut();
448 }
449 }
450 }
451 true
452 }
453
454 /// Adds the given literal to this set.
455 ///
456 /// Returns false if adding this literal would cause the class to be too
457 /// big.
add(&mut self, lit: Literal) -> bool458 pub fn add(&mut self, lit: Literal) -> bool {
459 if self.num_bytes() + lit.len() > self.limit_size {
460 return false;
461 }
462 self.lits.push(lit);
463 true
464 }
465
466 /// Extends each literal in this set with the character class given.
467 ///
468 /// Returns false if the character class was too big to add.
add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool469 pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
470 self._add_char_class(cls, false)
471 }
472
473 /// Extends each literal in this set with the character class given,
474 /// writing the bytes of each character in reverse.
475 ///
476 /// Returns false if the character class was too big to add.
add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool477 fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
478 self._add_char_class(cls, true)
479 }
480
_add_char_class( &mut self, cls: &hir::ClassUnicode, reverse: bool, ) -> bool481 fn _add_char_class(
482 &mut self,
483 cls: &hir::ClassUnicode,
484 reverse: bool,
485 ) -> bool {
486 use std::char;
487
488 if self.class_exceeds_limits(cls_char_count(cls)) {
489 return false;
490 }
491 let mut base = self.remove_complete();
492 if base.is_empty() {
493 base = vec![Literal::empty()];
494 }
495 for r in cls.iter() {
496 let (s, e) = (r.start as u32, r.end as u32 + 1);
497 for c in (s..e).filter_map(char::from_u32) {
498 for mut lit in base.clone() {
499 let mut bytes = c.to_string().into_bytes();
500 if reverse {
501 bytes.reverse();
502 }
503 lit.extend(&bytes);
504 self.lits.push(lit);
505 }
506 }
507 }
508 true
509 }
510
511 /// Extends each literal in this set with the byte class given.
512 ///
513 /// Returns false if the byte class was too big to add.
add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool514 pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
515 if self.class_exceeds_limits(cls_byte_count(cls)) {
516 return false;
517 }
518 let mut base = self.remove_complete();
519 if base.is_empty() {
520 base = vec![Literal::empty()];
521 }
522 for r in cls.iter() {
523 let (s, e) = (r.start as u32, r.end as u32 + 1);
524 for b in (s..e).map(|b| b as u8) {
525 for mut lit in base.clone() {
526 lit.push(b);
527 self.lits.push(lit);
528 }
529 }
530 }
531 true
532 }
533
534 /// Cuts every member of this set. When a member is cut, it can never
535 /// be extended.
cut(&mut self)536 pub fn cut(&mut self) {
537 for lit in &mut self.lits {
538 lit.cut();
539 }
540 }
541
542 /// Reverses all members in place.
reverse(&mut self)543 pub fn reverse(&mut self) {
544 for lit in &mut self.lits {
545 lit.reverse();
546 }
547 }
548
549 /// Clears this set of all members.
clear(&mut self)550 pub fn clear(&mut self) {
551 self.lits.clear();
552 }
553
554 /// Pops all complete literals out of this set.
remove_complete(&mut self) -> Vec<Literal>555 fn remove_complete(&mut self) -> Vec<Literal> {
556 let mut base = vec![];
557 for lit in mem::replace(&mut self.lits, vec![]) {
558 if lit.is_cut() {
559 self.lits.push(lit);
560 } else {
561 base.push(lit);
562 }
563 }
564 base
565 }
566
567 /// Returns the total number of bytes in this set.
num_bytes(&self) -> usize568 fn num_bytes(&self) -> usize {
569 self.lits.iter().fold(0, |accum, lit| accum + lit.len())
570 }
571
572 /// Returns true if a character class with the given size would cause this
573 /// set to exceed its limits.
574 ///
575 /// The size given should correspond to the number of items in the class.
class_exceeds_limits(&self, size: usize) -> bool576 fn class_exceeds_limits(&self, size: usize) -> bool {
577 if size > self.limit_class {
578 return true;
579 }
580 // This is an approximation since codepoints in a char class can encode
581 // to 1-4 bytes.
582 let new_byte_count =
583 if self.lits.is_empty() {
584 size
585 } else {
586 self.lits
587 .iter()
588 .fold(0, |accum, lit| {
589 accum + if lit.is_cut() {
590 // If the literal is cut, then we'll never add
591 // anything to it, so don't count it.
592 0
593 } else {
594 (lit.len() + 1) * size
595 }
596 })
597 };
598 new_byte_count > self.limit_size
599 }
600 }
601
prefixes(expr: &Hir, lits: &mut Literals)602 fn prefixes(expr: &Hir, lits: &mut Literals) {
603 match *expr.kind() {
604 HirKind::Literal(hir::Literal::Unicode(c)) => {
605 let mut buf = [0; 4];
606 lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
607 }
608 HirKind::Literal(hir::Literal::Byte(b)) => {
609 lits.cross_add(&[b]);
610 }
611 HirKind::Class(hir::Class::Unicode(ref cls)) => {
612 if !lits.add_char_class(cls) {
613 lits.cut();
614 }
615 }
616 HirKind::Class(hir::Class::Bytes(ref cls)) => {
617 if !lits.add_byte_class(cls) {
618 lits.cut();
619 }
620 }
621 HirKind::Group(hir::Group { ref hir, .. }) => {
622 prefixes(&**hir, lits);
623 }
624 HirKind::Repetition(ref x) => {
625 match x.kind {
626 hir::RepetitionKind::ZeroOrOne => {
627 repeat_zero_or_one_literals(&x.hir, lits, prefixes);
628 }
629 hir::RepetitionKind::ZeroOrMore => {
630 repeat_zero_or_more_literals(&x.hir, lits, prefixes);
631 }
632 hir::RepetitionKind::OneOrMore => {
633 repeat_one_or_more_literals(&x.hir, lits, prefixes);
634 }
635 hir::RepetitionKind::Range(ref rng) => {
636 let (min, max) = match *rng {
637 hir::RepetitionRange::Exactly(m) => {
638 (m, Some(m))
639 }
640 hir::RepetitionRange::AtLeast(m) => {
641 (m, None)
642 }
643 hir::RepetitionRange::Bounded(m, n) => {
644 (m, Some(n))
645 }
646 };
647 repeat_range_literals(
648 &x.hir, min, max, x.greedy, lits, prefixes)
649 }
650 }
651 }
652 HirKind::Concat(ref es) if es.is_empty() => {}
653 HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
654 HirKind::Concat(ref es) => {
655 for e in es {
656 if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
657 if !lits.is_empty() {
658 lits.cut();
659 break;
660 }
661 lits.add(Literal::empty());
662 continue;
663 }
664 let mut lits2 = lits.to_empty();
665 prefixes(e, &mut lits2);
666 if !lits.cross_product(&lits2) || !lits2.any_complete() {
667 // If this expression couldn't yield any literal that
668 // could be extended, then we need to quit. Since we're
669 // short-circuiting, we also need to freeze every member.
670 lits.cut();
671 break;
672 }
673 }
674 }
675 HirKind::Alternation(ref es) => {
676 alternate_literals(es, lits, prefixes);
677 }
678 _ => lits.cut(),
679 }
680 }
681
suffixes(expr: &Hir, lits: &mut Literals)682 fn suffixes(expr: &Hir, lits: &mut Literals) {
683 match *expr.kind() {
684 HirKind::Literal(hir::Literal::Unicode(c)) => {
685 let mut buf = [0u8; 4];
686 let i = c.encode_utf8(&mut buf).len();
687 let mut buf = &mut buf[..i];
688 buf.reverse();
689 lits.cross_add(buf);
690 }
691 HirKind::Literal(hir::Literal::Byte(b)) => {
692 lits.cross_add(&[b]);
693 }
694 HirKind::Class(hir::Class::Unicode(ref cls)) => {
695 if !lits.add_char_class_reverse(cls) {
696 lits.cut();
697 }
698 }
699 HirKind::Class(hir::Class::Bytes(ref cls)) => {
700 if !lits.add_byte_class(cls) {
701 lits.cut();
702 }
703 }
704 HirKind::Group(hir::Group { ref hir, .. }) => {
705 suffixes(&**hir, lits);
706 }
707 HirKind::Repetition(ref x) => {
708 match x.kind {
709 hir::RepetitionKind::ZeroOrOne => {
710 repeat_zero_or_one_literals(&x.hir, lits, suffixes);
711 }
712 hir::RepetitionKind::ZeroOrMore => {
713 repeat_zero_or_more_literals(&x.hir, lits, suffixes);
714 }
715 hir::RepetitionKind::OneOrMore => {
716 repeat_one_or_more_literals(&x.hir, lits, suffixes);
717 }
718 hir::RepetitionKind::Range(ref rng) => {
719 let (min, max) = match *rng {
720 hir::RepetitionRange::Exactly(m) => {
721 (m, Some(m))
722 }
723 hir::RepetitionRange::AtLeast(m) => {
724 (m, None)
725 }
726 hir::RepetitionRange::Bounded(m, n) => {
727 (m, Some(n))
728 }
729 };
730 repeat_range_literals(
731 &x.hir, min, max, x.greedy, lits, suffixes)
732 }
733 }
734 }
735 HirKind::Concat(ref es) if es.is_empty() => {}
736 HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
737 HirKind::Concat(ref es) => {
738 for e in es.iter().rev() {
739 if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
740 if !lits.is_empty() {
741 lits.cut();
742 break;
743 }
744 lits.add(Literal::empty());
745 continue;
746 }
747 let mut lits2 = lits.to_empty();
748 suffixes(e, &mut lits2);
749 if !lits.cross_product(&lits2) || !lits2.any_complete() {
750 // If this expression couldn't yield any literal that
751 // could be extended, then we need to quit. Since we're
752 // short-circuiting, we also need to freeze every member.
753 lits.cut();
754 break;
755 }
756 }
757 }
758 HirKind::Alternation(ref es) => {
759 alternate_literals(es, lits, suffixes);
760 }
761 _ => lits.cut(),
762 }
763 }
764
repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )765 fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
766 e: &Hir,
767 lits: &mut Literals,
768 mut f: F,
769 ) {
770 let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
771 lits3.set_limit_size(lits.limit_size() / 2);
772 f(e, &mut lits3);
773
774 if lits3.is_empty() || !lits2.cross_product(&lits3) {
775 lits.cut();
776 return;
777 }
778 lits2.add(Literal::empty());
779 if !lits.union(lits2) {
780 lits.cut();
781 }
782 }
783
repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )784 fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
785 e: &Hir,
786 lits: &mut Literals,
787 mut f: F,
788 ) {
789 let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
790 lits3.set_limit_size(lits.limit_size() / 2);
791 f(e, &mut lits3);
792
793 if lits3.is_empty() || !lits2.cross_product(&lits3) {
794 lits.cut();
795 return;
796 }
797 lits2.cut();
798 lits2.add(Literal::empty());
799 if !lits.union(lits2) {
800 lits.cut();
801 }
802 }
803
repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )804 fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
805 e: &Hir,
806 lits: &mut Literals,
807 mut f: F,
808 ) {
809 f(e, lits);
810 lits.cut();
811 }
812
repeat_range_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, min: u32, max: Option<u32>, greedy: bool, lits: &mut Literals, mut f: F, )813 fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
814 e: &Hir,
815 min: u32,
816 max: Option<u32>,
817 greedy: bool,
818 lits: &mut Literals,
819 mut f: F,
820 ) {
821 if min == 0 {
822 // This is a bit conservative. If `max` is set, then we could
823 // treat this as a finite set of alternations. For now, we
824 // just treat it as `e*`.
825 f(&Hir::repetition(hir::Repetition {
826 kind: hir::RepetitionKind::ZeroOrMore,
827 greedy: greedy,
828 hir: Box::new(e.clone()),
829 }), lits);
830 } else {
831 if min > 0 {
832 let n = cmp::min(lits.limit_size, min as usize);
833 let es = iter::repeat(e.clone()).take(n).collect();
834 f(&Hir::concat(es), lits);
835 if n < min as usize || lits.contains_empty() {
836 lits.cut();
837 }
838 }
839 if max.map_or(true, |max| min < max) {
840 lits.cut();
841 }
842 }
843 }
844
alternate_literals<F: FnMut(&Hir, &mut Literals)>( es: &[Hir], lits: &mut Literals, mut f: F, )845 fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
846 es: &[Hir],
847 lits: &mut Literals,
848 mut f: F,
849 ) {
850 let mut lits2 = lits.to_empty();
851 for e in es {
852 let mut lits3 = lits.to_empty();
853 lits3.set_limit_size(lits.limit_size() / 5);
854 f(e, &mut lits3);
855 if lits3.is_empty() || !lits2.union(lits3) {
856 // If we couldn't find suffixes for *any* of the
857 // alternates, then the entire alternation has to be thrown
858 // away and any existing members must be frozen. Similarly,
859 // if the union couldn't complete, stop and freeze.
860 lits.cut();
861 return;
862 }
863 }
864 if !lits.cross_product(&lits2) {
865 lits.cut();
866 }
867 }
868
869 impl fmt::Debug for Literals {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result870 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
871 f.debug_struct("Literals")
872 .field("lits", &self.lits)
873 .field("limit_size", &self.limit_size)
874 .field("limit_class", &self.limit_class)
875 .finish()
876 }
877 }
878
879 impl Literal {
880 /// Returns a new complete literal with the bytes given.
new(bytes: Vec<u8>) -> Literal881 pub fn new(bytes: Vec<u8>) -> Literal {
882 Literal { v: bytes, cut: false }
883 }
884
885 /// Returns a new complete empty literal.
empty() -> Literal886 pub fn empty() -> Literal {
887 Literal { v: vec![], cut: false }
888 }
889
890 /// Returns true if this literal was "cut."
is_cut(&self) -> bool891 pub fn is_cut(&self) -> bool {
892 self.cut
893 }
894
895 /// Cuts this literal.
cut(&mut self)896 pub fn cut(&mut self) {
897 self.cut = true;
898 }
899 }
900
901 impl PartialEq for Literal {
eq(&self, other: &Literal) -> bool902 fn eq(&self, other: &Literal) -> bool {
903 self.v == other.v
904 }
905 }
906
907 impl PartialOrd for Literal {
partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering>908 fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
909 self.v.partial_cmp(&other.v)
910 }
911 }
912
913 impl fmt::Debug for Literal {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result914 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
915 if self.is_cut() {
916 write!(f, "Cut({})", escape_unicode(&self.v))
917 } else {
918 write!(f, "Complete({})", escape_unicode(&self.v))
919 }
920 }
921 }
922
923 impl AsRef<[u8]> for Literal {
as_ref(&self) -> &[u8]924 fn as_ref(&self) -> &[u8] { &self.v }
925 }
926
927 impl ops::Deref for Literal {
928 type Target = Vec<u8>;
deref(&self) -> &Vec<u8>929 fn deref(&self) -> &Vec<u8> { &self.v }
930 }
931
932 impl ops::DerefMut for Literal {
deref_mut(&mut self) -> &mut Vec<u8>933 fn deref_mut(&mut self) -> &mut Vec<u8> { &mut self.v }
934 }
935
position(needle: &[u8], mut haystack: &[u8]) -> Option<usize>936 fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
937 let mut i = 0;
938 while haystack.len() >= needle.len() {
939 if needle == &haystack[..needle.len()] {
940 return Some(i);
941 }
942 i += 1;
943 haystack = &haystack[1..];
944 }
945 None
946 }
947
escape_unicode(bytes: &[u8]) -> String948 fn escape_unicode(bytes: &[u8]) -> String {
949 let show = match ::std::str::from_utf8(bytes) {
950 Ok(v) => v.to_string(),
951 Err(_) => escape_bytes(bytes),
952 };
953 let mut space_escaped = String::new();
954 for c in show.chars() {
955 if c.is_whitespace() {
956 let escaped = if c as u32 <= 0x7F {
957 escape_byte(c as u8)
958 } else {
959 if c as u32 <= 0xFFFF {
960 format!(r"\u{{{:04x}}}", c as u32)
961 } else {
962 format!(r"\U{{{:08x}}}", c as u32)
963 }
964 };
965 space_escaped.push_str(&escaped);
966 } else {
967 space_escaped.push(c);
968 }
969 }
970 space_escaped
971 }
972
escape_bytes(bytes: &[u8]) -> String973 fn escape_bytes(bytes: &[u8]) -> String {
974 let mut s = String::new();
975 for &b in bytes {
976 s.push_str(&escape_byte(b));
977 }
978 s
979 }
980
escape_byte(byte: u8) -> String981 fn escape_byte(byte: u8) -> String {
982 use std::ascii::escape_default;
983
984 let escaped: Vec<u8> = escape_default(byte).collect();
985 String::from_utf8_lossy(&escaped).into_owned()
986 }
987
cls_char_count(cls: &hir::ClassUnicode) -> usize988 fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
989 cls.iter()
990 .map(|&r| 1 + (r.end as u32) - (r.start as u32))
991 .sum::<u32>() as usize
992 }
993
cls_byte_count(cls: &hir::ClassBytes) -> usize994 fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
995 cls.iter()
996 .map(|&r| 1 + (r.end as u32) - (r.start as u32))
997 .sum::<u32>() as usize
998 }
999
1000 #[cfg(test)]
1001 mod tests {
1002 use std::fmt;
1003
1004 use ParserBuilder;
1005 use hir::Hir;
1006 use super::{Literals, Literal, escape_bytes};
1007
1008 // To make test failures easier to read.
1009 #[derive(Debug, Eq, PartialEq)]
1010 struct Bytes(Vec<ULiteral>);
1011 #[derive(Debug, Eq, PartialEq)]
1012 struct Unicode(Vec<ULiteral>);
1013
escape_lits(blits: &[Literal]) -> Vec<ULiteral>1014 fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
1015 let mut ulits = vec![];
1016 for blit in blits {
1017 ulits.push(ULiteral {
1018 v: escape_bytes(&blit),
1019 cut: blit.is_cut(),
1020 });
1021 }
1022 ulits
1023 }
1024
create_lits<I: IntoIterator<Item=Literal>>(it: I) -> Literals1025 fn create_lits<I: IntoIterator<Item=Literal>>(it: I) -> Literals {
1026 Literals {
1027 lits: it.into_iter().collect(),
1028 limit_size: 0,
1029 limit_class: 0,
1030 }
1031 }
1032
1033 // Needs to be pub for 1.3?
1034 #[derive(Clone, Eq, PartialEq)]
1035 pub struct ULiteral {
1036 v: String,
1037 cut: bool,
1038 }
1039
1040 impl ULiteral {
is_cut(&self) -> bool1041 fn is_cut(&self) -> bool { self.cut }
1042 }
1043
1044 impl fmt::Debug for ULiteral {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1045 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1046 if self.is_cut() {
1047 write!(f, "Cut({})", self.v)
1048 } else {
1049 write!(f, "Complete({})", self.v)
1050 }
1051 }
1052 }
1053
1054 impl PartialEq<Literal> for ULiteral {
eq(&self, other: &Literal) -> bool1055 fn eq(&self, other: &Literal) -> bool {
1056 self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
1057 }
1058 }
1059
1060 impl PartialEq<ULiteral> for Literal {
eq(&self, other: &ULiteral) -> bool1061 fn eq(&self, other: &ULiteral) -> bool {
1062 &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
1063 }
1064 }
1065
1066 #[allow(non_snake_case)]
C(s: &'static str) -> ULiteral1067 fn C(s: &'static str) -> ULiteral {
1068 ULiteral { v: s.to_owned(), cut: true }
1069 }
1070 #[allow(non_snake_case)]
M(s: &'static str) -> ULiteral1071 fn M(s: &'static str) -> ULiteral {
1072 ULiteral { v: s.to_owned(), cut: false }
1073 }
1074
prefixes(lits: &mut Literals, expr: &Hir)1075 fn prefixes(lits: &mut Literals, expr: &Hir) {
1076 lits.union_prefixes(expr);
1077 }
1078
suffixes(lits: &mut Literals, expr: &Hir)1079 fn suffixes(lits: &mut Literals, expr: &Hir) {
1080 lits.union_suffixes(expr);
1081 }
1082
1083 macro_rules! assert_lit_eq {
1084 ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
1085 let expected: Vec<ULiteral> = vec![$($expected_lit),*];
1086 let lits = $got_lits;
1087 assert_eq!(
1088 $which(expected.clone()),
1089 $which(escape_lits(lits.literals())));
1090 assert_eq!(
1091 !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
1092 lits.all_complete());
1093 assert_eq!(
1094 expected.iter().any(|l| !l.is_cut()),
1095 lits.any_complete());
1096 }};
1097 }
1098
1099 macro_rules! test_lit {
1100 ($name:ident, $which:ident, $re:expr) => {
1101 test_lit!($name, $which, $re,);
1102 };
1103 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1104 #[test]
1105 fn $name() {
1106 let expr = ParserBuilder::new()
1107 .build()
1108 .parse($re)
1109 .unwrap();
1110 let lits = Literals::$which(&expr);
1111 assert_lit_eq!(Unicode, lits, $($lit),*);
1112
1113 let expr = ParserBuilder::new()
1114 .allow_invalid_utf8(true)
1115 .unicode(false)
1116 .build()
1117 .parse($re)
1118 .unwrap();
1119 let lits = Literals::$which(&expr);
1120 assert_lit_eq!(Bytes, lits, $($lit),*);
1121 }
1122 };
1123 }
1124
1125 // ************************************************************************
1126 // Tests for prefix literal extraction.
1127 // ************************************************************************
1128
1129 // Elementary tests.
1130 test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
1131 test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
1132 test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1133 test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1134 test_lit!(pfx_class1, prefixes, "[1-4]",
1135 M("1"), M("2"), M("3"), M("4"));
1136 test_lit!(pfx_class2, prefixes, "(?u)[☃Ⅰ]",
1137 M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
1138 test_lit!(pfx_class3, prefixes, "(?ui)[☃Ⅰ]",
1139 M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
1140 M("\\xe2\\x98\\x83"));
1141 test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a",
1142 M("A"), M("a"));
1143 test_lit!(pfx_one_lit_casei2, prefixes, "(?i)abc",
1144 M("ABC"), M("aBC"), M("AbC"), M("abC"),
1145 M("ABc"), M("aBc"), M("Abc"), M("abc"));
1146 test_lit!(pfx_group1, prefixes, "(a)", M("a"));
1147 test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
1148 test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
1149 test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
1150 test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
1151 test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
1152 test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
1153 test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
1154 test_lit!(pfx_rep_range1, prefixes, "a{0}");
1155 test_lit!(pfx_rep_range2, prefixes, "a{0,}");
1156 test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
1157 test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
1158 test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
1159 test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
1160 test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
1161
1162 // Test regexes with concatenations.
1163 test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
1164 test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
1165 test_lit!(pfx_cat3, prefixes, "(?i)[ab]z",
1166 M("AZ"), M("BZ"), M("aZ"), M("bZ"),
1167 M("Az"), M("Bz"), M("az"), M("bz"));
1168 test_lit!(pfx_cat4, prefixes, "[ab][yz]",
1169 M("ay"), M("by"), M("az"), M("bz"));
1170 test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
1171 test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
1172 test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
1173 test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
1174 test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
1175 test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
1176 test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
1177 test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
1178 test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
1179 test_lit!(pfx_cat14, prefixes, "a^", C("a"));
1180 test_lit!(pfx_cat15, prefixes, "$a");
1181 test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
1182 test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
1183 test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
1184 test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
1185
1186 // Test regexes with alternations.
1187 test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
1188 test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1189 test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1190 test_lit!(pfx_alt4, prefixes, "a|b*");
1191 test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
1192 test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
1193 test_lit!(pfx_alt7, prefixes, "(a|b)*c|(a|ab)*c",
1194 C("a"), C("b"), M("c"), C("a"), C("ab"), M("c"));
1195 test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
1196
1197 // Test regexes with empty assertions.
1198 test_lit!(pfx_empty1, prefixes, "^a", M("a"));
1199 test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
1200 test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
1201 test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
1202
1203 // Make sure some curious regexes have no prefixes.
1204 test_lit!(pfx_nothing1, prefixes, ".");
1205 test_lit!(pfx_nothing2, prefixes, "(?s).");
1206 test_lit!(pfx_nothing3, prefixes, "^");
1207 test_lit!(pfx_nothing4, prefixes, "$");
1208 test_lit!(pfx_nothing6, prefixes, "(?m)$");
1209 test_lit!(pfx_nothing7, prefixes, r"\b");
1210 test_lit!(pfx_nothing8, prefixes, r"\B");
1211
1212 // Test a few regexes that defeat any prefix literal detection.
1213 test_lit!(pfx_defeated1, prefixes, ".a");
1214 test_lit!(pfx_defeated2, prefixes, "(?s).a");
1215 test_lit!(pfx_defeated3, prefixes, "a*b*c*");
1216 test_lit!(pfx_defeated4, prefixes, "a|.");
1217 test_lit!(pfx_defeated5, prefixes, ".|a");
1218 test_lit!(pfx_defeated6, prefixes, "a|^");
1219 test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
1220 test_lit!(pfx_defeated8, prefixes, "$a");
1221 test_lit!(pfx_defeated9, prefixes, "(?m)$a");
1222 test_lit!(pfx_defeated10, prefixes, r"\ba");
1223 test_lit!(pfx_defeated11, prefixes, r"\Ba");
1224 test_lit!(pfx_defeated12, prefixes, "^*a");
1225 test_lit!(pfx_defeated13, prefixes, "^+a");
1226
1227 test_lit!(
1228 pfx_crazy1,
1229 prefixes,
1230 r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1231 C("Mo\\'am"), C("Mu\\'am"), C("Moam"), C("Muam"));
1232
1233 // ************************************************************************
1234 // Tests for quiting prefix literal search.
1235 // ************************************************************************
1236
1237 macro_rules! test_exhausted {
1238 ($name:ident, $which:ident, $re:expr) => {
1239 test_exhausted!($name, $which, $re,);
1240 };
1241 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1242 #[test]
1243 fn $name() {
1244 let expr = ParserBuilder::new()
1245 .build()
1246 .parse($re)
1247 .unwrap();
1248 let mut lits = Literals::empty();
1249 lits.set_limit_size(20).set_limit_class(10);
1250 $which(&mut lits, &expr);
1251 assert_lit_eq!(Unicode, lits, $($lit),*);
1252
1253 let expr = ParserBuilder::new()
1254 .allow_invalid_utf8(true)
1255 .unicode(false)
1256 .build()
1257 .parse($re)
1258 .unwrap();
1259 let mut lits = Literals::empty();
1260 lits.set_limit_size(20).set_limit_class(10);
1261 $which(&mut lits, &expr);
1262 assert_lit_eq!(Bytes, lits, $($lit),*);
1263 }
1264 };
1265 }
1266
1267 // These test use a much lower limit than the default so that we can
1268 // write test cases of reasonable size.
1269 test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
1270 test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
1271 test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
1272 test_exhausted!(pfx_exhausted4, prefixes, "(?i)foobar",
1273 C("FO"), C("fO"), C("Fo"), C("fo"));
1274 test_exhausted!(pfx_exhausted5, prefixes, "(?:ab){100}",
1275 C("abababababababababab"));
1276 test_exhausted!(pfx_exhausted6, prefixes, "(?:(?:ab){100})*cd",
1277 C("ababababab"), M("cd"));
1278 test_exhausted!(pfx_exhausted7, prefixes, "z(?:(?:ab){100})*cd",
1279 C("zababababab"), M("zcd"));
1280 test_exhausted!(pfx_exhausted8, prefixes, "aaaaaaaaaaaaaaaaaaaaz",
1281 C("aaaaaaaaaaaaaaaaaaaa"));
1282
1283 // ************************************************************************
1284 // Tests for suffix literal extraction.
1285 // ************************************************************************
1286
1287 // Elementary tests.
1288 test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
1289 test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
1290 test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1291 test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1292 test_lit!(sfx_class1, suffixes, "[1-4]",
1293 M("1"), M("2"), M("3"), M("4"));
1294 test_lit!(sfx_class2, suffixes, "(?u)[☃Ⅰ]",
1295 M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83"));
1296 test_lit!(sfx_class3, suffixes, "(?ui)[☃Ⅰ]",
1297 M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"),
1298 M("\\xe2\\x98\\x83"));
1299 test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a",
1300 M("A"), M("a"));
1301 test_lit!(sfx_one_lit_casei2, suffixes, "(?i)abc",
1302 M("ABC"), M("ABc"), M("AbC"), M("Abc"),
1303 M("aBC"), M("aBc"), M("abC"), M("abc"));
1304 test_lit!(sfx_group1, suffixes, "(a)", M("a"));
1305 test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
1306 test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
1307 test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
1308 test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
1309 test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
1310 test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
1311 test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
1312 test_lit!(sfx_rep_range1, suffixes, "a{0}");
1313 test_lit!(sfx_rep_range2, suffixes, "a{0,}");
1314 test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
1315 test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
1316 test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
1317 test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
1318 test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
1319
1320 // Test regexes with concatenations.
1321 test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
1322 test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
1323 test_lit!(sfx_cat3, suffixes, "(?i)[ab]z",
1324 M("AZ"), M("Az"), M("BZ"), M("Bz"),
1325 M("aZ"), M("az"), M("bZ"), M("bz"));
1326 test_lit!(sfx_cat4, suffixes, "[ab][yz]",
1327 M("ay"), M("az"), M("by"), M("bz"));
1328 test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
1329 test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
1330 test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
1331 test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
1332 test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
1333 test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
1334 test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
1335 test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
1336 test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
1337 test_lit!(sfx_cat14, suffixes, "a^");
1338 test_lit!(sfx_cat15, suffixes, "$a", C("a"));
1339 test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
1340 test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
1341 test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
1342 test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
1343
1344 // Test regexes with alternations.
1345 test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
1346 test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1347 test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1348 test_lit!(sfx_alt4, suffixes, "a|b*");
1349 test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
1350 test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
1351 test_lit!(sfx_alt7, suffixes, "(a|b)*c|(a|ab)*c",
1352 C("ac"), C("bc"), M("c"), C("ac"), C("abc"), M("c"));
1353 test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
1354
1355 // Test regexes with empty assertions.
1356 test_lit!(sfx_empty1, suffixes, "a$", M("a"));
1357 test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
1358
1359 // Make sure some curious regexes have no suffixes.
1360 test_lit!(sfx_nothing1, suffixes, ".");
1361 test_lit!(sfx_nothing2, suffixes, "(?s).");
1362 test_lit!(sfx_nothing3, suffixes, "^");
1363 test_lit!(sfx_nothing4, suffixes, "$");
1364 test_lit!(sfx_nothing6, suffixes, "(?m)$");
1365 test_lit!(sfx_nothing7, suffixes, r"\b");
1366 test_lit!(sfx_nothing8, suffixes, r"\B");
1367
1368 // Test a few regexes that defeat any suffix literal detection.
1369 test_lit!(sfx_defeated1, suffixes, "a.");
1370 test_lit!(sfx_defeated2, suffixes, "(?s)a.");
1371 test_lit!(sfx_defeated3, suffixes, "a*b*c*");
1372 test_lit!(sfx_defeated4, suffixes, "a|.");
1373 test_lit!(sfx_defeated5, suffixes, ".|a");
1374 test_lit!(sfx_defeated6, suffixes, "a|^");
1375 test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
1376 test_lit!(sfx_defeated8, suffixes, "a^");
1377 test_lit!(sfx_defeated9, suffixes, "(?m)a$");
1378 test_lit!(sfx_defeated10, suffixes, r"a\b");
1379 test_lit!(sfx_defeated11, suffixes, r"a\B");
1380 test_lit!(sfx_defeated12, suffixes, "a^*");
1381 test_lit!(sfx_defeated13, suffixes, "a^+");
1382
1383 // These test use a much lower limit than the default so that we can
1384 // write test cases of reasonable size.
1385 test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
1386 test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
1387 test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
1388 test_exhausted!(sfx_exhausted4, suffixes, "(?i)foobar",
1389 C("AR"), C("Ar"), C("aR"), C("ar"));
1390 test_exhausted!(sfx_exhausted5, suffixes, "(?:ab){100}",
1391 C("abababababababababab"));
1392 test_exhausted!(sfx_exhausted6, suffixes, "cd(?:(?:ab){100})*",
1393 C("ababababab"), M("cd"));
1394 test_exhausted!(sfx_exhausted7, suffixes, "cd(?:(?:ab){100})*z",
1395 C("abababababz"), M("cdz"));
1396 test_exhausted!(sfx_exhausted8, suffixes, "zaaaaaaaaaaaaaaaaaaaa",
1397 C("aaaaaaaaaaaaaaaaaaaa"));
1398
1399 // ************************************************************************
1400 // Tests for generating unambiguous literal sets.
1401 // ************************************************************************
1402
1403 macro_rules! test_unamb {
1404 ($name:ident, $given:expr, $expected:expr) => {
1405 #[test]
1406 fn $name() {
1407 let given: Vec<Literal> =
1408 $given
1409 .into_iter()
1410 .map(|ul| {
1411 let cut = ul.is_cut();
1412 Literal { v: ul.v.into_bytes(), cut: cut }
1413 })
1414 .collect();
1415 let lits = create_lits(given);
1416 let got = lits.unambiguous_prefixes();
1417 assert_eq!($expected, escape_lits(got.literals()));
1418 }
1419 };
1420 }
1421
1422 test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
1423 test_unamb!(unambiguous2,
1424 vec![M("zaaaaaa"), M("aa")], vec![C("aa"), C("z")]);
1425 test_unamb!(unambiguous3,
1426 vec![M("Sherlock"), M("Watson")],
1427 vec![M("Sherlock"), M("Watson")]);
1428 test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
1429 test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
1430 test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
1431 test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
1432 test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
1433 test_unamb!(unambiguous9,
1434 vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1435 vec![C("a"), C("b"), C("c")]);
1436 test_unamb!(unambiguous10,
1437 vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1438 vec![C("Mo"), C("Mu")]);
1439 test_unamb!(unambiguous11,
1440 vec![M("zazb"), M("azb")], vec![C("a"), C("z")]);
1441 test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
1442 test_unamb!(unambiguous13,
1443 vec![M("ABCX"), M("CDAX"), M("BCX")],
1444 vec![C("A"), C("BCX"), C("CD")]);
1445 test_unamb!(unambiguous14,
1446 vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1447 vec![M("DSX"), C("I"), C("MGX"), C("MV")]);
1448 test_unamb!(unambiguous15,
1449 vec![M("IMG_"), M("MG_"), M("CIMG")],
1450 vec![C("C"), C("I"), C("MG_")]);
1451
1452
1453 // ************************************************************************
1454 // Tests for suffix trimming.
1455 // ************************************************************************
1456 macro_rules! test_trim {
1457 ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
1458 #[test]
1459 fn $name() {
1460 let given: Vec<Literal> =
1461 $given
1462 .into_iter()
1463 .map(|ul| {
1464 let cut = ul.is_cut();
1465 Literal { v: ul.v.into_bytes(), cut: cut }
1466 })
1467 .collect();
1468 let lits = create_lits(given);
1469 let got = lits.trim_suffix($trim).unwrap();
1470 assert_eq!($expected, escape_lits(got.literals()));
1471 }
1472 }
1473 }
1474
1475 test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
1476 test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
1477 test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
1478 test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
1479
1480 // ************************************************************************
1481 // Tests for longest common prefix.
1482 // ************************************************************************
1483
1484 macro_rules! test_lcp {
1485 ($name:ident, $given:expr, $expected:expr) => {
1486 #[test]
1487 fn $name() {
1488 let given: Vec<Literal> =
1489 $given
1490 .into_iter()
1491 .map(|s: &str| Literal {
1492 v: s.to_owned().into_bytes(),
1493 cut: false,
1494 })
1495 .collect();
1496 let lits = create_lits(given);
1497 let got = lits.longest_common_prefix();
1498 assert_eq!($expected, escape_bytes(got));
1499 }
1500 };
1501 }
1502
1503 test_lcp!(lcp1, vec!["a"], "a");
1504 test_lcp!(lcp2, vec![], "");
1505 test_lcp!(lcp3, vec!["a", "b"], "");
1506 test_lcp!(lcp4, vec!["ab", "ab"], "ab");
1507 test_lcp!(lcp5, vec!["ab", "a"], "a");
1508 test_lcp!(lcp6, vec!["a", "ab"], "a");
1509 test_lcp!(lcp7, vec!["ab", "b"], "");
1510 test_lcp!(lcp8, vec!["b", "ab"], "");
1511 test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
1512 test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
1513 test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
1514 test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
1515
1516 // ************************************************************************
1517 // Tests for longest common suffix.
1518 // ************************************************************************
1519
1520 macro_rules! test_lcs {
1521 ($name:ident, $given:expr, $expected:expr) => {
1522 #[test]
1523 fn $name() {
1524 let given: Vec<Literal> =
1525 $given
1526 .into_iter()
1527 .map(|s: &str| Literal {
1528 v: s.to_owned().into_bytes(),
1529 cut: false,
1530 })
1531 .collect();
1532 let lits = create_lits(given);
1533 let got = lits.longest_common_suffix();
1534 assert_eq!($expected, escape_bytes(got));
1535 }
1536 };
1537 }
1538
1539 test_lcs!(lcs1, vec!["a"], "a");
1540 test_lcs!(lcs2, vec![], "");
1541 test_lcs!(lcs3, vec!["a", "b"], "");
1542 test_lcs!(lcs4, vec!["ab", "ab"], "ab");
1543 test_lcs!(lcs5, vec!["ab", "a"], "");
1544 test_lcs!(lcs6, vec!["a", "ab"], "");
1545 test_lcs!(lcs7, vec!["ab", "b"], "b");
1546 test_lcs!(lcs8, vec!["b", "ab"], "b");
1547 test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
1548 test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
1549 test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
1550 test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
1551 }
1552