1 /*! 2 Provides routines for extracting literal prefixes and suffixes from an `Hir`. 3 */ 4 5 use std::cmp; 6 use std::fmt; 7 use std::iter; 8 use std::mem; 9 use std::ops; 10 11 use crate::hir::{self, Hir, HirKind}; 12 13 /// A set of literal byte strings extracted from a regular expression. 14 /// 15 /// Every member of the set is a `Literal`, which is represented by a 16 /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is 17 /// said to be either *complete* or *cut*. A complete literal means that 18 /// it extends until the beginning (or end) of the regular expression. In 19 /// some circumstances, this can be used to indicate a match in the regular 20 /// expression. 21 /// 22 /// A key aspect of literal extraction is knowing when to stop. It is not 23 /// feasible to blindly extract all literals from a regular expression, even if 24 /// there are finitely many. For example, the regular expression `[0-9]{10}` 25 /// has `10^10` distinct literals. For this reason, literal extraction is 26 /// bounded to some low number by default using heuristics, but the limits can 27 /// be tweaked. 28 /// 29 /// **WARNING**: Literal extraction uses stack space proportional to the size 30 /// of the `Hir` expression. At some point, this drawback will be eliminated. 31 /// To protect yourself, set a reasonable 32 /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit). 33 /// This is done for you by default. 34 #[derive(Clone, Eq, PartialEq)] 35 pub struct Literals { 36 lits: Vec<Literal>, 37 limit_size: usize, 38 limit_class: usize, 39 } 40 41 /// A single member of a set of literals extracted from a regular expression. 42 /// 43 /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice 44 /// and `Vec` operations are available. 45 #[derive(Clone, Eq, Ord)] 46 pub struct Literal { 47 v: Vec<u8>, 48 cut: bool, 49 } 50 51 impl Literals { 52 /// Returns a new empty set of literals using default limits. 53 pub fn empty() -> Literals { 54 Literals { lits: vec![], limit_size: 250, limit_class: 10 } 55 } 56 57 /// Returns a set of literal prefixes extracted from the given `Hir`. 58 pub fn prefixes(expr: &Hir) -> Literals { 59 let mut lits = Literals::empty(); 60 lits.union_prefixes(expr); 61 lits 62 } 63 64 /// Returns a set of literal suffixes extracted from the given `Hir`. 65 pub fn suffixes(expr: &Hir) -> Literals { 66 let mut lits = Literals::empty(); 67 lits.union_suffixes(expr); 68 lits 69 } 70 71 /// Get the approximate size limit (in bytes) of this set. 72 pub fn limit_size(&self) -> usize { 73 self.limit_size 74 } 75 76 /// Set the approximate size limit (in bytes) of this set. 77 /// 78 /// If extracting a literal would put the set over this limit, then 79 /// extraction stops. 80 /// 81 /// The new limits will only apply to additions to this set. Existing 82 /// members remain unchanged, even if the set exceeds the new limit. 83 pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { 84 self.limit_size = size; 85 self 86 } 87 88 /// Get the character class size limit for this set. 89 pub fn limit_class(&self) -> usize { 90 self.limit_class 91 } 92 93 /// Limits the size of character(or byte) classes considered. 94 /// 95 /// A value of `0` prevents all character classes from being considered. 96 /// 97 /// This limit also applies to case insensitive literals, since each 98 /// character in the case insensitive literal is converted to a class, and 99 /// then case folded. 100 /// 101 /// The new limits will only apply to additions to this set. Existing 102 /// members remain unchanged, even if the set exceeds the new limit. 103 pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { 104 self.limit_class = size; 105 self 106 } 107 108 /// Returns the set of literals as a slice. Its order is unspecified. 109 pub fn literals(&self) -> &[Literal] { 110 &self.lits 111 } 112 113 /// Returns the length of the smallest literal. 114 /// 115 /// Returns None is there are no literals in the set. 116 pub fn min_len(&self) -> Option<usize> { 117 let mut min = None; 118 for lit in &self.lits { 119 match min { 120 None => min = Some(lit.len()), 121 Some(m) if lit.len() < m => min = Some(lit.len()), 122 _ => {} 123 } 124 } 125 min 126 } 127 128 /// Returns true if all members in this set are complete. 129 pub fn all_complete(&self) -> bool { 130 !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) 131 } 132 133 /// Returns true if any member in this set is complete. 134 pub fn any_complete(&self) -> bool { 135 self.lits.iter().any(|lit| !lit.is_cut()) 136 } 137 138 /// Returns true if this set contains an empty literal. 139 pub fn contains_empty(&self) -> bool { 140 self.lits.iter().any(|lit| lit.is_empty()) 141 } 142 143 /// Returns true if this set is empty or if all of its members is empty. 144 pub fn is_empty(&self) -> bool { 145 self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) 146 } 147 148 /// Returns a new empty set of literals using this set's limits. 149 pub fn to_empty(&self) -> Literals { 150 let mut lits = Literals::empty(); 151 lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class); 152 lits 153 } 154 155 /// Returns the longest common prefix of all members in this set. 156 pub fn longest_common_prefix(&self) -> &[u8] { 157 if self.is_empty() { 158 return &[]; 159 } 160 let lit0 = &*self.lits[0]; 161 let mut len = lit0.len(); 162 for lit in &self.lits[1..] { 163 len = cmp::min( 164 len, 165 lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(), 166 ); 167 } 168 &self.lits[0][..len] 169 } 170 171 /// Returns the longest common suffix of all members in this set. 172 pub fn longest_common_suffix(&self) -> &[u8] { 173 if self.is_empty() { 174 return &[]; 175 } 176 let lit0 = &*self.lits[0]; 177 let mut len = lit0.len(); 178 for lit in &self.lits[1..] { 179 len = cmp::min( 180 len, 181 lit.iter() 182 .rev() 183 .zip(lit0.iter().rev()) 184 .take_while(|&(a, b)| a == b) 185 .count(), 186 ); 187 } 188 &self.lits[0][self.lits[0].len() - len..] 189 } 190 191 /// Returns a new set of literals with the given number of bytes trimmed 192 /// from the suffix of each literal. 193 /// 194 /// If any literal would be cut out completely by trimming, then None is 195 /// returned. 196 /// 197 /// Any duplicates that are created as a result of this transformation are 198 /// removed. 199 pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> { 200 if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { 201 return None; 202 } 203 let mut new = self.to_empty(); 204 for mut lit in self.lits.iter().cloned() { 205 let new_len = lit.len() - num_bytes; 206 lit.truncate(new_len); 207 lit.cut(); 208 new.lits.push(lit); 209 } 210 new.lits.sort(); 211 new.lits.dedup(); 212 Some(new) 213 } 214 215 /// Returns a new set of prefixes of this set of literals that are 216 /// guaranteed to be unambiguous. 217 /// 218 /// Any substring match with a member of the set is returned is guaranteed 219 /// to never overlap with a substring match of another member of the set 220 /// at the same starting position. 221 /// 222 /// Given any two members of the returned set, neither is a substring of 223 /// the other. 224 pub fn unambiguous_prefixes(&self) -> Literals { 225 if self.lits.is_empty() { 226 return self.to_empty(); 227 } 228 let mut old: Vec<Literal> = self.lits.iter().cloned().collect(); 229 let mut new = self.to_empty(); 230 'OUTER: while let Some(mut candidate) = old.pop() { 231 if candidate.is_empty() { 232 continue; 233 } 234 if new.lits.is_empty() { 235 new.lits.push(candidate); 236 continue; 237 } 238 for lit2 in &mut new.lits { 239 if lit2.is_empty() { 240 continue; 241 } 242 if &candidate == lit2 { 243 // If the literal is already in the set, then we can 244 // just drop it. But make sure that cut literals are 245 // infectious! 246 candidate.cut = candidate.cut || lit2.cut; 247 lit2.cut = candidate.cut; 248 continue 'OUTER; 249 } 250 if candidate.len() < lit2.len() { 251 if let Some(i) = position(&candidate, &lit2) { 252 candidate.cut(); 253 let mut lit3 = lit2.clone(); 254 lit3.truncate(i); 255 lit3.cut(); 256 old.push(lit3); 257 lit2.clear(); 258 } 259 } else { 260 if let Some(i) = position(&lit2, &candidate) { 261 lit2.cut(); 262 let mut new_candidate = candidate.clone(); 263 new_candidate.truncate(i); 264 new_candidate.cut(); 265 old.push(new_candidate); 266 candidate.clear(); 267 } 268 } 269 // Oops, the candidate is already represented in the set. 270 if candidate.is_empty() { 271 continue 'OUTER; 272 } 273 } 274 new.lits.push(candidate); 275 } 276 new.lits.retain(|lit| !lit.is_empty()); 277 new.lits.sort(); 278 new.lits.dedup(); 279 new 280 } 281 282 /// Returns a new set of suffixes of this set of literals that are 283 /// guaranteed to be unambiguous. 284 /// 285 /// Any substring match with a member of the set is returned is guaranteed 286 /// to never overlap with a substring match of another member of the set 287 /// at the same ending position. 288 /// 289 /// Given any two members of the returned set, neither is a substring of 290 /// the other. 291 pub fn unambiguous_suffixes(&self) -> Literals { 292 // This is a touch wasteful... 293 let mut lits = self.clone(); 294 lits.reverse(); 295 let mut unamb = lits.unambiguous_prefixes(); 296 unamb.reverse(); 297 unamb 298 } 299 300 /// Unions the prefixes from the given expression to this set. 301 /// 302 /// If prefixes could not be added (for example, this set would exceed its 303 /// size limits or the set of prefixes from `expr` includes the empty 304 /// string), then false is returned. 305 /// 306 /// Note that prefix literals extracted from `expr` are said to be complete 307 /// if and only if the literal extends from the beginning of `expr` to the 308 /// end of `expr`. 309 pub fn union_prefixes(&mut self, expr: &Hir) -> bool { 310 let mut lits = self.to_empty(); 311 prefixes(expr, &mut lits); 312 !lits.is_empty() && !lits.contains_empty() && self.union(lits) 313 } 314 315 /// Unions the suffixes from the given expression to this set. 316 /// 317 /// If suffixes could not be added (for example, this set would exceed its 318 /// size limits or the set of suffixes from `expr` includes the empty 319 /// string), then false is returned. 320 /// 321 /// Note that prefix literals extracted from `expr` are said to be complete 322 /// if and only if the literal extends from the end of `expr` to the 323 /// beginning of `expr`. 324 pub fn union_suffixes(&mut self, expr: &Hir) -> bool { 325 let mut lits = self.to_empty(); 326 suffixes(expr, &mut lits); 327 lits.reverse(); 328 !lits.is_empty() && !lits.contains_empty() && self.union(lits) 329 } 330 331 /// Unions this set with another set. 332 /// 333 /// If the union would cause the set to exceed its limits, then the union 334 /// is skipped and it returns false. Otherwise, if the union succeeds, it 335 /// returns true. 336 pub fn union(&mut self, lits: Literals) -> bool { 337 if self.num_bytes() + lits.num_bytes() > self.limit_size { 338 return false; 339 } 340 if lits.is_empty() { 341 self.lits.push(Literal::empty()); 342 } else { 343 self.lits.extend(lits.lits); 344 } 345 true 346 } 347 348 /// Extends this set with another set. 349 /// 350 /// The set of literals is extended via a cross product. 351 /// 352 /// If a cross product would cause this set to exceed its limits, then the 353 /// cross product is skipped and it returns false. Otherwise, if the cross 354 /// product succeeds, it returns true. 355 pub fn cross_product(&mut self, lits: &Literals) -> bool { 356 if lits.is_empty() { 357 return true; 358 } 359 // Check that we make sure we stay in our limits. 360 let mut size_after; 361 if self.is_empty() || !self.any_complete() { 362 size_after = self.num_bytes(); 363 for lits_lit in lits.literals() { 364 size_after += lits_lit.len(); 365 } 366 } else { 367 size_after = self.lits.iter().fold(0, |accum, lit| { 368 accum + if lit.is_cut() { lit.len() } else { 0 } 369 }); 370 for lits_lit in lits.literals() { 371 for self_lit in self.literals() { 372 if !self_lit.is_cut() { 373 size_after += self_lit.len() + lits_lit.len(); 374 } 375 } 376 } 377 } 378 if size_after > self.limit_size { 379 return false; 380 } 381 382 let mut base = self.remove_complete(); 383 if base.is_empty() { 384 base = vec![Literal::empty()]; 385 } 386 for lits_lit in lits.literals() { 387 for mut self_lit in base.clone() { 388 self_lit.extend(&**lits_lit); 389 self_lit.cut = lits_lit.cut; 390 self.lits.push(self_lit); 391 } 392 } 393 true 394 } 395 396 /// Extends each literal in this set with the bytes given. 397 /// 398 /// If the set is empty, then the given literal is added to the set. 399 /// 400 /// If adding any number of bytes to all members of this set causes a limit 401 /// to be exceeded, then no bytes are added and false is returned. If a 402 /// prefix of `bytes` can be fit into this set, then it is used and all 403 /// resulting literals are cut. 404 pub fn cross_add(&mut self, bytes: &[u8]) -> bool { 405 // N.B. This could be implemented by simply calling cross_product with 406 // a literal set containing just `bytes`, but we can be smarter about 407 // taking shorter prefixes of `bytes` if they'll fit. 408 if bytes.is_empty() { 409 return true; 410 } 411 if self.lits.is_empty() { 412 let i = cmp::min(self.limit_size, bytes.len()); 413 self.lits.push(Literal::new(bytes[..i].to_owned())); 414 self.lits[0].cut = i < bytes.len(); 415 return !self.lits[0].is_cut(); 416 } 417 let size = self.num_bytes(); 418 if size + self.lits.len() >= self.limit_size { 419 return false; 420 } 421 let mut i = 1; 422 while size + (i * self.lits.len()) <= self.limit_size 423 && i < bytes.len() 424 { 425 i += 1; 426 } 427 for lit in &mut self.lits { 428 if !lit.is_cut() { 429 lit.extend(&bytes[..i]); 430 if i < bytes.len() { 431 lit.cut(); 432 } 433 } 434 } 435 true 436 } 437 438 /// Adds the given literal to this set. 439 /// 440 /// Returns false if adding this literal would cause the class to be too 441 /// big. 442 pub fn add(&mut self, lit: Literal) -> bool { 443 if self.num_bytes() + lit.len() > self.limit_size { 444 return false; 445 } 446 self.lits.push(lit); 447 true 448 } 449 450 /// Extends each literal in this set with the character class given. 451 /// 452 /// Returns false if the character class was too big to add. 453 pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool { 454 self._add_char_class(cls, false) 455 } 456 457 /// Extends each literal in this set with the character class given, 458 /// writing the bytes of each character in reverse. 459 /// 460 /// Returns false if the character class was too big to add. 461 fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool { 462 self._add_char_class(cls, true) 463 } 464 465 fn _add_char_class( 466 &mut self, 467 cls: &hir::ClassUnicode, 468 reverse: bool, 469 ) -> bool { 470 use std::char; 471 472 if self.class_exceeds_limits(cls_char_count(cls)) { 473 return false; 474 } 475 let mut base = self.remove_complete(); 476 if base.is_empty() { 477 base = vec![Literal::empty()]; 478 } 479 for r in cls.iter() { 480 let (s, e) = (r.start as u32, r.end as u32 + 1); 481 for c in (s..e).filter_map(char::from_u32) { 482 for mut lit in base.clone() { 483 let mut bytes = c.to_string().into_bytes(); 484 if reverse { 485 bytes.reverse(); 486 } 487 lit.extend(&bytes); 488 self.lits.push(lit); 489 } 490 } 491 } 492 true 493 } 494 495 /// Extends each literal in this set with the byte class given. 496 /// 497 /// Returns false if the byte class was too big to add. 498 pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool { 499 if self.class_exceeds_limits(cls_byte_count(cls)) { 500 return false; 501 } 502 let mut base = self.remove_complete(); 503 if base.is_empty() { 504 base = vec![Literal::empty()]; 505 } 506 for r in cls.iter() { 507 let (s, e) = (r.start as u32, r.end as u32 + 1); 508 for b in (s..e).map(|b| b as u8) { 509 for mut lit in base.clone() { 510 lit.push(b); 511 self.lits.push(lit); 512 } 513 } 514 } 515 true 516 } 517 518 /// Cuts every member of this set. When a member is cut, it can never 519 /// be extended. 520 pub fn cut(&mut self) { 521 for lit in &mut self.lits { 522 lit.cut(); 523 } 524 } 525 526 /// Reverses all members in place. 527 pub fn reverse(&mut self) { 528 for lit in &mut self.lits { 529 lit.reverse(); 530 } 531 } 532 533 /// Clears this set of all members. 534 pub fn clear(&mut self) { 535 self.lits.clear(); 536 } 537 538 /// Pops all complete literals out of this set. 539 fn remove_complete(&mut self) -> Vec<Literal> { 540 let mut base = vec![]; 541 for lit in mem::replace(&mut self.lits, vec![]) { 542 if lit.is_cut() { 543 self.lits.push(lit); 544 } else { 545 base.push(lit); 546 } 547 } 548 base 549 } 550 551 /// Returns the total number of bytes in this set. 552 fn num_bytes(&self) -> usize { 553 self.lits.iter().fold(0, |accum, lit| accum + lit.len()) 554 } 555 556 /// Returns true if a character class with the given size would cause this 557 /// set to exceed its limits. 558 /// 559 /// The size given should correspond to the number of items in the class. 560 fn class_exceeds_limits(&self, size: usize) -> bool { 561 if size > self.limit_class { 562 return true; 563 } 564 // This is an approximation since codepoints in a char class can encode 565 // to 1-4 bytes. 566 let new_byte_count = if self.lits.is_empty() { 567 size 568 } else { 569 self.lits.iter().fold(0, |accum, lit| { 570 accum 571 + if lit.is_cut() { 572 // If the literal is cut, then we'll never add 573 // anything to it, so don't count it. 574 0 575 } else { 576 (lit.len() + 1) * size 577 } 578 }) 579 }; 580 new_byte_count > self.limit_size 581 } 582 } 583 584 fn prefixes(expr: &Hir, lits: &mut Literals) { 585 match *expr.kind() { 586 HirKind::Literal(hir::Literal::Unicode(c)) => { 587 let mut buf = [0; 4]; 588 lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); 589 } 590 HirKind::Literal(hir::Literal::Byte(b)) => { 591 lits.cross_add(&[b]); 592 } 593 HirKind::Class(hir::Class::Unicode(ref cls)) => { 594 if !lits.add_char_class(cls) { 595 lits.cut(); 596 } 597 } 598 HirKind::Class(hir::Class::Bytes(ref cls)) => { 599 if !lits.add_byte_class(cls) { 600 lits.cut(); 601 } 602 } 603 HirKind::Group(hir::Group { ref hir, .. }) => { 604 prefixes(&**hir, lits); 605 } 606 HirKind::Repetition(ref x) => match x.kind { 607 hir::RepetitionKind::ZeroOrOne => { 608 repeat_zero_or_one_literals(&x.hir, lits, prefixes); 609 } 610 hir::RepetitionKind::ZeroOrMore => { 611 repeat_zero_or_more_literals(&x.hir, lits, prefixes); 612 } 613 hir::RepetitionKind::OneOrMore => { 614 repeat_one_or_more_literals(&x.hir, lits, prefixes); 615 } 616 hir::RepetitionKind::Range(ref rng) => { 617 let (min, max) = match *rng { 618 hir::RepetitionRange::Exactly(m) => (m, Some(m)), 619 hir::RepetitionRange::AtLeast(m) => (m, None), 620 hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), 621 }; 622 repeat_range_literals( 623 &x.hir, min, max, x.greedy, lits, prefixes, 624 ) 625 } 626 }, 627 HirKind::Concat(ref es) if es.is_empty() => {} 628 HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), 629 HirKind::Concat(ref es) => { 630 for e in es { 631 if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() { 632 if !lits.is_empty() { 633 lits.cut(); 634 break; 635 } 636 lits.add(Literal::empty()); 637 continue; 638 } 639 let mut lits2 = lits.to_empty(); 640 prefixes(e, &mut lits2); 641 if !lits.cross_product(&lits2) || !lits2.any_complete() { 642 // If this expression couldn't yield any literal that 643 // could be extended, then we need to quit. Since we're 644 // short-circuiting, we also need to freeze every member. 645 lits.cut(); 646 break; 647 } 648 } 649 } 650 HirKind::Alternation(ref es) => { 651 alternate_literals(es, lits, prefixes); 652 } 653 _ => lits.cut(), 654 } 655 } 656 657 fn suffixes(expr: &Hir, lits: &mut Literals) { 658 match *expr.kind() { 659 HirKind::Literal(hir::Literal::Unicode(c)) => { 660 let mut buf = [0u8; 4]; 661 let i = c.encode_utf8(&mut buf).len(); 662 let buf = &mut buf[..i]; 663 buf.reverse(); 664 lits.cross_add(buf); 665 } 666 HirKind::Literal(hir::Literal::Byte(b)) => { 667 lits.cross_add(&[b]); 668 } 669 HirKind::Class(hir::Class::Unicode(ref cls)) => { 670 if !lits.add_char_class_reverse(cls) { 671 lits.cut(); 672 } 673 } 674 HirKind::Class(hir::Class::Bytes(ref cls)) => { 675 if !lits.add_byte_class(cls) { 676 lits.cut(); 677 } 678 } 679 HirKind::Group(hir::Group { ref hir, .. }) => { 680 suffixes(&**hir, lits); 681 } 682 HirKind::Repetition(ref x) => match x.kind { 683 hir::RepetitionKind::ZeroOrOne => { 684 repeat_zero_or_one_literals(&x.hir, lits, suffixes); 685 } 686 hir::RepetitionKind::ZeroOrMore => { 687 repeat_zero_or_more_literals(&x.hir, lits, suffixes); 688 } 689 hir::RepetitionKind::OneOrMore => { 690 repeat_one_or_more_literals(&x.hir, lits, suffixes); 691 } 692 hir::RepetitionKind::Range(ref rng) => { 693 let (min, max) = match *rng { 694 hir::RepetitionRange::Exactly(m) => (m, Some(m)), 695 hir::RepetitionRange::AtLeast(m) => (m, None), 696 hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), 697 }; 698 repeat_range_literals( 699 &x.hir, min, max, x.greedy, lits, suffixes, 700 ) 701 } 702 }, 703 HirKind::Concat(ref es) if es.is_empty() => {} 704 HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), 705 HirKind::Concat(ref es) => { 706 for e in es.iter().rev() { 707 if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() { 708 if !lits.is_empty() { 709 lits.cut(); 710 break; 711 } 712 lits.add(Literal::empty()); 713 continue; 714 } 715 let mut lits2 = lits.to_empty(); 716 suffixes(e, &mut lits2); 717 if !lits.cross_product(&lits2) || !lits2.any_complete() { 718 // If this expression couldn't yield any literal that 719 // could be extended, then we need to quit. Since we're 720 // short-circuiting, we also need to freeze every member. 721 lits.cut(); 722 break; 723 } 724 } 725 } 726 HirKind::Alternation(ref es) => { 727 alternate_literals(es, lits, suffixes); 728 } 729 _ => lits.cut(), 730 } 731 } 732 733 fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>( 734 e: &Hir, 735 lits: &mut Literals, 736 mut f: F, 737 ) { 738 let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); 739 lits3.set_limit_size(lits.limit_size() / 2); 740 f(e, &mut lits3); 741 742 if lits3.is_empty() || !lits2.cross_product(&lits3) { 743 lits.cut(); 744 return; 745 } 746 lits2.add(Literal::empty()); 747 if !lits.union(lits2) { 748 lits.cut(); 749 } 750 } 751 752 fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>( 753 e: &Hir, 754 lits: &mut Literals, 755 mut f: F, 756 ) { 757 let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); 758 lits3.set_limit_size(lits.limit_size() / 2); 759 f(e, &mut lits3); 760 761 if lits3.is_empty() || !lits2.cross_product(&lits3) { 762 lits.cut(); 763 return; 764 } 765 lits2.cut(); 766 lits2.add(Literal::empty()); 767 if !lits.union(lits2) { 768 lits.cut(); 769 } 770 } 771 772 fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>( 773 e: &Hir, 774 lits: &mut Literals, 775 mut f: F, 776 ) { 777 f(e, lits); 778 lits.cut(); 779 } 780 781 fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>( 782 e: &Hir, 783 min: u32, 784 max: Option<u32>, 785 greedy: bool, 786 lits: &mut Literals, 787 mut f: F, 788 ) { 789 if min == 0 { 790 // This is a bit conservative. If `max` is set, then we could 791 // treat this as a finite set of alternations. For now, we 792 // just treat it as `e*`. 793 f( 794 &Hir::repetition(hir::Repetition { 795 kind: hir::RepetitionKind::ZeroOrMore, 796 greedy: greedy, 797 hir: Box::new(e.clone()), 798 }), 799 lits, 800 ); 801 } else { 802 if min > 0 { 803 let n = cmp::min(lits.limit_size, min as usize); 804 let es = iter::repeat(e.clone()).take(n).collect(); 805 f(&Hir::concat(es), lits); 806 if n < min as usize || lits.contains_empty() { 807 lits.cut(); 808 } 809 } 810 if max.map_or(true, |max| min < max) { 811 lits.cut(); 812 } 813 } 814 } 815 816 fn alternate_literals<F: FnMut(&Hir, &mut Literals)>( 817 es: &[Hir], 818 lits: &mut Literals, 819 mut f: F, 820 ) { 821 let mut lits2 = lits.to_empty(); 822 for e in es { 823 let mut lits3 = lits.to_empty(); 824 lits3.set_limit_size(lits.limit_size() / 5); 825 f(e, &mut lits3); 826 if lits3.is_empty() || !lits2.union(lits3) { 827 // If we couldn't find suffixes for *any* of the 828 // alternates, then the entire alternation has to be thrown 829 // away and any existing members must be frozen. Similarly, 830 // if the union couldn't complete, stop and freeze. 831 lits.cut(); 832 return; 833 } 834 } 835 if !lits.cross_product(&lits2) { 836 lits.cut(); 837 } 838 } 839 840 impl fmt::Debug for Literals { 841 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 842 f.debug_struct("Literals") 843 .field("lits", &self.lits) 844 .field("limit_size", &self.limit_size) 845 .field("limit_class", &self.limit_class) 846 .finish() 847 } 848 } 849 850 impl Literal { 851 /// Returns a new complete literal with the bytes given. 852 pub fn new(bytes: Vec<u8>) -> Literal { 853 Literal { v: bytes, cut: false } 854 } 855 856 /// Returns a new complete empty literal. 857 pub fn empty() -> Literal { 858 Literal { v: vec![], cut: false } 859 } 860 861 /// Returns true if this literal was "cut." 862 pub fn is_cut(&self) -> bool { 863 self.cut 864 } 865 866 /// Cuts this literal. 867 pub fn cut(&mut self) { 868 self.cut = true; 869 } 870 } 871 872 impl PartialEq for Literal { 873 fn eq(&self, other: &Literal) -> bool { 874 self.v == other.v 875 } 876 } 877 878 impl PartialOrd for Literal { 879 fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> { 880 self.v.partial_cmp(&other.v) 881 } 882 } 883 884 impl fmt::Debug for Literal { 885 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 886 if self.is_cut() { 887 write!(f, "Cut({})", escape_unicode(&self.v)) 888 } else { 889 write!(f, "Complete({})", escape_unicode(&self.v)) 890 } 891 } 892 } 893 894 impl AsRef<[u8]> for Literal { 895 fn as_ref(&self) -> &[u8] { 896 &self.v 897 } 898 } 899 900 impl ops::Deref for Literal { 901 type Target = Vec<u8>; 902 fn deref(&self) -> &Vec<u8> { 903 &self.v 904 } 905 } 906 907 impl ops::DerefMut for Literal { 908 fn deref_mut(&mut self) -> &mut Vec<u8> { 909 &mut self.v 910 } 911 } 912 913 fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> { 914 let mut i = 0; 915 while haystack.len() >= needle.len() { 916 if needle == &haystack[..needle.len()] { 917 return Some(i); 918 } 919 i += 1; 920 haystack = &haystack[1..]; 921 } 922 None 923 } 924 925 fn escape_unicode(bytes: &[u8]) -> String { 926 let show = match ::std::str::from_utf8(bytes) { 927 Ok(v) => v.to_string(), 928 Err(_) => escape_bytes(bytes), 929 }; 930 let mut space_escaped = String::new(); 931 for c in show.chars() { 932 if c.is_whitespace() { 933 let escaped = if c as u32 <= 0x7F { 934 escape_byte(c as u8) 935 } else { 936 if c as u32 <= 0xFFFF { 937 format!(r"\u{{{:04x}}}", c as u32) 938 } else { 939 format!(r"\U{{{:08x}}}", c as u32) 940 } 941 }; 942 space_escaped.push_str(&escaped); 943 } else { 944 space_escaped.push(c); 945 } 946 } 947 space_escaped 948 } 949 950 fn escape_bytes(bytes: &[u8]) -> String { 951 let mut s = String::new(); 952 for &b in bytes { 953 s.push_str(&escape_byte(b)); 954 } 955 s 956 } 957 958 fn escape_byte(byte: u8) -> String { 959 use std::ascii::escape_default; 960 961 let escaped: Vec<u8> = escape_default(byte).collect(); 962 String::from_utf8_lossy(&escaped).into_owned() 963 } 964 965 fn cls_char_count(cls: &hir::ClassUnicode) -> usize { 966 cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() 967 as usize 968 } 969 970 fn cls_byte_count(cls: &hir::ClassBytes) -> usize { 971 cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() 972 as usize 973 } 974 975 #[cfg(test)] 976 mod tests { 977 use std::fmt; 978 979 use super::{escape_bytes, Literal, Literals}; 980 use crate::hir::Hir; 981 use crate::ParserBuilder; 982 983 // To make test failures easier to read. 984 #[derive(Debug, Eq, PartialEq)] 985 struct Bytes(Vec<ULiteral>); 986 #[derive(Debug, Eq, PartialEq)] 987 struct Unicode(Vec<ULiteral>); 988 989 fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> { 990 let mut ulits = vec![]; 991 for blit in blits { 992 ulits 993 .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() }); 994 } 995 ulits 996 } 997 998 fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals { 999 Literals { 1000 lits: it.into_iter().collect(), 1001 limit_size: 0, 1002 limit_class: 0, 1003 } 1004 } 1005 1006 // Needs to be pub for 1.3? 1007 #[derive(Clone, Eq, PartialEq)] 1008 pub struct ULiteral { 1009 v: String, 1010 cut: bool, 1011 } 1012 1013 impl ULiteral { 1014 fn is_cut(&self) -> bool { 1015 self.cut 1016 } 1017 } 1018 1019 impl fmt::Debug for ULiteral { 1020 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1021 if self.is_cut() { 1022 write!(f, "Cut({})", self.v) 1023 } else { 1024 write!(f, "Complete({})", self.v) 1025 } 1026 } 1027 } 1028 1029 impl PartialEq<Literal> for ULiteral { 1030 fn eq(&self, other: &Literal) -> bool { 1031 self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() 1032 } 1033 } 1034 1035 impl PartialEq<ULiteral> for Literal { 1036 fn eq(&self, other: &ULiteral) -> bool { 1037 &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() 1038 } 1039 } 1040 1041 #[allow(non_snake_case)] 1042 fn C(s: &'static str) -> ULiteral { 1043 ULiteral { v: s.to_owned(), cut: true } 1044 } 1045 #[allow(non_snake_case)] 1046 fn M(s: &'static str) -> ULiteral { 1047 ULiteral { v: s.to_owned(), cut: false } 1048 } 1049 1050 fn prefixes(lits: &mut Literals, expr: &Hir) { 1051 lits.union_prefixes(expr); 1052 } 1053 1054 fn suffixes(lits: &mut Literals, expr: &Hir) { 1055 lits.union_suffixes(expr); 1056 } 1057 1058 macro_rules! assert_lit_eq { 1059 ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ 1060 let expected: Vec<ULiteral> = vec![$($expected_lit),*]; 1061 let lits = $got_lits; 1062 assert_eq!( 1063 $which(expected.clone()), 1064 $which(escape_lits(lits.literals()))); 1065 assert_eq!( 1066 !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), 1067 lits.all_complete()); 1068 assert_eq!( 1069 expected.iter().any(|l| !l.is_cut()), 1070 lits.any_complete()); 1071 }}; 1072 } 1073 1074 macro_rules! test_lit { 1075 ($name:ident, $which:ident, $re:expr) => { 1076 test_lit!($name, $which, $re,); 1077 }; 1078 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { 1079 #[test] 1080 fn $name() { 1081 let expr = ParserBuilder::new() 1082 .build() 1083 .parse($re) 1084 .unwrap(); 1085 let lits = Literals::$which(&expr); 1086 assert_lit_eq!(Unicode, lits, $($lit),*); 1087 1088 let expr = ParserBuilder::new() 1089 .allow_invalid_utf8(true) 1090 .unicode(false) 1091 .build() 1092 .parse($re) 1093 .unwrap(); 1094 let lits = Literals::$which(&expr); 1095 assert_lit_eq!(Bytes, lits, $($lit),*); 1096 } 1097 }; 1098 } 1099 1100 // ************************************************************************ 1101 // Tests for prefix literal extraction. 1102 // ************************************************************************ 1103 1104 // Elementary tests. 1105 test_lit!(pfx_one_lit1, prefixes, "a", M("a")); 1106 test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); 1107 test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); 1108 #[cfg(feature = "unicode-case")] 1109 test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); 1110 test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); 1111 test_lit!( 1112 pfx_class2, 1113 prefixes, 1114 "(?u)[☃Ⅰ]", 1115 M("\\xe2\\x85\\xa0"), 1116 M("\\xe2\\x98\\x83") 1117 ); 1118 #[cfg(feature = "unicode-case")] 1119 test_lit!( 1120 pfx_class3, 1121 prefixes, 1122 "(?ui)[☃Ⅰ]", 1123 M("\\xe2\\x85\\xa0"), 1124 M("\\xe2\\x85\\xb0"), 1125 M("\\xe2\\x98\\x83") 1126 ); 1127 test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a")); 1128 test_lit!( 1129 pfx_one_lit_casei2, 1130 prefixes, 1131 "(?i-u)abc", 1132 M("ABC"), 1133 M("aBC"), 1134 M("AbC"), 1135 M("abC"), 1136 M("ABc"), 1137 M("aBc"), 1138 M("Abc"), 1139 M("abc") 1140 ); 1141 test_lit!(pfx_group1, prefixes, "(a)", M("a")); 1142 test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); 1143 test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); 1144 test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); 1145 test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); 1146 test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); 1147 test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); 1148 test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); 1149 test_lit!(pfx_rep_range1, prefixes, "a{0}"); 1150 test_lit!(pfx_rep_range2, prefixes, "a{0,}"); 1151 test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); 1152 test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); 1153 test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); 1154 test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); 1155 test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); 1156 1157 // Test regexes with concatenations. 1158 test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); 1159 test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); 1160 test_lit!( 1161 pfx_cat3, 1162 prefixes, 1163 "(?i-u)[ab]z", 1164 M("AZ"), 1165 M("BZ"), 1166 M("aZ"), 1167 M("bZ"), 1168 M("Az"), 1169 M("Bz"), 1170 M("az"), 1171 M("bz") 1172 ); 1173 test_lit!( 1174 pfx_cat4, 1175 prefixes, 1176 "[ab][yz]", 1177 M("ay"), 1178 M("by"), 1179 M("az"), 1180 M("bz") 1181 ); 1182 test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); 1183 test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); 1184 test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); 1185 test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); 1186 test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); 1187 test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); 1188 test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); 1189 test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); 1190 test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); 1191 test_lit!(pfx_cat14, prefixes, "a^", C("a")); 1192 test_lit!(pfx_cat15, prefixes, "$a"); 1193 test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); 1194 test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); 1195 test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); 1196 test_lit!(pfx_cat19, prefixes, "a.z", C("a")); 1197 1198 // Test regexes with alternations. 1199 test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); 1200 test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); 1201 test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); 1202 test_lit!(pfx_alt4, prefixes, "a|b*"); 1203 test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); 1204 test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); 1205 test_lit!( 1206 pfx_alt7, 1207 prefixes, 1208 "(a|b)*c|(a|ab)*c", 1209 C("a"), 1210 C("b"), 1211 M("c"), 1212 C("a"), 1213 C("ab"), 1214 M("c") 1215 ); 1216 test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); 1217 1218 // Test regexes with empty assertions. 1219 test_lit!(pfx_empty1, prefixes, "^a", M("a")); 1220 test_lit!(pfx_empty2, prefixes, "a${2}", C("a")); 1221 test_lit!(pfx_empty3, prefixes, "^abc", M("abc")); 1222 test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); 1223 1224 // Make sure some curious regexes have no prefixes. 1225 test_lit!(pfx_nothing1, prefixes, "."); 1226 test_lit!(pfx_nothing2, prefixes, "(?s)."); 1227 test_lit!(pfx_nothing3, prefixes, "^"); 1228 test_lit!(pfx_nothing4, prefixes, "$"); 1229 test_lit!(pfx_nothing6, prefixes, "(?m)$"); 1230 test_lit!(pfx_nothing7, prefixes, r"\b"); 1231 test_lit!(pfx_nothing8, prefixes, r"\B"); 1232 1233 // Test a few regexes that defeat any prefix literal detection. 1234 test_lit!(pfx_defeated1, prefixes, ".a"); 1235 test_lit!(pfx_defeated2, prefixes, "(?s).a"); 1236 test_lit!(pfx_defeated3, prefixes, "a*b*c*"); 1237 test_lit!(pfx_defeated4, prefixes, "a|."); 1238 test_lit!(pfx_defeated5, prefixes, ".|a"); 1239 test_lit!(pfx_defeated6, prefixes, "a|^"); 1240 test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); 1241 test_lit!(pfx_defeated8, prefixes, "$a"); 1242 test_lit!(pfx_defeated9, prefixes, "(?m)$a"); 1243 test_lit!(pfx_defeated10, prefixes, r"\ba"); 1244 test_lit!(pfx_defeated11, prefixes, r"\Ba"); 1245 test_lit!(pfx_defeated12, prefixes, "^*a"); 1246 test_lit!(pfx_defeated13, prefixes, "^+a"); 1247 1248 test_lit!( 1249 pfx_crazy1, 1250 prefixes, 1251 r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", 1252 C("Mo\\'am"), 1253 C("Mu\\'am"), 1254 C("Moam"), 1255 C("Muam") 1256 ); 1257 1258 // ************************************************************************ 1259 // Tests for quiting prefix literal search. 1260 // ************************************************************************ 1261 1262 macro_rules! test_exhausted { 1263 ($name:ident, $which:ident, $re:expr) => { 1264 test_exhausted!($name, $which, $re,); 1265 }; 1266 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { 1267 #[test] 1268 fn $name() { 1269 let expr = ParserBuilder::new() 1270 .build() 1271 .parse($re) 1272 .unwrap(); 1273 let mut lits = Literals::empty(); 1274 lits.set_limit_size(20).set_limit_class(10); 1275 $which(&mut lits, &expr); 1276 assert_lit_eq!(Unicode, lits, $($lit),*); 1277 1278 let expr = ParserBuilder::new() 1279 .allow_invalid_utf8(true) 1280 .unicode(false) 1281 .build() 1282 .parse($re) 1283 .unwrap(); 1284 let mut lits = Literals::empty(); 1285 lits.set_limit_size(20).set_limit_class(10); 1286 $which(&mut lits, &expr); 1287 assert_lit_eq!(Bytes, lits, $($lit),*); 1288 } 1289 }; 1290 } 1291 1292 // These test use a much lower limit than the default so that we can 1293 // write test cases of reasonable size. 1294 test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); 1295 test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); 1296 test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); 1297 test_exhausted!( 1298 pfx_exhausted4, 1299 prefixes, 1300 "(?i-u)foobar", 1301 C("FO"), 1302 C("fO"), 1303 C("Fo"), 1304 C("fo") 1305 ); 1306 test_exhausted!( 1307 pfx_exhausted5, 1308 prefixes, 1309 "(?:ab){100}", 1310 C("abababababababababab") 1311 ); 1312 test_exhausted!( 1313 pfx_exhausted6, 1314 prefixes, 1315 "(?:(?:ab){100})*cd", 1316 C("ababababab"), 1317 M("cd") 1318 ); 1319 test_exhausted!( 1320 pfx_exhausted7, 1321 prefixes, 1322 "z(?:(?:ab){100})*cd", 1323 C("zababababab"), 1324 M("zcd") 1325 ); 1326 test_exhausted!( 1327 pfx_exhausted8, 1328 prefixes, 1329 "aaaaaaaaaaaaaaaaaaaaz", 1330 C("aaaaaaaaaaaaaaaaaaaa") 1331 ); 1332 1333 // ************************************************************************ 1334 // Tests for suffix literal extraction. 1335 // ************************************************************************ 1336 1337 // Elementary tests. 1338 test_lit!(sfx_one_lit1, suffixes, "a", M("a")); 1339 test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); 1340 test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); 1341 #[cfg(feature = "unicode-case")] 1342 test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83")); 1343 test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); 1344 test_lit!( 1345 sfx_class2, 1346 suffixes, 1347 "(?u)[☃Ⅰ]", 1348 M("\\xe2\\x85\\xa0"), 1349 M("\\xe2\\x98\\x83") 1350 ); 1351 #[cfg(feature = "unicode-case")] 1352 test_lit!( 1353 sfx_class3, 1354 suffixes, 1355 "(?ui)[☃Ⅰ]", 1356 M("\\xe2\\x85\\xa0"), 1357 M("\\xe2\\x85\\xb0"), 1358 M("\\xe2\\x98\\x83") 1359 ); 1360 test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a")); 1361 test_lit!( 1362 sfx_one_lit_casei2, 1363 suffixes, 1364 "(?i-u)abc", 1365 M("ABC"), 1366 M("ABc"), 1367 M("AbC"), 1368 M("Abc"), 1369 M("aBC"), 1370 M("aBc"), 1371 M("abC"), 1372 M("abc") 1373 ); 1374 test_lit!(sfx_group1, suffixes, "(a)", M("a")); 1375 test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); 1376 test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); 1377 test_lit!(sfx_rep_zero_or_more1, suffixes, "a*"); 1378 test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); 1379 test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); 1380 test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); 1381 test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); 1382 test_lit!(sfx_rep_range1, suffixes, "a{0}"); 1383 test_lit!(sfx_rep_range2, suffixes, "a{0,}"); 1384 test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); 1385 test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); 1386 test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); 1387 test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); 1388 test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); 1389 1390 // Test regexes with concatenations. 1391 test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); 1392 test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); 1393 test_lit!( 1394 sfx_cat3, 1395 suffixes, 1396 "(?i-u)[ab]z", 1397 M("AZ"), 1398 M("Az"), 1399 M("BZ"), 1400 M("Bz"), 1401 M("aZ"), 1402 M("az"), 1403 M("bZ"), 1404 M("bz") 1405 ); 1406 test_lit!( 1407 sfx_cat4, 1408 suffixes, 1409 "[ab][yz]", 1410 M("ay"), 1411 M("az"), 1412 M("by"), 1413 M("bz") 1414 ); 1415 test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); 1416 test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); 1417 test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); 1418 test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); 1419 test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); 1420 test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); 1421 test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); 1422 test_lit!(sfx_cat12, suffixes, "ab+", C("b")); 1423 test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); 1424 test_lit!(sfx_cat14, suffixes, "a^"); 1425 test_lit!(sfx_cat15, suffixes, "$a", C("a")); 1426 test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); 1427 test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); 1428 test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); 1429 test_lit!(sfx_cat19, suffixes, "a.z", C("z")); 1430 1431 // Test regexes with alternations. 1432 test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); 1433 test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); 1434 test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); 1435 test_lit!(sfx_alt4, suffixes, "a|b*"); 1436 test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); 1437 test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); 1438 test_lit!( 1439 sfx_alt7, 1440 suffixes, 1441 "(a|b)*c|(a|ab)*c", 1442 C("ac"), 1443 C("bc"), 1444 M("c"), 1445 C("ac"), 1446 C("abc"), 1447 M("c") 1448 ); 1449 test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); 1450 1451 // Test regexes with empty assertions. 1452 test_lit!(sfx_empty1, suffixes, "a$", M("a")); 1453 test_lit!(sfx_empty2, suffixes, "${2}a", C("a")); 1454 1455 // Make sure some curious regexes have no suffixes. 1456 test_lit!(sfx_nothing1, suffixes, "."); 1457 test_lit!(sfx_nothing2, suffixes, "(?s)."); 1458 test_lit!(sfx_nothing3, suffixes, "^"); 1459 test_lit!(sfx_nothing4, suffixes, "$"); 1460 test_lit!(sfx_nothing6, suffixes, "(?m)$"); 1461 test_lit!(sfx_nothing7, suffixes, r"\b"); 1462 test_lit!(sfx_nothing8, suffixes, r"\B"); 1463 1464 // Test a few regexes that defeat any suffix literal detection. 1465 test_lit!(sfx_defeated1, suffixes, "a."); 1466 test_lit!(sfx_defeated2, suffixes, "(?s)a."); 1467 test_lit!(sfx_defeated3, suffixes, "a*b*c*"); 1468 test_lit!(sfx_defeated4, suffixes, "a|."); 1469 test_lit!(sfx_defeated5, suffixes, ".|a"); 1470 test_lit!(sfx_defeated6, suffixes, "a|^"); 1471 test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); 1472 test_lit!(sfx_defeated8, suffixes, "a^"); 1473 test_lit!(sfx_defeated9, suffixes, "(?m)a$"); 1474 test_lit!(sfx_defeated10, suffixes, r"a\b"); 1475 test_lit!(sfx_defeated11, suffixes, r"a\B"); 1476 test_lit!(sfx_defeated12, suffixes, "a^*"); 1477 test_lit!(sfx_defeated13, suffixes, "a^+"); 1478 1479 // These test use a much lower limit than the default so that we can 1480 // write test cases of reasonable size. 1481 test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); 1482 test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); 1483 test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); 1484 test_exhausted!( 1485 sfx_exhausted4, 1486 suffixes, 1487 "(?i-u)foobar", 1488 C("AR"), 1489 C("Ar"), 1490 C("aR"), 1491 C("ar") 1492 ); 1493 test_exhausted!( 1494 sfx_exhausted5, 1495 suffixes, 1496 "(?:ab){100}", 1497 C("abababababababababab") 1498 ); 1499 test_exhausted!( 1500 sfx_exhausted6, 1501 suffixes, 1502 "cd(?:(?:ab){100})*", 1503 C("ababababab"), 1504 M("cd") 1505 ); 1506 test_exhausted!( 1507 sfx_exhausted7, 1508 suffixes, 1509 "cd(?:(?:ab){100})*z", 1510 C("abababababz"), 1511 M("cdz") 1512 ); 1513 test_exhausted!( 1514 sfx_exhausted8, 1515 suffixes, 1516 "zaaaaaaaaaaaaaaaaaaaa", 1517 C("aaaaaaaaaaaaaaaaaaaa") 1518 ); 1519 1520 // ************************************************************************ 1521 // Tests for generating unambiguous literal sets. 1522 // ************************************************************************ 1523 1524 macro_rules! test_unamb { 1525 ($name:ident, $given:expr, $expected:expr) => { 1526 #[test] 1527 fn $name() { 1528 let given: Vec<Literal> = $given 1529 .into_iter() 1530 .map(|ul| { 1531 let cut = ul.is_cut(); 1532 Literal { v: ul.v.into_bytes(), cut: cut } 1533 }) 1534 .collect(); 1535 let lits = create_lits(given); 1536 let got = lits.unambiguous_prefixes(); 1537 assert_eq!($expected, escape_lits(got.literals())); 1538 } 1539 }; 1540 } 1541 1542 test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); 1543 test_unamb!( 1544 unambiguous2, 1545 vec![M("zaaaaaa"), M("aa")], 1546 vec![C("aa"), C("z")] 1547 ); 1548 test_unamb!( 1549 unambiguous3, 1550 vec![M("Sherlock"), M("Watson")], 1551 vec![M("Sherlock"), M("Watson")] 1552 ); 1553 test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); 1554 test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); 1555 test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); 1556 test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); 1557 test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); 1558 test_unamb!( 1559 unambiguous9, 1560 vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")], 1561 vec![C("a"), C("b"), C("c")] 1562 ); 1563 test_unamb!( 1564 unambiguous10, 1565 vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], 1566 vec![C("Mo"), C("Mu")] 1567 ); 1568 test_unamb!( 1569 unambiguous11, 1570 vec![M("zazb"), M("azb")], 1571 vec![C("a"), C("z")] 1572 ); 1573 test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); 1574 test_unamb!( 1575 unambiguous13, 1576 vec![M("ABCX"), M("CDAX"), M("BCX")], 1577 vec![C("A"), C("BCX"), C("CD")] 1578 ); 1579 test_unamb!( 1580 unambiguous14, 1581 vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")], 1582 vec![M("DSX"), C("I"), C("MGX"), C("MV")] 1583 ); 1584 test_unamb!( 1585 unambiguous15, 1586 vec![M("IMG_"), M("MG_"), M("CIMG")], 1587 vec![C("C"), C("I"), C("MG_")] 1588 ); 1589 1590 // ************************************************************************ 1591 // Tests for suffix trimming. 1592 // ************************************************************************ 1593 macro_rules! test_trim { 1594 ($name:ident, $trim:expr, $given:expr, $expected:expr) => { 1595 #[test] 1596 fn $name() { 1597 let given: Vec<Literal> = $given 1598 .into_iter() 1599 .map(|ul| { 1600 let cut = ul.is_cut(); 1601 Literal { v: ul.v.into_bytes(), cut: cut } 1602 }) 1603 .collect(); 1604 let lits = create_lits(given); 1605 let got = lits.trim_suffix($trim).unwrap(); 1606 assert_eq!($expected, escape_lits(got.literals())); 1607 } 1608 }; 1609 } 1610 1611 test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]); 1612 test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]); 1613 test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]); 1614 test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]); 1615 1616 // ************************************************************************ 1617 // Tests for longest common prefix. 1618 // ************************************************************************ 1619 1620 macro_rules! test_lcp { 1621 ($name:ident, $given:expr, $expected:expr) => { 1622 #[test] 1623 fn $name() { 1624 let given: Vec<Literal> = $given 1625 .into_iter() 1626 .map(|s: &str| Literal { 1627 v: s.to_owned().into_bytes(), 1628 cut: false, 1629 }) 1630 .collect(); 1631 let lits = create_lits(given); 1632 let got = lits.longest_common_prefix(); 1633 assert_eq!($expected, escape_bytes(got)); 1634 } 1635 }; 1636 } 1637 1638 test_lcp!(lcp1, vec!["a"], "a"); 1639 test_lcp!(lcp2, vec![], ""); 1640 test_lcp!(lcp3, vec!["a", "b"], ""); 1641 test_lcp!(lcp4, vec!["ab", "ab"], "ab"); 1642 test_lcp!(lcp5, vec!["ab", "a"], "a"); 1643 test_lcp!(lcp6, vec!["a", "ab"], "a"); 1644 test_lcp!(lcp7, vec!["ab", "b"], ""); 1645 test_lcp!(lcp8, vec!["b", "ab"], ""); 1646 test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); 1647 test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); 1648 test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); 1649 test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); 1650 1651 // ************************************************************************ 1652 // Tests for longest common suffix. 1653 // ************************************************************************ 1654 1655 macro_rules! test_lcs { 1656 ($name:ident, $given:expr, $expected:expr) => { 1657 #[test] 1658 fn $name() { 1659 let given: Vec<Literal> = $given 1660 .into_iter() 1661 .map(|s: &str| Literal { 1662 v: s.to_owned().into_bytes(), 1663 cut: false, 1664 }) 1665 .collect(); 1666 let lits = create_lits(given); 1667 let got = lits.longest_common_suffix(); 1668 assert_eq!($expected, escape_bytes(got)); 1669 } 1670 }; 1671 } 1672 1673 test_lcs!(lcs1, vec!["a"], "a"); 1674 test_lcs!(lcs2, vec![], ""); 1675 test_lcs!(lcs3, vec!["a", "b"], ""); 1676 test_lcs!(lcs4, vec!["ab", "ab"], "ab"); 1677 test_lcs!(lcs5, vec!["ab", "a"], ""); 1678 test_lcs!(lcs6, vec!["a", "ab"], ""); 1679 test_lcs!(lcs7, vec!["ab", "b"], "b"); 1680 test_lcs!(lcs8, vec!["b", "ab"], "b"); 1681 test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); 1682 test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); 1683 test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); 1684 test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); 1685 } 1686