1 use core::cmp; 2 3 use crate::memmem::{prefilter::Pre, util}; 4 5 /// Two-Way search in the forward direction. 6 #[derive(Clone, Copy, Debug)] 7 pub(crate) struct Forward(TwoWay); 8 9 /// Two-Way search in the reverse direction. 10 #[derive(Clone, Copy, Debug)] 11 pub(crate) struct Reverse(TwoWay); 12 13 /// An implementation of the TwoWay substring search algorithm, with heuristics 14 /// for accelerating search based on frequency analysis. 15 /// 16 /// This searcher supports forward and reverse search, although not 17 /// simultaneously. It runs in O(n + m) time and O(1) space, where 18 /// `n ~ len(needle)` and `m ~ len(haystack)`. 19 /// 20 /// The implementation here roughly matches that which was developed by 21 /// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The 22 /// changes in this implementation are 1) the use of zero-based indices, 2) a 23 /// heuristic skip table based on the last byte (borrowed from Rust's standard 24 /// library) and 3) the addition of heuristics for a fast skip loop. That is, 25 /// (3) this will detect bytes that are believed to be rare in the needle and 26 /// use fast vectorized instructions to find their occurrences quickly. The 27 /// Two-Way algorithm is then used to confirm whether a match at that location 28 /// occurred. 29 /// 30 /// The heuristic for fast skipping is automatically shut off if it's 31 /// detected to be ineffective at search time. Generally, this only occurs in 32 /// pathological cases. But this is generally necessary in order to preserve 33 /// a `O(n + m)` time bound. 34 /// 35 /// The code below is fairly complex and not obviously correct at all. It's 36 /// likely necessary to read the Two-Way paper cited above in order to fully 37 /// grok this code. The essence of it is: 38 /// 39 /// 1) Do something to detect a "critical" position in the needle. 40 /// 2) For the current position in the haystack, look if needle[critical..] 41 /// matches at that position. 42 /// 3) If so, look if needle[..critical] matches. 43 /// 4) If a mismatch occurs, shift the search by some amount based on the 44 /// critical position and a pre-computed shift. 45 /// 46 /// This type is wrapped in Forward and Reverse types that expose consistent 47 /// forward or reverse APIs. 48 #[derive(Clone, Copy, Debug)] 49 struct TwoWay { 50 /// A small bitset used as a quick prefilter (in addition to the faster 51 /// SIMD based prefilter). Namely, a bit 'i' is set if and only if b%64==i 52 /// for any b in the needle. 53 /// 54 /// When used as a prefilter, if the last byte at the current candidate 55 /// position is NOT in this set, then we can skip that entire candidate 56 /// position (the length of the needle). This is essentially the shift 57 /// trick found in Boyer-Moore, but only applied to bytes that don't appear 58 /// in the needle. 59 /// 60 /// N.B. This trick was inspired by something similar in std's 61 /// implementation of Two-Way. 62 byteset: ApproximateByteSet, 63 /// A critical position in needle. Specifically, this position corresponds 64 /// to beginning of either the minimal or maximal suffix in needle. (N.B. 65 /// See SuffixType below for why "minimal" isn't quite the correct word 66 /// here.) 67 /// 68 /// This is the position at which every search begins. Namely, search 69 /// starts by scanning text to the right of this position, and only if 70 /// there's a match does the text to the left of this position get scanned. 71 critical_pos: usize, 72 /// The amount we shift by in the Two-Way search algorithm. This 73 /// corresponds to the "small period" and "large period" cases. 74 shift: Shift, 75 } 76 77 impl Forward { 78 /// Create a searcher that uses the Two-Way algorithm by searching forwards 79 /// through any haystack. new(needle: &[u8]) -> Forward80 pub(crate) fn new(needle: &[u8]) -> Forward { 81 if needle.is_empty() { 82 return Forward(TwoWay::empty()); 83 } 84 85 let byteset = ApproximateByteSet::new(needle); 86 let min_suffix = Suffix::forward(needle, SuffixKind::Minimal); 87 let max_suffix = Suffix::forward(needle, SuffixKind::Maximal); 88 let (period_lower_bound, critical_pos) = 89 if min_suffix.pos > max_suffix.pos { 90 (min_suffix.period, min_suffix.pos) 91 } else { 92 (max_suffix.period, max_suffix.pos) 93 }; 94 let shift = Shift::forward(needle, period_lower_bound, critical_pos); 95 Forward(TwoWay { byteset, critical_pos, shift }) 96 } 97 98 /// Find the position of the first occurrence of this searcher's needle in 99 /// the given haystack. If one does not exist, then return None. 100 /// 101 /// This accepts prefilter state that is useful when using the same 102 /// searcher multiple times, such as in an iterator. 103 /// 104 /// Callers must guarantee that the needle is non-empty and its length is 105 /// <= the haystack's length. 106 #[inline(always)] find( &self, pre: Option<&mut Pre<'_>>, haystack: &[u8], needle: &[u8], ) -> Option<usize>107 pub(crate) fn find( 108 &self, 109 pre: Option<&mut Pre<'_>>, 110 haystack: &[u8], 111 needle: &[u8], 112 ) -> Option<usize> { 113 debug_assert!(!needle.is_empty(), "needle should not be empty"); 114 debug_assert!(needle.len() <= haystack.len(), "haystack too short"); 115 116 match self.0.shift { 117 Shift::Small { period } => { 118 self.find_small_imp(pre, haystack, needle, period) 119 } 120 Shift::Large { shift } => { 121 self.find_large_imp(pre, haystack, needle, shift) 122 } 123 } 124 } 125 126 /// Like find, but handles the degenerate substring test cases. This is 127 /// only useful for conveniently testing this substring implementation in 128 /// isolation. 129 #[cfg(test)] find_general( &self, pre: Option<&mut Pre<'_>>, haystack: &[u8], needle: &[u8], ) -> Option<usize>130 fn find_general( 131 &self, 132 pre: Option<&mut Pre<'_>>, 133 haystack: &[u8], 134 needle: &[u8], 135 ) -> Option<usize> { 136 if needle.is_empty() { 137 Some(0) 138 } else if haystack.len() < needle.len() { 139 None 140 } else { 141 self.find(pre, haystack, needle) 142 } 143 } 144 145 // Each of the two search implementations below can be accelerated by a 146 // prefilter, but it is not always enabled. To avoid its overhead when 147 // its disabled, we explicitly inline each search implementation based on 148 // whether a prefilter will be used or not. The decision on which to use 149 // is made in the parent meta searcher. 150 151 #[inline(always)] find_small_imp( &self, mut pre: Option<&mut Pre<'_>>, haystack: &[u8], needle: &[u8], period: usize, ) -> Option<usize>152 fn find_small_imp( 153 &self, 154 mut pre: Option<&mut Pre<'_>>, 155 haystack: &[u8], 156 needle: &[u8], 157 period: usize, 158 ) -> Option<usize> { 159 let last_byte = needle.len() - 1; 160 let mut pos = 0; 161 let mut shift = 0; 162 while pos + needle.len() <= haystack.len() { 163 let mut i = cmp::max(self.0.critical_pos, shift); 164 if let Some(pre) = pre.as_mut() { 165 if pre.should_call() { 166 pos += pre.call(&haystack[pos..], needle)?; 167 shift = 0; 168 i = self.0.critical_pos; 169 if pos + needle.len() > haystack.len() { 170 return None; 171 } 172 } 173 } 174 if !self.0.byteset.contains(haystack[pos + last_byte]) { 175 pos += needle.len(); 176 shift = 0; 177 continue; 178 } 179 while i < needle.len() && needle[i] == haystack[pos + i] { 180 i += 1; 181 } 182 if i < needle.len() { 183 pos += i - self.0.critical_pos + 1; 184 shift = 0; 185 } else { 186 let mut j = self.0.critical_pos; 187 while j > shift && needle[j] == haystack[pos + j] { 188 j -= 1; 189 } 190 if j <= shift && needle[shift] == haystack[pos + shift] { 191 return Some(pos); 192 } 193 pos += period; 194 shift = needle.len() - period; 195 } 196 } 197 None 198 } 199 200 #[inline(always)] find_large_imp( &self, mut pre: Option<&mut Pre<'_>>, haystack: &[u8], needle: &[u8], shift: usize, ) -> Option<usize>201 fn find_large_imp( 202 &self, 203 mut pre: Option<&mut Pre<'_>>, 204 haystack: &[u8], 205 needle: &[u8], 206 shift: usize, 207 ) -> Option<usize> { 208 let last_byte = needle.len() - 1; 209 let mut pos = 0; 210 'outer: while pos + needle.len() <= haystack.len() { 211 if let Some(pre) = pre.as_mut() { 212 if pre.should_call() { 213 pos += pre.call(&haystack[pos..], needle)?; 214 if pos + needle.len() > haystack.len() { 215 return None; 216 } 217 } 218 } 219 220 if !self.0.byteset.contains(haystack[pos + last_byte]) { 221 pos += needle.len(); 222 continue; 223 } 224 let mut i = self.0.critical_pos; 225 while i < needle.len() && needle[i] == haystack[pos + i] { 226 i += 1; 227 } 228 if i < needle.len() { 229 pos += i - self.0.critical_pos + 1; 230 } else { 231 for j in (0..self.0.critical_pos).rev() { 232 if needle[j] != haystack[pos + j] { 233 pos += shift; 234 continue 'outer; 235 } 236 } 237 return Some(pos); 238 } 239 } 240 None 241 } 242 } 243 244 impl Reverse { 245 /// Create a searcher that uses the Two-Way algorithm by searching in 246 /// reverse through any haystack. new(needle: &[u8]) -> Reverse247 pub(crate) fn new(needle: &[u8]) -> Reverse { 248 if needle.is_empty() { 249 return Reverse(TwoWay::empty()); 250 } 251 252 let byteset = ApproximateByteSet::new(needle); 253 let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal); 254 let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal); 255 let (period_lower_bound, critical_pos) = 256 if min_suffix.pos < max_suffix.pos { 257 (min_suffix.period, min_suffix.pos) 258 } else { 259 (max_suffix.period, max_suffix.pos) 260 }; 261 // let critical_pos = needle.len() - critical_pos; 262 let shift = Shift::reverse(needle, period_lower_bound, critical_pos); 263 Reverse(TwoWay { byteset, critical_pos, shift }) 264 } 265 266 /// Find the position of the last occurrence of this searcher's needle 267 /// in the given haystack. If one does not exist, then return None. 268 /// 269 /// This will automatically initialize prefilter state. This should only 270 /// be used for one-off searches. 271 /// 272 /// Callers must guarantee that the needle is non-empty and its length is 273 /// <= the haystack's length. 274 #[inline(always)] rfind( &self, haystack: &[u8], needle: &[u8], ) -> Option<usize>275 pub(crate) fn rfind( 276 &self, 277 haystack: &[u8], 278 needle: &[u8], 279 ) -> Option<usize> { 280 debug_assert!(!needle.is_empty(), "needle should not be empty"); 281 debug_assert!(needle.len() <= haystack.len(), "haystack too short"); 282 // For the reverse case, we don't use a prefilter. It's plausible that 283 // perhaps we should, but it's a lot of additional code to do it, and 284 // it's not clear that it's actually worth it. If you have a really 285 // compelling use case for this, please file an issue. 286 match self.0.shift { 287 Shift::Small { period } => { 288 self.rfind_small_imp(haystack, needle, period) 289 } 290 Shift::Large { shift } => { 291 self.rfind_large_imp(haystack, needle, shift) 292 } 293 } 294 } 295 296 /// Like rfind, but handles the degenerate substring test cases. This is 297 /// only useful for conveniently testing this substring implementation in 298 /// isolation. 299 #[cfg(test)] rfind_general(&self, haystack: &[u8], needle: &[u8]) -> Option<usize>300 fn rfind_general(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> { 301 if needle.is_empty() { 302 Some(haystack.len()) 303 } else if haystack.len() < needle.len() { 304 None 305 } else { 306 self.rfind(haystack, needle) 307 } 308 } 309 310 #[inline(always)] rfind_small_imp( &self, haystack: &[u8], needle: &[u8], period: usize, ) -> Option<usize>311 fn rfind_small_imp( 312 &self, 313 haystack: &[u8], 314 needle: &[u8], 315 period: usize, 316 ) -> Option<usize> { 317 let nlen = needle.len(); 318 let mut pos = haystack.len(); 319 let mut shift = nlen; 320 while pos >= nlen { 321 if !self.0.byteset.contains(haystack[pos - nlen]) { 322 pos -= nlen; 323 shift = nlen; 324 continue; 325 } 326 let mut i = cmp::min(self.0.critical_pos, shift); 327 while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { 328 i -= 1; 329 } 330 if i > 0 || needle[0] != haystack[pos - nlen] { 331 pos -= self.0.critical_pos - i + 1; 332 shift = nlen; 333 } else { 334 let mut j = self.0.critical_pos; 335 while j < shift && needle[j] == haystack[pos - nlen + j] { 336 j += 1; 337 } 338 if j >= shift { 339 return Some(pos - nlen); 340 } 341 pos -= period; 342 shift = period; 343 } 344 } 345 None 346 } 347 348 #[inline(always)] rfind_large_imp( &self, haystack: &[u8], needle: &[u8], shift: usize, ) -> Option<usize>349 fn rfind_large_imp( 350 &self, 351 haystack: &[u8], 352 needle: &[u8], 353 shift: usize, 354 ) -> Option<usize> { 355 let nlen = needle.len(); 356 let mut pos = haystack.len(); 357 while pos >= nlen { 358 if !self.0.byteset.contains(haystack[pos - nlen]) { 359 pos -= nlen; 360 continue; 361 } 362 let mut i = self.0.critical_pos; 363 while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { 364 i -= 1; 365 } 366 if i > 0 || needle[0] != haystack[pos - nlen] { 367 pos -= self.0.critical_pos - i + 1; 368 } else { 369 let mut j = self.0.critical_pos; 370 while j < nlen && needle[j] == haystack[pos - nlen + j] { 371 j += 1; 372 } 373 if j == nlen { 374 return Some(pos - nlen); 375 } 376 pos -= shift; 377 } 378 } 379 None 380 } 381 } 382 383 impl TwoWay { empty() -> TwoWay384 fn empty() -> TwoWay { 385 TwoWay { 386 byteset: ApproximateByteSet::new(b""), 387 critical_pos: 0, 388 shift: Shift::Large { shift: 0 }, 389 } 390 } 391 } 392 393 /// A representation of the amount we're allowed to shift by during Two-Way 394 /// search. 395 /// 396 /// When computing a critical factorization of the needle, we find the position 397 /// of the critical factorization by finding the needle's maximal (or minimal) 398 /// suffix, along with the period of that suffix. It turns out that the period 399 /// of that suffix is a lower bound on the period of the needle itself. 400 /// 401 /// This lower bound is equivalent to the actual period of the needle in 402 /// some cases. To describe that case, we denote the needle as `x` where 403 /// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower 404 /// bound given here is always the period of `v`, which is `<= period(x)`. The 405 /// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and 406 /// where `u` is a suffix of `v[0..period(v)]`. 407 /// 408 /// This case is important because the search algorithm for when the 409 /// periods are equivalent is slightly different than the search algorithm 410 /// for when the periods are not equivalent. In particular, when they aren't 411 /// equivalent, we know that the period of the needle is no less than half its 412 /// length. In this case, we shift by an amount less than or equal to the 413 /// period of the needle (determined by the maximum length of the components 414 /// of the critical factorization of `x`, i.e., `max(len(u), len(v))`).. 415 /// 416 /// The above two cases are represented by the variants below. Each entails 417 /// a different instantiation of the Two-Way search algorithm. 418 /// 419 /// N.B. If we could find a way to compute the exact period in all cases, 420 /// then we could collapse this case analysis and simplify the algorithm. The 421 /// Two-Way paper suggests this is possible, but more reading is required to 422 /// grok why the authors didn't pursue that path. 423 #[derive(Clone, Copy, Debug)] 424 enum Shift { 425 Small { period: usize }, 426 Large { shift: usize }, 427 } 428 429 impl Shift { 430 /// Compute the shift for a given needle in the forward direction. 431 /// 432 /// This requires a lower bound on the period and a critical position. 433 /// These can be computed by extracting both the minimal and maximal 434 /// lexicographic suffixes, and choosing the right-most starting position. 435 /// The lower bound on the period is then the period of the chosen suffix. forward( needle: &[u8], period_lower_bound: usize, critical_pos: usize, ) -> Shift436 fn forward( 437 needle: &[u8], 438 period_lower_bound: usize, 439 critical_pos: usize, 440 ) -> Shift { 441 let large = cmp::max(critical_pos, needle.len() - critical_pos); 442 if critical_pos * 2 >= needle.len() { 443 return Shift::Large { shift: large }; 444 } 445 446 let (u, v) = needle.split_at(critical_pos); 447 if !util::is_suffix(&v[..period_lower_bound], u) { 448 return Shift::Large { shift: large }; 449 } 450 Shift::Small { period: period_lower_bound } 451 } 452 453 /// Compute the shift for a given needle in the reverse direction. 454 /// 455 /// This requires a lower bound on the period and a critical position. 456 /// These can be computed by extracting both the minimal and maximal 457 /// lexicographic suffixes, and choosing the left-most starting position. 458 /// The lower bound on the period is then the period of the chosen suffix. reverse( needle: &[u8], period_lower_bound: usize, critical_pos: usize, ) -> Shift459 fn reverse( 460 needle: &[u8], 461 period_lower_bound: usize, 462 critical_pos: usize, 463 ) -> Shift { 464 let large = cmp::max(critical_pos, needle.len() - critical_pos); 465 if (needle.len() - critical_pos) * 2 >= needle.len() { 466 return Shift::Large { shift: large }; 467 } 468 469 let (v, u) = needle.split_at(critical_pos); 470 if !util::is_prefix(&v[v.len() - period_lower_bound..], u) { 471 return Shift::Large { shift: large }; 472 } 473 Shift::Small { period: period_lower_bound } 474 } 475 } 476 477 /// A suffix extracted from a needle along with its period. 478 #[derive(Debug)] 479 struct Suffix { 480 /// The starting position of this suffix. 481 /// 482 /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this 483 /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for 484 /// forward suffixes, this is an inclusive starting position, where as for 485 /// reverse suffixes, this is an exclusive ending position. 486 pos: usize, 487 /// The period of this suffix. 488 /// 489 /// Note that this is NOT necessarily the period of the string from which 490 /// this suffix comes from. (It is always less than or equal to the period 491 /// of the original string.) 492 period: usize, 493 } 494 495 impl Suffix { forward(needle: &[u8], kind: SuffixKind) -> Suffix496 fn forward(needle: &[u8], kind: SuffixKind) -> Suffix { 497 debug_assert!(!needle.is_empty()); 498 499 // suffix represents our maximal (or minimal) suffix, along with 500 // its period. 501 let mut suffix = Suffix { pos: 0, period: 1 }; 502 // The start of a suffix in `needle` that we are considering as a 503 // more maximal (or minimal) suffix than what's in `suffix`. 504 let mut candidate_start = 1; 505 // The current offset of our suffixes that we're comparing. 506 // 507 // When the characters at this offset are the same, then we mush on 508 // to the next position since no decision is possible. When the 509 // candidate's character is greater (or lesser) than the corresponding 510 // character than our current maximal (or minimal) suffix, then the 511 // current suffix is changed over to the candidate and we restart our 512 // search. Otherwise, the candidate suffix is no good and we restart 513 // our search on the next candidate. 514 // 515 // The three cases above correspond to the three cases in the loop 516 // below. 517 let mut offset = 0; 518 519 while candidate_start + offset < needle.len() { 520 let current = needle[suffix.pos + offset]; 521 let candidate = needle[candidate_start + offset]; 522 match kind.cmp(current, candidate) { 523 SuffixOrdering::Accept => { 524 suffix = Suffix { pos: candidate_start, period: 1 }; 525 candidate_start += 1; 526 offset = 0; 527 } 528 SuffixOrdering::Skip => { 529 candidate_start += offset + 1; 530 offset = 0; 531 suffix.period = candidate_start - suffix.pos; 532 } 533 SuffixOrdering::Push => { 534 if offset + 1 == suffix.period { 535 candidate_start += suffix.period; 536 offset = 0; 537 } else { 538 offset += 1; 539 } 540 } 541 } 542 } 543 suffix 544 } 545 reverse(needle: &[u8], kind: SuffixKind) -> Suffix546 fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix { 547 debug_assert!(!needle.is_empty()); 548 549 // See the comments in `forward` for how this works. 550 let mut suffix = Suffix { pos: needle.len(), period: 1 }; 551 if needle.len() == 1 { 552 return suffix; 553 } 554 let mut candidate_start = needle.len() - 1; 555 let mut offset = 0; 556 557 while offset < candidate_start { 558 let current = needle[suffix.pos - offset - 1]; 559 let candidate = needle[candidate_start - offset - 1]; 560 match kind.cmp(current, candidate) { 561 SuffixOrdering::Accept => { 562 suffix = Suffix { pos: candidate_start, period: 1 }; 563 candidate_start -= 1; 564 offset = 0; 565 } 566 SuffixOrdering::Skip => { 567 candidate_start -= offset + 1; 568 offset = 0; 569 suffix.period = suffix.pos - candidate_start; 570 } 571 SuffixOrdering::Push => { 572 if offset + 1 == suffix.period { 573 candidate_start -= suffix.period; 574 offset = 0; 575 } else { 576 offset += 1; 577 } 578 } 579 } 580 } 581 suffix 582 } 583 } 584 585 /// The kind of suffix to extract. 586 #[derive(Clone, Copy, Debug)] 587 enum SuffixKind { 588 /// Extract the smallest lexicographic suffix from a string. 589 /// 590 /// Technically, this doesn't actually pick the smallest lexicographic 591 /// suffix. e.g., Given the choice between `a` and `aa`, this will choose 592 /// the latter over the former, even though `a < aa`. The reasoning for 593 /// this isn't clear from the paper, but it still smells like a minimal 594 /// suffix. 595 Minimal, 596 /// Extract the largest lexicographic suffix from a string. 597 /// 598 /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given 599 /// the choice between `z` and `zz`, this will choose the latter over the 600 /// former. 601 Maximal, 602 } 603 604 /// The result of comparing corresponding bytes between two suffixes. 605 #[derive(Clone, Copy, Debug)] 606 enum SuffixOrdering { 607 /// This occurs when the given candidate byte indicates that the candidate 608 /// suffix is better than the current maximal (or minimal) suffix. That is, 609 /// the current candidate suffix should supplant the current maximal (or 610 /// minimal) suffix. 611 Accept, 612 /// This occurs when the given candidate byte excludes the candidate suffix 613 /// from being better than the current maximal (or minimal) suffix. That 614 /// is, the current candidate suffix should be dropped and the next one 615 /// should be considered. 616 Skip, 617 /// This occurs when no decision to accept or skip the candidate suffix 618 /// can be made, e.g., when corresponding bytes are equivalent. In this 619 /// case, the next corresponding bytes should be compared. 620 Push, 621 } 622 623 impl SuffixKind { 624 /// Returns true if and only if the given candidate byte indicates that 625 /// it should replace the current suffix as the maximal (or minimal) 626 /// suffix. cmp(self, current: u8, candidate: u8) -> SuffixOrdering627 fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering { 628 use self::SuffixOrdering::*; 629 630 match self { 631 SuffixKind::Minimal if candidate < current => Accept, 632 SuffixKind::Minimal if candidate > current => Skip, 633 SuffixKind::Minimal => Push, 634 SuffixKind::Maximal if candidate > current => Accept, 635 SuffixKind::Maximal if candidate < current => Skip, 636 SuffixKind::Maximal => Push, 637 } 638 } 639 } 640 641 /// A bitset used to track whether a particular byte exists in a needle or not. 642 /// 643 /// Namely, bit 'i' is set if and only if byte%64==i for any byte in the 644 /// needle. If a particular byte in the haystack is NOT in this set, then one 645 /// can conclude that it is also not in the needle, and thus, one can advance 646 /// in the haystack by needle.len() bytes. 647 #[derive(Clone, Copy, Debug)] 648 struct ApproximateByteSet(u64); 649 650 impl ApproximateByteSet { 651 /// Create a new set from the given needle. new(needle: &[u8]) -> ApproximateByteSet652 fn new(needle: &[u8]) -> ApproximateByteSet { 653 let mut bits = 0; 654 for &b in needle { 655 bits |= 1 << (b % 64); 656 } 657 ApproximateByteSet(bits) 658 } 659 660 /// Return true if and only if the given byte might be in this set. This 661 /// may return a false positive, but will never return a false negative. 662 #[inline(always)] contains(&self, byte: u8) -> bool663 fn contains(&self, byte: u8) -> bool { 664 self.0 & (1 << (byte % 64)) != 0 665 } 666 } 667 668 #[cfg(all(test, feature = "std", not(miri)))] 669 mod tests { 670 use quickcheck::quickcheck; 671 672 use super::*; 673 674 define_memmem_quickcheck_tests!( 675 super::simpletests::twoway_find, 676 super::simpletests::twoway_rfind 677 ); 678 679 /// Convenience wrapper for computing the suffix as a byte string. get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize)680 fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { 681 let s = Suffix::forward(needle, kind); 682 (&needle[s.pos..], s.period) 683 } 684 685 /// Convenience wrapper for computing the reverse suffix as a byte string. get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize)686 fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { 687 let s = Suffix::reverse(needle, kind); 688 (&needle[..s.pos], s.period) 689 } 690 691 /// Return all of the non-empty suffixes in the given byte string. suffixes(bytes: &[u8]) -> Vec<&[u8]>692 fn suffixes(bytes: &[u8]) -> Vec<&[u8]> { 693 (0..bytes.len()).map(|i| &bytes[i..]).collect() 694 } 695 696 /// Return the lexicographically maximal suffix of the given byte string. naive_maximal_suffix_forward(needle: &[u8]) -> &[u8]697 fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] { 698 let mut sufs = suffixes(needle); 699 sufs.sort(); 700 sufs.pop().unwrap() 701 } 702 703 /// Return the lexicographically maximal suffix of the reverse of the given 704 /// byte string. naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8>705 fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> { 706 let mut reversed = needle.to_vec(); 707 reversed.reverse(); 708 let mut got = naive_maximal_suffix_forward(&reversed).to_vec(); 709 got.reverse(); 710 got 711 } 712 713 #[test] suffix_forward()714 fn suffix_forward() { 715 macro_rules! assert_suffix_min { 716 ($given:expr, $expected:expr, $period:expr) => { 717 let (got_suffix, got_period) = 718 get_suffix_forward($given.as_bytes(), SuffixKind::Minimal); 719 let got_suffix = std::str::from_utf8(got_suffix).unwrap(); 720 assert_eq!(($expected, $period), (got_suffix, got_period)); 721 }; 722 } 723 724 macro_rules! assert_suffix_max { 725 ($given:expr, $expected:expr, $period:expr) => { 726 let (got_suffix, got_period) = 727 get_suffix_forward($given.as_bytes(), SuffixKind::Maximal); 728 let got_suffix = std::str::from_utf8(got_suffix).unwrap(); 729 assert_eq!(($expected, $period), (got_suffix, got_period)); 730 }; 731 } 732 733 assert_suffix_min!("a", "a", 1); 734 assert_suffix_max!("a", "a", 1); 735 736 assert_suffix_min!("ab", "ab", 2); 737 assert_suffix_max!("ab", "b", 1); 738 739 assert_suffix_min!("ba", "a", 1); 740 assert_suffix_max!("ba", "ba", 2); 741 742 assert_suffix_min!("abc", "abc", 3); 743 assert_suffix_max!("abc", "c", 1); 744 745 assert_suffix_min!("acb", "acb", 3); 746 assert_suffix_max!("acb", "cb", 2); 747 748 assert_suffix_min!("cba", "a", 1); 749 assert_suffix_max!("cba", "cba", 3); 750 751 assert_suffix_min!("abcabc", "abcabc", 3); 752 assert_suffix_max!("abcabc", "cabc", 3); 753 754 assert_suffix_min!("abcabcabc", "abcabcabc", 3); 755 assert_suffix_max!("abcabcabc", "cabcabc", 3); 756 757 assert_suffix_min!("abczz", "abczz", 5); 758 assert_suffix_max!("abczz", "zz", 1); 759 760 assert_suffix_min!("zzabc", "abc", 3); 761 assert_suffix_max!("zzabc", "zzabc", 5); 762 763 assert_suffix_min!("aaa", "aaa", 1); 764 assert_suffix_max!("aaa", "aaa", 1); 765 766 assert_suffix_min!("foobar", "ar", 2); 767 assert_suffix_max!("foobar", "r", 1); 768 } 769 770 #[test] suffix_reverse()771 fn suffix_reverse() { 772 macro_rules! assert_suffix_min { 773 ($given:expr, $expected:expr, $period:expr) => { 774 let (got_suffix, got_period) = 775 get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal); 776 let got_suffix = std::str::from_utf8(got_suffix).unwrap(); 777 assert_eq!(($expected, $period), (got_suffix, got_period)); 778 }; 779 } 780 781 macro_rules! assert_suffix_max { 782 ($given:expr, $expected:expr, $period:expr) => { 783 let (got_suffix, got_period) = 784 get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal); 785 let got_suffix = std::str::from_utf8(got_suffix).unwrap(); 786 assert_eq!(($expected, $period), (got_suffix, got_period)); 787 }; 788 } 789 790 assert_suffix_min!("a", "a", 1); 791 assert_suffix_max!("a", "a", 1); 792 793 assert_suffix_min!("ab", "a", 1); 794 assert_suffix_max!("ab", "ab", 2); 795 796 assert_suffix_min!("ba", "ba", 2); 797 assert_suffix_max!("ba", "b", 1); 798 799 assert_suffix_min!("abc", "a", 1); 800 assert_suffix_max!("abc", "abc", 3); 801 802 assert_suffix_min!("acb", "a", 1); 803 assert_suffix_max!("acb", "ac", 2); 804 805 assert_suffix_min!("cba", "cba", 3); 806 assert_suffix_max!("cba", "c", 1); 807 808 assert_suffix_min!("abcabc", "abca", 3); 809 assert_suffix_max!("abcabc", "abcabc", 3); 810 811 assert_suffix_min!("abcabcabc", "abcabca", 3); 812 assert_suffix_max!("abcabcabc", "abcabcabc", 3); 813 814 assert_suffix_min!("abczz", "a", 1); 815 assert_suffix_max!("abczz", "abczz", 5); 816 817 assert_suffix_min!("zzabc", "zza", 3); 818 assert_suffix_max!("zzabc", "zz", 1); 819 820 assert_suffix_min!("aaa", "aaa", 1); 821 assert_suffix_max!("aaa", "aaa", 1); 822 } 823 824 quickcheck! { 825 fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool { 826 if bytes.is_empty() { 827 return true; 828 } 829 830 let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal); 831 let expected = naive_maximal_suffix_forward(&bytes); 832 got == expected 833 } 834 835 fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool { 836 if bytes.is_empty() { 837 return true; 838 } 839 840 let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal); 841 let expected = naive_maximal_suffix_reverse(&bytes); 842 expected == got 843 } 844 } 845 } 846 847 #[cfg(test)] 848 mod simpletests { 849 use super::*; 850 twoway_find( haystack: &[u8], needle: &[u8], ) -> Option<usize>851 pub(crate) fn twoway_find( 852 haystack: &[u8], 853 needle: &[u8], 854 ) -> Option<usize> { 855 Forward::new(needle).find_general(None, haystack, needle) 856 } 857 twoway_rfind( haystack: &[u8], needle: &[u8], ) -> Option<usize>858 pub(crate) fn twoway_rfind( 859 haystack: &[u8], 860 needle: &[u8], 861 ) -> Option<usize> { 862 Reverse::new(needle).rfind_general(haystack, needle) 863 } 864 865 define_memmem_simple_tests!(twoway_find, twoway_rfind); 866 867 // This is a regression test caught by quickcheck that exercised a bug in 868 // the reverse small period handling. The bug was that we were using 'if j 869 // == shift' to determine if a match occurred, but the correct guard is 'if 870 // j >= shift', which matches the corresponding guard in the forward impl. 871 #[test] regression_rev_small_period()872 fn regression_rev_small_period() { 873 let rfind = super::simpletests::twoway_rfind; 874 let haystack = "ababaz"; 875 let needle = "abab"; 876 assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes())); 877 } 878 } 879