1 use core::cmp;
2 
3 use crate::memmem::{prefilter::Pre, util};
4 
5 /// Two-Way search in the forward direction.
6 #[derive(Clone, Copy, Debug)]
7 pub(crate) struct Forward(TwoWay);
8 
9 /// Two-Way search in the reverse direction.
10 #[derive(Clone, Copy, Debug)]
11 pub(crate) struct Reverse(TwoWay);
12 
13 /// An implementation of the TwoWay substring search algorithm, with heuristics
14 /// for accelerating search based on frequency analysis.
15 ///
16 /// This searcher supports forward and reverse search, although not
17 /// simultaneously. It runs in O(n + m) time and O(1) space, where
18 /// `n ~ len(needle)` and `m ~ len(haystack)`.
19 ///
20 /// The implementation here roughly matches that which was developed by
21 /// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The
22 /// changes in this implementation are 1) the use of zero-based indices, 2) a
23 /// heuristic skip table based on the last byte (borrowed from Rust's standard
24 /// library) and 3) the addition of heuristics for a fast skip loop. That is,
25 /// (3) this will detect bytes that are believed to be rare in the needle and
26 /// use fast vectorized instructions to find their occurrences quickly. The
27 /// Two-Way algorithm is then used to confirm whether a match at that location
28 /// occurred.
29 ///
30 /// The heuristic for fast skipping is automatically shut off if it's
31 /// detected to be ineffective at search time. Generally, this only occurs in
32 /// pathological cases. But this is generally necessary in order to preserve
33 /// a `O(n + m)` time bound.
34 ///
35 /// The code below is fairly complex and not obviously correct at all. It's
36 /// likely necessary to read the Two-Way paper cited above in order to fully
37 /// grok this code. The essence of it is:
38 ///
39 /// 1) Do something to detect a "critical" position in the needle.
40 /// 2) For the current position in the haystack, look if needle[critical..]
41 ///    matches at that position.
42 /// 3) If so, look if needle[..critical] matches.
43 /// 4) If a mismatch occurs, shift the search by some amount based on the
44 ///    critical position and a pre-computed shift.
45 ///
46 /// This type is wrapped in Forward and Reverse types that expose consistent
47 /// forward or reverse APIs.
48 #[derive(Clone, Copy, Debug)]
49 struct TwoWay {
50     /// A small bitset used as a quick prefilter (in addition to the faster
51     /// SIMD based prefilter). Namely, a bit 'i' is set if and only if b%64==i
52     /// for any b in the needle.
53     ///
54     /// When used as a prefilter, if the last byte at the current candidate
55     /// position is NOT in this set, then we can skip that entire candidate
56     /// position (the length of the needle). This is essentially the shift
57     /// trick found in Boyer-Moore, but only applied to bytes that don't appear
58     /// in the needle.
59     ///
60     /// N.B. This trick was inspired by something similar in std's
61     /// implementation of Two-Way.
62     byteset: ApproximateByteSet,
63     /// A critical position in needle. Specifically, this position corresponds
64     /// to beginning of either the minimal or maximal suffix in needle. (N.B.
65     /// See SuffixType below for why "minimal" isn't quite the correct word
66     /// here.)
67     ///
68     /// This is the position at which every search begins. Namely, search
69     /// starts by scanning text to the right of this position, and only if
70     /// there's a match does the text to the left of this position get scanned.
71     critical_pos: usize,
72     /// The amount we shift by in the Two-Way search algorithm. This
73     /// corresponds to the "small period" and "large period" cases.
74     shift: Shift,
75 }
76 
77 impl Forward {
78     /// Create a searcher that uses the Two-Way algorithm by searching forwards
79     /// through any haystack.
new(needle: &[u8]) -> Forward80     pub(crate) fn new(needle: &[u8]) -> Forward {
81         if needle.is_empty() {
82             return Forward(TwoWay::empty());
83         }
84 
85         let byteset = ApproximateByteSet::new(needle);
86         let min_suffix = Suffix::forward(needle, SuffixKind::Minimal);
87         let max_suffix = Suffix::forward(needle, SuffixKind::Maximal);
88         let (period_lower_bound, critical_pos) =
89             if min_suffix.pos > max_suffix.pos {
90                 (min_suffix.period, min_suffix.pos)
91             } else {
92                 (max_suffix.period, max_suffix.pos)
93             };
94         let shift = Shift::forward(needle, period_lower_bound, critical_pos);
95         Forward(TwoWay { byteset, critical_pos, shift })
96     }
97 
98     /// Find the position of the first occurrence of this searcher's needle in
99     /// the given haystack. If one does not exist, then return None.
100     ///
101     /// This accepts prefilter state that is useful when using the same
102     /// searcher multiple times, such as in an iterator.
103     ///
104     /// Callers must guarantee that the needle is non-empty and its length is
105     /// <= the haystack's length.
106     #[inline(always)]
find( &self, pre: Option<&mut Pre<'_>>, haystack: &[u8], needle: &[u8], ) -> Option<usize>107     pub(crate) fn find(
108         &self,
109         pre: Option<&mut Pre<'_>>,
110         haystack: &[u8],
111         needle: &[u8],
112     ) -> Option<usize> {
113         debug_assert!(!needle.is_empty(), "needle should not be empty");
114         debug_assert!(needle.len() <= haystack.len(), "haystack too short");
115 
116         match self.0.shift {
117             Shift::Small { period } => {
118                 self.find_small_imp(pre, haystack, needle, period)
119             }
120             Shift::Large { shift } => {
121                 self.find_large_imp(pre, haystack, needle, shift)
122             }
123         }
124     }
125 
126     /// Like find, but handles the degenerate substring test cases. This is
127     /// only useful for conveniently testing this substring implementation in
128     /// isolation.
129     #[cfg(test)]
find_general( &self, pre: Option<&mut Pre<'_>>, haystack: &[u8], needle: &[u8], ) -> Option<usize>130     fn find_general(
131         &self,
132         pre: Option<&mut Pre<'_>>,
133         haystack: &[u8],
134         needle: &[u8],
135     ) -> Option<usize> {
136         if needle.is_empty() {
137             Some(0)
138         } else if haystack.len() < needle.len() {
139             None
140         } else {
141             self.find(pre, haystack, needle)
142         }
143     }
144 
145     // Each of the two search implementations below can be accelerated by a
146     // prefilter, but it is not always enabled. To avoid its overhead when
147     // its disabled, we explicitly inline each search implementation based on
148     // whether a prefilter will be used or not. The decision on which to use
149     // is made in the parent meta searcher.
150 
151     #[inline(always)]
find_small_imp( &self, mut pre: Option<&mut Pre<'_>>, haystack: &[u8], needle: &[u8], period: usize, ) -> Option<usize>152     fn find_small_imp(
153         &self,
154         mut pre: Option<&mut Pre<'_>>,
155         haystack: &[u8],
156         needle: &[u8],
157         period: usize,
158     ) -> Option<usize> {
159         let last_byte = needle.len() - 1;
160         let mut pos = 0;
161         let mut shift = 0;
162         while pos + needle.len() <= haystack.len() {
163             let mut i = cmp::max(self.0.critical_pos, shift);
164             if let Some(pre) = pre.as_mut() {
165                 if pre.should_call() {
166                     pos += pre.call(&haystack[pos..], needle)?;
167                     shift = 0;
168                     i = self.0.critical_pos;
169                     if pos + needle.len() > haystack.len() {
170                         return None;
171                     }
172                 }
173             }
174             if !self.0.byteset.contains(haystack[pos + last_byte]) {
175                 pos += needle.len();
176                 shift = 0;
177                 continue;
178             }
179             while i < needle.len() && needle[i] == haystack[pos + i] {
180                 i += 1;
181             }
182             if i < needle.len() {
183                 pos += i - self.0.critical_pos + 1;
184                 shift = 0;
185             } else {
186                 let mut j = self.0.critical_pos;
187                 while j > shift && needle[j] == haystack[pos + j] {
188                     j -= 1;
189                 }
190                 if j <= shift && needle[shift] == haystack[pos + shift] {
191                     return Some(pos);
192                 }
193                 pos += period;
194                 shift = needle.len() - period;
195             }
196         }
197         None
198     }
199 
200     #[inline(always)]
find_large_imp( &self, mut pre: Option<&mut Pre<'_>>, haystack: &[u8], needle: &[u8], shift: usize, ) -> Option<usize>201     fn find_large_imp(
202         &self,
203         mut pre: Option<&mut Pre<'_>>,
204         haystack: &[u8],
205         needle: &[u8],
206         shift: usize,
207     ) -> Option<usize> {
208         let last_byte = needle.len() - 1;
209         let mut pos = 0;
210         'outer: while pos + needle.len() <= haystack.len() {
211             if let Some(pre) = pre.as_mut() {
212                 if pre.should_call() {
213                     pos += pre.call(&haystack[pos..], needle)?;
214                     if pos + needle.len() > haystack.len() {
215                         return None;
216                     }
217                 }
218             }
219 
220             if !self.0.byteset.contains(haystack[pos + last_byte]) {
221                 pos += needle.len();
222                 continue;
223             }
224             let mut i = self.0.critical_pos;
225             while i < needle.len() && needle[i] == haystack[pos + i] {
226                 i += 1;
227             }
228             if i < needle.len() {
229                 pos += i - self.0.critical_pos + 1;
230             } else {
231                 for j in (0..self.0.critical_pos).rev() {
232                     if needle[j] != haystack[pos + j] {
233                         pos += shift;
234                         continue 'outer;
235                     }
236                 }
237                 return Some(pos);
238             }
239         }
240         None
241     }
242 }
243 
244 impl Reverse {
245     /// Create a searcher that uses the Two-Way algorithm by searching in
246     /// reverse through any haystack.
new(needle: &[u8]) -> Reverse247     pub(crate) fn new(needle: &[u8]) -> Reverse {
248         if needle.is_empty() {
249             return Reverse(TwoWay::empty());
250         }
251 
252         let byteset = ApproximateByteSet::new(needle);
253         let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal);
254         let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal);
255         let (period_lower_bound, critical_pos) =
256             if min_suffix.pos < max_suffix.pos {
257                 (min_suffix.period, min_suffix.pos)
258             } else {
259                 (max_suffix.period, max_suffix.pos)
260             };
261         // let critical_pos = needle.len() - critical_pos;
262         let shift = Shift::reverse(needle, period_lower_bound, critical_pos);
263         Reverse(TwoWay { byteset, critical_pos, shift })
264     }
265 
266     /// Find the position of the last occurrence of this searcher's needle
267     /// in the given haystack. If one does not exist, then return None.
268     ///
269     /// This will automatically initialize prefilter state. This should only
270     /// be used for one-off searches.
271     ///
272     /// Callers must guarantee that the needle is non-empty and its length is
273     /// <= the haystack's length.
274     #[inline(always)]
rfind( &self, haystack: &[u8], needle: &[u8], ) -> Option<usize>275     pub(crate) fn rfind(
276         &self,
277         haystack: &[u8],
278         needle: &[u8],
279     ) -> Option<usize> {
280         debug_assert!(!needle.is_empty(), "needle should not be empty");
281         debug_assert!(needle.len() <= haystack.len(), "haystack too short");
282         // For the reverse case, we don't use a prefilter. It's plausible that
283         // perhaps we should, but it's a lot of additional code to do it, and
284         // it's not clear that it's actually worth it. If you have a really
285         // compelling use case for this, please file an issue.
286         match self.0.shift {
287             Shift::Small { period } => {
288                 self.rfind_small_imp(haystack, needle, period)
289             }
290             Shift::Large { shift } => {
291                 self.rfind_large_imp(haystack, needle, shift)
292             }
293         }
294     }
295 
296     /// Like rfind, but handles the degenerate substring test cases. This is
297     /// only useful for conveniently testing this substring implementation in
298     /// isolation.
299     #[cfg(test)]
rfind_general(&self, haystack: &[u8], needle: &[u8]) -> Option<usize>300     fn rfind_general(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
301         if needle.is_empty() {
302             Some(haystack.len())
303         } else if haystack.len() < needle.len() {
304             None
305         } else {
306             self.rfind(haystack, needle)
307         }
308     }
309 
310     #[inline(always)]
rfind_small_imp( &self, haystack: &[u8], needle: &[u8], period: usize, ) -> Option<usize>311     fn rfind_small_imp(
312         &self,
313         haystack: &[u8],
314         needle: &[u8],
315         period: usize,
316     ) -> Option<usize> {
317         let nlen = needle.len();
318         let mut pos = haystack.len();
319         let mut shift = nlen;
320         while pos >= nlen {
321             if !self.0.byteset.contains(haystack[pos - nlen]) {
322                 pos -= nlen;
323                 shift = nlen;
324                 continue;
325             }
326             let mut i = cmp::min(self.0.critical_pos, shift);
327             while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
328                 i -= 1;
329             }
330             if i > 0 || needle[0] != haystack[pos - nlen] {
331                 pos -= self.0.critical_pos - i + 1;
332                 shift = nlen;
333             } else {
334                 let mut j = self.0.critical_pos;
335                 while j < shift && needle[j] == haystack[pos - nlen + j] {
336                     j += 1;
337                 }
338                 if j >= shift {
339                     return Some(pos - nlen);
340                 }
341                 pos -= period;
342                 shift = period;
343             }
344         }
345         None
346     }
347 
348     #[inline(always)]
rfind_large_imp( &self, haystack: &[u8], needle: &[u8], shift: usize, ) -> Option<usize>349     fn rfind_large_imp(
350         &self,
351         haystack: &[u8],
352         needle: &[u8],
353         shift: usize,
354     ) -> Option<usize> {
355         let nlen = needle.len();
356         let mut pos = haystack.len();
357         while pos >= nlen {
358             if !self.0.byteset.contains(haystack[pos - nlen]) {
359                 pos -= nlen;
360                 continue;
361             }
362             let mut i = self.0.critical_pos;
363             while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
364                 i -= 1;
365             }
366             if i > 0 || needle[0] != haystack[pos - nlen] {
367                 pos -= self.0.critical_pos - i + 1;
368             } else {
369                 let mut j = self.0.critical_pos;
370                 while j < nlen && needle[j] == haystack[pos - nlen + j] {
371                     j += 1;
372                 }
373                 if j == nlen {
374                     return Some(pos - nlen);
375                 }
376                 pos -= shift;
377             }
378         }
379         None
380     }
381 }
382 
383 impl TwoWay {
empty() -> TwoWay384     fn empty() -> TwoWay {
385         TwoWay {
386             byteset: ApproximateByteSet::new(b""),
387             critical_pos: 0,
388             shift: Shift::Large { shift: 0 },
389         }
390     }
391 }
392 
393 /// A representation of the amount we're allowed to shift by during Two-Way
394 /// search.
395 ///
396 /// When computing a critical factorization of the needle, we find the position
397 /// of the critical factorization by finding the needle's maximal (or minimal)
398 /// suffix, along with the period of that suffix. It turns out that the period
399 /// of that suffix is a lower bound on the period of the needle itself.
400 ///
401 /// This lower bound is equivalent to the actual period of the needle in
402 /// some cases. To describe that case, we denote the needle as `x` where
403 /// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower
404 /// bound given here is always the period of `v`, which is `<= period(x)`. The
405 /// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and
406 /// where `u` is a suffix of `v[0..period(v)]`.
407 ///
408 /// This case is important because the search algorithm for when the
409 /// periods are equivalent is slightly different than the search algorithm
410 /// for when the periods are not equivalent. In particular, when they aren't
411 /// equivalent, we know that the period of the needle is no less than half its
412 /// length. In this case, we shift by an amount less than or equal to the
413 /// period of the needle (determined by the maximum length of the components
414 /// of the critical factorization of `x`, i.e., `max(len(u), len(v))`)..
415 ///
416 /// The above two cases are represented by the variants below. Each entails
417 /// a different instantiation of the Two-Way search algorithm.
418 ///
419 /// N.B. If we could find a way to compute the exact period in all cases,
420 /// then we could collapse this case analysis and simplify the algorithm. The
421 /// Two-Way paper suggests this is possible, but more reading is required to
422 /// grok why the authors didn't pursue that path.
423 #[derive(Clone, Copy, Debug)]
424 enum Shift {
425     Small { period: usize },
426     Large { shift: usize },
427 }
428 
429 impl Shift {
430     /// Compute the shift for a given needle in the forward direction.
431     ///
432     /// This requires a lower bound on the period and a critical position.
433     /// These can be computed by extracting both the minimal and maximal
434     /// lexicographic suffixes, and choosing the right-most starting position.
435     /// The lower bound on the period is then the period of the chosen suffix.
forward( needle: &[u8], period_lower_bound: usize, critical_pos: usize, ) -> Shift436     fn forward(
437         needle: &[u8],
438         period_lower_bound: usize,
439         critical_pos: usize,
440     ) -> Shift {
441         let large = cmp::max(critical_pos, needle.len() - critical_pos);
442         if critical_pos * 2 >= needle.len() {
443             return Shift::Large { shift: large };
444         }
445 
446         let (u, v) = needle.split_at(critical_pos);
447         if !util::is_suffix(&v[..period_lower_bound], u) {
448             return Shift::Large { shift: large };
449         }
450         Shift::Small { period: period_lower_bound }
451     }
452 
453     /// Compute the shift for a given needle in the reverse direction.
454     ///
455     /// This requires a lower bound on the period and a critical position.
456     /// These can be computed by extracting both the minimal and maximal
457     /// lexicographic suffixes, and choosing the left-most starting position.
458     /// The lower bound on the period is then the period of the chosen suffix.
reverse( needle: &[u8], period_lower_bound: usize, critical_pos: usize, ) -> Shift459     fn reverse(
460         needle: &[u8],
461         period_lower_bound: usize,
462         critical_pos: usize,
463     ) -> Shift {
464         let large = cmp::max(critical_pos, needle.len() - critical_pos);
465         if (needle.len() - critical_pos) * 2 >= needle.len() {
466             return Shift::Large { shift: large };
467         }
468 
469         let (v, u) = needle.split_at(critical_pos);
470         if !util::is_prefix(&v[v.len() - period_lower_bound..], u) {
471             return Shift::Large { shift: large };
472         }
473         Shift::Small { period: period_lower_bound }
474     }
475 }
476 
477 /// A suffix extracted from a needle along with its period.
478 #[derive(Debug)]
479 struct Suffix {
480     /// The starting position of this suffix.
481     ///
482     /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this
483     /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for
484     /// forward suffixes, this is an inclusive starting position, where as for
485     /// reverse suffixes, this is an exclusive ending position.
486     pos: usize,
487     /// The period of this suffix.
488     ///
489     /// Note that this is NOT necessarily the period of the string from which
490     /// this suffix comes from. (It is always less than or equal to the period
491     /// of the original string.)
492     period: usize,
493 }
494 
495 impl Suffix {
forward(needle: &[u8], kind: SuffixKind) -> Suffix496     fn forward(needle: &[u8], kind: SuffixKind) -> Suffix {
497         debug_assert!(!needle.is_empty());
498 
499         // suffix represents our maximal (or minimal) suffix, along with
500         // its period.
501         let mut suffix = Suffix { pos: 0, period: 1 };
502         // The start of a suffix in `needle` that we are considering as a
503         // more maximal (or minimal) suffix than what's in `suffix`.
504         let mut candidate_start = 1;
505         // The current offset of our suffixes that we're comparing.
506         //
507         // When the characters at this offset are the same, then we mush on
508         // to the next position since no decision is possible. When the
509         // candidate's character is greater (or lesser) than the corresponding
510         // character than our current maximal (or minimal) suffix, then the
511         // current suffix is changed over to the candidate and we restart our
512         // search. Otherwise, the candidate suffix is no good and we restart
513         // our search on the next candidate.
514         //
515         // The three cases above correspond to the three cases in the loop
516         // below.
517         let mut offset = 0;
518 
519         while candidate_start + offset < needle.len() {
520             let current = needle[suffix.pos + offset];
521             let candidate = needle[candidate_start + offset];
522             match kind.cmp(current, candidate) {
523                 SuffixOrdering::Accept => {
524                     suffix = Suffix { pos: candidate_start, period: 1 };
525                     candidate_start += 1;
526                     offset = 0;
527                 }
528                 SuffixOrdering::Skip => {
529                     candidate_start += offset + 1;
530                     offset = 0;
531                     suffix.period = candidate_start - suffix.pos;
532                 }
533                 SuffixOrdering::Push => {
534                     if offset + 1 == suffix.period {
535                         candidate_start += suffix.period;
536                         offset = 0;
537                     } else {
538                         offset += 1;
539                     }
540                 }
541             }
542         }
543         suffix
544     }
545 
reverse(needle: &[u8], kind: SuffixKind) -> Suffix546     fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix {
547         debug_assert!(!needle.is_empty());
548 
549         // See the comments in `forward` for how this works.
550         let mut suffix = Suffix { pos: needle.len(), period: 1 };
551         if needle.len() == 1 {
552             return suffix;
553         }
554         let mut candidate_start = needle.len() - 1;
555         let mut offset = 0;
556 
557         while offset < candidate_start {
558             let current = needle[suffix.pos - offset - 1];
559             let candidate = needle[candidate_start - offset - 1];
560             match kind.cmp(current, candidate) {
561                 SuffixOrdering::Accept => {
562                     suffix = Suffix { pos: candidate_start, period: 1 };
563                     candidate_start -= 1;
564                     offset = 0;
565                 }
566                 SuffixOrdering::Skip => {
567                     candidate_start -= offset + 1;
568                     offset = 0;
569                     suffix.period = suffix.pos - candidate_start;
570                 }
571                 SuffixOrdering::Push => {
572                     if offset + 1 == suffix.period {
573                         candidate_start -= suffix.period;
574                         offset = 0;
575                     } else {
576                         offset += 1;
577                     }
578                 }
579             }
580         }
581         suffix
582     }
583 }
584 
585 /// The kind of suffix to extract.
586 #[derive(Clone, Copy, Debug)]
587 enum SuffixKind {
588     /// Extract the smallest lexicographic suffix from a string.
589     ///
590     /// Technically, this doesn't actually pick the smallest lexicographic
591     /// suffix. e.g., Given the choice between `a` and `aa`, this will choose
592     /// the latter over the former, even though `a < aa`. The reasoning for
593     /// this isn't clear from the paper, but it still smells like a minimal
594     /// suffix.
595     Minimal,
596     /// Extract the largest lexicographic suffix from a string.
597     ///
598     /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given
599     /// the choice between `z` and `zz`, this will choose the latter over the
600     /// former.
601     Maximal,
602 }
603 
604 /// The result of comparing corresponding bytes between two suffixes.
605 #[derive(Clone, Copy, Debug)]
606 enum SuffixOrdering {
607     /// This occurs when the given candidate byte indicates that the candidate
608     /// suffix is better than the current maximal (or minimal) suffix. That is,
609     /// the current candidate suffix should supplant the current maximal (or
610     /// minimal) suffix.
611     Accept,
612     /// This occurs when the given candidate byte excludes the candidate suffix
613     /// from being better than the current maximal (or minimal) suffix. That
614     /// is, the current candidate suffix should be dropped and the next one
615     /// should be considered.
616     Skip,
617     /// This occurs when no decision to accept or skip the candidate suffix
618     /// can be made, e.g., when corresponding bytes are equivalent. In this
619     /// case, the next corresponding bytes should be compared.
620     Push,
621 }
622 
623 impl SuffixKind {
624     /// Returns true if and only if the given candidate byte indicates that
625     /// it should replace the current suffix as the maximal (or minimal)
626     /// suffix.
cmp(self, current: u8, candidate: u8) -> SuffixOrdering627     fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering {
628         use self::SuffixOrdering::*;
629 
630         match self {
631             SuffixKind::Minimal if candidate < current => Accept,
632             SuffixKind::Minimal if candidate > current => Skip,
633             SuffixKind::Minimal => Push,
634             SuffixKind::Maximal if candidate > current => Accept,
635             SuffixKind::Maximal if candidate < current => Skip,
636             SuffixKind::Maximal => Push,
637         }
638     }
639 }
640 
641 /// A bitset used to track whether a particular byte exists in a needle or not.
642 ///
643 /// Namely, bit 'i' is set if and only if byte%64==i for any byte in the
644 /// needle. If a particular byte in the haystack is NOT in this set, then one
645 /// can conclude that it is also not in the needle, and thus, one can advance
646 /// in the haystack by needle.len() bytes.
647 #[derive(Clone, Copy, Debug)]
648 struct ApproximateByteSet(u64);
649 
650 impl ApproximateByteSet {
651     /// Create a new set from the given needle.
new(needle: &[u8]) -> ApproximateByteSet652     fn new(needle: &[u8]) -> ApproximateByteSet {
653         let mut bits = 0;
654         for &b in needle {
655             bits |= 1 << (b % 64);
656         }
657         ApproximateByteSet(bits)
658     }
659 
660     /// Return true if and only if the given byte might be in this set. This
661     /// may return a false positive, but will never return a false negative.
662     #[inline(always)]
contains(&self, byte: u8) -> bool663     fn contains(&self, byte: u8) -> bool {
664         self.0 & (1 << (byte % 64)) != 0
665     }
666 }
667 
668 #[cfg(all(test, feature = "std", not(miri)))]
669 mod tests {
670     use quickcheck::quickcheck;
671 
672     use super::*;
673 
674     define_memmem_quickcheck_tests!(
675         super::simpletests::twoway_find,
676         super::simpletests::twoway_rfind
677     );
678 
679     /// Convenience wrapper for computing the suffix as a byte string.
get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize)680     fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
681         let s = Suffix::forward(needle, kind);
682         (&needle[s.pos..], s.period)
683     }
684 
685     /// Convenience wrapper for computing the reverse suffix as a byte string.
get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize)686     fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
687         let s = Suffix::reverse(needle, kind);
688         (&needle[..s.pos], s.period)
689     }
690 
691     /// Return all of the non-empty suffixes in the given byte string.
suffixes(bytes: &[u8]) -> Vec<&[u8]>692     fn suffixes(bytes: &[u8]) -> Vec<&[u8]> {
693         (0..bytes.len()).map(|i| &bytes[i..]).collect()
694     }
695 
696     /// Return the lexicographically maximal suffix of the given byte string.
naive_maximal_suffix_forward(needle: &[u8]) -> &[u8]697     fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] {
698         let mut sufs = suffixes(needle);
699         sufs.sort();
700         sufs.pop().unwrap()
701     }
702 
703     /// Return the lexicographically maximal suffix of the reverse of the given
704     /// byte string.
naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8>705     fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> {
706         let mut reversed = needle.to_vec();
707         reversed.reverse();
708         let mut got = naive_maximal_suffix_forward(&reversed).to_vec();
709         got.reverse();
710         got
711     }
712 
713     #[test]
suffix_forward()714     fn suffix_forward() {
715         macro_rules! assert_suffix_min {
716             ($given:expr, $expected:expr, $period:expr) => {
717                 let (got_suffix, got_period) =
718                     get_suffix_forward($given.as_bytes(), SuffixKind::Minimal);
719                 let got_suffix = std::str::from_utf8(got_suffix).unwrap();
720                 assert_eq!(($expected, $period), (got_suffix, got_period));
721             };
722         }
723 
724         macro_rules! assert_suffix_max {
725             ($given:expr, $expected:expr, $period:expr) => {
726                 let (got_suffix, got_period) =
727                     get_suffix_forward($given.as_bytes(), SuffixKind::Maximal);
728                 let got_suffix = std::str::from_utf8(got_suffix).unwrap();
729                 assert_eq!(($expected, $period), (got_suffix, got_period));
730             };
731         }
732 
733         assert_suffix_min!("a", "a", 1);
734         assert_suffix_max!("a", "a", 1);
735 
736         assert_suffix_min!("ab", "ab", 2);
737         assert_suffix_max!("ab", "b", 1);
738 
739         assert_suffix_min!("ba", "a", 1);
740         assert_suffix_max!("ba", "ba", 2);
741 
742         assert_suffix_min!("abc", "abc", 3);
743         assert_suffix_max!("abc", "c", 1);
744 
745         assert_suffix_min!("acb", "acb", 3);
746         assert_suffix_max!("acb", "cb", 2);
747 
748         assert_suffix_min!("cba", "a", 1);
749         assert_suffix_max!("cba", "cba", 3);
750 
751         assert_suffix_min!("abcabc", "abcabc", 3);
752         assert_suffix_max!("abcabc", "cabc", 3);
753 
754         assert_suffix_min!("abcabcabc", "abcabcabc", 3);
755         assert_suffix_max!("abcabcabc", "cabcabc", 3);
756 
757         assert_suffix_min!("abczz", "abczz", 5);
758         assert_suffix_max!("abczz", "zz", 1);
759 
760         assert_suffix_min!("zzabc", "abc", 3);
761         assert_suffix_max!("zzabc", "zzabc", 5);
762 
763         assert_suffix_min!("aaa", "aaa", 1);
764         assert_suffix_max!("aaa", "aaa", 1);
765 
766         assert_suffix_min!("foobar", "ar", 2);
767         assert_suffix_max!("foobar", "r", 1);
768     }
769 
770     #[test]
suffix_reverse()771     fn suffix_reverse() {
772         macro_rules! assert_suffix_min {
773             ($given:expr, $expected:expr, $period:expr) => {
774                 let (got_suffix, got_period) =
775                     get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal);
776                 let got_suffix = std::str::from_utf8(got_suffix).unwrap();
777                 assert_eq!(($expected, $period), (got_suffix, got_period));
778             };
779         }
780 
781         macro_rules! assert_suffix_max {
782             ($given:expr, $expected:expr, $period:expr) => {
783                 let (got_suffix, got_period) =
784                     get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal);
785                 let got_suffix = std::str::from_utf8(got_suffix).unwrap();
786                 assert_eq!(($expected, $period), (got_suffix, got_period));
787             };
788         }
789 
790         assert_suffix_min!("a", "a", 1);
791         assert_suffix_max!("a", "a", 1);
792 
793         assert_suffix_min!("ab", "a", 1);
794         assert_suffix_max!("ab", "ab", 2);
795 
796         assert_suffix_min!("ba", "ba", 2);
797         assert_suffix_max!("ba", "b", 1);
798 
799         assert_suffix_min!("abc", "a", 1);
800         assert_suffix_max!("abc", "abc", 3);
801 
802         assert_suffix_min!("acb", "a", 1);
803         assert_suffix_max!("acb", "ac", 2);
804 
805         assert_suffix_min!("cba", "cba", 3);
806         assert_suffix_max!("cba", "c", 1);
807 
808         assert_suffix_min!("abcabc", "abca", 3);
809         assert_suffix_max!("abcabc", "abcabc", 3);
810 
811         assert_suffix_min!("abcabcabc", "abcabca", 3);
812         assert_suffix_max!("abcabcabc", "abcabcabc", 3);
813 
814         assert_suffix_min!("abczz", "a", 1);
815         assert_suffix_max!("abczz", "abczz", 5);
816 
817         assert_suffix_min!("zzabc", "zza", 3);
818         assert_suffix_max!("zzabc", "zz", 1);
819 
820         assert_suffix_min!("aaa", "aaa", 1);
821         assert_suffix_max!("aaa", "aaa", 1);
822     }
823 
824     quickcheck! {
825         fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool {
826             if bytes.is_empty() {
827                 return true;
828             }
829 
830             let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal);
831             let expected = naive_maximal_suffix_forward(&bytes);
832             got == expected
833         }
834 
835         fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool {
836             if bytes.is_empty() {
837                 return true;
838             }
839 
840             let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal);
841             let expected = naive_maximal_suffix_reverse(&bytes);
842             expected == got
843         }
844     }
845 }
846 
847 #[cfg(test)]
848 mod simpletests {
849     use super::*;
850 
twoway_find( haystack: &[u8], needle: &[u8], ) -> Option<usize>851     pub(crate) fn twoway_find(
852         haystack: &[u8],
853         needle: &[u8],
854     ) -> Option<usize> {
855         Forward::new(needle).find_general(None, haystack, needle)
856     }
857 
twoway_rfind( haystack: &[u8], needle: &[u8], ) -> Option<usize>858     pub(crate) fn twoway_rfind(
859         haystack: &[u8],
860         needle: &[u8],
861     ) -> Option<usize> {
862         Reverse::new(needle).rfind_general(haystack, needle)
863     }
864 
865     define_memmem_simple_tests!(twoway_find, twoway_rfind);
866 
867     // This is a regression test caught by quickcheck that exercised a bug in
868     // the reverse small period handling. The bug was that we were using 'if j
869     // == shift' to determine if a match occurred, but the correct guard is 'if
870     // j >= shift', which matches the corresponding guard in the forward impl.
871     #[test]
regression_rev_small_period()872     fn regression_rev_small_period() {
873         let rfind = super::simpletests::twoway_rfind;
874         let haystack = "ababaz";
875         let needle = "abab";
876         assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes()));
877     }
878 }
879