1 use core::cmp;
2 
3 use cow::CowBytes;
4 use ext_slice::ByteSlice;
5 use search::prefilter::{Freqy, PrefilterState};
6 
7 /// An implementation of the TwoWay substring search algorithm, with heuristics
8 /// for accelerating search based on frequency analysis.
9 ///
10 /// This searcher supports forward and reverse search, although not
11 /// simultaneously. It runs in O(n + m) time and O(1) space, where
12 /// `n ~ len(needle)` and `m ~ len(haystack)`.
13 ///
14 /// The implementation here roughly matches that which was developed by
15 /// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The
16 /// only change in this implementation is the use of zero-based indices and
17 /// the addition of heuristics for a fast skip loop. That is, this will detect
18 /// bytes that are believed to be rare in the needle and use fast vectorized
19 /// instructions to find their occurrences quickly. The Two-Way algorithm is
20 /// then used to confirm whether a match at that location occurred.
21 ///
22 /// The heuristic for fast skipping is automatically shut off if it's
23 /// detected to be ineffective at search time. Generally, this only occurs in
24 /// pathological cases. But this is generally necessary in order to preserve
25 /// a `O(n + m)` time bound.
26 ///
27 /// The code below is fairly complex and not obviously correct at all. It's
28 /// likely necessary to read the Two-Way paper cited above in order to fully
29 /// grok this code.
30 #[derive(Clone, Debug)]
31 pub struct TwoWay<'b> {
32     /// The needle that we're looking for.
33     needle: CowBytes<'b>,
34     /// An implementation of a fast skip loop based on hard-coded frequency
35     /// data. This is only used when conditions are deemed favorable.
36     freqy: Freqy,
37     /// A critical position in needle. Specifically, this position corresponds
38     /// to beginning of either the minimal or maximal suffix in needle. (N.B.
39     /// See SuffixType below for why "minimal" isn't quite the correct word
40     /// here.)
41     ///
42     /// This is the position at which every search begins. Namely, search
43     /// starts by scanning text to the right of this position, and only if
44     /// there's a match does the text to the left of this position get scanned.
45     critical_pos: usize,
46     /// The amount we shift by in the Two-Way search algorithm. This
47     /// corresponds to the "small period" and "large period" cases.
48     shift: Shift,
49 }
50 
51 impl<'b> TwoWay<'b> {
52     /// Create a searcher that uses the Two-Way algorithm by searching forwards
53     /// through any haystack.
forward(needle: &'b [u8]) -> TwoWay<'b>54     pub fn forward(needle: &'b [u8]) -> TwoWay<'b> {
55         let freqy = Freqy::forward(needle);
56         if needle.is_empty() {
57             return TwoWay {
58                 needle: CowBytes::new(needle),
59                 freqy,
60                 critical_pos: 0,
61                 shift: Shift::Large { shift: 0 },
62             };
63         }
64 
65         let min_suffix = Suffix::forward(needle, SuffixKind::Minimal);
66         let max_suffix = Suffix::forward(needle, SuffixKind::Maximal);
67         let (period_lower_bound, critical_pos) =
68             if min_suffix.pos > max_suffix.pos {
69                 (min_suffix.period, min_suffix.pos)
70             } else {
71                 (max_suffix.period, max_suffix.pos)
72             };
73         let shift = Shift::forward(needle, period_lower_bound, critical_pos);
74         let needle = CowBytes::new(needle);
75         TwoWay { needle, freqy, critical_pos, shift }
76     }
77 
78     /// Create a searcher that uses the Two-Way algorithm by searching in
79     /// reverse through any haystack.
reverse(needle: &'b [u8]) -> TwoWay<'b>80     pub fn reverse(needle: &'b [u8]) -> TwoWay<'b> {
81         let freqy = Freqy::reverse(needle);
82         if needle.is_empty() {
83             return TwoWay {
84                 needle: CowBytes::new(needle),
85                 freqy,
86                 critical_pos: 0,
87                 shift: Shift::Large { shift: 0 },
88             };
89         }
90 
91         let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal);
92         let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal);
93         let (period_lower_bound, critical_pos) =
94             if min_suffix.pos < max_suffix.pos {
95                 (min_suffix.period, min_suffix.pos)
96             } else {
97                 (max_suffix.period, max_suffix.pos)
98             };
99         let shift = Shift::reverse(needle, period_lower_bound, critical_pos);
100         let needle = CowBytes::new(needle);
101         TwoWay { needle, freqy, critical_pos, shift }
102     }
103 
104     /// Return a fresh prefilter state that can be used with this searcher.
105     /// A prefilter state is used to track the effectiveness of a searcher's
106     /// prefilter for speeding up searches. Therefore, the prefilter state
107     /// should generally be reused on subsequent searches (such as in an
108     /// iterator). For searches on a different haystack, then a new prefilter
109     /// state should be used.
110     ///
111     /// This always initializes a valid prefilter state even if this searcher
112     /// does not have a prefilter enabled.
prefilter_state(&self) -> PrefilterState113     pub fn prefilter_state(&self) -> PrefilterState {
114         self.freqy.prefilter_state()
115     }
116 
117     /// Return the needle used by this searcher.
needle(&self) -> &[u8]118     pub fn needle(&self) -> &[u8] {
119         self.needle.as_slice()
120     }
121 
122     /// Convert this searched into an owned version, where the needle is
123     /// copied if it isn't already owned.
124     #[cfg(feature = "std")]
into_owned(self) -> TwoWay<'static>125     pub fn into_owned(self) -> TwoWay<'static> {
126         TwoWay {
127             needle: self.needle.into_owned(),
128             freqy: self.freqy,
129             critical_pos: self.critical_pos,
130             shift: self.shift,
131         }
132     }
133 
134     /// Find the position of the first occurrence of this searcher's needle in
135     /// the given haystack. If one does not exist, then return None.
136     ///
137     /// This will automatically initialize prefilter state. This should only
138     /// be used for one-off searches.
find(&self, haystack: &[u8]) -> Option<usize>139     pub fn find(&self, haystack: &[u8]) -> Option<usize> {
140         self.find_with(&mut self.prefilter_state(), haystack)
141     }
142 
143     /// Find the position of the last occurrence of this searcher's needle
144     /// in the given haystack. If one does not exist, then return None.
145     ///
146     /// This will automatically initialize prefilter state. This should only
147     /// be used for one-off searches.
rfind(&self, haystack: &[u8]) -> Option<usize>148     pub fn rfind(&self, haystack: &[u8]) -> Option<usize> {
149         self.rfind_with(&mut self.prefilter_state(), haystack)
150     }
151 
152     /// Find the position of the first occurrence of this searcher's needle in
153     /// the given haystack. If one does not exist, then return None.
154     ///
155     /// This accepts prefilter state that is useful when using the same
156     /// searcher multiple times, such as in an iterator.
find_with( &self, prestate: &mut PrefilterState, haystack: &[u8], ) -> Option<usize>157     pub fn find_with(
158         &self,
159         prestate: &mut PrefilterState,
160         haystack: &[u8],
161     ) -> Option<usize> {
162         if self.needle.is_empty() {
163             return Some(0);
164         } else if haystack.len() < self.needle.len() {
165             return None;
166         } else if self.needle.len() == 1 {
167             return haystack.find_byte(self.needle[0]);
168         }
169         match self.shift {
170             Shift::Small { period } => {
171                 self.find_small(prestate, haystack, period)
172             }
173             Shift::Large { shift } => {
174                 self.find_large(prestate, haystack, shift)
175             }
176         }
177     }
178 
179     /// Find the position of the last occurrence of this searcher's needle
180     /// in the given haystack. If one does not exist, then return None.
181     ///
182     /// This accepts prefilter state that is useful when using the same
183     /// searcher multiple times, such as in an iterator.
rfind_with( &self, prestate: &mut PrefilterState, haystack: &[u8], ) -> Option<usize>184     pub fn rfind_with(
185         &self,
186         prestate: &mut PrefilterState,
187         haystack: &[u8],
188     ) -> Option<usize> {
189         if self.needle.is_empty() {
190             return Some(haystack.len());
191         } else if haystack.len() < self.needle.len() {
192             return None;
193         } else if self.needle.len() == 1 {
194             return haystack.rfind_byte(self.needle[0]);
195         }
196         match self.shift {
197             Shift::Small { period } => {
198                 self.rfind_small(prestate, haystack, period)
199             }
200             Shift::Large { shift } => {
201                 self.rfind_large(prestate, haystack, shift)
202             }
203         }
204     }
205 
206     // Below is the actual implementation of TwoWay searching, including both
207     // forwards and backwards searching. Each forward and reverse search has
208     // two fairly similar implementations, each handling the small and large
209     // period cases, for a total 4 different search routines.
210     //
211     // On top of that, each search implementation can be accelerated by a
212     // Freqy prefilter, but it is not always enabled. To avoid its overhead
213     // when its disabled, we explicitly inline each search implementation based
214     // on whether Freqy will be used or not. This brings us up to a total of
215     // 8 monomorphized versions of the search code.
216 
217     #[inline(never)]
find_small( &self, prestate: &mut PrefilterState, haystack: &[u8], period: usize, ) -> Option<usize>218     fn find_small(
219         &self,
220         prestate: &mut PrefilterState,
221         haystack: &[u8],
222         period: usize,
223     ) -> Option<usize> {
224         if prestate.is_effective() {
225             self.find_small_imp(prestate, true, haystack, period)
226         } else {
227             self.find_small_imp(prestate, false, haystack, period)
228         }
229     }
230 
231     #[inline(always)]
find_small_imp( &self, prestate: &mut PrefilterState, prefilter: bool, haystack: &[u8], period: usize, ) -> Option<usize>232     fn find_small_imp(
233         &self,
234         prestate: &mut PrefilterState,
235         prefilter: bool,
236         haystack: &[u8],
237         period: usize,
238     ) -> Option<usize> {
239         let needle = self.needle.as_slice();
240         let mut pos = 0;
241         let mut shift = 0;
242         while pos + needle.len() <= haystack.len() {
243             let mut i = cmp::max(self.critical_pos, shift);
244             if prefilter && prestate.is_effective() {
245                 match self.freqy.find_candidate(prestate, &haystack[pos..]) {
246                     None => return None,
247                     Some(found) => {
248                         shift = 0;
249                         i = self.critical_pos;
250                         pos += found;
251                         if pos + needle.len() > haystack.len() {
252                             return None;
253                         }
254                     }
255                 }
256             }
257             while i < needle.len() && needle[i] == haystack[pos + i] {
258                 i += 1;
259             }
260             if i < needle.len() {
261                 pos += i - self.critical_pos + 1;
262                 shift = 0;
263             } else {
264                 let mut j = self.critical_pos;
265                 while j > shift && needle[j] == haystack[pos + j] {
266                     j -= 1;
267                 }
268                 if j <= shift && needle[shift] == haystack[pos + shift] {
269                     return Some(pos);
270                 }
271                 pos += period;
272                 shift = needle.len() - period;
273             }
274         }
275         None
276     }
277 
278     #[inline(never)]
find_large( &self, prestate: &mut PrefilterState, haystack: &[u8], shift: usize, ) -> Option<usize>279     fn find_large(
280         &self,
281         prestate: &mut PrefilterState,
282         haystack: &[u8],
283         shift: usize,
284     ) -> Option<usize> {
285         if prestate.is_effective() {
286             self.find_large_imp(prestate, true, haystack, shift)
287         } else {
288             self.find_large_imp(prestate, false, haystack, shift)
289         }
290     }
291 
292     #[inline(always)]
find_large_imp( &self, prestate: &mut PrefilterState, prefilter: bool, haystack: &[u8], shift: usize, ) -> Option<usize>293     fn find_large_imp(
294         &self,
295         prestate: &mut PrefilterState,
296         prefilter: bool,
297         haystack: &[u8],
298         shift: usize,
299     ) -> Option<usize> {
300         let needle = self.needle.as_slice();
301         let mut pos = 0;
302         while pos + needle.len() <= haystack.len() {
303             let mut i = self.critical_pos;
304             if prefilter && prestate.is_effective() {
305                 match self.freqy.find_candidate(prestate, &haystack[pos..]) {
306                     None => return None,
307                     Some(found) => {
308                         pos += found;
309                         if pos + needle.len() > haystack.len() {
310                             return None;
311                         }
312                     }
313                 }
314             }
315             while i < needle.len() && needle[i] == haystack[pos + i] {
316                 i += 1;
317             }
318             if i < needle.len() {
319                 pos += i - self.critical_pos + 1;
320             } else {
321                 let mut j = self.critical_pos;
322                 while j > 0 && needle[j] == haystack[pos + j] {
323                     j -= 1;
324                 }
325                 if j == 0 && needle[0] == haystack[pos] {
326                     return Some(pos);
327                 }
328                 pos += shift;
329             }
330         }
331         None
332     }
333 
334     #[inline(never)]
rfind_small( &self, prestate: &mut PrefilterState, haystack: &[u8], period: usize, ) -> Option<usize>335     fn rfind_small(
336         &self,
337         prestate: &mut PrefilterState,
338         haystack: &[u8],
339         period: usize,
340     ) -> Option<usize> {
341         if prestate.is_effective() {
342             self.rfind_small_imp(prestate, true, haystack, period)
343         } else {
344             self.rfind_small_imp(prestate, false, haystack, period)
345         }
346     }
347 
348     #[inline(always)]
rfind_small_imp( &self, prestate: &mut PrefilterState, prefilter: bool, haystack: &[u8], period: usize, ) -> Option<usize>349     fn rfind_small_imp(
350         &self,
351         prestate: &mut PrefilterState,
352         prefilter: bool,
353         haystack: &[u8],
354         period: usize,
355     ) -> Option<usize> {
356         let needle = &*self.needle;
357         let nlen = needle.len();
358         let mut pos = haystack.len();
359         let mut shift = nlen;
360         while pos >= nlen {
361             let mut i = cmp::min(self.critical_pos, shift);
362             if prefilter && prestate.is_effective() {
363                 match self.freqy.rfind_candidate(prestate, &haystack[..pos]) {
364                     None => return None,
365                     Some(found) => {
366                         shift = nlen;
367                         i = self.critical_pos;
368                         pos = found;
369                         if pos < nlen {
370                             return None;
371                         }
372                     }
373                 }
374             }
375             while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
376                 i -= 1;
377             }
378             if i > 0 || needle[0] != haystack[pos - nlen] {
379                 pos -= self.critical_pos - i + 1;
380                 shift = nlen;
381             } else {
382                 let mut j = self.critical_pos;
383                 while j < shift && needle[j] == haystack[pos - nlen + j] {
384                     j += 1;
385                 }
386                 if j == shift {
387                     return Some(pos - nlen);
388                 }
389                 pos -= period;
390                 shift = period;
391             }
392         }
393         None
394     }
395 
396     #[inline(never)]
rfind_large( &self, prestate: &mut PrefilterState, haystack: &[u8], shift: usize, ) -> Option<usize>397     fn rfind_large(
398         &self,
399         prestate: &mut PrefilterState,
400         haystack: &[u8],
401         shift: usize,
402     ) -> Option<usize> {
403         if prestate.is_effective() {
404             self.rfind_large_imp(prestate, true, haystack, shift)
405         } else {
406             self.rfind_large_imp(prestate, false, haystack, shift)
407         }
408     }
409 
410     #[inline(always)]
rfind_large_imp( &self, prestate: &mut PrefilterState, prefilter: bool, haystack: &[u8], shift: usize, ) -> Option<usize>411     fn rfind_large_imp(
412         &self,
413         prestate: &mut PrefilterState,
414         prefilter: bool,
415         haystack: &[u8],
416         shift: usize,
417     ) -> Option<usize> {
418         let needle = &*self.needle;
419         let nlen = needle.len();
420         let mut pos = haystack.len();
421         while pos >= nlen {
422             if prefilter && prestate.is_effective() {
423                 match self.freqy.rfind_candidate(prestate, &haystack[..pos]) {
424                     None => return None,
425                     Some(found) => {
426                         pos = found;
427                         if pos < nlen {
428                             return None;
429                         }
430                     }
431                 }
432             }
433 
434             let mut i = self.critical_pos;
435             while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
436                 i -= 1;
437             }
438             if i > 0 || needle[0] != haystack[pos - nlen] {
439                 pos -= self.critical_pos - i + 1;
440             } else {
441                 let mut j = self.critical_pos;
442                 while j < nlen && needle[j] == haystack[pos - nlen + j] {
443                     j += 1;
444                 }
445                 if j == nlen {
446                     return Some(pos - nlen);
447                 }
448                 pos -= shift;
449             }
450         }
451         None
452     }
453 }
454 
455 /// A representation of the amount we're allowed to shift by during Two-Way
456 /// search.
457 ///
458 /// When computing a critical factorization of the needle, we find the position
459 /// of the critical factorization by finding the needle's maximal (or minimal)
460 /// suffix, along with the period of that suffix. It turns out that the period
461 /// of that suffix is a lower bound on the period of the needle itself.
462 ///
463 /// This lower bound is equivalent to the actual period of the needle in
464 /// some cases. To describe that case, we denote the needle as `x` where
465 /// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower
466 /// bound given here is always the period of `v`, which is `<= period(x)`. The
467 /// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and
468 /// where `u` is a suffix of `v[0..period(v)]`.
469 ///
470 /// This case is important because the search algorithm for when the
471 /// periods are equivalent is slightly different than the search algorithm
472 /// for when the periods are not equivalent. In particular, when they aren't
473 /// equivalent, we know that the period of the needle is no less than half its
474 /// length. In this case, we shift by an amount less than or equal to the
475 /// period of the needle (determined by the maximum length of the components
476 /// of the critical factorization of `x`, i.e., `max(len(u), len(v))`)..
477 ///
478 /// The above two cases are represented by the variants below. Each entails
479 /// a different instantiation of the Two-Way search algorithm.
480 ///
481 /// N.B. If we could find a way to compute the exact period in all cases,
482 /// then we could collapse this case analysis and simplify the algorithm. The
483 /// Two-Way paper suggests this is possible, but more reading is required to
484 /// grok why the authors didn't pursue that path.
485 #[derive(Clone, Debug)]
486 enum Shift {
487     Small { period: usize },
488     Large { shift: usize },
489 }
490 
491 impl Shift {
492     /// Compute the shift for a given needle in the forward direction.
493     ///
494     /// This requires a lower bound on the period and a critical position.
495     /// These can be computed by extracting both the minimal and maximal
496     /// lexicographic suffixes, and choosing the right-most starting position.
497     /// The lower bound on the period is then the period of the chosen suffix.
forward( needle: &[u8], period_lower_bound: usize, critical_pos: usize, ) -> Shift498     fn forward(
499         needle: &[u8],
500         period_lower_bound: usize,
501         critical_pos: usize,
502     ) -> Shift {
503         let large = cmp::max(critical_pos, needle.len() - critical_pos);
504         if critical_pos * 2 >= needle.len() {
505             return Shift::Large { shift: large };
506         }
507 
508         let (u, v) = needle.split_at(critical_pos);
509         if !v[..period_lower_bound].ends_with(u) {
510             return Shift::Large { shift: large };
511         }
512         Shift::Small { period: period_lower_bound }
513     }
514 
515     /// Compute the shift for a given needle in the reverse direction.
516     ///
517     /// This requires a lower bound on the period and a critical position.
518     /// These can be computed by extracting both the minimal and maximal
519     /// lexicographic suffixes, and choosing the left-most starting position.
520     /// The lower bound on the period is then the period of the chosen suffix.
reverse( needle: &[u8], period_lower_bound: usize, critical_pos: usize, ) -> Shift521     fn reverse(
522         needle: &[u8],
523         period_lower_bound: usize,
524         critical_pos: usize,
525     ) -> Shift {
526         let large = cmp::max(critical_pos, needle.len() - critical_pos);
527         if (needle.len() - critical_pos) * 2 >= needle.len() {
528             return Shift::Large { shift: large };
529         }
530 
531         let (v, u) = needle.split_at(critical_pos);
532         if !v[v.len() - period_lower_bound..].starts_with(u) {
533             return Shift::Large { shift: large };
534         }
535         Shift::Small { period: period_lower_bound }
536     }
537 }
538 
539 /// A suffix extracted from a needle along with its period.
540 #[derive(Debug)]
541 struct Suffix {
542     /// The starting position of this suffix.
543     ///
544     /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this
545     /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for
546     /// forward suffixes, this is an inclusive starting position, where as for
547     /// reverse suffixes, this is an exclusive ending position.
548     pos: usize,
549     /// The period of this suffix.
550     ///
551     /// Note that this is NOT necessarily the period of the string from which
552     /// this suffix comes from. (It is always less than or equal to the period
553     /// of the original string.)
554     period: usize,
555 }
556 
557 impl Suffix {
forward(needle: &[u8], kind: SuffixKind) -> Suffix558     fn forward(needle: &[u8], kind: SuffixKind) -> Suffix {
559         debug_assert!(!needle.is_empty());
560 
561         // suffix represents our maximal (or minimal) suffix, along with
562         // its period.
563         let mut suffix = Suffix { pos: 0, period: 1 };
564         // The start of a suffix in `needle` that we are considering as a
565         // more maximal (or minimal) suffix than what's in `suffix`.
566         let mut candidate_start = 1;
567         // The current offset of our suffixes that we're comparing.
568         //
569         // When the characters at this offset are the same, then we mush on
570         // to the next position since no decision is possible. When the
571         // candidate's character is greater (or lesser) than the corresponding
572         // character than our current maximal (or minimal) suffix, then the
573         // current suffix is changed over to the candidate and we restart our
574         // search. Otherwise, the candidate suffix is no good and we restart
575         // our search on the next candidate.
576         //
577         // The three cases above correspond to the three cases in the loop
578         // below.
579         let mut offset = 0;
580 
581         while candidate_start + offset < needle.len() {
582             let current = needle[suffix.pos + offset];
583             let candidate = needle[candidate_start + offset];
584             match kind.cmp(current, candidate) {
585                 SuffixOrdering::Accept => {
586                     suffix = Suffix { pos: candidate_start, period: 1 };
587                     candidate_start += 1;
588                     offset = 0;
589                 }
590                 SuffixOrdering::Skip => {
591                     candidate_start += offset + 1;
592                     offset = 0;
593                     suffix.period = candidate_start - suffix.pos;
594                 }
595                 SuffixOrdering::Push => {
596                     if offset + 1 == suffix.period {
597                         candidate_start += suffix.period;
598                         offset = 0;
599                     } else {
600                         offset += 1;
601                     }
602                 }
603             }
604         }
605         suffix
606     }
607 
reverse(needle: &[u8], kind: SuffixKind) -> Suffix608     fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix {
609         debug_assert!(!needle.is_empty());
610 
611         // See the comments in `forward` for how this works.
612         let mut suffix = Suffix { pos: needle.len(), period: 1 };
613         if needle.len() == 1 {
614             return suffix;
615         }
616         let mut candidate_start = needle.len() - 1;
617         let mut offset = 0;
618 
619         while offset < candidate_start {
620             let current = needle[suffix.pos - offset - 1];
621             let candidate = needle[candidate_start - offset - 1];
622             match kind.cmp(current, candidate) {
623                 SuffixOrdering::Accept => {
624                     suffix = Suffix { pos: candidate_start, period: 1 };
625                     candidate_start -= 1;
626                     offset = 0;
627                 }
628                 SuffixOrdering::Skip => {
629                     candidate_start -= offset + 1;
630                     offset = 0;
631                     suffix.period = suffix.pos - candidate_start;
632                 }
633                 SuffixOrdering::Push => {
634                     if offset + 1 == suffix.period {
635                         candidate_start -= suffix.period;
636                         offset = 0;
637                     } else {
638                         offset += 1;
639                     }
640                 }
641             }
642         }
643         suffix
644     }
645 }
646 
647 /// The kind of suffix to extract.
648 #[derive(Clone, Copy, Debug)]
649 enum SuffixKind {
650     /// Extract the smallest lexicographic suffix from a string.
651     ///
652     /// Technically, this doesn't actually pick the smallest lexicographic
653     /// suffix. e.g., Given the choice between `a` and `aa`, this will choose
654     /// the latter over the former, even though `a < aa`. The reasoning for
655     /// this isn't clear from the paper, but it still smells like a minimal
656     /// suffix.
657     Minimal,
658     /// Extract the largest lexicographic suffix from a string.
659     ///
660     /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given
661     /// the choice between `z` and `zz`, this will choose the latter over the
662     /// former.
663     Maximal,
664 }
665 
666 /// The result of comparing corresponding bytes between two suffixes.
667 #[derive(Clone, Copy, Debug)]
668 enum SuffixOrdering {
669     /// This occurs when the given candidate byte indicates that the candidate
670     /// suffix is better than the current maximal (or minimal) suffix. That is,
671     /// the current candidate suffix should supplant the current maximal (or
672     /// minimal) suffix.
673     Accept,
674     /// This occurs when the given candidate byte excludes the candidate suffix
675     /// from being better than the current maximal (or minimal) suffix. That
676     /// is, the current candidate suffix should be dropped and the next one
677     /// should be considered.
678     Skip,
679     /// This occurs when no decision to accept or skip the candidate suffix
680     /// can be made, e.g., when corresponding bytes are equivalent. In this
681     /// case, the next corresponding bytes should be compared.
682     Push,
683 }
684 
685 impl SuffixKind {
686     /// Returns true if and only if the given candidate byte indicates that
687     /// it should replace the current suffix as the maximal (or minimal)
688     /// suffix.
cmp(self, current: u8, candidate: u8) -> SuffixOrdering689     fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering {
690         use self::SuffixOrdering::*;
691 
692         match self {
693             SuffixKind::Minimal if candidate < current => Accept,
694             SuffixKind::Minimal if candidate > current => Skip,
695             SuffixKind::Minimal => Push,
696             SuffixKind::Maximal if candidate > current => Accept,
697             SuffixKind::Maximal if candidate < current => Skip,
698             SuffixKind::Maximal => Push,
699         }
700     }
701 }
702 
703 // N.B. There are more holistic tests in src/search/tests.rs.
704 #[cfg(test)]
705 mod tests {
706     use super::*;
707     use ext_slice::B;
708 
709     /// Convenience wrapper for computing the suffix as a byte string.
get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize)710     fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
711         let s = Suffix::forward(needle, kind);
712         (&needle[s.pos..], s.period)
713     }
714 
715     /// Convenience wrapper for computing the reverse suffix as a byte string.
get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize)716     fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
717         let s = Suffix::reverse(needle, kind);
718         (&needle[..s.pos], s.period)
719     }
720 
721     /// Return all of the non-empty suffixes in the given byte string.
suffixes(bytes: &[u8]) -> Vec<&[u8]>722     fn suffixes(bytes: &[u8]) -> Vec<&[u8]> {
723         (0..bytes.len()).map(|i| &bytes[i..]).collect()
724     }
725 
726     /// Return the lexicographically maximal suffix of the given byte string.
naive_maximal_suffix_forward(needle: &[u8]) -> &[u8]727     fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] {
728         let mut sufs = suffixes(needle);
729         sufs.sort();
730         sufs.pop().unwrap()
731     }
732 
733     /// Return the lexicographically maximal suffix of the reverse of the given
734     /// byte string.
naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8>735     fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> {
736         let mut reversed = needle.to_vec();
737         reversed.reverse();
738         let mut got = naive_maximal_suffix_forward(&reversed).to_vec();
739         got.reverse();
740         got
741     }
742 
743     #[test]
suffix_forward()744     fn suffix_forward() {
745         macro_rules! assert_suffix_min {
746             ($given:expr, $expected:expr, $period:expr) => {
747                 let (got_suffix, got_period) =
748                     get_suffix_forward($given.as_bytes(), SuffixKind::Minimal);
749                 assert_eq!((B($expected), $period), (got_suffix, got_period));
750             };
751         }
752 
753         macro_rules! assert_suffix_max {
754             ($given:expr, $expected:expr, $period:expr) => {
755                 let (got_suffix, got_period) =
756                     get_suffix_forward($given.as_bytes(), SuffixKind::Maximal);
757                 assert_eq!((B($expected), $period), (got_suffix, got_period));
758             };
759         }
760 
761         assert_suffix_min!("a", "a", 1);
762         assert_suffix_max!("a", "a", 1);
763 
764         assert_suffix_min!("ab", "ab", 2);
765         assert_suffix_max!("ab", "b", 1);
766 
767         assert_suffix_min!("ba", "a", 1);
768         assert_suffix_max!("ba", "ba", 2);
769 
770         assert_suffix_min!("abc", "abc", 3);
771         assert_suffix_max!("abc", "c", 1);
772 
773         assert_suffix_min!("acb", "acb", 3);
774         assert_suffix_max!("acb", "cb", 2);
775 
776         assert_suffix_min!("cba", "a", 1);
777         assert_suffix_max!("cba", "cba", 3);
778 
779         assert_suffix_min!("abcabc", "abcabc", 3);
780         assert_suffix_max!("abcabc", "cabc", 3);
781 
782         assert_suffix_min!("abcabcabc", "abcabcabc", 3);
783         assert_suffix_max!("abcabcabc", "cabcabc", 3);
784 
785         assert_suffix_min!("abczz", "abczz", 5);
786         assert_suffix_max!("abczz", "zz", 1);
787 
788         assert_suffix_min!("zzabc", "abc", 3);
789         assert_suffix_max!("zzabc", "zzabc", 5);
790 
791         assert_suffix_min!("aaa", "aaa", 1);
792         assert_suffix_max!("aaa", "aaa", 1);
793 
794         assert_suffix_min!("foobar", "ar", 2);
795         assert_suffix_max!("foobar", "r", 1);
796     }
797 
798     #[test]
suffix_reverse()799     fn suffix_reverse() {
800         macro_rules! assert_suffix_min {
801             ($given:expr, $expected:expr, $period:expr) => {
802                 let (got_suffix, got_period) =
803                     get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal);
804                 assert_eq!((B($expected), $period), (got_suffix, got_period));
805             };
806         }
807 
808         macro_rules! assert_suffix_max {
809             ($given:expr, $expected:expr, $period:expr) => {
810                 let (got_suffix, got_period) =
811                     get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal);
812                 assert_eq!((B($expected), $period), (got_suffix, got_period));
813             };
814         }
815 
816         assert_suffix_min!("a", "a", 1);
817         assert_suffix_max!("a", "a", 1);
818 
819         assert_suffix_min!("ab", "a", 1);
820         assert_suffix_max!("ab", "ab", 2);
821 
822         assert_suffix_min!("ba", "ba", 2);
823         assert_suffix_max!("ba", "b", 1);
824 
825         assert_suffix_min!("abc", "a", 1);
826         assert_suffix_max!("abc", "abc", 3);
827 
828         assert_suffix_min!("acb", "a", 1);
829         assert_suffix_max!("acb", "ac", 2);
830 
831         assert_suffix_min!("cba", "cba", 3);
832         assert_suffix_max!("cba", "c", 1);
833 
834         assert_suffix_min!("abcabc", "abca", 3);
835         assert_suffix_max!("abcabc", "abcabc", 3);
836 
837         assert_suffix_min!("abcabcabc", "abcabca", 3);
838         assert_suffix_max!("abcabcabc", "abcabcabc", 3);
839 
840         assert_suffix_min!("abczz", "a", 1);
841         assert_suffix_max!("abczz", "abczz", 5);
842 
843         assert_suffix_min!("zzabc", "zza", 3);
844         assert_suffix_max!("zzabc", "zz", 1);
845 
846         assert_suffix_min!("aaa", "aaa", 1);
847         assert_suffix_max!("aaa", "aaa", 1);
848     }
849 
850     quickcheck! {
851         fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool {
852             if bytes.is_empty() {
853                 return true;
854             }
855 
856             let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal);
857             let expected = naive_maximal_suffix_forward(&bytes);
858             got == expected
859         }
860 
861         fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool {
862             if bytes.is_empty() {
863                 return true;
864             }
865 
866             let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal);
867             let expected = naive_maximal_suffix_reverse(&bytes);
868             expected == got
869         }
870     }
871 }
872