1 /// A heuristic frequency based detection of rare bytes for substring search.
2 ///
3 /// This detector attempts to pick out two bytes in a needle that are predicted
4 /// to occur least frequently. The purpose is to use these bytes to implement
5 /// fast candidate search using vectorized code.
6 ///
7 /// A set of offsets is only computed for needles of length 2 or greater.
8 /// Smaller needles should be special cased by the substring search algorithm
9 /// in use. (e.g., Use memchr for single byte needles.)
10 ///
11 /// Note that we use `u8` to represent the offsets of the rare bytes in a
12 /// needle to reduce space usage. This means that rare byte occurring after the
13 /// first 255 bytes in a needle will never be used.
14 #[derive(Clone, Copy, Debug, Default)]
15 pub(crate) struct RareNeedleBytes {
16     /// The leftmost offset of the rarest byte in the needle, according to
17     /// pre-computed frequency analysis. The "leftmost offset" means that
18     /// rare1i <= i for all i where needle[i] == needle[rare1i].
19     rare1i: u8,
20     /// The leftmost offset of the second rarest byte in the needle, according
21     /// to pre-computed frequency analysis. The "leftmost offset" means that
22     /// rare2i <= i for all i where needle[i] == needle[rare2i].
23     ///
24     /// The second rarest byte is used as a type of guard for quickly detecting
25     /// a mismatch if the first byte matches. This is a hedge against
26     /// pathological cases where the pre-computed frequency analysis may be
27     /// off. (But of course, does not prevent *all* pathological cases.)
28     ///
29     /// In general, rare1i != rare2i by construction, although there is no hard
30     /// requirement that they be different. However, since the case of a single
31     /// byte needle is handled specially by memchr itself, rare2i generally
32     /// always should be different from rare1i since it would otherwise be
33     /// ineffective as a guard.
34     rare2i: u8,
35 }
36 
37 impl RareNeedleBytes {
38     /// Create a new pair of rare needle bytes with the given offsets. This is
39     /// only used in tests for generating input data.
40     #[cfg(all(test, feature = "std"))]
new(rare1i: u8, rare2i: u8) -> RareNeedleBytes41     pub(crate) fn new(rare1i: u8, rare2i: u8) -> RareNeedleBytes {
42         RareNeedleBytes { rare1i, rare2i }
43     }
44 
45     /// Detect the leftmost offsets of the two rarest bytes in the given
46     /// needle.
forward(needle: &[u8]) -> RareNeedleBytes47     pub(crate) fn forward(needle: &[u8]) -> RareNeedleBytes {
48         if needle.len() <= 1 || needle.len() > core::u8::MAX as usize {
49             // For needles bigger than u8::MAX, our offsets aren't big enough.
50             // (We make our offsets small to reduce stack copying.)
51             // If you have a use case for it, please file an issue. In that
52             // case, we should probably just adjust the routine below to pick
53             // some rare bytes from the first 255 bytes of the needle.
54             //
55             // Also note that for needles of size 0 or 1, they are special
56             // cased in Two-Way.
57             //
58             // TODO: Benchmar this.
59             return RareNeedleBytes { rare1i: 0, rare2i: 0 };
60         }
61 
62         // Find the rarest two bytes. We make them distinct by construction.
63         let (mut rare1, mut rare1i) = (needle[0], 0);
64         let (mut rare2, mut rare2i) = (needle[1], 1);
65         if rank(rare2) < rank(rare1) {
66             core::mem::swap(&mut rare1, &mut rare2);
67             core::mem::swap(&mut rare1i, &mut rare2i);
68         }
69         for (i, &b) in needle.iter().enumerate().skip(2) {
70             if rank(b) < rank(rare1) {
71                 rare2 = rare1;
72                 rare2i = rare1i;
73                 rare1 = b;
74                 rare1i = i as u8;
75             } else if b != rare1 && rank(b) < rank(rare2) {
76                 rare2 = b;
77                 rare2i = i as u8;
78             }
79         }
80         // While not strictly required, we really don't want these to be
81         // equivalent. If they were, it would reduce the effectiveness of
82         // candidate searching using these rare bytes by increasing the rate of
83         // false positives.
84         assert_ne!(rare1i, rare2i);
85         RareNeedleBytes { rare1i, rare2i }
86     }
87 
88     /// Return the rare bytes in the given needle in the forward direction.
89     /// The needle given must be the same one given to the RareNeedleBytes
90     /// constructor.
as_rare_bytes(&self, needle: &[u8]) -> (u8, u8)91     pub(crate) fn as_rare_bytes(&self, needle: &[u8]) -> (u8, u8) {
92         (needle[self.rare1i as usize], needle[self.rare2i as usize])
93     }
94 
95     /// Return the rare offsets such that the first offset is always <= to the
96     /// second offset. This is useful when the caller doesn't care whether
97     /// rare1 is rarer than rare2, but just wants to ensure that they are
98     /// ordered with respect to one another.
99     #[cfg(memchr_runtime_simd)]
as_rare_ordered_usize(&self) -> (usize, usize)100     pub(crate) fn as_rare_ordered_usize(&self) -> (usize, usize) {
101         let (rare1i, rare2i) = self.as_rare_ordered_u8();
102         (rare1i as usize, rare2i as usize)
103     }
104 
105     /// Like as_rare_ordered_usize, but returns the offsets as their native
106     /// u8 values.
107     #[cfg(memchr_runtime_simd)]
as_rare_ordered_u8(&self) -> (u8, u8)108     pub(crate) fn as_rare_ordered_u8(&self) -> (u8, u8) {
109         if self.rare1i <= self.rare2i {
110             (self.rare1i, self.rare2i)
111         } else {
112             (self.rare2i, self.rare1i)
113         }
114     }
115 
116     /// Return the rare offsets as usize values in the order in which they were
117     /// constructed. rare1, for example, is constructed as the "rarer" byte,
118     /// and thus, callers may want to treat it differently from rare2.
as_rare_usize(&self) -> (usize, usize)119     pub(crate) fn as_rare_usize(&self) -> (usize, usize) {
120         (self.rare1i as usize, self.rare2i as usize)
121     }
122 
123     /// Return the byte frequency rank of each byte. The higher the rank, the
124     /// more frequency the byte is predicted to be. The needle given must be
125     /// the same one given to the RareNeedleBytes constructor.
as_ranks(&self, needle: &[u8]) -> (usize, usize)126     pub(crate) fn as_ranks(&self, needle: &[u8]) -> (usize, usize) {
127         let (b1, b2) = self.as_rare_bytes(needle);
128         (rank(b1), rank(b2))
129     }
130 }
131 
132 /// Return the heuristical frequency rank of the given byte. A lower rank
133 /// means the byte is believed to occur less frequently.
rank(b: u8) -> usize134 fn rank(b: u8) -> usize {
135     crate::memmem::byte_frequencies::BYTE_FREQUENCIES[b as usize] as usize
136 }
137