1 use core::mem::size_of;
2 
3 use crate::memmem::{util::memcmp, vector::Vector, NeedleInfo};
4 
/// The minimum length of a needle required for this algorithm. The minimum
/// is 2 since a length of 1 should just use memchr and a length of 0 isn't
/// a case handled by this searcher.
///
/// This bound is enforced by `Forward::new` (which returns `None` for
/// shorter needles) and debug-asserted again in `fwd_find`.
pub(crate) const MIN_NEEDLE_LEN: usize = 2;
9 
/// The maximum length of a needle required for this algorithm.
///
/// In reality, there is no hard max here. The code below can handle any
/// length needle. (Perhaps that suggests there are missing optimizations.)
/// Instead, this is a heuristic and a bound guaranteeing our linear time
/// complexity.
///
/// It is a heuristic because when a candidate match is found, memcmp is run.
/// For very large needles with lots of false positives, memcmp can make the
/// code run quite slow.
///
/// It is a bound because the worst case behavior with memcmp is multiplicative
/// in the size of the needle and haystack, and we want to keep that additive.
/// This bound ensures we still meet that bound theoretically, since it's just
/// a constant. We aren't acting in bad faith here, memcmp on tiny needles
/// is so fast that even in pathological cases (see pathological vector
/// benchmarks), this is still just as fast or faster in practice.
///
/// This specific number was chosen by tweaking a bit and running benchmarks.
/// The rare-medium-needle, for example, gets about 5% faster by using this
/// algorithm instead of a prefilter-accelerated Two-Way. There's also a
/// theoretical desire to keep this number reasonably low, to mitigate the
/// impact of pathological cases. I did try 64, and some benchmarks got a
/// little better, and others (particularly the pathological ones), got a lot
/// worse. So... 32 it is?
///
/// Like `MIN_NEEDLE_LEN`, this bound is enforced by `Forward::new` and
/// debug-asserted in `fwd_find`.
pub(crate) const MAX_NEEDLE_LEN: usize = 32;
36 
/// The implementation of the forward vector accelerated substring search.
///
/// This is extremely similar to the prefilter vector module by the same name.
/// The key difference is that this is not a prefilter. Instead, it handles
/// confirming its own matches. The trade off is that this only works with
/// smaller needles. The speed up here is that an inlined memcmp on a tiny
/// needle is very quick, even on pathological inputs. This is much better than
/// combining a prefilter with Two-Way, where using Two-Way to confirm the
/// match has higher latency.
///
/// So why not use this for all needles? We could, and it would probably work
/// really well on most inputs. But its worst case is multiplicative and we
/// want to guarantee worst case additive time. Some of the benchmarks try to
/// justify this (see the pathological ones).
///
/// The prefilter variant of this has more comments. Also note that we only
/// implement this for forward searches for now. If you have a compelling use
/// case for accelerated reverse search, please file an issue.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Forward {
    /// Byte offset into the needle of the first rare byte. Guaranteed to be
    /// distinct from `rare2i` by construction in `Forward::new`.
    rare1i: u8,
    /// Byte offset into the needle of the second rare byte.
    ///
    /// NOTE(review): `min_haystack_len` computes its bound from `rare2i`
    /// alone, which implies this is the larger of the two offsets —
    /// presumably guaranteed by `as_rare_ordered_u8`; confirm there.
    rare2i: u8,
}
60 
61 impl Forward {
62     /// Create a new "generic simd" forward searcher. If one could not be
63     /// created from the given inputs, then None is returned.
new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward>64     pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward> {
65         let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_u8();
66         // If the needle is too short or too long, give up. Also, give up
67         // if the rare bytes detected are at the same position. (It likely
68         // suggests a degenerate case, although it should technically not be
69         // possible.)
70         if needle.len() < MIN_NEEDLE_LEN
71             || needle.len() > MAX_NEEDLE_LEN
72             || rare1i == rare2i
73         {
74             return None;
75         }
76         Some(Forward { rare1i, rare2i })
77     }
78 
79     /// Returns the minimum length of haystack that is needed for this searcher
80     /// to work for a particular vector. Passing a haystack with a length
81     /// smaller than this will cause `fwd_find` to panic.
82     #[inline(always)]
min_haystack_len<V: Vector>(&self) -> usize83     pub(crate) fn min_haystack_len<V: Vector>(&self) -> usize {
84         self.rare2i as usize + size_of::<V>()
85     }
86 }
87 
/// Searches the given haystack for the given needle. The needle given should
/// be the same as the needle that this searcher was initialized with.
///
/// # Panics
///
/// When the given haystack has a length smaller than `min_haystack_len`.
///
/// # Safety
///
/// Since this is meant to be used with vector functions, callers need to
/// specialize this inside of a function with a `target_feature` attribute.
/// Therefore, callers must ensure that whatever target feature is being used
/// supports the vector functions that this function is specialized for. (For
/// the specific vector functions used, see the Vector trait implementations.)
#[inline(always)]
pub(crate) unsafe fn fwd_find<V: Vector>(
    fwd: &Forward,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    // It would be nice if we didn't have this check here, since the meta
    // searcher should handle it for us. But without this, I don't think we
    // guarantee that end_ptr.sub(needle.len()) won't result in UB. We could
    // put it as part of the safety contract, but it makes it more complicated
    // than necessary.
    if haystack.len() < needle.len() {
        return None;
    }
    let min_haystack_len = fwd.min_haystack_len::<V>();
    assert!(haystack.len() >= min_haystack_len, "haystack too small");
    debug_assert!(needle.len() <= haystack.len());
    debug_assert!(
        needle.len() >= MIN_NEEDLE_LEN,
        "needle must be at least {} bytes",
        MIN_NEEDLE_LEN,
    );
    debug_assert!(
        needle.len() <= MAX_NEEDLE_LEN,
        "needle must be at most {} bytes",
        MAX_NEEDLE_LEN,
    );

    // Vectors with the rare bytes splatted into every 8-bit lane.
    let (rare1i, rare2i) = (fwd.rare1i as usize, fwd.rare2i as usize);
    let rare1chunk = V::splat(needle[rare1i]);
    let rare2chunk = V::splat(needle[rare2i]);

    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    // max_ptr is the last position at which the unaligned loads done by
    // fwd_find_in_chunk (the furthest of which ends at
    // ptr + rare2i + size_of::<V>(), i.e. ptr + min_haystack_len) are
    // guaranteed to remain in bounds.
    let max_ptr = end_ptr.sub(min_haystack_len);
    let mut ptr = start_ptr;

    // N.B. I did experiment with unrolling the loop to deal with size(V)
    // bytes at a time and 2*size(V) bytes at a time. The double unroll was
    // marginally faster while the quadruple unroll was unambiguously slower.
    // In the end, I decided the complexity from unrolling wasn't worth it. I
    // used the memmem/krate/prebuilt/huge-en/ benchmarks to compare.
    while ptr <= max_ptr {
        // A mask of !0 means no positions are masked off: every lane in this
        // chunk is a legitimate candidate.
        let m = fwd_find_in_chunk(
            fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, !0,
        );
        if let Some(chunki) = m {
            return Some(matched(start_ptr, ptr, chunki));
        }
        ptr = ptr.add(size_of::<V>());
    }
    // Handle the final partial chunk, if any. Instead of reading out of
    // bounds, we rewind ptr to max_ptr (so all loads stay in bounds) and
    // mask off the low-order positions the main loop already examined.
    if ptr < end_ptr {
        let remaining = diff(end_ptr, ptr);
        debug_assert!(
            remaining < min_haystack_len,
            "remaining bytes should be smaller than the minimum haystack \
             length of {}, but there are {} bytes remaining",
            min_haystack_len,
            remaining,
        );
        if remaining < needle.len() {
            // The needle can no longer fit in what's left of the haystack.
            return None;
        }
        debug_assert!(
            max_ptr < ptr,
            "after main loop, ptr should have exceeded max_ptr",
        );
        // The number of positions at the start of the rewound chunk that
        // were already covered by the main loop's last iteration.
        let overlap = diff(ptr, max_ptr);
        debug_assert!(
            overlap > 0,
            "overlap ({}) must always be non-zero",
            overlap,
        );
        debug_assert!(
            overlap < size_of::<V>(),
            "overlap ({}) cannot possibly be >= than a vector ({})",
            overlap,
            size_of::<V>(),
        );
        // The mask has all of its bits set except for the first N least
        // significant bits, where N=overlap. This way, any matches that
        // occur in find_in_chunk within the overlap are automatically
        // ignored.
        let mask = !((1 << overlap) - 1);
        ptr = max_ptr;
        let m = fwd_find_in_chunk(
            fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, mask,
        );
        if let Some(chunki) = m {
            return Some(matched(start_ptr, ptr, chunki));
        }
    }
    None
}
196 
/// Search for an occurrence of two rare bytes from the needle in the chunk
/// pointed to by ptr, with the end of the haystack pointed to by end_ptr. When
/// an occurrence is found, memcmp is run to check if a match occurs at the
/// corresponding position.
///
/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
/// bytes repeated in each 8-bit lane, respectively.
///
/// mask should have bits set corresponding the positions in the chunk in which
/// matches are considered. This is only used for the last vector load where
/// the beginning of the vector might have overlapped with the last load in
/// the main loop. The mask lets us avoid visiting positions that have already
/// been discarded as matches.
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at both
/// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned
/// loads on ptr up to (end_ptr - needle.len()).
#[inline(always)]
unsafe fn fwd_find_in_chunk<V: Vector>(
    fwd: &Forward,
    needle: &[u8],
    ptr: *const u8,
    end_ptr: *const u8,
    rare1chunk: V,
    rare2chunk: V,
    mask: u32,
) -> Option<usize> {
    // Load the haystack twice, once offset by each rare byte's position in
    // the needle, so that a single lane matches in both comparisons only
    // when the two rare bytes appear at their correct relative positions.
    let chunk0 = V::load_unaligned(ptr.add(fwd.rare1i as usize));
    let chunk1 = V::load_unaligned(ptr.add(fwd.rare2i as usize));

    let eq0 = chunk0.cmpeq(rare1chunk);
    let eq1 = chunk1.cmpeq(rare2chunk);

    // Each set bit marks a candidate start position (relative to ptr) where
    // both rare bytes lined up; mask drops positions already searched.
    let mut match_offsets = eq0.and(eq1).movemask() & mask;
    while match_offsets != 0 {
        // trailing_zeros yields candidates in increasing offset order.
        let offset = match_offsets.trailing_zeros() as usize;
        let ptr = ptr.add(offset);
        // If a full needle cannot fit at this position, it cannot fit at
        // any later (larger-offset) candidate either, so we can give up on
        // the whole chunk rather than continue the loop.
        if end_ptr.sub(needle.len()) < ptr {
            return None;
        }
        // Confirm the candidate with a full comparison against the needle.
        let chunk = core::slice::from_raw_parts(ptr, needle.len());
        if memcmp(needle, chunk) {
            return Some(offset);
        }
        // Clear the lowest set bit and move on to the next candidate.
        match_offsets &= match_offsets - 1;
    }
    None
}
247 
248 /// Accepts a chunk-relative offset and returns a haystack relative offset
249 /// after updating the prefilter state.
250 ///
251 /// See the same function with the same name in the prefilter variant of this
252 /// algorithm to learned why it's tagged with inline(never). Even here, where
253 /// the function is simpler, inlining it leads to poorer codegen. (Although
254 /// it does improve some benchmarks, like prebuiltiter/huge-en/common-you.)
255 #[cold]
256 #[inline(never)]
matched(start_ptr: *const u8, ptr: *const u8, chunki: usize) -> usize257 fn matched(start_ptr: *const u8, ptr: *const u8, chunki: usize) -> usize {
258     diff(ptr, start_ptr) + chunki
259 }
260 
/// Returns how many bytes `a` is past `b`. The caller must ensure that `a`
/// does not precede `b`; in debug builds this precondition is asserted.
fn diff(a: *const u8, b: *const u8) -> usize {
    let (hi, lo) = (a as usize, b as usize);
    debug_assert!(hi >= lo);
    hi - lo
}
267