1 // See the README in this directory for an explanation of the Teddy algorithm.
2 // It is strongly recommended to peruse the README before trying to grok this
3 // code, as its use of SIMD is pretty opaque, although I tried to add comments
4 // where appropriate.
5 //
6 // Moreover, while there is a lot of code in this file, most of it is
7 // repeated variants of the same thing. Specifically, there are three Teddy
8 // variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets)
9 // and Fat 256-bit Teddy (16 buckets). For each variant, there are three
10 // implementations, corresponding to mask lengths of 1, 2 and 3. Bringing it to
11 // a total of nine variants. Each one is structured roughly the same:
12 //
13 //     while at <= len(haystack) - CHUNK_SIZE:
14 //         let candidate = find_candidate_in_chunk(haystack, at)
15 //         if not all zeroes(candidate):
16 //             if match = verify(haystack, at, candidate):
17 //                 return match
18 //
19 // For the most part, this remains unchanged. The parts that vary are the
20 // verification routine (for slim vs fat Teddy) and the candidate extraction
21 // (based on the number of masks).
22 //
23 // In the code below, a "candidate" corresponds to a single vector with 8-bit
24 // lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the
25 // jth lane if and only if the byte occurring at position `j` is in the
26 // bucket `i` (where the `j`th position is the position in the current window
27 // of the haystack, which is always 16 or 32 bytes). Note to be careful here:
28 // the ith bit and the jth lane correspond to the least significant bits of the
29 // vector. So when visualizing how the current window of bytes is stored in a
30 // vector, you often need to flip it around. For example, the text `abcd` in a
31 // 4-byte vector would look like this:
32 //
33 //     01100100 01100011 01100010 01100001
34 //         d        c        b        a
35 //
36 // When the mask length is 1, then finding the candidate is pretty straight
37 // forward: you just apply the shuffle indices (from the haystack window) to
38 // the masks, and then AND them together, as described in the README. But for
39 // masks of length 2 and 3, you need to keep a little state. Specifically,
40 // you need to store the final 1 (for mask length 2) or 2 (for mask length 3)
41 // bytes of the candidate for use when searching the next window. This is for
42 // handling matches that span two windows.
43 //
44 // With respect to the repeated code, it would likely be possible to reduce
45 // the number of copies of code below using polymorphism, but I find this
46 // formulation clearer instead of needing to reason through generics. However,
47 // I admit, there may be a simpler generic construction that I'm missing.
48 //
49 // All variants are fairly heavily tested in src/packed/tests.rs.
50 
51 use std::arch::x86_64::*;
52 use std::mem;
53 
54 use crate::packed::pattern::{PatternID, Patterns};
55 use crate::packed::teddy::compile;
56 use crate::packed::vector::*;
57 use crate::Match;
58 
59 /// The Teddy runtime.
60 ///
61 /// A Teddy runtime can be used to quickly search for occurrences of one or
62 /// more patterns. While it does not scale to an arbitrary number of patterns
63 /// like Aho-Corasick, it does find occurrences for a small set of patterns
64 /// much more quickly than Aho-Corasick.
65 ///
66 /// Teddy cannot run on small haystacks below a certain size, which is
67 /// dependent on the type of matcher used. This size can be queried via the
68 /// `minimum_len` method. Violating this will result in a panic.
69 ///
70 /// Finally, when callers use a Teddy runtime, they must provide precisely the
71 /// patterns used to construct the Teddy matcher. Violating this will result
72 /// in either a panic or incorrect results, but will never sacrifice memory
73 /// safety.
74 #[derive(Clone, Debug)]
75 pub struct Teddy {
76     /// The allocation of patterns in buckets. This only contains the IDs of
77     /// patterns. In order to do full verification, callers must provide the
78     /// actual patterns when using Teddy.
79     pub buckets: Vec<Vec<PatternID>>,
80     /// The maximum identifier of a pattern. This is used as a sanity check to
81     /// ensure that the patterns provided by the caller are the same as the
82     /// patterns that were used to compile the matcher. This sanity check
83     /// permits safely eliminating bounds checks regardless of what patterns
84     /// are provided by the caller.
85     ///
86     /// Note that users of the aho-corasick crate cannot get this wrong. Only
87     /// code internal to this crate can get it wrong, since neither `Patterns`
88     /// type nor the Teddy runtime are public API items.
89     pub max_pattern_id: PatternID,
90     /// The actual runtime to use.
91     pub exec: Exec,
92 }
93 
94 impl Teddy {
95     /// Return the first occurrence of a match in the given haystack after or
96     /// starting at `at`.
97     ///
98     /// The patterns provided must be precisely the same patterns given to the
99     /// Teddy builder, otherwise this may panic or produce incorrect results.
100     ///
101     /// All matches are consistent with the match semantics (leftmost-first or
102     /// leftmost-longest) set on `pats`.
find_at( &self, pats: &Patterns, haystack: &[u8], at: usize, ) -> Option<Match>103     pub fn find_at(
104         &self,
105         pats: &Patterns,
106         haystack: &[u8],
107         at: usize,
108     ) -> Option<Match> {
109         // This assert is a bit subtle, but it's an important guarantee.
110         // Namely, if the maximum pattern ID seen by Teddy is the same as the
111         // one in the patterns given, then we are guaranteed that every pattern
112         // ID in all Teddy buckets are valid indices into `pats`. While this
113         // is nominally true, there is no guarantee that callers provide the
114         // same `pats` to both the Teddy builder and the searcher, which would
115         // otherwise make `find_at` unsafe to call. But this assert lets us
116         // keep this routine safe and eliminate an important bounds check in
117         // verification.
118         assert_eq!(
119             self.max_pattern_id,
120             pats.max_pattern_id(),
121             "teddy must be called with same patterns it was built with",
122         );
123         // SAFETY: The haystack must have at least a minimum number of bytes
124         // for Teddy to be able to work. The minimum number varies depending on
125         // which matcher is used below. If this is violated, then it's possible
126         // for searching to do out-of-bounds writes.
127         assert!(haystack[at..].len() >= self.minimum_len());
128         // SAFETY: The various Teddy matchers are always safe to call because
129         // the Teddy builder guarantees that a particular Exec variant is
130         // built only when it can be run the current CPU. That is, the Teddy
131         // builder will not produce a Exec::TeddySlim1Mask256 unless AVX2 is
132         // enabled. That is, our dynamic CPU feature detection is performed
133         // once in the builder, and we rely on the type system to avoid needing
134         // to do it again.
135         unsafe {
136             match self.exec {
137                 Exec::TeddySlim1Mask128(ref e) => {
138                     e.find_at(pats, self, haystack, at)
139                 }
140                 Exec::TeddySlim1Mask256(ref e) => {
141                     e.find_at(pats, self, haystack, at)
142                 }
143                 Exec::TeddyFat1Mask256(ref e) => {
144                     e.find_at(pats, self, haystack, at)
145                 }
146                 Exec::TeddySlim2Mask128(ref e) => {
147                     e.find_at(pats, self, haystack, at)
148                 }
149                 Exec::TeddySlim2Mask256(ref e) => {
150                     e.find_at(pats, self, haystack, at)
151                 }
152                 Exec::TeddyFat2Mask256(ref e) => {
153                     e.find_at(pats, self, haystack, at)
154                 }
155                 Exec::TeddySlim3Mask128(ref e) => {
156                     e.find_at(pats, self, haystack, at)
157                 }
158                 Exec::TeddySlim3Mask256(ref e) => {
159                     e.find_at(pats, self, haystack, at)
160                 }
161                 Exec::TeddyFat3Mask256(ref e) => {
162                     e.find_at(pats, self, haystack, at)
163                 }
164             }
165         }
166     }
167 
168     /// Returns the minimum length of a haystack that must be provided by
169     /// callers to this Teddy searcher. Providing a haystack shorter than this
170     /// will result in a panic, but will never violate memory safety.
minimum_len(&self) -> usize171     pub fn minimum_len(&self) -> usize {
172         // SAFETY: These values must be correct in order to ensure safety.
173         // The Teddy runtime assumes their haystacks have at least these
174         // lengths. Violating this will sacrifice memory safety.
175         match self.exec {
176             Exec::TeddySlim1Mask128(_) => 16,
177             Exec::TeddySlim1Mask256(_) => 32,
178             Exec::TeddyFat1Mask256(_) => 16,
179             Exec::TeddySlim2Mask128(_) => 17,
180             Exec::TeddySlim2Mask256(_) => 33,
181             Exec::TeddyFat2Mask256(_) => 17,
182             Exec::TeddySlim3Mask128(_) => 18,
183             Exec::TeddySlim3Mask256(_) => 34,
184             Exec::TeddyFat3Mask256(_) => 34,
185         }
186     }
187 
188     /// Returns the approximate total amount of heap used by this searcher, in
189     /// units of bytes.
heap_bytes(&self) -> usize190     pub fn heap_bytes(&self) -> usize {
191         let num_patterns = self.max_pattern_id as usize + 1;
192         self.buckets.len() * mem::size_of::<Vec<PatternID>>()
193             + num_patterns * mem::size_of::<PatternID>()
194     }
195 
196     /// Runs the verification routine for Slim 128-bit Teddy.
197     ///
198     /// The candidate given should be a collection of 8-bit bitsets (one bitset
199     /// per lane), where the ith bit is set in the jth lane if and only if the
200     /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
201     ///
202     /// This is not safe to call unless the SSSE3 target feature is enabled.
203     /// The `target_feature` attribute is not applied since this function is
204     /// always forcefully inlined.
205     #[inline(always)]
verify128( &self, pats: &Patterns, haystack: &[u8], at: usize, cand: __m128i, ) -> Option<Match>206     unsafe fn verify128(
207         &self,
208         pats: &Patterns,
209         haystack: &[u8],
210         at: usize,
211         cand: __m128i,
212     ) -> Option<Match> {
213         debug_assert!(!is_all_zeroes128(cand));
214         debug_assert_eq!(8, self.buckets.len());
215 
216         // Convert the candidate into 64-bit chunks, and then verify each of
217         // those chunks.
218         let parts = unpack64x128(cand);
219         for (i, &part) in parts.iter().enumerate() {
220             let pos = at + i * 8;
221             if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
222                 return Some(m);
223             }
224         }
225         None
226     }
227 
228     /// Runs the verification routine for Slim 256-bit Teddy.
229     ///
230     /// The candidate given should be a collection of 8-bit bitsets (one bitset
231     /// per lane), where the ith bit is set in the jth lane if and only if the
232     /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
233     ///
234     /// This is not safe to call unless the AVX2 target feature is enabled.
235     /// The `target_feature` attribute is not applied since this function is
236     /// always forcefully inlined.
237     #[inline(always)]
verify256( &self, pats: &Patterns, haystack: &[u8], at: usize, cand: __m256i, ) -> Option<Match>238     unsafe fn verify256(
239         &self,
240         pats: &Patterns,
241         haystack: &[u8],
242         at: usize,
243         cand: __m256i,
244     ) -> Option<Match> {
245         debug_assert!(!is_all_zeroes256(cand));
246         debug_assert_eq!(8, self.buckets.len());
247 
248         // Convert the candidate into 64-bit chunks, and then verify each of
249         // those chunks.
250         let parts = unpack64x256(cand);
251         for (i, &part) in parts.iter().enumerate() {
252             let pos = at + i * 8;
253             if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
254                 return Some(m);
255             }
256         }
257         None
258     }
259 
260     /// Runs the verification routine for Fat 256-bit Teddy.
261     ///
262     /// The candidate given should be a collection of 8-bit bitsets (one bitset
263     /// per lane), where the ith bit is set in the jth lane if and only if the
264     /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the
265     /// bucket `j < 16 ? i : i + 8`.
266     ///
267     /// This is not safe to call unless the AVX2 target feature is enabled.
268     /// The `target_feature` attribute is not applied since this function is
269     /// always forcefully inlined.
270     #[inline(always)]
verify_fat256( &self, pats: &Patterns, haystack: &[u8], at: usize, cand: __m256i, ) -> Option<Match>271     unsafe fn verify_fat256(
272         &self,
273         pats: &Patterns,
274         haystack: &[u8],
275         at: usize,
276         cand: __m256i,
277     ) -> Option<Match> {
278         debug_assert!(!is_all_zeroes256(cand));
279         debug_assert_eq!(16, self.buckets.len());
280 
281         // This is a bit tricky, but we basically want to convert our
282         // candidate, which looks like this
283         //
284         //     a31 a30 ... a17 a16 a15 a14 ... a01 a00
285         //
286         // where each a(i) is an 8-bit bitset corresponding to the activated
287         // buckets, to this
288         //
289         //     a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
290         //
291         // Namely, for Fat Teddy, the high 128-bits of the candidate correspond
292         // to the same bytes in the haystack in the low 128-bits (so we only
293         // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
294         //
295         // The verification routine wants to look at all potentially matching
296         // buckets before moving on to the next lane. So for example, both
297         // a16 and a00 both correspond to the first byte in our window; a00
298         // contains buckets 0-7 and a16 contains buckets 8-15. Specifically,
299         // a16 should be checked before a01. So the transformation shown above
300         // allows us to use our normal verification procedure with one small
301         // change: we treat each bitset as 16 bits instead of 8 bits.
302 
303         // Swap the 128-bit lanes in the candidate vector.
304         let swap = _mm256_permute4x64_epi64(cand, 0x4E);
305         // Interleave the bytes from the low 128-bit lanes, starting with
306         // cand first.
307         let r1 = _mm256_unpacklo_epi8(cand, swap);
308         // Interleave the bytes from the high 128-bit lanes, starting with
309         // cand first.
310         let r2 = _mm256_unpackhi_epi8(cand, swap);
311         // Now just take the 2 low 64-bit integers from both r1 and r2. We
312         // can drop the high 64-bit integers because they are a mirror image
313         // of the low 64-bit integers. All we care about are the low 128-bit
314         // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
315         // laid out in the desired order, as described above.
316         let parts = unpacklo64x256(r1, r2);
317         for (i, &part) in parts.iter().enumerate() {
318             let pos = at + i * 4;
319             if let Some(m) = self.verify64(pats, 16, haystack, pos, part) {
320                 return Some(m);
321             }
322         }
323         None
324     }
325 
326     /// Verify whether there are any matches starting at or after `at` in the
327     /// given `haystack`. The candidate given should correspond to either 8-bit
328     /// (for 8 buckets) or 16-bit (16 buckets) bitsets.
329     #[inline(always)]
verify64( &self, pats: &Patterns, bucket_count: usize, haystack: &[u8], at: usize, mut cand: u64, ) -> Option<Match>330     fn verify64(
331         &self,
332         pats: &Patterns,
333         bucket_count: usize,
334         haystack: &[u8],
335         at: usize,
336         mut cand: u64,
337     ) -> Option<Match> {
338         // N.B. While the bucket count is known from self.buckets.len(),
339         // requiring it as a parameter makes it easier for the optimizer to
340         // know its value, and thus produce more efficient codegen.
341         debug_assert!(bucket_count == 8 || bucket_count == 16);
342         while cand != 0 {
343             let bit = cand.trailing_zeros() as usize;
344             cand &= !(1 << bit);
345 
346             let at = at + (bit / bucket_count);
347             let bucket = bit % bucket_count;
348             if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) {
349                 return Some(m);
350             }
351         }
352         None
353     }
354 
355     /// Verify whether there are any matches starting at `at` in the given
356     /// `haystack` corresponding only to patterns in the given bucket.
357     #[inline(always)]
verify_bucket( &self, pats: &Patterns, haystack: &[u8], bucket: usize, at: usize, ) -> Option<Match>358     fn verify_bucket(
359         &self,
360         pats: &Patterns,
361         haystack: &[u8],
362         bucket: usize,
363         at: usize,
364     ) -> Option<Match> {
365         // Forcing this function to not inline and be "cold" seems to help
366         // the codegen for Teddy overall. Interestingly, this is good for a
367         // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among
368         // others). Overall, this seems like a problem with codegen, since
369         // creating the Match itself is a very small amount of code.
370         #[cold]
371         #[inline(never)]
372         fn match_from_span(
373             pati: PatternID,
374             start: usize,
375             end: usize,
376         ) -> Match {
377             Match::from_span(pati as usize, start, end)
378         }
379 
380         // N.B. The bounds check for this bucket lookup *should* be elided
381         // since we assert the number of buckets in each `find_at` routine,
382         // and the compiler can prove that the `% 8` (or `% 16`) in callers
383         // of this routine will always be in bounds.
384         for &pati in &self.buckets[bucket] {
385             // SAFETY: This is safe because we are guaranteed that every
386             // index in a Teddy bucket is a valid index into `pats`. This
387             // guarantee is upheld by the assert checking `max_pattern_id` in
388             // the beginning of `find_at` above.
389             //
390             // This explicit bounds check elision is (amazingly) good for a
391             // 25-50% boost in some benchmarks, particularly ones with a lot
392             // of short literals.
393             let pat = unsafe { pats.get_unchecked(pati) };
394             if pat.is_prefix(&haystack[at..]) {
395                 return Some(match_from_span(pati, at, at + pat.len()));
396             }
397         }
398         None
399     }
400 }
401 
402 /// Exec represents the different search strategies supported by the Teddy
403 /// runtime.
404 ///
405 /// This enum is an important safety abstraction. Namely, callers should only
406 /// construct a variant in this enum if it is safe to execute its corresponding
407 /// target features on the current CPU. The 128-bit searchers require SSSE3,
408 /// while the 256-bit searchers require AVX2.
409 #[derive(Clone, Debug)]
410 pub enum Exec {
411     TeddySlim1Mask128(TeddySlim1Mask128),
412     TeddySlim1Mask256(TeddySlim1Mask256),
413     TeddyFat1Mask256(TeddyFat1Mask256),
414     TeddySlim2Mask128(TeddySlim2Mask128),
415     TeddySlim2Mask256(TeddySlim2Mask256),
416     TeddyFat2Mask256(TeddyFat2Mask256),
417     TeddySlim3Mask128(TeddySlim3Mask128),
418     TeddySlim3Mask256(TeddySlim3Mask256),
419     TeddyFat3Mask256(TeddyFat3Mask256),
420 }
421 
422 // Most of the code below remains undocumented because they are effectively
423 // repeated versions of themselves. The general structure is described in the
424 // README and in the comments above.
425 
426 #[derive(Clone, Debug)]
427 pub struct TeddySlim1Mask128 {
428     pub mask1: Mask128,
429 }
430 
431 impl TeddySlim1Mask128 {
432     #[target_feature(enable = "ssse3")]
find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option<Match>433     unsafe fn find_at(
434         &self,
435         pats: &Patterns,
436         teddy: &Teddy,
437         haystack: &[u8],
438         mut at: usize,
439     ) -> Option<Match> {
440         debug_assert!(haystack[at..].len() >= teddy.minimum_len());
441         // This assert helps eliminate bounds checks for bucket lookups in
442         // Teddy::verify_bucket, which has a small (3-4%) performance boost.
443         assert_eq!(8, teddy.buckets.len());
444 
445         let len = haystack.len();
446         while at <= len - 16 {
447             let c = self.candidate(haystack, at);
448             if !is_all_zeroes128(c) {
449                 if let Some(m) = teddy.verify128(pats, haystack, at, c) {
450                     return Some(m);
451                 }
452             }
453             at += 16;
454         }
455         if at < len {
456             at = len - 16;
457             let c = self.candidate(haystack, at);
458             if !is_all_zeroes128(c) {
459                 if let Some(m) = teddy.verify128(pats, haystack, at, c) {
460                     return Some(m);
461                 }
462             }
463         }
464         None
465     }
466 
467     #[inline(always)]
candidate(&self, haystack: &[u8], at: usize) -> __m128i468     unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i {
469         debug_assert!(haystack[at..].len() >= 16);
470 
471         let chunk = loadu128(haystack, at);
472         members1m128(chunk, self.mask1)
473     }
474 }
475 
476 #[derive(Clone, Debug)]
477 pub struct TeddySlim1Mask256 {
478     pub mask1: Mask256,
479 }
480 
481 impl TeddySlim1Mask256 {
482     #[target_feature(enable = "avx2")]
find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option<Match>483     unsafe fn find_at(
484         &self,
485         pats: &Patterns,
486         teddy: &Teddy,
487         haystack: &[u8],
488         mut at: usize,
489     ) -> Option<Match> {
490         debug_assert!(haystack[at..].len() >= teddy.minimum_len());
491         // This assert helps eliminate bounds checks for bucket lookups in
492         // Teddy::verify_bucket, which has a small (3-4%) performance boost.
493         assert_eq!(8, teddy.buckets.len());
494 
495         let len = haystack.len();
496         while at <= len - 32 {
497             let c = self.candidate(haystack, at);
498             if !is_all_zeroes256(c) {
499                 if let Some(m) = teddy.verify256(pats, haystack, at, c) {
500                     return Some(m);
501                 }
502             }
503             at += 32;
504         }
505         if at < len {
506             at = len - 32;
507             let c = self.candidate(haystack, at);
508             if !is_all_zeroes256(c) {
509                 if let Some(m) = teddy.verify256(pats, haystack, at, c) {
510                     return Some(m);
511                 }
512             }
513         }
514         None
515     }
516 
517     #[inline(always)]
candidate(&self, haystack: &[u8], at: usize) -> __m256i518     unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
519         debug_assert!(haystack[at..].len() >= 32);
520 
521         let chunk = loadu256(haystack, at);
522         members1m256(chunk, self.mask1)
523     }
524 }
525 
526 #[derive(Clone, Debug)]
527 pub struct TeddyFat1Mask256 {
528     pub mask1: Mask256,
529 }
530 
531 impl TeddyFat1Mask256 {
532     #[target_feature(enable = "avx2")]
find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option<Match>533     unsafe fn find_at(
534         &self,
535         pats: &Patterns,
536         teddy: &Teddy,
537         haystack: &[u8],
538         mut at: usize,
539     ) -> Option<Match> {
540         debug_assert!(haystack[at..].len() >= teddy.minimum_len());
541         // This assert helps eliminate bounds checks for bucket lookups in
542         // Teddy::verify_bucket, which has a small (3-4%) performance boost.
543         assert_eq!(16, teddy.buckets.len());
544 
545         let len = haystack.len();
546         while at <= len - 16 {
547             let c = self.candidate(haystack, at);
548             if !is_all_zeroes256(c) {
549                 if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
550                     return Some(m);
551                 }
552             }
553             at += 16;
554         }
555         if at < len {
556             at = len - 16;
557             let c = self.candidate(haystack, at);
558             if !is_all_zeroes256(c) {
559                 if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
560                     return Some(m);
561                 }
562             }
563         }
564         None
565     }
566 
567     #[inline(always)]
candidate(&self, haystack: &[u8], at: usize) -> __m256i568     unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
569         debug_assert!(haystack[at..].len() >= 16);
570 
571         let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
572         members1m256(chunk, self.mask1)
573     }
574 }
575 
576 #[derive(Clone, Debug)]
577 pub struct TeddySlim2Mask128 {
578     pub mask1: Mask128,
579     pub mask2: Mask128,
580 }
581 
582 impl TeddySlim2Mask128 {
583     #[target_feature(enable = "ssse3")]
find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option<Match>584     unsafe fn find_at(
585         &self,
586         pats: &Patterns,
587         teddy: &Teddy,
588         haystack: &[u8],
589         mut at: usize,
590     ) -> Option<Match> {
591         debug_assert!(haystack[at..].len() >= teddy.minimum_len());
592         // This assert helps eliminate bounds checks for bucket lookups in
593         // Teddy::verify_bucket, which has a small (3-4%) performance boost.
594         assert_eq!(8, teddy.buckets.len());
595 
596         at += 1;
597         let len = haystack.len();
598         let mut prev0 = ones128();
599         while at <= len - 16 {
600             let c = self.candidate(haystack, at, &mut prev0);
601             if !is_all_zeroes128(c) {
602                 if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
603                     return Some(m);
604                 }
605             }
606             at += 16;
607         }
608         if at < len {
609             at = len - 16;
610             prev0 = ones128();
611 
612             let c = self.candidate(haystack, at, &mut prev0);
613             if !is_all_zeroes128(c) {
614                 if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
615                     return Some(m);
616                 }
617             }
618         }
619         None
620     }
621 
622     #[inline(always)]
candidate( &self, haystack: &[u8], at: usize, prev0: &mut __m128i, ) -> __m128i623     unsafe fn candidate(
624         &self,
625         haystack: &[u8],
626         at: usize,
627         prev0: &mut __m128i,
628     ) -> __m128i {
629         debug_assert!(haystack[at..].len() >= 16);
630 
631         let chunk = loadu128(haystack, at);
632         let (res0, res1) = members2m128(chunk, self.mask1, self.mask2);
633         let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15);
634         _mm_and_si128(res0prev0, res1)
635     }
636 }
637 
638 #[derive(Clone, Debug)]
639 pub struct TeddySlim2Mask256 {
640     pub mask1: Mask256,
641     pub mask2: Mask256,
642 }
643 
644 impl TeddySlim2Mask256 {
645     #[target_feature(enable = "avx2")]
find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option<Match>646     unsafe fn find_at(
647         &self,
648         pats: &Patterns,
649         teddy: &Teddy,
650         haystack: &[u8],
651         mut at: usize,
652     ) -> Option<Match> {
653         debug_assert!(haystack[at..].len() >= teddy.minimum_len());
654         // This assert helps eliminate bounds checks for bucket lookups in
655         // Teddy::verify_bucket, which has a small (3-4%) performance boost.
656         assert_eq!(8, teddy.buckets.len());
657 
658         at += 1;
659         let len = haystack.len();
660         let mut prev0 = ones256();
661         while at <= len - 32 {
662             let c = self.candidate(haystack, at, &mut prev0);
663             if !is_all_zeroes256(c) {
664                 if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
665                     return Some(m);
666                 }
667             }
668             at += 32;
669         }
670         if at < len {
671             at = len - 32;
672             prev0 = ones256();
673 
674             let c = self.candidate(haystack, at, &mut prev0);
675             if !is_all_zeroes256(c) {
676                 if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
677                     return Some(m);
678                 }
679             }
680         }
681         None
682     }
683 
684     #[inline(always)]
candidate( &self, haystack: &[u8], at: usize, prev0: &mut __m256i, ) -> __m256i685     unsafe fn candidate(
686         &self,
687         haystack: &[u8],
688         at: usize,
689         prev0: &mut __m256i,
690     ) -> __m256i {
691         debug_assert!(haystack[at..].len() >= 32);
692 
693         let chunk = loadu256(haystack, at);
694         let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
695         let res0prev0 = alignr256_15(res0, *prev0);
696         let res = _mm256_and_si256(res0prev0, res1);
697         *prev0 = res0;
698         res
699     }
700 }
701 
702 #[derive(Clone, Debug)]
703 pub struct TeddyFat2Mask256 {
704     pub mask1: Mask256,
705     pub mask2: Mask256,
706 }
707 
708 impl TeddyFat2Mask256 {
709     #[target_feature(enable = "avx2")]
find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option<Match>710     unsafe fn find_at(
711         &self,
712         pats: &Patterns,
713         teddy: &Teddy,
714         haystack: &[u8],
715         mut at: usize,
716     ) -> Option<Match> {
717         debug_assert!(haystack[at..].len() >= teddy.minimum_len());
718         // This assert helps eliminate bounds checks for bucket lookups in
719         // Teddy::verify_bucket, which has a small (3-4%) performance boost.
720         assert_eq!(16, teddy.buckets.len());
721 
722         at += 1;
723         let len = haystack.len();
724         let mut prev0 = ones256();
725         while at <= len - 16 {
726             let c = self.candidate(haystack, at, &mut prev0);
727             if !is_all_zeroes256(c) {
728                 if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
729                 {
730                     return Some(m);
731                 }
732             }
733             at += 16;
734         }
735         if at < len {
736             at = len - 16;
737             prev0 = ones256();
738 
739             let c = self.candidate(haystack, at, &mut prev0);
740             if !is_all_zeroes256(c) {
741                 if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
742                 {
743                     return Some(m);
744                 }
745             }
746         }
747         None
748     }
749 
750     #[inline(always)]
candidate( &self, haystack: &[u8], at: usize, prev0: &mut __m256i, ) -> __m256i751     unsafe fn candidate(
752         &self,
753         haystack: &[u8],
754         at: usize,
755         prev0: &mut __m256i,
756     ) -> __m256i {
757         debug_assert!(haystack[at..].len() >= 16);
758 
759         let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
760         let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
761         let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15);
762         let res = _mm256_and_si256(res0prev0, res1);
763         *prev0 = res0;
764         res
765     }
766 }
767 
768 #[derive(Clone, Debug)]
769 pub struct TeddySlim3Mask128 {
770     pub mask1: Mask128,
771     pub mask2: Mask128,
772     pub mask3: Mask128,
773 }
774 
775 impl TeddySlim3Mask128 {
776     #[target_feature(enable = "ssse3")]
find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option<Match>777     unsafe fn find_at(
778         &self,
779         pats: &Patterns,
780         teddy: &Teddy,
781         haystack: &[u8],
782         mut at: usize,
783     ) -> Option<Match> {
784         debug_assert!(haystack[at..].len() >= teddy.minimum_len());
785         // This assert helps eliminate bounds checks for bucket lookups in
786         // Teddy::verify_bucket, which has a small (3-4%) performance boost.
787         assert_eq!(8, teddy.buckets.len());
788 
789         at += 2;
790         let len = haystack.len();
791         let (mut prev0, mut prev1) = (ones128(), ones128());
792         while at <= len - 16 {
793             let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
794             if !is_all_zeroes128(c) {
795                 if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
796                     return Some(m);
797                 }
798             }
799             at += 16;
800         }
801         if at < len {
802             at = len - 16;
803             prev0 = ones128();
804             prev1 = ones128();
805 
806             let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
807             if !is_all_zeroes128(c) {
808                 if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
809                     return Some(m);
810                 }
811             }
812         }
813         None
814     }
815 
816     #[inline(always)]
candidate( &self, haystack: &[u8], at: usize, prev0: &mut __m128i, prev1: &mut __m128i, ) -> __m128i817     unsafe fn candidate(
818         &self,
819         haystack: &[u8],
820         at: usize,
821         prev0: &mut __m128i,
822         prev1: &mut __m128i,
823     ) -> __m128i {
824         debug_assert!(haystack[at..].len() >= 16);
825 
826         let chunk = loadu128(haystack, at);
827         let (res0, res1, res2) =
828             members3m128(chunk, self.mask1, self.mask2, self.mask3);
829         let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14);
830         let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15);
831         let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2);
832         *prev0 = res0;
833         *prev1 = res1;
834         res
835     }
836 }
837 
838 #[derive(Clone, Debug)]
839 pub struct TeddySlim3Mask256 {
840     pub mask1: Mask256,
841     pub mask2: Mask256,
842     pub mask3: Mask256,
843 }
844 
845 impl TeddySlim3Mask256 {
846     #[target_feature(enable = "avx2")]
find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option<Match>847     unsafe fn find_at(
848         &self,
849         pats: &Patterns,
850         teddy: &Teddy,
851         haystack: &[u8],
852         mut at: usize,
853     ) -> Option<Match> {
854         debug_assert!(haystack[at..].len() >= teddy.minimum_len());
855         // This assert helps eliminate bounds checks for bucket lookups in
856         // Teddy::verify_bucket, which has a small (3-4%) performance boost.
857         assert_eq!(8, teddy.buckets.len());
858 
859         at += 2;
860         let len = haystack.len();
861         let (mut prev0, mut prev1) = (ones256(), ones256());
862         while at <= len - 32 {
863             let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
864             if !is_all_zeroes256(c) {
865                 if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
866                     return Some(m);
867                 }
868             }
869             at += 32;
870         }
871         if at < len {
872             at = len - 32;
873             prev0 = ones256();
874             prev1 = ones256();
875 
876             let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
877             if !is_all_zeroes256(c) {
878                 if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
879                     return Some(m);
880                 }
881             }
882         }
883         None
884     }
885 
886     #[inline(always)]
candidate( &self, haystack: &[u8], at: usize, prev0: &mut __m256i, prev1: &mut __m256i, ) -> __m256i887     unsafe fn candidate(
888         &self,
889         haystack: &[u8],
890         at: usize,
891         prev0: &mut __m256i,
892         prev1: &mut __m256i,
893     ) -> __m256i {
894         debug_assert!(haystack[at..].len() >= 32);
895 
896         let chunk = loadu256(haystack, at);
897         let (res0, res1, res2) =
898             members3m256(chunk, self.mask1, self.mask2, self.mask3);
899         let res0prev0 = alignr256_14(res0, *prev0);
900         let res1prev1 = alignr256_15(res1, *prev1);
901         let res =
902             _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
903         *prev0 = res0;
904         *prev1 = res1;
905         res
906     }
907 }
908 
909 #[derive(Clone, Debug)]
910 pub struct TeddyFat3Mask256 {
911     pub mask1: Mask256,
912     pub mask2: Mask256,
913     pub mask3: Mask256,
914 }
915 
916 impl TeddyFat3Mask256 {
917     #[target_feature(enable = "avx2")]
find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option<Match>918     unsafe fn find_at(
919         &self,
920         pats: &Patterns,
921         teddy: &Teddy,
922         haystack: &[u8],
923         mut at: usize,
924     ) -> Option<Match> {
925         debug_assert!(haystack[at..].len() >= teddy.minimum_len());
926         // This assert helps eliminate bounds checks for bucket lookups in
927         // Teddy::verify_bucket, which has a small (3-4%) performance boost.
928         assert_eq!(16, teddy.buckets.len());
929 
930         at += 2;
931         let len = haystack.len();
932         let (mut prev0, mut prev1) = (ones256(), ones256());
933         while at <= len - 16 {
934             let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
935             if !is_all_zeroes256(c) {
936                 if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
937                 {
938                     return Some(m);
939                 }
940             }
941             at += 16;
942         }
943         if at < len {
944             at = len - 16;
945             prev0 = ones256();
946             prev1 = ones256();
947 
948             let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
949             if !is_all_zeroes256(c) {
950                 if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
951                 {
952                     return Some(m);
953                 }
954             }
955         }
956         None
957     }
958 
959     #[inline(always)]
candidate( &self, haystack: &[u8], at: usize, prev0: &mut __m256i, prev1: &mut __m256i, ) -> __m256i960     unsafe fn candidate(
961         &self,
962         haystack: &[u8],
963         at: usize,
964         prev0: &mut __m256i,
965         prev1: &mut __m256i,
966     ) -> __m256i {
967         debug_assert!(haystack[at..].len() >= 16);
968 
969         let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
970         let (res0, res1, res2) =
971             members3m256(chunk, self.mask1, self.mask2, self.mask3);
972         let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14);
973         let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15);
974         let res =
975             _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
976         *prev0 = res0;
977         *prev1 = res1;
978         res
979     }
980 }
981 
982 /// A 128-bit mask for the low and high nybbles in a set of patterns. Each
983 /// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
984 /// the nybble `j` is in the bucket `i` at a particular position.
985 #[derive(Clone, Copy, Debug)]
986 pub struct Mask128 {
987     lo: __m128i,
988     hi: __m128i,
989 }
990 
991 impl Mask128 {
992     /// Create a new SIMD mask from the mask produced by the Teddy builder.
new(mask: compile::Mask) -> Mask128993     pub fn new(mask: compile::Mask) -> Mask128 {
994         // SAFETY: This is safe since [u8; 16] has the same representation
995         // as __m128i.
996         unsafe {
997             Mask128 {
998                 lo: mem::transmute(mask.lo128()),
999                 hi: mem::transmute(mask.hi128()),
1000             }
1001         }
1002     }
1003 }
1004 
1005 /// A 256-bit mask for the low and high nybbles in a set of patterns. Each
1006 /// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
1007 /// the nybble `j` is in the bucket `i` at a particular position.
1008 ///
1009 /// This is slightly tweaked dependending on whether Slim or Fat Teddy is being
1010 /// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as
1011 /// the bitsets in the higher 128-bits, so that we can search 32 bytes at a
1012 /// time. (Remember, the nybbles in the haystack are used as indices into these
1013 /// masks, and 256-bit shuffles only operate on 128-bit lanes.)
1014 ///
1015 /// For Fat Teddy, the bitsets are not repeated, but instead, the high 128
1016 /// bits correspond to buckets 8-15. So that a bitset `00100010` has buckets
1017 /// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set
1018 /// if it's in the higher 128 bits.
1019 #[derive(Clone, Copy, Debug)]
1020 pub struct Mask256 {
1021     lo: __m256i,
1022     hi: __m256i,
1023 }
1024 
1025 impl Mask256 {
1026     /// Create a new SIMD mask from the mask produced by the Teddy builder.
new(mask: compile::Mask) -> Mask2561027     pub fn new(mask: compile::Mask) -> Mask256 {
1028         // SAFETY: This is safe since [u8; 32] has the same representation
1029         // as __m256i.
1030         unsafe {
1031             Mask256 {
1032                 lo: mem::transmute(mask.lo256()),
1033                 hi: mem::transmute(mask.hi256()),
1034             }
1035         }
1036     }
1037 }
1038 
1039 // The "members" routines below are responsible for taking a chunk of bytes,
1040 // a number of nybble masks and returning the result of using the masks to
1041 // lookup bytes in the chunk. The results of the high and low nybble masks are
1042 // AND'ed together, such that each candidate returned is a vector, with byte
1043 // sized lanes, and where each lane is an 8-bit bitset corresponding to the
1044 // buckets that contain the corresponding byte.
1045 //
1046 // In the case of masks of length greater than 1, callers will need to keep
1047 // the results from the previous haystack's window, and then shift the vectors
1048 // so that they all line up. Then they can be AND'ed together.
1049 
1050 /// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a
1051 /// 16-byte window of the haystack (where the least significant byte
1052 /// corresponds to the start of the window), and `mask1` corresponds to a
1053 /// low/high mask for the first byte of all patterns that are being searched.
1054 #[target_feature(enable = "ssse3")]
members1m128(chunk: __m128i, mask1: Mask128) -> __m128i1055 unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i {
1056     let lomask = _mm_set1_epi8(0xF);
1057     let hlo = _mm_and_si128(chunk, lomask);
1058     let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
1059     _mm_and_si128(
1060         _mm_shuffle_epi8(mask1.lo, hlo),
1061         _mm_shuffle_epi8(mask1.hi, hhi),
1062     )
1063 }
1064 
1065 /// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a
1066 /// 32-byte window of the haystack (where the least significant byte
1067 /// corresponds to the start of the window), and `mask1` corresponds to a
1068 /// low/high mask for the first byte of all patterns that are being searched.
1069 ///
1070 /// Note that this can also be used for Fat Teddy, where the high 128 bits in
1071 /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
1072 /// window in the haystack.
1073 #[target_feature(enable = "avx2")]
members1m256(chunk: __m256i, mask1: Mask256) -> __m256i1074 unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i {
1075     let lomask = _mm256_set1_epi8(0xF);
1076     let hlo = _mm256_and_si256(chunk, lomask);
1077     let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
1078     _mm256_and_si256(
1079         _mm256_shuffle_epi8(mask1.lo, hlo),
1080         _mm256_shuffle_epi8(mask1.hi, hhi),
1081     )
1082 }
1083 
1084 /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
1085 /// to a 16-byte window of the haystack (where the least significant byte
1086 /// corresponds to the start of the window), and the masks correspond to a
1087 /// low/high mask for the first and second bytes of all patterns that are being
1088 /// searched. The vectors returned correspond to candidates for the first and
1089 /// second bytes in the patterns represented by the masks.
1090 #[target_feature(enable = "ssse3")]
members2m128( chunk: __m128i, mask1: Mask128, mask2: Mask128, ) -> (__m128i, __m128i)1091 unsafe fn members2m128(
1092     chunk: __m128i,
1093     mask1: Mask128,
1094     mask2: Mask128,
1095 ) -> (__m128i, __m128i) {
1096     let lomask = _mm_set1_epi8(0xF);
1097     let hlo = _mm_and_si128(chunk, lomask);
1098     let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
1099     let res0 = _mm_and_si128(
1100         _mm_shuffle_epi8(mask1.lo, hlo),
1101         _mm_shuffle_epi8(mask1.hi, hhi),
1102     );
1103     let res1 = _mm_and_si128(
1104         _mm_shuffle_epi8(mask2.lo, hlo),
1105         _mm_shuffle_epi8(mask2.hi, hhi),
1106     );
1107     (res0, res1)
1108 }
1109 
1110 /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
1111 /// to a 32-byte window of the haystack (where the least significant byte
1112 /// corresponds to the start of the window), and the masks correspond to a
1113 /// low/high mask for the first and second bytes of all patterns that are being
1114 /// searched. The vectors returned correspond to candidates for the first and
1115 /// second bytes in the patterns represented by the masks.
1116 ///
1117 /// Note that this can also be used for Fat Teddy, where the high 128 bits in
1118 /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
1119 /// window in the haystack.
1120 #[target_feature(enable = "avx2")]
members2m256( chunk: __m256i, mask1: Mask256, mask2: Mask256, ) -> (__m256i, __m256i)1121 unsafe fn members2m256(
1122     chunk: __m256i,
1123     mask1: Mask256,
1124     mask2: Mask256,
1125 ) -> (__m256i, __m256i) {
1126     let lomask = _mm256_set1_epi8(0xF);
1127     let hlo = _mm256_and_si256(chunk, lomask);
1128     let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
1129     let res0 = _mm256_and_si256(
1130         _mm256_shuffle_epi8(mask1.lo, hlo),
1131         _mm256_shuffle_epi8(mask1.hi, hhi),
1132     );
1133     let res1 = _mm256_and_si256(
1134         _mm256_shuffle_epi8(mask2.lo, hlo),
1135         _mm256_shuffle_epi8(mask2.hi, hhi),
1136     );
1137     (res0, res1)
1138 }
1139 
1140 /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds
1141 /// to a 16-byte window of the haystack (where the least significant byte
1142 /// corresponds to the start of the window), and the masks correspond to a
1143 /// low/high mask for the first, second and third bytes of all patterns that
1144 /// are being searched. The vectors returned correspond to candidates for the
1145 /// first, second and third bytes in the patterns represented by the masks.
1146 #[target_feature(enable = "ssse3")]
members3m128( chunk: __m128i, mask1: Mask128, mask2: Mask128, mask3: Mask128, ) -> (__m128i, __m128i, __m128i)1147 unsafe fn members3m128(
1148     chunk: __m128i,
1149     mask1: Mask128,
1150     mask2: Mask128,
1151     mask3: Mask128,
1152 ) -> (__m128i, __m128i, __m128i) {
1153     let lomask = _mm_set1_epi8(0xF);
1154     let hlo = _mm_and_si128(chunk, lomask);
1155     let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask);
1156     let res0 = _mm_and_si128(
1157         _mm_shuffle_epi8(mask1.lo, hlo),
1158         _mm_shuffle_epi8(mask1.hi, hhi),
1159     );
1160     let res1 = _mm_and_si128(
1161         _mm_shuffle_epi8(mask2.lo, hlo),
1162         _mm_shuffle_epi8(mask2.hi, hhi),
1163     );
1164     let res2 = _mm_and_si128(
1165         _mm_shuffle_epi8(mask3.lo, hlo),
1166         _mm_shuffle_epi8(mask3.hi, hhi),
1167     );
1168     (res0, res1, res2)
1169 }
1170 
1171 /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds
1172 /// to a 32-byte window of the haystack (where the least significant byte
1173 /// corresponds to the start of the window), and the masks correspond to a
1174 /// low/high mask for the first, second and third bytes of all patterns that
1175 /// are being searched. The vectors returned correspond to candidates for the
1176 /// first, second and third bytes in the patterns represented by the masks.
1177 ///
1178 /// Note that this can also be used for Fat Teddy, where the high 128 bits in
1179 /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte
1180 /// window in the haystack.
1181 #[target_feature(enable = "avx2")]
members3m256( chunk: __m256i, mask1: Mask256, mask2: Mask256, mask3: Mask256, ) -> (__m256i, __m256i, __m256i)1182 unsafe fn members3m256(
1183     chunk: __m256i,
1184     mask1: Mask256,
1185     mask2: Mask256,
1186     mask3: Mask256,
1187 ) -> (__m256i, __m256i, __m256i) {
1188     let lomask = _mm256_set1_epi8(0xF);
1189     let hlo = _mm256_and_si256(chunk, lomask);
1190     let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask);
1191     let res0 = _mm256_and_si256(
1192         _mm256_shuffle_epi8(mask1.lo, hlo),
1193         _mm256_shuffle_epi8(mask1.hi, hhi),
1194     );
1195     let res1 = _mm256_and_si256(
1196         _mm256_shuffle_epi8(mask2.lo, hlo),
1197         _mm256_shuffle_epi8(mask2.hi, hhi),
1198     );
1199     let res2 = _mm256_and_si256(
1200         _mm256_shuffle_epi8(mask3.lo, hlo),
1201         _mm256_shuffle_epi8(mask3.hi, hhi),
1202     );
1203     (res0, res1, res2)
1204 }
1205