use core::{arch::x86_64::*, cmp, mem::size_of};

use super::sse2;

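// A single AVX2 vector (`__m256i`) is 32 bytes wide. `VECTOR_ALIGN` is the
// corresponding low-bit mask (31), used below to round pointers to a 32 byte
// boundary.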
const VECTOR_SIZE: usize = size_of::<__m256i>();
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

// The number of bytes scanned in one iteration of memchr/memrchr's unrolled
// loop.
const LOOP_SIZE: usize = 4 * VECTOR_SIZE;

// The number of bytes scanned in one iteration of memchr2/memrchr2 and
// memchr3/memrchr3. There was no observable difference between 128 and 64
// bytes in benchmarks; memchr3 in particular gets only a very slight speedup
// from the loop unrolling.
const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;

#[target_feature(enable = "avx2")]
pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // For a high level explanation of how this algorithm works, see the
    // sse2 implementation. The avx implementation here is the same, but with
    // 256-bit vectors instead of 128-bit vectors.
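    //
    // In outline: one unaligned load covers the first 32 bytes, the unrolled
    // loop below then scans 128 bytes per iteration from a 32 byte aligned
    // offset, any remaining full vectors are handled 32 bytes at a time, and
    // the final partial chunk is covered by one last unaligned load that
    // overlaps bytes that have already been searched.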

    // This routine is called whenever a match is detected. It is specifically
    // marked as uninlinable because it improves the codegen of the unrolled
    // loop below. Inlining this seems to cause codegen with some extra adds
    // and a load that aren't necessary. This seems to result in about a 10%
    // improvement for the memchr1/crate/huge/never benchmark.
    //
    // Interestingly, I couldn't observe a similar improvement for memrchr.
    #[cold]
    #[inline(never)]
    #[target_feature(enable = "avx2")]
    unsafe fn matched(
        start_ptr: *const u8,
        ptr: *const u8,
        eqa: __m256i,
        eqb: __m256i,
        eqc: __m256i,
        eqd: __m256i,
    ) -> usize {
        let mut at = sub(ptr, start_ptr);
        let mask = _mm256_movemask_epi8(eqa);
        if mask != 0 {
            return at + forward_pos(mask);
        }

        at += VECTOR_SIZE;
        let mask = _mm256_movemask_epi8(eqb);
        if mask != 0 {
            return at + forward_pos(mask);
        }

        at += VECTOR_SIZE;
        let mask = _mm256_movemask_epi8(eqc);
        if mask != 0 {
            return at + forward_pos(mask);
        }

        at += VECTOR_SIZE;
        let mask = _mm256_movemask_epi8(eqd);
        debug_assert!(mask != 0);
        at + forward_pos(mask)
    }

    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        // For small haystacks, defer to the SSE2 implementation. Codegen
        // suggests this completely avoids touching the AVX vectors.
        return sse2::memchr(n1, haystack);
    }

    let vn1 = _mm256_set1_epi8(n1 as i8);
    let loop_size = cmp::min(LOOP_SIZE, haystack.len());
    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

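    // Round `ptr` up to the next 32 byte boundary. The bytes skipped over
    // here have already been checked by the unaligned `forward_search1`
    // above.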
    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
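    // `loop_size == LOOP_SIZE` only holds when the haystack has at least
    // LOOP_SIZE bytes; for shorter haystacks the unrolled loop is skipped.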
    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i);
        let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i);
        let eqa = _mm256_cmpeq_epi8(vn1, a);
        let eqb = _mm256_cmpeq_epi8(vn1, b);
        let eqc = _mm256_cmpeq_epi8(vn1, c);
        let eqd = _mm256_cmpeq_epi8(vn1, d);
        let or1 = _mm256_or_si256(eqa, eqb);
        let or2 = _mm256_or_si256(eqc, eqd);
        let or3 = _mm256_or_si256(or1, or2);

        if _mm256_movemask_epi8(or3) != 0 {
            return Some(matched(start_ptr, ptr, eqa, eqb, eqc, eqd));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);

        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
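    // Handle the final partial chunk, if any, by moving `ptr` back so that
    // one more 32 byte (unaligned) load reaches exactly `end_ptr`. The
    // overlap with bytes already searched is harmless: an earlier match would
    // have returned above.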
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search1(start_ptr, end_ptr, ptr, vn1);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    #[cold]
    #[inline(never)]
    #[target_feature(enable = "avx2")]
    unsafe fn matched(
        start_ptr: *const u8,
        ptr: *const u8,
        eqa1: __m256i,
        eqa2: __m256i,
        eqb1: __m256i,
        eqb2: __m256i,
    ) -> usize {
        let mut at = sub(ptr, start_ptr);
        let mask1 = _mm256_movemask_epi8(eqa1);
        let mask2 = _mm256_movemask_epi8(eqa2);
        if mask1 != 0 || mask2 != 0 {
            return at + forward_pos2(mask1, mask2);
        }

        at += VECTOR_SIZE;
        let mask1 = _mm256_movemask_epi8(eqb1);
        let mask2 = _mm256_movemask_epi8(eqb2);
        at + forward_pos2(mask1, mask2)
    }

    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = start_ptr;

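    // For haystacks shorter than one vector, fall back to a byte-at-a-time
    // scan (memchr3 and the memrchr variants below do the same).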
    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
            return Some(matched(start_ptr, ptr, eqa1, eqa2, eqb1, eqb2));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memchr3(
    n1: u8,
    n2: u8,
    n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    #[cold]
    #[inline(never)]
    #[target_feature(enable = "avx2")]
    unsafe fn matched(
        start_ptr: *const u8,
        ptr: *const u8,
        eqa1: __m256i,
        eqa2: __m256i,
        eqa3: __m256i,
        eqb1: __m256i,
        eqb2: __m256i,
        eqb3: __m256i,
    ) -> usize {
        let mut at = sub(ptr, start_ptr);
        let mask1 = _mm256_movemask_epi8(eqa1);
        let mask2 = _mm256_movemask_epi8(eqa2);
        let mask3 = _mm256_movemask_epi8(eqa3);
        if mask1 != 0 || mask2 != 0 || mask3 != 0 {
            return at + forward_pos3(mask1, mask2, mask3);
        }

        at += VECTOR_SIZE;
        let mask1 = _mm256_movemask_epi8(eqb1);
        let mask2 = _mm256_movemask_epi8(eqb2);
        let mask3 = _mm256_movemask_epi8(eqb3);
        at + forward_pos3(mask1, mask2, mask3)
    }

    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let vn3 = _mm256_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let eqa3 = _mm256_cmpeq_epi8(vn3, a);
        let eqb3 = _mm256_cmpeq_epi8(vn3, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(eqa3, eqb3);
        let or4 = _mm256_or_si256(or1, or2);
        let or5 = _mm256_or_si256(or3, or4);
        if _mm256_movemask_epi8(or5) != 0 {
            return Some(matched(
                start_ptr, ptr, eqa1, eqa2, eqa3, eqb1, eqb2, eqb3,
            ));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) =
            forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
        {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

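    // Round `end_ptr` down to a 32 byte boundary and continue from there. The
    // bytes between that boundary and `end_ptr` have already been checked by
    // the unaligned `reverse_search1` above.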
    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i);
        let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i);
        let eqa = _mm256_cmpeq_epi8(vn1, a);
        let eqb = _mm256_cmpeq_epi8(vn1, b);
        let eqc = _mm256_cmpeq_epi8(vn1, c);
        let eqd = _mm256_cmpeq_epi8(vn1, d);
        let or1 = _mm256_or_si256(eqa, eqb);
        let or2 = _mm256_or_si256(eqc, eqd);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
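            // Inspect the four vectors from last to first, since the
            // rightmost match wins in a reverse search.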
            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
            let mask = _mm256_movemask_epi8(eqd);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqa);
            debug_assert!(mask != 0);
            return Some(at + reverse_pos(mask));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm256_movemask_epi8(eqb1);
            let mask2 = _mm256_movemask_epi8(eqb2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + reverse_pos2(mask1, mask2));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm256_movemask_epi8(eqa1);
            let mask2 = _mm256_movemask_epi8(eqa2);
            return Some(at + reverse_pos2(mask1, mask2));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memrchr3(
    n1: u8,
    n2: u8,
    n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let vn3 = _mm256_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let eqa3 = _mm256_cmpeq_epi8(vn3, a);
        let eqb3 = _mm256_cmpeq_epi8(vn3, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(eqa3, eqb3);
        let or4 = _mm256_or_si256(or1, or2);
        let or5 = _mm256_or_si256(or3, or4);
        if _mm256_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm256_movemask_epi8(eqb1);
            let mask2 = _mm256_movemask_epi8(eqb2);
            let mask3 = _mm256_movemask_epi8(eqb3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + reverse_pos3(mask1, mask2, mask3));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm256_movemask_epi8(eqa1);
            let mask2 = _mm256_movemask_epi8(eqa2);
            let mask3 = _mm256_movemask_epi8(eqa3);
            return Some(at + reverse_pos3(mask1, mask2, mask3));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) =
            reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
        {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
    }
    None
}

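// Search a single (possibly unaligned) 32 byte chunk starting at `ptr` for
// the needle, returning the offset of the first match relative to
// `start_ptr`. The forward_search2/forward_search3 variants below do the same
// for two and three needles, and the reverse_search* variants report the last
// match instead.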
#[target_feature(enable = "avx2")]
unsafe fn forward_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, vn1));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + forward_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn forward_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn forward_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
    vn3: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm256_cmpeq_epi8(chunk, vn3);
    let or = _mm256_or_si256(eq1, eq2);
    if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        let mask3 = _mm256_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn reverse_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(vn1, chunk));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + reverse_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn reverse_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn reverse_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
    vn3: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm256_cmpeq_epi8(chunk, vn3);
    let or = _mm256_or_si256(eq1, eq2);
    if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        let mask3 = _mm256_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

/// Compute the position of the first matching byte from the given mask. The
/// position returned is always in the range [0, 31].
///
/// The mask given is expected to be the result of _mm256_movemask_epi8.
fn forward_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the least significant bit that is
    // set corresponds to the position of our first matching byte, and that
    // position equals the number of trailing zeros in the mask.
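    // For example, a mask with only bit 3 set has 3 trailing zeros, so the
    // first matching byte is at offset 3.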
    mask.trailing_zeros() as usize
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    forward_pos(mask1 | mask2)
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    forward_pos(mask1 | mask2 | mask3)
}

/// Compute the position of the last matching byte from the given mask. The
/// position returned is always in the range [0, 31].
///
/// The mask given is expected to be the result of _mm256_movemask_epi8.
fn reverse_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the most significant bit that is set
    // corresponds to the position of our last matching byte. The position from
    // the end of the mask is therefore the number of leading zeros in a 32
    // bit integer, and the position from the start of the mask is therefore
    // 32 - (leading zeros) - 1.
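    // For example, if only bit 30 is set then there is 1 leading zero, so the
    // last matching byte is at offset 32 - 1 - 1 = 30.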
    VECTOR_SIZE - (mask as u32).leading_zeros() as usize - 1
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    reverse_pos(mask1 | mask2)
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    reverse_pos(mask1 | mask2 | mask3)
}

/// Subtract `b` from `a` and return the difference. `a` should be greater than
/// or equal to `b`.
fn sub(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}

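// A minimal usage sketch, not part of the original module: it only
// illustrates that callers must verify AVX2 support at runtime before
// invoking these `unsafe`, target-feature-gated routines. The module name and
// haystack below are made up for illustration, and the test assumes `std` is
// linked when tests are built.
#[cfg(test)]
mod example_usage {
    extern crate std;

    #[test]
    fn forward_and_reverse_find_the_same_needle() {
        // Skip entirely on CPUs without AVX2 support.
        if !std::is_x86_feature_detected!("avx2") {
            return;
        }
        let haystack = b"abcxyzabcxyz";
        // Safety: AVX2 support was verified above.
        unsafe {
            assert_eq!(super::memchr(b'x', haystack), Some(3));
            assert_eq!(super::memrchr(b'x', haystack), Some(9));
        }
    }
}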