use core::arch::x86_64::*;
use core::cmp;
use core::mem::size_of;

use x86::sse2;

const VECTOR_SIZE: usize = size_of::<__m256i>();
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

// The number of bytes processed per iteration of the unrolled loop in
// memchr/memrchr.
const LOOP_SIZE: usize = 4 * VECTOR_SIZE;

// The number of bytes processed per iteration of the unrolled loop in
// memchr2/memrchr2 and memchr3/memrchr3. There was no observable difference
// between 128 and 64 bytes in benchmarks. memchr3 in particular only gets a
// very slight speed up from the loop unrolling.
const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;

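/// Find the index of the first occurrence of `n1` in `haystack` using 256-bit
/// AVX2 vectors, or `None` if `n1` does not occur.
///
/// # Safety
///
/// This may only be called when the running CPU supports AVX2. A minimal
/// caller-side sketch, assuming runtime detection with std's
/// `is_x86_feature_detected!` macro (the surrounding crate may dispatch
/// differently):
///
/// ```ignore
/// fn find(needle: u8, haystack: &[u8]) -> Option<usize> {
///     if is_x86_feature_detected!("avx2") {
///         // Safe: we just verified that AVX2 is available.
///         unsafe { memchr(needle, haystack) }
///     } else {
///         haystack.iter().position(|&b| b == needle)
///     }
/// }
/// ```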
#[target_feature(enable = "avx2")]
pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // For a high level explanation of how this algorithm works, see the
    // sse2 implementation. The avx implementation here is the same, but with
    // 256-bit vectors instead of 128-bit vectors.

    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        // For small haystacks, defer to the SSE2 implementation. Codegen
        // suggests this completely avoids touching the AVX vectors.
        return sse2::memchr(n1, haystack);
    }

    let vn1 = _mm256_set1_epi8(n1 as i8);
    let loop_size = cmp::min(LOOP_SIZE, haystack.len());
    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

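    // Round `ptr` up to the next VECTOR_SIZE-aligned address. The unaligned
    // search above already covered the first VECTOR_SIZE bytes, so skipping
    // up to VECTOR_SIZE bytes here cannot miss a match.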
    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i);
        let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i);
        let eqa = _mm256_cmpeq_epi8(vn1, a);
        let eqb = _mm256_cmpeq_epi8(vn1, b);
        let eqc = _mm256_cmpeq_epi8(vn1, c);
        let eqd = _mm256_cmpeq_epi8(vn1, d);
        let or1 = _mm256_or_si256(eqa, eqb);
        let or2 = _mm256_or_si256(eqc, eqd);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask = _mm256_movemask_epi8(eqa);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqd);
            debug_assert!(mask != 0);
            return Some(at + forward_pos(mask));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);

        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
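    // Fewer than VECTOR_SIZE bytes remain. Back `ptr` up so that one final
    // unaligned load covers exactly the last VECTOR_SIZE bytes of the
    // haystack; the overlapping bytes were already verified to contain no
    // match.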
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search1(start_ptr, end_ptr, ptr, vn1);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

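    // As in memchr above, round `ptr` up to the next aligned address; the
    // unaligned search above already covered the first VECTOR_SIZE bytes.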
    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm256_movemask_epi8(eqa1);
            let mask2 = _mm256_movemask_epi8(eqa2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + forward_pos2(mask1, mask2));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm256_movemask_epi8(eqb1);
            let mask2 = _mm256_movemask_epi8(eqb2);
            return Some(at + forward_pos2(mask1, mask2));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memchr3(
    n1: u8, n2: u8, n3: u8,
    haystack: &[u8]
) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let vn3 = _mm256_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

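    // Same alignment bump as in memchr: the first VECTOR_SIZE bytes were
    // already searched above, so skipping up to VECTOR_SIZE bytes is safe.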
    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let eqa3 = _mm256_cmpeq_epi8(vn3, a);
        let eqb3 = _mm256_cmpeq_epi8(vn3, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(eqa3, eqb3);
        let or4 = _mm256_or_si256(or1, or2);
        let or5 = _mm256_or_si256(or3, or4);
        if _mm256_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm256_movemask_epi8(eqa1);
            let mask2 = _mm256_movemask_epi8(eqa2);
            let mask3 = _mm256_movemask_epi8(eqa3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + forward_pos3(mask1, mask2, mask3));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm256_movemask_epi8(eqb1);
            let mask2 = _mm256_movemask_epi8(eqb2);
            let mask3 = _mm256_movemask_epi8(eqb3);
            debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
            return Some(at + forward_pos3(mask1, mask2, mask3));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

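    // Round `ptr` down to the previous VECTOR_SIZE-aligned address. The
    // unaligned search above already covered the last VECTOR_SIZE bytes, so
    // nothing between the new `ptr` and `end_ptr` can be missed.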
    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i);
        let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i);
        let eqa = _mm256_cmpeq_epi8(vn1, a);
        let eqb = _mm256_cmpeq_epi8(vn1, b);
        let eqc = _mm256_cmpeq_epi8(vn1, c);
        let eqd = _mm256_cmpeq_epi8(vn1, d);
        let or1 = _mm256_or_si256(eqa, eqb);
        let or2 = _mm256_or_si256(eqc, eqd);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
            let mask = _mm256_movemask_epi8(eqd);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqa);
            debug_assert!(mask != 0);
            return Some(at + reverse_pos(mask));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
    }
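    // Fewer than VECTOR_SIZE bytes remain before `ptr`. One final unaligned
    // load anchored at `start_ptr` covers them; the overlapping bytes at and
    // beyond `ptr` were already verified to contain no match, so any hit must
    // lie in the unsearched prefix.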
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

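    // Round `ptr` down to an aligned address, as in memrchr above; the last
    // VECTOR_SIZE bytes were already searched.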
    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm256_movemask_epi8(eqb1);
            let mask2 = _mm256_movemask_epi8(eqb2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + reverse_pos2(mask1, mask2));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm256_movemask_epi8(eqa1);
            let mask2 = _mm256_movemask_epi8(eqa2);
            return Some(at + reverse_pos2(mask1, mask2));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memrchr3(
    n1: u8, n2: u8, n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let vn3 = _mm256_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

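    // Round `ptr` down to an aligned address, as in memrchr above; the last
    // VECTOR_SIZE bytes were already searched.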
    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let eqa3 = _mm256_cmpeq_epi8(vn3, a);
        let eqb3 = _mm256_cmpeq_epi8(vn3, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(eqa3, eqb3);
        let or4 = _mm256_or_si256(or1, or2);
        let or5 = _mm256_or_si256(or3, or4);
        if _mm256_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm256_movemask_epi8(eqb1);
            let mask2 = _mm256_movemask_epi8(eqb2);
            let mask3 = _mm256_movemask_epi8(eqb3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + reverse_pos3(mask1, mask2, mask3));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm256_movemask_epi8(eqa1);
            let mask2 = _mm256_movemask_epi8(eqa2);
            let mask3 = _mm256_movemask_epi8(eqa3);
            return Some(at + reverse_pos3(mask1, mask2, mask3));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "avx2")]
unsafe fn forward_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, vn1));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + forward_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn forward_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn forward_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
    vn3: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm256_cmpeq_epi8(chunk, vn3);
    let or = _mm256_or_si256(eq1, eq2);
    if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        let mask3 = _mm256_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn reverse_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(vn1, chunk));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + reverse_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn reverse_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn reverse_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
    vn3: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm256_cmpeq_epi8(chunk, vn3);
    let or = _mm256_or_si256(eq1, eq2);
    if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        let mask3 = _mm256_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

/// Compute the position of the first matching byte from the given mask. The
/// position returned is always in the range [0, 31].
///
/// The mask given is expected to be the result of _mm256_movemask_epi8.
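///
/// For example, a mask of 0b1000 means the byte at offset 3 matched, and
/// `forward_pos(0b1000)` returns 3.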
fn forward_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the least significant bit that is
    // set corresponds to the position of our first matching byte, and that
    // position equals the number of zeros below it, i.e., the number of
    // trailing zeros.
    mask.trailing_zeros() as usize
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    forward_pos(mask1 | mask2)
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    forward_pos(mask1 | mask2 | mask3)
}

/// Compute the position of the last matching byte from the given mask. The
/// position returned is always in the range [0, 31].
///
/// The mask given is expected to be the result of _mm256_movemask_epi8.
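///
/// For example, a mask of 0b0110 means the bytes at offsets 1 and 2 matched,
/// and `reverse_pos(0b0110)` returns 2, the offset of the last match.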
fn reverse_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the most significant bit that is set
    // corresponds to the position of our last matching byte. The position from
    // the end of the mask is therefore the number of leading zeros in a 32
    // bit integer, and the position from the start of the mask is therefore
    // 32 - (leading zeros) - 1.
    VECTOR_SIZE - (mask as u32).leading_zeros() as usize - 1
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    reverse_pos(mask1 | mask2)
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    reverse_pos(mask1 | mask2 | mask3)
}

/// Subtract `b` from `a` and return the difference. `a` should be greater than
/// or equal to `b`.
fn sub(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}