use core::{arch::x86_64::*, cmp, mem::size_of};

use super::sse2;

const VECTOR_SIZE: usize = size_of::<__m256i>();
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

// The number of bytes to scan in one iteration of memchr/memrchr.
const LOOP_SIZE: usize = 4 * VECTOR_SIZE;

// The number of bytes to scan in one iteration of memchr2/memrchr2 and
// memchr3/memrchr3. There was no observable difference between 128 and 64
// bytes in benchmarks. memchr3 in particular only gets a very slight speed up
// from the loop unrolling.
const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;

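// All of the vectorized routines below require AVX2, which is why they are
// `unsafe` to call: callers must first verify at runtime that AVX2 is
// available, e.g. via `is_x86_feature_detected!("avx2")` (presumably the job
// of the dispatching code in the parent module).
//
// A minimal usage sketch (the dispatch shown here is an assumption, not part
// of this module):
//
//     if is_x86_feature_detected!("avx2") {
//         // SAFETY: AVX2 was just confirmed to be available.
//         let found = unsafe { memchr(b'\n', haystack) };
//     }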
#[target_feature(enable = "avx2")]
pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // For a high level explanation of how this algorithm works, see the
    // sse2 implementation. The avx implementation here is the same, but with
    // 256-bit vectors instead of 128-bit vectors.
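    //
    // In brief: probe the first VECTOR_SIZE bytes with one unaligned load,
    // round `ptr` up to a 32-byte boundary, scan LOOP_SIZE bytes at a time
    // with aligned loads, then one vector at a time, and finish with a final
    // unaligned load that overlaps the tail.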

    // This routine is called whenever a match is detected. It is specifically
    // marked as uninlineable because it improves the codegen of the unrolled
    // loop below. Inlining this seems to cause codegen with some extra adds
    // and a load that aren't necessary. This seems to result in about a 10%
    // improvement for the memchr1/crate/huge/never benchmark.
    //
    // Interestingly, I couldn't observe a similar improvement for memrchr.
    #[cold]
    #[inline(never)]
    #[target_feature(enable = "avx2")]
    unsafe fn matched(
        start_ptr: *const u8,
        ptr: *const u8,
        eqa: __m256i,
        eqb: __m256i,
        eqc: __m256i,
        eqd: __m256i,
    ) -> usize {
        let mut at = sub(ptr, start_ptr);
        let mask = _mm256_movemask_epi8(eqa);
        if mask != 0 {
            return at + forward_pos(mask);
        }

        at += VECTOR_SIZE;
        let mask = _mm256_movemask_epi8(eqb);
        if mask != 0 {
            return at + forward_pos(mask);
        }

        at += VECTOR_SIZE;
        let mask = _mm256_movemask_epi8(eqc);
        if mask != 0 {
            return at + forward_pos(mask);
        }

        at += VECTOR_SIZE;
        let mask = _mm256_movemask_epi8(eqd);
        debug_assert!(mask != 0);
        at + forward_pos(mask)
    }

    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        // For small haystacks, defer to the SSE2 implementation. Codegen
        // suggests this completely avoids touching the AVX vectors.
        return sse2::memchr(n1, haystack);
    }

    let vn1 = _mm256_set1_epi8(n1 as i8);
    let loop_size = cmp::min(LOOP_SIZE, haystack.len());
    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

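    // Round `ptr` up to the next 32-byte boundary. The unaligned search
    // above already covered the bytes skipped over here, so nothing is
    // missed and every load in the loops below can be aligned.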
    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i);
        let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i);
        let eqa = _mm256_cmpeq_epi8(vn1, a);
        let eqb = _mm256_cmpeq_epi8(vn1, b);
        let eqc = _mm256_cmpeq_epi8(vn1, c);
        let eqd = _mm256_cmpeq_epi8(vn1, d);
        let or1 = _mm256_or_si256(eqa, eqb);
        let or2 = _mm256_or_si256(eqc, eqd);
        let or3 = _mm256_or_si256(or1, or2);

        if _mm256_movemask_epi8(or3) != 0 {
            return Some(matched(start_ptr, ptr, eqa, eqb, eqc, eqd));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);

        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
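    // Fewer than VECTOR_SIZE bytes remain. Back `ptr` up so that one more
    // vector load ends exactly at `end_ptr`. The load overlaps bytes that
    // were already searched, which is harmless when looking for the first
    // match.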
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search1(start_ptr, end_ptr, ptr, vn1);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
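    // See the comment on `matched` in `memchr` above for why this helper is
    // cold and never inlined.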
    #[cold]
    #[inline(never)]
    #[target_feature(enable = "avx2")]
    unsafe fn matched(
        start_ptr: *const u8,
        ptr: *const u8,
        eqa1: __m256i,
        eqa2: __m256i,
        eqb1: __m256i,
        eqb2: __m256i,
    ) -> usize {
        let mut at = sub(ptr, start_ptr);
        let mask1 = _mm256_movemask_epi8(eqa1);
        let mask2 = _mm256_movemask_epi8(eqa2);
        if mask1 != 0 || mask2 != 0 {
            return at + forward_pos2(mask1, mask2);
        }

        at += VECTOR_SIZE;
        let mask1 = _mm256_movemask_epi8(eqb1);
        let mask2 = _mm256_movemask_epi8(eqb2);
        at + forward_pos2(mask1, mask2)
    }

    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

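    // Align `ptr` up to a 32-byte boundary, exactly as in `memchr` above.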
    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
            return Some(matched(start_ptr, ptr, eqa1, eqa2, eqb1, eqb2));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memchr3(
    n1: u8,
    n2: u8,
    n3: u8,
    haystack: &[u8],
) -> Option<usize> {
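    // As in `memchr2`, see `memchr`'s `matched` for why this helper is cold
    // and never inlined.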
    #[cold]
    #[inline(never)]
    #[target_feature(enable = "avx2")]
    unsafe fn matched(
        start_ptr: *const u8,
        ptr: *const u8,
        eqa1: __m256i,
        eqa2: __m256i,
        eqa3: __m256i,
        eqb1: __m256i,
        eqb2: __m256i,
        eqb3: __m256i,
    ) -> usize {
        let mut at = sub(ptr, start_ptr);
        let mask1 = _mm256_movemask_epi8(eqa1);
        let mask2 = _mm256_movemask_epi8(eqa2);
        let mask3 = _mm256_movemask_epi8(eqa3);
        if mask1 != 0 || mask2 != 0 || mask3 != 0 {
            return at + forward_pos3(mask1, mask2, mask3);
        }

        at += VECTOR_SIZE;
        let mask1 = _mm256_movemask_epi8(eqb1);
        let mask2 = _mm256_movemask_epi8(eqb2);
        let mask3 = _mm256_movemask_epi8(eqb3);
        at + forward_pos3(mask1, mask2, mask3)
    }

    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let vn3 = _mm256_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

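    // The same 32-byte alignment step as in `memchr` and `memchr2`.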
    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let eqa3 = _mm256_cmpeq_epi8(vn3, a);
        let eqb3 = _mm256_cmpeq_epi8(vn3, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(eqa3, eqb3);
        let or4 = _mm256_or_si256(or1, or2);
        let or5 = _mm256_or_si256(or3, or4);
        if _mm256_movemask_epi8(or5) != 0 {
            return Some(matched(
                start_ptr, ptr, eqa1, eqa2, eqa3, eqb1, eqb2, eqb3,
            ));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) =
            forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
        {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

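    // Search the trailing VECTOR_SIZE bytes first with one unaligned load.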
    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

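    // Round `ptr` down to a 32-byte boundary. Any bytes between that
    // boundary and `end_ptr` were covered by the unaligned search above, so
    // the loops below can scan backwards with aligned loads.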
    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i);
        let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i);
        let eqa = _mm256_cmpeq_epi8(vn1, a);
        let eqb = _mm256_cmpeq_epi8(vn1, b);
        let eqc = _mm256_cmpeq_epi8(vn1, c);
        let eqd = _mm256_cmpeq_epi8(vn1, d);
        let or1 = _mm256_or_si256(eqa, eqb);
        let or2 = _mm256_or_si256(eqc, eqd);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
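            // Check the four vectors back to front so that the match
            // reported is the last one in the haystack.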
            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
            let mask = _mm256_movemask_epi8(eqd);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqa);
            debug_assert!(mask != 0);
            return Some(at + reverse_pos(mask));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm256_movemask_epi8(eqb1);
            let mask2 = _mm256_movemask_epi8(eqb2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + reverse_pos2(mask1, mask2));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm256_movemask_epi8(eqa1);
            let mask2 = _mm256_movemask_epi8(eqa2);
            return Some(at + reverse_pos2(mask1, mask2));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memrchr3(
    n1: u8,
    n2: u8,
    n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let vn3 = _mm256_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let eqa3 = _mm256_cmpeq_epi8(vn3, a);
        let eqb3 = _mm256_cmpeq_epi8(vn3, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(eqa3, eqb3);
        let or4 = _mm256_or_si256(or1, or2);
        let or5 = _mm256_or_si256(or3, or4);
        if _mm256_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm256_movemask_epi8(eqb1);
            let mask2 = _mm256_movemask_epi8(eqb2);
            let mask3 = _mm256_movemask_epi8(eqb3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + reverse_pos3(mask1, mask2, mask3));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm256_movemask_epi8(eqa1);
            let mask2 = _mm256_movemask_epi8(eqa2);
            let mask3 = _mm256_movemask_epi8(eqa3);
            return Some(at + reverse_pos3(mask1, mask2, mask3));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) =
            reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
        {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "avx2")]
unsafe fn forward_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, vn1));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + forward_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn forward_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn forward_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
    vn3: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm256_cmpeq_epi8(chunk, vn3);
    let or = _mm256_or_si256(eq1, eq2);
    if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        let mask3 = _mm256_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn reverse_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(vn1, chunk));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + reverse_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn reverse_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn reverse_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
    vn3: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm256_cmpeq_epi8(chunk, vn3);
    let or = _mm256_or_si256(eq1, eq2);
    if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        let mask3 = _mm256_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

/// Compute the position of the first matching byte from the given mask. The
/// position returned is always in the range [0, 31].
///
/// The mask given is expected to be the result of _mm256_movemask_epi8.
fn forward_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the least significant bit that is
    // set corresponds to the position of our first matching byte, and that
    // position equals the number of trailing zeros in the mask.
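    //
    // For example, a mask of 0b100 (a match at byte offset 2) has two
    // trailing zeros, so the position returned is 2.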
    mask.trailing_zeros() as usize
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    forward_pos(mask1 | mask2)
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    forward_pos(mask1 | mask2 | mask3)
}

/// Compute the position of the last matching byte from the given mask. The
/// position returned is always in the range [0, 31].
///
/// The mask given is expected to be the result of _mm256_movemask_epi8.
fn reverse_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the most significant bit that is set
    // corresponds to the position of our last matching byte. The position
    // from the end of the mask is therefore the number of leading zeros in a
    // 32 bit integer, and the position from the start of the mask is
    // therefore 32 - (leading zeros) - 1.
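    //
    // For example, if only bit 30 of the mask is set, there is one leading
    // zero, so the position returned is 32 - 1 - 1 = 30.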
    VECTOR_SIZE - (mask as u32).leading_zeros() as usize - 1
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    reverse_pos(mask1 | mask2)
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    reverse_pos(mask1 | mask2 | mask3)
}

/// Subtract `b` from `a` and return the difference. `a` must be greater than
/// or equal to `b`.
fn sub(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}