use core::arch::x86_64::*;
use core::cmp;
use core::mem::size_of;

use x86::sse2;

const VECTOR_SIZE: usize = size_of::<__m256i>();
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

// The number of bytes to loop at in one iteration of memchr/memrchr.
const LOOP_SIZE: usize = 4 * VECTOR_SIZE;

// The number of bytes to loop at in one iteration of memchr2/memrchr2 and
// memchr3/memrchr3. There was no observable difference between 128 and 64
// bytes in benchmarks. memchr3 in particular only gets a very slight speed up
// from the loop unrolling.
const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;

#[target_feature(enable = "avx2")]
pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // For a high level explanation of how this algorithm works, see the
    // sse2 implementation. The avx implementation here is the same, but with
    // 256-bit vectors instead of 128-bit vectors.
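    //
    // In brief: search the first VECTOR_SIZE bytes with one unaligned load,
    // then run an unrolled loop over aligned vectors, and finish with a
    // final unaligned load that overlaps the tail.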

    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        // For small haystacks, defer to the SSE2 implementation. Codegen
        // suggests this completely avoids touching the AVX vectors.
        return sse2::memchr(n1, haystack);
    }

    let vn1 = _mm256_set1_epi8(n1 as i8);
    let loop_size = cmp::min(LOOP_SIZE, haystack.len());
    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

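    // Bump `ptr` up to the next VECTOR_SIZE-aligned address. The unaligned
    // search above already covered the first VECTOR_SIZE bytes, so nothing
    // is skipped by jumping ahead.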
    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i);
        let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i);
        let eqa = _mm256_cmpeq_epi8(vn1, a);
        let eqb = _mm256_cmpeq_epi8(vn1, b);
        let eqc = _mm256_cmpeq_epi8(vn1, c);
        let eqd = _mm256_cmpeq_epi8(vn1, d);
        let or1 = _mm256_or_si256(eqa, eqb);
        let or2 = _mm256_or_si256(eqc, eqd);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask = _mm256_movemask_epi8(eqa);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqd);
            debug_assert!(mask != 0);
            return Some(at + forward_pos(mask));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);

        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
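        // Back `ptr` up so that exactly VECTOR_SIZE bytes remain, and do one
        // final unaligned load. The overlap with already-searched bytes is
        // harmless: any match there would have been returned earlier.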
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search1(start_ptr, end_ptr, ptr, vn1);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm256_movemask_epi8(eqa1);
            let mask2 = _mm256_movemask_epi8(eqa2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + forward_pos2(mask1, mask2));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm256_movemask_epi8(eqb1);
            let mask2 = _mm256_movemask_epi8(eqb2);
            return Some(at + forward_pos2(mask1, mask2));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memchr3(
    n1: u8, n2: u8, n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let vn3 = _mm256_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let eqa3 = _mm256_cmpeq_epi8(vn3, a);
        let eqb3 = _mm256_cmpeq_epi8(vn3, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(eqa3, eqb3);
        let or4 = _mm256_or_si256(or1, or2);
        let or5 = _mm256_or_si256(or3, or4);
        if _mm256_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm256_movemask_epi8(eqa1);
            let mask2 = _mm256_movemask_epi8(eqa2);
            let mask3 = _mm256_movemask_epi8(eqa3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + forward_pos3(mask1, mask2, mask3));
            }

            // Only two vectors are loaded per iteration, so if the match was
            // not in `a`, it must be in `b`.
            at += VECTOR_SIZE;
            let mask1 = _mm256_movemask_epi8(eqb1);
            let mask2 = _mm256_movemask_epi8(eqb2);
            let mask3 = _mm256_movemask_epi8(eqb3);
            return Some(at + forward_pos3(mask1, mask2, mask3));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

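    // Search the last VECTOR_SIZE bytes with one unaligned load before
    // dropping into the aligned reverse loop.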
    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

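    // Round `ptr` down to the nearest VECTOR_SIZE-aligned address. Every
    // byte from there to `end_ptr` was already covered by the unaligned
    // search above.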
    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i);
        let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i);
        let eqa = _mm256_cmpeq_epi8(vn1, a);
        let eqb = _mm256_cmpeq_epi8(vn1, b);
        let eqc = _mm256_cmpeq_epi8(vn1, c);
        let eqd = _mm256_cmpeq_epi8(vn1, d);
        let or1 = _mm256_or_si256(eqa, eqb);
        let or2 = _mm256_or_si256(eqc, eqd);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
            let mask = _mm256_movemask_epi8(eqd);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm256_movemask_epi8(eqa);
            debug_assert!(mask != 0);
            return Some(at + reverse_pos(mask));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(or1, or2);
        if _mm256_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm256_movemask_epi8(eqb1);
            let mask2 = _mm256_movemask_epi8(eqb2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + reverse_pos2(mask1, mask2));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm256_movemask_epi8(eqa1);
            let mask2 = _mm256_movemask_epi8(eqa2);
            return Some(at + reverse_pos2(mask1, mask2));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "avx2")]
pub unsafe fn memrchr3(
    n1: u8, n2: u8, n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm256_set1_epi8(n1 as i8);
    let vn2 = _mm256_set1_epi8(n2 as i8);
    let vn3 = _mm256_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm256_load_si256(ptr as *const __m256i);
        let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i);
        let eqa1 = _mm256_cmpeq_epi8(vn1, a);
        let eqb1 = _mm256_cmpeq_epi8(vn1, b);
        let eqa2 = _mm256_cmpeq_epi8(vn2, a);
        let eqb2 = _mm256_cmpeq_epi8(vn2, b);
        let eqa3 = _mm256_cmpeq_epi8(vn3, a);
        let eqb3 = _mm256_cmpeq_epi8(vn3, b);
        let or1 = _mm256_or_si256(eqa1, eqb1);
        let or2 = _mm256_or_si256(eqa2, eqb2);
        let or3 = _mm256_or_si256(eqa3, eqb3);
        let or4 = _mm256_or_si256(or1, or2);
        let or5 = _mm256_or_si256(or3, or4);
        if _mm256_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm256_movemask_epi8(eqb1);
            let mask2 = _mm256_movemask_epi8(eqb2);
            let mask3 = _mm256_movemask_epi8(eqb3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + reverse_pos3(mask1, mask2, mask3));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm256_movemask_epi8(eqa1);
            let mask2 = _mm256_movemask_epi8(eqa2);
            let mask3 = _mm256_movemask_epi8(eqa3);
            return Some(at + reverse_pos3(mask1, mask2, mask3));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "avx2")]
unsafe fn forward_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, vn1));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + forward_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn forward_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn forward_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
    vn3: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm256_cmpeq_epi8(chunk, vn3);
    let or = _mm256_or_si256(eq1, eq2);
    if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        let mask3 = _mm256_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn reverse_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(vn1, chunk));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + reverse_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn reverse_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "avx2")]
unsafe fn reverse_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m256i,
    vn2: __m256i,
    vn3: __m256i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm256_loadu_si256(ptr as *const __m256i);
    let eq1 = _mm256_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm256_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm256_cmpeq_epi8(chunk, vn3);
    let or = _mm256_or_si256(eq1, eq2);
    if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 {
        let mask1 = _mm256_movemask_epi8(eq1);
        let mask2 = _mm256_movemask_epi8(eq2);
        let mask3 = _mm256_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

/// Compute the position of the first matching byte from the given mask. The
/// position returned is always in the range [0, 31].
///
/// The mask given is expected to be the result of _mm256_movemask_epi8.
fn forward_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the least significant bit that is
    // set corresponds to the position of our first matching byte, and that
    // position equals the number of trailing zeros in the mask.
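    //
    // For example, a mask of 0b0100 (a match at the third byte only) has two
    // trailing zeros, so the first match is at position 2.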
    mask.trailing_zeros() as usize
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

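    // A zero mask is harmless here: forward_pos returns 32 for it (the
    // trailing zero count of 0), which exceeds every valid position in
    // [0, 31], so the minimum below always picks the real match.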
    let i1 = forward_pos(mask1);
    let i2 = forward_pos(mask2);
    if i1 < i2 { i1 } else { i2 }
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    let i1 = forward_pos(mask1);
    let i2 = forward_pos(mask2);
    let i3 = forward_pos(mask3);
    if i1 < i2 && i1 < i3 {
        i1
    } else if i2 < i3 {
        i2
    } else {
        i3
    }
}

/// Compute the position of the last matching byte from the given mask. The
/// position returned is always in the range [0, 31].
///
/// The mask given is expected to be the result of _mm256_movemask_epi8.
fn reverse_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the most significant bit that is set
    // corresponds to the position of our last matching byte. The position from
    // the end of the mask is therefore the number of leading zeros in a 32
    // bit integer, and the position from the start of the mask is therefore
    // 32 - (leading zeros) - 1.
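    //
    // For example, a mask of 0b0110 as a u32 has 29 leading zeros, so the
    // last match is at position 32 - 29 - 1 = 2.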
    VECTOR_SIZE - (mask as u32).leading_zeros() as usize - 1
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

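    // Unlike the forward case, a zero mask cannot flow through reverse_pos:
    // its 32 leading zeros would underflow `VECTOR_SIZE - lz - 1`, so zero
    // masks are filtered out explicitly.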
    if mask1 == 0 {
        reverse_pos(mask2)
    } else if mask2 == 0 {
        reverse_pos(mask1)
    } else {
        let i1 = reverse_pos(mask1);
        let i2 = reverse_pos(mask2);
        if i1 > i2 { i1 } else { i2 }
    }
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 31]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm256_movemask_epi8,
/// where at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    if mask1 == 0 {
        reverse_pos2(mask2, mask3)
    } else if mask2 == 0 {
        reverse_pos2(mask1, mask3)
    } else if mask3 == 0 {
        reverse_pos2(mask1, mask2)
    } else {
        let i1 = reverse_pos(mask1);
        let i2 = reverse_pos(mask2);
        let i3 = reverse_pos(mask3);
        if i1 > i2 && i1 > i3 {
            i1
        } else if i2 > i3 {
            i2
        } else {
            i3
        }
    }
}

/// Subtract `b` from `a` and return the difference. `a` should be greater
/// than or equal to `b`.
fn sub(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}
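
// What follows is a minimal sanity-test sketch, not part of the original
// module: the module name `avx_tests` is hypothetical, and the sketch
// assumes `std` is linked when tests run (for `vec!` and runtime feature
// detection). Callers of the functions above must verify AVX2 support at
// runtime, as these tests do, before calling them.
#[cfg(test)]
mod avx_tests {
    use super::*;

    #[test]
    fn matches_naive_search() {
        // Skip on hosts without AVX2; calling these functions there would
        // be undefined behavior.
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        // 100 bytes exercises the unaligned head, the aligned loop, and the
        // overlapping tail read.
        let mut haystack = vec![b'a'; 100];
        haystack[5] = b'z';
        haystack[80] = b'z';
        unsafe {
            assert_eq!(Some(5), memchr(b'z', &haystack));
            assert_eq!(Some(80), memrchr(b'z', &haystack));
            assert_eq!(Some(5), memchr2(b'z', b'q', &haystack));
            assert_eq!(Some(80), memrchr3(b'z', b'q', b'r', &haystack));
            assert_eq!(None, memchr(b'q', &haystack));
        }
    }
}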