use core::arch::x86_64::*;
use core::cmp;
use core::mem::size_of;

// The size in bytes of an SSE2 128-bit vector, and the mask used to compute
// a pointer's (mis)alignment with respect to that size.
const VECTOR_SIZE: usize = size_of::<__m128i>();
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

// The number of bytes processed in one iteration of the unrolled loop in
// memchr/memrchr.
const LOOP_SIZE: usize = 4 * VECTOR_SIZE;

// The number of bytes processed in one iteration of the unrolled loop in
// memchr2/memrchr2 and memchr3/memrchr3. There was no observable difference
// between 64 and 32 bytes in benchmarks. memchr3 in particular gets only a
// very slight speed up from the loop unrolling.
const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;

#[target_feature(enable = "sse2")]
pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // What follows is a fast SSE2-only algorithm to detect the position of
    // `n1` in `haystack` if it exists. From what I know, this is the "classic"
    // algorithm. I believe it can be found in places like glibc and Go's
    // standard library. It appears to be well known and is elaborated on in
    // more detail here: https://gms.tf/stdfind-and-memchr-optimizations.html
    //
    // While this routine is very long, the basic idea is actually very simple
    // and can be expressed straightforwardly in pseudo code:
    //
    //     needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1
    //     // Note: the shift amounts above are in bytes, not bits.
    //
    //     while i <= haystack.len() - 16:
    //       // A 16 byte vector. Each byte in chunk corresponds to a byte in
    //       // the haystack.
    //       chunk = haystack[i:i+16]
    //       // Compare bytes in needle with bytes in chunk. The result is a 16
    //       // byte chunk where each byte is 0xFF if the corresponding bytes
    //       // in needle and chunk were equal, or 0x00 otherwise.
    //       eqs = cmpeq(needle, chunk)
    //       // Return a 32 bit integer where the most significant 16 bits
    //       // are always 0 and the lower 16 bits correspond to whether the
    //       // most significant bit in the corresponding byte in `eqs` is set.
    //       // In other words, `mask as u16` has bit i set if and only if
    //       // needle[i] == chunk[i].
    //       mask = movemask(eqs)
    //
    //       // Mask is 0 if there is no match, and non-zero otherwise.
    //       if mask != 0:
    //         // trailing_zeros tells us the position of the least significant
    //         // bit that is set.
    //         return i + trailing_zeros(mask)
    //       i += 16
    //
    //     // haystack length may not be a multiple of 16, so search the rest.
    //     while i < haystack.len():
    //       if haystack[i] == n1:
    //         return i
    //       i += 1
    //
    //     // No match found.
    //     return NULL
    //
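    // As a concrete illustration: if n1 = b'z' and a 16 byte chunk holds
    // b"abczabcdabcdabcd", then `eqs` has 0xFF only at byte offset 3, `mask`
    // is 0b0000_0000_0000_1000 and trailing_zeros(mask) recovers the offset
    // of 3.
    //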
    // In fact, we could loosely translate the above code to Rust line-for-line
    // and it would be a pretty fast algorithm. But, we pull out all the stops
    // to go as fast as possible:
    //
    // 1. We use aligned loads. That is, we do some finagling to make sure our
    //    primary loop not only proceeds in increments of 16 bytes, but also
    //    that the haystack pointer we dereference is aligned to 16 bytes.
    //    16 is a magic number here because it is the size of an SSE2 128-bit
    //    vector. (For the AVX2 algorithm, 32 is the magic number.) Therefore,
    //    to get aligned loads, our pointer's address must be evenly divisible
    //    by 16.
    // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
    //    kind of like loop unrolling, but we combine the equality comparisons
    //    using a vector OR such that we only need to extract a single mask to
    //    determine whether a match exists or not. If so, then we do some
    //    book-keeping to determine the precise location but otherwise mush on.
    // 3. We use our "chunk" comparison routine in as many places as possible,
    //    even if it means using unaligned loads. In particular, if haystack
    //    starts with an unaligned address, then we do an unaligned load to
    //    search the first 16 bytes. We then start our primary loop at the
    //    smallest subsequent aligned address, which will actually overlap with
    //    previously searched bytes. But we're OK with that. We do a similar
    //    dance at the end of our primary loop. Finally, to avoid a
    //    byte-at-a-time loop at the end, we do a final 16 byte unaligned load
    //    that may overlap with a previous load. This is OK because it converts
    //    a loop into a small number of very fast vector instructions.
    //
    // The primary downside of this algorithm is that it's effectively
    // completely unsafe. Therefore, we have to be super careful to avoid
    // undefined behavior:
    //
    // 1. We use raw pointers everywhere. Not only does dereferencing a pointer
    //    require the pointer to be valid, but we actually can't even store the
    //    address of an invalid pointer (unless it's 1 past the end of the
    //    haystack) without sacrificing performance.
    // 2. _mm_loadu_si128 is used when you don't care about alignment, and
    //    _mm_load_si128 is used when you do care. You cannot use the latter
    //    on unaligned pointers.
    // 3. We make liberal use of debug_assert! to check assumptions.
    // 4. We make a concerted effort to stick with pointers instead of indices.
    //    Indices are nicer because there's less to worry about with them (see
    //    above about pointer offsets), but I could not get the compiler to
    //    produce code as good as what the code below produces. In any case,
    //    pointers are what we really care about here, and alignment is
    //    expressed a bit more naturally with them.
    //
    // In general, most of the algorithms in this crate have a similar
    // structure to what you see below, so this comment applies fairly well to
    // all of them.

    let vn1 = _mm_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

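    // Advance `ptr` to the next 16 byte aligned address (moving forward by 1
    // to 16 bytes). For example, if `start_ptr as usize` ends in 0x9, the
    // masked value is 9 and we advance by 16 - 9 = 7 bytes. Any bytes skipped
    // here were already covered by the unaligned `forward_search1` call
    // above.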
    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
        let eqa = _mm_cmpeq_epi8(vn1, a);
        let eqb = _mm_cmpeq_epi8(vn1, b);
        let eqc = _mm_cmpeq_epi8(vn1, c);
        let eqd = _mm_cmpeq_epi8(vn1, d);
        let or1 = _mm_or_si128(eqa, eqb);
        let or2 = _mm_or_si128(eqc, eqd);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask = _mm_movemask_epi8(eqa);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqd);
            debug_assert!(mask != 0);
            return Some(at + forward_pos(mask));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);

        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
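    // Fewer than VECTOR_SIZE bytes remain, so finish with one final unaligned
    // load covering exactly the last VECTOR_SIZE bytes of the haystack, which
    // may overlap bytes that were already searched.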
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search1(start_ptr, end_ptr, ptr, vn1);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + forward_pos2(mask1, mask2));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            return Some(at + forward_pos2(mask1, mask2));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memchr3(
    n1: u8,
    n2: u8,
    n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let vn3 = _mm_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let eqa3 = _mm_cmpeq_epi8(vn3, a);
        let eqb3 = _mm_cmpeq_epi8(vn3, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(eqa3, eqb3);
        let or4 = _mm_or_si128(or1, or2);
        let or5 = _mm_or_si128(or3, or4);
        if _mm_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            let mask3 = _mm_movemask_epi8(eqa3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + forward_pos3(mask1, mask2, mask3));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            let mask3 = _mm_movemask_epi8(eqb3);
            return Some(at + forward_pos3(mask1, mask2, mask3));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) =
            forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
        {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

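    // Round `end_ptr` down to the previous 16 byte aligned address; the loop
    // below then walks backwards using aligned loads. Any bytes between that
    // address and `end_ptr` were already covered by the unaligned
    // `reverse_search1` call above.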
    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
        let eqa = _mm_cmpeq_epi8(vn1, a);
        let eqb = _mm_cmpeq_epi8(vn1, b);
        let eqc = _mm_cmpeq_epi8(vn1, c);
        let eqd = _mm_cmpeq_epi8(vn1, d);
        let or1 = _mm_or_si128(eqa, eqb);
        let or2 = _mm_or_si128(eqc, eqd);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
            let mask = _mm_movemask_epi8(eqd);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqa);
            debug_assert!(mask != 0);
            return Some(at + reverse_pos(mask));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + reverse_pos2(mask1, mask2));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            return Some(at + reverse_pos2(mask1, mask2));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memrchr3(
    n1: u8,
    n2: u8,
    n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let vn3 = _mm_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let eqa3 = _mm_cmpeq_epi8(vn3, a);
        let eqb3 = _mm_cmpeq_epi8(vn3, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(eqa3, eqb3);
        let or4 = _mm_or_si128(or1, or2);
        let or5 = _mm_or_si128(or3, or4);
        if _mm_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            let mask3 = _mm_movemask_epi8(eqb3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + reverse_pos3(mask1, mask2, mask3));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            let mask3 = _mm_movemask_epi8(eqa3);
            return Some(at + reverse_pos3(mask1, mask2, mask3));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) =
            reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
        {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn forward_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + forward_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn forward_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
pub unsafe fn forward_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
    vn3: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
    let or = _mm_or_si128(eq1, eq2);
    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        let mask3 = _mm_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + reverse_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
    vn3: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
    let or = _mm_or_si128(eq1, eq2);
    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        let mask3 = _mm_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

/// Compute the position of the first matching byte from the given mask. The
/// position returned is always in the range [0, 15].
///
/// The mask given is expected to be the result of _mm_movemask_epi8.
fn forward_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the least significant bit that is
    // set corresponds to the position of our first matching byte, and that
    // position is exactly the number of trailing zeros in the mask.
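    //
    // For example, a mask of 0b0000_0000_0100_0100 (matches at bytes 2 and 6)
    // has trailing_zeros() == 2, the position of the first match.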
    mask.trailing_zeros() as usize
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single needle byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    forward_pos(mask1 | mask2)
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single needle byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    forward_pos(mask1 | mask2 | mask3)
}

/// Compute the position of the last matching byte from the given mask. The
/// position returned is always in the range [0, 15].
///
/// The mask given is expected to be the result of _mm_movemask_epi8.
fn reverse_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the most significant bit that is set
    // corresponds to the position of our last matching byte. The position from
    // the end of the mask is therefore the number of leading zeros in a 16
    // bit integer, and the position from the start of the mask is therefore
    // 16 - (leading zeros) - 1.
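    //
    // For example, a mask of 0b0000_0010_0000_0100 (matches at bytes 2 and 9)
    // has (mask as u16).leading_zeros() == 6, so the last matching position
    // is 16 - 6 - 1 = 9.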
    VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single needle byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    reverse_pos(mask1 | mask2)
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single needle byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    reverse_pos(mask1 | mask2 | mask3)
}

/// Subtract `b` from `a` and return the difference. `a` must be greater than
/// or equal to `b`.
fn sub(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}
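
// A minimal usage sketch, written as a test module: it only illustrates how
// the routines above can be called and assumes SSE2 is available, which is
// always the case on x86_64. It is not meant as a thorough test of the
// algorithms.
#[cfg(test)]
mod usage_sketch {
    #[test]
    fn finds_first_and_last_occurrences() {
        let haystack = b"abcXdefXghi";
        // SAFETY: SSE2 is part of the x86_64 baseline, so the target feature
        // required by these functions is always enabled here.
        unsafe {
            assert_eq!(super::memchr(b'X', haystack), Some(3));
            assert_eq!(super::memrchr(b'X', haystack), Some(7));
            assert_eq!(super::memchr2(b'Q', b'g', haystack), Some(8));
            assert_eq!(super::memchr3(b'Q', b'Z', b'i', haystack), Some(10));
            assert_eq!(super::memrchr2(b'a', b'c', haystack), Some(2));
            assert_eq!(super::memrchr3(b'a', b'b', b'c', haystack), Some(2));
            assert_eq!(super::memchr(b'z', haystack), None);
        }
    }
}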