use core::arch::x86_64::*;
use core::cmp;
use core::mem::size_of;

const VECTOR_SIZE: usize = size_of::<__m128i>();
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

// The number of bytes to loop at in one iteration of memchr/memrchr.
const LOOP_SIZE: usize = 4 * VECTOR_SIZE;

// The number of bytes to loop at in one iteration of memchr2/memrchr2 and
// memchr3/memrchr3. There was no observable difference between 64 and 32 bytes
// in benchmarks. memchr3 in particular only gets a very slight speed up from
// the loop unrolling.
const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;

#[target_feature(enable = "sse2")]
pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // What follows is a fast SSE2-only algorithm to detect the position of
    // `n1` in `haystack` if it exists. From what I know, this is the "classic"
    // algorithm. I believe it can be found in places like glibc and Go's
    // standard library. It appears to be well known and is elaborated on in
    // more detail here: https://gms.tf/stdfind-and-memchr-optimizations.html
    //
    // While this routine is very long, the basic idea is actually very simple
    // and can be expressed straightforwardly in pseudo code:
    //
    //     // A 16 byte vector with `n1` broadcast into every byte.
    //     needle = [n1; 16]
    //     while i <= haystack.len() - 16:
    //       // A 16 byte vector. Each byte in chunk corresponds to a byte in
    //       // the haystack.
    //       chunk = haystack[i:i+16]
    //       // Compare bytes in needle with bytes in chunk. The result is a 16
    //       // byte chunk where each byte is 0xFF if the corresponding bytes
    //       // in needle and chunk were equal, or 0x00 otherwise.
    //       eqs = cmpeq(needle, chunk)
    //       // Return a 32 bit integer where the most significant 16 bits
    //       // are always 0 and the lower 16 bits correspond to whether the
    //       // most significant bit in the corresponding byte in `eqs` is set.
    //       // In other words, `mask as u16` has bit i set if and only if
    //       // needle[i] == chunk[i].
    //       mask = movemask(eqs)
    //
    //       // Mask is 0 if there is no match, and non-zero otherwise.
    //       if mask != 0:
    //         // trailing_zeros tells us the position of the least significant
    //         // bit that is set.
    //         return i + trailing_zeros(mask)
    //
    //     // haystack length may not be a multiple of 16, so search the rest.
    //     while i < haystack.len():
    //       if haystack[i] == n1:
    //         return i
    //
    //     // No match found.
    //     return NULL
    //
    // In fact, we could loosely translate the above code to Rust line-for-line
    // and it would be a pretty fast algorithm. But we pull out all the stops
    // to go as fast as possible:
    //
    // 1. We use aligned loads. That is, we do some finagling to make sure our
    //    primary loop not only proceeds in increments of 16 bytes, but that
    //    the address of haystack's pointer that we dereference is aligned to
    //    16 bytes. 16 is a magic number here because it is the size of an
    //    SSE2 128-bit vector. (For the AVX2 algorithm, 32 is the magic
    //    number.) Therefore, to get aligned loads, our pointer's address must
    //    be evenly divisible by 16.
    // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
    //    kind of like loop unrolling, but we combine the equality comparisons
    //    using a vector OR such that we only need to extract a single mask to
    //    determine whether a match exists or not. If so, then we do some
    //    book-keeping to determine the precise location, but otherwise mush
    //    on.
    // 3. We use our "chunk" comparison routine in as many places as possible,
    //    even if it means using unaligned loads. In particular, if haystack
    //    starts with an unaligned address, then we do an unaligned load to
    //    search the first 16 bytes. We then start our primary loop at the
    //    smallest subsequent aligned address, which will actually overlap with
    //    previously searched bytes. But we're OK with that. We do a similar
    //    dance at the end of our primary loop. Finally, to avoid a
    //    byte-at-a-time loop at the end, we do a final 16 byte unaligned load
    //    that may overlap with a previous load. This is OK because it converts
    //    a loop into a small number of very fast vector instructions.
    //
    // The primary downside of this algorithm is that it's effectively
    // completely unsafe. Therefore, we have to be super careful to avoid
    // undefined behavior:
    //
    // 1. We use raw pointers everywhere. Not only does dereferencing a pointer
    //    require the pointer to be valid, but we actually can't even store the
    //    address of an invalid pointer (unless it's 1 past the end of
    //    haystack) without sacrificing performance.
    // 2. _mm_loadu_si128 is used when you don't care about alignment, and
    //    _mm_load_si128 is used when you do care. You cannot use the latter
    //    on unaligned pointers.
    // 3. We make liberal use of debug_assert! to check assumptions.
    // 4. We make a concerted effort to stick with pointers instead of indices.
    //    Indices are nicer because there's less to worry about with them (see
    //    above about pointer offsets), but I could not get the compiler to
    //    produce code as good as what the pointer version below produces. In
    //    any case, pointers are what we really care about here, and alignment
    //    is expressed a bit more naturally with them.
    //
    // In general, most of the algorithms in this crate have a similar
    // structure to what you see below, so this comment applies fairly well to
    // all of them.
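    //
    // As a tiny worked example of the pseudo code above (the values are
    // hypothetical and purely illustrative): with n1 = b'z' and a 16 byte
    // chunk of b"abczabczabczabcz", cmpeq yields 0xFF in bytes 3, 7, 11 and
    // 15, movemask turns that into the 16 bit mask 0b1000_1000_1000_1000,
    // and trailing_zeros(mask) == 3, so the match is reported at i + 3.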

    let vn1 = _mm_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
        let eqa = _mm_cmpeq_epi8(vn1, a);
        let eqb = _mm_cmpeq_epi8(vn1, b);
        let eqc = _mm_cmpeq_epi8(vn1, c);
        let eqd = _mm_cmpeq_epi8(vn1, d);
        let or1 = _mm_or_si128(eqa, eqb);
        let or2 = _mm_or_si128(eqc, eqd);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask = _mm_movemask_epi8(eqa);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqd);
            debug_assert!(mask != 0);
            return Some(at + forward_pos(mask));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);

        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search1(start_ptr, end_ptr, ptr, vn1);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + forward_pos2(mask1, mask2));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            return Some(at + forward_pos2(mask1, mask2));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memchr3(
    n1: u8, n2: u8, n3: u8,
    haystack: &[u8]
) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let vn3 = _mm_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let eqa3 = _mm_cmpeq_epi8(vn3, a);
        let eqb3 = _mm_cmpeq_epi8(vn3, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(eqa3, eqb3);
        let or4 = _mm_or_si128(or1, or2);
        let or5 = _mm_or_si128(or3, or4);
        if _mm_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            let mask3 = _mm_movemask_epi8(eqa3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + forward_pos3(mask1, mask2, mask3));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            let mask3 = _mm_movemask_epi8(eqb3);
            debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
            return Some(at + forward_pos3(mask1, mask2, mask3));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
        let eqa = _mm_cmpeq_epi8(vn1, a);
        let eqb = _mm_cmpeq_epi8(vn1, b);
        let eqc = _mm_cmpeq_epi8(vn1, c);
        let eqd = _mm_cmpeq_epi8(vn1, d);
        let or1 = _mm_or_si128(eqa, eqb);
        let or2 = _mm_or_si128(eqc, eqd);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
            let mask = _mm_movemask_epi8(eqd);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqa);
            debug_assert!(mask != 0);
            return Some(at + reverse_pos(mask));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + reverse_pos2(mask1, mask2));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            return Some(at + reverse_pos2(mask1, mask2));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memrchr3(
    n1: u8, n2: u8, n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let vn3 = _mm_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let eqa3 = _mm_cmpeq_epi8(vn3, a);
        let eqb3 = _mm_cmpeq_epi8(vn3, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(eqa3, eqb3);
        let or4 = _mm_or_si128(or1, or2);
        let or5 = _mm_or_si128(or3, or4);
        if _mm_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            let mask3 = _mm_movemask_epi8(eqb3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + reverse_pos3(mask1, mask2, mask3));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            let mask3 = _mm_movemask_epi8(eqa3);
            return Some(at + reverse_pos3(mask1, mask2, mask3));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn forward_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + forward_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn forward_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
pub unsafe fn forward_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
    vn3: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
    let or = _mm_or_si128(eq1, eq2);
    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        let mask3 = _mm_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + reverse_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
    vn3: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
    let or = _mm_or_si128(eq1, eq2);
    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        let mask3 = _mm_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

/// Compute the position of the first matching byte from the given mask. The
/// position returned is always in the range [0, 15].
///
/// The mask given is expected to be the result of _mm_movemask_epi8.
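///
/// For example (a hypothetical mask, not tied to any particular haystack):
/// `forward_pos(0b0010_0000_0000_0100)` is `2`, since bit 2 is the least
/// significant bit that is set.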
fn forward_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the least significant bit that is
    // set corresponds to the position of our first matching byte, and that
    // position is exactly the number of trailing zeros in the mask.
    mask.trailing_zeros() as usize
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
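///
/// For example (hypothetical masks): if `mask1 == 0b0000_0010_0000_0000` and
/// `mask2 == 0b0000_0000_0001_0000`, then `forward_pos2(mask1, mask2)` is
/// `4`. ORing the masks and taking the trailing zeros picks whichever needle
/// matched earliest in the chunk.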
fn forward_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    forward_pos(mask1 | mask2)
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    forward_pos(mask1 | mask2 | mask3)
}

/// Compute the position of the last matching byte from the given mask. The
/// position returned is always in the range [0, 15].
///
/// The mask given is expected to be the result of _mm_movemask_epi8.
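///
/// For example (a hypothetical mask): `reverse_pos(0b0010_0000_0000_0100)`
/// is `13`, since bit 13 is the most significant bit that is set.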
fn reverse_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the most significant bit that is set
    // corresponds to the position of our last matching byte. The position from
    // the end of the mask is therefore the number of leading zeros in a 16
    // bit integer, and the position from the start of the mask is therefore
    // 16 - (leading zeros) - 1.
    VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    reverse_pos(mask1 | mask2)
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    reverse_pos(mask1 | mask2 | mask3)
}

/// Subtract `b` from `a` and return the difference. `a` should be greater than
/// or equal to `b`.
fn sub(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}
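
// A few sanity-check examples of how the routines above are expected to
// behave. This test module is an illustrative sketch: it assumes an x86_64
// build (where SSE2 is part of the baseline ABI, so calling the
// `target_feature(enable = "sse2")` functions here is sound) and the
// standard test harness.
#[cfg(test)]
mod example_tests {
    #[test]
    fn forward_examples() {
        let haystack = b"the quick brown fox";
        unsafe {
            assert_eq!(super::memchr(b'q', haystack), Some(4));
            assert_eq!(super::memchr(b'z', haystack), None);
            // memchr2/memchr3 report the first position matching any needle.
            assert_eq!(super::memchr2(b'x', b'q', haystack), Some(4));
            assert_eq!(super::memchr3(b'x', b'w', b'k', haystack), Some(8));
        }
    }

    #[test]
    fn reverse_examples() {
        let haystack = b"the quick brown fox";
        unsafe {
            // The memrchr variants report the last matching position.
            assert_eq!(super::memrchr(b'o', haystack), Some(17));
            assert_eq!(super::memrchr2(b'o', b'q', haystack), Some(17));
            assert_eq!(super::memrchr3(b't', b'h', b'e', haystack), Some(2));
            assert_eq!(super::memrchr(b'z', haystack), None);
        }
    }
}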