1 use core::arch::x86_64::*;
2 use core::cmp;
3 use core::mem::size_of;
4
// The size in bytes of a single SSE2 vector: size_of::<__m128i>() == 16.
const VECTOR_SIZE: usize = size_of::<__m128i>();
// Low-bit mask used to extract a pointer's misalignment relative to
// VECTOR_SIZE (i.e. `ptr as usize & VECTOR_ALIGN` is the offset past the
// previous 16-byte boundary).
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

// The number of bytes to loop at in one iteration of memchr/memrchr.
const LOOP_SIZE: usize = 4 * VECTOR_SIZE;

// The number of bytes to loop at in one iteration of memchr2/memrchr2 and
// memchr3/memrchr3. There was no observable difference between 64 and 32 bytes
// in benchmarks. memchr3 in particular only gets a very slight speed up from
// the loop unrolling.
const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;
16
/// Returns the index of the first occurrence of `n1` in `haystack`, or
/// `None` if it does not occur.
///
/// # Safety
///
/// Callers must ensure the CPU supports SSE2. (On `x86_64` SSE2 is part of
/// the baseline, but `#[target_feature]` still makes this an `unsafe fn`.)
#[target_feature(enable = "sse2")]
pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // What follows is a fast SSE2-only algorithm to detect the position of
    // `n1` in `haystack` if it exists. From what I know, this is the "classic"
    // algorithm. I believe it can be found in places like glibc and Go's
    // standard library. It appears to be well known and is elaborated on in
    // more detail here: https://gms.tf/stdfind-and-memchr-optimizations.html
    //
    // While this routine is very long, the basic idea is actually very simple
    // and can be expressed straight-forwardly in pseudo code:
    //
    //     needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1
    //     while i <= haystack.len() - 16:
    //       // A 16 byte vector. Each byte in chunk corresponds to a byte in
    //       // the haystack.
    //       chunk = haystack[i:i+16]
    //       // Compare bytes in needle with bytes in chunk. The result is a 16
    //       // byte chunk where each byte is 0xFF if the corresponding bytes
    //       // in needle and chunk were equal, or 0x00 otherwise.
    //       eqs = cmpeq(needle, chunk)
    //       // Return a 32 bit integer where the most significant 16 bits
    //       // are always 0 and the lower 16 bits correspond to whether the
    //       // most significant bit in the correspond byte in `eqs` is set.
    //       // In other words, `mask as u16` has bit i set if and only if
    //       // needle[i] == chunk[i].
    //       mask = movemask(eqs)
    //
    //       // Mask is 0 if there is no match, and non-zero otherwise.
    //       if mask != 0:
    //         // trailing_zeros tells us the position of the least significant
    //         // bit that is set.
    //         return i + trailing_zeros(mask)
    //
    //     // haystack length may not be a multiple of 16, so search the rest.
    //     while i < haystack.len():
    //       if haystack[i] == n1:
    //         return i
    //
    //     // No match found.
    //     return NULL
    //
    // In fact, we could loosely translate the above code to Rust line-for-line
    // and it would be a pretty fast algorithm. But, we pull out all the stops
    // to go as fast as possible:
    //
    // 1. We use aligned loads. That is, we do some finagling to make sure our
    //    primary loop not only proceeds in increments of 16 bytes, but that
    //    the address of haystack's pointer that we dereference is aligned to
    //    16 bytes. 16 is a magic number here because it is the size of SSE2
    //    128-bit vector. (For the AVX2 algorithm, 32 is the magic number.)
    //    Therefore, to get aligned loads, our pointer's address must be evenly
    //    divisible by 16.
    // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
    //    kind of like loop unrolling, but we combine the equality comparisons
    //    using a vector OR such that we only need to extract a single mask to
    //    determine whether a match exists or not. If so, then we do some
    //    book-keeping to determine the precise location but otherwise mush on.
    // 3. We use our "chunk" comparison routine in as many places as possible,
    //    even if it means using unaligned loads. In particular, if haystack
    //    starts with an unaligned address, then we do an unaligned load to
    //    search the first 16 bytes. We then start our primary loop at the
    //    smallest subsequent aligned address, which will actually overlap with
    //    previously searched bytes. But we're OK with that. We do a similar
    //    dance at the end of our primary loop. Finally, to avoid a
    //    byte-at-a-time loop at the end, we do a final 16 byte unaligned load
    //    that may overlap with a previous load. This is OK because it converts
    //    a loop into a small number of very fast vector instructions.
    //
    // The primary downside of this algorithm is that it's effectively
    // completely unsafe. Therefore, we have to be super careful to avoid
    // undefined behavior:
    //
    // 1. We use raw pointers everywhere. Not only does dereferencing a pointer
    //    require the pointer to be valid, but we actually can't even store the
    //    address of an invalid pointer (unless it's 1 past the end of
    //    haystack) without sacrificing performance.
    // 2. _mm_loadu_si128 is used when you don't care about alignment, and
    //    _mm_load_si128 is used when you do care. You cannot use the latter
    //    on unaligned pointers.
    // 3. We make liberal use of debug_assert! to check assumptions.
    // 4. We make a concerted effort to stick with pointers instead of indices.
    //    Indices are nicer because there's less to worry about with them (see
    //    above about pointer offsets), but I could not get the compiler to
    //    produce as good of code as what the below produces. In any case,
    //    pointers are what we really care about here, and alignment is
    //    expressed a bit more naturally with them.
    //
    // In general, most of the algorithms in this crate have a similar
    // structure to what you see below, so this comment applies fairly well to
    // all of them.

    // `vn1` has `n1` repeated in every one of its 16 byte lanes.
    let vn1 = _mm_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    // One past the last byte of the haystack. Never dereferenced.
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    // Too small for even a single vector load: fall back to a byte loop.
    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    // Search the first 16 bytes with an unaligned load.
    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

    // Round `ptr` up to the next 16-byte boundary. This may re-examine bytes
    // already searched above, which is fine (any match there was reported).
    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        // Four aligned 16-byte loads per iteration (64 bytes).
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
        let eqa = _mm_cmpeq_epi8(vn1, a);
        let eqb = _mm_cmpeq_epi8(vn1, b);
        let eqc = _mm_cmpeq_epi8(vn1, c);
        let eqd = _mm_cmpeq_epi8(vn1, d);
        // OR the comparisons together so one movemask decides all 64 bytes.
        let or1 = _mm_or_si128(eqa, eqb);
        let or2 = _mm_or_si128(eqc, eqd);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            // A match exists somewhere in [ptr, ptr + 64); find which vector.
            let mut at = sub(ptr, start_ptr);
            let mask = _mm_movemask_epi8(eqa);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqd);
            debug_assert!(mask != 0);
            return Some(at + forward_pos(mask));
        }
        ptr = ptr.add(loop_size);
    }
    // Handle remaining full vectors one at a time (aligned, via the
    // unaligned-tolerant helper).
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);

        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    // Fewer than 16 bytes remain: do one final unaligned load ending exactly
    // at `end_ptr`, overlapping bytes already searched.
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search1(start_ptr, end_ptr, ptr, vn1);
    }
    None
}
188
/// Returns the index of the first occurrence of `n1` or `n2` in `haystack`,
/// or `None` if neither occurs.
///
/// Uses the same strategy as `memchr` above (see the long comment there);
/// the main loop processes 2 vectors (32 bytes) per iteration instead of 4.
///
/// # Safety
///
/// Callers must ensure the CPU supports SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    // One past the last byte of the haystack. Never dereferenced.
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    // Too small for a vector load: byte-at-a-time fallback.
    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    // Search the first 16 bytes with an unaligned load.
    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    // Round `ptr` up to the next 16-byte boundary (may overlap bytes already
    // searched above).
    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        // Two aligned loads; four comparisons folded into one movemask test.
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            // Match somewhere in [ptr, ptr + 32); pinpoint the vector.
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + forward_pos2(mask1, mask2));
            }

            // Not in `a`, so it must be in `b`.
            at += VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            return Some(at + forward_pos2(mask1, mask2));
        }
        ptr = ptr.add(loop_size);
    }
    // Remaining full vectors, one at a time.
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    // Final overlapping unaligned load covering the last < 16 bytes.
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
    }
    None
}
257
258 #[target_feature(enable = "sse2")]
memchr3( n1: u8, n2: u8, n3: u8, haystack: &[u8] ) -> Option<usize>259 pub unsafe fn memchr3(
260 n1: u8, n2: u8, n3: u8,
261 haystack: &[u8]
262 ) -> Option<usize> {
263 let vn1 = _mm_set1_epi8(n1 as i8);
264 let vn2 = _mm_set1_epi8(n2 as i8);
265 let vn3 = _mm_set1_epi8(n3 as i8);
266 let len = haystack.len();
267 let loop_size = cmp::min(LOOP_SIZE2, len);
268 let start_ptr = haystack.as_ptr();
269 let end_ptr = haystack[haystack.len()..].as_ptr();
270 let mut ptr = start_ptr;
271
272 if haystack.len() < VECTOR_SIZE {
273 while ptr < end_ptr {
274 if *ptr == n1 || *ptr == n2 || *ptr == n3 {
275 return Some(sub(ptr, start_ptr));
276 }
277 ptr = ptr.offset(1);
278 }
279 return None;
280 }
281
282 if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
283 return Some(i);
284 }
285
286 ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
287 debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
288 while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
289 debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
290
291 let a = _mm_load_si128(ptr as *const __m128i);
292 let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
293 let eqa1 = _mm_cmpeq_epi8(vn1, a);
294 let eqb1 = _mm_cmpeq_epi8(vn1, b);
295 let eqa2 = _mm_cmpeq_epi8(vn2, a);
296 let eqb2 = _mm_cmpeq_epi8(vn2, b);
297 let eqa3 = _mm_cmpeq_epi8(vn3, a);
298 let eqb3 = _mm_cmpeq_epi8(vn3, b);
299 let or1 = _mm_or_si128(eqa1, eqb1);
300 let or2 = _mm_or_si128(eqa2, eqb2);
301 let or3 = _mm_or_si128(eqa3, eqb3);
302 let or4 = _mm_or_si128(or1, or2);
303 let or5 = _mm_or_si128(or3, or4);
304 if _mm_movemask_epi8(or5) != 0 {
305 let mut at = sub(ptr, start_ptr);
306 let mask1 = _mm_movemask_epi8(eqa1);
307 let mask2 = _mm_movemask_epi8(eqa2);
308 let mask3 = _mm_movemask_epi8(eqa3);
309 if mask1 != 0 || mask2 != 0 || mask3 != 0 {
310 return Some(at + forward_pos3(mask1, mask2, mask3));
311 }
312
313 at += VECTOR_SIZE;
314 let mask1 = _mm_movemask_epi8(eqb1);
315 let mask2 = _mm_movemask_epi8(eqb2);
316 let mask3 = _mm_movemask_epi8(eqb3);
317 if mask1 != 0 || mask2 != 0 || mask3 != 0 {
318 return Some(at + forward_pos3(mask1, mask2, mask3));
319 }
320
321 at += VECTOR_SIZE;
322 let mask1 = _mm_movemask_epi8(eqb1);
323 let mask2 = _mm_movemask_epi8(eqb2);
324 let mask3 = _mm_movemask_epi8(eqb3);
325 return Some(at + forward_pos3(mask1, mask2, mask3));
326 }
327 ptr = ptr.add(loop_size);
328 }
329 while ptr <= end_ptr.sub(VECTOR_SIZE) {
330 if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
331 return Some(i);
332 }
333 ptr = ptr.add(VECTOR_SIZE);
334 }
335 if ptr < end_ptr {
336 debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
337 ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
338 debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
339
340 return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
341 }
342 None
343 }
344
/// Returns the index of the last occurrence of `n1` in `haystack`, or
/// `None` if it does not occur.
///
/// This is the reverse-direction analogue of `memchr` above: it scans from
/// the end of the haystack towards the start, using aligned 64-byte
/// iterations and overlapping unaligned loads at both edges.
///
/// # Safety
///
/// Callers must ensure the CPU supports SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    // One past the last byte of the haystack. Never dereferenced.
    let end_ptr = haystack[haystack.len()..].as_ptr();
    // Scanning backwards, so we start at the end.
    let mut ptr = end_ptr;

    // Too small for a vector load: byte-at-a-time fallback (backwards).
    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    // Search the last 16 bytes with an unaligned load.
    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

    // Round the end pointer down to the previous 16-byte boundary; bytes
    // past it were covered by the load above.
    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        // Step back 64 bytes and do four aligned loads.
        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
        let eqa = _mm_cmpeq_epi8(vn1, a);
        let eqb = _mm_cmpeq_epi8(vn1, b);
        let eqc = _mm_cmpeq_epi8(vn1, c);
        let eqd = _mm_cmpeq_epi8(vn1, d);
        let or1 = _mm_or_si128(eqa, eqb);
        let or2 = _mm_or_si128(eqc, eqd);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            // Match in [ptr, ptr + 64); check the vectors from last to first
            // since we want the highest-addressed match.
            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
            let mask = _mm_movemask_epi8(eqd);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqa);
            debug_assert!(mask != 0);
            return Some(at + reverse_pos(mask));
        }
    }
    // Remaining full vectors, one at a time, moving backwards.
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
    }
    // Fewer than 16 bytes remain at the front: search the first 16 bytes
    // (overlapping bytes already searched).
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
    }
    None
}
423
/// Returns the index of the last occurrence of `n1` or `n2` in `haystack`,
/// or `None` if neither occurs.
///
/// Reverse-direction analogue of `memchr2`; the main loop processes 2
/// vectors (32 bytes) per iteration, scanning backwards.
///
/// # Safety
///
/// Callers must ensure the CPU supports SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    // One past the last byte of the haystack. Never dereferenced.
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    // Too small for a vector load: byte-at-a-time fallback (backwards).
    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    // Search the last 16 bytes with an unaligned load.
    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    // Round the end pointer down to the previous 16-byte boundary.
    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        // Step back 32 bytes and do two aligned loads.
        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            // Match in [ptr, ptr + 32); check `b` (higher addresses) first.
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + reverse_pos2(mask1, mask2));
            }

            // Not in `b`, so it must be in `a`.
            at -= VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            return Some(at + reverse_pos2(mask1, mask2));
        }
    }
    // Remaining full vectors, one at a time, moving backwards.
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
    }
    // Fewer than 16 bytes remain at the front: search the first 16 bytes
    // (overlapping bytes already searched).
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
    }
    None
}
490
/// Returns the index of the last occurrence of `n1`, `n2` or `n3` in
/// `haystack`, or `None` if none of them occurs.
///
/// Reverse-direction analogue of `memchr3`; the main loop processes 2
/// vectors (32 bytes) per iteration, scanning backwards.
///
/// # Safety
///
/// Callers must ensure the CPU supports SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn memrchr3(
    n1: u8, n2: u8, n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let vn3 = _mm_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    // One past the last byte of the haystack. Never dereferenced.
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    // Too small for a vector load: byte-at-a-time fallback (backwards).
    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    // Search the last 16 bytes with an unaligned load.
    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    // Round the end pointer down to the previous 16-byte boundary.
    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        // Step back 32 bytes and do two aligned loads.
        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let eqa3 = _mm_cmpeq_epi8(vn3, a);
        let eqb3 = _mm_cmpeq_epi8(vn3, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(eqa3, eqb3);
        let or4 = _mm_or_si128(or1, or2);
        let or5 = _mm_or_si128(or3, or4);
        if _mm_movemask_epi8(or5) != 0 {
            // Match in [ptr, ptr + 32); check `b` (higher addresses) first.
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            let mask3 = _mm_movemask_epi8(eqb3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + reverse_pos3(mask1, mask2, mask3));
            }

            // Not in `b`, so it must be in `a`.
            at -= VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            let mask3 = _mm_movemask_epi8(eqa3);
            return Some(at + reverse_pos3(mask1, mask2, mask3));
        }
    }
    // Remaining full vectors, one at a time, moving backwards.
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
            return Some(i);
        }
    }
    // Fewer than 16 bytes remain at the front: search the first 16 bytes
    // (overlapping bytes already searched).
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
    }
    None
}
567
568 #[target_feature(enable = "sse2")]
forward_search1( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m128i, ) -> Option<usize>569 pub unsafe fn forward_search1(
570 start_ptr: *const u8,
571 end_ptr: *const u8,
572 ptr: *const u8,
573 vn1: __m128i,
574 ) -> Option<usize> {
575 debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
576 debug_assert!(start_ptr <= ptr);
577 debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
578
579 let chunk = _mm_loadu_si128(ptr as *const __m128i);
580 let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1));
581 if mask != 0 {
582 Some(sub(ptr, start_ptr) + forward_pos(mask))
583 } else {
584 None
585 }
586 }
587
588 #[target_feature(enable = "sse2")]
forward_search2( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m128i, vn2: __m128i, ) -> Option<usize>589 unsafe fn forward_search2(
590 start_ptr: *const u8,
591 end_ptr: *const u8,
592 ptr: *const u8,
593 vn1: __m128i,
594 vn2: __m128i,
595 ) -> Option<usize> {
596 debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
597 debug_assert!(start_ptr <= ptr);
598 debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
599
600 let chunk = _mm_loadu_si128(ptr as *const __m128i);
601 let eq1 = _mm_cmpeq_epi8(chunk, vn1);
602 let eq2 = _mm_cmpeq_epi8(chunk, vn2);
603 if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
604 let mask1 = _mm_movemask_epi8(eq1);
605 let mask2 = _mm_movemask_epi8(eq2);
606 Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
607 } else {
608 None
609 }
610 }
611
612 #[target_feature(enable = "sse2")]
forward_search3( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m128i, vn2: __m128i, vn3: __m128i, ) -> Option<usize>613 pub unsafe fn forward_search3(
614 start_ptr: *const u8,
615 end_ptr: *const u8,
616 ptr: *const u8,
617 vn1: __m128i,
618 vn2: __m128i,
619 vn3: __m128i,
620 ) -> Option<usize> {
621 debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
622 debug_assert!(start_ptr <= ptr);
623 debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
624
625 let chunk = _mm_loadu_si128(ptr as *const __m128i);
626 let eq1 = _mm_cmpeq_epi8(chunk, vn1);
627 let eq2 = _mm_cmpeq_epi8(chunk, vn2);
628 let eq3 = _mm_cmpeq_epi8(chunk, vn3);
629 let or = _mm_or_si128(eq1, eq2);
630 if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
631 let mask1 = _mm_movemask_epi8(eq1);
632 let mask2 = _mm_movemask_epi8(eq2);
633 let mask3 = _mm_movemask_epi8(eq3);
634 Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
635 } else {
636 None
637 }
638 }
639
640 #[target_feature(enable = "sse2")]
reverse_search1( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m128i, ) -> Option<usize>641 unsafe fn reverse_search1(
642 start_ptr: *const u8,
643 end_ptr: *const u8,
644 ptr: *const u8,
645 vn1: __m128i,
646 ) -> Option<usize> {
647 debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
648 debug_assert!(start_ptr <= ptr);
649 debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
650
651 let chunk = _mm_loadu_si128(ptr as *const __m128i);
652 let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk));
653 if mask != 0 {
654 Some(sub(ptr, start_ptr) + reverse_pos(mask))
655 } else {
656 None
657 }
658 }
659
660 #[target_feature(enable = "sse2")]
reverse_search2( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m128i, vn2: __m128i, ) -> Option<usize>661 unsafe fn reverse_search2(
662 start_ptr: *const u8,
663 end_ptr: *const u8,
664 ptr: *const u8,
665 vn1: __m128i,
666 vn2: __m128i,
667 ) -> Option<usize> {
668 debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
669 debug_assert!(start_ptr <= ptr);
670 debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
671
672 let chunk = _mm_loadu_si128(ptr as *const __m128i);
673 let eq1 = _mm_cmpeq_epi8(chunk, vn1);
674 let eq2 = _mm_cmpeq_epi8(chunk, vn2);
675 if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
676 let mask1 = _mm_movemask_epi8(eq1);
677 let mask2 = _mm_movemask_epi8(eq2);
678 Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
679 } else {
680 None
681 }
682 }
683
684 #[target_feature(enable = "sse2")]
reverse_search3( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m128i, vn2: __m128i, vn3: __m128i, ) -> Option<usize>685 unsafe fn reverse_search3(
686 start_ptr: *const u8,
687 end_ptr: *const u8,
688 ptr: *const u8,
689 vn1: __m128i,
690 vn2: __m128i,
691 vn3: __m128i,
692 ) -> Option<usize> {
693 debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
694 debug_assert!(start_ptr <= ptr);
695 debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
696
697 let chunk = _mm_loadu_si128(ptr as *const __m128i);
698 let eq1 = _mm_cmpeq_epi8(chunk, vn1);
699 let eq2 = _mm_cmpeq_epi8(chunk, vn2);
700 let eq3 = _mm_cmpeq_epi8(chunk, vn3);
701 let or = _mm_or_si128(eq1, eq2);
702 if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
703 let mask1 = _mm_movemask_epi8(eq1);
704 let mask2 = _mm_movemask_epi8(eq2);
705 let mask3 = _mm_movemask_epi8(eq3);
706 Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
707 } else {
708 None
709 }
710 }
711
/// Compute the position of the first matching byte from the given mask. The
/// position returned is always in the range [0, 15].
///
/// The mask given is expected to be the result of _mm_movemask_epi8.
fn forward_pos(mask: i32) -> usize {
    // On little endian, the least significant set bit of the movemask maps
    // to the lowest-addressed matching byte, so counting the zero bits below
    // it yields the byte offset of the first match.
    mask.trailing_zeros() as usize
}
723
/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos2(mask1: i32, mask2: i32) -> usize {
    // A byte matches if it matched either needle, so the union of the masks
    // has a bit set for every match; its lowest set bit is the first one.
    let union = mask1 | mask2;
    debug_assert!(union != 0);

    union.trailing_zeros() as usize
}
735
/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    // A byte matches if it matched any of the three needles, so the union of
    // the masks has a bit set for every match; its lowest set bit wins.
    let union = mask1 | mask2 | mask3;
    debug_assert!(union != 0);

    union.trailing_zeros() as usize
}
747
748 /// Compute the position of the last matching byte from the given mask. The
749 /// position returned is always in the range [0, 15].
750 ///
751 /// The mask given is expected to be the result of _mm_movemask_epi8.
reverse_pos(mask: i32) -> usize752 fn reverse_pos(mask: i32) -> usize {
753 // We are dealing with little endian here, where the most significant byte
754 // is at a higher address. That means the most significant bit that is set
755 // corresponds to the position of our last matching byte. The position from
756 // the end of the mask is therefore the number of leading zeros in a 16
757 // bit integer, and the position from the start of the mask is therefore
758 // 16 - (leading zeros) - 1.
759 VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1
760 }
761
762 /// Compute the position of the last matching byte from the given masks. The
763 /// position returned is always in the range [0, 15]. Each mask corresponds to
764 /// the equality comparison of a single byte.
765 ///
766 /// The masks given are expected to be the result of _mm_movemask_epi8, where
767 /// at least one of the masks is non-zero (i.e., indicates a match).
reverse_pos2(mask1: i32, mask2: i32) -> usize768 fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
769 debug_assert!(mask1 != 0 || mask2 != 0);
770
771 reverse_pos(mask1 | mask2)
772 }
773
774 /// Compute the position of the last matching byte from the given masks. The
775 /// position returned is always in the range [0, 15]. Each mask corresponds to
776 /// the equality comparison of a single byte.
777 ///
778 /// The masks given are expected to be the result of _mm_movemask_epi8, where
779 /// at least one of the masks is non-zero (i.e., indicates a match).
reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize780 fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
781 debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
782
783 reverse_pos(mask1 | mask2 | mask3)
784 }
785
/// Subtract `b` from `a` and return the difference. `a` should be greater than
/// or equal to `b`.
fn sub(a: *const u8, b: *const u8) -> usize {
    let (a, b) = (a as usize, b as usize);
    debug_assert!(a >= b);
    a - b
}
792