use core::arch::x86_64::*;
use core::cmp;
use core::mem::size_of;

const VECTOR_SIZE: usize = size_of::<__m128i>();
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

// The number of bytes to loop at in one iteration of memchr/memrchr.
const LOOP_SIZE: usize = 4 * VECTOR_SIZE;

// The number of bytes to loop at in one iteration of memchr2/memrchr2 and
// memchr3/memrchr3. There was no observable difference between 64 and 32
// bytes in benchmarks. memchr3 in particular only gets a very slight speed up
// from the loop unrolling.
const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;

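/// Returns the index of the first occurrence of `n1` in `haystack`, or `None`
/// if no such byte exists.
///
/// # Safety
///
/// Callers must ensure that SSE2 is available before calling this function
/// (on `x86_64`, SSE2 is part of the baseline instruction set).
///
/// A minimal usage sketch. The `sse2` module path is an assumption, and the
/// block is `ignore`d since this module may not be publicly reachable from
/// doctests:
///
/// ```ignore
/// let haystack = b"the quick brown fox";
/// // SSE2 is part of the x86_64 baseline, so this call is sound there.
/// let at = unsafe { sse2::memchr(b'q', haystack) };
/// assert_eq!(at, Some(4));
/// ```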
#[target_feature(enable = "sse2")]
pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // What follows is a fast SSE2-only algorithm to detect the position of
    // `n1` in `haystack` if it exists. From what I know, this is the
    // "classic" algorithm. I believe it can be found in places like glibc
    // and Go's standard library. It appears to be well known and is
    // elaborated on in more detail here:
    // https://gms.tf/stdfind-and-memchr-optimizations.html
    //
    // While this routine is very long, the basic idea is actually very
    // simple and can be expressed straightforwardly in pseudo code:
    //
    //     needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1
    //     // Note: shift amount in bytes
    //
    //     while i <= haystack.len() - 16:
    //       // A 16 byte vector. Each byte in chunk corresponds to a byte in
    //       // the haystack.
    //       chunk = haystack[i:i+16]
    //       // Compare bytes in needle with bytes in chunk. The result is a
    //       // 16 byte chunk where each byte is 0xFF if the corresponding
    //       // bytes in needle and chunk were equal, or 0x00 otherwise.
    //       eqs = cmpeq(needle, chunk)
    //       // Return a 32 bit integer where the most significant 16 bits
    //       // are always 0 and the lower 16 bits correspond to whether the
    //       // most significant bit in the corresponding byte in `eqs` is
    //       // set. In other words, `mask as u16` has bit i set if and only
    //       // if needle[i] == chunk[i].
    //       mask = movemask(eqs)
    //
    //       // Mask is 0 if there is no match, and non-zero otherwise.
    //       if mask != 0:
    //         // trailing_zeros tells us the position of the least
    //         // significant bit that is set.
    //         return i + trailing_zeros(mask)
    //
    //     // haystack length may not be a multiple of 16, so search the rest.
    //     while i < haystack.len():
    //       if haystack[i] == n1:
    //         return i
    //
    //     // No match found.
    //     return NULL
    //
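    // As a concrete (hypothetical) example: if n1 = b'z' and the chunk is
    // b"abzz" followed by 12 non-'z' bytes, then eqs has 0xFF at bytes 2 and
    // 3 and 0x00 everywhere else, movemask returns 0b1100, and
    // trailing_zeros(0b1100) = 2, the position of the first matching byte.
    //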
    // In fact, we could loosely translate the above code to Rust
    // line-for-line and it would be a pretty fast algorithm. But, we pull
    // out all the stops to go as fast as possible:
    //
    // 1. We use aligned loads. That is, we do some finagling to make sure
    //    our primary loop not only proceeds in increments of 16 bytes, but
    //    that the address of haystack's pointer that we dereference is
    //    aligned to 16 bytes. 16 is a magic number here because it is the
    //    size of an SSE2 128-bit vector. (For the AVX2 algorithm, 32 is the
    //    magic number.) Therefore, to get aligned loads, our pointer's
    //    address must be evenly divisible by 16.
    // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
    //    kind of like loop unrolling, but we combine the equality
    //    comparisons using a vector OR such that we only need to extract a
    //    single mask to determine whether a match exists or not. If so, then
    //    we do some book-keeping to determine the precise location but
    //    otherwise mush on.
    // 3. We use our "chunk" comparison routine in as many places as
    //    possible, even if it means using unaligned loads. In particular,
    //    if haystack starts with an unaligned address, then we do an
    //    unaligned load to search the first 16 bytes. We then start our
    //    primary loop at the smallest subsequent aligned address, which
    //    will actually overlap with previously searched bytes. But we're OK
    //    with that. We do a similar dance at the end of our primary loop.
    //    Finally, to avoid a byte-at-a-time loop at the end, we do a final
    //    16 byte unaligned load that may overlap with a previous load. This
    //    is OK because it converts a loop into a small number of very fast
    //    vector instructions.
    //
    // The primary downside of this algorithm is that it's effectively
    // completely unsafe. Therefore, we have to be super careful to avoid
    // undefined behavior:
    //
    // 1. We use raw pointers everywhere. Not only does dereferencing a
    //    pointer require the pointer to be valid, but we actually can't even
    //    store the address of an invalid pointer (unless it's 1 past the end
    //    of haystack) without sacrificing performance.
    // 2. _mm_loadu_si128 is used when you don't care about alignment, and
    //    _mm_load_si128 is used when you do care. You cannot use the latter
    //    on unaligned pointers.
    // 3. We make liberal use of debug_assert! to check assumptions.
    // 4. We make a concerted effort to stick with pointers instead of
    //    indices. Indices are nicer because there's less to worry about with
    //    them (see above about pointer offsets), but I could not get the
    //    compiler to produce as good of code as what the below produces. In
    //    any case, pointers are what we really care about here, and
    //    alignment is expressed a bit more naturally with them.
    //
    // In general, most of the algorithms in this crate have a similar
    // structure to what you see below, so this comment applies fairly well
    // to all of them.

    let vn1 = _mm_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

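    // Advance ptr to the first 16-byte aligned address strictly after
    // start_ptr. (If start_ptr is already aligned, this skips a full vector,
    // which forward_search1 above has just searched.) For example, if
    // start_ptr ends in 0x9, then start_ptr as usize & VECTOR_ALIGN == 9,
    // so we advance by 16 - 9 = 7 bytes.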
    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
        let eqa = _mm_cmpeq_epi8(vn1, a);
        let eqb = _mm_cmpeq_epi8(vn1, b);
        let eqc = _mm_cmpeq_epi8(vn1, c);
        let eqd = _mm_cmpeq_epi8(vn1, d);
        let or1 = _mm_or_si128(eqa, eqb);
        let or2 = _mm_or_si128(eqc, eqd);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask = _mm_movemask_epi8(eqa);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqd);
            debug_assert!(mask != 0);
            return Some(at + forward_pos(mask));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);

        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
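    // Fewer than 16 bytes remain. Instead of finishing with a byte-at-a-time
    // loop, back ptr up so that exactly 16 bytes remain and do one final
    // unaligned load. This load may overlap bytes that were already
    // searched, which is fine: those bytes are known not to match, so the
    // first match in this final chunk is the first match overall.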
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search1(start_ptr, end_ptr, ptr, vn1);
    }
    None
}

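/// Returns the index of the first occurrence of `n1` or `n2` in `haystack`,
/// or `None` if neither byte exists.
///
/// # Safety
///
/// Callers must ensure that SSE2 is available before calling this function.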
#[target_feature(enable = "sse2")]
pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + forward_pos2(mask1, mask2));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            return Some(at + forward_pos2(mask1, mask2));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
    }
    None
}

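/// Returns the index of the first occurrence of `n1`, `n2` or `n3` in
/// `haystack`, or `None` if none of them exist.
///
/// # Safety
///
/// Callers must ensure that SSE2 is available before calling this function.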
#[target_feature(enable = "sse2")]
pub unsafe fn memchr3(
    n1: u8,
    n2: u8,
    n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let vn3 = _mm_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let eqa3 = _mm_cmpeq_epi8(vn3, a);
        let eqb3 = _mm_cmpeq_epi8(vn3, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(eqa3, eqb3);
        let or4 = _mm_or_si128(or1, or2);
        let or5 = _mm_or_si128(or3, or4);
        if _mm_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            let mask3 = _mm_movemask_epi8(eqa3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + forward_pos3(mask1, mask2, mask3));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            let mask3 = _mm_movemask_epi8(eqb3);
            return Some(at + forward_pos3(mask1, mask2, mask3));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) =
            forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
        {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
    }
    None
}

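/// Returns the index of the last occurrence of `n1` in `haystack`, or `None`
/// if no such byte exists.
///
/// # Safety
///
/// Callers must ensure that SSE2 is available before calling this function.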
#[target_feature(enable = "sse2")]
pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

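    // Search the last 16 bytes of the haystack with a single unaligned load
    // before falling into the aligned reverse loop below.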
    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

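    // Round end_ptr down to the largest 16-byte aligned address at or before
    // it. Any bytes between that address and end_ptr were just searched by
    // reverse_search1 above.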
    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
        let eqa = _mm_cmpeq_epi8(vn1, a);
        let eqb = _mm_cmpeq_epi8(vn1, b);
        let eqc = _mm_cmpeq_epi8(vn1, c);
        let eqd = _mm_cmpeq_epi8(vn1, d);
        let or1 = _mm_or_si128(eqa, eqb);
        let or2 = _mm_or_si128(eqc, eqd);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
            let mask = _mm_movemask_epi8(eqd);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqa);
            debug_assert!(mask != 0);
            return Some(at + reverse_pos(mask));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
    }
    None
}

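/// Returns the index of the last occurrence of `n1` or `n2` in `haystack`,
/// or `None` if neither byte exists.
///
/// # Safety
///
/// Callers must ensure that SSE2 is available before calling this function.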
#[target_feature(enable = "sse2")]
pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + reverse_pos2(mask1, mask2));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            return Some(at + reverse_pos2(mask1, mask2));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
    }
    None
}

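/// Returns the index of the last occurrence of `n1`, `n2` or `n3` in
/// `haystack`, or `None` if none of them exist.
///
/// # Safety
///
/// Callers must ensure that SSE2 is available before calling this function.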
#[target_feature(enable = "sse2")]
pub unsafe fn memrchr3(
    n1: u8,
    n2: u8,
    n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let vn3 = _mm_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = haystack[haystack.len()..].as_ptr();
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let eqa3 = _mm_cmpeq_epi8(vn3, a);
        let eqb3 = _mm_cmpeq_epi8(vn3, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(eqa3, eqb3);
        let or4 = _mm_or_si128(or1, or2);
        let or5 = _mm_or_si128(or3, or4);
        if _mm_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            let mask3 = _mm_movemask_epi8(eqb3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + reverse_pos3(mask1, mask2, mask3));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            let mask3 = _mm_movemask_epi8(eqa3);
            return Some(at + reverse_pos3(mask1, mask2, mask3));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) =
            reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
        {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
    }
    None
}

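/// Searches the 16 bytes starting at `ptr` for a byte equal to the one
/// splatted across `vn1`, using a single unaligned load. Returns the offset
/// of the first match relative to `start_ptr`, if any.
///
/// # Safety
///
/// Callers must ensure that SSE2 is available and that `ptr` points at least
/// 16 bytes before `end_ptr`, as the debug assertions below check.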
#[target_feature(enable = "sse2")]
pub unsafe fn forward_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + forward_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn forward_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
pub unsafe fn forward_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
    vn3: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
    let or = _mm_or_si128(eq1, eq2);
    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        let mask3 = _mm_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + reverse_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
    vn3: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
    let or = _mm_or_si128(eq1, eq2);
    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        let mask3 = _mm_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

/// Compute the position of the first matching byte from the given mask. The
/// position returned is always in the range [0, 15].
///
/// The mask given is expected to be the result of _mm_movemask_epi8.
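///
/// For example, a mask of 0b0100_0100 has its least significant set bit at
/// bit 2, so the first matching byte in the chunk is at position 2.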
fn forward_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant
    // byte is at a higher address. That means the least significant bit that
    // is set corresponds to the position of our first matching byte. That
    // position is exactly the number of trailing zeros, i.e., the number of
    // zero bits below the least significant set bit.
    mask.trailing_zeros() as usize
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    forward_pos(mask1 | mask2)
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    forward_pos(mask1 | mask2 | mask3)
}

/// Compute the position of the last matching byte from the given mask. The
/// position returned is always in the range [0, 15].
///
/// The mask given is expected to be the result of _mm_movemask_epi8.
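///
/// For example, a mask of 0b0010_0100 (matches at positions 2 and 5) has 10
/// leading zeros as a u16, so the last matching byte is at position
/// 16 - 10 - 1 = 5.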
fn reverse_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant
    // byte is at a higher address. That means the most significant bit that
    // is set corresponds to the position of our last matching byte. The
    // position from the end of the mask is therefore the number of leading
    // zeros in a 16 bit integer, and the position from the start of the mask
    // is therefore 16 - (leading zeros) - 1.
    VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    reverse_pos(mask1 | mask2)
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    reverse_pos(mask1 | mask2 | mask3)
}

/// Subtract `b` from `a` and return the difference. `a` should be greater
/// than or equal to `b`.
fn sub(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}