/*
 * Copyright (c) 2015-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Vermicelli: Intel SSE implementation.
 *
 * (users should include vermicelli.h)
 */
35 #if !defined(HAVE_AVX512)
36
37 #define VERM_BOUNDARY 16
38 #define VERM_TYPE m128
39 #define VERM_SET_FN set16x8
40
41 static really_inline
vermSearchAligned(m128 chars,const u8 * buf,const u8 * buf_end,char negate)42 const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
43 char negate) {
44 assert((size_t)buf % 16 == 0);
45 for (; buf + 31 < buf_end; buf += 32) {
46 m128 data = load128(buf);
47 u32 z1 = movemask128(eq128(chars, data));
48 m128 data2 = load128(buf + 16);
49 u32 z2 = movemask128(eq128(chars, data2));
50 u32 z = z1 | (z2 << 16);
51 if (negate) {
52 z = ~z;
53 }
54 if (unlikely(z)) {
55 u32 pos = ctz32(z);
56 return buf + pos;
57 }
58 }
59 for (; buf + 15 < buf_end; buf += 16) {
60 m128 data = load128(buf);
61 u32 z = movemask128(eq128(chars, data));
62 if (negate) {
63 z = ~z & 0xffff;
64 }
65 if (unlikely(z)) {
66 u32 pos = ctz32(z);
67 return buf + pos;
68 }
69 }
70 return NULL;
71 }
72
73 static really_inline
vermSearchAlignedNocase(m128 chars,const u8 * buf,const u8 * buf_end,char negate)74 const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf,
75 const u8 *buf_end, char negate) {
76 assert((size_t)buf % 16 == 0);
77 m128 casemask = set16x8(CASE_CLEAR);
78
79 for (; buf + 31 < buf_end; buf += 32) {
80 m128 data = load128(buf);
81 u32 z1 = movemask128(eq128(chars, and128(casemask, data)));
82 m128 data2 = load128(buf + 16);
83 u32 z2 = movemask128(eq128(chars, and128(casemask, data2)));
84 u32 z = z1 | (z2 << 16);
85 if (negate) {
86 z = ~z;
87 }
88 if (unlikely(z)) {
89 u32 pos = ctz32(z);
90 return buf + pos;
91 }
92 }
93
94 for (; buf + 15 < buf_end; buf += 16) {
95 m128 data = load128(buf);
96 u32 z = movemask128(eq128(chars, and128(casemask, data)));
97 if (negate) {
98 z = ~z & 0xffff;
99 }
100 if (unlikely(z)) {
101 u32 pos = ctz32(z);
102 return buf + pos;
103 }
104 }
105 return NULL;
106 }
107
108 // returns NULL if not found
109 static really_inline
vermUnalign(m128 chars,const u8 * buf,char negate)110 const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) {
111 m128 data = loadu128(buf); // unaligned
112 u32 z = movemask128(eq128(chars, data));
113 if (negate) {
114 z = ~z & 0xffff;
115 }
116 if (unlikely(z)) {
117 return buf + ctz32(z);
118 }
119 return NULL;
120 }
121
122 // returns NULL if not found
123 static really_inline
vermUnalignNocase(m128 chars,const u8 * buf,char negate)124 const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) {
125 m128 casemask = set16x8(CASE_CLEAR);
126 m128 data = loadu128(buf); // unaligned
127 u32 z = movemask128(eq128(chars, and128(casemask, data)));
128 if (negate) {
129 z = ~z & 0xffff;
130 }
131 if (unlikely(z)) {
132 return buf + ctz32(z);
133 }
134 return NULL;
135 }
136
137 static really_inline
dvermSearchAligned(m128 chars1,m128 chars2,u8 c1,u8 c2,const u8 * buf,const u8 * buf_end)138 const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
139 const u8 *buf, const u8 *buf_end) {
140 for (; buf + 16 < buf_end; buf += 16) {
141 m128 data = load128(buf);
142 u32 z = movemask128(and128(eq128(chars1, data),
143 rshiftbyte_m128(eq128(chars2, data), 1)));
144 if (buf[15] == c1 && buf[16] == c2) {
145 z |= (1 << 15);
146 }
147 if (unlikely(z)) {
148 u32 pos = ctz32(z);
149 return buf + pos;
150 }
151 }
152
153 return NULL;
154 }
155
156 static really_inline
dvermSearchAlignedNocase(m128 chars1,m128 chars2,u8 c1,u8 c2,const u8 * buf,const u8 * buf_end)157 const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
158 const u8 *buf, const u8 *buf_end) {
159 assert((size_t)buf % 16 == 0);
160 m128 casemask = set16x8(CASE_CLEAR);
161
162 for (; buf + 16 < buf_end; buf += 16) {
163 m128 data = load128(buf);
164 m128 v = and128(casemask, data);
165 u32 z = movemask128(and128(eq128(chars1, v),
166 rshiftbyte_m128(eq128(chars2, v), 1)));
167 if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) {
168 z |= (1 << 15);
169 }
170 if (unlikely(z)) {
171 u32 pos = ctz32(z);
172 return buf + pos;
173 }
174 }
175
176 return NULL;
177 }
178
179 static really_inline
dvermSearchAlignedMasked(m128 chars1,m128 chars2,m128 mask1,m128 mask2,u8 c1,u8 c2,u8 m1,u8 m2,const u8 * buf,const u8 * buf_end)180 const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2,
181 m128 mask1, m128 mask2, u8 c1, u8 c2, u8 m1,
182 u8 m2, const u8 *buf, const u8 *buf_end) {
183 assert((size_t)buf % 16 == 0);
184
185 for (; buf + 16 < buf_end; buf += 16) {
186 m128 data = load128(buf);
187 m128 v1 = eq128(chars1, and128(data, mask1));
188 m128 v2 = eq128(chars2, and128(data, mask2));
189 u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
190
191 if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) {
192 z |= (1 << 15);
193 }
194 if (unlikely(z)) {
195 u32 pos = ctz32(z);
196 return buf + pos;
197 }
198 }
199
200 return NULL;
201 }
202
203 // returns NULL if not found
204 static really_inline
dvermPrecondition(m128 chars1,m128 chars2,const u8 * buf)205 const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
206 m128 data = loadu128(buf); // unaligned
207 u32 z = movemask128(and128(eq128(chars1, data),
208 rshiftbyte_m128(eq128(chars2, data), 1)));
209
210 /* no fixup of the boundary required - the aligned run will pick it up */
211 if (unlikely(z)) {
212 u32 pos = ctz32(z);
213 return buf + pos;
214 }
215 return NULL;
216 }
217
218 // returns NULL if not found
219 static really_inline
dvermPreconditionNocase(m128 chars1,m128 chars2,const u8 * buf)220 const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
221 /* due to laziness, nonalphas and nocase having interesting behaviour */
222 m128 casemask = set16x8(CASE_CLEAR);
223 m128 data = loadu128(buf); // unaligned
224 m128 v = and128(casemask, data);
225 u32 z = movemask128(and128(eq128(chars1, v),
226 rshiftbyte_m128(eq128(chars2, v), 1)));
227
228 /* no fixup of the boundary required - the aligned run will pick it up */
229 if (unlikely(z)) {
230 u32 pos = ctz32(z);
231 return buf + pos;
232 }
233 return NULL;
234 }
235
236 // returns NULL if not found
237 static really_inline
dvermPreconditionMasked(m128 chars1,m128 chars2,m128 mask1,m128 mask2,const u8 * buf)238 const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2,
239 m128 mask1, m128 mask2, const u8 *buf) {
240 m128 data = loadu128(buf); // unaligned
241 m128 v1 = eq128(chars1, and128(data, mask1));
242 m128 v2 = eq128(chars2, and128(data, mask2));
243 u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
244
245 /* no fixup of the boundary required - the aligned run will pick it up */
246 if (unlikely(z)) {
247 u32 pos = ctz32(z);
248 return buf + pos;
249 }
250 return NULL;
251 }
252
253 static really_inline
lastMatchOffset(const u8 * buf_end,u32 z)254 const u8 *lastMatchOffset(const u8 *buf_end, u32 z) {
255 assert(z);
256 return buf_end - 16 + 31 - clz32(z);
257 }
258
259 static really_inline
rvermSearchAligned(m128 chars,const u8 * buf,const u8 * buf_end,char negate)260 const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
261 char negate) {
262 assert((size_t)buf_end % 16 == 0);
263 for (; buf + 15 < buf_end; buf_end -= 16) {
264 m128 data = load128(buf_end - 16);
265 u32 z = movemask128(eq128(chars, data));
266 if (negate) {
267 z = ~z & 0xffff;
268 }
269 if (unlikely(z)) {
270 return lastMatchOffset(buf_end, z);
271 }
272 }
273 return NULL;
274 }
275
276 static really_inline
rvermSearchAlignedNocase(m128 chars,const u8 * buf,const u8 * buf_end,char negate)277 const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf,
278 const u8 *buf_end, char negate) {
279 assert((size_t)buf_end % 16 == 0);
280 m128 casemask = set16x8(CASE_CLEAR);
281
282 for (; buf + 15 < buf_end; buf_end -= 16) {
283 m128 data = load128(buf_end - 16);
284 u32 z = movemask128(eq128(chars, and128(casemask, data)));
285 if (negate) {
286 z = ~z & 0xffff;
287 }
288 if (unlikely(z)) {
289 return lastMatchOffset(buf_end, z);
290 }
291 }
292 return NULL;
293 }
294
295 // returns NULL if not found
296 static really_inline
rvermUnalign(m128 chars,const u8 * buf,char negate)297 const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) {
298 m128 data = loadu128(buf); // unaligned
299 u32 z = movemask128(eq128(chars, data));
300 if (negate) {
301 z = ~z & 0xffff;
302 }
303 if (unlikely(z)) {
304 return lastMatchOffset(buf + 16, z);
305 }
306 return NULL;
307 }
308
309 // returns NULL if not found
310 static really_inline
rvermUnalignNocase(m128 chars,const u8 * buf,char negate)311 const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) {
312 m128 casemask = set16x8(CASE_CLEAR);
313 m128 data = loadu128(buf); // unaligned
314 u32 z = movemask128(eq128(chars, and128(casemask, data)));
315 if (negate) {
316 z = ~z & 0xffff;
317 }
318 if (unlikely(z)) {
319 return lastMatchOffset(buf + 16, z);
320 }
321 return NULL;
322 }
323
324 static really_inline
rdvermSearchAligned(m128 chars1,m128 chars2,u8 c1,u8 c2,const u8 * buf,const u8 * buf_end)325 const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
326 const u8 *buf, const u8 *buf_end) {
327 assert((size_t)buf_end % 16 == 0);
328
329 for (; buf + 16 < buf_end; buf_end -= 16) {
330 m128 data = load128(buf_end - 16);
331 u32 z = movemask128(and128(eq128(chars2, data),
332 lshiftbyte_m128(eq128(chars1, data), 1)));
333 if (buf_end[-17] == c1 && buf_end[-16] == c2) {
334 z |= 1;
335 }
336 if (unlikely(z)) {
337 return lastMatchOffset(buf_end, z);
338 }
339 }
340 return buf_end;
341 }
342
343 static really_inline
rdvermSearchAlignedNocase(m128 chars1,m128 chars2,u8 c1,u8 c2,const u8 * buf,const u8 * buf_end)344 const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
345 const u8 *buf, const u8 *buf_end) {
346 assert((size_t)buf_end % 16 == 0);
347 m128 casemask = set16x8(CASE_CLEAR);
348
349 for (; buf + 16 < buf_end; buf_end -= 16) {
350 m128 data = load128(buf_end - 16);
351 m128 v = and128(casemask, data);
352 u32 z = movemask128(and128(eq128(chars2, v),
353 lshiftbyte_m128(eq128(chars1, v), 1)));
354 if ((buf_end[-17] & CASE_CLEAR) == c1
355 && (buf_end[-16] & CASE_CLEAR) == c2) {
356 z |= 1;
357 }
358 if (unlikely(z)) {
359 return lastMatchOffset(buf_end, z);
360 }
361 }
362 return buf_end;
363 }
364
365 // returns NULL if not found
366 static really_inline
rdvermPrecondition(m128 chars1,m128 chars2,const u8 * buf)367 const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
368 m128 data = loadu128(buf);
369 u32 z = movemask128(and128(eq128(chars2, data),
370 lshiftbyte_m128(eq128(chars1, data), 1)));
371
372 /* no fixup of the boundary required - the aligned run will pick it up */
373 if (unlikely(z)) {
374 return lastMatchOffset(buf + 16, z);
375 }
376
377 return NULL;
378 }
379
380 // returns NULL if not found
381 static really_inline
rdvermPreconditionNocase(m128 chars1,m128 chars2,const u8 * buf)382 const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
383 /* due to laziness, nonalphas and nocase having interesting behaviour */
384 m128 casemask = set16x8(CASE_CLEAR);
385 m128 data = loadu128(buf);
386 m128 v = and128(casemask, data);
387 u32 z = movemask128(and128(eq128(chars2, v),
388 lshiftbyte_m128(eq128(chars1, v), 1)));
389 /* no fixup of the boundary required - the aligned run will pick it up */
390 if (unlikely(z)) {
391 return lastMatchOffset(buf + 16, z);
392 }
393
394 return NULL;
395 }
396
397 #else // HAVE_AVX512
398
399 #define VERM_BOUNDARY 64
400 #define VERM_TYPE m512
401 #define VERM_SET_FN set64x8
402
403 static really_inline
vermMini(m512 chars,const u8 * buf,const u8 * buf_end,char negate)404 const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) {
405 uintptr_t len = buf_end - buf;
406 __mmask64 mask = (~0ULL) >> (64 - len);
407 m512 data = loadu_maskz_m512(mask, buf);
408
409 u64a z = eq512mask(chars, data);
410
411 if (negate) {
412 z = ~z & mask;
413 }
414 z &= mask;
415 if (unlikely(z)) {
416 return buf + ctz64(z);
417 }
418 return NULL;
419 }
420
421 static really_inline
vermMiniNocase(m512 chars,const u8 * buf,const u8 * buf_end,char negate)422 const u8 *vermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end,
423 char negate) {
424 uintptr_t len = buf_end - buf;
425 __mmask64 mask = (~0ULL) >> (64 - len);
426 m512 data = loadu_maskz_m512(mask, buf);
427 m512 casemask = set64x8(CASE_CLEAR);
428 m512 v = and512(casemask, data);
429
430 u64a z = eq512mask(chars, v);
431
432 if (negate) {
433 z = ~z & mask;
434 }
435 z &= mask;
436 if (unlikely(z)) {
437 return buf + ctz64(z);
438 }
439 return NULL;
440 }
441
442 static really_inline
vermSearchAligned(m512 chars,const u8 * buf,const u8 * buf_end,char negate)443 const u8 *vermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end,
444 char negate) {
445 assert((size_t)buf % 64 == 0);
446 for (; buf + 63 < buf_end; buf += 64) {
447 m512 data = load512(buf);
448 u64a z = eq512mask(chars, data);
449 if (negate) {
450 z = ~z & ~0ULL;
451 }
452 if (unlikely(z)) {
453 u64a pos = ctz64(z);
454 return buf + pos;
455 }
456 }
457 return NULL;
458 }
459
460 static really_inline
vermSearchAlignedNocase(m512 chars,const u8 * buf,const u8 * buf_end,char negate)461 const u8 *vermSearchAlignedNocase(m512 chars, const u8 *buf,
462 const u8 *buf_end, char negate) {
463 assert((size_t)buf % 64 == 0);
464 m512 casemask = set64x8(CASE_CLEAR);
465
466 for (; buf + 63 < buf_end; buf += 64) {
467 m512 data = load512(buf);
468 u64a z = eq512mask(chars, and512(casemask, data));
469 if (negate) {
470 z = ~z & ~0ULL;
471 }
472 if (unlikely(z)) {
473 u64a pos = ctz64(z);
474 return buf + pos;
475 }
476 }
477 return NULL;
478 }
479
480 // returns NULL if not found
481 static really_inline
vermUnalign(m512 chars,const u8 * buf,char negate)482 const u8 *vermUnalign(m512 chars, const u8 *buf, char negate) {
483 m512 data = loadu512(buf); // unaligned
484 u64a z = eq512mask(chars, data);
485 if (negate) {
486 z = ~z & ~0ULL;
487 }
488 if (unlikely(z)) {
489 return buf + ctz64(z);
490 }
491 return NULL;
492 }
493
494 // returns NULL if not found
495 static really_inline
vermUnalignNocase(m512 chars,const u8 * buf,char negate)496 const u8 *vermUnalignNocase(m512 chars, const u8 *buf, char negate) {
497 m512 casemask = set64x8(CASE_CLEAR);
498 m512 data = loadu512(buf); // unaligned
499 u64a z = eq512mask(chars, and512(casemask, data));
500 if (negate) {
501 z = ~z & ~0ULL;
502 }
503 if (unlikely(z)) {
504 return buf + ctz64(z);
505 }
506 return NULL;
507 }
508
509 static really_inline
dvermMini(m512 chars1,m512 chars2,const u8 * buf,const u8 * buf_end)510 const u8 *dvermMini(m512 chars1, m512 chars2, const u8 *buf,
511 const u8 *buf_end) {
512 uintptr_t len = buf_end - buf;
513 __mmask64 mask = (~0ULL) >> (64 - len);
514 m512 data = loadu_maskz_m512(mask, buf);
515
516 u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1);
517
518 z &= mask;
519 if (unlikely(z)) {
520 u64a pos = ctz64(z);
521 return buf + pos;
522 }
523 return NULL;
524 }
525
526 static really_inline
dvermMiniNocase(m512 chars1,m512 chars2,const u8 * buf,const u8 * buf_end)527 const u8 *dvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf,
528 const u8 *buf_end) {
529 uintptr_t len = buf_end - buf;
530 __mmask64 mask = (~0ULL) >> (64 - len);
531 m512 data = loadu_maskz_m512(mask, buf);
532 m512 casemask = set64x8(CASE_CLEAR);
533 m512 v = and512(casemask, data);
534
535 u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1);
536
537 z &= mask;
538 if (unlikely(z)) {
539 u64a pos = ctz64(z);
540 return buf + pos;
541 }
542 return NULL;
543 }
544
545 static really_inline
dvermMiniMasked(m512 chars1,m512 chars2,m512 mask1,m512 mask2,const u8 * buf,const u8 * buf_end)546 const u8 *dvermMiniMasked(m512 chars1, m512 chars2, m512 mask1, m512 mask2,
547 const u8 *buf, const u8 *buf_end) {
548 uintptr_t len = buf_end - buf;
549 __mmask64 mask = (~0ULL) >> (64 - len);
550 m512 data = loadu_maskz_m512(mask, buf);
551 m512 v1 = and512(data, mask1);
552 m512 v2 = and512(data, mask2);
553
554 u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1);
555
556 z &= mask;
557 if (unlikely(z)) {
558 u64a pos = ctz64(z);
559 return buf + pos;
560 }
561 return NULL;
562 }
563
564 static really_inline
dvermSearchAligned(m512 chars1,m512 chars2,u8 c1,u8 c2,const u8 * buf,const u8 * buf_end)565 const u8 *dvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2,
566 const u8 *buf, const u8 *buf_end) {
567 for (; buf + 64 < buf_end; buf += 64) {
568 m512 data = load512(buf);
569 u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1);
570 if (buf[63] == c1 && buf[64] == c2) {
571 z |= (1ULL << 63);
572 }
573 if (unlikely(z)) {
574 u64a pos = ctz64(z);
575 return buf + pos;
576 }
577 }
578
579 return NULL;
580 }
581
582 static really_inline
dvermSearchAlignedNocase(m512 chars1,m512 chars2,u8 c1,u8 c2,const u8 * buf,const u8 * buf_end)583 const u8 *dvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2,
584 const u8 *buf, const u8 *buf_end) {
585 assert((size_t)buf % 64 == 0);
586 m512 casemask = set64x8(CASE_CLEAR);
587
588 for (; buf + 64 < buf_end; buf += 64) {
589 m512 data = load512(buf);
590 m512 v = and512(casemask, data);
591 u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1);
592 if ((buf[63] & CASE_CLEAR) == c1 && (buf[64] & CASE_CLEAR) == c2) {
593 z |= (1ULL << 63);
594 }
595 if (unlikely(z)) {
596 u64a pos = ctz64(z);
597 return buf + pos;
598 }
599 }
600
601 return NULL;
602 }
603
604 static really_inline
dvermSearchAlignedMasked(m512 chars1,m512 chars2,m512 mask1,m512 mask2,u8 c1,u8 c2,u8 m1,u8 m2,const u8 * buf,const u8 * buf_end)605 const u8 *dvermSearchAlignedMasked(m512 chars1, m512 chars2,
606 m512 mask1, m512 mask2, u8 c1, u8 c2, u8 m1,
607 u8 m2, const u8 *buf, const u8 *buf_end) {
608 assert((size_t)buf % 64 == 0);
609
610 for (; buf + 64 < buf_end; buf += 64) {
611 m512 data = load512(buf);
612 m512 v1 = and512(data, mask1);
613 m512 v2 = and512(data, mask2);
614 u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1);
615
616 if ((buf[63] & m1) == c1 && (buf[64] & m2) == c2) {
617 z |= (1ULL << 63);
618 }
619 if (unlikely(z)) {
620 u64a pos = ctz64(z);
621 return buf + pos;
622 }
623 }
624
625 return NULL;
626 }
627
628 // returns NULL if not found
629 static really_inline
dvermPrecondition(m512 chars1,m512 chars2,const u8 * buf)630 const u8 *dvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) {
631 m512 data = loadu512(buf); // unaligned
632 u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1);
633
634 /* no fixup of the boundary required - the aligned run will pick it up */
635 if (unlikely(z)) {
636 u64a pos = ctz64(z);
637 return buf + pos;
638 }
639 return NULL;
640 }
641
642 // returns NULL if not found
643 static really_inline
dvermPreconditionNocase(m512 chars1,m512 chars2,const u8 * buf)644 const u8 *dvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) {
645 /* due to laziness, nonalphas and nocase having interesting behaviour */
646 m512 casemask = set64x8(CASE_CLEAR);
647 m512 data = loadu512(buf); // unaligned
648 m512 v = and512(casemask, data);
649 u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1);
650
651 /* no fixup of the boundary required - the aligned run will pick it up */
652 if (unlikely(z)) {
653 u64a pos = ctz64(z);
654 return buf + pos;
655 }
656 return NULL;
657 }
658
659 // returns NULL if not found
660 static really_inline
dvermPreconditionMasked(m512 chars1,m512 chars2,m512 mask1,m512 mask2,const u8 * buf)661 const u8 *dvermPreconditionMasked(m512 chars1, m512 chars2,
662 m512 mask1, m512 mask2, const u8 *buf) {
663 m512 data = loadu512(buf); // unaligned
664 m512 v1 = and512(data, mask1);
665 m512 v2 = and512(data, mask2);
666 u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1);
667
668 /* no fixup of the boundary required - the aligned run will pick it up */
669 if (unlikely(z)) {
670 u64a pos = ctz64(z);
671 return buf + pos;
672 }
673 return NULL;
674 }
675
676 static really_inline
lastMatchOffset(const u8 * buf_end,u64a z)677 const u8 *lastMatchOffset(const u8 *buf_end, u64a z) {
678 assert(z);
679 return buf_end - 64 + 63 - clz64(z);
680 }
681
682 static really_inline
rvermMini(m512 chars,const u8 * buf,const u8 * buf_end,char negate)683 const u8 *rvermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) {
684 uintptr_t len = buf_end - buf;
685 __mmask64 mask = (~0ULL) >> (64 - len);
686 m512 data = loadu_maskz_m512(mask, buf);
687
688 u64a z = eq512mask(chars, data);
689
690 if (negate) {
691 z = ~z & mask;
692 }
693 z &= mask;
694 if (unlikely(z)) {
695 return lastMatchOffset(buf + 64, z);
696 }
697 return NULL;
698 }
699
700 static really_inline
rvermMiniNocase(m512 chars,const u8 * buf,const u8 * buf_end,char negate)701 const u8 *rvermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end,
702 char negate) {
703 uintptr_t len = buf_end - buf;
704 __mmask64 mask = (~0ULL) >> (64 - len);
705 m512 data = loadu_maskz_m512(mask, buf);
706 m512 casemask = set64x8(CASE_CLEAR);
707 m512 v = and512(casemask, data);
708
709 u64a z = eq512mask(chars, v);
710
711 if (negate) {
712 z = ~z & mask;
713 }
714 z &= mask;
715 if (unlikely(z)) {
716 return lastMatchOffset(buf + 64, z);
717 }
718 return NULL;
719 }
720
721 static really_inline
rvermSearchAligned(m512 chars,const u8 * buf,const u8 * buf_end,char negate)722 const u8 *rvermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end,
723 char negate) {
724 assert((size_t)buf_end % 64 == 0);
725 for (; buf + 63 < buf_end; buf_end -= 64) {
726 m512 data = load512(buf_end - 64);
727 u64a z = eq512mask(chars, data);
728 if (negate) {
729 z = ~z & ~0ULL;
730 }
731 if (unlikely(z)) {
732 return lastMatchOffset(buf_end, z);
733 }
734 }
735 return NULL;
736 }
737
738 static really_inline
rvermSearchAlignedNocase(m512 chars,const u8 * buf,const u8 * buf_end,char negate)739 const u8 *rvermSearchAlignedNocase(m512 chars, const u8 *buf,
740 const u8 *buf_end, char negate) {
741 assert((size_t)buf_end % 64 == 0);
742 m512 casemask = set64x8(CASE_CLEAR);
743
744 for (; buf + 63 < buf_end; buf_end -= 64) {
745 m512 data = load512(buf_end - 64);
746 u64a z = eq512mask(chars, and512(casemask, data));
747 if (negate) {
748 z = ~z & ~0ULL;
749 }
750 if (unlikely(z)) {
751 return lastMatchOffset(buf_end, z);
752 }
753 }
754 return NULL;
755 }
756
757 // returns NULL if not found
758 static really_inline
rvermUnalign(m512 chars,const u8 * buf,char negate)759 const u8 *rvermUnalign(m512 chars, const u8 *buf, char negate) {
760 m512 data = loadu512(buf); // unaligned
761 u64a z = eq512mask(chars, data);
762 if (negate) {
763 z = ~z & ~0ULL;
764 }
765 if (unlikely(z)) {
766 return lastMatchOffset(buf + 64, z);
767 }
768 return NULL;
769 }
770
771 // returns NULL if not found
772 static really_inline
rvermUnalignNocase(m512 chars,const u8 * buf,char negate)773 const u8 *rvermUnalignNocase(m512 chars, const u8 *buf, char negate) {
774 m512 casemask = set64x8(CASE_CLEAR);
775 m512 data = loadu512(buf); // unaligned
776 u64a z = eq512mask(chars, and512(casemask, data));
777 if (negate) {
778 z = ~z & ~0ULL;
779 }
780 if (unlikely(z)) {
781 return lastMatchOffset(buf + 64, z);
782 }
783 return NULL;
784 }
785
786 static really_inline
rdvermMini(m512 chars1,m512 chars2,const u8 * buf,const u8 * buf_end)787 const u8 *rdvermMini(m512 chars1, m512 chars2, const u8 *buf,
788 const u8 *buf_end) {
789 uintptr_t len = buf_end - buf;
790 __mmask64 mask = (~0ULL) >> (64 - len);
791 m512 data = loadu_maskz_m512(mask, buf);
792
793 u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1);
794
795 z &= mask;
796 if (unlikely(z)) {
797 return lastMatchOffset(buf + 64, z);
798 }
799 return NULL;
800 }
801
802 static really_inline
rdvermMiniNocase(m512 chars1,m512 chars2,const u8 * buf,const u8 * buf_end)803 const u8 *rdvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf,
804 const u8 *buf_end) {
805 uintptr_t len = buf_end - buf;
806 __mmask64 mask = (~0ULL) >> (64 - len);
807 m512 data = loadu_maskz_m512(mask, buf);
808 m512 casemask = set64x8(CASE_CLEAR);
809 m512 v = and512(casemask, data);
810
811 u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1);
812
813 z &= mask;
814 if (unlikely(z)) {
815 return lastMatchOffset(buf + 64, z);
816 }
817 return NULL;
818 }
819
820 static really_inline
rdvermSearchAligned(m512 chars1,m512 chars2,u8 c1,u8 c2,const u8 * buf,const u8 * buf_end)821 const u8 *rdvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2,
822 const u8 *buf, const u8 *buf_end) {
823 assert((size_t)buf_end % 64 == 0);
824
825 for (; buf + 64 < buf_end; buf_end -= 64) {
826 m512 data = load512(buf_end - 64);
827 u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1);
828 if (buf_end[-65] == c1 && buf_end[-64] == c2) {
829 z |= 1;
830 }
831 if (unlikely(z)) {
832 return lastMatchOffset(buf_end, z);
833 }
834 }
835 return buf_end;
836 }
837
838 static really_inline
rdvermSearchAlignedNocase(m512 chars1,m512 chars2,u8 c1,u8 c2,const u8 * buf,const u8 * buf_end)839 const u8 *rdvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2,
840 const u8 *buf, const u8 *buf_end) {
841 assert((size_t)buf_end % 64 == 0);
842 m512 casemask = set64x8(CASE_CLEAR);
843
844 for (; buf + 64 < buf_end; buf_end -= 64) {
845 m512 data = load512(buf_end - 64);
846 m512 v = and512(casemask, data);
847 u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1);
848 if ((buf_end[-65] & CASE_CLEAR) == c1
849 && (buf_end[-64] & CASE_CLEAR) == c2) {
850 z |= 1;
851 }
852 if (unlikely(z)) {
853 return lastMatchOffset(buf_end, z);
854 }
855 }
856 return buf_end;
857 }
858
859 // returns NULL if not found
860 static really_inline
rdvermPrecondition(m512 chars1,m512 chars2,const u8 * buf)861 const u8 *rdvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) {
862 m512 data = loadu512(buf);
863 u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1);
864
865 // no fixup of the boundary required - the aligned run will pick it up
866 if (unlikely(z)) {
867 return lastMatchOffset(buf + 64, z);
868 }
869
870 return NULL;
871 }
872
873 // returns NULL if not found
874 static really_inline
rdvermPreconditionNocase(m512 chars1,m512 chars2,const u8 * buf)875 const u8 *rdvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) {
876 // due to laziness, nonalphas and nocase having interesting behaviour
877 m512 casemask = set64x8(CASE_CLEAR);
878 m512 data = loadu512(buf);
879 m512 v = and512(casemask, data);
880 u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1);
881 // no fixup of the boundary required - the aligned run will pick it up
882 if (unlikely(z)) {
883 return lastMatchOffset(buf + 64, z);
884 }
885
886 return NULL;
887 }
888
889 #endif // HAVE_AVX512
890