1 /*
2  * Copyright (c) 2015-2020, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /** \file
30  * \brief Vermicelli: Intel SSE implementation.
31  *
32  * (users should include vermicelli.h)
33  */
34 
35 #if !defined(HAVE_AVX512)
36 
37 #define VERM_BOUNDARY 16
38 #define VERM_TYPE m128
39 #define VERM_SET_FN set16x8
40 
static really_inline
const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
                            char negate) {
    // Scan [buf, buf_end) for the first byte equal to the splatted byte in
    // 'chars'; with 'negate' set, find the first byte NOT equal to it.
    // Returns a pointer to the match, or NULL if none. 'buf' must be
    // 16-byte aligned; any final partial block (< 16 bytes) is left for
    // the caller to handle.
    assert((size_t)buf % 16 == 0);
    // Main loop: two 16-byte blocks per iteration; the two movemask
    // results are fused into one 32-bit word so a single ctz finds the
    // earliest hit.
    for (; buf + 31 < buf_end; buf += 32) {
        m128 data = load128(buf);
        u32 z1 = movemask128(eq128(chars, data));
        m128 data2 = load128(buf + 16);
        u32 z2 = movemask128(eq128(chars, data2));
        u32 z = z1 | (z2 << 16);
        if (negate) {
            z = ~z; // all 32 bits are valid here, no extra masking needed
        }
        if (unlikely(z)) {
            u32 pos = ctz32(z);
            return buf + pos;
        }
    }
    // Tail loop: one 16-byte block at a time.
    for (; buf + 15 < buf_end; buf += 16) {
        m128 data = load128(buf);
        u32 z = movemask128(eq128(chars, data));
        if (negate) {
            z = ~z & 0xffff; // only the low 16 mask bits are meaningful
        }
        if (unlikely(z)) {
            u32 pos = ctz32(z);
            return buf + pos;
        }
    }
    return NULL;
}
72 
static really_inline
const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf,
                                  const u8 *buf_end, char negate) {
    // Case-insensitive variant of vermSearchAligned: each loaded block is
    // AND'ed with CASE_CLEAR before comparison (callers presumably supply
    // a pre-folded 'chars' - verify against callers). Returns the first
    // match (or first non-match if 'negate'), or NULL.
    assert((size_t)buf % 16 == 0);
    m128 casemask = set16x8(CASE_CLEAR);

    // Main loop: two 16-byte blocks per iteration, masks fused into one
    // 32-bit word.
    for (; buf + 31 < buf_end; buf += 32) {
        m128 data = load128(buf);
        u32 z1 = movemask128(eq128(chars, and128(casemask, data)));
        m128 data2 = load128(buf + 16);
        u32 z2 = movemask128(eq128(chars, and128(casemask, data2)));
        u32 z = z1 | (z2 << 16);
        if (negate) {
            z = ~z; // all 32 bits are valid here
        }
        if (unlikely(z)) {
            u32 pos = ctz32(z);
            return buf + pos;
        }
    }

    // Tail loop: single 16-byte blocks.
    for (; buf + 15 < buf_end; buf += 16) {
        m128 data = load128(buf);
        u32 z = movemask128(eq128(chars, and128(casemask, data)));
        if (negate) {
            z = ~z & 0xffff; // only the low 16 mask bits are meaningful
        }
        if (unlikely(z)) {
            u32 pos = ctz32(z);
            return buf + pos;
        }
    }
    return NULL;
}
107 
108 // returns NULL if not found
109 static really_inline
vermUnalign(m128 chars,const u8 * buf,char negate)110 const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) {
111     m128 data = loadu128(buf); // unaligned
112     u32 z = movemask128(eq128(chars, data));
113     if (negate) {
114         z = ~z & 0xffff;
115     }
116     if (unlikely(z)) {
117         return buf + ctz32(z);
118     }
119     return NULL;
120 }
121 
122 // returns NULL if not found
123 static really_inline
vermUnalignNocase(m128 chars,const u8 * buf,char negate)124 const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) {
125     m128 casemask = set16x8(CASE_CLEAR);
126     m128 data = loadu128(buf); // unaligned
127     u32 z = movemask128(eq128(chars, and128(casemask, data)));
128     if (negate) {
129         z = ~z & 0xffff;
130     }
131     if (unlikely(z)) {
132         return buf + ctz32(z);
133     }
134     return NULL;
135 }
136 
137 static really_inline
dvermSearchAligned(m128 chars1,m128 chars2,u8 c1,u8 c2,const u8 * buf,const u8 * buf_end)138 const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
139                              const u8 *buf, const u8 *buf_end) {
140     for (; buf + 16 < buf_end; buf += 16) {
141         m128 data = load128(buf);
142         u32 z = movemask128(and128(eq128(chars1, data),
143                                    rshiftbyte_m128(eq128(chars2, data), 1)));
144         if (buf[15] == c1 && buf[16] == c2) {
145             z |= (1 << 15);
146         }
147         if (unlikely(z)) {
148             u32 pos = ctz32(z);
149             return buf + pos;
150         }
151     }
152 
153     return NULL;
154 }
155 
static really_inline
const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
                                   const u8 *buf, const u8 *buf_end) {
    // Case-insensitive double-vermicelli: find the first two-byte pair
    // c1c2 after folding each loaded block with CASE_CLEAR. Scalar c1/c2
    // are presumably pre-folded by the caller - verify against callers.
    assert((size_t)buf % 16 == 0);
    m128 casemask = set16x8(CASE_CLEAR);

    for (; buf + 16 < buf_end; buf += 16) {
        m128 data = load128(buf);
        m128 v = and128(casemask, data);
        // Bit i: chars1 at folded byte i and chars2 at folded byte i + 1.
        u32 z = movemask128(and128(eq128(chars1, v),
                                   rshiftbyte_m128(eq128(chars2, v), 1)));
        // Fix up a pair straddling the block boundary; buf[16] is in
        // bounds because the loop condition guarantees buf + 16 < buf_end.
        if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) {
            z |= (1 << 15);
        }
        if (unlikely(z)) {
            u32 pos = ctz32(z);
            return buf + pos;
        }
    }

    return NULL;
}
178 
static really_inline
const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2,
                                   m128 mask1, m128 mask2, u8 c1, u8 c2, u8 m1,
                                   u8 m2, const u8 *buf, const u8 *buf_end) {
    // Masked double-vermicelli: like dvermSearchAligned, but each byte is
    // AND'ed with its per-position mask (vector mask1/mask2, scalar
    // m1/m2) before comparing against chars1/chars2 (scalar c1/c2).
    assert((size_t)buf % 16 == 0);

    for (; buf + 16 < buf_end; buf += 16) {
        m128 data = load128(buf);
        m128 v1 = eq128(chars1, and128(data, mask1));
        m128 v2 = eq128(chars2, and128(data, mask2));
        // Bit i: masked first byte matches at i, masked second at i + 1.
        u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));

        // Fix up a pair straddling the block boundary; buf[16] is in
        // bounds because the loop condition guarantees buf + 16 < buf_end.
        if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) {
            z |= (1 << 15);
        }
        if (unlikely(z)) {
            u32 pos = ctz32(z);
            return buf + pos;
        }
    }

    return NULL;
}
202 
203 // returns NULL if not found
204 static really_inline
dvermPrecondition(m128 chars1,m128 chars2,const u8 * buf)205 const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
206     m128 data = loadu128(buf); // unaligned
207     u32 z = movemask128(and128(eq128(chars1, data),
208                                rshiftbyte_m128(eq128(chars2, data), 1)));
209 
210     /* no fixup of the boundary required - the aligned run will pick it up */
211     if (unlikely(z)) {
212         u32 pos = ctz32(z);
213         return buf + pos;
214     }
215     return NULL;
216 }
217 
218 // returns NULL if not found
219 static really_inline
dvermPreconditionNocase(m128 chars1,m128 chars2,const u8 * buf)220 const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
221     /* due to laziness, nonalphas and nocase having interesting behaviour */
222     m128 casemask = set16x8(CASE_CLEAR);
223     m128 data = loadu128(buf); // unaligned
224     m128 v = and128(casemask, data);
225     u32 z = movemask128(and128(eq128(chars1, v),
226                                rshiftbyte_m128(eq128(chars2, v), 1)));
227 
228     /* no fixup of the boundary required - the aligned run will pick it up */
229     if (unlikely(z)) {
230         u32 pos = ctz32(z);
231         return buf + pos;
232     }
233     return NULL;
234 }
235 
236 // returns NULL if not found
237 static really_inline
dvermPreconditionMasked(m128 chars1,m128 chars2,m128 mask1,m128 mask2,const u8 * buf)238 const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2,
239                                   m128 mask1, m128 mask2, const u8 *buf) {
240     m128 data = loadu128(buf); // unaligned
241     m128 v1 = eq128(chars1, and128(data, mask1));
242     m128 v2 = eq128(chars2, and128(data, mask2));
243     u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
244 
245     /* no fixup of the boundary required - the aligned run will pick it up */
246     if (unlikely(z)) {
247         u32 pos = ctz32(z);
248         return buf + pos;
249     }
250     return NULL;
251 }
252 
253 static really_inline
lastMatchOffset(const u8 * buf_end,u32 z)254 const u8 *lastMatchOffset(const u8 *buf_end, u32 z) {
255     assert(z);
256     return buf_end - 16 + 31 - clz32(z);
257 }
258 
static really_inline
const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
                             char negate) {
    // Reverse scan: walk backwards from buf_end looking for the LAST byte
    // matching 'chars' (or last non-matching byte if 'negate'). buf_end
    // must be 16-byte aligned; any leading partial block is left to the
    // caller. Returns NULL if no match in the covered region.
    assert((size_t)buf_end % 16 == 0);
    for (; buf + 15 < buf_end; buf_end -= 16) {
        m128 data = load128(buf_end - 16);
        u32 z = movemask128(eq128(chars, data));
        if (negate) {
            z = ~z & 0xffff; // only the low 16 mask bits are meaningful
        }
        if (unlikely(z)) {
            return lastMatchOffset(buf_end, z);
        }
    }
    return NULL;
}
275 
static really_inline
const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf,
                                   const u8 *buf_end, char negate) {
    // Case-insensitive reverse scan: as rvermSearchAligned, but each
    // block is folded with CASE_CLEAR before comparison.
    assert((size_t)buf_end % 16 == 0);
    m128 casemask = set16x8(CASE_CLEAR);

    for (; buf + 15 < buf_end; buf_end -= 16) {
        m128 data = load128(buf_end - 16);
        u32 z = movemask128(eq128(chars, and128(casemask, data)));
        if (negate) {
            z = ~z & 0xffff; // only the low 16 mask bits are meaningful
        }
        if (unlikely(z)) {
            return lastMatchOffset(buf_end, z);
        }
    }
    return NULL;
}
294 
295 // returns NULL if not found
296 static really_inline
rvermUnalign(m128 chars,const u8 * buf,char negate)297 const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) {
298     m128 data = loadu128(buf); // unaligned
299     u32 z = movemask128(eq128(chars, data));
300     if (negate) {
301         z = ~z & 0xffff;
302     }
303     if (unlikely(z)) {
304         return lastMatchOffset(buf + 16, z);
305     }
306     return NULL;
307 }
308 
309 // returns NULL if not found
310 static really_inline
rvermUnalignNocase(m128 chars,const u8 * buf,char negate)311 const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) {
312     m128 casemask = set16x8(CASE_CLEAR);
313     m128 data = loadu128(buf); // unaligned
314     u32 z = movemask128(eq128(chars, and128(casemask, data)));
315     if (negate) {
316         z = ~z & 0xffff;
317     }
318     if (unlikely(z)) {
319         return lastMatchOffset(buf + 16, z);
320     }
321     return NULL;
322 }
323 
static really_inline
const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
                              const u8 *buf, const u8 *buf_end) {
    // Reverse double-vermicelli: search backwards from buf_end for the
    // LAST occurrence of the two-byte pair c1c2. NOTE: unlike the forward
    // variants, this returns buf_end (not NULL) when no match is found in
    // the covered region.
    assert((size_t)buf_end % 16 == 0);

    for (; buf + 16 < buf_end; buf_end -= 16) {
        m128 data = load128(buf_end - 16);
        // Bit i of z: chars2 matches byte i and chars1 matches byte i - 1
        // (first-byte match mask shifted up one lane).
        u32 z = movemask128(and128(eq128(chars2, data),
                                   lshiftbyte_m128(eq128(chars1, data), 1)));
        // Fix up a pair straddling the lower block boundary; buf_end[-17]
        // is safe because the loop condition guarantees buf + 16 < buf_end.
        if (buf_end[-17] == c1 && buf_end[-16] == c2) {
            z |= 1;
        }
        if (unlikely(z)) {
            return lastMatchOffset(buf_end, z);
        }
    }
    return buf_end;
}
342 
static really_inline
const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
                                    const u8 *buf, const u8 *buf_end) {
    // Case-insensitive reverse double-vermicelli; as rdvermSearchAligned
    // with CASE_CLEAR folding. Returns buf_end (not NULL) on no match.
    assert((size_t)buf_end % 16 == 0);
    m128 casemask = set16x8(CASE_CLEAR);

    for (; buf + 16 < buf_end; buf_end -= 16) {
        m128 data = load128(buf_end - 16);
        m128 v = and128(casemask, data);
        // Bit i: chars2 at folded byte i, chars1 at folded byte i - 1.
        u32 z = movemask128(and128(eq128(chars2, v),
                                   lshiftbyte_m128(eq128(chars1, v), 1)));
        // Fix up a pair straddling the lower block boundary.
        if ((buf_end[-17] & CASE_CLEAR) == c1
            && (buf_end[-16] & CASE_CLEAR) == c2) {
            z |= 1;
        }
        if (unlikely(z)) {
            return lastMatchOffset(buf_end, z);
        }
    }
    return buf_end;
}
364 
365 // returns NULL if not found
366 static really_inline
rdvermPrecondition(m128 chars1,m128 chars2,const u8 * buf)367 const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
368     m128 data = loadu128(buf);
369     u32 z = movemask128(and128(eq128(chars2, data),
370                                lshiftbyte_m128(eq128(chars1, data), 1)));
371 
372     /* no fixup of the boundary required - the aligned run will pick it up */
373     if (unlikely(z)) {
374         return lastMatchOffset(buf + 16, z);
375     }
376 
377     return NULL;
378 }
379 
380 // returns NULL if not found
381 static really_inline
rdvermPreconditionNocase(m128 chars1,m128 chars2,const u8 * buf)382 const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
383     /* due to laziness, nonalphas and nocase having interesting behaviour */
384     m128 casemask = set16x8(CASE_CLEAR);
385     m128 data = loadu128(buf);
386     m128 v = and128(casemask, data);
387     u32 z = movemask128(and128(eq128(chars2, v),
388                                lshiftbyte_m128(eq128(chars1, v), 1)));
389     /* no fixup of the boundary required - the aligned run will pick it up */
390     if (unlikely(z)) {
391         return lastMatchOffset(buf + 16, z);
392     }
393 
394     return NULL;
395 }
396 
397 #else // HAVE_AVX512
398 
399 #define VERM_BOUNDARY 64
400 #define VERM_TYPE m512
401 #define VERM_SET_FN set64x8
402 
403 static really_inline
vermMini(m512 chars,const u8 * buf,const u8 * buf_end,char negate)404 const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) {
405     uintptr_t len = buf_end - buf;
406     __mmask64 mask = (~0ULL) >> (64 - len);
407     m512 data = loadu_maskz_m512(mask, buf);
408 
409     u64a z = eq512mask(chars, data);
410 
411     if (negate) {
412         z = ~z & mask;
413     }
414     z &= mask;
415     if (unlikely(z)) {
416         return buf + ctz64(z);
417     }
418     return NULL;
419 }
420 
421 static really_inline
vermMiniNocase(m512 chars,const u8 * buf,const u8 * buf_end,char negate)422 const u8 *vermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end,
423                          char negate) {
424     uintptr_t len = buf_end - buf;
425     __mmask64 mask = (~0ULL) >> (64 - len);
426     m512 data = loadu_maskz_m512(mask, buf);
427     m512 casemask = set64x8(CASE_CLEAR);
428     m512 v = and512(casemask, data);
429 
430     u64a z = eq512mask(chars, v);
431 
432     if (negate) {
433         z = ~z & mask;
434     }
435     z &= mask;
436     if (unlikely(z)) {
437         return buf + ctz64(z);
438     }
439     return NULL;
440 }
441 
442 static really_inline
vermSearchAligned(m512 chars,const u8 * buf,const u8 * buf_end,char negate)443 const u8 *vermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end,
444                             char negate) {
445     assert((size_t)buf % 64 == 0);
446     for (; buf + 63 < buf_end; buf += 64) {
447         m512 data = load512(buf);
448         u64a z = eq512mask(chars, data);
449         if (negate) {
450             z = ~z & ~0ULL;
451         }
452         if (unlikely(z)) {
453             u64a pos = ctz64(z);
454             return buf + pos;
455         }
456     }
457     return NULL;
458 }
459 
460 static really_inline
vermSearchAlignedNocase(m512 chars,const u8 * buf,const u8 * buf_end,char negate)461 const u8 *vermSearchAlignedNocase(m512 chars, const u8 *buf,
462                                   const u8 *buf_end, char negate) {
463     assert((size_t)buf % 64 == 0);
464     m512 casemask = set64x8(CASE_CLEAR);
465 
466     for (; buf + 63 < buf_end; buf += 64) {
467         m512 data = load512(buf);
468         u64a z = eq512mask(chars, and512(casemask, data));
469         if (negate) {
470             z = ~z & ~0ULL;
471         }
472         if (unlikely(z)) {
473             u64a pos = ctz64(z);
474             return buf + pos;
475         }
476     }
477     return NULL;
478 }
479 
480 // returns NULL if not found
481 static really_inline
vermUnalign(m512 chars,const u8 * buf,char negate)482 const u8 *vermUnalign(m512 chars, const u8 *buf, char negate) {
483     m512 data = loadu512(buf); // unaligned
484     u64a z = eq512mask(chars, data);
485     if (negate) {
486         z = ~z & ~0ULL;
487     }
488     if (unlikely(z)) {
489         return buf + ctz64(z);
490     }
491     return NULL;
492 }
493 
494 // returns NULL if not found
495 static really_inline
vermUnalignNocase(m512 chars,const u8 * buf,char negate)496 const u8 *vermUnalignNocase(m512 chars, const u8 *buf, char negate) {
497     m512 casemask = set64x8(CASE_CLEAR);
498     m512 data = loadu512(buf); // unaligned
499     u64a z = eq512mask(chars, and512(casemask, data));
500     if (negate) {
501         z = ~z & ~0ULL;
502     }
503     if (unlikely(z)) {
504         return buf + ctz64(z);
505     }
506     return NULL;
507 }
508 
static really_inline
const u8 *dvermMini(m512 chars1, m512 chars2, const u8 *buf,
                    const u8 *buf_end) {
    // Masked-load double-vermicelli over a short region (assumes
    // 0 < buf_end - buf <= 64): find the first position where chars1
    // matches byte i and chars2 matches byte i + 1.
    uintptr_t len = buf_end - buf;
    __mmask64 mask = (~0ULL) >> (64 - len);
    m512 data = loadu_maskz_m512(mask, buf);

    // Bit i of z: first byte at i, second byte at i + 1.
    u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1);

    z &= mask; // ignore lanes beyond the valid region
    if (unlikely(z)) {
        u64a pos = ctz64(z);
        return buf + pos;
    }
    return NULL;
}
525 
static really_inline
const u8 *dvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf,
                          const u8 *buf_end) {
    // Case-folded masked-load double-vermicelli over a short region
    // (assumes 0 < buf_end - buf <= 64).
    uintptr_t len = buf_end - buf;
    __mmask64 mask = (~0ULL) >> (64 - len);
    m512 data = loadu_maskz_m512(mask, buf);
    m512 casemask = set64x8(CASE_CLEAR);
    m512 v = and512(casemask, data);

    // Bit i of z: chars1 at folded byte i, chars2 at folded byte i + 1.
    u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1);

    z &= mask; // ignore lanes beyond the valid region
    if (unlikely(z)) {
        u64a pos = ctz64(z);
        return buf + pos;
    }
    return NULL;
}
544 
static really_inline
const u8 *dvermMiniMasked(m512 chars1, m512 chars2, m512 mask1, m512 mask2,
                          const u8 *buf, const u8 *buf_end) {
    // Masked-byte variant of dvermMini: each byte is AND'ed with its
    // per-position mask before comparison. Assumes 0 < buf_end - buf <= 64.
    uintptr_t len = buf_end - buf;
    __mmask64 mask = (~0ULL) >> (64 - len);
    m512 data = loadu_maskz_m512(mask, buf);
    m512 v1 = and512(data, mask1);
    m512 v2 = and512(data, mask2);

    // Bit i of z: masked first byte at i, masked second byte at i + 1.
    u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1);

    z &= mask; // ignore lanes beyond the valid region
    if (unlikely(z)) {
        u64a pos = ctz64(z);
        return buf + pos;
    }
    return NULL;
}
563 
564 static really_inline
dvermSearchAligned(m512 chars1,m512 chars2,u8 c1,u8 c2,const u8 * buf,const u8 * buf_end)565 const u8 *dvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2,
566                              const u8 *buf, const u8 *buf_end) {
567     for (; buf + 64 < buf_end; buf += 64) {
568         m512 data = load512(buf);
569         u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1);
570         if (buf[63] == c1 && buf[64] == c2) {
571             z |= (1ULL << 63);
572         }
573         if (unlikely(z)) {
574             u64a pos = ctz64(z);
575             return buf + pos;
576         }
577     }
578 
579     return NULL;
580 }
581 
static really_inline
const u8 *dvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2,
                                   const u8 *buf, const u8 *buf_end) {
    // Case-insensitive double-vermicelli over 64-byte blocks: each block
    // is folded with CASE_CLEAR before the pair comparison.
    assert((size_t)buf % 64 == 0);
    m512 casemask = set64x8(CASE_CLEAR);

    for (; buf + 64 < buf_end; buf += 64) {
        m512 data = load512(buf);
        m512 v = and512(casemask, data);
        // Bit i: chars1 at folded byte i, chars2 at folded byte i + 1.
        u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1);
        // Pair straddling the block boundary; buf[64] is in bounds since
        // the loop condition guarantees buf + 64 < buf_end.
        if ((buf[63] & CASE_CLEAR) == c1 && (buf[64] & CASE_CLEAR) == c2) {
            z |= (1ULL << 63);
        }
        if (unlikely(z)) {
            u64a pos = ctz64(z);
            return buf + pos;
        }
    }

    return NULL;
}
603 
static really_inline
const u8 *dvermSearchAlignedMasked(m512 chars1, m512 chars2,
                                   m512 mask1, m512 mask2, u8 c1, u8 c2, u8 m1,
                                   u8 m2, const u8 *buf, const u8 *buf_end) {
    // Masked double-vermicelli over 64-byte blocks: each byte is AND'ed
    // with its per-position mask (vector mask1/mask2, scalar m1/m2)
    // before comparing against chars1/chars2 (scalar c1/c2).
    assert((size_t)buf % 64 == 0);

    for (; buf + 64 < buf_end; buf += 64) {
        m512 data = load512(buf);
        m512 v1 = and512(data, mask1);
        m512 v2 = and512(data, mask2);
        // Bit i: masked first byte matches at i, masked second at i + 1.
        u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1);

        // Pair straddling the block boundary; buf[64] is in bounds since
        // the loop condition guarantees buf + 64 < buf_end.
        if ((buf[63] & m1) == c1 && (buf[64] & m2) == c2) {
            z |= (1ULL << 63);
        }
        if (unlikely(z)) {
            u64a pos = ctz64(z);
            return buf + pos;
        }
    }

    return NULL;
}
627 
628 // returns NULL if not found
629 static really_inline
dvermPrecondition(m512 chars1,m512 chars2,const u8 * buf)630 const u8 *dvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) {
631     m512 data = loadu512(buf); // unaligned
632     u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1);
633 
634     /* no fixup of the boundary required - the aligned run will pick it up */
635     if (unlikely(z)) {
636         u64a pos = ctz64(z);
637         return buf + pos;
638     }
639     return NULL;
640 }
641 
642 // returns NULL if not found
643 static really_inline
dvermPreconditionNocase(m512 chars1,m512 chars2,const u8 * buf)644 const u8 *dvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) {
645     /* due to laziness, nonalphas and nocase having interesting behaviour */
646     m512 casemask = set64x8(CASE_CLEAR);
647     m512 data = loadu512(buf); // unaligned
648     m512 v = and512(casemask, data);
649     u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1);
650 
651     /* no fixup of the boundary required - the aligned run will pick it up */
652     if (unlikely(z)) {
653         u64a pos = ctz64(z);
654         return buf + pos;
655     }
656     return NULL;
657 }
658 
659 // returns NULL if not found
660 static really_inline
dvermPreconditionMasked(m512 chars1,m512 chars2,m512 mask1,m512 mask2,const u8 * buf)661 const u8 *dvermPreconditionMasked(m512 chars1, m512 chars2,
662                                   m512 mask1, m512 mask2, const u8 *buf) {
663     m512 data = loadu512(buf); // unaligned
664     m512 v1 = and512(data, mask1);
665     m512 v2 = and512(data, mask2);
666     u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1);
667 
668     /* no fixup of the boundary required - the aligned run will pick it up */
669     if (unlikely(z)) {
670         u64a pos = ctz64(z);
671         return buf + pos;
672     }
673     return NULL;
674 }
675 
676 static really_inline
lastMatchOffset(const u8 * buf_end,u64a z)677 const u8 *lastMatchOffset(const u8 *buf_end, u64a z) {
678     assert(z);
679     return buf_end - 64 + 63 - clz64(z);
680 }
681 
682 static really_inline
rvermMini(m512 chars,const u8 * buf,const u8 * buf_end,char negate)683 const u8 *rvermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) {
684     uintptr_t len = buf_end - buf;
685     __mmask64 mask = (~0ULL) >> (64 - len);
686     m512 data = loadu_maskz_m512(mask, buf);
687 
688     u64a z = eq512mask(chars, data);
689 
690     if (negate) {
691         z = ~z & mask;
692     }
693     z &= mask;
694     if (unlikely(z)) {
695         return lastMatchOffset(buf + 64, z);
696     }
697     return NULL;
698 }
699 
700 static really_inline
rvermMiniNocase(m512 chars,const u8 * buf,const u8 * buf_end,char negate)701 const u8 *rvermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end,
702                           char negate) {
703     uintptr_t len = buf_end - buf;
704     __mmask64 mask = (~0ULL) >> (64 - len);
705     m512 data = loadu_maskz_m512(mask, buf);
706     m512 casemask = set64x8(CASE_CLEAR);
707     m512 v = and512(casemask, data);
708 
709     u64a z = eq512mask(chars, v);
710 
711     if (negate) {
712         z = ~z & mask;
713     }
714     z &= mask;
715     if (unlikely(z)) {
716         return lastMatchOffset(buf + 64, z);
717     }
718     return NULL;
719 }
720 
721 static really_inline
rvermSearchAligned(m512 chars,const u8 * buf,const u8 * buf_end,char negate)722 const u8 *rvermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end,
723                              char negate) {
724     assert((size_t)buf_end % 64 == 0);
725     for (; buf + 63 < buf_end; buf_end -= 64) {
726         m512 data = load512(buf_end - 64);
727         u64a z = eq512mask(chars, data);
728         if (negate) {
729             z = ~z & ~0ULL;
730         }
731         if (unlikely(z)) {
732             return lastMatchOffset(buf_end, z);
733         }
734     }
735     return NULL;
736 }
737 
738 static really_inline
rvermSearchAlignedNocase(m512 chars,const u8 * buf,const u8 * buf_end,char negate)739 const u8 *rvermSearchAlignedNocase(m512 chars, const u8 *buf,
740                                    const u8 *buf_end, char negate) {
741     assert((size_t)buf_end % 64 == 0);
742     m512 casemask = set64x8(CASE_CLEAR);
743 
744     for (; buf + 63 < buf_end; buf_end -= 64) {
745         m512 data = load512(buf_end - 64);
746         u64a z = eq512mask(chars, and512(casemask, data));
747         if (negate) {
748             z = ~z & ~0ULL;
749         }
750         if (unlikely(z)) {
751             return lastMatchOffset(buf_end, z);
752         }
753     }
754     return NULL;
755 }
756 
757 // returns NULL if not found
758 static really_inline
rvermUnalign(m512 chars,const u8 * buf,char negate)759 const u8 *rvermUnalign(m512 chars, const u8 *buf, char negate) {
760     m512 data = loadu512(buf); // unaligned
761     u64a z = eq512mask(chars, data);
762     if (negate) {
763         z = ~z & ~0ULL;
764     }
765     if (unlikely(z)) {
766         return lastMatchOffset(buf + 64, z);
767     }
768     return NULL;
769 }
770 
771 // returns NULL if not found
772 static really_inline
rvermUnalignNocase(m512 chars,const u8 * buf,char negate)773 const u8 *rvermUnalignNocase(m512 chars, const u8 *buf, char negate) {
774     m512 casemask = set64x8(CASE_CLEAR);
775     m512 data = loadu512(buf); // unaligned
776     u64a z = eq512mask(chars, and512(casemask, data));
777     if (negate) {
778         z = ~z & ~0ULL;
779     }
780     if (unlikely(z)) {
781         return lastMatchOffset(buf + 64, z);
782     }
783     return NULL;
784 }
785 
static really_inline
const u8 *rdvermMini(m512 chars1, m512 chars2, const u8 *buf,
                    const u8 *buf_end) {
    // Reverse masked-load double-vermicelli over a short region (assumes
    // 0 < buf_end - buf <= 64): find the LAST position of the pair.
    uintptr_t len = buf_end - buf;
    __mmask64 mask = (~0ULL) >> (64 - len);
    m512 data = loadu_maskz_m512(mask, buf);

    // Bit i of z: chars2 at byte i, chars1 at byte i - 1.
    u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1);

    z &= mask; // ignore lanes beyond the valid region
    if (unlikely(z)) {
        return lastMatchOffset(buf + 64, z);
    }
    return NULL;
}
801 
static really_inline
const u8 *rdvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf,
                           const u8 *buf_end) {
    // Case-folded reverse masked-load double-vermicelli over a short
    // region (assumes 0 < buf_end - buf <= 64).
    uintptr_t len = buf_end - buf;
    __mmask64 mask = (~0ULL) >> (64 - len);
    m512 data = loadu_maskz_m512(mask, buf);
    m512 casemask = set64x8(CASE_CLEAR);
    m512 v = and512(casemask, data);

    // Bit i of z: chars2 at folded byte i, chars1 at folded byte i - 1.
    u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1);

    z &= mask; // ignore lanes beyond the valid region
    if (unlikely(z)) {
        return lastMatchOffset(buf + 64, z);
    }
    return NULL;
}
819 
static really_inline
const u8 *rdvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2,
                              const u8 *buf, const u8 *buf_end) {
    // Reverse double-vermicelli over 64-byte blocks: search backwards for
    // the LAST occurrence of the pair c1c2. NOTE: returns buf_end (not
    // NULL) when no match is found in the covered region.
    assert((size_t)buf_end % 64 == 0);

    for (; buf + 64 < buf_end; buf_end -= 64) {
        m512 data = load512(buf_end - 64);
        // Bit i of z: chars2 at byte i, chars1 at byte i - 1.
        u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1);
        // Fix up a pair straddling the lower block boundary; buf_end[-65]
        // is safe because the loop condition guarantees buf + 64 < buf_end.
        if (buf_end[-65] == c1 && buf_end[-64] == c2) {
            z |= 1;
        }
        if (unlikely(z)) {
            return lastMatchOffset(buf_end, z);
        }
    }
    return buf_end;
}
837 
static really_inline
const u8 *rdvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2,
                                    const u8 *buf, const u8 *buf_end) {
    // Case-insensitive reverse double-vermicelli over 64-byte blocks;
    // returns buf_end (not NULL) when no match is found.
    assert((size_t)buf_end % 64 == 0);
    m512 casemask = set64x8(CASE_CLEAR);

    for (; buf + 64 < buf_end; buf_end -= 64) {
        m512 data = load512(buf_end - 64);
        m512 v = and512(casemask, data);
        // Bit i of z: chars2 at folded byte i, chars1 at folded byte i - 1.
        u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1);
        // Fix up a pair straddling the lower block boundary.
        if ((buf_end[-65] & CASE_CLEAR) == c1
            && (buf_end[-64] & CASE_CLEAR) == c2) {
            z |= 1;
        }
        if (unlikely(z)) {
            return lastMatchOffset(buf_end, z);
        }
    }
    return buf_end;
}
858 
859 // returns NULL if not found
860 static really_inline
rdvermPrecondition(m512 chars1,m512 chars2,const u8 * buf)861 const u8 *rdvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) {
862     m512 data = loadu512(buf);
863     u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1);
864 
865     // no fixup of the boundary required - the aligned run will pick it up
866     if (unlikely(z)) {
867         return lastMatchOffset(buf + 64, z);
868     }
869 
870     return NULL;
871 }
872 
873 // returns NULL if not found
874 static really_inline
rdvermPreconditionNocase(m512 chars1,m512 chars2,const u8 * buf)875 const u8 *rdvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) {
876     // due to laziness, nonalphas and nocase having interesting behaviour
877     m512 casemask = set64x8(CASE_CLEAR);
878     m512 data = loadu512(buf);
879     m512 v = and512(casemask, data);
880     u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1);
881     // no fixup of the boundary required - the aligned run will pick it up
882     if (unlikely(z)) {
883         return lastMatchOffset(buf + 64, z);
884     }
885 
886     return NULL;
887 }
888 
889 #endif // HAVE_AVX512
890