1 /*
2  * Copyright (c) 2015-2020, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /** \file
30  * \brief Vermicelli: single-byte and double-byte acceleration.
31  */
32 
33 #ifndef VERMICELLI_H
34 #define VERMICELLI_H
35 
36 #include "util/bitutils.h"
37 #include "util/simd_utils.h"
38 #include "util/unaligned.h"
39 
40 #include "vermicelli_sse.h"
41 
42 static really_inline
vermicelliExec(char c,char nocase,const u8 * buf,const u8 * buf_end)43 const u8 *vermicelliExec(char c, char nocase, const u8 *buf,
44                          const u8 *buf_end) {
45     DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n",
46                  nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
47     assert(buf < buf_end);
48 
49     VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */
50 
51     // Handle small scans.
52 #ifdef HAVE_AVX512
53     if (buf_end - buf <= VERM_BOUNDARY) {
54         const u8 *ptr = nocase
55                       ? vermMiniNocase(chars, buf, buf_end, 0)
56                       : vermMini(chars, buf, buf_end, 0);
57         if (ptr) {
58             return ptr;
59         }
60         return buf_end;
61     }
62 #else
63     if (buf_end - buf < VERM_BOUNDARY) {
64         for (; buf < buf_end; buf++) {
65             char cur = (char)*buf;
66             if (nocase) {
67                 cur &= CASE_CLEAR;
68             }
69             if (cur == c) {
70                 break;
71             }
72         }
73         return buf;
74     }
75 #endif
76 
77     uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY;
78     if (min) {
79         // Input isn't aligned, so we need to run one iteration with an
80         // unaligned load, then skip buf forward to the next aligned address.
81         // There's some small overlap here, but we don't mind scanning it twice
82         // if we can do it quickly, do we?
83         const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 0)
84                                : vermUnalign(chars, buf, 0);
85         if (ptr) {
86             return ptr;
87         }
88 
89         buf += VERM_BOUNDARY - min;
90         assert(buf < buf_end);
91     }
92 
93     // Aligned loops from here on in
94     const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 0)
95                            : vermSearchAligned(chars, buf, buf_end - 1, 0);
96     if (ptr) {
97         return ptr;
98     }
99 
100     // Tidy up the mess at the end
101     ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 0)
102                  : vermUnalign(chars, buf_end - VERM_BOUNDARY, 0);
103     return ptr ? ptr : buf_end;
104 }
105 
106 /* like vermicelliExec except returns the address of the first character which
107  * is not c */
108 static really_inline
nvermicelliExec(char c,char nocase,const u8 * buf,const u8 * buf_end)109 const u8 *nvermicelliExec(char c, char nocase, const u8 *buf,
110                          const u8 *buf_end) {
111     DEBUG_PRINTF("nverm scan %s\\x%02hhx over %zu bytes\n",
112                  nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
113     assert(buf < buf_end);
114 
115     VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */
116 
117     // Handle small scans.
118 #ifdef HAVE_AVX512
119     if (buf_end - buf <= VERM_BOUNDARY) {
120         const u8 *ptr = nocase
121                       ? vermMiniNocase(chars, buf, buf_end, 1)
122                       : vermMini(chars, buf, buf_end, 1);
123         if (ptr) {
124             return ptr;
125         }
126         return buf_end;
127     }
128 #else
129     if (buf_end - buf < VERM_BOUNDARY) {
130         for (; buf < buf_end; buf++) {
131             char cur = (char)*buf;
132             if (nocase) {
133                 cur &= CASE_CLEAR;
134             }
135             if (cur != c) {
136                 break;
137             }
138         }
139         return buf;
140     }
141 #endif
142 
143     size_t min = (size_t)buf % VERM_BOUNDARY;
144     if (min) {
145         // Input isn't aligned, so we need to run one iteration with an
146         // unaligned load, then skip buf forward to the next aligned address.
147         // There's some small overlap here, but we don't mind scanning it twice
148         // if we can do it quickly, do we?
149         const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 1)
150                                : vermUnalign(chars, buf, 1);
151         if (ptr) {
152             return ptr;
153         }
154 
155         buf += VERM_BOUNDARY - min;
156         assert(buf < buf_end);
157     }
158 
159     // Aligned loops from here on in
160     const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 1)
161                            : vermSearchAligned(chars, buf, buf_end - 1, 1);
162     if (ptr) {
163         return ptr;
164     }
165 
166     // Tidy up the mess at the end
167     ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 1)
168                  : vermUnalign(chars, buf_end - VERM_BOUNDARY, 1);
169     return ptr ? ptr : buf_end;
170 }
171 
172 static really_inline
vermicelliDoubleExec(char c1,char c2,char nocase,const u8 * buf,const u8 * buf_end)173 const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf,
174                                const u8 *buf_end) {
175     DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n",
176                  nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf));
177     assert(buf < buf_end);
178 
179     VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */
180     VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */
181 
182 #ifdef HAVE_AVX512
183     if (buf_end - buf <= VERM_BOUNDARY) {
184         const u8 *ptr = nocase
185                       ? dvermMiniNocase(chars1, chars2, buf, buf_end)
186                       : dvermMini(chars1, chars2, buf, buf_end);
187         if (ptr) {
188             return ptr;
189         }
190 
191         /* check for partial match at end */
192         u8 mask = nocase ? CASE_CLEAR : 0xff;
193         if ((buf_end[-1] & mask) == (u8)c1) {
194             DEBUG_PRINTF("partial!!!\n");
195             return buf_end - 1;
196         }
197 
198         return buf_end;
199     }
200 #endif
201 
202     assert((buf_end - buf) >= VERM_BOUNDARY);
203     uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY;
204     if (min) {
205         // Input isn't aligned, so we need to run one iteration with an
206         // unaligned load, then skip buf forward to the next aligned address.
207         // There's some small overlap here, but we don't mind scanning it twice
208         // if we can do it quickly, do we?
209         const u8 *ptr = nocase
210                         ? dvermPreconditionNocase(chars1, chars2, buf)
211                         : dvermPrecondition(chars1, chars2, buf);
212         if (ptr) {
213             return ptr;
214         }
215 
216         buf += VERM_BOUNDARY - min;
217         assert(buf < buf_end);
218     }
219 
220     // Aligned loops from here on in
221     const u8 *ptr = nocase ? dvermSearchAlignedNocase(chars1, chars2, c1, c2,
222                                                       buf, buf_end)
223                            : dvermSearchAligned(chars1, chars2, c1, c2, buf,
224                                                 buf_end);
225     if (ptr) {
226         return ptr;
227     }
228 
229     // Tidy up the mess at the end
230     ptr = nocase ? dvermPreconditionNocase(chars1, chars2,
231                                            buf_end - VERM_BOUNDARY)
232                  : dvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY);
233 
234     if (ptr) {
235         return ptr;
236     }
237 
238     /* check for partial match at end */
239     u8 mask = nocase ? CASE_CLEAR : 0xff;
240     if ((buf_end[-1] & mask) == (u8)c1) {
241         DEBUG_PRINTF("partial!!!\n");
242         return buf_end - 1;
243     }
244 
245     return buf_end;
246 }
247 
248 static really_inline
vermicelliDoubleMaskedExec(char c1,char c2,char m1,char m2,const u8 * buf,const u8 * buf_end)249 const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2,
250                                      const u8 *buf, const u8 *buf_end) {
251     DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) "
252                  "over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf));
253     assert(buf < buf_end);
254 
255     VERM_TYPE chars1 = VERM_SET_FN(c1);
256     VERM_TYPE chars2 = VERM_SET_FN(c2);
257     VERM_TYPE mask1 = VERM_SET_FN(m1);
258     VERM_TYPE mask2 = VERM_SET_FN(m2);
259 
260 #ifdef HAVE_AVX512
261     if (buf_end - buf <= VERM_BOUNDARY) {
262         const u8 *ptr = dvermMiniMasked(chars1, chars2, mask1, mask2, buf,
263                                         buf_end);
264         if (ptr) {
265             return ptr;
266         }
267 
268         /* check for partial match at end */
269         if ((buf_end[-1] & m1) == (u8)c1) {
270             DEBUG_PRINTF("partial!!!\n");
271             return buf_end - 1;
272         }
273 
274         return buf_end;
275     }
276 #endif
277 
278     assert((buf_end - buf) >= VERM_BOUNDARY);
279     uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY;
280     if (min) {
281         // Input isn't aligned, so we need to run one iteration with an
282         // unaligned load, then skip buf forward to the next aligned address.
283         // There's some small overlap here, but we don't mind scanning it twice
284         // if we can do it quickly, do we?
285         const u8 *p = dvermPreconditionMasked(chars1, chars2, mask1, mask2, buf);
286         if (p) {
287             return p;
288         }
289 
290         buf += VERM_BOUNDARY - min;
291         assert(buf < buf_end);
292     }
293 
294     // Aligned loops from here on in
295     const u8 *ptr = dvermSearchAlignedMasked(chars1, chars2, mask1, mask2, c1,
296                                              c2, m1, m2, buf, buf_end);
297     if (ptr) {
298         return ptr;
299     }
300 
301     // Tidy up the mess at the end
302     ptr = dvermPreconditionMasked(chars1, chars2, mask1, mask2,
303                                   buf_end - VERM_BOUNDARY);
304 
305     if (ptr) {
306         return ptr;
307     }
308 
309     /* check for partial match at end */
310     if ((buf_end[-1] & m1) == (u8)c1) {
311         DEBUG_PRINTF("partial!!!\n");
312         return buf_end - 1;
313     }
314 
315     return buf_end;
316 }
317 
318 // Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if
319 // character not found.
320 static really_inline
rvermicelliExec(char c,char nocase,const u8 * buf,const u8 * buf_end)321 const u8 *rvermicelliExec(char c, char nocase, const u8 *buf,
322                           const u8 *buf_end) {
323     DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n",
324                  nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
325     assert(buf < buf_end);
326 
327     VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */
328 
329     // Handle small scans.
330 #ifdef HAVE_AVX512
331     if (buf_end - buf <= VERM_BOUNDARY) {
332         const u8 *ptr = nocase
333                       ? rvermMiniNocase(chars, buf, buf_end, 0)
334                       : rvermMini(chars, buf, buf_end, 0);
335         if (ptr) {
336             return ptr;
337         }
338         return buf - 1;
339     }
340 #else
341     if (buf_end - buf < VERM_BOUNDARY) {
342         for (buf_end--; buf_end >= buf; buf_end--) {
343             char cur = (char)*buf_end;
344             if (nocase) {
345                 cur &= CASE_CLEAR;
346             }
347             if (cur == c) {
348                 break;
349             }
350         }
351         return buf_end;
352     }
353 #endif
354 
355     size_t min = (size_t)buf_end % VERM_BOUNDARY;
356     if (min) {
357         // Input isn't aligned, so we need to run one iteration with an
358         // unaligned load, then skip buf backward to the next aligned address.
359         // There's some small overlap here, but we don't mind scanning it twice
360         // if we can do it quickly, do we?
361         const u8 *ptr = nocase ? rvermUnalignNocase(chars,
362                                                     buf_end - VERM_BOUNDARY,
363                                                     0)
364                                : rvermUnalign(chars, buf_end - VERM_BOUNDARY,
365                                               0);
366 
367         if (ptr) {
368             return ptr;
369         }
370 
371         buf_end -= min;
372         if (buf >= buf_end) {
373             return buf_end;
374         }
375     }
376 
377     // Aligned loops from here on in.
378     const u8 *ptr = nocase ? rvermSearchAlignedNocase(chars, buf, buf_end, 0)
379                            : rvermSearchAligned(chars, buf, buf_end, 0);
380     if (ptr) {
381         return ptr;
382     }
383 
384     // Tidy up the mess at the end, return buf - 1 if not found.
385     ptr = nocase ? rvermUnalignNocase(chars, buf, 0)
386                  : rvermUnalign(chars, buf, 0);
387     return ptr ? ptr : buf - 1;
388 }
389 
390 /* like rvermicelliExec except returns the address of the last character which
391  * is not c */
392 static really_inline
rnvermicelliExec(char c,char nocase,const u8 * buf,const u8 * buf_end)393 const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf,
394                            const u8 *buf_end) {
395     DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n",
396                  nocase ? "nocase " : "", c, (size_t)(buf_end - buf));
397     assert(buf < buf_end);
398 
399     VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */
400 
401     // Handle small scans.
402 #ifdef HAVE_AVX512
403     if (buf_end - buf <= VERM_BOUNDARY) {
404         const u8 *ptr = nocase
405                       ? rvermMiniNocase(chars, buf, buf_end, 1)
406                       : rvermMini(chars, buf, buf_end, 1);
407         if (ptr) {
408             return ptr;
409         }
410         return buf - 1;
411     }
412 #else
413     if (buf_end - buf < VERM_BOUNDARY) {
414         for (buf_end--; buf_end >= buf; buf_end--) {
415             char cur = (char)*buf_end;
416             if (nocase) {
417                 cur &= CASE_CLEAR;
418             }
419             if (cur != c) {
420                 break;
421             }
422         }
423         return buf_end;
424     }
425 #endif
426 
427     size_t min = (size_t)buf_end % VERM_BOUNDARY;
428     if (min) {
429         // Input isn't aligned, so we need to run one iteration with an
430         // unaligned load, then skip buf backward to the next aligned address.
431         // There's some small overlap here, but we don't mind scanning it twice
432         // if we can do it quickly, do we?
433         const u8 *ptr = nocase ? rvermUnalignNocase(chars,
434                                                     buf_end - VERM_BOUNDARY,
435                                                     1)
436                                : rvermUnalign(chars, buf_end - VERM_BOUNDARY,
437                                               1);
438 
439         if (ptr) {
440             return ptr;
441         }
442 
443         buf_end -= min;
444         if (buf >= buf_end) {
445             return buf_end;
446         }
447     }
448 
449     // Aligned loops from here on in.
450     const u8 *ptr = nocase ? rvermSearchAlignedNocase(chars, buf, buf_end, 1)
451                            : rvermSearchAligned(chars, buf, buf_end, 1);
452     if (ptr) {
453         return ptr;
454     }
455 
456     // Tidy up the mess at the end, return buf - 1 if not found.
457     ptr = nocase ? rvermUnalignNocase(chars, buf, 1)
458                  : rvermUnalign(chars, buf, 1);
459     return ptr ? ptr : buf - 1;
460 }
461 
462 /* returns highest offset of c2 (NOTE: not c1) */
463 static really_inline
rvermicelliDoubleExec(char c1,char c2,char nocase,const u8 * buf,const u8 * buf_end)464 const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf,
465                                 const u8 *buf_end) {
466     DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n",
467                  nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf));
468     assert(buf < buf_end);
469 
470     VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */
471     VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */
472 
473 #ifdef HAVE_AVX512
474     if (buf_end - buf <= VERM_BOUNDARY) {
475         const u8 *ptr = nocase
476                       ? rdvermMiniNocase(chars1, chars2, buf, buf_end)
477                       : rdvermMini(chars1, chars2, buf, buf_end);
478 
479         if (ptr) {
480             return ptr;
481         }
482 
483         // check for partial match at end ???
484         return buf - 1;
485     }
486 #endif
487 
488     assert((buf_end - buf) >= VERM_BOUNDARY);
489     size_t min = (size_t)buf_end % VERM_BOUNDARY;
490     if (min) {
491         // input not aligned, so we need to run one iteration with an unaligned
492         // load, then skip buf forward to the next aligned address. There's
493         // some small overlap here, but we don't mind scanning it twice if we
494         // can do it quickly, do we?
495         const u8 *ptr = nocase ? rdvermPreconditionNocase(chars1, chars2,
496                                                           buf_end - VERM_BOUNDARY)
497                                : rdvermPrecondition(chars1, chars2,
498                                                     buf_end - VERM_BOUNDARY);
499 
500         if (ptr) {
501             return ptr;
502         }
503 
504         buf_end -= min;
505         if (buf >= buf_end) {
506             return buf_end;
507         }
508     }
509 
510     // Aligned loops from here on in
511     if (nocase) {
512         return rdvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end);
513     } else {
514         return rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end);
515     }
516 }
517 
518 #endif /* VERMICELLI_H */
519