1;;
2;; Copyright (c) 2019-2020, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;;     * Redistributions of source code must retain the above copyright notice,
8;;       this list of conditions and the following disclaimer.
9;;     * Redistributions in binary form must reproduce the above copyright
10;;       notice, this list of conditions and the following disclaimer in the
11;;       documentation and/or other materials provided with the distribution.
12;;     * Neither the name of Intel Corporation nor the names of its contributors
13;;       may be used to endorse or promote products derived from this software
14;;       without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
28%include "include/os.asm"
29%include "include/reg_sizes.asm"
30
section .data
default rel

;; Index/increment constant pairs for the constant-time lookup routines
;; below.  idx_tabN holds the indices of the first XMM register worth of
;; N-bit elements; the matching add_M constant advances every lane by the
;; number of elements consumed per loop iteration.

align 16
idx_tab8:
        db 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
        db 0x8,  0x9,  0xA,  0xB,  0xC,  0xD,  0xE,  0xF,

align 16
add_16:
        ;; 16 bytes of 16: advance the 8-bit index lanes by 16 per iteration
        db 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
        db 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10

align 16
idx_tab16:
        dw 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7

align 16
add_8:
        ;; 8 words of 8: advance the 16-bit index lanes by 8 per iteration
        dw 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8

align 16
idx_tab32:
        dd 0x0,  0x1,  0x2,  0x3

align 16
add_4:
        ;; 4 dwords of 4: advance the 32-bit index lanes by 4 per iteration
        dd 0x4, 0x4, 0x4, 0x4

align 16
idx_tab64:
        dq 0x0,  0x1

align 16
add_2:
        ;; 2 qwords of 2: advance the 64-bit index lanes by 2 per iteration
        dq 0x2, 0x2

align 16
bcast_mask:
        ;; PSHUFB control replicating bytes 0-1 into every word position,
        ;; i.e. broadcasts a 16-bit value across all 8 word lanes
        db 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
        db 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01

;; Row selectors for the 16x8bit lookups: row i is 16 bytes of (i << 4).
;; Comparing a row against the high nibble of each 8-bit index yields a
;; per-byte all-ones/all-zeros mask selecting table row i.  Row 15 also
;; doubles as the 0xf0 high-nibble mask (and, shifted right by 4, the
;; 0x0f low-nibble mask) in lookup_16x8bit_sse below.
align 64
idx_rows_avx:
        dd 0x00000000, 0x00000000, 0x00000000, 0x00000000
        dd 0x10101010, 0x10101010, 0x10101010, 0x10101010
        dd 0x20202020, 0x20202020, 0x20202020, 0x20202020
        dd 0x30303030, 0x30303030, 0x30303030, 0x30303030
        dd 0x40404040, 0x40404040, 0x40404040, 0x40404040
        dd 0x50505050, 0x50505050, 0x50505050, 0x50505050
        dd 0x60606060, 0x60606060, 0x60606060, 0x60606060
        dd 0x70707070, 0x70707070, 0x70707070, 0x70707070
        dd 0x80808080, 0x80808080, 0x80808080, 0x80808080
        dd 0x90909090, 0x90909090, 0x90909090, 0x90909090
        dd 0xa0a0a0a0, 0xa0a0a0a0, 0xa0a0a0a0, 0xa0a0a0a0
        dd 0xb0b0b0b0, 0xb0b0b0b0, 0xb0b0b0b0, 0xb0b0b0b0
        dd 0xc0c0c0c0, 0xc0c0c0c0, 0xc0c0c0c0, 0xc0c0c0c0
        dd 0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0
        dd 0xe0e0e0e0, 0xe0e0e0e0, 0xe0e0e0e0, 0xe0e0e0e0
        dd 0xf0f0f0f0, 0xf0f0f0f0, 0xf0f0f0f0, 0xf0f0f0f0

;; Same row selectors duplicated to 32 bytes per row for YMM-register use
align 64
idx_rows_avx2:
        dd 0x00000000, 0x00000000, 0x00000000, 0x00000000
        dd 0x00000000, 0x00000000, 0x00000000, 0x00000000
        dd 0x10101010, 0x10101010, 0x10101010, 0x10101010
        dd 0x10101010, 0x10101010, 0x10101010, 0x10101010
        dd 0x20202020, 0x20202020, 0x20202020, 0x20202020
        dd 0x20202020, 0x20202020, 0x20202020, 0x20202020
        dd 0x30303030, 0x30303030, 0x30303030, 0x30303030
        dd 0x30303030, 0x30303030, 0x30303030, 0x30303030
        dd 0x40404040, 0x40404040, 0x40404040, 0x40404040
        dd 0x40404040, 0x40404040, 0x40404040, 0x40404040
        dd 0x50505050, 0x50505050, 0x50505050, 0x50505050
        dd 0x50505050, 0x50505050, 0x50505050, 0x50505050
        dd 0x60606060, 0x60606060, 0x60606060, 0x60606060
        dd 0x60606060, 0x60606060, 0x60606060, 0x60606060
        dd 0x70707070, 0x70707070, 0x70707070, 0x70707070
        dd 0x70707070, 0x70707070, 0x70707070, 0x70707070
        dd 0x80808080, 0x80808080, 0x80808080, 0x80808080
        dd 0x80808080, 0x80808080, 0x80808080, 0x80808080
        dd 0x90909090, 0x90909090, 0x90909090, 0x90909090
        dd 0x90909090, 0x90909090, 0x90909090, 0x90909090
        dd 0xa0a0a0a0, 0xa0a0a0a0, 0xa0a0a0a0, 0xa0a0a0a0
        dd 0xa0a0a0a0, 0xa0a0a0a0, 0xa0a0a0a0, 0xa0a0a0a0
        dd 0xb0b0b0b0, 0xb0b0b0b0, 0xb0b0b0b0, 0xb0b0b0b0
        dd 0xb0b0b0b0, 0xb0b0b0b0, 0xb0b0b0b0, 0xb0b0b0b0
        dd 0xc0c0c0c0, 0xc0c0c0c0, 0xc0c0c0c0, 0xc0c0c0c0
        dd 0xc0c0c0c0, 0xc0c0c0c0, 0xc0c0c0c0, 0xc0c0c0c0
        dd 0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0
        dd 0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0, 0xd0d0d0d0
        dd 0xe0e0e0e0, 0xe0e0e0e0, 0xe0e0e0e0, 0xe0e0e0e0
        dd 0xe0e0e0e0, 0xe0e0e0e0, 0xe0e0e0e0, 0xe0e0e0e0
        dd 0xf0f0f0f0, 0xf0f0f0f0, 0xf0f0f0f0, 0xf0f0f0f0
        dd 0xf0f0f0f0, 0xf0f0f0f0, 0xf0f0f0f0, 0xf0f0f0f0
        ;; extra: 32 bytes of 0x0f - presumably the low-nibble mask for an
        ;; AVX2 16x8bit variant not visible in this chunk - TODO confirm
        dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
        dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
129
section .text

;; Integer argument registers: System V AMD64 when LINUX is defined,
;; Microsoft x64 calling convention otherwise.
%ifdef LINUX
        %define arg1    rdi
        %define arg2    rsi
        %define arg3    rdx
%else
        %define arg1    rcx
        %define arg2    rdx
        %define arg3    r8
%endif

;; Register roles shared by the scalar lookup_* routines below
%define bcast_idx xmm0          ; index being searched, broadcast to all lanes
%define xadd      xmm1          ; per-iteration index increment constant
%define accum_val xmm2          ; OR-accumulator of masked table values
%define xindices  xmm3          ; indices covered by the current iteration
%define xtmp      xmm4          ; scratch
%define xtmp2     xmm5          ; scratch
%define tmp       r9            ; scratch GP (unused in the code visible here)
%define offset    r10           ; running byte offset into the table

%define table   arg1
%define idx     arg2
%define size    arg3
154
; uint8_t lookup_8bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned: accessed with movdqa)
; arg 2 : index to look up
; arg 3 : size of table to look up (multiple of 16 bytes)
;
; Constant-time lookup: every table entry is read regardless of idx, so the
; memory access pattern does not depend on the (possibly secret) index.
align 32
MKGLOBAL(lookup_8bit_sse,function,internal)
lookup_8bit_sse:

        ;; Number of loop iters = table size / 16 (16 byte values per XMM)
        shr     size, 4
        je      exit8_sse               ; size < 16: return with rax unmodified

        xor     offset, offset

        ;; Broadcast idx to look up
        movd    bcast_idx, DWORD(idx)
        pxor    xtmp, xtmp
        pxor    accum_val, accum_val
        pshufb  bcast_idx, xtmp         ; all-zero control: broadcast byte 0

        movdqa  xadd,     [rel add_16]
        movdqa  xindices, [rel idx_tab8]

loop8_sse:
        movdqa  xtmp, xindices

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        pcmpeqb xtmp, bcast_idx

        ;; Load next 16 values
        movdqa  xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        pand    xtmp2, xtmp

        por     accum_val, xtmp2

        ;; Get next 16 indices
        paddb   xindices, xadd

        add     offset, 16
        dec     size

        jne     loop8_sse

        ;; Extract value from XMM register: at most one byte lane of
        ;; accum_val is non-zero, so OR-ing left-shifted copies funnels
        ;; that byte into the top lane (byte 15).
        movdqa  xtmp, accum_val
        pslldq  xtmp, 8      ; shift left by 64 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 4      ; shift left by 32 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 2      ; shift left by 16 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 1      ; shift left by 8 bits
        por     accum_val, xtmp

        pextrb  rax, accum_val, 15      ; SSE4.1

exit8_sse:
        ret
222
; uint8_t lookup_8bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned: accessed with vmovdqa)
; arg 2 : index to look up
; arg 3 : size of table to look up (multiple of 16 bytes)
;
; AVX variant of lookup_8bit_sse; same constant-time access pattern.
align 32
MKGLOBAL(lookup_8bit_avx,function,internal)
lookup_8bit_avx:
        ;; Number of loop iters = table size / 16 (16 byte values per XMM)
        shr     size, 4
        je      exit8_avx               ; size < 16: return with rax unmodified

        xor     offset, offset

        ;; Broadcast idx to look up
        vmovd   bcast_idx, DWORD(idx)
        vpxor   xtmp, xtmp
        vpxor   accum_val, accum_val
        vpshufb bcast_idx, xtmp         ; all-zero control: broadcast byte 0

        vmovdqa xadd,     [rel add_16]
        vmovdqa xindices, [rel idx_tab8]

loop8_avx:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        vpcmpeqb xtmp, xindices, bcast_idx

        ;; Load next 16 values
        vmovdqa xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        vpand   xtmp2, xtmp

        vpor    accum_val, xtmp2

        ;; Get next 16 indices
        vpaddb  xindices, xadd

        add     offset, 16
        dec     size

        jne     loop8_avx

        ;; Extract value from XMM register: at most one byte lane of
        ;; accum_val is non-zero; OR shifted copies to funnel it into byte 15
        vpslldq xtmp, accum_val, 8      ; shift left by 64 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 4      ; shift left by 32 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 2      ; shift left by 16 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 1      ; shift left by 8 bits
        vpor    accum_val, xtmp

        vpextrb rax, accum_val, 15

exit8_avx:

        ret
284
; uint16_t lookup_16bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned: accessed with movdqa)
; arg 2 : index to look up
; arg 3 : size of table to look up (in 16-bit elements, multiple of 8)
;
; NOTE(review): the return is a full 16-bit value (PEXTRW below); the header
; previously said uint8_t - confirm against the C prototype.
; Constant-time lookup: every table entry is read regardless of idx.
align 32
MKGLOBAL(lookup_16bit_sse,function,internal)
lookup_16bit_sse:

        ;; Number of loop iters = table size / 8 (8 word values per XMM)
        shr     size, 3
        je      exit16_sse              ; size < 8: return with rax unmodified

        xor     offset, offset

        ;; Broadcast idx to look up (replicate low word into all 8 lanes)
        movd    bcast_idx, DWORD(idx)
        movdqa  xtmp, [rel bcast_mask]
        pxor    accum_val, accum_val
        pshufb  bcast_idx, xtmp

        movdqa  xadd,     [rel add_8]
        movdqa  xindices, [rel idx_tab16]

loop16_sse:

        movdqa  xtmp, xindices

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        pcmpeqw xtmp, bcast_idx

        ;; Load next 8 values
        movdqa  xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        pand    xtmp2, xtmp

        por     accum_val, xtmp2

        ;; Get next 8 indices
        paddw   xindices, xadd
        add     offset, 16
        dec     size

        jne     loop16_sse

        ;; Extract value from XMM register: at most one word lane is
        ;; non-zero; OR left-shifted copies to funnel it into word 7.
        movdqa  xtmp, accum_val
        pslldq  xtmp, 8      ; shift left by 64 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 4      ; shift left by 32 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 2      ; shift left by 16 bits
        por     accum_val, xtmp

        pextrw  rax, accum_val, 7

exit16_sse:
        ret
348
; uint16_t lookup_16bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned: accessed with vmovdqa)
; arg 2 : index to look up
; arg 3 : size of table to look up (in 16-bit elements, multiple of 8)
;
; NOTE(review): the return is a full 16-bit value (VPEXTRW below); the header
; previously said uint8_t - confirm against the C prototype.
; AVX variant of lookup_16bit_sse; same constant-time access pattern.
align 32
MKGLOBAL(lookup_16bit_avx,function,internal)
lookup_16bit_avx:

        ;; Number of loop iters = table size / 8 (8 word values per XMM)
        shr     size, 3
        je      exit16_avx              ; size < 8: return with rax unmodified

        xor     offset, offset

        ;; Broadcast idx to look up (replicate low word into all 8 lanes)
        vmovd   bcast_idx, DWORD(idx)
        vmovdqa xtmp, [rel bcast_mask]
        vpxor   accum_val, accum_val
        vpshufb bcast_idx, xtmp

        vmovdqa xadd,     [rel add_8]
        vmovdqa xindices, [rel idx_tab16]

loop16_avx:

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        vpcmpeqw xtmp, xindices, bcast_idx

        ;; Load next 8 values
        vmovdqa xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        vpand   xtmp2, xtmp

        vpor    accum_val, xtmp2

        ;; Get next 8 indices
        vpaddw  xindices, xadd
        add     offset, 16
        dec     size

        jne     loop16_avx

        ;; Extract value from XMM register: at most one word lane is
        ;; non-zero; OR left-shifted copies to funnel it into word 7.
        vpslldq xtmp, accum_val, 8 ; shift left by 64 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 4 ; shift left by 32 bits
        vpor    accum_val, xtmp

        vpslldq xtmp, accum_val, 2 ; shift left by 16 bits
        vpor    accum_val, xtmp

        vpextrw rax, accum_val, 7

exit16_avx:
        ret
407
; uint32_t lookup_32bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned: accessed with movdqa)
; arg 2 : index to look up
; arg 3 : size of table to look up (in 32-bit elements, multiple of 4)
;
; Constant-time lookup: every table entry is read regardless of idx.
align 32
MKGLOBAL(lookup_32bit_sse,function,internal)
lookup_32bit_sse:

        ;; Number of loop iters = table size / 4 (4 dword values per XMM)
        shr     size, 2
        je      exit32_sse              ; size < 4: return with rax unmodified

        xor     offset, offset

        ;; Broadcast idx to look up (replicate dword 0 into all 4 lanes)
        movd    bcast_idx, DWORD(idx)
        pxor    accum_val, accum_val
        pshufd  bcast_idx, bcast_idx, 0

        movdqa  xadd,     [rel add_4]
        movdqa  xindices, [rel idx_tab32]

loop32_sse:
        movdqa  xtmp, xindices

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        pcmpeqd xtmp, bcast_idx

        ;; Load next 4 values
        movdqa  xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        pand    xtmp2, xtmp

        por     accum_val, xtmp2

        ;; Get next 4 indices
        paddd   xindices, xadd
        add     offset, 16
        dec     size

        jne     loop32_sse

        ;; Extract value from XMM register: at most one dword lane is
        ;; non-zero; OR right-shifted copies to funnel it into dword 0.
        movdqa  xtmp, accum_val
        psrldq  xtmp, 8      ; shift right by 64 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        psrldq  xtmp, 4      ; shift right by 32 bits
        por     accum_val, xtmp

        movd    eax, accum_val

exit32_sse:
        ret
465
466
467; uint32_t lookup_32bit_avx(const void *table, const uint32_t idx, const uint32_t size);
468; arg 1 : pointer to table to look up
469; arg 2 : index to look up
470; arg 3 : size of table to look up
471align 32
472MKGLOBAL(lookup_32bit_avx,function,internal)
473lookup_32bit_avx:
474        ;; Number of loop iters = matrix size / 4 (number of values in XMM)
475        shr     size, 2
476        je      exit32_avx
477
478        xor     offset, offset
479
480        ;; Broadcast idx to look up
481        vmovd   bcast_idx, DWORD(idx)
482        vpxor   accum_val, accum_val
483        vpshufd bcast_idx, bcast_idx, 0
484
485        vmovdqa xadd,     [rel add_4]
486        vmovdqa xindices, [rel idx_tab32]
487
488loop32_avx:
489        ;; Compare indices with idx
490        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
491        vpcmpeqd xtmp, xindices, bcast_idx
492
493        ;; Load next 4 values
494        vmovdqa xtmp2, [table + offset]
495
496        ;; This generates data with all 0s except the value we are looking for in the index to look up
497        vpand   xtmp2, xtmp
498
499        vpor    accum_val, xtmp2
500
501        ;; Get next 4 indices
502        vpaddd  xindices, xadd
503        add     offset, 16
504        dec     size
505
506        jne     loop32_avx
507
508        ;; Extract value from XMM register
509        vpsrldq xtmp, accum_val, 8 ; shift right by 64 bits
510        vpor    accum_val, xtmp
511
512        vpsrldq xtmp, accum_val, 4 ; shift right by 32 bits
513        vpor    accum_val, xtmp
514
515        vmovd   eax, accum_val
516
517exit32_avx:
518        ret
519
520
; uint64_t lookup_64bit_sse(const void *table, const uint64_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned: accessed with movdqa)
; arg 2 : index to look up
; arg 3 : size of table to look up (in 64-bit elements, multiple of 2)
;
; NOTE(review): idx is consumed as a full 64-bit value (movq/pinsrq below)
; and a qword is returned; the header previously declared uint32_t idx -
; confirm against the C prototype.
; Constant-time lookup: every table entry is read regardless of idx.
align 32
MKGLOBAL(lookup_64bit_sse,function,internal)
lookup_64bit_sse:
        ;; Number of loop iters = table size / 2 (2 qword values per XMM)
        shr     size, 1
        je      exit64_sse              ; size < 2: return with rax unmodified

        xor     offset, offset

        ;; Broadcast idx to look up (copy into both qword lanes)
        movq    bcast_idx, idx
        pxor    accum_val, accum_val
        pinsrq  bcast_idx, idx, 1       ; SSE4.1

        movdqa  xadd,     [rel add_2]
        movdqa  xindices, [rel idx_tab64]

loop64_sse:
        movdqa  xtmp, xindices

        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        pcmpeqq xtmp, bcast_idx

        ;; Load next 2 values
        movdqa  xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        pand    xtmp2, xtmp

        por     accum_val, xtmp2

        ;; Get next 2 indices
        paddq   xindices, xadd
        add     offset, 16
        dec     size

        jne     loop64_sse

        ;; Extract value from XMM register: at most one qword lane is
        ;; non-zero; OR the shifted copy to funnel it into qword 0.
        movdqa  xtmp, accum_val
        psrldq  xtmp, 8      ; shift right by 64 bits
        por     accum_val, xtmp

        movq     rax, accum_val

exit64_sse:
        ret
573
574
; uint64_t lookup_64bit_avx(const void *table, const uint64_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (16-byte aligned: accessed with vmovdqa)
; arg 2 : index to look up
; arg 3 : size of table to look up (in 64-bit elements, multiple of 2)
;
; NOTE(review): idx is consumed as a full 64-bit value (vmovq/vpinsrq below)
; and a qword is returned; the header previously declared uint32_t idx -
; confirm against the C prototype.
; AVX variant of lookup_64bit_sse; same constant-time access pattern.
align 32
MKGLOBAL(lookup_64bit_avx,function,internal)
lookup_64bit_avx:
        ;; Number of loop iters = table size / 2 (2 qword values per XMM)
        shr     size, 1
        je      exit64_avx              ; size < 2: return with rax unmodified

        xor     offset, offset

        ;; Broadcast idx to look up (copy into both qword lanes)
        vmovq    bcast_idx, idx
        vpxor    accum_val, accum_val
        vpinsrq  bcast_idx, idx, 1

        vmovdqa xadd,     [rel add_2]
        vmovdqa xindices, [rel idx_tab64]

loop64_avx:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        vpcmpeqq xtmp, xindices, bcast_idx

        ;; Load next 2 values
        vmovdqa xtmp2, [table + offset]

        ;; This generates data with all 0s except the value we are looking for in the index to look up
        vpand   xtmp2, xtmp

        vpor    accum_val, xtmp2

        ;; Get next 2 indices
        vpaddq  xindices, xadd
        add     offset, 16
        dec     size

        jne     loop64_avx

        ;; Extract value from XMM register: at most one qword lane is
        ;; non-zero; OR the shifted copy to funnel it into qword 0.
        vpsrldq xtmp, accum_val, 8 ; shift right by 64 bits
        vpor    accum_val, xtmp

        vmovq   rax, accum_val

exit64_avx:
        ret
623
; __m128i lookup_16x8bit_sse(const __m128i indexes, const void *table)
; arg 1 : vector with 16 8-bit indexes to be looked up
; arg 2 : pointer to a 256 element table (16-byte aligned: accessed with movdqa)
;
; Constant-time 16-way byte lookup.  The 256-byte table is treated as
; 16 rows of 16 bytes.  For every index byte, the high nibble selects the
; row (PCMPEQB against idx_rows_avx row constants) and the low nibble
; selects the column within the row (PSHUFB).  All 16 rows are always
; read, so the memory access pattern is independent of the index values.
align 32
MKGLOBAL(lookup_16x8bit_sse,function,internal)
lookup_16x8bit_sse:
%define arg_indexes xmm0
%define arg_return  xmm0
%define arg_table   arg1

%ifndef LINUX
%undef arg_table
%define arg_table   arg2

        ; Read indices from memory, as __m128i parameters are stored
        ; in stack (aligned to 16 bytes) and its address is passed through GP register on Windows
        movdqa          arg_indexes, [arg1]
        mov             rax, rsp                ; preserve original rsp
        sub             rsp, (10 * 16)
        and             rsp, ~15                ; 16-byte align save area for movdqa
        ;; xmm6:xmm15 need to be maintained for Windows
        movdqa          [rsp + 0*16], xmm6
        movdqa          [rsp + 1*16], xmm7
        movdqa          [rsp + 2*16], xmm8
        movdqa          [rsp + 3*16], xmm9
        movdqa          [rsp + 4*16], xmm10
        movdqa          [rsp + 5*16], xmm11
        movdqa          [rsp + 6*16], xmm12
        movdqa          [rsp + 7*16], xmm13
        movdqa          [rsp + 8*16], xmm14
        movdqa          [rsp + 9*16], xmm15
%endif
        ;; Split each index byte into its nibbles:
        ;; xmm1 = high nibbles (row select), xmm2 = low nibbles (column select)
        movdqa          xmm15, [rel idx_rows_avx + (15 * 16)]   ; 0xf0 mask
        movdqa          xmm14, xmm15
        psrlq           xmm14, 4                                ; 0x0f mask
        movdqa          xmm1, arg_indexes
        movdqa          xmm2, arg_indexes
        pand            xmm1, xmm15        ;; top nibble part of the index
        pand            xmm2, xmm14        ;; low nibble part of the index

        ;; Rows 0-5: row-match masks in xmm9-xmm14, table rows (column
        ;; shuffled by xmm2) in xmm3-xmm8
        movdqa          xmm9,  xmm1
        movdqa          xmm10, xmm1
        movdqa          xmm11, xmm1
        movdqa          xmm12, xmm1
        movdqa          xmm13, xmm1
        movdqa          xmm14, xmm1
        pcmpeqb         xmm9, [rel idx_rows_avx + (0 * 16)]
        movdqa          xmm3, [arg_table + (0 * 16)]
        pcmpeqb         xmm10, [rel idx_rows_avx + (1 * 16)]
        movdqa          xmm4, [arg_table + (1 * 16)]
        pcmpeqb         xmm11, [rel idx_rows_avx + (2 * 16)]
        movdqa          xmm5, [arg_table + (2 * 16)]
        pcmpeqb         xmm12, [rel idx_rows_avx + (3 * 16)]
        movdqa          xmm6, [arg_table + (3 * 16)]
        pcmpeqb         xmm13, [rel idx_rows_avx + (4 * 16)]
        movdqa          xmm7, [arg_table + (4 * 16)]
        pcmpeqb         xmm14, [rel idx_rows_avx + (5 * 16)]
        movdqa          xmm8, [arg_table + (5 * 16)]

        pshufb          xmm3, xmm2
        pshufb          xmm4, xmm2
        pshufb          xmm5, xmm2
        pshufb          xmm6, xmm2
        pshufb          xmm7, xmm2
        pshufb          xmm8, xmm2

        ;; Keep row data only in lanes whose index selected that row
        pand            xmm9,  xmm3
        pand            xmm10, xmm4
        pand            xmm11, xmm5
        pand            xmm12, xmm6
        pand            xmm13, xmm7
        pand            xmm14, xmm8

        por             xmm9,  xmm10
        por             xmm11, xmm12
        por             xmm14, xmm13
        movdqa          arg_return, xmm9
        por             arg_return, xmm11

        ;; arg_return & xmm14 carry the current OR result (rows 0-5)
        ;; from now on.

        ;; Rows 6-10
        movdqa          xmm9,  xmm1
        movdqa          xmm10, xmm1
        movdqa          xmm11, xmm1
        movdqa          xmm12, xmm1
        movdqa          xmm13, xmm1

        pcmpeqb         xmm9,  [rel idx_rows_avx + (6 * 16)]
        movdqa          xmm3, [arg_table + (6 * 16)]
        pcmpeqb         xmm10, [rel idx_rows_avx + (7 * 16)]
        movdqa          xmm4, [arg_table + (7 * 16)]
        pcmpeqb         xmm11, [rel idx_rows_avx + (8 * 16)]
        movdqa          xmm5, [arg_table + (8 * 16)]
        pcmpeqb         xmm12, [rel idx_rows_avx + (9 * 16)]
        movdqa          xmm6, [arg_table + (9 * 16)]
        pcmpeqb         xmm13, [rel idx_rows_avx + (10 * 16)]
        movdqa          xmm7, [arg_table + (10 * 16)]

        pshufb          xmm3, xmm2
        pshufb          xmm4, xmm2
        pshufb          xmm5, xmm2
        pshufb          xmm6, xmm2
        pshufb          xmm7, xmm2

        pand            xmm9,  xmm3
        pand            xmm10, xmm4
        pand            xmm11, xmm5
        pand            xmm12, xmm6
        pand            xmm13, xmm7

        por             xmm9,  xmm10
        por             xmm11, xmm12
        por             xmm14, xmm13
        por             arg_return, xmm9
        por             xmm14, xmm11

        ;; arg_return & xmm14 carry the current OR result (rows 0-10)

        ;; Rows 11-15
        movdqa          xmm9,  xmm1
        movdqa          xmm10, xmm1
        movdqa          xmm11, xmm1
        movdqa          xmm12, xmm1
        movdqa          xmm13, xmm1

        pcmpeqb         xmm9,  [rel idx_rows_avx + (11 * 16)]
        movdqa          xmm3, [arg_table + (11 * 16)]
        pcmpeqb         xmm10, [rel idx_rows_avx + (12 * 16)]
        movdqa          xmm4, [arg_table + (12 * 16)]
        pcmpeqb         xmm11, [rel idx_rows_avx + (13 * 16)]
        movdqa          xmm5, [arg_table + (13 * 16)]
        pcmpeqb         xmm12, [rel idx_rows_avx + (14 * 16)]
        movdqa          xmm6, [arg_table + (14 * 16)]
        pcmpeqb         xmm13, [rel idx_rows_avx + (15 * 16)]
        movdqa          xmm7, [arg_table + (15 * 16)]

        pshufb          xmm3, xmm2
        pshufb          xmm4, xmm2
        pshufb          xmm5, xmm2
        pshufb          xmm6, xmm2
        pshufb          xmm7, xmm2

        pand            xmm9,  xmm3
        pand            xmm10, xmm4
        pand            xmm11, xmm5
        pand            xmm12, xmm6
        pand            xmm13, xmm7

        ;; Fold everything into arg_return
        por             xmm9,  xmm10
        por             xmm11, xmm12
        por             xmm14, xmm13
        por             arg_return, xmm9
        por             xmm14, xmm11
        por             arg_return, xmm14

%ifndef LINUX
        ;; Restore Windows callee-saved xmm6:xmm15
        movdqa          xmm15, [rsp + 9*16]
        movdqa          xmm14, [rsp + 8*16]
        movdqa          xmm13, [rsp + 7*16]
        movdqa          xmm12, [rsp + 6*16]
        movdqa          xmm11, [rsp + 5*16]
        movdqa          xmm10, [rsp + 4*16]
        movdqa          xmm9,  [rsp + 3*16]
        movdqa          xmm8,  [rsp + 2*16]
        movdqa          xmm7,  [rsp + 1*16]
        movdqa          xmm6,  [rsp + 0*16]
%ifdef SAFE_DATA
        ;; Clear the save area so register contents do not linger on the stack
        pxor            xmm5, xmm5
        movdqa          [rsp + 0*16], xmm5
        movdqa          [rsp + 1*16], xmm5
        movdqa          [rsp + 2*16], xmm5
        movdqa          [rsp + 3*16], xmm5
        movdqa          [rsp + 4*16], xmm5
        movdqa          [rsp + 5*16], xmm5
        movdqa          [rsp + 6*16], xmm5
        movdqa          [rsp + 7*16], xmm5
        movdqa          [rsp + 8*16], xmm5
        movdqa          [rsp + 9*16], xmm5
%endif                          ; SAFE_DATA
        mov             rsp, rax                ; restore original rsp
%endif                          ; !LINUX
        ret
%undef arg_indexes
%undef arg_return
%undef arg_table
809
810; __m128i lookup_16x8bit_avx(const __m128i indexes, const void *table)
811; arg 1 : vector with 16 8-bit indexes to be looked up
812; arg 2 : pointer to a 256 element table
813align 32
814MKGLOBAL(lookup_16x8bit_avx,function,internal)
815lookup_16x8bit_avx:
816%define arg_indexes xmm0
817%define arg_return  xmm0
818%define arg_table   arg1
819
820%ifndef LINUX
821%undef arg_table
822%define arg_table   arg2
823
824        ; Read indices from memory, as __m128i parameters are stored
825        ; in stack (aligned to 16 bytes) and its address is passed through GP register on Windows
826        vmovdqa         arg_indexes, [arg1]
827        mov             rax, rsp
828        sub             rsp, (10 * 16)
829        and             rsp, ~15
830        ;; xmm6:xmm15 need to be maintained for Windows
831        vmovdqa         [rsp + 0*16], xmm6
832        vmovdqa         [rsp + 1*16], xmm7
833        vmovdqa         [rsp + 2*16], xmm8
834        vmovdqa         [rsp + 3*16], xmm9
835        vmovdqa         [rsp + 4*16], xmm10
836        vmovdqa         [rsp + 5*16], xmm11
837        vmovdqa         [rsp + 6*16], xmm12
838        vmovdqa         [rsp + 7*16], xmm13
839        vmovdqa         [rsp + 8*16], xmm14
840        vmovdqa         [rsp + 9*16], xmm15
841%endif                          ; !LINUX
842
843        vmovdqa         xmm15, [rel idx_rows_avx + (15 * 16)]
844        vpsrlq          xmm2, xmm15, 4
845
846        vpand           xmm1, xmm15, arg_indexes        ;; top nibble part of the index
847        vpand           xmm2, xmm2, arg_indexes         ;; low nibble part of the index
848
849        vpcmpeqb        xmm9,  xmm1, [rel idx_rows_avx + (0 * 16)]
850        vmovdqa         xmm3, [arg_table + (0 * 16)]
851        vpcmpeqb        xmm10, xmm1, [rel idx_rows_avx + (1 * 16)]
852        vmovdqa         xmm4, [arg_table + (1 * 16)]
853        vpcmpeqb        xmm11, xmm1, [rel idx_rows_avx + (2 * 16)]
854        vmovdqa         xmm5, [arg_table + (2 * 16)]
855        vpcmpeqb        xmm12, xmm1, [rel idx_rows_avx + (3 * 16)]
856        vmovdqa         xmm6, [arg_table + (3 * 16)]
857        vpcmpeqb        xmm13, xmm1, [rel idx_rows_avx + (4 * 16)]
858        vmovdqa         xmm7, [arg_table + (4 * 16)]
859        vpcmpeqb        xmm14, xmm1, [rel idx_rows_avx + (5 * 16)]
860        vmovdqa         xmm8, [arg_table + (5 * 16)]
861
862        vpshufb         xmm3, xmm3, xmm2
863        vpshufb         xmm4, xmm4, xmm2
864        vpshufb         xmm5, xmm5, xmm2
865        vpshufb         xmm6, xmm6, xmm2
866        vpshufb         xmm7, xmm7, xmm2
867        vpshufb         xmm8, xmm8, xmm2
868
869        vpand           xmm9,  xmm9,  xmm3
870        vpand           xmm10, xmm10, xmm4
871        vpand           xmm11, xmm11, xmm5
872        vpand           xmm12, xmm12, xmm6
873        vpand           xmm13, xmm13, xmm7
874        vpand           xmm14, xmm14, xmm8
875
876        vpor            xmm9,  xmm9,  xmm10
877        vpor            xmm11, xmm11, xmm12
878        vpor            xmm14, xmm13, xmm14
879        vpor            arg_return, xmm9, xmm11
880
881        ;; xmm8 and xmm14 are used for final OR result from now on.
882        ;; arg_return & xmm14 carry current OR result.
883
884        vpcmpeqb        xmm9,  xmm1, [rel idx_rows_avx + (6 * 16)]
885        vmovdqa         xmm3, [arg_table + (6 * 16)]
886        vpcmpeqb        xmm10, xmm1, [rel idx_rows_avx + (7 * 16)]
887        vmovdqa         xmm4, [arg_table + (7 * 16)]
888        vpcmpeqb        xmm11, xmm1, [rel idx_rows_avx + (8 * 16)]
889        vmovdqa         xmm5, [arg_table + (8 * 16)]
890        vpcmpeqb        xmm12, xmm1, [rel idx_rows_avx + (9 * 16)]
891        vmovdqa         xmm6, [arg_table + (9 * 16)]
892        vpcmpeqb        xmm13, xmm1, [rel idx_rows_avx + (10 * 16)]
893        vmovdqa         xmm7, [arg_table + (10 * 16)]
894
895        vpshufb         xmm3, xmm3, xmm2
896        vpshufb         xmm4, xmm4, xmm2
897        vpshufb         xmm5, xmm5, xmm2
898        vpshufb         xmm6, xmm6, xmm2
899        vpshufb         xmm7, xmm7, xmm2
900
901        vpand           xmm9,  xmm9,  xmm3
902        vpand           xmm10, xmm10, xmm4
903        vpand           xmm11, xmm11, xmm5
904        vpand           xmm12, xmm12, xmm6
905        vpand           xmm13, xmm13, xmm7
906
907        vpor            xmm9,  xmm9,  xmm10
908        vpor            xmm11, xmm11, xmm12
909        vpor            xmm15, xmm9,  xmm11
910        vpor            xmm8,  xmm14, xmm13
911
912        ;; arg_return, xmm15 & xmm8 carry current OR result
913
914        vpcmpeqb        xmm9,  xmm1, [rel idx_rows_avx + (11 * 16)]
915        vmovdqa         xmm3, [arg_table + (11 * 16)]
916        vpcmpeqb        xmm10, xmm1, [rel idx_rows_avx + (12 * 16)]
917        vmovdqa         xmm4, [arg_table + (12 * 16)]
918        vpcmpeqb        xmm11, xmm1, [rel idx_rows_avx + (13 * 16)]
919        vmovdqa         xmm5, [arg_table + (13 * 16)]
920        vpcmpeqb        xmm12, xmm1, [rel idx_rows_avx + (14 * 16)]
921        vmovdqa         xmm6, [arg_table + (14 * 16)]
922        vpcmpeqb        xmm13, xmm1, [rel idx_rows_avx + (15 * 16)]
923        vmovdqa         xmm7, [arg_table + (15 * 16)]
924
925        vpshufb         xmm3, xmm3, xmm2
926        vpshufb         xmm4, xmm4, xmm2
927        vpshufb         xmm5, xmm5, xmm2
928        vpshufb         xmm6, xmm6, xmm2
929        vpshufb         xmm7, xmm7, xmm2
930
931        vpand           xmm9,  xmm9,  xmm3
932        vpand           xmm10, xmm10, xmm4
933        vpand           xmm11, xmm11, xmm5
934        vpand           xmm12, xmm12, xmm6
935        vpand           xmm13, xmm13, xmm7
936
937        vpor            xmm14, xmm15, xmm8
938        vpor            xmm9,  xmm9,  xmm10
939        vpor            xmm11, xmm11, xmm12
940        vpor            xmm13, xmm13, xmm14
941        vpor            xmm15, xmm9,  xmm11
942        vpor            arg_return, arg_return, xmm13
943        vpor            arg_return, arg_return, xmm15
944
945%ifndef LINUX
946        vmovdqa         xmm15, [rsp + 9*16]
947        vmovdqa         xmm14, [rsp + 8*16]
948        vmovdqa         xmm13, [rsp + 7*16]
949        vmovdqa         xmm12, [rsp + 6*16]
950        vmovdqa         xmm11, [rsp + 5*16]
951        vmovdqa         xmm10, [rsp + 4*16]
952        vmovdqa         xmm9,  [rsp + 3*16]
953        vmovdqa         xmm8,  [rsp + 2*16]
954        vmovdqa         xmm7,  [rsp + 1*16]
955        vmovdqa         xmm6,  [rsp + 0*16]
956%ifdef SAFE_DATA
957        vpxor           xmm5, xmm5, xmm5
958        vmovdqa         [rsp + 0*16], xmm5
959        vmovdqa         [rsp + 1*16], xmm5
960        vmovdqa         [rsp + 2*16], xmm5
961        vmovdqa         [rsp + 3*16], xmm5
962        vmovdqa         [rsp + 4*16], xmm5
963        vmovdqa         [rsp + 5*16], xmm5
964        vmovdqa         [rsp + 6*16], xmm5
965        vmovdqa         [rsp + 7*16], xmm5
966        vmovdqa         [rsp + 8*16], xmm5
967        vmovdqa         [rsp + 9*16], xmm5
968%endif
969        mov             rsp, rax
970%endif                          ; !LINUX
971        ret
972%undef arg_indexes
973%undef arg_return
974%undef arg_table
975
; __m256i lookup_32x8bit_avx2(const __m256i indexes, const void *table)
; arg 1 : vector with 32 8-bit indexes to be looked up
; arg 2 : pointer to a 256 element table
;
; Constant-time 256-entry byte-table lookup: all 16 rows of the table are
; always read for every input, so the memory access pattern is independent
; of the (potentially secret) index values.
;
; Method - each index byte is split into a top nibble (row selector) and a
; low nibble (column selector).  For every 16-byte table row r (broadcast
; into both 128-bit lanes):
;       match  = (top_nibble == r) ? 0xFF : 0x00     (vpcmpeqb)
;       value  = row r shuffled by the low nibbles   (vpshufb)
;       result |= match AND value                    (vpand + vpor tree)
; Exactly one row matches per byte, so OR-ing all 16 masked terms yields
; the looked-up byte.  Rows are processed in three batches (6 + 5 + 5) to
; fit the working set into ymm3..ymm15 while accumulating partial ORs.
align 32
MKGLOBAL(lookup_32x8bit_avx2,function,internal)
lookup_32x8bit_avx2:
%define arg_indexes ymm0
%define arg_return  ymm0        ; result is returned in place of the indexes
%define arg_table   arg1

%ifndef LINUX
%undef arg_table
%define arg_table   arg2        ; Win64: table pointer arrives in the 2nd slot

        ;; Win64 ABI: xmm6:xmm15 are callee-saved - spill them to a
        ;; 32-byte aligned scratch area (rax preserves the caller's rsp).
        mov             rax, rsp
        sub             rsp, (10 * 16)
        and             rsp, ~31        ; 32B align for the ymm clears in SAFE_DATA
        ;; xmm6:xmm15 need to be maintained for Windows
        vmovdqa         [rsp + 0*16], xmm6
        vmovdqa         [rsp + 1*16], xmm7
        vmovdqa         [rsp + 2*16], xmm8
        vmovdqa         [rsp + 3*16], xmm9
        vmovdqa         [rsp + 4*16], xmm10
        vmovdqa         [rsp + 5*16], xmm11
        vmovdqa         [rsp + 6*16], xmm12
        vmovdqa         [rsp + 7*16], xmm13
        vmovdqa         [rsp + 8*16], xmm14
        vmovdqa         [rsp + 9*16], xmm15
%endif                          ; !LINUX

        ;; NOTE(review): assumes idx_rows_avx2 row i holds byte (i << 4)
        ;; replicated 32 times, so row 15 is the 0xF0 top-nibble mask -
        ;; confirm against the table definition earlier in this file.
        vmovdqa         ymm15, [rel idx_rows_avx2 + (15 * 32)]
        vpsrlq          ymm2, ymm15, 4  ; 0xF0.. >> 4 = 0x0F.. (low-nibble mask)

        vpand           ymm1, ymm15, arg_indexes        ;; top nibble part of the index
        vpand           ymm2, ymm2, arg_indexes         ;; low nibble part of the index

        ;; Batch 1 (rows 0-5): row-match masks interleaved with table-row
        ;; broadcasts (vbroadcastf128 copies 16 table bytes to both lanes).
        vpcmpeqb        ymm9,  ymm1, [rel idx_rows_avx2 + (0 * 32)]
        vbroadcastf128  ymm3, [arg_table + (0 * 16)]
        vpcmpeqb        ymm10, ymm1, [rel idx_rows_avx2 + (1 * 32)]
        vbroadcastf128  ymm4, [arg_table + (1 * 16)]
        vpcmpeqb        ymm11, ymm1, [rel idx_rows_avx2 + (2 * 32)]
        vbroadcastf128  ymm5, [arg_table + (2 * 16)]
        vpcmpeqb        ymm12, ymm1, [rel idx_rows_avx2 + (3 * 32)]
        vbroadcastf128  ymm6, [arg_table + (3 * 16)]
        vpcmpeqb        ymm13, ymm1, [rel idx_rows_avx2 + (4 * 32)]
        vbroadcastf128  ymm7, [arg_table + (4 * 16)]
        vpcmpeqb        ymm14, ymm1, [rel idx_rows_avx2 + (5 * 32)]
        vbroadcastf128  ymm8, [arg_table + (5 * 16)]

        ;; select bytes from each row using the low nibbles
        vpshufb         ymm3, ymm3, ymm2
        vpshufb         ymm4, ymm4, ymm2
        vpshufb         ymm5, ymm5, ymm2
        vpshufb         ymm6, ymm6, ymm2
        vpshufb         ymm7, ymm7, ymm2
        vpshufb         ymm8, ymm8, ymm2

        ;; keep only lanes whose top nibble matched this row
        vpand           ymm9,  ymm9,  ymm3
        vpand           ymm10, ymm10, ymm4
        vpand           ymm11, ymm11, ymm5
        vpand           ymm12, ymm12, ymm6
        vpand           ymm13, ymm13, ymm7
        vpand           ymm14, ymm14, ymm8

        ;; fold batch-1 terms: partial results land in arg_return and ymm14
        vpor            ymm9,  ymm9,  ymm10
        vpor            ymm11, ymm11, ymm12
        vpor            ymm14, ymm13, ymm14
        vpor            arg_return, ymm9, ymm11

        ;; ymm8 and ymm14 are reserved as OR accumulators from now on.
        ;; arg_return & ymm14 carry the current OR result.

        ;; Batch 2 (rows 6-10)
        vpcmpeqb        ymm9,  ymm1, [rel idx_rows_avx2 + (6 * 32)]
        vbroadcastf128  ymm3, [arg_table + (6 * 16)]
        vpcmpeqb        ymm10, ymm1, [rel idx_rows_avx2 + (7 * 32)]
        vbroadcastf128  ymm4, [arg_table + (7 * 16)]
        vpcmpeqb        ymm11, ymm1, [rel idx_rows_avx2 + (8 * 32)]
        vbroadcastf128  ymm5, [arg_table + (8 * 16)]
        vpcmpeqb        ymm12, ymm1, [rel idx_rows_avx2 + (9 * 32)]
        vbroadcastf128  ymm6, [arg_table + (9 * 16)]
        vpcmpeqb        ymm13, ymm1, [rel idx_rows_avx2 + (10 * 32)]
        vbroadcastf128  ymm7, [arg_table + (10 * 16)]

        vpshufb         ymm3, ymm3, ymm2
        vpshufb         ymm4, ymm4, ymm2
        vpshufb         ymm5, ymm5, ymm2
        vpshufb         ymm6, ymm6, ymm2
        vpshufb         ymm7, ymm7, ymm2

        vpand           ymm9,  ymm9,  ymm3
        vpand           ymm10, ymm10, ymm4
        vpand           ymm11, ymm11, ymm5
        vpand           ymm12, ymm12, ymm6
        vpand           ymm13, ymm13, ymm7

        ;; fold batch-2 terms into ymm15; merge ymm13 with prior ymm14 into ymm8
        vpor            ymm9,  ymm9,  ymm10
        vpor            ymm11, ymm11, ymm12
        vpor            ymm15, ymm9, ymm11
        vpor            ymm8,  ymm14, ymm13

        ;; arg_return, ymm15 & ymm8 carry current OR result

        ;; Batch 3 (rows 11-15)
        vpcmpeqb        ymm9,  ymm1, [rel idx_rows_avx2 + (11 * 32)]
        vbroadcastf128  ymm3, [arg_table + (11 * 16)]
        vpcmpeqb        ymm10, ymm1, [rel idx_rows_avx2 + (12 * 32)]
        vbroadcastf128  ymm4, [arg_table + (12 * 16)]
        vpcmpeqb        ymm11, ymm1, [rel idx_rows_avx2 + (13 * 32)]
        vbroadcastf128  ymm5, [arg_table + (13 * 16)]
        vpcmpeqb        ymm12, ymm1, [rel idx_rows_avx2 + (14 * 32)]
        vbroadcastf128  ymm6, [arg_table + (14 * 16)]
        vpcmpeqb        ymm13, ymm1, [rel idx_rows_avx2 + (15 * 32)]
        vbroadcastf128  ymm7, [arg_table + (15 * 16)]

        vpshufb         ymm3, ymm3, ymm2
        vpshufb         ymm4, ymm4, ymm2
        vpshufb         ymm5, ymm5, ymm2
        vpshufb         ymm6, ymm6, ymm2
        vpshufb         ymm7, ymm7, ymm2

        vpand           ymm9,  ymm9,  ymm3
        vpand           ymm10, ymm10, ymm4
        vpand           ymm11, ymm11, ymm5
        vpand           ymm12, ymm12, ymm6
        vpand           ymm13, ymm13, ymm7

        ;; final OR reduction: combine batch-3 terms with all accumulators
        vpor            ymm14, ymm15, ymm8
        vpor            ymm9,  ymm9,  ymm10
        vpor            ymm11, ymm11, ymm12
        vpor            ymm13, ymm13, ymm14
        vpor            ymm15, ymm9, ymm11
        vpor            arg_return, arg_return, ymm13
        vpor            arg_return, arg_return, ymm15

%ifndef LINUX
        ;; restore callee-saved xmm6:xmm15 (Win64 only preserves low 128 bits)
        vmovdqa         xmm15, [rsp + 9*16]
        vmovdqa         xmm14, [rsp + 8*16]
        vmovdqa         xmm13, [rsp + 7*16]
        vmovdqa         xmm12, [rsp + 6*16]
        vmovdqa         xmm11, [rsp + 5*16]
        vmovdqa         xmm10, [rsp + 4*16]
        vmovdqa         xmm9,  [rsp + 3*16]
        vmovdqa         xmm8,  [rsp + 2*16]
        vmovdqa         xmm7,  [rsp + 1*16]
        vmovdqa         xmm6,  [rsp + 0*16]
%ifdef SAFE_DATA
        ;; wipe the 160-byte spill area with five 32-byte stores
        ;; (rsp was 32B-aligned above, so vmovdqa ymm is safe)
        vpxor           ymm5, ymm5, ymm5
        vmovdqa         [rsp + 0*16], ymm5
        vmovdqa         [rsp + 2*16], ymm5
        vmovdqa         [rsp + 4*16], ymm5
        vmovdqa         [rsp + 6*16], ymm5
        vmovdqa         [rsp + 8*16], ymm5
%endif
        mov             rsp, rax        ; restore caller's stack pointer
%endif                          ; !LINUX
        ;; NOTE(review): no vzeroupper here - ymm0 carries the return value,
        ;; so the caller is presumed to be AVX2-aware code; verify callers.
        ret
%undef arg_indexes
%undef arg_return
%undef arg_table
1133
1134%ifdef LINUX
1135section .note.GNU-stack noalloc noexec nowrite progbits
1136%endif
1137