1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
24 * Copyright (c) 2019-2020 Samuel Neves
25 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
26 */
27
/*
 * Build guard: the code that follows is only assembled when
 * HAVE_SSE4_1 is defined.
 */
28#if defined(HAVE_SSE4_1)
29
/*
 * _ASM is defined before <sys/asm_linkage.h> is included; presumably it
 * tells that header it is being pulled into an assembly source rather
 * than C -- confirm against the header.
 */
30#define _ASM
31#include <sys/asm_linkage.h>
32
/*
 * Include <cet.h> only when targeting ELF with CET enabled (__CET__) and
 * the preprocessor supports __has_include and can find the header.
 * NOTE(review): <cet.h> is presumably what provides _CET_ENDBR -- confirm.
 */
33#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
34#if __has_include(<cet.h>)
35#include <cet.h>
36#endif
37#endif
38
/*
 * Fallback: if nothing above defined _CET_ENDBR, define it as empty so
 * that its use at a function entry point expands to nothing.
 */
39#if !defined(_CET_ENDBR)
40#define _CET_ENDBR
41#endif
42
/*
 * Intel operand order (destination first) with registers written
 * without a '%' prefix, followed by the three exported entry points.
 */
43.intel_syntax noprefix
44.global zfs_blake3_compress_in_place_sse41
45.global zfs_blake3_compress_xof_sse41
46.global zfs_blake3_hash_many_sse41
47
/* Switch to the code section and mark the exported symbols as functions. */
48.text
49.type zfs_blake3_hash_many_sse41,@function
50.type zfs_blake3_compress_in_place_sse41,@function
51.type zfs_blake3_compress_xof_sse41,@function
52
53.p2align  6
54zfs_blake3_hash_many_sse41:
55        _CET_ENDBR
56        push    r15
57        push    r14
58        push    r13
59        push    r12
60        push    rbx
61        push    rbp
62        mov     rbp, rsp
63        sub     rsp, 360
64        and     rsp, 0xFFFFFFFFFFFFFFC0
65        neg     r9d
66        movd    xmm0, r9d
67        pshufd  xmm0, xmm0, 0x00
68        movdqa  xmmword ptr [rsp+0x130], xmm0
69        movdqa  xmm1, xmm0
70        pand    xmm1, xmmword ptr [ADD0+rip]
71        pand    xmm0, xmmword ptr [ADD1+rip]
72        movdqa  xmmword ptr [rsp+0x150], xmm0
73        movd    xmm0, r8d
74        pshufd  xmm0, xmm0, 0x00
75        paddd   xmm0, xmm1
76        movdqa  xmmword ptr [rsp+0x110], xmm0
77        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
78        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
79        pcmpgtd xmm1, xmm0
80        shr     r8, 32
81        movd    xmm2, r8d
82        pshufd  xmm2, xmm2, 0x00
83        psubd   xmm2, xmm1
84        movdqa  xmmword ptr [rsp+0x120], xmm2
85        mov     rbx, qword ptr [rbp+0x50]
86        mov     r15, rdx
87        shl     r15, 6
88        movzx   r13d, byte ptr [rbp+0x38]
89        movzx   r12d, byte ptr [rbp+0x48]
90        cmp     rsi, 4
91        jc      3f
922:
93        movdqu  xmm3, xmmword ptr [rcx]
94        pshufd  xmm0, xmm3, 0x00
95        pshufd  xmm1, xmm3, 0x55
96        pshufd  xmm2, xmm3, 0xAA
97        pshufd  xmm3, xmm3, 0xFF
98        movdqu  xmm7, xmmword ptr [rcx+0x10]
99        pshufd  xmm4, xmm7, 0x00
100        pshufd  xmm5, xmm7, 0x55
101        pshufd  xmm6, xmm7, 0xAA
102        pshufd  xmm7, xmm7, 0xFF
103        mov     r8, qword ptr [rdi]
104        mov     r9, qword ptr [rdi+0x8]
105        mov     r10, qword ptr [rdi+0x10]
106        mov     r11, qword ptr [rdi+0x18]
107        movzx   eax, byte ptr [rbp+0x40]
108        or      eax, r13d
109        xor     edx, edx
1109:
111        mov     r14d, eax
112        or      eax, r12d
113        add     rdx, 64
114        cmp     rdx, r15
115        cmovne  eax, r14d
116        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
117        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
118        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
119        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
120        movdqa  xmm12, xmm8
121        punpckldq xmm8, xmm9
122        punpckhdq xmm12, xmm9
123        movdqa  xmm14, xmm10
124        punpckldq xmm10, xmm11
125        punpckhdq xmm14, xmm11
126        movdqa  xmm9, xmm8
127        punpcklqdq xmm8, xmm10
128        punpckhqdq xmm9, xmm10
129        movdqa  xmm13, xmm12
130        punpcklqdq xmm12, xmm14
131        punpckhqdq xmm13, xmm14
132        movdqa  xmmword ptr [rsp], xmm8
133        movdqa  xmmword ptr [rsp+0x10], xmm9
134        movdqa  xmmword ptr [rsp+0x20], xmm12
135        movdqa  xmmword ptr [rsp+0x30], xmm13
136        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
137        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
138        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
139        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
140        movdqa  xmm12, xmm8
141        punpckldq xmm8, xmm9
142        punpckhdq xmm12, xmm9
143        movdqa  xmm14, xmm10
144        punpckldq xmm10, xmm11
145        punpckhdq xmm14, xmm11
146        movdqa  xmm9, xmm8
147        punpcklqdq xmm8, xmm10
148        punpckhqdq xmm9, xmm10
149        movdqa  xmm13, xmm12
150        punpcklqdq xmm12, xmm14
151        punpckhqdq xmm13, xmm14
152        movdqa  xmmword ptr [rsp+0x40], xmm8
153        movdqa  xmmword ptr [rsp+0x50], xmm9
154        movdqa  xmmword ptr [rsp+0x60], xmm12
155        movdqa  xmmword ptr [rsp+0x70], xmm13
156        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
157        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
158        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
159        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
160        movdqa  xmm12, xmm8
161        punpckldq xmm8, xmm9
162        punpckhdq xmm12, xmm9
163        movdqa  xmm14, xmm10
164        punpckldq xmm10, xmm11
165        punpckhdq xmm14, xmm11
166        movdqa  xmm9, xmm8
167        punpcklqdq xmm8, xmm10
168        punpckhqdq xmm9, xmm10
169        movdqa  xmm13, xmm12
170        punpcklqdq xmm12, xmm14
171        punpckhqdq xmm13, xmm14
172        movdqa  xmmword ptr [rsp+0x80], xmm8
173        movdqa  xmmword ptr [rsp+0x90], xmm9
174        movdqa  xmmword ptr [rsp+0xA0], xmm12
175        movdqa  xmmword ptr [rsp+0xB0], xmm13
176        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
177        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
178        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
179        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
180        movdqa  xmm12, xmm8
181        punpckldq xmm8, xmm9
182        punpckhdq xmm12, xmm9
183        movdqa  xmm14, xmm10
184        punpckldq xmm10, xmm11
185        punpckhdq xmm14, xmm11
186        movdqa  xmm9, xmm8
187        punpcklqdq xmm8, xmm10
188        punpckhqdq xmm9, xmm10
189        movdqa  xmm13, xmm12
190        punpcklqdq xmm12, xmm14
191        punpckhqdq xmm13, xmm14
192        movdqa  xmmword ptr [rsp+0xC0], xmm8
193        movdqa  xmmword ptr [rsp+0xD0], xmm9
194        movdqa  xmmword ptr [rsp+0xE0], xmm12
195        movdqa  xmmword ptr [rsp+0xF0], xmm13
196        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
197        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
198        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
199        movdqa  xmm12, xmmword ptr [rsp+0x110]
200        movdqa  xmm13, xmmword ptr [rsp+0x120]
201        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
202        movd    xmm15, eax
203        pshufd  xmm15, xmm15, 0x00
204        prefetcht0 [r8+rdx+0x80]
205        prefetcht0 [r9+rdx+0x80]
206        prefetcht0 [r10+rdx+0x80]
207        prefetcht0 [r11+rdx+0x80]
208        paddd   xmm0, xmmword ptr [rsp]
209        paddd   xmm1, xmmword ptr [rsp+0x20]
210        paddd   xmm2, xmmword ptr [rsp+0x40]
211        paddd   xmm3, xmmword ptr [rsp+0x60]
212        paddd   xmm0, xmm4
213        paddd   xmm1, xmm5
214        paddd   xmm2, xmm6
215        paddd   xmm3, xmm7
216        pxor    xmm12, xmm0
217        pxor    xmm13, xmm1
218        pxor    xmm14, xmm2
219        pxor    xmm15, xmm3
220        movdqa  xmm8, xmmword ptr [ROT16+rip]
221        pshufb  xmm12, xmm8
222        pshufb  xmm13, xmm8
223        pshufb  xmm14, xmm8
224        pshufb  xmm15, xmm8
225        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
226        paddd   xmm8, xmm12
227        paddd   xmm9, xmm13
228        paddd   xmm10, xmm14
229        paddd   xmm11, xmm15
230        pxor    xmm4, xmm8
231        pxor    xmm5, xmm9
232        pxor    xmm6, xmm10
233        pxor    xmm7, xmm11
234        movdqa  xmmword ptr [rsp+0x100], xmm8
235        movdqa  xmm8, xmm4
236        psrld   xmm8, 12
237        pslld   xmm4, 20
238        por     xmm4, xmm8
239        movdqa  xmm8, xmm5
240        psrld   xmm8, 12
241        pslld   xmm5, 20
242        por     xmm5, xmm8
243        movdqa  xmm8, xmm6
244        psrld   xmm8, 12
245        pslld   xmm6, 20
246        por     xmm6, xmm8
247        movdqa  xmm8, xmm7
248        psrld   xmm8, 12
249        pslld   xmm7, 20
250        por     xmm7, xmm8
251        paddd   xmm0, xmmword ptr [rsp+0x10]
252        paddd   xmm1, xmmword ptr [rsp+0x30]
253        paddd   xmm2, xmmword ptr [rsp+0x50]
254        paddd   xmm3, xmmword ptr [rsp+0x70]
255        paddd   xmm0, xmm4
256        paddd   xmm1, xmm5
257        paddd   xmm2, xmm6
258        paddd   xmm3, xmm7
259        pxor    xmm12, xmm0
260        pxor    xmm13, xmm1
261        pxor    xmm14, xmm2
262        pxor    xmm15, xmm3
263        movdqa  xmm8, xmmword ptr [ROT8+rip]
264        pshufb  xmm12, xmm8
265        pshufb  xmm13, xmm8
266        pshufb  xmm14, xmm8
267        pshufb  xmm15, xmm8
268        movdqa  xmm8, xmmword ptr [rsp+0x100]
269        paddd   xmm8, xmm12
270        paddd   xmm9, xmm13
271        paddd   xmm10, xmm14
272        paddd   xmm11, xmm15
273        pxor    xmm4, xmm8
274        pxor    xmm5, xmm9
275        pxor    xmm6, xmm10
276        pxor    xmm7, xmm11
277        movdqa  xmmword ptr [rsp+0x100], xmm8
278        movdqa  xmm8, xmm4
279        psrld   xmm8, 7
280        pslld   xmm4, 25
281        por     xmm4, xmm8
282        movdqa  xmm8, xmm5
283        psrld   xmm8, 7
284        pslld   xmm5, 25
285        por     xmm5, xmm8
286        movdqa  xmm8, xmm6
287        psrld   xmm8, 7
288        pslld   xmm6, 25
289        por     xmm6, xmm8
290        movdqa  xmm8, xmm7
291        psrld   xmm8, 7
292        pslld   xmm7, 25
293        por     xmm7, xmm8
294        paddd   xmm0, xmmword ptr [rsp+0x80]
295        paddd   xmm1, xmmword ptr [rsp+0xA0]
296        paddd   xmm2, xmmword ptr [rsp+0xC0]
297        paddd   xmm3, xmmword ptr [rsp+0xE0]
298        paddd   xmm0, xmm5
299        paddd   xmm1, xmm6
300        paddd   xmm2, xmm7
301        paddd   xmm3, xmm4
302        pxor    xmm15, xmm0
303        pxor    xmm12, xmm1
304        pxor    xmm13, xmm2
305        pxor    xmm14, xmm3
306        movdqa  xmm8, xmmword ptr [ROT16+rip]
307        pshufb  xmm15, xmm8
308        pshufb  xmm12, xmm8
309        pshufb  xmm13, xmm8
310        pshufb  xmm14, xmm8
311        paddd   xmm10, xmm15
312        paddd   xmm11, xmm12
313        movdqa  xmm8, xmmword ptr [rsp+0x100]
314        paddd   xmm8, xmm13
315        paddd   xmm9, xmm14
316        pxor    xmm5, xmm10
317        pxor    xmm6, xmm11
318        pxor    xmm7, xmm8
319        pxor    xmm4, xmm9
320        movdqa  xmmword ptr [rsp+0x100], xmm8
321        movdqa  xmm8, xmm5
322        psrld   xmm8, 12
323        pslld   xmm5, 20
324        por     xmm5, xmm8
325        movdqa  xmm8, xmm6
326        psrld   xmm8, 12
327        pslld   xmm6, 20
328        por     xmm6, xmm8
329        movdqa  xmm8, xmm7
330        psrld   xmm8, 12
331        pslld   xmm7, 20
332        por     xmm7, xmm8
333        movdqa  xmm8, xmm4
334        psrld   xmm8, 12
335        pslld   xmm4, 20
336        por     xmm4, xmm8
337        paddd   xmm0, xmmword ptr [rsp+0x90]
338        paddd   xmm1, xmmword ptr [rsp+0xB0]
339        paddd   xmm2, xmmword ptr [rsp+0xD0]
340        paddd   xmm3, xmmword ptr [rsp+0xF0]
341        paddd   xmm0, xmm5
342        paddd   xmm1, xmm6
343        paddd   xmm2, xmm7
344        paddd   xmm3, xmm4
345        pxor    xmm15, xmm0
346        pxor    xmm12, xmm1
347        pxor    xmm13, xmm2
348        pxor    xmm14, xmm3
349        movdqa  xmm8, xmmword ptr [ROT8+rip]
350        pshufb  xmm15, xmm8
351        pshufb  xmm12, xmm8
352        pshufb  xmm13, xmm8
353        pshufb  xmm14, xmm8
354        paddd   xmm10, xmm15
355        paddd   xmm11, xmm12
356        movdqa  xmm8, xmmword ptr [rsp+0x100]
357        paddd   xmm8, xmm13
358        paddd   xmm9, xmm14
359        pxor    xmm5, xmm10
360        pxor    xmm6, xmm11
361        pxor    xmm7, xmm8
362        pxor    xmm4, xmm9
363        movdqa  xmmword ptr [rsp+0x100], xmm8
364        movdqa  xmm8, xmm5
365        psrld   xmm8, 7
366        pslld   xmm5, 25
367        por     xmm5, xmm8
368        movdqa  xmm8, xmm6
369        psrld   xmm8, 7
370        pslld   xmm6, 25
371        por     xmm6, xmm8
372        movdqa  xmm8, xmm7
373        psrld   xmm8, 7
374        pslld   xmm7, 25
375        por     xmm7, xmm8
376        movdqa  xmm8, xmm4
377        psrld   xmm8, 7
378        pslld   xmm4, 25
379        por     xmm4, xmm8
380        paddd   xmm0, xmmword ptr [rsp+0x20]
381        paddd   xmm1, xmmword ptr [rsp+0x30]
382        paddd   xmm2, xmmword ptr [rsp+0x70]
383        paddd   xmm3, xmmword ptr [rsp+0x40]
384        paddd   xmm0, xmm4
385        paddd   xmm1, xmm5
386        paddd   xmm2, xmm6
387        paddd   xmm3, xmm7
388        pxor    xmm12, xmm0
389        pxor    xmm13, xmm1
390        pxor    xmm14, xmm2
391        pxor    xmm15, xmm3
392        movdqa  xmm8, xmmword ptr [ROT16+rip]
393        pshufb  xmm12, xmm8
394        pshufb  xmm13, xmm8
395        pshufb  xmm14, xmm8
396        pshufb  xmm15, xmm8
397        movdqa  xmm8, xmmword ptr [rsp+0x100]
398        paddd   xmm8, xmm12
399        paddd   xmm9, xmm13
400        paddd   xmm10, xmm14
401        paddd   xmm11, xmm15
402        pxor    xmm4, xmm8
403        pxor    xmm5, xmm9
404        pxor    xmm6, xmm10
405        pxor    xmm7, xmm11
406        movdqa  xmmword ptr [rsp+0x100], xmm8
407        movdqa  xmm8, xmm4
408        psrld   xmm8, 12
409        pslld   xmm4, 20
410        por     xmm4, xmm8
411        movdqa  xmm8, xmm5
412        psrld   xmm8, 12
413        pslld   xmm5, 20
414        por     xmm5, xmm8
415        movdqa  xmm8, xmm6
416        psrld   xmm8, 12
417        pslld   xmm6, 20
418        por     xmm6, xmm8
419        movdqa  xmm8, xmm7
420        psrld   xmm8, 12
421        pslld   xmm7, 20
422        por     xmm7, xmm8
423        paddd   xmm0, xmmword ptr [rsp+0x60]
424        paddd   xmm1, xmmword ptr [rsp+0xA0]
425        paddd   xmm2, xmmword ptr [rsp]
426        paddd   xmm3, xmmword ptr [rsp+0xD0]
427        paddd   xmm0, xmm4
428        paddd   xmm1, xmm5
429        paddd   xmm2, xmm6
430        paddd   xmm3, xmm7
431        pxor    xmm12, xmm0
432        pxor    xmm13, xmm1
433        pxor    xmm14, xmm2
434        pxor    xmm15, xmm3
435        movdqa  xmm8, xmmword ptr [ROT8+rip]
436        pshufb  xmm12, xmm8
437        pshufb  xmm13, xmm8
438        pshufb  xmm14, xmm8
439        pshufb  xmm15, xmm8
440        movdqa  xmm8, xmmword ptr [rsp+0x100]
441        paddd   xmm8, xmm12
442        paddd   xmm9, xmm13
443        paddd   xmm10, xmm14
444        paddd   xmm11, xmm15
445        pxor    xmm4, xmm8
446        pxor    xmm5, xmm9
447        pxor    xmm6, xmm10
448        pxor    xmm7, xmm11
449        movdqa  xmmword ptr [rsp+0x100], xmm8
450        movdqa  xmm8, xmm4
451        psrld   xmm8, 7
452        pslld   xmm4, 25
453        por     xmm4, xmm8
454        movdqa  xmm8, xmm5
455        psrld   xmm8, 7
456        pslld   xmm5, 25
457        por     xmm5, xmm8
458        movdqa  xmm8, xmm6
459        psrld   xmm8, 7
460        pslld   xmm6, 25
461        por     xmm6, xmm8
462        movdqa  xmm8, xmm7
463        psrld   xmm8, 7
464        pslld   xmm7, 25
465        por     xmm7, xmm8
466        paddd   xmm0, xmmword ptr [rsp+0x10]
467        paddd   xmm1, xmmword ptr [rsp+0xC0]
468        paddd   xmm2, xmmword ptr [rsp+0x90]
469        paddd   xmm3, xmmword ptr [rsp+0xF0]
470        paddd   xmm0, xmm5
471        paddd   xmm1, xmm6
472        paddd   xmm2, xmm7
473        paddd   xmm3, xmm4
474        pxor    xmm15, xmm0
475        pxor    xmm12, xmm1
476        pxor    xmm13, xmm2
477        pxor    xmm14, xmm3
478        movdqa  xmm8, xmmword ptr [ROT16+rip]
479        pshufb  xmm15, xmm8
480        pshufb  xmm12, xmm8
481        pshufb  xmm13, xmm8
482        pshufb  xmm14, xmm8
483        paddd   xmm10, xmm15
484        paddd   xmm11, xmm12
485        movdqa  xmm8, xmmword ptr [rsp+0x100]
486        paddd   xmm8, xmm13
487        paddd   xmm9, xmm14
488        pxor    xmm5, xmm10
489        pxor    xmm6, xmm11
490        pxor    xmm7, xmm8
491        pxor    xmm4, xmm9
492        movdqa  xmmword ptr [rsp+0x100], xmm8
493        movdqa  xmm8, xmm5
494        psrld   xmm8, 12
495        pslld   xmm5, 20
496        por     xmm5, xmm8
497        movdqa  xmm8, xmm6
498        psrld   xmm8, 12
499        pslld   xmm6, 20
500        por     xmm6, xmm8
501        movdqa  xmm8, xmm7
502        psrld   xmm8, 12
503        pslld   xmm7, 20
504        por     xmm7, xmm8
505        movdqa  xmm8, xmm4
506        psrld   xmm8, 12
507        pslld   xmm4, 20
508        por     xmm4, xmm8
509        paddd   xmm0, xmmword ptr [rsp+0xB0]
510        paddd   xmm1, xmmword ptr [rsp+0x50]
511        paddd   xmm2, xmmword ptr [rsp+0xE0]
512        paddd   xmm3, xmmword ptr [rsp+0x80]
513        paddd   xmm0, xmm5
514        paddd   xmm1, xmm6
515        paddd   xmm2, xmm7
516        paddd   xmm3, xmm4
517        pxor    xmm15, xmm0
518        pxor    xmm12, xmm1
519        pxor    xmm13, xmm2
520        pxor    xmm14, xmm3
521        movdqa  xmm8, xmmword ptr [ROT8+rip]
522        pshufb  xmm15, xmm8
523        pshufb  xmm12, xmm8
524        pshufb  xmm13, xmm8
525        pshufb  xmm14, xmm8
526        paddd   xmm10, xmm15
527        paddd   xmm11, xmm12
528        movdqa  xmm8, xmmword ptr [rsp+0x100]
529        paddd   xmm8, xmm13
530        paddd   xmm9, xmm14
531        pxor    xmm5, xmm10
532        pxor    xmm6, xmm11
533        pxor    xmm7, xmm8
534        pxor    xmm4, xmm9
535        movdqa  xmmword ptr [rsp+0x100], xmm8
536        movdqa  xmm8, xmm5
537        psrld   xmm8, 7
538        pslld   xmm5, 25
539        por     xmm5, xmm8
540        movdqa  xmm8, xmm6
541        psrld   xmm8, 7
542        pslld   xmm6, 25
543        por     xmm6, xmm8
544        movdqa  xmm8, xmm7
545        psrld   xmm8, 7
546        pslld   xmm7, 25
547        por     xmm7, xmm8
548        movdqa  xmm8, xmm4
549        psrld   xmm8, 7
550        pslld   xmm4, 25
551        por     xmm4, xmm8
552        paddd   xmm0, xmmword ptr [rsp+0x30]
553        paddd   xmm1, xmmword ptr [rsp+0xA0]
554        paddd   xmm2, xmmword ptr [rsp+0xD0]
555        paddd   xmm3, xmmword ptr [rsp+0x70]
556        paddd   xmm0, xmm4
557        paddd   xmm1, xmm5
558        paddd   xmm2, xmm6
559        paddd   xmm3, xmm7
560        pxor    xmm12, xmm0
561        pxor    xmm13, xmm1
562        pxor    xmm14, xmm2
563        pxor    xmm15, xmm3
564        movdqa  xmm8, xmmword ptr [ROT16+rip]
565        pshufb  xmm12, xmm8
566        pshufb  xmm13, xmm8
567        pshufb  xmm14, xmm8
568        pshufb  xmm15, xmm8
569        movdqa  xmm8, xmmword ptr [rsp+0x100]
570        paddd   xmm8, xmm12
571        paddd   xmm9, xmm13
572        paddd   xmm10, xmm14
573        paddd   xmm11, xmm15
574        pxor    xmm4, xmm8
575        pxor    xmm5, xmm9
576        pxor    xmm6, xmm10
577        pxor    xmm7, xmm11
578        movdqa  xmmword ptr [rsp+0x100], xmm8
579        movdqa  xmm8, xmm4
580        psrld   xmm8, 12
581        pslld   xmm4, 20
582        por     xmm4, xmm8
583        movdqa  xmm8, xmm5
584        psrld   xmm8, 12
585        pslld   xmm5, 20
586        por     xmm5, xmm8
587        movdqa  xmm8, xmm6
588        psrld   xmm8, 12
589        pslld   xmm6, 20
590        por     xmm6, xmm8
591        movdqa  xmm8, xmm7
592        psrld   xmm8, 12
593        pslld   xmm7, 20
594        por     xmm7, xmm8
595        paddd   xmm0, xmmword ptr [rsp+0x40]
596        paddd   xmm1, xmmword ptr [rsp+0xC0]
597        paddd   xmm2, xmmword ptr [rsp+0x20]
598        paddd   xmm3, xmmword ptr [rsp+0xE0]
599        paddd   xmm0, xmm4
600        paddd   xmm1, xmm5
601        paddd   xmm2, xmm6
602        paddd   xmm3, xmm7
603        pxor    xmm12, xmm0
604        pxor    xmm13, xmm1
605        pxor    xmm14, xmm2
606        pxor    xmm15, xmm3
607        movdqa  xmm8, xmmword ptr [ROT8+rip]
608        pshufb  xmm12, xmm8
609        pshufb  xmm13, xmm8
610        pshufb  xmm14, xmm8
611        pshufb  xmm15, xmm8
612        movdqa  xmm8, xmmword ptr [rsp+0x100]
613        paddd   xmm8, xmm12
614        paddd   xmm9, xmm13
615        paddd   xmm10, xmm14
616        paddd   xmm11, xmm15
617        pxor    xmm4, xmm8
618        pxor    xmm5, xmm9
619        pxor    xmm6, xmm10
620        pxor    xmm7, xmm11
621        movdqa  xmmword ptr [rsp+0x100], xmm8
622        movdqa  xmm8, xmm4
623        psrld   xmm8, 7
624        pslld   xmm4, 25
625        por     xmm4, xmm8
626        movdqa  xmm8, xmm5
627        psrld   xmm8, 7
628        pslld   xmm5, 25
629        por     xmm5, xmm8
630        movdqa  xmm8, xmm6
631        psrld   xmm8, 7
632        pslld   xmm6, 25
633        por     xmm6, xmm8
634        movdqa  xmm8, xmm7
635        psrld   xmm8, 7
636        pslld   xmm7, 25
637        por     xmm7, xmm8
638        paddd   xmm0, xmmword ptr [rsp+0x60]
639        paddd   xmm1, xmmword ptr [rsp+0x90]
640        paddd   xmm2, xmmword ptr [rsp+0xB0]
641        paddd   xmm3, xmmword ptr [rsp+0x80]
642        paddd   xmm0, xmm5
643        paddd   xmm1, xmm6
644        paddd   xmm2, xmm7
645        paddd   xmm3, xmm4
646        pxor    xmm15, xmm0
647        pxor    xmm12, xmm1
648        pxor    xmm13, xmm2
649        pxor    xmm14, xmm3
650        movdqa  xmm8, xmmword ptr [ROT16+rip]
651        pshufb  xmm15, xmm8
652        pshufb  xmm12, xmm8
653        pshufb  xmm13, xmm8
654        pshufb  xmm14, xmm8
655        paddd   xmm10, xmm15
656        paddd   xmm11, xmm12
657        movdqa  xmm8, xmmword ptr [rsp+0x100]
658        paddd   xmm8, xmm13
659        paddd   xmm9, xmm14
660        pxor    xmm5, xmm10
661        pxor    xmm6, xmm11
662        pxor    xmm7, xmm8
663        pxor    xmm4, xmm9
664        movdqa  xmmword ptr [rsp+0x100], xmm8
665        movdqa  xmm8, xmm5
666        psrld   xmm8, 12
667        pslld   xmm5, 20
668        por     xmm5, xmm8
669        movdqa  xmm8, xmm6
670        psrld   xmm8, 12
671        pslld   xmm6, 20
672        por     xmm6, xmm8
673        movdqa  xmm8, xmm7
674        psrld   xmm8, 12
675        pslld   xmm7, 20
676        por     xmm7, xmm8
677        movdqa  xmm8, xmm4
678        psrld   xmm8, 12
679        pslld   xmm4, 20
680        por     xmm4, xmm8
681        paddd   xmm0, xmmword ptr [rsp+0x50]
682        paddd   xmm1, xmmword ptr [rsp]
683        paddd   xmm2, xmmword ptr [rsp+0xF0]
684        paddd   xmm3, xmmword ptr [rsp+0x10]
685        paddd   xmm0, xmm5
686        paddd   xmm1, xmm6
687        paddd   xmm2, xmm7
688        paddd   xmm3, xmm4
689        pxor    xmm15, xmm0
690        pxor    xmm12, xmm1
691        pxor    xmm13, xmm2
692        pxor    xmm14, xmm3
693        movdqa  xmm8, xmmword ptr [ROT8+rip]
694        pshufb  xmm15, xmm8
695        pshufb  xmm12, xmm8
696        pshufb  xmm13, xmm8
697        pshufb  xmm14, xmm8
698        paddd   xmm10, xmm15
699        paddd   xmm11, xmm12
700        movdqa  xmm8, xmmword ptr [rsp+0x100]
701        paddd   xmm8, xmm13
702        paddd   xmm9, xmm14
703        pxor    xmm5, xmm10
704        pxor    xmm6, xmm11
705        pxor    xmm7, xmm8
706        pxor    xmm4, xmm9
707        movdqa  xmmword ptr [rsp+0x100], xmm8
708        movdqa  xmm8, xmm5
709        psrld   xmm8, 7
710        pslld   xmm5, 25
711        por     xmm5, xmm8
712        movdqa  xmm8, xmm6
713        psrld   xmm8, 7
714        pslld   xmm6, 25
715        por     xmm6, xmm8
716        movdqa  xmm8, xmm7
717        psrld   xmm8, 7
718        pslld   xmm7, 25
719        por     xmm7, xmm8
720        movdqa  xmm8, xmm4
721        psrld   xmm8, 7
722        pslld   xmm4, 25
723        por     xmm4, xmm8
724        paddd   xmm0, xmmword ptr [rsp+0xA0]
725        paddd   xmm1, xmmword ptr [rsp+0xC0]
726        paddd   xmm2, xmmword ptr [rsp+0xE0]
727        paddd   xmm3, xmmword ptr [rsp+0xD0]
728        paddd   xmm0, xmm4
729        paddd   xmm1, xmm5
730        paddd   xmm2, xmm6
731        paddd   xmm3, xmm7
732        pxor    xmm12, xmm0
733        pxor    xmm13, xmm1
734        pxor    xmm14, xmm2
735        pxor    xmm15, xmm3
736        movdqa  xmm8, xmmword ptr [ROT16+rip]
737        pshufb  xmm12, xmm8
738        pshufb  xmm13, xmm8
739        pshufb  xmm14, xmm8
740        pshufb  xmm15, xmm8
741        movdqa  xmm8, xmmword ptr [rsp+0x100]
742        paddd   xmm8, xmm12
743        paddd   xmm9, xmm13
744        paddd   xmm10, xmm14
745        paddd   xmm11, xmm15
746        pxor    xmm4, xmm8
747        pxor    xmm5, xmm9
748        pxor    xmm6, xmm10
749        pxor    xmm7, xmm11
750        movdqa  xmmword ptr [rsp+0x100], xmm8
751        movdqa  xmm8, xmm4
752        psrld   xmm8, 12
753        pslld   xmm4, 20
754        por     xmm4, xmm8
755        movdqa  xmm8, xmm5
756        psrld   xmm8, 12
757        pslld   xmm5, 20
758        por     xmm5, xmm8
759        movdqa  xmm8, xmm6
760        psrld   xmm8, 12
761        pslld   xmm6, 20
762        por     xmm6, xmm8
763        movdqa  xmm8, xmm7
764        psrld   xmm8, 12
765        pslld   xmm7, 20
766        por     xmm7, xmm8
767        paddd   xmm0, xmmword ptr [rsp+0x70]
768        paddd   xmm1, xmmword ptr [rsp+0x90]
769        paddd   xmm2, xmmword ptr [rsp+0x30]
770        paddd   xmm3, xmmword ptr [rsp+0xF0]
771        paddd   xmm0, xmm4
772        paddd   xmm1, xmm5
773        paddd   xmm2, xmm6
774        paddd   xmm3, xmm7
775        pxor    xmm12, xmm0
776        pxor    xmm13, xmm1
777        pxor    xmm14, xmm2
778        pxor    xmm15, xmm3
779        movdqa  xmm8, xmmword ptr [ROT8+rip]
780        pshufb  xmm12, xmm8
781        pshufb  xmm13, xmm8
782        pshufb  xmm14, xmm8
783        pshufb  xmm15, xmm8
784        movdqa  xmm8, xmmword ptr [rsp+0x100]
785        paddd   xmm8, xmm12
786        paddd   xmm9, xmm13
787        paddd   xmm10, xmm14
788        paddd   xmm11, xmm15
789        pxor    xmm4, xmm8
790        pxor    xmm5, xmm9
791        pxor    xmm6, xmm10
792        pxor    xmm7, xmm11
793        movdqa  xmmword ptr [rsp+0x100], xmm8
794        movdqa  xmm8, xmm4
795        psrld   xmm8, 7
796        pslld   xmm4, 25
797        por     xmm4, xmm8
798        movdqa  xmm8, xmm5
799        psrld   xmm8, 7
800        pslld   xmm5, 25
801        por     xmm5, xmm8
802        movdqa  xmm8, xmm6
803        psrld   xmm8, 7
804        pslld   xmm6, 25
805        por     xmm6, xmm8
806        movdqa  xmm8, xmm7
807        psrld   xmm8, 7
808        pslld   xmm7, 25
809        por     xmm7, xmm8
810        paddd   xmm0, xmmword ptr [rsp+0x40]
811        paddd   xmm1, xmmword ptr [rsp+0xB0]
812        paddd   xmm2, xmmword ptr [rsp+0x50]
813        paddd   xmm3, xmmword ptr [rsp+0x10]
814        paddd   xmm0, xmm5
815        paddd   xmm1, xmm6
816        paddd   xmm2, xmm7
817        paddd   xmm3, xmm4
818        pxor    xmm15, xmm0
819        pxor    xmm12, xmm1
820        pxor    xmm13, xmm2
821        pxor    xmm14, xmm3
822        movdqa  xmm8, xmmword ptr [ROT16+rip]
823        pshufb  xmm15, xmm8
824        pshufb  xmm12, xmm8
825        pshufb  xmm13, xmm8
826        pshufb  xmm14, xmm8
827        paddd   xmm10, xmm15
828        paddd   xmm11, xmm12
829        movdqa  xmm8, xmmword ptr [rsp+0x100]
830        paddd   xmm8, xmm13
831        paddd   xmm9, xmm14
832        pxor    xmm5, xmm10
833        pxor    xmm6, xmm11
834        pxor    xmm7, xmm8
835        pxor    xmm4, xmm9
836        movdqa  xmmword ptr [rsp+0x100], xmm8
837        movdqa  xmm8, xmm5
838        psrld   xmm8, 12
839        pslld   xmm5, 20
840        por     xmm5, xmm8
841        movdqa  xmm8, xmm6
842        psrld   xmm8, 12
843        pslld   xmm6, 20
844        por     xmm6, xmm8
845        movdqa  xmm8, xmm7
846        psrld   xmm8, 12
847        pslld   xmm7, 20
848        por     xmm7, xmm8
849        movdqa  xmm8, xmm4
850        psrld   xmm8, 12
851        pslld   xmm4, 20
852        por     xmm4, xmm8
853        paddd   xmm0, xmmword ptr [rsp]
854        paddd   xmm1, xmmword ptr [rsp+0x20]
855        paddd   xmm2, xmmword ptr [rsp+0x80]
856        paddd   xmm3, xmmword ptr [rsp+0x60]
857        paddd   xmm0, xmm5
858        paddd   xmm1, xmm6
859        paddd   xmm2, xmm7
860        paddd   xmm3, xmm4
861        pxor    xmm15, xmm0
862        pxor    xmm12, xmm1
863        pxor    xmm13, xmm2
864        pxor    xmm14, xmm3
865        movdqa  xmm8, xmmword ptr [ROT8+rip]
866        pshufb  xmm15, xmm8
867        pshufb  xmm12, xmm8
868        pshufb  xmm13, xmm8
869        pshufb  xmm14, xmm8
870        paddd   xmm10, xmm15
871        paddd   xmm11, xmm12
872        movdqa  xmm8, xmmword ptr [rsp+0x100]
873        paddd   xmm8, xmm13
874        paddd   xmm9, xmm14
875        pxor    xmm5, xmm10
876        pxor    xmm6, xmm11
877        pxor    xmm7, xmm8
878        pxor    xmm4, xmm9
879        movdqa  xmmword ptr [rsp+0x100], xmm8
880        movdqa  xmm8, xmm5
881        psrld   xmm8, 7
882        pslld   xmm5, 25
883        por     xmm5, xmm8
884        movdqa  xmm8, xmm6
885        psrld   xmm8, 7
886        pslld   xmm6, 25
887        por     xmm6, xmm8
888        movdqa  xmm8, xmm7
889        psrld   xmm8, 7
890        pslld   xmm7, 25
891        por     xmm7, xmm8
892        movdqa  xmm8, xmm4
893        psrld   xmm8, 7
894        pslld   xmm4, 25
895        por     xmm4, xmm8
896        paddd   xmm0, xmmword ptr [rsp+0xC0]
897        paddd   xmm1, xmmword ptr [rsp+0x90]
898        paddd   xmm2, xmmword ptr [rsp+0xF0]
899        paddd   xmm3, xmmword ptr [rsp+0xE0]
900        paddd   xmm0, xmm4
901        paddd   xmm1, xmm5
902        paddd   xmm2, xmm6
903        paddd   xmm3, xmm7
904        pxor    xmm12, xmm0
905        pxor    xmm13, xmm1
906        pxor    xmm14, xmm2
907        pxor    xmm15, xmm3
908        movdqa  xmm8, xmmword ptr [ROT16+rip]
909        pshufb  xmm12, xmm8
910        pshufb  xmm13, xmm8
911        pshufb  xmm14, xmm8
912        pshufb  xmm15, xmm8
913        movdqa  xmm8, xmmword ptr [rsp+0x100]
914        paddd   xmm8, xmm12
915        paddd   xmm9, xmm13
916        paddd   xmm10, xmm14
917        paddd   xmm11, xmm15
918        pxor    xmm4, xmm8
919        pxor    xmm5, xmm9
920        pxor    xmm6, xmm10
921        pxor    xmm7, xmm11
922        movdqa  xmmword ptr [rsp+0x100], xmm8
923        movdqa  xmm8, xmm4
924        psrld   xmm8, 12
925        pslld   xmm4, 20
926        por     xmm4, xmm8
927        movdqa  xmm8, xmm5
928        psrld   xmm8, 12
929        pslld   xmm5, 20
930        por     xmm5, xmm8
931        movdqa  xmm8, xmm6
932        psrld   xmm8, 12
933        pslld   xmm6, 20
934        por     xmm6, xmm8
935        movdqa  xmm8, xmm7
936        psrld   xmm8, 12
937        pslld   xmm7, 20
938        por     xmm7, xmm8
939        paddd   xmm0, xmmword ptr [rsp+0xD0]
940        paddd   xmm1, xmmword ptr [rsp+0xB0]
941        paddd   xmm2, xmmword ptr [rsp+0xA0]
942        paddd   xmm3, xmmword ptr [rsp+0x80]
943        paddd   xmm0, xmm4
944        paddd   xmm1, xmm5
945        paddd   xmm2, xmm6
946        paddd   xmm3, xmm7
947        pxor    xmm12, xmm0
948        pxor    xmm13, xmm1
949        pxor    xmm14, xmm2
950        pxor    xmm15, xmm3
951        movdqa  xmm8, xmmword ptr [ROT8+rip]
952        pshufb  xmm12, xmm8
953        pshufb  xmm13, xmm8
954        pshufb  xmm14, xmm8
955        pshufb  xmm15, xmm8
956        movdqa  xmm8, xmmword ptr [rsp+0x100]
957        paddd   xmm8, xmm12
958        paddd   xmm9, xmm13
959        paddd   xmm10, xmm14
960        paddd   xmm11, xmm15
961        pxor    xmm4, xmm8
962        pxor    xmm5, xmm9
963        pxor    xmm6, xmm10
964        pxor    xmm7, xmm11
965        movdqa  xmmword ptr [rsp+0x100], xmm8
966        movdqa  xmm8, xmm4
967        psrld   xmm8, 7
968        pslld   xmm4, 25
969        por     xmm4, xmm8
970        movdqa  xmm8, xmm5
971        psrld   xmm8, 7
972        pslld   xmm5, 25
973        por     xmm5, xmm8
974        movdqa  xmm8, xmm6
975        psrld   xmm8, 7
976        pslld   xmm6, 25
977        por     xmm6, xmm8
978        movdqa  xmm8, xmm7
979        psrld   xmm8, 7
980        pslld   xmm7, 25
981        por     xmm7, xmm8
982        paddd   xmm0, xmmword ptr [rsp+0x70]
983        paddd   xmm1, xmmword ptr [rsp+0x50]
984        paddd   xmm2, xmmword ptr [rsp]
985        paddd   xmm3, xmmword ptr [rsp+0x60]
986        paddd   xmm0, xmm5
987        paddd   xmm1, xmm6
988        paddd   xmm2, xmm7
989        paddd   xmm3, xmm4
990        pxor    xmm15, xmm0
991        pxor    xmm12, xmm1
992        pxor    xmm13, xmm2
993        pxor    xmm14, xmm3
994        movdqa  xmm8, xmmword ptr [ROT16+rip]
995        pshufb  xmm15, xmm8
996        pshufb  xmm12, xmm8
997        pshufb  xmm13, xmm8
998        pshufb  xmm14, xmm8
999        paddd   xmm10, xmm15
1000        paddd   xmm11, xmm12
1001        movdqa  xmm8, xmmword ptr [rsp+0x100]
1002        paddd   xmm8, xmm13
1003        paddd   xmm9, xmm14
1004        pxor    xmm5, xmm10
1005        pxor    xmm6, xmm11
1006        pxor    xmm7, xmm8
1007        pxor    xmm4, xmm9
1008        movdqa  xmmword ptr [rsp+0x100], xmm8
1009        movdqa  xmm8, xmm5
1010        psrld   xmm8, 12
1011        pslld   xmm5, 20
1012        por     xmm5, xmm8
1013        movdqa  xmm8, xmm6
1014        psrld   xmm8, 12
1015        pslld   xmm6, 20
1016        por     xmm6, xmm8
1017        movdqa  xmm8, xmm7
1018        psrld   xmm8, 12
1019        pslld   xmm7, 20
1020        por     xmm7, xmm8
1021        movdqa  xmm8, xmm4
1022        psrld   xmm8, 12
1023        pslld   xmm4, 20
1024        por     xmm4, xmm8
1025        paddd   xmm0, xmmword ptr [rsp+0x20]
1026        paddd   xmm1, xmmword ptr [rsp+0x30]
1027        paddd   xmm2, xmmword ptr [rsp+0x10]
1028        paddd   xmm3, xmmword ptr [rsp+0x40]
1029        paddd   xmm0, xmm5
1030        paddd   xmm1, xmm6
1031        paddd   xmm2, xmm7
1032        paddd   xmm3, xmm4
1033        pxor    xmm15, xmm0
1034        pxor    xmm12, xmm1
1035        pxor    xmm13, xmm2
1036        pxor    xmm14, xmm3
1037        movdqa  xmm8, xmmword ptr [ROT8+rip]
1038        pshufb  xmm15, xmm8
1039        pshufb  xmm12, xmm8
1040        pshufb  xmm13, xmm8
1041        pshufb  xmm14, xmm8
1042        paddd   xmm10, xmm15
1043        paddd   xmm11, xmm12
1044        movdqa  xmm8, xmmword ptr [rsp+0x100]
1045        paddd   xmm8, xmm13
1046        paddd   xmm9, xmm14
1047        pxor    xmm5, xmm10
1048        pxor    xmm6, xmm11
1049        pxor    xmm7, xmm8
1050        pxor    xmm4, xmm9
1051        movdqa  xmmword ptr [rsp+0x100], xmm8
1052        movdqa  xmm8, xmm5
1053        psrld   xmm8, 7
1054        pslld   xmm5, 25
1055        por     xmm5, xmm8
1056        movdqa  xmm8, xmm6
1057        psrld   xmm8, 7
1058        pslld   xmm6, 25
1059        por     xmm6, xmm8
1060        movdqa  xmm8, xmm7
1061        psrld   xmm8, 7
1062        pslld   xmm7, 25
1063        por     xmm7, xmm8
1064        movdqa  xmm8, xmm4
1065        psrld   xmm8, 7
1066        pslld   xmm4, 25
1067        por     xmm4, xmm8
1068        paddd   xmm0, xmmword ptr [rsp+0x90]
1069        paddd   xmm1, xmmword ptr [rsp+0xB0]
1070        paddd   xmm2, xmmword ptr [rsp+0x80]
1071        paddd   xmm3, xmmword ptr [rsp+0xF0]
1072        paddd   xmm0, xmm4
1073        paddd   xmm1, xmm5
1074        paddd   xmm2, xmm6
1075        paddd   xmm3, xmm7
1076        pxor    xmm12, xmm0
1077        pxor    xmm13, xmm1
1078        pxor    xmm14, xmm2
1079        pxor    xmm15, xmm3
1080        movdqa  xmm8, xmmword ptr [ROT16+rip]
1081        pshufb  xmm12, xmm8
1082        pshufb  xmm13, xmm8
1083        pshufb  xmm14, xmm8
1084        pshufb  xmm15, xmm8
1085        movdqa  xmm8, xmmword ptr [rsp+0x100]
1086        paddd   xmm8, xmm12
1087        paddd   xmm9, xmm13
1088        paddd   xmm10, xmm14
1089        paddd   xmm11, xmm15
1090        pxor    xmm4, xmm8
1091        pxor    xmm5, xmm9
1092        pxor    xmm6, xmm10
1093        pxor    xmm7, xmm11
1094        movdqa  xmmword ptr [rsp+0x100], xmm8
1095        movdqa  xmm8, xmm4
1096        psrld   xmm8, 12
1097        pslld   xmm4, 20
1098        por     xmm4, xmm8
1099        movdqa  xmm8, xmm5
1100        psrld   xmm8, 12
1101        pslld   xmm5, 20
1102        por     xmm5, xmm8
1103        movdqa  xmm8, xmm6
1104        psrld   xmm8, 12
1105        pslld   xmm6, 20
1106        por     xmm6, xmm8
1107        movdqa  xmm8, xmm7
1108        psrld   xmm8, 12
1109        pslld   xmm7, 20
1110        por     xmm7, xmm8
1111        paddd   xmm0, xmmword ptr [rsp+0xE0]
1112        paddd   xmm1, xmmword ptr [rsp+0x50]
1113        paddd   xmm2, xmmword ptr [rsp+0xC0]
1114        paddd   xmm3, xmmword ptr [rsp+0x10]
1115        paddd   xmm0, xmm4
1116        paddd   xmm1, xmm5
1117        paddd   xmm2, xmm6
1118        paddd   xmm3, xmm7
1119        pxor    xmm12, xmm0
1120        pxor    xmm13, xmm1
1121        pxor    xmm14, xmm2
1122        pxor    xmm15, xmm3
1123        movdqa  xmm8, xmmword ptr [ROT8+rip]
1124        pshufb  xmm12, xmm8
1125        pshufb  xmm13, xmm8
1126        pshufb  xmm14, xmm8
1127        pshufb  xmm15, xmm8
1128        movdqa  xmm8, xmmword ptr [rsp+0x100]
1129        paddd   xmm8, xmm12
1130        paddd   xmm9, xmm13
1131        paddd   xmm10, xmm14
1132        paddd   xmm11, xmm15
1133        pxor    xmm4, xmm8
1134        pxor    xmm5, xmm9
1135        pxor    xmm6, xmm10
1136        pxor    xmm7, xmm11
1137        movdqa  xmmword ptr [rsp+0x100], xmm8
1138        movdqa  xmm8, xmm4
1139        psrld   xmm8, 7
1140        pslld   xmm4, 25
1141        por     xmm4, xmm8
1142        movdqa  xmm8, xmm5
1143        psrld   xmm8, 7
1144        pslld   xmm5, 25
1145        por     xmm5, xmm8
1146        movdqa  xmm8, xmm6
1147        psrld   xmm8, 7
1148        pslld   xmm6, 25
1149        por     xmm6, xmm8
1150        movdqa  xmm8, xmm7
1151        psrld   xmm8, 7
1152        pslld   xmm7, 25
1153        por     xmm7, xmm8
1154        paddd   xmm0, xmmword ptr [rsp+0xD0]
1155        paddd   xmm1, xmmword ptr [rsp]
1156        paddd   xmm2, xmmword ptr [rsp+0x20]
1157        paddd   xmm3, xmmword ptr [rsp+0x40]
1158        paddd   xmm0, xmm5
1159        paddd   xmm1, xmm6
1160        paddd   xmm2, xmm7
1161        paddd   xmm3, xmm4
1162        pxor    xmm15, xmm0
1163        pxor    xmm12, xmm1
1164        pxor    xmm13, xmm2
1165        pxor    xmm14, xmm3
1166        movdqa  xmm8, xmmword ptr [ROT16+rip]
1167        pshufb  xmm15, xmm8
1168        pshufb  xmm12, xmm8
1169        pshufb  xmm13, xmm8
1170        pshufb  xmm14, xmm8
1171        paddd   xmm10, xmm15
1172        paddd   xmm11, xmm12
1173        movdqa  xmm8, xmmword ptr [rsp+0x100]
1174        paddd   xmm8, xmm13
1175        paddd   xmm9, xmm14
1176        pxor    xmm5, xmm10
1177        pxor    xmm6, xmm11
1178        pxor    xmm7, xmm8
1179        pxor    xmm4, xmm9
1180        movdqa  xmmword ptr [rsp+0x100], xmm8
1181        movdqa  xmm8, xmm5
1182        psrld   xmm8, 12
1183        pslld   xmm5, 20
1184        por     xmm5, xmm8
1185        movdqa  xmm8, xmm6
1186        psrld   xmm8, 12
1187        pslld   xmm6, 20
1188        por     xmm6, xmm8
1189        movdqa  xmm8, xmm7
1190        psrld   xmm8, 12
1191        pslld   xmm7, 20
1192        por     xmm7, xmm8
1193        movdqa  xmm8, xmm4
1194        psrld   xmm8, 12
1195        pslld   xmm4, 20
1196        por     xmm4, xmm8
1197        paddd   xmm0, xmmword ptr [rsp+0x30]
1198        paddd   xmm1, xmmword ptr [rsp+0xA0]
1199        paddd   xmm2, xmmword ptr [rsp+0x60]
1200        paddd   xmm3, xmmword ptr [rsp+0x70]
1201        paddd   xmm0, xmm5
1202        paddd   xmm1, xmm6
1203        paddd   xmm2, xmm7
1204        paddd   xmm3, xmm4
1205        pxor    xmm15, xmm0
1206        pxor    xmm12, xmm1
1207        pxor    xmm13, xmm2
1208        pxor    xmm14, xmm3
1209        movdqa  xmm8, xmmword ptr [ROT8+rip]
1210        pshufb  xmm15, xmm8
1211        pshufb  xmm12, xmm8
1212        pshufb  xmm13, xmm8
1213        pshufb  xmm14, xmm8
1214        paddd   xmm10, xmm15
1215        paddd   xmm11, xmm12
1216        movdqa  xmm8, xmmword ptr [rsp+0x100]
1217        paddd   xmm8, xmm13
1218        paddd   xmm9, xmm14
1219        pxor    xmm5, xmm10
1220        pxor    xmm6, xmm11
1221        pxor    xmm7, xmm8
1222        pxor    xmm4, xmm9
1223        movdqa  xmmword ptr [rsp+0x100], xmm8
1224        movdqa  xmm8, xmm5
1225        psrld   xmm8, 7
1226        pslld   xmm5, 25
1227        por     xmm5, xmm8
1228        movdqa  xmm8, xmm6
1229        psrld   xmm8, 7
1230        pslld   xmm6, 25
1231        por     xmm6, xmm8
1232        movdqa  xmm8, xmm7
1233        psrld   xmm8, 7
1234        pslld   xmm7, 25
1235        por     xmm7, xmm8
1236        movdqa  xmm8, xmm4
1237        psrld   xmm8, 7
1238        pslld   xmm4, 25
1239        por     xmm4, xmm8
1240        paddd   xmm0, xmmword ptr [rsp+0xB0]
1241        paddd   xmm1, xmmword ptr [rsp+0x50]
1242        paddd   xmm2, xmmword ptr [rsp+0x10]
1243        paddd   xmm3, xmmword ptr [rsp+0x80]
1244        paddd   xmm0, xmm4
1245        paddd   xmm1, xmm5
1246        paddd   xmm2, xmm6
1247        paddd   xmm3, xmm7
1248        pxor    xmm12, xmm0
1249        pxor    xmm13, xmm1
1250        pxor    xmm14, xmm2
1251        pxor    xmm15, xmm3
1252        movdqa  xmm8, xmmword ptr [ROT16+rip]
1253        pshufb  xmm12, xmm8
1254        pshufb  xmm13, xmm8
1255        pshufb  xmm14, xmm8
1256        pshufb  xmm15, xmm8
1257        movdqa  xmm8, xmmword ptr [rsp+0x100]
1258        paddd   xmm8, xmm12
1259        paddd   xmm9, xmm13
1260        paddd   xmm10, xmm14
1261        paddd   xmm11, xmm15
1262        pxor    xmm4, xmm8
1263        pxor    xmm5, xmm9
1264        pxor    xmm6, xmm10
1265        pxor    xmm7, xmm11
1266        movdqa  xmmword ptr [rsp+0x100], xmm8
1267        movdqa  xmm8, xmm4
1268        psrld   xmm8, 12
1269        pslld   xmm4, 20
1270        por     xmm4, xmm8
1271        movdqa  xmm8, xmm5
1272        psrld   xmm8, 12
1273        pslld   xmm5, 20
1274        por     xmm5, xmm8
1275        movdqa  xmm8, xmm6
1276        psrld   xmm8, 12
1277        pslld   xmm6, 20
1278        por     xmm6, xmm8
1279        movdqa  xmm8, xmm7
1280        psrld   xmm8, 12
1281        pslld   xmm7, 20
1282        por     xmm7, xmm8
1283        paddd   xmm0, xmmword ptr [rsp+0xF0]
1284        paddd   xmm1, xmmword ptr [rsp]
1285        paddd   xmm2, xmmword ptr [rsp+0x90]
1286        paddd   xmm3, xmmword ptr [rsp+0x60]
1287        paddd   xmm0, xmm4
1288        paddd   xmm1, xmm5
1289        paddd   xmm2, xmm6
1290        paddd   xmm3, xmm7
1291        pxor    xmm12, xmm0
1292        pxor    xmm13, xmm1
1293        pxor    xmm14, xmm2
1294        pxor    xmm15, xmm3
1295        movdqa  xmm8, xmmword ptr [ROT8+rip]
1296        pshufb  xmm12, xmm8
1297        pshufb  xmm13, xmm8
1298        pshufb  xmm14, xmm8
1299        pshufb  xmm15, xmm8
1300        movdqa  xmm8, xmmword ptr [rsp+0x100]
1301        paddd   xmm8, xmm12
1302        paddd   xmm9, xmm13
1303        paddd   xmm10, xmm14
1304        paddd   xmm11, xmm15
1305        pxor    xmm4, xmm8
1306        pxor    xmm5, xmm9
1307        pxor    xmm6, xmm10
1308        pxor    xmm7, xmm11
1309        movdqa  xmmword ptr [rsp+0x100], xmm8
1310        movdqa  xmm8, xmm4
1311        psrld   xmm8, 7
1312        pslld   xmm4, 25
1313        por     xmm4, xmm8
1314        movdqa  xmm8, xmm5
1315        psrld   xmm8, 7
1316        pslld   xmm5, 25
1317        por     xmm5, xmm8
1318        movdqa  xmm8, xmm6
1319        psrld   xmm8, 7
1320        pslld   xmm6, 25
1321        por     xmm6, xmm8
1322        movdqa  xmm8, xmm7
1323        psrld   xmm8, 7
1324        pslld   xmm7, 25
1325        por     xmm7, xmm8
1326        paddd   xmm0, xmmword ptr [rsp+0xE0]
1327        paddd   xmm1, xmmword ptr [rsp+0x20]
1328        paddd   xmm2, xmmword ptr [rsp+0x30]
1329        paddd   xmm3, xmmword ptr [rsp+0x70]
1330        paddd   xmm0, xmm5
1331        paddd   xmm1, xmm6
1332        paddd   xmm2, xmm7
1333        paddd   xmm3, xmm4
1334        pxor    xmm15, xmm0
1335        pxor    xmm12, xmm1
1336        pxor    xmm13, xmm2
1337        pxor    xmm14, xmm3
1338        movdqa  xmm8, xmmword ptr [ROT16+rip]
1339        pshufb  xmm15, xmm8
1340        pshufb  xmm12, xmm8
1341        pshufb  xmm13, xmm8
1342        pshufb  xmm14, xmm8
1343        paddd   xmm10, xmm15
1344        paddd   xmm11, xmm12
1345        movdqa  xmm8, xmmword ptr [rsp+0x100]
1346        paddd   xmm8, xmm13
1347        paddd   xmm9, xmm14
1348        pxor    xmm5, xmm10
1349        pxor    xmm6, xmm11
1350        pxor    xmm7, xmm8
1351        pxor    xmm4, xmm9
1352        movdqa  xmmword ptr [rsp+0x100], xmm8
1353        movdqa  xmm8, xmm5
1354        psrld   xmm8, 12
1355        pslld   xmm5, 20
1356        por     xmm5, xmm8
1357        movdqa  xmm8, xmm6
1358        psrld   xmm8, 12
1359        pslld   xmm6, 20
1360        por     xmm6, xmm8
1361        movdqa  xmm8, xmm7
1362        psrld   xmm8, 12
1363        pslld   xmm7, 20
1364        por     xmm7, xmm8
1365        movdqa  xmm8, xmm4
1366        psrld   xmm8, 12
1367        pslld   xmm4, 20
1368        por     xmm4, xmm8
1369        paddd   xmm0, xmmword ptr [rsp+0xA0]
1370        paddd   xmm1, xmmword ptr [rsp+0xC0]
1371        paddd   xmm2, xmmword ptr [rsp+0x40]
1372        paddd   xmm3, xmmword ptr [rsp+0xD0]
1373        paddd   xmm0, xmm5
1374        paddd   xmm1, xmm6
1375        paddd   xmm2, xmm7
1376        paddd   xmm3, xmm4
1377        pxor    xmm15, xmm0
1378        pxor    xmm12, xmm1
1379        pxor    xmm13, xmm2
1380        pxor    xmm14, xmm3
1381        movdqa  xmm8, xmmword ptr [ROT8+rip]
1382        pshufb  xmm15, xmm8
1383        pshufb  xmm12, xmm8
1384        pshufb  xmm13, xmm8
1385        pshufb  xmm14, xmm8
1386        paddd   xmm10, xmm15
1387        paddd   xmm11, xmm12
1388        movdqa  xmm8, xmmword ptr [rsp+0x100]
1389        paddd   xmm8, xmm13
1390        paddd   xmm9, xmm14
1391        pxor    xmm5, xmm10
1392        pxor    xmm6, xmm11
1393        pxor    xmm7, xmm8
1394        pxor    xmm4, xmm9
1395        pxor    xmm0, xmm8
1396        pxor    xmm1, xmm9
1397        pxor    xmm2, xmm10
1398        pxor    xmm3, xmm11
1399        movdqa  xmm8, xmm5
1400        psrld   xmm8, 7
1401        pslld   xmm5, 25
1402        por     xmm5, xmm8
1403        movdqa  xmm8, xmm6
1404        psrld   xmm8, 7
1405        pslld   xmm6, 25
1406        por     xmm6, xmm8
1407        movdqa  xmm8, xmm7
1408        psrld   xmm8, 7
1409        pslld   xmm7, 25
1410        por     xmm7, xmm8
1411        movdqa  xmm8, xmm4
1412        psrld   xmm8, 7
1413        pslld   xmm4, 25
1414        por     xmm4, xmm8
1415        pxor    xmm4, xmm12
1416        pxor    xmm5, xmm13
1417        pxor    xmm6, xmm14
1418        pxor    xmm7, xmm15
1419        mov     eax, r13d
1420        jne     9b
1421        movdqa  xmm9, xmm0
1422        punpckldq xmm0, xmm1
1423        punpckhdq xmm9, xmm1
1424        movdqa  xmm11, xmm2
1425        punpckldq xmm2, xmm3
1426        punpckhdq xmm11, xmm3
1427        movdqa  xmm1, xmm0
1428        punpcklqdq xmm0, xmm2
1429        punpckhqdq xmm1, xmm2
1430        movdqa  xmm3, xmm9
1431        punpcklqdq xmm9, xmm11
1432        punpckhqdq xmm3, xmm11
1433        movdqu  xmmword ptr [rbx], xmm0
1434        movdqu  xmmword ptr [rbx+0x20], xmm1
1435        movdqu  xmmword ptr [rbx+0x40], xmm9
1436        movdqu  xmmword ptr [rbx+0x60], xmm3
1437        movdqa  xmm9, xmm4
1438        punpckldq xmm4, xmm5
1439        punpckhdq xmm9, xmm5
1440        movdqa  xmm11, xmm6
1441        punpckldq xmm6, xmm7
1442        punpckhdq xmm11, xmm7
1443        movdqa  xmm5, xmm4
1444        punpcklqdq xmm4, xmm6
1445        punpckhqdq xmm5, xmm6
1446        movdqa  xmm7, xmm9
1447        punpcklqdq xmm9, xmm11
1448        punpckhqdq xmm7, xmm11
1449        movdqu  xmmword ptr [rbx+0x10], xmm4
1450        movdqu  xmmword ptr [rbx+0x30], xmm5
1451        movdqu  xmmword ptr [rbx+0x50], xmm9
1452        movdqu  xmmword ptr [rbx+0x70], xmm7
1453        movdqa  xmm1, xmmword ptr [rsp+0x110]
1454        movdqa  xmm0, xmm1
1455        paddd   xmm1, xmmword ptr [rsp+0x150]
1456        movdqa  xmmword ptr [rsp+0x110], xmm1
1457        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1458        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1459        pcmpgtd xmm0, xmm1
1460        movdqa  xmm1, xmmword ptr [rsp+0x120]
1461        psubd   xmm1, xmm0
1462        movdqa  xmmword ptr [rsp+0x120], xmm1
1463        add     rbx, 128
1464        add     rdi, 32
1465        sub     rsi, 4
1466        cmp     rsi, 4
1467        jnc     2b
1468        test    rsi, rsi
1469        jnz     3f
14704:
1471        mov     rsp, rbp
1472        pop     rbp
1473        pop     rbx
1474        pop     r12
1475        pop     r13
1476        pop     r14
1477        pop     r15
1478        RET
1479.p2align 5
14803:
1481        test    esi, 0x2
1482        je      3f
1483        movups  xmm0, xmmword ptr [rcx]
1484        movups  xmm1, xmmword ptr [rcx+0x10]
1485        movaps  xmm8, xmm0
1486        movaps  xmm9, xmm1
1487        movd    xmm13, dword ptr [rsp+0x110]
1488        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1489        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1490        movaps  xmmword ptr [rsp], xmm13
1491        movd    xmm14, dword ptr [rsp+0x114]
1492        pinsrd  xmm14, dword ptr [rsp+0x124], 1
1493        pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1494        movaps  xmmword ptr [rsp+0x10], xmm14
1495        mov     r8, qword ptr [rdi]
1496        mov     r9, qword ptr [rdi+0x8]
1497        movzx   eax, byte ptr [rbp+0x40]
1498        or      eax, r13d
1499        xor     edx, edx
15002:
1501        mov     r14d, eax
1502        or      eax, r12d
1503        add     rdx, 64
1504        cmp     rdx, r15
1505        cmovne  eax, r14d
1506        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1507        movaps  xmm10, xmm2
1508        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1509        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1510        movaps  xmm3, xmm4
1511        shufps  xmm4, xmm5, 136
1512        shufps  xmm3, xmm5, 221
1513        movaps  xmm5, xmm3
1514        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1515        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1516        movaps  xmm3, xmm6
1517        shufps  xmm6, xmm7, 136
1518        pshufd  xmm6, xmm6, 0x93
1519        shufps  xmm3, xmm7, 221
1520        pshufd  xmm7, xmm3, 0x93
1521        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1522        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1523        movaps  xmm11, xmm12
1524        shufps  xmm12, xmm13, 136
1525        shufps  xmm11, xmm13, 221
1526        movaps  xmm13, xmm11
1527        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1528        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1529        movaps  xmm11, xmm14
1530        shufps  xmm14, xmm15, 136
1531        pshufd  xmm14, xmm14, 0x93
1532        shufps  xmm11, xmm15, 221
1533        pshufd  xmm15, xmm11, 0x93
1534        movaps  xmm3, xmmword ptr [rsp]
1535        movaps  xmm11, xmmword ptr [rsp+0x10]
1536        pinsrd  xmm3, eax, 3
1537        pinsrd  xmm11, eax, 3
1538        mov     al, 7
15399:
1540        paddd   xmm0, xmm4
1541        paddd   xmm8, xmm12
1542        movaps  xmmword ptr [rsp+0x20], xmm4
1543        movaps  xmmword ptr [rsp+0x30], xmm12
1544        paddd   xmm0, xmm1
1545        paddd   xmm8, xmm9
1546        pxor    xmm3, xmm0
1547        pxor    xmm11, xmm8
1548        movaps  xmm12, xmmword ptr [ROT16+rip]
1549        pshufb  xmm3, xmm12
1550        pshufb  xmm11, xmm12
1551        paddd   xmm2, xmm3
1552        paddd   xmm10, xmm11
1553        pxor    xmm1, xmm2
1554        pxor    xmm9, xmm10
1555        movdqa  xmm4, xmm1
1556        pslld   xmm1, 20
1557        psrld   xmm4, 12
1558        por     xmm1, xmm4
1559        movdqa  xmm4, xmm9
1560        pslld   xmm9, 20
1561        psrld   xmm4, 12
1562        por     xmm9, xmm4
1563        paddd   xmm0, xmm5
1564        paddd   xmm8, xmm13
1565        movaps  xmmword ptr [rsp+0x40], xmm5
1566        movaps  xmmword ptr [rsp+0x50], xmm13
1567        paddd   xmm0, xmm1
1568        paddd   xmm8, xmm9
1569        pxor    xmm3, xmm0
1570        pxor    xmm11, xmm8
1571        movaps  xmm13, xmmword ptr [ROT8+rip]
1572        pshufb  xmm3, xmm13
1573        pshufb  xmm11, xmm13
1574        paddd   xmm2, xmm3
1575        paddd   xmm10, xmm11
1576        pxor    xmm1, xmm2
1577        pxor    xmm9, xmm10
1578        movdqa  xmm4, xmm1
1579        pslld   xmm1, 25
1580        psrld   xmm4, 7
1581        por     xmm1, xmm4
1582        movdqa  xmm4, xmm9
1583        pslld   xmm9, 25
1584        psrld   xmm4, 7
1585        por     xmm9, xmm4
1586        pshufd  xmm0, xmm0, 0x93
1587        pshufd  xmm8, xmm8, 0x93
1588        pshufd  xmm3, xmm3, 0x4E
1589        pshufd  xmm11, xmm11, 0x4E
1590        pshufd  xmm2, xmm2, 0x39
1591        pshufd  xmm10, xmm10, 0x39
1592        paddd   xmm0, xmm6
1593        paddd   xmm8, xmm14
1594        paddd   xmm0, xmm1
1595        paddd   xmm8, xmm9
1596        pxor    xmm3, xmm0
1597        pxor    xmm11, xmm8
1598        pshufb  xmm3, xmm12
1599        pshufb  xmm11, xmm12
1600        paddd   xmm2, xmm3
1601        paddd   xmm10, xmm11
1602        pxor    xmm1, xmm2
1603        pxor    xmm9, xmm10
1604        movdqa  xmm4, xmm1
1605        pslld   xmm1, 20
1606        psrld   xmm4, 12
1607        por     xmm1, xmm4
1608        movdqa  xmm4, xmm9
1609        pslld   xmm9, 20
1610        psrld   xmm4, 12
1611        por     xmm9, xmm4
1612        paddd   xmm0, xmm7
1613        paddd   xmm8, xmm15
1614        paddd   xmm0, xmm1
1615        paddd   xmm8, xmm9
1616        pxor    xmm3, xmm0
1617        pxor    xmm11, xmm8
1618        pshufb  xmm3, xmm13
1619        pshufb  xmm11, xmm13
1620        paddd   xmm2, xmm3
1621        paddd   xmm10, xmm11
1622        pxor    xmm1, xmm2
1623        pxor    xmm9, xmm10
1624        movdqa  xmm4, xmm1
1625        pslld   xmm1, 25
1626        psrld   xmm4, 7
1627        por     xmm1, xmm4
1628        movdqa  xmm4, xmm9
1629        pslld   xmm9, 25
1630        psrld   xmm4, 7
1631        por     xmm9, xmm4
1632        pshufd  xmm0, xmm0, 0x39
1633        pshufd  xmm8, xmm8, 0x39
1634        pshufd  xmm3, xmm3, 0x4E
1635        pshufd  xmm11, xmm11, 0x4E
1636        pshufd  xmm2, xmm2, 0x93
1637        pshufd  xmm10, xmm10, 0x93
1638        dec     al
1639        je      9f
1640        movdqa  xmm12, xmmword ptr [rsp+0x20]
1641        movdqa  xmm5, xmmword ptr [rsp+0x40]
1642        pshufd  xmm13, xmm12, 0x0F
1643        shufps  xmm12, xmm5, 214
1644        pshufd  xmm4, xmm12, 0x39
1645        movdqa  xmm12, xmm6
1646        shufps  xmm12, xmm7, 250
1647        pblendw xmm13, xmm12, 0xCC
1648        movdqa  xmm12, xmm7
1649        punpcklqdq xmm12, xmm5
1650        pblendw xmm12, xmm6, 0xC0
1651        pshufd  xmm12, xmm12, 0x78
1652        punpckhdq xmm5, xmm7
1653        punpckldq xmm6, xmm5
1654        pshufd  xmm7, xmm6, 0x1E
1655        movdqa  xmmword ptr [rsp+0x20], xmm13
1656        movdqa  xmmword ptr [rsp+0x40], xmm12
1657        movdqa  xmm5, xmmword ptr [rsp+0x30]
1658        movdqa  xmm13, xmmword ptr [rsp+0x50]
1659        pshufd  xmm6, xmm5, 0x0F
1660        shufps  xmm5, xmm13, 214
1661        pshufd  xmm12, xmm5, 0x39
1662        movdqa  xmm5, xmm14
1663        shufps  xmm5, xmm15, 250
1664        pblendw xmm6, xmm5, 0xCC
1665        movdqa  xmm5, xmm15
1666        punpcklqdq xmm5, xmm13
1667        pblendw xmm5, xmm14, 0xC0
1668        pshufd  xmm5, xmm5, 0x78
1669        punpckhdq xmm13, xmm15
1670        punpckldq xmm14, xmm13
1671        pshufd  xmm15, xmm14, 0x1E
1672        movdqa  xmm13, xmm6
1673        movdqa  xmm14, xmm5
1674        movdqa  xmm5, xmmword ptr [rsp+0x20]
1675        movdqa  xmm6, xmmword ptr [rsp+0x40]
1676        jmp     9b
16779:
1678        pxor    xmm0, xmm2
1679        pxor    xmm1, xmm3
1680        pxor    xmm8, xmm10
1681        pxor    xmm9, xmm11
1682        mov     eax, r13d
1683        cmp     rdx, r15
1684        jne     2b
1685        movups  xmmword ptr [rbx], xmm0
1686        movups  xmmword ptr [rbx+0x10], xmm1
1687        movups  xmmword ptr [rbx+0x20], xmm8
1688        movups  xmmword ptr [rbx+0x30], xmm9
1689        movdqa  xmm0, xmmword ptr [rsp+0x130]
1690        movdqa  xmm1, xmmword ptr [rsp+0x110]
1691        movdqa  xmm2, xmmword ptr [rsp+0x120]
1692        movdqu  xmm3, xmmword ptr [rsp+0x118]
1693        movdqu  xmm4, xmmword ptr [rsp+0x128]
1694        blendvps xmm1, xmm3, xmm0
1695        blendvps xmm2, xmm4, xmm0
1696        movdqa  xmmword ptr [rsp+0x110], xmm1
1697        movdqa  xmmword ptr [rsp+0x120], xmm2
1698        add     rdi, 16
1699        add     rbx, 64
1700        sub     rsi, 2
17013:
1702        test    esi, 0x1
1703        je      4b
1704        movups  xmm0, xmmword ptr [rcx]
1705        movups  xmm1, xmmword ptr [rcx+0x10]
1706        movd    xmm13, dword ptr [rsp+0x110]
1707        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1708        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1709        movaps  xmm14, xmmword ptr [ROT8+rip]
1710        movaps  xmm15, xmmword ptr [ROT16+rip]
1711        mov     r8, qword ptr [rdi]
1712        movzx   eax, byte ptr [rbp+0x40]
1713        or      eax, r13d
1714        xor     edx, edx
17152:
1716        mov     r14d, eax
1717        or      eax, r12d
1718        add     rdx, 64
1719        cmp     rdx, r15
1720        cmovne  eax, r14d
1721        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1722        movaps  xmm3, xmm13
1723        pinsrd  xmm3, eax, 3
1724        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1725        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1726        movaps  xmm8, xmm4
1727        shufps  xmm4, xmm5, 136
1728        shufps  xmm8, xmm5, 221
1729        movaps  xmm5, xmm8
1730        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1731        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1732        movaps  xmm8, xmm6
1733        shufps  xmm6, xmm7, 136
1734        pshufd  xmm6, xmm6, 0x93
1735        shufps  xmm8, xmm7, 221
1736        pshufd  xmm7, xmm8, 0x93
1737        mov     al, 7
17389:
1739        paddd   xmm0, xmm4
1740        paddd   xmm0, xmm1
1741        pxor    xmm3, xmm0
1742        pshufb  xmm3, xmm15
1743        paddd   xmm2, xmm3
1744        pxor    xmm1, xmm2
1745        movdqa  xmm11, xmm1
1746        pslld   xmm1, 20
1747        psrld   xmm11, 12
1748        por     xmm1, xmm11
1749        paddd   xmm0, xmm5
1750        paddd   xmm0, xmm1
1751        pxor    xmm3, xmm0
1752        pshufb  xmm3, xmm14
1753        paddd   xmm2, xmm3
1754        pxor    xmm1, xmm2
1755        movdqa  xmm11, xmm1
1756        pslld   xmm1, 25
1757        psrld   xmm11, 7
1758        por     xmm1, xmm11
1759        pshufd  xmm0, xmm0, 0x93
1760        pshufd  xmm3, xmm3, 0x4E
1761        pshufd  xmm2, xmm2, 0x39
1762        paddd   xmm0, xmm6
1763        paddd   xmm0, xmm1
1764        pxor    xmm3, xmm0
1765        pshufb  xmm3, xmm15
1766        paddd   xmm2, xmm3
1767        pxor    xmm1, xmm2
1768        movdqa  xmm11, xmm1
1769        pslld   xmm1, 20
1770        psrld   xmm11, 12
1771        por     xmm1, xmm11
1772        paddd   xmm0, xmm7
1773        paddd   xmm0, xmm1
1774        pxor    xmm3, xmm0
1775        pshufb  xmm3, xmm14
1776        paddd   xmm2, xmm3
1777        pxor    xmm1, xmm2
1778        movdqa  xmm11, xmm1
1779        pslld   xmm1, 25
1780        psrld   xmm11, 7
1781        por     xmm1, xmm11
1782        pshufd  xmm0, xmm0, 0x39
1783        pshufd  xmm3, xmm3, 0x4E
1784        pshufd  xmm2, xmm2, 0x93
1785        dec     al
1786        jz      9f
1787        movdqa  xmm8, xmm4
1788        shufps  xmm8, xmm5, 214
1789        pshufd  xmm9, xmm4, 0x0F
1790        pshufd  xmm4, xmm8, 0x39
1791        movdqa  xmm8, xmm6
1792        shufps  xmm8, xmm7, 250
1793        pblendw xmm9, xmm8, 0xCC
1794        movdqa  xmm8, xmm7
1795        punpcklqdq xmm8, xmm5
1796        pblendw xmm8, xmm6, 0xC0
1797        pshufd  xmm8, xmm8, 0x78
1798        punpckhdq xmm5, xmm7
1799        punpckldq xmm6, xmm5
1800        pshufd  xmm7, xmm6, 0x1E
1801        movdqa  xmm5, xmm9
1802        movdqa  xmm6, xmm8
1803        jmp     9b
18049:
1805        pxor    xmm0, xmm2
1806        pxor    xmm1, xmm3
1807        mov     eax, r13d
1808        cmp     rdx, r15
1809        jne     2b
1810        movups  xmmword ptr [rbx], xmm0
1811        movups  xmmword ptr [rbx+0x10], xmm1
1812        jmp     4b
1813.p2align 6
/*
 * zfs_blake3_compress_in_place_sse41
 *
 * Runs seven rounds over one 64-byte message block and overwrites the
 * 32-byte state at [rdi] with the result.
 *
 * Register arguments as consumed below (System V AMD64 order):
 *   rdi  in/out  32 bytes of state; read and written with unaligned moves
 *   rsi  in      64-byte message block; read with unaligned moves
 *   dl   in      8-bit value placed in word 2 of state row 3
 *                (presumably the block length -- confirm against caller)
 *   rcx  in      64-bit value placed in words 0-1 of state row 3
 *                (presumably the chunk counter -- confirm against caller)
 *   r8b  in      8-bit value placed in word 3 of state row 3
 *                (presumably the domain flags -- confirm against caller)
 *
 * Only the low byte of rdx and of r8 is used.  Both are zero-extended
 * explicitly, the same way zfs_blake3_compress_xof_sse41 does, so stale
 * upper bits left in those registers by a caller cannot leak into the
 * state.
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 * Uses no stack.
 */
zfs_blake3_compress_in_place_sse41:
        _CET_ENDBR
        /* State rows 0 and 1 come from the caller, row 2 from the IV. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Row 3 = { rcx (64 bits), dl, r8b }; mask the byte arguments. */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /*
         * Load the 16 message words and split them into even/odd lanes:
         *   xmm4 = m0  m2  m4  m6        xmm5 = m1  m3  m5  m7
         *   xmm6 = m14 m8  m10 m12       xmm7 = m15 m9  m11 m13
         * (the second pair is additionally rotated by one lane, 0x93).
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        /* Byte-shuffle masks used for the two pshufb-based rotations. */
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        mov     al, 7                   /* round counter */
9:
        /* First half-round: mix with xmm4, then xmm5. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* row 1: rotate right by 12 */
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* row 1: rotate right by 7 */
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Rotate the lanes of rows 0, 3 and 2 for the second half-round. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Second half-round: mix with xmm6, then xmm7. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* row 1: rotate right by 12 */
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* row 1: rotate right by 7 */
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation applied before the second half-round. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                      /* seventh round done: finalize */
        /*
         * Re-shuffle the four message vectors (xmm4-xmm7) for the next
         * round; xmm8 and xmm9 are scratch.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold rows 2 and 3 into rows 0 and 1, then write the state back. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        RET
/*
 * zfs_blake3_compress_xof_sse41
 *
 * Runs one 7-round compression over a single 64-byte block and writes the
 * full 64-byte result (all four state rows) to a separate output buffer.
 *
 * Leaf function, no stack use. The argument registers match the
 * System V AMD64 integer argument order.
 *
 * In:   rdi = 32-byte input chaining value, read only
 *       rsi = 64-byte message block, read only
 *       dl  = block length (only the low byte of rdx is used)
 *       rcx = 64-bit counter
 *       r8b = flags (only the low byte of r8 is used)
 *       r9  = 64-byte output buffer
 *       All three buffers are accessed with unaligned moves.
 * Out:  [r9+0x00] = row0 ^ row2
 *       [r9+0x10] = row1 ^ row3
 *       [r9+0x20] = row2 ^ input chaining value, first 16 bytes
 *       [r9+0x30] = row3 ^ input chaining value, last 16 bytes
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 *
 * NOTE(review): the argument roles (chaining value, block length, counter,
 * flags) follow the upstream BLAKE3 compress_xof prototype; confirm them
 * against the C declaration used by the callers.
 */
.p2align 6
zfs_blake3_compress_xof_sse41:
        _CET_ENDBR

        /*
         * Build the 4x4 state, one row of four 32-bit words per register:
         *   xmm0 = chaining value words 0..3
         *   xmm1 = chaining value words 4..7
         *   xmm2 = BLAKE3_IV
         *   xmm3 = { counter low, counter high, block length, flags }
         */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax                /* rdx = block length | flags << 32 */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4

        /*
         * Load the 16 message words and split them into even and odd words:
         *   xmm4 = m0  m2  m4  m6       xmm5 = m1  m3  m5  m7
         *   xmm6 = m14 m8  m10 m12      xmm7 = m15 m9  m11 m13
         * The second pair is rotated by one lane (pshufd 0x93) to match
         * the rotated rows used in the second half of each round.
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        /* pshufb masks that rotate every 32-bit lane by 8 and by 16 bits. */
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        mov     al, 7                   /* al = rounds remaining */
9:
        /*
         * First half of the round: mix the four columns in parallel,
         * with a = xmm0, b = xmm1, c = xmm2, d = xmm3 and message words
         * taken from xmm4 and then xmm5.
         */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             /* d = rotate d by 16 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11             /* b = rotate b right by 12 */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             /* d = rotate d right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11             /* b = rotate b right by 7 */

        /*
         * Rotate rows a, d and c by one, two and three lanes so that the
         * diagonals of the state line up as columns.
         */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39

        /*
         * Second half of the round: the same mixing sequence applied to
         * the diagonals, with message words from xmm6 and then xmm7.
         */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             /* d = rotate d by 16 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11             /* b = rotate b right by 12 */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             /* d = rotate d right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11             /* b = rotate b right by 7 */

        /* Undo the lane rotation so the rows are back in column order. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93

        dec     al
        jz      9f                      /* all 7 rounds done */

        /*
         * Reorder the message words in xmm4-xmm7 for the next round.
         * xmm8 and xmm9 are temporaries; the instruction order matters
         * because xmm5 and xmm6 are overwritten while still being read.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /*
         * Finalise. The input chaining value is reloaded from [rdi]
         * because xmm4/xmm5 held message words during the rounds.
         */
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        RET
2019
/*
 * ELF symbol sizes. "." is the current location, so all three sizes are
 * measured from the symbol up to this point. That is exact only for
 * zfs_blake3_compress_xof_sse41, which ends immediately above.
 *
 * NOTE(review): the sizes recorded for zfs_blake3_hash_many_sse41 and
 * zfs_blake3_compress_in_place_sse41 also span every function that follows
 * them, so the three symbols overlap. Consider emitting each .size directly
 * after the final RET of its own function.
 */
.size zfs_blake3_hash_many_sse41, . - zfs_blake3_hash_many_sse41
.size zfs_blake3_compress_in_place_sse41, . - zfs_blake3_compress_in_place_sse41
.size zfs_blake3_compress_xof_sse41, . - zfs_blake3_compress_xof_sse41
2023
/*
 * Read-only constants.
 *
 * The section is aligned to 64 bytes and every table below is exactly
 * 16 bytes long, so each label is 16-byte aligned. The movaps loads of
 * BLAKE3_IV, ROT8 and ROT16 rely on that alignment, so keep each table at
 * 16 bytes when adding to or reordering this list.
 */
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
.p2align  6
/* Four 32-bit IV words, loaded as one state row by the compress routines. */
BLAKE3_IV:
        .long  0x6A09E667, 0xBB67AE85
        .long  0x3C6EF372, 0xA54FF53A
/* pshufb mask: rotates every 32-bit lane by 16 bits. */
ROT16:
        .byte  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* pshufb mask: rotates every 32-bit lane right by 8 bits. */
ROT8:
        .byte  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/*
 * The remaining tables are per-lane constants for a four-lane vector.
 * They are not referenced by the single-block compress code;
 * presumably zfs_blake3_hash_many_sse41 uses them - verify before changing.
 */
ADD0:
        .long  0, 1, 2, 3
ADD1:
        .long  4, 4, 4, 4
/* Each IV word of BLAKE3_IV broadcast to all four lanes. */
BLAKE3_IV_0:
        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane. */
BLAKE3_BLOCK_LEN:
        .long  64, 64, 64, 64
/* Only the most significant bit of every lane set. */
CMP_MSB_MASK:
        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
2053
#endif	/* HAVE_SSE4_1 */

/*
 * An empty .note.GNU-stack section marks this object as not needing an
 * executable stack. It is emitted for every ELF build, even when
 * HAVE_SSE4_1 is not defined and the rest of the file is compiled out.
 */
#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif
2059