/*
 * File preamble: target guard, platform-specific notes/macros, symbol
 * visibility and export declarations, and section/alignment setup for the
 * *_sse41 entry points whose labels follow this block.
 *
 * Everything below is only assembled when targeting x86-64.
 * NOTE(review): the matching #endif for this guard is not visible in this
 * excerpt -- presumably at the end of the file.
 */
1#if defined(__x86_64__)
2
/*
 * Project header; its contents are not shown here. Presumably it renames the
 * blake3_* symbols with an LLVM-specific prefix -- TODO confirm.
 */
3#include "llvm_blake3_prefix.h"
4
/*
 * On ELF Linux/FreeBSD, emit an empty .note.GNU-stack section (no flags) so
 * the object does not request an executable stack from the linker.
 */
5#if defined(__ELF__) && (defined(__linux__) || defined(__FreeBSD__))
6.section .note.GNU-stack,"",%progbits
7#endif
8
/*
 * When building for ELF with CET enabled and <cet.h> is available, include it
 * (it is expected to supply _CET_ENDBR -- see the fallback just below).
 * The defined(__has_include) test keeps older preprocessors from choking on
 * the __has_include() expression.
 */
9#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
10#if __has_include(<cet.h>)
11#include <cet.h>
12#endif
13#endif
14
/* Fallback: if nothing defined _CET_ENDBR, make it expand to nothing. */
15#if !defined(_CET_ENDBR)
16#define _CET_ENDBR
17#endif
18
/*
 * HIDDEN = the assembler directive that keeps a symbol out of the dynamic
 * export table: .private_extern on Apple toolchains, .hidden elsewhere.
 */
19#ifdef __APPLE__
20#define HIDDEN .private_extern
21#else
22#define HIDDEN .hidden
23#endif
24
/* Intel operand order (dst, src) with unprefixed register names from here on. */
25.intel_syntax noprefix
/*
 * Each entry point is declared twice, with and without a leading underscore,
 * and marked HIDDEN. NOTE(review): the underscore variants presumably match
 * platforms whose C symbol names carry a leading '_' (e.g. Mach-O) -- confirm.
 */
26HIDDEN blake3_hash_many_sse41
27HIDDEN _blake3_hash_many_sse41
28HIDDEN blake3_compress_in_place_sse41
29HIDDEN _blake3_compress_in_place_sse41
30HIDDEN blake3_compress_xof_sse41
31HIDDEN _blake3_compress_xof_sse41
/* Make the same six names visible to the linker (global binding). */
32.global blake3_hash_many_sse41
33.global _blake3_hash_many_sse41
34.global blake3_compress_in_place_sse41
35.global _blake3_compress_in_place_sse41
36.global blake3_compress_xof_sse41
37.global _blake3_compress_xof_sse41
/* Switch to the code section; Apple's assembler gets the bare .text form. */
38#ifdef __APPLE__
39.text
40#else
41.section .text
42#endif
/* Align the entry point that follows to a 2^6 = 64-byte boundary. */
43        .p2align  6
44_blake3_hash_many_sse41:
45blake3_hash_many_sse41:
46        _CET_ENDBR
47        push    r15
48        push    r14
49        push    r13
50        push    r12
51        push    rbx
52        push    rbp
53        mov     rbp, rsp
54        sub     rsp, 360
55        and     rsp, 0xFFFFFFFFFFFFFFC0
56        neg     r9d
57        movd    xmm0, r9d
58        pshufd  xmm0, xmm0, 0x00
59        movdqa  xmmword ptr [rsp+0x130], xmm0
60        movdqa  xmm1, xmm0
61        pand    xmm1, xmmword ptr [ADD0+rip]
62        pand    xmm0, xmmword ptr [ADD1+rip]
63        movdqa  xmmword ptr [rsp+0x150], xmm0
64        movd    xmm0, r8d
65        pshufd  xmm0, xmm0, 0x00
66        paddd   xmm0, xmm1
67        movdqa  xmmword ptr [rsp+0x110], xmm0
68        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
69        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
70        pcmpgtd xmm1, xmm0
71        shr     r8, 32
72        movd    xmm2, r8d
73        pshufd  xmm2, xmm2, 0x00
74        psubd   xmm2, xmm1
75        movdqa  xmmword ptr [rsp+0x120], xmm2
76        mov     rbx, qword ptr [rbp+0x50]
77        mov     r15, rdx
78        shl     r15, 6
79        movzx   r13d, byte ptr [rbp+0x38]
80        movzx   r12d, byte ptr [rbp+0x48]
81        cmp     rsi, 4
82        jc      3f
832:
84        movdqu  xmm3, xmmword ptr [rcx]
85        pshufd  xmm0, xmm3, 0x00
86        pshufd  xmm1, xmm3, 0x55
87        pshufd  xmm2, xmm3, 0xAA
88        pshufd  xmm3, xmm3, 0xFF
89        movdqu  xmm7, xmmword ptr [rcx+0x10]
90        pshufd  xmm4, xmm7, 0x00
91        pshufd  xmm5, xmm7, 0x55
92        pshufd  xmm6, xmm7, 0xAA
93        pshufd  xmm7, xmm7, 0xFF
94        mov     r8, qword ptr [rdi]
95        mov     r9, qword ptr [rdi+0x8]
96        mov     r10, qword ptr [rdi+0x10]
97        mov     r11, qword ptr [rdi+0x18]
98        movzx   eax, byte ptr [rbp+0x40]
99        or      eax, r13d
100        xor     edx, edx
1019:
102        mov     r14d, eax
103        or      eax, r12d
104        add     rdx, 64
105        cmp     rdx, r15
106        cmovne  eax, r14d
107        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
108        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
109        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
110        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
111        movdqa  xmm12, xmm8
112        punpckldq xmm8, xmm9
113        punpckhdq xmm12, xmm9
114        movdqa  xmm14, xmm10
115        punpckldq xmm10, xmm11
116        punpckhdq xmm14, xmm11
117        movdqa  xmm9, xmm8
118        punpcklqdq xmm8, xmm10
119        punpckhqdq xmm9, xmm10
120        movdqa  xmm13, xmm12
121        punpcklqdq xmm12, xmm14
122        punpckhqdq xmm13, xmm14
123        movdqa  xmmword ptr [rsp], xmm8
124        movdqa  xmmword ptr [rsp+0x10], xmm9
125        movdqa  xmmword ptr [rsp+0x20], xmm12
126        movdqa  xmmword ptr [rsp+0x30], xmm13
127        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
128        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
129        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
130        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
131        movdqa  xmm12, xmm8
132        punpckldq xmm8, xmm9
133        punpckhdq xmm12, xmm9
134        movdqa  xmm14, xmm10
135        punpckldq xmm10, xmm11
136        punpckhdq xmm14, xmm11
137        movdqa  xmm9, xmm8
138        punpcklqdq xmm8, xmm10
139        punpckhqdq xmm9, xmm10
140        movdqa  xmm13, xmm12
141        punpcklqdq xmm12, xmm14
142        punpckhqdq xmm13, xmm14
143        movdqa  xmmword ptr [rsp+0x40], xmm8
144        movdqa  xmmword ptr [rsp+0x50], xmm9
145        movdqa  xmmword ptr [rsp+0x60], xmm12
146        movdqa  xmmword ptr [rsp+0x70], xmm13
147        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
148        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
149        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
150        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
151        movdqa  xmm12, xmm8
152        punpckldq xmm8, xmm9
153        punpckhdq xmm12, xmm9
154        movdqa  xmm14, xmm10
155        punpckldq xmm10, xmm11
156        punpckhdq xmm14, xmm11
157        movdqa  xmm9, xmm8
158        punpcklqdq xmm8, xmm10
159        punpckhqdq xmm9, xmm10
160        movdqa  xmm13, xmm12
161        punpcklqdq xmm12, xmm14
162        punpckhqdq xmm13, xmm14
163        movdqa  xmmword ptr [rsp+0x80], xmm8
164        movdqa  xmmword ptr [rsp+0x90], xmm9
165        movdqa  xmmword ptr [rsp+0xA0], xmm12
166        movdqa  xmmword ptr [rsp+0xB0], xmm13
167        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
168        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
169        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
170        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
171        movdqa  xmm12, xmm8
172        punpckldq xmm8, xmm9
173        punpckhdq xmm12, xmm9
174        movdqa  xmm14, xmm10
175        punpckldq xmm10, xmm11
176        punpckhdq xmm14, xmm11
177        movdqa  xmm9, xmm8
178        punpcklqdq xmm8, xmm10
179        punpckhqdq xmm9, xmm10
180        movdqa  xmm13, xmm12
181        punpcklqdq xmm12, xmm14
182        punpckhqdq xmm13, xmm14
183        movdqa  xmmword ptr [rsp+0xC0], xmm8
184        movdqa  xmmword ptr [rsp+0xD0], xmm9
185        movdqa  xmmword ptr [rsp+0xE0], xmm12
186        movdqa  xmmword ptr [rsp+0xF0], xmm13
187        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
188        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
189        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
190        movdqa  xmm12, xmmword ptr [rsp+0x110]
191        movdqa  xmm13, xmmword ptr [rsp+0x120]
192        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
193        movd    xmm15, eax
194        pshufd  xmm15, xmm15, 0x00
195        prefetcht0 [r8+rdx+0x80]
196        prefetcht0 [r9+rdx+0x80]
197        prefetcht0 [r10+rdx+0x80]
198        prefetcht0 [r11+rdx+0x80]
199        paddd   xmm0, xmmword ptr [rsp]
200        paddd   xmm1, xmmword ptr [rsp+0x20]
201        paddd   xmm2, xmmword ptr [rsp+0x40]
202        paddd   xmm3, xmmword ptr [rsp+0x60]
203        paddd   xmm0, xmm4
204        paddd   xmm1, xmm5
205        paddd   xmm2, xmm6
206        paddd   xmm3, xmm7
207        pxor    xmm12, xmm0
208        pxor    xmm13, xmm1
209        pxor    xmm14, xmm2
210        pxor    xmm15, xmm3
211        movdqa  xmm8, xmmword ptr [ROT16+rip]
212        pshufb  xmm12, xmm8
213        pshufb  xmm13, xmm8
214        pshufb  xmm14, xmm8
215        pshufb  xmm15, xmm8
216        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
217        paddd   xmm8, xmm12
218        paddd   xmm9, xmm13
219        paddd   xmm10, xmm14
220        paddd   xmm11, xmm15
221        pxor    xmm4, xmm8
222        pxor    xmm5, xmm9
223        pxor    xmm6, xmm10
224        pxor    xmm7, xmm11
225        movdqa  xmmword ptr [rsp+0x100], xmm8
226        movdqa  xmm8, xmm4
227        psrld   xmm8, 12
228        pslld   xmm4, 20
229        por     xmm4, xmm8
230        movdqa  xmm8, xmm5
231        psrld   xmm8, 12
232        pslld   xmm5, 20
233        por     xmm5, xmm8
234        movdqa  xmm8, xmm6
235        psrld   xmm8, 12
236        pslld   xmm6, 20
237        por     xmm6, xmm8
238        movdqa  xmm8, xmm7
239        psrld   xmm8, 12
240        pslld   xmm7, 20
241        por     xmm7, xmm8
242        paddd   xmm0, xmmword ptr [rsp+0x10]
243        paddd   xmm1, xmmword ptr [rsp+0x30]
244        paddd   xmm2, xmmword ptr [rsp+0x50]
245        paddd   xmm3, xmmword ptr [rsp+0x70]
246        paddd   xmm0, xmm4
247        paddd   xmm1, xmm5
248        paddd   xmm2, xmm6
249        paddd   xmm3, xmm7
250        pxor    xmm12, xmm0
251        pxor    xmm13, xmm1
252        pxor    xmm14, xmm2
253        pxor    xmm15, xmm3
254        movdqa  xmm8, xmmword ptr [ROT8+rip]
255        pshufb  xmm12, xmm8
256        pshufb  xmm13, xmm8
257        pshufb  xmm14, xmm8
258        pshufb  xmm15, xmm8
259        movdqa  xmm8, xmmword ptr [rsp+0x100]
260        paddd   xmm8, xmm12
261        paddd   xmm9, xmm13
262        paddd   xmm10, xmm14
263        paddd   xmm11, xmm15
264        pxor    xmm4, xmm8
265        pxor    xmm5, xmm9
266        pxor    xmm6, xmm10
267        pxor    xmm7, xmm11
268        movdqa  xmmword ptr [rsp+0x100], xmm8
269        movdqa  xmm8, xmm4
270        psrld   xmm8, 7
271        pslld   xmm4, 25
272        por     xmm4, xmm8
273        movdqa  xmm8, xmm5
274        psrld   xmm8, 7
275        pslld   xmm5, 25
276        por     xmm5, xmm8
277        movdqa  xmm8, xmm6
278        psrld   xmm8, 7
279        pslld   xmm6, 25
280        por     xmm6, xmm8
281        movdqa  xmm8, xmm7
282        psrld   xmm8, 7
283        pslld   xmm7, 25
284        por     xmm7, xmm8
285        paddd   xmm0, xmmword ptr [rsp+0x80]
286        paddd   xmm1, xmmword ptr [rsp+0xA0]
287        paddd   xmm2, xmmword ptr [rsp+0xC0]
288        paddd   xmm3, xmmword ptr [rsp+0xE0]
289        paddd   xmm0, xmm5
290        paddd   xmm1, xmm6
291        paddd   xmm2, xmm7
292        paddd   xmm3, xmm4
293        pxor    xmm15, xmm0
294        pxor    xmm12, xmm1
295        pxor    xmm13, xmm2
296        pxor    xmm14, xmm3
297        movdqa  xmm8, xmmword ptr [ROT16+rip]
298        pshufb  xmm15, xmm8
299        pshufb  xmm12, xmm8
300        pshufb  xmm13, xmm8
301        pshufb  xmm14, xmm8
302        paddd   xmm10, xmm15
303        paddd   xmm11, xmm12
304        movdqa  xmm8, xmmword ptr [rsp+0x100]
305        paddd   xmm8, xmm13
306        paddd   xmm9, xmm14
307        pxor    xmm5, xmm10
308        pxor    xmm6, xmm11
309        pxor    xmm7, xmm8
310        pxor    xmm4, xmm9
311        movdqa  xmmword ptr [rsp+0x100], xmm8
312        movdqa  xmm8, xmm5
313        psrld   xmm8, 12
314        pslld   xmm5, 20
315        por     xmm5, xmm8
316        movdqa  xmm8, xmm6
317        psrld   xmm8, 12
318        pslld   xmm6, 20
319        por     xmm6, xmm8
320        movdqa  xmm8, xmm7
321        psrld   xmm8, 12
322        pslld   xmm7, 20
323        por     xmm7, xmm8
324        movdqa  xmm8, xmm4
325        psrld   xmm8, 12
326        pslld   xmm4, 20
327        por     xmm4, xmm8
328        paddd   xmm0, xmmword ptr [rsp+0x90]
329        paddd   xmm1, xmmword ptr [rsp+0xB0]
330        paddd   xmm2, xmmword ptr [rsp+0xD0]
331        paddd   xmm3, xmmword ptr [rsp+0xF0]
332        paddd   xmm0, xmm5
333        paddd   xmm1, xmm6
334        paddd   xmm2, xmm7
335        paddd   xmm3, xmm4
336        pxor    xmm15, xmm0
337        pxor    xmm12, xmm1
338        pxor    xmm13, xmm2
339        pxor    xmm14, xmm3
340        movdqa  xmm8, xmmword ptr [ROT8+rip]
341        pshufb  xmm15, xmm8
342        pshufb  xmm12, xmm8
343        pshufb  xmm13, xmm8
344        pshufb  xmm14, xmm8
345        paddd   xmm10, xmm15
346        paddd   xmm11, xmm12
347        movdqa  xmm8, xmmword ptr [rsp+0x100]
348        paddd   xmm8, xmm13
349        paddd   xmm9, xmm14
350        pxor    xmm5, xmm10
351        pxor    xmm6, xmm11
352        pxor    xmm7, xmm8
353        pxor    xmm4, xmm9
354        movdqa  xmmword ptr [rsp+0x100], xmm8
355        movdqa  xmm8, xmm5
356        psrld   xmm8, 7
357        pslld   xmm5, 25
358        por     xmm5, xmm8
359        movdqa  xmm8, xmm6
360        psrld   xmm8, 7
361        pslld   xmm6, 25
362        por     xmm6, xmm8
363        movdqa  xmm8, xmm7
364        psrld   xmm8, 7
365        pslld   xmm7, 25
366        por     xmm7, xmm8
367        movdqa  xmm8, xmm4
368        psrld   xmm8, 7
369        pslld   xmm4, 25
370        por     xmm4, xmm8
371        paddd   xmm0, xmmword ptr [rsp+0x20]
372        paddd   xmm1, xmmword ptr [rsp+0x30]
373        paddd   xmm2, xmmword ptr [rsp+0x70]
374        paddd   xmm3, xmmword ptr [rsp+0x40]
375        paddd   xmm0, xmm4
376        paddd   xmm1, xmm5
377        paddd   xmm2, xmm6
378        paddd   xmm3, xmm7
379        pxor    xmm12, xmm0
380        pxor    xmm13, xmm1
381        pxor    xmm14, xmm2
382        pxor    xmm15, xmm3
383        movdqa  xmm8, xmmword ptr [ROT16+rip]
384        pshufb  xmm12, xmm8
385        pshufb  xmm13, xmm8
386        pshufb  xmm14, xmm8
387        pshufb  xmm15, xmm8
388        movdqa  xmm8, xmmword ptr [rsp+0x100]
389        paddd   xmm8, xmm12
390        paddd   xmm9, xmm13
391        paddd   xmm10, xmm14
392        paddd   xmm11, xmm15
393        pxor    xmm4, xmm8
394        pxor    xmm5, xmm9
395        pxor    xmm6, xmm10
396        pxor    xmm7, xmm11
397        movdqa  xmmword ptr [rsp+0x100], xmm8
398        movdqa  xmm8, xmm4
399        psrld   xmm8, 12
400        pslld   xmm4, 20
401        por     xmm4, xmm8
402        movdqa  xmm8, xmm5
403        psrld   xmm8, 12
404        pslld   xmm5, 20
405        por     xmm5, xmm8
406        movdqa  xmm8, xmm6
407        psrld   xmm8, 12
408        pslld   xmm6, 20
409        por     xmm6, xmm8
410        movdqa  xmm8, xmm7
411        psrld   xmm8, 12
412        pslld   xmm7, 20
413        por     xmm7, xmm8
414        paddd   xmm0, xmmword ptr [rsp+0x60]
415        paddd   xmm1, xmmword ptr [rsp+0xA0]
416        paddd   xmm2, xmmword ptr [rsp]
417        paddd   xmm3, xmmword ptr [rsp+0xD0]
418        paddd   xmm0, xmm4
419        paddd   xmm1, xmm5
420        paddd   xmm2, xmm6
421        paddd   xmm3, xmm7
422        pxor    xmm12, xmm0
423        pxor    xmm13, xmm1
424        pxor    xmm14, xmm2
425        pxor    xmm15, xmm3
426        movdqa  xmm8, xmmword ptr [ROT8+rip]
427        pshufb  xmm12, xmm8
428        pshufb  xmm13, xmm8
429        pshufb  xmm14, xmm8
430        pshufb  xmm15, xmm8
431        movdqa  xmm8, xmmword ptr [rsp+0x100]
432        paddd   xmm8, xmm12
433        paddd   xmm9, xmm13
434        paddd   xmm10, xmm14
435        paddd   xmm11, xmm15
436        pxor    xmm4, xmm8
437        pxor    xmm5, xmm9
438        pxor    xmm6, xmm10
439        pxor    xmm7, xmm11
440        movdqa  xmmword ptr [rsp+0x100], xmm8
441        movdqa  xmm8, xmm4
442        psrld   xmm8, 7
443        pslld   xmm4, 25
444        por     xmm4, xmm8
445        movdqa  xmm8, xmm5
446        psrld   xmm8, 7
447        pslld   xmm5, 25
448        por     xmm5, xmm8
449        movdqa  xmm8, xmm6
450        psrld   xmm8, 7
451        pslld   xmm6, 25
452        por     xmm6, xmm8
453        movdqa  xmm8, xmm7
454        psrld   xmm8, 7
455        pslld   xmm7, 25
456        por     xmm7, xmm8
457        paddd   xmm0, xmmword ptr [rsp+0x10]
458        paddd   xmm1, xmmword ptr [rsp+0xC0]
459        paddd   xmm2, xmmword ptr [rsp+0x90]
460        paddd   xmm3, xmmword ptr [rsp+0xF0]
461        paddd   xmm0, xmm5
462        paddd   xmm1, xmm6
463        paddd   xmm2, xmm7
464        paddd   xmm3, xmm4
465        pxor    xmm15, xmm0
466        pxor    xmm12, xmm1
467        pxor    xmm13, xmm2
468        pxor    xmm14, xmm3
469        movdqa  xmm8, xmmword ptr [ROT16+rip]
470        pshufb  xmm15, xmm8
471        pshufb  xmm12, xmm8
472        pshufb  xmm13, xmm8
473        pshufb  xmm14, xmm8
474        paddd   xmm10, xmm15
475        paddd   xmm11, xmm12
476        movdqa  xmm8, xmmword ptr [rsp+0x100]
477        paddd   xmm8, xmm13
478        paddd   xmm9, xmm14
479        pxor    xmm5, xmm10
480        pxor    xmm6, xmm11
481        pxor    xmm7, xmm8
482        pxor    xmm4, xmm9
483        movdqa  xmmword ptr [rsp+0x100], xmm8
484        movdqa  xmm8, xmm5
485        psrld   xmm8, 12
486        pslld   xmm5, 20
487        por     xmm5, xmm8
488        movdqa  xmm8, xmm6
489        psrld   xmm8, 12
490        pslld   xmm6, 20
491        por     xmm6, xmm8
492        movdqa  xmm8, xmm7
493        psrld   xmm8, 12
494        pslld   xmm7, 20
495        por     xmm7, xmm8
496        movdqa  xmm8, xmm4
497        psrld   xmm8, 12
498        pslld   xmm4, 20
499        por     xmm4, xmm8
500        paddd   xmm0, xmmword ptr [rsp+0xB0]
501        paddd   xmm1, xmmword ptr [rsp+0x50]
502        paddd   xmm2, xmmword ptr [rsp+0xE0]
503        paddd   xmm3, xmmword ptr [rsp+0x80]
504        paddd   xmm0, xmm5
505        paddd   xmm1, xmm6
506        paddd   xmm2, xmm7
507        paddd   xmm3, xmm4
508        pxor    xmm15, xmm0
509        pxor    xmm12, xmm1
510        pxor    xmm13, xmm2
511        pxor    xmm14, xmm3
512        movdqa  xmm8, xmmword ptr [ROT8+rip]
513        pshufb  xmm15, xmm8
514        pshufb  xmm12, xmm8
515        pshufb  xmm13, xmm8
516        pshufb  xmm14, xmm8
517        paddd   xmm10, xmm15
518        paddd   xmm11, xmm12
519        movdqa  xmm8, xmmword ptr [rsp+0x100]
520        paddd   xmm8, xmm13
521        paddd   xmm9, xmm14
522        pxor    xmm5, xmm10
523        pxor    xmm6, xmm11
524        pxor    xmm7, xmm8
525        pxor    xmm4, xmm9
526        movdqa  xmmword ptr [rsp+0x100], xmm8
527        movdqa  xmm8, xmm5
528        psrld   xmm8, 7
529        pslld   xmm5, 25
530        por     xmm5, xmm8
531        movdqa  xmm8, xmm6
532        psrld   xmm8, 7
533        pslld   xmm6, 25
534        por     xmm6, xmm8
535        movdqa  xmm8, xmm7
536        psrld   xmm8, 7
537        pslld   xmm7, 25
538        por     xmm7, xmm8
539        movdqa  xmm8, xmm4
540        psrld   xmm8, 7
541        pslld   xmm4, 25
542        por     xmm4, xmm8
543        paddd   xmm0, xmmword ptr [rsp+0x30]
544        paddd   xmm1, xmmword ptr [rsp+0xA0]
545        paddd   xmm2, xmmword ptr [rsp+0xD0]
546        paddd   xmm3, xmmword ptr [rsp+0x70]
547        paddd   xmm0, xmm4
548        paddd   xmm1, xmm5
549        paddd   xmm2, xmm6
550        paddd   xmm3, xmm7
551        pxor    xmm12, xmm0
552        pxor    xmm13, xmm1
553        pxor    xmm14, xmm2
554        pxor    xmm15, xmm3
555        movdqa  xmm8, xmmword ptr [ROT16+rip]
556        pshufb  xmm12, xmm8
557        pshufb  xmm13, xmm8
558        pshufb  xmm14, xmm8
559        pshufb  xmm15, xmm8
560        movdqa  xmm8, xmmword ptr [rsp+0x100]
561        paddd   xmm8, xmm12
562        paddd   xmm9, xmm13
563        paddd   xmm10, xmm14
564        paddd   xmm11, xmm15
565        pxor    xmm4, xmm8
566        pxor    xmm5, xmm9
567        pxor    xmm6, xmm10
568        pxor    xmm7, xmm11
569        movdqa  xmmword ptr [rsp+0x100], xmm8
570        movdqa  xmm8, xmm4
571        psrld   xmm8, 12
572        pslld   xmm4, 20
573        por     xmm4, xmm8
574        movdqa  xmm8, xmm5
575        psrld   xmm8, 12
576        pslld   xmm5, 20
577        por     xmm5, xmm8
578        movdqa  xmm8, xmm6
579        psrld   xmm8, 12
580        pslld   xmm6, 20
581        por     xmm6, xmm8
582        movdqa  xmm8, xmm7
583        psrld   xmm8, 12
584        pslld   xmm7, 20
585        por     xmm7, xmm8
586        paddd   xmm0, xmmword ptr [rsp+0x40]
587        paddd   xmm1, xmmword ptr [rsp+0xC0]
588        paddd   xmm2, xmmword ptr [rsp+0x20]
589        paddd   xmm3, xmmword ptr [rsp+0xE0]
590        paddd   xmm0, xmm4
591        paddd   xmm1, xmm5
592        paddd   xmm2, xmm6
593        paddd   xmm3, xmm7
594        pxor    xmm12, xmm0
595        pxor    xmm13, xmm1
596        pxor    xmm14, xmm2
597        pxor    xmm15, xmm3
598        movdqa  xmm8, xmmword ptr [ROT8+rip]
599        pshufb  xmm12, xmm8
600        pshufb  xmm13, xmm8
601        pshufb  xmm14, xmm8
602        pshufb  xmm15, xmm8
603        movdqa  xmm8, xmmword ptr [rsp+0x100]
604        paddd   xmm8, xmm12
605        paddd   xmm9, xmm13
606        paddd   xmm10, xmm14
607        paddd   xmm11, xmm15
608        pxor    xmm4, xmm8
609        pxor    xmm5, xmm9
610        pxor    xmm6, xmm10
611        pxor    xmm7, xmm11
612        movdqa  xmmword ptr [rsp+0x100], xmm8
613        movdqa  xmm8, xmm4
614        psrld   xmm8, 7
615        pslld   xmm4, 25
616        por     xmm4, xmm8
617        movdqa  xmm8, xmm5
618        psrld   xmm8, 7
619        pslld   xmm5, 25
620        por     xmm5, xmm8
621        movdqa  xmm8, xmm6
622        psrld   xmm8, 7
623        pslld   xmm6, 25
624        por     xmm6, xmm8
625        movdqa  xmm8, xmm7
626        psrld   xmm8, 7
627        pslld   xmm7, 25
628        por     xmm7, xmm8
629        paddd   xmm0, xmmword ptr [rsp+0x60]
630        paddd   xmm1, xmmword ptr [rsp+0x90]
631        paddd   xmm2, xmmword ptr [rsp+0xB0]
632        paddd   xmm3, xmmword ptr [rsp+0x80]
633        paddd   xmm0, xmm5
634        paddd   xmm1, xmm6
635        paddd   xmm2, xmm7
636        paddd   xmm3, xmm4
637        pxor    xmm15, xmm0
638        pxor    xmm12, xmm1
639        pxor    xmm13, xmm2
640        pxor    xmm14, xmm3
641        movdqa  xmm8, xmmword ptr [ROT16+rip]
642        pshufb  xmm15, xmm8
643        pshufb  xmm12, xmm8
644        pshufb  xmm13, xmm8
645        pshufb  xmm14, xmm8
646        paddd   xmm10, xmm15
647        paddd   xmm11, xmm12
648        movdqa  xmm8, xmmword ptr [rsp+0x100]
649        paddd   xmm8, xmm13
650        paddd   xmm9, xmm14
651        pxor    xmm5, xmm10
652        pxor    xmm6, xmm11
653        pxor    xmm7, xmm8
654        pxor    xmm4, xmm9
655        movdqa  xmmword ptr [rsp+0x100], xmm8
656        movdqa  xmm8, xmm5
657        psrld   xmm8, 12
658        pslld   xmm5, 20
659        por     xmm5, xmm8
660        movdqa  xmm8, xmm6
661        psrld   xmm8, 12
662        pslld   xmm6, 20
663        por     xmm6, xmm8
664        movdqa  xmm8, xmm7
665        psrld   xmm8, 12
666        pslld   xmm7, 20
667        por     xmm7, xmm8
668        movdqa  xmm8, xmm4
669        psrld   xmm8, 12
670        pslld   xmm4, 20
671        por     xmm4, xmm8
672        paddd   xmm0, xmmword ptr [rsp+0x50]
673        paddd   xmm1, xmmword ptr [rsp]
674        paddd   xmm2, xmmword ptr [rsp+0xF0]
675        paddd   xmm3, xmmword ptr [rsp+0x10]
676        paddd   xmm0, xmm5
677        paddd   xmm1, xmm6
678        paddd   xmm2, xmm7
679        paddd   xmm3, xmm4
680        pxor    xmm15, xmm0
681        pxor    xmm12, xmm1
682        pxor    xmm13, xmm2
683        pxor    xmm14, xmm3
684        movdqa  xmm8, xmmword ptr [ROT8+rip]
685        pshufb  xmm15, xmm8
686        pshufb  xmm12, xmm8
687        pshufb  xmm13, xmm8
688        pshufb  xmm14, xmm8
689        paddd   xmm10, xmm15
690        paddd   xmm11, xmm12
691        movdqa  xmm8, xmmword ptr [rsp+0x100]
692        paddd   xmm8, xmm13
693        paddd   xmm9, xmm14
694        pxor    xmm5, xmm10
695        pxor    xmm6, xmm11
696        pxor    xmm7, xmm8
697        pxor    xmm4, xmm9
698        movdqa  xmmword ptr [rsp+0x100], xmm8
699        movdqa  xmm8, xmm5
700        psrld   xmm8, 7
701        pslld   xmm5, 25
702        por     xmm5, xmm8
703        movdqa  xmm8, xmm6
704        psrld   xmm8, 7
705        pslld   xmm6, 25
706        por     xmm6, xmm8
707        movdqa  xmm8, xmm7
708        psrld   xmm8, 7
709        pslld   xmm7, 25
710        por     xmm7, xmm8
711        movdqa  xmm8, xmm4
712        psrld   xmm8, 7
713        pslld   xmm4, 25
714        por     xmm4, xmm8
715        paddd   xmm0, xmmword ptr [rsp+0xA0]
716        paddd   xmm1, xmmword ptr [rsp+0xC0]
717        paddd   xmm2, xmmword ptr [rsp+0xE0]
718        paddd   xmm3, xmmword ptr [rsp+0xD0]
719        paddd   xmm0, xmm4
720        paddd   xmm1, xmm5
721        paddd   xmm2, xmm6
722        paddd   xmm3, xmm7
723        pxor    xmm12, xmm0
724        pxor    xmm13, xmm1
725        pxor    xmm14, xmm2
726        pxor    xmm15, xmm3
727        movdqa  xmm8, xmmword ptr [ROT16+rip]
728        pshufb  xmm12, xmm8
729        pshufb  xmm13, xmm8
730        pshufb  xmm14, xmm8
731        pshufb  xmm15, xmm8
732        movdqa  xmm8, xmmword ptr [rsp+0x100]
733        paddd   xmm8, xmm12
734        paddd   xmm9, xmm13
735        paddd   xmm10, xmm14
736        paddd   xmm11, xmm15
737        pxor    xmm4, xmm8
738        pxor    xmm5, xmm9
739        pxor    xmm6, xmm10
740        pxor    xmm7, xmm11
741        movdqa  xmmword ptr [rsp+0x100], xmm8
742        movdqa  xmm8, xmm4
743        psrld   xmm8, 12
744        pslld   xmm4, 20
745        por     xmm4, xmm8
746        movdqa  xmm8, xmm5
747        psrld   xmm8, 12
748        pslld   xmm5, 20
749        por     xmm5, xmm8
750        movdqa  xmm8, xmm6
751        psrld   xmm8, 12
752        pslld   xmm6, 20
753        por     xmm6, xmm8
754        movdqa  xmm8, xmm7
755        psrld   xmm8, 12
756        pslld   xmm7, 20
757        por     xmm7, xmm8
758        paddd   xmm0, xmmword ptr [rsp+0x70]
759        paddd   xmm1, xmmword ptr [rsp+0x90]
760        paddd   xmm2, xmmword ptr [rsp+0x30]
761        paddd   xmm3, xmmword ptr [rsp+0xF0]
762        paddd   xmm0, xmm4
763        paddd   xmm1, xmm5
764        paddd   xmm2, xmm6
765        paddd   xmm3, xmm7
766        pxor    xmm12, xmm0
767        pxor    xmm13, xmm1
768        pxor    xmm14, xmm2
769        pxor    xmm15, xmm3
770        movdqa  xmm8, xmmword ptr [ROT8+rip]
771        pshufb  xmm12, xmm8
772        pshufb  xmm13, xmm8
773        pshufb  xmm14, xmm8
774        pshufb  xmm15, xmm8
775        movdqa  xmm8, xmmword ptr [rsp+0x100]
776        paddd   xmm8, xmm12
777        paddd   xmm9, xmm13
778        paddd   xmm10, xmm14
779        paddd   xmm11, xmm15
780        pxor    xmm4, xmm8
781        pxor    xmm5, xmm9
782        pxor    xmm6, xmm10
783        pxor    xmm7, xmm11
784        movdqa  xmmword ptr [rsp+0x100], xmm8
785        movdqa  xmm8, xmm4
786        psrld   xmm8, 7
787        pslld   xmm4, 25
788        por     xmm4, xmm8
789        movdqa  xmm8, xmm5
790        psrld   xmm8, 7
791        pslld   xmm5, 25
792        por     xmm5, xmm8
793        movdqa  xmm8, xmm6
794        psrld   xmm8, 7
795        pslld   xmm6, 25
796        por     xmm6, xmm8
797        movdqa  xmm8, xmm7
798        psrld   xmm8, 7
799        pslld   xmm7, 25
800        por     xmm7, xmm8
801        paddd   xmm0, xmmword ptr [rsp+0x40]
802        paddd   xmm1, xmmword ptr [rsp+0xB0]
803        paddd   xmm2, xmmword ptr [rsp+0x50]
804        paddd   xmm3, xmmword ptr [rsp+0x10]
805        paddd   xmm0, xmm5
806        paddd   xmm1, xmm6
807        paddd   xmm2, xmm7
808        paddd   xmm3, xmm4
809        pxor    xmm15, xmm0
810        pxor    xmm12, xmm1
811        pxor    xmm13, xmm2
812        pxor    xmm14, xmm3
813        movdqa  xmm8, xmmword ptr [ROT16+rip]
814        pshufb  xmm15, xmm8
815        pshufb  xmm12, xmm8
816        pshufb  xmm13, xmm8
817        pshufb  xmm14, xmm8
818        paddd   xmm10, xmm15
819        paddd   xmm11, xmm12
820        movdqa  xmm8, xmmword ptr [rsp+0x100]
821        paddd   xmm8, xmm13
822        paddd   xmm9, xmm14
823        pxor    xmm5, xmm10
824        pxor    xmm6, xmm11
825        pxor    xmm7, xmm8
826        pxor    xmm4, xmm9
827        movdqa  xmmword ptr [rsp+0x100], xmm8
828        movdqa  xmm8, xmm5
829        psrld   xmm8, 12
830        pslld   xmm5, 20
831        por     xmm5, xmm8
832        movdqa  xmm8, xmm6
833        psrld   xmm8, 12
834        pslld   xmm6, 20
835        por     xmm6, xmm8
836        movdqa  xmm8, xmm7
837        psrld   xmm8, 12
838        pslld   xmm7, 20
839        por     xmm7, xmm8
840        movdqa  xmm8, xmm4
841        psrld   xmm8, 12
842        pslld   xmm4, 20
843        por     xmm4, xmm8
844        paddd   xmm0, xmmword ptr [rsp]
845        paddd   xmm1, xmmword ptr [rsp+0x20]
846        paddd   xmm2, xmmword ptr [rsp+0x80]
847        paddd   xmm3, xmmword ptr [rsp+0x60]
848        paddd   xmm0, xmm5
849        paddd   xmm1, xmm6
850        paddd   xmm2, xmm7
851        paddd   xmm3, xmm4
852        pxor    xmm15, xmm0
853        pxor    xmm12, xmm1
854        pxor    xmm13, xmm2
855        pxor    xmm14, xmm3
856        movdqa  xmm8, xmmword ptr [ROT8+rip]
857        pshufb  xmm15, xmm8
858        pshufb  xmm12, xmm8
859        pshufb  xmm13, xmm8
860        pshufb  xmm14, xmm8
861        paddd   xmm10, xmm15
862        paddd   xmm11, xmm12
863        movdqa  xmm8, xmmword ptr [rsp+0x100]
864        paddd   xmm8, xmm13
865        paddd   xmm9, xmm14
866        pxor    xmm5, xmm10
867        pxor    xmm6, xmm11
868        pxor    xmm7, xmm8
869        pxor    xmm4, xmm9
870        movdqa  xmmword ptr [rsp+0x100], xmm8
871        movdqa  xmm8, xmm5
872        psrld   xmm8, 7
873        pslld   xmm5, 25
874        por     xmm5, xmm8
875        movdqa  xmm8, xmm6
876        psrld   xmm8, 7
877        pslld   xmm6, 25
878        por     xmm6, xmm8
879        movdqa  xmm8, xmm7
880        psrld   xmm8, 7
881        pslld   xmm7, 25
882        por     xmm7, xmm8
883        movdqa  xmm8, xmm4
884        psrld   xmm8, 7
885        pslld   xmm4, 25
886        por     xmm4, xmm8
887        paddd   xmm0, xmmword ptr [rsp+0xC0]
888        paddd   xmm1, xmmword ptr [rsp+0x90]
889        paddd   xmm2, xmmword ptr [rsp+0xF0]
890        paddd   xmm3, xmmword ptr [rsp+0xE0]
891        paddd   xmm0, xmm4
892        paddd   xmm1, xmm5
893        paddd   xmm2, xmm6
894        paddd   xmm3, xmm7
895        pxor    xmm12, xmm0
896        pxor    xmm13, xmm1
897        pxor    xmm14, xmm2
898        pxor    xmm15, xmm3
899        movdqa  xmm8, xmmword ptr [ROT16+rip]
900        pshufb  xmm12, xmm8
901        pshufb  xmm13, xmm8
902        pshufb  xmm14, xmm8
903        pshufb  xmm15, xmm8
904        movdqa  xmm8, xmmword ptr [rsp+0x100]
905        paddd   xmm8, xmm12
906        paddd   xmm9, xmm13
907        paddd   xmm10, xmm14
908        paddd   xmm11, xmm15
909        pxor    xmm4, xmm8
910        pxor    xmm5, xmm9
911        pxor    xmm6, xmm10
912        pxor    xmm7, xmm11
913        movdqa  xmmword ptr [rsp+0x100], xmm8
914        movdqa  xmm8, xmm4
915        psrld   xmm8, 12
916        pslld   xmm4, 20
917        por     xmm4, xmm8
918        movdqa  xmm8, xmm5
919        psrld   xmm8, 12
920        pslld   xmm5, 20
921        por     xmm5, xmm8
922        movdqa  xmm8, xmm6
923        psrld   xmm8, 12
924        pslld   xmm6, 20
925        por     xmm6, xmm8
926        movdqa  xmm8, xmm7
927        psrld   xmm8, 12
928        pslld   xmm7, 20
929        por     xmm7, xmm8
930        paddd   xmm0, xmmword ptr [rsp+0xD0]
931        paddd   xmm1, xmmword ptr [rsp+0xB0]
932        paddd   xmm2, xmmword ptr [rsp+0xA0]
933        paddd   xmm3, xmmword ptr [rsp+0x80]
934        paddd   xmm0, xmm4
935        paddd   xmm1, xmm5
936        paddd   xmm2, xmm6
937        paddd   xmm3, xmm7
938        pxor    xmm12, xmm0
939        pxor    xmm13, xmm1
940        pxor    xmm14, xmm2
941        pxor    xmm15, xmm3
942        movdqa  xmm8, xmmword ptr [ROT8+rip]
943        pshufb  xmm12, xmm8
944        pshufb  xmm13, xmm8
945        pshufb  xmm14, xmm8
946        pshufb  xmm15, xmm8
947        movdqa  xmm8, xmmword ptr [rsp+0x100]
948        paddd   xmm8, xmm12
949        paddd   xmm9, xmm13
950        paddd   xmm10, xmm14
951        paddd   xmm11, xmm15
952        pxor    xmm4, xmm8
953        pxor    xmm5, xmm9
954        pxor    xmm6, xmm10
955        pxor    xmm7, xmm11
956        movdqa  xmmword ptr [rsp+0x100], xmm8
957        movdqa  xmm8, xmm4
958        psrld   xmm8, 7
959        pslld   xmm4, 25
960        por     xmm4, xmm8
961        movdqa  xmm8, xmm5
962        psrld   xmm8, 7
963        pslld   xmm5, 25
964        por     xmm5, xmm8
965        movdqa  xmm8, xmm6
966        psrld   xmm8, 7
967        pslld   xmm6, 25
968        por     xmm6, xmm8
969        movdqa  xmm8, xmm7
970        psrld   xmm8, 7
971        pslld   xmm7, 25
972        por     xmm7, xmm8
973        paddd   xmm0, xmmword ptr [rsp+0x70]
974        paddd   xmm1, xmmword ptr [rsp+0x50]
975        paddd   xmm2, xmmword ptr [rsp]
976        paddd   xmm3, xmmword ptr [rsp+0x60]
977        paddd   xmm0, xmm5
978        paddd   xmm1, xmm6
979        paddd   xmm2, xmm7
980        paddd   xmm3, xmm4
981        pxor    xmm15, xmm0
982        pxor    xmm12, xmm1
983        pxor    xmm13, xmm2
984        pxor    xmm14, xmm3
985        movdqa  xmm8, xmmword ptr [ROT16+rip]
986        pshufb  xmm15, xmm8
987        pshufb  xmm12, xmm8
988        pshufb  xmm13, xmm8
989        pshufb  xmm14, xmm8
990        paddd   xmm10, xmm15
991        paddd   xmm11, xmm12
992        movdqa  xmm8, xmmword ptr [rsp+0x100]
993        paddd   xmm8, xmm13
994        paddd   xmm9, xmm14
995        pxor    xmm5, xmm10
996        pxor    xmm6, xmm11
997        pxor    xmm7, xmm8
998        pxor    xmm4, xmm9
999        movdqa  xmmword ptr [rsp+0x100], xmm8
1000        movdqa  xmm8, xmm5
1001        psrld   xmm8, 12
1002        pslld   xmm5, 20
1003        por     xmm5, xmm8
1004        movdqa  xmm8, xmm6
1005        psrld   xmm8, 12
1006        pslld   xmm6, 20
1007        por     xmm6, xmm8
1008        movdqa  xmm8, xmm7
1009        psrld   xmm8, 12
1010        pslld   xmm7, 20
1011        por     xmm7, xmm8
1012        movdqa  xmm8, xmm4
1013        psrld   xmm8, 12
1014        pslld   xmm4, 20
1015        por     xmm4, xmm8
1016        paddd   xmm0, xmmword ptr [rsp+0x20]
1017        paddd   xmm1, xmmword ptr [rsp+0x30]
1018        paddd   xmm2, xmmword ptr [rsp+0x10]
1019        paddd   xmm3, xmmword ptr [rsp+0x40]
1020        paddd   xmm0, xmm5
1021        paddd   xmm1, xmm6
1022        paddd   xmm2, xmm7
1023        paddd   xmm3, xmm4
1024        pxor    xmm15, xmm0
1025        pxor    xmm12, xmm1
1026        pxor    xmm13, xmm2
1027        pxor    xmm14, xmm3
1028        movdqa  xmm8, xmmword ptr [ROT8+rip]
1029        pshufb  xmm15, xmm8
1030        pshufb  xmm12, xmm8
1031        pshufb  xmm13, xmm8
1032        pshufb  xmm14, xmm8
1033        paddd   xmm10, xmm15
1034        paddd   xmm11, xmm12
1035        movdqa  xmm8, xmmword ptr [rsp+0x100]
1036        paddd   xmm8, xmm13
1037        paddd   xmm9, xmm14
1038        pxor    xmm5, xmm10
1039        pxor    xmm6, xmm11
1040        pxor    xmm7, xmm8
1041        pxor    xmm4, xmm9
1042        movdqa  xmmword ptr [rsp+0x100], xmm8
1043        movdqa  xmm8, xmm5
1044        psrld   xmm8, 7
1045        pslld   xmm5, 25
1046        por     xmm5, xmm8
1047        movdqa  xmm8, xmm6
1048        psrld   xmm8, 7
1049        pslld   xmm6, 25
1050        por     xmm6, xmm8
1051        movdqa  xmm8, xmm7
1052        psrld   xmm8, 7
1053        pslld   xmm7, 25
1054        por     xmm7, xmm8
1055        movdqa  xmm8, xmm4
1056        psrld   xmm8, 7
1057        pslld   xmm4, 25
1058        por     xmm4, xmm8
1059        paddd   xmm0, xmmword ptr [rsp+0x90]
1060        paddd   xmm1, xmmword ptr [rsp+0xB0]
1061        paddd   xmm2, xmmword ptr [rsp+0x80]
1062        paddd   xmm3, xmmword ptr [rsp+0xF0]
1063        paddd   xmm0, xmm4
1064        paddd   xmm1, xmm5
1065        paddd   xmm2, xmm6
1066        paddd   xmm3, xmm7
1067        pxor    xmm12, xmm0
1068        pxor    xmm13, xmm1
1069        pxor    xmm14, xmm2
1070        pxor    xmm15, xmm3
1071        movdqa  xmm8, xmmword ptr [ROT16+rip]
1072        pshufb  xmm12, xmm8
1073        pshufb  xmm13, xmm8
1074        pshufb  xmm14, xmm8
1075        pshufb  xmm15, xmm8
1076        movdqa  xmm8, xmmword ptr [rsp+0x100]
1077        paddd   xmm8, xmm12
1078        paddd   xmm9, xmm13
1079        paddd   xmm10, xmm14
1080        paddd   xmm11, xmm15
1081        pxor    xmm4, xmm8
1082        pxor    xmm5, xmm9
1083        pxor    xmm6, xmm10
1084        pxor    xmm7, xmm11
1085        movdqa  xmmword ptr [rsp+0x100], xmm8
1086        movdqa  xmm8, xmm4
1087        psrld   xmm8, 12
1088        pslld   xmm4, 20
1089        por     xmm4, xmm8
1090        movdqa  xmm8, xmm5
1091        psrld   xmm8, 12
1092        pslld   xmm5, 20
1093        por     xmm5, xmm8
1094        movdqa  xmm8, xmm6
1095        psrld   xmm8, 12
1096        pslld   xmm6, 20
1097        por     xmm6, xmm8
1098        movdqa  xmm8, xmm7
1099        psrld   xmm8, 12
1100        pslld   xmm7, 20
1101        por     xmm7, xmm8
1102        paddd   xmm0, xmmword ptr [rsp+0xE0]
1103        paddd   xmm1, xmmword ptr [rsp+0x50]
1104        paddd   xmm2, xmmword ptr [rsp+0xC0]
1105        paddd   xmm3, xmmword ptr [rsp+0x10]
1106        paddd   xmm0, xmm4
1107        paddd   xmm1, xmm5
1108        paddd   xmm2, xmm6
1109        paddd   xmm3, xmm7
1110        pxor    xmm12, xmm0
1111        pxor    xmm13, xmm1
1112        pxor    xmm14, xmm2
1113        pxor    xmm15, xmm3
1114        movdqa  xmm8, xmmword ptr [ROT8+rip]
1115        pshufb  xmm12, xmm8
1116        pshufb  xmm13, xmm8
1117        pshufb  xmm14, xmm8
1118        pshufb  xmm15, xmm8
1119        movdqa  xmm8, xmmword ptr [rsp+0x100]
1120        paddd   xmm8, xmm12
1121        paddd   xmm9, xmm13
1122        paddd   xmm10, xmm14
1123        paddd   xmm11, xmm15
1124        pxor    xmm4, xmm8
1125        pxor    xmm5, xmm9
1126        pxor    xmm6, xmm10
1127        pxor    xmm7, xmm11
1128        movdqa  xmmword ptr [rsp+0x100], xmm8
1129        movdqa  xmm8, xmm4
1130        psrld   xmm8, 7
1131        pslld   xmm4, 25
1132        por     xmm4, xmm8
1133        movdqa  xmm8, xmm5
1134        psrld   xmm8, 7
1135        pslld   xmm5, 25
1136        por     xmm5, xmm8
1137        movdqa  xmm8, xmm6
1138        psrld   xmm8, 7
1139        pslld   xmm6, 25
1140        por     xmm6, xmm8
1141        movdqa  xmm8, xmm7
1142        psrld   xmm8, 7
1143        pslld   xmm7, 25
1144        por     xmm7, xmm8
1145        paddd   xmm0, xmmword ptr [rsp+0xD0]
1146        paddd   xmm1, xmmword ptr [rsp]
1147        paddd   xmm2, xmmword ptr [rsp+0x20]
1148        paddd   xmm3, xmmword ptr [rsp+0x40]
1149        paddd   xmm0, xmm5
1150        paddd   xmm1, xmm6
1151        paddd   xmm2, xmm7
1152        paddd   xmm3, xmm4
1153        pxor    xmm15, xmm0
1154        pxor    xmm12, xmm1
1155        pxor    xmm13, xmm2
1156        pxor    xmm14, xmm3
1157        movdqa  xmm8, xmmword ptr [ROT16+rip]
1158        pshufb  xmm15, xmm8
1159        pshufb  xmm12, xmm8
1160        pshufb  xmm13, xmm8
1161        pshufb  xmm14, xmm8
1162        paddd   xmm10, xmm15
1163        paddd   xmm11, xmm12
1164        movdqa  xmm8, xmmword ptr [rsp+0x100]
1165        paddd   xmm8, xmm13
1166        paddd   xmm9, xmm14
1167        pxor    xmm5, xmm10
1168        pxor    xmm6, xmm11
1169        pxor    xmm7, xmm8
1170        pxor    xmm4, xmm9
1171        movdqa  xmmword ptr [rsp+0x100], xmm8
1172        movdqa  xmm8, xmm5
1173        psrld   xmm8, 12
1174        pslld   xmm5, 20
1175        por     xmm5, xmm8
1176        movdqa  xmm8, xmm6
1177        psrld   xmm8, 12
1178        pslld   xmm6, 20
1179        por     xmm6, xmm8
1180        movdqa  xmm8, xmm7
1181        psrld   xmm8, 12
1182        pslld   xmm7, 20
1183        por     xmm7, xmm8
1184        movdqa  xmm8, xmm4
1185        psrld   xmm8, 12
1186        pslld   xmm4, 20
1187        por     xmm4, xmm8
1188        paddd   xmm0, xmmword ptr [rsp+0x30]
1189        paddd   xmm1, xmmword ptr [rsp+0xA0]
1190        paddd   xmm2, xmmword ptr [rsp+0x60]
1191        paddd   xmm3, xmmword ptr [rsp+0x70]
1192        paddd   xmm0, xmm5
1193        paddd   xmm1, xmm6
1194        paddd   xmm2, xmm7
1195        paddd   xmm3, xmm4
1196        pxor    xmm15, xmm0
1197        pxor    xmm12, xmm1
1198        pxor    xmm13, xmm2
1199        pxor    xmm14, xmm3
1200        movdqa  xmm8, xmmword ptr [ROT8+rip]
1201        pshufb  xmm15, xmm8
1202        pshufb  xmm12, xmm8
1203        pshufb  xmm13, xmm8
1204        pshufb  xmm14, xmm8
1205        paddd   xmm10, xmm15
1206        paddd   xmm11, xmm12
1207        movdqa  xmm8, xmmword ptr [rsp+0x100]
1208        paddd   xmm8, xmm13
1209        paddd   xmm9, xmm14
1210        pxor    xmm5, xmm10
1211        pxor    xmm6, xmm11
1212        pxor    xmm7, xmm8
1213        pxor    xmm4, xmm9
1214        movdqa  xmmword ptr [rsp+0x100], xmm8
1215        movdqa  xmm8, xmm5
1216        psrld   xmm8, 7
1217        pslld   xmm5, 25
1218        por     xmm5, xmm8
1219        movdqa  xmm8, xmm6
1220        psrld   xmm8, 7
1221        pslld   xmm6, 25
1222        por     xmm6, xmm8
1223        movdqa  xmm8, xmm7
1224        psrld   xmm8, 7
1225        pslld   xmm7, 25
1226        por     xmm7, xmm8
1227        movdqa  xmm8, xmm4
1228        psrld   xmm8, 7
1229        pslld   xmm4, 25
1230        por     xmm4, xmm8
1231        paddd   xmm0, xmmword ptr [rsp+0xB0]
1232        paddd   xmm1, xmmword ptr [rsp+0x50]
1233        paddd   xmm2, xmmword ptr [rsp+0x10]
1234        paddd   xmm3, xmmword ptr [rsp+0x80]
1235        paddd   xmm0, xmm4
1236        paddd   xmm1, xmm5
1237        paddd   xmm2, xmm6
1238        paddd   xmm3, xmm7
1239        pxor    xmm12, xmm0
1240        pxor    xmm13, xmm1
1241        pxor    xmm14, xmm2
1242        pxor    xmm15, xmm3
1243        movdqa  xmm8, xmmword ptr [ROT16+rip]
1244        pshufb  xmm12, xmm8
1245        pshufb  xmm13, xmm8
1246        pshufb  xmm14, xmm8
1247        pshufb  xmm15, xmm8
1248        movdqa  xmm8, xmmword ptr [rsp+0x100]
1249        paddd   xmm8, xmm12
1250        paddd   xmm9, xmm13
1251        paddd   xmm10, xmm14
1252        paddd   xmm11, xmm15
1253        pxor    xmm4, xmm8
1254        pxor    xmm5, xmm9
1255        pxor    xmm6, xmm10
1256        pxor    xmm7, xmm11
1257        movdqa  xmmword ptr [rsp+0x100], xmm8
1258        movdqa  xmm8, xmm4
1259        psrld   xmm8, 12
1260        pslld   xmm4, 20
1261        por     xmm4, xmm8
1262        movdqa  xmm8, xmm5
1263        psrld   xmm8, 12
1264        pslld   xmm5, 20
1265        por     xmm5, xmm8
1266        movdqa  xmm8, xmm6
1267        psrld   xmm8, 12
1268        pslld   xmm6, 20
1269        por     xmm6, xmm8
1270        movdqa  xmm8, xmm7
1271        psrld   xmm8, 12
1272        pslld   xmm7, 20
1273        por     xmm7, xmm8
1274        paddd   xmm0, xmmword ptr [rsp+0xF0]
1275        paddd   xmm1, xmmword ptr [rsp]
1276        paddd   xmm2, xmmword ptr [rsp+0x90]
1277        paddd   xmm3, xmmword ptr [rsp+0x60]
1278        paddd   xmm0, xmm4
1279        paddd   xmm1, xmm5
1280        paddd   xmm2, xmm6
1281        paddd   xmm3, xmm7
1282        pxor    xmm12, xmm0
1283        pxor    xmm13, xmm1
1284        pxor    xmm14, xmm2
1285        pxor    xmm15, xmm3
1286        movdqa  xmm8, xmmword ptr [ROT8+rip]
1287        pshufb  xmm12, xmm8
1288        pshufb  xmm13, xmm8
1289        pshufb  xmm14, xmm8
1290        pshufb  xmm15, xmm8
1291        movdqa  xmm8, xmmword ptr [rsp+0x100]
1292        paddd   xmm8, xmm12
1293        paddd   xmm9, xmm13
1294        paddd   xmm10, xmm14
1295        paddd   xmm11, xmm15
1296        pxor    xmm4, xmm8
1297        pxor    xmm5, xmm9
1298        pxor    xmm6, xmm10
1299        pxor    xmm7, xmm11
1300        movdqa  xmmword ptr [rsp+0x100], xmm8
1301        movdqa  xmm8, xmm4
1302        psrld   xmm8, 7
1303        pslld   xmm4, 25
1304        por     xmm4, xmm8
1305        movdqa  xmm8, xmm5
1306        psrld   xmm8, 7
1307        pslld   xmm5, 25
1308        por     xmm5, xmm8
1309        movdqa  xmm8, xmm6
1310        psrld   xmm8, 7
1311        pslld   xmm6, 25
1312        por     xmm6, xmm8
1313        movdqa  xmm8, xmm7
1314        psrld   xmm8, 7
1315        pslld   xmm7, 25
1316        por     xmm7, xmm8
1317        paddd   xmm0, xmmword ptr [rsp+0xE0]
1318        paddd   xmm1, xmmword ptr [rsp+0x20]
1319        paddd   xmm2, xmmword ptr [rsp+0x30]
1320        paddd   xmm3, xmmword ptr [rsp+0x70]
1321        paddd   xmm0, xmm5
1322        paddd   xmm1, xmm6
1323        paddd   xmm2, xmm7
1324        paddd   xmm3, xmm4
1325        pxor    xmm15, xmm0
1326        pxor    xmm12, xmm1
1327        pxor    xmm13, xmm2
1328        pxor    xmm14, xmm3
1329        movdqa  xmm8, xmmword ptr [ROT16+rip]
1330        pshufb  xmm15, xmm8
1331        pshufb  xmm12, xmm8
1332        pshufb  xmm13, xmm8
1333        pshufb  xmm14, xmm8
1334        paddd   xmm10, xmm15
1335        paddd   xmm11, xmm12
1336        movdqa  xmm8, xmmword ptr [rsp+0x100]
1337        paddd   xmm8, xmm13
1338        paddd   xmm9, xmm14
1339        pxor    xmm5, xmm10
1340        pxor    xmm6, xmm11
1341        pxor    xmm7, xmm8
1342        pxor    xmm4, xmm9
1343        movdqa  xmmword ptr [rsp+0x100], xmm8
1344        movdqa  xmm8, xmm5
1345        psrld   xmm8, 12
1346        pslld   xmm5, 20
1347        por     xmm5, xmm8
1348        movdqa  xmm8, xmm6
1349        psrld   xmm8, 12
1350        pslld   xmm6, 20
1351        por     xmm6, xmm8
1352        movdqa  xmm8, xmm7
1353        psrld   xmm8, 12
1354        pslld   xmm7, 20
1355        por     xmm7, xmm8
1356        movdqa  xmm8, xmm4
1357        psrld   xmm8, 12
1358        pslld   xmm4, 20
1359        por     xmm4, xmm8
1360        paddd   xmm0, xmmword ptr [rsp+0xA0]
1361        paddd   xmm1, xmmword ptr [rsp+0xC0]
1362        paddd   xmm2, xmmword ptr [rsp+0x40]
1363        paddd   xmm3, xmmword ptr [rsp+0xD0]
1364        paddd   xmm0, xmm5
1365        paddd   xmm1, xmm6
1366        paddd   xmm2, xmm7
1367        paddd   xmm3, xmm4
1368        pxor    xmm15, xmm0
1369        pxor    xmm12, xmm1
1370        pxor    xmm13, xmm2
1371        pxor    xmm14, xmm3
1372        movdqa  xmm8, xmmword ptr [ROT8+rip]
1373        pshufb  xmm15, xmm8
1374        pshufb  xmm12, xmm8
1375        pshufb  xmm13, xmm8
1376        pshufb  xmm14, xmm8
1377        paddd   xmm10, xmm15
1378        paddd   xmm11, xmm12
1379        movdqa  xmm8, xmmword ptr [rsp+0x100]
1380        paddd   xmm8, xmm13
1381        paddd   xmm9, xmm14
1382        pxor    xmm5, xmm10
1383        pxor    xmm6, xmm11
1384        pxor    xmm7, xmm8
1385        pxor    xmm4, xmm9
1386        pxor    xmm0, xmm8
1387        pxor    xmm1, xmm9
1388        pxor    xmm2, xmm10
1389        pxor    xmm3, xmm11
1390        movdqa  xmm8, xmm5
1391        psrld   xmm8, 7
1392        pslld   xmm5, 25
1393        por     xmm5, xmm8
1394        movdqa  xmm8, xmm6
1395        psrld   xmm8, 7
1396        pslld   xmm6, 25
1397        por     xmm6, xmm8
1398        movdqa  xmm8, xmm7
1399        psrld   xmm8, 7
1400        pslld   xmm7, 25
1401        por     xmm7, xmm8
1402        movdqa  xmm8, xmm4
1403        psrld   xmm8, 7
1404        pslld   xmm4, 25
1405        por     xmm4, xmm8
1406        pxor    xmm4, xmm12
1407        pxor    xmm5, xmm13
1408        pxor    xmm6, xmm14
1409        pxor    xmm7, xmm15
1410        mov     eax, r13d
1411        jne     9b
1412        movdqa  xmm9, xmm0
1413        punpckldq xmm0, xmm1
1414        punpckhdq xmm9, xmm1
1415        movdqa  xmm11, xmm2
1416        punpckldq xmm2, xmm3
1417        punpckhdq xmm11, xmm3
1418        movdqa  xmm1, xmm0
1419        punpcklqdq xmm0, xmm2
1420        punpckhqdq xmm1, xmm2
1421        movdqa  xmm3, xmm9
1422        punpcklqdq xmm9, xmm11
1423        punpckhqdq xmm3, xmm11
1424        movdqu  xmmword ptr [rbx], xmm0
1425        movdqu  xmmword ptr [rbx+0x20], xmm1
1426        movdqu  xmmword ptr [rbx+0x40], xmm9
1427        movdqu  xmmword ptr [rbx+0x60], xmm3
1428        movdqa  xmm9, xmm4
1429        punpckldq xmm4, xmm5
1430        punpckhdq xmm9, xmm5
1431        movdqa  xmm11, xmm6
1432        punpckldq xmm6, xmm7
1433        punpckhdq xmm11, xmm7
1434        movdqa  xmm5, xmm4
1435        punpcklqdq xmm4, xmm6
1436        punpckhqdq xmm5, xmm6
1437        movdqa  xmm7, xmm9
1438        punpcklqdq xmm9, xmm11
1439        punpckhqdq xmm7, xmm11
1440        movdqu  xmmword ptr [rbx+0x10], xmm4
1441        movdqu  xmmword ptr [rbx+0x30], xmm5
1442        movdqu  xmmword ptr [rbx+0x50], xmm9
1443        movdqu  xmmword ptr [rbx+0x70], xmm7
1444        movdqa  xmm1, xmmword ptr [rsp+0x110]
1445        movdqa  xmm0, xmm1
1446        paddd   xmm1, xmmword ptr [rsp+0x150]
1447        movdqa  xmmword ptr [rsp+0x110], xmm1
1448        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1449        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1450        pcmpgtd xmm0, xmm1
1451        movdqa  xmm1, xmmword ptr [rsp+0x120]
1452        psubd   xmm1, xmm0
1453        movdqa  xmmword ptr [rsp+0x120], xmm1
1454        add     rbx, 128
1455        add     rdi, 32
1456        sub     rsi, 4
1457        cmp     rsi, 4
1458        jnc     2b
1459        test    rsi, rsi
1460        jnz     3f
14614:
1462        mov     rsp, rbp
1463        pop     rbp
1464        pop     rbx
1465        pop     r12
1466        pop     r13
1467        pop     r14
1468        pop     r15
1469        ret
1470.p2align 5
14713:
1472        test    esi, 0x2
1473        je      3f
1474        movups  xmm0, xmmword ptr [rcx]
1475        movups  xmm1, xmmword ptr [rcx+0x10]
1476        movaps  xmm8, xmm0
1477        movaps  xmm9, xmm1
1478        movd    xmm13, dword ptr [rsp+0x110]
1479        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1480        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1481        movaps  xmmword ptr [rsp], xmm13
1482        movd    xmm14, dword ptr [rsp+0x114]
1483        pinsrd  xmm14, dword ptr [rsp+0x124], 1
1484        pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1485        movaps  xmmword ptr [rsp+0x10], xmm14
1486        mov     r8, qword ptr [rdi]
1487        mov     r9, qword ptr [rdi+0x8]
1488        movzx   eax, byte ptr [rbp+0x40]
1489        or      eax, r13d
1490        xor     edx, edx
14912:
1492        mov     r14d, eax
1493        or      eax, r12d
1494        add     rdx, 64
1495        cmp     rdx, r15
1496        cmovne  eax, r14d
1497        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1498        movaps  xmm10, xmm2
1499        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1500        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1501        movaps  xmm3, xmm4
1502        shufps  xmm4, xmm5, 136
1503        shufps  xmm3, xmm5, 221
1504        movaps  xmm5, xmm3
1505        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1506        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1507        movaps  xmm3, xmm6
1508        shufps  xmm6, xmm7, 136
1509        pshufd  xmm6, xmm6, 0x93
1510        shufps  xmm3, xmm7, 221
1511        pshufd  xmm7, xmm3, 0x93
1512        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1513        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1514        movaps  xmm11, xmm12
1515        shufps  xmm12, xmm13, 136
1516        shufps  xmm11, xmm13, 221
1517        movaps  xmm13, xmm11
1518        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1519        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1520        movaps  xmm11, xmm14
1521        shufps  xmm14, xmm15, 136
1522        pshufd  xmm14, xmm14, 0x93
1523        shufps  xmm11, xmm15, 221
1524        pshufd  xmm15, xmm11, 0x93
1525        movaps  xmm3, xmmword ptr [rsp]
1526        movaps  xmm11, xmmword ptr [rsp+0x10]
1527        pinsrd  xmm3, eax, 3
1528        pinsrd  xmm11, eax, 3
1529        mov     al, 7
15309:
1531        paddd   xmm0, xmm4
1532        paddd   xmm8, xmm12
1533        movaps  xmmword ptr [rsp+0x20], xmm4
1534        movaps  xmmword ptr [rsp+0x30], xmm12
1535        paddd   xmm0, xmm1
1536        paddd   xmm8, xmm9
1537        pxor    xmm3, xmm0
1538        pxor    xmm11, xmm8
1539        movaps  xmm12, xmmword ptr [ROT16+rip]
1540        pshufb  xmm3, xmm12
1541        pshufb  xmm11, xmm12
1542        paddd   xmm2, xmm3
1543        paddd   xmm10, xmm11
1544        pxor    xmm1, xmm2
1545        pxor    xmm9, xmm10
1546        movdqa  xmm4, xmm1
1547        pslld   xmm1, 20
1548        psrld   xmm4, 12
1549        por     xmm1, xmm4
1550        movdqa  xmm4, xmm9
1551        pslld   xmm9, 20
1552        psrld   xmm4, 12
1553        por     xmm9, xmm4
1554        paddd   xmm0, xmm5
1555        paddd   xmm8, xmm13
1556        movaps  xmmword ptr [rsp+0x40], xmm5
1557        movaps  xmmword ptr [rsp+0x50], xmm13
1558        paddd   xmm0, xmm1
1559        paddd   xmm8, xmm9
1560        pxor    xmm3, xmm0
1561        pxor    xmm11, xmm8
1562        movaps  xmm13, xmmword ptr [ROT8+rip]
1563        pshufb  xmm3, xmm13
1564        pshufb  xmm11, xmm13
1565        paddd   xmm2, xmm3
1566        paddd   xmm10, xmm11
1567        pxor    xmm1, xmm2
1568        pxor    xmm9, xmm10
1569        movdqa  xmm4, xmm1
1570        pslld   xmm1, 25
1571        psrld   xmm4, 7
1572        por     xmm1, xmm4
1573        movdqa  xmm4, xmm9
1574        pslld   xmm9, 25
1575        psrld   xmm4, 7
1576        por     xmm9, xmm4
1577        pshufd  xmm0, xmm0, 0x93
1578        pshufd  xmm8, xmm8, 0x93
1579        pshufd  xmm3, xmm3, 0x4E
1580        pshufd  xmm11, xmm11, 0x4E
1581        pshufd  xmm2, xmm2, 0x39
1582        pshufd  xmm10, xmm10, 0x39
1583        paddd   xmm0, xmm6
1584        paddd   xmm8, xmm14
1585        paddd   xmm0, xmm1
1586        paddd   xmm8, xmm9
1587        pxor    xmm3, xmm0
1588        pxor    xmm11, xmm8
1589        pshufb  xmm3, xmm12
1590        pshufb  xmm11, xmm12
1591        paddd   xmm2, xmm3
1592        paddd   xmm10, xmm11
1593        pxor    xmm1, xmm2
1594        pxor    xmm9, xmm10
1595        movdqa  xmm4, xmm1
1596        pslld   xmm1, 20
1597        psrld   xmm4, 12
1598        por     xmm1, xmm4
1599        movdqa  xmm4, xmm9
1600        pslld   xmm9, 20
1601        psrld   xmm4, 12
1602        por     xmm9, xmm4
1603        paddd   xmm0, xmm7
1604        paddd   xmm8, xmm15
1605        paddd   xmm0, xmm1
1606        paddd   xmm8, xmm9
1607        pxor    xmm3, xmm0
1608        pxor    xmm11, xmm8
1609        pshufb  xmm3, xmm13
1610        pshufb  xmm11, xmm13
1611        paddd   xmm2, xmm3
1612        paddd   xmm10, xmm11
1613        pxor    xmm1, xmm2
1614        pxor    xmm9, xmm10
1615        movdqa  xmm4, xmm1
1616        pslld   xmm1, 25
1617        psrld   xmm4, 7
1618        por     xmm1, xmm4
1619        movdqa  xmm4, xmm9
1620        pslld   xmm9, 25
1621        psrld   xmm4, 7
1622        por     xmm9, xmm4
1623        pshufd  xmm0, xmm0, 0x39
1624        pshufd  xmm8, xmm8, 0x39
1625        pshufd  xmm3, xmm3, 0x4E
1626        pshufd  xmm11, xmm11, 0x4E
1627        pshufd  xmm2, xmm2, 0x93
1628        pshufd  xmm10, xmm10, 0x93
1629        dec     al
1630        je      9f
1631        movdqa  xmm12, xmmword ptr [rsp+0x20]
1632        movdqa  xmm5, xmmword ptr [rsp+0x40]
1633        pshufd  xmm13, xmm12, 0x0F
1634        shufps  xmm12, xmm5, 214
1635        pshufd  xmm4, xmm12, 0x39
1636        movdqa  xmm12, xmm6
1637        shufps  xmm12, xmm7, 250
1638        pblendw xmm13, xmm12, 0xCC
1639        movdqa  xmm12, xmm7
1640        punpcklqdq xmm12, xmm5
1641        pblendw xmm12, xmm6, 0xC0
1642        pshufd  xmm12, xmm12, 0x78
1643        punpckhdq xmm5, xmm7
1644        punpckldq xmm6, xmm5
1645        pshufd  xmm7, xmm6, 0x1E
1646        movdqa  xmmword ptr [rsp+0x20], xmm13
1647        movdqa  xmmword ptr [rsp+0x40], xmm12
1648        movdqa  xmm5, xmmword ptr [rsp+0x30]
1649        movdqa  xmm13, xmmword ptr [rsp+0x50]
1650        pshufd  xmm6, xmm5, 0x0F
1651        shufps  xmm5, xmm13, 214
1652        pshufd  xmm12, xmm5, 0x39
1653        movdqa  xmm5, xmm14
1654        shufps  xmm5, xmm15, 250
1655        pblendw xmm6, xmm5, 0xCC
1656        movdqa  xmm5, xmm15
1657        punpcklqdq xmm5, xmm13
1658        pblendw xmm5, xmm14, 0xC0
1659        pshufd  xmm5, xmm5, 0x78
1660        punpckhdq xmm13, xmm15
1661        punpckldq xmm14, xmm13
1662        pshufd  xmm15, xmm14, 0x1E
1663        movdqa  xmm13, xmm6
1664        movdqa  xmm14, xmm5
1665        movdqa  xmm5, xmmword ptr [rsp+0x20]
1666        movdqa  xmm6, xmmword ptr [rsp+0x40]
1667        jmp     9b
16689:
1669        pxor    xmm0, xmm2
1670        pxor    xmm1, xmm3
1671        pxor    xmm8, xmm10
1672        pxor    xmm9, xmm11
1673        mov     eax, r13d
1674        cmp     rdx, r15
1675        jne     2b
1676        movups  xmmword ptr [rbx], xmm0
1677        movups  xmmword ptr [rbx+0x10], xmm1
1678        movups  xmmword ptr [rbx+0x20], xmm8
1679        movups  xmmword ptr [rbx+0x30], xmm9
1680        movdqa  xmm0, xmmword ptr [rsp+0x130]
1681        movdqa  xmm1, xmmword ptr [rsp+0x110]
1682        movdqa  xmm2, xmmword ptr [rsp+0x120]
1683        movdqu  xmm3, xmmword ptr [rsp+0x118]
1684        movdqu  xmm4, xmmword ptr [rsp+0x128]
1685        blendvps xmm1, xmm3, xmm0
1686        blendvps xmm2, xmm4, xmm0
1687        movdqa  xmmword ptr [rsp+0x110], xmm1
1688        movdqa  xmmword ptr [rsp+0x120], xmm2
1689        add     rdi, 16
1690        add     rbx, 64
1691        sub     rsi, 2
16923:
1693        test    esi, 0x1
1694        je      4b
1695        movups  xmm0, xmmword ptr [rcx]
1696        movups  xmm1, xmmword ptr [rcx+0x10]
1697        movd    xmm13, dword ptr [rsp+0x110]
1698        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1699        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1700        movaps  xmm14, xmmword ptr [ROT8+rip]
1701        movaps  xmm15, xmmword ptr [ROT16+rip]
1702        mov     r8, qword ptr [rdi]
1703        movzx   eax, byte ptr [rbp+0x40]
1704        or      eax, r13d
1705        xor     edx, edx
17062:
1707        mov     r14d, eax
1708        or      eax, r12d
1709        add     rdx, 64
1710        cmp     rdx, r15
1711        cmovne  eax, r14d
1712        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1713        movaps  xmm3, xmm13
1714        pinsrd  xmm3, eax, 3
1715        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1716        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1717        movaps  xmm8, xmm4
1718        shufps  xmm4, xmm5, 136
1719        shufps  xmm8, xmm5, 221
1720        movaps  xmm5, xmm8
1721        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1722        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1723        movaps  xmm8, xmm6
1724        shufps  xmm6, xmm7, 136
1725        pshufd  xmm6, xmm6, 0x93
1726        shufps  xmm8, xmm7, 221
1727        pshufd  xmm7, xmm8, 0x93
1728        mov     al, 7
17299:
1730        paddd   xmm0, xmm4
1731        paddd   xmm0, xmm1
1732        pxor    xmm3, xmm0
1733        pshufb  xmm3, xmm15
1734        paddd   xmm2, xmm3
1735        pxor    xmm1, xmm2
1736        movdqa  xmm11, xmm1
1737        pslld   xmm1, 20
1738        psrld   xmm11, 12
1739        por     xmm1, xmm11
1740        paddd   xmm0, xmm5
1741        paddd   xmm0, xmm1
1742        pxor    xmm3, xmm0
1743        pshufb  xmm3, xmm14
1744        paddd   xmm2, xmm3
1745        pxor    xmm1, xmm2
1746        movdqa  xmm11, xmm1
1747        pslld   xmm1, 25
1748        psrld   xmm11, 7
1749        por     xmm1, xmm11
1750        pshufd  xmm0, xmm0, 0x93
1751        pshufd  xmm3, xmm3, 0x4E
1752        pshufd  xmm2, xmm2, 0x39
1753        paddd   xmm0, xmm6
1754        paddd   xmm0, xmm1
1755        pxor    xmm3, xmm0
1756        pshufb  xmm3, xmm15
1757        paddd   xmm2, xmm3
1758        pxor    xmm1, xmm2
1759        movdqa  xmm11, xmm1
1760        pslld   xmm1, 20
1761        psrld   xmm11, 12
1762        por     xmm1, xmm11
1763        paddd   xmm0, xmm7
1764        paddd   xmm0, xmm1
1765        pxor    xmm3, xmm0
1766        pshufb  xmm3, xmm14
1767        paddd   xmm2, xmm3
1768        pxor    xmm1, xmm2
1769        movdqa  xmm11, xmm1
1770        pslld   xmm1, 25
1771        psrld   xmm11, 7
1772        por     xmm1, xmm11
1773        pshufd  xmm0, xmm0, 0x39
1774        pshufd  xmm3, xmm3, 0x4E
1775        pshufd  xmm2, xmm2, 0x93
1776        dec     al
1777        jz      9f
1778        movdqa  xmm8, xmm4
1779        shufps  xmm8, xmm5, 214
1780        pshufd  xmm9, xmm4, 0x0F
1781        pshufd  xmm4, xmm8, 0x39
1782        movdqa  xmm8, xmm6
1783        shufps  xmm8, xmm7, 250
1784        pblendw xmm9, xmm8, 0xCC
1785        movdqa  xmm8, xmm7
1786        punpcklqdq xmm8, xmm5
1787        pblendw xmm8, xmm6, 0xC0
1788        pshufd  xmm8, xmm8, 0x78
1789        punpckhdq xmm5, xmm7
1790        punpckldq xmm6, xmm5
1791        pshufd  xmm7, xmm6, 0x1E
1792        movdqa  xmm5, xmm9
1793        movdqa  xmm6, xmm8
1794        jmp     9b
17959:
1796        pxor    xmm0, xmm2
1797        pxor    xmm1, xmm3
1798        mov     eax, r13d
1799        cmp     rdx, r15
1800        jne     2b
1801        movups  xmmword ptr [rbx], xmm0
1802        movups  xmmword ptr [rbx+0x10], xmm1
1803        jmp     4b
1804
1805.p2align 6
/*
 * blake3_compress_in_place_sse41 (System V AMD64 register arguments)
 *
 * In:   rdi = pointer to 32 bytes of state; read on entry and overwritten
 *             with the result on exit (unaligned access is fine)
 *       rsi = pointer to a 64-byte input block (unaligned access is fine)
 *       dl  = byte value placed in dword 2 of state row 3
 *       rcx = 64-bit value placed in dwords 0-1 of state row 3
 *       r8b = byte value placed in dword 3 of state row 3
 *       (presumably cv, block, block_len, counter, flags in that order;
 *        confirm against the C prototype)
 * Out:  [rdi .. rdi+31] updated in place; no register return value
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15
 * Stack: none used (leaf routine)
 *
 * Only the low byte of rdx and r8 is consumed. Both are zero-extended
 * before being packed, exactly as blake3_compress_xof_sse41 does, so stray
 * upper register bits left by the caller cannot leak into the state.
 */
blake3_compress_in_place_sse41:
_blake3_compress_in_place_sse41:
        _CET_ENDBR
        /* Rows 0 and 1 come from the caller state, row 2 from BLAKE3_IV. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Row 3 = { rcx (64 bits), dl, r8b } with each byte zero-extended. */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /*
         * De-interleave the block: shufps 136 keeps the even-indexed dwords
         * of the two sources, shufps 221 keeps the odd-indexed ones.
         *   xmm4 = words 0,2,4,6      xmm5 = words 1,3,5,7
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        /*
         * Same split for bytes 32..63, followed by a lane rotation
         * (pshufd 0x93) so the words line up with the rotated rows used
         * in the second half of each round.
         */
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        /* pshufb control masks, kept in registers for the whole loop. */
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        mov     al, 7                   /* round counter: loop body runs 7 times */
9:
        /* First half of the round: rows as loaded, message xmm4 then xmm5. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             /* byte shuffle through the ROT16 mask */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* rotate each dword right by 12 */
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             /* byte shuffle through the ROT8 mask */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* rotate each dword right by 7 */
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Rotate the lanes of rows 0, 3 and 2 for the second half. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Second half of the round: message xmm6 then xmm7. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* rotate each dword right by 12 */
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* rotate each dword right by 7 */
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation of rows 0, 3 and 2. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                      /* last round done: skip the reshuffle */
        /*
         * Rearrange the four message vectors xmm4..xmm7 for the next round.
         * xmm8 and xmm9 are scratch; the new xmm5/xmm6 are built there and
         * copied back last because the old values are still being read.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold rows 2 and 3 into rows 0 and 1, then store the 32-byte result. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
1905
/*
 * blake3_compress_xof_sse41 / _blake3_compress_xof_sse41
 *
 * Builds a 4x4 state of 32-bit words in xmm0..xmm3 (one row per register),
 * runs the round body below seven times over it with a 64-byte message,
 * and stores a 64-byte result.
 *
 * Registers read (they follow the System V AMD64 integer-argument order):
 *   rdi  pointer to 32 bytes, only read: loaded as rows 0 and 1, and read
 *        again at the end for the upper half of the output
 *   rsi  pointer to 64 message bytes, only read
 *   dl   low byte of rdx only  -> row 3, lane 2
 *   rcx  full 64 bits          -> row 3, lanes 0 (low half) and 1 (high half)
 *   r8b  low byte of r8 only   -> row 3, lane 3
 *   r9   pointer to 64 bytes, only written
 * NOTE(review): these are presumably cv, block, block_len, counter, flags
 * and out of the C prototype -- confirm against the declaring header.
 *
 * Result:
 *   [r9+0x00] = row0 ^ row2          [r9+0x10] = row1 ^ row3
 *   [r9+0x20] = row2 ^ [rdi]         [r9+0x30] = row3 ^ [rdi+0x10]
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 * Uses no stack and makes no calls.
 * Alignment: every access through rdi, rsi and r9 is an unaligned move;
 * the three constant loads use movaps, so BLAKE3_IV, ROT8 and ROT16 must
 * be 16-byte aligned.
 */
1906.p2align 6
1907blake3_compress_xof_sse41:
1908_blake3_compress_xof_sse41:
1909        _CET_ENDBR
        /* Rows 0-1 come from the caller's 32 bytes, row 2 from the IV table. */
1910        movups  xmm0, xmmword ptr [rdi]
1911        movups  xmm1, xmmword ptr [rdi+0x10]
1912        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Row 3 = { rcx[31:0], rcx[63:32], dl, r8b }: pack dl and r8b into
           one qword (rdx = dl | r8b << 32) and pair it with rcx. */
1913        movzx   eax, r8b
1914        movzx   edx, dl
1915        shl     rax, 32
1916        add     rdx, rax
1917        movq    xmm3, rcx
1918        movq    xmm4, rdx
1919        punpcklqdq xmm3, xmm4
        /* De-interleave the first 32 message bytes:
           xmm4 = dwords {0,2,4,6}, xmm5 = dwords {1,3,5,7}. */
1920        movups  xmm4, xmmword ptr [rsi]
1921        movups  xmm5, xmmword ptr [rsi+0x10]
1922        movaps  xmm8, xmm4
1923        shufps  xmm4, xmm5, 136
1924        shufps  xmm8, xmm5, 221
1925        movaps  xmm5, xmm8
        /* Same split for the last 32 bytes, then rotate lanes by one:
           xmm6 = dwords {14,8,10,12}, xmm7 = dwords {15,9,11,13}. */
1926        movups  xmm6, xmmword ptr [rsi+0x20]
1927        movups  xmm7, xmmword ptr [rsi+0x30]
1928        movaps  xmm8, xmm6
1929        shufps  xmm6, xmm7, 136
1930        pshufd  xmm6, xmm6, 0x93
1931        shufps  xmm8, xmm7, 221
1932        pshufd  xmm7, xmm8, 0x93
        /* pshufb masks kept in registers for the whole loop. */
1933        movaps  xmm14, xmmword ptr [ROT8+rip]
1934        movaps  xmm15, xmmword ptr [ROT16+rip]
        /* al counts the remaining passes through the round body (7 total). */
1935        mov     al, 7
19369:
        /* First half of the round: add message vector xmm4, then mix the
           rows with a 16-bit rotate (pshufb) and a rotate right by 12. */
1937        paddd   xmm0, xmm4
1938        paddd   xmm0, xmm1
1939        pxor    xmm3, xmm0
1940        pshufb  xmm3, xmm15
1941        paddd   xmm2, xmm3
1942        pxor    xmm1, xmm2
1943        movdqa  xmm11, xmm1
1944        pslld   xmm1, 20
1945        psrld   xmm11, 12
1946        por     xmm1, xmm11
        /* Add message vector xmm5; rotate right by 8 (pshufb) and by 7. */
1947        paddd   xmm0, xmm5
1948        paddd   xmm0, xmm1
1949        pxor    xmm3, xmm0
1950        pshufb  xmm3, xmm14
1951        paddd   xmm2, xmm3
1952        pxor    xmm1, xmm2
1953        movdqa  xmm11, xmm1
1954        pslld   xmm1, 25
1955        psrld   xmm11, 7
1956        por     xmm1, xmm11
        /* Rotate the lanes of rows 0, 3 and 2 by one, two and three
           positions (row 1 stays put) so the second half mixes a different
           grouping of words. */
1957        pshufd  xmm0, xmm0, 0x93
1958        pshufd  xmm3, xmm3, 0x4E
1959        pshufd  xmm2, xmm2, 0x39
        /* Second half of the round: same sequence with xmm6, then xmm7. */
1960        paddd   xmm0, xmm6
1961        paddd   xmm0, xmm1
1962        pxor    xmm3, xmm0
1963        pshufb  xmm3, xmm15
1964        paddd   xmm2, xmm3
1965        pxor    xmm1, xmm2
1966        movdqa  xmm11, xmm1
1967        pslld   xmm1, 20
1968        psrld   xmm11, 12
1969        por     xmm1, xmm11
1970        paddd   xmm0, xmm7
1971        paddd   xmm0, xmm1
1972        pxor    xmm3, xmm0
1973        pshufb  xmm3, xmm14
1974        paddd   xmm2, xmm3
1975        pxor    xmm1, xmm2
1976        movdqa  xmm11, xmm1
1977        pslld   xmm1, 25
1978        psrld   xmm11, 7
1979        por     xmm1, xmm11
        /* Undo the lane rotations applied before the second half. */
1980        pshufd  xmm0, xmm0, 0x39
1981        pshufd  xmm3, xmm3, 0x4E
1982        pshufd  xmm2, xmm2, 0x93
        /* Leave after the seventh pass; the message reshuffle below is only
           needed when another pass follows. */
1983        dec     al
1984        jz      9f
        /* Rearrange the message words held in xmm4..xmm7 into the order the
           next pass consumes; xmm8 and xmm9 are scratch. */
1985        movdqa  xmm8, xmm4
1986        shufps  xmm8, xmm5, 214
1987        pshufd  xmm9, xmm4, 0x0F
1988        pshufd  xmm4, xmm8, 0x39
1989        movdqa  xmm8, xmm6
1990        shufps  xmm8, xmm7, 250
1991        pblendw xmm9, xmm8, 0xCC
1992        movdqa  xmm8, xmm7
1993        punpcklqdq xmm8, xmm5
1994        pblendw xmm8, xmm6, 0xC0
1995        pshufd  xmm8, xmm8, 0x78
1996        punpckhdq xmm5, xmm7
1997        punpckldq xmm6, xmm5
1998        pshufd  xmm7, xmm6, 0x1E
1999        movdqa  xmm5, xmm9
2000        movdqa  xmm6, xmm8
2001        jmp     9b
20029:
        /* Reload the caller's 32 input bytes (xmm4/xmm5 are free again). */
2003        movdqu  xmm4, xmmword ptr [rdi]
2004        movdqu  xmm5, xmmword ptr [rdi+0x10]
        /* Lower 32 output bytes: rows 0-1 XOR rows 2-3. These two must run
           before rows 2-3 are overwritten just below. */
2005        pxor    xmm0, xmm2
2006        pxor    xmm1, xmm3
        /* Upper 32 output bytes: rows 2-3 XOR the reloaded input. */
2007        pxor    xmm2, xmm4
2008        pxor    xmm3, xmm5
2009        movups  xmmword ptr [r9], xmm0
2010        movups  xmmword ptr [r9+0x10], xmm1
2011        movups  xmmword ptr [r9+0x20], xmm2
2012        movups  xmmword ptr [r9+0x30], xmm3
2013        ret
2014
2015
/*
 * Read-only constant tables used by the SSE4.1 routines in this file.
 * The block starts on a 64-byte boundary and every table is exactly
 * 16 bytes, so each label is 16-byte aligned. The code reads them with
 * movaps and as pand/pxor memory operands, which need that alignment;
 * keep every table a multiple of 16 bytes when editing.
 */
2016#ifdef __APPLE__
2017.static_data
2018#else
2019.section .rodata
2020#endif
2021.p2align  6
/* Four 32-bit IV words; blake3_compress_xof_sse41 loads them as one
   vector into state row 2. */
2022BLAKE3_IV:
2023        .long  0x6A09E667, 0xBB67AE85
2024        .long  0x3C6EF372, 0xA54FF53A
/* pshufb mask: swaps the 16-bit halves of each dword (rotate by 16). */
2025ROT16:
2026        .byte  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* pshufb mask: rotates each dword right by 8 bits. */
2027ROT8:
2028        .byte  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/* Lane offsets 0..3; ANDed with a broadcast of the negated r9d at the top
   of blake3_hash_many_sse41. */
2029ADD0:
2030        .long  0, 1, 2, 3
/* The value 4 in every lane; ANDed with the same broadcast mask as ADD0. */
2031ADD1:
2032	.long  4, 4, 4, 4
/* BLAKE3_IV word 0 replicated across all four lanes. */
2033BLAKE3_IV_0:
2034	.long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
/* BLAKE3_IV word 1 replicated across all four lanes. */
2035BLAKE3_IV_1:
2036	.long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
/* BLAKE3_IV word 2 replicated across all four lanes. */
2037BLAKE3_IV_2:
2038	.long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
/* BLAKE3_IV word 3 replicated across all four lanes. */
2039BLAKE3_IV_3:
2040	.long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane. */
2041BLAKE3_BLOCK_LEN:
2042	.long  64, 64, 64, 64
/* Sign bit of every lane. blake3_hash_many_sse41 XORs it into both
   operands before pcmpgtd, which makes the signed compare order the
   values as unsigned. */
2043CMP_MSB_MASK:
2044	.long  0x80000000, 0x80000000, 0x80000000, 0x80000000
2045
2046#endif
2047