1#include "llvm_blake3_prefix.h"
2
3.intel_syntax noprefix
4.global blake3_hash_many_sse41
5.global _blake3_hash_many_sse41
6.global blake3_compress_in_place_sse41
7.global _blake3_compress_in_place_sse41
8.global blake3_compress_xof_sse41
9.global _blake3_compress_xof_sse41
10.section .text
11        .p2align  6
12_blake3_hash_many_sse41:
13blake3_hash_many_sse41:
14        push    r15
15        push    r14
16        push    r13
17        push    r12
18        push    rsi
19        push    rdi
20        push    rbx
21        push    rbp
22        mov     rbp, rsp
23        sub     rsp, 528
24        and     rsp, 0xFFFFFFFFFFFFFFC0
25        movdqa  xmmword ptr [rsp+0x170], xmm6
26        movdqa  xmmword ptr [rsp+0x180], xmm7
27        movdqa  xmmword ptr [rsp+0x190], xmm8
28        movdqa  xmmword ptr [rsp+0x1A0], xmm9
29        movdqa  xmmword ptr [rsp+0x1B0], xmm10
30        movdqa  xmmword ptr [rsp+0x1C0], xmm11
31        movdqa  xmmword ptr [rsp+0x1D0], xmm12
32        movdqa  xmmword ptr [rsp+0x1E0], xmm13
33        movdqa  xmmword ptr [rsp+0x1F0], xmm14
34        movdqa  xmmword ptr [rsp+0x200], xmm15
35        mov     rdi, rcx
36        mov     rsi, rdx
37        mov     rdx, r8
38        mov     rcx, r9
39        mov     r8, qword ptr [rbp+0x68]
40        movzx   r9, byte ptr [rbp+0x70]
41        neg     r9d
42        movd    xmm0, r9d
43        pshufd  xmm0, xmm0, 0x00
44        movdqa  xmmword ptr [rsp+0x130], xmm0
45        movdqa  xmm1, xmm0
46        pand    xmm1, xmmword ptr [ADD0+rip]
47        pand    xmm0, xmmword ptr [ADD1+rip]
48        movdqa  xmmword ptr [rsp+0x150], xmm0
49        movd    xmm0, r8d
50        pshufd  xmm0, xmm0, 0x00
51        paddd   xmm0, xmm1
52        movdqa  xmmword ptr [rsp+0x110], xmm0
53        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
54        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
55        pcmpgtd xmm1, xmm0
56        shr     r8, 32
57        movd    xmm2, r8d
58        pshufd  xmm2, xmm2, 0x00
59        psubd   xmm2, xmm1
60        movdqa  xmmword ptr [rsp+0x120], xmm2
61        mov     rbx, qword ptr [rbp+0x90]
62        mov     r15, rdx
63        shl     r15, 6
64        movzx   r13d, byte ptr [rbp+0x78]
65        movzx   r12d, byte ptr [rbp+0x88]
66        cmp     rsi, 4
67        jc      3f
682:
69        movdqu  xmm3, xmmword ptr [rcx]
70        pshufd  xmm0, xmm3, 0x00
71        pshufd  xmm1, xmm3, 0x55
72        pshufd  xmm2, xmm3, 0xAA
73        pshufd  xmm3, xmm3, 0xFF
74        movdqu  xmm7, xmmword ptr [rcx+0x10]
75        pshufd  xmm4, xmm7, 0x00
76        pshufd  xmm5, xmm7, 0x55
77        pshufd  xmm6, xmm7, 0xAA
78        pshufd  xmm7, xmm7, 0xFF
79        mov     r8, qword ptr [rdi]
80        mov     r9, qword ptr [rdi+0x8]
81        mov     r10, qword ptr [rdi+0x10]
82        mov     r11, qword ptr [rdi+0x18]
83        movzx   eax, byte ptr [rbp+0x80]
84        or      eax, r13d
85        xor     edx, edx
869:
87        mov     r14d, eax
88        or      eax, r12d
89        add     rdx, 64
90        cmp     rdx, r15
91        cmovne  eax, r14d
92        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
93        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
94        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
95        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
96        movdqa  xmm12, xmm8
97        punpckldq xmm8, xmm9
98        punpckhdq xmm12, xmm9
99        movdqa  xmm14, xmm10
100        punpckldq xmm10, xmm11
101        punpckhdq xmm14, xmm11
102        movdqa  xmm9, xmm8
103        punpcklqdq xmm8, xmm10
104        punpckhqdq xmm9, xmm10
105        movdqa  xmm13, xmm12
106        punpcklqdq xmm12, xmm14
107        punpckhqdq xmm13, xmm14
108        movdqa  xmmword ptr [rsp], xmm8
109        movdqa  xmmword ptr [rsp+0x10], xmm9
110        movdqa  xmmword ptr [rsp+0x20], xmm12
111        movdqa  xmmword ptr [rsp+0x30], xmm13
112        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
113        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
114        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
115        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
116        movdqa  xmm12, xmm8
117        punpckldq xmm8, xmm9
118        punpckhdq xmm12, xmm9
119        movdqa  xmm14, xmm10
120        punpckldq xmm10, xmm11
121        punpckhdq xmm14, xmm11
122        movdqa  xmm9, xmm8
123        punpcklqdq xmm8, xmm10
124        punpckhqdq xmm9, xmm10
125        movdqa  xmm13, xmm12
126        punpcklqdq xmm12, xmm14
127        punpckhqdq xmm13, xmm14
128        movdqa  xmmword ptr [rsp+0x40], xmm8
129        movdqa  xmmword ptr [rsp+0x50], xmm9
130        movdqa  xmmword ptr [rsp+0x60], xmm12
131        movdqa  xmmword ptr [rsp+0x70], xmm13
132        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
133        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
134        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
135        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
136        movdqa  xmm12, xmm8
137        punpckldq xmm8, xmm9
138        punpckhdq xmm12, xmm9
139        movdqa  xmm14, xmm10
140        punpckldq xmm10, xmm11
141        punpckhdq xmm14, xmm11
142        movdqa  xmm9, xmm8
143        punpcklqdq xmm8, xmm10
144        punpckhqdq xmm9, xmm10
145        movdqa  xmm13, xmm12
146        punpcklqdq xmm12, xmm14
147        punpckhqdq xmm13, xmm14
148        movdqa  xmmword ptr [rsp+0x80], xmm8
149        movdqa  xmmword ptr [rsp+0x90], xmm9
150        movdqa  xmmword ptr [rsp+0xA0], xmm12
151        movdqa  xmmword ptr [rsp+0xB0], xmm13
152        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
153        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
154        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
155        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
156        movdqa  xmm12, xmm8
157        punpckldq xmm8, xmm9
158        punpckhdq xmm12, xmm9
159        movdqa  xmm14, xmm10
160        punpckldq xmm10, xmm11
161        punpckhdq xmm14, xmm11
162        movdqa  xmm9, xmm8
163        punpcklqdq xmm8, xmm10
164        punpckhqdq xmm9, xmm10
165        movdqa  xmm13, xmm12
166        punpcklqdq xmm12, xmm14
167        punpckhqdq xmm13, xmm14
168        movdqa  xmmword ptr [rsp+0xC0], xmm8
169        movdqa  xmmword ptr [rsp+0xD0], xmm9
170        movdqa  xmmword ptr [rsp+0xE0], xmm12
171        movdqa  xmmword ptr [rsp+0xF0], xmm13
172        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
173        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
174        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
175        movdqa  xmm12, xmmword ptr [rsp+0x110]
176        movdqa  xmm13, xmmword ptr [rsp+0x120]
177        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
178        movd    xmm15, eax
179        pshufd  xmm15, xmm15, 0x00
180        prefetcht0 [r8+rdx+0x80]
181        prefetcht0 [r9+rdx+0x80]
182        prefetcht0 [r10+rdx+0x80]
183        prefetcht0 [r11+rdx+0x80]
184        paddd   xmm0, xmmword ptr [rsp]
185        paddd   xmm1, xmmword ptr [rsp+0x20]
186        paddd   xmm2, xmmword ptr [rsp+0x40]
187        paddd   xmm3, xmmword ptr [rsp+0x60]
188        paddd   xmm0, xmm4
189        paddd   xmm1, xmm5
190        paddd   xmm2, xmm6
191        paddd   xmm3, xmm7
192        pxor    xmm12, xmm0
193        pxor    xmm13, xmm1
194        pxor    xmm14, xmm2
195        pxor    xmm15, xmm3
196        movdqa  xmm8, xmmword ptr [ROT16+rip]
197        pshufb  xmm12, xmm8
198        pshufb  xmm13, xmm8
199        pshufb  xmm14, xmm8
200        pshufb  xmm15, xmm8
201        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
202        paddd   xmm8, xmm12
203        paddd   xmm9, xmm13
204        paddd   xmm10, xmm14
205        paddd   xmm11, xmm15
206        pxor    xmm4, xmm8
207        pxor    xmm5, xmm9
208        pxor    xmm6, xmm10
209        pxor    xmm7, xmm11
210        movdqa  xmmword ptr [rsp+0x100], xmm8
211        movdqa  xmm8, xmm4
212        psrld   xmm8, 12
213        pslld   xmm4, 20
214        por     xmm4, xmm8
215        movdqa  xmm8, xmm5
216        psrld   xmm8, 12
217        pslld   xmm5, 20
218        por     xmm5, xmm8
219        movdqa  xmm8, xmm6
220        psrld   xmm8, 12
221        pslld   xmm6, 20
222        por     xmm6, xmm8
223        movdqa  xmm8, xmm7
224        psrld   xmm8, 12
225        pslld   xmm7, 20
226        por     xmm7, xmm8
227        paddd   xmm0, xmmword ptr [rsp+0x10]
228        paddd   xmm1, xmmword ptr [rsp+0x30]
229        paddd   xmm2, xmmword ptr [rsp+0x50]
230        paddd   xmm3, xmmword ptr [rsp+0x70]
231        paddd   xmm0, xmm4
232        paddd   xmm1, xmm5
233        paddd   xmm2, xmm6
234        paddd   xmm3, xmm7
235        pxor    xmm12, xmm0
236        pxor    xmm13, xmm1
237        pxor    xmm14, xmm2
238        pxor    xmm15, xmm3
239        movdqa  xmm8, xmmword ptr [ROT8+rip]
240        pshufb  xmm12, xmm8
241        pshufb  xmm13, xmm8
242        pshufb  xmm14, xmm8
243        pshufb  xmm15, xmm8
244        movdqa  xmm8, xmmword ptr [rsp+0x100]
245        paddd   xmm8, xmm12
246        paddd   xmm9, xmm13
247        paddd   xmm10, xmm14
248        paddd   xmm11, xmm15
249        pxor    xmm4, xmm8
250        pxor    xmm5, xmm9
251        pxor    xmm6, xmm10
252        pxor    xmm7, xmm11
253        movdqa  xmmword ptr [rsp+0x100], xmm8
254        movdqa  xmm8, xmm4
255        psrld   xmm8, 7
256        pslld   xmm4, 25
257        por     xmm4, xmm8
258        movdqa  xmm8, xmm5
259        psrld   xmm8, 7
260        pslld   xmm5, 25
261        por     xmm5, xmm8
262        movdqa  xmm8, xmm6
263        psrld   xmm8, 7
264        pslld   xmm6, 25
265        por     xmm6, xmm8
266        movdqa  xmm8, xmm7
267        psrld   xmm8, 7
268        pslld   xmm7, 25
269        por     xmm7, xmm8
270        paddd   xmm0, xmmword ptr [rsp+0x80]
271        paddd   xmm1, xmmword ptr [rsp+0xA0]
272        paddd   xmm2, xmmword ptr [rsp+0xC0]
273        paddd   xmm3, xmmword ptr [rsp+0xE0]
274        paddd   xmm0, xmm5
275        paddd   xmm1, xmm6
276        paddd   xmm2, xmm7
277        paddd   xmm3, xmm4
278        pxor    xmm15, xmm0
279        pxor    xmm12, xmm1
280        pxor    xmm13, xmm2
281        pxor    xmm14, xmm3
282        movdqa  xmm8, xmmword ptr [ROT16+rip]
283        pshufb  xmm15, xmm8
284        pshufb  xmm12, xmm8
285        pshufb  xmm13, xmm8
286        pshufb  xmm14, xmm8
287        paddd   xmm10, xmm15
288        paddd   xmm11, xmm12
289        movdqa  xmm8, xmmword ptr [rsp+0x100]
290        paddd   xmm8, xmm13
291        paddd   xmm9, xmm14
292        pxor    xmm5, xmm10
293        pxor    xmm6, xmm11
294        pxor    xmm7, xmm8
295        pxor    xmm4, xmm9
296        movdqa  xmmword ptr [rsp+0x100], xmm8
297        movdqa  xmm8, xmm5
298        psrld   xmm8, 12
299        pslld   xmm5, 20
300        por     xmm5, xmm8
301        movdqa  xmm8, xmm6
302        psrld   xmm8, 12
303        pslld   xmm6, 20
304        por     xmm6, xmm8
305        movdqa  xmm8, xmm7
306        psrld   xmm8, 12
307        pslld   xmm7, 20
308        por     xmm7, xmm8
309        movdqa  xmm8, xmm4
310        psrld   xmm8, 12
311        pslld   xmm4, 20
312        por     xmm4, xmm8
313        paddd   xmm0, xmmword ptr [rsp+0x90]
314        paddd   xmm1, xmmword ptr [rsp+0xB0]
315        paddd   xmm2, xmmword ptr [rsp+0xD0]
316        paddd   xmm3, xmmword ptr [rsp+0xF0]
317        paddd   xmm0, xmm5
318        paddd   xmm1, xmm6
319        paddd   xmm2, xmm7
320        paddd   xmm3, xmm4
321        pxor    xmm15, xmm0
322        pxor    xmm12, xmm1
323        pxor    xmm13, xmm2
324        pxor    xmm14, xmm3
325        movdqa  xmm8, xmmword ptr [ROT8+rip]
326        pshufb  xmm15, xmm8
327        pshufb  xmm12, xmm8
328        pshufb  xmm13, xmm8
329        pshufb  xmm14, xmm8
330        paddd   xmm10, xmm15
331        paddd   xmm11, xmm12
332        movdqa  xmm8, xmmword ptr [rsp+0x100]
333        paddd   xmm8, xmm13
334        paddd   xmm9, xmm14
335        pxor    xmm5, xmm10
336        pxor    xmm6, xmm11
337        pxor    xmm7, xmm8
338        pxor    xmm4, xmm9
339        movdqa  xmmword ptr [rsp+0x100], xmm8
340        movdqa  xmm8, xmm5
341        psrld   xmm8, 7
342        pslld   xmm5, 25
343        por     xmm5, xmm8
344        movdqa  xmm8, xmm6
345        psrld   xmm8, 7
346        pslld   xmm6, 25
347        por     xmm6, xmm8
348        movdqa  xmm8, xmm7
349        psrld   xmm8, 7
350        pslld   xmm7, 25
351        por     xmm7, xmm8
352        movdqa  xmm8, xmm4
353        psrld   xmm8, 7
354        pslld   xmm4, 25
355        por     xmm4, xmm8
356        paddd   xmm0, xmmword ptr [rsp+0x20]
357        paddd   xmm1, xmmword ptr [rsp+0x30]
358        paddd   xmm2, xmmword ptr [rsp+0x70]
359        paddd   xmm3, xmmword ptr [rsp+0x40]
360        paddd   xmm0, xmm4
361        paddd   xmm1, xmm5
362        paddd   xmm2, xmm6
363        paddd   xmm3, xmm7
364        pxor    xmm12, xmm0
365        pxor    xmm13, xmm1
366        pxor    xmm14, xmm2
367        pxor    xmm15, xmm3
368        movdqa  xmm8, xmmword ptr [ROT16+rip]
369        pshufb  xmm12, xmm8
370        pshufb  xmm13, xmm8
371        pshufb  xmm14, xmm8
372        pshufb  xmm15, xmm8
373        movdqa  xmm8, xmmword ptr [rsp+0x100]
374        paddd   xmm8, xmm12
375        paddd   xmm9, xmm13
376        paddd   xmm10, xmm14
377        paddd   xmm11, xmm15
378        pxor    xmm4, xmm8
379        pxor    xmm5, xmm9
380        pxor    xmm6, xmm10
381        pxor    xmm7, xmm11
382        movdqa  xmmword ptr [rsp+0x100], xmm8
383        movdqa  xmm8, xmm4
384        psrld   xmm8, 12
385        pslld   xmm4, 20
386        por     xmm4, xmm8
387        movdqa  xmm8, xmm5
388        psrld   xmm8, 12
389        pslld   xmm5, 20
390        por     xmm5, xmm8
391        movdqa  xmm8, xmm6
392        psrld   xmm8, 12
393        pslld   xmm6, 20
394        por     xmm6, xmm8
395        movdqa  xmm8, xmm7
396        psrld   xmm8, 12
397        pslld   xmm7, 20
398        por     xmm7, xmm8
399        paddd   xmm0, xmmword ptr [rsp+0x60]
400        paddd   xmm1, xmmword ptr [rsp+0xA0]
401        paddd   xmm2, xmmword ptr [rsp]
402        paddd   xmm3, xmmword ptr [rsp+0xD0]
403        paddd   xmm0, xmm4
404        paddd   xmm1, xmm5
405        paddd   xmm2, xmm6
406        paddd   xmm3, xmm7
407        pxor    xmm12, xmm0
408        pxor    xmm13, xmm1
409        pxor    xmm14, xmm2
410        pxor    xmm15, xmm3
411        movdqa  xmm8, xmmword ptr [ROT8+rip]
412        pshufb  xmm12, xmm8
413        pshufb  xmm13, xmm8
414        pshufb  xmm14, xmm8
415        pshufb  xmm15, xmm8
416        movdqa  xmm8, xmmword ptr [rsp+0x100]
417        paddd   xmm8, xmm12
418        paddd   xmm9, xmm13
419        paddd   xmm10, xmm14
420        paddd   xmm11, xmm15
421        pxor    xmm4, xmm8
422        pxor    xmm5, xmm9
423        pxor    xmm6, xmm10
424        pxor    xmm7, xmm11
425        movdqa  xmmword ptr [rsp+0x100], xmm8
426        movdqa  xmm8, xmm4
427        psrld   xmm8, 7
428        pslld   xmm4, 25
429        por     xmm4, xmm8
430        movdqa  xmm8, xmm5
431        psrld   xmm8, 7
432        pslld   xmm5, 25
433        por     xmm5, xmm8
434        movdqa  xmm8, xmm6
435        psrld   xmm8, 7
436        pslld   xmm6, 25
437        por     xmm6, xmm8
438        movdqa  xmm8, xmm7
439        psrld   xmm8, 7
440        pslld   xmm7, 25
441        por     xmm7, xmm8
442        paddd   xmm0, xmmword ptr [rsp+0x10]
443        paddd   xmm1, xmmword ptr [rsp+0xC0]
444        paddd   xmm2, xmmword ptr [rsp+0x90]
445        paddd   xmm3, xmmword ptr [rsp+0xF0]
446        paddd   xmm0, xmm5
447        paddd   xmm1, xmm6
448        paddd   xmm2, xmm7
449        paddd   xmm3, xmm4
450        pxor    xmm15, xmm0
451        pxor    xmm12, xmm1
452        pxor    xmm13, xmm2
453        pxor    xmm14, xmm3
454        movdqa  xmm8, xmmword ptr [ROT16+rip]
455        pshufb  xmm15, xmm8
456        pshufb  xmm12, xmm8
457        pshufb  xmm13, xmm8
458        pshufb  xmm14, xmm8
459        paddd   xmm10, xmm15
460        paddd   xmm11, xmm12
461        movdqa  xmm8, xmmword ptr [rsp+0x100]
462        paddd   xmm8, xmm13
463        paddd   xmm9, xmm14
464        pxor    xmm5, xmm10
465        pxor    xmm6, xmm11
466        pxor    xmm7, xmm8
467        pxor    xmm4, xmm9
468        movdqa  xmmword ptr [rsp+0x100], xmm8
469        movdqa  xmm8, xmm5
470        psrld   xmm8, 12
471        pslld   xmm5, 20
472        por     xmm5, xmm8
473        movdqa  xmm8, xmm6
474        psrld   xmm8, 12
475        pslld   xmm6, 20
476        por     xmm6, xmm8
477        movdqa  xmm8, xmm7
478        psrld   xmm8, 12
479        pslld   xmm7, 20
480        por     xmm7, xmm8
481        movdqa  xmm8, xmm4
482        psrld   xmm8, 12
483        pslld   xmm4, 20
484        por     xmm4, xmm8
485        paddd   xmm0, xmmword ptr [rsp+0xB0]
486        paddd   xmm1, xmmword ptr [rsp+0x50]
487        paddd   xmm2, xmmword ptr [rsp+0xE0]
488        paddd   xmm3, xmmword ptr [rsp+0x80]
489        paddd   xmm0, xmm5
490        paddd   xmm1, xmm6
491        paddd   xmm2, xmm7
492        paddd   xmm3, xmm4
493        pxor    xmm15, xmm0
494        pxor    xmm12, xmm1
495        pxor    xmm13, xmm2
496        pxor    xmm14, xmm3
497        movdqa  xmm8, xmmword ptr [ROT8+rip]
498        pshufb  xmm15, xmm8
499        pshufb  xmm12, xmm8
500        pshufb  xmm13, xmm8
501        pshufb  xmm14, xmm8
502        paddd   xmm10, xmm15
503        paddd   xmm11, xmm12
504        movdqa  xmm8, xmmword ptr [rsp+0x100]
505        paddd   xmm8, xmm13
506        paddd   xmm9, xmm14
507        pxor    xmm5, xmm10
508        pxor    xmm6, xmm11
509        pxor    xmm7, xmm8
510        pxor    xmm4, xmm9
511        movdqa  xmmword ptr [rsp+0x100], xmm8
512        movdqa  xmm8, xmm5
513        psrld   xmm8, 7
514        pslld   xmm5, 25
515        por     xmm5, xmm8
516        movdqa  xmm8, xmm6
517        psrld   xmm8, 7
518        pslld   xmm6, 25
519        por     xmm6, xmm8
520        movdqa  xmm8, xmm7
521        psrld   xmm8, 7
522        pslld   xmm7, 25
523        por     xmm7, xmm8
524        movdqa  xmm8, xmm4
525        psrld   xmm8, 7
526        pslld   xmm4, 25
527        por     xmm4, xmm8
528        paddd   xmm0, xmmword ptr [rsp+0x30]
529        paddd   xmm1, xmmword ptr [rsp+0xA0]
530        paddd   xmm2, xmmword ptr [rsp+0xD0]
531        paddd   xmm3, xmmword ptr [rsp+0x70]
532        paddd   xmm0, xmm4
533        paddd   xmm1, xmm5
534        paddd   xmm2, xmm6
535        paddd   xmm3, xmm7
536        pxor    xmm12, xmm0
537        pxor    xmm13, xmm1
538        pxor    xmm14, xmm2
539        pxor    xmm15, xmm3
540        movdqa  xmm8, xmmword ptr [ROT16+rip]
541        pshufb  xmm12, xmm8
542        pshufb  xmm13, xmm8
543        pshufb  xmm14, xmm8
544        pshufb  xmm15, xmm8
545        movdqa  xmm8, xmmword ptr [rsp+0x100]
546        paddd   xmm8, xmm12
547        paddd   xmm9, xmm13
548        paddd   xmm10, xmm14
549        paddd   xmm11, xmm15
550        pxor    xmm4, xmm8
551        pxor    xmm5, xmm9
552        pxor    xmm6, xmm10
553        pxor    xmm7, xmm11
554        movdqa  xmmword ptr [rsp+0x100], xmm8
555        movdqa  xmm8, xmm4
556        psrld   xmm8, 12
557        pslld   xmm4, 20
558        por     xmm4, xmm8
559        movdqa  xmm8, xmm5
560        psrld   xmm8, 12
561        pslld   xmm5, 20
562        por     xmm5, xmm8
563        movdqa  xmm8, xmm6
564        psrld   xmm8, 12
565        pslld   xmm6, 20
566        por     xmm6, xmm8
567        movdqa  xmm8, xmm7
568        psrld   xmm8, 12
569        pslld   xmm7, 20
570        por     xmm7, xmm8
571        paddd   xmm0, xmmword ptr [rsp+0x40]
572        paddd   xmm1, xmmword ptr [rsp+0xC0]
573        paddd   xmm2, xmmword ptr [rsp+0x20]
574        paddd   xmm3, xmmword ptr [rsp+0xE0]
575        paddd   xmm0, xmm4
576        paddd   xmm1, xmm5
577        paddd   xmm2, xmm6
578        paddd   xmm3, xmm7
579        pxor    xmm12, xmm0
580        pxor    xmm13, xmm1
581        pxor    xmm14, xmm2
582        pxor    xmm15, xmm3
583        movdqa  xmm8, xmmword ptr [ROT8+rip]
584        pshufb  xmm12, xmm8
585        pshufb  xmm13, xmm8
586        pshufb  xmm14, xmm8
587        pshufb  xmm15, xmm8
588        movdqa  xmm8, xmmword ptr [rsp+0x100]
589        paddd   xmm8, xmm12
590        paddd   xmm9, xmm13
591        paddd   xmm10, xmm14
592        paddd   xmm11, xmm15
593        pxor    xmm4, xmm8
594        pxor    xmm5, xmm9
595        pxor    xmm6, xmm10
596        pxor    xmm7, xmm11
597        movdqa  xmmword ptr [rsp+0x100], xmm8
598        movdqa  xmm8, xmm4
599        psrld   xmm8, 7
600        pslld   xmm4, 25
601        por     xmm4, xmm8
602        movdqa  xmm8, xmm5
603        psrld   xmm8, 7
604        pslld   xmm5, 25
605        por     xmm5, xmm8
606        movdqa  xmm8, xmm6
607        psrld   xmm8, 7
608        pslld   xmm6, 25
609        por     xmm6, xmm8
610        movdqa  xmm8, xmm7
611        psrld   xmm8, 7
612        pslld   xmm7, 25
613        por     xmm7, xmm8
614        paddd   xmm0, xmmword ptr [rsp+0x60]
615        paddd   xmm1, xmmword ptr [rsp+0x90]
616        paddd   xmm2, xmmword ptr [rsp+0xB0]
617        paddd   xmm3, xmmword ptr [rsp+0x80]
618        paddd   xmm0, xmm5
619        paddd   xmm1, xmm6
620        paddd   xmm2, xmm7
621        paddd   xmm3, xmm4
622        pxor    xmm15, xmm0
623        pxor    xmm12, xmm1
624        pxor    xmm13, xmm2
625        pxor    xmm14, xmm3
626        movdqa  xmm8, xmmword ptr [ROT16+rip]
627        pshufb  xmm15, xmm8
628        pshufb  xmm12, xmm8
629        pshufb  xmm13, xmm8
630        pshufb  xmm14, xmm8
631        paddd   xmm10, xmm15
632        paddd   xmm11, xmm12
633        movdqa  xmm8, xmmword ptr [rsp+0x100]
634        paddd   xmm8, xmm13
635        paddd   xmm9, xmm14
636        pxor    xmm5, xmm10
637        pxor    xmm6, xmm11
638        pxor    xmm7, xmm8
639        pxor    xmm4, xmm9
640        movdqa  xmmword ptr [rsp+0x100], xmm8
641        movdqa  xmm8, xmm5
642        psrld   xmm8, 12
643        pslld   xmm5, 20
644        por     xmm5, xmm8
645        movdqa  xmm8, xmm6
646        psrld   xmm8, 12
647        pslld   xmm6, 20
648        por     xmm6, xmm8
649        movdqa  xmm8, xmm7
650        psrld   xmm8, 12
651        pslld   xmm7, 20
652        por     xmm7, xmm8
653        movdqa  xmm8, xmm4
654        psrld   xmm8, 12
655        pslld   xmm4, 20
656        por     xmm4, xmm8
657        paddd   xmm0, xmmword ptr [rsp+0x50]
658        paddd   xmm1, xmmword ptr [rsp]
659        paddd   xmm2, xmmword ptr [rsp+0xF0]
660        paddd   xmm3, xmmword ptr [rsp+0x10]
661        paddd   xmm0, xmm5
662        paddd   xmm1, xmm6
663        paddd   xmm2, xmm7
664        paddd   xmm3, xmm4
665        pxor    xmm15, xmm0
666        pxor    xmm12, xmm1
667        pxor    xmm13, xmm2
668        pxor    xmm14, xmm3
669        movdqa  xmm8, xmmword ptr [ROT8+rip]
670        pshufb  xmm15, xmm8
671        pshufb  xmm12, xmm8
672        pshufb  xmm13, xmm8
673        pshufb  xmm14, xmm8
674        paddd   xmm10, xmm15
675        paddd   xmm11, xmm12
676        movdqa  xmm8, xmmword ptr [rsp+0x100]
677        paddd   xmm8, xmm13
678        paddd   xmm9, xmm14
679        pxor    xmm5, xmm10
680        pxor    xmm6, xmm11
681        pxor    xmm7, xmm8
682        pxor    xmm4, xmm9
683        movdqa  xmmword ptr [rsp+0x100], xmm8
684        movdqa  xmm8, xmm5
685        psrld   xmm8, 7
686        pslld   xmm5, 25
687        por     xmm5, xmm8
688        movdqa  xmm8, xmm6
689        psrld   xmm8, 7
690        pslld   xmm6, 25
691        por     xmm6, xmm8
692        movdqa  xmm8, xmm7
693        psrld   xmm8, 7
694        pslld   xmm7, 25
695        por     xmm7, xmm8
696        movdqa  xmm8, xmm4
697        psrld   xmm8, 7
698        pslld   xmm4, 25
699        por     xmm4, xmm8
700        paddd   xmm0, xmmword ptr [rsp+0xA0]
701        paddd   xmm1, xmmword ptr [rsp+0xC0]
702        paddd   xmm2, xmmword ptr [rsp+0xE0]
703        paddd   xmm3, xmmword ptr [rsp+0xD0]
704        paddd   xmm0, xmm4
705        paddd   xmm1, xmm5
706        paddd   xmm2, xmm6
707        paddd   xmm3, xmm7
708        pxor    xmm12, xmm0
709        pxor    xmm13, xmm1
710        pxor    xmm14, xmm2
711        pxor    xmm15, xmm3
712        movdqa  xmm8, xmmword ptr [ROT16+rip]
713        pshufb  xmm12, xmm8
714        pshufb  xmm13, xmm8
715        pshufb  xmm14, xmm8
716        pshufb  xmm15, xmm8
717        movdqa  xmm8, xmmword ptr [rsp+0x100]
718        paddd   xmm8, xmm12
719        paddd   xmm9, xmm13
720        paddd   xmm10, xmm14
721        paddd   xmm11, xmm15
722        pxor    xmm4, xmm8
723        pxor    xmm5, xmm9
724        pxor    xmm6, xmm10
725        pxor    xmm7, xmm11
726        movdqa  xmmword ptr [rsp+0x100], xmm8
727        movdqa  xmm8, xmm4
728        psrld   xmm8, 12
729        pslld   xmm4, 20
730        por     xmm4, xmm8
731        movdqa  xmm8, xmm5
732        psrld   xmm8, 12
733        pslld   xmm5, 20
734        por     xmm5, xmm8
735        movdqa  xmm8, xmm6
736        psrld   xmm8, 12
737        pslld   xmm6, 20
738        por     xmm6, xmm8
739        movdqa  xmm8, xmm7
740        psrld   xmm8, 12
741        pslld   xmm7, 20
742        por     xmm7, xmm8
743        paddd   xmm0, xmmword ptr [rsp+0x70]
744        paddd   xmm1, xmmword ptr [rsp+0x90]
745        paddd   xmm2, xmmword ptr [rsp+0x30]
746        paddd   xmm3, xmmword ptr [rsp+0xF0]
747        paddd   xmm0, xmm4
748        paddd   xmm1, xmm5
749        paddd   xmm2, xmm6
750        paddd   xmm3, xmm7
751        pxor    xmm12, xmm0
752        pxor    xmm13, xmm1
753        pxor    xmm14, xmm2
754        pxor    xmm15, xmm3
755        movdqa  xmm8, xmmword ptr [ROT8+rip]
756        pshufb  xmm12, xmm8
757        pshufb  xmm13, xmm8
758        pshufb  xmm14, xmm8
759        pshufb  xmm15, xmm8
760        movdqa  xmm8, xmmword ptr [rsp+0x100]
761        paddd   xmm8, xmm12
762        paddd   xmm9, xmm13
763        paddd   xmm10, xmm14
764        paddd   xmm11, xmm15
765        pxor    xmm4, xmm8
766        pxor    xmm5, xmm9
767        pxor    xmm6, xmm10
768        pxor    xmm7, xmm11
769        movdqa  xmmword ptr [rsp+0x100], xmm8
770        movdqa  xmm8, xmm4
771        psrld   xmm8, 7
772        pslld   xmm4, 25
773        por     xmm4, xmm8
774        movdqa  xmm8, xmm5
775        psrld   xmm8, 7
776        pslld   xmm5, 25
777        por     xmm5, xmm8
778        movdqa  xmm8, xmm6
779        psrld   xmm8, 7
780        pslld   xmm6, 25
781        por     xmm6, xmm8
782        movdqa  xmm8, xmm7
783        psrld   xmm8, 7
784        pslld   xmm7, 25
785        por     xmm7, xmm8
786        paddd   xmm0, xmmword ptr [rsp+0x40]
787        paddd   xmm1, xmmword ptr [rsp+0xB0]
788        paddd   xmm2, xmmword ptr [rsp+0x50]
789        paddd   xmm3, xmmword ptr [rsp+0x10]
790        paddd   xmm0, xmm5
791        paddd   xmm1, xmm6
792        paddd   xmm2, xmm7
793        paddd   xmm3, xmm4
794        pxor    xmm15, xmm0
795        pxor    xmm12, xmm1
796        pxor    xmm13, xmm2
797        pxor    xmm14, xmm3
798        movdqa  xmm8, xmmword ptr [ROT16+rip]
799        pshufb  xmm15, xmm8
800        pshufb  xmm12, xmm8
801        pshufb  xmm13, xmm8
802        pshufb  xmm14, xmm8
803        paddd   xmm10, xmm15
804        paddd   xmm11, xmm12
805        movdqa  xmm8, xmmword ptr [rsp+0x100]
806        paddd   xmm8, xmm13
807        paddd   xmm9, xmm14
808        pxor    xmm5, xmm10
809        pxor    xmm6, xmm11
810        pxor    xmm7, xmm8
811        pxor    xmm4, xmm9
812        movdqa  xmmword ptr [rsp+0x100], xmm8
813        movdqa  xmm8, xmm5
814        psrld   xmm8, 12
815        pslld   xmm5, 20
816        por     xmm5, xmm8
817        movdqa  xmm8, xmm6
818        psrld   xmm8, 12
819        pslld   xmm6, 20
820        por     xmm6, xmm8
821        movdqa  xmm8, xmm7
822        psrld   xmm8, 12
823        pslld   xmm7, 20
824        por     xmm7, xmm8
825        movdqa  xmm8, xmm4
826        psrld   xmm8, 12
827        pslld   xmm4, 20
828        por     xmm4, xmm8
829        paddd   xmm0, xmmword ptr [rsp]
830        paddd   xmm1, xmmword ptr [rsp+0x20]
831        paddd   xmm2, xmmword ptr [rsp+0x80]
832        paddd   xmm3, xmmword ptr [rsp+0x60]
833        paddd   xmm0, xmm5
834        paddd   xmm1, xmm6
835        paddd   xmm2, xmm7
836        paddd   xmm3, xmm4
837        pxor    xmm15, xmm0
838        pxor    xmm12, xmm1
839        pxor    xmm13, xmm2
840        pxor    xmm14, xmm3
841        movdqa  xmm8, xmmword ptr [ROT8+rip]
842        pshufb  xmm15, xmm8
843        pshufb  xmm12, xmm8
844        pshufb  xmm13, xmm8
845        pshufb  xmm14, xmm8
846        paddd   xmm10, xmm15
847        paddd   xmm11, xmm12
848        movdqa  xmm8, xmmword ptr [rsp+0x100]
849        paddd   xmm8, xmm13
850        paddd   xmm9, xmm14
851        pxor    xmm5, xmm10
852        pxor    xmm6, xmm11
853        pxor    xmm7, xmm8
854        pxor    xmm4, xmm9
855        movdqa  xmmword ptr [rsp+0x100], xmm8
856        movdqa  xmm8, xmm5
857        psrld   xmm8, 7
858        pslld   xmm5, 25
859        por     xmm5, xmm8
860        movdqa  xmm8, xmm6
861        psrld   xmm8, 7
862        pslld   xmm6, 25
863        por     xmm6, xmm8
864        movdqa  xmm8, xmm7
865        psrld   xmm8, 7
866        pslld   xmm7, 25
867        por     xmm7, xmm8
868        movdqa  xmm8, xmm4
869        psrld   xmm8, 7
870        pslld   xmm4, 25
871        por     xmm4, xmm8
872        paddd   xmm0, xmmword ptr [rsp+0xC0]
873        paddd   xmm1, xmmword ptr [rsp+0x90]
874        paddd   xmm2, xmmword ptr [rsp+0xF0]
875        paddd   xmm3, xmmword ptr [rsp+0xE0]
876        paddd   xmm0, xmm4
877        paddd   xmm1, xmm5
878        paddd   xmm2, xmm6
879        paddd   xmm3, xmm7
880        pxor    xmm12, xmm0
881        pxor    xmm13, xmm1
882        pxor    xmm14, xmm2
883        pxor    xmm15, xmm3
884        movdqa  xmm8, xmmword ptr [ROT16+rip]
885        pshufb  xmm12, xmm8
886        pshufb  xmm13, xmm8
887        pshufb  xmm14, xmm8
888        pshufb  xmm15, xmm8
889        movdqa  xmm8, xmmword ptr [rsp+0x100]
890        paddd   xmm8, xmm12
891        paddd   xmm9, xmm13
892        paddd   xmm10, xmm14
893        paddd   xmm11, xmm15
894        pxor    xmm4, xmm8
895        pxor    xmm5, xmm9
896        pxor    xmm6, xmm10
897        pxor    xmm7, xmm11
898        movdqa  xmmword ptr [rsp+0x100], xmm8
899        movdqa  xmm8, xmm4
900        psrld   xmm8, 12
901        pslld   xmm4, 20
902        por     xmm4, xmm8
903        movdqa  xmm8, xmm5
904        psrld   xmm8, 12
905        pslld   xmm5, 20
906        por     xmm5, xmm8
907        movdqa  xmm8, xmm6
908        psrld   xmm8, 12
909        pslld   xmm6, 20
910        por     xmm6, xmm8
911        movdqa  xmm8, xmm7
912        psrld   xmm8, 12
913        pslld   xmm7, 20
914        por     xmm7, xmm8
915        paddd   xmm0, xmmword ptr [rsp+0xD0]
916        paddd   xmm1, xmmword ptr [rsp+0xB0]
917        paddd   xmm2, xmmword ptr [rsp+0xA0]
918        paddd   xmm3, xmmword ptr [rsp+0x80]
919        paddd   xmm0, xmm4
920        paddd   xmm1, xmm5
921        paddd   xmm2, xmm6
922        paddd   xmm3, xmm7
923        pxor    xmm12, xmm0
924        pxor    xmm13, xmm1
925        pxor    xmm14, xmm2
926        pxor    xmm15, xmm3
927        movdqa  xmm8, xmmword ptr [ROT8+rip]
928        pshufb  xmm12, xmm8
929        pshufb  xmm13, xmm8
930        pshufb  xmm14, xmm8
931        pshufb  xmm15, xmm8
932        movdqa  xmm8, xmmword ptr [rsp+0x100]
933        paddd   xmm8, xmm12
934        paddd   xmm9, xmm13
935        paddd   xmm10, xmm14
936        paddd   xmm11, xmm15
937        pxor    xmm4, xmm8
938        pxor    xmm5, xmm9
939        pxor    xmm6, xmm10
940        pxor    xmm7, xmm11
941        movdqa  xmmword ptr [rsp+0x100], xmm8
942        movdqa  xmm8, xmm4
943        psrld   xmm8, 7
944        pslld   xmm4, 25
945        por     xmm4, xmm8
946        movdqa  xmm8, xmm5
947        psrld   xmm8, 7
948        pslld   xmm5, 25
949        por     xmm5, xmm8
950        movdqa  xmm8, xmm6
951        psrld   xmm8, 7
952        pslld   xmm6, 25
953        por     xmm6, xmm8
954        movdqa  xmm8, xmm7
955        psrld   xmm8, 7
956        pslld   xmm7, 25
957        por     xmm7, xmm8
958        paddd   xmm0, xmmword ptr [rsp+0x70]
959        paddd   xmm1, xmmword ptr [rsp+0x50]
960        paddd   xmm2, xmmword ptr [rsp]
961        paddd   xmm3, xmmword ptr [rsp+0x60]
962        paddd   xmm0, xmm5
963        paddd   xmm1, xmm6
964        paddd   xmm2, xmm7
965        paddd   xmm3, xmm4
966        pxor    xmm15, xmm0
967        pxor    xmm12, xmm1
968        pxor    xmm13, xmm2
969        pxor    xmm14, xmm3
970        movdqa  xmm8, xmmword ptr [ROT16+rip]
971        pshufb  xmm15, xmm8
972        pshufb  xmm12, xmm8
973        pshufb  xmm13, xmm8
974        pshufb  xmm14, xmm8
975        paddd   xmm10, xmm15
976        paddd   xmm11, xmm12
977        movdqa  xmm8, xmmword ptr [rsp+0x100]
978        paddd   xmm8, xmm13
979        paddd   xmm9, xmm14
980        pxor    xmm5, xmm10
981        pxor    xmm6, xmm11
982        pxor    xmm7, xmm8
983        pxor    xmm4, xmm9
984        movdqa  xmmword ptr [rsp+0x100], xmm8
985        movdqa  xmm8, xmm5
986        psrld   xmm8, 12
987        pslld   xmm5, 20
988        por     xmm5, xmm8
989        movdqa  xmm8, xmm6
990        psrld   xmm8, 12
991        pslld   xmm6, 20
992        por     xmm6, xmm8
993        movdqa  xmm8, xmm7
994        psrld   xmm8, 12
995        pslld   xmm7, 20
996        por     xmm7, xmm8
997        movdqa  xmm8, xmm4
998        psrld   xmm8, 12
999        pslld   xmm4, 20
1000        por     xmm4, xmm8
1001        paddd   xmm0, xmmword ptr [rsp+0x20]
1002        paddd   xmm1, xmmword ptr [rsp+0x30]
1003        paddd   xmm2, xmmword ptr [rsp+0x10]
1004        paddd   xmm3, xmmword ptr [rsp+0x40]
1005        paddd   xmm0, xmm5
1006        paddd   xmm1, xmm6
1007        paddd   xmm2, xmm7
1008        paddd   xmm3, xmm4
1009        pxor    xmm15, xmm0
1010        pxor    xmm12, xmm1
1011        pxor    xmm13, xmm2
1012        pxor    xmm14, xmm3
1013        movdqa  xmm8, xmmword ptr [ROT8+rip]
1014        pshufb  xmm15, xmm8
1015        pshufb  xmm12, xmm8
1016        pshufb  xmm13, xmm8
1017        pshufb  xmm14, xmm8
1018        paddd   xmm10, xmm15
1019        paddd   xmm11, xmm12
1020        movdqa  xmm8, xmmword ptr [rsp+0x100]
1021        paddd   xmm8, xmm13
1022        paddd   xmm9, xmm14
1023        pxor    xmm5, xmm10
1024        pxor    xmm6, xmm11
1025        pxor    xmm7, xmm8
1026        pxor    xmm4, xmm9
1027        movdqa  xmmword ptr [rsp+0x100], xmm8
1028        movdqa  xmm8, xmm5
1029        psrld   xmm8, 7
1030        pslld   xmm5, 25
1031        por     xmm5, xmm8
1032        movdqa  xmm8, xmm6
1033        psrld   xmm8, 7
1034        pslld   xmm6, 25
1035        por     xmm6, xmm8
1036        movdqa  xmm8, xmm7
1037        psrld   xmm8, 7
1038        pslld   xmm7, 25
1039        por     xmm7, xmm8
1040        movdqa  xmm8, xmm4
1041        psrld   xmm8, 7
1042        pslld   xmm4, 25
1043        por     xmm4, xmm8
1044        paddd   xmm0, xmmword ptr [rsp+0x90]
1045        paddd   xmm1, xmmword ptr [rsp+0xB0]
1046        paddd   xmm2, xmmword ptr [rsp+0x80]
1047        paddd   xmm3, xmmword ptr [rsp+0xF0]
1048        paddd   xmm0, xmm4
1049        paddd   xmm1, xmm5
1050        paddd   xmm2, xmm6
1051        paddd   xmm3, xmm7
1052        pxor    xmm12, xmm0
1053        pxor    xmm13, xmm1
1054        pxor    xmm14, xmm2
1055        pxor    xmm15, xmm3
1056        movdqa  xmm8, xmmword ptr [ROT16+rip]
1057        pshufb  xmm12, xmm8
1058        pshufb  xmm13, xmm8
1059        pshufb  xmm14, xmm8
1060        pshufb  xmm15, xmm8
1061        movdqa  xmm8, xmmword ptr [rsp+0x100]
1062        paddd   xmm8, xmm12
1063        paddd   xmm9, xmm13
1064        paddd   xmm10, xmm14
1065        paddd   xmm11, xmm15
1066        pxor    xmm4, xmm8
1067        pxor    xmm5, xmm9
1068        pxor    xmm6, xmm10
1069        pxor    xmm7, xmm11
1070        movdqa  xmmword ptr [rsp+0x100], xmm8
1071        movdqa  xmm8, xmm4
1072        psrld   xmm8, 12
1073        pslld   xmm4, 20
1074        por     xmm4, xmm8
1075        movdqa  xmm8, xmm5
1076        psrld   xmm8, 12
1077        pslld   xmm5, 20
1078        por     xmm5, xmm8
1079        movdqa  xmm8, xmm6
1080        psrld   xmm8, 12
1081        pslld   xmm6, 20
1082        por     xmm6, xmm8
1083        movdqa  xmm8, xmm7
1084        psrld   xmm8, 12
1085        pslld   xmm7, 20
1086        por     xmm7, xmm8
1087        paddd   xmm0, xmmword ptr [rsp+0xE0]
1088        paddd   xmm1, xmmword ptr [rsp+0x50]
1089        paddd   xmm2, xmmword ptr [rsp+0xC0]
1090        paddd   xmm3, xmmword ptr [rsp+0x10]
1091        paddd   xmm0, xmm4
1092        paddd   xmm1, xmm5
1093        paddd   xmm2, xmm6
1094        paddd   xmm3, xmm7
1095        pxor    xmm12, xmm0
1096        pxor    xmm13, xmm1
1097        pxor    xmm14, xmm2
1098        pxor    xmm15, xmm3
1099        movdqa  xmm8, xmmword ptr [ROT8+rip]
1100        pshufb  xmm12, xmm8
1101        pshufb  xmm13, xmm8
1102        pshufb  xmm14, xmm8
1103        pshufb  xmm15, xmm8
1104        movdqa  xmm8, xmmword ptr [rsp+0x100]
1105        paddd   xmm8, xmm12
1106        paddd   xmm9, xmm13
1107        paddd   xmm10, xmm14
1108        paddd   xmm11, xmm15
1109        pxor    xmm4, xmm8
1110        pxor    xmm5, xmm9
1111        pxor    xmm6, xmm10
1112        pxor    xmm7, xmm11
1113        movdqa  xmmword ptr [rsp+0x100], xmm8
1114        movdqa  xmm8, xmm4
1115        psrld   xmm8, 7
1116        pslld   xmm4, 25
1117        por     xmm4, xmm8
1118        movdqa  xmm8, xmm5
1119        psrld   xmm8, 7
1120        pslld   xmm5, 25
1121        por     xmm5, xmm8
1122        movdqa  xmm8, xmm6
1123        psrld   xmm8, 7
1124        pslld   xmm6, 25
1125        por     xmm6, xmm8
1126        movdqa  xmm8, xmm7
1127        psrld   xmm8, 7
1128        pslld   xmm7, 25
1129        por     xmm7, xmm8
1130        paddd   xmm0, xmmword ptr [rsp+0xD0]
1131        paddd   xmm1, xmmword ptr [rsp]
1132        paddd   xmm2, xmmword ptr [rsp+0x20]
1133        paddd   xmm3, xmmword ptr [rsp+0x40]
1134        paddd   xmm0, xmm5
1135        paddd   xmm1, xmm6
1136        paddd   xmm2, xmm7
1137        paddd   xmm3, xmm4
1138        pxor    xmm15, xmm0
1139        pxor    xmm12, xmm1
1140        pxor    xmm13, xmm2
1141        pxor    xmm14, xmm3
1142        movdqa  xmm8, xmmword ptr [ROT16+rip]
1143        pshufb  xmm15, xmm8
1144        pshufb  xmm12, xmm8
1145        pshufb  xmm13, xmm8
1146        pshufb  xmm14, xmm8
1147        paddd   xmm10, xmm15
1148        paddd   xmm11, xmm12
1149        movdqa  xmm8, xmmword ptr [rsp+0x100]
1150        paddd   xmm8, xmm13
1151        paddd   xmm9, xmm14
1152        pxor    xmm5, xmm10
1153        pxor    xmm6, xmm11
1154        pxor    xmm7, xmm8
1155        pxor    xmm4, xmm9
1156        movdqa  xmmword ptr [rsp+0x100], xmm8
1157        movdqa  xmm8, xmm5
1158        psrld   xmm8, 12
1159        pslld   xmm5, 20
1160        por     xmm5, xmm8
1161        movdqa  xmm8, xmm6
1162        psrld   xmm8, 12
1163        pslld   xmm6, 20
1164        por     xmm6, xmm8
1165        movdqa  xmm8, xmm7
1166        psrld   xmm8, 12
1167        pslld   xmm7, 20
1168        por     xmm7, xmm8
1169        movdqa  xmm8, xmm4
1170        psrld   xmm8, 12
1171        pslld   xmm4, 20
1172        por     xmm4, xmm8
1173        paddd   xmm0, xmmword ptr [rsp+0x30]
1174        paddd   xmm1, xmmword ptr [rsp+0xA0]
1175        paddd   xmm2, xmmword ptr [rsp+0x60]
1176        paddd   xmm3, xmmword ptr [rsp+0x70]
1177        paddd   xmm0, xmm5
1178        paddd   xmm1, xmm6
1179        paddd   xmm2, xmm7
1180        paddd   xmm3, xmm4
1181        pxor    xmm15, xmm0
1182        pxor    xmm12, xmm1
1183        pxor    xmm13, xmm2
1184        pxor    xmm14, xmm3
1185        movdqa  xmm8, xmmword ptr [ROT8+rip]
1186        pshufb  xmm15, xmm8
1187        pshufb  xmm12, xmm8
1188        pshufb  xmm13, xmm8
1189        pshufb  xmm14, xmm8
1190        paddd   xmm10, xmm15
1191        paddd   xmm11, xmm12
1192        movdqa  xmm8, xmmword ptr [rsp+0x100]
1193        paddd   xmm8, xmm13
1194        paddd   xmm9, xmm14
1195        pxor    xmm5, xmm10
1196        pxor    xmm6, xmm11
1197        pxor    xmm7, xmm8
1198        pxor    xmm4, xmm9
1199        movdqa  xmmword ptr [rsp+0x100], xmm8
1200        movdqa  xmm8, xmm5
1201        psrld   xmm8, 7
1202        pslld   xmm5, 25
1203        por     xmm5, xmm8
1204        movdqa  xmm8, xmm6
1205        psrld   xmm8, 7
1206        pslld   xmm6, 25
1207        por     xmm6, xmm8
1208        movdqa  xmm8, xmm7
1209        psrld   xmm8, 7
1210        pslld   xmm7, 25
1211        por     xmm7, xmm8
1212        movdqa  xmm8, xmm4
1213        psrld   xmm8, 7
1214        pslld   xmm4, 25
1215        por     xmm4, xmm8
1216        paddd   xmm0, xmmword ptr [rsp+0xB0]
1217        paddd   xmm1, xmmword ptr [rsp+0x50]
1218        paddd   xmm2, xmmword ptr [rsp+0x10]
1219        paddd   xmm3, xmmword ptr [rsp+0x80]
1220        paddd   xmm0, xmm4
1221        paddd   xmm1, xmm5
1222        paddd   xmm2, xmm6
1223        paddd   xmm3, xmm7
1224        pxor    xmm12, xmm0
1225        pxor    xmm13, xmm1
1226        pxor    xmm14, xmm2
1227        pxor    xmm15, xmm3
1228        movdqa  xmm8, xmmword ptr [ROT16+rip]
1229        pshufb  xmm12, xmm8
1230        pshufb  xmm13, xmm8
1231        pshufb  xmm14, xmm8
1232        pshufb  xmm15, xmm8
1233        movdqa  xmm8, xmmword ptr [rsp+0x100]
1234        paddd   xmm8, xmm12
1235        paddd   xmm9, xmm13
1236        paddd   xmm10, xmm14
1237        paddd   xmm11, xmm15
1238        pxor    xmm4, xmm8
1239        pxor    xmm5, xmm9
1240        pxor    xmm6, xmm10
1241        pxor    xmm7, xmm11
1242        movdqa  xmmword ptr [rsp+0x100], xmm8
1243        movdqa  xmm8, xmm4
1244        psrld   xmm8, 12
1245        pslld   xmm4, 20
1246        por     xmm4, xmm8
1247        movdqa  xmm8, xmm5
1248        psrld   xmm8, 12
1249        pslld   xmm5, 20
1250        por     xmm5, xmm8
1251        movdqa  xmm8, xmm6
1252        psrld   xmm8, 12
1253        pslld   xmm6, 20
1254        por     xmm6, xmm8
1255        movdqa  xmm8, xmm7
1256        psrld   xmm8, 12
1257        pslld   xmm7, 20
1258        por     xmm7, xmm8
1259        paddd   xmm0, xmmword ptr [rsp+0xF0]
1260        paddd   xmm1, xmmword ptr [rsp]
1261        paddd   xmm2, xmmword ptr [rsp+0x90]
1262        paddd   xmm3, xmmword ptr [rsp+0x60]
1263        paddd   xmm0, xmm4
1264        paddd   xmm1, xmm5
1265        paddd   xmm2, xmm6
1266        paddd   xmm3, xmm7
1267        pxor    xmm12, xmm0
1268        pxor    xmm13, xmm1
1269        pxor    xmm14, xmm2
1270        pxor    xmm15, xmm3
1271        movdqa  xmm8, xmmword ptr [ROT8+rip]
1272        pshufb  xmm12, xmm8
1273        pshufb  xmm13, xmm8
1274        pshufb  xmm14, xmm8
1275        pshufb  xmm15, xmm8
1276        movdqa  xmm8, xmmword ptr [rsp+0x100]
1277        paddd   xmm8, xmm12
1278        paddd   xmm9, xmm13
1279        paddd   xmm10, xmm14
1280        paddd   xmm11, xmm15
1281        pxor    xmm4, xmm8
1282        pxor    xmm5, xmm9
1283        pxor    xmm6, xmm10
1284        pxor    xmm7, xmm11
1285        movdqa  xmmword ptr [rsp+0x100], xmm8
1286        movdqa  xmm8, xmm4
1287        psrld   xmm8, 7
1288        pslld   xmm4, 25
1289        por     xmm4, xmm8
1290        movdqa  xmm8, xmm5
1291        psrld   xmm8, 7
1292        pslld   xmm5, 25
1293        por     xmm5, xmm8
1294        movdqa  xmm8, xmm6
1295        psrld   xmm8, 7
1296        pslld   xmm6, 25
1297        por     xmm6, xmm8
1298        movdqa  xmm8, xmm7
1299        psrld   xmm8, 7
1300        pslld   xmm7, 25
1301        por     xmm7, xmm8
1302        paddd   xmm0, xmmword ptr [rsp+0xE0]
1303        paddd   xmm1, xmmword ptr [rsp+0x20]
1304        paddd   xmm2, xmmword ptr [rsp+0x30]
1305        paddd   xmm3, xmmword ptr [rsp+0x70]
1306        paddd   xmm0, xmm5
1307        paddd   xmm1, xmm6
1308        paddd   xmm2, xmm7
1309        paddd   xmm3, xmm4
1310        pxor    xmm15, xmm0
1311        pxor    xmm12, xmm1
1312        pxor    xmm13, xmm2
1313        pxor    xmm14, xmm3
1314        movdqa  xmm8, xmmword ptr [ROT16+rip]
1315        pshufb  xmm15, xmm8
1316        pshufb  xmm12, xmm8
1317        pshufb  xmm13, xmm8
1318        pshufb  xmm14, xmm8
1319        paddd   xmm10, xmm15
1320        paddd   xmm11, xmm12
1321        movdqa  xmm8, xmmword ptr [rsp+0x100]
1322        paddd   xmm8, xmm13
1323        paddd   xmm9, xmm14
1324        pxor    xmm5, xmm10
1325        pxor    xmm6, xmm11
1326        pxor    xmm7, xmm8
1327        pxor    xmm4, xmm9
1328        movdqa  xmmword ptr [rsp+0x100], xmm8
1329        movdqa  xmm8, xmm5
1330        psrld   xmm8, 12
1331        pslld   xmm5, 20
1332        por     xmm5, xmm8
1333        movdqa  xmm8, xmm6
1334        psrld   xmm8, 12
1335        pslld   xmm6, 20
1336        por     xmm6, xmm8
1337        movdqa  xmm8, xmm7
1338        psrld   xmm8, 12
1339        pslld   xmm7, 20
1340        por     xmm7, xmm8
1341        movdqa  xmm8, xmm4
1342        psrld   xmm8, 12
1343        pslld   xmm4, 20
1344        por     xmm4, xmm8
1345        paddd   xmm0, xmmword ptr [rsp+0xA0]
1346        paddd   xmm1, xmmword ptr [rsp+0xC0]
1347        paddd   xmm2, xmmword ptr [rsp+0x40]
1348        paddd   xmm3, xmmword ptr [rsp+0xD0]
1349        paddd   xmm0, xmm5
1350        paddd   xmm1, xmm6
1351        paddd   xmm2, xmm7
1352        paddd   xmm3, xmm4
1353        pxor    xmm15, xmm0
1354        pxor    xmm12, xmm1
1355        pxor    xmm13, xmm2
1356        pxor    xmm14, xmm3
1357        movdqa  xmm8, xmmword ptr [ROT8+rip]
1358        pshufb  xmm15, xmm8
1359        pshufb  xmm12, xmm8
1360        pshufb  xmm13, xmm8
1361        pshufb  xmm14, xmm8
1362        paddd   xmm10, xmm15
1363        paddd   xmm11, xmm12
1364        movdqa  xmm8, xmmword ptr [rsp+0x100]
1365        paddd   xmm8, xmm13
1366        paddd   xmm9, xmm14
1367        pxor    xmm5, xmm10
1368        pxor    xmm6, xmm11
1369        pxor    xmm7, xmm8
1370        pxor    xmm4, xmm9
1371        pxor    xmm0, xmm8
1372        pxor    xmm1, xmm9
1373        pxor    xmm2, xmm10
1374        pxor    xmm3, xmm11
1375        movdqa  xmm8, xmm5
1376        psrld   xmm8, 7
1377        pslld   xmm5, 25
1378        por     xmm5, xmm8
1379        movdqa  xmm8, xmm6
1380        psrld   xmm8, 7
1381        pslld   xmm6, 25
1382        por     xmm6, xmm8
1383        movdqa  xmm8, xmm7
1384        psrld   xmm8, 7
1385        pslld   xmm7, 25
1386        por     xmm7, xmm8
1387        movdqa  xmm8, xmm4
1388        psrld   xmm8, 7
1389        pslld   xmm4, 25
1390        por     xmm4, xmm8
1391        pxor    xmm4, xmm12
1392        pxor    xmm5, xmm13
1393        pxor    xmm6, xmm14
1394        pxor    xmm7, xmm15
1395        mov     eax, r13d
1396        jne     9b
1397        movdqa  xmm9, xmm0
1398        punpckldq xmm0, xmm1
1399        punpckhdq xmm9, xmm1
1400        movdqa  xmm11, xmm2
1401        punpckldq xmm2, xmm3
1402        punpckhdq xmm11, xmm3
1403        movdqa  xmm1, xmm0
1404        punpcklqdq xmm0, xmm2
1405        punpckhqdq xmm1, xmm2
1406        movdqa  xmm3, xmm9
1407        punpcklqdq xmm9, xmm11
1408        punpckhqdq xmm3, xmm11
1409        movdqu  xmmword ptr [rbx], xmm0
1410        movdqu  xmmword ptr [rbx+0x20], xmm1
1411        movdqu  xmmword ptr [rbx+0x40], xmm9
1412        movdqu  xmmword ptr [rbx+0x60], xmm3
1413        movdqa  xmm9, xmm4
1414        punpckldq xmm4, xmm5
1415        punpckhdq xmm9, xmm5
1416        movdqa  xmm11, xmm6
1417        punpckldq xmm6, xmm7
1418        punpckhdq xmm11, xmm7
1419        movdqa  xmm5, xmm4
1420        punpcklqdq xmm4, xmm6
1421        punpckhqdq xmm5, xmm6
1422        movdqa  xmm7, xmm9
1423        punpcklqdq xmm9, xmm11
1424        punpckhqdq xmm7, xmm11
1425        movdqu  xmmword ptr [rbx+0x10], xmm4
1426        movdqu  xmmword ptr [rbx+0x30], xmm5
1427        movdqu  xmmword ptr [rbx+0x50], xmm9
1428        movdqu  xmmword ptr [rbx+0x70], xmm7
1429        movdqa  xmm1, xmmword ptr [rsp+0x110]
1430        movdqa  xmm0, xmm1
1431        paddd   xmm1, xmmword ptr [rsp+0x150]
1432        movdqa  xmmword ptr [rsp+0x110], xmm1
1433        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1434        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1435        pcmpgtd xmm0, xmm1
1436        movdqa  xmm1, xmmword ptr [rsp+0x120]
1437        psubd   xmm1, xmm0
1438        movdqa  xmmword ptr [rsp+0x120], xmm1
1439        add     rbx, 128
1440        add     rdi, 32
1441        sub     rsi, 4
1442        cmp     rsi, 4
1443        jnc     2b
1444        test    rsi, rsi
1445        jne     3f
14464:
1447        movdqa  xmm6, xmmword ptr [rsp+0x170]
1448        movdqa  xmm7, xmmword ptr [rsp+0x180]
1449        movdqa  xmm8, xmmword ptr [rsp+0x190]
1450        movdqa  xmm9, xmmword ptr [rsp+0x1A0]
1451        movdqa  xmm10, xmmword ptr [rsp+0x1B0]
1452        movdqa  xmm11, xmmword ptr [rsp+0x1C0]
1453        movdqa  xmm12, xmmword ptr [rsp+0x1D0]
1454        movdqa  xmm13, xmmword ptr [rsp+0x1E0]
1455        movdqa  xmm14, xmmword ptr [rsp+0x1F0]
1456        movdqa  xmm15, xmmword ptr [rsp+0x200]
1457        mov     rsp, rbp
1458        pop     rbp
1459        pop     rbx
1460        pop     rdi
1461        pop     rsi
1462        pop     r12
1463        pop     r13
1464        pop     r14
1465        pop     r15
1466        ret
1467.p2align 5
14683:
1469        test    esi, 0x2
1470        je      3f
1471        movups  xmm0, xmmword ptr [rcx]
1472        movups  xmm1, xmmword ptr [rcx+0x10]
1473        movaps  xmm8, xmm0
1474        movaps  xmm9, xmm1
1475        movd    xmm13, dword ptr [rsp+0x110]
1476        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1477        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1478        movaps  xmmword ptr [rsp], xmm13
1479        movd    xmm14, dword ptr [rsp+0x114]
1480        pinsrd  xmm14, dword ptr [rsp+0x124], 1
1481        pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1482        movaps  xmmword ptr [rsp+0x10], xmm14
1483        mov     r8, qword ptr [rdi]
1484        mov     r9, qword ptr [rdi+0x8]
1485        movzx   eax, byte ptr [rbp+0x80]
1486        or      eax, r13d
1487        xor     edx, edx
14882:
1489        mov     r14d, eax
1490        or      eax, r12d
1491        add     rdx, 64
1492        cmp     rdx, r15
1493        cmovne  eax, r14d
1494        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1495        movaps  xmm10, xmm2
1496        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1497        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1498        movaps  xmm3, xmm4
1499        shufps  xmm4, xmm5, 136
1500        shufps  xmm3, xmm5, 221
1501        movaps  xmm5, xmm3
1502        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1503        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1504        movaps  xmm3, xmm6
1505        shufps  xmm6, xmm7, 136
1506        pshufd  xmm6, xmm6, 0x93
1507        shufps  xmm3, xmm7, 221
1508        pshufd  xmm7, xmm3, 0x93
1509        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1510        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1511        movaps  xmm11, xmm12
1512        shufps  xmm12, xmm13, 136
1513        shufps  xmm11, xmm13, 221
1514        movaps  xmm13, xmm11
1515        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1516        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1517        movaps  xmm11, xmm14
1518        shufps  xmm14, xmm15, 136
1519        pshufd  xmm14, xmm14, 0x93
1520        shufps  xmm11, xmm15, 221
1521        pshufd  xmm15, xmm11, 0x93
1522        movaps  xmm3, xmmword ptr [rsp]
1523        movaps  xmm11, xmmword ptr [rsp+0x10]
1524        pinsrd  xmm3, eax, 3
1525        pinsrd  xmm11, eax, 3
1526        mov     al, 7
15279:
1528        paddd   xmm0, xmm4
1529        paddd   xmm8, xmm12
1530        movaps  xmmword ptr [rsp+0x20], xmm4
1531        movaps  xmmword ptr [rsp+0x30], xmm12
1532        paddd   xmm0, xmm1
1533        paddd   xmm8, xmm9
1534        pxor    xmm3, xmm0
1535        pxor    xmm11, xmm8
1536        movaps  xmm12, xmmword ptr [ROT16+rip]
1537        pshufb  xmm3, xmm12
1538        pshufb  xmm11, xmm12
1539        paddd   xmm2, xmm3
1540        paddd   xmm10, xmm11
1541        pxor    xmm1, xmm2
1542        pxor    xmm9, xmm10
1543        movdqa  xmm4, xmm1
1544        pslld   xmm1, 20
1545        psrld   xmm4, 12
1546        por     xmm1, xmm4
1547        movdqa  xmm4, xmm9
1548        pslld   xmm9, 20
1549        psrld   xmm4, 12
1550        por     xmm9, xmm4
1551        paddd   xmm0, xmm5
1552        paddd   xmm8, xmm13
1553        movaps  xmmword ptr [rsp+0x40], xmm5
1554        movaps  xmmword ptr [rsp+0x50], xmm13
1555        paddd   xmm0, xmm1
1556        paddd   xmm8, xmm9
1557        pxor    xmm3, xmm0
1558        pxor    xmm11, xmm8
1559        movaps  xmm13, xmmword ptr [ROT8+rip]
1560        pshufb  xmm3, xmm13
1561        pshufb  xmm11, xmm13
1562        paddd   xmm2, xmm3
1563        paddd   xmm10, xmm11
1564        pxor    xmm1, xmm2
1565        pxor    xmm9, xmm10
1566        movdqa  xmm4, xmm1
1567        pslld   xmm1, 25
1568        psrld   xmm4, 7
1569        por     xmm1, xmm4
1570        movdqa  xmm4, xmm9
1571        pslld   xmm9, 25
1572        psrld   xmm4, 7
1573        por     xmm9, xmm4
1574        pshufd  xmm0, xmm0, 0x93
1575        pshufd  xmm8, xmm8, 0x93
1576        pshufd  xmm3, xmm3, 0x4E
1577        pshufd  xmm11, xmm11, 0x4E
1578        pshufd  xmm2, xmm2, 0x39
1579        pshufd  xmm10, xmm10, 0x39
1580        paddd   xmm0, xmm6
1581        paddd   xmm8, xmm14
1582        paddd   xmm0, xmm1
1583        paddd   xmm8, xmm9
1584        pxor    xmm3, xmm0
1585        pxor    xmm11, xmm8
1586        pshufb  xmm3, xmm12
1587        pshufb  xmm11, xmm12
1588        paddd   xmm2, xmm3
1589        paddd   xmm10, xmm11
1590        pxor    xmm1, xmm2
1591        pxor    xmm9, xmm10
1592        movdqa  xmm4, xmm1
1593        pslld   xmm1, 20
1594        psrld   xmm4, 12
1595        por     xmm1, xmm4
1596        movdqa  xmm4, xmm9
1597        pslld   xmm9, 20
1598        psrld   xmm4, 12
1599        por     xmm9, xmm4
1600        paddd   xmm0, xmm7
1601        paddd   xmm8, xmm15
1602        paddd   xmm0, xmm1
1603        paddd   xmm8, xmm9
1604        pxor    xmm3, xmm0
1605        pxor    xmm11, xmm8
1606        pshufb  xmm3, xmm13
1607        pshufb  xmm11, xmm13
1608        paddd   xmm2, xmm3
1609        paddd   xmm10, xmm11
1610        pxor    xmm1, xmm2
1611        pxor    xmm9, xmm10
1612        movdqa  xmm4, xmm1
1613        pslld   xmm1, 25
1614        psrld   xmm4, 7
1615        por     xmm1, xmm4
1616        movdqa  xmm4, xmm9
1617        pslld   xmm9, 25
1618        psrld   xmm4, 7
1619        por     xmm9, xmm4
1620        pshufd  xmm0, xmm0, 0x39
1621        pshufd  xmm8, xmm8, 0x39
1622        pshufd  xmm3, xmm3, 0x4E
1623        pshufd  xmm11, xmm11, 0x4E
1624        pshufd  xmm2, xmm2, 0x93
1625        pshufd  xmm10, xmm10, 0x93
1626        dec     al
1627        je      9f
1628        movdqa  xmm12, xmmword ptr [rsp+0x20]
1629        movdqa  xmm5, xmmword ptr [rsp+0x40]
1630        pshufd  xmm13, xmm12, 0x0F
1631        shufps  xmm12, xmm5, 214
1632        pshufd  xmm4, xmm12, 0x39
1633        movdqa  xmm12, xmm6
1634        shufps  xmm12, xmm7, 250
1635        pblendw xmm13, xmm12, 0xCC
1636        movdqa  xmm12, xmm7
1637        punpcklqdq xmm12, xmm5
1638        pblendw xmm12, xmm6, 0xC0
1639        pshufd  xmm12, xmm12, 0x78
1640        punpckhdq xmm5, xmm7
1641        punpckldq xmm6, xmm5
1642        pshufd  xmm7, xmm6, 0x1E
1643        movdqa  xmmword ptr [rsp+0x20], xmm13
1644        movdqa  xmmword ptr [rsp+0x40], xmm12
1645        movdqa  xmm5, xmmword ptr [rsp+0x30]
1646        movdqa  xmm13, xmmword ptr [rsp+0x50]
1647        pshufd  xmm6, xmm5, 0x0F
1648        shufps  xmm5, xmm13, 214
1649        pshufd  xmm12, xmm5, 0x39
1650        movdqa  xmm5, xmm14
1651        shufps  xmm5, xmm15, 250
1652        pblendw xmm6, xmm5, 0xCC
1653        movdqa  xmm5, xmm15
1654        punpcklqdq xmm5, xmm13
1655        pblendw xmm5, xmm14, 0xC0
1656        pshufd  xmm5, xmm5, 0x78
1657        punpckhdq xmm13, xmm15
1658        punpckldq xmm14, xmm13
1659        pshufd  xmm15, xmm14, 0x1E
1660        movdqa  xmm13, xmm6
1661        movdqa  xmm14, xmm5
1662        movdqa  xmm5, xmmword ptr [rsp+0x20]
1663        movdqa  xmm6, xmmword ptr [rsp+0x40]
1664        jmp     9b
16659:
1666        pxor    xmm0, xmm2
1667        pxor    xmm1, xmm3
1668        pxor    xmm8, xmm10
1669        pxor    xmm9, xmm11
1670        mov     eax, r13d
1671        cmp     rdx, r15
1672        jne     2b
1673        movups  xmmword ptr [rbx], xmm0
1674        movups  xmmword ptr [rbx+0x10], xmm1
1675        movups  xmmword ptr [rbx+0x20], xmm8
1676        movups  xmmword ptr [rbx+0x30], xmm9
1677        movdqa  xmm0, xmmword ptr [rsp+0x130]
1678        movdqa  xmm1, xmmword ptr [rsp+0x110]
1679        movdqa  xmm2, xmmword ptr [rsp+0x120]
1680        movdqu  xmm3, xmmword ptr [rsp+0x118]
1681        movdqu  xmm4, xmmword ptr [rsp+0x128]
1682        blendvps xmm1, xmm3, xmm0
1683        blendvps xmm2, xmm4, xmm0
1684        movdqa  xmmword ptr [rsp+0x110], xmm1
1685        movdqa  xmmword ptr [rsp+0x120], xmm2
1686        add     rdi, 16
1687        add     rbx, 64
1688        sub     rsi, 2
16893:
1690        test    esi, 0x1
1691        je      4b
1692        movups  xmm0, xmmword ptr [rcx]
1693        movups  xmm1, xmmword ptr [rcx+0x10]
1694        movd    xmm13, dword ptr [rsp+0x110]
1695        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1696        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1697        movaps  xmm14, xmmword ptr [ROT8+rip]
1698        movaps  xmm15, xmmword ptr [ROT16+rip]
1699        mov     r8, qword ptr [rdi]
1700        movzx   eax, byte ptr [rbp+0x80]
1701        or      eax, r13d
1702        xor     edx, edx
17032:
1704        mov     r14d, eax
1705        or      eax, r12d
1706        add     rdx, 64
1707        cmp     rdx, r15
1708        cmovne  eax, r14d
1709        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1710        movaps  xmm3, xmm13
1711        pinsrd  xmm3, eax, 3
1712        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1713        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1714        movaps  xmm8, xmm4
1715        shufps  xmm4, xmm5, 136
1716        shufps  xmm8, xmm5, 221
1717        movaps  xmm5, xmm8
1718        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1719        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1720        movaps  xmm8, xmm6
1721        shufps  xmm6, xmm7, 136
1722        pshufd  xmm6, xmm6, 0x93
1723        shufps  xmm8, xmm7, 221
1724        pshufd  xmm7, xmm8, 0x93
1725        mov     al, 7
17269:
1727        paddd   xmm0, xmm4
1728        paddd   xmm0, xmm1
1729        pxor    xmm3, xmm0
1730        pshufb  xmm3, xmm15
1731        paddd   xmm2, xmm3
1732        pxor    xmm1, xmm2
1733        movdqa  xmm11, xmm1
1734        pslld   xmm1, 20
1735        psrld   xmm11, 12
1736        por     xmm1, xmm11
1737        paddd   xmm0, xmm5
1738        paddd   xmm0, xmm1
1739        pxor    xmm3, xmm0
1740        pshufb  xmm3, xmm14
1741        paddd   xmm2, xmm3
1742        pxor    xmm1, xmm2
1743        movdqa  xmm11, xmm1
1744        pslld   xmm1, 25
1745        psrld   xmm11, 7
1746        por     xmm1, xmm11
1747        pshufd  xmm0, xmm0, 0x93
1748        pshufd  xmm3, xmm3, 0x4E
1749        pshufd  xmm2, xmm2, 0x39
1750        paddd   xmm0, xmm6
1751        paddd   xmm0, xmm1
1752        pxor    xmm3, xmm0
1753        pshufb  xmm3, xmm15
1754        paddd   xmm2, xmm3
1755        pxor    xmm1, xmm2
1756        movdqa  xmm11, xmm1
1757        pslld   xmm1, 20
1758        psrld   xmm11, 12
1759        por     xmm1, xmm11
1760        paddd   xmm0, xmm7
1761        paddd   xmm0, xmm1
1762        pxor    xmm3, xmm0
1763        pshufb  xmm3, xmm14
1764        paddd   xmm2, xmm3
1765        pxor    xmm1, xmm2
1766        movdqa  xmm11, xmm1
1767        pslld   xmm1, 25
1768        psrld   xmm11, 7
1769        por     xmm1, xmm11
1770        pshufd  xmm0, xmm0, 0x39
1771        pshufd  xmm3, xmm3, 0x4E
1772        pshufd  xmm2, xmm2, 0x93
1773        dec     al
1774        jz      9f
1775        movdqa  xmm8, xmm4
1776        shufps  xmm8, xmm5, 214
1777        pshufd  xmm9, xmm4, 0x0F
1778        pshufd  xmm4, xmm8, 0x39
1779        movdqa  xmm8, xmm6
1780        shufps  xmm8, xmm7, 250
1781        pblendw xmm9, xmm8, 0xCC
1782        movdqa  xmm8, xmm7
1783        punpcklqdq xmm8, xmm5
1784        pblendw xmm8, xmm6, 0xC0
1785        pshufd  xmm8, xmm8, 0x78
1786        punpckhdq xmm5, xmm7
1787        punpckldq xmm6, xmm5
1788        pshufd  xmm7, xmm6, 0x1E
1789        movdqa  xmm5, xmm9
1790        movdqa  xmm6, xmm8
1791        jmp     9b
17929:
1793        pxor    xmm0, xmm2
1794        pxor    xmm1, xmm3
1795        mov     eax, r13d
1796        cmp     rdx, r15
1797        jne     2b
1798        movups  xmmword ptr [rbx], xmm0
1799        movups  xmmword ptr [rbx+0x10], xmm1
1800        jmp     4b
1801
.p2align 6
/*
 * blake3_compress_in_place_sse41 -- run one seven-round compression on a
 * 32-byte chaining value and a 64-byte block, writing the result back over
 * the chaining value.
 *
 * ABI: Microsoft x64 (arguments 1-4 in rcx, rdx, r8, r9; argument 5 on the
 * stack above the 32-byte shadow space).  The argument names below are taken
 * from how each value is used here; confirm them against the C prototype.
 *
 *   rcx              cv         32 bytes; loaded as state rows 0-1 and
 *                               overwritten with the output
 *   rdx              block      64 bytes of message, read with unaligned loads
 *   r8b              block_len  only the low byte is consumed
 *   r9               counter    64-bit value, becomes the low half of row 3
 *   [entry rsp+0x28] flags      one byte, becomes the top dword of row 3
 *
 * Output:   [rcx .. rcx+31] = { row0 ^ row2, row1 ^ row3 } after round seven.
 * Clobbers: rax, r8, xmm0-xmm5, flags.
 * Preserves xmm6-xmm9, xmm11, xmm14, xmm15 (callee-saved in this ABI) by
 * spilling them to a 112-byte stack area.
 *
 * Register roles inside the round loop:
 *   xmm0-xmm3     state rows a, b, c, d (four 32-bit words each)
 *   xmm4-xmm7     the sixteen message words, regrouped after every round
 *   xmm8, xmm9    scratch for the message regrouping
 *   xmm11         scratch for the shift/or rotates
 *   xmm14, xmm15  pshufb masks loaded from ROT8 and ROT16 (tables defined
 *                 elsewhere in this file; by name, byte-granular rotates)
 *   al            rounds remaining
 */
blake3_compress_in_place_sse41:
_blake3_compress_in_place_sse41:
        /* 120 = 7 * 16 bytes of spill space + 8.  rsp is 8 mod 16 on entry,
           so the extra 8 makes it 16-byte aligned for the movdqa spills. */
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+0x10], xmm7
        movdqa  xmmword ptr [rsp+0x20], xmm8
        movdqa  xmmword ptr [rsp+0x30], xmm9
        movdqa  xmmword ptr [rsp+0x40], xmm11
        movdqa  xmmword ptr [rsp+0x50], xmm14
        movdqa  xmmword ptr [rsp+0x60], xmm15

        /* Rows a and b come from the chaining value, row c from BLAKE3_IV. */
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        /* Row d = { r9 low dword, r9 high dword, r8b, fifth-argument byte }.
           0xA0 = 120 (this frame) + 8 (return address) + 32 (shadow space). */
        movzx   eax, byte ptr [rsp+0xA0]
        movzx   r8d, r8b
        shl     rax, 32
        add     r8, rax                 /* r8 = (stack byte << 32) | r8b */
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4

        /* Load the block and de-interleave its sixteen dwords m0..m15:
             xmm4 = m0,  m2,  m4,  m6       xmm5 = m1,  m3,  m5,  m7
             xmm6 = m14, m8,  m10, m12      xmm7 = m15, m9,  m11, m13
           shufps 136 keeps the even-indexed dwords of both operands, 221 the
           odd-indexed ones; pshufd 0x93 moves every lane up by one. */
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+0x20]
        movups  xmm7, xmmword ptr [rdx+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        mov     al, 7                   /* round counter */
9:
        /* Column step, first half (message words in xmm4):
           a += m + b;  d = shuffle_ROT16(d ^ a);  c += d;  b = (b ^ c) >>> 12 */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11

        /* Column step, second half (message words in xmm5):
           a += m + b;  d = shuffle_ROT8(d ^ a);  c += d;  b = (b ^ c) >>> 7 */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        /* Rotate the lanes of rows a, d and c (row b stays put) so the next
           two half-steps work on the diagonals of the 4x4 state. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39

        /* Diagonal step, first half (message words in xmm6). */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11

        /* Diagonal step, second half (message words in xmm7). */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        /* Undo the lane rotation (0x39 is the inverse of 0x93; 0x4E is its
           own inverse). */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93

        dec     al
        jz      9f                      /* seventh round done: skip regrouping */

        /* Regroup the message words in xmm4-xmm7 for the next round.  The new
           xmm5 and xmm6 are built in xmm9 and xmm8 because the old xmm5 and
           xmm6 are still needed as inputs until the end of the sequence. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold the lower half of the state into the upper half and store the
           32-byte result over the input chaining value. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rcx], xmm0
        movups  xmmword ptr [rcx+0x10], xmm1

        /* Restore the callee-saved vector registers and release the frame. */
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+0x10]
        movdqa  xmm8, xmmword ptr [rsp+0x20]
        movdqa  xmm9, xmmword ptr [rsp+0x30]
        movdqa  xmm11, xmmword ptr [rsp+0x40]
        movdqa  xmm14, xmmword ptr [rsp+0x50]
        movdqa  xmm15, xmmword ptr [rsp+0x60]
        add     rsp, 120
        ret
1919
1920
/*
 * blake3_compress_xof_sse41 / _blake3_compress_xof_sse41
 *
 * Runs seven iterations of the round loop below on a 16-dword state held in
 * xmm0..xmm3 (one 4-dword row per register) and stores 64 bytes of output.
 *
 * Inputs as read by this code (the register order matches the Microsoft x64
 * convention; the C-level parameters are presumably cv, block, block_len,
 * counter, flags, out -- TODO confirm against the C prototype):
 *   rcx         pointer; 32 bytes are loaded from it (state rows 0 and 1)
 *               and loaded again at the end; this memory is only read
 *   rdx         pointer; 64 message bytes are loaded from it
 *   r8b         byte, zero-extended into dword 2 of state row 3
 *   r9          64-bit value, dwords 0 and 1 of state row 3
 *   [rsp+0x28]  at entry (0xA0 once the frame is allocated): byte, becomes
 *               dword 3 of state row 3
 *   [rsp+0x30]  at entry (0xA8 once the frame is allocated): pointer that
 *               receives the 64 output bytes
 *
 * Saves and restores xmm6, xmm7, xmm8, xmm9, xmm11, xmm14 and xmm15 in a
 * 120-byte stack frame. Overwrites rax, r8, r10, xmm0..xmm5 and the flags.
 * The save slots are accessed with movdqa, so rsp must be 16-byte aligned
 * after `sub rsp, 120`, i.e. rsp % 16 == 8 at entry.
 * NOTE(review): no .seh_* unwind directives appear in this block -- confirm
 * whether unwind information is intentionally omitted.
 */
1921.p2align 6
1922_blake3_compress_xof_sse41:
1923blake3_compress_xof_sse41:
        /* Allocate the frame and spill every xmm6+ register used below. */
1924        sub     rsp, 120
1925        movdqa  xmmword ptr [rsp], xmm6
1926        movdqa  xmmword ptr [rsp+0x10], xmm7
1927        movdqa  xmmword ptr [rsp+0x20], xmm8
1928        movdqa  xmmword ptr [rsp+0x30], xmm9
1929        movdqa  xmmword ptr [rsp+0x40], xmm11
1930        movdqa  xmmword ptr [rsp+0x50], xmm14
1931        movdqa  xmmword ptr [rsp+0x60], xmm15
        /* Rows 0 and 1 come from the 32 bytes at rcx; row 2 is BLAKE3_IV. */
1932        movups  xmm0, xmmword ptr [rcx]
1933        movups  xmm1, xmmword ptr [rcx+0x10]
1934        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Row 3 = { r9 low dword, r9 high dword, r8b, stack byte }.
         * r8 is built as r8b | (stack byte << 32) and packed above r9.
         * r10 picks up the output pointer from the sixth argument slot.
         */
1935        movzx   eax, byte ptr [rsp+0xA0]
1936        movzx   r8d, r8b
1937        mov     r10, qword ptr [rsp+0xA8]
1938        shl     rax, 32
1939        add     r8, rax
1940        movq    xmm3, r9
1941        movq    xmm4, r8
1942        punpcklqdq xmm3, xmm4
        /*
         * Load message dwords 0..7 and de-interleave them: shufps 136 keeps
         * dwords 0 and 2 of each source, shufps 221 keeps dwords 1 and 3.
         * Result: xmm4 = even-indexed dwords, xmm5 = odd-indexed dwords.
         */
1943        movups  xmm4, xmmword ptr [rdx]
1944        movups  xmm5, xmmword ptr [rdx+0x10]
1945        movaps  xmm8, xmm4
1946        shufps  xmm4, xmm5, 136
1947        shufps  xmm8, xmm5, 221
1948        movaps  xmm5, xmm8
        /*
         * Same split for message dwords 8..15, then pshufd 0x93 rotates each
         * result by one lane (lane i receives lane i-1).
         * Result: xmm6 = rotated even dwords, xmm7 = rotated odd dwords.
         */
1949        movups  xmm6, xmmword ptr [rdx+0x20]
1950        movups  xmm7, xmmword ptr [rdx+0x30]
1951        movaps  xmm8, xmm6
1952        shufps  xmm6, xmm7, 136
1953        pshufd  xmm6, xmm6, 0x93
1954        shufps  xmm8, xmm7, 221
1955        pshufd  xmm7, xmm8, 0x93
        /* Keep the pshufb byte-rotation masks in registers for the loop. */
1956        movaps  xmm14, xmmword ptr [ROT8+rip]
1957        movaps  xmm15, xmmword ptr [ROT16+rip]
        /* al = remaining iterations; rax was already folded into r8 above. */
1958        mov     al, 7
19599:
        /*
         * First half, step 1: row0 += xmm4 + row1; row3 ^= row0, rotated by
         * 16 bits per dword (ROT16 mask); row2 += row3; row1 ^= row2, rotated
         * right by 12 bits (shift left 20 | shift right 12, xmm11 scratch).
         */
1960        paddd   xmm0, xmm4
1961        paddd   xmm0, xmm1
1962        pxor    xmm3, xmm0
1963        pshufb  xmm3, xmm15
1964        paddd   xmm2, xmm3
1965        pxor    xmm1, xmm2
1966        movdqa  xmm11, xmm1
1967        pslld   xmm1, 20
1968        psrld   xmm11, 12
1969        por     xmm1, xmm11
        /*
         * First half, step 2: same pattern with xmm5, rotating row3 right by
         * 8 bits (ROT8 mask) and row1 right by 7 bits.
         */
1970        paddd   xmm0, xmm5
1971        paddd   xmm0, xmm1
1972        pxor    xmm3, xmm0
1973        pshufb  xmm3, xmm14
1974        paddd   xmm2, xmm3
1975        pxor    xmm1, xmm2
1976        movdqa  xmm11, xmm1
1977        pslld   xmm1, 25
1978        psrld   xmm11, 7
1979        por     xmm1, xmm11
        /*
         * Rotate the lanes of rows 0, 3 and 2 (row 1 stays put) so that the
         * second half combines a different set of four words per lane.
         */
1980        pshufd  xmm0, xmm0, 0x93
1981        pshufd  xmm3, xmm3, 0x4E
1982        pshufd  xmm2, xmm2, 0x39
        /* Second half, step 1: as above with xmm6 (rotations 16 and 12). */
1983        paddd   xmm0, xmm6
1984        paddd   xmm0, xmm1
1985        pxor    xmm3, xmm0
1986        pshufb  xmm3, xmm15
1987        paddd   xmm2, xmm3
1988        pxor    xmm1, xmm2
1989        movdqa  xmm11, xmm1
1990        pslld   xmm1, 20
1991        psrld   xmm11, 12
1992        por     xmm1, xmm11
        /* Second half, step 2: as above with xmm7 (rotations 8 and 7). */
1993        paddd   xmm0, xmm7
1994        paddd   xmm0, xmm1
1995        pxor    xmm3, xmm0
1996        pshufb  xmm3, xmm14
1997        paddd   xmm2, xmm3
1998        pxor    xmm1, xmm2
1999        movdqa  xmm11, xmm1
2000        pslld   xmm1, 25
2001        psrld   xmm11, 7
2002        por     xmm1, xmm11
        /* Undo the lane rotation (0x39 inverts 0x93 and vice versa). */
2003        pshufd  xmm0, xmm0, 0x39
2004        pshufd  xmm3, xmm3, 0x4E
2005        pshufd  xmm2, xmm2, 0x93
        /* Leave after the seventh iteration without reshuffling the message. */
2006        dec     al
2007        jz      9f
        /*
         * Rearrange the message dwords held in xmm4..xmm7 for the next
         * iteration. Only shuffles and blends are used, so no values change;
         * xmm8 and xmm9 are scratch. Instruction order matters: xmm5 and xmm6
         * are consumed as inputs before being overwritten at the end.
         */
2008        movdqa  xmm8, xmm4
2009        shufps  xmm8, xmm5, 214
2010        pshufd  xmm9, xmm4, 0x0F
2011        pshufd  xmm4, xmm8, 0x39
2012        movdqa  xmm8, xmm6
2013        shufps  xmm8, xmm7, 250
2014        pblendw xmm9, xmm8, 0xCC
2015        movdqa  xmm8, xmm7
2016        punpcklqdq xmm8, xmm5
2017        pblendw xmm8, xmm6, 0xC0
2018        pshufd  xmm8, xmm8, 0x78
2019        punpckhdq xmm5, xmm7
2020        punpckldq xmm6, xmm5
2021        pshufd  xmm7, xmm6, 0x1E
2022        movdqa  xmm5, xmm9
2023        movdqa  xmm6, xmm8
2024        jmp     9b
20259:
        /*
         * Output: bytes 0..31  = (row0 ^ row2, row1 ^ row3),
         *         bytes 32..63 = (row2 ^ input[0..15], row3 ^ input[16..31])
         * where input is the 32 bytes at rcx, reloaded here. The first two
         * XORs must run before rows 2 and 3 are modified.
         */
2026        movdqu  xmm4, xmmword ptr [rcx]
2027        movdqu  xmm5, xmmword ptr [rcx+0x10]
2028        pxor    xmm0, xmm2
2029        pxor    xmm1, xmm3
2030        pxor    xmm2, xmm4
2031        pxor    xmm3, xmm5
2032        movups  xmmword ptr [r10], xmm0
2033        movups  xmmword ptr [r10+0x10], xmm1
2034        movups  xmmword ptr [r10+0x20], xmm2
2035        movups  xmmword ptr [r10+0x30], xmm3
        /* Restore the spilled xmm registers and release the frame. */
2036        movdqa  xmm6, xmmword ptr [rsp]
2037        movdqa  xmm7, xmmword ptr [rsp+0x10]
2038        movdqa  xmm8, xmmword ptr [rsp+0x20]
2039        movdqa  xmm9, xmmword ptr [rsp+0x30]
2040        movdqa  xmm11, xmmword ptr [rsp+0x40]
2041        movdqa  xmm14, xmmword ptr [rsp+0x50]
2042        movdqa  xmm15, xmmword ptr [rsp+0x60]
2043        add     rsp, 120
2044        ret
2045
2046
/*
 * Constant tables for the SSE4.1 routines in this file.
 *
 * Section choice: this file follows the Microsoft x64 register convention,
 * so it is assembled for a COFF target. There GNU as gives a section whose
 * name it does not recognise (such as .rodata) the default loaded-and-
 * writable attributes. The tables are therefore placed in .rdata with
 * explicit "dr" (data, read-only) flags so they really are read-only.
 *
 * Layout: the section starts 64-byte aligned and every table is exactly
 * 16 bytes, so each label is 16-byte aligned. The code depends on that:
 * the tables are read with movaps and as direct SSE memory operands
 * (pand/pxor), both of which fault on unaligned addresses. Keep the
 * 16-byte stride when adding or resizing entries.
 */
2047.section .rdata,"dr"
2048.p2align  6
/* The four IV dwords, loaded as one 16-byte state row. */
2049BLAKE3_IV:
2050        .long  0x6A09E667, 0xBB67AE85
2051        .long  0x3C6EF372, 0xA54FF53A
/* pshufb mask: swaps the 16-bit halves of each dword (rotate by 16). */
2052ROT16:
2053        .byte  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* pshufb mask: byte i of each dword takes byte i+1 (rotate right by 8). */
2054ROT8:
2055        .byte  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/* Per-lane offsets 0..3; masked and added to a broadcast dword in
   blake3_hash_many_sse41. */
2056ADD0:
2057        .long  0, 1, 2, 3
/* The value 4 in every lane; masked and stored by blake3_hash_many_sse41. */
2058ADD1:
2059        .long  4, 4, 4, 4
/* Each IV dword replicated across all four lanes. */
2060BLAKE3_IV_0:
2061        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2062BLAKE3_IV_1:
2063        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2064BLAKE3_IV_2:
2065        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2066BLAKE3_IV_3:
2067        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane. */
2068BLAKE3_BLOCK_LEN:
2069        .long  64, 64, 64, 64
/* Sign bit of every lane; XORed into both operands before pcmpgtd so the
   signed compare orders the values as unsigned. */
2070CMP_MSB_MASK:
2071        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
2072