# NOTE(review): every statement in the visible part of this file begins with a
# decimal number fused to it. That looks like a stray listing artifact rather
# than intended assembler input; confirm against the upstream source before
# assembling.
#
# Select GNU assembler Intel syntax: destination operand first, registers
# written without a percent prefix.
1.intel_syntax noprefix
# Export the three SSE4.1 routines. Each one is declared under a plain name
# and under an underscore-prefixed alias (presumably for targets whose C
# symbols carry a leading underscore; verify against the build setup).
# The hash_many pair is defined as two labels on the same address right after
# this preamble. The compress_in_place and compress_xof definitions are not
# visible in this part of the file.
2.global blake3_hash_many_sse41
3.global _blake3_hash_many_sse41
4.global blake3_compress_in_place_sse41
5.global _blake3_compress_in_place_sse41
6.global blake3_compress_xof_sse41
7.global _blake3_compress_xof_sse41
# Everything that follows is emitted into the code section.
8.section .text
# Align the entry point that follows to a 2^6 = 64 byte boundary.
9        .p2align  6
10_blake3_hash_many_sse41:
11blake3_hash_many_sse41:
12        push    r15
13        push    r14
14        push    r13
15        push    r12
16        push    rsi
17        push    rdi
18        push    rbx
19        push    rbp
20        mov     rbp, rsp
21        sub     rsp, 528
22        and     rsp, 0xFFFFFFFFFFFFFFC0
23        movdqa  xmmword ptr [rsp+0x170], xmm6
24        movdqa  xmmword ptr [rsp+0x180], xmm7
25        movdqa  xmmword ptr [rsp+0x190], xmm8
26        movdqa  xmmword ptr [rsp+0x1A0], xmm9
27        movdqa  xmmword ptr [rsp+0x1B0], xmm10
28        movdqa  xmmword ptr [rsp+0x1C0], xmm11
29        movdqa  xmmword ptr [rsp+0x1D0], xmm12
30        movdqa  xmmword ptr [rsp+0x1E0], xmm13
31        movdqa  xmmword ptr [rsp+0x1F0], xmm14
32        movdqa  xmmword ptr [rsp+0x200], xmm15
33        mov     rdi, rcx
34        mov     rsi, rdx
35        mov     rdx, r8
36        mov     rcx, r9
37        mov     r8, qword ptr [rbp+0x68]
38        movzx   r9, byte ptr [rbp+0x70]
39        neg     r9d
40        movd    xmm0, r9d
41        pshufd  xmm0, xmm0, 0x00
42        movdqa  xmmword ptr [rsp+0x130], xmm0
43        movdqa  xmm1, xmm0
44        pand    xmm1, xmmword ptr [ADD0+rip]
45        pand    xmm0, xmmword ptr [ADD1+rip]
46        movdqa  xmmword ptr [rsp+0x150], xmm0
47        movd    xmm0, r8d
48        pshufd  xmm0, xmm0, 0x00
49        paddd   xmm0, xmm1
50        movdqa  xmmword ptr [rsp+0x110], xmm0
51        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
52        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
53        pcmpgtd xmm1, xmm0
54        shr     r8, 32
55        movd    xmm2, r8d
56        pshufd  xmm2, xmm2, 0x00
57        psubd   xmm2, xmm1
58        movdqa  xmmword ptr [rsp+0x120], xmm2
59        mov     rbx, qword ptr [rbp+0x90]
60        mov     r15, rdx
61        shl     r15, 6
62        movzx   r13d, byte ptr [rbp+0x78]
63        movzx   r12d, byte ptr [rbp+0x88]
64        cmp     rsi, 4
65        jc      3f
662:
67        movdqu  xmm3, xmmword ptr [rcx]
68        pshufd  xmm0, xmm3, 0x00
69        pshufd  xmm1, xmm3, 0x55
70        pshufd  xmm2, xmm3, 0xAA
71        pshufd  xmm3, xmm3, 0xFF
72        movdqu  xmm7, xmmword ptr [rcx+0x10]
73        pshufd  xmm4, xmm7, 0x00
74        pshufd  xmm5, xmm7, 0x55
75        pshufd  xmm6, xmm7, 0xAA
76        pshufd  xmm7, xmm7, 0xFF
77        mov     r8, qword ptr [rdi]
78        mov     r9, qword ptr [rdi+0x8]
79        mov     r10, qword ptr [rdi+0x10]
80        mov     r11, qword ptr [rdi+0x18]
81        movzx   eax, byte ptr [rbp+0x80]
82        or      eax, r13d
83        xor     edx, edx
849:
85        mov     r14d, eax
86        or      eax, r12d
87        add     rdx, 64
88        cmp     rdx, r15
89        cmovne  eax, r14d
90        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
91        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
92        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
93        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
94        movdqa  xmm12, xmm8
95        punpckldq xmm8, xmm9
96        punpckhdq xmm12, xmm9
97        movdqa  xmm14, xmm10
98        punpckldq xmm10, xmm11
99        punpckhdq xmm14, xmm11
100        movdqa  xmm9, xmm8
101        punpcklqdq xmm8, xmm10
102        punpckhqdq xmm9, xmm10
103        movdqa  xmm13, xmm12
104        punpcklqdq xmm12, xmm14
105        punpckhqdq xmm13, xmm14
106        movdqa  xmmword ptr [rsp], xmm8
107        movdqa  xmmword ptr [rsp+0x10], xmm9
108        movdqa  xmmword ptr [rsp+0x20], xmm12
109        movdqa  xmmword ptr [rsp+0x30], xmm13
110        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
111        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
112        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
113        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
114        movdqa  xmm12, xmm8
115        punpckldq xmm8, xmm9
116        punpckhdq xmm12, xmm9
117        movdqa  xmm14, xmm10
118        punpckldq xmm10, xmm11
119        punpckhdq xmm14, xmm11
120        movdqa  xmm9, xmm8
121        punpcklqdq xmm8, xmm10
122        punpckhqdq xmm9, xmm10
123        movdqa  xmm13, xmm12
124        punpcklqdq xmm12, xmm14
125        punpckhqdq xmm13, xmm14
126        movdqa  xmmword ptr [rsp+0x40], xmm8
127        movdqa  xmmword ptr [rsp+0x50], xmm9
128        movdqa  xmmword ptr [rsp+0x60], xmm12
129        movdqa  xmmword ptr [rsp+0x70], xmm13
130        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
131        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
132        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
133        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
134        movdqa  xmm12, xmm8
135        punpckldq xmm8, xmm9
136        punpckhdq xmm12, xmm9
137        movdqa  xmm14, xmm10
138        punpckldq xmm10, xmm11
139        punpckhdq xmm14, xmm11
140        movdqa  xmm9, xmm8
141        punpcklqdq xmm8, xmm10
142        punpckhqdq xmm9, xmm10
143        movdqa  xmm13, xmm12
144        punpcklqdq xmm12, xmm14
145        punpckhqdq xmm13, xmm14
146        movdqa  xmmword ptr [rsp+0x80], xmm8
147        movdqa  xmmword ptr [rsp+0x90], xmm9
148        movdqa  xmmword ptr [rsp+0xA0], xmm12
149        movdqa  xmmword ptr [rsp+0xB0], xmm13
150        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
151        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
152        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
153        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
154        movdqa  xmm12, xmm8
155        punpckldq xmm8, xmm9
156        punpckhdq xmm12, xmm9
157        movdqa  xmm14, xmm10
158        punpckldq xmm10, xmm11
159        punpckhdq xmm14, xmm11
160        movdqa  xmm9, xmm8
161        punpcklqdq xmm8, xmm10
162        punpckhqdq xmm9, xmm10
163        movdqa  xmm13, xmm12
164        punpcklqdq xmm12, xmm14
165        punpckhqdq xmm13, xmm14
166        movdqa  xmmword ptr [rsp+0xC0], xmm8
167        movdqa  xmmword ptr [rsp+0xD0], xmm9
168        movdqa  xmmword ptr [rsp+0xE0], xmm12
169        movdqa  xmmword ptr [rsp+0xF0], xmm13
170        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
171        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
172        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
173        movdqa  xmm12, xmmword ptr [rsp+0x110]
174        movdqa  xmm13, xmmword ptr [rsp+0x120]
175        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
176        movd    xmm15, eax
177        pshufd  xmm15, xmm15, 0x00
178        prefetcht0 [r8+rdx+0x80]
179        prefetcht0 [r9+rdx+0x80]
180        prefetcht0 [r10+rdx+0x80]
181        prefetcht0 [r11+rdx+0x80]
182        paddd   xmm0, xmmword ptr [rsp]
183        paddd   xmm1, xmmword ptr [rsp+0x20]
184        paddd   xmm2, xmmword ptr [rsp+0x40]
185        paddd   xmm3, xmmword ptr [rsp+0x60]
186        paddd   xmm0, xmm4
187        paddd   xmm1, xmm5
188        paddd   xmm2, xmm6
189        paddd   xmm3, xmm7
190        pxor    xmm12, xmm0
191        pxor    xmm13, xmm1
192        pxor    xmm14, xmm2
193        pxor    xmm15, xmm3
194        movdqa  xmm8, xmmword ptr [ROT16+rip]
195        pshufb  xmm12, xmm8
196        pshufb  xmm13, xmm8
197        pshufb  xmm14, xmm8
198        pshufb  xmm15, xmm8
199        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
200        paddd   xmm8, xmm12
201        paddd   xmm9, xmm13
202        paddd   xmm10, xmm14
203        paddd   xmm11, xmm15
204        pxor    xmm4, xmm8
205        pxor    xmm5, xmm9
206        pxor    xmm6, xmm10
207        pxor    xmm7, xmm11
208        movdqa  xmmword ptr [rsp+0x100], xmm8
209        movdqa  xmm8, xmm4
210        psrld   xmm8, 12
211        pslld   xmm4, 20
212        por     xmm4, xmm8
213        movdqa  xmm8, xmm5
214        psrld   xmm8, 12
215        pslld   xmm5, 20
216        por     xmm5, xmm8
217        movdqa  xmm8, xmm6
218        psrld   xmm8, 12
219        pslld   xmm6, 20
220        por     xmm6, xmm8
221        movdqa  xmm8, xmm7
222        psrld   xmm8, 12
223        pslld   xmm7, 20
224        por     xmm7, xmm8
225        paddd   xmm0, xmmword ptr [rsp+0x10]
226        paddd   xmm1, xmmword ptr [rsp+0x30]
227        paddd   xmm2, xmmword ptr [rsp+0x50]
228        paddd   xmm3, xmmword ptr [rsp+0x70]
229        paddd   xmm0, xmm4
230        paddd   xmm1, xmm5
231        paddd   xmm2, xmm6
232        paddd   xmm3, xmm7
233        pxor    xmm12, xmm0
234        pxor    xmm13, xmm1
235        pxor    xmm14, xmm2
236        pxor    xmm15, xmm3
237        movdqa  xmm8, xmmword ptr [ROT8+rip]
238        pshufb  xmm12, xmm8
239        pshufb  xmm13, xmm8
240        pshufb  xmm14, xmm8
241        pshufb  xmm15, xmm8
242        movdqa  xmm8, xmmword ptr [rsp+0x100]
243        paddd   xmm8, xmm12
244        paddd   xmm9, xmm13
245        paddd   xmm10, xmm14
246        paddd   xmm11, xmm15
247        pxor    xmm4, xmm8
248        pxor    xmm5, xmm9
249        pxor    xmm6, xmm10
250        pxor    xmm7, xmm11
251        movdqa  xmmword ptr [rsp+0x100], xmm8
252        movdqa  xmm8, xmm4
253        psrld   xmm8, 7
254        pslld   xmm4, 25
255        por     xmm4, xmm8
256        movdqa  xmm8, xmm5
257        psrld   xmm8, 7
258        pslld   xmm5, 25
259        por     xmm5, xmm8
260        movdqa  xmm8, xmm6
261        psrld   xmm8, 7
262        pslld   xmm6, 25
263        por     xmm6, xmm8
264        movdqa  xmm8, xmm7
265        psrld   xmm8, 7
266        pslld   xmm7, 25
267        por     xmm7, xmm8
268        paddd   xmm0, xmmword ptr [rsp+0x80]
269        paddd   xmm1, xmmword ptr [rsp+0xA0]
270        paddd   xmm2, xmmword ptr [rsp+0xC0]
271        paddd   xmm3, xmmword ptr [rsp+0xE0]
272        paddd   xmm0, xmm5
273        paddd   xmm1, xmm6
274        paddd   xmm2, xmm7
275        paddd   xmm3, xmm4
276        pxor    xmm15, xmm0
277        pxor    xmm12, xmm1
278        pxor    xmm13, xmm2
279        pxor    xmm14, xmm3
280        movdqa  xmm8, xmmword ptr [ROT16+rip]
281        pshufb  xmm15, xmm8
282        pshufb  xmm12, xmm8
283        pshufb  xmm13, xmm8
284        pshufb  xmm14, xmm8
285        paddd   xmm10, xmm15
286        paddd   xmm11, xmm12
287        movdqa  xmm8, xmmword ptr [rsp+0x100]
288        paddd   xmm8, xmm13
289        paddd   xmm9, xmm14
290        pxor    xmm5, xmm10
291        pxor    xmm6, xmm11
292        pxor    xmm7, xmm8
293        pxor    xmm4, xmm9
294        movdqa  xmmword ptr [rsp+0x100], xmm8
295        movdqa  xmm8, xmm5
296        psrld   xmm8, 12
297        pslld   xmm5, 20
298        por     xmm5, xmm8
299        movdqa  xmm8, xmm6
300        psrld   xmm8, 12
301        pslld   xmm6, 20
302        por     xmm6, xmm8
303        movdqa  xmm8, xmm7
304        psrld   xmm8, 12
305        pslld   xmm7, 20
306        por     xmm7, xmm8
307        movdqa  xmm8, xmm4
308        psrld   xmm8, 12
309        pslld   xmm4, 20
310        por     xmm4, xmm8
311        paddd   xmm0, xmmword ptr [rsp+0x90]
312        paddd   xmm1, xmmword ptr [rsp+0xB0]
313        paddd   xmm2, xmmword ptr [rsp+0xD0]
314        paddd   xmm3, xmmword ptr [rsp+0xF0]
315        paddd   xmm0, xmm5
316        paddd   xmm1, xmm6
317        paddd   xmm2, xmm7
318        paddd   xmm3, xmm4
319        pxor    xmm15, xmm0
320        pxor    xmm12, xmm1
321        pxor    xmm13, xmm2
322        pxor    xmm14, xmm3
323        movdqa  xmm8, xmmword ptr [ROT8+rip]
324        pshufb  xmm15, xmm8
325        pshufb  xmm12, xmm8
326        pshufb  xmm13, xmm8
327        pshufb  xmm14, xmm8
328        paddd   xmm10, xmm15
329        paddd   xmm11, xmm12
330        movdqa  xmm8, xmmword ptr [rsp+0x100]
331        paddd   xmm8, xmm13
332        paddd   xmm9, xmm14
333        pxor    xmm5, xmm10
334        pxor    xmm6, xmm11
335        pxor    xmm7, xmm8
336        pxor    xmm4, xmm9
337        movdqa  xmmword ptr [rsp+0x100], xmm8
338        movdqa  xmm8, xmm5
339        psrld   xmm8, 7
340        pslld   xmm5, 25
341        por     xmm5, xmm8
342        movdqa  xmm8, xmm6
343        psrld   xmm8, 7
344        pslld   xmm6, 25
345        por     xmm6, xmm8
346        movdqa  xmm8, xmm7
347        psrld   xmm8, 7
348        pslld   xmm7, 25
349        por     xmm7, xmm8
350        movdqa  xmm8, xmm4
351        psrld   xmm8, 7
352        pslld   xmm4, 25
353        por     xmm4, xmm8
354        paddd   xmm0, xmmword ptr [rsp+0x20]
355        paddd   xmm1, xmmword ptr [rsp+0x30]
356        paddd   xmm2, xmmword ptr [rsp+0x70]
357        paddd   xmm3, xmmword ptr [rsp+0x40]
358        paddd   xmm0, xmm4
359        paddd   xmm1, xmm5
360        paddd   xmm2, xmm6
361        paddd   xmm3, xmm7
362        pxor    xmm12, xmm0
363        pxor    xmm13, xmm1
364        pxor    xmm14, xmm2
365        pxor    xmm15, xmm3
366        movdqa  xmm8, xmmword ptr [ROT16+rip]
367        pshufb  xmm12, xmm8
368        pshufb  xmm13, xmm8
369        pshufb  xmm14, xmm8
370        pshufb  xmm15, xmm8
371        movdqa  xmm8, xmmword ptr [rsp+0x100]
372        paddd   xmm8, xmm12
373        paddd   xmm9, xmm13
374        paddd   xmm10, xmm14
375        paddd   xmm11, xmm15
376        pxor    xmm4, xmm8
377        pxor    xmm5, xmm9
378        pxor    xmm6, xmm10
379        pxor    xmm7, xmm11
380        movdqa  xmmword ptr [rsp+0x100], xmm8
381        movdqa  xmm8, xmm4
382        psrld   xmm8, 12
383        pslld   xmm4, 20
384        por     xmm4, xmm8
385        movdqa  xmm8, xmm5
386        psrld   xmm8, 12
387        pslld   xmm5, 20
388        por     xmm5, xmm8
389        movdqa  xmm8, xmm6
390        psrld   xmm8, 12
391        pslld   xmm6, 20
392        por     xmm6, xmm8
393        movdqa  xmm8, xmm7
394        psrld   xmm8, 12
395        pslld   xmm7, 20
396        por     xmm7, xmm8
397        paddd   xmm0, xmmword ptr [rsp+0x60]
398        paddd   xmm1, xmmword ptr [rsp+0xA0]
399        paddd   xmm2, xmmword ptr [rsp]
400        paddd   xmm3, xmmword ptr [rsp+0xD0]
401        paddd   xmm0, xmm4
402        paddd   xmm1, xmm5
403        paddd   xmm2, xmm6
404        paddd   xmm3, xmm7
405        pxor    xmm12, xmm0
406        pxor    xmm13, xmm1
407        pxor    xmm14, xmm2
408        pxor    xmm15, xmm3
409        movdqa  xmm8, xmmword ptr [ROT8+rip]
410        pshufb  xmm12, xmm8
411        pshufb  xmm13, xmm8
412        pshufb  xmm14, xmm8
413        pshufb  xmm15, xmm8
414        movdqa  xmm8, xmmword ptr [rsp+0x100]
415        paddd   xmm8, xmm12
416        paddd   xmm9, xmm13
417        paddd   xmm10, xmm14
418        paddd   xmm11, xmm15
419        pxor    xmm4, xmm8
420        pxor    xmm5, xmm9
421        pxor    xmm6, xmm10
422        pxor    xmm7, xmm11
423        movdqa  xmmword ptr [rsp+0x100], xmm8
424        movdqa  xmm8, xmm4
425        psrld   xmm8, 7
426        pslld   xmm4, 25
427        por     xmm4, xmm8
428        movdqa  xmm8, xmm5
429        psrld   xmm8, 7
430        pslld   xmm5, 25
431        por     xmm5, xmm8
432        movdqa  xmm8, xmm6
433        psrld   xmm8, 7
434        pslld   xmm6, 25
435        por     xmm6, xmm8
436        movdqa  xmm8, xmm7
437        psrld   xmm8, 7
438        pslld   xmm7, 25
439        por     xmm7, xmm8
440        paddd   xmm0, xmmword ptr [rsp+0x10]
441        paddd   xmm1, xmmword ptr [rsp+0xC0]
442        paddd   xmm2, xmmword ptr [rsp+0x90]
443        paddd   xmm3, xmmword ptr [rsp+0xF0]
444        paddd   xmm0, xmm5
445        paddd   xmm1, xmm6
446        paddd   xmm2, xmm7
447        paddd   xmm3, xmm4
448        pxor    xmm15, xmm0
449        pxor    xmm12, xmm1
450        pxor    xmm13, xmm2
451        pxor    xmm14, xmm3
452        movdqa  xmm8, xmmword ptr [ROT16+rip]
453        pshufb  xmm15, xmm8
454        pshufb  xmm12, xmm8
455        pshufb  xmm13, xmm8
456        pshufb  xmm14, xmm8
457        paddd   xmm10, xmm15
458        paddd   xmm11, xmm12
459        movdqa  xmm8, xmmword ptr [rsp+0x100]
460        paddd   xmm8, xmm13
461        paddd   xmm9, xmm14
462        pxor    xmm5, xmm10
463        pxor    xmm6, xmm11
464        pxor    xmm7, xmm8
465        pxor    xmm4, xmm9
466        movdqa  xmmword ptr [rsp+0x100], xmm8
467        movdqa  xmm8, xmm5
468        psrld   xmm8, 12
469        pslld   xmm5, 20
470        por     xmm5, xmm8
471        movdqa  xmm8, xmm6
472        psrld   xmm8, 12
473        pslld   xmm6, 20
474        por     xmm6, xmm8
475        movdqa  xmm8, xmm7
476        psrld   xmm8, 12
477        pslld   xmm7, 20
478        por     xmm7, xmm8
479        movdqa  xmm8, xmm4
480        psrld   xmm8, 12
481        pslld   xmm4, 20
482        por     xmm4, xmm8
483        paddd   xmm0, xmmword ptr [rsp+0xB0]
484        paddd   xmm1, xmmword ptr [rsp+0x50]
485        paddd   xmm2, xmmword ptr [rsp+0xE0]
486        paddd   xmm3, xmmword ptr [rsp+0x80]
487        paddd   xmm0, xmm5
488        paddd   xmm1, xmm6
489        paddd   xmm2, xmm7
490        paddd   xmm3, xmm4
491        pxor    xmm15, xmm0
492        pxor    xmm12, xmm1
493        pxor    xmm13, xmm2
494        pxor    xmm14, xmm3
495        movdqa  xmm8, xmmword ptr [ROT8+rip]
496        pshufb  xmm15, xmm8
497        pshufb  xmm12, xmm8
498        pshufb  xmm13, xmm8
499        pshufb  xmm14, xmm8
500        paddd   xmm10, xmm15
501        paddd   xmm11, xmm12
502        movdqa  xmm8, xmmword ptr [rsp+0x100]
503        paddd   xmm8, xmm13
504        paddd   xmm9, xmm14
505        pxor    xmm5, xmm10
506        pxor    xmm6, xmm11
507        pxor    xmm7, xmm8
508        pxor    xmm4, xmm9
509        movdqa  xmmword ptr [rsp+0x100], xmm8
510        movdqa  xmm8, xmm5
511        psrld   xmm8, 7
512        pslld   xmm5, 25
513        por     xmm5, xmm8
514        movdqa  xmm8, xmm6
515        psrld   xmm8, 7
516        pslld   xmm6, 25
517        por     xmm6, xmm8
518        movdqa  xmm8, xmm7
519        psrld   xmm8, 7
520        pslld   xmm7, 25
521        por     xmm7, xmm8
522        movdqa  xmm8, xmm4
523        psrld   xmm8, 7
524        pslld   xmm4, 25
525        por     xmm4, xmm8
526        paddd   xmm0, xmmword ptr [rsp+0x30]
527        paddd   xmm1, xmmword ptr [rsp+0xA0]
528        paddd   xmm2, xmmword ptr [rsp+0xD0]
529        paddd   xmm3, xmmword ptr [rsp+0x70]
530        paddd   xmm0, xmm4
531        paddd   xmm1, xmm5
532        paddd   xmm2, xmm6
533        paddd   xmm3, xmm7
534        pxor    xmm12, xmm0
535        pxor    xmm13, xmm1
536        pxor    xmm14, xmm2
537        pxor    xmm15, xmm3
538        movdqa  xmm8, xmmword ptr [ROT16+rip]
539        pshufb  xmm12, xmm8
540        pshufb  xmm13, xmm8
541        pshufb  xmm14, xmm8
542        pshufb  xmm15, xmm8
543        movdqa  xmm8, xmmword ptr [rsp+0x100]
544        paddd   xmm8, xmm12
545        paddd   xmm9, xmm13
546        paddd   xmm10, xmm14
547        paddd   xmm11, xmm15
548        pxor    xmm4, xmm8
549        pxor    xmm5, xmm9
550        pxor    xmm6, xmm10
551        pxor    xmm7, xmm11
552        movdqa  xmmword ptr [rsp+0x100], xmm8
553        movdqa  xmm8, xmm4
554        psrld   xmm8, 12
555        pslld   xmm4, 20
556        por     xmm4, xmm8
557        movdqa  xmm8, xmm5
558        psrld   xmm8, 12
559        pslld   xmm5, 20
560        por     xmm5, xmm8
561        movdqa  xmm8, xmm6
562        psrld   xmm8, 12
563        pslld   xmm6, 20
564        por     xmm6, xmm8
565        movdqa  xmm8, xmm7
566        psrld   xmm8, 12
567        pslld   xmm7, 20
568        por     xmm7, xmm8
569        paddd   xmm0, xmmword ptr [rsp+0x40]
570        paddd   xmm1, xmmword ptr [rsp+0xC0]
571        paddd   xmm2, xmmword ptr [rsp+0x20]
572        paddd   xmm3, xmmword ptr [rsp+0xE0]
573        paddd   xmm0, xmm4
574        paddd   xmm1, xmm5
575        paddd   xmm2, xmm6
576        paddd   xmm3, xmm7
577        pxor    xmm12, xmm0
578        pxor    xmm13, xmm1
579        pxor    xmm14, xmm2
580        pxor    xmm15, xmm3
581        movdqa  xmm8, xmmword ptr [ROT8+rip]
582        pshufb  xmm12, xmm8
583        pshufb  xmm13, xmm8
584        pshufb  xmm14, xmm8
585        pshufb  xmm15, xmm8
586        movdqa  xmm8, xmmword ptr [rsp+0x100]
587        paddd   xmm8, xmm12
588        paddd   xmm9, xmm13
589        paddd   xmm10, xmm14
590        paddd   xmm11, xmm15
591        pxor    xmm4, xmm8
592        pxor    xmm5, xmm9
593        pxor    xmm6, xmm10
594        pxor    xmm7, xmm11
595        movdqa  xmmword ptr [rsp+0x100], xmm8
596        movdqa  xmm8, xmm4
597        psrld   xmm8, 7
598        pslld   xmm4, 25
599        por     xmm4, xmm8
600        movdqa  xmm8, xmm5
601        psrld   xmm8, 7
602        pslld   xmm5, 25
603        por     xmm5, xmm8
604        movdqa  xmm8, xmm6
605        psrld   xmm8, 7
606        pslld   xmm6, 25
607        por     xmm6, xmm8
608        movdqa  xmm8, xmm7
609        psrld   xmm8, 7
610        pslld   xmm7, 25
611        por     xmm7, xmm8
612        paddd   xmm0, xmmword ptr [rsp+0x60]
613        paddd   xmm1, xmmword ptr [rsp+0x90]
614        paddd   xmm2, xmmword ptr [rsp+0xB0]
615        paddd   xmm3, xmmword ptr [rsp+0x80]
616        paddd   xmm0, xmm5
617        paddd   xmm1, xmm6
618        paddd   xmm2, xmm7
619        paddd   xmm3, xmm4
620        pxor    xmm15, xmm0
621        pxor    xmm12, xmm1
622        pxor    xmm13, xmm2
623        pxor    xmm14, xmm3
624        movdqa  xmm8, xmmword ptr [ROT16+rip]
625        pshufb  xmm15, xmm8
626        pshufb  xmm12, xmm8
627        pshufb  xmm13, xmm8
628        pshufb  xmm14, xmm8
629        paddd   xmm10, xmm15
630        paddd   xmm11, xmm12
631        movdqa  xmm8, xmmword ptr [rsp+0x100]
632        paddd   xmm8, xmm13
633        paddd   xmm9, xmm14
634        pxor    xmm5, xmm10
635        pxor    xmm6, xmm11
636        pxor    xmm7, xmm8
637        pxor    xmm4, xmm9
638        movdqa  xmmword ptr [rsp+0x100], xmm8
639        movdqa  xmm8, xmm5
640        psrld   xmm8, 12
641        pslld   xmm5, 20
642        por     xmm5, xmm8
643        movdqa  xmm8, xmm6
644        psrld   xmm8, 12
645        pslld   xmm6, 20
646        por     xmm6, xmm8
647        movdqa  xmm8, xmm7
648        psrld   xmm8, 12
649        pslld   xmm7, 20
650        por     xmm7, xmm8
651        movdqa  xmm8, xmm4
652        psrld   xmm8, 12
653        pslld   xmm4, 20
654        por     xmm4, xmm8
655        paddd   xmm0, xmmword ptr [rsp+0x50]
656        paddd   xmm1, xmmword ptr [rsp]
657        paddd   xmm2, xmmword ptr [rsp+0xF0]
658        paddd   xmm3, xmmword ptr [rsp+0x10]
659        paddd   xmm0, xmm5
660        paddd   xmm1, xmm6
661        paddd   xmm2, xmm7
662        paddd   xmm3, xmm4
663        pxor    xmm15, xmm0
664        pxor    xmm12, xmm1
665        pxor    xmm13, xmm2
666        pxor    xmm14, xmm3
667        movdqa  xmm8, xmmword ptr [ROT8+rip]
668        pshufb  xmm15, xmm8
669        pshufb  xmm12, xmm8
670        pshufb  xmm13, xmm8
671        pshufb  xmm14, xmm8
672        paddd   xmm10, xmm15
673        paddd   xmm11, xmm12
674        movdqa  xmm8, xmmword ptr [rsp+0x100]
675        paddd   xmm8, xmm13
676        paddd   xmm9, xmm14
677        pxor    xmm5, xmm10
678        pxor    xmm6, xmm11
679        pxor    xmm7, xmm8
680        pxor    xmm4, xmm9
681        movdqa  xmmword ptr [rsp+0x100], xmm8
682        movdqa  xmm8, xmm5
683        psrld   xmm8, 7
684        pslld   xmm5, 25
685        por     xmm5, xmm8
686        movdqa  xmm8, xmm6
687        psrld   xmm8, 7
688        pslld   xmm6, 25
689        por     xmm6, xmm8
690        movdqa  xmm8, xmm7
691        psrld   xmm8, 7
692        pslld   xmm7, 25
693        por     xmm7, xmm8
694        movdqa  xmm8, xmm4
695        psrld   xmm8, 7
696        pslld   xmm4, 25
697        por     xmm4, xmm8
698        paddd   xmm0, xmmword ptr [rsp+0xA0]
699        paddd   xmm1, xmmword ptr [rsp+0xC0]
700        paddd   xmm2, xmmword ptr [rsp+0xE0]
701        paddd   xmm3, xmmword ptr [rsp+0xD0]
702        paddd   xmm0, xmm4
703        paddd   xmm1, xmm5
704        paddd   xmm2, xmm6
705        paddd   xmm3, xmm7
706        pxor    xmm12, xmm0
707        pxor    xmm13, xmm1
708        pxor    xmm14, xmm2
709        pxor    xmm15, xmm3
710        movdqa  xmm8, xmmword ptr [ROT16+rip]
711        pshufb  xmm12, xmm8
712        pshufb  xmm13, xmm8
713        pshufb  xmm14, xmm8
714        pshufb  xmm15, xmm8
715        movdqa  xmm8, xmmword ptr [rsp+0x100]
716        paddd   xmm8, xmm12
717        paddd   xmm9, xmm13
718        paddd   xmm10, xmm14
719        paddd   xmm11, xmm15
720        pxor    xmm4, xmm8
721        pxor    xmm5, xmm9
722        pxor    xmm6, xmm10
723        pxor    xmm7, xmm11
724        movdqa  xmmword ptr [rsp+0x100], xmm8
725        movdqa  xmm8, xmm4
726        psrld   xmm8, 12
727        pslld   xmm4, 20
728        por     xmm4, xmm8
729        movdqa  xmm8, xmm5
730        psrld   xmm8, 12
731        pslld   xmm5, 20
732        por     xmm5, xmm8
733        movdqa  xmm8, xmm6
734        psrld   xmm8, 12
735        pslld   xmm6, 20
736        por     xmm6, xmm8
737        movdqa  xmm8, xmm7
738        psrld   xmm8, 12
739        pslld   xmm7, 20
740        por     xmm7, xmm8
741        paddd   xmm0, xmmword ptr [rsp+0x70]
742        paddd   xmm1, xmmword ptr [rsp+0x90]
743        paddd   xmm2, xmmword ptr [rsp+0x30]
744        paddd   xmm3, xmmword ptr [rsp+0xF0]
745        paddd   xmm0, xmm4
746        paddd   xmm1, xmm5
747        paddd   xmm2, xmm6
748        paddd   xmm3, xmm7
749        pxor    xmm12, xmm0
750        pxor    xmm13, xmm1
751        pxor    xmm14, xmm2
752        pxor    xmm15, xmm3
753        movdqa  xmm8, xmmword ptr [ROT8+rip]
754        pshufb  xmm12, xmm8
755        pshufb  xmm13, xmm8
756        pshufb  xmm14, xmm8
757        pshufb  xmm15, xmm8
758        movdqa  xmm8, xmmword ptr [rsp+0x100]
759        paddd   xmm8, xmm12
760        paddd   xmm9, xmm13
761        paddd   xmm10, xmm14
762        paddd   xmm11, xmm15
763        pxor    xmm4, xmm8
764        pxor    xmm5, xmm9
765        pxor    xmm6, xmm10
766        pxor    xmm7, xmm11
767        movdqa  xmmword ptr [rsp+0x100], xmm8
768        movdqa  xmm8, xmm4
769        psrld   xmm8, 7
770        pslld   xmm4, 25
771        por     xmm4, xmm8
772        movdqa  xmm8, xmm5
773        psrld   xmm8, 7
774        pslld   xmm5, 25
775        por     xmm5, xmm8
776        movdqa  xmm8, xmm6
777        psrld   xmm8, 7
778        pslld   xmm6, 25
779        por     xmm6, xmm8
780        movdqa  xmm8, xmm7
781        psrld   xmm8, 7
782        pslld   xmm7, 25
783        por     xmm7, xmm8
784        paddd   xmm0, xmmword ptr [rsp+0x40]
785        paddd   xmm1, xmmword ptr [rsp+0xB0]
786        paddd   xmm2, xmmword ptr [rsp+0x50]
787        paddd   xmm3, xmmword ptr [rsp+0x10]
788        paddd   xmm0, xmm5
789        paddd   xmm1, xmm6
790        paddd   xmm2, xmm7
791        paddd   xmm3, xmm4
792        pxor    xmm15, xmm0
793        pxor    xmm12, xmm1
794        pxor    xmm13, xmm2
795        pxor    xmm14, xmm3
796        movdqa  xmm8, xmmword ptr [ROT16+rip]
797        pshufb  xmm15, xmm8
798        pshufb  xmm12, xmm8
799        pshufb  xmm13, xmm8
800        pshufb  xmm14, xmm8
801        paddd   xmm10, xmm15
802        paddd   xmm11, xmm12
803        movdqa  xmm8, xmmword ptr [rsp+0x100]
804        paddd   xmm8, xmm13
805        paddd   xmm9, xmm14
806        pxor    xmm5, xmm10
807        pxor    xmm6, xmm11
808        pxor    xmm7, xmm8
809        pxor    xmm4, xmm9
810        movdqa  xmmword ptr [rsp+0x100], xmm8
811        movdqa  xmm8, xmm5
812        psrld   xmm8, 12
813        pslld   xmm5, 20
814        por     xmm5, xmm8
815        movdqa  xmm8, xmm6
816        psrld   xmm8, 12
817        pslld   xmm6, 20
818        por     xmm6, xmm8
819        movdqa  xmm8, xmm7
820        psrld   xmm8, 12
821        pslld   xmm7, 20
822        por     xmm7, xmm8
823        movdqa  xmm8, xmm4
824        psrld   xmm8, 12
825        pslld   xmm4, 20
826        por     xmm4, xmm8
827        paddd   xmm0, xmmword ptr [rsp]
828        paddd   xmm1, xmmword ptr [rsp+0x20]
829        paddd   xmm2, xmmword ptr [rsp+0x80]
830        paddd   xmm3, xmmword ptr [rsp+0x60]
831        paddd   xmm0, xmm5
832        paddd   xmm1, xmm6
833        paddd   xmm2, xmm7
834        paddd   xmm3, xmm4
835        pxor    xmm15, xmm0
836        pxor    xmm12, xmm1
837        pxor    xmm13, xmm2
838        pxor    xmm14, xmm3
839        movdqa  xmm8, xmmword ptr [ROT8+rip]
840        pshufb  xmm15, xmm8
841        pshufb  xmm12, xmm8
842        pshufb  xmm13, xmm8
843        pshufb  xmm14, xmm8
844        paddd   xmm10, xmm15
845        paddd   xmm11, xmm12
846        movdqa  xmm8, xmmword ptr [rsp+0x100]
847        paddd   xmm8, xmm13
848        paddd   xmm9, xmm14
849        pxor    xmm5, xmm10
850        pxor    xmm6, xmm11
851        pxor    xmm7, xmm8
852        pxor    xmm4, xmm9
853        movdqa  xmmword ptr [rsp+0x100], xmm8
854        movdqa  xmm8, xmm5
855        psrld   xmm8, 7
856        pslld   xmm5, 25
857        por     xmm5, xmm8
858        movdqa  xmm8, xmm6
859        psrld   xmm8, 7
860        pslld   xmm6, 25
861        por     xmm6, xmm8
862        movdqa  xmm8, xmm7
863        psrld   xmm8, 7
864        pslld   xmm7, 25
865        por     xmm7, xmm8
866        movdqa  xmm8, xmm4
867        psrld   xmm8, 7
868        pslld   xmm4, 25
869        por     xmm4, xmm8
870        paddd   xmm0, xmmword ptr [rsp+0xC0]
871        paddd   xmm1, xmmword ptr [rsp+0x90]
872        paddd   xmm2, xmmword ptr [rsp+0xF0]
873        paddd   xmm3, xmmword ptr [rsp+0xE0]
874        paddd   xmm0, xmm4
875        paddd   xmm1, xmm5
876        paddd   xmm2, xmm6
877        paddd   xmm3, xmm7
878        pxor    xmm12, xmm0
879        pxor    xmm13, xmm1
880        pxor    xmm14, xmm2
881        pxor    xmm15, xmm3
882        movdqa  xmm8, xmmword ptr [ROT16+rip]
883        pshufb  xmm12, xmm8
884        pshufb  xmm13, xmm8
885        pshufb  xmm14, xmm8
886        pshufb  xmm15, xmm8
887        movdqa  xmm8, xmmword ptr [rsp+0x100]
888        paddd   xmm8, xmm12
889        paddd   xmm9, xmm13
890        paddd   xmm10, xmm14
891        paddd   xmm11, xmm15
892        pxor    xmm4, xmm8
893        pxor    xmm5, xmm9
894        pxor    xmm6, xmm10
895        pxor    xmm7, xmm11
896        movdqa  xmmword ptr [rsp+0x100], xmm8
897        movdqa  xmm8, xmm4
898        psrld   xmm8, 12
899        pslld   xmm4, 20
900        por     xmm4, xmm8
901        movdqa  xmm8, xmm5
902        psrld   xmm8, 12
903        pslld   xmm5, 20
904        por     xmm5, xmm8
905        movdqa  xmm8, xmm6
906        psrld   xmm8, 12
907        pslld   xmm6, 20
908        por     xmm6, xmm8
909        movdqa  xmm8, xmm7
910        psrld   xmm8, 12
911        pslld   xmm7, 20
912        por     xmm7, xmm8
913        paddd   xmm0, xmmword ptr [rsp+0xD0]
914        paddd   xmm1, xmmword ptr [rsp+0xB0]
915        paddd   xmm2, xmmword ptr [rsp+0xA0]
916        paddd   xmm3, xmmword ptr [rsp+0x80]
917        paddd   xmm0, xmm4
918        paddd   xmm1, xmm5
919        paddd   xmm2, xmm6
920        paddd   xmm3, xmm7
921        pxor    xmm12, xmm0
922        pxor    xmm13, xmm1
923        pxor    xmm14, xmm2
924        pxor    xmm15, xmm3
925        movdqa  xmm8, xmmword ptr [ROT8+rip]
926        pshufb  xmm12, xmm8
927        pshufb  xmm13, xmm8
928        pshufb  xmm14, xmm8
929        pshufb  xmm15, xmm8
930        movdqa  xmm8, xmmword ptr [rsp+0x100]
931        paddd   xmm8, xmm12
932        paddd   xmm9, xmm13
933        paddd   xmm10, xmm14
934        paddd   xmm11, xmm15
935        pxor    xmm4, xmm8
936        pxor    xmm5, xmm9
937        pxor    xmm6, xmm10
938        pxor    xmm7, xmm11
939        movdqa  xmmword ptr [rsp+0x100], xmm8
940        movdqa  xmm8, xmm4
941        psrld   xmm8, 7
942        pslld   xmm4, 25
943        por     xmm4, xmm8
944        movdqa  xmm8, xmm5
945        psrld   xmm8, 7
946        pslld   xmm5, 25
947        por     xmm5, xmm8
948        movdqa  xmm8, xmm6
949        psrld   xmm8, 7
950        pslld   xmm6, 25
951        por     xmm6, xmm8
952        movdqa  xmm8, xmm7
953        psrld   xmm8, 7
954        pslld   xmm7, 25
955        por     xmm7, xmm8
956        paddd   xmm0, xmmword ptr [rsp+0x70]
957        paddd   xmm1, xmmword ptr [rsp+0x50]
958        paddd   xmm2, xmmword ptr [rsp]
959        paddd   xmm3, xmmword ptr [rsp+0x60]
960        paddd   xmm0, xmm5
961        paddd   xmm1, xmm6
962        paddd   xmm2, xmm7
963        paddd   xmm3, xmm4
964        pxor    xmm15, xmm0
965        pxor    xmm12, xmm1
966        pxor    xmm13, xmm2
967        pxor    xmm14, xmm3
968        movdqa  xmm8, xmmword ptr [ROT16+rip]
969        pshufb  xmm15, xmm8
970        pshufb  xmm12, xmm8
971        pshufb  xmm13, xmm8
972        pshufb  xmm14, xmm8
973        paddd   xmm10, xmm15
974        paddd   xmm11, xmm12
975        movdqa  xmm8, xmmword ptr [rsp+0x100]
976        paddd   xmm8, xmm13
977        paddd   xmm9, xmm14
978        pxor    xmm5, xmm10
979        pxor    xmm6, xmm11
980        pxor    xmm7, xmm8
981        pxor    xmm4, xmm9
982        movdqa  xmmword ptr [rsp+0x100], xmm8
983        movdqa  xmm8, xmm5
984        psrld   xmm8, 12
985        pslld   xmm5, 20
986        por     xmm5, xmm8
987        movdqa  xmm8, xmm6
988        psrld   xmm8, 12
989        pslld   xmm6, 20
990        por     xmm6, xmm8
991        movdqa  xmm8, xmm7
992        psrld   xmm8, 12
993        pslld   xmm7, 20
994        por     xmm7, xmm8
995        movdqa  xmm8, xmm4
996        psrld   xmm8, 12
997        pslld   xmm4, 20
998        por     xmm4, xmm8
999        paddd   xmm0, xmmword ptr [rsp+0x20]
1000        paddd   xmm1, xmmword ptr [rsp+0x30]
1001        paddd   xmm2, xmmword ptr [rsp+0x10]
1002        paddd   xmm3, xmmword ptr [rsp+0x40]
1003        paddd   xmm0, xmm5
1004        paddd   xmm1, xmm6
1005        paddd   xmm2, xmm7
1006        paddd   xmm3, xmm4
1007        pxor    xmm15, xmm0
1008        pxor    xmm12, xmm1
1009        pxor    xmm13, xmm2
1010        pxor    xmm14, xmm3
1011        movdqa  xmm8, xmmword ptr [ROT8+rip]
1012        pshufb  xmm15, xmm8
1013        pshufb  xmm12, xmm8
1014        pshufb  xmm13, xmm8
1015        pshufb  xmm14, xmm8
1016        paddd   xmm10, xmm15
1017        paddd   xmm11, xmm12
1018        movdqa  xmm8, xmmword ptr [rsp+0x100]
1019        paddd   xmm8, xmm13
1020        paddd   xmm9, xmm14
1021        pxor    xmm5, xmm10
1022        pxor    xmm6, xmm11
1023        pxor    xmm7, xmm8
1024        pxor    xmm4, xmm9
1025        movdqa  xmmword ptr [rsp+0x100], xmm8
1026        movdqa  xmm8, xmm5
1027        psrld   xmm8, 7
1028        pslld   xmm5, 25
1029        por     xmm5, xmm8
1030        movdqa  xmm8, xmm6
1031        psrld   xmm8, 7
1032        pslld   xmm6, 25
1033        por     xmm6, xmm8
1034        movdqa  xmm8, xmm7
1035        psrld   xmm8, 7
1036        pslld   xmm7, 25
1037        por     xmm7, xmm8
1038        movdqa  xmm8, xmm4
1039        psrld   xmm8, 7
1040        pslld   xmm4, 25
1041        por     xmm4, xmm8
1042        paddd   xmm0, xmmword ptr [rsp+0x90]
1043        paddd   xmm1, xmmword ptr [rsp+0xB0]
1044        paddd   xmm2, xmmword ptr [rsp+0x80]
1045        paddd   xmm3, xmmword ptr [rsp+0xF0]
1046        paddd   xmm0, xmm4
1047        paddd   xmm1, xmm5
1048        paddd   xmm2, xmm6
1049        paddd   xmm3, xmm7
1050        pxor    xmm12, xmm0
1051        pxor    xmm13, xmm1
1052        pxor    xmm14, xmm2
1053        pxor    xmm15, xmm3
1054        movdqa  xmm8, xmmword ptr [ROT16+rip]
1055        pshufb  xmm12, xmm8
1056        pshufb  xmm13, xmm8
1057        pshufb  xmm14, xmm8
1058        pshufb  xmm15, xmm8
1059        movdqa  xmm8, xmmword ptr [rsp+0x100]
1060        paddd   xmm8, xmm12
1061        paddd   xmm9, xmm13
1062        paddd   xmm10, xmm14
1063        paddd   xmm11, xmm15
1064        pxor    xmm4, xmm8
1065        pxor    xmm5, xmm9
1066        pxor    xmm6, xmm10
1067        pxor    xmm7, xmm11
1068        movdqa  xmmword ptr [rsp+0x100], xmm8
1069        movdqa  xmm8, xmm4
1070        psrld   xmm8, 12
1071        pslld   xmm4, 20
1072        por     xmm4, xmm8
1073        movdqa  xmm8, xmm5
1074        psrld   xmm8, 12
1075        pslld   xmm5, 20
1076        por     xmm5, xmm8
1077        movdqa  xmm8, xmm6
1078        psrld   xmm8, 12
1079        pslld   xmm6, 20
1080        por     xmm6, xmm8
1081        movdqa  xmm8, xmm7
1082        psrld   xmm8, 12
1083        pslld   xmm7, 20
1084        por     xmm7, xmm8
1085        paddd   xmm0, xmmword ptr [rsp+0xE0]
1086        paddd   xmm1, xmmword ptr [rsp+0x50]
1087        paddd   xmm2, xmmword ptr [rsp+0xC0]
1088        paddd   xmm3, xmmword ptr [rsp+0x10]
1089        paddd   xmm0, xmm4
1090        paddd   xmm1, xmm5
1091        paddd   xmm2, xmm6
1092        paddd   xmm3, xmm7
1093        pxor    xmm12, xmm0
1094        pxor    xmm13, xmm1
1095        pxor    xmm14, xmm2
1096        pxor    xmm15, xmm3
1097        movdqa  xmm8, xmmword ptr [ROT8+rip]
1098        pshufb  xmm12, xmm8
1099        pshufb  xmm13, xmm8
1100        pshufb  xmm14, xmm8
1101        pshufb  xmm15, xmm8
1102        movdqa  xmm8, xmmword ptr [rsp+0x100]
1103        paddd   xmm8, xmm12
1104        paddd   xmm9, xmm13
1105        paddd   xmm10, xmm14
1106        paddd   xmm11, xmm15
1107        pxor    xmm4, xmm8
1108        pxor    xmm5, xmm9
1109        pxor    xmm6, xmm10
1110        pxor    xmm7, xmm11
1111        movdqa  xmmword ptr [rsp+0x100], xmm8
1112        movdqa  xmm8, xmm4
1113        psrld   xmm8, 7
1114        pslld   xmm4, 25
1115        por     xmm4, xmm8
1116        movdqa  xmm8, xmm5
1117        psrld   xmm8, 7
1118        pslld   xmm5, 25
1119        por     xmm5, xmm8
1120        movdqa  xmm8, xmm6
1121        psrld   xmm8, 7
1122        pslld   xmm6, 25
1123        por     xmm6, xmm8
1124        movdqa  xmm8, xmm7
1125        psrld   xmm8, 7
1126        pslld   xmm7, 25
1127        por     xmm7, xmm8
1128        paddd   xmm0, xmmword ptr [rsp+0xD0]
1129        paddd   xmm1, xmmword ptr [rsp]
1130        paddd   xmm2, xmmword ptr [rsp+0x20]
1131        paddd   xmm3, xmmword ptr [rsp+0x40]
1132        paddd   xmm0, xmm5
1133        paddd   xmm1, xmm6
1134        paddd   xmm2, xmm7
1135        paddd   xmm3, xmm4
1136        pxor    xmm15, xmm0
1137        pxor    xmm12, xmm1
1138        pxor    xmm13, xmm2
1139        pxor    xmm14, xmm3
1140        movdqa  xmm8, xmmword ptr [ROT16+rip]
1141        pshufb  xmm15, xmm8
1142        pshufb  xmm12, xmm8
1143        pshufb  xmm13, xmm8
1144        pshufb  xmm14, xmm8
1145        paddd   xmm10, xmm15
1146        paddd   xmm11, xmm12
1147        movdqa  xmm8, xmmword ptr [rsp+0x100]
1148        paddd   xmm8, xmm13
1149        paddd   xmm9, xmm14
1150        pxor    xmm5, xmm10
1151        pxor    xmm6, xmm11
1152        pxor    xmm7, xmm8
1153        pxor    xmm4, xmm9
1154        movdqa  xmmword ptr [rsp+0x100], xmm8
1155        movdqa  xmm8, xmm5
1156        psrld   xmm8, 12
1157        pslld   xmm5, 20
1158        por     xmm5, xmm8
1159        movdqa  xmm8, xmm6
1160        psrld   xmm8, 12
1161        pslld   xmm6, 20
1162        por     xmm6, xmm8
1163        movdqa  xmm8, xmm7
1164        psrld   xmm8, 12
1165        pslld   xmm7, 20
1166        por     xmm7, xmm8
1167        movdqa  xmm8, xmm4
1168        psrld   xmm8, 12
1169        pslld   xmm4, 20
1170        por     xmm4, xmm8
1171        paddd   xmm0, xmmword ptr [rsp+0x30]
1172        paddd   xmm1, xmmword ptr [rsp+0xA0]
1173        paddd   xmm2, xmmword ptr [rsp+0x60]
1174        paddd   xmm3, xmmword ptr [rsp+0x70]
1175        paddd   xmm0, xmm5
1176        paddd   xmm1, xmm6
1177        paddd   xmm2, xmm7
1178        paddd   xmm3, xmm4
1179        pxor    xmm15, xmm0
1180        pxor    xmm12, xmm1
1181        pxor    xmm13, xmm2
1182        pxor    xmm14, xmm3
1183        movdqa  xmm8, xmmword ptr [ROT8+rip]
1184        pshufb  xmm15, xmm8
1185        pshufb  xmm12, xmm8
1186        pshufb  xmm13, xmm8
1187        pshufb  xmm14, xmm8
1188        paddd   xmm10, xmm15
1189        paddd   xmm11, xmm12
1190        movdqa  xmm8, xmmword ptr [rsp+0x100]
1191        paddd   xmm8, xmm13
1192        paddd   xmm9, xmm14
1193        pxor    xmm5, xmm10
1194        pxor    xmm6, xmm11
1195        pxor    xmm7, xmm8
1196        pxor    xmm4, xmm9
1197        movdqa  xmmword ptr [rsp+0x100], xmm8
1198        movdqa  xmm8, xmm5
1199        psrld   xmm8, 7
1200        pslld   xmm5, 25
1201        por     xmm5, xmm8
1202        movdqa  xmm8, xmm6
1203        psrld   xmm8, 7
1204        pslld   xmm6, 25
1205        por     xmm6, xmm8
1206        movdqa  xmm8, xmm7
1207        psrld   xmm8, 7
1208        pslld   xmm7, 25
1209        por     xmm7, xmm8
1210        movdqa  xmm8, xmm4
1211        psrld   xmm8, 7
1212        pslld   xmm4, 25
1213        por     xmm4, xmm8
1214        paddd   xmm0, xmmword ptr [rsp+0xB0]
1215        paddd   xmm1, xmmword ptr [rsp+0x50]
1216        paddd   xmm2, xmmword ptr [rsp+0x10]
1217        paddd   xmm3, xmmword ptr [rsp+0x80]
1218        paddd   xmm0, xmm4
1219        paddd   xmm1, xmm5
1220        paddd   xmm2, xmm6
1221        paddd   xmm3, xmm7
1222        pxor    xmm12, xmm0
1223        pxor    xmm13, xmm1
1224        pxor    xmm14, xmm2
1225        pxor    xmm15, xmm3
1226        movdqa  xmm8, xmmword ptr [ROT16+rip]
1227        pshufb  xmm12, xmm8
1228        pshufb  xmm13, xmm8
1229        pshufb  xmm14, xmm8
1230        pshufb  xmm15, xmm8
1231        movdqa  xmm8, xmmword ptr [rsp+0x100]
1232        paddd   xmm8, xmm12
1233        paddd   xmm9, xmm13
1234        paddd   xmm10, xmm14
1235        paddd   xmm11, xmm15
1236        pxor    xmm4, xmm8
1237        pxor    xmm5, xmm9
1238        pxor    xmm6, xmm10
1239        pxor    xmm7, xmm11
1240        movdqa  xmmword ptr [rsp+0x100], xmm8
1241        movdqa  xmm8, xmm4
1242        psrld   xmm8, 12
1243        pslld   xmm4, 20
1244        por     xmm4, xmm8
1245        movdqa  xmm8, xmm5
1246        psrld   xmm8, 12
1247        pslld   xmm5, 20
1248        por     xmm5, xmm8
1249        movdqa  xmm8, xmm6
1250        psrld   xmm8, 12
1251        pslld   xmm6, 20
1252        por     xmm6, xmm8
1253        movdqa  xmm8, xmm7
1254        psrld   xmm8, 12
1255        pslld   xmm7, 20
1256        por     xmm7, xmm8
1257        paddd   xmm0, xmmword ptr [rsp+0xF0]
1258        paddd   xmm1, xmmword ptr [rsp]
1259        paddd   xmm2, xmmword ptr [rsp+0x90]
1260        paddd   xmm3, xmmword ptr [rsp+0x60]
1261        paddd   xmm0, xmm4
1262        paddd   xmm1, xmm5
1263        paddd   xmm2, xmm6
1264        paddd   xmm3, xmm7
1265        pxor    xmm12, xmm0
1266        pxor    xmm13, xmm1
1267        pxor    xmm14, xmm2
1268        pxor    xmm15, xmm3
1269        movdqa  xmm8, xmmword ptr [ROT8+rip]
1270        pshufb  xmm12, xmm8
1271        pshufb  xmm13, xmm8
1272        pshufb  xmm14, xmm8
1273        pshufb  xmm15, xmm8
1274        movdqa  xmm8, xmmword ptr [rsp+0x100]
1275        paddd   xmm8, xmm12
1276        paddd   xmm9, xmm13
1277        paddd   xmm10, xmm14
1278        paddd   xmm11, xmm15
1279        pxor    xmm4, xmm8
1280        pxor    xmm5, xmm9
1281        pxor    xmm6, xmm10
1282        pxor    xmm7, xmm11
1283        movdqa  xmmword ptr [rsp+0x100], xmm8
1284        movdqa  xmm8, xmm4
1285        psrld   xmm8, 7
1286        pslld   xmm4, 25
1287        por     xmm4, xmm8
1288        movdqa  xmm8, xmm5
1289        psrld   xmm8, 7
1290        pslld   xmm5, 25
1291        por     xmm5, xmm8
1292        movdqa  xmm8, xmm6
1293        psrld   xmm8, 7
1294        pslld   xmm6, 25
1295        por     xmm6, xmm8
1296        movdqa  xmm8, xmm7
1297        psrld   xmm8, 7
1298        pslld   xmm7, 25
1299        por     xmm7, xmm8
1300        paddd   xmm0, xmmword ptr [rsp+0xE0]
1301        paddd   xmm1, xmmword ptr [rsp+0x20]
1302        paddd   xmm2, xmmword ptr [rsp+0x30]
1303        paddd   xmm3, xmmword ptr [rsp+0x70]
1304        paddd   xmm0, xmm5
1305        paddd   xmm1, xmm6
1306        paddd   xmm2, xmm7
1307        paddd   xmm3, xmm4
1308        pxor    xmm15, xmm0
1309        pxor    xmm12, xmm1
1310        pxor    xmm13, xmm2
1311        pxor    xmm14, xmm3
1312        movdqa  xmm8, xmmword ptr [ROT16+rip]
1313        pshufb  xmm15, xmm8
1314        pshufb  xmm12, xmm8
1315        pshufb  xmm13, xmm8
1316        pshufb  xmm14, xmm8
1317        paddd   xmm10, xmm15
1318        paddd   xmm11, xmm12
1319        movdqa  xmm8, xmmword ptr [rsp+0x100]
1320        paddd   xmm8, xmm13
1321        paddd   xmm9, xmm14
1322        pxor    xmm5, xmm10
1323        pxor    xmm6, xmm11
1324        pxor    xmm7, xmm8
1325        pxor    xmm4, xmm9
1326        movdqa  xmmword ptr [rsp+0x100], xmm8
1327        movdqa  xmm8, xmm5
1328        psrld   xmm8, 12
1329        pslld   xmm5, 20
1330        por     xmm5, xmm8
1331        movdqa  xmm8, xmm6
1332        psrld   xmm8, 12
1333        pslld   xmm6, 20
1334        por     xmm6, xmm8
1335        movdqa  xmm8, xmm7
1336        psrld   xmm8, 12
1337        pslld   xmm7, 20
1338        por     xmm7, xmm8
1339        movdqa  xmm8, xmm4
1340        psrld   xmm8, 12
1341        pslld   xmm4, 20
1342        por     xmm4, xmm8
1343        paddd   xmm0, xmmword ptr [rsp+0xA0]
1344        paddd   xmm1, xmmword ptr [rsp+0xC0]
1345        paddd   xmm2, xmmword ptr [rsp+0x40]
1346        paddd   xmm3, xmmword ptr [rsp+0xD0]
1347        paddd   xmm0, xmm5
1348        paddd   xmm1, xmm6
1349        paddd   xmm2, xmm7
1350        paddd   xmm3, xmm4
1351        pxor    xmm15, xmm0
1352        pxor    xmm12, xmm1
1353        pxor    xmm13, xmm2
1354        pxor    xmm14, xmm3
1355        movdqa  xmm8, xmmword ptr [ROT8+rip]
1356        pshufb  xmm15, xmm8
1357        pshufb  xmm12, xmm8
1358        pshufb  xmm13, xmm8
1359        pshufb  xmm14, xmm8
1360        paddd   xmm10, xmm15
1361        paddd   xmm11, xmm12
1362        movdqa  xmm8, xmmword ptr [rsp+0x100]
1363        paddd   xmm8, xmm13
1364        paddd   xmm9, xmm14
1365        pxor    xmm5, xmm10
1366        pxor    xmm6, xmm11
1367        pxor    xmm7, xmm8
1368        pxor    xmm4, xmm9
1369        pxor    xmm0, xmm8
1370        pxor    xmm1, xmm9
1371        pxor    xmm2, xmm10
1372        pxor    xmm3, xmm11
1373        movdqa  xmm8, xmm5
1374        psrld   xmm8, 7
1375        pslld   xmm5, 25
1376        por     xmm5, xmm8
1377        movdqa  xmm8, xmm6
1378        psrld   xmm8, 7
1379        pslld   xmm6, 25
1380        por     xmm6, xmm8
1381        movdqa  xmm8, xmm7
1382        psrld   xmm8, 7
1383        pslld   xmm7, 25
1384        por     xmm7, xmm8
1385        movdqa  xmm8, xmm4
1386        psrld   xmm8, 7
1387        pslld   xmm4, 25
1388        por     xmm4, xmm8
1389        pxor    xmm4, xmm12
1390        pxor    xmm5, xmm13
1391        pxor    xmm6, xmm14
1392        pxor    xmm7, xmm15
1393        mov     eax, r13d
1394        jne     9b
1395        movdqa  xmm9, xmm0
1396        punpckldq xmm0, xmm1
1397        punpckhdq xmm9, xmm1
1398        movdqa  xmm11, xmm2
1399        punpckldq xmm2, xmm3
1400        punpckhdq xmm11, xmm3
1401        movdqa  xmm1, xmm0
1402        punpcklqdq xmm0, xmm2
1403        punpckhqdq xmm1, xmm2
1404        movdqa  xmm3, xmm9
1405        punpcklqdq xmm9, xmm11
1406        punpckhqdq xmm3, xmm11
1407        movdqu  xmmword ptr [rbx], xmm0
1408        movdqu  xmmword ptr [rbx+0x20], xmm1
1409        movdqu  xmmword ptr [rbx+0x40], xmm9
1410        movdqu  xmmword ptr [rbx+0x60], xmm3
1411        movdqa  xmm9, xmm4
1412        punpckldq xmm4, xmm5
1413        punpckhdq xmm9, xmm5
1414        movdqa  xmm11, xmm6
1415        punpckldq xmm6, xmm7
1416        punpckhdq xmm11, xmm7
1417        movdqa  xmm5, xmm4
1418        punpcklqdq xmm4, xmm6
1419        punpckhqdq xmm5, xmm6
1420        movdqa  xmm7, xmm9
1421        punpcklqdq xmm9, xmm11
1422        punpckhqdq xmm7, xmm11
1423        movdqu  xmmword ptr [rbx+0x10], xmm4
1424        movdqu  xmmword ptr [rbx+0x30], xmm5
1425        movdqu  xmmword ptr [rbx+0x50], xmm9
1426        movdqu  xmmword ptr [rbx+0x70], xmm7
1427        movdqa  xmm1, xmmword ptr [rsp+0x110]
1428        movdqa  xmm0, xmm1
1429        paddd   xmm1, xmmword ptr [rsp+0x150]
1430        movdqa  xmmword ptr [rsp+0x110], xmm1
1431        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1432        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1433        pcmpgtd xmm0, xmm1
1434        movdqa  xmm1, xmmword ptr [rsp+0x120]
1435        psubd   xmm1, xmm0
1436        movdqa  xmmword ptr [rsp+0x120], xmm1
1437        add     rbx, 128
1438        add     rdi, 32
1439        sub     rsi, 4
1440        cmp     rsi, 4
1441        jnc     2b
1442        test    rsi, rsi
1443        jne     3f
14444:
1445        movdqa  xmm6, xmmword ptr [rsp+0x170]
1446        movdqa  xmm7, xmmword ptr [rsp+0x180]
1447        movdqa  xmm8, xmmword ptr [rsp+0x190]
1448        movdqa  xmm9, xmmword ptr [rsp+0x1A0]
1449        movdqa  xmm10, xmmword ptr [rsp+0x1B0]
1450        movdqa  xmm11, xmmword ptr [rsp+0x1C0]
1451        movdqa  xmm12, xmmword ptr [rsp+0x1D0]
1452        movdqa  xmm13, xmmword ptr [rsp+0x1E0]
1453        movdqa  xmm14, xmmword ptr [rsp+0x1F0]
1454        movdqa  xmm15, xmmword ptr [rsp+0x200]
1455        mov     rsp, rbp
1456        pop     rbp
1457        pop     rbx
1458        pop     rdi
1459        pop     rsi
1460        pop     r12
1461        pop     r13
1462        pop     r14
1463        pop     r15
1464        ret
1465.p2align 5
14663:
1467        test    esi, 0x2
1468        je      3f
1469        movups  xmm0, xmmword ptr [rcx]
1470        movups  xmm1, xmmword ptr [rcx+0x10]
1471        movaps  xmm8, xmm0
1472        movaps  xmm9, xmm1
1473        movd    xmm13, dword ptr [rsp+0x110]
1474        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1475        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1476        movaps  xmmword ptr [rsp], xmm13
1477        movd    xmm14, dword ptr [rsp+0x114]
1478        pinsrd  xmm14, dword ptr [rsp+0x124], 1
1479        pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1480        movaps  xmmword ptr [rsp+0x10], xmm14
1481        mov     r8, qword ptr [rdi]
1482        mov     r9, qword ptr [rdi+0x8]
1483        movzx   eax, byte ptr [rbp+0x80]
1484        or      eax, r13d
1485        xor     edx, edx
14862:
1487        mov     r14d, eax
1488        or      eax, r12d
1489        add     rdx, 64
1490        cmp     rdx, r15
1491        cmovne  eax, r14d
1492        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1493        movaps  xmm10, xmm2
1494        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1495        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1496        movaps  xmm3, xmm4
1497        shufps  xmm4, xmm5, 136
1498        shufps  xmm3, xmm5, 221
1499        movaps  xmm5, xmm3
1500        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1501        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1502        movaps  xmm3, xmm6
1503        shufps  xmm6, xmm7, 136
1504        pshufd  xmm6, xmm6, 0x93
1505        shufps  xmm3, xmm7, 221
1506        pshufd  xmm7, xmm3, 0x93
1507        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1508        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1509        movaps  xmm11, xmm12
1510        shufps  xmm12, xmm13, 136
1511        shufps  xmm11, xmm13, 221
1512        movaps  xmm13, xmm11
1513        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1514        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1515        movaps  xmm11, xmm14
1516        shufps  xmm14, xmm15, 136
1517        pshufd  xmm14, xmm14, 0x93
1518        shufps  xmm11, xmm15, 221
1519        pshufd  xmm15, xmm11, 0x93
1520        movaps  xmm3, xmmword ptr [rsp]
1521        movaps  xmm11, xmmword ptr [rsp+0x10]
1522        pinsrd  xmm3, eax, 3
1523        pinsrd  xmm11, eax, 3
1524        mov     al, 7
15259:
1526        paddd   xmm0, xmm4
1527        paddd   xmm8, xmm12
1528        movaps  xmmword ptr [rsp+0x20], xmm4
1529        movaps  xmmword ptr [rsp+0x30], xmm12
1530        paddd   xmm0, xmm1
1531        paddd   xmm8, xmm9
1532        pxor    xmm3, xmm0
1533        pxor    xmm11, xmm8
1534        movaps  xmm12, xmmword ptr [ROT16+rip]
1535        pshufb  xmm3, xmm12
1536        pshufb  xmm11, xmm12
1537        paddd   xmm2, xmm3
1538        paddd   xmm10, xmm11
1539        pxor    xmm1, xmm2
1540        pxor    xmm9, xmm10
1541        movdqa  xmm4, xmm1
1542        pslld   xmm1, 20
1543        psrld   xmm4, 12
1544        por     xmm1, xmm4
1545        movdqa  xmm4, xmm9
1546        pslld   xmm9, 20
1547        psrld   xmm4, 12
1548        por     xmm9, xmm4
1549        paddd   xmm0, xmm5
1550        paddd   xmm8, xmm13
1551        movaps  xmmword ptr [rsp+0x40], xmm5
1552        movaps  xmmword ptr [rsp+0x50], xmm13
1553        paddd   xmm0, xmm1
1554        paddd   xmm8, xmm9
1555        pxor    xmm3, xmm0
1556        pxor    xmm11, xmm8
1557        movaps  xmm13, xmmword ptr [ROT8+rip]
1558        pshufb  xmm3, xmm13
1559        pshufb  xmm11, xmm13
1560        paddd   xmm2, xmm3
1561        paddd   xmm10, xmm11
1562        pxor    xmm1, xmm2
1563        pxor    xmm9, xmm10
1564        movdqa  xmm4, xmm1
1565        pslld   xmm1, 25
1566        psrld   xmm4, 7
1567        por     xmm1, xmm4
1568        movdqa  xmm4, xmm9
1569        pslld   xmm9, 25
1570        psrld   xmm4, 7
1571        por     xmm9, xmm4
1572        pshufd  xmm0, xmm0, 0x93
1573        pshufd  xmm8, xmm8, 0x93
1574        pshufd  xmm3, xmm3, 0x4E
1575        pshufd  xmm11, xmm11, 0x4E
1576        pshufd  xmm2, xmm2, 0x39
1577        pshufd  xmm10, xmm10, 0x39
1578        paddd   xmm0, xmm6
1579        paddd   xmm8, xmm14
1580        paddd   xmm0, xmm1
1581        paddd   xmm8, xmm9
1582        pxor    xmm3, xmm0
1583        pxor    xmm11, xmm8
1584        pshufb  xmm3, xmm12
1585        pshufb  xmm11, xmm12
1586        paddd   xmm2, xmm3
1587        paddd   xmm10, xmm11
1588        pxor    xmm1, xmm2
1589        pxor    xmm9, xmm10
1590        movdqa  xmm4, xmm1
1591        pslld   xmm1, 20
1592        psrld   xmm4, 12
1593        por     xmm1, xmm4
1594        movdqa  xmm4, xmm9
1595        pslld   xmm9, 20
1596        psrld   xmm4, 12
1597        por     xmm9, xmm4
1598        paddd   xmm0, xmm7
1599        paddd   xmm8, xmm15
1600        paddd   xmm0, xmm1
1601        paddd   xmm8, xmm9
1602        pxor    xmm3, xmm0
1603        pxor    xmm11, xmm8
1604        pshufb  xmm3, xmm13
1605        pshufb  xmm11, xmm13
1606        paddd   xmm2, xmm3
1607        paddd   xmm10, xmm11
1608        pxor    xmm1, xmm2
1609        pxor    xmm9, xmm10
1610        movdqa  xmm4, xmm1
1611        pslld   xmm1, 25
1612        psrld   xmm4, 7
1613        por     xmm1, xmm4
1614        movdqa  xmm4, xmm9
1615        pslld   xmm9, 25
1616        psrld   xmm4, 7
1617        por     xmm9, xmm4
1618        pshufd  xmm0, xmm0, 0x39
1619        pshufd  xmm8, xmm8, 0x39
1620        pshufd  xmm3, xmm3, 0x4E
1621        pshufd  xmm11, xmm11, 0x4E
1622        pshufd  xmm2, xmm2, 0x93
1623        pshufd  xmm10, xmm10, 0x93
1624        dec     al
1625        je      9f
1626        movdqa  xmm12, xmmword ptr [rsp+0x20]
1627        movdqa  xmm5, xmmword ptr [rsp+0x40]
1628        pshufd  xmm13, xmm12, 0x0F
1629        shufps  xmm12, xmm5, 214
1630        pshufd  xmm4, xmm12, 0x39
1631        movdqa  xmm12, xmm6
1632        shufps  xmm12, xmm7, 250
1633        pblendw xmm13, xmm12, 0xCC
1634        movdqa  xmm12, xmm7
1635        punpcklqdq xmm12, xmm5
1636        pblendw xmm12, xmm6, 0xC0
1637        pshufd  xmm12, xmm12, 0x78
1638        punpckhdq xmm5, xmm7
1639        punpckldq xmm6, xmm5
1640        pshufd  xmm7, xmm6, 0x1E
1641        movdqa  xmmword ptr [rsp+0x20], xmm13
1642        movdqa  xmmword ptr [rsp+0x40], xmm12
1643        movdqa  xmm5, xmmword ptr [rsp+0x30]
1644        movdqa  xmm13, xmmword ptr [rsp+0x50]
1645        pshufd  xmm6, xmm5, 0x0F
1646        shufps  xmm5, xmm13, 214
1647        pshufd  xmm12, xmm5, 0x39
1648        movdqa  xmm5, xmm14
1649        shufps  xmm5, xmm15, 250
1650        pblendw xmm6, xmm5, 0xCC
1651        movdqa  xmm5, xmm15
1652        punpcklqdq xmm5, xmm13
1653        pblendw xmm5, xmm14, 0xC0
1654        pshufd  xmm5, xmm5, 0x78
1655        punpckhdq xmm13, xmm15
1656        punpckldq xmm14, xmm13
1657        pshufd  xmm15, xmm14, 0x1E
1658        movdqa  xmm13, xmm6
1659        movdqa  xmm14, xmm5
1660        movdqa  xmm5, xmmword ptr [rsp+0x20]
1661        movdqa  xmm6, xmmword ptr [rsp+0x40]
1662        jmp     9b
16639:
1664        pxor    xmm0, xmm2
1665        pxor    xmm1, xmm3
1666        pxor    xmm8, xmm10
1667        pxor    xmm9, xmm11
1668        mov     eax, r13d
1669        cmp     rdx, r15
1670        jne     2b
1671        movups  xmmword ptr [rbx], xmm0
1672        movups  xmmword ptr [rbx+0x10], xmm1
1673        movups  xmmword ptr [rbx+0x20], xmm8
1674        movups  xmmword ptr [rbx+0x30], xmm9
1675        movdqa  xmm0, xmmword ptr [rsp+0x130]
1676        movdqa  xmm1, xmmword ptr [rsp+0x110]
1677        movdqa  xmm2, xmmword ptr [rsp+0x120]
1678        movdqu  xmm3, xmmword ptr [rsp+0x118]
1679        movdqu  xmm4, xmmword ptr [rsp+0x128]
1680        blendvps xmm1, xmm3, xmm0
1681        blendvps xmm2, xmm4, xmm0
1682        movdqa  xmmword ptr [rsp+0x110], xmm1
1683        movdqa  xmmword ptr [rsp+0x120], xmm2
1684        add     rdi, 16
1685        add     rbx, 64
1686        sub     rsi, 2
16873:
1688        test    esi, 0x1
1689        je      4b
1690        movups  xmm0, xmmword ptr [rcx]
1691        movups  xmm1, xmmword ptr [rcx+0x10]
1692        movd    xmm13, dword ptr [rsp+0x110]
1693        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1694        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1695        movaps  xmm14, xmmword ptr [ROT8+rip]
1696        movaps  xmm15, xmmword ptr [ROT16+rip]
1697        mov     r8, qword ptr [rdi]
1698        movzx   eax, byte ptr [rbp+0x80]
1699        or      eax, r13d
1700        xor     edx, edx
17012:
1702        mov     r14d, eax
1703        or      eax, r12d
1704        add     rdx, 64
1705        cmp     rdx, r15
1706        cmovne  eax, r14d
1707        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1708        movaps  xmm3, xmm13
1709        pinsrd  xmm3, eax, 3
1710        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1711        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1712        movaps  xmm8, xmm4
1713        shufps  xmm4, xmm5, 136
1714        shufps  xmm8, xmm5, 221
1715        movaps  xmm5, xmm8
1716        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1717        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1718        movaps  xmm8, xmm6
1719        shufps  xmm6, xmm7, 136
1720        pshufd  xmm6, xmm6, 0x93
1721        shufps  xmm8, xmm7, 221
1722        pshufd  xmm7, xmm8, 0x93
1723        mov     al, 7
17249:
1725        paddd   xmm0, xmm4
1726        paddd   xmm0, xmm1
1727        pxor    xmm3, xmm0
1728        pshufb  xmm3, xmm15
1729        paddd   xmm2, xmm3
1730        pxor    xmm1, xmm2
1731        movdqa  xmm11, xmm1
1732        pslld   xmm1, 20
1733        psrld   xmm11, 12
1734        por     xmm1, xmm11
1735        paddd   xmm0, xmm5
1736        paddd   xmm0, xmm1
1737        pxor    xmm3, xmm0
1738        pshufb  xmm3, xmm14
1739        paddd   xmm2, xmm3
1740        pxor    xmm1, xmm2
1741        movdqa  xmm11, xmm1
1742        pslld   xmm1, 25
1743        psrld   xmm11, 7
1744        por     xmm1, xmm11
1745        pshufd  xmm0, xmm0, 0x93
1746        pshufd  xmm3, xmm3, 0x4E
1747        pshufd  xmm2, xmm2, 0x39
1748        paddd   xmm0, xmm6
1749        paddd   xmm0, xmm1
1750        pxor    xmm3, xmm0
1751        pshufb  xmm3, xmm15
1752        paddd   xmm2, xmm3
1753        pxor    xmm1, xmm2
1754        movdqa  xmm11, xmm1
1755        pslld   xmm1, 20
1756        psrld   xmm11, 12
1757        por     xmm1, xmm11
1758        paddd   xmm0, xmm7
1759        paddd   xmm0, xmm1
1760        pxor    xmm3, xmm0
1761        pshufb  xmm3, xmm14
1762        paddd   xmm2, xmm3
1763        pxor    xmm1, xmm2
1764        movdqa  xmm11, xmm1
1765        pslld   xmm1, 25
1766        psrld   xmm11, 7
1767        por     xmm1, xmm11
1768        pshufd  xmm0, xmm0, 0x39
1769        pshufd  xmm3, xmm3, 0x4E
1770        pshufd  xmm2, xmm2, 0x93
1771        dec     al
1772        jz      9f
1773        movdqa  xmm8, xmm4
1774        shufps  xmm8, xmm5, 214
1775        pshufd  xmm9, xmm4, 0x0F
1776        pshufd  xmm4, xmm8, 0x39
1777        movdqa  xmm8, xmm6
1778        shufps  xmm8, xmm7, 250
1779        pblendw xmm9, xmm8, 0xCC
1780        movdqa  xmm8, xmm7
1781        punpcklqdq xmm8, xmm5
1782        pblendw xmm8, xmm6, 0xC0
1783        pshufd  xmm8, xmm8, 0x78
1784        punpckhdq xmm5, xmm7
1785        punpckldq xmm6, xmm5
1786        pshufd  xmm7, xmm6, 0x1E
1787        movdqa  xmm5, xmm9
1788        movdqa  xmm6, xmm8
1789        jmp     9b
17909:
1791        pxor    xmm0, xmm2
1792        pxor    xmm1, xmm3
1793        mov     eax, r13d
1794        cmp     rdx, r15
1795        jne     2b
1796        movups  xmmword ptr [rbx], xmm0
1797        movups  xmmword ptr [rbx+0x10], xmm1
1798        jmp     4b
1799
# -----------------------------------------------------------------------
# blake3_compress_in_place_sse41  (alias: _blake3_compress_in_place_sse41)
#
# Purpose: build a 4x4 dword state in xmm0-xmm3, run seven passes of the
#          add / xor / rotate round below over it while mixing in a
#          64-byte message, then fold the rows together and overwrite the
#          32-byte input state with the result.
#
# Inputs, exactly as this code reads them (the register choice and the
# xmm6+ spills match the Microsoft x64 convention):
#   rcx               -> 32 bytes; loaded as state rows 0-1 and
#                        overwritten with the result on exit
#   rdx               -> 64 bytes of message, read with unaligned loads
#   r8b               -> byte that becomes dword 2 of state row 3
#   r9                -> 64-bit value that becomes dwords 0-1 of row 3
#   byte at [rsp+0x28] on entry (first stack-passed argument slot)
#                     -> becomes dword 3 of state row 3
#   NOTE(review): by BLAKE3 convention these are presumably cv, block,
#   block_len, counter and flags -- confirm against the C prototype.
#
# Output:   32 bytes stored through rcx; no value is returned in rax.
# Clobbers: rax, r8, flags, xmm0-xmm5.  xmm6-xmm9, xmm11, xmm14 and xmm15
#           are spilled on entry and reloaded before ret; xmm10, xmm12
#           and xmm13 are never touched.
# Stack:    120-byte frame holding only the seven xmm spills; the movdqa
#           spills need rsp 16-byte aligned after the sub, which holds
#           when the caller had rsp 16-byte aligned at the call.
# Uses the BLAKE3_IV, ROT8 and ROT16 tables defined elsewhere in the file.
# -----------------------------------------------------------------------
1800.p2align 6
1801blake3_compress_in_place_sse41:
1802_blake3_compress_in_place_sse41:
1803        sub     rsp, 120                            # frame for the seven xmm spills below
1804        movdqa  xmmword ptr [rsp], xmm6
1805        movdqa  xmmword ptr [rsp+0x10], xmm7
1806        movdqa  xmmword ptr [rsp+0x20], xmm8
1807        movdqa  xmmword ptr [rsp+0x30], xmm9
1808        movdqa  xmmword ptr [rsp+0x40], xmm11
1809        movdqa  xmmword ptr [rsp+0x50], xmm14
1810        movdqa  xmmword ptr [rsp+0x60], xmm15
1811        movups  xmm0, xmmword ptr [rcx]             # state row 0 = first 16 bytes at rcx
1812        movups  xmm1, xmmword ptr [rcx+0x10]        # state row 1 = next 16 bytes at rcx
1813        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]   # state row 2 = 16 bytes from the BLAKE3_IV table
1814        movzx   eax, byte ptr [rsp+0xA0]            # stack byte argument: 0xA0 = 120 (frame) + 8 (return address) + 32 (shadow space)
1815        movzx   r8d, r8b                            # keep only the low byte of r8
1816        shl     rax, 32
1817        add     r8, rax                             # r8 = r8b in bits 0-31, stack byte in bits 32-63
1818        movq    xmm3, r9
1819        movq    xmm4, r8
1820        punpcklqdq xmm3, xmm4                       # state row 3 = { r9 low, r9 high, r8b, stack byte }
1821        movups  xmm4, xmmword ptr [rdx]             # message words 0-3
1822        movups  xmm5, xmmword ptr [rdx+0x10]        # message words 4-7
1823        movaps  xmm8, xmm4
1824        shufps  xmm4, xmm5, 136                     # xmm4 = words 0, 2, 4, 6 (even words of the first half)
1825        shufps  xmm8, xmm5, 221                     # xmm8 = words 1, 3, 5, 7 (odd words of the first half)
1826        movaps  xmm5, xmm8
1827        movups  xmm6, xmmword ptr [rdx+0x20]        # message words 8-11
1828        movups  xmm7, xmmword ptr [rdx+0x30]        # message words 12-15
1829        movaps  xmm8, xmm6
1830        shufps  xmm6, xmm7, 136                     # words 8, 10, 12, 14
1831        pshufd  xmm6, xmm6, 0x93                    # lanes rotated up by one: 14, 8, 10, 12
1832        shufps  xmm8, xmm7, 221                     # words 9, 11, 13, 15
1833        pshufd  xmm7, xmm8, 0x93                    # lanes rotated up by one: 15, 9, 11, 13
1834        movaps  xmm14, xmmword ptr [ROT8+rip]       # pshufb mask (presumably rotates each dword by 8 bits)
1835        movaps  xmm15, xmmword ptr [ROT16+rip]      # pshufb mask (presumably rotates each dword by 16 bits)
1836        mov     al, 7                               # pass counter: the loop body below runs seven times
18379:                                                  # local label 9: top of one pass
1838        paddd   xmm0, xmm4                          # first half: row0 += message (xmm4) + row1
1839        paddd   xmm0, xmm1
1840        pxor    xmm3, xmm0
1841        pshufb  xmm3, xmm15                         # row3 = shuffle(row3 ^ row0, ROT16 mask)
1842        paddd   xmm2, xmm3
1843        pxor    xmm1, xmm2
1844        movdqa  xmm11, xmm1                         # xmm11 is scratch for the shift-based rotates
1845        pslld   xmm1, 20
1846        psrld   xmm11, 12
1847        por     xmm1, xmm11                         # row1 = (row1 ^ row2) rotated right by 12 bits per dword
1848        paddd   xmm0, xmm5                          # row0 += message (xmm5) + row1
1849        paddd   xmm0, xmm1
1850        pxor    xmm3, xmm0
1851        pshufb  xmm3, xmm14                         # row3 = shuffle(row3 ^ row0, ROT8 mask)
1852        paddd   xmm2, xmm3
1853        pxor    xmm1, xmm2
1854        movdqa  xmm11, xmm1
1855        pslld   xmm1, 25
1856        psrld   xmm11, 7
1857        por     xmm1, xmm11                         # row1 = (row1 ^ row2) rotated right by 7 bits per dword
1858        pshufd  xmm0, xmm0, 0x93                    # rotate row0 lanes up by one,
1859        pshufd  xmm3, xmm3, 0x4E                    # swap the two halves of row3,
1860        pshufd  xmm2, xmm2, 0x39                    # rotate row2 lanes down by one (row1 stays put)
1861        paddd   xmm0, xmm6                          # second half: same sequence with messages xmm6 / xmm7
1862        paddd   xmm0, xmm1
1863        pxor    xmm3, xmm0
1864        pshufb  xmm3, xmm15
1865        paddd   xmm2, xmm3
1866        pxor    xmm1, xmm2
1867        movdqa  xmm11, xmm1
1868        pslld   xmm1, 20
1869        psrld   xmm11, 12
1870        por     xmm1, xmm11
1871        paddd   xmm0, xmm7
1872        paddd   xmm0, xmm1
1873        pxor    xmm3, xmm0
1874        pshufb  xmm3, xmm14
1875        paddd   xmm2, xmm3
1876        pxor    xmm1, xmm2
1877        movdqa  xmm11, xmm1
1878        pslld   xmm1, 25
1879        psrld   xmm11, 7
1880        por     xmm1, xmm11
1881        pshufd  xmm0, xmm0, 0x39                    # undo the lane rotations applied before the second half
1882        pshufd  xmm3, xmm3, 0x4E
1883        pshufd  xmm2, xmm2, 0x93
1884        dec     al
1885        jz      9f                                  # seventh pass done: skip the message reshuffle
1886        movdqa  xmm8, xmm4                          # reshuffle the message words in xmm4-xmm7 for the next pass;
1887        shufps  xmm8, xmm5, 214                     # xmm8 and xmm9 are scratch throughout this sequence
1888        pshufd  xmm9, xmm4, 0x0F
1889        pshufd  xmm4, xmm8, 0x39                    # new xmm4
1890        movdqa  xmm8, xmm6
1891        shufps  xmm8, xmm7, 250
1892        pblendw xmm9, xmm8, 0xCC                    # xmm9 = new xmm5 (moved into place below)
1893        movdqa  xmm8, xmm7
1894        punpcklqdq xmm8, xmm5
1895        pblendw xmm8, xmm6, 0xC0
1896        pshufd  xmm8, xmm8, 0x78                    # xmm8 = new xmm6 (moved into place below)
1897        punpckhdq xmm5, xmm7
1898        punpckldq xmm6, xmm5
1899        pshufd  xmm7, xmm6, 0x1E                    # new xmm7, built from the old xmm5/xmm6/xmm7
1900        movdqa  xmm5, xmm9
1901        movdqa  xmm6, xmm8
1902        jmp     9b                                  # next pass
19039:                                                  # local label 9: all passes finished
1904        pxor    xmm0, xmm2                          # fold: row0 ^= row2
1905        pxor    xmm1, xmm3                          # fold: row1 ^= row3
1906        movups  xmmword ptr [rcx], xmm0             # write the 32-byte result over the input state
1907        movups  xmmword ptr [rcx+0x10], xmm1
1908        movdqa  xmm6, xmmword ptr [rsp]             # reload the xmm registers spilled on entry
1909        movdqa  xmm7, xmmword ptr [rsp+0x10]
1910        movdqa  xmm8, xmmword ptr [rsp+0x20]
1911        movdqa  xmm9, xmmword ptr [rsp+0x30]
1912        movdqa  xmm11, xmmword ptr [rsp+0x40]
1913        movdqa  xmm14, xmmword ptr [rsp+0x50]
1914        movdqa  xmm15, xmmword ptr [rsp+0x60]
1915        add     rsp, 120
1916        ret
1917
1918
1919.p2align 6
# ----------------------------------------------------------------------
# blake3_compress_xof_sse41 / _blake3_compress_xof_sse41
#
# Compresses one 64-byte message block into a 4x4 dword state held in
# xmm0-xmm3 (one row per register), runs 7 rounds, and stores a 64-byte
# result. Uses pshufb and pblendw, so it needs SSSE3 and SSE4.1.
#
# ABI:   register and stack usage matches the Microsoft x64 convention.
# In:    rcx        -> 32 bytes, loaded as state rows 0 and 1 and read
#                      again at the end for the upper output half
#        rdx        -> 64 bytes of message (unaligned loads)
#        r8b        =  byte, zero-extended into row 3, lane 2
#        r9         =  64-bit value, placed in row 3, lanes 0 and 1
#        [rsp+0x28] =  byte (offset at entry), placed in row 3, lane 3
#        [rsp+0x30] =  pointer (offset at entry) to the 64-byte output
#        NOTE(review): these presumably correspond to cv, block,
#        block_len, counter, flags and out of the BLAKE3 C prototype;
#        confirm against the C header.
# Out:   64 bytes stored through the output pointer. rax is not set up
#        as a return value.
# Clobb: rax, r8, r10, xmm0-xmm5, flags. xmm6-xmm9, xmm11, xmm14 and
#        xmm15 are used but saved on entry and restored on exit.
# Stack: 120-byte frame. With rsp = 8 mod 16 at entry this leaves rsp
#        16-byte aligned, which the movdqa spills below depend on.
# ----------------------------------------------------------------------
1920_blake3_compress_xof_sse41:
1921blake3_compress_xof_sse41:
            # Reserve the frame and spill the nonvolatile xmm registers used below.
1922        sub     rsp, 120
1923        movdqa  xmmword ptr [rsp], xmm6
1924        movdqa  xmmword ptr [rsp+0x10], xmm7
1925        movdqa  xmmword ptr [rsp+0x20], xmm8
1926        movdqa  xmmword ptr [rsp+0x30], xmm9
1927        movdqa  xmmword ptr [rsp+0x40], xmm11
1928        movdqa  xmmword ptr [rsp+0x50], xmm14
1929        movdqa  xmmword ptr [rsp+0x60], xmm15
            # Rows 0-1 come from the caller buffer at rcx; row 2 is the BLAKE3_IV table.
1930        movups  xmm0, xmmword ptr [rcx]
1931        movups  xmm1, xmmword ptr [rcx+0x10]
1932        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
            # Build row 3 = { r9 low dword, r9 high dword, r8b, byte at [rsp+0xA0] }.
            # Offsets 0xA0 and 0xA8 are the entry offsets 0x28 and 0x30 plus the
            # 120-byte (0x78) frame; r10 receives the output pointer.
1933        movzx   eax, byte ptr [rsp+0xA0]
1934        movzx   r8d, r8b
1935        mov     r10, qword ptr [rsp+0xA8]
1936        shl     rax, 32
1937        add     r8, rax
1938        movq    xmm3, r9
1939        movq    xmm4, r8
1940        punpcklqdq xmm3, xmm4
            # Load the message and de-interleave it:
            #   xmm4 = even dwords of bytes 0-31, xmm5 = odd dwords of bytes 0-31,
            #   xmm6 / xmm7 = even / odd dwords of bytes 32-63, each then rotated
            #   up by one lane (pshufd 0x93). xmm8 is scratch.
1941        movups  xmm4, xmmword ptr [rdx]
1942        movups  xmm5, xmmword ptr [rdx+0x10]
1943        movaps  xmm8, xmm4
1944        shufps  xmm4, xmm5, 136
1945        shufps  xmm8, xmm5, 221
1946        movaps  xmm5, xmm8
1947        movups  xmm6, xmmword ptr [rdx+0x20]
1948        movups  xmm7, xmmword ptr [rdx+0x30]
1949        movaps  xmm8, xmm6
1950        shufps  xmm6, xmm7, 136
1951        pshufd  xmm6, xmm6, 0x93
1952        shufps  xmm8, xmm7, 221
1953        pshufd  xmm7, xmm8, 0x93
            # Keep the pshufb masks in registers: xmm14 rotates each dword right
            # by 8 bits, xmm15 by 16 bits.
1954        movaps  xmm14, xmmword ptr [ROT8+rip]
1955        movaps  xmm15, xmmword ptr [ROT16+rip]
            # al = rounds remaining. The old low byte of rax is dead here because
            # its value was already folded into r8 above.
1956        mov     al, 7
            # Top of the round loop.
19579:
            # Column step, first half:
            #   row0 += xmm4 + row1; row3 = ror(row3 ^ row0, 16);
            #   row2 += row3;        row1 = ror(row1 ^ row2, 12)   (xmm11 is scratch)
1958        paddd   xmm0, xmm4
1959        paddd   xmm0, xmm1
1960        pxor    xmm3, xmm0
1961        pshufb  xmm3, xmm15
1962        paddd   xmm2, xmm3
1963        pxor    xmm1, xmm2
1964        movdqa  xmm11, xmm1
1965        pslld   xmm1, 20
1966        psrld   xmm11, 12
1967        por     xmm1, xmm11
            # Column step, second half:
            #   row0 += xmm5 + row1; row3 = ror(row3 ^ row0, 8);
            #   row2 += row3;        row1 = ror(row1 ^ row2, 7)
1968        paddd   xmm0, xmm5
1969        paddd   xmm0, xmm1
1970        pxor    xmm3, xmm0
1971        pshufb  xmm3, xmm14
1972        paddd   xmm2, xmm3
1973        pxor    xmm1, xmm2
1974        movdqa  xmm11, xmm1
1975        pslld   xmm1, 25
1976        psrld   xmm11, 7
1977        por     xmm1, xmm11
            # Rotate the lanes of rows 0, 3 and 2 by one, two and three positions
            # (row 1 stays put) so the same lane-wise code now mixes diagonals.
1978        pshufd  xmm0, xmm0, 0x93
1979        pshufd  xmm3, xmm3, 0x4E
1980        pshufd  xmm2, xmm2, 0x39
            # Diagonal step, first half: same sequence as above with xmm6 as message input.
1981        paddd   xmm0, xmm6
1982        paddd   xmm0, xmm1
1983        pxor    xmm3, xmm0
1984        pshufb  xmm3, xmm15
1985        paddd   xmm2, xmm3
1986        pxor    xmm1, xmm2
1987        movdqa  xmm11, xmm1
1988        pslld   xmm1, 20
1989        psrld   xmm11, 12
1990        por     xmm1, xmm11
            # Diagonal step, second half: same sequence with xmm7 as message input.
1991        paddd   xmm0, xmm7
1992        paddd   xmm0, xmm1
1993        pxor    xmm3, xmm0
1994        pshufb  xmm3, xmm14
1995        paddd   xmm2, xmm3
1996        pxor    xmm1, xmm2
1997        movdqa  xmm11, xmm1
1998        pslld   xmm1, 25
1999        psrld   xmm11, 7
2000        por     xmm1, xmm11
            # Undo the lane rotation (inverse shuffles of the three above).
2001        pshufd  xmm0, xmm0, 0x39
2002        pshufd  xmm3, xmm3, 0x4E
2003        pshufd  xmm2, xmm2, 0x93
            # Leave after the seventh round; the message is not rearranged again.
2004        dec     al
2005        jz      9f
            # More rounds to go: rearrange the message vectors xmm4-xmm7 for the
            # next round, using xmm8 and xmm9 as temporaries.
            # NOTE(review): presumably the BLAKE3 message-word permutation; the
            # shuffle constants were not re-derived during this review.
2006        movdqa  xmm8, xmm4
2007        shufps  xmm8, xmm5, 214
2008        pshufd  xmm9, xmm4, 0x0F
2009        pshufd  xmm4, xmm8, 0x39
2010        movdqa  xmm8, xmm6
2011        shufps  xmm8, xmm7, 250
2012        pblendw xmm9, xmm8, 0xCC
2013        movdqa  xmm8, xmm7
2014        punpcklqdq xmm8, xmm5
2015        pblendw xmm8, xmm6, 0xC0
2016        pshufd  xmm8, xmm8, 0x78
2017        punpckhdq xmm5, xmm7
2018        punpckldq xmm6, xmm5
2019        pshufd  xmm7, xmm6, 0x1E
2020        movdqa  xmm5, xmm9
2021        movdqa  xmm6, xmm8
2022        jmp     9b
            # Finalization. The 32 input bytes at rcx are reloaded into xmm4/xmm5.
            #   out[ 0..31] = rows 0-1 xor rows 2-3
            #   out[32..63] = rows 2-3 xor the reloaded input bytes
20239:
2024        movdqu  xmm4, xmmword ptr [rcx]
2025        movdqu  xmm5, xmmword ptr [rcx+0x10]
2026        pxor    xmm0, xmm2
2027        pxor    xmm1, xmm3
2028        pxor    xmm2, xmm4
2029        pxor    xmm3, xmm5
2030        movups  xmmword ptr [r10], xmm0
2031        movups  xmmword ptr [r10+0x10], xmm1
2032        movups  xmmword ptr [r10+0x20], xmm2
2033        movups  xmmword ptr [r10+0x30], xmm3
            # Reload the spilled xmm registers and release the frame.
2034        movdqa  xmm6, xmmword ptr [rsp]
2035        movdqa  xmm7, xmmword ptr [rsp+0x10]
2036        movdqa  xmm8, xmmword ptr [rsp+0x20]
2037        movdqa  xmm9, xmmword ptr [rsp+0x30]
2038        movdqa  xmm11, xmmword ptr [rsp+0x40]
2039        movdqa  xmm14, xmmword ptr [rsp+0x50]
2040        movdqa  xmm15, xmmword ptr [rsp+0x60]
2041        add     rsp, 120
2042        ret
2043
2044
2045.section .rodata
# Read-only tables for the SSE4.1 routines in this file. The block starts
# on a 64-byte boundary and every table below is exactly 16 bytes long, so
# each label is 16-byte aligned, as the aligned loads and SSE memory
# operands that reference these tables require. Keep each table at 16
# bytes (or re-align) when editing.
2046.p2align  6
# Four 32-bit IV words; blake3_compress_xof_sse41 loads them as state row 2.
2047BLAKE3_IV:
2048        .long  0x6A09E667, 0xBB67AE85
2049        .long  0x3C6EF372, 0xA54FF53A
# pshufb mask: swaps the two 16-bit halves of every dword, which is a
# rotation by 16 bits.
2050ROT16:
2051        .byte  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
# pshufb mask: result byte i of every dword takes source byte i+1 (mod 4),
# which rotates each dword right by 8 bits.
2052ROT8:
2053        .byte  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
# Per-lane offsets 0..3. blake3_hash_many_sse41 ANDs them with a mask
# broadcast from the negated byte at its [rbp+0x70] stack argument (all
# ones when that byte is 1, zero when it is 0) and adds the result to the
# broadcast low counter dword.
2054ADD0:
2055        .long  0, 1, 2, 3
# Constant 4 in every lane. blake3_hash_many_sse41 masks it the same way
# and stores it at [rsp+0x150]; the later use is outside the code visible
# to this review (presumably the counter step per group of four inputs).
2056ADD1:
2057        .long  4, 4, 4, 4
# Each of the four IV words broadcast to all four lanes. The users of
# these tables are outside the code visible to this review.
2058BLAKE3_IV_0:
2059        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2060BLAKE3_IV_1:
2061        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2062BLAKE3_IV_2:
2063        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2064BLAKE3_IV_3:
2065        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
# The value 64 in every lane (the label suggests the block length in
# bytes). Its users are outside the code visible to this review.
2066BLAKE3_BLOCK_LEN:
2067        .long  64, 64, 64, 64
# Sign-bit mask for every lane. blake3_hash_many_sse41 XORs it into both
# operands before pcmpgtd so the signed compare orders them as unsigned
# values; the compare result is then subtracted from the high counter
# dword, which adds the carry from the low-dword addition.
2068CMP_MSB_MASK:
2069        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
2070