1#if defined(__x86_64__)
2
3#include "llvm_blake3_prefix.h"
4
/* On ELF targets for Linux or FreeBSD, emit an empty .note.GNU-stack
   section. By toolchain convention this marks the object as not needing
   an executable stack. */
5#if defined(__ELF__) && (defined(__linux__) || defined(__FreeBSD__))
6.section .note.GNU-stack,"",%progbits
7#endif
8
/* Include <cet.h> only for ELF builds with CET enabled (__CET__), and only
   when the preprocessor supports __has_include and reports the header as
   present. NOTE(review): <cet.h> is expected to supply _CET_ENDBR; confirm
   against the toolchain header. */
9#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
10#if __has_include(<cet.h>)
11#include <cet.h>
12#endif
13#endif
14
/* Fallback: if nothing above defined _CET_ENDBR, define it as empty so the
   _CET_ENDBR placed at each entry point expands to nothing. */
15#if !defined(_CET_ENDBR)
16#define _CET_ENDBR
17#endif
18
/* HIDDEN expands to the directive used below to restrict symbol visibility:
   .private_extern when __APPLE__ is defined, .hidden otherwise. */
19#ifdef __APPLE__
20#define HIDDEN .private_extern
21#else
22#define HIDDEN .hidden
23#endif
24
/* The rest of the file uses Intel operand order with unprefixed register
   names. */
25.intel_syntax noprefix
/* Apply HIDDEN visibility to each entry point. Every routine is listed under
   two spellings, with and without a leading underscore.
   NOTE(review): presumably this covers platforms that prepend '_' to C
   symbol names; confirm against the callers and llvm_blake3_prefix.h. */
26HIDDEN blake3_hash_many_sse2
27HIDDEN _blake3_hash_many_sse2
28HIDDEN blake3_compress_in_place_sse2
29HIDDEN _blake3_compress_in_place_sse2
30HIDDEN blake3_compress_xof_sse2
31HIDDEN _blake3_compress_xof_sse2
/* Export the same six names so other object files can reference them at
   link time. */
32.global blake3_hash_many_sse2
33.global _blake3_hash_many_sse2
34.global blake3_compress_in_place_sse2
35.global _blake3_compress_in_place_sse2
36.global blake3_compress_xof_sse2
37.global _blake3_compress_xof_sse2
/* Switch to the code section: the bare .text directive when __APPLE__ is
   defined, the explicit ".section .text" form otherwise. */
38#ifdef __APPLE__
39.text
40#else
41.section .text
42#endif
43        .p2align  6
44_blake3_hash_many_sse2:
45blake3_hash_many_sse2:
46        _CET_ENDBR
47        push    r15
48        push    r14
49        push    r13
50        push    r12
51        push    rbx
52        push    rbp
53        mov     rbp, rsp
54        sub     rsp, 360
55        and     rsp, 0xFFFFFFFFFFFFFFC0
56        neg     r9d
57        movd    xmm0, r9d
58        pshufd  xmm0, xmm0, 0x00
59        movdqa  xmmword ptr [rsp+0x130], xmm0
60        movdqa  xmm1, xmm0
61        pand    xmm1, xmmword ptr [ADD0+rip]
62        pand    xmm0, xmmword ptr [ADD1+rip]
63        movdqa  xmmword ptr [rsp+0x150], xmm0
64        movd    xmm0, r8d
65        pshufd  xmm0, xmm0, 0x00
66        paddd   xmm0, xmm1
67        movdqa  xmmword ptr [rsp+0x110], xmm0
68        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
69        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
70        pcmpgtd xmm1, xmm0
71        shr     r8, 32
72        movd    xmm2, r8d
73        pshufd  xmm2, xmm2, 0x00
74        psubd   xmm2, xmm1
75        movdqa  xmmword ptr [rsp+0x120], xmm2
76        mov     rbx, qword ptr [rbp+0x50]
77        mov     r15, rdx
78        shl     r15, 6
79        movzx   r13d, byte ptr [rbp+0x38]
80        movzx   r12d, byte ptr [rbp+0x48]
81        cmp     rsi, 4
82        jc      3f
832:
84        movdqu  xmm3, xmmword ptr [rcx]
85        pshufd  xmm0, xmm3, 0x00
86        pshufd  xmm1, xmm3, 0x55
87        pshufd  xmm2, xmm3, 0xAA
88        pshufd  xmm3, xmm3, 0xFF
89        movdqu  xmm7, xmmword ptr [rcx+0x10]
90        pshufd  xmm4, xmm7, 0x00
91        pshufd  xmm5, xmm7, 0x55
92        pshufd  xmm6, xmm7, 0xAA
93        pshufd  xmm7, xmm7, 0xFF
94        mov     r8, qword ptr [rdi]
95        mov     r9, qword ptr [rdi+0x8]
96        mov     r10, qword ptr [rdi+0x10]
97        mov     r11, qword ptr [rdi+0x18]
98        movzx   eax, byte ptr [rbp+0x40]
99        or      eax, r13d
100        xor     edx, edx
1019:
102        mov     r14d, eax
103        or      eax, r12d
104        add     rdx, 64
105        cmp     rdx, r15
106        cmovne  eax, r14d
107        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
108        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
109        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
110        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
111        movdqa  xmm12, xmm8
112        punpckldq xmm8, xmm9
113        punpckhdq xmm12, xmm9
114        movdqa  xmm14, xmm10
115        punpckldq xmm10, xmm11
116        punpckhdq xmm14, xmm11
117        movdqa  xmm9, xmm8
118        punpcklqdq xmm8, xmm10
119        punpckhqdq xmm9, xmm10
120        movdqa  xmm13, xmm12
121        punpcklqdq xmm12, xmm14
122        punpckhqdq xmm13, xmm14
123        movdqa  xmmword ptr [rsp], xmm8
124        movdqa  xmmword ptr [rsp+0x10], xmm9
125        movdqa  xmmword ptr [rsp+0x20], xmm12
126        movdqa  xmmword ptr [rsp+0x30], xmm13
127        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
128        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
129        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
130        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
131        movdqa  xmm12, xmm8
132        punpckldq xmm8, xmm9
133        punpckhdq xmm12, xmm9
134        movdqa  xmm14, xmm10
135        punpckldq xmm10, xmm11
136        punpckhdq xmm14, xmm11
137        movdqa  xmm9, xmm8
138        punpcklqdq xmm8, xmm10
139        punpckhqdq xmm9, xmm10
140        movdqa  xmm13, xmm12
141        punpcklqdq xmm12, xmm14
142        punpckhqdq xmm13, xmm14
143        movdqa  xmmword ptr [rsp+0x40], xmm8
144        movdqa  xmmword ptr [rsp+0x50], xmm9
145        movdqa  xmmword ptr [rsp+0x60], xmm12
146        movdqa  xmmword ptr [rsp+0x70], xmm13
147        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
148        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
149        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
150        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
151        movdqa  xmm12, xmm8
152        punpckldq xmm8, xmm9
153        punpckhdq xmm12, xmm9
154        movdqa  xmm14, xmm10
155        punpckldq xmm10, xmm11
156        punpckhdq xmm14, xmm11
157        movdqa  xmm9, xmm8
158        punpcklqdq xmm8, xmm10
159        punpckhqdq xmm9, xmm10
160        movdqa  xmm13, xmm12
161        punpcklqdq xmm12, xmm14
162        punpckhqdq xmm13, xmm14
163        movdqa  xmmword ptr [rsp+0x80], xmm8
164        movdqa  xmmword ptr [rsp+0x90], xmm9
165        movdqa  xmmword ptr [rsp+0xA0], xmm12
166        movdqa  xmmword ptr [rsp+0xB0], xmm13
167        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
168        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
169        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
170        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
171        movdqa  xmm12, xmm8
172        punpckldq xmm8, xmm9
173        punpckhdq xmm12, xmm9
174        movdqa  xmm14, xmm10
175        punpckldq xmm10, xmm11
176        punpckhdq xmm14, xmm11
177        movdqa  xmm9, xmm8
178        punpcklqdq xmm8, xmm10
179        punpckhqdq xmm9, xmm10
180        movdqa  xmm13, xmm12
181        punpcklqdq xmm12, xmm14
182        punpckhqdq xmm13, xmm14
183        movdqa  xmmword ptr [rsp+0xC0], xmm8
184        movdqa  xmmword ptr [rsp+0xD0], xmm9
185        movdqa  xmmword ptr [rsp+0xE0], xmm12
186        movdqa  xmmword ptr [rsp+0xF0], xmm13
187        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
188        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
189        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
190        movdqa  xmm12, xmmword ptr [rsp+0x110]
191        movdqa  xmm13, xmmword ptr [rsp+0x120]
192        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
193        movd    xmm15, eax
194        pshufd  xmm15, xmm15, 0x00
195        prefetcht0 [r8+rdx+0x80]
196        prefetcht0 [r9+rdx+0x80]
197        prefetcht0 [r10+rdx+0x80]
198        prefetcht0 [r11+rdx+0x80]
199        paddd   xmm0, xmmword ptr [rsp]
200        paddd   xmm1, xmmword ptr [rsp+0x20]
201        paddd   xmm2, xmmword ptr [rsp+0x40]
202        paddd   xmm3, xmmword ptr [rsp+0x60]
203        paddd   xmm0, xmm4
204        paddd   xmm1, xmm5
205        paddd   xmm2, xmm6
206        paddd   xmm3, xmm7
207        pxor    xmm12, xmm0
208        pxor    xmm13, xmm1
209        pxor    xmm14, xmm2
210        pxor    xmm15, xmm3
211        pshuflw xmm12, xmm12, 0xB1
212        pshufhw xmm12, xmm12, 0xB1
213        pshuflw xmm13, xmm13, 0xB1
214        pshufhw xmm13, xmm13, 0xB1
215        pshuflw xmm14, xmm14, 0xB1
216        pshufhw xmm14, xmm14, 0xB1
217        pshuflw xmm15, xmm15, 0xB1
218        pshufhw xmm15, xmm15, 0xB1
219        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
220        paddd   xmm8, xmm12
221        paddd   xmm9, xmm13
222        paddd   xmm10, xmm14
223        paddd   xmm11, xmm15
224        pxor    xmm4, xmm8
225        pxor    xmm5, xmm9
226        pxor    xmm6, xmm10
227        pxor    xmm7, xmm11
228        movdqa  xmmword ptr [rsp+0x100], xmm8
229        movdqa  xmm8, xmm4
230        psrld   xmm8, 12
231        pslld   xmm4, 20
232        por     xmm4, xmm8
233        movdqa  xmm8, xmm5
234        psrld   xmm8, 12
235        pslld   xmm5, 20
236        por     xmm5, xmm8
237        movdqa  xmm8, xmm6
238        psrld   xmm8, 12
239        pslld   xmm6, 20
240        por     xmm6, xmm8
241        movdqa  xmm8, xmm7
242        psrld   xmm8, 12
243        pslld   xmm7, 20
244        por     xmm7, xmm8
245        paddd   xmm0, xmmword ptr [rsp+0x10]
246        paddd   xmm1, xmmword ptr [rsp+0x30]
247        paddd   xmm2, xmmword ptr [rsp+0x50]
248        paddd   xmm3, xmmword ptr [rsp+0x70]
249        paddd   xmm0, xmm4
250        paddd   xmm1, xmm5
251        paddd   xmm2, xmm6
252        paddd   xmm3, xmm7
253        pxor    xmm12, xmm0
254        pxor    xmm13, xmm1
255        pxor    xmm14, xmm2
256        pxor    xmm15, xmm3
257        movdqa  xmm8, xmm12
258        psrld   xmm12, 8
259        pslld   xmm8, 24
260        pxor    xmm12, xmm8
261        movdqa  xmm8, xmm13
262        psrld   xmm13, 8
263        pslld   xmm8, 24
264        pxor    xmm13, xmm8
265        movdqa  xmm8, xmm14
266        psrld   xmm14, 8
267        pslld   xmm8, 24
268        pxor    xmm14, xmm8
269        movdqa  xmm8, xmm15
270        psrld   xmm15, 8
271        pslld   xmm8, 24
272        pxor    xmm15, xmm8
273        movdqa  xmm8, xmmword ptr [rsp+0x100]
274        paddd   xmm8, xmm12
275        paddd   xmm9, xmm13
276        paddd   xmm10, xmm14
277        paddd   xmm11, xmm15
278        pxor    xmm4, xmm8
279        pxor    xmm5, xmm9
280        pxor    xmm6, xmm10
281        pxor    xmm7, xmm11
282        movdqa  xmmword ptr [rsp+0x100], xmm8
283        movdqa  xmm8, xmm4
284        psrld   xmm8, 7
285        pslld   xmm4, 25
286        por     xmm4, xmm8
287        movdqa  xmm8, xmm5
288        psrld   xmm8, 7
289        pslld   xmm5, 25
290        por     xmm5, xmm8
291        movdqa  xmm8, xmm6
292        psrld   xmm8, 7
293        pslld   xmm6, 25
294        por     xmm6, xmm8
295        movdqa  xmm8, xmm7
296        psrld   xmm8, 7
297        pslld   xmm7, 25
298        por     xmm7, xmm8
299        paddd   xmm0, xmmword ptr [rsp+0x80]
300        paddd   xmm1, xmmword ptr [rsp+0xA0]
301        paddd   xmm2, xmmword ptr [rsp+0xC0]
302        paddd   xmm3, xmmword ptr [rsp+0xE0]
303        paddd   xmm0, xmm5
304        paddd   xmm1, xmm6
305        paddd   xmm2, xmm7
306        paddd   xmm3, xmm4
307        pxor    xmm15, xmm0
308        pxor    xmm12, xmm1
309        pxor    xmm13, xmm2
310        pxor    xmm14, xmm3
311        pshuflw xmm15, xmm15, 0xB1
312        pshufhw xmm15, xmm15, 0xB1
313        pshuflw xmm12, xmm12, 0xB1
314        pshufhw xmm12, xmm12, 0xB1
315        pshuflw xmm13, xmm13, 0xB1
316        pshufhw xmm13, xmm13, 0xB1
317        pshuflw xmm14, xmm14, 0xB1
318        pshufhw xmm14, xmm14, 0xB1
319        paddd   xmm10, xmm15
320        paddd   xmm11, xmm12
321        movdqa  xmm8, xmmword ptr [rsp+0x100]
322        paddd   xmm8, xmm13
323        paddd   xmm9, xmm14
324        pxor    xmm5, xmm10
325        pxor    xmm6, xmm11
326        pxor    xmm7, xmm8
327        pxor    xmm4, xmm9
328        movdqa  xmmword ptr [rsp+0x100], xmm8
329        movdqa  xmm8, xmm5
330        psrld   xmm8, 12
331        pslld   xmm5, 20
332        por     xmm5, xmm8
333        movdqa  xmm8, xmm6
334        psrld   xmm8, 12
335        pslld   xmm6, 20
336        por     xmm6, xmm8
337        movdqa  xmm8, xmm7
338        psrld   xmm8, 12
339        pslld   xmm7, 20
340        por     xmm7, xmm8
341        movdqa  xmm8, xmm4
342        psrld   xmm8, 12
343        pslld   xmm4, 20
344        por     xmm4, xmm8
345        paddd   xmm0, xmmword ptr [rsp+0x90]
346        paddd   xmm1, xmmword ptr [rsp+0xB0]
347        paddd   xmm2, xmmword ptr [rsp+0xD0]
348        paddd   xmm3, xmmword ptr [rsp+0xF0]
349        paddd   xmm0, xmm5
350        paddd   xmm1, xmm6
351        paddd   xmm2, xmm7
352        paddd   xmm3, xmm4
353        pxor    xmm15, xmm0
354        pxor    xmm12, xmm1
355        pxor    xmm13, xmm2
356        pxor    xmm14, xmm3
357        movdqa  xmm8, xmm15
358        psrld   xmm15, 8
359        pslld   xmm8, 24
360        pxor    xmm15, xmm8
361        movdqa  xmm8, xmm12
362        psrld   xmm12, 8
363        pslld   xmm8, 24
364        pxor    xmm12, xmm8
365        movdqa  xmm8, xmm13
366        psrld   xmm13, 8
367        pslld   xmm8, 24
368        pxor    xmm13, xmm8
369        movdqa  xmm8, xmm14
370        psrld   xmm14, 8
371        pslld   xmm8, 24
372        pxor    xmm14, xmm8
373        paddd   xmm10, xmm15
374        paddd   xmm11, xmm12
375        movdqa  xmm8, xmmword ptr [rsp+0x100]
376        paddd   xmm8, xmm13
377        paddd   xmm9, xmm14
378        pxor    xmm5, xmm10
379        pxor    xmm6, xmm11
380        pxor    xmm7, xmm8
381        pxor    xmm4, xmm9
382        movdqa  xmmword ptr [rsp+0x100], xmm8
383        movdqa  xmm8, xmm5
384        psrld   xmm8, 7
385        pslld   xmm5, 25
386        por     xmm5, xmm8
387        movdqa  xmm8, xmm6
388        psrld   xmm8, 7
389        pslld   xmm6, 25
390        por     xmm6, xmm8
391        movdqa  xmm8, xmm7
392        psrld   xmm8, 7
393        pslld   xmm7, 25
394        por     xmm7, xmm8
395        movdqa  xmm8, xmm4
396        psrld   xmm8, 7
397        pslld   xmm4, 25
398        por     xmm4, xmm8
399        paddd   xmm0, xmmword ptr [rsp+0x20]
400        paddd   xmm1, xmmword ptr [rsp+0x30]
401        paddd   xmm2, xmmword ptr [rsp+0x70]
402        paddd   xmm3, xmmword ptr [rsp+0x40]
403        paddd   xmm0, xmm4
404        paddd   xmm1, xmm5
405        paddd   xmm2, xmm6
406        paddd   xmm3, xmm7
407        pxor    xmm12, xmm0
408        pxor    xmm13, xmm1
409        pxor    xmm14, xmm2
410        pxor    xmm15, xmm3
411        pshuflw xmm12, xmm12, 0xB1
412        pshufhw xmm12, xmm12, 0xB1
413        pshuflw xmm13, xmm13, 0xB1
414        pshufhw xmm13, xmm13, 0xB1
415        pshuflw xmm14, xmm14, 0xB1
416        pshufhw xmm14, xmm14, 0xB1
417        pshuflw xmm15, xmm15, 0xB1
418        pshufhw xmm15, xmm15, 0xB1
419        movdqa  xmm8, xmmword ptr [rsp+0x100]
420        paddd   xmm8, xmm12
421        paddd   xmm9, xmm13
422        paddd   xmm10, xmm14
423        paddd   xmm11, xmm15
424        pxor    xmm4, xmm8
425        pxor    xmm5, xmm9
426        pxor    xmm6, xmm10
427        pxor    xmm7, xmm11
428        movdqa  xmmword ptr [rsp+0x100], xmm8
429        movdqa  xmm8, xmm4
430        psrld   xmm8, 12
431        pslld   xmm4, 20
432        por     xmm4, xmm8
433        movdqa  xmm8, xmm5
434        psrld   xmm8, 12
435        pslld   xmm5, 20
436        por     xmm5, xmm8
437        movdqa  xmm8, xmm6
438        psrld   xmm8, 12
439        pslld   xmm6, 20
440        por     xmm6, xmm8
441        movdqa  xmm8, xmm7
442        psrld   xmm8, 12
443        pslld   xmm7, 20
444        por     xmm7, xmm8
445        paddd   xmm0, xmmword ptr [rsp+0x60]
446        paddd   xmm1, xmmword ptr [rsp+0xA0]
447        paddd   xmm2, xmmword ptr [rsp]
448        paddd   xmm3, xmmword ptr [rsp+0xD0]
449        paddd   xmm0, xmm4
450        paddd   xmm1, xmm5
451        paddd   xmm2, xmm6
452        paddd   xmm3, xmm7
453        pxor    xmm12, xmm0
454        pxor    xmm13, xmm1
455        pxor    xmm14, xmm2
456        pxor    xmm15, xmm3
457        movdqa  xmm8, xmm12
458        psrld   xmm12, 8
459        pslld   xmm8, 24
460        pxor    xmm12, xmm8
461        movdqa  xmm8, xmm13
462        psrld   xmm13, 8
463        pslld   xmm8, 24
464        pxor    xmm13, xmm8
465        movdqa  xmm8, xmm14
466        psrld   xmm14, 8
467        pslld   xmm8, 24
468        pxor    xmm14, xmm8
469        movdqa  xmm8, xmm15
470        psrld   xmm15, 8
471        pslld   xmm8, 24
472        pxor    xmm15, xmm8
473        movdqa  xmm8, xmmword ptr [rsp+0x100]
474        paddd   xmm8, xmm12
475        paddd   xmm9, xmm13
476        paddd   xmm10, xmm14
477        paddd   xmm11, xmm15
478        pxor    xmm4, xmm8
479        pxor    xmm5, xmm9
480        pxor    xmm6, xmm10
481        pxor    xmm7, xmm11
482        movdqa  xmmword ptr [rsp+0x100], xmm8
483        movdqa  xmm8, xmm4
484        psrld   xmm8, 7
485        pslld   xmm4, 25
486        por     xmm4, xmm8
487        movdqa  xmm8, xmm5
488        psrld   xmm8, 7
489        pslld   xmm5, 25
490        por     xmm5, xmm8
491        movdqa  xmm8, xmm6
492        psrld   xmm8, 7
493        pslld   xmm6, 25
494        por     xmm6, xmm8
495        movdqa  xmm8, xmm7
496        psrld   xmm8, 7
497        pslld   xmm7, 25
498        por     xmm7, xmm8
499        paddd   xmm0, xmmword ptr [rsp+0x10]
500        paddd   xmm1, xmmword ptr [rsp+0xC0]
501        paddd   xmm2, xmmword ptr [rsp+0x90]
502        paddd   xmm3, xmmword ptr [rsp+0xF0]
503        paddd   xmm0, xmm5
504        paddd   xmm1, xmm6
505        paddd   xmm2, xmm7
506        paddd   xmm3, xmm4
507        pxor    xmm15, xmm0
508        pxor    xmm12, xmm1
509        pxor    xmm13, xmm2
510        pxor    xmm14, xmm3
511        pshuflw xmm15, xmm15, 0xB1
512        pshufhw xmm15, xmm15, 0xB1
513        pshuflw xmm12, xmm12, 0xB1
514        pshufhw xmm12, xmm12, 0xB1
515        pshuflw xmm13, xmm13, 0xB1
516        pshufhw xmm13, xmm13, 0xB1
517        pshuflw xmm14, xmm14, 0xB1
518        pshufhw xmm14, xmm14, 0xB1
519        paddd   xmm10, xmm15
520        paddd   xmm11, xmm12
521        movdqa  xmm8, xmmword ptr [rsp+0x100]
522        paddd   xmm8, xmm13
523        paddd   xmm9, xmm14
524        pxor    xmm5, xmm10
525        pxor    xmm6, xmm11
526        pxor    xmm7, xmm8
527        pxor    xmm4, xmm9
528        movdqa  xmmword ptr [rsp+0x100], xmm8
529        movdqa  xmm8, xmm5
530        psrld   xmm8, 12
531        pslld   xmm5, 20
532        por     xmm5, xmm8
533        movdqa  xmm8, xmm6
534        psrld   xmm8, 12
535        pslld   xmm6, 20
536        por     xmm6, xmm8
537        movdqa  xmm8, xmm7
538        psrld   xmm8, 12
539        pslld   xmm7, 20
540        por     xmm7, xmm8
541        movdqa  xmm8, xmm4
542        psrld   xmm8, 12
543        pslld   xmm4, 20
544        por     xmm4, xmm8
545        paddd   xmm0, xmmword ptr [rsp+0xB0]
546        paddd   xmm1, xmmword ptr [rsp+0x50]
547        paddd   xmm2, xmmword ptr [rsp+0xE0]
548        paddd   xmm3, xmmword ptr [rsp+0x80]
549        paddd   xmm0, xmm5
550        paddd   xmm1, xmm6
551        paddd   xmm2, xmm7
552        paddd   xmm3, xmm4
553        pxor    xmm15, xmm0
554        pxor    xmm12, xmm1
555        pxor    xmm13, xmm2
556        pxor    xmm14, xmm3
557        movdqa  xmm8, xmm15
558        psrld   xmm15, 8
559        pslld   xmm8, 24
560        pxor    xmm15, xmm8
561        movdqa  xmm8, xmm12
562        psrld   xmm12, 8
563        pslld   xmm8, 24
564        pxor    xmm12, xmm8
565        movdqa  xmm8, xmm13
566        psrld   xmm13, 8
567        pslld   xmm8, 24
568        pxor    xmm13, xmm8
569        movdqa  xmm8, xmm14
570        psrld   xmm14, 8
571        pslld   xmm8, 24
572        pxor    xmm14, xmm8
573        paddd   xmm10, xmm15
574        paddd   xmm11, xmm12
575        movdqa  xmm8, xmmword ptr [rsp+0x100]
576        paddd   xmm8, xmm13
577        paddd   xmm9, xmm14
578        pxor    xmm5, xmm10
579        pxor    xmm6, xmm11
580        pxor    xmm7, xmm8
581        pxor    xmm4, xmm9
582        movdqa  xmmword ptr [rsp+0x100], xmm8
583        movdqa  xmm8, xmm5
584        psrld   xmm8, 7
585        pslld   xmm5, 25
586        por     xmm5, xmm8
587        movdqa  xmm8, xmm6
588        psrld   xmm8, 7
589        pslld   xmm6, 25
590        por     xmm6, xmm8
591        movdqa  xmm8, xmm7
592        psrld   xmm8, 7
593        pslld   xmm7, 25
594        por     xmm7, xmm8
595        movdqa  xmm8, xmm4
596        psrld   xmm8, 7
597        pslld   xmm4, 25
598        por     xmm4, xmm8
599        paddd   xmm0, xmmword ptr [rsp+0x30]
600        paddd   xmm1, xmmword ptr [rsp+0xA0]
601        paddd   xmm2, xmmword ptr [rsp+0xD0]
602        paddd   xmm3, xmmword ptr [rsp+0x70]
603        paddd   xmm0, xmm4
604        paddd   xmm1, xmm5
605        paddd   xmm2, xmm6
606        paddd   xmm3, xmm7
607        pxor    xmm12, xmm0
608        pxor    xmm13, xmm1
609        pxor    xmm14, xmm2
610        pxor    xmm15, xmm3
611        pshuflw xmm12, xmm12, 0xB1
612        pshufhw xmm12, xmm12, 0xB1
613        pshuflw xmm13, xmm13, 0xB1
614        pshufhw xmm13, xmm13, 0xB1
615        pshuflw xmm14, xmm14, 0xB1
616        pshufhw xmm14, xmm14, 0xB1
617        pshuflw xmm15, xmm15, 0xB1
618        pshufhw xmm15, xmm15, 0xB1
619        movdqa  xmm8, xmmword ptr [rsp+0x100]
620        paddd   xmm8, xmm12
621        paddd   xmm9, xmm13
622        paddd   xmm10, xmm14
623        paddd   xmm11, xmm15
624        pxor    xmm4, xmm8
625        pxor    xmm5, xmm9
626        pxor    xmm6, xmm10
627        pxor    xmm7, xmm11
628        movdqa  xmmword ptr [rsp+0x100], xmm8
629        movdqa  xmm8, xmm4
630        psrld   xmm8, 12
631        pslld   xmm4, 20
632        por     xmm4, xmm8
633        movdqa  xmm8, xmm5
634        psrld   xmm8, 12
635        pslld   xmm5, 20
636        por     xmm5, xmm8
637        movdqa  xmm8, xmm6
638        psrld   xmm8, 12
639        pslld   xmm6, 20
640        por     xmm6, xmm8
641        movdqa  xmm8, xmm7
642        psrld   xmm8, 12
643        pslld   xmm7, 20
644        por     xmm7, xmm8
645        paddd   xmm0, xmmword ptr [rsp+0x40]
646        paddd   xmm1, xmmword ptr [rsp+0xC0]
647        paddd   xmm2, xmmword ptr [rsp+0x20]
648        paddd   xmm3, xmmword ptr [rsp+0xE0]
649        paddd   xmm0, xmm4
650        paddd   xmm1, xmm5
651        paddd   xmm2, xmm6
652        paddd   xmm3, xmm7
653        pxor    xmm12, xmm0
654        pxor    xmm13, xmm1
655        pxor    xmm14, xmm2
656        pxor    xmm15, xmm3
657        movdqa  xmm8, xmm12
658        psrld   xmm12, 8
659        pslld   xmm8, 24
660        pxor    xmm12, xmm8
661        movdqa  xmm8, xmm13
662        psrld   xmm13, 8
663        pslld   xmm8, 24
664        pxor    xmm13, xmm8
665        movdqa  xmm8, xmm14
666        psrld   xmm14, 8
667        pslld   xmm8, 24
668        pxor    xmm14, xmm8
669        movdqa  xmm8, xmm15
670        psrld   xmm15, 8
671        pslld   xmm8, 24
672        pxor    xmm15, xmm8
673        movdqa  xmm8, xmmword ptr [rsp+0x100]
674        paddd   xmm8, xmm12
675        paddd   xmm9, xmm13
676        paddd   xmm10, xmm14
677        paddd   xmm11, xmm15
678        pxor    xmm4, xmm8
679        pxor    xmm5, xmm9
680        pxor    xmm6, xmm10
681        pxor    xmm7, xmm11
682        movdqa  xmmword ptr [rsp+0x100], xmm8
683        movdqa  xmm8, xmm4
684        psrld   xmm8, 7
685        pslld   xmm4, 25
686        por     xmm4, xmm8
687        movdqa  xmm8, xmm5
688        psrld   xmm8, 7
689        pslld   xmm5, 25
690        por     xmm5, xmm8
691        movdqa  xmm8, xmm6
692        psrld   xmm8, 7
693        pslld   xmm6, 25
694        por     xmm6, xmm8
695        movdqa  xmm8, xmm7
696        psrld   xmm8, 7
697        pslld   xmm7, 25
698        por     xmm7, xmm8
699        paddd   xmm0, xmmword ptr [rsp+0x60]
700        paddd   xmm1, xmmword ptr [rsp+0x90]
701        paddd   xmm2, xmmword ptr [rsp+0xB0]
702        paddd   xmm3, xmmword ptr [rsp+0x80]
703        paddd   xmm0, xmm5
704        paddd   xmm1, xmm6
705        paddd   xmm2, xmm7
706        paddd   xmm3, xmm4
707        pxor    xmm15, xmm0
708        pxor    xmm12, xmm1
709        pxor    xmm13, xmm2
710        pxor    xmm14, xmm3
711        pshuflw xmm15, xmm15, 0xB1
712        pshufhw xmm15, xmm15, 0xB1
713        pshuflw xmm12, xmm12, 0xB1
714        pshufhw xmm12, xmm12, 0xB1
715        pshuflw xmm13, xmm13, 0xB1
716        pshufhw xmm13, xmm13, 0xB1
717        pshuflw xmm14, xmm14, 0xB1
718        pshufhw xmm14, xmm14, 0xB1
719        paddd   xmm10, xmm15
720        paddd   xmm11, xmm12
721        movdqa  xmm8, xmmword ptr [rsp+0x100]
722        paddd   xmm8, xmm13
723        paddd   xmm9, xmm14
724        pxor    xmm5, xmm10
725        pxor    xmm6, xmm11
726        pxor    xmm7, xmm8
727        pxor    xmm4, xmm9
728        movdqa  xmmword ptr [rsp+0x100], xmm8
729        movdqa  xmm8, xmm5
730        psrld   xmm8, 12
731        pslld   xmm5, 20
732        por     xmm5, xmm8
733        movdqa  xmm8, xmm6
734        psrld   xmm8, 12
735        pslld   xmm6, 20
736        por     xmm6, xmm8
737        movdqa  xmm8, xmm7
738        psrld   xmm8, 12
739        pslld   xmm7, 20
740        por     xmm7, xmm8
741        movdqa  xmm8, xmm4
742        psrld   xmm8, 12
743        pslld   xmm4, 20
744        por     xmm4, xmm8
745        paddd   xmm0, xmmword ptr [rsp+0x50]
746        paddd   xmm1, xmmword ptr [rsp]
747        paddd   xmm2, xmmword ptr [rsp+0xF0]
748        paddd   xmm3, xmmword ptr [rsp+0x10]
749        paddd   xmm0, xmm5
750        paddd   xmm1, xmm6
751        paddd   xmm2, xmm7
752        paddd   xmm3, xmm4
753        pxor    xmm15, xmm0
754        pxor    xmm12, xmm1
755        pxor    xmm13, xmm2
756        pxor    xmm14, xmm3
757        movdqa  xmm8, xmm15
758        psrld   xmm15, 8
759        pslld   xmm8, 24
760        pxor    xmm15, xmm8
761        movdqa  xmm8, xmm12
762        psrld   xmm12, 8
763        pslld   xmm8, 24
764        pxor    xmm12, xmm8
765        movdqa  xmm8, xmm13
766        psrld   xmm13, 8
767        pslld   xmm8, 24
768        pxor    xmm13, xmm8
769        movdqa  xmm8, xmm14
770        psrld   xmm14, 8
771        pslld   xmm8, 24
772        pxor    xmm14, xmm8
773        paddd   xmm10, xmm15
774        paddd   xmm11, xmm12
775        movdqa  xmm8, xmmword ptr [rsp+0x100]
776        paddd   xmm8, xmm13
777        paddd   xmm9, xmm14
778        pxor    xmm5, xmm10
779        pxor    xmm6, xmm11
780        pxor    xmm7, xmm8
781        pxor    xmm4, xmm9
782        movdqa  xmmword ptr [rsp+0x100], xmm8
783        movdqa  xmm8, xmm5
784        psrld   xmm8, 7
785        pslld   xmm5, 25
786        por     xmm5, xmm8
787        movdqa  xmm8, xmm6
788        psrld   xmm8, 7
789        pslld   xmm6, 25
790        por     xmm6, xmm8
791        movdqa  xmm8, xmm7
792        psrld   xmm8, 7
793        pslld   xmm7, 25
794        por     xmm7, xmm8
795        movdqa  xmm8, xmm4
796        psrld   xmm8, 7
797        pslld   xmm4, 25
798        por     xmm4, xmm8
799        paddd   xmm0, xmmword ptr [rsp+0xA0]
800        paddd   xmm1, xmmword ptr [rsp+0xC0]
801        paddd   xmm2, xmmword ptr [rsp+0xE0]
802        paddd   xmm3, xmmword ptr [rsp+0xD0]
803        paddd   xmm0, xmm4
804        paddd   xmm1, xmm5
805        paddd   xmm2, xmm6
806        paddd   xmm3, xmm7
807        pxor    xmm12, xmm0
808        pxor    xmm13, xmm1
809        pxor    xmm14, xmm2
810        pxor    xmm15, xmm3
811        pshuflw xmm12, xmm12, 0xB1
812        pshufhw xmm12, xmm12, 0xB1
813        pshuflw xmm13, xmm13, 0xB1
814        pshufhw xmm13, xmm13, 0xB1
815        pshuflw xmm14, xmm14, 0xB1
816        pshufhw xmm14, xmm14, 0xB1
817        pshuflw xmm15, xmm15, 0xB1
818        pshufhw xmm15, xmm15, 0xB1
819        movdqa  xmm8, xmmword ptr [rsp+0x100]
820        paddd   xmm8, xmm12
821        paddd   xmm9, xmm13
822        paddd   xmm10, xmm14
823        paddd   xmm11, xmm15
824        pxor    xmm4, xmm8
825        pxor    xmm5, xmm9
826        pxor    xmm6, xmm10
827        pxor    xmm7, xmm11
828        movdqa  xmmword ptr [rsp+0x100], xmm8
829        movdqa  xmm8, xmm4
830        psrld   xmm8, 12
831        pslld   xmm4, 20
832        por     xmm4, xmm8
833        movdqa  xmm8, xmm5
834        psrld   xmm8, 12
835        pslld   xmm5, 20
836        por     xmm5, xmm8
837        movdqa  xmm8, xmm6
838        psrld   xmm8, 12
839        pslld   xmm6, 20
840        por     xmm6, xmm8
841        movdqa  xmm8, xmm7
842        psrld   xmm8, 12
843        pslld   xmm7, 20
844        por     xmm7, xmm8
845        paddd   xmm0, xmmword ptr [rsp+0x70]
846        paddd   xmm1, xmmword ptr [rsp+0x90]
847        paddd   xmm2, xmmword ptr [rsp+0x30]
848        paddd   xmm3, xmmword ptr [rsp+0xF0]
849        paddd   xmm0, xmm4
850        paddd   xmm1, xmm5
851        paddd   xmm2, xmm6
852        paddd   xmm3, xmm7
853        pxor    xmm12, xmm0
854        pxor    xmm13, xmm1
855        pxor    xmm14, xmm2
856        pxor    xmm15, xmm3
857        movdqa  xmm8, xmm12
858        psrld   xmm12, 8
859        pslld   xmm8, 24
860        pxor    xmm12, xmm8
861        movdqa  xmm8, xmm13
862        psrld   xmm13, 8
863        pslld   xmm8, 24
864        pxor    xmm13, xmm8
865        movdqa  xmm8, xmm14
866        psrld   xmm14, 8
867        pslld   xmm8, 24
868        pxor    xmm14, xmm8
869        movdqa  xmm8, xmm15
870        psrld   xmm15, 8
871        pslld   xmm8, 24
872        pxor    xmm15, xmm8
873        movdqa  xmm8, xmmword ptr [rsp+0x100]
874        paddd   xmm8, xmm12
875        paddd   xmm9, xmm13
876        paddd   xmm10, xmm14
877        paddd   xmm11, xmm15
878        pxor    xmm4, xmm8
879        pxor    xmm5, xmm9
880        pxor    xmm6, xmm10
881        pxor    xmm7, xmm11
882        movdqa  xmmword ptr [rsp+0x100], xmm8
883        movdqa  xmm8, xmm4
884        psrld   xmm8, 7
885        pslld   xmm4, 25
886        por     xmm4, xmm8
887        movdqa  xmm8, xmm5
888        psrld   xmm8, 7
889        pslld   xmm5, 25
890        por     xmm5, xmm8
891        movdqa  xmm8, xmm6
892        psrld   xmm8, 7
893        pslld   xmm6, 25
894        por     xmm6, xmm8
895        movdqa  xmm8, xmm7
896        psrld   xmm8, 7
897        pslld   xmm7, 25
898        por     xmm7, xmm8
899        paddd   xmm0, xmmword ptr [rsp+0x40]
900        paddd   xmm1, xmmword ptr [rsp+0xB0]
901        paddd   xmm2, xmmword ptr [rsp+0x50]
902        paddd   xmm3, xmmword ptr [rsp+0x10]
903        paddd   xmm0, xmm5
904        paddd   xmm1, xmm6
905        paddd   xmm2, xmm7
906        paddd   xmm3, xmm4
907        pxor    xmm15, xmm0
908        pxor    xmm12, xmm1
909        pxor    xmm13, xmm2
910        pxor    xmm14, xmm3
911        pshuflw xmm15, xmm15, 0xB1
912        pshufhw xmm15, xmm15, 0xB1
913        pshuflw xmm12, xmm12, 0xB1
914        pshufhw xmm12, xmm12, 0xB1
915        pshuflw xmm13, xmm13, 0xB1
916        pshufhw xmm13, xmm13, 0xB1
917        pshuflw xmm14, xmm14, 0xB1
918        pshufhw xmm14, xmm14, 0xB1
919        paddd   xmm10, xmm15
920        paddd   xmm11, xmm12
921        movdqa  xmm8, xmmword ptr [rsp+0x100]
922        paddd   xmm8, xmm13
923        paddd   xmm9, xmm14
924        pxor    xmm5, xmm10
925        pxor    xmm6, xmm11
926        pxor    xmm7, xmm8
927        pxor    xmm4, xmm9
928        movdqa  xmmword ptr [rsp+0x100], xmm8
929        movdqa  xmm8, xmm5
930        psrld   xmm8, 12
931        pslld   xmm5, 20
932        por     xmm5, xmm8
933        movdqa  xmm8, xmm6
934        psrld   xmm8, 12
935        pslld   xmm6, 20
936        por     xmm6, xmm8
937        movdqa  xmm8, xmm7
938        psrld   xmm8, 12
939        pslld   xmm7, 20
940        por     xmm7, xmm8
941        movdqa  xmm8, xmm4
942        psrld   xmm8, 12
943        pslld   xmm4, 20
944        por     xmm4, xmm8
945        paddd   xmm0, xmmword ptr [rsp]
946        paddd   xmm1, xmmword ptr [rsp+0x20]
947        paddd   xmm2, xmmword ptr [rsp+0x80]
948        paddd   xmm3, xmmword ptr [rsp+0x60]
949        paddd   xmm0, xmm5
950        paddd   xmm1, xmm6
951        paddd   xmm2, xmm7
952        paddd   xmm3, xmm4
953        pxor    xmm15, xmm0
954        pxor    xmm12, xmm1
955        pxor    xmm13, xmm2
956        pxor    xmm14, xmm3
957        movdqa  xmm8, xmm15
958        psrld   xmm15, 8
959        pslld   xmm8, 24
960        pxor    xmm15, xmm8
961        movdqa  xmm8, xmm12
962        psrld   xmm12, 8
963        pslld   xmm8, 24
964        pxor    xmm12, xmm8
965        movdqa  xmm8, xmm13
966        psrld   xmm13, 8
967        pslld   xmm8, 24
968        pxor    xmm13, xmm8
969        movdqa  xmm8, xmm14
970        psrld   xmm14, 8
971        pslld   xmm8, 24
972        pxor    xmm14, xmm8
973        paddd   xmm10, xmm15
974        paddd   xmm11, xmm12
975        movdqa  xmm8, xmmword ptr [rsp+0x100]
976        paddd   xmm8, xmm13
977        paddd   xmm9, xmm14
978        pxor    xmm5, xmm10
979        pxor    xmm6, xmm11
980        pxor    xmm7, xmm8
981        pxor    xmm4, xmm9
982        movdqa  xmmword ptr [rsp+0x100], xmm8
983        movdqa  xmm8, xmm5
984        psrld   xmm8, 7
985        pslld   xmm5, 25
986        por     xmm5, xmm8
987        movdqa  xmm8, xmm6
988        psrld   xmm8, 7
989        pslld   xmm6, 25
990        por     xmm6, xmm8
991        movdqa  xmm8, xmm7
992        psrld   xmm8, 7
993        pslld   xmm7, 25
994        por     xmm7, xmm8
995        movdqa  xmm8, xmm4
996        psrld   xmm8, 7
997        pslld   xmm4, 25
998        por     xmm4, xmm8
999        paddd   xmm0, xmmword ptr [rsp+0xC0]
1000        paddd   xmm1, xmmword ptr [rsp+0x90]
1001        paddd   xmm2, xmmword ptr [rsp+0xF0]
1002        paddd   xmm3, xmmword ptr [rsp+0xE0]
1003        paddd   xmm0, xmm4
1004        paddd   xmm1, xmm5
1005        paddd   xmm2, xmm6
1006        paddd   xmm3, xmm7
1007        pxor    xmm12, xmm0
1008        pxor    xmm13, xmm1
1009        pxor    xmm14, xmm2
1010        pxor    xmm15, xmm3
1011        pshuflw xmm12, xmm12, 0xB1
1012        pshufhw xmm12, xmm12, 0xB1
1013        pshuflw xmm13, xmm13, 0xB1
1014        pshufhw xmm13, xmm13, 0xB1
1015        pshuflw xmm14, xmm14, 0xB1
1016        pshufhw xmm14, xmm14, 0xB1
1017        pshuflw xmm15, xmm15, 0xB1
1018        pshufhw xmm15, xmm15, 0xB1
1019        movdqa  xmm8, xmmword ptr [rsp+0x100]
1020        paddd   xmm8, xmm12
1021        paddd   xmm9, xmm13
1022        paddd   xmm10, xmm14
1023        paddd   xmm11, xmm15
1024        pxor    xmm4, xmm8
1025        pxor    xmm5, xmm9
1026        pxor    xmm6, xmm10
1027        pxor    xmm7, xmm11
1028        movdqa  xmmword ptr [rsp+0x100], xmm8
1029        movdqa  xmm8, xmm4
1030        psrld   xmm8, 12
1031        pslld   xmm4, 20
1032        por     xmm4, xmm8
1033        movdqa  xmm8, xmm5
1034        psrld   xmm8, 12
1035        pslld   xmm5, 20
1036        por     xmm5, xmm8
1037        movdqa  xmm8, xmm6
1038        psrld   xmm8, 12
1039        pslld   xmm6, 20
1040        por     xmm6, xmm8
1041        movdqa  xmm8, xmm7
1042        psrld   xmm8, 12
1043        pslld   xmm7, 20
1044        por     xmm7, xmm8
1045        paddd   xmm0, xmmword ptr [rsp+0xD0]
1046        paddd   xmm1, xmmword ptr [rsp+0xB0]
1047        paddd   xmm2, xmmword ptr [rsp+0xA0]
1048        paddd   xmm3, xmmword ptr [rsp+0x80]
1049        paddd   xmm0, xmm4
1050        paddd   xmm1, xmm5
1051        paddd   xmm2, xmm6
1052        paddd   xmm3, xmm7
1053        pxor    xmm12, xmm0
1054        pxor    xmm13, xmm1
1055        pxor    xmm14, xmm2
1056        pxor    xmm15, xmm3
1057        movdqa  xmm8, xmm12
1058        psrld   xmm12, 8
1059        pslld   xmm8, 24
1060        pxor    xmm12, xmm8
1061        movdqa  xmm8, xmm13
1062        psrld   xmm13, 8
1063        pslld   xmm8, 24
1064        pxor    xmm13, xmm8
1065        movdqa  xmm8, xmm14
1066        psrld   xmm14, 8
1067        pslld   xmm8, 24
1068        pxor    xmm14, xmm8
1069        movdqa  xmm8, xmm15
1070        psrld   xmm15, 8
1071        pslld   xmm8, 24
1072        pxor    xmm15, xmm8
1073        movdqa  xmm8, xmmword ptr [rsp+0x100]
1074        paddd   xmm8, xmm12
1075        paddd   xmm9, xmm13
1076        paddd   xmm10, xmm14
1077        paddd   xmm11, xmm15
1078        pxor    xmm4, xmm8
1079        pxor    xmm5, xmm9
1080        pxor    xmm6, xmm10
1081        pxor    xmm7, xmm11
1082        movdqa  xmmword ptr [rsp+0x100], xmm8
1083        movdqa  xmm8, xmm4
1084        psrld   xmm8, 7
1085        pslld   xmm4, 25
1086        por     xmm4, xmm8
1087        movdqa  xmm8, xmm5
1088        psrld   xmm8, 7
1089        pslld   xmm5, 25
1090        por     xmm5, xmm8
1091        movdqa  xmm8, xmm6
1092        psrld   xmm8, 7
1093        pslld   xmm6, 25
1094        por     xmm6, xmm8
1095        movdqa  xmm8, xmm7
1096        psrld   xmm8, 7
1097        pslld   xmm7, 25
1098        por     xmm7, xmm8
1099        paddd   xmm0, xmmword ptr [rsp+0x70]
1100        paddd   xmm1, xmmword ptr [rsp+0x50]
1101        paddd   xmm2, xmmword ptr [rsp]
1102        paddd   xmm3, xmmword ptr [rsp+0x60]
1103        paddd   xmm0, xmm5
1104        paddd   xmm1, xmm6
1105        paddd   xmm2, xmm7
1106        paddd   xmm3, xmm4
1107        pxor    xmm15, xmm0
1108        pxor    xmm12, xmm1
1109        pxor    xmm13, xmm2
1110        pxor    xmm14, xmm3
1111        pshuflw xmm15, xmm15, 0xB1
1112        pshufhw xmm15, xmm15, 0xB1
1113        pshuflw xmm12, xmm12, 0xB1
1114        pshufhw xmm12, xmm12, 0xB1
1115        pshuflw xmm13, xmm13, 0xB1
1116        pshufhw xmm13, xmm13, 0xB1
1117        pshuflw xmm14, xmm14, 0xB1
1118        pshufhw xmm14, xmm14, 0xB1
1119        paddd   xmm10, xmm15
1120        paddd   xmm11, xmm12
1121        movdqa  xmm8, xmmword ptr [rsp+0x100]
1122        paddd   xmm8, xmm13
1123        paddd   xmm9, xmm14
1124        pxor    xmm5, xmm10
1125        pxor    xmm6, xmm11
1126        pxor    xmm7, xmm8
1127        pxor    xmm4, xmm9
1128        movdqa  xmmword ptr [rsp+0x100], xmm8
1129        movdqa  xmm8, xmm5
1130        psrld   xmm8, 12
1131        pslld   xmm5, 20
1132        por     xmm5, xmm8
1133        movdqa  xmm8, xmm6
1134        psrld   xmm8, 12
1135        pslld   xmm6, 20
1136        por     xmm6, xmm8
1137        movdqa  xmm8, xmm7
1138        psrld   xmm8, 12
1139        pslld   xmm7, 20
1140        por     xmm7, xmm8
1141        movdqa  xmm8, xmm4
1142        psrld   xmm8, 12
1143        pslld   xmm4, 20
1144        por     xmm4, xmm8
1145        paddd   xmm0, xmmword ptr [rsp+0x20]
1146        paddd   xmm1, xmmword ptr [rsp+0x30]
1147        paddd   xmm2, xmmword ptr [rsp+0x10]
1148        paddd   xmm3, xmmword ptr [rsp+0x40]
1149        paddd   xmm0, xmm5
1150        paddd   xmm1, xmm6
1151        paddd   xmm2, xmm7
1152        paddd   xmm3, xmm4
1153        pxor    xmm15, xmm0
1154        pxor    xmm12, xmm1
1155        pxor    xmm13, xmm2
1156        pxor    xmm14, xmm3
1157        movdqa  xmm8, xmm15
1158        psrld   xmm15, 8
1159        pslld   xmm8, 24
1160        pxor    xmm15, xmm8
1161        movdqa  xmm8, xmm12
1162        psrld   xmm12, 8
1163        pslld   xmm8, 24
1164        pxor    xmm12, xmm8
1165        movdqa  xmm8, xmm13
1166        psrld   xmm13, 8
1167        pslld   xmm8, 24
1168        pxor    xmm13, xmm8
1169        movdqa  xmm8, xmm14
1170        psrld   xmm14, 8
1171        pslld   xmm8, 24
1172        pxor    xmm14, xmm8
1173        paddd   xmm10, xmm15
1174        paddd   xmm11, xmm12
1175        movdqa  xmm8, xmmword ptr [rsp+0x100]
1176        paddd   xmm8, xmm13
1177        paddd   xmm9, xmm14
1178        pxor    xmm5, xmm10
1179        pxor    xmm6, xmm11
1180        pxor    xmm7, xmm8
1181        pxor    xmm4, xmm9
1182        movdqa  xmmword ptr [rsp+0x100], xmm8
1183        movdqa  xmm8, xmm5
1184        psrld   xmm8, 7
1185        pslld   xmm5, 25
1186        por     xmm5, xmm8
1187        movdqa  xmm8, xmm6
1188        psrld   xmm8, 7
1189        pslld   xmm6, 25
1190        por     xmm6, xmm8
1191        movdqa  xmm8, xmm7
1192        psrld   xmm8, 7
1193        pslld   xmm7, 25
1194        por     xmm7, xmm8
1195        movdqa  xmm8, xmm4
1196        psrld   xmm8, 7
1197        pslld   xmm4, 25
1198        por     xmm4, xmm8
1199        paddd   xmm0, xmmword ptr [rsp+0x90]
1200        paddd   xmm1, xmmword ptr [rsp+0xB0]
1201        paddd   xmm2, xmmword ptr [rsp+0x80]
1202        paddd   xmm3, xmmword ptr [rsp+0xF0]
1203        paddd   xmm0, xmm4
1204        paddd   xmm1, xmm5
1205        paddd   xmm2, xmm6
1206        paddd   xmm3, xmm7
1207        pxor    xmm12, xmm0
1208        pxor    xmm13, xmm1
1209        pxor    xmm14, xmm2
1210        pxor    xmm15, xmm3
1211        pshuflw xmm12, xmm12, 0xB1
1212        pshufhw xmm12, xmm12, 0xB1
1213        pshuflw xmm13, xmm13, 0xB1
1214        pshufhw xmm13, xmm13, 0xB1
1215        pshuflw xmm14, xmm14, 0xB1
1216        pshufhw xmm14, xmm14, 0xB1
1217        pshuflw xmm15, xmm15, 0xB1
1218        pshufhw xmm15, xmm15, 0xB1
1219        movdqa  xmm8, xmmword ptr [rsp+0x100]
1220        paddd   xmm8, xmm12
1221        paddd   xmm9, xmm13
1222        paddd   xmm10, xmm14
1223        paddd   xmm11, xmm15
1224        pxor    xmm4, xmm8
1225        pxor    xmm5, xmm9
1226        pxor    xmm6, xmm10
1227        pxor    xmm7, xmm11
1228        movdqa  xmmword ptr [rsp+0x100], xmm8
1229        movdqa  xmm8, xmm4
1230        psrld   xmm8, 12
1231        pslld   xmm4, 20
1232        por     xmm4, xmm8
1233        movdqa  xmm8, xmm5
1234        psrld   xmm8, 12
1235        pslld   xmm5, 20
1236        por     xmm5, xmm8
1237        movdqa  xmm8, xmm6
1238        psrld   xmm8, 12
1239        pslld   xmm6, 20
1240        por     xmm6, xmm8
1241        movdqa  xmm8, xmm7
1242        psrld   xmm8, 12
1243        pslld   xmm7, 20
1244        por     xmm7, xmm8
1245        paddd   xmm0, xmmword ptr [rsp+0xE0]
1246        paddd   xmm1, xmmword ptr [rsp+0x50]
1247        paddd   xmm2, xmmword ptr [rsp+0xC0]
1248        paddd   xmm3, xmmword ptr [rsp+0x10]
1249        paddd   xmm0, xmm4
1250        paddd   xmm1, xmm5
1251        paddd   xmm2, xmm6
1252        paddd   xmm3, xmm7
1253        pxor    xmm12, xmm0
1254        pxor    xmm13, xmm1
1255        pxor    xmm14, xmm2
1256        pxor    xmm15, xmm3
1257        movdqa  xmm8, xmm12
1258        psrld   xmm12, 8
1259        pslld   xmm8, 24
1260        pxor    xmm12, xmm8
1261        movdqa  xmm8, xmm13
1262        psrld   xmm13, 8
1263        pslld   xmm8, 24
1264        pxor    xmm13, xmm8
1265        movdqa  xmm8, xmm14
1266        psrld   xmm14, 8
1267        pslld   xmm8, 24
1268        pxor    xmm14, xmm8
1269        movdqa  xmm8, xmm15
1270        psrld   xmm15, 8
1271        pslld   xmm8, 24
1272        pxor    xmm15, xmm8
1273        movdqa  xmm8, xmmword ptr [rsp+0x100]
1274        paddd   xmm8, xmm12
1275        paddd   xmm9, xmm13
1276        paddd   xmm10, xmm14
1277        paddd   xmm11, xmm15
1278        pxor    xmm4, xmm8
1279        pxor    xmm5, xmm9
1280        pxor    xmm6, xmm10
1281        pxor    xmm7, xmm11
1282        movdqa  xmmword ptr [rsp+0x100], xmm8
1283        movdqa  xmm8, xmm4
1284        psrld   xmm8, 7
1285        pslld   xmm4, 25
1286        por     xmm4, xmm8
1287        movdqa  xmm8, xmm5
1288        psrld   xmm8, 7
1289        pslld   xmm5, 25
1290        por     xmm5, xmm8
1291        movdqa  xmm8, xmm6
1292        psrld   xmm8, 7
1293        pslld   xmm6, 25
1294        por     xmm6, xmm8
1295        movdqa  xmm8, xmm7
1296        psrld   xmm8, 7
1297        pslld   xmm7, 25
1298        por     xmm7, xmm8
1299        paddd   xmm0, xmmword ptr [rsp+0xD0]
1300        paddd   xmm1, xmmword ptr [rsp]
1301        paddd   xmm2, xmmword ptr [rsp+0x20]
1302        paddd   xmm3, xmmword ptr [rsp+0x40]
1303        paddd   xmm0, xmm5
1304        paddd   xmm1, xmm6
1305        paddd   xmm2, xmm7
1306        paddd   xmm3, xmm4
1307        pxor    xmm15, xmm0
1308        pxor    xmm12, xmm1
1309        pxor    xmm13, xmm2
1310        pxor    xmm14, xmm3
1311        pshuflw xmm15, xmm15, 0xB1
1312        pshufhw xmm15, xmm15, 0xB1
1313        pshuflw xmm12, xmm12, 0xB1
1314        pshufhw xmm12, xmm12, 0xB1
1315        pshuflw xmm13, xmm13, 0xB1
1316        pshufhw xmm13, xmm13, 0xB1
1317        pshuflw xmm14, xmm14, 0xB1
1318        pshufhw xmm14, xmm14, 0xB1
1319        paddd   xmm10, xmm15
1320        paddd   xmm11, xmm12
1321        movdqa  xmm8, xmmword ptr [rsp+0x100]
1322        paddd   xmm8, xmm13
1323        paddd   xmm9, xmm14
1324        pxor    xmm5, xmm10
1325        pxor    xmm6, xmm11
1326        pxor    xmm7, xmm8
1327        pxor    xmm4, xmm9
1328        movdqa  xmmword ptr [rsp+0x100], xmm8
1329        movdqa  xmm8, xmm5
1330        psrld   xmm8, 12
1331        pslld   xmm5, 20
1332        por     xmm5, xmm8
1333        movdqa  xmm8, xmm6
1334        psrld   xmm8, 12
1335        pslld   xmm6, 20
1336        por     xmm6, xmm8
1337        movdqa  xmm8, xmm7
1338        psrld   xmm8, 12
1339        pslld   xmm7, 20
1340        por     xmm7, xmm8
1341        movdqa  xmm8, xmm4
1342        psrld   xmm8, 12
1343        pslld   xmm4, 20
1344        por     xmm4, xmm8
1345        paddd   xmm0, xmmword ptr [rsp+0x30]
1346        paddd   xmm1, xmmword ptr [rsp+0xA0]
1347        paddd   xmm2, xmmword ptr [rsp+0x60]
1348        paddd   xmm3, xmmword ptr [rsp+0x70]
1349        paddd   xmm0, xmm5
1350        paddd   xmm1, xmm6
1351        paddd   xmm2, xmm7
1352        paddd   xmm3, xmm4
1353        pxor    xmm15, xmm0
1354        pxor    xmm12, xmm1
1355        pxor    xmm13, xmm2
1356        pxor    xmm14, xmm3
1357        movdqa  xmm8, xmm15
1358        psrld   xmm15, 8
1359        pslld   xmm8, 24
1360        pxor    xmm15, xmm8
1361        movdqa  xmm8, xmm12
1362        psrld   xmm12, 8
1363        pslld   xmm8, 24
1364        pxor    xmm12, xmm8
1365        movdqa  xmm8, xmm13
1366        psrld   xmm13, 8
1367        pslld   xmm8, 24
1368        pxor    xmm13, xmm8
1369        movdqa  xmm8, xmm14
1370        psrld   xmm14, 8
1371        pslld   xmm8, 24
1372        pxor    xmm14, xmm8
1373        paddd   xmm10, xmm15
1374        paddd   xmm11, xmm12
1375        movdqa  xmm8, xmmword ptr [rsp+0x100]
1376        paddd   xmm8, xmm13
1377        paddd   xmm9, xmm14
1378        pxor    xmm5, xmm10
1379        pxor    xmm6, xmm11
1380        pxor    xmm7, xmm8
1381        pxor    xmm4, xmm9
1382        movdqa  xmmword ptr [rsp+0x100], xmm8
1383        movdqa  xmm8, xmm5
1384        psrld   xmm8, 7
1385        pslld   xmm5, 25
1386        por     xmm5, xmm8
1387        movdqa  xmm8, xmm6
1388        psrld   xmm8, 7
1389        pslld   xmm6, 25
1390        por     xmm6, xmm8
1391        movdqa  xmm8, xmm7
1392        psrld   xmm8, 7
1393        pslld   xmm7, 25
1394        por     xmm7, xmm8
1395        movdqa  xmm8, xmm4
1396        psrld   xmm8, 7
1397        pslld   xmm4, 25
1398        por     xmm4, xmm8
1399        paddd   xmm0, xmmword ptr [rsp+0xB0]
1400        paddd   xmm1, xmmword ptr [rsp+0x50]
1401        paddd   xmm2, xmmword ptr [rsp+0x10]
1402        paddd   xmm3, xmmword ptr [rsp+0x80]
1403        paddd   xmm0, xmm4
1404        paddd   xmm1, xmm5
1405        paddd   xmm2, xmm6
1406        paddd   xmm3, xmm7
1407        pxor    xmm12, xmm0
1408        pxor    xmm13, xmm1
1409        pxor    xmm14, xmm2
1410        pxor    xmm15, xmm3
1411        pshuflw xmm12, xmm12, 0xB1
1412        pshufhw xmm12, xmm12, 0xB1
1413        pshuflw xmm13, xmm13, 0xB1
1414        pshufhw xmm13, xmm13, 0xB1
1415        pshuflw xmm14, xmm14, 0xB1
1416        pshufhw xmm14, xmm14, 0xB1
1417        pshuflw xmm15, xmm15, 0xB1
1418        pshufhw xmm15, xmm15, 0xB1
1419        movdqa  xmm8, xmmword ptr [rsp+0x100]
1420        paddd   xmm8, xmm12
1421        paddd   xmm9, xmm13
1422        paddd   xmm10, xmm14
1423        paddd   xmm11, xmm15
1424        pxor    xmm4, xmm8
1425        pxor    xmm5, xmm9
1426        pxor    xmm6, xmm10
1427        pxor    xmm7, xmm11
1428        movdqa  xmmword ptr [rsp+0x100], xmm8
1429        movdqa  xmm8, xmm4
1430        psrld   xmm8, 12
1431        pslld   xmm4, 20
1432        por     xmm4, xmm8
1433        movdqa  xmm8, xmm5
1434        psrld   xmm8, 12
1435        pslld   xmm5, 20
1436        por     xmm5, xmm8
1437        movdqa  xmm8, xmm6
1438        psrld   xmm8, 12
1439        pslld   xmm6, 20
1440        por     xmm6, xmm8
1441        movdqa  xmm8, xmm7
1442        psrld   xmm8, 12
1443        pslld   xmm7, 20
1444        por     xmm7, xmm8
1445        paddd   xmm0, xmmword ptr [rsp+0xF0]
1446        paddd   xmm1, xmmword ptr [rsp]
1447        paddd   xmm2, xmmword ptr [rsp+0x90]
1448        paddd   xmm3, xmmword ptr [rsp+0x60]
1449        paddd   xmm0, xmm4
1450        paddd   xmm1, xmm5
1451        paddd   xmm2, xmm6
1452        paddd   xmm3, xmm7
1453        pxor    xmm12, xmm0
1454        pxor    xmm13, xmm1
1455        pxor    xmm14, xmm2
1456        pxor    xmm15, xmm3
1457        movdqa  xmm8, xmm12
1458        psrld   xmm12, 8
1459        pslld   xmm8, 24
1460        pxor    xmm12, xmm8
1461        movdqa  xmm8, xmm13
1462        psrld   xmm13, 8
1463        pslld   xmm8, 24
1464        pxor    xmm13, xmm8
1465        movdqa  xmm8, xmm14
1466        psrld   xmm14, 8
1467        pslld   xmm8, 24
1468        pxor    xmm14, xmm8
1469        movdqa  xmm8, xmm15
1470        psrld   xmm15, 8
1471        pslld   xmm8, 24
1472        pxor    xmm15, xmm8
1473        movdqa  xmm8, xmmword ptr [rsp+0x100]
1474        paddd   xmm8, xmm12
1475        paddd   xmm9, xmm13
1476        paddd   xmm10, xmm14
1477        paddd   xmm11, xmm15
1478        pxor    xmm4, xmm8
1479        pxor    xmm5, xmm9
1480        pxor    xmm6, xmm10
1481        pxor    xmm7, xmm11
1482        movdqa  xmmword ptr [rsp+0x100], xmm8
1483        movdqa  xmm8, xmm4
1484        psrld   xmm8, 7
1485        pslld   xmm4, 25
1486        por     xmm4, xmm8
1487        movdqa  xmm8, xmm5
1488        psrld   xmm8, 7
1489        pslld   xmm5, 25
1490        por     xmm5, xmm8
1491        movdqa  xmm8, xmm6
1492        psrld   xmm8, 7
1493        pslld   xmm6, 25
1494        por     xmm6, xmm8
1495        movdqa  xmm8, xmm7
1496        psrld   xmm8, 7
1497        pslld   xmm7, 25
1498        por     xmm7, xmm8
1499        paddd   xmm0, xmmword ptr [rsp+0xE0]
1500        paddd   xmm1, xmmword ptr [rsp+0x20]
1501        paddd   xmm2, xmmword ptr [rsp+0x30]
1502        paddd   xmm3, xmmword ptr [rsp+0x70]
1503        paddd   xmm0, xmm5
1504        paddd   xmm1, xmm6
1505        paddd   xmm2, xmm7
1506        paddd   xmm3, xmm4
1507        pxor    xmm15, xmm0
1508        pxor    xmm12, xmm1
1509        pxor    xmm13, xmm2
1510        pxor    xmm14, xmm3
1511        pshuflw xmm15, xmm15, 0xB1
1512        pshufhw xmm15, xmm15, 0xB1
1513        pshuflw xmm12, xmm12, 0xB1
1514        pshufhw xmm12, xmm12, 0xB1
1515        pshuflw xmm13, xmm13, 0xB1
1516        pshufhw xmm13, xmm13, 0xB1
1517        pshuflw xmm14, xmm14, 0xB1
1518        pshufhw xmm14, xmm14, 0xB1
1519        paddd   xmm10, xmm15
1520        paddd   xmm11, xmm12
1521        movdqa  xmm8, xmmword ptr [rsp+0x100]
1522        paddd   xmm8, xmm13
1523        paddd   xmm9, xmm14
1524        pxor    xmm5, xmm10
1525        pxor    xmm6, xmm11
1526        pxor    xmm7, xmm8
1527        pxor    xmm4, xmm9
1528        movdqa  xmmword ptr [rsp+0x100], xmm8
1529        movdqa  xmm8, xmm5
1530        psrld   xmm8, 12
1531        pslld   xmm5, 20
1532        por     xmm5, xmm8
1533        movdqa  xmm8, xmm6
1534        psrld   xmm8, 12
1535        pslld   xmm6, 20
1536        por     xmm6, xmm8
1537        movdqa  xmm8, xmm7
1538        psrld   xmm8, 12
1539        pslld   xmm7, 20
1540        por     xmm7, xmm8
1541        movdqa  xmm8, xmm4
1542        psrld   xmm8, 12
1543        pslld   xmm4, 20
1544        por     xmm4, xmm8
1545        paddd   xmm0, xmmword ptr [rsp+0xA0]
1546        paddd   xmm1, xmmword ptr [rsp+0xC0]
1547        paddd   xmm2, xmmword ptr [rsp+0x40]
1548        paddd   xmm3, xmmword ptr [rsp+0xD0]
1549        paddd   xmm0, xmm5
1550        paddd   xmm1, xmm6
1551        paddd   xmm2, xmm7
1552        paddd   xmm3, xmm4
1553        pxor    xmm15, xmm0
1554        pxor    xmm12, xmm1
1555        pxor    xmm13, xmm2
1556        pxor    xmm14, xmm3
1557        movdqa  xmm8, xmm15
1558        psrld   xmm15, 8
1559        pslld   xmm8, 24
1560        pxor    xmm15, xmm8
1561        movdqa  xmm8, xmm12
1562        psrld   xmm12, 8
1563        pslld   xmm8, 24
1564        pxor    xmm12, xmm8
1565        movdqa  xmm8, xmm13
1566        psrld   xmm13, 8
1567        pslld   xmm8, 24
1568        pxor    xmm13, xmm8
1569        movdqa  xmm8, xmm14
1570        psrld   xmm14, 8
1571        pslld   xmm8, 24
1572        pxor    xmm14, xmm8
1573        paddd   xmm10, xmm15
1574        paddd   xmm11, xmm12
1575        movdqa  xmm8, xmmword ptr [rsp+0x100]
1576        paddd   xmm8, xmm13
1577        paddd   xmm9, xmm14
1578        pxor    xmm5, xmm10
1579        pxor    xmm6, xmm11
1580        pxor    xmm7, xmm8
1581        pxor    xmm4, xmm9
1582        pxor    xmm0, xmm8
1583        pxor    xmm1, xmm9
1584        pxor    xmm2, xmm10
1585        pxor    xmm3, xmm11
1586        movdqa  xmm8, xmm5
1587        psrld   xmm8, 7
1588        pslld   xmm5, 25
1589        por     xmm5, xmm8
1590        movdqa  xmm8, xmm6
1591        psrld   xmm8, 7
1592        pslld   xmm6, 25
1593        por     xmm6, xmm8
1594        movdqa  xmm8, xmm7
1595        psrld   xmm8, 7
1596        pslld   xmm7, 25
1597        por     xmm7, xmm8
1598        movdqa  xmm8, xmm4
1599        psrld   xmm8, 7
1600        pslld   xmm4, 25
1601        por     xmm4, xmm8
1602        pxor    xmm4, xmm12
1603        pxor    xmm5, xmm13
1604        pxor    xmm6, xmm14
1605        pxor    xmm7, xmm15
1606        mov     eax, r13d
1607        jne     9b
/* Reached when the preceding jne 9b is not taken: xmm0..xmm7 hold eight result vectors, one dword per lane. */
/* Transpose xmm0..xmm3 as a 4x4 dword matrix so that each register ends up holding one lane's four dwords. */
1608        movdqa  xmm9, xmm0
1609        punpckldq xmm0, xmm1
1610        punpckhdq xmm9, xmm1
1611        movdqa  xmm11, xmm2
1612        punpckldq xmm2, xmm3
1613        punpckhdq xmm11, xmm3
1614        movdqa  xmm1, xmm0
1615        punpcklqdq xmm0, xmm2
1616        punpckhqdq xmm1, xmm2
1617        movdqa  xmm3, xmm9
1618        punpcklqdq xmm9, xmm11
1619        punpckhqdq xmm3, xmm11
/* Lane i's first 16 output bytes are written to rbx + 0x20*i (unaligned stores). */
1620        movdqu  xmmword ptr [rbx], xmm0
1621        movdqu  xmmword ptr [rbx+0x20], xmm1
1622        movdqu  xmmword ptr [rbx+0x40], xmm9
1623        movdqu  xmmword ptr [rbx+0x60], xmm3
/* Same 4x4 transpose for xmm4..xmm7. */
1624        movdqa  xmm9, xmm4
1625        punpckldq xmm4, xmm5
1626        punpckhdq xmm9, xmm5
1627        movdqa  xmm11, xmm6
1628        punpckldq xmm6, xmm7
1629        punpckhdq xmm11, xmm7
1630        movdqa  xmm5, xmm4
1631        punpcklqdq xmm4, xmm6
1632        punpckhqdq xmm5, xmm6
1633        movdqa  xmm7, xmm9
1634        punpcklqdq xmm9, xmm11
1635        punpckhqdq xmm7, xmm11
/* Lane i's second 16 output bytes are written to rbx + 0x20*i + 0x10. */
1636        movdqu  xmmword ptr [rbx+0x10], xmm4
1637        movdqu  xmmword ptr [rbx+0x30], xmm5
1638        movdqu  xmmword ptr [rbx+0x50], xmm9
1639        movdqu  xmmword ptr [rbx+0x70], xmm7
/* Add the vector at [rsp+0x150] to the per-lane counter low dwords at [rsp+0x110]; xmm0 keeps the old value. */
1640        movdqa  xmm1, xmmword ptr [rsp+0x110]
1641        movdqa  xmm0, xmm1
1642        paddd   xmm1, xmmword ptr [rsp+0x150]
1643        movdqa  xmmword ptr [rsp+0x110], xmm1
/* pcmpgtd compares signed dwords; both sides are XORed with CMP_MSB_MASK first (defined outside this view, presumably the sign bit of each dword) to get an unsigned old > new test. */
1644        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1645        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1646        pcmpgtd xmm0, xmm1
/* xmm0 is all-ones in lanes whose low dword wrapped, so subtracting it adds 1 to the high dwords at [rsp+0x120]. */
1647        movdqa  xmm1, xmmword ptr [rsp+0x120]
1648        psubd   xmm1, xmm0
1649        movdqa  xmmword ptr [rsp+0x120], xmm1
/* Step to the next group: 128 output bytes, 32 bytes of the array at rdi, 4 off the remaining count in rsi. */
1650        add     rbx, 128
1651        add     rdi, 32
1652        sub     rsi, 4
/* jnc after cmp = unsigned rsi >= 4: branch back to the earlier 2: label (outside this view). */
1653        cmp     rsi, 4
1654        jnc     2b
/* A non-zero remainder goes to the tail code at the following 3: label; zero falls through to the exit sequence. */
1655        test    rsi, rsi
1656        jnz     3f
/* Shared exit: restore rsp from rbp (dropping the 64-byte-aligned scratch area set up at entry), then pop in the reverse of the entry push order (r15, r14, r13, r12, rbx, rbp). */
16574:
1658        mov     rsp, rbp
1659        pop     rbp
1660        pop     rbx
1661        pop     r12
1662        pop     r13
1663        pop     r14
1664        pop     r15
1665        ret
/* Tail path, entered from the four-at-a-time loop when rsi is non-zero after the loop (jnz 3f). */
1666.p2align 5
16673:
/* Bit 1 of the count clear: no pair to process, skip ahead to the single-item check at the next 3: label. */
1668        test    esi, 0x2
1669        je      3f
/* Load 32 bytes from rcx and duplicate them: xmm0:xmm1 seed lane 0, xmm8:xmm9 seed lane 1. */
1670        movups  xmm0, xmmword ptr [rcx]
1671        movups  xmm1, xmmword ptr [rcx+0x10]
1672        movaps  xmm8, xmm0
1673        movaps  xmm9, xmm1
/* [rsp] = { dword [rsp+0x110], dword [rsp+0x120], 0, 0 }: lane 0 counter low/high (movd zeroes the upper dwords). */
1674        movd    xmm13, dword ptr [rsp+0x110]
1675        movd    xmm14, dword ptr [rsp+0x120]
1676        punpckldq xmm13, xmm14
1677        movaps  xmmword ptr [rsp], xmm13
/* [rsp+0x10] = { dword [rsp+0x114], dword [rsp+0x124], 0, 0 }: the same pair for lane 1. */
1678        movd    xmm14, dword ptr [rsp+0x114]
1679        movd    xmm13, dword ptr [rsp+0x124]
1680        punpckldq xmm14, xmm13
1681        movaps  xmmword ptr [rsp+0x10], xmm14
/* r8 / r9 = the two 8-byte pointers stored at [rdi] and [rdi+8]; they are the per-lane data bases below. */
1682        mov     r8, qword ptr [rdi]
1683        mov     r9, qword ptr [rdi+0x8]
/* eax = r13d | byte [rbp+0x40]. rbp was captured after six pushes, so +0x40 is the second stack-passed argument (presumably the first-block flag bits; confirm against the C prototype). */
1684        movzx   eax, byte ptr [rbp+0x40]
1685        or      eax, r13d
/* rdx = running byte offset into both inputs. */
1686        xor     edx, edx
/* Per-block setup for two lanes. r14d keeps the incoming flags; r12d is ORed in and kept only when rdx + 64 == r15 (last block), otherwise cmovne restores r14d. */
16872:
1688        mov     r14d, eax
1689        or      eax, r12d
1690        add     rdx, 64
1691        cmp     rdx, r15
1692        cmovne  eax, r14d
/* Row 2 of both lanes starts from BLAKE3_IV (constant defined outside this view). */
1693        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1694        movaps  xmm10, xmm2
/* Lane 0, first 32 message bytes: shufps 136 (0x88) gathers the even dwords into xmm4, 221 (0xDD) the odd dwords into xmm5; xmm3 is scratch. */
1695        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1696        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1697        movaps  xmm3, xmm4
1698        shufps  xmm4, xmm5, 136
1699        shufps  xmm3, xmm5, 221
1700        movaps  xmm5, xmm3
/* Lane 0, second 32 bytes: same even/odd split into xmm6 / xmm7, each then rotated one dword position by pshufd 0x93. */
1701        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1702        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1703        movaps  xmm3, xmm6
1704        shufps  xmm6, xmm7, 136
1705        pshufd  xmm6, xmm6, 0x93
1706        shufps  xmm3, xmm7, 221
1707        pshufd  xmm7, xmm3, 0x93
/* Lane 1: identical load and split from r9 into xmm12..xmm15, with xmm11 as scratch. */
1708        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1709        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1710        movaps  xmm11, xmm12
1711        shufps  xmm12, xmm13, 136
1712        shufps  xmm11, xmm13, 221
1713        movaps  xmm13, xmm11
1714        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1715        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1716        movaps  xmm11, xmm14
1717        shufps  xmm14, xmm15, 136
1718        pshufd  xmm14, xmm14, 0x93
1719        shufps  xmm11, xmm15, 221
1720        pshufd  xmm15, xmm11, 0x93
/* rax = (flags << 32) | 64, parked at [rsp+0x20] as the qword { 64, flags } (64 is presumably the block-length word). */
1721        shl     rax, 0x20
1722        or      rax, 0x40
1723        movq    xmm3, rax
1724        movdqa  xmmword ptr [rsp+0x20], xmm3
/* Row 3 = { counter low, counter high, 64, flags }: xmm3 for lane 0 from [rsp], xmm11 for lane 1 from [rsp+0x10]. */
1725        movaps  xmm3, xmmword ptr [rsp]
1726        movaps  xmm11, xmmword ptr [rsp+0x10]
1727        punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1728        punpcklqdq xmm11, xmmword ptr [rsp+0x20]
/* al = number of rounds to run; it is decremented once per pass through the 9: loop. */
1729        mov     al, 7
/* One round for two lanes at once. Lane 0: rows xmm0..xmm3, message xmm4..xmm7. Lane 1: rows xmm8..xmm11, message xmm12..xmm15. */
17309:
/* Add the first message vector to row 0 and spill it; the message permutation at the end of the round reloads it from [rsp+0x20] / [rsp+0x30], and xmm4 is reused as scratch below. */
1731        paddd   xmm0, xmm4
1732        paddd   xmm8, xmm12
1733        movaps  xmmword ptr [rsp+0x20], xmm4
1734        movaps  xmmword ptr [rsp+0x30], xmm12
/* row0 += row1; row3 ^= row0; row3 rotated right 16 (pshuflw/pshufhw 0xB1 swap the 16-bit halves of every dword). */
1735        paddd   xmm0, xmm1
1736        paddd   xmm8, xmm9
1737        pxor    xmm3, xmm0
1738        pxor    xmm11, xmm8
1739        pshuflw xmm3, xmm3, 0xB1
1740        pshufhw xmm3, xmm3, 0xB1
1741        pshuflw xmm11, xmm11, 0xB1
1742        pshufhw xmm11, xmm11, 0xB1
/* row2 += row3; row1 ^= row2; row1 rotated right 12 as (x << 20) | (x >> 12). */
1743        paddd   xmm2, xmm3
1744        paddd   xmm10, xmm11
1745        pxor    xmm1, xmm2
1746        pxor    xmm9, xmm10
1747        movdqa  xmm4, xmm1
1748        pslld   xmm1, 20
1749        psrld   xmm4, 12
1750        por     xmm1, xmm4
1751        movdqa  xmm4, xmm9
1752        pslld   xmm9, 20
1753        psrld   xmm4, 12
1754        por     xmm9, xmm4
/* Add the second message vector and spill it to [rsp+0x40] / [rsp+0x50]; xmm13 becomes scratch for the rotate below. */
1755        paddd   xmm0, xmm5
1756        paddd   xmm8, xmm13
1757        movaps  xmmword ptr [rsp+0x40], xmm5
1758        movaps  xmmword ptr [rsp+0x50], xmm13
1759        paddd   xmm0, xmm1
1760        paddd   xmm8, xmm9
1761        pxor    xmm3, xmm0
1762        pxor    xmm11, xmm8
/* row3 rotated right 8 as (x >> 8) ^ (x << 24). */
1763        movdqa  xmm13, xmm3
1764        psrld   xmm3, 8
1765        pslld   xmm13, 24
1766        pxor    xmm3, xmm13
1767        movdqa  xmm13, xmm11
1768        psrld   xmm11, 8
1769        pslld   xmm13, 24
1770        pxor    xmm11, xmm13
/* row2 += row3; row1 ^= row2; row1 rotated right 7 as (x << 25) | (x >> 7). */
1771        paddd   xmm2, xmm3
1772        paddd   xmm10, xmm11
1773        pxor    xmm1, xmm2
1774        pxor    xmm9, xmm10
1775        movdqa  xmm4, xmm1
1776        pslld   xmm1, 25
1777        psrld   xmm4, 7
1778        por     xmm1, xmm4
1779        movdqa  xmm4, xmm9
1780        pslld   xmm9, 25
1781        psrld   xmm4, 7
1782        por     xmm9, xmm4
/* Re-align rows 0, 3 and 2 against row 1 (pshufd 0x93 / 0x4E / 0x39) so the second half of the round mixes diagonals. */
1783        pshufd  xmm0, xmm0, 0x93
1784        pshufd  xmm8, xmm8, 0x93
1785        pshufd  xmm3, xmm3, 0x4E
1786        pshufd  xmm11, xmm11, 0x4E
1787        pshufd  xmm2, xmm2, 0x39
1788        pshufd  xmm10, xmm10, 0x39
/* Second half: same add / xor / rotate 16, 12, 8, 7 sequence with the third and fourth message vectors. */
1789        paddd   xmm0, xmm6
1790        paddd   xmm8, xmm14
1791        paddd   xmm0, xmm1
1792        paddd   xmm8, xmm9
1793        pxor    xmm3, xmm0
1794        pxor    xmm11, xmm8
1795        pshuflw xmm3, xmm3, 0xB1
1796        pshufhw xmm3, xmm3, 0xB1
1797        pshuflw xmm11, xmm11, 0xB1
1798        pshufhw xmm11, xmm11, 0xB1
1799        paddd   xmm2, xmm3
1800        paddd   xmm10, xmm11
1801        pxor    xmm1, xmm2
1802        pxor    xmm9, xmm10
1803        movdqa  xmm4, xmm1
1804        pslld   xmm1, 20
1805        psrld   xmm4, 12
1806        por     xmm1, xmm4
1807        movdqa  xmm4, xmm9
1808        pslld   xmm9, 20
1809        psrld   xmm4, 12
1810        por     xmm9, xmm4
1811        paddd   xmm0, xmm7
1812        paddd   xmm8, xmm15
1813        paddd   xmm0, xmm1
1814        paddd   xmm8, xmm9
1815        pxor    xmm3, xmm0
1816        pxor    xmm11, xmm8
1817        movdqa  xmm13, xmm3
1818        psrld   xmm3, 8
1819        pslld   xmm13, 24
1820        pxor    xmm3, xmm13
1821        movdqa  xmm13, xmm11
1822        psrld   xmm11, 8
1823        pslld   xmm13, 24
1824        pxor    xmm11, xmm13
1825        paddd   xmm2, xmm3
1826        paddd   xmm10, xmm11
1827        pxor    xmm1, xmm2
1828        pxor    xmm9, xmm10
1829        movdqa  xmm4, xmm1
1830        pslld   xmm1, 25
1831        psrld   xmm4, 7
1832        por     xmm1, xmm4
1833        movdqa  xmm4, xmm9
1834        pslld   xmm9, 25
1835        psrld   xmm4, 7
1836        por     xmm9, xmm4
/* Undo the row re-alignment (0x39 / 0x4E / 0x93 are the inverse shuffles of the ones above). */
1837        pshufd  xmm0, xmm0, 0x39
1838        pshufd  xmm8, xmm8, 0x39
1839        pshufd  xmm3, xmm3, 0x4E
1840        pshufd  xmm11, xmm11, 0x4E
1841        pshufd  xmm2, xmm2, 0x93
1842        pshufd  xmm10, xmm10, 0x93
/* Round counter: after the last round leave through the forward 9: label without permuting the message. */
1843        dec     al
1844        je      9f
/* Build lane 0's message vectors for the next round from the spilled vectors and xmm6 / xmm7. */
/* Blends are done as pand / pand / por against the PBLENDW_*_MASK constants (defined outside this view; presumably they emulate pblendw, which SSE2 lacks). */
1845        movdqa  xmm12, xmmword ptr [rsp+0x20]
1846        movdqa  xmm5, xmmword ptr [rsp+0x40]
1847        pshufd  xmm13, xmm12, 0x0F
1848        shufps  xmm12, xmm5, 214
/* xmm4 = lane 0's new first message vector. */
1849        pshufd  xmm4, xmm12, 0x39
1850        movdqa  xmm12, xmm6
1851        shufps  xmm12, xmm7, 250
1852        pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1853        pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1854        por     xmm13, xmm12
/* Lane 0's new second vector is parked at [rsp+0x20] until lane 1 is done with xmm5. */
1855        movdqa  xmmword ptr [rsp+0x20], xmm13
1856        movdqa  xmm12, xmm7
1857        punpcklqdq xmm12, xmm5
1858        movdqa  xmm13, xmm6
1859        pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1860        pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1861        por     xmm12, xmm13
1862        pshufd  xmm12, xmm12, 0x78
1863        punpckhdq xmm5, xmm7
1864        punpckldq xmm6, xmm5
/* xmm7 = lane 0's new fourth vector; the new third vector is parked at [rsp+0x40]. */
1865        pshufd  xmm7, xmm6, 0x1E
1866        movdqa  xmmword ptr [rsp+0x40], xmm12
/* Lane 1: the same permutation, using xmm5 / xmm6 (free now) as temporaries for the spilled vectors at [rsp+0x30] / [rsp+0x50]. */
1867        movdqa  xmm5, xmmword ptr [rsp+0x30]
1868        movdqa  xmm13, xmmword ptr [rsp+0x50]
1869        pshufd  xmm6, xmm5, 0x0F
1870        shufps  xmm5, xmm13, 214
/* xmm12 = lane 1's new first message vector. */
1871        pshufd  xmm12, xmm5, 0x39
1872        movdqa  xmm5, xmm14
1873        shufps  xmm5, xmm15, 250
1874        pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1875        pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1876        por     xmm6, xmm5
1877        movdqa  xmm5, xmm15
1878        punpcklqdq xmm5, xmm13
/* No register is free here, so state row xmm2 is saved to [rsp+0x30], used as blend scratch, and reloaded below. */
1879        movdqa  xmmword ptr [rsp+0x30], xmm2
1880        movdqa  xmm2, xmm14
1881        pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1882        pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1883        por     xmm5, xmm2
1884        movdqa  xmm2, xmmword ptr [rsp+0x30]
1885        pshufd  xmm5, xmm5, 0x78
1886        punpckhdq xmm13, xmm15
1887        punpckldq xmm14, xmm13
/* xmm15 = lane 1's new fourth vector. */
1888        pshufd  xmm15, xmm14, 0x1E
/* Move the temporaries home: xmm13 / xmm14 = lane 1's new second / third vectors, then reload lane 0's parked second / third vectors into xmm5 / xmm6. */
1889        movdqa  xmm13, xmm6
1890        movdqa  xmm14, xmm5
1891        movdqa  xmm5, xmmword ptr [rsp+0x20]
1892        movdqa  xmm6, xmmword ptr [rsp+0x40]
1893        jmp     9b
/* After the last round: fold rows 2 / 3 into rows 0 / 1. xmm0:xmm1 (lane 0) and xmm8:xmm9 (lane 1) carry the result into the next block or to the store below. */
18949:
1895        pxor    xmm0, xmm2
1896        pxor    xmm1, xmm3
1897        pxor    xmm8, xmm10
1898        pxor    xmm9, xmm11
/* Reset eax to r13d for the next block (this drops the byte ORed in from [rbp+0x40] before the first block), then loop while rdx != r15. */
1899        mov     eax, r13d
1900        cmp     rdx, r15
1901        jne     2b
/* Store 32 bytes per lane: lane 0 at rbx, lane 1 at rbx+0x20. */
1902        movups  xmmword ptr [rbx], xmm0
1903        movups  xmmword ptr [rbx+0x10], xmm1
1904        movups  xmmword ptr [rbx+0x20], xmm8
1905        movups  xmmword ptr [rbx+0x30], xmm9
/* [rsp+0x130] holds the broadcast of -r9d written at function entry, so eax = the original r9d after neg. */
1906        mov     eax, dword ptr [rsp+0x130]
1907        neg     eax
/* With eax = 1 these read [rsp+0x118] / [rsp+0x128] (lane 2's counter dwords) and make them lane 0's; with eax = 0 lane 0 is rewritten unchanged. Assumes r9d is 0 or 1; confirm against callers. */
1908        mov    r10d, dword ptr [rsp+0x110+8*rax]
1909        mov    r11d, dword ptr [rsp+0x120+8*rax]
1910        mov dword ptr [rsp+0x110], r10d
1911        mov dword ptr [rsp+0x120], r11d
/* Consume the pair: two 8-byte entries at rdi, 64 output bytes, 2 off the count. */
1912        add     rdi, 16
1913        add     rbx, 64
1914        sub     rsi, 2
19153:
1916        test    esi, 0x1
1917        je      4b
1918        movups  xmm0, xmmword ptr [rcx]
1919        movups  xmm1, xmmword ptr [rcx+0x10]
1920        movd    xmm13, dword ptr [rsp+0x110]
1921        movd    xmm14, dword ptr [rsp+0x120]
1922        punpckldq xmm13, xmm14
1923        mov     r8, qword ptr [rdi]
1924        movzx   eax, byte ptr [rbp+0x40]
1925        or      eax, r13d
1926        xor     edx, edx
19272:
1928        mov     r14d, eax
1929        or      eax, r12d
1930        add     rdx, 64
1931        cmp     rdx, r15
1932        cmovne  eax, r14d
1933        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1934        shl     rax, 32
1935        or      rax, 64
1936        movq    xmm12, rax
1937        movdqa  xmm3, xmm13
1938        punpcklqdq xmm3, xmm12
1939        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1940        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1941        movaps  xmm8, xmm4
1942        shufps  xmm4, xmm5, 136
1943        shufps  xmm8, xmm5, 221
1944        movaps  xmm5, xmm8
1945        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1946        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1947        movaps  xmm8, xmm6
1948        shufps  xmm6, xmm7, 136
1949        pshufd  xmm6, xmm6, 0x93
1950        shufps  xmm8, xmm7, 221
1951        pshufd  xmm7, xmm8, 0x93
1952        mov     al, 7
19539:
1954        paddd   xmm0, xmm4
1955        paddd   xmm0, xmm1
1956        pxor    xmm3, xmm0
1957        pshuflw xmm3, xmm3, 0xB1
1958        pshufhw xmm3, xmm3, 0xB1
1959        paddd   xmm2, xmm3
1960        pxor    xmm1, xmm2
1961        movdqa  xmm11, xmm1
1962        pslld   xmm1, 20
1963        psrld   xmm11, 12
1964        por     xmm1, xmm11
1965        paddd   xmm0, xmm5
1966        paddd   xmm0, xmm1
1967        pxor    xmm3, xmm0
1968        movdqa  xmm14, xmm3
1969        psrld   xmm3, 8
1970        pslld   xmm14, 24
1971        pxor    xmm3, xmm14
1972        paddd   xmm2, xmm3
1973        pxor    xmm1, xmm2
1974        movdqa  xmm11, xmm1
1975        pslld   xmm1, 25
1976        psrld   xmm11, 7
1977        por     xmm1, xmm11
1978        pshufd  xmm0, xmm0, 0x93
1979        pshufd  xmm3, xmm3, 0x4E
1980        pshufd  xmm2, xmm2, 0x39
1981        paddd   xmm0, xmm6
1982        paddd   xmm0, xmm1
1983        pxor    xmm3, xmm0
1984        pshuflw xmm3, xmm3, 0xB1
1985        pshufhw xmm3, xmm3, 0xB1
1986        paddd   xmm2, xmm3
1987        pxor    xmm1, xmm2
1988        movdqa  xmm11, xmm1
1989        pslld   xmm1, 20
1990        psrld   xmm11, 12
1991        por     xmm1, xmm11
1992        paddd   xmm0, xmm7
1993        paddd   xmm0, xmm1
1994        pxor    xmm3, xmm0
1995        movdqa  xmm14, xmm3
1996        psrld   xmm3, 8
1997        pslld   xmm14, 24
1998        pxor    xmm3, xmm14
1999        paddd   xmm2, xmm3
2000        pxor    xmm1, xmm2
2001        movdqa  xmm11, xmm1
2002        pslld   xmm1, 25
2003        psrld   xmm11, 7
2004        por     xmm1, xmm11
2005        pshufd  xmm0, xmm0, 0x39
2006        pshufd  xmm3, xmm3, 0x4E
2007        pshufd  xmm2, xmm2, 0x93
2008        dec     al
2009        jz      9f
2010        movdqa  xmm8, xmm4
2011        shufps  xmm8, xmm5, 214
2012        pshufd  xmm9, xmm4, 0x0F
2013        pshufd  xmm4, xmm8, 0x39
2014        movdqa  xmm8, xmm6
2015        shufps  xmm8, xmm7, 250
2016        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2017        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2018        por     xmm9, xmm8
2019        movdqa  xmm8, xmm7
2020        punpcklqdq xmm8, xmm5
2021        movdqa  xmm10, xmm6
2022        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2023        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2024        por     xmm8, xmm10
2025        pshufd  xmm8, xmm8, 0x78
2026        punpckhdq xmm5, xmm7
2027        punpckldq xmm6, xmm5
2028        pshufd  xmm7, xmm6, 0x1E
2029        movdqa  xmm5, xmm9
2030        movdqa  xmm6, xmm8
2031        jmp     9b
20329:
2033        pxor    xmm0, xmm2
2034        pxor    xmm1, xmm3
2035        mov     eax, r13d
2036        cmp     rdx, r15
2037        jne     2b
2038        movups  xmmword ptr [rbx], xmm0
2039        movups  xmmword ptr [rbx+0x10], xmm1
2040        jmp     4b
2041
2042.p2align 6
/*
 * blake3_compress_in_place_sse2 (alias _blake3_compress_in_place_sse2)
 *
 * Runs seven rounds of the mixing loop below over one 64-byte block and
 * overwrites the 32-byte chaining value with the result.
 *
 * Inputs, as consumed by the code (System V AMD64 argument registers):
 *   rdi  chaining value, 32 bytes; read on entry, overwritten on exit
 *   rsi  message block, 64 bytes; loaded with unaligned moves
 *   dl   block length, placed in state lane 14
 *   rcx  64-bit counter, placed in state lanes 12 and 13
 *   r8b  flags, placed in state lane 15
 *   NOTE(review): block length and flags are treated as byte-wide values,
 *   matching blake3_compress_xof_sse2 in this file; confirm against the C
 *   prototype.
 *
 * Output:   [rdi+0x00] = row0 ^ row2, [rdi+0x10] = row1 ^ row3.
 * Clobbers: rax, rdx, flags, xmm0-xmm11, xmm14.  No stack is used.
 *
 * Register roles inside the loop:
 *   xmm0..xmm3  state rows 0..3
 *   xmm4..xmm7  message words, regrouped once at entry and rearranged after
 *               every round except the last
 *   al          number of rounds still to run
 */
blake3_compress_in_place_sse2:
_blake3_compress_in_place_sse2:
        _CET_ENDBR
        movups  xmm0, xmmword ptr [rdi]                 /* row 0 = cv[0..3] */
        movups  xmm1, xmmword ptr [rdi+0x10]            /* row 1 = cv[4..7] */
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]       /* row 2 = IV words */
        /*
         * Use only the low byte of the block-length and flags registers.
         * Zero-extending first keeps stale upper register bits left by the
         * caller out of the state; this is the same sequence that
         * blake3_compress_xof_sse2 uses.
         */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax                                /* rdx = flags:block_len */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4                           /* row 3 = ctr_lo, ctr_hi, len, flags */
        /* Split the block into even-indexed and odd-indexed message words. */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 0x88                        /* m0, m2, m4, m6 */
        shufps  xmm8, xmm5, 0xDD                        /* m1, m3, m5, m7 */
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 0x88
        pshufd  xmm6, xmm6, 0x93                        /* m14, m8, m10, m12 */
        shufps  xmm8, xmm7, 0xDD
        pshufd  xmm7, xmm8, 0x93                        /* m15, m9, m11, m13 */
        mov     al, 7                                   /* round counter */
1:
        /* Columns, first half: add xmm4, rotate right by 16 then by 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                        /* swap 16-bit halves = rotate 16 */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                             /* rotate right 12 */
        /* Columns, second half: add xmm5, rotate right by 8 then by 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                             /* rotate right 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                             /* rotate right 7 */
        /* Rotate the lanes of rows 0, 3 and 2 so the next step mixes diagonals. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonals, first half: add xmm6, rotate right by 16 then by 12. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonals, second half: add xmm7, rotate right by 8 then by 7. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation so the rows are column-aligned again. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      2f                                      /* last round: skip the rearrangement */
        /*
         * Rearrange the sixteen message words in xmm4..xmm7 for the next
         * round.  Each pand/pand/por triple merges two registers through a
         * pair of complementary lane masks.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 0xD6
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 0xFA
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     1b
2:
        /* Fold the lower state rows into the upper ones and store the new cv. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
2153
2154.p2align 6
/*
 * blake3_compress_xof_sse2 (alias _blake3_compress_xof_sse2)
 *
 * Runs seven rounds of the mixing loop below over one 64-byte block and
 * writes a 64-byte result to a separate output buffer.  The chaining value
 * itself is only read.
 *
 * Inputs, as consumed by the code (System V AMD64 argument registers):
 *   rdi  chaining value, 32 bytes; read on entry and again before the stores
 *   rsi  message block, 64 bytes; loaded with unaligned moves
 *   dl   block length (zero-extended), placed in state lane 14
 *   rcx  64-bit counter, placed in state lanes 12 and 13
 *   r8b  flags (zero-extended), placed in state lane 15
 *   r9   output buffer, 64 bytes; written with unaligned moves
 *
 * Output layout at r9:
 *   +0x00  row0 ^ row2        +0x20  row2 ^ cv[0..3]
 *   +0x10  row1 ^ row3        +0x30  row3 ^ cv[4..7]
 * Clobbers: rax, rdx, flags, xmm0-xmm11, xmm14.  No stack is used.
 *
 * Register roles inside the loop:
 *   xmm0..xmm3  state rows 0..3
 *   xmm4..xmm7  message words, regrouped once at entry and rearranged after
 *               every round except the last
 *   al          number of rounds still to run
 */
blake3_compress_xof_sse2:
_blake3_compress_xof_sse2:
        _CET_ENDBR
        movups  xmm0, xmmword ptr [rdi]                 /* row 0 = cv[0..3] */
        movups  xmm1, xmmword ptr [rdi+0x10]            /* row 1 = cv[4..7] */
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]       /* row 2 = IV words */
        /* Only the low byte of r8 and rdx is kept. */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax                                /* rdx = flags:block_len */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4                           /* row 3 = ctr_lo, ctr_hi, len, flags */
        /* Split the block into even-indexed and odd-indexed message words. */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 0x88                        /* m0, m2, m4, m6 */
        shufps  xmm8, xmm5, 0xDD                        /* m1, m3, m5, m7 */
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 0x88
        pshufd  xmm6, xmm6, 0x93                        /* m14, m8, m10, m12 */
        shufps  xmm8, xmm7, 0xDD
        pshufd  xmm7, xmm8, 0x93                        /* m15, m9, m11, m13 */
        mov     al, 7                                   /* round counter */
1:
        /* Columns, first half: add xmm4, rotate right by 16 then by 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                        /* swap 16-bit halves = rotate 16 */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                             /* rotate right 12 */
        /* Columns, second half: add xmm5, rotate right by 8 then by 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                             /* rotate right 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                             /* rotate right 7 */
        /* Rotate the lanes of rows 0, 3 and 2 so the next step mixes diagonals. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonals, first half: add xmm6, rotate right by 16 then by 12. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonals, second half: add xmm7, rotate right by 8 then by 7. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation so the rows are column-aligned again. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      2f                                      /* last round: skip the rearrangement */
        /*
         * Rearrange the sixteen message words in xmm4..xmm7 for the next
         * round.  Each pand/pand/por triple merges two registers through a
         * pair of complementary lane masks.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 0xD6
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 0xFA
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     1b
2:
        /*
         * The message registers are free now; reuse xmm4/xmm5 to reload the
         * input chaining value.  Both loads happen before any store to r9.
         */
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm0, xmm2                              /* out[0..3]   = row0 ^ row2 */
        pxor    xmm1, xmm3                              /* out[4..7]   = row1 ^ row3 */
        pxor    xmm2, xmm4                              /* out[8..11]  = row2 ^ cv[0..3] */
        pxor    xmm3, xmm5                              /* out[12..15] = row3 ^ cv[4..7] */
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret
2273
2274
/*
 * Read-only constant tables for the SSE2 kernels.
 *
 * The section starts on a 64-byte boundary and every table is exactly four
 * 32-bit words, so each label stays 16-byte aligned.  The code depends on
 * that: the tables are used as memory operands of movaps / movdqa / pand /
 * pxor.  Keep every table 16 bytes long, or re-align after it.
 */
#if defined(__APPLE__)
.static_data
#else
.section .rodata
#endif
.p2align  6
/* The four IV words, one per lane; loaded as state row 2. */
BLAKE3_IV:
        .long   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
/* Lane offsets 0..3; ANDed with a broadcast mask in the blake3_hash_many_sse2 prologue. */
ADD0:
        .long   0, 1, 2, 3
/* Constant 4 in every lane; masked the same way as ADD0. */
ADD1:
        .long   4, 4, 4, 4
/* BLAKE3_IV word 0 in every lane. */
BLAKE3_IV_0:
        .long   0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
/* BLAKE3_IV word 1 in every lane. */
BLAKE3_IV_1:
        .long   0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
/* BLAKE3_IV word 2 in every lane. */
BLAKE3_IV_2:
        .long   0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
/* BLAKE3_IV word 3 in every lane. */
BLAKE3_IV_3:
        .long   0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane. */
BLAKE3_BLOCK_LEN:
        .long   64, 64, 64, 64
/* Sign bit of every lane; XORed into both operands so that pcmpgtd (signed) orders them as unsigned. */
CMP_MSB_MASK:
        .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * Lane-select masks used in pand/pand/por triples.  Each mask is all-ones in
 * exactly the dwords that a pblendw with the immediate in its name would
 * select (two immediate bits per dword).
 */
PBLENDW_0x33_MASK:
        .long   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:
        .long   0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
PBLENDW_0x3F_MASK:
        .long   0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:
        .long   0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
2308
2309#endif
2310