/*
 * File preamble for the SSE2 routines: target guard, ELF stack note,
 * optional CET support, symbol visibility/export, and section selection.
 * Everything from here on is only assembled when the preprocessor targets
 * x86-64.
 *
 * NOTE(review): every line in this copy of the file begins with a decimal
 * number fused onto the statement. That looks like a line-number artifact
 * rather than intended assembler input -- confirm against the pristine file.
 */
1#if defined(__x86_64__)
2
/*
 * On ELF Linux/FreeBSD, emit an empty .note.GNU-stack section (no flags).
 * GNU toolchains read this as "this object does not need an executable
 * stack".
 */
3#if defined(__ELF__) && (defined(__linux__) || defined(__FreeBSD__))
4.section .note.GNU-stack,"",%progbits
5#endif
6
/*
 * When building for ELF with CET enabled, pull in <cet.h> if the compiler
 * can find it. The header is presumably what supplies _CET_ENDBR -- verify
 * against the toolchain's cet.h.
 */
7#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
8#if __has_include(<cet.h>)
9#include <cet.h>
10#endif
11#endif
12
/*
 * Fallback: if nothing defined _CET_ENDBR above, make it expand to nothing
 * so the function entry below can use it unconditionally.
 */
13#if !defined(_CET_ENDBR)
14#define _CET_ENDBR
15#endif
16
/*
 * HIDDEN abstracts the "do not export from the final image" directive:
 * .private_extern when __APPLE__ is defined, .hidden otherwise.
 */
17#ifdef __APPLE__
18#define HIDDEN .private_extern
19#else
20#define HIDDEN .hidden
21#endif
22
/* Intel operand order (dst, src) with unprefixed register names. */
23.intel_syntax noprefix
/*
 * Each routine is declared under two names, with and without a leading
 * underscore -- presumably so the same file links on platforms whose C
 * symbol mangling prepends '_' as well as on those that do not.
 * All six names are first given hidden visibility ...
 */
24HIDDEN blake3_hash_many_sse2
25HIDDEN _blake3_hash_many_sse2
26HIDDEN blake3_compress_in_place_sse2
27HIDDEN _blake3_compress_in_place_sse2
28HIDDEN blake3_compress_xof_sse2
29HIDDEN _blake3_compress_xof_sse2
/* ... and then made global so other object files can reference them. */
30.global blake3_hash_many_sse2
31.global _blake3_hash_many_sse2
32.global blake3_compress_in_place_sse2
33.global _blake3_compress_in_place_sse2
34.global blake3_compress_xof_sse2
35.global _blake3_compress_xof_sse2
/*
 * Switch to the code section; the __APPLE__ build uses the bare .text
 * directive, every other target uses the ".section .text" spelling.
 */
36#ifdef __APPLE__
37.text
38#else
39.section .text
40#endif
/* Align the entry label that follows to a 2^6 = 64-byte boundary. */
41        .p2align  6
42_blake3_hash_many_sse2:
43blake3_hash_many_sse2:
44        _CET_ENDBR
45        push    r15
46        push    r14
47        push    r13
48        push    r12
49        push    rbx
50        push    rbp
51        mov     rbp, rsp
52        sub     rsp, 360
53        and     rsp, 0xFFFFFFFFFFFFFFC0
54        neg     r9d
55        movd    xmm0, r9d
56        pshufd  xmm0, xmm0, 0x00
57        movdqa  xmmword ptr [rsp+0x130], xmm0
58        movdqa  xmm1, xmm0
59        pand    xmm1, xmmword ptr [ADD0+rip]
60        pand    xmm0, xmmword ptr [ADD1+rip]
61        movdqa  xmmword ptr [rsp+0x150], xmm0
62        movd    xmm0, r8d
63        pshufd  xmm0, xmm0, 0x00
64        paddd   xmm0, xmm1
65        movdqa  xmmword ptr [rsp+0x110], xmm0
66        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
67        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
68        pcmpgtd xmm1, xmm0
69        shr     r8, 32
70        movd    xmm2, r8d
71        pshufd  xmm2, xmm2, 0x00
72        psubd   xmm2, xmm1
73        movdqa  xmmword ptr [rsp+0x120], xmm2
74        mov     rbx, qword ptr [rbp+0x50]
75        mov     r15, rdx
76        shl     r15, 6
77        movzx   r13d, byte ptr [rbp+0x38]
78        movzx   r12d, byte ptr [rbp+0x48]
79        cmp     rsi, 4
80        jc      3f
812:
82        movdqu  xmm3, xmmword ptr [rcx]
83        pshufd  xmm0, xmm3, 0x00
84        pshufd  xmm1, xmm3, 0x55
85        pshufd  xmm2, xmm3, 0xAA
86        pshufd  xmm3, xmm3, 0xFF
87        movdqu  xmm7, xmmword ptr [rcx+0x10]
88        pshufd  xmm4, xmm7, 0x00
89        pshufd  xmm5, xmm7, 0x55
90        pshufd  xmm6, xmm7, 0xAA
91        pshufd  xmm7, xmm7, 0xFF
92        mov     r8, qword ptr [rdi]
93        mov     r9, qword ptr [rdi+0x8]
94        mov     r10, qword ptr [rdi+0x10]
95        mov     r11, qword ptr [rdi+0x18]
96        movzx   eax, byte ptr [rbp+0x40]
97        or      eax, r13d
98        xor     edx, edx
999:
100        mov     r14d, eax
101        or      eax, r12d
102        add     rdx, 64
103        cmp     rdx, r15
104        cmovne  eax, r14d
105        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
106        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
107        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
108        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
109        movdqa  xmm12, xmm8
110        punpckldq xmm8, xmm9
111        punpckhdq xmm12, xmm9
112        movdqa  xmm14, xmm10
113        punpckldq xmm10, xmm11
114        punpckhdq xmm14, xmm11
115        movdqa  xmm9, xmm8
116        punpcklqdq xmm8, xmm10
117        punpckhqdq xmm9, xmm10
118        movdqa  xmm13, xmm12
119        punpcklqdq xmm12, xmm14
120        punpckhqdq xmm13, xmm14
121        movdqa  xmmword ptr [rsp], xmm8
122        movdqa  xmmword ptr [rsp+0x10], xmm9
123        movdqa  xmmword ptr [rsp+0x20], xmm12
124        movdqa  xmmword ptr [rsp+0x30], xmm13
125        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
126        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
127        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
128        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
129        movdqa  xmm12, xmm8
130        punpckldq xmm8, xmm9
131        punpckhdq xmm12, xmm9
132        movdqa  xmm14, xmm10
133        punpckldq xmm10, xmm11
134        punpckhdq xmm14, xmm11
135        movdqa  xmm9, xmm8
136        punpcklqdq xmm8, xmm10
137        punpckhqdq xmm9, xmm10
138        movdqa  xmm13, xmm12
139        punpcklqdq xmm12, xmm14
140        punpckhqdq xmm13, xmm14
141        movdqa  xmmword ptr [rsp+0x40], xmm8
142        movdqa  xmmword ptr [rsp+0x50], xmm9
143        movdqa  xmmword ptr [rsp+0x60], xmm12
144        movdqa  xmmword ptr [rsp+0x70], xmm13
145        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
146        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
147        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
148        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
149        movdqa  xmm12, xmm8
150        punpckldq xmm8, xmm9
151        punpckhdq xmm12, xmm9
152        movdqa  xmm14, xmm10
153        punpckldq xmm10, xmm11
154        punpckhdq xmm14, xmm11
155        movdqa  xmm9, xmm8
156        punpcklqdq xmm8, xmm10
157        punpckhqdq xmm9, xmm10
158        movdqa  xmm13, xmm12
159        punpcklqdq xmm12, xmm14
160        punpckhqdq xmm13, xmm14
161        movdqa  xmmword ptr [rsp+0x80], xmm8
162        movdqa  xmmword ptr [rsp+0x90], xmm9
163        movdqa  xmmword ptr [rsp+0xA0], xmm12
164        movdqa  xmmword ptr [rsp+0xB0], xmm13
165        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
166        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
167        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
168        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
169        movdqa  xmm12, xmm8
170        punpckldq xmm8, xmm9
171        punpckhdq xmm12, xmm9
172        movdqa  xmm14, xmm10
173        punpckldq xmm10, xmm11
174        punpckhdq xmm14, xmm11
175        movdqa  xmm9, xmm8
176        punpcklqdq xmm8, xmm10
177        punpckhqdq xmm9, xmm10
178        movdqa  xmm13, xmm12
179        punpcklqdq xmm12, xmm14
180        punpckhqdq xmm13, xmm14
181        movdqa  xmmword ptr [rsp+0xC0], xmm8
182        movdqa  xmmword ptr [rsp+0xD0], xmm9
183        movdqa  xmmword ptr [rsp+0xE0], xmm12
184        movdqa  xmmword ptr [rsp+0xF0], xmm13
185        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
186        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
187        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
188        movdqa  xmm12, xmmword ptr [rsp+0x110]
189        movdqa  xmm13, xmmword ptr [rsp+0x120]
190        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
191        movd    xmm15, eax
192        pshufd  xmm15, xmm15, 0x00
193        prefetcht0 [r8+rdx+0x80]
194        prefetcht0 [r9+rdx+0x80]
195        prefetcht0 [r10+rdx+0x80]
196        prefetcht0 [r11+rdx+0x80]
197        paddd   xmm0, xmmword ptr [rsp]
198        paddd   xmm1, xmmword ptr [rsp+0x20]
199        paddd   xmm2, xmmword ptr [rsp+0x40]
200        paddd   xmm3, xmmword ptr [rsp+0x60]
201        paddd   xmm0, xmm4
202        paddd   xmm1, xmm5
203        paddd   xmm2, xmm6
204        paddd   xmm3, xmm7
205        pxor    xmm12, xmm0
206        pxor    xmm13, xmm1
207        pxor    xmm14, xmm2
208        pxor    xmm15, xmm3
209        pshuflw xmm12, xmm12, 0xB1
210        pshufhw xmm12, xmm12, 0xB1
211        pshuflw xmm13, xmm13, 0xB1
212        pshufhw xmm13, xmm13, 0xB1
213        pshuflw xmm14, xmm14, 0xB1
214        pshufhw xmm14, xmm14, 0xB1
215        pshuflw xmm15, xmm15, 0xB1
216        pshufhw xmm15, xmm15, 0xB1
217        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
218        paddd   xmm8, xmm12
219        paddd   xmm9, xmm13
220        paddd   xmm10, xmm14
221        paddd   xmm11, xmm15
222        pxor    xmm4, xmm8
223        pxor    xmm5, xmm9
224        pxor    xmm6, xmm10
225        pxor    xmm7, xmm11
226        movdqa  xmmword ptr [rsp+0x100], xmm8
227        movdqa  xmm8, xmm4
228        psrld   xmm8, 12
229        pslld   xmm4, 20
230        por     xmm4, xmm8
231        movdqa  xmm8, xmm5
232        psrld   xmm8, 12
233        pslld   xmm5, 20
234        por     xmm5, xmm8
235        movdqa  xmm8, xmm6
236        psrld   xmm8, 12
237        pslld   xmm6, 20
238        por     xmm6, xmm8
239        movdqa  xmm8, xmm7
240        psrld   xmm8, 12
241        pslld   xmm7, 20
242        por     xmm7, xmm8
243        paddd   xmm0, xmmword ptr [rsp+0x10]
244        paddd   xmm1, xmmword ptr [rsp+0x30]
245        paddd   xmm2, xmmword ptr [rsp+0x50]
246        paddd   xmm3, xmmword ptr [rsp+0x70]
247        paddd   xmm0, xmm4
248        paddd   xmm1, xmm5
249        paddd   xmm2, xmm6
250        paddd   xmm3, xmm7
251        pxor    xmm12, xmm0
252        pxor    xmm13, xmm1
253        pxor    xmm14, xmm2
254        pxor    xmm15, xmm3
255        movdqa  xmm8, xmm12
256        psrld   xmm12, 8
257        pslld   xmm8, 24
258        pxor    xmm12, xmm8
259        movdqa  xmm8, xmm13
260        psrld   xmm13, 8
261        pslld   xmm8, 24
262        pxor    xmm13, xmm8
263        movdqa  xmm8, xmm14
264        psrld   xmm14, 8
265        pslld   xmm8, 24
266        pxor    xmm14, xmm8
267        movdqa  xmm8, xmm15
268        psrld   xmm15, 8
269        pslld   xmm8, 24
270        pxor    xmm15, xmm8
271        movdqa  xmm8, xmmword ptr [rsp+0x100]
272        paddd   xmm8, xmm12
273        paddd   xmm9, xmm13
274        paddd   xmm10, xmm14
275        paddd   xmm11, xmm15
276        pxor    xmm4, xmm8
277        pxor    xmm5, xmm9
278        pxor    xmm6, xmm10
279        pxor    xmm7, xmm11
280        movdqa  xmmword ptr [rsp+0x100], xmm8
281        movdqa  xmm8, xmm4
282        psrld   xmm8, 7
283        pslld   xmm4, 25
284        por     xmm4, xmm8
285        movdqa  xmm8, xmm5
286        psrld   xmm8, 7
287        pslld   xmm5, 25
288        por     xmm5, xmm8
289        movdqa  xmm8, xmm6
290        psrld   xmm8, 7
291        pslld   xmm6, 25
292        por     xmm6, xmm8
293        movdqa  xmm8, xmm7
294        psrld   xmm8, 7
295        pslld   xmm7, 25
296        por     xmm7, xmm8
297        paddd   xmm0, xmmword ptr [rsp+0x80]
298        paddd   xmm1, xmmword ptr [rsp+0xA0]
299        paddd   xmm2, xmmword ptr [rsp+0xC0]
300        paddd   xmm3, xmmword ptr [rsp+0xE0]
301        paddd   xmm0, xmm5
302        paddd   xmm1, xmm6
303        paddd   xmm2, xmm7
304        paddd   xmm3, xmm4
305        pxor    xmm15, xmm0
306        pxor    xmm12, xmm1
307        pxor    xmm13, xmm2
308        pxor    xmm14, xmm3
309        pshuflw xmm15, xmm15, 0xB1
310        pshufhw xmm15, xmm15, 0xB1
311        pshuflw xmm12, xmm12, 0xB1
312        pshufhw xmm12, xmm12, 0xB1
313        pshuflw xmm13, xmm13, 0xB1
314        pshufhw xmm13, xmm13, 0xB1
315        pshuflw xmm14, xmm14, 0xB1
316        pshufhw xmm14, xmm14, 0xB1
317        paddd   xmm10, xmm15
318        paddd   xmm11, xmm12
319        movdqa  xmm8, xmmword ptr [rsp+0x100]
320        paddd   xmm8, xmm13
321        paddd   xmm9, xmm14
322        pxor    xmm5, xmm10
323        pxor    xmm6, xmm11
324        pxor    xmm7, xmm8
325        pxor    xmm4, xmm9
326        movdqa  xmmword ptr [rsp+0x100], xmm8
327        movdqa  xmm8, xmm5
328        psrld   xmm8, 12
329        pslld   xmm5, 20
330        por     xmm5, xmm8
331        movdqa  xmm8, xmm6
332        psrld   xmm8, 12
333        pslld   xmm6, 20
334        por     xmm6, xmm8
335        movdqa  xmm8, xmm7
336        psrld   xmm8, 12
337        pslld   xmm7, 20
338        por     xmm7, xmm8
339        movdqa  xmm8, xmm4
340        psrld   xmm8, 12
341        pslld   xmm4, 20
342        por     xmm4, xmm8
343        paddd   xmm0, xmmword ptr [rsp+0x90]
344        paddd   xmm1, xmmword ptr [rsp+0xB0]
345        paddd   xmm2, xmmword ptr [rsp+0xD0]
346        paddd   xmm3, xmmword ptr [rsp+0xF0]
347        paddd   xmm0, xmm5
348        paddd   xmm1, xmm6
349        paddd   xmm2, xmm7
350        paddd   xmm3, xmm4
351        pxor    xmm15, xmm0
352        pxor    xmm12, xmm1
353        pxor    xmm13, xmm2
354        pxor    xmm14, xmm3
355        movdqa  xmm8, xmm15
356        psrld   xmm15, 8
357        pslld   xmm8, 24
358        pxor    xmm15, xmm8
359        movdqa  xmm8, xmm12
360        psrld   xmm12, 8
361        pslld   xmm8, 24
362        pxor    xmm12, xmm8
363        movdqa  xmm8, xmm13
364        psrld   xmm13, 8
365        pslld   xmm8, 24
366        pxor    xmm13, xmm8
367        movdqa  xmm8, xmm14
368        psrld   xmm14, 8
369        pslld   xmm8, 24
370        pxor    xmm14, xmm8
371        paddd   xmm10, xmm15
372        paddd   xmm11, xmm12
373        movdqa  xmm8, xmmword ptr [rsp+0x100]
374        paddd   xmm8, xmm13
375        paddd   xmm9, xmm14
376        pxor    xmm5, xmm10
377        pxor    xmm6, xmm11
378        pxor    xmm7, xmm8
379        pxor    xmm4, xmm9
380        movdqa  xmmword ptr [rsp+0x100], xmm8
381        movdqa  xmm8, xmm5
382        psrld   xmm8, 7
383        pslld   xmm5, 25
384        por     xmm5, xmm8
385        movdqa  xmm8, xmm6
386        psrld   xmm8, 7
387        pslld   xmm6, 25
388        por     xmm6, xmm8
389        movdqa  xmm8, xmm7
390        psrld   xmm8, 7
391        pslld   xmm7, 25
392        por     xmm7, xmm8
393        movdqa  xmm8, xmm4
394        psrld   xmm8, 7
395        pslld   xmm4, 25
396        por     xmm4, xmm8
397        paddd   xmm0, xmmword ptr [rsp+0x20]
398        paddd   xmm1, xmmword ptr [rsp+0x30]
399        paddd   xmm2, xmmword ptr [rsp+0x70]
400        paddd   xmm3, xmmword ptr [rsp+0x40]
401        paddd   xmm0, xmm4
402        paddd   xmm1, xmm5
403        paddd   xmm2, xmm6
404        paddd   xmm3, xmm7
405        pxor    xmm12, xmm0
406        pxor    xmm13, xmm1
407        pxor    xmm14, xmm2
408        pxor    xmm15, xmm3
409        pshuflw xmm12, xmm12, 0xB1
410        pshufhw xmm12, xmm12, 0xB1
411        pshuflw xmm13, xmm13, 0xB1
412        pshufhw xmm13, xmm13, 0xB1
413        pshuflw xmm14, xmm14, 0xB1
414        pshufhw xmm14, xmm14, 0xB1
415        pshuflw xmm15, xmm15, 0xB1
416        pshufhw xmm15, xmm15, 0xB1
417        movdqa  xmm8, xmmword ptr [rsp+0x100]
418        paddd   xmm8, xmm12
419        paddd   xmm9, xmm13
420        paddd   xmm10, xmm14
421        paddd   xmm11, xmm15
422        pxor    xmm4, xmm8
423        pxor    xmm5, xmm9
424        pxor    xmm6, xmm10
425        pxor    xmm7, xmm11
426        movdqa  xmmword ptr [rsp+0x100], xmm8
427        movdqa  xmm8, xmm4
428        psrld   xmm8, 12
429        pslld   xmm4, 20
430        por     xmm4, xmm8
431        movdqa  xmm8, xmm5
432        psrld   xmm8, 12
433        pslld   xmm5, 20
434        por     xmm5, xmm8
435        movdqa  xmm8, xmm6
436        psrld   xmm8, 12
437        pslld   xmm6, 20
438        por     xmm6, xmm8
439        movdqa  xmm8, xmm7
440        psrld   xmm8, 12
441        pslld   xmm7, 20
442        por     xmm7, xmm8
443        paddd   xmm0, xmmword ptr [rsp+0x60]
444        paddd   xmm1, xmmword ptr [rsp+0xA0]
445        paddd   xmm2, xmmword ptr [rsp]
446        paddd   xmm3, xmmword ptr [rsp+0xD0]
447        paddd   xmm0, xmm4
448        paddd   xmm1, xmm5
449        paddd   xmm2, xmm6
450        paddd   xmm3, xmm7
451        pxor    xmm12, xmm0
452        pxor    xmm13, xmm1
453        pxor    xmm14, xmm2
454        pxor    xmm15, xmm3
455        movdqa  xmm8, xmm12
456        psrld   xmm12, 8
457        pslld   xmm8, 24
458        pxor    xmm12, xmm8
459        movdqa  xmm8, xmm13
460        psrld   xmm13, 8
461        pslld   xmm8, 24
462        pxor    xmm13, xmm8
463        movdqa  xmm8, xmm14
464        psrld   xmm14, 8
465        pslld   xmm8, 24
466        pxor    xmm14, xmm8
467        movdqa  xmm8, xmm15
468        psrld   xmm15, 8
469        pslld   xmm8, 24
470        pxor    xmm15, xmm8
471        movdqa  xmm8, xmmword ptr [rsp+0x100]
472        paddd   xmm8, xmm12
473        paddd   xmm9, xmm13
474        paddd   xmm10, xmm14
475        paddd   xmm11, xmm15
476        pxor    xmm4, xmm8
477        pxor    xmm5, xmm9
478        pxor    xmm6, xmm10
479        pxor    xmm7, xmm11
480        movdqa  xmmword ptr [rsp+0x100], xmm8
481        movdqa  xmm8, xmm4
482        psrld   xmm8, 7
483        pslld   xmm4, 25
484        por     xmm4, xmm8
485        movdqa  xmm8, xmm5
486        psrld   xmm8, 7
487        pslld   xmm5, 25
488        por     xmm5, xmm8
489        movdqa  xmm8, xmm6
490        psrld   xmm8, 7
491        pslld   xmm6, 25
492        por     xmm6, xmm8
493        movdqa  xmm8, xmm7
494        psrld   xmm8, 7
495        pslld   xmm7, 25
496        por     xmm7, xmm8
497        paddd   xmm0, xmmword ptr [rsp+0x10]
498        paddd   xmm1, xmmword ptr [rsp+0xC0]
499        paddd   xmm2, xmmword ptr [rsp+0x90]
500        paddd   xmm3, xmmword ptr [rsp+0xF0]
501        paddd   xmm0, xmm5
502        paddd   xmm1, xmm6
503        paddd   xmm2, xmm7
504        paddd   xmm3, xmm4
505        pxor    xmm15, xmm0
506        pxor    xmm12, xmm1
507        pxor    xmm13, xmm2
508        pxor    xmm14, xmm3
509        pshuflw xmm15, xmm15, 0xB1
510        pshufhw xmm15, xmm15, 0xB1
511        pshuflw xmm12, xmm12, 0xB1
512        pshufhw xmm12, xmm12, 0xB1
513        pshuflw xmm13, xmm13, 0xB1
514        pshufhw xmm13, xmm13, 0xB1
515        pshuflw xmm14, xmm14, 0xB1
516        pshufhw xmm14, xmm14, 0xB1
517        paddd   xmm10, xmm15
518        paddd   xmm11, xmm12
519        movdqa  xmm8, xmmword ptr [rsp+0x100]
520        paddd   xmm8, xmm13
521        paddd   xmm9, xmm14
522        pxor    xmm5, xmm10
523        pxor    xmm6, xmm11
524        pxor    xmm7, xmm8
525        pxor    xmm4, xmm9
526        movdqa  xmmword ptr [rsp+0x100], xmm8
527        movdqa  xmm8, xmm5
528        psrld   xmm8, 12
529        pslld   xmm5, 20
530        por     xmm5, xmm8
531        movdqa  xmm8, xmm6
532        psrld   xmm8, 12
533        pslld   xmm6, 20
534        por     xmm6, xmm8
535        movdqa  xmm8, xmm7
536        psrld   xmm8, 12
537        pslld   xmm7, 20
538        por     xmm7, xmm8
539        movdqa  xmm8, xmm4
540        psrld   xmm8, 12
541        pslld   xmm4, 20
542        por     xmm4, xmm8
543        paddd   xmm0, xmmword ptr [rsp+0xB0]
544        paddd   xmm1, xmmword ptr [rsp+0x50]
545        paddd   xmm2, xmmword ptr [rsp+0xE0]
546        paddd   xmm3, xmmword ptr [rsp+0x80]
547        paddd   xmm0, xmm5
548        paddd   xmm1, xmm6
549        paddd   xmm2, xmm7
550        paddd   xmm3, xmm4
551        pxor    xmm15, xmm0
552        pxor    xmm12, xmm1
553        pxor    xmm13, xmm2
554        pxor    xmm14, xmm3
555        movdqa  xmm8, xmm15
556        psrld   xmm15, 8
557        pslld   xmm8, 24
558        pxor    xmm15, xmm8
559        movdqa  xmm8, xmm12
560        psrld   xmm12, 8
561        pslld   xmm8, 24
562        pxor    xmm12, xmm8
563        movdqa  xmm8, xmm13
564        psrld   xmm13, 8
565        pslld   xmm8, 24
566        pxor    xmm13, xmm8
567        movdqa  xmm8, xmm14
568        psrld   xmm14, 8
569        pslld   xmm8, 24
570        pxor    xmm14, xmm8
571        paddd   xmm10, xmm15
572        paddd   xmm11, xmm12
573        movdqa  xmm8, xmmword ptr [rsp+0x100]
574        paddd   xmm8, xmm13
575        paddd   xmm9, xmm14
576        pxor    xmm5, xmm10
577        pxor    xmm6, xmm11
578        pxor    xmm7, xmm8
579        pxor    xmm4, xmm9
580        movdqa  xmmword ptr [rsp+0x100], xmm8
581        movdqa  xmm8, xmm5
582        psrld   xmm8, 7
583        pslld   xmm5, 25
584        por     xmm5, xmm8
585        movdqa  xmm8, xmm6
586        psrld   xmm8, 7
587        pslld   xmm6, 25
588        por     xmm6, xmm8
589        movdqa  xmm8, xmm7
590        psrld   xmm8, 7
591        pslld   xmm7, 25
592        por     xmm7, xmm8
593        movdqa  xmm8, xmm4
594        psrld   xmm8, 7
595        pslld   xmm4, 25
596        por     xmm4, xmm8
597        paddd   xmm0, xmmword ptr [rsp+0x30]
598        paddd   xmm1, xmmword ptr [rsp+0xA0]
599        paddd   xmm2, xmmword ptr [rsp+0xD0]
600        paddd   xmm3, xmmword ptr [rsp+0x70]
601        paddd   xmm0, xmm4
602        paddd   xmm1, xmm5
603        paddd   xmm2, xmm6
604        paddd   xmm3, xmm7
605        pxor    xmm12, xmm0
606        pxor    xmm13, xmm1
607        pxor    xmm14, xmm2
608        pxor    xmm15, xmm3
609        pshuflw xmm12, xmm12, 0xB1
610        pshufhw xmm12, xmm12, 0xB1
611        pshuflw xmm13, xmm13, 0xB1
612        pshufhw xmm13, xmm13, 0xB1
613        pshuflw xmm14, xmm14, 0xB1
614        pshufhw xmm14, xmm14, 0xB1
615        pshuflw xmm15, xmm15, 0xB1
616        pshufhw xmm15, xmm15, 0xB1
617        movdqa  xmm8, xmmword ptr [rsp+0x100]
618        paddd   xmm8, xmm12
619        paddd   xmm9, xmm13
620        paddd   xmm10, xmm14
621        paddd   xmm11, xmm15
622        pxor    xmm4, xmm8
623        pxor    xmm5, xmm9
624        pxor    xmm6, xmm10
625        pxor    xmm7, xmm11
626        movdqa  xmmword ptr [rsp+0x100], xmm8
627        movdqa  xmm8, xmm4
628        psrld   xmm8, 12
629        pslld   xmm4, 20
630        por     xmm4, xmm8
631        movdqa  xmm8, xmm5
632        psrld   xmm8, 12
633        pslld   xmm5, 20
634        por     xmm5, xmm8
635        movdqa  xmm8, xmm6
636        psrld   xmm8, 12
637        pslld   xmm6, 20
638        por     xmm6, xmm8
639        movdqa  xmm8, xmm7
640        psrld   xmm8, 12
641        pslld   xmm7, 20
642        por     xmm7, xmm8
643        paddd   xmm0, xmmword ptr [rsp+0x40]
644        paddd   xmm1, xmmword ptr [rsp+0xC0]
645        paddd   xmm2, xmmword ptr [rsp+0x20]
646        paddd   xmm3, xmmword ptr [rsp+0xE0]
647        paddd   xmm0, xmm4
648        paddd   xmm1, xmm5
649        paddd   xmm2, xmm6
650        paddd   xmm3, xmm7
651        pxor    xmm12, xmm0
652        pxor    xmm13, xmm1
653        pxor    xmm14, xmm2
654        pxor    xmm15, xmm3
655        movdqa  xmm8, xmm12
656        psrld   xmm12, 8
657        pslld   xmm8, 24
658        pxor    xmm12, xmm8
659        movdqa  xmm8, xmm13
660        psrld   xmm13, 8
661        pslld   xmm8, 24
662        pxor    xmm13, xmm8
663        movdqa  xmm8, xmm14
664        psrld   xmm14, 8
665        pslld   xmm8, 24
666        pxor    xmm14, xmm8
667        movdqa  xmm8, xmm15
668        psrld   xmm15, 8
669        pslld   xmm8, 24
670        pxor    xmm15, xmm8
671        movdqa  xmm8, xmmword ptr [rsp+0x100]
672        paddd   xmm8, xmm12
673        paddd   xmm9, xmm13
674        paddd   xmm10, xmm14
675        paddd   xmm11, xmm15
676        pxor    xmm4, xmm8
677        pxor    xmm5, xmm9
678        pxor    xmm6, xmm10
679        pxor    xmm7, xmm11
680        movdqa  xmmword ptr [rsp+0x100], xmm8
681        movdqa  xmm8, xmm4
682        psrld   xmm8, 7
683        pslld   xmm4, 25
684        por     xmm4, xmm8
685        movdqa  xmm8, xmm5
686        psrld   xmm8, 7
687        pslld   xmm5, 25
688        por     xmm5, xmm8
689        movdqa  xmm8, xmm6
690        psrld   xmm8, 7
691        pslld   xmm6, 25
692        por     xmm6, xmm8
693        movdqa  xmm8, xmm7
694        psrld   xmm8, 7
695        pslld   xmm7, 25
696        por     xmm7, xmm8
697        paddd   xmm0, xmmword ptr [rsp+0x60]
698        paddd   xmm1, xmmword ptr [rsp+0x90]
699        paddd   xmm2, xmmword ptr [rsp+0xB0]
700        paddd   xmm3, xmmword ptr [rsp+0x80]
701        paddd   xmm0, xmm5
702        paddd   xmm1, xmm6
703        paddd   xmm2, xmm7
704        paddd   xmm3, xmm4
705        pxor    xmm15, xmm0
706        pxor    xmm12, xmm1
707        pxor    xmm13, xmm2
708        pxor    xmm14, xmm3
709        pshuflw xmm15, xmm15, 0xB1
710        pshufhw xmm15, xmm15, 0xB1
711        pshuflw xmm12, xmm12, 0xB1
712        pshufhw xmm12, xmm12, 0xB1
713        pshuflw xmm13, xmm13, 0xB1
714        pshufhw xmm13, xmm13, 0xB1
715        pshuflw xmm14, xmm14, 0xB1
716        pshufhw xmm14, xmm14, 0xB1
717        paddd   xmm10, xmm15
718        paddd   xmm11, xmm12
719        movdqa  xmm8, xmmword ptr [rsp+0x100]
720        paddd   xmm8, xmm13
721        paddd   xmm9, xmm14
722        pxor    xmm5, xmm10
723        pxor    xmm6, xmm11
724        pxor    xmm7, xmm8
725        pxor    xmm4, xmm9
726        movdqa  xmmword ptr [rsp+0x100], xmm8
727        movdqa  xmm8, xmm5
728        psrld   xmm8, 12
729        pslld   xmm5, 20
730        por     xmm5, xmm8
731        movdqa  xmm8, xmm6
732        psrld   xmm8, 12
733        pslld   xmm6, 20
734        por     xmm6, xmm8
735        movdqa  xmm8, xmm7
736        psrld   xmm8, 12
737        pslld   xmm7, 20
738        por     xmm7, xmm8
739        movdqa  xmm8, xmm4
740        psrld   xmm8, 12
741        pslld   xmm4, 20
742        por     xmm4, xmm8
743        paddd   xmm0, xmmword ptr [rsp+0x50]
744        paddd   xmm1, xmmword ptr [rsp]
745        paddd   xmm2, xmmword ptr [rsp+0xF0]
746        paddd   xmm3, xmmword ptr [rsp+0x10]
747        paddd   xmm0, xmm5
748        paddd   xmm1, xmm6
749        paddd   xmm2, xmm7
750        paddd   xmm3, xmm4
751        pxor    xmm15, xmm0
752        pxor    xmm12, xmm1
753        pxor    xmm13, xmm2
754        pxor    xmm14, xmm3
755        movdqa  xmm8, xmm15
756        psrld   xmm15, 8
757        pslld   xmm8, 24
758        pxor    xmm15, xmm8
759        movdqa  xmm8, xmm12
760        psrld   xmm12, 8
761        pslld   xmm8, 24
762        pxor    xmm12, xmm8
763        movdqa  xmm8, xmm13
764        psrld   xmm13, 8
765        pslld   xmm8, 24
766        pxor    xmm13, xmm8
767        movdqa  xmm8, xmm14
768        psrld   xmm14, 8
769        pslld   xmm8, 24
770        pxor    xmm14, xmm8
771        paddd   xmm10, xmm15
772        paddd   xmm11, xmm12
773        movdqa  xmm8, xmmword ptr [rsp+0x100]
774        paddd   xmm8, xmm13
775        paddd   xmm9, xmm14
776        pxor    xmm5, xmm10
777        pxor    xmm6, xmm11
778        pxor    xmm7, xmm8
779        pxor    xmm4, xmm9
780        movdqa  xmmword ptr [rsp+0x100], xmm8
781        movdqa  xmm8, xmm5
782        psrld   xmm8, 7
783        pslld   xmm5, 25
784        por     xmm5, xmm8
785        movdqa  xmm8, xmm6
786        psrld   xmm8, 7
787        pslld   xmm6, 25
788        por     xmm6, xmm8
789        movdqa  xmm8, xmm7
790        psrld   xmm8, 7
791        pslld   xmm7, 25
792        por     xmm7, xmm8
793        movdqa  xmm8, xmm4
794        psrld   xmm8, 7
795        pslld   xmm4, 25
796        por     xmm4, xmm8
797        paddd   xmm0, xmmword ptr [rsp+0xA0]
798        paddd   xmm1, xmmword ptr [rsp+0xC0]
799        paddd   xmm2, xmmword ptr [rsp+0xE0]
800        paddd   xmm3, xmmword ptr [rsp+0xD0]
801        paddd   xmm0, xmm4
802        paddd   xmm1, xmm5
803        paddd   xmm2, xmm6
804        paddd   xmm3, xmm7
805        pxor    xmm12, xmm0
806        pxor    xmm13, xmm1
807        pxor    xmm14, xmm2
808        pxor    xmm15, xmm3
809        pshuflw xmm12, xmm12, 0xB1
810        pshufhw xmm12, xmm12, 0xB1
811        pshuflw xmm13, xmm13, 0xB1
812        pshufhw xmm13, xmm13, 0xB1
813        pshuflw xmm14, xmm14, 0xB1
814        pshufhw xmm14, xmm14, 0xB1
815        pshuflw xmm15, xmm15, 0xB1
816        pshufhw xmm15, xmm15, 0xB1
817        movdqa  xmm8, xmmword ptr [rsp+0x100]
818        paddd   xmm8, xmm12
819        paddd   xmm9, xmm13
820        paddd   xmm10, xmm14
821        paddd   xmm11, xmm15
822        pxor    xmm4, xmm8
823        pxor    xmm5, xmm9
824        pxor    xmm6, xmm10
825        pxor    xmm7, xmm11
826        movdqa  xmmword ptr [rsp+0x100], xmm8
827        movdqa  xmm8, xmm4
828        psrld   xmm8, 12
829        pslld   xmm4, 20
830        por     xmm4, xmm8
831        movdqa  xmm8, xmm5
832        psrld   xmm8, 12
833        pslld   xmm5, 20
834        por     xmm5, xmm8
835        movdqa  xmm8, xmm6
836        psrld   xmm8, 12
837        pslld   xmm6, 20
838        por     xmm6, xmm8
839        movdqa  xmm8, xmm7
840        psrld   xmm8, 12
841        pslld   xmm7, 20
842        por     xmm7, xmm8
843        paddd   xmm0, xmmword ptr [rsp+0x70]
844        paddd   xmm1, xmmword ptr [rsp+0x90]
845        paddd   xmm2, xmmword ptr [rsp+0x30]
846        paddd   xmm3, xmmword ptr [rsp+0xF0]
847        paddd   xmm0, xmm4
848        paddd   xmm1, xmm5
849        paddd   xmm2, xmm6
850        paddd   xmm3, xmm7
851        pxor    xmm12, xmm0
852        pxor    xmm13, xmm1
853        pxor    xmm14, xmm2
854        pxor    xmm15, xmm3
855        movdqa  xmm8, xmm12
856        psrld   xmm12, 8
857        pslld   xmm8, 24
858        pxor    xmm12, xmm8
859        movdqa  xmm8, xmm13
860        psrld   xmm13, 8
861        pslld   xmm8, 24
862        pxor    xmm13, xmm8
863        movdqa  xmm8, xmm14
864        psrld   xmm14, 8
865        pslld   xmm8, 24
866        pxor    xmm14, xmm8
867        movdqa  xmm8, xmm15
868        psrld   xmm15, 8
869        pslld   xmm8, 24
870        pxor    xmm15, xmm8
871        movdqa  xmm8, xmmword ptr [rsp+0x100]
872        paddd   xmm8, xmm12
873        paddd   xmm9, xmm13
874        paddd   xmm10, xmm14
875        paddd   xmm11, xmm15
876        pxor    xmm4, xmm8
877        pxor    xmm5, xmm9
878        pxor    xmm6, xmm10
879        pxor    xmm7, xmm11
880        movdqa  xmmword ptr [rsp+0x100], xmm8
881        movdqa  xmm8, xmm4
882        psrld   xmm8, 7
883        pslld   xmm4, 25
884        por     xmm4, xmm8
885        movdqa  xmm8, xmm5
886        psrld   xmm8, 7
887        pslld   xmm5, 25
888        por     xmm5, xmm8
889        movdqa  xmm8, xmm6
890        psrld   xmm8, 7
891        pslld   xmm6, 25
892        por     xmm6, xmm8
893        movdqa  xmm8, xmm7
894        psrld   xmm8, 7
895        pslld   xmm7, 25
896        por     xmm7, xmm8
897        paddd   xmm0, xmmword ptr [rsp+0x40]
898        paddd   xmm1, xmmword ptr [rsp+0xB0]
899        paddd   xmm2, xmmword ptr [rsp+0x50]
900        paddd   xmm3, xmmword ptr [rsp+0x10]
901        paddd   xmm0, xmm5
902        paddd   xmm1, xmm6
903        paddd   xmm2, xmm7
904        paddd   xmm3, xmm4
905        pxor    xmm15, xmm0
906        pxor    xmm12, xmm1
907        pxor    xmm13, xmm2
908        pxor    xmm14, xmm3
909        pshuflw xmm15, xmm15, 0xB1
910        pshufhw xmm15, xmm15, 0xB1
911        pshuflw xmm12, xmm12, 0xB1
912        pshufhw xmm12, xmm12, 0xB1
913        pshuflw xmm13, xmm13, 0xB1
914        pshufhw xmm13, xmm13, 0xB1
915        pshuflw xmm14, xmm14, 0xB1
916        pshufhw xmm14, xmm14, 0xB1
917        paddd   xmm10, xmm15
918        paddd   xmm11, xmm12
919        movdqa  xmm8, xmmword ptr [rsp+0x100]
920        paddd   xmm8, xmm13
921        paddd   xmm9, xmm14
922        pxor    xmm5, xmm10
923        pxor    xmm6, xmm11
924        pxor    xmm7, xmm8
925        pxor    xmm4, xmm9
926        movdqa  xmmword ptr [rsp+0x100], xmm8
927        movdqa  xmm8, xmm5
928        psrld   xmm8, 12
929        pslld   xmm5, 20
930        por     xmm5, xmm8
931        movdqa  xmm8, xmm6
932        psrld   xmm8, 12
933        pslld   xmm6, 20
934        por     xmm6, xmm8
935        movdqa  xmm8, xmm7
936        psrld   xmm8, 12
937        pslld   xmm7, 20
938        por     xmm7, xmm8
939        movdqa  xmm8, xmm4
940        psrld   xmm8, 12
941        pslld   xmm4, 20
942        por     xmm4, xmm8
943        paddd   xmm0, xmmword ptr [rsp]
944        paddd   xmm1, xmmword ptr [rsp+0x20]
945        paddd   xmm2, xmmword ptr [rsp+0x80]
946        paddd   xmm3, xmmword ptr [rsp+0x60]
947        paddd   xmm0, xmm5
948        paddd   xmm1, xmm6
949        paddd   xmm2, xmm7
950        paddd   xmm3, xmm4
951        pxor    xmm15, xmm0
952        pxor    xmm12, xmm1
953        pxor    xmm13, xmm2
954        pxor    xmm14, xmm3
955        movdqa  xmm8, xmm15
956        psrld   xmm15, 8
957        pslld   xmm8, 24
958        pxor    xmm15, xmm8
959        movdqa  xmm8, xmm12
960        psrld   xmm12, 8
961        pslld   xmm8, 24
962        pxor    xmm12, xmm8
963        movdqa  xmm8, xmm13
964        psrld   xmm13, 8
965        pslld   xmm8, 24
966        pxor    xmm13, xmm8
967        movdqa  xmm8, xmm14
968        psrld   xmm14, 8
969        pslld   xmm8, 24
970        pxor    xmm14, xmm8
971        paddd   xmm10, xmm15
972        paddd   xmm11, xmm12
973        movdqa  xmm8, xmmword ptr [rsp+0x100]
974        paddd   xmm8, xmm13
975        paddd   xmm9, xmm14
976        pxor    xmm5, xmm10
977        pxor    xmm6, xmm11
978        pxor    xmm7, xmm8
979        pxor    xmm4, xmm9
980        movdqa  xmmword ptr [rsp+0x100], xmm8
981        movdqa  xmm8, xmm5
982        psrld   xmm8, 7
983        pslld   xmm5, 25
984        por     xmm5, xmm8
985        movdqa  xmm8, xmm6
986        psrld   xmm8, 7
987        pslld   xmm6, 25
988        por     xmm6, xmm8
989        movdqa  xmm8, xmm7
990        psrld   xmm8, 7
991        pslld   xmm7, 25
992        por     xmm7, xmm8
993        movdqa  xmm8, xmm4
994        psrld   xmm8, 7
995        pslld   xmm4, 25
996        por     xmm4, xmm8
997        paddd   xmm0, xmmword ptr [rsp+0xC0]
998        paddd   xmm1, xmmword ptr [rsp+0x90]
999        paddd   xmm2, xmmword ptr [rsp+0xF0]
1000        paddd   xmm3, xmmword ptr [rsp+0xE0]
1001        paddd   xmm0, xmm4
1002        paddd   xmm1, xmm5
1003        paddd   xmm2, xmm6
1004        paddd   xmm3, xmm7
1005        pxor    xmm12, xmm0
1006        pxor    xmm13, xmm1
1007        pxor    xmm14, xmm2
1008        pxor    xmm15, xmm3
1009        pshuflw xmm12, xmm12, 0xB1
1010        pshufhw xmm12, xmm12, 0xB1
1011        pshuflw xmm13, xmm13, 0xB1
1012        pshufhw xmm13, xmm13, 0xB1
1013        pshuflw xmm14, xmm14, 0xB1
1014        pshufhw xmm14, xmm14, 0xB1
1015        pshuflw xmm15, xmm15, 0xB1
1016        pshufhw xmm15, xmm15, 0xB1
1017        movdqa  xmm8, xmmword ptr [rsp+0x100]
1018        paddd   xmm8, xmm12
1019        paddd   xmm9, xmm13
1020        paddd   xmm10, xmm14
1021        paddd   xmm11, xmm15
1022        pxor    xmm4, xmm8
1023        pxor    xmm5, xmm9
1024        pxor    xmm6, xmm10
1025        pxor    xmm7, xmm11
1026        movdqa  xmmword ptr [rsp+0x100], xmm8
1027        movdqa  xmm8, xmm4
1028        psrld   xmm8, 12
1029        pslld   xmm4, 20
1030        por     xmm4, xmm8
1031        movdqa  xmm8, xmm5
1032        psrld   xmm8, 12
1033        pslld   xmm5, 20
1034        por     xmm5, xmm8
1035        movdqa  xmm8, xmm6
1036        psrld   xmm8, 12
1037        pslld   xmm6, 20
1038        por     xmm6, xmm8
1039        movdqa  xmm8, xmm7
1040        psrld   xmm8, 12
1041        pslld   xmm7, 20
1042        por     xmm7, xmm8
1043        paddd   xmm0, xmmword ptr [rsp+0xD0]
1044        paddd   xmm1, xmmword ptr [rsp+0xB0]
1045        paddd   xmm2, xmmword ptr [rsp+0xA0]
1046        paddd   xmm3, xmmword ptr [rsp+0x80]
1047        paddd   xmm0, xmm4
1048        paddd   xmm1, xmm5
1049        paddd   xmm2, xmm6
1050        paddd   xmm3, xmm7
1051        pxor    xmm12, xmm0
1052        pxor    xmm13, xmm1
1053        pxor    xmm14, xmm2
1054        pxor    xmm15, xmm3
1055        movdqa  xmm8, xmm12
1056        psrld   xmm12, 8
1057        pslld   xmm8, 24
1058        pxor    xmm12, xmm8
1059        movdqa  xmm8, xmm13
1060        psrld   xmm13, 8
1061        pslld   xmm8, 24
1062        pxor    xmm13, xmm8
1063        movdqa  xmm8, xmm14
1064        psrld   xmm14, 8
1065        pslld   xmm8, 24
1066        pxor    xmm14, xmm8
1067        movdqa  xmm8, xmm15
1068        psrld   xmm15, 8
1069        pslld   xmm8, 24
1070        pxor    xmm15, xmm8
1071        movdqa  xmm8, xmmword ptr [rsp+0x100]
1072        paddd   xmm8, xmm12
1073        paddd   xmm9, xmm13
1074        paddd   xmm10, xmm14
1075        paddd   xmm11, xmm15
1076        pxor    xmm4, xmm8
1077        pxor    xmm5, xmm9
1078        pxor    xmm6, xmm10
1079        pxor    xmm7, xmm11
1080        movdqa  xmmword ptr [rsp+0x100], xmm8
1081        movdqa  xmm8, xmm4
1082        psrld   xmm8, 7
1083        pslld   xmm4, 25
1084        por     xmm4, xmm8
1085        movdqa  xmm8, xmm5
1086        psrld   xmm8, 7
1087        pslld   xmm5, 25
1088        por     xmm5, xmm8
1089        movdqa  xmm8, xmm6
1090        psrld   xmm8, 7
1091        pslld   xmm6, 25
1092        por     xmm6, xmm8
1093        movdqa  xmm8, xmm7
1094        psrld   xmm8, 7
1095        pslld   xmm7, 25
1096        por     xmm7, xmm8
1097        paddd   xmm0, xmmword ptr [rsp+0x70]
1098        paddd   xmm1, xmmword ptr [rsp+0x50]
1099        paddd   xmm2, xmmword ptr [rsp]
1100        paddd   xmm3, xmmword ptr [rsp+0x60]
1101        paddd   xmm0, xmm5
1102        paddd   xmm1, xmm6
1103        paddd   xmm2, xmm7
1104        paddd   xmm3, xmm4
1105        pxor    xmm15, xmm0
1106        pxor    xmm12, xmm1
1107        pxor    xmm13, xmm2
1108        pxor    xmm14, xmm3
1109        pshuflw xmm15, xmm15, 0xB1
1110        pshufhw xmm15, xmm15, 0xB1
1111        pshuflw xmm12, xmm12, 0xB1
1112        pshufhw xmm12, xmm12, 0xB1
1113        pshuflw xmm13, xmm13, 0xB1
1114        pshufhw xmm13, xmm13, 0xB1
1115        pshuflw xmm14, xmm14, 0xB1
1116        pshufhw xmm14, xmm14, 0xB1
1117        paddd   xmm10, xmm15
1118        paddd   xmm11, xmm12
1119        movdqa  xmm8, xmmword ptr [rsp+0x100]
1120        paddd   xmm8, xmm13
1121        paddd   xmm9, xmm14
1122        pxor    xmm5, xmm10
1123        pxor    xmm6, xmm11
1124        pxor    xmm7, xmm8
1125        pxor    xmm4, xmm9
1126        movdqa  xmmword ptr [rsp+0x100], xmm8
1127        movdqa  xmm8, xmm5
1128        psrld   xmm8, 12
1129        pslld   xmm5, 20
1130        por     xmm5, xmm8
1131        movdqa  xmm8, xmm6
1132        psrld   xmm8, 12
1133        pslld   xmm6, 20
1134        por     xmm6, xmm8
1135        movdqa  xmm8, xmm7
1136        psrld   xmm8, 12
1137        pslld   xmm7, 20
1138        por     xmm7, xmm8
1139        movdqa  xmm8, xmm4
1140        psrld   xmm8, 12
1141        pslld   xmm4, 20
1142        por     xmm4, xmm8
1143        paddd   xmm0, xmmword ptr [rsp+0x20]
1144        paddd   xmm1, xmmword ptr [rsp+0x30]
1145        paddd   xmm2, xmmword ptr [rsp+0x10]
1146        paddd   xmm3, xmmword ptr [rsp+0x40]
1147        paddd   xmm0, xmm5
1148        paddd   xmm1, xmm6
1149        paddd   xmm2, xmm7
1150        paddd   xmm3, xmm4
1151        pxor    xmm15, xmm0
1152        pxor    xmm12, xmm1
1153        pxor    xmm13, xmm2
1154        pxor    xmm14, xmm3
1155        movdqa  xmm8, xmm15
1156        psrld   xmm15, 8
1157        pslld   xmm8, 24
1158        pxor    xmm15, xmm8
1159        movdqa  xmm8, xmm12
1160        psrld   xmm12, 8
1161        pslld   xmm8, 24
1162        pxor    xmm12, xmm8
1163        movdqa  xmm8, xmm13
1164        psrld   xmm13, 8
1165        pslld   xmm8, 24
1166        pxor    xmm13, xmm8
1167        movdqa  xmm8, xmm14
1168        psrld   xmm14, 8
1169        pslld   xmm8, 24
1170        pxor    xmm14, xmm8
1171        paddd   xmm10, xmm15
1172        paddd   xmm11, xmm12
1173        movdqa  xmm8, xmmword ptr [rsp+0x100]
1174        paddd   xmm8, xmm13
1175        paddd   xmm9, xmm14
1176        pxor    xmm5, xmm10
1177        pxor    xmm6, xmm11
1178        pxor    xmm7, xmm8
1179        pxor    xmm4, xmm9
1180        movdqa  xmmword ptr [rsp+0x100], xmm8
1181        movdqa  xmm8, xmm5
1182        psrld   xmm8, 7
1183        pslld   xmm5, 25
1184        por     xmm5, xmm8
1185        movdqa  xmm8, xmm6
1186        psrld   xmm8, 7
1187        pslld   xmm6, 25
1188        por     xmm6, xmm8
1189        movdqa  xmm8, xmm7
1190        psrld   xmm8, 7
1191        pslld   xmm7, 25
1192        por     xmm7, xmm8
1193        movdqa  xmm8, xmm4
1194        psrld   xmm8, 7
1195        pslld   xmm4, 25
1196        por     xmm4, xmm8
1197        paddd   xmm0, xmmword ptr [rsp+0x90]
1198        paddd   xmm1, xmmword ptr [rsp+0xB0]
1199        paddd   xmm2, xmmword ptr [rsp+0x80]
1200        paddd   xmm3, xmmword ptr [rsp+0xF0]
1201        paddd   xmm0, xmm4
1202        paddd   xmm1, xmm5
1203        paddd   xmm2, xmm6
1204        paddd   xmm3, xmm7
1205        pxor    xmm12, xmm0
1206        pxor    xmm13, xmm1
1207        pxor    xmm14, xmm2
1208        pxor    xmm15, xmm3
1209        pshuflw xmm12, xmm12, 0xB1
1210        pshufhw xmm12, xmm12, 0xB1
1211        pshuflw xmm13, xmm13, 0xB1
1212        pshufhw xmm13, xmm13, 0xB1
1213        pshuflw xmm14, xmm14, 0xB1
1214        pshufhw xmm14, xmm14, 0xB1
1215        pshuflw xmm15, xmm15, 0xB1
1216        pshufhw xmm15, xmm15, 0xB1
1217        movdqa  xmm8, xmmword ptr [rsp+0x100]
1218        paddd   xmm8, xmm12
1219        paddd   xmm9, xmm13
1220        paddd   xmm10, xmm14
1221        paddd   xmm11, xmm15
1222        pxor    xmm4, xmm8
1223        pxor    xmm5, xmm9
1224        pxor    xmm6, xmm10
1225        pxor    xmm7, xmm11
1226        movdqa  xmmword ptr [rsp+0x100], xmm8
1227        movdqa  xmm8, xmm4
1228        psrld   xmm8, 12
1229        pslld   xmm4, 20
1230        por     xmm4, xmm8
1231        movdqa  xmm8, xmm5
1232        psrld   xmm8, 12
1233        pslld   xmm5, 20
1234        por     xmm5, xmm8
1235        movdqa  xmm8, xmm6
1236        psrld   xmm8, 12
1237        pslld   xmm6, 20
1238        por     xmm6, xmm8
1239        movdqa  xmm8, xmm7
1240        psrld   xmm8, 12
1241        pslld   xmm7, 20
1242        por     xmm7, xmm8
1243        paddd   xmm0, xmmword ptr [rsp+0xE0]
1244        paddd   xmm1, xmmword ptr [rsp+0x50]
1245        paddd   xmm2, xmmword ptr [rsp+0xC0]
1246        paddd   xmm3, xmmword ptr [rsp+0x10]
1247        paddd   xmm0, xmm4
1248        paddd   xmm1, xmm5
1249        paddd   xmm2, xmm6
1250        paddd   xmm3, xmm7
1251        pxor    xmm12, xmm0
1252        pxor    xmm13, xmm1
1253        pxor    xmm14, xmm2
1254        pxor    xmm15, xmm3
1255        movdqa  xmm8, xmm12
1256        psrld   xmm12, 8
1257        pslld   xmm8, 24
1258        pxor    xmm12, xmm8
1259        movdqa  xmm8, xmm13
1260        psrld   xmm13, 8
1261        pslld   xmm8, 24
1262        pxor    xmm13, xmm8
1263        movdqa  xmm8, xmm14
1264        psrld   xmm14, 8
1265        pslld   xmm8, 24
1266        pxor    xmm14, xmm8
1267        movdqa  xmm8, xmm15
1268        psrld   xmm15, 8
1269        pslld   xmm8, 24
1270        pxor    xmm15, xmm8
1271        movdqa  xmm8, xmmword ptr [rsp+0x100]
1272        paddd   xmm8, xmm12
1273        paddd   xmm9, xmm13
1274        paddd   xmm10, xmm14
1275        paddd   xmm11, xmm15
1276        pxor    xmm4, xmm8
1277        pxor    xmm5, xmm9
1278        pxor    xmm6, xmm10
1279        pxor    xmm7, xmm11
1280        movdqa  xmmword ptr [rsp+0x100], xmm8
1281        movdqa  xmm8, xmm4
1282        psrld   xmm8, 7
1283        pslld   xmm4, 25
1284        por     xmm4, xmm8
1285        movdqa  xmm8, xmm5
1286        psrld   xmm8, 7
1287        pslld   xmm5, 25
1288        por     xmm5, xmm8
1289        movdqa  xmm8, xmm6
1290        psrld   xmm8, 7
1291        pslld   xmm6, 25
1292        por     xmm6, xmm8
1293        movdqa  xmm8, xmm7
1294        psrld   xmm8, 7
1295        pslld   xmm7, 25
1296        por     xmm7, xmm8
1297        paddd   xmm0, xmmword ptr [rsp+0xD0]
1298        paddd   xmm1, xmmword ptr [rsp]
1299        paddd   xmm2, xmmword ptr [rsp+0x20]
1300        paddd   xmm3, xmmword ptr [rsp+0x40]
1301        paddd   xmm0, xmm5
1302        paddd   xmm1, xmm6
1303        paddd   xmm2, xmm7
1304        paddd   xmm3, xmm4
1305        pxor    xmm15, xmm0
1306        pxor    xmm12, xmm1
1307        pxor    xmm13, xmm2
1308        pxor    xmm14, xmm3
1309        pshuflw xmm15, xmm15, 0xB1
1310        pshufhw xmm15, xmm15, 0xB1
1311        pshuflw xmm12, xmm12, 0xB1
1312        pshufhw xmm12, xmm12, 0xB1
1313        pshuflw xmm13, xmm13, 0xB1
1314        pshufhw xmm13, xmm13, 0xB1
1315        pshuflw xmm14, xmm14, 0xB1
1316        pshufhw xmm14, xmm14, 0xB1
1317        paddd   xmm10, xmm15
1318        paddd   xmm11, xmm12
1319        movdqa  xmm8, xmmword ptr [rsp+0x100]
1320        paddd   xmm8, xmm13
1321        paddd   xmm9, xmm14
1322        pxor    xmm5, xmm10
1323        pxor    xmm6, xmm11
1324        pxor    xmm7, xmm8
1325        pxor    xmm4, xmm9
1326        movdqa  xmmword ptr [rsp+0x100], xmm8
1327        movdqa  xmm8, xmm5
1328        psrld   xmm8, 12
1329        pslld   xmm5, 20
1330        por     xmm5, xmm8
1331        movdqa  xmm8, xmm6
1332        psrld   xmm8, 12
1333        pslld   xmm6, 20
1334        por     xmm6, xmm8
1335        movdqa  xmm8, xmm7
1336        psrld   xmm8, 12
1337        pslld   xmm7, 20
1338        por     xmm7, xmm8
1339        movdqa  xmm8, xmm4
1340        psrld   xmm8, 12
1341        pslld   xmm4, 20
1342        por     xmm4, xmm8
1343        paddd   xmm0, xmmword ptr [rsp+0x30]
1344        paddd   xmm1, xmmword ptr [rsp+0xA0]
1345        paddd   xmm2, xmmword ptr [rsp+0x60]
1346        paddd   xmm3, xmmword ptr [rsp+0x70]
1347        paddd   xmm0, xmm5
1348        paddd   xmm1, xmm6
1349        paddd   xmm2, xmm7
1350        paddd   xmm3, xmm4
1351        pxor    xmm15, xmm0
1352        pxor    xmm12, xmm1
1353        pxor    xmm13, xmm2
1354        pxor    xmm14, xmm3
1355        movdqa  xmm8, xmm15
1356        psrld   xmm15, 8
1357        pslld   xmm8, 24
1358        pxor    xmm15, xmm8
1359        movdqa  xmm8, xmm12
1360        psrld   xmm12, 8
1361        pslld   xmm8, 24
1362        pxor    xmm12, xmm8
1363        movdqa  xmm8, xmm13
1364        psrld   xmm13, 8
1365        pslld   xmm8, 24
1366        pxor    xmm13, xmm8
1367        movdqa  xmm8, xmm14
1368        psrld   xmm14, 8
1369        pslld   xmm8, 24
1370        pxor    xmm14, xmm8
1371        paddd   xmm10, xmm15
1372        paddd   xmm11, xmm12
1373        movdqa  xmm8, xmmword ptr [rsp+0x100]
1374        paddd   xmm8, xmm13
1375        paddd   xmm9, xmm14
1376        pxor    xmm5, xmm10
1377        pxor    xmm6, xmm11
1378        pxor    xmm7, xmm8
1379        pxor    xmm4, xmm9
1380        movdqa  xmmword ptr [rsp+0x100], xmm8
1381        movdqa  xmm8, xmm5
1382        psrld   xmm8, 7
1383        pslld   xmm5, 25
1384        por     xmm5, xmm8
1385        movdqa  xmm8, xmm6
1386        psrld   xmm8, 7
1387        pslld   xmm6, 25
1388        por     xmm6, xmm8
1389        movdqa  xmm8, xmm7
1390        psrld   xmm8, 7
1391        pslld   xmm7, 25
1392        por     xmm7, xmm8
1393        movdqa  xmm8, xmm4
1394        psrld   xmm8, 7
1395        pslld   xmm4, 25
1396        por     xmm4, xmm8
1397        paddd   xmm0, xmmword ptr [rsp+0xB0]
1398        paddd   xmm1, xmmword ptr [rsp+0x50]
1399        paddd   xmm2, xmmword ptr [rsp+0x10]
1400        paddd   xmm3, xmmword ptr [rsp+0x80]
1401        paddd   xmm0, xmm4
1402        paddd   xmm1, xmm5
1403        paddd   xmm2, xmm6
1404        paddd   xmm3, xmm7
1405        pxor    xmm12, xmm0
1406        pxor    xmm13, xmm1
1407        pxor    xmm14, xmm2
1408        pxor    xmm15, xmm3
1409        pshuflw xmm12, xmm12, 0xB1
1410        pshufhw xmm12, xmm12, 0xB1
1411        pshuflw xmm13, xmm13, 0xB1
1412        pshufhw xmm13, xmm13, 0xB1
1413        pshuflw xmm14, xmm14, 0xB1
1414        pshufhw xmm14, xmm14, 0xB1
1415        pshuflw xmm15, xmm15, 0xB1
1416        pshufhw xmm15, xmm15, 0xB1
1417        movdqa  xmm8, xmmword ptr [rsp+0x100]
1418        paddd   xmm8, xmm12
1419        paddd   xmm9, xmm13
1420        paddd   xmm10, xmm14
1421        paddd   xmm11, xmm15
1422        pxor    xmm4, xmm8
1423        pxor    xmm5, xmm9
1424        pxor    xmm6, xmm10
1425        pxor    xmm7, xmm11
1426        movdqa  xmmword ptr [rsp+0x100], xmm8
1427        movdqa  xmm8, xmm4
1428        psrld   xmm8, 12
1429        pslld   xmm4, 20
1430        por     xmm4, xmm8
1431        movdqa  xmm8, xmm5
1432        psrld   xmm8, 12
1433        pslld   xmm5, 20
1434        por     xmm5, xmm8
1435        movdqa  xmm8, xmm6
1436        psrld   xmm8, 12
1437        pslld   xmm6, 20
1438        por     xmm6, xmm8
1439        movdqa  xmm8, xmm7
1440        psrld   xmm8, 12
1441        pslld   xmm7, 20
1442        por     xmm7, xmm8
1443        paddd   xmm0, xmmword ptr [rsp+0xF0]
1444        paddd   xmm1, xmmword ptr [rsp]
1445        paddd   xmm2, xmmword ptr [rsp+0x90]
1446        paddd   xmm3, xmmword ptr [rsp+0x60]
1447        paddd   xmm0, xmm4
1448        paddd   xmm1, xmm5
1449        paddd   xmm2, xmm6
1450        paddd   xmm3, xmm7
1451        pxor    xmm12, xmm0
1452        pxor    xmm13, xmm1
1453        pxor    xmm14, xmm2
1454        pxor    xmm15, xmm3
1455        movdqa  xmm8, xmm12
1456        psrld   xmm12, 8
1457        pslld   xmm8, 24
1458        pxor    xmm12, xmm8
1459        movdqa  xmm8, xmm13
1460        psrld   xmm13, 8
1461        pslld   xmm8, 24
1462        pxor    xmm13, xmm8
1463        movdqa  xmm8, xmm14
1464        psrld   xmm14, 8
1465        pslld   xmm8, 24
1466        pxor    xmm14, xmm8
1467        movdqa  xmm8, xmm15
1468        psrld   xmm15, 8
1469        pslld   xmm8, 24
1470        pxor    xmm15, xmm8
1471        movdqa  xmm8, xmmword ptr [rsp+0x100]
1472        paddd   xmm8, xmm12
1473        paddd   xmm9, xmm13
1474        paddd   xmm10, xmm14
1475        paddd   xmm11, xmm15
1476        pxor    xmm4, xmm8
1477        pxor    xmm5, xmm9
1478        pxor    xmm6, xmm10
1479        pxor    xmm7, xmm11
1480        movdqa  xmmword ptr [rsp+0x100], xmm8
1481        movdqa  xmm8, xmm4
1482        psrld   xmm8, 7
1483        pslld   xmm4, 25
1484        por     xmm4, xmm8
1485        movdqa  xmm8, xmm5
1486        psrld   xmm8, 7
1487        pslld   xmm5, 25
1488        por     xmm5, xmm8
1489        movdqa  xmm8, xmm6
1490        psrld   xmm8, 7
1491        pslld   xmm6, 25
1492        por     xmm6, xmm8
1493        movdqa  xmm8, xmm7
1494        psrld   xmm8, 7
1495        pslld   xmm7, 25
1496        por     xmm7, xmm8
1497        paddd   xmm0, xmmword ptr [rsp+0xE0]
1498        paddd   xmm1, xmmword ptr [rsp+0x20]
1499        paddd   xmm2, xmmword ptr [rsp+0x30]
1500        paddd   xmm3, xmmword ptr [rsp+0x70]
1501        paddd   xmm0, xmm5
1502        paddd   xmm1, xmm6
1503        paddd   xmm2, xmm7
1504        paddd   xmm3, xmm4
1505        pxor    xmm15, xmm0
1506        pxor    xmm12, xmm1
1507        pxor    xmm13, xmm2
1508        pxor    xmm14, xmm3
1509        pshuflw xmm15, xmm15, 0xB1
1510        pshufhw xmm15, xmm15, 0xB1
1511        pshuflw xmm12, xmm12, 0xB1
1512        pshufhw xmm12, xmm12, 0xB1
1513        pshuflw xmm13, xmm13, 0xB1
1514        pshufhw xmm13, xmm13, 0xB1
1515        pshuflw xmm14, xmm14, 0xB1
1516        pshufhw xmm14, xmm14, 0xB1
1517        paddd   xmm10, xmm15
1518        paddd   xmm11, xmm12
1519        movdqa  xmm8, xmmword ptr [rsp+0x100]
1520        paddd   xmm8, xmm13
1521        paddd   xmm9, xmm14
1522        pxor    xmm5, xmm10
1523        pxor    xmm6, xmm11
1524        pxor    xmm7, xmm8
1525        pxor    xmm4, xmm9
1526        movdqa  xmmword ptr [rsp+0x100], xmm8
1527        movdqa  xmm8, xmm5
1528        psrld   xmm8, 12
1529        pslld   xmm5, 20
1530        por     xmm5, xmm8
1531        movdqa  xmm8, xmm6
1532        psrld   xmm8, 12
1533        pslld   xmm6, 20
1534        por     xmm6, xmm8
1535        movdqa  xmm8, xmm7
1536        psrld   xmm8, 12
1537        pslld   xmm7, 20
1538        por     xmm7, xmm8
1539        movdqa  xmm8, xmm4
1540        psrld   xmm8, 12
1541        pslld   xmm4, 20
1542        por     xmm4, xmm8
1543        paddd   xmm0, xmmword ptr [rsp+0xA0]
1544        paddd   xmm1, xmmword ptr [rsp+0xC0]
1545        paddd   xmm2, xmmword ptr [rsp+0x40]
1546        paddd   xmm3, xmmword ptr [rsp+0xD0]
1547        paddd   xmm0, xmm5
1548        paddd   xmm1, xmm6
1549        paddd   xmm2, xmm7
1550        paddd   xmm3, xmm4
1551        pxor    xmm15, xmm0
1552        pxor    xmm12, xmm1
1553        pxor    xmm13, xmm2
1554        pxor    xmm14, xmm3
1555        movdqa  xmm8, xmm15
1556        psrld   xmm15, 8
1557        pslld   xmm8, 24
1558        pxor    xmm15, xmm8
1559        movdqa  xmm8, xmm12
1560        psrld   xmm12, 8
1561        pslld   xmm8, 24
1562        pxor    xmm12, xmm8
1563        movdqa  xmm8, xmm13
1564        psrld   xmm13, 8
1565        pslld   xmm8, 24
1566        pxor    xmm13, xmm8
1567        movdqa  xmm8, xmm14
1568        psrld   xmm14, 8
1569        pslld   xmm8, 24
1570        pxor    xmm14, xmm8
1571        paddd   xmm10, xmm15
1572        paddd   xmm11, xmm12
1573        movdqa  xmm8, xmmword ptr [rsp+0x100]
1574        paddd   xmm8, xmm13
1575        paddd   xmm9, xmm14
1576        pxor    xmm5, xmm10
1577        pxor    xmm6, xmm11
1578        pxor    xmm7, xmm8
1579        pxor    xmm4, xmm9
1580        pxor    xmm0, xmm8
1581        pxor    xmm1, xmm9
1582        pxor    xmm2, xmm10
1583        pxor    xmm3, xmm11
1584        movdqa  xmm8, xmm5
1585        psrld   xmm8, 7
1586        pslld   xmm5, 25
1587        por     xmm5, xmm8
1588        movdqa  xmm8, xmm6
1589        psrld   xmm8, 7
1590        pslld   xmm6, 25
1591        por     xmm6, xmm8
1592        movdqa  xmm8, xmm7
1593        psrld   xmm8, 7
1594        pslld   xmm7, 25
1595        por     xmm7, xmm8
1596        movdqa  xmm8, xmm4
1597        psrld   xmm8, 7
1598        pslld   xmm4, 25
1599        por     xmm4, xmm8
1600        pxor    xmm4, xmm12
1601        pxor    xmm5, xmm13
1602        pxor    xmm6, xmm14
1603        pxor    xmm7, xmm15
1604        mov     eax, r13d
1605        jne     9b
1606        movdqa  xmm9, xmm0
1607        punpckldq xmm0, xmm1
1608        punpckhdq xmm9, xmm1
1609        movdqa  xmm11, xmm2
1610        punpckldq xmm2, xmm3
1611        punpckhdq xmm11, xmm3
1612        movdqa  xmm1, xmm0
1613        punpcklqdq xmm0, xmm2
1614        punpckhqdq xmm1, xmm2
1615        movdqa  xmm3, xmm9
1616        punpcklqdq xmm9, xmm11
1617        punpckhqdq xmm3, xmm11
1618        movdqu  xmmword ptr [rbx], xmm0
1619        movdqu  xmmword ptr [rbx+0x20], xmm1
1620        movdqu  xmmword ptr [rbx+0x40], xmm9
1621        movdqu  xmmword ptr [rbx+0x60], xmm3
1622        movdqa  xmm9, xmm4
1623        punpckldq xmm4, xmm5
1624        punpckhdq xmm9, xmm5
1625        movdqa  xmm11, xmm6
1626        punpckldq xmm6, xmm7
1627        punpckhdq xmm11, xmm7
1628        movdqa  xmm5, xmm4
1629        punpcklqdq xmm4, xmm6
1630        punpckhqdq xmm5, xmm6
1631        movdqa  xmm7, xmm9
1632        punpcklqdq xmm9, xmm11
1633        punpckhqdq xmm7, xmm11
1634        movdqu  xmmword ptr [rbx+0x10], xmm4
1635        movdqu  xmmword ptr [rbx+0x30], xmm5
1636        movdqu  xmmword ptr [rbx+0x50], xmm9
1637        movdqu  xmmword ptr [rbx+0x70], xmm7
1638        movdqa  xmm1, xmmword ptr [rsp+0x110]
1639        movdqa  xmm0, xmm1
1640        paddd   xmm1, xmmword ptr [rsp+0x150]
1641        movdqa  xmmword ptr [rsp+0x110], xmm1
1642        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1643        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1644        pcmpgtd xmm0, xmm1
1645        movdqa  xmm1, xmmword ptr [rsp+0x120]
1646        psubd   xmm1, xmm0
1647        movdqa  xmmword ptr [rsp+0x120], xmm1
1648        add     rbx, 128
1649        add     rdi, 32
1650        sub     rsi, 4
1651        cmp     rsi, 4
1652        jnc     2b
1653        test    rsi, rsi
1654        jnz     3f
/*
 * Local label 4: shared exit path of blake3_hash_many_sse2.
 * Reached by fall-through when rsi is zero after a four-input
 * iteration, and by the backward "je 4b" taken when bit 0 of esi is
 * clear. Reloading rsp from rbp drops the 64-byte-aligned scratch
 * frame in one step; the pops then run in the reverse order of the
 * prologue pushes (r15, r14, r13, r12, rbx, rbp) before returning.
 */
16554:
1656        mov     rsp, rbp
1657        pop     rbp
1658        pop     rbx
1659        pop     r12
1660        pop     r13
1661        pop     r14
1662        pop     r15
1663        ret
1664.p2align 5
/*
 * First local label 3: two-input tail of blake3_hash_many_sse2.
 * Entered through "jnz 3f" once fewer than four inputs remain and
 * rsi is non-zero. When bit 1 of esi is clear control skips to the
 * next label 3 (single-input tail). Otherwise two inputs are
 * compressed side by side: input A keeps its state rows in
 * xmm0-xmm3 and its message vectors in xmm4-xmm7, input B uses
 * xmm8-xmm11 and xmm12-xmm15.
 *
 * Values consumed here but set up before this point. Roles are
 * inferred from how they are used below.
 * NOTE(review): confirm against the prologue and the C prototype.
 *   rcx             -> 32 bytes loaded as state rows 0-1 of both inputs
 *   rdi             -> array of input pointers (two are read)
 *   rbx             -> output cursor (64 bytes are written)
 *   r15             =  end offset of the block loop (rdx steps by 64)
 *   r13d            =  bits placed in the last word of row 3 for
 *                      every block (presumably the BLAKE3 flags)
 *   r12d            =  extra bits OR-ed in for the last block only
 *   byte [rbp+0x40] =  extra bits OR-ed in for the first block only
 *   [rsp+0x110]     =  per-lane counter, low dwords
 *   [rsp+0x120]     =  per-lane counter, high dwords
 *   [rsp+0x130]     =  broadcast of the negated r9d argument
 *
 * Scratch use: [rsp], [rsp+0x10] hold the counter pairs of A and B;
 * [rsp+0x20]..[rsp+0x50] hold message vectors across a round.
 */
16653:
1666        test    esi, 0x2
1667        je      3f
        /* Rows 0-1 for input A, duplicated into xmm8/xmm9 for input B. */
1668        movups  xmm0, xmmword ptr [rcx]
1669        movups  xmm1, xmmword ptr [rcx+0x10]
1670        movaps  xmm8, xmm0
1671        movaps  xmm9, xmm1
        /* [rsp] = {counter_lo[0], counter_hi[0], 0, 0} for input A. */
1672        movd    xmm13, dword ptr [rsp+0x110]
1673        movd    xmm14, dword ptr [rsp+0x120]
1674        punpckldq xmm13, xmm14
1675        movaps  xmmword ptr [rsp], xmm13
        /* [rsp+0x10] = {counter_lo[1], counter_hi[1], 0, 0} for input B. */
1676        movd    xmm14, dword ptr [rsp+0x114]
1677        movd    xmm13, dword ptr [rsp+0x124]
1678        punpckldq xmm14, xmm13
1679        movaps  xmmword ptr [rsp+0x10], xmm14
        /* r8 / r9 = data pointers of input A / input B. */
1680        mov     r8, qword ptr [rdi]
1681        mov     r9, qword ptr [rdi+0x8]
        /* eax = first-block bits | per-block bits; rdx = byte offset 0. */
1682        movzx   eax, byte ptr [rbp+0x40]
1683        or      eax, r13d
1684        xor     edx, edx
/*
 * Local label 2: per-block loop. Each pass consumes 64 bytes from
 * both inputs. The r12d bits survive in eax only when the advanced
 * offset equals r15, i.e. on the last block; otherwise cmovne
 * restores the value saved in r14d.
 */
16852:
1686        mov     r14d, eax
1687        or      eax, r12d
1688        add     rdx, 64
1689        cmp     rdx, r15
1690        cmovne  eax, r14d
        /* Row 2 of both inputs restarts from the BLAKE3_IV constant. */
1691        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1692        movaps  xmm10, xmm2
        /*
         * Load the 16 message words of input A and split them:
         *   xmm4 = {m0, m2, m4, m6}     xmm5 = {m1, m3, m5, m7}
         *   xmm6 = {m14, m8, m10, m12}  xmm7 = {m15, m9, m11, m13}
         * (shufps 136 keeps even words, 221 keeps odd words, pshufd
         * 0x93 rotates the lanes up by one position).
         */
1693        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1694        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1695        movaps  xmm3, xmm4
1696        shufps  xmm4, xmm5, 136
1697        shufps  xmm3, xmm5, 221
1698        movaps  xmm5, xmm3
1699        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1700        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1701        movaps  xmm3, xmm6
1702        shufps  xmm6, xmm7, 136
1703        pshufd  xmm6, xmm6, 0x93
1704        shufps  xmm3, xmm7, 221
1705        pshufd  xmm7, xmm3, 0x93
        /* Same split for input B into xmm12-xmm15 (xmm11 is the temp). */
1706        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1707        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1708        movaps  xmm11, xmm12
1709        shufps  xmm12, xmm13, 136
1710        shufps  xmm11, xmm13, 221
1711        movaps  xmm13, xmm11
1712        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1713        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1714        movaps  xmm11, xmm14
1715        shufps  xmm14, xmm15, 136
1716        pshufd  xmm14, xmm14, 0x93
1717        shufps  xmm11, xmm15, 221
1718        pshufd  xmm15, xmm11, 0x93
        /*
         * Build row 3 = {counter_lo, counter_hi, 0x40, eax bits} for
         * both inputs: rax = (bits << 32) | 0x40 is parked at
         * [rsp+0x20] and appended as the high qword of each counter
         * pair. 0x40 is presumably the 64-byte block length.
         */
1719        shl     rax, 0x20
1720        or      rax, 0x40
1721        movq    xmm3, rax
1722        movdqa  xmmword ptr [rsp+0x20], xmm3
1723        movaps  xmm3, xmmword ptr [rsp]
1724        movaps  xmm11, xmmword ptr [rsp+0x10]
1725        punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1726        punpcklqdq xmm11, xmmword ptr [rsp+0x20]
        /* al counts the 7 rounds; only the low byte of rax is rewritten. */
1727        mov     al, 7
/*
 * Local label 9 (first): one round for both inputs. Instructions
 * for A and B are interleaved; every step below appears twice.
 */
17289:
        /* Row 0 += first message vector. xmm4 and xmm12 are saved
           because xmm4 is reused as a rotate temporary below and both
           are needed again when the message is permuted. */
1729        paddd   xmm0, xmm4
1730        paddd   xmm8, xmm12
1731        movaps  xmmword ptr [rsp+0x20], xmm4
1732        movaps  xmmword ptr [rsp+0x30], xmm12
1733        paddd   xmm0, xmm1
1734        paddd   xmm8, xmm9
1735        pxor    xmm3, xmm0
1736        pxor    xmm11, xmm8
        /* Rotate row 3 by 16 bits: swap the 16-bit halves of each dword. */
1737        pshuflw xmm3, xmm3, 0xB1
1738        pshufhw xmm3, xmm3, 0xB1
1739        pshuflw xmm11, xmm11, 0xB1
1740        pshufhw xmm11, xmm11, 0xB1
1741        paddd   xmm2, xmm3
1742        paddd   xmm10, xmm11
1743        pxor    xmm1, xmm2
1744        pxor    xmm9, xmm10
        /* Rotate row 1 right by 12 (left 20 | right 12). */
1745        movdqa  xmm4, xmm1
1746        pslld   xmm1, 20
1747        psrld   xmm4, 12
1748        por     xmm1, xmm4
1749        movdqa  xmm4, xmm9
1750        pslld   xmm9, 20
1751        psrld   xmm4, 12
1752        por     xmm9, xmm4
        /* Row 0 += second message vector. xmm5 and xmm13 are saved
           for the permutation; xmm13 becomes a rotate temporary. */
1753        paddd   xmm0, xmm5
1754        paddd   xmm8, xmm13
1755        movaps  xmmword ptr [rsp+0x40], xmm5
1756        movaps  xmmword ptr [rsp+0x50], xmm13
1757        paddd   xmm0, xmm1
1758        paddd   xmm8, xmm9
1759        pxor    xmm3, xmm0
1760        pxor    xmm11, xmm8
        /* Rotate row 3 right by 8 (right 8 ^ left 24). */
1761        movdqa  xmm13, xmm3
1762        psrld   xmm3, 8
1763        pslld   xmm13, 24
1764        pxor    xmm3, xmm13
1765        movdqa  xmm13, xmm11
1766        psrld   xmm11, 8
1767        pslld   xmm13, 24
1768        pxor    xmm11, xmm13
1769        paddd   xmm2, xmm3
1770        paddd   xmm10, xmm11
1771        pxor    xmm1, xmm2
1772        pxor    xmm9, xmm10
        /* Rotate row 1 right by 7 (left 25 | right 7). */
1773        movdqa  xmm4, xmm1
1774        pslld   xmm1, 25
1775        psrld   xmm4, 7
1776        por     xmm1, xmm4
1777        movdqa  xmm4, xmm9
1778        pslld   xmm9, 25
1779        psrld   xmm4, 7
1780        por     xmm9, xmm4
        /* Lane-rotate rows 0, 3 and 2 (row 1 stays put) so the second
           half of the round mixes a different grouping of words. */
1781        pshufd  xmm0, xmm0, 0x93
1782        pshufd  xmm8, xmm8, 0x93
1783        pshufd  xmm3, xmm3, 0x4E
1784        pshufd  xmm11, xmm11, 0x4E
1785        pshufd  xmm2, xmm2, 0x39
1786        pshufd  xmm10, xmm10, 0x39
        /* Second half: same add / xor / rotate sequence (16, 12, 8, 7)
           with the third and fourth message vectors. */
1787        paddd   xmm0, xmm6
1788        paddd   xmm8, xmm14
1789        paddd   xmm0, xmm1
1790        paddd   xmm8, xmm9
1791        pxor    xmm3, xmm0
1792        pxor    xmm11, xmm8
1793        pshuflw xmm3, xmm3, 0xB1
1794        pshufhw xmm3, xmm3, 0xB1
1795        pshuflw xmm11, xmm11, 0xB1
1796        pshufhw xmm11, xmm11, 0xB1
1797        paddd   xmm2, xmm3
1798        paddd   xmm10, xmm11
1799        pxor    xmm1, xmm2
1800        pxor    xmm9, xmm10
1801        movdqa  xmm4, xmm1
1802        pslld   xmm1, 20
1803        psrld   xmm4, 12
1804        por     xmm1, xmm4
1805        movdqa  xmm4, xmm9
1806        pslld   xmm9, 20
1807        psrld   xmm4, 12
1808        por     xmm9, xmm4
1809        paddd   xmm0, xmm7
1810        paddd   xmm8, xmm15
1811        paddd   xmm0, xmm1
1812        paddd   xmm8, xmm9
1813        pxor    xmm3, xmm0
1814        pxor    xmm11, xmm8
1815        movdqa  xmm13, xmm3
1816        psrld   xmm3, 8
1817        pslld   xmm13, 24
1818        pxor    xmm3, xmm13
1819        movdqa  xmm13, xmm11
1820        psrld   xmm11, 8
1821        pslld   xmm13, 24
1822        pxor    xmm11, xmm13
1823        paddd   xmm2, xmm3
1824        paddd   xmm10, xmm11
1825        pxor    xmm1, xmm2
1826        pxor    xmm9, xmm10
1827        movdqa  xmm4, xmm1
1828        pslld   xmm1, 25
1829        psrld   xmm4, 7
1830        por     xmm1, xmm4
1831        movdqa  xmm4, xmm9
1832        pslld   xmm9, 25
1833        psrld   xmm4, 7
1834        por     xmm9, xmm4
        /* Undo the lane rotation (0x39 reverses 0x93, 0x4E is its own
           inverse). */
1835        pshufd  xmm0, xmm0, 0x39
1836        pshufd  xmm8, xmm8, 0x39
1837        pshufd  xmm3, xmm3, 0x4E
1838        pshufd  xmm11, xmm11, 0x4E
1839        pshufd  xmm2, xmm2, 0x93
1840        pshufd  xmm10, xmm10, 0x93
        /* Leave after the 7th round; the permutation is skipped then. */
1841        dec     al
1842        je      9f
        /*
         * Rebuild input A message vectors for the next round from the
         * copies saved at [rsp+0x20] / [rsp+0x40] plus the live xmm6
         * and xmm7. The pand / pand / por triples pick words from two
         * sources through the PBLENDW_* constants; the names suggest
         * they stand in for pblendw, which SSE2 lacks.
         * NOTE(review): mask contents are defined outside this chunk.
         * The new second and third vectors are parked at [rsp+0x20]
         * and [rsp+0x40] until the end of this section.
         */
1843        movdqa  xmm12, xmmword ptr [rsp+0x20]
1844        movdqa  xmm5, xmmword ptr [rsp+0x40]
1845        pshufd  xmm13, xmm12, 0x0F
1846        shufps  xmm12, xmm5, 214
1847        pshufd  xmm4, xmm12, 0x39
1848        movdqa  xmm12, xmm6
1849        shufps  xmm12, xmm7, 250
1850        pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1851        pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1852        por     xmm13, xmm12
1853        movdqa  xmmword ptr [rsp+0x20], xmm13
1854        movdqa  xmm12, xmm7
1855        punpcklqdq xmm12, xmm5
1856        movdqa  xmm13, xmm6
1857        pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1858        pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1859        por     xmm12, xmm13
1860        pshufd  xmm12, xmm12, 0x78
1861        punpckhdq xmm5, xmm7
1862        punpckldq xmm6, xmm5
1863        pshufd  xmm7, xmm6, 0x1E
1864        movdqa  xmmword ptr [rsp+0x40], xmm12
        /*
         * Same rebuild for input B from [rsp+0x30] / [rsp+0x50] plus
         * xmm14 and xmm15, using xmm5 and xmm6 as temporaries. The
         * slot [rsp+0x30] is free once it has been read, so xmm2 is
         * spilled there briefly to gain one more scratch register.
         */
1865        movdqa  xmm5, xmmword ptr [rsp+0x30]
1866        movdqa  xmm13, xmmword ptr [rsp+0x50]
1867        pshufd  xmm6, xmm5, 0x0F
1868        shufps  xmm5, xmm13, 214
1869        pshufd  xmm12, xmm5, 0x39
1870        movdqa  xmm5, xmm14
1871        shufps  xmm5, xmm15, 250
1872        pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1873        pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1874        por     xmm6, xmm5
1875        movdqa  xmm5, xmm15
1876        punpcklqdq xmm5, xmm13
1877        movdqa  xmmword ptr [rsp+0x30], xmm2
1878        movdqa  xmm2, xmm14
1879        pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1880        pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1881        por     xmm5, xmm2
1882        movdqa  xmm2, xmmword ptr [rsp+0x30]
1883        pshufd  xmm5, xmm5, 0x78
1884        punpckhdq xmm13, xmm15
1885        punpckldq xmm14, xmm13
1886        pshufd  xmm15, xmm14, 0x1E
        /* Move the results into place: B vectors from the temporaries,
           A vectors back from their stack slots. */
1887        movdqa  xmm13, xmm6
1888        movdqa  xmm14, xmm5
1889        movdqa  xmm5, xmmword ptr [rsp+0x20]
1890        movdqa  xmm6, xmmword ptr [rsp+0x40]
1891        jmp     9b
/*
 * Local label 9 (second): all 7 rounds done. Rows 2-3 are xor-ed
 * into rows 0-1, which then serve as rows 0-1 of the next block.
 */
18929:
1893        pxor    xmm0, xmm2
1894        pxor    xmm1, xmm3
1895        pxor    xmm8, xmm10
1896        pxor    xmm9, xmm11
        /* Blocks after the first start from the r13d bits alone. The
           compare is repeated because dec al overwrote the flags. */
1897        mov     eax, r13d
1898        cmp     rdx, r15
1899        jne     2b
        /* Write 32 bytes for input A, then 32 bytes for input B. */
1900        movups  xmmword ptr [rbx], xmm0
1901        movups  xmmword ptr [rbx+0x10], xmm1
1902        movups  xmmword ptr [rbx+0x20], xmm8
1903        movups  xmmword ptr [rbx+0x30], xmm9
        /*
         * Counter step. [rsp+0x130] holds the negated r9d argument,
         * so negating it again yields the original value. Scaled by
         * 8 it selects either lane 0 (value 0: counter unchanged) or
         * lane 2 (value 1) as the new lane-0 counter, low and high
         * dword alike.
         * NOTE(review): this assumes r9d was 0 or 1 on entry.
         */
1904        mov     eax, dword ptr [rsp+0x130]
1905        neg     eax
1906        mov    r10d, dword ptr [rsp+0x110+8*rax]
1907        mov    r11d, dword ptr [rsp+0x120+8*rax]
1908        mov dword ptr [rsp+0x110], r10d
1909        mov dword ptr [rsp+0x120], r11d
        /* Two pointers consumed, 64 output bytes written, two fewer
           inputs left. */
1910        add     rdi, 16
1911        add     rbx, 64
1912        sub     rsi, 2
19133:
1914        test    esi, 0x1
1915        je      4b
1916        movups  xmm0, xmmword ptr [rcx]
1917        movups  xmm1, xmmword ptr [rcx+0x10]
1918        movd    xmm13, dword ptr [rsp+0x110]
1919        movd    xmm14, dword ptr [rsp+0x120]
1920        punpckldq xmm13, xmm14
1921        mov     r8, qword ptr [rdi]
1922        movzx   eax, byte ptr [rbp+0x40]
1923        or      eax, r13d
1924        xor     edx, edx
19252:
1926        mov     r14d, eax
1927        or      eax, r12d
1928        add     rdx, 64
1929        cmp     rdx, r15
1930        cmovne  eax, r14d
1931        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1932        shl     rax, 32
1933        or      rax, 64
1934        movq    xmm12, rax
1935        movdqa  xmm3, xmm13
1936        punpcklqdq xmm3, xmm12
1937        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1938        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1939        movaps  xmm8, xmm4
1940        shufps  xmm4, xmm5, 136
1941        shufps  xmm8, xmm5, 221
1942        movaps  xmm5, xmm8
1943        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1944        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1945        movaps  xmm8, xmm6
1946        shufps  xmm6, xmm7, 136
1947        pshufd  xmm6, xmm6, 0x93
1948        shufps  xmm8, xmm7, 221
1949        pshufd  xmm7, xmm8, 0x93
1950        mov     al, 7
19519:
1952        paddd   xmm0, xmm4
1953        paddd   xmm0, xmm1
1954        pxor    xmm3, xmm0
1955        pshuflw xmm3, xmm3, 0xB1
1956        pshufhw xmm3, xmm3, 0xB1
1957        paddd   xmm2, xmm3
1958        pxor    xmm1, xmm2
1959        movdqa  xmm11, xmm1
1960        pslld   xmm1, 20
1961        psrld   xmm11, 12
1962        por     xmm1, xmm11
1963        paddd   xmm0, xmm5
1964        paddd   xmm0, xmm1
1965        pxor    xmm3, xmm0
1966        movdqa  xmm14, xmm3
1967        psrld   xmm3, 8
1968        pslld   xmm14, 24
1969        pxor    xmm3, xmm14
1970        paddd   xmm2, xmm3
1971        pxor    xmm1, xmm2
1972        movdqa  xmm11, xmm1
1973        pslld   xmm1, 25
1974        psrld   xmm11, 7
1975        por     xmm1, xmm11
1976        pshufd  xmm0, xmm0, 0x93
1977        pshufd  xmm3, xmm3, 0x4E
1978        pshufd  xmm2, xmm2, 0x39
1979        paddd   xmm0, xmm6
1980        paddd   xmm0, xmm1
1981        pxor    xmm3, xmm0
1982        pshuflw xmm3, xmm3, 0xB1
1983        pshufhw xmm3, xmm3, 0xB1
1984        paddd   xmm2, xmm3
1985        pxor    xmm1, xmm2
1986        movdqa  xmm11, xmm1
1987        pslld   xmm1, 20
1988        psrld   xmm11, 12
1989        por     xmm1, xmm11
1990        paddd   xmm0, xmm7
1991        paddd   xmm0, xmm1
1992        pxor    xmm3, xmm0
1993        movdqa  xmm14, xmm3
1994        psrld   xmm3, 8
1995        pslld   xmm14, 24
1996        pxor    xmm3, xmm14
1997        paddd   xmm2, xmm3
1998        pxor    xmm1, xmm2
1999        movdqa  xmm11, xmm1
2000        pslld   xmm1, 25
2001        psrld   xmm11, 7
2002        por     xmm1, xmm11
2003        pshufd  xmm0, xmm0, 0x39
2004        pshufd  xmm3, xmm3, 0x4E
2005        pshufd  xmm2, xmm2, 0x93
2006        dec     al
2007        jz      9f
2008        movdqa  xmm8, xmm4
2009        shufps  xmm8, xmm5, 214
2010        pshufd  xmm9, xmm4, 0x0F
2011        pshufd  xmm4, xmm8, 0x39
2012        movdqa  xmm8, xmm6
2013        shufps  xmm8, xmm7, 250
2014        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2015        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2016        por     xmm9, xmm8
2017        movdqa  xmm8, xmm7
2018        punpcklqdq xmm8, xmm5
2019        movdqa  xmm10, xmm6
2020        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2021        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2022        por     xmm8, xmm10
2023        pshufd  xmm8, xmm8, 0x78
2024        punpckhdq xmm5, xmm7
2025        punpckldq xmm6, xmm5
2026        pshufd  xmm7, xmm6, 0x1E
2027        movdqa  xmm5, xmm9
2028        movdqa  xmm6, xmm8
2029        jmp     9b
20309:
2031        pxor    xmm0, xmm2
2032        pxor    xmm1, xmm3
2033        mov     eax, r13d
2034        cmp     rdx, r15
2035        jne     2b
2036        movups  xmmword ptr [rbx], xmm0
2037        movups  xmmword ptr [rbx+0x10], xmm1
2038        jmp     4b
2039
/*
 * blake3_compress_in_place_sse2
 *
 * Compresses one 64-byte block with SSE2 and writes the 32-byte result back
 * over the 32 bytes it read from rdi.
 *
 * ABI:  System V AMD64, leaf function, no stack usage.
 * In:   rdi = 32 bytes, loaded at entry and overwritten at exit (unaligned OK)
 *       rsi = 64-byte message block (unaligned OK)
 *       dl  = 8-bit value placed in lane 2 of xmm3
 *             (presumably block_len -- confirm against the C prototype)
 *       rcx = 64-bit value placed in lanes 0-1 of xmm3 (presumably the counter)
 *       r8b = 8-bit value placed in lane 3 of xmm3 (presumably the flags)
 * Out:  [rdi, rdi+32) = (xmm0 ^ xmm2, xmm1 ^ xmm3) after 7 rounds
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags
 *
 * Only the low byte of rdx and r8 is consumed: both are zero-extended before
 * being merged, exactly as blake3_compress_xof_sse2 does, so stale upper
 * register bits left by a caller cannot leak into the state.
 */
2040.p2align 6
2041blake3_compress_in_place_sse2:
2042_blake3_compress_in_place_sse2:
2043        _CET_ENDBR
        /* State rows 0-1 come from [rdi], row 2 is BLAKE3_IV. */
2044        movups  xmm0, xmmword ptr [rdi]
2045        movups  xmm1, xmmword ptr [rdi+0x10]
2046        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* rdx = zext(dl) | zext(r8b) << 32; row 3 (xmm3) = { rcx, rdx }. */
        movzx   eax, r8b
        movzx   edx, dl
2047        shl     rax, 32
2048        add     rdx, rax
2049        movq    xmm3, rcx
2050        movq    xmm4, rdx
2051        punpcklqdq xmm3, xmm4
        /*
         * Load the message and de-interleave it:
         *   xmm4 = even dwords of bytes 0-31, xmm5 = odd dwords of bytes 0-31,
         *   xmm6 / xmm7 = even / odd dwords of bytes 32-63, lanes rotated by
         *   one position (pshufd 0x93).
         */
2052        movups  xmm4, xmmword ptr [rsi]
2053        movups  xmm5, xmmword ptr [rsi+0x10]
2054        movaps  xmm8, xmm4
2055        shufps  xmm4, xmm5, 136
2056        shufps  xmm8, xmm5, 221
2057        movaps  xmm5, xmm8
2058        movups  xmm6, xmmword ptr [rsi+0x20]
2059        movups  xmm7, xmmword ptr [rsi+0x30]
2060        movaps  xmm8, xmm6
2061        shufps  xmm6, xmm7, 136
2062        pshufd  xmm6, xmm6, 0x93
2063        shufps  xmm8, xmm7, 221
2064        pshufd  xmm7, xmm8, 0x93
        /* al = round counter: the round body below runs 7 times. */
2065        mov     al, 7
2066‍9:
        /* First half-round: add xmm4, rotate right by 16 then by 12. */
2067        paddd   xmm0, xmm4
2068        paddd   xmm0, xmm1
2069        pxor    xmm3, xmm0
        /* Swapping the 16-bit halves of every dword == rotate by 16. */
2070        pshuflw xmm3, xmm3, 0xB1
2071        pshufhw xmm3, xmm3, 0xB1
2072        paddd   xmm2, xmm3
2073        pxor    xmm1, xmm2
        /* SSE2 has no vector rotate: (x << 20) | (x >> 12) == rotate right 12. */
2074        movdqa  xmm11, xmm1
2075        pslld   xmm1, 20
2076        psrld   xmm11, 12
2077        por     xmm1, xmm11
        /* Add xmm5, rotate right by 8 then by 7. */
2078        paddd   xmm0, xmm5
2079        paddd   xmm0, xmm1
2080        pxor    xmm3, xmm0
2081        movdqa  xmm14, xmm3
2082        psrld   xmm3, 8
2083        pslld   xmm14, 24
2084        pxor    xmm3, xmm14
2085        paddd   xmm2, xmm3
2086        pxor    xmm1, xmm2
2087        movdqa  xmm11, xmm1
2088        pslld   xmm1, 25
2089        psrld   xmm11, 7
2090        por     xmm1, xmm11
        /* Rotate the lanes of rows 0, 3 and 2 so the second half-round works on diagonals. */
2091        pshufd  xmm0, xmm0, 0x93
2092        pshufd  xmm3, xmm3, 0x4E
2093        pshufd  xmm2, xmm2, 0x39
        /* Second half-round: same sequence with message vectors xmm6 and xmm7. */
2094        paddd   xmm0, xmm6
2095        paddd   xmm0, xmm1
2096        pxor    xmm3, xmm0
2097        pshuflw xmm3, xmm3, 0xB1
2098        pshufhw xmm3, xmm3, 0xB1
2099        paddd   xmm2, xmm3
2100        pxor    xmm1, xmm2
2101        movdqa  xmm11, xmm1
2102        pslld   xmm1, 20
2103        psrld   xmm11, 12
2104        por     xmm1, xmm11
2105        paddd   xmm0, xmm7
2106        paddd   xmm0, xmm1
2107        pxor    xmm3, xmm0
2108        movdqa  xmm14, xmm3
2109        psrld   xmm3, 8
2110        pslld   xmm14, 24
2111        pxor    xmm3, xmm14
2112        paddd   xmm2, xmm3
2113        pxor    xmm1, xmm2
2114        movdqa  xmm11, xmm1
2115        pslld   xmm1, 25
2116        psrld   xmm11, 7
2117        por     xmm1, xmm11
        /* Undo the lane rotation (0x39 / 0x4E / 0x93 invert 0x93 / 0x4E / 0x39). */
2118        pshufd  xmm0, xmm0, 0x39
2119        pshufd  xmm3, xmm3, 0x4E
2120        pshufd  xmm2, xmm2, 0x93
        /* Leave after the 7th round; the message shuffle below is skipped then. */
2121        dec     al
2122        jz      9f
        /*
         * Rearrange the message vectors xmm4-xmm7 for the next round.
         * The pand/pand/por triples with the PBLENDW_* masks merge lanes of two
         * registers, standing in for a blend instruction that SSE2 lacks.
         */
2123        movdqa  xmm8, xmm4
2124        shufps  xmm8, xmm5, 214
2125        pshufd  xmm9, xmm4, 0x0F
2126        pshufd  xmm4, xmm8, 0x39
2127        movdqa  xmm8, xmm6
2128        shufps  xmm8, xmm7, 250
2129        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2130        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2131        por     xmm9, xmm8
2132        movdqa  xmm8, xmm7
2133        punpcklqdq xmm8, xmm5
2134        movdqa  xmm10, xmm6
2135        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2136        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2137        por     xmm8, xmm10
2138        pshufd  xmm8, xmm8, 0x78
2139        punpckhdq xmm5, xmm7
2140        punpckldq xmm6, xmm5
2141        pshufd  xmm7, xmm6, 0x1E
2142        movdqa  xmm5, xmm9
2143        movdqa  xmm6, xmm8
2144        jmp     9b
2145‍9:
        /* Fold rows 2-3 into rows 0-1 and store the 32-byte result over the input. */
2146        pxor    xmm0, xmm2
2147        pxor    xmm1, xmm3
2148        movups  xmmword ptr [rdi], xmm0
2149        movups  xmmword ptr [rdi+0x10], xmm1
2150        ret
2151
/*
 * blake3_compress_xof_sse2
 *
 * Compresses one 64-byte block with SSE2 and writes the full 64-byte result
 * to a separate output buffer; the 32 bytes at rdi are only read.
 *
 * ABI:  System V AMD64, leaf function, no stack usage.
 * In:   rdi = 32 bytes, read at entry and again at exit (unaligned OK)
 *       rsi = 64-byte message block (unaligned OK)
 *       dl  = 8-bit value placed in lane 2 of xmm3
 *             (presumably block_len -- confirm against the C prototype)
 *       rcx = 64-bit value placed in lanes 0-1 of xmm3 (presumably the counter)
 *       r8b = 8-bit value placed in lane 3 of xmm3 (presumably the flags)
 *       r9  = 64-byte output buffer (unaligned OK)
 * Out:  [r9,      r9+0x20) = (xmm0 ^ xmm2, xmm1 ^ xmm3)
 *       [r9+0x20, r9+0x40) = (xmm2 ^ [rdi], xmm3 ^ [rdi+0x10])
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags
 */
2152.p2align 6
2153blake3_compress_xof_sse2:
2154_blake3_compress_xof_sse2:
2155        _CET_ENDBR
        /* State rows 0-1 come from [rdi], row 2 is BLAKE3_IV. */
2156        movups  xmm0, xmmword ptr [rdi]
2157        movups  xmm1, xmmword ptr [rdi+0x10]
2158        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Only the low byte of r8 and rdx is used; both are zero-extended first.
         * rdx = zext(dl) | zext(r8b) << 32; row 3 (xmm3) = { rcx, rdx }.
         */
2159        movzx   eax, r8b
2160        movzx   edx, dl
2161        shl     rax, 32
2162        add     rdx, rax
2163        movq    xmm3, rcx
2164        movq    xmm4, rdx
2165        punpcklqdq xmm3, xmm4
        /*
         * Load the message and de-interleave it:
         *   xmm4 = even dwords of bytes 0-31, xmm5 = odd dwords of bytes 0-31,
         *   xmm6 / xmm7 = even / odd dwords of bytes 32-63, lanes rotated by
         *   one position (pshufd 0x93).
         */
2166        movups  xmm4, xmmword ptr [rsi]
2167        movups  xmm5, xmmword ptr [rsi+0x10]
2168        movaps  xmm8, xmm4
2169        shufps  xmm4, xmm5, 136
2170        shufps  xmm8, xmm5, 221
2171        movaps  xmm5, xmm8
2172        movups  xmm6, xmmword ptr [rsi+0x20]
2173        movups  xmm7, xmmword ptr [rsi+0x30]
2174        movaps  xmm8, xmm6
2175        shufps  xmm6, xmm7, 136
2176        pshufd  xmm6, xmm6, 0x93
2177        shufps  xmm8, xmm7, 221
2178        pshufd  xmm7, xmm8, 0x93
        /* al = round counter: the round body below runs 7 times. */
2179        mov     al, 7
2180‍9:
        /* First half-round: add xmm4, rotate right by 16 then by 12. */
2181        paddd   xmm0, xmm4
2182        paddd   xmm0, xmm1
2183        pxor    xmm3, xmm0
        /* Swapping the 16-bit halves of every dword == rotate by 16. */
2184        pshuflw xmm3, xmm3, 0xB1
2185        pshufhw xmm3, xmm3, 0xB1
2186        paddd   xmm2, xmm3
2187        pxor    xmm1, xmm2
        /* SSE2 has no vector rotate: (x << 20) | (x >> 12) == rotate right 12. */
2188        movdqa  xmm11, xmm1
2189        pslld   xmm1, 20
2190        psrld   xmm11, 12
2191        por     xmm1, xmm11
        /* Add xmm5, rotate right by 8 then by 7. */
2192        paddd   xmm0, xmm5
2193        paddd   xmm0, xmm1
2194        pxor    xmm3, xmm0
2195        movdqa  xmm14, xmm3
2196        psrld   xmm3, 8
2197        pslld   xmm14, 24
2198        pxor    xmm3, xmm14
2199        paddd   xmm2, xmm3
2200        pxor    xmm1, xmm2
2201        movdqa  xmm11, xmm1
2202        pslld   xmm1, 25
2203        psrld   xmm11, 7
2204        por     xmm1, xmm11
        /* Rotate the lanes of rows 0, 3 and 2 so the second half-round works on diagonals. */
2205        pshufd  xmm0, xmm0, 0x93
2206        pshufd  xmm3, xmm3, 0x4E
2207        pshufd  xmm2, xmm2, 0x39
        /* Second half-round: same sequence with message vectors xmm6 and xmm7. */
2208        paddd   xmm0, xmm6
2209        paddd   xmm0, xmm1
2210        pxor    xmm3, xmm0
2211        pshuflw xmm3, xmm3, 0xB1
2212        pshufhw xmm3, xmm3, 0xB1
2213        paddd   xmm2, xmm3
2214        pxor    xmm1, xmm2
2215        movdqa  xmm11, xmm1
2216        pslld   xmm1, 20
2217        psrld   xmm11, 12
2218        por     xmm1, xmm11
2219        paddd   xmm0, xmm7
2220        paddd   xmm0, xmm1
2221        pxor    xmm3, xmm0
2222        movdqa  xmm14, xmm3
2223        psrld   xmm3, 8
2224        pslld   xmm14, 24
2225        pxor    xmm3, xmm14
2226        paddd   xmm2, xmm3
2227        pxor    xmm1, xmm2
2228        movdqa  xmm11, xmm1
2229        pslld   xmm1, 25
2230        psrld   xmm11, 7
2231        por     xmm1, xmm11
        /* Undo the lane rotation (0x39 / 0x4E / 0x93 invert 0x93 / 0x4E / 0x39). */
2232        pshufd  xmm0, xmm0, 0x39
2233        pshufd  xmm3, xmm3, 0x4E
2234        pshufd  xmm2, xmm2, 0x93
        /* Leave after the 7th round; the message shuffle below is skipped then. */
2235        dec     al
2236        jz      9f
        /*
         * Rearrange the message vectors xmm4-xmm7 for the next round.
         * The pand/pand/por triples with the PBLENDW_* masks merge lanes of two
         * registers, standing in for a blend instruction that SSE2 lacks.
         */
2237        movdqa  xmm8, xmm4
2238        shufps  xmm8, xmm5, 214
2239        pshufd  xmm9, xmm4, 0x0F
2240        pshufd  xmm4, xmm8, 0x39
2241        movdqa  xmm8, xmm6
2242        shufps  xmm8, xmm7, 250
2243        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2244        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2245        por     xmm9, xmm8
2246        movdqa  xmm8, xmm7
2247        punpcklqdq xmm8, xmm5
2248        movdqa  xmm10, xmm6
2249        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2250        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2251        por     xmm8, xmm10
2252        pshufd  xmm8, xmm8, 0x78
2253        punpckhdq xmm5, xmm7
2254        punpckldq xmm6, xmm5
2255        pshufd  xmm7, xmm6, 0x1E
2256        movdqa  xmm5, xmm9
2257        movdqa  xmm6, xmm8
2258        jmp     9b
2259‍9:
        /*
         * Reload the original 32 input bytes (the message registers xmm4/xmm5 are
         * free now). Order matters: rows 0-1 are XORed with rows 2-3 *before*
         * rows 2-3 are XORed with the reloaded input.
         */
2260        movdqu  xmm4, xmmword ptr [rdi]
2261        movdqu  xmm5, xmmword ptr [rdi+0x10]
2262        pxor    xmm0, xmm2
2263        pxor    xmm1, xmm3
2264        pxor    xmm2, xmm4
2265        pxor    xmm3, xmm5
        /* Store all four rows, 64 bytes in total, to the output buffer at r9. */
2266        movups  xmmword ptr [r9], xmm0
2267        movups  xmmword ptr [r9+0x10], xmm1
2268        movups  xmmword ptr [r9+0x20], xmm2
2269        movups  xmmword ptr [r9+0x30], xmm3
2270        ret
2271
2272
/*
 * Read-only constant table shared by the SSE2 BLAKE3 routines.
 *
 * The table starts on a 64-byte boundary (.p2align 6) and every entry is
 * exactly four .long values (16 bytes), so each label is 16-byte aligned and
 * may be used directly as an aligned memory operand (movaps / pand / pxor).
 * Keep every entry at 16 bytes when editing, or that alignment is lost.
 */
2273#ifdef __APPLE__
2274.static_data
2275#else
2276.section .rodata
2277#endif
2278.p2align  6
/* Four 32-bit IV words; loaded as state row 2 (xmm2) by the compress routines. */
2279BLAKE3_IV:
2280        .long  0x6A09E667, 0xBB67AE85
2281        .long  0x3C6EF372, 0xA54FF53A
/*
 * Per-lane offsets 0..3; the blake3_hash_many_sse2 prologue masks them and
 * adds them to the broadcast low counter word.
 */
2282ADD0:
2283        .long  0, 1, 2, 3
/*
 * Constant 4 in every lane; masked and saved to the stack frame by the
 * blake3_hash_many_sse2 prologue (presumably the counter step per 4-way
 * batch -- confirm in the part of the function that consumes it).
 */
2284ADD1:
2285	.long  4, 4, 4, 4
/*
 * BLAKE3_IV_0..3: word n of BLAKE3_IV replicated across all four lanes.
 * Not referenced in the part of the file reviewed here; presumably used by
 * the 4-way path of blake3_hash_many_sse2.
 */
2286BLAKE3_IV_0:
2287	.long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2288BLAKE3_IV_1:
2289	.long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2290BLAKE3_IV_2:
2291	.long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2292BLAKE3_IV_3:
2293	.long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/*
 * The value 64 in every lane. Not referenced in the part of the file reviewed
 * here; presumably the block length for the 4-way path.
 */
2294BLAKE3_BLOCK_LEN:
2295	.long  64, 64, 64, 64
/*
 * Sign bit of every lane. The blake3_hash_many_sse2 prologue XORs it into
 * both operands before pcmpgtd, which turns that signed compare into an
 * unsigned one.
 */
2296CMP_MSB_MASK:
2297	.long  0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * Lane-select masks used in pand/pand/por triples to merge two registers.
 * The hex value in each name is a bitmap over the eight 16-bit words of the
 * register (bit i set = word i selected), i.e. the immediate an equivalent
 * pblendw would take:
 *   0x33 -> dwords 0 and 2      0xCC -> dwords 1 and 3
 *   0x3F -> dwords 0, 1 and 2   0xC0 -> dword 3
 * 0x33/0xCC and 0x3F/0xC0 are complementary pairs.
 */
2298PBLENDW_0x33_MASK:
2299	.long  0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
2300PBLENDW_0xCC_MASK:
2301	.long  0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
2302PBLENDW_0x3F_MASK:
2303	.long  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
2304PBLENDW_0xC0_MASK:
2305	.long  0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
2306
2307#endif
2308