1#include "llvm_blake3_prefix.h"
2
3.intel_syntax noprefix
4.global blake3_hash_many_sse2
5.global _blake3_hash_many_sse2
6.global blake3_compress_in_place_sse2
7.global _blake3_compress_in_place_sse2
8.global blake3_compress_xof_sse2
9.global _blake3_compress_xof_sse2
10.section .text
11        .p2align  6
12_blake3_hash_many_sse2:
13blake3_hash_many_sse2:
14        push    r15
15        push    r14
16        push    r13
17        push    r12
18        push    rsi
19        push    rdi
20        push    rbx
21        push    rbp
22        mov     rbp, rsp
23        sub     rsp, 528
24        and     rsp, 0xFFFFFFFFFFFFFFC0
25        movdqa  xmmword ptr [rsp+0x170], xmm6
26        movdqa  xmmword ptr [rsp+0x180], xmm7
27        movdqa  xmmword ptr [rsp+0x190], xmm8
28        movdqa  xmmword ptr [rsp+0x1A0], xmm9
29        movdqa  xmmword ptr [rsp+0x1B0], xmm10
30        movdqa  xmmword ptr [rsp+0x1C0], xmm11
31        movdqa  xmmword ptr [rsp+0x1D0], xmm12
32        movdqa  xmmword ptr [rsp+0x1E0], xmm13
33        movdqa  xmmword ptr [rsp+0x1F0], xmm14
34        movdqa  xmmword ptr [rsp+0x200], xmm15
35        mov     rdi, rcx
36        mov     rsi, rdx
37        mov     rdx, r8
38        mov     rcx, r9
39        mov     r8, qword ptr [rbp+0x68]
40        movzx   r9, byte ptr [rbp+0x70]
41        neg     r9d
42        movd    xmm0, r9d
43        pshufd  xmm0, xmm0, 0x00
44        movdqa  xmmword ptr [rsp+0x130], xmm0
45        movdqa  xmm1, xmm0
46        pand    xmm1, xmmword ptr [ADD0+rip]
47        pand    xmm0, xmmword ptr [ADD1+rip]
48        movdqa  xmmword ptr [rsp+0x150], xmm0
49        movd    xmm0, r8d
50        pshufd  xmm0, xmm0, 0x00
51        paddd   xmm0, xmm1
52        movdqa  xmmword ptr [rsp+0x110], xmm0
53        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
54        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
55        pcmpgtd xmm1, xmm0
56        shr     r8, 32
57        movd    xmm2, r8d
58        pshufd  xmm2, xmm2, 0x00
59        psubd   xmm2, xmm1
60        movdqa  xmmword ptr [rsp+0x120], xmm2
61        mov     rbx, qword ptr [rbp+0x90]
62        mov     r15, rdx
63        shl     r15, 6
64        movzx   r13d, byte ptr [rbp+0x78]
65        movzx   r12d, byte ptr [rbp+0x88]
66        cmp     rsi, 4
67        jc      3f
682:
69        movdqu  xmm3, xmmword ptr [rcx]
70        pshufd  xmm0, xmm3, 0x00
71        pshufd  xmm1, xmm3, 0x55
72        pshufd  xmm2, xmm3, 0xAA
73        pshufd  xmm3, xmm3, 0xFF
74        movdqu  xmm7, xmmword ptr [rcx+0x10]
75        pshufd  xmm4, xmm7, 0x00
76        pshufd  xmm5, xmm7, 0x55
77        pshufd  xmm6, xmm7, 0xAA
78        pshufd  xmm7, xmm7, 0xFF
79        mov     r8, qword ptr [rdi]
80        mov     r9, qword ptr [rdi+0x8]
81        mov     r10, qword ptr [rdi+0x10]
82        mov     r11, qword ptr [rdi+0x18]
83        movzx   eax, byte ptr [rbp+0x80]
84        or      eax, r13d
85        xor     edx, edx
869:
87        mov     r14d, eax
88        or      eax, r12d
89        add     rdx, 64
90        cmp     rdx, r15
91        cmovne  eax, r14d
92        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
93        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
94        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
95        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
96        movdqa  xmm12, xmm8
97        punpckldq xmm8, xmm9
98        punpckhdq xmm12, xmm9
99        movdqa  xmm14, xmm10
100        punpckldq xmm10, xmm11
101        punpckhdq xmm14, xmm11
102        movdqa  xmm9, xmm8
103        punpcklqdq xmm8, xmm10
104        punpckhqdq xmm9, xmm10
105        movdqa  xmm13, xmm12
106        punpcklqdq xmm12, xmm14
107        punpckhqdq xmm13, xmm14
108        movdqa  xmmword ptr [rsp], xmm8
109        movdqa  xmmword ptr [rsp+0x10], xmm9
110        movdqa  xmmword ptr [rsp+0x20], xmm12
111        movdqa  xmmword ptr [rsp+0x30], xmm13
112        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
113        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
114        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
115        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
116        movdqa  xmm12, xmm8
117        punpckldq xmm8, xmm9
118        punpckhdq xmm12, xmm9
119        movdqa  xmm14, xmm10
120        punpckldq xmm10, xmm11
121        punpckhdq xmm14, xmm11
122        movdqa  xmm9, xmm8
123        punpcklqdq xmm8, xmm10
124        punpckhqdq xmm9, xmm10
125        movdqa  xmm13, xmm12
126        punpcklqdq xmm12, xmm14
127        punpckhqdq xmm13, xmm14
128        movdqa  xmmword ptr [rsp+0x40], xmm8
129        movdqa  xmmword ptr [rsp+0x50], xmm9
130        movdqa  xmmword ptr [rsp+0x60], xmm12
131        movdqa  xmmword ptr [rsp+0x70], xmm13
132        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
133        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
134        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
135        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
136        movdqa  xmm12, xmm8
137        punpckldq xmm8, xmm9
138        punpckhdq xmm12, xmm9
139        movdqa  xmm14, xmm10
140        punpckldq xmm10, xmm11
141        punpckhdq xmm14, xmm11
142        movdqa  xmm9, xmm8
143        punpcklqdq xmm8, xmm10
144        punpckhqdq xmm9, xmm10
145        movdqa  xmm13, xmm12
146        punpcklqdq xmm12, xmm14
147        punpckhqdq xmm13, xmm14
148        movdqa  xmmword ptr [rsp+0x80], xmm8
149        movdqa  xmmword ptr [rsp+0x90], xmm9
150        movdqa  xmmword ptr [rsp+0xA0], xmm12
151        movdqa  xmmword ptr [rsp+0xB0], xmm13
152        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
153        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
154        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
155        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
156        movdqa  xmm12, xmm8
157        punpckldq xmm8, xmm9
158        punpckhdq xmm12, xmm9
159        movdqa  xmm14, xmm10
160        punpckldq xmm10, xmm11
161        punpckhdq xmm14, xmm11
162        movdqa  xmm9, xmm8
163        punpcklqdq xmm8, xmm10
164        punpckhqdq xmm9, xmm10
165        movdqa  xmm13, xmm12
166        punpcklqdq xmm12, xmm14
167        punpckhqdq xmm13, xmm14
168        movdqa  xmmword ptr [rsp+0xC0], xmm8
169        movdqa  xmmword ptr [rsp+0xD0], xmm9
170        movdqa  xmmword ptr [rsp+0xE0], xmm12
171        movdqa  xmmword ptr [rsp+0xF0], xmm13
172        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
173        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
174        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
175        movdqa  xmm12, xmmword ptr [rsp+0x110]
176        movdqa  xmm13, xmmword ptr [rsp+0x120]
177        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
178        movd    xmm15, eax
179        pshufd  xmm15, xmm15, 0x00
180        prefetcht0 [r8+rdx+0x80]
181        prefetcht0 [r9+rdx+0x80]
182        prefetcht0 [r10+rdx+0x80]
183        prefetcht0 [r11+rdx+0x80]
184        paddd   xmm0, xmmword ptr [rsp]
185        paddd   xmm1, xmmword ptr [rsp+0x20]
186        paddd   xmm2, xmmword ptr [rsp+0x40]
187        paddd   xmm3, xmmword ptr [rsp+0x60]
188        paddd   xmm0, xmm4
189        paddd   xmm1, xmm5
190        paddd   xmm2, xmm6
191        paddd   xmm3, xmm7
192        pxor    xmm12, xmm0
193        pxor    xmm13, xmm1
194        pxor    xmm14, xmm2
195        pxor    xmm15, xmm3
196        pshuflw xmm12, xmm12, 0xB1
197        pshufhw xmm12, xmm12, 0xB1
198        pshuflw xmm13, xmm13, 0xB1
199        pshufhw xmm13, xmm13, 0xB1
200        pshuflw xmm14, xmm14, 0xB1
201        pshufhw xmm14, xmm14, 0xB1
202        pshuflw xmm15, xmm15, 0xB1
203        pshufhw xmm15, xmm15, 0xB1
204        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
205        paddd   xmm8, xmm12
206        paddd   xmm9, xmm13
207        paddd   xmm10, xmm14
208        paddd   xmm11, xmm15
209        pxor    xmm4, xmm8
210        pxor    xmm5, xmm9
211        pxor    xmm6, xmm10
212        pxor    xmm7, xmm11
213        movdqa  xmmword ptr [rsp+0x100], xmm8
214        movdqa  xmm8, xmm4
215        psrld   xmm8, 12
216        pslld   xmm4, 20
217        por     xmm4, xmm8
218        movdqa  xmm8, xmm5
219        psrld   xmm8, 12
220        pslld   xmm5, 20
221        por     xmm5, xmm8
222        movdqa  xmm8, xmm6
223        psrld   xmm8, 12
224        pslld   xmm6, 20
225        por     xmm6, xmm8
226        movdqa  xmm8, xmm7
227        psrld   xmm8, 12
228        pslld   xmm7, 20
229        por     xmm7, xmm8
230        paddd   xmm0, xmmword ptr [rsp+0x10]
231        paddd   xmm1, xmmword ptr [rsp+0x30]
232        paddd   xmm2, xmmword ptr [rsp+0x50]
233        paddd   xmm3, xmmword ptr [rsp+0x70]
234        paddd   xmm0, xmm4
235        paddd   xmm1, xmm5
236        paddd   xmm2, xmm6
237        paddd   xmm3, xmm7
238        pxor    xmm12, xmm0
239        pxor    xmm13, xmm1
240        pxor    xmm14, xmm2
241        pxor    xmm15, xmm3
242        movdqa  xmm8, xmm12
243        psrld   xmm12, 8
244        pslld   xmm8, 24
245        pxor    xmm12, xmm8
246        movdqa  xmm8, xmm13
247        psrld   xmm13, 8
248        pslld   xmm8, 24
249        pxor    xmm13, xmm8
250        movdqa  xmm8, xmm14
251        psrld   xmm14, 8
252        pslld   xmm8, 24
253        pxor    xmm14, xmm8
254        movdqa  xmm8, xmm15
255        psrld   xmm15, 8
256        pslld   xmm8, 24
257        pxor    xmm15, xmm8
258        movdqa  xmm8, xmmword ptr [rsp+0x100]
259        paddd   xmm8, xmm12
260        paddd   xmm9, xmm13
261        paddd   xmm10, xmm14
262        paddd   xmm11, xmm15
263        pxor    xmm4, xmm8
264        pxor    xmm5, xmm9
265        pxor    xmm6, xmm10
266        pxor    xmm7, xmm11
267        movdqa  xmmword ptr [rsp+0x100], xmm8
268        movdqa  xmm8, xmm4
269        psrld   xmm8, 7
270        pslld   xmm4, 25
271        por     xmm4, xmm8
272        movdqa  xmm8, xmm5
273        psrld   xmm8, 7
274        pslld   xmm5, 25
275        por     xmm5, xmm8
276        movdqa  xmm8, xmm6
277        psrld   xmm8, 7
278        pslld   xmm6, 25
279        por     xmm6, xmm8
280        movdqa  xmm8, xmm7
281        psrld   xmm8, 7
282        pslld   xmm7, 25
283        por     xmm7, xmm8
284        paddd   xmm0, xmmword ptr [rsp+0x80]
285        paddd   xmm1, xmmword ptr [rsp+0xA0]
286        paddd   xmm2, xmmword ptr [rsp+0xC0]
287        paddd   xmm3, xmmword ptr [rsp+0xE0]
288        paddd   xmm0, xmm5
289        paddd   xmm1, xmm6
290        paddd   xmm2, xmm7
291        paddd   xmm3, xmm4
292        pxor    xmm15, xmm0
293        pxor    xmm12, xmm1
294        pxor    xmm13, xmm2
295        pxor    xmm14, xmm3
296        pshuflw xmm15, xmm15, 0xB1
297        pshufhw xmm15, xmm15, 0xB1
298        pshuflw xmm12, xmm12, 0xB1
299        pshufhw xmm12, xmm12, 0xB1
300        pshuflw xmm13, xmm13, 0xB1
301        pshufhw xmm13, xmm13, 0xB1
302        pshuflw xmm14, xmm14, 0xB1
303        pshufhw xmm14, xmm14, 0xB1
304        paddd   xmm10, xmm15
305        paddd   xmm11, xmm12
306        movdqa  xmm8, xmmword ptr [rsp+0x100]
307        paddd   xmm8, xmm13
308        paddd   xmm9, xmm14
309        pxor    xmm5, xmm10
310        pxor    xmm6, xmm11
311        pxor    xmm7, xmm8
312        pxor    xmm4, xmm9
313        movdqa  xmmword ptr [rsp+0x100], xmm8
314        movdqa  xmm8, xmm5
315        psrld   xmm8, 12
316        pslld   xmm5, 20
317        por     xmm5, xmm8
318        movdqa  xmm8, xmm6
319        psrld   xmm8, 12
320        pslld   xmm6, 20
321        por     xmm6, xmm8
322        movdqa  xmm8, xmm7
323        psrld   xmm8, 12
324        pslld   xmm7, 20
325        por     xmm7, xmm8
326        movdqa  xmm8, xmm4
327        psrld   xmm8, 12
328        pslld   xmm4, 20
329        por     xmm4, xmm8
330        paddd   xmm0, xmmword ptr [rsp+0x90]
331        paddd   xmm1, xmmword ptr [rsp+0xB0]
332        paddd   xmm2, xmmword ptr [rsp+0xD0]
333        paddd   xmm3, xmmword ptr [rsp+0xF0]
334        paddd   xmm0, xmm5
335        paddd   xmm1, xmm6
336        paddd   xmm2, xmm7
337        paddd   xmm3, xmm4
338        pxor    xmm15, xmm0
339        pxor    xmm12, xmm1
340        pxor    xmm13, xmm2
341        pxor    xmm14, xmm3
342        movdqa  xmm8, xmm15
343        psrld   xmm15, 8
344        pslld   xmm8, 24
345        pxor    xmm15, xmm8
346        movdqa  xmm8, xmm12
347        psrld   xmm12, 8
348        pslld   xmm8, 24
349        pxor    xmm12, xmm8
350        movdqa  xmm8, xmm13
351        psrld   xmm13, 8
352        pslld   xmm8, 24
353        pxor    xmm13, xmm8
354        movdqa  xmm8, xmm14
355        psrld   xmm14, 8
356        pslld   xmm8, 24
357        pxor    xmm14, xmm8
358        paddd   xmm10, xmm15
359        paddd   xmm11, xmm12
360        movdqa  xmm8, xmmword ptr [rsp+0x100]
361        paddd   xmm8, xmm13
362        paddd   xmm9, xmm14
363        pxor    xmm5, xmm10
364        pxor    xmm6, xmm11
365        pxor    xmm7, xmm8
366        pxor    xmm4, xmm9
367        movdqa  xmmword ptr [rsp+0x100], xmm8
368        movdqa  xmm8, xmm5
369        psrld   xmm8, 7
370        pslld   xmm5, 25
371        por     xmm5, xmm8
372        movdqa  xmm8, xmm6
373        psrld   xmm8, 7
374        pslld   xmm6, 25
375        por     xmm6, xmm8
376        movdqa  xmm8, xmm7
377        psrld   xmm8, 7
378        pslld   xmm7, 25
379        por     xmm7, xmm8
380        movdqa  xmm8, xmm4
381        psrld   xmm8, 7
382        pslld   xmm4, 25
383        por     xmm4, xmm8
384        paddd   xmm0, xmmword ptr [rsp+0x20]
385        paddd   xmm1, xmmword ptr [rsp+0x30]
386        paddd   xmm2, xmmword ptr [rsp+0x70]
387        paddd   xmm3, xmmword ptr [rsp+0x40]
388        paddd   xmm0, xmm4
389        paddd   xmm1, xmm5
390        paddd   xmm2, xmm6
391        paddd   xmm3, xmm7
392        pxor    xmm12, xmm0
393        pxor    xmm13, xmm1
394        pxor    xmm14, xmm2
395        pxor    xmm15, xmm3
396        pshuflw xmm12, xmm12, 0xB1
397        pshufhw xmm12, xmm12, 0xB1
398        pshuflw xmm13, xmm13, 0xB1
399        pshufhw xmm13, xmm13, 0xB1
400        pshuflw xmm14, xmm14, 0xB1
401        pshufhw xmm14, xmm14, 0xB1
402        pshuflw xmm15, xmm15, 0xB1
403        pshufhw xmm15, xmm15, 0xB1
404        movdqa  xmm8, xmmword ptr [rsp+0x100]
405        paddd   xmm8, xmm12
406        paddd   xmm9, xmm13
407        paddd   xmm10, xmm14
408        paddd   xmm11, xmm15
409        pxor    xmm4, xmm8
410        pxor    xmm5, xmm9
411        pxor    xmm6, xmm10
412        pxor    xmm7, xmm11
413        movdqa  xmmword ptr [rsp+0x100], xmm8
414        movdqa  xmm8, xmm4
415        psrld   xmm8, 12
416        pslld   xmm4, 20
417        por     xmm4, xmm8
418        movdqa  xmm8, xmm5
419        psrld   xmm8, 12
420        pslld   xmm5, 20
421        por     xmm5, xmm8
422        movdqa  xmm8, xmm6
423        psrld   xmm8, 12
424        pslld   xmm6, 20
425        por     xmm6, xmm8
426        movdqa  xmm8, xmm7
427        psrld   xmm8, 12
428        pslld   xmm7, 20
429        por     xmm7, xmm8
430        paddd   xmm0, xmmword ptr [rsp+0x60]
431        paddd   xmm1, xmmword ptr [rsp+0xA0]
432        paddd   xmm2, xmmword ptr [rsp]
433        paddd   xmm3, xmmword ptr [rsp+0xD0]
434        paddd   xmm0, xmm4
435        paddd   xmm1, xmm5
436        paddd   xmm2, xmm6
437        paddd   xmm3, xmm7
438        pxor    xmm12, xmm0
439        pxor    xmm13, xmm1
440        pxor    xmm14, xmm2
441        pxor    xmm15, xmm3
442        movdqa  xmm8, xmm12
443        psrld   xmm12, 8
444        pslld   xmm8, 24
445        pxor    xmm12, xmm8
446        movdqa  xmm8, xmm13
447        psrld   xmm13, 8
448        pslld   xmm8, 24
449        pxor    xmm13, xmm8
450        movdqa  xmm8, xmm14
451        psrld   xmm14, 8
452        pslld   xmm8, 24
453        pxor    xmm14, xmm8
454        movdqa  xmm8, xmm15
455        psrld   xmm15, 8
456        pslld   xmm8, 24
457        pxor    xmm15, xmm8
458        movdqa  xmm8, xmmword ptr [rsp+0x100]
459        paddd   xmm8, xmm12
460        paddd   xmm9, xmm13
461        paddd   xmm10, xmm14
462        paddd   xmm11, xmm15
463        pxor    xmm4, xmm8
464        pxor    xmm5, xmm9
465        pxor    xmm6, xmm10
466        pxor    xmm7, xmm11
467        movdqa  xmmword ptr [rsp+0x100], xmm8
468        movdqa  xmm8, xmm4
469        psrld   xmm8, 7
470        pslld   xmm4, 25
471        por     xmm4, xmm8
472        movdqa  xmm8, xmm5
473        psrld   xmm8, 7
474        pslld   xmm5, 25
475        por     xmm5, xmm8
476        movdqa  xmm8, xmm6
477        psrld   xmm8, 7
478        pslld   xmm6, 25
479        por     xmm6, xmm8
480        movdqa  xmm8, xmm7
481        psrld   xmm8, 7
482        pslld   xmm7, 25
483        por     xmm7, xmm8
484        paddd   xmm0, xmmword ptr [rsp+0x10]
485        paddd   xmm1, xmmword ptr [rsp+0xC0]
486        paddd   xmm2, xmmword ptr [rsp+0x90]
487        paddd   xmm3, xmmword ptr [rsp+0xF0]
488        paddd   xmm0, xmm5
489        paddd   xmm1, xmm6
490        paddd   xmm2, xmm7
491        paddd   xmm3, xmm4
492        pxor    xmm15, xmm0
493        pxor    xmm12, xmm1
494        pxor    xmm13, xmm2
495        pxor    xmm14, xmm3
496        pshuflw xmm15, xmm15, 0xB1
497        pshufhw xmm15, xmm15, 0xB1
498        pshuflw xmm12, xmm12, 0xB1
499        pshufhw xmm12, xmm12, 0xB1
500        pshuflw xmm13, xmm13, 0xB1
501        pshufhw xmm13, xmm13, 0xB1
502        pshuflw xmm14, xmm14, 0xB1
503        pshufhw xmm14, xmm14, 0xB1
504        paddd   xmm10, xmm15
505        paddd   xmm11, xmm12
506        movdqa  xmm8, xmmword ptr [rsp+0x100]
507        paddd   xmm8, xmm13
508        paddd   xmm9, xmm14
509        pxor    xmm5, xmm10
510        pxor    xmm6, xmm11
511        pxor    xmm7, xmm8
512        pxor    xmm4, xmm9
513        movdqa  xmmword ptr [rsp+0x100], xmm8
514        movdqa  xmm8, xmm5
515        psrld   xmm8, 12
516        pslld   xmm5, 20
517        por     xmm5, xmm8
518        movdqa  xmm8, xmm6
519        psrld   xmm8, 12
520        pslld   xmm6, 20
521        por     xmm6, xmm8
522        movdqa  xmm8, xmm7
523        psrld   xmm8, 12
524        pslld   xmm7, 20
525        por     xmm7, xmm8
526        movdqa  xmm8, xmm4
527        psrld   xmm8, 12
528        pslld   xmm4, 20
529        por     xmm4, xmm8
530        paddd   xmm0, xmmword ptr [rsp+0xB0]
531        paddd   xmm1, xmmword ptr [rsp+0x50]
532        paddd   xmm2, xmmword ptr [rsp+0xE0]
533        paddd   xmm3, xmmword ptr [rsp+0x80]
534        paddd   xmm0, xmm5
535        paddd   xmm1, xmm6
536        paddd   xmm2, xmm7
537        paddd   xmm3, xmm4
538        pxor    xmm15, xmm0
539        pxor    xmm12, xmm1
540        pxor    xmm13, xmm2
541        pxor    xmm14, xmm3
542        movdqa  xmm8, xmm15
543        psrld   xmm15, 8
544        pslld   xmm8, 24
545        pxor    xmm15, xmm8
546        movdqa  xmm8, xmm12
547        psrld   xmm12, 8
548        pslld   xmm8, 24
549        pxor    xmm12, xmm8
550        movdqa  xmm8, xmm13
551        psrld   xmm13, 8
552        pslld   xmm8, 24
553        pxor    xmm13, xmm8
554        movdqa  xmm8, xmm14
555        psrld   xmm14, 8
556        pslld   xmm8, 24
557        pxor    xmm14, xmm8
558        paddd   xmm10, xmm15
559        paddd   xmm11, xmm12
560        movdqa  xmm8, xmmword ptr [rsp+0x100]
561        paddd   xmm8, xmm13
562        paddd   xmm9, xmm14
563        pxor    xmm5, xmm10
564        pxor    xmm6, xmm11
565        pxor    xmm7, xmm8
566        pxor    xmm4, xmm9
567        movdqa  xmmword ptr [rsp+0x100], xmm8
568        movdqa  xmm8, xmm5
569        psrld   xmm8, 7
570        pslld   xmm5, 25
571        por     xmm5, xmm8
572        movdqa  xmm8, xmm6
573        psrld   xmm8, 7
574        pslld   xmm6, 25
575        por     xmm6, xmm8
576        movdqa  xmm8, xmm7
577        psrld   xmm8, 7
578        pslld   xmm7, 25
579        por     xmm7, xmm8
580        movdqa  xmm8, xmm4
581        psrld   xmm8, 7
582        pslld   xmm4, 25
583        por     xmm4, xmm8
584        paddd   xmm0, xmmword ptr [rsp+0x30]
585        paddd   xmm1, xmmword ptr [rsp+0xA0]
586        paddd   xmm2, xmmword ptr [rsp+0xD0]
587        paddd   xmm3, xmmword ptr [rsp+0x70]
588        paddd   xmm0, xmm4
589        paddd   xmm1, xmm5
590        paddd   xmm2, xmm6
591        paddd   xmm3, xmm7
592        pxor    xmm12, xmm0
593        pxor    xmm13, xmm1
594        pxor    xmm14, xmm2
595        pxor    xmm15, xmm3
596        pshuflw xmm12, xmm12, 0xB1
597        pshufhw xmm12, xmm12, 0xB1
598        pshuflw xmm13, xmm13, 0xB1
599        pshufhw xmm13, xmm13, 0xB1
600        pshuflw xmm14, xmm14, 0xB1
601        pshufhw xmm14, xmm14, 0xB1
602        pshuflw xmm15, xmm15, 0xB1
603        pshufhw xmm15, xmm15, 0xB1
604        movdqa  xmm8, xmmword ptr [rsp+0x100]
605        paddd   xmm8, xmm12
606        paddd   xmm9, xmm13
607        paddd   xmm10, xmm14
608        paddd   xmm11, xmm15
609        pxor    xmm4, xmm8
610        pxor    xmm5, xmm9
611        pxor    xmm6, xmm10
612        pxor    xmm7, xmm11
613        movdqa  xmmword ptr [rsp+0x100], xmm8
614        movdqa  xmm8, xmm4
615        psrld   xmm8, 12
616        pslld   xmm4, 20
617        por     xmm4, xmm8
618        movdqa  xmm8, xmm5
619        psrld   xmm8, 12
620        pslld   xmm5, 20
621        por     xmm5, xmm8
622        movdqa  xmm8, xmm6
623        psrld   xmm8, 12
624        pslld   xmm6, 20
625        por     xmm6, xmm8
626        movdqa  xmm8, xmm7
627        psrld   xmm8, 12
628        pslld   xmm7, 20
629        por     xmm7, xmm8
630        paddd   xmm0, xmmword ptr [rsp+0x40]
631        paddd   xmm1, xmmword ptr [rsp+0xC0]
632        paddd   xmm2, xmmword ptr [rsp+0x20]
633        paddd   xmm3, xmmword ptr [rsp+0xE0]
634        paddd   xmm0, xmm4
635        paddd   xmm1, xmm5
636        paddd   xmm2, xmm6
637        paddd   xmm3, xmm7
638        pxor    xmm12, xmm0
639        pxor    xmm13, xmm1
640        pxor    xmm14, xmm2
641        pxor    xmm15, xmm3
642        movdqa  xmm8, xmm12
643        psrld   xmm12, 8
644        pslld   xmm8, 24
645        pxor    xmm12, xmm8
646        movdqa  xmm8, xmm13
647        psrld   xmm13, 8
648        pslld   xmm8, 24
649        pxor    xmm13, xmm8
650        movdqa  xmm8, xmm14
651        psrld   xmm14, 8
652        pslld   xmm8, 24
653        pxor    xmm14, xmm8
654        movdqa  xmm8, xmm15
655        psrld   xmm15, 8
656        pslld   xmm8, 24
657        pxor    xmm15, xmm8
658        movdqa  xmm8, xmmword ptr [rsp+0x100]
659        paddd   xmm8, xmm12
660        paddd   xmm9, xmm13
661        paddd   xmm10, xmm14
662        paddd   xmm11, xmm15
663        pxor    xmm4, xmm8
664        pxor    xmm5, xmm9
665        pxor    xmm6, xmm10
666        pxor    xmm7, xmm11
667        movdqa  xmmword ptr [rsp+0x100], xmm8
668        movdqa  xmm8, xmm4
669        psrld   xmm8, 7
670        pslld   xmm4, 25
671        por     xmm4, xmm8
672        movdqa  xmm8, xmm5
673        psrld   xmm8, 7
674        pslld   xmm5, 25
675        por     xmm5, xmm8
676        movdqa  xmm8, xmm6
677        psrld   xmm8, 7
678        pslld   xmm6, 25
679        por     xmm6, xmm8
680        movdqa  xmm8, xmm7
681        psrld   xmm8, 7
682        pslld   xmm7, 25
683        por     xmm7, xmm8
684        paddd   xmm0, xmmword ptr [rsp+0x60]
685        paddd   xmm1, xmmword ptr [rsp+0x90]
686        paddd   xmm2, xmmword ptr [rsp+0xB0]
687        paddd   xmm3, xmmword ptr [rsp+0x80]
688        paddd   xmm0, xmm5
689        paddd   xmm1, xmm6
690        paddd   xmm2, xmm7
691        paddd   xmm3, xmm4
692        pxor    xmm15, xmm0
693        pxor    xmm12, xmm1
694        pxor    xmm13, xmm2
695        pxor    xmm14, xmm3
696        pshuflw xmm15, xmm15, 0xB1
697        pshufhw xmm15, xmm15, 0xB1
698        pshuflw xmm12, xmm12, 0xB1
699        pshufhw xmm12, xmm12, 0xB1
700        pshuflw xmm13, xmm13, 0xB1
701        pshufhw xmm13, xmm13, 0xB1
702        pshuflw xmm14, xmm14, 0xB1
703        pshufhw xmm14, xmm14, 0xB1
704        paddd   xmm10, xmm15
705        paddd   xmm11, xmm12
706        movdqa  xmm8, xmmword ptr [rsp+0x100]
707        paddd   xmm8, xmm13
708        paddd   xmm9, xmm14
709        pxor    xmm5, xmm10
710        pxor    xmm6, xmm11
711        pxor    xmm7, xmm8
712        pxor    xmm4, xmm9
713        movdqa  xmmword ptr [rsp+0x100], xmm8
714        movdqa  xmm8, xmm5
715        psrld   xmm8, 12
716        pslld   xmm5, 20
717        por     xmm5, xmm8
718        movdqa  xmm8, xmm6
719        psrld   xmm8, 12
720        pslld   xmm6, 20
721        por     xmm6, xmm8
722        movdqa  xmm8, xmm7
723        psrld   xmm8, 12
724        pslld   xmm7, 20
725        por     xmm7, xmm8
726        movdqa  xmm8, xmm4
727        psrld   xmm8, 12
728        pslld   xmm4, 20
729        por     xmm4, xmm8
730        paddd   xmm0, xmmword ptr [rsp+0x50]
731        paddd   xmm1, xmmword ptr [rsp]
732        paddd   xmm2, xmmword ptr [rsp+0xF0]
733        paddd   xmm3, xmmword ptr [rsp+0x10]
734        paddd   xmm0, xmm5
735        paddd   xmm1, xmm6
736        paddd   xmm2, xmm7
737        paddd   xmm3, xmm4
738        pxor    xmm15, xmm0
739        pxor    xmm12, xmm1
740        pxor    xmm13, xmm2
741        pxor    xmm14, xmm3
742        movdqa  xmm8, xmm15
743        psrld   xmm15, 8
744        pslld   xmm8, 24
745        pxor    xmm15, xmm8
746        movdqa  xmm8, xmm12
747        psrld   xmm12, 8
748        pslld   xmm8, 24
749        pxor    xmm12, xmm8
750        movdqa  xmm8, xmm13
751        psrld   xmm13, 8
752        pslld   xmm8, 24
753        pxor    xmm13, xmm8
754        movdqa  xmm8, xmm14
755        psrld   xmm14, 8
756        pslld   xmm8, 24
757        pxor    xmm14, xmm8
758        paddd   xmm10, xmm15
759        paddd   xmm11, xmm12
760        movdqa  xmm8, xmmword ptr [rsp+0x100]
761        paddd   xmm8, xmm13
762        paddd   xmm9, xmm14
763        pxor    xmm5, xmm10
764        pxor    xmm6, xmm11
765        pxor    xmm7, xmm8
766        pxor    xmm4, xmm9
767        movdqa  xmmword ptr [rsp+0x100], xmm8
768        movdqa  xmm8, xmm5
769        psrld   xmm8, 7
770        pslld   xmm5, 25
771        por     xmm5, xmm8
772        movdqa  xmm8, xmm6
773        psrld   xmm8, 7
774        pslld   xmm6, 25
775        por     xmm6, xmm8
776        movdqa  xmm8, xmm7
777        psrld   xmm8, 7
778        pslld   xmm7, 25
779        por     xmm7, xmm8
780        movdqa  xmm8, xmm4
781        psrld   xmm8, 7
782        pslld   xmm4, 25
783        por     xmm4, xmm8
784        paddd   xmm0, xmmword ptr [rsp+0xA0]
785        paddd   xmm1, xmmword ptr [rsp+0xC0]
786        paddd   xmm2, xmmword ptr [rsp+0xE0]
787        paddd   xmm3, xmmword ptr [rsp+0xD0]
788        paddd   xmm0, xmm4
789        paddd   xmm1, xmm5
790        paddd   xmm2, xmm6
791        paddd   xmm3, xmm7
792        pxor    xmm12, xmm0
793        pxor    xmm13, xmm1
794        pxor    xmm14, xmm2
795        pxor    xmm15, xmm3
796        pshuflw xmm12, xmm12, 0xB1
797        pshufhw xmm12, xmm12, 0xB1
798        pshuflw xmm13, xmm13, 0xB1
799        pshufhw xmm13, xmm13, 0xB1
800        pshuflw xmm14, xmm14, 0xB1
801        pshufhw xmm14, xmm14, 0xB1
802        pshuflw xmm15, xmm15, 0xB1
803        pshufhw xmm15, xmm15, 0xB1
804        movdqa  xmm8, xmmword ptr [rsp+0x100]
805        paddd   xmm8, xmm12
806        paddd   xmm9, xmm13
807        paddd   xmm10, xmm14
808        paddd   xmm11, xmm15
809        pxor    xmm4, xmm8
810        pxor    xmm5, xmm9
811        pxor    xmm6, xmm10
812        pxor    xmm7, xmm11
813        movdqa  xmmword ptr [rsp+0x100], xmm8
814        movdqa  xmm8, xmm4
815        psrld   xmm8, 12
816        pslld   xmm4, 20
817        por     xmm4, xmm8
818        movdqa  xmm8, xmm5
819        psrld   xmm8, 12
820        pslld   xmm5, 20
821        por     xmm5, xmm8
822        movdqa  xmm8, xmm6
823        psrld   xmm8, 12
824        pslld   xmm6, 20
825        por     xmm6, xmm8
826        movdqa  xmm8, xmm7
827        psrld   xmm8, 12
828        pslld   xmm7, 20
829        por     xmm7, xmm8
830        paddd   xmm0, xmmword ptr [rsp+0x70]
831        paddd   xmm1, xmmword ptr [rsp+0x90]
832        paddd   xmm2, xmmword ptr [rsp+0x30]
833        paddd   xmm3, xmmword ptr [rsp+0xF0]
834        paddd   xmm0, xmm4
835        paddd   xmm1, xmm5
836        paddd   xmm2, xmm6
837        paddd   xmm3, xmm7
838        pxor    xmm12, xmm0
839        pxor    xmm13, xmm1
840        pxor    xmm14, xmm2
841        pxor    xmm15, xmm3
842        movdqa  xmm8, xmm12
843        psrld   xmm12, 8
844        pslld   xmm8, 24
845        pxor    xmm12, xmm8
846        movdqa  xmm8, xmm13
847        psrld   xmm13, 8
848        pslld   xmm8, 24
849        pxor    xmm13, xmm8
850        movdqa  xmm8, xmm14
851        psrld   xmm14, 8
852        pslld   xmm8, 24
853        pxor    xmm14, xmm8
854        movdqa  xmm8, xmm15
855        psrld   xmm15, 8
856        pslld   xmm8, 24
857        pxor    xmm15, xmm8
858        movdqa  xmm8, xmmword ptr [rsp+0x100]
859        paddd   xmm8, xmm12
860        paddd   xmm9, xmm13
861        paddd   xmm10, xmm14
862        paddd   xmm11, xmm15
863        pxor    xmm4, xmm8
864        pxor    xmm5, xmm9
865        pxor    xmm6, xmm10
866        pxor    xmm7, xmm11
867        movdqa  xmmword ptr [rsp+0x100], xmm8
868        movdqa  xmm8, xmm4
869        psrld   xmm8, 7
870        pslld   xmm4, 25
871        por     xmm4, xmm8
872        movdqa  xmm8, xmm5
873        psrld   xmm8, 7
874        pslld   xmm5, 25
875        por     xmm5, xmm8
876        movdqa  xmm8, xmm6
877        psrld   xmm8, 7
878        pslld   xmm6, 25
879        por     xmm6, xmm8
880        movdqa  xmm8, xmm7
881        psrld   xmm8, 7
882        pslld   xmm7, 25
883        por     xmm7, xmm8
884        paddd   xmm0, xmmword ptr [rsp+0x40]
885        paddd   xmm1, xmmword ptr [rsp+0xB0]
886        paddd   xmm2, xmmword ptr [rsp+0x50]
887        paddd   xmm3, xmmword ptr [rsp+0x10]
888        paddd   xmm0, xmm5
889        paddd   xmm1, xmm6
890        paddd   xmm2, xmm7
891        paddd   xmm3, xmm4
892        pxor    xmm15, xmm0
893        pxor    xmm12, xmm1
894        pxor    xmm13, xmm2
895        pxor    xmm14, xmm3
896        pshuflw xmm15, xmm15, 0xB1
897        pshufhw xmm15, xmm15, 0xB1
898        pshuflw xmm12, xmm12, 0xB1
899        pshufhw xmm12, xmm12, 0xB1
900        pshuflw xmm13, xmm13, 0xB1
901        pshufhw xmm13, xmm13, 0xB1
902        pshuflw xmm14, xmm14, 0xB1
903        pshufhw xmm14, xmm14, 0xB1
904        paddd   xmm10, xmm15
905        paddd   xmm11, xmm12
906        movdqa  xmm8, xmmword ptr [rsp+0x100]
907        paddd   xmm8, xmm13
908        paddd   xmm9, xmm14
909        pxor    xmm5, xmm10
910        pxor    xmm6, xmm11
911        pxor    xmm7, xmm8
912        pxor    xmm4, xmm9
913        movdqa  xmmword ptr [rsp+0x100], xmm8
914        movdqa  xmm8, xmm5
915        psrld   xmm8, 12
916        pslld   xmm5, 20
917        por     xmm5, xmm8
918        movdqa  xmm8, xmm6
919        psrld   xmm8, 12
920        pslld   xmm6, 20
921        por     xmm6, xmm8
922        movdqa  xmm8, xmm7
923        psrld   xmm8, 12
924        pslld   xmm7, 20
925        por     xmm7, xmm8
926        movdqa  xmm8, xmm4
927        psrld   xmm8, 12
928        pslld   xmm4, 20
929        por     xmm4, xmm8
930        paddd   xmm0, xmmword ptr [rsp]
931        paddd   xmm1, xmmword ptr [rsp+0x20]
932        paddd   xmm2, xmmword ptr [rsp+0x80]
933        paddd   xmm3, xmmword ptr [rsp+0x60]
934        paddd   xmm0, xmm5
935        paddd   xmm1, xmm6
936        paddd   xmm2, xmm7
937        paddd   xmm3, xmm4
938        pxor    xmm15, xmm0
939        pxor    xmm12, xmm1
940        pxor    xmm13, xmm2
941        pxor    xmm14, xmm3
942        movdqa  xmm8, xmm15
943        psrld   xmm15, 8
944        pslld   xmm8, 24
945        pxor    xmm15, xmm8
946        movdqa  xmm8, xmm12
947        psrld   xmm12, 8
948        pslld   xmm8, 24
949        pxor    xmm12, xmm8
950        movdqa  xmm8, xmm13
951        psrld   xmm13, 8
952        pslld   xmm8, 24
953        pxor    xmm13, xmm8
954        movdqa  xmm8, xmm14
955        psrld   xmm14, 8
956        pslld   xmm8, 24
957        pxor    xmm14, xmm8
958        paddd   xmm10, xmm15
959        paddd   xmm11, xmm12
960        movdqa  xmm8, xmmword ptr [rsp+0x100]
961        paddd   xmm8, xmm13
962        paddd   xmm9, xmm14
963        pxor    xmm5, xmm10
964        pxor    xmm6, xmm11
965        pxor    xmm7, xmm8
966        pxor    xmm4, xmm9
967        movdqa  xmmword ptr [rsp+0x100], xmm8
968        movdqa  xmm8, xmm5
969        psrld   xmm8, 7
970        pslld   xmm5, 25
971        por     xmm5, xmm8
972        movdqa  xmm8, xmm6
973        psrld   xmm8, 7
974        pslld   xmm6, 25
975        por     xmm6, xmm8
976        movdqa  xmm8, xmm7
977        psrld   xmm8, 7
978        pslld   xmm7, 25
979        por     xmm7, xmm8
980        movdqa  xmm8, xmm4
981        psrld   xmm8, 7
982        pslld   xmm4, 25
983        por     xmm4, xmm8
984        paddd   xmm0, xmmword ptr [rsp+0xC0]
985        paddd   xmm1, xmmword ptr [rsp+0x90]
986        paddd   xmm2, xmmword ptr [rsp+0xF0]
987        paddd   xmm3, xmmword ptr [rsp+0xE0]
988        paddd   xmm0, xmm4
989        paddd   xmm1, xmm5
990        paddd   xmm2, xmm6
991        paddd   xmm3, xmm7
992        pxor    xmm12, xmm0
993        pxor    xmm13, xmm1
994        pxor    xmm14, xmm2
995        pxor    xmm15, xmm3
996        pshuflw xmm12, xmm12, 0xB1
997        pshufhw xmm12, xmm12, 0xB1
998        pshuflw xmm13, xmm13, 0xB1
999        pshufhw xmm13, xmm13, 0xB1
1000        pshuflw xmm14, xmm14, 0xB1
1001        pshufhw xmm14, xmm14, 0xB1
1002        pshuflw xmm15, xmm15, 0xB1
1003        pshufhw xmm15, xmm15, 0xB1
1004        movdqa  xmm8, xmmword ptr [rsp+0x100]
1005        paddd   xmm8, xmm12
1006        paddd   xmm9, xmm13
1007        paddd   xmm10, xmm14
1008        paddd   xmm11, xmm15
1009        pxor    xmm4, xmm8
1010        pxor    xmm5, xmm9
1011        pxor    xmm6, xmm10
1012        pxor    xmm7, xmm11
1013        movdqa  xmmword ptr [rsp+0x100], xmm8
1014        movdqa  xmm8, xmm4
1015        psrld   xmm8, 12
1016        pslld   xmm4, 20
1017        por     xmm4, xmm8
1018        movdqa  xmm8, xmm5
1019        psrld   xmm8, 12
1020        pslld   xmm5, 20
1021        por     xmm5, xmm8
1022        movdqa  xmm8, xmm6
1023        psrld   xmm8, 12
1024        pslld   xmm6, 20
1025        por     xmm6, xmm8
1026        movdqa  xmm8, xmm7
1027        psrld   xmm8, 12
1028        pslld   xmm7, 20
1029        por     xmm7, xmm8
1030        paddd   xmm0, xmmword ptr [rsp+0xD0]
1031        paddd   xmm1, xmmword ptr [rsp+0xB0]
1032        paddd   xmm2, xmmword ptr [rsp+0xA0]
1033        paddd   xmm3, xmmword ptr [rsp+0x80]
1034        paddd   xmm0, xmm4
1035        paddd   xmm1, xmm5
1036        paddd   xmm2, xmm6
1037        paddd   xmm3, xmm7
1038        pxor    xmm12, xmm0
1039        pxor    xmm13, xmm1
1040        pxor    xmm14, xmm2
1041        pxor    xmm15, xmm3
1042        movdqa  xmm8, xmm12
1043        psrld   xmm12, 8
1044        pslld   xmm8, 24
1045        pxor    xmm12, xmm8
1046        movdqa  xmm8, xmm13
1047        psrld   xmm13, 8
1048        pslld   xmm8, 24
1049        pxor    xmm13, xmm8
1050        movdqa  xmm8, xmm14
1051        psrld   xmm14, 8
1052        pslld   xmm8, 24
1053        pxor    xmm14, xmm8
1054        movdqa  xmm8, xmm15
1055        psrld   xmm15, 8
1056        pslld   xmm8, 24
1057        pxor    xmm15, xmm8
1058        movdqa  xmm8, xmmword ptr [rsp+0x100]
1059        paddd   xmm8, xmm12
1060        paddd   xmm9, xmm13
1061        paddd   xmm10, xmm14
1062        paddd   xmm11, xmm15
1063        pxor    xmm4, xmm8
1064        pxor    xmm5, xmm9
1065        pxor    xmm6, xmm10
1066        pxor    xmm7, xmm11
1067        movdqa  xmmword ptr [rsp+0x100], xmm8
1068        movdqa  xmm8, xmm4
1069        psrld   xmm8, 7
1070        pslld   xmm4, 25
1071        por     xmm4, xmm8
1072        movdqa  xmm8, xmm5
1073        psrld   xmm8, 7
1074        pslld   xmm5, 25
1075        por     xmm5, xmm8
1076        movdqa  xmm8, xmm6
1077        psrld   xmm8, 7
1078        pslld   xmm6, 25
1079        por     xmm6, xmm8
1080        movdqa  xmm8, xmm7
1081        psrld   xmm8, 7
1082        pslld   xmm7, 25
1083        por     xmm7, xmm8
1084        paddd   xmm0, xmmword ptr [rsp+0x70]
1085        paddd   xmm1, xmmword ptr [rsp+0x50]
1086        paddd   xmm2, xmmword ptr [rsp]
1087        paddd   xmm3, xmmword ptr [rsp+0x60]
1088        paddd   xmm0, xmm5
1089        paddd   xmm1, xmm6
1090        paddd   xmm2, xmm7
1091        paddd   xmm3, xmm4
1092        pxor    xmm15, xmm0
1093        pxor    xmm12, xmm1
1094        pxor    xmm13, xmm2
1095        pxor    xmm14, xmm3
1096        pshuflw xmm15, xmm15, 0xB1
1097        pshufhw xmm15, xmm15, 0xB1
1098        pshuflw xmm12, xmm12, 0xB1
1099        pshufhw xmm12, xmm12, 0xB1
1100        pshuflw xmm13, xmm13, 0xB1
1101        pshufhw xmm13, xmm13, 0xB1
1102        pshuflw xmm14, xmm14, 0xB1
1103        pshufhw xmm14, xmm14, 0xB1
1104        paddd   xmm10, xmm15
1105        paddd   xmm11, xmm12
1106        movdqa  xmm8, xmmword ptr [rsp+0x100]
1107        paddd   xmm8, xmm13
1108        paddd   xmm9, xmm14
1109        pxor    xmm5, xmm10
1110        pxor    xmm6, xmm11
1111        pxor    xmm7, xmm8
1112        pxor    xmm4, xmm9
1113        movdqa  xmmword ptr [rsp+0x100], xmm8
1114        movdqa  xmm8, xmm5
1115        psrld   xmm8, 12
1116        pslld   xmm5, 20
1117        por     xmm5, xmm8
1118        movdqa  xmm8, xmm6
1119        psrld   xmm8, 12
1120        pslld   xmm6, 20
1121        por     xmm6, xmm8
1122        movdqa  xmm8, xmm7
1123        psrld   xmm8, 12
1124        pslld   xmm7, 20
1125        por     xmm7, xmm8
1126        movdqa  xmm8, xmm4
1127        psrld   xmm8, 12
1128        pslld   xmm4, 20
1129        por     xmm4, xmm8
1130        paddd   xmm0, xmmword ptr [rsp+0x20]
1131        paddd   xmm1, xmmword ptr [rsp+0x30]
1132        paddd   xmm2, xmmword ptr [rsp+0x10]
1133        paddd   xmm3, xmmword ptr [rsp+0x40]
1134        paddd   xmm0, xmm5
1135        paddd   xmm1, xmm6
1136        paddd   xmm2, xmm7
1137        paddd   xmm3, xmm4
1138        pxor    xmm15, xmm0
1139        pxor    xmm12, xmm1
1140        pxor    xmm13, xmm2
1141        pxor    xmm14, xmm3
1142        movdqa  xmm8, xmm15
1143        psrld   xmm15, 8
1144        pslld   xmm8, 24
1145        pxor    xmm15, xmm8
1146        movdqa  xmm8, xmm12
1147        psrld   xmm12, 8
1148        pslld   xmm8, 24
1149        pxor    xmm12, xmm8
1150        movdqa  xmm8, xmm13
1151        psrld   xmm13, 8
1152        pslld   xmm8, 24
1153        pxor    xmm13, xmm8
1154        movdqa  xmm8, xmm14
1155        psrld   xmm14, 8
1156        pslld   xmm8, 24
1157        pxor    xmm14, xmm8
1158        paddd   xmm10, xmm15
1159        paddd   xmm11, xmm12
1160        movdqa  xmm8, xmmword ptr [rsp+0x100]
1161        paddd   xmm8, xmm13
1162        paddd   xmm9, xmm14
1163        pxor    xmm5, xmm10
1164        pxor    xmm6, xmm11
1165        pxor    xmm7, xmm8
1166        pxor    xmm4, xmm9
1167        movdqa  xmmword ptr [rsp+0x100], xmm8
1168        movdqa  xmm8, xmm5
1169        psrld   xmm8, 7
1170        pslld   xmm5, 25
1171        por     xmm5, xmm8
1172        movdqa  xmm8, xmm6
1173        psrld   xmm8, 7
1174        pslld   xmm6, 25
1175        por     xmm6, xmm8
1176        movdqa  xmm8, xmm7
1177        psrld   xmm8, 7
1178        pslld   xmm7, 25
1179        por     xmm7, xmm8
1180        movdqa  xmm8, xmm4
1181        psrld   xmm8, 7
1182        pslld   xmm4, 25
1183        por     xmm4, xmm8
1184        paddd   xmm0, xmmword ptr [rsp+0x90]
1185        paddd   xmm1, xmmword ptr [rsp+0xB0]
1186        paddd   xmm2, xmmword ptr [rsp+0x80]
1187        paddd   xmm3, xmmword ptr [rsp+0xF0]
1188        paddd   xmm0, xmm4
1189        paddd   xmm1, xmm5
1190        paddd   xmm2, xmm6
1191        paddd   xmm3, xmm7
1192        pxor    xmm12, xmm0
1193        pxor    xmm13, xmm1
1194        pxor    xmm14, xmm2
1195        pxor    xmm15, xmm3
1196        pshuflw xmm12, xmm12, 0xB1
1197        pshufhw xmm12, xmm12, 0xB1
1198        pshuflw xmm13, xmm13, 0xB1
1199        pshufhw xmm13, xmm13, 0xB1
1200        pshuflw xmm14, xmm14, 0xB1
1201        pshufhw xmm14, xmm14, 0xB1
1202        pshuflw xmm15, xmm15, 0xB1
1203        pshufhw xmm15, xmm15, 0xB1
1204        movdqa  xmm8, xmmword ptr [rsp+0x100]
1205        paddd   xmm8, xmm12
1206        paddd   xmm9, xmm13
1207        paddd   xmm10, xmm14
1208        paddd   xmm11, xmm15
1209        pxor    xmm4, xmm8
1210        pxor    xmm5, xmm9
1211        pxor    xmm6, xmm10
1212        pxor    xmm7, xmm11
1213        movdqa  xmmword ptr [rsp+0x100], xmm8
1214        movdqa  xmm8, xmm4
1215        psrld   xmm8, 12
1216        pslld   xmm4, 20
1217        por     xmm4, xmm8
1218        movdqa  xmm8, xmm5
1219        psrld   xmm8, 12
1220        pslld   xmm5, 20
1221        por     xmm5, xmm8
1222        movdqa  xmm8, xmm6
1223        psrld   xmm8, 12
1224        pslld   xmm6, 20
1225        por     xmm6, xmm8
1226        movdqa  xmm8, xmm7
1227        psrld   xmm8, 12
1228        pslld   xmm7, 20
1229        por     xmm7, xmm8
1230        paddd   xmm0, xmmword ptr [rsp+0xE0]
1231        paddd   xmm1, xmmword ptr [rsp+0x50]
1232        paddd   xmm2, xmmword ptr [rsp+0xC0]
1233        paddd   xmm3, xmmword ptr [rsp+0x10]
1234        paddd   xmm0, xmm4
1235        paddd   xmm1, xmm5
1236        paddd   xmm2, xmm6
1237        paddd   xmm3, xmm7
1238        pxor    xmm12, xmm0
1239        pxor    xmm13, xmm1
1240        pxor    xmm14, xmm2
1241        pxor    xmm15, xmm3
1242        movdqa  xmm8, xmm12
1243        psrld   xmm12, 8
1244        pslld   xmm8, 24
1245        pxor    xmm12, xmm8
1246        movdqa  xmm8, xmm13
1247        psrld   xmm13, 8
1248        pslld   xmm8, 24
1249        pxor    xmm13, xmm8
1250        movdqa  xmm8, xmm14
1251        psrld   xmm14, 8
1252        pslld   xmm8, 24
1253        pxor    xmm14, xmm8
1254        movdqa  xmm8, xmm15
1255        psrld   xmm15, 8
1256        pslld   xmm8, 24
1257        pxor    xmm15, xmm8
1258        movdqa  xmm8, xmmword ptr [rsp+0x100]
1259        paddd   xmm8, xmm12
1260        paddd   xmm9, xmm13
1261        paddd   xmm10, xmm14
1262        paddd   xmm11, xmm15
1263        pxor    xmm4, xmm8
1264        pxor    xmm5, xmm9
1265        pxor    xmm6, xmm10
1266        pxor    xmm7, xmm11
1267        movdqa  xmmword ptr [rsp+0x100], xmm8
1268        movdqa  xmm8, xmm4
1269        psrld   xmm8, 7
1270        pslld   xmm4, 25
1271        por     xmm4, xmm8
1272        movdqa  xmm8, xmm5
1273        psrld   xmm8, 7
1274        pslld   xmm5, 25
1275        por     xmm5, xmm8
1276        movdqa  xmm8, xmm6
1277        psrld   xmm8, 7
1278        pslld   xmm6, 25
1279        por     xmm6, xmm8
1280        movdqa  xmm8, xmm7
1281        psrld   xmm8, 7
1282        pslld   xmm7, 25
1283        por     xmm7, xmm8
1284        paddd   xmm0, xmmword ptr [rsp+0xD0]
1285        paddd   xmm1, xmmword ptr [rsp]
1286        paddd   xmm2, xmmword ptr [rsp+0x20]
1287        paddd   xmm3, xmmword ptr [rsp+0x40]
1288        paddd   xmm0, xmm5
1289        paddd   xmm1, xmm6
1290        paddd   xmm2, xmm7
1291        paddd   xmm3, xmm4
1292        pxor    xmm15, xmm0
1293        pxor    xmm12, xmm1
1294        pxor    xmm13, xmm2
1295        pxor    xmm14, xmm3
1296        pshuflw xmm15, xmm15, 0xB1
1297        pshufhw xmm15, xmm15, 0xB1
1298        pshuflw xmm12, xmm12, 0xB1
1299        pshufhw xmm12, xmm12, 0xB1
1300        pshuflw xmm13, xmm13, 0xB1
1301        pshufhw xmm13, xmm13, 0xB1
1302        pshuflw xmm14, xmm14, 0xB1
1303        pshufhw xmm14, xmm14, 0xB1
1304        paddd   xmm10, xmm15
1305        paddd   xmm11, xmm12
1306        movdqa  xmm8, xmmword ptr [rsp+0x100]
1307        paddd   xmm8, xmm13
1308        paddd   xmm9, xmm14
1309        pxor    xmm5, xmm10
1310        pxor    xmm6, xmm11
1311        pxor    xmm7, xmm8
1312        pxor    xmm4, xmm9
1313        movdqa  xmmword ptr [rsp+0x100], xmm8
1314        movdqa  xmm8, xmm5
1315        psrld   xmm8, 12
1316        pslld   xmm5, 20
1317        por     xmm5, xmm8
1318        movdqa  xmm8, xmm6
1319        psrld   xmm8, 12
1320        pslld   xmm6, 20
1321        por     xmm6, xmm8
1322        movdqa  xmm8, xmm7
1323        psrld   xmm8, 12
1324        pslld   xmm7, 20
1325        por     xmm7, xmm8
1326        movdqa  xmm8, xmm4
1327        psrld   xmm8, 12
1328        pslld   xmm4, 20
1329        por     xmm4, xmm8
1330        paddd   xmm0, xmmword ptr [rsp+0x30]
1331        paddd   xmm1, xmmword ptr [rsp+0xA0]
1332        paddd   xmm2, xmmword ptr [rsp+0x60]
1333        paddd   xmm3, xmmword ptr [rsp+0x70]
1334        paddd   xmm0, xmm5
1335        paddd   xmm1, xmm6
1336        paddd   xmm2, xmm7
1337        paddd   xmm3, xmm4
1338        pxor    xmm15, xmm0
1339        pxor    xmm12, xmm1
1340        pxor    xmm13, xmm2
1341        pxor    xmm14, xmm3
1342        movdqa  xmm8, xmm15
1343        psrld   xmm15, 8
1344        pslld   xmm8, 24
1345        pxor    xmm15, xmm8
1346        movdqa  xmm8, xmm12
1347        psrld   xmm12, 8
1348        pslld   xmm8, 24
1349        pxor    xmm12, xmm8
1350        movdqa  xmm8, xmm13
1351        psrld   xmm13, 8
1352        pslld   xmm8, 24
1353        pxor    xmm13, xmm8
1354        movdqa  xmm8, xmm14
1355        psrld   xmm14, 8
1356        pslld   xmm8, 24
1357        pxor    xmm14, xmm8
1358        paddd   xmm10, xmm15
1359        paddd   xmm11, xmm12
1360        movdqa  xmm8, xmmword ptr [rsp+0x100]
1361        paddd   xmm8, xmm13
1362        paddd   xmm9, xmm14
1363        pxor    xmm5, xmm10
1364        pxor    xmm6, xmm11
1365        pxor    xmm7, xmm8
1366        pxor    xmm4, xmm9
1367        movdqa  xmmword ptr [rsp+0x100], xmm8
1368        movdqa  xmm8, xmm5
1369        psrld   xmm8, 7
1370        pslld   xmm5, 25
1371        por     xmm5, xmm8
1372        movdqa  xmm8, xmm6
1373        psrld   xmm8, 7
1374        pslld   xmm6, 25
1375        por     xmm6, xmm8
1376        movdqa  xmm8, xmm7
1377        psrld   xmm8, 7
1378        pslld   xmm7, 25
1379        por     xmm7, xmm8
1380        movdqa  xmm8, xmm4
1381        psrld   xmm8, 7
1382        pslld   xmm4, 25
1383        por     xmm4, xmm8
1384        paddd   xmm0, xmmword ptr [rsp+0xB0]
1385        paddd   xmm1, xmmword ptr [rsp+0x50]
1386        paddd   xmm2, xmmword ptr [rsp+0x10]
1387        paddd   xmm3, xmmword ptr [rsp+0x80]
1388        paddd   xmm0, xmm4
1389        paddd   xmm1, xmm5
1390        paddd   xmm2, xmm6
1391        paddd   xmm3, xmm7
1392        pxor    xmm12, xmm0
1393        pxor    xmm13, xmm1
1394        pxor    xmm14, xmm2
1395        pxor    xmm15, xmm3
1396        pshuflw xmm12, xmm12, 0xB1
1397        pshufhw xmm12, xmm12, 0xB1
1398        pshuflw xmm13, xmm13, 0xB1
1399        pshufhw xmm13, xmm13, 0xB1
1400        pshuflw xmm14, xmm14, 0xB1
1401        pshufhw xmm14, xmm14, 0xB1
1402        pshuflw xmm15, xmm15, 0xB1
1403        pshufhw xmm15, xmm15, 0xB1
1404        movdqa  xmm8, xmmword ptr [rsp+0x100]
1405        paddd   xmm8, xmm12
1406        paddd   xmm9, xmm13
1407        paddd   xmm10, xmm14
1408        paddd   xmm11, xmm15
1409        pxor    xmm4, xmm8
1410        pxor    xmm5, xmm9
1411        pxor    xmm6, xmm10
1412        pxor    xmm7, xmm11
1413        movdqa  xmmword ptr [rsp+0x100], xmm8
1414        movdqa  xmm8, xmm4
1415        psrld   xmm8, 12
1416        pslld   xmm4, 20
1417        por     xmm4, xmm8
1418        movdqa  xmm8, xmm5
1419        psrld   xmm8, 12
1420        pslld   xmm5, 20
1421        por     xmm5, xmm8
1422        movdqa  xmm8, xmm6
1423        psrld   xmm8, 12
1424        pslld   xmm6, 20
1425        por     xmm6, xmm8
1426        movdqa  xmm8, xmm7
1427        psrld   xmm8, 12
1428        pslld   xmm7, 20
1429        por     xmm7, xmm8
1430        paddd   xmm0, xmmword ptr [rsp+0xF0]
1431        paddd   xmm1, xmmword ptr [rsp]
1432        paddd   xmm2, xmmword ptr [rsp+0x90]
1433        paddd   xmm3, xmmword ptr [rsp+0x60]
1434        paddd   xmm0, xmm4
1435        paddd   xmm1, xmm5
1436        paddd   xmm2, xmm6
1437        paddd   xmm3, xmm7
1438        pxor    xmm12, xmm0
1439        pxor    xmm13, xmm1
1440        pxor    xmm14, xmm2
1441        pxor    xmm15, xmm3
1442        movdqa  xmm8, xmm12
1443        psrld   xmm12, 8
1444        pslld   xmm8, 24
1445        pxor    xmm12, xmm8
1446        movdqa  xmm8, xmm13
1447        psrld   xmm13, 8
1448        pslld   xmm8, 24
1449        pxor    xmm13, xmm8
1450        movdqa  xmm8, xmm14
1451        psrld   xmm14, 8
1452        pslld   xmm8, 24
1453        pxor    xmm14, xmm8
1454        movdqa  xmm8, xmm15
1455        psrld   xmm15, 8
1456        pslld   xmm8, 24
1457        pxor    xmm15, xmm8
1458        movdqa  xmm8, xmmword ptr [rsp+0x100]
1459        paddd   xmm8, xmm12
1460        paddd   xmm9, xmm13
1461        paddd   xmm10, xmm14
1462        paddd   xmm11, xmm15
1463        pxor    xmm4, xmm8
1464        pxor    xmm5, xmm9
1465        pxor    xmm6, xmm10
1466        pxor    xmm7, xmm11
1467        movdqa  xmmword ptr [rsp+0x100], xmm8
1468        movdqa  xmm8, xmm4
1469        psrld   xmm8, 7
1470        pslld   xmm4, 25
1471        por     xmm4, xmm8
1472        movdqa  xmm8, xmm5
1473        psrld   xmm8, 7
1474        pslld   xmm5, 25
1475        por     xmm5, xmm8
1476        movdqa  xmm8, xmm6
1477        psrld   xmm8, 7
1478        pslld   xmm6, 25
1479        por     xmm6, xmm8
1480        movdqa  xmm8, xmm7
1481        psrld   xmm8, 7
1482        pslld   xmm7, 25
1483        por     xmm7, xmm8
1484        paddd   xmm0, xmmword ptr [rsp+0xE0]
1485        paddd   xmm1, xmmword ptr [rsp+0x20]
1486        paddd   xmm2, xmmword ptr [rsp+0x30]
1487        paddd   xmm3, xmmword ptr [rsp+0x70]
1488        paddd   xmm0, xmm5
1489        paddd   xmm1, xmm6
1490        paddd   xmm2, xmm7
1491        paddd   xmm3, xmm4
1492        pxor    xmm15, xmm0
1493        pxor    xmm12, xmm1
1494        pxor    xmm13, xmm2
1495        pxor    xmm14, xmm3
1496        pshuflw xmm15, xmm15, 0xB1
1497        pshufhw xmm15, xmm15, 0xB1
1498        pshuflw xmm12, xmm12, 0xB1
1499        pshufhw xmm12, xmm12, 0xB1
1500        pshuflw xmm13, xmm13, 0xB1
1501        pshufhw xmm13, xmm13, 0xB1
1502        pshuflw xmm14, xmm14, 0xB1
1503        pshufhw xmm14, xmm14, 0xB1
1504        paddd   xmm10, xmm15
1505        paddd   xmm11, xmm12
1506        movdqa  xmm8, xmmword ptr [rsp+0x100]
1507        paddd   xmm8, xmm13
1508        paddd   xmm9, xmm14
1509        pxor    xmm5, xmm10
1510        pxor    xmm6, xmm11
1511        pxor    xmm7, xmm8
1512        pxor    xmm4, xmm9
1513        movdqa  xmmword ptr [rsp+0x100], xmm8
1514        movdqa  xmm8, xmm5
1515        psrld   xmm8, 12
1516        pslld   xmm5, 20
1517        por     xmm5, xmm8
1518        movdqa  xmm8, xmm6
1519        psrld   xmm8, 12
1520        pslld   xmm6, 20
1521        por     xmm6, xmm8
1522        movdqa  xmm8, xmm7
1523        psrld   xmm8, 12
1524        pslld   xmm7, 20
1525        por     xmm7, xmm8
1526        movdqa  xmm8, xmm4
1527        psrld   xmm8, 12
1528        pslld   xmm4, 20
1529        por     xmm4, xmm8
1530        paddd   xmm0, xmmword ptr [rsp+0xA0]
1531        paddd   xmm1, xmmword ptr [rsp+0xC0]
1532        paddd   xmm2, xmmword ptr [rsp+0x40]
1533        paddd   xmm3, xmmword ptr [rsp+0xD0]
1534        paddd   xmm0, xmm5
1535        paddd   xmm1, xmm6
1536        paddd   xmm2, xmm7
1537        paddd   xmm3, xmm4
1538        pxor    xmm15, xmm0
1539        pxor    xmm12, xmm1
1540        pxor    xmm13, xmm2
1541        pxor    xmm14, xmm3
1542        movdqa  xmm8, xmm15
1543        psrld   xmm15, 8
1544        pslld   xmm8, 24
1545        pxor    xmm15, xmm8
1546        movdqa  xmm8, xmm12
1547        psrld   xmm12, 8
1548        pslld   xmm8, 24
1549        pxor    xmm12, xmm8
1550        movdqa  xmm8, xmm13
1551        psrld   xmm13, 8
1552        pslld   xmm8, 24
1553        pxor    xmm13, xmm8
1554        movdqa  xmm8, xmm14
1555        psrld   xmm14, 8
1556        pslld   xmm8, 24
1557        pxor    xmm14, xmm8
1558        paddd   xmm10, xmm15
1559        paddd   xmm11, xmm12
1560        movdqa  xmm8, xmmword ptr [rsp+0x100]
1561        paddd   xmm8, xmm13
1562        paddd   xmm9, xmm14
1563        pxor    xmm5, xmm10
1564        pxor    xmm6, xmm11
1565        pxor    xmm7, xmm8
1566        pxor    xmm4, xmm9
1567        pxor    xmm0, xmm8
1568        pxor    xmm1, xmm9
1569        pxor    xmm2, xmm10
1570        pxor    xmm3, xmm11
1571        movdqa  xmm8, xmm5
1572        psrld   xmm8, 7
1573        pslld   xmm5, 25
1574        por     xmm5, xmm8
1575        movdqa  xmm8, xmm6
1576        psrld   xmm8, 7
1577        pslld   xmm6, 25
1578        por     xmm6, xmm8
1579        movdqa  xmm8, xmm7
1580        psrld   xmm8, 7
1581        pslld   xmm7, 25
1582        por     xmm7, xmm8
1583        movdqa  xmm8, xmm4
1584        psrld   xmm8, 7
1585        pslld   xmm4, 25
1586        por     xmm4, xmm8
1587        pxor    xmm4, xmm12
1588        pxor    xmm5, xmm13
1589        pxor    xmm6, xmm14
1590        pxor    xmm7, xmm15
1591        mov     eax, r13d
1592        jne     9b
1593        movdqa  xmm9, xmm0
1594        punpckldq xmm0, xmm1
1595        punpckhdq xmm9, xmm1
1596        movdqa  xmm11, xmm2
1597        punpckldq xmm2, xmm3
1598        punpckhdq xmm11, xmm3
1599        movdqa  xmm1, xmm0
1600        punpcklqdq xmm0, xmm2
1601        punpckhqdq xmm1, xmm2
1602        movdqa  xmm3, xmm9
1603        punpcklqdq xmm9, xmm11
1604        punpckhqdq xmm3, xmm11
1605        movdqu  xmmword ptr [rbx], xmm0
1606        movdqu  xmmword ptr [rbx+0x20], xmm1
1607        movdqu  xmmword ptr [rbx+0x40], xmm9
1608        movdqu  xmmword ptr [rbx+0x60], xmm3
1609        movdqa  xmm9, xmm4
1610        punpckldq xmm4, xmm5
1611        punpckhdq xmm9, xmm5
1612        movdqa  xmm11, xmm6
1613        punpckldq xmm6, xmm7
1614        punpckhdq xmm11, xmm7
1615        movdqa  xmm5, xmm4
1616        punpcklqdq xmm4, xmm6
1617        punpckhqdq xmm5, xmm6
1618        movdqa  xmm7, xmm9
1619        punpcklqdq xmm9, xmm11
1620        punpckhqdq xmm7, xmm11
1621        movdqu  xmmword ptr [rbx+0x10], xmm4
1622        movdqu  xmmword ptr [rbx+0x30], xmm5
1623        movdqu  xmmword ptr [rbx+0x50], xmm9
1624        movdqu  xmmword ptr [rbx+0x70], xmm7
1625        movdqa  xmm1, xmmword ptr [rsp+0x110]
1626        movdqa  xmm0, xmm1
1627        paddd   xmm1, xmmword ptr [rsp+0x150]
1628        movdqa  xmmword ptr [rsp+0x110], xmm1
1629        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1630        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1631        pcmpgtd xmm0, xmm1
1632        movdqa  xmm1, xmmword ptr [rsp+0x120]
1633        psubd   xmm1, xmm0
1634        movdqa  xmmword ptr [rsp+0x120], xmm1
1635        add     rbx, 128
1636        add     rdi, 32
1637        sub     rsi, 4
1638        cmp     rsi, 4
1639        jnc     2b
1640        test    rsi, rsi
1641        jne     3f
4:
        /*
         * Shared exit path: reached by fall-through when no inputs remain
         * and by the `je 4b` of the single-input tail.
         *
         * The prologue spilled xmm6-xmm15 (callee-saved under the Microsoft
         * x64 ABI) to the aligned frame at [rsp+0x170 .. rsp+0x200].  The ten
         * reloads are independent of each other, so order is irrelevant; rsp
         * must still be the aligned frame pointer while they execute.
         */
        movdqa  xmm15, xmmword ptr [rsp+0x200]
        movdqa  xmm14, xmmword ptr [rsp+0x1F0]
        movdqa  xmm13, xmmword ptr [rsp+0x1E0]
        movdqa  xmm12, xmmword ptr [rsp+0x1D0]
        movdqa  xmm11, xmmword ptr [rsp+0x1C0]
        movdqa  xmm10, xmmword ptr [rsp+0x1B0]
        movdqa  xmm9, xmmword ptr [rsp+0x1A0]
        movdqa  xmm8, xmmword ptr [rsp+0x190]
        movdqa  xmm7, xmmword ptr [rsp+0x180]
        movdqa  xmm6, xmmword ptr [rsp+0x170]

        /* Drop the aligned scratch frame: rbp still holds the rsp value
           captured right after the prologue pushes. */
        mov     rsp, rbp

        /* Integer callee-saved registers, popped in the reverse of the
           prologue push order (r15, r14, r13, r12, rsi, rdi, rbx, rbp). */
        pop     rbp
        pop     rbx
        pop     rdi
        pop     rsi
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret
.p2align 5
/*
 * Two-input tail of blake3_hash_many_sse2.
 *
 * Entered when fewer than four inputs remain.  If bit 1 of the remaining
 * count (rsi) is set, two inputs are hashed side by side: input A uses
 * xmm0-xmm7, input B uses xmm8-xmm15.
 *
 * Register roles as used below (r12d, r13d, r15 and rbx are set up before
 * this point and are only described by how this code uses them):
 *   rdi  -> array of input pointers (8 bytes per entry)
 *   rcx  -> 32 bytes loaded as the initial value of state rows 0 and 1
 *   rbx  -> output cursor, 32 bytes written per input
 *   r15  =  byte offset at which the last 64-byte block ends
 *   r13d =  flag bits used for every block
 *   r12d =  extra flag bits OR-ed in for the last block only
 *   [rbp+0x80] = extra flag byte OR-ed in for the first block only
 *   [rsp+0x110] / [rsp+0x120] = per-lane counter low / high dwords
 *   [rsp+0x130] = broadcast of the negated increment byte (see prologue)
 * Scratch: [rsp+0x00 .. rsp+0x5F], rax, rdx, r8-r11, r14, all xmm.
 */
3:
        test    esi, 2
        je      3f                              /* bit 1 clear: go to the one-input tail */

        /* Rows 0/1 of the state start from the 32 bytes at [rcx], duplicated
           for both inputs. */
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+0x10]
        movaps  xmm8, xmm0
        movaps  xmm9, xmm1

        /* Build {counter_lo, counter_hi, 0, 0} for lane 0 -> [rsp] and for
           lane 1 -> [rsp+0x10]. */
        movd    xmm13, dword ptr [rsp+0x110]
        movd    xmm14, dword ptr [rsp+0x120]
        punpckldq xmm13, xmm14
        movaps  xmmword ptr [rsp], xmm13
        movd    xmm14, dword ptr [rsp+0x114]
        movd    xmm13, dword ptr [rsp+0x124]
        punpckldq xmm14, xmm13
        movaps  xmmword ptr [rsp+0x10], xmm14

        mov     r8, qword ptr [rdi]             /* r8 = pointer to input A */
        mov     r9, qword ptr [rdi+0x8]         /* r9 = pointer to input B */
        movzx   eax, byte ptr [rbp+0x80]
        or      eax, r13d                       /* flags for the first block */
        xor     edx, edx                        /* rdx = running byte offset */

/* ---- per-block loop: one 64-byte block from each input ---- */
2:
        mov     r14d, eax                       /* flags without the last-block bits */
        or      eax, r12d
        add     rdx, 64
        cmp     rdx, r15
        cmovne  eax, r14d                       /* keep the r12d bits only on the last block */

        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        movaps  xmm10, xmm2                     /* row 2 of both states */

        /* Input A message: split the 16 dwords into even-indexed (0x88) and
           odd-indexed (0xDD) halves; the second pair is also rotated by one
           dword (pshufd 0x93). */
        movups  xmm4, xmmword ptr [r8+rdx-0x40]
        movups  xmm5, xmmword ptr [r8+rdx-0x30]
        movaps  xmm3, xmm4
        shufps  xmm4, xmm5, 0x88
        shufps  xmm3, xmm5, 0xDD
        movaps  xmm5, xmm3
        movups  xmm6, xmmword ptr [r8+rdx-0x20]
        movups  xmm7, xmmword ptr [r8+rdx-0x10]
        movaps  xmm3, xmm6
        shufps  xmm6, xmm7, 0x88
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm3, xmm7, 0xDD
        pshufd  xmm7, xmm3, 0x93

        /* Input B message: same layout, into xmm12-xmm15. */
        movups  xmm12, xmmword ptr [r9+rdx-0x40]
        movups  xmm13, xmmword ptr [r9+rdx-0x30]
        movaps  xmm11, xmm12
        shufps  xmm12, xmm13, 0x88
        shufps  xmm11, xmm13, 0xDD
        movaps  xmm13, xmm11
        movups  xmm14, xmmword ptr [r9+rdx-0x20]
        movups  xmm15, xmmword ptr [r9+rdx-0x10]
        movaps  xmm11, xmm14
        shufps  xmm14, xmm15, 0x88
        pshufd  xmm14, xmm14, 0x93
        shufps  xmm11, xmm15, 0xDD
        pshufd  xmm15, xmm11, 0x93

        /* Row 3 = {counter_lo, counter_hi, 64, flags}: the low qword comes
           from the per-lane slot, the high qword {64, flags} is shared. */
        shl     rax, 32
        or      rax, 64
        movq    xmm3, rax
        movdqa  xmmword ptr [rsp+0x20], xmm3
        movaps  xmm3, xmmword ptr [rsp]
        movaps  xmm11, xmmword ptr [rsp+0x10]
        punpcklqdq xmm3, xmmword ptr [rsp+0x20]
        punpcklqdq xmm11, xmmword ptr [rsp+0x20]

        mov     al, 7                           /* round counter */

/* ---- one round; A and B are interleaved instruction by instruction ---- */
9:
        /* Column step, first half.  xmm4/xmm12 are spilled because xmm4 is
           reused as a rotate temporary below. */
        paddd   xmm0, xmm4
        paddd   xmm8, xmm12
        movaps  xmmword ptr [rsp+0x20], xmm4
        movaps  xmmword ptr [rsp+0x30], xmm12
        paddd   xmm0, xmm1
        paddd   xmm8, xmm9
        pxor    xmm3, xmm0
        pxor    xmm11, xmm8
        pshuflw xmm3, xmm3, 0xB1                /* swap 16-bit halves = rotate by 16 */
        pshufhw xmm3, xmm3, 0xB1
        pshuflw xmm11, xmm11, 0xB1
        pshufhw xmm11, xmm11, 0xB1
        paddd   xmm2, xmm3
        paddd   xmm10, xmm11
        pxor    xmm1, xmm2
        pxor    xmm9, xmm10
        movdqa  xmm4, xmm1                      /* rotate right by 12 */
        pslld   xmm1, 20
        psrld   xmm4, 12
        por     xmm1, xmm4
        movdqa  xmm4, xmm9
        pslld   xmm9, 20
        psrld   xmm4, 12
        por     xmm9, xmm4

        /* Column step, second half.  xmm5/xmm13 are spilled because xmm13
           is reused as a rotate temporary below. */
        paddd   xmm0, xmm5
        paddd   xmm8, xmm13
        movaps  xmmword ptr [rsp+0x40], xmm5
        movaps  xmmword ptr [rsp+0x50], xmm13
        paddd   xmm0, xmm1
        paddd   xmm8, xmm9
        pxor    xmm3, xmm0
        pxor    xmm11, xmm8
        movdqa  xmm13, xmm3                     /* rotate right by 8 */
        psrld   xmm3, 8
        pslld   xmm13, 24
        pxor    xmm3, xmm13
        movdqa  xmm13, xmm11
        psrld   xmm11, 8
        pslld   xmm13, 24
        pxor    xmm11, xmm13
        paddd   xmm2, xmm3
        paddd   xmm10, xmm11
        pxor    xmm1, xmm2
        pxor    xmm9, xmm10
        movdqa  xmm4, xmm1                      /* rotate right by 7 */
        pslld   xmm1, 25
        psrld   xmm4, 7
        por     xmm1, xmm4
        movdqa  xmm4, xmm9
        pslld   xmm9, 25
        psrld   xmm4, 7
        por     xmm9, xmm4

        /* Rotate rows 0, 3 and 2 by one, two and three dwords so the next
           step mixes diagonals instead of columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm8, xmm8, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm11, xmm11, 0x4E
        pshufd  xmm2, xmm2, 0x39
        pshufd  xmm10, xmm10, 0x39

        /* Diagonal step, first half. */
        paddd   xmm0, xmm6
        paddd   xmm8, xmm14
        paddd   xmm0, xmm1
        paddd   xmm8, xmm9
        pxor    xmm3, xmm0
        pxor    xmm11, xmm8
        pshuflw xmm3, xmm3, 0xB1                /* rotate by 16 */
        pshufhw xmm3, xmm3, 0xB1
        pshuflw xmm11, xmm11, 0xB1
        pshufhw xmm11, xmm11, 0xB1
        paddd   xmm2, xmm3
        paddd   xmm10, xmm11
        pxor    xmm1, xmm2
        pxor    xmm9, xmm10
        movdqa  xmm4, xmm1                      /* rotate right by 12 */
        pslld   xmm1, 20
        psrld   xmm4, 12
        por     xmm1, xmm4
        movdqa  xmm4, xmm9
        pslld   xmm9, 20
        psrld   xmm4, 12
        por     xmm9, xmm4

        /* Diagonal step, second half. */
        paddd   xmm0, xmm7
        paddd   xmm8, xmm15
        paddd   xmm0, xmm1
        paddd   xmm8, xmm9
        pxor    xmm3, xmm0
        pxor    xmm11, xmm8
        movdqa  xmm13, xmm3                     /* rotate right by 8 */
        psrld   xmm3, 8
        pslld   xmm13, 24
        pxor    xmm3, xmm13
        movdqa  xmm13, xmm11
        psrld   xmm11, 8
        pslld   xmm13, 24
        pxor    xmm11, xmm13
        paddd   xmm2, xmm3
        paddd   xmm10, xmm11
        pxor    xmm1, xmm2
        pxor    xmm9, xmm10
        movdqa  xmm4, xmm1                      /* rotate right by 7 */
        pslld   xmm1, 25
        psrld   xmm4, 7
        por     xmm1, xmm4
        movdqa  xmm4, xmm9
        pslld   xmm9, 25
        psrld   xmm4, 7
        por     xmm9, xmm4

        /* Undo the row rotation: back to column layout. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm8, xmm8, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm11, xmm11, 0x4E
        pshufd  xmm2, xmm2, 0x93
        pshufd  xmm10, xmm10, 0x93

        dec     al
        je      9f                              /* all 7 rounds done */

        /* Re-order the message words for the next round, input A.
           Inputs : [rsp+0x20], [rsp+0x40] (spilled above), xmm6, xmm7.
           Outputs: xmm4, xmm7 directly; the new xmm5/xmm6 are parked in
           [rsp+0x20]/[rsp+0x40] until input B is finished.
           The pand/por pairs merge two vectors under the PBLENDW_* constant
           masks (presumably standing in for SSE4.1 pblendw). */
        movdqa  xmm12, xmmword ptr [rsp+0x20]
        movdqa  xmm5, xmmword ptr [rsp+0x40]
        pshufd  xmm13, xmm12, 0x0F
        shufps  xmm12, xmm5, 0xD6
        pshufd  xmm4, xmm12, 0x39
        movdqa  xmm12, xmm6
        shufps  xmm12, xmm7, 0xFA
        pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm13, xmm12
        movdqa  xmmword ptr [rsp+0x20], xmm13
        movdqa  xmm12, xmm7
        punpcklqdq xmm12, xmm5
        movdqa  xmm13, xmm6
        pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm12, xmm13
        pshufd  xmm12, xmm12, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmmword ptr [rsp+0x40], xmm12

        /* Same re-ordering for input B.  xmm5/xmm6 serve as temporaries
           here, and xmm2 is parked in [rsp+0x30] for a few instructions to
           free one more register. */
        movdqa  xmm5, xmmword ptr [rsp+0x30]
        movdqa  xmm13, xmmword ptr [rsp+0x50]
        pshufd  xmm6, xmm5, 0x0F
        shufps  xmm5, xmm13, 0xD6
        pshufd  xmm12, xmm5, 0x39
        movdqa  xmm5, xmm14
        shufps  xmm5, xmm15, 0xFA
        pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm6, xmm5
        movdqa  xmm5, xmm15
        punpcklqdq xmm5, xmm13
        movdqa  xmmword ptr [rsp+0x30], xmm2    /* park state row 2 of A */
        movdqa  xmm2, xmm14
        pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm5, xmm2
        movdqa  xmm2, xmmword ptr [rsp+0x30]    /* restore state row 2 of A */
        pshufd  xmm5, xmm5, 0x78
        punpckhdq xmm13, xmm15
        punpckldq xmm14, xmm13
        pshufd  xmm15, xmm14, 0x1E
        movdqa  xmm13, xmm6
        movdqa  xmm14, xmm5
        movdqa  xmm5, xmmword ptr [rsp+0x20]    /* pick up the parked A vectors */
        movdqa  xmm6, xmmword ptr [rsp+0x40]
        jmp     9b

/* ---- block done: fold rows 2/3 into rows 0/1 ---- */
9:
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        pxor    xmm8, xmm10
        pxor    xmm9, xmm11
        mov     eax, r13d                       /* later blocks use the plain r13d flags */
        cmp     rdx, r15
        jne     2b                              /* more blocks in these inputs */

        /* Write the two 32-byte results back to back. */
        movups  xmmword ptr [rbx], xmm0
        movups  xmmword ptr [rbx+0x10], xmm1
        movups  xmmword ptr [rbx+0x20], xmm8
        movups  xmmword ptr [rbx+0x30], xmm9

        /* Advance the lane-0 counter slot by two lanes.  [rsp+0x130] holds
           the negated increment byte, so after neg eax is that byte again
           (0 or 1, assuming the caller passes a boolean).  With 1, lane 2
           (byte offset +8) is copied into lane 0; with 0 the copy is a no-op. */
        mov     eax, dword ptr [rsp+0x130]
        neg     eax
        mov     r10d, dword ptr [rsp+0x110+8*rax]
        mov     r11d, dword ptr [rsp+0x120+8*rax]
        mov     dword ptr [rsp+0x110], r10d
        mov     dword ptr [rsp+0x120], r11d

        add     rdi, 16                         /* two input pointers consumed */
        add     rbx, 64                         /* two 32-byte outputs produced */
        sub     rsi, 2
19123:
1913        test    esi, 0x1
1914        je      4b
1915        movups  xmm0, xmmword ptr [rcx]
1916        movups  xmm1, xmmword ptr [rcx+0x10]
1917        movd    xmm13, dword ptr [rsp+0x110]
1918        movd    xmm14, dword ptr [rsp+0x120]
1919        punpckldq xmm13, xmm14
1920        mov     r8, qword ptr [rdi]
1921        movzx   eax, byte ptr [rbp+0x80]
1922        or      eax, r13d
1923        xor     edx, edx
19242:
1925        mov     r14d, eax
1926        or      eax, r12d
1927        add     rdx, 64
1928        cmp     rdx, r15
1929        cmovne  eax, r14d
1930        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1931        shl     rax, 32
1932        or      rax, 64
1933        movq    xmm12, rax
1934        movdqa  xmm3, xmm13
1935        punpcklqdq xmm3, xmm12
1936        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1937        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1938        movaps  xmm8, xmm4
1939        shufps  xmm4, xmm5, 136
1940        shufps  xmm8, xmm5, 221
1941        movaps  xmm5, xmm8
1942        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1943        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1944        movaps  xmm8, xmm6
1945        shufps  xmm6, xmm7, 136
1946        pshufd  xmm6, xmm6, 0x93
1947        shufps  xmm8, xmm7, 221
1948        pshufd  xmm7, xmm8, 0x93
1949        mov     al, 7
19509:
1951        paddd   xmm0, xmm4
1952        paddd   xmm0, xmm1
1953        pxor    xmm3, xmm0
1954        pshuflw xmm3, xmm3, 0xB1
1955        pshufhw xmm3, xmm3, 0xB1
1956        paddd   xmm2, xmm3
1957        pxor    xmm1, xmm2
1958        movdqa  xmm11, xmm1
1959        pslld   xmm1, 20
1960        psrld   xmm11, 12
1961        por     xmm1, xmm11
1962        paddd   xmm0, xmm5
1963        paddd   xmm0, xmm1
1964        pxor    xmm3, xmm0
1965        movdqa  xmm14, xmm3
1966        psrld   xmm3, 8
1967        pslld   xmm14, 24
1968        pxor    xmm3, xmm14
1969        paddd   xmm2, xmm3
1970        pxor    xmm1, xmm2
1971        movdqa  xmm11, xmm1
1972        pslld   xmm1, 25
1973        psrld   xmm11, 7
1974        por     xmm1, xmm11
1975        pshufd  xmm0, xmm0, 0x93
1976        pshufd  xmm3, xmm3, 0x4E
1977        pshufd  xmm2, xmm2, 0x39
1978        paddd   xmm0, xmm6
1979        paddd   xmm0, xmm1
1980        pxor    xmm3, xmm0
1981        pshuflw xmm3, xmm3, 0xB1
1982        pshufhw xmm3, xmm3, 0xB1
1983        paddd   xmm2, xmm3
1984        pxor    xmm1, xmm2
1985        movdqa  xmm11, xmm1
1986        pslld   xmm1, 20
1987        psrld   xmm11, 12
1988        por     xmm1, xmm11
1989        paddd   xmm0, xmm7
1990        paddd   xmm0, xmm1
1991        pxor    xmm3, xmm0
1992        movdqa  xmm14, xmm3
1993        psrld   xmm3, 8
1994        pslld   xmm14, 24
1995        pxor    xmm3, xmm14
1996        paddd   xmm2, xmm3
1997        pxor    xmm1, xmm2
1998        movdqa  xmm11, xmm1
1999        pslld   xmm1, 25
2000        psrld   xmm11, 7
2001        por     xmm1, xmm11
2002        pshufd  xmm0, xmm0, 0x39
2003        pshufd  xmm3, xmm3, 0x4E
2004        pshufd  xmm2, xmm2, 0x93
2005        dec     al
2006        jz      9f
2007        movdqa  xmm8, xmm4
2008        shufps  xmm8, xmm5, 214
2009        pshufd  xmm9, xmm4, 0x0F
2010        pshufd  xmm4, xmm8, 0x39
2011        movdqa  xmm8, xmm6
2012        shufps  xmm8, xmm7, 250
2013        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2014        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2015        por     xmm9, xmm8
2016        movdqa  xmm8, xmm7
2017        punpcklqdq xmm8, xmm5
2018        movdqa  xmm10, xmm6
2019        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2020        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2021        por     xmm8, xmm10
2022        pshufd  xmm8, xmm8, 0x78
2023        punpckhdq xmm5, xmm7
2024        punpckldq xmm6, xmm5
2025        pshufd  xmm7, xmm6, 0x1E
2026        movdqa  xmm5, xmm9
2027        movdqa  xmm6, xmm8
2028        jmp     9b
20299:
2030        pxor    xmm0, xmm2
2031        pxor    xmm1, xmm3
2032        mov     eax, r13d
2033        cmp     rdx, r15
2034        jne     2b
2035        movups  xmmword ptr [rbx], xmm0
2036        movups  xmmword ptr [rbx+0x10], xmm1
2037        jmp     4b
2038
2039.p2align 6
/*
 * blake3_compress_in_place_sse2
 *
 * Runs seven mixing rounds over a 4x4 state of 32-bit words, held one row
 * per register (row 0 = xmm0 ... row 3 = xmm3), then overwrites the 32 bytes
 * at [rcx] with (row0 ^ row2, row1 ^ row3).
 *
 * Inputs, as read by the code (the register and stack usage matches the
 * Microsoft x64 calling convention):
 *   rcx         pointer to 32 bytes: loaded as rows 0-1, overwritten on exit
 *   rdx         pointer to the 64-byte message block (loaded unaligned)
 *   r8b         only the low byte is used; becomes word 2 of row 3
 *               (presumably block_len -- confirm against the C prototype)
 *   r9          64-bit value; becomes words 0-1 of row 3
 *               (presumably the counter -- confirm against the C prototype)
 *   [rsp+0x28]  at entry: one byte, becomes word 3 of row 3 (presumably
 *               flags -- confirm); addressed below as [rsp+0xA0] because of
 *               the 120-byte frame
 *
 * Clobbers rax, r8, xmm0-xmm5 and the flags. xmm6-xmm9, xmm11, xmm14 and
 * xmm15 are spilled on entry and reloaded before ret.
 *
 * The movdqa spills need rsp to be 16-byte aligned after "sub rsp, 120",
 * i.e. rsp % 16 == 8 at entry -- assumed to be guaranteed by the caller.
 */
2040blake3_compress_in_place_sse2:
2041_blake3_compress_in_place_sse2:
        /* Frame: seven 16-byte spill slots (112 bytes) plus 8 bytes of padding. */
2042        sub     rsp, 120
2043        movdqa  xmmword ptr [rsp], xmm6
2044        movdqa  xmmword ptr [rsp+0x10], xmm7
2045        movdqa  xmmword ptr [rsp+0x20], xmm8
2046        movdqa  xmmword ptr [rsp+0x30], xmm9
2047        movdqa  xmmword ptr [rsp+0x40], xmm11
2048        movdqa  xmmword ptr [rsp+0x50], xmm14
2049        movdqa  xmmword ptr [rsp+0x60], xmm15
        /* Rows 0-1 come from [rcx]; row 2 is the four words at BLAKE3_IV. */
2050        movups  xmm0, xmmword ptr [rcx]
2051        movups  xmm1, xmmword ptr [rcx+0x10]
2052        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Row 3 = { r9 low dword, r9 high dword, zero-extended r8b, stack byte }.
         * r8 is rebuilt as r8b | (stack byte << 32); the two fields do not
         * overlap, so the add cannot carry.
         */
2053        movzx   eax, byte ptr [rsp+0xA0]
2054        movzx   r8d, r8b
2055        shl     rax, 32
2056        add     r8, rax
2057        movq    xmm3, r9
2058        movq    xmm4, r8
2059        punpcklqdq xmm3, xmm4
        /*
         * Message words 0-7 split by parity (xmm8 is scratch):
         *   xmm4 = words (0, 2, 4, 6)   shufps 136 picks dwords 0 and 2
         *   xmm5 = words (1, 3, 5, 7)   shufps 221 picks dwords 1 and 3
         */
2060        movups  xmm4, xmmword ptr [rdx]
2061        movups  xmm5, xmmword ptr [rdx+0x10]
2062        movaps  xmm8, xmm4
2063        shufps  xmm4, xmm5, 136
2064        shufps  xmm8, xmm5, 221
2065        movaps  xmm5, xmm8
        /*
         * Message words 8-15: same parity split, then pshufd 0x93 moves every
         * dword up one lane (top lane wraps to lane 0):
         *   xmm6 = words (14, 8, 10, 12)
         *   xmm7 = words (15, 9, 11, 13)
         */
2066        movups  xmm6, xmmword ptr [rdx+0x20]
2067        movups  xmm7, xmmword ptr [rdx+0x30]
2068        movaps  xmm8, xmm6
2069        shufps  xmm6, xmm7, 136
2070        pshufd  xmm6, xmm6, 0x93
2071        shufps  xmm8, xmm7, 221
2072        pshufd  xmm7, xmm8, 0x93
        /* al counts the rounds still to run. */
2073        mov     al, 7
        /* Top of the round loop. */
20749:
        /*
         * First half-round, step 1:
         *   row0 += xmm4 + row1
         *   row3  = (row3 ^ row0) rotated by 16 bits in each dword
         *           (pshuflw/pshufhw 0xB1 swap the 16-bit halves)
         *   row2 += row3
         *   row1  = (row1 ^ row2) rotated right by 12   (xmm11 is scratch)
         */
2075        paddd   xmm0, xmm4
2076        paddd   xmm0, xmm1
2077        pxor    xmm3, xmm0
2078        pshuflw xmm3, xmm3, 0xB1
2079        pshufhw xmm3, xmm3, 0xB1
2080        paddd   xmm2, xmm3
2081        pxor    xmm1, xmm2
2082        movdqa  xmm11, xmm1
2083        pslld   xmm1, 20
2084        psrld   xmm11, 12
2085        por     xmm1, xmm11
        /*
         * First half-round, step 2:
         *   row0 += xmm5 + row1
         *   row3  = (row3 ^ row0) rotated right by 8    (xmm14 is scratch; the
         *           two shifted halves share no bits, so pxor merges them)
         *   row2 += row3
         *   row1  = (row1 ^ row2) rotated right by 7
         */
2086        paddd   xmm0, xmm5
2087        paddd   xmm0, xmm1
2088        pxor    xmm3, xmm0
2089        movdqa  xmm14, xmm3
2090        psrld   xmm3, 8
2091        pslld   xmm14, 24
2092        pxor    xmm3, xmm14
2093        paddd   xmm2, xmm3
2094        pxor    xmm1, xmm2
2095        movdqa  xmm11, xmm1
2096        pslld   xmm1, 25
2097        psrld   xmm11, 7
2098        por     xmm1, xmm11
        /*
         * Rotate the lanes of rows 0, 3 and 2 (row 1 stays put) so that the
         * second half-round pairs each row-1 word with different partners.
         */
2099        pshufd  xmm0, xmm0, 0x93
2100        pshufd  xmm3, xmm3, 0x4E
2101        pshufd  xmm2, xmm2, 0x39
        /* Second half-round, step 1: same sequence as above with xmm6 as the message input. */
2102        paddd   xmm0, xmm6
2103        paddd   xmm0, xmm1
2104        pxor    xmm3, xmm0
2105        pshuflw xmm3, xmm3, 0xB1
2106        pshufhw xmm3, xmm3, 0xB1
2107        paddd   xmm2, xmm3
2108        pxor    xmm1, xmm2
2109        movdqa  xmm11, xmm1
2110        pslld   xmm1, 20
2111        psrld   xmm11, 12
2112        por     xmm1, xmm11
        /* Second half-round, step 2: same sequence with xmm7 as the message input. */
2113        paddd   xmm0, xmm7
2114        paddd   xmm0, xmm1
2115        pxor    xmm3, xmm0
2116        movdqa  xmm14, xmm3
2117        psrld   xmm3, 8
2118        pslld   xmm14, 24
2119        pxor    xmm3, xmm14
2120        paddd   xmm2, xmm3
2121        pxor    xmm1, xmm2
2122        movdqa  xmm11, xmm1
2123        pslld   xmm1, 25
2124        psrld   xmm11, 7
2125        por     xmm1, xmm11
        /* Undo the lane rotations (0x39 inverts 0x93, 0x4E is its own inverse). */
2126        pshufd  xmm0, xmm0, 0x39
2127        pshufd  xmm3, xmm3, 0x4E
2128        pshufd  xmm2, xmm2, 0x93
        /* After the seventh round, leave without preparing another message schedule. */
2129        dec     al
2130        jz      9f
        /*
         * Rearrange the message words held in xmm4-xmm7 for the next round.
         * Each pand/pand/por triple with a PBLENDW_* mask pair merges selected
         * dwords from two registers (a dword-granular blend built from SSE2
         * instructions). xmm8, xmm9 and xmm14 are scratch here.
         */
2131        movdqa  xmm8, xmm4
2132        shufps  xmm8, xmm5, 214
2133        pshufd  xmm9, xmm4, 0x0F
2134        pshufd  xmm4, xmm8, 0x39
2135        movdqa  xmm8, xmm6
2136        shufps  xmm8, xmm7, 250
2137        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2138        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2139        por     xmm9, xmm8
2140        movdqa  xmm8, xmm7
2141        punpcklqdq xmm8, xmm5
2142        movdqa  xmm14, xmm6
2143        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2144        pand    xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2145        por     xmm8, xmm14
2146        pshufd  xmm8, xmm8, 0x78
2147        punpckhdq xmm5, xmm7
2148        punpckldq xmm6, xmm5
2149        pshufd  xmm7, xmm6, 0x1E
2150        movdqa  xmm5, xmm9
2151        movdqa  xmm6, xmm8
2152        jmp     9b
        /* Rounds finished: fold the state and write it back over the input at [rcx]. */
21539:
2154        pxor    xmm0, xmm2
2155        pxor    xmm1, xmm3
2156        movups  xmmword ptr [rcx], xmm0
2157        movups  xmmword ptr [rcx+0x10], xmm1
        /* Reload the spilled registers and release the frame. */
2158        movdqa  xmm6, xmmword ptr [rsp]
2159        movdqa  xmm7, xmmword ptr [rsp+0x10]
2160        movdqa  xmm8, xmmword ptr [rsp+0x20]
2161        movdqa  xmm9, xmmword ptr [rsp+0x30]
2162        movdqa  xmm11, xmmword ptr [rsp+0x40]
2163        movdqa  xmm14, xmmword ptr [rsp+0x50]
2164        movdqa  xmm15, xmmword ptr [rsp+0x60]
2165        add     rsp, 120
2166        ret
2167
2168
2169.p2align 6
/*
 * blake3_compress_xof_sse2
 *
 * Runs the same seven mixing rounds as blake3_compress_in_place_sse2 over a
 * 4x4 state of 32-bit words (row 0 = xmm0 ... row 3 = xmm3), but leaves the
 * 32 bytes at [rcx] untouched and writes 64 bytes to a separate buffer:
 *   out[ 0..15] = row0 ^ row2
 *   out[16..31] = row1 ^ row3
 *   out[32..47] = row2 ^ (first 16 bytes at [rcx])
 *   out[48..63] = row3 ^ (second 16 bytes at [rcx])
 *
 * Inputs, as read by the code (the register and stack usage matches the
 * Microsoft x64 calling convention):
 *   rcx         pointer to 32 bytes: loaded as rows 0-1, read again at the end
 *   rdx         pointer to the 64-byte message block (loaded unaligned)
 *   r8b         only the low byte is used; becomes word 2 of row 3
 *               (presumably block_len -- confirm against the C prototype)
 *   r9          64-bit value; becomes words 0-1 of row 3
 *               (presumably the counter -- confirm against the C prototype)
 *   [rsp+0x28]  at entry: one byte, becomes word 3 of row 3 (presumably
 *               flags -- confirm); addressed below as [rsp+0xA0]
 *   [rsp+0x30]  at entry: pointer to the 64-byte output buffer; addressed
 *               below as [rsp+0xA8] and kept in r10
 *
 * Clobbers rax, r8, r10, xmm0-xmm5 and the flags. xmm6-xmm9, xmm11, xmm14
 * and xmm15 are spilled on entry and reloaded before ret.
 *
 * The movdqa spills need rsp to be 16-byte aligned after "sub rsp, 120",
 * i.e. rsp % 16 == 8 at entry -- assumed to be guaranteed by the caller.
 */
2170_blake3_compress_xof_sse2:
2171blake3_compress_xof_sse2:
        /* Frame: seven 16-byte spill slots (112 bytes) plus 8 bytes of padding. */
2172        sub     rsp, 120
2173        movdqa  xmmword ptr [rsp], xmm6
2174        movdqa  xmmword ptr [rsp+0x10], xmm7
2175        movdqa  xmmword ptr [rsp+0x20], xmm8
2176        movdqa  xmmword ptr [rsp+0x30], xmm9
2177        movdqa  xmmword ptr [rsp+0x40], xmm11
2178        movdqa  xmmword ptr [rsp+0x50], xmm14
2179        movdqa  xmmword ptr [rsp+0x60], xmm15
        /* Rows 0-1 come from [rcx]; row 2 is the four words at BLAKE3_IV. */
2180        movups  xmm0, xmmword ptr [rcx]
2181        movups  xmm1, xmmword ptr [rcx+0x10]
2182        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Fetch the two stack arguments, then build
         * row 3 = { r9 low dword, r9 high dword, zero-extended r8b, stack byte }.
         * r8 is rebuilt as r8b | (stack byte << 32); the two fields do not
         * overlap, so the add cannot carry.
         */
2183        movzx   eax, byte ptr [rsp+0xA0]
2184        movzx   r8d, r8b
2185        mov     r10, qword ptr [rsp+0xA8]
2186        shl     rax, 32
2187        add     r8, rax
2188        movq    xmm3, r9
2189        movq    xmm4, r8
2190        punpcklqdq xmm3, xmm4
        /*
         * Message words 0-7 split by parity (xmm8 is scratch):
         *   xmm4 = words (0, 2, 4, 6)   shufps 136 picks dwords 0 and 2
         *   xmm5 = words (1, 3, 5, 7)   shufps 221 picks dwords 1 and 3
         */
2191        movups  xmm4, xmmword ptr [rdx]
2192        movups  xmm5, xmmword ptr [rdx+0x10]
2193        movaps  xmm8, xmm4
2194        shufps  xmm4, xmm5, 136
2195        shufps  xmm8, xmm5, 221
2196        movaps  xmm5, xmm8
        /*
         * Message words 8-15: same parity split, then pshufd 0x93 moves every
         * dword up one lane (top lane wraps to lane 0):
         *   xmm6 = words (14, 8, 10, 12)
         *   xmm7 = words (15, 9, 11, 13)
         */
2197        movups  xmm6, xmmword ptr [rdx+0x20]
2198        movups  xmm7, xmmword ptr [rdx+0x30]
2199        movaps  xmm8, xmm6
2200        shufps  xmm6, xmm7, 136
2201        pshufd  xmm6, xmm6, 0x93
2202        shufps  xmm8, xmm7, 221
2203        pshufd  xmm7, xmm8, 0x93
        /* al counts the rounds still to run. */
2204        mov     al, 7
        /* Top of the round loop. */
22059:
        /*
         * First half-round, step 1:
         *   row0 += xmm4 + row1
         *   row3  = (row3 ^ row0) rotated by 16 bits in each dword
         *           (pshuflw/pshufhw 0xB1 swap the 16-bit halves)
         *   row2 += row3
         *   row1  = (row1 ^ row2) rotated right by 12   (xmm11 is scratch)
         */
2206        paddd   xmm0, xmm4
2207        paddd   xmm0, xmm1
2208        pxor    xmm3, xmm0
2209        pshuflw xmm3, xmm3, 0xB1
2210        pshufhw xmm3, xmm3, 0xB1
2211        paddd   xmm2, xmm3
2212        pxor    xmm1, xmm2
2213        movdqa  xmm11, xmm1
2214        pslld   xmm1, 20
2215        psrld   xmm11, 12
2216        por     xmm1, xmm11
        /*
         * First half-round, step 2:
         *   row0 += xmm5 + row1
         *   row3  = (row3 ^ row0) rotated right by 8    (xmm14 is scratch; the
         *           two shifted halves share no bits, so pxor merges them)
         *   row2 += row3
         *   row1  = (row1 ^ row2) rotated right by 7
         */
2217        paddd   xmm0, xmm5
2218        paddd   xmm0, xmm1
2219        pxor    xmm3, xmm0
2220        movdqa  xmm14, xmm3
2221        psrld   xmm3, 8
2222        pslld   xmm14, 24
2223        pxor    xmm3, xmm14
2224        paddd   xmm2, xmm3
2225        pxor    xmm1, xmm2
2226        movdqa  xmm11, xmm1
2227        pslld   xmm1, 25
2228        psrld   xmm11, 7
2229        por     xmm1, xmm11
        /*
         * Rotate the lanes of rows 0, 3 and 2 (row 1 stays put) so that the
         * second half-round pairs each row-1 word with different partners.
         */
2230        pshufd  xmm0, xmm0, 0x93
2231        pshufd  xmm3, xmm3, 0x4E
2232        pshufd  xmm2, xmm2, 0x39
        /* Second half-round, step 1: same sequence as above with xmm6 as the message input. */
2233        paddd   xmm0, xmm6
2234        paddd   xmm0, xmm1
2235        pxor    xmm3, xmm0
2236        pshuflw xmm3, xmm3, 0xB1
2237        pshufhw xmm3, xmm3, 0xB1
2238        paddd   xmm2, xmm3
2239        pxor    xmm1, xmm2
2240        movdqa  xmm11, xmm1
2241        pslld   xmm1, 20
2242        psrld   xmm11, 12
2243        por     xmm1, xmm11
        /* Second half-round, step 2: same sequence with xmm7 as the message input. */
2244        paddd   xmm0, xmm7
2245        paddd   xmm0, xmm1
2246        pxor    xmm3, xmm0
2247        movdqa  xmm14, xmm3
2248        psrld   xmm3, 8
2249        pslld   xmm14, 24
2250        pxor    xmm3, xmm14
2251        paddd   xmm2, xmm3
2252        pxor    xmm1, xmm2
2253        movdqa  xmm11, xmm1
2254        pslld   xmm1, 25
2255        psrld   xmm11, 7
2256        por     xmm1, xmm11
        /* Undo the lane rotations (0x39 inverts 0x93, 0x4E is its own inverse). */
2257        pshufd  xmm0, xmm0, 0x39
2258        pshufd  xmm3, xmm3, 0x4E
2259        pshufd  xmm2, xmm2, 0x93
        /* After the seventh round, leave without preparing another message schedule. */
2260        dec     al
2261        jz      9f
        /*
         * Rearrange the message words held in xmm4-xmm7 for the next round.
         * Each pand/pand/por triple with a PBLENDW_* mask pair merges selected
         * dwords from two registers (a dword-granular blend built from SSE2
         * instructions). xmm8, xmm9 and xmm14 are scratch here.
         */
2262        movdqa  xmm8, xmm4
2263        shufps  xmm8, xmm5, 214
2264        pshufd  xmm9, xmm4, 0x0F
2265        pshufd  xmm4, xmm8, 0x39
2266        movdqa  xmm8, xmm6
2267        shufps  xmm8, xmm7, 250
2268        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2269        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2270        por     xmm9, xmm8
2271        movdqa  xmm8, xmm7
2272        punpcklqdq xmm8, xmm5
2273        movdqa  xmm14, xmm6
2274        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2275        pand    xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2276        por     xmm8, xmm14
2277        pshufd  xmm8, xmm8, 0x78
2278        punpckhdq xmm5, xmm7
2279        punpckldq xmm6, xmm5
2280        pshufd  xmm7, xmm6, 0x1E
2281        movdqa  xmm5, xmm9
2282        movdqa  xmm6, xmm8
2283        jmp     9b
        /*
         * Rounds finished. The message registers xmm4/xmm5 are free now, so they
         * are reused to reload the original 32 input bytes from [rcx].
         * rows 0-1 must be folded with rows 2-3 BEFORE rows 2-3 are themselves
         * XORed with the input, which is why the pxor order below matters.
         */
22849:
2285        movdqu  xmm4, xmmword ptr [rcx]
2286        movdqu  xmm5, xmmword ptr [rcx+0x10]
2287        pxor    xmm0, xmm2
2288        pxor    xmm1, xmm3
2289        pxor    xmm2, xmm4
2290        pxor    xmm3, xmm5
        /* Store all four rows (64 bytes) to the output buffer; unaligned stores, so no alignment is required of it. */
2291        movups  xmmword ptr [r10], xmm0
2292        movups  xmmword ptr [r10+0x10], xmm1
2293        movups  xmmword ptr [r10+0x20], xmm2
2294        movups  xmmword ptr [r10+0x30], xmm3
        /* Reload the spilled registers and release the frame. */
2295        movdqa  xmm6, xmmword ptr [rsp]
2296        movdqa  xmm7, xmmword ptr [rsp+0x10]
2297        movdqa  xmm8, xmmword ptr [rsp+0x20]
2298        movdqa  xmm9, xmmword ptr [rsp+0x30]
2299        movdqa  xmm11, xmmword ptr [rsp+0x40]
2300        movdqa  xmm14, xmmword ptr [rsp+0x50]
2301        movdqa  xmm15, xmmword ptr [rsp+0x60]
2302        add     rsp, 120
2303        ret
2304
2305
/*
 * Read-only constant tables shared by the routines in this file.
 *
 * The section starts on a 64-byte boundary and every table below is exactly
 * 16 bytes (four .long values), so each label is 16-byte aligned. The code
 * relies on that: the tables are used as movaps sources and as memory
 * operands of pand/pxor, which require 16-byte alignment.
 * Keep every table at four .long values, or add explicit alignment.
 */
2306.section .rodata
2307.p2align  6
/* The four words the compress routines load as state row 2. */
2308BLAKE3_IV:
2309        .long  0x6A09E667, 0xBB67AE85
2310        .long  0x3C6EF372, 0xA54FF53A
/*
 * Lane indices 0..3 and the constant 4 in every lane. Both are ANDed with a
 * broadcast mask in the blake3_hash_many_sse2 prologue (presumably the
 * per-lane counter offsets and the per-iteration counter stride -- confirm).
 */
2311ADD0:
2312        .long  0, 1, 2, 3
2313ADD1:
2314        .long  4, 4, 4, 4
/* Each BLAKE3_IV word replicated into all four lanes. */
2315BLAKE3_IV_0:
2316        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2317BLAKE3_IV_1:
2318        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2319BLAKE3_IV_2:
2320        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2321BLAKE3_IV_3:
2322        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 replicated into all four lanes. */
2323BLAKE3_BLOCK_LEN:
2324        .long  64, 64, 64, 64
/*
 * Sign bit of every dword. The blake3_hash_many_sse2 prologue XORs it into
 * both operands before pcmpgtd, which makes that signed compare order the
 * values as unsigned 32-bit integers.
 */
2325CMP_MSB_MASK:
2326        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * Dword select masks used in pand/pand/por triples to blend two registers.
 * Each mask covers the same 16-bit words that a pblendw with the immediate
 * in its name would select (0x33 = dwords 0 and 2, 0xCC = dwords 1 and 3,
 * 0x3F = dwords 0-2, 0xC0 = dword 3). 0x33/0xCC and 0x3F/0xC0 are
 * complementary pairs.
 */
2327PBLENDW_0x33_MASK:
2328        .long  0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
2329PBLENDW_0xCC_MASK:
2330        .long  0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
2331PBLENDW_0x3F_MASK:
2332        .long  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
2333PBLENDW_0xC0_MASK:
2334        .long  0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
2335