/*
 * Assembler setup and exported symbols.
 *
 * The code following this directive is written in Intel operand order
 * (destination first) with bare, un-prefixed register names.
 */
1.intel_syntax noprefix
/*
 * Three entry points are made globally visible, each under two names:
 * the plain name and the same name with one leading underscore.
 * NOTE(review): presumably the doubled names let the object link under
 * both underscore-prefixing and non-prefixing symbol conventions -- confirm
 * against the build configuration.
 */
2.global blake3_hash_many_sse2
3.global _blake3_hash_many_sse2
4.global blake3_compress_in_place_sse2
5.global _blake3_compress_in_place_sse2
6.global blake3_compress_xof_sse2
7.global _blake3_compress_xof_sse2
/* Everything below is emitted into the code section. */
8.section .text
/* Align the next label to a 2^6 = 64-byte boundary. */
9        .p2align  6
10_blake3_hash_many_sse2:
11blake3_hash_many_sse2:
12        push    r15
13        push    r14
14        push    r13
15        push    r12
16        push    rsi
17        push    rdi
18        push    rbx
19        push    rbp
20        mov     rbp, rsp
21        sub     rsp, 528
22        and     rsp, 0xFFFFFFFFFFFFFFC0
23        movdqa  xmmword ptr [rsp+0x170], xmm6
24        movdqa  xmmword ptr [rsp+0x180], xmm7
25        movdqa  xmmword ptr [rsp+0x190], xmm8
26        movdqa  xmmword ptr [rsp+0x1A0], xmm9
27        movdqa  xmmword ptr [rsp+0x1B0], xmm10
28        movdqa  xmmword ptr [rsp+0x1C0], xmm11
29        movdqa  xmmword ptr [rsp+0x1D0], xmm12
30        movdqa  xmmword ptr [rsp+0x1E0], xmm13
31        movdqa  xmmword ptr [rsp+0x1F0], xmm14
32        movdqa  xmmword ptr [rsp+0x200], xmm15
33        mov     rdi, rcx
34        mov     rsi, rdx
35        mov     rdx, r8
36        mov     rcx, r9
37        mov     r8, qword ptr [rbp+0x68]
38        movzx   r9, byte ptr [rbp+0x70]
39        neg     r9d
40        movd    xmm0, r9d
41        pshufd  xmm0, xmm0, 0x00
42        movdqa  xmmword ptr [rsp+0x130], xmm0
43        movdqa  xmm1, xmm0
44        pand    xmm1, xmmword ptr [ADD0+rip]
45        pand    xmm0, xmmword ptr [ADD1+rip]
46        movdqa  xmmword ptr [rsp+0x150], xmm0
47        movd    xmm0, r8d
48        pshufd  xmm0, xmm0, 0x00
49        paddd   xmm0, xmm1
50        movdqa  xmmword ptr [rsp+0x110], xmm0
51        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
52        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
53        pcmpgtd xmm1, xmm0
54        shr     r8, 32
55        movd    xmm2, r8d
56        pshufd  xmm2, xmm2, 0x00
57        psubd   xmm2, xmm1
58        movdqa  xmmword ptr [rsp+0x120], xmm2
59        mov     rbx, qword ptr [rbp+0x90]
60        mov     r15, rdx
61        shl     r15, 6
62        movzx   r13d, byte ptr [rbp+0x78]
63        movzx   r12d, byte ptr [rbp+0x88]
64        cmp     rsi, 4
65        jc      3f
662:
67        movdqu  xmm3, xmmword ptr [rcx]
68        pshufd  xmm0, xmm3, 0x00
69        pshufd  xmm1, xmm3, 0x55
70        pshufd  xmm2, xmm3, 0xAA
71        pshufd  xmm3, xmm3, 0xFF
72        movdqu  xmm7, xmmword ptr [rcx+0x10]
73        pshufd  xmm4, xmm7, 0x00
74        pshufd  xmm5, xmm7, 0x55
75        pshufd  xmm6, xmm7, 0xAA
76        pshufd  xmm7, xmm7, 0xFF
77        mov     r8, qword ptr [rdi]
78        mov     r9, qword ptr [rdi+0x8]
79        mov     r10, qword ptr [rdi+0x10]
80        mov     r11, qword ptr [rdi+0x18]
81        movzx   eax, byte ptr [rbp+0x80]
82        or      eax, r13d
83        xor     edx, edx
849:
85        mov     r14d, eax
86        or      eax, r12d
87        add     rdx, 64
88        cmp     rdx, r15
89        cmovne  eax, r14d
90        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
91        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
92        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
93        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
94        movdqa  xmm12, xmm8
95        punpckldq xmm8, xmm9
96        punpckhdq xmm12, xmm9
97        movdqa  xmm14, xmm10
98        punpckldq xmm10, xmm11
99        punpckhdq xmm14, xmm11
100        movdqa  xmm9, xmm8
101        punpcklqdq xmm8, xmm10
102        punpckhqdq xmm9, xmm10
103        movdqa  xmm13, xmm12
104        punpcklqdq xmm12, xmm14
105        punpckhqdq xmm13, xmm14
106        movdqa  xmmword ptr [rsp], xmm8
107        movdqa  xmmword ptr [rsp+0x10], xmm9
108        movdqa  xmmword ptr [rsp+0x20], xmm12
109        movdqa  xmmword ptr [rsp+0x30], xmm13
110        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
111        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
112        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
113        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
114        movdqa  xmm12, xmm8
115        punpckldq xmm8, xmm9
116        punpckhdq xmm12, xmm9
117        movdqa  xmm14, xmm10
118        punpckldq xmm10, xmm11
119        punpckhdq xmm14, xmm11
120        movdqa  xmm9, xmm8
121        punpcklqdq xmm8, xmm10
122        punpckhqdq xmm9, xmm10
123        movdqa  xmm13, xmm12
124        punpcklqdq xmm12, xmm14
125        punpckhqdq xmm13, xmm14
126        movdqa  xmmword ptr [rsp+0x40], xmm8
127        movdqa  xmmword ptr [rsp+0x50], xmm9
128        movdqa  xmmword ptr [rsp+0x60], xmm12
129        movdqa  xmmword ptr [rsp+0x70], xmm13
130        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
131        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
132        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
133        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
134        movdqa  xmm12, xmm8
135        punpckldq xmm8, xmm9
136        punpckhdq xmm12, xmm9
137        movdqa  xmm14, xmm10
138        punpckldq xmm10, xmm11
139        punpckhdq xmm14, xmm11
140        movdqa  xmm9, xmm8
141        punpcklqdq xmm8, xmm10
142        punpckhqdq xmm9, xmm10
143        movdqa  xmm13, xmm12
144        punpcklqdq xmm12, xmm14
145        punpckhqdq xmm13, xmm14
146        movdqa  xmmword ptr [rsp+0x80], xmm8
147        movdqa  xmmword ptr [rsp+0x90], xmm9
148        movdqa  xmmword ptr [rsp+0xA0], xmm12
149        movdqa  xmmword ptr [rsp+0xB0], xmm13
150        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
151        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
152        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
153        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
154        movdqa  xmm12, xmm8
155        punpckldq xmm8, xmm9
156        punpckhdq xmm12, xmm9
157        movdqa  xmm14, xmm10
158        punpckldq xmm10, xmm11
159        punpckhdq xmm14, xmm11
160        movdqa  xmm9, xmm8
161        punpcklqdq xmm8, xmm10
162        punpckhqdq xmm9, xmm10
163        movdqa  xmm13, xmm12
164        punpcklqdq xmm12, xmm14
165        punpckhqdq xmm13, xmm14
166        movdqa  xmmword ptr [rsp+0xC0], xmm8
167        movdqa  xmmword ptr [rsp+0xD0], xmm9
168        movdqa  xmmword ptr [rsp+0xE0], xmm12
169        movdqa  xmmword ptr [rsp+0xF0], xmm13
170        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
171        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
172        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
173        movdqa  xmm12, xmmword ptr [rsp+0x110]
174        movdqa  xmm13, xmmword ptr [rsp+0x120]
175        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
176        movd    xmm15, eax
177        pshufd  xmm15, xmm15, 0x00
178        prefetcht0 [r8+rdx+0x80]
179        prefetcht0 [r9+rdx+0x80]
180        prefetcht0 [r10+rdx+0x80]
181        prefetcht0 [r11+rdx+0x80]
182        paddd   xmm0, xmmword ptr [rsp]
183        paddd   xmm1, xmmword ptr [rsp+0x20]
184        paddd   xmm2, xmmword ptr [rsp+0x40]
185        paddd   xmm3, xmmword ptr [rsp+0x60]
186        paddd   xmm0, xmm4
187        paddd   xmm1, xmm5
188        paddd   xmm2, xmm6
189        paddd   xmm3, xmm7
190        pxor    xmm12, xmm0
191        pxor    xmm13, xmm1
192        pxor    xmm14, xmm2
193        pxor    xmm15, xmm3
194        pshuflw xmm12, xmm12, 0xB1
195        pshufhw xmm12, xmm12, 0xB1
196        pshuflw xmm13, xmm13, 0xB1
197        pshufhw xmm13, xmm13, 0xB1
198        pshuflw xmm14, xmm14, 0xB1
199        pshufhw xmm14, xmm14, 0xB1
200        pshuflw xmm15, xmm15, 0xB1
201        pshufhw xmm15, xmm15, 0xB1
202        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
203        paddd   xmm8, xmm12
204        paddd   xmm9, xmm13
205        paddd   xmm10, xmm14
206        paddd   xmm11, xmm15
207        pxor    xmm4, xmm8
208        pxor    xmm5, xmm9
209        pxor    xmm6, xmm10
210        pxor    xmm7, xmm11
211        movdqa  xmmword ptr [rsp+0x100], xmm8
212        movdqa  xmm8, xmm4
213        psrld   xmm8, 12
214        pslld   xmm4, 20
215        por     xmm4, xmm8
216        movdqa  xmm8, xmm5
217        psrld   xmm8, 12
218        pslld   xmm5, 20
219        por     xmm5, xmm8
220        movdqa  xmm8, xmm6
221        psrld   xmm8, 12
222        pslld   xmm6, 20
223        por     xmm6, xmm8
224        movdqa  xmm8, xmm7
225        psrld   xmm8, 12
226        pslld   xmm7, 20
227        por     xmm7, xmm8
228        paddd   xmm0, xmmword ptr [rsp+0x10]
229        paddd   xmm1, xmmword ptr [rsp+0x30]
230        paddd   xmm2, xmmword ptr [rsp+0x50]
231        paddd   xmm3, xmmword ptr [rsp+0x70]
232        paddd   xmm0, xmm4
233        paddd   xmm1, xmm5
234        paddd   xmm2, xmm6
235        paddd   xmm3, xmm7
236        pxor    xmm12, xmm0
237        pxor    xmm13, xmm1
238        pxor    xmm14, xmm2
239        pxor    xmm15, xmm3
240        movdqa  xmm8, xmm12
241        psrld   xmm12, 8
242        pslld   xmm8, 24
243        pxor    xmm12, xmm8
244        movdqa  xmm8, xmm13
245        psrld   xmm13, 8
246        pslld   xmm8, 24
247        pxor    xmm13, xmm8
248        movdqa  xmm8, xmm14
249        psrld   xmm14, 8
250        pslld   xmm8, 24
251        pxor    xmm14, xmm8
252        movdqa  xmm8, xmm15
253        psrld   xmm15, 8
254        pslld   xmm8, 24
255        pxor    xmm15, xmm8
256        movdqa  xmm8, xmmword ptr [rsp+0x100]
257        paddd   xmm8, xmm12
258        paddd   xmm9, xmm13
259        paddd   xmm10, xmm14
260        paddd   xmm11, xmm15
261        pxor    xmm4, xmm8
262        pxor    xmm5, xmm9
263        pxor    xmm6, xmm10
264        pxor    xmm7, xmm11
265        movdqa  xmmword ptr [rsp+0x100], xmm8
266        movdqa  xmm8, xmm4
267        psrld   xmm8, 7
268        pslld   xmm4, 25
269        por     xmm4, xmm8
270        movdqa  xmm8, xmm5
271        psrld   xmm8, 7
272        pslld   xmm5, 25
273        por     xmm5, xmm8
274        movdqa  xmm8, xmm6
275        psrld   xmm8, 7
276        pslld   xmm6, 25
277        por     xmm6, xmm8
278        movdqa  xmm8, xmm7
279        psrld   xmm8, 7
280        pslld   xmm7, 25
281        por     xmm7, xmm8
282        paddd   xmm0, xmmword ptr [rsp+0x80]
283        paddd   xmm1, xmmword ptr [rsp+0xA0]
284        paddd   xmm2, xmmword ptr [rsp+0xC0]
285        paddd   xmm3, xmmword ptr [rsp+0xE0]
286        paddd   xmm0, xmm5
287        paddd   xmm1, xmm6
288        paddd   xmm2, xmm7
289        paddd   xmm3, xmm4
290        pxor    xmm15, xmm0
291        pxor    xmm12, xmm1
292        pxor    xmm13, xmm2
293        pxor    xmm14, xmm3
294        pshuflw xmm15, xmm15, 0xB1
295        pshufhw xmm15, xmm15, 0xB1
296        pshuflw xmm12, xmm12, 0xB1
297        pshufhw xmm12, xmm12, 0xB1
298        pshuflw xmm13, xmm13, 0xB1
299        pshufhw xmm13, xmm13, 0xB1
300        pshuflw xmm14, xmm14, 0xB1
301        pshufhw xmm14, xmm14, 0xB1
302        paddd   xmm10, xmm15
303        paddd   xmm11, xmm12
304        movdqa  xmm8, xmmword ptr [rsp+0x100]
305        paddd   xmm8, xmm13
306        paddd   xmm9, xmm14
307        pxor    xmm5, xmm10
308        pxor    xmm6, xmm11
309        pxor    xmm7, xmm8
310        pxor    xmm4, xmm9
311        movdqa  xmmword ptr [rsp+0x100], xmm8
312        movdqa  xmm8, xmm5
313        psrld   xmm8, 12
314        pslld   xmm5, 20
315        por     xmm5, xmm8
316        movdqa  xmm8, xmm6
317        psrld   xmm8, 12
318        pslld   xmm6, 20
319        por     xmm6, xmm8
320        movdqa  xmm8, xmm7
321        psrld   xmm8, 12
322        pslld   xmm7, 20
323        por     xmm7, xmm8
324        movdqa  xmm8, xmm4
325        psrld   xmm8, 12
326        pslld   xmm4, 20
327        por     xmm4, xmm8
328        paddd   xmm0, xmmword ptr [rsp+0x90]
329        paddd   xmm1, xmmword ptr [rsp+0xB0]
330        paddd   xmm2, xmmword ptr [rsp+0xD0]
331        paddd   xmm3, xmmword ptr [rsp+0xF0]
332        paddd   xmm0, xmm5
333        paddd   xmm1, xmm6
334        paddd   xmm2, xmm7
335        paddd   xmm3, xmm4
336        pxor    xmm15, xmm0
337        pxor    xmm12, xmm1
338        pxor    xmm13, xmm2
339        pxor    xmm14, xmm3
340        movdqa  xmm8, xmm15
341        psrld   xmm15, 8
342        pslld   xmm8, 24
343        pxor    xmm15, xmm8
344        movdqa  xmm8, xmm12
345        psrld   xmm12, 8
346        pslld   xmm8, 24
347        pxor    xmm12, xmm8
348        movdqa  xmm8, xmm13
349        psrld   xmm13, 8
350        pslld   xmm8, 24
351        pxor    xmm13, xmm8
352        movdqa  xmm8, xmm14
353        psrld   xmm14, 8
354        pslld   xmm8, 24
355        pxor    xmm14, xmm8
356        paddd   xmm10, xmm15
357        paddd   xmm11, xmm12
358        movdqa  xmm8, xmmword ptr [rsp+0x100]
359        paddd   xmm8, xmm13
360        paddd   xmm9, xmm14
361        pxor    xmm5, xmm10
362        pxor    xmm6, xmm11
363        pxor    xmm7, xmm8
364        pxor    xmm4, xmm9
365        movdqa  xmmword ptr [rsp+0x100], xmm8
366        movdqa  xmm8, xmm5
367        psrld   xmm8, 7
368        pslld   xmm5, 25
369        por     xmm5, xmm8
370        movdqa  xmm8, xmm6
371        psrld   xmm8, 7
372        pslld   xmm6, 25
373        por     xmm6, xmm8
374        movdqa  xmm8, xmm7
375        psrld   xmm8, 7
376        pslld   xmm7, 25
377        por     xmm7, xmm8
378        movdqa  xmm8, xmm4
379        psrld   xmm8, 7
380        pslld   xmm4, 25
381        por     xmm4, xmm8
382        paddd   xmm0, xmmword ptr [rsp+0x20]
383        paddd   xmm1, xmmword ptr [rsp+0x30]
384        paddd   xmm2, xmmword ptr [rsp+0x70]
385        paddd   xmm3, xmmword ptr [rsp+0x40]
386        paddd   xmm0, xmm4
387        paddd   xmm1, xmm5
388        paddd   xmm2, xmm6
389        paddd   xmm3, xmm7
390        pxor    xmm12, xmm0
391        pxor    xmm13, xmm1
392        pxor    xmm14, xmm2
393        pxor    xmm15, xmm3
394        pshuflw xmm12, xmm12, 0xB1
395        pshufhw xmm12, xmm12, 0xB1
396        pshuflw xmm13, xmm13, 0xB1
397        pshufhw xmm13, xmm13, 0xB1
398        pshuflw xmm14, xmm14, 0xB1
399        pshufhw xmm14, xmm14, 0xB1
400        pshuflw xmm15, xmm15, 0xB1
401        pshufhw xmm15, xmm15, 0xB1
402        movdqa  xmm8, xmmword ptr [rsp+0x100]
403        paddd   xmm8, xmm12
404        paddd   xmm9, xmm13
405        paddd   xmm10, xmm14
406        paddd   xmm11, xmm15
407        pxor    xmm4, xmm8
408        pxor    xmm5, xmm9
409        pxor    xmm6, xmm10
410        pxor    xmm7, xmm11
411        movdqa  xmmword ptr [rsp+0x100], xmm8
412        movdqa  xmm8, xmm4
413        psrld   xmm8, 12
414        pslld   xmm4, 20
415        por     xmm4, xmm8
416        movdqa  xmm8, xmm5
417        psrld   xmm8, 12
418        pslld   xmm5, 20
419        por     xmm5, xmm8
420        movdqa  xmm8, xmm6
421        psrld   xmm8, 12
422        pslld   xmm6, 20
423        por     xmm6, xmm8
424        movdqa  xmm8, xmm7
425        psrld   xmm8, 12
426        pslld   xmm7, 20
427        por     xmm7, xmm8
428        paddd   xmm0, xmmword ptr [rsp+0x60]
429        paddd   xmm1, xmmword ptr [rsp+0xA0]
430        paddd   xmm2, xmmword ptr [rsp]
431        paddd   xmm3, xmmword ptr [rsp+0xD0]
432        paddd   xmm0, xmm4
433        paddd   xmm1, xmm5
434        paddd   xmm2, xmm6
435        paddd   xmm3, xmm7
436        pxor    xmm12, xmm0
437        pxor    xmm13, xmm1
438        pxor    xmm14, xmm2
439        pxor    xmm15, xmm3
440        movdqa  xmm8, xmm12
441        psrld   xmm12, 8
442        pslld   xmm8, 24
443        pxor    xmm12, xmm8
444        movdqa  xmm8, xmm13
445        psrld   xmm13, 8
446        pslld   xmm8, 24
447        pxor    xmm13, xmm8
448        movdqa  xmm8, xmm14
449        psrld   xmm14, 8
450        pslld   xmm8, 24
451        pxor    xmm14, xmm8
452        movdqa  xmm8, xmm15
453        psrld   xmm15, 8
454        pslld   xmm8, 24
455        pxor    xmm15, xmm8
456        movdqa  xmm8, xmmword ptr [rsp+0x100]
457        paddd   xmm8, xmm12
458        paddd   xmm9, xmm13
459        paddd   xmm10, xmm14
460        paddd   xmm11, xmm15
461        pxor    xmm4, xmm8
462        pxor    xmm5, xmm9
463        pxor    xmm6, xmm10
464        pxor    xmm7, xmm11
465        movdqa  xmmword ptr [rsp+0x100], xmm8
466        movdqa  xmm8, xmm4
467        psrld   xmm8, 7
468        pslld   xmm4, 25
469        por     xmm4, xmm8
470        movdqa  xmm8, xmm5
471        psrld   xmm8, 7
472        pslld   xmm5, 25
473        por     xmm5, xmm8
474        movdqa  xmm8, xmm6
475        psrld   xmm8, 7
476        pslld   xmm6, 25
477        por     xmm6, xmm8
478        movdqa  xmm8, xmm7
479        psrld   xmm8, 7
480        pslld   xmm7, 25
481        por     xmm7, xmm8
482        paddd   xmm0, xmmword ptr [rsp+0x10]
483        paddd   xmm1, xmmword ptr [rsp+0xC0]
484        paddd   xmm2, xmmword ptr [rsp+0x90]
485        paddd   xmm3, xmmword ptr [rsp+0xF0]
486        paddd   xmm0, xmm5
487        paddd   xmm1, xmm6
488        paddd   xmm2, xmm7
489        paddd   xmm3, xmm4
490        pxor    xmm15, xmm0
491        pxor    xmm12, xmm1
492        pxor    xmm13, xmm2
493        pxor    xmm14, xmm3
494        pshuflw xmm15, xmm15, 0xB1
495        pshufhw xmm15, xmm15, 0xB1
496        pshuflw xmm12, xmm12, 0xB1
497        pshufhw xmm12, xmm12, 0xB1
498        pshuflw xmm13, xmm13, 0xB1
499        pshufhw xmm13, xmm13, 0xB1
500        pshuflw xmm14, xmm14, 0xB1
501        pshufhw xmm14, xmm14, 0xB1
502        paddd   xmm10, xmm15
503        paddd   xmm11, xmm12
504        movdqa  xmm8, xmmword ptr [rsp+0x100]
505        paddd   xmm8, xmm13
506        paddd   xmm9, xmm14
507        pxor    xmm5, xmm10
508        pxor    xmm6, xmm11
509        pxor    xmm7, xmm8
510        pxor    xmm4, xmm9
511        movdqa  xmmword ptr [rsp+0x100], xmm8
512        movdqa  xmm8, xmm5
513        psrld   xmm8, 12
514        pslld   xmm5, 20
515        por     xmm5, xmm8
516        movdqa  xmm8, xmm6
517        psrld   xmm8, 12
518        pslld   xmm6, 20
519        por     xmm6, xmm8
520        movdqa  xmm8, xmm7
521        psrld   xmm8, 12
522        pslld   xmm7, 20
523        por     xmm7, xmm8
524        movdqa  xmm8, xmm4
525        psrld   xmm8, 12
526        pslld   xmm4, 20
527        por     xmm4, xmm8
528        paddd   xmm0, xmmword ptr [rsp+0xB0]
529        paddd   xmm1, xmmword ptr [rsp+0x50]
530        paddd   xmm2, xmmword ptr [rsp+0xE0]
531        paddd   xmm3, xmmword ptr [rsp+0x80]
532        paddd   xmm0, xmm5
533        paddd   xmm1, xmm6
534        paddd   xmm2, xmm7
535        paddd   xmm3, xmm4
536        pxor    xmm15, xmm0
537        pxor    xmm12, xmm1
538        pxor    xmm13, xmm2
539        pxor    xmm14, xmm3
540        movdqa  xmm8, xmm15
541        psrld   xmm15, 8
542        pslld   xmm8, 24
543        pxor    xmm15, xmm8
544        movdqa  xmm8, xmm12
545        psrld   xmm12, 8
546        pslld   xmm8, 24
547        pxor    xmm12, xmm8
548        movdqa  xmm8, xmm13
549        psrld   xmm13, 8
550        pslld   xmm8, 24
551        pxor    xmm13, xmm8
552        movdqa  xmm8, xmm14
553        psrld   xmm14, 8
554        pslld   xmm8, 24
555        pxor    xmm14, xmm8
556        paddd   xmm10, xmm15
557        paddd   xmm11, xmm12
558        movdqa  xmm8, xmmword ptr [rsp+0x100]
559        paddd   xmm8, xmm13
560        paddd   xmm9, xmm14
561        pxor    xmm5, xmm10
562        pxor    xmm6, xmm11
563        pxor    xmm7, xmm8
564        pxor    xmm4, xmm9
565        movdqa  xmmword ptr [rsp+0x100], xmm8
566        movdqa  xmm8, xmm5
567        psrld   xmm8, 7
568        pslld   xmm5, 25
569        por     xmm5, xmm8
570        movdqa  xmm8, xmm6
571        psrld   xmm8, 7
572        pslld   xmm6, 25
573        por     xmm6, xmm8
574        movdqa  xmm8, xmm7
575        psrld   xmm8, 7
576        pslld   xmm7, 25
577        por     xmm7, xmm8
578        movdqa  xmm8, xmm4
579        psrld   xmm8, 7
580        pslld   xmm4, 25
581        por     xmm4, xmm8
582        paddd   xmm0, xmmword ptr [rsp+0x30]
583        paddd   xmm1, xmmword ptr [rsp+0xA0]
584        paddd   xmm2, xmmword ptr [rsp+0xD0]
585        paddd   xmm3, xmmword ptr [rsp+0x70]
586        paddd   xmm0, xmm4
587        paddd   xmm1, xmm5
588        paddd   xmm2, xmm6
589        paddd   xmm3, xmm7
590        pxor    xmm12, xmm0
591        pxor    xmm13, xmm1
592        pxor    xmm14, xmm2
593        pxor    xmm15, xmm3
594        pshuflw xmm12, xmm12, 0xB1
595        pshufhw xmm12, xmm12, 0xB1
596        pshuflw xmm13, xmm13, 0xB1
597        pshufhw xmm13, xmm13, 0xB1
598        pshuflw xmm14, xmm14, 0xB1
599        pshufhw xmm14, xmm14, 0xB1
600        pshuflw xmm15, xmm15, 0xB1
601        pshufhw xmm15, xmm15, 0xB1
602        movdqa  xmm8, xmmword ptr [rsp+0x100]
603        paddd   xmm8, xmm12
604        paddd   xmm9, xmm13
605        paddd   xmm10, xmm14
606        paddd   xmm11, xmm15
607        pxor    xmm4, xmm8
608        pxor    xmm5, xmm9
609        pxor    xmm6, xmm10
610        pxor    xmm7, xmm11
611        movdqa  xmmword ptr [rsp+0x100], xmm8
612        movdqa  xmm8, xmm4
613        psrld   xmm8, 12
614        pslld   xmm4, 20
615        por     xmm4, xmm8
616        movdqa  xmm8, xmm5
617        psrld   xmm8, 12
618        pslld   xmm5, 20
619        por     xmm5, xmm8
620        movdqa  xmm8, xmm6
621        psrld   xmm8, 12
622        pslld   xmm6, 20
623        por     xmm6, xmm8
624        movdqa  xmm8, xmm7
625        psrld   xmm8, 12
626        pslld   xmm7, 20
627        por     xmm7, xmm8
628        paddd   xmm0, xmmword ptr [rsp+0x40]
629        paddd   xmm1, xmmword ptr [rsp+0xC0]
630        paddd   xmm2, xmmword ptr [rsp+0x20]
631        paddd   xmm3, xmmword ptr [rsp+0xE0]
632        paddd   xmm0, xmm4
633        paddd   xmm1, xmm5
634        paddd   xmm2, xmm6
635        paddd   xmm3, xmm7
636        pxor    xmm12, xmm0
637        pxor    xmm13, xmm1
638        pxor    xmm14, xmm2
639        pxor    xmm15, xmm3
640        movdqa  xmm8, xmm12
641        psrld   xmm12, 8
642        pslld   xmm8, 24
643        pxor    xmm12, xmm8
644        movdqa  xmm8, xmm13
645        psrld   xmm13, 8
646        pslld   xmm8, 24
647        pxor    xmm13, xmm8
648        movdqa  xmm8, xmm14
649        psrld   xmm14, 8
650        pslld   xmm8, 24
651        pxor    xmm14, xmm8
652        movdqa  xmm8, xmm15
653        psrld   xmm15, 8
654        pslld   xmm8, 24
655        pxor    xmm15, xmm8
656        movdqa  xmm8, xmmword ptr [rsp+0x100]
657        paddd   xmm8, xmm12
658        paddd   xmm9, xmm13
659        paddd   xmm10, xmm14
660        paddd   xmm11, xmm15
661        pxor    xmm4, xmm8
662        pxor    xmm5, xmm9
663        pxor    xmm6, xmm10
664        pxor    xmm7, xmm11
665        movdqa  xmmword ptr [rsp+0x100], xmm8
666        movdqa  xmm8, xmm4
667        psrld   xmm8, 7
668        pslld   xmm4, 25
669        por     xmm4, xmm8
670        movdqa  xmm8, xmm5
671        psrld   xmm8, 7
672        pslld   xmm5, 25
673        por     xmm5, xmm8
674        movdqa  xmm8, xmm6
675        psrld   xmm8, 7
676        pslld   xmm6, 25
677        por     xmm6, xmm8
678        movdqa  xmm8, xmm7
679        psrld   xmm8, 7
680        pslld   xmm7, 25
681        por     xmm7, xmm8
682        paddd   xmm0, xmmword ptr [rsp+0x60]
683        paddd   xmm1, xmmword ptr [rsp+0x90]
684        paddd   xmm2, xmmword ptr [rsp+0xB0]
685        paddd   xmm3, xmmword ptr [rsp+0x80]
686        paddd   xmm0, xmm5
687        paddd   xmm1, xmm6
688        paddd   xmm2, xmm7
689        paddd   xmm3, xmm4
690        pxor    xmm15, xmm0
691        pxor    xmm12, xmm1
692        pxor    xmm13, xmm2
693        pxor    xmm14, xmm3
694        pshuflw xmm15, xmm15, 0xB1
695        pshufhw xmm15, xmm15, 0xB1
696        pshuflw xmm12, xmm12, 0xB1
697        pshufhw xmm12, xmm12, 0xB1
698        pshuflw xmm13, xmm13, 0xB1
699        pshufhw xmm13, xmm13, 0xB1
700        pshuflw xmm14, xmm14, 0xB1
701        pshufhw xmm14, xmm14, 0xB1
702        paddd   xmm10, xmm15
703        paddd   xmm11, xmm12
704        movdqa  xmm8, xmmword ptr [rsp+0x100]
705        paddd   xmm8, xmm13
706        paddd   xmm9, xmm14
707        pxor    xmm5, xmm10
708        pxor    xmm6, xmm11
709        pxor    xmm7, xmm8
710        pxor    xmm4, xmm9
711        movdqa  xmmword ptr [rsp+0x100], xmm8
712        movdqa  xmm8, xmm5
713        psrld   xmm8, 12
714        pslld   xmm5, 20
715        por     xmm5, xmm8
716        movdqa  xmm8, xmm6
717        psrld   xmm8, 12
718        pslld   xmm6, 20
719        por     xmm6, xmm8
720        movdqa  xmm8, xmm7
721        psrld   xmm8, 12
722        pslld   xmm7, 20
723        por     xmm7, xmm8
724        movdqa  xmm8, xmm4
725        psrld   xmm8, 12
726        pslld   xmm4, 20
727        por     xmm4, xmm8
728        paddd   xmm0, xmmword ptr [rsp+0x50]
729        paddd   xmm1, xmmword ptr [rsp]
730        paddd   xmm2, xmmword ptr [rsp+0xF0]
731        paddd   xmm3, xmmword ptr [rsp+0x10]
732        paddd   xmm0, xmm5
733        paddd   xmm1, xmm6
734        paddd   xmm2, xmm7
735        paddd   xmm3, xmm4
736        pxor    xmm15, xmm0
737        pxor    xmm12, xmm1
738        pxor    xmm13, xmm2
739        pxor    xmm14, xmm3
740        movdqa  xmm8, xmm15
741        psrld   xmm15, 8
742        pslld   xmm8, 24
743        pxor    xmm15, xmm8
744        movdqa  xmm8, xmm12
745        psrld   xmm12, 8
746        pslld   xmm8, 24
747        pxor    xmm12, xmm8
748        movdqa  xmm8, xmm13
749        psrld   xmm13, 8
750        pslld   xmm8, 24
751        pxor    xmm13, xmm8
752        movdqa  xmm8, xmm14
753        psrld   xmm14, 8
754        pslld   xmm8, 24
755        pxor    xmm14, xmm8
756        paddd   xmm10, xmm15
757        paddd   xmm11, xmm12
758        movdqa  xmm8, xmmword ptr [rsp+0x100]
759        paddd   xmm8, xmm13
760        paddd   xmm9, xmm14
761        pxor    xmm5, xmm10
762        pxor    xmm6, xmm11
763        pxor    xmm7, xmm8
764        pxor    xmm4, xmm9
765        movdqa  xmmword ptr [rsp+0x100], xmm8
766        movdqa  xmm8, xmm5
767        psrld   xmm8, 7
768        pslld   xmm5, 25
769        por     xmm5, xmm8
770        movdqa  xmm8, xmm6
771        psrld   xmm8, 7
772        pslld   xmm6, 25
773        por     xmm6, xmm8
774        movdqa  xmm8, xmm7
775        psrld   xmm8, 7
776        pslld   xmm7, 25
777        por     xmm7, xmm8
778        movdqa  xmm8, xmm4
779        psrld   xmm8, 7
780        pslld   xmm4, 25
781        por     xmm4, xmm8
782        paddd   xmm0, xmmword ptr [rsp+0xA0]
783        paddd   xmm1, xmmword ptr [rsp+0xC0]
784        paddd   xmm2, xmmword ptr [rsp+0xE0]
785        paddd   xmm3, xmmword ptr [rsp+0xD0]
786        paddd   xmm0, xmm4
787        paddd   xmm1, xmm5
788        paddd   xmm2, xmm6
789        paddd   xmm3, xmm7
790        pxor    xmm12, xmm0
791        pxor    xmm13, xmm1
792        pxor    xmm14, xmm2
793        pxor    xmm15, xmm3
794        pshuflw xmm12, xmm12, 0xB1
795        pshufhw xmm12, xmm12, 0xB1
796        pshuflw xmm13, xmm13, 0xB1
797        pshufhw xmm13, xmm13, 0xB1
798        pshuflw xmm14, xmm14, 0xB1
799        pshufhw xmm14, xmm14, 0xB1
800        pshuflw xmm15, xmm15, 0xB1
801        pshufhw xmm15, xmm15, 0xB1
802        movdqa  xmm8, xmmword ptr [rsp+0x100]
803        paddd   xmm8, xmm12
804        paddd   xmm9, xmm13
805        paddd   xmm10, xmm14
806        paddd   xmm11, xmm15
807        pxor    xmm4, xmm8
808        pxor    xmm5, xmm9
809        pxor    xmm6, xmm10
810        pxor    xmm7, xmm11
811        movdqa  xmmword ptr [rsp+0x100], xmm8
812        movdqa  xmm8, xmm4
813        psrld   xmm8, 12
814        pslld   xmm4, 20
815        por     xmm4, xmm8
816        movdqa  xmm8, xmm5
817        psrld   xmm8, 12
818        pslld   xmm5, 20
819        por     xmm5, xmm8
820        movdqa  xmm8, xmm6
821        psrld   xmm8, 12
822        pslld   xmm6, 20
823        por     xmm6, xmm8
824        movdqa  xmm8, xmm7
825        psrld   xmm8, 12
826        pslld   xmm7, 20
827        por     xmm7, xmm8
828        paddd   xmm0, xmmword ptr [rsp+0x70]
829        paddd   xmm1, xmmword ptr [rsp+0x90]
830        paddd   xmm2, xmmword ptr [rsp+0x30]
831        paddd   xmm3, xmmword ptr [rsp+0xF0]
832        paddd   xmm0, xmm4
833        paddd   xmm1, xmm5
834        paddd   xmm2, xmm6
835        paddd   xmm3, xmm7
836        pxor    xmm12, xmm0
837        pxor    xmm13, xmm1
838        pxor    xmm14, xmm2
839        pxor    xmm15, xmm3
840        movdqa  xmm8, xmm12
841        psrld   xmm12, 8
842        pslld   xmm8, 24
843        pxor    xmm12, xmm8
844        movdqa  xmm8, xmm13
845        psrld   xmm13, 8
846        pslld   xmm8, 24
847        pxor    xmm13, xmm8
848        movdqa  xmm8, xmm14
849        psrld   xmm14, 8
850        pslld   xmm8, 24
851        pxor    xmm14, xmm8
852        movdqa  xmm8, xmm15
853        psrld   xmm15, 8
854        pslld   xmm8, 24
855        pxor    xmm15, xmm8
856        movdqa  xmm8, xmmword ptr [rsp+0x100]
857        paddd   xmm8, xmm12
858        paddd   xmm9, xmm13
859        paddd   xmm10, xmm14
860        paddd   xmm11, xmm15
861        pxor    xmm4, xmm8
862        pxor    xmm5, xmm9
863        pxor    xmm6, xmm10
864        pxor    xmm7, xmm11
865        movdqa  xmmword ptr [rsp+0x100], xmm8
866        movdqa  xmm8, xmm4
867        psrld   xmm8, 7
868        pslld   xmm4, 25
869        por     xmm4, xmm8
870        movdqa  xmm8, xmm5
871        psrld   xmm8, 7
872        pslld   xmm5, 25
873        por     xmm5, xmm8
874        movdqa  xmm8, xmm6
875        psrld   xmm8, 7
876        pslld   xmm6, 25
877        por     xmm6, xmm8
878        movdqa  xmm8, xmm7
879        psrld   xmm8, 7
880        pslld   xmm7, 25
881        por     xmm7, xmm8
882        paddd   xmm0, xmmword ptr [rsp+0x40]
883        paddd   xmm1, xmmword ptr [rsp+0xB0]
884        paddd   xmm2, xmmword ptr [rsp+0x50]
885        paddd   xmm3, xmmword ptr [rsp+0x10]
886        paddd   xmm0, xmm5
887        paddd   xmm1, xmm6
888        paddd   xmm2, xmm7
889        paddd   xmm3, xmm4
890        pxor    xmm15, xmm0
891        pxor    xmm12, xmm1
892        pxor    xmm13, xmm2
893        pxor    xmm14, xmm3
894        pshuflw xmm15, xmm15, 0xB1
895        pshufhw xmm15, xmm15, 0xB1
896        pshuflw xmm12, xmm12, 0xB1
897        pshufhw xmm12, xmm12, 0xB1
898        pshuflw xmm13, xmm13, 0xB1
899        pshufhw xmm13, xmm13, 0xB1
900        pshuflw xmm14, xmm14, 0xB1
901        pshufhw xmm14, xmm14, 0xB1
902        paddd   xmm10, xmm15
903        paddd   xmm11, xmm12
904        movdqa  xmm8, xmmword ptr [rsp+0x100]
905        paddd   xmm8, xmm13
906        paddd   xmm9, xmm14
907        pxor    xmm5, xmm10
908        pxor    xmm6, xmm11
909        pxor    xmm7, xmm8
910        pxor    xmm4, xmm9
911        movdqa  xmmword ptr [rsp+0x100], xmm8
912        movdqa  xmm8, xmm5
913        psrld   xmm8, 12
914        pslld   xmm5, 20
915        por     xmm5, xmm8
916        movdqa  xmm8, xmm6
917        psrld   xmm8, 12
918        pslld   xmm6, 20
919        por     xmm6, xmm8
920        movdqa  xmm8, xmm7
921        psrld   xmm8, 12
922        pslld   xmm7, 20
923        por     xmm7, xmm8
924        movdqa  xmm8, xmm4
925        psrld   xmm8, 12
926        pslld   xmm4, 20
927        por     xmm4, xmm8
928        paddd   xmm0, xmmword ptr [rsp]
929        paddd   xmm1, xmmword ptr [rsp+0x20]
930        paddd   xmm2, xmmword ptr [rsp+0x80]
931        paddd   xmm3, xmmword ptr [rsp+0x60]
932        paddd   xmm0, xmm5
933        paddd   xmm1, xmm6
934        paddd   xmm2, xmm7
935        paddd   xmm3, xmm4
936        pxor    xmm15, xmm0
937        pxor    xmm12, xmm1
938        pxor    xmm13, xmm2
939        pxor    xmm14, xmm3
940        movdqa  xmm8, xmm15
941        psrld   xmm15, 8
942        pslld   xmm8, 24
943        pxor    xmm15, xmm8
944        movdqa  xmm8, xmm12
945        psrld   xmm12, 8
946        pslld   xmm8, 24
947        pxor    xmm12, xmm8
948        movdqa  xmm8, xmm13
949        psrld   xmm13, 8
950        pslld   xmm8, 24
951        pxor    xmm13, xmm8
952        movdqa  xmm8, xmm14
953        psrld   xmm14, 8
954        pslld   xmm8, 24
955        pxor    xmm14, xmm8
956        paddd   xmm10, xmm15
957        paddd   xmm11, xmm12
958        movdqa  xmm8, xmmword ptr [rsp+0x100]
959        paddd   xmm8, xmm13
960        paddd   xmm9, xmm14
961        pxor    xmm5, xmm10
962        pxor    xmm6, xmm11
963        pxor    xmm7, xmm8
964        pxor    xmm4, xmm9
965        movdqa  xmmword ptr [rsp+0x100], xmm8
966        movdqa  xmm8, xmm5
967        psrld   xmm8, 7
968        pslld   xmm5, 25
969        por     xmm5, xmm8
970        movdqa  xmm8, xmm6
971        psrld   xmm8, 7
972        pslld   xmm6, 25
973        por     xmm6, xmm8
974        movdqa  xmm8, xmm7
975        psrld   xmm8, 7
976        pslld   xmm7, 25
977        por     xmm7, xmm8
978        movdqa  xmm8, xmm4
979        psrld   xmm8, 7
980        pslld   xmm4, 25
981        por     xmm4, xmm8
982        paddd   xmm0, xmmword ptr [rsp+0xC0]
983        paddd   xmm1, xmmword ptr [rsp+0x90]
984        paddd   xmm2, xmmword ptr [rsp+0xF0]
985        paddd   xmm3, xmmword ptr [rsp+0xE0]
986        paddd   xmm0, xmm4
987        paddd   xmm1, xmm5
988        paddd   xmm2, xmm6
989        paddd   xmm3, xmm7
990        pxor    xmm12, xmm0
991        pxor    xmm13, xmm1
992        pxor    xmm14, xmm2
993        pxor    xmm15, xmm3
994        pshuflw xmm12, xmm12, 0xB1
995        pshufhw xmm12, xmm12, 0xB1
996        pshuflw xmm13, xmm13, 0xB1
997        pshufhw xmm13, xmm13, 0xB1
998        pshuflw xmm14, xmm14, 0xB1
999        pshufhw xmm14, xmm14, 0xB1
1000        pshuflw xmm15, xmm15, 0xB1
1001        pshufhw xmm15, xmm15, 0xB1
1002        movdqa  xmm8, xmmword ptr [rsp+0x100]
1003        paddd   xmm8, xmm12
1004        paddd   xmm9, xmm13
1005        paddd   xmm10, xmm14
1006        paddd   xmm11, xmm15
1007        pxor    xmm4, xmm8
1008        pxor    xmm5, xmm9
1009        pxor    xmm6, xmm10
1010        pxor    xmm7, xmm11
1011        movdqa  xmmword ptr [rsp+0x100], xmm8
1012        movdqa  xmm8, xmm4
1013        psrld   xmm8, 12
1014        pslld   xmm4, 20
1015        por     xmm4, xmm8
1016        movdqa  xmm8, xmm5
1017        psrld   xmm8, 12
1018        pslld   xmm5, 20
1019        por     xmm5, xmm8
1020        movdqa  xmm8, xmm6
1021        psrld   xmm8, 12
1022        pslld   xmm6, 20
1023        por     xmm6, xmm8
1024        movdqa  xmm8, xmm7
1025        psrld   xmm8, 12
1026        pslld   xmm7, 20
1027        por     xmm7, xmm8
1028        paddd   xmm0, xmmword ptr [rsp+0xD0]
1029        paddd   xmm1, xmmword ptr [rsp+0xB0]
1030        paddd   xmm2, xmmword ptr [rsp+0xA0]
1031        paddd   xmm3, xmmword ptr [rsp+0x80]
1032        paddd   xmm0, xmm4
1033        paddd   xmm1, xmm5
1034        paddd   xmm2, xmm6
1035        paddd   xmm3, xmm7
1036        pxor    xmm12, xmm0
1037        pxor    xmm13, xmm1
1038        pxor    xmm14, xmm2
1039        pxor    xmm15, xmm3
1040        movdqa  xmm8, xmm12
1041        psrld   xmm12, 8
1042        pslld   xmm8, 24
1043        pxor    xmm12, xmm8
1044        movdqa  xmm8, xmm13
1045        psrld   xmm13, 8
1046        pslld   xmm8, 24
1047        pxor    xmm13, xmm8
1048        movdqa  xmm8, xmm14
1049        psrld   xmm14, 8
1050        pslld   xmm8, 24
1051        pxor    xmm14, xmm8
1052        movdqa  xmm8, xmm15
1053        psrld   xmm15, 8
1054        pslld   xmm8, 24
1055        pxor    xmm15, xmm8
1056        movdqa  xmm8, xmmword ptr [rsp+0x100]
1057        paddd   xmm8, xmm12
1058        paddd   xmm9, xmm13
1059        paddd   xmm10, xmm14
1060        paddd   xmm11, xmm15
1061        pxor    xmm4, xmm8
1062        pxor    xmm5, xmm9
1063        pxor    xmm6, xmm10
1064        pxor    xmm7, xmm11
1065        movdqa  xmmword ptr [rsp+0x100], xmm8
1066        movdqa  xmm8, xmm4
1067        psrld   xmm8, 7
1068        pslld   xmm4, 25
1069        por     xmm4, xmm8
1070        movdqa  xmm8, xmm5
1071        psrld   xmm8, 7
1072        pslld   xmm5, 25
1073        por     xmm5, xmm8
1074        movdqa  xmm8, xmm6
1075        psrld   xmm8, 7
1076        pslld   xmm6, 25
1077        por     xmm6, xmm8
1078        movdqa  xmm8, xmm7
1079        psrld   xmm8, 7
1080        pslld   xmm7, 25
1081        por     xmm7, xmm8
1082        paddd   xmm0, xmmword ptr [rsp+0x70]
1083        paddd   xmm1, xmmword ptr [rsp+0x50]
1084        paddd   xmm2, xmmword ptr [rsp]
1085        paddd   xmm3, xmmword ptr [rsp+0x60]
1086        paddd   xmm0, xmm5
1087        paddd   xmm1, xmm6
1088        paddd   xmm2, xmm7
1089        paddd   xmm3, xmm4
1090        pxor    xmm15, xmm0
1091        pxor    xmm12, xmm1
1092        pxor    xmm13, xmm2
1093        pxor    xmm14, xmm3
1094        pshuflw xmm15, xmm15, 0xB1
1095        pshufhw xmm15, xmm15, 0xB1
1096        pshuflw xmm12, xmm12, 0xB1
1097        pshufhw xmm12, xmm12, 0xB1
1098        pshuflw xmm13, xmm13, 0xB1
1099        pshufhw xmm13, xmm13, 0xB1
1100        pshuflw xmm14, xmm14, 0xB1
1101        pshufhw xmm14, xmm14, 0xB1
1102        paddd   xmm10, xmm15
1103        paddd   xmm11, xmm12
1104        movdqa  xmm8, xmmword ptr [rsp+0x100]
1105        paddd   xmm8, xmm13
1106        paddd   xmm9, xmm14
1107        pxor    xmm5, xmm10
1108        pxor    xmm6, xmm11
1109        pxor    xmm7, xmm8
1110        pxor    xmm4, xmm9
1111        movdqa  xmmword ptr [rsp+0x100], xmm8
1112        movdqa  xmm8, xmm5
1113        psrld   xmm8, 12
1114        pslld   xmm5, 20
1115        por     xmm5, xmm8
1116        movdqa  xmm8, xmm6
1117        psrld   xmm8, 12
1118        pslld   xmm6, 20
1119        por     xmm6, xmm8
1120        movdqa  xmm8, xmm7
1121        psrld   xmm8, 12
1122        pslld   xmm7, 20
1123        por     xmm7, xmm8
1124        movdqa  xmm8, xmm4
1125        psrld   xmm8, 12
1126        pslld   xmm4, 20
1127        por     xmm4, xmm8
1128        paddd   xmm0, xmmword ptr [rsp+0x20]
1129        paddd   xmm1, xmmword ptr [rsp+0x30]
1130        paddd   xmm2, xmmword ptr [rsp+0x10]
1131        paddd   xmm3, xmmword ptr [rsp+0x40]
1132        paddd   xmm0, xmm5
1133        paddd   xmm1, xmm6
1134        paddd   xmm2, xmm7
1135        paddd   xmm3, xmm4
1136        pxor    xmm15, xmm0
1137        pxor    xmm12, xmm1
1138        pxor    xmm13, xmm2
1139        pxor    xmm14, xmm3
1140        movdqa  xmm8, xmm15
1141        psrld   xmm15, 8
1142        pslld   xmm8, 24
1143        pxor    xmm15, xmm8
1144        movdqa  xmm8, xmm12
1145        psrld   xmm12, 8
1146        pslld   xmm8, 24
1147        pxor    xmm12, xmm8
1148        movdqa  xmm8, xmm13
1149        psrld   xmm13, 8
1150        pslld   xmm8, 24
1151        pxor    xmm13, xmm8
1152        movdqa  xmm8, xmm14
1153        psrld   xmm14, 8
1154        pslld   xmm8, 24
1155        pxor    xmm14, xmm8
1156        paddd   xmm10, xmm15
1157        paddd   xmm11, xmm12
1158        movdqa  xmm8, xmmword ptr [rsp+0x100]
1159        paddd   xmm8, xmm13
1160        paddd   xmm9, xmm14
1161        pxor    xmm5, xmm10
1162        pxor    xmm6, xmm11
1163        pxor    xmm7, xmm8
1164        pxor    xmm4, xmm9
1165        movdqa  xmmword ptr [rsp+0x100], xmm8
1166        movdqa  xmm8, xmm5
1167        psrld   xmm8, 7
1168        pslld   xmm5, 25
1169        por     xmm5, xmm8
1170        movdqa  xmm8, xmm6
1171        psrld   xmm8, 7
1172        pslld   xmm6, 25
1173        por     xmm6, xmm8
1174        movdqa  xmm8, xmm7
1175        psrld   xmm8, 7
1176        pslld   xmm7, 25
1177        por     xmm7, xmm8
1178        movdqa  xmm8, xmm4
1179        psrld   xmm8, 7
1180        pslld   xmm4, 25
1181        por     xmm4, xmm8
1182        paddd   xmm0, xmmword ptr [rsp+0x90]
1183        paddd   xmm1, xmmword ptr [rsp+0xB0]
1184        paddd   xmm2, xmmword ptr [rsp+0x80]
1185        paddd   xmm3, xmmword ptr [rsp+0xF0]
1186        paddd   xmm0, xmm4
1187        paddd   xmm1, xmm5
1188        paddd   xmm2, xmm6
1189        paddd   xmm3, xmm7
1190        pxor    xmm12, xmm0
1191        pxor    xmm13, xmm1
1192        pxor    xmm14, xmm2
1193        pxor    xmm15, xmm3
1194        pshuflw xmm12, xmm12, 0xB1
1195        pshufhw xmm12, xmm12, 0xB1
1196        pshuflw xmm13, xmm13, 0xB1
1197        pshufhw xmm13, xmm13, 0xB1
1198        pshuflw xmm14, xmm14, 0xB1
1199        pshufhw xmm14, xmm14, 0xB1
1200        pshuflw xmm15, xmm15, 0xB1
1201        pshufhw xmm15, xmm15, 0xB1
1202        movdqa  xmm8, xmmword ptr [rsp+0x100]
1203        paddd   xmm8, xmm12
1204        paddd   xmm9, xmm13
1205        paddd   xmm10, xmm14
1206        paddd   xmm11, xmm15
1207        pxor    xmm4, xmm8
1208        pxor    xmm5, xmm9
1209        pxor    xmm6, xmm10
1210        pxor    xmm7, xmm11
1211        movdqa  xmmword ptr [rsp+0x100], xmm8
1212        movdqa  xmm8, xmm4
1213        psrld   xmm8, 12
1214        pslld   xmm4, 20
1215        por     xmm4, xmm8
1216        movdqa  xmm8, xmm5
1217        psrld   xmm8, 12
1218        pslld   xmm5, 20
1219        por     xmm5, xmm8
1220        movdqa  xmm8, xmm6
1221        psrld   xmm8, 12
1222        pslld   xmm6, 20
1223        por     xmm6, xmm8
1224        movdqa  xmm8, xmm7
1225        psrld   xmm8, 12
1226        pslld   xmm7, 20
1227        por     xmm7, xmm8
1228        paddd   xmm0, xmmword ptr [rsp+0xE0]
1229        paddd   xmm1, xmmword ptr [rsp+0x50]
1230        paddd   xmm2, xmmword ptr [rsp+0xC0]
1231        paddd   xmm3, xmmword ptr [rsp+0x10]
1232        paddd   xmm0, xmm4
1233        paddd   xmm1, xmm5
1234        paddd   xmm2, xmm6
1235        paddd   xmm3, xmm7
1236        pxor    xmm12, xmm0
1237        pxor    xmm13, xmm1
1238        pxor    xmm14, xmm2
1239        pxor    xmm15, xmm3
1240        movdqa  xmm8, xmm12
1241        psrld   xmm12, 8
1242        pslld   xmm8, 24
1243        pxor    xmm12, xmm8
1244        movdqa  xmm8, xmm13
1245        psrld   xmm13, 8
1246        pslld   xmm8, 24
1247        pxor    xmm13, xmm8
1248        movdqa  xmm8, xmm14
1249        psrld   xmm14, 8
1250        pslld   xmm8, 24
1251        pxor    xmm14, xmm8
1252        movdqa  xmm8, xmm15
1253        psrld   xmm15, 8
1254        pslld   xmm8, 24
1255        pxor    xmm15, xmm8
1256        movdqa  xmm8, xmmword ptr [rsp+0x100]
1257        paddd   xmm8, xmm12
1258        paddd   xmm9, xmm13
1259        paddd   xmm10, xmm14
1260        paddd   xmm11, xmm15
1261        pxor    xmm4, xmm8
1262        pxor    xmm5, xmm9
1263        pxor    xmm6, xmm10
1264        pxor    xmm7, xmm11
1265        movdqa  xmmword ptr [rsp+0x100], xmm8
1266        movdqa  xmm8, xmm4
1267        psrld   xmm8, 7
1268        pslld   xmm4, 25
1269        por     xmm4, xmm8
1270        movdqa  xmm8, xmm5
1271        psrld   xmm8, 7
1272        pslld   xmm5, 25
1273        por     xmm5, xmm8
1274        movdqa  xmm8, xmm6
1275        psrld   xmm8, 7
1276        pslld   xmm6, 25
1277        por     xmm6, xmm8
1278        movdqa  xmm8, xmm7
1279        psrld   xmm8, 7
1280        pslld   xmm7, 25
1281        por     xmm7, xmm8
1282        paddd   xmm0, xmmword ptr [rsp+0xD0]
1283        paddd   xmm1, xmmword ptr [rsp]
1284        paddd   xmm2, xmmword ptr [rsp+0x20]
1285        paddd   xmm3, xmmword ptr [rsp+0x40]
1286        paddd   xmm0, xmm5
1287        paddd   xmm1, xmm6
1288        paddd   xmm2, xmm7
1289        paddd   xmm3, xmm4
1290        pxor    xmm15, xmm0
1291        pxor    xmm12, xmm1
1292        pxor    xmm13, xmm2
1293        pxor    xmm14, xmm3
1294        pshuflw xmm15, xmm15, 0xB1
1295        pshufhw xmm15, xmm15, 0xB1
1296        pshuflw xmm12, xmm12, 0xB1
1297        pshufhw xmm12, xmm12, 0xB1
1298        pshuflw xmm13, xmm13, 0xB1
1299        pshufhw xmm13, xmm13, 0xB1
1300        pshuflw xmm14, xmm14, 0xB1
1301        pshufhw xmm14, xmm14, 0xB1
1302        paddd   xmm10, xmm15
1303        paddd   xmm11, xmm12
1304        movdqa  xmm8, xmmword ptr [rsp+0x100]
1305        paddd   xmm8, xmm13
1306        paddd   xmm9, xmm14
1307        pxor    xmm5, xmm10
1308        pxor    xmm6, xmm11
1309        pxor    xmm7, xmm8
1310        pxor    xmm4, xmm9
1311        movdqa  xmmword ptr [rsp+0x100], xmm8
1312        movdqa  xmm8, xmm5
1313        psrld   xmm8, 12
1314        pslld   xmm5, 20
1315        por     xmm5, xmm8
1316        movdqa  xmm8, xmm6
1317        psrld   xmm8, 12
1318        pslld   xmm6, 20
1319        por     xmm6, xmm8
1320        movdqa  xmm8, xmm7
1321        psrld   xmm8, 12
1322        pslld   xmm7, 20
1323        por     xmm7, xmm8
1324        movdqa  xmm8, xmm4
1325        psrld   xmm8, 12
1326        pslld   xmm4, 20
1327        por     xmm4, xmm8
1328        paddd   xmm0, xmmword ptr [rsp+0x30]
1329        paddd   xmm1, xmmword ptr [rsp+0xA0]
1330        paddd   xmm2, xmmword ptr [rsp+0x60]
1331        paddd   xmm3, xmmword ptr [rsp+0x70]
1332        paddd   xmm0, xmm5
1333        paddd   xmm1, xmm6
1334        paddd   xmm2, xmm7
1335        paddd   xmm3, xmm4
1336        pxor    xmm15, xmm0
1337        pxor    xmm12, xmm1
1338        pxor    xmm13, xmm2
1339        pxor    xmm14, xmm3
1340        movdqa  xmm8, xmm15
1341        psrld   xmm15, 8
1342        pslld   xmm8, 24
1343        pxor    xmm15, xmm8
1344        movdqa  xmm8, xmm12
1345        psrld   xmm12, 8
1346        pslld   xmm8, 24
1347        pxor    xmm12, xmm8
1348        movdqa  xmm8, xmm13
1349        psrld   xmm13, 8
1350        pslld   xmm8, 24
1351        pxor    xmm13, xmm8
1352        movdqa  xmm8, xmm14
1353        psrld   xmm14, 8
1354        pslld   xmm8, 24
1355        pxor    xmm14, xmm8
1356        paddd   xmm10, xmm15
1357        paddd   xmm11, xmm12
1358        movdqa  xmm8, xmmword ptr [rsp+0x100]
1359        paddd   xmm8, xmm13
1360        paddd   xmm9, xmm14
1361        pxor    xmm5, xmm10
1362        pxor    xmm6, xmm11
1363        pxor    xmm7, xmm8
1364        pxor    xmm4, xmm9
1365        movdqa  xmmword ptr [rsp+0x100], xmm8
1366        movdqa  xmm8, xmm5
1367        psrld   xmm8, 7
1368        pslld   xmm5, 25
1369        por     xmm5, xmm8
1370        movdqa  xmm8, xmm6
1371        psrld   xmm8, 7
1372        pslld   xmm6, 25
1373        por     xmm6, xmm8
1374        movdqa  xmm8, xmm7
1375        psrld   xmm8, 7
1376        pslld   xmm7, 25
1377        por     xmm7, xmm8
1378        movdqa  xmm8, xmm4
1379        psrld   xmm8, 7
1380        pslld   xmm4, 25
1381        por     xmm4, xmm8
1382        paddd   xmm0, xmmword ptr [rsp+0xB0]
1383        paddd   xmm1, xmmword ptr [rsp+0x50]
1384        paddd   xmm2, xmmword ptr [rsp+0x10]
1385        paddd   xmm3, xmmword ptr [rsp+0x80]
1386        paddd   xmm0, xmm4
1387        paddd   xmm1, xmm5
1388        paddd   xmm2, xmm6
1389        paddd   xmm3, xmm7
1390        pxor    xmm12, xmm0
1391        pxor    xmm13, xmm1
1392        pxor    xmm14, xmm2
1393        pxor    xmm15, xmm3
1394        pshuflw xmm12, xmm12, 0xB1
1395        pshufhw xmm12, xmm12, 0xB1
1396        pshuflw xmm13, xmm13, 0xB1
1397        pshufhw xmm13, xmm13, 0xB1
1398        pshuflw xmm14, xmm14, 0xB1
1399        pshufhw xmm14, xmm14, 0xB1
1400        pshuflw xmm15, xmm15, 0xB1
1401        pshufhw xmm15, xmm15, 0xB1
1402        movdqa  xmm8, xmmword ptr [rsp+0x100]
1403        paddd   xmm8, xmm12
1404        paddd   xmm9, xmm13
1405        paddd   xmm10, xmm14
1406        paddd   xmm11, xmm15
1407        pxor    xmm4, xmm8
1408        pxor    xmm5, xmm9
1409        pxor    xmm6, xmm10
1410        pxor    xmm7, xmm11
1411        movdqa  xmmword ptr [rsp+0x100], xmm8
1412        movdqa  xmm8, xmm4
1413        psrld   xmm8, 12
1414        pslld   xmm4, 20
1415        por     xmm4, xmm8
1416        movdqa  xmm8, xmm5
1417        psrld   xmm8, 12
1418        pslld   xmm5, 20
1419        por     xmm5, xmm8
1420        movdqa  xmm8, xmm6
1421        psrld   xmm8, 12
1422        pslld   xmm6, 20
1423        por     xmm6, xmm8
1424        movdqa  xmm8, xmm7
1425        psrld   xmm8, 12
1426        pslld   xmm7, 20
1427        por     xmm7, xmm8
1428        paddd   xmm0, xmmword ptr [rsp+0xF0]
1429        paddd   xmm1, xmmword ptr [rsp]
1430        paddd   xmm2, xmmword ptr [rsp+0x90]
1431        paddd   xmm3, xmmword ptr [rsp+0x60]
1432        paddd   xmm0, xmm4
1433        paddd   xmm1, xmm5
1434        paddd   xmm2, xmm6
1435        paddd   xmm3, xmm7
1436        pxor    xmm12, xmm0
1437        pxor    xmm13, xmm1
1438        pxor    xmm14, xmm2
1439        pxor    xmm15, xmm3
1440        movdqa  xmm8, xmm12
1441        psrld   xmm12, 8
1442        pslld   xmm8, 24
1443        pxor    xmm12, xmm8
1444        movdqa  xmm8, xmm13
1445        psrld   xmm13, 8
1446        pslld   xmm8, 24
1447        pxor    xmm13, xmm8
1448        movdqa  xmm8, xmm14
1449        psrld   xmm14, 8
1450        pslld   xmm8, 24
1451        pxor    xmm14, xmm8
1452        movdqa  xmm8, xmm15
1453        psrld   xmm15, 8
1454        pslld   xmm8, 24
1455        pxor    xmm15, xmm8
1456        movdqa  xmm8, xmmword ptr [rsp+0x100]
1457        paddd   xmm8, xmm12
1458        paddd   xmm9, xmm13
1459        paddd   xmm10, xmm14
1460        paddd   xmm11, xmm15
1461        pxor    xmm4, xmm8
1462        pxor    xmm5, xmm9
1463        pxor    xmm6, xmm10
1464        pxor    xmm7, xmm11
1465        movdqa  xmmword ptr [rsp+0x100], xmm8
1466        movdqa  xmm8, xmm4
1467        psrld   xmm8, 7
1468        pslld   xmm4, 25
1469        por     xmm4, xmm8
1470        movdqa  xmm8, xmm5
1471        psrld   xmm8, 7
1472        pslld   xmm5, 25
1473        por     xmm5, xmm8
1474        movdqa  xmm8, xmm6
1475        psrld   xmm8, 7
1476        pslld   xmm6, 25
1477        por     xmm6, xmm8
1478        movdqa  xmm8, xmm7
1479        psrld   xmm8, 7
1480        pslld   xmm7, 25
1481        por     xmm7, xmm8
1482        paddd   xmm0, xmmword ptr [rsp+0xE0]
1483        paddd   xmm1, xmmword ptr [rsp+0x20]
1484        paddd   xmm2, xmmword ptr [rsp+0x30]
1485        paddd   xmm3, xmmword ptr [rsp+0x70]
1486        paddd   xmm0, xmm5
1487        paddd   xmm1, xmm6
1488        paddd   xmm2, xmm7
1489        paddd   xmm3, xmm4
1490        pxor    xmm15, xmm0
1491        pxor    xmm12, xmm1
1492        pxor    xmm13, xmm2
1493        pxor    xmm14, xmm3
1494        pshuflw xmm15, xmm15, 0xB1
1495        pshufhw xmm15, xmm15, 0xB1
1496        pshuflw xmm12, xmm12, 0xB1
1497        pshufhw xmm12, xmm12, 0xB1
1498        pshuflw xmm13, xmm13, 0xB1
1499        pshufhw xmm13, xmm13, 0xB1
1500        pshuflw xmm14, xmm14, 0xB1
1501        pshufhw xmm14, xmm14, 0xB1
1502        paddd   xmm10, xmm15
1503        paddd   xmm11, xmm12
1504        movdqa  xmm8, xmmword ptr [rsp+0x100]
1505        paddd   xmm8, xmm13
1506        paddd   xmm9, xmm14
1507        pxor    xmm5, xmm10
1508        pxor    xmm6, xmm11
1509        pxor    xmm7, xmm8
1510        pxor    xmm4, xmm9
1511        movdqa  xmmword ptr [rsp+0x100], xmm8
1512        movdqa  xmm8, xmm5
1513        psrld   xmm8, 12
1514        pslld   xmm5, 20
1515        por     xmm5, xmm8
1516        movdqa  xmm8, xmm6
1517        psrld   xmm8, 12
1518        pslld   xmm6, 20
1519        por     xmm6, xmm8
1520        movdqa  xmm8, xmm7
1521        psrld   xmm8, 12
1522        pslld   xmm7, 20
1523        por     xmm7, xmm8
1524        movdqa  xmm8, xmm4
1525        psrld   xmm8, 12
1526        pslld   xmm4, 20
1527        por     xmm4, xmm8
1528        paddd   xmm0, xmmword ptr [rsp+0xA0]
1529        paddd   xmm1, xmmword ptr [rsp+0xC0]
1530        paddd   xmm2, xmmword ptr [rsp+0x40]
1531        paddd   xmm3, xmmword ptr [rsp+0xD0]
1532        paddd   xmm0, xmm5
1533        paddd   xmm1, xmm6
1534        paddd   xmm2, xmm7
1535        paddd   xmm3, xmm4
1536        pxor    xmm15, xmm0
1537        pxor    xmm12, xmm1
1538        pxor    xmm13, xmm2
1539        pxor    xmm14, xmm3
1540        movdqa  xmm8, xmm15
1541        psrld   xmm15, 8
1542        pslld   xmm8, 24
1543        pxor    xmm15, xmm8
1544        movdqa  xmm8, xmm12
1545        psrld   xmm12, 8
1546        pslld   xmm8, 24
1547        pxor    xmm12, xmm8
1548        movdqa  xmm8, xmm13
1549        psrld   xmm13, 8
1550        pslld   xmm8, 24
1551        pxor    xmm13, xmm8
1552        movdqa  xmm8, xmm14
1553        psrld   xmm14, 8
1554        pslld   xmm8, 24
1555        pxor    xmm14, xmm8
1556        paddd   xmm10, xmm15
1557        paddd   xmm11, xmm12
1558        movdqa  xmm8, xmmword ptr [rsp+0x100]
1559        paddd   xmm8, xmm13
1560        paddd   xmm9, xmm14
1561        pxor    xmm5, xmm10
1562        pxor    xmm6, xmm11
1563        pxor    xmm7, xmm8
1564        pxor    xmm4, xmm9
1565        pxor    xmm0, xmm8
1566        pxor    xmm1, xmm9
1567        pxor    xmm2, xmm10
1568        pxor    xmm3, xmm11
1569        movdqa  xmm8, xmm5
1570        psrld   xmm8, 7
1571        pslld   xmm5, 25
1572        por     xmm5, xmm8
1573        movdqa  xmm8, xmm6
1574        psrld   xmm8, 7
1575        pslld   xmm6, 25
1576        por     xmm6, xmm8
1577        movdqa  xmm8, xmm7
1578        psrld   xmm8, 7
1579        pslld   xmm7, 25
1580        por     xmm7, xmm8
1581        movdqa  xmm8, xmm4
1582        psrld   xmm8, 7
1583        pslld   xmm4, 25
1584        por     xmm4, xmm8
1585        pxor    xmm4, xmm12
1586        pxor    xmm5, xmm13
1587        pxor    xmm6, xmm14
1588        pxor    xmm7, xmm15
1589        mov     eax, r13d
1590        jne     9b
1591        movdqa  xmm9, xmm0
1592        punpckldq xmm0, xmm1
1593        punpckhdq xmm9, xmm1
1594        movdqa  xmm11, xmm2
1595        punpckldq xmm2, xmm3
1596        punpckhdq xmm11, xmm3
1597        movdqa  xmm1, xmm0
1598        punpcklqdq xmm0, xmm2
1599        punpckhqdq xmm1, xmm2
1600        movdqa  xmm3, xmm9
1601        punpcklqdq xmm9, xmm11
1602        punpckhqdq xmm3, xmm11
1603        movdqu  xmmword ptr [rbx], xmm0
1604        movdqu  xmmword ptr [rbx+0x20], xmm1
1605        movdqu  xmmword ptr [rbx+0x40], xmm9
1606        movdqu  xmmword ptr [rbx+0x60], xmm3
1607        movdqa  xmm9, xmm4
1608        punpckldq xmm4, xmm5
1609        punpckhdq xmm9, xmm5
1610        movdqa  xmm11, xmm6
1611        punpckldq xmm6, xmm7
1612        punpckhdq xmm11, xmm7
1613        movdqa  xmm5, xmm4
1614        punpcklqdq xmm4, xmm6
1615        punpckhqdq xmm5, xmm6
1616        movdqa  xmm7, xmm9
1617        punpcklqdq xmm9, xmm11
1618        punpckhqdq xmm7, xmm11
1619        movdqu  xmmword ptr [rbx+0x10], xmm4
1620        movdqu  xmmword ptr [rbx+0x30], xmm5
1621        movdqu  xmmword ptr [rbx+0x50], xmm9
1622        movdqu  xmmword ptr [rbx+0x70], xmm7
1623        movdqa  xmm1, xmmword ptr [rsp+0x110]
1624        movdqa  xmm0, xmm1
1625        paddd   xmm1, xmmword ptr [rsp+0x150]
1626        movdqa  xmmword ptr [rsp+0x110], xmm1
1627        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1628        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1629        pcmpgtd xmm0, xmm1
1630        movdqa  xmm1, xmmword ptr [rsp+0x120]
1631        psubd   xmm1, xmm0
1632        movdqa  xmmword ptr [rsp+0x120], xmm1
1633        add     rbx, 128
1634        add     rdi, 32
1635        sub     rsi, 4
1636        cmp     rsi, 4
1637        jnc     2b
1638        test    rsi, rsi
1639        jne     3f
# Shared exit path (local label 4): undo the prologue and return to the caller.
# Reload xmm6-xmm15 from the aligned stack slots rsp+0x170 .. rsp+0x200, the same
# slots the prologue stored them into.
16404:
1641        movdqa  xmm6, xmmword ptr [rsp+0x170]
1642        movdqa  xmm7, xmmword ptr [rsp+0x180]
1643        movdqa  xmm8, xmmword ptr [rsp+0x190]
1644        movdqa  xmm9, xmmword ptr [rsp+0x1A0]
1645        movdqa  xmm10, xmmword ptr [rsp+0x1B0]
1646        movdqa  xmm11, xmmword ptr [rsp+0x1C0]
1647        movdqa  xmm12, xmmword ptr [rsp+0x1D0]
1648        movdqa  xmm13, xmmword ptr [rsp+0x1E0]
1649        movdqa  xmm14, xmmword ptr [rsp+0x1F0]
1650        movdqa  xmm15, xmmword ptr [rsp+0x200]
        # rbp was loaded with rsp right after the prologue pushes, so this single move
        # discards the whole 64-byte-aligned scratch area.
1651        mov     rsp, rbp
        # Pop the general-purpose registers in the reverse of the prologue push order
        # (pushed as r15, r14, r13, r12, rsi, rdi, rbx, rbp).
1652        pop     rbp
1653        pop     rbx
1654        pop     rdi
1655        pop     rsi
1656        pop     r12
1657        pop     r13
1658        pop     r14
1659        pop     r15
1660        ret
# Two-input path (local label 3). Taken when bit 1 of esi is set; otherwise control
# skips to the next label 3, which tests bit 0.
# Two states are run side by side with the same instruction stream:
#   state A: rows in xmm0-xmm3,  message vectors in xmm4-xmm7,   input pointer r8
#   state B: rows in xmm8-xmm11, message vectors in xmm12-xmm15, input pointer r9
# Registers r12d, r13d, r15 and rbx are set outside this view; the comments below only
# describe how they are used here.
1661.p2align 5
16623:
1663        test    esi, 0x2
1664        je      3f
        # Rows 0-1 of both states start from the same 32 bytes at [rcx]
        # (rcx is the 4th argument per the prologue; presumably the key - confirm).
1665        movups  xmm0, xmmword ptr [rcx]
1666        movups  xmm1, xmmword ptr [rcx+0x10]
1667        movaps  xmm8, xmm0
1668        movaps  xmm9, xmm1
        # Pair lane 0 of the vectors at rsp+0x110 / rsp+0x120 into (low, high) dwords
        # for state A and park the pair at rsp; do the same with lane 1 for state B
        # at rsp+0x10. Per the prologue these vectors hold the per-lane low and high
        # halves derived from the 64-bit value at [rbp+0x68] (presumably the counter).
1669        movd    xmm13, dword ptr [rsp+0x110]
1670        movd    xmm14, dword ptr [rsp+0x120]
1671        punpckldq xmm13, xmm14
1672        movaps  xmmword ptr [rsp], xmm13
1673        movd    xmm14, dword ptr [rsp+0x114]
1674        movd    xmm13, dword ptr [rsp+0x124]
1675        punpckldq xmm14, xmm13
1676        movaps  xmmword ptr [rsp+0x10], xmm14
        # Fetch the two input pointers from the pointer array at rdi.
1677        mov     r8, qword ptr [rdi]
1678        mov     r9, qword ptr [rdi+0x8]
        # Flag bits for the first block: byte at [rbp+0x80] OR r13d. rdx = byte offset.
1679        movzx   eax, byte ptr [rbp+0x80]
1680        or      eax, r13d
1681        xor     edx, edx
        # Per-block loop (local label 2): one 64-byte block of each input per pass.
16822:
        # Tentatively OR in the r12d bits, then keep them only on the pass where the
        # advanced offset rdx equals r15; cmovne restores the saved value otherwise.
1683        mov     r14d, eax
1684        or      eax, r12d
1685        add     rdx, 64
1686        cmp     rdx, r15
1687        cmovne  eax, r14d
        # Row 2 of both states is the BLAKE3_IV constant vector.
1688        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1689        movaps  xmm10, xmm2
        # Load the 64-byte block of state A and deinterleave it: shufps 136 keeps the
        # even-indexed dwords, shufps 221 the odd-indexed ones. The second pair of
        # vectors is additionally rotated by one lane with pshufd 0x93.
1690        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1691        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1692        movaps  xmm3, xmm4
1693        shufps  xmm4, xmm5, 136
1694        shufps  xmm3, xmm5, 221
1695        movaps  xmm5, xmm3
1696        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1697        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1698        movaps  xmm3, xmm6
1699        shufps  xmm6, xmm7, 136
1700        pshufd  xmm6, xmm6, 0x93
1701        shufps  xmm3, xmm7, 221
1702        pshufd  xmm7, xmm3, 0x93
        # Same load and deinterleave for state B, using xmm11 as the temporary.
1703        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1704        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1705        movaps  xmm11, xmm12
1706        shufps  xmm12, xmm13, 136
1707        shufps  xmm11, xmm13, 221
1708        movaps  xmm13, xmm11
1709        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1710        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1711        movaps  xmm11, xmm14
1712        shufps  xmm14, xmm15, 136
1713        pshufd  xmm14, xmm14, 0x93
1714        shufps  xmm11, xmm15, 221
1715        pshufd  xmm15, xmm11, 0x93
        # Row 3 of each state: low qword = the (low, high) pair parked at rsp / rsp+0x10,
        # high qword = (64, flag bits), built in rax as (eax << 32) | 64 and staged
        # through rsp+0x20 (64 is presumably the block length in bytes).
1716        shl     rax, 0x20
1717        or      rax, 0x40
1718        movq    xmm3, rax
1719        movdqa  xmmword ptr [rsp+0x20], xmm3
1720        movaps  xmm3, xmmword ptr [rsp]
1721        movaps  xmm11, xmmword ptr [rsp+0x10]
1722        punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1723        punpcklqdq xmm11, xmmword ptr [rsp+0x20]
        # al counts the remaining rounds: the body at label 9 executes seven times.
1724        mov     al, 7
17259:
        # Round, first half, step 1: row0 += message vector + row1; row3 ^= row0.
        # xmm4 and xmm12 are saved to rsp+0x20 / rsp+0x30 first because xmm4 is reused
        # as rotate scratch below and both are needed again for the next round.
1726        paddd   xmm0, xmm4
1727        paddd   xmm8, xmm12
1728        movaps  xmmword ptr [rsp+0x20], xmm4
1729        movaps  xmmword ptr [rsp+0x30], xmm12
1730        paddd   xmm0, xmm1
1731        paddd   xmm8, xmm9
1732        pxor    xmm3, xmm0
1733        pxor    xmm11, xmm8
        # Rotate each dword of row3 by 16 bits (swap the 16-bit halves).
1734        pshuflw xmm3, xmm3, 0xB1
1735        pshufhw xmm3, xmm3, 0xB1
1736        pshuflw xmm11, xmm11, 0xB1
1737        pshufhw xmm11, xmm11, 0xB1
        # row2 += row3; row1 ^= row2; rotate row1 right by 12 (shl 20 | shr 12).
1738        paddd   xmm2, xmm3
1739        paddd   xmm10, xmm11
1740        pxor    xmm1, xmm2
1741        pxor    xmm9, xmm10
1742        movdqa  xmm4, xmm1
1743        pslld   xmm1, 20
1744        psrld   xmm4, 12
1745        por     xmm1, xmm4
1746        movdqa  xmm4, xmm9
1747        pslld   xmm9, 20
1748        psrld   xmm4, 12
1749        por     xmm9, xmm4
        # Step 2: row0 += second message vector + row1; row3 ^= row0.
        # xmm5 and xmm13 are saved to rsp+0x40 / rsp+0x50 (xmm13 becomes scratch).
1750        paddd   xmm0, xmm5
1751        paddd   xmm8, xmm13
1752        movaps  xmmword ptr [rsp+0x40], xmm5
1753        movaps  xmmword ptr [rsp+0x50], xmm13
1754        paddd   xmm0, xmm1
1755        paddd   xmm8, xmm9
1756        pxor    xmm3, xmm0
1757        pxor    xmm11, xmm8
        # Rotate row3 right by 8 (shr 8 ^ shl 24).
1758        movdqa  xmm13, xmm3
1759        psrld   xmm3, 8
1760        pslld   xmm13, 24
1761        pxor    xmm3, xmm13
1762        movdqa  xmm13, xmm11
1763        psrld   xmm11, 8
1764        pslld   xmm13, 24
1765        pxor    xmm11, xmm13
        # row2 += row3; row1 ^= row2; rotate row1 right by 7 (shl 25 | shr 7).
1766        paddd   xmm2, xmm3
1767        paddd   xmm10, xmm11
1768        pxor    xmm1, xmm2
1769        pxor    xmm9, xmm10
1770        movdqa  xmm4, xmm1
1771        pslld   xmm1, 25
1772        psrld   xmm4, 7
1773        por     xmm1, xmm4
1774        movdqa  xmm4, xmm9
1775        pslld   xmm9, 25
1776        psrld   xmm4, 7
1777        por     xmm9, xmm4
        # Rotate the lanes of rows 0, 3 and 2 (row 1 stays put) so the second half of
        # the round mixes a different grouping of the state words.
1778        pshufd  xmm0, xmm0, 0x93
1779        pshufd  xmm8, xmm8, 0x93
1780        pshufd  xmm3, xmm3, 0x4E
1781        pshufd  xmm11, xmm11, 0x4E
1782        pshufd  xmm2, xmm2, 0x39
1783        pshufd  xmm10, xmm10, 0x39
        # Round, second half, step 1: same mixing with message vectors xmm6 / xmm14.
1784        paddd   xmm0, xmm6
1785        paddd   xmm8, xmm14
1786        paddd   xmm0, xmm1
1787        paddd   xmm8, xmm9
1788        pxor    xmm3, xmm0
1789        pxor    xmm11, xmm8
1790        pshuflw xmm3, xmm3, 0xB1
1791        pshufhw xmm3, xmm3, 0xB1
1792        pshuflw xmm11, xmm11, 0xB1
1793        pshufhw xmm11, xmm11, 0xB1
1794        paddd   xmm2, xmm3
1795        paddd   xmm10, xmm11
1796        pxor    xmm1, xmm2
1797        pxor    xmm9, xmm10
1798        movdqa  xmm4, xmm1
1799        pslld   xmm1, 20
1800        psrld   xmm4, 12
1801        por     xmm1, xmm4
1802        movdqa  xmm4, xmm9
1803        pslld   xmm9, 20
1804        psrld   xmm4, 12
1805        por     xmm9, xmm4
        # Step 2 of the second half: message vectors xmm7 / xmm15.
1806        paddd   xmm0, xmm7
1807        paddd   xmm8, xmm15
1808        paddd   xmm0, xmm1
1809        paddd   xmm8, xmm9
1810        pxor    xmm3, xmm0
1811        pxor    xmm11, xmm8
1812        movdqa  xmm13, xmm3
1813        psrld   xmm3, 8
1814        pslld   xmm13, 24
1815        pxor    xmm3, xmm13
1816        movdqa  xmm13, xmm11
1817        psrld   xmm11, 8
1818        pslld   xmm13, 24
1819        pxor    xmm11, xmm13
1820        paddd   xmm2, xmm3
1821        paddd   xmm10, xmm11
1822        pxor    xmm1, xmm2
1823        pxor    xmm9, xmm10
1824        movdqa  xmm4, xmm1
1825        pslld   xmm1, 25
1826        psrld   xmm4, 7
1827        por     xmm1, xmm4
1828        movdqa  xmm4, xmm9
1829        pslld   xmm9, 25
1830        psrld   xmm4, 7
1831        por     xmm9, xmm4
        # Undo the lane rotation applied before the second half (0x39 reverses 0x93).
1832        pshufd  xmm0, xmm0, 0x39
1833        pshufd  xmm8, xmm8, 0x39
1834        pshufd  xmm3, xmm3, 0x4E
1835        pshufd  xmm11, xmm11, 0x4E
1836        pshufd  xmm2, xmm2, 0x93
1837        pshufd  xmm10, xmm10, 0x93
        # Round counter; after the seventh round leave the loop without permuting.
1838        dec     al
1839        je      9f
        # Rearrange the message vectors of state A for the next round. The saved
        # xmm4 / xmm5 copies come back from rsp+0x20 / rsp+0x40; blends are done as
        # AND with a PBLENDW_* mask pair followed by OR (masks are defined elsewhere in
        # the file; presumably an SSE2 stand-in for pblendw - confirm).
        # Results: xmm4 and xmm7 directly, the other two parked at rsp+0x20 / rsp+0x40.
1840        movdqa  xmm12, xmmword ptr [rsp+0x20]
1841        movdqa  xmm5, xmmword ptr [rsp+0x40]
1842        pshufd  xmm13, xmm12, 0x0F
1843        shufps  xmm12, xmm5, 214
1844        pshufd  xmm4, xmm12, 0x39
1845        movdqa  xmm12, xmm6
1846        shufps  xmm12, xmm7, 250
1847        pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1848        pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1849        por     xmm13, xmm12
1850        movdqa  xmmword ptr [rsp+0x20], xmm13
1851        movdqa  xmm12, xmm7
1852        punpcklqdq xmm12, xmm5
1853        movdqa  xmm13, xmm6
1854        pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1855        pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1856        por     xmm12, xmm13
1857        pshufd  xmm12, xmm12, 0x78
1858        punpckhdq xmm5, xmm7
1859        punpckldq xmm6, xmm5
1860        pshufd  xmm7, xmm6, 0x1E
1861        movdqa  xmmword ptr [rsp+0x40], xmm12
        # Same rearrangement for state B, from the copies at rsp+0x30 / rsp+0x50.
        # xmm5 and xmm6 serve as temporaries; xmm2 (a live state row) is borrowed as
        # extra scratch and therefore spilled to rsp+0x30 and reloaded.
1862        movdqa  xmm5, xmmword ptr [rsp+0x30]
1863        movdqa  xmm13, xmmword ptr [rsp+0x50]
1864        pshufd  xmm6, xmm5, 0x0F
1865        shufps  xmm5, xmm13, 214
1866        pshufd  xmm12, xmm5, 0x39
1867        movdqa  xmm5, xmm14
1868        shufps  xmm5, xmm15, 250
1869        pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1870        pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1871        por     xmm6, xmm5
1872        movdqa  xmm5, xmm15
1873        punpcklqdq xmm5, xmm13
1874        movdqa  xmmword ptr [rsp+0x30], xmm2
1875        movdqa  xmm2, xmm14
1876        pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1877        pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1878        por     xmm5, xmm2
1879        movdqa  xmm2, xmmword ptr [rsp+0x30]
1880        pshufd  xmm5, xmm5, 0x78
1881        punpckhdq xmm13, xmm15
1882        punpckldq xmm14, xmm13
1883        pshufd  xmm15, xmm14, 0x1E
        # Move the state B results into xmm13 / xmm14, then reload the parked state A
        # vectors into xmm5 / xmm6, and run the next round.
1884        movdqa  xmm13, xmm6
1885        movdqa  xmm14, xmm5
1886        movdqa  xmm5, xmmword ptr [rsp+0x20]
1887        movdqa  xmm6, xmmword ptr [rsp+0x40]
1888        jmp     9b
        # All seven rounds done: fold rows 2-3 into rows 0-1, giving the 32-byte result
        # of this block in xmm0:xmm1 (state A) and xmm8:xmm9 (state B).
18899:
1890        pxor    xmm0, xmm2
1891        pxor    xmm1, xmm3
1892        pxor    xmm8, xmm10
1893        pxor    xmm9, xmm11
        # Later blocks start from the r13d bits only. The offset comparison is redone
        # because dec al overwrote the flags from the top of the loop.
1894        mov     eax, r13d
1895        cmp     rdx, r15
1896        jne     2b
        # Write the two 32-byte results back to back at rbx.
1897        movups  xmmword ptr [rbx], xmm0
1898        movups  xmmword ptr [rbx+0x10], xmm1
1899        movups  xmmword ptr [rbx+0x20], xmm8
1900        movups  xmmword ptr [rbx+0x30], xmm9
        # rsp+0x130 holds the negated byte from [rbp+0x70] (see prologue), so after neg
        # eax is that byte again (assumed to be 0 or 1 - confirm against callers).
        # Lane 2*eax of the vectors at rsp+0x110 / rsp+0x120 is copied into lane 0:
        # with eax = 1 lane 0 takes the lane 2 value, with eax = 0 it is unchanged.
1901        mov     eax, dword ptr [rsp+0x130]
1902        neg     eax
1903        mov    r10d, dword ptr [rsp+0x110+8*rax]
1904        mov    r11d, dword ptr [rsp+0x120+8*rax]
1905        mov dword ptr [rsp+0x110], r10d
1906        mov dword ptr [rsp+0x120], r11d
        # Step past the two consumed input pointers (16 bytes) and the 64 bytes of
        # output, and account for the two inputs in rsi.
1907        add     rdi, 16
1908        add     rbx, 64
1909        sub     rsi, 2
19103:
1911        test    esi, 0x1
1912        je      4b
1913        movups  xmm0, xmmword ptr [rcx]
1914        movups  xmm1, xmmword ptr [rcx+0x10]
1915        movd    xmm13, dword ptr [rsp+0x110]
1916        movd    xmm14, dword ptr [rsp+0x120]
1917        punpckldq xmm13, xmm14
1918        mov     r8, qword ptr [rdi]
1919        movzx   eax, byte ptr [rbp+0x80]
1920        or      eax, r13d
1921        xor     edx, edx
19222:
1923        mov     r14d, eax
1924        or      eax, r12d
1925        add     rdx, 64
1926        cmp     rdx, r15
1927        cmovne  eax, r14d
1928        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1929        shl     rax, 32
1930        or      rax, 64
1931        movq    xmm12, rax
1932        movdqa  xmm3, xmm13
1933        punpcklqdq xmm3, xmm12
1934        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1935        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1936        movaps  xmm8, xmm4
1937        shufps  xmm4, xmm5, 136
1938        shufps  xmm8, xmm5, 221
1939        movaps  xmm5, xmm8
1940        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1941        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1942        movaps  xmm8, xmm6
1943        shufps  xmm6, xmm7, 136
1944        pshufd  xmm6, xmm6, 0x93
1945        shufps  xmm8, xmm7, 221
1946        pshufd  xmm7, xmm8, 0x93
1947        mov     al, 7
19489:
1949        paddd   xmm0, xmm4
1950        paddd   xmm0, xmm1
1951        pxor    xmm3, xmm0
1952        pshuflw xmm3, xmm3, 0xB1
1953        pshufhw xmm3, xmm3, 0xB1
1954        paddd   xmm2, xmm3
1955        pxor    xmm1, xmm2
1956        movdqa  xmm11, xmm1
1957        pslld   xmm1, 20
1958        psrld   xmm11, 12
1959        por     xmm1, xmm11
1960        paddd   xmm0, xmm5
1961        paddd   xmm0, xmm1
1962        pxor    xmm3, xmm0
1963        movdqa  xmm14, xmm3
1964        psrld   xmm3, 8
1965        pslld   xmm14, 24
1966        pxor    xmm3, xmm14
1967        paddd   xmm2, xmm3
1968        pxor    xmm1, xmm2
1969        movdqa  xmm11, xmm1
1970        pslld   xmm1, 25
1971        psrld   xmm11, 7
1972        por     xmm1, xmm11
1973        pshufd  xmm0, xmm0, 0x93
1974        pshufd  xmm3, xmm3, 0x4E
1975        pshufd  xmm2, xmm2, 0x39
1976        paddd   xmm0, xmm6
1977        paddd   xmm0, xmm1
1978        pxor    xmm3, xmm0
1979        pshuflw xmm3, xmm3, 0xB1
1980        pshufhw xmm3, xmm3, 0xB1
1981        paddd   xmm2, xmm3
1982        pxor    xmm1, xmm2
1983        movdqa  xmm11, xmm1
1984        pslld   xmm1, 20
1985        psrld   xmm11, 12
1986        por     xmm1, xmm11
1987        paddd   xmm0, xmm7
1988        paddd   xmm0, xmm1
1989        pxor    xmm3, xmm0
1990        movdqa  xmm14, xmm3
1991        psrld   xmm3, 8
1992        pslld   xmm14, 24
1993        pxor    xmm3, xmm14
1994        paddd   xmm2, xmm3
1995        pxor    xmm1, xmm2
1996        movdqa  xmm11, xmm1
1997        pslld   xmm1, 25
1998        psrld   xmm11, 7
1999        por     xmm1, xmm11
2000        pshufd  xmm0, xmm0, 0x39
2001        pshufd  xmm3, xmm3, 0x4E
2002        pshufd  xmm2, xmm2, 0x93
2003        dec     al
2004        jz      9f
2005        movdqa  xmm8, xmm4
2006        shufps  xmm8, xmm5, 214
2007        pshufd  xmm9, xmm4, 0x0F
2008        pshufd  xmm4, xmm8, 0x39
2009        movdqa  xmm8, xmm6
2010        shufps  xmm8, xmm7, 250
2011        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2012        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2013        por     xmm9, xmm8
2014        movdqa  xmm8, xmm7
2015        punpcklqdq xmm8, xmm5
2016        movdqa  xmm10, xmm6
2017        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2018        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2019        por     xmm8, xmm10
2020        pshufd  xmm8, xmm8, 0x78
2021        punpckhdq xmm5, xmm7
2022        punpckldq xmm6, xmm5
2023        pshufd  xmm7, xmm6, 0x1E
2024        movdqa  xmm5, xmm9
2025        movdqa  xmm6, xmm8
2026        jmp     9b
20279:
2028        pxor    xmm0, xmm2
2029        pxor    xmm1, xmm3
2030        mov     eax, r13d
2031        cmp     rdx, r15
2032        jne     2b
2033        movups  xmmword ptr [rbx], xmm0
2034        movups  xmmword ptr [rbx+0x10], xmm1
2035        jmp     4b
2036
.p2align 6
# -----------------------------------------------------------------------
# blake3_compress_in_place_sse2 / _blake3_compress_in_place_sse2
#
# Runs the 7-round BLAKE3 compression over one 64-byte block using SSE2
# only, and overwrites the 32-byte chaining value with the result.
#
# ABI:   Microsoft x64; GNU as, .intel_syntax noprefix.
# In:    rcx        -> 32-byte chaining value, read then written (unaligned ok)
#        rdx        -> 64-byte message block, read only (unaligned ok)
#        r8b        =  block length (only the low byte is used)
#        r9         =  64-bit counter
#        [rsp+0x28] =  flags byte, as seen at entry (fifth argument)
# Out:   [rcx .. rcx+31] = new chaining value; no register result
# Clobb: rax, r8, xmm0-xmm5, flags
# Stack: 120 bytes = seven 16-byte spill slots + 8 bytes of padding.
#        rsp is 8 mod 16 at entry, so it is 16-byte aligned after the
#        adjustment, which the movdqa spills and reloads depend on.
# Note:  xmm15 is spilled and reloaded although no instruction in this
#        routine touches it.
# -----------------------------------------------------------------------
blake3_compress_in_place_sse2:
_blake3_compress_in_place_sse2:
        # Prologue: reserve the frame and spill the callee-saved xmm registers.
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+0x10], xmm7
        movdqa  xmmword ptr [rsp+0x20], xmm8
        movdqa  xmmword ptr [rsp+0x30], xmm9
        movdqa  xmmword ptr [rsp+0x40], xmm11
        movdqa  xmmword ptr [rsp+0x50], xmm14
        movdqa  xmmword ptr [rsp+0x60], xmm15

        # Build the 4x4 state, one row per register:
        #   xmm0 = cv[0..3]    xmm1 = cv[4..7]    xmm2 = BLAKE3_IV
        #   xmm3 = { counter_lo, counter_hi, block_len, flags }
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        movzx   eax, byte ptr [rsp+0xA0]          # flags: entry [rsp+0x28] + 120
        movzx   r8d, r8b                          # drop garbage above block_len
        shl     rax, 32
        add     r8, rax                           # r8 = block_len | flags << 32
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4

        # Load the 16 message words and split them into even/odd streams.
        # The second pair is pre-rotated by one lane for the diagonal step:
        #   xmm4 = { m0,  m2, m4,  m6  }    xmm5 = { m1,  m3, m5,  m7  }
        #   xmm6 = { m14, m8, m10, m12 }    xmm7 = { m15, m9, m11, m13 }
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136                   # 0x88: even-indexed words
        shufps  xmm8, xmm5, 221                   # 0xDD: odd-indexed words
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+0x20]
        movups  xmm7, xmmword ptr [rdx+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        mov     al, 7                             # round counter; only al is live
9:
        # --- Column step, first half: a += mx + b; d = rotr(d ^ a, 16);
        #     c += d; b = rotr(b ^ c, 12).  Rows: a=xmm0 b=xmm1 c=xmm2 d=xmm3.
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                  # swap 16-bit halves = rotate 16
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        # --- Column step, second half: a += my + b; d = rotr(d ^ a, 8);
        #     c += d; b = rotr(b ^ c, 7).
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                       # halves are disjoint: xor acts as or
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        # Rotate rows a, d, c by 1, 2, 3 lanes so diagonals line up as columns.
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        # --- Diagonal step: same two half-steps with message rows xmm6, xmm7.
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        # Undo the lane rotation (inverse shuffles of the ones above).
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                                # all 7 rounds done
        # --- Permute the message rows xmm4-xmm7 for the next round.
        #     Each pand/pand/por triple is a two-source blend with
        #     complementary masks (SSE2 has no pblendw instruction).
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm14, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm14
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        # Feed-forward: new cv = (row0 ^ row2, row1 ^ row3), stored in place.
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rcx], xmm0
        movups  xmmword ptr [rcx+0x10], xmm1
        # Epilogue: reload the callee-saved xmm registers and release the frame.
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+0x10]
        movdqa  xmm8, xmmword ptr [rsp+0x20]
        movdqa  xmm9, xmmword ptr [rsp+0x30]
        movdqa  xmm11, xmmword ptr [rsp+0x40]
        movdqa  xmm14, xmmword ptr [rsp+0x50]
        movdqa  xmm15, xmmword ptr [rsp+0x60]
        add     rsp, 120
        ret


.p2align 6
# -----------------------------------------------------------------------
# _blake3_compress_xof_sse2 / blake3_compress_xof_sse2
#
# Runs the 7-round BLAKE3 compression over one 64-byte block using SSE2
# only, and writes the full 64-byte extended output to a separate buffer.
# The chaining value is read twice but never written.
#
# ABI:   Microsoft x64; GNU as, .intel_syntax noprefix.
# In:    rcx        -> 32-byte chaining value, read only (unaligned ok)
#        rdx        -> 64-byte message block, read only (unaligned ok)
#        r8b        =  block length (only the low byte is used)
#        r9         =  64-bit counter
#        [rsp+0x28] =  flags byte, as seen at entry (fifth argument)
#        [rsp+0x30] -> 64-byte output buffer, as seen at entry (sixth
#                      argument; unaligned ok)
# Out:   out[ 0..31] = (row0 ^ row2, row1 ^ row3)
#        out[32..63] = (row2 ^ cv[0..3], row3 ^ cv[4..7])
# Clobb: rax, r8, r10, xmm0-xmm5, flags
# Stack: 120 bytes = seven 16-byte spill slots + 8 bytes of padding.
#        rsp is 8 mod 16 at entry, so it is 16-byte aligned after the
#        adjustment, which the movdqa spills and reloads depend on.
# Note:  xmm15 is spilled and reloaded although no instruction in this
#        routine touches it.
# -----------------------------------------------------------------------
_blake3_compress_xof_sse2:
blake3_compress_xof_sse2:
        # Prologue: reserve the frame and spill the callee-saved xmm registers.
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+0x10], xmm7
        movdqa  xmmword ptr [rsp+0x20], xmm8
        movdqa  xmmword ptr [rsp+0x30], xmm9
        movdqa  xmmword ptr [rsp+0x40], xmm11
        movdqa  xmmword ptr [rsp+0x50], xmm14
        movdqa  xmmword ptr [rsp+0x60], xmm15

        # Build the 4x4 state, one row per register:
        #   xmm0 = cv[0..3]    xmm1 = cv[4..7]    xmm2 = BLAKE3_IV
        #   xmm3 = { counter_lo, counter_hi, block_len, flags }
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        movzx   eax, byte ptr [rsp+0xA0]          # flags: entry [rsp+0x28] + 120
        movzx   r8d, r8b                          # drop garbage above block_len
        mov     r10, qword ptr [rsp+0xA8]         # r10 = out: entry [rsp+0x30] + 120
        shl     rax, 32
        add     r8, rax                           # r8 = block_len | flags << 32
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4

        # Load the 16 message words and split them into even/odd streams.
        # The second pair is pre-rotated by one lane for the diagonal step:
        #   xmm4 = { m0,  m2, m4,  m6  }    xmm5 = { m1,  m3, m5,  m7  }
        #   xmm6 = { m14, m8, m10, m12 }    xmm7 = { m15, m9, m11, m13 }
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136                   # 0x88: even-indexed words
        shufps  xmm8, xmm5, 221                   # 0xDD: odd-indexed words
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+0x20]
        movups  xmm7, xmmword ptr [rdx+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        mov     al, 7                             # round counter; only al is live
9:
        # --- Column step, first half: a += mx + b; d = rotr(d ^ a, 16);
        #     c += d; b = rotr(b ^ c, 12).  Rows: a=xmm0 b=xmm1 c=xmm2 d=xmm3.
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                  # swap 16-bit halves = rotate 16
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        # --- Column step, second half: a += my + b; d = rotr(d ^ a, 8);
        #     c += d; b = rotr(b ^ c, 7).
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                       # halves are disjoint: xor acts as or
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        # Rotate rows a, d, c by 1, 2, 3 lanes so diagonals line up as columns.
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        # --- Diagonal step: same two half-steps with message rows xmm6, xmm7.
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        # Undo the lane rotation (inverse shuffles of the ones above).
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                                # all 7 rounds done
        # --- Permute the message rows xmm4-xmm7 for the next round.
        #     Each pand/pand/por triple is a two-source blend with
        #     complementary masks (SSE2 has no pblendw instruction).
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm14, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm14
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        # Finalization. The message rows are dead now, so xmm4/xmm5 are
        # reused to reload the input chaining value. The two xors into
        # xmm0/xmm1 must run before xmm2/xmm3 are overwritten.
        movdqu  xmm4, xmmword ptr [rcx]
        movdqu  xmm5, xmmword ptr [rcx+0x10]
        pxor    xmm0, xmm2                        # out[ 0..15] = row0 ^ row2
        pxor    xmm1, xmm3                        # out[16..31] = row1 ^ row3
        pxor    xmm2, xmm4                        # out[32..47] = row2 ^ cv[0..3]
        pxor    xmm3, xmm5                        # out[48..63] = row3 ^ cv[4..7]
        movups  xmmword ptr [r10], xmm0
        movups  xmmword ptr [r10+0x10], xmm1
        movups  xmmword ptr [r10+0x20], xmm2
        movups  xmmword ptr [r10+0x30], xmm3
        # Epilogue: reload the callee-saved xmm registers and release the frame.
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+0x10]
        movdqa  xmm8, xmmword ptr [rsp+0x20]
        movdqa  xmm9, xmmword ptr [rsp+0x30]
        movdqa  xmm11, xmmword ptr [rsp+0x40]
        movdqa  xmm14, xmmword ptr [rsp+0x50]
        movdqa  xmm15, xmmword ptr [rsp+0x60]
        add     rsp, 120
        ret


.section .rodata
.p2align  6
# Read-only constant tables of 32-bit words. The section is aligned to
# 64 bytes and every table below is exactly 16 bytes, so each label stays
# 16-byte aligned. The movaps/pand/pxor memory operands that reference
# these labels fault on a misaligned address, so keep every table a
# multiple of 16 bytes when editing.

# Four IV words; the compress routines load them as state row 2.
BLAKE3_IV:
        .long  0x6A09E667, 0xBB67AE85
        .long  0x3C6EF372, 0xA54FF53A
# Per-lane offsets 0..3; blake3_hash_many_sse2 masks them and adds them
# to the broadcast low counter word.
ADD0:
        .long  0, 1, 2, 3
# Constant 4 in every lane; blake3_hash_many_sse2 masks it and keeps it
# on its stack frame (presumably the counter stride per four-lane pass;
# confirm against the part of the routine that consumes it).
ADD1:
        .long  4, 4, 4, 4
# Each of the four IV words broadcast to all lanes. Not referenced by the
# compress routines; presumably used by the multi-lane hashing path.
BLAKE3_IV_0:
        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
# Block length 64 in every lane.
BLAKE3_BLOCK_LEN:
        .long  64, 64, 64, 64
# Sign bit of every lane. Xoring both operands with it before pcmpgtd
# turns the signed compare into an unsigned one (used by
# blake3_hash_many_sse2 to detect carry out of the low counter word).
CMP_MSB_MASK:
        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
# Select masks for pand/pand/por blends. The hex value in each name is
# the equivalent pblendw immediate (one bit per 16-bit word), so 0x33
# keeps dwords 0 and 2, 0xCC dwords 1 and 3, 0x3F dwords 0-2 and 0xC0
# dword 3. 0x33/0xCC and 0x3F/0xC0 are complementary pairs.
PBLENDW_0x33_MASK:
        .long  0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:
        .long  0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
PBLENDW_0x3F_MASK:
        .long  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:
        .long  0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
