1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
24 * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
25 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
26 */
27
28#if defined(HAVE_SSE2)
29
30#define _ASM
31#include <sys/asm_linkage.h>
32
33.intel_syntax noprefix
34
35SECTION_TEXT
36
37ENTRY_ALIGN(zfs_blake3_hash_many_sse2, 64)
38        ENDBR
39        push    r15
40        push    r14
41        push    r13
42        push    r12
43        push    rbx
44        push    rbp
45        mov     rbp, rsp
46        sub     rsp, 360
47        and     rsp, 0xFFFFFFFFFFFFFFC0
48        neg     r9d
49        movd    xmm0, r9d
50        pshufd  xmm0, xmm0, 0x00
51        movdqa  xmmword ptr [rsp+0x130], xmm0
52        movdqa  xmm1, xmm0
53        pand    xmm1, xmmword ptr [ADD0+rip]
54        pand    xmm0, xmmword ptr [ADD1+rip]
55        movdqa  xmmword ptr [rsp+0x150], xmm0
56        movd    xmm0, r8d
57        pshufd  xmm0, xmm0, 0x00
58        paddd   xmm0, xmm1
59        movdqa  xmmword ptr [rsp+0x110], xmm0
60        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
61        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
62        pcmpgtd xmm1, xmm0
63        shr     r8, 32
64        movd    xmm2, r8d
65        pshufd  xmm2, xmm2, 0x00
66        psubd   xmm2, xmm1
67        movdqa  xmmword ptr [rsp+0x120], xmm2
68        mov     rbx, qword ptr [rbp+0x50]
69        mov     r15, rdx
70        shl     r15, 6
71        movzx   r13d, byte ptr [rbp+0x38]
72        movzx   r12d, byte ptr [rbp+0x48]
73        cmp     rsi, 4
74        jc      3f
752:
76        movdqu  xmm3, xmmword ptr [rcx]
77        pshufd  xmm0, xmm3, 0x00
78        pshufd  xmm1, xmm3, 0x55
79        pshufd  xmm2, xmm3, 0xAA
80        pshufd  xmm3, xmm3, 0xFF
81        movdqu  xmm7, xmmword ptr [rcx+0x10]
82        pshufd  xmm4, xmm7, 0x00
83        pshufd  xmm5, xmm7, 0x55
84        pshufd  xmm6, xmm7, 0xAA
85        pshufd  xmm7, xmm7, 0xFF
86        mov     r8, qword ptr [rdi]
87        mov     r9, qword ptr [rdi+0x8]
88        mov     r10, qword ptr [rdi+0x10]
89        mov     r11, qword ptr [rdi+0x18]
90        movzx   eax, byte ptr [rbp+0x40]
91        or      eax, r13d
92        xor     edx, edx
939:
94        mov     r14d, eax
95        or      eax, r12d
96        add     rdx, 64
97        cmp     rdx, r15
98        cmovne  eax, r14d
99        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
100        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
101        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
102        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
103        movdqa  xmm12, xmm8
104        punpckldq xmm8, xmm9
105        punpckhdq xmm12, xmm9
106        movdqa  xmm14, xmm10
107        punpckldq xmm10, xmm11
108        punpckhdq xmm14, xmm11
109        movdqa  xmm9, xmm8
110        punpcklqdq xmm8, xmm10
111        punpckhqdq xmm9, xmm10
112        movdqa  xmm13, xmm12
113        punpcklqdq xmm12, xmm14
114        punpckhqdq xmm13, xmm14
115        movdqa  xmmword ptr [rsp], xmm8
116        movdqa  xmmword ptr [rsp+0x10], xmm9
117        movdqa  xmmword ptr [rsp+0x20], xmm12
118        movdqa  xmmword ptr [rsp+0x30], xmm13
119        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
120        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
121        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
122        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
123        movdqa  xmm12, xmm8
124        punpckldq xmm8, xmm9
125        punpckhdq xmm12, xmm9
126        movdqa  xmm14, xmm10
127        punpckldq xmm10, xmm11
128        punpckhdq xmm14, xmm11
129        movdqa  xmm9, xmm8
130        punpcklqdq xmm8, xmm10
131        punpckhqdq xmm9, xmm10
132        movdqa  xmm13, xmm12
133        punpcklqdq xmm12, xmm14
134        punpckhqdq xmm13, xmm14
135        movdqa  xmmword ptr [rsp+0x40], xmm8
136        movdqa  xmmword ptr [rsp+0x50], xmm9
137        movdqa  xmmword ptr [rsp+0x60], xmm12
138        movdqa  xmmword ptr [rsp+0x70], xmm13
139        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
140        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
141        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
142        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
143        movdqa  xmm12, xmm8
144        punpckldq xmm8, xmm9
145        punpckhdq xmm12, xmm9
146        movdqa  xmm14, xmm10
147        punpckldq xmm10, xmm11
148        punpckhdq xmm14, xmm11
149        movdqa  xmm9, xmm8
150        punpcklqdq xmm8, xmm10
151        punpckhqdq xmm9, xmm10
152        movdqa  xmm13, xmm12
153        punpcklqdq xmm12, xmm14
154        punpckhqdq xmm13, xmm14
155        movdqa  xmmword ptr [rsp+0x80], xmm8
156        movdqa  xmmword ptr [rsp+0x90], xmm9
157        movdqa  xmmword ptr [rsp+0xA0], xmm12
158        movdqa  xmmword ptr [rsp+0xB0], xmm13
159        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
160        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
161        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
162        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
163        movdqa  xmm12, xmm8
164        punpckldq xmm8, xmm9
165        punpckhdq xmm12, xmm9
166        movdqa  xmm14, xmm10
167        punpckldq xmm10, xmm11
168        punpckhdq xmm14, xmm11
169        movdqa  xmm9, xmm8
170        punpcklqdq xmm8, xmm10
171        punpckhqdq xmm9, xmm10
172        movdqa  xmm13, xmm12
173        punpcklqdq xmm12, xmm14
174        punpckhqdq xmm13, xmm14
175        movdqa  xmmword ptr [rsp+0xC0], xmm8
176        movdqa  xmmword ptr [rsp+0xD0], xmm9
177        movdqa  xmmword ptr [rsp+0xE0], xmm12
178        movdqa  xmmword ptr [rsp+0xF0], xmm13
179        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
180        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
181        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
182        movdqa  xmm12, xmmword ptr [rsp+0x110]
183        movdqa  xmm13, xmmword ptr [rsp+0x120]
184        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
185        movd    xmm15, eax
186        pshufd  xmm15, xmm15, 0x00
187        prefetcht0 [r8+rdx+0x80]
188        prefetcht0 [r9+rdx+0x80]
189        prefetcht0 [r10+rdx+0x80]
190        prefetcht0 [r11+rdx+0x80]
191        paddd   xmm0, xmmword ptr [rsp]
192        paddd   xmm1, xmmword ptr [rsp+0x20]
193        paddd   xmm2, xmmword ptr [rsp+0x40]
194        paddd   xmm3, xmmword ptr [rsp+0x60]
195        paddd   xmm0, xmm4
196        paddd   xmm1, xmm5
197        paddd   xmm2, xmm6
198        paddd   xmm3, xmm7
199        pxor    xmm12, xmm0
200        pxor    xmm13, xmm1
201        pxor    xmm14, xmm2
202        pxor    xmm15, xmm3
203        pshuflw xmm12, xmm12, 0xB1
204        pshufhw xmm12, xmm12, 0xB1
205        pshuflw xmm13, xmm13, 0xB1
206        pshufhw xmm13, xmm13, 0xB1
207        pshuflw xmm14, xmm14, 0xB1
208        pshufhw xmm14, xmm14, 0xB1
209        pshuflw xmm15, xmm15, 0xB1
210        pshufhw xmm15, xmm15, 0xB1
211        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
212        paddd   xmm8, xmm12
213        paddd   xmm9, xmm13
214        paddd   xmm10, xmm14
215        paddd   xmm11, xmm15
216        pxor    xmm4, xmm8
217        pxor    xmm5, xmm9
218        pxor    xmm6, xmm10
219        pxor    xmm7, xmm11
220        movdqa  xmmword ptr [rsp+0x100], xmm8
221        movdqa  xmm8, xmm4
222        psrld   xmm8, 12
223        pslld   xmm4, 20
224        por     xmm4, xmm8
225        movdqa  xmm8, xmm5
226        psrld   xmm8, 12
227        pslld   xmm5, 20
228        por     xmm5, xmm8
229        movdqa  xmm8, xmm6
230        psrld   xmm8, 12
231        pslld   xmm6, 20
232        por     xmm6, xmm8
233        movdqa  xmm8, xmm7
234        psrld   xmm8, 12
235        pslld   xmm7, 20
236        por     xmm7, xmm8
237        paddd   xmm0, xmmword ptr [rsp+0x10]
238        paddd   xmm1, xmmword ptr [rsp+0x30]
239        paddd   xmm2, xmmword ptr [rsp+0x50]
240        paddd   xmm3, xmmword ptr [rsp+0x70]
241        paddd   xmm0, xmm4
242        paddd   xmm1, xmm5
243        paddd   xmm2, xmm6
244        paddd   xmm3, xmm7
245        pxor    xmm12, xmm0
246        pxor    xmm13, xmm1
247        pxor    xmm14, xmm2
248        pxor    xmm15, xmm3
249        movdqa  xmm8, xmm12
250        psrld   xmm12, 8
251        pslld   xmm8, 24
252        pxor    xmm12, xmm8
253        movdqa  xmm8, xmm13
254        psrld   xmm13, 8
255        pslld   xmm8, 24
256        pxor    xmm13, xmm8
257        movdqa  xmm8, xmm14
258        psrld   xmm14, 8
259        pslld   xmm8, 24
260        pxor    xmm14, xmm8
261        movdqa  xmm8, xmm15
262        psrld   xmm15, 8
263        pslld   xmm8, 24
264        pxor    xmm15, xmm8
265        movdqa  xmm8, xmmword ptr [rsp+0x100]
266        paddd   xmm8, xmm12
267        paddd   xmm9, xmm13
268        paddd   xmm10, xmm14
269        paddd   xmm11, xmm15
270        pxor    xmm4, xmm8
271        pxor    xmm5, xmm9
272        pxor    xmm6, xmm10
273        pxor    xmm7, xmm11
274        movdqa  xmmword ptr [rsp+0x100], xmm8
275        movdqa  xmm8, xmm4
276        psrld   xmm8, 7
277        pslld   xmm4, 25
278        por     xmm4, xmm8
279        movdqa  xmm8, xmm5
280        psrld   xmm8, 7
281        pslld   xmm5, 25
282        por     xmm5, xmm8
283        movdqa  xmm8, xmm6
284        psrld   xmm8, 7
285        pslld   xmm6, 25
286        por     xmm6, xmm8
287        movdqa  xmm8, xmm7
288        psrld   xmm8, 7
289        pslld   xmm7, 25
290        por     xmm7, xmm8
291        paddd   xmm0, xmmword ptr [rsp+0x80]
292        paddd   xmm1, xmmword ptr [rsp+0xA0]
293        paddd   xmm2, xmmword ptr [rsp+0xC0]
294        paddd   xmm3, xmmword ptr [rsp+0xE0]
295        paddd   xmm0, xmm5
296        paddd   xmm1, xmm6
297        paddd   xmm2, xmm7
298        paddd   xmm3, xmm4
299        pxor    xmm15, xmm0
300        pxor    xmm12, xmm1
301        pxor    xmm13, xmm2
302        pxor    xmm14, xmm3
303        pshuflw xmm15, xmm15, 0xB1
304        pshufhw xmm15, xmm15, 0xB1
305        pshuflw xmm12, xmm12, 0xB1
306        pshufhw xmm12, xmm12, 0xB1
307        pshuflw xmm13, xmm13, 0xB1
308        pshufhw xmm13, xmm13, 0xB1
309        pshuflw xmm14, xmm14, 0xB1
310        pshufhw xmm14, xmm14, 0xB1
311        paddd   xmm10, xmm15
312        paddd   xmm11, xmm12
313        movdqa  xmm8, xmmword ptr [rsp+0x100]
314        paddd   xmm8, xmm13
315        paddd   xmm9, xmm14
316        pxor    xmm5, xmm10
317        pxor    xmm6, xmm11
318        pxor    xmm7, xmm8
319        pxor    xmm4, xmm9
320        movdqa  xmmword ptr [rsp+0x100], xmm8
321        movdqa  xmm8, xmm5
322        psrld   xmm8, 12
323        pslld   xmm5, 20
324        por     xmm5, xmm8
325        movdqa  xmm8, xmm6
326        psrld   xmm8, 12
327        pslld   xmm6, 20
328        por     xmm6, xmm8
329        movdqa  xmm8, xmm7
330        psrld   xmm8, 12
331        pslld   xmm7, 20
332        por     xmm7, xmm8
333        movdqa  xmm8, xmm4
334        psrld   xmm8, 12
335        pslld   xmm4, 20
336        por     xmm4, xmm8
337        paddd   xmm0, xmmword ptr [rsp+0x90]
338        paddd   xmm1, xmmword ptr [rsp+0xB0]
339        paddd   xmm2, xmmword ptr [rsp+0xD0]
340        paddd   xmm3, xmmword ptr [rsp+0xF0]
341        paddd   xmm0, xmm5
342        paddd   xmm1, xmm6
343        paddd   xmm2, xmm7
344        paddd   xmm3, xmm4
345        pxor    xmm15, xmm0
346        pxor    xmm12, xmm1
347        pxor    xmm13, xmm2
348        pxor    xmm14, xmm3
349        movdqa  xmm8, xmm15
350        psrld   xmm15, 8
351        pslld   xmm8, 24
352        pxor    xmm15, xmm8
353        movdqa  xmm8, xmm12
354        psrld   xmm12, 8
355        pslld   xmm8, 24
356        pxor    xmm12, xmm8
357        movdqa  xmm8, xmm13
358        psrld   xmm13, 8
359        pslld   xmm8, 24
360        pxor    xmm13, xmm8
361        movdqa  xmm8, xmm14
362        psrld   xmm14, 8
363        pslld   xmm8, 24
364        pxor    xmm14, xmm8
365        paddd   xmm10, xmm15
366        paddd   xmm11, xmm12
367        movdqa  xmm8, xmmword ptr [rsp+0x100]
368        paddd   xmm8, xmm13
369        paddd   xmm9, xmm14
370        pxor    xmm5, xmm10
371        pxor    xmm6, xmm11
372        pxor    xmm7, xmm8
373        pxor    xmm4, xmm9
374        movdqa  xmmword ptr [rsp+0x100], xmm8
375        movdqa  xmm8, xmm5
376        psrld   xmm8, 7
377        pslld   xmm5, 25
378        por     xmm5, xmm8
379        movdqa  xmm8, xmm6
380        psrld   xmm8, 7
381        pslld   xmm6, 25
382        por     xmm6, xmm8
383        movdqa  xmm8, xmm7
384        psrld   xmm8, 7
385        pslld   xmm7, 25
386        por     xmm7, xmm8
387        movdqa  xmm8, xmm4
388        psrld   xmm8, 7
389        pslld   xmm4, 25
390        por     xmm4, xmm8
391        paddd   xmm0, xmmword ptr [rsp+0x20]
392        paddd   xmm1, xmmword ptr [rsp+0x30]
393        paddd   xmm2, xmmword ptr [rsp+0x70]
394        paddd   xmm3, xmmword ptr [rsp+0x40]
395        paddd   xmm0, xmm4
396        paddd   xmm1, xmm5
397        paddd   xmm2, xmm6
398        paddd   xmm3, xmm7
399        pxor    xmm12, xmm0
400        pxor    xmm13, xmm1
401        pxor    xmm14, xmm2
402        pxor    xmm15, xmm3
403        pshuflw xmm12, xmm12, 0xB1
404        pshufhw xmm12, xmm12, 0xB1
405        pshuflw xmm13, xmm13, 0xB1
406        pshufhw xmm13, xmm13, 0xB1
407        pshuflw xmm14, xmm14, 0xB1
408        pshufhw xmm14, xmm14, 0xB1
409        pshuflw xmm15, xmm15, 0xB1
410        pshufhw xmm15, xmm15, 0xB1
411        movdqa  xmm8, xmmword ptr [rsp+0x100]
412        paddd   xmm8, xmm12
413        paddd   xmm9, xmm13
414        paddd   xmm10, xmm14
415        paddd   xmm11, xmm15
416        pxor    xmm4, xmm8
417        pxor    xmm5, xmm9
418        pxor    xmm6, xmm10
419        pxor    xmm7, xmm11
420        movdqa  xmmword ptr [rsp+0x100], xmm8
421        movdqa  xmm8, xmm4
422        psrld   xmm8, 12
423        pslld   xmm4, 20
424        por     xmm4, xmm8
425        movdqa  xmm8, xmm5
426        psrld   xmm8, 12
427        pslld   xmm5, 20
428        por     xmm5, xmm8
429        movdqa  xmm8, xmm6
430        psrld   xmm8, 12
431        pslld   xmm6, 20
432        por     xmm6, xmm8
433        movdqa  xmm8, xmm7
434        psrld   xmm8, 12
435        pslld   xmm7, 20
436        por     xmm7, xmm8
437        paddd   xmm0, xmmword ptr [rsp+0x60]
438        paddd   xmm1, xmmword ptr [rsp+0xA0]
439        paddd   xmm2, xmmword ptr [rsp]
440        paddd   xmm3, xmmword ptr [rsp+0xD0]
441        paddd   xmm0, xmm4
442        paddd   xmm1, xmm5
443        paddd   xmm2, xmm6
444        paddd   xmm3, xmm7
445        pxor    xmm12, xmm0
446        pxor    xmm13, xmm1
447        pxor    xmm14, xmm2
448        pxor    xmm15, xmm3
449        movdqa  xmm8, xmm12
450        psrld   xmm12, 8
451        pslld   xmm8, 24
452        pxor    xmm12, xmm8
453        movdqa  xmm8, xmm13
454        psrld   xmm13, 8
455        pslld   xmm8, 24
456        pxor    xmm13, xmm8
457        movdqa  xmm8, xmm14
458        psrld   xmm14, 8
459        pslld   xmm8, 24
460        pxor    xmm14, xmm8
461        movdqa  xmm8, xmm15
462        psrld   xmm15, 8
463        pslld   xmm8, 24
464        pxor    xmm15, xmm8
465        movdqa  xmm8, xmmword ptr [rsp+0x100]
466        paddd   xmm8, xmm12
467        paddd   xmm9, xmm13
468        paddd   xmm10, xmm14
469        paddd   xmm11, xmm15
470        pxor    xmm4, xmm8
471        pxor    xmm5, xmm9
472        pxor    xmm6, xmm10
473        pxor    xmm7, xmm11
474        movdqa  xmmword ptr [rsp+0x100], xmm8
475        movdqa  xmm8, xmm4
476        psrld   xmm8, 7
477        pslld   xmm4, 25
478        por     xmm4, xmm8
479        movdqa  xmm8, xmm5
480        psrld   xmm8, 7
481        pslld   xmm5, 25
482        por     xmm5, xmm8
483        movdqa  xmm8, xmm6
484        psrld   xmm8, 7
485        pslld   xmm6, 25
486        por     xmm6, xmm8
487        movdqa  xmm8, xmm7
488        psrld   xmm8, 7
489        pslld   xmm7, 25
490        por     xmm7, xmm8
491        paddd   xmm0, xmmword ptr [rsp+0x10]
492        paddd   xmm1, xmmword ptr [rsp+0xC0]
493        paddd   xmm2, xmmword ptr [rsp+0x90]
494        paddd   xmm3, xmmword ptr [rsp+0xF0]
495        paddd   xmm0, xmm5
496        paddd   xmm1, xmm6
497        paddd   xmm2, xmm7
498        paddd   xmm3, xmm4
499        pxor    xmm15, xmm0
500        pxor    xmm12, xmm1
501        pxor    xmm13, xmm2
502        pxor    xmm14, xmm3
503        pshuflw xmm15, xmm15, 0xB1
504        pshufhw xmm15, xmm15, 0xB1
505        pshuflw xmm12, xmm12, 0xB1
506        pshufhw xmm12, xmm12, 0xB1
507        pshuflw xmm13, xmm13, 0xB1
508        pshufhw xmm13, xmm13, 0xB1
509        pshuflw xmm14, xmm14, 0xB1
510        pshufhw xmm14, xmm14, 0xB1
511        paddd   xmm10, xmm15
512        paddd   xmm11, xmm12
513        movdqa  xmm8, xmmword ptr [rsp+0x100]
514        paddd   xmm8, xmm13
515        paddd   xmm9, xmm14
516        pxor    xmm5, xmm10
517        pxor    xmm6, xmm11
518        pxor    xmm7, xmm8
519        pxor    xmm4, xmm9
520        movdqa  xmmword ptr [rsp+0x100], xmm8
521        movdqa  xmm8, xmm5
522        psrld   xmm8, 12
523        pslld   xmm5, 20
524        por     xmm5, xmm8
525        movdqa  xmm8, xmm6
526        psrld   xmm8, 12
527        pslld   xmm6, 20
528        por     xmm6, xmm8
529        movdqa  xmm8, xmm7
530        psrld   xmm8, 12
531        pslld   xmm7, 20
532        por     xmm7, xmm8
533        movdqa  xmm8, xmm4
534        psrld   xmm8, 12
535        pslld   xmm4, 20
536        por     xmm4, xmm8
537        paddd   xmm0, xmmword ptr [rsp+0xB0]
538        paddd   xmm1, xmmword ptr [rsp+0x50]
539        paddd   xmm2, xmmword ptr [rsp+0xE0]
540        paddd   xmm3, xmmword ptr [rsp+0x80]
541        paddd   xmm0, xmm5
542        paddd   xmm1, xmm6
543        paddd   xmm2, xmm7
544        paddd   xmm3, xmm4
545        pxor    xmm15, xmm0
546        pxor    xmm12, xmm1
547        pxor    xmm13, xmm2
548        pxor    xmm14, xmm3
549        movdqa  xmm8, xmm15
550        psrld   xmm15, 8
551        pslld   xmm8, 24
552        pxor    xmm15, xmm8
553        movdqa  xmm8, xmm12
554        psrld   xmm12, 8
555        pslld   xmm8, 24
556        pxor    xmm12, xmm8
557        movdqa  xmm8, xmm13
558        psrld   xmm13, 8
559        pslld   xmm8, 24
560        pxor    xmm13, xmm8
561        movdqa  xmm8, xmm14
562        psrld   xmm14, 8
563        pslld   xmm8, 24
564        pxor    xmm14, xmm8
565        paddd   xmm10, xmm15
566        paddd   xmm11, xmm12
567        movdqa  xmm8, xmmword ptr [rsp+0x100]
568        paddd   xmm8, xmm13
569        paddd   xmm9, xmm14
570        pxor    xmm5, xmm10
571        pxor    xmm6, xmm11
572        pxor    xmm7, xmm8
573        pxor    xmm4, xmm9
574        movdqa  xmmword ptr [rsp+0x100], xmm8
575        movdqa  xmm8, xmm5
576        psrld   xmm8, 7
577        pslld   xmm5, 25
578        por     xmm5, xmm8
579        movdqa  xmm8, xmm6
580        psrld   xmm8, 7
581        pslld   xmm6, 25
582        por     xmm6, xmm8
583        movdqa  xmm8, xmm7
584        psrld   xmm8, 7
585        pslld   xmm7, 25
586        por     xmm7, xmm8
587        movdqa  xmm8, xmm4
588        psrld   xmm8, 7
589        pslld   xmm4, 25
590        por     xmm4, xmm8
591        paddd   xmm0, xmmword ptr [rsp+0x30]
592        paddd   xmm1, xmmword ptr [rsp+0xA0]
593        paddd   xmm2, xmmword ptr [rsp+0xD0]
594        paddd   xmm3, xmmword ptr [rsp+0x70]
595        paddd   xmm0, xmm4
596        paddd   xmm1, xmm5
597        paddd   xmm2, xmm6
598        paddd   xmm3, xmm7
599        pxor    xmm12, xmm0
600        pxor    xmm13, xmm1
601        pxor    xmm14, xmm2
602        pxor    xmm15, xmm3
603        pshuflw xmm12, xmm12, 0xB1
604        pshufhw xmm12, xmm12, 0xB1
605        pshuflw xmm13, xmm13, 0xB1
606        pshufhw xmm13, xmm13, 0xB1
607        pshuflw xmm14, xmm14, 0xB1
608        pshufhw xmm14, xmm14, 0xB1
609        pshuflw xmm15, xmm15, 0xB1
610        pshufhw xmm15, xmm15, 0xB1
611        movdqa  xmm8, xmmword ptr [rsp+0x100]
612        paddd   xmm8, xmm12
613        paddd   xmm9, xmm13
614        paddd   xmm10, xmm14
615        paddd   xmm11, xmm15
616        pxor    xmm4, xmm8
617        pxor    xmm5, xmm9
618        pxor    xmm6, xmm10
619        pxor    xmm7, xmm11
620        movdqa  xmmword ptr [rsp+0x100], xmm8
621        movdqa  xmm8, xmm4
622        psrld   xmm8, 12
623        pslld   xmm4, 20
624        por     xmm4, xmm8
625        movdqa  xmm8, xmm5
626        psrld   xmm8, 12
627        pslld   xmm5, 20
628        por     xmm5, xmm8
629        movdqa  xmm8, xmm6
630        psrld   xmm8, 12
631        pslld   xmm6, 20
632        por     xmm6, xmm8
633        movdqa  xmm8, xmm7
634        psrld   xmm8, 12
635        pslld   xmm7, 20
636        por     xmm7, xmm8
637        paddd   xmm0, xmmword ptr [rsp+0x40]
638        paddd   xmm1, xmmword ptr [rsp+0xC0]
639        paddd   xmm2, xmmword ptr [rsp+0x20]
640        paddd   xmm3, xmmword ptr [rsp+0xE0]
641        paddd   xmm0, xmm4
642        paddd   xmm1, xmm5
643        paddd   xmm2, xmm6
644        paddd   xmm3, xmm7
645        pxor    xmm12, xmm0
646        pxor    xmm13, xmm1
647        pxor    xmm14, xmm2
648        pxor    xmm15, xmm3
649        movdqa  xmm8, xmm12
650        psrld   xmm12, 8
651        pslld   xmm8, 24
652        pxor    xmm12, xmm8
653        movdqa  xmm8, xmm13
654        psrld   xmm13, 8
655        pslld   xmm8, 24
656        pxor    xmm13, xmm8
657        movdqa  xmm8, xmm14
658        psrld   xmm14, 8
659        pslld   xmm8, 24
660        pxor    xmm14, xmm8
661        movdqa  xmm8, xmm15
662        psrld   xmm15, 8
663        pslld   xmm8, 24
664        pxor    xmm15, xmm8
665        movdqa  xmm8, xmmword ptr [rsp+0x100]
666        paddd   xmm8, xmm12
667        paddd   xmm9, xmm13
668        paddd   xmm10, xmm14
669        paddd   xmm11, xmm15
670        pxor    xmm4, xmm8
671        pxor    xmm5, xmm9
672        pxor    xmm6, xmm10
673        pxor    xmm7, xmm11
674        movdqa  xmmword ptr [rsp+0x100], xmm8
675        movdqa  xmm8, xmm4
676        psrld   xmm8, 7
677        pslld   xmm4, 25
678        por     xmm4, xmm8
679        movdqa  xmm8, xmm5
680        psrld   xmm8, 7
681        pslld   xmm5, 25
682        por     xmm5, xmm8
683        movdqa  xmm8, xmm6
684        psrld   xmm8, 7
685        pslld   xmm6, 25
686        por     xmm6, xmm8
687        movdqa  xmm8, xmm7
688        psrld   xmm8, 7
689        pslld   xmm7, 25
690        por     xmm7, xmm8
691        paddd   xmm0, xmmword ptr [rsp+0x60]
692        paddd   xmm1, xmmword ptr [rsp+0x90]
693        paddd   xmm2, xmmword ptr [rsp+0xB0]
694        paddd   xmm3, xmmword ptr [rsp+0x80]
695        paddd   xmm0, xmm5
696        paddd   xmm1, xmm6
697        paddd   xmm2, xmm7
698        paddd   xmm3, xmm4
699        pxor    xmm15, xmm0
700        pxor    xmm12, xmm1
701        pxor    xmm13, xmm2
702        pxor    xmm14, xmm3
703        pshuflw xmm15, xmm15, 0xB1
704        pshufhw xmm15, xmm15, 0xB1
705        pshuflw xmm12, xmm12, 0xB1
706        pshufhw xmm12, xmm12, 0xB1
707        pshuflw xmm13, xmm13, 0xB1
708        pshufhw xmm13, xmm13, 0xB1
709        pshuflw xmm14, xmm14, 0xB1
710        pshufhw xmm14, xmm14, 0xB1
711        paddd   xmm10, xmm15
712        paddd   xmm11, xmm12
713        movdqa  xmm8, xmmword ptr [rsp+0x100]
714        paddd   xmm8, xmm13
715        paddd   xmm9, xmm14
716        pxor    xmm5, xmm10
717        pxor    xmm6, xmm11
718        pxor    xmm7, xmm8
719        pxor    xmm4, xmm9
720        movdqa  xmmword ptr [rsp+0x100], xmm8
721        movdqa  xmm8, xmm5
722        psrld   xmm8, 12
723        pslld   xmm5, 20
724        por     xmm5, xmm8
725        movdqa  xmm8, xmm6
726        psrld   xmm8, 12
727        pslld   xmm6, 20
728        por     xmm6, xmm8
729        movdqa  xmm8, xmm7
730        psrld   xmm8, 12
731        pslld   xmm7, 20
732        por     xmm7, xmm8
733        movdqa  xmm8, xmm4
734        psrld   xmm8, 12
735        pslld   xmm4, 20
736        por     xmm4, xmm8
737        paddd   xmm0, xmmword ptr [rsp+0x50]
738        paddd   xmm1, xmmword ptr [rsp]
739        paddd   xmm2, xmmword ptr [rsp+0xF0]
740        paddd   xmm3, xmmword ptr [rsp+0x10]
741        paddd   xmm0, xmm5
742        paddd   xmm1, xmm6
743        paddd   xmm2, xmm7
744        paddd   xmm3, xmm4
745        pxor    xmm15, xmm0
746        pxor    xmm12, xmm1
747        pxor    xmm13, xmm2
748        pxor    xmm14, xmm3
749        movdqa  xmm8, xmm15
750        psrld   xmm15, 8
751        pslld   xmm8, 24
752        pxor    xmm15, xmm8
753        movdqa  xmm8, xmm12
754        psrld   xmm12, 8
755        pslld   xmm8, 24
756        pxor    xmm12, xmm8
757        movdqa  xmm8, xmm13
758        psrld   xmm13, 8
759        pslld   xmm8, 24
760        pxor    xmm13, xmm8
761        movdqa  xmm8, xmm14
762        psrld   xmm14, 8
763        pslld   xmm8, 24
764        pxor    xmm14, xmm8
765        paddd   xmm10, xmm15
766        paddd   xmm11, xmm12
767        movdqa  xmm8, xmmword ptr [rsp+0x100]
768        paddd   xmm8, xmm13
769        paddd   xmm9, xmm14
770        pxor    xmm5, xmm10
771        pxor    xmm6, xmm11
772        pxor    xmm7, xmm8
773        pxor    xmm4, xmm9
774        movdqa  xmmword ptr [rsp+0x100], xmm8
775        movdqa  xmm8, xmm5
776        psrld   xmm8, 7
777        pslld   xmm5, 25
778        por     xmm5, xmm8
779        movdqa  xmm8, xmm6
780        psrld   xmm8, 7
781        pslld   xmm6, 25
782        por     xmm6, xmm8
783        movdqa  xmm8, xmm7
784        psrld   xmm8, 7
785        pslld   xmm7, 25
786        por     xmm7, xmm8
787        movdqa  xmm8, xmm4
788        psrld   xmm8, 7
789        pslld   xmm4, 25
790        por     xmm4, xmm8
791        paddd   xmm0, xmmword ptr [rsp+0xA0]
792        paddd   xmm1, xmmword ptr [rsp+0xC0]
793        paddd   xmm2, xmmword ptr [rsp+0xE0]
794        paddd   xmm3, xmmword ptr [rsp+0xD0]
795        paddd   xmm0, xmm4
796        paddd   xmm1, xmm5
797        paddd   xmm2, xmm6
798        paddd   xmm3, xmm7
799        pxor    xmm12, xmm0
800        pxor    xmm13, xmm1
801        pxor    xmm14, xmm2
802        pxor    xmm15, xmm3
803        pshuflw xmm12, xmm12, 0xB1
804        pshufhw xmm12, xmm12, 0xB1
805        pshuflw xmm13, xmm13, 0xB1
806        pshufhw xmm13, xmm13, 0xB1
807        pshuflw xmm14, xmm14, 0xB1
808        pshufhw xmm14, xmm14, 0xB1
809        pshuflw xmm15, xmm15, 0xB1
810        pshufhw xmm15, xmm15, 0xB1
811        movdqa  xmm8, xmmword ptr [rsp+0x100]
812        paddd   xmm8, xmm12
813        paddd   xmm9, xmm13
814        paddd   xmm10, xmm14
815        paddd   xmm11, xmm15
816        pxor    xmm4, xmm8
817        pxor    xmm5, xmm9
818        pxor    xmm6, xmm10
819        pxor    xmm7, xmm11
820        movdqa  xmmword ptr [rsp+0x100], xmm8
821        movdqa  xmm8, xmm4
822        psrld   xmm8, 12
823        pslld   xmm4, 20
824        por     xmm4, xmm8
825        movdqa  xmm8, xmm5
826        psrld   xmm8, 12
827        pslld   xmm5, 20
828        por     xmm5, xmm8
829        movdqa  xmm8, xmm6
830        psrld   xmm8, 12
831        pslld   xmm6, 20
832        por     xmm6, xmm8
833        movdqa  xmm8, xmm7
834        psrld   xmm8, 12
835        pslld   xmm7, 20
836        por     xmm7, xmm8
837        paddd   xmm0, xmmword ptr [rsp+0x70]
838        paddd   xmm1, xmmword ptr [rsp+0x90]
839        paddd   xmm2, xmmword ptr [rsp+0x30]
840        paddd   xmm3, xmmword ptr [rsp+0xF0]
841        paddd   xmm0, xmm4
842        paddd   xmm1, xmm5
843        paddd   xmm2, xmm6
844        paddd   xmm3, xmm7
845        pxor    xmm12, xmm0
846        pxor    xmm13, xmm1
847        pxor    xmm14, xmm2
848        pxor    xmm15, xmm3
849        movdqa  xmm8, xmm12
850        psrld   xmm12, 8
851        pslld   xmm8, 24
852        pxor    xmm12, xmm8
853        movdqa  xmm8, xmm13
854        psrld   xmm13, 8
855        pslld   xmm8, 24
856        pxor    xmm13, xmm8
857        movdqa  xmm8, xmm14
858        psrld   xmm14, 8
859        pslld   xmm8, 24
860        pxor    xmm14, xmm8
861        movdqa  xmm8, xmm15
862        psrld   xmm15, 8
863        pslld   xmm8, 24
864        pxor    xmm15, xmm8
865        movdqa  xmm8, xmmword ptr [rsp+0x100]
866        paddd   xmm8, xmm12
867        paddd   xmm9, xmm13
868        paddd   xmm10, xmm14
869        paddd   xmm11, xmm15
870        pxor    xmm4, xmm8
871        pxor    xmm5, xmm9
872        pxor    xmm6, xmm10
873        pxor    xmm7, xmm11
874        movdqa  xmmword ptr [rsp+0x100], xmm8
875        movdqa  xmm8, xmm4
876        psrld   xmm8, 7
877        pslld   xmm4, 25
878        por     xmm4, xmm8
879        movdqa  xmm8, xmm5
880        psrld   xmm8, 7
881        pslld   xmm5, 25
882        por     xmm5, xmm8
883        movdqa  xmm8, xmm6
884        psrld   xmm8, 7
885        pslld   xmm6, 25
886        por     xmm6, xmm8
887        movdqa  xmm8, xmm7
888        psrld   xmm8, 7
889        pslld   xmm7, 25
890        por     xmm7, xmm8
891        paddd   xmm0, xmmword ptr [rsp+0x40]
892        paddd   xmm1, xmmword ptr [rsp+0xB0]
893        paddd   xmm2, xmmword ptr [rsp+0x50]
894        paddd   xmm3, xmmword ptr [rsp+0x10]
895        paddd   xmm0, xmm5
896        paddd   xmm1, xmm6
897        paddd   xmm2, xmm7
898        paddd   xmm3, xmm4
899        pxor    xmm15, xmm0
900        pxor    xmm12, xmm1
901        pxor    xmm13, xmm2
902        pxor    xmm14, xmm3
903        pshuflw xmm15, xmm15, 0xB1
904        pshufhw xmm15, xmm15, 0xB1
905        pshuflw xmm12, xmm12, 0xB1
906        pshufhw xmm12, xmm12, 0xB1
907        pshuflw xmm13, xmm13, 0xB1
908        pshufhw xmm13, xmm13, 0xB1
909        pshuflw xmm14, xmm14, 0xB1
910        pshufhw xmm14, xmm14, 0xB1
911        paddd   xmm10, xmm15
912        paddd   xmm11, xmm12
913        movdqa  xmm8, xmmword ptr [rsp+0x100]
914        paddd   xmm8, xmm13
915        paddd   xmm9, xmm14
916        pxor    xmm5, xmm10
917        pxor    xmm6, xmm11
918        pxor    xmm7, xmm8
919        pxor    xmm4, xmm9
920        movdqa  xmmword ptr [rsp+0x100], xmm8
921        movdqa  xmm8, xmm5
922        psrld   xmm8, 12
923        pslld   xmm5, 20
924        por     xmm5, xmm8
925        movdqa  xmm8, xmm6
926        psrld   xmm8, 12
927        pslld   xmm6, 20
928        por     xmm6, xmm8
929        movdqa  xmm8, xmm7
930        psrld   xmm8, 12
931        pslld   xmm7, 20
932        por     xmm7, xmm8
933        movdqa  xmm8, xmm4
934        psrld   xmm8, 12
935        pslld   xmm4, 20
936        por     xmm4, xmm8
937        paddd   xmm0, xmmword ptr [rsp]
938        paddd   xmm1, xmmword ptr [rsp+0x20]
939        paddd   xmm2, xmmword ptr [rsp+0x80]
940        paddd   xmm3, xmmword ptr [rsp+0x60]
941        paddd   xmm0, xmm5
942        paddd   xmm1, xmm6
943        paddd   xmm2, xmm7
944        paddd   xmm3, xmm4
945        pxor    xmm15, xmm0
946        pxor    xmm12, xmm1
947        pxor    xmm13, xmm2
948        pxor    xmm14, xmm3
949        movdqa  xmm8, xmm15
950        psrld   xmm15, 8
951        pslld   xmm8, 24
952        pxor    xmm15, xmm8
953        movdqa  xmm8, xmm12
954        psrld   xmm12, 8
955        pslld   xmm8, 24
956        pxor    xmm12, xmm8
957        movdqa  xmm8, xmm13
958        psrld   xmm13, 8
959        pslld   xmm8, 24
960        pxor    xmm13, xmm8
961        movdqa  xmm8, xmm14
962        psrld   xmm14, 8
963        pslld   xmm8, 24
964        pxor    xmm14, xmm8
965        paddd   xmm10, xmm15
966        paddd   xmm11, xmm12
967        movdqa  xmm8, xmmword ptr [rsp+0x100]
968        paddd   xmm8, xmm13
969        paddd   xmm9, xmm14
970        pxor    xmm5, xmm10
971        pxor    xmm6, xmm11
972        pxor    xmm7, xmm8
973        pxor    xmm4, xmm9
974        movdqa  xmmword ptr [rsp+0x100], xmm8
975        movdqa  xmm8, xmm5
976        psrld   xmm8, 7
977        pslld   xmm5, 25
978        por     xmm5, xmm8
979        movdqa  xmm8, xmm6
980        psrld   xmm8, 7
981        pslld   xmm6, 25
982        por     xmm6, xmm8
983        movdqa  xmm8, xmm7
984        psrld   xmm8, 7
985        pslld   xmm7, 25
986        por     xmm7, xmm8
987        movdqa  xmm8, xmm4
988        psrld   xmm8, 7
989        pslld   xmm4, 25
990        por     xmm4, xmm8
991        paddd   xmm0, xmmword ptr [rsp+0xC0]
992        paddd   xmm1, xmmword ptr [rsp+0x90]
993        paddd   xmm2, xmmword ptr [rsp+0xF0]
994        paddd   xmm3, xmmword ptr [rsp+0xE0]
995        paddd   xmm0, xmm4
996        paddd   xmm1, xmm5
997        paddd   xmm2, xmm6
998        paddd   xmm3, xmm7
999        pxor    xmm12, xmm0
1000        pxor    xmm13, xmm1
1001        pxor    xmm14, xmm2
1002        pxor    xmm15, xmm3
1003        pshuflw xmm12, xmm12, 0xB1
1004        pshufhw xmm12, xmm12, 0xB1
1005        pshuflw xmm13, xmm13, 0xB1
1006        pshufhw xmm13, xmm13, 0xB1
1007        pshuflw xmm14, xmm14, 0xB1
1008        pshufhw xmm14, xmm14, 0xB1
1009        pshuflw xmm15, xmm15, 0xB1
1010        pshufhw xmm15, xmm15, 0xB1
1011        movdqa  xmm8, xmmword ptr [rsp+0x100]
1012        paddd   xmm8, xmm12
1013        paddd   xmm9, xmm13
1014        paddd   xmm10, xmm14
1015        paddd   xmm11, xmm15
1016        pxor    xmm4, xmm8
1017        pxor    xmm5, xmm9
1018        pxor    xmm6, xmm10
1019        pxor    xmm7, xmm11
1020        movdqa  xmmword ptr [rsp+0x100], xmm8
1021        movdqa  xmm8, xmm4
1022        psrld   xmm8, 12
1023        pslld   xmm4, 20
1024        por     xmm4, xmm8
1025        movdqa  xmm8, xmm5
1026        psrld   xmm8, 12
1027        pslld   xmm5, 20
1028        por     xmm5, xmm8
1029        movdqa  xmm8, xmm6
1030        psrld   xmm8, 12
1031        pslld   xmm6, 20
1032        por     xmm6, xmm8
1033        movdqa  xmm8, xmm7
1034        psrld   xmm8, 12
1035        pslld   xmm7, 20
1036        por     xmm7, xmm8
1037        paddd   xmm0, xmmword ptr [rsp+0xD0]
1038        paddd   xmm1, xmmword ptr [rsp+0xB0]
1039        paddd   xmm2, xmmword ptr [rsp+0xA0]
1040        paddd   xmm3, xmmword ptr [rsp+0x80]
1041        paddd   xmm0, xmm4
1042        paddd   xmm1, xmm5
1043        paddd   xmm2, xmm6
1044        paddd   xmm3, xmm7
1045        pxor    xmm12, xmm0
1046        pxor    xmm13, xmm1
1047        pxor    xmm14, xmm2
1048        pxor    xmm15, xmm3
1049        movdqa  xmm8, xmm12
1050        psrld   xmm12, 8
1051        pslld   xmm8, 24
1052        pxor    xmm12, xmm8
1053        movdqa  xmm8, xmm13
1054        psrld   xmm13, 8
1055        pslld   xmm8, 24
1056        pxor    xmm13, xmm8
1057        movdqa  xmm8, xmm14
1058        psrld   xmm14, 8
1059        pslld   xmm8, 24
1060        pxor    xmm14, xmm8
1061        movdqa  xmm8, xmm15
1062        psrld   xmm15, 8
1063        pslld   xmm8, 24
1064        pxor    xmm15, xmm8
1065        movdqa  xmm8, xmmword ptr [rsp+0x100]
1066        paddd   xmm8, xmm12
1067        paddd   xmm9, xmm13
1068        paddd   xmm10, xmm14
1069        paddd   xmm11, xmm15
1070        pxor    xmm4, xmm8
1071        pxor    xmm5, xmm9
1072        pxor    xmm6, xmm10
1073        pxor    xmm7, xmm11
1074        movdqa  xmmword ptr [rsp+0x100], xmm8
1075        movdqa  xmm8, xmm4
1076        psrld   xmm8, 7
1077        pslld   xmm4, 25
1078        por     xmm4, xmm8
1079        movdqa  xmm8, xmm5
1080        psrld   xmm8, 7
1081        pslld   xmm5, 25
1082        por     xmm5, xmm8
1083        movdqa  xmm8, xmm6
1084        psrld   xmm8, 7
1085        pslld   xmm6, 25
1086        por     xmm6, xmm8
1087        movdqa  xmm8, xmm7
1088        psrld   xmm8, 7
1089        pslld   xmm7, 25
1090        por     xmm7, xmm8
1091        paddd   xmm0, xmmword ptr [rsp+0x70]
1092        paddd   xmm1, xmmword ptr [rsp+0x50]
1093        paddd   xmm2, xmmword ptr [rsp]
1094        paddd   xmm3, xmmword ptr [rsp+0x60]
1095        paddd   xmm0, xmm5
1096        paddd   xmm1, xmm6
1097        paddd   xmm2, xmm7
1098        paddd   xmm3, xmm4
1099        pxor    xmm15, xmm0
1100        pxor    xmm12, xmm1
1101        pxor    xmm13, xmm2
1102        pxor    xmm14, xmm3
1103        pshuflw xmm15, xmm15, 0xB1
1104        pshufhw xmm15, xmm15, 0xB1
1105        pshuflw xmm12, xmm12, 0xB1
1106        pshufhw xmm12, xmm12, 0xB1
1107        pshuflw xmm13, xmm13, 0xB1
1108        pshufhw xmm13, xmm13, 0xB1
1109        pshuflw xmm14, xmm14, 0xB1
1110        pshufhw xmm14, xmm14, 0xB1
1111        paddd   xmm10, xmm15
1112        paddd   xmm11, xmm12
1113        movdqa  xmm8, xmmword ptr [rsp+0x100]
1114        paddd   xmm8, xmm13
1115        paddd   xmm9, xmm14
1116        pxor    xmm5, xmm10
1117        pxor    xmm6, xmm11
1118        pxor    xmm7, xmm8
1119        pxor    xmm4, xmm9
1120        movdqa  xmmword ptr [rsp+0x100], xmm8
1121        movdqa  xmm8, xmm5
1122        psrld   xmm8, 12
1123        pslld   xmm5, 20
1124        por     xmm5, xmm8
1125        movdqa  xmm8, xmm6
1126        psrld   xmm8, 12
1127        pslld   xmm6, 20
1128        por     xmm6, xmm8
1129        movdqa  xmm8, xmm7
1130        psrld   xmm8, 12
1131        pslld   xmm7, 20
1132        por     xmm7, xmm8
1133        movdqa  xmm8, xmm4
1134        psrld   xmm8, 12
1135        pslld   xmm4, 20
1136        por     xmm4, xmm8
1137        paddd   xmm0, xmmword ptr [rsp+0x20]
1138        paddd   xmm1, xmmword ptr [rsp+0x30]
1139        paddd   xmm2, xmmword ptr [rsp+0x10]
1140        paddd   xmm3, xmmword ptr [rsp+0x40]
1141        paddd   xmm0, xmm5
1142        paddd   xmm1, xmm6
1143        paddd   xmm2, xmm7
1144        paddd   xmm3, xmm4
1145        pxor    xmm15, xmm0
1146        pxor    xmm12, xmm1
1147        pxor    xmm13, xmm2
1148        pxor    xmm14, xmm3
1149        movdqa  xmm8, xmm15
1150        psrld   xmm15, 8
1151        pslld   xmm8, 24
1152        pxor    xmm15, xmm8
1153        movdqa  xmm8, xmm12
1154        psrld   xmm12, 8
1155        pslld   xmm8, 24
1156        pxor    xmm12, xmm8
1157        movdqa  xmm8, xmm13
1158        psrld   xmm13, 8
1159        pslld   xmm8, 24
1160        pxor    xmm13, xmm8
1161        movdqa  xmm8, xmm14
1162        psrld   xmm14, 8
1163        pslld   xmm8, 24
1164        pxor    xmm14, xmm8
1165        paddd   xmm10, xmm15
1166        paddd   xmm11, xmm12
1167        movdqa  xmm8, xmmword ptr [rsp+0x100]
1168        paddd   xmm8, xmm13
1169        paddd   xmm9, xmm14
1170        pxor    xmm5, xmm10
1171        pxor    xmm6, xmm11
1172        pxor    xmm7, xmm8
1173        pxor    xmm4, xmm9
1174        movdqa  xmmword ptr [rsp+0x100], xmm8
1175        movdqa  xmm8, xmm5
1176        psrld   xmm8, 7
1177        pslld   xmm5, 25
1178        por     xmm5, xmm8
1179        movdqa  xmm8, xmm6
1180        psrld   xmm8, 7
1181        pslld   xmm6, 25
1182        por     xmm6, xmm8
1183        movdqa  xmm8, xmm7
1184        psrld   xmm8, 7
1185        pslld   xmm7, 25
1186        por     xmm7, xmm8
1187        movdqa  xmm8, xmm4
1188        psrld   xmm8, 7
1189        pslld   xmm4, 25
1190        por     xmm4, xmm8
1191        paddd   xmm0, xmmword ptr [rsp+0x90]
1192        paddd   xmm1, xmmword ptr [rsp+0xB0]
1193        paddd   xmm2, xmmword ptr [rsp+0x80]
1194        paddd   xmm3, xmmword ptr [rsp+0xF0]
1195        paddd   xmm0, xmm4
1196        paddd   xmm1, xmm5
1197        paddd   xmm2, xmm6
1198        paddd   xmm3, xmm7
1199        pxor    xmm12, xmm0
1200        pxor    xmm13, xmm1
1201        pxor    xmm14, xmm2
1202        pxor    xmm15, xmm3
1203        pshuflw xmm12, xmm12, 0xB1
1204        pshufhw xmm12, xmm12, 0xB1
1205        pshuflw xmm13, xmm13, 0xB1
1206        pshufhw xmm13, xmm13, 0xB1
1207        pshuflw xmm14, xmm14, 0xB1
1208        pshufhw xmm14, xmm14, 0xB1
1209        pshuflw xmm15, xmm15, 0xB1
1210        pshufhw xmm15, xmm15, 0xB1
1211        movdqa  xmm8, xmmword ptr [rsp+0x100]
1212        paddd   xmm8, xmm12
1213        paddd   xmm9, xmm13
1214        paddd   xmm10, xmm14
1215        paddd   xmm11, xmm15
1216        pxor    xmm4, xmm8
1217        pxor    xmm5, xmm9
1218        pxor    xmm6, xmm10
1219        pxor    xmm7, xmm11
1220        movdqa  xmmword ptr [rsp+0x100], xmm8
1221        movdqa  xmm8, xmm4
1222        psrld   xmm8, 12
1223        pslld   xmm4, 20
1224        por     xmm4, xmm8
1225        movdqa  xmm8, xmm5
1226        psrld   xmm8, 12
1227        pslld   xmm5, 20
1228        por     xmm5, xmm8
1229        movdqa  xmm8, xmm6
1230        psrld   xmm8, 12
1231        pslld   xmm6, 20
1232        por     xmm6, xmm8
1233        movdqa  xmm8, xmm7
1234        psrld   xmm8, 12
1235        pslld   xmm7, 20
1236        por     xmm7, xmm8
1237        paddd   xmm0, xmmword ptr [rsp+0xE0]
1238        paddd   xmm1, xmmword ptr [rsp+0x50]
1239        paddd   xmm2, xmmword ptr [rsp+0xC0]
1240        paddd   xmm3, xmmword ptr [rsp+0x10]
1241        paddd   xmm0, xmm4
1242        paddd   xmm1, xmm5
1243        paddd   xmm2, xmm6
1244        paddd   xmm3, xmm7
1245        pxor    xmm12, xmm0
1246        pxor    xmm13, xmm1
1247        pxor    xmm14, xmm2
1248        pxor    xmm15, xmm3
1249        movdqa  xmm8, xmm12
1250        psrld   xmm12, 8
1251        pslld   xmm8, 24
1252        pxor    xmm12, xmm8
1253        movdqa  xmm8, xmm13
1254        psrld   xmm13, 8
1255        pslld   xmm8, 24
1256        pxor    xmm13, xmm8
1257        movdqa  xmm8, xmm14
1258        psrld   xmm14, 8
1259        pslld   xmm8, 24
1260        pxor    xmm14, xmm8
1261        movdqa  xmm8, xmm15
1262        psrld   xmm15, 8
1263        pslld   xmm8, 24
1264        pxor    xmm15, xmm8
1265        movdqa  xmm8, xmmword ptr [rsp+0x100]
1266        paddd   xmm8, xmm12
1267        paddd   xmm9, xmm13
1268        paddd   xmm10, xmm14
1269        paddd   xmm11, xmm15
1270        pxor    xmm4, xmm8
1271        pxor    xmm5, xmm9
1272        pxor    xmm6, xmm10
1273        pxor    xmm7, xmm11
1274        movdqa  xmmword ptr [rsp+0x100], xmm8
1275        movdqa  xmm8, xmm4
1276        psrld   xmm8, 7
1277        pslld   xmm4, 25
1278        por     xmm4, xmm8
1279        movdqa  xmm8, xmm5
1280        psrld   xmm8, 7
1281        pslld   xmm5, 25
1282        por     xmm5, xmm8
1283        movdqa  xmm8, xmm6
1284        psrld   xmm8, 7
1285        pslld   xmm6, 25
1286        por     xmm6, xmm8
1287        movdqa  xmm8, xmm7
1288        psrld   xmm8, 7
1289        pslld   xmm7, 25
1290        por     xmm7, xmm8
1291        paddd   xmm0, xmmword ptr [rsp+0xD0]
1292        paddd   xmm1, xmmword ptr [rsp]
1293        paddd   xmm2, xmmword ptr [rsp+0x20]
1294        paddd   xmm3, xmmword ptr [rsp+0x40]
1295        paddd   xmm0, xmm5
1296        paddd   xmm1, xmm6
1297        paddd   xmm2, xmm7
1298        paddd   xmm3, xmm4
1299        pxor    xmm15, xmm0
1300        pxor    xmm12, xmm1
1301        pxor    xmm13, xmm2
1302        pxor    xmm14, xmm3
1303        pshuflw xmm15, xmm15, 0xB1
1304        pshufhw xmm15, xmm15, 0xB1
1305        pshuflw xmm12, xmm12, 0xB1
1306        pshufhw xmm12, xmm12, 0xB1
1307        pshuflw xmm13, xmm13, 0xB1
1308        pshufhw xmm13, xmm13, 0xB1
1309        pshuflw xmm14, xmm14, 0xB1
1310        pshufhw xmm14, xmm14, 0xB1
1311        paddd   xmm10, xmm15
1312        paddd   xmm11, xmm12
1313        movdqa  xmm8, xmmword ptr [rsp+0x100]
1314        paddd   xmm8, xmm13
1315        paddd   xmm9, xmm14
1316        pxor    xmm5, xmm10
1317        pxor    xmm6, xmm11
1318        pxor    xmm7, xmm8
1319        pxor    xmm4, xmm9
1320        movdqa  xmmword ptr [rsp+0x100], xmm8
1321        movdqa  xmm8, xmm5
1322        psrld   xmm8, 12
1323        pslld   xmm5, 20
1324        por     xmm5, xmm8
1325        movdqa  xmm8, xmm6
1326        psrld   xmm8, 12
1327        pslld   xmm6, 20
1328        por     xmm6, xmm8
1329        movdqa  xmm8, xmm7
1330        psrld   xmm8, 12
1331        pslld   xmm7, 20
1332        por     xmm7, xmm8
1333        movdqa  xmm8, xmm4
1334        psrld   xmm8, 12
1335        pslld   xmm4, 20
1336        por     xmm4, xmm8
1337        paddd   xmm0, xmmword ptr [rsp+0x30]
1338        paddd   xmm1, xmmword ptr [rsp+0xA0]
1339        paddd   xmm2, xmmword ptr [rsp+0x60]
1340        paddd   xmm3, xmmword ptr [rsp+0x70]
1341        paddd   xmm0, xmm5
1342        paddd   xmm1, xmm6
1343        paddd   xmm2, xmm7
1344        paddd   xmm3, xmm4
1345        pxor    xmm15, xmm0
1346        pxor    xmm12, xmm1
1347        pxor    xmm13, xmm2
1348        pxor    xmm14, xmm3
1349        movdqa  xmm8, xmm15
1350        psrld   xmm15, 8
1351        pslld   xmm8, 24
1352        pxor    xmm15, xmm8
1353        movdqa  xmm8, xmm12
1354        psrld   xmm12, 8
1355        pslld   xmm8, 24
1356        pxor    xmm12, xmm8
1357        movdqa  xmm8, xmm13
1358        psrld   xmm13, 8
1359        pslld   xmm8, 24
1360        pxor    xmm13, xmm8
1361        movdqa  xmm8, xmm14
1362        psrld   xmm14, 8
1363        pslld   xmm8, 24
1364        pxor    xmm14, xmm8
1365        paddd   xmm10, xmm15
1366        paddd   xmm11, xmm12
1367        movdqa  xmm8, xmmword ptr [rsp+0x100]
1368        paddd   xmm8, xmm13
1369        paddd   xmm9, xmm14
1370        pxor    xmm5, xmm10
1371        pxor    xmm6, xmm11
1372        pxor    xmm7, xmm8
1373        pxor    xmm4, xmm9
1374        movdqa  xmmword ptr [rsp+0x100], xmm8
1375        movdqa  xmm8, xmm5
1376        psrld   xmm8, 7
1377        pslld   xmm5, 25
1378        por     xmm5, xmm8
1379        movdqa  xmm8, xmm6
1380        psrld   xmm8, 7
1381        pslld   xmm6, 25
1382        por     xmm6, xmm8
1383        movdqa  xmm8, xmm7
1384        psrld   xmm8, 7
1385        pslld   xmm7, 25
1386        por     xmm7, xmm8
1387        movdqa  xmm8, xmm4
1388        psrld   xmm8, 7
1389        pslld   xmm4, 25
1390        por     xmm4, xmm8
1391        paddd   xmm0, xmmword ptr [rsp+0xB0]
1392        paddd   xmm1, xmmword ptr [rsp+0x50]
1393        paddd   xmm2, xmmword ptr [rsp+0x10]
1394        paddd   xmm3, xmmword ptr [rsp+0x80]
1395        paddd   xmm0, xmm4
1396        paddd   xmm1, xmm5
1397        paddd   xmm2, xmm6
1398        paddd   xmm3, xmm7
1399        pxor    xmm12, xmm0
1400        pxor    xmm13, xmm1
1401        pxor    xmm14, xmm2
1402        pxor    xmm15, xmm3
1403        pshuflw xmm12, xmm12, 0xB1
1404        pshufhw xmm12, xmm12, 0xB1
1405        pshuflw xmm13, xmm13, 0xB1
1406        pshufhw xmm13, xmm13, 0xB1
1407        pshuflw xmm14, xmm14, 0xB1
1408        pshufhw xmm14, xmm14, 0xB1
1409        pshuflw xmm15, xmm15, 0xB1
1410        pshufhw xmm15, xmm15, 0xB1
1411        movdqa  xmm8, xmmword ptr [rsp+0x100]
1412        paddd   xmm8, xmm12
1413        paddd   xmm9, xmm13
1414        paddd   xmm10, xmm14
1415        paddd   xmm11, xmm15
1416        pxor    xmm4, xmm8
1417        pxor    xmm5, xmm9
1418        pxor    xmm6, xmm10
1419        pxor    xmm7, xmm11
1420        movdqa  xmmword ptr [rsp+0x100], xmm8
1421        movdqa  xmm8, xmm4
1422        psrld   xmm8, 12
1423        pslld   xmm4, 20
1424        por     xmm4, xmm8
1425        movdqa  xmm8, xmm5
1426        psrld   xmm8, 12
1427        pslld   xmm5, 20
1428        por     xmm5, xmm8
1429        movdqa  xmm8, xmm6
1430        psrld   xmm8, 12
1431        pslld   xmm6, 20
1432        por     xmm6, xmm8
1433        movdqa  xmm8, xmm7
1434        psrld   xmm8, 12
1435        pslld   xmm7, 20
1436        por     xmm7, xmm8
1437        paddd   xmm0, xmmword ptr [rsp+0xF0]
1438        paddd   xmm1, xmmword ptr [rsp]
1439        paddd   xmm2, xmmword ptr [rsp+0x90]
1440        paddd   xmm3, xmmword ptr [rsp+0x60]
1441        paddd   xmm0, xmm4
1442        paddd   xmm1, xmm5
1443        paddd   xmm2, xmm6
1444        paddd   xmm3, xmm7
1445        pxor    xmm12, xmm0
1446        pxor    xmm13, xmm1
1447        pxor    xmm14, xmm2
1448        pxor    xmm15, xmm3
1449        movdqa  xmm8, xmm12
1450        psrld   xmm12, 8
1451        pslld   xmm8, 24
1452        pxor    xmm12, xmm8
1453        movdqa  xmm8, xmm13
1454        psrld   xmm13, 8
1455        pslld   xmm8, 24
1456        pxor    xmm13, xmm8
1457        movdqa  xmm8, xmm14
1458        psrld   xmm14, 8
1459        pslld   xmm8, 24
1460        pxor    xmm14, xmm8
1461        movdqa  xmm8, xmm15
1462        psrld   xmm15, 8
1463        pslld   xmm8, 24
1464        pxor    xmm15, xmm8
1465        movdqa  xmm8, xmmword ptr [rsp+0x100]
1466        paddd   xmm8, xmm12
1467        paddd   xmm9, xmm13
1468        paddd   xmm10, xmm14
1469        paddd   xmm11, xmm15
1470        pxor    xmm4, xmm8
1471        pxor    xmm5, xmm9
1472        pxor    xmm6, xmm10
1473        pxor    xmm7, xmm11
1474        movdqa  xmmword ptr [rsp+0x100], xmm8
1475        movdqa  xmm8, xmm4
1476        psrld   xmm8, 7
1477        pslld   xmm4, 25
1478        por     xmm4, xmm8
1479        movdqa  xmm8, xmm5
1480        psrld   xmm8, 7
1481        pslld   xmm5, 25
1482        por     xmm5, xmm8
1483        movdqa  xmm8, xmm6
1484        psrld   xmm8, 7
1485        pslld   xmm6, 25
1486        por     xmm6, xmm8
1487        movdqa  xmm8, xmm7
1488        psrld   xmm8, 7
1489        pslld   xmm7, 25
1490        por     xmm7, xmm8
1491        paddd   xmm0, xmmword ptr [rsp+0xE0]
1492        paddd   xmm1, xmmword ptr [rsp+0x20]
1493        paddd   xmm2, xmmword ptr [rsp+0x30]
1494        paddd   xmm3, xmmword ptr [rsp+0x70]
1495        paddd   xmm0, xmm5
1496        paddd   xmm1, xmm6
1497        paddd   xmm2, xmm7
1498        paddd   xmm3, xmm4
1499        pxor    xmm15, xmm0
1500        pxor    xmm12, xmm1
1501        pxor    xmm13, xmm2
1502        pxor    xmm14, xmm3
1503        pshuflw xmm15, xmm15, 0xB1
1504        pshufhw xmm15, xmm15, 0xB1
1505        pshuflw xmm12, xmm12, 0xB1
1506        pshufhw xmm12, xmm12, 0xB1
1507        pshuflw xmm13, xmm13, 0xB1
1508        pshufhw xmm13, xmm13, 0xB1
1509        pshuflw xmm14, xmm14, 0xB1
1510        pshufhw xmm14, xmm14, 0xB1
1511        paddd   xmm10, xmm15
1512        paddd   xmm11, xmm12
1513        movdqa  xmm8, xmmword ptr [rsp+0x100]
1514        paddd   xmm8, xmm13
1515        paddd   xmm9, xmm14
1516        pxor    xmm5, xmm10
1517        pxor    xmm6, xmm11
1518        pxor    xmm7, xmm8
1519        pxor    xmm4, xmm9
1520        movdqa  xmmword ptr [rsp+0x100], xmm8
1521        movdqa  xmm8, xmm5
1522        psrld   xmm8, 12
1523        pslld   xmm5, 20
1524        por     xmm5, xmm8
1525        movdqa  xmm8, xmm6
1526        psrld   xmm8, 12
1527        pslld   xmm6, 20
1528        por     xmm6, xmm8
1529        movdqa  xmm8, xmm7
1530        psrld   xmm8, 12
1531        pslld   xmm7, 20
1532        por     xmm7, xmm8
1533        movdqa  xmm8, xmm4
1534        psrld   xmm8, 12
1535        pslld   xmm4, 20
1536        por     xmm4, xmm8
1537        paddd   xmm0, xmmword ptr [rsp+0xA0]
1538        paddd   xmm1, xmmword ptr [rsp+0xC0]
1539        paddd   xmm2, xmmword ptr [rsp+0x40]
1540        paddd   xmm3, xmmword ptr [rsp+0xD0]
1541        paddd   xmm0, xmm5
1542        paddd   xmm1, xmm6
1543        paddd   xmm2, xmm7
1544        paddd   xmm3, xmm4
1545        pxor    xmm15, xmm0
1546        pxor    xmm12, xmm1
1547        pxor    xmm13, xmm2
1548        pxor    xmm14, xmm3
1549        movdqa  xmm8, xmm15
1550        psrld   xmm15, 8
1551        pslld   xmm8, 24
1552        pxor    xmm15, xmm8
1553        movdqa  xmm8, xmm12
1554        psrld   xmm12, 8
1555        pslld   xmm8, 24
1556        pxor    xmm12, xmm8
1557        movdqa  xmm8, xmm13
1558        psrld   xmm13, 8
1559        pslld   xmm8, 24
1560        pxor    xmm13, xmm8
1561        movdqa  xmm8, xmm14
1562        psrld   xmm14, 8
1563        pslld   xmm8, 24
1564        pxor    xmm14, xmm8
1565        paddd   xmm10, xmm15
1566        paddd   xmm11, xmm12
1567        movdqa  xmm8, xmmword ptr [rsp+0x100]
1568        paddd   xmm8, xmm13
1569        paddd   xmm9, xmm14
1570        pxor    xmm5, xmm10
1571        pxor    xmm6, xmm11
1572        pxor    xmm7, xmm8
1573        pxor    xmm4, xmm9
1574        pxor    xmm0, xmm8
1575        pxor    xmm1, xmm9
1576        pxor    xmm2, xmm10
1577        pxor    xmm3, xmm11
1578        movdqa  xmm8, xmm5
1579        psrld   xmm8, 7
1580        pslld   xmm5, 25
1581        por     xmm5, xmm8
1582        movdqa  xmm8, xmm6
1583        psrld   xmm8, 7
1584        pslld   xmm6, 25
1585        por     xmm6, xmm8
1586        movdqa  xmm8, xmm7
1587        psrld   xmm8, 7
1588        pslld   xmm7, 25
1589        por     xmm7, xmm8
1590        movdqa  xmm8, xmm4
1591        psrld   xmm8, 7
1592        pslld   xmm4, 25
1593        por     xmm4, xmm8
1594        pxor    xmm4, xmm12
1595        pxor    xmm5, xmm13
1596        pxor    xmm6, xmm14
1597        pxor    xmm7, xmm15
1598        mov     eax, r13d
1599        jne     9b
1600        movdqa  xmm9, xmm0
1601        punpckldq xmm0, xmm1
1602        punpckhdq xmm9, xmm1
1603        movdqa  xmm11, xmm2
1604        punpckldq xmm2, xmm3
1605        punpckhdq xmm11, xmm3
1606        movdqa  xmm1, xmm0
1607        punpcklqdq xmm0, xmm2
1608        punpckhqdq xmm1, xmm2
1609        movdqa  xmm3, xmm9
1610        punpcklqdq xmm9, xmm11
1611        punpckhqdq xmm3, xmm11
1612        movdqu  xmmword ptr [rbx], xmm0
1613        movdqu  xmmword ptr [rbx+0x20], xmm1
1614        movdqu  xmmword ptr [rbx+0x40], xmm9
1615        movdqu  xmmword ptr [rbx+0x60], xmm3
1616        movdqa  xmm9, xmm4
1617        punpckldq xmm4, xmm5
1618        punpckhdq xmm9, xmm5
1619        movdqa  xmm11, xmm6
1620        punpckldq xmm6, xmm7
1621        punpckhdq xmm11, xmm7
1622        movdqa  xmm5, xmm4
1623        punpcklqdq xmm4, xmm6
1624        punpckhqdq xmm5, xmm6
1625        movdqa  xmm7, xmm9
1626        punpcklqdq xmm9, xmm11
1627        punpckhqdq xmm7, xmm11
1628        movdqu  xmmword ptr [rbx+0x10], xmm4
1629        movdqu  xmmword ptr [rbx+0x30], xmm5
1630        movdqu  xmmword ptr [rbx+0x50], xmm9
1631        movdqu  xmmword ptr [rbx+0x70], xmm7
1632        movdqa  xmm1, xmmword ptr [rsp+0x110]
1633        movdqa  xmm0, xmm1
1634        paddd   xmm1, xmmword ptr [rsp+0x150]
1635        movdqa  xmmword ptr [rsp+0x110], xmm1
1636        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1637        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1638        pcmpgtd xmm0, xmm1
1639        movdqa  xmm1, xmmword ptr [rsp+0x120]
1640        psubd   xmm1, xmm0
1641        movdqa  xmmword ptr [rsp+0x120], xmm1
1642        add     rbx, 128
1643        add     rdi, 32
1644        sub     rsi, 4
1645        cmp     rsi, 4
1646        jnc     2b
1647        test    rsi, rsi
1648        jnz     3f
16494:
1650        mov     rsp, rbp
1651        pop     rbp
1652        pop     rbx
1653        pop     r12
1654        pop     r13
1655        pop     r14
1656        pop     r15
1657        RET
1658.p2align 5
16593:
1660        test    esi, 0x2
1661        je      3f
1662        movups  xmm0, xmmword ptr [rcx]
1663        movups  xmm1, xmmword ptr [rcx+0x10]
1664        movaps  xmm8, xmm0
1665        movaps  xmm9, xmm1
1666        movd    xmm13, dword ptr [rsp+0x110]
1667        movd    xmm14, dword ptr [rsp+0x120]
1668        punpckldq xmm13, xmm14
1669        movaps  xmmword ptr [rsp], xmm13
1670        movd    xmm14, dword ptr [rsp+0x114]
1671        movd    xmm13, dword ptr [rsp+0x124]
1672        punpckldq xmm14, xmm13
1673        movaps  xmmword ptr [rsp+0x10], xmm14
1674        mov     r8, qword ptr [rdi]
1675        mov     r9, qword ptr [rdi+0x8]
1676        movzx   eax, byte ptr [rbp+0x40]
1677        or      eax, r13d
1678        xor     edx, edx
16792:
1680        mov     r14d, eax
1681        or      eax, r12d
1682        add     rdx, 64
1683        cmp     rdx, r15
1684        cmovne  eax, r14d
1685        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1686        movaps  xmm10, xmm2
1687        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1688        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1689        movaps  xmm3, xmm4
1690        shufps  xmm4, xmm5, 136
1691        shufps  xmm3, xmm5, 221
1692        movaps  xmm5, xmm3
1693        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1694        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1695        movaps  xmm3, xmm6
1696        shufps  xmm6, xmm7, 136
1697        pshufd  xmm6, xmm6, 0x93
1698        shufps  xmm3, xmm7, 221
1699        pshufd  xmm7, xmm3, 0x93
1700        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1701        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1702        movaps  xmm11, xmm12
1703        shufps  xmm12, xmm13, 136
1704        shufps  xmm11, xmm13, 221
1705        movaps  xmm13, xmm11
1706        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1707        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1708        movaps  xmm11, xmm14
1709        shufps  xmm14, xmm15, 136
1710        pshufd  xmm14, xmm14, 0x93
1711        shufps  xmm11, xmm15, 221
1712        pshufd  xmm15, xmm11, 0x93
1713        shl     rax, 0x20
1714        or      rax, 0x40
1715        movq    xmm3, rax
1716        movdqa  xmmword ptr [rsp+0x20], xmm3
1717        movaps  xmm3, xmmword ptr [rsp]
1718        movaps  xmm11, xmmword ptr [rsp+0x10]
1719        punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1720        punpcklqdq xmm11, xmmword ptr [rsp+0x20]
1721        mov     al, 7
17229:
1723        paddd   xmm0, xmm4
1724        paddd   xmm8, xmm12
1725        movaps  xmmword ptr [rsp+0x20], xmm4
1726        movaps  xmmword ptr [rsp+0x30], xmm12
1727        paddd   xmm0, xmm1
1728        paddd   xmm8, xmm9
1729        pxor    xmm3, xmm0
1730        pxor    xmm11, xmm8
1731        pshuflw xmm3, xmm3, 0xB1
1732        pshufhw xmm3, xmm3, 0xB1
1733        pshuflw xmm11, xmm11, 0xB1
1734        pshufhw xmm11, xmm11, 0xB1
1735        paddd   xmm2, xmm3
1736        paddd   xmm10, xmm11
1737        pxor    xmm1, xmm2
1738        pxor    xmm9, xmm10
1739        movdqa  xmm4, xmm1
1740        pslld   xmm1, 20
1741        psrld   xmm4, 12
1742        por     xmm1, xmm4
1743        movdqa  xmm4, xmm9
1744        pslld   xmm9, 20
1745        psrld   xmm4, 12
1746        por     xmm9, xmm4
1747        paddd   xmm0, xmm5
1748        paddd   xmm8, xmm13
1749        movaps  xmmword ptr [rsp+0x40], xmm5
1750        movaps  xmmword ptr [rsp+0x50], xmm13
1751        paddd   xmm0, xmm1
1752        paddd   xmm8, xmm9
1753        pxor    xmm3, xmm0
1754        pxor    xmm11, xmm8
1755        movdqa  xmm13, xmm3
1756        psrld   xmm3, 8
1757        pslld   xmm13, 24
1758        pxor    xmm3, xmm13
1759        movdqa  xmm13, xmm11
1760        psrld   xmm11, 8
1761        pslld   xmm13, 24
1762        pxor    xmm11, xmm13
1763        paddd   xmm2, xmm3
1764        paddd   xmm10, xmm11
1765        pxor    xmm1, xmm2
1766        pxor    xmm9, xmm10
1767        movdqa  xmm4, xmm1
1768        pslld   xmm1, 25
1769        psrld   xmm4, 7
1770        por     xmm1, xmm4
1771        movdqa  xmm4, xmm9
1772        pslld   xmm9, 25
1773        psrld   xmm4, 7
1774        por     xmm9, xmm4
1775        pshufd  xmm0, xmm0, 0x93
1776        pshufd  xmm8, xmm8, 0x93
1777        pshufd  xmm3, xmm3, 0x4E
1778        pshufd  xmm11, xmm11, 0x4E
1779        pshufd  xmm2, xmm2, 0x39
1780        pshufd  xmm10, xmm10, 0x39
1781        paddd   xmm0, xmm6
1782        paddd   xmm8, xmm14
1783        paddd   xmm0, xmm1
1784        paddd   xmm8, xmm9
1785        pxor    xmm3, xmm0
1786        pxor    xmm11, xmm8
1787        pshuflw xmm3, xmm3, 0xB1
1788        pshufhw xmm3, xmm3, 0xB1
1789        pshuflw xmm11, xmm11, 0xB1
1790        pshufhw xmm11, xmm11, 0xB1
1791        paddd   xmm2, xmm3
1792        paddd   xmm10, xmm11
1793        pxor    xmm1, xmm2
1794        pxor    xmm9, xmm10
1795        movdqa  xmm4, xmm1
1796        pslld   xmm1, 20
1797        psrld   xmm4, 12
1798        por     xmm1, xmm4
1799        movdqa  xmm4, xmm9
1800        pslld   xmm9, 20
1801        psrld   xmm4, 12
1802        por     xmm9, xmm4
1803        paddd   xmm0, xmm7
1804        paddd   xmm8, xmm15
1805        paddd   xmm0, xmm1
1806        paddd   xmm8, xmm9
1807        pxor    xmm3, xmm0
1808        pxor    xmm11, xmm8
1809        movdqa  xmm13, xmm3
1810        psrld   xmm3, 8
1811        pslld   xmm13, 24
1812        pxor    xmm3, xmm13
1813        movdqa  xmm13, xmm11
1814        psrld   xmm11, 8
1815        pslld   xmm13, 24
1816        pxor    xmm11, xmm13
1817        paddd   xmm2, xmm3
1818        paddd   xmm10, xmm11
1819        pxor    xmm1, xmm2
1820        pxor    xmm9, xmm10
1821        movdqa  xmm4, xmm1
1822        pslld   xmm1, 25
1823        psrld   xmm4, 7
1824        por     xmm1, xmm4
1825        movdqa  xmm4, xmm9
1826        pslld   xmm9, 25
1827        psrld   xmm4, 7
1828        por     xmm9, xmm4
1829        pshufd  xmm0, xmm0, 0x39
1830        pshufd  xmm8, xmm8, 0x39
1831        pshufd  xmm3, xmm3, 0x4E
1832        pshufd  xmm11, xmm11, 0x4E
1833        pshufd  xmm2, xmm2, 0x93
1834        pshufd  xmm10, xmm10, 0x93
1835        dec     al
1836        je      9f
1837        movdqa  xmm12, xmmword ptr [rsp+0x20]
1838        movdqa  xmm5, xmmword ptr [rsp+0x40]
1839        pshufd  xmm13, xmm12, 0x0F
1840        shufps  xmm12, xmm5, 214
1841        pshufd  xmm4, xmm12, 0x39
1842        movdqa  xmm12, xmm6
1843        shufps  xmm12, xmm7, 250
1844        pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1845        pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1846        por     xmm13, xmm12
1847        movdqa  xmmword ptr [rsp+0x20], xmm13
1848        movdqa  xmm12, xmm7
1849        punpcklqdq xmm12, xmm5
1850        movdqa  xmm13, xmm6
1851        pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1852        pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1853        por     xmm12, xmm13
1854        pshufd  xmm12, xmm12, 0x78
1855        punpckhdq xmm5, xmm7
1856        punpckldq xmm6, xmm5
1857        pshufd  xmm7, xmm6, 0x1E
1858        movdqa  xmmword ptr [rsp+0x40], xmm12
1859        movdqa  xmm5, xmmword ptr [rsp+0x30]
1860        movdqa  xmm13, xmmword ptr [rsp+0x50]
1861        pshufd  xmm6, xmm5, 0x0F
1862        shufps  xmm5, xmm13, 214
1863        pshufd  xmm12, xmm5, 0x39
1864        movdqa  xmm5, xmm14
1865        shufps  xmm5, xmm15, 250
1866        pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1867        pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1868        por     xmm6, xmm5
1869        movdqa  xmm5, xmm15
1870        punpcklqdq xmm5, xmm13
1871        movdqa  xmmword ptr [rsp+0x30], xmm2
1872        movdqa  xmm2, xmm14
1873        pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1874        pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1875        por     xmm5, xmm2
1876        movdqa  xmm2, xmmword ptr [rsp+0x30]
1877        pshufd  xmm5, xmm5, 0x78
1878        punpckhdq xmm13, xmm15
1879        punpckldq xmm14, xmm13
1880        pshufd  xmm15, xmm14, 0x1E
1881        movdqa  xmm13, xmm6
1882        movdqa  xmm14, xmm5
1883        movdqa  xmm5, xmmword ptr [rsp+0x20]
1884        movdqa  xmm6, xmmword ptr [rsp+0x40]
1885        jmp     9b
18869:
1887        pxor    xmm0, xmm2
1888        pxor    xmm1, xmm3
1889        pxor    xmm8, xmm10
1890        pxor    xmm9, xmm11
1891        mov     eax, r13d
1892        cmp     rdx, r15
1893        jne     2b
1894        movups  xmmword ptr [rbx], xmm0
1895        movups  xmmword ptr [rbx+0x10], xmm1
1896        movups  xmmword ptr [rbx+0x20], xmm8
1897        movups  xmmword ptr [rbx+0x30], xmm9
1898        mov     eax, dword ptr [rsp+0x130]
1899        neg     eax
1900        mov    r10d, dword ptr [rsp+0x110+8*rax]
1901        mov    r11d, dword ptr [rsp+0x120+8*rax]
1902        mov dword ptr [rsp+0x110], r10d
1903        mov dword ptr [rsp+0x120], r11d
1904        add     rdi, 16
1905        add     rbx, 64
1906        sub     rsi, 2
19073:
1908        test    esi, 0x1
1909        je      4b
1910        movups  xmm0, xmmword ptr [rcx]
1911        movups  xmm1, xmmword ptr [rcx+0x10]
1912        movd    xmm13, dword ptr [rsp+0x110]
1913        movd    xmm14, dword ptr [rsp+0x120]
1914        punpckldq xmm13, xmm14
1915        mov     r8, qword ptr [rdi]
1916        movzx   eax, byte ptr [rbp+0x40]
1917        or      eax, r13d
1918        xor     edx, edx
19192:
1920        mov     r14d, eax
1921        or      eax, r12d
1922        add     rdx, 64
1923        cmp     rdx, r15
1924        cmovne  eax, r14d
1925        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1926        shl     rax, 32
1927        or      rax, 64
1928        movq    xmm12, rax
1929        movdqa  xmm3, xmm13
1930        punpcklqdq xmm3, xmm12
1931        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1932        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1933        movaps  xmm8, xmm4
1934        shufps  xmm4, xmm5, 136
1935        shufps  xmm8, xmm5, 221
1936        movaps  xmm5, xmm8
1937        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1938        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1939        movaps  xmm8, xmm6
1940        shufps  xmm6, xmm7, 136
1941        pshufd  xmm6, xmm6, 0x93
1942        shufps  xmm8, xmm7, 221
1943        pshufd  xmm7, xmm8, 0x93
1944        mov     al, 7
19459:
1946        paddd   xmm0, xmm4
1947        paddd   xmm0, xmm1
1948        pxor    xmm3, xmm0
1949        pshuflw xmm3, xmm3, 0xB1
1950        pshufhw xmm3, xmm3, 0xB1
1951        paddd   xmm2, xmm3
1952        pxor    xmm1, xmm2
1953        movdqa  xmm11, xmm1
1954        pslld   xmm1, 20
1955        psrld   xmm11, 12
1956        por     xmm1, xmm11
1957        paddd   xmm0, xmm5
1958        paddd   xmm0, xmm1
1959        pxor    xmm3, xmm0
1960        movdqa  xmm14, xmm3
1961        psrld   xmm3, 8
1962        pslld   xmm14, 24
1963        pxor    xmm3, xmm14
1964        paddd   xmm2, xmm3
1965        pxor    xmm1, xmm2
1966        movdqa  xmm11, xmm1
1967        pslld   xmm1, 25
1968        psrld   xmm11, 7
1969        por     xmm1, xmm11
1970        pshufd  xmm0, xmm0, 0x93
1971        pshufd  xmm3, xmm3, 0x4E
1972        pshufd  xmm2, xmm2, 0x39
1973        paddd   xmm0, xmm6
1974        paddd   xmm0, xmm1
1975        pxor    xmm3, xmm0
1976        pshuflw xmm3, xmm3, 0xB1
1977        pshufhw xmm3, xmm3, 0xB1
1978        paddd   xmm2, xmm3
1979        pxor    xmm1, xmm2
1980        movdqa  xmm11, xmm1
1981        pslld   xmm1, 20
1982        psrld   xmm11, 12
1983        por     xmm1, xmm11
1984        paddd   xmm0, xmm7
1985        paddd   xmm0, xmm1
1986        pxor    xmm3, xmm0
1987        movdqa  xmm14, xmm3
1988        psrld   xmm3, 8
1989        pslld   xmm14, 24
1990        pxor    xmm3, xmm14
1991        paddd   xmm2, xmm3
1992        pxor    xmm1, xmm2
1993        movdqa  xmm11, xmm1
1994        pslld   xmm1, 25
1995        psrld   xmm11, 7
1996        por     xmm1, xmm11
1997        pshufd  xmm0, xmm0, 0x39
1998        pshufd  xmm3, xmm3, 0x4E
1999        pshufd  xmm2, xmm2, 0x93
2000        dec     al
2001        jz      9f
2002        movdqa  xmm8, xmm4
2003        shufps  xmm8, xmm5, 214
2004        pshufd  xmm9, xmm4, 0x0F
2005        pshufd  xmm4, xmm8, 0x39
2006        movdqa  xmm8, xmm6
2007        shufps  xmm8, xmm7, 250
2008        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2009        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2010        por     xmm9, xmm8
2011        movdqa  xmm8, xmm7
2012        punpcklqdq xmm8, xmm5
2013        movdqa  xmm10, xmm6
2014        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2015        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2016        por     xmm8, xmm10
2017        pshufd  xmm8, xmm8, 0x78
2018        punpckhdq xmm5, xmm7
2019        punpckldq xmm6, xmm5
2020        pshufd  xmm7, xmm6, 0x1E
2021        movdqa  xmm5, xmm9
2022        movdqa  xmm6, xmm8
2023        jmp     9b
20249:
2025        pxor    xmm0, xmm2
2026        pxor    xmm1, xmm3
2027        mov     eax, r13d
2028        cmp     rdx, r15
2029        jne     2b
2030        movups  xmmword ptr [rbx], xmm0
2031        movups  xmmword ptr [rbx+0x10], xmm1
2032        jmp     4b
2033SET_SIZE(zfs_blake3_hash_many_sse2)
2034
/*
 * zfs_blake3_compress_in_place_sse2
 *
 * Compresses one 64-byte message block into a 32-byte chaining value using
 * seven BLAKE3 rounds and overwrites the chaining value with the result.
 *
 * Inputs, as consumed by the code below:
 *   rdi  pointer to the 32-byte chaining value (read, then overwritten)
 *   rsi  pointer to the 64-byte message block (unaligned loads are used)
 *   dl   block length (only the low byte is consumed)
 *   rcx  64-bit counter
 *   r8b  flags (only the low byte is consumed)
 *
 * Clobbers rax, rdx, xmm0-xmm11, xmm14 and the flags register.
 *
 * The block length and flags are zero-extended from their low bytes before
 * they are packed, the same way zfs_blake3_compress_xof_sse2 does it, so
 * stale upper register bits left by a caller cannot leak into state row 3.
 * NOTE(review): this assumes both arguments are 8-bit in the C prototype,
 * as the xof variant already assumes - confirm against the header.
 */
ENTRY_ALIGN(zfs_blake3_compress_in_place_sse2, 64)
        ENDBR
        /* State rows 0 and 1 = chaining value, row 2 = first four IV words. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * State row 3 = { counter low, counter high, block length, flags }.
         * rdx becomes (flags << 32) | block length and forms the high qword.
         */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /*
         * De-interleave the 16 message words:
         *   xmm4 = even words of the first 32 bytes
         *   xmm5 = odd words of the first 32 bytes
         *   xmm6 = even words of the last 32 bytes, rotated up one lane
         *   xmm7 = odd words of the last 32 bytes, rotated up one lane
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        /* al counts the remaining rounds (seven in total). */
        mov     al, 7
9:
        /* Column step, first half: add xmm4, rotate row 3 by 16, row 1 by 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        /* Swapping the 16-bit halves of every dword is a rotate by 16. */
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        /* SSE2 has no vector rotate: build rotate-right-12 from two shifts. */
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Column step, second half: add xmm5, rotate row 3 by 8, row 1 by 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Rotate rows 0, 3 and 2 by lanes so the diagonals sit in columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonal step, first half: add xmm6. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonal step, second half: add xmm7. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation so the rows are column-aligned again. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        /* Leave after the seventh round; the last round needs no permutation. */
        dec     al
        jz      9f
        /*
         * Rearrange the message words in xmm4-xmm7 for the next round.
         * Each pand/pand/por triple with a PBLENDW_* mask pair merges dwords
         * from two registers (the mask names suggest a stand-in for the
         * SSE4.1 pblendw instruction, which SSE2 lacks).
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* New chaining value = (row 0 ^ row 2, row 1 ^ row 3), stored in place. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        RET
SET_SIZE(zfs_blake3_compress_in_place_sse2)
2145
/*
 * zfs_blake3_compress_xof_sse2
 *
 * Runs seven BLAKE3 rounds over one 64-byte message block and writes the
 * full 64-byte result to a separate output buffer.
 *
 * Inputs, as consumed by the code below:
 *   rdi  pointer to the 32-byte chaining value (only read here)
 *   rsi  pointer to the 64-byte message block (unaligned loads are used)
 *   dl   block length (zero-extended from the low byte)
 *   rcx  64-bit counter
 *   r8b  flags (zero-extended from the low byte)
 *   r9   pointer to the 64-byte output buffer (unaligned stores are used)
 *
 * Clobbers rax, rdx, xmm0-xmm11, xmm14 and the flags register.
 */
ENTRY_ALIGN(zfs_blake3_compress_xof_sse2, 64)
        ENDBR

        /* State row 3 = { counter low, counter high, block length, flags }. */
        movzx   edx, dl                         /* block length */
        movzx   eax, r8b                        /* flags */
        shl     rax, 32
        or      rdx, rax                        /* disjoint bits: same as add */
        movq    xmm3, rcx
        movq    xmm4, rdx                       /* xmm4 is only a temporary here */
        punpcklqdq xmm3, xmm4

        /* State rows 0 and 1 = chaining value, row 2 = first four IV words. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        /*
         * Load the message and split it into even and odd words:
         *   xmm4 / xmm5 = even / odd words of the first 32 bytes
         *   xmm6 / xmm7 = even / odd words of the last 32 bytes, each
         *                 rotated up by one lane
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 0x88                /* lanes 0 and 2 of each source */
        shufps  xmm8, xmm5, 0xDD                /* lanes 1 and 3 of each source */
        movaps  xmm5, xmm8
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 0x88
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 0xDD
        pshufd  xmm7, xmm8, 0x93

        mov     al, 7                           /* rounds remaining */
1:
        /* Columns, first message vector: rotations by 16 and 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                /* swap 16-bit halves: rotate 16 */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        psrld   xmm11, 12
        pslld   xmm1, 20
        por     xmm1, xmm11                     /* rotate right by 12 */

        /* Columns, second message vector: rotations by 8 and 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        pslld   xmm14, 24
        psrld   xmm3, 8
        pxor    xmm3, xmm14                     /* rotate right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        psrld   xmm11, 7
        pslld   xmm1, 25
        por     xmm1, xmm11                     /* rotate right by 7 */

        /* Shift rows 0, 2 and 3 by lanes so the diagonals become columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm2, xmm2, 0x39
        pshufd  xmm3, xmm3, 0x4E

        /* Diagonals, third message vector. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        psrld   xmm11, 12
        pslld   xmm1, 20
        por     xmm1, xmm11

        /* Diagonals, fourth message vector. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        pslld   xmm14, 24
        psrld   xmm3, 8
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        psrld   xmm11, 7
        pslld   xmm1, 25
        por     xmm1, xmm11

        /* Shift the rows back into column order. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm2, xmm2, 0x93
        pshufd  xmm3, xmm3, 0x4E

        dec     al
        jz      2f                              /* seventh round done */

        /*
         * Rearrange the message words in xmm4-xmm7 for the next round.
         * xmm8, xmm9 and xmm10 are scratch; the PBLENDW_* mask pairs merge
         * dwords from two registers with pand/pand/por.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 0xD6
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39                /* next xmm4 */
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 0xFA
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8                      /* next xmm5, parked in xmm9 */
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78                /* next xmm6, parked in xmm8 */
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E                /* next xmm7 */
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     1b
2:
        /*
         * Output bytes 0-31  = (row 0 ^ row 2, row 1 ^ row 3)
         * Output bytes 32-63 = (row 2 ^ cv[0..15], row 3 ^ cv[16..31])
         * The chaining value is re-read before any store to the output.
         */
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        RET
SET_SIZE(zfs_blake3_compress_xof_sse2)
2264
/*
 * Constant tables used by the SSE2 routines in this file.  The tables start
 * on a 64-byte boundary and each one is exactly 16 bytes long, so every
 * label below is 16-byte aligned, which the aligned loads (movaps/movdqa)
 * and the SSE memory operands (pand/pxor) that reference them require.
 */
SECTION_STATIC
.p2align 6
/* First four BLAKE3 IV words; loaded as state row 2. */
BLAKE3_IV:
	.long	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
/* Per-lane offsets 0..3. */
ADD0:
	.long	0, 1, 2, 3
/* The value 4 in every lane. */
ADD1:
	.long	4, 4, 4, 4
/* Each of the four IV words above, repeated in all four lanes. */
BLAKE3_IV_0:
	.long	0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
	.long	0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
	.long	0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
	.long	0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane. */
BLAKE3_BLOCK_LEN:
	.long	64, 64, 64, 64
/* Only the top bit of every 32-bit lane set. */
CMP_MSB_MASK:
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * Dword-select masks applied with pand/por to merge two registers.  The hex
 * number in each name is a per-16-bit-word bit mask: 0x33 keeps dwords 0 and
 * 2, 0xCC keeps dwords 1 and 3, 0x3F keeps dwords 0-2, 0xC0 keeps dword 3.
 */
PBLENDW_0x33_MASK:
	.long	0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:
	.long	0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
PBLENDW_0x3F_MASK:
	.long	0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:
	.long	0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
2294
2295#endif	/* HAVE_SSE2 */
2296
2297#ifdef __ELF__
2298.section .note.GNU-stack,"",%progbits
2299#endif
2300