1#if defined(__x86_64__)
2
/*
 * When building an ELF object for Linux or FreeBSD, emit an empty
 * .note.GNU-stack section (no flags, type %progbits).
 * NOTE(review): by GNU toolchain convention this marks the object as not
 * requiring an executable stack -- confirm against the linker documentation.
 */
3#if defined(__ELF__) && (defined(__linux__) || defined(__FreeBSD__))
4.section .note.GNU-stack,"",%progbits
5#endif
6
/*
 * Include <cet.h> only when all of the following hold: the target is ELF,
 * the compiler defines __CET__, the preprocessor supports __has_include,
 * and the header is actually present.
 * NOTE(review): <cet.h> is presumably what supplies _CET_ENDBR (the
 * indirect-branch landing pad used at function entry) -- confirm.
 */
7#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
8#if __has_include(<cet.h>)
9#include <cet.h>
10#endif
11#endif
12
/*
 * Fallback: if nothing above defined _CET_ENDBR, define it as empty so the
 * _CET_ENDBR line placed at each function entry expands to nothing.
 */
13#if !defined(_CET_ENDBR)
14#define _CET_ENDBR
15#endif
16
/*
 * HIDDEN expands to the directive used below to restrict the visibility of
 * each entry point: .private_extern on Apple targets, .hidden elsewhere.
 */
17#ifdef __APPLE__
18#define HIDDEN .private_extern
19#else
20#define HIDDEN .hidden
21#endif
22
/* All instructions in this file use Intel operand order without % prefixes. */
23.intel_syntax noprefix
/*
 * Each entry point is declared under two names, with and without a leading
 * underscore, and both spellings are marked HIDDEN.
 * NOTE(review): the underscore variants presumably match Mach-O C symbol
 * naming -- confirm against the C callers.
 */
24HIDDEN blake3_hash_many_sse41
25HIDDEN _blake3_hash_many_sse41
26HIDDEN blake3_compress_in_place_sse41
27HIDDEN _blake3_compress_in_place_sse41
28HIDDEN blake3_compress_xof_sse41
29HIDDEN _blake3_compress_xof_sse41
/* Declare the same six names as global symbols. */
30.global blake3_hash_many_sse41
31.global _blake3_hash_many_sse41
32.global blake3_compress_in_place_sse41
33.global _blake3_compress_in_place_sse41
34.global blake3_compress_xof_sse41
35.global _blake3_compress_xof_sse41
/*
 * Switch to the code section: plain .text on Apple targets, and
 * .section .text on every other target.
 */
36#ifdef __APPLE__
37.text
38#else
39.section .text
40#endif
41        .p2align  6
42_blake3_hash_many_sse41:
43blake3_hash_many_sse41:
44        _CET_ENDBR
45        push    r15
46        push    r14
47        push    r13
48        push    r12
49        push    rbx
50        push    rbp
51        mov     rbp, rsp
52        sub     rsp, 360
53        and     rsp, 0xFFFFFFFFFFFFFFC0
54        neg     r9d
55        movd    xmm0, r9d
56        pshufd  xmm0, xmm0, 0x00
57        movdqa  xmmword ptr [rsp+0x130], xmm0
58        movdqa  xmm1, xmm0
59        pand    xmm1, xmmword ptr [ADD0+rip]
60        pand    xmm0, xmmword ptr [ADD1+rip]
61        movdqa  xmmword ptr [rsp+0x150], xmm0
62        movd    xmm0, r8d
63        pshufd  xmm0, xmm0, 0x00
64        paddd   xmm0, xmm1
65        movdqa  xmmword ptr [rsp+0x110], xmm0
66        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
67        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
68        pcmpgtd xmm1, xmm0
69        shr     r8, 32
70        movd    xmm2, r8d
71        pshufd  xmm2, xmm2, 0x00
72        psubd   xmm2, xmm1
73        movdqa  xmmword ptr [rsp+0x120], xmm2
74        mov     rbx, qword ptr [rbp+0x50]
75        mov     r15, rdx
76        shl     r15, 6
77        movzx   r13d, byte ptr [rbp+0x38]
78        movzx   r12d, byte ptr [rbp+0x48]
79        cmp     rsi, 4
80        jc      3f
812:
82        movdqu  xmm3, xmmword ptr [rcx]
83        pshufd  xmm0, xmm3, 0x00
84        pshufd  xmm1, xmm3, 0x55
85        pshufd  xmm2, xmm3, 0xAA
86        pshufd  xmm3, xmm3, 0xFF
87        movdqu  xmm7, xmmword ptr [rcx+0x10]
88        pshufd  xmm4, xmm7, 0x00
89        pshufd  xmm5, xmm7, 0x55
90        pshufd  xmm6, xmm7, 0xAA
91        pshufd  xmm7, xmm7, 0xFF
92        mov     r8, qword ptr [rdi]
93        mov     r9, qword ptr [rdi+0x8]
94        mov     r10, qword ptr [rdi+0x10]
95        mov     r11, qword ptr [rdi+0x18]
96        movzx   eax, byte ptr [rbp+0x40]
97        or      eax, r13d
98        xor     edx, edx
999:
100        mov     r14d, eax
101        or      eax, r12d
102        add     rdx, 64
103        cmp     rdx, r15
104        cmovne  eax, r14d
105        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
106        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
107        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
108        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
109        movdqa  xmm12, xmm8
110        punpckldq xmm8, xmm9
111        punpckhdq xmm12, xmm9
112        movdqa  xmm14, xmm10
113        punpckldq xmm10, xmm11
114        punpckhdq xmm14, xmm11
115        movdqa  xmm9, xmm8
116        punpcklqdq xmm8, xmm10
117        punpckhqdq xmm9, xmm10
118        movdqa  xmm13, xmm12
119        punpcklqdq xmm12, xmm14
120        punpckhqdq xmm13, xmm14
121        movdqa  xmmword ptr [rsp], xmm8
122        movdqa  xmmword ptr [rsp+0x10], xmm9
123        movdqa  xmmword ptr [rsp+0x20], xmm12
124        movdqa  xmmword ptr [rsp+0x30], xmm13
125        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
126        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
127        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
128        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
129        movdqa  xmm12, xmm8
130        punpckldq xmm8, xmm9
131        punpckhdq xmm12, xmm9
132        movdqa  xmm14, xmm10
133        punpckldq xmm10, xmm11
134        punpckhdq xmm14, xmm11
135        movdqa  xmm9, xmm8
136        punpcklqdq xmm8, xmm10
137        punpckhqdq xmm9, xmm10
138        movdqa  xmm13, xmm12
139        punpcklqdq xmm12, xmm14
140        punpckhqdq xmm13, xmm14
141        movdqa  xmmword ptr [rsp+0x40], xmm8
142        movdqa  xmmword ptr [rsp+0x50], xmm9
143        movdqa  xmmword ptr [rsp+0x60], xmm12
144        movdqa  xmmword ptr [rsp+0x70], xmm13
145        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
146        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
147        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
148        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
149        movdqa  xmm12, xmm8
150        punpckldq xmm8, xmm9
151        punpckhdq xmm12, xmm9
152        movdqa  xmm14, xmm10
153        punpckldq xmm10, xmm11
154        punpckhdq xmm14, xmm11
155        movdqa  xmm9, xmm8
156        punpcklqdq xmm8, xmm10
157        punpckhqdq xmm9, xmm10
158        movdqa  xmm13, xmm12
159        punpcklqdq xmm12, xmm14
160        punpckhqdq xmm13, xmm14
161        movdqa  xmmword ptr [rsp+0x80], xmm8
162        movdqa  xmmword ptr [rsp+0x90], xmm9
163        movdqa  xmmword ptr [rsp+0xA0], xmm12
164        movdqa  xmmword ptr [rsp+0xB0], xmm13
165        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
166        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
167        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
168        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
169        movdqa  xmm12, xmm8
170        punpckldq xmm8, xmm9
171        punpckhdq xmm12, xmm9
172        movdqa  xmm14, xmm10
173        punpckldq xmm10, xmm11
174        punpckhdq xmm14, xmm11
175        movdqa  xmm9, xmm8
176        punpcklqdq xmm8, xmm10
177        punpckhqdq xmm9, xmm10
178        movdqa  xmm13, xmm12
179        punpcklqdq xmm12, xmm14
180        punpckhqdq xmm13, xmm14
181        movdqa  xmmword ptr [rsp+0xC0], xmm8
182        movdqa  xmmword ptr [rsp+0xD0], xmm9
183        movdqa  xmmword ptr [rsp+0xE0], xmm12
184        movdqa  xmmword ptr [rsp+0xF0], xmm13
185        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
186        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
187        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
188        movdqa  xmm12, xmmword ptr [rsp+0x110]
189        movdqa  xmm13, xmmword ptr [rsp+0x120]
190        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
191        movd    xmm15, eax
192        pshufd  xmm15, xmm15, 0x00
193        prefetcht0 [r8+rdx+0x80]
194        prefetcht0 [r9+rdx+0x80]
195        prefetcht0 [r10+rdx+0x80]
196        prefetcht0 [r11+rdx+0x80]
197        paddd   xmm0, xmmword ptr [rsp]
198        paddd   xmm1, xmmword ptr [rsp+0x20]
199        paddd   xmm2, xmmword ptr [rsp+0x40]
200        paddd   xmm3, xmmword ptr [rsp+0x60]
201        paddd   xmm0, xmm4
202        paddd   xmm1, xmm5
203        paddd   xmm2, xmm6
204        paddd   xmm3, xmm7
205        pxor    xmm12, xmm0
206        pxor    xmm13, xmm1
207        pxor    xmm14, xmm2
208        pxor    xmm15, xmm3
209        movdqa  xmm8, xmmword ptr [ROT16+rip]
210        pshufb  xmm12, xmm8
211        pshufb  xmm13, xmm8
212        pshufb  xmm14, xmm8
213        pshufb  xmm15, xmm8
214        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
215        paddd   xmm8, xmm12
216        paddd   xmm9, xmm13
217        paddd   xmm10, xmm14
218        paddd   xmm11, xmm15
219        pxor    xmm4, xmm8
220        pxor    xmm5, xmm9
221        pxor    xmm6, xmm10
222        pxor    xmm7, xmm11
223        movdqa  xmmword ptr [rsp+0x100], xmm8
224        movdqa  xmm8, xmm4
225        psrld   xmm8, 12
226        pslld   xmm4, 20
227        por     xmm4, xmm8
228        movdqa  xmm8, xmm5
229        psrld   xmm8, 12
230        pslld   xmm5, 20
231        por     xmm5, xmm8
232        movdqa  xmm8, xmm6
233        psrld   xmm8, 12
234        pslld   xmm6, 20
235        por     xmm6, xmm8
236        movdqa  xmm8, xmm7
237        psrld   xmm8, 12
238        pslld   xmm7, 20
239        por     xmm7, xmm8
240        paddd   xmm0, xmmword ptr [rsp+0x10]
241        paddd   xmm1, xmmword ptr [rsp+0x30]
242        paddd   xmm2, xmmword ptr [rsp+0x50]
243        paddd   xmm3, xmmword ptr [rsp+0x70]
244        paddd   xmm0, xmm4
245        paddd   xmm1, xmm5
246        paddd   xmm2, xmm6
247        paddd   xmm3, xmm7
248        pxor    xmm12, xmm0
249        pxor    xmm13, xmm1
250        pxor    xmm14, xmm2
251        pxor    xmm15, xmm3
252        movdqa  xmm8, xmmword ptr [ROT8+rip]
253        pshufb  xmm12, xmm8
254        pshufb  xmm13, xmm8
255        pshufb  xmm14, xmm8
256        pshufb  xmm15, xmm8
257        movdqa  xmm8, xmmword ptr [rsp+0x100]
258        paddd   xmm8, xmm12
259        paddd   xmm9, xmm13
260        paddd   xmm10, xmm14
261        paddd   xmm11, xmm15
262        pxor    xmm4, xmm8
263        pxor    xmm5, xmm9
264        pxor    xmm6, xmm10
265        pxor    xmm7, xmm11
266        movdqa  xmmword ptr [rsp+0x100], xmm8
267        movdqa  xmm8, xmm4
268        psrld   xmm8, 7
269        pslld   xmm4, 25
270        por     xmm4, xmm8
271        movdqa  xmm8, xmm5
272        psrld   xmm8, 7
273        pslld   xmm5, 25
274        por     xmm5, xmm8
275        movdqa  xmm8, xmm6
276        psrld   xmm8, 7
277        pslld   xmm6, 25
278        por     xmm6, xmm8
279        movdqa  xmm8, xmm7
280        psrld   xmm8, 7
281        pslld   xmm7, 25
282        por     xmm7, xmm8
283        paddd   xmm0, xmmword ptr [rsp+0x80]
284        paddd   xmm1, xmmword ptr [rsp+0xA0]
285        paddd   xmm2, xmmword ptr [rsp+0xC0]
286        paddd   xmm3, xmmword ptr [rsp+0xE0]
287        paddd   xmm0, xmm5
288        paddd   xmm1, xmm6
289        paddd   xmm2, xmm7
290        paddd   xmm3, xmm4
291        pxor    xmm15, xmm0
292        pxor    xmm12, xmm1
293        pxor    xmm13, xmm2
294        pxor    xmm14, xmm3
295        movdqa  xmm8, xmmword ptr [ROT16+rip]
296        pshufb  xmm15, xmm8
297        pshufb  xmm12, xmm8
298        pshufb  xmm13, xmm8
299        pshufb  xmm14, xmm8
300        paddd   xmm10, xmm15
301        paddd   xmm11, xmm12
302        movdqa  xmm8, xmmword ptr [rsp+0x100]
303        paddd   xmm8, xmm13
304        paddd   xmm9, xmm14
305        pxor    xmm5, xmm10
306        pxor    xmm6, xmm11
307        pxor    xmm7, xmm8
308        pxor    xmm4, xmm9
309        movdqa  xmmword ptr [rsp+0x100], xmm8
310        movdqa  xmm8, xmm5
311        psrld   xmm8, 12
312        pslld   xmm5, 20
313        por     xmm5, xmm8
314        movdqa  xmm8, xmm6
315        psrld   xmm8, 12
316        pslld   xmm6, 20
317        por     xmm6, xmm8
318        movdqa  xmm8, xmm7
319        psrld   xmm8, 12
320        pslld   xmm7, 20
321        por     xmm7, xmm8
322        movdqa  xmm8, xmm4
323        psrld   xmm8, 12
324        pslld   xmm4, 20
325        por     xmm4, xmm8
326        paddd   xmm0, xmmword ptr [rsp+0x90]
327        paddd   xmm1, xmmword ptr [rsp+0xB0]
328        paddd   xmm2, xmmword ptr [rsp+0xD0]
329        paddd   xmm3, xmmword ptr [rsp+0xF0]
330        paddd   xmm0, xmm5
331        paddd   xmm1, xmm6
332        paddd   xmm2, xmm7
333        paddd   xmm3, xmm4
334        pxor    xmm15, xmm0
335        pxor    xmm12, xmm1
336        pxor    xmm13, xmm2
337        pxor    xmm14, xmm3
338        movdqa  xmm8, xmmword ptr [ROT8+rip]
339        pshufb  xmm15, xmm8
340        pshufb  xmm12, xmm8
341        pshufb  xmm13, xmm8
342        pshufb  xmm14, xmm8
343        paddd   xmm10, xmm15
344        paddd   xmm11, xmm12
345        movdqa  xmm8, xmmword ptr [rsp+0x100]
346        paddd   xmm8, xmm13
347        paddd   xmm9, xmm14
348        pxor    xmm5, xmm10
349        pxor    xmm6, xmm11
350        pxor    xmm7, xmm8
351        pxor    xmm4, xmm9
352        movdqa  xmmword ptr [rsp+0x100], xmm8
353        movdqa  xmm8, xmm5
354        psrld   xmm8, 7
355        pslld   xmm5, 25
356        por     xmm5, xmm8
357        movdqa  xmm8, xmm6
358        psrld   xmm8, 7
359        pslld   xmm6, 25
360        por     xmm6, xmm8
361        movdqa  xmm8, xmm7
362        psrld   xmm8, 7
363        pslld   xmm7, 25
364        por     xmm7, xmm8
365        movdqa  xmm8, xmm4
366        psrld   xmm8, 7
367        pslld   xmm4, 25
368        por     xmm4, xmm8
369        paddd   xmm0, xmmword ptr [rsp+0x20]
370        paddd   xmm1, xmmword ptr [rsp+0x30]
371        paddd   xmm2, xmmword ptr [rsp+0x70]
372        paddd   xmm3, xmmword ptr [rsp+0x40]
373        paddd   xmm0, xmm4
374        paddd   xmm1, xmm5
375        paddd   xmm2, xmm6
376        paddd   xmm3, xmm7
377        pxor    xmm12, xmm0
378        pxor    xmm13, xmm1
379        pxor    xmm14, xmm2
380        pxor    xmm15, xmm3
381        movdqa  xmm8, xmmword ptr [ROT16+rip]
382        pshufb  xmm12, xmm8
383        pshufb  xmm13, xmm8
384        pshufb  xmm14, xmm8
385        pshufb  xmm15, xmm8
386        movdqa  xmm8, xmmword ptr [rsp+0x100]
387        paddd   xmm8, xmm12
388        paddd   xmm9, xmm13
389        paddd   xmm10, xmm14
390        paddd   xmm11, xmm15
391        pxor    xmm4, xmm8
392        pxor    xmm5, xmm9
393        pxor    xmm6, xmm10
394        pxor    xmm7, xmm11
395        movdqa  xmmword ptr [rsp+0x100], xmm8
396        movdqa  xmm8, xmm4
397        psrld   xmm8, 12
398        pslld   xmm4, 20
399        por     xmm4, xmm8
400        movdqa  xmm8, xmm5
401        psrld   xmm8, 12
402        pslld   xmm5, 20
403        por     xmm5, xmm8
404        movdqa  xmm8, xmm6
405        psrld   xmm8, 12
406        pslld   xmm6, 20
407        por     xmm6, xmm8
408        movdqa  xmm8, xmm7
409        psrld   xmm8, 12
410        pslld   xmm7, 20
411        por     xmm7, xmm8
412        paddd   xmm0, xmmword ptr [rsp+0x60]
413        paddd   xmm1, xmmword ptr [rsp+0xA0]
414        paddd   xmm2, xmmword ptr [rsp]
415        paddd   xmm3, xmmword ptr [rsp+0xD0]
416        paddd   xmm0, xmm4
417        paddd   xmm1, xmm5
418        paddd   xmm2, xmm6
419        paddd   xmm3, xmm7
420        pxor    xmm12, xmm0
421        pxor    xmm13, xmm1
422        pxor    xmm14, xmm2
423        pxor    xmm15, xmm3
424        movdqa  xmm8, xmmword ptr [ROT8+rip]
425        pshufb  xmm12, xmm8
426        pshufb  xmm13, xmm8
427        pshufb  xmm14, xmm8
428        pshufb  xmm15, xmm8
429        movdqa  xmm8, xmmword ptr [rsp+0x100]
430        paddd   xmm8, xmm12
431        paddd   xmm9, xmm13
432        paddd   xmm10, xmm14
433        paddd   xmm11, xmm15
434        pxor    xmm4, xmm8
435        pxor    xmm5, xmm9
436        pxor    xmm6, xmm10
437        pxor    xmm7, xmm11
438        movdqa  xmmword ptr [rsp+0x100], xmm8
439        movdqa  xmm8, xmm4
440        psrld   xmm8, 7
441        pslld   xmm4, 25
442        por     xmm4, xmm8
443        movdqa  xmm8, xmm5
444        psrld   xmm8, 7
445        pslld   xmm5, 25
446        por     xmm5, xmm8
447        movdqa  xmm8, xmm6
448        psrld   xmm8, 7
449        pslld   xmm6, 25
450        por     xmm6, xmm8
451        movdqa  xmm8, xmm7
452        psrld   xmm8, 7
453        pslld   xmm7, 25
454        por     xmm7, xmm8
455        paddd   xmm0, xmmword ptr [rsp+0x10]
456        paddd   xmm1, xmmword ptr [rsp+0xC0]
457        paddd   xmm2, xmmword ptr [rsp+0x90]
458        paddd   xmm3, xmmword ptr [rsp+0xF0]
459        paddd   xmm0, xmm5
460        paddd   xmm1, xmm6
461        paddd   xmm2, xmm7
462        paddd   xmm3, xmm4
463        pxor    xmm15, xmm0
464        pxor    xmm12, xmm1
465        pxor    xmm13, xmm2
466        pxor    xmm14, xmm3
467        movdqa  xmm8, xmmword ptr [ROT16+rip]
468        pshufb  xmm15, xmm8
469        pshufb  xmm12, xmm8
470        pshufb  xmm13, xmm8
471        pshufb  xmm14, xmm8
472        paddd   xmm10, xmm15
473        paddd   xmm11, xmm12
474        movdqa  xmm8, xmmword ptr [rsp+0x100]
475        paddd   xmm8, xmm13
476        paddd   xmm9, xmm14
477        pxor    xmm5, xmm10
478        pxor    xmm6, xmm11
479        pxor    xmm7, xmm8
480        pxor    xmm4, xmm9
481        movdqa  xmmword ptr [rsp+0x100], xmm8
482        movdqa  xmm8, xmm5
483        psrld   xmm8, 12
484        pslld   xmm5, 20
485        por     xmm5, xmm8
486        movdqa  xmm8, xmm6
487        psrld   xmm8, 12
488        pslld   xmm6, 20
489        por     xmm6, xmm8
490        movdqa  xmm8, xmm7
491        psrld   xmm8, 12
492        pslld   xmm7, 20
493        por     xmm7, xmm8
494        movdqa  xmm8, xmm4
495        psrld   xmm8, 12
496        pslld   xmm4, 20
497        por     xmm4, xmm8
498        paddd   xmm0, xmmword ptr [rsp+0xB0]
499        paddd   xmm1, xmmword ptr [rsp+0x50]
500        paddd   xmm2, xmmword ptr [rsp+0xE0]
501        paddd   xmm3, xmmword ptr [rsp+0x80]
502        paddd   xmm0, xmm5
503        paddd   xmm1, xmm6
504        paddd   xmm2, xmm7
505        paddd   xmm3, xmm4
506        pxor    xmm15, xmm0
507        pxor    xmm12, xmm1
508        pxor    xmm13, xmm2
509        pxor    xmm14, xmm3
510        movdqa  xmm8, xmmword ptr [ROT8+rip]
511        pshufb  xmm15, xmm8
512        pshufb  xmm12, xmm8
513        pshufb  xmm13, xmm8
514        pshufb  xmm14, xmm8
515        paddd   xmm10, xmm15
516        paddd   xmm11, xmm12
517        movdqa  xmm8, xmmword ptr [rsp+0x100]
518        paddd   xmm8, xmm13
519        paddd   xmm9, xmm14
520        pxor    xmm5, xmm10
521        pxor    xmm6, xmm11
522        pxor    xmm7, xmm8
523        pxor    xmm4, xmm9
524        movdqa  xmmword ptr [rsp+0x100], xmm8
525        movdqa  xmm8, xmm5
526        psrld   xmm8, 7
527        pslld   xmm5, 25
528        por     xmm5, xmm8
529        movdqa  xmm8, xmm6
530        psrld   xmm8, 7
531        pslld   xmm6, 25
532        por     xmm6, xmm8
533        movdqa  xmm8, xmm7
534        psrld   xmm8, 7
535        pslld   xmm7, 25
536        por     xmm7, xmm8
537        movdqa  xmm8, xmm4
538        psrld   xmm8, 7
539        pslld   xmm4, 25
540        por     xmm4, xmm8
541        paddd   xmm0, xmmword ptr [rsp+0x30]
542        paddd   xmm1, xmmword ptr [rsp+0xA0]
543        paddd   xmm2, xmmword ptr [rsp+0xD0]
544        paddd   xmm3, xmmword ptr [rsp+0x70]
545        paddd   xmm0, xmm4
546        paddd   xmm1, xmm5
547        paddd   xmm2, xmm6
548        paddd   xmm3, xmm7
549        pxor    xmm12, xmm0
550        pxor    xmm13, xmm1
551        pxor    xmm14, xmm2
552        pxor    xmm15, xmm3
553        movdqa  xmm8, xmmword ptr [ROT16+rip]
554        pshufb  xmm12, xmm8
555        pshufb  xmm13, xmm8
556        pshufb  xmm14, xmm8
557        pshufb  xmm15, xmm8
558        movdqa  xmm8, xmmword ptr [rsp+0x100]
559        paddd   xmm8, xmm12
560        paddd   xmm9, xmm13
561        paddd   xmm10, xmm14
562        paddd   xmm11, xmm15
563        pxor    xmm4, xmm8
564        pxor    xmm5, xmm9
565        pxor    xmm6, xmm10
566        pxor    xmm7, xmm11
567        movdqa  xmmword ptr [rsp+0x100], xmm8
568        movdqa  xmm8, xmm4
569        psrld   xmm8, 12
570        pslld   xmm4, 20
571        por     xmm4, xmm8
572        movdqa  xmm8, xmm5
573        psrld   xmm8, 12
574        pslld   xmm5, 20
575        por     xmm5, xmm8
576        movdqa  xmm8, xmm6
577        psrld   xmm8, 12
578        pslld   xmm6, 20
579        por     xmm6, xmm8
580        movdqa  xmm8, xmm7
581        psrld   xmm8, 12
582        pslld   xmm7, 20
583        por     xmm7, xmm8
584        paddd   xmm0, xmmword ptr [rsp+0x40]
585        paddd   xmm1, xmmword ptr [rsp+0xC0]
586        paddd   xmm2, xmmword ptr [rsp+0x20]
587        paddd   xmm3, xmmword ptr [rsp+0xE0]
588        paddd   xmm0, xmm4
589        paddd   xmm1, xmm5
590        paddd   xmm2, xmm6
591        paddd   xmm3, xmm7
592        pxor    xmm12, xmm0
593        pxor    xmm13, xmm1
594        pxor    xmm14, xmm2
595        pxor    xmm15, xmm3
596        movdqa  xmm8, xmmword ptr [ROT8+rip]
597        pshufb  xmm12, xmm8
598        pshufb  xmm13, xmm8
599        pshufb  xmm14, xmm8
600        pshufb  xmm15, xmm8
601        movdqa  xmm8, xmmword ptr [rsp+0x100]
602        paddd   xmm8, xmm12
603        paddd   xmm9, xmm13
604        paddd   xmm10, xmm14
605        paddd   xmm11, xmm15
606        pxor    xmm4, xmm8
607        pxor    xmm5, xmm9
608        pxor    xmm6, xmm10
609        pxor    xmm7, xmm11
610        movdqa  xmmword ptr [rsp+0x100], xmm8
611        movdqa  xmm8, xmm4
612        psrld   xmm8, 7
613        pslld   xmm4, 25
614        por     xmm4, xmm8
615        movdqa  xmm8, xmm5
616        psrld   xmm8, 7
617        pslld   xmm5, 25
618        por     xmm5, xmm8
619        movdqa  xmm8, xmm6
620        psrld   xmm8, 7
621        pslld   xmm6, 25
622        por     xmm6, xmm8
623        movdqa  xmm8, xmm7
624        psrld   xmm8, 7
625        pslld   xmm7, 25
626        por     xmm7, xmm8
627        paddd   xmm0, xmmword ptr [rsp+0x60]
628        paddd   xmm1, xmmword ptr [rsp+0x90]
629        paddd   xmm2, xmmword ptr [rsp+0xB0]
630        paddd   xmm3, xmmword ptr [rsp+0x80]
631        paddd   xmm0, xmm5
632        paddd   xmm1, xmm6
633        paddd   xmm2, xmm7
634        paddd   xmm3, xmm4
635        pxor    xmm15, xmm0
636        pxor    xmm12, xmm1
637        pxor    xmm13, xmm2
638        pxor    xmm14, xmm3
639        movdqa  xmm8, xmmword ptr [ROT16+rip]
640        pshufb  xmm15, xmm8
641        pshufb  xmm12, xmm8
642        pshufb  xmm13, xmm8
643        pshufb  xmm14, xmm8
644        paddd   xmm10, xmm15
645        paddd   xmm11, xmm12
646        movdqa  xmm8, xmmword ptr [rsp+0x100]
647        paddd   xmm8, xmm13
648        paddd   xmm9, xmm14
649        pxor    xmm5, xmm10
650        pxor    xmm6, xmm11
651        pxor    xmm7, xmm8
652        pxor    xmm4, xmm9
653        movdqa  xmmword ptr [rsp+0x100], xmm8
654        movdqa  xmm8, xmm5
655        psrld   xmm8, 12
656        pslld   xmm5, 20
657        por     xmm5, xmm8
658        movdqa  xmm8, xmm6
659        psrld   xmm8, 12
660        pslld   xmm6, 20
661        por     xmm6, xmm8
662        movdqa  xmm8, xmm7
663        psrld   xmm8, 12
664        pslld   xmm7, 20
665        por     xmm7, xmm8
666        movdqa  xmm8, xmm4
667        psrld   xmm8, 12
668        pslld   xmm4, 20
669        por     xmm4, xmm8
670        paddd   xmm0, xmmword ptr [rsp+0x50]
671        paddd   xmm1, xmmword ptr [rsp]
672        paddd   xmm2, xmmword ptr [rsp+0xF0]
673        paddd   xmm3, xmmword ptr [rsp+0x10]
674        paddd   xmm0, xmm5
675        paddd   xmm1, xmm6
676        paddd   xmm2, xmm7
677        paddd   xmm3, xmm4
678        pxor    xmm15, xmm0
679        pxor    xmm12, xmm1
680        pxor    xmm13, xmm2
681        pxor    xmm14, xmm3
682        movdqa  xmm8, xmmword ptr [ROT8+rip]
683        pshufb  xmm15, xmm8
684        pshufb  xmm12, xmm8
685        pshufb  xmm13, xmm8
686        pshufb  xmm14, xmm8
687        paddd   xmm10, xmm15
688        paddd   xmm11, xmm12
689        movdqa  xmm8, xmmword ptr [rsp+0x100]
690        paddd   xmm8, xmm13
691        paddd   xmm9, xmm14
692        pxor    xmm5, xmm10
693        pxor    xmm6, xmm11
694        pxor    xmm7, xmm8
695        pxor    xmm4, xmm9
696        movdqa  xmmword ptr [rsp+0x100], xmm8
697        movdqa  xmm8, xmm5
698        psrld   xmm8, 7
699        pslld   xmm5, 25
700        por     xmm5, xmm8
701        movdqa  xmm8, xmm6
702        psrld   xmm8, 7
703        pslld   xmm6, 25
704        por     xmm6, xmm8
705        movdqa  xmm8, xmm7
706        psrld   xmm8, 7
707        pslld   xmm7, 25
708        por     xmm7, xmm8
709        movdqa  xmm8, xmm4
710        psrld   xmm8, 7
711        pslld   xmm4, 25
712        por     xmm4, xmm8
713        paddd   xmm0, xmmword ptr [rsp+0xA0]
714        paddd   xmm1, xmmword ptr [rsp+0xC0]
715        paddd   xmm2, xmmword ptr [rsp+0xE0]
716        paddd   xmm3, xmmword ptr [rsp+0xD0]
717        paddd   xmm0, xmm4
718        paddd   xmm1, xmm5
719        paddd   xmm2, xmm6
720        paddd   xmm3, xmm7
721        pxor    xmm12, xmm0
722        pxor    xmm13, xmm1
723        pxor    xmm14, xmm2
724        pxor    xmm15, xmm3
725        movdqa  xmm8, xmmword ptr [ROT16+rip]
726        pshufb  xmm12, xmm8
727        pshufb  xmm13, xmm8
728        pshufb  xmm14, xmm8
729        pshufb  xmm15, xmm8
730        movdqa  xmm8, xmmword ptr [rsp+0x100]
731        paddd   xmm8, xmm12
732        paddd   xmm9, xmm13
733        paddd   xmm10, xmm14
734        paddd   xmm11, xmm15
735        pxor    xmm4, xmm8
736        pxor    xmm5, xmm9
737        pxor    xmm6, xmm10
738        pxor    xmm7, xmm11
739        movdqa  xmmword ptr [rsp+0x100], xmm8
740        movdqa  xmm8, xmm4
741        psrld   xmm8, 12
742        pslld   xmm4, 20
743        por     xmm4, xmm8
744        movdqa  xmm8, xmm5
745        psrld   xmm8, 12
746        pslld   xmm5, 20
747        por     xmm5, xmm8
748        movdqa  xmm8, xmm6
749        psrld   xmm8, 12
750        pslld   xmm6, 20
751        por     xmm6, xmm8
752        movdqa  xmm8, xmm7
753        psrld   xmm8, 12
754        pslld   xmm7, 20
755        por     xmm7, xmm8
756        paddd   xmm0, xmmword ptr [rsp+0x70]
757        paddd   xmm1, xmmword ptr [rsp+0x90]
758        paddd   xmm2, xmmword ptr [rsp+0x30]
759        paddd   xmm3, xmmword ptr [rsp+0xF0]
760        paddd   xmm0, xmm4
761        paddd   xmm1, xmm5
762        paddd   xmm2, xmm6
763        paddd   xmm3, xmm7
764        pxor    xmm12, xmm0
765        pxor    xmm13, xmm1
766        pxor    xmm14, xmm2
767        pxor    xmm15, xmm3
768        movdqa  xmm8, xmmword ptr [ROT8+rip]
769        pshufb  xmm12, xmm8
770        pshufb  xmm13, xmm8
771        pshufb  xmm14, xmm8
772        pshufb  xmm15, xmm8
773        movdqa  xmm8, xmmword ptr [rsp+0x100]
774        paddd   xmm8, xmm12
775        paddd   xmm9, xmm13
776        paddd   xmm10, xmm14
777        paddd   xmm11, xmm15
778        pxor    xmm4, xmm8
779        pxor    xmm5, xmm9
780        pxor    xmm6, xmm10
781        pxor    xmm7, xmm11
782        movdqa  xmmword ptr [rsp+0x100], xmm8
783        movdqa  xmm8, xmm4
784        psrld   xmm8, 7
785        pslld   xmm4, 25
786        por     xmm4, xmm8
787        movdqa  xmm8, xmm5
788        psrld   xmm8, 7
789        pslld   xmm5, 25
790        por     xmm5, xmm8
791        movdqa  xmm8, xmm6
792        psrld   xmm8, 7
793        pslld   xmm6, 25
794        por     xmm6, xmm8
795        movdqa  xmm8, xmm7
796        psrld   xmm8, 7
797        pslld   xmm7, 25
798        por     xmm7, xmm8
799        paddd   xmm0, xmmword ptr [rsp+0x40]
800        paddd   xmm1, xmmword ptr [rsp+0xB0]
801        paddd   xmm2, xmmword ptr [rsp+0x50]
802        paddd   xmm3, xmmword ptr [rsp+0x10]
803        paddd   xmm0, xmm5
804        paddd   xmm1, xmm6
805        paddd   xmm2, xmm7
806        paddd   xmm3, xmm4
807        pxor    xmm15, xmm0
808        pxor    xmm12, xmm1
809        pxor    xmm13, xmm2
810        pxor    xmm14, xmm3
811        movdqa  xmm8, xmmword ptr [ROT16+rip]
812        pshufb  xmm15, xmm8
813        pshufb  xmm12, xmm8
814        pshufb  xmm13, xmm8
815        pshufb  xmm14, xmm8
816        paddd   xmm10, xmm15
817        paddd   xmm11, xmm12
818        movdqa  xmm8, xmmword ptr [rsp+0x100]
819        paddd   xmm8, xmm13
820        paddd   xmm9, xmm14
821        pxor    xmm5, xmm10
822        pxor    xmm6, xmm11
823        pxor    xmm7, xmm8
824        pxor    xmm4, xmm9
825        movdqa  xmmword ptr [rsp+0x100], xmm8
826        movdqa  xmm8, xmm5
827        psrld   xmm8, 12
828        pslld   xmm5, 20
829        por     xmm5, xmm8
830        movdqa  xmm8, xmm6
831        psrld   xmm8, 12
832        pslld   xmm6, 20
833        por     xmm6, xmm8
834        movdqa  xmm8, xmm7
835        psrld   xmm8, 12
836        pslld   xmm7, 20
837        por     xmm7, xmm8
838        movdqa  xmm8, xmm4
839        psrld   xmm8, 12
840        pslld   xmm4, 20
841        por     xmm4, xmm8
842        paddd   xmm0, xmmword ptr [rsp]
843        paddd   xmm1, xmmword ptr [rsp+0x20]
844        paddd   xmm2, xmmword ptr [rsp+0x80]
845        paddd   xmm3, xmmword ptr [rsp+0x60]
846        paddd   xmm0, xmm5
847        paddd   xmm1, xmm6
848        paddd   xmm2, xmm7
849        paddd   xmm3, xmm4
850        pxor    xmm15, xmm0
851        pxor    xmm12, xmm1
852        pxor    xmm13, xmm2
853        pxor    xmm14, xmm3
854        movdqa  xmm8, xmmword ptr [ROT8+rip]
855        pshufb  xmm15, xmm8
856        pshufb  xmm12, xmm8
857        pshufb  xmm13, xmm8
858        pshufb  xmm14, xmm8
859        paddd   xmm10, xmm15
860        paddd   xmm11, xmm12
861        movdqa  xmm8, xmmword ptr [rsp+0x100]
862        paddd   xmm8, xmm13
863        paddd   xmm9, xmm14
864        pxor    xmm5, xmm10
865        pxor    xmm6, xmm11
866        pxor    xmm7, xmm8
867        pxor    xmm4, xmm9
868        movdqa  xmmword ptr [rsp+0x100], xmm8
869        movdqa  xmm8, xmm5
870        psrld   xmm8, 7
871        pslld   xmm5, 25
872        por     xmm5, xmm8
873        movdqa  xmm8, xmm6
874        psrld   xmm8, 7
875        pslld   xmm6, 25
876        por     xmm6, xmm8
877        movdqa  xmm8, xmm7
878        psrld   xmm8, 7
879        pslld   xmm7, 25
880        por     xmm7, xmm8
881        movdqa  xmm8, xmm4
882        psrld   xmm8, 7
883        pslld   xmm4, 25
884        por     xmm4, xmm8
885        paddd   xmm0, xmmword ptr [rsp+0xC0]
886        paddd   xmm1, xmmword ptr [rsp+0x90]
887        paddd   xmm2, xmmword ptr [rsp+0xF0]
888        paddd   xmm3, xmmword ptr [rsp+0xE0]
889        paddd   xmm0, xmm4
890        paddd   xmm1, xmm5
891        paddd   xmm2, xmm6
892        paddd   xmm3, xmm7
893        pxor    xmm12, xmm0
894        pxor    xmm13, xmm1
895        pxor    xmm14, xmm2
896        pxor    xmm15, xmm3
897        movdqa  xmm8, xmmword ptr [ROT16+rip]
898        pshufb  xmm12, xmm8
899        pshufb  xmm13, xmm8
900        pshufb  xmm14, xmm8
901        pshufb  xmm15, xmm8
902        movdqa  xmm8, xmmword ptr [rsp+0x100]
903        paddd   xmm8, xmm12
904        paddd   xmm9, xmm13
905        paddd   xmm10, xmm14
906        paddd   xmm11, xmm15
907        pxor    xmm4, xmm8
908        pxor    xmm5, xmm9
909        pxor    xmm6, xmm10
910        pxor    xmm7, xmm11
911        movdqa  xmmword ptr [rsp+0x100], xmm8
912        movdqa  xmm8, xmm4
913        psrld   xmm8, 12
914        pslld   xmm4, 20
915        por     xmm4, xmm8
916        movdqa  xmm8, xmm5
917        psrld   xmm8, 12
918        pslld   xmm5, 20
919        por     xmm5, xmm8
920        movdqa  xmm8, xmm6
921        psrld   xmm8, 12
922        pslld   xmm6, 20
923        por     xmm6, xmm8
924        movdqa  xmm8, xmm7
925        psrld   xmm8, 12
926        pslld   xmm7, 20
927        por     xmm7, xmm8
928        paddd   xmm0, xmmword ptr [rsp+0xD0]
929        paddd   xmm1, xmmword ptr [rsp+0xB0]
930        paddd   xmm2, xmmword ptr [rsp+0xA0]
931        paddd   xmm3, xmmword ptr [rsp+0x80]
932        paddd   xmm0, xmm4
933        paddd   xmm1, xmm5
934        paddd   xmm2, xmm6
935        paddd   xmm3, xmm7
936        pxor    xmm12, xmm0
937        pxor    xmm13, xmm1
938        pxor    xmm14, xmm2
939        pxor    xmm15, xmm3
940        movdqa  xmm8, xmmword ptr [ROT8+rip]
941        pshufb  xmm12, xmm8
942        pshufb  xmm13, xmm8
943        pshufb  xmm14, xmm8
944        pshufb  xmm15, xmm8
945        movdqa  xmm8, xmmword ptr [rsp+0x100]
946        paddd   xmm8, xmm12
947        paddd   xmm9, xmm13
948        paddd   xmm10, xmm14
949        paddd   xmm11, xmm15
950        pxor    xmm4, xmm8
951        pxor    xmm5, xmm9
952        pxor    xmm6, xmm10
953        pxor    xmm7, xmm11
954        movdqa  xmmword ptr [rsp+0x100], xmm8
955        movdqa  xmm8, xmm4
956        psrld   xmm8, 7
957        pslld   xmm4, 25
958        por     xmm4, xmm8
959        movdqa  xmm8, xmm5
960        psrld   xmm8, 7
961        pslld   xmm5, 25
962        por     xmm5, xmm8
963        movdqa  xmm8, xmm6
964        psrld   xmm8, 7
965        pslld   xmm6, 25
966        por     xmm6, xmm8
967        movdqa  xmm8, xmm7
968        psrld   xmm8, 7
969        pslld   xmm7, 25
970        por     xmm7, xmm8
971        paddd   xmm0, xmmword ptr [rsp+0x70]
972        paddd   xmm1, xmmword ptr [rsp+0x50]
973        paddd   xmm2, xmmword ptr [rsp]
974        paddd   xmm3, xmmword ptr [rsp+0x60]
975        paddd   xmm0, xmm5
976        paddd   xmm1, xmm6
977        paddd   xmm2, xmm7
978        paddd   xmm3, xmm4
979        pxor    xmm15, xmm0
980        pxor    xmm12, xmm1
981        pxor    xmm13, xmm2
982        pxor    xmm14, xmm3
983        movdqa  xmm8, xmmword ptr [ROT16+rip]
984        pshufb  xmm15, xmm8
985        pshufb  xmm12, xmm8
986        pshufb  xmm13, xmm8
987        pshufb  xmm14, xmm8
988        paddd   xmm10, xmm15
989        paddd   xmm11, xmm12
990        movdqa  xmm8, xmmword ptr [rsp+0x100]
991        paddd   xmm8, xmm13
992        paddd   xmm9, xmm14
993        pxor    xmm5, xmm10
994        pxor    xmm6, xmm11
995        pxor    xmm7, xmm8
996        pxor    xmm4, xmm9
997        movdqa  xmmword ptr [rsp+0x100], xmm8
998        movdqa  xmm8, xmm5
999        psrld   xmm8, 12
1000        pslld   xmm5, 20
1001        por     xmm5, xmm8
1002        movdqa  xmm8, xmm6
1003        psrld   xmm8, 12
1004        pslld   xmm6, 20
1005        por     xmm6, xmm8
1006        movdqa  xmm8, xmm7
1007        psrld   xmm8, 12
1008        pslld   xmm7, 20
1009        por     xmm7, xmm8
1010        movdqa  xmm8, xmm4
1011        psrld   xmm8, 12
1012        pslld   xmm4, 20
1013        por     xmm4, xmm8
1014        paddd   xmm0, xmmword ptr [rsp+0x20]
1015        paddd   xmm1, xmmword ptr [rsp+0x30]
1016        paddd   xmm2, xmmword ptr [rsp+0x10]
1017        paddd   xmm3, xmmword ptr [rsp+0x40]
1018        paddd   xmm0, xmm5
1019        paddd   xmm1, xmm6
1020        paddd   xmm2, xmm7
1021        paddd   xmm3, xmm4
1022        pxor    xmm15, xmm0
1023        pxor    xmm12, xmm1
1024        pxor    xmm13, xmm2
1025        pxor    xmm14, xmm3
1026        movdqa  xmm8, xmmword ptr [ROT8+rip]
1027        pshufb  xmm15, xmm8
1028        pshufb  xmm12, xmm8
1029        pshufb  xmm13, xmm8
1030        pshufb  xmm14, xmm8
1031        paddd   xmm10, xmm15
1032        paddd   xmm11, xmm12
1033        movdqa  xmm8, xmmword ptr [rsp+0x100]
1034        paddd   xmm8, xmm13
1035        paddd   xmm9, xmm14
1036        pxor    xmm5, xmm10
1037        pxor    xmm6, xmm11
1038        pxor    xmm7, xmm8
1039        pxor    xmm4, xmm9
1040        movdqa  xmmword ptr [rsp+0x100], xmm8
1041        movdqa  xmm8, xmm5
1042        psrld   xmm8, 7
1043        pslld   xmm5, 25
1044        por     xmm5, xmm8
1045        movdqa  xmm8, xmm6
1046        psrld   xmm8, 7
1047        pslld   xmm6, 25
1048        por     xmm6, xmm8
1049        movdqa  xmm8, xmm7
1050        psrld   xmm8, 7
1051        pslld   xmm7, 25
1052        por     xmm7, xmm8
1053        movdqa  xmm8, xmm4
1054        psrld   xmm8, 7
1055        pslld   xmm4, 25
1056        por     xmm4, xmm8
1057        paddd   xmm0, xmmword ptr [rsp+0x90]
1058        paddd   xmm1, xmmword ptr [rsp+0xB0]
1059        paddd   xmm2, xmmword ptr [rsp+0x80]
1060        paddd   xmm3, xmmword ptr [rsp+0xF0]
1061        paddd   xmm0, xmm4
1062        paddd   xmm1, xmm5
1063        paddd   xmm2, xmm6
1064        paddd   xmm3, xmm7
1065        pxor    xmm12, xmm0
1066        pxor    xmm13, xmm1
1067        pxor    xmm14, xmm2
1068        pxor    xmm15, xmm3
1069        movdqa  xmm8, xmmword ptr [ROT16+rip]
1070        pshufb  xmm12, xmm8
1071        pshufb  xmm13, xmm8
1072        pshufb  xmm14, xmm8
1073        pshufb  xmm15, xmm8
1074        movdqa  xmm8, xmmword ptr [rsp+0x100]
1075        paddd   xmm8, xmm12
1076        paddd   xmm9, xmm13
1077        paddd   xmm10, xmm14
1078        paddd   xmm11, xmm15
1079        pxor    xmm4, xmm8
1080        pxor    xmm5, xmm9
1081        pxor    xmm6, xmm10
1082        pxor    xmm7, xmm11
1083        movdqa  xmmword ptr [rsp+0x100], xmm8
1084        movdqa  xmm8, xmm4
1085        psrld   xmm8, 12
1086        pslld   xmm4, 20
1087        por     xmm4, xmm8
1088        movdqa  xmm8, xmm5
1089        psrld   xmm8, 12
1090        pslld   xmm5, 20
1091        por     xmm5, xmm8
1092        movdqa  xmm8, xmm6
1093        psrld   xmm8, 12
1094        pslld   xmm6, 20
1095        por     xmm6, xmm8
1096        movdqa  xmm8, xmm7
1097        psrld   xmm8, 12
1098        pslld   xmm7, 20
1099        por     xmm7, xmm8
1100        paddd   xmm0, xmmword ptr [rsp+0xE0]
1101        paddd   xmm1, xmmword ptr [rsp+0x50]
1102        paddd   xmm2, xmmword ptr [rsp+0xC0]
1103        paddd   xmm3, xmmword ptr [rsp+0x10]
1104        paddd   xmm0, xmm4
1105        paddd   xmm1, xmm5
1106        paddd   xmm2, xmm6
1107        paddd   xmm3, xmm7
1108        pxor    xmm12, xmm0
1109        pxor    xmm13, xmm1
1110        pxor    xmm14, xmm2
1111        pxor    xmm15, xmm3
1112        movdqa  xmm8, xmmword ptr [ROT8+rip]
1113        pshufb  xmm12, xmm8
1114        pshufb  xmm13, xmm8
1115        pshufb  xmm14, xmm8
1116        pshufb  xmm15, xmm8
1117        movdqa  xmm8, xmmword ptr [rsp+0x100]
1118        paddd   xmm8, xmm12
1119        paddd   xmm9, xmm13
1120        paddd   xmm10, xmm14
1121        paddd   xmm11, xmm15
1122        pxor    xmm4, xmm8
1123        pxor    xmm5, xmm9
1124        pxor    xmm6, xmm10
1125        pxor    xmm7, xmm11
1126        movdqa  xmmword ptr [rsp+0x100], xmm8
1127        movdqa  xmm8, xmm4
1128        psrld   xmm8, 7
1129        pslld   xmm4, 25
1130        por     xmm4, xmm8
1131        movdqa  xmm8, xmm5
1132        psrld   xmm8, 7
1133        pslld   xmm5, 25
1134        por     xmm5, xmm8
1135        movdqa  xmm8, xmm6
1136        psrld   xmm8, 7
1137        pslld   xmm6, 25
1138        por     xmm6, xmm8
1139        movdqa  xmm8, xmm7
1140        psrld   xmm8, 7
1141        pslld   xmm7, 25
1142        por     xmm7, xmm8
1143        paddd   xmm0, xmmword ptr [rsp+0xD0]
1144        paddd   xmm1, xmmword ptr [rsp]
1145        paddd   xmm2, xmmword ptr [rsp+0x20]
1146        paddd   xmm3, xmmword ptr [rsp+0x40]
1147        paddd   xmm0, xmm5
1148        paddd   xmm1, xmm6
1149        paddd   xmm2, xmm7
1150        paddd   xmm3, xmm4
1151        pxor    xmm15, xmm0
1152        pxor    xmm12, xmm1
1153        pxor    xmm13, xmm2
1154        pxor    xmm14, xmm3
1155        movdqa  xmm8, xmmword ptr [ROT16+rip]
1156        pshufb  xmm15, xmm8
1157        pshufb  xmm12, xmm8
1158        pshufb  xmm13, xmm8
1159        pshufb  xmm14, xmm8
1160        paddd   xmm10, xmm15
1161        paddd   xmm11, xmm12
1162        movdqa  xmm8, xmmword ptr [rsp+0x100]
1163        paddd   xmm8, xmm13
1164        paddd   xmm9, xmm14
1165        pxor    xmm5, xmm10
1166        pxor    xmm6, xmm11
1167        pxor    xmm7, xmm8
1168        pxor    xmm4, xmm9
1169        movdqa  xmmword ptr [rsp+0x100], xmm8
1170        movdqa  xmm8, xmm5
1171        psrld   xmm8, 12
1172        pslld   xmm5, 20
1173        por     xmm5, xmm8
1174        movdqa  xmm8, xmm6
1175        psrld   xmm8, 12
1176        pslld   xmm6, 20
1177        por     xmm6, xmm8
1178        movdqa  xmm8, xmm7
1179        psrld   xmm8, 12
1180        pslld   xmm7, 20
1181        por     xmm7, xmm8
1182        movdqa  xmm8, xmm4
1183        psrld   xmm8, 12
1184        pslld   xmm4, 20
1185        por     xmm4, xmm8
1186        paddd   xmm0, xmmword ptr [rsp+0x30]
1187        paddd   xmm1, xmmword ptr [rsp+0xA0]
1188        paddd   xmm2, xmmword ptr [rsp+0x60]
1189        paddd   xmm3, xmmword ptr [rsp+0x70]
1190        paddd   xmm0, xmm5
1191        paddd   xmm1, xmm6
1192        paddd   xmm2, xmm7
1193        paddd   xmm3, xmm4
1194        pxor    xmm15, xmm0
1195        pxor    xmm12, xmm1
1196        pxor    xmm13, xmm2
1197        pxor    xmm14, xmm3
1198        movdqa  xmm8, xmmword ptr [ROT8+rip]
1199        pshufb  xmm15, xmm8
1200        pshufb  xmm12, xmm8
1201        pshufb  xmm13, xmm8
1202        pshufb  xmm14, xmm8
1203        paddd   xmm10, xmm15
1204        paddd   xmm11, xmm12
1205        movdqa  xmm8, xmmword ptr [rsp+0x100]
1206        paddd   xmm8, xmm13
1207        paddd   xmm9, xmm14
1208        pxor    xmm5, xmm10
1209        pxor    xmm6, xmm11
1210        pxor    xmm7, xmm8
1211        pxor    xmm4, xmm9
1212        movdqa  xmmword ptr [rsp+0x100], xmm8
1213        movdqa  xmm8, xmm5
1214        psrld   xmm8, 7
1215        pslld   xmm5, 25
1216        por     xmm5, xmm8
1217        movdqa  xmm8, xmm6
1218        psrld   xmm8, 7
1219        pslld   xmm6, 25
1220        por     xmm6, xmm8
1221        movdqa  xmm8, xmm7
1222        psrld   xmm8, 7
1223        pslld   xmm7, 25
1224        por     xmm7, xmm8
1225        movdqa  xmm8, xmm4
1226        psrld   xmm8, 7
1227        pslld   xmm4, 25
1228        por     xmm4, xmm8
1229        paddd   xmm0, xmmword ptr [rsp+0xB0]
1230        paddd   xmm1, xmmword ptr [rsp+0x50]
1231        paddd   xmm2, xmmword ptr [rsp+0x10]
1232        paddd   xmm3, xmmword ptr [rsp+0x80]
1233        paddd   xmm0, xmm4
1234        paddd   xmm1, xmm5
1235        paddd   xmm2, xmm6
1236        paddd   xmm3, xmm7
1237        pxor    xmm12, xmm0
1238        pxor    xmm13, xmm1
1239        pxor    xmm14, xmm2
1240        pxor    xmm15, xmm3
1241        movdqa  xmm8, xmmword ptr [ROT16+rip]
1242        pshufb  xmm12, xmm8
1243        pshufb  xmm13, xmm8
1244        pshufb  xmm14, xmm8
1245        pshufb  xmm15, xmm8
1246        movdqa  xmm8, xmmword ptr [rsp+0x100]
1247        paddd   xmm8, xmm12
1248        paddd   xmm9, xmm13
1249        paddd   xmm10, xmm14
1250        paddd   xmm11, xmm15
1251        pxor    xmm4, xmm8
1252        pxor    xmm5, xmm9
1253        pxor    xmm6, xmm10
1254        pxor    xmm7, xmm11
1255        movdqa  xmmword ptr [rsp+0x100], xmm8
1256        movdqa  xmm8, xmm4
1257        psrld   xmm8, 12
1258        pslld   xmm4, 20
1259        por     xmm4, xmm8
1260        movdqa  xmm8, xmm5
1261        psrld   xmm8, 12
1262        pslld   xmm5, 20
1263        por     xmm5, xmm8
1264        movdqa  xmm8, xmm6
1265        psrld   xmm8, 12
1266        pslld   xmm6, 20
1267        por     xmm6, xmm8
1268        movdqa  xmm8, xmm7
1269        psrld   xmm8, 12
1270        pslld   xmm7, 20
1271        por     xmm7, xmm8
1272        paddd   xmm0, xmmword ptr [rsp+0xF0]
1273        paddd   xmm1, xmmword ptr [rsp]
1274        paddd   xmm2, xmmword ptr [rsp+0x90]
1275        paddd   xmm3, xmmword ptr [rsp+0x60]
1276        paddd   xmm0, xmm4
1277        paddd   xmm1, xmm5
1278        paddd   xmm2, xmm6
1279        paddd   xmm3, xmm7
1280        pxor    xmm12, xmm0
1281        pxor    xmm13, xmm1
1282        pxor    xmm14, xmm2
1283        pxor    xmm15, xmm3
1284        movdqa  xmm8, xmmword ptr [ROT8+rip]
1285        pshufb  xmm12, xmm8
1286        pshufb  xmm13, xmm8
1287        pshufb  xmm14, xmm8
1288        pshufb  xmm15, xmm8
1289        movdqa  xmm8, xmmword ptr [rsp+0x100]
1290        paddd   xmm8, xmm12
1291        paddd   xmm9, xmm13
1292        paddd   xmm10, xmm14
1293        paddd   xmm11, xmm15
1294        pxor    xmm4, xmm8
1295        pxor    xmm5, xmm9
1296        pxor    xmm6, xmm10
1297        pxor    xmm7, xmm11
1298        movdqa  xmmword ptr [rsp+0x100], xmm8
1299        movdqa  xmm8, xmm4
1300        psrld   xmm8, 7
1301        pslld   xmm4, 25
1302        por     xmm4, xmm8
1303        movdqa  xmm8, xmm5
1304        psrld   xmm8, 7
1305        pslld   xmm5, 25
1306        por     xmm5, xmm8
1307        movdqa  xmm8, xmm6
1308        psrld   xmm8, 7
1309        pslld   xmm6, 25
1310        por     xmm6, xmm8
1311        movdqa  xmm8, xmm7
1312        psrld   xmm8, 7
1313        pslld   xmm7, 25
1314        por     xmm7, xmm8
1315        paddd   xmm0, xmmword ptr [rsp+0xE0]
1316        paddd   xmm1, xmmword ptr [rsp+0x20]
1317        paddd   xmm2, xmmword ptr [rsp+0x30]
1318        paddd   xmm3, xmmword ptr [rsp+0x70]
1319        paddd   xmm0, xmm5
1320        paddd   xmm1, xmm6
1321        paddd   xmm2, xmm7
1322        paddd   xmm3, xmm4
1323        pxor    xmm15, xmm0
1324        pxor    xmm12, xmm1
1325        pxor    xmm13, xmm2
1326        pxor    xmm14, xmm3
1327        movdqa  xmm8, xmmword ptr [ROT16+rip]
1328        pshufb  xmm15, xmm8
1329        pshufb  xmm12, xmm8
1330        pshufb  xmm13, xmm8
1331        pshufb  xmm14, xmm8
1332        paddd   xmm10, xmm15
1333        paddd   xmm11, xmm12
1334        movdqa  xmm8, xmmword ptr [rsp+0x100]
1335        paddd   xmm8, xmm13
1336        paddd   xmm9, xmm14
1337        pxor    xmm5, xmm10
1338        pxor    xmm6, xmm11
1339        pxor    xmm7, xmm8
1340        pxor    xmm4, xmm9
1341        movdqa  xmmword ptr [rsp+0x100], xmm8
1342        movdqa  xmm8, xmm5
1343        psrld   xmm8, 12
1344        pslld   xmm5, 20
1345        por     xmm5, xmm8
1346        movdqa  xmm8, xmm6
1347        psrld   xmm8, 12
1348        pslld   xmm6, 20
1349        por     xmm6, xmm8
1350        movdqa  xmm8, xmm7
1351        psrld   xmm8, 12
1352        pslld   xmm7, 20
1353        por     xmm7, xmm8
1354        movdqa  xmm8, xmm4
1355        psrld   xmm8, 12
1356        pslld   xmm4, 20
1357        por     xmm4, xmm8
1358        paddd   xmm0, xmmword ptr [rsp+0xA0]
1359        paddd   xmm1, xmmword ptr [rsp+0xC0]
1360        paddd   xmm2, xmmword ptr [rsp+0x40]
1361        paddd   xmm3, xmmword ptr [rsp+0xD0]
1362        paddd   xmm0, xmm5
1363        paddd   xmm1, xmm6
1364        paddd   xmm2, xmm7
1365        paddd   xmm3, xmm4
1366        pxor    xmm15, xmm0
1367        pxor    xmm12, xmm1
1368        pxor    xmm13, xmm2
1369        pxor    xmm14, xmm3
1370        movdqa  xmm8, xmmword ptr [ROT8+rip]
1371        pshufb  xmm15, xmm8
1372        pshufb  xmm12, xmm8
1373        pshufb  xmm13, xmm8
1374        pshufb  xmm14, xmm8
1375        paddd   xmm10, xmm15
1376        paddd   xmm11, xmm12
1377        movdqa  xmm8, xmmword ptr [rsp+0x100]
1378        paddd   xmm8, xmm13
1379        paddd   xmm9, xmm14
1380        pxor    xmm5, xmm10
1381        pxor    xmm6, xmm11
1382        pxor    xmm7, xmm8
1383        pxor    xmm4, xmm9
1384        pxor    xmm0, xmm8
1385        pxor    xmm1, xmm9
1386        pxor    xmm2, xmm10
1387        pxor    xmm3, xmm11
1388        movdqa  xmm8, xmm5
1389        psrld   xmm8, 7
1390        pslld   xmm5, 25
1391        por     xmm5, xmm8
1392        movdqa  xmm8, xmm6
1393        psrld   xmm8, 7
1394        pslld   xmm6, 25
1395        por     xmm6, xmm8
1396        movdqa  xmm8, xmm7
1397        psrld   xmm8, 7
1398        pslld   xmm7, 25
1399        por     xmm7, xmm8
1400        movdqa  xmm8, xmm4
1401        psrld   xmm8, 7
1402        pslld   xmm4, 25
1403        por     xmm4, xmm8
1404        pxor    xmm4, xmm12
1405        pxor    xmm5, xmm13
1406        pxor    xmm6, xmm14
1407        pxor    xmm7, xmm15
1408        mov     eax, r13d
1409        jne     9b
1410        movdqa  xmm9, xmm0
1411        punpckldq xmm0, xmm1
1412        punpckhdq xmm9, xmm1
1413        movdqa  xmm11, xmm2
1414        punpckldq xmm2, xmm3
1415        punpckhdq xmm11, xmm3
1416        movdqa  xmm1, xmm0
1417        punpcklqdq xmm0, xmm2
1418        punpckhqdq xmm1, xmm2
1419        movdqa  xmm3, xmm9
1420        punpcklqdq xmm9, xmm11
1421        punpckhqdq xmm3, xmm11
1422        movdqu  xmmword ptr [rbx], xmm0
1423        movdqu  xmmword ptr [rbx+0x20], xmm1
1424        movdqu  xmmword ptr [rbx+0x40], xmm9
1425        movdqu  xmmword ptr [rbx+0x60], xmm3
1426        movdqa  xmm9, xmm4
1427        punpckldq xmm4, xmm5
1428        punpckhdq xmm9, xmm5
1429        movdqa  xmm11, xmm6
1430        punpckldq xmm6, xmm7
1431        punpckhdq xmm11, xmm7
1432        movdqa  xmm5, xmm4
1433        punpcklqdq xmm4, xmm6
1434        punpckhqdq xmm5, xmm6
1435        movdqa  xmm7, xmm9
1436        punpcklqdq xmm9, xmm11
1437        punpckhqdq xmm7, xmm11
1438        movdqu  xmmword ptr [rbx+0x10], xmm4
1439        movdqu  xmmword ptr [rbx+0x30], xmm5
1440        movdqu  xmmword ptr [rbx+0x50], xmm9
1441        movdqu  xmmword ptr [rbx+0x70], xmm7
1442        movdqa  xmm1, xmmword ptr [rsp+0x110]
1443        movdqa  xmm0, xmm1
1444        paddd   xmm1, xmmword ptr [rsp+0x150]
1445        movdqa  xmmword ptr [rsp+0x110], xmm1
1446        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1447        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1448        pcmpgtd xmm0, xmm1
1449        movdqa  xmm1, xmmword ptr [rsp+0x120]
1450        psubd   xmm1, xmm0
1451        movdqa  xmmword ptr [rsp+0x120], xmm1
1452        add     rbx, 128
1453        add     rdi, 32
1454        sub     rsi, 4
1455        cmp     rsi, 4
1456        jnc     2b
1457        test    rsi, rsi
1458        jnz     3f
14594:
1460        mov     rsp, rbp
1461        pop     rbp
1462        pop     rbx
1463        pop     r12
1464        pop     r13
1465        pop     r14
1466        pop     r15
1467        ret
1468.p2align 5
14693:
1470        test    esi, 0x2
1471        je      3f
1472        movups  xmm0, xmmword ptr [rcx]
1473        movups  xmm1, xmmword ptr [rcx+0x10]
1474        movaps  xmm8, xmm0
1475        movaps  xmm9, xmm1
1476        movd    xmm13, dword ptr [rsp+0x110]
1477        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1478        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1479        movaps  xmmword ptr [rsp], xmm13
1480        movd    xmm14, dword ptr [rsp+0x114]
1481        pinsrd  xmm14, dword ptr [rsp+0x124], 1
1482        pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1483        movaps  xmmword ptr [rsp+0x10], xmm14
1484        mov     r8, qword ptr [rdi]
1485        mov     r9, qword ptr [rdi+0x8]
1486        movzx   eax, byte ptr [rbp+0x40]
1487        or      eax, r13d
1488        xor     edx, edx
14892:
1490        mov     r14d, eax
1491        or      eax, r12d
1492        add     rdx, 64
1493        cmp     rdx, r15
1494        cmovne  eax, r14d
1495        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1496        movaps  xmm10, xmm2
1497        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1498        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1499        movaps  xmm3, xmm4
1500        shufps  xmm4, xmm5, 136
1501        shufps  xmm3, xmm5, 221
1502        movaps  xmm5, xmm3
1503        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1504        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1505        movaps  xmm3, xmm6
1506        shufps  xmm6, xmm7, 136
1507        pshufd  xmm6, xmm6, 0x93
1508        shufps  xmm3, xmm7, 221
1509        pshufd  xmm7, xmm3, 0x93
1510        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1511        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1512        movaps  xmm11, xmm12
1513        shufps  xmm12, xmm13, 136
1514        shufps  xmm11, xmm13, 221
1515        movaps  xmm13, xmm11
1516        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1517        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1518        movaps  xmm11, xmm14
1519        shufps  xmm14, xmm15, 136
1520        pshufd  xmm14, xmm14, 0x93
1521        shufps  xmm11, xmm15, 221
1522        pshufd  xmm15, xmm11, 0x93
1523        movaps  xmm3, xmmword ptr [rsp]
1524        movaps  xmm11, xmmword ptr [rsp+0x10]
1525        pinsrd  xmm3, eax, 3
1526        pinsrd  xmm11, eax, 3
1527        mov     al, 7
15289:
1529        paddd   xmm0, xmm4
1530        paddd   xmm8, xmm12
1531        movaps  xmmword ptr [rsp+0x20], xmm4
1532        movaps  xmmword ptr [rsp+0x30], xmm12
1533        paddd   xmm0, xmm1
1534        paddd   xmm8, xmm9
1535        pxor    xmm3, xmm0
1536        pxor    xmm11, xmm8
1537        movaps  xmm12, xmmword ptr [ROT16+rip]
1538        pshufb  xmm3, xmm12
1539        pshufb  xmm11, xmm12
1540        paddd   xmm2, xmm3
1541        paddd   xmm10, xmm11
1542        pxor    xmm1, xmm2
1543        pxor    xmm9, xmm10
1544        movdqa  xmm4, xmm1
1545        pslld   xmm1, 20
1546        psrld   xmm4, 12
1547        por     xmm1, xmm4
1548        movdqa  xmm4, xmm9
1549        pslld   xmm9, 20
1550        psrld   xmm4, 12
1551        por     xmm9, xmm4
1552        paddd   xmm0, xmm5
1553        paddd   xmm8, xmm13
1554        movaps  xmmword ptr [rsp+0x40], xmm5
1555        movaps  xmmword ptr [rsp+0x50], xmm13
1556        paddd   xmm0, xmm1
1557        paddd   xmm8, xmm9
1558        pxor    xmm3, xmm0
1559        pxor    xmm11, xmm8
1560        movaps  xmm13, xmmword ptr [ROT8+rip]
1561        pshufb  xmm3, xmm13
1562        pshufb  xmm11, xmm13
1563        paddd   xmm2, xmm3
1564        paddd   xmm10, xmm11
1565        pxor    xmm1, xmm2
1566        pxor    xmm9, xmm10
1567        movdqa  xmm4, xmm1
1568        pslld   xmm1, 25
1569        psrld   xmm4, 7
1570        por     xmm1, xmm4
1571        movdqa  xmm4, xmm9
1572        pslld   xmm9, 25
1573        psrld   xmm4, 7
1574        por     xmm9, xmm4
1575        pshufd  xmm0, xmm0, 0x93
1576        pshufd  xmm8, xmm8, 0x93
1577        pshufd  xmm3, xmm3, 0x4E
1578        pshufd  xmm11, xmm11, 0x4E
1579        pshufd  xmm2, xmm2, 0x39
1580        pshufd  xmm10, xmm10, 0x39
1581        paddd   xmm0, xmm6
1582        paddd   xmm8, xmm14
1583        paddd   xmm0, xmm1
1584        paddd   xmm8, xmm9
1585        pxor    xmm3, xmm0
1586        pxor    xmm11, xmm8
1587        pshufb  xmm3, xmm12
1588        pshufb  xmm11, xmm12
1589        paddd   xmm2, xmm3
1590        paddd   xmm10, xmm11
1591        pxor    xmm1, xmm2
1592        pxor    xmm9, xmm10
1593        movdqa  xmm4, xmm1
1594        pslld   xmm1, 20
1595        psrld   xmm4, 12
1596        por     xmm1, xmm4
1597        movdqa  xmm4, xmm9
1598        pslld   xmm9, 20
1599        psrld   xmm4, 12
1600        por     xmm9, xmm4
1601        paddd   xmm0, xmm7
1602        paddd   xmm8, xmm15
1603        paddd   xmm0, xmm1
1604        paddd   xmm8, xmm9
1605        pxor    xmm3, xmm0
1606        pxor    xmm11, xmm8
1607        pshufb  xmm3, xmm13
1608        pshufb  xmm11, xmm13
1609        paddd   xmm2, xmm3
1610        paddd   xmm10, xmm11
1611        pxor    xmm1, xmm2
1612        pxor    xmm9, xmm10
1613        movdqa  xmm4, xmm1
1614        pslld   xmm1, 25
1615        psrld   xmm4, 7
1616        por     xmm1, xmm4
1617        movdqa  xmm4, xmm9
1618        pslld   xmm9, 25
1619        psrld   xmm4, 7
1620        por     xmm9, xmm4
1621        pshufd  xmm0, xmm0, 0x39
1622        pshufd  xmm8, xmm8, 0x39
1623        pshufd  xmm3, xmm3, 0x4E
1624        pshufd  xmm11, xmm11, 0x4E
1625        pshufd  xmm2, xmm2, 0x93
1626        pshufd  xmm10, xmm10, 0x93
1627        dec     al
1628        je      9f
1629        movdqa  xmm12, xmmword ptr [rsp+0x20]
1630        movdqa  xmm5, xmmword ptr [rsp+0x40]
1631        pshufd  xmm13, xmm12, 0x0F
1632        shufps  xmm12, xmm5, 214
1633        pshufd  xmm4, xmm12, 0x39
1634        movdqa  xmm12, xmm6
1635        shufps  xmm12, xmm7, 250
1636        pblendw xmm13, xmm12, 0xCC
1637        movdqa  xmm12, xmm7
1638        punpcklqdq xmm12, xmm5
1639        pblendw xmm12, xmm6, 0xC0
1640        pshufd  xmm12, xmm12, 0x78
1641        punpckhdq xmm5, xmm7
1642        punpckldq xmm6, xmm5
1643        pshufd  xmm7, xmm6, 0x1E
1644        movdqa  xmmword ptr [rsp+0x20], xmm13
1645        movdqa  xmmword ptr [rsp+0x40], xmm12
1646        movdqa  xmm5, xmmword ptr [rsp+0x30]
1647        movdqa  xmm13, xmmword ptr [rsp+0x50]
1648        pshufd  xmm6, xmm5, 0x0F
1649        shufps  xmm5, xmm13, 214
1650        pshufd  xmm12, xmm5, 0x39
1651        movdqa  xmm5, xmm14
1652        shufps  xmm5, xmm15, 250
1653        pblendw xmm6, xmm5, 0xCC
1654        movdqa  xmm5, xmm15
1655        punpcklqdq xmm5, xmm13
1656        pblendw xmm5, xmm14, 0xC0
1657        pshufd  xmm5, xmm5, 0x78
1658        punpckhdq xmm13, xmm15
1659        punpckldq xmm14, xmm13
1660        pshufd  xmm15, xmm14, 0x1E
1661        movdqa  xmm13, xmm6
1662        movdqa  xmm14, xmm5
1663        movdqa  xmm5, xmmword ptr [rsp+0x20]
1664        movdqa  xmm6, xmmword ptr [rsp+0x40]
1665        jmp     9b
16669:
1667        pxor    xmm0, xmm2
1668        pxor    xmm1, xmm3
1669        pxor    xmm8, xmm10
1670        pxor    xmm9, xmm11
1671        mov     eax, r13d
1672        cmp     rdx, r15
1673        jne     2b
1674        movups  xmmword ptr [rbx], xmm0
1675        movups  xmmword ptr [rbx+0x10], xmm1
1676        movups  xmmword ptr [rbx+0x20], xmm8
1677        movups  xmmword ptr [rbx+0x30], xmm9
1678        movdqa  xmm0, xmmword ptr [rsp+0x130]
1679        movdqa  xmm1, xmmword ptr [rsp+0x110]
1680        movdqa  xmm2, xmmword ptr [rsp+0x120]
1681        movdqu  xmm3, xmmword ptr [rsp+0x118]
1682        movdqu  xmm4, xmmword ptr [rsp+0x128]
1683        blendvps xmm1, xmm3, xmm0
1684        blendvps xmm2, xmm4, xmm0
1685        movdqa  xmmword ptr [rsp+0x110], xmm1
1686        movdqa  xmmword ptr [rsp+0x120], xmm2
1687        add     rdi, 16
1688        add     rbx, 64
1689        sub     rsi, 2
16903:
1691        test    esi, 0x1
1692        je      4b
1693        movups  xmm0, xmmword ptr [rcx]
1694        movups  xmm1, xmmword ptr [rcx+0x10]
1695        movd    xmm13, dword ptr [rsp+0x110]
1696        pinsrd  xmm13, dword ptr [rsp+0x120], 1
1697        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1698        movaps  xmm14, xmmword ptr [ROT8+rip]
1699        movaps  xmm15, xmmword ptr [ROT16+rip]
1700        mov     r8, qword ptr [rdi]
1701        movzx   eax, byte ptr [rbp+0x40]
1702        or      eax, r13d
1703        xor     edx, edx
17042:
1705        mov     r14d, eax
1706        or      eax, r12d
1707        add     rdx, 64
1708        cmp     rdx, r15
1709        cmovne  eax, r14d
1710        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1711        movaps  xmm3, xmm13
1712        pinsrd  xmm3, eax, 3
1713        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1714        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1715        movaps  xmm8, xmm4
1716        shufps  xmm4, xmm5, 136
1717        shufps  xmm8, xmm5, 221
1718        movaps  xmm5, xmm8
1719        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1720        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1721        movaps  xmm8, xmm6
1722        shufps  xmm6, xmm7, 136
1723        pshufd  xmm6, xmm6, 0x93
1724        shufps  xmm8, xmm7, 221
1725        pshufd  xmm7, xmm8, 0x93
1726        mov     al, 7
17279:
1728        paddd   xmm0, xmm4
1729        paddd   xmm0, xmm1
1730        pxor    xmm3, xmm0
1731        pshufb  xmm3, xmm15
1732        paddd   xmm2, xmm3
1733        pxor    xmm1, xmm2
1734        movdqa  xmm11, xmm1
1735        pslld   xmm1, 20
1736        psrld   xmm11, 12
1737        por     xmm1, xmm11
1738        paddd   xmm0, xmm5
1739        paddd   xmm0, xmm1
1740        pxor    xmm3, xmm0
1741        pshufb  xmm3, xmm14
1742        paddd   xmm2, xmm3
1743        pxor    xmm1, xmm2
1744        movdqa  xmm11, xmm1
1745        pslld   xmm1, 25
1746        psrld   xmm11, 7
1747        por     xmm1, xmm11
1748        pshufd  xmm0, xmm0, 0x93
1749        pshufd  xmm3, xmm3, 0x4E
1750        pshufd  xmm2, xmm2, 0x39
1751        paddd   xmm0, xmm6
1752        paddd   xmm0, xmm1
1753        pxor    xmm3, xmm0
1754        pshufb  xmm3, xmm15
1755        paddd   xmm2, xmm3
1756        pxor    xmm1, xmm2
1757        movdqa  xmm11, xmm1
1758        pslld   xmm1, 20
1759        psrld   xmm11, 12
1760        por     xmm1, xmm11
1761        paddd   xmm0, xmm7
1762        paddd   xmm0, xmm1
1763        pxor    xmm3, xmm0
1764        pshufb  xmm3, xmm14
1765        paddd   xmm2, xmm3
1766        pxor    xmm1, xmm2
1767        movdqa  xmm11, xmm1
1768        pslld   xmm1, 25
1769        psrld   xmm11, 7
1770        por     xmm1, xmm11
1771        pshufd  xmm0, xmm0, 0x39
1772        pshufd  xmm3, xmm3, 0x4E
1773        pshufd  xmm2, xmm2, 0x93
1774        dec     al
1775        jz      9f
1776        movdqa  xmm8, xmm4
1777        shufps  xmm8, xmm5, 214
1778        pshufd  xmm9, xmm4, 0x0F
1779        pshufd  xmm4, xmm8, 0x39
1780        movdqa  xmm8, xmm6
1781        shufps  xmm8, xmm7, 250
1782        pblendw xmm9, xmm8, 0xCC
1783        movdqa  xmm8, xmm7
1784        punpcklqdq xmm8, xmm5
1785        pblendw xmm8, xmm6, 0xC0
1786        pshufd  xmm8, xmm8, 0x78
1787        punpckhdq xmm5, xmm7
1788        punpckldq xmm6, xmm5
1789        pshufd  xmm7, xmm6, 0x1E
1790        movdqa  xmm5, xmm9
1791        movdqa  xmm6, xmm8
1792        jmp     9b
17939:
1794        pxor    xmm0, xmm2
1795        pxor    xmm1, xmm3
1796        mov     eax, r13d
1797        cmp     rdx, r15
1798        jne     2b
1799        movups  xmmword ptr [rbx], xmm0
1800        movups  xmmword ptr [rbx+0x10], xmm1
1801        jmp     4b
1802
1803.p2align 6
/*
 * blake3_compress_in_place_sse41 / _blake3_compress_in_place_sse41
 *
 * Runs seven rounds of the compression function over one 64-byte message
 * block and overwrites the 32 bytes at [rdi] with the result.
 * Leaf routine: makes no calls and does not touch the stack.
 *
 * Register arguments as consumed below. The C-level names in parentheses
 * are assumptions. NOTE(review): confirm against the C prototype.
 *   rdi - 32-byte state, read on entry, written on exit (unaligned access)
 *   rsi - 64-byte message block, read only (unaligned access)
 *   dl  - 8-bit value placed in dword 2 of state row 3 (presumably block_len)
 *   rcx - 64-bit value placed in dwords 0-1 of state row 3 (presumably counter)
 *   r8b - 8-bit value placed in dword 3 of state row 3 (presumably flags)
 *
 * Only the low byte of rdx and of r8 is consumed. Both are zero-extended
 * before being packed, the same way blake3_compress_xof_sse41 does it, so
 * stale bits above bit 7 in either register cannot leak into row 3.
 *
 * State layout: xmm0..xmm3 hold rows 0..3 of the 4x4 dword state.
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 */
blake3_compress_in_place_sse41:
_blake3_compress_in_place_sse41:
        _CET_ENDBR
        /* Rows 0-1 come from the caller state, row 2 from the BLAKE3_IV table. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Build row 3 = { rcx[31:0], rcx[63:32], dl, r8b }. */
        movzx   eax, r8b                        /* rax = r8b, upper bits cleared */
        movzx   edx, dl                         /* rdx = dl, upper bits cleared */
        shl     rax, 32
        add     rdx, rax                        /* rdx = (r8b << 32) | dl */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /*
         * Load the 16 message dwords m0..m15 and split them by parity:
         *   xmm4 = m0  m2  m4  m6
         *   xmm5 = m1  m3  m5  m7
         *   xmm6 = m14 m8  m10 m12   (even dwords, lanes rotated up by one)
         *   xmm7 = m15 m9  m11 m13   (odd dwords, lanes rotated up by one)
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        /*
         * Byte-shuffle masks kept in registers for the whole function.
         * The tables are defined elsewhere in the file; presumably they
         * rotate each dword by 8 and by 16 bits respectively.
         */
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        mov     al, 7                           /* al = rounds remaining */
9:
        /* Column step, first half: add xmm4, shuffle by ROT16, rotate right 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Column step, second half: add xmm5, shuffle by ROT8, rotate right 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Rotate the lanes of rows 0, 2 and 3 so diagonals line up as columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonal step, first half: add xmm6, shuffle by ROT16, rotate right 12. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonal step, second half: add xmm7, shuffle by ROT8, rotate right 7. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation applied before the diagonal step. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                              /* all seven rounds done */
        /*
         * Re-order the message vectors xmm4..xmm7 for the next round
         * (presumably the BLAKE3 message permutation, expressed directly on
         * the pre-shuffled vectors). xmm8 and xmm9 are scratch.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold rows 2-3 into rows 0-1 and store the 32-byte result in place. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
1903
.p2align 6
/*
 * blake3_compress_xof_sse41 / _blake3_compress_xof_sse41
 *
 * Builds a 4x4 state of 32-bit words in xmm0-xmm3 (one row per register),
 * mixes it with one 64-byte message block for 7 rounds, and stores a
 * 64-byte result. Leaf routine: no stack use, no calls.
 *
 * Inputs, as read by the code (the names in brackets are the presumed
 * C-level parameters; confirm against the C prototype):
 *   rdi -> 32 bytes of input state, read only        [cv, TODO confirm]
 *   rsi -> 64 bytes of message, unaligned loads      [block, TODO confirm]
 *   dl   = byte, zero-extended into state row 3      [block_len, TODO confirm]
 *   rcx  = 64-bit value, low half of state row 3     [counter, TODO confirm]
 *   r8b  = byte, zero-extended into state row 3      [flags, TODO confirm]
 *   r9  -> 64-byte output buffer, unaligned stores   [out, TODO confirm]
 *
 * Output:
 *   [r9+0x00 .. r9+0x1F] = final rows 0,1 XOR final rows 2,3
 *   [r9+0x20 .. r9+0x3F] = final rows 2,3 XOR the 32 bytes at [rdi]
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 *
 * BLAKE3_IV, ROT8 and ROT16 are fetched with movaps, so those constants
 * must be 16-byte aligned.
 */
blake3_compress_xof_sse41:
_blake3_compress_xof_sse41:
        _CET_ENDBR
        /* State rows 0 and 1 come from [rdi]; row 2 is the BLAKE3_IV vector. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Row 3 = { rcx low dword, rcx high dword, dl, r8b }. */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax                /* rdx = dl | (r8b << 32) */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /*
         * Load the 16 message words m0..m15 and de-interleave them:
         *   xmm4 = { m0,  m2,  m4,  m6  }   (shufps 136: even words)
         *   xmm5 = { m1,  m3,  m5,  m7  }   (shufps 221: odd words)
         *   xmm6 = { m14, m8,  m10, m12 }   (even words, lanes rotated by 0x93)
         *   xmm7 = { m15, m9,  m11, m13 }   (odd words, lanes rotated by 0x93)
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        /* pshufb masks that rotate every dword by 8 and by 16 bits. */
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        /* al = rounds remaining; the flags byte in eax was already consumed. */
        mov     al, 7
/* Round loop head; reached again through "jmp 9b" below. */
9:
        /* First half-round, message vector xmm4. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1              /* row0 += msg + row1 */
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             /* row3 = (row3 ^ row0) rotated by 16 */
        paddd   xmm2, xmm3              /* row2 += row3 */
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11             /* row1 = (row1 ^ row2) rotated right by 12 */
        /* First half-round, message vector xmm5. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1              /* row0 += msg + row1 */
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             /* row3 = (row3 ^ row0) rotated right by 8 */
        paddd   xmm2, xmm3              /* row2 += row3 */
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11             /* row1 = (row1 ^ row2) rotated right by 7 */
        /*
         * Rotate the lanes of rows 0, 3 and 2 by one, two and three
         * positions (row 1 stays in place) so the second half-round mixes
         * a different grouping of state words.
         */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Second half-round, message vector xmm6. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1              /* row0 += msg + row1 */
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             /* row3 = (row3 ^ row0) rotated by 16 */
        paddd   xmm2, xmm3              /* row2 += row3 */
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11             /* row1 = (row1 ^ row2) rotated right by 12 */
        /* Second half-round, message vector xmm7. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1              /* row0 += msg + row1 */
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             /* row3 = (row3 ^ row0) rotated right by 8 */
        paddd   xmm2, xmm3              /* row2 += row3 */
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11             /* row1 = (row1 ^ row2) rotated right by 7 */
        /* Undo the lane rotation applied before the second half-round. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        /* Leave after the 7th round; the permutation below is then skipped. */
        dec     al
        jz      9f
        /*
         * Permute the message words held in xmm4-xmm7 for the next round.
         * xmm8 and xmm9 are temporaries; the new xmm5 and xmm6 are built in
         * xmm9 and xmm8 and copied over last, because the old xmm5 and xmm6
         * are still needed as sources until then.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
/* Loop exit, target of "jz 9f" above. */
9:
        /* Reload the 32 input-state bytes; xmm4/xmm5 are free now. */
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm0, xmm2              /* out[ 0..15] = row0 ^ row2 */
        pxor    xmm1, xmm3              /* out[16..31] = row1 ^ row3 */
        pxor    xmm2, xmm4              /* out[32..47] = row2 ^ input[ 0..15] */
        pxor    xmm3, xmm5              /* out[48..63] = row3 ^ input[16..31] */
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret
2012
2013
/*
 * Read-only constant tables for the SSE4.1 routines in this file.
 *
 * The block starts on a 64-byte boundary and every table is exactly
 * 16 bytes long, so each label is 16-byte aligned and can be used as an
 * aligned xmmword memory operand (movaps / movdqa / pand / pxor ...).
 * Keep every table at 16 bytes, or re-align, when adding entries.
 */
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
.p2align  6
/* The four IV words as one vector; loaded as state row 2 by the
   single-block compress routine. */
BLAKE3_IV:
        .long  0x6A09E667, 0xBB67AE85
        .long  0x3C6EF372, 0xA54FF53A
/* pshufb mask: swaps the 16-bit halves of each dword (rotate by 16). */
ROT16:
        .byte  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* pshufb mask: rotates each dword right by 8 bits. */
ROT8:
        .byte  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/* Per-lane offsets 0..3; the hash_many prologue ANDs them with a
   broadcast mask before adding them to the broadcast low counter word. */
ADD0:
        .long  0, 1, 2, 3
/* Constant 4 in every lane; the hash_many prologue ANDs it with the same
   broadcast mask and saves the result on its stack frame. */
ADD1:
        .long  4, 4, 4, 4
/* Each IV word broadcast to all four lanes. */
BLAKE3_IV_0:
        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 broadcast to all four lanes. */
BLAKE3_BLOCK_LEN:
        .long  64, 64, 64, 64
/* Sign-bit mask; XORed into both operands ahead of pcmpgtd so that the
   signed compare orders the values as unsigned 32-bit integers. */
CMP_MSB_MASK:
        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
2043
2044#endif
2045