; ---------------------------------------------------------------------------
; Module preamble: exported symbols and code-segment setup.
;
; Three routines are made externally visible, each under two spellings
; (with and without a leading underscore):
;   hash_many_sse41, compress_in_place_sse41, compress_xof_sse41.
; The PROC for hash_many opens directly below this preamble; the other two
; are presumably defined further down the file -- verify there.
;
; NOTE(review): every line in this listing starts with a decimal number fused
; to its first token (e.g. "1public", "8_TEXT"). That looks like an extraction
; artifact rather than intended MASM source -- confirm against the upstream
; file before assembling.
; ---------------------------------------------------------------------------
1public _llvm_blake3_hash_many_sse41
2public llvm_blake3_hash_many_sse41
3public llvm_blake3_compress_in_place_sse41
4public _llvm_blake3_compress_in_place_sse41
5public llvm_blake3_compress_xof_sse41
6public _llvm_blake3_compress_xof_sse41
7
; Open the _TEXT segment (class 'CODE') with 16-byte segment alignment.
8_TEXT   SEGMENT ALIGN(16) 'CODE'
9
; Place the procedure that follows on a 16-byte boundary.
10ALIGN   16
11llvm_blake3_hash_many_sse41 PROC
12_llvm_blake3_hash_many_sse41 PROC
13        push    r15
14        push    r14
15        push    r13
16        push    r12
17        push    rsi
18        push    rdi
19        push    rbx
20        push    rbp
21        mov     rbp, rsp
22        sub     rsp, 528
23        and     rsp, 0FFFFFFFFFFFFFFC0H
24        movdqa  xmmword ptr [rsp+170H], xmm6
25        movdqa  xmmword ptr [rsp+180H], xmm7
26        movdqa  xmmword ptr [rsp+190H], xmm8
27        movdqa  xmmword ptr [rsp+1A0H], xmm9
28        movdqa  xmmword ptr [rsp+1B0H], xmm10
29        movdqa  xmmword ptr [rsp+1C0H], xmm11
30        movdqa  xmmword ptr [rsp+1D0H], xmm12
31        movdqa  xmmword ptr [rsp+1E0H], xmm13
32        movdqa  xmmword ptr [rsp+1F0H], xmm14
33        movdqa  xmmword ptr [rsp+200H], xmm15
34        mov     rdi, rcx
35        mov     rsi, rdx
36        mov     rdx, r8
37        mov     rcx, r9
38        mov     r8, qword ptr [rbp+68H]
39        movzx   r9, byte ptr [rbp+70H]
40        neg     r9d
41        movd    xmm0, r9d
42        pshufd  xmm0, xmm0, 00H
43        movdqa  xmmword ptr [rsp+130H], xmm0
44        movdqa  xmm1, xmm0
45        pand    xmm1, xmmword ptr [ADD0]
46        pand    xmm0, xmmword ptr [ADD1]
47        movdqa  xmmword ptr [rsp+150H], xmm0
48        movd    xmm0, r8d
49        pshufd  xmm0, xmm0, 00H
50        paddd   xmm0, xmm1
51        movdqa  xmmword ptr [rsp+110H], xmm0
52        pxor    xmm0, xmmword ptr [CMP_MSB_MASK]
53        pxor    xmm1, xmmword ptr [CMP_MSB_MASK]
54        pcmpgtd xmm1, xmm0
55        shr     r8, 32
56        movd    xmm2, r8d
57        pshufd  xmm2, xmm2, 00H
58        psubd   xmm2, xmm1
59        movdqa  xmmword ptr [rsp+120H], xmm2
60        mov     rbx, qword ptr [rbp+90H]
61        mov     r15, rdx
62        shl     r15, 6
63        movzx   r13d, byte ptr [rbp+78H]
64        movzx   r12d, byte ptr [rbp+88H]
65        cmp     rsi, 4
66        jc      final3blocks
67outerloop4:
68        movdqu  xmm3, xmmword ptr [rcx]
69        pshufd  xmm0, xmm3, 00H
70        pshufd  xmm1, xmm3, 55H
71        pshufd  xmm2, xmm3, 0AAH
72        pshufd  xmm3, xmm3, 0FFH
73        movdqu  xmm7, xmmword ptr [rcx+10H]
74        pshufd  xmm4, xmm7, 00H
75        pshufd  xmm5, xmm7, 55H
76        pshufd  xmm6, xmm7, 0AAH
77        pshufd  xmm7, xmm7, 0FFH
78        mov     r8, qword ptr [rdi]
79        mov     r9, qword ptr [rdi+8H]
80        mov     r10, qword ptr [rdi+10H]
81        mov     r11, qword ptr [rdi+18H]
82        movzx   eax, byte ptr [rbp+80H]
83        or      eax, r13d
84        xor     edx, edx
85innerloop4:
86        mov     r14d, eax
87        or      eax, r12d
88        add     rdx, 64
89        cmp     rdx, r15
90        cmovne  eax, r14d
91        movdqu  xmm8, xmmword ptr [r8+rdx-40H]
92        movdqu  xmm9, xmmword ptr [r9+rdx-40H]
93        movdqu  xmm10, xmmword ptr [r10+rdx-40H]
94        movdqu  xmm11, xmmword ptr [r11+rdx-40H]
95        movdqa  xmm12, xmm8
96        punpckldq xmm8, xmm9
97        punpckhdq xmm12, xmm9
98        movdqa  xmm14, xmm10
99        punpckldq xmm10, xmm11
100        punpckhdq xmm14, xmm11
101        movdqa  xmm9, xmm8
102        punpcklqdq xmm8, xmm10
103        punpckhqdq xmm9, xmm10
104        movdqa  xmm13, xmm12
105        punpcklqdq xmm12, xmm14
106        punpckhqdq xmm13, xmm14
107        movdqa  xmmword ptr [rsp], xmm8
108        movdqa  xmmword ptr [rsp+10H], xmm9
109        movdqa  xmmword ptr [rsp+20H], xmm12
110        movdqa  xmmword ptr [rsp+30H], xmm13
111        movdqu  xmm8, xmmword ptr [r8+rdx-30H]
112        movdqu  xmm9, xmmword ptr [r9+rdx-30H]
113        movdqu  xmm10, xmmword ptr [r10+rdx-30H]
114        movdqu  xmm11, xmmword ptr [r11+rdx-30H]
115        movdqa  xmm12, xmm8
116        punpckldq xmm8, xmm9
117        punpckhdq xmm12, xmm9
118        movdqa  xmm14, xmm10
119        punpckldq xmm10, xmm11
120        punpckhdq xmm14, xmm11
121        movdqa  xmm9, xmm8
122        punpcklqdq xmm8, xmm10
123        punpckhqdq xmm9, xmm10
124        movdqa  xmm13, xmm12
125        punpcklqdq xmm12, xmm14
126        punpckhqdq xmm13, xmm14
127        movdqa  xmmword ptr [rsp+40H], xmm8
128        movdqa  xmmword ptr [rsp+50H], xmm9
129        movdqa  xmmword ptr [rsp+60H], xmm12
130        movdqa  xmmword ptr [rsp+70H], xmm13
131        movdqu  xmm8, xmmword ptr [r8+rdx-20H]
132        movdqu  xmm9, xmmword ptr [r9+rdx-20H]
133        movdqu  xmm10, xmmword ptr [r10+rdx-20H]
134        movdqu  xmm11, xmmword ptr [r11+rdx-20H]
135        movdqa  xmm12, xmm8
136        punpckldq xmm8, xmm9
137        punpckhdq xmm12, xmm9
138        movdqa  xmm14, xmm10
139        punpckldq xmm10, xmm11
140        punpckhdq xmm14, xmm11
141        movdqa  xmm9, xmm8
142        punpcklqdq xmm8, xmm10
143        punpckhqdq xmm9, xmm10
144        movdqa  xmm13, xmm12
145        punpcklqdq xmm12, xmm14
146        punpckhqdq xmm13, xmm14
147        movdqa  xmmword ptr [rsp+80H], xmm8
148        movdqa  xmmword ptr [rsp+90H], xmm9
149        movdqa  xmmword ptr [rsp+0A0H], xmm12
150        movdqa  xmmword ptr [rsp+0B0H], xmm13
151        movdqu  xmm8, xmmword ptr [r8+rdx-10H]
152        movdqu  xmm9, xmmword ptr [r9+rdx-10H]
153        movdqu  xmm10, xmmword ptr [r10+rdx-10H]
154        movdqu  xmm11, xmmword ptr [r11+rdx-10H]
155        movdqa  xmm12, xmm8
156        punpckldq xmm8, xmm9
157        punpckhdq xmm12, xmm9
158        movdqa  xmm14, xmm10
159        punpckldq xmm10, xmm11
160        punpckhdq xmm14, xmm11
161        movdqa  xmm9, xmm8
162        punpcklqdq xmm8, xmm10
163        punpckhqdq xmm9, xmm10
164        movdqa  xmm13, xmm12
165        punpcklqdq xmm12, xmm14
166        punpckhqdq xmm13, xmm14
167        movdqa  xmmword ptr [rsp+0C0H], xmm8
168        movdqa  xmmword ptr [rsp+0D0H], xmm9
169        movdqa  xmmword ptr [rsp+0E0H], xmm12
170        movdqa  xmmword ptr [rsp+0F0H], xmm13
171        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1]
172        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2]
173        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3]
174        movdqa  xmm12, xmmword ptr [rsp+110H]
175        movdqa  xmm13, xmmword ptr [rsp+120H]
176        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
177        movd    xmm15, eax
178        pshufd  xmm15, xmm15, 00H
179        prefetcht0 byte ptr [r8+rdx+80H]
180        prefetcht0 byte ptr [r9+rdx+80H]
181        prefetcht0 byte ptr [r10+rdx+80H]
182        prefetcht0 byte ptr [r11+rdx+80H]
183        paddd   xmm0, xmmword ptr [rsp]
184        paddd   xmm1, xmmword ptr [rsp+20H]
185        paddd   xmm2, xmmword ptr [rsp+40H]
186        paddd   xmm3, xmmword ptr [rsp+60H]
187        paddd   xmm0, xmm4
188        paddd   xmm1, xmm5
189        paddd   xmm2, xmm6
190        paddd   xmm3, xmm7
191        pxor    xmm12, xmm0
192        pxor    xmm13, xmm1
193        pxor    xmm14, xmm2
194        pxor    xmm15, xmm3
195        movdqa  xmm8, xmmword ptr [ROT16]
196        pshufb  xmm12, xmm8
197        pshufb  xmm13, xmm8
198        pshufb  xmm14, xmm8
199        pshufb  xmm15, xmm8
200        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0]
201        paddd   xmm8, xmm12
202        paddd   xmm9, xmm13
203        paddd   xmm10, xmm14
204        paddd   xmm11, xmm15
205        pxor    xmm4, xmm8
206        pxor    xmm5, xmm9
207        pxor    xmm6, xmm10
208        pxor    xmm7, xmm11
209        movdqa  xmmword ptr [rsp+100H], xmm8
210        movdqa  xmm8, xmm4
211        psrld   xmm8, 12
212        pslld   xmm4, 20
213        por     xmm4, xmm8
214        movdqa  xmm8, xmm5
215        psrld   xmm8, 12
216        pslld   xmm5, 20
217        por     xmm5, xmm8
218        movdqa  xmm8, xmm6
219        psrld   xmm8, 12
220        pslld   xmm6, 20
221        por     xmm6, xmm8
222        movdqa  xmm8, xmm7
223        psrld   xmm8, 12
224        pslld   xmm7, 20
225        por     xmm7, xmm8
226        paddd   xmm0, xmmword ptr [rsp+10H]
227        paddd   xmm1, xmmword ptr [rsp+30H]
228        paddd   xmm2, xmmword ptr [rsp+50H]
229        paddd   xmm3, xmmword ptr [rsp+70H]
230        paddd   xmm0, xmm4
231        paddd   xmm1, xmm5
232        paddd   xmm2, xmm6
233        paddd   xmm3, xmm7
234        pxor    xmm12, xmm0
235        pxor    xmm13, xmm1
236        pxor    xmm14, xmm2
237        pxor    xmm15, xmm3
238        movdqa  xmm8, xmmword ptr [ROT8]
239        pshufb  xmm12, xmm8
240        pshufb  xmm13, xmm8
241        pshufb  xmm14, xmm8
242        pshufb  xmm15, xmm8
243        movdqa  xmm8, xmmword ptr [rsp+100H]
244        paddd   xmm8, xmm12
245        paddd   xmm9, xmm13
246        paddd   xmm10, xmm14
247        paddd   xmm11, xmm15
248        pxor    xmm4, xmm8
249        pxor    xmm5, xmm9
250        pxor    xmm6, xmm10
251        pxor    xmm7, xmm11
252        movdqa  xmmword ptr [rsp+100H], xmm8
253        movdqa  xmm8, xmm4
254        psrld   xmm8, 7
255        pslld   xmm4, 25
256        por     xmm4, xmm8
257        movdqa  xmm8, xmm5
258        psrld   xmm8, 7
259        pslld   xmm5, 25
260        por     xmm5, xmm8
261        movdqa  xmm8, xmm6
262        psrld   xmm8, 7
263        pslld   xmm6, 25
264        por     xmm6, xmm8
265        movdqa  xmm8, xmm7
266        psrld   xmm8, 7
267        pslld   xmm7, 25
268        por     xmm7, xmm8
269        paddd   xmm0, xmmword ptr [rsp+80H]
270        paddd   xmm1, xmmword ptr [rsp+0A0H]
271        paddd   xmm2, xmmword ptr [rsp+0C0H]
272        paddd   xmm3, xmmword ptr [rsp+0E0H]
273        paddd   xmm0, xmm5
274        paddd   xmm1, xmm6
275        paddd   xmm2, xmm7
276        paddd   xmm3, xmm4
277        pxor    xmm15, xmm0
278        pxor    xmm12, xmm1
279        pxor    xmm13, xmm2
280        pxor    xmm14, xmm3
281        movdqa  xmm8, xmmword ptr [ROT16]
282        pshufb  xmm15, xmm8
283        pshufb  xmm12, xmm8
284        pshufb  xmm13, xmm8
285        pshufb  xmm14, xmm8
286        paddd   xmm10, xmm15
287        paddd   xmm11, xmm12
288        movdqa  xmm8, xmmword ptr [rsp+100H]
289        paddd   xmm8, xmm13
290        paddd   xmm9, xmm14
291        pxor    xmm5, xmm10
292        pxor    xmm6, xmm11
293        pxor    xmm7, xmm8
294        pxor    xmm4, xmm9
295        movdqa  xmmword ptr [rsp+100H], xmm8
296        movdqa  xmm8, xmm5
297        psrld   xmm8, 12
298        pslld   xmm5, 20
299        por     xmm5, xmm8
300        movdqa  xmm8, xmm6
301        psrld   xmm8, 12
302        pslld   xmm6, 20
303        por     xmm6, xmm8
304        movdqa  xmm8, xmm7
305        psrld   xmm8, 12
306        pslld   xmm7, 20
307        por     xmm7, xmm8
308        movdqa  xmm8, xmm4
309        psrld   xmm8, 12
310        pslld   xmm4, 20
311        por     xmm4, xmm8
312        paddd   xmm0, xmmword ptr [rsp+90H]
313        paddd   xmm1, xmmword ptr [rsp+0B0H]
314        paddd   xmm2, xmmword ptr [rsp+0D0H]
315        paddd   xmm3, xmmword ptr [rsp+0F0H]
316        paddd   xmm0, xmm5
317        paddd   xmm1, xmm6
318        paddd   xmm2, xmm7
319        paddd   xmm3, xmm4
320        pxor    xmm15, xmm0
321        pxor    xmm12, xmm1
322        pxor    xmm13, xmm2
323        pxor    xmm14, xmm3
324        movdqa  xmm8, xmmword ptr [ROT8]
325        pshufb  xmm15, xmm8
326        pshufb  xmm12, xmm8
327        pshufb  xmm13, xmm8
328        pshufb  xmm14, xmm8
329        paddd   xmm10, xmm15
330        paddd   xmm11, xmm12
331        movdqa  xmm8, xmmword ptr [rsp+100H]
332        paddd   xmm8, xmm13
333        paddd   xmm9, xmm14
334        pxor    xmm5, xmm10
335        pxor    xmm6, xmm11
336        pxor    xmm7, xmm8
337        pxor    xmm4, xmm9
338        movdqa  xmmword ptr [rsp+100H], xmm8
339        movdqa  xmm8, xmm5
340        psrld   xmm8, 7
341        pslld   xmm5, 25
342        por     xmm5, xmm8
343        movdqa  xmm8, xmm6
344        psrld   xmm8, 7
345        pslld   xmm6, 25
346        por     xmm6, xmm8
347        movdqa  xmm8, xmm7
348        psrld   xmm8, 7
349        pslld   xmm7, 25
350        por     xmm7, xmm8
351        movdqa  xmm8, xmm4
352        psrld   xmm8, 7
353        pslld   xmm4, 25
354        por     xmm4, xmm8
355        paddd   xmm0, xmmword ptr [rsp+20H]
356        paddd   xmm1, xmmword ptr [rsp+30H]
357        paddd   xmm2, xmmword ptr [rsp+70H]
358        paddd   xmm3, xmmword ptr [rsp+40H]
359        paddd   xmm0, xmm4
360        paddd   xmm1, xmm5
361        paddd   xmm2, xmm6
362        paddd   xmm3, xmm7
363        pxor    xmm12, xmm0
364        pxor    xmm13, xmm1
365        pxor    xmm14, xmm2
366        pxor    xmm15, xmm3
367        movdqa  xmm8, xmmword ptr [ROT16]
368        pshufb  xmm12, xmm8
369        pshufb  xmm13, xmm8
370        pshufb  xmm14, xmm8
371        pshufb  xmm15, xmm8
372        movdqa  xmm8, xmmword ptr [rsp+100H]
373        paddd   xmm8, xmm12
374        paddd   xmm9, xmm13
375        paddd   xmm10, xmm14
376        paddd   xmm11, xmm15
377        pxor    xmm4, xmm8
378        pxor    xmm5, xmm9
379        pxor    xmm6, xmm10
380        pxor    xmm7, xmm11
381        movdqa  xmmword ptr [rsp+100H], xmm8
382        movdqa  xmm8, xmm4
383        psrld   xmm8, 12
384        pslld   xmm4, 20
385        por     xmm4, xmm8
386        movdqa  xmm8, xmm5
387        psrld   xmm8, 12
388        pslld   xmm5, 20
389        por     xmm5, xmm8
390        movdqa  xmm8, xmm6
391        psrld   xmm8, 12
392        pslld   xmm6, 20
393        por     xmm6, xmm8
394        movdqa  xmm8, xmm7
395        psrld   xmm8, 12
396        pslld   xmm7, 20
397        por     xmm7, xmm8
398        paddd   xmm0, xmmword ptr [rsp+60H]
399        paddd   xmm1, xmmword ptr [rsp+0A0H]
400        paddd   xmm2, xmmword ptr [rsp]
401        paddd   xmm3, xmmword ptr [rsp+0D0H]
402        paddd   xmm0, xmm4
403        paddd   xmm1, xmm5
404        paddd   xmm2, xmm6
405        paddd   xmm3, xmm7
406        pxor    xmm12, xmm0
407        pxor    xmm13, xmm1
408        pxor    xmm14, xmm2
409        pxor    xmm15, xmm3
410        movdqa  xmm8, xmmword ptr [ROT8]
411        pshufb  xmm12, xmm8
412        pshufb  xmm13, xmm8
413        pshufb  xmm14, xmm8
414        pshufb  xmm15, xmm8
415        movdqa  xmm8, xmmword ptr [rsp+100H]
416        paddd   xmm8, xmm12
417        paddd   xmm9, xmm13
418        paddd   xmm10, xmm14
419        paddd   xmm11, xmm15
420        pxor    xmm4, xmm8
421        pxor    xmm5, xmm9
422        pxor    xmm6, xmm10
423        pxor    xmm7, xmm11
424        movdqa  xmmword ptr [rsp+100H], xmm8
425        movdqa  xmm8, xmm4
426        psrld   xmm8, 7
427        pslld   xmm4, 25
428        por     xmm4, xmm8
429        movdqa  xmm8, xmm5
430        psrld   xmm8, 7
431        pslld   xmm5, 25
432        por     xmm5, xmm8
433        movdqa  xmm8, xmm6
434        psrld   xmm8, 7
435        pslld   xmm6, 25
436        por     xmm6, xmm8
437        movdqa  xmm8, xmm7
438        psrld   xmm8, 7
439        pslld   xmm7, 25
440        por     xmm7, xmm8
441        paddd   xmm0, xmmword ptr [rsp+10H]
442        paddd   xmm1, xmmword ptr [rsp+0C0H]
443        paddd   xmm2, xmmword ptr [rsp+90H]
444        paddd   xmm3, xmmword ptr [rsp+0F0H]
445        paddd   xmm0, xmm5
446        paddd   xmm1, xmm6
447        paddd   xmm2, xmm7
448        paddd   xmm3, xmm4
449        pxor    xmm15, xmm0
450        pxor    xmm12, xmm1
451        pxor    xmm13, xmm2
452        pxor    xmm14, xmm3
453        movdqa  xmm8, xmmword ptr [ROT16]
454        pshufb  xmm15, xmm8
455        pshufb  xmm12, xmm8
456        pshufb  xmm13, xmm8
457        pshufb  xmm14, xmm8
458        paddd   xmm10, xmm15
459        paddd   xmm11, xmm12
460        movdqa  xmm8, xmmword ptr [rsp+100H]
461        paddd   xmm8, xmm13
462        paddd   xmm9, xmm14
463        pxor    xmm5, xmm10
464        pxor    xmm6, xmm11
465        pxor    xmm7, xmm8
466        pxor    xmm4, xmm9
467        movdqa  xmmword ptr [rsp+100H], xmm8
468        movdqa  xmm8, xmm5
469        psrld   xmm8, 12
470        pslld   xmm5, 20
471        por     xmm5, xmm8
472        movdqa  xmm8, xmm6
473        psrld   xmm8, 12
474        pslld   xmm6, 20
475        por     xmm6, xmm8
476        movdqa  xmm8, xmm7
477        psrld   xmm8, 12
478        pslld   xmm7, 20
479        por     xmm7, xmm8
480        movdqa  xmm8, xmm4
481        psrld   xmm8, 12
482        pslld   xmm4, 20
483        por     xmm4, xmm8
484        paddd   xmm0, xmmword ptr [rsp+0B0H]
485        paddd   xmm1, xmmword ptr [rsp+50H]
486        paddd   xmm2, xmmword ptr [rsp+0E0H]
487        paddd   xmm3, xmmword ptr [rsp+80H]
488        paddd   xmm0, xmm5
489        paddd   xmm1, xmm6
490        paddd   xmm2, xmm7
491        paddd   xmm3, xmm4
492        pxor    xmm15, xmm0
493        pxor    xmm12, xmm1
494        pxor    xmm13, xmm2
495        pxor    xmm14, xmm3
496        movdqa  xmm8, xmmword ptr [ROT8]
497        pshufb  xmm15, xmm8
498        pshufb  xmm12, xmm8
499        pshufb  xmm13, xmm8
500        pshufb  xmm14, xmm8
501        paddd   xmm10, xmm15
502        paddd   xmm11, xmm12
503        movdqa  xmm8, xmmword ptr [rsp+100H]
504        paddd   xmm8, xmm13
505        paddd   xmm9, xmm14
506        pxor    xmm5, xmm10
507        pxor    xmm6, xmm11
508        pxor    xmm7, xmm8
509        pxor    xmm4, xmm9
510        movdqa  xmmword ptr [rsp+100H], xmm8
511        movdqa  xmm8, xmm5
512        psrld   xmm8, 7
513        pslld   xmm5, 25
514        por     xmm5, xmm8
515        movdqa  xmm8, xmm6
516        psrld   xmm8, 7
517        pslld   xmm6, 25
518        por     xmm6, xmm8
519        movdqa  xmm8, xmm7
520        psrld   xmm8, 7
521        pslld   xmm7, 25
522        por     xmm7, xmm8
523        movdqa  xmm8, xmm4
524        psrld   xmm8, 7
525        pslld   xmm4, 25
526        por     xmm4, xmm8
527        paddd   xmm0, xmmword ptr [rsp+30H]
528        paddd   xmm1, xmmword ptr [rsp+0A0H]
529        paddd   xmm2, xmmword ptr [rsp+0D0H]
530        paddd   xmm3, xmmword ptr [rsp+70H]
531        paddd   xmm0, xmm4
532        paddd   xmm1, xmm5
533        paddd   xmm2, xmm6
534        paddd   xmm3, xmm7
535        pxor    xmm12, xmm0
536        pxor    xmm13, xmm1
537        pxor    xmm14, xmm2
538        pxor    xmm15, xmm3
539        movdqa  xmm8, xmmword ptr [ROT16]
540        pshufb  xmm12, xmm8
541        pshufb  xmm13, xmm8
542        pshufb  xmm14, xmm8
543        pshufb  xmm15, xmm8
544        movdqa  xmm8, xmmword ptr [rsp+100H]
545        paddd   xmm8, xmm12
546        paddd   xmm9, xmm13
547        paddd   xmm10, xmm14
548        paddd   xmm11, xmm15
549        pxor    xmm4, xmm8
550        pxor    xmm5, xmm9
551        pxor    xmm6, xmm10
552        pxor    xmm7, xmm11
553        movdqa  xmmword ptr [rsp+100H], xmm8
554        movdqa  xmm8, xmm4
555        psrld   xmm8, 12
556        pslld   xmm4, 20
557        por     xmm4, xmm8
558        movdqa  xmm8, xmm5
559        psrld   xmm8, 12
560        pslld   xmm5, 20
561        por     xmm5, xmm8
562        movdqa  xmm8, xmm6
563        psrld   xmm8, 12
564        pslld   xmm6, 20
565        por     xmm6, xmm8
566        movdqa  xmm8, xmm7
567        psrld   xmm8, 12
568        pslld   xmm7, 20
569        por     xmm7, xmm8
570        paddd   xmm0, xmmword ptr [rsp+40H]
571        paddd   xmm1, xmmword ptr [rsp+0C0H]
572        paddd   xmm2, xmmword ptr [rsp+20H]
573        paddd   xmm3, xmmword ptr [rsp+0E0H]
574        paddd   xmm0, xmm4
575        paddd   xmm1, xmm5
576        paddd   xmm2, xmm6
577        paddd   xmm3, xmm7
578        pxor    xmm12, xmm0
579        pxor    xmm13, xmm1
580        pxor    xmm14, xmm2
581        pxor    xmm15, xmm3
582        movdqa  xmm8, xmmword ptr [ROT8]
583        pshufb  xmm12, xmm8
584        pshufb  xmm13, xmm8
585        pshufb  xmm14, xmm8
586        pshufb  xmm15, xmm8
587        movdqa  xmm8, xmmword ptr [rsp+100H]
588        paddd   xmm8, xmm12
589        paddd   xmm9, xmm13
590        paddd   xmm10, xmm14
591        paddd   xmm11, xmm15
592        pxor    xmm4, xmm8
593        pxor    xmm5, xmm9
594        pxor    xmm6, xmm10
595        pxor    xmm7, xmm11
596        movdqa  xmmword ptr [rsp+100H], xmm8
597        movdqa  xmm8, xmm4
598        psrld   xmm8, 7
599        pslld   xmm4, 25
600        por     xmm4, xmm8
601        movdqa  xmm8, xmm5
602        psrld   xmm8, 7
603        pslld   xmm5, 25
604        por     xmm5, xmm8
605        movdqa  xmm8, xmm6
606        psrld   xmm8, 7
607        pslld   xmm6, 25
608        por     xmm6, xmm8
609        movdqa  xmm8, xmm7
610        psrld   xmm8, 7
611        pslld   xmm7, 25
612        por     xmm7, xmm8
613        paddd   xmm0, xmmword ptr [rsp+60H]
614        paddd   xmm1, xmmword ptr [rsp+90H]
615        paddd   xmm2, xmmword ptr [rsp+0B0H]
616        paddd   xmm3, xmmword ptr [rsp+80H]
617        paddd   xmm0, xmm5
618        paddd   xmm1, xmm6
619        paddd   xmm2, xmm7
620        paddd   xmm3, xmm4
621        pxor    xmm15, xmm0
622        pxor    xmm12, xmm1
623        pxor    xmm13, xmm2
624        pxor    xmm14, xmm3
625        movdqa  xmm8, xmmword ptr [ROT16]
626        pshufb  xmm15, xmm8
627        pshufb  xmm12, xmm8
628        pshufb  xmm13, xmm8
629        pshufb  xmm14, xmm8
630        paddd   xmm10, xmm15
631        paddd   xmm11, xmm12
632        movdqa  xmm8, xmmword ptr [rsp+100H]
633        paddd   xmm8, xmm13
634        paddd   xmm9, xmm14
635        pxor    xmm5, xmm10
636        pxor    xmm6, xmm11
637        pxor    xmm7, xmm8
638        pxor    xmm4, xmm9
639        movdqa  xmmword ptr [rsp+100H], xmm8
640        movdqa  xmm8, xmm5
641        psrld   xmm8, 12
642        pslld   xmm5, 20
643        por     xmm5, xmm8
644        movdqa  xmm8, xmm6
645        psrld   xmm8, 12
646        pslld   xmm6, 20
647        por     xmm6, xmm8
648        movdqa  xmm8, xmm7
649        psrld   xmm8, 12
650        pslld   xmm7, 20
651        por     xmm7, xmm8
652        movdqa  xmm8, xmm4
653        psrld   xmm8, 12
654        pslld   xmm4, 20
655        por     xmm4, xmm8
656        paddd   xmm0, xmmword ptr [rsp+50H]
657        paddd   xmm1, xmmword ptr [rsp]
658        paddd   xmm2, xmmword ptr [rsp+0F0H]
659        paddd   xmm3, xmmword ptr [rsp+10H]
660        paddd   xmm0, xmm5
661        paddd   xmm1, xmm6
662        paddd   xmm2, xmm7
663        paddd   xmm3, xmm4
664        pxor    xmm15, xmm0
665        pxor    xmm12, xmm1
666        pxor    xmm13, xmm2
667        pxor    xmm14, xmm3
668        movdqa  xmm8, xmmword ptr [ROT8]
669        pshufb  xmm15, xmm8
670        pshufb  xmm12, xmm8
671        pshufb  xmm13, xmm8
672        pshufb  xmm14, xmm8
673        paddd   xmm10, xmm15
674        paddd   xmm11, xmm12
675        movdqa  xmm8, xmmword ptr [rsp+100H]
676        paddd   xmm8, xmm13
677        paddd   xmm9, xmm14
678        pxor    xmm5, xmm10
679        pxor    xmm6, xmm11
680        pxor    xmm7, xmm8
681        pxor    xmm4, xmm9
682        movdqa  xmmword ptr [rsp+100H], xmm8
683        movdqa  xmm8, xmm5
684        psrld   xmm8, 7
685        pslld   xmm5, 25
686        por     xmm5, xmm8
687        movdqa  xmm8, xmm6
688        psrld   xmm8, 7
689        pslld   xmm6, 25
690        por     xmm6, xmm8
691        movdqa  xmm8, xmm7
692        psrld   xmm8, 7
693        pslld   xmm7, 25
694        por     xmm7, xmm8
695        movdqa  xmm8, xmm4
696        psrld   xmm8, 7
697        pslld   xmm4, 25
698        por     xmm4, xmm8
699        paddd   xmm0, xmmword ptr [rsp+0A0H]
700        paddd   xmm1, xmmword ptr [rsp+0C0H]
701        paddd   xmm2, xmmword ptr [rsp+0E0H]
702        paddd   xmm3, xmmword ptr [rsp+0D0H]
703        paddd   xmm0, xmm4
704        paddd   xmm1, xmm5
705        paddd   xmm2, xmm6
706        paddd   xmm3, xmm7
707        pxor    xmm12, xmm0
708        pxor    xmm13, xmm1
709        pxor    xmm14, xmm2
710        pxor    xmm15, xmm3
711        movdqa  xmm8, xmmword ptr [ROT16]
712        pshufb  xmm12, xmm8
713        pshufb  xmm13, xmm8
714        pshufb  xmm14, xmm8
715        pshufb  xmm15, xmm8
716        movdqa  xmm8, xmmword ptr [rsp+100H]
717        paddd   xmm8, xmm12
718        paddd   xmm9, xmm13
719        paddd   xmm10, xmm14
720        paddd   xmm11, xmm15
721        pxor    xmm4, xmm8
722        pxor    xmm5, xmm9
723        pxor    xmm6, xmm10
724        pxor    xmm7, xmm11
725        movdqa  xmmword ptr [rsp+100H], xmm8
726        movdqa  xmm8, xmm4
727        psrld   xmm8, 12
728        pslld   xmm4, 20
729        por     xmm4, xmm8
730        movdqa  xmm8, xmm5
731        psrld   xmm8, 12
732        pslld   xmm5, 20
733        por     xmm5, xmm8
734        movdqa  xmm8, xmm6
735        psrld   xmm8, 12
736        pslld   xmm6, 20
737        por     xmm6, xmm8
738        movdqa  xmm8, xmm7
739        psrld   xmm8, 12
740        pslld   xmm7, 20
741        por     xmm7, xmm8
742        paddd   xmm0, xmmword ptr [rsp+70H]
743        paddd   xmm1, xmmword ptr [rsp+90H]
744        paddd   xmm2, xmmword ptr [rsp+30H]
745        paddd   xmm3, xmmword ptr [rsp+0F0H]
746        paddd   xmm0, xmm4
747        paddd   xmm1, xmm5
748        paddd   xmm2, xmm6
749        paddd   xmm3, xmm7
750        pxor    xmm12, xmm0
751        pxor    xmm13, xmm1
752        pxor    xmm14, xmm2
753        pxor    xmm15, xmm3
754        movdqa  xmm8, xmmword ptr [ROT8]
755        pshufb  xmm12, xmm8
756        pshufb  xmm13, xmm8
757        pshufb  xmm14, xmm8
758        pshufb  xmm15, xmm8
759        movdqa  xmm8, xmmword ptr [rsp+100H]
760        paddd   xmm8, xmm12
761        paddd   xmm9, xmm13
762        paddd   xmm10, xmm14
763        paddd   xmm11, xmm15
764        pxor    xmm4, xmm8
765        pxor    xmm5, xmm9
766        pxor    xmm6, xmm10
767        pxor    xmm7, xmm11
768        movdqa  xmmword ptr [rsp+100H], xmm8
769        movdqa  xmm8, xmm4
770        psrld   xmm8, 7
771        pslld   xmm4, 25
772        por     xmm4, xmm8
773        movdqa  xmm8, xmm5
774        psrld   xmm8, 7
775        pslld   xmm5, 25
776        por     xmm5, xmm8
777        movdqa  xmm8, xmm6
778        psrld   xmm8, 7
779        pslld   xmm6, 25
780        por     xmm6, xmm8
781        movdqa  xmm8, xmm7
782        psrld   xmm8, 7
783        pslld   xmm7, 25
784        por     xmm7, xmm8
785        paddd   xmm0, xmmword ptr [rsp+40H]
786        paddd   xmm1, xmmword ptr [rsp+0B0H]
787        paddd   xmm2, xmmword ptr [rsp+50H]
788        paddd   xmm3, xmmword ptr [rsp+10H]
789        paddd   xmm0, xmm5
790        paddd   xmm1, xmm6
791        paddd   xmm2, xmm7
792        paddd   xmm3, xmm4
793        pxor    xmm15, xmm0
794        pxor    xmm12, xmm1
795        pxor    xmm13, xmm2
796        pxor    xmm14, xmm3
797        movdqa  xmm8, xmmword ptr [ROT16]
798        pshufb  xmm15, xmm8
799        pshufb  xmm12, xmm8
800        pshufb  xmm13, xmm8
801        pshufb  xmm14, xmm8
802        paddd   xmm10, xmm15
803        paddd   xmm11, xmm12
804        movdqa  xmm8, xmmword ptr [rsp+100H]
805        paddd   xmm8, xmm13
806        paddd   xmm9, xmm14
807        pxor    xmm5, xmm10
808        pxor    xmm6, xmm11
809        pxor    xmm7, xmm8
810        pxor    xmm4, xmm9
811        movdqa  xmmword ptr [rsp+100H], xmm8
812        movdqa  xmm8, xmm5
813        psrld   xmm8, 12
814        pslld   xmm5, 20
815        por     xmm5, xmm8
816        movdqa  xmm8, xmm6
817        psrld   xmm8, 12
818        pslld   xmm6, 20
819        por     xmm6, xmm8
820        movdqa  xmm8, xmm7
821        psrld   xmm8, 12
822        pslld   xmm7, 20
823        por     xmm7, xmm8
824        movdqa  xmm8, xmm4
825        psrld   xmm8, 12
826        pslld   xmm4, 20
827        por     xmm4, xmm8
828        paddd   xmm0, xmmword ptr [rsp]
829        paddd   xmm1, xmmword ptr [rsp+20H]
830        paddd   xmm2, xmmword ptr [rsp+80H]
831        paddd   xmm3, xmmword ptr [rsp+60H]
832        paddd   xmm0, xmm5
833        paddd   xmm1, xmm6
834        paddd   xmm2, xmm7
835        paddd   xmm3, xmm4
836        pxor    xmm15, xmm0
837        pxor    xmm12, xmm1
838        pxor    xmm13, xmm2
839        pxor    xmm14, xmm3
840        movdqa  xmm8, xmmword ptr [ROT8]
841        pshufb  xmm15, xmm8
842        pshufb  xmm12, xmm8
843        pshufb  xmm13, xmm8
844        pshufb  xmm14, xmm8
845        paddd   xmm10, xmm15
846        paddd   xmm11, xmm12
847        movdqa  xmm8, xmmword ptr [rsp+100H]
848        paddd   xmm8, xmm13
849        paddd   xmm9, xmm14
850        pxor    xmm5, xmm10
851        pxor    xmm6, xmm11
852        pxor    xmm7, xmm8
853        pxor    xmm4, xmm9
854        movdqa  xmmword ptr [rsp+100H], xmm8
855        movdqa  xmm8, xmm5
856        psrld   xmm8, 7
857        pslld   xmm5, 25
858        por     xmm5, xmm8
859        movdqa  xmm8, xmm6
860        psrld   xmm8, 7
861        pslld   xmm6, 25
862        por     xmm6, xmm8
863        movdqa  xmm8, xmm7
864        psrld   xmm8, 7
865        pslld   xmm7, 25
866        por     xmm7, xmm8
867        movdqa  xmm8, xmm4
868        psrld   xmm8, 7
869        pslld   xmm4, 25
870        por     xmm4, xmm8
871        paddd   xmm0, xmmword ptr [rsp+0C0H]
872        paddd   xmm1, xmmword ptr [rsp+90H]
873        paddd   xmm2, xmmword ptr [rsp+0F0H]
874        paddd   xmm3, xmmword ptr [rsp+0E0H]
875        paddd   xmm0, xmm4
876        paddd   xmm1, xmm5
877        paddd   xmm2, xmm6
878        paddd   xmm3, xmm7
879        pxor    xmm12, xmm0
880        pxor    xmm13, xmm1
881        pxor    xmm14, xmm2
882        pxor    xmm15, xmm3
883        movdqa  xmm8, xmmword ptr [ROT16]
884        pshufb  xmm12, xmm8
885        pshufb  xmm13, xmm8
886        pshufb  xmm14, xmm8
887        pshufb  xmm15, xmm8
888        movdqa  xmm8, xmmword ptr [rsp+100H]
889        paddd   xmm8, xmm12
890        paddd   xmm9, xmm13
891        paddd   xmm10, xmm14
892        paddd   xmm11, xmm15
893        pxor    xmm4, xmm8
894        pxor    xmm5, xmm9
895        pxor    xmm6, xmm10
896        pxor    xmm7, xmm11
897        movdqa  xmmword ptr [rsp+100H], xmm8
898        movdqa  xmm8, xmm4
899        psrld   xmm8, 12
900        pslld   xmm4, 20
901        por     xmm4, xmm8
902        movdqa  xmm8, xmm5
903        psrld   xmm8, 12
904        pslld   xmm5, 20
905        por     xmm5, xmm8
906        movdqa  xmm8, xmm6
907        psrld   xmm8, 12
908        pslld   xmm6, 20
909        por     xmm6, xmm8
910        movdqa  xmm8, xmm7
911        psrld   xmm8, 12
912        pslld   xmm7, 20
913        por     xmm7, xmm8
914        paddd   xmm0, xmmword ptr [rsp+0D0H]
915        paddd   xmm1, xmmword ptr [rsp+0B0H]
916        paddd   xmm2, xmmword ptr [rsp+0A0H]
917        paddd   xmm3, xmmword ptr [rsp+80H]
918        paddd   xmm0, xmm4
919        paddd   xmm1, xmm5
920        paddd   xmm2, xmm6
921        paddd   xmm3, xmm7
922        pxor    xmm12, xmm0
923        pxor    xmm13, xmm1
924        pxor    xmm14, xmm2
925        pxor    xmm15, xmm3
926        movdqa  xmm8, xmmword ptr [ROT8]
927        pshufb  xmm12, xmm8
928        pshufb  xmm13, xmm8
929        pshufb  xmm14, xmm8
930        pshufb  xmm15, xmm8
931        movdqa  xmm8, xmmword ptr [rsp+100H]
932        paddd   xmm8, xmm12
933        paddd   xmm9, xmm13
934        paddd   xmm10, xmm14
935        paddd   xmm11, xmm15
936        pxor    xmm4, xmm8
937        pxor    xmm5, xmm9
938        pxor    xmm6, xmm10
939        pxor    xmm7, xmm11
940        movdqa  xmmword ptr [rsp+100H], xmm8
941        movdqa  xmm8, xmm4
942        psrld   xmm8, 7
943        pslld   xmm4, 25
944        por     xmm4, xmm8
945        movdqa  xmm8, xmm5
946        psrld   xmm8, 7
947        pslld   xmm5, 25
948        por     xmm5, xmm8
949        movdqa  xmm8, xmm6
950        psrld   xmm8, 7
951        pslld   xmm6, 25
952        por     xmm6, xmm8
953        movdqa  xmm8, xmm7
954        psrld   xmm8, 7
955        pslld   xmm7, 25
956        por     xmm7, xmm8
957        paddd   xmm0, xmmword ptr [rsp+70H]
958        paddd   xmm1, xmmword ptr [rsp+50H]
959        paddd   xmm2, xmmword ptr [rsp]
960        paddd   xmm3, xmmword ptr [rsp+60H]
961        paddd   xmm0, xmm5
962        paddd   xmm1, xmm6
963        paddd   xmm2, xmm7
964        paddd   xmm3, xmm4
965        pxor    xmm15, xmm0
966        pxor    xmm12, xmm1
967        pxor    xmm13, xmm2
968        pxor    xmm14, xmm3
969        movdqa  xmm8, xmmword ptr [ROT16]
970        pshufb  xmm15, xmm8
971        pshufb  xmm12, xmm8
972        pshufb  xmm13, xmm8
973        pshufb  xmm14, xmm8
974        paddd   xmm10, xmm15
975        paddd   xmm11, xmm12
976        movdqa  xmm8, xmmword ptr [rsp+100H]
977        paddd   xmm8, xmm13
978        paddd   xmm9, xmm14
979        pxor    xmm5, xmm10
980        pxor    xmm6, xmm11
981        pxor    xmm7, xmm8
982        pxor    xmm4, xmm9
983        movdqa  xmmword ptr [rsp+100H], xmm8
984        movdqa  xmm8, xmm5
985        psrld   xmm8, 12
986        pslld   xmm5, 20
987        por     xmm5, xmm8
988        movdqa  xmm8, xmm6
989        psrld   xmm8, 12
990        pslld   xmm6, 20
991        por     xmm6, xmm8
992        movdqa  xmm8, xmm7
993        psrld   xmm8, 12
994        pslld   xmm7, 20
995        por     xmm7, xmm8
996        movdqa  xmm8, xmm4
997        psrld   xmm8, 12
998        pslld   xmm4, 20
999        por     xmm4, xmm8
1000        paddd   xmm0, xmmword ptr [rsp+20H]
1001        paddd   xmm1, xmmword ptr [rsp+30H]
1002        paddd   xmm2, xmmword ptr [rsp+10H]
1003        paddd   xmm3, xmmword ptr [rsp+40H]
1004        paddd   xmm0, xmm5
1005        paddd   xmm1, xmm6
1006        paddd   xmm2, xmm7
1007        paddd   xmm3, xmm4
1008        pxor    xmm15, xmm0
1009        pxor    xmm12, xmm1
1010        pxor    xmm13, xmm2
1011        pxor    xmm14, xmm3
1012        movdqa  xmm8, xmmword ptr [ROT8]
1013        pshufb  xmm15, xmm8
1014        pshufb  xmm12, xmm8
1015        pshufb  xmm13, xmm8
1016        pshufb  xmm14, xmm8
1017        paddd   xmm10, xmm15
1018        paddd   xmm11, xmm12
1019        movdqa  xmm8, xmmword ptr [rsp+100H]
1020        paddd   xmm8, xmm13
1021        paddd   xmm9, xmm14
1022        pxor    xmm5, xmm10
1023        pxor    xmm6, xmm11
1024        pxor    xmm7, xmm8
1025        pxor    xmm4, xmm9
1026        movdqa  xmmword ptr [rsp+100H], xmm8
1027        movdqa  xmm8, xmm5
1028        psrld   xmm8, 7
1029        pslld   xmm5, 25
1030        por     xmm5, xmm8
1031        movdqa  xmm8, xmm6
1032        psrld   xmm8, 7
1033        pslld   xmm6, 25
1034        por     xmm6, xmm8
1035        movdqa  xmm8, xmm7
1036        psrld   xmm8, 7
1037        pslld   xmm7, 25
1038        por     xmm7, xmm8
1039        movdqa  xmm8, xmm4
1040        psrld   xmm8, 7
1041        pslld   xmm4, 25
1042        por     xmm4, xmm8
1043        paddd   xmm0, xmmword ptr [rsp+90H]
1044        paddd   xmm1, xmmword ptr [rsp+0B0H]
1045        paddd   xmm2, xmmword ptr [rsp+80H]
1046        paddd   xmm3, xmmword ptr [rsp+0F0H]
1047        paddd   xmm0, xmm4
1048        paddd   xmm1, xmm5
1049        paddd   xmm2, xmm6
1050        paddd   xmm3, xmm7
1051        pxor    xmm12, xmm0
1052        pxor    xmm13, xmm1
1053        pxor    xmm14, xmm2
1054        pxor    xmm15, xmm3
1055        movdqa  xmm8, xmmword ptr [ROT16]
1056        pshufb  xmm12, xmm8
1057        pshufb  xmm13, xmm8
1058        pshufb  xmm14, xmm8
1059        pshufb  xmm15, xmm8
1060        movdqa  xmm8, xmmword ptr [rsp+100H]
1061        paddd   xmm8, xmm12
1062        paddd   xmm9, xmm13
1063        paddd   xmm10, xmm14
1064        paddd   xmm11, xmm15
1065        pxor    xmm4, xmm8
1066        pxor    xmm5, xmm9
1067        pxor    xmm6, xmm10
1068        pxor    xmm7, xmm11
1069        movdqa  xmmword ptr [rsp+100H], xmm8
1070        movdqa  xmm8, xmm4
1071        psrld   xmm8, 12
1072        pslld   xmm4, 20
1073        por     xmm4, xmm8
1074        movdqa  xmm8, xmm5
1075        psrld   xmm8, 12
1076        pslld   xmm5, 20
1077        por     xmm5, xmm8
1078        movdqa  xmm8, xmm6
1079        psrld   xmm8, 12
1080        pslld   xmm6, 20
1081        por     xmm6, xmm8
1082        movdqa  xmm8, xmm7
1083        psrld   xmm8, 12
1084        pslld   xmm7, 20
1085        por     xmm7, xmm8
1086        paddd   xmm0, xmmword ptr [rsp+0E0H]
1087        paddd   xmm1, xmmword ptr [rsp+50H]
1088        paddd   xmm2, xmmword ptr [rsp+0C0H]
1089        paddd   xmm3, xmmword ptr [rsp+10H]
1090        paddd   xmm0, xmm4
1091        paddd   xmm1, xmm5
1092        paddd   xmm2, xmm6
1093        paddd   xmm3, xmm7
1094        pxor    xmm12, xmm0
1095        pxor    xmm13, xmm1
1096        pxor    xmm14, xmm2
1097        pxor    xmm15, xmm3
1098        movdqa  xmm8, xmmword ptr [ROT8]
1099        pshufb  xmm12, xmm8
1100        pshufb  xmm13, xmm8
1101        pshufb  xmm14, xmm8
1102        pshufb  xmm15, xmm8
1103        movdqa  xmm8, xmmword ptr [rsp+100H]
1104        paddd   xmm8, xmm12
1105        paddd   xmm9, xmm13
1106        paddd   xmm10, xmm14
1107        paddd   xmm11, xmm15
1108        pxor    xmm4, xmm8
1109        pxor    xmm5, xmm9
1110        pxor    xmm6, xmm10
1111        pxor    xmm7, xmm11
1112        movdqa  xmmword ptr [rsp+100H], xmm8
1113        movdqa  xmm8, xmm4
1114        psrld   xmm8, 7
1115        pslld   xmm4, 25
1116        por     xmm4, xmm8
1117        movdqa  xmm8, xmm5
1118        psrld   xmm8, 7
1119        pslld   xmm5, 25
1120        por     xmm5, xmm8
1121        movdqa  xmm8, xmm6
1122        psrld   xmm8, 7
1123        pslld   xmm6, 25
1124        por     xmm6, xmm8
1125        movdqa  xmm8, xmm7
1126        psrld   xmm8, 7
1127        pslld   xmm7, 25
1128        por     xmm7, xmm8
1129        paddd   xmm0, xmmword ptr [rsp+0D0H]
1130        paddd   xmm1, xmmword ptr [rsp]
1131        paddd   xmm2, xmmword ptr [rsp+20H]
1132        paddd   xmm3, xmmword ptr [rsp+40H]
1133        paddd   xmm0, xmm5
1134        paddd   xmm1, xmm6
1135        paddd   xmm2, xmm7
1136        paddd   xmm3, xmm4
1137        pxor    xmm15, xmm0
1138        pxor    xmm12, xmm1
1139        pxor    xmm13, xmm2
1140        pxor    xmm14, xmm3
1141        movdqa  xmm8, xmmword ptr [ROT16]
1142        pshufb  xmm15, xmm8
1143        pshufb  xmm12, xmm8
1144        pshufb  xmm13, xmm8
1145        pshufb  xmm14, xmm8
1146        paddd   xmm10, xmm15
1147        paddd   xmm11, xmm12
1148        movdqa  xmm8, xmmword ptr [rsp+100H]
1149        paddd   xmm8, xmm13
1150        paddd   xmm9, xmm14
1151        pxor    xmm5, xmm10
1152        pxor    xmm6, xmm11
1153        pxor    xmm7, xmm8
1154        pxor    xmm4, xmm9
1155        movdqa  xmmword ptr [rsp+100H], xmm8
1156        movdqa  xmm8, xmm5
1157        psrld   xmm8, 12
1158        pslld   xmm5, 20
1159        por     xmm5, xmm8
1160        movdqa  xmm8, xmm6
1161        psrld   xmm8, 12
1162        pslld   xmm6, 20
1163        por     xmm6, xmm8
1164        movdqa  xmm8, xmm7
1165        psrld   xmm8, 12
1166        pslld   xmm7, 20
1167        por     xmm7, xmm8
1168        movdqa  xmm8, xmm4
1169        psrld   xmm8, 12
1170        pslld   xmm4, 20
1171        por     xmm4, xmm8
1172        paddd   xmm0, xmmword ptr [rsp+30H]
1173        paddd   xmm1, xmmword ptr [rsp+0A0H]
1174        paddd   xmm2, xmmword ptr [rsp+60H]
1175        paddd   xmm3, xmmword ptr [rsp+70H]
1176        paddd   xmm0, xmm5
1177        paddd   xmm1, xmm6
1178        paddd   xmm2, xmm7
1179        paddd   xmm3, xmm4
1180        pxor    xmm15, xmm0
1181        pxor    xmm12, xmm1
1182        pxor    xmm13, xmm2
1183        pxor    xmm14, xmm3
1184        movdqa  xmm8, xmmword ptr [ROT8]
1185        pshufb  xmm15, xmm8
1186        pshufb  xmm12, xmm8
1187        pshufb  xmm13, xmm8
1188        pshufb  xmm14, xmm8
1189        paddd   xmm10, xmm15
1190        paddd   xmm11, xmm12
1191        movdqa  xmm8, xmmword ptr [rsp+100H]
1192        paddd   xmm8, xmm13
1193        paddd   xmm9, xmm14
1194        pxor    xmm5, xmm10
1195        pxor    xmm6, xmm11
1196        pxor    xmm7, xmm8
1197        pxor    xmm4, xmm9
1198        movdqa  xmmword ptr [rsp+100H], xmm8
1199        movdqa  xmm8, xmm5
1200        psrld   xmm8, 7
1201        pslld   xmm5, 25
1202        por     xmm5, xmm8
1203        movdqa  xmm8, xmm6
1204        psrld   xmm8, 7
1205        pslld   xmm6, 25
1206        por     xmm6, xmm8
1207        movdqa  xmm8, xmm7
1208        psrld   xmm8, 7
1209        pslld   xmm7, 25
1210        por     xmm7, xmm8
1211        movdqa  xmm8, xmm4
1212        psrld   xmm8, 7
1213        pslld   xmm4, 25
1214        por     xmm4, xmm8
1215        paddd   xmm0, xmmword ptr [rsp+0B0H]
1216        paddd   xmm1, xmmword ptr [rsp+50H]
1217        paddd   xmm2, xmmword ptr [rsp+10H]
1218        paddd   xmm3, xmmword ptr [rsp+80H]
1219        paddd   xmm0, xmm4
1220        paddd   xmm1, xmm5
1221        paddd   xmm2, xmm6
1222        paddd   xmm3, xmm7
1223        pxor    xmm12, xmm0
1224        pxor    xmm13, xmm1
1225        pxor    xmm14, xmm2
1226        pxor    xmm15, xmm3
1227        movdqa  xmm8, xmmword ptr [ROT16]
1228        pshufb  xmm12, xmm8
1229        pshufb  xmm13, xmm8
1230        pshufb  xmm14, xmm8
1231        pshufb  xmm15, xmm8
1232        movdqa  xmm8, xmmword ptr [rsp+100H]
1233        paddd   xmm8, xmm12
1234        paddd   xmm9, xmm13
1235        paddd   xmm10, xmm14
1236        paddd   xmm11, xmm15
1237        pxor    xmm4, xmm8
1238        pxor    xmm5, xmm9
1239        pxor    xmm6, xmm10
1240        pxor    xmm7, xmm11
1241        movdqa  xmmword ptr [rsp+100H], xmm8
1242        movdqa  xmm8, xmm4
1243        psrld   xmm8, 12
1244        pslld   xmm4, 20
1245        por     xmm4, xmm8
1246        movdqa  xmm8, xmm5
1247        psrld   xmm8, 12
1248        pslld   xmm5, 20
1249        por     xmm5, xmm8
1250        movdqa  xmm8, xmm6
1251        psrld   xmm8, 12
1252        pslld   xmm6, 20
1253        por     xmm6, xmm8
1254        movdqa  xmm8, xmm7
1255        psrld   xmm8, 12
1256        pslld   xmm7, 20
1257        por     xmm7, xmm8
1258        paddd   xmm0, xmmword ptr [rsp+0F0H]
1259        paddd   xmm1, xmmword ptr [rsp]
1260        paddd   xmm2, xmmword ptr [rsp+90H]
1261        paddd   xmm3, xmmword ptr [rsp+60H]
1262        paddd   xmm0, xmm4
1263        paddd   xmm1, xmm5
1264        paddd   xmm2, xmm6
1265        paddd   xmm3, xmm7
1266        pxor    xmm12, xmm0
1267        pxor    xmm13, xmm1
1268        pxor    xmm14, xmm2
1269        pxor    xmm15, xmm3
1270        movdqa  xmm8, xmmword ptr [ROT8]
1271        pshufb  xmm12, xmm8
1272        pshufb  xmm13, xmm8
1273        pshufb  xmm14, xmm8
1274        pshufb  xmm15, xmm8
1275        movdqa  xmm8, xmmword ptr [rsp+100H]
1276        paddd   xmm8, xmm12
1277        paddd   xmm9, xmm13
1278        paddd   xmm10, xmm14
1279        paddd   xmm11, xmm15
1280        pxor    xmm4, xmm8
1281        pxor    xmm5, xmm9
1282        pxor    xmm6, xmm10
1283        pxor    xmm7, xmm11
1284        movdqa  xmmword ptr [rsp+100H], xmm8
1285        movdqa  xmm8, xmm4
1286        psrld   xmm8, 7
1287        pslld   xmm4, 25
1288        por     xmm4, xmm8
1289        movdqa  xmm8, xmm5
1290        psrld   xmm8, 7
1291        pslld   xmm5, 25
1292        por     xmm5, xmm8
1293        movdqa  xmm8, xmm6
1294        psrld   xmm8, 7
1295        pslld   xmm6, 25
1296        por     xmm6, xmm8
1297        movdqa  xmm8, xmm7
1298        psrld   xmm8, 7
1299        pslld   xmm7, 25
1300        por     xmm7, xmm8
1301        paddd   xmm0, xmmword ptr [rsp+0E0H]
1302        paddd   xmm1, xmmword ptr [rsp+20H]
1303        paddd   xmm2, xmmword ptr [rsp+30H]
1304        paddd   xmm3, xmmword ptr [rsp+70H]
1305        paddd   xmm0, xmm5
1306        paddd   xmm1, xmm6
1307        paddd   xmm2, xmm7
1308        paddd   xmm3, xmm4
1309        pxor    xmm15, xmm0
1310        pxor    xmm12, xmm1
1311        pxor    xmm13, xmm2
1312        pxor    xmm14, xmm3
1313        movdqa  xmm8, xmmword ptr [ROT16]
1314        pshufb  xmm15, xmm8
1315        pshufb  xmm12, xmm8
1316        pshufb  xmm13, xmm8
1317        pshufb  xmm14, xmm8
1318        paddd   xmm10, xmm15
1319        paddd   xmm11, xmm12
1320        movdqa  xmm8, xmmword ptr [rsp+100H]
1321        paddd   xmm8, xmm13
1322        paddd   xmm9, xmm14
1323        pxor    xmm5, xmm10
1324        pxor    xmm6, xmm11
1325        pxor    xmm7, xmm8
1326        pxor    xmm4, xmm9
1327        movdqa  xmmword ptr [rsp+100H], xmm8
1328        movdqa  xmm8, xmm5
1329        psrld   xmm8, 12
1330        pslld   xmm5, 20
1331        por     xmm5, xmm8
1332        movdqa  xmm8, xmm6
1333        psrld   xmm8, 12
1334        pslld   xmm6, 20
1335        por     xmm6, xmm8
1336        movdqa  xmm8, xmm7
1337        psrld   xmm8, 12
1338        pslld   xmm7, 20
1339        por     xmm7, xmm8
1340        movdqa  xmm8, xmm4
1341        psrld   xmm8, 12
1342        pslld   xmm4, 20
1343        por     xmm4, xmm8
1344        paddd   xmm0, xmmword ptr [rsp+0A0H]
1345        paddd   xmm1, xmmword ptr [rsp+0C0H]
1346        paddd   xmm2, xmmword ptr [rsp+40H]
1347        paddd   xmm3, xmmword ptr [rsp+0D0H]
1348        paddd   xmm0, xmm5
1349        paddd   xmm1, xmm6
1350        paddd   xmm2, xmm7
1351        paddd   xmm3, xmm4
1352        pxor    xmm15, xmm0
1353        pxor    xmm12, xmm1
1354        pxor    xmm13, xmm2
1355        pxor    xmm14, xmm3
1356        movdqa  xmm8, xmmword ptr [ROT8]
1357        pshufb  xmm15, xmm8
1358        pshufb  xmm12, xmm8
1359        pshufb  xmm13, xmm8
1360        pshufb  xmm14, xmm8
1361        paddd   xmm10, xmm15
1362        paddd   xmm11, xmm12
1363        movdqa  xmm8, xmmword ptr [rsp+100H]
1364        paddd   xmm8, xmm13
1365        paddd   xmm9, xmm14
1366        pxor    xmm5, xmm10
1367        pxor    xmm6, xmm11
1368        pxor    xmm7, xmm8
1369        pxor    xmm4, xmm9
1370        pxor    xmm0, xmm8
1371        pxor    xmm1, xmm9
1372        pxor    xmm2, xmm10
1373        pxor    xmm3, xmm11
1374        movdqa  xmm8, xmm5
1375        psrld   xmm8, 7
1376        pslld   xmm5, 25
1377        por     xmm5, xmm8
1378        movdqa  xmm8, xmm6
1379        psrld   xmm8, 7
1380        pslld   xmm6, 25
1381        por     xmm6, xmm8
1382        movdqa  xmm8, xmm7
1383        psrld   xmm8, 7
1384        pslld   xmm7, 25
1385        por     xmm7, xmm8
1386        movdqa  xmm8, xmm4
1387        psrld   xmm8, 7
1388        pslld   xmm4, 25
1389        por     xmm4, xmm8
1390        pxor    xmm4, xmm12
1391        pxor    xmm5, xmm13
1392        pxor    xmm6, xmm14
1393        pxor    xmm7, xmm15
1394        mov     eax, r13d
1395        jne     innerloop4
1396        movdqa  xmm9, xmm0
1397        punpckldq xmm0, xmm1
1398        punpckhdq xmm9, xmm1
1399        movdqa  xmm11, xmm2
1400        punpckldq xmm2, xmm3
1401        punpckhdq xmm11, xmm3
1402        movdqa  xmm1, xmm0
1403        punpcklqdq xmm0, xmm2
1404        punpckhqdq xmm1, xmm2
1405        movdqa  xmm3, xmm9
1406        punpcklqdq xmm9, xmm11
1407        punpckhqdq xmm3, xmm11
1408        movdqu  xmmword ptr [rbx], xmm0
1409        movdqu  xmmword ptr [rbx+20H], xmm1
1410        movdqu  xmmword ptr [rbx+40H], xmm9
1411        movdqu  xmmword ptr [rbx+60H], xmm3
1412        movdqa  xmm9, xmm4
1413        punpckldq xmm4, xmm5
1414        punpckhdq xmm9, xmm5
1415        movdqa  xmm11, xmm6
1416        punpckldq xmm6, xmm7
1417        punpckhdq xmm11, xmm7
1418        movdqa  xmm5, xmm4
1419        punpcklqdq xmm4, xmm6
1420        punpckhqdq xmm5, xmm6
1421        movdqa  xmm7, xmm9
1422        punpcklqdq xmm9, xmm11
1423        punpckhqdq xmm7, xmm11
1424        movdqu  xmmword ptr [rbx+10H], xmm4
1425        movdqu  xmmword ptr [rbx+30H], xmm5
1426        movdqu  xmmword ptr [rbx+50H], xmm9
1427        movdqu  xmmword ptr [rbx+70H], xmm7
1428        movdqa  xmm1, xmmword ptr [rsp+110H]
1429        movdqa  xmm0, xmm1
1430        paddd   xmm1, xmmword ptr [rsp+150H]
1431        movdqa  xmmword ptr [rsp+110H], xmm1
1432        pxor    xmm0, xmmword ptr [CMP_MSB_MASK]
1433        pxor    xmm1, xmmword ptr [CMP_MSB_MASK]
1434        pcmpgtd xmm0, xmm1
1435        movdqa  xmm1, xmmword ptr [rsp+120H]
1436        psubd   xmm1, xmm0
1437        movdqa  xmmword ptr [rsp+120H], xmm1
1438        add     rbx, 128
1439        add     rdi, 32
1440        sub     rsi, 4
1441        cmp     rsi, 4
1442        jnc     outerloop4
1443        test    rsi, rsi
1444        jne     final3blocks
1445unwind:
1446        movdqa  xmm6, xmmword ptr [rsp+170H]
1447        movdqa  xmm7, xmmword ptr [rsp+180H]
1448        movdqa  xmm8, xmmword ptr [rsp+190H]
1449        movdqa  xmm9, xmmword ptr [rsp+1A0H]
1450        movdqa  xmm10, xmmword ptr [rsp+1B0H]
1451        movdqa  xmm11, xmmword ptr [rsp+1C0H]
1452        movdqa  xmm12, xmmword ptr [rsp+1D0H]
1453        movdqa  xmm13, xmmword ptr [rsp+1E0H]
1454        movdqa  xmm14, xmmword ptr [rsp+1F0H]
1455        movdqa  xmm15, xmmword ptr [rsp+200H]
1456        mov     rsp, rbp
1457        pop     rbp
1458        pop     rbx
1459        pop     rdi
1460        pop     rsi
1461        pop     r12
1462        pop     r13
1463        pop     r14
1464        pop     r15
1465        ret
1466ALIGN   16
1467final3blocks:
1468        test    esi, 2H
1469        je      final1block
1470        movups  xmm0, xmmword ptr [rcx]
1471        movups  xmm1, xmmword ptr [rcx+10H]
1472        movaps  xmm8, xmm0
1473        movaps  xmm9, xmm1
1474        movd    xmm13, dword ptr [rsp+110H]
1475        pinsrd  xmm13, dword ptr [rsp+120H], 1
1476        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
1477        movaps  xmmword ptr [rsp], xmm13
1478        movd    xmm14, dword ptr [rsp+114H]
1479        pinsrd  xmm14, dword ptr [rsp+124H], 1
1480        pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
1481        movaps  xmmword ptr [rsp+10H], xmm14
1482        mov     r8, qword ptr [rdi]
1483        mov     r9, qword ptr [rdi+8H]
1484        movzx   eax, byte ptr [rbp+80H]
1485        or      eax, r13d
1486        xor     edx, edx
1487innerloop2:
1488        mov     r14d, eax
1489        or      eax, r12d
1490        add     rdx, 64
1491        cmp     rdx, r15
1492        cmovne  eax, r14d
1493        movaps  xmm2, xmmword ptr [BLAKE3_IV]
1494        movaps  xmm10, xmm2
1495        movups  xmm4, xmmword ptr [r8+rdx-40H]
1496        movups  xmm5, xmmword ptr [r8+rdx-30H]
1497        movaps  xmm3, xmm4
1498        shufps  xmm4, xmm5, 136
1499        shufps  xmm3, xmm5, 221
1500        movaps  xmm5, xmm3
1501        movups  xmm6, xmmword ptr [r8+rdx-20H]
1502        movups  xmm7, xmmword ptr [r8+rdx-10H]
1503        movaps  xmm3, xmm6
1504        shufps  xmm6, xmm7, 136
1505        pshufd  xmm6, xmm6, 93H
1506        shufps  xmm3, xmm7, 221
1507        pshufd  xmm7, xmm3, 93H
1508        movups  xmm12, xmmword ptr [r9+rdx-40H]
1509        movups  xmm13, xmmword ptr [r9+rdx-30H]
1510        movaps  xmm11, xmm12
1511        shufps  xmm12, xmm13, 136
1512        shufps  xmm11, xmm13, 221
1513        movaps  xmm13, xmm11
1514        movups  xmm14, xmmword ptr [r9+rdx-20H]
1515        movups  xmm15, xmmword ptr [r9+rdx-10H]
1516        movaps  xmm11, xmm14
1517        shufps  xmm14, xmm15, 136
1518        pshufd  xmm14, xmm14, 93H
1519        shufps  xmm11, xmm15, 221
1520        pshufd  xmm15, xmm11, 93H
1521        movaps  xmm3, xmmword ptr [rsp]
1522        movaps  xmm11, xmmword ptr [rsp+10H]
1523        pinsrd  xmm3, eax, 3
1524        pinsrd  xmm11, eax, 3
1525        mov     al, 7
1526roundloop2:
1527        paddd   xmm0, xmm4
1528        paddd   xmm8, xmm12
1529        movaps  xmmword ptr [rsp+20H], xmm4
1530        movaps  xmmword ptr [rsp+30H], xmm12
1531        paddd   xmm0, xmm1
1532        paddd   xmm8, xmm9
1533        pxor    xmm3, xmm0
1534        pxor    xmm11, xmm8
1535        movaps  xmm12, xmmword ptr [ROT16]
1536        pshufb  xmm3, xmm12
1537        pshufb  xmm11, xmm12
1538        paddd   xmm2, xmm3
1539        paddd   xmm10, xmm11
1540        pxor    xmm1, xmm2
1541        pxor    xmm9, xmm10
1542        movdqa  xmm4, xmm1
1543        pslld   xmm1, 20
1544        psrld   xmm4, 12
1545        por     xmm1, xmm4
1546        movdqa  xmm4, xmm9
1547        pslld   xmm9, 20
1548        psrld   xmm4, 12
1549        por     xmm9, xmm4
1550        paddd   xmm0, xmm5
1551        paddd   xmm8, xmm13
1552        movaps  xmmword ptr [rsp+40H], xmm5
1553        movaps  xmmword ptr [rsp+50H], xmm13
1554        paddd   xmm0, xmm1
1555        paddd   xmm8, xmm9
1556        pxor    xmm3, xmm0
1557        pxor    xmm11, xmm8
1558        movaps  xmm13, xmmword ptr [ROT8]
1559        pshufb  xmm3, xmm13
1560        pshufb  xmm11, xmm13
1561        paddd   xmm2, xmm3
1562        paddd   xmm10, xmm11
1563        pxor    xmm1, xmm2
1564        pxor    xmm9, xmm10
1565        movdqa  xmm4, xmm1
1566        pslld   xmm1, 25
1567        psrld   xmm4, 7
1568        por     xmm1, xmm4
1569        movdqa  xmm4, xmm9
1570        pslld   xmm9, 25
1571        psrld   xmm4, 7
1572        por     xmm9, xmm4
1573        pshufd  xmm0, xmm0, 93H
1574        pshufd  xmm8, xmm8, 93H
1575        pshufd  xmm3, xmm3, 4EH
1576        pshufd  xmm11, xmm11, 4EH
1577        pshufd  xmm2, xmm2, 39H
1578        pshufd  xmm10, xmm10, 39H
1579        paddd   xmm0, xmm6
1580        paddd   xmm8, xmm14
1581        paddd   xmm0, xmm1
1582        paddd   xmm8, xmm9
1583        pxor    xmm3, xmm0
1584        pxor    xmm11, xmm8
1585        pshufb  xmm3, xmm12
1586        pshufb  xmm11, xmm12
1587        paddd   xmm2, xmm3
1588        paddd   xmm10, xmm11
1589        pxor    xmm1, xmm2
1590        pxor    xmm9, xmm10
1591        movdqa  xmm4, xmm1
1592        pslld   xmm1, 20
1593        psrld   xmm4, 12
1594        por     xmm1, xmm4
1595        movdqa  xmm4, xmm9
1596        pslld   xmm9, 20
1597        psrld   xmm4, 12
1598        por     xmm9, xmm4
1599        paddd   xmm0, xmm7
1600        paddd   xmm8, xmm15
1601        paddd   xmm0, xmm1
1602        paddd   xmm8, xmm9
1603        pxor    xmm3, xmm0
1604        pxor    xmm11, xmm8
1605        pshufb  xmm3, xmm13
1606        pshufb  xmm11, xmm13
1607        paddd   xmm2, xmm3
1608        paddd   xmm10, xmm11
1609        pxor    xmm1, xmm2
1610        pxor    xmm9, xmm10
1611        movdqa  xmm4, xmm1
1612        pslld   xmm1, 25
1613        psrld   xmm4, 7
1614        por     xmm1, xmm4
1615        movdqa  xmm4, xmm9
1616        pslld   xmm9, 25
1617        psrld   xmm4, 7
1618        por     xmm9, xmm4
1619        pshufd  xmm0, xmm0, 39H
1620        pshufd  xmm8, xmm8, 39H
1621        pshufd  xmm3, xmm3, 4EH
1622        pshufd  xmm11, xmm11, 4EH
1623        pshufd  xmm2, xmm2, 93H
1624        pshufd  xmm10, xmm10, 93H
1625        dec     al
1626        je      endroundloop2
1627        movdqa  xmm12, xmmword ptr [rsp+20H]
1628        movdqa  xmm5, xmmword ptr [rsp+40H]
1629        pshufd  xmm13, xmm12, 0FH
1630        shufps  xmm12, xmm5, 214
1631        pshufd  xmm4, xmm12, 39H
1632        movdqa  xmm12, xmm6
1633        shufps  xmm12, xmm7, 250
1634        pblendw xmm13, xmm12, 0CCH
1635        movdqa  xmm12, xmm7
1636        punpcklqdq xmm12, xmm5
1637        pblendw xmm12, xmm6, 0C0H
1638        pshufd  xmm12, xmm12, 78H
1639        punpckhdq xmm5, xmm7
1640        punpckldq xmm6, xmm5
1641        pshufd  xmm7, xmm6, 1EH
1642        movdqa  xmmword ptr [rsp+20H], xmm13
1643        movdqa  xmmword ptr [rsp+40H], xmm12
1644        movdqa  xmm5, xmmword ptr [rsp+30H]
1645        movdqa  xmm13, xmmword ptr [rsp+50H]
1646        pshufd  xmm6, xmm5, 0FH
1647        shufps  xmm5, xmm13, 214
1648        pshufd  xmm12, xmm5, 39H
1649        movdqa  xmm5, xmm14
1650        shufps  xmm5, xmm15, 250
1651        pblendw xmm6, xmm5, 0CCH
1652        movdqa  xmm5, xmm15
1653        punpcklqdq xmm5, xmm13
1654        pblendw xmm5, xmm14, 0C0H
1655        pshufd  xmm5, xmm5, 78H
1656        punpckhdq xmm13, xmm15
1657        punpckldq xmm14, xmm13
1658        pshufd  xmm15, xmm14, 1EH
1659        movdqa  xmm13, xmm6
1660        movdqa  xmm14, xmm5
1661        movdqa  xmm5, xmmword ptr [rsp+20H]
1662        movdqa  xmm6, xmmword ptr [rsp+40H]
1663        jmp     roundloop2
1664endroundloop2:
1665        pxor    xmm0, xmm2
1666        pxor    xmm1, xmm3
1667        pxor    xmm8, xmm10
1668        pxor    xmm9, xmm11
1669        mov     eax, r13d
1670        cmp     rdx, r15
1671        jne     innerloop2
1672        movups  xmmword ptr [rbx], xmm0
1673        movups  xmmword ptr [rbx+10H], xmm1
1674        movups  xmmword ptr [rbx+20H], xmm8
1675        movups  xmmword ptr [rbx+30H], xmm9
1676        movdqa  xmm0, xmmword ptr [rsp+130H]
1677        movdqa  xmm1, xmmword ptr [rsp+110H]
1678        movdqa  xmm2, xmmword ptr [rsp+120H]
1679        movdqu  xmm3, xmmword ptr [rsp+118H]
1680        movdqu  xmm4, xmmword ptr [rsp+128H]
1681        blendvps xmm1, xmm3, xmm0
1682        blendvps xmm2, xmm4, xmm0
1683        movdqa  xmmword ptr [rsp+110H], xmm1
1684        movdqa  xmmword ptr [rsp+120H], xmm2
1685        add     rdi, 16
1686        add     rbx, 64
1687        sub     rsi, 2
1688final1block:
1689        test    esi, 1H
1690        je      unwind
1691        movups  xmm0, xmmword ptr [rcx]
1692        movups  xmm1, xmmword ptr [rcx+10H]
1693        movd    xmm13, dword ptr [rsp+110H]
1694        pinsrd  xmm13, dword ptr [rsp+120H], 1
1695        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
1696        movaps  xmm14, xmmword ptr [ROT8]
1697        movaps  xmm15, xmmword ptr [ROT16]
1698        mov     r8, qword ptr [rdi]
1699        movzx   eax, byte ptr [rbp+80H]
1700        or      eax, r13d
1701        xor     edx, edx
1702innerloop1:
1703        mov     r14d, eax
1704        or      eax, r12d
1705        add     rdx, 64
1706        cmp     rdx, r15
1707        cmovne  eax, r14d
1708        movaps  xmm2, xmmword ptr [BLAKE3_IV]
1709        movaps  xmm3, xmm13
1710        pinsrd  xmm3, eax, 3
1711        movups  xmm4, xmmword ptr [r8+rdx-40H]
1712        movups  xmm5, xmmword ptr [r8+rdx-30H]
1713        movaps  xmm8, xmm4
1714        shufps  xmm4, xmm5, 136
1715        shufps  xmm8, xmm5, 221
1716        movaps  xmm5, xmm8
1717        movups  xmm6, xmmword ptr [r8+rdx-20H]
1718        movups  xmm7, xmmword ptr [r8+rdx-10H]
1719        movaps  xmm8, xmm6
1720        shufps  xmm6, xmm7, 136
1721        pshufd  xmm6, xmm6, 93H
1722        shufps  xmm8, xmm7, 221
1723        pshufd  xmm7, xmm8, 93H
1724        mov     al, 7
1725roundloop1:
1726        paddd   xmm0, xmm4
1727        paddd   xmm0, xmm1
1728        pxor    xmm3, xmm0
1729        pshufb  xmm3, xmm15
1730        paddd   xmm2, xmm3
1731        pxor    xmm1, xmm2
1732        movdqa  xmm11, xmm1
1733        pslld   xmm1, 20
1734        psrld   xmm11, 12
1735        por     xmm1, xmm11
1736        paddd   xmm0, xmm5
1737        paddd   xmm0, xmm1
1738        pxor    xmm3, xmm0
1739        pshufb  xmm3, xmm14
1740        paddd   xmm2, xmm3
1741        pxor    xmm1, xmm2
1742        movdqa  xmm11, xmm1
1743        pslld   xmm1, 25
1744        psrld   xmm11, 7
1745        por     xmm1, xmm11
1746        pshufd  xmm0, xmm0, 93H
1747        pshufd  xmm3, xmm3, 4EH
1748        pshufd  xmm2, xmm2, 39H
1749        paddd   xmm0, xmm6
1750        paddd   xmm0, xmm1
1751        pxor    xmm3, xmm0
1752        pshufb  xmm3, xmm15
1753        paddd   xmm2, xmm3
1754        pxor    xmm1, xmm2
1755        movdqa  xmm11, xmm1
1756        pslld   xmm1, 20
1757        psrld   xmm11, 12
1758        por     xmm1, xmm11
1759        paddd   xmm0, xmm7
1760        paddd   xmm0, xmm1
1761        pxor    xmm3, xmm0
1762        pshufb  xmm3, xmm14
1763        paddd   xmm2, xmm3
1764        pxor    xmm1, xmm2
1765        movdqa  xmm11, xmm1
1766        pslld   xmm1, 25
1767        psrld   xmm11, 7
1768        por     xmm1, xmm11
1769        pshufd  xmm0, xmm0, 39H
1770        pshufd  xmm3, xmm3, 4EH
1771        pshufd  xmm2, xmm2, 93H
1772        dec     al
1773        jz      endroundloop1
1774        movdqa  xmm8, xmm4
1775        shufps  xmm8, xmm5, 214
1776        pshufd  xmm9, xmm4, 0FH
1777        pshufd  xmm4, xmm8, 39H
1778        movdqa  xmm8, xmm6
1779        shufps  xmm8, xmm7, 250
1780        pblendw xmm9, xmm8, 0CCH
1781        movdqa  xmm8, xmm7
1782        punpcklqdq xmm8, xmm5
1783        pblendw xmm8, xmm6, 0C0H
1784        pshufd  xmm8, xmm8, 78H
1785        punpckhdq xmm5, xmm7
1786        punpckldq xmm6, xmm5
1787        pshufd  xmm7, xmm6, 1EH
1788        movdqa  xmm5, xmm9
1789        movdqa  xmm6, xmm8
1790        jmp     roundloop1
1791endroundloop1:
1792        pxor    xmm0, xmm2
1793        pxor    xmm1, xmm3
1794        mov     eax, r13d
1795        cmp     rdx, r15
1796        jne     innerloop1
1797        movups  xmmword ptr [rbx], xmm0
1798        movups  xmmword ptr [rbx+10H], xmm1
1799        jmp     unwind
1800_llvm_blake3_hash_many_sse41 ENDP
1801llvm_blake3_hash_many_sse41 ENDP
1802
;-----------------------------------------------------------------------
; llvm_blake3_compress_in_place_sse41 / _llvm_blake3_compress_in_place_sse41
;
; Runs a 7-round, 4-row mixing function over a 32-byte state and a 64-byte
; input block, then writes the 32-byte result back over the state.
; Both PROC names below label the same entry point.
;
; ABI:   Microsoft x64 (register arguments in rcx/rdx/r8/r9, fifth on stack).
; In:    rcx = pointer to 32 bytes, read at entry and overwritten at exit
;              (presumably the BLAKE3 chaining value -- confirm with the
;              C prototype).
;        rdx = pointer to 64 bytes of input, read only (presumably the
;              message block).
;        r8b = byte value placed in state dword 14 (presumably block_len).
;        r9  = 64-bit value placed in state dwords 12-13 (presumably the
;              block counter).
;        fifth argument (stack) = byte value placed in state dword 15
;              (presumably the flags).
; Out:   32 bytes stored at [rcx]; no value is returned in rax on purpose
;        (al is used as the round counter and ends at 0).
; Clobb: rax, r8, xmm0-xmm5, flags. xmm6-xmm9, xmm11, xmm14 and xmm15 are
;        saved on entry and restored before returning.
; Stack: 120 bytes; no calls are made.
; NOTE(review): declared with plain PROC (no FRAME / .allocstack /
;        .savexmm128), so no unwind data is described for the stack
;        adjustment -- confirm that this is acceptable for the build.
;-----------------------------------------------------------------------
llvm_blake3_compress_in_place_sse41 PROC
_llvm_blake3_compress_in_place_sse41 PROC
        ; rsp is 8 mod 16 on entry (return address pushed), so subtracting
        ; 120 makes it 16-byte aligned, which the movdqa spills require.
        ; 7 slots * 16 bytes = 112 bytes are used; the last 8 are padding.
        sub     rsp, 120
        ; Spill the callee-saved XMM registers that this routine modifies.
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+10H], xmm7
        movdqa  xmmword ptr [rsp+20H], xmm8
        movdqa  xmmword ptr [rsp+30H], xmm9
        movdqa  xmmword ptr [rsp+40H], xmm11
        movdqa  xmmword ptr [rsp+50H], xmm14
        movdqa  xmmword ptr [rsp+60H], xmm15
        ; State rows 0 and 1: the 32 bytes at [rcx] (unaligned loads).
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+10H]
        ; State row 2: 16 bytes from BLAKE3_IV (symbol defined outside this
        ; block; movaps requires it to be 16-byte aligned).
        movaps  xmm2, xmmword ptr [BLAKE3_IV]
        ; Fifth argument: at entry it sits at [rsp+28H] (8-byte return
        ; address + 32-byte shadow space); add the 120 bytes just reserved.
        movzx   eax, byte ptr [rsp+0A0H]
        ; Keep only the low byte of the third argument.
        movzx   r8d, r8b
        ; r8 = (fifth-argument byte << 32) | third-argument byte.
        shl     rax, 32
        add     r8, rax
        ; State row 3: low qword = r9, high qword = r8, i.e. dwords
        ; { r9 low, r9 high, third-arg byte, fifth-arg byte }.
        movd    xmm3, r9
        movd    xmm4, r8
        punpcklqdq xmm3, xmm4
        ; Load input dwords 0-7 and de-interleave them:
        ; shufps 136 (88H) keeps the even-indexed dwords -> xmm4 = {0,2,4,6}
        ; shufps 221 (DDH) keeps the odd-indexed dwords  -> xmm5 = {1,3,5,7}
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+10H]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        ; Same split for input dwords 8-15, followed by a one-lane rotation
        ; (pshufd 93H): xmm6 = {14,8,10,12}, xmm7 = {15,9,11,13}.
        movups  xmm6, xmmword ptr [rdx+20H]
        movups  xmm7, xmmword ptr [rdx+30H]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 93H
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 93H
        ; pshufb control vectors from ROT8 / ROT16 (defined outside this
        ; block; presumably per-dword rotations by 8 and 16 bits -- confirm).
        movaps  xmm14, xmmword ptr [ROT8]
        movaps  xmm15, xmmword ptr [ROT16]
        ; al = number of rounds still to run.
        mov     al, 7
@@:
        ; ---- Round, first half: the four columns are mixed in parallel.
        ; row0 += xmm4; row0 += row1; row3 ^= row0; row3 = pshufb(ROT16)
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        ; row2 += row3; row1 ^= row2; row1 = (row1 << 20) | (row1 >> 12)
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        ; row0 += xmm5; row0 += row1; row3 ^= row0; row3 = pshufb(ROT8)
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        ; row2 += row3; row1 ^= row2; row1 = (row1 << 25) | (row1 >> 7)
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        ; Rotate the lanes of rows 0, 3 and 2 (row 1 stays put) so that the
        ; same column-wise code below mixes a different grouping of lanes
        ; (presumably the diagonals of the 4x4 state -- confirm).
        pshufd  xmm0, xmm0, 93H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 39H
        ; ---- Round, second half: same sequence using xmm6 / xmm7.
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        ; Inverse lane rotations (39H undoes 93H, and vice versa; 4EH is its
        ; own inverse), restoring the original lane order of the rows.
        pshufd  xmm0, xmm0, 39H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 93H
        ; After the 7th round leave the loop without reshuffling the input.
        dec     al
        jz      @F
        ; Rearrange the input vectors xmm4-xmm7 for the next round, using
        ; xmm8 / xmm9 as temporaries. The statement order matters: xmm5 and
        ; xmm6 are consumed as sources before they are overwritten at the end.
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0FH
        pshufd  xmm4, xmm8, 39H
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0CCH
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0C0H
        pshufd  xmm8, xmm8, 78H
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 1EH
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     @B
@@:
        ; Fold the state: row0 ^= row2, row1 ^= row3, then store the 32-byte
        ; result back over the input state at [rcx].
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rcx], xmm0
        movups  xmmword ptr [rcx+10H], xmm1
        ; Reload the spilled callee-saved XMM registers and free the frame.
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+10H]
        movdqa  xmm8, xmmword ptr [rsp+20H]
        movdqa  xmm9, xmmword ptr [rsp+30H]
        movdqa  xmm11, xmmword ptr [rsp+40H]
        movdqa  xmm14, xmmword ptr [rsp+50H]
        movdqa  xmm15, xmmword ptr [rsp+60H]
        add     rsp, 120
        ret
_llvm_blake3_compress_in_place_sse41 ENDP
llvm_blake3_compress_in_place_sse41 ENDP
1921
1922ALIGN 16
;-----------------------------------------------------------------------
; llvm_blake3_compress_xof_sse41 / _llvm_blake3_compress_xof_sse41
;
; Runs a 7-iteration add/xor/rotate mixing loop over a 4x4 state of
; 32-bit words held in xmm0..xmm3 and stores 64 bytes of output.
;
; Presumed C prototype (taken from the upstream BLAKE3 C API naming --
; confirm against the project header):
;   void compress_xof(const uint32_t cv[8], const uint8_t block[64],
;                     uint8_t block_len, uint64_t counter,
;                     uint8_t flags, uint8_t out[64]);
;
; ABI:   Microsoft x64 (register/stack-slot usage below follows it).
; In:    rcx            = pointer to 32 bytes of input state (read only;
;                         read at entry and again after the loop)
;        rdx            = pointer to 64 bytes of message data (read only)
;        r8b            = byte value (presumably block_len)
;        r9             = 64-bit value (presumably the counter)
;        [entry rsp+28H] = byte value (presumably flags)
;        [entry rsp+30H] = pointer to 64 writable bytes (output)
; Out:   64 bytes stored at the output pointer; no value is set in rax
;        on purpose (al is merely the loop counter).
; Clobb: rax, r8, r10, xmm0-xmm5, flags.
;        xmm6, xmm7, xmm8, xmm9, xmm11, xmm14, xmm15 are used but saved
;        and restored on the stack.
; Stack: 120 bytes. With rsp % 16 == 8 at entry, rsp is 16-byte aligned
;        after the sub, which the movdqa spills below rely on.
; NOTE(review): declared with plain PROC (no FRAME / unwind directives)
;        although rsp is adjusted -- confirm this is acceptable for the
;        build's unwinding requirements.
;-----------------------------------------------------------------------
1923llvm_blake3_compress_xof_sse41 PROC
1924_llvm_blake3_compress_xof_sse41 PROC
            ; Prologue: reserve the frame and spill every callee-saved XMM
            ; register that the body overwrites.
1925        sub     rsp, 120
1926        movdqa  xmmword ptr [rsp], xmm6
1927        movdqa  xmmword ptr [rsp+10H], xmm7
1928        movdqa  xmmword ptr [rsp+20H], xmm8
1929        movdqa  xmmword ptr [rsp+30H], xmm9
1930        movdqa  xmmword ptr [rsp+40H], xmm11
1931        movdqa  xmmword ptr [rsp+50H], xmm14
1932        movdqa  xmmword ptr [rsp+60H], xmm15
            ; State rows 0 and 1 = the 32 input bytes at [rcx] (unaligned loads).
1933        movups  xmm0, xmmword ptr [rcx]
1934        movups  xmm1, xmmword ptr [rcx+10H]
            ; State row 2 = the four BLAKE3_IV constants (aligned load).
1935        movaps  xmm2, xmmword ptr [BLAKE3_IV]
            ; 0A0H = 120 + 28H: the byte in the first stack-passed argument slot.
1936        movzx   eax, byte ptr [rsp+0A0H]
            ; Keep only the low byte of r8; the rest of r8 becomes zero.
1937        movzx   r8d, r8b
            ; 0A8H = 120 + 30H: second stack-passed argument, the output pointer.
1938        mov     r10, qword ptr [rsp+0A8H]
            ; r8 = (stack byte << 32) | r8b, i.e. two dwords packed in one qword.
1939        shl     rax, 32
1940        add     r8, rax
            ; State row 3 = (r9 low dword, r9 high dword, r8b, stack byte).
            ; The GPR operands are 64-bit, so whole quadwords are transferred.
1941        movd    xmm3, r9
1942        movd    xmm4, r8
1943        punpcklqdq xmm3, xmm4
            ; Load message dwords m0..m7 and de-interleave them:
            ;   xmm4 = (m0, m2, m4, m6)   shufps 136 (88H) picks lanes 0,2 of each
            ;   xmm5 = (m1, m3, m5, m7)   shufps 221 (DDH) picks lanes 1,3 of each
1944        movups  xmm4, xmmword ptr [rdx]
1945        movups  xmm5, xmmword ptr [rdx+10H]
1946        movaps  xmm8, xmm4
1947        shufps  xmm4, xmm5, 136
1948        shufps  xmm8, xmm5, 221
1949        movaps  xmm5, xmm8
            ; Same split for m8..m15, then each vector is lane-rotated by one
            ; (pshufd 93H: dst[i] = src[i-1]):
            ;   xmm6 = (m14, m8, m10, m12)
            ;   xmm7 = (m15, m9, m11, m13)
1950        movups  xmm6, xmmword ptr [rdx+20H]
1951        movups  xmm7, xmmword ptr [rdx+30H]
1952        movaps  xmm8, xmm6
1953        shufps  xmm6, xmm7, 136
1954        pshufd  xmm6, xmm6, 93H
1955        shufps  xmm8, xmm7, 221
1956        pshufd  xmm7, xmm8, 93H
            ; pshufb masks kept in registers for the whole loop.
1957        movaps  xmm14, xmmword ptr [ROT8]
1958        movaps  xmm15, xmmword ptr [ROT16]
            ; al = iteration counter (7 passes). rax's earlier value has already
            ; been folded into r8, so overwriting al is harmless.
1959        mov     al, 7
1960@@:
            ; ---- first half of the round: rows used lane-for-lane ----
            ; row0 += xmm4 + row1; row3 = (row3 ^ row0) rotated by 16 bits.
1961        paddd   xmm0, xmm4
1962        paddd   xmm0, xmm1
1963        pxor    xmm3, xmm0
1964        pshufb  xmm3, xmm15
            ; row2 += row3; row1 = (row1 ^ row2) rotated right by 12
            ; (shift left 20 | shift right 12, xmm11 is the temporary).
1965        paddd   xmm2, xmm3
1966        pxor    xmm1, xmm2
1967        movdqa  xmm11, xmm1
1968        pslld   xmm1, 20
1969        psrld   xmm11, 12
1970        por     xmm1, xmm11
            ; row0 += xmm5 + row1; row3 = (row3 ^ row0) rotated right by 8.
1971        paddd   xmm0, xmm5
1972        paddd   xmm0, xmm1
1973        pxor    xmm3, xmm0
1974        pshufb  xmm3, xmm14
            ; row2 += row3; row1 = (row1 ^ row2) rotated right by 7.
1975        paddd   xmm2, xmm3
1976        pxor    xmm1, xmm2
1977        movdqa  xmm11, xmm1
1978        pslld   xmm1, 25
1979        psrld   xmm11, 7
1980        por     xmm1, xmm11
            ; Rotate the lanes of rows 0, 3 and 2 (by 1, 2 and 3 positions) so
            ; that the next four lane-wise operations combine different lanes
            ; of the rows than the first half did. Row 1 stays in place.
1981        pshufd  xmm0, xmm0, 93H
1982        pshufd  xmm3, xmm3, 4EH
1983        pshufd  xmm2, xmm2, 39H
            ; ---- second half of the round: same sequence with xmm6 / xmm7 ----
1984        paddd   xmm0, xmm6
1985        paddd   xmm0, xmm1
1986        pxor    xmm3, xmm0
1987        pshufb  xmm3, xmm15
1988        paddd   xmm2, xmm3
1989        pxor    xmm1, xmm2
1990        movdqa  xmm11, xmm1
1991        pslld   xmm1, 20
1992        psrld   xmm11, 12
1993        por     xmm1, xmm11
1994        paddd   xmm0, xmm7
1995        paddd   xmm0, xmm1
1996        pxor    xmm3, xmm0
1997        pshufb  xmm3, xmm14
1998        paddd   xmm2, xmm3
1999        pxor    xmm1, xmm2
2000        movdqa  xmm11, xmm1
2001        pslld   xmm1, 25
2002        psrld   xmm11, 7
2003        por     xmm1, xmm11
            ; Undo the lane rotation (39H inverts 93H, 4EH is its own inverse).
2004        pshufd  xmm0, xmm0, 39H
2005        pshufd  xmm3, xmm3, 4EH
2006        pshufd  xmm2, xmm2, 93H
            ; Leave after the 7th pass; the message shuffle below is only
            ; needed when another pass follows.
2007        dec     al
2008        jz      @F
            ; Rearrange the message words held in xmm4..xmm7 for the next pass,
            ; using xmm8 / xmm9 as temporaries. Presumably this is BLAKE3's
            ; fixed message-word permutation expressed in this register layout
            ; -- not re-derived lane by lane here.
2009        movdqa  xmm8, xmm4
2010        shufps  xmm8, xmm5, 214
2011        pshufd  xmm9, xmm4, 0FH
2012        pshufd  xmm4, xmm8, 39H
2013        movdqa  xmm8, xmm6
2014        shufps  xmm8, xmm7, 250
2015        pblendw xmm9, xmm8, 0CCH
2016        movdqa  xmm8, xmm7
2017        punpcklqdq xmm8, xmm5
2018        pblendw xmm8, xmm6, 0C0H
2019        pshufd  xmm8, xmm8, 78H
2020        punpckhdq xmm5, xmm7
2021        punpckldq xmm6, xmm5
2022        pshufd  xmm7, xmm6, 1EH
2023        movdqa  xmm5, xmm9
2024        movdqa  xmm6, xmm8
2025        jmp     @B
2026@@:
            ; Finalisation. xmm4/xmm5 are free now, so reload the original 32
            ; input bytes from [rcx] into them.
2027        movdqu  xmm4, xmmword ptr [rcx]
2028        movdqu  xmm5, xmmword ptr [rcx+10H]
            ; out[ 0..31] = rows 0,1 ^ rows 2,3
            ; out[32..63] = rows 2,3 ^ original input state
            ; (rows 2,3 are consumed by the first two xors before being modified).
2029        pxor    xmm0, xmm2
2030        pxor    xmm1, xmm3
2031        pxor    xmm2, xmm4
2032        pxor    xmm3, xmm5
            ; Unaligned stores: no alignment is required of the output pointer.
2033        movups  xmmword ptr [r10], xmm0
2034        movups  xmmword ptr [r10+10H], xmm1
2035        movups  xmmword ptr [r10+20H], xmm2
2036        movups  xmmword ptr [r10+30H], xmm3
            ; Epilogue: restore the spilled XMM registers and release the frame.
2037        movdqa  xmm6, xmmword ptr [rsp]
2038        movdqa  xmm7, xmmword ptr [rsp+10H]
2039        movdqa  xmm8, xmmword ptr [rsp+20H]
2040        movdqa  xmm9, xmmword ptr [rsp+30H]
2041        movdqa  xmm11, xmmword ptr [rsp+40H]
2042        movdqa  xmm14, xmmword ptr [rsp+50H]
2043        movdqa  xmm15, xmmword ptr [rsp+60H]
2044        add     rsp, 120
2045        ret
2046_llvm_blake3_compress_xof_sse41 ENDP
2047llvm_blake3_compress_xof_sse41 ENDP
2048
2049_TEXT ENDS
2050
2051
; Read-only constant tables used as 16-byte SSE memory operands.
; The segment is page-aligned and the first table is placed on a 64-byte
; boundary. Every table before CMP_MSB_MASK is exactly 16 bytes long, so
; each label stays 16-byte aligned -- required because the code reads them
; with movaps / movdqa / pand / pxor memory operands.
; Do not insert data of other sizes between the tables without re-aligning.
2052_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
2053ALIGN   64
; Four 32-bit constants loaded as one vector (state row 2 in the
; single-block compress routines).
2054BLAKE3_IV:
2055        dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
2056
; Per-lane offsets 0..3; the hash_many prologue ANDs them with a
; broadcast mask before adding them to a broadcast counter value.
2057ADD0:
2058        dd 0, 1, 2, 3
2059
; The value 4 in every lane; ANDed with the same broadcast mask in the
; hash_many prologue (presumably the per-batch counter step -- confirm
; at the use site).
2060ADD1:
2061        dd 4 dup (4)
2062
; Each of the four BLAKE3_IV words above, replicated across all four lanes.
2063BLAKE3_IV_0:
2064        dd 4 dup (6A09E667H)
2065
2066BLAKE3_IV_1:
2067        dd 4 dup (0BB67AE85H)
2068
2069BLAKE3_IV_2:
2070        dd 4 dup (3C6EF372H)
2071
2072BLAKE3_IV_3:
2073        dd 4 dup (0A54FF53AH)
2074
; The value 64 replicated across all four lanes.
2075BLAKE3_BLOCK_LEN:
2076        dd 4 dup (64)
2077
; pshufb control mask: within every dword, byte order becomes (2,3,0,1),
; i.e. each 32-bit lane is rotated by 16 bits.
2078ROT16:
2079        db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
2080
; pshufb control mask: within every dword, byte order becomes (1,2,3,0),
; i.e. each 32-bit lane is rotated right by 8 bits.
2081ROT8:
2082        db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
2083
; Sign-bit mask. The hash_many prologue XORs it into both operands before
; pcmpgtd, which turns that signed compare into an unsigned one.
; Eight dwords (32 bytes) are emitted; the uses visible in this file's
; prologue read only the first 16 bytes.
2084CMP_MSB_MASK:
2085        dd 8 dup(80000000H)
2086
2087_RDATA ENDS
2088END
2089
2090