1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
4default	rel
5%define XMMWORD
6%define YMMWORD
7%define ZMMWORD
8section	.text code align=64
9
10
11EXTERN	GFp_ia32cap_P
12
; Constant pool used by the ChaCha20 code paths in this file. The block starts
; on a 64-byte boundary, which puts $L$one, $L$inc, $L$four, $L$rot16, $L$rot24
; and $L$sigma at offsets 16, 32, 48, 128, 144 and 160 -- all multiples of 16,
; as required by the movdqa loads that read them.
13ALIGN	64
; Four zero dwords. Not referenced in the visible part of this file.
14$L$zero:
15	DD	0,0,0,0
; Added to the 16-byte counter block once per 64-byte block by the
; one-block-at-a-time paths; only dword 0 (the block counter) changes.
16$L$one:
17	DD	1,0,0,0
; Per-lane counter offsets for the four-block path: lane i processes counter+i.
18$L$inc:
19	DD	0,1,2,3
; Counter step added to every lane on each further four-block iteration.
20$L$four:
21	DD	4,4,4,4
; Per-lane counter offsets added by the ymm (eight-block) path.
; NOTE(review): the interleaved order 0,2,4,6,1,3,5,7 presumably matches the
; lane order that path's transposition produces -- confirm against that code.
22$L$incy:
23	DD	0,2,4,6,1,3,5,7
; Counter step added to every lane on each further eight-block iteration.
24$L$eight:
25	DD	8,8,8,8,8,8,8,8
; pshufb mask: within every dword, result bytes 0,1,2,3 come from source bytes
; 2,3,0,1, i.e. each 32-bit lane is rotated by 16 bits.
26$L$rot16:
27DB	0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
; pshufb mask: within every dword, result bytes 0,1,2,3 come from source bytes
; 3,0,1,2, i.e. each 32-bit lane is rotated left by 8 bits (= right by 24).
28$L$rot24:
29DB	0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
; ASCII "expand 32-byte k": the first 16 bytes (row 0) of the ChaCha20 state,
; followed by a single NUL byte.
30$L$sigma:
31DB	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
32DB	0
33ALIGN	64
; 64-byte tables. None of the four is referenced in the visible part of this
; file (NOTE(review): presumably left over from a wider-vector path -- confirm).
34$L$zeroz:
35	DD	0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
36$L$fourz:
37	DD	4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
38$L$incz:
39	DD	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
40$L$sixteen:
41	DD	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
; NUL-terminated ASCII identification string:
; "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
42DB	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
43DB	95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
44DB	98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
45DB	108,46,111,114,103,62,0
;-----------------------------------------------------------------------
; GFp_ChaCha20_ctr32 -- exported entry point, Win64 calling convention.
;
; Arguments as received -> register they are moved to by the prologue:
;   rcx      -> rdi   destination buffer (written)
;   rdx      -> rsi   source buffer (read, XORed with the keystream)
;   r8       -> rdx   number of bytes to process; 0 returns immediately
;   r9       -> rcx   32 bytes loaded as state words 4..11 (the key material)
;   [rsp+40] -> r8    16 bytes loaded as state words 12..15; dword 0 is the
;                     block counter, incremented once per 64-byte block
;                     (dwords 1..3 are presumably the nonce -- confirm with
;                     the caller)
;
; Dispatch: if bit 9 of the dword at GFp_ia32cap_P+4 is set, control jumps to
; $L$ChaCha20_ssse3 (judging by the label name, that bit is the SSSE3 flag);
; otherwise the integer-register implementation below handles the request.
;
; Stack frame of the integer path (rsp after the six pushes and the sub;
; 16-byte aligned, so the movdqa accesses below are legal):
;   [rsp+ 0..15]  written only in $L$tail (keystream words 0..3)
;   [rsp+16..31]  state words 4..7
;   [rsp+32..47]  state words 8..11; doubles as spill space for the working
;                 copies of words 8..11 while the rounds run
;   [rsp+48..63]  state words 12..15 (counter block)
;   [rsp+64]      bytes remaining
;   [rsp+72]      source pointer
;   [rsp+80]      destination pointer
;
; rdi and rsi are callee-saved on Win64: they are parked in the caller's home
; space at [rsp+8]/[rsp+16] on entry and reloaded at $L$no_data.
; NOTE(review): the $L$SEH_* / $L$ctr32_body / $L$no_data labels are presumably
; consumed by unwind tables outside the visible part of this file -- confirm.
; NOTE(review): no instruction in this function clears the stack scratch area
; before returning.
;-----------------------------------------------------------------------
46global	GFp_ChaCha20_ctr32
47
48ALIGN	64
49GFp_ChaCha20_ctr32:
50	mov	QWORD[8+rsp],rdi	;WIN64 prologue
51	mov	QWORD[16+rsp],rsi
52	mov	rax,rsp
53$L$SEH_begin_GFp_ChaCha20_ctr32:
54	mov	rdi,rcx
55	mov	rsi,rdx
56	mov	rdx,r8
57	mov	rcx,r9
58	mov	r8,QWORD[40+rsp]
59
60
61
; Zero length: skip straight to the register restore and return.
62	cmp	rdx,0
63	je	NEAR $L$no_data
; r10 = 8 bytes of capability flags starting at GFp_ia32cap_P+4. r10 is left
; untouched on the jump because $L$ChaCha20_4x inspects it again.
64	mov	r10,QWORD[((GFp_ia32cap_P+4))]
65	test	r10d,512
66	jnz	NEAR $L$ChaCha20_ssse3
67
; Integer path: save the callee-saved registers used as state words, then
; reserve 64 bytes of state scratch plus 24 bytes for len/src/dst.
68	push	rbx
69
70	push	rbp
71
72	push	r12
73
74	push	r13
75
76	push	r14
77
78	push	r15
79
80	sub	rsp,64+24
81
82$L$ctr32_body:
83
84
; xmm1 = state words 4..7, xmm2 = words 8..11, xmm3 = words 12..15,
; xmm4 = {1,0,0,0} (counter increment).
85	movdqu	xmm1,XMMWORD[rcx]
86	movdqu	xmm2,XMMWORD[16+rcx]
87	movdqu	xmm3,XMMWORD[r8]
88	movdqa	xmm4,XMMWORD[$L$one]
89
90
91	movdqa	XMMWORD[16+rsp],xmm1
92	movdqa	XMMWORD[32+rsp],xmm2
93	movdqa	XMMWORD[48+rsp],xmm3
; rbp = bytes remaining (rdx is needed as a state word below).
94	mov	rbp,rdx
95	jmp	NEAR $L$oop_outer
96
97ALIGN	32
; One pass per 64-byte block. Working state:
;   eax,ebx,ecx,edx   = words 0..3  (the constants "expa","nd 3","2-by","te k")
;   r8d..r11d         = words 4..7
;   esi,edi           = two of words 8..11 at a time, the other two live at
;                       [rsp+32..47]
;   r12d..r15d        = words 12..15 (r12d = current counter, taken from xmm3)
98$L$oop_outer:
99	mov	eax,0x61707865
100	mov	ebx,0x3320646e
101	mov	ecx,0x79622d32
102	mov	edx,0x6b206574
103	mov	r8d,DWORD[16+rsp]
104	mov	r9d,DWORD[20+rsp]
105	mov	r10d,DWORD[24+rsp]
106	mov	r11d,DWORD[28+rsp]
107	movd	r12d,xmm3
108	mov	r13d,DWORD[52+rsp]
109	mov	r14d,DWORD[56+rsp]
110	mov	r15d,DWORD[60+rsp]
111
; Park len/src/dst so rbp, rsi and rdi can be reused: ebp = 10 loop passes
; (each pass is a column round plus a diagonal round).
112	mov	QWORD[((64+0))+rsp],rbp
113	mov	ebp,10
114	mov	QWORD[((64+8))+rsp],rsi
; The DB below encodes "movq rsi,xmm2": rsi = words 8 and 9 packed together.
; The mov/shr pair then leaves word 8 in esi and word 9 in edi.
115DB	102,72,15,126,214
116	mov	QWORD[((64+16))+rsp],rdi
117	mov	rdi,rsi
118	shr	rdi,32
119	jmp	NEAR $L$oop
120
121ALIGN	32
122$L$oop:
; Column round, first pair: quarter-rounds on words (0,4,8,12) and (1,5,9,13),
; interleaved. Each quarter-round is add/xor/rol with rotations 16, 12, 8, 7.
123	add	eax,r8d
124	xor	r12d,eax
125	rol	r12d,16
126	add	ebx,r9d
127	xor	r13d,ebx
128	rol	r13d,16
129	add	esi,r12d
130	xor	r8d,esi
131	rol	r8d,12
132	add	edi,r13d
133	xor	r9d,edi
134	rol	r9d,12
135	add	eax,r8d
136	xor	r12d,eax
137	rol	r12d,8
138	add	ebx,r9d
139	xor	r13d,ebx
140	rol	r13d,8
141	add	esi,r12d
142	xor	r8d,esi
143	rol	r8d,7
144	add	edi,r13d
145	xor	r9d,edi
146	rol	r9d,7
; Swap the resident half of the third row: spill words 8,9, load words 10,11.
147	mov	DWORD[32+rsp],esi
148	mov	DWORD[36+rsp],edi
149	mov	esi,DWORD[40+rsp]
150	mov	edi,DWORD[44+rsp]
; Column round, second pair: words (2,6,10,14) and (3,7,11,15).
151	add	ecx,r10d
152	xor	r14d,ecx
153	rol	r14d,16
154	add	edx,r11d
155	xor	r15d,edx
156	rol	r15d,16
157	add	esi,r14d
158	xor	r10d,esi
159	rol	r10d,12
160	add	edi,r15d
161	xor	r11d,edi
162	rol	r11d,12
163	add	ecx,r10d
164	xor	r14d,ecx
165	rol	r14d,8
166	add	edx,r11d
167	xor	r15d,edx
168	rol	r15d,8
169	add	esi,r14d
170	xor	r10d,esi
171	rol	r10d,7
172	add	edi,r15d
173	xor	r11d,edi
174	rol	r11d,7
; Diagonal round, first pair: words (0,5,10,15) and (1,6,11,12).
175	add	eax,r9d
176	xor	r15d,eax
177	rol	r15d,16
178	add	ebx,r10d
179	xor	r12d,ebx
180	rol	r12d,16
181	add	esi,r15d
182	xor	r9d,esi
183	rol	r9d,12
184	add	edi,r12d
185	xor	r10d,edi
186	rol	r10d,12
187	add	eax,r9d
188	xor	r15d,eax
189	rol	r15d,8
190	add	ebx,r10d
191	xor	r12d,ebx
192	rol	r12d,8
193	add	esi,r15d
194	xor	r9d,esi
195	rol	r9d,7
196	add	edi,r12d
197	xor	r10d,edi
198	rol	r10d,7
; Swap back: spill words 10,11, reload words 8,9.
199	mov	DWORD[40+rsp],esi
200	mov	DWORD[44+rsp],edi
201	mov	esi,DWORD[32+rsp]
202	mov	edi,DWORD[36+rsp]
; Diagonal round, second pair: words (2,7,8,13) and (3,4,9,14).
203	add	ecx,r11d
204	xor	r13d,ecx
205	rol	r13d,16
206	add	edx,r8d
207	xor	r14d,edx
208	rol	r14d,16
209	add	esi,r13d
210	xor	r11d,esi
211	rol	r11d,12
212	add	edi,r14d
213	xor	r8d,edi
214	rol	r8d,12
215	add	ecx,r11d
216	xor	r13d,ecx
217	rol	r13d,8
218	add	edx,r8d
219	xor	r14d,edx
220	rol	r14d,8
221	add	esi,r13d
222	xor	r11d,esi
223	rol	r11d,7
224	add	edi,r14d
225	xor	r8d,edi
226	rol	r8d,7
227	dec	ebp
228	jnz	NEAR $L$oop
; Rounds done. Flush words 8,9 so [rsp+32..47] holds the whole working third
; row, recover len/src/dst, and advance the counter held in xmm3.
229	mov	DWORD[36+rsp],edi
230	mov	DWORD[32+rsp],esi
231	mov	rbp,QWORD[64+rsp]
232	movdqa	xmm1,xmm2
233	mov	rsi,QWORD[((64+8))+rsp]
234	paddd	xmm3,xmm4
235	mov	rdi,QWORD[((64+16))+rsp]
236
; Feed-forward: add the block's input state to the working state.
; [rsp+48] still holds the counter this block was generated with (the stack
; copy is only updated further down), and xmm1 = input words 8..11 plus the
; working third row.
237	add	eax,0x61707865
238	add	ebx,0x3320646e
239	add	ecx,0x79622d32
240	add	edx,0x6b206574
241	add	r8d,DWORD[16+rsp]
242	add	r9d,DWORD[20+rsp]
243	add	r10d,DWORD[24+rsp]
244	add	r11d,DWORD[28+rsp]
245	add	r12d,DWORD[48+rsp]
246	add	r13d,DWORD[52+rsp]
247	add	r14d,DWORD[56+rsp]
248	add	r15d,DWORD[60+rsp]
249	paddd	xmm1,XMMWORD[32+rsp]
250
; Fewer than 64 bytes left: finish byte-wise in $L$tail.
251	cmp	rbp,64
252	jb	NEAR $L$tail
253
; Full block: XOR 64 source bytes with the keystream.
254	xor	eax,DWORD[rsi]
255	xor	ebx,DWORD[4+rsi]
256	xor	ecx,DWORD[8+rsi]
257	xor	edx,DWORD[12+rsi]
258	xor	r8d,DWORD[16+rsi]
259	xor	r9d,DWORD[20+rsi]
260	xor	r10d,DWORD[24+rsi]
261	xor	r11d,DWORD[28+rsi]
262	movdqu	xmm0,XMMWORD[32+rsi]
263	xor	r12d,DWORD[48+rsi]
264	xor	r13d,DWORD[52+rsi]
265	xor	r14d,DWORD[56+rsi]
266	xor	r15d,DWORD[60+rsi]
267	lea	rsi,[64+rsi]
268	pxor	xmm0,xmm1
269
; Re-arm the frame for the next block: restore input words 8..11 (the slot was
; used as spill space) and store the incremented counter.
270	movdqa	XMMWORD[32+rsp],xmm2
271	movd	DWORD[48+rsp],xmm3
272
273	mov	DWORD[rdi],eax
274	mov	DWORD[4+rdi],ebx
275	mov	DWORD[8+rdi],ecx
276	mov	DWORD[12+rdi],edx
277	mov	DWORD[16+rdi],r8d
278	mov	DWORD[20+rdi],r9d
279	mov	DWORD[24+rdi],r10d
280	mov	DWORD[28+rdi],r11d
281	movdqu	XMMWORD[32+rdi],xmm0
282	mov	DWORD[48+rdi],r12d
283	mov	DWORD[52+rdi],r13d
284	mov	DWORD[56+rdi],r14d
285	mov	DWORD[60+rdi],r15d
286	lea	rdi,[64+rdi]
287
288	sub	rbp,64
289	jnz	NEAR $L$oop_outer
290
291	jmp	NEAR $L$done
292
293ALIGN	16
; Partial final block (rbp is 1..63 here): lay the 64 keystream bytes out at
; [rsp..rsp+63] -- overwriting the saved input state, which is no longer
; needed -- and XOR them into the output one byte at a time. rbx = byte index.
294$L$tail:
295	mov	DWORD[rsp],eax
296	mov	DWORD[4+rsp],ebx
297	xor	rbx,rbx
298	mov	DWORD[8+rsp],ecx
299	mov	DWORD[12+rsp],edx
300	mov	DWORD[16+rsp],r8d
301	mov	DWORD[20+rsp],r9d
302	mov	DWORD[24+rsp],r10d
303	mov	DWORD[28+rsp],r11d
304	movdqa	XMMWORD[32+rsp],xmm1
305	mov	DWORD[48+rsp],r12d
306	mov	DWORD[52+rsp],r13d
307	mov	DWORD[56+rsp],r14d
308	mov	DWORD[60+rsp],r15d
309
310$L$oop_tail:
311	movzx	eax,BYTE[rbx*1+rsi]
312	movzx	edx,BYTE[rbx*1+rsp]
313	lea	rbx,[1+rbx]
314	xor	eax,edx
315	mov	BYTE[((-1))+rbx*1+rdi],al
316	dec	rbp
317	jnz	NEAR $L$oop_tail
318
; Epilogue: rsi = rsp + 88 (locals) + 48 (six pushes) = rsp at function entry;
; the pushed registers are reloaded relative to it.
319$L$done:
320	lea	rsi,[((64+24+48))+rsp]
321	mov	r15,QWORD[((-48))+rsi]
322
323	mov	r14,QWORD[((-40))+rsi]
324
325	mov	r13,QWORD[((-32))+rsi]
326
327	mov	r12,QWORD[((-24))+rsi]
328
329	mov	rbp,QWORD[((-16))+rsi]
330
331	mov	rbx,QWORD[((-8))+rsi]
332
333	lea	rsp,[rsi]
334
; Common exit: reload the caller's rdi/rsi saved by the prologue. The DB pair
; encodes "rep ret".
335$L$no_data:
336	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
337	mov	rsi,QWORD[16+rsp]
338	DB	0F3h,0C3h		;repret
339
340$L$SEH_end_GFp_ChaCha20_ctr32:
341
;-----------------------------------------------------------------------
; ChaCha20_ssse3 -- one 64-byte block per iteration, whole state in xmm0..xmm3.
;
; Two ways in:
;   * the ChaCha20_ssse3 label performs the same Win64 argument shuffle as
;     GFp_ChaCha20_ctr32; it is not declared global and nothing in the visible
;     part of this file calls it;
;   * GFp_ChaCha20_ctr32 jumps to $L$ChaCha20_ssse3 with the arguments already
;     moved and its saved rdi/rsi sitting at [rsp+8]/[rsp+16], which is what
;     $L$ssse3_epilogue reloads.
;
; Registers from $L$ChaCha20_ssse3 on:
;   rdi = destination, rsi = source, rdx = bytes remaining,
;   rcx = 32 bytes of key material, r8 = 16-byte counter block (r8 is reused
;   as the round counter once the block has been loaded),
;   r9  = rsp at entry (frame anchor used to restore xmm6/xmm7 and rsp),
;   r10 = capability flags loaded by GFp_ChaCha20_ctr32 (consumed by the 4x path).
;
; Requests longer than 128 bytes are handed to $L$ChaCha20_4x, which may jump
; back to $L$do_sse3_after_all.
;
; Frame (rsp = r9 - 104, 16-byte aligned):
;   [rsp+ 0..63]  input state rows 0..3 (row 3 carries the running counter)
;   [r9-40], [r9-24]  saved xmm6, xmm7 (callee-saved on Win64)
;-----------------------------------------------------------------------
342ALIGN	32
343ChaCha20_ssse3:
344	mov	QWORD[8+rsp],rdi	;WIN64 prologue
345	mov	QWORD[16+rsp],rsi
346	mov	rax,rsp
347$L$SEH_begin_ChaCha20_ssse3:
348	mov	rdi,rcx
349	mov	rsi,rdx
350	mov	rdx,r8
351	mov	rcx,r9
352	mov	r8,QWORD[40+rsp]
353
354
355$L$ChaCha20_ssse3:
356
357	mov	r9,rsp
358
359	cmp	rdx,128
360	ja	NEAR $L$ChaCha20_4x
361
362$L$do_sse3_after_all:
363	sub	rsp,64+40
364	movaps	XMMWORD[(-40)+r9],xmm6
365	movaps	XMMWORD[(-24)+r9],xmm7
366$L$ssse3_body:
; xmm0..xmm3 = state rows 0..3; xmm6/xmm7 = pshufb masks for the 16-bit and
; 8-bit lane rotations. A copy of the rows is kept on the stack for the
; feed-forward addition and for re-seeding the next block.
367	movdqa	xmm0,XMMWORD[$L$sigma]
368	movdqu	xmm1,XMMWORD[rcx]
369	movdqu	xmm2,XMMWORD[16+rcx]
370	movdqu	xmm3,XMMWORD[r8]
371	movdqa	xmm6,XMMWORD[$L$rot16]
372	movdqa	xmm7,XMMWORD[$L$rot24]
373
374	movdqa	XMMWORD[rsp],xmm0
375	movdqa	XMMWORD[16+rsp],xmm1
376	movdqa	XMMWORD[32+rsp],xmm2
377	movdqa	XMMWORD[48+rsp],xmm3
; r8 = 10 loop passes (column round + diagonal round each).
378	mov	r8,10
379	jmp	NEAR $L$oop_ssse3
380
381ALIGN	32
; Next block: reload rows 0..2 from the stack and bump the counter dword of
; row 3 by one, writing the new row 3 back to the stack.
382$L$oop_outer_ssse3:
383	movdqa	xmm3,XMMWORD[$L$one]
384	movdqa	xmm0,XMMWORD[rsp]
385	movdqa	xmm1,XMMWORD[16+rsp]
386	movdqa	xmm2,XMMWORD[32+rsp]
387	paddd	xmm3,XMMWORD[48+rsp]
388	mov	r8,10
389	movdqa	XMMWORD[48+rsp],xmm3
390	jmp	NEAR $L$oop_ssse3
391
392ALIGN	32
393$L$oop_ssse3:
; Column round: four quarter-rounds in parallel, one per 32-bit lane.
; "DB 102,15,56,0,222" encodes pshufb xmm3,xmm6 (rotate lanes by 16 bits);
; "DB 102,15,56,0,223" encodes pshufb xmm3,xmm7 (rotate lanes left by 8 bits).
; The 12- and 7-bit rotations are built from a shift pair plus por via xmm4.
394	paddd	xmm0,xmm1
395	pxor	xmm3,xmm0
396DB	102,15,56,0,222
397	paddd	xmm2,xmm3
398	pxor	xmm1,xmm2
399	movdqa	xmm4,xmm1
400	psrld	xmm1,20
401	pslld	xmm4,12
402	por	xmm1,xmm4
403	paddd	xmm0,xmm1
404	pxor	xmm3,xmm0
405DB	102,15,56,0,223
406	paddd	xmm2,xmm3
407	pxor	xmm1,xmm2
408	movdqa	xmm4,xmm1
409	psrld	xmm1,25
410	pslld	xmm4,7
411	por	xmm1,xmm4
; Rotate rows 1, 2 and 3 by one, two and three lanes respectively so the
; diagonals line up in columns for the next four quarter-rounds.
412	pshufd	xmm2,xmm2,78
413	pshufd	xmm1,xmm1,57
414	pshufd	xmm3,xmm3,147
415	nop
; Diagonal round: same instruction sequence on the rotated rows.
416	paddd	xmm0,xmm1
417	pxor	xmm3,xmm0
418DB	102,15,56,0,222
419	paddd	xmm2,xmm3
420	pxor	xmm1,xmm2
421	movdqa	xmm4,xmm1
422	psrld	xmm1,20
423	pslld	xmm4,12
424	por	xmm1,xmm4
425	paddd	xmm0,xmm1
426	pxor	xmm3,xmm0
427DB	102,15,56,0,223
428	paddd	xmm2,xmm3
429	pxor	xmm1,xmm2
430	movdqa	xmm4,xmm1
431	psrld	xmm1,25
432	pslld	xmm4,7
433	por	xmm1,xmm4
; Undo the lane rotation (the immediates for rows 1 and 3 are swapped).
434	pshufd	xmm2,xmm2,78
435	pshufd	xmm1,xmm1,147
436	pshufd	xmm3,xmm3,57
437	dec	r8
438	jnz	NEAR $L$oop_ssse3
; Feed-forward: add the block's input state; xmm0..xmm3 = 64 keystream bytes.
439	paddd	xmm0,XMMWORD[rsp]
440	paddd	xmm1,XMMWORD[16+rsp]
441	paddd	xmm2,XMMWORD[32+rsp]
442	paddd	xmm3,XMMWORD[48+rsp]
443
444	cmp	rdx,64
445	jb	NEAR $L$tail_ssse3
446
; Full block: XOR 64 source bytes with the keystream and store them.
447	movdqu	xmm4,XMMWORD[rsi]
448	movdqu	xmm5,XMMWORD[16+rsi]
449	pxor	xmm0,xmm4
450	movdqu	xmm4,XMMWORD[32+rsi]
451	pxor	xmm1,xmm5
452	movdqu	xmm5,XMMWORD[48+rsi]
453	lea	rsi,[64+rsi]
454	pxor	xmm2,xmm4
455	pxor	xmm3,xmm5
456
457	movdqu	XMMWORD[rdi],xmm0
458	movdqu	XMMWORD[16+rdi],xmm1
459	movdqu	XMMWORD[32+rdi],xmm2
460	movdqu	XMMWORD[48+rdi],xmm3
461	lea	rdi,[64+rdi]
462
463	sub	rdx,64
464	jnz	NEAR $L$oop_outer_ssse3
465
466	jmp	NEAR $L$done_ssse3
467
468ALIGN	16
; Partial final block: spill the keystream over the saved state (no further
; block follows) and XOR byte by byte. r8 = byte index, rdx = bytes left.
469$L$tail_ssse3:
470	movdqa	XMMWORD[rsp],xmm0
471	movdqa	XMMWORD[16+rsp],xmm1
472	movdqa	XMMWORD[32+rsp],xmm2
473	movdqa	XMMWORD[48+rsp],xmm3
474	xor	r8,r8
475
476$L$oop_tail_ssse3:
477	movzx	eax,BYTE[r8*1+rsi]
478	movzx	ecx,BYTE[r8*1+rsp]
479	lea	r8,[1+r8]
480	xor	eax,ecx
481	mov	BYTE[((-1))+r8*1+rdi],al
482	dec	rdx
483	jnz	NEAR $L$oop_tail_ssse3
484
; Restore xmm6/xmm7 and the entry rsp from the r9 anchor, then the caller's
; rdi/rsi from the home space. The DB pair encodes "rep ret".
485$L$done_ssse3:
486	movaps	xmm6,XMMWORD[((-40))+r9]
487	movaps	xmm7,XMMWORD[((-24))+r9]
488	lea	rsp,[r9]
489
490$L$ssse3_epilogue:
491	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
492	mov	rsi,QWORD[16+rsp]
493	DB	0F3h,0C3h		;repret
494
495$L$SEH_end_ChaCha20_ssse3:
496
;-----------------------------------------------------------------------
; ChaCha20_4x -- four 64-byte blocks per iteration. Every xmm register holds
; the same state word for four consecutive blocks, one block per 32-bit lane.
;
; Two ways in:
;   * the ChaCha20_4x label performs the Win64 argument shuffle; it is not
;     declared global and nothing in the visible part of this file calls it;
;   * $L$ChaCha20_ssse3 jumps to $L$ChaCha20_4x when more than 128 bytes are
;     requested, with rdi = destination, rsi = source, rdx = byte count,
;     rcx = key material, r8 = counter block and r10 = the capability flags
;     loaded by GFp_ChaCha20_ctr32.
;
; Dispatch at $L$ChaCha20_4x:
;   * bit 5 of the upper dword of r10 set -> $L$ChaCha20_8x (presumably the
;     AVX2 flag, given that path's ymm code -- confirm);
;   * otherwise, for requests of at most 192 bytes, bits 22 and 26 of the lower
;     dword are examined and "bit 22 set, bit 26 clear" falls back to the
;     one-block path at $L$do_sse3_after_all (NOTE(review): presumably selects
;     a CPU family on which 4x does not pay off for short inputs -- confirm
;     against the generating Perl script).
;
; Frame (rsp = r9 - 488, 16-byte aligned; rcx is repointed to rsp+256):
;   [rsp+  0.. 63]  scratch: spill slots for the third state row during the
;                   rounds, then transposed keystream for words 0..3
;   [rsp+ 64..127]  state words 0..3 (sigma), each broadcast to four lanes
;   [rsp+128..191]  state words 4..7, broadcast
;   [rsp+192..255]  state words 8..11, broadcast
;   [rsp+256..271]  per-lane block counters (word 12)
;   [rsp+272..319]  state words 13..15, broadcast
;   [r9-168 .. r9-9]  saved xmm6..xmm15 (callee-saved on Win64)
;-----------------------------------------------------------------------
497ALIGN	32
498ChaCha20_4x:
499	mov	QWORD[8+rsp],rdi	;WIN64 prologue
500	mov	QWORD[16+rsp],rsi
501	mov	rax,rsp
502$L$SEH_begin_ChaCha20_4x:
503	mov	rdi,rcx
504	mov	rsi,rdx
505	mov	rdx,r8
506	mov	rcx,r9
507	mov	r8,QWORD[40+rsp]
508
509
510$L$ChaCha20_4x:
511
; r9 = rsp at entry: anchor for the xmm save area and for restoring rsp.
512	mov	r9,rsp
513
; r11 keeps the full flag qword; r10 is reduced to its upper dword.
514	mov	r11,r10
515	shr	r10,32
516	test	r10,32
517	jnz	NEAR $L$ChaCha20_8x
518	cmp	rdx,192
519	ja	NEAR $L$proceed4x
520
; 71303168 = 0x4400000 (bits 22 and 26); 4194304 = 0x400000 (bit 22 only).
521	and	r11,71303168
522	cmp	r11,4194304
523	je	NEAR $L$do_sse3_after_all
524
525$L$proceed4x:
; 0x140 bytes of locals plus 168 bytes holding xmm6..xmm15.
526	sub	rsp,0x140+168
527	movaps	XMMWORD[(-168)+r9],xmm6
528	movaps	XMMWORD[(-152)+r9],xmm7
529	movaps	XMMWORD[(-136)+r9],xmm8
530	movaps	XMMWORD[(-120)+r9],xmm9
531	movaps	XMMWORD[(-104)+r9],xmm10
532	movaps	XMMWORD[(-88)+r9],xmm11
533	movaps	XMMWORD[(-72)+r9],xmm12
534	movaps	XMMWORD[(-56)+r9],xmm13
535	movaps	XMMWORD[(-40)+r9],xmm14
536	movaps	XMMWORD[(-24)+r9],xmm15
537$L$4x_body:
; Load the four state rows, then repurpose rcx as a frame pointer (rsp+256)
; and r10/r11 as pointers to the two pshufb rotation masks.
538	movdqa	xmm11,XMMWORD[$L$sigma]
539	movdqu	xmm15,XMMWORD[rcx]
540	movdqu	xmm7,XMMWORD[16+rcx]
541	movdqu	xmm3,XMMWORD[r8]
542	lea	rcx,[256+rsp]
543	lea	r10,[$L$rot16]
544	lea	r11,[$L$rot24]
545
; Broadcast each state word across the four lanes (pshufd 0x00/0x55/0xaa/0xff
; replicate dword 0/1/2/3) and save the broadcast copies in the frame.
; Words 0..3 -> xmm8..xmm11.
546	pshufd	xmm8,xmm11,0x00
547	pshufd	xmm9,xmm11,0x55
548	movdqa	XMMWORD[64+rsp],xmm8
549	pshufd	xmm10,xmm11,0xaa
550	movdqa	XMMWORD[80+rsp],xmm9
551	pshufd	xmm11,xmm11,0xff
552	movdqa	XMMWORD[96+rsp],xmm10
553	movdqa	XMMWORD[112+rsp],xmm11
554
; Words 4..7 -> xmm12..xmm15.
555	pshufd	xmm12,xmm15,0x00
556	pshufd	xmm13,xmm15,0x55
557	movdqa	XMMWORD[(128-256)+rcx],xmm12
558	pshufd	xmm14,xmm15,0xaa
559	movdqa	XMMWORD[(144-256)+rcx],xmm13
560	pshufd	xmm15,xmm15,0xff
561	movdqa	XMMWORD[(160-256)+rcx],xmm14
562	movdqa	XMMWORD[(176-256)+rcx],xmm15
563
; Words 8..11 -> xmm4..xmm7.
564	pshufd	xmm4,xmm7,0x00
565	pshufd	xmm5,xmm7,0x55
566	movdqa	XMMWORD[(192-256)+rcx],xmm4
567	pshufd	xmm6,xmm7,0xaa
568	movdqa	XMMWORD[(208-256)+rcx],xmm5
569	pshufd	xmm7,xmm7,0xff
570	movdqa	XMMWORD[(224-256)+rcx],xmm6
571	movdqa	XMMWORD[(240-256)+rcx],xmm7
572
; Words 12..15 -> xmm0..xmm3. The counter lanes get +0,+1,+2,+3 so the four
; lanes work on consecutive blocks; they are stored at $L$oop_enter4x.
573	pshufd	xmm0,xmm3,0x00
574	pshufd	xmm1,xmm3,0x55
575	paddd	xmm0,XMMWORD[$L$inc]
576	pshufd	xmm2,xmm3,0xaa
577	movdqa	XMMWORD[(272-256)+rcx],xmm1
578	pshufd	xmm3,xmm3,0xff
579	movdqa	XMMWORD[(288-256)+rcx],xmm2
580	movdqa	XMMWORD[(304-256)+rcx],xmm3
581
582	jmp	NEAR $L$oop_enter4x
583
584ALIGN	32
; Next group of four blocks: reload the broadcast state from the frame and
; advance every counter lane by 4.
585$L$oop_outer4x:
586	movdqa	xmm8,XMMWORD[64+rsp]
587	movdqa	xmm9,XMMWORD[80+rsp]
588	movdqa	xmm10,XMMWORD[96+rsp]
589	movdqa	xmm11,XMMWORD[112+rsp]
590	movdqa	xmm12,XMMWORD[((128-256))+rcx]
591	movdqa	xmm13,XMMWORD[((144-256))+rcx]
592	movdqa	xmm14,XMMWORD[((160-256))+rcx]
593	movdqa	xmm15,XMMWORD[((176-256))+rcx]
594	movdqa	xmm4,XMMWORD[((192-256))+rcx]
595	movdqa	xmm5,XMMWORD[((208-256))+rcx]
596	movdqa	xmm6,XMMWORD[((224-256))+rcx]
597	movdqa	xmm7,XMMWORD[((240-256))+rcx]
598	movdqa	xmm0,XMMWORD[((256-256))+rcx]
599	movdqa	xmm1,XMMWORD[((272-256))+rcx]
600	movdqa	xmm2,XMMWORD[((288-256))+rcx]
601	movdqa	xmm3,XMMWORD[((304-256))+rcx]
602	paddd	xmm0,XMMWORD[$L$four]
603
; Sixteen state words do not fit next to two temporaries: words 10 and 11 are
; spilled so xmm6/xmm7 can serve as scratch and mask registers. xmm7 starts out
; as the rot16 mask, eax = 10 loop passes, and the counter lanes for this
; group are recorded for the feed-forward and for the next group.
604$L$oop_enter4x:
605	movdqa	XMMWORD[32+rsp],xmm6
606	movdqa	XMMWORD[48+rsp],xmm7
607	movdqa	xmm7,XMMWORD[r10]
608	mov	eax,10
609	movdqa	XMMWORD[(256-256)+rcx],xmm0
610	jmp	NEAR $L$oop4x
611
612ALIGN	32
; Loop body. Register roles: xmm8..xmm11 = words 0..3, xmm12..xmm15 = words
; 4..7, xmm4/xmm5 = the resident pair of words 8..11, xmm0..xmm3 = words
; 12..15. xmm6/xmm7 alternate between shift temporaries and the rot24 ([r11])
; and rot16 ([r10]) masks, which is why the masks are reloaded repeatedly.
; The DB lines encode pshufb: 199/207/215/223 = pshufb xmm0/1/2/3,xmm7 and
; 198/206/214/222 = pshufb xmm0/1/2/3,xmm6.
613$L$oop4x:
; Column round, quarter-rounds on words (0,4,8,12) and (1,5,9,13).
614	paddd	xmm8,xmm12
615	paddd	xmm9,xmm13
616	pxor	xmm0,xmm8
617	pxor	xmm1,xmm9
618DB	102,15,56,0,199
619DB	102,15,56,0,207
620	paddd	xmm4,xmm0
621	paddd	xmm5,xmm1
622	pxor	xmm12,xmm4
623	pxor	xmm13,xmm5
624	movdqa	xmm6,xmm12
625	pslld	xmm12,12
626	psrld	xmm6,20
627	movdqa	xmm7,xmm13
628	pslld	xmm13,12
629	por	xmm12,xmm6
630	psrld	xmm7,20
631	movdqa	xmm6,XMMWORD[r11]
632	por	xmm13,xmm7
633	paddd	xmm8,xmm12
634	paddd	xmm9,xmm13
635	pxor	xmm0,xmm8
636	pxor	xmm1,xmm9
637DB	102,15,56,0,198
638DB	102,15,56,0,206
639	paddd	xmm4,xmm0
640	paddd	xmm5,xmm1
641	pxor	xmm12,xmm4
642	pxor	xmm13,xmm5
643	movdqa	xmm7,xmm12
644	pslld	xmm12,7
645	psrld	xmm7,25
646	movdqa	xmm6,xmm13
647	pslld	xmm13,7
648	por	xmm12,xmm7
649	psrld	xmm6,25
650	movdqa	xmm7,XMMWORD[r10]
651	por	xmm13,xmm6
; Swap the resident pair: spill words 8,9, load words 10,11.
652	movdqa	XMMWORD[rsp],xmm4
653	movdqa	XMMWORD[16+rsp],xmm5
654	movdqa	xmm4,XMMWORD[32+rsp]
655	movdqa	xmm5,XMMWORD[48+rsp]
; Column round, quarter-rounds on words (2,6,10,14) and (3,7,11,15).
656	paddd	xmm10,xmm14
657	paddd	xmm11,xmm15
658	pxor	xmm2,xmm10
659	pxor	xmm3,xmm11
660DB	102,15,56,0,215
661DB	102,15,56,0,223
662	paddd	xmm4,xmm2
663	paddd	xmm5,xmm3
664	pxor	xmm14,xmm4
665	pxor	xmm15,xmm5
666	movdqa	xmm6,xmm14
667	pslld	xmm14,12
668	psrld	xmm6,20
669	movdqa	xmm7,xmm15
670	pslld	xmm15,12
671	por	xmm14,xmm6
672	psrld	xmm7,20
673	movdqa	xmm6,XMMWORD[r11]
674	por	xmm15,xmm7
675	paddd	xmm10,xmm14
676	paddd	xmm11,xmm15
677	pxor	xmm2,xmm10
678	pxor	xmm3,xmm11
679DB	102,15,56,0,214
680DB	102,15,56,0,222
681	paddd	xmm4,xmm2
682	paddd	xmm5,xmm3
683	pxor	xmm14,xmm4
684	pxor	xmm15,xmm5
685	movdqa	xmm7,xmm14
686	pslld	xmm14,7
687	psrld	xmm7,25
688	movdqa	xmm6,xmm15
689	pslld	xmm15,7
690	por	xmm14,xmm7
691	psrld	xmm6,25
692	movdqa	xmm7,XMMWORD[r10]
693	por	xmm15,xmm6
; Diagonal round, quarter-rounds on words (0,5,10,15) and (1,6,11,12).
694	paddd	xmm8,xmm13
695	paddd	xmm9,xmm14
696	pxor	xmm3,xmm8
697	pxor	xmm0,xmm9
698DB	102,15,56,0,223
699DB	102,15,56,0,199
700	paddd	xmm4,xmm3
701	paddd	xmm5,xmm0
702	pxor	xmm13,xmm4
703	pxor	xmm14,xmm5
704	movdqa	xmm6,xmm13
705	pslld	xmm13,12
706	psrld	xmm6,20
707	movdqa	xmm7,xmm14
708	pslld	xmm14,12
709	por	xmm13,xmm6
710	psrld	xmm7,20
711	movdqa	xmm6,XMMWORD[r11]
712	por	xmm14,xmm7
713	paddd	xmm8,xmm13
714	paddd	xmm9,xmm14
715	pxor	xmm3,xmm8
716	pxor	xmm0,xmm9
717DB	102,15,56,0,222
718DB	102,15,56,0,198
719	paddd	xmm4,xmm3
720	paddd	xmm5,xmm0
721	pxor	xmm13,xmm4
722	pxor	xmm14,xmm5
723	movdqa	xmm7,xmm13
724	pslld	xmm13,7
725	psrld	xmm7,25
726	movdqa	xmm6,xmm14
727	pslld	xmm14,7
728	por	xmm13,xmm7
729	psrld	xmm6,25
730	movdqa	xmm7,XMMWORD[r10]
731	por	xmm14,xmm6
; Swap back: spill words 10,11, reload words 8,9.
732	movdqa	XMMWORD[32+rsp],xmm4
733	movdqa	XMMWORD[48+rsp],xmm5
734	movdqa	xmm4,XMMWORD[rsp]
735	movdqa	xmm5,XMMWORD[16+rsp]
; Diagonal round, quarter-rounds on words (2,7,8,13) and (3,4,9,14).
736	paddd	xmm10,xmm15
737	paddd	xmm11,xmm12
738	pxor	xmm1,xmm10
739	pxor	xmm2,xmm11
740DB	102,15,56,0,207
741DB	102,15,56,0,215
742	paddd	xmm4,xmm1
743	paddd	xmm5,xmm2
744	pxor	xmm15,xmm4
745	pxor	xmm12,xmm5
746	movdqa	xmm6,xmm15
747	pslld	xmm15,12
748	psrld	xmm6,20
749	movdqa	xmm7,xmm12
750	pslld	xmm12,12
751	por	xmm15,xmm6
752	psrld	xmm7,20
753	movdqa	xmm6,XMMWORD[r11]
754	por	xmm12,xmm7
755	paddd	xmm10,xmm15
756	paddd	xmm11,xmm12
757	pxor	xmm1,xmm10
758	pxor	xmm2,xmm11
759DB	102,15,56,0,206
760DB	102,15,56,0,214
761	paddd	xmm4,xmm1
762	paddd	xmm5,xmm2
763	pxor	xmm15,xmm4
764	pxor	xmm12,xmm5
765	movdqa	xmm7,xmm15
766	pslld	xmm15,7
767	psrld	xmm7,25
768	movdqa	xmm6,xmm12
769	pslld	xmm12,7
770	por	xmm15,xmm7
771	psrld	xmm6,25
772	movdqa	xmm7,XMMWORD[r10]
773	por	xmm12,xmm6
774	dec	eax
775	jnz	NEAR $L$oop4x
776
; Feed-forward and transpose, one state row at a time. Before the transpose a
; register holds one word for four blocks; the punpck{l,h}dq / punpck{l,h}qdq
; sequence turns each group of four registers into four registers that each
; hold 16 contiguous keystream bytes of a single block.
; Words 0..3: results for blocks 0,1,2,3 end up in xmm8, xmm9, xmm6, xmm11.
777	paddd	xmm8,XMMWORD[64+rsp]
778	paddd	xmm9,XMMWORD[80+rsp]
779	paddd	xmm10,XMMWORD[96+rsp]
780	paddd	xmm11,XMMWORD[112+rsp]
781
782	movdqa	xmm6,xmm8
783	punpckldq	xmm8,xmm9
784	movdqa	xmm7,xmm10
785	punpckldq	xmm10,xmm11
786	punpckhdq	xmm6,xmm9
787	punpckhdq	xmm7,xmm11
788	movdqa	xmm9,xmm8
789	punpcklqdq	xmm8,xmm10
790	movdqa	xmm11,xmm6
791	punpcklqdq	xmm6,xmm7
792	punpckhqdq	xmm9,xmm10
793	punpckhqdq	xmm11,xmm7
794	paddd	xmm12,XMMWORD[((128-256))+rcx]
795	paddd	xmm13,XMMWORD[((144-256))+rcx]
796	paddd	xmm14,XMMWORD[((160-256))+rcx]
797	paddd	xmm15,XMMWORD[((176-256))+rcx]
798
; Park blocks 0 and 1 of words 0..3 and fetch the spilled words 10,11 into the
; registers just freed.
799	movdqa	XMMWORD[rsp],xmm8
800	movdqa	XMMWORD[16+rsp],xmm9
801	movdqa	xmm8,XMMWORD[32+rsp]
802	movdqa	xmm9,XMMWORD[48+rsp]
803
; Words 4..7: results for blocks 0,1,2,3 end up in xmm12, xmm13, xmm10, xmm15.
804	movdqa	xmm10,xmm12
805	punpckldq	xmm12,xmm13
806	movdqa	xmm7,xmm14
807	punpckldq	xmm14,xmm15
808	punpckhdq	xmm10,xmm13
809	punpckhdq	xmm7,xmm15
810	movdqa	xmm13,xmm12
811	punpcklqdq	xmm12,xmm14
812	movdqa	xmm15,xmm10
813	punpcklqdq	xmm10,xmm7
814	punpckhqdq	xmm13,xmm14
815	punpckhqdq	xmm15,xmm7
816	paddd	xmm4,XMMWORD[((192-256))+rcx]
817	paddd	xmm5,XMMWORD[((208-256))+rcx]
818	paddd	xmm8,XMMWORD[((224-256))+rcx]
819	paddd	xmm9,XMMWORD[((240-256))+rcx]
820
; Park blocks 2 and 3 of words 0..3.
821	movdqa	XMMWORD[32+rsp],xmm6
822	movdqa	XMMWORD[48+rsp],xmm11
823
; Words 8..11: results for blocks 0,1,2,3 end up in xmm4, xmm5, xmm14, xmm9.
824	movdqa	xmm14,xmm4
825	punpckldq	xmm4,xmm5
826	movdqa	xmm7,xmm8
827	punpckldq	xmm8,xmm9
828	punpckhdq	xmm14,xmm5
829	punpckhdq	xmm7,xmm9
830	movdqa	xmm5,xmm4
831	punpcklqdq	xmm4,xmm8
832	movdqa	xmm9,xmm14
833	punpcklqdq	xmm14,xmm7
834	punpckhqdq	xmm5,xmm8
835	punpckhqdq	xmm9,xmm7
836	paddd	xmm0,XMMWORD[((256-256))+rcx]
837	paddd	xmm1,XMMWORD[((272-256))+rcx]
838	paddd	xmm2,XMMWORD[((288-256))+rcx]
839	paddd	xmm3,XMMWORD[((304-256))+rcx]
840
; Words 12..15: results for blocks 0,1,2,3 end up in xmm0, xmm1, xmm8, xmm3.
841	movdqa	xmm8,xmm0
842	punpckldq	xmm0,xmm1
843	movdqa	xmm7,xmm2
844	punpckldq	xmm2,xmm3
845	punpckhdq	xmm8,xmm1
846	punpckhdq	xmm7,xmm3
847	movdqa	xmm1,xmm0
848	punpcklqdq	xmm0,xmm2
849	movdqa	xmm3,xmm8
850	punpcklqdq	xmm8,xmm7
851	punpckhqdq	xmm1,xmm2
852	punpckhqdq	xmm3,xmm7
; Keystream layout at this point (16 bytes each, in output order):
;   block 0: [rsp],    xmm12, xmm4,  xmm0
;   block 1: [rsp+16], xmm13, xmm5,  xmm1
;   block 2: [rsp+32], xmm10, xmm14, xmm8
;   block 3: [rsp+48], xmm15, xmm9,  xmm3
853	cmp	rdx,64*4
854	jb	NEAR $L$tail4x
855
; At least 256 bytes left: XOR and store all four blocks; loads of the next
; source chunk are interleaved with stores of the previous one.
856	movdqu	xmm6,XMMWORD[rsi]
857	movdqu	xmm11,XMMWORD[16+rsi]
858	movdqu	xmm2,XMMWORD[32+rsi]
859	movdqu	xmm7,XMMWORD[48+rsi]
860	pxor	xmm6,XMMWORD[rsp]
861	pxor	xmm11,xmm12
862	pxor	xmm2,xmm4
863	pxor	xmm7,xmm0
864
865	movdqu	XMMWORD[rdi],xmm6
866	movdqu	xmm6,XMMWORD[64+rsi]
867	movdqu	XMMWORD[16+rdi],xmm11
868	movdqu	xmm11,XMMWORD[80+rsi]
869	movdqu	XMMWORD[32+rdi],xmm2
870	movdqu	xmm2,XMMWORD[96+rsi]
871	movdqu	XMMWORD[48+rdi],xmm7
872	movdqu	xmm7,XMMWORD[112+rsi]
873	lea	rsi,[128+rsi]
874	pxor	xmm6,XMMWORD[16+rsp]
875	pxor	xmm11,xmm13
876	pxor	xmm2,xmm5
877	pxor	xmm7,xmm1
878
879	movdqu	XMMWORD[64+rdi],xmm6
880	movdqu	xmm6,XMMWORD[rsi]
881	movdqu	XMMWORD[80+rdi],xmm11
882	movdqu	xmm11,XMMWORD[16+rsi]
883	movdqu	XMMWORD[96+rdi],xmm2
884	movdqu	xmm2,XMMWORD[32+rsi]
885	movdqu	XMMWORD[112+rdi],xmm7
886	lea	rdi,[128+rdi]
887	movdqu	xmm7,XMMWORD[48+rsi]
888	pxor	xmm6,XMMWORD[32+rsp]
889	pxor	xmm11,xmm10
890	pxor	xmm2,xmm14
891	pxor	xmm7,xmm8
892
893	movdqu	XMMWORD[rdi],xmm6
894	movdqu	xmm6,XMMWORD[64+rsi]
895	movdqu	XMMWORD[16+rdi],xmm11
896	movdqu	xmm11,XMMWORD[80+rsi]
897	movdqu	XMMWORD[32+rdi],xmm2
898	movdqu	xmm2,XMMWORD[96+rsi]
899	movdqu	XMMWORD[48+rdi],xmm7
900	movdqu	xmm7,XMMWORD[112+rsi]
901	lea	rsi,[128+rsi]
902	pxor	xmm6,XMMWORD[48+rsp]
903	pxor	xmm11,xmm15
904	pxor	xmm2,xmm9
905	pxor	xmm7,xmm3
906	movdqu	XMMWORD[64+rdi],xmm6
907	movdqu	XMMWORD[80+rdi],xmm11
908	movdqu	XMMWORD[96+rdi],xmm2
909	movdqu	XMMWORD[112+rdi],xmm7
910	lea	rdi,[128+rdi]
911
912	sub	rdx,64*4
913	jnz	NEAR $L$oop_outer4x
914
915	jmp	NEAR $L$done4x
916
; Fewer than 256 bytes remain. Whole 64-byte blocks are handled with vector
; code in the *_or_more4x branches; the leftover (1..63 bytes) is finished by
; $L$oop_tail4x, which expects the keystream of the partially used block at
; [rsp..rsp+63] and r10 = 0 as byte index.
; The flags of the last cmp executed here stay live into the chosen branch:
; nothing between the cmp and that branch's "je" modifies them.
917$L$tail4x:
918	cmp	rdx,192
919	jae	NEAR $L$192_or_more4x
920	cmp	rdx,128
921	jae	NEAR $L$128_or_more4x
922	cmp	rdx,64
923	jae	NEAR $L$64_or_more4x
924
925
; Less than one block: complete block 0's keystream in the scratch area
; ([rsp] already holds its first 16 bytes).
926	xor	r10,r10
927
928	movdqa	XMMWORD[16+rsp],xmm12
929	movdqa	XMMWORD[32+rsp],xmm4
930	movdqa	XMMWORD[48+rsp],xmm0
931	jmp	NEAR $L$oop_tail4x
932
933ALIGN	32
; 64..127 bytes: emit block 0 whole.
934$L$64_or_more4x:
935	movdqu	xmm6,XMMWORD[rsi]
936	movdqu	xmm11,XMMWORD[16+rsi]
937	movdqu	xmm2,XMMWORD[32+rsi]
938	movdqu	xmm7,XMMWORD[48+rsi]
939	pxor	xmm6,XMMWORD[rsp]
940	pxor	xmm11,xmm12
941	pxor	xmm2,xmm4
942	pxor	xmm7,xmm0
943	movdqu	XMMWORD[rdi],xmm6
944	movdqu	XMMWORD[16+rdi],xmm11
945	movdqu	XMMWORD[32+rdi],xmm2
946	movdqu	XMMWORD[48+rdi],xmm7
; ZF still reflects "cmp rdx,64": exactly 64 bytes means nothing is left.
947	je	NEAR $L$done4x
948
; Stage block 1's keystream at [rsp..rsp+63] for the byte loop.
949	movdqa	xmm6,XMMWORD[16+rsp]
950	lea	rsi,[64+rsi]
951	xor	r10,r10
952	movdqa	XMMWORD[rsp],xmm6
953	movdqa	XMMWORD[16+rsp],xmm13
954	lea	rdi,[64+rdi]
955	movdqa	XMMWORD[32+rsp],xmm5
956	sub	rdx,64
957	movdqa	XMMWORD[48+rsp],xmm1
958	jmp	NEAR $L$oop_tail4x
959
960ALIGN	32
; 128..191 bytes: emit blocks 0 and 1 whole.
961$L$128_or_more4x:
962	movdqu	xmm6,XMMWORD[rsi]
963	movdqu	xmm11,XMMWORD[16+rsi]
964	movdqu	xmm2,XMMWORD[32+rsi]
965	movdqu	xmm7,XMMWORD[48+rsi]
966	pxor	xmm6,XMMWORD[rsp]
967	pxor	xmm11,xmm12
968	pxor	xmm2,xmm4
969	pxor	xmm7,xmm0
970
971	movdqu	XMMWORD[rdi],xmm6
972	movdqu	xmm6,XMMWORD[64+rsi]
973	movdqu	XMMWORD[16+rdi],xmm11
974	movdqu	xmm11,XMMWORD[80+rsi]
975	movdqu	XMMWORD[32+rdi],xmm2
976	movdqu	xmm2,XMMWORD[96+rsi]
977	movdqu	XMMWORD[48+rdi],xmm7
978	movdqu	xmm7,XMMWORD[112+rsi]
979	pxor	xmm6,XMMWORD[16+rsp]
980	pxor	xmm11,xmm13
981	pxor	xmm2,xmm5
982	pxor	xmm7,xmm1
983	movdqu	XMMWORD[64+rdi],xmm6
984	movdqu	XMMWORD[80+rdi],xmm11
985	movdqu	XMMWORD[96+rdi],xmm2
986	movdqu	XMMWORD[112+rdi],xmm7
; ZF still reflects "cmp rdx,128".
987	je	NEAR $L$done4x
988
; Stage block 2's keystream at [rsp..rsp+63] for the byte loop.
989	movdqa	xmm6,XMMWORD[32+rsp]
990	lea	rsi,[128+rsi]
991	xor	r10,r10
992	movdqa	XMMWORD[rsp],xmm6
993	movdqa	XMMWORD[16+rsp],xmm10
994	lea	rdi,[128+rdi]
995	movdqa	XMMWORD[32+rsp],xmm14
996	sub	rdx,128
997	movdqa	XMMWORD[48+rsp],xmm8
998	jmp	NEAR $L$oop_tail4x
999
1000ALIGN	32
; 192..255 bytes: emit blocks 0, 1 and 2 whole.
1001$L$192_or_more4x:
1002	movdqu	xmm6,XMMWORD[rsi]
1003	movdqu	xmm11,XMMWORD[16+rsi]
1004	movdqu	xmm2,XMMWORD[32+rsi]
1005	movdqu	xmm7,XMMWORD[48+rsi]
1006	pxor	xmm6,XMMWORD[rsp]
1007	pxor	xmm11,xmm12
1008	pxor	xmm2,xmm4
1009	pxor	xmm7,xmm0
1010
1011	movdqu	XMMWORD[rdi],xmm6
1012	movdqu	xmm6,XMMWORD[64+rsi]
1013	movdqu	XMMWORD[16+rdi],xmm11
1014	movdqu	xmm11,XMMWORD[80+rsi]
1015	movdqu	XMMWORD[32+rdi],xmm2
1016	movdqu	xmm2,XMMWORD[96+rsi]
1017	movdqu	XMMWORD[48+rdi],xmm7
1018	movdqu	xmm7,XMMWORD[112+rsi]
1019	lea	rsi,[128+rsi]
1020	pxor	xmm6,XMMWORD[16+rsp]
1021	pxor	xmm11,xmm13
1022	pxor	xmm2,xmm5
1023	pxor	xmm7,xmm1
1024
1025	movdqu	XMMWORD[64+rdi],xmm6
1026	movdqu	xmm6,XMMWORD[rsi]
1027	movdqu	XMMWORD[80+rdi],xmm11
1028	movdqu	xmm11,XMMWORD[16+rsi]
1029	movdqu	XMMWORD[96+rdi],xmm2
1030	movdqu	xmm2,XMMWORD[32+rsi]
1031	movdqu	XMMWORD[112+rdi],xmm7
1032	lea	rdi,[128+rdi]
1033	movdqu	xmm7,XMMWORD[48+rsi]
1034	pxor	xmm6,XMMWORD[32+rsp]
1035	pxor	xmm11,xmm10
1036	pxor	xmm2,xmm14
1037	pxor	xmm7,xmm8
1038	movdqu	XMMWORD[rdi],xmm6
1039	movdqu	XMMWORD[16+rdi],xmm11
1040	movdqu	XMMWORD[32+rdi],xmm2
1041	movdqu	XMMWORD[48+rdi],xmm7
; ZF still reflects "cmp rdx,192".
1042	je	NEAR $L$done4x
1043
; Stage block 3's keystream at [rsp..rsp+63]. rsi/rdi were already advanced by
; 128 above, so only 64 more is added here; rdx drops by the full 192.
1044	movdqa	xmm6,XMMWORD[48+rsp]
1045	lea	rsi,[64+rsi]
1046	xor	r10,r10
1047	movdqa	XMMWORD[rsp],xmm6
1048	movdqa	XMMWORD[16+rsp],xmm15
1049	lea	rdi,[64+rdi]
1050	movdqa	XMMWORD[32+rsp],xmm9
1051	sub	rdx,192
1052	movdqa	XMMWORD[48+rsp],xmm3
1053
; Byte loop for the final partial block: r10 = byte index, rdx = bytes left.
1054$L$oop_tail4x:
1055	movzx	eax,BYTE[r10*1+rsi]
1056	movzx	ecx,BYTE[r10*1+rsp]
1057	lea	r10,[1+r10]
1058	xor	eax,ecx
1059	mov	BYTE[((-1))+r10*1+rdi],al
1060	dec	rdx
1061	jnz	NEAR $L$oop_tail4x
1062
; Restore xmm6..xmm15 and the entry rsp from the r9 anchor, then the caller's
; rdi/rsi from the home space. The DB pair encodes "rep ret".
1063$L$done4x:
1064	movaps	xmm6,XMMWORD[((-168))+r9]
1065	movaps	xmm7,XMMWORD[((-152))+r9]
1066	movaps	xmm8,XMMWORD[((-136))+r9]
1067	movaps	xmm9,XMMWORD[((-120))+r9]
1068	movaps	xmm10,XMMWORD[((-104))+r9]
1069	movaps	xmm11,XMMWORD[((-88))+r9]
1070	movaps	xmm12,XMMWORD[((-72))+r9]
1071	movaps	xmm13,XMMWORD[((-56))+r9]
1072	movaps	xmm14,XMMWORD[((-40))+r9]
1073	movaps	xmm15,XMMWORD[((-24))+r9]
1074	lea	rsp,[r9]
1075
1076$L$4x_epilogue:
1077	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1078	mov	rsi,QWORD[16+rsp]
1079	DB	0F3h,0C3h		;repret
1080
1081$L$SEH_end_ChaCha20_4x:
1082
1083ALIGN	32
1084ChaCha20_8x:
1085	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1086	mov	QWORD[16+rsp],rsi
1087	mov	rax,rsp
1088$L$SEH_begin_ChaCha20_8x:
1089	mov	rdi,rcx
1090	mov	rsi,rdx
1091	mov	rdx,r8
1092	mov	rcx,r9
1093	mov	r8,QWORD[40+rsp]
1094
1095
1096$L$ChaCha20_8x:
1097
1098	mov	r9,rsp
1099
1100	sub	rsp,0x280+168
1101	and	rsp,-32
1102	movaps	XMMWORD[(-168)+r9],xmm6
1103	movaps	XMMWORD[(-152)+r9],xmm7
1104	movaps	XMMWORD[(-136)+r9],xmm8
1105	movaps	XMMWORD[(-120)+r9],xmm9
1106	movaps	XMMWORD[(-104)+r9],xmm10
1107	movaps	XMMWORD[(-88)+r9],xmm11
1108	movaps	XMMWORD[(-72)+r9],xmm12
1109	movaps	XMMWORD[(-56)+r9],xmm13
1110	movaps	XMMWORD[(-40)+r9],xmm14
1111	movaps	XMMWORD[(-24)+r9],xmm15
1112$L$8x_body:
1113	vzeroupper
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124	vbroadcasti128	ymm11,XMMWORD[$L$sigma]
1125	vbroadcasti128	ymm3,XMMWORD[rcx]
1126	vbroadcasti128	ymm15,XMMWORD[16+rcx]
1127	vbroadcasti128	ymm7,XMMWORD[r8]
1128	lea	rcx,[256+rsp]
1129	lea	rax,[512+rsp]
1130	lea	r10,[$L$rot16]
1131	lea	r11,[$L$rot24]
1132
1133	vpshufd	ymm8,ymm11,0x00
1134	vpshufd	ymm9,ymm11,0x55
1135	vmovdqa	YMMWORD[(128-256)+rcx],ymm8
1136	vpshufd	ymm10,ymm11,0xaa
1137	vmovdqa	YMMWORD[(160-256)+rcx],ymm9
1138	vpshufd	ymm11,ymm11,0xff
1139	vmovdqa	YMMWORD[(192-256)+rcx],ymm10
1140	vmovdqa	YMMWORD[(224-256)+rcx],ymm11
1141
1142	vpshufd	ymm0,ymm3,0x00
1143	vpshufd	ymm1,ymm3,0x55
1144	vmovdqa	YMMWORD[(256-256)+rcx],ymm0
1145	vpshufd	ymm2,ymm3,0xaa
1146	vmovdqa	YMMWORD[(288-256)+rcx],ymm1
1147	vpshufd	ymm3,ymm3,0xff
1148	vmovdqa	YMMWORD[(320-256)+rcx],ymm2
1149	vmovdqa	YMMWORD[(352-256)+rcx],ymm3
1150
1151	vpshufd	ymm12,ymm15,0x00
1152	vpshufd	ymm13,ymm15,0x55
1153	vmovdqa	YMMWORD[(384-512)+rax],ymm12
1154	vpshufd	ymm14,ymm15,0xaa
1155	vmovdqa	YMMWORD[(416-512)+rax],ymm13
1156	vpshufd	ymm15,ymm15,0xff
1157	vmovdqa	YMMWORD[(448-512)+rax],ymm14
1158	vmovdqa	YMMWORD[(480-512)+rax],ymm15
1159
1160	vpshufd	ymm4,ymm7,0x00
1161	vpshufd	ymm5,ymm7,0x55
1162	vpaddd	ymm4,ymm4,YMMWORD[$L$incy]
1163	vpshufd	ymm6,ymm7,0xaa
1164	vmovdqa	YMMWORD[(544-512)+rax],ymm5
1165	vpshufd	ymm7,ymm7,0xff
1166	vmovdqa	YMMWORD[(576-512)+rax],ymm6
1167	vmovdqa	YMMWORD[(608-512)+rax],ymm7
1168
1169	jmp	NEAR $L$oop_enter8x
1170
1171ALIGN	32
1172$L$oop_outer8x:
1173	vmovdqa	ymm8,YMMWORD[((128-256))+rcx]
1174	vmovdqa	ymm9,YMMWORD[((160-256))+rcx]
1175	vmovdqa	ymm10,YMMWORD[((192-256))+rcx]
1176	vmovdqa	ymm11,YMMWORD[((224-256))+rcx]
1177	vmovdqa	ymm0,YMMWORD[((256-256))+rcx]
1178	vmovdqa	ymm1,YMMWORD[((288-256))+rcx]
1179	vmovdqa	ymm2,YMMWORD[((320-256))+rcx]
1180	vmovdqa	ymm3,YMMWORD[((352-256))+rcx]
1181	vmovdqa	ymm12,YMMWORD[((384-512))+rax]
1182	vmovdqa	ymm13,YMMWORD[((416-512))+rax]
1183	vmovdqa	ymm14,YMMWORD[((448-512))+rax]
1184	vmovdqa	ymm15,YMMWORD[((480-512))+rax]
1185	vmovdqa	ymm4,YMMWORD[((512-512))+rax]
1186	vmovdqa	ymm5,YMMWORD[((544-512))+rax]
1187	vmovdqa	ymm6,YMMWORD[((576-512))+rax]
1188	vmovdqa	ymm7,YMMWORD[((608-512))+rax]
1189	vpaddd	ymm4,ymm4,YMMWORD[$L$eight]
1190
1191$L$oop_enter8x:
1192	vmovdqa	YMMWORD[64+rsp],ymm14
1193	vmovdqa	YMMWORD[96+rsp],ymm15
1194	vbroadcasti128	ymm15,XMMWORD[r10]
1195	vmovdqa	YMMWORD[(512-512)+rax],ymm4
1196	mov	eax,10
1197	jmp	NEAR $L$oop8x
1198
1199ALIGN	32
1200$L$oop8x:
1201	vpaddd	ymm8,ymm8,ymm0
1202	vpxor	ymm4,ymm8,ymm4
1203	vpshufb	ymm4,ymm4,ymm15
1204	vpaddd	ymm9,ymm9,ymm1
1205	vpxor	ymm5,ymm9,ymm5
1206	vpshufb	ymm5,ymm5,ymm15
1207	vpaddd	ymm12,ymm12,ymm4
1208	vpxor	ymm0,ymm12,ymm0
1209	vpslld	ymm14,ymm0,12
1210	vpsrld	ymm0,ymm0,20
1211	vpor	ymm0,ymm14,ymm0
1212	vbroadcasti128	ymm14,XMMWORD[r11]
1213	vpaddd	ymm13,ymm13,ymm5
1214	vpxor	ymm1,ymm13,ymm1
1215	vpslld	ymm15,ymm1,12
1216	vpsrld	ymm1,ymm1,20
1217	vpor	ymm1,ymm15,ymm1
1218	vpaddd	ymm8,ymm8,ymm0
1219	vpxor	ymm4,ymm8,ymm4
1220	vpshufb	ymm4,ymm4,ymm14
1221	vpaddd	ymm9,ymm9,ymm1
1222	vpxor	ymm5,ymm9,ymm5
1223	vpshufb	ymm5,ymm5,ymm14
1224	vpaddd	ymm12,ymm12,ymm4
1225	vpxor	ymm0,ymm12,ymm0
1226	vpslld	ymm15,ymm0,7
1227	vpsrld	ymm0,ymm0,25
1228	vpor	ymm0,ymm15,ymm0
1229	vbroadcasti128	ymm15,XMMWORD[r10]
1230	vpaddd	ymm13,ymm13,ymm5
1231	vpxor	ymm1,ymm13,ymm1
1232	vpslld	ymm14,ymm1,7
1233	vpsrld	ymm1,ymm1,25
1234	vpor	ymm1,ymm14,ymm1
1235	vmovdqa	YMMWORD[rsp],ymm12
1236	vmovdqa	YMMWORD[32+rsp],ymm13
1237	vmovdqa	ymm12,YMMWORD[64+rsp]
1238	vmovdqa	ymm13,YMMWORD[96+rsp]
1239	vpaddd	ymm10,ymm10,ymm2
1240	vpxor	ymm6,ymm10,ymm6
1241	vpshufb	ymm6,ymm6,ymm15
1242	vpaddd	ymm11,ymm11,ymm3
1243	vpxor	ymm7,ymm11,ymm7
1244	vpshufb	ymm7,ymm7,ymm15
1245	vpaddd	ymm12,ymm12,ymm6
1246	vpxor	ymm2,ymm12,ymm2
1247	vpslld	ymm14,ymm2,12
1248	vpsrld	ymm2,ymm2,20
1249	vpor	ymm2,ymm14,ymm2
1250	vbroadcasti128	ymm14,XMMWORD[r11]
1251	vpaddd	ymm13,ymm13,ymm7
1252	vpxor	ymm3,ymm13,ymm3
1253	vpslld	ymm15,ymm3,12
1254	vpsrld	ymm3,ymm3,20
1255	vpor	ymm3,ymm15,ymm3
1256	vpaddd	ymm10,ymm10,ymm2
1257	vpxor	ymm6,ymm10,ymm6
1258	vpshufb	ymm6,ymm6,ymm14
1259	vpaddd	ymm11,ymm11,ymm3
1260	vpxor	ymm7,ymm11,ymm7
1261	vpshufb	ymm7,ymm7,ymm14
1262	vpaddd	ymm12,ymm12,ymm6
1263	vpxor	ymm2,ymm12,ymm2
1264	vpslld	ymm15,ymm2,7
1265	vpsrld	ymm2,ymm2,25
1266	vpor	ymm2,ymm15,ymm2
1267	vbroadcasti128	ymm15,XMMWORD[r10]
1268	vpaddd	ymm13,ymm13,ymm7
1269	vpxor	ymm3,ymm13,ymm3
1270	vpslld	ymm14,ymm3,7
1271	vpsrld	ymm3,ymm3,25
1272	vpor	ymm3,ymm14,ymm3
1273	vpaddd	ymm8,ymm8,ymm1
1274	vpxor	ymm7,ymm8,ymm7
1275	vpshufb	ymm7,ymm7,ymm15
1276	vpaddd	ymm9,ymm9,ymm2
1277	vpxor	ymm4,ymm9,ymm4
1278	vpshufb	ymm4,ymm4,ymm15
1279	vpaddd	ymm12,ymm12,ymm7
1280	vpxor	ymm1,ymm12,ymm1
1281	vpslld	ymm14,ymm1,12
1282	vpsrld	ymm1,ymm1,20
1283	vpor	ymm1,ymm14,ymm1
1284	vbroadcasti128	ymm14,XMMWORD[r11]
1285	vpaddd	ymm13,ymm13,ymm4
1286	vpxor	ymm2,ymm13,ymm2
1287	vpslld	ymm15,ymm2,12
1288	vpsrld	ymm2,ymm2,20
1289	vpor	ymm2,ymm15,ymm2
1290	vpaddd	ymm8,ymm8,ymm1
1291	vpxor	ymm7,ymm8,ymm7
1292	vpshufb	ymm7,ymm7,ymm14
1293	vpaddd	ymm9,ymm9,ymm2
1294	vpxor	ymm4,ymm9,ymm4
1295	vpshufb	ymm4,ymm4,ymm14
1296	vpaddd	ymm12,ymm12,ymm7
1297	vpxor	ymm1,ymm12,ymm1
1298	vpslld	ymm15,ymm1,7
1299	vpsrld	ymm1,ymm1,25
1300	vpor	ymm1,ymm15,ymm1
1301	vbroadcasti128	ymm15,XMMWORD[r10]
1302	vpaddd	ymm13,ymm13,ymm4
1303	vpxor	ymm2,ymm13,ymm2
1304	vpslld	ymm14,ymm2,7
1305	vpsrld	ymm2,ymm2,25
1306	vpor	ymm2,ymm14,ymm2
1307	vmovdqa	YMMWORD[64+rsp],ymm12
1308	vmovdqa	YMMWORD[96+rsp],ymm13
1309	vmovdqa	ymm12,YMMWORD[rsp]
1310	vmovdqa	ymm13,YMMWORD[32+rsp]
1311	vpaddd	ymm10,ymm10,ymm3
1312	vpxor	ymm5,ymm10,ymm5
1313	vpshufb	ymm5,ymm5,ymm15
1314	vpaddd	ymm11,ymm11,ymm0
1315	vpxor	ymm6,ymm11,ymm6
1316	vpshufb	ymm6,ymm6,ymm15
1317	vpaddd	ymm12,ymm12,ymm5
1318	vpxor	ymm3,ymm12,ymm3
1319	vpslld	ymm14,ymm3,12
1320	vpsrld	ymm3,ymm3,20
1321	vpor	ymm3,ymm14,ymm3
1322	vbroadcasti128	ymm14,XMMWORD[r11]
1323	vpaddd	ymm13,ymm13,ymm6
1324	vpxor	ymm0,ymm13,ymm0
1325	vpslld	ymm15,ymm0,12
1326	vpsrld	ymm0,ymm0,20
1327	vpor	ymm0,ymm15,ymm0
1328	vpaddd	ymm10,ymm10,ymm3
1329	vpxor	ymm5,ymm10,ymm5
1330	vpshufb	ymm5,ymm5,ymm14
1331	vpaddd	ymm11,ymm11,ymm0
1332	vpxor	ymm6,ymm11,ymm6
1333	vpshufb	ymm6,ymm6,ymm14
1334	vpaddd	ymm12,ymm12,ymm5
1335	vpxor	ymm3,ymm12,ymm3
1336	vpslld	ymm15,ymm3,7
1337	vpsrld	ymm3,ymm3,25
1338	vpor	ymm3,ymm15,ymm3
1339	vbroadcasti128	ymm15,XMMWORD[r10]
1340	vpaddd	ymm13,ymm13,ymm6
1341	vpxor	ymm0,ymm13,ymm0
1342	vpslld	ymm14,ymm0,7
1343	vpsrld	ymm0,ymm0,25
1344	vpor	ymm0,ymm14,ymm0
1345	dec	eax
1346	jnz	NEAR $L$oop8x
1347
1348	lea	rax,[512+rsp]
1349	vpaddd	ymm8,ymm8,YMMWORD[((128-256))+rcx]
1350	vpaddd	ymm9,ymm9,YMMWORD[((160-256))+rcx]
1351	vpaddd	ymm10,ymm10,YMMWORD[((192-256))+rcx]
1352	vpaddd	ymm11,ymm11,YMMWORD[((224-256))+rcx]
1353
1354	vpunpckldq	ymm14,ymm8,ymm9
1355	vpunpckldq	ymm15,ymm10,ymm11
1356	vpunpckhdq	ymm8,ymm8,ymm9
1357	vpunpckhdq	ymm10,ymm10,ymm11
1358	vpunpcklqdq	ymm9,ymm14,ymm15
1359	vpunpckhqdq	ymm14,ymm14,ymm15
1360	vpunpcklqdq	ymm11,ymm8,ymm10
1361	vpunpckhqdq	ymm8,ymm8,ymm10
1362	vpaddd	ymm0,ymm0,YMMWORD[((256-256))+rcx]
1363	vpaddd	ymm1,ymm1,YMMWORD[((288-256))+rcx]
1364	vpaddd	ymm2,ymm2,YMMWORD[((320-256))+rcx]
1365	vpaddd	ymm3,ymm3,YMMWORD[((352-256))+rcx]
1366
1367	vpunpckldq	ymm10,ymm0,ymm1
1368	vpunpckldq	ymm15,ymm2,ymm3
1369	vpunpckhdq	ymm0,ymm0,ymm1
1370	vpunpckhdq	ymm2,ymm2,ymm3
1371	vpunpcklqdq	ymm1,ymm10,ymm15
1372	vpunpckhqdq	ymm10,ymm10,ymm15
1373	vpunpcklqdq	ymm3,ymm0,ymm2
1374	vpunpckhqdq	ymm0,ymm0,ymm2
1375	vperm2i128	ymm15,ymm9,ymm1,0x20
1376	vperm2i128	ymm1,ymm9,ymm1,0x31
1377	vperm2i128	ymm9,ymm14,ymm10,0x20
1378	vperm2i128	ymm10,ymm14,ymm10,0x31
1379	vperm2i128	ymm14,ymm11,ymm3,0x20
1380	vperm2i128	ymm3,ymm11,ymm3,0x31
1381	vperm2i128	ymm11,ymm8,ymm0,0x20
1382	vperm2i128	ymm0,ymm8,ymm0,0x31
1383	vmovdqa	YMMWORD[rsp],ymm15
1384	vmovdqa	YMMWORD[32+rsp],ymm9
1385	vmovdqa	ymm15,YMMWORD[64+rsp]
1386	vmovdqa	ymm9,YMMWORD[96+rsp]
1387
1388	vpaddd	ymm12,ymm12,YMMWORD[((384-512))+rax]
1389	vpaddd	ymm13,ymm13,YMMWORD[((416-512))+rax]
1390	vpaddd	ymm15,ymm15,YMMWORD[((448-512))+rax]
1391	vpaddd	ymm9,ymm9,YMMWORD[((480-512))+rax]
1392
1393	vpunpckldq	ymm2,ymm12,ymm13
1394	vpunpckldq	ymm8,ymm15,ymm9
1395	vpunpckhdq	ymm12,ymm12,ymm13
1396	vpunpckhdq	ymm15,ymm15,ymm9
1397	vpunpcklqdq	ymm13,ymm2,ymm8
1398	vpunpckhqdq	ymm2,ymm2,ymm8
1399	vpunpcklqdq	ymm9,ymm12,ymm15
1400	vpunpckhqdq	ymm12,ymm12,ymm15
1401	vpaddd	ymm4,ymm4,YMMWORD[((512-512))+rax]
1402	vpaddd	ymm5,ymm5,YMMWORD[((544-512))+rax]
1403	vpaddd	ymm6,ymm6,YMMWORD[((576-512))+rax]
1404	vpaddd	ymm7,ymm7,YMMWORD[((608-512))+rax]
1405
1406	vpunpckldq	ymm15,ymm4,ymm5
1407	vpunpckldq	ymm8,ymm6,ymm7
1408	vpunpckhdq	ymm4,ymm4,ymm5
1409	vpunpckhdq	ymm6,ymm6,ymm7
1410	vpunpcklqdq	ymm5,ymm15,ymm8
1411	vpunpckhqdq	ymm15,ymm15,ymm8
1412	vpunpcklqdq	ymm7,ymm4,ymm6
1413	vpunpckhqdq	ymm4,ymm4,ymm6
1414	vperm2i128	ymm8,ymm13,ymm5,0x20
1415	vperm2i128	ymm5,ymm13,ymm5,0x31
1416	vperm2i128	ymm13,ymm2,ymm15,0x20
1417	vperm2i128	ymm15,ymm2,ymm15,0x31
1418	vperm2i128	ymm2,ymm9,ymm7,0x20
1419	vperm2i128	ymm7,ymm9,ymm7,0x31
1420	vperm2i128	ymm9,ymm12,ymm4,0x20
1421	vperm2i128	ymm4,ymm12,ymm4,0x31
1422	vmovdqa	ymm6,YMMWORD[rsp]
1423	vmovdqa	ymm12,YMMWORD[32+rsp]
1424
1425	cmp	rdx,64*8
1426	jb	NEAR $L$tail8x
1427
1428	vpxor	ymm6,ymm6,YMMWORD[rsi]
1429	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1430	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1431	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1432	lea	rsi,[128+rsi]
1433	vmovdqu	YMMWORD[rdi],ymm6
1434	vmovdqu	YMMWORD[32+rdi],ymm8
1435	vmovdqu	YMMWORD[64+rdi],ymm1
1436	vmovdqu	YMMWORD[96+rdi],ymm5
1437	lea	rdi,[128+rdi]
1438
1439	vpxor	ymm12,ymm12,YMMWORD[rsi]
1440	vpxor	ymm13,ymm13,YMMWORD[32+rsi]
1441	vpxor	ymm10,ymm10,YMMWORD[64+rsi]
1442	vpxor	ymm15,ymm15,YMMWORD[96+rsi]
1443	lea	rsi,[128+rsi]
1444	vmovdqu	YMMWORD[rdi],ymm12
1445	vmovdqu	YMMWORD[32+rdi],ymm13
1446	vmovdqu	YMMWORD[64+rdi],ymm10
1447	vmovdqu	YMMWORD[96+rdi],ymm15
1448	lea	rdi,[128+rdi]
1449
1450	vpxor	ymm14,ymm14,YMMWORD[rsi]
1451	vpxor	ymm2,ymm2,YMMWORD[32+rsi]
1452	vpxor	ymm3,ymm3,YMMWORD[64+rsi]
1453	vpxor	ymm7,ymm7,YMMWORD[96+rsi]
1454	lea	rsi,[128+rsi]
1455	vmovdqu	YMMWORD[rdi],ymm14
1456	vmovdqu	YMMWORD[32+rdi],ymm2
1457	vmovdqu	YMMWORD[64+rdi],ymm3
1458	vmovdqu	YMMWORD[96+rdi],ymm7
1459	lea	rdi,[128+rdi]
1460
1461	vpxor	ymm11,ymm11,YMMWORD[rsi]
1462	vpxor	ymm9,ymm9,YMMWORD[32+rsi]
1463	vpxor	ymm0,ymm0,YMMWORD[64+rsi]
1464	vpxor	ymm4,ymm4,YMMWORD[96+rsi]
1465	lea	rsi,[128+rsi]
1466	vmovdqu	YMMWORD[rdi],ymm11
1467	vmovdqu	YMMWORD[32+rdi],ymm9
1468	vmovdqu	YMMWORD[64+rdi],ymm0
1469	vmovdqu	YMMWORD[96+rdi],ymm4
1470	lea	rdi,[128+rdi]
1471
1472	sub	rdx,64*8
1473	jnz	NEAR $L$oop_outer8x
1474
1475	jmp	NEAR $L$done8x
1476
1477$L$tail8x:
1478	cmp	rdx,448
1479	jae	NEAR $L$448_or_more8x
1480	cmp	rdx,384
1481	jae	NEAR $L$384_or_more8x
1482	cmp	rdx,320
1483	jae	NEAR $L$320_or_more8x
1484	cmp	rdx,256
1485	jae	NEAR $L$256_or_more8x
1486	cmp	rdx,192
1487	jae	NEAR $L$192_or_more8x
1488	cmp	rdx,128
1489	jae	NEAR $L$128_or_more8x
1490	cmp	rdx,64
1491	jae	NEAR $L$64_or_more8x
1492
1493	xor	r10,r10
1494	vmovdqa	YMMWORD[rsp],ymm6
1495	vmovdqa	YMMWORD[32+rsp],ymm8
1496	jmp	NEAR $L$oop_tail8x
1497
1498ALIGN	32
1499$L$64_or_more8x:
1500	vpxor	ymm6,ymm6,YMMWORD[rsi]
1501	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1502	vmovdqu	YMMWORD[rdi],ymm6
1503	vmovdqu	YMMWORD[32+rdi],ymm8
1504	je	NEAR $L$done8x
1505
1506	lea	rsi,[64+rsi]
1507	xor	r10,r10
1508	vmovdqa	YMMWORD[rsp],ymm1
1509	lea	rdi,[64+rdi]
1510	sub	rdx,64
1511	vmovdqa	YMMWORD[32+rsp],ymm5
1512	jmp	NEAR $L$oop_tail8x
1513
1514ALIGN	32
1515$L$128_or_more8x:
1516	vpxor	ymm6,ymm6,YMMWORD[rsi]
1517	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1518	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1519	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1520	vmovdqu	YMMWORD[rdi],ymm6
1521	vmovdqu	YMMWORD[32+rdi],ymm8
1522	vmovdqu	YMMWORD[64+rdi],ymm1
1523	vmovdqu	YMMWORD[96+rdi],ymm5
1524	je	NEAR $L$done8x
1525
1526	lea	rsi,[128+rsi]
1527	xor	r10,r10
1528	vmovdqa	YMMWORD[rsp],ymm12
1529	lea	rdi,[128+rdi]
1530	sub	rdx,128
1531	vmovdqa	YMMWORD[32+rsp],ymm13
1532	jmp	NEAR $L$oop_tail8x
1533
1534ALIGN	32
1535$L$192_or_more8x:
1536	vpxor	ymm6,ymm6,YMMWORD[rsi]
1537	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1538	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1539	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1540	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1541	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1542	vmovdqu	YMMWORD[rdi],ymm6
1543	vmovdqu	YMMWORD[32+rdi],ymm8
1544	vmovdqu	YMMWORD[64+rdi],ymm1
1545	vmovdqu	YMMWORD[96+rdi],ymm5
1546	vmovdqu	YMMWORD[128+rdi],ymm12
1547	vmovdqu	YMMWORD[160+rdi],ymm13
1548	je	NEAR $L$done8x
1549
1550	lea	rsi,[192+rsi]
1551	xor	r10,r10
1552	vmovdqa	YMMWORD[rsp],ymm10
1553	lea	rdi,[192+rdi]
1554	sub	rdx,192
1555	vmovdqa	YMMWORD[32+rsp],ymm15
1556	jmp	NEAR $L$oop_tail8x
1557
1558ALIGN	32
1559$L$256_or_more8x:
1560	vpxor	ymm6,ymm6,YMMWORD[rsi]
1561	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1562	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1563	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1564	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1565	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1566	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1567	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1568	vmovdqu	YMMWORD[rdi],ymm6
1569	vmovdqu	YMMWORD[32+rdi],ymm8
1570	vmovdqu	YMMWORD[64+rdi],ymm1
1571	vmovdqu	YMMWORD[96+rdi],ymm5
1572	vmovdqu	YMMWORD[128+rdi],ymm12
1573	vmovdqu	YMMWORD[160+rdi],ymm13
1574	vmovdqu	YMMWORD[192+rdi],ymm10
1575	vmovdqu	YMMWORD[224+rdi],ymm15
1576	je	NEAR $L$done8x
1577
1578	lea	rsi,[256+rsi]
1579	xor	r10,r10
1580	vmovdqa	YMMWORD[rsp],ymm14
1581	lea	rdi,[256+rdi]
1582	sub	rdx,256
1583	vmovdqa	YMMWORD[32+rsp],ymm2
1584	jmp	NEAR $L$oop_tail8x
1585
1586ALIGN	32
1587$L$320_or_more8x:
1588	vpxor	ymm6,ymm6,YMMWORD[rsi]
1589	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1590	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1591	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1592	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1593	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1594	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1595	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1596	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1597	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1598	vmovdqu	YMMWORD[rdi],ymm6
1599	vmovdqu	YMMWORD[32+rdi],ymm8
1600	vmovdqu	YMMWORD[64+rdi],ymm1
1601	vmovdqu	YMMWORD[96+rdi],ymm5
1602	vmovdqu	YMMWORD[128+rdi],ymm12
1603	vmovdqu	YMMWORD[160+rdi],ymm13
1604	vmovdqu	YMMWORD[192+rdi],ymm10
1605	vmovdqu	YMMWORD[224+rdi],ymm15
1606	vmovdqu	YMMWORD[256+rdi],ymm14
1607	vmovdqu	YMMWORD[288+rdi],ymm2
1608	je	NEAR $L$done8x
1609
1610	lea	rsi,[320+rsi]
1611	xor	r10,r10
1612	vmovdqa	YMMWORD[rsp],ymm3
1613	lea	rdi,[320+rdi]
1614	sub	rdx,320
1615	vmovdqa	YMMWORD[32+rsp],ymm7
1616	jmp	NEAR $L$oop_tail8x
1617
1618ALIGN	32
1619$L$384_or_more8x:
1620	vpxor	ymm6,ymm6,YMMWORD[rsi]
1621	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1622	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1623	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1624	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1625	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1626	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1627	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1628	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1629	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1630	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1631	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1632	vmovdqu	YMMWORD[rdi],ymm6
1633	vmovdqu	YMMWORD[32+rdi],ymm8
1634	vmovdqu	YMMWORD[64+rdi],ymm1
1635	vmovdqu	YMMWORD[96+rdi],ymm5
1636	vmovdqu	YMMWORD[128+rdi],ymm12
1637	vmovdqu	YMMWORD[160+rdi],ymm13
1638	vmovdqu	YMMWORD[192+rdi],ymm10
1639	vmovdqu	YMMWORD[224+rdi],ymm15
1640	vmovdqu	YMMWORD[256+rdi],ymm14
1641	vmovdqu	YMMWORD[288+rdi],ymm2
1642	vmovdqu	YMMWORD[320+rdi],ymm3
1643	vmovdqu	YMMWORD[352+rdi],ymm7
1644	je	NEAR $L$done8x
1645
1646	lea	rsi,[384+rsi]
1647	xor	r10,r10
1648	vmovdqa	YMMWORD[rsp],ymm11
1649	lea	rdi,[384+rdi]
1650	sub	rdx,384
1651	vmovdqa	YMMWORD[32+rsp],ymm9
1652	jmp	NEAR $L$oop_tail8x
1653
1654ALIGN	32
1655$L$448_or_more8x:
1656	vpxor	ymm6,ymm6,YMMWORD[rsi]
1657	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1658	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1659	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1660	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1661	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1662	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1663	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1664	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1665	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1666	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1667	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1668	vpxor	ymm11,ymm11,YMMWORD[384+rsi]
1669	vpxor	ymm9,ymm9,YMMWORD[416+rsi]
1670	vmovdqu	YMMWORD[rdi],ymm6
1671	vmovdqu	YMMWORD[32+rdi],ymm8
1672	vmovdqu	YMMWORD[64+rdi],ymm1
1673	vmovdqu	YMMWORD[96+rdi],ymm5
1674	vmovdqu	YMMWORD[128+rdi],ymm12
1675	vmovdqu	YMMWORD[160+rdi],ymm13
1676	vmovdqu	YMMWORD[192+rdi],ymm10
1677	vmovdqu	YMMWORD[224+rdi],ymm15
1678	vmovdqu	YMMWORD[256+rdi],ymm14
1679	vmovdqu	YMMWORD[288+rdi],ymm2
1680	vmovdqu	YMMWORD[320+rdi],ymm3
1681	vmovdqu	YMMWORD[352+rdi],ymm7
1682	vmovdqu	YMMWORD[384+rdi],ymm11
1683	vmovdqu	YMMWORD[416+rdi],ymm9
1684	je	NEAR $L$done8x
1685
1686	lea	rsi,[448+rsi]
1687	xor	r10,r10
1688	vmovdqa	YMMWORD[rsp],ymm0
1689	lea	rdi,[448+rdi]
1690	sub	rdx,448
1691	vmovdqa	YMMWORD[32+rsp],ymm4
1692
1693$L$oop_tail8x:
1694	movzx	eax,BYTE[r10*1+rsi]
1695	movzx	ecx,BYTE[r10*1+rsp]
1696	lea	r10,[1+r10]
1697	xor	eax,ecx
1698	mov	BYTE[((-1))+r10*1+rdi],al
1699	dec	rdx
1700	jnz	NEAR $L$oop_tail8x
1701
1702$L$done8x:
1703	vzeroall
1704	movaps	xmm6,XMMWORD[((-168))+r9]
1705	movaps	xmm7,XMMWORD[((-152))+r9]
1706	movaps	xmm8,XMMWORD[((-136))+r9]
1707	movaps	xmm9,XMMWORD[((-120))+r9]
1708	movaps	xmm10,XMMWORD[((-104))+r9]
1709	movaps	xmm11,XMMWORD[((-88))+r9]
1710	movaps	xmm12,XMMWORD[((-72))+r9]
1711	movaps	xmm13,XMMWORD[((-56))+r9]
1712	movaps	xmm14,XMMWORD[((-40))+r9]
1713	movaps	xmm15,XMMWORD[((-24))+r9]
1714	lea	rsp,[r9]
1715
1716$L$8x_epilogue:
1717	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1718	mov	rsi,QWORD[16+rsp]
1719	DB	0F3h,0C3h		;repret
1720
1721$L$SEH_end_ChaCha20_8x:
1722EXTERN	__imp_RtlVirtualUnwind
1723
1724ALIGN	16
; se_handler -- Win64 language-specific exception handler named in the unwind
; info of GFp_ChaCha20_ctr32 (the scalar code path).
;
; Register use below follows the Win64 handler convention: r8 points at the
; CONTEXT record and r9 at the DISPATCHER_CONTEXT.  Offsets into r8 are CONTEXT
; fields (120=Rax, 144=Rbx, 152=Rsp, 160=Rbp, 168=Rsi, 176=Rdi, 216..240=
; R12..R15, 248=Rip); offsets into r9 are DISPATCHER_CONTEXT fields
; (0=ControlPc, 8=ImageBase, 16=FunctionEntry, 24=EstablisherFrame,
; 40=ContextRecord, 56=HandlerData).
;
; The handler rewrites the CONTEXT so that it describes the caller's frame,
; copies it into the dispatcher's ContextRecord, calls RtlVirtualUnwind and
; returns 1 in eax (ExceptionContinueSearch).
;
; $L$common_seh_tail is also the join point for ssse3_handler and
; full_handler, which jump into it with rax = the function's entry-time rsp.
1725se_handler:
1726	push	rsi	; save all non-volatile GPRs and flags that the handler touches
1727	push	rdi
1728	push	rbx
1729	push	rbp
1730	push	r12
1731	push	r13
1732	push	r14
1733	push	r15
1734	pushfq
1735	sub	rsp,64	; 32-byte shadow space + four stack-argument slots for the call below
1736
1737	mov	rax,QWORD[120+r8]	; context->Rax (the prologue copies the entry rsp into rax)
1738	mov	rbx,QWORD[248+r8]	; context->Rip = faulting instruction address
1739
1740	mov	rsi,QWORD[8+r9]	; disp->ImageBase (not read again on this handler's path)
1741	mov	r11,QWORD[56+r9]	; disp->HandlerData (not read again on this handler's path)
1742
1743	lea	r10,[$L$ctr32_body]
1744	cmp	rbx,r10	; Rip before $L$ctr32_body: frame not fully built,
1745	jb	NEAR $L$common_seh_tail	; so context->Rax is used as the entry rsp
1746
1747	mov	rax,QWORD[152+r8]	; context->Rsp
1748
1749	lea	r10,[$L$no_data]	; label defined outside this chunk
1750	cmp	rbx,r10	; Rip at or past $L$no_data: context->Rsp is used unchanged
1751	jae	NEAR $L$common_seh_tail	; NOTE(review): presumably the frame is already torn down there -- confirm
1752
1753	lea	rax,[((64+24+48))+rax]	; step over the 64+24 local bytes and the six 8-byte pushes
1754
1755	mov	rbx,QWORD[((-8))+rax]	; reload the six pushed registers (rbx was pushed first,
1756	mov	rbp,QWORD[((-16))+rax]	; so it sits highest on the stack)
1757	mov	r12,QWORD[((-24))+rax]
1758	mov	r13,QWORD[((-32))+rax]
1759	mov	r14,QWORD[((-40))+rax]
1760	mov	r15,QWORD[((-48))+rax]
1761	mov	QWORD[144+r8],rbx	; context->Rbx
1762	mov	QWORD[160+r8],rbp	; context->Rbp
1763	mov	QWORD[216+r8],r12	; context->R12
1764	mov	QWORD[224+r8],r13	; context->R13
1765	mov	QWORD[232+r8],r14	; context->R14
1766	mov	QWORD[240+r8],r15	; context->R15
1767
1768$L$common_seh_tail:
1769	mov	rdi,QWORD[8+rax]	; rdi/rsi were stored at [8+rsp]/[16+rsp] by the WIN64 prologue;
1770	mov	rsi,QWORD[16+rax]	; rax is the entry-time rsp here
1771	mov	QWORD[152+r8],rax	; context->Rsp
1772	mov	QWORD[168+r8],rsi	; context->Rsi
1773	mov	QWORD[176+r8],rdi	; context->Rdi
1774
1775	mov	rdi,QWORD[40+r9]	; destination: disp->ContextRecord
1776	mov	rsi,r8	; source: the CONTEXT just patched
1777	mov	ecx,154	; 154 qwords = 1232 bytes (size of the x64 CONTEXT)
1778	DD	0xa548f3fc	; raw opcode bytes FC F3 48 A5: cld ; rep movsq
1779
1780	mov	rsi,r9	; rsi = DISPATCHER_CONTEXT; r9 is about to become an argument
1781	xor	rcx,rcx	; arg1 HandlerType = 0; also reused as the NULL stored below
1782	mov	rdx,QWORD[8+rsi]	; arg2 = disp->ImageBase
1783	mov	r8,QWORD[rsi]	; arg3 = disp->ControlPc
1784	mov	r9,QWORD[16+rsi]	; arg4 = disp->FunctionEntry
1785	mov	r10,QWORD[40+rsi]	; disp->ContextRecord
1786	lea	r11,[56+rsi]	; &disp->HandlerData
1787	lea	r12,[24+rsi]	; &disp->EstablisherFrame
1788	mov	QWORD[32+rsp],r10	; arg5 = ContextRecord
1789	mov	QWORD[40+rsp],r11	; arg6 = &HandlerData
1790	mov	QWORD[48+rsp],r12	; arg7 = &EstablisherFrame
1791	mov	QWORD[56+rsp],rcx	; arg8 = NULL (ContextPointers)
1792	call	QWORD[__imp_RtlVirtualUnwind]
1793
1794	mov	eax,1	; return value 1 = ExceptionContinueSearch
1795	add	rsp,64	; release the call area, then undo the pushes in reverse order
1796	popfq
1797	pop	r15
1798	pop	r14
1799	pop	r13
1800	pop	r12
1801	pop	rbp
1802	pop	rbx
1803	pop	rdi
1804	pop	rsi
1805	DB	0F3h,0C3h		;repret
1806
1807
1808
1809ALIGN	16
; ssse3_handler -- Win64 language-specific exception handler named in
; $L$SEH_info_ChaCha20_ssse3.  HandlerData (reached via [56+r9]) holds two
; image-relative offsets; the .xdata entry fills them with $L$ssse3_body and
; $L$ssse3_epilogue.
;
;   Rip <  body      : rax = context->Rax is passed on as the entry rsp.
;   Rip >= epilogue  : rax = context->R9  is passed on as the entry rsp.
;   otherwise        : additionally copy 4 qwords (32 bytes) from [rax-40]
;                      into the CONTEXT at offset 512 (the Xmm6 slot, so
;                      Xmm6 and Xmm7 are refreshed).
; NOTE(review): the guarded function is outside this chunk; this presumes it
; keeps its entry-time rsp in r9 and saves xmm6/xmm7 at -40(r9) -- confirm.
;
; All paths finish in $L$common_seh_tail (inside se_handler), which also pops
; the registers pushed here and returns.
1810ssse3_handler:
1811	push	rsi	; same save sequence as se_handler; $L$common_seh_tail undoes it
1812	push	rdi
1813	push	rbx
1814	push	rbp
1815	push	r12
1816	push	r13
1817	push	r14
1818	push	r15
1819	pushfq
1820	sub	rsp,64	; call area consumed by the RtlVirtualUnwind call in the shared tail
1821
1822	mov	rax,QWORD[120+r8]	; context->Rax
1823	mov	rbx,QWORD[248+r8]	; context->Rip
1824
1825	mov	rsi,QWORD[8+r9]	; disp->ImageBase
1826	mov	r11,QWORD[56+r9]	; disp->HandlerData
1827
1828	mov	r10d,DWORD[r11]	; HandlerData[0]: image-relative offset of the body label
1829	lea	r10,[r10*1+rsi]	; convert to an absolute address
1830	cmp	rbx,r10	; faulted before the body label?
1831	jb	NEAR $L$common_seh_tail
1832
1833	mov	rax,QWORD[192+r8]	; context->R9
1834
1835	mov	r10d,DWORD[4+r11]	; HandlerData[1]: image-relative offset of the epilogue label
1836	lea	r10,[r10*1+rsi]	; convert to an absolute address
1837	cmp	rbx,r10	; faulted at or after the epilogue label?
1838	jae	NEAR $L$common_seh_tail
1839
1840	lea	rsi,[((-40))+rax]	; source: 40 bytes below the frame base held in rax
1841	lea	rdi,[512+r8]	; destination: CONTEXT offset 512 (Xmm6)
1842	mov	ecx,4	; 4 qwords = two 16-byte XMM registers
1843	DD	0xa548f3fc	; raw opcode bytes FC F3 48 A5: cld ; rep movsq
1844
1845	jmp	NEAR $L$common_seh_tail
1846
1847
1848
1849ALIGN	16
; full_handler -- Win64 language-specific exception handler named in
; $L$SEH_info_ChaCha20_4x and $L$SEH_info_ChaCha20_8x.  HandlerData (reached
; via [56+r9]) holds the image-relative offsets of the function's body and
; epilogue labels.
;
;   Rip <  body      : rax = context->Rax is passed on as the entry rsp.
;   Rip >= epilogue  : rax = context->R9  is passed on as the entry rsp.
;   otherwise        : additionally copy 20 qwords (160 bytes) from [rax-168]
;                      into the CONTEXT at offset 512, i.e. the ten slots
;                      Xmm6..Xmm15.  This matches the 8x epilogue, which
;                      reloads xmm6..xmm15 from -168(r9)..-24(r9) and then
;                      sets rsp from r9.
;
; All paths finish in $L$common_seh_tail (inside se_handler), which also pops
; the registers pushed here and returns.
1850full_handler:
1851	push	rsi	; same save sequence as se_handler; $L$common_seh_tail undoes it
1852	push	rdi
1853	push	rbx
1854	push	rbp
1855	push	r12
1856	push	r13
1857	push	r14
1858	push	r15
1859	pushfq
1860	sub	rsp,64	; call area consumed by the RtlVirtualUnwind call in the shared tail
1861
1862	mov	rax,QWORD[120+r8]	; context->Rax
1863	mov	rbx,QWORD[248+r8]	; context->Rip
1864
1865	mov	rsi,QWORD[8+r9]	; disp->ImageBase
1866	mov	r11,QWORD[56+r9]	; disp->HandlerData
1867
1868	mov	r10d,DWORD[r11]	; HandlerData[0]: image-relative offset of the body label
1869	lea	r10,[r10*1+rsi]	; convert to an absolute address
1870	cmp	rbx,r10	; faulted before the body label?
1871	jb	NEAR $L$common_seh_tail
1872
1873	mov	rax,QWORD[192+r8]	; context->R9 (frame base kept in r9 by the guarded code)
1874
1875	mov	r10d,DWORD[4+r11]	; HandlerData[1]: image-relative offset of the epilogue label
1876	lea	r10,[r10*1+rsi]	; convert to an absolute address
1877	cmp	rbx,r10	; faulted at or after the epilogue label?
1878	jae	NEAR $L$common_seh_tail
1879
1880	lea	rsi,[((-168))+rax]	; source: the saved xmm6 slot, 168 bytes below the frame base
1881	lea	rdi,[512+r8]	; destination: CONTEXT offset 512 (Xmm6)
1882	mov	ecx,20	; 20 qwords = ten 16-byte XMM registers
1883	DD	0xa548f3fc	; raw opcode bytes FC F3 48 A5: cld ; rep movsq
1884
1885	jmp	NEAR $L$common_seh_tail
1886
1887
; .pdata -- Win64 function table.  Each entry is three image-relative DWORDs:
; start of the covered code range, end of the range, and the address of the
; matching unwind-info record in .xdata.  One entry per code path: the scalar
; GFp_ChaCha20_ctr32, then the SSSE3, 4x and 8x variants.
1888section	.pdata rdata align=4
1889ALIGN	4
1890	DD	$L$SEH_begin_GFp_ChaCha20_ctr32 wrt ..imagebase	; scalar path: begin
1891	DD	$L$SEH_end_GFp_ChaCha20_ctr32 wrt ..imagebase	; scalar path: end
1892	DD	$L$SEH_info_GFp_ChaCha20_ctr32 wrt ..imagebase	; scalar path: unwind info
1893
1894	DD	$L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase	; SSSE3 path: begin
1895	DD	$L$SEH_end_ChaCha20_ssse3 wrt ..imagebase	; SSSE3 path: end
1896	DD	$L$SEH_info_ChaCha20_ssse3 wrt ..imagebase	; SSSE3 path: unwind info
1897
1898	DD	$L$SEH_begin_ChaCha20_4x wrt ..imagebase	; 4x path: begin
1899	DD	$L$SEH_end_ChaCha20_4x wrt ..imagebase	; 4x path: end
1900	DD	$L$SEH_info_ChaCha20_4x wrt ..imagebase	; 4x path: unwind info
1901	DD	$L$SEH_begin_ChaCha20_8x wrt ..imagebase	; 8x path: begin
1902	DD	$L$SEH_end_ChaCha20_8x wrt ..imagebase	; 8x path: end
1903	DD	$L$SEH_info_ChaCha20_8x wrt ..imagebase	; 8x path: unwind info
; .xdata -- unwind-info records pointed to by the .pdata entries.
; Every record starts with the 4-byte header 9,0,0,0: first byte 9 encodes
; version 1 with the exception-handler flag set (flags = 1 in the upper five
; bits); prolog size, unwind-code count and frame register are all 0.  No
; unwind codes are listed, so unwinding is done entirely by the handler whose
; image-relative address follows the header.
; For the SSSE3/4x/8x records two further DWORDs follow as handler data: the
; image-relative addresses of the function's body and epilogue labels, which
; ssse3_handler and full_handler read to classify the faulting Rip.  The
; scalar record carries no handler data; se_handler compares against
; $L$ctr32_body and $L$no_data directly.
1904section	.xdata rdata align=8
1905ALIGN	8
1906$L$SEH_info_GFp_ChaCha20_ctr32:
1907DB	9,0,0,0
1908	DD	se_handler wrt ..imagebase	; handler; no handler data
1909
1910$L$SEH_info_ChaCha20_ssse3:
1911DB	9,0,0,0
1912	DD	ssse3_handler wrt ..imagebase	; handler
1913	DD	$L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase	; HandlerData[0], HandlerData[1]
1914
1915$L$SEH_info_ChaCha20_4x:
1916DB	9,0,0,0
1917	DD	full_handler wrt ..imagebase	; handler
1918	DD	$L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase	; HandlerData[0], HandlerData[1]
1919$L$SEH_info_ChaCha20_8x:
1920DB	9,0,0,0
1921	DD	full_handler wrt ..imagebase	; handler
1922	DD	$L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase	; HandlerData[0], HandlerData[1]
1923