.text



.p2align	6
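# Constant pool used by the code paths below:
#   L$zero/L$one/L$inc/L$four - block-counter seeds and per-iteration increments
#                               for the scalar and 4-lane SSE code
#   L$incy/L$eight            - counter seeds and increment for the 8-lane AVX2 code
#   L$rot16/L$rot24           - pshufb masks rotating each 32-bit lane left by 16
#                               and by 8 bits (the latter named after the equivalent
#                               right-rotate by 24)
#   L$twoy/L$zeroz/L$fourz/L$incz/L$sixteen - counter constants kept for wider code
#                               paths; not referenced by the routines below
#   L$sigma                   - the ChaCha20 constant "expand 32-byte k"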
L$zero:
.long	0,0,0,0
L$one:
.long	1,0,0,0
L$inc:
.long	0,1,2,3
L$four:
.long	4,4,4,4
L$incy:
.long	0,2,4,6,1,3,5,7
L$eight:
.long	8,8,8,8,8,8,8,8
L$rot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
L$rot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
L$twoy:
.long	2,0,0,0, 2,0,0,0
.p2align	6
L$zeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
L$fourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
L$incz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
L$sixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
L$sigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
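# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                     size_t len, const unsigned int key[8],
#                     const unsigned int counter[4]);
#
# %rdi = out, %rsi = inp, %rdx = len, %rcx = key, %r8 = counter (SysV ABI).
# The integer-only implementation follows; it consults OPENSSL_ia32cap_P
# first and branches to L$ChaCha20_ssse3 when the SSSE3 feature bit
# (bit 9 of capability word 1, mask 512) is set.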
.globl	_ChaCha20_ctr32

.p2align	6
_ChaCha20_ctr32:

	cmpq	$0,%rdx
	je	L$no_data
	movq	_OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	L$ChaCha20_ssse3

	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

	subq	$64+24,%rsp

L$ctr32_body:
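# Stack frame (64+24 bytes):
#   0..15(%rsp)   scratch / keystream block for the tail path
#   16..31(%rsp)  key words 0..3
#   32..47(%rsp)  key words 4..7 (also reused as round scratch)
#   48..63(%rsp)  block counter and nonce
#   64..87(%rsp)  saved len (%rbp), inp (%rsi) and out (%rdi) across a block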


	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$one(%rip),%xmm4


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp
	jmp	L$oop_outer

.p2align	5
L$oop_outer:
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
.byte	102,72,15,126,214
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	L$oop

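# One 64-byte block is produced per trip through L$oop_outer.  The 16-word
# state lives in registers as follows:
#   row 0 (sigma)           %eax  %ebx  %ecx  %edx
#   row 1 (key[0..3])       %r8d  %r9d  %r10d %r11d
#   row 2 (key[4..7])       %esi  %edi  plus two words parked at 32/36 or
#                           40/44(%rsp), swapped in and out mid-round
#   row 3 (counter, nonce)  %r12d %r13d %r14d %r15d
# Each iteration of L$oop is one double round: the standard ChaCha
# quarter-round applied to the four columns and then to the four diagonals.
# In C-like pseudocode a quarter-round on (a,b,c,d) is:
#   a += b; d ^= a; d = ROL32(d,16);
#   c += d; b ^= c; b = ROL32(b,12);
#   a += b; d ^= a; d = ROL32(d, 8);
#   c += d; b ^= c; b = ROL32(b, 7);
# %ebp counts the 10 double rounds (20 rounds in total).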
.p2align	5
L$oop:
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	L$oop
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	L$tail

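# At least 64 bytes remain: XOR a full keystream block with the input.
# Rows 0, 1 and 3 were fed forward in general-purpose registers above;
# row 2 (state words 8..11) is fed forward and XORed as one vector in
# %xmm1/%xmm0.  %xmm3 already carries the incremented block counter for
# the next iteration.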
	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	L$oop_outer

	jmp	L$done

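# Fewer than 64 bytes left: write the whole keystream block to the stack
# and XOR it with the input one byte at a time in L$oop_tail.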
.p2align	4
L$tail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

L$oop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	L$oop_tail

L$done:
	leaq	64+24+48(%rsp),%rsi

	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$no_data:
	.byte	0xf3,0xc3



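# SSSE3 code path: one 64-byte block at a time, with each 16-byte state row
# held in a full XMM register.  Entered from ChaCha20_ctr32 when SSSE3 is
# available; it further dispatches to the XOP path (bit 11 of the capability
# word, OpenSSL's AMD XOP flag), to the 128-byte special case, or to the
# 4-blocks-in-parallel path for longer inputs.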
.p2align	5
ChaCha20_ssse3:

L$ChaCha20_ssse3:
	movq	%rsp,%r9

	testl	$2048,%r10d
	jnz	L$ChaCha20_4xop
	cmpq	$128,%rdx
	je	L$ChaCha20_128
	ja	L$ChaCha20_4x

L$do_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	L$sigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$rot16(%rip),%xmm6
	movdqa	L$rot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	L$oop_ssse3

.p2align	5
L$oop_outer_ssse3:
	movdqa	L$one(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	L$oop_ssse3

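# One double round per iteration: pshufb against L$rot16/L$rot24 performs the
# 16- and 8-bit rotations, shift+or performs the 12- and 7-bit ones, and the
# pshufd shuffles ($57/$78/$147) rotate rows 1..3 between the column and
# diagonal halves of the round.  The .byte sequences 102,15,56,0,xx below are
# hand-encoded pshufb instructions.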
.p2align	5
L$oop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	L$oop_ssse3
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	L$tail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	L$oop_outer_ssse3

	jmp	L$done_ssse3

.p2align	4
L$tail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

L$oop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	L$oop_tail_ssse3

L$done_ssse3:
	leaq	(%r9),%rsp

L$ssse3_epilogue:
	.byte	0xf3,0xc3

458
459
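# ChaCha20_128: special-cased path for exactly 128 bytes (two blocks).
# Two copies of the state, differing only in the block counter, are kept in
# registers and iterated side by side, then both blocks are XORed with the
# input in a single pass.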
460.p2align	5
461ChaCha20_128:
462
463L$ChaCha20_128:
464	movq	%rsp,%r9
465
466	subq	$64+8,%rsp
467	movdqa	L$sigma(%rip),%xmm8
468	movdqu	(%rcx),%xmm9
469	movdqu	16(%rcx),%xmm2
470	movdqu	(%r8),%xmm3
471	movdqa	L$one(%rip),%xmm1
472	movdqa	L$rot16(%rip),%xmm6
473	movdqa	L$rot24(%rip),%xmm7
474
475	movdqa	%xmm8,%xmm10
476	movdqa	%xmm8,0(%rsp)
477	movdqa	%xmm9,%xmm11
478	movdqa	%xmm9,16(%rsp)
479	movdqa	%xmm2,%xmm0
480	movdqa	%xmm2,32(%rsp)
481	paddd	%xmm3,%xmm1
482	movdqa	%xmm3,48(%rsp)
483	movq	$10,%r8
484	jmp	L$oop_128
485
486.p2align	5
487L$oop_128:
488	paddd	%xmm9,%xmm8
489	pxor	%xmm8,%xmm3
490	paddd	%xmm11,%xmm10
491	pxor	%xmm10,%xmm1
492.byte	102,15,56,0,222
493.byte	102,15,56,0,206
494	paddd	%xmm3,%xmm2
495	paddd	%xmm1,%xmm0
496	pxor	%xmm2,%xmm9
497	pxor	%xmm0,%xmm11
498	movdqa	%xmm9,%xmm4
499	psrld	$20,%xmm9
500	movdqa	%xmm11,%xmm5
501	pslld	$12,%xmm4
502	psrld	$20,%xmm11
503	por	%xmm4,%xmm9
504	pslld	$12,%xmm5
505	por	%xmm5,%xmm11
506	paddd	%xmm9,%xmm8
507	pxor	%xmm8,%xmm3
508	paddd	%xmm11,%xmm10
509	pxor	%xmm10,%xmm1
510.byte	102,15,56,0,223
511.byte	102,15,56,0,207
512	paddd	%xmm3,%xmm2
513	paddd	%xmm1,%xmm0
514	pxor	%xmm2,%xmm9
515	pxor	%xmm0,%xmm11
516	movdqa	%xmm9,%xmm4
517	psrld	$25,%xmm9
518	movdqa	%xmm11,%xmm5
519	pslld	$7,%xmm4
520	psrld	$25,%xmm11
521	por	%xmm4,%xmm9
522	pslld	$7,%xmm5
523	por	%xmm5,%xmm11
524	pshufd	$78,%xmm2,%xmm2
525	pshufd	$57,%xmm9,%xmm9
526	pshufd	$147,%xmm3,%xmm3
527	pshufd	$78,%xmm0,%xmm0
528	pshufd	$57,%xmm11,%xmm11
529	pshufd	$147,%xmm1,%xmm1
530	paddd	%xmm9,%xmm8
531	pxor	%xmm8,%xmm3
532	paddd	%xmm11,%xmm10
533	pxor	%xmm10,%xmm1
534.byte	102,15,56,0,222
535.byte	102,15,56,0,206
536	paddd	%xmm3,%xmm2
537	paddd	%xmm1,%xmm0
538	pxor	%xmm2,%xmm9
539	pxor	%xmm0,%xmm11
540	movdqa	%xmm9,%xmm4
541	psrld	$20,%xmm9
542	movdqa	%xmm11,%xmm5
543	pslld	$12,%xmm4
544	psrld	$20,%xmm11
545	por	%xmm4,%xmm9
546	pslld	$12,%xmm5
547	por	%xmm5,%xmm11
548	paddd	%xmm9,%xmm8
549	pxor	%xmm8,%xmm3
550	paddd	%xmm11,%xmm10
551	pxor	%xmm10,%xmm1
552.byte	102,15,56,0,223
553.byte	102,15,56,0,207
554	paddd	%xmm3,%xmm2
555	paddd	%xmm1,%xmm0
556	pxor	%xmm2,%xmm9
557	pxor	%xmm0,%xmm11
558	movdqa	%xmm9,%xmm4
559	psrld	$25,%xmm9
560	movdqa	%xmm11,%xmm5
561	pslld	$7,%xmm4
562	psrld	$25,%xmm11
563	por	%xmm4,%xmm9
564	pslld	$7,%xmm5
565	por	%xmm5,%xmm11
566	pshufd	$78,%xmm2,%xmm2
567	pshufd	$147,%xmm9,%xmm9
568	pshufd	$57,%xmm3,%xmm3
569	pshufd	$78,%xmm0,%xmm0
570	pshufd	$147,%xmm11,%xmm11
571	pshufd	$57,%xmm1,%xmm1
572	decq	%r8
573	jnz	L$oop_128
574	paddd	0(%rsp),%xmm8
575	paddd	16(%rsp),%xmm9
576	paddd	32(%rsp),%xmm2
577	paddd	48(%rsp),%xmm3
578	paddd	L$one(%rip),%xmm1
579	paddd	0(%rsp),%xmm10
580	paddd	16(%rsp),%xmm11
581	paddd	32(%rsp),%xmm0
582	paddd	48(%rsp),%xmm1
583
584	movdqu	0(%rsi),%xmm4
585	movdqu	16(%rsi),%xmm5
586	pxor	%xmm4,%xmm8
587	movdqu	32(%rsi),%xmm4
588	pxor	%xmm5,%xmm9
589	movdqu	48(%rsi),%xmm5
590	pxor	%xmm4,%xmm2
591	movdqu	64(%rsi),%xmm4
592	pxor	%xmm5,%xmm3
593	movdqu	80(%rsi),%xmm5
594	pxor	%xmm4,%xmm10
595	movdqu	96(%rsi),%xmm4
596	pxor	%xmm5,%xmm11
597	movdqu	112(%rsi),%xmm5
598	pxor	%xmm4,%xmm0
599	pxor	%xmm5,%xmm1
600
601	movdqu	%xmm8,0(%rdi)
602	movdqu	%xmm9,16(%rdi)
603	movdqu	%xmm2,32(%rdi)
604	movdqu	%xmm3,48(%rdi)
605	movdqu	%xmm10,64(%rdi)
606	movdqu	%xmm11,80(%rdi)
607	movdqu	%xmm0,96(%rdi)
608	movdqu	%xmm1,112(%rdi)
609	leaq	(%r9),%rsp
610
611L$128_epilogue:
612	.byte	0xf3,0xc3
613
614
615
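# ChaCha20_4x: four blocks in parallel.  The state is stored transposed so
# that each XMM register holds the same state word from four consecutive
# blocks; per-lane block counters are seeded from L$inc and advanced by
# L$four per outer iteration.  On entry the upper half of the capability
# word is tested for AVX2 (jump to ChaCha20_8x), and for requests of at
# most 192 bytes on processors reporting MOVBE without XSAVE the plain
# one-block SSSE3 path is preferred.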
616.p2align	5
617ChaCha20_4x:
618
619L$ChaCha20_4x:
620	movq	%rsp,%r9
621
622	movq	%r10,%r11
623	shrq	$32,%r10
624	testq	$32,%r10
625	jnz	L$ChaCha20_8x
626	cmpq	$192,%rdx
627	ja	L$proceed4x
628
629	andq	$71303168,%r11
630	cmpq	$4194304,%r11
631	je	L$do_sse3_after_all
632
633L$proceed4x:
634	subq	$0x140+8,%rsp
635	movdqa	L$sigma(%rip),%xmm11
636	movdqu	(%rcx),%xmm15
637	movdqu	16(%rcx),%xmm7
638	movdqu	(%r8),%xmm3
639	leaq	256(%rsp),%rcx
640	leaq	L$rot16(%rip),%r10
641	leaq	L$rot24(%rip),%r11
642
643	pshufd	$0x00,%xmm11,%xmm8
644	pshufd	$0x55,%xmm11,%xmm9
645	movdqa	%xmm8,64(%rsp)
646	pshufd	$0xaa,%xmm11,%xmm10
647	movdqa	%xmm9,80(%rsp)
648	pshufd	$0xff,%xmm11,%xmm11
649	movdqa	%xmm10,96(%rsp)
650	movdqa	%xmm11,112(%rsp)
651
652	pshufd	$0x00,%xmm15,%xmm12
653	pshufd	$0x55,%xmm15,%xmm13
654	movdqa	%xmm12,128-256(%rcx)
655	pshufd	$0xaa,%xmm15,%xmm14
656	movdqa	%xmm13,144-256(%rcx)
657	pshufd	$0xff,%xmm15,%xmm15
658	movdqa	%xmm14,160-256(%rcx)
659	movdqa	%xmm15,176-256(%rcx)
660
661	pshufd	$0x00,%xmm7,%xmm4
662	pshufd	$0x55,%xmm7,%xmm5
663	movdqa	%xmm4,192-256(%rcx)
664	pshufd	$0xaa,%xmm7,%xmm6
665	movdqa	%xmm5,208-256(%rcx)
666	pshufd	$0xff,%xmm7,%xmm7
667	movdqa	%xmm6,224-256(%rcx)
668	movdqa	%xmm7,240-256(%rcx)
669
670	pshufd	$0x00,%xmm3,%xmm0
671	pshufd	$0x55,%xmm3,%xmm1
672	paddd	L$inc(%rip),%xmm0
673	pshufd	$0xaa,%xmm3,%xmm2
674	movdqa	%xmm1,272-256(%rcx)
675	pshufd	$0xff,%xmm3,%xmm3
676	movdqa	%xmm2,288-256(%rcx)
677	movdqa	%xmm3,304-256(%rcx)
678
679	jmp	L$oop_enter4x
680
681.p2align	5
682L$oop_outer4x:
683	movdqa	64(%rsp),%xmm8
684	movdqa	80(%rsp),%xmm9
685	movdqa	96(%rsp),%xmm10
686	movdqa	112(%rsp),%xmm11
687	movdqa	128-256(%rcx),%xmm12
688	movdqa	144-256(%rcx),%xmm13
689	movdqa	160-256(%rcx),%xmm14
690	movdqa	176-256(%rcx),%xmm15
691	movdqa	192-256(%rcx),%xmm4
692	movdqa	208-256(%rcx),%xmm5
693	movdqa	224-256(%rcx),%xmm6
694	movdqa	240-256(%rcx),%xmm7
695	movdqa	256-256(%rcx),%xmm0
696	movdqa	272-256(%rcx),%xmm1
697	movdqa	288-256(%rcx),%xmm2
698	movdqa	304-256(%rcx),%xmm3
699	paddd	L$four(%rip),%xmm0
700
701L$oop_enter4x:
702	movdqa	%xmm6,32(%rsp)
703	movdqa	%xmm7,48(%rsp)
704	movdqa	(%r10),%xmm7
705	movl	$10,%eax
706	movdqa	%xmm0,256-256(%rcx)
707	jmp	L$oop4x
708
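# Lane-parallel double round.  Each XMM register holds one state word across
# the four blocks, so each quarter-round step is a plain vector op: paddd,
# pxor, then a rotation done either with pshufb (16- and 8-bit cases, masks
# loaded from (%r10)/(%r11), i.e. L$rot16/L$rot24) or with pslld/psrld/por
# (12- and 7-bit cases).  A few of the sixteen word-vectors are rotated
# through scratch slots at 0..63(%rsp), since sixteen XMM registers cannot
# hold the whole state plus temporaries.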
709.p2align	5
710L$oop4x:
711	paddd	%xmm12,%xmm8
712	paddd	%xmm13,%xmm9
713	pxor	%xmm8,%xmm0
714	pxor	%xmm9,%xmm1
715.byte	102,15,56,0,199
716.byte	102,15,56,0,207
717	paddd	%xmm0,%xmm4
718	paddd	%xmm1,%xmm5
719	pxor	%xmm4,%xmm12
720	pxor	%xmm5,%xmm13
721	movdqa	%xmm12,%xmm6
722	pslld	$12,%xmm12
723	psrld	$20,%xmm6
724	movdqa	%xmm13,%xmm7
725	pslld	$12,%xmm13
726	por	%xmm6,%xmm12
727	psrld	$20,%xmm7
728	movdqa	(%r11),%xmm6
729	por	%xmm7,%xmm13
730	paddd	%xmm12,%xmm8
731	paddd	%xmm13,%xmm9
732	pxor	%xmm8,%xmm0
733	pxor	%xmm9,%xmm1
734.byte	102,15,56,0,198
735.byte	102,15,56,0,206
736	paddd	%xmm0,%xmm4
737	paddd	%xmm1,%xmm5
738	pxor	%xmm4,%xmm12
739	pxor	%xmm5,%xmm13
740	movdqa	%xmm12,%xmm7
741	pslld	$7,%xmm12
742	psrld	$25,%xmm7
743	movdqa	%xmm13,%xmm6
744	pslld	$7,%xmm13
745	por	%xmm7,%xmm12
746	psrld	$25,%xmm6
747	movdqa	(%r10),%xmm7
748	por	%xmm6,%xmm13
749	movdqa	%xmm4,0(%rsp)
750	movdqa	%xmm5,16(%rsp)
751	movdqa	32(%rsp),%xmm4
752	movdqa	48(%rsp),%xmm5
753	paddd	%xmm14,%xmm10
754	paddd	%xmm15,%xmm11
755	pxor	%xmm10,%xmm2
756	pxor	%xmm11,%xmm3
757.byte	102,15,56,0,215
758.byte	102,15,56,0,223
759	paddd	%xmm2,%xmm4
760	paddd	%xmm3,%xmm5
761	pxor	%xmm4,%xmm14
762	pxor	%xmm5,%xmm15
763	movdqa	%xmm14,%xmm6
764	pslld	$12,%xmm14
765	psrld	$20,%xmm6
766	movdqa	%xmm15,%xmm7
767	pslld	$12,%xmm15
768	por	%xmm6,%xmm14
769	psrld	$20,%xmm7
770	movdqa	(%r11),%xmm6
771	por	%xmm7,%xmm15
772	paddd	%xmm14,%xmm10
773	paddd	%xmm15,%xmm11
774	pxor	%xmm10,%xmm2
775	pxor	%xmm11,%xmm3
776.byte	102,15,56,0,214
777.byte	102,15,56,0,222
778	paddd	%xmm2,%xmm4
779	paddd	%xmm3,%xmm5
780	pxor	%xmm4,%xmm14
781	pxor	%xmm5,%xmm15
782	movdqa	%xmm14,%xmm7
783	pslld	$7,%xmm14
784	psrld	$25,%xmm7
785	movdqa	%xmm15,%xmm6
786	pslld	$7,%xmm15
787	por	%xmm7,%xmm14
788	psrld	$25,%xmm6
789	movdqa	(%r10),%xmm7
790	por	%xmm6,%xmm15
791	paddd	%xmm13,%xmm8
792	paddd	%xmm14,%xmm9
793	pxor	%xmm8,%xmm3
794	pxor	%xmm9,%xmm0
795.byte	102,15,56,0,223
796.byte	102,15,56,0,199
797	paddd	%xmm3,%xmm4
798	paddd	%xmm0,%xmm5
799	pxor	%xmm4,%xmm13
800	pxor	%xmm5,%xmm14
801	movdqa	%xmm13,%xmm6
802	pslld	$12,%xmm13
803	psrld	$20,%xmm6
804	movdqa	%xmm14,%xmm7
805	pslld	$12,%xmm14
806	por	%xmm6,%xmm13
807	psrld	$20,%xmm7
808	movdqa	(%r11),%xmm6
809	por	%xmm7,%xmm14
810	paddd	%xmm13,%xmm8
811	paddd	%xmm14,%xmm9
812	pxor	%xmm8,%xmm3
813	pxor	%xmm9,%xmm0
814.byte	102,15,56,0,222
815.byte	102,15,56,0,198
816	paddd	%xmm3,%xmm4
817	paddd	%xmm0,%xmm5
818	pxor	%xmm4,%xmm13
819	pxor	%xmm5,%xmm14
820	movdqa	%xmm13,%xmm7
821	pslld	$7,%xmm13
822	psrld	$25,%xmm7
823	movdqa	%xmm14,%xmm6
824	pslld	$7,%xmm14
825	por	%xmm7,%xmm13
826	psrld	$25,%xmm6
827	movdqa	(%r10),%xmm7
828	por	%xmm6,%xmm14
829	movdqa	%xmm4,32(%rsp)
830	movdqa	%xmm5,48(%rsp)
831	movdqa	0(%rsp),%xmm4
832	movdqa	16(%rsp),%xmm5
833	paddd	%xmm15,%xmm10
834	paddd	%xmm12,%xmm11
835	pxor	%xmm10,%xmm1
836	pxor	%xmm11,%xmm2
837.byte	102,15,56,0,207
838.byte	102,15,56,0,215
839	paddd	%xmm1,%xmm4
840	paddd	%xmm2,%xmm5
841	pxor	%xmm4,%xmm15
842	pxor	%xmm5,%xmm12
843	movdqa	%xmm15,%xmm6
844	pslld	$12,%xmm15
845	psrld	$20,%xmm6
846	movdqa	%xmm12,%xmm7
847	pslld	$12,%xmm12
848	por	%xmm6,%xmm15
849	psrld	$20,%xmm7
850	movdqa	(%r11),%xmm6
851	por	%xmm7,%xmm12
852	paddd	%xmm15,%xmm10
853	paddd	%xmm12,%xmm11
854	pxor	%xmm10,%xmm1
855	pxor	%xmm11,%xmm2
856.byte	102,15,56,0,206
857.byte	102,15,56,0,214
858	paddd	%xmm1,%xmm4
859	paddd	%xmm2,%xmm5
860	pxor	%xmm4,%xmm15
861	pxor	%xmm5,%xmm12
862	movdqa	%xmm15,%xmm7
863	pslld	$7,%xmm15
864	psrld	$25,%xmm7
865	movdqa	%xmm12,%xmm6
866	pslld	$7,%xmm12
867	por	%xmm7,%xmm15
868	psrld	$25,%xmm6
869	movdqa	(%r10),%xmm7
870	por	%xmm6,%xmm12
871	decl	%eax
872	jnz	L$oop4x
873
874	paddd	64(%rsp),%xmm8
875	paddd	80(%rsp),%xmm9
876	paddd	96(%rsp),%xmm10
877	paddd	112(%rsp),%xmm11
878
879	movdqa	%xmm8,%xmm6
880	punpckldq	%xmm9,%xmm8
881	movdqa	%xmm10,%xmm7
882	punpckldq	%xmm11,%xmm10
883	punpckhdq	%xmm9,%xmm6
884	punpckhdq	%xmm11,%xmm7
885	movdqa	%xmm8,%xmm9
886	punpcklqdq	%xmm10,%xmm8
887	movdqa	%xmm6,%xmm11
888	punpcklqdq	%xmm7,%xmm6
889	punpckhqdq	%xmm10,%xmm9
890	punpckhqdq	%xmm7,%xmm11
891	paddd	128-256(%rcx),%xmm12
892	paddd	144-256(%rcx),%xmm13
893	paddd	160-256(%rcx),%xmm14
894	paddd	176-256(%rcx),%xmm15
895
896	movdqa	%xmm8,0(%rsp)
897	movdqa	%xmm9,16(%rsp)
898	movdqa	32(%rsp),%xmm8
899	movdqa	48(%rsp),%xmm9
900
901	movdqa	%xmm12,%xmm10
902	punpckldq	%xmm13,%xmm12
903	movdqa	%xmm14,%xmm7
904	punpckldq	%xmm15,%xmm14
905	punpckhdq	%xmm13,%xmm10
906	punpckhdq	%xmm15,%xmm7
907	movdqa	%xmm12,%xmm13
908	punpcklqdq	%xmm14,%xmm12
909	movdqa	%xmm10,%xmm15
910	punpcklqdq	%xmm7,%xmm10
911	punpckhqdq	%xmm14,%xmm13
912	punpckhqdq	%xmm7,%xmm15
913	paddd	192-256(%rcx),%xmm4
914	paddd	208-256(%rcx),%xmm5
915	paddd	224-256(%rcx),%xmm8
916	paddd	240-256(%rcx),%xmm9
917
918	movdqa	%xmm6,32(%rsp)
919	movdqa	%xmm11,48(%rsp)
920
921	movdqa	%xmm4,%xmm14
922	punpckldq	%xmm5,%xmm4
923	movdqa	%xmm8,%xmm7
924	punpckldq	%xmm9,%xmm8
925	punpckhdq	%xmm5,%xmm14
926	punpckhdq	%xmm9,%xmm7
927	movdqa	%xmm4,%xmm5
928	punpcklqdq	%xmm8,%xmm4
929	movdqa	%xmm14,%xmm9
930	punpcklqdq	%xmm7,%xmm14
931	punpckhqdq	%xmm8,%xmm5
932	punpckhqdq	%xmm7,%xmm9
933	paddd	256-256(%rcx),%xmm0
934	paddd	272-256(%rcx),%xmm1
935	paddd	288-256(%rcx),%xmm2
936	paddd	304-256(%rcx),%xmm3
937
938	movdqa	%xmm0,%xmm8
939	punpckldq	%xmm1,%xmm0
940	movdqa	%xmm2,%xmm7
941	punpckldq	%xmm3,%xmm2
942	punpckhdq	%xmm1,%xmm8
943	punpckhdq	%xmm3,%xmm7
944	movdqa	%xmm0,%xmm1
945	punpcklqdq	%xmm2,%xmm0
946	movdqa	%xmm8,%xmm3
947	punpcklqdq	%xmm7,%xmm8
948	punpckhqdq	%xmm2,%xmm1
949	punpckhqdq	%xmm7,%xmm3
950	cmpq	$256,%rdx
951	jb	L$tail4x
952
953	movdqu	0(%rsi),%xmm6
954	movdqu	16(%rsi),%xmm11
955	movdqu	32(%rsi),%xmm2
956	movdqu	48(%rsi),%xmm7
957	pxor	0(%rsp),%xmm6
958	pxor	%xmm12,%xmm11
959	pxor	%xmm4,%xmm2
960	pxor	%xmm0,%xmm7
961
962	movdqu	%xmm6,0(%rdi)
963	movdqu	64(%rsi),%xmm6
964	movdqu	%xmm11,16(%rdi)
965	movdqu	80(%rsi),%xmm11
966	movdqu	%xmm2,32(%rdi)
967	movdqu	96(%rsi),%xmm2
968	movdqu	%xmm7,48(%rdi)
969	movdqu	112(%rsi),%xmm7
970	leaq	128(%rsi),%rsi
971	pxor	16(%rsp),%xmm6
972	pxor	%xmm13,%xmm11
973	pxor	%xmm5,%xmm2
974	pxor	%xmm1,%xmm7
975
976	movdqu	%xmm6,64(%rdi)
977	movdqu	0(%rsi),%xmm6
978	movdqu	%xmm11,80(%rdi)
979	movdqu	16(%rsi),%xmm11
980	movdqu	%xmm2,96(%rdi)
981	movdqu	32(%rsi),%xmm2
982	movdqu	%xmm7,112(%rdi)
983	leaq	128(%rdi),%rdi
984	movdqu	48(%rsi),%xmm7
985	pxor	32(%rsp),%xmm6
986	pxor	%xmm10,%xmm11
987	pxor	%xmm14,%xmm2
988	pxor	%xmm8,%xmm7
989
990	movdqu	%xmm6,0(%rdi)
991	movdqu	64(%rsi),%xmm6
992	movdqu	%xmm11,16(%rdi)
993	movdqu	80(%rsi),%xmm11
994	movdqu	%xmm2,32(%rdi)
995	movdqu	96(%rsi),%xmm2
996	movdqu	%xmm7,48(%rdi)
997	movdqu	112(%rsi),%xmm7
998	leaq	128(%rsi),%rsi
999	pxor	48(%rsp),%xmm6
1000	pxor	%xmm15,%xmm11
1001	pxor	%xmm9,%xmm2
1002	pxor	%xmm3,%xmm7
1003	movdqu	%xmm6,64(%rdi)
1004	movdqu	%xmm11,80(%rdi)
1005	movdqu	%xmm2,96(%rdi)
1006	movdqu	%xmm7,112(%rdi)
1007	leaq	128(%rdi),%rdi
1008
1009	subq	$256,%rdx
1010	jnz	L$oop_outer4x
1011
1012	jmp	L$done4x
1013
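# Fewer than 256 bytes remain.  Handle whole 64-byte blocks first (the
# 64/128/192 branches below), then park the last partial block's keystream
# at 0..63(%rsp) and finish byte by byte in L$oop_tail4x.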
1014L$tail4x:
1015	cmpq	$192,%rdx
1016	jae	L$192_or_more4x
1017	cmpq	$128,%rdx
1018	jae	L$128_or_more4x
1019	cmpq	$64,%rdx
1020	jae	L$64_or_more4x
1021
1022
1023	xorq	%r10,%r10
1024
1025	movdqa	%xmm12,16(%rsp)
1026	movdqa	%xmm4,32(%rsp)
1027	movdqa	%xmm0,48(%rsp)
1028	jmp	L$oop_tail4x
1029
1030.p2align	5
1031L$64_or_more4x:
1032	movdqu	0(%rsi),%xmm6
1033	movdqu	16(%rsi),%xmm11
1034	movdqu	32(%rsi),%xmm2
1035	movdqu	48(%rsi),%xmm7
1036	pxor	0(%rsp),%xmm6
1037	pxor	%xmm12,%xmm11
1038	pxor	%xmm4,%xmm2
1039	pxor	%xmm0,%xmm7
1040	movdqu	%xmm6,0(%rdi)
1041	movdqu	%xmm11,16(%rdi)
1042	movdqu	%xmm2,32(%rdi)
1043	movdqu	%xmm7,48(%rdi)
1044	je	L$done4x
1045
1046	movdqa	16(%rsp),%xmm6
1047	leaq	64(%rsi),%rsi
1048	xorq	%r10,%r10
1049	movdqa	%xmm6,0(%rsp)
1050	movdqa	%xmm13,16(%rsp)
1051	leaq	64(%rdi),%rdi
1052	movdqa	%xmm5,32(%rsp)
1053	subq	$64,%rdx
1054	movdqa	%xmm1,48(%rsp)
1055	jmp	L$oop_tail4x
1056
1057.p2align	5
1058L$128_or_more4x:
1059	movdqu	0(%rsi),%xmm6
1060	movdqu	16(%rsi),%xmm11
1061	movdqu	32(%rsi),%xmm2
1062	movdqu	48(%rsi),%xmm7
1063	pxor	0(%rsp),%xmm6
1064	pxor	%xmm12,%xmm11
1065	pxor	%xmm4,%xmm2
1066	pxor	%xmm0,%xmm7
1067
1068	movdqu	%xmm6,0(%rdi)
1069	movdqu	64(%rsi),%xmm6
1070	movdqu	%xmm11,16(%rdi)
1071	movdqu	80(%rsi),%xmm11
1072	movdqu	%xmm2,32(%rdi)
1073	movdqu	96(%rsi),%xmm2
1074	movdqu	%xmm7,48(%rdi)
1075	movdqu	112(%rsi),%xmm7
1076	pxor	16(%rsp),%xmm6
1077	pxor	%xmm13,%xmm11
1078	pxor	%xmm5,%xmm2
1079	pxor	%xmm1,%xmm7
1080	movdqu	%xmm6,64(%rdi)
1081	movdqu	%xmm11,80(%rdi)
1082	movdqu	%xmm2,96(%rdi)
1083	movdqu	%xmm7,112(%rdi)
1084	je	L$done4x
1085
1086	movdqa	32(%rsp),%xmm6
1087	leaq	128(%rsi),%rsi
1088	xorq	%r10,%r10
1089	movdqa	%xmm6,0(%rsp)
1090	movdqa	%xmm10,16(%rsp)
1091	leaq	128(%rdi),%rdi
1092	movdqa	%xmm14,32(%rsp)
1093	subq	$128,%rdx
1094	movdqa	%xmm8,48(%rsp)
1095	jmp	L$oop_tail4x
1096
1097.p2align	5
1098L$192_or_more4x:
1099	movdqu	0(%rsi),%xmm6
1100	movdqu	16(%rsi),%xmm11
1101	movdqu	32(%rsi),%xmm2
1102	movdqu	48(%rsi),%xmm7
1103	pxor	0(%rsp),%xmm6
1104	pxor	%xmm12,%xmm11
1105	pxor	%xmm4,%xmm2
1106	pxor	%xmm0,%xmm7
1107
1108	movdqu	%xmm6,0(%rdi)
1109	movdqu	64(%rsi),%xmm6
1110	movdqu	%xmm11,16(%rdi)
1111	movdqu	80(%rsi),%xmm11
1112	movdqu	%xmm2,32(%rdi)
1113	movdqu	96(%rsi),%xmm2
1114	movdqu	%xmm7,48(%rdi)
1115	movdqu	112(%rsi),%xmm7
1116	leaq	128(%rsi),%rsi
1117	pxor	16(%rsp),%xmm6
1118	pxor	%xmm13,%xmm11
1119	pxor	%xmm5,%xmm2
1120	pxor	%xmm1,%xmm7
1121
1122	movdqu	%xmm6,64(%rdi)
1123	movdqu	0(%rsi),%xmm6
1124	movdqu	%xmm11,80(%rdi)
1125	movdqu	16(%rsi),%xmm11
1126	movdqu	%xmm2,96(%rdi)
1127	movdqu	32(%rsi),%xmm2
1128	movdqu	%xmm7,112(%rdi)
1129	leaq	128(%rdi),%rdi
1130	movdqu	48(%rsi),%xmm7
1131	pxor	32(%rsp),%xmm6
1132	pxor	%xmm10,%xmm11
1133	pxor	%xmm14,%xmm2
1134	pxor	%xmm8,%xmm7
1135	movdqu	%xmm6,0(%rdi)
1136	movdqu	%xmm11,16(%rdi)
1137	movdqu	%xmm2,32(%rdi)
1138	movdqu	%xmm7,48(%rdi)
1139	je	L$done4x
1140
1141	movdqa	48(%rsp),%xmm6
1142	leaq	64(%rsi),%rsi
1143	xorq	%r10,%r10
1144	movdqa	%xmm6,0(%rsp)
1145	movdqa	%xmm15,16(%rsp)
1146	leaq	64(%rdi),%rdi
1147	movdqa	%xmm9,32(%rsp)
1148	subq	$192,%rdx
1149	movdqa	%xmm3,48(%rsp)
1150
1151L$oop_tail4x:
1152	movzbl	(%rsi,%r10,1),%eax
1153	movzbl	(%rsp,%r10,1),%ecx
1154	leaq	1(%r10),%r10
1155	xorl	%ecx,%eax
1156	movb	%al,-1(%rdi,%r10,1)
1157	decq	%rdx
1158	jnz	L$oop_tail4x
1159
1160L$done4x:
1161	leaq	(%r9),%rsp
1162
1163L$4x_epilogue:
1164	.byte	0xf3,0xc3
1165
1166
1167
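# ChaCha20_4xop: AMD XOP variant of the four-blocks-in-parallel path.  The
# .byte sequences 143,232,120,194,... are hand-encoded vprotd instructions,
# which rotate each 32-bit lane by an immediate amount and so replace the
# shift/shift/or and pshufb rotation idioms used elsewhere.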
1168.p2align	5
1169ChaCha20_4xop:
1170
1171L$ChaCha20_4xop:
1172	movq	%rsp,%r9
1173
1174	subq	$0x140+8,%rsp
1175	vzeroupper
1176
1177	vmovdqa	L$sigma(%rip),%xmm11
1178	vmovdqu	(%rcx),%xmm3
1179	vmovdqu	16(%rcx),%xmm15
1180	vmovdqu	(%r8),%xmm7
1181	leaq	256(%rsp),%rcx
1182
1183	vpshufd	$0x00,%xmm11,%xmm8
1184	vpshufd	$0x55,%xmm11,%xmm9
1185	vmovdqa	%xmm8,64(%rsp)
1186	vpshufd	$0xaa,%xmm11,%xmm10
1187	vmovdqa	%xmm9,80(%rsp)
1188	vpshufd	$0xff,%xmm11,%xmm11
1189	vmovdqa	%xmm10,96(%rsp)
1190	vmovdqa	%xmm11,112(%rsp)
1191
1192	vpshufd	$0x00,%xmm3,%xmm0
1193	vpshufd	$0x55,%xmm3,%xmm1
1194	vmovdqa	%xmm0,128-256(%rcx)
1195	vpshufd	$0xaa,%xmm3,%xmm2
1196	vmovdqa	%xmm1,144-256(%rcx)
1197	vpshufd	$0xff,%xmm3,%xmm3
1198	vmovdqa	%xmm2,160-256(%rcx)
1199	vmovdqa	%xmm3,176-256(%rcx)
1200
1201	vpshufd	$0x00,%xmm15,%xmm12
1202	vpshufd	$0x55,%xmm15,%xmm13
1203	vmovdqa	%xmm12,192-256(%rcx)
1204	vpshufd	$0xaa,%xmm15,%xmm14
1205	vmovdqa	%xmm13,208-256(%rcx)
1206	vpshufd	$0xff,%xmm15,%xmm15
1207	vmovdqa	%xmm14,224-256(%rcx)
1208	vmovdqa	%xmm15,240-256(%rcx)
1209
1210	vpshufd	$0x00,%xmm7,%xmm4
1211	vpshufd	$0x55,%xmm7,%xmm5
1212	vpaddd	L$inc(%rip),%xmm4,%xmm4
1213	vpshufd	$0xaa,%xmm7,%xmm6
1214	vmovdqa	%xmm5,272-256(%rcx)
1215	vpshufd	$0xff,%xmm7,%xmm7
1216	vmovdqa	%xmm6,288-256(%rcx)
1217	vmovdqa	%xmm7,304-256(%rcx)
1218
1219	jmp	L$oop_enter4xop
1220
1221.p2align	5
1222L$oop_outer4xop:
1223	vmovdqa	64(%rsp),%xmm8
1224	vmovdqa	80(%rsp),%xmm9
1225	vmovdqa	96(%rsp),%xmm10
1226	vmovdqa	112(%rsp),%xmm11
1227	vmovdqa	128-256(%rcx),%xmm0
1228	vmovdqa	144-256(%rcx),%xmm1
1229	vmovdqa	160-256(%rcx),%xmm2
1230	vmovdqa	176-256(%rcx),%xmm3
1231	vmovdqa	192-256(%rcx),%xmm12
1232	vmovdqa	208-256(%rcx),%xmm13
1233	vmovdqa	224-256(%rcx),%xmm14
1234	vmovdqa	240-256(%rcx),%xmm15
1235	vmovdqa	256-256(%rcx),%xmm4
1236	vmovdqa	272-256(%rcx),%xmm5
1237	vmovdqa	288-256(%rcx),%xmm6
1238	vmovdqa	304-256(%rcx),%xmm7
1239	vpaddd	L$four(%rip),%xmm4,%xmm4
1240
1241L$oop_enter4xop:
1242	movl	$10,%eax
1243	vmovdqa	%xmm4,256-256(%rcx)
1244	jmp	L$oop4xop
1245
1246.p2align	5
1247L$oop4xop:
1248	vpaddd	%xmm0,%xmm8,%xmm8
1249	vpaddd	%xmm1,%xmm9,%xmm9
1250	vpaddd	%xmm2,%xmm10,%xmm10
1251	vpaddd	%xmm3,%xmm11,%xmm11
1252	vpxor	%xmm4,%xmm8,%xmm4
1253	vpxor	%xmm5,%xmm9,%xmm5
1254	vpxor	%xmm6,%xmm10,%xmm6
1255	vpxor	%xmm7,%xmm11,%xmm7
1256.byte	143,232,120,194,228,16
1257.byte	143,232,120,194,237,16
1258.byte	143,232,120,194,246,16
1259.byte	143,232,120,194,255,16
1260	vpaddd	%xmm4,%xmm12,%xmm12
1261	vpaddd	%xmm5,%xmm13,%xmm13
1262	vpaddd	%xmm6,%xmm14,%xmm14
1263	vpaddd	%xmm7,%xmm15,%xmm15
1264	vpxor	%xmm0,%xmm12,%xmm0
1265	vpxor	%xmm1,%xmm13,%xmm1
1266	vpxor	%xmm14,%xmm2,%xmm2
1267	vpxor	%xmm15,%xmm3,%xmm3
1268.byte	143,232,120,194,192,12
1269.byte	143,232,120,194,201,12
1270.byte	143,232,120,194,210,12
1271.byte	143,232,120,194,219,12
1272	vpaddd	%xmm8,%xmm0,%xmm8
1273	vpaddd	%xmm9,%xmm1,%xmm9
1274	vpaddd	%xmm2,%xmm10,%xmm10
1275	vpaddd	%xmm3,%xmm11,%xmm11
1276	vpxor	%xmm4,%xmm8,%xmm4
1277	vpxor	%xmm5,%xmm9,%xmm5
1278	vpxor	%xmm6,%xmm10,%xmm6
1279	vpxor	%xmm7,%xmm11,%xmm7
1280.byte	143,232,120,194,228,8
1281.byte	143,232,120,194,237,8
1282.byte	143,232,120,194,246,8
1283.byte	143,232,120,194,255,8
1284	vpaddd	%xmm4,%xmm12,%xmm12
1285	vpaddd	%xmm5,%xmm13,%xmm13
1286	vpaddd	%xmm6,%xmm14,%xmm14
1287	vpaddd	%xmm7,%xmm15,%xmm15
1288	vpxor	%xmm0,%xmm12,%xmm0
1289	vpxor	%xmm1,%xmm13,%xmm1
1290	vpxor	%xmm14,%xmm2,%xmm2
1291	vpxor	%xmm15,%xmm3,%xmm3
1292.byte	143,232,120,194,192,7
1293.byte	143,232,120,194,201,7
1294.byte	143,232,120,194,210,7
1295.byte	143,232,120,194,219,7
1296	vpaddd	%xmm1,%xmm8,%xmm8
1297	vpaddd	%xmm2,%xmm9,%xmm9
1298	vpaddd	%xmm3,%xmm10,%xmm10
1299	vpaddd	%xmm0,%xmm11,%xmm11
1300	vpxor	%xmm7,%xmm8,%xmm7
1301	vpxor	%xmm4,%xmm9,%xmm4
1302	vpxor	%xmm5,%xmm10,%xmm5
1303	vpxor	%xmm6,%xmm11,%xmm6
1304.byte	143,232,120,194,255,16
1305.byte	143,232,120,194,228,16
1306.byte	143,232,120,194,237,16
1307.byte	143,232,120,194,246,16
1308	vpaddd	%xmm7,%xmm14,%xmm14
1309	vpaddd	%xmm4,%xmm15,%xmm15
1310	vpaddd	%xmm5,%xmm12,%xmm12
1311	vpaddd	%xmm6,%xmm13,%xmm13
1312	vpxor	%xmm1,%xmm14,%xmm1
1313	vpxor	%xmm2,%xmm15,%xmm2
1314	vpxor	%xmm12,%xmm3,%xmm3
1315	vpxor	%xmm13,%xmm0,%xmm0
1316.byte	143,232,120,194,201,12
1317.byte	143,232,120,194,210,12
1318.byte	143,232,120,194,219,12
1319.byte	143,232,120,194,192,12
1320	vpaddd	%xmm8,%xmm1,%xmm8
1321	vpaddd	%xmm9,%xmm2,%xmm9
1322	vpaddd	%xmm3,%xmm10,%xmm10
1323	vpaddd	%xmm0,%xmm11,%xmm11
1324	vpxor	%xmm7,%xmm8,%xmm7
1325	vpxor	%xmm4,%xmm9,%xmm4
1326	vpxor	%xmm5,%xmm10,%xmm5
1327	vpxor	%xmm6,%xmm11,%xmm6
1328.byte	143,232,120,194,255,8
1329.byte	143,232,120,194,228,8
1330.byte	143,232,120,194,237,8
1331.byte	143,232,120,194,246,8
1332	vpaddd	%xmm7,%xmm14,%xmm14
1333	vpaddd	%xmm4,%xmm15,%xmm15
1334	vpaddd	%xmm5,%xmm12,%xmm12
1335	vpaddd	%xmm6,%xmm13,%xmm13
1336	vpxor	%xmm1,%xmm14,%xmm1
1337	vpxor	%xmm2,%xmm15,%xmm2
1338	vpxor	%xmm12,%xmm3,%xmm3
1339	vpxor	%xmm13,%xmm0,%xmm0
1340.byte	143,232,120,194,201,7
1341.byte	143,232,120,194,210,7
1342.byte	143,232,120,194,219,7
1343.byte	143,232,120,194,192,7
1344	decl	%eax
1345	jnz	L$oop4xop
1346
1347	vpaddd	64(%rsp),%xmm8,%xmm8
1348	vpaddd	80(%rsp),%xmm9,%xmm9
1349	vpaddd	96(%rsp),%xmm10,%xmm10
1350	vpaddd	112(%rsp),%xmm11,%xmm11
1351
1352	vmovdqa	%xmm14,32(%rsp)
1353	vmovdqa	%xmm15,48(%rsp)
1354
1355	vpunpckldq	%xmm9,%xmm8,%xmm14
1356	vpunpckldq	%xmm11,%xmm10,%xmm15
1357	vpunpckhdq	%xmm9,%xmm8,%xmm8
1358	vpunpckhdq	%xmm11,%xmm10,%xmm10
1359	vpunpcklqdq	%xmm15,%xmm14,%xmm9
1360	vpunpckhqdq	%xmm15,%xmm14,%xmm14
1361	vpunpcklqdq	%xmm10,%xmm8,%xmm11
1362	vpunpckhqdq	%xmm10,%xmm8,%xmm8
1363	vpaddd	128-256(%rcx),%xmm0,%xmm0
1364	vpaddd	144-256(%rcx),%xmm1,%xmm1
1365	vpaddd	160-256(%rcx),%xmm2,%xmm2
1366	vpaddd	176-256(%rcx),%xmm3,%xmm3
1367
1368	vmovdqa	%xmm9,0(%rsp)
1369	vmovdqa	%xmm14,16(%rsp)
1370	vmovdqa	32(%rsp),%xmm9
1371	vmovdqa	48(%rsp),%xmm14
1372
1373	vpunpckldq	%xmm1,%xmm0,%xmm10
1374	vpunpckldq	%xmm3,%xmm2,%xmm15
1375	vpunpckhdq	%xmm1,%xmm0,%xmm0
1376	vpunpckhdq	%xmm3,%xmm2,%xmm2
1377	vpunpcklqdq	%xmm15,%xmm10,%xmm1
1378	vpunpckhqdq	%xmm15,%xmm10,%xmm10
1379	vpunpcklqdq	%xmm2,%xmm0,%xmm3
1380	vpunpckhqdq	%xmm2,%xmm0,%xmm0
1381	vpaddd	192-256(%rcx),%xmm12,%xmm12
1382	vpaddd	208-256(%rcx),%xmm13,%xmm13
1383	vpaddd	224-256(%rcx),%xmm9,%xmm9
1384	vpaddd	240-256(%rcx),%xmm14,%xmm14
1385
1386	vpunpckldq	%xmm13,%xmm12,%xmm2
1387	vpunpckldq	%xmm14,%xmm9,%xmm15
1388	vpunpckhdq	%xmm13,%xmm12,%xmm12
1389	vpunpckhdq	%xmm14,%xmm9,%xmm9
1390	vpunpcklqdq	%xmm15,%xmm2,%xmm13
1391	vpunpckhqdq	%xmm15,%xmm2,%xmm2
1392	vpunpcklqdq	%xmm9,%xmm12,%xmm14
1393	vpunpckhqdq	%xmm9,%xmm12,%xmm12
1394	vpaddd	256-256(%rcx),%xmm4,%xmm4
1395	vpaddd	272-256(%rcx),%xmm5,%xmm5
1396	vpaddd	288-256(%rcx),%xmm6,%xmm6
1397	vpaddd	304-256(%rcx),%xmm7,%xmm7
1398
1399	vpunpckldq	%xmm5,%xmm4,%xmm9
1400	vpunpckldq	%xmm7,%xmm6,%xmm15
1401	vpunpckhdq	%xmm5,%xmm4,%xmm4
1402	vpunpckhdq	%xmm7,%xmm6,%xmm6
1403	vpunpcklqdq	%xmm15,%xmm9,%xmm5
1404	vpunpckhqdq	%xmm15,%xmm9,%xmm9
1405	vpunpcklqdq	%xmm6,%xmm4,%xmm7
1406	vpunpckhqdq	%xmm6,%xmm4,%xmm4
1407	vmovdqa	0(%rsp),%xmm6
1408	vmovdqa	16(%rsp),%xmm15
1409
1410	cmpq	$256,%rdx
1411	jb	L$tail4xop
1412
1413	vpxor	0(%rsi),%xmm6,%xmm6
1414	vpxor	16(%rsi),%xmm1,%xmm1
1415	vpxor	32(%rsi),%xmm13,%xmm13
1416	vpxor	48(%rsi),%xmm5,%xmm5
1417	vpxor	64(%rsi),%xmm15,%xmm15
1418	vpxor	80(%rsi),%xmm10,%xmm10
1419	vpxor	96(%rsi),%xmm2,%xmm2
1420	vpxor	112(%rsi),%xmm9,%xmm9
1421	leaq	128(%rsi),%rsi
1422	vpxor	0(%rsi),%xmm11,%xmm11
1423	vpxor	16(%rsi),%xmm3,%xmm3
1424	vpxor	32(%rsi),%xmm14,%xmm14
1425	vpxor	48(%rsi),%xmm7,%xmm7
1426	vpxor	64(%rsi),%xmm8,%xmm8
1427	vpxor	80(%rsi),%xmm0,%xmm0
1428	vpxor	96(%rsi),%xmm12,%xmm12
1429	vpxor	112(%rsi),%xmm4,%xmm4
1430	leaq	128(%rsi),%rsi
1431
1432	vmovdqu	%xmm6,0(%rdi)
1433	vmovdqu	%xmm1,16(%rdi)
1434	vmovdqu	%xmm13,32(%rdi)
1435	vmovdqu	%xmm5,48(%rdi)
1436	vmovdqu	%xmm15,64(%rdi)
1437	vmovdqu	%xmm10,80(%rdi)
1438	vmovdqu	%xmm2,96(%rdi)
1439	vmovdqu	%xmm9,112(%rdi)
1440	leaq	128(%rdi),%rdi
1441	vmovdqu	%xmm11,0(%rdi)
1442	vmovdqu	%xmm3,16(%rdi)
1443	vmovdqu	%xmm14,32(%rdi)
1444	vmovdqu	%xmm7,48(%rdi)
1445	vmovdqu	%xmm8,64(%rdi)
1446	vmovdqu	%xmm0,80(%rdi)
1447	vmovdqu	%xmm12,96(%rdi)
1448	vmovdqu	%xmm4,112(%rdi)
1449	leaq	128(%rdi),%rdi
1450
1451	subq	$256,%rdx
1452	jnz	L$oop_outer4xop
1453
1454	jmp	L$done4xop
1455
1456.p2align	5
1457L$tail4xop:
1458	cmpq	$192,%rdx
1459	jae	L$192_or_more4xop
1460	cmpq	$128,%rdx
1461	jae	L$128_or_more4xop
1462	cmpq	$64,%rdx
1463	jae	L$64_or_more4xop
1464
1465	xorq	%r10,%r10
1466	vmovdqa	%xmm6,0(%rsp)
1467	vmovdqa	%xmm1,16(%rsp)
1468	vmovdqa	%xmm13,32(%rsp)
1469	vmovdqa	%xmm5,48(%rsp)
1470	jmp	L$oop_tail4xop
1471
1472.p2align	5
1473L$64_or_more4xop:
1474	vpxor	0(%rsi),%xmm6,%xmm6
1475	vpxor	16(%rsi),%xmm1,%xmm1
1476	vpxor	32(%rsi),%xmm13,%xmm13
1477	vpxor	48(%rsi),%xmm5,%xmm5
1478	vmovdqu	%xmm6,0(%rdi)
1479	vmovdqu	%xmm1,16(%rdi)
1480	vmovdqu	%xmm13,32(%rdi)
1481	vmovdqu	%xmm5,48(%rdi)
1482	je	L$done4xop
1483
1484	leaq	64(%rsi),%rsi
1485	vmovdqa	%xmm15,0(%rsp)
1486	xorq	%r10,%r10
1487	vmovdqa	%xmm10,16(%rsp)
1488	leaq	64(%rdi),%rdi
1489	vmovdqa	%xmm2,32(%rsp)
1490	subq	$64,%rdx
1491	vmovdqa	%xmm9,48(%rsp)
1492	jmp	L$oop_tail4xop
1493
1494.p2align	5
1495L$128_or_more4xop:
1496	vpxor	0(%rsi),%xmm6,%xmm6
1497	vpxor	16(%rsi),%xmm1,%xmm1
1498	vpxor	32(%rsi),%xmm13,%xmm13
1499	vpxor	48(%rsi),%xmm5,%xmm5
1500	vpxor	64(%rsi),%xmm15,%xmm15
1501	vpxor	80(%rsi),%xmm10,%xmm10
1502	vpxor	96(%rsi),%xmm2,%xmm2
1503	vpxor	112(%rsi),%xmm9,%xmm9
1504
1505	vmovdqu	%xmm6,0(%rdi)
1506	vmovdqu	%xmm1,16(%rdi)
1507	vmovdqu	%xmm13,32(%rdi)
1508	vmovdqu	%xmm5,48(%rdi)
1509	vmovdqu	%xmm15,64(%rdi)
1510	vmovdqu	%xmm10,80(%rdi)
1511	vmovdqu	%xmm2,96(%rdi)
1512	vmovdqu	%xmm9,112(%rdi)
1513	je	L$done4xop
1514
1515	leaq	128(%rsi),%rsi
1516	vmovdqa	%xmm11,0(%rsp)
1517	xorq	%r10,%r10
1518	vmovdqa	%xmm3,16(%rsp)
1519	leaq	128(%rdi),%rdi
1520	vmovdqa	%xmm14,32(%rsp)
1521	subq	$128,%rdx
1522	vmovdqa	%xmm7,48(%rsp)
1523	jmp	L$oop_tail4xop
1524
1525.p2align	5
1526L$192_or_more4xop:
1527	vpxor	0(%rsi),%xmm6,%xmm6
1528	vpxor	16(%rsi),%xmm1,%xmm1
1529	vpxor	32(%rsi),%xmm13,%xmm13
1530	vpxor	48(%rsi),%xmm5,%xmm5
1531	vpxor	64(%rsi),%xmm15,%xmm15
1532	vpxor	80(%rsi),%xmm10,%xmm10
1533	vpxor	96(%rsi),%xmm2,%xmm2
1534	vpxor	112(%rsi),%xmm9,%xmm9
1535	leaq	128(%rsi),%rsi
1536	vpxor	0(%rsi),%xmm11,%xmm11
1537	vpxor	16(%rsi),%xmm3,%xmm3
1538	vpxor	32(%rsi),%xmm14,%xmm14
1539	vpxor	48(%rsi),%xmm7,%xmm7
1540
1541	vmovdqu	%xmm6,0(%rdi)
1542	vmovdqu	%xmm1,16(%rdi)
1543	vmovdqu	%xmm13,32(%rdi)
1544	vmovdqu	%xmm5,48(%rdi)
1545	vmovdqu	%xmm15,64(%rdi)
1546	vmovdqu	%xmm10,80(%rdi)
1547	vmovdqu	%xmm2,96(%rdi)
1548	vmovdqu	%xmm9,112(%rdi)
1549	leaq	128(%rdi),%rdi
1550	vmovdqu	%xmm11,0(%rdi)
1551	vmovdqu	%xmm3,16(%rdi)
1552	vmovdqu	%xmm14,32(%rdi)
1553	vmovdqu	%xmm7,48(%rdi)
1554	je	L$done4xop
1555
1556	leaq	64(%rsi),%rsi
1557	vmovdqa	%xmm8,0(%rsp)
1558	xorq	%r10,%r10
1559	vmovdqa	%xmm0,16(%rsp)
1560	leaq	64(%rdi),%rdi
1561	vmovdqa	%xmm12,32(%rsp)
1562	subq	$192,%rdx
1563	vmovdqa	%xmm4,48(%rsp)
1564
1565L$oop_tail4xop:
1566	movzbl	(%rsi,%r10,1),%eax
1567	movzbl	(%rsp,%r10,1),%ecx
1568	leaq	1(%r10),%r10
1569	xorl	%ecx,%eax
1570	movb	%al,-1(%rdi,%r10,1)
1571	decq	%rdx
1572	jnz	L$oop_tail4xop
1573
1574L$done4xop:
1575	vzeroupper
1576	leaq	(%r9),%rsp
1577
1578L$4xop_epilogue:
1579	.byte	0xf3,0xc3
1580
1581
1582
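# ChaCha20_8x: AVX2 path, eight blocks in parallel.  Each YMM register holds
# one state word across eight blocks; per-lane counters are seeded from
# L$incy and advanced by L$eight per outer iteration.  The rotation masks
# are broadcast from L$rot16/L$rot24 via (%r10)/(%r11), and a 0x280-byte,
# 32-byte-aligned stack area holds the broadcast key/counter material plus
# round-time spills.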
1583.p2align	5
1584ChaCha20_8x:
1585
1586L$ChaCha20_8x:
1587	movq	%rsp,%r9
1588
1589	subq	$0x280+8,%rsp
1590	andq	$-32,%rsp
1591	vzeroupper
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602	vbroadcasti128	L$sigma(%rip),%ymm11
1603	vbroadcasti128	(%rcx),%ymm3
1604	vbroadcasti128	16(%rcx),%ymm15
1605	vbroadcasti128	(%r8),%ymm7
1606	leaq	256(%rsp),%rcx
1607	leaq	512(%rsp),%rax
1608	leaq	L$rot16(%rip),%r10
1609	leaq	L$rot24(%rip),%r11
1610
1611	vpshufd	$0x00,%ymm11,%ymm8
1612	vpshufd	$0x55,%ymm11,%ymm9
1613	vmovdqa	%ymm8,128-256(%rcx)
1614	vpshufd	$0xaa,%ymm11,%ymm10
1615	vmovdqa	%ymm9,160-256(%rcx)
1616	vpshufd	$0xff,%ymm11,%ymm11
1617	vmovdqa	%ymm10,192-256(%rcx)
1618	vmovdqa	%ymm11,224-256(%rcx)
1619
1620	vpshufd	$0x00,%ymm3,%ymm0
1621	vpshufd	$0x55,%ymm3,%ymm1
1622	vmovdqa	%ymm0,256-256(%rcx)
1623	vpshufd	$0xaa,%ymm3,%ymm2
1624	vmovdqa	%ymm1,288-256(%rcx)
1625	vpshufd	$0xff,%ymm3,%ymm3
1626	vmovdqa	%ymm2,320-256(%rcx)
1627	vmovdqa	%ymm3,352-256(%rcx)
1628
1629	vpshufd	$0x00,%ymm15,%ymm12
1630	vpshufd	$0x55,%ymm15,%ymm13
1631	vmovdqa	%ymm12,384-512(%rax)
1632	vpshufd	$0xaa,%ymm15,%ymm14
1633	vmovdqa	%ymm13,416-512(%rax)
1634	vpshufd	$0xff,%ymm15,%ymm15
1635	vmovdqa	%ymm14,448-512(%rax)
1636	vmovdqa	%ymm15,480-512(%rax)
1637
1638	vpshufd	$0x00,%ymm7,%ymm4
1639	vpshufd	$0x55,%ymm7,%ymm5
1640	vpaddd	L$incy(%rip),%ymm4,%ymm4
1641	vpshufd	$0xaa,%ymm7,%ymm6
1642	vmovdqa	%ymm5,544-512(%rax)
1643	vpshufd	$0xff,%ymm7,%ymm7
1644	vmovdqa	%ymm6,576-512(%rax)
1645	vmovdqa	%ymm7,608-512(%rax)
1646
1647	jmp	L$oop_enter8x
1648
1649.p2align	5
1650L$oop_outer8x:
1651	vmovdqa	128-256(%rcx),%ymm8
1652	vmovdqa	160-256(%rcx),%ymm9
1653	vmovdqa	192-256(%rcx),%ymm10
1654	vmovdqa	224-256(%rcx),%ymm11
1655	vmovdqa	256-256(%rcx),%ymm0
1656	vmovdqa	288-256(%rcx),%ymm1
1657	vmovdqa	320-256(%rcx),%ymm2
1658	vmovdqa	352-256(%rcx),%ymm3
1659	vmovdqa	384-512(%rax),%ymm12
1660	vmovdqa	416-512(%rax),%ymm13
1661	vmovdqa	448-512(%rax),%ymm14
1662	vmovdqa	480-512(%rax),%ymm15
1663	vmovdqa	512-512(%rax),%ymm4
1664	vmovdqa	544-512(%rax),%ymm5
1665	vmovdqa	576-512(%rax),%ymm6
1666	vmovdqa	608-512(%rax),%ymm7
1667	vpaddd	L$eight(%rip),%ymm4,%ymm4
1668
1669L$oop_enter8x:
1670	vmovdqa	%ymm14,64(%rsp)
1671	vmovdqa	%ymm15,96(%rsp)
1672	vbroadcasti128	(%r10),%ymm15
1673	vmovdqa	%ymm4,512-512(%rax)
1674	movl	$10,%eax
1675	jmp	L$oop8x
1676
1677.p2align	5
1678L$oop8x:
1679	vpaddd	%ymm0,%ymm8,%ymm8
1680	vpxor	%ymm4,%ymm8,%ymm4
1681	vpshufb	%ymm15,%ymm4,%ymm4
1682	vpaddd	%ymm1,%ymm9,%ymm9
1683	vpxor	%ymm5,%ymm9,%ymm5
1684	vpshufb	%ymm15,%ymm5,%ymm5
1685	vpaddd	%ymm4,%ymm12,%ymm12
1686	vpxor	%ymm0,%ymm12,%ymm0
1687	vpslld	$12,%ymm0,%ymm14
1688	vpsrld	$20,%ymm0,%ymm0
1689	vpor	%ymm0,%ymm14,%ymm0
1690	vbroadcasti128	(%r11),%ymm14
1691	vpaddd	%ymm5,%ymm13,%ymm13
1692	vpxor	%ymm1,%ymm13,%ymm1
1693	vpslld	$12,%ymm1,%ymm15
1694	vpsrld	$20,%ymm1,%ymm1
1695	vpor	%ymm1,%ymm15,%ymm1
1696	vpaddd	%ymm0,%ymm8,%ymm8
1697	vpxor	%ymm4,%ymm8,%ymm4
1698	vpshufb	%ymm14,%ymm4,%ymm4
1699	vpaddd	%ymm1,%ymm9,%ymm9
1700	vpxor	%ymm5,%ymm9,%ymm5
1701	vpshufb	%ymm14,%ymm5,%ymm5
1702	vpaddd	%ymm4,%ymm12,%ymm12
1703	vpxor	%ymm0,%ymm12,%ymm0
1704	vpslld	$7,%ymm0,%ymm15
1705	vpsrld	$25,%ymm0,%ymm0
1706	vpor	%ymm0,%ymm15,%ymm0
1707	vbroadcasti128	(%r10),%ymm15
1708	vpaddd	%ymm5,%ymm13,%ymm13
1709	vpxor	%ymm1,%ymm13,%ymm1
1710	vpslld	$7,%ymm1,%ymm14
1711	vpsrld	$25,%ymm1,%ymm1
1712	vpor	%ymm1,%ymm14,%ymm1
1713	vmovdqa	%ymm12,0(%rsp)
1714	vmovdqa	%ymm13,32(%rsp)
1715	vmovdqa	64(%rsp),%ymm12
1716	vmovdqa	96(%rsp),%ymm13
1717	vpaddd	%ymm2,%ymm10,%ymm10
1718	vpxor	%ymm6,%ymm10,%ymm6
1719	vpshufb	%ymm15,%ymm6,%ymm6
1720	vpaddd	%ymm3,%ymm11,%ymm11
1721	vpxor	%ymm7,%ymm11,%ymm7
1722	vpshufb	%ymm15,%ymm7,%ymm7
1723	vpaddd	%ymm6,%ymm12,%ymm12
1724	vpxor	%ymm2,%ymm12,%ymm2
1725	vpslld	$12,%ymm2,%ymm14
1726	vpsrld	$20,%ymm2,%ymm2
1727	vpor	%ymm2,%ymm14,%ymm2
1728	vbroadcasti128	(%r11),%ymm14
1729	vpaddd	%ymm7,%ymm13,%ymm13
1730	vpxor	%ymm3,%ymm13,%ymm3
1731	vpslld	$12,%ymm3,%ymm15
1732	vpsrld	$20,%ymm3,%ymm3
1733	vpor	%ymm3,%ymm15,%ymm3
1734	vpaddd	%ymm2,%ymm10,%ymm10
1735	vpxor	%ymm6,%ymm10,%ymm6
1736	vpshufb	%ymm14,%ymm6,%ymm6
1737	vpaddd	%ymm3,%ymm11,%ymm11
1738	vpxor	%ymm7,%ymm11,%ymm7
1739	vpshufb	%ymm14,%ymm7,%ymm7
1740	vpaddd	%ymm6,%ymm12,%ymm12
1741	vpxor	%ymm2,%ymm12,%ymm2
1742	vpslld	$7,%ymm2,%ymm15
1743	vpsrld	$25,%ymm2,%ymm2
1744	vpor	%ymm2,%ymm15,%ymm2
1745	vbroadcasti128	(%r10),%ymm15
1746	vpaddd	%ymm7,%ymm13,%ymm13
1747	vpxor	%ymm3,%ymm13,%ymm3
1748	vpslld	$7,%ymm3,%ymm14
1749	vpsrld	$25,%ymm3,%ymm3
1750	vpor	%ymm3,%ymm14,%ymm3
1751	vpaddd	%ymm1,%ymm8,%ymm8
1752	vpxor	%ymm7,%ymm8,%ymm7
1753	vpshufb	%ymm15,%ymm7,%ymm7
1754	vpaddd	%ymm2,%ymm9,%ymm9
1755	vpxor	%ymm4,%ymm9,%ymm4
1756	vpshufb	%ymm15,%ymm4,%ymm4
1757	vpaddd	%ymm7,%ymm12,%ymm12
1758	vpxor	%ymm1,%ymm12,%ymm1
1759	vpslld	$12,%ymm1,%ymm14
1760	vpsrld	$20,%ymm1,%ymm1
1761	vpor	%ymm1,%ymm14,%ymm1
1762	vbroadcasti128	(%r11),%ymm14
1763	vpaddd	%ymm4,%ymm13,%ymm13
1764	vpxor	%ymm2,%ymm13,%ymm2
1765	vpslld	$12,%ymm2,%ymm15
1766	vpsrld	$20,%ymm2,%ymm2
1767	vpor	%ymm2,%ymm15,%ymm2
1768	vpaddd	%ymm1,%ymm8,%ymm8
1769	vpxor	%ymm7,%ymm8,%ymm7
1770	vpshufb	%ymm14,%ymm7,%ymm7
1771	vpaddd	%ymm2,%ymm9,%ymm9
1772	vpxor	%ymm4,%ymm9,%ymm4
1773	vpshufb	%ymm14,%ymm4,%ymm4
1774	vpaddd	%ymm7,%ymm12,%ymm12
1775	vpxor	%ymm1,%ymm12,%ymm1
1776	vpslld	$7,%ymm1,%ymm15
1777	vpsrld	$25,%ymm1,%ymm1
1778	vpor	%ymm1,%ymm15,%ymm1
1779	vbroadcasti128	(%r10),%ymm15
1780	vpaddd	%ymm4,%ymm13,%ymm13
1781	vpxor	%ymm2,%ymm13,%ymm2
1782	vpslld	$7,%ymm2,%ymm14
1783	vpsrld	$25,%ymm2,%ymm2
1784	vpor	%ymm2,%ymm14,%ymm2
1785	vmovdqa	%ymm12,64(%rsp)
1786	vmovdqa	%ymm13,96(%rsp)
1787	vmovdqa	0(%rsp),%ymm12
1788	vmovdqa	32(%rsp),%ymm13
1789	vpaddd	%ymm3,%ymm10,%ymm10
1790	vpxor	%ymm5,%ymm10,%ymm5
1791	vpshufb	%ymm15,%ymm5,%ymm5
1792	vpaddd	%ymm0,%ymm11,%ymm11
1793	vpxor	%ymm6,%ymm11,%ymm6
1794	vpshufb	%ymm15,%ymm6,%ymm6
1795	vpaddd	%ymm5,%ymm12,%ymm12
1796	vpxor	%ymm3,%ymm12,%ymm3
1797	vpslld	$12,%ymm3,%ymm14
1798	vpsrld	$20,%ymm3,%ymm3
1799	vpor	%ymm3,%ymm14,%ymm3
1800	vbroadcasti128	(%r11),%ymm14
1801	vpaddd	%ymm6,%ymm13,%ymm13
1802	vpxor	%ymm0,%ymm13,%ymm0
1803	vpslld	$12,%ymm0,%ymm15
1804	vpsrld	$20,%ymm0,%ymm0
1805	vpor	%ymm0,%ymm15,%ymm0
1806	vpaddd	%ymm3,%ymm10,%ymm10
1807	vpxor	%ymm5,%ymm10,%ymm5
1808	vpshufb	%ymm14,%ymm5,%ymm5
1809	vpaddd	%ymm0,%ymm11,%ymm11
1810	vpxor	%ymm6,%ymm11,%ymm6
1811	vpshufb	%ymm14,%ymm6,%ymm6
1812	vpaddd	%ymm5,%ymm12,%ymm12
1813	vpxor	%ymm3,%ymm12,%ymm3
1814	vpslld	$7,%ymm3,%ymm15
1815	vpsrld	$25,%ymm3,%ymm3
1816	vpor	%ymm3,%ymm15,%ymm3
1817	vbroadcasti128	(%r10),%ymm15
1818	vpaddd	%ymm6,%ymm13,%ymm13
1819	vpxor	%ymm0,%ymm13,%ymm0
1820	vpslld	$7,%ymm0,%ymm14
1821	vpsrld	$25,%ymm0,%ymm0
1822	vpor	%ymm0,%ymm14,%ymm0
1823	decl	%eax
1824	jnz	L$oop8x
1825
1826	leaq	512(%rsp),%rax
1827	vpaddd	128-256(%rcx),%ymm8,%ymm8
1828	vpaddd	160-256(%rcx),%ymm9,%ymm9
1829	vpaddd	192-256(%rcx),%ymm10,%ymm10
1830	vpaddd	224-256(%rcx),%ymm11,%ymm11
1831
1832	vpunpckldq	%ymm9,%ymm8,%ymm14
1833	vpunpckldq	%ymm11,%ymm10,%ymm15
1834	vpunpckhdq	%ymm9,%ymm8,%ymm8
1835	vpunpckhdq	%ymm11,%ymm10,%ymm10
1836	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1837	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1838	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1839	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1840	vpaddd	256-256(%rcx),%ymm0,%ymm0
1841	vpaddd	288-256(%rcx),%ymm1,%ymm1
1842	vpaddd	320-256(%rcx),%ymm2,%ymm2
1843	vpaddd	352-256(%rcx),%ymm3,%ymm3
1844
1845	vpunpckldq	%ymm1,%ymm0,%ymm10
1846	vpunpckldq	%ymm3,%ymm2,%ymm15
1847	vpunpckhdq	%ymm1,%ymm0,%ymm0
1848	vpunpckhdq	%ymm3,%ymm2,%ymm2
1849	vpunpcklqdq	%ymm15,%ymm10,%ymm1
1850	vpunpckhqdq	%ymm15,%ymm10,%ymm10
1851	vpunpcklqdq	%ymm2,%ymm0,%ymm3
1852	vpunpckhqdq	%ymm2,%ymm0,%ymm0
1853	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
1854	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
1855	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
1856	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
1857	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
1858	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
1859	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
1860	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
1861	vmovdqa	%ymm15,0(%rsp)
1862	vmovdqa	%ymm9,32(%rsp)
1863	vmovdqa	64(%rsp),%ymm15
1864	vmovdqa	96(%rsp),%ymm9
1865
1866	vpaddd	384-512(%rax),%ymm12,%ymm12
1867	vpaddd	416-512(%rax),%ymm13,%ymm13
1868	vpaddd	448-512(%rax),%ymm15,%ymm15
1869	vpaddd	480-512(%rax),%ymm9,%ymm9
1870
1871	vpunpckldq	%ymm13,%ymm12,%ymm2
1872	vpunpckldq	%ymm9,%ymm15,%ymm8
1873	vpunpckhdq	%ymm13,%ymm12,%ymm12
1874	vpunpckhdq	%ymm9,%ymm15,%ymm15
1875	vpunpcklqdq	%ymm8,%ymm2,%ymm13
1876	vpunpckhqdq	%ymm8,%ymm2,%ymm2
1877	vpunpcklqdq	%ymm15,%ymm12,%ymm9
1878	vpunpckhqdq	%ymm15,%ymm12,%ymm12
1879	vpaddd	512-512(%rax),%ymm4,%ymm4
1880	vpaddd	544-512(%rax),%ymm5,%ymm5
1881	vpaddd	576-512(%rax),%ymm6,%ymm6
1882	vpaddd	608-512(%rax),%ymm7,%ymm7
1883
1884	vpunpckldq	%ymm5,%ymm4,%ymm15
1885	vpunpckldq	%ymm7,%ymm6,%ymm8
1886	vpunpckhdq	%ymm5,%ymm4,%ymm4
1887	vpunpckhdq	%ymm7,%ymm6,%ymm6
1888	vpunpcklqdq	%ymm8,%ymm15,%ymm5
1889	vpunpckhqdq	%ymm8,%ymm15,%ymm15
1890	vpunpcklqdq	%ymm6,%ymm4,%ymm7
1891	vpunpckhqdq	%ymm6,%ymm4,%ymm4
1892	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
1893	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
1894	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
1895	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
1896	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
1897	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
1898	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
1899	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
1900	vmovdqa	0(%rsp),%ymm6
1901	vmovdqa	32(%rsp),%ymm12
1902
1903	cmpq	$512,%rdx
1904	jb	L$tail8x
1905
1906	vpxor	0(%rsi),%ymm6,%ymm6
1907	vpxor	32(%rsi),%ymm8,%ymm8
1908	vpxor	64(%rsi),%ymm1,%ymm1
1909	vpxor	96(%rsi),%ymm5,%ymm5
1910	leaq	128(%rsi),%rsi
1911	vmovdqu	%ymm6,0(%rdi)
1912	vmovdqu	%ymm8,32(%rdi)
1913	vmovdqu	%ymm1,64(%rdi)
1914	vmovdqu	%ymm5,96(%rdi)
1915	leaq	128(%rdi),%rdi
1916
1917	vpxor	0(%rsi),%ymm12,%ymm12
1918	vpxor	32(%rsi),%ymm13,%ymm13
1919	vpxor	64(%rsi),%ymm10,%ymm10
1920	vpxor	96(%rsi),%ymm15,%ymm15
1921	leaq	128(%rsi),%rsi
1922	vmovdqu	%ymm12,0(%rdi)
1923	vmovdqu	%ymm13,32(%rdi)
1924	vmovdqu	%ymm10,64(%rdi)
1925	vmovdqu	%ymm15,96(%rdi)
1926	leaq	128(%rdi),%rdi
1927
1928	vpxor	0(%rsi),%ymm14,%ymm14
1929	vpxor	32(%rsi),%ymm2,%ymm2
1930	vpxor	64(%rsi),%ymm3,%ymm3
1931	vpxor	96(%rsi),%ymm7,%ymm7
1932	leaq	128(%rsi),%rsi
1933	vmovdqu	%ymm14,0(%rdi)
1934	vmovdqu	%ymm2,32(%rdi)
1935	vmovdqu	%ymm3,64(%rdi)
1936	vmovdqu	%ymm7,96(%rdi)
1937	leaq	128(%rdi),%rdi
1938
1939	vpxor	0(%rsi),%ymm11,%ymm11
1940	vpxor	32(%rsi),%ymm9,%ymm9
1941	vpxor	64(%rsi),%ymm0,%ymm0
1942	vpxor	96(%rsi),%ymm4,%ymm4
1943	leaq	128(%rsi),%rsi
1944	vmovdqu	%ymm11,0(%rdi)
1945	vmovdqu	%ymm9,32(%rdi)
1946	vmovdqu	%ymm0,64(%rdi)
1947	vmovdqu	%ymm4,96(%rdi)
1948	leaq	128(%rdi),%rdi
1949
1950	subq	$512,%rdx
1951	jnz	L$oop_outer8x
1952
1953	jmp	L$done8x
1954
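# Fewer than 512 bytes remain: the branch ladder below XORs and stores as
# many whole 64-byte blocks as are left, then the final partial block's
# keystream is parked at (%rsp) and finished byte by byte in L$oop_tail8x.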
1955L$tail8x:
1956	cmpq	$448,%rdx
1957	jae	L$448_or_more8x
1958	cmpq	$384,%rdx
1959	jae	L$384_or_more8x
1960	cmpq	$320,%rdx
1961	jae	L$320_or_more8x
1962	cmpq	$256,%rdx
1963	jae	L$256_or_more8x
1964	cmpq	$192,%rdx
1965	jae	L$192_or_more8x
1966	cmpq	$128,%rdx
1967	jae	L$128_or_more8x
1968	cmpq	$64,%rdx
1969	jae	L$64_or_more8x
1970
1971	xorq	%r10,%r10
1972	vmovdqa	%ymm6,0(%rsp)
1973	vmovdqa	%ymm8,32(%rsp)
1974	jmp	L$oop_tail8x
1975
1976.p2align	5
1977L$64_or_more8x:
1978	vpxor	0(%rsi),%ymm6,%ymm6
1979	vpxor	32(%rsi),%ymm8,%ymm8
1980	vmovdqu	%ymm6,0(%rdi)
1981	vmovdqu	%ymm8,32(%rdi)
1982	je	L$done8x
1983
1984	leaq	64(%rsi),%rsi
1985	xorq	%r10,%r10
1986	vmovdqa	%ymm1,0(%rsp)
1987	leaq	64(%rdi),%rdi
1988	subq	$64,%rdx
1989	vmovdqa	%ymm5,32(%rsp)
1990	jmp	L$oop_tail8x
1991
1992.p2align	5
1993L$128_or_more8x:
1994	vpxor	0(%rsi),%ymm6,%ymm6
1995	vpxor	32(%rsi),%ymm8,%ymm8
1996	vpxor	64(%rsi),%ymm1,%ymm1
1997	vpxor	96(%rsi),%ymm5,%ymm5
1998	vmovdqu	%ymm6,0(%rdi)
1999	vmovdqu	%ymm8,32(%rdi)
2000	vmovdqu	%ymm1,64(%rdi)
2001	vmovdqu	%ymm5,96(%rdi)
2002	je	L$done8x
2003
2004	leaq	128(%rsi),%rsi
2005	xorq	%r10,%r10
2006	vmovdqa	%ymm12,0(%rsp)
2007	leaq	128(%rdi),%rdi
2008	subq	$128,%rdx
2009	vmovdqa	%ymm13,32(%rsp)
2010	jmp	L$oop_tail8x
2011
2012.p2align	5
2013L$192_or_more8x:
2014	vpxor	0(%rsi),%ymm6,%ymm6
2015	vpxor	32(%rsi),%ymm8,%ymm8
2016	vpxor	64(%rsi),%ymm1,%ymm1
2017	vpxor	96(%rsi),%ymm5,%ymm5
2018	vpxor	128(%rsi),%ymm12,%ymm12
2019	vpxor	160(%rsi),%ymm13,%ymm13
2020	vmovdqu	%ymm6,0(%rdi)
2021	vmovdqu	%ymm8,32(%rdi)
2022	vmovdqu	%ymm1,64(%rdi)
2023	vmovdqu	%ymm5,96(%rdi)
2024	vmovdqu	%ymm12,128(%rdi)
2025	vmovdqu	%ymm13,160(%rdi)
2026	je	L$done8x
2027
2028	leaq	192(%rsi),%rsi
2029	xorq	%r10,%r10
2030	vmovdqa	%ymm10,0(%rsp)
2031	leaq	192(%rdi),%rdi
2032	subq	$192,%rdx
2033	vmovdqa	%ymm15,32(%rsp)
2034	jmp	L$oop_tail8x
2035
2036.p2align	5
2037L$256_or_more8x:
2038	vpxor	0(%rsi),%ymm6,%ymm6
2039	vpxor	32(%rsi),%ymm8,%ymm8
2040	vpxor	64(%rsi),%ymm1,%ymm1
2041	vpxor	96(%rsi),%ymm5,%ymm5
2042	vpxor	128(%rsi),%ymm12,%ymm12
2043	vpxor	160(%rsi),%ymm13,%ymm13
2044	vpxor	192(%rsi),%ymm10,%ymm10
2045	vpxor	224(%rsi),%ymm15,%ymm15
2046	vmovdqu	%ymm6,0(%rdi)
2047	vmovdqu	%ymm8,32(%rdi)
2048	vmovdqu	%ymm1,64(%rdi)
2049	vmovdqu	%ymm5,96(%rdi)
2050	vmovdqu	%ymm12,128(%rdi)
2051	vmovdqu	%ymm13,160(%rdi)
2052	vmovdqu	%ymm10,192(%rdi)
2053	vmovdqu	%ymm15,224(%rdi)
2054	je	L$done8x
2055
2056	leaq	256(%rsi),%rsi
2057	xorq	%r10,%r10
2058	vmovdqa	%ymm14,0(%rsp)
2059	leaq	256(%rdi),%rdi
2060	subq	$256,%rdx
2061	vmovdqa	%ymm2,32(%rsp)
2062	jmp	L$oop_tail8x
2063
2064.p2align	5
2065L$320_or_more8x:
2066	vpxor	0(%rsi),%ymm6,%ymm6
2067	vpxor	32(%rsi),%ymm8,%ymm8
2068	vpxor	64(%rsi),%ymm1,%ymm1
2069	vpxor	96(%rsi),%ymm5,%ymm5
2070	vpxor	128(%rsi),%ymm12,%ymm12
2071	vpxor	160(%rsi),%ymm13,%ymm13
2072	vpxor	192(%rsi),%ymm10,%ymm10
2073	vpxor	224(%rsi),%ymm15,%ymm15
2074	vpxor	256(%rsi),%ymm14,%ymm14
2075	vpxor	288(%rsi),%ymm2,%ymm2
2076	vmovdqu	%ymm6,0(%rdi)
2077	vmovdqu	%ymm8,32(%rdi)
2078	vmovdqu	%ymm1,64(%rdi)
2079	vmovdqu	%ymm5,96(%rdi)
2080	vmovdqu	%ymm12,128(%rdi)
2081	vmovdqu	%ymm13,160(%rdi)
2082	vmovdqu	%ymm10,192(%rdi)
2083	vmovdqu	%ymm15,224(%rdi)
2084	vmovdqu	%ymm14,256(%rdi)
2085	vmovdqu	%ymm2,288(%rdi)
2086	je	L$done8x
2087
2088	leaq	320(%rsi),%rsi
2089	xorq	%r10,%r10
2090	vmovdqa	%ymm3,0(%rsp)
2091	leaq	320(%rdi),%rdi
2092	subq	$320,%rdx
2093	vmovdqa	%ymm7,32(%rsp)
2094	jmp	L$oop_tail8x
2095
2096.p2align	5
2097L$384_or_more8x:
2098	vpxor	0(%rsi),%ymm6,%ymm6
2099	vpxor	32(%rsi),%ymm8,%ymm8
2100	vpxor	64(%rsi),%ymm1,%ymm1
2101	vpxor	96(%rsi),%ymm5,%ymm5
2102	vpxor	128(%rsi),%ymm12,%ymm12
2103	vpxor	160(%rsi),%ymm13,%ymm13
2104	vpxor	192(%rsi),%ymm10,%ymm10
2105	vpxor	224(%rsi),%ymm15,%ymm15
2106	vpxor	256(%rsi),%ymm14,%ymm14
2107	vpxor	288(%rsi),%ymm2,%ymm2
2108	vpxor	320(%rsi),%ymm3,%ymm3
2109	vpxor	352(%rsi),%ymm7,%ymm7
2110	vmovdqu	%ymm6,0(%rdi)
2111	vmovdqu	%ymm8,32(%rdi)
2112	vmovdqu	%ymm1,64(%rdi)
2113	vmovdqu	%ymm5,96(%rdi)
2114	vmovdqu	%ymm12,128(%rdi)
2115	vmovdqu	%ymm13,160(%rdi)
2116	vmovdqu	%ymm10,192(%rdi)
2117	vmovdqu	%ymm15,224(%rdi)
2118	vmovdqu	%ymm14,256(%rdi)
2119	vmovdqu	%ymm2,288(%rdi)
2120	vmovdqu	%ymm3,320(%rdi)
2121	vmovdqu	%ymm7,352(%rdi)
2122	je	L$done8x
2123
2124	leaq	384(%rsi),%rsi
2125	xorq	%r10,%r10
2126	vmovdqa	%ymm11,0(%rsp)
2127	leaq	384(%rdi),%rdi
2128	subq	$384,%rdx
2129	vmovdqa	%ymm9,32(%rsp)
2130	jmp	L$oop_tail8x
2131
2132.p2align	5
2133L$448_or_more8x:
2134	vpxor	0(%rsi),%ymm6,%ymm6
2135	vpxor	32(%rsi),%ymm8,%ymm8
2136	vpxor	64(%rsi),%ymm1,%ymm1
2137	vpxor	96(%rsi),%ymm5,%ymm5
2138	vpxor	128(%rsi),%ymm12,%ymm12
2139	vpxor	160(%rsi),%ymm13,%ymm13
2140	vpxor	192(%rsi),%ymm10,%ymm10
2141	vpxor	224(%rsi),%ymm15,%ymm15
2142	vpxor	256(%rsi),%ymm14,%ymm14
2143	vpxor	288(%rsi),%ymm2,%ymm2
2144	vpxor	320(%rsi),%ymm3,%ymm3
2145	vpxor	352(%rsi),%ymm7,%ymm7
2146	vpxor	384(%rsi),%ymm11,%ymm11
2147	vpxor	416(%rsi),%ymm9,%ymm9
2148	vmovdqu	%ymm6,0(%rdi)
2149	vmovdqu	%ymm8,32(%rdi)
2150	vmovdqu	%ymm1,64(%rdi)
2151	vmovdqu	%ymm5,96(%rdi)
2152	vmovdqu	%ymm12,128(%rdi)
2153	vmovdqu	%ymm13,160(%rdi)
2154	vmovdqu	%ymm10,192(%rdi)
2155	vmovdqu	%ymm15,224(%rdi)
2156	vmovdqu	%ymm14,256(%rdi)
2157	vmovdqu	%ymm2,288(%rdi)
2158	vmovdqu	%ymm3,320(%rdi)
2159	vmovdqu	%ymm7,352(%rdi)
2160	vmovdqu	%ymm11,384(%rdi)
2161	vmovdqu	%ymm9,416(%rdi)
2162	je	L$done8x
2163
2164	leaq	448(%rsi),%rsi
2165	xorq	%r10,%r10
2166	vmovdqa	%ymm0,0(%rsp)
2167	leaq	448(%rdi),%rdi
2168	subq	$448,%rdx
2169	vmovdqa	%ymm4,32(%rsp)
2170
2171L$oop_tail8x:
2172	movzbl	(%rsi,%r10,1),%eax
2173	movzbl	(%rsp,%r10,1),%ecx
2174	leaq	1(%r10),%r10
2175	xorl	%ecx,%eax
2176	movb	%al,-1(%rdi,%r10,1)
2177	decq	%rdx
2178	jnz	L$oop_tail8x
2179
2180L$done8x:
2181	vzeroall
2182	leaq	(%r9),%rsp
2183
2184L$8x_epilogue:
2185	.byte	0xf3,0xc3
2186
2187
2188