1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11.text
12
13
14
# Constant pool for the ChaCha20 routines in this file. The pool starts
# on a 64-byte boundary, so counting sizes from L$zero the tables
# L$incy and L$eight land on 32-byte offsets and L$one, L$inc, L$four,
# L$rot16, L$rot24 and L$sigma on 16-byte offsets. Several of them are
# read with movdqa or as SSE memory operands, which need that alignment.
15.p2align	6
# All-zero vector. Not referenced by the code visible here.
16L$zero:
17.long	0,0,0,0
# Counter step for the one-block paths: adds one to the first dword of
# state row 3.
18L$one:
19.long	1,0,0,0
# Per-lane counter offsets for the 4-block path (lane i gets +i).
20L$inc:
21.long	0,1,2,3
# Counter step for the 4-block path.
22L$four:
23.long	4,4,4,4
# Per-lane counter offsets for the 8-block path.
24L$incy:
25.long	0,2,4,6,1,3,5,7
# Counter step for the 8-block path.
26L$eight:
27.long	8,8,8,8,8,8,8,8
# Byte-shuffle mask: within every dword, result bytes 0..3 come from
# source bytes 2,3,0,1, which is a rotate left by 16 bits.
28L$rot16:
29.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
# Byte-shuffle mask: within every dword, result bytes 0..3 come from
# source bytes 3,0,1,2, which is a rotate left by 8 bits.
30L$rot24:
31.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
# State row 0. The bytes are the ASCII text: expand 32-byte k
# followed by a NUL. As little-endian dwords the first 16 bytes equal
# the immediates 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 used by
# the scalar path.
32L$sigma:
33.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
34.p2align	6
# The four 16-dword tables below are not referenced by the code visible
# here. NOTE(review): presumably used by a wider code path that is not
# part of this build - confirm against the generating Perl script.
35L$zeroz:
36.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
37L$fourz:
38.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
39L$incz:
40.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
41L$sixteen:
42.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
# NUL-terminated ASCII credit string:
# ChaCha20 for x86_64, CRYPTOGAMS by appro at openssl.org
43.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
# _GFp_ChaCha20_ctr32: XOR a buffer with the ChaCha20 keystream.
# This entry point dispatches to the SSSE3 code when the capability bit
# is set and otherwise runs the scalar implementation below.
#
# Register use on entry, as read by the code:
#   rdi = destination buffer (written)
#   rsi = source buffer (read)
#   rdx = byte count; a count of zero returns immediately
#   rcx = pointer to 32 bytes loaded into state rows 1 and 2
#   r8  = pointer to 16 bytes loaded into state row 3; the first dword
#         of that row is incremented by one after every 64-byte block
# NOTE(review): this matches the System V AMD64 argument order for
# (out, in, len, key, counter) - confirm against the C prototype.
#
# Scalar-path stack frame (88 bytes below the six pushed registers):
#   rsp+0  .. rsp+15  unused until the tail, then keystream row 0
#   rsp+16 .. rsp+31  state row 1
#   rsp+32 .. rsp+47  state row 2; also the spill area for the two
#                     row-2 words not currently held in esi and edi
#   rsp+48 .. rsp+63  state row 3 (counter block)
#   rsp+64, +72, +80  saved remaining length, source and destination
# NOTE(review): the movdqa stores need rsp to be 16-byte aligned, which
# holds when rsp is 8 modulo 16 at entry (standard call alignment).
44.globl	_GFp_ChaCha20_ctr32
45.private_extern _GFp_ChaCha20_ctr32
46
47.p2align	6
48_GFp_ChaCha20_ctr32:
49
# Nothing to do for an empty buffer.
50	cmpq	$0,%rdx
51	je	L$no_data
# Load 64 bits starting at _GFp_ia32cap_P+4. Bit 9 of the low dword
# selects the SSSE3 entry; r10 is left live because the 4-block
# dispatcher examines it further.
52	movq	_GFp_ia32cap_P+4(%rip),%r10
53	testl	$512,%r10d
54	jnz	L$ChaCha20_ssse3
55
# Scalar path: save the callee-saved registers that are used as state
# words, then reserve the 88-byte frame.
56	pushq	%rbx
57
58	pushq	%rbp
59
60	pushq	%r12
61
62	pushq	%r13
63
64	pushq	%r14
65
66	pushq	%r15
67
68	subq	$64+24,%rsp
69
70L$ctr32_body:
71
72
# xmm1, xmm2 = state rows 1 and 2, xmm3 = state row 3, xmm4 = the
# per-block counter increment.
73	movdqu	(%rcx),%xmm1
74	movdqu	16(%rcx),%xmm2
75	movdqu	(%r8),%xmm3
76	movdqa	L$one(%rip),%xmm4
77
78
79	movdqa	%xmm1,16(%rsp)
80	movdqa	%xmm2,32(%rsp)
81	movdqa	%xmm3,48(%rsp)
# rbp = bytes remaining (rdx is reused as a state word below).
82	movq	%rdx,%rbp
83	jmp	L$oop_outer
84
# Per-block setup. State word assignment for the rounds:
#   words 0-3   eax, ebx, ecx, edx (row 0 constants)
#   words 4-7   r8d, r9d, r10d, r11d
#   words 8-11  esi and edi hold two at a time, the other two live in
#               the row-2 stack slots
#   words 12-15 r12d, r13d, r14d, r15d
85.p2align	5
86L$oop_outer:
87	movl	$0x61707865,%eax
88	movl	$0x3320646e,%ebx
89	movl	$0x79622d32,%ecx
90	movl	$0x6b206574,%edx
91	movl	16(%rsp),%r8d
92	movl	20(%rsp),%r9d
93	movl	24(%rsp),%r10d
94	movl	28(%rsp),%r11d
# Word 12 comes from xmm3, which already holds the counter for this
# block; the stack copy at rsp+48 is refreshed after the block is done.
95	movd	%xmm3,%r12d
96	movl	52(%rsp),%r13d
97	movl	56(%rsp),%r14d
98	movl	60(%rsp),%r15d
99
# Park length and buffer pointers so rbp, rsi and rdi can be reused.
# ebp becomes the loop counter: 10 iterations of a column pass plus a
# diagonal pass.
100	movq	%rbp,64+0(%rsp)
101	movl	$10,%ebp
102	movq	%rsi,64+8(%rsp)
# Hand-encoded movq %xmm2,%rsi: rsi = low 64 bits of row 2, so esi is
# word 8. The shift below leaves word 9 in edi.
103.byte	102,72,15,126,214
104	movq	%rdi,64+16(%rsp)
105	movq	%rsi,%rdi
106	shrq	$32,%rdi
107	jmp	L$oop
108
# Round loop. Each group below interleaves two quarter rounds
# (add, xor, rotate by 16, 12, 8 and 7).
109.p2align	5
110L$oop:
# Columns: words (0,4,8,12) and (1,5,9,13).
111	addl	%r8d,%eax
112	xorl	%eax,%r12d
113	roll	$16,%r12d
114	addl	%r9d,%ebx
115	xorl	%ebx,%r13d
116	roll	$16,%r13d
117	addl	%r12d,%esi
118	xorl	%esi,%r8d
119	roll	$12,%r8d
120	addl	%r13d,%edi
121	xorl	%edi,%r9d
122	roll	$12,%r9d
123	addl	%r8d,%eax
124	xorl	%eax,%r12d
125	roll	$8,%r12d
126	addl	%r9d,%ebx
127	xorl	%ebx,%r13d
128	roll	$8,%r13d
129	addl	%r12d,%esi
130	xorl	%esi,%r8d
131	roll	$7,%r8d
132	addl	%r13d,%edi
133	xorl	%edi,%r9d
134	roll	$7,%r9d
# Swap the row-2 pair: spill words 8 and 9, bring in words 10 and 11.
135	movl	%esi,32(%rsp)
136	movl	%edi,36(%rsp)
137	movl	40(%rsp),%esi
138	movl	44(%rsp),%edi
# Columns: words (2,6,10,14) and (3,7,11,15).
139	addl	%r10d,%ecx
140	xorl	%ecx,%r14d
141	roll	$16,%r14d
142	addl	%r11d,%edx
143	xorl	%edx,%r15d
144	roll	$16,%r15d
145	addl	%r14d,%esi
146	xorl	%esi,%r10d
147	roll	$12,%r10d
148	addl	%r15d,%edi
149	xorl	%edi,%r11d
150	roll	$12,%r11d
151	addl	%r10d,%ecx
152	xorl	%ecx,%r14d
153	roll	$8,%r14d
154	addl	%r11d,%edx
155	xorl	%edx,%r15d
156	roll	$8,%r15d
157	addl	%r14d,%esi
158	xorl	%esi,%r10d
159	roll	$7,%r10d
160	addl	%r15d,%edi
161	xorl	%edi,%r11d
162	roll	$7,%r11d
# Diagonals: words (0,5,10,15) and (1,6,11,12). esi and edi still hold
# words 10 and 11.
163	addl	%r9d,%eax
164	xorl	%eax,%r15d
165	roll	$16,%r15d
166	addl	%r10d,%ebx
167	xorl	%ebx,%r12d
168	roll	$16,%r12d
169	addl	%r15d,%esi
170	xorl	%esi,%r9d
171	roll	$12,%r9d
172	addl	%r12d,%edi
173	xorl	%edi,%r10d
174	roll	$12,%r10d
175	addl	%r9d,%eax
176	xorl	%eax,%r15d
177	roll	$8,%r15d
178	addl	%r10d,%ebx
179	xorl	%ebx,%r12d
180	roll	$8,%r12d
181	addl	%r15d,%esi
182	xorl	%esi,%r9d
183	roll	$7,%r9d
184	addl	%r12d,%edi
185	xorl	%edi,%r10d
186	roll	$7,%r10d
# Swap the row-2 pair back: spill words 10 and 11, reload words 8 and 9.
187	movl	%esi,40(%rsp)
188	movl	%edi,44(%rsp)
189	movl	32(%rsp),%esi
190	movl	36(%rsp),%edi
# Diagonals: words (2,7,8,13) and (3,4,9,14).
191	addl	%r11d,%ecx
192	xorl	%ecx,%r13d
193	roll	$16,%r13d
194	addl	%r8d,%edx
195	xorl	%edx,%r14d
196	roll	$16,%r14d
197	addl	%r13d,%esi
198	xorl	%esi,%r11d
199	roll	$12,%r11d
200	addl	%r14d,%edi
201	xorl	%edi,%r8d
202	roll	$12,%r8d
203	addl	%r11d,%ecx
204	xorl	%ecx,%r13d
205	roll	$8,%r13d
206	addl	%r8d,%edx
207	xorl	%edx,%r14d
208	roll	$8,%r14d
209	addl	%r13d,%esi
210	xorl	%esi,%r11d
211	roll	$7,%r11d
212	addl	%r14d,%edi
213	xorl	%edi,%r8d
214	roll	$7,%r8d
215	decl	%ebp
216	jnz	L$oop
# Rounds done. Flush words 8 and 9 so rsp+32 holds the whole worked
# row 2, then restore length and pointers. xmm1 receives the original
# row 2 (still in xmm2) and xmm3 is advanced to the next block counter.
217	movl	%edi,36(%rsp)
218	movl	%esi,32(%rsp)
219	movq	64(%rsp),%rbp
220	movdqa	%xmm2,%xmm1
221	movq	64+8(%rsp),%rsi
222	paddd	%xmm4,%xmm3
223	movq	64+16(%rsp),%rdi
224
# Feed-forward: add the input state to the worked state. rsp+48 still
# holds the counter used for this block because the stack copy has not
# been refreshed yet.
225	addl	$0x61707865,%eax
226	addl	$0x3320646e,%ebx
227	addl	$0x79622d32,%ecx
228	addl	$0x6b206574,%edx
229	addl	16(%rsp),%r8d
230	addl	20(%rsp),%r9d
231	addl	24(%rsp),%r10d
232	addl	28(%rsp),%r11d
233	addl	48(%rsp),%r12d
234	addl	52(%rsp),%r13d
235	addl	56(%rsp),%r14d
236	addl	60(%rsp),%r15d
237	paddd	32(%rsp),%xmm1
238
# Fewer than 64 bytes left: finish byte by byte.
239	cmpq	$64,%rbp
240	jb	L$tail
241
# Full block: XOR 64 source bytes with the keystream. Row 2 goes
# through xmm0 and xmm1, the other rows through the integer registers.
242	xorl	0(%rsi),%eax
243	xorl	4(%rsi),%ebx
244	xorl	8(%rsi),%ecx
245	xorl	12(%rsi),%edx
246	xorl	16(%rsi),%r8d
247	xorl	20(%rsi),%r9d
248	xorl	24(%rsi),%r10d
249	xorl	28(%rsi),%r11d
250	movdqu	32(%rsi),%xmm0
251	xorl	48(%rsi),%r12d
252	xorl	52(%rsi),%r13d
253	xorl	56(%rsi),%r14d
254	xorl	60(%rsi),%r15d
255	leaq	64(%rsi),%rsi
256	pxor	%xmm1,%xmm0
257
# Restore the pristine row 2 (its slot was used as scratch) and store
# the incremented counter dword for the next block.
258	movdqa	%xmm2,32(%rsp)
259	movd	%xmm3,48(%rsp)
260
261	movl	%eax,0(%rdi)
262	movl	%ebx,4(%rdi)
263	movl	%ecx,8(%rdi)
264	movl	%edx,12(%rdi)
265	movl	%r8d,16(%rdi)
266	movl	%r9d,20(%rdi)
267	movl	%r10d,24(%rdi)
268	movl	%r11d,28(%rdi)
269	movdqu	%xmm0,32(%rdi)
270	movl	%r12d,48(%rdi)
271	movl	%r13d,52(%rdi)
272	movl	%r14d,56(%rdi)
273	movl	%r15d,60(%rdi)
274	leaq	64(%rdi),%rdi
275
276	subq	$64,%rbp
277	jnz	L$oop_outer
278
279	jmp	L$done
280
# Partial final block: lay the 64 keystream bytes out at rsp+0 (this
# overwrites the saved state rows, which are no longer needed) and
# clear rbx for use as the byte index.
281.p2align	4
282L$tail:
283	movl	%eax,0(%rsp)
284	movl	%ebx,4(%rsp)
285	xorq	%rbx,%rbx
286	movl	%ecx,8(%rsp)
287	movl	%edx,12(%rsp)
288	movl	%r8d,16(%rsp)
289	movl	%r9d,20(%rsp)
290	movl	%r10d,24(%rsp)
291	movl	%r11d,28(%rsp)
292	movdqa	%xmm1,32(%rsp)
293	movl	%r12d,48(%rsp)
294	movl	%r13d,52(%rsp)
295	movl	%r14d,56(%rsp)
296	movl	%r15d,60(%rsp)
297
# XOR one byte per iteration. rbx is advanced before the store, hence
# the -1 displacement. rbp counts the remaining bytes down to zero.
298L$oop_tail:
299	movzbl	(%rsi,%rbx,1),%eax
300	movzbl	(%rsp,%rbx,1),%edx
301	leaq	1(%rbx),%rbx
302	xorl	%edx,%eax
303	movb	%al,-1(%rdi,%rbx,1)
304	decq	%rbp
305	jnz	L$oop_tail
306
# Epilogue: rsi = address just above the six saved registers, which
# are reloaded relative to it before rsp is set to that address.
307L$done:
308	leaq	64+24+48(%rsp),%rsi
309	movq	-48(%rsi),%r15
310
311	movq	-40(%rsi),%r14
312
313	movq	-32(%rsi),%r13
314
315	movq	-24(%rsi),%r12
316
317	movq	-16(%rsi),%rbp
318
319	movq	-8(%rsi),%rbx
320
321	leaq	(%rsi),%rsp
322
# The zero-length exit lands here without having touched the stack.
# The two bytes encode rep ret.
323L$no_data:
324	.byte	0xf3,0xc3
325
326
327
# ChaCha20_ssse3: one 64-byte block per iteration with the state held
# as four XMM rows (xmm0..xmm3 = rows 0..3).
# Reached by a jump from _GFp_ChaCha20_ctr32 before anything was
# pushed, so rdi, rsi, rdx, rcx and r8 still hold the values from
# function entry and r10 holds the capability bits. More than 128 bytes
# are handed on to the 4-block code.
#
# Frame: 72 bytes; rsp+0 .. rsp+63 keep the four input rows (row 3
# carries the running counter) and are reused for the keystream in the
# tail. r9 keeps the entry rsp for the epilogue.
328.p2align	5
329ChaCha20_ssse3:
330L$ChaCha20_ssse3:
331
332	movq	%rsp,%r9
333
334	cmpq	$128,%rdx
335	ja	L$ChaCha20_4x
336
# Also entered from the 4-block dispatcher, which has set r9 the same
# way before jumping here.
337L$do_sse3_after_all:
338	subq	$64+8,%rsp
339	movdqa	L$sigma(%rip),%xmm0
340	movdqu	(%rcx),%xmm1
341	movdqu	16(%rcx),%xmm2
342	movdqu	(%r8),%xmm3
# xmm6 and xmm7 stay loaded with the rotate-by-16 and rotate-by-8 byte
# shuffle masks for the whole routine.
343	movdqa	L$rot16(%rip),%xmm6
344	movdqa	L$rot24(%rip),%xmm7
345
346	movdqa	%xmm0,0(%rsp)
347	movdqa	%xmm1,16(%rsp)
348	movdqa	%xmm2,32(%rsp)
349	movdqa	%xmm3,48(%rsp)
# r8 is free now and becomes the round counter (10 iterations).
350	movq	$10,%r8
351	jmp	L$oop_ssse3
352
# Next block: reload rows 0..2, add one to the counter dword of the
# saved row 3 and write it back.
353.p2align	5
354L$oop_outer_ssse3:
355	movdqa	L$one(%rip),%xmm3
356	movdqa	0(%rsp),%xmm0
357	movdqa	16(%rsp),%xmm1
358	movdqa	32(%rsp),%xmm2
359	paddd	48(%rsp),%xmm3
360	movq	$10,%r8
361	movdqa	%xmm3,48(%rsp)
362	jmp	L$oop_ssse3
363
# Round loop: the first half works on the four columns at once, the
# second half on the four diagonals. Rotates by 16 and 8 use pshufb,
# rotates by 12 and 7 use a shift pair combined with por (xmm4 is the
# temporary).
364.p2align	5
365L$oop_ssse3:
366	paddd	%xmm1,%xmm0
367	pxor	%xmm0,%xmm3
# Hand-encoded pshufb %xmm6,%xmm3 (rotate row 3 left by 16).
368.byte	102,15,56,0,222
369	paddd	%xmm3,%xmm2
370	pxor	%xmm2,%xmm1
371	movdqa	%xmm1,%xmm4
372	psrld	$20,%xmm1
373	pslld	$12,%xmm4
374	por	%xmm4,%xmm1
375	paddd	%xmm1,%xmm0
376	pxor	%xmm0,%xmm3
# Hand-encoded pshufb %xmm7,%xmm3 (rotate row 3 left by 8).
377.byte	102,15,56,0,223
378	paddd	%xmm3,%xmm2
379	pxor	%xmm2,%xmm1
380	movdqa	%xmm1,%xmm4
381	psrld	$25,%xmm1
382	pslld	$7,%xmm4
383	por	%xmm4,%xmm1
# Rotate the dword lanes of rows 2, 1 and 3 by two, one and three
# positions so that the diagonals line up as columns.
384	pshufd	$78,%xmm2,%xmm2
385	pshufd	$57,%xmm1,%xmm1
386	pshufd	$147,%xmm3,%xmm3
387	nop
388	paddd	%xmm1,%xmm0
389	pxor	%xmm0,%xmm3
390.byte	102,15,56,0,222
391	paddd	%xmm3,%xmm2
392	pxor	%xmm2,%xmm1
393	movdqa	%xmm1,%xmm4
394	psrld	$20,%xmm1
395	pslld	$12,%xmm4
396	por	%xmm4,%xmm1
397	paddd	%xmm1,%xmm0
398	pxor	%xmm0,%xmm3
399.byte	102,15,56,0,223
400	paddd	%xmm3,%xmm2
401	pxor	%xmm2,%xmm1
402	movdqa	%xmm1,%xmm4
403	psrld	$25,%xmm1
404	pslld	$7,%xmm4
405	por	%xmm4,%xmm1
# Undo the lane rotation (rows 1 and 3 swap their shuffle constants).
406	pshufd	$78,%xmm2,%xmm2
407	pshufd	$147,%xmm1,%xmm1
408	pshufd	$57,%xmm3,%xmm3
409	decq	%r8
410	jnz	L$oop_ssse3
# Feed-forward: add the saved input rows to obtain the keystream block.
411	paddd	0(%rsp),%xmm0
412	paddd	16(%rsp),%xmm1
413	paddd	32(%rsp),%xmm2
414	paddd	48(%rsp),%xmm3
415
416	cmpq	$64,%rdx
417	jb	L$tail_ssse3
418
# Full block: XOR 64 bytes using xmm4 and xmm5 for the source data.
419	movdqu	0(%rsi),%xmm4
420	movdqu	16(%rsi),%xmm5
421	pxor	%xmm4,%xmm0
422	movdqu	32(%rsi),%xmm4
423	pxor	%xmm5,%xmm1
424	movdqu	48(%rsi),%xmm5
425	leaq	64(%rsi),%rsi
426	pxor	%xmm4,%xmm2
427	pxor	%xmm5,%xmm3
428
429	movdqu	%xmm0,0(%rdi)
430	movdqu	%xmm1,16(%rdi)
431	movdqu	%xmm2,32(%rdi)
432	movdqu	%xmm3,48(%rdi)
433	leaq	64(%rdi),%rdi
434
435	subq	$64,%rdx
436	jnz	L$oop_outer_ssse3
437
438	jmp	L$done_ssse3
439
# Partial final block: spill the keystream over the saved rows (they
# are not needed again) and XOR byte by byte with r8 as the index.
440.p2align	4
441L$tail_ssse3:
442	movdqa	%xmm0,0(%rsp)
443	movdqa	%xmm1,16(%rsp)
444	movdqa	%xmm2,32(%rsp)
445	movdqa	%xmm3,48(%rsp)
446	xorq	%r8,%r8
447
# r8 is advanced before the store, hence the -1 displacement.
448L$oop_tail_ssse3:
449	movzbl	(%rsi,%r8,1),%eax
450	movzbl	(%rsp,%r8,1),%ecx
451	leaq	1(%r8),%r8
452	xorl	%ecx,%eax
453	movb	%al,-1(%rdi,%r8,1)
454	decq	%rdx
455	jnz	L$oop_tail_ssse3
456
# Release the frame by restoring the rsp value captured in r9.
457L$done_ssse3:
458	leaq	(%r9),%rsp
459
# The two bytes encode rep ret.
460L$ssse3_epilogue:
461	.byte	0xf3,0xc3
462
463
464
# ChaCha20_4x: four 64-byte blocks per iteration. Every XMM register
# holds the same state word for four consecutive blocks (one block per
# dword lane); the result is transposed before it is XORed.
# Reached from ChaCha20_ssse3 when more than 128 bytes are requested.
# rdi, rsi, rdx, rcx, r8 still hold the values from function entry and
# r10 holds the 64 bits loaded from _GFp_ia32cap_P+4.
#
# Frame (0x140+8 bytes, r9 keeps the entry rsp):
#   rsp+0   .. rsp+63   scratch: row-2 spill during the rounds, then
#                       the transposed row-0 output of the four blocks
#   rsp+64  .. rsp+127  row 0 words, each broadcast to four lanes
#   rsp+128 .. rsp+191  row 1 words, broadcast
#   rsp+192 .. rsp+255  row 2 words, broadcast
#   rsp+256 .. rsp+319  row 3 words; the first vector holds the four
#                       per-lane block counters
# The upper slots are addressed through rcx = rsp+256.
# NOTE(review): presumably so that their displacements fit in one
# signed byte - confirm against the generating Perl script.
465.p2align	5
466ChaCha20_4x:
467L$ChaCha20_4x:
468
469	movq	%rsp,%r9
470
# r10 becomes the dword at _GFp_ia32cap_P+12; its bit 5 selects the
# 8-block code. r11 keeps the dword at _GFp_ia32cap_P+4.
471	movq	%r10,%r11
472	shrq	$32,%r10
473	testq	$32,%r10
474	jnz	L$ChaCha20_8x
475	cmpq	$192,%rdx
476	ja	L$proceed4x
477
# At most 192 bytes: when, of bits 22 and 26 of the dword at
# _GFp_ia32cap_P+4, only bit 22 is set, use the one-block SSSE3 loop.
# NOTE(review): presumably a tuning choice for a CPU family on which
# the 4-block code is slower - confirm against the Perl source.
478	andq	$71303168,%r11
479	cmpq	$4194304,%r11
480	je	L$do_sse3_after_all
481
482L$proceed4x:
483	subq	$0x140+8,%rsp
484	movdqa	L$sigma(%rip),%xmm11
485	movdqu	(%rcx),%xmm15
486	movdqu	16(%rcx),%xmm7
487	movdqu	(%r8),%xmm3
# The pointers in rcx, r10, r11 are replaced: rcx = frame base at
# rsp+256, r10 and r11 = addresses of the two pshufb masks.
488	leaq	256(%rsp),%rcx
489	leaq	L$rot16(%rip),%r10
490	leaq	L$rot24(%rip),%r11
491
# Broadcast each dword of row 0 into xmm8..xmm11 and save the copies.
492	pshufd	$0x00,%xmm11,%xmm8
493	pshufd	$0x55,%xmm11,%xmm9
494	movdqa	%xmm8,64(%rsp)
495	pshufd	$0xaa,%xmm11,%xmm10
496	movdqa	%xmm9,80(%rsp)
497	pshufd	$0xff,%xmm11,%xmm11
498	movdqa	%xmm10,96(%rsp)
499	movdqa	%xmm11,112(%rsp)
500
# Row 1 into xmm12..xmm15.
501	pshufd	$0x00,%xmm15,%xmm12
502	pshufd	$0x55,%xmm15,%xmm13
503	movdqa	%xmm12,128-256(%rcx)
504	pshufd	$0xaa,%xmm15,%xmm14
505	movdqa	%xmm13,144-256(%rcx)
506	pshufd	$0xff,%xmm15,%xmm15
507	movdqa	%xmm14,160-256(%rcx)
508	movdqa	%xmm15,176-256(%rcx)
509
# Row 2 into xmm4..xmm7.
510	pshufd	$0x00,%xmm7,%xmm4
511	pshufd	$0x55,%xmm7,%xmm5
512	movdqa	%xmm4,192-256(%rcx)
513	pshufd	$0xaa,%xmm7,%xmm6
514	movdqa	%xmm5,208-256(%rcx)
515	pshufd	$0xff,%xmm7,%xmm7
516	movdqa	%xmm6,224-256(%rcx)
517	movdqa	%xmm7,240-256(%rcx)
518
# Row 3 into xmm0..xmm3. The counter vector xmm0 gets lane offsets
# 0..3 added; it is stored at L$oop_enter4x rather than here.
519	pshufd	$0x00,%xmm3,%xmm0
520	pshufd	$0x55,%xmm3,%xmm1
521	paddd	L$inc(%rip),%xmm0
522	pshufd	$0xaa,%xmm3,%xmm2
523	movdqa	%xmm1,272-256(%rcx)
524	pshufd	$0xff,%xmm3,%xmm3
525	movdqa	%xmm2,288-256(%rcx)
526	movdqa	%xmm3,304-256(%rcx)
527
528	jmp	L$oop_enter4x
529
# Next group of four blocks: reload the whole broadcast state and
# advance every lane counter by four.
530.p2align	5
531L$oop_outer4x:
532	movdqa	64(%rsp),%xmm8
533	movdqa	80(%rsp),%xmm9
534	movdqa	96(%rsp),%xmm10
535	movdqa	112(%rsp),%xmm11
536	movdqa	128-256(%rcx),%xmm12
537	movdqa	144-256(%rcx),%xmm13
538	movdqa	160-256(%rcx),%xmm14
539	movdqa	176-256(%rcx),%xmm15
540	movdqa	192-256(%rcx),%xmm4
541	movdqa	208-256(%rcx),%xmm5
542	movdqa	224-256(%rcx),%xmm6
543	movdqa	240-256(%rcx),%xmm7
544	movdqa	256-256(%rcx),%xmm0
545	movdqa	272-256(%rcx),%xmm1
546	movdqa	288-256(%rcx),%xmm2
547	movdqa	304-256(%rcx),%xmm3
548	paddd	L$four(%rip),%xmm0
549
# Spill state words 10 and 11 (xmm6, xmm7) so those two registers can
# serve as temporaries and shuffle masks, preload the rotate-by-16
# mask, set eax to 10 iterations and save the current counter vector.
550L$oop_enter4x:
551	movdqa	%xmm6,32(%rsp)
552	movdqa	%xmm7,48(%rsp)
553	movdqa	(%r10),%xmm7
554	movl	$10,%eax
555	movdqa	%xmm0,256-256(%rcx)
556	jmp	L$oop4x
557
# Round loop. Register assignment:
#   words 0-3   xmm8..xmm11
#   words 4-7   xmm12..xmm15
#   words 8-11  xmm4 and xmm5 hold two at a time, the other pair is
#               spilled at rsp+0/rsp+16 or rsp+32/rsp+48
#   words 12-15 xmm0..xmm3
# xmm6 and xmm7 alternate between shift temporaries and the pshufb
# masks, which are reloaded from (%r10) and (%r11) after each use as a
# temporary. Every .byte line is a hand-encoded pshufb.
558.p2align	5
559L$oop4x:
# Columns: words (0,4,8,12) and (1,5,9,13).
560	paddd	%xmm12,%xmm8
561	paddd	%xmm13,%xmm9
562	pxor	%xmm8,%xmm0
563	pxor	%xmm9,%xmm1
# pshufb %xmm7,%xmm0 and pshufb %xmm7,%xmm1 (rotate left by 16).
564.byte	102,15,56,0,199
565.byte	102,15,56,0,207
566	paddd	%xmm0,%xmm4
567	paddd	%xmm1,%xmm5
568	pxor	%xmm4,%xmm12
569	pxor	%xmm5,%xmm13
570	movdqa	%xmm12,%xmm6
571	pslld	$12,%xmm12
572	psrld	$20,%xmm6
573	movdqa	%xmm13,%xmm7
574	pslld	$12,%xmm13
575	por	%xmm6,%xmm12
576	psrld	$20,%xmm7
577	movdqa	(%r11),%xmm6
578	por	%xmm7,%xmm13
579	paddd	%xmm12,%xmm8
580	paddd	%xmm13,%xmm9
581	pxor	%xmm8,%xmm0
582	pxor	%xmm9,%xmm1
# pshufb %xmm6,%xmm0 and pshufb %xmm6,%xmm1 (rotate left by 8).
583.byte	102,15,56,0,198
584.byte	102,15,56,0,206
585	paddd	%xmm0,%xmm4
586	paddd	%xmm1,%xmm5
587	pxor	%xmm4,%xmm12
588	pxor	%xmm5,%xmm13
589	movdqa	%xmm12,%xmm7
590	pslld	$7,%xmm12
591	psrld	$25,%xmm7
592	movdqa	%xmm13,%xmm6
593	pslld	$7,%xmm13
594	por	%xmm7,%xmm12
595	psrld	$25,%xmm6
596	movdqa	(%r10),%xmm7
597	por	%xmm6,%xmm13
# Swap the row-2 pair: spill words 8 and 9, load words 10 and 11.
598	movdqa	%xmm4,0(%rsp)
599	movdqa	%xmm5,16(%rsp)
600	movdqa	32(%rsp),%xmm4
601	movdqa	48(%rsp),%xmm5
# Columns: words (2,6,10,14) and (3,7,11,15).
602	paddd	%xmm14,%xmm10
603	paddd	%xmm15,%xmm11
604	pxor	%xmm10,%xmm2
605	pxor	%xmm11,%xmm3
# pshufb %xmm7,%xmm2 and pshufb %xmm7,%xmm3.
606.byte	102,15,56,0,215
607.byte	102,15,56,0,223
608	paddd	%xmm2,%xmm4
609	paddd	%xmm3,%xmm5
610	pxor	%xmm4,%xmm14
611	pxor	%xmm5,%xmm15
612	movdqa	%xmm14,%xmm6
613	pslld	$12,%xmm14
614	psrld	$20,%xmm6
615	movdqa	%xmm15,%xmm7
616	pslld	$12,%xmm15
617	por	%xmm6,%xmm14
618	psrld	$20,%xmm7
619	movdqa	(%r11),%xmm6
620	por	%xmm7,%xmm15
621	paddd	%xmm14,%xmm10
622	paddd	%xmm15,%xmm11
623	pxor	%xmm10,%xmm2
624	pxor	%xmm11,%xmm3
# pshufb %xmm6,%xmm2 and pshufb %xmm6,%xmm3.
625.byte	102,15,56,0,214
626.byte	102,15,56,0,222
627	paddd	%xmm2,%xmm4
628	paddd	%xmm3,%xmm5
629	pxor	%xmm4,%xmm14
630	pxor	%xmm5,%xmm15
631	movdqa	%xmm14,%xmm7
632	pslld	$7,%xmm14
633	psrld	$25,%xmm7
634	movdqa	%xmm15,%xmm6
635	pslld	$7,%xmm15
636	por	%xmm7,%xmm14
637	psrld	$25,%xmm6
638	movdqa	(%r10),%xmm7
639	por	%xmm6,%xmm15
# Diagonals: words (0,5,10,15) and (1,6,11,12). xmm4 and xmm5 still
# hold words 10 and 11.
640	paddd	%xmm13,%xmm8
641	paddd	%xmm14,%xmm9
642	pxor	%xmm8,%xmm3
643	pxor	%xmm9,%xmm0
# pshufb %xmm7,%xmm3 and pshufb %xmm7,%xmm0.
644.byte	102,15,56,0,223
645.byte	102,15,56,0,199
646	paddd	%xmm3,%xmm4
647	paddd	%xmm0,%xmm5
648	pxor	%xmm4,%xmm13
649	pxor	%xmm5,%xmm14
650	movdqa	%xmm13,%xmm6
651	pslld	$12,%xmm13
652	psrld	$20,%xmm6
653	movdqa	%xmm14,%xmm7
654	pslld	$12,%xmm14
655	por	%xmm6,%xmm13
656	psrld	$20,%xmm7
657	movdqa	(%r11),%xmm6
658	por	%xmm7,%xmm14
659	paddd	%xmm13,%xmm8
660	paddd	%xmm14,%xmm9
661	pxor	%xmm8,%xmm3
662	pxor	%xmm9,%xmm0
# pshufb %xmm6,%xmm3 and pshufb %xmm6,%xmm0.
663.byte	102,15,56,0,222
664.byte	102,15,56,0,198
665	paddd	%xmm3,%xmm4
666	paddd	%xmm0,%xmm5
667	pxor	%xmm4,%xmm13
668	pxor	%xmm5,%xmm14
669	movdqa	%xmm13,%xmm7
670	pslld	$7,%xmm13
671	psrld	$25,%xmm7
672	movdqa	%xmm14,%xmm6
673	pslld	$7,%xmm14
674	por	%xmm7,%xmm13
675	psrld	$25,%xmm6
676	movdqa	(%r10),%xmm7
677	por	%xmm6,%xmm14
# Swap the row-2 pair back: spill words 10 and 11, load words 8 and 9.
678	movdqa	%xmm4,32(%rsp)
679	movdqa	%xmm5,48(%rsp)
680	movdqa	0(%rsp),%xmm4
681	movdqa	16(%rsp),%xmm5
# Diagonals: words (2,7,8,13) and (3,4,9,14).
682	paddd	%xmm15,%xmm10
683	paddd	%xmm12,%xmm11
684	pxor	%xmm10,%xmm1
685	pxor	%xmm11,%xmm2
# pshufb %xmm7,%xmm1 and pshufb %xmm7,%xmm2.
686.byte	102,15,56,0,207
687.byte	102,15,56,0,215
688	paddd	%xmm1,%xmm4
689	paddd	%xmm2,%xmm5
690	pxor	%xmm4,%xmm15
691	pxor	%xmm5,%xmm12
692	movdqa	%xmm15,%xmm6
693	pslld	$12,%xmm15
694	psrld	$20,%xmm6
695	movdqa	%xmm12,%xmm7
696	pslld	$12,%xmm12
697	por	%xmm6,%xmm15
698	psrld	$20,%xmm7
699	movdqa	(%r11),%xmm6
700	por	%xmm7,%xmm12
701	paddd	%xmm15,%xmm10
702	paddd	%xmm12,%xmm11
703	pxor	%xmm10,%xmm1
704	pxor	%xmm11,%xmm2
# pshufb %xmm6,%xmm1 and pshufb %xmm6,%xmm2.
705.byte	102,15,56,0,206
706.byte	102,15,56,0,214
707	paddd	%xmm1,%xmm4
708	paddd	%xmm2,%xmm5
709	pxor	%xmm4,%xmm15
710	pxor	%xmm5,%xmm12
711	movdqa	%xmm15,%xmm7
712	pslld	$7,%xmm15
713	psrld	$25,%xmm7
714	movdqa	%xmm12,%xmm6
715	pslld	$7,%xmm12
716	por	%xmm7,%xmm15
717	psrld	$25,%xmm6
718	movdqa	(%r10),%xmm7
719	por	%xmm6,%xmm12
720	decl	%eax
721	jnz	L$oop4x
722
# Feed-forward and transposition, one state row at a time. Each row is
# first added to its saved input, then its 4x4 dword matrix is
# transposed with unpack instructions so that every register holds 16
# consecutive keystream bytes of a single block.
#
# Row 0: blocks 0..3 end up in xmm8, xmm9, xmm6, xmm11.
723	paddd	64(%rsp),%xmm8
724	paddd	80(%rsp),%xmm9
725	paddd	96(%rsp),%xmm10
726	paddd	112(%rsp),%xmm11
727
728	movdqa	%xmm8,%xmm6
729	punpckldq	%xmm9,%xmm8
730	movdqa	%xmm10,%xmm7
731	punpckldq	%xmm11,%xmm10
732	punpckhdq	%xmm9,%xmm6
733	punpckhdq	%xmm11,%xmm7
734	movdqa	%xmm8,%xmm9
735	punpcklqdq	%xmm10,%xmm8
736	movdqa	%xmm6,%xmm11
737	punpcklqdq	%xmm7,%xmm6
738	punpckhqdq	%xmm10,%xmm9
739	punpckhqdq	%xmm7,%xmm11
740	paddd	128-256(%rcx),%xmm12
741	paddd	144-256(%rcx),%xmm13
742	paddd	160-256(%rcx),%xmm14
743	paddd	176-256(%rcx),%xmm15
744
# Park row 0 of blocks 0 and 1, then fetch the spilled words 10 and 11
# into xmm8 and xmm9.
745	movdqa	%xmm8,0(%rsp)
746	movdqa	%xmm9,16(%rsp)
747	movdqa	32(%rsp),%xmm8
748	movdqa	48(%rsp),%xmm9
749
# Row 1: blocks 0..3 end up in xmm12, xmm13, xmm10, xmm15.
750	movdqa	%xmm12,%xmm10
751	punpckldq	%xmm13,%xmm12
752	movdqa	%xmm14,%xmm7
753	punpckldq	%xmm15,%xmm14
754	punpckhdq	%xmm13,%xmm10
755	punpckhdq	%xmm15,%xmm7
756	movdqa	%xmm12,%xmm13
757	punpcklqdq	%xmm14,%xmm12
758	movdqa	%xmm10,%xmm15
759	punpcklqdq	%xmm7,%xmm10
760	punpckhqdq	%xmm14,%xmm13
761	punpckhqdq	%xmm7,%xmm15
762	paddd	192-256(%rcx),%xmm4
763	paddd	208-256(%rcx),%xmm5
764	paddd	224-256(%rcx),%xmm8
765	paddd	240-256(%rcx),%xmm9
766
# Park row 0 of blocks 2 and 3 in the slots that were just emptied.
767	movdqa	%xmm6,32(%rsp)
768	movdqa	%xmm11,48(%rsp)
769
# Row 2: blocks 0..3 end up in xmm4, xmm5, xmm14, xmm9.
770	movdqa	%xmm4,%xmm14
771	punpckldq	%xmm5,%xmm4
772	movdqa	%xmm8,%xmm7
773	punpckldq	%xmm9,%xmm8
774	punpckhdq	%xmm5,%xmm14
775	punpckhdq	%xmm9,%xmm7
776	movdqa	%xmm4,%xmm5
777	punpcklqdq	%xmm8,%xmm4
778	movdqa	%xmm14,%xmm9
779	punpcklqdq	%xmm7,%xmm14
780	punpckhqdq	%xmm8,%xmm5
781	punpckhqdq	%xmm7,%xmm9
782	paddd	256-256(%rcx),%xmm0
783	paddd	272-256(%rcx),%xmm1
784	paddd	288-256(%rcx),%xmm2
785	paddd	304-256(%rcx),%xmm3
786
# Row 3: blocks 0..3 end up in xmm0, xmm1, xmm8, xmm3.
787	movdqa	%xmm0,%xmm8
788	punpckldq	%xmm1,%xmm0
789	movdqa	%xmm2,%xmm7
790	punpckldq	%xmm3,%xmm2
791	punpckhdq	%xmm1,%xmm8
792	punpckhdq	%xmm3,%xmm7
793	movdqa	%xmm0,%xmm1
794	punpcklqdq	%xmm2,%xmm0
795	movdqa	%xmm8,%xmm3
796	punpcklqdq	%xmm7,%xmm8
797	punpckhqdq	%xmm2,%xmm1
798	punpckhqdq	%xmm7,%xmm3
799	cmpq	$256,%rdx
800	jb	L$tail4x
801
# Full 256 bytes. Source data passes through xmm6, xmm11, xmm2, xmm7;
# loads for the next block are interleaved with stores of the current
# one. Block 0 uses rsp+0, xmm12, xmm4, xmm0.
802	movdqu	0(%rsi),%xmm6
803	movdqu	16(%rsi),%xmm11
804	movdqu	32(%rsi),%xmm2
805	movdqu	48(%rsi),%xmm7
806	pxor	0(%rsp),%xmm6
807	pxor	%xmm12,%xmm11
808	pxor	%xmm4,%xmm2
809	pxor	%xmm0,%xmm7
810
# Block 1 uses rsp+16, xmm13, xmm5, xmm1.
811	movdqu	%xmm6,0(%rdi)
812	movdqu	64(%rsi),%xmm6
813	movdqu	%xmm11,16(%rdi)
814	movdqu	80(%rsi),%xmm11
815	movdqu	%xmm2,32(%rdi)
816	movdqu	96(%rsi),%xmm2
817	movdqu	%xmm7,48(%rdi)
818	movdqu	112(%rsi),%xmm7
819	leaq	128(%rsi),%rsi
820	pxor	16(%rsp),%xmm6
821	pxor	%xmm13,%xmm11
822	pxor	%xmm5,%xmm2
823	pxor	%xmm1,%xmm7
824
# Block 2 uses rsp+32, xmm10, xmm14, xmm8.
825	movdqu	%xmm6,64(%rdi)
826	movdqu	0(%rsi),%xmm6
827	movdqu	%xmm11,80(%rdi)
828	movdqu	16(%rsi),%xmm11
829	movdqu	%xmm2,96(%rdi)
830	movdqu	32(%rsi),%xmm2
831	movdqu	%xmm7,112(%rdi)
832	leaq	128(%rdi),%rdi
833	movdqu	48(%rsi),%xmm7
834	pxor	32(%rsp),%xmm6
835	pxor	%xmm10,%xmm11
836	pxor	%xmm14,%xmm2
837	pxor	%xmm8,%xmm7
838
# Block 3 uses rsp+48, xmm15, xmm9, xmm3.
839	movdqu	%xmm6,0(%rdi)
840	movdqu	64(%rsi),%xmm6
841	movdqu	%xmm11,16(%rdi)
842	movdqu	80(%rsi),%xmm11
843	movdqu	%xmm2,32(%rdi)
844	movdqu	96(%rsi),%xmm2
845	movdqu	%xmm7,48(%rdi)
846	movdqu	112(%rsi),%xmm7
847	leaq	128(%rsi),%rsi
848	pxor	48(%rsp),%xmm6
849	pxor	%xmm15,%xmm11
850	pxor	%xmm9,%xmm2
851	pxor	%xmm3,%xmm7
852	movdqu	%xmm6,64(%rdi)
853	movdqu	%xmm11,80(%rdi)
854	movdqu	%xmm2,96(%rdi)
855	movdqu	%xmm7,112(%rdi)
856	leaq	128(%rdi),%rdi
857
858	subq	$256,%rdx
859	jnz	L$oop_outer4x
860
861	jmp	L$done4x
862
# Fewer than 256 bytes remain. Branch on how many whole blocks can
# still be XORed with vector instructions. The flags of the deciding
# cmpq stay live: the handlers below test them again with je after
# running only SSE moves and XORs, which leave EFLAGS untouched.
863L$tail4x:
864	cmpq	$192,%rdx
865	jae	L$192_or_more4x
866	cmpq	$128,%rdx
867	jae	L$128_or_more4x
868	cmpq	$64,%rdx
869	jae	L$64_or_more4x
870
871
# Fewer than 64 bytes: complete the block 0 keystream at rsp+0 (its
# first 16 bytes are already there) and go to the byte loop with
# index r10 = 0.
872	xorq	%r10,%r10
873
874	movdqa	%xmm12,16(%rsp)
875	movdqa	%xmm4,32(%rsp)
876	movdqa	%xmm0,48(%rsp)
877	jmp	L$oop_tail4x
878
# Between 64 and 127 bytes: XOR block 0 with vector instructions.
879.p2align	5
880L$64_or_more4x:
881	movdqu	0(%rsi),%xmm6
882	movdqu	16(%rsi),%xmm11
883	movdqu	32(%rsi),%xmm2
884	movdqu	48(%rsi),%xmm7
885	pxor	0(%rsp),%xmm6
886	pxor	%xmm12,%xmm11
887	pxor	%xmm4,%xmm2
888	pxor	%xmm0,%xmm7
889	movdqu	%xmm6,0(%rdi)
890	movdqu	%xmm11,16(%rdi)
891	movdqu	%xmm2,32(%rdi)
892	movdqu	%xmm7,48(%rdi)
# ZF is still the result of cmpq $64,%rdx: exactly 64 bytes means done.
893	je	L$done4x
894
# Otherwise lay the block 1 keystream out at rsp+0 for the byte loop.
# The lea and movdqa instructions do not alter flags; subq is the only
# flag writer here and the byte loop sets its own flags.
895	movdqa	16(%rsp),%xmm6
896	leaq	64(%rsi),%rsi
897	xorq	%r10,%r10
898	movdqa	%xmm6,0(%rsp)
899	movdqa	%xmm13,16(%rsp)
900	leaq	64(%rdi),%rdi
901	movdqa	%xmm5,32(%rsp)
902	subq	$64,%rdx
903	movdqa	%xmm1,48(%rsp)
904	jmp	L$oop_tail4x
905
# Between 128 and 191 bytes: XOR blocks 0 and 1 with vector
# instructions.
906.p2align	5
907L$128_or_more4x:
908	movdqu	0(%rsi),%xmm6
909	movdqu	16(%rsi),%xmm11
910	movdqu	32(%rsi),%xmm2
911	movdqu	48(%rsi),%xmm7
912	pxor	0(%rsp),%xmm6
913	pxor	%xmm12,%xmm11
914	pxor	%xmm4,%xmm2
915	pxor	%xmm0,%xmm7
916
917	movdqu	%xmm6,0(%rdi)
918	movdqu	64(%rsi),%xmm6
919	movdqu	%xmm11,16(%rdi)
920	movdqu	80(%rsi),%xmm11
921	movdqu	%xmm2,32(%rdi)
922	movdqu	96(%rsi),%xmm2
923	movdqu	%xmm7,48(%rdi)
924	movdqu	112(%rsi),%xmm7
925	pxor	16(%rsp),%xmm6
926	pxor	%xmm13,%xmm11
927	pxor	%xmm5,%xmm2
928	pxor	%xmm1,%xmm7
929	movdqu	%xmm6,64(%rdi)
930	movdqu	%xmm11,80(%rdi)
931	movdqu	%xmm2,96(%rdi)
932	movdqu	%xmm7,112(%rdi)
# ZF is still the result of cmpq $128,%rdx.
933	je	L$done4x
934
# Lay the block 2 keystream out at rsp+0 for the byte loop.
935	movdqa	32(%rsp),%xmm6
936	leaq	128(%rsi),%rsi
937	xorq	%r10,%r10
938	movdqa	%xmm6,0(%rsp)
939	movdqa	%xmm10,16(%rsp)
940	leaq	128(%rdi),%rdi
941	movdqa	%xmm14,32(%rsp)
942	subq	$128,%rdx
943	movdqa	%xmm8,48(%rsp)
944	jmp	L$oop_tail4x
945
# Between 192 and 255 bytes: XOR blocks 0, 1 and 2 with vector
# instructions. rsi and rdi are advanced by 128 part way through.
946.p2align	5
947L$192_or_more4x:
948	movdqu	0(%rsi),%xmm6
949	movdqu	16(%rsi),%xmm11
950	movdqu	32(%rsi),%xmm2
951	movdqu	48(%rsi),%xmm7
952	pxor	0(%rsp),%xmm6
953	pxor	%xmm12,%xmm11
954	pxor	%xmm4,%xmm2
955	pxor	%xmm0,%xmm7
956
957	movdqu	%xmm6,0(%rdi)
958	movdqu	64(%rsi),%xmm6
959	movdqu	%xmm11,16(%rdi)
960	movdqu	80(%rsi),%xmm11
961	movdqu	%xmm2,32(%rdi)
962	movdqu	96(%rsi),%xmm2
963	movdqu	%xmm7,48(%rdi)
964	movdqu	112(%rsi),%xmm7
965	leaq	128(%rsi),%rsi
966	pxor	16(%rsp),%xmm6
967	pxor	%xmm13,%xmm11
968	pxor	%xmm5,%xmm2
969	pxor	%xmm1,%xmm7
970
971	movdqu	%xmm6,64(%rdi)
972	movdqu	0(%rsi),%xmm6
973	movdqu	%xmm11,80(%rdi)
974	movdqu	16(%rsi),%xmm11
975	movdqu	%xmm2,96(%rdi)
976	movdqu	32(%rsi),%xmm2
977	movdqu	%xmm7,112(%rdi)
978	leaq	128(%rdi),%rdi
979	movdqu	48(%rsi),%xmm7
980	pxor	32(%rsp),%xmm6
981	pxor	%xmm10,%xmm11
982	pxor	%xmm14,%xmm2
983	pxor	%xmm8,%xmm7
984	movdqu	%xmm6,0(%rdi)
985	movdqu	%xmm11,16(%rdi)
986	movdqu	%xmm2,32(%rdi)
987	movdqu	%xmm7,48(%rdi)
# ZF is still the result of cmpq $192,%rdx.
988	je	L$done4x
989
# Lay the block 3 keystream out at rsp+0. The pointers already moved
# by 128 above, so 64 more brings them to the start of block 3.
# Execution falls through into the byte loop.
990	movdqa	48(%rsp),%xmm6
991	leaq	64(%rsi),%rsi
992	xorq	%r10,%r10
993	movdqa	%xmm6,0(%rsp)
994	movdqa	%xmm15,16(%rsp)
995	leaq	64(%rdi),%rdi
996	movdqa	%xmm9,32(%rsp)
997	subq	$192,%rdx
998	movdqa	%xmm3,48(%rsp)
999
# Byte loop: XOR the remaining rdx bytes with the keystream at rsp+0.
# r10 is advanced before the store, hence the -1 displacement.
1000L$oop_tail4x:
1001	movzbl	(%rsi,%r10,1),%eax
1002	movzbl	(%rsp,%r10,1),%ecx
1003	leaq	1(%r10),%r10
1004	xorl	%ecx,%eax
1005	movb	%al,-1(%rdi,%r10,1)
1006	decq	%rdx
1007	jnz	L$oop_tail4x
1008
# Release the frame by restoring the rsp value captured in r9.
1009L$done4x:
1010	leaq	(%r9),%rsp
1011
# The two bytes encode rep ret.
1012L$4x_epilogue:
1013	.byte	0xf3,0xc3
1014
1015
1016
1017.p2align	5
1018ChaCha20_8x:
1019L$ChaCha20_8x:
1020
1021	movq	%rsp,%r9
1022
1023	subq	$0x280+8,%rsp
1024	andq	$-32,%rsp
1025	vzeroupper
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036	vbroadcasti128	L$sigma(%rip),%ymm11
1037	vbroadcasti128	(%rcx),%ymm3
1038	vbroadcasti128	16(%rcx),%ymm15
1039	vbroadcasti128	(%r8),%ymm7
1040	leaq	256(%rsp),%rcx
1041	leaq	512(%rsp),%rax
1042	leaq	L$rot16(%rip),%r10
1043	leaq	L$rot24(%rip),%r11
1044
1045	vpshufd	$0x00,%ymm11,%ymm8
1046	vpshufd	$0x55,%ymm11,%ymm9
1047	vmovdqa	%ymm8,128-256(%rcx)
1048	vpshufd	$0xaa,%ymm11,%ymm10
1049	vmovdqa	%ymm9,160-256(%rcx)
1050	vpshufd	$0xff,%ymm11,%ymm11
1051	vmovdqa	%ymm10,192-256(%rcx)
1052	vmovdqa	%ymm11,224-256(%rcx)
1053
1054	vpshufd	$0x00,%ymm3,%ymm0
1055	vpshufd	$0x55,%ymm3,%ymm1
1056	vmovdqa	%ymm0,256-256(%rcx)
1057	vpshufd	$0xaa,%ymm3,%ymm2
1058	vmovdqa	%ymm1,288-256(%rcx)
1059	vpshufd	$0xff,%ymm3,%ymm3
1060	vmovdqa	%ymm2,320-256(%rcx)
1061	vmovdqa	%ymm3,352-256(%rcx)
1062
1063	vpshufd	$0x00,%ymm15,%ymm12
1064	vpshufd	$0x55,%ymm15,%ymm13
1065	vmovdqa	%ymm12,384-512(%rax)
1066	vpshufd	$0xaa,%ymm15,%ymm14
1067	vmovdqa	%ymm13,416-512(%rax)
1068	vpshufd	$0xff,%ymm15,%ymm15
1069	vmovdqa	%ymm14,448-512(%rax)
1070	vmovdqa	%ymm15,480-512(%rax)
1071
1072	vpshufd	$0x00,%ymm7,%ymm4
1073	vpshufd	$0x55,%ymm7,%ymm5
1074	vpaddd	L$incy(%rip),%ymm4,%ymm4
1075	vpshufd	$0xaa,%ymm7,%ymm6
1076	vmovdqa	%ymm5,544-512(%rax)
1077	vpshufd	$0xff,%ymm7,%ymm7
1078	vmovdqa	%ymm6,576-512(%rax)
1079	vmovdqa	%ymm7,608-512(%rax)
1080
1081	jmp	L$oop_enter8x
1082
1083.p2align	5
1084L$oop_outer8x:
1085	vmovdqa	128-256(%rcx),%ymm8
1086	vmovdqa	160-256(%rcx),%ymm9
1087	vmovdqa	192-256(%rcx),%ymm10
1088	vmovdqa	224-256(%rcx),%ymm11
1089	vmovdqa	256-256(%rcx),%ymm0
1090	vmovdqa	288-256(%rcx),%ymm1
1091	vmovdqa	320-256(%rcx),%ymm2
1092	vmovdqa	352-256(%rcx),%ymm3
1093	vmovdqa	384-512(%rax),%ymm12
1094	vmovdqa	416-512(%rax),%ymm13
1095	vmovdqa	448-512(%rax),%ymm14
1096	vmovdqa	480-512(%rax),%ymm15
1097	vmovdqa	512-512(%rax),%ymm4
1098	vmovdqa	544-512(%rax),%ymm5
1099	vmovdqa	576-512(%rax),%ymm6
1100	vmovdqa	608-512(%rax),%ymm7
1101	vpaddd	L$eight(%rip),%ymm4,%ymm4
1102
1103L$oop_enter8x:
1104	vmovdqa	%ymm14,64(%rsp)
1105	vmovdqa	%ymm15,96(%rsp)
1106	vbroadcasti128	(%r10),%ymm15
1107	vmovdqa	%ymm4,512-512(%rax)
1108	movl	$10,%eax
1109	jmp	L$oop8x
1110
1111.p2align	5
1112L$oop8x:
1113	vpaddd	%ymm0,%ymm8,%ymm8
1114	vpxor	%ymm4,%ymm8,%ymm4
1115	vpshufb	%ymm15,%ymm4,%ymm4
1116	vpaddd	%ymm1,%ymm9,%ymm9
1117	vpxor	%ymm5,%ymm9,%ymm5
1118	vpshufb	%ymm15,%ymm5,%ymm5
1119	vpaddd	%ymm4,%ymm12,%ymm12
1120	vpxor	%ymm0,%ymm12,%ymm0
1121	vpslld	$12,%ymm0,%ymm14
1122	vpsrld	$20,%ymm0,%ymm0
1123	vpor	%ymm0,%ymm14,%ymm0
1124	vbroadcasti128	(%r11),%ymm14
1125	vpaddd	%ymm5,%ymm13,%ymm13
1126	vpxor	%ymm1,%ymm13,%ymm1
1127	vpslld	$12,%ymm1,%ymm15
1128	vpsrld	$20,%ymm1,%ymm1
1129	vpor	%ymm1,%ymm15,%ymm1
1130	vpaddd	%ymm0,%ymm8,%ymm8
1131	vpxor	%ymm4,%ymm8,%ymm4
1132	vpshufb	%ymm14,%ymm4,%ymm4
1133	vpaddd	%ymm1,%ymm9,%ymm9
1134	vpxor	%ymm5,%ymm9,%ymm5
1135	vpshufb	%ymm14,%ymm5,%ymm5
1136	vpaddd	%ymm4,%ymm12,%ymm12
1137	vpxor	%ymm0,%ymm12,%ymm0
1138	vpslld	$7,%ymm0,%ymm15
1139	vpsrld	$25,%ymm0,%ymm0
1140	vpor	%ymm0,%ymm15,%ymm0
1141	vbroadcasti128	(%r10),%ymm15
1142	vpaddd	%ymm5,%ymm13,%ymm13
1143	vpxor	%ymm1,%ymm13,%ymm1
1144	vpslld	$7,%ymm1,%ymm14
1145	vpsrld	$25,%ymm1,%ymm1
1146	vpor	%ymm1,%ymm14,%ymm1
1147	vmovdqa	%ymm12,0(%rsp)
1148	vmovdqa	%ymm13,32(%rsp)
1149	vmovdqa	64(%rsp),%ymm12
1150	vmovdqa	96(%rsp),%ymm13
1151	vpaddd	%ymm2,%ymm10,%ymm10
1152	vpxor	%ymm6,%ymm10,%ymm6
1153	vpshufb	%ymm15,%ymm6,%ymm6
1154	vpaddd	%ymm3,%ymm11,%ymm11
1155	vpxor	%ymm7,%ymm11,%ymm7
1156	vpshufb	%ymm15,%ymm7,%ymm7
1157	vpaddd	%ymm6,%ymm12,%ymm12
1158	vpxor	%ymm2,%ymm12,%ymm2
1159	vpslld	$12,%ymm2,%ymm14
1160	vpsrld	$20,%ymm2,%ymm2
1161	vpor	%ymm2,%ymm14,%ymm2
1162	vbroadcasti128	(%r11),%ymm14
1163	vpaddd	%ymm7,%ymm13,%ymm13
1164	vpxor	%ymm3,%ymm13,%ymm3
1165	vpslld	$12,%ymm3,%ymm15
1166	vpsrld	$20,%ymm3,%ymm3
1167	vpor	%ymm3,%ymm15,%ymm3
1168	vpaddd	%ymm2,%ymm10,%ymm10
1169	vpxor	%ymm6,%ymm10,%ymm6
1170	vpshufb	%ymm14,%ymm6,%ymm6
1171	vpaddd	%ymm3,%ymm11,%ymm11
1172	vpxor	%ymm7,%ymm11,%ymm7
1173	vpshufb	%ymm14,%ymm7,%ymm7
1174	vpaddd	%ymm6,%ymm12,%ymm12
1175	vpxor	%ymm2,%ymm12,%ymm2
1176	vpslld	$7,%ymm2,%ymm15
1177	vpsrld	$25,%ymm2,%ymm2
1178	vpor	%ymm2,%ymm15,%ymm2
1179	vbroadcasti128	(%r10),%ymm15
1180	vpaddd	%ymm7,%ymm13,%ymm13
1181	vpxor	%ymm3,%ymm13,%ymm3
1182	vpslld	$7,%ymm3,%ymm14
1183	vpsrld	$25,%ymm3,%ymm3
1184	vpor	%ymm3,%ymm14,%ymm3
1185	vpaddd	%ymm1,%ymm8,%ymm8
1186	vpxor	%ymm7,%ymm8,%ymm7
1187	vpshufb	%ymm15,%ymm7,%ymm7
1188	vpaddd	%ymm2,%ymm9,%ymm9
1189	vpxor	%ymm4,%ymm9,%ymm4
1190	vpshufb	%ymm15,%ymm4,%ymm4
1191	vpaddd	%ymm7,%ymm12,%ymm12
1192	vpxor	%ymm1,%ymm12,%ymm1
1193	vpslld	$12,%ymm1,%ymm14
1194	vpsrld	$20,%ymm1,%ymm1
1195	vpor	%ymm1,%ymm14,%ymm1
1196	vbroadcasti128	(%r11),%ymm14
1197	vpaddd	%ymm4,%ymm13,%ymm13
1198	vpxor	%ymm2,%ymm13,%ymm2
1199	vpslld	$12,%ymm2,%ymm15
1200	vpsrld	$20,%ymm2,%ymm2
1201	vpor	%ymm2,%ymm15,%ymm2
1202	vpaddd	%ymm1,%ymm8,%ymm8
1203	vpxor	%ymm7,%ymm8,%ymm7
1204	vpshufb	%ymm14,%ymm7,%ymm7
1205	vpaddd	%ymm2,%ymm9,%ymm9
1206	vpxor	%ymm4,%ymm9,%ymm4
1207	vpshufb	%ymm14,%ymm4,%ymm4
1208	vpaddd	%ymm7,%ymm12,%ymm12
1209	vpxor	%ymm1,%ymm12,%ymm1
1210	vpslld	$7,%ymm1,%ymm15
1211	vpsrld	$25,%ymm1,%ymm1
1212	vpor	%ymm1,%ymm15,%ymm1
1213	vbroadcasti128	(%r10),%ymm15
1214	vpaddd	%ymm4,%ymm13,%ymm13
1215	vpxor	%ymm2,%ymm13,%ymm2
1216	vpslld	$7,%ymm2,%ymm14
1217	vpsrld	$25,%ymm2,%ymm2
1218	vpor	%ymm2,%ymm14,%ymm2
1219	vmovdqa	%ymm12,64(%rsp)
1220	vmovdqa	%ymm13,96(%rsp)
1221	vmovdqa	0(%rsp),%ymm12
1222	vmovdqa	32(%rsp),%ymm13
1223	vpaddd	%ymm3,%ymm10,%ymm10
1224	vpxor	%ymm5,%ymm10,%ymm5
1225	vpshufb	%ymm15,%ymm5,%ymm5
1226	vpaddd	%ymm0,%ymm11,%ymm11
1227	vpxor	%ymm6,%ymm11,%ymm6
1228	vpshufb	%ymm15,%ymm6,%ymm6
1229	vpaddd	%ymm5,%ymm12,%ymm12
1230	vpxor	%ymm3,%ymm12,%ymm3
1231	vpslld	$12,%ymm3,%ymm14
1232	vpsrld	$20,%ymm3,%ymm3
1233	vpor	%ymm3,%ymm14,%ymm3
1234	vbroadcasti128	(%r11),%ymm14
1235	vpaddd	%ymm6,%ymm13,%ymm13
1236	vpxor	%ymm0,%ymm13,%ymm0
1237	vpslld	$12,%ymm0,%ymm15
1238	vpsrld	$20,%ymm0,%ymm0
1239	vpor	%ymm0,%ymm15,%ymm0
1240	vpaddd	%ymm3,%ymm10,%ymm10
1241	vpxor	%ymm5,%ymm10,%ymm5
1242	vpshufb	%ymm14,%ymm5,%ymm5
1243	vpaddd	%ymm0,%ymm11,%ymm11
1244	vpxor	%ymm6,%ymm11,%ymm6
1245	vpshufb	%ymm14,%ymm6,%ymm6
1246	vpaddd	%ymm5,%ymm12,%ymm12
1247	vpxor	%ymm3,%ymm12,%ymm3
1248	vpslld	$7,%ymm3,%ymm15
1249	vpsrld	$25,%ymm3,%ymm3
1250	vpor	%ymm3,%ymm15,%ymm3
1251	vbroadcasti128	(%r10),%ymm15
1252	vpaddd	%ymm6,%ymm13,%ymm13
1253	vpxor	%ymm0,%ymm13,%ymm0
1254	vpslld	$7,%ymm0,%ymm14
1255	vpsrld	$25,%ymm0,%ymm0
1256	vpor	%ymm0,%ymm14,%ymm0
1257	decl	%eax
1258	jnz	L$oop8x
1259
1260	leaq	512(%rsp),%rax
1261	vpaddd	128-256(%rcx),%ymm8,%ymm8
1262	vpaddd	160-256(%rcx),%ymm9,%ymm9
1263	vpaddd	192-256(%rcx),%ymm10,%ymm10
1264	vpaddd	224-256(%rcx),%ymm11,%ymm11
1265
1266	vpunpckldq	%ymm9,%ymm8,%ymm14
1267	vpunpckldq	%ymm11,%ymm10,%ymm15
1268	vpunpckhdq	%ymm9,%ymm8,%ymm8
1269	vpunpckhdq	%ymm11,%ymm10,%ymm10
1270	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1271	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1272	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1273	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1274	vpaddd	256-256(%rcx),%ymm0,%ymm0
1275	vpaddd	288-256(%rcx),%ymm1,%ymm1
1276	vpaddd	320-256(%rcx),%ymm2,%ymm2
1277	vpaddd	352-256(%rcx),%ymm3,%ymm3
1278
1279	vpunpckldq	%ymm1,%ymm0,%ymm10
1280	vpunpckldq	%ymm3,%ymm2,%ymm15
1281	vpunpckhdq	%ymm1,%ymm0,%ymm0
1282	vpunpckhdq	%ymm3,%ymm2,%ymm2
1283	vpunpcklqdq	%ymm15,%ymm10,%ymm1
1284	vpunpckhqdq	%ymm15,%ymm10,%ymm10
1285	vpunpcklqdq	%ymm2,%ymm0,%ymm3
1286	vpunpckhqdq	%ymm2,%ymm0,%ymm0
1287	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
1288	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
1289	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
1290	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
1291	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
1292	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
1293	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
1294	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
1295	vmovdqa	%ymm15,0(%rsp)
1296	vmovdqa	%ymm9,32(%rsp)
1297	vmovdqa	64(%rsp),%ymm15
1298	vmovdqa	96(%rsp),%ymm9
1299
1300	vpaddd	384-512(%rax),%ymm12,%ymm12
1301	vpaddd	416-512(%rax),%ymm13,%ymm13
1302	vpaddd	448-512(%rax),%ymm15,%ymm15
1303	vpaddd	480-512(%rax),%ymm9,%ymm9
1304
1305	vpunpckldq	%ymm13,%ymm12,%ymm2
1306	vpunpckldq	%ymm9,%ymm15,%ymm8
1307	vpunpckhdq	%ymm13,%ymm12,%ymm12
1308	vpunpckhdq	%ymm9,%ymm15,%ymm15
1309	vpunpcklqdq	%ymm8,%ymm2,%ymm13
1310	vpunpckhqdq	%ymm8,%ymm2,%ymm2
1311	vpunpcklqdq	%ymm15,%ymm12,%ymm9
1312	vpunpckhqdq	%ymm15,%ymm12,%ymm12
1313	vpaddd	512-512(%rax),%ymm4,%ymm4
1314	vpaddd	544-512(%rax),%ymm5,%ymm5
1315	vpaddd	576-512(%rax),%ymm6,%ymm6
1316	vpaddd	608-512(%rax),%ymm7,%ymm7
1317
1318	vpunpckldq	%ymm5,%ymm4,%ymm15
1319	vpunpckldq	%ymm7,%ymm6,%ymm8
1320	vpunpckhdq	%ymm5,%ymm4,%ymm4
1321	vpunpckhdq	%ymm7,%ymm6,%ymm6
1322	vpunpcklqdq	%ymm8,%ymm15,%ymm5
1323	vpunpckhqdq	%ymm8,%ymm15,%ymm15
1324	vpunpcklqdq	%ymm6,%ymm4,%ymm7
1325	vpunpckhqdq	%ymm6,%ymm4,%ymm4
1326	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
1327	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
1328	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
1329	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
1330	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
1331	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
1332	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
1333	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
# Full-batch output stage of the 8x loop: the 16 ymm registers are XORed with
# the next 512 bytes at (%rsi) and stored to (%rdi) in four groups of 128.
# NOTE(review): the registers presumably hold ChaCha20 keystream computed
# above this view; confirm against the round code.
1334	vmovdqa	0(%rsp),%ymm6	# reload the two vectors parked in stack slots 0 and 32
1335	vmovdqa	32(%rsp),%ymm12
1336
1337	cmpq	$512,%rdx	# rdx = bytes still to process
1338	jb	L$tail8x	# less than a full batch left: partial-batch handling
1339
# Group one, offsets 0..127 of this batch: ymm6, ymm8, ymm1, ymm5.
1340	vpxor	0(%rsi),%ymm6,%ymm6
1341	vpxor	32(%rsi),%ymm8,%ymm8
1342	vpxor	64(%rsi),%ymm1,%ymm1
1343	vpxor	96(%rsi),%ymm5,%ymm5
1344	leaq	128(%rsi),%rsi	# advance the source pointer past this group
1345	vmovdqu	%ymm6,0(%rdi)
1346	vmovdqu	%ymm8,32(%rdi)
1347	vmovdqu	%ymm1,64(%rdi)
1348	vmovdqu	%ymm5,96(%rdi)
1349	leaq	128(%rdi),%rdi	# advance the destination pointer past this group
1350
# Group two, offsets 128..255: ymm12, ymm13, ymm10, ymm15.
1351	vpxor	0(%rsi),%ymm12,%ymm12
1352	vpxor	32(%rsi),%ymm13,%ymm13
1353	vpxor	64(%rsi),%ymm10,%ymm10
1354	vpxor	96(%rsi),%ymm15,%ymm15
1355	leaq	128(%rsi),%rsi
1356	vmovdqu	%ymm12,0(%rdi)
1357	vmovdqu	%ymm13,32(%rdi)
1358	vmovdqu	%ymm10,64(%rdi)
1359	vmovdqu	%ymm15,96(%rdi)
1360	leaq	128(%rdi),%rdi
1361
# Group three, offsets 256..383: ymm14, ymm2, ymm3, ymm7.
1362	vpxor	0(%rsi),%ymm14,%ymm14
1363	vpxor	32(%rsi),%ymm2,%ymm2
1364	vpxor	64(%rsi),%ymm3,%ymm3
1365	vpxor	96(%rsi),%ymm7,%ymm7
1366	leaq	128(%rsi),%rsi
1367	vmovdqu	%ymm14,0(%rdi)
1368	vmovdqu	%ymm2,32(%rdi)
1369	vmovdqu	%ymm3,64(%rdi)
1370	vmovdqu	%ymm7,96(%rdi)
1371	leaq	128(%rdi),%rdi
1372
# Group four, offsets 384..511: ymm11, ymm9, ymm0, ymm4.
1373	vpxor	0(%rsi),%ymm11,%ymm11
1374	vpxor	32(%rsi),%ymm9,%ymm9
1375	vpxor	64(%rsi),%ymm0,%ymm0
1376	vpxor	96(%rsi),%ymm4,%ymm4
1377	leaq	128(%rsi),%rsi
1378	vmovdqu	%ymm11,0(%rdi)
1379	vmovdqu	%ymm9,32(%rdi)
1380	vmovdqu	%ymm0,64(%rdi)
1381	vmovdqu	%ymm4,96(%rdi)
1382	leaq	128(%rdi),%rdi
1383
1384	subq	$512,%rdx	# one full batch consumed; sets ZF when nothing is left
1385	jnz	L$oop_outer8x	# bytes remain: produce the next batch (label is above this view)
1386
1387	jmp	L$done8x	# length was an exact multiple of 512
1388
# Partial-batch dispatch, entered from the jb that follows cmpq $512,%rdx,
# so rdx is below 512 here. Thresholds are tested from largest to smallest;
# on this path a target L$N_or_more8x is therefore reached with
# N <= rdx < N+64 and with the flags of its own cmpq still in effect.
1389L$tail8x:
1390	cmpq	$448,%rdx
1391	jae	L$448_or_more8x
1392	cmpq	$384,%rdx
1393	jae	L$384_or_more8x
1394	cmpq	$320,%rdx
1395	jae	L$320_or_more8x
1396	cmpq	$256,%rdx
1397	jae	L$256_or_more8x
1398	cmpq	$192,%rdx
1399	jae	L$192_or_more8x
1400	cmpq	$128,%rdx
1401	jae	L$128_or_more8x
1402	cmpq	$64,%rdx
1403	jae	L$64_or_more8x
1404
# Fewer than 64 bytes remain, so nothing is written with whole vectors: the
# first 64-byte block (ymm6 then ymm8) is staged on the stack for the byte loop.
1405	xorq	%r10,%r10	# r10 = 0, the byte index used by L$oop_tail8x
1406	vmovdqa	%ymm6,0(%rsp)	# aligned store: relies on rsp being 32-byte aligned (set up above this view)
1407	vmovdqa	%ymm8,32(%rsp)
1408	jmp	L$oop_tail8x
1409
1410.p2align	5
# Reached from the L$tail8x dispatch with rdx in the range 64..127: XOR and
# store the first 64 bytes with whole vectors, then pass any remainder to the
# byte loop.
1411L$64_or_more8x:
1412	vpxor	0(%rsi),%ymm6,%ymm6
1413	vpxor	32(%rsi),%ymm8,%ymm8
1414	vmovdqu	%ymm6,0(%rdi)
1415	vmovdqu	%ymm8,32(%rdi)
1416	je	L$done8x	# ZF is still that of cmpq $64,%rdx (vpxor/vmovdqu do not write flags): exactly 64 bytes, finished
1417
1418	leaq	64(%rsi),%rsi	# step the source past the bytes just processed
1419	xorq	%r10,%r10	# r10 = 0, the byte index used by L$oop_tail8x
1420	vmovdqa	%ymm1,0(%rsp)	# stage the next 64-byte block (ymm1 then ymm5) at 0(%rsp)
1421	leaq	64(%rdi),%rdi	# step the destination likewise
1422	subq	$64,%rdx	# rdx = leftover byte count, 1..63 on the dispatch path
1423	vmovdqa	%ymm5,32(%rsp)
1424	jmp	L$oop_tail8x
1425
1426.p2align	5
# Reached from the L$tail8x dispatch with rdx in the range 128..191: XOR and
# store the first 128 bytes with whole vectors, then pass any remainder to
# the byte loop.
1427L$128_or_more8x:
1428	vpxor	0(%rsi),%ymm6,%ymm6
1429	vpxor	32(%rsi),%ymm8,%ymm8
1430	vpxor	64(%rsi),%ymm1,%ymm1
1431	vpxor	96(%rsi),%ymm5,%ymm5
1432	vmovdqu	%ymm6,0(%rdi)
1433	vmovdqu	%ymm8,32(%rdi)
1434	vmovdqu	%ymm1,64(%rdi)
1435	vmovdqu	%ymm5,96(%rdi)
1436	je	L$done8x	# ZF is still that of cmpq $128,%rdx (vpxor/vmovdqu do not write flags): exactly 128 bytes, finished
1437
1438	leaq	128(%rsi),%rsi	# step the source past the bytes just processed
1439	xorq	%r10,%r10	# r10 = 0, the byte index used by L$oop_tail8x
1440	vmovdqa	%ymm12,0(%rsp)	# stage the next 64-byte block (ymm12 then ymm13) at 0(%rsp)
1441	leaq	128(%rdi),%rdi	# step the destination likewise
1442	subq	$128,%rdx	# rdx = leftover byte count, 1..63 on the dispatch path
1443	vmovdqa	%ymm13,32(%rsp)
1444	jmp	L$oop_tail8x
1445
1446.p2align	5
# Reached from the L$tail8x dispatch with rdx in the range 192..255: XOR and
# store the first 192 bytes with whole vectors, then pass any remainder to
# the byte loop.
1447L$192_or_more8x:
1448	vpxor	0(%rsi),%ymm6,%ymm6
1449	vpxor	32(%rsi),%ymm8,%ymm8
1450	vpxor	64(%rsi),%ymm1,%ymm1
1451	vpxor	96(%rsi),%ymm5,%ymm5
1452	vpxor	128(%rsi),%ymm12,%ymm12
1453	vpxor	160(%rsi),%ymm13,%ymm13
1454	vmovdqu	%ymm6,0(%rdi)
1455	vmovdqu	%ymm8,32(%rdi)
1456	vmovdqu	%ymm1,64(%rdi)
1457	vmovdqu	%ymm5,96(%rdi)
1458	vmovdqu	%ymm12,128(%rdi)
1459	vmovdqu	%ymm13,160(%rdi)
1460	je	L$done8x	# ZF is still that of cmpq $192,%rdx (vpxor/vmovdqu do not write flags): exactly 192 bytes, finished
1461
1462	leaq	192(%rsi),%rsi	# step the source past the bytes just processed
1463	xorq	%r10,%r10	# r10 = 0, the byte index used by L$oop_tail8x
1464	vmovdqa	%ymm10,0(%rsp)	# stage the next 64-byte block (ymm10 then ymm15) at 0(%rsp)
1465	leaq	192(%rdi),%rdi	# step the destination likewise
1466	subq	$192,%rdx	# rdx = leftover byte count, 1..63 on the dispatch path
1467	vmovdqa	%ymm15,32(%rsp)
1468	jmp	L$oop_tail8x
1469
1470.p2align	5
# Reached from the L$tail8x dispatch with rdx in the range 256..319: XOR and
# store the first 256 bytes with whole vectors, then pass any remainder to
# the byte loop.
1471L$256_or_more8x:
1472	vpxor	0(%rsi),%ymm6,%ymm6
1473	vpxor	32(%rsi),%ymm8,%ymm8
1474	vpxor	64(%rsi),%ymm1,%ymm1
1475	vpxor	96(%rsi),%ymm5,%ymm5
1476	vpxor	128(%rsi),%ymm12,%ymm12
1477	vpxor	160(%rsi),%ymm13,%ymm13
1478	vpxor	192(%rsi),%ymm10,%ymm10
1479	vpxor	224(%rsi),%ymm15,%ymm15
1480	vmovdqu	%ymm6,0(%rdi)
1481	vmovdqu	%ymm8,32(%rdi)
1482	vmovdqu	%ymm1,64(%rdi)
1483	vmovdqu	%ymm5,96(%rdi)
1484	vmovdqu	%ymm12,128(%rdi)
1485	vmovdqu	%ymm13,160(%rdi)
1486	vmovdqu	%ymm10,192(%rdi)
1487	vmovdqu	%ymm15,224(%rdi)
1488	je	L$done8x	# ZF is still that of cmpq $256,%rdx (vpxor/vmovdqu do not write flags): exactly 256 bytes, finished
1489
1490	leaq	256(%rsi),%rsi	# step the source past the bytes just processed
1491	xorq	%r10,%r10	# r10 = 0, the byte index used by L$oop_tail8x
1492	vmovdqa	%ymm14,0(%rsp)	# stage the next 64-byte block (ymm14 then ymm2) at 0(%rsp)
1493	leaq	256(%rdi),%rdi	# step the destination likewise
1494	subq	$256,%rdx	# rdx = leftover byte count, 1..63 on the dispatch path
1495	vmovdqa	%ymm2,32(%rsp)
1496	jmp	L$oop_tail8x
1497
1498.p2align	5
# Reached from the L$tail8x dispatch with rdx in the range 320..383: XOR and
# store the first 320 bytes with whole vectors, then pass any remainder to
# the byte loop.
1499L$320_or_more8x:
1500	vpxor	0(%rsi),%ymm6,%ymm6
1501	vpxor	32(%rsi),%ymm8,%ymm8
1502	vpxor	64(%rsi),%ymm1,%ymm1
1503	vpxor	96(%rsi),%ymm5,%ymm5
1504	vpxor	128(%rsi),%ymm12,%ymm12
1505	vpxor	160(%rsi),%ymm13,%ymm13
1506	vpxor	192(%rsi),%ymm10,%ymm10
1507	vpxor	224(%rsi),%ymm15,%ymm15
1508	vpxor	256(%rsi),%ymm14,%ymm14
1509	vpxor	288(%rsi),%ymm2,%ymm2
1510	vmovdqu	%ymm6,0(%rdi)
1511	vmovdqu	%ymm8,32(%rdi)
1512	vmovdqu	%ymm1,64(%rdi)
1513	vmovdqu	%ymm5,96(%rdi)
1514	vmovdqu	%ymm12,128(%rdi)
1515	vmovdqu	%ymm13,160(%rdi)
1516	vmovdqu	%ymm10,192(%rdi)
1517	vmovdqu	%ymm15,224(%rdi)
1518	vmovdqu	%ymm14,256(%rdi)
1519	vmovdqu	%ymm2,288(%rdi)
1520	je	L$done8x	# ZF is still that of cmpq $320,%rdx (vpxor/vmovdqu do not write flags): exactly 320 bytes, finished
1521
1522	leaq	320(%rsi),%rsi	# step the source past the bytes just processed
1523	xorq	%r10,%r10	# r10 = 0, the byte index used by L$oop_tail8x
1524	vmovdqa	%ymm3,0(%rsp)	# stage the next 64-byte block (ymm3 then ymm7) at 0(%rsp)
1525	leaq	320(%rdi),%rdi	# step the destination likewise
1526	subq	$320,%rdx	# rdx = leftover byte count, 1..63 on the dispatch path
1527	vmovdqa	%ymm7,32(%rsp)
1528	jmp	L$oop_tail8x
1529
1530.p2align	5
# Reached from the L$tail8x dispatch with rdx in the range 384..447: XOR and
# store the first 384 bytes with whole vectors, then pass any remainder to
# the byte loop.
1531L$384_or_more8x:
1532	vpxor	0(%rsi),%ymm6,%ymm6
1533	vpxor	32(%rsi),%ymm8,%ymm8
1534	vpxor	64(%rsi),%ymm1,%ymm1
1535	vpxor	96(%rsi),%ymm5,%ymm5
1536	vpxor	128(%rsi),%ymm12,%ymm12
1537	vpxor	160(%rsi),%ymm13,%ymm13
1538	vpxor	192(%rsi),%ymm10,%ymm10
1539	vpxor	224(%rsi),%ymm15,%ymm15
1540	vpxor	256(%rsi),%ymm14,%ymm14
1541	vpxor	288(%rsi),%ymm2,%ymm2
1542	vpxor	320(%rsi),%ymm3,%ymm3
1543	vpxor	352(%rsi),%ymm7,%ymm7
1544	vmovdqu	%ymm6,0(%rdi)
1545	vmovdqu	%ymm8,32(%rdi)
1546	vmovdqu	%ymm1,64(%rdi)
1547	vmovdqu	%ymm5,96(%rdi)
1548	vmovdqu	%ymm12,128(%rdi)
1549	vmovdqu	%ymm13,160(%rdi)
1550	vmovdqu	%ymm10,192(%rdi)
1551	vmovdqu	%ymm15,224(%rdi)
1552	vmovdqu	%ymm14,256(%rdi)
1553	vmovdqu	%ymm2,288(%rdi)
1554	vmovdqu	%ymm3,320(%rdi)
1555	vmovdqu	%ymm7,352(%rdi)
1556	je	L$done8x	# ZF is still that of cmpq $384,%rdx (vpxor/vmovdqu do not write flags): exactly 384 bytes, finished
1557
1558	leaq	384(%rsi),%rsi	# step the source past the bytes just processed
1559	xorq	%r10,%r10	# r10 = 0, the byte index used by L$oop_tail8x
1560	vmovdqa	%ymm11,0(%rsp)	# stage the next 64-byte block (ymm11 then ymm9) at 0(%rsp)
1561	leaq	384(%rdi),%rdi	# step the destination likewise
1562	subq	$384,%rdx	# rdx = leftover byte count, 1..63 on the dispatch path
1563	vmovdqa	%ymm9,32(%rsp)
1564	jmp	L$oop_tail8x
1565
1566.p2align	5
# Reached from the L$tail8x dispatch with rdx in the range 448..511: XOR and
# store the first 448 bytes with whole vectors, then pass any remainder to
# the byte loop, which directly follows this block.
1567L$448_or_more8x:
1568	vpxor	0(%rsi),%ymm6,%ymm6
1569	vpxor	32(%rsi),%ymm8,%ymm8
1570	vpxor	64(%rsi),%ymm1,%ymm1
1571	vpxor	96(%rsi),%ymm5,%ymm5
1572	vpxor	128(%rsi),%ymm12,%ymm12
1573	vpxor	160(%rsi),%ymm13,%ymm13
1574	vpxor	192(%rsi),%ymm10,%ymm10
1575	vpxor	224(%rsi),%ymm15,%ymm15
1576	vpxor	256(%rsi),%ymm14,%ymm14
1577	vpxor	288(%rsi),%ymm2,%ymm2
1578	vpxor	320(%rsi),%ymm3,%ymm3
1579	vpxor	352(%rsi),%ymm7,%ymm7
1580	vpxor	384(%rsi),%ymm11,%ymm11
1581	vpxor	416(%rsi),%ymm9,%ymm9
1582	vmovdqu	%ymm6,0(%rdi)
1583	vmovdqu	%ymm8,32(%rdi)
1584	vmovdqu	%ymm1,64(%rdi)
1585	vmovdqu	%ymm5,96(%rdi)
1586	vmovdqu	%ymm12,128(%rdi)
1587	vmovdqu	%ymm13,160(%rdi)
1588	vmovdqu	%ymm10,192(%rdi)
1589	vmovdqu	%ymm15,224(%rdi)
1590	vmovdqu	%ymm14,256(%rdi)
1591	vmovdqu	%ymm2,288(%rdi)
1592	vmovdqu	%ymm3,320(%rdi)
1593	vmovdqu	%ymm7,352(%rdi)
1594	vmovdqu	%ymm11,384(%rdi)
1595	vmovdqu	%ymm9,416(%rdi)
1596	je	L$done8x	# ZF is still that of cmpq $448,%rdx (vpxor/vmovdqu do not write flags): exactly 448 bytes, finished
1597
1598	leaq	448(%rsi),%rsi	# step the source past the bytes just processed
1599	xorq	%r10,%r10	# r10 = 0, the byte index used by L$oop_tail8x
1600	vmovdqa	%ymm0,0(%rsp)	# stage the last 64-byte block (ymm0 then ymm4) at 0(%rsp)
1601	leaq	448(%rdi),%rdi	# step the destination likewise
1602	subq	$448,%rdx	# rdx = leftover byte count, 1..63 on the dispatch path
1603	vmovdqa	%ymm4,32(%rsp)	# no jmp after this store: execution falls through into L$oop_tail8x
1604
# Byte-at-a-time tail. Each pass XORs one byte read from (%rsi) with the byte
# at the same index in the block staged at 0(%rsp), then stores the result
# to (%rdi). Every entry path in this section zeroes r10 and stores two ymm
# registers at 0(%rsp) and 32(%rsp) beforehand. The loop body executes rdx
# times; rdx is decremented before it is tested, so a nonzero count is
# expected on entry.
1605L$oop_tail8x:
1606	movzbl	(%rsi,%r10,1),%eax	# eax = source byte at index r10
1607	movzbl	(%rsp,%r10,1),%ecx	# ecx = staged byte at the same index
1608	leaq	1(%r10),%r10	# index++ (lea leaves the flags alone)
1609	xorl	%ecx,%eax
1610	movb	%al,-1(%rdi,%r10,1)	# store at the pre-increment index, hence the -1 displacement
1611	decq	%rdx	# one byte fewer to go; sets ZF at zero
1612	jnz	L$oop_tail8x
1613
# Common exit of the 8x path: clear the vector registers, restore the stack
# pointer and return.
# NOTE(review): nothing between the byte loop and this return overwrites the
# block staged at 0(%rsp)..63(%rsp); confirm whether leaving it on the stack
# is acceptable.
1614L$done8x:
1615	vzeroall	# zero every ymm register
1616	leaq	(%r9),%rsp	# rsp = r9; r9 is set above this view, presumably the stack pointer saved at entry (TODO confirm)
1617
1618L$8x_epilogue:
1619	.byte	0xf3,0xc3	# raw encoding of rep ret
1620
1621
1622#endif
1623