1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11#if defined(BORINGSSL_PREFIX)
12#include <boringssl_prefix_symbols_asm.h>
13#endif
14.text
15
16
17
// ---------------------------------------------------------------------------
// ChaCha20 constant pool (read-only data).
//
// The pool starts on a 64-byte boundary, so every 16-byte table that the code
// loads with movdqa/paddd sits at a 16-byte-aligned address.
// ---------------------------------------------------------------------------
.p2align	6
// Four zero dwords (not referenced in the part of the file reviewed here).
L$zero:
.long	0,0,0,0
// Counter step for the one-block-at-a-time paths: +1 on dword 0 only.
L$one:
.long	1,0,0,0
// Per-lane offsets added to the broadcast block counter in the 4x path,
// giving the four lanes counters n, n+1, n+2, n+3.
L$inc:
.long	0,1,2,3
// Counter step applied to all four lanes between 4-block batches.
L$four:
.long	4,4,4,4
// Per-lane counter offsets added in the 8x (ymm) path.
L$incy:
.long	0,2,4,6,1,3,5,7
// Counter step applied to all eight lanes between 8-block batches.
L$eight:
.long	8,8,8,8,8,8,8,8
// pshufb mask: within each dword, byte i takes source byte (i+2) mod 4,
// i.e. a 32-bit rotate left by 16.
L$rot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
// pshufb mask: within each dword, byte i takes source byte (i+3) mod 4,
// i.e. a 32-bit rotate right by 24 (= rotate left by 8).
L$rot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
// First row of the ChaCha20 state: 16 ASCII bytes followed by a NUL
// (17 bytes in total, identical to the previous decimal .byte list).
// The vector paths load the first 16 bytes as four little-endian dwords.
L$sigma:
.asciz	"expand 32-byte k"
.p2align	6
// 64-byte tables below are not referenced in the part of the file reviewed
// here; presumably they serve a wider (16-lane) code path -- confirm.
L$zeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
L$fourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
L$incz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
L$sixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
// NUL-terminated identification string embedded in the object
// (55 bytes, identical to the previous decimal .byte list).
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
// ---------------------------------------------------------------------------
// _ChaCha20_ctr32 -- XOR a buffer with the ChaCha20 keystream.
//
// Register arguments as consumed by the code:
//   rdi = output pointer (written)
//   rsi = input pointer (read)
//   rdx = length in bytes; zero returns immediately
//   rcx = pointer to 32 bytes of key (state words 4..11)
//   r8  = pointer to 16 bytes holding state words 12..15; dword 0 is the
//         block counter and is incremented by one per 64-byte block
// This presumably corresponds to the C prototype
//   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
//                       const uint32_t key[8], const uint32_t counter[4]);
// -- confirm against the header.
//
// Dispatch: if bit 9 of the capability word at _OPENSSL_ia32cap_P+4 is set,
// control transfers to the SSSE3 entry before anything is pushed. Otherwise
// the integer-register implementation below runs.
//
// Scalar path stack frame (rsp is 16-byte aligned after the prologue):
//    0..63(%rsp)  keystream scratch; 16..63 also hold the input state rows:
//                 16 = words 4..7, 32 = words 8..11 (working copy during the
//                 rounds), 48 = words 12..15
//   64..87(%rsp)  saved remaining length, input pointer, output pointer
// Saves and restores rbx, rbp, r12-r15.
// ---------------------------------------------------------------------------
.globl	_ChaCha20_ctr32
.private_extern _ChaCha20_ctr32

.p2align	6
_ChaCha20_ctr32:

	testq	%rdx,%rdx
	je	L$no_data
	// r10 = 64 bits of capability flags; the 4x path reuses r10 later.
	movq	_OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	L$ChaCha20_ssse3

	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

	subq	$64+24,%rsp

L$ctr32_body:


	// xmm1 = key words 0..3, xmm2 = key words 4..7, xmm3 = counter row,
	// xmm4 = (1,0,0,0) counter increment.
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$one(%rip),%xmm4


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	// rbp = bytes remaining.
	movq	%rdx,%rbp
	jmp	L$oop_outer

// One iteration per 64-byte block. Working state during the rounds:
//   x0..x3   = eax, ebx, ecx, edx   (the constant row, loaded as immediates)
//   x4..x7   = r8d, r9d, r10d, r11d
//   x8..x11  = two at a time in esi/edi, the other pair parked at 32..47(%rsp)
//   x12..x15 = r12d, r13d, r14d, r15d
.p2align	5
L$oop_outer:
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	// The current block counter lives in xmm3, not in memory.
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	// Spill length and buffer pointers; rbp, rsi and rdi are reused below.
	movq	%rbp,64+0(%rsp)
	// 10 double rounds = 20 rounds.
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
	// rsi = x9:x8 from the low half of xmm2 (previously hand-encoded as
	// .byte 102,72,15,126,214); then esi = x8, edi = x9.
	movq	%xmm2,%rsi
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	L$oop

.p2align	5
L$oop:
	// Column round, quarter-rounds (x0,x4,x8,x12) and (x1,x5,x9,x13),
	// interleaved. Each quarter-round is add/xor/rotate by 16, 12, 8, 7.
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	// Park x8,x9 and bring x10,x11 into esi/edi.
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	// Column round, quarter-rounds (x2,x6,x10,x14) and (x3,x7,x11,x15).
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	// Diagonal round, quarter-rounds (x0,x5,x10,x15) and (x1,x6,x11,x12);
	// esi/edi still hold x10,x11.
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	// Park x10,x11 and bring x8,x9 back into esi/edi.
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	// Diagonal round, quarter-rounds (x2,x7,x8,x13) and (x3,x4,x9,x14).
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	L$oop
	// Flush x8,x9 so 32..47(%rsp) holds the complete worked row x8..x11,
	// restore length and pointers, and advance the counter held in xmm3.
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	// Feed-forward: add the input state to the worked state.
	// 48(%rsp) still holds the counter of the block being produced; it is
	// only refreshed from xmm3 after the full-block XOR below.
	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	// xmm1 = original words 8..11 + worked words 8..11.
	paddd	32(%rsp),%xmm1

	// rbp is non-zero here; fewer than 64 bytes left goes to the byte loop.
	cmpq	$64,%rbp
	jb	L$tail

	// Full block: XOR 64 input bytes with the keystream.
	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	// Re-arm the stack copy for the next block: original words 8..11 and
	// the incremented counter.
	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	L$oop_outer

	jmp	L$done

// Partial final block (1..63 bytes): spill the whole keystream block to
// 0..63(%rsp), overwriting the saved state (no further block follows), then
// XOR byte by byte. rbx becomes the byte index once ebx has been stored.
.p2align	4
L$tail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

L$oop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	// rbx was already incremented, hence the -1 displacement.
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	L$oop_tail

// Epilogue: rsi = stack pointer value at function entry; reload the six
// saved registers from just below it and drop the frame.
L$done:
	leaq	64+24+48(%rsp),%rsi
	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$no_data:
	// Bytes f3 c3 encode "rep ret".
	.byte	0xf3,0xc3
328
329
330
// ---------------------------------------------------------------------------
// ChaCha20_ssse3 -- one 64-byte block per iteration, state held in xmm0-xmm3.
//
// Reached by a jump from _ChaCha20_ctr32 with the argument registers intact
// (rdi = out, rsi = in, rdx = length > 0, rcx = key, r8 = counter row) and
// r10 still holding the capability flags. L$do_sse3_after_all is also entered
// from the 4x path, which has already set r9 = entry rsp.
//
// More than 128 bytes is handed to the 4x path. Otherwise:
//   r9           = stack pointer at entry, restored before returning
//   0..63(%rsp)  = input state rows 0..3 (reused as keystream scratch by the
//                  tail loop)
//   xmm6 / xmm7  = pshufb masks for rotate-left-16 / rotate-left-8
//   r8           = double-round counter (the pointer in r8 is dead by then)
// Only caller-saved registers are modified.
// ---------------------------------------------------------------------------
.p2align	5
ChaCha20_ssse3:
L$ChaCha20_ssse3:

	movq	%rsp,%r9

	cmpq	$128,%rdx
	ja	L$ChaCha20_4x

L$do_sse3_after_all:
	// 64 bytes of state plus 8 bytes so that rsp becomes 16-byte aligned.
	subq	$64+8,%rsp
	movdqa	L$sigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	L$rot16(%rip),%xmm6
	movdqa	L$rot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	L$oop_ssse3

// Start of every block after the first: reload rows 0..2, bump the block
// counter (dword 0 of row 3) by one and store the updated row back.
.p2align	5
L$oop_outer_ssse3:
	movdqa	L$one(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	L$oop_ssse3

// One double round per iteration. Rows: a = xmm0, b = xmm1, c = xmm2,
// d = xmm3; xmm4 is scratch for the shift-based rotates.
.p2align	5
L$oop_ssse3:
	// Column round: four quarter-rounds in parallel, one per dword lane.
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	// d = rotl32(d, 16); was .byte 102,15,56,0,222
	pshufb	%xmm6,%xmm3
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	// b = rotl32(b, 12)
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	// d = rotl32(d, 8); was .byte 102,15,56,0,223
	pshufb	%xmm7,%xmm3
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	// b = rotl32(b, 7)
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	// Rotate rows b, c, d by 1, 2, 3 lanes so the diagonals line up as
	// columns.
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	// Diagonal round: same quarter-round on the rotated rows.
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	// d = rotl32(d, 16); was .byte 102,15,56,0,222
	pshufb	%xmm6,%xmm3
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	// d = rotl32(d, 8); was .byte 102,15,56,0,223
	pshufb	%xmm7,%xmm3
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	// Undo the lane rotation (b by 3, c by 2, d by 1).
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	L$oop_ssse3
	// Feed-forward: add the input state saved on the stack.
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	// rdx is non-zero here; fewer than 64 bytes left goes to the byte loop.
	cmpq	$64,%rdx
	jb	L$tail_ssse3

	// Full block: XOR 64 bytes of input with the keystream.
	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	L$oop_outer_ssse3

	jmp	L$done_ssse3

// Partial final block (1..63 bytes): spill the keystream over the saved
// state (no further block follows) and XOR byte by byte; r8 is the index.
.p2align	4
L$tail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

L$oop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	// r8 was already incremented, hence the -1 displacement.
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	L$oop_tail_ssse3

L$done_ssse3:
	// Drop the frame by restoring the entry stack pointer.
	leaq	(%r9),%rsp

L$ssse3_epilogue:
	// Bytes f3 c3 encode "rep ret".
	.byte	0xf3,0xc3
465
466
467
// ---------------------------------------------------------------------------
// ChaCha20_4x -- four 64-byte blocks per iteration, one block per dword lane.
//
// Reached by a jump from the SSSE3 entry when more than 128 bytes are
// requested. Inputs: rdi = out, rsi = in, rdx = length, rcx = key,
// r8 = counter row, r10 = the 64 bits of capability flags loaded in
// _ChaCha20_ctr32 from _OPENSSL_ia32cap_P+4.
//
// Dispatch:
//   * bit 5 of the upper capability dword set -> 8x path
//   * length <= 192 and, of capability bits 22 and 26 in the lower dword,
//     only bit 22 set -> back to the one-block SSSE3 loop
//     NOTE(review): presumably a heuristic for a CPU family where the 4x
//     code does not pay off for short inputs -- confirm against the Perl
//     source.
//
// Stack frame (rsp 16-byte aligned, rcx = rsp+256 used as a second base):
//     0.. 63(%rsp)  scratch: parked c rows during the rounds, then
//                   transposed keystream pieces
//    64..127(%rsp)  state words 0..3, each broadcast to all four lanes
//   128..191        state words 4..7 broadcast
//   192..255        state words 8..11 broadcast
//   256..271        four per-lane block counters (n, n+1, n+2, n+3)
//   272..319        state words 13..15 broadcast
// r9 = stack pointer at entry, restored before returning.
// r10/r11 = addresses of the rot16/rot24 pshufb masks during the rounds.
// Only caller-saved registers are modified.
// ---------------------------------------------------------------------------
.p2align	5
ChaCha20_4x:
L$ChaCha20_4x:

	movq	%rsp,%r9

	movq	%r10,%r11
	shrq	$32,%r10
	testq	$32,%r10
	jnz	L$ChaCha20_8x
	cmpq	$192,%rdx
	ja	L$proceed4x

	// 71303168 = (1<<26)|(1<<22); 4194304 = 1<<22.
	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	L$do_sse3_after_all

L$proceed4x:
	// 0x140 bytes of frame plus 8 so that rsp becomes 16-byte aligned.
	subq	$0x140+8,%rsp
	movdqa	L$sigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	L$rot16(%rip),%r10
	leaq	L$rot24(%rip),%r11

	// Broadcast each state word across a register and save it in the frame.
	// Words 0..3 -> xmm8..xmm11.
	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	// Words 4..7 -> xmm12..xmm15.
	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	// Words 8..11 -> xmm4..xmm7.
	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	// Words 12..15 -> xmm0..xmm3. The counter lanes get offsets 0,1,2,3 and
	// are stored at L$oop_enter4x.
	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	L$inc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	L$oop_enter4x

// Start of every batch after the first: reload the broadcast state and
// advance all four lane counters by 4.
.p2align	5
L$oop_outer4x:
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	L$four(%rip),%xmm0

L$oop_enter4x:
	// Park words 10,11 so xmm6/xmm7 are free as temporaries and masks;
	// xmm7 = rot16 mask; save the counters of this batch.
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	L$oop4x

// One double round per iteration. Register roles:
//   a (words 0..3)   = xmm8..xmm11
//   b (words 4..7)   = xmm12..xmm15
//   c (words 8..11)  = two at a time in xmm4/xmm5, the other pair parked at
//                      0..31(%rsp) or 32..63(%rsp)
//   d (words 12..15) = xmm0..xmm3
//   xmm6/xmm7        = rotate temporaries, reloaded with the rot24/rot16
//                      masks from (%r11)/(%r10) as needed
// The pshufb instructions were previously hand-encoded as
// .byte 102,15,56,0,<modrm>.
.p2align	5
L$oop4x:
	// Column round, quarter-rounds (0,4,8,12) and (1,5,9,13).
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
	pshufb	%xmm7,%xmm0
	pshufb	%xmm7,%xmm1
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
	pshufb	%xmm6,%xmm0
	pshufb	%xmm6,%xmm1
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	// Park words 8,9; bring words 10,11 into xmm4/xmm5.
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	// Column round, quarter-rounds (2,6,10,14) and (3,7,11,15).
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pshufb	%xmm7,%xmm2
	pshufb	%xmm7,%xmm3
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pshufb	%xmm6,%xmm2
	pshufb	%xmm6,%xmm3
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	// Diagonal round, quarter-rounds (0,5,10,15) and (1,6,11,12);
	// xmm4/xmm5 still hold words 10,11.
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
	pshufb	%xmm7,%xmm3
	pshufb	%xmm7,%xmm0
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
	pshufb	%xmm6,%xmm3
	pshufb	%xmm6,%xmm0
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	// Park words 10,11; bring words 8,9 back into xmm4/xmm5.
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	// Diagonal round, quarter-rounds (2,7,8,13) and (3,4,9,14).
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
	pshufb	%xmm7,%xmm1
	pshufb	%xmm7,%xmm2
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
	pshufb	%xmm6,%xmm1
	pshufb	%xmm6,%xmm2
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	L$oop4x

	// Feed-forward and 4x4 dword transposes. Each group of four registers
	// holds one state word per register with one block per lane; after the
	// transpose each register holds four consecutive words of one block.
	// Rows a (words 0..3):
	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	// Rows b (words 4..7):
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	// Spill words 0..3 of blocks 0 and 1, then fetch the parked words
	// 10,11 into xmm8/xmm9.
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	// Rows c (words 8..11):
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	// Spill words 0..3 of blocks 2 and 3.
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	// Rows d (words 12..15); 256-256(%rcx) holds the counters of this
	// batch.
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	// Keystream is now laid out as:
	//   block 0 = 0(%rsp),  xmm12, xmm4,  xmm0
	//   block 1 = 16(%rsp), xmm13, xmm5,  xmm1
	//   block 2 = 32(%rsp), xmm10, xmm14, xmm8
	//   block 3 = 48(%rsp), xmm15, xmm9,  xmm3
	cmpq	$256,%rdx
	jb	L$tail4x

	// Full batch: XOR 256 bytes. xmm6, xmm11, xmm2, xmm7 carry the data;
	// loads of the next block are interleaved with stores of the current
	// one.
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	L$oop_outer4x

	jmp	L$done4x

// Fewer than 256 bytes remain (rdx is non-zero). Process as many whole
// 64-byte blocks as fit, then stage the next keystream block at 0..63(%rsp)
// for the byte loop. In each N_or_more case the final cmpq against N leaves
// ZF set when exactly N bytes remain; the SSE moves in between do not modify
// flags, so the je after the stores tests that compare.
L$tail4x:
	cmpq	$192,%rdx
	jae	L$192_or_more4x
	cmpq	$128,%rdx
	jae	L$128_or_more4x
	cmpq	$64,%rdx
	jae	L$64_or_more4x


	// Under 64 bytes: 0(%rsp) already holds words 0..3 of block 0; add
	// the rest of block 0. r10 becomes the byte index.
	xorq	%r10,%r10

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	L$oop_tail4x

.p2align	5
L$64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	L$done4x

	// Stage block 1 for the byte loop and skip the 64 bytes just done.
	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	L$oop_tail4x

.p2align	5
L$128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	L$done4x

	// Stage block 2 for the byte loop and skip the 128 bytes just done.
	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	L$oop_tail4x

.p2align	5
L$192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	L$done4x

	// Stage block 3 for the byte loop. rsi and rdi were already advanced
	// by 128 above, so another 64 skips the 192 bytes just done.
	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

// XOR the remaining 1..63 bytes against the staged keystream block.
L$oop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	// r10 was already incremented, hence the -1 displacement.
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	L$oop_tail4x

L$done4x:
	// Drop the frame by restoring the entry stack pointer.
	leaq	(%r9),%rsp

L$4x_epilogue:
	// Bytes f3 c3 encode "rep ret".
	.byte	0xf3,0xc3
1017
1018
1019
1020.p2align	5
1021ChaCha20_8x:
1022L$ChaCha20_8x:
1023
1024	movq	%rsp,%r9
1025
1026	subq	$0x280+8,%rsp
1027	andq	$-32,%rsp
1028	vzeroupper
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039	vbroadcasti128	L$sigma(%rip),%ymm11
1040	vbroadcasti128	(%rcx),%ymm3
1041	vbroadcasti128	16(%rcx),%ymm15
1042	vbroadcasti128	(%r8),%ymm7
1043	leaq	256(%rsp),%rcx
1044	leaq	512(%rsp),%rax
1045	leaq	L$rot16(%rip),%r10
1046	leaq	L$rot24(%rip),%r11
1047
1048	vpshufd	$0x00,%ymm11,%ymm8
1049	vpshufd	$0x55,%ymm11,%ymm9
1050	vmovdqa	%ymm8,128-256(%rcx)
1051	vpshufd	$0xaa,%ymm11,%ymm10
1052	vmovdqa	%ymm9,160-256(%rcx)
1053	vpshufd	$0xff,%ymm11,%ymm11
1054	vmovdqa	%ymm10,192-256(%rcx)
1055	vmovdqa	%ymm11,224-256(%rcx)
1056
1057	vpshufd	$0x00,%ymm3,%ymm0
1058	vpshufd	$0x55,%ymm3,%ymm1
1059	vmovdqa	%ymm0,256-256(%rcx)
1060	vpshufd	$0xaa,%ymm3,%ymm2
1061	vmovdqa	%ymm1,288-256(%rcx)
1062	vpshufd	$0xff,%ymm3,%ymm3
1063	vmovdqa	%ymm2,320-256(%rcx)
1064	vmovdqa	%ymm3,352-256(%rcx)
1065
1066	vpshufd	$0x00,%ymm15,%ymm12
1067	vpshufd	$0x55,%ymm15,%ymm13
1068	vmovdqa	%ymm12,384-512(%rax)
1069	vpshufd	$0xaa,%ymm15,%ymm14
1070	vmovdqa	%ymm13,416-512(%rax)
1071	vpshufd	$0xff,%ymm15,%ymm15
1072	vmovdqa	%ymm14,448-512(%rax)
1073	vmovdqa	%ymm15,480-512(%rax)
1074
1075	vpshufd	$0x00,%ymm7,%ymm4
1076	vpshufd	$0x55,%ymm7,%ymm5
1077	vpaddd	L$incy(%rip),%ymm4,%ymm4
1078	vpshufd	$0xaa,%ymm7,%ymm6
1079	vmovdqa	%ymm5,544-512(%rax)
1080	vpshufd	$0xff,%ymm7,%ymm7
1081	vmovdqa	%ymm6,576-512(%rax)
1082	vmovdqa	%ymm7,608-512(%rax)
1083
1084	jmp	L$oop_enter8x
1085
1086.p2align	5
1087L$oop_outer8x:
1088	vmovdqa	128-256(%rcx),%ymm8
1089	vmovdqa	160-256(%rcx),%ymm9
1090	vmovdqa	192-256(%rcx),%ymm10
1091	vmovdqa	224-256(%rcx),%ymm11
1092	vmovdqa	256-256(%rcx),%ymm0
1093	vmovdqa	288-256(%rcx),%ymm1
1094	vmovdqa	320-256(%rcx),%ymm2
1095	vmovdqa	352-256(%rcx),%ymm3
1096	vmovdqa	384-512(%rax),%ymm12
1097	vmovdqa	416-512(%rax),%ymm13
1098	vmovdqa	448-512(%rax),%ymm14
1099	vmovdqa	480-512(%rax),%ymm15
1100	vmovdqa	512-512(%rax),%ymm4
1101	vmovdqa	544-512(%rax),%ymm5
1102	vmovdqa	576-512(%rax),%ymm6
1103	vmovdqa	608-512(%rax),%ymm7
1104	vpaddd	L$eight(%rip),%ymm4,%ymm4
1105
1106L$oop_enter8x:
1107	vmovdqa	%ymm14,64(%rsp)
1108	vmovdqa	%ymm15,96(%rsp)
1109	vbroadcasti128	(%r10),%ymm15
1110	vmovdqa	%ymm4,512-512(%rax)
1111	movl	$10,%eax
1112	jmp	L$oop8x
1113
1114.p2align	5
1115L$oop8x:
1116	vpaddd	%ymm0,%ymm8,%ymm8
1117	vpxor	%ymm4,%ymm8,%ymm4
1118	vpshufb	%ymm15,%ymm4,%ymm4
1119	vpaddd	%ymm1,%ymm9,%ymm9
1120	vpxor	%ymm5,%ymm9,%ymm5
1121	vpshufb	%ymm15,%ymm5,%ymm5
1122	vpaddd	%ymm4,%ymm12,%ymm12
1123	vpxor	%ymm0,%ymm12,%ymm0
1124	vpslld	$12,%ymm0,%ymm14
1125	vpsrld	$20,%ymm0,%ymm0
1126	vpor	%ymm0,%ymm14,%ymm0
1127	vbroadcasti128	(%r11),%ymm14
1128	vpaddd	%ymm5,%ymm13,%ymm13
1129	vpxor	%ymm1,%ymm13,%ymm1
1130	vpslld	$12,%ymm1,%ymm15
1131	vpsrld	$20,%ymm1,%ymm1
1132	vpor	%ymm1,%ymm15,%ymm1
1133	vpaddd	%ymm0,%ymm8,%ymm8
1134	vpxor	%ymm4,%ymm8,%ymm4
1135	vpshufb	%ymm14,%ymm4,%ymm4
1136	vpaddd	%ymm1,%ymm9,%ymm9
1137	vpxor	%ymm5,%ymm9,%ymm5
1138	vpshufb	%ymm14,%ymm5,%ymm5
1139	vpaddd	%ymm4,%ymm12,%ymm12
1140	vpxor	%ymm0,%ymm12,%ymm0
1141	vpslld	$7,%ymm0,%ymm15
1142	vpsrld	$25,%ymm0,%ymm0
1143	vpor	%ymm0,%ymm15,%ymm0
1144	vbroadcasti128	(%r10),%ymm15
1145	vpaddd	%ymm5,%ymm13,%ymm13
1146	vpxor	%ymm1,%ymm13,%ymm1
1147	vpslld	$7,%ymm1,%ymm14
1148	vpsrld	$25,%ymm1,%ymm1
1149	vpor	%ymm1,%ymm14,%ymm1
1150	vmovdqa	%ymm12,0(%rsp)
1151	vmovdqa	%ymm13,32(%rsp)
1152	vmovdqa	64(%rsp),%ymm12
1153	vmovdqa	96(%rsp),%ymm13
1154	vpaddd	%ymm2,%ymm10,%ymm10
1155	vpxor	%ymm6,%ymm10,%ymm6
1156	vpshufb	%ymm15,%ymm6,%ymm6
1157	vpaddd	%ymm3,%ymm11,%ymm11
1158	vpxor	%ymm7,%ymm11,%ymm7
1159	vpshufb	%ymm15,%ymm7,%ymm7
1160	vpaddd	%ymm6,%ymm12,%ymm12
1161	vpxor	%ymm2,%ymm12,%ymm2
1162	vpslld	$12,%ymm2,%ymm14
1163	vpsrld	$20,%ymm2,%ymm2
1164	vpor	%ymm2,%ymm14,%ymm2
1165	vbroadcasti128	(%r11),%ymm14
1166	vpaddd	%ymm7,%ymm13,%ymm13
1167	vpxor	%ymm3,%ymm13,%ymm3
1168	vpslld	$12,%ymm3,%ymm15
1169	vpsrld	$20,%ymm3,%ymm3
1170	vpor	%ymm3,%ymm15,%ymm3
1171	vpaddd	%ymm2,%ymm10,%ymm10
1172	vpxor	%ymm6,%ymm10,%ymm6
1173	vpshufb	%ymm14,%ymm6,%ymm6
1174	vpaddd	%ymm3,%ymm11,%ymm11
1175	vpxor	%ymm7,%ymm11,%ymm7
1176	vpshufb	%ymm14,%ymm7,%ymm7
1177	vpaddd	%ymm6,%ymm12,%ymm12
1178	vpxor	%ymm2,%ymm12,%ymm2
1179	vpslld	$7,%ymm2,%ymm15
1180	vpsrld	$25,%ymm2,%ymm2
1181	vpor	%ymm2,%ymm15,%ymm2
1182	vbroadcasti128	(%r10),%ymm15
1183	vpaddd	%ymm7,%ymm13,%ymm13
1184	vpxor	%ymm3,%ymm13,%ymm3
1185	vpslld	$7,%ymm3,%ymm14
1186	vpsrld	$25,%ymm3,%ymm3
1187	vpor	%ymm3,%ymm14,%ymm3
1188	vpaddd	%ymm1,%ymm8,%ymm8
1189	vpxor	%ymm7,%ymm8,%ymm7
1190	vpshufb	%ymm15,%ymm7,%ymm7
1191	vpaddd	%ymm2,%ymm9,%ymm9
1192	vpxor	%ymm4,%ymm9,%ymm4
1193	vpshufb	%ymm15,%ymm4,%ymm4
1194	vpaddd	%ymm7,%ymm12,%ymm12
1195	vpxor	%ymm1,%ymm12,%ymm1
1196	vpslld	$12,%ymm1,%ymm14
1197	vpsrld	$20,%ymm1,%ymm1
1198	vpor	%ymm1,%ymm14,%ymm1
1199	vbroadcasti128	(%r11),%ymm14
1200	vpaddd	%ymm4,%ymm13,%ymm13
1201	vpxor	%ymm2,%ymm13,%ymm2
1202	vpslld	$12,%ymm2,%ymm15
1203	vpsrld	$20,%ymm2,%ymm2
1204	vpor	%ymm2,%ymm15,%ymm2
1205	vpaddd	%ymm1,%ymm8,%ymm8
1206	vpxor	%ymm7,%ymm8,%ymm7
1207	vpshufb	%ymm14,%ymm7,%ymm7
1208	vpaddd	%ymm2,%ymm9,%ymm9
1209	vpxor	%ymm4,%ymm9,%ymm4
1210	vpshufb	%ymm14,%ymm4,%ymm4
1211	vpaddd	%ymm7,%ymm12,%ymm12
1212	vpxor	%ymm1,%ymm12,%ymm1
1213	vpslld	$7,%ymm1,%ymm15
1214	vpsrld	$25,%ymm1,%ymm1
1215	vpor	%ymm1,%ymm15,%ymm1
1216	vbroadcasti128	(%r10),%ymm15
1217	vpaddd	%ymm4,%ymm13,%ymm13
1218	vpxor	%ymm2,%ymm13,%ymm2
1219	vpslld	$7,%ymm2,%ymm14
1220	vpsrld	$25,%ymm2,%ymm2
1221	vpor	%ymm2,%ymm14,%ymm2
1222	vmovdqa	%ymm12,64(%rsp)
1223	vmovdqa	%ymm13,96(%rsp)
1224	vmovdqa	0(%rsp),%ymm12
1225	vmovdqa	32(%rsp),%ymm13
1226	vpaddd	%ymm3,%ymm10,%ymm10
1227	vpxor	%ymm5,%ymm10,%ymm5
1228	vpshufb	%ymm15,%ymm5,%ymm5
1229	vpaddd	%ymm0,%ymm11,%ymm11
1230	vpxor	%ymm6,%ymm11,%ymm6
1231	vpshufb	%ymm15,%ymm6,%ymm6
1232	vpaddd	%ymm5,%ymm12,%ymm12
1233	vpxor	%ymm3,%ymm12,%ymm3
1234	vpslld	$12,%ymm3,%ymm14
1235	vpsrld	$20,%ymm3,%ymm3
1236	vpor	%ymm3,%ymm14,%ymm3
1237	vbroadcasti128	(%r11),%ymm14
1238	vpaddd	%ymm6,%ymm13,%ymm13
1239	vpxor	%ymm0,%ymm13,%ymm0
1240	vpslld	$12,%ymm0,%ymm15
1241	vpsrld	$20,%ymm0,%ymm0
1242	vpor	%ymm0,%ymm15,%ymm0
1243	vpaddd	%ymm3,%ymm10,%ymm10
1244	vpxor	%ymm5,%ymm10,%ymm5
1245	vpshufb	%ymm14,%ymm5,%ymm5
1246	vpaddd	%ymm0,%ymm11,%ymm11
1247	vpxor	%ymm6,%ymm11,%ymm6
1248	vpshufb	%ymm14,%ymm6,%ymm6
1249	vpaddd	%ymm5,%ymm12,%ymm12
1250	vpxor	%ymm3,%ymm12,%ymm3
1251	vpslld	$7,%ymm3,%ymm15
1252	vpsrld	$25,%ymm3,%ymm3
1253	vpor	%ymm3,%ymm15,%ymm3
1254	vbroadcasti128	(%r10),%ymm15
1255	vpaddd	%ymm6,%ymm13,%ymm13
1256	vpxor	%ymm0,%ymm13,%ymm0
1257	vpslld	$7,%ymm0,%ymm14
1258	vpsrld	$25,%ymm0,%ymm0
1259	vpor	%ymm0,%ymm14,%ymm0
1260	decl	%eax
1261	jnz	L$oop8x
1262
1263	leaq	512(%rsp),%rax
1264	vpaddd	128-256(%rcx),%ymm8,%ymm8
1265	vpaddd	160-256(%rcx),%ymm9,%ymm9
1266	vpaddd	192-256(%rcx),%ymm10,%ymm10
1267	vpaddd	224-256(%rcx),%ymm11,%ymm11
1268
1269	vpunpckldq	%ymm9,%ymm8,%ymm14
1270	vpunpckldq	%ymm11,%ymm10,%ymm15
1271	vpunpckhdq	%ymm9,%ymm8,%ymm8
1272	vpunpckhdq	%ymm11,%ymm10,%ymm10
1273	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1274	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1275	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1276	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1277	vpaddd	256-256(%rcx),%ymm0,%ymm0
1278	vpaddd	288-256(%rcx),%ymm1,%ymm1
1279	vpaddd	320-256(%rcx),%ymm2,%ymm2
1280	vpaddd	352-256(%rcx),%ymm3,%ymm3
1281
1282	vpunpckldq	%ymm1,%ymm0,%ymm10
1283	vpunpckldq	%ymm3,%ymm2,%ymm15
1284	vpunpckhdq	%ymm1,%ymm0,%ymm0
1285	vpunpckhdq	%ymm3,%ymm2,%ymm2
1286	vpunpcklqdq	%ymm15,%ymm10,%ymm1
1287	vpunpckhqdq	%ymm15,%ymm10,%ymm10
1288	vpunpcklqdq	%ymm2,%ymm0,%ymm3
1289	vpunpckhqdq	%ymm2,%ymm0,%ymm0
1290	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
1291	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
1292	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
1293	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
1294	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
1295	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
1296	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
1297	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
1298	vmovdqa	%ymm15,0(%rsp)
1299	vmovdqa	%ymm9,32(%rsp)
1300	vmovdqa	64(%rsp),%ymm15
1301	vmovdqa	96(%rsp),%ymm9
1302
1303	vpaddd	384-512(%rax),%ymm12,%ymm12
1304	vpaddd	416-512(%rax),%ymm13,%ymm13
1305	vpaddd	448-512(%rax),%ymm15,%ymm15
1306	vpaddd	480-512(%rax),%ymm9,%ymm9
1307
1308	vpunpckldq	%ymm13,%ymm12,%ymm2
1309	vpunpckldq	%ymm9,%ymm15,%ymm8
1310	vpunpckhdq	%ymm13,%ymm12,%ymm12
1311	vpunpckhdq	%ymm9,%ymm15,%ymm15
1312	vpunpcklqdq	%ymm8,%ymm2,%ymm13
1313	vpunpckhqdq	%ymm8,%ymm2,%ymm2
1314	vpunpcklqdq	%ymm15,%ymm12,%ymm9
1315	vpunpckhqdq	%ymm15,%ymm12,%ymm12
1316	vpaddd	512-512(%rax),%ymm4,%ymm4
1317	vpaddd	544-512(%rax),%ymm5,%ymm5
1318	vpaddd	576-512(%rax),%ymm6,%ymm6
1319	vpaddd	608-512(%rax),%ymm7,%ymm7
1320
1321	vpunpckldq	%ymm5,%ymm4,%ymm15
1322	vpunpckldq	%ymm7,%ymm6,%ymm8
1323	vpunpckhdq	%ymm5,%ymm4,%ymm4
1324	vpunpckhdq	%ymm7,%ymm6,%ymm6
1325	vpunpcklqdq	%ymm8,%ymm15,%ymm5
1326	vpunpckhqdq	%ymm8,%ymm15,%ymm15
1327	vpunpcklqdq	%ymm6,%ymm4,%ymm7
1328	vpunpckhqdq	%ymm6,%ymm4,%ymm4
1329	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
1330	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
1331	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
1332	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
1333	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
1334	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
1335	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
1336	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
1337	vmovdqa	0(%rsp),%ymm6
1338	vmovdqa	32(%rsp),%ymm12
1339
// Full-block path at the end of one pass of the 8x outer loop.
// %rdx is used as the number of bytes still to process, %rsi as the read
// pointer and %rdi as the write pointer. The sixteen ymm values XORed
// below were produced earlier in the function (outside this excerpt);
// presumably they are 512 bytes of keystream -- confirm against the
// preceding code.
1340	cmpq	$512,%rdx
	// Unsigned compare: fewer than 512 bytes left goes to the partial path.
1341	jb	L$tail8x
1342
	// Bytes 0..127: ymm6, ymm8, ymm1, ymm5. Each group loads and XORs all
	// four 32-byte pieces before storing any of them, then advances both
	// pointers by 128.
1343	vpxor	0(%rsi),%ymm6,%ymm6
1344	vpxor	32(%rsi),%ymm8,%ymm8
1345	vpxor	64(%rsi),%ymm1,%ymm1
1346	vpxor	96(%rsi),%ymm5,%ymm5
1347	leaq	128(%rsi),%rsi
1348	vmovdqu	%ymm6,0(%rdi)
1349	vmovdqu	%ymm8,32(%rdi)
1350	vmovdqu	%ymm1,64(%rdi)
1351	vmovdqu	%ymm5,96(%rdi)
1352	leaq	128(%rdi),%rdi
1353
	// Bytes 128..255: ymm12, ymm13, ymm10, ymm15.
1354	vpxor	0(%rsi),%ymm12,%ymm12
1355	vpxor	32(%rsi),%ymm13,%ymm13
1356	vpxor	64(%rsi),%ymm10,%ymm10
1357	vpxor	96(%rsi),%ymm15,%ymm15
1358	leaq	128(%rsi),%rsi
1359	vmovdqu	%ymm12,0(%rdi)
1360	vmovdqu	%ymm13,32(%rdi)
1361	vmovdqu	%ymm10,64(%rdi)
1362	vmovdqu	%ymm15,96(%rdi)
1363	leaq	128(%rdi),%rdi
1364
	// Bytes 256..383: ymm14, ymm2, ymm3, ymm7.
1365	vpxor	0(%rsi),%ymm14,%ymm14
1366	vpxor	32(%rsi),%ymm2,%ymm2
1367	vpxor	64(%rsi),%ymm3,%ymm3
1368	vpxor	96(%rsi),%ymm7,%ymm7
1369	leaq	128(%rsi),%rsi
1370	vmovdqu	%ymm14,0(%rdi)
1371	vmovdqu	%ymm2,32(%rdi)
1372	vmovdqu	%ymm3,64(%rdi)
1373	vmovdqu	%ymm7,96(%rdi)
1374	leaq	128(%rdi),%rdi
1375
	// Bytes 384..511: ymm11, ymm9, ymm0, ymm4.
1376	vpxor	0(%rsi),%ymm11,%ymm11
1377	vpxor	32(%rsi),%ymm9,%ymm9
1378	vpxor	64(%rsi),%ymm0,%ymm0
1379	vpxor	96(%rsi),%ymm4,%ymm4
1380	leaq	128(%rsi),%rsi
1381	vmovdqu	%ymm11,0(%rdi)
1382	vmovdqu	%ymm9,32(%rdi)
1383	vmovdqu	%ymm0,64(%rdi)
1384	vmovdqu	%ymm4,96(%rdi)
1385	leaq	128(%rdi),%rdi
1386
	// Account for the 512 bytes just written. A nonzero remainder starts
	// another pass at L$oop_outer8x (defined before this excerpt); zero
	// means the whole request is done.
1387	subq	$512,%rdx
1388	jnz	L$oop_outer8x
1389
1390	jmp	L$done8x
1391
// Partial-block dispatch. Within this excerpt it is entered from the jb
// after "cmpq $512,%rdx", i.e. with fewer than 512 bytes left in %rdx.
// The thresholds are tested from largest to smallest with unsigned
// compares, so the handler L$<N>_or_more8x runs with N <= %rdx < N+64.
// Each handler reuses the flags of the cmpq that selected it (see the
// "je L$done8x" in the handlers), so the compare/branch pairs must stay
// adjacent.
1392L$tail8x:
1393	cmpq	$448,%rdx
1394	jae	L$448_or_more8x
1395	cmpq	$384,%rdx
1396	jae	L$384_or_more8x
1397	cmpq	$320,%rdx
1398	jae	L$320_or_more8x
1399	cmpq	$256,%rdx
1400	jae	L$256_or_more8x
1401	cmpq	$192,%rdx
1402	jae	L$192_or_more8x
1403	cmpq	$128,%rdx
1404	jae	L$128_or_more8x
1405	cmpq	$64,%rdx
1406	jae	L$64_or_more8x
1407
	// Fewer than 64 bytes in total: no whole 64-byte chunk can be written.
	// Reset the byte index %r10 and stage the first 64 bytes (ymm6:ymm8)
	// at 0..63(%rsp) for the byte loop. vmovdqa faults on an address that
	// is not 32-byte aligned, so %rsp must be 32-byte aligned here; that
	// alignment is established outside this excerpt.
1408	xorq	%r10,%r10
1409	vmovdqa	%ymm6,0(%rsp)
1410	vmovdqa	%ymm8,32(%rsp)
1411	jmp	L$oop_tail8x
1412
// Handler for 64 <= %rdx < 128. Within this excerpt it is reached only
// from the jae that follows "cmpq $64,%rdx" in L$tail8x.
1413.p2align	5
1414L$64_or_more8x:
	// XOR the first 64 input bytes with ymm6:ymm8 and write them out.
1415	vpxor	0(%rsi),%ymm6,%ymm6
1416	vpxor	32(%rsi),%ymm8,%ymm8
1417	vmovdqu	%ymm6,0(%rdi)
1418	vmovdqu	%ymm8,32(%rdi)
	// ZF still comes from "cmpq $64,%rdx": the AVX instructions above do
	// not write RFLAGS. Equal means exactly 64 bytes were left, so the
	// request is complete.
1419	je	L$done8x
1420
	// 1..63 bytes remain. Step both pointers past the 64 bytes just
	// handled, reset the byte index %r10, reduce %rdx to the leftover
	// count and stage the next 64 bytes (ymm1:ymm5) at 0..63(%rsp).
1421	leaq	64(%rsi),%rsi
1422	xorq	%r10,%r10
1423	vmovdqa	%ymm1,0(%rsp)
1424	leaq	64(%rdi),%rdi
1425	subq	$64,%rdx
1426	vmovdqa	%ymm5,32(%rsp)
1427	jmp	L$oop_tail8x
1428
// Handler for 128 <= %rdx < 192. Within this excerpt it is reached only
// from the jae that follows "cmpq $128,%rdx" in L$tail8x.
1429.p2align	5
1430L$128_or_more8x:
	// XOR the first 128 input bytes with ymm6, ymm8, ymm1, ymm5 and write
	// them out.
1431	vpxor	0(%rsi),%ymm6,%ymm6
1432	vpxor	32(%rsi),%ymm8,%ymm8
1433	vpxor	64(%rsi),%ymm1,%ymm1
1434	vpxor	96(%rsi),%ymm5,%ymm5
1435	vmovdqu	%ymm6,0(%rdi)
1436	vmovdqu	%ymm8,32(%rdi)
1437	vmovdqu	%ymm1,64(%rdi)
1438	vmovdqu	%ymm5,96(%rdi)
	// ZF still comes from "cmpq $128,%rdx" (the AVX instructions above do
	// not write RFLAGS): exactly 128 bytes means nothing is left.
1439	je	L$done8x
1440
	// 1..63 bytes remain: advance the pointers by 128, reset %r10, reduce
	// %rdx and stage the next 64 bytes (ymm12:ymm13) at 0..63(%rsp).
1441	leaq	128(%rsi),%rsi
1442	xorq	%r10,%r10
1443	vmovdqa	%ymm12,0(%rsp)
1444	leaq	128(%rdi),%rdi
1445	subq	$128,%rdx
1446	vmovdqa	%ymm13,32(%rsp)
1447	jmp	L$oop_tail8x
1448
// Handler for 192 <= %rdx < 256. Within this excerpt it is reached only
// from the jae that follows "cmpq $192,%rdx" in L$tail8x.
1449.p2align	5
1450L$192_or_more8x:
	// XOR the first 192 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12,
	// ymm13 (in that order) and write them out.
1451	vpxor	0(%rsi),%ymm6,%ymm6
1452	vpxor	32(%rsi),%ymm8,%ymm8
1453	vpxor	64(%rsi),%ymm1,%ymm1
1454	vpxor	96(%rsi),%ymm5,%ymm5
1455	vpxor	128(%rsi),%ymm12,%ymm12
1456	vpxor	160(%rsi),%ymm13,%ymm13
1457	vmovdqu	%ymm6,0(%rdi)
1458	vmovdqu	%ymm8,32(%rdi)
1459	vmovdqu	%ymm1,64(%rdi)
1460	vmovdqu	%ymm5,96(%rdi)
1461	vmovdqu	%ymm12,128(%rdi)
1462	vmovdqu	%ymm13,160(%rdi)
	// ZF still comes from "cmpq $192,%rdx" (the AVX instructions above do
	// not write RFLAGS): exactly 192 bytes means nothing is left.
1463	je	L$done8x
1464
	// 1..63 bytes remain: advance the pointers by 192, reset %r10, reduce
	// %rdx and stage the next 64 bytes (ymm10:ymm15) at 0..63(%rsp).
1465	leaq	192(%rsi),%rsi
1466	xorq	%r10,%r10
1467	vmovdqa	%ymm10,0(%rsp)
1468	leaq	192(%rdi),%rdi
1469	subq	$192,%rdx
1470	vmovdqa	%ymm15,32(%rsp)
1471	jmp	L$oop_tail8x
1472
// Handler for 256 <= %rdx < 320. Within this excerpt it is reached only
// from the jae that follows "cmpq $256,%rdx" in L$tail8x.
1473.p2align	5
1474L$256_or_more8x:
	// XOR the first 256 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12,
	// ymm13, ymm10, ymm15 (in that order) and write them out.
1475	vpxor	0(%rsi),%ymm6,%ymm6
1476	vpxor	32(%rsi),%ymm8,%ymm8
1477	vpxor	64(%rsi),%ymm1,%ymm1
1478	vpxor	96(%rsi),%ymm5,%ymm5
1479	vpxor	128(%rsi),%ymm12,%ymm12
1480	vpxor	160(%rsi),%ymm13,%ymm13
1481	vpxor	192(%rsi),%ymm10,%ymm10
1482	vpxor	224(%rsi),%ymm15,%ymm15
1483	vmovdqu	%ymm6,0(%rdi)
1484	vmovdqu	%ymm8,32(%rdi)
1485	vmovdqu	%ymm1,64(%rdi)
1486	vmovdqu	%ymm5,96(%rdi)
1487	vmovdqu	%ymm12,128(%rdi)
1488	vmovdqu	%ymm13,160(%rdi)
1489	vmovdqu	%ymm10,192(%rdi)
1490	vmovdqu	%ymm15,224(%rdi)
	// ZF still comes from "cmpq $256,%rdx" (the AVX instructions above do
	// not write RFLAGS): exactly 256 bytes means nothing is left.
1491	je	L$done8x
1492
	// 1..63 bytes remain: advance the pointers by 256, reset %r10, reduce
	// %rdx and stage the next 64 bytes (ymm14:ymm2) at 0..63(%rsp).
1493	leaq	256(%rsi),%rsi
1494	xorq	%r10,%r10
1495	vmovdqa	%ymm14,0(%rsp)
1496	leaq	256(%rdi),%rdi
1497	subq	$256,%rdx
1498	vmovdqa	%ymm2,32(%rsp)
1499	jmp	L$oop_tail8x
1500
// Handler for 320 <= %rdx < 384. Within this excerpt it is reached only
// from the jae that follows "cmpq $320,%rdx" in L$tail8x.
1501.p2align	5
1502L$320_or_more8x:
	// XOR the first 320 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12,
	// ymm13, ymm10, ymm15, ymm14, ymm2 (in that order) and write them out.
1503	vpxor	0(%rsi),%ymm6,%ymm6
1504	vpxor	32(%rsi),%ymm8,%ymm8
1505	vpxor	64(%rsi),%ymm1,%ymm1
1506	vpxor	96(%rsi),%ymm5,%ymm5
1507	vpxor	128(%rsi),%ymm12,%ymm12
1508	vpxor	160(%rsi),%ymm13,%ymm13
1509	vpxor	192(%rsi),%ymm10,%ymm10
1510	vpxor	224(%rsi),%ymm15,%ymm15
1511	vpxor	256(%rsi),%ymm14,%ymm14
1512	vpxor	288(%rsi),%ymm2,%ymm2
1513	vmovdqu	%ymm6,0(%rdi)
1514	vmovdqu	%ymm8,32(%rdi)
1515	vmovdqu	%ymm1,64(%rdi)
1516	vmovdqu	%ymm5,96(%rdi)
1517	vmovdqu	%ymm12,128(%rdi)
1518	vmovdqu	%ymm13,160(%rdi)
1519	vmovdqu	%ymm10,192(%rdi)
1520	vmovdqu	%ymm15,224(%rdi)
1521	vmovdqu	%ymm14,256(%rdi)
1522	vmovdqu	%ymm2,288(%rdi)
	// ZF still comes from "cmpq $320,%rdx" (the AVX instructions above do
	// not write RFLAGS): exactly 320 bytes means nothing is left.
1523	je	L$done8x
1524
	// 1..63 bytes remain: advance the pointers by 320, reset %r10, reduce
	// %rdx and stage the next 64 bytes (ymm3:ymm7) at 0..63(%rsp).
1525	leaq	320(%rsi),%rsi
1526	xorq	%r10,%r10
1527	vmovdqa	%ymm3,0(%rsp)
1528	leaq	320(%rdi),%rdi
1529	subq	$320,%rdx
1530	vmovdqa	%ymm7,32(%rsp)
1531	jmp	L$oop_tail8x
1532
// Handler for 384 <= %rdx < 448. Within this excerpt it is reached only
// from the jae that follows "cmpq $384,%rdx" in L$tail8x.
1533.p2align	5
1534L$384_or_more8x:
	// XOR the first 384 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12,
	// ymm13, ymm10, ymm15, ymm14, ymm2, ymm3, ymm7 (in that order) and
	// write them out.
1535	vpxor	0(%rsi),%ymm6,%ymm6
1536	vpxor	32(%rsi),%ymm8,%ymm8
1537	vpxor	64(%rsi),%ymm1,%ymm1
1538	vpxor	96(%rsi),%ymm5,%ymm5
1539	vpxor	128(%rsi),%ymm12,%ymm12
1540	vpxor	160(%rsi),%ymm13,%ymm13
1541	vpxor	192(%rsi),%ymm10,%ymm10
1542	vpxor	224(%rsi),%ymm15,%ymm15
1543	vpxor	256(%rsi),%ymm14,%ymm14
1544	vpxor	288(%rsi),%ymm2,%ymm2
1545	vpxor	320(%rsi),%ymm3,%ymm3
1546	vpxor	352(%rsi),%ymm7,%ymm7
1547	vmovdqu	%ymm6,0(%rdi)
1548	vmovdqu	%ymm8,32(%rdi)
1549	vmovdqu	%ymm1,64(%rdi)
1550	vmovdqu	%ymm5,96(%rdi)
1551	vmovdqu	%ymm12,128(%rdi)
1552	vmovdqu	%ymm13,160(%rdi)
1553	vmovdqu	%ymm10,192(%rdi)
1554	vmovdqu	%ymm15,224(%rdi)
1555	vmovdqu	%ymm14,256(%rdi)
1556	vmovdqu	%ymm2,288(%rdi)
1557	vmovdqu	%ymm3,320(%rdi)
1558	vmovdqu	%ymm7,352(%rdi)
	// ZF still comes from "cmpq $384,%rdx" (the AVX instructions above do
	// not write RFLAGS): exactly 384 bytes means nothing is left.
1559	je	L$done8x
1560
	// 1..63 bytes remain: advance the pointers by 384, reset %r10, reduce
	// %rdx and stage the next 64 bytes (ymm11:ymm9) at 0..63(%rsp).
1561	leaq	384(%rsi),%rsi
1562	xorq	%r10,%r10
1563	vmovdqa	%ymm11,0(%rsp)
1564	leaq	384(%rdi),%rdi
1565	subq	$384,%rdx
1566	vmovdqa	%ymm9,32(%rsp)
1567	jmp	L$oop_tail8x
1568
// Handler for 448 <= %rdx < 512. Within this excerpt it is reached only
// from the jae that follows "cmpq $448,%rdx" in L$tail8x, and L$tail8x is
// itself entered with %rdx below 512.
1569.p2align	5
1570L$448_or_more8x:
	// XOR the first 448 input bytes with ymm6, ymm8, ymm1, ymm5, ymm12,
	// ymm13, ymm10, ymm15, ymm14, ymm2, ymm3, ymm7, ymm11, ymm9 (in that
	// order) and write them out.
1571	vpxor	0(%rsi),%ymm6,%ymm6
1572	vpxor	32(%rsi),%ymm8,%ymm8
1573	vpxor	64(%rsi),%ymm1,%ymm1
1574	vpxor	96(%rsi),%ymm5,%ymm5
1575	vpxor	128(%rsi),%ymm12,%ymm12
1576	vpxor	160(%rsi),%ymm13,%ymm13
1577	vpxor	192(%rsi),%ymm10,%ymm10
1578	vpxor	224(%rsi),%ymm15,%ymm15
1579	vpxor	256(%rsi),%ymm14,%ymm14
1580	vpxor	288(%rsi),%ymm2,%ymm2
1581	vpxor	320(%rsi),%ymm3,%ymm3
1582	vpxor	352(%rsi),%ymm7,%ymm7
1583	vpxor	384(%rsi),%ymm11,%ymm11
1584	vpxor	416(%rsi),%ymm9,%ymm9
1585	vmovdqu	%ymm6,0(%rdi)
1586	vmovdqu	%ymm8,32(%rdi)
1587	vmovdqu	%ymm1,64(%rdi)
1588	vmovdqu	%ymm5,96(%rdi)
1589	vmovdqu	%ymm12,128(%rdi)
1590	vmovdqu	%ymm13,160(%rdi)
1591	vmovdqu	%ymm10,192(%rdi)
1592	vmovdqu	%ymm15,224(%rdi)
1593	vmovdqu	%ymm14,256(%rdi)
1594	vmovdqu	%ymm2,288(%rdi)
1595	vmovdqu	%ymm3,320(%rdi)
1596	vmovdqu	%ymm7,352(%rdi)
1597	vmovdqu	%ymm11,384(%rdi)
1598	vmovdqu	%ymm9,416(%rdi)
	// ZF still comes from "cmpq $448,%rdx" (the AVX instructions above do
	// not write RFLAGS): exactly 448 bytes means nothing is left.
1599	je	L$done8x
1600
	// 1..63 bytes remain: advance the pointers by 448, reset %r10, reduce
	// %rdx and stage the last 64 bytes (ymm0:ymm4) at 0..63(%rsp).
	// There is no jmp here: execution falls through into L$oop_tail8x,
	// which must stay directly below this handler.
1601	leaq	448(%rsi),%rsi
1602	xorq	%r10,%r10
1603	vmovdqa	%ymm0,0(%rsp)
1604	leaq	448(%rdi),%rdi
1605	subq	$448,%rdx
1606	vmovdqa	%ymm4,32(%rsp)
1607
// Byte-at-a-time finish for the last partial 64-byte chunk.
// On entry: %r10 = 0 (byte index), 0..63(%rsp) holds the 64 staged bytes,
// %rsi/%rdi point at the first unprocessed input/output byte, and %rdx is
// the number of bytes left. Every entry visible in this excerpt arrives
// with %rdx below 64. %rdx must also be nonzero, because the count is
// decremented before it is tested; a zero count would wrap around.
// Clobbers %eax and %ecx.
1608L$oop_tail8x:
1609	movzbl	(%rsi,%r10,1),%eax
1610	movzbl	(%rsp,%r10,1),%ecx
	// The index is bumped before the store, hence the -1 displacement on
	// the store address below.
1611	leaq	1(%r10),%r10
1612	xorl	%ecx,%eax
1613	movb	%al,-1(%rdi,%r10,1)
1614	decq	%rdx
1615	jnz	L$oop_tail8x
1616
// Common exit of the 8x path; reached by jump from the paths above or by
// falling out of L$oop_tail8x.
1617L$done8x:
	// vzeroall zeroes every ymm register, so the values XORed above do not
	// remain in vector registers after return. The 64-byte staging area
	// used by L$oop_tail8x is not cleared on this path.
1618	vzeroall
	// Reload %rsp from %r9 -- presumably the stack pointer saved by the
	// prologue, which is outside this excerpt (confirm).
1619	leaq	(%r9),%rsp
1620
1621L$8x_epilogue:
	// 0xf3,0xc3 is the machine encoding of "rep ret".
1622	.byte	0xf3,0xc3
1623
1624
1625#endif
1626