#! /usr/bin/env perl
# Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements the Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert the AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. The trigger for the modification was
# the observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress the
# AVX512F capability flag [at least on Skylake-X], the conversion serves
# as a kind of "investment protection". Note that the next *lake
# processor, Cannon Lake, has an AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4		4.46/+120%	-
# Core 2	2.41/+90%	-
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
# Silvermont	2.83/+95%	-
# Knights L	3.60/?		1.65		1.10	0.41(***)
# Goldmont	1.70/+180%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# Ryzen		1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest,
#	~50% on most processors; in both cases the comparison is to
#	__int128 code;
# (**)	an SSE2 implementation was attempted, but among non-AVX
#	processors it was faster than integer-only code only on older
#	Intel P4 and Core processors, by 30-50% (less so the newer the
#	processor), and slower on contemporary ones, e.g. almost 2x
#	slower on Atom; as the former are naturally disappearing, SSE2
#	was deemed unnecessary;
# (***)	strangely enough, performance seems to vary from core to core;
#	the listed result is the best case.
65
66# $output is the last argument if it looks like a file (it has an extension)
67# $flavour is the first argument if it doesn't look like a file
68$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
69$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
70
71$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
72
73$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
74( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
75( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
76die "can't locate x86_64-xlate.pl";
77
78if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
79		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
80	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
81}
82
83if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
84	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
85	$avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
86	$avx += 2 if ($1==2.11 && $2>=8);
87}
88
89if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
90	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
91	$avx = ($1>=10) + ($1>=12);
92}
93
94if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
95	$avx = ($2>=3.0) + ($2>3.0);
96}
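# The $avx level chosen above gates which code paths are emitted below:
# >=1 emits the AVX routines, >1 additionally emits poly1305_blocks_avx2,
# >2 adds the AVX512F path, and >3 also wires up the base 2^44 init path
# (.Linit_base2_44, non-Win64 only).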
97
98open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
99    or die "can't call $xlate: $!";
100*STDOUT=*OUT;
101
102my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
103my ($mac,$nonce)=($inp,$len);	# *_emit arguments
104my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
105my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
106
107sub poly1305_iteration {
108# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
109# output:	$h0-$h2 *= $r0-$r1
110$code.=<<___;
111	mulq	$h0			# h0*r1
112	mov	%rax,$d2
113	 mov	$r0,%rax
114	mov	%rdx,$d3
115
116	mulq	$h0			# h0*r0
117	mov	%rax,$h0		# future $h0
118	 mov	$r0,%rax
119	mov	%rdx,$d1
120
121	mulq	$h1			# h1*r0
122	add	%rax,$d2
123	 mov	$s1,%rax
124	adc	%rdx,$d3
125
126	mulq	$h1			# h1*s1
127	 mov	$h2,$h1			# borrow $h1
128	add	%rax,$h0
129	adc	%rdx,$d1
130
131	imulq	$s1,$h1			# h2*s1
132	add	$h1,$d2
133	 mov	$d1,$h1
134	adc	\$0,$d3
135
136	imulq	$r0,$h2			# h2*r0
137	add	$d2,$h1
138	mov	\$-4,%rax		# mask value
139	adc	$h2,$d3
140
141	and	$d3,%rax		# last reduction step
142	mov	$d3,$h2
143	shr	\$2,$d3
144	and	\$3,$h2
145	add	$d3,%rax
146	add	%rax,$h0
147	adc	\$0,$h1
148	adc	\$0,$h2
149___
150}
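# For reference, the iteration above corresponds to the following scalar
# sketch (illustrative only, not part of the generated code; it assumes a
# compiler with unsigned __int128, and "poly1305_iteration_ref" is just a
# hypothetical name).  $s1 is expected to hold r1 + (r1>>2), which the
# callers precompute:
#
#	typedef unsigned long long u64;
#	typedef unsigned __int128  u128;
#
#	static void poly1305_iteration_ref(u64 h[3], u64 r0, u64 r1, u64 s1)
#	{
#	    u128 d0 = (u128)h[0]*r0 + (u128)h[1]*s1;
#	    u128 d1 = (u128)h[0]*r1 + (u128)h[1]*r0 + (u128)h[2]*s1;
#	    u64  h2 = h[2]*r0, c;		/* h[2] is only a few bits wide */
#
#	    h[0] = (u64)d0;	d1 += (u64)(d0 >> 64);
#	    h[1] = (u64)d1;	h2 += (u64)(d1 >> 64);
#
#	    c    = (h2 & ~(u64)3) + (h2 >> 2);	/* 5*(h2>>2), since 2^130 = 5 mod p */
#	    h[2] = h2 & 3;
#	    h[0] += c;		c = (h[0] < c);
#	    h[1] += c;		h[2] += (h[1] < c);
#	}
#
# The s1 = r1 + (r1>>2) trick works because the clamped r1 is a multiple
# of 4, so the 2^128-weighted cross terms h1*r1 and h2*r1 can be folded
# down using 2^130 = 5 (mod p = 2^130-5).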
151
########################################################################
# The layout of the opaque area is as follows:
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64
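#
# i.e. the code below addresses h at offsets 0/8/16($ctx) and r at
# offsets 24/32($ctx).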
157
158$code.=<<___;
159.text
160
161.extern	OPENSSL_ia32cap_P
162
163.globl	poly1305_init
164.hidden	poly1305_init
165.globl	poly1305_blocks
166.hidden	poly1305_blocks
167.globl	poly1305_emit
168.hidden	poly1305_emit
169
170.type	poly1305_init,\@function,3
171.align	32
172poly1305_init:
173.cfi_startproc
174	xor	%rax,%rax
175	mov	%rax,0($ctx)		# initialize hash value
176	mov	%rax,8($ctx)
177	mov	%rax,16($ctx)
178
179	cmp	\$0,$inp
180	je	.Lno_key
181
182	lea	poly1305_blocks(%rip),%r10
183	lea	poly1305_emit(%rip),%r11
184___
185$code.=<<___	if ($avx);
186	mov	OPENSSL_ia32cap_P+4(%rip),%r9
187	lea	poly1305_blocks_avx(%rip),%rax
188	lea	poly1305_emit_avx(%rip),%rcx
189	bt	\$`60-32`,%r9		# AVX?
190	cmovc	%rax,%r10
191	cmovc	%rcx,%r11
192___
193$code.=<<___	if ($avx>1);
194	lea	poly1305_blocks_avx2(%rip),%rax
195	bt	\$`5+32`,%r9		# AVX2?
196	cmovc	%rax,%r10
197___
198$code.=<<___	if ($avx>3 && !$win64);
199	mov	\$`(1<<31|1<<21|1<<16)`,%rax
200	shr	\$32,%r9
201	and	%rax,%r9
202	cmp	%rax,%r9
203	je	.Linit_base2_44
204___
205$code.=<<___;
206	mov	\$0x0ffffffc0fffffff,%rax
207	mov	\$0x0ffffffc0ffffffc,%rcx
208	and	0($inp),%rax
209	and	8($inp),%rcx
210	mov	%rax,24($ctx)
211	mov	%rcx,32($ctx)
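	# together the two masks implement the standard Poly1305 "clamp",
	# r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, clearing the top four bits
	# of bytes 3, 7, 11, 15 and the bottom two bits of bytes 4, 8, 12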
212___
213$code.=<<___	if ($flavour !~ /elf32/);
214	mov	%r10,0(%rdx)
215	mov	%r11,8(%rdx)
216___
217$code.=<<___	if ($flavour =~ /elf32/);
218	mov	%r10d,0(%rdx)
219	mov	%r11d,4(%rdx)
220___
221$code.=<<___;
222	mov	\$1,%eax
223.Lno_key:
224	ret
225.cfi_endproc
226.size	poly1305_init,.-poly1305_init
227
228.type	poly1305_blocks,\@function,4
229.align	32
230poly1305_blocks:
231.cfi_startproc
232.Lblocks:
233	shr	\$4,$len
234	jz	.Lno_data		# too short
235
236	push	%rbx
237.cfi_push	%rbx
238	push	%rbp
239.cfi_push	%rbp
240	push	%r12
241.cfi_push	%r12
242	push	%r13
243.cfi_push	%r13
244	push	%r14
245.cfi_push	%r14
246	push	%r15
247.cfi_push	%r15
248.Lblocks_body:
249
250	mov	$len,%r15		# reassign $len
251
252	mov	24($ctx),$r0		# load r
253	mov	32($ctx),$s1
254
255	mov	0($ctx),$h0		# load hash value
256	mov	8($ctx),$h1
257	mov	16($ctx),$h2
258
259	mov	$s1,$r1
260	shr	\$2,$s1
261	mov	$r1,%rax
262	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
263	jmp	.Loop
264
265.align	32
266.Loop:
267	add	0($inp),$h0		# accumulate input
268	adc	8($inp),$h1
269	lea	16($inp),$inp
270	adc	$padbit,$h2
271___
272	&poly1305_iteration();
273$code.=<<___;
274	mov	$r1,%rax
275	dec	%r15			# len-=16
276	jnz	.Loop
277
278	mov	$h0,0($ctx)		# store hash value
279	mov	$h1,8($ctx)
280	mov	$h2,16($ctx)
281
282	mov	0(%rsp),%r15
283.cfi_restore	%r15
284	mov	8(%rsp),%r14
285.cfi_restore	%r14
286	mov	16(%rsp),%r13
287.cfi_restore	%r13
288	mov	24(%rsp),%r12
289.cfi_restore	%r12
290	mov	32(%rsp),%rbp
291.cfi_restore	%rbp
292	mov	40(%rsp),%rbx
293.cfi_restore	%rbx
294	lea	48(%rsp),%rsp
295.cfi_adjust_cfa_offset	-48
296.Lno_data:
297.Lblocks_epilogue:
298	ret
299.cfi_endproc
300.size	poly1305_blocks,.-poly1305_blocks
301
302.type	poly1305_emit,\@function,3
303.align	32
304poly1305_emit:
305.cfi_startproc
306.Lemit:
307	mov	0($ctx),%r8	# load hash value
308	mov	8($ctx),%r9
309	mov	16($ctx),%r10
310
311	mov	%r8,%rax
312	add	\$5,%r8		# compare to modulus
313	mov	%r9,%rcx
314	adc	\$0,%r9
315	adc	\$0,%r10
316	shr	\$2,%r10	# did 130-bit value overflow?
317	cmovnz	%r8,%rax
318	cmovnz	%r9,%rcx
319
320	add	0($nonce),%rax	# accumulate nonce
321	adc	8($nonce),%rcx
322	mov	%rax,0($mac)	# write result
323	mov	%rcx,8($mac)
324
325	ret
326.cfi_endproc
327.size	poly1305_emit,.-poly1305_emit
328___
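# For reference, poly1305_emit performs the following final step (an
# illustrative C sketch only; "poly1305_emit_ref" is a hypothetical name,
# and the cmov-based selection in the code above is constant-time while
# the ternaries below are not):
#
#	typedef unsigned long long u64;
#
#	static void poly1305_emit_ref(const u64 h[3], u64 mac[2],
#	                              const u64 nonce[2])
#	{
#	    u64 g0 = h[0] + 5, c = (g0 < 5);	/* compare h to p = 2^130-5 */
#	    u64 g1 = h[1] + c;  c = (g1 < c);
#	    u64 g2 = h[2] + c;
#	    int ge = (g2 >> 2) != 0;		/* did h+5 reach 2^130? */
#	    u64 t0 = ge ? g0 : h[0];		/* h mod p, truncated to 128 bits */
#	    u64 t1 = ge ? g1 : h[1];
#
#	    t0 += nonce[0];			/* add nonce mod 2^128 */
#	    t1 += nonce[1] + (t0 < nonce[0]);
#	    mac[0] = t0; mac[1] = t1;		/* 16-byte little-endian tag */
#	}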
329if ($avx) {
330
########################################################################
# The layout of the opaque area is as follows:
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int64 r[2];		# key value base 2^64
#	unsigned __int64 pad;
#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are the base 2^26 digits of powers of the multiplier key.
# There are 5 digits, but the last four are interleaved with their
# multiples of 5, for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2,
# r3, 5*r3, r4, 5*r4.
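#
# The 5*r_i entries exist because a base 2^26 product modulo 2^130-5
# folds the high digits back in with a factor of 5, e.g.
#
#	d0 = h0*r0 + 5*(h4*r1 + h3*r2 + h2*r3 + h1*r4)
#
# (see the d0..d4 formulas inside .Loop_avx below), so keeping the
# premultiplied digits avoids extra multiplications in the inner loop.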
343
344my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
345    map("%xmm$_",(0..15));
346
347$code.=<<___;
348.type	__poly1305_block,\@abi-omnipotent
349.align	32
350__poly1305_block:
351.cfi_startproc
352___
353	&poly1305_iteration();
354$code.=<<___;
355	ret
356.cfi_endproc
357.size	__poly1305_block,.-__poly1305_block
358
359.type	__poly1305_init_avx,\@abi-omnipotent
360.align	32
361__poly1305_init_avx:
362.cfi_startproc
363	mov	$r0,$h0
364	mov	$r1,$h1
365	xor	$h2,$h2
366
367	lea	48+64($ctx),$ctx	# size optimization
368
369	mov	$r1,%rax
370	call	__poly1305_block	# r^2
371
372	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
373	mov	\$0x3ffffff,%edx
374	mov	$h0,$d1
375	and	$h0#d,%eax
376	mov	$r0,$d2
377	and	$r0#d,%edx
378	mov	%eax,`16*0+0-64`($ctx)
379	shr	\$26,$d1
380	mov	%edx,`16*0+4-64`($ctx)
381	shr	\$26,$d2
382
383	mov	\$0x3ffffff,%eax
384	mov	\$0x3ffffff,%edx
385	and	$d1#d,%eax
386	and	$d2#d,%edx
387	mov	%eax,`16*1+0-64`($ctx)
388	lea	(%rax,%rax,4),%eax	# *5
389	mov	%edx,`16*1+4-64`($ctx)
390	lea	(%rdx,%rdx,4),%edx	# *5
391	mov	%eax,`16*2+0-64`($ctx)
392	shr	\$26,$d1
393	mov	%edx,`16*2+4-64`($ctx)
394	shr	\$26,$d2
395
396	mov	$h1,%rax
397	mov	$r1,%rdx
398	shl	\$12,%rax
399	shl	\$12,%rdx
400	or	$d1,%rax
401	or	$d2,%rdx
402	and	\$0x3ffffff,%eax
403	and	\$0x3ffffff,%edx
404	mov	%eax,`16*3+0-64`($ctx)
405	lea	(%rax,%rax,4),%eax	# *5
406	mov	%edx,`16*3+4-64`($ctx)
407	lea	(%rdx,%rdx,4),%edx	# *5
408	mov	%eax,`16*4+0-64`($ctx)
409	mov	$h1,$d1
410	mov	%edx,`16*4+4-64`($ctx)
411	mov	$r1,$d2
412
413	mov	\$0x3ffffff,%eax
414	mov	\$0x3ffffff,%edx
415	shr	\$14,$d1
416	shr	\$14,$d2
417	and	$d1#d,%eax
418	and	$d2#d,%edx
419	mov	%eax,`16*5+0-64`($ctx)
420	lea	(%rax,%rax,4),%eax	# *5
421	mov	%edx,`16*5+4-64`($ctx)
422	lea	(%rdx,%rdx,4),%edx	# *5
423	mov	%eax,`16*6+0-64`($ctx)
424	shr	\$26,$d1
425	mov	%edx,`16*6+4-64`($ctx)
426	shr	\$26,$d2
427
428	mov	$h2,%rax
429	shl	\$24,%rax
430	or	%rax,$d1
431	mov	$d1#d,`16*7+0-64`($ctx)
432	lea	($d1,$d1,4),$d1		# *5
433	mov	$d2#d,`16*7+4-64`($ctx)
434	lea	($d2,$d2,4),$d2		# *5
435	mov	$d1#d,`16*8+0-64`($ctx)
436	mov	$d2#d,`16*8+4-64`($ctx)
437
438	mov	$r1,%rax
439	call	__poly1305_block	# r^3
440
441	mov	\$0x3ffffff,%eax	# save r^3 base 2^26
442	mov	$h0,$d1
443	and	$h0#d,%eax
444	shr	\$26,$d1
445	mov	%eax,`16*0+12-64`($ctx)
446
447	mov	\$0x3ffffff,%edx
448	and	$d1#d,%edx
449	mov	%edx,`16*1+12-64`($ctx)
450	lea	(%rdx,%rdx,4),%edx	# *5
451	shr	\$26,$d1
452	mov	%edx,`16*2+12-64`($ctx)
453
454	mov	$h1,%rax
455	shl	\$12,%rax
456	or	$d1,%rax
457	and	\$0x3ffffff,%eax
458	mov	%eax,`16*3+12-64`($ctx)
459	lea	(%rax,%rax,4),%eax	# *5
460	mov	$h1,$d1
461	mov	%eax,`16*4+12-64`($ctx)
462
463	mov	\$0x3ffffff,%edx
464	shr	\$14,$d1
465	and	$d1#d,%edx
466	mov	%edx,`16*5+12-64`($ctx)
467	lea	(%rdx,%rdx,4),%edx	# *5
468	shr	\$26,$d1
469	mov	%edx,`16*6+12-64`($ctx)
470
471	mov	$h2,%rax
472	shl	\$24,%rax
473	or	%rax,$d1
474	mov	$d1#d,`16*7+12-64`($ctx)
475	lea	($d1,$d1,4),$d1		# *5
476	mov	$d1#d,`16*8+12-64`($ctx)
477
478	mov	$r1,%rax
479	call	__poly1305_block	# r^4
480
481	mov	\$0x3ffffff,%eax	# save r^4 base 2^26
482	mov	$h0,$d1
483	and	$h0#d,%eax
484	shr	\$26,$d1
485	mov	%eax,`16*0+8-64`($ctx)
486
487	mov	\$0x3ffffff,%edx
488	and	$d1#d,%edx
489	mov	%edx,`16*1+8-64`($ctx)
490	lea	(%rdx,%rdx,4),%edx	# *5
491	shr	\$26,$d1
492	mov	%edx,`16*2+8-64`($ctx)
493
494	mov	$h1,%rax
495	shl	\$12,%rax
496	or	$d1,%rax
497	and	\$0x3ffffff,%eax
498	mov	%eax,`16*3+8-64`($ctx)
499	lea	(%rax,%rax,4),%eax	# *5
500	mov	$h1,$d1
501	mov	%eax,`16*4+8-64`($ctx)
502
503	mov	\$0x3ffffff,%edx
504	shr	\$14,$d1
505	and	$d1#d,%edx
506	mov	%edx,`16*5+8-64`($ctx)
507	lea	(%rdx,%rdx,4),%edx	# *5
508	shr	\$26,$d1
509	mov	%edx,`16*6+8-64`($ctx)
510
511	mov	$h2,%rax
512	shl	\$24,%rax
513	or	%rax,$d1
514	mov	$d1#d,`16*7+8-64`($ctx)
515	lea	($d1,$d1,4),$d1		# *5
516	mov	$d1#d,`16*8+8-64`($ctx)
517
518	lea	-48-64($ctx),$ctx	# size [de-]optimization
519	ret
520.cfi_endproc
521.size	__poly1305_init_avx,.-__poly1305_init_avx
522
523.type	poly1305_blocks_avx,\@function,4
524.align	32
525poly1305_blocks_avx:
526.cfi_startproc
527	mov	20($ctx),%r8d		# is_base2_26
528	cmp	\$128,$len
529	jae	.Lblocks_avx
530	test	%r8d,%r8d
531	jz	.Lblocks
532
533.Lblocks_avx:
534	and	\$-16,$len
535	jz	.Lno_data_avx
536
537	vzeroupper
538
539	test	%r8d,%r8d
540	jz	.Lbase2_64_avx
541
542	test	\$31,$len
543	jz	.Leven_avx
544
545	push	%rbx
546.cfi_push	%rbx
547	push	%rbp
548.cfi_push	%rbp
549	push	%r12
550.cfi_push	%r12
551	push	%r13
552.cfi_push	%r13
553	push	%r14
554.cfi_push	%r14
555	push	%r15
556.cfi_push	%r15
557.Lblocks_avx_body:
558
559	mov	$len,%r15		# reassign $len
560
561	mov	0($ctx),$d1		# load hash value
562	mov	8($ctx),$d2
563	mov	16($ctx),$h2#d
564
565	mov	24($ctx),$r0		# load r
566	mov	32($ctx),$s1
567
568	################################# base 2^26 -> base 2^64
569	mov	$d1#d,$h0#d
570	and	\$`-1*(1<<31)`,$d1
571	mov	$d2,$r1			# borrow $r1
572	mov	$d2#d,$h1#d
573	and	\$`-1*(1<<31)`,$d2
574
575	shr	\$6,$d1
576	shl	\$52,$r1
577	add	$d1,$h0
578	shr	\$12,$h1
579	shr	\$18,$d2
580	add	$r1,$h0
581	adc	$d2,$h1
582
583	mov	$h2,$d1
584	shl	\$40,$d1
585	shr	\$24,$h2
586	add	$d1,$h1
587	adc	\$0,$h2			# can be partially reduced...
588
589	mov	\$-4,$d2		# ... so reduce
590	mov	$h2,$d1
591	and	$h2,$d2
592	shr	\$2,$d1
593	and	\$3,$h2
594	add	$d2,$d1			# =*5
595	add	$d1,$h0
596	adc	\$0,$h1
597	adc	\$0,$h2
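	# conceptually (the add/adc above also propagate any overlap between
	# the not-fully-reduced base 2^26 digits h[0..4]):
	#
	#	h0 = h[0] | h[1]<<26 | h[2]<<52		# low 64 bits
	#	h1 = h[2]>>12 | h[3]<<14 | h[4]<<40
	#	h2 = h[4]>>24
	#
	# followed by the same 5*(h2>>2) folding as in __poly1305_block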
598
599	mov	$s1,$r1
600	mov	$s1,%rax
601	shr	\$2,$s1
602	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
603
604	add	0($inp),$h0		# accumulate input
605	adc	8($inp),$h1
606	lea	16($inp),$inp
607	adc	$padbit,$h2
608
609	call	__poly1305_block
610
611	test	$padbit,$padbit		# if $padbit is zero,
612	jz	.Lstore_base2_64_avx	# store hash in base 2^64 format
613
614	################################# base 2^64 -> base 2^26
615	mov	$h0,%rax
616	mov	$h0,%rdx
617	shr	\$52,$h0
618	mov	$h1,$r0
619	mov	$h1,$r1
620	shr	\$26,%rdx
621	and	\$0x3ffffff,%rax	# h[0]
622	shl	\$12,$r0
623	and	\$0x3ffffff,%rdx	# h[1]
624	shr	\$14,$h1
625	or	$r0,$h0
626	shl	\$24,$h2
627	and	\$0x3ffffff,$h0		# h[2]
628	shr	\$40,$r1
629	and	\$0x3ffffff,$h1		# h[3]
630	or	$r1,$h2			# h[4]
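	# i.e. with h = h0 + 2^64*h1 + 2^128*h2 on input, the five base 2^26
	# digits are
	#
	#	h[0] =  h0        & 0x3ffffff
	#	h[1] = (h0 >> 26) & 0x3ffffff
	#	h[2] = (h0 >> 52 | h1 << 12) & 0x3ffffff
	#	h[3] = (h1 >> 14) & 0x3ffffff
	#	h[4] =  h1 >> 40  | h2 << 24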
631
632	sub	\$16,%r15
633	jz	.Lstore_base2_26_avx
634
635	vmovd	%rax#d,$H0
636	vmovd	%rdx#d,$H1
637	vmovd	$h0#d,$H2
638	vmovd	$h1#d,$H3
639	vmovd	$h2#d,$H4
640	jmp	.Lproceed_avx
641
642.align	32
643.Lstore_base2_64_avx:
644	mov	$h0,0($ctx)
645	mov	$h1,8($ctx)
646	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
647	jmp	.Ldone_avx
648
649.align	16
650.Lstore_base2_26_avx:
651	mov	%rax#d,0($ctx)		# store hash value base 2^26
652	mov	%rdx#d,4($ctx)
653	mov	$h0#d,8($ctx)
654	mov	$h1#d,12($ctx)
655	mov	$h2#d,16($ctx)
656.align	16
657.Ldone_avx:
658	mov	0(%rsp),%r15
659.cfi_restore	%r15
660	mov	8(%rsp),%r14
661.cfi_restore	%r14
662	mov	16(%rsp),%r13
663.cfi_restore	%r13
664	mov	24(%rsp),%r12
665.cfi_restore	%r12
666	mov	32(%rsp),%rbp
667.cfi_restore	%rbp
668	mov	40(%rsp),%rbx
669.cfi_restore	%rbx
670	lea	48(%rsp),%rsp
671.cfi_adjust_cfa_offset	-48
672.Lno_data_avx:
673.Lblocks_avx_epilogue:
674	ret
675.cfi_endproc
676
677.align	32
678.Lbase2_64_avx:
679.cfi_startproc
680	push	%rbx
681.cfi_push	%rbx
682	push	%rbp
683.cfi_push	%rbp
684	push	%r12
685.cfi_push	%r12
686	push	%r13
687.cfi_push	%r13
688	push	%r14
689.cfi_push	%r14
690	push	%r15
691.cfi_push	%r15
692.Lbase2_64_avx_body:
693
694	mov	$len,%r15		# reassign $len
695
696	mov	24($ctx),$r0		# load r
697	mov	32($ctx),$s1
698
699	mov	0($ctx),$h0		# load hash value
700	mov	8($ctx),$h1
701	mov	16($ctx),$h2#d
702
703	mov	$s1,$r1
704	mov	$s1,%rax
705	shr	\$2,$s1
706	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
707
708	test	\$31,$len
709	jz	.Linit_avx
710
711	add	0($inp),$h0		# accumulate input
712	adc	8($inp),$h1
713	lea	16($inp),$inp
714	adc	$padbit,$h2
715	sub	\$16,%r15
716
717	call	__poly1305_block
718
719.Linit_avx:
720	################################# base 2^64 -> base 2^26
721	mov	$h0,%rax
722	mov	$h0,%rdx
723	shr	\$52,$h0
724	mov	$h1,$d1
725	mov	$h1,$d2
726	shr	\$26,%rdx
727	and	\$0x3ffffff,%rax	# h[0]
728	shl	\$12,$d1
729	and	\$0x3ffffff,%rdx	# h[1]
730	shr	\$14,$h1
731	or	$d1,$h0
732	shl	\$24,$h2
733	and	\$0x3ffffff,$h0		# h[2]
734	shr	\$40,$d2
735	and	\$0x3ffffff,$h1		# h[3]
736	or	$d2,$h2			# h[4]
737
738	vmovd	%rax#d,$H0
739	vmovd	%rdx#d,$H1
740	vmovd	$h0#d,$H2
741	vmovd	$h1#d,$H3
742	vmovd	$h2#d,$H4
743	movl	\$1,20($ctx)		# set is_base2_26
744
745	call	__poly1305_init_avx
746
747.Lproceed_avx:
748	mov	%r15,$len
749
750	mov	0(%rsp),%r15
751.cfi_restore	%r15
752	mov	8(%rsp),%r14
753.cfi_restore	%r14
754	mov	16(%rsp),%r13
755.cfi_restore	%r13
756	mov	24(%rsp),%r12
757.cfi_restore	%r12
758	mov	32(%rsp),%rbp
759.cfi_restore	%rbp
760	mov	40(%rsp),%rbx
761.cfi_restore	%rbx
762	lea	48(%rsp),%rax
763	lea	48(%rsp),%rsp
764.cfi_adjust_cfa_offset	-48
765.Lbase2_64_avx_epilogue:
766	jmp	.Ldo_avx
767.cfi_endproc
768
769.align	32
770.Leven_avx:
771.cfi_startproc
772	vmovd		4*0($ctx),$H0		# load hash value
773	vmovd		4*1($ctx),$H1
774	vmovd		4*2($ctx),$H2
775	vmovd		4*3($ctx),$H3
776	vmovd		4*4($ctx),$H4
777
778.Ldo_avx:
779___
780$code.=<<___	if (!$win64);
781	lea		-0x58(%rsp),%r11
782.cfi_def_cfa		%r11,0x60
783	sub		\$0x178,%rsp
784___
785$code.=<<___	if ($win64);
786	lea		-0xf8(%rsp),%r11
787	sub		\$0x218,%rsp
788	vmovdqa		%xmm6,0x50(%r11)
789	vmovdqa		%xmm7,0x60(%r11)
790	vmovdqa		%xmm8,0x70(%r11)
791	vmovdqa		%xmm9,0x80(%r11)
792	vmovdqa		%xmm10,0x90(%r11)
793	vmovdqa		%xmm11,0xa0(%r11)
794	vmovdqa		%xmm12,0xb0(%r11)
795	vmovdqa		%xmm13,0xc0(%r11)
796	vmovdqa		%xmm14,0xd0(%r11)
797	vmovdqa		%xmm15,0xe0(%r11)
798.Ldo_avx_body:
799___
800$code.=<<___;
801	sub		\$64,$len
802	lea		-32($inp),%rax
803	cmovc		%rax,$inp
804
805	vmovdqu		`16*3`($ctx),$D4	# preload r0^2
806	lea		`16*3+64`($ctx),$ctx	# size optimization
807	lea		.Lconst(%rip),%rcx
808
809	################################################################
810	# load input
811	vmovdqu		16*2($inp),$T0
812	vmovdqu		16*3($inp),$T1
813	vmovdqa		64(%rcx),$MASK		# .Lmask26
814
815	vpsrldq		\$6,$T0,$T2		# splat input
816	vpsrldq		\$6,$T1,$T3
817	vpunpckhqdq	$T1,$T0,$T4		# 4
818	vpunpcklqdq	$T1,$T0,$T0		# 0:1
819	vpunpcklqdq	$T3,$T2,$T3		# 2:3
820
821	vpsrlq		\$40,$T4,$T4		# 4
822	vpsrlq		\$26,$T0,$T1
823	vpand		$MASK,$T0,$T0		# 0
824	vpsrlq		\$4,$T3,$T2
825	vpand		$MASK,$T1,$T1		# 1
826	vpsrlq		\$30,$T3,$T3
827	vpand		$MASK,$T2,$T2		# 2
828	vpand		$MASK,$T3,$T3		# 3
829	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
830
831	jbe		.Lskip_loop_avx
832
833	# expand and copy pre-calculated table to stack
834	vmovdqu		`16*1-64`($ctx),$D1
835	vmovdqu		`16*2-64`($ctx),$D2
836	vpshufd		\$0xEE,$D4,$D3		# 34xx -> 3434
837	vpshufd		\$0x44,$D4,$D0		# xx12 -> 1212
838	vmovdqa		$D3,-0x90(%r11)
839	vmovdqa		$D0,0x00(%rsp)
840	vpshufd		\$0xEE,$D1,$D4
841	vmovdqu		`16*3-64`($ctx),$D0
842	vpshufd		\$0x44,$D1,$D1
843	vmovdqa		$D4,-0x80(%r11)
844	vmovdqa		$D1,0x10(%rsp)
845	vpshufd		\$0xEE,$D2,$D3
846	vmovdqu		`16*4-64`($ctx),$D1
847	vpshufd		\$0x44,$D2,$D2
848	vmovdqa		$D3,-0x70(%r11)
849	vmovdqa		$D2,0x20(%rsp)
850	vpshufd		\$0xEE,$D0,$D4
851	vmovdqu		`16*5-64`($ctx),$D2
852	vpshufd		\$0x44,$D0,$D0
853	vmovdqa		$D4,-0x60(%r11)
854	vmovdqa		$D0,0x30(%rsp)
855	vpshufd		\$0xEE,$D1,$D3
856	vmovdqu		`16*6-64`($ctx),$D0
857	vpshufd		\$0x44,$D1,$D1
858	vmovdqa		$D3,-0x50(%r11)
859	vmovdqa		$D1,0x40(%rsp)
860	vpshufd		\$0xEE,$D2,$D4
861	vmovdqu		`16*7-64`($ctx),$D1
862	vpshufd		\$0x44,$D2,$D2
863	vmovdqa		$D4,-0x40(%r11)
864	vmovdqa		$D2,0x50(%rsp)
865	vpshufd		\$0xEE,$D0,$D3
866	vmovdqu		`16*8-64`($ctx),$D2
867	vpshufd		\$0x44,$D0,$D0
868	vmovdqa		$D3,-0x30(%r11)
869	vmovdqa		$D0,0x60(%rsp)
870	vpshufd		\$0xEE,$D1,$D4
871	vpshufd		\$0x44,$D1,$D1
872	vmovdqa		$D4,-0x20(%r11)
873	vmovdqa		$D1,0x70(%rsp)
874	vpshufd		\$0xEE,$D2,$D3
875	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
876	vpshufd		\$0x44,$D2,$D2
877	vmovdqa		$D3,-0x10(%r11)
878	vmovdqa		$D2,0x80(%rsp)
879
880	jmp		.Loop_avx
881
882.align	32
883.Loop_avx:
884	################################################################
885	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
886	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
887	#   \___________________/
888	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
889	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
890	#   \___________________/ \____________________/
891	#
	# Note that we start with inp[2:3]*r^2. This is because it
	# doesn't depend on the reduction in the previous iteration.
894	################################################################
895	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
896	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
897	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
898	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
899	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
900	#
901	# though note that $Tx and $Hx are "reversed" in this section,
902	# and $D4 is preloaded with r0^2...
903
904	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
905	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
906	  vmovdqa	$H2,0x20(%r11)				# offload hash
	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
908	 vmovdqa	0x10(%rsp),$H2		# r1^2
909	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
910	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
911
912	  vmovdqa	$H0,0x00(%r11)				#
913	vpmuludq	0x20(%rsp),$T4,$H0	# h4*s1
914	  vmovdqa	$H1,0x10(%r11)				#
915	vpmuludq	$T3,$H2,$H1		# h3*r1
916	vpaddq		$H0,$D0,$D0		# d0 += h4*s1
917	vpaddq		$H1,$D4,$D4		# d4 += h3*r1
918	  vmovdqa	$H3,0x30(%r11)				#
919	vpmuludq	$T2,$H2,$H0		# h2*r1
920	vpmuludq	$T1,$H2,$H1		# h1*r1
921	vpaddq		$H0,$D3,$D3		# d3 += h2*r1
922	 vmovdqa	0x30(%rsp),$H3		# r2^2
923	vpaddq		$H1,$D2,$D2		# d2 += h1*r1
924	  vmovdqa	$H4,0x40(%r11)				#
925	vpmuludq	$T0,$H2,$H2		# h0*r1
926	 vpmuludq	$T2,$H3,$H0		# h2*r2
927	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
928
929	 vmovdqa	0x40(%rsp),$H4		# s2^2
930	vpaddq		$H0,$D4,$D4		# d4 += h2*r2
931	vpmuludq	$T1,$H3,$H1		# h1*r2
932	vpmuludq	$T0,$H3,$H3		# h0*r2
933	vpaddq		$H1,$D3,$D3		# d3 += h1*r2
934	 vmovdqa	0x50(%rsp),$H2		# r3^2
935	vpaddq		$H3,$D2,$D2		# d2 += h0*r2
936	vpmuludq	$T4,$H4,$H0		# h4*s2
937	vpmuludq	$T3,$H4,$H4		# h3*s2
938	vpaddq		$H0,$D1,$D1		# d1 += h4*s2
939	 vmovdqa	0x60(%rsp),$H3		# s3^2
940	vpaddq		$H4,$D0,$D0		# d0 += h3*s2
941
942	 vmovdqa	0x80(%rsp),$H4		# s4^2
943	vpmuludq	$T1,$H2,$H1		# h1*r3
944	vpmuludq	$T0,$H2,$H2		# h0*r3
945	vpaddq		$H1,$D4,$D4		# d4 += h1*r3
946	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
947	vpmuludq	$T4,$H3,$H0		# h4*s3
948	vpmuludq	$T3,$H3,$H1		# h3*s3
949	vpaddq		$H0,$D2,$D2		# d2 += h4*s3
950	 vmovdqu	16*0($inp),$H0				# load input
951	vpaddq		$H1,$D1,$D1		# d1 += h3*s3
952	vpmuludq	$T2,$H3,$H3		# h2*s3
953	 vpmuludq	$T2,$H4,$T2		# h2*s4
954	vpaddq		$H3,$D0,$D0		# d0 += h2*s3
955
956	 vmovdqu	16*1($inp),$H1				#
957	vpaddq		$T2,$D1,$D1		# d1 += h2*s4
958	vpmuludq	$T3,$H4,$T3		# h3*s4
959	vpmuludq	$T4,$H4,$T4		# h4*s4
960	 vpsrldq	\$6,$H0,$H2				# splat input
961	vpaddq		$T3,$D2,$D2		# d2 += h3*s4
962	vpaddq		$T4,$D3,$D3		# d3 += h4*s4
963	 vpsrldq	\$6,$H1,$H3				#
964	vpmuludq	0x70(%rsp),$T0,$T4	# h0*r4
965	vpmuludq	$T1,$H4,$T0		# h1*s4
966	 vpunpckhqdq	$H1,$H0,$H4		# 4
967	vpaddq		$T4,$D4,$D4		# d4 += h0*r4
968	 vmovdqa	-0x90(%r11),$T4		# r0^4
969	vpaddq		$T0,$D0,$D0		# d0 += h1*s4
970
971	vpunpcklqdq	$H1,$H0,$H0		# 0:1
972	vpunpcklqdq	$H3,$H2,$H3		# 2:3
973
974	#vpsrlq		\$40,$H4,$H4		# 4
975	vpsrldq		\$`40/8`,$H4,$H4	# 4
976	vpsrlq		\$26,$H0,$H1
977	vpand		$MASK,$H0,$H0		# 0
978	vpsrlq		\$4,$H3,$H2
979	vpand		$MASK,$H1,$H1		# 1
980	vpand		0(%rcx),$H4,$H4		# .Lmask24
981	vpsrlq		\$30,$H3,$H3
982	vpand		$MASK,$H2,$H2		# 2
983	vpand		$MASK,$H3,$H3		# 3
984	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
985
986	vpaddq		0x00(%r11),$H0,$H0	# add hash value
987	vpaddq		0x10(%r11),$H1,$H1
988	vpaddq		0x20(%r11),$H2,$H2
989	vpaddq		0x30(%r11),$H3,$H3
990	vpaddq		0x40(%r11),$H4,$H4
991
992	lea		16*2($inp),%rax
993	lea		16*4($inp),$inp
994	sub		\$64,$len
995	cmovc		%rax,$inp
996
997	################################################################
998	# Now we accumulate (inp[0:1]+hash)*r^4
999	################################################################
1000	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1001	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1002	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1003	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1004	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1005
1006	vpmuludq	$H0,$T4,$T0		# h0*r0
1007	vpmuludq	$H1,$T4,$T1		# h1*r0
1008	vpaddq		$T0,$D0,$D0
1009	vpaddq		$T1,$D1,$D1
1010	 vmovdqa	-0x80(%r11),$T2		# r1^4
1011	vpmuludq	$H2,$T4,$T0		# h2*r0
1012	vpmuludq	$H3,$T4,$T1		# h3*r0
1013	vpaddq		$T0,$D2,$D2
1014	vpaddq		$T1,$D3,$D3
1015	vpmuludq	$H4,$T4,$T4		# h4*r0
1016	 vpmuludq	-0x70(%r11),$H4,$T0	# h4*s1
1017	vpaddq		$T4,$D4,$D4
1018
1019	vpaddq		$T0,$D0,$D0		# d0 += h4*s1
1020	vpmuludq	$H2,$T2,$T1		# h2*r1
1021	vpmuludq	$H3,$T2,$T0		# h3*r1
1022	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
1023	 vmovdqa	-0x60(%r11),$T3		# r2^4
1024	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
1025	vpmuludq	$H1,$T2,$T1		# h1*r1
1026	vpmuludq	$H0,$T2,$T2		# h0*r1
1027	vpaddq		$T1,$D2,$D2		# d2 += h1*r1
1028	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
1029
1030	 vmovdqa	-0x50(%r11),$T4		# s2^4
1031	vpmuludq	$H2,$T3,$T0		# h2*r2
1032	vpmuludq	$H1,$T3,$T1		# h1*r2
1033	vpaddq		$T0,$D4,$D4		# d4 += h2*r2
1034	vpaddq		$T1,$D3,$D3		# d3 += h1*r2
1035	 vmovdqa	-0x40(%r11),$T2		# r3^4
1036	vpmuludq	$H0,$T3,$T3		# h0*r2
1037	vpmuludq	$H4,$T4,$T0		# h4*s2
1038	vpaddq		$T3,$D2,$D2		# d2 += h0*r2
1039	vpaddq		$T0,$D1,$D1		# d1 += h4*s2
1040	 vmovdqa	-0x30(%r11),$T3		# s3^4
1041	vpmuludq	$H3,$T4,$T4		# h3*s2
1042	 vpmuludq	$H1,$T2,$T1		# h1*r3
1043	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
1044
1045	 vmovdqa	-0x10(%r11),$T4		# s4^4
1046	vpaddq		$T1,$D4,$D4		# d4 += h1*r3
1047	vpmuludq	$H0,$T2,$T2		# h0*r3
1048	vpmuludq	$H4,$T3,$T0		# h4*s3
1049	vpaddq		$T2,$D3,$D3		# d3 += h0*r3
1050	vpaddq		$T0,$D2,$D2		# d2 += h4*s3
1051	 vmovdqu	16*2($inp),$T0				# load input
1052	vpmuludq	$H3,$T3,$T2		# h3*s3
1053	vpmuludq	$H2,$T3,$T3		# h2*s3
1054	vpaddq		$T2,$D1,$D1		# d1 += h3*s3
1055	 vmovdqu	16*3($inp),$T1				#
1056	vpaddq		$T3,$D0,$D0		# d0 += h2*s3
1057
1058	vpmuludq	$H2,$T4,$H2		# h2*s4
1059	vpmuludq	$H3,$T4,$H3		# h3*s4
1060	 vpsrldq	\$6,$T0,$T2				# splat input
1061	vpaddq		$H2,$D1,$D1		# d1 += h2*s4
1062	vpmuludq	$H4,$T4,$H4		# h4*s4
1063	 vpsrldq	\$6,$T1,$T3				#
1064	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
1065	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
1066	vpmuludq	-0x20(%r11),$H0,$H4	# h0*r4
1067	vpmuludq	$H1,$T4,$H0
1068	 vpunpckhqdq	$T1,$T0,$T4		# 4
1069	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
1070	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
1071
1072	vpunpcklqdq	$T1,$T0,$T0		# 0:1
1073	vpunpcklqdq	$T3,$T2,$T3		# 2:3
1074
1075	#vpsrlq		\$40,$T4,$T4		# 4
1076	vpsrldq		\$`40/8`,$T4,$T4	# 4
1077	vpsrlq		\$26,$T0,$T1
1078	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
1079	vpand		$MASK,$T0,$T0		# 0
1080	vpsrlq		\$4,$T3,$T2
1081	vpand		$MASK,$T1,$T1		# 1
1082	vpand		0(%rcx),$T4,$T4		# .Lmask24
1083	vpsrlq		\$30,$T3,$T3
1084	vpand		$MASK,$T2,$T2		# 2
1085	vpand		$MASK,$T3,$T3		# 3
1086	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
1087
1088	################################################################
1089	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1090	# and P. Schwabe
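	# in scalar terms the carry chain below performs, per 64-bit lane
	# (with mask = 2^26-1, interleaved differently for latency reasons):
	#
	#	c = h3>>26; h3 &= mask; h4 += c
	#	c = h0>>26; h0 &= mask; h1 += c
	#	c = h4>>26; h4 &= mask; h0 += c + (c<<2)	# 5*c, 2^130 = 5 mod p
	#	c = h1>>26; h1 &= mask; h2 += c
	#	c = h2>>26; h2 &= mask; h3 += c
	#	c = h0>>26; h0 &= mask; h1 += c
	#	c = h3>>26; h3 &= mask; h4 += c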
1091
1092	vpsrlq		\$26,$H3,$D3
1093	vpand		$MASK,$H3,$H3
1094	vpaddq		$D3,$H4,$H4		# h3 -> h4
1095
1096	vpsrlq		\$26,$H0,$D0
1097	vpand		$MASK,$H0,$H0
1098	vpaddq		$D0,$D1,$H1		# h0 -> h1
1099
1100	vpsrlq		\$26,$H4,$D0
1101	vpand		$MASK,$H4,$H4
1102
1103	vpsrlq		\$26,$H1,$D1
1104	vpand		$MASK,$H1,$H1
1105	vpaddq		$D1,$H2,$H2		# h1 -> h2
1106
1107	vpaddq		$D0,$H0,$H0
1108	vpsllq		\$2,$D0,$D0
1109	vpaddq		$D0,$H0,$H0		# h4 -> h0
1110
1111	vpsrlq		\$26,$H2,$D2
1112	vpand		$MASK,$H2,$H2
1113	vpaddq		$D2,$H3,$H3		# h2 -> h3
1114
1115	vpsrlq		\$26,$H0,$D0
1116	vpand		$MASK,$H0,$H0
1117	vpaddq		$D0,$H1,$H1		# h0 -> h1
1118
1119	vpsrlq		\$26,$H3,$D3
1120	vpand		$MASK,$H3,$H3
1121	vpaddq		$D3,$H4,$H4		# h3 -> h4
1122
1123	ja		.Loop_avx
1124
1125.Lskip_loop_avx:
1126	################################################################
1127	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1128
1129	vpshufd		\$0x10,$D4,$D4		# r0^n, xx12 -> x1x2
1130	add		\$32,$len
1131	jnz		.Long_tail_avx
1132
1133	vpaddq		$H2,$T2,$T2
1134	vpaddq		$H0,$T0,$T0
1135	vpaddq		$H1,$T1,$T1
1136	vpaddq		$H3,$T3,$T3
1137	vpaddq		$H4,$T4,$T4
1138
1139.Long_tail_avx:
1140	vmovdqa		$H2,0x20(%r11)
1141	vmovdqa		$H0,0x00(%r11)
1142	vmovdqa		$H1,0x10(%r11)
1143	vmovdqa		$H3,0x30(%r11)
1144	vmovdqa		$H4,0x40(%r11)
1145
1146	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1147	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1148	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1149	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1150	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1151
1152	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
1153	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
1154	 vpshufd	\$0x10,`16*1-64`($ctx),$H2		# r1^n
1155	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
1156	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
1157	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
1158
1159	vpmuludq	$T3,$H2,$H0		# h3*r1
1160	vpaddq		$H0,$D4,$D4		# d4 += h3*r1
1161	 vpshufd	\$0x10,`16*2-64`($ctx),$H3		# s1^n
1162	vpmuludq	$T2,$H2,$H1		# h2*r1
1163	vpaddq		$H1,$D3,$D3		# d3 += h2*r1
1164	 vpshufd	\$0x10,`16*3-64`($ctx),$H4		# r2^n
1165	vpmuludq	$T1,$H2,$H0		# h1*r1
1166	vpaddq		$H0,$D2,$D2		# d2 += h1*r1
1167	vpmuludq	$T0,$H2,$H2		# h0*r1
1168	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
1169	vpmuludq	$T4,$H3,$H3		# h4*s1
1170	vpaddq		$H3,$D0,$D0		# d0 += h4*s1
1171
1172	 vpshufd	\$0x10,`16*4-64`($ctx),$H2		# s2^n
1173	vpmuludq	$T2,$H4,$H1		# h2*r2
1174	vpaddq		$H1,$D4,$D4		# d4 += h2*r2
1175	vpmuludq	$T1,$H4,$H0		# h1*r2
1176	vpaddq		$H0,$D3,$D3		# d3 += h1*r2
1177	 vpshufd	\$0x10,`16*5-64`($ctx),$H3		# r3^n
1178	vpmuludq	$T0,$H4,$H4		# h0*r2
1179	vpaddq		$H4,$D2,$D2		# d2 += h0*r2
1180	vpmuludq	$T4,$H2,$H1		# h4*s2
1181	vpaddq		$H1,$D1,$D1		# d1 += h4*s2
1182	 vpshufd	\$0x10,`16*6-64`($ctx),$H4		# s3^n
1183	vpmuludq	$T3,$H2,$H2		# h3*s2
1184	vpaddq		$H2,$D0,$D0		# d0 += h3*s2
1185
1186	vpmuludq	$T1,$H3,$H0		# h1*r3
1187	vpaddq		$H0,$D4,$D4		# d4 += h1*r3
1188	vpmuludq	$T0,$H3,$H3		# h0*r3
1189	vpaddq		$H3,$D3,$D3		# d3 += h0*r3
1190	 vpshufd	\$0x10,`16*7-64`($ctx),$H2		# r4^n
1191	vpmuludq	$T4,$H4,$H1		# h4*s3
1192	vpaddq		$H1,$D2,$D2		# d2 += h4*s3
1193	 vpshufd	\$0x10,`16*8-64`($ctx),$H3		# s4^n
1194	vpmuludq	$T3,$H4,$H0		# h3*s3
1195	vpaddq		$H0,$D1,$D1		# d1 += h3*s3
1196	vpmuludq	$T2,$H4,$H4		# h2*s3
1197	vpaddq		$H4,$D0,$D0		# d0 += h2*s3
1198
1199	vpmuludq	$T0,$H2,$H2		# h0*r4
1200	vpaddq		$H2,$D4,$D4		# h4 = d4 + h0*r4
1201	vpmuludq	$T4,$H3,$H1		# h4*s4
1202	vpaddq		$H1,$D3,$D3		# h3 = d3 + h4*s4
1203	vpmuludq	$T3,$H3,$H0		# h3*s4
1204	vpaddq		$H0,$D2,$D2		# h2 = d2 + h3*s4
1205	vpmuludq	$T2,$H3,$H1		# h2*s4
1206	vpaddq		$H1,$D1,$D1		# h1 = d1 + h2*s4
1207	vpmuludq	$T1,$H3,$H3		# h1*s4
1208	vpaddq		$H3,$D0,$D0		# h0 = d0 + h1*s4
1209
1210	jz		.Lshort_tail_avx
1211
1212	vmovdqu		16*0($inp),$H0		# load input
1213	vmovdqu		16*1($inp),$H1
1214
1215	vpsrldq		\$6,$H0,$H2		# splat input
1216	vpsrldq		\$6,$H1,$H3
1217	vpunpckhqdq	$H1,$H0,$H4		# 4
1218	vpunpcklqdq	$H1,$H0,$H0		# 0:1
1219	vpunpcklqdq	$H3,$H2,$H3		# 2:3
1220
1221	vpsrlq		\$40,$H4,$H4		# 4
1222	vpsrlq		\$26,$H0,$H1
1223	vpand		$MASK,$H0,$H0		# 0
1224	vpsrlq		\$4,$H3,$H2
1225	vpand		$MASK,$H1,$H1		# 1
1226	vpsrlq		\$30,$H3,$H3
1227	vpand		$MASK,$H2,$H2		# 2
1228	vpand		$MASK,$H3,$H3		# 3
1229	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
1230
1231	vpshufd		\$0x32,`16*0-64`($ctx),$T4	# r0^n, 34xx -> x3x4
1232	vpaddq		0x00(%r11),$H0,$H0
1233	vpaddq		0x10(%r11),$H1,$H1
1234	vpaddq		0x20(%r11),$H2,$H2
1235	vpaddq		0x30(%r11),$H3,$H3
1236	vpaddq		0x40(%r11),$H4,$H4
1237
1238	################################################################
1239	# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1240
1241	vpmuludq	$H0,$T4,$T0		# h0*r0
1242	vpaddq		$T0,$D0,$D0		# d0 += h0*r0
1243	vpmuludq	$H1,$T4,$T1		# h1*r0
1244	vpaddq		$T1,$D1,$D1		# d1 += h1*r0
1245	vpmuludq	$H2,$T4,$T0		# h2*r0
1246	vpaddq		$T0,$D2,$D2		# d2 += h2*r0
1247	 vpshufd	\$0x32,`16*1-64`($ctx),$T2		# r1^n
1248	vpmuludq	$H3,$T4,$T1		# h3*r0
1249	vpaddq		$T1,$D3,$D3		# d3 += h3*r0
1250	vpmuludq	$H4,$T4,$T4		# h4*r0
1251	vpaddq		$T4,$D4,$D4		# d4 += h4*r0
1252
1253	vpmuludq	$H3,$T2,$T0		# h3*r1
1254	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
1255	 vpshufd	\$0x32,`16*2-64`($ctx),$T3		# s1
1256	vpmuludq	$H2,$T2,$T1		# h2*r1
1257	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
1258	 vpshufd	\$0x32,`16*3-64`($ctx),$T4		# r2
1259	vpmuludq	$H1,$T2,$T0		# h1*r1
1260	vpaddq		$T0,$D2,$D2		# d2 += h1*r1
1261	vpmuludq	$H0,$T2,$T2		# h0*r1
1262	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
1263	vpmuludq	$H4,$T3,$T3		# h4*s1
1264	vpaddq		$T3,$D0,$D0		# d0 += h4*s1
1265
1266	 vpshufd	\$0x32,`16*4-64`($ctx),$T2		# s2
1267	vpmuludq	$H2,$T4,$T1		# h2*r2
1268	vpaddq		$T1,$D4,$D4		# d4 += h2*r2
1269	vpmuludq	$H1,$T4,$T0		# h1*r2
1270	vpaddq		$T0,$D3,$D3		# d3 += h1*r2
1271	 vpshufd	\$0x32,`16*5-64`($ctx),$T3		# r3
1272	vpmuludq	$H0,$T4,$T4		# h0*r2
1273	vpaddq		$T4,$D2,$D2		# d2 += h0*r2
1274	vpmuludq	$H4,$T2,$T1		# h4*s2
1275	vpaddq		$T1,$D1,$D1		# d1 += h4*s2
1276	 vpshufd	\$0x32,`16*6-64`($ctx),$T4		# s3
1277	vpmuludq	$H3,$T2,$T2		# h3*s2
1278	vpaddq		$T2,$D0,$D0		# d0 += h3*s2
1279
1280	vpmuludq	$H1,$T3,$T0		# h1*r3
1281	vpaddq		$T0,$D4,$D4		# d4 += h1*r3
1282	vpmuludq	$H0,$T3,$T3		# h0*r3
1283	vpaddq		$T3,$D3,$D3		# d3 += h0*r3
1284	 vpshufd	\$0x32,`16*7-64`($ctx),$T2		# r4
1285	vpmuludq	$H4,$T4,$T1		# h4*s3
1286	vpaddq		$T1,$D2,$D2		# d2 += h4*s3
1287	 vpshufd	\$0x32,`16*8-64`($ctx),$T3		# s4
1288	vpmuludq	$H3,$T4,$T0		# h3*s3
1289	vpaddq		$T0,$D1,$D1		# d1 += h3*s3
1290	vpmuludq	$H2,$T4,$T4		# h2*s3
1291	vpaddq		$T4,$D0,$D0		# d0 += h2*s3
1292
1293	vpmuludq	$H0,$T2,$T2		# h0*r4
1294	vpaddq		$T2,$D4,$D4		# d4 += h0*r4
1295	vpmuludq	$H4,$T3,$T1		# h4*s4
1296	vpaddq		$T1,$D3,$D3		# d3 += h4*s4
1297	vpmuludq	$H3,$T3,$T0		# h3*s4
1298	vpaddq		$T0,$D2,$D2		# d2 += h3*s4
1299	vpmuludq	$H2,$T3,$T1		# h2*s4
1300	vpaddq		$T1,$D1,$D1		# d1 += h2*s4
1301	vpmuludq	$H1,$T3,$T3		# h1*s4
1302	vpaddq		$T3,$D0,$D0		# d0 += h1*s4
1303
1304.Lshort_tail_avx:
1305	################################################################
1306	# horizontal addition
1307
1308	vpsrldq		\$8,$D4,$T4
1309	vpsrldq		\$8,$D3,$T3
1310	vpsrldq		\$8,$D1,$T1
1311	vpsrldq		\$8,$D0,$T0
1312	vpsrldq		\$8,$D2,$T2
1313	vpaddq		$T3,$D3,$D3
1314	vpaddq		$T4,$D4,$D4
1315	vpaddq		$T0,$D0,$D0
1316	vpaddq		$T1,$D1,$D1
1317	vpaddq		$T2,$D2,$D2
1318
1319	################################################################
1320	# lazy reduction
1321
1322	vpsrlq		\$26,$D3,$H3
1323	vpand		$MASK,$D3,$D3
1324	vpaddq		$H3,$D4,$D4		# h3 -> h4
1325
1326	vpsrlq		\$26,$D0,$H0
1327	vpand		$MASK,$D0,$D0
1328	vpaddq		$H0,$D1,$D1		# h0 -> h1
1329
1330	vpsrlq		\$26,$D4,$H4
1331	vpand		$MASK,$D4,$D4
1332
1333	vpsrlq		\$26,$D1,$H1
1334	vpand		$MASK,$D1,$D1
1335	vpaddq		$H1,$D2,$D2		# h1 -> h2
1336
1337	vpaddq		$H4,$D0,$D0
1338	vpsllq		\$2,$H4,$H4
1339	vpaddq		$H4,$D0,$D0		# h4 -> h0
1340
1341	vpsrlq		\$26,$D2,$H2
1342	vpand		$MASK,$D2,$D2
1343	vpaddq		$H2,$D3,$D3		# h2 -> h3
1344
1345	vpsrlq		\$26,$D0,$H0
1346	vpand		$MASK,$D0,$D0
1347	vpaddq		$H0,$D1,$D1		# h0 -> h1
1348
1349	vpsrlq		\$26,$D3,$H3
1350	vpand		$MASK,$D3,$D3
1351	vpaddq		$H3,$D4,$D4		# h3 -> h4
1352
1353	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
1354	vmovd		$D1,`4*1-48-64`($ctx)
1355	vmovd		$D2,`4*2-48-64`($ctx)
1356	vmovd		$D3,`4*3-48-64`($ctx)
1357	vmovd		$D4,`4*4-48-64`($ctx)
1358___
1359$code.=<<___	if ($win64);
1360	vmovdqa		0x50(%r11),%xmm6
1361	vmovdqa		0x60(%r11),%xmm7
1362	vmovdqa		0x70(%r11),%xmm8
1363	vmovdqa		0x80(%r11),%xmm9
1364	vmovdqa		0x90(%r11),%xmm10
1365	vmovdqa		0xa0(%r11),%xmm11
1366	vmovdqa		0xb0(%r11),%xmm12
1367	vmovdqa		0xc0(%r11),%xmm13
1368	vmovdqa		0xd0(%r11),%xmm14
1369	vmovdqa		0xe0(%r11),%xmm15
1370	lea		0xf8(%r11),%rsp
1371.Ldo_avx_epilogue:
1372___
1373$code.=<<___	if (!$win64);
1374	lea		0x58(%r11),%rsp
1375.cfi_def_cfa		%rsp,8
1376___
1377$code.=<<___;
1378	vzeroupper
1379	ret
1380.cfi_endproc
1381.size	poly1305_blocks_avx,.-poly1305_blocks_avx
1382
1383.type	poly1305_emit_avx,\@function,3
1384.align	32
1385poly1305_emit_avx:
1386.cfi_startproc
1387	cmpl	\$0,20($ctx)	# is_base2_26?
1388	je	.Lemit
1389
1390	mov	0($ctx),%eax	# load hash value base 2^26
1391	mov	4($ctx),%ecx
1392	mov	8($ctx),%r8d
1393	mov	12($ctx),%r11d
1394	mov	16($ctx),%r10d
1395
1396	shl	\$26,%rcx	# base 2^26 -> base 2^64
1397	mov	%r8,%r9
1398	shl	\$52,%r8
1399	add	%rcx,%rax
1400	shr	\$12,%r9
1401	add	%rax,%r8	# h0
1402	adc	\$0,%r9
1403
1404	shl	\$14,%r11
1405	mov	%r10,%rax
1406	shr	\$24,%r10
1407	add	%r11,%r9
1408	shl	\$40,%rax
1409	add	%rax,%r9	# h1
1410	adc	\$0,%r10	# h2
1411
1412	mov	%r10,%rax	# could be partially reduced, so reduce
1413	mov	%r10,%rcx
1414	and	\$3,%r10
1415	shr	\$2,%rax
1416	and	\$-4,%rcx
1417	add	%rcx,%rax
1418	add	%rax,%r8
1419	adc	\$0,%r9
1420	adc	\$0,%r10
1421
1422	mov	%r8,%rax
1423	add	\$5,%r8		# compare to modulus
1424	mov	%r9,%rcx
1425	adc	\$0,%r9
1426	adc	\$0,%r10
1427	shr	\$2,%r10	# did 130-bit value overflow?
1428	cmovnz	%r8,%rax
1429	cmovnz	%r9,%rcx
1430
1431	add	0($nonce),%rax	# accumulate nonce
1432	adc	8($nonce),%rcx
1433	mov	%rax,0($mac)	# write result
1434	mov	%rcx,8($mac)
1435
1436	ret
1437.cfi_endproc
1438.size	poly1305_emit_avx,.-poly1305_emit_avx
1439___
1440
1441if ($avx>1) {
1442my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1443    map("%ymm$_",(0..15));
1444my $S4=$MASK;
1445
1446$code.=<<___;
1447.type	poly1305_blocks_avx2,\@function,4
1448.align	32
1449poly1305_blocks_avx2:
1450.cfi_startproc
1451	mov	20($ctx),%r8d		# is_base2_26
1452	cmp	\$128,$len
1453	jae	.Lblocks_avx2
1454	test	%r8d,%r8d
1455	jz	.Lblocks
1456
1457.Lblocks_avx2:
1458	and	\$-16,$len
1459	jz	.Lno_data_avx2
1460
1461	vzeroupper
1462
1463	test	%r8d,%r8d
1464	jz	.Lbase2_64_avx2
1465
1466	test	\$63,$len
1467	jz	.Leven_avx2
1468
1469	push	%rbx
1470.cfi_push	%rbx
1471	push	%rbp
1472.cfi_push	%rbp
1473	push	%r12
1474.cfi_push	%r12
1475	push	%r13
1476.cfi_push	%r13
1477	push	%r14
1478.cfi_push	%r14
1479	push	%r15
1480.cfi_push	%r15
1481.Lblocks_avx2_body:
1482
1483	mov	$len,%r15		# reassign $len
1484
1485	mov	0($ctx),$d1		# load hash value
1486	mov	8($ctx),$d2
1487	mov	16($ctx),$h2#d
1488
1489	mov	24($ctx),$r0		# load r
1490	mov	32($ctx),$s1
1491
1492	################################# base 2^26 -> base 2^64
1493	mov	$d1#d,$h0#d
1494	and	\$`-1*(1<<31)`,$d1
1495	mov	$d2,$r1			# borrow $r1
1496	mov	$d2#d,$h1#d
1497	and	\$`-1*(1<<31)`,$d2
1498
1499	shr	\$6,$d1
1500	shl	\$52,$r1
1501	add	$d1,$h0
1502	shr	\$12,$h1
1503	shr	\$18,$d2
1504	add	$r1,$h0
1505	adc	$d2,$h1
1506
1507	mov	$h2,$d1
1508	shl	\$40,$d1
1509	shr	\$24,$h2
1510	add	$d1,$h1
1511	adc	\$0,$h2			# can be partially reduced...
1512
1513	mov	\$-4,$d2		# ... so reduce
1514	mov	$h2,$d1
1515	and	$h2,$d2
1516	shr	\$2,$d1
1517	and	\$3,$h2
1518	add	$d2,$d1			# =*5
1519	add	$d1,$h0
1520	adc	\$0,$h1
1521	adc	\$0,$h2
1522
1523	mov	$s1,$r1
1524	mov	$s1,%rax
1525	shr	\$2,$s1
1526	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
1527
1528.Lbase2_26_pre_avx2:
1529	add	0($inp),$h0		# accumulate input
1530	adc	8($inp),$h1
1531	lea	16($inp),$inp
1532	adc	$padbit,$h2
1533	sub	\$16,%r15
1534
1535	call	__poly1305_block
1536	mov	$r1,%rax
1537
1538	test	\$63,%r15
1539	jnz	.Lbase2_26_pre_avx2
1540
1541	test	$padbit,$padbit		# if $padbit is zero,
1542	jz	.Lstore_base2_64_avx2	# store hash in base 2^64 format
1543
1544	################################# base 2^64 -> base 2^26
1545	mov	$h0,%rax
1546	mov	$h0,%rdx
1547	shr	\$52,$h0
1548	mov	$h1,$r0
1549	mov	$h1,$r1
1550	shr	\$26,%rdx
1551	and	\$0x3ffffff,%rax	# h[0]
1552	shl	\$12,$r0
1553	and	\$0x3ffffff,%rdx	# h[1]
1554	shr	\$14,$h1
1555	or	$r0,$h0
1556	shl	\$24,$h2
1557	and	\$0x3ffffff,$h0		# h[2]
1558	shr	\$40,$r1
1559	and	\$0x3ffffff,$h1		# h[3]
1560	or	$r1,$h2			# h[4]
1561
1562	test	%r15,%r15
1563	jz	.Lstore_base2_26_avx2
1564
1565	vmovd	%rax#d,%x#$H0
1566	vmovd	%rdx#d,%x#$H1
1567	vmovd	$h0#d,%x#$H2
1568	vmovd	$h1#d,%x#$H3
1569	vmovd	$h2#d,%x#$H4
1570	jmp	.Lproceed_avx2
1571
1572.align	32
1573.Lstore_base2_64_avx2:
1574	mov	$h0,0($ctx)
1575	mov	$h1,8($ctx)
1576	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
1577	jmp	.Ldone_avx2
1578
1579.align	16
1580.Lstore_base2_26_avx2:
1581	mov	%rax#d,0($ctx)		# store hash value base 2^26
1582	mov	%rdx#d,4($ctx)
1583	mov	$h0#d,8($ctx)
1584	mov	$h1#d,12($ctx)
1585	mov	$h2#d,16($ctx)
1586.align	16
1587.Ldone_avx2:
1588	mov	0(%rsp),%r15
1589.cfi_restore	%r15
1590	mov	8(%rsp),%r14
1591.cfi_restore	%r14
1592	mov	16(%rsp),%r13
1593.cfi_restore	%r13
1594	mov	24(%rsp),%r12
1595.cfi_restore	%r12
1596	mov	32(%rsp),%rbp
1597.cfi_restore	%rbp
1598	mov	40(%rsp),%rbx
1599.cfi_restore	%rbx
1600	lea	48(%rsp),%rsp
1601.cfi_adjust_cfa_offset	-48
1602.Lno_data_avx2:
1603.Lblocks_avx2_epilogue:
1604	ret
1605.cfi_endproc
1606
1607.align	32
1608.Lbase2_64_avx2:
1609.cfi_startproc
1610	push	%rbx
1611.cfi_push	%rbx
1612	push	%rbp
1613.cfi_push	%rbp
1614	push	%r12
1615.cfi_push	%r12
1616	push	%r13
1617.cfi_push	%r13
1618	push	%r14
1619.cfi_push	%r14
1620	push	%r15
1621.cfi_push	%r15
1622.Lbase2_64_avx2_body:
1623
1624	mov	$len,%r15		# reassign $len
1625
1626	mov	24($ctx),$r0		# load r
1627	mov	32($ctx),$s1
1628
1629	mov	0($ctx),$h0		# load hash value
1630	mov	8($ctx),$h1
1631	mov	16($ctx),$h2#d
1632
1633	mov	$s1,$r1
1634	mov	$s1,%rax
1635	shr	\$2,$s1
1636	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
1637
1638	test	\$63,$len
1639	jz	.Linit_avx2
1640
1641.Lbase2_64_pre_avx2:
1642	add	0($inp),$h0		# accumulate input
1643	adc	8($inp),$h1
1644	lea	16($inp),$inp
1645	adc	$padbit,$h2
1646	sub	\$16,%r15
1647
1648	call	__poly1305_block
1649	mov	$r1,%rax
1650
1651	test	\$63,%r15
1652	jnz	.Lbase2_64_pre_avx2
1653
1654.Linit_avx2:
1655	################################# base 2^64 -> base 2^26
1656	mov	$h0,%rax
1657	mov	$h0,%rdx
1658	shr	\$52,$h0
1659	mov	$h1,$d1
1660	mov	$h1,$d2
1661	shr	\$26,%rdx
1662	and	\$0x3ffffff,%rax	# h[0]
1663	shl	\$12,$d1
1664	and	\$0x3ffffff,%rdx	# h[1]
1665	shr	\$14,$h1
1666	or	$d1,$h0
1667	shl	\$24,$h2
1668	and	\$0x3ffffff,$h0		# h[2]
1669	shr	\$40,$d2
1670	and	\$0x3ffffff,$h1		# h[3]
1671	or	$d2,$h2			# h[4]
1672
1673	vmovd	%rax#d,%x#$H0
1674	vmovd	%rdx#d,%x#$H1
1675	vmovd	$h0#d,%x#$H2
1676	vmovd	$h1#d,%x#$H3
1677	vmovd	$h2#d,%x#$H4
1678	movl	\$1,20($ctx)		# set is_base2_26
1679
1680	call	__poly1305_init_avx
1681
1682.Lproceed_avx2:
1683	mov	%r15,$len			# restore $len
1684	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
1685	mov	\$`(1<<31|1<<30|1<<16)`,%r11d
1686
1687	mov	0(%rsp),%r15
1688.cfi_restore	%r15
1689	mov	8(%rsp),%r14
1690.cfi_restore	%r14
1691	mov	16(%rsp),%r13
1692.cfi_restore	%r13
1693	mov	24(%rsp),%r12
1694.cfi_restore	%r12
1695	mov	32(%rsp),%rbp
1696.cfi_restore	%rbp
1697	mov	40(%rsp),%rbx
1698.cfi_restore	%rbx
1699	lea	48(%rsp),%rax
1700	lea	48(%rsp),%rsp
1701.cfi_adjust_cfa_offset	-48
1702.Lbase2_64_avx2_epilogue:
1703	jmp	.Ldo_avx2
1704.cfi_endproc
1705
1706.align	32
1707.Leven_avx2:
1708.cfi_startproc
1709	mov		OPENSSL_ia32cap_P+8(%rip),%r10d
1710	vmovd		4*0($ctx),%x#$H0	# load hash value base 2^26
1711	vmovd		4*1($ctx),%x#$H1
1712	vmovd		4*2($ctx),%x#$H2
1713	vmovd		4*3($ctx),%x#$H3
1714	vmovd		4*4($ctx),%x#$H4
1715
1716.Ldo_avx2:
1717___
1718$code.=<<___		if ($avx>2);
1719	cmp		\$512,$len
1720	jb		.Lskip_avx512
1721	and		%r11d,%r10d
1722	test		\$`1<<16`,%r10d		# check for AVX512F
1723	jnz		.Lblocks_avx512
1724.Lskip_avx512:
1725___
1726$code.=<<___	if (!$win64);
1727	lea		-8(%rsp),%r11
1728.cfi_def_cfa		%r11,16
1729	sub		\$0x128,%rsp
1730___
1731$code.=<<___	if ($win64);
1732	lea		-0xf8(%rsp),%r11
1733	sub		\$0x1c8,%rsp
1734	vmovdqa		%xmm6,0x50(%r11)
1735	vmovdqa		%xmm7,0x60(%r11)
1736	vmovdqa		%xmm8,0x70(%r11)
1737	vmovdqa		%xmm9,0x80(%r11)
1738	vmovdqa		%xmm10,0x90(%r11)
1739	vmovdqa		%xmm11,0xa0(%r11)
1740	vmovdqa		%xmm12,0xb0(%r11)
1741	vmovdqa		%xmm13,0xc0(%r11)
1742	vmovdqa		%xmm14,0xd0(%r11)
1743	vmovdqa		%xmm15,0xe0(%r11)
1744.Ldo_avx2_body:
1745___
1746$code.=<<___;
1747	lea		.Lconst(%rip),%rcx
1748	lea		48+64($ctx),$ctx	# size optimization
1749	vmovdqa		96(%rcx),$T0		# .Lpermd_avx2
1750
1751	# expand and copy pre-calculated table to stack
1752	vmovdqu		`16*0-64`($ctx),%x#$T2
1753	and		\$-512,%rsp
1754	vmovdqu		`16*1-64`($ctx),%x#$T3
1755	vmovdqu		`16*2-64`($ctx),%x#$T4
1756	vmovdqu		`16*3-64`($ctx),%x#$D0
1757	vmovdqu		`16*4-64`($ctx),%x#$D1
1758	vmovdqu		`16*5-64`($ctx),%x#$D2
1759	lea		0x90(%rsp),%rax		# size optimization
1760	vmovdqu		`16*6-64`($ctx),%x#$D3
1761	vpermd		$T2,$T0,$T2		# 00003412 -> 14243444
1762	vmovdqu		`16*7-64`($ctx),%x#$D4
1763	vpermd		$T3,$T0,$T3
1764	vmovdqu		`16*8-64`($ctx),%x#$MASK
1765	vpermd		$T4,$T0,$T4
1766	vmovdqa		$T2,0x00(%rsp)
1767	vpermd		$D0,$T0,$D0
1768	vmovdqa		$T3,0x20-0x90(%rax)
1769	vpermd		$D1,$T0,$D1
1770	vmovdqa		$T4,0x40-0x90(%rax)
1771	vpermd		$D2,$T0,$D2
1772	vmovdqa		$D0,0x60-0x90(%rax)
1773	vpermd		$D3,$T0,$D3
1774	vmovdqa		$D1,0x80-0x90(%rax)
1775	vpermd		$D4,$T0,$D4
1776	vmovdqa		$D2,0xa0-0x90(%rax)
1777	vpermd		$MASK,$T0,$MASK
1778	vmovdqa		$D3,0xc0-0x90(%rax)
1779	vmovdqa		$D4,0xe0-0x90(%rax)
1780	vmovdqa		$MASK,0x100-0x90(%rax)
1781	vmovdqa		64(%rcx),$MASK		# .Lmask26
1782
1783	################################################################
1784	# load input
1785	vmovdqu		16*0($inp),%x#$T0
1786	vmovdqu		16*1($inp),%x#$T1
1787	vinserti128	\$1,16*2($inp),$T0,$T0
1788	vinserti128	\$1,16*3($inp),$T1,$T1
1789	lea		16*4($inp),$inp
1790
1791	vpsrldq		\$6,$T0,$T2		# splat input
1792	vpsrldq		\$6,$T1,$T3
1793	vpunpckhqdq	$T1,$T0,$T4		# 4
1794	vpunpcklqdq	$T3,$T2,$T2		# 2:3
1795	vpunpcklqdq	$T1,$T0,$T0		# 0:1
1796
1797	vpsrlq		\$30,$T2,$T3
1798	vpsrlq		\$4,$T2,$T2
1799	vpsrlq		\$26,$T0,$T1
1800	vpsrlq		\$40,$T4,$T4		# 4
1801	vpand		$MASK,$T2,$T2		# 2
1802	vpand		$MASK,$T0,$T0		# 0
1803	vpand		$MASK,$T1,$T1		# 1
1804	vpand		$MASK,$T3,$T3		# 3
1805	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
1806
1807	vpaddq		$H2,$T2,$H2		# accumulate input
1808	sub		\$64,$len
1809	jz		.Ltail_avx2
1810	jmp		.Loop_avx2
1811
1812.align	32
1813.Loop_avx2:
1814	################################################################
1815	# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1816	# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1817	# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1818	# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1819	#   \________/\__________/
1820	################################################################
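	# i.e. four independent Horner streams, one per 64-bit lane, each
	# advanced by r^4 inside the loop; .Ltail_avx2 below multiplies the
	# lanes by r^4, r^3, r^2 and r respectively before they are summed
	# horizontally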
1821	#vpaddq		$H2,$T2,$H2		# accumulate input
1822	vpaddq		$H0,$T0,$H0
1823	vmovdqa		`32*0`(%rsp),$T0	# r0^4
1824	vpaddq		$H1,$T1,$H1
1825	vmovdqa		`32*1`(%rsp),$T1	# r1^4
1826	vpaddq		$H3,$T3,$H3
1827	vmovdqa		`32*3`(%rsp),$T2	# r2^4
1828	vpaddq		$H4,$T4,$H4
1829	vmovdqa		`32*6-0x90`(%rax),$T3	# s3^4
1830	vmovdqa		`32*8-0x90`(%rax),$S4	# s4^4
1831
1832	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1833	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1834	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1835	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1836	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1837	#
	# however, as h2 is "chronologically" the first one available, pull
	# the corresponding operations up, so it becomes
1840	#
1841	# d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
1842	# d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
1843	# d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1844	# d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
1845	# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
1846
1847	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
1848	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
1849	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
1850	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
1851	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
1852
1853	vpmuludq	$H0,$T1,$T4		# h0*r1
1854	vpmuludq	$H1,$T1,$H2		# h1*r1, borrow $H2 as temp
1855	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
1856	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
1857	vpmuludq	$H3,$T1,$T4		# h3*r1
1858	vpmuludq	`32*2`(%rsp),$H4,$H2	# h4*s1
1859	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
1860	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
1861	 vmovdqa	`32*4-0x90`(%rax),$T1	# s2
1862
1863	vpmuludq	$H0,$T0,$T4		# h0*r0
1864	vpmuludq	$H1,$T0,$H2		# h1*r0
1865	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
1866	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
1867	vpmuludq	$H3,$T0,$T4		# h3*r0
1868	vpmuludq	$H4,$T0,$H2		# h4*r0
1869	 vmovdqu	16*0($inp),%x#$T0	# load input
1870	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
1871	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
1872	 vinserti128	\$1,16*2($inp),$T0,$T0
1873
1874	vpmuludq	$H3,$T1,$T4		# h3*s2
1875	vpmuludq	$H4,$T1,$H2		# h4*s2
1876	 vmovdqu	16*1($inp),%x#$T1
1877	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
1878	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
1879	 vmovdqa	`32*5-0x90`(%rax),$H2	# r3
1880	vpmuludq	$H1,$T2,$T4		# h1*r2
1881	vpmuludq	$H0,$T2,$T2		# h0*r2
1882	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
1883	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
1884	 vinserti128	\$1,16*3($inp),$T1,$T1
1885	 lea		16*4($inp),$inp
1886
1887	vpmuludq	$H1,$H2,$T4		# h1*r3
1888	vpmuludq	$H0,$H2,$H2		# h0*r3
1889	 vpsrldq	\$6,$T0,$T2		# splat input
1890	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
1891	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
1892	vpmuludq	$H3,$T3,$T4		# h3*s3
1893	vpmuludq	$H4,$T3,$H2		# h4*s3
1894	 vpsrldq	\$6,$T1,$T3
1895	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
1896	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
1897	 vpunpckhqdq	$T1,$T0,$T4		# 4
1898
1899	vpmuludq	$H3,$S4,$H3		# h3*s4
1900	vpmuludq	$H4,$S4,$H4		# h4*s4
1901	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
1904	 vpunpcklqdq	$T3,$T2,$T3		# 2:3
1905	vpmuludq	`32*7-0x90`(%rax),$H0,$H4	# h0*r4
1906	vpmuludq	$H1,$S4,$H0		# h1*s4
1907	vmovdqa		64(%rcx),$MASK		# .Lmask26
1908	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
1909	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
1910
1911	################################################################
1912	# lazy reduction (interleaved with tail of input splat)
1913
1914	vpsrlq		\$26,$H3,$D3
1915	vpand		$MASK,$H3,$H3
1916	vpaddq		$D3,$H4,$H4		# h3 -> h4
1917
1918	vpsrlq		\$26,$H0,$D0
1919	vpand		$MASK,$H0,$H0
1920	vpaddq		$D0,$D1,$H1		# h0 -> h1
1921
1922	vpsrlq		\$26,$H4,$D4
1923	vpand		$MASK,$H4,$H4
1924
1925	 vpsrlq		\$4,$T3,$T2
1926
1927	vpsrlq		\$26,$H1,$D1
1928	vpand		$MASK,$H1,$H1
1929	vpaddq		$D1,$H2,$H2		# h1 -> h2
1930
1931	vpaddq		$D4,$H0,$H0
1932	vpsllq		\$2,$D4,$D4
1933	vpaddq		$D4,$H0,$H0		# h4 -> h0
1934
1935	 vpand		$MASK,$T2,$T2		# 2
1936	 vpsrlq		\$26,$T0,$T1
1937
1938	vpsrlq		\$26,$H2,$D2
1939	vpand		$MASK,$H2,$H2
1940	vpaddq		$D2,$H3,$H3		# h2 -> h3
1941
1942	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
1943	 vpsrlq		\$30,$T3,$T3
1944
1945	vpsrlq		\$26,$H0,$D0
1946	vpand		$MASK,$H0,$H0
1947	vpaddq		$D0,$H1,$H1		# h0 -> h1
1948
1949	 vpsrlq		\$40,$T4,$T4		# 4
1950
1951	vpsrlq		\$26,$H3,$D3
1952	vpand		$MASK,$H3,$H3
1953	vpaddq		$D3,$H4,$H4		# h3 -> h4
1954
1955	 vpand		$MASK,$T0,$T0		# 0
1956	 vpand		$MASK,$T1,$T1		# 1
1957	 vpand		$MASK,$T3,$T3		# 3
1958	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
1959
1960	sub		\$64,$len
1961	jnz		.Loop_avx2
1962
1963	.byte		0x66,0x90
1964.Ltail_avx2:
1965	################################################################
	# while the multiplications above were by r^4 in all lanes, in the
	# last iteration we multiply the least significant lane by r^4 and
	# the most significant one by r, so this is a copy of the above
	# except that references to the precomputed table are displaced by 4...
1970
1971	#vpaddq		$H2,$T2,$H2		# accumulate input
1972	vpaddq		$H0,$T0,$H0
1973	vmovdqu		`32*0+4`(%rsp),$T0	# r0^4
1974	vpaddq		$H1,$T1,$H1
1975	vmovdqu		`32*1+4`(%rsp),$T1	# r1^4
1976	vpaddq		$H3,$T3,$H3
1977	vmovdqu		`32*3+4`(%rsp),$T2	# r2^4
1978	vpaddq		$H4,$T4,$H4
1979	vmovdqu		`32*6+4-0x90`(%rax),$T3	# s3^4
1980	vmovdqu		`32*8+4-0x90`(%rax),$S4	# s4^4
1981
1982	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
1983	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
1984	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
1985	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
1986	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
1987
1988	vpmuludq	$H0,$T1,$T4		# h0*r1
1989	vpmuludq	$H1,$T1,$H2		# h1*r1
1990	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
1991	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
1992	vpmuludq	$H3,$T1,$T4		# h3*r1
1993	vpmuludq	`32*2+4`(%rsp),$H4,$H2	# h4*s1
1994	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
1995	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
1996
1997	vpmuludq	$H0,$T0,$T4		# h0*r0
1998	vpmuludq	$H1,$T0,$H2		# h1*r0
1999	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
2000	 vmovdqu	`32*4+4-0x90`(%rax),$T1	# s2
2001	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
2002	vpmuludq	$H3,$T0,$T4		# h3*r0
2003	vpmuludq	$H4,$T0,$H2		# h4*r0
2004	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
2005	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
2006
2007	vpmuludq	$H3,$T1,$T4		# h3*s2
2008	vpmuludq	$H4,$T1,$H2		# h4*s2
2009	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
2010	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
2011	 vmovdqu	`32*5+4-0x90`(%rax),$H2	# r3
2012	vpmuludq	$H1,$T2,$T4		# h1*r2
2013	vpmuludq	$H0,$T2,$T2		# h0*r2
2014	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
2015	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
2016
2017	vpmuludq	$H1,$H2,$T4		# h1*r3
2018	vpmuludq	$H0,$H2,$H2		# h0*r3
2019	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
2020	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
2021	vpmuludq	$H3,$T3,$T4		# h3*s3
2022	vpmuludq	$H4,$T3,$H2		# h4*s3
2023	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
2024	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
2025
2026	vpmuludq	$H3,$S4,$H3		# h3*s4
2027	vpmuludq	$H4,$S4,$H4		# h4*s4
	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
2030	vpmuludq	`32*7+4-0x90`(%rax),$H0,$H4		# h0*r4
2031	vpmuludq	$H1,$S4,$H0		# h1*s4
2032	vmovdqa		64(%rcx),$MASK		# .Lmask26
2033	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
2034	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
2035
2036	################################################################
2037	# horizontal addition
2038
2039	vpsrldq		\$8,$D1,$T1
2040	vpsrldq		\$8,$H2,$T2
2041	vpsrldq		\$8,$H3,$T3
2042	vpsrldq		\$8,$H4,$T4
2043	vpsrldq		\$8,$H0,$T0
2044	vpaddq		$T1,$D1,$D1
2045	vpaddq		$T2,$H2,$H2
2046	vpaddq		$T3,$H3,$H3
2047	vpaddq		$T4,$H4,$H4
2048	vpaddq		$T0,$H0,$H0
2049
2050	vpermq		\$0x2,$H3,$T3
2051	vpermq		\$0x2,$H4,$T4
2052	vpermq		\$0x2,$H0,$T0
2053	vpermq		\$0x2,$D1,$T1
2054	vpermq		\$0x2,$H2,$T2
2055	vpaddq		$T3,$H3,$H3
2056	vpaddq		$T4,$H4,$H4
2057	vpaddq		$T0,$H0,$H0
2058	vpaddq		$T1,$D1,$D1
2059	vpaddq		$T2,$H2,$H2
2060
2061	################################################################
2062	# lazy reduction
2063
2064	vpsrlq		\$26,$H3,$D3
2065	vpand		$MASK,$H3,$H3
2066	vpaddq		$D3,$H4,$H4		# h3 -> h4
2067
2068	vpsrlq		\$26,$H0,$D0
2069	vpand		$MASK,$H0,$H0
2070	vpaddq		$D0,$D1,$H1		# h0 -> h1
2071
2072	vpsrlq		\$26,$H4,$D4
2073	vpand		$MASK,$H4,$H4
2074
2075	vpsrlq		\$26,$H1,$D1
2076	vpand		$MASK,$H1,$H1
2077	vpaddq		$D1,$H2,$H2		# h1 -> h2
2078
2079	vpaddq		$D4,$H0,$H0
2080	vpsllq		\$2,$D4,$D4
2081	vpaddq		$D4,$H0,$H0		# h4 -> h0
2082
2083	vpsrlq		\$26,$H2,$D2
2084	vpand		$MASK,$H2,$H2
2085	vpaddq		$D2,$H3,$H3		# h2 -> h3
2086
2087	vpsrlq		\$26,$H0,$D0
2088	vpand		$MASK,$H0,$H0
2089	vpaddq		$D0,$H1,$H1		# h0 -> h1
2090
2091	vpsrlq		\$26,$H3,$D3
2092	vpand		$MASK,$H3,$H3
2093	vpaddq		$D3,$H4,$H4		# h3 -> h4
2094
2095	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
2096	vmovd		%x#$H1,`4*1-48-64`($ctx)
2097	vmovd		%x#$H2,`4*2-48-64`($ctx)
2098	vmovd		%x#$H3,`4*3-48-64`($ctx)
2099	vmovd		%x#$H4,`4*4-48-64`($ctx)
2100___
2101$code.=<<___	if ($win64);
2102	vmovdqa		0x50(%r11),%xmm6
2103	vmovdqa		0x60(%r11),%xmm7
2104	vmovdqa		0x70(%r11),%xmm8
2105	vmovdqa		0x80(%r11),%xmm9
2106	vmovdqa		0x90(%r11),%xmm10
2107	vmovdqa		0xa0(%r11),%xmm11
2108	vmovdqa		0xb0(%r11),%xmm12
2109	vmovdqa		0xc0(%r11),%xmm13
2110	vmovdqa		0xd0(%r11),%xmm14
2111	vmovdqa		0xe0(%r11),%xmm15
2112	lea		0xf8(%r11),%rsp
2113.Ldo_avx2_epilogue:
2114___
2115$code.=<<___	if (!$win64);
2116	lea		8(%r11),%rsp
2117.cfi_def_cfa		%rsp,8
2118___
2119$code.=<<___;
2120	vzeroupper
2121	ret
2122.cfi_endproc
2123.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
2124___
2125#######################################################################
2126if ($avx>2) {
# On entry the input length is divisible by 64. But since the inner loop
# processes 128 bytes per iteration, lengths that are not divisible by
# 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
# reason the stack layout is kept identical to poly1305_blocks_avx2's.
# If not for this tail, we wouldn't even have to allocate a stack frame...
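# (In other words, when the 64-byte block count is odd, the final 64
# bytes are finished by the AVX2 tail code, which expects the table of
# key powers at the same stack offsets it would have laid out itself.)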
2132
2133my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2134my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2135my $PADBIT="%zmm30";
2136
2137map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));		# switch to %zmm domain
2138map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2139map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2140map(s/%y/%z/,($MASK));
2141
2142$code.=<<___;
2143.type	poly1305_blocks_avx512,\@function,4
2144.align	32
2145poly1305_blocks_avx512:
2146.cfi_startproc
2147.Lblocks_avx512:
2148	mov		\$15,%eax
2149	kmovw		%eax,%k2
2150___
2151$code.=<<___	if (!$win64);
2152	lea		-8(%rsp),%r11
2153.cfi_def_cfa		%r11,16
2154	sub		\$0x128,%rsp
2155___
2156$code.=<<___	if ($win64);
2157	lea		-0xf8(%rsp),%r11
2158	sub		\$0x1c8,%rsp
2159	vmovdqa		%xmm6,0x50(%r11)
2160	vmovdqa		%xmm7,0x60(%r11)
2161	vmovdqa		%xmm8,0x70(%r11)
2162	vmovdqa		%xmm9,0x80(%r11)
2163	vmovdqa		%xmm10,0x90(%r11)
2164	vmovdqa		%xmm11,0xa0(%r11)
2165	vmovdqa		%xmm12,0xb0(%r11)
2166	vmovdqa		%xmm13,0xc0(%r11)
2167	vmovdqa		%xmm14,0xd0(%r11)
2168	vmovdqa		%xmm15,0xe0(%r11)
2169.Ldo_avx512_body:
2170___
2171$code.=<<___;
2172	lea		.Lconst(%rip),%rcx
2173	lea		48+64($ctx),$ctx	# size optimization
2174	vmovdqa		96(%rcx),%y#$T2		# .Lpermd_avx2
2175
2176	# expand pre-calculated table
2177	vmovdqu		`16*0-64`($ctx),%x#$D0	# will become expanded ${R0}
2178	and		\$-512,%rsp
2179	vmovdqu		`16*1-64`($ctx),%x#$D1	# will become ... ${R1}
2180	mov		\$0x20,%rax
2181	vmovdqu		`16*2-64`($ctx),%x#$T0	# ... ${S1}
2182	vmovdqu		`16*3-64`($ctx),%x#$D2	# ... ${R2}
2183	vmovdqu		`16*4-64`($ctx),%x#$T1	# ... ${S2}
2184	vmovdqu		`16*5-64`($ctx),%x#$D3	# ... ${R3}
2185	vmovdqu		`16*6-64`($ctx),%x#$T3	# ... ${S3}
2186	vmovdqu		`16*7-64`($ctx),%x#$D4	# ... ${R4}
2187	vmovdqu		`16*8-64`($ctx),%x#$T4	# ... ${S4}
2188	vpermd		$D0,$T2,$R0		# 00003412 -> 14243444
2189	vpbroadcastq	64(%rcx),$MASK		# .Lmask26
2190	vpermd		$D1,$T2,$R1
2191	vpermd		$T0,$T2,$S1
2192	vpermd		$D2,$T2,$R2
2193	vmovdqa64	$R0,0x00(%rsp){%k2}	# save in case $len%128 != 0
2194	 vpsrlq		\$32,$R0,$T0		# 14243444 -> 01020304
2195	vpermd		$T1,$T2,$S2
2196	vmovdqu64	$R1,0x00(%rsp,%rax){%k2}
2197	 vpsrlq		\$32,$R1,$T1
2198	vpermd		$D3,$T2,$R3
2199	vmovdqa64	$S1,0x40(%rsp){%k2}
2200	vpermd		$T3,$T2,$S3
2201	vpermd		$D4,$T2,$R4
2202	vmovdqu64	$R2,0x40(%rsp,%rax){%k2}
2203	vpermd		$T4,$T2,$S4
2204	vmovdqa64	$S2,0x80(%rsp){%k2}
2205	vmovdqu64	$R3,0x80(%rsp,%rax){%k2}
2206	vmovdqa64	$S3,0xc0(%rsp){%k2}
2207	vmovdqu64	$R4,0xc0(%rsp,%rax){%k2}
2208	vmovdqa64	$S4,0x100(%rsp){%k2}
2209
2210	################################################################
2211	# calculate 5th through 8th powers of the key
2212	#
2213	# d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2214	# d1 = r0'*r1 + r1'*r0   + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2215	# d2 = r0'*r2 + r1'*r1   + r2'*r0   + r3'*5*r4 + r4'*5*r3
2216	# d3 = r0'*r3 + r1'*r2   + r2'*r1   + r3'*r0   + r4'*5*r4
2217	# d4 = r0'*r4 + r1'*r3   + r2'*r2   + r3'*r1   + r4'*r0
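	# (the 5*rN factors come from 2^130 == 5 mod 2^130-5: a partial
	#  product that lands in limb position i+j >= 5 wraps around to
	#  position i+j-5 multiplied by 5, which is what the precomputed
	#  sN = 5*rN values implement)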
2218
2219	vpmuludq	$T0,$R0,$D0		# d0 = r0'*r0
2220	vpmuludq	$T0,$R1,$D1		# d1 = r0'*r1
2221	vpmuludq	$T0,$R2,$D2		# d2 = r0'*r2
2222	vpmuludq	$T0,$R3,$D3		# d3 = r0'*r3
2223	vpmuludq	$T0,$R4,$D4		# d4 = r0'*r4
2224	 vpsrlq		\$32,$R2,$T2
2225
2226	vpmuludq	$T1,$S4,$M0
2227	vpmuludq	$T1,$R0,$M1
2228	vpmuludq	$T1,$R1,$M2
2229	vpmuludq	$T1,$R2,$M3
2230	vpmuludq	$T1,$R3,$M4
2231	 vpsrlq		\$32,$R3,$T3
2232	vpaddq		$M0,$D0,$D0		# d0 += r1'*5*r4
2233	vpaddq		$M1,$D1,$D1		# d1 += r1'*r0
2234	vpaddq		$M2,$D2,$D2		# d2 += r1'*r1
2235	vpaddq		$M3,$D3,$D3		# d3 += r1'*r2
2236	vpaddq		$M4,$D4,$D4		# d4 += r1'*r3
2237
2238	vpmuludq	$T2,$S3,$M0
2239	vpmuludq	$T2,$S4,$M1
2240	vpmuludq	$T2,$R1,$M3
2241	vpmuludq	$T2,$R2,$M4
2242	vpmuludq	$T2,$R0,$M2
2243	 vpsrlq		\$32,$R4,$T4
2244	vpaddq		$M0,$D0,$D0		# d0 += r2'*5*r3
2245	vpaddq		$M1,$D1,$D1		# d1 += r2'*5*r4
2246	vpaddq		$M3,$D3,$D3		# d3 += r2'*r1
2247	vpaddq		$M4,$D4,$D4		# d4 += r2'*r2
2248	vpaddq		$M2,$D2,$D2		# d2 += r2'*r0
2249
2250	vpmuludq	$T3,$S2,$M0
2251	vpmuludq	$T3,$R0,$M3
2252	vpmuludq	$T3,$R1,$M4
2253	vpmuludq	$T3,$S3,$M1
2254	vpmuludq	$T3,$S4,$M2
2255	vpaddq		$M0,$D0,$D0		# d0 += r3'*5*r2
2256	vpaddq		$M3,$D3,$D3		# d3 += r3'*r0
2257	vpaddq		$M4,$D4,$D4		# d4 += r3'*r1
2258	vpaddq		$M1,$D1,$D1		# d1 += r3'*5*r3
2259	vpaddq		$M2,$D2,$D2		# d2 += r3'*5*r4
2260
2261	vpmuludq	$T4,$S4,$M3
2262	vpmuludq	$T4,$R0,$M4
2263	vpmuludq	$T4,$S1,$M0
2264	vpmuludq	$T4,$S2,$M1
2265	vpmuludq	$T4,$S3,$M2
	vpaddq		$M3,$D3,$D3		# d3 += r4'*5*r4
	vpaddq		$M4,$D4,$D4		# d4 += r4'*r0
	vpaddq		$M0,$D0,$D0		# d0 += r4'*5*r1
	vpaddq		$M1,$D1,$D1		# d1 += r4'*5*r2
	vpaddq		$M2,$D2,$D2		# d2 += r4'*5*r3
2271
2272	################################################################
2273	# load input
2274	vmovdqu64	16*0($inp),%z#$T3
2275	vmovdqu64	16*4($inp),%z#$T4
2276	lea		16*8($inp),$inp
2277
2278	################################################################
2279	# lazy reduction
2280
2281	vpsrlq		\$26,$D3,$M3
2282	vpandq		$MASK,$D3,$D3
2283	vpaddq		$M3,$D4,$D4		# d3 -> d4
2284
2285	vpsrlq		\$26,$D0,$M0
2286	vpandq		$MASK,$D0,$D0
2287	vpaddq		$M0,$D1,$D1		# d0 -> d1
2288
2289	vpsrlq		\$26,$D4,$M4
2290	vpandq		$MASK,$D4,$D4
2291
2292	vpsrlq		\$26,$D1,$M1
2293	vpandq		$MASK,$D1,$D1
2294	vpaddq		$M1,$D2,$D2		# d1 -> d2
2295
2296	vpaddq		$M4,$D0,$D0
2297	vpsllq		\$2,$M4,$M4
2298	vpaddq		$M4,$D0,$D0		# d4 -> d0
2299
2300	vpsrlq		\$26,$D2,$M2
2301	vpandq		$MASK,$D2,$D2
2302	vpaddq		$M2,$D3,$D3		# d2 -> d3
2303
2304	vpsrlq		\$26,$D0,$M0
2305	vpandq		$MASK,$D0,$D0
2306	vpaddq		$M0,$D1,$D1		# d0 -> d1
2307
2308	vpsrlq		\$26,$D3,$M3
2309	vpandq		$MASK,$D3,$D3
2310	vpaddq		$M3,$D4,$D4		# d3 -> d4
2311
2312	################################################################
2313	# at this point we have 14243444 in $R0-$S4 and 05060708 in
2314	# $D0-$D4, ...
2315
2316	vpunpcklqdq	$T4,$T3,$T0	# transpose input
2317	vpunpckhqdq	$T4,$T3,$T4
2318
2319	# ... since input 64-bit lanes are ordered as 73625140, we could
2320	# "vperm" it to 76543210 (here and in each loop iteration), *or*
2321	# we could just flow along, hence the goal for $R0-$S4 is
2322	# 1858286838784888 ...
2323
2324	vmovdqa32	128(%rcx),$M0		# .Lpermd_avx512:
2325	mov		\$0x7777,%eax
2326	kmovw		%eax,%k1
2327
2328	vpermd		$R0,$M0,$R0		# 14243444 -> 1---2---3---4---
2329	vpermd		$R1,$M0,$R1
2330	vpermd		$R2,$M0,$R2
2331	vpermd		$R3,$M0,$R3
2332	vpermd		$R4,$M0,$R4
2333
2334	vpermd		$D0,$M0,${R0}{%k1}	# 05060708 -> 1858286838784888
2335	vpermd		$D1,$M0,${R1}{%k1}
2336	vpermd		$D2,$M0,${R2}{%k1}
2337	vpermd		$D3,$M0,${R3}{%k1}
2338	vpermd		$D4,$M0,${R4}{%k1}
2339
2340	vpslld		\$2,$R1,$S1		# *5
2341	vpslld		\$2,$R2,$S2
2342	vpslld		\$2,$R3,$S3
2343	vpslld		\$2,$R4,$S4
2344	vpaddd		$R1,$S1,$S1
2345	vpaddd		$R2,$S2,$S2
2346	vpaddd		$R3,$S3,$S3
2347	vpaddd		$R4,$S4,$S4
2348
2349	vpbroadcastq	32(%rcx),$PADBIT	# .L129
2350
2351	vpsrlq		\$52,$T0,$T2		# splat input
2352	vpsllq		\$12,$T4,$T3
2353	vporq		$T3,$T2,$T2
2354	vpsrlq		\$26,$T0,$T1
2355	vpsrlq		\$14,$T4,$T3
2356	vpsrlq		\$40,$T4,$T4		# 4
2357	vpandq		$MASK,$T2,$T2		# 2
2358	vpandq		$MASK,$T0,$T0		# 0
2359	#vpandq		$MASK,$T1,$T1		# 1
2360	#vpandq		$MASK,$T3,$T3		# 3
2361	#vporq		$PADBIT,$T4,$T4		# padbit, yes, always
2362
2363	vpaddq		$H2,$T2,$H2		# accumulate input
2364	sub		\$192,$len
2365	jbe		.Ltail_avx512
2366	jmp		.Loop_avx512
2367
2368.align	32
2369.Loop_avx512:
2370	################################################################
2371	# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2372	# ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2373	# ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2374	# ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2375	# ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2376	# ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2377	# ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2378	# ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2379	#   \________/\___________/
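	# (two-lane analogue for intuition: with blocks m1..m4 split as
	#  A = (m1*r^2 + m3)*r^2 and B = (m2*r^2 + m4)*r, the sum A + B is
	#  m1*r^4 + m2*r^3 + m3*r^2 + m4*r, i.e. the serial Horner result;
	#  here the same is done with 8 lanes and r^8)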
2380	################################################################
2381	#vpaddq		$H2,$T2,$H2		# accumulate input
2382
2383	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
2384	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
2385	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
2386	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
2387	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2388	#
	# however, as h2 is "chronologically" the first one available, pull
	# the corresponding operations up, so it's
2391	#
2392	# d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4
2393	# d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0
2394	# d0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1
2395	# d1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2
2396	# d2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
2397
2398	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
2399	 vpaddq		$H0,$T0,$H0
2400	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
2401	 vpandq		$MASK,$T1,$T1		# 1
2402	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
2403	 vpandq		$MASK,$T3,$T3		# 3
2404	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
2405	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
2406	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
2407	 vpaddq		$H1,$T1,$H1		# accumulate input
2408	 vpaddq		$H3,$T3,$H3
2409	 vpaddq		$H4,$T4,$H4
2410
2411	  vmovdqu64	16*0($inp),$T3		# load input
2412	  vmovdqu64	16*4($inp),$T4
2413	  lea		16*8($inp),$inp
2414	vpmuludq	$H0,$R3,$M3
2415	vpmuludq	$H0,$R4,$M4
2416	vpmuludq	$H0,$R0,$M0
2417	vpmuludq	$H0,$R1,$M1
2418	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
2419	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
2420	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
2421	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
2422
2423	vpmuludq	$H1,$R2,$M3
2424	vpmuludq	$H1,$R3,$M4
2425	vpmuludq	$H1,$S4,$M0
2426	vpmuludq	$H0,$R2,$M2
2427	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
2428	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
2429	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
2430	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
2431
2432	  vpunpcklqdq	$T4,$T3,$T0		# transpose input
2433	  vpunpckhqdq	$T4,$T3,$T4
2434
2435	vpmuludq	$H3,$R0,$M3
2436	vpmuludq	$H3,$R1,$M4
2437	vpmuludq	$H1,$R0,$M1
2438	vpmuludq	$H1,$R1,$M2
2439	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
2440	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
2441	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
2442	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
2443
2444	vpmuludq	$H4,$S4,$M3
2445	vpmuludq	$H4,$R0,$M4
2446	vpmuludq	$H3,$S2,$M0
2447	vpmuludq	$H3,$S3,$M1
2448	vpaddq		$M3,$D3,$D3		# d3 += h4*s4
2449	vpmuludq	$H3,$S4,$M2
2450	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
2451	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
2452	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
2453	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
2454
2455	vpmuludq	$H4,$S1,$M0
2456	vpmuludq	$H4,$S2,$M1
2457	vpmuludq	$H4,$S3,$M2
2458	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
2461
2462	################################################################
2463	# lazy reduction (interleaved with input splat)
2464
2465	 vpsrlq		\$52,$T0,$T2		# splat input
2466	 vpsllq		\$12,$T4,$T3
2467
2468	vpsrlq		\$26,$D3,$H3
2469	vpandq		$MASK,$D3,$D3
2470	vpaddq		$H3,$D4,$H4		# h3 -> h4
2471
2472	 vporq		$T3,$T2,$T2
2473
2474	vpsrlq		\$26,$H0,$D0
2475	vpandq		$MASK,$H0,$H0
2476	vpaddq		$D0,$H1,$H1		# h0 -> h1
2477
2478	 vpandq		$MASK,$T2,$T2		# 2
2479
2480	vpsrlq		\$26,$H4,$D4
2481	vpandq		$MASK,$H4,$H4
2482
2483	vpsrlq		\$26,$H1,$D1
2484	vpandq		$MASK,$H1,$H1
2485	vpaddq		$D1,$H2,$H2		# h1 -> h2
2486
2487	vpaddq		$D4,$H0,$H0
2488	vpsllq		\$2,$D4,$D4
2489	vpaddq		$D4,$H0,$H0		# h4 -> h0
2490
2491	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
2492	 vpsrlq		\$26,$T0,$T1
2493
2494	vpsrlq		\$26,$H2,$D2
2495	vpandq		$MASK,$H2,$H2
2496	vpaddq		$D2,$D3,$H3		# h2 -> h3
2497
2498	 vpsrlq		\$14,$T4,$T3
2499
2500	vpsrlq		\$26,$H0,$D0
2501	vpandq		$MASK,$H0,$H0
2502	vpaddq		$D0,$H1,$H1		# h0 -> h1
2503
2504	 vpsrlq		\$40,$T4,$T4		# 4
2505
2506	vpsrlq		\$26,$H3,$D3
2507	vpandq		$MASK,$H3,$H3
2508	vpaddq		$D3,$H4,$H4		# h3 -> h4
2509
2510	 vpandq		$MASK,$T0,$T0		# 0
2511	 #vpandq	$MASK,$T1,$T1		# 1
2512	 #vpandq	$MASK,$T3,$T3		# 3
2513	 #vporq		$PADBIT,$T4,$T4		# padbit, yes, always
2514
2515	sub		\$128,$len
2516	ja		.Loop_avx512
2517
2518.Ltail_avx512:
2519	################################################################
	# while the above multiplications were by r^8 in all lanes, in the
	# last iteration we multiply the least significant lane by r^8 and
	# the most significant one by r, which is why the table gets shifted...
2523
2524	vpsrlq		\$32,$R0,$R0		# 0105020603070408
2525	vpsrlq		\$32,$R1,$R1
2526	vpsrlq		\$32,$R2,$R2
2527	vpsrlq		\$32,$S3,$S3
2528	vpsrlq		\$32,$S4,$S4
2529	vpsrlq		\$32,$R3,$R3
2530	vpsrlq		\$32,$R4,$R4
2531	vpsrlq		\$32,$S1,$S1
2532	vpsrlq		\$32,$S2,$S2
2533
2534	################################################################
	# load either next or last 64 bytes of input
2536	lea		($inp,$len),$inp
2537
2538	#vpaddq		$H2,$T2,$H2		# accumulate input
2539	vpaddq		$H0,$T0,$H0
2540
2541	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
2542	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
2543	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
2544	 vpandq		$MASK,$T1,$T1		# 1
2545	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
2546	 vpandq		$MASK,$T3,$T3		# 3
2547	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
2548	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
2549	 vpaddq		$H1,$T1,$H1		# accumulate input
2550	 vpaddq		$H3,$T3,$H3
2551	 vpaddq		$H4,$T4,$H4
2552
2553	  vmovdqu	16*0($inp),%x#$T0
2554	vpmuludq	$H0,$R3,$M3
2555	vpmuludq	$H0,$R4,$M4
2556	vpmuludq	$H0,$R0,$M0
2557	vpmuludq	$H0,$R1,$M1
2558	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
2559	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
2560	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
2561	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
2562
2563	  vmovdqu	16*1($inp),%x#$T1
2564	vpmuludq	$H1,$R2,$M3
2565	vpmuludq	$H1,$R3,$M4
2566	vpmuludq	$H1,$S4,$M0
2567	vpmuludq	$H0,$R2,$M2
2568	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
2569	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
2570	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
2571	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
2572
2573	  vinserti128	\$1,16*2($inp),%y#$T0,%y#$T0
2574	vpmuludq	$H3,$R0,$M3
2575	vpmuludq	$H3,$R1,$M4
2576	vpmuludq	$H1,$R0,$M1
2577	vpmuludq	$H1,$R1,$M2
2578	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
2579	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
2580	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
2581	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
2582
2583	  vinserti128	\$1,16*3($inp),%y#$T1,%y#$T1
2584	vpmuludq	$H4,$S4,$M3
2585	vpmuludq	$H4,$R0,$M4
2586	vpmuludq	$H3,$S2,$M0
2587	vpmuludq	$H3,$S3,$M1
2588	vpmuludq	$H3,$S4,$M2
2589	vpaddq		$M3,$D3,$H3		# h3 = d3 + h4*s4
2590	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
2591	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
2592	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
2593	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
2594
2595	vpmuludq	$H4,$S1,$M0
2596	vpmuludq	$H4,$S2,$M1
2597	vpmuludq	$H4,$S3,$M2
2598	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
2601
2602	################################################################
2603	# horizontal addition
2604
2605	mov		\$1,%eax
2606	vpermq		\$0xb1,$H3,$D3
2607	vpermq		\$0xb1,$D4,$H4
2608	vpermq		\$0xb1,$H0,$D0
2609	vpermq		\$0xb1,$H1,$D1
2610	vpermq		\$0xb1,$H2,$D2
2611	vpaddq		$D3,$H3,$H3
2612	vpaddq		$D4,$H4,$H4
2613	vpaddq		$D0,$H0,$H0
2614	vpaddq		$D1,$H1,$H1
2615	vpaddq		$D2,$H2,$H2
2616
2617	kmovw		%eax,%k3
2618	vpermq		\$0x2,$H3,$D3
2619	vpermq		\$0x2,$H4,$D4
2620	vpermq		\$0x2,$H0,$D0
2621	vpermq		\$0x2,$H1,$D1
2622	vpermq		\$0x2,$H2,$D2
2623	vpaddq		$D3,$H3,$H3
2624	vpaddq		$D4,$H4,$H4
2625	vpaddq		$D0,$H0,$H0
2626	vpaddq		$D1,$H1,$H1
2627	vpaddq		$D2,$H2,$H2
2628
2629	vextracti64x4	\$0x1,$H3,%y#$D3
2630	vextracti64x4	\$0x1,$H4,%y#$D4
2631	vextracti64x4	\$0x1,$H0,%y#$D0
2632	vextracti64x4	\$0x1,$H1,%y#$D1
2633	vextracti64x4	\$0x1,$H2,%y#$D2
2634	vpaddq		$D3,$H3,${H3}{%k3}{z}	# keep single qword in case
2635	vpaddq		$D4,$H4,${H4}{%k3}{z}	# it's passed to .Ltail_avx2
2636	vpaddq		$D0,$H0,${H0}{%k3}{z}
2637	vpaddq		$D1,$H1,${H1}{%k3}{z}
2638	vpaddq		$D2,$H2,${H2}{%k3}{z}
2639___
2640map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2641map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2642$code.=<<___;
2643	################################################################
2644	# lazy reduction (interleaved with input splat)
2645
2646	vpsrlq		\$26,$H3,$D3
2647	vpand		$MASK,$H3,$H3
2648	 vpsrldq	\$6,$T0,$T2		# splat input
2649	 vpsrldq	\$6,$T1,$T3
2650	 vpunpckhqdq	$T1,$T0,$T4		# 4
2651	vpaddq		$D3,$H4,$H4		# h3 -> h4
2652
2653	vpsrlq		\$26,$H0,$D0
2654	vpand		$MASK,$H0,$H0
2655	 vpunpcklqdq	$T3,$T2,$T2		# 2:3
2656	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
2657	vpaddq		$D0,$H1,$H1		# h0 -> h1
2658
2659	vpsrlq		\$26,$H4,$D4
2660	vpand		$MASK,$H4,$H4
2661
2662	vpsrlq		\$26,$H1,$D1
2663	vpand		$MASK,$H1,$H1
2664	 vpsrlq		\$30,$T2,$T3
2665	 vpsrlq		\$4,$T2,$T2
2666	vpaddq		$D1,$H2,$H2		# h1 -> h2
2667
2668	vpaddq		$D4,$H0,$H0
2669	vpsllq		\$2,$D4,$D4
2670	 vpsrlq		\$26,$T0,$T1
2671	 vpsrlq		\$40,$T4,$T4		# 4
2672	vpaddq		$D4,$H0,$H0		# h4 -> h0
2673
2674	vpsrlq		\$26,$H2,$D2
2675	vpand		$MASK,$H2,$H2
2676	 vpand		$MASK,$T2,$T2		# 2
2677	 vpand		$MASK,$T0,$T0		# 0
2678	vpaddq		$D2,$H3,$H3		# h2 -> h3
2679
2680	vpsrlq		\$26,$H0,$D0
2681	vpand		$MASK,$H0,$H0
2682	 vpaddq		$H2,$T2,$H2		# accumulate input for .Ltail_avx2
2683	 vpand		$MASK,$T1,$T1		# 1
2684	vpaddq		$D0,$H1,$H1		# h0 -> h1
2685
2686	vpsrlq		\$26,$H3,$D3
2687	vpand		$MASK,$H3,$H3
2688	 vpand		$MASK,$T3,$T3		# 3
2689	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
2690	vpaddq		$D3,$H4,$H4		# h3 -> h4
2691
2692	lea		0x90(%rsp),%rax		# size optimization for .Ltail_avx2
2693	add		\$64,$len
2694	jnz		.Ltail_avx2
2695
2696	vpsubq		$T2,$H2,$H2		# undo input accumulation
2697	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
2698	vmovd		%x#$H1,`4*1-48-64`($ctx)
2699	vmovd		%x#$H2,`4*2-48-64`($ctx)
2700	vmovd		%x#$H3,`4*3-48-64`($ctx)
2701	vmovd		%x#$H4,`4*4-48-64`($ctx)
2702	vzeroall
2703___
2704$code.=<<___	if ($win64);
2705	movdqa		0x50(%r11),%xmm6
2706	movdqa		0x60(%r11),%xmm7
2707	movdqa		0x70(%r11),%xmm8
2708	movdqa		0x80(%r11),%xmm9
2709	movdqa		0x90(%r11),%xmm10
2710	movdqa		0xa0(%r11),%xmm11
2711	movdqa		0xb0(%r11),%xmm12
2712	movdqa		0xc0(%r11),%xmm13
2713	movdqa		0xd0(%r11),%xmm14
2714	movdqa		0xe0(%r11),%xmm15
2715	lea		0xf8(%r11),%rsp
2716.Ldo_avx512_epilogue:
2717___
2718$code.=<<___	if (!$win64);
2719	lea		8(%r11),%rsp
2720.cfi_def_cfa		%rsp,8
2721___
2722$code.=<<___;
2723	ret
2724.cfi_endproc
2725.size	poly1305_blocks_avx512,.-poly1305_blocks_avx512
2726___
2727if ($avx>3 && !$win64) {
2728########################################################################
2729# VPMADD52 version using 2^44 radix.
2730#
# One can argue that base 2^52 would be more natural. Well, even though
# some operations would be more natural, one has to recognize a couple
# of things. First, base 2^52 doesn't provide an advantage over base
# 2^44 if you look at the amount of multiply-and-accumulate operations.
# Secondly, it makes it impossible to pre-compute multiples of 5
# [referred to as s[]/sN in reference implementations], which means that
# more such operations would have to be performed in the inner loop,
# which in turn makes the critical path longer. In other words, even
# though base 2^44 reduction might look less elegant, the overall
# critical path is actually shorter...
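#
# For reference, values are kept as three limbs, x = x0 + x1*2^44 +
# x2*2^88, with 44-, 44- and 42-bit limbs; since 2^130 == 5 mod 2^130-5,
# the carry out of the top limb is folded back into x0 as 5*carry, which
# shows up below as an "add carry, then add carry<<2" pair.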
2740
2741########################################################################
# Layout of the opaque area is as follows.
2743#
2744#	unsigned __int64 h[3];		# current hash value base 2^44
2745#	unsigned __int64 s[2];		# key value*20 base 2^44
2746#	unsigned __int64 r[3];		# key value base 2^44
2747#	struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2748#					# r^n positions reflect
2749#					# placement in register, not
2750#					# memory, R[3] is R[1]*20
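#
# In terms of byte offsets that gives: h[] at 0, s[] at 24, r[] at 40
# and R[] at 64 in 32-byte steps; 64($ctx) doubles as the "are key
# powers computed yet" flag, which poly1305_init_base2_44 sets to an
# impossible value below.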
2751
2752$code.=<<___;
2753.type	poly1305_init_base2_44,\@function,3
2754.align	32
2755poly1305_init_base2_44:
2756.cfi_startproc
2757	xor	%rax,%rax
2758	mov	%rax,0($ctx)		# initialize hash value
2759	mov	%rax,8($ctx)
2760	mov	%rax,16($ctx)
2761
2762.Linit_base2_44:
2763	lea	poly1305_blocks_vpmadd52(%rip),%r10
2764	lea	poly1305_emit_base2_44(%rip),%r11
2765
2766	mov	\$0x0ffffffc0fffffff,%rax
2767	mov	\$0x0ffffffc0ffffffc,%rcx
2768	and	0($inp),%rax
2769	mov	\$0x00000fffffffffff,%r8
2770	and	8($inp),%rcx
2771	mov	\$0x00000fffffffffff,%r9
2772	and	%rax,%r8
2773	shrd	\$44,%rcx,%rax
2774	mov	%r8,40($ctx)		# r0
2775	and	%r9,%rax
2776	shr	\$24,%rcx
2777	mov	%rax,48($ctx)		# r1
2778	lea	(%rax,%rax,4),%rax	# *5
2779	mov	%rcx,56($ctx)		# r2
2780	shl	\$2,%rax		# magic <<2
2781	lea	(%rcx,%rcx,4),%rcx	# *5
2782	shl	\$2,%rcx		# magic <<2
2783	mov	%rax,24($ctx)		# s1
2784	mov	%rcx,32($ctx)		# s2
2785	movq	\$-1,64($ctx)		# write impossible value
2786___
2787$code.=<<___	if ($flavour !~ /elf32/);
2788	mov	%r10,0(%rdx)
2789	mov	%r11,8(%rdx)
2790___
2791$code.=<<___	if ($flavour =~ /elf32/);
2792	mov	%r10d,0(%rdx)
2793	mov	%r11d,4(%rdx)
2794___
2795$code.=<<___;
2796	mov	\$1,%eax
2797	ret
2798.cfi_endproc
2799.size	poly1305_init_base2_44,.-poly1305_init_base2_44
2800___
2801{
2802my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2803my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2804my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2805
2806$code.=<<___;
2807.type	poly1305_blocks_vpmadd52,\@function,4
2808.align	32
2809poly1305_blocks_vpmadd52:
2810.cfi_startproc
2811	endbranch
2812	shr	\$4,$len
2813	jz	.Lno_data_vpmadd52		# too short
2814
2815	shl	\$40,$padbit
2816	mov	64($ctx),%r8			# peek on power of the key
2817
	# if powers of the key are not calculated yet, process up to 3
	# blocks with this single-block subroutine; otherwise ensure that
	# the length is divisible by 2 blocks and pass the rest down to the
	# next subroutine...
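	#
	# (the cmov pair below leaves 3 in %rax only when the powers are
	#  missing and the input is at most 3 blocks, and 1 otherwise;
	#  and-ing that with the block count gives the number of leading
	#  blocks to peel off here so that the remainder is even)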
2822
2823	mov	\$3,%rax
2824	mov	\$1,%r10
2825	cmp	\$4,$len			# is input long
2826	cmovae	%r10,%rax
2827	test	%r8,%r8				# is power value impossible?
2828	cmovns	%r10,%rax
2829
2830	and	$len,%rax			# is input of favourable length?
2831	jz	.Lblocks_vpmadd52_4x
2832
2833	sub		%rax,$len
2834	mov		\$7,%r10d
2835	mov		\$1,%r11d
2836	kmovw		%r10d,%k7
2837	lea		.L2_44_inp_permd(%rip),%r10
2838	kmovw		%r11d,%k1
2839
2840	vmovq		$padbit,%x#$PAD
2841	vmovdqa64	0(%r10),$inp_permd	# .L2_44_inp_permd
2842	vmovdqa64	32(%r10),$inp_shift	# .L2_44_inp_shift
2843	vpermq		\$0xcf,$PAD,$PAD
2844	vmovdqa64	64(%r10),$reduc_mask	# .L2_44_mask
2845
2846	vmovdqu64	0($ctx),${Dlo}{%k7}{z}		# load hash value
2847	vmovdqu64	40($ctx),${r2r1r0}{%k7}{z}	# load keys
2848	vmovdqu64	32($ctx),${r1r0s2}{%k7}{z}
2849	vmovdqu64	24($ctx),${r0s2s1}{%k7}{z}
2850
2851	vmovdqa64	96(%r10),$reduc_rght	# .L2_44_shift_rgt
2852	vmovdqa64	128(%r10),$reduc_left	# .L2_44_shift_lft
2853
2854	jmp		.Loop_vpmadd52
2855
2856.align	32
2857.Loop_vpmadd52:
2858	vmovdqu32	0($inp),%x#$T0		# load input as ----3210
2859	lea		16($inp),$inp
2860
2861	vpermd		$T0,$inp_permd,$T0	# ----3210 -> --322110
2862	vpsrlvq		$inp_shift,$T0,$T0
2863	vpandq		$reduc_mask,$T0,$T0
2864	vporq		$PAD,$T0,$T0
2865
2866	vpaddq		$T0,$Dlo,$Dlo		# accumulate input
2867
2868	vpermq		\$0,$Dlo,${H0}{%k7}{z}	# smash hash value
2869	vpermq		\$0b01010101,$Dlo,${H1}{%k7}{z}
2870	vpermq		\$0b10101010,$Dlo,${H2}{%k7}{z}
2871
2872	vpxord		$Dlo,$Dlo,$Dlo
2873	vpxord		$Dhi,$Dhi,$Dhi
2874
2875	vpmadd52luq	$r2r1r0,$H0,$Dlo
2876	vpmadd52huq	$r2r1r0,$H0,$Dhi
2877
2878	vpmadd52luq	$r1r0s2,$H1,$Dlo
2879	vpmadd52huq	$r1r0s2,$H1,$Dhi
2880
2881	vpmadd52luq	$r0s2s1,$H2,$Dlo
2882	vpmadd52huq	$r0s2s1,$H2,$Dhi
2883
2884	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost qword
2885	vpsllvq		$reduc_left,$Dhi,$Dhi	# 0 in topmost qword
2886	vpandq		$reduc_mask,$Dlo,$Dlo
2887
2888	vpaddq		$T0,$Dhi,$Dhi
2889
2890	vpermq		\$0b10010011,$Dhi,$Dhi	# 0 in lowest qword
2891
2892	vpaddq		$Dhi,$Dlo,$Dlo		# note topmost qword :-)
2893
	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost qword
2895	vpandq		$reduc_mask,$Dlo,$Dlo
2896
2897	vpermq		\$0b10010011,$T0,$T0
2898
2899	vpaddq		$T0,$Dlo,$Dlo
2900
2901	vpermq		\$0b10010011,$Dlo,${T0}{%k1}{z}
2902
2903	vpaddq		$T0,$Dlo,$Dlo
2904	vpsllq		\$2,$T0,$T0
2905
2906	vpaddq		$T0,$Dlo,$Dlo
2907
2908	dec		%rax			# len-=16
2909	jnz		.Loop_vpmadd52
2910
2911	vmovdqu64	$Dlo,0($ctx){%k7}	# store hash value
2912
2913	test		$len,$len
2914	jnz		.Lblocks_vpmadd52_4x
2915
2916.Lno_data_vpmadd52:
2917	ret
2918.cfi_endproc
2919.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
2920___
2921}
2922{
2923########################################################################
# As implied by its name, the 4x subroutine processes 4 blocks in
# parallel (but also handles lengths of 4*n+2 blocks). It takes up to
# the 4th key power and operates on 256-bit %ymm registers.
2927
2928my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
2929my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
2930my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
2931
2932$code.=<<___;
2933.type	poly1305_blocks_vpmadd52_4x,\@function,4
2934.align	32
2935poly1305_blocks_vpmadd52_4x:
2936.cfi_startproc
2937	shr	\$4,$len
2938	jz	.Lno_data_vpmadd52_4x		# too short
2939
2940	shl	\$40,$padbit
2941	mov	64($ctx),%r8			# peek on power of the key
2942
2943.Lblocks_vpmadd52_4x:
2944	vpbroadcastq	$padbit,$PAD
2945
2946	vmovdqa64	.Lx_mask44(%rip),$mask44
2947	mov		\$5,%eax
2948	vmovdqa64	.Lx_mask42(%rip),$mask42
2949	kmovw		%eax,%k1		# used in 2x path
2950
2951	test		%r8,%r8			# is power value impossible?
2952	js		.Linit_vpmadd52		# if it is, then init R[4]
2953
2954	vmovq		0($ctx),%x#$H0		# load current hash value
2955	vmovq		8($ctx),%x#$H1
2956	vmovq		16($ctx),%x#$H2
2957
2958	test		\$3,$len		# is length 4*n+2?
2959	jnz		.Lblocks_vpmadd52_2x_do
2960
2961.Lblocks_vpmadd52_4x_do:
2962	vpbroadcastq	64($ctx),$R0		# load 4th power of the key
2963	vpbroadcastq	96($ctx),$R1
2964	vpbroadcastq	128($ctx),$R2
2965	vpbroadcastq	160($ctx),$S1
2966
2967.Lblocks_vpmadd52_4x_key_loaded:
2968	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
2969	vpaddq		$R2,$S2,$S2
2970	vpsllq		\$2,$S2,$S2
2971
2972	test		\$7,$len		# is len 8*n?
2973	jz		.Lblocks_vpmadd52_8x
2974
2975	vmovdqu64	16*0($inp),$T2		# load data
2976	vmovdqu64	16*2($inp),$T3
2977	lea		16*4($inp),$inp
2978
2979	vpunpcklqdq	$T3,$T2,$T1		# transpose data
2980	vpunpckhqdq	$T3,$T2,$T3
2981
2982	# at this point 64-bit lanes are ordered as 3-1-2-0
2983
2984	vpsrlq		\$24,$T3,$T2		# splat the data
2985	vporq		$PAD,$T2,$T2
2986	 vpaddq		$T2,$H2,$H2		# accumulate input
2987	vpandq		$mask44,$T1,$T0
2988	vpsrlq		\$44,$T1,$T1
2989	vpsllq		\$20,$T3,$T3
2990	vporq		$T3,$T1,$T1
2991	vpandq		$mask44,$T1,$T1
2992
2993	sub		\$4,$len
2994	jz		.Ltail_vpmadd52_4x
2995	jmp		.Loop_vpmadd52_4x
2996	ud2
2997
2998.align	32
2999.Linit_vpmadd52:
3000	vmovq		24($ctx),%x#$S1		# load key
3001	vmovq		56($ctx),%x#$H2
3002	vmovq		32($ctx),%x#$S2
3003	vmovq		40($ctx),%x#$R0
3004	vmovq		48($ctx),%x#$R1
3005
3006	vmovdqa		$R0,$H0
3007	vmovdqa		$R1,$H1
3008	vmovdqa		$H2,$R2
3009
3010	mov		\$2,%eax
3011
3012.Lmul_init_vpmadd52:
3013	vpxorq		$D0lo,$D0lo,$D0lo
3014	vpmadd52luq	$H2,$S1,$D0lo
3015	vpxorq		$D0hi,$D0hi,$D0hi
3016	vpmadd52huq	$H2,$S1,$D0hi
3017	vpxorq		$D1lo,$D1lo,$D1lo
3018	vpmadd52luq	$H2,$S2,$D1lo
3019	vpxorq		$D1hi,$D1hi,$D1hi
3020	vpmadd52huq	$H2,$S2,$D1hi
3021	vpxorq		$D2lo,$D2lo,$D2lo
3022	vpmadd52luq	$H2,$R0,$D2lo
3023	vpxorq		$D2hi,$D2hi,$D2hi
3024	vpmadd52huq	$H2,$R0,$D2hi
3025
3026	vpmadd52luq	$H0,$R0,$D0lo
3027	vpmadd52huq	$H0,$R0,$D0hi
3028	vpmadd52luq	$H0,$R1,$D1lo
3029	vpmadd52huq	$H0,$R1,$D1hi
3030	vpmadd52luq	$H0,$R2,$D2lo
3031	vpmadd52huq	$H0,$R2,$D2hi
3032
3033	vpmadd52luq	$H1,$S2,$D0lo
3034	vpmadd52huq	$H1,$S2,$D0hi
3035	vpmadd52luq	$H1,$R0,$D1lo
3036	vpmadd52huq	$H1,$R0,$D1hi
3037	vpmadd52luq	$H1,$R1,$D2lo
3038	vpmadd52huq	$H1,$R1,$D2hi
3039
3040	################################################################
3041	# partial reduction
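	# (vpmadd52 yields every product as 52-bit low/high halves at scales
	#  2^0 and 2^52 relative to the limb; to get back to 44/44/42-bit
	#  limbs each low half keeps 44 bits, its excess plus high<<8 (<<10
	#  for the top limb) is carried into the next limb, and the carry
	#  out of bit 130 is folded in as 5*carry)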
3042	vpsrlq		\$44,$D0lo,$tmp
3043	vpsllq		\$8,$D0hi,$D0hi
3044	vpandq		$mask44,$D0lo,$H0
3045	vpaddq		$tmp,$D0hi,$D0hi
3046
3047	vpaddq		$D0hi,$D1lo,$D1lo
3048
3049	vpsrlq		\$44,$D1lo,$tmp
3050	vpsllq		\$8,$D1hi,$D1hi
3051	vpandq		$mask44,$D1lo,$H1
3052	vpaddq		$tmp,$D1hi,$D1hi
3053
3054	vpaddq		$D1hi,$D2lo,$D2lo
3055
3056	vpsrlq		\$42,$D2lo,$tmp
3057	vpsllq		\$10,$D2hi,$D2hi
3058	vpandq		$mask42,$D2lo,$H2
3059	vpaddq		$tmp,$D2hi,$D2hi
3060
3061	vpaddq		$D2hi,$H0,$H0
3062	vpsllq		\$2,$D2hi,$D2hi
3063
3064	vpaddq		$D2hi,$H0,$H0
3065
3066	vpsrlq		\$44,$H0,$tmp		# additional step
3067	vpandq		$mask44,$H0,$H0
3068
3069	vpaddq		$tmp,$H1,$H1
3070
3071	dec		%eax
3072	jz		.Ldone_init_vpmadd52
3073
3074	vpunpcklqdq	$R1,$H1,$R1		# 1,2
3075	vpbroadcastq	%x#$H1,%x#$H1		# 2,2
3076	vpunpcklqdq	$R2,$H2,$R2
3077	vpbroadcastq	%x#$H2,%x#$H2
3078	vpunpcklqdq	$R0,$H0,$R0
3079	vpbroadcastq	%x#$H0,%x#$H0
3080
3081	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
3082	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
3083	vpaddq		$R1,$S1,$S1
3084	vpaddq		$R2,$S2,$S2
3085	vpsllq		\$2,$S1,$S1
3086	vpsllq		\$2,$S2,$S2
3087
3088	jmp		.Lmul_init_vpmadd52
3089	ud2
3090
3091.align	32
3092.Ldone_init_vpmadd52:
3093	vinserti128	\$1,%x#$R1,$H1,$R1	# 1,2,3,4
3094	vinserti128	\$1,%x#$R2,$H2,$R2
3095	vinserti128	\$1,%x#$R0,$H0,$R0
3096
3097	vpermq		\$0b11011000,$R1,$R1	# 1,3,2,4
3098	vpermq		\$0b11011000,$R2,$R2
3099	vpermq		\$0b11011000,$R0,$R0
3100
3101	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
3102	vpaddq		$R1,$S1,$S1
3103	vpsllq		\$2,$S1,$S1
3104
3105	vmovq		0($ctx),%x#$H0		# load current hash value
3106	vmovq		8($ctx),%x#$H1
3107	vmovq		16($ctx),%x#$H2
3108
3109	test		\$3,$len		# is length 4*n+2?
3110	jnz		.Ldone_init_vpmadd52_2x
3111
3112	vmovdqu64	$R0,64($ctx)		# save key powers
3113	vpbroadcastq	%x#$R0,$R0		# broadcast 4th power
3114	vmovdqu64	$R1,96($ctx)
3115	vpbroadcastq	%x#$R1,$R1
3116	vmovdqu64	$R2,128($ctx)
3117	vpbroadcastq	%x#$R2,$R2
3118	vmovdqu64	$S1,160($ctx)
3119	vpbroadcastq	%x#$S1,$S1
3120
3121	jmp		.Lblocks_vpmadd52_4x_key_loaded
3122	ud2
3123
3124.align	32
3125.Ldone_init_vpmadd52_2x:
3126	vmovdqu64	$R0,64($ctx)		# save key powers
3127	vpsrldq		\$8,$R0,$R0		# 0-1-0-2
3128	vmovdqu64	$R1,96($ctx)
3129	vpsrldq		\$8,$R1,$R1
3130	vmovdqu64	$R2,128($ctx)
3131	vpsrldq		\$8,$R2,$R2
3132	vmovdqu64	$S1,160($ctx)
3133	vpsrldq		\$8,$S1,$S1
3134	jmp		.Lblocks_vpmadd52_2x_key_loaded
3135	ud2
3136
3137.align	32
3138.Lblocks_vpmadd52_2x_do:
3139	vmovdqu64	128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3140	vmovdqu64	160+8($ctx),${S1}{%k1}{z}
3141	vmovdqu64	64+8($ctx),${R0}{%k1}{z}
3142	vmovdqu64	96+8($ctx),${R1}{%k1}{z}
3143
3144.Lblocks_vpmadd52_2x_key_loaded:
3145	vmovdqu64	16*0($inp),$T2		# load data
3146	vpxorq		$T3,$T3,$T3
3147	lea		16*2($inp),$inp
3148
3149	vpunpcklqdq	$T3,$T2,$T1		# transpose data
3150	vpunpckhqdq	$T3,$T2,$T3
3151
3152	# at this point 64-bit lanes are ordered as x-1-x-0
3153
3154	vpsrlq		\$24,$T3,$T2		# splat the data
3155	vporq		$PAD,$T2,$T2
3156	 vpaddq		$T2,$H2,$H2		# accumulate input
3157	vpandq		$mask44,$T1,$T0
3158	vpsrlq		\$44,$T1,$T1
3159	vpsllq		\$20,$T3,$T3
3160	vporq		$T3,$T1,$T1
3161	vpandq		$mask44,$T1,$T1
3162
3163	jmp		.Ltail_vpmadd52_2x
3164	ud2
3165
3166.align	32
3167.Loop_vpmadd52_4x:
3168	#vpaddq		$T2,$H2,$H2		# accumulate input
3169	vpaddq		$T0,$H0,$H0
3170	vpaddq		$T1,$H1,$H1
3171
3172	vpxorq		$D0lo,$D0lo,$D0lo
3173	vpmadd52luq	$H2,$S1,$D0lo
3174	vpxorq		$D0hi,$D0hi,$D0hi
3175	vpmadd52huq	$H2,$S1,$D0hi
3176	vpxorq		$D1lo,$D1lo,$D1lo
3177	vpmadd52luq	$H2,$S2,$D1lo
3178	vpxorq		$D1hi,$D1hi,$D1hi
3179	vpmadd52huq	$H2,$S2,$D1hi
3180	vpxorq		$D2lo,$D2lo,$D2lo
3181	vpmadd52luq	$H2,$R0,$D2lo
3182	vpxorq		$D2hi,$D2hi,$D2hi
3183	vpmadd52huq	$H2,$R0,$D2hi
3184
3185	 vmovdqu64	16*0($inp),$T2		# load data
3186	 vmovdqu64	16*2($inp),$T3
3187	 lea		16*4($inp),$inp
3188	vpmadd52luq	$H0,$R0,$D0lo
3189	vpmadd52huq	$H0,$R0,$D0hi
3190	vpmadd52luq	$H0,$R1,$D1lo
3191	vpmadd52huq	$H0,$R1,$D1hi
3192	vpmadd52luq	$H0,$R2,$D2lo
3193	vpmadd52huq	$H0,$R2,$D2hi
3194
3195	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
3196	 vpunpckhqdq	$T3,$T2,$T3
3197	vpmadd52luq	$H1,$S2,$D0lo
3198	vpmadd52huq	$H1,$S2,$D0hi
3199	vpmadd52luq	$H1,$R0,$D1lo
3200	vpmadd52huq	$H1,$R0,$D1hi
3201	vpmadd52luq	$H1,$R1,$D2lo
3202	vpmadd52huq	$H1,$R1,$D2hi
3203
3204	################################################################
3205	# partial reduction (interleaved with data splat)
3206	vpsrlq		\$44,$D0lo,$tmp
3207	vpsllq		\$8,$D0hi,$D0hi
3208	vpandq		$mask44,$D0lo,$H0
3209	vpaddq		$tmp,$D0hi,$D0hi
3210
3211	 vpsrlq		\$24,$T3,$T2
3212	 vporq		$PAD,$T2,$T2
3213	vpaddq		$D0hi,$D1lo,$D1lo
3214
3215	vpsrlq		\$44,$D1lo,$tmp
3216	vpsllq		\$8,$D1hi,$D1hi
3217	vpandq		$mask44,$D1lo,$H1
3218	vpaddq		$tmp,$D1hi,$D1hi
3219
3220	 vpandq		$mask44,$T1,$T0
3221	 vpsrlq		\$44,$T1,$T1
3222	 vpsllq		\$20,$T3,$T3
3223	vpaddq		$D1hi,$D2lo,$D2lo
3224
3225	vpsrlq		\$42,$D2lo,$tmp
3226	vpsllq		\$10,$D2hi,$D2hi
3227	vpandq		$mask42,$D2lo,$H2
3228	vpaddq		$tmp,$D2hi,$D2hi
3229
3230	  vpaddq	$T2,$H2,$H2		# accumulate input
3231	vpaddq		$D2hi,$H0,$H0
3232	vpsllq		\$2,$D2hi,$D2hi
3233
3234	vpaddq		$D2hi,$H0,$H0
3235	 vporq		$T3,$T1,$T1
3236	 vpandq		$mask44,$T1,$T1
3237
3238	vpsrlq		\$44,$H0,$tmp		# additional step
3239	vpandq		$mask44,$H0,$H0
3240
3241	vpaddq		$tmp,$H1,$H1
3242
3243	sub		\$4,$len		# len-=64
3244	jnz		.Loop_vpmadd52_4x
3245
3246.Ltail_vpmadd52_4x:
3247	vmovdqu64	128($ctx),$R2		# load all key powers
3248	vmovdqu64	160($ctx),$S1
3249	vmovdqu64	64($ctx),$R0
3250	vmovdqu64	96($ctx),$R1
3251
3252.Ltail_vpmadd52_2x:
3253	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
3254	vpaddq		$R2,$S2,$S2
3255	vpsllq		\$2,$S2,$S2
3256
3257	#vpaddq		$T2,$H2,$H2		# accumulate input
3258	vpaddq		$T0,$H0,$H0
3259	vpaddq		$T1,$H1,$H1
3260
3261	vpxorq		$D0lo,$D0lo,$D0lo
3262	vpmadd52luq	$H2,$S1,$D0lo
3263	vpxorq		$D0hi,$D0hi,$D0hi
3264	vpmadd52huq	$H2,$S1,$D0hi
3265	vpxorq		$D1lo,$D1lo,$D1lo
3266	vpmadd52luq	$H2,$S2,$D1lo
3267	vpxorq		$D1hi,$D1hi,$D1hi
3268	vpmadd52huq	$H2,$S2,$D1hi
3269	vpxorq		$D2lo,$D2lo,$D2lo
3270	vpmadd52luq	$H2,$R0,$D2lo
3271	vpxorq		$D2hi,$D2hi,$D2hi
3272	vpmadd52huq	$H2,$R0,$D2hi
3273
3274	vpmadd52luq	$H0,$R0,$D0lo
3275	vpmadd52huq	$H0,$R0,$D0hi
3276	vpmadd52luq	$H0,$R1,$D1lo
3277	vpmadd52huq	$H0,$R1,$D1hi
3278	vpmadd52luq	$H0,$R2,$D2lo
3279	vpmadd52huq	$H0,$R2,$D2hi
3280
3281	vpmadd52luq	$H1,$S2,$D0lo
3282	vpmadd52huq	$H1,$S2,$D0hi
3283	vpmadd52luq	$H1,$R0,$D1lo
3284	vpmadd52huq	$H1,$R0,$D1hi
3285	vpmadd52luq	$H1,$R1,$D2lo
3286	vpmadd52huq	$H1,$R1,$D2hi
3287
3288	################################################################
3289	# horizontal addition
3290
3291	mov		\$1,%eax
3292	kmovw		%eax,%k1
3293	vpsrldq		\$8,$D0lo,$T0
3294	vpsrldq		\$8,$D0hi,$H0
3295	vpsrldq		\$8,$D1lo,$T1
3296	vpsrldq		\$8,$D1hi,$H1
3297	vpaddq		$T0,$D0lo,$D0lo
3298	vpaddq		$H0,$D0hi,$D0hi
3299	vpsrldq		\$8,$D2lo,$T2
3300	vpsrldq		\$8,$D2hi,$H2
3301	vpaddq		$T1,$D1lo,$D1lo
3302	vpaddq		$H1,$D1hi,$D1hi
3303	 vpermq		\$0x2,$D0lo,$T0
3304	 vpermq		\$0x2,$D0hi,$H0
3305	vpaddq		$T2,$D2lo,$D2lo
3306	vpaddq		$H2,$D2hi,$D2hi
3307
3308	vpermq		\$0x2,$D1lo,$T1
3309	vpermq		\$0x2,$D1hi,$H1
3310	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
3311	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
3312	vpermq		\$0x2,$D2lo,$T2
3313	vpermq		\$0x2,$D2hi,$H2
3314	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
3315	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
3316	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
3317	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
3318
3319	################################################################
3320	# partial reduction
3321	vpsrlq		\$44,$D0lo,$tmp
3322	vpsllq		\$8,$D0hi,$D0hi
3323	vpandq		$mask44,$D0lo,$H0
3324	vpaddq		$tmp,$D0hi,$D0hi
3325
3326	vpaddq		$D0hi,$D1lo,$D1lo
3327
3328	vpsrlq		\$44,$D1lo,$tmp
3329	vpsllq		\$8,$D1hi,$D1hi
3330	vpandq		$mask44,$D1lo,$H1
3331	vpaddq		$tmp,$D1hi,$D1hi
3332
3333	vpaddq		$D1hi,$D2lo,$D2lo
3334
3335	vpsrlq		\$42,$D2lo,$tmp
3336	vpsllq		\$10,$D2hi,$D2hi
3337	vpandq		$mask42,$D2lo,$H2
3338	vpaddq		$tmp,$D2hi,$D2hi
3339
3340	vpaddq		$D2hi,$H0,$H0
3341	vpsllq		\$2,$D2hi,$D2hi
3342
3343	vpaddq		$D2hi,$H0,$H0
3344
3345	vpsrlq		\$44,$H0,$tmp		# additional step
3346	vpandq		$mask44,$H0,$H0
3347
3348	vpaddq		$tmp,$H1,$H1
3349						# at this point $len is
3350						# either 4*n+2 or 0...
3351	sub		\$2,$len		# len-=32
3352	ja		.Lblocks_vpmadd52_4x_do
3353
3354	vmovq		%x#$H0,0($ctx)
3355	vmovq		%x#$H1,8($ctx)
3356	vmovq		%x#$H2,16($ctx)
3357	vzeroall
3358
3359.Lno_data_vpmadd52_4x:
3360	ret
3361.cfi_endproc
3362.size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3363___
3364}
3365{
3366########################################################################
# As implied by its name, the 8x subroutine processes 8 blocks in
# parallel... This is an intermediate version, as it's used only when
# the input length is 8*n, 8*n+1 or 8*n+2...
3370
3371my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3372my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3373my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3374my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3375
3376$code.=<<___;
3377.type	poly1305_blocks_vpmadd52_8x,\@function,4
3378.align	32
3379poly1305_blocks_vpmadd52_8x:
3380.cfi_startproc
3381	shr	\$4,$len
3382	jz	.Lno_data_vpmadd52_8x		# too short
3383
3384	shl	\$40,$padbit
3385	mov	64($ctx),%r8			# peek on power of the key
3386
3387	vmovdqa64	.Lx_mask44(%rip),$mask44
3388	vmovdqa64	.Lx_mask42(%rip),$mask42
3389
3390	test	%r8,%r8				# is power value impossible?
3391	js	.Linit_vpmadd52			# if it is, then init R[4]
3392
3393	vmovq	0($ctx),%x#$H0			# load current hash value
3394	vmovq	8($ctx),%x#$H1
3395	vmovq	16($ctx),%x#$H2
3396
3397.Lblocks_vpmadd52_8x:
3398	################################################################
	# first we calculate more key powers
3400
3401	vmovdqu64	128($ctx),$R2		# load 1-3-2-4 powers
3402	vmovdqu64	160($ctx),$S1
3403	vmovdqu64	64($ctx),$R0
3404	vmovdqu64	96($ctx),$R1
3405
3406	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
3407	vpaddq		$R2,$S2,$S2
3408	vpsllq		\$2,$S2,$S2
3409
3410	vpbroadcastq	%x#$R2,$RR2		# broadcast 4th power
3411	vpbroadcastq	%x#$R0,$RR0
3412	vpbroadcastq	%x#$R1,$RR1
3413
3414	vpxorq		$D0lo,$D0lo,$D0lo
3415	vpmadd52luq	$RR2,$S1,$D0lo
3416	vpxorq		$D0hi,$D0hi,$D0hi
3417	vpmadd52huq	$RR2,$S1,$D0hi
3418	vpxorq		$D1lo,$D1lo,$D1lo
3419	vpmadd52luq	$RR2,$S2,$D1lo
3420	vpxorq		$D1hi,$D1hi,$D1hi
3421	vpmadd52huq	$RR2,$S2,$D1hi
3422	vpxorq		$D2lo,$D2lo,$D2lo
3423	vpmadd52luq	$RR2,$R0,$D2lo
3424	vpxorq		$D2hi,$D2hi,$D2hi
3425	vpmadd52huq	$RR2,$R0,$D2hi
3426
3427	vpmadd52luq	$RR0,$R0,$D0lo
3428	vpmadd52huq	$RR0,$R0,$D0hi
3429	vpmadd52luq	$RR0,$R1,$D1lo
3430	vpmadd52huq	$RR0,$R1,$D1hi
3431	vpmadd52luq	$RR0,$R2,$D2lo
3432	vpmadd52huq	$RR0,$R2,$D2hi
3433
3434	vpmadd52luq	$RR1,$S2,$D0lo
3435	vpmadd52huq	$RR1,$S2,$D0hi
3436	vpmadd52luq	$RR1,$R0,$D1lo
3437	vpmadd52huq	$RR1,$R0,$D1hi
3438	vpmadd52luq	$RR1,$R1,$D2lo
3439	vpmadd52huq	$RR1,$R1,$D2hi
3440
3441	################################################################
3442	# partial reduction
3443	vpsrlq		\$44,$D0lo,$tmp
3444	vpsllq		\$8,$D0hi,$D0hi
3445	vpandq		$mask44,$D0lo,$RR0
3446	vpaddq		$tmp,$D0hi,$D0hi
3447
3448	vpaddq		$D0hi,$D1lo,$D1lo
3449
3450	vpsrlq		\$44,$D1lo,$tmp
3451	vpsllq		\$8,$D1hi,$D1hi
3452	vpandq		$mask44,$D1lo,$RR1
3453	vpaddq		$tmp,$D1hi,$D1hi
3454
3455	vpaddq		$D1hi,$D2lo,$D2lo
3456
3457	vpsrlq		\$42,$D2lo,$tmp
3458	vpsllq		\$10,$D2hi,$D2hi
3459	vpandq		$mask42,$D2lo,$RR2
3460	vpaddq		$tmp,$D2hi,$D2hi
3461
3462	vpaddq		$D2hi,$RR0,$RR0
3463	vpsllq		\$2,$D2hi,$D2hi
3464
3465	vpaddq		$D2hi,$RR0,$RR0
3466
3467	vpsrlq		\$44,$RR0,$tmp		# additional step
3468	vpandq		$mask44,$RR0,$RR0
3469
3470	vpaddq		$tmp,$RR1,$RR1
3471
3472	################################################################
3473	# At this point Rx holds 1324 powers, RRx - 5768, and the goal
3474	# is 15263748, which reflects how data is loaded...
3475
3476	vpunpcklqdq	$R2,$RR2,$T2		# 3748
3477	vpunpckhqdq	$R2,$RR2,$R2		# 1526
3478	vpunpcklqdq	$R0,$RR0,$T0
3479	vpunpckhqdq	$R0,$RR0,$R0
3480	vpunpcklqdq	$R1,$RR1,$T1
3481	vpunpckhqdq	$R1,$RR1,$R1
3482___
3483######## switch to %zmm
3484map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3485map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3486map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3487map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3488
3489$code.=<<___;
3490	vshufi64x2	\$0x44,$R2,$T2,$RR2	# 15263748
3491	vshufi64x2	\$0x44,$R0,$T0,$RR0
3492	vshufi64x2	\$0x44,$R1,$T1,$RR1
3493
3494	vmovdqu64	16*0($inp),$T2		# load data
3495	vmovdqu64	16*4($inp),$T3
3496	lea		16*8($inp),$inp
3497
3498	vpsllq		\$2,$RR2,$SS2		# S2 = R2*5*4
3499	vpsllq		\$2,$RR1,$SS1		# S1 = R1*5*4
3500	vpaddq		$RR2,$SS2,$SS2
3501	vpaddq		$RR1,$SS1,$SS1
3502	vpsllq		\$2,$SS2,$SS2
3503	vpsllq		\$2,$SS1,$SS1
3504
3505	vpbroadcastq	$padbit,$PAD
3506	vpbroadcastq	%x#$mask44,$mask44
3507	vpbroadcastq	%x#$mask42,$mask42
3508
3509	vpbroadcastq	%x#$SS1,$S1		# broadcast 8th power
3510	vpbroadcastq	%x#$SS2,$S2
3511	vpbroadcastq	%x#$RR0,$R0
3512	vpbroadcastq	%x#$RR1,$R1
3513	vpbroadcastq	%x#$RR2,$R2
3514
3515	vpunpcklqdq	$T3,$T2,$T1		# transpose data
3516	vpunpckhqdq	$T3,$T2,$T3
3517
3518	# at this point 64-bit lanes are ordered as 73625140
3519
3520	vpsrlq		\$24,$T3,$T2		# splat the data
3521	vporq		$PAD,$T2,$T2
3522	 vpaddq		$T2,$H2,$H2		# accumulate input
3523	vpandq		$mask44,$T1,$T0
3524	vpsrlq		\$44,$T1,$T1
3525	vpsllq		\$20,$T3,$T3
3526	vporq		$T3,$T1,$T1
3527	vpandq		$mask44,$T1,$T1
3528
3529	sub		\$8,$len
3530	jz		.Ltail_vpmadd52_8x
3531	jmp		.Loop_vpmadd52_8x
3532
3533.align	32
3534.Loop_vpmadd52_8x:
3535	#vpaddq		$T2,$H2,$H2		# accumulate input
3536	vpaddq		$T0,$H0,$H0
3537	vpaddq		$T1,$H1,$H1
3538
3539	vpxorq		$D0lo,$D0lo,$D0lo
3540	vpmadd52luq	$H2,$S1,$D0lo
3541	vpxorq		$D0hi,$D0hi,$D0hi
3542	vpmadd52huq	$H2,$S1,$D0hi
3543	vpxorq		$D1lo,$D1lo,$D1lo
3544	vpmadd52luq	$H2,$S2,$D1lo
3545	vpxorq		$D1hi,$D1hi,$D1hi
3546	vpmadd52huq	$H2,$S2,$D1hi
3547	vpxorq		$D2lo,$D2lo,$D2lo
3548	vpmadd52luq	$H2,$R0,$D2lo
3549	vpxorq		$D2hi,$D2hi,$D2hi
3550	vpmadd52huq	$H2,$R0,$D2hi
3551
3552	 vmovdqu64	16*0($inp),$T2		# load data
3553	 vmovdqu64	16*4($inp),$T3
3554	 lea		16*8($inp),$inp
3555	vpmadd52luq	$H0,$R0,$D0lo
3556	vpmadd52huq	$H0,$R0,$D0hi
3557	vpmadd52luq	$H0,$R1,$D1lo
3558	vpmadd52huq	$H0,$R1,$D1hi
3559	vpmadd52luq	$H0,$R2,$D2lo
3560	vpmadd52huq	$H0,$R2,$D2hi
3561
3562	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
3563	 vpunpckhqdq	$T3,$T2,$T3
3564	vpmadd52luq	$H1,$S2,$D0lo
3565	vpmadd52huq	$H1,$S2,$D0hi
3566	vpmadd52luq	$H1,$R0,$D1lo
3567	vpmadd52huq	$H1,$R0,$D1hi
3568	vpmadd52luq	$H1,$R1,$D2lo
3569	vpmadd52huq	$H1,$R1,$D2hi
3570
3571	################################################################
3572	# partial reduction (interleaved with data splat)
3573	vpsrlq		\$44,$D0lo,$tmp
3574	vpsllq		\$8,$D0hi,$D0hi
3575	vpandq		$mask44,$D0lo,$H0
3576	vpaddq		$tmp,$D0hi,$D0hi
3577
3578	 vpsrlq		\$24,$T3,$T2
3579	 vporq		$PAD,$T2,$T2
3580	vpaddq		$D0hi,$D1lo,$D1lo
3581
3582	vpsrlq		\$44,$D1lo,$tmp
3583	vpsllq		\$8,$D1hi,$D1hi
3584	vpandq		$mask44,$D1lo,$H1
3585	vpaddq		$tmp,$D1hi,$D1hi
3586
3587	 vpandq		$mask44,$T1,$T0
3588	 vpsrlq		\$44,$T1,$T1
3589	 vpsllq		\$20,$T3,$T3
3590	vpaddq		$D1hi,$D2lo,$D2lo
3591
3592	vpsrlq		\$42,$D2lo,$tmp
3593	vpsllq		\$10,$D2hi,$D2hi
3594	vpandq		$mask42,$D2lo,$H2
3595	vpaddq		$tmp,$D2hi,$D2hi
3596
3597	  vpaddq	$T2,$H2,$H2		# accumulate input
3598	vpaddq		$D2hi,$H0,$H0
3599	vpsllq		\$2,$D2hi,$D2hi
3600
3601	vpaddq		$D2hi,$H0,$H0
3602	 vporq		$T3,$T1,$T1
3603	 vpandq		$mask44,$T1,$T1
3604
3605	vpsrlq		\$44,$H0,$tmp		# additional step
3606	vpandq		$mask44,$H0,$H0
3607
3608	vpaddq		$tmp,$H1,$H1
3609
3610	sub		\$8,$len		# len-=128
3611	jnz		.Loop_vpmadd52_8x
3612
3613.Ltail_vpmadd52_8x:
3614	#vpaddq		$T2,$H2,$H2		# accumulate input
3615	vpaddq		$T0,$H0,$H0
3616	vpaddq		$T1,$H1,$H1
3617
3618	vpxorq		$D0lo,$D0lo,$D0lo
3619	vpmadd52luq	$H2,$SS1,$D0lo
3620	vpxorq		$D0hi,$D0hi,$D0hi
3621	vpmadd52huq	$H2,$SS1,$D0hi
3622	vpxorq		$D1lo,$D1lo,$D1lo
3623	vpmadd52luq	$H2,$SS2,$D1lo
3624	vpxorq		$D1hi,$D1hi,$D1hi
3625	vpmadd52huq	$H2,$SS2,$D1hi
3626	vpxorq		$D2lo,$D2lo,$D2lo
3627	vpmadd52luq	$H2,$RR0,$D2lo
3628	vpxorq		$D2hi,$D2hi,$D2hi
3629	vpmadd52huq	$H2,$RR0,$D2hi
3630
3631	vpmadd52luq	$H0,$RR0,$D0lo
3632	vpmadd52huq	$H0,$RR0,$D0hi
3633	vpmadd52luq	$H0,$RR1,$D1lo
3634	vpmadd52huq	$H0,$RR1,$D1hi
3635	vpmadd52luq	$H0,$RR2,$D2lo
3636	vpmadd52huq	$H0,$RR2,$D2hi
3637
3638	vpmadd52luq	$H1,$SS2,$D0lo
3639	vpmadd52huq	$H1,$SS2,$D0hi
3640	vpmadd52luq	$H1,$RR0,$D1lo
3641	vpmadd52huq	$H1,$RR0,$D1hi
3642	vpmadd52luq	$H1,$RR1,$D2lo
3643	vpmadd52huq	$H1,$RR1,$D2hi
3644
3645	################################################################
3646	# horizontal addition
3647
3648	mov		\$1,%eax
3649	kmovw		%eax,%k1
3650	vpsrldq		\$8,$D0lo,$T0
3651	vpsrldq		\$8,$D0hi,$H0
3652	vpsrldq		\$8,$D1lo,$T1
3653	vpsrldq		\$8,$D1hi,$H1
3654	vpaddq		$T0,$D0lo,$D0lo
3655	vpaddq		$H0,$D0hi,$D0hi
3656	vpsrldq		\$8,$D2lo,$T2
3657	vpsrldq		\$8,$D2hi,$H2
3658	vpaddq		$T1,$D1lo,$D1lo
3659	vpaddq		$H1,$D1hi,$D1hi
3660	 vpermq		\$0x2,$D0lo,$T0
3661	 vpermq		\$0x2,$D0hi,$H0
3662	vpaddq		$T2,$D2lo,$D2lo
3663	vpaddq		$H2,$D2hi,$D2hi
3664
3665	vpermq		\$0x2,$D1lo,$T1
3666	vpermq		\$0x2,$D1hi,$H1
3667	vpaddq		$T0,$D0lo,$D0lo
3668	vpaddq		$H0,$D0hi,$D0hi
3669	vpermq		\$0x2,$D2lo,$T2
3670	vpermq		\$0x2,$D2hi,$H2
3671	vpaddq		$T1,$D1lo,$D1lo
3672	vpaddq		$H1,$D1hi,$D1hi
3673	 vextracti64x4	\$1,$D0lo,%y#$T0
3674	 vextracti64x4	\$1,$D0hi,%y#$H0
3675	vpaddq		$T2,$D2lo,$D2lo
3676	vpaddq		$H2,$D2hi,$D2hi
3677
3678	vextracti64x4	\$1,$D1lo,%y#$T1
3679	vextracti64x4	\$1,$D1hi,%y#$H1
3680	vextracti64x4	\$1,$D2lo,%y#$T2
3681	vextracti64x4	\$1,$D2hi,%y#$H2
3682___
3683######## switch back to %ymm
3684map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3685map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3686map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3687
3688$code.=<<___;
3689	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
3690	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
3691	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
3692	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
3693	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
3694	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
3695
3696	################################################################
3697	# partial reduction
3698	vpsrlq		\$44,$D0lo,$tmp
3699	vpsllq		\$8,$D0hi,$D0hi
3700	vpandq		$mask44,$D0lo,$H0
3701	vpaddq		$tmp,$D0hi,$D0hi
3702
3703	vpaddq		$D0hi,$D1lo,$D1lo
3704
3705	vpsrlq		\$44,$D1lo,$tmp
3706	vpsllq		\$8,$D1hi,$D1hi
3707	vpandq		$mask44,$D1lo,$H1
3708	vpaddq		$tmp,$D1hi,$D1hi
3709
3710	vpaddq		$D1hi,$D2lo,$D2lo
3711
3712	vpsrlq		\$42,$D2lo,$tmp
3713	vpsllq		\$10,$D2hi,$D2hi
3714	vpandq		$mask42,$D2lo,$H2
3715	vpaddq		$tmp,$D2hi,$D2hi
3716
3717	vpaddq		$D2hi,$H0,$H0
3718	vpsllq		\$2,$D2hi,$D2hi
3719
3720	vpaddq		$D2hi,$H0,$H0
3721
3722	vpsrlq		\$44,$H0,$tmp		# additional step
3723	vpandq		$mask44,$H0,$H0
3724
3725	vpaddq		$tmp,$H1,$H1
3726
3727	################################################################
3728
3729	vmovq		%x#$H0,0($ctx)
3730	vmovq		%x#$H1,8($ctx)
3731	vmovq		%x#$H2,16($ctx)
3732	vzeroall
3733
3734.Lno_data_vpmadd52_8x:
3735	ret
3736.cfi_endproc
3737.size	poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3738___
3739}
3740$code.=<<___;
3741.type	poly1305_emit_base2_44,\@function,3
3742.align	32
3743poly1305_emit_base2_44:
3744.cfi_startproc
3745	endbranch
3746	mov	0($ctx),%r8	# load hash value
3747	mov	8($ctx),%r9
3748	mov	16($ctx),%r10
3749
3750	mov	%r9,%rax
3751	shr	\$20,%r9
3752	shl	\$44,%rax
3753	mov	%r10,%rcx
3754	shr	\$40,%r10
3755	shl	\$24,%rcx
3756
3757	add	%rax,%r8
3758	adc	%rcx,%r9
3759	adc	\$0,%r10
3760
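	# final reduction: if h >= 2^130-5 then h+5 carries into bit 130, so
	# adding 5 and checking the bits above 129 (the shr by 2 of the top
	# word) selects between h and h+5 (= h - p mod 2^128) via cmov,
	# without branching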
3761	mov	%r8,%rax
3762	add	\$5,%r8		# compare to modulus
3763	mov	%r9,%rcx
3764	adc	\$0,%r9
3765	adc	\$0,%r10
3766	shr	\$2,%r10	# did 130-bit value overflow?
3767	cmovnz	%r8,%rax
3768	cmovnz	%r9,%rcx
3769
3770	add	0($nonce),%rax	# accumulate nonce
3771	adc	8($nonce),%rcx
3772	mov	%rax,0($mac)	# write result
3773	mov	%rcx,8($mac)
3774
3775	ret
3776.cfi_endproc
3777.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
3778___
3779}	}	}
3780$code.=<<___;
3781.align	64
3782.Lconst:
3783.Lmask24:
3784.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
3785.L129:
3786.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
3787.Lmask26:
3788.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
3789.Lpermd_avx2:
3790.long	2,2,2,3,2,0,2,1
3791.Lpermd_avx512:
3792.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
3793
3794.L2_44_inp_permd:
3795.long	0,1,1,2,2,3,7,7
3796.L2_44_inp_shift:
3797.quad	0,12,24,64
3798.L2_44_mask:
3799.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
3800.L2_44_shift_rgt:
3801.quad	44,44,42,64
3802.L2_44_shift_lft:
3803.quad	8,8,10,64
3804
3805.align	64
3806.Lx_mask44:
3807.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3808.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3809.Lx_mask42:
3810.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3811.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3812___
3813}
3814$code.=<<___;
3815.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3816.align	16
3817___
3818
3819{	# chacha20-poly1305 helpers
3820my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
3821                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
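# Both helpers XOR the input with the key-stream block kept at $otp,
# write the result to $out, and leave the ciphertext, zero-padded to a
# 16-byte boundary, in the $otp buffer, ready to be absorbed by the
# Poly1305 layer; the return value points just past the padded data.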
3822$code.=<<___;
3823.globl	xor128_encrypt_n_pad
3824.type	xor128_encrypt_n_pad,\@abi-omnipotent
3825.align	16
3826xor128_encrypt_n_pad:
3827.cfi_startproc
3828	sub	$otp,$inp
3829	sub	$otp,$out
3830	mov	$len,%r10		# put len aside
3831	shr	\$4,$len		# len / 16
3832	jz	.Ltail_enc
3833	nop
3834.Loop_enc_xmm:
3835	movdqu	($inp,$otp),%xmm0
3836	pxor	($otp),%xmm0
3837	movdqu	%xmm0,($out,$otp)
3838	movdqa	%xmm0,($otp)
3839	lea	16($otp),$otp
3840	dec	$len
3841	jnz	.Loop_enc_xmm
3842
3843	and	\$15,%r10		# len % 16
3844	jz	.Ldone_enc
3845
3846.Ltail_enc:
3847	mov	\$16,$len
3848	sub	%r10,$len
3849	xor	%eax,%eax
3850.Loop_enc_byte:
3851	mov	($inp,$otp),%al
3852	xor	($otp),%al
3853	mov	%al,($out,$otp)
3854	mov	%al,($otp)
3855	lea	1($otp),$otp
3856	dec	%r10
3857	jnz	.Loop_enc_byte
3858
3859	xor	%eax,%eax
3860.Loop_enc_pad:
3861	mov	%al,($otp)
3862	lea	1($otp),$otp
3863	dec	$len
3864	jnz	.Loop_enc_pad
3865
3866.Ldone_enc:
3867	mov	$otp,%rax
3868	ret
3869.cfi_endproc
3870.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
3871
3872.globl	xor128_decrypt_n_pad
3873.type	xor128_decrypt_n_pad,\@abi-omnipotent
3874.align	16
3875xor128_decrypt_n_pad:
3876.cfi_startproc
3877	sub	$otp,$inp
3878	sub	$otp,$out
3879	mov	$len,%r10		# put len aside
3880	shr	\$4,$len		# len / 16
3881	jz	.Ltail_dec
3882	nop
3883.Loop_dec_xmm:
3884	movdqu	($inp,$otp),%xmm0
3885	movdqa	($otp),%xmm1
3886	pxor	%xmm0,%xmm1
3887	movdqu	%xmm1,($out,$otp)
3888	movdqa	%xmm0,($otp)
3889	lea	16($otp),$otp
3890	dec	$len
3891	jnz	.Loop_dec_xmm
3892
3893	pxor	%xmm1,%xmm1
3894	and	\$15,%r10		# len % 16
3895	jz	.Ldone_dec
3896
3897.Ltail_dec:
3898	mov	\$16,$len
3899	sub	%r10,$len
3900	xor	%eax,%eax
3901	xor	%r11,%r11
3902.Loop_dec_byte:
3903	mov	($inp,$otp),%r11b
3904	mov	($otp),%al
3905	xor	%r11b,%al
3906	mov	%al,($out,$otp)
3907	mov	%r11b,($otp)
3908	lea	1($otp),$otp
3909	dec	%r10
3910	jnz	.Loop_dec_byte
3911
3912	xor	%eax,%eax
3913.Loop_dec_pad:
3914	mov	%al,($otp)
3915	lea	1($otp),$otp
3916	dec	$len
3917	jnz	.Loop_dec_pad
3918
3919.Ldone_dec:
3920	mov	$otp,%rax
3921	ret
3922.cfi_endproc
3923.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3924___
3925}
3926
3927# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3928#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
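#
# se_handler serves the integer code paths: it restores %rbx, %rbp and
# %r12-%r15 from the stack frame recorded in HandlerData. avx_handler
# additionally copies %xmm6-%xmm15 back into the CONTEXT from the
# 0x50(%r11)...0xe0(%r11) save area set up by the AVX prologues.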
3929if ($win64) {
3930$rec="%rcx";
3931$frame="%rdx";
3932$context="%r8";
3933$disp="%r9";
3934
3935$code.=<<___;
3936.extern	__imp_RtlVirtualUnwind
3937.type	se_handler,\@abi-omnipotent
3938.align	16
3939se_handler:
3940	push	%rsi
3941	push	%rdi
3942	push	%rbx
3943	push	%rbp
3944	push	%r12
3945	push	%r13
3946	push	%r14
3947	push	%r15
3948	pushfq
3949	sub	\$64,%rsp
3950
3951	mov	120($context),%rax	# pull context->Rax
3952	mov	248($context),%rbx	# pull context->Rip
3953
3954	mov	8($disp),%rsi		# disp->ImageBase
3955	mov	56($disp),%r11		# disp->HandlerData
3956
3957	mov	0(%r11),%r10d		# HandlerData[0]
3958	lea	(%rsi,%r10),%r10	# prologue label
3959	cmp	%r10,%rbx		# context->Rip<.Lprologue
3960	jb	.Lcommon_seh_tail
3961
3962	mov	152($context),%rax	# pull context->Rsp
3963
3964	mov	4(%r11),%r10d		# HandlerData[1]
3965	lea	(%rsi,%r10),%r10	# epilogue label
3966	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
3967	jae	.Lcommon_seh_tail
3968
3969	lea	48(%rax),%rax
3970
3971	mov	-8(%rax),%rbx
3972	mov	-16(%rax),%rbp
3973	mov	-24(%rax),%r12
3974	mov	-32(%rax),%r13
3975	mov	-40(%rax),%r14
3976	mov	-48(%rax),%r15
3977	mov	%rbx,144($context)	# restore context->Rbx
3978	mov	%rbp,160($context)	# restore context->Rbp
3979	mov	%r12,216($context)	# restore context->R12
3980	mov	%r13,224($context)	# restore context->R13
3981	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
3983
3984	jmp	.Lcommon_seh_tail
3985.size	se_handler,.-se_handler
3986
3987.type	avx_handler,\@abi-omnipotent
3988.align	16
3989avx_handler:
3990	push	%rsi
3991	push	%rdi
3992	push	%rbx
3993	push	%rbp
3994	push	%r12
3995	push	%r13
3996	push	%r14
3997	push	%r15
3998	pushfq
3999	sub	\$64,%rsp
4000
4001	mov	120($context),%rax	# pull context->Rax
4002	mov	248($context),%rbx	# pull context->Rip
4003
4004	mov	8($disp),%rsi		# disp->ImageBase
4005	mov	56($disp),%r11		# disp->HandlerData
4006
4007	mov	0(%r11),%r10d		# HandlerData[0]
4008	lea	(%rsi,%r10),%r10	# prologue label
4009	cmp	%r10,%rbx		# context->Rip<prologue label
4010	jb	.Lcommon_seh_tail
4011
4012	mov	152($context),%rax	# pull context->Rsp
4013
4014	mov	4(%r11),%r10d		# HandlerData[1]
4015	lea	(%rsi,%r10),%r10	# epilogue label
4016	cmp	%r10,%rbx		# context->Rip>=epilogue label
4017	jae	.Lcommon_seh_tail
4018
4019	mov	208($context),%rax	# pull context->R11
4020
4021	lea	0x50(%rax),%rsi
4022	lea	0xf8(%rax),%rax
4023	lea	512($context),%rdi	# &context.Xmm6
4024	mov	\$20,%ecx
4025	.long	0xa548f3fc		# cld; rep movsq
4026
4027.Lcommon_seh_tail:
4028	mov	8(%rax),%rdi
4029	mov	16(%rax),%rsi
4030	mov	%rax,152($context)	# restore context->Rsp
4031	mov	%rsi,168($context)	# restore context->Rsi
4032	mov	%rdi,176($context)	# restore context->Rdi
4033
4034	mov	40($disp),%rdi		# disp->ContextRecord
4035	mov	$context,%rsi		# context
4036	mov	\$154,%ecx		# sizeof(CONTEXT)
4037	.long	0xa548f3fc		# cld; rep movsq
4038
4039	mov	$disp,%rsi
4040	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4041	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4042	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4043	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4044	mov	40(%rsi),%r10		# disp->ContextRecord
4045	lea	56(%rsi),%r11		# &disp->HandlerData
4046	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4047	mov	%r10,32(%rsp)		# arg5
4048	mov	%r11,40(%rsp)		# arg6
4049	mov	%r12,48(%rsp)		# arg7
4050	mov	%rcx,56(%rsp)		# arg8, (NULL)
4051	call	*__imp_RtlVirtualUnwind(%rip)
4052
4053	mov	\$1,%eax		# ExceptionContinueSearch
4054	add	\$64,%rsp
4055	popfq
4056	pop	%r15
4057	pop	%r14
4058	pop	%r13
4059	pop	%r12
4060	pop	%rbp
4061	pop	%rbx
4062	pop	%rdi
4063	pop	%rsi
4064	ret
4065.size	avx_handler,.-avx_handler
4066
4067.section	.pdata
4068.align	4
4069	.rva	.LSEH_begin_poly1305_init
4070	.rva	.LSEH_end_poly1305_init
4071	.rva	.LSEH_info_poly1305_init
4072
4073	.rva	.LSEH_begin_poly1305_blocks
4074	.rva	.LSEH_end_poly1305_blocks
4075	.rva	.LSEH_info_poly1305_blocks
4076
4077	.rva	.LSEH_begin_poly1305_emit
4078	.rva	.LSEH_end_poly1305_emit
4079	.rva	.LSEH_info_poly1305_emit
4080___
4081$code.=<<___ if ($avx);
4082	.rva	.LSEH_begin_poly1305_blocks_avx
4083	.rva	.Lbase2_64_avx
4084	.rva	.LSEH_info_poly1305_blocks_avx_1
4085
4086	.rva	.Lbase2_64_avx
4087	.rva	.Leven_avx
4088	.rva	.LSEH_info_poly1305_blocks_avx_2
4089
4090	.rva	.Leven_avx
4091	.rva	.LSEH_end_poly1305_blocks_avx
4092	.rva	.LSEH_info_poly1305_blocks_avx_3
4093
4094	.rva	.LSEH_begin_poly1305_emit_avx
4095	.rva	.LSEH_end_poly1305_emit_avx
4096	.rva	.LSEH_info_poly1305_emit_avx
4097___
4098$code.=<<___ if ($avx>1);
4099	.rva	.LSEH_begin_poly1305_blocks_avx2
4100	.rva	.Lbase2_64_avx2
4101	.rva	.LSEH_info_poly1305_blocks_avx2_1
4102
4103	.rva	.Lbase2_64_avx2
4104	.rva	.Leven_avx2
4105	.rva	.LSEH_info_poly1305_blocks_avx2_2
4106
4107	.rva	.Leven_avx2
4108	.rva	.LSEH_end_poly1305_blocks_avx2
4109	.rva	.LSEH_info_poly1305_blocks_avx2_3
4110___
4111$code.=<<___ if ($avx>2);
4112	.rva	.LSEH_begin_poly1305_blocks_avx512
4113	.rva	.LSEH_end_poly1305_blocks_avx512
4114	.rva	.LSEH_info_poly1305_blocks_avx512
4115___
4116$code.=<<___;
4117.section	.xdata
4118.align	8
4119.LSEH_info_poly1305_init:
4120	.byte	9,0,0,0
4121	.rva	se_handler
4122	.rva	.LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
4123
4124.LSEH_info_poly1305_blocks:
4125	.byte	9,0,0,0
4126	.rva	se_handler
4127	.rva	.Lblocks_body,.Lblocks_epilogue
4128
4129.LSEH_info_poly1305_emit:
4130	.byte	9,0,0,0
4131	.rva	se_handler
4132	.rva	.LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
4133___
4134$code.=<<___ if ($avx);
4135.LSEH_info_poly1305_blocks_avx_1:
4136	.byte	9,0,0,0
4137	.rva	se_handler
4138	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]
4139
4140.LSEH_info_poly1305_blocks_avx_2:
4141	.byte	9,0,0,0
4142	.rva	se_handler
4143	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]
4144
4145.LSEH_info_poly1305_blocks_avx_3:
4146	.byte	9,0,0,0
4147	.rva	avx_handler
4148	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]
4149
4150.LSEH_info_poly1305_emit_avx:
4151	.byte	9,0,0,0
4152	.rva	se_handler
4153	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
4154___
4155$code.=<<___ if ($avx>1);
4156.LSEH_info_poly1305_blocks_avx2_1:
4157	.byte	9,0,0,0
4158	.rva	se_handler
4159	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]
4160
4161.LSEH_info_poly1305_blocks_avx2_2:
4162	.byte	9,0,0,0
4163	.rva	se_handler
4164	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]
4165
4166.LSEH_info_poly1305_blocks_avx2_3:
4167	.byte	9,0,0,0
4168	.rva	avx_handler
4169	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
4170___
4171$code.=<<___ if ($avx>2);
4172.LSEH_info_poly1305_blocks_avx512:
4173	.byte	9,0,0,0
4174	.rva	avx_handler
4175	.rva	.Ldo_avx512_body,.Ldo_avx512_epilogue		# HandlerData[]
4176___
4177}
4178
4179foreach (split('\n',$code)) {
4180	s/\`([^\`]*)\`/eval($1)/ge;
4181	s/%r([a-z]+)#d/%e$1/g;
4182	s/%r([0-9]+)#d/%r$1d/g;
4183	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
4184
4185	print $_,"\n";
4186}
4187close STDOUT or die "error closing STDOUT: $!";
4188