1#! /usr/bin/env perl
2# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements Poly1305 hash for x86.
18#
19# April 2015
20#
21# Numbers are cycles per processed byte with poly1305_blocks alone,
22# measured with rdtsc at fixed clock frequency.
23#
24#		IALU/gcc-3.4(*)	SSE2(**)	AVX2
25# Pentium	15.7/+80%	-
26# PIII		6.21/+90%	-
27# P4		19.8/+40%	3.24
28# Core 2	4.85/+90%	1.80
29# Westmere	4.58/+100%	1.43
30# Sandy Bridge	3.90/+100%	1.36
31# Haswell	3.88/+70%	1.18		0.72
32# Skylake	3.10/+60%	1.14		0.62
33# Silvermont	11.0/+40%	4.80
34# Goldmont	4.10/+200%	2.10
35# VIA Nano	6.71/+90%	2.47
36# Sledgehammer	3.51/+180%	4.27
37# Bulldozer	4.53/+140%	1.31
38#
39# (*)	gcc 4.8 for some reason generated worse code;
# (**)	besides SSE2 there are floating-point and AVX options; FP
#	is deemed unnecessary, because pre-SSE2 processors are too
#	old to care about, while it's not the fastest option on
#	SSE2-capable ones; AVX is omitted, because it doesn't give
#	a lot of improvement, 5-10% depending on processor;
45
# Locate the directory this script lives in so that the perlasm helper
# library can be found regardless of the directory the build runs from.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the file that receives the
# generated assembly.  A failed open used to be ignored, so the generated
# code could be lost without any diagnostic; now the script stops with
# the reason.  The two-argument form of open is kept on purpose, so every
# file name that was accepted before is still interpreted the same way.
$output=pop;
open STDOUT,">$output" or die "can't open $output: $!";

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# $sse2 - set when -DOPENSSL_IA32_SSE2 is among the arguments; enables the
#	  vector code paths;
# $avx  - 0, 1 or 2 depending on the assembler version detected below;
#	  poly1305_init only references the AVX2 entry point when $avx>1.
$sse2=$avx=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

if ($sse2) {
	&static_label("const_sse2");
	&static_label("enter_blocks");
	&static_label("enter_emit");
	&external_label("OPENSSL_ia32cap_P");

	# GNU assembler, probed through the compiler driver named by $CC.
	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
			=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22);
	}

	# NASM is only consulted for the win32n flavour.
	if (!$avx && $ARGV[0] eq "win32n" &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.09) + ($1>=2.10);
	}

	# clang/LLVM integrated assembler.
	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
}
78
79########################################################################
# The opaque area is laid out as follows.
81#
82#	unsigned __int32 h[5];		# current hash value base 2^32
83#	unsigned __int32 pad;		# is_base2_26 in vector context
84#	unsigned __int32 r[4];		# key value base 2^32
85
&align(64);
&function_begin("poly1305_init");
{
	# Register roles used throughout the routine.
	my ($ctx,$key,$ftab)=("edi","esi","ebp");
	my @kw=("eax","ebx","ecx","edx");	# the four 32-bit key words

	&mov	($ctx,&wparam(0));		# context
	&mov	($key,&wparam(1));		# key
	&mov	($ftab,&wparam(2));		# function table

	# Clear h[0..4] and the is_base2_26 word that follows them.
	&xor	("eax","eax");
	for my $w (0..5) {
		&mov	(&DWP(4*$w,$ctx),"eax");
	}

	# Without a key the routine is done; eax is still zero on that path.
	&cmp	($key,0);
	&je	(&label("nokey"));

	if ($sse2) {
		# Code addresses are formed relative to pic_point, whose
		# run-time address is popped into ebx right after the call.
		my $pic=&label("pic_point");
		my $rel=sub { my $sym=shift; &DWP($sym."-".$pic,"ebx"); };
		my $sse2_xmm=1<<26|1<<24;

		&call	($pic);
	    &set_label("pic_point");
		&blindpop("ebx");

		# Start with the integer-only entry points ...
		&lea	("eax",$rel->("poly1305_blocks"));
		&lea	("edx",$rel->("poly1305_emit"));

		&picmeup($ctx,"OPENSSL_ia32cap_P","ebx",$pic);
		&mov	("ecx",&DWP(0,$ctx));
		&and	("ecx",$sse2_xmm);
		&cmp	("ecx",$sse2_xmm);	# SSE2 and XMM?
		&jne	(&label("no_sse2"));

		# ... and upgrade them when both capability bits are set.
		&lea	("eax",$rel->("_poly1305_blocks_sse2"));
		&lea	("edx",$rel->("_poly1305_emit_sse2"));

		if ($avx>1) {
			&mov	("ecx",&DWP(8,$ctx));
			&test	("ecx",1<<5);	# AVX2?
			&jz	(&label("no_sse2"));

			&lea	("eax",$rel->("_poly1305_blocks_avx2"));
		}
	    &set_label("no_sse2");
		&mov	($ctx,&wparam(0));	# reload context
		&mov	(&DWP(0,$ftab),"eax");	# fill function table
		&mov	(&DWP(4,$ftab),"edx");
	}

	# Load the 16-byte key, clamp it, and store it as words 6..9 of the
	# context (r[0..3] in the layout described above).
	my @clamp=(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc);
	for my $w (0..3) { &mov	($kw[$w],&DWP(4*$w,$key)); }	# load input key
	for my $w (0..3) { &and	($kw[$w],$clamp[$w]); }
	for my $w (0..3) { &mov	(&DWP(4*(6+$w),$ctx),$kw[$w]); }

	&mov	("eax",$sse2);
}
&set_label("nokey");
&function_end("poly1305_init");
149
# Byte offsets of the sixteen 32-bit scratch slots addressed off esp in
# poly1305_blocks: hash words, partial products, key words and the
# derived s1..s3 values, in that order.
($h0,$h1,$h2,$h3,$h4,
 $d0,$d1,$d2,$d3,
 $r0,$r1,$r2,$r3,
     $s1,$s2,$s3)=map { 4*$_ } 0..15;
154
# poly1305_blocks - integer-only block function.
#
# Arguments: wparam(0) ctx, wparam(1) inp, wparam(2) len, wparam(3) padbit.
# The hash is kept as five 32-bit words h0..h4 (base 2^32) in the first
# five words of ctx; the key words r0..r3 are read from words 6..9.
#
# A 16-dword scratch frame is pushed and addressed through the $h*, $d*,
# $r* and $s* offsets.  s1..s3 are computed as r + (r>>2) from r1..r3.
#
# The enter_blocks label is also the target of a jump from the SSE2
# routine, which arrives with edi/esi/ecx already loaded.
&function_begin("poly1305_blocks");
	&mov	("edi",&wparam(0));		# ctx
	&mov	("esi",&wparam(1));		# inp
	&mov	("ecx",&wparam(2));		# len
&set_label("enter_blocks");
	# Round len down to a whole number of 16-byte blocks.  The mask
	# must clear all four low bits: the loop below advances inp by
	# exactly 16 and only stops when it is *equal* to inp+len, so a
	# stray low bit would make the exit test unreachable.
	&and	("ecx",-16);
	&jz	(&label("nodata"));

	&stack_push(16);
	&mov	("eax",&DWP(4*6,"edi"));	# r0
	&mov	("ebx",&DWP(4*7,"edi"));	# r1
	 &lea	("ebp",&DWP(0,"esi","ecx"));	# end of input
	&mov	("ecx",&DWP(4*8,"edi"));	# r2
	&mov	("edx",&DWP(4*9,"edi"));	# r3

	&mov	(&wparam(2),"ebp");		# len slot now holds end pointer
	&mov	("ebp","esi");			# ebp walks the input

	&mov	(&DWP($r0,"esp"),"eax");	# r0
	&mov	("eax","ebx");
	&shr	("eax",2);
	&mov	(&DWP($r1,"esp"),"ebx");	# r1
	&add	("eax","ebx");			# s1
	&mov	("ebx","ecx");
	&shr	("ebx",2);
	&mov	(&DWP($r2,"esp"),"ecx");	# r2
	&add	("ebx","ecx");			# s2
	&mov	("ecx","edx");
	&shr	("ecx",2);
	&mov	(&DWP($r3,"esp"),"edx");	# r3
	&add	("ecx","edx");			# s3
	&mov	(&DWP($s1,"esp"),"eax");	# s1
	&mov	(&DWP($s2,"esp"),"ebx");	# s2
	&mov	(&DWP($s3,"esp"),"ecx");	# s3

	&mov	("eax",&DWP(4*0,"edi"));	# load hash value
	&mov	("ebx",&DWP(4*1,"edi"));
	&mov	("ecx",&DWP(4*2,"edi"));
	&mov	("esi",&DWP(4*3,"edi"));
	&mov	("edi",&DWP(4*4,"edi"));
	&jmp	(&label("loop"));

&set_label("loop",32);
	&add	("eax",&DWP(4*0,"ebp"));	# accumulate input
	&adc	("ebx",&DWP(4*1,"ebp"));
	&adc	("ecx",&DWP(4*2,"ebp"));
	&adc	("esi",&DWP(4*3,"ebp"));
	&lea	("ebp",&DWP(4*4,"ebp"));	# lea leaves the carry intact
	&adc	("edi",&wparam(3));		# padbit

	&mov	(&DWP($h0,"esp"),"eax");	# put aside hash[+inp]
	&mov	(&DWP($h3,"esp"),"esi");

	# d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1, accumulated in esi:edi
	&mul	(&DWP($r0,"esp"));		# h0*r0
	 &mov	(&DWP($h4,"esp"),"edi");
	&mov	("edi","eax");
	&mov	("eax","ebx");			# h1
	&mov	("esi","edx");
	&mul	(&DWP($s3,"esp"));		# h1*s3
	&add	("edi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("esi","edx");
	&mul	(&DWP($s2,"esp"));		# h2*s2
	&add	("edi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("esi","edx");
	&mul	(&DWP($s1,"esp"));		# h3*s1
	&add	("edi","eax");
	 &mov	("eax",&DWP($h0,"esp"));
	&adc	("esi","edx");

	# d1 = h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1, accumulated in edi:esi
	&mul	(&DWP($r1,"esp"));		# h0*r1
	 &mov	(&DWP($d0,"esp"),"edi");
	&xor	("edi","edi");
	&add	("esi","eax");
	&mov	("eax","ebx");			# h1
	&adc	("edi","edx");
	&mul	(&DWP($r0,"esp"));		# h1*r0
	&add	("esi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("edi","edx");
	&mul	(&DWP($s3,"esp"));		# h2*s3
	&add	("esi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("edi","edx");
	&mul	(&DWP($s2,"esp"));		# h3*s2
	&add	("esi","eax");
	&mov	("eax",&DWP($h4,"esp"));
	&adc	("edi","edx");
	&imul	("eax",&DWP($s1,"esp"));	# h4*s1
	&add	("esi","eax");
	 &mov	("eax",&DWP($h0,"esp"));
	&adc	("edi",0);

	# d2 = h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2, accumulated in esi:edi
	&mul	(&DWP($r2,"esp"));		# h0*r2
	 &mov	(&DWP($d1,"esp"),"esi");
	&xor	("esi","esi");
	&add	("edi","eax");
	&mov	("eax","ebx");			# h1
	&adc	("esi","edx");
	&mul	(&DWP($r1,"esp"));		# h1*r1
	&add	("edi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("esi","edx");
	&mul	(&DWP($r0,"esp"));		# h2*r0
	&add	("edi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("esi","edx");
	&mul	(&DWP($s3,"esp"));		# h3*s3
	&add	("edi","eax");
	&mov	("eax",&DWP($h4,"esp"));
	&adc	("esi","edx");
	&imul	("eax",&DWP($s2,"esp"));	# h4*s2
	&add	("edi","eax");
	 &mov	("eax",&DWP($h0,"esp"));
	&adc	("esi",0);

	# d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3, accumulated in edi:esi
	&mul	(&DWP($r3,"esp"));		# h0*r3
	 &mov	(&DWP($d2,"esp"),"edi");
	&xor	("edi","edi");
	&add	("esi","eax");
	&mov	("eax","ebx");			# h1
	&adc	("edi","edx");
	&mul	(&DWP($r2,"esp"));		# h1*r2
	&add	("esi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("edi","edx");
	&mul	(&DWP($r1,"esp"));		# h2*r1
	&add	("esi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("edi","edx");
	&mul	(&DWP($r0,"esp"));		# h3*r0
	&add	("esi","eax");
	 &mov	("ecx",&DWP($h4,"esp"));
	&adc	("edi","edx");

	&mov	("edx","ecx");
	&imul	("ecx",&DWP($s3,"esp"));	# h4*s3
	&add	("esi","ecx");
	 &mov	("eax",&DWP($d0,"esp"));
	&adc	("edi",0);

	# d4 = h4*r0 plus the carry out of d3
	&imul	("edx",&DWP($r0,"esp"));	# h4*r0
	&add	("edx","edi");

	&mov	("ebx",&DWP($d1,"esp"));
	&mov	("ecx",&DWP($d2,"esp"));

	# Keep the low two bits of d4 as the new h4 and add the rest,
	# multiplied by 5, back into the bottom of the accumulator.
	&mov	("edi","edx");			# last reduction step
	&shr	("edx",2);
	&and	("edi",3);
	&lea	("edx",&DWP(0,"edx","edx",4));	# *5
	&add	("eax","edx");
	&adc	("ebx",0);
	&adc	("ecx",0);
	&adc	("esi",0);
	&adc	("edi",0);

	&cmp	("ebp",&wparam(2));		# done yet?
	&jne	(&label("loop"));

	&mov	("edx",&wparam(0));		# ctx
	&stack_pop(16);
	&mov	(&DWP(4*0,"edx"),"eax");	# store hash value
	&mov	(&DWP(4*1,"edx"),"ebx");
	&mov	(&DWP(4*2,"edx"),"ecx");
	&mov	(&DWP(4*3,"edx"),"esi");
	&mov	(&DWP(4*4,"edx"),"edi");
&set_label("nodata");
&function_end("poly1305_blocks");
325
# poly1305_emit - final reduction and tag output.
#
# Arguments: wparam(0) context, wparam(1) output buffer, wparam(2) a
# pointer to four 32-bit words that are added to the result ("accumulate
# key" below).  The enter_emit label is declared static near the top of
# the file so that it can be referenced from outside this function.
&function_begin("poly1305_emit");
	&mov	("ebp",&wparam(0));		# context
&set_label("enter_emit");
{
	my @h=("eax","ebx","ecx","edx");	# low four hash words
	my ($ctx,$out,$sel)=("ebp","edi","esi");
	my $word=sub { my ($i,$base)=@_; &DWP(4*$i,$base); };

	&mov	($out,&wparam(1));		# output
	for my $i (0..3) { &mov	($h[$i],$word->($i,$ctx)); }	# load hash value
	&mov	($sel,$word->(4,$ctx));

	# h+5 carries into bit 2 of the top word exactly when h is not
	# below the modulus; that bit is turned into an all-ones or
	# all-zeros selection mask.
	&add	($h[0],5);			# compare to modulus
	for my $i (1..3) { &adc	($h[$i],0); }
	&adc	($sel,0);
	&shr	($sel,2);			# did it carry/borrow?
	&neg	($sel);				# do we choose hash-modulus?

	# Park the masked h+5 in the output buffer ...
	for my $i (0..3) { &and	($h[$i],$sel); }
	for my $i (0..3) { &mov	($word->($i,$out),$h[$i]); }

	# ... and merge it with the oppositely masked original value.
	&not	($sel);				# or original hash value?
	for my $i (0..3) { &mov	($h[$i],$word->($i,$ctx)); }
	&mov	("ebp",&wparam(2));
	for my $i (0..3) { &and	($h[$i],$sel); }
	for my $i (0..3) { &or	($h[$i],$word->($i,$out)); }

	&add	($h[0],$word->(0,"ebp"));	# accumulate key
	for my $i (1..3) { &adc	($h[$i],$word->($i,"ebp")); }

	for my $i (0..3) { &mov	($word->($i,$out),$h[$i]); }
}
&function_end("poly1305_emit");
378
379if ($sse2) {
380########################################################################
# The opaque area is laid out as follows.
382#
383#	unsigned __int32 h[5];		# current hash value base 2^26
384#	unsigned __int32 is_base2_26;
385#	unsigned __int32 r[4];		# key value base 2^32
386#	unsigned __int32 pad[2];
387#	struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9];
388#
# where r^n are the base 2^26 digits of the powers of the multiplier key.
# There are 5 digits, and the last four are also kept multiplied by 5,
# totalling 9 elements. The table is written in the order r0, r1, r2, r3,
# r4, 5*r1, 5*r2, 5*r3, 5*r4.
392
# Symbolic names for xmm0..xmm7: five accumulators followed by three
# temporaries.
my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map { "xmm$_" } 0..7;
my $MASK=$T2;	# shares a register with $T2, so it has to be reloaded
		# whenever $T2 has been used as a temporary
395
# _poly1305_init_sse2 - build the table of key powers used by the SSE2
# code.  It is called from _poly1305_blocks_sse2 with edi pointing at the
# context and ebx at const_sse2.
#
# The key is read from context words 6..9 (base 2^32), split into five
# limbs, and then run through the "square" loop below, which executes
# twice (ecx=2).  A 16-byte aligned scratch area of 9+5 vectors is carved
# out below the caller's esp; the original esp is kept in ebp.
&align	(32);
&function_begin_B("_poly1305_init_sse2");
	&movdqu		($D4,&QWP(4*6,"edi"));		# key base 2^32
	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
	&mov		("ebp","esp");
	&sub		("esp",16*(9+5));
	&and		("esp",-16);

	#&pand		($D4,&QWP(96,"ebx"));		# magic mask
	&movq		($MASK,&QWP(64,"ebx"));		# presumably the 26-bit limb
							# mask; const_sse2 is defined
							# elsewhere in the file

	&movdqa		($D0,$D4);
	&movdqa		($D1,$D4);
	&movdqa		($D2,$D4);

	# Limb k starts at bit 26*k of the key.  psrldq shifts the whole
	# register by bytes, psrlq shifts within the low quadword.
	&pand		($D0,$MASK);			# -> base 2^26
	&psrlq		($D1,26);
	&psrldq		($D2,6);
	&pand		($D1,$MASK);
	&movdqa		($D3,$D2);
	# The statement terminator was missing on the next line, so the two
	# psrlq calls were parsed as a single bitwise-AND expression.  They
	# were still emitted in this order; now each stands alone.
	&psrlq		($D2,4);
	&psrlq		($D3,30);
	&pand		($D2,$MASK);
	&pand		($D3,$MASK);
	&psrldq		($D4,13);

	&lea		("edx",&DWP(16*9,"esp"));	# size optimization
	&mov		("ecx",2);
&set_label("square");
	# Save the current limbs at esp[0..4] ...
	&movdqa		(&QWP(16*0,"esp"),$D0);
	&movdqa		(&QWP(16*1,"esp"),$D1);
	&movdqa		(&QWP(16*2,"esp"),$D2);
	&movdqa		(&QWP(16*3,"esp"),$D3);
	&movdqa		(&QWP(16*4,"esp"),$D4);

	# ... and limbs 1..4 multiplied by 5 (x*4+x) at esp[5..8].
	&movdqa		($T1,$D1);
	&movdqa		($T0,$D2);
	&pslld		($T1,2);
	&pslld		($T0,2);
	&paddd		($T1,$D1);			# *5
	&paddd		($T0,$D2);			# *5
	&movdqa		(&QWP(16*5,"esp"),$T1);
	&movdqa		(&QWP(16*6,"esp"),$T0);
	&movdqa		($T1,$D3);
	&movdqa		($T0,$D4);
	&pslld		($T1,2);
	&pslld		($T0,2);
	&paddd		($T1,$D3);			# *5
	&paddd		($T0,$D4);			# *5
	&movdqa		(&QWP(16*7,"esp"),$T1);
	&movdqa		(&QWP(16*8,"esp"),$T0);

	# Copy the low quadword of every limb into both halves and keep
	# those copies at edx[0..4] as the second multiplication operand.
	# $T0 keeps the unshuffled limb 1 for the multiply-accumulate
	# sequence that follows.
	&pshufd		($T1,$D0,0b01000100);
	&movdqa		($T0,$D1);
	&pshufd		($D1,$D1,0b01000100);
	&pshufd		($D2,$D2,0b01000100);
	&pshufd		($D3,$D3,0b01000100);
	&pshufd		($D4,$D4,0b01000100);
	&movdqa		(&QWP(16*0,"edx"),$T1);
	&movdqa		(&QWP(16*1,"edx"),$D1);
	&movdqa		(&QWP(16*2,"edx"),$D2);
	&movdqa		(&QWP(16*3,"edx"),$D3);
	&movdqa		(&QWP(16*4,"edx"),$D4);

	################################################################
	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	&pmuludq	($D4,$D0);			# h4*r0
	&pmuludq	($D3,$D0);			# h3*r0
	&pmuludq	($D2,$D0);			# h2*r0
	&pmuludq	($D1,$D0);			# h1*r0
	&pmuludq	($D0,$T1);			# h0*r0
472
# pmuladd(FETCH [, HBASE]) - emit the multiply-accumulate sequence that
# adds every product involving r1..r4 and s1..s4 to $D0..$D4.
#
#   FETCH  code reference called as FETCH->(register, index); it has to
#          emit an instruction loading table entry <index> into
#          <register> (2..4 select r2..r4, 5..8 select s1..s4);
#   HBASE  register addressing the five h limbs, 16 bytes apart;
#          defaults to "esp".
#
# On entry $D0..$D4 already hold the h*r0 products and $T0 holds r1.
# $T2 doubles as $MASK, which is therefore reloaded near the end.
sub pmuladd {
my ($fetch,$hbase)=@_;
$hbase = "esp" unless defined($hbase);
my $h = sub { my $i=shift; &QWP(16*$i,$hbase); };	# operand for limb h<i>

	################################################################
	# $T0-$T2 are "rotated" so that each paddq lands after the next
	# multiplication. It makes the code harder to read and has little
	# effect on most processors, but it makes a lot of difference on
	# Atom, up to 30% improvement.

	&movdqa		($T1,$T0);
	&pmuludq	($T0,$h->(3));			# r1*h3
	&movdqa		($T2,$T1);
	&pmuludq	($T1,$h->(2));			# r1*h2
	&paddq		($D4,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,$h->(1));			# r1*h1
	&paddq		($D3,$T1);
	$fetch->($T1,5);				# s1
	&pmuludq	($T0,$h->(0));			# r1*h0
	&paddq		($D2,$T2);
	&pmuludq	($T1,$h->(4));			# s1*h4
	 $fetch->($T2,2);				# r2^n
	&paddq		($D1,$T0);

	&movdqa		($T0,$T2);
	&pmuludq	($T2,$h->(2));			# r2*h2
	 &paddq		($D0,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,$h->(1));			# r2*h1
	&paddq		($D4,$T2);
	$fetch->($T2,6);				# s2^n
	&pmuludq	($T1,$h->(0));			# r2*h0
	&paddq		($D3,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,$h->(4));			# s2*h4
	&paddq		($D2,$T1);
	&pmuludq	($T0,$h->(3));			# s2*h3
	 $fetch->($T1,3);				# r3^n
	&paddq		($D1,$T2);

	&movdqa		($T2,$T1);
	&pmuludq	($T1,$h->(1));			# r3*h1
	 &paddq		($D0,$T0);
	$fetch->($T0,7);				# s3^n
	&pmuludq	($T2,$h->(0));			# r3*h0
	&paddq		($D4,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,$h->(4));			# s3*h4
	&paddq		($D3,$T2);
	&movdqa		($T2,$T1);
	&pmuludq	($T1,$h->(3));			# s3*h3
	&paddq		($D2,$T0);
	&pmuludq	($T2,$h->(2));			# s3*h2
	 $fetch->($T0,4);				# r4^n
	&paddq		($D1,$T1);

	$fetch->($T1,8);				# s4^n
	&pmuludq	($T0,$h->(0));			# r4*h0
	 &paddq		($D0,$T2);
	&movdqa		($T2,$T1);
	&pmuludq	($T1,$h->(4));			# s4*h4
	&paddq		($D4,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,$h->(1));			# s4*h1
	&paddq		($D3,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,$h->(2));			# s4*h2
	&paddq		($D0,$T2);
	&pmuludq	($T1,$h->(3));			# s4*h3
	 &movdqa	($MASK,&QWP(64,"ebx"));
	&paddq		($D1,$T0);
	&paddq		($D2,$T1);
}
	# Table entries come from the copies saved on the stack, while the
	# other operand is read through edx.
	&pmuladd	(sub {	my ($xmm,$slot)=@_;
				&movdqa ($xmm,&QWP(16*$slot,"esp"));
			     },"edx");
550
# lazy_reduction([EXTRA]) - emit the carry-propagation sequence that
# brings the five accumulators $D0..$D4 back to masked limbs.
#
#   EXTRA  optional code reference; when given, it is called once, after
#          the third instruction, so that a caller can slot additional
#          instructions into the schedule.
#
# Two carry chains are interleaved, h3->h4->h0->h1 (indented by one
# space) and h0->h1->h2->h3->h4. The carry out of h4 is added to h0 once
# with paddd and once more after psllq by 2, i.e. multiplied by 5 in
# total. $MASK must hold the limb mask on entry; $T0 and $T1 are
# clobbered.
sub lazy_reduction {
my $extra = shift;

	################################################################
	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	# and P. Schwabe
	#
	# [(*) see discussion in poly1305-armv4 module]

	 &movdqa	($T0,$D3);
	 &pand		($D3,$MASK);
	 &psrlq		($T0,26);
	 &$extra	()				if (defined($extra));
	 &paddq		($T0,$D4);			# h3 -> h4
	&movdqa		($T1,$D0);
	&pand		($D0,$MASK);
	&psrlq		($T1,26);
	 &movdqa	($D4,$T0);
	&paddq		($T1,$D1);			# h0 -> h1
	 &psrlq		($T0,26);
	 &pand		($D4,$MASK);
	&movdqa		($D1,$T1);
	&psrlq		($T1,26);
	 &paddd		($D0,$T0);			# favour paddd when
							# possible, because
							# paddq is "broken"
							# on Atom
	 &psllq		($T0,2);
	&paddq		($T1,$D2);			# h1 -> h2
	 &paddq		($T0,$D0);			# h4 -> h0 (*)
	&pand		($D1,$MASK);
	&movdqa		($D2,$T1);
	&psrlq		($T1,26);
	&pand		($D2,$MASK);
	&paddd		($T1,$D3);			# h2 -> h3
	 &movdqa	($D0,$T0);
	 &psrlq		($T0,26);
	&movdqa		($D3,$T1);
	&psrlq		($T1,26);
	 &pand		($D0,$MASK);
	 &paddd		($D1,$T0);			# h0 -> h1
	&pand		($D3,$MASK);
	&paddd		($D4,$T1);			# h3 -> h4
}
	&lazy_reduction	();
596
597	&dec		("ecx");
598	&jz		(&label("square_break"));
599
600	&punpcklqdq	($D0,&QWP(16*0,"esp"));		# 0:r^1:0:r^2
601	&punpcklqdq	($D1,&QWP(16*1,"esp"));
602	&punpcklqdq	($D2,&QWP(16*2,"esp"));
603	&punpcklqdq	($D3,&QWP(16*3,"esp"));
604	&punpcklqdq	($D4,&QWP(16*4,"esp"));
605	&jmp		(&label("square"));
606
607&set_label("square_break");
608	&psllq		($D0,32);			# -> r^3:0:r^4:0
609	&psllq		($D1,32);
610	&psllq		($D2,32);
611	&psllq		($D3,32);
612	&psllq		($D4,32);
613	&por		($D0,&QWP(16*0,"esp"));		# r^3:r^1:r^4:r^2
614	&por		($D1,&QWP(16*1,"esp"));
615	&por		($D2,&QWP(16*2,"esp"));
616	&por		($D3,&QWP(16*3,"esp"));
617	&por		($D4,&QWP(16*4,"esp"));
618
619	&pshufd		($D0,$D0,0b10001101);		# -> r^1:r^2:r^3:r^4
620	&pshufd		($D1,$D1,0b10001101);
621	&pshufd		($D2,$D2,0b10001101);
622	&pshufd		($D3,$D3,0b10001101);
623	&pshufd		($D4,$D4,0b10001101);
624
625	&movdqu		(&QWP(16*0,"edi"),$D0);		# save the table
626	&movdqu		(&QWP(16*1,"edi"),$D1);
627	&movdqu		(&QWP(16*2,"edi"),$D2);
628	&movdqu		(&QWP(16*3,"edi"),$D3);
629	&movdqu		(&QWP(16*4,"edi"),$D4);
630
631	&movdqa		($T1,$D1);
632	&movdqa		($T0,$D2);
633	&pslld		($T1,2);
634	&pslld		($T0,2);
635	&paddd		($T1,$D1);			# *5
636	&paddd		($T0,$D2);			# *5
637	&movdqu		(&QWP(16*5,"edi"),$T1);
638	&movdqu		(&QWP(16*6,"edi"),$T0);
639	&movdqa		($T1,$D3);
640	&movdqa		($T0,$D4);
641	&pslld		($T1,2);
642	&pslld		($T0,2);
643	&paddd		($T1,$D3);			# *5
644	&paddd		($T0,$D4);			# *5
645	&movdqu		(&QWP(16*7,"edi"),$T1);
646	&movdqu		(&QWP(16*8,"edi"),$T0);
647
648	&mov		("esp","ebp");
649	&lea		("edi",&DWP(-16*3,"edi"));	# size de-optimization
650	&ret		();
651&function_end_B("_poly1305_init_sse2");
652
653&align	(32);
654&function_begin("_poly1305_blocks_sse2");
655	&mov	("edi",&wparam(0));			# ctx
656	&mov	("esi",&wparam(1));			# inp
657	&mov	("ecx",&wparam(2));			# len
658
659	&mov	("eax",&DWP(4*5,"edi"));		# is_base2_26
660	&and	("ecx",-16);
661	&jz	(&label("nodata"));
662	&cmp	("ecx",64);
663	&jae	(&label("enter_sse2"));
664	&test	("eax","eax");				# is_base2_26?
665	&jz	(&label("enter_blocks"));
666
667&set_label("enter_sse2",16);
668	&call	(&label("pic_point"));
669&set_label("pic_point");
670	&blindpop("ebx");
671	&lea	("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));
672
673	&test	("eax","eax");				# is_base2_26?
674	&jnz	(&label("base2_26"));
675
676	&call	("_poly1305_init_sse2");
677
678	################################################# base 2^32 -> base 2^26
679	&mov	("eax",&DWP(0,"edi"));
680	&mov	("ecx",&DWP(3,"edi"));
681	&mov	("edx",&DWP(6,"edi"));
682	&mov	("esi",&DWP(9,"edi"));
683	&mov	("ebp",&DWP(13,"edi"));
684	&mov	(&DWP(4*5,"edi"),1);			# is_base2_26
685
686	&shr	("ecx",2);
687	&and	("eax",0x3ffffff);
688	&shr	("edx",4);
689	&and	("ecx",0x3ffffff);
690	&shr	("esi",6);
691	&and	("edx",0x3ffffff);
692
693	&movd	($D0,"eax");
694	&movd	($D1,"ecx");
695	&movd	($D2,"edx");
696	&movd	($D3,"esi");
697	&movd	($D4,"ebp");
698
699	&mov	("esi",&wparam(1));			# [reload] inp
700	&mov	("ecx",&wparam(2));			# [reload] len
701	&jmp	(&label("base2_32"));
702
703&set_label("base2_26",16);
704	&movd	($D0,&DWP(4*0,"edi"));			# load hash value
705	&movd	($D1,&DWP(4*1,"edi"));
706	&movd	($D2,&DWP(4*2,"edi"));
707	&movd	($D3,&DWP(4*3,"edi"));
708	&movd	($D4,&DWP(4*4,"edi"));
709	&movdqa	($MASK,&QWP(64,"ebx"));
710
711&set_label("base2_32");
712	&mov	("eax",&wparam(3));			# padbit
713	&mov	("ebp","esp");
714
715	&sub	("esp",16*(5+5+5+9+9));
716	&and	("esp",-16);
717
718	&lea	("edi",&DWP(16*3,"edi"));		# size optimization
719	&shl	("eax",24);				# padbit
720
721	&test	("ecx",31);
722	&jz	(&label("even"));
723
724	################################################################
725	# process single block, with SSE2, because it's still faster
726	# even though half of result is discarded
727
728	&movdqu		($T1,&QWP(0,"esi"));		# input
729	&lea		("esi",&DWP(16,"esi"));
730
731	&movdqa		($T0,$T1);			# -> base 2^26 ...
732	&pand		($T1,$MASK);
733	&paddd		($D0,$T1);			# ... and accumulate
734
735	&movdqa		($T1,$T0);
736	&psrlq		($T0,26);
737	&psrldq		($T1,6);
738	&pand		($T0,$MASK);
739	&paddd		($D1,$T0);
740
741	&movdqa		($T0,$T1);
742	&psrlq		($T1,4);
743	&pand		($T1,$MASK);
744	&paddd		($D2,$T1);
745
746	&movdqa		($T1,$T0);
747	&psrlq		($T0,30);
748	&pand		($T0,$MASK);
749	&psrldq		($T1,7);
750	&paddd		($D3,$T0);
751
752	&movd		($T0,"eax");			# padbit
753	&paddd		($D4,$T1);
754	 &movd		($T1,&DWP(16*0+12,"edi"));	# r0
755	&paddd		($D4,$T0);
756
757	&movdqa		(&QWP(16*0,"esp"),$D0);
758	&movdqa		(&QWP(16*1,"esp"),$D1);
759	&movdqa		(&QWP(16*2,"esp"),$D2);
760	&movdqa		(&QWP(16*3,"esp"),$D3);
761	&movdqa		(&QWP(16*4,"esp"),$D4);
762
763	################################################################
764	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
765	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
766	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
767	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
768	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
769
770	&pmuludq	($D0,$T1);			# h4*r0
771	&pmuludq	($D1,$T1);			# h3*r0
772	&pmuludq	($D2,$T1);			# h2*r0
773	 &movd		($T0,&DWP(16*1+12,"edi"));	# r1
774	&pmuludq	($D3,$T1);			# h1*r0
775	&pmuludq	($D4,$T1);			# h0*r0
776
777	&pmuladd	(sub {	my ($reg,$i)=@_;
778				&movd ($reg,&DWP(16*$i+12,"edi"));
779			     });
780
781	&lazy_reduction	();
782
783	&sub		("ecx",16);
784	&jz		(&label("done"));
785
786&set_label("even");
787	&lea		("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization
788	&lea		("eax",&DWP(-16*2,"esi"));
789	&sub		("ecx",64);
790
791	################################################################
792	# expand and copy pre-calculated table to stack
793
794	&movdqu		($T0,&QWP(16*0,"edi"));		# r^1:r^2:r^3:r^4
795	&pshufd		($T1,$T0,0b01000100);		# duplicate r^3:r^4
796	&cmovb		("esi","eax");
797	&pshufd		($T0,$T0,0b11101110);		# duplicate r^1:r^2
798	&movdqa		(&QWP(16*0,"edx"),$T1);
799	&lea		("eax",&DWP(16*10,"esp"));
800	&movdqu		($T1,&QWP(16*1,"edi"));
801	&movdqa		(&QWP(16*(0-9),"edx"),$T0);
802	&pshufd		($T0,$T1,0b01000100);
803	&pshufd		($T1,$T1,0b11101110);
804	&movdqa		(&QWP(16*1,"edx"),$T0);
805	&movdqu		($T0,&QWP(16*2,"edi"));
806	&movdqa		(&QWP(16*(1-9),"edx"),$T1);
807	&pshufd		($T1,$T0,0b01000100);
808	&pshufd		($T0,$T0,0b11101110);
809	&movdqa		(&QWP(16*2,"edx"),$T1);
810	&movdqu		($T1,&QWP(16*3,"edi"));
811	&movdqa		(&QWP(16*(2-9),"edx"),$T0);
812	&pshufd		($T0,$T1,0b01000100);
813	&pshufd		($T1,$T1,0b11101110);
814	&movdqa		(&QWP(16*3,"edx"),$T0);
815	&movdqu		($T0,&QWP(16*4,"edi"));
816	&movdqa		(&QWP(16*(3-9),"edx"),$T1);
817	&pshufd		($T1,$T0,0b01000100);
818	&pshufd		($T0,$T0,0b11101110);
819	&movdqa		(&QWP(16*4,"edx"),$T1);
820	&movdqu		($T1,&QWP(16*5,"edi"));
821	&movdqa		(&QWP(16*(4-9),"edx"),$T0);
822	&pshufd		($T0,$T1,0b01000100);
823	&pshufd		($T1,$T1,0b11101110);
824	&movdqa		(&QWP(16*5,"edx"),$T0);
825	&movdqu		($T0,&QWP(16*6,"edi"));
826	&movdqa		(&QWP(16*(5-9),"edx"),$T1);
827	&pshufd		($T1,$T0,0b01000100);
828	&pshufd		($T0,$T0,0b11101110);
829	&movdqa		(&QWP(16*6,"edx"),$T1);
830	&movdqu		($T1,&QWP(16*7,"edi"));
831	&movdqa		(&QWP(16*(6-9),"edx"),$T0);
832	&pshufd		($T0,$T1,0b01000100);
833	&pshufd		($T1,$T1,0b11101110);
834	&movdqa		(&QWP(16*7,"edx"),$T0);
835	&movdqu		($T0,&QWP(16*8,"edi"));
836	&movdqa		(&QWP(16*(7-9),"edx"),$T1);
837	&pshufd		($T1,$T0,0b01000100);
838	&pshufd		($T0,$T0,0b11101110);
839	&movdqa		(&QWP(16*8,"edx"),$T1);
840	&movdqa		(&QWP(16*(8-9),"edx"),$T0);
841
# load_input(INPOFF, SPILL) - emit code that reads two 16-byte blocks at
# INPOFF(esi) and INPOFF+16(esi), advances esi by 32 and splits the pair
# into five limb vectors left in $T0, $T1, $D2, $D3 and $D4.
#
# The previous $D2..$D4 are first saved at stack slots 2..4 counted from
# byte offset SPILL off esp; $D0 and $D1 are saved at slots 0 and 1 only
# when SPILL is non-zero. $MASK must hold the limb mask on entry.
sub load_input {
my ($inpoff,$spill)=@_;
my $slot = sub { my $i=shift; &QWP($spill+16*$i,"esp"); };

	&movdqu		($T0,&QWP($inpoff+0,"esi"));	# load input
	&movdqu		($T1,&QWP($inpoff+16,"esi"));
	&lea		("esi",&DWP(16*2,"esi"));

	&movdqa		($slot->(2),$D2);
	&movdqa		($slot->(3),$D3);
	&movdqa		($slot->(4),$D4);

	&movdqa		($D2,$T0);			# splat input
	&movdqa		($D3,$T1);
	&psrldq		($D2,6);
	&psrldq		($D3,6);
	&movdqa		($D4,$T0);
	&punpcklqdq	($D2,$D3);			# 2:3
	&punpckhqdq	($D4,$T1);			# 4
	&punpcklqdq	($T0,$T1);			# 0:1

	&movdqa		($D3,$D2);
	&psrlq		($D2,4);
	&psrlq		($D3,30);
	&movdqa		($T1,$T0);
	&psrlq		($D4,40);			# 4
	&psrlq		($T1,26);
	&pand		($T0,$MASK);			# 0
	&pand		($T1,$MASK);			# 1
	&pand		($D2,$MASK);			# 2
	&pand		($D3,$MASK);			# 3
	&por		($D4,&QWP(0,"ebx"));		# padbit, yes, always

	if ($spill) {
		&movdqa	($slot->(0),$D0);
		&movdqa	($slot->(1),$D1);
	}
}
877	&load_input	(16*2,16*5);
878
879	&jbe		(&label("skip_loop"));
880	&jmp		(&label("loop"));
881
882&set_label("loop",32);
883	################################################################
884	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
885	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
886	#   \___________________/
887	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
888	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
889	#   \___________________/ \____________________/
890	################################################################
891
892	&movdqa		($T2,&QWP(16*(0-9),"edx"));	# r0^2
893	&movdqa		(&QWP(16*1,"eax"),$T1);
894	&movdqa		(&QWP(16*2,"eax"),$D2);
895	&movdqa		(&QWP(16*3,"eax"),$D3);
896	&movdqa		(&QWP(16*4,"eax"),$D4);
897
898	################################################################
899	# d4 = h4*r0 + h0*r4   + h1*r3   + h2*r2   + h3*r1
900	# d3 = h3*r0 + h0*r3   + h1*r2   + h2*r1   + h4*5*r4
901	# d2 = h2*r0 + h0*r2   + h1*r1   + h3*5*r4 + h4*5*r3
902	# d1 = h1*r0 + h0*r1   + h2*5*r4 + h3*5*r3 + h4*5*r2
903	# d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
904
905	&movdqa		($D1,$T0);
906	&pmuludq	($T0,$T2);			# h0*r0
907	&movdqa		($D0,$T1);
908	&pmuludq	($T1,$T2);			# h1*r0
909	&pmuludq	($D2,$T2);			# h2*r0
910	&pmuludq	($D3,$T2);			# h3*r0
911	&pmuludq	($D4,$T2);			# h4*r0
912
# pmuladd_alt(ADDR) - emit the multiply-accumulate sequence of the main
# loop, with the hash limbs in registers or memory and the key table
# supplied by the caller.
#
#   ADDR  code reference called as ADDR->(index); it has to return the
#         memory operand of table entry <index> (1..4 are r1..r4 and
#         5..8 are s1..s4).
#
# On entry $T0, $T1, $D2, $D3 and $D4 hold the h0..h4 products with r0,
# $D1 holds h0 and $D0 holds h1; h1..h4 are re-read from eax[1..4].
# The sums are left in $D0..$D4. $T2 doubles as $MASK, which is
# therefore reloaded near the end.
sub pmuladd_alt {
my $r = shift;
my $h = sub { my $i=shift; &QWP(16*$i,"eax"); };	# saved limb h<i>

	&pmuludq	($D0,$r->(8));			# h1*s4
	&movdqa		($T2,$D1);
	&pmuludq	($D1,$r->(1));			# h0*r1
	&paddq		($D0,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,$r->(2));			# h0*r2
	&paddq		($D1,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,$r->(3));			# h0*r3
	&paddq		($D2,$T2);
	 &movdqa	($T2,$h->(1));			# pull h1
	&pmuludq	($T1,$r->(4));			# h0*r4
	&paddq		($D3,$T0);

	&movdqa		($T0,$T2);
	&pmuludq	($T2,$r->(1));			# h1*r1
	 &paddq		($D4,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,$r->(2));			# h1*r2
	&paddq		($D2,$T2);
	&movdqa		($T2,$h->(2));			# pull h2
	&pmuludq	($T1,$r->(3));			# h1*r3
	&paddq		($D3,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,$r->(7));			# h2*s3
	&paddq		($D4,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,$r->(8));			# h2*s4
	&paddq		($D0,$T2);

	&movdqa		($T2,$T1);
	&pmuludq	($T1,$r->(1));			# h2*r1
	 &paddq		($D1,$T0);
	&movdqa		($T0,$h->(3));			# pull h3
	&pmuludq	($T2,$r->(2));			# h2*r2
	&paddq		($D3,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,$r->(6));			# h3*s2
	&paddq		($D4,$T2);
	&movdqa		($T2,$T1);
	&pmuludq	($T1,$r->(7));			# h3*s3
	&paddq		($D0,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,$r->(8));			# h3*s4
	&paddq		($D1,$T1);

	&movdqa		($T1,$h->(4));			# pull h4
	&pmuludq	($T0,$r->(1));			# h3*r1
	 &paddq		($D2,$T2);
	&movdqa		($T2,$T1);
	&pmuludq	($T1,$r->(8));			# h4*s4
	&paddq		($D4,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,$r->(5));			# h4*s1
	&paddq		($D3,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,$r->(6));			# h4*s2
	&paddq		($D0,$T2);
	 &movdqa	($MASK,&QWP(64,"ebx"));
	&pmuludq	($T1,$r->(7));			# h4*s3
	&paddq		($D1,$T0);
	&paddq		($D2,$T1);
}
979	&pmuladd_alt	(sub {	my $i=shift; &QWP(16*($i-9),"edx");	});
980
981	&load_input	(-16*2,0);
982	&lea		("eax",&DWP(-16*2,"esi"));
983	&sub		("ecx",64);
984
985	&paddd		($T0,&QWP(16*(5+0),"esp"));	# add hash value
986	&paddd		($T1,&QWP(16*(5+1),"esp"));
987	&paddd		($D2,&QWP(16*(5+2),"esp"));
988	&paddd		($D3,&QWP(16*(5+3),"esp"));
989	&paddd		($D4,&QWP(16*(5+4),"esp"));
990
991	&cmovb		("esi","eax");
992	&lea		("eax",&DWP(16*10,"esp"));
993
994	&movdqa		($T2,&QWP(16*0,"edx"));		# r0^4
995	&movdqa		(&QWP(16*1,"esp"),$D1);
996	&movdqa		(&QWP(16*1,"eax"),$T1);
997	&movdqa		(&QWP(16*2,"eax"),$D2);
998	&movdqa		(&QWP(16*3,"eax"),$D3);
999	&movdqa		(&QWP(16*4,"eax"),$D4);
1000
1001	################################################################
1002	# d4 += h4*r0 + h0*r4   + h1*r3   + h2*r2   + h3*r1
1003	# d3 += h3*r0 + h0*r3   + h1*r2   + h2*r1   + h4*5*r4
1004	# d2 += h2*r0 + h0*r2   + h1*r1   + h3*5*r4 + h4*5*r3
1005	# d1 += h1*r0 + h0*r1   + h2*5*r4 + h3*5*r3 + h4*5*r2
1006	# d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
1007
1008	&movdqa		($D1,$T0);
1009	&pmuludq	($T0,$T2);			# h0*r0
1010	&paddq		($T0,$D0);
1011	&movdqa		($D0,$T1);
1012	&pmuludq	($T1,$T2);			# h1*r0
1013	&pmuludq	($D2,$T2);			# h2*r0
1014	&pmuludq	($D3,$T2);			# h3*r0
1015	&pmuludq	($D4,$T2);			# h4*r0
1016
1017	&paddq		($T1,&QWP(16*1,"esp"));
1018	&paddq		($D2,&QWP(16*2,"esp"));
1019	&paddq		($D3,&QWP(16*3,"esp"));
1020	&paddq		($D4,&QWP(16*4,"esp"));
1021
1022	&pmuladd_alt	(sub {	my $i=shift; &QWP(16*$i,"edx");	});
1023
1024	&lazy_reduction	();
1025
1026	&load_input	(16*2,16*5);
1027
1028	&ja		(&label("loop"));
1029
1030&set_label("skip_loop");
1031	################################################################
1032	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1033
1034	 &pshufd	($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n
1035	&add		("ecx",32);
1036	&jnz		(&label("long_tail"));
1037
1038	&paddd		($T0,$D0);			# add hash value
1039	&paddd		($T1,$D1);
1040	&paddd		($D2,&QWP(16*7,"esp"));
1041	&paddd		($D3,&QWP(16*8,"esp"));
1042	&paddd		($D4,&QWP(16*9,"esp"));
1043
1044&set_label("long_tail");
1045
1046	&movdqa		(&QWP(16*0,"eax"),$T0);
1047	&movdqa		(&QWP(16*1,"eax"),$T1);
1048	&movdqa		(&QWP(16*2,"eax"),$D2);
1049	&movdqa		(&QWP(16*3,"eax"),$D3);
1050	&movdqa		(&QWP(16*4,"eax"),$D4);
1051
1052	################################################################
1053	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1054	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1055	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1056	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1057	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1058
1059	&pmuludq	($T0,$T2);			# h0*r0
1060	&pmuludq	($T1,$T2);			# h1*r0
1061	&pmuludq	($D2,$T2);			# h2*r0
1062	&movdqa		($D0,$T0);
1063	 &pshufd	($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n
1064	&pmuludq	($D3,$T2);			# h3*r0
1065	&movdqa		($D1,$T1);
1066	&pmuludq	($D4,$T2);			# h4*r0
1067
1068	&pmuladd	(sub {	my ($reg,$i)=@_;
1069				&pshufd ($reg,&QWP(16*($i-9),"edx"),0x10);
1070			     },"eax");
1071
1072	&jz		(&label("short_tail"));
1073
1074	&load_input	(-16*2,0);
1075
1076	 &pshufd	($T2,&QWP(16*0,"edx"),0x10);	# r0^n
1077	&paddd		($T0,&QWP(16*5,"esp"));		# add hash value
1078	&paddd		($T1,&QWP(16*6,"esp"));
1079	&paddd		($D2,&QWP(16*7,"esp"));
1080	&paddd		($D3,&QWP(16*8,"esp"));
1081	&paddd		($D4,&QWP(16*9,"esp"));
1082
1083	################################################################
1084	# multiply inp[0:1] by r^4:r^3 and accumulate
1085
1086	&movdqa		(&QWP(16*0,"esp"),$T0);
1087	&pmuludq	($T0,$T2);			# h0*r0
1088	&movdqa		(&QWP(16*1,"esp"),$T1);
1089	&pmuludq	($T1,$T2);			# h1*r0
1090	&paddq		($D0,$T0);
1091	&movdqa		($T0,$D2);
1092	&pmuludq	($D2,$T2);			# h2*r0
1093	&paddq		($D1,$T1);
1094	&movdqa		($T1,$D3);
1095	&pmuludq	($D3,$T2);			# h3*r0
1096	&paddq		($D2,&QWP(16*2,"esp"));
1097	&movdqa		(&QWP(16*2,"esp"),$T0);
1098	 &pshufd	($T0,&QWP(16*1,"edx"),0x10);	# r1^n
1099	&paddq		($D3,&QWP(16*3,"esp"));
1100	&movdqa		(&QWP(16*3,"esp"),$T1);
1101	&movdqa		($T1,$D4);
1102	&pmuludq	($D4,$T2);			# h4*r0
1103	&paddq		($D4,&QWP(16*4,"esp"));
1104	&movdqa		(&QWP(16*4,"esp"),$T1);
1105
1106	&pmuladd	(sub {	my ($reg,$i)=@_;
1107				&pshufd ($reg,&QWP(16*$i,"edx"),0x10);
1108			     });
1109
1110&set_label("short_tail");
1111
1112	################################################################
1113	# horizontal addition
1114
1115	&pshufd		($T1,$D4,0b01001110);
1116	&pshufd		($T0,$D3,0b01001110);
1117	&paddq		($D4,$T1);
1118	&paddq		($D3,$T0);
1119	&pshufd		($T1,$D0,0b01001110);
1120	&pshufd		($T0,$D1,0b01001110);
1121	&paddq		($D0,$T1);
1122	&paddq		($D1,$T0);
1123	&pshufd		($T1,$D2,0b01001110);
1124	#&paddq		($D2,$T1);
1125
1126	&lazy_reduction	(sub { &paddq ($D2,$T1) });
1127
1128&set_label("done");
1129	&movd		(&DWP(-16*3+4*0,"edi"),$D0);	# store hash value
1130	&movd		(&DWP(-16*3+4*1,"edi"),$D1);
1131	&movd		(&DWP(-16*3+4*2,"edi"),$D2);
1132	&movd		(&DWP(-16*3+4*3,"edi"),$D3);
1133	&movd		(&DWP(-16*3+4*4,"edi"),$D4);
1134	&mov	("esp","ebp");
1135&set_label("nodata");
1136&function_end("_poly1305_blocks_sse2");
1137
########################################################################
# _poly1305_emit_sse2: produce the final 16-byte tag.
#
#   wparam(0)	context; five hash words at +4*0..+4*4 and the
#		is_base2_26 flag at +4*5
#   wparam(1)	output buffer, receives four 32-bit words
#   wparam(2)	key, four 32-bit words that are added to the hash
#
# When is_base2_26 is zero the code branches to "enter_emit", a label
# that is not defined inside this function. Otherwise the five 26-bit
# limbs are repacked into base 2^32, everything above bit 130 is folded
# back in multiplied by 5, hash+5 is computed to find out whether the
# hash reached the modulus, and either hash+5 (truncated to 128 bits)
# or the hash itself is selected with a mask rather than a branch.
# $D0-$D3 are used purely as scratch storage for the unmodified hash.
1138&align	(32);
1139&function_begin("_poly1305_emit_sse2");
1140	&mov	("ebp",&wparam(0));		# context
1141
1142	&cmp	(&DWP(4*5,"ebp"),0);		# is_base2_26?
1143	&je	(&label("enter_emit"));
1144
1145	&mov	("eax",&DWP(4*0,"ebp"));	# load hash value
1146	&mov	("edi",&DWP(4*1,"ebp"));
1147	&mov	("ecx",&DWP(4*2,"ebp"));
1148	&mov	("edx",&DWP(4*3,"ebp"));
1149	&mov	("esi",&DWP(4*4,"ebp"));
1150
	# Limb i starts at bit 26*i, so each limb straddles two 32-bit
	# words: the shl/shr pairs (26,6), (20,12), (14,18) and (8,24)
	# split it, and adc carries the overflow into the next word.
1151	&mov	("ebx","edi");			# base 2^26 -> base 2^32
1152	&shl	("edi",26);
1153	&shr	("ebx",6);
1154	&add	("eax","edi");
1155	&mov	("edi","ecx");
1156	&adc	("ebx",0);
1157
1158	&shl	("edi",20);
1159	&shr	("ecx",12);
1160	&add	("ebx","edi");
1161	&mov	("edi","edx");
1162	&adc	("ecx",0);
1163
1164	&shl	("edi",14);
1165	&shr	("edx",18);
1166	&add	("ecx","edi");
1167	&mov	("edi","esi");
1168	&adc	("edx",0);
1169
1170	&shl	("edi",8);
1171	&shr	("esi",24);
1172	&add	("edx","edi");
1173	&adc	("esi",0);			# can be partially reduced
1174
	# esi holds bits 128 and up: keep bits 128-129 in esi and add
	# 5*(esi>>2) to the low word. The two wparam loads are interleaved
	# here because ebp and edi are free again at this point.
1175	&mov	("edi","esi");			# final reduction
1176	&and	("esi",3);
1177	&shr	("edi",2);
1178	&lea	("ebp",&DWP(0,"edi","edi",4));	# *5
1179	 &mov	("edi",&wparam(1));		# output
1180	&add	("eax","ebp");
1181	 &mov	("ebp",&wparam(2));		# key
1182	&adc	("ebx",0);
1183	&adc	("ecx",0);
1184	&adc	("edx",0);
1185	&adc	("esi",0);
1186
1187	&movd	($D0,"eax");			# offload original hash value
1188	&add	("eax",5);			# compare to modulus
1189	&movd	($D1,"ebx");
1190	&adc	("ebx",0);
1191	&movd	($D2,"ecx");
1192	&adc	("ecx",0);
1193	&movd	($D3,"edx");
1194	&adc	("edx",0);
1195	&adc	("esi",0);
1196	&shr	("esi",2);			# did it carry/borrow?
1197
	# esi is 0 or 1 here; neg turns it into an all-zeros/all-ones mask.
	# The masked hash+5 words are parked in the output buffer while
	# the registers are reloaded with the original hash from $D0-$D3.
1198	&neg	("esi");			# do we choose (hash-modulus) ...
1199	&and	("eax","esi");
1200	&and	("ebx","esi");
1201	&and	("ecx","esi");
1202	&and	("edx","esi");
1203	&mov	(&DWP(4*0,"edi"),"eax");
1204	&movd	("eax",$D0);
1205	&mov	(&DWP(4*1,"edi"),"ebx");
1206	&movd	("ebx",$D1);
1207	&mov	(&DWP(4*2,"edi"),"ecx");
1208	&movd	("ecx",$D2);
1209	&mov	(&DWP(4*3,"edi"),"edx");
1210	&movd	("edx",$D3);
1211
	# Inverted mask keeps the original hash in the opposite case; OR
	# merges the two mutually exclusive candidates.
1212	&not	("esi");			# ... or original hash value?
1213	&and	("eax","esi");
1214	&and	("ebx","esi");
1215	&or	("eax",&DWP(4*0,"edi"));
1216	&and	("ecx","esi");
1217	&or	("ebx",&DWP(4*1,"edi"));
1218	&and	("edx","esi");
1219	&or	("ecx",&DWP(4*2,"edi"));
1220	&or	("edx",&DWP(4*3,"edi"));
1221
	# 128-bit addition of the key; the carry out of the top word is
	# discarded.
1222	&add	("eax",&DWP(4*0,"ebp"));	# accumulate key
1223	&adc	("ebx",&DWP(4*1,"ebp"));
1224	&mov	(&DWP(4*0,"edi"),"eax");
1225	&adc	("ecx",&DWP(4*2,"ebp"));
1226	&mov	(&DWP(4*1,"edi"),"ebx");
1227	&adc	("edx",&DWP(4*3,"ebp"));
1228	&mov	(&DWP(4*2,"edi"),"ecx");
1229	&mov	(&DWP(4*3,"edi"),"edx");
1230&function_end("_poly1305_emit_sse2");
1231
1232if ($avx>1) {
1233########################################################################
1234# Note that poly1305_init_avx2 operates on %xmm, I could have used
1235# poly1305_init_sse2...
1236
########################################################################
# _poly1305_init_avx2: build the table of key powers used by
# _poly1305_blocks_avx2.
#
# Register contract (this is an internal routine reached with a plain
# call, it takes no stack arguments and ends with its own ret):
#   edi	context pointer; the key in base 2^32 is read from edi+4*6.
#	edi is advanced by 16*3 for the duration of the routine and
#	restored before returning.
#   ebx	pointer to the constant pool; the 26-bit limb mask is read
#	from ebx+64.
#   ebp	overwritten with the caller's esp, which is restored from it
#	on exit; ecx and edx are used as loop counter and table pointer.
#
# The key is split into five 26-bit limbs and run through the "square"
# loop twice. Per the lane comments below, the first pass yields r^2
# next to r^1 and the second pass yields r^3 and r^4. The five limb
# vectors, reordered to r^1:r^2:r^3:r^4, are stored at 16*0..16*4
# relative to the advanced edi, followed at 16*5..16*8 by limbs 1..4
# multiplied by 5.
#
# Scratch frame (16*(9+5) bytes, 16-byte aligned):
#   esp+16*0..16*4	limbs r0..r4 of the current multiplier
#   esp+16*5..16*8	5*r1..5*r4 ("s1".."s4")
#   edx = esp+16*9	limbs h0..h4 of the multiplicand, with the low
#			qword duplicated into both halves
1237&align	(32);
1238&function_begin_B("_poly1305_init_avx2");
1239	&vmovdqu	($D4,&QWP(4*6,"edi"));		# key base 2^32
1240	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
1241	&mov		("ebp","esp");
1242	&sub		("esp",16*(9+5));
1243	&and		("esp",-16);
1244
1245	#&vpand		($D4,$D4,&QWP(96,"ebx"));	# magic mask
1246	&vmovdqa	($MASK,&QWP(64,"ebx"));
1247
1248	&vpand		($D0,$D4,$MASK);		# -> base 2^26
1249	&vpsrlq		($D1,$D4,26);
1250	&vpsrldq	($D3,$D4,6);
1251	&vpand		($D1,$D1,$MASK);
1252	&vpsrlq		($D2,$D3,4);
1253	&vpsrlq		($D3,$D3,30);
1254	&vpand		($D2,$D2,$MASK);
1255	&vpand		($D3,$D3,$MASK);
1256	&vpsrldq	($D4,$D4,13);
1257
1258	&lea		("edx",&DWP(16*9,"esp"));	# size optimization
1259	&mov		("ecx",2);
1260&set_label("square");
	# save the multiplier limbs r0..r4
1261	&vmovdqa	(&QWP(16*0,"esp"),$D0);
1262	&vmovdqa	(&QWP(16*1,"esp"),$D1);
1263	&vmovdqa	(&QWP(16*2,"esp"),$D2);
1264	&vmovdqa	(&QWP(16*3,"esp"),$D3);
1265	&vmovdqa	(&QWP(16*4,"esp"),$D4);
1266
	# x*5 is computed as (x<<2)+x and saved as s1..s4
1267	&vpslld		($T1,$D1,2);
1268	&vpslld		($T0,$D2,2);
1269	&vpaddd		($T1,$T1,$D1);			# *5
1270	&vpaddd		($T0,$T0,$D2);			# *5
1271	&vmovdqa	(&QWP(16*5,"esp"),$T1);
1272	&vmovdqa	(&QWP(16*6,"esp"),$T0);
1273	&vpslld		($T1,$D3,2);
1274	&vpslld		($T0,$D4,2);
1275	&vpaddd		($T1,$T1,$D3);			# *5
1276	&vpaddd		($T0,$T0,$D4);			# *5
1277	&vmovdqa	(&QWP(16*7,"esp"),$T1);
1278	&vmovdqa	(&QWP(16*8,"esp"),$T0);
1279
	# duplicate the low qword of every limb into both halves and save
	# the result as the multiplicand h0..h4; $T1 keeps unshuffled r1
1280	&vpshufd	($T0,$D0,0b01000100);
1281	&vmovdqa	($T1,$D1);
1282	&vpshufd	($D1,$D1,0b01000100);
1283	&vpshufd	($D2,$D2,0b01000100);
1284	&vpshufd	($D3,$D3,0b01000100);
1285	&vpshufd	($D4,$D4,0b01000100);
1286	&vmovdqa	(&QWP(16*0,"edx"),$T0);
1287	&vmovdqa	(&QWP(16*1,"edx"),$D1);
1288	&vmovdqa	(&QWP(16*2,"edx"),$D2);
1289	&vmovdqa	(&QWP(16*3,"edx"),$D3);
1290	&vmovdqa	(&QWP(16*4,"edx"),$D4);
1291
1292	################################################################
1293	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1294	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1295	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1296	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1297	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1298
1299	&vpmuludq	($D4,$D4,$D0);			# h4*r0
1300	&vpmuludq	($D3,$D3,$D0);			# h3*r0
1301	&vpmuludq	($D2,$D2,$D0);			# h2*r0
1302	&vpmuludq	($D1,$D1,$D0);			# h1*r0
1303	&vpmuludq	($D0,$T0,$D0);			# h0*r0
1304
1305	&vpmuludq	($T0,$T1,&QWP(16*3,"edx"));	# r1*h3
1306	&vpaddq		($D4,$D4,$T0);
1307	&vpmuludq	($T2,$T1,&QWP(16*2,"edx"));	# r1*h2
1308	&vpaddq		($D3,$D3,$T2);
1309	&vpmuludq	($T0,$T1,&QWP(16*1,"edx"));	# r1*h1
1310	&vpaddq		($D2,$D2,$T0);
1311	&vmovdqa	($T2,&QWP(16*5,"esp"));		# s1
1312	&vpmuludq	($T1,$T1,&QWP(16*0,"edx"));	# r1*h0
1313	&vpaddq		($D1,$D1,$T1);
1314	 &vmovdqa	($T0,&QWP(16*2,"esp"));		# r2
1315	&vpmuludq	($T2,$T2,&QWP(16*4,"edx"));	# s1*h4
1316	&vpaddq		($D0,$D0,$T2);
1317
1318	&vpmuludq	($T1,$T0,&QWP(16*2,"edx"));	# r2*h2
1319	&vpaddq		($D4,$D4,$T1);
1320	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# r2*h1
1321	&vpaddq		($D3,$D3,$T2);
1322	&vmovdqa	($T1,&QWP(16*6,"esp"));		# s2
1323	&vpmuludq	($T0,$T0,&QWP(16*0,"edx"));	# r2*h0
1324	&vpaddq		($D2,$D2,$T0);
1325	&vpmuludq	($T2,$T1,&QWP(16*4,"edx"));	# s2*h4
1326	&vpaddq		($D1,$D1,$T2);
1327	 &vmovdqa	($T0,&QWP(16*3,"esp"));		# r3
1328	&vpmuludq	($T1,$T1,&QWP(16*3,"edx"));	# s2*h3
1329	&vpaddq		($D0,$D0,$T1);
1330
1331	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# r3*h1
1332	&vpaddq		($D4,$D4,$T2);
1333	&vmovdqa	($T1,&QWP(16*7,"esp"));		# s3
1334	&vpmuludq	($T0,$T0,&QWP(16*0,"edx"));	# r3*h0
1335	&vpaddq		($D3,$D3,$T0);
1336	&vpmuludq	($T2,$T1,&QWP(16*4,"edx"));	# s3*h4
1337	&vpaddq		($D2,$D2,$T2);
1338	&vpmuludq	($T0,$T1,&QWP(16*3,"edx"));	# s3*h3
1339	&vpaddq		($D1,$D1,$T0);
1340	 &vmovdqa	($T2,&QWP(16*4,"esp"));		# r4
1341	&vpmuludq	($T1,$T1,&QWP(16*2,"edx"));	# s3*h2
1342	&vpaddq		($D0,$D0,$T1);
1343
1344	&vmovdqa	($T0,&QWP(16*8,"esp"));		# s4
1345	&vpmuludq	($T2,$T2,&QWP(16*0,"edx"));	# r4*h0
1346	&vpaddq		($D4,$D4,$T2);
1347	&vpmuludq	($T1,$T0,&QWP(16*4,"edx"));	# s4*h4
1348	&vpaddq		($D3,$D3,$T1);
1349	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# s4*h1
1350	&vpaddq		($D0,$D0,$T2);
1351	&vpmuludq	($T1,$T0,&QWP(16*2,"edx"));	# s4*h2
1352	&vpaddq		($D1,$D1,$T1);
	# the mask register was used as a temporary above, so reload it
1353	 &vmovdqa	($MASK,&QWP(64,"ebx"));
1354	&vpmuludq	($T0,$T0,&QWP(16*3,"edx"));	# s4*h3
1355	&vpaddq		($D2,$D2,$T0);
1356
1357	################################################################
1358	# lazy reduction
	# Two interleaved carry chains (h3->h4->h0->h1 and h0->h1->h2->
	# h3->h4). The carry out of h4 re-enters h0 times 5: it is added
	# once, shifted left by 2 and added again.
1359	 &vpsrlq	($T0,$D3,26);
1360	 &vpand		($D3,$D3,$MASK);
1361	&vpsrlq		($T1,$D0,26);
1362	&vpand		($D0,$D0,$MASK);
1363	 &vpaddq	($D4,$D4,$T0);			# h3 -> h4
1364	&vpaddq		($D1,$D1,$T1);			# h0 -> h1
1365	 &vpsrlq	($T0,$D4,26);
1366	 &vpand		($D4,$D4,$MASK);
1367	&vpsrlq		($T1,$D1,26);
1368	&vpand		($D1,$D1,$MASK);
1369	&vpaddq		($D2,$D2,$T1);			# h1 -> h2
1370	 &vpaddd	($D0,$D0,$T0);
1371	 &vpsllq	($T0,$T0,2);
1372	&vpsrlq		($T1,$D2,26);
1373	&vpand		($D2,$D2,$MASK);
1374	 &vpaddd	($D0,$D0,$T0);			# h4 -> h0
1375	&vpaddd		($D3,$D3,$T1);			# h2 -> h3
1376	&vpsrlq		($T1,$D3,26);
1377	 &vpsrlq	($T0,$D0,26);
1378	 &vpand		($D0,$D0,$MASK);
1379	&vpand		($D3,$D3,$MASK);
1380	 &vpaddd	($D1,$D1,$T0);			# h0 -> h1
1381	&vpaddd		($D4,$D4,$T1);			# h3 -> h4
1382
1383	&dec		("ecx");
1384	&jz		(&label("square_break"));
1385
	# first pass done: pair the fresh result with the saved input and
	# go round once more
1386	&vpunpcklqdq	($D0,$D0,&QWP(16*0,"esp"));	# 0:r^1:0:r^2
1387	&vpunpcklqdq	($D1,$D1,&QWP(16*1,"esp"));
1388	&vpunpcklqdq	($D2,$D2,&QWP(16*2,"esp"));
1389	&vpunpcklqdq	($D3,$D3,&QWP(16*3,"esp"));
1390	&vpunpcklqdq	($D4,$D4,&QWP(16*4,"esp"));
1391	&jmp		(&label("square"));
1392
1393&set_label("square_break");
1394	&vpsllq		($D0,$D0,32);			# -> r^3:0:r^4:0
1395	&vpsllq		($D1,$D1,32);
1396	&vpsllq		($D2,$D2,32);
1397	&vpsllq		($D3,$D3,32);
1398	&vpsllq		($D4,$D4,32);
1399	&vpor		($D0,$D0,&QWP(16*0,"esp"));	# r^3:r^1:r^4:r^2
1400	&vpor		($D1,$D1,&QWP(16*1,"esp"));
1401	&vpor		($D2,$D2,&QWP(16*2,"esp"));
1402	&vpor		($D3,$D3,&QWP(16*3,"esp"));
1403	&vpor		($D4,$D4,&QWP(16*4,"esp"));
1404
1405	&vpshufd	($D0,$D0,0b10001101);		# -> r^1:r^2:r^3:r^4
1406	&vpshufd	($D1,$D1,0b10001101);
1407	&vpshufd	($D2,$D2,0b10001101);
1408	&vpshufd	($D3,$D3,0b10001101);
1409	&vpshufd	($D4,$D4,0b10001101);
1410
1411	&vmovdqu	(&QWP(16*0,"edi"),$D0);		# save the table
1412	&vmovdqu	(&QWP(16*1,"edi"),$D1);
1413	&vmovdqu	(&QWP(16*2,"edi"),$D2);
1414	&vmovdqu	(&QWP(16*3,"edi"),$D3);
1415	&vmovdqu	(&QWP(16*4,"edi"),$D4);
1416
	# limbs 1..4 times 5, stored right after the five limb vectors
1417	&vpslld		($T1,$D1,2);
1418	&vpslld		($T0,$D2,2);
1419	&vpaddd		($T1,$T1,$D1);			# *5
1420	&vpaddd		($T0,$T0,$D2);			# *5
1421	&vmovdqu	(&QWP(16*5,"edi"),$T1);
1422	&vmovdqu	(&QWP(16*6,"edi"),$T0);
1423	&vpslld		($T1,$D3,2);
1424	&vpslld		($T0,$D4,2);
1425	&vpaddd		($T1,$T1,$D3);			# *5
1426	&vpaddd		($T0,$T0,$D4);			# *5
1427	&vmovdqu	(&QWP(16*7,"edi"),$T1);
1428	&vmovdqu	(&QWP(16*8,"edi"),$T0);
1429
1430	&mov		("esp","ebp");
1431	&lea		("edi",&DWP(-16*3,"edi"));	# size de-optimization
1432	&ret		();
1433&function_end_B("_poly1305_init_avx2");
1434
1435########################################################################
1436# now it's time to switch to %ymm
1437
# From here on the working registers are the 256-bit ones: the same
# eight names are re-declared as ymm0..ymm7, and $MASK is an alias of
# the last temporary, $T2.
my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2) = map { "ymm$_" } (0..7);
my $MASK = $T2;

# X(reg): return the xmm name that corresponds to a ymm register name;
# a name that does not start with "ymm" is returned unchanged.
sub X {
	my ($name) = @_;

	$name =~ s/^ymm/xmm/;
	return $name;
}
1442
########################################################################
# _poly1305_blocks_avx2: absorb input, four 16-byte blocks at a time.
#
#   wparam(0)	ctx	hash words at +0..+4*4, is_base2_26 flag at +4*5,
#			table of key powers from +16*3 on
#   wparam(1)	inp	input pointer
#   wparam(2)	len	length in bytes, rounded down to a multiple of 16
#   wparam(3)	padbit	NOTE(review): the code assumes 0 or 1 - confirm
#			against the callers
#
# Nothing is done for len < 16. Inputs shorter than 64 bytes whose hash
# is still in base 2^32 are handed to "enter_blocks", a label defined
# outside this function. In all other cases ebx is pointed at
# const_sse2, the hash is converted to base 2^26 on first use (together
# with a call to _poly1305_init_avx2), a 512-byte aligned frame is set
# up and the key power table is expanded into it:
#
#   esp+32*0..32*4	spill slots used by vsplat_input/vpmuladd
#   edx-128+32*i	table entry i, i=0..8 (edx = esp+32*5+128)
#
# A leading group of one to three blocks (len%64 != 0) is processed
# first by jumping straight to "tail" with ebx and edx offset so that
# the unused lanes see a zero pad bit and a shifted table; full 64-byte
# groups go through "even" and "loop".
1443&align	(32);
1444&function_begin("_poly1305_blocks_avx2");
1445	&mov	("edi",&wparam(0));			# ctx
1446	&mov	("esi",&wparam(1));			# inp
1447	&mov	("ecx",&wparam(2));			# len
1448
1449	&mov	("eax",&DWP(4*5,"edi"));		# is_base2_26
1450	&and	("ecx",-16);
1451	&jz	(&label("nodata"));
1452	&cmp	("ecx",64);
1453	&jae	(&label("enter_avx2"));
1454	&test	("eax","eax");				# is_base2_26?
1455	&jz	(&label("enter_blocks"));
1456
1457&set_label("enter_avx2");
1458	&vzeroupper	();
1459
	# call/pop yields the address of pic_point, from which const_sse2
	# is reached with a link-time constant displacement
1460	&call	(&label("pic_point"));
1461&set_label("pic_point");
1462	&blindpop("ebx");
1463	&lea	("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));
1464
1465	&test	("eax","eax");				# is_base2_26?
1466	&jnz	(&label("base2_26"));
1467
1468	&call	("_poly1305_init_avx2");
1469
1470	################################################# base 2^32 -> base 2^26
	# Overlapping unaligned loads at byte offsets 0, 3, 6, 9 and 13
	# (bits 0, 24, 48, 72, 104) bring every 26-bit limb (bits 0, 26,
	# 52, 78, 104) to within a few bits of the register's bottom; the
	# shifts by 2, 4 and 6 finish the alignment.
1471	&mov	("eax",&DWP(0,"edi"));
1472	&mov	("ecx",&DWP(3,"edi"));
1473	&mov	("edx",&DWP(6,"edi"));
1474	&mov	("esi",&DWP(9,"edi"));
1475	&mov	("ebp",&DWP(13,"edi"));
1476
1477	&shr	("ecx",2);
1478	&and	("eax",0x3ffffff);
1479	&shr	("edx",4);
1480	&and	("ecx",0x3ffffff);
1481	&shr	("esi",6);
1482	&and	("edx",0x3ffffff);
1483
1484	&mov	(&DWP(4*0,"edi"),"eax");
1485	&mov	(&DWP(4*1,"edi"),"ecx");
1486	&mov	(&DWP(4*2,"edi"),"edx");
1487	&mov	(&DWP(4*3,"edi"),"esi");
1488	&mov	(&DWP(4*4,"edi"),"ebp");
1489	&mov	(&DWP(4*5,"edi"),1);			# is_base2_26
1490
	# esi and ecx were used as scratch by the conversion above
1491	&mov	("esi",&wparam(1));			# [reload] inp
1492	&mov	("ecx",&wparam(2));			# [reload] len
1493
1494&set_label("base2_26");
1495	&mov	("eax",&wparam(3));			# padbit
1496	&mov	("ebp","esp");
1497
1498	&sub	("esp",32*(5+9));
1499	&and	("esp",-512);				# ensure that frame
1500							# doesn't cross page
1501							# boundary, which is
1502							# essential for
1503							# misaligned 32-byte
1504							# loads
1505
1506	################################################################
1507        # expand and copy pre-calculated table to stack
1508
	# entries 0..4 (limbs of the key powers) are loaded as 128-bit
	# values and widened to 256 bits with vpermq+vpshufd
1509	&vmovdqu	(&X($D0),&QWP(16*(3+0),"edi"));
1510	&lea		("edx",&DWP(32*5+128,"esp"));	# +128 size optimization
1511	&vmovdqu	(&X($D1),&QWP(16*(3+1),"edi"));
1512	&vmovdqu	(&X($D2),&QWP(16*(3+2),"edi"));
1513	&vmovdqu	(&X($D3),&QWP(16*(3+3),"edi"));
1514	&vmovdqu	(&X($D4),&QWP(16*(3+4),"edi"));
1515	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
1516	&vpermq		($D0,$D0,0b01000000);		# 00001234 -> 12343434
1517	&vpermq		($D1,$D1,0b01000000);
1518	&vpermq		($D2,$D2,0b01000000);
1519	&vpermq		($D3,$D3,0b01000000);
1520	&vpermq		($D4,$D4,0b01000000);
1521	&vpshufd	($D0,$D0,0b11001000);		# 12343434 -> 14243444
1522	&vpshufd	($D1,$D1,0b11001000);
1523	&vpshufd	($D2,$D2,0b11001000);
1524	&vpshufd	($D3,$D3,0b11001000);
1525	&vpshufd	($D4,$D4,0b11001000);
	# stores of entries 0..4 are interleaved with the loads of entries
	# 5..8 (the limbs pre-multiplied by 5), which get the same
	# expansion
1526	&vmovdqa	(&QWP(32*0-128,"edx"),$D0);
1527	&vmovdqu	(&X($D0),&QWP(16*5,"edi"));
1528	&vmovdqa	(&QWP(32*1-128,"edx"),$D1);
1529	&vmovdqu	(&X($D1),&QWP(16*6,"edi"));
1530	&vmovdqa	(&QWP(32*2-128,"edx"),$D2);
1531	&vmovdqu	(&X($D2),&QWP(16*7,"edi"));
1532	&vmovdqa	(&QWP(32*3-128,"edx"),$D3);
1533	&vmovdqu	(&X($D3),&QWP(16*8,"edi"));
1534	&vmovdqa	(&QWP(32*4-128,"edx"),$D4);
1535	&vpermq		($D0,$D0,0b01000000);
1536	&vpermq		($D1,$D1,0b01000000);
1537	&vpermq		($D2,$D2,0b01000000);
1538	&vpermq		($D3,$D3,0b01000000);
1539	&vpshufd	($D0,$D0,0b11001000);
1540	&vpshufd	($D1,$D1,0b11001000);
1541	&vpshufd	($D2,$D2,0b11001000);
1542	&vpshufd	($D3,$D3,0b11001000);
1543	&vmovdqa	(&QWP(32*5-128,"edx"),$D0);
1544	&vmovd		(&X($D0),&DWP(-16*3+4*0,"edi"));# load hash value
1545	&vmovdqa	(&QWP(32*6-128,"edx"),$D1);
1546	&vmovd		(&X($D1),&DWP(-16*3+4*1,"edi"));
1547	&vmovdqa	(&QWP(32*7-128,"edx"),$D2);
1548	&vmovd		(&X($D2),&DWP(-16*3+4*2,"edi"));
1549	&vmovdqa	(&QWP(32*8-128,"edx"),$D3);
1550	&vmovd		(&X($D3),&DWP(-16*3+4*3,"edi"));
1551	&vmovd		(&X($D4),&DWP(-16*3+4*4,"edi"));
1552	&vmovdqa	($MASK,&QWP(64,"ebx"));
	# 0/1 becomes 0/-1, used below as an index that steps ebx back by
	# one qword when the pad bit is set
1553	&neg		("eax");			# padbit
1554
1555	&test		("ecx",63);
1556	&jz		(&label("even"));
1557
	# ecx = bytes in full 64-byte groups, edx = 16, 32 or 48 leading
	# bytes
1558	&mov		("edx","ecx");
1559	&and		("ecx",-64);
1560	&and		("edx",63);
1561
1562	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));
1563	&cmp		("edx",32);
1564	&jb		(&label("one"));
1565
	# the flags of the cmp above are still intact for the je below
1566	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
1567	&je		(&label("two"));
1568
	# ebx+8/+16/+24 make the 32-byte pad bit load start inside the
	# first constant row, so that it ends in the all-zero second row
1569	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
1570	&lea		("esi",&DWP(16*3,"esi"));
1571	&lea		("ebx",&DWP(8,"ebx"));		# three padbits
1572	&lea		("edx",&DWP(32*5+128+8,"esp"));	# --:r^1:r^2:r^3 (*)
1573	&jmp		(&label("tail"));
1574
1575&set_label("two");
1576	&lea		("esi",&DWP(16*2,"esi"));
1577	&lea		("ebx",&DWP(16,"ebx"));		# two padbits
1578	&lea		("edx",&DWP(32*5+128+16,"esp"));# --:--:r^1:r^2 (*)
1579	&jmp		(&label("tail"));
1580
1581&set_label("one");
1582	&lea		("esi",&DWP(16*1,"esi"));
1583	&vpxor		($T1,$T1,$T1);
	# eax is 0 or -1: ebx+32 is the all-zero row, ebx+24 the last pad
	# bit of the first row
1584	&lea		("ebx",&DWP(32,"ebx","eax",8));	# one or no padbits
1585	&lea		("edx",&DWP(32*5+128+24,"esp"));# --:--:--:r^1 (*)
1586	&jmp		(&label("tail"));
1587
1588# (*)	spots marked with '--' are data from next table entry, but they
1589#	are multiplied by 0 and therefore rendered insignificant
1590
1591&set_label("even",32);
1592	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));	# load input
1593	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
1594	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
1595	&vinserti128	($T1,$T1,&QWP(16*3,"esi"),1);
1596	&lea		("esi",&DWP(16*4,"esi"));
1597	&sub		("ecx",64);
1598	&jz		(&label("tail"));
1599
1600&set_label("loop");
1601	################################################################
1602	# ((inp[0]*r^4+r[4])*r^4+r[8])*r^4
1603	# ((inp[1]*r^4+r[5])*r^4+r[9])*r^3
1604	# ((inp[2]*r^4+r[6])*r^4+r[10])*r^2
1605	# ((inp[3]*r^4+r[7])*r^4+r[11])*r^1
1606	#   \________/ \_______/
1607	################################################################
1608
# vsplat_input(): emit the code that converts the input blocks held in
# $T0/$T1 into 26-bit limbs, one block per 64-bit lane.
#
# On entry $D0-$D2 hold hash limbs 0..2; they are spilled to
# esp+32*0..32*2 because the registers are reused. On exit the input
# limbs are in $T0 (limb 0), $T1 (limb 1), $D2 (limb 2), $D0 (limb 3)
# and $D1 (limb 4, with the pad bit from the memory at ebx OR-ed in).
# $D3, $D4 and $MASK are not written. Takes no arguments; it is called
# once for the loop body and once for the tail, so the sequence is
# emitted twice.
1609sub vsplat_input {
1610	&vmovdqa	(&QWP(32*2,"esp"),$D2);
1611	&vpsrldq	($D2,$T0,6);			# splat input
1612	&vmovdqa	(&QWP(32*0,"esp"),$D0);
1613	&vpsrldq	($D0,$T1,6);
1614	&vmovdqa	(&QWP(32*1,"esp"),$D1);
1615	&vpunpckhqdq	($D1,$T0,$T1);			# 4
1616	&vpunpcklqdq	($T0,$T0,$T1);			# 0:1
1617	&vpunpcklqdq	($D2,$D2,$D0);			# 2:3
1618
1619	&vpsrlq		($D0,$D2,30);
1620	&vpsrlq		($D2,$D2,4);
1621	&vpsrlq		($T1,$T0,26);
1622	&vpsrlq		($D1,$D1,40);			# 4
1623	&vpand		($D2,$D2,$MASK);		# 2
1624	&vpand		($T0,$T0,$MASK);		# 0
1625	&vpand		($T1,$T1,$MASK);		# 1
1626	&vpand		($D0,$D0,$MASK);		# 3 (*)
1627	&vpor		($D1,$D1,&QWP(0,"ebx"));	# padbit, yes, always
1628
1629	# (*)	note that output is counterintuitive, input limbs 3 and 4
1630	#	are returned in $D0-1, while $D3-4 are preserved;
1631}
1632	&vsplat_input	();
1633
# vpmuladd($addr): emit "hash = (hash + input) * key power", unreduced.
#
#   $addr	code reference that maps a table index to a memory operand.
#		As used below, indices 0..4 select limbs r0..r4 and indices
#		5..8 select s1..s4, the limbs pre-multiplied by 5.
#
# Expects the register/stack state left by vsplat_input: input limbs in
# $T0, $T1, $D2, $D0, $D1, hash limbs 0..2 at esp+32*0..32*2 and hash
# limbs 3..4 in $D3/$D4. After the additions h0 is in $T0 and h2 in
# $D2, while h1, h3 and h4 are spilled to esp+32*1, 32*3 and 32*4 and
# reloaded when their turn comes. The five 64-bit sums d0..d4 end up in
# $D0..$D4. $MASK shares a register with the temporary $T2, so it is
# reloaded from ebx+64 before the routine ends.
1634sub vpmuladd {
1635my $addr = shift;
1636
1637	&vpaddq		($D2,$D2,&QWP(32*2,"esp"));	# add hash value
1638	&vpaddq		($T0,$T0,&QWP(32*0,"esp"));
1639	&vpaddq		($T1,$T1,&QWP(32*1,"esp"));
1640	&vpaddq		($D0,$D0,$D3);
1641	&vpaddq		($D1,$D1,$D4);
1642
1643	################################################################
1644	# d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0   + h4*5*r4
1645	# d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1   + h4*r0
1646	# d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
1647	# d1 = h2*5*r4 + h0*r1 + h1*r0   + h3*5*r3 + h4*5*r2
1648	# d2 = h2*r0   + h0*r2 + h1*r1   + h3*5*r4 + h4*5*r3
1649
1650	&vpmuludq	($D3,$D2,&$addr(1));		# d3 = h2*r1
1651	 &vmovdqa	(&QWP(32*1,"esp"),$T1);		# spill h1
1652	&vpmuludq	($D4,$D2,&$addr(2));		# d4 = h2*r2
1653	 &vmovdqa	(&QWP(32*3,"esp"),$D0);		# spill h3
1654	&vpmuludq	($D0,$D2,&$addr(7));		# d0 = h2*s3
1655	 &vmovdqa	(&QWP(32*4,"esp"),$D1);		# spill h4
1656	&vpmuludq	($D1,$D2,&$addr(8));		# d1 = h2*s4
1657	&vpmuludq	($D2,$D2,&$addr(0));		# d2 = h2*r0
1658
1659	&vpmuludq	($T2,$T0,&$addr(3));		# h0*r3
1660	&vpaddq		($D3,$D3,$T2);			# d3 += h0*r3
1661	&vpmuludq	($T1,$T0,&$addr(4));		# h0*r4
1662	&vpaddq		($D4,$D4,$T1);			# d4 + h0*r4
1663	&vpmuludq	($T2,$T0,&$addr(0));		# h0*r0
1664	&vpaddq		($D0,$D0,$T2);			# d0 + h0*r0
1665	 &vmovdqa	($T2,&QWP(32*1,"esp"));		# h1
1666	&vpmuludq	($T1,$T0,&$addr(1));		# h0*r1
1667	&vpaddq		($D1,$D1,$T1);			# d1 += h0*r1
1668	&vpmuludq	($T0,$T0,&$addr(2));		# h0*r2
1669	&vpaddq		($D2,$D2,$T0);			# d2 += h0*r2
1670
1671	&vpmuludq	($T1,$T2,&$addr(2));		# h1*r2
1672	&vpaddq		($D3,$D3,$T1);			# d3 += h1*r2
1673	&vpmuludq	($T0,$T2,&$addr(3));		# h1*r3
1674	&vpaddq		($D4,$D4,$T0);			# d4 += h1*r3
1675	&vpmuludq	($T1,$T2,&$addr(8));		# h1*s4
1676	&vpaddq		($D0,$D0,$T1);			# d0 += h1*s4
1677	 &vmovdqa	($T1,&QWP(32*3,"esp"));		# h3
1678	&vpmuludq	($T0,$T2,&$addr(0));		# h1*r0
1679	&vpaddq		($D1,$D1,$T0);			# d1 += h1*r0
1680	&vpmuludq	($T2,$T2,&$addr(1));		# h1*r1
1681	&vpaddq		($D2,$D2,$T2);			# d2 += h1*r1
1682
1683	&vpmuludq	($T0,$T1,&$addr(0));		# h3*r0
1684	&vpaddq		($D3,$D3,$T0);			# d3 += h3*r0
1685	&vpmuludq	($T2,$T1,&$addr(1));		# h3*r1
1686	&vpaddq		($D4,$D4,$T2);			# d4 += h3*r1
1687	&vpmuludq	($T0,$T1,&$addr(6));		# h3*s2
1688	&vpaddq		($D0,$D0,$T0);			# d0 += h3*s2
1689	 &vmovdqa	($T0,&QWP(32*4,"esp"));		# h4
1690	&vpmuludq	($T2,$T1,&$addr(7));		# h3*s3
1691	&vpaddq		($D1,$D1,$T2);			# d1+= h3*s3
1692	&vpmuludq	($T1,$T1,&$addr(8));		# h3*s4
1693	&vpaddq		($D2,$D2,$T1);			# d2 += h3*s4
1694
1695	&vpmuludq	($T2,$T0,&$addr(8));		# h4*s4
1696	&vpaddq		($D3,$D3,$T2);			# d3 += h4*s4
1697	&vpmuludq	($T1,$T0,&$addr(5));		# h4*s1
1698	&vpaddq		($D0,$D0,$T1);			# d0 += h4*s1
1699	&vpmuludq	($T2,$T0,&$addr(0));		# h4*r0
1700	&vpaddq		($D4,$D4,$T2);			# d4 += h4*r0
1701	 &vmovdqa	($MASK,&QWP(64,"ebx"));
1702	&vpmuludq	($T1,$T0,&$addr(6));		# h4*s2
1703	&vpaddq		($D1,$D1,$T1);			# d1 += h4*s2
1704	&vpmuludq	($T0,$T0,&$addr(7));		# h4*s3
1705	&vpaddq		($D2,$D2,$T0);			# d2 += h4*s3
1706}
1707	&vpmuladd	(sub {	my $i=shift; &QWP(32*$i-128,"edx");	});
1708
# vlazy_reduction(): emit the carry propagation that brings the 64-bit
# sums in $D0..$D4 back to (roughly) 26 bits per limb.
#
# Two carry chains are interleaved, h3->h4->h0->h1 and
# h0->h1->h2->h3->h4; $T0 carries the first and $T1 the second. The
# carry out of h4 re-enters h0 multiplied by 5: it is added once,
# shifted left by two and added again. Each limb receives a carry after
# its last masking, hence "lazy": limbs may slightly exceed 26 bits.
# Requires $MASK to hold the limb mask; takes no arguments.
1709sub vlazy_reduction {
1710	################################################################
1711	# lazy reduction
1712
1713	 &vpsrlq	($T0,$D3,26);
1714	 &vpand		($D3,$D3,$MASK);
1715	&vpsrlq		($T1,$D0,26);
1716	&vpand		($D0,$D0,$MASK);
1717	 &vpaddq	($D4,$D4,$T0);			# h3 -> h4
1718	&vpaddq		($D1,$D1,$T1);			# h0 -> h1
1719	 &vpsrlq	($T0,$D4,26);
1720	 &vpand		($D4,$D4,$MASK);
1721	&vpsrlq		($T1,$D1,26);
1722	&vpand		($D1,$D1,$MASK);
1723	&vpaddq		($D2,$D2,$T1);			# h1 -> h2
1724	 &vpaddq	($D0,$D0,$T0);
1725	 &vpsllq	($T0,$T0,2);
1726	&vpsrlq		($T1,$D2,26);
1727	&vpand		($D2,$D2,$MASK);
1728	 &vpaddq	($D0,$D0,$T0);			# h4 -> h0
1729	&vpaddq		($D3,$D3,$T1);			# h2 -> h3
1730	&vpsrlq		($T1,$D3,26);
1731	 &vpsrlq	($T0,$D0,26);
1732	 &vpand		($D0,$D0,$MASK);
1733	&vpand		($D3,$D3,$MASK);
1734	 &vpaddq	($D1,$D1,$T0);			# h0 -> h1
1735	&vpaddq		($D4,$D4,$T1);			# h3 -> h4
1736}
1737	&vlazy_reduction();
1738
	################################################################
	# Remainder of _poly1305_blocks_avx2: end of the main loop, the
	# "tail" multiplication, the horizontal sum of the four lanes and
	# the epilogue.

	# fetch the next 64 bytes; ecx counts the bytes still to be loaded
1739	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));	# load input
1740	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
1741	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
1742	&vinserti128	($T1,$T1,&QWP(16*3,"esi"),1);
1743	&lea		("esi",&DWP(16*4,"esi"));
1744	&sub		("ecx",64);
1745	&jnz		(&label("loop"));
1746
1747&set_label("tail");
1748	&vsplat_input	();
	# ebx may have been advanced by up to 32 bytes to select the pad
	# bits of a short leading group; const_sse2 is 64-byte aligned, so
	# masking gets the base back
1749	&and		("ebx",-64);			# restore pointer
1750
	# Same multiplication as in the loop, but with every table operand
	# displaced by 4 bytes, so that vpmuludq picks up the other dword
	# of each qword - presumably the distinct powers r^4:r^3:r^2:r^1
	# from the "14243444" layout instead of r^4 in every lane.
1751	&vpmuladd	(sub {	my $i=shift; &QWP(4+32*$i-128,"edx");	});
1752
1753	################################################################
1754	# horizontal addition
1755
	# first fold the upper qword of each 128-bit half onto the lower
	# one, then fold the upper 128-bit half onto the lower one
1756	&vpsrldq	($T0,$D4,8);
1757	&vpsrldq	($T1,$D3,8);
1758	&vpaddq		($D4,$D4,$T0);
1759	&vpsrldq	($T0,$D0,8);
1760	&vpaddq		($D3,$D3,$T1);
1761	&vpsrldq	($T1,$D1,8);
1762	&vpaddq		($D0,$D0,$T0);
1763	&vpsrldq	($T0,$D2,8);
1764	&vpaddq		($D1,$D1,$T1);
1765	&vpermq		($T1,$D4,2);			# keep folding
1766	&vpaddq		($D2,$D2,$T0);
1767	&vpermq		($T0,$D3,2);
1768	&vpaddq		($D4,$D4,$T1);
1769	&vpermq		($T1,$D0,2);
1770	&vpaddq		($D3,$D3,$T0);
1771	&vpermq		($T0,$D1,2);
1772	&vpaddq		($D0,$D0,$T1);
1773	&vpermq		($T1,$D2,2);
1774	&vpaddq		($D1,$D1,$T0);
1775	&vpaddq		($D2,$D2,$T1);
1776
1777	&vlazy_reduction();
1778
	# ecx is non-zero only when a short leading group was processed
	# and full 64-byte groups are still pending
1779	&cmp		("ecx",0);
1780	&je		(&label("done"));
1781
1782	################################################################
1783	# clear all but single word
1784
	# the xmm forms are used on purpose; edx goes back to the
	# undisplaced table before the remaining input is processed
1785	&vpshufd	(&X($D0),&X($D0),0b11111100);
1786	&lea		("edx",&DWP(32*5+128,"esp"));	# restore pointer
1787	&vpshufd	(&X($D1),&X($D1),0b11111100);
1788	&vpshufd	(&X($D2),&X($D2),0b11111100);
1789	&vpshufd	(&X($D3),&X($D3),0b11111100);
1790	&vpshufd	(&X($D4),&X($D4),0b11111100);
1791	&jmp		(&label("even"));
1792
	# edi was advanced by 16*3 earlier in this function, hence the
	# negative displacements when writing the five limbs to the context
1793&set_label("done",16);
1794	&vmovd		(&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
1795	&vmovd		(&DWP(-16*3+4*1,"edi"),&X($D1));
1796	&vmovd		(&DWP(-16*3+4*2,"edi"),&X($D2));
1797	&vmovd		(&DWP(-16*3+4*3,"edi"),&X($D3));
1798	&vmovd		(&DWP(-16*3+4*4,"edi"),&X($D4));
1799	&vzeroupper	();
1800	&mov	("esp","ebp");
	# "nodata" is the target of the early exit taken when len rounds
	# down to zero; no frame has been set up on that path
1801&set_label("nodata");
1802&function_end("_poly1305_blocks_avx2");
1803}
# Constant pool, aligned to 64 bytes. The AVX2 code above loads its
# address into ebx and relies on the alignment to recover the base with
# "and ebx,-64". The offsets used by that code (0, 32, 64, 96) imply
# 4-byte words, i.e. 32 bytes per eight-word row.
1804&set_label("const_sse2",64);
	# +0:  pad bit, 1<<24 in the low dword of each qword; OR-ed into
	#      limb 4, which starts at bit 104, so this is bit 128
1805	&data_word(1<<24,0,	1<<24,0,	1<<24,0,	1<<24,0);
	# +32: zeros; read in place of (or right after) the pad bits when a
	#      lane must not receive one
1806	&data_word(0,0,		0,0,		0,0,		0,0);
	# +64: 26-bit limb mask, loaded into $MASK
1807	&data_word(0x03ffffff,0,0x03ffffff,0,	0x03ffffff,0,	0x03ffffff,0);
	# +96: four words only; the sole reference visible here is the
	#      commented-out "magic mask" vpand in _poly1305_init_avx2
	#      NOTE(review): presumably the key clamping mask - confirm
1808	&data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc);
1809}
# Module trailer: embed an identification string in the generated code
# and request 4-byte alignment after it.
1810&asciz	("Poly1305 for x86, CRYPTOGAMS by <appro\@openssl.org>");
1811&align	(4);
1812
# NOTE(review): asm_finish comes from x86asm.pl; presumably it writes
# out the assembly accumulated by the calls above - confirm there.
1813&asm_finish();
1814
# STDOUT was redirected to the output file at the top of the script;
# checking close() turns a failed write or flush into a fatal error
# instead of leaving a truncated file behind unnoticed.
1815close STDOUT or die "error closing STDOUT: $!";
1816