1#!/usr/bin/env perl
2# Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# X25519 lower-level primitives for x86_64.
17#
18# February 2018.
19#
20# This module implements radix 2^51 multiplication and squaring, and
21# radix 2^64 multiplication, squaring, addition, subtraction and final
22# reduction. Latter radix is used on ADCX/ADOX-capable processors such
# as Broadwell. On a related note, one should mention that there are
24# vector implementations that provide significantly better performance
25# on some processors(*), but they are large and overly complex. Which
26# in combination with them being effectively processor-specific makes
27# the undertaking hard to justify. The goal for this implementation
28# is rather versatility and simplicity [and ultimately formal
29# verification].
30#
31# (*)	For example sandy2x should provide ~30% improvement on Sandy
32#	Bridge, but only nominal ~5% on Haswell [and big loss on
33#	Broadwell and successors].
34#
35######################################################################
36# Improvement coefficients:
37#
38#			amd64-51(*)	gcc-5.x(**)
39#
40# P4			+22%		+40%
41# Sandy Bridge		-3%		+11%
42# Haswell		-1%		+13%
43# Broadwell(***)	+30%		+35%
44# Skylake(***)		+33%		+47%
45# Silvermont		+20%		+26%
46# Goldmont		+40%		+50%
47# Bulldozer		+20%		+9%
48# Ryzen(***)		+43%		+40%
49# VIA			+170%		+120%
50#
51# (*)	amd64-51 is popular assembly implementation with 2^51 radix,
52#	only multiplication and squaring subroutines were linked
53#	for comparison, but not complete ladder step; gain on most
54#	processors is because this module refrains from shld, and
55#	minor regression on others is because this does result in
56#	higher instruction count;
57# (**)	compiler is free to inline functions, in assembly one would
58#	need to implement ladder step to do that, and it will improve
59#	performance by several percent;
60# (***)	ADCX/ADOX result for 2^64 radix, there is no corresponding
61#	C implementation, so that comparison is always against
62#	2^51 radix;
63
# Command-line handling: flavour (elf, macosx, mingw64, nasm, ...) and
# output file; a lone argument containing a dot is the output file.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe all generated code through the translator.  Fail loudly if the
# translator cannot be spawned instead of silently emitting nothing.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Probe the toolchain for ADCX/ADOX support; $addx gates emission of
# the radix-2^64 code path below.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
97
98$code.=<<___;
99.text
100
# void x25519_fe51_mul(uint64 h[5], const uint64 f[5], const uint64 g[5]);
# Radix-2^51 field multiplication; accumulates ten 128-bit limb sums in
# rbx:rcx .. r14:r15, then jumps to the shared .Lreduce51 tail.
.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function,3
.align	32
x25519_fe51_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul_body:

	mov	8*0(%rsi),%rax		# f[0]
	mov	8*0(%rdx),%r11		# load g[0-4]
	mov	8*1(%rdx),%r12
	mov	8*2(%rdx),%r13
	mov	8*3(%rdx),%rbp
	mov	8*4(%rdx),%r14

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	mov	%rax,%rdi
	mulq	%r11			# f[0]*g[0]
	mov	%r11,8*0(%rsp)		# offload g[0]
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	%rdi,%rax
	mov	%rdx,%rcx
	mulq	%r12			# f[0]*g[1]
	mov	%r12,8*1(%rsp)		# offload g[1]
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	%rdi,%rax
	lea	(%r14,%r14,8),%r15
	mov	%rdx,%r9
	mulq	%r13			# f[0]*g[2]
	mov	%r13,8*2(%rsp)		# offload g[2]
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	%rdi,%rax
	lea	(%r14,%r15,2),%rdi	# g[4]*19
	mov	%rdx,%r11
	mulq	%rbp			# f[0]*g[3]
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	8*0(%rsi),%rax		# f[0]
	mov	%rdx,%r13
	mulq	%r14			# f[0]*g[4]
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	8*1(%rsi),%rax		# f[1]
	mov	%rdx,%r15

	mulq	%rdi			# f[1]*g[4]*19
	add	%rax,%rbx
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%rcx
	mulq	%rdi			# f[2]*g[4]*19
	add	%rax,%r8
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r9
	mulq	%rdi			# f[3]*g[4]*19
	add	%rax,%r10
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r11
	mulq	%rdi			# f[4]*g[4]*19
	imulq	\$19,%rbp,%rdi		# g[3]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r13
	mulq	%rbp			# f[1]*g[3]
	mov	8*2(%rsp),%rbp		# g[2]
	add	%rax,%r14
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r15

	mulq	%rdi			# f[2]*g[3]*19
	add	%rax,%rbx
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%rcx
	mulq	%rdi			# f[3]*g[3]*19
	add	%rax,%r8
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r9
	mulq	%rdi			# f[4]*g[3]*19
	imulq	\$19,%rbp,%rdi		# g[2]*19
	add	%rax,%r10
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r11
	mulq	%rbp			# f[1]*g[2]
	add	%rax,%r12
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r13
	mulq	%rbp			# f[2]*g[2]
	mov	8*1(%rsp),%rbp		# g[1]
	add	%rax,%r14
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r15

	mulq	%rdi			# f[3]*g[2]*19
	add	%rax,%rbx
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%rcx
	mulq	%rdi			# f[4]*g[2]*19
	add	%rax,%r8
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r9
	mulq	%rbp			# f[1]*g[1]
	imulq	\$19,%rbp,%rdi
	add	%rax,%r10
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r11
	mulq	%rbp			# f[2]*g[1]
	add	%rax,%r12
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r13
	mulq	%rbp			# f[3]*g[1]
	mov	8*0(%rsp),%rbp		# g[0]
	add	%rax,%r14
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r15

	mulq	%rdi			# f[4]*g[1]*19
	add	%rax,%rbx
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%rcx
	mulq	%rbp			# f[1]*g[0]
	add	%rax,%r8
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r9
	mulq	%rbp			# f[2]*g[0]
	add	%rax,%r10
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r11
	mulq	%rbp			# f[3]*g[0]
	add	%rax,%r12
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r13
	mulq	%rbp			# f[4]*g[0]
	add	%rax,%r14
	adc	%rdx,%r15

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51
.Lfe51_mul_epilogue:
.cfi_endproc
.size	x25519_fe51_mul,.-x25519_fe51_mul
251
# void x25519_fe51_sqr(uint64 h[5], const uint64 f[5]);
# Radix-2^51 field squaring: doubled cross products are formed by
# pre-doubling one operand, squares are added directly; ends in the
# shared .Lreduce51 tail.
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function,2
.align	32
x25519_fe51_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_sqr_body:

	mov	8*0(%rsi),%rax		# g[0]
	mov	8*2(%rsi),%r15		# g[2]
	mov	8*4(%rsi),%rbp		# g[4]

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	lea	(%rax,%rax),%r14
	mulq	%rax			# g[0]*g[0]
	mov	%rax,%rbx
	mov	8*1(%rsi),%rax		# g[1]
	mov	%rdx,%rcx
	mulq	%r14			# 2*g[0]*g[1]
	mov	%rax,%r8
	mov	%r15,%rax
	mov	%r15,8*0(%rsp)		# offload g[2]
	mov	%rdx,%r9
	mulq	%r14			# 2*g[0]*g[2]
	mov	%rax,%r10
	mov	8*3(%rsi),%rax
	mov	%rdx,%r11
	imulq	\$19,%rbp,%rdi		# g[4]*19
	mulq	%r14			# 2*g[0]*g[3]
	mov	%rax,%r12
	mov	%rbp,%rax
	mov	%rdx,%r13
	mulq	%r14			# 2*g[0]*g[4]
	mov	%rax,%r14
	mov	%rbp,%rax
	mov	%rdx,%r15

	mulq	%rdi			# g[4]*g[4]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# g[1]
	adc	%rdx,%r13

	mov	8*3(%rsi),%rsi		# g[3]
	lea	(%rax,%rax),%rbp
	mulq	%rax			# g[1]*g[1]
	add	%rax,%r10
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r11
	mulq	%rbp			# 2*g[1]*g[2]
	add	%rax,%r12
	mov	%rbp,%rax
	adc	%rdx,%r13
	mulq	%rsi			# 2*g[1]*g[3]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	imulq	\$19,%rsi,%rbp		# g[3]*19
	mulq	%rdi			# 2*g[1]*g[4]*19
	add	%rax,%rbx
	lea	(%rsi,%rsi),%rax
	adc	%rdx,%rcx

	mulq	%rdi			# 2*g[3]*g[4]*19
	add	%rax,%r10
	mov	%rsi,%rax
	adc	%rdx,%r11
	mulq	%rbp			# g[3]*g[3]*19
	add	%rax,%r8
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r9

	lea	(%rax,%rax),%rsi
	mulq	%rax			# g[2]*g[2]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	mulq	%rsi			# 2*g[2]*g[3]*19
	add	%rax,%rbx
	mov	%rsi,%rax
	adc	%rdx,%rcx
	mulq	%rdi			# 2*g[2]*g[4]*19
	add	%rax,%r8
	adc	%rdx,%r9

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51

.align	32
# Shared tail for the three fe51 entry points: propagate the five
# 128-bit accumulators into 51-bit limbs, folding the overflow of the
# top limb back into the bottom one via multiplication by 19.
.Lreduce51:
	mov	\$0x7ffffffffffff,%rbp

	mov	%r10,%rdx
	shr	\$51,%r10
	shl	\$13,%r11
	and	%rbp,%rdx		# %rdx = g2 = h2 & mask
	or	%r10,%r11		# h2>>51
	add	%r11,%r12
	adc	\$0,%r13		# h3 += h2>>51

	mov	%rbx,%rax
	shr	\$51,%rbx
	shl	\$13,%rcx
	and	%rbp,%rax		# %rax = g0 = h0 & mask
	or	%rbx,%rcx		# h0>>51
	add	%rcx,%r8		# h1 += h0>>51
	adc	\$0,%r9

	mov	%r12,%rbx
	shr	\$51,%r12
	shl	\$13,%r13
	and	%rbp,%rbx		# %rbx = g3 = h3 & mask
	or	%r12,%r13		# h3>>51
	add	%r13,%r14		# h4 += h3>>51
	adc	\$0,%r15

	mov	%r8,%rcx
	shr	\$51,%r8
	shl	\$13,%r9
	and	%rbp,%rcx		# %rcx = g1 = h1 & mask
	or	%r8,%r9
	add	%r9,%rdx		# g2 += h1>>51

	mov	%r14,%r10
	shr	\$51,%r14
	shl	\$13,%r15
	and	%rbp,%r10		# %r10 = g4 = h4 & mask
	or	%r14,%r15		# h4>>51

	lea	(%r15,%r15,8),%r14
	lea	(%r15,%r14,2),%r15
	add	%r15,%rax		# g0 += (h4>>51)*19

	mov	%rdx,%r8
	and	%rbp,%rdx		# g2 &= mask
	shr	\$51,%r8
	add	%r8,%rbx		# g3 += g2>>51

	mov	%rax,%r9
	and	%rbp,%rax		# g0 &= mask
	shr	\$51,%r9
	add	%r9,%rcx		# g1 += g0>>51

	mov	%rax,8*0(%rdi)		# save the result
	mov	%rcx,8*1(%rdi)
	mov	%rdx,8*2(%rdi)
	mov	%rbx,8*3(%rdi)
	mov	%r10,8*4(%rdi)

	mov	8*5(%rsp),%r15
.cfi_restore	%r15
	mov	8*6(%rsp),%r14
.cfi_restore	%r14
	mov	8*7(%rsp),%r13
.cfi_restore	%r13
	mov	8*8(%rsp),%r12
.cfi_restore	%r12
	mov	8*9(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*10(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*11(%rsp),%rsp
.cfi_adjust_cfa_offset	-88	# deallocating 88 bytes decreases CFA offset
.Lfe51_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe51_sqr,.-x25519_fe51_sqr
431
# void x25519_fe51_mul121666(uint64 h[5], const uint64 f[5]);
# Multiply a radix-2^51 element by the curve constant 121666, then
# carry-propagate via the shared .Lreduce51 tail.
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function,2
.align	32
x25519_fe51_mul121666:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul121666_body:
	mov	\$121666,%eax

	mulq	8*0(%rsi)
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	\$121666,%eax
	mov	%rdx,%rcx
	mulq	8*1(%rsi)
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	\$121666,%eax
	mov	%rdx,%r9
	mulq	8*2(%rsi)
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	\$121666,%eax
	mov	%rdx,%r11
	mulq	8*3(%rsi)
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	\$121666,%eax		# f[0]
	mov	%rdx,%r13
	mulq	8*4(%rsi)
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	%rdx,%r15

	jmp	.Lreduce51
.Lfe51_mul121666_epilogue:
.cfi_endproc
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
478___
479########################################################################
480# Base 2^64 subroutines modulo 2*(2^255-19)
481#
482if ($addx) {
483my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));
484
485$code.=<<___;
.extern	OPENSSL_ia32cap_P
# Returns non-zero iff the CPU reports both feature bits selected by
# mask 0x80100 in the third OPENSSL_ia32cap_P word (BMI2 and ADX,
# i.e. mulx and adcx/adox), gating the fe64 code paths below.
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
	mov	OPENSSL_ia32cap_P+8(%rip),%ecx
	xor	%eax,%eax
	and	\$0x80100,%ecx
	cmp	\$0x80100,%ecx
	cmove	%ecx,%eax
	ret
.size	x25519_fe64_eligible,.-x25519_fe64_eligible
498
# void x25519_fe64_mul(uint64 h[4], const uint64 a[4], const uint64 b[4]);
# Radix-2^64 multiplication, reduced by the shared .Lreduce64 tail.
.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function,3
.align	32
x25519_fe64_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_mul_body:

	# 4x4-limb schoolbook multiply: rdx always holds the current
	# a[i] for mulx, while adcx and adox run two independent carry
	# chains (CF and OF); edi is kept zero to terminate each chain.
	mov	%rdx,%rax
	mov	8*0(%rdx),%rbp		# b[0]
	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rax),%rcx		# b[1]
	mov	8*2(%rax),$acc6		# b[2]
	mov	8*3(%rax),$acc7		# b[3]

	mulx	%rbp,$acc0,%rax		# a[0]*b[0]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rcx,$acc1,%rbx		# a[0]*b[1]
	adcx	%rax,$acc1
	mulx	$acc6,$acc2,%rax	# a[0]*b[2]
	adcx	%rbx,$acc2
	mulx	$acc7,$acc3,$acc4	# a[0]*b[3]
	 mov	8*1(%rsi),%rdx		# a[1]
	adcx	%rax,$acc3
	mov	$acc6,(%rsp)		# offload b[2]
	adcx	%rdi,$acc4		# cf=0

	mulx	%rbp,%rax,%rbx		# a[1]*b[0]
	adox	%rax,$acc1
	adcx	%rbx,$acc2
	mulx	%rcx,%rax,%rbx		# a[1]*b[1]
	adox	%rax,$acc2
	adcx	%rbx,$acc3
	mulx	$acc6,%rax,%rbx		# a[1]*b[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	$acc7,%rax,$acc5	# a[1]*b[3]
	 mov	8*2(%rsi),%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5		# cf=0
	adox	%rdi,$acc5		# of=0

	mulx	%rbp,%rax,%rbx		# a[2]*b[0]
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	%rcx,%rax,%rbx		# a[2]*b[1]
	adcx	%rax,$acc3
	adox	%rbx,$acc4
	mulx	$acc6,%rax,%rbx		# a[2]*b[2]
	adcx	%rax,$acc4
	adox	%rbx,$acc5
	mulx	$acc7,%rax,$acc6	# a[2]*b[3]
	 mov	8*3(%rsi),%rdx		# a[3]
	adcx	%rax,$acc5
	adox	%rdi,$acc6		# of=0
	adcx	%rdi,$acc6		# cf=0

	mulx	%rbp,%rax,%rbx		# a[3]*b[0]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rcx,%rax,%rbx		# a[3]*b[1]
	adox	%rax,$acc4
	adcx	%rbx,$acc5
	mulx	(%rsp),%rax,%rbx	# a[3]*b[2]
	adox	%rax,$acc5
	adcx	%rbx,$acc6
	mulx	$acc7,%rax,$acc7	# a[3]*b[3]
	 mov	\$38,%edx
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0

	jmp	.Lreduce64
.Lfe64_mul_epilogue:
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul
590
# void x25519_fe64_sqr(uint64 h[4], const uint64 a[4]);
# Radix-2^64 squaring, reduced by the shared .Lreduce64 tail.
.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function,2
.align	32
x25519_fe64_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_sqr_body:

	# Form the off-diagonal products a[i]*a[j] (i<j) first; below
	# they are doubled via the adcx self-add chain while the square
	# terms a[i]*a[i] are merged in on the adox chain.  edi stays
	# zero as the carry-chain terminator.
	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rsi),%rcx		# a[1]
	mov	8*2(%rsi),%rbp		# a[2]
	mov	8*3(%rsi),%rsi		# a[3]

	################################################################
	mulx	%rdx,$acc0,$acc7	# a[0]*a[0]
	mulx	%rcx,$acc1,%rax		# a[0]*a[1]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rbp,$acc2,%rbx		# a[0]*a[2]
	adcx	%rax,$acc2
	mulx	%rsi,$acc3,$acc4	# a[0]*a[3]
	 mov	%rcx,%rdx		# a[1]
	adcx	%rbx,$acc3
	adcx	%rdi,$acc4		# cf=0

	################################################################
	mulx	%rbp,%rax,%rbx		# a[1]*a[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rsi,%rax,$acc5		# a[1]*a[3]
	 mov	%rbp,%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5

	################################################################
	mulx	%rsi,%rax,$acc6		# a[2]*a[3]
	 mov	%rcx,%rdx		# a[1]
	adox	%rax,$acc5
	adcx	%rdi,$acc6		# cf=0
	adox	%rdi,$acc6		# of=0

	 adcx	$acc1,$acc1		# acc1:6<<1
	adox	$acc7,$acc1
	 adcx	$acc2,$acc2
	mulx	%rdx,%rax,%rbx		# a[1]*a[1]
	 mov	%rbp,%rdx		# a[2]
	 adcx	$acc3,$acc3
	adox	%rax,$acc2
	 adcx	$acc4,$acc4
	adox	%rbx,$acc3
	mulx	%rdx,%rax,%rbx		# a[2]*a[2]
	 mov	%rsi,%rdx		# a[3]
	 adcx	$acc5,$acc5
	adox	%rax,$acc4
	 adcx	$acc6,$acc6
	adox	%rbx,$acc5
	mulx	%rdx,%rax,$acc7		# a[3]*a[3]
	 mov	\$38,%edx
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0
	jmp	.Lreduce64
667
.align	32
# Shared tail for fe64_mul/fe64_sqr.  On entry rdx = 38 and the full
# 512-bit product sits in acc0..acc7 with rdi = 0.  Fold the upper
# four limbs back as acc[i+4]*38 (2^256 == 38 mod 2*(2^255-19)), then
# fold the final carry the same way and store the result.
.Lreduce64:
	mulx	$acc4,%rax,%rbx
	adcx	%rax,$acc0
	adox	%rbx,$acc1
	mulx	$acc5,%rax,%rbx
	adcx	%rax,$acc1
	adox	%rbx,$acc2
	mulx	$acc6,%rax,%rbx
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	$acc7,%rax,$acc4
	adcx	%rax,$acc3
	adox	%rdi,$acc4
	adcx	%rdi,$acc4

	mov	8*2(%rsp),%rdi		# restore dst
	imulq	%rdx,$acc4

	add	$acc4,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

	mov	8*3(%rsp),%r15
.cfi_restore	%r15
	mov	8*4(%rsp),%r14
.cfi_restore	%r14
	mov	8*5(%rsp),%r13
.cfi_restore	%r13
	mov	8*6(%rsp),%r12
.cfi_restore	%r12
	mov	8*7(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*8(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*9(%rsp),%rsp
.cfi_adjust_cfa_offset	-72	# deallocating 72 bytes decreases CFA offset
.Lfe64_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sqr,.-x25519_fe64_sqr
719
# void x25519_fe64_mul121666(uint64 h[4], const uint64 f[4]);
# Multiply a radix-2^64 element by 121666: the 64-bit overflow limb is
# folded back as *38, and a possible final carry is folded once more.
.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function,2
.align	32
x25519_fe64_mul121666:
.Lfe64_mul121666_body:
	mov	\$121666,%edx
	mulx	8*0(%rsi),$acc0,%rcx
	mulx	8*1(%rsi),$acc1,%rax
	add	%rcx,$acc1
	mulx	8*2(%rsi),$acc2,%rcx
	adc	%rax,$acc2
	mulx	8*3(%rsi),$acc3,%rax
	adc	%rcx,$acc3
	adc	\$0,%rax

	imulq	\$38,%rax,%rax

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

.Lfe64_mul121666_epilogue:
	ret
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666
754
# void x25519_fe64_add(uint64 h[4], const uint64 f[4], const uint64 g[4]);
# 256-bit addition; each carry out of bit 255+1 is turned into a mask,
# reduced to 38 (2^256 == 38 mod 2*(2^255-19)) and added back in.
.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function,3
.align	32
x25519_fe64_add:
.Lfe64_add_body:
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	add	8*0(%rdx),$acc0
	adc	8*1(%rdx),$acc1
	adc	8*2(%rdx),$acc2
	adc	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	adc	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_add_epilogue:
	ret
.size	x25519_fe64_add,.-x25519_fe64_add
789
# void x25519_fe64_sub(uint64 h[4], const uint64 f[4], const uint64 g[4]);
# 256-bit subtraction; each borrow is turned into a mask, reduced to
# 38 and subtracted back out, mirroring x25519_fe64_add.
.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function,3
.align	32
x25519_fe64_sub:
.Lfe64_sub_body:
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	sub	8*0(%rdx),$acc0
	sbb	8*1(%rdx),$acc1
	sbb	8*2(%rdx),$acc2
	sbb	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	sbb	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax

	sub	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_sub_epilogue:
	ret
.size	x25519_fe64_sub,.-x25519_fe64_sub
824
# void x25519_fe64_tobytes(uint8 out[32], const uint64 f[4]);
# Fully reduce a radix-2^64 element modulo 2^255-19 and store the
# canonical little-endian 32-byte representation.
.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function,2
.align	32
x25519_fe64_tobytes:
.Lfe64_to_body:
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	################################# reduction modulo 2^255-19
	lea	($acc3,$acc3),%rax
	sar	\$63,$acc3		# most significant bit -> mask
	shr	\$1,%rax		# most significant bit cleared
	and	\$19,$acc3
	add	\$19,$acc3		# compare to modulus in the same go

	add	$acc3,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,%rax

	lea	(%rax,%rax),$acc3
	sar	\$63,%rax		# most significant bit -> mask
	shr	\$1,$acc3		# most significant bit cleared
	not	%rax
	and	\$19,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	sbb	\$0,$acc3

	mov	$acc0,8*0(%rdi)
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)

.Lfe64_to_epilogue:
	ret
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
866___
867} else {
868$code.=<<___;
# Fallback branch for toolchains without ADCX/ADOX support:
# x25519_fe64_eligible reports 0, so the fe64 entry points must never
# be reached; they all alias a single ud2 trap in case they are.
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
	xor	%eax,%eax
	ret
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@abi-omnipotent
.globl	x25519_fe64_sqr
.globl	x25519_fe64_mul121666
.globl	x25519_fe64_add
.globl	x25519_fe64_sub
.globl	x25519_fe64_tobytes
x25519_fe64_mul:
x25519_fe64_sqr:
x25519_fe64_mul121666:
x25519_fe64_add:
x25519_fe64_sub:
x25519_fe64_tobytes:
	.byte	0x0f,0x0b	# ud2
	ret
.size	x25519_fe64_mul,.-x25519_fe64_mul
893___
894}
895$code.=<<___;
896.asciz	"X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
897___
898
899# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
900#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
901if ($win64) {
902$rec="%rcx";
903$frame="%rdx";
904$context="%r8";
905$disp="%r9";
906
907$code.=<<___;
908.extern	__imp_RtlVirtualUnwind
909
# Win64 SEH handler for functions that save no registers on the
# stack: if the fault address is past the body label, context->Rsp is
# taken as the frame pointer unchanged; nothing needs restoring
# beyond the common tail.
.type	short_handler,\@abi-omnipotent
.align	16
short_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp
	jmp	.Lcommon_seh_tail
.size	short_handler,.-short_handler
938
# Win64 SEH handler for functions that push rbp/rbx/r12-r15:
# HandlerData carries the body label, the epilogue label and the
# frame size (the .xdata entries elsewhere in this file).  If the
# fault lies between the two labels, the six saved registers are
# recovered from the stack frame and written back into the CONTEXT
# before the unwind continues.
.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rax,%r10),%rax

	mov	-8(%rax),%rbp
	mov	-16(%rax),%rbx
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	full_handler,.-full_handler
1026
1027.section	.pdata
1028.align	4
1029	.rva	.LSEH_begin_x25519_fe51_mul
1030	.rva	.LSEH_end_x25519_fe51_mul
1031	.rva	.LSEH_info_x25519_fe51_mul
1032
1033	.rva	.LSEH_begin_x25519_fe51_sqr
1034	.rva	.LSEH_end_x25519_fe51_sqr
1035	.rva	.LSEH_info_x25519_fe51_sqr
1036
1037	.rva	.LSEH_begin_x25519_fe51_mul121666
1038	.rva	.LSEH_end_x25519_fe51_mul121666
1039	.rva	.LSEH_info_x25519_fe51_mul121666
1040___
1041$code.=<<___	if ($addx);
1042	.rva	.LSEH_begin_x25519_fe64_mul
1043	.rva	.LSEH_end_x25519_fe64_mul
1044	.rva	.LSEH_info_x25519_fe64_mul
1045
1046	.rva	.LSEH_begin_x25519_fe64_sqr
1047	.rva	.LSEH_end_x25519_fe64_sqr
1048	.rva	.LSEH_info_x25519_fe64_sqr
1049
1050	.rva	.LSEH_begin_x25519_fe64_mul121666
1051	.rva	.LSEH_end_x25519_fe64_mul121666
1052	.rva	.LSEH_info_x25519_fe64_mul121666
1053
1054	.rva	.LSEH_begin_x25519_fe64_add
1055	.rva	.LSEH_end_x25519_fe64_add
1056	.rva	.LSEH_info_x25519_fe64_add
1057
1058	.rva	.LSEH_begin_x25519_fe64_sub
1059	.rva	.LSEH_end_x25519_fe64_sub
1060	.rva	.LSEH_info_x25519_fe64_sub
1061
1062	.rva	.LSEH_begin_x25519_fe64_tobytes
1063	.rva	.LSEH_end_x25519_fe64_tobytes
1064	.rva	.LSEH_info_x25519_fe64_tobytes
1065___
1066$code.=<<___;
1067.section	.xdata
1068.align	8
1069.LSEH_info_x25519_fe51_mul:
1070	.byte	9,0,0,0
1071	.rva	full_handler
1072	.rva	.Lfe51_mul_body,.Lfe51_mul_epilogue	# HandlerData[]
1073	.long	88,0
1074.LSEH_info_x25519_fe51_sqr:
1075	.byte	9,0,0,0
1076	.rva	full_handler
1077	.rva	.Lfe51_sqr_body,.Lfe51_sqr_epilogue	# HandlerData[]
1078	.long	88,0
1079.LSEH_info_x25519_fe51_mul121666:
1080	.byte	9,0,0,0
1081	.rva	full_handler
1082	.rva	.Lfe51_mul121666_body,.Lfe51_mul121666_epilogue	# HandlerData[]
1083	.long	88,0
1084___
1085$code.=<<___	if ($addx);
1086.LSEH_info_x25519_fe64_mul:
1087	.byte	9,0,0,0
1088	.rva	full_handler
1089	.rva	.Lfe64_mul_body,.Lfe64_mul_epilogue	# HandlerData[]
1090	.long	72,0
1091.LSEH_info_x25519_fe64_sqr:
1092	.byte	9,0,0,0
1093	.rva	full_handler
1094	.rva	.Lfe64_sqr_body,.Lfe64_sqr_epilogue	# HandlerData[]
1095	.long	72,0
1096.LSEH_info_x25519_fe64_mul121666:
1097	.byte	9,0,0,0
1098	.rva	short_handler
1099	.rva	.Lfe64_mul121666_body,.Lfe64_mul121666_epilogue	# HandlerData[]
1100.LSEH_info_x25519_fe64_add:
1101	.byte	9,0,0,0
1102	.rva	short_handler
1103	.rva	.Lfe64_add_body,.Lfe64_add_epilogue	# HandlerData[]
1104.LSEH_info_x25519_fe64_sub:
1105	.byte	9,0,0,0
1106	.rva	short_handler
1107	.rva	.Lfe64_sub_body,.Lfe64_sub_epilogue	# HandlerData[]
1108.LSEH_info_x25519_fe64_tobytes:
1109	.byte	9,0,0,0
1110	.rva	short_handler
1111	.rva	.Lfe64_to_body,.Lfe64_to_epilogue	# HandlerData[]
1112___
1113}
1114
# Expand backtick-quoted expressions (compile-time arithmetic in the
# assembly text), emit the result, and make sure the buffered output
# actually reached the translator pipe - a failed close would
# otherwise silently truncate the generated file.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
1118