#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
# than twice (>2x) as fast. The most common case, rsa1024 sign, is
# improved by a respectable 50%. It remains to be seen whether loop
# unrolling and a dedicated squaring routine can provide further
# improvement...

# July 2011.
#
# Add dedicated squaring procedure. Performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually the same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

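# MULX computes a 64x64->128 product without touching the arithmetic
# flags, while ADCX and ADOX drive two independent addition chains
# through CF and OF respectively. A minimal C sketch of the idea,
# using the BMI2/ADX intrinsics from <immintrin.h> (illustrative
# only, compiled with -mbmi2 -madx; the final ripple between the two
# top limbs is omitted for brevity; this is not the code the module
# generates):
#
#	#include <immintrin.h>
#	typedef unsigned long long u64;
#
#	/* acc[0..5] += a[0..3]*b, using two interleaved carry chains */
#	static void mulx_row(u64 acc[6], const u64 a[4], u64 b)
#	{
#		unsigned char cf = 0, of = 0;
#		for (int i = 0; i < 4; i++) {
#			u64 hi, lo = _mulx_u64(a[i], b, &hi); /* flags preserved */
#			of = _addcarryx_u64(of, acc[i],   lo, &acc[i]);
#			cf = _addcarryx_u64(cf, acc[i+1], hi, &acc[i+1]);
#		}
#		(void)_addcarryx_u64(of, acc[4], 0, &acc[4]);	/* flush OF chain */
#		(void)_addcarryx_u64(cf, acc[5], 0, &acc[5]);	/* flush CF chain */
#	}
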
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

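# For reference, a rough C sketch of the word-serial Montgomery
# multiplication implemented below, with 64-bit limbs and two carry
# chains mirroring $hi0/$hi1; names and the use of unsigned __int128
# are ours, for illustration only:
#
#	typedef unsigned long long u64;
#	typedef unsigned __int128 u128;
#
#	/* rp[] = ap[]*bp[]*2^(-64*num) mod np[], n0 = -np[0]^(-1) mod 2^64 */
#	static void mont_mul_ref(u64 *rp, const u64 *ap, const u64 *bp,
#	                         const u64 *np, u64 n0, int num)
#	{
#		u64 tp[num+1];			/* tp[num] is the "upmost */
#		for (int k = 0; k <= num; k++)	/*  overflow bit"         */
#			tp[k] = 0;
#		for (int i = 0; i < num; i++) {
#			u64 m1 = (tp[0] + ap[0]*bp[i])*n0;	/* mod 2^64 */
#			u128 ab = (u128)ap[0]*bp[i] + tp[0];
#			u128 nm = (u128)np[0]*m1 + (u64)ab;	/* low 64 bits */
#			u64 hi0 = (u64)(ab>>64);		/* are 0:      */
#			u64 hi1 = (u64)(nm>>64);		/* "discarded" */
#			for (int j = 1; j < num; j++) {
#				ab = (u128)ap[j]*bp[i] + hi0 + tp[j];
#				nm = (u128)np[j]*m1 + hi1 + (u64)ab;
#				hi0 = (u64)(ab>>64);
#				hi1 = (u64)(nm>>64);
#				tp[j-1] = (u64)nm;
#			}
#			u128 top = (u128)hi0 + hi1 + tp[num];
#			tp[num-1] = (u64)top;
#			tp[num]   = (u64)(top>>64);
#		}
#		/* final: rp = tp - np if tp >= np, else rp = tp
#		   (done branch-free at .Lsub/.Lcopy below)      */
#		u64 borrow = 0;
#		for (int j = 0; j < num; j++) {
#			u128 d = (u128)tp[j] - np[j] - borrow;
#			rp[j] = (u64)d;
#			borrow = (u64)(d>>64) & 1;
#		}
#		u64 mask = 0 - (u64)(tp[num] < borrow);	/* tp < np? */
#		for (int j = 0; j < num; j++)
#			rp[j] = (rp[j] & ~mask) | (tp[j] & mask);
#	}
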
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a
	# stack allocation spans two pages, then a reference to the
	# farthest one can be punished with SEGV. But page walking does
	# good even on other OSes, because it guarantees that a villain
	# thread hits the guard page before it can do damage to an
	# innocent one...
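	#
	# In C terms the walk below is roughly (illustrative only,
	# names are ours):
	#
	#	char *sp = old_rsp, *target = new_rsp;	/* target < sp */
	#	while (sp > target) {
	#		sp -= 4096;			/* one page down */
	#		(void)*(volatile char *)sp;	/* probe it      */
	#	}
	#	/* every page of the new frame is now demand-mapped */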
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	mov	$num,$j			# j=num

.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8(%rsp,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	mov	\$-1,%rbx
	xor	%rax,%rbx		# not %rax
	xor	$i,$i
	mov	$num,$j			# j=num

.Lcopy:					# conditional copy
	mov	($rp,$i,8),%rcx
	mov	(%rsp,$i,8),%rdx
	and	%rbx,%rcx
	and	%rax,%rdx
	mov	$num,(%rsp,$i,8)	# zap temporary vector
	or	%rcx,%rdx
	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy
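	# (the loop above is a branch-free select: with mask = 0 - borrow
	# from the .Lsub loop, it computes, in C terms,
	#
	#	rp[i] = (rp[i] & ~mask) | (tp[i] & mask);
	#	tp[i] = num;	/* wipe temporary vector; any value will do */
	#
	# taking the difference when tp >= np and the original tp
	# otherwise, without leaking the comparison through a branch)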

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
	neg	$num			# restore
	and	\$-1024,%r10		# minimize TLB usage

	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jb	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	lea	-4($num),$j
	mov	0(%rsp),@ri[0]		# tp[0]
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$j			# j=num/4-1
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]

.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	pxor	%xmm0,%xmm0
	movq	@ri[0],%xmm4
	pcmpeqd	%xmm5,%xmm5
	pshufd	\$0,%xmm4,%xmm4
	mov	$num,$j
	pxor	%xmm4,%xmm5
	shr	\$2,$j			# j=num/4
	xor	%eax,%eax		# i=0

	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# conditional copy
	movdqa	(%rsp,%rax),%xmm1
	movdqu	($rp,%rax),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax),%xmm3
	movdqa	%xmm0,(%rsp,%rax)
	por	%xmm2,%xmm1
	movdqu	16($rp,%rax),%xmm2
	movdqu	%xmm1,($rp,%rax)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16($rp,%rax)
	lea	32(%rax),%rax
	dec	$j
	jnz	.Lcopy4x
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
{{{
######################################################################
# void bn_sqr8x_mont(
my $rptr="%rdi";	# const BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# not used
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

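# bn_sqr8x_mont is reached via the dispatch at the top of bn_mul_mont,
# which in C terms reads roughly as follows (function names are ours,
# illustrative only):
#
#	if (num % 4 != 0 || num < 8)
#		return mul_mont();		/* .Lmul_enter   */
#	if (bp == ap && num % 8 == 0)
#		return sqr8x_mont();		/* .Lsqr8x_enter */
#	return mul4x_mont();			/* .Lmul4x_enter */
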
$code.=<<___	if ($addx);
.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
___
$code.=<<___;
.extern	bn_sqr8x_internal		# see x86_64-mont5 module

.type	bn_sqr8x_mont,\@function,6
.align	32
bn_sqr8x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lsqr8x_prologue:

	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10		# 4*$num
	neg	$num

	##############################################################
	# ensure that the stack frame doesn't alias with $aptr modulo
	# 4096. this is done to allow the memory disambiguation logic
	# to do its job.
	#
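	# two addresses "alias modulo 4096" when their low 12 bits
	# coincide; an illustrative C predicate for the case being
	# avoided (ours, not part of this module):
	#
	#	int alias_4k(const void *p, const void *q)
	#	{
	#		return (((uintptr_t)p ^ (uintptr_t)q) & 4095) == 0;
	#	}
	#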
	lea	-64(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lsqr8x_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	mov	$num,%r10
	neg	$num

	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
.Lsqr8x_body:

	movq	$nptr, %xmm2		# save pointer to modulus
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	%r10, %xmm3		# -$num
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	and	\$0x80100,%eax
	cmp	\$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %rcx	-8*num
					# %r8	end of tp[2*num]
	lea	(%r8,%rcx),%rbx
	mov	%rcx,$num
	mov	%rcx,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
___
$code.=<<___;
	call	bn_sqr8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %r8	-8*num
					# %rdi	end of tp[2*num]
	lea	(%rdi,$num),%rbx
	mov	$num,%rcx
	mov	$num,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	mov	8*0(%rbx),%r12
	mov	8*1(%rbx),%r13
	mov	8*2(%rbx),%r14
	mov	8*3(%rbx),%r15
	lea	8*4(%rbx),%rbx
	sbb	8*0(%rbp),%r12
	sbb	8*1(%rbp),%r13
	sbb	8*2(%rbp),%r14
	sbb	8*3(%rbp),%r15
	lea	8*4(%rbp),%rbp
	mov	%r12,8*0($rptr)
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr
	inc	%rcx			# preserves %cf
	jnz	.Lsqr8x_sub

	sbb	\$0,%rax		# top-most carry
	lea	(%rbx,$num),%rbx	# rewind
	lea	($rptr,$num),$rptr	# rewind

	movq	%rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	movdqa	16*0(%rbx),%xmm2
	movdqa	16*1(%rbx),%xmm3
	lea	16*2(%rbx),%rbx
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2(%rbx)	# zero tp
	movdqa	%xmm0,-16*1(%rbx)
	movdqa	%xmm0,-16*2(%rbx,%rdx)
	movdqa	%xmm0,-16*1(%rbx,%rdx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	add	\$32,$num
	jnz	.Lsqr8x_cond_copy

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}

if ($addx) {{{
my $bp="%rdx";	# original value

$code.=<<___;
.type	bn_mulx4x_mont,\@function,6
.align	32
bn_mulx4x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
	and	\$-128,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	lea	($bp,$num),%r10
	##############################################################
	# Stack layout
	# +0	num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	saved n0
	# +32	saved rp
	# +40	saved %rsp
	# +48	inner counter
	# +56
	# +64	tmp[num+1]
	#
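	# as a C struct, with field names ours (illustrative only):
	#
	#	struct mulx4x_frame {
	#		u64 num;	/* +0  byte size of vectors */
	#		u64 bptr;	/* +8  off-loaded &b[i]     */
	#		u64 bend;	/* +16 end of b[num]        */
	#		u64 n0;		/* +24 saved n0             */
	#		u64 rp;		/* +32 saved rp             */
	#		u64 saved_rsp;	/* +40 original %rsp        */
	#		u64 counter;	/* +48 inner counter        */
	#		u64 unused;	/* +56                      */
	#		u64 tmp[];	/* +64 tmp[num+1]           */
	#	};
	#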
	mov	$num,0(%rsp)		# save $num
	shr	\$5,$num
	mov	%r10,16(%rsp)		# end of b[num]
	sub	\$1,$num
	mov	$n0, 24(%rsp)		# save *n0
	mov	$rp, 32(%rsp)		# save $rp
	mov	%rax,40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
	mov	$num,48(%rsp)		# inner counter
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
___
my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
$code.=<<___;
	lea	8($bp),$bptr
	mov	($bp),%rdx		# b[0], $bp==%rdx actually
	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	add	%rax,%r11
	mov	$bptr,8(%rsp)		# off-load &b[i]
	mulx	2*8($aptr),%r12,%r13	# ...
	adc	%r14,%r12
	adc	\$0,%r13

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	 mov	$mi,%rdx
	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 .byte	0x67,0x67
	 mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	add	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	mov	($bptr),%rdx		# b[i]
	lea	8($bptr),$bptr		# b++
	sub	$num,$aptr		# rewind $aptr
	mov	%r15,($tptr)		# save top-most carry
	lea	64+4*8(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr

	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	adox	-2*8($tptr),%r12
	adcx	$zero,%r13
	adox	$zero,%r13

	mov	$bptr,8(%rsp)		# off-load &b[i]
	mov	$mi,%r15
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	 mov	$mi,%rdx
	adcx	%rax,%r13
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	lea	4*8($aptr),$aptr
	adox	$zero,%r14

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	lea	4*8($nptr),$nptr
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-4*8($tptr)
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jne	.Lmulx4x_outer

	lea	64(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr
	neg	%r15
	mov	$num,%rdx
	shr	\$3+2,$num		# %cf=0
	mov	32(%rsp),$rptr		# restore rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	mov	8*0($tptr),%r11
	mov	8*1($tptr),%r12
	mov	8*2($tptr),%r13
	mov	8*3($tptr),%r14
	lea	8*4($tptr),$tptr
	sbb	8*0($nptr),%r11
	sbb	8*1($nptr),%r12
	sbb	8*2($nptr),%r13
	sbb	8*3($nptr),%r14
	lea	8*4($nptr),$nptr
	mov	%r11,8*0($rptr)
	mov	%r12,8*1($rptr)
	mov	%r13,8*2($rptr)
	mov	%r14,8*3($rptr)
	lea	8*4($rptr),$rptr
	dec	$num			# preserves %cf
	jnz	.Lmulx4x_sub

	sbb	\$0,%r15		# top-most carry
	lea	64(%rsp),$tptr
	sub	%rdx,$rptr		# rewind

	movq	%r15,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	movdqa	16*0($tptr),%xmm2
	movdqa	16*1($tptr),%xmm3
	lea	16*2($tptr),$tptr
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2($tptr)	# zero tp
	movdqa	%xmm0,-16*1($tptr)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	sub	\$32,%rdx
	jnz	.Lmulx4x_cond_copy

	mov	%rdx,($tptr)

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_prologue
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# body label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue		# HandlerData[]
.align	8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
___
}

print $code;
close STDOUT or die "error closing STDOUT: $!";