#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives only a
# modest 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs
# more than twice as fast. The most common case, rsa1024 sign, is
# improved by a respectable 50%. It remains to be seen whether loop
# unrolling and a dedicated squaring routine can provide further
# improvement...
17
# July 2011.
#
# Add a dedicated squaring procedure. The performance improvement
# varies from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23
# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that they
# are "fallen through" for an input length of 8, which is critical for
# 1024-bit RSA *sign*. The average performance improvement in comparison
# to the *initial* 2005 version of this module is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31
32$flavour = shift;
33$output  = shift;
34if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
36$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
37
38$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
39( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
40( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
41die "can't locate x86_64-xlate.pl";
42
43open OUT,"| \"$^X\" $xlate $flavour $output";
44*STDOUT=*OUT;
45
46# int bn_mul_mont(
47$rp="%rdi";	# BN_ULONG *rp,
48$ap="%rsi";	# const BN_ULONG *ap,
49$bp="%rdx";	# const BN_ULONG *bp,
50$np="%rcx";	# const BN_ULONG *np,
51$n0="%r8";	# const BN_ULONG *n0,
52$num="%r9";	# int num);
53$lo0="%r10";
54$hi0="%r11";
55$hi1="%r13";
56$i="%r14";
57$j="%r15";
58$m0="%rbx";
59$m1="%rbp";
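
# The loop below implements word-by-word Montgomery multiplication:
# every outer pass folds in one word bp[i] of the multiplier and
# immediately performs one reduction step.  As a rough C-style model of
# what bn_mul_mont computes -- for illustration only, it is not used by
# this script -- with rp/ap/bp/np/num as in the prototype above, a
# 64-bit BN_ULONG, an unsigned __int128 accumulator, and n0 standing
# for the value n0[0] = -1/np[0] mod 2^64 pulled through the n0 pointer:
#
#	BN_ULONG tp[num+1];		/* scratch, tp[num] is the carry word */
#	memset(tp,0,sizeof(tp));
#	for (int i=0; i<num; i++) {
#		unsigned __int128 t;
#		BN_ULONG cA=0, cN=0;	/* carries of the two product chains */
#		BN_ULONG m = (tp[0] + ap[0]*bp[i])*n0;	/* mod 2^64 */
#		for (int j=0; j<num; j++) {
#			t  = (unsigned __int128)ap[j]*bp[i] + tp[j] + cA;
#			cA = (BN_ULONG)(t>>64);
#			t  = (unsigned __int128)np[j]*m + (BN_ULONG)t + cN;
#			cN = (BN_ULONG)(t>>64);
#			if (j) tp[j-1] = (BN_ULONG)t;	/* zero for j==0 */
#		}
#		t = (unsigned __int128)tp[num] + cA + cN;
#		tp[num-1] = (BN_ULONG)t;
#		tp[num]   = (BN_ULONG)(t>>64);
#	}
#	/* if the (num+1)-word tp is >= the modulus, the result written
#	 * to rp is tp - np, otherwise it is tp itself; see .Lsub and
#	 * .Lcopy for the branch-free rendition */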
60
61$code=<<___;
62.text
63
64.globl	bn_mul_mont
65.type	bn_mul_mont,\@function,6
66.align	16
67bn_mul_mont:
	test	\$3,${num}d		# num%4 != 0?
	jnz	.Lmul_enter		# yes, use generic 1x code
	cmp	\$8,${num}d		# num < 8?
	jb	.Lmul_enter		# yes, use generic 1x code
	cmp	$ap,$bp
	jne	.Lmul4x_enter		# bp!=ap, use 4x-unrolled multiply
	jmp	.Lsqr4x_enter		# bp==ap, use dedicated squaring
75
76.align	16
77.Lmul_enter:
78	push	%rbx
79	push	%rbp
80	push	%r12
81	push	%r13
82	push	%r14
83	push	%r15
84
85	mov	${num}d,${num}d
86	lea	2($num),%r10
87	mov	%rsp,%r11
88	neg	%r10
89	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
90	and	\$-1024,%rsp		# minimize TLB usage
91
92	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
93.Lmul_body:
94	mov	$bp,%r12		# reassign $bp
95___
96		$bp="%r12";
97$code.=<<___;
98	mov	($n0),$n0		# pull n0[0] value
99	mov	($bp),$m0		# m0=bp[0]
100	mov	($ap),%rax
101
102	xor	$i,$i			# i=0
103	xor	$j,$j			# j=0
104
105	mov	$n0,$m1
106	mulq	$m0			# ap[0]*bp[0]
107	mov	%rax,$lo0
108	mov	($np),%rax
109
110	imulq	$lo0,$m1		# "tp[0]"*n0
111	mov	%rdx,$hi0
112
113	mulq	$m1			# np[0]*m1
114	add	%rax,$lo0		# discarded
115	mov	8($ap),%rax
116	adc	\$0,%rdx
117	mov	%rdx,$hi1
118
119	lea	1($j),$j		# j++
120	jmp	.L1st_enter
121
122.align	16
123.L1st:
124	add	%rax,$hi1
125	mov	($ap,$j,8),%rax
126	adc	\$0,%rdx
127	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
128	mov	$lo0,$hi0
129	adc	\$0,%rdx
130	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
131	mov	%rdx,$hi1
132
133.L1st_enter:
134	mulq	$m0			# ap[j]*bp[0]
135	add	%rax,$hi0
136	mov	($np,$j,8),%rax
137	adc	\$0,%rdx
138	lea	1($j),$j		# j++
139	mov	%rdx,$lo0
140
141	mulq	$m1			# np[j]*m1
142	cmp	$num,$j
143	jne	.L1st
144
145	add	%rax,$hi1
146	mov	($ap),%rax		# ap[0]
147	adc	\$0,%rdx
148	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
149	adc	\$0,%rdx
150	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
151	mov	%rdx,$hi1
152	mov	$lo0,$hi0
153
154	xor	%rdx,%rdx
155	add	$hi0,$hi1
156	adc	\$0,%rdx
157	mov	$hi1,-8(%rsp,$num,8)
158	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
159
160	lea	1($i),$i		# i++
161	jmp	.Louter
162.align	16
163.Louter:
164	mov	($bp,$i,8),$m0		# m0=bp[i]
165	xor	$j,$j			# j=0
166	mov	$n0,$m1
167	mov	(%rsp),$lo0
168	mulq	$m0			# ap[0]*bp[i]
169	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
170	mov	($np),%rax
171	adc	\$0,%rdx
172
173	imulq	$lo0,$m1		# tp[0]*n0
174	mov	%rdx,$hi0
175
176	mulq	$m1			# np[0]*m1
177	add	%rax,$lo0		# discarded
178	mov	8($ap),%rax
179	adc	\$0,%rdx
180	mov	8(%rsp),$lo0		# tp[1]
181	mov	%rdx,$hi1
182
183	lea	1($j),$j		# j++
184	jmp	.Linner_enter
185
186.align	16
187.Linner:
188	add	%rax,$hi1
189	mov	($ap,$j,8),%rax
190	adc	\$0,%rdx
191	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
192	mov	(%rsp,$j,8),$lo0
193	adc	\$0,%rdx
194	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
195	mov	%rdx,$hi1
196
197.Linner_enter:
198	mulq	$m0			# ap[j]*bp[i]
199	add	%rax,$hi0
200	mov	($np,$j,8),%rax
201	adc	\$0,%rdx
202	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
203	mov	%rdx,$hi0
204	adc	\$0,$hi0
205	lea	1($j),$j		# j++
206
207	mulq	$m1			# np[j]*m1
208	cmp	$num,$j
209	jne	.Linner
210
211	add	%rax,$hi1
212	mov	($ap),%rax		# ap[0]
213	adc	\$0,%rdx
214	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
215	mov	(%rsp,$j,8),$lo0
216	adc	\$0,%rdx
217	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
218	mov	%rdx,$hi1
219
220	xor	%rdx,%rdx
221	add	$hi0,$hi1
222	adc	\$0,%rdx
223	add	$lo0,$hi1		# pull upmost overflow bit
224	adc	\$0,%rdx
225	mov	$hi1,-8(%rsp,$num,8)
226	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
227
228	lea	1($i),$i		# i++
229	cmp	$num,$i
230	jl	.Louter
231
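	##############################################################
	# At this point tp[0..num] holds the Montgomery product, which
	# is known to be less than twice the modulus.  Subtract the
	# modulus into rp and use the final borrow to select, by masking
	# pointers instead of branching, whether rp (no borrow) or the
	# original tp (borrow) is copied out; the copy loop also wipes
	# the temporary vector on the stack.
	#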
232	xor	$i,$i			# i=0 and clear CF!
233	mov	(%rsp),%rax		# tp[0]
234	lea	(%rsp),$ap		# borrow ap for tp
235	mov	$num,$j			# j=num
236	jmp	.Lsub
237.align	16
238.Lsub:	sbb	($np,$i,8),%rax
239	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
240	mov	8($ap,$i,8),%rax	# tp[i+1]
241	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
243	jnz	.Lsub
244
245	sbb	\$0,%rax		# handle upmost overflow bit
246	xor	$i,$i
247	and	%rax,$ap
248	not	%rax
249	mov	$rp,$np
250	and	%rax,$np
251	mov	$num,$j			# j=num
252	or	$np,$ap			# ap=borrow?tp:rp
253.align	16
254.Lcopy:					# copy or in-place refresh
255	mov	($ap,$i,8),%rax
256	mov	$i,(%rsp,$i,8)		# zap temporary vector
257	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
258	lea	1($i),$i
259	sub	\$1,$j
260	jnz	.Lcopy
261
262	mov	8(%rsp,$num,8),%rsi	# restore %rsp
263	mov	\$1,%rax
264	mov	(%rsi),%r15
265	mov	8(%rsi),%r14
266	mov	16(%rsi),%r13
267	mov	24(%rsi),%r12
268	mov	32(%rsi),%rbp
269	mov	40(%rsi),%rbx
270	lea	48(%rsi),%rsp
271.Lmul_epilogue:
272	ret
273.size	bn_mul_mont,.-bn_mul_mont
274___
275{{{
276my @A=("%r10","%r11");
277my @N=("%r13","%rdi");
278$code.=<<___;
279.type	bn_mul4x_mont,\@function,6
280.align	16
281bn_mul4x_mont:
282.Lmul4x_enter:
283	push	%rbx
284	push	%rbp
285	push	%r12
286	push	%r13
287	push	%r14
288	push	%r15
289
290	mov	${num}d,${num}d
291	lea	4($num),%r10
292	mov	%rsp,%r11
293	neg	%r10
294	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
295	and	\$-1024,%rsp		# minimize TLB usage
296
297	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
298.Lmul4x_body:
299	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
300	mov	%rdx,%r12		# reassign $bp
301___
302		$bp="%r12";
303$code.=<<___;
304	mov	($n0),$n0		# pull n0[0] value
305	mov	($bp),$m0		# m0=bp[0]
306	mov	($ap),%rax
307
308	xor	$i,$i			# i=0
309	xor	$j,$j			# j=0
310
311	mov	$n0,$m1
312	mulq	$m0			# ap[0]*bp[0]
313	mov	%rax,$A[0]
314	mov	($np),%rax
315
316	imulq	$A[0],$m1		# "tp[0]"*n0
317	mov	%rdx,$A[1]
318
319	mulq	$m1			# np[0]*m1
320	add	%rax,$A[0]		# discarded
321	mov	8($ap),%rax
322	adc	\$0,%rdx
323	mov	%rdx,$N[1]
324
325	mulq	$m0
326	add	%rax,$A[1]
327	mov	8($np),%rax
328	adc	\$0,%rdx
329	mov	%rdx,$A[0]
330
331	mulq	$m1
332	add	%rax,$N[1]
333	mov	16($ap),%rax
334	adc	\$0,%rdx
335	add	$A[1],$N[1]
	lea	4($j),$j		# j+=4
337	adc	\$0,%rdx
338	mov	$N[1],(%rsp)
339	mov	%rdx,$N[0]
340	jmp	.L1st4x
341.align	16
342.L1st4x:
343	mulq	$m0			# ap[j]*bp[0]
344	add	%rax,$A[0]
345	mov	-16($np,$j,8),%rax
346	adc	\$0,%rdx
347	mov	%rdx,$A[1]
348
349	mulq	$m1			# np[j]*m1
350	add	%rax,$N[0]
351	mov	-8($ap,$j,8),%rax
352	adc	\$0,%rdx
353	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
354	adc	\$0,%rdx
355	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
356	mov	%rdx,$N[1]
357
358	mulq	$m0			# ap[j]*bp[0]
359	add	%rax,$A[1]
360	mov	-8($np,$j,8),%rax
361	adc	\$0,%rdx
362	mov	%rdx,$A[0]
363
364	mulq	$m1			# np[j]*m1
365	add	%rax,$N[1]
366	mov	($ap,$j,8),%rax
367	adc	\$0,%rdx
368	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
369	adc	\$0,%rdx
370	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
371	mov	%rdx,$N[0]
372
373	mulq	$m0			# ap[j]*bp[0]
374	add	%rax,$A[0]
375	mov	($np,$j,8),%rax
376	adc	\$0,%rdx
377	mov	%rdx,$A[1]
378
379	mulq	$m1			# np[j]*m1
380	add	%rax,$N[0]
381	mov	8($ap,$j,8),%rax
382	adc	\$0,%rdx
383	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
384	adc	\$0,%rdx
385	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
386	mov	%rdx,$N[1]
387
388	mulq	$m0			# ap[j]*bp[0]
389	add	%rax,$A[1]
390	mov	8($np,$j,8),%rax
391	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
393	mov	%rdx,$A[0]
394
395	mulq	$m1			# np[j]*m1
396	add	%rax,$N[1]
397	mov	-16($ap,$j,8),%rax
398	adc	\$0,%rdx
399	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
400	adc	\$0,%rdx
401	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
402	mov	%rdx,$N[0]
403	cmp	$num,$j
404	jl	.L1st4x
405
406	mulq	$m0			# ap[j]*bp[0]
407	add	%rax,$A[0]
408	mov	-16($np,$j,8),%rax
409	adc	\$0,%rdx
410	mov	%rdx,$A[1]
411
412	mulq	$m1			# np[j]*m1
413	add	%rax,$N[0]
414	mov	-8($ap,$j,8),%rax
415	adc	\$0,%rdx
416	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
417	adc	\$0,%rdx
418	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
419	mov	%rdx,$N[1]
420
421	mulq	$m0			# ap[j]*bp[0]
422	add	%rax,$A[1]
423	mov	-8($np,$j,8),%rax
424	adc	\$0,%rdx
425	mov	%rdx,$A[0]
426
427	mulq	$m1			# np[j]*m1
428	add	%rax,$N[1]
429	mov	($ap),%rax		# ap[0]
430	adc	\$0,%rdx
431	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
432	adc	\$0,%rdx
433	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
434	mov	%rdx,$N[0]
435
436	xor	$N[1],$N[1]
437	add	$A[0],$N[0]
438	adc	\$0,$N[1]
439	mov	$N[0],-8(%rsp,$j,8)
440	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
441
442	lea	1($i),$i		# i++
443.align	4
444.Louter4x:
445	mov	($bp,$i,8),$m0		# m0=bp[i]
446	xor	$j,$j			# j=0
447	mov	(%rsp),$A[0]
448	mov	$n0,$m1
449	mulq	$m0			# ap[0]*bp[i]
450	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
451	mov	($np),%rax
452	adc	\$0,%rdx
453
454	imulq	$A[0],$m1		# tp[0]*n0
455	mov	%rdx,$A[1]
456
457	mulq	$m1			# np[0]*m1
458	add	%rax,$A[0]		# "$N[0]", discarded
459	mov	8($ap),%rax
460	adc	\$0,%rdx
461	mov	%rdx,$N[1]
462
463	mulq	$m0			# ap[j]*bp[i]
464	add	%rax,$A[1]
465	mov	8($np),%rax
466	adc	\$0,%rdx
467	add	8(%rsp),$A[1]		# +tp[1]
468	adc	\$0,%rdx
469	mov	%rdx,$A[0]
470
471	mulq	$m1			# np[j]*m1
472	add	%rax,$N[1]
473	mov	16($ap),%rax
474	adc	\$0,%rdx
475	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=4
477	adc	\$0,%rdx
478	mov	$N[1],(%rsp)		# tp[j-1]
479	mov	%rdx,$N[0]
480	jmp	.Linner4x
481.align	16
482.Linner4x:
483	mulq	$m0			# ap[j]*bp[i]
484	add	%rax,$A[0]
485	mov	-16($np,$j,8),%rax
486	adc	\$0,%rdx
487	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
488	adc	\$0,%rdx
489	mov	%rdx,$A[1]
490
491	mulq	$m1			# np[j]*m1
492	add	%rax,$N[0]
493	mov	-8($ap,$j,8),%rax
494	adc	\$0,%rdx
495	add	$A[0],$N[0]
496	adc	\$0,%rdx
497	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
498	mov	%rdx,$N[1]
499
500	mulq	$m0			# ap[j]*bp[i]
501	add	%rax,$A[1]
502	mov	-8($np,$j,8),%rax
503	adc	\$0,%rdx
504	add	-8(%rsp,$j,8),$A[1]
505	adc	\$0,%rdx
506	mov	%rdx,$A[0]
507
508	mulq	$m1			# np[j]*m1
509	add	%rax,$N[1]
510	mov	($ap,$j,8),%rax
511	adc	\$0,%rdx
512	add	$A[1],$N[1]
513	adc	\$0,%rdx
514	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
515	mov	%rdx,$N[0]
516
517	mulq	$m0			# ap[j]*bp[i]
518	add	%rax,$A[0]
519	mov	($np,$j,8),%rax
520	adc	\$0,%rdx
521	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
522	adc	\$0,%rdx
523	mov	%rdx,$A[1]
524
525	mulq	$m1			# np[j]*m1
526	add	%rax,$N[0]
527	mov	8($ap,$j,8),%rax
528	adc	\$0,%rdx
529	add	$A[0],$N[0]
530	adc	\$0,%rdx
531	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
532	mov	%rdx,$N[1]
533
534	mulq	$m0			# ap[j]*bp[i]
535	add	%rax,$A[1]
536	mov	8($np,$j,8),%rax
537	adc	\$0,%rdx
538	add	8(%rsp,$j,8),$A[1]
539	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
541	mov	%rdx,$A[0]
542
543	mulq	$m1			# np[j]*m1
544	add	%rax,$N[1]
545	mov	-16($ap,$j,8),%rax
546	adc	\$0,%rdx
547	add	$A[1],$N[1]
548	adc	\$0,%rdx
549	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
550	mov	%rdx,$N[0]
551	cmp	$num,$j
552	jl	.Linner4x
553
554	mulq	$m0			# ap[j]*bp[i]
555	add	%rax,$A[0]
556	mov	-16($np,$j,8),%rax
557	adc	\$0,%rdx
558	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
559	adc	\$0,%rdx
560	mov	%rdx,$A[1]
561
562	mulq	$m1			# np[j]*m1
563	add	%rax,$N[0]
564	mov	-8($ap,$j,8),%rax
565	adc	\$0,%rdx
566	add	$A[0],$N[0]
567	adc	\$0,%rdx
568	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
569	mov	%rdx,$N[1]
570
571	mulq	$m0			# ap[j]*bp[i]
572	add	%rax,$A[1]
573	mov	-8($np,$j,8),%rax
574	adc	\$0,%rdx
575	add	-8(%rsp,$j,8),$A[1]
576	adc	\$0,%rdx
577	lea	1($i),$i		# i++
578	mov	%rdx,$A[0]
579
580	mulq	$m1			# np[j]*m1
581	add	%rax,$N[1]
582	mov	($ap),%rax		# ap[0]
583	adc	\$0,%rdx
584	add	$A[1],$N[1]
585	adc	\$0,%rdx
586	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
587	mov	%rdx,$N[0]
588
589	xor	$N[1],$N[1]
590	add	$A[0],$N[0]
591	adc	\$0,$N[1]
592	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
593	adc	\$0,$N[1]
594	mov	$N[0],-8(%rsp,$j,8)
595	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
596
597	cmp	$num,$i
598	jl	.Louter4x
599___
600{
601my @ri=("%rax","%rdx",$m0,$m1);
602$code.=<<___;
603	mov	16(%rsp,$num,8),$rp	# restore $rp
604	mov	0(%rsp),@ri[0]		# tp[0]
605	pxor	%xmm0,%xmm0
606	mov	8(%rsp),@ri[1]		# tp[1]
607	shr	\$2,$num		# num/=4
608	lea	(%rsp),$ap		# borrow ap for tp
609	xor	$i,$i			# i=0 and clear CF!
610
611	sub	0($np),@ri[0]
612	mov	16($ap),@ri[2]		# tp[2]
613	mov	24($ap),@ri[3]		# tp[3]
614	sbb	8($np),@ri[1]
615	lea	-1($num),$j		# j=num/4-1
616	jmp	.Lsub4x
617.align	16
618.Lsub4x:
619	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
620	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
621	sbb	16($np,$i,8),@ri[2]
622	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
623	mov	40($ap,$i,8),@ri[1]
624	sbb	24($np,$i,8),@ri[3]
625	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
626	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
627	sbb	32($np,$i,8),@ri[0]
628	mov	48($ap,$i,8),@ri[2]
629	mov	56($ap,$i,8),@ri[3]
630	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i+=4
	dec	$j			# doesn't affect CF!
633	jnz	.Lsub4x
634
635	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
636	mov	32($ap,$i,8),@ri[0]	# load overflow bit
637	sbb	16($np,$i,8),@ri[2]
638	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
639	sbb	24($np,$i,8),@ri[3]
640	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
641
642	sbb	\$0,@ri[0]		# handle upmost overflow bit
643	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
644	xor	$i,$i			# i=0
645	and	@ri[0],$ap
646	not	@ri[0]
647	mov	$rp,$np
648	and	@ri[0],$np
649	lea	-1($num),$j
650	or	$np,$ap			# ap=borrow?tp:rp
651
652	movdqu	($ap),%xmm1
653	movdqa	%xmm0,(%rsp)
654	movdqu	%xmm1,($rp)
655	jmp	.Lcopy4x
656.align	16
657.Lcopy4x:					# copy or in-place refresh
658	movdqu	16($ap,$i),%xmm2
659	movdqu	32($ap,$i),%xmm1
660	movdqa	%xmm0,16(%rsp,$i)
661	movdqu	%xmm2,16($rp,$i)
662	movdqa	%xmm0,32(%rsp,$i)
663	movdqu	%xmm1,32($rp,$i)
664	lea	32($i),$i
665	dec	$j
666	jnz	.Lcopy4x
667
668	shl	\$2,$num
669	movdqu	16($ap,$i),%xmm2
670	movdqa	%xmm0,16(%rsp,$i)
671	movdqu	%xmm2,16($rp,$i)
672___
673}
674$code.=<<___;
675	mov	8(%rsp,$num,8),%rsi	# restore %rsp
676	mov	\$1,%rax
677	mov	(%rsi),%r15
678	mov	8(%rsi),%r14
679	mov	16(%rsi),%r13
680	mov	24(%rsi),%r12
681	mov	32(%rsi),%rbp
682	mov	40(%rsi),%rbx
683	lea	48(%rsi),%rsp
684.Lmul4x_epilogue:
685	ret
686.size	bn_mul4x_mont,.-bn_mul4x_mont
687___
688}}}
689{{{
690######################################################################
691# void bn_sqr4x_mont(
692my $rptr="%rdi";	# const BN_ULONG *rptr,
693my $aptr="%rsi";	# const BN_ULONG *aptr,
694my $bptr="%rdx";	# not used
695my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0,
my $num ="%r9";		# int num); has to be divisible by 4 and
698			# not less than 8
699
700my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
701my @A0=("%r10","%r11");
702my @A1=("%r12","%r13");
703my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
704
705$code.=<<___;
706.type	bn_sqr4x_mont,\@function,6
707.align	16
708bn_sqr4x_mont:
709.Lsqr4x_enter:
710	push	%rbx
711	push	%rbp
712	push	%r12
713	push	%r13
714	push	%r14
715	push	%r15
716
717	shl	\$3,${num}d		# convert $num to bytes
718	xor	%r10,%r10
719	mov	%rsp,%r11		# put aside %rsp
720	sub	$num,%r10		# -$num
721	mov	($n0),$n0		# *n0
722	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
723	and	\$-1024,%rsp		# minimize TLB usage
724	##############################################################
725	# Stack layout
726	#
727	# +0	saved $num, used in reduction section
728	# +8	&t[2*$num], used in reduction section
729	# +32	saved $rptr
730	# +40	saved $nptr
731	# +48	saved *n0
732	# +56	saved %rsp
733	# +64	t[2*$num]
734	#
735	mov	$rptr,32(%rsp)		# save $rptr
736	mov	$nptr,40(%rsp)
737	mov	$n0,  48(%rsp)
738	mov	%r11, 56(%rsp)		# save original %rsp
739.Lsqr4x_body:
740	##############################################################
741	# Squaring part:
742	#
743	# a) multiply-n-add everything but a[i]*a[i];
744	# b) shift result of a) by 1 to the left and accumulate
745	#    a[i]*a[i] products;
746	#
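	# In other words, with B=2^64 and a = sum over i of a_i*B^i:
	#
	#   a^2 = sum over i of (a_i)^2*B^(2*i)
	#       + 2 * sum over i<j of a_i*a_j*B^(i+j)
	#
	# Step a) accumulates the cross products a_i*a_j, i<j, into t[],
	# and step b) doubles t[] by shifting it left one bit while
	# folding in the squares a_i*a_i, one mul per word, in the
	# .Lsqr4x_shift_n_add loop below.
	#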
747	lea	32(%r10),$i		# $i=-($num-32)
748	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
749
750	mov	$num,$j			# $j=$num
751
752					# comments apply to $num==8 case
753	mov	-32($aptr,$i),$a0	# a[0]
754	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
755	mov	-24($aptr,$i),%rax	# a[1]
756	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
757	mov	-16($aptr,$i),$ai	# a[2]
758	mov	%rax,$a1
759
760	mul	$a0			# a[1]*a[0]
761	mov	%rax,$A0[0]		# a[1]*a[0]
762	 mov	$ai,%rax		# a[2]
763	mov	%rdx,$A0[1]
764	mov	$A0[0],-24($tptr,$i)	# t[1]
765
766	xor	$A0[0],$A0[0]
767	mul	$a0			# a[2]*a[0]
768	add	%rax,$A0[1]
769	 mov	$ai,%rax
770	adc	%rdx,$A0[0]
771	mov	$A0[1],-16($tptr,$i)	# t[2]
772
773	lea	-16($i),$j		# j=-16
774
775
776	 mov	8($aptr,$j),$ai		# a[3]
777	mul	$a1			# a[2]*a[1]
778	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
779	 mov	$ai,%rax
780	mov	%rdx,$A1[1]
781
782	xor	$A0[1],$A0[1]
783	add	$A1[0],$A0[0]
784	 lea	16($j),$j
785	adc	\$0,$A0[1]
786	mul	$a0			# a[3]*a[0]
787	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
788	 mov	$ai,%rax
789	adc	%rdx,$A0[1]
790	mov	$A0[0],-8($tptr,$j)	# t[3]
791	jmp	.Lsqr4x_1st
792
793.align	16
794.Lsqr4x_1st:
795	 mov	($aptr,$j),$ai		# a[4]
796	xor	$A1[0],$A1[0]
797	mul	$a1			# a[3]*a[1]
798	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
799	 mov	$ai,%rax
800	adc	%rdx,$A1[0]
801
802	xor	$A0[0],$A0[0]
803	add	$A1[1],$A0[1]
804	adc	\$0,$A0[0]
805	mul	$a0			# a[4]*a[0]
806	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
807	 mov	$ai,%rax		# a[3]
808	adc	%rdx,$A0[0]
809	mov	$A0[1],($tptr,$j)	# t[4]
810
811
812	 mov	8($aptr,$j),$ai		# a[5]
813	xor	$A1[1],$A1[1]
814	mul	$a1			# a[4]*a[3]
815	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
816	 mov	$ai,%rax
817	adc	%rdx,$A1[1]
818
819	xor	$A0[1],$A0[1]
820	add	$A1[0],$A0[0]
821	adc	\$0,$A0[1]
822	mul	$a0			# a[5]*a[2]
823	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
824	 mov	$ai,%rax
825	adc	%rdx,$A0[1]
826	mov	$A0[0],8($tptr,$j)	# t[5]
827
828	 mov	16($aptr,$j),$ai	# a[6]
829	xor	$A1[0],$A1[0]
830	mul	$a1			# a[5]*a[3]
831	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
832	 mov	$ai,%rax
833	adc	%rdx,$A1[0]
834
835	xor	$A0[0],$A0[0]
836	add	$A1[1],$A0[1]
837	adc	\$0,$A0[0]
838	mul	$a0			# a[6]*a[2]
839	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
840	 mov	$ai,%rax		# a[3]
841	adc	%rdx,$A0[0]
842	mov	$A0[1],16($tptr,$j)	# t[6]
843
844
845	 mov	24($aptr,$j),$ai	# a[7]
846	xor	$A1[1],$A1[1]
847	mul	$a1			# a[6]*a[5]
848	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
849	 mov	$ai,%rax
850	adc	%rdx,$A1[1]
851
852	xor	$A0[1],$A0[1]
853	add	$A1[0],$A0[0]
854	 lea	32($j),$j
855	adc	\$0,$A0[1]
856	mul	$a0			# a[7]*a[4]
857	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
858	 mov	$ai,%rax
859	adc	%rdx,$A0[1]
860	mov	$A0[0],-8($tptr,$j)	# t[7]
861
862	cmp	\$0,$j
863	jne	.Lsqr4x_1st
864
865	xor	$A1[0],$A1[0]
866	add	$A0[1],$A1[1]
867	adc	\$0,$A1[0]
868	mul	$a1			# a[7]*a[5]
869	add	%rax,$A1[1]
870	adc	%rdx,$A1[0]
871
872	mov	$A1[1],($tptr)		# t[8]
873	lea	16($i),$i
874	mov	$A1[0],8($tptr)		# t[9]
875	jmp	.Lsqr4x_outer
876
877.align	16
878.Lsqr4x_outer:				# comments apply to $num==6 case
879	mov	-32($aptr,$i),$a0	# a[0]
880	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
881	mov	-24($aptr,$i),%rax	# a[1]
882	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
883	mov	-16($aptr,$i),$ai	# a[2]
884	mov	%rax,$a1
885
886	mov	-24($tptr,$i),$A0[0]	# t[1]
887	xor	$A0[1],$A0[1]
888	mul	$a0			# a[1]*a[0]
889	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
890	 mov	$ai,%rax		# a[2]
891	adc	%rdx,$A0[1]
892	mov	$A0[0],-24($tptr,$i)	# t[1]
893
894	xor	$A0[0],$A0[0]
895	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
896	adc	\$0,$A0[0]
897	mul	$a0			# a[2]*a[0]
898	add	%rax,$A0[1]
899	 mov	$ai,%rax
900	adc	%rdx,$A0[0]
901	mov	$A0[1],-16($tptr,$i)	# t[2]
902
903	lea	-16($i),$j		# j=-16
904	xor	$A1[0],$A1[0]
905
906
907	 mov	8($aptr,$j),$ai		# a[3]
908	xor	$A1[1],$A1[1]
909	add	8($tptr,$j),$A1[0]
910	adc	\$0,$A1[1]
911	mul	$a1			# a[2]*a[1]
912	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
913	 mov	$ai,%rax
914	adc	%rdx,$A1[1]
915
916	xor	$A0[1],$A0[1]
917	add	$A1[0],$A0[0]
918	adc	\$0,$A0[1]
919	mul	$a0			# a[3]*a[0]
920	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
921	 mov	$ai,%rax
922	adc	%rdx,$A0[1]
923	mov	$A0[0],8($tptr,$j)	# t[3]
924
925	lea	16($j),$j
926	jmp	.Lsqr4x_inner
927
928.align	16
929.Lsqr4x_inner:
930	 mov	($aptr,$j),$ai		# a[4]
931	xor	$A1[0],$A1[0]
932	add	($tptr,$j),$A1[1]
933	adc	\$0,$A1[0]
934	mul	$a1			# a[3]*a[1]
935	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
936	 mov	$ai,%rax
937	adc	%rdx,$A1[0]
938
939	xor	$A0[0],$A0[0]
940	add	$A1[1],$A0[1]
941	adc	\$0,$A0[0]
942	mul	$a0			# a[4]*a[0]
943	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
944	 mov	$ai,%rax		# a[3]
945	adc	%rdx,$A0[0]
946	mov	$A0[1],($tptr,$j)	# t[4]
947
948	 mov	8($aptr,$j),$ai		# a[5]
949	xor	$A1[1],$A1[1]
950	add	8($tptr,$j),$A1[0]
951	adc	\$0,$A1[1]
952	mul	$a1			# a[4]*a[3]
953	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
954	 mov	$ai,%rax
955	adc	%rdx,$A1[1]
956
957	xor	$A0[1],$A0[1]
958	add	$A1[0],$A0[0]
959	lea	16($j),$j		# j++
960	adc	\$0,$A0[1]
961	mul	$a0			# a[5]*a[2]
962	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
963	 mov	$ai,%rax
964	adc	%rdx,$A0[1]
965	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
966
967	cmp	\$0,$j
968	jne	.Lsqr4x_inner
969
970	xor	$A1[0],$A1[0]
971	add	$A0[1],$A1[1]
972	adc	\$0,$A1[0]
973	mul	$a1			# a[5]*a[3]
974	add	%rax,$A1[1]
975	adc	%rdx,$A1[0]
976
977	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
978	mov	$A1[0],8($tptr)		# t[7], "preloaded t[3]" below
979
980	add	\$16,$i
981	jnz	.Lsqr4x_outer
982
983					# comments apply to $num==4 case
984	mov	-32($aptr),$a0		# a[0]
985	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
986	mov	-24($aptr),%rax		# a[1]
987	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
988	mov	-16($aptr),$ai		# a[2]
989	mov	%rax,$a1
990
991	xor	$A0[1],$A0[1]
992	mul	$a0			# a[1]*a[0]
993	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
994	 mov	$ai,%rax		# a[2]
995	adc	%rdx,$A0[1]
996	mov	$A0[0],-24($tptr)	# t[1]
997
998	xor	$A0[0],$A0[0]
999	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
1000	adc	\$0,$A0[0]
1001	mul	$a0			# a[2]*a[0]
1002	add	%rax,$A0[1]
1003	 mov	$ai,%rax
1004	adc	%rdx,$A0[0]
1005	mov	$A0[1],-16($tptr)	# t[2]
1006
1007	 mov	-8($aptr),$ai		# a[3]
1008	mul	$a1			# a[2]*a[1]
1009	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1010	 mov	$ai,%rax
1011	adc	\$0,%rdx
1012
1013	xor	$A0[1],$A0[1]
1014	add	$A1[0],$A0[0]
1015	 mov	%rdx,$A1[1]
1016	adc	\$0,$A0[1]
1017	mul	$a0			# a[3]*a[0]
1018	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1019	 mov	$ai,%rax
1020	adc	%rdx,$A0[1]
1021	mov	$A0[0],-8($tptr)	# t[3]
1022
1023	xor	$A1[0],$A1[0]
1024	add	$A0[1],$A1[1]
1025	adc	\$0,$A1[0]
1026	mul	$a1			# a[3]*a[1]
1027	add	%rax,$A1[1]
1028	 mov	-16($aptr),%rax		# a[2]
1029	adc	%rdx,$A1[0]
1030
1031	mov	$A1[1],($tptr)		# t[4]
1032	mov	$A1[0],8($tptr)		# t[5]
1033
1034	mul	$ai			# a[2]*a[3]
1035___
1036{
1037my ($shift,$carry)=($a0,$a1);
1038my @S=(@A1,$ai,$n0);
1039$code.=<<___;
1040	 add	\$16,$i
1041	 xor	$shift,$shift
1042	 sub	$num,$i			# $i=16-$num
1043	 xor	$carry,$carry
1044
1045	add	$A1[0],%rax		# t[5]
1046	adc	\$0,%rdx
1047	mov	%rax,8($tptr)		# t[5]
1048	mov	%rdx,16($tptr)		# t[6]
1049	mov	$carry,24($tptr)	# t[7]
1050
1051	 mov	-16($aptr,$i),%rax	# a[0]
1052	lea	64(%rsp,$num,2),$tptr
1053	 xor	$A0[0],$A0[0]		# t[0]
1054	 mov	-24($tptr,$i,2),$A0[1]	# t[1]
1055
1056	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1057	shr	\$63,$A0[0]
1058	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1059	shr	\$63,$A0[1]
1060	or	$A0[0],$S[1]		# | t[2*i]>>63
1061	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1062	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1063	mul	%rax			# a[i]*a[i]
1064	neg	$carry			# mov $carry,cf
1065	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1066	adc	%rax,$S[0]
1067	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1068	mov	$S[0],-32($tptr,$i,2)
1069	adc	%rdx,$S[1]
1070
1071	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1072	 mov	$S[1],-24($tptr,$i,2)
1073	 sbb	$carry,$carry		# mov cf,$carry
1074	shr	\$63,$A0[0]
1075	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1076	shr	\$63,$A0[1]
1077	or	$A0[0],$S[3]		# | t[2*i]>>63
1078	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1079	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1080	mul	%rax			# a[i]*a[i]
1081	neg	$carry			# mov $carry,cf
1082	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1083	adc	%rax,$S[2]
1084	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1085	mov	$S[2],-16($tptr,$i,2)
1086	adc	%rdx,$S[3]
1087	lea	16($i),$i
1088	mov	$S[3],-40($tptr,$i,2)
1089	sbb	$carry,$carry		# mov cf,$carry
1090	jmp	.Lsqr4x_shift_n_add
1091
1092.align	16
1093.Lsqr4x_shift_n_add:
1094	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1095	shr	\$63,$A0[0]
1096	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1097	shr	\$63,$A0[1]
1098	or	$A0[0],$S[1]		# | t[2*i]>>63
1099	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1100	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1101	mul	%rax			# a[i]*a[i]
1102	neg	$carry			# mov $carry,cf
1103	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1104	adc	%rax,$S[0]
1105	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1106	mov	$S[0],-32($tptr,$i,2)
1107	adc	%rdx,$S[1]
1108
1109	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1110	 mov	$S[1],-24($tptr,$i,2)
1111	 sbb	$carry,$carry		# mov cf,$carry
1112	shr	\$63,$A0[0]
1113	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1114	shr	\$63,$A0[1]
1115	or	$A0[0],$S[3]		# | t[2*i]>>63
1116	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1117	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1118	mul	%rax			# a[i]*a[i]
1119	neg	$carry			# mov $carry,cf
1120	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1121	adc	%rax,$S[2]
1122	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1123	mov	$S[2],-16($tptr,$i,2)
1124	adc	%rdx,$S[3]
1125
1126	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1127	 mov	$S[3],-8($tptr,$i,2)
1128	 sbb	$carry,$carry		# mov cf,$carry
1129	shr	\$63,$A0[0]
1130	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1131	shr	\$63,$A0[1]
1132	or	$A0[0],$S[1]		# | t[2*i]>>63
1133	 mov	16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1134	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1135	mul	%rax			# a[i]*a[i]
1136	neg	$carry			# mov $carry,cf
1137	 mov	24($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1138	adc	%rax,$S[0]
1139	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
1140	mov	$S[0],0($tptr,$i,2)
1141	adc	%rdx,$S[1]
1142
1143	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1144	 mov	$S[1],8($tptr,$i,2)
1145	 sbb	$carry,$carry		# mov cf,$carry
1146	shr	\$63,$A0[0]
1147	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1148	shr	\$63,$A0[1]
1149	or	$A0[0],$S[3]		# | t[2*i]>>63
1150	 mov	32($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1151	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1152	mul	%rax			# a[i]*a[i]
1153	neg	$carry			# mov $carry,cf
1154	 mov	40($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1155	adc	%rax,$S[2]
1156	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
1157	mov	$S[2],16($tptr,$i,2)
1158	adc	%rdx,$S[3]
1159	mov	$S[3],24($tptr,$i,2)
1160	sbb	$carry,$carry		# mov cf,$carry
1161	add	\$32,$i
1162	jnz	.Lsqr4x_shift_n_add
1163
1164	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1165	shr	\$63,$A0[0]
1166	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1167	shr	\$63,$A0[1]
1168	or	$A0[0],$S[1]		# | t[2*i]>>63
1169	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1170	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1171	mul	%rax			# a[i]*a[i]
1172	neg	$carry			# mov $carry,cf
1173	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1174	adc	%rax,$S[0]
1175	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
1176	mov	$S[0],-32($tptr)
1177	adc	%rdx,$S[1]
1178
1179	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
1180	 mov	$S[1],-24($tptr)
1181	 sbb	$carry,$carry		# mov cf,$carry
1182	shr	\$63,$A0[0]
1183	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1184	shr	\$63,$A0[1]
1185	or	$A0[0],$S[3]		# | t[2*i]>>63
1186	mul	%rax			# a[i]*a[i]
1187	neg	$carry			# mov $carry,cf
1188	adc	%rax,$S[2]
1189	adc	%rdx,$S[3]
1190	mov	$S[2],-16($tptr)
1191	mov	$S[3],-8($tptr)
1192___
1193}
1194##############################################################
1195# Montgomery reduction part, "word-by-word" algorithm.
1196#
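# As a rough C-style model of this reduction -- again for illustration
# only, assuming a 64-bit BN_ULONG, an unsigned __int128 accumulator,
# n0[0] = -1/np[0] mod 2^64, and t[] holding the 2*num-word square
# computed above:
#
#	BN_ULONG topbit = 0;		/* carry beyond t[2*num-1] */
#	for (int i=0; i<num; i++) {
#		BN_ULONG m = t[i]*n0;	/* mod 2^64 */
#		BN_ULONG carry = 0;
#		unsigned __int128 v;
#		for (int j=0; j<num; j++) {
#			v = (unsigned __int128)np[j]*m + t[i+j] + carry;
#			t[i+j] = (BN_ULONG)v;	/* t[i] becomes zero */
#			carry = (BN_ULONG)(v>>64);
#		}
#		v = (unsigned __int128)t[i+num] + carry + topbit;
#		t[i+num] = (BN_ULONG)v;
#		topbit   = (BN_ULONG)(v>>64);
#	}
#	/* the reduced value is t[num..2*num-1] plus topbit on top; it
#	 * may still exceed the modulus, hence the conditional
#	 * subtraction in the post-condition section below */
#
# The assembly below is a heavily unrolled and modulo-scheduled version
# of this loop, keeping topbit in a register and reusing the upper half
# of t[] for the result.
#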
1197{
1198my ($topbit,$nptr)=("%rbp",$aptr);
1199my ($m0,$m1)=($a0,$a1);
1200my @Ni=("%rbx","%r9");
1201$code.=<<___;
1202	mov	40(%rsp),$nptr		# restore $nptr
1203	mov	48(%rsp),$n0		# restore *n0
1204	xor	$j,$j
1205	mov	$num,0(%rsp)		# save $num
1206	sub	$num,$j			# $j=-$num
1207	 mov	64(%rsp),$A0[0]		# t[0]		# modsched #
1208	 mov	$n0,$m0			#		# modsched #
1209	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
1210	lea	64(%rsp,$num),$tptr	# end of t[] window
1211	mov	%rax,8(%rsp)		# save end of t[] buffer
1212	lea	($nptr,$num),$nptr	# end of n[] buffer
1213	xor	$topbit,$topbit		# $topbit=0
1214
1215	mov	0($nptr,$j),%rax	# n[0]		# modsched #
1216	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
1217	 imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
1218	 mov	%rax,$Ni[0]		#		# modsched #
1219	jmp	.Lsqr4x_mont_outer
1220
1221.align	16
1222.Lsqr4x_mont_outer:
1223	xor	$A0[1],$A0[1]
1224	mul	$m0			# n[0]*m0
1225	add	%rax,$A0[0]		# n[0]*m0+t[0]
1226	 mov	$Ni[1],%rax
1227	adc	%rdx,$A0[1]
1228	mov	$n0,$m1
1229
1230	xor	$A0[0],$A0[0]
1231	add	8($tptr,$j),$A0[1]
1232	adc	\$0,$A0[0]
1233	mul	$m0			# n[1]*m0
1234	add	%rax,$A0[1]		# n[1]*m0+t[1]
1235	 mov	$Ni[0],%rax
1236	adc	%rdx,$A0[0]
1237
1238	imulq	$A0[1],$m1
1239
1240	mov	16($nptr,$j),$Ni[0]	# n[2]
1241	xor	$A1[1],$A1[1]
1242	add	$A0[1],$A1[0]
1243	adc	\$0,$A1[1]
1244	mul	$m1			# n[0]*m1
1245	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
1246	 mov	$Ni[0],%rax
1247	adc	%rdx,$A1[1]
1248	mov	$A1[0],8($tptr,$j)	# "t[1]"
1249
1250	xor	$A0[1],$A0[1]
1251	add	16($tptr,$j),$A0[0]
1252	adc	\$0,$A0[1]
1253	mul	$m0			# n[2]*m0
1254	add	%rax,$A0[0]		# n[2]*m0+t[2]
1255	 mov	$Ni[1],%rax
1256	adc	%rdx,$A0[1]
1257
1258	mov	24($nptr,$j),$Ni[1]	# n[3]
1259	xor	$A1[0],$A1[0]
1260	add	$A0[0],$A1[1]
1261	adc	\$0,$A1[0]
1262	mul	$m1			# n[1]*m1
1263	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
1264	 mov	$Ni[1],%rax
1265	adc	%rdx,$A1[0]
1266	mov	$A1[1],16($tptr,$j)	# "t[2]"
1267
1268	xor	$A0[0],$A0[0]
1269	add	24($tptr,$j),$A0[1]
1270	lea	32($j),$j
1271	adc	\$0,$A0[0]
1272	mul	$m0			# n[3]*m0
1273	add	%rax,$A0[1]		# n[3]*m0+t[3]
1274	 mov	$Ni[0],%rax
1275	adc	%rdx,$A0[0]
1276	jmp	.Lsqr4x_mont_inner
1277
1278.align	16
1279.Lsqr4x_mont_inner:
1280	mov	($nptr,$j),$Ni[0]	# n[4]
1281	xor	$A1[1],$A1[1]
1282	add	$A0[1],$A1[0]
1283	adc	\$0,$A1[1]
1284	mul	$m1			# n[2]*m1
1285	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
1286	 mov	$Ni[0],%rax
1287	adc	%rdx,$A1[1]
1288	mov	$A1[0],-8($tptr,$j)	# "t[3]"
1289
1290	xor	$A0[1],$A0[1]
1291	add	($tptr,$j),$A0[0]
1292	adc	\$0,$A0[1]
1293	mul	$m0			# n[4]*m0
1294	add	%rax,$A0[0]		# n[4]*m0+t[4]
1295	 mov	$Ni[1],%rax
1296	adc	%rdx,$A0[1]
1297
1298	mov	8($nptr,$j),$Ni[1]	# n[5]
1299	xor	$A1[0],$A1[0]
1300	add	$A0[0],$A1[1]
1301	adc	\$0,$A1[0]
1302	mul	$m1			# n[3]*m1
1303	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
1304	 mov	$Ni[1],%rax
1305	adc	%rdx,$A1[0]
1306	mov	$A1[1],($tptr,$j)	# "t[4]"
1307
1308	xor	$A0[0],$A0[0]
1309	add	8($tptr,$j),$A0[1]
1310	adc	\$0,$A0[0]
1311	mul	$m0			# n[5]*m0
1312	add	%rax,$A0[1]		# n[5]*m0+t[5]
1313	 mov	$Ni[0],%rax
1314	adc	%rdx,$A0[0]
1315
1316
1317	mov	16($nptr,$j),$Ni[0]	# n[6]
1318	xor	$A1[1],$A1[1]
1319	add	$A0[1],$A1[0]
1320	adc	\$0,$A1[1]
1321	mul	$m1			# n[4]*m1
1322	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
1323	 mov	$Ni[0],%rax
1324	adc	%rdx,$A1[1]
1325	mov	$A1[0],8($tptr,$j)	# "t[5]"
1326
1327	xor	$A0[1],$A0[1]
1328	add	16($tptr,$j),$A0[0]
1329	adc	\$0,$A0[1]
1330	mul	$m0			# n[6]*m0
1331	add	%rax,$A0[0]		# n[6]*m0+t[6]
1332	 mov	$Ni[1],%rax
1333	adc	%rdx,$A0[1]
1334
1335	mov	24($nptr,$j),$Ni[1]	# n[7]
1336	xor	$A1[0],$A1[0]
1337	add	$A0[0],$A1[1]
1338	adc	\$0,$A1[0]
1339	mul	$m1			# n[5]*m1
1340	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
1341	 mov	$Ni[1],%rax
1342	adc	%rdx,$A1[0]
1343	mov	$A1[1],16($tptr,$j)	# "t[6]"
1344
1345	xor	$A0[0],$A0[0]
1346	add	24($tptr,$j),$A0[1]
1347	lea	32($j),$j
1348	adc	\$0,$A0[0]
1349	mul	$m0			# n[7]*m0
1350	add	%rax,$A0[1]		# n[7]*m0+t[7]
1351	 mov	$Ni[0],%rax
1352	adc	%rdx,$A0[0]
1353	cmp	\$0,$j
1354	jne	.Lsqr4x_mont_inner
1355
1356	 sub	0(%rsp),$j		# $j=-$num	# modsched #
1357	 mov	$n0,$m0			#		# modsched #
1358
1359	xor	$A1[1],$A1[1]
1360	add	$A0[1],$A1[0]
1361	adc	\$0,$A1[1]
1362	mul	$m1			# n[6]*m1
1363	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
1364	mov	$Ni[1],%rax
1365	adc	%rdx,$A1[1]
1366	mov	$A1[0],-8($tptr)	# "t[7]"
1367
1368	xor	$A0[1],$A0[1]
1369	add	($tptr),$A0[0]		# +t[8]
1370	adc	\$0,$A0[1]
1371	 mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
1372	add	$topbit,$A0[0]
1373	adc	\$0,$A0[1]
1374
1375	 imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
1376	xor	$A1[0],$A1[0]
1377	 mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
1378	add	$A0[0],$A1[1]
1379	 mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
1380	adc	\$0,$A1[0]
1381	mul	$m1			# n[7]*m1
1382	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
1383	 mov	$Ni[0],%rax		#		# modsched #
1384	adc	%rdx,$A1[0]
1385	mov	$A1[1],($tptr)		# "t[8]"
1386
1387	xor	$topbit,$topbit
1388	add	8($tptr),$A1[0]		# +t[9]
1389	adc	$topbit,$topbit
1390	add	$A0[1],$A1[0]
1391	lea	16($tptr),$tptr		# "t[$num]>>128"
1392	adc	\$0,$topbit
1393	mov	$A1[0],-8($tptr)	# "t[9]"
1394	cmp	8(%rsp),$tptr		# are we done?
1395	jb	.Lsqr4x_mont_outer
1396
1397	mov	0(%rsp),$num		# restore $num
1398	mov	$topbit,($tptr)		# save $topbit
1399___
1400}
1401##############################################################
1402# Post-condition, 4x unrolled copy from bn_mul_mont
1403#
1404{
1405my ($tptr,$nptr)=("%rbx",$aptr);
1406my @ri=("%rax","%rdx","%r10","%r11");
1407$code.=<<___;
1408	mov	64(%rsp,$num),@ri[0]	# tp[0]
1409	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
1410	mov	40(%rsp),$nptr		# restore $nptr
1411	shr	\$5,$num		# num/4
1412	mov	8($tptr),@ri[1]		# t[1]
1413	xor	$i,$i			# i=0 and clear CF!
1414
1415	mov	32(%rsp),$rptr		# restore $rptr
1416	sub	0($nptr),@ri[0]
1417	mov	16($tptr),@ri[2]	# t[2]
1418	mov	24($tptr),@ri[3]	# t[3]
1419	sbb	8($nptr),@ri[1]
1420	lea	-1($num),$j		# j=num/4-1
1421	jmp	.Lsqr4x_sub
1422.align	16
1423.Lsqr4x_sub:
1424	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1425	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1426	sbb	16($nptr,$i,8),@ri[2]
1427	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
1428	mov	40($tptr,$i,8),@ri[1]
1429	sbb	24($nptr,$i,8),@ri[3]
1430	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1431	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1432	sbb	32($nptr,$i,8),@ri[0]
1433	mov	48($tptr,$i,8),@ri[2]
1434	mov	56($tptr,$i,8),@ri[3]
1435	sbb	40($nptr,$i,8),@ri[1]
	lea	4($i),$i		# i+=4
1437	dec	$j			# doesn't affect CF!
1438	jnz	.Lsqr4x_sub
1439
1440	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1441	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
1442	sbb	16($nptr,$i,8),@ri[2]
1443	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1444	sbb	24($nptr,$i,8),@ri[3]
1445	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1446
1447	sbb	\$0,@ri[0]		# handle upmost overflow bit
1448	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1449	xor	$i,$i			# i=0
1450	and	@ri[0],$tptr
1451	not	@ri[0]
1452	mov	$rptr,$nptr
1453	and	@ri[0],$nptr
1454	lea	-1($num),$j
1455	or	$nptr,$tptr		# tp=borrow?tp:rp
1456
1457	pxor	%xmm0,%xmm0
1458	lea	64(%rsp,$num,8),$nptr
1459	movdqu	($tptr),%xmm1
1460	lea	($nptr,$num,8),$nptr
1461	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
1462	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
1463	movdqu	%xmm1,($rptr)
1464	jmp	.Lsqr4x_copy
1465.align	16
1466.Lsqr4x_copy:				# copy or in-place refresh
1467	movdqu	16($tptr,$i),%xmm2
1468	movdqu	32($tptr,$i),%xmm1
1469	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
1470	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
1471	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
1472	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
1473	movdqu	%xmm2,16($rptr,$i)
1474	movdqu	%xmm1,32($rptr,$i)
1475	lea	32($i),$i
1476	dec	$j
1477	jnz	.Lsqr4x_copy
1478
1479	movdqu	16($tptr,$i),%xmm2
1480	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
1481	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
1482	movdqu	%xmm2,16($rptr,$i)
1483___
1484}
1485$code.=<<___;
1486	mov	56(%rsp),%rsi		# restore %rsp
1487	mov	\$1,%rax
1488	mov	0(%rsi),%r15
1489	mov	8(%rsi),%r14
1490	mov	16(%rsi),%r13
1491	mov	24(%rsi),%r12
1492	mov	32(%rsi),%rbp
1493	mov	40(%rsi),%rbx
1494	lea	48(%rsi),%rsp
1495.Lsqr4x_epilogue:
1496	ret
1497.size	bn_sqr4x_mont,.-bn_sqr4x_mont
1498___
1499}}}
1500$code.=<<___;
1501.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1502.align	16
1503___
1504
1505# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1506#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
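#
# These handlers are referenced only from the .pdata/.xdata unwind
# tables emitted below.  If an exception is dispatched while %rip lies
# between a function's body label and its epilogue, they recover the
# original %rsp (bn_mul_mont and bn_mul4x_mont keep it at tp[num+1],
# found through the saved num argument in the CONTEXT record, while
# bn_sqr4x_mont keeps it at frame offset 56), reload the non-volatile
# registers saved by the prologue, then call RtlVirtualUnwind and
# return ExceptionContinueSearch.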
1507if ($win64) {
1508$rec="%rcx";
1509$frame="%rdx";
1510$context="%r8";
1511$disp="%r9";
1512
1513$code.=<<___;
1514.extern	__imp_RtlVirtualUnwind
1515.type	mul_handler,\@abi-omnipotent
1516.align	16
1517mul_handler:
1518	push	%rsi
1519	push	%rdi
1520	push	%rbx
1521	push	%rbp
1522	push	%r12
1523	push	%r13
1524	push	%r14
1525	push	%r15
1526	pushfq
1527	sub	\$64,%rsp
1528
1529	mov	120($context),%rax	# pull context->Rax
1530	mov	248($context),%rbx	# pull context->Rip
1531
1532	mov	8($disp),%rsi		# disp->ImageBase
1533	mov	56($disp),%r11		# disp->HandlerData
1534
1535	mov	0(%r11),%r10d		# HandlerData[0]
1536	lea	(%rsi,%r10),%r10	# end of prologue label
1537	cmp	%r10,%rbx		# context->Rip<end of prologue label
1538	jb	.Lcommon_seh_tail
1539
1540	mov	152($context),%rax	# pull context->Rsp
1541
1542	mov	4(%r11),%r10d		# HandlerData[1]
1543	lea	(%rsi,%r10),%r10	# epilogue label
1544	cmp	%r10,%rbx		# context->Rip>=epilogue label
1545	jae	.Lcommon_seh_tail
1546
1547	mov	192($context),%r10	# pull $num
1548	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
1549	lea	48(%rax),%rax
1550
1551	mov	-8(%rax),%rbx
1552	mov	-16(%rax),%rbp
1553	mov	-24(%rax),%r12
1554	mov	-32(%rax),%r13
1555	mov	-40(%rax),%r14
1556	mov	-48(%rax),%r15
1557	mov	%rbx,144($context)	# restore context->Rbx
1558	mov	%rbp,160($context)	# restore context->Rbp
1559	mov	%r12,216($context)	# restore context->R12
1560	mov	%r13,224($context)	# restore context->R13
1561	mov	%r14,232($context)	# restore context->R14
1562	mov	%r15,240($context)	# restore context->R15
1563
1564	jmp	.Lcommon_seh_tail
1565.size	mul_handler,.-mul_handler
1566
1567.type	sqr_handler,\@abi-omnipotent
1568.align	16
1569sqr_handler:
1570	push	%rsi
1571	push	%rdi
1572	push	%rbx
1573	push	%rbp
1574	push	%r12
1575	push	%r13
1576	push	%r14
1577	push	%r15
1578	pushfq
1579	sub	\$64,%rsp
1580
1581	mov	120($context),%rax	# pull context->Rax
1582	mov	248($context),%rbx	# pull context->Rip
1583
1584	lea	.Lsqr4x_body(%rip),%r10
1585	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
1586	jb	.Lcommon_seh_tail
1587
1588	mov	152($context),%rax	# pull context->Rsp
1589
1590	lea	.Lsqr4x_epilogue(%rip),%r10
1591	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
1592	jae	.Lcommon_seh_tail
1593
1594	mov	56(%rax),%rax		# pull saved stack pointer
1595	lea	48(%rax),%rax
1596
1597	mov	-8(%rax),%rbx
1598	mov	-16(%rax),%rbp
1599	mov	-24(%rax),%r12
1600	mov	-32(%rax),%r13
1601	mov	-40(%rax),%r14
1602	mov	-48(%rax),%r15
1603	mov	%rbx,144($context)	# restore context->Rbx
1604	mov	%rbp,160($context)	# restore context->Rbp
1605	mov	%r12,216($context)	# restore context->R12
1606	mov	%r13,224($context)	# restore context->R13
1607	mov	%r14,232($context)	# restore context->R14
1608	mov	%r15,240($context)	# restore context->R15
1609
1610.Lcommon_seh_tail:
1611	mov	8(%rax),%rdi
1612	mov	16(%rax),%rsi
1613	mov	%rax,152($context)	# restore context->Rsp
1614	mov	%rsi,168($context)	# restore context->Rsi
1615	mov	%rdi,176($context)	# restore context->Rdi
1616
1617	mov	40($disp),%rdi		# disp->ContextRecord
1618	mov	$context,%rsi		# context
1619	mov	\$154,%ecx		# sizeof(CONTEXT)
1620	.long	0xa548f3fc		# cld; rep movsq
1621
1622	mov	$disp,%rsi
1623	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1624	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1625	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1626	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1627	mov	40(%rsi),%r10		# disp->ContextRecord
1628	lea	56(%rsi),%r11		# &disp->HandlerData
1629	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1630	mov	%r10,32(%rsp)		# arg5
1631	mov	%r11,40(%rsp)		# arg6
1632	mov	%r12,48(%rsp)		# arg7
1633	mov	%rcx,56(%rsp)		# arg8, (NULL)
1634	call	*__imp_RtlVirtualUnwind(%rip)
1635
1636	mov	\$1,%eax		# ExceptionContinueSearch
1637	add	\$64,%rsp
1638	popfq
1639	pop	%r15
1640	pop	%r14
1641	pop	%r13
1642	pop	%r12
1643	pop	%rbp
1644	pop	%rbx
1645	pop	%rdi
1646	pop	%rsi
1647	ret
1648.size	sqr_handler,.-sqr_handler
1649
1650.section	.pdata
1651.align	4
1652	.rva	.LSEH_begin_bn_mul_mont
1653	.rva	.LSEH_end_bn_mul_mont
1654	.rva	.LSEH_info_bn_mul_mont
1655
1656	.rva	.LSEH_begin_bn_mul4x_mont
1657	.rva	.LSEH_end_bn_mul4x_mont
1658	.rva	.LSEH_info_bn_mul4x_mont
1659
1660	.rva	.LSEH_begin_bn_sqr4x_mont
1661	.rva	.LSEH_end_bn_sqr4x_mont
1662	.rva	.LSEH_info_bn_sqr4x_mont
1663
1664.section	.xdata
1665.align	8
1666.LSEH_info_bn_mul_mont:
1667	.byte	9,0,0,0
1668	.rva	mul_handler
1669	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
1670.LSEH_info_bn_mul4x_mont:
1671	.byte	9,0,0,0
1672	.rva	mul_handler
1673	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
1674.LSEH_info_bn_sqr4x_mont:
1675	.byte	9,0,0,0
1676	.rva	sqr_handler
1677___
1678}
1679
1680print $code;
1681close STDOUT;
1682