1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005.
11#
12# Montgomery multiplication routine for x86_64. While it gives a modest
13# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
14# than twice (>2x) as fast. The most common case, rsa1024 sign, is
15# improved by a respectable 50%. It remains to be seen whether loop
16# unrolling and a dedicated squaring routine can provide further gains...
17
18# July 2011.
19#
20# Add a dedicated squaring procedure. The performance improvement varies
21# from platform to platform, but on average it's ~5%/15%/25%/33%
22# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23
24# August 2011.
25#
26# Unroll and modulo-schedule the inner loops in such a manner that they
27# are "fallen through" for an input length of 8, which is critical for
28# 1024-bit RSA *sign*. The average performance improvement relative to the
29# *initial* 2005 version of this module is ~0%/30%/40%/45%
30# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31
32$flavour = shift;
33$output  = shift;
34if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
36$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
37( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
38( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
39die "can't locate x86_64-xlate.pl";
40
41open OUT,"| \"$^X\" $xlate $flavour $output";
42*STDOUT=*OUT;
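# This script is not normally run by hand; the build supplies a flavour and
# an output path on the command line. A typical invocation (an illustrative
# example, not something mandated by this file) would be
#
#	perl x86_64-mont.pl elf x86_64-mont.S
#
# where the flavour selects the assembler dialect handled by x86_64-xlate.pl.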
43
44# int bn_mul_mont(
45$rp="%rdi";	# BN_ULONG *rp,
46$ap="%rsi";	# const BN_ULONG *ap,
47$bp="%rdx";	# const BN_ULONG *bp,
48$np="%rcx";	# const BN_ULONG *np,
49$n0="%r8";	# const BN_ULONG *n0,
50$num="%r9";	# int num);
51$lo0="%r10";
52$hi0="%r11";
53$hi1="%r13";
54$i="%r14";
55$j="%r15";
56$m0="%rbx";
57$m1="%rbp";
58
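# As a readability aid, the word-by-word Montgomery multiplication that the
# assembly below implements can be modelled in a few lines of plain Perl.
# This is a minimal sketch only: the name bn_mul_mont_ref, the Math::BigInt
# arithmetic and the whole-number (rather than BN_ULONG array) interface are
# assumptions made for illustration, and the sub is never called from here.
sub bn_mul_mont_ref {
	my ($a, $b, $n, $n0_, $limbs) = @_;	# Math::BigInt a,b,n; n0_ = -n^-1 mod 2^64
	use Math::BigInt;
	my $base = Math::BigInt->new(1)->blsft(64);
	my $t = Math::BigInt->bzero();
	for my $i (0 .. $limbs - 1) {
		my $bi = $b->copy->brsft(64 * $i) % $base;	# b[i]
		$t += $a * $bi;					# t += a*b[i]
		my $m = ($t % $base) * $n0_ % $base;		# m = t[0]*n0 mod 2^64
		$t += $n * $m;					# low word of t becomes zero
		$t->brsft(64);					# t >>= 64
	}
	$t -= $n if $t >= $n;	# conditional subtraction, cf. .Lsub/.Lcopy below
	return $t;		# == a*b*2^(-64*limbs) mod n
}
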
59$code=<<___;
60.text
61
62.globl	bn_mul_mont
63.type	bn_mul_mont,\@function,6
64.align	16
65bn_mul_mont:
66	test	\$3,${num}d
67	jnz	.Lmul_enter
68	cmp	\$8,${num}d
69	jb	.Lmul_enter
70	cmp	$ap,$bp
71	jne	.Lmul4x_enter
72	jmp	.Lsqr4x_enter
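	# Dispatch summary: a length that is not a multiple of 4, or is shorter
	# than 8 limbs, takes the generic 1x loop below; a multiple of 4 (>=8)
	# takes the 4x-unrolled path; and the case ap==bp (squaring, e.g. in the
	# course of modular exponentiation) takes the dedicated squaring path.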
73
74.align	16
75.Lmul_enter:
76	push	%rbx
77	push	%rbp
78	push	%r12
79	push	%r13
80	push	%r14
81	push	%r15
82
83	mov	${num}d,${num}d
84	lea	2($num),%r10
85	mov	%rsp,%r11
86	neg	%r10
87	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
88	and	\$-1024,%rsp		# minimize TLB usage
89
90	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
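	# Temporary vector layout from here on: tp[0..num-1] holds the running
	# Montgomery product, tp[num] the single possible overflow bit, and
	# tp[num+1] the original %rsp saved just above.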
91.Lmul_body:
92	mov	$bp,%r12		# reassign $bp
93___
94		$bp="%r12";
95$code.=<<___;
96	mov	($n0),$n0		# pull n0[0] value
97	mov	($bp),$m0		# m0=bp[0]
98	mov	($ap),%rax
99
100	xor	$i,$i			# i=0
101	xor	$j,$j			# j=0
102
103	mov	$n0,$m1
104	mulq	$m0			# ap[0]*bp[0]
105	mov	%rax,$lo0
106	mov	($np),%rax
107
108	imulq	$lo0,$m1		# "tp[0]"*n0
109	mov	%rdx,$hi0
110
111	mulq	$m1			# np[0]*m1
112	add	%rax,$lo0		# discarded
113	mov	8($ap),%rax
114	adc	\$0,%rdx
115	mov	%rdx,$hi1
116
117	lea	1($j),$j		# j++
118	jmp	.L1st_enter
119
120.align	16
121.L1st:
122	add	%rax,$hi1
123	mov	($ap,$j,8),%rax
124	adc	\$0,%rdx
125	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
126	mov	$lo0,$hi0
127	adc	\$0,%rdx
128	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
129	mov	%rdx,$hi1
130
131.L1st_enter:
132	mulq	$m0			# ap[j]*bp[0]
133	add	%rax,$hi0
134	mov	($np,$j,8),%rax
135	adc	\$0,%rdx
136	lea	1($j),$j		# j++
137	mov	%rdx,$lo0
138
139	mulq	$m1			# np[j]*m1
140	cmp	$num,$j
141	jl	.L1st
142
143	add	%rax,$hi1
144	mov	($ap),%rax		# ap[0]
145	adc	\$0,%rdx
146	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
147	adc	\$0,%rdx
148	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
149	mov	%rdx,$hi1
150	mov	$lo0,$hi0
151
152	xor	%rdx,%rdx
153	add	$hi0,$hi1
154	adc	\$0,%rdx
155	mov	$hi1,-8(%rsp,$num,8)
156	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
157
158	lea	1($i),$i		# i++
159	jmp	.Louter
160.align	16
161.Louter:
162	mov	($bp,$i,8),$m0		# m0=bp[i]
163	xor	$j,$j			# j=0
164	mov	$n0,$m1
165	mov	(%rsp),$lo0
166	mulq	$m0			# ap[0]*bp[i]
167	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
168	mov	($np),%rax
169	adc	\$0,%rdx
170
171	imulq	$lo0,$m1		# tp[0]*n0
172	mov	%rdx,$hi0
173
174	mulq	$m1			# np[0]*m1
175	add	%rax,$lo0		# discarded
176	mov	8($ap),%rax
177	adc	\$0,%rdx
178	mov	8(%rsp),$lo0		# tp[1]
179	mov	%rdx,$hi1
180
181	lea	1($j),$j		# j++
182	jmp	.Linner_enter
183
184.align	16
185.Linner:
186	add	%rax,$hi1
187	mov	($ap,$j,8),%rax
188	adc	\$0,%rdx
189	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
190	mov	(%rsp,$j,8),$lo0
191	adc	\$0,%rdx
192	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
193	mov	%rdx,$hi1
194
195.Linner_enter:
196	mulq	$m0			# ap[j]*bp[i]
197	add	%rax,$hi0
198	mov	($np,$j,8),%rax
199	adc	\$0,%rdx
200	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
201	mov	%rdx,$hi0
202	adc	\$0,$hi0
203	lea	1($j),$j		# j++
204
205	mulq	$m1			# np[j]*m1
206	cmp	$num,$j
207	jl	.Linner
208
209	add	%rax,$hi1
210	mov	($ap),%rax		# ap[0]
211	adc	\$0,%rdx
212	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
213	mov	(%rsp,$j,8),$lo0
214	adc	\$0,%rdx
215	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
216	mov	%rdx,$hi1
217
218	xor	%rdx,%rdx
219	add	$hi0,$hi1
220	adc	\$0,%rdx
221	add	$lo0,$hi1		# pull upmost overflow bit
222	adc	\$0,%rdx
223	mov	$hi1,-8(%rsp,$num,8)
224	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
225
226	lea	1($i),$i		# i++
227	cmp	$num,$i
228	jl	.Louter
229
230	xor	$i,$i			# i=0 and clear CF!
231	mov	(%rsp),%rax		# tp[0]
232	lea	(%rsp),$ap		# borrow ap for tp
233	mov	$num,$j			# j=num
234	jmp	.Lsub
235.align	16
236.Lsub:	sbb	($np,$i,8),%rax
237	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
238	mov	8($ap,$i,8),%rax	# tp[i+1]
239	lea	1($i),$i		# i++
240	dec	$j			# doesn't affect CF!
241	jnz	.Lsub
242
243	sbb	\$0,%rax		# handle upmost overflow bit
244	xor	$i,$i
245	and	%rax,$ap
246	not	%rax
247	mov	$rp,$np
248	and	%rax,$np
249	mov	$num,$j			# j=num
250	or	$np,$ap			# ap=borrow?tp:rp
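	# Branch-free select: %rax is 0 or all-ones depending on whether the
	# subtraction above borrowed, so the and/or sequence picks tp (tp < np,
	# keep the unreduced value) or rp (tp >= np, keep tp-np already in rp).
	# The copy loop below also zaps tp, presumably so the intermediate
	# result does not linger on the stack.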
251.align	16
252.Lcopy:					# copy or in-place refresh
253	mov	($ap,$i,8),%rax
254	mov	$i,(%rsp,$i,8)		# zap temporary vector
255	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
256	lea	1($i),$i
257	sub	\$1,$j
258	jnz	.Lcopy
259
260	mov	8(%rsp,$num,8),%rsi	# restore %rsp
261	mov	\$1,%rax
262	mov	(%rsi),%r15
263	mov	8(%rsi),%r14
264	mov	16(%rsi),%r13
265	mov	24(%rsi),%r12
266	mov	32(%rsi),%rbp
267	mov	40(%rsi),%rbx
268	lea	48(%rsi),%rsp
269.Lmul_epilogue:
270	ret
271.size	bn_mul_mont,.-bn_mul_mont
272___
273{{{
274my @A=("%r10","%r11");
275my @N=("%r13","%rdi");
276$code.=<<___;
277.type	bn_mul4x_mont,\@function,6
278.align	16
279bn_mul4x_mont:
280.Lmul4x_enter:
281	push	%rbx
282	push	%rbp
283	push	%r12
284	push	%r13
285	push	%r14
286	push	%r15
287
288	mov	${num}d,${num}d
289	lea	4($num),%r10
290	mov	%rsp,%r11
291	neg	%r10
292	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
293	and	\$-1024,%rsp		# minimize TLB usage
294
295	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
296.Lmul4x_body:
297	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
298	mov	%rdx,%r12		# reassign $bp
299___
300		$bp="%r12";
301$code.=<<___;
302	mov	($n0),$n0		# pull n0[0] value
303	mov	($bp),$m0		# m0=bp[0]
304	mov	($ap),%rax
305
306	xor	$i,$i			# i=0
307	xor	$j,$j			# j=0
308
309	mov	$n0,$m1
310	mulq	$m0			# ap[0]*bp[0]
311	mov	%rax,$A[0]
312	mov	($np),%rax
313
314	imulq	$A[0],$m1		# "tp[0]"*n0
315	mov	%rdx,$A[1]
316
317	mulq	$m1			# np[0]*m1
318	add	%rax,$A[0]		# discarded
319	mov	8($ap),%rax
320	adc	\$0,%rdx
321	mov	%rdx,$N[1]
322
323	mulq	$m0
324	add	%rax,$A[1]
325	mov	8($np),%rax
326	adc	\$0,%rdx
327	mov	%rdx,$A[0]
328
329	mulq	$m1
330	add	%rax,$N[1]
331	mov	16($ap),%rax
332	adc	\$0,%rdx
333	add	$A[1],$N[1]
334	lea	4($j),$j		# j+=4
335	adc	\$0,%rdx
336	mov	$N[1],(%rsp)
337	mov	%rdx,$N[0]
338	jmp	.L1st4x
339.align	16
340.L1st4x:
341	mulq	$m0			# ap[j]*bp[0]
342	add	%rax,$A[0]
343	mov	-16($np,$j,8),%rax
344	adc	\$0,%rdx
345	mov	%rdx,$A[1]
346
347	mulq	$m1			# np[j]*m1
348	add	%rax,$N[0]
349	mov	-8($ap,$j,8),%rax
350	adc	\$0,%rdx
351	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
352	adc	\$0,%rdx
353	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
354	mov	%rdx,$N[1]
355
356	mulq	$m0			# ap[j]*bp[0]
357	add	%rax,$A[1]
358	mov	-8($np,$j,8),%rax
359	adc	\$0,%rdx
360	mov	%rdx,$A[0]
361
362	mulq	$m1			# np[j]*m1
363	add	%rax,$N[1]
364	mov	($ap,$j,8),%rax
365	adc	\$0,%rdx
366	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
367	adc	\$0,%rdx
368	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
369	mov	%rdx,$N[0]
370
371	mulq	$m0			# ap[j]*bp[0]
372	add	%rax,$A[0]
373	mov	($np,$j,8),%rax
374	adc	\$0,%rdx
375	mov	%rdx,$A[1]
376
377	mulq	$m1			# np[j]*m1
378	add	%rax,$N[0]
379	mov	8($ap,$j,8),%rax
380	adc	\$0,%rdx
381	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
382	adc	\$0,%rdx
383	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
384	mov	%rdx,$N[1]
385
386	mulq	$m0			# ap[j]*bp[0]
387	add	%rax,$A[1]
388	mov	8($np,$j,8),%rax
389	adc	\$0,%rdx
390	lea	4($j),$j		# j+=4
391	mov	%rdx,$A[0]
392
393	mulq	$m1			# np[j]*m1
394	add	%rax,$N[1]
395	mov	-16($ap,$j,8),%rax
396	adc	\$0,%rdx
397	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
398	adc	\$0,%rdx
399	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
400	mov	%rdx,$N[0]
401	cmp	$num,$j
402	jl	.L1st4x
403
404	mulq	$m0			# ap[j]*bp[0]
405	add	%rax,$A[0]
406	mov	-16($np,$j,8),%rax
407	adc	\$0,%rdx
408	mov	%rdx,$A[1]
409
410	mulq	$m1			# np[j]*m1
411	add	%rax,$N[0]
412	mov	-8($ap,$j,8),%rax
413	adc	\$0,%rdx
414	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
415	adc	\$0,%rdx
416	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
417	mov	%rdx,$N[1]
418
419	mulq	$m0			# ap[j]*bp[0]
420	add	%rax,$A[1]
421	mov	-8($np,$j,8),%rax
422	adc	\$0,%rdx
423	mov	%rdx,$A[0]
424
425	mulq	$m1			# np[j]*m1
426	add	%rax,$N[1]
427	mov	($ap),%rax		# ap[0]
428	adc	\$0,%rdx
429	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
430	adc	\$0,%rdx
431	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
432	mov	%rdx,$N[0]
433
434	xor	$N[1],$N[1]
435	add	$A[0],$N[0]
436	adc	\$0,$N[1]
437	mov	$N[0],-8(%rsp,$j,8)
438	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
439
440	lea	1($i),$i		# i++
441.align	4
442.Louter4x:
443	mov	($bp,$i,8),$m0		# m0=bp[i]
444	xor	$j,$j			# j=0
445	mov	(%rsp),$A[0]
446	mov	$n0,$m1
447	mulq	$m0			# ap[0]*bp[i]
448	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
449	mov	($np),%rax
450	adc	\$0,%rdx
451
452	imulq	$A[0],$m1		# tp[0]*n0
453	mov	%rdx,$A[1]
454
455	mulq	$m1			# np[0]*m1
456	add	%rax,$A[0]		# "$N[0]", discarded
457	mov	8($ap),%rax
458	adc	\$0,%rdx
459	mov	%rdx,$N[1]
460
461	mulq	$m0			# ap[j]*bp[i]
462	add	%rax,$A[1]
463	mov	8($np),%rax
464	adc	\$0,%rdx
465	add	8(%rsp),$A[1]		# +tp[1]
466	adc	\$0,%rdx
467	mov	%rdx,$A[0]
468
469	mulq	$m1			# np[j]*m1
470	add	%rax,$N[1]
471	mov	16($ap),%rax
472	adc	\$0,%rdx
473	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
474	lea	4($j),$j		# j+=4
475	adc	\$0,%rdx
476	mov	$N[1],(%rsp)		# tp[j-1]
477	mov	%rdx,$N[0]
478	jmp	.Linner4x
479.align	16
480.Linner4x:
481	mulq	$m0			# ap[j]*bp[i]
482	add	%rax,$A[0]
483	mov	-16($np,$j,8),%rax
484	adc	\$0,%rdx
485	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
486	adc	\$0,%rdx
487	mov	%rdx,$A[1]
488
489	mulq	$m1			# np[j]*m1
490	add	%rax,$N[0]
491	mov	-8($ap,$j,8),%rax
492	adc	\$0,%rdx
493	add	$A[0],$N[0]
494	adc	\$0,%rdx
495	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
496	mov	%rdx,$N[1]
497
498	mulq	$m0			# ap[j]*bp[i]
499	add	%rax,$A[1]
500	mov	-8($np,$j,8),%rax
501	adc	\$0,%rdx
502	add	-8(%rsp,$j,8),$A[1]
503	adc	\$0,%rdx
504	mov	%rdx,$A[0]
505
506	mulq	$m1			# np[j]*m1
507	add	%rax,$N[1]
508	mov	($ap,$j,8),%rax
509	adc	\$0,%rdx
510	add	$A[1],$N[1]
511	adc	\$0,%rdx
512	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
513	mov	%rdx,$N[0]
514
515	mulq	$m0			# ap[j]*bp[i]
516	add	%rax,$A[0]
517	mov	($np,$j,8),%rax
518	adc	\$0,%rdx
519	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
520	adc	\$0,%rdx
521	mov	%rdx,$A[1]
522
523	mulq	$m1			# np[j]*m1
524	add	%rax,$N[0]
525	mov	8($ap,$j,8),%rax
526	adc	\$0,%rdx
527	add	$A[0],$N[0]
528	adc	\$0,%rdx
529	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
530	mov	%rdx,$N[1]
531
532	mulq	$m0			# ap[j]*bp[i]
533	add	%rax,$A[1]
534	mov	8($np,$j,8),%rax
535	adc	\$0,%rdx
536	add	8(%rsp,$j,8),$A[1]
537	adc	\$0,%rdx
538	lea	4($j),$j		# j+=4
539	mov	%rdx,$A[0]
540
541	mulq	$m1			# np[j]*m1
542	add	%rax,$N[1]
543	mov	-16($ap,$j,8),%rax
544	adc	\$0,%rdx
545	add	$A[1],$N[1]
546	adc	\$0,%rdx
547	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
548	mov	%rdx,$N[0]
549	cmp	$num,$j
550	jl	.Linner4x
551
552	mulq	$m0			# ap[j]*bp[i]
553	add	%rax,$A[0]
554	mov	-16($np,$j,8),%rax
555	adc	\$0,%rdx
556	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
557	adc	\$0,%rdx
558	mov	%rdx,$A[1]
559
560	mulq	$m1			# np[j]*m1
561	add	%rax,$N[0]
562	mov	-8($ap,$j,8),%rax
563	adc	\$0,%rdx
564	add	$A[0],$N[0]
565	adc	\$0,%rdx
566	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
567	mov	%rdx,$N[1]
568
569	mulq	$m0			# ap[j]*bp[i]
570	add	%rax,$A[1]
571	mov	-8($np,$j,8),%rax
572	adc	\$0,%rdx
573	add	-8(%rsp,$j,8),$A[1]
574	adc	\$0,%rdx
575	lea	1($i),$i		# i++
576	mov	%rdx,$A[0]
577
578	mulq	$m1			# np[j]*m1
579	add	%rax,$N[1]
580	mov	($ap),%rax		# ap[0]
581	adc	\$0,%rdx
582	add	$A[1],$N[1]
583	adc	\$0,%rdx
584	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
585	mov	%rdx,$N[0]
586
587	xor	$N[1],$N[1]
588	add	$A[0],$N[0]
589	adc	\$0,$N[1]
590	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
591	adc	\$0,$N[1]
592	mov	$N[0],-8(%rsp,$j,8)
593	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
594
595	cmp	$num,$i
596	jl	.Louter4x
597___
598{
599my @ri=("%rax","%rdx",$m0,$m1);
600$code.=<<___;
601	mov	16(%rsp,$num,8),$rp	# restore $rp
602	mov	0(%rsp),@ri[0]		# tp[0]
603	pxor	%xmm0,%xmm0
604	mov	8(%rsp),@ri[1]		# tp[1]
605	shr	\$2,$num		# num/=4
606	lea	(%rsp),$ap		# borrow ap for tp
607	xor	$i,$i			# i=0 and clear CF!
608
609	sub	0($np),@ri[0]
610	mov	16($ap),@ri[2]		# tp[2]
611	mov	24($ap),@ri[3]		# tp[3]
612	sbb	8($np),@ri[1]
613	lea	-1($num),$j		# j=num/4-1
614	jmp	.Lsub4x
615.align	16
616.Lsub4x:
617	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
618	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
619	sbb	16($np,$i,8),@ri[2]
620	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
621	mov	40($ap,$i,8),@ri[1]
622	sbb	24($np,$i,8),@ri[3]
623	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
624	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
625	sbb	32($np,$i,8),@ri[0]
626	mov	48($ap,$i,8),@ri[2]
627	mov	56($ap,$i,8),@ri[3]
628	sbb	40($np,$i,8),@ri[1]
629	lea	4($i),$i		# i++
630	dec	$j			# doesn't affect CF!
631	jnz	.Lsub4x
632
633	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
634	mov	32($ap,$i,8),@ri[0]	# load overflow bit
635	sbb	16($np,$i,8),@ri[2]
636	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
637	sbb	24($np,$i,8),@ri[3]
638	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
639
640	sbb	\$0,@ri[0]		# handle upmost overflow bit
641	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
642	xor	$i,$i			# i=0
643	and	@ri[0],$ap
644	not	@ri[0]
645	mov	$rp,$np
646	and	@ri[0],$np
647	lea	-1($num),$j
648	or	$np,$ap			# ap=borrow?tp:rp
649
650	movdqu	($ap),%xmm1
651	movdqa	%xmm0,(%rsp)
652	movdqu	%xmm1,($rp)
653	jmp	.Lcopy4x
654.align	16
655.Lcopy4x:					# copy or in-place refresh
656	movdqu	16($ap,$i),%xmm2
657	movdqu	32($ap,$i),%xmm1
658	movdqa	%xmm0,16(%rsp,$i)
659	movdqu	%xmm2,16($rp,$i)
660	movdqa	%xmm0,32(%rsp,$i)
661	movdqu	%xmm1,32($rp,$i)
662	lea	32($i),$i
663	dec	$j
664	jnz	.Lcopy4x
665
666	shl	\$2,$num
667	movdqu	16($ap,$i),%xmm2
668	movdqa	%xmm0,16(%rsp,$i)
669	movdqu	%xmm2,16($rp,$i)
670___
671}
672$code.=<<___;
673	mov	8(%rsp,$num,8),%rsi	# restore %rsp
674	mov	\$1,%rax
675	mov	(%rsi),%r15
676	mov	8(%rsi),%r14
677	mov	16(%rsi),%r13
678	mov	24(%rsi),%r12
679	mov	32(%rsi),%rbp
680	mov	40(%rsi),%rbx
681	lea	48(%rsi),%rsp
682.Lmul4x_epilogue:
683	ret
684.size	bn_mul4x_mont,.-bn_mul4x_mont
685___
686}}}
687{{{
688######################################################################
689# void bn_sqr4x_mont(
690my $rptr="%rdi";	# BN_ULONG *rptr,
691my $aptr="%rsi";	# const BN_ULONG *aptr,
692my $bptr="%rdx";	# not used
693my $nptr="%rcx";	# const BN_ULONG *nptr,
694my $n0  ="%r8";		# const BN_ULONG *n0,
695my $num ="%r9";		# int num); has to be divisible by 4 and
696			# not less than 8
697
698my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
699my @A0=("%r10","%r11");
700my @A1=("%r12","%r13");
701my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
702
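# The squaring strategy used below (see the comments at .Lsqr4x_body) can be
# sketched in Perl as well: accumulate every cross product a[i]*a[j], i<j,
# exactly once, double the whole sum with a one-bit left shift, then add the
# squares a[i]*a[i]. The name bn_sqr_ref, the Math::BigInt arithmetic and
# the limb-array interface are illustrative assumptions; the sub is never
# called from here.
sub bn_sqr_ref {
	my ($ap, $limbs) = @_;			# little-endian array of 64-bit limbs
	use Math::BigInt;
	my $B = Math::BigInt->new(1)->blsft(64);
	my $t = Math::BigInt->bzero();
	for my $i (0 .. $limbs - 1) {		# a) cross products only
		for my $j ($i + 1 .. $limbs - 1) {
			$t += Math::BigInt->new($ap->[$i]) * $ap->[$j] * $B->copy->bpow($i + $j);
		}
	}
	$t->blsft(1);				# b) shift left by one bit...
	for my $i (0 .. $limbs - 1) {		# ...and accumulate a[i]*a[i]
		$t += Math::BigInt->new($ap->[$i]) * $ap->[$i] * $B->copy->bpow(2 * $i);
	}
	return $t;	# == a*a, still to be Montgomery-reduced word by word
}
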
703$code.=<<___;
704.type	bn_sqr4x_mont,\@function,6
705.align	16
706bn_sqr4x_mont:
707.Lsqr4x_enter:
708	push	%rbx
709	push	%rbp
710	push	%r12
711	push	%r13
712	push	%r14
713	push	%r15
714
715	shl	\$3,${num}d		# convert $num to bytes
716	xor	%r10,%r10
717	mov	%rsp,%r11		# put aside %rsp
718	sub	$num,%r10		# -$num
719	mov	($n0),$n0		# *n0
720	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
721	and	\$-1024,%rsp		# minimize TLB usage
722	##############################################################
723	# Stack layout
724	#
725	# +0	saved $num, used in reduction section
726	# +8	&t[2*$num], used in reduction section
727	# +32	saved $rptr
728	# +40	saved $nptr
729	# +48	saved *n0
730	# +56	saved %rsp
731	# +64	t[2*$num]
732	#
733	mov	$rptr,32(%rsp)		# save $rptr
734	mov	$nptr,40(%rsp)
735	mov	$n0,  48(%rsp)
736	mov	%r11, 56(%rsp)		# save original %rsp
737.Lsqr4x_body:
738	##############################################################
739	# Squaring part:
740	#
741	# a) multiply-n-add everything but a[i]*a[i];
742	# b) shift result of a) by 1 to the left and accumulate
743	#    a[i]*a[i] products;
744	#
745	lea	32(%r10),$i		# $i=-($num-32)
746	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
747
748	mov	$num,$j			# $j=$num
749
750					# comments apply to $num==8 case
751	mov	-32($aptr,$i),$a0	# a[0]
752	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
753	mov	-24($aptr,$i),%rax	# a[1]
754	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
755	mov	-16($aptr,$i),$ai	# a[2]
756	mov	%rax,$a1
757
758	mul	$a0			# a[1]*a[0]
759	mov	%rax,$A0[0]		# a[1]*a[0]
760	 mov	$ai,%rax		# a[2]
761	mov	%rdx,$A0[1]
762	mov	$A0[0],-24($tptr,$i)	# t[1]
763
764	xor	$A0[0],$A0[0]
765	mul	$a0			# a[2]*a[0]
766	add	%rax,$A0[1]
767	 mov	$ai,%rax
768	adc	%rdx,$A0[0]
769	mov	$A0[1],-16($tptr,$i)	# t[2]
770
771	lea	-16($i),$j		# j=-16
772
773
774	 mov	8($aptr,$j),$ai		# a[3]
775	mul	$a1			# a[2]*a[1]
776	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
777	 mov	$ai,%rax
778	mov	%rdx,$A1[1]
779
780	xor	$A0[1],$A0[1]
781	add	$A1[0],$A0[0]
782	 lea	16($j),$j
783	adc	\$0,$A0[1]
784	mul	$a0			# a[3]*a[0]
785	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
786	 mov	$ai,%rax
787	adc	%rdx,$A0[1]
788	mov	$A0[0],-8($tptr,$j)	# t[3]
789	jmp	.Lsqr4x_1st
790
791.align	16
792.Lsqr4x_1st:
793	 mov	($aptr,$j),$ai		# a[4]
794	xor	$A1[0],$A1[0]
795	mul	$a1			# a[3]*a[1]
796	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
797	 mov	$ai,%rax
798	adc	%rdx,$A1[0]
799
800	xor	$A0[0],$A0[0]
801	add	$A1[1],$A0[1]
802	adc	\$0,$A0[0]
803	mul	$a0			# a[4]*a[0]
804	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
805	 mov	$ai,%rax		# a[3]
806	adc	%rdx,$A0[0]
807	mov	$A0[1],($tptr,$j)	# t[4]
808
809
810	 mov	8($aptr,$j),$ai		# a[5]
811	xor	$A1[1],$A1[1]
812	mul	$a1			# a[4]*a[3]
813	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
814	 mov	$ai,%rax
815	adc	%rdx,$A1[1]
816
817	xor	$A0[1],$A0[1]
818	add	$A1[0],$A0[0]
819	adc	\$0,$A0[1]
820	mul	$a0			# a[5]*a[2]
821	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
822	 mov	$ai,%rax
823	adc	%rdx,$A0[1]
824	mov	$A0[0],8($tptr,$j)	# t[5]
825
826	 mov	16($aptr,$j),$ai	# a[6]
827	xor	$A1[0],$A1[0]
828	mul	$a1			# a[5]*a[3]
829	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
830	 mov	$ai,%rax
831	adc	%rdx,$A1[0]
832
833	xor	$A0[0],$A0[0]
834	add	$A1[1],$A0[1]
835	adc	\$0,$A0[0]
836	mul	$a0			# a[6]*a[2]
837	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
838	 mov	$ai,%rax		# a[3]
839	adc	%rdx,$A0[0]
840	mov	$A0[1],16($tptr,$j)	# t[6]
841
842
843	 mov	24($aptr,$j),$ai	# a[7]
844	xor	$A1[1],$A1[1]
845	mul	$a1			# a[6]*a[5]
846	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
847	 mov	$ai,%rax
848	adc	%rdx,$A1[1]
849
850	xor	$A0[1],$A0[1]
851	add	$A1[0],$A0[0]
852	 lea	32($j),$j
853	adc	\$0,$A0[1]
854	mul	$a0			# a[7]*a[4]
855	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
856	 mov	$ai,%rax
857	adc	%rdx,$A0[1]
858	mov	$A0[0],-8($tptr,$j)	# t[7]
859
860	cmp	\$0,$j
861	jne	.Lsqr4x_1st
862
863	xor	$A1[0],$A1[0]
864	add	$A0[1],$A1[1]
865	adc	\$0,$A1[0]
866	mul	$a1			# a[7]*a[5]
867	add	%rax,$A1[1]
868	adc	%rdx,$A1[0]
869
870	mov	$A1[1],($tptr)		# t[8]
871	lea	16($i),$i
872	mov	$A1[0],8($tptr)		# t[9]
873	jmp	.Lsqr4x_outer
874
875.align	16
876.Lsqr4x_outer:				# comments apply to $num==6 case
877	mov	-32($aptr,$i),$a0	# a[0]
878	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
879	mov	-24($aptr,$i),%rax	# a[1]
880	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
881	mov	-16($aptr,$i),$ai	# a[2]
882	mov	%rax,$a1
883
884	mov	-24($tptr,$i),$A0[0]	# t[1]
885	xor	$A0[1],$A0[1]
886	mul	$a0			# a[1]*a[0]
887	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
888	 mov	$ai,%rax		# a[2]
889	adc	%rdx,$A0[1]
890	mov	$A0[0],-24($tptr,$i)	# t[1]
891
892	xor	$A0[0],$A0[0]
893	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
894	adc	\$0,$A0[0]
895	mul	$a0			# a[2]*a[0]
896	add	%rax,$A0[1]
897	 mov	$ai,%rax
898	adc	%rdx,$A0[0]
899	mov	$A0[1],-16($tptr,$i)	# t[2]
900
901	lea	-16($i),$j		# j=-16
902	xor	$A1[0],$A1[0]
903
904
905	 mov	8($aptr,$j),$ai		# a[3]
906	xor	$A1[1],$A1[1]
907	add	8($tptr,$j),$A1[0]
908	adc	\$0,$A1[1]
909	mul	$a1			# a[2]*a[1]
910	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
911	 mov	$ai,%rax
912	adc	%rdx,$A1[1]
913
914	xor	$A0[1],$A0[1]
915	add	$A1[0],$A0[0]
916	adc	\$0,$A0[1]
917	mul	$a0			# a[3]*a[0]
918	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
919	 mov	$ai,%rax
920	adc	%rdx,$A0[1]
921	mov	$A0[0],8($tptr,$j)	# t[3]
922
923	lea	16($j),$j
924	jmp	.Lsqr4x_inner
925
926.align	16
927.Lsqr4x_inner:
928	 mov	($aptr,$j),$ai		# a[4]
929	xor	$A1[0],$A1[0]
930	add	($tptr,$j),$A1[1]
931	adc	\$0,$A1[0]
932	mul	$a1			# a[3]*a[1]
933	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
934	 mov	$ai,%rax
935	adc	%rdx,$A1[0]
936
937	xor	$A0[0],$A0[0]
938	add	$A1[1],$A0[1]
939	adc	\$0,$A0[0]
940	mul	$a0			# a[4]*a[0]
941	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
942	 mov	$ai,%rax		# a[3]
943	adc	%rdx,$A0[0]
944	mov	$A0[1],($tptr,$j)	# t[4]
945
946	 mov	8($aptr,$j),$ai		# a[5]
947	xor	$A1[1],$A1[1]
948	add	8($tptr,$j),$A1[0]
949	adc	\$0,$A1[1]
950	mul	$a1			# a[4]*a[3]
951	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
952	 mov	$ai,%rax
953	adc	%rdx,$A1[1]
954
955	xor	$A0[1],$A0[1]
956	add	$A1[0],$A0[0]
957	lea	16($j),$j		# j++
958	adc	\$0,$A0[1]
959	mul	$a0			# a[5]*a[2]
960	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
961	 mov	$ai,%rax
962	adc	%rdx,$A0[1]
963	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
964
965	cmp	\$0,$j
966	jne	.Lsqr4x_inner
967
968	xor	$A1[0],$A1[0]
969	add	$A0[1],$A1[1]
970	adc	\$0,$A1[0]
971	mul	$a1			# a[5]*a[3]
972	add	%rax,$A1[1]
973	adc	%rdx,$A1[0]
974
975	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
976	mov	$A1[0],8($tptr)		# t[7], "preloaded t[3]" below
977
978	add	\$16,$i
979	jnz	.Lsqr4x_outer
980
981					# comments apply to $num==4 case
982	mov	-32($aptr),$a0		# a[0]
983	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
984	mov	-24($aptr),%rax		# a[1]
985	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
986	mov	-16($aptr),$ai		# a[2]
987	mov	%rax,$a1
988
989	xor	$A0[1],$A0[1]
990	mul	$a0			# a[1]*a[0]
991	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
992	 mov	$ai,%rax		# a[2]
993	adc	%rdx,$A0[1]
994	mov	$A0[0],-24($tptr)	# t[1]
995
996	xor	$A0[0],$A0[0]
997	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
998	adc	\$0,$A0[0]
999	mul	$a0			# a[2]*a[0]
1000	add	%rax,$A0[1]
1001	 mov	$ai,%rax
1002	adc	%rdx,$A0[0]
1003	mov	$A0[1],-16($tptr)	# t[2]
1004
1005	 mov	-8($aptr),$ai		# a[3]
1006	mul	$a1			# a[2]*a[1]
1007	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1008	 mov	$ai,%rax
1009	adc	\$0,%rdx
1010
1011	xor	$A0[1],$A0[1]
1012	add	$A1[0],$A0[0]
1013	 mov	%rdx,$A1[1]
1014	adc	\$0,$A0[1]
1015	mul	$a0			# a[3]*a[0]
1016	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1017	 mov	$ai,%rax
1018	adc	%rdx,$A0[1]
1019	mov	$A0[0],-8($tptr)	# t[3]
1020
1021	xor	$A1[0],$A1[0]
1022	add	$A0[1],$A1[1]
1023	adc	\$0,$A1[0]
1024	mul	$a1			# a[3]*a[1]
1025	add	%rax,$A1[1]
1026	 mov	-16($aptr),%rax		# a[2]
1027	adc	%rdx,$A1[0]
1028
1029	mov	$A1[1],($tptr)		# t[4]
1030	mov	$A1[0],8($tptr)		# t[5]
1031
1032	mul	$ai			# a[2]*a[3]
1033___
1034{
1035my ($shift,$carry)=($a0,$a1);
1036my @S=(@A1,$ai,$n0);
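# Shift-and-add phase: each pair t[2*i],t[2*i+1] of the cross-product sum is
# doubled by shifting it one bit to the left ($shift carries the shifted-out
# bit from one word pair to the next), while the 128-bit square a[i]*a[i] is
# added in, with $carry propagating the addition carry between pairs.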
1037$code.=<<___;
1038	 add	\$16,$i
1039	 xor	$shift,$shift
1040	 sub	$num,$i			# $i=16-$num
1041	 xor	$carry,$carry
1042
1043	add	$A1[0],%rax		# t[5]
1044	adc	\$0,%rdx
1045	mov	%rax,8($tptr)		# t[5]
1046	mov	%rdx,16($tptr)		# t[6]
1047	mov	$carry,24($tptr)	# t[7]
1048
1049	 mov	-16($aptr,$i),%rax	# a[0]
1050	lea	64(%rsp,$num,2),$tptr
1051	 xor	$A0[0],$A0[0]		# t[0]
1052	 mov	-24($tptr,$i,2),$A0[1]	# t[1]
1053
1054	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1055	shr	\$63,$A0[0]
1056	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1057	shr	\$63,$A0[1]
1058	or	$A0[0],$S[1]		# | t[2*i]>>63
1059	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1060	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1061	mul	%rax			# a[i]*a[i]
1062	neg	$carry			# mov $carry,cf
1063	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1064	adc	%rax,$S[0]
1065	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1066	mov	$S[0],-32($tptr,$i,2)
1067	adc	%rdx,$S[1]
1068
1069	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1070	 mov	$S[1],-24($tptr,$i,2)
1071	 sbb	$carry,$carry		# mov cf,$carry
1072	shr	\$63,$A0[0]
1073	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1074	shr	\$63,$A0[1]
1075	or	$A0[0],$S[3]		# | t[2*i]>>63
1076	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1077	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1078	mul	%rax			# a[i]*a[i]
1079	neg	$carry			# mov $carry,cf
1080	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1081	adc	%rax,$S[2]
1082	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1083	mov	$S[2],-16($tptr,$i,2)
1084	adc	%rdx,$S[3]
1085	lea	16($i),$i
1086	mov	$S[3],-40($tptr,$i,2)
1087	sbb	$carry,$carry		# mov cf,$carry
1088	jmp	.Lsqr4x_shift_n_add
1089
1090.align	16
1091.Lsqr4x_shift_n_add:
1092	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1093	shr	\$63,$A0[0]
1094	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1095	shr	\$63,$A0[1]
1096	or	$A0[0],$S[1]		# | t[2*i]>>63
1097	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1098	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1099	mul	%rax			# a[i]*a[i]
1100	neg	$carry			# mov $carry,cf
1101	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1102	adc	%rax,$S[0]
1103	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1104	mov	$S[0],-32($tptr,$i,2)
1105	adc	%rdx,$S[1]
1106
1107	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1108	 mov	$S[1],-24($tptr,$i,2)
1109	 sbb	$carry,$carry		# mov cf,$carry
1110	shr	\$63,$A0[0]
1111	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1112	shr	\$63,$A0[1]
1113	or	$A0[0],$S[3]		# | t[2*i]>>63
1114	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1115	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1116	mul	%rax			# a[i]*a[i]
1117	neg	$carry			# mov $carry,cf
1118	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1119	adc	%rax,$S[2]
1120	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1121	mov	$S[2],-16($tptr,$i,2)
1122	adc	%rdx,$S[3]
1123
1124	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1125	 mov	$S[3],-8($tptr,$i,2)
1126	 sbb	$carry,$carry		# mov cf,$carry
1127	shr	\$63,$A0[0]
1128	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1129	shr	\$63,$A0[1]
1130	or	$A0[0],$S[1]		# | t[2*i]>>63
1131	 mov	16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1132	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1133	mul	%rax			# a[i]*a[i]
1134	neg	$carry			# mov $carry,cf
1135	 mov	24($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1136	adc	%rax,$S[0]
1137	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
1138	mov	$S[0],0($tptr,$i,2)
1139	adc	%rdx,$S[1]
1140
1141	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1142	 mov	$S[1],8($tptr,$i,2)
1143	 sbb	$carry,$carry		# mov cf,$carry
1144	shr	\$63,$A0[0]
1145	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1146	shr	\$63,$A0[1]
1147	or	$A0[0],$S[3]		# | t[2*i]>>63
1148	 mov	32($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1149	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1150	mul	%rax			# a[i]*a[i]
1151	neg	$carry			# mov $carry,cf
1152	 mov	40($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1153	adc	%rax,$S[2]
1154	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
1155	mov	$S[2],16($tptr,$i,2)
1156	adc	%rdx,$S[3]
1157	mov	$S[3],24($tptr,$i,2)
1158	sbb	$carry,$carry		# mov cf,$carry
1159	add	\$32,$i
1160	jnz	.Lsqr4x_shift_n_add
1161
1162	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1163	shr	\$63,$A0[0]
1164	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1165	shr	\$63,$A0[1]
1166	or	$A0[0],$S[1]		# | t[2*i]>>63
1167	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1168	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1169	mul	%rax			# a[i]*a[i]
1170	neg	$carry			# mov $carry,cf
1171	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1172	adc	%rax,$S[0]
1173	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
1174	mov	$S[0],-32($tptr)
1175	adc	%rdx,$S[1]
1176
1177	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
1178	 mov	$S[1],-24($tptr)
1179	 sbb	$carry,$carry		# mov cf,$carry
1180	shr	\$63,$A0[0]
1181	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1182	shr	\$63,$A0[1]
1183	or	$A0[0],$S[3]		# | t[2*i]>>63
1184	mul	%rax			# a[i]*a[i]
1185	neg	$carry			# mov $carry,cf
1186	adc	%rax,$S[2]
1187	adc	%rdx,$S[3]
1188	mov	$S[2],-16($tptr)
1189	mov	$S[3],-8($tptr)
1190___
1191}
1192##############################################################
1193# Montgomery reduction part, "word-by-word" algorithm.
1194#
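# In the whole-number model sketched near the top of this file this phase is
# simply: for each of the num output words, m = t[0]*n0 mod 2^64, t += m*n,
# t >>= 64, with the running top bit ($topbit below) catching the carry out
# of the 2*num-word buffer; a final conditional subtraction in the
# post-condition section then brings the result below n.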
1195{
1196my ($topbit,$nptr)=("%rbp",$aptr);
1197my ($m0,$m1)=($a0,$a1);
1198my @Ni=("%rbx","%r9");
1199$code.=<<___;
1200	mov	40(%rsp),$nptr		# restore $nptr
1201	mov	48(%rsp),$n0		# restore *n0
1202	xor	$j,$j
1203	mov	$num,0(%rsp)		# save $num
1204	sub	$num,$j			# $j=-$num
1205	 mov	64(%rsp),$A0[0]		# t[0]		# modsched #
1206	 mov	$n0,$m0			#		# modsched #
1207	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
1208	lea	64(%rsp,$num),$tptr	# end of t[] window
1209	mov	%rax,8(%rsp)		# save end of t[] buffer
1210	lea	($nptr,$num),$nptr	# end of n[] buffer
1211	xor	$topbit,$topbit		# $topbit=0
1212
1213	mov	0($nptr,$j),%rax	# n[0]		# modsched #
1214	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
1215	 imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
1216	 mov	%rax,$Ni[0]		#		# modsched #
1217	jmp	.Lsqr4x_mont_outer
1218
1219.align	16
1220.Lsqr4x_mont_outer:
1221	xor	$A0[1],$A0[1]
1222	mul	$m0			# n[0]*m0
1223	add	%rax,$A0[0]		# n[0]*m0+t[0]
1224	 mov	$Ni[1],%rax
1225	adc	%rdx,$A0[1]
1226	mov	$n0,$m1
1227
1228	xor	$A0[0],$A0[0]
1229	add	8($tptr,$j),$A0[1]
1230	adc	\$0,$A0[0]
1231	mul	$m0			# n[1]*m0
1232	add	%rax,$A0[1]		# n[1]*m0+t[1]
1233	 mov	$Ni[0],%rax
1234	adc	%rdx,$A0[0]
1235
1236	imulq	$A0[1],$m1
1237
1238	mov	16($nptr,$j),$Ni[0]	# n[2]
1239	xor	$A1[1],$A1[1]
1240	add	$A0[1],$A1[0]
1241	adc	\$0,$A1[1]
1242	mul	$m1			# n[0]*m1
1243	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
1244	 mov	$Ni[0],%rax
1245	adc	%rdx,$A1[1]
1246	mov	$A1[0],8($tptr,$j)	# "t[1]"
1247
1248	xor	$A0[1],$A0[1]
1249	add	16($tptr,$j),$A0[0]
1250	adc	\$0,$A0[1]
1251	mul	$m0			# n[2]*m0
1252	add	%rax,$A0[0]		# n[2]*m0+t[2]
1253	 mov	$Ni[1],%rax
1254	adc	%rdx,$A0[1]
1255
1256	mov	24($nptr,$j),$Ni[1]	# n[3]
1257	xor	$A1[0],$A1[0]
1258	add	$A0[0],$A1[1]
1259	adc	\$0,$A1[0]
1260	mul	$m1			# n[1]*m1
1261	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
1262	 mov	$Ni[1],%rax
1263	adc	%rdx,$A1[0]
1264	mov	$A1[1],16($tptr,$j)	# "t[2]"
1265
1266	xor	$A0[0],$A0[0]
1267	add	24($tptr,$j),$A0[1]
1268	lea	32($j),$j
1269	adc	\$0,$A0[0]
1270	mul	$m0			# n[3]*m0
1271	add	%rax,$A0[1]		# n[3]*m0+t[3]
1272	 mov	$Ni[0],%rax
1273	adc	%rdx,$A0[0]
1274	jmp	.Lsqr4x_mont_inner
1275
1276.align	16
1277.Lsqr4x_mont_inner:
1278	mov	($nptr,$j),$Ni[0]	# n[4]
1279	xor	$A1[1],$A1[1]
1280	add	$A0[1],$A1[0]
1281	adc	\$0,$A1[1]
1282	mul	$m1			# n[2]*m1
1283	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
1284	 mov	$Ni[0],%rax
1285	adc	%rdx,$A1[1]
1286	mov	$A1[0],-8($tptr,$j)	# "t[3]"
1287
1288	xor	$A0[1],$A0[1]
1289	add	($tptr,$j),$A0[0]
1290	adc	\$0,$A0[1]
1291	mul	$m0			# n[4]*m0
1292	add	%rax,$A0[0]		# n[4]*m0+t[4]
1293	 mov	$Ni[1],%rax
1294	adc	%rdx,$A0[1]
1295
1296	mov	8($nptr,$j),$Ni[1]	# n[5]
1297	xor	$A1[0],$A1[0]
1298	add	$A0[0],$A1[1]
1299	adc	\$0,$A1[0]
1300	mul	$m1			# n[3]*m1
1301	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
1302	 mov	$Ni[1],%rax
1303	adc	%rdx,$A1[0]
1304	mov	$A1[1],($tptr,$j)	# "t[4]"
1305
1306	xor	$A0[0],$A0[0]
1307	add	8($tptr,$j),$A0[1]
1308	adc	\$0,$A0[0]
1309	mul	$m0			# n[5]*m0
1310	add	%rax,$A0[1]		# n[5]*m0+t[5]
1311	 mov	$Ni[0],%rax
1312	adc	%rdx,$A0[0]
1313
1314
1315	mov	16($nptr,$j),$Ni[0]	# n[6]
1316	xor	$A1[1],$A1[1]
1317	add	$A0[1],$A1[0]
1318	adc	\$0,$A1[1]
1319	mul	$m1			# n[4]*m1
1320	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
1321	 mov	$Ni[0],%rax
1322	adc	%rdx,$A1[1]
1323	mov	$A1[0],8($tptr,$j)	# "t[5]"
1324
1325	xor	$A0[1],$A0[1]
1326	add	16($tptr,$j),$A0[0]
1327	adc	\$0,$A0[1]
1328	mul	$m0			# n[6]*m0
1329	add	%rax,$A0[0]		# n[6]*m0+t[6]
1330	 mov	$Ni[1],%rax
1331	adc	%rdx,$A0[1]
1332
1333	mov	24($nptr,$j),$Ni[1]	# n[7]
1334	xor	$A1[0],$A1[0]
1335	add	$A0[0],$A1[1]
1336	adc	\$0,$A1[0]
1337	mul	$m1			# n[5]*m1
1338	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
1339	 mov	$Ni[1],%rax
1340	adc	%rdx,$A1[0]
1341	mov	$A1[1],16($tptr,$j)	# "t[6]"
1342
1343	xor	$A0[0],$A0[0]
1344	add	24($tptr,$j),$A0[1]
1345	lea	32($j),$j
1346	adc	\$0,$A0[0]
1347	mul	$m0			# n[7]*m0
1348	add	%rax,$A0[1]		# n[7]*m0+t[7]
1349	 mov	$Ni[0],%rax
1350	adc	%rdx,$A0[0]
1351	cmp	\$0,$j
1352	jne	.Lsqr4x_mont_inner
1353
1354	 sub	0(%rsp),$j		# $j=-$num	# modsched #
1355	 mov	$n0,$m0			#		# modsched #
1356
1357	xor	$A1[1],$A1[1]
1358	add	$A0[1],$A1[0]
1359	adc	\$0,$A1[1]
1360	mul	$m1			# n[6]*m1
1361	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
1362	mov	$Ni[1],%rax
1363	adc	%rdx,$A1[1]
1364	mov	$A1[0],-8($tptr)	# "t[7]"
1365
1366	xor	$A0[1],$A0[1]
1367	add	($tptr),$A0[0]		# +t[8]
1368	adc	\$0,$A0[1]
1369	 mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
1370	add	$topbit,$A0[0]
1371	adc	\$0,$A0[1]
1372
1373	 imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
1374	xor	$A1[0],$A1[0]
1375	 mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
1376	add	$A0[0],$A1[1]
1377	 mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
1378	adc	\$0,$A1[0]
1379	mul	$m1			# n[7]*m1
1380	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
1381	 mov	$Ni[0],%rax		#		# modsched #
1382	adc	%rdx,$A1[0]
1383	mov	$A1[1],($tptr)		# "t[8]"
1384
1385	xor	$topbit,$topbit
1386	add	8($tptr),$A1[0]		# +t[9]
1387	adc	$topbit,$topbit
1388	add	$A0[1],$A1[0]
1389	lea	16($tptr),$tptr		# "t[$num]>>128"
1390	adc	\$0,$topbit
1391	mov	$A1[0],-8($tptr)	# "t[9]"
1392	cmp	8(%rsp),$tptr		# are we done?
1393	jb	.Lsqr4x_mont_outer
1394
1395	mov	0(%rsp),$num		# restore $num
1396	mov	$topbit,($tptr)		# save $topbit
1397___
1398}
1399##############################################################
1400# Post-condition, 4x unrolled copy from bn_mul_mont
1401#
1402{
1403my ($tptr,$nptr)=("%rbx",$aptr);
1404my @ri=("%rax","%rdx","%r10","%r11");
1405$code.=<<___;
1406	mov	64(%rsp,$num),@ri[0]	# tp[0]
1407	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
1408	mov	40(%rsp),$nptr		# restore $nptr
1409	shr	\$5,$num		# num/4
1410	mov	8($tptr),@ri[1]		# t[1]
1411	xor	$i,$i			# i=0 and clear CF!
1412
1413	mov	32(%rsp),$rptr		# restore $rptr
1414	sub	0($nptr),@ri[0]
1415	mov	16($tptr),@ri[2]	# t[2]
1416	mov	24($tptr),@ri[3]	# t[3]
1417	sbb	8($nptr),@ri[1]
1418	lea	-1($num),$j		# j=num/4-1
1419	jmp	.Lsqr4x_sub
1420.align	16
1421.Lsqr4x_sub:
1422	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1423	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1424	sbb	16($nptr,$i,8),@ri[2]
1425	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
1426	mov	40($tptr,$i,8),@ri[1]
1427	sbb	24($nptr,$i,8),@ri[3]
1428	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1429	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1430	sbb	32($nptr,$i,8),@ri[0]
1431	mov	48($tptr,$i,8),@ri[2]
1432	mov	56($tptr,$i,8),@ri[3]
1433	sbb	40($nptr,$i,8),@ri[1]
1434	lea	4($i),$i		# i++
1435	dec	$j			# doesn't affect CF!
1436	jnz	.Lsqr4x_sub
1437
1438	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1439	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
1440	sbb	16($nptr,$i,8),@ri[2]
1441	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1442	sbb	24($nptr,$i,8),@ri[3]
1443	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1444
1445	sbb	\$0,@ri[0]		# handle upmost overflow bit
1446	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1447	xor	$i,$i			# i=0
1448	and	@ri[0],$tptr
1449	not	@ri[0]
1450	mov	$rptr,$nptr
1451	and	@ri[0],$nptr
1452	lea	-1($num),$j
1453	or	$nptr,$tptr		# tp=borrow?tp:rp
1454
1455	pxor	%xmm0,%xmm0
1456	lea	64(%rsp,$num,8),$nptr
1457	movdqu	($tptr),%xmm1
1458	lea	($nptr,$num,8),$nptr
1459	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
1460	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
1461	movdqu	%xmm1,($rptr)
1462	jmp	.Lsqr4x_copy
1463.align	16
1464.Lsqr4x_copy:				# copy or in-place refresh
1465	movdqu	16($tptr,$i),%xmm2
1466	movdqu	32($tptr,$i),%xmm1
1467	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
1468	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
1469	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
1470	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
1471	movdqu	%xmm2,16($rptr,$i)
1472	movdqu	%xmm1,32($rptr,$i)
1473	lea	32($i),$i
1474	dec	$j
1475	jnz	.Lsqr4x_copy
1476
1477	movdqu	16($tptr,$i),%xmm2
1478	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
1479	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
1480	movdqu	%xmm2,16($rptr,$i)
1481___
1482}
1483$code.=<<___;
1484	mov	56(%rsp),%rsi		# restore %rsp
1485	mov	\$1,%rax
1486	mov	0(%rsi),%r15
1487	mov	8(%rsi),%r14
1488	mov	16(%rsi),%r13
1489	mov	24(%rsi),%r12
1490	mov	32(%rsi),%rbp
1491	mov	40(%rsi),%rbx
1492	lea	48(%rsi),%rsp
1493.Lsqr4x_epilogue:
1494	ret
1495.size	bn_sqr4x_mont,.-bn_sqr4x_mont
1496___
1497}}}
1498$code.=<<___;
1499.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1500.align	16
1501___
1502
1503print $code;
1504close STDOUT;
1505