#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to a powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scattering/gathering can be tuned without
# modifying bn_exp.c.
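#
# The gather is cache-neutral because it never indexes the table with
# the secret value: a 256-byte mask is precomputed on the stack, with
# all-ones words covering the entries that match the index, and every
# table entry is then loaded, ANDed with its mask word and ORed into
# an accumulator. Each gather therefore touches every cache line of
# the table, whatever the index.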

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such a manner that b[0] is $bp[idx],
				# b[1] is [2^5+idx], etc.
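				# E.g. with idx=3, the limbs of the
				# chosen power are $bp[3], $bp[35],
				# $bp[67], etc.: one limb out of each
				# 32-entry stride.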
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	_CET_ENDBR
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	mov	${num}d,${num}d
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	lea	.Linc(%rip),%r10
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

.Lmul_alloca:
	mov	%rsp,%rax
	lea	2($num),%r11
	neg	%r11
	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	lea	128($bp),%r12		# reassign $bp (+size optimization)
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
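		# $STRIDE is 256 bytes: the table interleaves the 2**5
		# powers, so consecutive limbs of one power sit 32
		# entries apart; $N is the 64-byte cache-line step.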
$code.=<<___;
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	24-112(%rsp,$num,8),%r10	# place the mask after tp[num+3] (+ICache optimization)
	and	\$-16,%r10

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
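# (pcmpeqd leaves all-ones in every 32-bit lane equal to the broadcast
# index and all-zeroes elsewhere, so each 128-bit result masks two
# adjacent 8-byte table entries; sixteen such masks cover all 32.)
#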
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register

	pand	`16*($k+1)-128`($bp),%xmm1
	pand	`16*($k+2)-128`($bp),%xmm2
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	pand	`16*($k+3)-128`($bp),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm4
	movdqa	`16*($k+1)-128`($bp),%xmm5
	movdqa	`16*($k+2)-128`($bp),%xmm2
	pand	`16*($k+0)+112`(%r10),%xmm4
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($k+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($k+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	por	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bp),$bp
	movd	%xmm0,$m0		# m0=bp[0]
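					# (the pshufd swapped the two
					# qwords of %xmm0, so the last
					# por folds the selected entry
					# into the low 64 bits)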

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1
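					# (Montgomery step: m1 is chosen
					# so that tp[0]+np[0]*m1 is
					# divisible by 2^64, hence only
					# the carry in $hi1 is kept)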

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jl	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
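					# (tp needs num+1 limbs: the
					# intermediate result may reach
					# twice the modulus)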

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
	and	\$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($k=0;$k<$STRIDE/16;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm0
	movdqa	`16*($k+1)-128`($bp),%xmm1
	movdqa	`16*($k+2)-128`($bp),%xmm2
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+0)-128`(%rdx),%xmm0
	pand	`16*($k+1)-128`(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($k+2)-128`(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($k+3)-128`(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bp),$bp
	movd	%xmm0,$m0		# m0=bp[i]

	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jl	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
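					# (%rax is all-ones if the
					# subtraction borrowed and zero
					# otherwise, so the and/not/and/or
					# sequence selects the pointer
					# without a branch)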
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy
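					# (zapping tp ensures no secret
					# intermediate values linger on
					# the stack)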

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax

	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	16
bn_mul4x_mont_gather5:
	_CET_ENDBR
.Lmul4x_enter:
	mov	${num}d,${num}d
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	lea	.Linc(%rip),%r10
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

.Lmul4x_alloca:
	mov	%rsp,%rax
	lea	4($num),%r11
	neg	%r11
	lea	-256(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4)+256)
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	lea	128(%rdx),%r12		# reassign $bp (+size optimization)
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	32-112(%rsp,$num,8),%r10	# place the mask after tp[num+4] (+ICache optimization)

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	.byte	0x67,0x67
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register

	pand	`16*($k+1)-128`($bp),%xmm1
	pand	`16*($k+2)-128`($bp),%xmm2
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	pand	`16*($k+3)-128`($bp),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm4
	movdqa	`16*($k+1)-128`($bp),%xmm5
	movdqa	`16*($k+2)-128`($bp),%xmm2
	pand	`16*($k+0)+112`(%r10),%xmm4
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($k+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($k+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	por	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bp),$bp
	movd	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	lea	32+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($k=0;$k<$STRIDE/16;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm0
	movdqa	`16*($k+1)-128`($bp),%xmm1
	movdqa	`16*($k+2)-128`($bp),%xmm2
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+0)-128`(%rdx),%xmm0
	pand	`16*($k+1)-128`(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($k+2)-128`(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($k+3)-128`(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bp),$bp
	movd	%xmm0,$m0		# m0=bp[i]

	xor	$j,$j			# j=0

	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-40(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jl	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i+=4
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:					# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax

	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
___
}}}

{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order
				("%rdi","%rsi","%rdx","%ecx"); # Unix order
my $out=$inp;
my $STRIDE=2**5*8;
my $N=$STRIDE/4;

$code.=<<___;
.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
bn_scatter5:
	_CET_ENDBR
	cmp	\$0, $num
	jz	.Lscatter_epilogue
	lea	($tbl,$idx,8),$tbl
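					# limb i of power $idx is stored
					# at $tbl[32*i+$idx], the
					# interlaced layout the gather
					# code expects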
.Lscatter:
	mov	($inp),%rax
	lea	8($inp),$inp
	mov	%rax,($tbl)
	lea	32*8($tbl),$tbl
	sub	\$1,$num
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.size	bn_scatter5,.-bn_scatter5

.globl	bn_gather5
.type	bn_gather5,\@abi-omnipotent
.align	16
bn_gather5:
	_CET_ENDBR
.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
	# I can't trust the assembler to use a specific encoding :-(
	.byte	0x4c,0x8d,0x14,0x24			# lea    (%rsp),%r10
	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# sub	$0x108,%rsp
	lea	.Linc(%rip),%rax
	and	\$-16,%rsp		# shouldn't be formally required
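					# (0x108 = 256 bytes for the mask
					# plus 8 to restore 16-byte stack
					# alignment after the call)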

	movd	$idx,%xmm5
	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
	lea	128($tbl),%r11		# size optimization
	lea	128(%rsp),%rax		# size optimization

	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to $idx and save result to stack
#
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
___
$code.=<<___	if ($i);
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
___
$code.=<<___;
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($i+0)-128`(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($i+1)-128`(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($i+2)-128`(%rax)
	movdqa	%xmm4,%xmm2
___
}
$code.=<<___;
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	movdqa	`16*($i+0)-128`(%r11),%xmm0
	movdqa	`16*($i+1)-128`(%r11),%xmm1
	movdqa	`16*($i+2)-128`(%r11),%xmm2
	pand	`16*($i+0)-128`(%rax),%xmm0
	movdqa	`16*($i+3)-128`(%r11),%xmm3
	pand	`16*($i+1)-128`(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($i+2)-128`(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($i+3)-128`(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	lea	$STRIDE(%r11),%r11
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
	sub	\$1,$num
	jnz	.Lgather

	lea	(%r10),%rsp
	ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
___
}
$code.=<<___;
.section .rodata
.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
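	# %xmm0 starts as {0,0,1,1} and %xmm1 as {2,2,2,2}; repeated
	# paddd generates the dword pairs {2,2,3,3}, {4,4,5,5}, ...
	# i.e. the values 0..31 the index is compared against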
.text
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	_CET_ENDBR
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	lea	48(%rax),%rax

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# end of alloca label
	cmp	%r10,%rbx		# context->Rip<end of alloca label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)/8
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0b,0x03,0x0a
	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp), set_frame r10
.align	8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT;