1#!/usr/bin/env perl
2
3##############################################################################
4#                                                                            #
5#  Copyright (c) 2012, Intel Corporation                                     #
6#                                                                            #
7#  All rights reserved.                                                      #
8#                                                                            #
9#  Redistribution and use in source and binary forms, with or without        #
10#  modification, are permitted provided that the following conditions are    #
11#  met:                                                                      #
12#                                                                            #
13#  *  Redistributions of source code must retain the above copyright         #
14#     notice, this list of conditions and the following disclaimer.          #
15#                                                                            #
16#  *  Redistributions in binary form must reproduce the above copyright      #
17#     notice, this list of conditions and the following disclaimer in the    #
18#     documentation and/or other materials provided with the                 #
19#     distribution.                                                          #
20#                                                                            #
21#  *  Neither the name of the Intel Corporation nor the names of its         #
22#     contributors may be used to endorse or promote products derived from   #
23#     this software without specific prior written permission.               #
24#                                                                            #
25#                                                                            #
26#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
27#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
28#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
29#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
30#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
31#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
32#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
33#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
34#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
35#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
36#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
37#                                                                            #
38##############################################################################
39# Developers and authors:                                                    #
40# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
41# (1) Intel Architecture Group, Microprocessor and Chipset Development,      #
42#     Israel Development Center, Haifa, Israel                               #
43# (2) University of Haifa                                                    #
44##############################################################################
45# Reference:                                                                 #
46# [1] S. Gueron, "Efficient Software Implementations of Modular              #
47#     Exponentiation", http://eprint.iacr.org/2011/239                       #
48# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
49#     IEEE Proceedings of 9th International Conference on Information        #
50#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
51# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
52#     Journal of Cryptographic Engineering 2:31-43 (2012).                   #
53# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
54#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
55#     RSA1024 and RSA2048 on x86_64 platforms",                              #
56#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
57##############################################################################
58
59# While original submission covers 512- and 1024-bit exponentiation,
60# this module is limited to 512-bit version only (and as such
61# accelerates RSA1024 sign). This is because improvement for longer
62# keys is not high enough to justify the effort, highest measured
63# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
64# for the moment of this writing!] Nor does this module implement
65# "monolithic" complete exponentiation jumbo-subroutine, but adheres
66# to more modular mixture of C and assembly. And it's optimized even
67# for processors other than Intel Core family (see table below for
68# improvement coefficients).
69# 						<appro@openssl.org>
70#
71# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
72#			----------------+---------------------------
73# Opteron		+13%		|+5%		+20%
74# Bulldozer		-0%		|-1%		+10%
75# P4			+11%		|+7%		+8%
76# Westmere		+5%		|+14%		+17%
77# Sandy Bridge		+2%		|+12%		+29%
78# Ivy Bridge		+1%		|+11%		+35%
79# Haswell(**)		-0%		|+12%		+39%
80# Atom			+13%		|+11%		+4%
81# VIA Nano		+70%		|+9%		+25%
82#
83# (*)	rsax engine and fips numbers are presented for reference
84#	purposes;
85# (**)	MULX was attempted, but found to give only marginal improvement;
86
87$flavour = shift;
88$output  = shift;
89if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
90
91$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
92
93$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
94( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
95( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
96die "can't locate x86_64-xlate.pl";
97
98open OUT,"| \"$^X\" $xlate $flavour $output";
99*STDOUT=*OUT;
100
101if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
102		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
103	$addx = ($1>=2.23);
104}
105
106if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
107	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
108	$addx = ($1>=2.10);
109}
110
111if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
112	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
113	$addx = ($1>=12);
114}
115
116if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
117	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
118	$addx = ($ver>=3.03);
119}
120
121($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
122{
123my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
124
125$code.=<<___;
126.text
127
128.extern	OPENSSL_ia32cap_P
129
130.globl	rsaz_512_sqr
131.type	rsaz_512_sqr,\@function,5
132.align	32
133rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
134	push	%rbx
135	push	%rbp
136	push	%r12
137	push	%r13
138	push	%r14
139	push	%r15
140
141	subq	\$128+24, %rsp
142.Lsqr_body:
143	movq	$mod, %rbp		# common argument
144	movq	($inp), %rdx
145	movq	8($inp), %rax
146	movq	$n0, 128(%rsp)
147___
148$code.=<<___ if ($addx);
149	movl	\$0x80100,%r11d
150	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
151	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
152	je	.Loop_sqrx
153___
154$code.=<<___;
155	jmp	.Loop_sqr
156
157.align	32
158.Loop_sqr:
159	movl	$times,128+8(%rsp)
160#first iteration
161	movq	%rdx, %rbx
162	mulq	%rdx
163	movq	%rax, %r8
164	movq	16($inp), %rax
165	movq	%rdx, %r9
166
167	mulq	%rbx
168	addq	%rax, %r9
169	movq	24($inp), %rax
170	movq	%rdx, %r10
171	adcq	\$0, %r10
172
173	mulq	%rbx
174	addq	%rax, %r10
175	movq	32($inp), %rax
176	movq	%rdx, %r11
177	adcq	\$0, %r11
178
179	mulq	%rbx
180	addq	%rax, %r11
181	movq	40($inp), %rax
182	movq	%rdx, %r12
183	adcq	\$0, %r12
184
185	mulq	%rbx
186	addq	%rax, %r12
187	movq	48($inp), %rax
188	movq	%rdx, %r13
189	adcq	\$0, %r13
190
191	mulq	%rbx
192	addq	%rax, %r13
193	movq	56($inp), %rax
194	movq	%rdx, %r14
195	adcq	\$0, %r14
196
197	mulq	%rbx
198	addq	%rax, %r14
199	movq	%rbx, %rax
200	movq	%rdx, %r15
201	adcq	\$0, %r15
202
203	addq	%r8, %r8		#shlq	\$1, %r8
204	movq	%r9, %rcx
205	adcq	%r9, %r9		#shld	\$1, %r8, %r9
206
207	mulq	%rax
208	movq	%rax, (%rsp)
209	addq	%rdx, %r8
210	adcq	\$0, %r9
211
212	movq	%r8, 8(%rsp)
213	shrq	\$63, %rcx
214
215#second iteration
216	movq	8($inp), %r8
217	movq	16($inp), %rax
218	mulq	%r8
219	addq	%rax, %r10
220	movq	24($inp), %rax
221	movq	%rdx, %rbx
222	adcq	\$0, %rbx
223
224	mulq	%r8
225	addq	%rax, %r11
226	movq	32($inp), %rax
227	adcq	\$0, %rdx
228	addq	%rbx, %r11
229	movq	%rdx, %rbx
230	adcq	\$0, %rbx
231
232	mulq	%r8
233	addq	%rax, %r12
234	movq	40($inp), %rax
235	adcq	\$0, %rdx
236	addq	%rbx, %r12
237	movq	%rdx, %rbx
238	adcq	\$0, %rbx
239
240	mulq	%r8
241	addq	%rax, %r13
242	movq	48($inp), %rax
243	adcq	\$0, %rdx
244	addq	%rbx, %r13
245	movq	%rdx, %rbx
246	adcq	\$0, %rbx
247
248	mulq	%r8
249	addq	%rax, %r14
250	movq	56($inp), %rax
251	adcq	\$0, %rdx
252	addq	%rbx, %r14
253	movq	%rdx, %rbx
254	adcq	\$0, %rbx
255
256	mulq	%r8
257	addq	%rax, %r15
258	movq	%r8, %rax
259	adcq	\$0, %rdx
260	addq	%rbx, %r15
261	movq	%rdx, %r8
262	movq	%r10, %rdx
263	adcq	\$0, %r8
264
265	add	%rdx, %rdx
266	lea	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
267	movq	%r11, %rbx
268	adcq	%r11, %r11		#shld	\$1, %r10, %r11
269
270	mulq	%rax
271	addq	%rax, %r9
272	adcq	%rdx, %r10
273	adcq	\$0, %r11
274
275	movq	%r9, 16(%rsp)
276	movq	%r10, 24(%rsp)
277	shrq	\$63, %rbx
278
279#third iteration
280	movq	16($inp), %r9
281	movq	24($inp), %rax
282	mulq	%r9
283	addq	%rax, %r12
284	movq	32($inp), %rax
285	movq	%rdx, %rcx
286	adcq	\$0, %rcx
287
288	mulq	%r9
289	addq	%rax, %r13
290	movq	40($inp), %rax
291	adcq	\$0, %rdx
292	addq	%rcx, %r13
293	movq	%rdx, %rcx
294	adcq	\$0, %rcx
295
296	mulq	%r9
297	addq	%rax, %r14
298	movq	48($inp), %rax
299	adcq	\$0, %rdx
300	addq	%rcx, %r14
301	movq	%rdx, %rcx
302	adcq	\$0, %rcx
303
304	mulq	%r9
305	 movq	%r12, %r10
306	 lea	(%rbx,%r12,2), %r12	#shld	\$1, %rbx, %r12
307	addq	%rax, %r15
308	movq	56($inp), %rax
309	adcq	\$0, %rdx
310	addq	%rcx, %r15
311	movq	%rdx, %rcx
312	adcq	\$0, %rcx
313
314	mulq	%r9
315	 shrq	\$63, %r10
316	addq	%rax, %r8
317	movq	%r9, %rax
318	adcq	\$0, %rdx
319	addq	%rcx, %r8
320	movq	%rdx, %r9
321	adcq	\$0, %r9
322
323	movq	%r13, %rcx
324	leaq	(%r10,%r13,2), %r13	#shld	\$1, %r12, %r13
325
326	mulq	%rax
327	addq	%rax, %r11
328	adcq	%rdx, %r12
329	adcq	\$0, %r13
330
331	movq	%r11, 32(%rsp)
332	movq	%r12, 40(%rsp)
333	shrq	\$63, %rcx
334
335#fourth iteration
336	movq	24($inp), %r10
337	movq	32($inp), %rax
338	mulq	%r10
339	addq	%rax, %r14
340	movq	40($inp), %rax
341	movq	%rdx, %rbx
342	adcq	\$0, %rbx
343
344	mulq	%r10
345	addq	%rax, %r15
346	movq	48($inp), %rax
347	adcq	\$0, %rdx
348	addq	%rbx, %r15
349	movq	%rdx, %rbx
350	adcq	\$0, %rbx
351
352	mulq	%r10
353	 movq	%r14, %r12
354	 leaq	(%rcx,%r14,2), %r14	#shld	\$1, %rcx, %r14
355	addq	%rax, %r8
356	movq	56($inp), %rax
357	adcq	\$0, %rdx
358	addq	%rbx, %r8
359	movq	%rdx, %rbx
360	adcq	\$0, %rbx
361
362	mulq	%r10
363	 shrq	\$63, %r12
364	addq	%rax, %r9
365	movq	%r10, %rax
366	adcq	\$0, %rdx
367	addq	%rbx, %r9
368	movq	%rdx, %r10
369	adcq	\$0, %r10
370
371	movq	%r15, %rbx
372	leaq	(%r12,%r15,2),%r15	#shld	\$1, %r14, %r15
373
374	mulq	%rax
375	addq	%rax, %r13
376	adcq	%rdx, %r14
377	adcq	\$0, %r15
378
379	movq	%r13, 48(%rsp)
380	movq	%r14, 56(%rsp)
381	shrq	\$63, %rbx
382
383#fifth iteration
384	movq	32($inp), %r11
385	movq	40($inp), %rax
386	mulq	%r11
387	addq	%rax, %r8
388	movq	48($inp), %rax
389	movq	%rdx, %rcx
390	adcq	\$0, %rcx
391
392	mulq	%r11
393	addq	%rax, %r9
394	movq	56($inp), %rax
395	adcq	\$0, %rdx
396	 movq	%r8, %r12
397	 leaq	(%rbx,%r8,2), %r8	#shld	\$1, %rbx, %r8
398	addq	%rcx, %r9
399	movq	%rdx, %rcx
400	adcq	\$0, %rcx
401
402	mulq	%r11
403	 shrq	\$63, %r12
404	addq	%rax, %r10
405	movq	%r11, %rax
406	adcq	\$0, %rdx
407	addq	%rcx, %r10
408	movq	%rdx, %r11
409	adcq	\$0, %r11
410
411	movq	%r9, %rcx
412	leaq	(%r12,%r9,2), %r9	#shld	\$1, %r8, %r9
413
414	mulq	%rax
415	addq	%rax, %r15
416	adcq	%rdx, %r8
417	adcq	\$0, %r9
418
419	movq	%r15, 64(%rsp)
420	movq	%r8, 72(%rsp)
421	shrq	\$63, %rcx
422
423#sixth iteration
424	movq	40($inp), %r12
425	movq	48($inp), %rax
426	mulq	%r12
427	addq	%rax, %r10
428	movq	56($inp), %rax
429	movq	%rdx, %rbx
430	adcq	\$0, %rbx
431
432	mulq	%r12
433	addq	%rax, %r11
434	movq	%r12, %rax
435	 movq	%r10, %r15
436	 leaq	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
437	adcq	\$0, %rdx
438	 shrq	\$63, %r15
439	addq	%rbx, %r11
440	movq	%rdx, %r12
441	adcq	\$0, %r12
442
443	movq	%r11, %rbx
444	leaq	(%r15,%r11,2), %r11	#shld	\$1, %r10, %r11
445
446	mulq	%rax
447	addq	%rax, %r9
448	adcq	%rdx, %r10
449	adcq	\$0, %r11
450
451	movq	%r9, 80(%rsp)
452	movq	%r10, 88(%rsp)
453
454#seventh iteration
455	movq	48($inp), %r13
456	movq	56($inp), %rax
457	mulq	%r13
458	addq	%rax, %r12
459	movq	%r13, %rax
460	movq	%rdx, %r13
461	adcq	\$0, %r13
462
463	xorq	%r14, %r14
464	shlq	\$1, %rbx
465	adcq	%r12, %r12		#shld	\$1, %rbx, %r12
466	adcq	%r13, %r13		#shld	\$1, %r12, %r13
467	adcq	%r14, %r14		#shld	\$1, %r13, %r14
468
469	mulq	%rax
470	addq	%rax, %r11
471	adcq	%rdx, %r12
472	adcq	\$0, %r13
473
474	movq	%r11, 96(%rsp)
475	movq	%r12, 104(%rsp)
476
477#eighth iteration
478	movq	56($inp), %rax
479	mulq	%rax
480	addq	%rax, %r13
481	adcq	\$0, %rdx
482
483	addq	%rdx, %r14
484
485	movq	%r13, 112(%rsp)
486	movq	%r14, 120(%rsp)
487
488	movq	(%rsp), %r8
489	movq	8(%rsp), %r9
490	movq	16(%rsp), %r10
491	movq	24(%rsp), %r11
492	movq	32(%rsp), %r12
493	movq	40(%rsp), %r13
494	movq	48(%rsp), %r14
495	movq	56(%rsp), %r15
496
497	call	__rsaz_512_reduce
498
499	addq	64(%rsp), %r8
500	adcq	72(%rsp), %r9
501	adcq	80(%rsp), %r10
502	adcq	88(%rsp), %r11
503	adcq	96(%rsp), %r12
504	adcq	104(%rsp), %r13
505	adcq	112(%rsp), %r14
506	adcq	120(%rsp), %r15
507	sbbq	%rcx, %rcx
508
509	call	__rsaz_512_subtract
510
511	movq	%r8, %rdx
512	movq	%r9, %rax
513	movl	128+8(%rsp), $times
514	movq	$out, $inp
515
516	decl	$times
517	jnz	.Loop_sqr
518___
519if ($addx) {
520$code.=<<___;
521	jmp	.Lsqr_tail
522
523.align	32
524.Loop_sqrx:
525	movl	$times,128+8(%rsp)
526	movq	$out, %xmm0		# off-load
527	movq	%rbp, %xmm1		# off-load
528#first iteration
529	mulx	%rax, %r8, %r9
530
531	mulx	16($inp), %rcx, %r10
532	xor	%rbp, %rbp		# cf=0, of=0
533
534	mulx	24($inp), %rax, %r11
535	adcx	%rcx, %r9
536
537	mulx	32($inp), %rcx, %r12
538	adcx	%rax, %r10
539
540	mulx	40($inp), %rax, %r13
541	adcx	%rcx, %r11
542
543	.byte	0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($inp), %rcx, %r14
544	adcx	%rax, %r12
545	adcx	%rcx, %r13
546
547	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r15
548	adcx	%rax, %r14
549	adcx	%rbp, %r15		# %rbp is 0
550
551	mov	%r9, %rcx
552	shld	\$1, %r8, %r9
553	shl	\$1, %r8
554
555	xor	%ebp, %ebp
556	mulx	%rdx, %rax, %rdx
557	adcx	%rdx, %r8
558	 mov	8($inp), %rdx
559	adcx	%rbp, %r9
560
561	mov	%rax, (%rsp)
562	mov	%r8, 8(%rsp)
563
564#second iteration
565	mulx	16($inp), %rax, %rbx
566	adox	%rax, %r10
567	adcx	%rbx, %r11
568
569	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r8
570	adox	$out, %r11
571	adcx	%r8, %r12
572
573	mulx	32($inp), %rax, %rbx
574	adox	%rax, %r12
575	adcx	%rbx, %r13
576
577	mulx	40($inp), $out, %r8
578	adox	$out, %r13
579	adcx	%r8, %r14
580
581	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
582	adox	%rax, %r14
583	adcx	%rbx, %r15
584
585	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
586	adox	$out, %r15
587	adcx	%rbp, %r8
588	adox	%rbp, %r8
589
590	mov	%r11, %rbx
591	shld	\$1, %r10, %r11
592	shld	\$1, %rcx, %r10
593
594	xor	%ebp,%ebp
595	mulx	%rdx, %rax, %rcx
596	 mov	16($inp), %rdx
597	adcx	%rax, %r9
598	adcx	%rcx, %r10
599	adcx	%rbp, %r11
600
601	mov	%r9, 16(%rsp)
602	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)
603
604#third iteration
605	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r9
606	adox	$out, %r12
607	adcx	%r9, %r13
608
609	mulx	32($inp), %rax, %rcx
610	adox	%rax, %r13
611	adcx	%rcx, %r14
612
613	mulx	40($inp), $out, %r9
614	adox	$out, %r14
615	adcx	%r9, %r15
616
617	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
618	adox	%rax, %r15
619	adcx	%rcx, %r8
620
621	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r9
622	adox	$out, %r8
623	adcx	%rbp, %r9
624	adox	%rbp, %r9
625
626	mov	%r13, %rcx
627	shld	\$1, %r12, %r13
628	shld	\$1, %rbx, %r12
629
630	xor	%ebp, %ebp
631	mulx	%rdx, %rax, %rdx
632	adcx	%rax, %r11
633	adcx	%rdx, %r12
634	 mov	24($inp), %rdx
635	adcx	%rbp, %r13
636
637	mov	%r11, 32(%rsp)
638	.byte	0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00		# mov	%r12, 40(%rsp)
639
640#fourth iteration
641	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00	# mulx	32($inp), %rax, %rbx
642	adox	%rax, %r14
643	adcx	%rbx, %r15
644
645	mulx	40($inp), $out, %r10
646	adox	$out, %r15
647	adcx	%r10, %r8
648
649	mulx	48($inp), %rax, %rbx
650	adox	%rax, %r8
651	adcx	%rbx, %r9
652
653	mulx	56($inp), $out, %r10
654	adox	$out, %r9
655	adcx	%rbp, %r10
656	adox	%rbp, %r10
657
658	.byte	0x66
659	mov	%r15, %rbx
660	shld	\$1, %r14, %r15
661	shld	\$1, %rcx, %r14
662
663	xor	%ebp, %ebp
664	mulx	%rdx, %rax, %rdx
665	adcx	%rax, %r13
666	adcx	%rdx, %r14
667	 mov	32($inp), %rdx
668	adcx	%rbp, %r15
669
670	mov	%r13, 48(%rsp)
671	mov	%r14, 56(%rsp)
672
673#fifth iteration
674	.byte	0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r11
675	adox	$out, %r8
676	adcx	%r11, %r9
677
678	mulx	48($inp), %rax, %rcx
679	adox	%rax, %r9
680	adcx	%rcx, %r10
681
682	mulx	56($inp), $out, %r11
683	adox	$out, %r10
684	adcx	%rbp, %r11
685	adox	%rbp, %r11
686
687	mov	%r9, %rcx
688	shld	\$1, %r8, %r9
689	shld	\$1, %rbx, %r8
690
691	xor	%ebp, %ebp
692	mulx	%rdx, %rax, %rdx
693	adcx	%rax, %r15
694	adcx	%rdx, %r8
695	 mov	40($inp), %rdx
696	adcx	%rbp, %r9
697
698	mov	%r15, 64(%rsp)
699	mov	%r8, 72(%rsp)
700
701#sixth iteration
702	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
703	adox	%rax, %r10
704	adcx	%rbx, %r11
705
706	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
707	adox	$out, %r11
708	adcx	%rbp, %r12
709	adox	%rbp, %r12
710
711	mov	%r11, %rbx
712	shld	\$1, %r10, %r11
713	shld	\$1, %rcx, %r10
714
715	xor	%ebp, %ebp
716	mulx	%rdx, %rax, %rdx
717	adcx	%rax, %r9
718	adcx	%rdx, %r10
719	 mov	48($inp), %rdx
720	adcx	%rbp, %r11
721
722	mov	%r9, 80(%rsp)
723	mov	%r10, 88(%rsp)
724
725#seventh iteration
726	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
727	adox	%rax, %r12
728	adox	%rbp, %r13
729
730	xor	%r14, %r14
731	shld	\$1, %r13, %r14
732	shld	\$1, %r12, %r13
733	shld	\$1, %rbx, %r12
734
735	xor	%ebp, %ebp
736	mulx	%rdx, %rax, %rdx
737	adcx	%rax, %r11
738	adcx	%rdx, %r12
739	 mov	56($inp), %rdx
740	adcx	%rbp, %r13
741
742	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
743	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)
744
745#eighth iteration
746	mulx	%rdx, %rax, %rdx
747	adox	%rax, %r13
748	adox	%rbp, %rdx
749
750	.byte	0x66
751	add	%rdx, %r14
752
753	movq	%r13, 112(%rsp)
754	movq	%r14, 120(%rsp)
755	movq	%xmm0, $out
756	movq	%xmm1, %rbp
757
758	movq	128(%rsp), %rdx		# pull $n0
759	movq	(%rsp), %r8
760	movq	8(%rsp), %r9
761	movq	16(%rsp), %r10
762	movq	24(%rsp), %r11
763	movq	32(%rsp), %r12
764	movq	40(%rsp), %r13
765	movq	48(%rsp), %r14
766	movq	56(%rsp), %r15
767
768	call	__rsaz_512_reducex
769
770	addq	64(%rsp), %r8
771	adcq	72(%rsp), %r9
772	adcq	80(%rsp), %r10
773	adcq	88(%rsp), %r11
774	adcq	96(%rsp), %r12
775	adcq	104(%rsp), %r13
776	adcq	112(%rsp), %r14
777	adcq	120(%rsp), %r15
778	sbbq	%rcx, %rcx
779
780	call	__rsaz_512_subtract
781
782	movq	%r8, %rdx
783	movq	%r9, %rax
784	movl	128+8(%rsp), $times
785	movq	$out, $inp
786
787	decl	$times
788	jnz	.Loop_sqrx
789
790.Lsqr_tail:
791___
792}
793$code.=<<___;
794
795	leaq	128+24+48(%rsp), %rax
796	movq	-48(%rax), %r15
797	movq	-40(%rax), %r14
798	movq	-32(%rax), %r13
799	movq	-24(%rax), %r12
800	movq	-16(%rax), %rbp
801	movq	-8(%rax), %rbx
802	leaq	(%rax), %rsp
803.Lsqr_epilogue:
804	ret
805.size	rsaz_512_sqr,.-rsaz_512_sqr
806___
807}
808{
809my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
810$code.=<<___;
811.globl	rsaz_512_mul
812.type	rsaz_512_mul,\@function,5
813.align	32
814rsaz_512_mul:
815	push	%rbx
816	push	%rbp
817	push	%r12
818	push	%r13
819	push	%r14
820	push	%r15
821
822	subq	\$128+24, %rsp
823.Lmul_body:
824	movq	$out, %xmm0		# off-load arguments
825	movq	$mod, %xmm1
826	movq	$n0, 128(%rsp)
827___
828$code.=<<___ if ($addx);
829	movl	\$0x80100,%r11d
830	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
831	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
832	je	.Lmulx
833___
834$code.=<<___;
835	movq	($bp), %rbx		# pass b[0]
836	movq	$bp, %rbp		# pass argument
837	call	__rsaz_512_mul
838
839	movq	%xmm0, $out
840	movq	%xmm1, %rbp
841
842	movq	(%rsp), %r8
843	movq	8(%rsp), %r9
844	movq	16(%rsp), %r10
845	movq	24(%rsp), %r11
846	movq	32(%rsp), %r12
847	movq	40(%rsp), %r13
848	movq	48(%rsp), %r14
849	movq	56(%rsp), %r15
850
851	call	__rsaz_512_reduce
852___
853$code.=<<___ if ($addx);
854	jmp	.Lmul_tail
855
856.align	32
857.Lmulx:
858	movq	$bp, %rbp		# pass argument
859	movq	($bp), %rdx		# pass b[0]
860	call	__rsaz_512_mulx
861
862	movq	%xmm0, $out
863	movq	%xmm1, %rbp
864
865	movq	128(%rsp), %rdx		# pull $n0
866	movq	(%rsp), %r8
867	movq	8(%rsp), %r9
868	movq	16(%rsp), %r10
869	movq	24(%rsp), %r11
870	movq	32(%rsp), %r12
871	movq	40(%rsp), %r13
872	movq	48(%rsp), %r14
873	movq	56(%rsp), %r15
874
875	call	__rsaz_512_reducex
876.Lmul_tail:
877___
878$code.=<<___;
879	addq	64(%rsp), %r8
880	adcq	72(%rsp), %r9
881	adcq	80(%rsp), %r10
882	adcq	88(%rsp), %r11
883	adcq	96(%rsp), %r12
884	adcq	104(%rsp), %r13
885	adcq	112(%rsp), %r14
886	adcq	120(%rsp), %r15
887	sbbq	%rcx, %rcx
888
889	call	__rsaz_512_subtract
890
891	leaq	128+24+48(%rsp), %rax
892	movq	-48(%rax), %r15
893	movq	-40(%rax), %r14
894	movq	-32(%rax), %r13
895	movq	-24(%rax), %r12
896	movq	-16(%rax), %rbp
897	movq	-8(%rax), %rbx
898	leaq	(%rax), %rsp
899.Lmul_epilogue:
900	ret
901.size	rsaz_512_mul,.-rsaz_512_mul
902___
903}
904{
905my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
906$code.=<<___;
907.globl	rsaz_512_mul_gather4
908.type	rsaz_512_mul_gather4,\@function,6
909.align	32
910rsaz_512_mul_gather4:
911	push	%rbx
912	push	%rbp
913	push	%r12
914	push	%r13
915	push	%r14
916	push	%r15
917
918	subq	\$`128+24+($win64?0xb0:0)`, %rsp
919___
920$code.=<<___	if ($win64);
921	movaps	%xmm6,0xa0(%rsp)
922	movaps	%xmm7,0xb0(%rsp)
923	movaps	%xmm8,0xc0(%rsp)
924	movaps	%xmm9,0xd0(%rsp)
925	movaps	%xmm10,0xe0(%rsp)
926	movaps	%xmm11,0xf0(%rsp)
927	movaps	%xmm12,0x100(%rsp)
928	movaps	%xmm13,0x110(%rsp)
929	movaps	%xmm14,0x120(%rsp)
930	movaps	%xmm15,0x130(%rsp)
931___
932$code.=<<___;
933.Lmul_gather4_body:
934	movd	$pwr,%xmm8
935	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
936	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
937
938	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
939	movdqa	%xmm1,%xmm7
940	movdqa	%xmm1,%xmm2
941___
942########################################################################
943# calculate mask by comparing 0..15 to $power
944#
945for($i=0;$i<4;$i++) {
946$code.=<<___;
947	paddd	%xmm`$i`,%xmm`$i+1`
948	pcmpeqd	%xmm8,%xmm`$i`
949	movdqa	%xmm7,%xmm`$i+3`
950___
951}
952for(;$i<7;$i++) {
953$code.=<<___;
954	paddd	%xmm`$i`,%xmm`$i+1`
955	pcmpeqd	%xmm8,%xmm`$i`
956___
957}
958$code.=<<___;
959	pcmpeqd	%xmm8,%xmm7
960
961	movdqa	16*0($bp),%xmm8
962	movdqa	16*1($bp),%xmm9
963	movdqa	16*2($bp),%xmm10
964	movdqa	16*3($bp),%xmm11
965	pand	%xmm0,%xmm8
966	movdqa	16*4($bp),%xmm12
967	pand	%xmm1,%xmm9
968	movdqa	16*5($bp),%xmm13
969	pand	%xmm2,%xmm10
970	movdqa	16*6($bp),%xmm14
971	pand	%xmm3,%xmm11
972	movdqa	16*7($bp),%xmm15
973	leaq	128($bp), %rbp
974	pand	%xmm4,%xmm12
975	pand	%xmm5,%xmm13
976	pand	%xmm6,%xmm14
977	pand	%xmm7,%xmm15
978	por	%xmm10,%xmm8
979	por	%xmm11,%xmm9
980	por	%xmm12,%xmm8
981	por	%xmm13,%xmm9
982	por	%xmm14,%xmm8
983	por	%xmm15,%xmm9
984
985	por	%xmm9,%xmm8
986	pshufd	\$0x4e,%xmm8,%xmm9
987	por	%xmm9,%xmm8
988___
989$code.=<<___ if ($addx);
990	movl	\$0x80100,%r11d
991	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
992	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
993	je	.Lmulx_gather
994___
995$code.=<<___;
996	movq	%xmm8,%rbx
997
998	movq	$n0, 128(%rsp)		# off-load arguments
999	movq	$out, 128+8(%rsp)
1000	movq	$mod, 128+16(%rsp)
1001
1002	movq	($ap), %rax
1003	 movq	8($ap), %rcx
1004	mulq	%rbx			# 0 iteration
1005	movq	%rax, (%rsp)
1006	movq	%rcx, %rax
1007	movq	%rdx, %r8
1008
1009	mulq	%rbx
1010	addq	%rax, %r8
1011	movq	16($ap), %rax
1012	movq	%rdx, %r9
1013	adcq	\$0, %r9
1014
1015	mulq	%rbx
1016	addq	%rax, %r9
1017	movq	24($ap), %rax
1018	movq	%rdx, %r10
1019	adcq	\$0, %r10
1020
1021	mulq	%rbx
1022	addq	%rax, %r10
1023	movq	32($ap), %rax
1024	movq	%rdx, %r11
1025	adcq	\$0, %r11
1026
1027	mulq	%rbx
1028	addq	%rax, %r11
1029	movq	40($ap), %rax
1030	movq	%rdx, %r12
1031	adcq	\$0, %r12
1032
1033	mulq	%rbx
1034	addq	%rax, %r12
1035	movq	48($ap), %rax
1036	movq	%rdx, %r13
1037	adcq	\$0, %r13
1038
1039	mulq	%rbx
1040	addq	%rax, %r13
1041	movq	56($ap), %rax
1042	movq	%rdx, %r14
1043	adcq	\$0, %r14
1044
1045	mulq	%rbx
1046	addq	%rax, %r14
1047	 movq	($ap), %rax
1048	movq	%rdx, %r15
1049	adcq	\$0, %r15
1050
1051	leaq	8(%rsp), %rdi
1052	movl	\$7, %ecx
1053	jmp	.Loop_mul_gather
1054
1055.align	32
1056.Loop_mul_gather:
1057	movdqa	16*0(%rbp),%xmm8
1058	movdqa	16*1(%rbp),%xmm9
1059	movdqa	16*2(%rbp),%xmm10
1060	movdqa	16*3(%rbp),%xmm11
1061	pand	%xmm0,%xmm8
1062	movdqa	16*4(%rbp),%xmm12
1063	pand	%xmm1,%xmm9
1064	movdqa	16*5(%rbp),%xmm13
1065	pand	%xmm2,%xmm10
1066	movdqa	16*6(%rbp),%xmm14
1067	pand	%xmm3,%xmm11
1068	movdqa	16*7(%rbp),%xmm15
1069	leaq	128(%rbp), %rbp
1070	pand	%xmm4,%xmm12
1071	pand	%xmm5,%xmm13
1072	pand	%xmm6,%xmm14
1073	pand	%xmm7,%xmm15
1074	por	%xmm10,%xmm8
1075	por	%xmm11,%xmm9
1076	por	%xmm12,%xmm8
1077	por	%xmm13,%xmm9
1078	por	%xmm14,%xmm8
1079	por	%xmm15,%xmm9
1080
1081	por	%xmm9,%xmm8
1082	pshufd	\$0x4e,%xmm8,%xmm9
1083	por	%xmm9,%xmm8
1084	movq	%xmm8,%rbx
1085
1086	mulq	%rbx
1087	addq	%rax, %r8
1088	movq	8($ap), %rax
1089	movq	%r8, (%rdi)
1090	movq	%rdx, %r8
1091	adcq	\$0, %r8
1092
1093	mulq	%rbx
1094	addq	%rax, %r9
1095	movq	16($ap), %rax
1096	adcq	\$0, %rdx
1097	addq	%r9, %r8
1098	movq	%rdx, %r9
1099	adcq	\$0, %r9
1100
1101	mulq	%rbx
1102	addq	%rax, %r10
1103	movq	24($ap), %rax
1104	adcq	\$0, %rdx
1105	addq	%r10, %r9
1106	movq	%rdx, %r10
1107	adcq	\$0, %r10
1108
1109	mulq	%rbx
1110	addq	%rax, %r11
1111	movq	32($ap), %rax
1112	adcq	\$0, %rdx
1113	addq	%r11, %r10
1114	movq	%rdx, %r11
1115	adcq	\$0, %r11
1116
1117	mulq	%rbx
1118	addq	%rax, %r12
1119	movq	40($ap), %rax
1120	adcq	\$0, %rdx
1121	addq	%r12, %r11
1122	movq	%rdx, %r12
1123	adcq	\$0, %r12
1124
1125	mulq	%rbx
1126	addq	%rax, %r13
1127	movq	48($ap), %rax
1128	adcq	\$0, %rdx
1129	addq	%r13, %r12
1130	movq	%rdx, %r13
1131	adcq	\$0, %r13
1132
1133	mulq	%rbx
1134	addq	%rax, %r14
1135	movq	56($ap), %rax
1136	adcq	\$0, %rdx
1137	addq	%r14, %r13
1138	movq	%rdx, %r14
1139	adcq	\$0, %r14
1140
1141	mulq	%rbx
1142	addq	%rax, %r15
1143	 movq	($ap), %rax
1144	adcq	\$0, %rdx
1145	addq	%r15, %r14
1146	movq	%rdx, %r15
1147	adcq	\$0, %r15
1148
1149	leaq	8(%rdi), %rdi
1150
1151	decl	%ecx
1152	jnz	.Loop_mul_gather
1153
1154	movq	%r8, (%rdi)
1155	movq	%r9, 8(%rdi)
1156	movq	%r10, 16(%rdi)
1157	movq	%r11, 24(%rdi)
1158	movq	%r12, 32(%rdi)
1159	movq	%r13, 40(%rdi)
1160	movq	%r14, 48(%rdi)
1161	movq	%r15, 56(%rdi)
1162
1163	movq	128+8(%rsp), $out
1164	movq	128+16(%rsp), %rbp
1165
1166	movq	(%rsp), %r8
1167	movq	8(%rsp), %r9
1168	movq	16(%rsp), %r10
1169	movq	24(%rsp), %r11
1170	movq	32(%rsp), %r12
1171	movq	40(%rsp), %r13
1172	movq	48(%rsp), %r14
1173	movq	56(%rsp), %r15
1174
1175	call	__rsaz_512_reduce
1176___
1177$code.=<<___ if ($addx);
1178	jmp	.Lmul_gather_tail
1179
1180.align	32
1181.Lmulx_gather:
1182	movq	%xmm8,%rdx
1183
1184	mov	$n0, 128(%rsp)		# off-load arguments
1185	mov	$out, 128+8(%rsp)
1186	mov	$mod, 128+16(%rsp)
1187
1188	mulx	($ap), %rbx, %r8	# 0 iteration
1189	mov	%rbx, (%rsp)
1190	xor	%edi, %edi		# cf=0, of=0
1191
1192	mulx	8($ap), %rax, %r9
1193
1194	mulx	16($ap), %rbx, %r10
1195	adcx	%rax, %r8
1196
1197	mulx	24($ap), %rax, %r11
1198	adcx	%rbx, %r9
1199
1200	mulx	32($ap), %rbx, %r12
1201	adcx	%rax, %r10
1202
1203	mulx	40($ap), %rax, %r13
1204	adcx	%rbx, %r11
1205
1206	mulx	48($ap), %rbx, %r14
1207	adcx	%rax, %r12
1208
1209	mulx	56($ap), %rax, %r15
1210	adcx	%rbx, %r13
1211	adcx	%rax, %r14
1212	.byte	0x67
1213	mov	%r8, %rbx
1214	adcx	%rdi, %r15		# %rdi is 0
1215
1216	mov	\$-7, %rcx
1217	jmp	.Loop_mulx_gather
1218
1219.align	32
1220.Loop_mulx_gather:
1221	movdqa	16*0(%rbp),%xmm8
1222	movdqa	16*1(%rbp),%xmm9
1223	movdqa	16*2(%rbp),%xmm10
1224	movdqa	16*3(%rbp),%xmm11
1225	pand	%xmm0,%xmm8
1226	movdqa	16*4(%rbp),%xmm12
1227	pand	%xmm1,%xmm9
1228	movdqa	16*5(%rbp),%xmm13
1229	pand	%xmm2,%xmm10
1230	movdqa	16*6(%rbp),%xmm14
1231	pand	%xmm3,%xmm11
1232	movdqa	16*7(%rbp),%xmm15
1233	leaq	128(%rbp), %rbp
1234	pand	%xmm4,%xmm12
1235	pand	%xmm5,%xmm13
1236	pand	%xmm6,%xmm14
1237	pand	%xmm7,%xmm15
1238	por	%xmm10,%xmm8
1239	por	%xmm11,%xmm9
1240	por	%xmm12,%xmm8
1241	por	%xmm13,%xmm9
1242	por	%xmm14,%xmm8
1243	por	%xmm15,%xmm9
1244
1245	por	%xmm9,%xmm8
1246	pshufd	\$0x4e,%xmm8,%xmm9
1247	por	%xmm9,%xmm8
1248	movq	%xmm8,%rdx
1249
1250	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx	($ap), %rax, %r8
1251	adcx	%rax, %rbx
1252	adox	%r9, %r8
1253
1254	mulx	8($ap), %rax, %r9
1255	adcx	%rax, %r8
1256	adox	%r10, %r9
1257
1258	mulx	16($ap), %rax, %r10
1259	adcx	%rax, %r9
1260	adox	%r11, %r10
1261
1262	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
1263	adcx	%rax, %r10
1264	adox	%r12, %r11
1265
1266	mulx	32($ap), %rax, %r12
1267	adcx	%rax, %r11
1268	adox	%r13, %r12
1269
1270	mulx	40($ap), %rax, %r13
1271	adcx	%rax, %r12
1272	adox	%r14, %r13
1273
1274	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
1275	adcx	%rax, %r13
1276	.byte	0x67
1277	adox	%r15, %r14
1278
1279	mulx	56($ap), %rax, %r15
1280	 mov	%rbx, 64(%rsp,%rcx,8)
1281	adcx	%rax, %r14
1282	adox	%rdi, %r15
1283	mov	%r8, %rbx
1284	adcx	%rdi, %r15		# cf=0
1285
1286	inc	%rcx			# of=0
1287	jnz	.Loop_mulx_gather
1288
1289	mov	%r8, 64(%rsp)
1290	mov	%r9, 64+8(%rsp)
1291	mov	%r10, 64+16(%rsp)
1292	mov	%r11, 64+24(%rsp)
1293	mov	%r12, 64+32(%rsp)
1294	mov	%r13, 64+40(%rsp)
1295	mov	%r14, 64+48(%rsp)
1296	mov	%r15, 64+56(%rsp)
1297
1298	mov	128(%rsp), %rdx		# pull arguments
1299	mov	128+8(%rsp), $out
1300	mov	128+16(%rsp), %rbp
1301
1302	mov	(%rsp), %r8
1303	mov	8(%rsp), %r9
1304	mov	16(%rsp), %r10
1305	mov	24(%rsp), %r11
1306	mov	32(%rsp), %r12
1307	mov	40(%rsp), %r13
1308	mov	48(%rsp), %r14
1309	mov	56(%rsp), %r15
1310
1311	call	__rsaz_512_reducex
1312
1313.Lmul_gather_tail:
1314___
1315$code.=<<___;
1316	addq	64(%rsp), %r8
1317	adcq	72(%rsp), %r9
1318	adcq	80(%rsp), %r10
1319	adcq	88(%rsp), %r11
1320	adcq	96(%rsp), %r12
1321	adcq	104(%rsp), %r13
1322	adcq	112(%rsp), %r14
1323	adcq	120(%rsp), %r15
1324	sbbq	%rcx, %rcx
1325
1326	call	__rsaz_512_subtract
1327
1328	leaq	128+24+48(%rsp), %rax
1329___
1330$code.=<<___	if ($win64);
1331	movaps	0xa0-0xc8(%rax),%xmm6
1332	movaps	0xb0-0xc8(%rax),%xmm7
1333	movaps	0xc0-0xc8(%rax),%xmm8
1334	movaps	0xd0-0xc8(%rax),%xmm9
1335	movaps	0xe0-0xc8(%rax),%xmm10
1336	movaps	0xf0-0xc8(%rax),%xmm11
1337	movaps	0x100-0xc8(%rax),%xmm12
1338	movaps	0x110-0xc8(%rax),%xmm13
1339	movaps	0x120-0xc8(%rax),%xmm14
1340	movaps	0x130-0xc8(%rax),%xmm15
1341	lea	0xb0(%rax),%rax
1342___
1343$code.=<<___;
1344	movq	-48(%rax), %r15
1345	movq	-40(%rax), %r14
1346	movq	-32(%rax), %r13
1347	movq	-24(%rax), %r12
1348	movq	-16(%rax), %rbp
1349	movq	-8(%rax), %rbx
1350	leaq	(%rax), %rsp
1351.Lmul_gather4_epilogue:
1352	ret
1353.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1354___
1355}
1356{
1357my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1358$code.=<<___;
1359.globl	rsaz_512_mul_scatter4
1360.type	rsaz_512_mul_scatter4,\@function,6
1361.align	32
1362rsaz_512_mul_scatter4:
1363	push	%rbx
1364	push	%rbp
1365	push	%r12
1366	push	%r13
1367	push	%r14
1368	push	%r15
1369
1370	mov	$pwr, $pwr
1371	subq	\$128+24, %rsp
1372.Lmul_scatter4_body:
1373	leaq	($tbl,$pwr,8), $tbl
1374	movq	$out, %xmm0		# off-load arguments
1375	movq	$mod, %xmm1
1376	movq	$tbl, %xmm2
1377	movq	$n0, 128(%rsp)
1378
1379	movq	$out, %rbp
1380___
1381$code.=<<___ if ($addx);
1382	movl	\$0x80100,%r11d
1383	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
1384	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
1385	je	.Lmulx_scatter
1386___
1387$code.=<<___;
1388	movq	($out),%rbx		# pass b[0]
1389	call	__rsaz_512_mul
1390
1391	movq	%xmm0, $out
1392	movq	%xmm1, %rbp
1393
1394	movq	(%rsp), %r8
1395	movq	8(%rsp), %r9
1396	movq	16(%rsp), %r10
1397	movq	24(%rsp), %r11
1398	movq	32(%rsp), %r12
1399	movq	40(%rsp), %r13
1400	movq	48(%rsp), %r14
1401	movq	56(%rsp), %r15
1402
1403	call	__rsaz_512_reduce
1404___
1405$code.=<<___ if ($addx);
1406	jmp	.Lmul_scatter_tail
1407
1408.align	32
1409.Lmulx_scatter:
1410	movq	($out), %rdx		# pass b[0]
1411	call	__rsaz_512_mulx
1412
1413	movq	%xmm0, $out
1414	movq	%xmm1, %rbp
1415
1416	movq	128(%rsp), %rdx		# pull $n0
1417	movq	(%rsp), %r8
1418	movq	8(%rsp), %r9
1419	movq	16(%rsp), %r10
1420	movq	24(%rsp), %r11
1421	movq	32(%rsp), %r12
1422	movq	40(%rsp), %r13
1423	movq	48(%rsp), %r14
1424	movq	56(%rsp), %r15
1425
1426	call	__rsaz_512_reducex
1427
1428.Lmul_scatter_tail:
1429___
1430$code.=<<___;
1431	addq	64(%rsp), %r8
1432	adcq	72(%rsp), %r9
1433	adcq	80(%rsp), %r10
1434	adcq	88(%rsp), %r11
1435	adcq	96(%rsp), %r12
1436	adcq	104(%rsp), %r13
1437	adcq	112(%rsp), %r14
1438	adcq	120(%rsp), %r15
1439	movq	%xmm2, $inp
1440	sbbq	%rcx, %rcx
1441
1442	call	__rsaz_512_subtract
1443
1444	movq	%r8, 128*0($inp)	# scatter
1445	movq	%r9, 128*1($inp)
1446	movq	%r10, 128*2($inp)
1447	movq	%r11, 128*3($inp)
1448	movq	%r12, 128*4($inp)
1449	movq	%r13, 128*5($inp)
1450	movq	%r14, 128*6($inp)
1451	movq	%r15, 128*7($inp)
1452
1453	leaq	128+24+48(%rsp), %rax
1454	movq	-48(%rax), %r15
1455	movq	-40(%rax), %r14
1456	movq	-32(%rax), %r13
1457	movq	-24(%rax), %r12
1458	movq	-16(%rax), %rbp
1459	movq	-8(%rax), %rbx
1460	leaq	(%rax), %rsp
1461.Lmul_scatter4_epilogue:
1462	ret
1463.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1464___
1465}
1466{
1467my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1468$code.=<<___;
1469.globl	rsaz_512_mul_by_one
1470.type	rsaz_512_mul_by_one,\@function,4
1471.align	32
1472rsaz_512_mul_by_one:
1473	push	%rbx
1474	push	%rbp
1475	push	%r12
1476	push	%r13
1477	push	%r14
1478	push	%r15
1479
1480	subq	\$128+24, %rsp
1481.Lmul_by_one_body:
1482___
1483$code.=<<___ if ($addx);
1484	movl	OPENSSL_ia32cap_P+8(%rip),%eax
1485___
1486$code.=<<___;
1487	movq	$mod, %rbp	# reassign argument
1488	movq	$n0, 128(%rsp)
1489
1490	movq	($inp), %r8
1491	pxor	%xmm0, %xmm0
1492	movq	8($inp), %r9
1493	movq	16($inp), %r10
1494	movq	24($inp), %r11
1495	movq	32($inp), %r12
1496	movq	40($inp), %r13
1497	movq	48($inp), %r14
1498	movq	56($inp), %r15
1499
1500	movdqa	%xmm0, (%rsp)
1501	movdqa	%xmm0, 16(%rsp)
1502	movdqa	%xmm0, 32(%rsp)
1503	movdqa	%xmm0, 48(%rsp)
1504	movdqa	%xmm0, 64(%rsp)
1505	movdqa	%xmm0, 80(%rsp)
1506	movdqa	%xmm0, 96(%rsp)
1507___
1508$code.=<<___ if ($addx);
1509	andl	\$0x80100,%eax
1510	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
1511	je	.Lby_one_callx
1512___
1513$code.=<<___;
1514	call	__rsaz_512_reduce
1515___
1516$code.=<<___ if ($addx);
1517	jmp	.Lby_one_tail
1518.align	32
1519.Lby_one_callx:
1520	movq	128(%rsp), %rdx		# pull $n0
1521	call	__rsaz_512_reducex
1522.Lby_one_tail:
1523___
1524$code.=<<___;
1525	movq	%r8, ($out)
1526	movq	%r9, 8($out)
1527	movq	%r10, 16($out)
1528	movq	%r11, 24($out)
1529	movq	%r12, 32($out)
1530	movq	%r13, 40($out)
1531	movq	%r14, 48($out)
1532	movq	%r15, 56($out)
1533
1534	leaq	128+24+48(%rsp), %rax
1535	movq	-48(%rax), %r15
1536	movq	-40(%rax), %r14
1537	movq	-32(%rax), %r13
1538	movq	-24(%rax), %r12
1539	movq	-16(%rax), %rbp
1540	movq	-8(%rax), %rbx
1541	leaq	(%rax), %rsp
1542.Lmul_by_one_epilogue:
1543	ret
1544.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1545___
1546}
1547{	# __rsaz_512_reduce
1548	#
1549	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
1550	# output:	%r8-%r15
1551	# clobbers:	everything except %rbp and %rdi
1552$code.=<<___;
1553.type	__rsaz_512_reduce,\@abi-omnipotent
1554.align	32
1555__rsaz_512_reduce:
1556	movq	%r8, %rbx
1557	imulq	128+8(%rsp), %rbx
1558	movq	0(%rbp), %rax
1559	movl	\$8, %ecx
1560	jmp	.Lreduction_loop
1561
1562.align	32
1563.Lreduction_loop:
1564	mulq	%rbx
1565	movq	8(%rbp), %rax
1566	negq	%r8
1567	movq	%rdx, %r8
1568	adcq	\$0, %r8
1569
1570	mulq	%rbx
1571	addq	%rax, %r9
1572	movq	16(%rbp), %rax
1573	adcq	\$0, %rdx
1574	addq	%r9, %r8
1575	movq	%rdx, %r9
1576	adcq	\$0, %r9
1577
1578	mulq	%rbx
1579	addq	%rax, %r10
1580	movq	24(%rbp), %rax
1581	adcq	\$0, %rdx
1582	addq	%r10, %r9
1583	movq	%rdx, %r10
1584	adcq	\$0, %r10
1585
1586	mulq	%rbx
1587	addq	%rax, %r11
1588	movq	32(%rbp), %rax
1589	adcq	\$0, %rdx
1590	addq	%r11, %r10
1591	 movq	128+8(%rsp), %rsi
1592	#movq	%rdx, %r11
1593	#adcq	\$0, %r11
1594	adcq	\$0, %rdx
1595	movq	%rdx, %r11
1596
1597	mulq	%rbx
1598	addq	%rax, %r12
1599	movq	40(%rbp), %rax
1600	adcq	\$0, %rdx
1601	 imulq	%r8, %rsi
1602	addq	%r12, %r11
1603	movq	%rdx, %r12
1604	adcq	\$0, %r12
1605
1606	mulq	%rbx
1607	addq	%rax, %r13
1608	movq	48(%rbp), %rax
1609	adcq	\$0, %rdx
1610	addq	%r13, %r12
1611	movq	%rdx, %r13
1612	adcq	\$0, %r13
1613
1614	mulq	%rbx
1615	addq	%rax, %r14
1616	movq	56(%rbp), %rax
1617	adcq	\$0, %rdx
1618	addq	%r14, %r13
1619	movq	%rdx, %r14
1620	adcq	\$0, %r14
1621
1622	mulq	%rbx
1623	 movq	%rsi, %rbx
1624	addq	%rax, %r15
1625	 movq	0(%rbp), %rax
1626	adcq	\$0, %rdx
1627	addq	%r15, %r14
1628	movq	%rdx, %r15
1629	adcq	\$0, %r15
1630
1631	decl	%ecx
1632	jne	.Lreduction_loop
1633
1634	ret
1635.size	__rsaz_512_reduce,.-__rsaz_512_reduce
1636___
1637}
1638if ($addx) {
1639	# __rsaz_512_reducex
1640	#
1641	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
1642	# output:	%r8-%r15
1643	# clobbers:	everything except %rbp and %rdi
1644$code.=<<___;
1645.type	__rsaz_512_reducex,\@abi-omnipotent
1646.align	32
1647__rsaz_512_reducex:
1648	#movq	128+8(%rsp), %rdx		# pull $n0
1649	imulq	%r8, %rdx
1650	xorq	%rsi, %rsi			# cf=0,of=0
1651	movl	\$8, %ecx
1652	jmp	.Lreduction_loopx
1653
1654.align	32
1655.Lreduction_loopx:
1656	mov	%r8, %rbx
1657	mulx	0(%rbp), %rax, %r8
1658	adcx	%rbx, %rax
1659	adox	%r9, %r8
1660
1661	mulx	8(%rbp), %rax, %r9
1662	adcx	%rax, %r8
1663	adox	%r10, %r9
1664
1665	mulx	16(%rbp), %rbx, %r10
1666	adcx	%rbx, %r9
1667	adox	%r11, %r10
1668
1669	mulx	24(%rbp), %rbx, %r11
1670	adcx	%rbx, %r10
1671	adox	%r12, %r11
1672
1673	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
1674	 mov	%rdx, %rax
1675	 mov	%r8, %rdx
1676	adcx	%rbx, %r11
1677	adox	%r13, %r12
1678
1679	 mulx	128+8(%rsp), %rbx, %rdx
1680	 mov	%rax, %rdx
1681
1682	mulx	40(%rbp), %rax, %r13
1683	adcx	%rax, %r12
1684	adox	%r14, %r13
1685
1686	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
1687	adcx	%rax, %r13
1688	adox	%r15, %r14
1689
1690	mulx	56(%rbp), %rax, %r15
1691	 mov	%rbx, %rdx
1692	adcx	%rax, %r14
1693	adox	%rsi, %r15			# %rsi is 0
1694	adcx	%rsi, %r15			# cf=0
1695
1696	decl	%ecx				# of=0
1697	jne	.Lreduction_loopx
1698
1699	ret
1700.size	__rsaz_512_reducex,.-__rsaz_512_reducex
1701___
1702}
1703{	# __rsaz_512_subtract
1704	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1705	# output:
1706	# clobbers: everything but %rdi, %rsi and %rbp
1707$code.=<<___;
1708.type	__rsaz_512_subtract,\@abi-omnipotent
1709.align	32
1710__rsaz_512_subtract:
1711	movq	%r8, ($out)
1712	movq	%r9, 8($out)
1713	movq	%r10, 16($out)
1714	movq	%r11, 24($out)
1715	movq	%r12, 32($out)
1716	movq	%r13, 40($out)
1717	movq	%r14, 48($out)
1718	movq	%r15, 56($out)
1719
1720	movq	0($mod), %r8
1721	movq	8($mod), %r9
1722	negq	%r8
1723	notq	%r9
1724	andq	%rcx, %r8
1725	movq	16($mod), %r10
1726	andq	%rcx, %r9
1727	notq	%r10
1728	movq	24($mod), %r11
1729	andq	%rcx, %r10
1730	notq	%r11
1731	movq	32($mod), %r12
1732	andq	%rcx, %r11
1733	notq	%r12
1734	movq	40($mod), %r13
1735	andq	%rcx, %r12
1736	notq	%r13
1737	movq	48($mod), %r14
1738	andq	%rcx, %r13
1739	notq	%r14
1740	movq	56($mod), %r15
1741	andq	%rcx, %r14
1742	notq	%r15
1743	andq	%rcx, %r15
1744
1745	addq	($out), %r8
1746	adcq	8($out), %r9
1747	adcq	16($out), %r10
1748	adcq	24($out), %r11
1749	adcq	32($out), %r12
1750	adcq	40($out), %r13
1751	adcq	48($out), %r14
1752	adcq	56($out), %r15
1753
1754	movq	%r8, ($out)
1755	movq	%r9, 8($out)
1756	movq	%r10, 16($out)
1757	movq	%r11, 24($out)
1758	movq	%r12, 32($out)
1759	movq	%r13, 40($out)
1760	movq	%r14, 48($out)
1761	movq	%r15, 56($out)
1762
1763	ret
1764.size	__rsaz_512_subtract,.-__rsaz_512_subtract
1765___
1766}
1767{	# __rsaz_512_mul
1768	#
1769	# input: %rsi - ap, %rbp - bp
1770	# ouput:
1771	# clobbers: everything
1772my ($ap,$bp) = ("%rsi","%rbp");
1773$code.=<<___;
1774.type	__rsaz_512_mul,\@abi-omnipotent
1775.align	32
1776__rsaz_512_mul:
1777	leaq	8(%rsp), %rdi
1778
1779	movq	($ap), %rax
1780	mulq	%rbx
1781	movq	%rax, (%rdi)
1782	movq	8($ap), %rax
1783	movq	%rdx, %r8
1784
1785	mulq	%rbx
1786	addq	%rax, %r8
1787	movq	16($ap), %rax
1788	movq	%rdx, %r9
1789	adcq	\$0, %r9
1790
1791	mulq	%rbx
1792	addq	%rax, %r9
1793	movq	24($ap), %rax
1794	movq	%rdx, %r10
1795	adcq	\$0, %r10
1796
1797	mulq	%rbx
1798	addq	%rax, %r10
1799	movq	32($ap), %rax
1800	movq	%rdx, %r11
1801	adcq	\$0, %r11
1802
1803	mulq	%rbx
1804	addq	%rax, %r11
1805	movq	40($ap), %rax
1806	movq	%rdx, %r12
1807	adcq	\$0, %r12
1808
1809	mulq	%rbx
1810	addq	%rax, %r12
1811	movq	48($ap), %rax
1812	movq	%rdx, %r13
1813	adcq	\$0, %r13
1814
1815	mulq	%rbx
1816	addq	%rax, %r13
1817	movq	56($ap), %rax
1818	movq	%rdx, %r14
1819	adcq	\$0, %r14
1820
1821	mulq	%rbx
1822	addq	%rax, %r14
1823	 movq	($ap), %rax
1824	movq	%rdx, %r15
1825	adcq	\$0, %r15
1826
1827	leaq	8($bp), $bp
1828	leaq	8(%rdi), %rdi
1829
1830	movl	\$7, %ecx
1831	jmp	.Loop_mul
1832
1833.align	32
1834.Loop_mul:
1835	movq	($bp), %rbx
1836	mulq	%rbx
1837	addq	%rax, %r8
1838	movq	8($ap), %rax
1839	movq	%r8, (%rdi)
1840	movq	%rdx, %r8
1841	adcq	\$0, %r8
1842
1843	mulq	%rbx
1844	addq	%rax, %r9
1845	movq	16($ap), %rax
1846	adcq	\$0, %rdx
1847	addq	%r9, %r8
1848	movq	%rdx, %r9
1849	adcq	\$0, %r9
1850
1851	mulq	%rbx
1852	addq	%rax, %r10
1853	movq	24($ap), %rax
1854	adcq	\$0, %rdx
1855	addq	%r10, %r9
1856	movq	%rdx, %r10
1857	adcq	\$0, %r10
1858
1859	mulq	%rbx
1860	addq	%rax, %r11
1861	movq	32($ap), %rax
1862	adcq	\$0, %rdx
1863	addq	%r11, %r10
1864	movq	%rdx, %r11
1865	adcq	\$0, %r11
1866
1867	mulq	%rbx
1868	addq	%rax, %r12
1869	movq	40($ap), %rax
1870	adcq	\$0, %rdx
1871	addq	%r12, %r11
1872	movq	%rdx, %r12
1873	adcq	\$0, %r12
1874
1875	mulq	%rbx
1876	addq	%rax, %r13
1877	movq	48($ap), %rax
1878	adcq	\$0, %rdx
1879	addq	%r13, %r12
1880	movq	%rdx, %r13
1881	adcq	\$0, %r13
1882
1883	mulq	%rbx
1884	addq	%rax, %r14
1885	movq	56($ap), %rax
1886	adcq	\$0, %rdx
1887	addq	%r14, %r13
1888	movq	%rdx, %r14
1889	 leaq	8($bp), $bp
1890	adcq	\$0, %r14
1891
1892	mulq	%rbx
1893	addq	%rax, %r15
1894	 movq	($ap), %rax
1895	adcq	\$0, %rdx
1896	addq	%r15, %r14
1897	movq	%rdx, %r15
1898	adcq	\$0, %r15
1899
1900	leaq	8(%rdi), %rdi
1901
1902	decl	%ecx
1903	jnz	.Loop_mul
1904
1905	movq	%r8, (%rdi)
1906	movq	%r9, 8(%rdi)
1907	movq	%r10, 16(%rdi)
1908	movq	%r11, 24(%rdi)
1909	movq	%r12, 32(%rdi)
1910	movq	%r13, 40(%rdi)
1911	movq	%r14, 48(%rdi)
1912	movq	%r15, 56(%rdi)
1913
1914	ret
1915.size	__rsaz_512_mul,.-__rsaz_512_mul
1916___
1917}
1918if ($addx) {
1919	# __rsaz_512_mulx
1920	#
1921	# input: %rsi - ap, %rbp - bp
1922	# ouput:
1923	# clobbers: everything
1924my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1925$code.=<<___;
1926.type	__rsaz_512_mulx,\@abi-omnipotent
1927.align	32
1928__rsaz_512_mulx:
1929	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
1930	mov	\$-6, %rcx
1931
1932	mulx	8($ap), %rax, %r9
1933	movq	%rbx, 8(%rsp)
1934
1935	mulx	16($ap), %rbx, %r10
1936	adc	%rax, %r8
1937
1938	mulx	24($ap), %rax, %r11
1939	adc	%rbx, %r9
1940
1941	mulx	32($ap), %rbx, %r12
1942	adc	%rax, %r10
1943
1944	mulx	40($ap), %rax, %r13
1945	adc	%rbx, %r11
1946
1947	mulx	48($ap), %rbx, %r14
1948	adc	%rax, %r12
1949
1950	mulx	56($ap), %rax, %r15
1951	 mov	8($bp), %rdx
1952	adc	%rbx, %r13
1953	adc	%rax, %r14
1954	adc	\$0, %r15
1955
1956	xor	$zero, $zero		# cf=0,of=0
1957	jmp	.Loop_mulx
1958
1959.align	32
1960.Loop_mulx:
1961	movq	%r8, %rbx
1962	mulx	($ap), %rax, %r8
1963	adcx	%rax, %rbx
1964	adox	%r9, %r8
1965
1966	mulx	8($ap), %rax, %r9
1967	adcx	%rax, %r8
1968	adox	%r10, %r9
1969
1970	mulx	16($ap), %rax, %r10
1971	adcx	%rax, %r9
1972	adox	%r11, %r10
1973
1974	mulx	24($ap), %rax, %r11
1975	adcx	%rax, %r10
1976	adox	%r12, %r11
1977
1978	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
1979	adcx	%rax, %r11
1980	adox	%r13, %r12
1981
1982	mulx	40($ap), %rax, %r13
1983	adcx	%rax, %r12
1984	adox	%r14, %r13
1985
1986	mulx	48($ap), %rax, %r14
1987	adcx	%rax, %r13
1988	adox	%r15, %r14
1989
1990	mulx	56($ap), %rax, %r15
1991	 movq	64($bp,%rcx,8), %rdx
1992	 movq	%rbx, 8+64-8(%rsp,%rcx,8)
1993	adcx	%rax, %r14
1994	adox	$zero, %r15
1995	adcx	$zero, %r15		# cf=0
1996
1997	inc	%rcx			# of=0
1998	jnz	.Loop_mulx
1999
2000	movq	%r8, %rbx
2001	mulx	($ap), %rax, %r8
2002	adcx	%rax, %rbx
2003	adox	%r9, %r8
2004
2005	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
2006	adcx	%rax, %r8
2007	adox	%r10, %r9
2008
2009	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
2010	adcx	%rax, %r9
2011	adox	%r11, %r10
2012
2013	mulx	24($ap), %rax, %r11
2014	adcx	%rax, %r10
2015	adox	%r12, %r11
2016
2017	mulx	32($ap), %rax, %r12
2018	adcx	%rax, %r11
2019	adox	%r13, %r12
2020
2021	mulx	40($ap), %rax, %r13
2022	adcx	%rax, %r12
2023	adox	%r14, %r13
2024
2025	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
2026	adcx	%rax, %r13
2027	adox	%r15, %r14
2028
2029	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
2030	adcx	%rax, %r14
2031	adox	$zero, %r15
2032	adcx	$zero, %r15
2033
2034	mov	%rbx, 8+64-8(%rsp)
2035	mov	%r8, 8+64(%rsp)
2036	mov	%r9, 8+64+8(%rsp)
2037	mov	%r10, 8+64+16(%rsp)
2038	mov	%r11, 8+64+24(%rsp)
2039	mov	%r12, 8+64+32(%rsp)
2040	mov	%r13, 8+64+40(%rsp)
2041	mov	%r14, 8+64+48(%rsp)
2042	mov	%r15, 8+64+56(%rsp)
2043
2044	ret
2045.size	__rsaz_512_mulx,.-__rsaz_512_mulx
2046___
2047}
2048{
2049my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
2050$code.=<<___;
2051.globl	rsaz_512_scatter4
2052.type	rsaz_512_scatter4,\@abi-omnipotent
2053.align	16
2054rsaz_512_scatter4:
2055	leaq	($out,$power,8), $out
2056	movl	\$8, %r9d
2057	jmp	.Loop_scatter
2058.align	16
2059.Loop_scatter:
2060	movq	($inp), %rax
2061	leaq	8($inp), $inp
2062	movq	%rax, ($out)
2063	leaq	128($out), $out
2064	decl	%r9d
2065	jnz	.Loop_scatter
2066	ret
2067.size	rsaz_512_scatter4,.-rsaz_512_scatter4
2068
2069.globl	rsaz_512_gather4
2070.type	rsaz_512_gather4,\@abi-omnipotent
2071.align	16
2072rsaz_512_gather4:
2073___
2074$code.=<<___	if ($win64);
2075.LSEH_begin_rsaz_512_gather4:
2076	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub    $0xa8,%rsp
2077	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
2078	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
2079	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
2080	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
2081	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
2082	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
2083	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
2084	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
2085	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
2086	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
2087___
2088$code.=<<___;
2089	movd	$power,%xmm8
2090	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
2091	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
2092
2093	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
2094	movdqa	%xmm1,%xmm7
2095	movdqa	%xmm1,%xmm2
2096___
2097########################################################################
2098# calculate mask by comparing 0..15 to $power
2099#
2100for($i=0;$i<4;$i++) {
2101$code.=<<___;
2102	paddd	%xmm`$i`,%xmm`$i+1`
2103	pcmpeqd	%xmm8,%xmm`$i`
2104	movdqa	%xmm7,%xmm`$i+3`
2105___
2106}
2107for(;$i<7;$i++) {
2108$code.=<<___;
2109	paddd	%xmm`$i`,%xmm`$i+1`
2110	pcmpeqd	%xmm8,%xmm`$i`
2111___
2112}
2113$code.=<<___;
2114	pcmpeqd	%xmm8,%xmm7
2115	movl	\$8, %r9d
2116	jmp	.Loop_gather
2117.align	16
2118.Loop_gather:
2119	movdqa	16*0($inp),%xmm8
2120	movdqa	16*1($inp),%xmm9
2121	movdqa	16*2($inp),%xmm10
2122	movdqa	16*3($inp),%xmm11
2123	pand	%xmm0,%xmm8
2124	movdqa	16*4($inp),%xmm12
2125	pand	%xmm1,%xmm9
2126	movdqa	16*5($inp),%xmm13
2127	pand	%xmm2,%xmm10
2128	movdqa	16*6($inp),%xmm14
2129	pand	%xmm3,%xmm11
2130	movdqa	16*7($inp),%xmm15
2131	leaq	128($inp), $inp
2132	pand	%xmm4,%xmm12
2133	pand	%xmm5,%xmm13
2134	pand	%xmm6,%xmm14
2135	pand	%xmm7,%xmm15
2136	por	%xmm10,%xmm8
2137	por	%xmm11,%xmm9
2138	por	%xmm12,%xmm8
2139	por	%xmm13,%xmm9
2140	por	%xmm14,%xmm8
2141	por	%xmm15,%xmm9
2142
2143	por	%xmm9,%xmm8
2144	pshufd	\$0x4e,%xmm8,%xmm9
2145	por	%xmm9,%xmm8
2146	movq	%xmm8,($out)
2147	leaq	8($out), $out
2148	decl	%r9d
2149	jnz	.Loop_gather
2150___
2151$code.=<<___	if ($win64);
2152	movaps	0x00(%rsp),%xmm6
2153	movaps	0x10(%rsp),%xmm7
2154	movaps	0x20(%rsp),%xmm8
2155	movaps	0x30(%rsp),%xmm9
2156	movaps	0x40(%rsp),%xmm10
2157	movaps	0x50(%rsp),%xmm11
2158	movaps	0x60(%rsp),%xmm12
2159	movaps	0x70(%rsp),%xmm13
2160	movaps	0x80(%rsp),%xmm14
2161	movaps	0x90(%rsp),%xmm15
2162	add	\$0xa8,%rsp
2163___
2164$code.=<<___;
2165	ret
2166.LSEH_end_rsaz_512_gather4:
2167.size	rsaz_512_gather4,.-rsaz_512_gather4
2168
2169.align	64
2170.Linc:
2171	.long	0,0, 1,1
2172	.long	2,2, 2,2
2173___
2174}
2175
2176# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2177#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2178if ($win64) {
2179$rec="%rcx";
2180$frame="%rdx";
2181$context="%r8";
2182$disp="%r9";
2183
2184$code.=<<___;
2185.extern	__imp_RtlVirtualUnwind
2186.type	se_handler,\@abi-omnipotent
2187.align	16
2188se_handler:
2189	push	%rsi
2190	push	%rdi
2191	push	%rbx
2192	push	%rbp
2193	push	%r12
2194	push	%r13
2195	push	%r14
2196	push	%r15
2197	pushfq
2198	sub	\$64,%rsp
2199
2200	mov	120($context),%rax	# pull context->Rax
2201	mov	248($context),%rbx	# pull context->Rip
2202
2203	mov	8($disp),%rsi		# disp->ImageBase
2204	mov	56($disp),%r11		# disp->HandlerData
2205
2206	mov	0(%r11),%r10d		# HandlerData[0]
2207	lea	(%rsi,%r10),%r10	# end of prologue label
2208	cmp	%r10,%rbx		# context->Rip<end of prologue label
2209	jb	.Lcommon_seh_tail
2210
2211	mov	152($context),%rax	# pull context->Rsp
2212
2213	mov	4(%r11),%r10d		# HandlerData[1]
2214	lea	(%rsi,%r10),%r10	# epilogue label
2215	cmp	%r10,%rbx		# context->Rip>=epilogue label
2216	jae	.Lcommon_seh_tail
2217
2218	lea	128+24+48(%rax),%rax
2219
2220	lea	.Lmul_gather4_epilogue(%rip),%rbx
2221	cmp	%r10,%rbx
2222	jne	.Lse_not_in_mul_gather4
2223
2224	lea	0xb0(%rax),%rax
2225
2226	lea	-48-0xa8(%rax),%rsi
2227	lea	512($context),%rdi
2228	mov	\$20,%ecx
2229	.long	0xa548f3fc		# cld; rep movsq
2230
2231.Lse_not_in_mul_gather4:
2232	mov	-8(%rax),%rbx
2233	mov	-16(%rax),%rbp
2234	mov	-24(%rax),%r12
2235	mov	-32(%rax),%r13
2236	mov	-40(%rax),%r14
2237	mov	-48(%rax),%r15
2238	mov	%rbx,144($context)	# restore context->Rbx
2239	mov	%rbp,160($context)	# restore context->Rbp
2240	mov	%r12,216($context)	# restore context->R12
2241	mov	%r13,224($context)	# restore context->R13
2242	mov	%r14,232($context)	# restore context->R14
2243	mov	%r15,240($context)	# restore context->R15
2244
2245.Lcommon_seh_tail:
2246	mov	8(%rax),%rdi
2247	mov	16(%rax),%rsi
2248	mov	%rax,152($context)	# restore context->Rsp
2249	mov	%rsi,168($context)	# restore context->Rsi
2250	mov	%rdi,176($context)	# restore context->Rdi
2251
2252	mov	40($disp),%rdi		# disp->ContextRecord
2253	mov	$context,%rsi		# context
2254	mov	\$154,%ecx		# sizeof(CONTEXT)
2255	.long	0xa548f3fc		# cld; rep movsq
2256
2257	mov	$disp,%rsi
2258	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2259	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2260	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2261	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2262	mov	40(%rsi),%r10		# disp->ContextRecord
2263	lea	56(%rsi),%r11		# &disp->HandlerData
2264	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2265	mov	%r10,32(%rsp)		# arg5
2266	mov	%r11,40(%rsp)		# arg6
2267	mov	%r12,48(%rsp)		# arg7
2268	mov	%rcx,56(%rsp)		# arg8, (NULL)
2269	call	*__imp_RtlVirtualUnwind(%rip)
2270
2271	mov	\$1,%eax		# ExceptionContinueSearch
2272	add	\$64,%rsp
2273	popfq
2274	pop	%r15
2275	pop	%r14
2276	pop	%r13
2277	pop	%r12
2278	pop	%rbp
2279	pop	%rbx
2280	pop	%rdi
2281	pop	%rsi
2282	ret
2283.size	se_handler,.-se_handler
2284
2285.section	.pdata
2286.align	4
2287	.rva	.LSEH_begin_rsaz_512_sqr
2288	.rva	.LSEH_end_rsaz_512_sqr
2289	.rva	.LSEH_info_rsaz_512_sqr
2290
2291	.rva	.LSEH_begin_rsaz_512_mul
2292	.rva	.LSEH_end_rsaz_512_mul
2293	.rva	.LSEH_info_rsaz_512_mul
2294
2295	.rva	.LSEH_begin_rsaz_512_mul_gather4
2296	.rva	.LSEH_end_rsaz_512_mul_gather4
2297	.rva	.LSEH_info_rsaz_512_mul_gather4
2298
2299	.rva	.LSEH_begin_rsaz_512_mul_scatter4
2300	.rva	.LSEH_end_rsaz_512_mul_scatter4
2301	.rva	.LSEH_info_rsaz_512_mul_scatter4
2302
2303	.rva	.LSEH_begin_rsaz_512_mul_by_one
2304	.rva	.LSEH_end_rsaz_512_mul_by_one
2305	.rva	.LSEH_info_rsaz_512_mul_by_one
2306
2307	.rva	.LSEH_begin_rsaz_512_gather4
2308	.rva	.LSEH_end_rsaz_512_gather4
2309	.rva	.LSEH_info_rsaz_512_gather4
2310
2311.section	.xdata
2312.align	8
2313.LSEH_info_rsaz_512_sqr:
2314	.byte	9,0,0,0
2315	.rva	se_handler
2316	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
2317.LSEH_info_rsaz_512_mul:
2318	.byte	9,0,0,0
2319	.rva	se_handler
2320	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
2321.LSEH_info_rsaz_512_mul_gather4:
2322	.byte	9,0,0,0
2323	.rva	se_handler
2324	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
2325.LSEH_info_rsaz_512_mul_scatter4:
2326	.byte	9,0,0,0
2327	.rva	se_handler
2328	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
2329.LSEH_info_rsaz_512_mul_by_one:
2330	.byte	9,0,0,0
2331	.rva	se_handler
2332	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
2333.LSEH_info_rsaz_512_gather4:
2334	.byte	0x01,0x46,0x16,0x00
2335	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
2336	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
2337	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
2338	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
2339	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
2340	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
2341	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
2342	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
2343	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
2344	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
2345	.byte	0x07,0x01,0x15,0x00	# sub     rsp,0xa8
2346___
2347}
2348
2349$code =~ s/\`([^\`]*)\`/eval $1/gem;
2350print $code;
2351close STDOUT;
2352