1#! /usr/bin/env perl
2# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
4#
5# Licensed under the OpenSSL license (the "License").  You may not use
6# this file except in compliance with the License.  You can obtain a copy
7# in the file LICENSE in the source distribution or at
8# https://www.openssl.org/source/license.html
9#
10# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
11# (1) Intel Corporation, Israel Development Center, Haifa, Israel
12# (2) University of Haifa, Israel
13#
14# References:
15# [1] S. Gueron, "Efficient Software Implementations of Modular
16#     Exponentiation", http://eprint.iacr.org/2011/239
17# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".
18#     IEEE Proceedings of 9th International Conference on Information
19#     Technology: New Generations (ITNG 2012), 821-823 (2012).
20# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation
21#     Journal of Cryptographic Engineering 2:31-43 (2012).
22# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
23#     resistant 512-bit and 1024-bit modular exponentiation for optimizing
24#     RSA1024 and RSA2048 on x86_64 platforms",
25#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
26#
27# While original submission covers 512- and 1024-bit exponentiation,
28# this module is limited to 512-bit version only (and as such
29# accelerates RSA1024 sign). This is because improvement for longer
30# keys is not high enough to justify the effort, highest measured
31# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
32# for the moment of this writing!] Nor does this module implement
33# "monolithic" complete exponentiation jumbo-subroutine, but adheres
34# to more modular mixture of C and assembly. And it's optimized even
35# for processors other than Intel Core family (see table below for
36# improvement coefficients).
37# 						<appro@openssl.org>
38#
39# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
40#			----------------+---------------------------
41# Opteron		+13%		|+5%		+20%
42# Bulldozer		-0%		|-1%		+10%
43# P4			+11%		|+7%		+8%
44# Westmere		+5%		|+14%		+17%
45# Sandy Bridge		+2%		|+12%		+29%
46# Ivy Bridge		+1%		|+11%		+35%
47# Haswell(**)		-0%		|+12%		+39%
48# Atom			+13%		|+11%		+4%
49# VIA Nano		+70%		|+9%		+25%
50#
51# (*)	rsax engine and fips numbers are presented for reference
52#	purposes;
53# (**)	MULX was attempted, but found to give only marginal improvement;
54
55$flavour = shift;
56$output  = shift;
57if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
58
59$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
60
61$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
62( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
63( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
64die "can't locate x86_64-xlate.pl";
65
66open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
67*STDOUT=*OUT;
68
69if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
70		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
71	$addx = ($1>=2.23);
72}
73
74if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
75	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
76	$addx = ($1>=2.10);
77}
78
79if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
80	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
81	$addx = ($1>=12);
82}
83
84if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
85	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
86	$addx = ($ver>=3.03);
87}
88
89($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
90{
91my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
92
93$code.=<<___;
94.text
95
96.extern	OPENSSL_ia32cap_P
97
98.globl	rsaz_512_sqr
99.type	rsaz_512_sqr,\@function,5
100.align	32
101rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
102.cfi_startproc
103	push	%rbx
104.cfi_push	%rbx
105	push	%rbp
106.cfi_push	%rbp
107	push	%r12
108.cfi_push	%r12
109	push	%r13
110.cfi_push	%r13
111	push	%r14
112.cfi_push	%r14
113	push	%r15
114.cfi_push	%r15
115
116	subq	\$128+24, %rsp
117.cfi_adjust_cfa_offset	128+24
118.Lsqr_body:
119	movq	$mod, %rbp		# common argument
120	movq	($inp), %rdx
121	movq	8($inp), %rax
122	movq	$n0, 128(%rsp)
123___
124$code.=<<___ if ($addx);
125	movl	\$0x80100,%r11d
126	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
127	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
128	je	.Loop_sqrx
129___
130$code.=<<___;
131	jmp	.Loop_sqr
132
133.align	32
134.Loop_sqr:
135	movl	$times,128+8(%rsp)
136#first iteration
137	movq	%rdx, %rbx
138	mulq	%rdx
139	movq	%rax, %r8
140	movq	16($inp), %rax
141	movq	%rdx, %r9
142
143	mulq	%rbx
144	addq	%rax, %r9
145	movq	24($inp), %rax
146	movq	%rdx, %r10
147	adcq	\$0, %r10
148
149	mulq	%rbx
150	addq	%rax, %r10
151	movq	32($inp), %rax
152	movq	%rdx, %r11
153	adcq	\$0, %r11
154
155	mulq	%rbx
156	addq	%rax, %r11
157	movq	40($inp), %rax
158	movq	%rdx, %r12
159	adcq	\$0, %r12
160
161	mulq	%rbx
162	addq	%rax, %r12
163	movq	48($inp), %rax
164	movq	%rdx, %r13
165	adcq	\$0, %r13
166
167	mulq	%rbx
168	addq	%rax, %r13
169	movq	56($inp), %rax
170	movq	%rdx, %r14
171	adcq	\$0, %r14
172
173	mulq	%rbx
174	addq	%rax, %r14
175	movq	%rbx, %rax
176	movq	%rdx, %r15
177	adcq	\$0, %r15
178
179	addq	%r8, %r8		#shlq	\$1, %r8
180	movq	%r9, %rcx
181	adcq	%r9, %r9		#shld	\$1, %r8, %r9
182
183	mulq	%rax
184	movq	%rax, (%rsp)
185	addq	%rdx, %r8
186	adcq	\$0, %r9
187
188	movq	%r8, 8(%rsp)
189	shrq	\$63, %rcx
190
191#second iteration
192	movq	8($inp), %r8
193	movq	16($inp), %rax
194	mulq	%r8
195	addq	%rax, %r10
196	movq	24($inp), %rax
197	movq	%rdx, %rbx
198	adcq	\$0, %rbx
199
200	mulq	%r8
201	addq	%rax, %r11
202	movq	32($inp), %rax
203	adcq	\$0, %rdx
204	addq	%rbx, %r11
205	movq	%rdx, %rbx
206	adcq	\$0, %rbx
207
208	mulq	%r8
209	addq	%rax, %r12
210	movq	40($inp), %rax
211	adcq	\$0, %rdx
212	addq	%rbx, %r12
213	movq	%rdx, %rbx
214	adcq	\$0, %rbx
215
216	mulq	%r8
217	addq	%rax, %r13
218	movq	48($inp), %rax
219	adcq	\$0, %rdx
220	addq	%rbx, %r13
221	movq	%rdx, %rbx
222	adcq	\$0, %rbx
223
224	mulq	%r8
225	addq	%rax, %r14
226	movq	56($inp), %rax
227	adcq	\$0, %rdx
228	addq	%rbx, %r14
229	movq	%rdx, %rbx
230	adcq	\$0, %rbx
231
232	mulq	%r8
233	addq	%rax, %r15
234	movq	%r8, %rax
235	adcq	\$0, %rdx
236	addq	%rbx, %r15
237	movq	%rdx, %r8
238	movq	%r10, %rdx
239	adcq	\$0, %r8
240
241	add	%rdx, %rdx
242	lea	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
243	movq	%r11, %rbx
244	adcq	%r11, %r11		#shld	\$1, %r10, %r11
245
246	mulq	%rax
247	addq	%rax, %r9
248	adcq	%rdx, %r10
249	adcq	\$0, %r11
250
251	movq	%r9, 16(%rsp)
252	movq	%r10, 24(%rsp)
253	shrq	\$63, %rbx
254
255#third iteration
256	movq	16($inp), %r9
257	movq	24($inp), %rax
258	mulq	%r9
259	addq	%rax, %r12
260	movq	32($inp), %rax
261	movq	%rdx, %rcx
262	adcq	\$0, %rcx
263
264	mulq	%r9
265	addq	%rax, %r13
266	movq	40($inp), %rax
267	adcq	\$0, %rdx
268	addq	%rcx, %r13
269	movq	%rdx, %rcx
270	adcq	\$0, %rcx
271
272	mulq	%r9
273	addq	%rax, %r14
274	movq	48($inp), %rax
275	adcq	\$0, %rdx
276	addq	%rcx, %r14
277	movq	%rdx, %rcx
278	adcq	\$0, %rcx
279
280	mulq	%r9
281	 movq	%r12, %r10
282	 lea	(%rbx,%r12,2), %r12	#shld	\$1, %rbx, %r12
283	addq	%rax, %r15
284	movq	56($inp), %rax
285	adcq	\$0, %rdx
286	addq	%rcx, %r15
287	movq	%rdx, %rcx
288	adcq	\$0, %rcx
289
290	mulq	%r9
291	 shrq	\$63, %r10
292	addq	%rax, %r8
293	movq	%r9, %rax
294	adcq	\$0, %rdx
295	addq	%rcx, %r8
296	movq	%rdx, %r9
297	adcq	\$0, %r9
298
299	movq	%r13, %rcx
300	leaq	(%r10,%r13,2), %r13	#shld	\$1, %r12, %r13
301
302	mulq	%rax
303	addq	%rax, %r11
304	adcq	%rdx, %r12
305	adcq	\$0, %r13
306
307	movq	%r11, 32(%rsp)
308	movq	%r12, 40(%rsp)
309	shrq	\$63, %rcx
310
311#fourth iteration
312	movq	24($inp), %r10
313	movq	32($inp), %rax
314	mulq	%r10
315	addq	%rax, %r14
316	movq	40($inp), %rax
317	movq	%rdx, %rbx
318	adcq	\$0, %rbx
319
320	mulq	%r10
321	addq	%rax, %r15
322	movq	48($inp), %rax
323	adcq	\$0, %rdx
324	addq	%rbx, %r15
325	movq	%rdx, %rbx
326	adcq	\$0, %rbx
327
328	mulq	%r10
329	 movq	%r14, %r12
330	 leaq	(%rcx,%r14,2), %r14	#shld	\$1, %rcx, %r14
331	addq	%rax, %r8
332	movq	56($inp), %rax
333	adcq	\$0, %rdx
334	addq	%rbx, %r8
335	movq	%rdx, %rbx
336	adcq	\$0, %rbx
337
338	mulq	%r10
339	 shrq	\$63, %r12
340	addq	%rax, %r9
341	movq	%r10, %rax
342	adcq	\$0, %rdx
343	addq	%rbx, %r9
344	movq	%rdx, %r10
345	adcq	\$0, %r10
346
347	movq	%r15, %rbx
348	leaq	(%r12,%r15,2),%r15	#shld	\$1, %r14, %r15
349
350	mulq	%rax
351	addq	%rax, %r13
352	adcq	%rdx, %r14
353	adcq	\$0, %r15
354
355	movq	%r13, 48(%rsp)
356	movq	%r14, 56(%rsp)
357	shrq	\$63, %rbx
358
359#fifth iteration
360	movq	32($inp), %r11
361	movq	40($inp), %rax
362	mulq	%r11
363	addq	%rax, %r8
364	movq	48($inp), %rax
365	movq	%rdx, %rcx
366	adcq	\$0, %rcx
367
368	mulq	%r11
369	addq	%rax, %r9
370	movq	56($inp), %rax
371	adcq	\$0, %rdx
372	 movq	%r8, %r12
373	 leaq	(%rbx,%r8,2), %r8	#shld	\$1, %rbx, %r8
374	addq	%rcx, %r9
375	movq	%rdx, %rcx
376	adcq	\$0, %rcx
377
378	mulq	%r11
379	 shrq	\$63, %r12
380	addq	%rax, %r10
381	movq	%r11, %rax
382	adcq	\$0, %rdx
383	addq	%rcx, %r10
384	movq	%rdx, %r11
385	adcq	\$0, %r11
386
387	movq	%r9, %rcx
388	leaq	(%r12,%r9,2), %r9	#shld	\$1, %r8, %r9
389
390	mulq	%rax
391	addq	%rax, %r15
392	adcq	%rdx, %r8
393	adcq	\$0, %r9
394
395	movq	%r15, 64(%rsp)
396	movq	%r8, 72(%rsp)
397	shrq	\$63, %rcx
398
399#sixth iteration
400	movq	40($inp), %r12
401	movq	48($inp), %rax
402	mulq	%r12
403	addq	%rax, %r10
404	movq	56($inp), %rax
405	movq	%rdx, %rbx
406	adcq	\$0, %rbx
407
408	mulq	%r12
409	addq	%rax, %r11
410	movq	%r12, %rax
411	 movq	%r10, %r15
412	 leaq	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
413	adcq	\$0, %rdx
414	 shrq	\$63, %r15
415	addq	%rbx, %r11
416	movq	%rdx, %r12
417	adcq	\$0, %r12
418
419	movq	%r11, %rbx
420	leaq	(%r15,%r11,2), %r11	#shld	\$1, %r10, %r11
421
422	mulq	%rax
423	addq	%rax, %r9
424	adcq	%rdx, %r10
425	adcq	\$0, %r11
426
427	movq	%r9, 80(%rsp)
428	movq	%r10, 88(%rsp)
429
430#seventh iteration
431	movq	48($inp), %r13
432	movq	56($inp), %rax
433	mulq	%r13
434	addq	%rax, %r12
435	movq	%r13, %rax
436	movq	%rdx, %r13
437	adcq	\$0, %r13
438
439	xorq	%r14, %r14
440	shlq	\$1, %rbx
441	adcq	%r12, %r12		#shld	\$1, %rbx, %r12
442	adcq	%r13, %r13		#shld	\$1, %r12, %r13
443	adcq	%r14, %r14		#shld	\$1, %r13, %r14
444
445	mulq	%rax
446	addq	%rax, %r11
447	adcq	%rdx, %r12
448	adcq	\$0, %r13
449
450	movq	%r11, 96(%rsp)
451	movq	%r12, 104(%rsp)
452
453#eighth iteration
454	movq	56($inp), %rax
455	mulq	%rax
456	addq	%rax, %r13
457	adcq	\$0, %rdx
458
459	addq	%rdx, %r14
460
461	movq	%r13, 112(%rsp)
462	movq	%r14, 120(%rsp)
463
464	movq	(%rsp), %r8
465	movq	8(%rsp), %r9
466	movq	16(%rsp), %r10
467	movq	24(%rsp), %r11
468	movq	32(%rsp), %r12
469	movq	40(%rsp), %r13
470	movq	48(%rsp), %r14
471	movq	56(%rsp), %r15
472
473	call	__rsaz_512_reduce
474
475	addq	64(%rsp), %r8
476	adcq	72(%rsp), %r9
477	adcq	80(%rsp), %r10
478	adcq	88(%rsp), %r11
479	adcq	96(%rsp), %r12
480	adcq	104(%rsp), %r13
481	adcq	112(%rsp), %r14
482	adcq	120(%rsp), %r15
483	sbbq	%rcx, %rcx
484
485	call	__rsaz_512_subtract
486
487	movq	%r8, %rdx
488	movq	%r9, %rax
489	movl	128+8(%rsp), $times
490	movq	$out, $inp
491
492	decl	$times
493	jnz	.Loop_sqr
494___
495if ($addx) {
496$code.=<<___;
497	jmp	.Lsqr_tail
498
499.align	32
500.Loop_sqrx:
501	movl	$times,128+8(%rsp)
502	movq	$out, %xmm0		# off-load
503	movq	%rbp, %xmm1		# off-load
504#first iteration
505	mulx	%rax, %r8, %r9
506
507	mulx	16($inp), %rcx, %r10
508	xor	%rbp, %rbp		# cf=0, of=0
509
510	mulx	24($inp), %rax, %r11
511	adcx	%rcx, %r9
512
513	mulx	32($inp), %rcx, %r12
514	adcx	%rax, %r10
515
516	mulx	40($inp), %rax, %r13
517	adcx	%rcx, %r11
518
519	.byte	0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($inp), %rcx, %r14
520	adcx	%rax, %r12
521	adcx	%rcx, %r13
522
523	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r15
524	adcx	%rax, %r14
525	adcx	%rbp, %r15		# %rbp is 0
526
527	mov	%r9, %rcx
528	shld	\$1, %r8, %r9
529	shl	\$1, %r8
530
531	xor	%ebp, %ebp
532	mulx	%rdx, %rax, %rdx
533	adcx	%rdx, %r8
534	 mov	8($inp), %rdx
535	adcx	%rbp, %r9
536
537	mov	%rax, (%rsp)
538	mov	%r8, 8(%rsp)
539
540#second iteration
541	mulx	16($inp), %rax, %rbx
542	adox	%rax, %r10
543	adcx	%rbx, %r11
544
545	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r8
546	adox	$out, %r11
547	adcx	%r8, %r12
548
549	mulx	32($inp), %rax, %rbx
550	adox	%rax, %r12
551	adcx	%rbx, %r13
552
553	mulx	40($inp), $out, %r8
554	adox	$out, %r13
555	adcx	%r8, %r14
556
557	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
558	adox	%rax, %r14
559	adcx	%rbx, %r15
560
561	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
562	adox	$out, %r15
563	adcx	%rbp, %r8
564	adox	%rbp, %r8
565
566	mov	%r11, %rbx
567	shld	\$1, %r10, %r11
568	shld	\$1, %rcx, %r10
569
570	xor	%ebp,%ebp
571	mulx	%rdx, %rax, %rcx
572	 mov	16($inp), %rdx
573	adcx	%rax, %r9
574	adcx	%rcx, %r10
575	adcx	%rbp, %r11
576
577	mov	%r9, 16(%rsp)
578	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)
579
580#third iteration
581	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r9
582	adox	$out, %r12
583	adcx	%r9, %r13
584
585	mulx	32($inp), %rax, %rcx
586	adox	%rax, %r13
587	adcx	%rcx, %r14
588
589	mulx	40($inp), $out, %r9
590	adox	$out, %r14
591	adcx	%r9, %r15
592
593	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
594	adox	%rax, %r15
595	adcx	%rcx, %r8
596
597	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r9
598	adox	$out, %r8
599	adcx	%rbp, %r9
600	adox	%rbp, %r9
601
602	mov	%r13, %rcx
603	shld	\$1, %r12, %r13
604	shld	\$1, %rbx, %r12
605
606	xor	%ebp, %ebp
607	mulx	%rdx, %rax, %rdx
608	adcx	%rax, %r11
609	adcx	%rdx, %r12
610	 mov	24($inp), %rdx
611	adcx	%rbp, %r13
612
613	mov	%r11, 32(%rsp)
614	.byte	0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00		# mov	%r12, 40(%rsp)
615
616#fourth iteration
617	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00	# mulx	32($inp), %rax, %rbx
618	adox	%rax, %r14
619	adcx	%rbx, %r15
620
621	mulx	40($inp), $out, %r10
622	adox	$out, %r15
623	adcx	%r10, %r8
624
625	mulx	48($inp), %rax, %rbx
626	adox	%rax, %r8
627	adcx	%rbx, %r9
628
629	mulx	56($inp), $out, %r10
630	adox	$out, %r9
631	adcx	%rbp, %r10
632	adox	%rbp, %r10
633
634	.byte	0x66
635	mov	%r15, %rbx
636	shld	\$1, %r14, %r15
637	shld	\$1, %rcx, %r14
638
639	xor	%ebp, %ebp
640	mulx	%rdx, %rax, %rdx
641	adcx	%rax, %r13
642	adcx	%rdx, %r14
643	 mov	32($inp), %rdx
644	adcx	%rbp, %r15
645
646	mov	%r13, 48(%rsp)
647	mov	%r14, 56(%rsp)
648
649#fifth iteration
650	.byte	0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r11
651	adox	$out, %r8
652	adcx	%r11, %r9
653
654	mulx	48($inp), %rax, %rcx
655	adox	%rax, %r9
656	adcx	%rcx, %r10
657
658	mulx	56($inp), $out, %r11
659	adox	$out, %r10
660	adcx	%rbp, %r11
661	adox	%rbp, %r11
662
663	mov	%r9, %rcx
664	shld	\$1, %r8, %r9
665	shld	\$1, %rbx, %r8
666
667	xor	%ebp, %ebp
668	mulx	%rdx, %rax, %rdx
669	adcx	%rax, %r15
670	adcx	%rdx, %r8
671	 mov	40($inp), %rdx
672	adcx	%rbp, %r9
673
674	mov	%r15, 64(%rsp)
675	mov	%r8, 72(%rsp)
676
677#sixth iteration
678	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
679	adox	%rax, %r10
680	adcx	%rbx, %r11
681
682	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
683	adox	$out, %r11
684	adcx	%rbp, %r12
685	adox	%rbp, %r12
686
687	mov	%r11, %rbx
688	shld	\$1, %r10, %r11
689	shld	\$1, %rcx, %r10
690
691	xor	%ebp, %ebp
692	mulx	%rdx, %rax, %rdx
693	adcx	%rax, %r9
694	adcx	%rdx, %r10
695	 mov	48($inp), %rdx
696	adcx	%rbp, %r11
697
698	mov	%r9, 80(%rsp)
699	mov	%r10, 88(%rsp)
700
701#seventh iteration
702	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
703	adox	%rax, %r12
704	adox	%rbp, %r13
705
706	xor	%r14, %r14
707	shld	\$1, %r13, %r14
708	shld	\$1, %r12, %r13
709	shld	\$1, %rbx, %r12
710
711	xor	%ebp, %ebp
712	mulx	%rdx, %rax, %rdx
713	adcx	%rax, %r11
714	adcx	%rdx, %r12
715	 mov	56($inp), %rdx
716	adcx	%rbp, %r13
717
718	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
719	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)
720
721#eighth iteration
722	mulx	%rdx, %rax, %rdx
723	adox	%rax, %r13
724	adox	%rbp, %rdx
725
726	.byte	0x66
727	add	%rdx, %r14
728
729	movq	%r13, 112(%rsp)
730	movq	%r14, 120(%rsp)
731	movq	%xmm0, $out
732	movq	%xmm1, %rbp
733
734	movq	128(%rsp), %rdx		# pull $n0
735	movq	(%rsp), %r8
736	movq	8(%rsp), %r9
737	movq	16(%rsp), %r10
738	movq	24(%rsp), %r11
739	movq	32(%rsp), %r12
740	movq	40(%rsp), %r13
741	movq	48(%rsp), %r14
742	movq	56(%rsp), %r15
743
744	call	__rsaz_512_reducex
745
746	addq	64(%rsp), %r8
747	adcq	72(%rsp), %r9
748	adcq	80(%rsp), %r10
749	adcq	88(%rsp), %r11
750	adcq	96(%rsp), %r12
751	adcq	104(%rsp), %r13
752	adcq	112(%rsp), %r14
753	adcq	120(%rsp), %r15
754	sbbq	%rcx, %rcx
755
756	call	__rsaz_512_subtract
757
758	movq	%r8, %rdx
759	movq	%r9, %rax
760	movl	128+8(%rsp), $times
761	movq	$out, $inp
762
763	decl	$times
764	jnz	.Loop_sqrx
765
766.Lsqr_tail:
767___
768}
769$code.=<<___;
770
771	leaq	128+24+48(%rsp), %rax
772.cfi_def_cfa	%rax,8
773	movq	-48(%rax), %r15
774.cfi_restore	%r15
775	movq	-40(%rax), %r14
776.cfi_restore	%r14
777	movq	-32(%rax), %r13
778.cfi_restore	%r13
779	movq	-24(%rax), %r12
780.cfi_restore	%r12
781	movq	-16(%rax), %rbp
782.cfi_restore	%rbp
783	movq	-8(%rax), %rbx
784.cfi_restore	%rbx
785	leaq	(%rax), %rsp
786.cfi_def_cfa_register	%rsp
787.Lsqr_epilogue:
788	ret
789.cfi_endproc
790.size	rsaz_512_sqr,.-rsaz_512_sqr
791___
792}
793{
794my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
795$code.=<<___;
796.globl	rsaz_512_mul
797.type	rsaz_512_mul,\@function,5
798.align	32
799rsaz_512_mul:
800.cfi_startproc
801	push	%rbx
802.cfi_push	%rbx
803	push	%rbp
804.cfi_push	%rbp
805	push	%r12
806.cfi_push	%r12
807	push	%r13
808.cfi_push	%r13
809	push	%r14
810.cfi_push	%r14
811	push	%r15
812.cfi_push	%r15
813
814	subq	\$128+24, %rsp
815.cfi_adjust_cfa_offset	128+24
816.Lmul_body:
817	movq	$out, %xmm0		# off-load arguments
818	movq	$mod, %xmm1
819	movq	$n0, 128(%rsp)
820___
821$code.=<<___ if ($addx);
822	movl	\$0x80100,%r11d
823	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
824	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
825	je	.Lmulx
826___
827$code.=<<___;
828	movq	($bp), %rbx		# pass b[0]
829	movq	$bp, %rbp		# pass argument
830	call	__rsaz_512_mul
831
832	movq	%xmm0, $out
833	movq	%xmm1, %rbp
834
835	movq	(%rsp), %r8
836	movq	8(%rsp), %r9
837	movq	16(%rsp), %r10
838	movq	24(%rsp), %r11
839	movq	32(%rsp), %r12
840	movq	40(%rsp), %r13
841	movq	48(%rsp), %r14
842	movq	56(%rsp), %r15
843
844	call	__rsaz_512_reduce
845___
846$code.=<<___ if ($addx);
847	jmp	.Lmul_tail
848
849.align	32
850.Lmulx:
851	movq	$bp, %rbp		# pass argument
852	movq	($bp), %rdx		# pass b[0]
853	call	__rsaz_512_mulx
854
855	movq	%xmm0, $out
856	movq	%xmm1, %rbp
857
858	movq	128(%rsp), %rdx		# pull $n0
859	movq	(%rsp), %r8
860	movq	8(%rsp), %r9
861	movq	16(%rsp), %r10
862	movq	24(%rsp), %r11
863	movq	32(%rsp), %r12
864	movq	40(%rsp), %r13
865	movq	48(%rsp), %r14
866	movq	56(%rsp), %r15
867
868	call	__rsaz_512_reducex
869.Lmul_tail:
870___
871$code.=<<___;
872	addq	64(%rsp), %r8
873	adcq	72(%rsp), %r9
874	adcq	80(%rsp), %r10
875	adcq	88(%rsp), %r11
876	adcq	96(%rsp), %r12
877	adcq	104(%rsp), %r13
878	adcq	112(%rsp), %r14
879	adcq	120(%rsp), %r15
880	sbbq	%rcx, %rcx
881
882	call	__rsaz_512_subtract
883
884	leaq	128+24+48(%rsp), %rax
885.cfi_def_cfa	%rax,8
886	movq	-48(%rax), %r15
887.cfi_restore	%r15
888	movq	-40(%rax), %r14
889.cfi_restore	%r14
890	movq	-32(%rax), %r13
891.cfi_restore	%r13
892	movq	-24(%rax), %r12
893.cfi_restore	%r12
894	movq	-16(%rax), %rbp
895.cfi_restore	%rbp
896	movq	-8(%rax), %rbx
897.cfi_restore	%rbx
898	leaq	(%rax), %rsp
899.cfi_def_cfa_register	%rsp
900.Lmul_epilogue:
901	ret
902.cfi_endproc
903.size	rsaz_512_mul,.-rsaz_512_mul
904___
905}
906{
907my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
908$code.=<<___;
909.globl	rsaz_512_mul_gather4
910.type	rsaz_512_mul_gather4,\@function,6
911.align	32
912rsaz_512_mul_gather4:
913.cfi_startproc
914	push	%rbx
915.cfi_push	%rbx
916	push	%rbp
917.cfi_push	%rbp
918	push	%r12
919.cfi_push	%r12
920	push	%r13
921.cfi_push	%r13
922	push	%r14
923.cfi_push	%r14
924	push	%r15
925.cfi_push	%r15
926
927	subq	\$`128+24+($win64?0xb0:0)`, %rsp
928.cfi_adjust_cfa_offset	`128+24+($win64?0xb0:0)`
929___
930$code.=<<___	if ($win64);
931	movaps	%xmm6,0xa0(%rsp)
932	movaps	%xmm7,0xb0(%rsp)
933	movaps	%xmm8,0xc0(%rsp)
934	movaps	%xmm9,0xd0(%rsp)
935	movaps	%xmm10,0xe0(%rsp)
936	movaps	%xmm11,0xf0(%rsp)
937	movaps	%xmm12,0x100(%rsp)
938	movaps	%xmm13,0x110(%rsp)
939	movaps	%xmm14,0x120(%rsp)
940	movaps	%xmm15,0x130(%rsp)
941___
942$code.=<<___;
943.Lmul_gather4_body:
944	movd	$pwr,%xmm8
945	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
946	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
947
948	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
949	movdqa	%xmm1,%xmm7
950	movdqa	%xmm1,%xmm2
951___
952########################################################################
953# calculate mask by comparing 0..15 to $power
954#
955for($i=0;$i<4;$i++) {
956$code.=<<___;
957	paddd	%xmm`$i`,%xmm`$i+1`
958	pcmpeqd	%xmm8,%xmm`$i`
959	movdqa	%xmm7,%xmm`$i+3`
960___
961}
962for(;$i<7;$i++) {
963$code.=<<___;
964	paddd	%xmm`$i`,%xmm`$i+1`
965	pcmpeqd	%xmm8,%xmm`$i`
966___
967}
968$code.=<<___;
969	pcmpeqd	%xmm8,%xmm7
970
971	movdqa	16*0($bp),%xmm8
972	movdqa	16*1($bp),%xmm9
973	movdqa	16*2($bp),%xmm10
974	movdqa	16*3($bp),%xmm11
975	pand	%xmm0,%xmm8
976	movdqa	16*4($bp),%xmm12
977	pand	%xmm1,%xmm9
978	movdqa	16*5($bp),%xmm13
979	pand	%xmm2,%xmm10
980	movdqa	16*6($bp),%xmm14
981	pand	%xmm3,%xmm11
982	movdqa	16*7($bp),%xmm15
983	leaq	128($bp), %rbp
984	pand	%xmm4,%xmm12
985	pand	%xmm5,%xmm13
986	pand	%xmm6,%xmm14
987	pand	%xmm7,%xmm15
988	por	%xmm10,%xmm8
989	por	%xmm11,%xmm9
990	por	%xmm12,%xmm8
991	por	%xmm13,%xmm9
992	por	%xmm14,%xmm8
993	por	%xmm15,%xmm9
994
995	por	%xmm9,%xmm8
996	pshufd	\$0x4e,%xmm8,%xmm9
997	por	%xmm9,%xmm8
998___
999$code.=<<___ if ($addx);
1000	movl	\$0x80100,%r11d
1001	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
1002	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
1003	je	.Lmulx_gather
1004___
1005$code.=<<___;
1006	movq	%xmm8,%rbx
1007
1008	movq	$n0, 128(%rsp)		# off-load arguments
1009	movq	$out, 128+8(%rsp)
1010	movq	$mod, 128+16(%rsp)
1011
1012	movq	($ap), %rax
1013	 movq	8($ap), %rcx
1014	mulq	%rbx			# 0 iteration
1015	movq	%rax, (%rsp)
1016	movq	%rcx, %rax
1017	movq	%rdx, %r8
1018
1019	mulq	%rbx
1020	addq	%rax, %r8
1021	movq	16($ap), %rax
1022	movq	%rdx, %r9
1023	adcq	\$0, %r9
1024
1025	mulq	%rbx
1026	addq	%rax, %r9
1027	movq	24($ap), %rax
1028	movq	%rdx, %r10
1029	adcq	\$0, %r10
1030
1031	mulq	%rbx
1032	addq	%rax, %r10
1033	movq	32($ap), %rax
1034	movq	%rdx, %r11
1035	adcq	\$0, %r11
1036
1037	mulq	%rbx
1038	addq	%rax, %r11
1039	movq	40($ap), %rax
1040	movq	%rdx, %r12
1041	adcq	\$0, %r12
1042
1043	mulq	%rbx
1044	addq	%rax, %r12
1045	movq	48($ap), %rax
1046	movq	%rdx, %r13
1047	adcq	\$0, %r13
1048
1049	mulq	%rbx
1050	addq	%rax, %r13
1051	movq	56($ap), %rax
1052	movq	%rdx, %r14
1053	adcq	\$0, %r14
1054
1055	mulq	%rbx
1056	addq	%rax, %r14
1057	 movq	($ap), %rax
1058	movq	%rdx, %r15
1059	adcq	\$0, %r15
1060
1061	leaq	8(%rsp), %rdi
1062	movl	\$7, %ecx
1063	jmp	.Loop_mul_gather
1064
1065.align	32
1066.Loop_mul_gather:
1067	movdqa	16*0(%rbp),%xmm8
1068	movdqa	16*1(%rbp),%xmm9
1069	movdqa	16*2(%rbp),%xmm10
1070	movdqa	16*3(%rbp),%xmm11
1071	pand	%xmm0,%xmm8
1072	movdqa	16*4(%rbp),%xmm12
1073	pand	%xmm1,%xmm9
1074	movdqa	16*5(%rbp),%xmm13
1075	pand	%xmm2,%xmm10
1076	movdqa	16*6(%rbp),%xmm14
1077	pand	%xmm3,%xmm11
1078	movdqa	16*7(%rbp),%xmm15
1079	leaq	128(%rbp), %rbp
1080	pand	%xmm4,%xmm12
1081	pand	%xmm5,%xmm13
1082	pand	%xmm6,%xmm14
1083	pand	%xmm7,%xmm15
1084	por	%xmm10,%xmm8
1085	por	%xmm11,%xmm9
1086	por	%xmm12,%xmm8
1087	por	%xmm13,%xmm9
1088	por	%xmm14,%xmm8
1089	por	%xmm15,%xmm9
1090
1091	por	%xmm9,%xmm8
1092	pshufd	\$0x4e,%xmm8,%xmm9
1093	por	%xmm9,%xmm8
1094	movq	%xmm8,%rbx
1095
1096	mulq	%rbx
1097	addq	%rax, %r8
1098	movq	8($ap), %rax
1099	movq	%r8, (%rdi)
1100	movq	%rdx, %r8
1101	adcq	\$0, %r8
1102
1103	mulq	%rbx
1104	addq	%rax, %r9
1105	movq	16($ap), %rax
1106	adcq	\$0, %rdx
1107	addq	%r9, %r8
1108	movq	%rdx, %r9
1109	adcq	\$0, %r9
1110
1111	mulq	%rbx
1112	addq	%rax, %r10
1113	movq	24($ap), %rax
1114	adcq	\$0, %rdx
1115	addq	%r10, %r9
1116	movq	%rdx, %r10
1117	adcq	\$0, %r10
1118
1119	mulq	%rbx
1120	addq	%rax, %r11
1121	movq	32($ap), %rax
1122	adcq	\$0, %rdx
1123	addq	%r11, %r10
1124	movq	%rdx, %r11
1125	adcq	\$0, %r11
1126
1127	mulq	%rbx
1128	addq	%rax, %r12
1129	movq	40($ap), %rax
1130	adcq	\$0, %rdx
1131	addq	%r12, %r11
1132	movq	%rdx, %r12
1133	adcq	\$0, %r12
1134
1135	mulq	%rbx
1136	addq	%rax, %r13
1137	movq	48($ap), %rax
1138	adcq	\$0, %rdx
1139	addq	%r13, %r12
1140	movq	%rdx, %r13
1141	adcq	\$0, %r13
1142
1143	mulq	%rbx
1144	addq	%rax, %r14
1145	movq	56($ap), %rax
1146	adcq	\$0, %rdx
1147	addq	%r14, %r13
1148	movq	%rdx, %r14
1149	adcq	\$0, %r14
1150
1151	mulq	%rbx
1152	addq	%rax, %r15
1153	 movq	($ap), %rax
1154	adcq	\$0, %rdx
1155	addq	%r15, %r14
1156	movq	%rdx, %r15
1157	adcq	\$0, %r15
1158
1159	leaq	8(%rdi), %rdi
1160
1161	decl	%ecx
1162	jnz	.Loop_mul_gather
1163
1164	movq	%r8, (%rdi)
1165	movq	%r9, 8(%rdi)
1166	movq	%r10, 16(%rdi)
1167	movq	%r11, 24(%rdi)
1168	movq	%r12, 32(%rdi)
1169	movq	%r13, 40(%rdi)
1170	movq	%r14, 48(%rdi)
1171	movq	%r15, 56(%rdi)
1172
1173	movq	128+8(%rsp), $out
1174	movq	128+16(%rsp), %rbp
1175
1176	movq	(%rsp), %r8
1177	movq	8(%rsp), %r9
1178	movq	16(%rsp), %r10
1179	movq	24(%rsp), %r11
1180	movq	32(%rsp), %r12
1181	movq	40(%rsp), %r13
1182	movq	48(%rsp), %r14
1183	movq	56(%rsp), %r15
1184
1185	call	__rsaz_512_reduce
1186___
1187$code.=<<___ if ($addx);
1188	jmp	.Lmul_gather_tail
1189
1190.align	32
1191.Lmulx_gather:
1192	movq	%xmm8,%rdx
1193
1194	mov	$n0, 128(%rsp)		# off-load arguments
1195	mov	$out, 128+8(%rsp)
1196	mov	$mod, 128+16(%rsp)
1197
1198	mulx	($ap), %rbx, %r8	# 0 iteration
1199	mov	%rbx, (%rsp)
1200	xor	%edi, %edi		# cf=0, of=0
1201
1202	mulx	8($ap), %rax, %r9
1203
1204	mulx	16($ap), %rbx, %r10
1205	adcx	%rax, %r8
1206
1207	mulx	24($ap), %rax, %r11
1208	adcx	%rbx, %r9
1209
1210	mulx	32($ap), %rbx, %r12
1211	adcx	%rax, %r10
1212
1213	mulx	40($ap), %rax, %r13
1214	adcx	%rbx, %r11
1215
1216	mulx	48($ap), %rbx, %r14
1217	adcx	%rax, %r12
1218
1219	mulx	56($ap), %rax, %r15
1220	adcx	%rbx, %r13
1221	adcx	%rax, %r14
1222	.byte	0x67
1223	mov	%r8, %rbx
1224	adcx	%rdi, %r15		# %rdi is 0
1225
1226	mov	\$-7, %rcx
1227	jmp	.Loop_mulx_gather
1228
1229.align	32
1230.Loop_mulx_gather:
1231	movdqa	16*0(%rbp),%xmm8
1232	movdqa	16*1(%rbp),%xmm9
1233	movdqa	16*2(%rbp),%xmm10
1234	movdqa	16*3(%rbp),%xmm11
1235	pand	%xmm0,%xmm8
1236	movdqa	16*4(%rbp),%xmm12
1237	pand	%xmm1,%xmm9
1238	movdqa	16*5(%rbp),%xmm13
1239	pand	%xmm2,%xmm10
1240	movdqa	16*6(%rbp),%xmm14
1241	pand	%xmm3,%xmm11
1242	movdqa	16*7(%rbp),%xmm15
1243	leaq	128(%rbp), %rbp
1244	pand	%xmm4,%xmm12
1245	pand	%xmm5,%xmm13
1246	pand	%xmm6,%xmm14
1247	pand	%xmm7,%xmm15
1248	por	%xmm10,%xmm8
1249	por	%xmm11,%xmm9
1250	por	%xmm12,%xmm8
1251	por	%xmm13,%xmm9
1252	por	%xmm14,%xmm8
1253	por	%xmm15,%xmm9
1254
1255	por	%xmm9,%xmm8
1256	pshufd	\$0x4e,%xmm8,%xmm9
1257	por	%xmm9,%xmm8
1258	movq	%xmm8,%rdx
1259
1260	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx	($ap), %rax, %r8
1261	adcx	%rax, %rbx
1262	adox	%r9, %r8
1263
1264	mulx	8($ap), %rax, %r9
1265	adcx	%rax, %r8
1266	adox	%r10, %r9
1267
1268	mulx	16($ap), %rax, %r10
1269	adcx	%rax, %r9
1270	adox	%r11, %r10
1271
1272	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
1273	adcx	%rax, %r10
1274	adox	%r12, %r11
1275
1276	mulx	32($ap), %rax, %r12
1277	adcx	%rax, %r11
1278	adox	%r13, %r12
1279
1280	mulx	40($ap), %rax, %r13
1281	adcx	%rax, %r12
1282	adox	%r14, %r13
1283
1284	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
1285	adcx	%rax, %r13
1286	.byte	0x67
1287	adox	%r15, %r14
1288
1289	mulx	56($ap), %rax, %r15
1290	 mov	%rbx, 64(%rsp,%rcx,8)
1291	adcx	%rax, %r14
1292	adox	%rdi, %r15
1293	mov	%r8, %rbx
1294	adcx	%rdi, %r15		# cf=0
1295
1296	inc	%rcx			# of=0
1297	jnz	.Loop_mulx_gather
1298
1299	mov	%r8, 64(%rsp)
1300	mov	%r9, 64+8(%rsp)
1301	mov	%r10, 64+16(%rsp)
1302	mov	%r11, 64+24(%rsp)
1303	mov	%r12, 64+32(%rsp)
1304	mov	%r13, 64+40(%rsp)
1305	mov	%r14, 64+48(%rsp)
1306	mov	%r15, 64+56(%rsp)
1307
1308	mov	128(%rsp), %rdx		# pull arguments
1309	mov	128+8(%rsp), $out
1310	mov	128+16(%rsp), %rbp
1311
1312	mov	(%rsp), %r8
1313	mov	8(%rsp), %r9
1314	mov	16(%rsp), %r10
1315	mov	24(%rsp), %r11
1316	mov	32(%rsp), %r12
1317	mov	40(%rsp), %r13
1318	mov	48(%rsp), %r14
1319	mov	56(%rsp), %r15
1320
1321	call	__rsaz_512_reducex
1322
1323.Lmul_gather_tail:
1324___
1325$code.=<<___;
1326	addq	64(%rsp), %r8
1327	adcq	72(%rsp), %r9
1328	adcq	80(%rsp), %r10
1329	adcq	88(%rsp), %r11
1330	adcq	96(%rsp), %r12
1331	adcq	104(%rsp), %r13
1332	adcq	112(%rsp), %r14
1333	adcq	120(%rsp), %r15
1334	sbbq	%rcx, %rcx
1335
1336	call	__rsaz_512_subtract
1337
1338	leaq	128+24+48(%rsp), %rax
1339___
1340$code.=<<___	if ($win64);
1341	movaps	0xa0-0xc8(%rax),%xmm6
1342	movaps	0xb0-0xc8(%rax),%xmm7
1343	movaps	0xc0-0xc8(%rax),%xmm8
1344	movaps	0xd0-0xc8(%rax),%xmm9
1345	movaps	0xe0-0xc8(%rax),%xmm10
1346	movaps	0xf0-0xc8(%rax),%xmm11
1347	movaps	0x100-0xc8(%rax),%xmm12
1348	movaps	0x110-0xc8(%rax),%xmm13
1349	movaps	0x120-0xc8(%rax),%xmm14
1350	movaps	0x130-0xc8(%rax),%xmm15
1351	lea	0xb0(%rax),%rax
1352___
1353$code.=<<___;
1354.cfi_def_cfa	%rax,8
1355	movq	-48(%rax), %r15
1356.cfi_restore	%r15
1357	movq	-40(%rax), %r14
1358.cfi_restore	%r14
1359	movq	-32(%rax), %r13
1360.cfi_restore	%r13
1361	movq	-24(%rax), %r12
1362.cfi_restore	%r12
1363	movq	-16(%rax), %rbp
1364.cfi_restore	%rbp
1365	movq	-8(%rax), %rbx
1366.cfi_restore	%rbx
1367	leaq	(%rax), %rsp
1368.cfi_def_cfa_register	%rsp
1369.Lmul_gather4_epilogue:
1370	ret
1371.cfi_endproc
1372.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1373___
1374}
1375{
1376my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1377$code.=<<___;
1378.globl	rsaz_512_mul_scatter4
1379.type	rsaz_512_mul_scatter4,\@function,6
1380.align	32
1381rsaz_512_mul_scatter4:
1382.cfi_startproc
1383	push	%rbx
1384.cfi_push	%rbx
1385	push	%rbp
1386.cfi_push	%rbp
1387	push	%r12
1388.cfi_push	%r12
1389	push	%r13
1390.cfi_push	%r13
1391	push	%r14
1392.cfi_push	%r14
1393	push	%r15
1394.cfi_push	%r15
1395
1396	mov	$pwr, $pwr
1397	subq	\$128+24, %rsp
1398.cfi_adjust_cfa_offset	128+24
1399.Lmul_scatter4_body:
1400	leaq	($tbl,$pwr,8), $tbl
1401	movq	$out, %xmm0		# off-load arguments
1402	movq	$mod, %xmm1
1403	movq	$tbl, %xmm2
1404	movq	$n0, 128(%rsp)
1405
1406	movq	$out, %rbp
1407___
1408$code.=<<___ if ($addx);
1409	movl	\$0x80100,%r11d
1410	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
1411	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
1412	je	.Lmulx_scatter
1413___
1414$code.=<<___;
1415	movq	($out),%rbx		# pass b[0]
1416	call	__rsaz_512_mul
1417
1418	movq	%xmm0, $out
1419	movq	%xmm1, %rbp
1420
1421	movq	(%rsp), %r8
1422	movq	8(%rsp), %r9
1423	movq	16(%rsp), %r10
1424	movq	24(%rsp), %r11
1425	movq	32(%rsp), %r12
1426	movq	40(%rsp), %r13
1427	movq	48(%rsp), %r14
1428	movq	56(%rsp), %r15
1429
1430	call	__rsaz_512_reduce
1431___
1432$code.=<<___ if ($addx);
1433	jmp	.Lmul_scatter_tail
1434
1435.align	32
1436.Lmulx_scatter:
1437	movq	($out), %rdx		# pass b[0]
1438	call	__rsaz_512_mulx
1439
1440	movq	%xmm0, $out
1441	movq	%xmm1, %rbp
1442
1443	movq	128(%rsp), %rdx		# pull $n0
1444	movq	(%rsp), %r8
1445	movq	8(%rsp), %r9
1446	movq	16(%rsp), %r10
1447	movq	24(%rsp), %r11
1448	movq	32(%rsp), %r12
1449	movq	40(%rsp), %r13
1450	movq	48(%rsp), %r14
1451	movq	56(%rsp), %r15
1452
1453	call	__rsaz_512_reducex
1454
1455.Lmul_scatter_tail:
1456___
1457$code.=<<___;
1458	addq	64(%rsp), %r8
1459	adcq	72(%rsp), %r9
1460	adcq	80(%rsp), %r10
1461	adcq	88(%rsp), %r11
1462	adcq	96(%rsp), %r12
1463	adcq	104(%rsp), %r13
1464	adcq	112(%rsp), %r14
1465	adcq	120(%rsp), %r15
1466	movq	%xmm2, $inp
1467	sbbq	%rcx, %rcx
1468
1469	call	__rsaz_512_subtract
1470
1471	movq	%r8, 128*0($inp)	# scatter
1472	movq	%r9, 128*1($inp)
1473	movq	%r10, 128*2($inp)
1474	movq	%r11, 128*3($inp)
1475	movq	%r12, 128*4($inp)
1476	movq	%r13, 128*5($inp)
1477	movq	%r14, 128*6($inp)
1478	movq	%r15, 128*7($inp)
1479
1480	leaq	128+24+48(%rsp), %rax
1481.cfi_def_cfa	%rax,8
1482	movq	-48(%rax), %r15
1483.cfi_restore	%r15
1484	movq	-40(%rax), %r14
1485.cfi_restore	%r14
1486	movq	-32(%rax), %r13
1487.cfi_restore	%r13
1488	movq	-24(%rax), %r12
1489.cfi_restore	%r12
1490	movq	-16(%rax), %rbp
1491.cfi_restore	%rbp
1492	movq	-8(%rax), %rbx
1493.cfi_restore	%rbx
1494	leaq	(%rax), %rsp
1495.cfi_def_cfa_register	%rsp
1496.Lmul_scatter4_epilogue:
1497	ret
1498.cfi_endproc
1499.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1500___
1501}
1502{
1503my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1504$code.=<<___;
1505.globl	rsaz_512_mul_by_one
1506.type	rsaz_512_mul_by_one,\@function,4
1507.align	32
1508rsaz_512_mul_by_one:
1509.cfi_startproc
1510	push	%rbx
1511.cfi_push	%rbx
1512	push	%rbp
1513.cfi_push	%rbp
1514	push	%r12
1515.cfi_push	%r12
1516	push	%r13
1517.cfi_push	%r13
1518	push	%r14
1519.cfi_push	%r14
1520	push	%r15
1521.cfi_push	%r15
1522
1523	subq	\$128+24, %rsp
1524.cfi_adjust_cfa_offset	128+24
1525.Lmul_by_one_body:
1526___
1527$code.=<<___ if ($addx);
1528	movl	OPENSSL_ia32cap_P+8(%rip),%eax
1529___
1530$code.=<<___;
1531	movq	$mod, %rbp	# reassign argument
1532	movq	$n0, 128(%rsp)
1533
1534	movq	($inp), %r8
1535	pxor	%xmm0, %xmm0
1536	movq	8($inp), %r9
1537	movq	16($inp), %r10
1538	movq	24($inp), %r11
1539	movq	32($inp), %r12
1540	movq	40($inp), %r13
1541	movq	48($inp), %r14
1542	movq	56($inp), %r15
1543
1544	movdqa	%xmm0, (%rsp)
1545	movdqa	%xmm0, 16(%rsp)
1546	movdqa	%xmm0, 32(%rsp)
1547	movdqa	%xmm0, 48(%rsp)
1548	movdqa	%xmm0, 64(%rsp)
1549	movdqa	%xmm0, 80(%rsp)
1550	movdqa	%xmm0, 96(%rsp)
1551___
1552$code.=<<___ if ($addx);
1553	andl	\$0x80100,%eax
1554	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
1555	je	.Lby_one_callx
1556___
1557$code.=<<___;
1558	call	__rsaz_512_reduce
1559___
1560$code.=<<___ if ($addx);
1561	jmp	.Lby_one_tail
1562.align	32
1563.Lby_one_callx:
1564	movq	128(%rsp), %rdx		# pull $n0
1565	call	__rsaz_512_reducex
1566.Lby_one_tail:
1567___
1568$code.=<<___;
1569	movq	%r8, ($out)
1570	movq	%r9, 8($out)
1571	movq	%r10, 16($out)
1572	movq	%r11, 24($out)
1573	movq	%r12, 32($out)
1574	movq	%r13, 40($out)
1575	movq	%r14, 48($out)
1576	movq	%r15, 56($out)
1577
1578	leaq	128+24+48(%rsp), %rax
1579.cfi_def_cfa	%rax,8
1580	movq	-48(%rax), %r15
1581.cfi_restore	%r15
1582	movq	-40(%rax), %r14
1583.cfi_restore	%r14
1584	movq	-32(%rax), %r13
1585.cfi_restore	%r13
1586	movq	-24(%rax), %r12
1587.cfi_restore	%r12
1588	movq	-16(%rax), %rbp
1589.cfi_restore	%rbp
1590	movq	-8(%rax), %rbx
1591.cfi_restore	%rbx
1592	leaq	(%rax), %rsp
1593.cfi_def_cfa_register	%rsp
1594.Lmul_by_one_epilogue:
1595	ret
1596.cfi_endproc
1597.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1598___
1599}
1600{	# __rsaz_512_reduce
1601	#
1602	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
1603	# output:	%r8-%r15
1604	# clobbers:	everything except %rbp and %rdi
1605$code.=<<___;
1606.type	__rsaz_512_reduce,\@abi-omnipotent
1607.align	32
1608__rsaz_512_reduce:
1609	movq	%r8, %rbx
1610	imulq	128+8(%rsp), %rbx
1611	movq	0(%rbp), %rax
1612	movl	\$8, %ecx
1613	jmp	.Lreduction_loop
1614
1615.align	32
1616.Lreduction_loop:
1617	mulq	%rbx
1618	movq	8(%rbp), %rax
1619	negq	%r8
1620	movq	%rdx, %r8
1621	adcq	\$0, %r8
1622
1623	mulq	%rbx
1624	addq	%rax, %r9
1625	movq	16(%rbp), %rax
1626	adcq	\$0, %rdx
1627	addq	%r9, %r8
1628	movq	%rdx, %r9
1629	adcq	\$0, %r9
1630
1631	mulq	%rbx
1632	addq	%rax, %r10
1633	movq	24(%rbp), %rax
1634	adcq	\$0, %rdx
1635	addq	%r10, %r9
1636	movq	%rdx, %r10
1637	adcq	\$0, %r10
1638
1639	mulq	%rbx
1640	addq	%rax, %r11
1641	movq	32(%rbp), %rax
1642	adcq	\$0, %rdx
1643	addq	%r11, %r10
1644	 movq	128+8(%rsp), %rsi
1645	#movq	%rdx, %r11
1646	#adcq	\$0, %r11
1647	adcq	\$0, %rdx
1648	movq	%rdx, %r11
1649
1650	mulq	%rbx
1651	addq	%rax, %r12
1652	movq	40(%rbp), %rax
1653	adcq	\$0, %rdx
1654	 imulq	%r8, %rsi
1655	addq	%r12, %r11
1656	movq	%rdx, %r12
1657	adcq	\$0, %r12
1658
1659	mulq	%rbx
1660	addq	%rax, %r13
1661	movq	48(%rbp), %rax
1662	adcq	\$0, %rdx
1663	addq	%r13, %r12
1664	movq	%rdx, %r13
1665	adcq	\$0, %r13
1666
1667	mulq	%rbx
1668	addq	%rax, %r14
1669	movq	56(%rbp), %rax
1670	adcq	\$0, %rdx
1671	addq	%r14, %r13
1672	movq	%rdx, %r14
1673	adcq	\$0, %r14
1674
1675	mulq	%rbx
1676	 movq	%rsi, %rbx
1677	addq	%rax, %r15
1678	 movq	0(%rbp), %rax
1679	adcq	\$0, %rdx
1680	addq	%r15, %r14
1681	movq	%rdx, %r15
1682	adcq	\$0, %r15
1683
1684	decl	%ecx
1685	jne	.Lreduction_loop
1686
1687	ret
1688.size	__rsaz_512_reduce,.-__rsaz_512_reduce
1689___
1690}
1691if ($addx) {
1692	# __rsaz_512_reducex
1693	#
1694	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
1695	# output:	%r8-%r15
1696	# clobbers:	everything except %rbp and %rdi
1697$code.=<<___;
1698.type	__rsaz_512_reducex,\@abi-omnipotent
1699.align	32
1700__rsaz_512_reducex:
1701	#movq	128+8(%rsp), %rdx		# pull $n0
1702	imulq	%r8, %rdx
1703	xorq	%rsi, %rsi			# cf=0,of=0
1704	movl	\$8, %ecx
1705	jmp	.Lreduction_loopx
1706
1707.align	32
1708.Lreduction_loopx:
1709	mov	%r8, %rbx
1710	mulx	0(%rbp), %rax, %r8
1711	adcx	%rbx, %rax
1712	adox	%r9, %r8
1713
1714	mulx	8(%rbp), %rax, %r9
1715	adcx	%rax, %r8
1716	adox	%r10, %r9
1717
1718	mulx	16(%rbp), %rbx, %r10
1719	adcx	%rbx, %r9
1720	adox	%r11, %r10
1721
1722	mulx	24(%rbp), %rbx, %r11
1723	adcx	%rbx, %r10
1724	adox	%r12, %r11
1725
1726	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
1727	 mov	%rdx, %rax
1728	 mov	%r8, %rdx
1729	adcx	%rbx, %r11
1730	adox	%r13, %r12
1731
1732	 mulx	128+8(%rsp), %rbx, %rdx
1733	 mov	%rax, %rdx
1734
1735	mulx	40(%rbp), %rax, %r13
1736	adcx	%rax, %r12
1737	adox	%r14, %r13
1738
1739	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
1740	adcx	%rax, %r13
1741	adox	%r15, %r14
1742
1743	mulx	56(%rbp), %rax, %r15
1744	 mov	%rbx, %rdx
1745	adcx	%rax, %r14
1746	adox	%rsi, %r15			# %rsi is 0
1747	adcx	%rsi, %r15			# cf=0
1748
1749	decl	%ecx				# of=0
1750	jne	.Lreduction_loopx
1751
1752	ret
1753.size	__rsaz_512_reducex,.-__rsaz_512_reducex
1754___
1755}
1756{	# __rsaz_512_subtract
1757	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1758	# output:
1759	# clobbers: everything but %rdi, %rsi and %rbp
1760$code.=<<___;
1761.type	__rsaz_512_subtract,\@abi-omnipotent
1762.align	32
1763__rsaz_512_subtract:
1764	movq	%r8, ($out)
1765	movq	%r9, 8($out)
1766	movq	%r10, 16($out)
1767	movq	%r11, 24($out)
1768	movq	%r12, 32($out)
1769	movq	%r13, 40($out)
1770	movq	%r14, 48($out)
1771	movq	%r15, 56($out)
1772
1773	movq	0($mod), %r8
1774	movq	8($mod), %r9
1775	negq	%r8
1776	notq	%r9
1777	andq	%rcx, %r8
1778	movq	16($mod), %r10
1779	andq	%rcx, %r9
1780	notq	%r10
1781	movq	24($mod), %r11
1782	andq	%rcx, %r10
1783	notq	%r11
1784	movq	32($mod), %r12
1785	andq	%rcx, %r11
1786	notq	%r12
1787	movq	40($mod), %r13
1788	andq	%rcx, %r12
1789	notq	%r13
1790	movq	48($mod), %r14
1791	andq	%rcx, %r13
1792	notq	%r14
1793	movq	56($mod), %r15
1794	andq	%rcx, %r14
1795	notq	%r15
1796	andq	%rcx, %r15
1797
1798	addq	($out), %r8
1799	adcq	8($out), %r9
1800	adcq	16($out), %r10
1801	adcq	24($out), %r11
1802	adcq	32($out), %r12
1803	adcq	40($out), %r13
1804	adcq	48($out), %r14
1805	adcq	56($out), %r15
1806
1807	movq	%r8, ($out)
1808	movq	%r9, 8($out)
1809	movq	%r10, 16($out)
1810	movq	%r11, 24($out)
1811	movq	%r12, 32($out)
1812	movq	%r13, 40($out)
1813	movq	%r14, 48($out)
1814	movq	%r15, 56($out)
1815
1816	ret
1817.size	__rsaz_512_subtract,.-__rsaz_512_subtract
1818___
1819}
1820{	# __rsaz_512_mul
1821	#
1822	# input: %rsi - ap, %rbp - bp
1823	# output:
1824	# clobbers: everything
1825my ($ap,$bp) = ("%rsi","%rbp");
1826$code.=<<___;
1827.type	__rsaz_512_mul,\@abi-omnipotent
1828.align	32
1829__rsaz_512_mul:
1830	leaq	8(%rsp), %rdi
1831
1832	movq	($ap), %rax
1833	mulq	%rbx
1834	movq	%rax, (%rdi)
1835	movq	8($ap), %rax
1836	movq	%rdx, %r8
1837
1838	mulq	%rbx
1839	addq	%rax, %r8
1840	movq	16($ap), %rax
1841	movq	%rdx, %r9
1842	adcq	\$0, %r9
1843
1844	mulq	%rbx
1845	addq	%rax, %r9
1846	movq	24($ap), %rax
1847	movq	%rdx, %r10
1848	adcq	\$0, %r10
1849
1850	mulq	%rbx
1851	addq	%rax, %r10
1852	movq	32($ap), %rax
1853	movq	%rdx, %r11
1854	adcq	\$0, %r11
1855
1856	mulq	%rbx
1857	addq	%rax, %r11
1858	movq	40($ap), %rax
1859	movq	%rdx, %r12
1860	adcq	\$0, %r12
1861
1862	mulq	%rbx
1863	addq	%rax, %r12
1864	movq	48($ap), %rax
1865	movq	%rdx, %r13
1866	adcq	\$0, %r13
1867
1868	mulq	%rbx
1869	addq	%rax, %r13
1870	movq	56($ap), %rax
1871	movq	%rdx, %r14
1872	adcq	\$0, %r14
1873
1874	mulq	%rbx
1875	addq	%rax, %r14
1876	 movq	($ap), %rax
1877	movq	%rdx, %r15
1878	adcq	\$0, %r15
1879
1880	leaq	8($bp), $bp
1881	leaq	8(%rdi), %rdi
1882
1883	movl	\$7, %ecx
1884	jmp	.Loop_mul
1885
1886.align	32
1887.Loop_mul:
1888	movq	($bp), %rbx
1889	mulq	%rbx
1890	addq	%rax, %r8
1891	movq	8($ap), %rax
1892	movq	%r8, (%rdi)
1893	movq	%rdx, %r8
1894	adcq	\$0, %r8
1895
1896	mulq	%rbx
1897	addq	%rax, %r9
1898	movq	16($ap), %rax
1899	adcq	\$0, %rdx
1900	addq	%r9, %r8
1901	movq	%rdx, %r9
1902	adcq	\$0, %r9
1903
1904	mulq	%rbx
1905	addq	%rax, %r10
1906	movq	24($ap), %rax
1907	adcq	\$0, %rdx
1908	addq	%r10, %r9
1909	movq	%rdx, %r10
1910	adcq	\$0, %r10
1911
1912	mulq	%rbx
1913	addq	%rax, %r11
1914	movq	32($ap), %rax
1915	adcq	\$0, %rdx
1916	addq	%r11, %r10
1917	movq	%rdx, %r11
1918	adcq	\$0, %r11
1919
1920	mulq	%rbx
1921	addq	%rax, %r12
1922	movq	40($ap), %rax
1923	adcq	\$0, %rdx
1924	addq	%r12, %r11
1925	movq	%rdx, %r12
1926	adcq	\$0, %r12
1927
1928	mulq	%rbx
1929	addq	%rax, %r13
1930	movq	48($ap), %rax
1931	adcq	\$0, %rdx
1932	addq	%r13, %r12
1933	movq	%rdx, %r13
1934	adcq	\$0, %r13
1935
1936	mulq	%rbx
1937	addq	%rax, %r14
1938	movq	56($ap), %rax
1939	adcq	\$0, %rdx
1940	addq	%r14, %r13
1941	movq	%rdx, %r14
1942	 leaq	8($bp), $bp
1943	adcq	\$0, %r14
1944
1945	mulq	%rbx
1946	addq	%rax, %r15
1947	 movq	($ap), %rax
1948	adcq	\$0, %rdx
1949	addq	%r15, %r14
1950	movq	%rdx, %r15
1951	adcq	\$0, %r15
1952
1953	leaq	8(%rdi), %rdi
1954
1955	decl	%ecx
1956	jnz	.Loop_mul
1957
1958	movq	%r8, (%rdi)
1959	movq	%r9, 8(%rdi)
1960	movq	%r10, 16(%rdi)
1961	movq	%r11, 24(%rdi)
1962	movq	%r12, 32(%rdi)
1963	movq	%r13, 40(%rdi)
1964	movq	%r14, 48(%rdi)
1965	movq	%r15, 56(%rdi)
1966
1967	ret
1968.size	__rsaz_512_mul,.-__rsaz_512_mul
1969___
1970}
1971if ($addx) {
1972	# __rsaz_512_mulx
1973	#
1974	# input: %rsi - ap, %rbp - bp
1975	# output:
1976	# clobbers: everything
1977my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1978$code.=<<___;
1979.type	__rsaz_512_mulx,\@abi-omnipotent
1980.align	32
1981__rsaz_512_mulx:
1982	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
1983	mov	\$-6, %rcx
1984
1985	mulx	8($ap), %rax, %r9
1986	movq	%rbx, 8(%rsp)
1987
1988	mulx	16($ap), %rbx, %r10
1989	adc	%rax, %r8
1990
1991	mulx	24($ap), %rax, %r11
1992	adc	%rbx, %r9
1993
1994	mulx	32($ap), %rbx, %r12
1995	adc	%rax, %r10
1996
1997	mulx	40($ap), %rax, %r13
1998	adc	%rbx, %r11
1999
2000	mulx	48($ap), %rbx, %r14
2001	adc	%rax, %r12
2002
2003	mulx	56($ap), %rax, %r15
2004	 mov	8($bp), %rdx
2005	adc	%rbx, %r13
2006	adc	%rax, %r14
2007	adc	\$0, %r15
2008
2009	xor	$zero, $zero		# cf=0,of=0
2010	jmp	.Loop_mulx
2011
2012.align	32
2013.Loop_mulx:
2014	movq	%r8, %rbx
2015	mulx	($ap), %rax, %r8
2016	adcx	%rax, %rbx
2017	adox	%r9, %r8
2018
2019	mulx	8($ap), %rax, %r9
2020	adcx	%rax, %r8
2021	adox	%r10, %r9
2022
2023	mulx	16($ap), %rax, %r10
2024	adcx	%rax, %r9
2025	adox	%r11, %r10
2026
2027	mulx	24($ap), %rax, %r11
2028	adcx	%rax, %r10
2029	adox	%r12, %r11
2030
2031	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
2032	adcx	%rax, %r11
2033	adox	%r13, %r12
2034
2035	mulx	40($ap), %rax, %r13
2036	adcx	%rax, %r12
2037	adox	%r14, %r13
2038
2039	mulx	48($ap), %rax, %r14
2040	adcx	%rax, %r13
2041	adox	%r15, %r14
2042
2043	mulx	56($ap), %rax, %r15
2044	 movq	64($bp,%rcx,8), %rdx
2045	 movq	%rbx, 8+64-8(%rsp,%rcx,8)
2046	adcx	%rax, %r14
2047	adox	$zero, %r15
2048	adcx	$zero, %r15		# cf=0
2049
2050	inc	%rcx			# of=0
2051	jnz	.Loop_mulx
2052
2053	movq	%r8, %rbx
2054	mulx	($ap), %rax, %r8
2055	adcx	%rax, %rbx
2056	adox	%r9, %r8
2057
2058	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
2059	adcx	%rax, %r8
2060	adox	%r10, %r9
2061
2062	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
2063	adcx	%rax, %r9
2064	adox	%r11, %r10
2065
2066	mulx	24($ap), %rax, %r11
2067	adcx	%rax, %r10
2068	adox	%r12, %r11
2069
2070	mulx	32($ap), %rax, %r12
2071	adcx	%rax, %r11
2072	adox	%r13, %r12
2073
2074	mulx	40($ap), %rax, %r13
2075	adcx	%rax, %r12
2076	adox	%r14, %r13
2077
2078	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
2079	adcx	%rax, %r13
2080	adox	%r15, %r14
2081
2082	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
2083	adcx	%rax, %r14
2084	adox	$zero, %r15
2085	adcx	$zero, %r15
2086
2087	mov	%rbx, 8+64-8(%rsp)
2088	mov	%r8, 8+64(%rsp)
2089	mov	%r9, 8+64+8(%rsp)
2090	mov	%r10, 8+64+16(%rsp)
2091	mov	%r11, 8+64+24(%rsp)
2092	mov	%r12, 8+64+32(%rsp)
2093	mov	%r13, 8+64+40(%rsp)
2094	mov	%r14, 8+64+48(%rsp)
2095	mov	%r15, 8+64+56(%rsp)
2096
2097	ret
2098.size	__rsaz_512_mulx,.-__rsaz_512_mulx
2099___
2100}
2101{
2102my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
2103$code.=<<___;
2104.globl	rsaz_512_scatter4
2105.type	rsaz_512_scatter4,\@abi-omnipotent
2106.align	16
2107rsaz_512_scatter4:
2108	leaq	($out,$power,8), $out
2109	movl	\$8, %r9d
2110	jmp	.Loop_scatter
2111.align	16
2112.Loop_scatter:
2113	movq	($inp), %rax
2114	leaq	8($inp), $inp
2115	movq	%rax, ($out)
2116	leaq	128($out), $out
2117	decl	%r9d
2118	jnz	.Loop_scatter
2119	ret
2120.size	rsaz_512_scatter4,.-rsaz_512_scatter4
2121
2122.globl	rsaz_512_gather4
2123.type	rsaz_512_gather4,\@abi-omnipotent
2124.align	16
2125rsaz_512_gather4:
2126___
2127$code.=<<___	if ($win64);
2128.LSEH_begin_rsaz_512_gather4:
2129	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub    $0xa8,%rsp
2130	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
2131	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
2132	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
2133	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
2134	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
2135	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
2136	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
2137	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
2138	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
2139	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
2140___
2141$code.=<<___;
2142	movd	$power,%xmm8
2143	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
2144	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
2145
2146	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
2147	movdqa	%xmm1,%xmm7
2148	movdqa	%xmm1,%xmm2
2149___
2150########################################################################
2151# calculate mask by comparing 0..15 to $power
2152#
2153for($i=0;$i<4;$i++) {
2154$code.=<<___;
2155	paddd	%xmm`$i`,%xmm`$i+1`
2156	pcmpeqd	%xmm8,%xmm`$i`
2157	movdqa	%xmm7,%xmm`$i+3`
2158___
2159}
2160for(;$i<7;$i++) {
2161$code.=<<___;
2162	paddd	%xmm`$i`,%xmm`$i+1`
2163	pcmpeqd	%xmm8,%xmm`$i`
2164___
2165}
2166$code.=<<___;
2167	pcmpeqd	%xmm8,%xmm7
2168	movl	\$8, %r9d
2169	jmp	.Loop_gather
2170.align	16
2171.Loop_gather:
2172	movdqa	16*0($inp),%xmm8
2173	movdqa	16*1($inp),%xmm9
2174	movdqa	16*2($inp),%xmm10
2175	movdqa	16*3($inp),%xmm11
2176	pand	%xmm0,%xmm8
2177	movdqa	16*4($inp),%xmm12
2178	pand	%xmm1,%xmm9
2179	movdqa	16*5($inp),%xmm13
2180	pand	%xmm2,%xmm10
2181	movdqa	16*6($inp),%xmm14
2182	pand	%xmm3,%xmm11
2183	movdqa	16*7($inp),%xmm15
2184	leaq	128($inp), $inp
2185	pand	%xmm4,%xmm12
2186	pand	%xmm5,%xmm13
2187	pand	%xmm6,%xmm14
2188	pand	%xmm7,%xmm15
2189	por	%xmm10,%xmm8
2190	por	%xmm11,%xmm9
2191	por	%xmm12,%xmm8
2192	por	%xmm13,%xmm9
2193	por	%xmm14,%xmm8
2194	por	%xmm15,%xmm9
2195
2196	por	%xmm9,%xmm8
2197	pshufd	\$0x4e,%xmm8,%xmm9
2198	por	%xmm9,%xmm8
2199	movq	%xmm8,($out)
2200	leaq	8($out), $out
2201	decl	%r9d
2202	jnz	.Loop_gather
2203___
2204$code.=<<___	if ($win64);
2205	movaps	0x00(%rsp),%xmm6
2206	movaps	0x10(%rsp),%xmm7
2207	movaps	0x20(%rsp),%xmm8
2208	movaps	0x30(%rsp),%xmm9
2209	movaps	0x40(%rsp),%xmm10
2210	movaps	0x50(%rsp),%xmm11
2211	movaps	0x60(%rsp),%xmm12
2212	movaps	0x70(%rsp),%xmm13
2213	movaps	0x80(%rsp),%xmm14
2214	movaps	0x90(%rsp),%xmm15
2215	add	\$0xa8,%rsp
2216___
2217$code.=<<___;
2218	ret
2219.LSEH_end_rsaz_512_gather4:
2220.size	rsaz_512_gather4,.-rsaz_512_gather4
2221
2222.align	64
2223.Linc:
2224	.long	0,0, 1,1
2225	.long	2,2, 2,2
2226___
2227}
2228
2229# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2230#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2231if ($win64) {
2232$rec="%rcx";
2233$frame="%rdx";
2234$context="%r8";
2235$disp="%r9";
2236
2237$code.=<<___;
2238.extern	__imp_RtlVirtualUnwind
2239.type	se_handler,\@abi-omnipotent
2240.align	16
2241se_handler:
2242	push	%rsi
2243	push	%rdi
2244	push	%rbx
2245	push	%rbp
2246	push	%r12
2247	push	%r13
2248	push	%r14
2249	push	%r15
2250	pushfq
2251	sub	\$64,%rsp
2252
2253	mov	120($context),%rax	# pull context->Rax
2254	mov	248($context),%rbx	# pull context->Rip
2255
2256	mov	8($disp),%rsi		# disp->ImageBase
2257	mov	56($disp),%r11		# disp->HandlerData
2258
2259	mov	0(%r11),%r10d		# HandlerData[0]
2260	lea	(%rsi,%r10),%r10	# end of prologue label
2261	cmp	%r10,%rbx		# context->Rip<end of prologue label
2262	jb	.Lcommon_seh_tail
2263
2264	mov	152($context),%rax	# pull context->Rsp
2265
2266	mov	4(%r11),%r10d		# HandlerData[1]
2267	lea	(%rsi,%r10),%r10	# epilogue label
2268	cmp	%r10,%rbx		# context->Rip>=epilogue label
2269	jae	.Lcommon_seh_tail
2270
2271	lea	128+24+48(%rax),%rax
2272
2273	lea	.Lmul_gather4_epilogue(%rip),%rbx
2274	cmp	%r10,%rbx
2275	jne	.Lse_not_in_mul_gather4
2276
2277	lea	0xb0(%rax),%rax
2278
2279	lea	-48-0xa8(%rax),%rsi
2280	lea	512($context),%rdi
2281	mov	\$20,%ecx
2282	.long	0xa548f3fc		# cld; rep movsq
2283
2284.Lse_not_in_mul_gather4:
2285	mov	-8(%rax),%rbx
2286	mov	-16(%rax),%rbp
2287	mov	-24(%rax),%r12
2288	mov	-32(%rax),%r13
2289	mov	-40(%rax),%r14
2290	mov	-48(%rax),%r15
2291	mov	%rbx,144($context)	# restore context->Rbx
2292	mov	%rbp,160($context)	# restore context->Rbp
2293	mov	%r12,216($context)	# restore context->R12
2294	mov	%r13,224($context)	# restore context->R13
2295	mov	%r14,232($context)	# restore context->R14
2296	mov	%r15,240($context)	# restore context->R15
2297
2298.Lcommon_seh_tail:
2299	mov	8(%rax),%rdi
2300	mov	16(%rax),%rsi
2301	mov	%rax,152($context)	# restore context->Rsp
2302	mov	%rsi,168($context)	# restore context->Rsi
2303	mov	%rdi,176($context)	# restore context->Rdi
2304
2305	mov	40($disp),%rdi		# disp->ContextRecord
2306	mov	$context,%rsi		# context
2307	mov	\$154,%ecx		# sizeof(CONTEXT)
2308	.long	0xa548f3fc		# cld; rep movsq
2309
2310	mov	$disp,%rsi
2311	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2312	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2313	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2314	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2315	mov	40(%rsi),%r10		# disp->ContextRecord
2316	lea	56(%rsi),%r11		# &disp->HandlerData
2317	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2318	mov	%r10,32(%rsp)		# arg5
2319	mov	%r11,40(%rsp)		# arg6
2320	mov	%r12,48(%rsp)		# arg7
2321	mov	%rcx,56(%rsp)		# arg8, (NULL)
2322	call	*__imp_RtlVirtualUnwind(%rip)
2323
2324	mov	\$1,%eax		# ExceptionContinueSearch
2325	add	\$64,%rsp
2326	popfq
2327	pop	%r15
2328	pop	%r14
2329	pop	%r13
2330	pop	%r12
2331	pop	%rbp
2332	pop	%rbx
2333	pop	%rdi
2334	pop	%rsi
2335	ret
2336.size	se_handler,.-se_handler
2337
2338.section	.pdata
2339.align	4
2340	.rva	.LSEH_begin_rsaz_512_sqr
2341	.rva	.LSEH_end_rsaz_512_sqr
2342	.rva	.LSEH_info_rsaz_512_sqr
2343
2344	.rva	.LSEH_begin_rsaz_512_mul
2345	.rva	.LSEH_end_rsaz_512_mul
2346	.rva	.LSEH_info_rsaz_512_mul
2347
2348	.rva	.LSEH_begin_rsaz_512_mul_gather4
2349	.rva	.LSEH_end_rsaz_512_mul_gather4
2350	.rva	.LSEH_info_rsaz_512_mul_gather4
2351
2352	.rva	.LSEH_begin_rsaz_512_mul_scatter4
2353	.rva	.LSEH_end_rsaz_512_mul_scatter4
2354	.rva	.LSEH_info_rsaz_512_mul_scatter4
2355
2356	.rva	.LSEH_begin_rsaz_512_mul_by_one
2357	.rva	.LSEH_end_rsaz_512_mul_by_one
2358	.rva	.LSEH_info_rsaz_512_mul_by_one
2359
2360	.rva	.LSEH_begin_rsaz_512_gather4
2361	.rva	.LSEH_end_rsaz_512_gather4
2362	.rva	.LSEH_info_rsaz_512_gather4
2363
2364.section	.xdata
2365.align	8
2366.LSEH_info_rsaz_512_sqr:
2367	.byte	9,0,0,0
2368	.rva	se_handler
2369	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
2370.LSEH_info_rsaz_512_mul:
2371	.byte	9,0,0,0
2372	.rva	se_handler
2373	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
2374.LSEH_info_rsaz_512_mul_gather4:
2375	.byte	9,0,0,0
2376	.rva	se_handler
2377	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
2378.LSEH_info_rsaz_512_mul_scatter4:
2379	.byte	9,0,0,0
2380	.rva	se_handler
2381	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
2382.LSEH_info_rsaz_512_mul_by_one:
2383	.byte	9,0,0,0
2384	.rva	se_handler
2385	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
2386.LSEH_info_rsaz_512_gather4:
2387	.byte	0x01,0x46,0x16,0x00
2388	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
2389	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
2390	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
2391	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
2392	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
2393	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
2394	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
2395	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
2396	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
2397	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
2398	.byte	0x07,0x01,0x15,0x00	# sub     rsp,0xa8
2399___
2400}
2401
2402$code =~ s/\`([^\`]*)\`/eval $1/gem;
2403print $code;
2404close STDOUT;
2405