#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
#
# References:
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular
#     Exponentiation,  Using Advanced Vector Instructions Architectures",
#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
# [2] S. Gueron: "Efficient Software Implementations of Modular
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE
#     Proceedings of 9th International Conference on Information Technology:
#     New Generations (ITNG 2012), pp.821-823 (2012)
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 1024-bit modular exponentiation, for optimizing RSA2048
#     on AVX2 capable x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
#
# +13% improvement over original submission by <appro@openssl.org>
#
# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
# 2.3GHz Haswell	621		765/+23%	1113/+79%
# 2.3GHz Broadwell(**)	688		1200(***)/+74%	1120/+63%
#
# (*)	if system doesn't support AVX2, for reference purposes;
# (**)	scaled to 2.3GHz to simplify comparison;
# (***)	scalar AD*X code is faster than AVX2 and is preferred code
#	path for Broadwell;

# Command-line handling.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# A Win64 target is recognised either by its flavour (nasm, masm, mingw64)
# or by an output file name ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: first next to this script, then
# in ../../perlasm/ relative to it.  The directory part of $0 is taken
# only when the match succeeds; a failed match does not reset $1, so
# reading it unconditionally could pick up a stale capture.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : undef;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Assembler capability probes.  Each probe sets $avx to the number of
# version thresholds met (0, 1 or 2) and $addx to a boolean.  The code
# generated below is only emitted when $avx>1.  The later probes are
# consulted only while $avx is still false.

# GNU assembler, queried through the compiler driver named by $ENV{CC}.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
	$addx = ($1>=2.23);
}

# NASM, Win64 builds only.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
	$addx = ($1>=2.10);
}

# Microsoft ml64, Win64 builds only; only the major version is examined.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
	$addx = ($1>=11);
}

# clang/LLVM.  $1 is the matched banner text, so the major and minor
# version numbers are in $2 and $3.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$avx = ($ver>=3.0) + ($ver>=3.01);
	$addx = ($ver>=3.03);
}

# Pipe everything this script prints through the translator, which is
# invoked with the flavour and the output file name.  STDOUT is aliased
# to the pipe so that later prints to STDOUT reach it.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT = *OUT;
797bded2dbSJung-uk Kim
807bded2dbSJung-uk Kimif ($avx>1) {{{
817bded2dbSJung-uk Kim{ # void AMS_WW(
827bded2dbSJung-uk Kimmy $rp="%rdi";	# BN_ULONG *rp,
837bded2dbSJung-uk Kimmy $ap="%rsi";	# const BN_ULONG *ap,
847bded2dbSJung-uk Kimmy $np="%rdx";	# const BN_ULONG *np,
857bded2dbSJung-uk Kimmy $n0="%ecx";	# const BN_ULONG n0,
867bded2dbSJung-uk Kimmy $rep="%r8d";	# int repeat);
877bded2dbSJung-uk Kim
887bded2dbSJung-uk Kim# The registers that hold the accumulated redundant result
897bded2dbSJung-uk Kim# The AMM works on 1024 bit operands, and redundant word size is 29
907bded2dbSJung-uk Kim# Therefore: ceil(1024/29)/4 = 9
917bded2dbSJung-uk Kimmy $ACC0="%ymm0";
927bded2dbSJung-uk Kimmy $ACC1="%ymm1";
937bded2dbSJung-uk Kimmy $ACC2="%ymm2";
947bded2dbSJung-uk Kimmy $ACC3="%ymm3";
957bded2dbSJung-uk Kimmy $ACC4="%ymm4";
967bded2dbSJung-uk Kimmy $ACC5="%ymm5";
977bded2dbSJung-uk Kimmy $ACC6="%ymm6";
987bded2dbSJung-uk Kimmy $ACC7="%ymm7";
997bded2dbSJung-uk Kimmy $ACC8="%ymm8";
1007bded2dbSJung-uk Kimmy $ACC9="%ymm9";
1017bded2dbSJung-uk Kim# Registers that hold the broadcasted words of bp, currently used
1027bded2dbSJung-uk Kimmy $B1="%ymm10";
1037bded2dbSJung-uk Kimmy $B2="%ymm11";
1047bded2dbSJung-uk Kim# Registers that hold the broadcasted words of Y, currently used
1057bded2dbSJung-uk Kimmy $Y1="%ymm12";
1067bded2dbSJung-uk Kimmy $Y2="%ymm13";
1077bded2dbSJung-uk Kim# Helper registers
1087bded2dbSJung-uk Kimmy $TEMP1="%ymm14";
1097bded2dbSJung-uk Kimmy $AND_MASK="%ymm15";
1107bded2dbSJung-uk Kim# alu registers that hold the first words of the ACC
1117bded2dbSJung-uk Kimmy $r0="%r9";
1127bded2dbSJung-uk Kimmy $r1="%r10";
1137bded2dbSJung-uk Kimmy $r2="%r11";
1147bded2dbSJung-uk Kimmy $r3="%r12";
1157bded2dbSJung-uk Kim
1167bded2dbSJung-uk Kimmy $i="%r14d";			# loop counter
1177bded2dbSJung-uk Kimmy $tmp = "%r15";
1187bded2dbSJung-uk Kim
1197bded2dbSJung-uk Kimmy $FrameSize=32*18+32*8;	# place for A^2 and 2*A
1207bded2dbSJung-uk Kim
1217bded2dbSJung-uk Kimmy $aap=$r0;
1227bded2dbSJung-uk Kimmy $tp0="%rbx";
1237bded2dbSJung-uk Kimmy $tp1=$r3;
1247bded2dbSJung-uk Kimmy $tpa=$tmp;
1257bded2dbSJung-uk Kim
1267bded2dbSJung-uk Kim$np="%r13";			# reassigned argument
1277bded2dbSJung-uk Kim
1287bded2dbSJung-uk Kim$code.=<<___;
1297bded2dbSJung-uk Kim.text
1307bded2dbSJung-uk Kim
1317bded2dbSJung-uk Kim.globl	rsaz_1024_sqr_avx2
1327bded2dbSJung-uk Kim.type	rsaz_1024_sqr_avx2,\@function,5
1337bded2dbSJung-uk Kim.align	64
1347bded2dbSJung-uk Kimrsaz_1024_sqr_avx2:		# 702 cycles, 14% faster than rsaz_1024_mul_avx2
135e71b7053SJung-uk Kim.cfi_startproc
1367bded2dbSJung-uk Kim	lea	(%rsp), %rax
137e71b7053SJung-uk Kim.cfi_def_cfa_register	%rax
1387bded2dbSJung-uk Kim	push	%rbx
139e71b7053SJung-uk Kim.cfi_push	%rbx
1407bded2dbSJung-uk Kim	push	%rbp
141e71b7053SJung-uk Kim.cfi_push	%rbp
1427bded2dbSJung-uk Kim	push	%r12
143e71b7053SJung-uk Kim.cfi_push	%r12
1447bded2dbSJung-uk Kim	push	%r13
145e71b7053SJung-uk Kim.cfi_push	%r13
1467bded2dbSJung-uk Kim	push	%r14
147e71b7053SJung-uk Kim.cfi_push	%r14
1487bded2dbSJung-uk Kim	push	%r15
149e71b7053SJung-uk Kim.cfi_push	%r15
1507bded2dbSJung-uk Kim	vzeroupper
1517bded2dbSJung-uk Kim___
1527bded2dbSJung-uk Kim$code.=<<___ if ($win64);
1537bded2dbSJung-uk Kim	lea	-0xa8(%rsp),%rsp
1547bded2dbSJung-uk Kim	vmovaps	%xmm6,-0xd8(%rax)
1557bded2dbSJung-uk Kim	vmovaps	%xmm7,-0xc8(%rax)
1567bded2dbSJung-uk Kim	vmovaps	%xmm8,-0xb8(%rax)
1577bded2dbSJung-uk Kim	vmovaps	%xmm9,-0xa8(%rax)
1587bded2dbSJung-uk Kim	vmovaps	%xmm10,-0x98(%rax)
1597bded2dbSJung-uk Kim	vmovaps	%xmm11,-0x88(%rax)
1607bded2dbSJung-uk Kim	vmovaps	%xmm12,-0x78(%rax)
1617bded2dbSJung-uk Kim	vmovaps	%xmm13,-0x68(%rax)
1627bded2dbSJung-uk Kim	vmovaps	%xmm14,-0x58(%rax)
1637bded2dbSJung-uk Kim	vmovaps	%xmm15,-0x48(%rax)
1647bded2dbSJung-uk Kim.Lsqr_1024_body:
1657bded2dbSJung-uk Kim___
1667bded2dbSJung-uk Kim$code.=<<___;
1677bded2dbSJung-uk Kim	mov	%rax,%rbp
168e71b7053SJung-uk Kim.cfi_def_cfa_register	%rbp
1697bded2dbSJung-uk Kim	mov	%rdx, $np			# reassigned argument
1707bded2dbSJung-uk Kim	sub	\$$FrameSize, %rsp
1717bded2dbSJung-uk Kim	mov	$np, $tmp
1727bded2dbSJung-uk Kim	sub	\$-128, $rp			# size optimization
1737bded2dbSJung-uk Kim	sub	\$-128, $ap
1747bded2dbSJung-uk Kim	sub	\$-128, $np
1757bded2dbSJung-uk Kim
1767bded2dbSJung-uk Kim	and	\$4095, $tmp			# see if $np crosses page
1777bded2dbSJung-uk Kim	add	\$32*10, $tmp
1787bded2dbSJung-uk Kim	shr	\$12, $tmp
1797bded2dbSJung-uk Kim	vpxor	$ACC9,$ACC9,$ACC9
1807bded2dbSJung-uk Kim	jz	.Lsqr_1024_no_n_copy
1817bded2dbSJung-uk Kim
1827bded2dbSJung-uk Kim	# unaligned 256-bit load that crosses page boundary can
1837bded2dbSJung-uk Kim	# cause >2x performance degradation here, so if $np does
1847bded2dbSJung-uk Kim	# cross page boundary, copy it to stack and make sure stack
1857bded2dbSJung-uk Kim	# frame doesn't...
1867bded2dbSJung-uk Kim	sub		\$32*10,%rsp
1877bded2dbSJung-uk Kim	vmovdqu		32*0-128($np), $ACC0
1887bded2dbSJung-uk Kim	and		\$-2048, %rsp
1897bded2dbSJung-uk Kim	vmovdqu		32*1-128($np), $ACC1
1907bded2dbSJung-uk Kim	vmovdqu		32*2-128($np), $ACC2
1917bded2dbSJung-uk Kim	vmovdqu		32*3-128($np), $ACC3
1927bded2dbSJung-uk Kim	vmovdqu		32*4-128($np), $ACC4
1937bded2dbSJung-uk Kim	vmovdqu		32*5-128($np), $ACC5
1947bded2dbSJung-uk Kim	vmovdqu		32*6-128($np), $ACC6
1957bded2dbSJung-uk Kim	vmovdqu		32*7-128($np), $ACC7
1967bded2dbSJung-uk Kim	vmovdqu		32*8-128($np), $ACC8
1977bded2dbSJung-uk Kim	lea		$FrameSize+128(%rsp),$np
1987bded2dbSJung-uk Kim	vmovdqu		$ACC0, 32*0-128($np)
1997bded2dbSJung-uk Kim	vmovdqu		$ACC1, 32*1-128($np)
2007bded2dbSJung-uk Kim	vmovdqu		$ACC2, 32*2-128($np)
2017bded2dbSJung-uk Kim	vmovdqu		$ACC3, 32*3-128($np)
2027bded2dbSJung-uk Kim	vmovdqu		$ACC4, 32*4-128($np)
2037bded2dbSJung-uk Kim	vmovdqu		$ACC5, 32*5-128($np)
2047bded2dbSJung-uk Kim	vmovdqu		$ACC6, 32*6-128($np)
2057bded2dbSJung-uk Kim	vmovdqu		$ACC7, 32*7-128($np)
2067bded2dbSJung-uk Kim	vmovdqu		$ACC8, 32*8-128($np)
2077bded2dbSJung-uk Kim	vmovdqu		$ACC9, 32*9-128($np)	# $ACC9 is zero
2087bded2dbSJung-uk Kim
2097bded2dbSJung-uk Kim.Lsqr_1024_no_n_copy:
2107bded2dbSJung-uk Kim	and		\$-1024, %rsp
2117bded2dbSJung-uk Kim
2127bded2dbSJung-uk Kim	vmovdqu		32*1-128($ap), $ACC1
2137bded2dbSJung-uk Kim	vmovdqu		32*2-128($ap), $ACC2
2147bded2dbSJung-uk Kim	vmovdqu		32*3-128($ap), $ACC3
2157bded2dbSJung-uk Kim	vmovdqu		32*4-128($ap), $ACC4
2167bded2dbSJung-uk Kim	vmovdqu		32*5-128($ap), $ACC5
2177bded2dbSJung-uk Kim	vmovdqu		32*6-128($ap), $ACC6
2187bded2dbSJung-uk Kim	vmovdqu		32*7-128($ap), $ACC7
2197bded2dbSJung-uk Kim	vmovdqu		32*8-128($ap), $ACC8
2207bded2dbSJung-uk Kim
2217bded2dbSJung-uk Kim	lea	192(%rsp), $tp0			# 64+128=192
222c4ad4dffSJung-uk Kim	vmovdqu	.Land_mask(%rip), $AND_MASK
2237bded2dbSJung-uk Kim	jmp	.LOOP_GRANDE_SQR_1024
2247bded2dbSJung-uk Kim
2257bded2dbSJung-uk Kim.align	32
2267bded2dbSJung-uk Kim.LOOP_GRANDE_SQR_1024:
2277bded2dbSJung-uk Kim	lea	32*18+128(%rsp), $aap		# size optimization
2287bded2dbSJung-uk Kim	lea	448(%rsp), $tp1			# 64+128+256=448
2297bded2dbSJung-uk Kim
2307bded2dbSJung-uk Kim	# the squaring is performed as described in Variant B of
2317bded2dbSJung-uk Kim	# "Speeding up Big-Number Squaring", so start by calculating
2327bded2dbSJung-uk Kim	# the A*2=A+A vector
2337bded2dbSJung-uk Kim	vpaddq		$ACC1, $ACC1, $ACC1
2347bded2dbSJung-uk Kim	 vpbroadcastq	32*0-128($ap), $B1
2357bded2dbSJung-uk Kim	vpaddq		$ACC2, $ACC2, $ACC2
2367bded2dbSJung-uk Kim	vmovdqa		$ACC1, 32*0-128($aap)
2377bded2dbSJung-uk Kim	vpaddq		$ACC3, $ACC3, $ACC3
2387bded2dbSJung-uk Kim	vmovdqa		$ACC2, 32*1-128($aap)
2397bded2dbSJung-uk Kim	vpaddq		$ACC4, $ACC4, $ACC4
2407bded2dbSJung-uk Kim	vmovdqa		$ACC3, 32*2-128($aap)
2417bded2dbSJung-uk Kim	vpaddq		$ACC5, $ACC5, $ACC5
2427bded2dbSJung-uk Kim	vmovdqa		$ACC4, 32*3-128($aap)
2437bded2dbSJung-uk Kim	vpaddq		$ACC6, $ACC6, $ACC6
2447bded2dbSJung-uk Kim	vmovdqa		$ACC5, 32*4-128($aap)
2457bded2dbSJung-uk Kim	vpaddq		$ACC7, $ACC7, $ACC7
2467bded2dbSJung-uk Kim	vmovdqa		$ACC6, 32*5-128($aap)
2477bded2dbSJung-uk Kim	vpaddq		$ACC8, $ACC8, $ACC8
2487bded2dbSJung-uk Kim	vmovdqa		$ACC7, 32*6-128($aap)
2497bded2dbSJung-uk Kim	vpxor		$ACC9, $ACC9, $ACC9
2507bded2dbSJung-uk Kim	vmovdqa		$ACC8, 32*7-128($aap)
2517bded2dbSJung-uk Kim
2527bded2dbSJung-uk Kim	vpmuludq	32*0-128($ap), $B1, $ACC0
2537bded2dbSJung-uk Kim	 vpbroadcastq	32*1-128($ap), $B2
2547bded2dbSJung-uk Kim	 vmovdqu	$ACC9, 32*9-192($tp0)	# zero upper half
2557bded2dbSJung-uk Kim	vpmuludq	$B1, $ACC1, $ACC1
2567bded2dbSJung-uk Kim	 vmovdqu	$ACC9, 32*10-448($tp1)
2577bded2dbSJung-uk Kim	vpmuludq	$B1, $ACC2, $ACC2
2587bded2dbSJung-uk Kim	 vmovdqu	$ACC9, 32*11-448($tp1)
2597bded2dbSJung-uk Kim	vpmuludq	$B1, $ACC3, $ACC3
2607bded2dbSJung-uk Kim	 vmovdqu	$ACC9, 32*12-448($tp1)
2617bded2dbSJung-uk Kim	vpmuludq	$B1, $ACC4, $ACC4
2627bded2dbSJung-uk Kim	 vmovdqu	$ACC9, 32*13-448($tp1)
2637bded2dbSJung-uk Kim	vpmuludq	$B1, $ACC5, $ACC5
2647bded2dbSJung-uk Kim	 vmovdqu	$ACC9, 32*14-448($tp1)
2657bded2dbSJung-uk Kim	vpmuludq	$B1, $ACC6, $ACC6
2667bded2dbSJung-uk Kim	 vmovdqu	$ACC9, 32*15-448($tp1)
2677bded2dbSJung-uk Kim	vpmuludq	$B1, $ACC7, $ACC7
2687bded2dbSJung-uk Kim	 vmovdqu	$ACC9, 32*16-448($tp1)
2697bded2dbSJung-uk Kim	vpmuludq	$B1, $ACC8, $ACC8
2707bded2dbSJung-uk Kim	 vpbroadcastq	32*2-128($ap), $B1
2717bded2dbSJung-uk Kim	 vmovdqu	$ACC9, 32*17-448($tp1)
2727bded2dbSJung-uk Kim
2737bded2dbSJung-uk Kim	mov	$ap, $tpa
2747bded2dbSJung-uk Kim	mov 	\$4, $i
2757bded2dbSJung-uk Kim	jmp	.Lsqr_entry_1024
2767bded2dbSJung-uk Kim___
2777bded2dbSJung-uk Kim$TEMP0=$Y1;
2787bded2dbSJung-uk Kim$TEMP2=$Y2;
2797bded2dbSJung-uk Kim$code.=<<___;
2807bded2dbSJung-uk Kim.align	32
2817bded2dbSJung-uk Kim.LOOP_SQR_1024:
2827bded2dbSJung-uk Kim	 vpbroadcastq	32*1-128($tpa), $B2
2837bded2dbSJung-uk Kim	vpmuludq	32*0-128($ap), $B1, $ACC0
2847bded2dbSJung-uk Kim	vpaddq		32*0-192($tp0), $ACC0, $ACC0
2857bded2dbSJung-uk Kim	vpmuludq	32*0-128($aap), $B1, $ACC1
2867bded2dbSJung-uk Kim	vpaddq		32*1-192($tp0), $ACC1, $ACC1
2877bded2dbSJung-uk Kim	vpmuludq	32*1-128($aap), $B1, $ACC2
2887bded2dbSJung-uk Kim	vpaddq		32*2-192($tp0), $ACC2, $ACC2
2897bded2dbSJung-uk Kim	vpmuludq	32*2-128($aap), $B1, $ACC3
2907bded2dbSJung-uk Kim	vpaddq		32*3-192($tp0), $ACC3, $ACC3
2917bded2dbSJung-uk Kim	vpmuludq	32*3-128($aap), $B1, $ACC4
2927bded2dbSJung-uk Kim	vpaddq		32*4-192($tp0), $ACC4, $ACC4
2937bded2dbSJung-uk Kim	vpmuludq	32*4-128($aap), $B1, $ACC5
2947bded2dbSJung-uk Kim	vpaddq		32*5-192($tp0), $ACC5, $ACC5
2957bded2dbSJung-uk Kim	vpmuludq	32*5-128($aap), $B1, $ACC6
2967bded2dbSJung-uk Kim	vpaddq		32*6-192($tp0), $ACC6, $ACC6
2977bded2dbSJung-uk Kim	vpmuludq	32*6-128($aap), $B1, $ACC7
2987bded2dbSJung-uk Kim	vpaddq		32*7-192($tp0), $ACC7, $ACC7
2997bded2dbSJung-uk Kim	vpmuludq	32*7-128($aap), $B1, $ACC8
3007bded2dbSJung-uk Kim	 vpbroadcastq	32*2-128($tpa), $B1
3017bded2dbSJung-uk Kim	vpaddq		32*8-192($tp0), $ACC8, $ACC8
3027bded2dbSJung-uk Kim.Lsqr_entry_1024:
3037bded2dbSJung-uk Kim	vmovdqu		$ACC0, 32*0-192($tp0)
3047bded2dbSJung-uk Kim	vmovdqu		$ACC1, 32*1-192($tp0)
3057bded2dbSJung-uk Kim
3067bded2dbSJung-uk Kim	vpmuludq	32*1-128($ap), $B2, $TEMP0
3077bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC2, $ACC2
3087bded2dbSJung-uk Kim	vpmuludq	32*1-128($aap), $B2, $TEMP1
3097bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC3, $ACC3
3107bded2dbSJung-uk Kim	vpmuludq	32*2-128($aap), $B2, $TEMP2
3117bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC4, $ACC4
3127bded2dbSJung-uk Kim	vpmuludq	32*3-128($aap), $B2, $TEMP0
3137bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC5, $ACC5
3147bded2dbSJung-uk Kim	vpmuludq	32*4-128($aap), $B2, $TEMP1
3157bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC6, $ACC6
3167bded2dbSJung-uk Kim	vpmuludq	32*5-128($aap), $B2, $TEMP2
3177bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC7, $ACC7
3187bded2dbSJung-uk Kim	vpmuludq	32*6-128($aap), $B2, $TEMP0
3197bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC8, $ACC8
3207bded2dbSJung-uk Kim	vpmuludq	32*7-128($aap), $B2, $ACC0
3217bded2dbSJung-uk Kim	 vpbroadcastq	32*3-128($tpa), $B2
3227bded2dbSJung-uk Kim	vpaddq		32*9-192($tp0), $ACC0, $ACC0
3237bded2dbSJung-uk Kim
3247bded2dbSJung-uk Kim	vmovdqu		$ACC2, 32*2-192($tp0)
3257bded2dbSJung-uk Kim	vmovdqu		$ACC3, 32*3-192($tp0)
3267bded2dbSJung-uk Kim
3277bded2dbSJung-uk Kim	vpmuludq	32*2-128($ap), $B1, $TEMP2
3287bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC4, $ACC4
3297bded2dbSJung-uk Kim	vpmuludq	32*2-128($aap), $B1, $TEMP0
3307bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC5, $ACC5
3317bded2dbSJung-uk Kim	vpmuludq	32*3-128($aap), $B1, $TEMP1
3327bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC6, $ACC6
3337bded2dbSJung-uk Kim	vpmuludq	32*4-128($aap), $B1, $TEMP2
3347bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC7, $ACC7
3357bded2dbSJung-uk Kim	vpmuludq	32*5-128($aap), $B1, $TEMP0
3367bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC8, $ACC8
3377bded2dbSJung-uk Kim	vpmuludq	32*6-128($aap), $B1, $TEMP1
3387bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC0, $ACC0
3397bded2dbSJung-uk Kim	vpmuludq	32*7-128($aap), $B1, $ACC1
3407bded2dbSJung-uk Kim	 vpbroadcastq	32*4-128($tpa), $B1
3417bded2dbSJung-uk Kim	vpaddq		32*10-448($tp1), $ACC1, $ACC1
3427bded2dbSJung-uk Kim
3437bded2dbSJung-uk Kim	vmovdqu		$ACC4, 32*4-192($tp0)
3447bded2dbSJung-uk Kim	vmovdqu		$ACC5, 32*5-192($tp0)
3457bded2dbSJung-uk Kim
3467bded2dbSJung-uk Kim	vpmuludq	32*3-128($ap), $B2, $TEMP0
3477bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC6, $ACC6
3487bded2dbSJung-uk Kim	vpmuludq	32*3-128($aap), $B2, $TEMP1
3497bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC7, $ACC7
3507bded2dbSJung-uk Kim	vpmuludq	32*4-128($aap), $B2, $TEMP2
3517bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC8, $ACC8
3527bded2dbSJung-uk Kim	vpmuludq	32*5-128($aap), $B2, $TEMP0
3537bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC0, $ACC0
3547bded2dbSJung-uk Kim	vpmuludq	32*6-128($aap), $B2, $TEMP1
3557bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC1, $ACC1
3567bded2dbSJung-uk Kim	vpmuludq	32*7-128($aap), $B2, $ACC2
3577bded2dbSJung-uk Kim	 vpbroadcastq	32*5-128($tpa), $B2
3587bded2dbSJung-uk Kim	vpaddq		32*11-448($tp1), $ACC2, $ACC2
3597bded2dbSJung-uk Kim
3607bded2dbSJung-uk Kim	vmovdqu		$ACC6, 32*6-192($tp0)
3617bded2dbSJung-uk Kim	vmovdqu		$ACC7, 32*7-192($tp0)
3627bded2dbSJung-uk Kim
3637bded2dbSJung-uk Kim	vpmuludq	32*4-128($ap), $B1, $TEMP0
3647bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC8, $ACC8
3657bded2dbSJung-uk Kim	vpmuludq	32*4-128($aap), $B1, $TEMP1
3667bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC0, $ACC0
3677bded2dbSJung-uk Kim	vpmuludq	32*5-128($aap), $B1, $TEMP2
3687bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC1, $ACC1
3697bded2dbSJung-uk Kim	vpmuludq	32*6-128($aap), $B1, $TEMP0
3707bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC2, $ACC2
3717bded2dbSJung-uk Kim	vpmuludq	32*7-128($aap), $B1, $ACC3
3727bded2dbSJung-uk Kim	 vpbroadcastq	32*6-128($tpa), $B1
3737bded2dbSJung-uk Kim	vpaddq		32*12-448($tp1), $ACC3, $ACC3
3747bded2dbSJung-uk Kim
3757bded2dbSJung-uk Kim	vmovdqu		$ACC8, 32*8-192($tp0)
3767bded2dbSJung-uk Kim	vmovdqu		$ACC0, 32*9-192($tp0)
3777bded2dbSJung-uk Kim	lea		8($tp0), $tp0
3787bded2dbSJung-uk Kim
3797bded2dbSJung-uk Kim	vpmuludq	32*5-128($ap), $B2, $TEMP2
3807bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC1, $ACC1
3817bded2dbSJung-uk Kim	vpmuludq	32*5-128($aap), $B2, $TEMP0
3827bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC2, $ACC2
3837bded2dbSJung-uk Kim	vpmuludq	32*6-128($aap), $B2, $TEMP1
3847bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC3, $ACC3
3857bded2dbSJung-uk Kim	vpmuludq	32*7-128($aap), $B2, $ACC4
3867bded2dbSJung-uk Kim	 vpbroadcastq	32*7-128($tpa), $B2
3877bded2dbSJung-uk Kim	vpaddq		32*13-448($tp1), $ACC4, $ACC4
3887bded2dbSJung-uk Kim
3897bded2dbSJung-uk Kim	vmovdqu		$ACC1, 32*10-448($tp1)
3907bded2dbSJung-uk Kim	vmovdqu		$ACC2, 32*11-448($tp1)
3917bded2dbSJung-uk Kim
3927bded2dbSJung-uk Kim	vpmuludq	32*6-128($ap), $B1, $TEMP0
3937bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC3, $ACC3
3947bded2dbSJung-uk Kim	vpmuludq	32*6-128($aap), $B1, $TEMP1
3957bded2dbSJung-uk Kim	 vpbroadcastq	32*8-128($tpa), $ACC0		# borrow $ACC0 for $B1
3967bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC4, $ACC4
3977bded2dbSJung-uk Kim	vpmuludq	32*7-128($aap), $B1, $ACC5
3987bded2dbSJung-uk Kim	 vpbroadcastq	32*0+8-128($tpa), $B1		# for next iteration
3997bded2dbSJung-uk Kim	vpaddq		32*14-448($tp1), $ACC5, $ACC5
4007bded2dbSJung-uk Kim
4017bded2dbSJung-uk Kim	vmovdqu		$ACC3, 32*12-448($tp1)
4027bded2dbSJung-uk Kim	vmovdqu		$ACC4, 32*13-448($tp1)
4037bded2dbSJung-uk Kim	lea		8($tpa), $tpa
4047bded2dbSJung-uk Kim
4057bded2dbSJung-uk Kim	vpmuludq	32*7-128($ap), $B2, $TEMP0
4067bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC5, $ACC5
4077bded2dbSJung-uk Kim	vpmuludq	32*7-128($aap), $B2, $ACC6
4087bded2dbSJung-uk Kim	vpaddq		32*15-448($tp1), $ACC6, $ACC6
4097bded2dbSJung-uk Kim
4107bded2dbSJung-uk Kim	vpmuludq	32*8-128($ap), $ACC0, $ACC7
4117bded2dbSJung-uk Kim	vmovdqu		$ACC5, 32*14-448($tp1)
4127bded2dbSJung-uk Kim	vpaddq		32*16-448($tp1), $ACC7, $ACC7
4137bded2dbSJung-uk Kim	vmovdqu		$ACC6, 32*15-448($tp1)
4147bded2dbSJung-uk Kim	vmovdqu		$ACC7, 32*16-448($tp1)
4157bded2dbSJung-uk Kim	lea		8($tp1), $tp1
4167bded2dbSJung-uk Kim
4177bded2dbSJung-uk Kim	dec	$i
4187bded2dbSJung-uk Kim	jnz	.LOOP_SQR_1024
4197bded2dbSJung-uk Kim___
4207bded2dbSJung-uk Kim$ZERO = $ACC9;
4217bded2dbSJung-uk Kim$TEMP0 = $B1;
4227bded2dbSJung-uk Kim$TEMP2 = $B2;
4237bded2dbSJung-uk Kim$TEMP3 = $Y1;
4247bded2dbSJung-uk Kim$TEMP4 = $Y2;
4257bded2dbSJung-uk Kim$code.=<<___;
4264c6a0400SJung-uk Kim	# we need to fix indices 32-39 to avoid overflow
4277bded2dbSJung-uk Kim	vmovdqu		32*8(%rsp), $ACC8		# 32*8-192($tp0),
4287bded2dbSJung-uk Kim	vmovdqu		32*9(%rsp), $ACC1		# 32*9-192($tp0)
4297bded2dbSJung-uk Kim	vmovdqu		32*10(%rsp), $ACC2		# 32*10-192($tp0)
4307bded2dbSJung-uk Kim	lea		192(%rsp), $tp0			# 64+128=192
4317bded2dbSJung-uk Kim
4327bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC8, $TEMP1
4337bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC8, $ACC8
4347bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC1, $TEMP2
4357bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC1, $ACC1
4367bded2dbSJung-uk Kim
4377bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP1, $TEMP1
4387bded2dbSJung-uk Kim	vpxor		$ZERO, $ZERO, $ZERO
4397bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP2, $TEMP2
4407bded2dbSJung-uk Kim
4417bded2dbSJung-uk Kim	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
4427bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
4437bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC8, $ACC8
4447bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP2, $ZERO, $TEMP2
4457bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC1, $ACC1
4467bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC2, $ACC2
4477bded2dbSJung-uk Kim	vmovdqu		$ACC1, 32*9-192($tp0)
4487bded2dbSJung-uk Kim	vmovdqu		$ACC2, 32*10-192($tp0)
4497bded2dbSJung-uk Kim
4507bded2dbSJung-uk Kim	mov	(%rsp), %rax
4517bded2dbSJung-uk Kim	mov	8(%rsp), $r1
4527bded2dbSJung-uk Kim	mov	16(%rsp), $r2
4537bded2dbSJung-uk Kim	mov	24(%rsp), $r3
4547bded2dbSJung-uk Kim	vmovdqu	32*1(%rsp), $ACC1
4557bded2dbSJung-uk Kim	vmovdqu	32*2-192($tp0), $ACC2
4567bded2dbSJung-uk Kim	vmovdqu	32*3-192($tp0), $ACC3
4577bded2dbSJung-uk Kim	vmovdqu	32*4-192($tp0), $ACC4
4587bded2dbSJung-uk Kim	vmovdqu	32*5-192($tp0), $ACC5
4597bded2dbSJung-uk Kim	vmovdqu	32*6-192($tp0), $ACC6
4607bded2dbSJung-uk Kim	vmovdqu	32*7-192($tp0), $ACC7
4617bded2dbSJung-uk Kim
4627bded2dbSJung-uk Kim	mov	%rax, $r0
4637bded2dbSJung-uk Kim	imull	$n0, %eax
4647bded2dbSJung-uk Kim	and	\$0x1fffffff, %eax
4657bded2dbSJung-uk Kim	vmovd	%eax, $Y1
4667bded2dbSJung-uk Kim
4677bded2dbSJung-uk Kim	mov	%rax, %rdx
4687bded2dbSJung-uk Kim	imulq	-128($np), %rax
4697bded2dbSJung-uk Kim	 vpbroadcastq	$Y1, $Y1
4707bded2dbSJung-uk Kim	add	%rax, $r0
4717bded2dbSJung-uk Kim	mov	%rdx, %rax
4727bded2dbSJung-uk Kim	imulq	8-128($np), %rax
4737bded2dbSJung-uk Kim	shr	\$29, $r0
4747bded2dbSJung-uk Kim	add	%rax, $r1
4757bded2dbSJung-uk Kim	mov	%rdx, %rax
4767bded2dbSJung-uk Kim	imulq	16-128($np), %rax
4777bded2dbSJung-uk Kim	add	$r0, $r1
4787bded2dbSJung-uk Kim	add	%rax, $r2
4797bded2dbSJung-uk Kim	imulq	24-128($np), %rdx
4807bded2dbSJung-uk Kim	add	%rdx, $r3
4817bded2dbSJung-uk Kim
4827bded2dbSJung-uk Kim	mov	$r1, %rax
4837bded2dbSJung-uk Kim	imull	$n0, %eax
4847bded2dbSJung-uk Kim	and	\$0x1fffffff, %eax
4857bded2dbSJung-uk Kim
4867bded2dbSJung-uk Kim	mov \$9, $i
4877bded2dbSJung-uk Kim	jmp .LOOP_REDUCE_1024
4887bded2dbSJung-uk Kim
4897bded2dbSJung-uk Kim.align	32
4907bded2dbSJung-uk Kim.LOOP_REDUCE_1024:
4917bded2dbSJung-uk Kim	vmovd	%eax, $Y2
4927bded2dbSJung-uk Kim	vpbroadcastq	$Y2, $Y2
4937bded2dbSJung-uk Kim
4947bded2dbSJung-uk Kim	vpmuludq	32*1-128($np), $Y1, $TEMP0
4957bded2dbSJung-uk Kim	 mov	%rax, %rdx
4967bded2dbSJung-uk Kim	 imulq	-128($np), %rax
4977bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC1, $ACC1
4987bded2dbSJung-uk Kim	 add	%rax, $r1
4997bded2dbSJung-uk Kim	vpmuludq	32*2-128($np), $Y1, $TEMP1
5007bded2dbSJung-uk Kim	 mov	%rdx, %rax
5017bded2dbSJung-uk Kim	 imulq	8-128($np), %rax
5027bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC2, $ACC2
5037bded2dbSJung-uk Kim	vpmuludq	32*3-128($np), $Y1, $TEMP2
5047bded2dbSJung-uk Kim	 .byte	0x67
5057bded2dbSJung-uk Kim	 add	%rax, $r2
5067bded2dbSJung-uk Kim	 .byte	0x67
5077bded2dbSJung-uk Kim	 mov	%rdx, %rax
5087bded2dbSJung-uk Kim	 imulq	16-128($np), %rax
5097bded2dbSJung-uk Kim	 shr	\$29, $r1
5107bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC3, $ACC3
5117bded2dbSJung-uk Kim	vpmuludq	32*4-128($np), $Y1, $TEMP0
5127bded2dbSJung-uk Kim	 add	%rax, $r3
5137bded2dbSJung-uk Kim	 add	$r1, $r2
5147bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC4, $ACC4
5157bded2dbSJung-uk Kim	vpmuludq	32*5-128($np), $Y1, $TEMP1
5167bded2dbSJung-uk Kim	 mov	$r2, %rax
5177bded2dbSJung-uk Kim	 imull	$n0, %eax
5187bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC5, $ACC5
5197bded2dbSJung-uk Kim	vpmuludq	32*6-128($np), $Y1, $TEMP2
5207bded2dbSJung-uk Kim	 and	\$0x1fffffff, %eax
5217bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC6, $ACC6
5227bded2dbSJung-uk Kim	vpmuludq	32*7-128($np), $Y1, $TEMP0
5237bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC7, $ACC7
5247bded2dbSJung-uk Kim	vpmuludq	32*8-128($np), $Y1, $TEMP1
5257bded2dbSJung-uk Kim	 vmovd	%eax, $Y1
5267bded2dbSJung-uk Kim	 #vmovdqu	32*1-8-128($np), $TEMP2		# moved below
5277bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC8, $ACC8
5287bded2dbSJung-uk Kim	 #vmovdqu	32*2-8-128($np), $TEMP0		# moved below
5297bded2dbSJung-uk Kim	 vpbroadcastq	$Y1, $Y1
5307bded2dbSJung-uk Kim
5317bded2dbSJung-uk Kim	vpmuludq	32*1-8-128($np), $Y2, $TEMP2	# see above
5327bded2dbSJung-uk Kim	vmovdqu		32*3-8-128($np), $TEMP1
5337bded2dbSJung-uk Kim	 mov	%rax, %rdx
5347bded2dbSJung-uk Kim	 imulq	-128($np), %rax
5357bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC1, $ACC1
5367bded2dbSJung-uk Kim	vpmuludq	32*2-8-128($np), $Y2, $TEMP0	# see above
5377bded2dbSJung-uk Kim	vmovdqu		32*4-8-128($np), $TEMP2
5387bded2dbSJung-uk Kim	 add	%rax, $r2
5397bded2dbSJung-uk Kim	 mov	%rdx, %rax
5407bded2dbSJung-uk Kim	 imulq	8-128($np), %rax
5417bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC2, $ACC2
5427bded2dbSJung-uk Kim	 add	$r3, %rax
5437bded2dbSJung-uk Kim	 shr	\$29, $r2
5447bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP1, $TEMP1
5457bded2dbSJung-uk Kim	vmovdqu		32*5-8-128($np), $TEMP0
5467bded2dbSJung-uk Kim	 add	$r2, %rax
5477bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC3, $ACC3
5487bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP2, $TEMP2
5497bded2dbSJung-uk Kim	vmovdqu		32*6-8-128($np), $TEMP1
5507bded2dbSJung-uk Kim	 .byte	0x67
5517bded2dbSJung-uk Kim	 mov	%rax, $r3
5527bded2dbSJung-uk Kim	 imull	$n0, %eax
5537bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC4, $ACC4
5547bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP0, $TEMP0
5557bded2dbSJung-uk Kim	.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00	# vmovdqu		32*7-8-128($np), $TEMP2
5567bded2dbSJung-uk Kim	 and	\$0x1fffffff, %eax
5577bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC5, $ACC5
5587bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP1, $TEMP1
5597bded2dbSJung-uk Kim	vmovdqu		32*8-8-128($np), $TEMP0
5607bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC6, $ACC6
5617bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP2, $TEMP2
5627bded2dbSJung-uk Kim	vmovdqu		32*9-8-128($np), $ACC9
5637bded2dbSJung-uk Kim	 vmovd	%eax, $ACC0			# borrow ACC0 for Y2
5647bded2dbSJung-uk Kim	 imulq	-128($np), %rax
5657bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC7, $ACC7
5667bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP0, $TEMP0
5677bded2dbSJung-uk Kim	 vmovdqu	32*1-16-128($np), $TEMP1
5687bded2dbSJung-uk Kim	 vpbroadcastq	$ACC0, $ACC0
5697bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC8, $ACC8
5707bded2dbSJung-uk Kim	vpmuludq	$Y2, $ACC9, $ACC9
5717bded2dbSJung-uk Kim	 vmovdqu	32*2-16-128($np), $TEMP2
5727bded2dbSJung-uk Kim	 add	%rax, $r3
5737bded2dbSJung-uk Kim
5747bded2dbSJung-uk Kim___
5757bded2dbSJung-uk Kim($ACC0,$Y2)=($Y2,$ACC0);
5767bded2dbSJung-uk Kim$code.=<<___;
5777bded2dbSJung-uk Kim	 vmovdqu	32*1-24-128($np), $ACC0
5787bded2dbSJung-uk Kim	vpmuludq	$Y1, $TEMP1, $TEMP1
5797bded2dbSJung-uk Kim	vmovdqu		32*3-16-128($np), $TEMP0
5807bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC1, $ACC1
5817bded2dbSJung-uk Kim	 vpmuludq	$Y2, $ACC0, $ACC0
5827bded2dbSJung-uk Kim	vpmuludq	$Y1, $TEMP2, $TEMP2
5837bded2dbSJung-uk Kim	.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff	# vmovdqu		32*4-16-128($np), $TEMP1
5847bded2dbSJung-uk Kim	 vpaddq		$ACC1, $ACC0, $ACC0
5857bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC2, $ACC2
5867bded2dbSJung-uk Kim	vpmuludq	$Y1, $TEMP0, $TEMP0
5877bded2dbSJung-uk Kim	vmovdqu		32*5-16-128($np), $TEMP2
5887bded2dbSJung-uk Kim	 .byte	0x67
5897bded2dbSJung-uk Kim	 vmovq		$ACC0, %rax
5907bded2dbSJung-uk Kim	 vmovdqu	$ACC0, (%rsp)		# transfer $r0-$r3
5917bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC3, $ACC3
5927bded2dbSJung-uk Kim	vpmuludq	$Y1, $TEMP1, $TEMP1
5937bded2dbSJung-uk Kim	vmovdqu		32*6-16-128($np), $TEMP0
5947bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC4, $ACC4
5957bded2dbSJung-uk Kim	vpmuludq	$Y1, $TEMP2, $TEMP2
5967bded2dbSJung-uk Kim	vmovdqu		32*7-16-128($np), $TEMP1
5977bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC5, $ACC5
5987bded2dbSJung-uk Kim	vpmuludq	$Y1, $TEMP0, $TEMP0
5997bded2dbSJung-uk Kim	vmovdqu		32*8-16-128($np), $TEMP2
6007bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC6, $ACC6
6017bded2dbSJung-uk Kim	vpmuludq	$Y1, $TEMP1, $TEMP1
6027bded2dbSJung-uk Kim	 shr	\$29, $r3
6037bded2dbSJung-uk Kim	vmovdqu		32*9-16-128($np), $TEMP0
6047bded2dbSJung-uk Kim	 add	$r3, %rax
6057bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC7, $ACC7
6067bded2dbSJung-uk Kim	vpmuludq	$Y1, $TEMP2, $TEMP2
6077bded2dbSJung-uk Kim	 #vmovdqu	32*2-24-128($np), $TEMP1	# moved below
6087bded2dbSJung-uk Kim	 mov	%rax, $r0
6097bded2dbSJung-uk Kim	 imull	$n0, %eax
6107bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC8, $ACC8
6117bded2dbSJung-uk Kim	vpmuludq	$Y1, $TEMP0, $TEMP0
6127bded2dbSJung-uk Kim	 and	\$0x1fffffff, %eax
6137bded2dbSJung-uk Kim	 vmovd	%eax, $Y1
6147bded2dbSJung-uk Kim	 vmovdqu	32*3-24-128($np), $TEMP2
6157bded2dbSJung-uk Kim	.byte	0x67
6167bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC9, $ACC9
6177bded2dbSJung-uk Kim	 vpbroadcastq	$Y1, $Y1
6187bded2dbSJung-uk Kim
6197bded2dbSJung-uk Kim	vpmuludq	32*2-24-128($np), $Y2, $TEMP1	# see above
6207bded2dbSJung-uk Kim	vmovdqu		32*4-24-128($np), $TEMP0
6217bded2dbSJung-uk Kim	 mov	%rax, %rdx
6227bded2dbSJung-uk Kim	 imulq	-128($np), %rax
6237bded2dbSJung-uk Kim	 mov	8(%rsp), $r1
6247bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC2, $ACC1
6257bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP2, $TEMP2
6267bded2dbSJung-uk Kim	vmovdqu		32*5-24-128($np), $TEMP1
6277bded2dbSJung-uk Kim	 add	%rax, $r0
6287bded2dbSJung-uk Kim	 mov	%rdx, %rax
6297bded2dbSJung-uk Kim	 imulq	8-128($np), %rax
6307bded2dbSJung-uk Kim	 .byte	0x67
6317bded2dbSJung-uk Kim	 shr	\$29, $r0
6327bded2dbSJung-uk Kim	 mov	16(%rsp), $r2
6337bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC3, $ACC2
6347bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP0, $TEMP0
6357bded2dbSJung-uk Kim	vmovdqu		32*6-24-128($np), $TEMP2
6367bded2dbSJung-uk Kim	 add	%rax, $r1
6377bded2dbSJung-uk Kim	 mov	%rdx, %rax
6387bded2dbSJung-uk Kim	 imulq	16-128($np), %rax
6397bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC4, $ACC3
6407bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP1, $TEMP1
6417bded2dbSJung-uk Kim	vmovdqu		32*7-24-128($np), $TEMP0
6427bded2dbSJung-uk Kim	 imulq	24-128($np), %rdx		# future $r3
6437bded2dbSJung-uk Kim	 add	%rax, $r2
6447bded2dbSJung-uk Kim	 lea	($r0,$r1), %rax
6457bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC5, $ACC4
6467bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP2, $TEMP2
6477bded2dbSJung-uk Kim	vmovdqu		32*8-24-128($np), $TEMP1
6487bded2dbSJung-uk Kim	 mov	%rax, $r1
6497bded2dbSJung-uk Kim	 imull	$n0, %eax
6507bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP0, $TEMP0
6517bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC6, $ACC5
6527bded2dbSJung-uk Kim	vmovdqu		32*9-24-128($np), $TEMP2
6537bded2dbSJung-uk Kim	 and	\$0x1fffffff, %eax
6547bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC7, $ACC6
6557bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP1, $TEMP1
6567bded2dbSJung-uk Kim	 add	24(%rsp), %rdx
6577bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC8, $ACC7
6587bded2dbSJung-uk Kim	vpmuludq	$Y2, $TEMP2, $TEMP2
6597bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC9, $ACC8
6607bded2dbSJung-uk Kim	 vmovq	$r3, $ACC9
6617bded2dbSJung-uk Kim	 mov	%rdx, $r3
6627bded2dbSJung-uk Kim
6637bded2dbSJung-uk Kim	dec	$i
6647bded2dbSJung-uk Kim	jnz	.LOOP_REDUCE_1024
6657bded2dbSJung-uk Kim___
# Perl-level rename only; no instruction is emitted.  From this point on
# $ACC0 names the register that used to be called $Y2, and $Y2 names the
# register that used to be called $ACC0.
6667bded2dbSJung-uk Kim($ACC0,$Y2)=($Y2,$ACC0);
# Add the words addressed through $tp0/$tp1 to the accumulators and
# normalise the four lowest vectors.
#   * $tp1 = %rsp+448 keeps the 32*N-448 displacements small ("size
#     optimization").
#   * The register now named $ACC0 receives $ACC9 plus the old $ACC0
#     register (now named $Y2); nine vectors are then added from memory.
#     NOTE(review): $tp0 and the contents of that memory are set up before
#     this part of the file - presumably the upper half of the square;
#     confirm.
#   * Carry propagation over $ACC0..$ACC3 (spilling into $ACC4) is done
#     twice: vpsrlq by 29 extracts each lane's overflow, vpand with
#     $AND_MASK keeps the rest (presumably the 29-bit limb mask, cf. the
#     0x1fffffff scalar mask used in the loop above - confirm), vpermq 0x93
#     rotates the overflow lanes up by one (lane 3 wraps to lane 0), and
#     vpblendd with mask 3 replaces lane 0 with the wrapped overflow of the
#     previous vector (or with $ZERO for the first vector) before the
#     overflow is added back.
#   * During the second pass $ACC0..$ACC3 are written to
#     32*0-128($rp)..32*3-128($rp).
6677bded2dbSJung-uk Kim$code.=<<___;
6687bded2dbSJung-uk Kim	lea	448(%rsp), $tp1			# size optimization
6697bded2dbSJung-uk Kim	vpaddq	$ACC9, $Y2, $ACC0
6707bded2dbSJung-uk Kim	vpxor	$ZERO, $ZERO, $ZERO
6717bded2dbSJung-uk Kim
6727bded2dbSJung-uk Kim	vpaddq		32*9-192($tp0), $ACC0, $ACC0
6737bded2dbSJung-uk Kim	vpaddq		32*10-448($tp1), $ACC1, $ACC1
6747bded2dbSJung-uk Kim	vpaddq		32*11-448($tp1), $ACC2, $ACC2
6757bded2dbSJung-uk Kim	vpaddq		32*12-448($tp1), $ACC3, $ACC3
6767bded2dbSJung-uk Kim	vpaddq		32*13-448($tp1), $ACC4, $ACC4
6777bded2dbSJung-uk Kim	vpaddq		32*14-448($tp1), $ACC5, $ACC5
6787bded2dbSJung-uk Kim	vpaddq		32*15-448($tp1), $ACC6, $ACC6
6797bded2dbSJung-uk Kim	vpaddq		32*16-448($tp1), $ACC7, $ACC7
6807bded2dbSJung-uk Kim	vpaddq		32*17-448($tp1), $ACC8, $ACC8
6817bded2dbSJung-uk Kim
6827bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC0, $TEMP1
6837bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC0, $ACC0
6847bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC1, $TEMP2
6857bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC1, $ACC1
6867bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC2, $TEMP3
6877bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP1, $TEMP1
6887bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC2, $ACC2
6897bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC3, $TEMP4
6907bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP2, $TEMP2
6917bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC3, $ACC3
6927bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP3, $TEMP3
6937bded2dbSJung-uk Kim
6947bded2dbSJung-uk Kim	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
6957bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP4, $TEMP4
6967bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
6977bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC0, $ACC0
6987bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
6997bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC1, $ACC1
7007bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
7017bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC2, $ACC2
7027bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
7037bded2dbSJung-uk Kim	vpaddq		$TEMP3, $ACC3, $ACC3
7047bded2dbSJung-uk Kim	vpaddq		$TEMP4, $ACC4, $ACC4
7057bded2dbSJung-uk Kim
7067bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC0, $TEMP1
7077bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC0, $ACC0
7087bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC1, $TEMP2
7097bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC1, $ACC1
7107bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC2, $TEMP3
7117bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP1, $TEMP1
7127bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC2, $ACC2
7137bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC3, $TEMP4
7147bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP2, $TEMP2
7157bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC3, $ACC3
7167bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP3, $TEMP3
7177bded2dbSJung-uk Kim
7187bded2dbSJung-uk Kim	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
7197bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP4, $TEMP4
7207bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
7217bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC0, $ACC0
7227bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
7237bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC1, $ACC1
7247bded2dbSJung-uk Kim	vmovdqu		$ACC0, 32*0-128($rp)
7257bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
7267bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC2, $ACC2
7277bded2dbSJung-uk Kim	vmovdqu		$ACC1, 32*1-128($rp)
7287bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
7297bded2dbSJung-uk Kim	vpaddq		$TEMP3, $ACC3, $ACC3
7307bded2dbSJung-uk Kim	vmovdqu		$ACC2, 32*2-128($rp)
7317bded2dbSJung-uk Kim	vpaddq		$TEMP4, $ACC4, $ACC4
7327bded2dbSJung-uk Kim	vmovdqu		$ACC3, 32*3-128($rp)
7337bded2dbSJung-uk Kim___
# The register named $ACC0 was written to 32*0-128($rp) by the preceding
# template, so it is reused here under the name $TEMP5 as a fifth scratch
# register.
7347bded2dbSJung-uk Kim$TEMP5=$ACC0;
# Two-pass carry propagation for $ACC4..$ACC8 (same vpsrlq / vpand /
# vpermq 0x93 / vpblendd pattern as for the lower vectors), followed by the
# tail of the outer squaring loop.
#   * Lane 0 of $ACC4 receives no incoming carry here ($ZERO is blended
#     in); the overflow of $ACC3's top lane was already added to $ACC4 by
#     the preceding template.
#   * The overflow of $ACC8's top lane ends up in lane 0 of $TEMP5 after the
#     rotate and is not added to any accumulator.
#   * During the second pass $ACC4..$ACC8 are written to
#     32*4-128($rp)..32*8-128($rp).
#   * "mov $rp, $ap" makes the result the input of the next pass; control
#     returns to .LOOP_GRANDE_SQR_1024 until $rep has been decremented to
#     zero.
#   * vzeroall clears every ymm register before the epilogue, and %rax takes
#     over from %rbp as the CFA register.
7357bded2dbSJung-uk Kim$code.=<<___;
7367bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC4, $TEMP1
7377bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC4, $ACC4
7387bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC5, $TEMP2
7397bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC5, $ACC5
7407bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC6, $TEMP3
7417bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP1, $TEMP1
7427bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC6, $ACC6
7437bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC7, $TEMP4
7447bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP2, $TEMP2
7457bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC7, $ACC7
7467bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC8, $TEMP5
7477bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP3, $TEMP3
7487bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC8, $ACC8
7497bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP4, $TEMP4
7507bded2dbSJung-uk Kim
7517bded2dbSJung-uk Kim	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
7527bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP5, $TEMP5
7537bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
7547bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC4, $ACC4
7557bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
7567bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC5, $ACC5
7577bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
7587bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC6, $ACC6
7597bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
7607bded2dbSJung-uk Kim	vpaddq		$TEMP3, $ACC7, $ACC7
7617bded2dbSJung-uk Kim	vpaddq		$TEMP4, $ACC8, $ACC8
7627bded2dbSJung-uk Kim
7637bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC4, $TEMP1
7647bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC4, $ACC4
7657bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC5, $TEMP2
7667bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC5, $ACC5
7677bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC6, $TEMP3
7687bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP1, $TEMP1
7697bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC6, $ACC6
7707bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC7, $TEMP4
7717bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP2, $TEMP2
7727bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC7, $ACC7
7737bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC8, $TEMP5
7747bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP3, $TEMP3
7757bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC8, $ACC8
7767bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP4, $TEMP4
7777bded2dbSJung-uk Kim
7787bded2dbSJung-uk Kim	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
7797bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP5, $TEMP5
7807bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
7817bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC4, $ACC4
7827bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
7837bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC5, $ACC5
7847bded2dbSJung-uk Kim	vmovdqu		$ACC4, 32*4-128($rp)
7857bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
7867bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC6, $ACC6
7877bded2dbSJung-uk Kim	vmovdqu		$ACC5, 32*5-128($rp)
7887bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
7897bded2dbSJung-uk Kim	vpaddq		$TEMP3, $ACC7, $ACC7
7907bded2dbSJung-uk Kim	vmovdqu		$ACC6, 32*6-128($rp)
7917bded2dbSJung-uk Kim	vpaddq		$TEMP4, $ACC8, $ACC8
7927bded2dbSJung-uk Kim	vmovdqu		$ACC7, 32*7-128($rp)
7937bded2dbSJung-uk Kim	vmovdqu		$ACC8, 32*8-128($rp)
7947bded2dbSJung-uk Kim
7957bded2dbSJung-uk Kim	mov	$rp, $ap
7967bded2dbSJung-uk Kim	dec	$rep
7977bded2dbSJung-uk Kim	jne	.LOOP_GRANDE_SQR_1024
7987bded2dbSJung-uk Kim
7997bded2dbSJung-uk Kim	vzeroall
8007bded2dbSJung-uk Kim	mov	%rbp, %rax
801e71b7053SJung-uk Kim.cfi_def_cfa_register	%rax
8027bded2dbSJung-uk Kim___
# Emitted only when $win64 is true: reload %xmm6-%xmm15 from the fixed
# 16-byte slots at -0xd8(%rax) .. -0x48(%rax), where %rax holds the frame
# address.
# NOTE(review): .Lsqr_1024_in_tail is presumably referenced by the Win64
# unwind handler elsewhere in the file - confirm.
8037bded2dbSJung-uk Kim$code.=<<___ if ($win64);
804e71b7053SJung-uk Kim.Lsqr_1024_in_tail:
8057bded2dbSJung-uk Kim	movaps	-0xd8(%rax),%xmm6
8067bded2dbSJung-uk Kim	movaps	-0xc8(%rax),%xmm7
8077bded2dbSJung-uk Kim	movaps	-0xb8(%rax),%xmm8
8087bded2dbSJung-uk Kim	movaps	-0xa8(%rax),%xmm9
8097bded2dbSJung-uk Kim	movaps	-0x98(%rax),%xmm10
8107bded2dbSJung-uk Kim	movaps	-0x88(%rax),%xmm11
8117bded2dbSJung-uk Kim	movaps	-0x78(%rax),%xmm12
8127bded2dbSJung-uk Kim	movaps	-0x68(%rax),%xmm13
8137bded2dbSJung-uk Kim	movaps	-0x58(%rax),%xmm14
8147bded2dbSJung-uk Kim	movaps	-0x48(%rax),%xmm15
8157bded2dbSJung-uk Kim___
# Common epilogue of rsaz_1024_sqr_avx2: reload %r15, %r14, %r13, %r12,
# %rbp and %rbx from -48(%rax) .. -8(%rax), point %rsp back at the frame
# address held in %rax, and return.  Each reload is followed by the
# matching .cfi_restore so the unwind information stays in step; .size
# closes the symbol.
8167bded2dbSJung-uk Kim$code.=<<___;
8177bded2dbSJung-uk Kim	mov	-48(%rax),%r15
818e71b7053SJung-uk Kim.cfi_restore	%r15
8197bded2dbSJung-uk Kim	mov	-40(%rax),%r14
820e71b7053SJung-uk Kim.cfi_restore	%r14
8217bded2dbSJung-uk Kim	mov	-32(%rax),%r13
822e71b7053SJung-uk Kim.cfi_restore	%r13
8237bded2dbSJung-uk Kim	mov	-24(%rax),%r12
824e71b7053SJung-uk Kim.cfi_restore	%r12
8257bded2dbSJung-uk Kim	mov	-16(%rax),%rbp
826e71b7053SJung-uk Kim.cfi_restore	%rbp
8277bded2dbSJung-uk Kim	mov	-8(%rax),%rbx
828e71b7053SJung-uk Kim.cfi_restore	%rbx
8297bded2dbSJung-uk Kim	lea	(%rax),%rsp		# restore %rsp
830e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
8317bded2dbSJung-uk Kim.Lsqr_1024_epilogue:
8327bded2dbSJung-uk Kim	ret
833e71b7053SJung-uk Kim.cfi_endproc
8347bded2dbSJung-uk Kim.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
8357bded2dbSJung-uk Kim___
8367bded2dbSJung-uk Kim}
8377bded2dbSJung-uk Kim
# Generator block for rsaz_1024_mul_avx2.  The lexical variables declared
# here bind symbolic names to machine registers; they are interpolated into
# the assembly templates appended to $code below.
8387bded2dbSJung-uk Kim{ # void AMM_WW(
8397bded2dbSJung-uk Kimmy $rp="%rdi";	# BN_ULONG *rp,
8407bded2dbSJung-uk Kimmy $ap="%rsi";	# const BN_ULONG *ap,
8417bded2dbSJung-uk Kimmy $bp="%rdx";	# const BN_ULONG *bp,
8427bded2dbSJung-uk Kimmy $np="%rcx";	# const BN_ULONG *np,
8437bded2dbSJung-uk Kimmy $n0="%r8d";	# unsigned int n0);
8447bded2dbSJung-uk Kim
# Arithmetic behind the count below: ceil(1024/29) = 36 redundant words,
# four 64-bit lanes per ymm register, hence 36/4 = 9 registers
# ($ACC0..$ACC8).  $ACC9 is a tenth register that the loop uses for the
# overflow of $ACC3 and for the topmost partial products.
8457bded2dbSJung-uk Kim# The registers that hold the accumulated redundant result
8467bded2dbSJung-uk Kim# The AMM works on 1024 bit operands, and redundant word size is 29
8477bded2dbSJung-uk Kim# Therefore: ceil(1024/29)/4 = 9
8487bded2dbSJung-uk Kimmy $ACC0="%ymm0";
8497bded2dbSJung-uk Kimmy $ACC1="%ymm1";
8507bded2dbSJung-uk Kimmy $ACC2="%ymm2";
8517bded2dbSJung-uk Kimmy $ACC3="%ymm3";
8527bded2dbSJung-uk Kimmy $ACC4="%ymm4";
8537bded2dbSJung-uk Kimmy $ACC5="%ymm5";
8547bded2dbSJung-uk Kimmy $ACC6="%ymm6";
8557bded2dbSJung-uk Kimmy $ACC7="%ymm7";
8567bded2dbSJung-uk Kimmy $ACC8="%ymm8";
8577bded2dbSJung-uk Kimmy $ACC9="%ymm9";
8587bded2dbSJung-uk Kim
# $Bi is loaded with vpbroadcastq from the words at $bp; $Yi is broadcast
# from the reduction factor the scalar code derives with $n0.
8597bded2dbSJung-uk Kim# Registers that hold the broadcasted words of multiplier, currently used
8607bded2dbSJung-uk Kimmy $Bi="%ymm10";
8617bded2dbSJung-uk Kimmy $Yi="%ymm11";
8627bded2dbSJung-uk Kim
# $TEMP0 is an alias of $ACC0 (%ymm0), not a separate register.  $ZERO is
# relied upon to be zero after the vzeroall in the prologue, and $AND_MASK
# is loaded from .Land_mask before the loop.
8637bded2dbSJung-uk Kim# Helper registers
8647bded2dbSJung-uk Kimmy $TEMP0=$ACC0;
8657bded2dbSJung-uk Kimmy $TEMP1="%ymm12";
8667bded2dbSJung-uk Kimmy $TEMP2="%ymm13";
8677bded2dbSJung-uk Kimmy $ZERO="%ymm14";
8687bded2dbSJung-uk Kimmy $AND_MASK="%ymm15";
8697bded2dbSJung-uk Kim
8707bded2dbSJung-uk Kim# alu registers that hold the first words of the ACC
8717bded2dbSJung-uk Kimmy $r0="%r9";
8727bded2dbSJung-uk Kimmy $r1="%r10";
8737bded2dbSJung-uk Kimmy $r2="%r11";
8747bded2dbSJung-uk Kimmy $r3="%r12";
8757bded2dbSJung-uk Kim
# $i is the counter of .Loop_mul_1024; $tmp is scratch for the
# page-crossing tests on $ap and $np.
8767bded2dbSJung-uk Kimmy $i="%r14d";
8777bded2dbSJung-uk Kimmy $tmp="%r15";
8787bded2dbSJung-uk Kim
# The scalar code uses %rdx as a scratch register, so the bp argument is
# copied out of %rdx into %r13 by the prologue ("mov %rdx, $bp") and all
# later references go through the new name.
8797bded2dbSJung-uk Kim$bp="%r13";	# reassigned argument
8807bded2dbSJung-uk Kim
# Entry point of rsaz_1024_mul_avx2 (declared as a five-argument function,
# 64-byte aligned).  The incoming %rsp is kept in %rax, which becomes the
# CFA register, and %rbx, %rbp, %r12, %r13, %r14 and %r15 are pushed with
# matching .cfi_push annotations.
8817bded2dbSJung-uk Kim$code.=<<___;
8827bded2dbSJung-uk Kim.globl	rsaz_1024_mul_avx2
8837bded2dbSJung-uk Kim.type	rsaz_1024_mul_avx2,\@function,5
8847bded2dbSJung-uk Kim.align	64
8857bded2dbSJung-uk Kimrsaz_1024_mul_avx2:
886e71b7053SJung-uk Kim.cfi_startproc
8877bded2dbSJung-uk Kim	lea	(%rsp), %rax
888e71b7053SJung-uk Kim.cfi_def_cfa_register	%rax
8897bded2dbSJung-uk Kim	push	%rbx
890e71b7053SJung-uk Kim.cfi_push	%rbx
8917bded2dbSJung-uk Kim	push	%rbp
892e71b7053SJung-uk Kim.cfi_push	%rbp
8937bded2dbSJung-uk Kim	push	%r12
894e71b7053SJung-uk Kim.cfi_push	%r12
8957bded2dbSJung-uk Kim	push	%r13
896e71b7053SJung-uk Kim.cfi_push	%r13
8977bded2dbSJung-uk Kim	push	%r14
898e71b7053SJung-uk Kim.cfi_push	%r14
8997bded2dbSJung-uk Kim	push	%r15
900e71b7053SJung-uk Kim.cfi_push	%r15
9017bded2dbSJung-uk Kim___
# Emitted only when $win64 is true: after vzeroupper, reserve 0xa8 bytes
# (ten 16-byte slots plus 8) below the pushed registers and save
# %xmm6-%xmm15 at -0xd8(%rax) .. -0x48(%rax).  The .Lmul_1024_body label
# exists only in this variant.
9027bded2dbSJung-uk Kim$code.=<<___ if ($win64);
9037bded2dbSJung-uk Kim	vzeroupper
9047bded2dbSJung-uk Kim	lea	-0xa8(%rsp),%rsp
9057bded2dbSJung-uk Kim	vmovaps	%xmm6,-0xd8(%rax)
9067bded2dbSJung-uk Kim	vmovaps	%xmm7,-0xc8(%rax)
9077bded2dbSJung-uk Kim	vmovaps	%xmm8,-0xb8(%rax)
9087bded2dbSJung-uk Kim	vmovaps	%xmm9,-0xa8(%rax)
9097bded2dbSJung-uk Kim	vmovaps	%xmm10,-0x98(%rax)
9107bded2dbSJung-uk Kim	vmovaps	%xmm11,-0x88(%rax)
9117bded2dbSJung-uk Kim	vmovaps	%xmm12,-0x78(%rax)
9127bded2dbSJung-uk Kim	vmovaps	%xmm13,-0x68(%rax)
9137bded2dbSJung-uk Kim	vmovaps	%xmm14,-0x58(%rax)
9147bded2dbSJung-uk Kim	vmovaps	%xmm15,-0x48(%rax)
9157bded2dbSJung-uk Kim.Lmul_1024_body:
9167bded2dbSJung-uk Kim___
# Body of rsaz_1024_mul_avx2 up to the end of the multiplication loop.
#
# Setup:
#   * %rbp keeps the frame address, vzeroall clears all ymm registers, the
#     bp argument is copied from %rdx into $bp and 64 bytes of stack are
#     reserved.
#   * If the 32*10 bytes starting at $ap cross a 4096-byte boundary, i.e.
#     ((ap & 4095) + 320) >> 12 is non-zero, $ap and $bp are exchanged with
#     cmovnz.  The "mov $ap, $tmp" between the shr and the cmovnz pair does
#     not modify the flags.
#   * $ap, $np and $rp are each advanced by 128 ("sub $-128") so that the
#     displacements used below stay small.
#   * If $np crosses a page in the same sense, nine vectors of it plus a
#     zero tenth vector are copied to the stack (after %rsp has been aligned
#     down to 512 bytes), $np is redirected to the copy and $ACC0..$ACC8 are
#     cleared again.
#   * %rsp is aligned to 64, the first multiplier word is loaded into %rbx
#     and broadcast into $Bi, (%rsp) and $r0..$r3 are cleared, $AND_MASK is
#     loaded from .Land_mask, the tenth vector of $rp is zeroed and the loop
#     counter $i is set to 9.
#
# .Loop_mul_1024 runs nine times and consumes four multiplier words per
# iteration (0, 8, 16 and 24($bp)) before $bp is advanced by 32.  For each
# word the scalar code updates the lowest words held in $r0..$r3 and derives
# the reduction factor (low word * $n0) & 0x1fffffff, which is broadcast into
# $Yi, while the vector code accumulates $ap * $Bi and $np * $Yi into the
# $ACC registers.  The second, third and fourth steps load $ap/$np at byte
# offsets of -8, -16 and -24, so the products of each successive multiplier
# word land one word position higher in the same accumulators.  In the
# fourth step the sums are written one register down ($ACC1 -> $ACC0, ...,
# $ACC9 -> $ACC8) and the new $ACC0 is stored to (%rsp) for the scalar code
# of the next iteration ("transfer $r0-$r3").  The instructions marked
# "correct $ACC3" move the bits of $ACC3 above bit 29 into its next-higher
# lanes and into lane 0 of $ACC4.
9177bded2dbSJung-uk Kim$code.=<<___;
9187bded2dbSJung-uk Kim	mov	%rax,%rbp
919e71b7053SJung-uk Kim.cfi_def_cfa_register	%rbp
9207bded2dbSJung-uk Kim	vzeroall
9217bded2dbSJung-uk Kim	mov	%rdx, $bp	# reassigned argument
9227bded2dbSJung-uk Kim	sub	\$64,%rsp
9237bded2dbSJung-uk Kim
9247bded2dbSJung-uk Kim	# unaligned 256-bit load that crosses page boundary can
9257bded2dbSJung-uk Kim	# cause severe performance degradation here, so if $ap does
9267bded2dbSJung-uk Kim	# cross page boundary, swap it with $bp [meaning that caller
9277bded2dbSJung-uk Kim	# is advised to lay down $ap and $bp next to each other, so
9287bded2dbSJung-uk Kim	# that only one can cross page boundary].
9297bded2dbSJung-uk Kim	.byte	0x67,0x67
9307bded2dbSJung-uk Kim	mov	$ap, $tmp
9317bded2dbSJung-uk Kim	and	\$4095, $tmp
9327bded2dbSJung-uk Kim	add	\$32*10, $tmp
9337bded2dbSJung-uk Kim	shr	\$12, $tmp
9347bded2dbSJung-uk Kim	mov	$ap, $tmp
9357bded2dbSJung-uk Kim	cmovnz	$bp, $ap
9367bded2dbSJung-uk Kim	cmovnz	$tmp, $bp
9377bded2dbSJung-uk Kim
9387bded2dbSJung-uk Kim	mov	$np, $tmp
9397bded2dbSJung-uk Kim	sub	\$-128,$ap	# size optimization
9407bded2dbSJung-uk Kim	sub	\$-128,$np
9417bded2dbSJung-uk Kim	sub	\$-128,$rp
9427bded2dbSJung-uk Kim
9437bded2dbSJung-uk Kim	and	\$4095, $tmp	# see if $np crosses page
9447bded2dbSJung-uk Kim	add	\$32*10, $tmp
9457bded2dbSJung-uk Kim	.byte	0x67,0x67
9467bded2dbSJung-uk Kim	shr	\$12, $tmp
9477bded2dbSJung-uk Kim	jz	.Lmul_1024_no_n_copy
9487bded2dbSJung-uk Kim
9497bded2dbSJung-uk Kim	# unaligned 256-bit load that crosses page boundary can
9507bded2dbSJung-uk Kim	# cause severe performance degradation here, so if $np does
9517bded2dbSJung-uk Kim	# cross page boundary, copy it to stack and make sure stack
9527bded2dbSJung-uk Kim	# frame doesn't...
9537bded2dbSJung-uk Kim	sub		\$32*10,%rsp
9547bded2dbSJung-uk Kim	vmovdqu		32*0-128($np), $ACC0
9557bded2dbSJung-uk Kim	and		\$-512, %rsp
9567bded2dbSJung-uk Kim	vmovdqu		32*1-128($np), $ACC1
9577bded2dbSJung-uk Kim	vmovdqu		32*2-128($np), $ACC2
9587bded2dbSJung-uk Kim	vmovdqu		32*3-128($np), $ACC3
9597bded2dbSJung-uk Kim	vmovdqu		32*4-128($np), $ACC4
9607bded2dbSJung-uk Kim	vmovdqu		32*5-128($np), $ACC5
9617bded2dbSJung-uk Kim	vmovdqu		32*6-128($np), $ACC6
9627bded2dbSJung-uk Kim	vmovdqu		32*7-128($np), $ACC7
9637bded2dbSJung-uk Kim	vmovdqu		32*8-128($np), $ACC8
9647bded2dbSJung-uk Kim	lea		64+128(%rsp),$np
9657bded2dbSJung-uk Kim	vmovdqu		$ACC0, 32*0-128($np)
9667bded2dbSJung-uk Kim	vpxor		$ACC0, $ACC0, $ACC0
9677bded2dbSJung-uk Kim	vmovdqu		$ACC1, 32*1-128($np)
9687bded2dbSJung-uk Kim	vpxor		$ACC1, $ACC1, $ACC1
9697bded2dbSJung-uk Kim	vmovdqu		$ACC2, 32*2-128($np)
9707bded2dbSJung-uk Kim	vpxor		$ACC2, $ACC2, $ACC2
9717bded2dbSJung-uk Kim	vmovdqu		$ACC3, 32*3-128($np)
9727bded2dbSJung-uk Kim	vpxor		$ACC3, $ACC3, $ACC3
9737bded2dbSJung-uk Kim	vmovdqu		$ACC4, 32*4-128($np)
9747bded2dbSJung-uk Kim	vpxor		$ACC4, $ACC4, $ACC4
9757bded2dbSJung-uk Kim	vmovdqu		$ACC5, 32*5-128($np)
9767bded2dbSJung-uk Kim	vpxor		$ACC5, $ACC5, $ACC5
9777bded2dbSJung-uk Kim	vmovdqu		$ACC6, 32*6-128($np)
9787bded2dbSJung-uk Kim	vpxor		$ACC6, $ACC6, $ACC6
9797bded2dbSJung-uk Kim	vmovdqu		$ACC7, 32*7-128($np)
9807bded2dbSJung-uk Kim	vpxor		$ACC7, $ACC7, $ACC7
9817bded2dbSJung-uk Kim	vmovdqu		$ACC8, 32*8-128($np)
9827bded2dbSJung-uk Kim	vmovdqa		$ACC0, $ACC8
9837bded2dbSJung-uk Kim	vmovdqu		$ACC9, 32*9-128($np)	# $ACC9 is zero after vzeroall
9847bded2dbSJung-uk Kim.Lmul_1024_no_n_copy:
9857bded2dbSJung-uk Kim	and	\$-64,%rsp
9867bded2dbSJung-uk Kim
9877bded2dbSJung-uk Kim	mov	($bp), %rbx
9887bded2dbSJung-uk Kim	vpbroadcastq ($bp), $Bi
9897bded2dbSJung-uk Kim	vmovdqu	$ACC0, (%rsp)			# clear top of stack
9907bded2dbSJung-uk Kim	xor	$r0, $r0
9917bded2dbSJung-uk Kim	.byte	0x67
9927bded2dbSJung-uk Kim	xor	$r1, $r1
9937bded2dbSJung-uk Kim	xor	$r2, $r2
9947bded2dbSJung-uk Kim	xor	$r3, $r3
9957bded2dbSJung-uk Kim
9967bded2dbSJung-uk Kim	vmovdqu	.Land_mask(%rip), $AND_MASK
9977bded2dbSJung-uk Kim	mov	\$9, $i
9987bded2dbSJung-uk Kim	vmovdqu	$ACC9, 32*9-128($rp)		# $ACC9 is zero after vzeroall
9997bded2dbSJung-uk Kim	jmp	.Loop_mul_1024
10007bded2dbSJung-uk Kim
10017bded2dbSJung-uk Kim.align	32
10027bded2dbSJung-uk Kim.Loop_mul_1024:
10037bded2dbSJung-uk Kim	 vpsrlq		\$29, $ACC3, $ACC9		# correct $ACC3(*)
10047bded2dbSJung-uk Kim	mov	%rbx, %rax
10057bded2dbSJung-uk Kim	imulq	-128($ap), %rax
10067bded2dbSJung-uk Kim	add	$r0, %rax
10077bded2dbSJung-uk Kim	mov	%rbx, $r1
10087bded2dbSJung-uk Kim	imulq	8-128($ap), $r1
10097bded2dbSJung-uk Kim	add	8(%rsp), $r1
10107bded2dbSJung-uk Kim
10117bded2dbSJung-uk Kim	mov	%rax, $r0
10127bded2dbSJung-uk Kim	imull	$n0, %eax
10137bded2dbSJung-uk Kim	and	\$0x1fffffff, %eax
10147bded2dbSJung-uk Kim
10157bded2dbSJung-uk Kim	 mov	%rbx, $r2
10167bded2dbSJung-uk Kim	 imulq	16-128($ap), $r2
10177bded2dbSJung-uk Kim	 add	16(%rsp), $r2
10187bded2dbSJung-uk Kim
10197bded2dbSJung-uk Kim	 mov	%rbx, $r3
10207bded2dbSJung-uk Kim	 imulq	24-128($ap), $r3
10217bded2dbSJung-uk Kim	 add	24(%rsp), $r3
10227bded2dbSJung-uk Kim	vpmuludq	32*1-128($ap),$Bi,$TEMP0
10237bded2dbSJung-uk Kim	 vmovd		%eax, $Yi
10247bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC1,$ACC1
10257bded2dbSJung-uk Kim	vpmuludq	32*2-128($ap),$Bi,$TEMP1
10267bded2dbSJung-uk Kim	 vpbroadcastq	$Yi, $Yi
10277bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC2,$ACC2
10287bded2dbSJung-uk Kim	vpmuludq	32*3-128($ap),$Bi,$TEMP2
10297bded2dbSJung-uk Kim	 vpand		$AND_MASK, $ACC3, $ACC3		# correct $ACC3
10307bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC3,$ACC3
10317bded2dbSJung-uk Kim	vpmuludq	32*4-128($ap),$Bi,$TEMP0
10327bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC4,$ACC4
10337bded2dbSJung-uk Kim	vpmuludq	32*5-128($ap),$Bi,$TEMP1
10347bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC5,$ACC5
10357bded2dbSJung-uk Kim	vpmuludq	32*6-128($ap),$Bi,$TEMP2
10367bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC6,$ACC6
10377bded2dbSJung-uk Kim	vpmuludq	32*7-128($ap),$Bi,$TEMP0
10387bded2dbSJung-uk Kim	 vpermq		\$0x93, $ACC9, $ACC9		# correct $ACC3
10397bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC7,$ACC7
10407bded2dbSJung-uk Kim	vpmuludq	32*8-128($ap),$Bi,$TEMP1
10417bded2dbSJung-uk Kim	 vpbroadcastq	8($bp), $Bi
10427bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC8,$ACC8
10437bded2dbSJung-uk Kim
10447bded2dbSJung-uk Kim	mov	%rax,%rdx
10457bded2dbSJung-uk Kim	imulq	-128($np),%rax
10467bded2dbSJung-uk Kim	add	%rax,$r0
10477bded2dbSJung-uk Kim	mov	%rdx,%rax
10487bded2dbSJung-uk Kim	imulq	8-128($np),%rax
10497bded2dbSJung-uk Kim	add	%rax,$r1
10507bded2dbSJung-uk Kim	mov	%rdx,%rax
10517bded2dbSJung-uk Kim	imulq	16-128($np),%rax
10527bded2dbSJung-uk Kim	add	%rax,$r2
10537bded2dbSJung-uk Kim	shr	\$29, $r0
10547bded2dbSJung-uk Kim	imulq	24-128($np),%rdx
10557bded2dbSJung-uk Kim	add	%rdx,$r3
10567bded2dbSJung-uk Kim	add	$r0, $r1
10577bded2dbSJung-uk Kim
10587bded2dbSJung-uk Kim	vpmuludq	32*1-128($np),$Yi,$TEMP2
10597bded2dbSJung-uk Kim	 vmovq		$Bi, %rbx
10607bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC1,$ACC1
10617bded2dbSJung-uk Kim	vpmuludq	32*2-128($np),$Yi,$TEMP0
10627bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC2,$ACC2
10637bded2dbSJung-uk Kim	vpmuludq	32*3-128($np),$Yi,$TEMP1
10647bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC3,$ACC3
10657bded2dbSJung-uk Kim	vpmuludq	32*4-128($np),$Yi,$TEMP2
10667bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC4,$ACC4
10677bded2dbSJung-uk Kim	vpmuludq	32*5-128($np),$Yi,$TEMP0
10687bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC5,$ACC5
10697bded2dbSJung-uk Kim	vpmuludq	32*6-128($np),$Yi,$TEMP1
10707bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC6,$ACC6
10717bded2dbSJung-uk Kim	vpmuludq	32*7-128($np),$Yi,$TEMP2
1072c4ad4dffSJung-uk Kim	 vpblendd	\$3, $ZERO, $ACC9, $TEMP1	# correct $ACC3
10737bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC7,$ACC7
10747bded2dbSJung-uk Kim	vpmuludq	32*8-128($np),$Yi,$TEMP0
1075c4ad4dffSJung-uk Kim	 vpaddq		$TEMP1, $ACC3, $ACC3		# correct $ACC3
10767bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC8,$ACC8
10777bded2dbSJung-uk Kim
10787bded2dbSJung-uk Kim	mov	%rbx, %rax
10797bded2dbSJung-uk Kim	imulq	-128($ap),%rax
10807bded2dbSJung-uk Kim	add	%rax,$r1
10817bded2dbSJung-uk Kim	 vmovdqu	-8+32*1-128($ap),$TEMP1
10827bded2dbSJung-uk Kim	mov	%rbx, %rax
10837bded2dbSJung-uk Kim	imulq	8-128($ap),%rax
10847bded2dbSJung-uk Kim	add	%rax,$r2
10857bded2dbSJung-uk Kim	 vmovdqu	-8+32*2-128($ap),$TEMP2
10867bded2dbSJung-uk Kim
10877bded2dbSJung-uk Kim	mov	$r1, %rax
1088c4ad4dffSJung-uk Kim	 vpblendd	\$0xfc, $ZERO, $ACC9, $ACC9	# correct $ACC3
10897bded2dbSJung-uk Kim	imull	$n0, %eax
1090c4ad4dffSJung-uk Kim	 vpaddq		$ACC9,$ACC4,$ACC4		# correct $ACC3
10917bded2dbSJung-uk Kim	and	\$0x1fffffff, %eax
10927bded2dbSJung-uk Kim
10937bded2dbSJung-uk Kim	 imulq	16-128($ap),%rbx
10947bded2dbSJung-uk Kim	 add	%rbx,$r3
10957bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP1,$TEMP1
10967bded2dbSJung-uk Kim	 vmovd		%eax, $Yi
10977bded2dbSJung-uk Kim	vmovdqu		-8+32*3-128($ap),$TEMP0
10987bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC1,$ACC1
10997bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP2,$TEMP2
11007bded2dbSJung-uk Kim	 vpbroadcastq	$Yi, $Yi
11017bded2dbSJung-uk Kim	vmovdqu		-8+32*4-128($ap),$TEMP1
11027bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC2,$ACC2
11037bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP0,$TEMP0
11047bded2dbSJung-uk Kim	vmovdqu		-8+32*5-128($ap),$TEMP2
11057bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC3,$ACC3
11067bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP1,$TEMP1
11077bded2dbSJung-uk Kim	vmovdqu		-8+32*6-128($ap),$TEMP0
11087bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC4,$ACC4
11097bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP2,$TEMP2
11107bded2dbSJung-uk Kim	vmovdqu		-8+32*7-128($ap),$TEMP1
11117bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC5,$ACC5
11127bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP0,$TEMP0
11137bded2dbSJung-uk Kim	vmovdqu		-8+32*8-128($ap),$TEMP2
11147bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC6,$ACC6
11157bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP1,$TEMP1
11167bded2dbSJung-uk Kim	vmovdqu		-8+32*9-128($ap),$ACC9
11177bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC7,$ACC7
11187bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP2,$TEMP2
11197bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC8,$ACC8
11207bded2dbSJung-uk Kim	vpmuludq	$Bi,$ACC9,$ACC9
11217bded2dbSJung-uk Kim	 vpbroadcastq	16($bp), $Bi
11227bded2dbSJung-uk Kim
11237bded2dbSJung-uk Kim	mov	%rax,%rdx
11247bded2dbSJung-uk Kim	imulq	-128($np),%rax
11257bded2dbSJung-uk Kim	add	%rax,$r1
11267bded2dbSJung-uk Kim	 vmovdqu	-8+32*1-128($np),$TEMP0
11277bded2dbSJung-uk Kim	mov	%rdx,%rax
11287bded2dbSJung-uk Kim	imulq	8-128($np),%rax
11297bded2dbSJung-uk Kim	add	%rax,$r2
11307bded2dbSJung-uk Kim	 vmovdqu	-8+32*2-128($np),$TEMP1
11317bded2dbSJung-uk Kim	shr	\$29, $r1
11327bded2dbSJung-uk Kim	imulq	16-128($np),%rdx
11337bded2dbSJung-uk Kim	add	%rdx,$r3
11347bded2dbSJung-uk Kim	add	$r1, $r2
11357bded2dbSJung-uk Kim
11367bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP0,$TEMP0
11377bded2dbSJung-uk Kim	 vmovq		$Bi, %rbx
11387bded2dbSJung-uk Kim	vmovdqu		-8+32*3-128($np),$TEMP2
11397bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC1,$ACC1
11407bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP1,$TEMP1
11417bded2dbSJung-uk Kim	vmovdqu		-8+32*4-128($np),$TEMP0
11427bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC2,$ACC2
11437bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP2,$TEMP2
11447bded2dbSJung-uk Kim	vmovdqu		-8+32*5-128($np),$TEMP1
11457bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC3,$ACC3
11467bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP0,$TEMP0
11477bded2dbSJung-uk Kim	vmovdqu		-8+32*6-128($np),$TEMP2
11487bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC4,$ACC4
11497bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP1,$TEMP1
11507bded2dbSJung-uk Kim	vmovdqu		-8+32*7-128($np),$TEMP0
11517bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC5,$ACC5
11527bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP2,$TEMP2
11537bded2dbSJung-uk Kim	vmovdqu		-8+32*8-128($np),$TEMP1
11547bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC6,$ACC6
11557bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP0,$TEMP0
11567bded2dbSJung-uk Kim	vmovdqu		-8+32*9-128($np),$TEMP2
11577bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC7,$ACC7
11587bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP1,$TEMP1
11597bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC8,$ACC8
11607bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP2,$TEMP2
11617bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC9,$ACC9
11627bded2dbSJung-uk Kim
11637bded2dbSJung-uk Kim	 vmovdqu	-16+32*1-128($ap),$TEMP0
11647bded2dbSJung-uk Kim	mov	%rbx,%rax
11657bded2dbSJung-uk Kim	imulq	-128($ap),%rax
11667bded2dbSJung-uk Kim	add	$r2,%rax
11677bded2dbSJung-uk Kim
11687bded2dbSJung-uk Kim	 vmovdqu	-16+32*2-128($ap),$TEMP1
11697bded2dbSJung-uk Kim	mov	%rax,$r2
11707bded2dbSJung-uk Kim	imull	$n0, %eax
11717bded2dbSJung-uk Kim	and	\$0x1fffffff, %eax
11727bded2dbSJung-uk Kim
11737bded2dbSJung-uk Kim	 imulq	8-128($ap),%rbx
11747bded2dbSJung-uk Kim	 add	%rbx,$r3
11757bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP0,$TEMP0
11767bded2dbSJung-uk Kim	 vmovd		%eax, $Yi
11777bded2dbSJung-uk Kim	vmovdqu		-16+32*3-128($ap),$TEMP2
11787bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC1,$ACC1
11797bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP1,$TEMP1
11807bded2dbSJung-uk Kim	 vpbroadcastq	$Yi, $Yi
11817bded2dbSJung-uk Kim	vmovdqu		-16+32*4-128($ap),$TEMP0
11827bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC2,$ACC2
11837bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP2,$TEMP2
11847bded2dbSJung-uk Kim	vmovdqu		-16+32*5-128($ap),$TEMP1
11857bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC3,$ACC3
11867bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP0,$TEMP0
11877bded2dbSJung-uk Kim	vmovdqu		-16+32*6-128($ap),$TEMP2
11887bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC4,$ACC4
11897bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP1,$TEMP1
11907bded2dbSJung-uk Kim	vmovdqu		-16+32*7-128($ap),$TEMP0
11917bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC5,$ACC5
11927bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP2,$TEMP2
11937bded2dbSJung-uk Kim	vmovdqu		-16+32*8-128($ap),$TEMP1
11947bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC6,$ACC6
11957bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP0,$TEMP0
11967bded2dbSJung-uk Kim	vmovdqu		-16+32*9-128($ap),$TEMP2
11977bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC7,$ACC7
11987bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP1,$TEMP1
11997bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC8,$ACC8
12007bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP2,$TEMP2
12017bded2dbSJung-uk Kim	 vpbroadcastq	24($bp), $Bi
12027bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC9,$ACC9
12037bded2dbSJung-uk Kim
12047bded2dbSJung-uk Kim	 vmovdqu	-16+32*1-128($np),$TEMP0
12057bded2dbSJung-uk Kim	mov	%rax,%rdx
12067bded2dbSJung-uk Kim	imulq	-128($np),%rax
12077bded2dbSJung-uk Kim	add	%rax,$r2
12087bded2dbSJung-uk Kim	 vmovdqu	-16+32*2-128($np),$TEMP1
12097bded2dbSJung-uk Kim	imulq	8-128($np),%rdx
12107bded2dbSJung-uk Kim	add	%rdx,$r3
12117bded2dbSJung-uk Kim	shr	\$29, $r2
12127bded2dbSJung-uk Kim
12137bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP0,$TEMP0
12147bded2dbSJung-uk Kim	 vmovq		$Bi, %rbx
12157bded2dbSJung-uk Kim	vmovdqu		-16+32*3-128($np),$TEMP2
12167bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC1,$ACC1
12177bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP1,$TEMP1
12187bded2dbSJung-uk Kim	vmovdqu		-16+32*4-128($np),$TEMP0
12197bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC2,$ACC2
12207bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP2,$TEMP2
12217bded2dbSJung-uk Kim	vmovdqu		-16+32*5-128($np),$TEMP1
12227bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC3,$ACC3
12237bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP0,$TEMP0
12247bded2dbSJung-uk Kim	vmovdqu		-16+32*6-128($np),$TEMP2
12257bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC4,$ACC4
12267bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP1,$TEMP1
12277bded2dbSJung-uk Kim	vmovdqu		-16+32*7-128($np),$TEMP0
12287bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC5,$ACC5
12297bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP2,$TEMP2
12307bded2dbSJung-uk Kim	vmovdqu		-16+32*8-128($np),$TEMP1
12317bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC6,$ACC6
12327bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP0,$TEMP0
12337bded2dbSJung-uk Kim	vmovdqu		-16+32*9-128($np),$TEMP2
12347bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC7,$ACC7
12357bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP1,$TEMP1
12367bded2dbSJung-uk Kim	 vmovdqu	-24+32*1-128($ap),$TEMP0
12377bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC8,$ACC8
12387bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP2,$TEMP2
12397bded2dbSJung-uk Kim	 vmovdqu	-24+32*2-128($ap),$TEMP1
12407bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC9,$ACC9
12417bded2dbSJung-uk Kim
12427bded2dbSJung-uk Kim	add	$r2, $r3
12437bded2dbSJung-uk Kim	imulq	-128($ap),%rbx
12447bded2dbSJung-uk Kim	add	%rbx,$r3
12457bded2dbSJung-uk Kim
12467bded2dbSJung-uk Kim	mov	$r3, %rax
12477bded2dbSJung-uk Kim	imull	$n0, %eax
12487bded2dbSJung-uk Kim	and	\$0x1fffffff, %eax
12497bded2dbSJung-uk Kim
12507bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP0,$TEMP0
12517bded2dbSJung-uk Kim	 vmovd		%eax, $Yi
12527bded2dbSJung-uk Kim	vmovdqu		-24+32*3-128($ap),$TEMP2
12537bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC1,$ACC1
12547bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP1,$TEMP1
12557bded2dbSJung-uk Kim	 vpbroadcastq	$Yi, $Yi
12567bded2dbSJung-uk Kim	vmovdqu		-24+32*4-128($ap),$TEMP0
12577bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC2,$ACC2
12587bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP2,$TEMP2
12597bded2dbSJung-uk Kim	vmovdqu		-24+32*5-128($ap),$TEMP1
12607bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC3,$ACC3
12617bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP0,$TEMP0
12627bded2dbSJung-uk Kim	vmovdqu		-24+32*6-128($ap),$TEMP2
12637bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC4,$ACC4
12647bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP1,$TEMP1
12657bded2dbSJung-uk Kim	vmovdqu		-24+32*7-128($ap),$TEMP0
12667bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC5,$ACC5
12677bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP2,$TEMP2
12687bded2dbSJung-uk Kim	vmovdqu		-24+32*8-128($ap),$TEMP1
12697bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC6,$ACC6
12707bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP0,$TEMP0
12717bded2dbSJung-uk Kim	vmovdqu		-24+32*9-128($ap),$TEMP2
12727bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC7,$ACC7
12737bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP1,$TEMP1
12747bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC8,$ACC8
12757bded2dbSJung-uk Kim	vpmuludq	$Bi,$TEMP2,$TEMP2
12767bded2dbSJung-uk Kim	 vpbroadcastq	32($bp), $Bi
12777bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC9,$ACC9
12787bded2dbSJung-uk Kim	 add		\$32, $bp			# $bp++
12797bded2dbSJung-uk Kim
12807bded2dbSJung-uk Kim	vmovdqu		-24+32*1-128($np),$TEMP0
12817bded2dbSJung-uk Kim	imulq	-128($np),%rax
12827bded2dbSJung-uk Kim	add	%rax,$r3
12837bded2dbSJung-uk Kim	shr	\$29, $r3
12847bded2dbSJung-uk Kim
12857bded2dbSJung-uk Kim	vmovdqu		-24+32*2-128($np),$TEMP1
12867bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP0,$TEMP0
12877bded2dbSJung-uk Kim	 vmovq		$Bi, %rbx
12887bded2dbSJung-uk Kim	vmovdqu		-24+32*3-128($np),$TEMP2
12897bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC1,$ACC0		# $ACC0==$TEMP0
12907bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP1,$TEMP1
12917bded2dbSJung-uk Kim	 vmovdqu	$ACC0, (%rsp)			# transfer $r0-$r3
12927bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC2,$ACC1
12937bded2dbSJung-uk Kim	vmovdqu		-24+32*4-128($np),$TEMP0
12947bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP2,$TEMP2
12957bded2dbSJung-uk Kim	vmovdqu		-24+32*5-128($np),$TEMP1
12967bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC3,$ACC2
12977bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP0,$TEMP0
12987bded2dbSJung-uk Kim	vmovdqu		-24+32*6-128($np),$TEMP2
12997bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC4,$ACC3
13007bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP1,$TEMP1
13017bded2dbSJung-uk Kim	vmovdqu		-24+32*7-128($np),$TEMP0
13027bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC5,$ACC4
13037bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP2,$TEMP2
13047bded2dbSJung-uk Kim	vmovdqu		-24+32*8-128($np),$TEMP1
13057bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC6,$ACC5
13067bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP0,$TEMP0
13077bded2dbSJung-uk Kim	vmovdqu		-24+32*9-128($np),$TEMP2
13087bded2dbSJung-uk Kim	 mov	$r3, $r0
13097bded2dbSJung-uk Kim	vpaddq		$TEMP0,$ACC7,$ACC6
13107bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP1,$TEMP1
13117bded2dbSJung-uk Kim	 add	(%rsp), $r0
13127bded2dbSJung-uk Kim	vpaddq		$TEMP1,$ACC8,$ACC7
13137bded2dbSJung-uk Kim	vpmuludq	$Yi,$TEMP2,$TEMP2
13147bded2dbSJung-uk Kim	 vmovq	$r3, $TEMP1
13157bded2dbSJung-uk Kim	vpaddq		$TEMP2,$ACC9,$ACC8
13167bded2dbSJung-uk Kim
13177bded2dbSJung-uk Kim	dec	$i
13187bded2dbSJung-uk Kim	jnz	.Loop_mul_1024
13197bded2dbSJung-uk Kim___
13207bded2dbSJung-uk Kim
13217bded2dbSJung-uk Kim# (*)	Original implementation was correcting ACC1-ACC3 for overflow
13227bded2dbSJung-uk Kim#	after 7 loop runs, or after 28 iterations, or 56 additions.
13237bded2dbSJung-uk Kim#	But as we underutilize resources, it's possible to correct in
13247bded2dbSJung-uk Kim#	each iteration with marginal performance loss. But then, as
13257bded2dbSJung-uk Kim#	we do it in each iteration, we can correct fewer digits, and
1326c4ad4dffSJung-uk Kim#	avoid performance penalties completely.
13277bded2dbSJung-uk Kim
13287bded2dbSJung-uk Kim$TEMP0 = $ACC9;
13297bded2dbSJung-uk Kim$TEMP3 = $Bi;
13307bded2dbSJung-uk Kim$TEMP4 = $Yi;
13317bded2dbSJung-uk Kim$code.=<<___;
13327bded2dbSJung-uk Kim	vpaddq		(%rsp), $TEMP1, $ACC0
13337bded2dbSJung-uk Kim
13347bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC0, $TEMP1
13357bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC0, $ACC0
13367bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC1, $TEMP2
13377bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC1, $ACC1
13387bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC2, $TEMP3
13397bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP1, $TEMP1
13407bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC2, $ACC2
13417bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC3, $TEMP4
13427bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP2, $TEMP2
13437bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC3, $ACC3
13447bded2dbSJung-uk Kim
13457bded2dbSJung-uk Kim	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
13467bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP3, $TEMP3
13477bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
13487bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP4, $TEMP4
13497bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC0, $ACC0
13507bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
13517bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC1, $ACC1
13527bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
13537bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC2, $ACC2
13547bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
13557bded2dbSJung-uk Kim	vpaddq		$TEMP3, $ACC3, $ACC3
13567bded2dbSJung-uk Kim	vpaddq		$TEMP4, $ACC4, $ACC4
13577bded2dbSJung-uk Kim
13587bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC0, $TEMP1
13597bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC0, $ACC0
13607bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC1, $TEMP2
13617bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC1, $ACC1
13627bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC2, $TEMP3
13637bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP1, $TEMP1
13647bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC2, $ACC2
13657bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC3, $TEMP4
13667bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP2, $TEMP2
13677bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC3, $ACC3
13687bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP3, $TEMP3
13697bded2dbSJung-uk Kim
13707bded2dbSJung-uk Kim	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
13717bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP4, $TEMP4
13727bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
13737bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC0, $ACC0
13747bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
13757bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC1, $ACC1
13767bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
13777bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC2, $ACC2
13787bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
13797bded2dbSJung-uk Kim	vpaddq		$TEMP3, $ACC3, $ACC3
13807bded2dbSJung-uk Kim	vpaddq		$TEMP4, $ACC4, $ACC4
13817bded2dbSJung-uk Kim
13827bded2dbSJung-uk Kim	vmovdqu		$ACC0, 0-128($rp)
13837bded2dbSJung-uk Kim	vmovdqu		$ACC1, 32-128($rp)
13847bded2dbSJung-uk Kim	vmovdqu		$ACC2, 64-128($rp)
13857bded2dbSJung-uk Kim	vmovdqu		$ACC3, 96-128($rp)
13867bded2dbSJung-uk Kim___
13877bded2dbSJung-uk Kim
13887bded2dbSJung-uk Kim$TEMP5=$ACC0;
13897bded2dbSJung-uk Kim$code.=<<___;
13907bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC4, $TEMP1
13917bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC4, $ACC4
13927bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC5, $TEMP2
13937bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC5, $ACC5
13947bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC6, $TEMP3
13957bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP1, $TEMP1
13967bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC6, $ACC6
13977bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC7, $TEMP4
13987bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP2, $TEMP2
13997bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC7, $ACC7
14007bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC8, $TEMP5
14017bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP3, $TEMP3
14027bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC8, $ACC8
14037bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP4, $TEMP4
14047bded2dbSJung-uk Kim
14057bded2dbSJung-uk Kim	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
14067bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP5, $TEMP5
14077bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
14087bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC4, $ACC4
14097bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
14107bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC5, $ACC5
14117bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
14127bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC6, $ACC6
14137bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
14147bded2dbSJung-uk Kim	vpaddq		$TEMP3, $ACC7, $ACC7
14157bded2dbSJung-uk Kim	vpaddq		$TEMP4, $ACC8, $ACC8
14167bded2dbSJung-uk Kim
14177bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC4, $TEMP1
14187bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC4, $ACC4
14197bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC5, $TEMP2
14207bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC5, $ACC5
14217bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC6, $TEMP3
14227bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP1, $TEMP1
14237bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC6, $ACC6
14247bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC7, $TEMP4
14257bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP2, $TEMP2
14267bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC7, $ACC7
14277bded2dbSJung-uk Kim	vpsrlq		\$29, $ACC8, $TEMP5
14287bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP3, $TEMP3
14297bded2dbSJung-uk Kim	vpand		$AND_MASK, $ACC8, $ACC8
14307bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP4, $TEMP4
14317bded2dbSJung-uk Kim
14327bded2dbSJung-uk Kim	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
14337bded2dbSJung-uk Kim	vpermq		\$0x93, $TEMP5, $TEMP5
14347bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
14357bded2dbSJung-uk Kim	vpaddq		$TEMP0, $ACC4, $ACC4
14367bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
14377bded2dbSJung-uk Kim	vpaddq		$TEMP1, $ACC5, $ACC5
14387bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
14397bded2dbSJung-uk Kim	vpaddq		$TEMP2, $ACC6, $ACC6
14407bded2dbSJung-uk Kim	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
14417bded2dbSJung-uk Kim	vpaddq		$TEMP3, $ACC7, $ACC7
14427bded2dbSJung-uk Kim	vpaddq		$TEMP4, $ACC8, $ACC8
14437bded2dbSJung-uk Kim
14447bded2dbSJung-uk Kim	vmovdqu		$ACC4, 128-128($rp)
14457bded2dbSJung-uk Kim	vmovdqu		$ACC5, 160-128($rp)
14467bded2dbSJung-uk Kim	vmovdqu		$ACC6, 192-128($rp)
14477bded2dbSJung-uk Kim	vmovdqu		$ACC7, 224-128($rp)
14487bded2dbSJung-uk Kim	vmovdqu		$ACC8, 256-128($rp)
14497bded2dbSJung-uk Kim	vzeroupper
14507bded2dbSJung-uk Kim
14517bded2dbSJung-uk Kim	mov	%rbp, %rax
1452e71b7053SJung-uk Kim.cfi_def_cfa_register	%rax
14537bded2dbSJung-uk Kim___
14547bded2dbSJung-uk Kim$code.=<<___ if ($win64);
1455e71b7053SJung-uk Kim.Lmul_1024_in_tail:
14567bded2dbSJung-uk Kim	movaps	-0xd8(%rax),%xmm6
14577bded2dbSJung-uk Kim	movaps	-0xc8(%rax),%xmm7
14587bded2dbSJung-uk Kim	movaps	-0xb8(%rax),%xmm8
14597bded2dbSJung-uk Kim	movaps	-0xa8(%rax),%xmm9
14607bded2dbSJung-uk Kim	movaps	-0x98(%rax),%xmm10
14617bded2dbSJung-uk Kim	movaps	-0x88(%rax),%xmm11
14627bded2dbSJung-uk Kim	movaps	-0x78(%rax),%xmm12
14637bded2dbSJung-uk Kim	movaps	-0x68(%rax),%xmm13
14647bded2dbSJung-uk Kim	movaps	-0x58(%rax),%xmm14
14657bded2dbSJung-uk Kim	movaps	-0x48(%rax),%xmm15
14667bded2dbSJung-uk Kim___
14677bded2dbSJung-uk Kim$code.=<<___;
14687bded2dbSJung-uk Kim	mov	-48(%rax),%r15
1469e71b7053SJung-uk Kim.cfi_restore	%r15
14707bded2dbSJung-uk Kim	mov	-40(%rax),%r14
1471e71b7053SJung-uk Kim.cfi_restore	%r14
14727bded2dbSJung-uk Kim	mov	-32(%rax),%r13
1473e71b7053SJung-uk Kim.cfi_restore	%r13
14747bded2dbSJung-uk Kim	mov	-24(%rax),%r12
1475e71b7053SJung-uk Kim.cfi_restore	%r12
14767bded2dbSJung-uk Kim	mov	-16(%rax),%rbp
1477e71b7053SJung-uk Kim.cfi_restore	%rbp
14787bded2dbSJung-uk Kim	mov	-8(%rax),%rbx
1479e71b7053SJung-uk Kim.cfi_restore	%rbx
14807bded2dbSJung-uk Kim	lea	(%rax),%rsp		# restore %rsp
1481e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
14827bded2dbSJung-uk Kim.Lmul_1024_epilogue:
14837bded2dbSJung-uk Kim	ret
1484e71b7053SJung-uk Kim.cfi_endproc
14857bded2dbSJung-uk Kim.size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
14867bded2dbSJung-uk Kim___
14877bded2dbSJung-uk Kim}
14887bded2dbSJung-uk Kim{
# Generates rsaz_1024_red2norm_avx2 and rsaz_1024_norm2red_avx2, which
# convert one number between two layouts:
#   - "norm": 16 packed 64-bit words;
#   - "red":  a run of 8-byte slots, slot $j carrying weight 2^(29*$j).
# $out/$inp are the destination/source pointer registers, picked according
# to $win64.  @T is a ring of four scratch registers (%r8..%r11) that the
# generator rotates with push(@T,shift(@T)) as it emits code.
14897bded2dbSJung-uk Kimmy ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
14907bded2dbSJung-uk Kimmy @T = map("%r$_",(8..11));
14917bded2dbSJung-uk Kim
14927bded2dbSJung-uk Kim$code.=<<___;
14937bded2dbSJung-uk Kim.globl	rsaz_1024_red2norm_avx2
14947bded2dbSJung-uk Kim.type	rsaz_1024_red2norm_avx2,\@abi-omnipotent
14957bded2dbSJung-uk Kim.align	32
14967bded2dbSJung-uk Kimrsaz_1024_red2norm_avx2:
14976935a639SJung-uk Kim.cfi_startproc
14987bded2dbSJung-uk Kim	sub	\$-128,$inp	# size optimization
14997bded2dbSJung-uk Kim	xor	%rax,%rax
15007bded2dbSJung-uk Kim___
15017bded2dbSJung-uk Kim
# red2norm body, fully unrolled at generation time: one pass per 64-bit
# output word $i.  $j is the running slot index across all words, $k counts
# the slots loaded for the current word.
15027bded2dbSJung-uk Kimfor ($j=0,$i=0; $i<16; $i++) {
15037bded2dbSJung-uk Kim    my $k=0;
# Load every slot whose first bit (29*$j) lies below the top of word $i.
# Each load goes to @T[0] and the ring is then rotated, so the loaded
# registers end up at @T[-$k] .. @T[-1], most recent last.
15047bded2dbSJung-uk Kim    while (29*$j<64*($i+1)) {	# load data till boundary
15057bded2dbSJung-uk Kim	$code.="	mov	`8*$j-128`($inp), @T[0]\n";
15067bded2dbSJung-uk Kim	$j++; $k++; push(@T,shift(@T));
15077bded2dbSJung-uk Kim    }
# $l keeps the slot count for the accumulate loop; $k is consumed below.
15087bded2dbSJung-uk Kim    $l=$k;
# Shift counts are written as absolute bit positions (29*slot); the final
# foreach over $code reduces sh[rl]d? immediates modulo 64.
15097bded2dbSJung-uk Kim    while ($k>1) {		# shift loaded data but last value
15107bded2dbSJung-uk Kim	$code.="	shl	\$`29*($j-$k)`,@T[-$k]\n";
15117bded2dbSJung-uk Kim	$k--;
15127bded2dbSJung-uk Kim    }
# The last slot may reach past the top of word $i.  A copy is kept in
# @T[0]: shl leaves the part belonging to word $i in @T[-1], while shr by
# the negated count (which the modulo-64 fixup turns into the complementary
# right shift) leaves in @T[0] the bits that belong to word $i+1.
15137bded2dbSJung-uk Kim    $code.=<<___;		# shift last value
15147bded2dbSJung-uk Kim	mov	@T[-1], @T[0]
15157bded2dbSJung-uk Kim	shl	\$`29*($j-1)`, @T[-1]
15167bded2dbSJung-uk Kim	shr	\$`-29*($j-1)`, @T[0]
15177bded2dbSJung-uk Kim___
# %rax enters holding the spill-over from the previous word (cleared by the
# xor in the function prologue for $i==0); add all aligned slots into it.
15187bded2dbSJung-uk Kim    while ($l) {		# accumulate all values
15197bded2dbSJung-uk Kim	$code.="	add	@T[-$l], %rax\n";
15207bded2dbSJung-uk Kim	$l--;
15217bded2dbSJung-uk Kim    }
# The carry out of the last add is folded into the high part, word $i is
# stored, and the high part becomes the starting %rax for word $i+1.
15227bded2dbSJung-uk Kim	$code.=<<___;
15237bded2dbSJung-uk Kim	adc	\$0, @T[0]	# consume eventual carry
15247bded2dbSJung-uk Kim	mov	%rax, 8*$i($out)
15257bded2dbSJung-uk Kim	mov	@T[0], %rax
15267bded2dbSJung-uk Kim___
15277bded2dbSJung-uk Kim    push(@T,shift(@T));
15287bded2dbSJung-uk Kim}
15297bded2dbSJung-uk Kim$code.=<<___;
15307bded2dbSJung-uk Kim	ret
15316935a639SJung-uk Kim.cfi_endproc
15327bded2dbSJung-uk Kim.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
15337bded2dbSJung-uk Kim
15347bded2dbSJung-uk Kim.globl	rsaz_1024_norm2red_avx2
15357bded2dbSJung-uk Kim.type	rsaz_1024_norm2red_avx2,\@abi-omnipotent
15367bded2dbSJung-uk Kim.align	32
15377bded2dbSJung-uk Kimrsaz_1024_norm2red_avx2:
15386935a639SJung-uk Kim.cfi_startproc
15397bded2dbSJung-uk Kim	sub	\$-128,$out	# size optimization
15407bded2dbSJung-uk Kim	mov	($inp),@T[0]
15417bded2dbSJung-uk Kim	mov	\$0x1fffffff,%eax
15427bded2dbSJung-uk Kim___
# norm2red body, fully unrolled: on entry to pass $i, @T[0] holds input
# word $i.  @T[1] is loaded with word $i+1, or zeroed on the last pass so
# that nothing is read past the 16th word.  %rax holds the 29-bit mask.
15437bded2dbSJung-uk Kimfor ($j=0,$i=0; $i<16; $i++) {
15447bded2dbSJung-uk Kim    $code.="	mov	`8*($i+1)`($inp),@T[1]\n"	if ($i<15);
15457bded2dbSJung-uk Kim    $code.="	xor	@T[1],@T[1]\n"			if ($i==15);
15467bded2dbSJung-uk Kim    my $k=1;
# Digits that end strictly below the top of word $i are extracted from a
# copy of @T[0] alone: shift down to bit 0, mask, store to slot $j.
# @T[-$k] selects a scratch register from the far end of the ring.
15477bded2dbSJung-uk Kim    while (29*($j+1)<64*($i+1)) {
15487bded2dbSJung-uk Kim    	$code.=<<___;
15497bded2dbSJung-uk Kim	mov	@T[0],@T[-$k]
15507bded2dbSJung-uk Kim	shr	\$`29*$j`,@T[-$k]
15517bded2dbSJung-uk Kim	and	%rax,@T[-$k]				# &0x1fffffff
15527bded2dbSJung-uk Kim	mov	@T[-$k],`8*$j-128`($out)
15537bded2dbSJung-uk Kim___
15547bded2dbSJung-uk Kim	$j++; $k++;
15557bded2dbSJung-uk Kim    }
# The next digit reaches into word $i+1: shrd shifts @T[0] right while
# filling from @T[1], then the result is masked and stored.  Rotating the
# ring afterwards makes word $i+1 the new @T[0].
15567bded2dbSJung-uk Kim    $code.=<<___;
15577bded2dbSJung-uk Kim	shrd	\$`29*$j`,@T[1],@T[0]
15587bded2dbSJung-uk Kim	and	%rax,@T[0]
15597bded2dbSJung-uk Kim	mov	@T[0],`8*$j-128`($out)
15607bded2dbSJung-uk Kim___
15617bded2dbSJung-uk Kim    $j++;
15627bded2dbSJung-uk Kim    push(@T,shift(@T));
15637bded2dbSJung-uk Kim}
# After the final rotation @T[0] is the register that the last pass cleared
# with xor, so the four stores below zero-fill the four slots that follow
# the last digit.
15647bded2dbSJung-uk Kim$code.=<<___;
15657bded2dbSJung-uk Kim	mov	@T[0],`8*$j-128`($out)			# zero
15667bded2dbSJung-uk Kim	mov	@T[0],`8*($j+1)-128`($out)
15677bded2dbSJung-uk Kim	mov	@T[0],`8*($j+2)-128`($out)
15687bded2dbSJung-uk Kim	mov	@T[0],`8*($j+3)-128`($out)
15697bded2dbSJung-uk Kim	ret
15706935a639SJung-uk Kim.cfi_endproc
15717bded2dbSJung-uk Kim.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
15727bded2dbSJung-uk Kim___
15737bded2dbSJung-uk Kim}
15747bded2dbSJung-uk Kim{
# Generates rsaz_1024_scatter5_avx2 and rsaz_1024_gather5_avx2.
# $out/$inp are the destination/source pointer registers and $power the
# 32-bit table index register, all picked according to $win64.
15757bded2dbSJung-uk Kimmy ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
15767bded2dbSJung-uk Kim
# scatter5: $out is advanced by 16*$power, then 9 rounds each read 32 bytes
# from $inp, use vpermd with .Lscatter_permd to pack dwords 0,2,4,6 into the
# low 128 bits, store those 16 bytes and step $out by 16*32 bytes.
#
# gather5 (prologue starts in this same heredoc): the entry %rsp is kept in
# %r11 so the epilogue can restore it after the stack is realigned.
15777bded2dbSJung-uk Kim$code.=<<___;
15787bded2dbSJung-uk Kim.globl	rsaz_1024_scatter5_avx2
15797bded2dbSJung-uk Kim.type	rsaz_1024_scatter5_avx2,\@abi-omnipotent
15807bded2dbSJung-uk Kim.align	32
15817bded2dbSJung-uk Kimrsaz_1024_scatter5_avx2:
15826935a639SJung-uk Kim.cfi_startproc
15837bded2dbSJung-uk Kim	vzeroupper
15847bded2dbSJung-uk Kim	vmovdqu	.Lscatter_permd(%rip),%ymm5
15857bded2dbSJung-uk Kim	shl	\$4,$power
15867bded2dbSJung-uk Kim	lea	($out,$power),$out
15877bded2dbSJung-uk Kim	mov	\$9,%eax
15887bded2dbSJung-uk Kim	jmp	.Loop_scatter_1024
15897bded2dbSJung-uk Kim
15907bded2dbSJung-uk Kim.align	32
15917bded2dbSJung-uk Kim.Loop_scatter_1024:
15927bded2dbSJung-uk Kim	vmovdqu		($inp),%ymm0
15937bded2dbSJung-uk Kim	lea		32($inp),$inp
15947bded2dbSJung-uk Kim	vpermd		%ymm0,%ymm5,%ymm0
15957bded2dbSJung-uk Kim	vmovdqu		%xmm0,($out)
15967bded2dbSJung-uk Kim	lea		16*32($out),$out
15977bded2dbSJung-uk Kim	dec	%eax
15987bded2dbSJung-uk Kim	jnz	.Loop_scatter_1024
15997bded2dbSJung-uk Kim
16007bded2dbSJung-uk Kim	vzeroupper
16017bded2dbSJung-uk Kim	ret
16026935a639SJung-uk Kim.cfi_endproc
16037bded2dbSJung-uk Kim.size	rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
16047bded2dbSJung-uk Kim
16057bded2dbSJung-uk Kim.globl	rsaz_1024_gather5_avx2
16067bded2dbSJung-uk Kim.type	rsaz_1024_gather5_avx2,\@abi-omnipotent
16077bded2dbSJung-uk Kim.align	32
16087bded2dbSJung-uk Kimrsaz_1024_gather5_avx2:
1609e71b7053SJung-uk Kim.cfi_startproc
16104c6a0400SJung-uk Kim	vzeroupper
16114c6a0400SJung-uk Kim	mov	%rsp,%r11
1612e71b7053SJung-uk Kim.cfi_def_cfa_register	%r11
16137bded2dbSJung-uk Kim___
# Win64 only: save %xmm6-%xmm15 below the entry %rsp.  The instructions are
# emitted as raw .byte sequences so that their lengths are fixed; the unwind
# codes under .LSEH_info_rsaz_1024_gather5 hard-code the matching prologue
# offsets, so these bytes and that table must be changed together.
16147bded2dbSJung-uk Kim$code.=<<___ if ($win64);
16157bded2dbSJung-uk Kim	lea	-0x88(%rsp),%rax
16167bded2dbSJung-uk Kim.LSEH_begin_rsaz_1024_gather5:
16177bded2dbSJung-uk Kim	# I can't trust assembler to use specific encoding:-(
16187bded2dbSJung-uk Kim	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax),%rsp
16197bded2dbSJung-uk Kim	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6,-0x20(%rax)
16207bded2dbSJung-uk Kim	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7,-0x10(%rax)
16217bded2dbSJung-uk Kim	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8,0(%rax)
16227bded2dbSJung-uk Kim	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9,0x10(%rax)
16237bded2dbSJung-uk Kim	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10,0x20(%rax)
16247bded2dbSJung-uk Kim	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11,0x30(%rax)
16257bded2dbSJung-uk Kim	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12,0x40(%rax)
16267bded2dbSJung-uk Kim	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13,0x50(%rax)
16277bded2dbSJung-uk Kim	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14,0x60(%rax)
16287bded2dbSJung-uk Kim	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15,0x70(%rax)
16297bded2dbSJung-uk Kim___
# gather5 main part.
# 1. Reserve 0x100 bytes and align %rsp to 32.
# 2. Broadcast $power and compare it against the index vectors built from
#    .Linc (seeds 0..3, one index per 128-bit lane, stepped by 4).  This
#    yields 16 ymm masks, i.e. one 128-bit lane mask per index 0..31.  The
#    first eight masks are spilled to the stack, the rest stay in
#    %ymm8-%ymm15.
# 3. Nine rounds: read all 16*32 bytes of the current table row, AND each
#    32-byte chunk with its mask and OR everything together, so the
#    addresses read do not depend on $power.  The two 128-bit halves are
#    then merged, .Lgather_permd (fetched as -32(.Linc)) spreads the four
#    dwords to qword positions, and 32 bytes are stored to $out.
#    $power is reused as the round counter once it has been broadcast.
# 4. One extra all-zero 32-byte store follows the ninth round.
16307bded2dbSJung-uk Kim$code.=<<___;
16314c6a0400SJung-uk Kim	lea	-0x100(%rsp),%rsp
16324c6a0400SJung-uk Kim	and	\$-32, %rsp
16334c6a0400SJung-uk Kim	lea	.Linc(%rip), %r10
16344c6a0400SJung-uk Kim	lea	-128(%rsp),%rax			# control u-op density
16357bded2dbSJung-uk Kim
16364c6a0400SJung-uk Kim	vmovd		$power, %xmm4
16374c6a0400SJung-uk Kim	vmovdqa		(%r10),%ymm0
16384c6a0400SJung-uk Kim	vmovdqa		32(%r10),%ymm1
16394c6a0400SJung-uk Kim	vmovdqa		64(%r10),%ymm5
16404c6a0400SJung-uk Kim	vpbroadcastd	%xmm4,%ymm4
16417bded2dbSJung-uk Kim
16424c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm0, %ymm2
16434c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm0, %ymm0
16444c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm1, %ymm3
16454c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm1, %ymm1
16464c6a0400SJung-uk Kim	vmovdqa		%ymm0, 32*0+128(%rax)
16474c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm2, %ymm0
16484c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm2, %ymm2
16494c6a0400SJung-uk Kim	vmovdqa		%ymm1, 32*1+128(%rax)
16504c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm3, %ymm1
16514c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm3, %ymm3
16524c6a0400SJung-uk Kim	vmovdqa		%ymm2, 32*2+128(%rax)
16534c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm0, %ymm2
16544c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm0, %ymm0
16554c6a0400SJung-uk Kim	vmovdqa		%ymm3, 32*3+128(%rax)
16564c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm1, %ymm3
16574c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm1, %ymm1
16584c6a0400SJung-uk Kim	vmovdqa		%ymm0, 32*4+128(%rax)
16594c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm2, %ymm8
16604c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm2, %ymm2
16614c6a0400SJung-uk Kim	vmovdqa		%ymm1, 32*5+128(%rax)
16624c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm3, %ymm9
16634c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm3, %ymm3
16644c6a0400SJung-uk Kim	vmovdqa		%ymm2, 32*6+128(%rax)
16654c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm8, %ymm10
16664c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm8, %ymm8
16674c6a0400SJung-uk Kim	vmovdqa		%ymm3, 32*7+128(%rax)
16684c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm9, %ymm11
16694c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm9, %ymm9
16704c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm10, %ymm12
16714c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm10, %ymm10
16724c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm11, %ymm13
16734c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm11, %ymm11
16744c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm12, %ymm14
16754c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm12, %ymm12
16764c6a0400SJung-uk Kim	vpaddd		%ymm5, %ymm13, %ymm15
16774c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm13, %ymm13
16784c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm14, %ymm14
16794c6a0400SJung-uk Kim	vpcmpeqd	%ymm4, %ymm15, %ymm15
16807bded2dbSJung-uk Kim
16814c6a0400SJung-uk Kim	vmovdqa	-32(%r10),%ymm7			# .Lgather_permd
16824c6a0400SJung-uk Kim	lea	128($inp), $inp
16834c6a0400SJung-uk Kim	mov	\$9,$power
16844c6a0400SJung-uk Kim
16857bded2dbSJung-uk Kim.Loop_gather_1024:
16864c6a0400SJung-uk Kim	vmovdqa		32*0-128($inp),	%ymm0
16874c6a0400SJung-uk Kim	vmovdqa		32*1-128($inp),	%ymm1
16884c6a0400SJung-uk Kim	vmovdqa		32*2-128($inp),	%ymm2
16894c6a0400SJung-uk Kim	vmovdqa		32*3-128($inp),	%ymm3
16904c6a0400SJung-uk Kim	vpand		32*0+128(%rax),	%ymm0,	%ymm0
16914c6a0400SJung-uk Kim	vpand		32*1+128(%rax),	%ymm1,	%ymm1
16924c6a0400SJung-uk Kim	vpand		32*2+128(%rax),	%ymm2,	%ymm2
16934c6a0400SJung-uk Kim	vpor		%ymm0, %ymm1, %ymm4
16944c6a0400SJung-uk Kim	vpand		32*3+128(%rax),	%ymm3,	%ymm3
16954c6a0400SJung-uk Kim	vmovdqa		32*4-128($inp),	%ymm0
16964c6a0400SJung-uk Kim	vmovdqa		32*5-128($inp),	%ymm1
16974c6a0400SJung-uk Kim	vpor		%ymm2, %ymm3, %ymm5
16984c6a0400SJung-uk Kim	vmovdqa		32*6-128($inp),	%ymm2
16994c6a0400SJung-uk Kim	vmovdqa		32*7-128($inp),	%ymm3
17004c6a0400SJung-uk Kim	vpand		32*4+128(%rax),	%ymm0,	%ymm0
17014c6a0400SJung-uk Kim	vpand		32*5+128(%rax),	%ymm1,	%ymm1
17024c6a0400SJung-uk Kim	vpand		32*6+128(%rax),	%ymm2,	%ymm2
17034c6a0400SJung-uk Kim	vpor		%ymm0, %ymm4, %ymm4
17044c6a0400SJung-uk Kim	vpand		32*7+128(%rax),	%ymm3,	%ymm3
17054c6a0400SJung-uk Kim	vpand		32*8-128($inp),	%ymm8,	%ymm0
17064c6a0400SJung-uk Kim	vpor		%ymm1, %ymm5, %ymm5
17074c6a0400SJung-uk Kim	vpand		32*9-128($inp),	%ymm9,	%ymm1
17084c6a0400SJung-uk Kim	vpor		%ymm2, %ymm4, %ymm4
17094c6a0400SJung-uk Kim	vpand		32*10-128($inp),%ymm10,	%ymm2
17104c6a0400SJung-uk Kim	vpor		%ymm3, %ymm5, %ymm5
17114c6a0400SJung-uk Kim	vpand		32*11-128($inp),%ymm11,	%ymm3
17124c6a0400SJung-uk Kim	vpor		%ymm0, %ymm4, %ymm4
17134c6a0400SJung-uk Kim	vpand		32*12-128($inp),%ymm12,	%ymm0
17144c6a0400SJung-uk Kim	vpor		%ymm1, %ymm5, %ymm5
17154c6a0400SJung-uk Kim	vpand		32*13-128($inp),%ymm13,	%ymm1
17164c6a0400SJung-uk Kim	vpor		%ymm2, %ymm4, %ymm4
17174c6a0400SJung-uk Kim	vpand		32*14-128($inp),%ymm14,	%ymm2
17184c6a0400SJung-uk Kim	vpor		%ymm3, %ymm5, %ymm5
17194c6a0400SJung-uk Kim	vpand		32*15-128($inp),%ymm15,	%ymm3
17204c6a0400SJung-uk Kim	lea		32*16($inp), $inp
17214c6a0400SJung-uk Kim	vpor		%ymm0, %ymm4, %ymm4
17224c6a0400SJung-uk Kim	vpor		%ymm1, %ymm5, %ymm5
17234c6a0400SJung-uk Kim	vpor		%ymm2, %ymm4, %ymm4
17244c6a0400SJung-uk Kim	vpor		%ymm3, %ymm5, %ymm5
17254c6a0400SJung-uk Kim
17264c6a0400SJung-uk Kim	vpor		%ymm5, %ymm4, %ymm4
17274c6a0400SJung-uk Kim	vextracti128	\$1, %ymm4, %xmm5	# upper half is cleared
17287bded2dbSJung-uk Kim	vpor		%xmm4, %xmm5, %xmm5
17294c6a0400SJung-uk Kim	vpermd		%ymm5,%ymm7,%ymm5
17304c6a0400SJung-uk Kim	vmovdqu		%ymm5,($out)
17317bded2dbSJung-uk Kim	lea		32($out),$out
17324c6a0400SJung-uk Kim	dec	$power
17337bded2dbSJung-uk Kim	jnz	.Loop_gather_1024
17347bded2dbSJung-uk Kim
17357bded2dbSJung-uk Kim	vpxor	%ymm0,%ymm0,%ymm0
17367bded2dbSJung-uk Kim	vmovdqu	%ymm0,($out)
17377bded2dbSJung-uk Kim	vzeroupper
17387bded2dbSJung-uk Kim___
# Win64 only: reload %xmm6-%xmm15 from the save area, addressed relative to
# the entry %rsp held in %r11 (-0xa8 is the slot written as -0x20(%rax) in
# the prologue, since %rax was entry %rsp - 0x88).
17397bded2dbSJung-uk Kim$code.=<<___ if ($win64);
17404c6a0400SJung-uk Kim	movaps	-0xa8(%r11),%xmm6
17414c6a0400SJung-uk Kim	movaps	-0x98(%r11),%xmm7
17424c6a0400SJung-uk Kim	movaps	-0x88(%r11),%xmm8
17434c6a0400SJung-uk Kim	movaps	-0x78(%r11),%xmm9
17444c6a0400SJung-uk Kim	movaps	-0x68(%r11),%xmm10
17454c6a0400SJung-uk Kim	movaps	-0x58(%r11),%xmm11
17464c6a0400SJung-uk Kim	movaps	-0x48(%r11),%xmm12
17474c6a0400SJung-uk Kim	movaps	-0x38(%r11),%xmm13
17484c6a0400SJung-uk Kim	movaps	-0x28(%r11),%xmm14
17494c6a0400SJung-uk Kim	movaps	-0x18(%r11),%xmm15
17507bded2dbSJung-uk Kim___
# Common epilogue: restore the entry %rsp from %r11 and return.
17517bded2dbSJung-uk Kim$code.=<<___;
17524c6a0400SJung-uk Kim	lea	(%r11),%rsp
1753e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
17547bded2dbSJung-uk Kim	ret
1755e71b7053SJung-uk Kim.cfi_endproc
1756e71b7053SJung-uk Kim.LSEH_end_rsaz_1024_gather5:
17577bded2dbSJung-uk Kim.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
17587bded2dbSJung-uk Kim___
17597bded2dbSJung-uk Kim}
17607bded2dbSJung-uk Kim
# rsaz_avx2_eligible: loads the dword at OPENSSL_ia32cap_P+8 and returns its
# bit 5 as 0 or 1 (presumably the AVX2 feature flag, going by the function
# name -- confirm against the OPENSSL_ia32cap documentation).
17617bded2dbSJung-uk Kim$code.=<<___;
17627bded2dbSJung-uk Kim.extern	OPENSSL_ia32cap_P
17637bded2dbSJung-uk Kim.globl	rsaz_avx2_eligible
17647bded2dbSJung-uk Kim.type	rsaz_avx2_eligible,\@abi-omnipotent
17657bded2dbSJung-uk Kim.align	32
17667bded2dbSJung-uk Kimrsaz_avx2_eligible:
17677bded2dbSJung-uk Kim	mov	OPENSSL_ia32cap_P+8(%rip),%eax
17687bded2dbSJung-uk Kim___
# Only emitted when $addx is set: if bits 8 and 19 of that dword are both
# set (the in-line comment names them BMI2+AD*X), %eax is cleared so the
# function returns 0 -- presumably because a different code path is
# preferred on such CPUs; that path is not visible here.
17697bded2dbSJung-uk Kim$code.=<<___	if ($addx);
17707bded2dbSJung-uk Kim	mov	\$`1<<8|1<<19`,%ecx
17717bded2dbSJung-uk Kim	mov	\$0,%edx
17727bded2dbSJung-uk Kim	and	%eax,%ecx
17737bded2dbSJung-uk Kim	cmp	\$`1<<8|1<<19`,%ecx	# check for BMI2+AD*X
17747bded2dbSJung-uk Kim	cmove	%edx,%eax
17757bded2dbSJung-uk Kim___
# Tail of rsaz_avx2_eligible, followed by the constant tables:
#   .Land_mask       four qwords of 0x1fffffff (the 29-bit digit mask)
#   .Lscatter_permd  vpermd indices loaded by rsaz_1024_scatter5_avx2
#   .Lgather_permd   vpermd indices used by rsaz_1024_gather5_avx2
#   .Linc            index seeds and the step of 4 for gather5's masks
# gather5 fetches .Lgather_permd as -32(.Linc) and uses aligned loads, so
# the order, sizes and alignment of these tables must stay as they are.
17767bded2dbSJung-uk Kim$code.=<<___;
17777bded2dbSJung-uk Kim	and	\$`1<<5`,%eax
17787bded2dbSJung-uk Kim	shr	\$5,%eax
17797bded2dbSJung-uk Kim	ret
17807bded2dbSJung-uk Kim.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
17817bded2dbSJung-uk Kim
17827bded2dbSJung-uk Kim.align	64
17837bded2dbSJung-uk Kim.Land_mask:
1784c4ad4dffSJung-uk Kim	.quad	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
17857bded2dbSJung-uk Kim.Lscatter_permd:
17867bded2dbSJung-uk Kim	.long	0,2,4,6,7,7,7,7
17877bded2dbSJung-uk Kim.Lgather_permd:
17887bded2dbSJung-uk Kim	.long	0,7,1,7,2,7,3,7
17894c6a0400SJung-uk Kim.Linc:
17904c6a0400SJung-uk Kim	.long	0,0,0,0, 1,1,1,1
17914c6a0400SJung-uk Kim	.long	2,2,2,2, 3,3,3,3
17924c6a0400SJung-uk Kim	.long	4,4,4,4, 4,4,4,4
17937bded2dbSJung-uk Kim.align	64
17947bded2dbSJung-uk Kim___
17957bded2dbSJung-uk Kim
17967bded2dbSJung-uk Kimif ($win64) {
# Win64 structured-exception support: a language-specific handler shared by
# the sqr and mul routines, plus the .pdata/.xdata tables that register it.
# The four names below are the registers in which the handler receives its
# arguments.
17977bded2dbSJung-uk Kim$rec="%rcx";
17987bded2dbSJung-uk Kim$frame="%rdx";
17997bded2dbSJung-uk Kim$context="%r8";
18007bded2dbSJung-uk Kim$disp="%r9";
18017bded2dbSJung-uk Kim
# rsaz_se_handler outline (HandlerData = the three .rva values after the
# handler address in .xdata: body label, epilogue label, "in tail" label):
#   - Rip before the body label or at/after the epilogue label: go straight
#     to .Lcommon_seh_tail with context->Rax unchanged.
#   - otherwise take the frame base from context->Rbp when Rip is below the
#     "in tail" label (cmovc), else keep context->Rax; the routines copy
#     %rbp to %rax immediately before their in-tail label.
#   - reload %r15..%rbx from -48..-8 off that base into the context and copy
#     the ten saved %xmm registers (20 qwords from base-0xd8) to context+512.
#   - common tail: write base/Rsi/Rdi back to the context, copy the context
#     to disp->ContextRecord, call RtlVirtualUnwind, and return 1
#     (ExceptionContinueSearch).
# The statement below has no trailing ';' -- it is the last statement of the
# enclosing block, so Perl does not require one.
18027bded2dbSJung-uk Kim$code.=<<___
18037bded2dbSJung-uk Kim.extern	__imp_RtlVirtualUnwind
18047bded2dbSJung-uk Kim.type	rsaz_se_handler,\@abi-omnipotent
18057bded2dbSJung-uk Kim.align	16
18067bded2dbSJung-uk Kimrsaz_se_handler:
18077bded2dbSJung-uk Kim	push	%rsi
18087bded2dbSJung-uk Kim	push	%rdi
18097bded2dbSJung-uk Kim	push	%rbx
18107bded2dbSJung-uk Kim	push	%rbp
18117bded2dbSJung-uk Kim	push	%r12
18127bded2dbSJung-uk Kim	push	%r13
18137bded2dbSJung-uk Kim	push	%r14
18147bded2dbSJung-uk Kim	push	%r15
18157bded2dbSJung-uk Kim	pushfq
18167bded2dbSJung-uk Kim	sub	\$64,%rsp
18177bded2dbSJung-uk Kim
18187bded2dbSJung-uk Kim	mov	120($context),%rax	# pull context->Rax
18197bded2dbSJung-uk Kim	mov	248($context),%rbx	# pull context->Rip
18207bded2dbSJung-uk Kim
18217bded2dbSJung-uk Kim	mov	8($disp),%rsi		# disp->ImageBase
18227bded2dbSJung-uk Kim	mov	56($disp),%r11		# disp->HandlerData
18237bded2dbSJung-uk Kim
18247bded2dbSJung-uk Kim	mov	0(%r11),%r10d		# HandlerData[0]
18257bded2dbSJung-uk Kim	lea	(%rsi,%r10),%r10	# prologue label
18267bded2dbSJung-uk Kim	cmp	%r10,%rbx		# context->Rip<prologue label
18277bded2dbSJung-uk Kim	jb	.Lcommon_seh_tail
18287bded2dbSJung-uk Kim
18297bded2dbSJung-uk Kim	mov	4(%r11),%r10d		# HandlerData[1]
18307bded2dbSJung-uk Kim	lea	(%rsi,%r10),%r10	# epilogue label
18317bded2dbSJung-uk Kim	cmp	%r10,%rbx		# context->Rip>=epilogue label
18327bded2dbSJung-uk Kim	jae	.Lcommon_seh_tail
18337bded2dbSJung-uk Kim
1834e71b7053SJung-uk Kim	mov	160($context),%rbp	# pull context->Rbp
1835e71b7053SJung-uk Kim
1836e71b7053SJung-uk Kim	mov	8(%r11),%r10d		# HandlerData[2]
1837e71b7053SJung-uk Kim	lea	(%rsi,%r10),%r10	# "in tail" label
1838e71b7053SJung-uk Kim	cmp	%r10,%rbx		# context->Rip>="in tail" label
1839e71b7053SJung-uk Kim	cmovc	%rbp,%rax
18407bded2dbSJung-uk Kim
18417bded2dbSJung-uk Kim	mov	-48(%rax),%r15
18427bded2dbSJung-uk Kim	mov	-40(%rax),%r14
18437bded2dbSJung-uk Kim	mov	-32(%rax),%r13
18447bded2dbSJung-uk Kim	mov	-24(%rax),%r12
18457bded2dbSJung-uk Kim	mov	-16(%rax),%rbp
18467bded2dbSJung-uk Kim	mov	-8(%rax),%rbx
18477bded2dbSJung-uk Kim	mov	%r15,240($context)
18487bded2dbSJung-uk Kim	mov	%r14,232($context)
18497bded2dbSJung-uk Kim	mov	%r13,224($context)
18507bded2dbSJung-uk Kim	mov	%r12,216($context)
18517bded2dbSJung-uk Kim	mov	%rbp,160($context)
18527bded2dbSJung-uk Kim	mov	%rbx,144($context)
18537bded2dbSJung-uk Kim
18547bded2dbSJung-uk Kim	lea	-0xd8(%rax),%rsi	# %xmm save area
18557bded2dbSJung-uk Kim	lea	512($context),%rdi	# & context.Xmm6
18567bded2dbSJung-uk Kim	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
18577bded2dbSJung-uk Kim	.long	0xa548f3fc		# cld; rep movsq
18587bded2dbSJung-uk Kim
18597bded2dbSJung-uk Kim.Lcommon_seh_tail:
18607bded2dbSJung-uk Kim	mov	8(%rax),%rdi
18617bded2dbSJung-uk Kim	mov	16(%rax),%rsi
18627bded2dbSJung-uk Kim	mov	%rax,152($context)	# restore context->Rsp
18637bded2dbSJung-uk Kim	mov	%rsi,168($context)	# restore context->Rsi
18647bded2dbSJung-uk Kim	mov	%rdi,176($context)	# restore context->Rdi
18657bded2dbSJung-uk Kim
18667bded2dbSJung-uk Kim	mov	40($disp),%rdi		# disp->ContextRecord
18677bded2dbSJung-uk Kim	mov	$context,%rsi		# context
18687bded2dbSJung-uk Kim	mov	\$154,%ecx		# sizeof(CONTEXT)
18697bded2dbSJung-uk Kim	.long	0xa548f3fc		# cld; rep movsq
18707bded2dbSJung-uk Kim
18717bded2dbSJung-uk Kim	mov	$disp,%rsi
18727bded2dbSJung-uk Kim	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
18737bded2dbSJung-uk Kim	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
18747bded2dbSJung-uk Kim	mov	0(%rsi),%r8		# arg3, disp->ControlPc
18757bded2dbSJung-uk Kim	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
18767bded2dbSJung-uk Kim	mov	40(%rsi),%r10		# disp->ContextRecord
18777bded2dbSJung-uk Kim	lea	56(%rsi),%r11		# &disp->HandlerData
18787bded2dbSJung-uk Kim	lea	24(%rsi),%r12		# &disp->EstablisherFrame
18797bded2dbSJung-uk Kim	mov	%r10,32(%rsp)		# arg5
18807bded2dbSJung-uk Kim	mov	%r11,40(%rsp)		# arg6
18817bded2dbSJung-uk Kim	mov	%r12,48(%rsp)		# arg7
18827bded2dbSJung-uk Kim	mov	%rcx,56(%rsp)		# arg8, (NULL)
18837bded2dbSJung-uk Kim	call	*__imp_RtlVirtualUnwind(%rip)
18847bded2dbSJung-uk Kim
18857bded2dbSJung-uk Kim	mov	\$1,%eax		# ExceptionContinueSearch
18867bded2dbSJung-uk Kim	add	\$64,%rsp
18877bded2dbSJung-uk Kim	popfq
18887bded2dbSJung-uk Kim	pop	%r15
18897bded2dbSJung-uk Kim	pop	%r14
18907bded2dbSJung-uk Kim	pop	%r13
18917bded2dbSJung-uk Kim	pop	%r12
18927bded2dbSJung-uk Kim	pop	%rbp
18937bded2dbSJung-uk Kim	pop	%rbx
18947bded2dbSJung-uk Kim	pop	%rdi
18957bded2dbSJung-uk Kim	pop	%rsi
18967bded2dbSJung-uk Kim	ret
18977bded2dbSJung-uk Kim.size	rsaz_se_handler,.-rsaz_se_handler
18987bded2dbSJung-uk Kim
18997bded2dbSJung-uk Kim.section	.pdata
19007bded2dbSJung-uk Kim.align	4
19017bded2dbSJung-uk Kim	.rva	.LSEH_begin_rsaz_1024_sqr_avx2
19027bded2dbSJung-uk Kim	.rva	.LSEH_end_rsaz_1024_sqr_avx2
19037bded2dbSJung-uk Kim	.rva	.LSEH_info_rsaz_1024_sqr_avx2
19047bded2dbSJung-uk Kim
19057bded2dbSJung-uk Kim	.rva	.LSEH_begin_rsaz_1024_mul_avx2
19067bded2dbSJung-uk Kim	.rva	.LSEH_end_rsaz_1024_mul_avx2
19077bded2dbSJung-uk Kim	.rva	.LSEH_info_rsaz_1024_mul_avx2
19087bded2dbSJung-uk Kim
19097bded2dbSJung-uk Kim	.rva	.LSEH_begin_rsaz_1024_gather5
19107bded2dbSJung-uk Kim	.rva	.LSEH_end_rsaz_1024_gather5
19117bded2dbSJung-uk Kim	.rva	.LSEH_info_rsaz_1024_gather5
19127bded2dbSJung-uk Kim.section	.xdata
19137bded2dbSJung-uk Kim.align	8
19147bded2dbSJung-uk Kim.LSEH_info_rsaz_1024_sqr_avx2:
19157bded2dbSJung-uk Kim	.byte	9,0,0,0
19167bded2dbSJung-uk Kim	.rva	rsaz_se_handler
1917e71b7053SJung-uk Kim	.rva	.Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail
1918e71b7053SJung-uk Kim	.long	0
19197bded2dbSJung-uk Kim.LSEH_info_rsaz_1024_mul_avx2:
19207bded2dbSJung-uk Kim	.byte	9,0,0,0
19217bded2dbSJung-uk Kim	.rva	rsaz_se_handler
1922e71b7053SJung-uk Kim	.rva	.Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail
1923e71b7053SJung-uk Kim	.long	0
19247bded2dbSJung-uk Kim.LSEH_info_rsaz_1024_gather5:
19254c6a0400SJung-uk Kim	.byte	0x01,0x36,0x17,0x0b
19267bded2dbSJung-uk Kim	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
19277bded2dbSJung-uk Kim	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
19287bded2dbSJung-uk Kim	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
19297bded2dbSJung-uk Kim	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
19307bded2dbSJung-uk Kim	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
19317bded2dbSJung-uk Kim	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
19327bded2dbSJung-uk Kim	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
19337bded2dbSJung-uk Kim	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
19347bded2dbSJung-uk Kim	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
19357bded2dbSJung-uk Kim	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
19367bded2dbSJung-uk Kim	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
19374c6a0400SJung-uk Kim	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
19387bded2dbSJung-uk Kim___
19397bded2dbSJung-uk Kim}
19407bded2dbSJung-uk Kim
# Post-process the generated text held in $code, one line at a time, and
# print the result.  Each line goes through two stages:
#   1. backtick expansion: every `expr` span is replaced by the value of
#      eval(expr);
#   2. at most one of the rewrites chained with "or" below.  s/// returns
#      false when nothing matched, so the chain stops at the first
#      substitution that applies and the later ones are not attempted.
# The /o flags have no effect here, since none of the patterns interpolates
# a variable.
19417bded2dbSJung-uk Kimforeach (split("\n",$code)) {
19427bded2dbSJung-uk Kim	s/\`([^\`]*)\`/eval($1)/ge;
19437bded2dbSJung-uk Kim
	# shl/shr/shld/shrd with an immediate count: the count is replaced by
	# count % 64, computed by Perl through /e.  With Perl's default (non
	# "use integer") arithmetic a negative count wraps into 0..63.
19447bded2dbSJung-uk Kim	s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge		or
19457bded2dbSJung-uk Kim
	# vmovd/vmovq: rewrite a %ymmN operand to %xmmN.  The (.+) is greedy,
	# so it is the last %ymmN after the mnemonic that gets rewritten.
19467bded2dbSJung-uk Kim	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	# vmovdqu: only an operand spelled "%x%ymmN" is turned into %xmmN; a
	# plain %ymmN is left alone.  Presumably the "%x" marker comes from the
	# code templates earlier in the file -- TODO confirm.
19477bded2dbSJung-uk Kim	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	# vpinsrq/vpinsrd and vpextrq/vpextrd: same %ymmN -> %xmmN rewrite as
	# for vmovd/vmovq above.
19487bded2dbSJung-uk Kim	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
19497bded2dbSJung-uk Kim	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	# vpbroadcastq/vpbroadcastd: only a %ymmN written directly after the
	# mnemonic and its whitespace (the first operand as written) becomes
	# %xmmN.
19507bded2dbSJung-uk Kim	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
	# Emit the (possibly rewritten) line; split() removed the newline.
19517bded2dbSJung-uk Kim	print $_,"\n";
19527bded2dbSJung-uk Kim}
19537bded2dbSJung-uk Kim
19547bded2dbSJung-uk Kim}}} else {{{
# Fallback output for the else branch (per the note on the print line: the
# assembler is too old).  The heredoc below is printed verbatim and defines:
#   - rsaz_avx2_eligible: clears %eax and returns, i.e. reports 0;
#   - rsaz_1024_{sqr,mul,norm2red,red2norm,scatter5,gather5}_avx2: six
#     global labels that all alias one stub consisting of the raw bytes
#     0x0f,0x0b (ud2) followed by ret.  Only rsaz_1024_sqr_avx2 is given
#     .type/.size directives.
# Presumably callers consult rsaz_avx2_eligible() before using the other
# entry points, so the ud2 stub is never reached -- TODO confirm in callers.
19557bded2dbSJung-uk Kimprint <<___;	# assembler is too old
19567bded2dbSJung-uk Kim.text
19577bded2dbSJung-uk Kim
19587bded2dbSJung-uk Kim.globl	rsaz_avx2_eligible
19597bded2dbSJung-uk Kim.type	rsaz_avx2_eligible,\@abi-omnipotent
19607bded2dbSJung-uk Kimrsaz_avx2_eligible:
19617bded2dbSJung-uk Kim	xor	%eax,%eax
19627bded2dbSJung-uk Kim	ret
19637bded2dbSJung-uk Kim.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
19647bded2dbSJung-uk Kim
19657bded2dbSJung-uk Kim.globl	rsaz_1024_sqr_avx2
19667bded2dbSJung-uk Kim.globl	rsaz_1024_mul_avx2
19677bded2dbSJung-uk Kim.globl	rsaz_1024_norm2red_avx2
19687bded2dbSJung-uk Kim.globl	rsaz_1024_red2norm_avx2
19697bded2dbSJung-uk Kim.globl	rsaz_1024_scatter5_avx2
19707bded2dbSJung-uk Kim.globl	rsaz_1024_gather5_avx2
19717bded2dbSJung-uk Kim.type	rsaz_1024_sqr_avx2,\@abi-omnipotent
19727bded2dbSJung-uk Kimrsaz_1024_sqr_avx2:
19737bded2dbSJung-uk Kimrsaz_1024_mul_avx2:
19747bded2dbSJung-uk Kimrsaz_1024_norm2red_avx2:
19757bded2dbSJung-uk Kimrsaz_1024_red2norm_avx2:
19767bded2dbSJung-uk Kimrsaz_1024_scatter5_avx2:
19777bded2dbSJung-uk Kimrsaz_1024_gather5_avx2:
19787bded2dbSJung-uk Kim	.byte	0x0f,0x0b	# ud2
19797bded2dbSJung-uk Kim	ret
19807bded2dbSJung-uk Kim.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
19817bded2dbSJung-uk Kim___
19827bded2dbSJung-uk Kim}}}
19837bded2dbSJung-uk Kim
# Close STDOUT explicitly and abort with the system error text ($!) if the
# close fails, so an error on the generated output is not silently ignored.
198417f01e99SJung-uk Kimclose STDOUT or die "error closing STDOUT: $!";
1985