#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
#
# References:
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular
#     Exponentiation, Using Advanced Vector Instructions Architectures",
#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
# [2] S. Gueron: "Efficient Software Implementations of Modular
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE
#     Proceedings of 9th International Conference on Information Technology:
#     New Generations (ITNG 2012), pp.821-823 (2012)
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 1024-bit modular exponentiation, for optimizing RSA2048
#     on AVX2 capable x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
#
# +13% improvement over original submission by <appro@openssl.org>
#
# rsa2048 sign/sec      OpenSSL 1.0.1   scalar(*)       this
# 2.3GHz Haswell        621             765/+23%        1113/+79%
# 2.3GHz Broadwell(**)  688             1200(***)/+74%  1120/+63%
#
# (*)   if system doesn't support AVX2, for reference purposes;
# (**)  scaled to 2.3GHz to simplify comparison;
# (***) scalar AD*X code is faster than AVX2 and is preferred code
#       path for Broadwell;

# ---------------------------------------------------------------------
# Command line handling.
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
# Either one is left undefined when the corresponding argument is absent.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# $win64 is set for the nasm/masm/mingw64 flavours or when the output file
# ends in ".asm"; it selects the extra %xmm6-%xmm15 save code emitted below.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: first next to this script, then in
# ../../perlasm relative to it.  $dir keeps the trailing path separator.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# ---------------------------------------------------------------------
# Assembler capability probing.
#
# Each probe runs an external tool and parses its version banner.  $avx ends
# up as 0, 1 or 2 (sum of two threshold comparisons); the code generation
# guarded by "if ($avx>1)" below is only performed at level 2.  $addx is a
# boolean derived from a third threshold -- presumably ADCX/ADOX support
# (cf. the "AD*X" remark in the header); it is not referenced in this part
# of the file.  Later probes only run while $avx is still false.

# GNU assembler, invoked through the compiler named by $ENV{CC}.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.19) + ($1>=2.22);
    $addx = ($1>=2.23);
}

# NASM (Win64 builds only).
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
        `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.09) + ($1>=2.10);
    $addx = ($1>=2.10);
}

# Microsoft ml64 (Win64 builds only); only the major version is examined.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
        `ml64 2>&1` =~ /Version ([0-9]+)\./) {
    $avx = ($1>=10) + ($1>=11);
    $addx = ($1>=11);
}

# clang / LLVM-based compilers.  major.minor is folded into one number with
# the minor part scaled by 1/100 so that e.g. 3.1 sorts before 3.10.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) {
    my $ver = $2 + $3/100.0;    # 3.1->3.01, 3.10->3.10
    $avx = ($ver>=3.0) + ($ver>=3.01);
    $addx = ($ver>=3.03);
}

# ---------------------------------------------------------------------
# Pipe everything printed to STDOUT through the translator, which receives
# the flavour and the output file name as its arguments.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT = *OUT;

if ($avx>1) {{{
{ # void AMS_WW(
827bded2dbSJung-uk Kimmy $rp="%rdi"; # BN_ULONG *rp, 837bded2dbSJung-uk Kimmy $ap="%rsi"; # const BN_ULONG *ap, 847bded2dbSJung-uk Kimmy $np="%rdx"; # const BN_ULONG *np, 857bded2dbSJung-uk Kimmy $n0="%ecx"; # const BN_ULONG n0, 867bded2dbSJung-uk Kimmy $rep="%r8d"; # int repeat); 877bded2dbSJung-uk Kim 887bded2dbSJung-uk Kim# The registers that hold the accumulated redundant result 897bded2dbSJung-uk Kim# The AMM works on 1024 bit operands, and redundant word size is 29 907bded2dbSJung-uk Kim# Therefore: ceil(1024/29)/4 = 9 917bded2dbSJung-uk Kimmy $ACC0="%ymm0"; 927bded2dbSJung-uk Kimmy $ACC1="%ymm1"; 937bded2dbSJung-uk Kimmy $ACC2="%ymm2"; 947bded2dbSJung-uk Kimmy $ACC3="%ymm3"; 957bded2dbSJung-uk Kimmy $ACC4="%ymm4"; 967bded2dbSJung-uk Kimmy $ACC5="%ymm5"; 977bded2dbSJung-uk Kimmy $ACC6="%ymm6"; 987bded2dbSJung-uk Kimmy $ACC7="%ymm7"; 997bded2dbSJung-uk Kimmy $ACC8="%ymm8"; 1007bded2dbSJung-uk Kimmy $ACC9="%ymm9"; 1017bded2dbSJung-uk Kim# Registers that hold the broadcasted words of bp, currently used 1027bded2dbSJung-uk Kimmy $B1="%ymm10"; 1037bded2dbSJung-uk Kimmy $B2="%ymm11"; 1047bded2dbSJung-uk Kim# Registers that hold the broadcasted words of Y, currently used 1057bded2dbSJung-uk Kimmy $Y1="%ymm12"; 1067bded2dbSJung-uk Kimmy $Y2="%ymm13"; 1077bded2dbSJung-uk Kim# Helper registers 1087bded2dbSJung-uk Kimmy $TEMP1="%ymm14"; 1097bded2dbSJung-uk Kimmy $AND_MASK="%ymm15"; 1107bded2dbSJung-uk Kim# alu registers that hold the first words of the ACC 1117bded2dbSJung-uk Kimmy $r0="%r9"; 1127bded2dbSJung-uk Kimmy $r1="%r10"; 1137bded2dbSJung-uk Kimmy $r2="%r11"; 1147bded2dbSJung-uk Kimmy $r3="%r12"; 1157bded2dbSJung-uk Kim 1167bded2dbSJung-uk Kimmy $i="%r14d"; # loop counter 1177bded2dbSJung-uk Kimmy $tmp = "%r15"; 1187bded2dbSJung-uk Kim 1197bded2dbSJung-uk Kimmy $FrameSize=32*18+32*8; # place for A^2 and 2*A 1207bded2dbSJung-uk Kim 1217bded2dbSJung-uk Kimmy $aap=$r0; 1227bded2dbSJung-uk Kimmy $tp0="%rbx"; 1237bded2dbSJung-uk Kimmy $tp1=$r3; 1247bded2dbSJung-uk 
Kimmy $tpa=$tmp; 1257bded2dbSJung-uk Kim 1267bded2dbSJung-uk Kim$np="%r13"; # reassigned argument 1277bded2dbSJung-uk Kim 1287bded2dbSJung-uk Kim$code.=<<___; 1297bded2dbSJung-uk Kim.text 1307bded2dbSJung-uk Kim 1317bded2dbSJung-uk Kim.globl rsaz_1024_sqr_avx2 1327bded2dbSJung-uk Kim.type rsaz_1024_sqr_avx2,\@function,5 1337bded2dbSJung-uk Kim.align 64 1347bded2dbSJung-uk Kimrsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2 135e71b7053SJung-uk Kim.cfi_startproc 1367bded2dbSJung-uk Kim lea (%rsp), %rax 137e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 1387bded2dbSJung-uk Kim push %rbx 139e71b7053SJung-uk Kim.cfi_push %rbx 1407bded2dbSJung-uk Kim push %rbp 141e71b7053SJung-uk Kim.cfi_push %rbp 1427bded2dbSJung-uk Kim push %r12 143e71b7053SJung-uk Kim.cfi_push %r12 1447bded2dbSJung-uk Kim push %r13 145e71b7053SJung-uk Kim.cfi_push %r13 1467bded2dbSJung-uk Kim push %r14 147e71b7053SJung-uk Kim.cfi_push %r14 1487bded2dbSJung-uk Kim push %r15 149e71b7053SJung-uk Kim.cfi_push %r15 1507bded2dbSJung-uk Kim vzeroupper 1517bded2dbSJung-uk Kim___ 1527bded2dbSJung-uk Kim$code.=<<___ if ($win64); 1537bded2dbSJung-uk Kim lea -0xa8(%rsp),%rsp 1547bded2dbSJung-uk Kim vmovaps %xmm6,-0xd8(%rax) 1557bded2dbSJung-uk Kim vmovaps %xmm7,-0xc8(%rax) 1567bded2dbSJung-uk Kim vmovaps %xmm8,-0xb8(%rax) 1577bded2dbSJung-uk Kim vmovaps %xmm9,-0xa8(%rax) 1587bded2dbSJung-uk Kim vmovaps %xmm10,-0x98(%rax) 1597bded2dbSJung-uk Kim vmovaps %xmm11,-0x88(%rax) 1607bded2dbSJung-uk Kim vmovaps %xmm12,-0x78(%rax) 1617bded2dbSJung-uk Kim vmovaps %xmm13,-0x68(%rax) 1627bded2dbSJung-uk Kim vmovaps %xmm14,-0x58(%rax) 1637bded2dbSJung-uk Kim vmovaps %xmm15,-0x48(%rax) 1647bded2dbSJung-uk Kim.Lsqr_1024_body: 1657bded2dbSJung-uk Kim___ 1667bded2dbSJung-uk Kim$code.=<<___; 1677bded2dbSJung-uk Kim mov %rax,%rbp 168e71b7053SJung-uk Kim.cfi_def_cfa_register %rbp 1697bded2dbSJung-uk Kim mov %rdx, $np # reassigned argument 1707bded2dbSJung-uk Kim sub \$$FrameSize, %rsp 1717bded2dbSJung-uk Kim mov 
$np, $tmp 1727bded2dbSJung-uk Kim sub \$-128, $rp # size optimization 1737bded2dbSJung-uk Kim sub \$-128, $ap 1747bded2dbSJung-uk Kim sub \$-128, $np 1757bded2dbSJung-uk Kim 1767bded2dbSJung-uk Kim and \$4095, $tmp # see if $np crosses page 1777bded2dbSJung-uk Kim add \$32*10, $tmp 1787bded2dbSJung-uk Kim shr \$12, $tmp 1797bded2dbSJung-uk Kim vpxor $ACC9,$ACC9,$ACC9 1807bded2dbSJung-uk Kim jz .Lsqr_1024_no_n_copy 1817bded2dbSJung-uk Kim 1827bded2dbSJung-uk Kim # unaligned 256-bit load that crosses page boundary can 1837bded2dbSJung-uk Kim # cause >2x performance degradation here, so if $np does 1847bded2dbSJung-uk Kim # cross page boundary, copy it to stack and make sure stack 1857bded2dbSJung-uk Kim # frame doesn't... 1867bded2dbSJung-uk Kim sub \$32*10,%rsp 1877bded2dbSJung-uk Kim vmovdqu 32*0-128($np), $ACC0 1887bded2dbSJung-uk Kim and \$-2048, %rsp 1897bded2dbSJung-uk Kim vmovdqu 32*1-128($np), $ACC1 1907bded2dbSJung-uk Kim vmovdqu 32*2-128($np), $ACC2 1917bded2dbSJung-uk Kim vmovdqu 32*3-128($np), $ACC3 1927bded2dbSJung-uk Kim vmovdqu 32*4-128($np), $ACC4 1937bded2dbSJung-uk Kim vmovdqu 32*5-128($np), $ACC5 1947bded2dbSJung-uk Kim vmovdqu 32*6-128($np), $ACC6 1957bded2dbSJung-uk Kim vmovdqu 32*7-128($np), $ACC7 1967bded2dbSJung-uk Kim vmovdqu 32*8-128($np), $ACC8 1977bded2dbSJung-uk Kim lea $FrameSize+128(%rsp),$np 1987bded2dbSJung-uk Kim vmovdqu $ACC0, 32*0-128($np) 1997bded2dbSJung-uk Kim vmovdqu $ACC1, 32*1-128($np) 2007bded2dbSJung-uk Kim vmovdqu $ACC2, 32*2-128($np) 2017bded2dbSJung-uk Kim vmovdqu $ACC3, 32*3-128($np) 2027bded2dbSJung-uk Kim vmovdqu $ACC4, 32*4-128($np) 2037bded2dbSJung-uk Kim vmovdqu $ACC5, 32*5-128($np) 2047bded2dbSJung-uk Kim vmovdqu $ACC6, 32*6-128($np) 2057bded2dbSJung-uk Kim vmovdqu $ACC7, 32*7-128($np) 2067bded2dbSJung-uk Kim vmovdqu $ACC8, 32*8-128($np) 2077bded2dbSJung-uk Kim vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero 2087bded2dbSJung-uk Kim 2097bded2dbSJung-uk Kim.Lsqr_1024_no_n_copy: 2107bded2dbSJung-uk Kim and \$-1024, %rsp 
2117bded2dbSJung-uk Kim 2127bded2dbSJung-uk Kim vmovdqu 32*1-128($ap), $ACC1 2137bded2dbSJung-uk Kim vmovdqu 32*2-128($ap), $ACC2 2147bded2dbSJung-uk Kim vmovdqu 32*3-128($ap), $ACC3 2157bded2dbSJung-uk Kim vmovdqu 32*4-128($ap), $ACC4 2167bded2dbSJung-uk Kim vmovdqu 32*5-128($ap), $ACC5 2177bded2dbSJung-uk Kim vmovdqu 32*6-128($ap), $ACC6 2187bded2dbSJung-uk Kim vmovdqu 32*7-128($ap), $ACC7 2197bded2dbSJung-uk Kim vmovdqu 32*8-128($ap), $ACC8 2207bded2dbSJung-uk Kim 2217bded2dbSJung-uk Kim lea 192(%rsp), $tp0 # 64+128=192 222c4ad4dffSJung-uk Kim vmovdqu .Land_mask(%rip), $AND_MASK 2237bded2dbSJung-uk Kim jmp .LOOP_GRANDE_SQR_1024 2247bded2dbSJung-uk Kim 2257bded2dbSJung-uk Kim.align 32 2267bded2dbSJung-uk Kim.LOOP_GRANDE_SQR_1024: 2277bded2dbSJung-uk Kim lea 32*18+128(%rsp), $aap # size optimization 2287bded2dbSJung-uk Kim lea 448(%rsp), $tp1 # 64+128+256=448 2297bded2dbSJung-uk Kim 2307bded2dbSJung-uk Kim # the squaring is performed as described in Variant B of 2317bded2dbSJung-uk Kim # "Speeding up Big-Number Squaring", so start by calculating 2327bded2dbSJung-uk Kim # the A*2=A+A vector 2337bded2dbSJung-uk Kim vpaddq $ACC1, $ACC1, $ACC1 2347bded2dbSJung-uk Kim vpbroadcastq 32*0-128($ap), $B1 2357bded2dbSJung-uk Kim vpaddq $ACC2, $ACC2, $ACC2 2367bded2dbSJung-uk Kim vmovdqa $ACC1, 32*0-128($aap) 2377bded2dbSJung-uk Kim vpaddq $ACC3, $ACC3, $ACC3 2387bded2dbSJung-uk Kim vmovdqa $ACC2, 32*1-128($aap) 2397bded2dbSJung-uk Kim vpaddq $ACC4, $ACC4, $ACC4 2407bded2dbSJung-uk Kim vmovdqa $ACC3, 32*2-128($aap) 2417bded2dbSJung-uk Kim vpaddq $ACC5, $ACC5, $ACC5 2427bded2dbSJung-uk Kim vmovdqa $ACC4, 32*3-128($aap) 2437bded2dbSJung-uk Kim vpaddq $ACC6, $ACC6, $ACC6 2447bded2dbSJung-uk Kim vmovdqa $ACC5, 32*4-128($aap) 2457bded2dbSJung-uk Kim vpaddq $ACC7, $ACC7, $ACC7 2467bded2dbSJung-uk Kim vmovdqa $ACC6, 32*5-128($aap) 2477bded2dbSJung-uk Kim vpaddq $ACC8, $ACC8, $ACC8 2487bded2dbSJung-uk Kim vmovdqa $ACC7, 32*6-128($aap) 2497bded2dbSJung-uk Kim vpxor $ACC9, $ACC9, $ACC9 
2507bded2dbSJung-uk Kim vmovdqa $ACC8, 32*7-128($aap) 2517bded2dbSJung-uk Kim 2527bded2dbSJung-uk Kim vpmuludq 32*0-128($ap), $B1, $ACC0 2537bded2dbSJung-uk Kim vpbroadcastq 32*1-128($ap), $B2 2547bded2dbSJung-uk Kim vmovdqu $ACC9, 32*9-192($tp0) # zero upper half 2557bded2dbSJung-uk Kim vpmuludq $B1, $ACC1, $ACC1 2567bded2dbSJung-uk Kim vmovdqu $ACC9, 32*10-448($tp1) 2577bded2dbSJung-uk Kim vpmuludq $B1, $ACC2, $ACC2 2587bded2dbSJung-uk Kim vmovdqu $ACC9, 32*11-448($tp1) 2597bded2dbSJung-uk Kim vpmuludq $B1, $ACC3, $ACC3 2607bded2dbSJung-uk Kim vmovdqu $ACC9, 32*12-448($tp1) 2617bded2dbSJung-uk Kim vpmuludq $B1, $ACC4, $ACC4 2627bded2dbSJung-uk Kim vmovdqu $ACC9, 32*13-448($tp1) 2637bded2dbSJung-uk Kim vpmuludq $B1, $ACC5, $ACC5 2647bded2dbSJung-uk Kim vmovdqu $ACC9, 32*14-448($tp1) 2657bded2dbSJung-uk Kim vpmuludq $B1, $ACC6, $ACC6 2667bded2dbSJung-uk Kim vmovdqu $ACC9, 32*15-448($tp1) 2677bded2dbSJung-uk Kim vpmuludq $B1, $ACC7, $ACC7 2687bded2dbSJung-uk Kim vmovdqu $ACC9, 32*16-448($tp1) 2697bded2dbSJung-uk Kim vpmuludq $B1, $ACC8, $ACC8 2707bded2dbSJung-uk Kim vpbroadcastq 32*2-128($ap), $B1 2717bded2dbSJung-uk Kim vmovdqu $ACC9, 32*17-448($tp1) 2727bded2dbSJung-uk Kim 2737bded2dbSJung-uk Kim mov $ap, $tpa 2747bded2dbSJung-uk Kim mov \$4, $i 2757bded2dbSJung-uk Kim jmp .Lsqr_entry_1024 2767bded2dbSJung-uk Kim___ 2777bded2dbSJung-uk Kim$TEMP0=$Y1; 2787bded2dbSJung-uk Kim$TEMP2=$Y2; 2797bded2dbSJung-uk Kim$code.=<<___; 2807bded2dbSJung-uk Kim.align 32 2817bded2dbSJung-uk Kim.LOOP_SQR_1024: 2827bded2dbSJung-uk Kim vpbroadcastq 32*1-128($tpa), $B2 2837bded2dbSJung-uk Kim vpmuludq 32*0-128($ap), $B1, $ACC0 2847bded2dbSJung-uk Kim vpaddq 32*0-192($tp0), $ACC0, $ACC0 2857bded2dbSJung-uk Kim vpmuludq 32*0-128($aap), $B1, $ACC1 2867bded2dbSJung-uk Kim vpaddq 32*1-192($tp0), $ACC1, $ACC1 2877bded2dbSJung-uk Kim vpmuludq 32*1-128($aap), $B1, $ACC2 2887bded2dbSJung-uk Kim vpaddq 32*2-192($tp0), $ACC2, $ACC2 2897bded2dbSJung-uk Kim vpmuludq 32*2-128($aap), $B1, $ACC3 
2907bded2dbSJung-uk Kim vpaddq 32*3-192($tp0), $ACC3, $ACC3 2917bded2dbSJung-uk Kim vpmuludq 32*3-128($aap), $B1, $ACC4 2927bded2dbSJung-uk Kim vpaddq 32*4-192($tp0), $ACC4, $ACC4 2937bded2dbSJung-uk Kim vpmuludq 32*4-128($aap), $B1, $ACC5 2947bded2dbSJung-uk Kim vpaddq 32*5-192($tp0), $ACC5, $ACC5 2957bded2dbSJung-uk Kim vpmuludq 32*5-128($aap), $B1, $ACC6 2967bded2dbSJung-uk Kim vpaddq 32*6-192($tp0), $ACC6, $ACC6 2977bded2dbSJung-uk Kim vpmuludq 32*6-128($aap), $B1, $ACC7 2987bded2dbSJung-uk Kim vpaddq 32*7-192($tp0), $ACC7, $ACC7 2997bded2dbSJung-uk Kim vpmuludq 32*7-128($aap), $B1, $ACC8 3007bded2dbSJung-uk Kim vpbroadcastq 32*2-128($tpa), $B1 3017bded2dbSJung-uk Kim vpaddq 32*8-192($tp0), $ACC8, $ACC8 3027bded2dbSJung-uk Kim.Lsqr_entry_1024: 3037bded2dbSJung-uk Kim vmovdqu $ACC0, 32*0-192($tp0) 3047bded2dbSJung-uk Kim vmovdqu $ACC1, 32*1-192($tp0) 3057bded2dbSJung-uk Kim 3067bded2dbSJung-uk Kim vpmuludq 32*1-128($ap), $B2, $TEMP0 3077bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC2, $ACC2 3087bded2dbSJung-uk Kim vpmuludq 32*1-128($aap), $B2, $TEMP1 3097bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC3, $ACC3 3107bded2dbSJung-uk Kim vpmuludq 32*2-128($aap), $B2, $TEMP2 3117bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC4, $ACC4 3127bded2dbSJung-uk Kim vpmuludq 32*3-128($aap), $B2, $TEMP0 3137bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC5, $ACC5 3147bded2dbSJung-uk Kim vpmuludq 32*4-128($aap), $B2, $TEMP1 3157bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC6, $ACC6 3167bded2dbSJung-uk Kim vpmuludq 32*5-128($aap), $B2, $TEMP2 3177bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC7, $ACC7 3187bded2dbSJung-uk Kim vpmuludq 32*6-128($aap), $B2, $TEMP0 3197bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC8, $ACC8 3207bded2dbSJung-uk Kim vpmuludq 32*7-128($aap), $B2, $ACC0 3217bded2dbSJung-uk Kim vpbroadcastq 32*3-128($tpa), $B2 3227bded2dbSJung-uk Kim vpaddq 32*9-192($tp0), $ACC0, $ACC0 3237bded2dbSJung-uk Kim 3247bded2dbSJung-uk Kim vmovdqu $ACC2, 32*2-192($tp0) 3257bded2dbSJung-uk Kim vmovdqu $ACC3, 32*3-192($tp0) 
3267bded2dbSJung-uk Kim 3277bded2dbSJung-uk Kim vpmuludq 32*2-128($ap), $B1, $TEMP2 3287bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC4, $ACC4 3297bded2dbSJung-uk Kim vpmuludq 32*2-128($aap), $B1, $TEMP0 3307bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC5, $ACC5 3317bded2dbSJung-uk Kim vpmuludq 32*3-128($aap), $B1, $TEMP1 3327bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC6, $ACC6 3337bded2dbSJung-uk Kim vpmuludq 32*4-128($aap), $B1, $TEMP2 3347bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC7, $ACC7 3357bded2dbSJung-uk Kim vpmuludq 32*5-128($aap), $B1, $TEMP0 3367bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC8, $ACC8 3377bded2dbSJung-uk Kim vpmuludq 32*6-128($aap), $B1, $TEMP1 3387bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC0, $ACC0 3397bded2dbSJung-uk Kim vpmuludq 32*7-128($aap), $B1, $ACC1 3407bded2dbSJung-uk Kim vpbroadcastq 32*4-128($tpa), $B1 3417bded2dbSJung-uk Kim vpaddq 32*10-448($tp1), $ACC1, $ACC1 3427bded2dbSJung-uk Kim 3437bded2dbSJung-uk Kim vmovdqu $ACC4, 32*4-192($tp0) 3447bded2dbSJung-uk Kim vmovdqu $ACC5, 32*5-192($tp0) 3457bded2dbSJung-uk Kim 3467bded2dbSJung-uk Kim vpmuludq 32*3-128($ap), $B2, $TEMP0 3477bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC6, $ACC6 3487bded2dbSJung-uk Kim vpmuludq 32*3-128($aap), $B2, $TEMP1 3497bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC7, $ACC7 3507bded2dbSJung-uk Kim vpmuludq 32*4-128($aap), $B2, $TEMP2 3517bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC8, $ACC8 3527bded2dbSJung-uk Kim vpmuludq 32*5-128($aap), $B2, $TEMP0 3537bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC0, $ACC0 3547bded2dbSJung-uk Kim vpmuludq 32*6-128($aap), $B2, $TEMP1 3557bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC1, $ACC1 3567bded2dbSJung-uk Kim vpmuludq 32*7-128($aap), $B2, $ACC2 3577bded2dbSJung-uk Kim vpbroadcastq 32*5-128($tpa), $B2 3587bded2dbSJung-uk Kim vpaddq 32*11-448($tp1), $ACC2, $ACC2 3597bded2dbSJung-uk Kim 3607bded2dbSJung-uk Kim vmovdqu $ACC6, 32*6-192($tp0) 3617bded2dbSJung-uk Kim vmovdqu $ACC7, 32*7-192($tp0) 3627bded2dbSJung-uk Kim 3637bded2dbSJung-uk Kim vpmuludq 32*4-128($ap), $B1, $TEMP0 
3647bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC8, $ACC8 3657bded2dbSJung-uk Kim vpmuludq 32*4-128($aap), $B1, $TEMP1 3667bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC0, $ACC0 3677bded2dbSJung-uk Kim vpmuludq 32*5-128($aap), $B1, $TEMP2 3687bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC1, $ACC1 3697bded2dbSJung-uk Kim vpmuludq 32*6-128($aap), $B1, $TEMP0 3707bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC2, $ACC2 3717bded2dbSJung-uk Kim vpmuludq 32*7-128($aap), $B1, $ACC3 3727bded2dbSJung-uk Kim vpbroadcastq 32*6-128($tpa), $B1 3737bded2dbSJung-uk Kim vpaddq 32*12-448($tp1), $ACC3, $ACC3 3747bded2dbSJung-uk Kim 3757bded2dbSJung-uk Kim vmovdqu $ACC8, 32*8-192($tp0) 3767bded2dbSJung-uk Kim vmovdqu $ACC0, 32*9-192($tp0) 3777bded2dbSJung-uk Kim lea 8($tp0), $tp0 3787bded2dbSJung-uk Kim 3797bded2dbSJung-uk Kim vpmuludq 32*5-128($ap), $B2, $TEMP2 3807bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC1, $ACC1 3817bded2dbSJung-uk Kim vpmuludq 32*5-128($aap), $B2, $TEMP0 3827bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC2, $ACC2 3837bded2dbSJung-uk Kim vpmuludq 32*6-128($aap), $B2, $TEMP1 3847bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC3, $ACC3 3857bded2dbSJung-uk Kim vpmuludq 32*7-128($aap), $B2, $ACC4 3867bded2dbSJung-uk Kim vpbroadcastq 32*7-128($tpa), $B2 3877bded2dbSJung-uk Kim vpaddq 32*13-448($tp1), $ACC4, $ACC4 3887bded2dbSJung-uk Kim 3897bded2dbSJung-uk Kim vmovdqu $ACC1, 32*10-448($tp1) 3907bded2dbSJung-uk Kim vmovdqu $ACC2, 32*11-448($tp1) 3917bded2dbSJung-uk Kim 3927bded2dbSJung-uk Kim vpmuludq 32*6-128($ap), $B1, $TEMP0 3937bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC3, $ACC3 3947bded2dbSJung-uk Kim vpmuludq 32*6-128($aap), $B1, $TEMP1 3957bded2dbSJung-uk Kim vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1 3967bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC4, $ACC4 3977bded2dbSJung-uk Kim vpmuludq 32*7-128($aap), $B1, $ACC5 3987bded2dbSJung-uk Kim vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration 3997bded2dbSJung-uk Kim vpaddq 32*14-448($tp1), $ACC5, $ACC5 4007bded2dbSJung-uk Kim 4017bded2dbSJung-uk Kim 
vmovdqu $ACC3, 32*12-448($tp1) 4027bded2dbSJung-uk Kim vmovdqu $ACC4, 32*13-448($tp1) 4037bded2dbSJung-uk Kim lea 8($tpa), $tpa 4047bded2dbSJung-uk Kim 4057bded2dbSJung-uk Kim vpmuludq 32*7-128($ap), $B2, $TEMP0 4067bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC5, $ACC5 4077bded2dbSJung-uk Kim vpmuludq 32*7-128($aap), $B2, $ACC6 4087bded2dbSJung-uk Kim vpaddq 32*15-448($tp1), $ACC6, $ACC6 4097bded2dbSJung-uk Kim 4107bded2dbSJung-uk Kim vpmuludq 32*8-128($ap), $ACC0, $ACC7 4117bded2dbSJung-uk Kim vmovdqu $ACC5, 32*14-448($tp1) 4127bded2dbSJung-uk Kim vpaddq 32*16-448($tp1), $ACC7, $ACC7 4137bded2dbSJung-uk Kim vmovdqu $ACC6, 32*15-448($tp1) 4147bded2dbSJung-uk Kim vmovdqu $ACC7, 32*16-448($tp1) 4157bded2dbSJung-uk Kim lea 8($tp1), $tp1 4167bded2dbSJung-uk Kim 4177bded2dbSJung-uk Kim dec $i 4187bded2dbSJung-uk Kim jnz .LOOP_SQR_1024 4197bded2dbSJung-uk Kim___ 4207bded2dbSJung-uk Kim$ZERO = $ACC9; 4217bded2dbSJung-uk Kim$TEMP0 = $B1; 4227bded2dbSJung-uk Kim$TEMP2 = $B2; 4237bded2dbSJung-uk Kim$TEMP3 = $Y1; 4247bded2dbSJung-uk Kim$TEMP4 = $Y2; 4257bded2dbSJung-uk Kim$code.=<<___; 4264c6a0400SJung-uk Kim # we need to fix indices 32-39 to avoid overflow 4277bded2dbSJung-uk Kim vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), 4287bded2dbSJung-uk Kim vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) 4297bded2dbSJung-uk Kim vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) 4307bded2dbSJung-uk Kim lea 192(%rsp), $tp0 # 64+128=192 4317bded2dbSJung-uk Kim 4327bded2dbSJung-uk Kim vpsrlq \$29, $ACC8, $TEMP1 4337bded2dbSJung-uk Kim vpand $AND_MASK, $ACC8, $ACC8 4347bded2dbSJung-uk Kim vpsrlq \$29, $ACC1, $TEMP2 4357bded2dbSJung-uk Kim vpand $AND_MASK, $ACC1, $ACC1 4367bded2dbSJung-uk Kim 4377bded2dbSJung-uk Kim vpermq \$0x93, $TEMP1, $TEMP1 4387bded2dbSJung-uk Kim vpxor $ZERO, $ZERO, $ZERO 4397bded2dbSJung-uk Kim vpermq \$0x93, $TEMP2, $TEMP2 4407bded2dbSJung-uk Kim 4417bded2dbSJung-uk Kim vpblendd \$3, $ZERO, $TEMP1, $TEMP0 4427bded2dbSJung-uk Kim vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 
4437bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC8, $ACC8 4447bded2dbSJung-uk Kim vpblendd \$3, $TEMP2, $ZERO, $TEMP2 4457bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC1, $ACC1 4467bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC2, $ACC2 4477bded2dbSJung-uk Kim vmovdqu $ACC1, 32*9-192($tp0) 4487bded2dbSJung-uk Kim vmovdqu $ACC2, 32*10-192($tp0) 4497bded2dbSJung-uk Kim 4507bded2dbSJung-uk Kim mov (%rsp), %rax 4517bded2dbSJung-uk Kim mov 8(%rsp), $r1 4527bded2dbSJung-uk Kim mov 16(%rsp), $r2 4537bded2dbSJung-uk Kim mov 24(%rsp), $r3 4547bded2dbSJung-uk Kim vmovdqu 32*1(%rsp), $ACC1 4557bded2dbSJung-uk Kim vmovdqu 32*2-192($tp0), $ACC2 4567bded2dbSJung-uk Kim vmovdqu 32*3-192($tp0), $ACC3 4577bded2dbSJung-uk Kim vmovdqu 32*4-192($tp0), $ACC4 4587bded2dbSJung-uk Kim vmovdqu 32*5-192($tp0), $ACC5 4597bded2dbSJung-uk Kim vmovdqu 32*6-192($tp0), $ACC6 4607bded2dbSJung-uk Kim vmovdqu 32*7-192($tp0), $ACC7 4617bded2dbSJung-uk Kim 4627bded2dbSJung-uk Kim mov %rax, $r0 4637bded2dbSJung-uk Kim imull $n0, %eax 4647bded2dbSJung-uk Kim and \$0x1fffffff, %eax 4657bded2dbSJung-uk Kim vmovd %eax, $Y1 4667bded2dbSJung-uk Kim 4677bded2dbSJung-uk Kim mov %rax, %rdx 4687bded2dbSJung-uk Kim imulq -128($np), %rax 4697bded2dbSJung-uk Kim vpbroadcastq $Y1, $Y1 4707bded2dbSJung-uk Kim add %rax, $r0 4717bded2dbSJung-uk Kim mov %rdx, %rax 4727bded2dbSJung-uk Kim imulq 8-128($np), %rax 4737bded2dbSJung-uk Kim shr \$29, $r0 4747bded2dbSJung-uk Kim add %rax, $r1 4757bded2dbSJung-uk Kim mov %rdx, %rax 4767bded2dbSJung-uk Kim imulq 16-128($np), %rax 4777bded2dbSJung-uk Kim add $r0, $r1 4787bded2dbSJung-uk Kim add %rax, $r2 4797bded2dbSJung-uk Kim imulq 24-128($np), %rdx 4807bded2dbSJung-uk Kim add %rdx, $r3 4817bded2dbSJung-uk Kim 4827bded2dbSJung-uk Kim mov $r1, %rax 4837bded2dbSJung-uk Kim imull $n0, %eax 4847bded2dbSJung-uk Kim and \$0x1fffffff, %eax 4857bded2dbSJung-uk Kim 4867bded2dbSJung-uk Kim mov \$9, $i 4877bded2dbSJung-uk Kim jmp .LOOP_REDUCE_1024 4887bded2dbSJung-uk Kim 4897bded2dbSJung-uk Kim.align 32 
4907bded2dbSJung-uk Kim.LOOP_REDUCE_1024: 4917bded2dbSJung-uk Kim vmovd %eax, $Y2 4927bded2dbSJung-uk Kim vpbroadcastq $Y2, $Y2 4937bded2dbSJung-uk Kim 4947bded2dbSJung-uk Kim vpmuludq 32*1-128($np), $Y1, $TEMP0 4957bded2dbSJung-uk Kim mov %rax, %rdx 4967bded2dbSJung-uk Kim imulq -128($np), %rax 4977bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC1, $ACC1 4987bded2dbSJung-uk Kim add %rax, $r1 4997bded2dbSJung-uk Kim vpmuludq 32*2-128($np), $Y1, $TEMP1 5007bded2dbSJung-uk Kim mov %rdx, %rax 5017bded2dbSJung-uk Kim imulq 8-128($np), %rax 5027bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC2, $ACC2 5037bded2dbSJung-uk Kim vpmuludq 32*3-128($np), $Y1, $TEMP2 5047bded2dbSJung-uk Kim .byte 0x67 5057bded2dbSJung-uk Kim add %rax, $r2 5067bded2dbSJung-uk Kim .byte 0x67 5077bded2dbSJung-uk Kim mov %rdx, %rax 5087bded2dbSJung-uk Kim imulq 16-128($np), %rax 5097bded2dbSJung-uk Kim shr \$29, $r1 5107bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC3, $ACC3 5117bded2dbSJung-uk Kim vpmuludq 32*4-128($np), $Y1, $TEMP0 5127bded2dbSJung-uk Kim add %rax, $r3 5137bded2dbSJung-uk Kim add $r1, $r2 5147bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC4, $ACC4 5157bded2dbSJung-uk Kim vpmuludq 32*5-128($np), $Y1, $TEMP1 5167bded2dbSJung-uk Kim mov $r2, %rax 5177bded2dbSJung-uk Kim imull $n0, %eax 5187bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC5, $ACC5 5197bded2dbSJung-uk Kim vpmuludq 32*6-128($np), $Y1, $TEMP2 5207bded2dbSJung-uk Kim and \$0x1fffffff, %eax 5217bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC6, $ACC6 5227bded2dbSJung-uk Kim vpmuludq 32*7-128($np), $Y1, $TEMP0 5237bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC7, $ACC7 5247bded2dbSJung-uk Kim vpmuludq 32*8-128($np), $Y1, $TEMP1 5257bded2dbSJung-uk Kim vmovd %eax, $Y1 5267bded2dbSJung-uk Kim #vmovdqu 32*1-8-128($np), $TEMP2 # moved below 5277bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC8, $ACC8 5287bded2dbSJung-uk Kim #vmovdqu 32*2-8-128($np), $TEMP0 # moved below 5297bded2dbSJung-uk Kim vpbroadcastq $Y1, $Y1 5307bded2dbSJung-uk Kim 5317bded2dbSJung-uk Kim vpmuludq 32*1-8-128($np), $Y2, 
$TEMP2 # see above 5327bded2dbSJung-uk Kim vmovdqu 32*3-8-128($np), $TEMP1 5337bded2dbSJung-uk Kim mov %rax, %rdx 5347bded2dbSJung-uk Kim imulq -128($np), %rax 5357bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC1, $ACC1 5367bded2dbSJung-uk Kim vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above 5377bded2dbSJung-uk Kim vmovdqu 32*4-8-128($np), $TEMP2 5387bded2dbSJung-uk Kim add %rax, $r2 5397bded2dbSJung-uk Kim mov %rdx, %rax 5407bded2dbSJung-uk Kim imulq 8-128($np), %rax 5417bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC2, $ACC2 5427bded2dbSJung-uk Kim add $r3, %rax 5437bded2dbSJung-uk Kim shr \$29, $r2 5447bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP1, $TEMP1 5457bded2dbSJung-uk Kim vmovdqu 32*5-8-128($np), $TEMP0 5467bded2dbSJung-uk Kim add $r2, %rax 5477bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC3, $ACC3 5487bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP2, $TEMP2 5497bded2dbSJung-uk Kim vmovdqu 32*6-8-128($np), $TEMP1 5507bded2dbSJung-uk Kim .byte 0x67 5517bded2dbSJung-uk Kim mov %rax, $r3 5527bded2dbSJung-uk Kim imull $n0, %eax 5537bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC4, $ACC4 5547bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP0, $TEMP0 5557bded2dbSJung-uk Kim .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2 5567bded2dbSJung-uk Kim and \$0x1fffffff, %eax 5577bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC5, $ACC5 5587bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP1, $TEMP1 5597bded2dbSJung-uk Kim vmovdqu 32*8-8-128($np), $TEMP0 5607bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC6, $ACC6 5617bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP2, $TEMP2 5627bded2dbSJung-uk Kim vmovdqu 32*9-8-128($np), $ACC9 5637bded2dbSJung-uk Kim vmovd %eax, $ACC0 # borrow ACC0 for Y2 5647bded2dbSJung-uk Kim imulq -128($np), %rax 5657bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC7, $ACC7 5667bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP0, $TEMP0 5677bded2dbSJung-uk Kim vmovdqu 32*1-16-128($np), $TEMP1 5687bded2dbSJung-uk Kim vpbroadcastq $ACC0, $ACC0 5697bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC8, $ACC8 5707bded2dbSJung-uk Kim 
vpmuludq $Y2, $ACC9, $ACC9 5717bded2dbSJung-uk Kim vmovdqu 32*2-16-128($np), $TEMP2 5727bded2dbSJung-uk Kim add %rax, $r3 5737bded2dbSJung-uk Kim 5747bded2dbSJung-uk Kim___ 5757bded2dbSJung-uk Kim($ACC0,$Y2)=($Y2,$ACC0); 5767bded2dbSJung-uk Kim$code.=<<___; 5777bded2dbSJung-uk Kim vmovdqu 32*1-24-128($np), $ACC0 5787bded2dbSJung-uk Kim vpmuludq $Y1, $TEMP1, $TEMP1 5797bded2dbSJung-uk Kim vmovdqu 32*3-16-128($np), $TEMP0 5807bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC1, $ACC1 5817bded2dbSJung-uk Kim vpmuludq $Y2, $ACC0, $ACC0 5827bded2dbSJung-uk Kim vpmuludq $Y1, $TEMP2, $TEMP2 5837bded2dbSJung-uk Kim .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1 5847bded2dbSJung-uk Kim vpaddq $ACC1, $ACC0, $ACC0 5857bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC2, $ACC2 5867bded2dbSJung-uk Kim vpmuludq $Y1, $TEMP0, $TEMP0 5877bded2dbSJung-uk Kim vmovdqu 32*5-16-128($np), $TEMP2 5887bded2dbSJung-uk Kim .byte 0x67 5897bded2dbSJung-uk Kim vmovq $ACC0, %rax 5907bded2dbSJung-uk Kim vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 5917bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC3, $ACC3 5927bded2dbSJung-uk Kim vpmuludq $Y1, $TEMP1, $TEMP1 5937bded2dbSJung-uk Kim vmovdqu 32*6-16-128($np), $TEMP0 5947bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC4, $ACC4 5957bded2dbSJung-uk Kim vpmuludq $Y1, $TEMP2, $TEMP2 5967bded2dbSJung-uk Kim vmovdqu 32*7-16-128($np), $TEMP1 5977bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC5, $ACC5 5987bded2dbSJung-uk Kim vpmuludq $Y1, $TEMP0, $TEMP0 5997bded2dbSJung-uk Kim vmovdqu 32*8-16-128($np), $TEMP2 6007bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC6, $ACC6 6017bded2dbSJung-uk Kim vpmuludq $Y1, $TEMP1, $TEMP1 6027bded2dbSJung-uk Kim shr \$29, $r3 6037bded2dbSJung-uk Kim vmovdqu 32*9-16-128($np), $TEMP0 6047bded2dbSJung-uk Kim add $r3, %rax 6057bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC7, $ACC7 6067bded2dbSJung-uk Kim vpmuludq $Y1, $TEMP2, $TEMP2 6077bded2dbSJung-uk Kim #vmovdqu 32*2-24-128($np), $TEMP1 # moved below 6087bded2dbSJung-uk Kim mov %rax, $r0 
6097bded2dbSJung-uk Kim imull $n0, %eax 6107bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC8, $ACC8 6117bded2dbSJung-uk Kim vpmuludq $Y1, $TEMP0, $TEMP0 6127bded2dbSJung-uk Kim and \$0x1fffffff, %eax 6137bded2dbSJung-uk Kim vmovd %eax, $Y1 6147bded2dbSJung-uk Kim vmovdqu 32*3-24-128($np), $TEMP2 6157bded2dbSJung-uk Kim .byte 0x67 6167bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC9, $ACC9 6177bded2dbSJung-uk Kim vpbroadcastq $Y1, $Y1 6187bded2dbSJung-uk Kim 6197bded2dbSJung-uk Kim vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above 6207bded2dbSJung-uk Kim vmovdqu 32*4-24-128($np), $TEMP0 6217bded2dbSJung-uk Kim mov %rax, %rdx 6227bded2dbSJung-uk Kim imulq -128($np), %rax 6237bded2dbSJung-uk Kim mov 8(%rsp), $r1 6247bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC2, $ACC1 6257bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP2, $TEMP2 6267bded2dbSJung-uk Kim vmovdqu 32*5-24-128($np), $TEMP1 6277bded2dbSJung-uk Kim add %rax, $r0 6287bded2dbSJung-uk Kim mov %rdx, %rax 6297bded2dbSJung-uk Kim imulq 8-128($np), %rax 6307bded2dbSJung-uk Kim .byte 0x67 6317bded2dbSJung-uk Kim shr \$29, $r0 6327bded2dbSJung-uk Kim mov 16(%rsp), $r2 6337bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC3, $ACC2 6347bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP0, $TEMP0 6357bded2dbSJung-uk Kim vmovdqu 32*6-24-128($np), $TEMP2 6367bded2dbSJung-uk Kim add %rax, $r1 6377bded2dbSJung-uk Kim mov %rdx, %rax 6387bded2dbSJung-uk Kim imulq 16-128($np), %rax 6397bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC4, $ACC3 6407bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP1, $TEMP1 6417bded2dbSJung-uk Kim vmovdqu 32*7-24-128($np), $TEMP0 6427bded2dbSJung-uk Kim imulq 24-128($np), %rdx # future $r3 6437bded2dbSJung-uk Kim add %rax, $r2 6447bded2dbSJung-uk Kim lea ($r0,$r1), %rax 6457bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC5, $ACC4 6467bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP2, $TEMP2 6477bded2dbSJung-uk Kim vmovdqu 32*8-24-128($np), $TEMP1 6487bded2dbSJung-uk Kim mov %rax, $r1 6497bded2dbSJung-uk Kim imull $n0, %eax 6507bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP0, $TEMP0 
6517bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC6, $ACC5 6527bded2dbSJung-uk Kim vmovdqu 32*9-24-128($np), $TEMP2 6537bded2dbSJung-uk Kim and \$0x1fffffff, %eax 6547bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC7, $ACC6 6557bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP1, $TEMP1 6567bded2dbSJung-uk Kim add 24(%rsp), %rdx 6577bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC8, $ACC7 6587bded2dbSJung-uk Kim vpmuludq $Y2, $TEMP2, $TEMP2 6597bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC9, $ACC8 6607bded2dbSJung-uk Kim vmovq $r3, $ACC9 6617bded2dbSJung-uk Kim mov %rdx, $r3 6627bded2dbSJung-uk Kim 6637bded2dbSJung-uk Kim dec $i 6647bded2dbSJung-uk Kim jnz .LOOP_REDUCE_1024 6657bded2dbSJung-uk Kim___ 6667bded2dbSJung-uk Kim($ACC0,$Y2)=($Y2,$ACC0); 6677bded2dbSJung-uk Kim$code.=<<___; 6687bded2dbSJung-uk Kim lea 448(%rsp), $tp1 # size optimization 6697bded2dbSJung-uk Kim vpaddq $ACC9, $Y2, $ACC0 6707bded2dbSJung-uk Kim vpxor $ZERO, $ZERO, $ZERO 6717bded2dbSJung-uk Kim 6727bded2dbSJung-uk Kim vpaddq 32*9-192($tp0), $ACC0, $ACC0 6737bded2dbSJung-uk Kim vpaddq 32*10-448($tp1), $ACC1, $ACC1 6747bded2dbSJung-uk Kim vpaddq 32*11-448($tp1), $ACC2, $ACC2 6757bded2dbSJung-uk Kim vpaddq 32*12-448($tp1), $ACC3, $ACC3 6767bded2dbSJung-uk Kim vpaddq 32*13-448($tp1), $ACC4, $ACC4 6777bded2dbSJung-uk Kim vpaddq 32*14-448($tp1), $ACC5, $ACC5 6787bded2dbSJung-uk Kim vpaddq 32*15-448($tp1), $ACC6, $ACC6 6797bded2dbSJung-uk Kim vpaddq 32*16-448($tp1), $ACC7, $ACC7 6807bded2dbSJung-uk Kim vpaddq 32*17-448($tp1), $ACC8, $ACC8 6817bded2dbSJung-uk Kim 6827bded2dbSJung-uk Kim vpsrlq \$29, $ACC0, $TEMP1 6837bded2dbSJung-uk Kim vpand $AND_MASK, $ACC0, $ACC0 6847bded2dbSJung-uk Kim vpsrlq \$29, $ACC1, $TEMP2 6857bded2dbSJung-uk Kim vpand $AND_MASK, $ACC1, $ACC1 6867bded2dbSJung-uk Kim vpsrlq \$29, $ACC2, $TEMP3 6877bded2dbSJung-uk Kim vpermq \$0x93, $TEMP1, $TEMP1 6887bded2dbSJung-uk Kim vpand $AND_MASK, $ACC2, $ACC2 6897bded2dbSJung-uk Kim vpsrlq \$29, $ACC3, $TEMP4 6907bded2dbSJung-uk Kim vpermq \$0x93, $TEMP2, $TEMP2 
6917bded2dbSJung-uk Kim vpand $AND_MASK, $ACC3, $ACC3 6927bded2dbSJung-uk Kim vpermq \$0x93, $TEMP3, $TEMP3 6937bded2dbSJung-uk Kim 6947bded2dbSJung-uk Kim vpblendd \$3, $ZERO, $TEMP1, $TEMP0 6957bded2dbSJung-uk Kim vpermq \$0x93, $TEMP4, $TEMP4 6967bded2dbSJung-uk Kim vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 6977bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC0, $ACC0 6987bded2dbSJung-uk Kim vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 6997bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC1, $ACC1 7007bded2dbSJung-uk Kim vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 7017bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC2, $ACC2 7027bded2dbSJung-uk Kim vpblendd \$3, $TEMP4, $ZERO, $TEMP4 7037bded2dbSJung-uk Kim vpaddq $TEMP3, $ACC3, $ACC3 7047bded2dbSJung-uk Kim vpaddq $TEMP4, $ACC4, $ACC4 7057bded2dbSJung-uk Kim 7067bded2dbSJung-uk Kim vpsrlq \$29, $ACC0, $TEMP1 7077bded2dbSJung-uk Kim vpand $AND_MASK, $ACC0, $ACC0 7087bded2dbSJung-uk Kim vpsrlq \$29, $ACC1, $TEMP2 7097bded2dbSJung-uk Kim vpand $AND_MASK, $ACC1, $ACC1 7107bded2dbSJung-uk Kim vpsrlq \$29, $ACC2, $TEMP3 7117bded2dbSJung-uk Kim vpermq \$0x93, $TEMP1, $TEMP1 7127bded2dbSJung-uk Kim vpand $AND_MASK, $ACC2, $ACC2 7137bded2dbSJung-uk Kim vpsrlq \$29, $ACC3, $TEMP4 7147bded2dbSJung-uk Kim vpermq \$0x93, $TEMP2, $TEMP2 7157bded2dbSJung-uk Kim vpand $AND_MASK, $ACC3, $ACC3 7167bded2dbSJung-uk Kim vpermq \$0x93, $TEMP3, $TEMP3 7177bded2dbSJung-uk Kim 7187bded2dbSJung-uk Kim vpblendd \$3, $ZERO, $TEMP1, $TEMP0 7197bded2dbSJung-uk Kim vpermq \$0x93, $TEMP4, $TEMP4 7207bded2dbSJung-uk Kim vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 7217bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC0, $ACC0 7227bded2dbSJung-uk Kim vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 7237bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC1, $ACC1 7247bded2dbSJung-uk Kim vmovdqu $ACC0, 32*0-128($rp) 7257bded2dbSJung-uk Kim vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 7267bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC2, $ACC2 7277bded2dbSJung-uk Kim vmovdqu $ACC1, 32*1-128($rp) 7287bded2dbSJung-uk Kim vpblendd \$3, $TEMP4, $ZERO, $TEMP4 
7297bded2dbSJung-uk Kim vpaddq $TEMP3, $ACC3, $ACC3 7307bded2dbSJung-uk Kim vmovdqu $ACC2, 32*2-128($rp) 7317bded2dbSJung-uk Kim vpaddq $TEMP4, $ACC4, $ACC4 7327bded2dbSJung-uk Kim vmovdqu $ACC3, 32*3-128($rp) 7337bded2dbSJung-uk Kim___ 7347bded2dbSJung-uk Kim$TEMP5=$ACC0; 7357bded2dbSJung-uk Kim$code.=<<___; 7367bded2dbSJung-uk Kim vpsrlq \$29, $ACC4, $TEMP1 7377bded2dbSJung-uk Kim vpand $AND_MASK, $ACC4, $ACC4 7387bded2dbSJung-uk Kim vpsrlq \$29, $ACC5, $TEMP2 7397bded2dbSJung-uk Kim vpand $AND_MASK, $ACC5, $ACC5 7407bded2dbSJung-uk Kim vpsrlq \$29, $ACC6, $TEMP3 7417bded2dbSJung-uk Kim vpermq \$0x93, $TEMP1, $TEMP1 7427bded2dbSJung-uk Kim vpand $AND_MASK, $ACC6, $ACC6 7437bded2dbSJung-uk Kim vpsrlq \$29, $ACC7, $TEMP4 7447bded2dbSJung-uk Kim vpermq \$0x93, $TEMP2, $TEMP2 7457bded2dbSJung-uk Kim vpand $AND_MASK, $ACC7, $ACC7 7467bded2dbSJung-uk Kim vpsrlq \$29, $ACC8, $TEMP5 7477bded2dbSJung-uk Kim vpermq \$0x93, $TEMP3, $TEMP3 7487bded2dbSJung-uk Kim vpand $AND_MASK, $ACC8, $ACC8 7497bded2dbSJung-uk Kim vpermq \$0x93, $TEMP4, $TEMP4 7507bded2dbSJung-uk Kim 7517bded2dbSJung-uk Kim vpblendd \$3, $ZERO, $TEMP1, $TEMP0 7527bded2dbSJung-uk Kim vpermq \$0x93, $TEMP5, $TEMP5 7537bded2dbSJung-uk Kim vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 7547bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC4, $ACC4 7557bded2dbSJung-uk Kim vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 7567bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC5, $ACC5 7577bded2dbSJung-uk Kim vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 7587bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC6, $ACC6 7597bded2dbSJung-uk Kim vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 7607bded2dbSJung-uk Kim vpaddq $TEMP3, $ACC7, $ACC7 7617bded2dbSJung-uk Kim vpaddq $TEMP4, $ACC8, $ACC8 7627bded2dbSJung-uk Kim 7637bded2dbSJung-uk Kim vpsrlq \$29, $ACC4, $TEMP1 7647bded2dbSJung-uk Kim vpand $AND_MASK, $ACC4, $ACC4 7657bded2dbSJung-uk Kim vpsrlq \$29, $ACC5, $TEMP2 7667bded2dbSJung-uk Kim vpand $AND_MASK, $ACC5, $ACC5 7677bded2dbSJung-uk Kim vpsrlq \$29, $ACC6, $TEMP3 7687bded2dbSJung-uk 
Kim vpermq \$0x93, $TEMP1, $TEMP1 7697bded2dbSJung-uk Kim vpand $AND_MASK, $ACC6, $ACC6 7707bded2dbSJung-uk Kim vpsrlq \$29, $ACC7, $TEMP4 7717bded2dbSJung-uk Kim vpermq \$0x93, $TEMP2, $TEMP2 7727bded2dbSJung-uk Kim vpand $AND_MASK, $ACC7, $ACC7 7737bded2dbSJung-uk Kim vpsrlq \$29, $ACC8, $TEMP5 7747bded2dbSJung-uk Kim vpermq \$0x93, $TEMP3, $TEMP3 7757bded2dbSJung-uk Kim vpand $AND_MASK, $ACC8, $ACC8 7767bded2dbSJung-uk Kim vpermq \$0x93, $TEMP4, $TEMP4 7777bded2dbSJung-uk Kim 7787bded2dbSJung-uk Kim vpblendd \$3, $ZERO, $TEMP1, $TEMP0 7797bded2dbSJung-uk Kim vpermq \$0x93, $TEMP5, $TEMP5 7807bded2dbSJung-uk Kim vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 7817bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC4, $ACC4 7827bded2dbSJung-uk Kim vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 7837bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC5, $ACC5 7847bded2dbSJung-uk Kim vmovdqu $ACC4, 32*4-128($rp) 7857bded2dbSJung-uk Kim vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 7867bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC6, $ACC6 7877bded2dbSJung-uk Kim vmovdqu $ACC5, 32*5-128($rp) 7887bded2dbSJung-uk Kim vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 7897bded2dbSJung-uk Kim vpaddq $TEMP3, $ACC7, $ACC7 7907bded2dbSJung-uk Kim vmovdqu $ACC6, 32*6-128($rp) 7917bded2dbSJung-uk Kim vpaddq $TEMP4, $ACC8, $ACC8 7927bded2dbSJung-uk Kim vmovdqu $ACC7, 32*7-128($rp) 7937bded2dbSJung-uk Kim vmovdqu $ACC8, 32*8-128($rp) 7947bded2dbSJung-uk Kim 7957bded2dbSJung-uk Kim mov $rp, $ap 7967bded2dbSJung-uk Kim dec $rep 7977bded2dbSJung-uk Kim jne .LOOP_GRANDE_SQR_1024 7987bded2dbSJung-uk Kim 7997bded2dbSJung-uk Kim vzeroall 8007bded2dbSJung-uk Kim mov %rbp, %rax 801e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 8027bded2dbSJung-uk Kim___ 8037bded2dbSJung-uk Kim$code.=<<___ if ($win64); 804e71b7053SJung-uk Kim.Lsqr_1024_in_tail: 8057bded2dbSJung-uk Kim movaps -0xd8(%rax),%xmm6 8067bded2dbSJung-uk Kim movaps -0xc8(%rax),%xmm7 8077bded2dbSJung-uk Kim movaps -0xb8(%rax),%xmm8 8087bded2dbSJung-uk Kim movaps -0xa8(%rax),%xmm9 8097bded2dbSJung-uk Kim 
movaps -0x98(%rax),%xmm10 8107bded2dbSJung-uk Kim movaps -0x88(%rax),%xmm11 8117bded2dbSJung-uk Kim movaps -0x78(%rax),%xmm12 8127bded2dbSJung-uk Kim movaps -0x68(%rax),%xmm13 8137bded2dbSJung-uk Kim movaps -0x58(%rax),%xmm14 8147bded2dbSJung-uk Kim movaps -0x48(%rax),%xmm15 8157bded2dbSJung-uk Kim___ 8167bded2dbSJung-uk Kim$code.=<<___; 8177bded2dbSJung-uk Kim mov -48(%rax),%r15 818e71b7053SJung-uk Kim.cfi_restore %r15 8197bded2dbSJung-uk Kim mov -40(%rax),%r14 820e71b7053SJung-uk Kim.cfi_restore %r14 8217bded2dbSJung-uk Kim mov -32(%rax),%r13 822e71b7053SJung-uk Kim.cfi_restore %r13 8237bded2dbSJung-uk Kim mov -24(%rax),%r12 824e71b7053SJung-uk Kim.cfi_restore %r12 8257bded2dbSJung-uk Kim mov -16(%rax),%rbp 826e71b7053SJung-uk Kim.cfi_restore %rbp 8277bded2dbSJung-uk Kim mov -8(%rax),%rbx 828e71b7053SJung-uk Kim.cfi_restore %rbx 8297bded2dbSJung-uk Kim lea (%rax),%rsp # restore %rsp 830e71b7053SJung-uk Kim.cfi_def_cfa_register %rsp 8317bded2dbSJung-uk Kim.Lsqr_1024_epilogue: 8327bded2dbSJung-uk Kim ret 833e71b7053SJung-uk Kim.cfi_endproc 8347bded2dbSJung-uk Kim.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 8357bded2dbSJung-uk Kim___ 8367bded2dbSJung-uk Kim} 8377bded2dbSJung-uk Kim 8387bded2dbSJung-uk Kim{ # void AMM_WW( 8397bded2dbSJung-uk Kimmy $rp="%rdi"; # BN_ULONG *rp, 8407bded2dbSJung-uk Kimmy $ap="%rsi"; # const BN_ULONG *ap, 8417bded2dbSJung-uk Kimmy $bp="%rdx"; # const BN_ULONG *bp, 8427bded2dbSJung-uk Kimmy $np="%rcx"; # const BN_ULONG *np, 8437bded2dbSJung-uk Kimmy $n0="%r8d"; # unsigned int n0); 8447bded2dbSJung-uk Kim 8457bded2dbSJung-uk Kim# The registers that hold the accumulated redundant result 8467bded2dbSJung-uk Kim# The AMM works on 1024 bit operands, and redundant word size is 29 8477bded2dbSJung-uk Kim# Therefore: ceil(1024/29)/4 = 9 8487bded2dbSJung-uk Kimmy $ACC0="%ymm0"; 8497bded2dbSJung-uk Kimmy $ACC1="%ymm1"; 8507bded2dbSJung-uk Kimmy $ACC2="%ymm2"; 8517bded2dbSJung-uk Kimmy $ACC3="%ymm3"; 8527bded2dbSJung-uk Kimmy $ACC4="%ymm4"; 
8537bded2dbSJung-uk Kimmy $ACC5="%ymm5"; 8547bded2dbSJung-uk Kimmy $ACC6="%ymm6"; 8557bded2dbSJung-uk Kimmy $ACC7="%ymm7"; 8567bded2dbSJung-uk Kimmy $ACC8="%ymm8"; 8577bded2dbSJung-uk Kimmy $ACC9="%ymm9"; 8587bded2dbSJung-uk Kim 8597bded2dbSJung-uk Kim# Registers that hold the broadcasted words of multiplier, currently used 8607bded2dbSJung-uk Kimmy $Bi="%ymm10"; 8617bded2dbSJung-uk Kimmy $Yi="%ymm11"; 8627bded2dbSJung-uk Kim 8637bded2dbSJung-uk Kim# Helper registers 8647bded2dbSJung-uk Kimmy $TEMP0=$ACC0; 8657bded2dbSJung-uk Kimmy $TEMP1="%ymm12"; 8667bded2dbSJung-uk Kimmy $TEMP2="%ymm13"; 8677bded2dbSJung-uk Kimmy $ZERO="%ymm14"; 8687bded2dbSJung-uk Kimmy $AND_MASK="%ymm15"; 8697bded2dbSJung-uk Kim 8707bded2dbSJung-uk Kim# alu registers that hold the first words of the ACC 8717bded2dbSJung-uk Kimmy $r0="%r9"; 8727bded2dbSJung-uk Kimmy $r1="%r10"; 8737bded2dbSJung-uk Kimmy $r2="%r11"; 8747bded2dbSJung-uk Kimmy $r3="%r12"; 8757bded2dbSJung-uk Kim 8767bded2dbSJung-uk Kimmy $i="%r14d"; 8777bded2dbSJung-uk Kimmy $tmp="%r15"; 8787bded2dbSJung-uk Kim 8797bded2dbSJung-uk Kim$bp="%r13"; # reassigned argument 8807bded2dbSJung-uk Kim 8817bded2dbSJung-uk Kim$code.=<<___; 8827bded2dbSJung-uk Kim.globl rsaz_1024_mul_avx2 8837bded2dbSJung-uk Kim.type rsaz_1024_mul_avx2,\@function,5 8847bded2dbSJung-uk Kim.align 64 8857bded2dbSJung-uk Kimrsaz_1024_mul_avx2: 886e71b7053SJung-uk Kim.cfi_startproc 8877bded2dbSJung-uk Kim lea (%rsp), %rax 888e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 8897bded2dbSJung-uk Kim push %rbx 890e71b7053SJung-uk Kim.cfi_push %rbx 8917bded2dbSJung-uk Kim push %rbp 892e71b7053SJung-uk Kim.cfi_push %rbp 8937bded2dbSJung-uk Kim push %r12 894e71b7053SJung-uk Kim.cfi_push %r12 8957bded2dbSJung-uk Kim push %r13 896e71b7053SJung-uk Kim.cfi_push %r13 8977bded2dbSJung-uk Kim push %r14 898e71b7053SJung-uk Kim.cfi_push %r14 8997bded2dbSJung-uk Kim push %r15 900e71b7053SJung-uk Kim.cfi_push %r15 9017bded2dbSJung-uk Kim___ 9027bded2dbSJung-uk Kim$code.=<<___ if 
($win64); 9037bded2dbSJung-uk Kim vzeroupper 9047bded2dbSJung-uk Kim lea -0xa8(%rsp),%rsp 9057bded2dbSJung-uk Kim vmovaps %xmm6,-0xd8(%rax) 9067bded2dbSJung-uk Kim vmovaps %xmm7,-0xc8(%rax) 9077bded2dbSJung-uk Kim vmovaps %xmm8,-0xb8(%rax) 9087bded2dbSJung-uk Kim vmovaps %xmm9,-0xa8(%rax) 9097bded2dbSJung-uk Kim vmovaps %xmm10,-0x98(%rax) 9107bded2dbSJung-uk Kim vmovaps %xmm11,-0x88(%rax) 9117bded2dbSJung-uk Kim vmovaps %xmm12,-0x78(%rax) 9127bded2dbSJung-uk Kim vmovaps %xmm13,-0x68(%rax) 9137bded2dbSJung-uk Kim vmovaps %xmm14,-0x58(%rax) 9147bded2dbSJung-uk Kim vmovaps %xmm15,-0x48(%rax) 9157bded2dbSJung-uk Kim.Lmul_1024_body: 9167bded2dbSJung-uk Kim___ 9177bded2dbSJung-uk Kim$code.=<<___; 9187bded2dbSJung-uk Kim mov %rax,%rbp 919e71b7053SJung-uk Kim.cfi_def_cfa_register %rbp 9207bded2dbSJung-uk Kim vzeroall 9217bded2dbSJung-uk Kim mov %rdx, $bp # reassigned argument 9227bded2dbSJung-uk Kim sub \$64,%rsp 9237bded2dbSJung-uk Kim 9247bded2dbSJung-uk Kim # unaligned 256-bit load that crosses page boundary can 9257bded2dbSJung-uk Kim # cause severe performance degradation here, so if $ap does 9267bded2dbSJung-uk Kim # cross page boundary, swap it with $bp [meaning that caller 9277bded2dbSJung-uk Kim # is advised to lay down $ap and $bp next to each other, so 9287bded2dbSJung-uk Kim # that only one can cross page boundary]. 
9297bded2dbSJung-uk Kim .byte 0x67,0x67 9307bded2dbSJung-uk Kim mov $ap, $tmp 9317bded2dbSJung-uk Kim and \$4095, $tmp 9327bded2dbSJung-uk Kim add \$32*10, $tmp 9337bded2dbSJung-uk Kim shr \$12, $tmp 9347bded2dbSJung-uk Kim mov $ap, $tmp 9357bded2dbSJung-uk Kim cmovnz $bp, $ap 9367bded2dbSJung-uk Kim cmovnz $tmp, $bp 9377bded2dbSJung-uk Kim 9387bded2dbSJung-uk Kim mov $np, $tmp 9397bded2dbSJung-uk Kim sub \$-128,$ap # size optimization 9407bded2dbSJung-uk Kim sub \$-128,$np 9417bded2dbSJung-uk Kim sub \$-128,$rp 9427bded2dbSJung-uk Kim 9437bded2dbSJung-uk Kim and \$4095, $tmp # see if $np crosses page 9447bded2dbSJung-uk Kim add \$32*10, $tmp 9457bded2dbSJung-uk Kim .byte 0x67,0x67 9467bded2dbSJung-uk Kim shr \$12, $tmp 9477bded2dbSJung-uk Kim jz .Lmul_1024_no_n_copy 9487bded2dbSJung-uk Kim 9497bded2dbSJung-uk Kim # unaligned 256-bit load that crosses page boundary can 9507bded2dbSJung-uk Kim # cause severe performance degradation here, so if $np does 9517bded2dbSJung-uk Kim # cross page boundary, copy it to stack and make sure stack 9527bded2dbSJung-uk Kim # frame doesn't... 
9537bded2dbSJung-uk Kim sub \$32*10,%rsp 9547bded2dbSJung-uk Kim vmovdqu 32*0-128($np), $ACC0 9557bded2dbSJung-uk Kim and \$-512, %rsp 9567bded2dbSJung-uk Kim vmovdqu 32*1-128($np), $ACC1 9577bded2dbSJung-uk Kim vmovdqu 32*2-128($np), $ACC2 9587bded2dbSJung-uk Kim vmovdqu 32*3-128($np), $ACC3 9597bded2dbSJung-uk Kim vmovdqu 32*4-128($np), $ACC4 9607bded2dbSJung-uk Kim vmovdqu 32*5-128($np), $ACC5 9617bded2dbSJung-uk Kim vmovdqu 32*6-128($np), $ACC6 9627bded2dbSJung-uk Kim vmovdqu 32*7-128($np), $ACC7 9637bded2dbSJung-uk Kim vmovdqu 32*8-128($np), $ACC8 9647bded2dbSJung-uk Kim lea 64+128(%rsp),$np 9657bded2dbSJung-uk Kim vmovdqu $ACC0, 32*0-128($np) 9667bded2dbSJung-uk Kim vpxor $ACC0, $ACC0, $ACC0 9677bded2dbSJung-uk Kim vmovdqu $ACC1, 32*1-128($np) 9687bded2dbSJung-uk Kim vpxor $ACC1, $ACC1, $ACC1 9697bded2dbSJung-uk Kim vmovdqu $ACC2, 32*2-128($np) 9707bded2dbSJung-uk Kim vpxor $ACC2, $ACC2, $ACC2 9717bded2dbSJung-uk Kim vmovdqu $ACC3, 32*3-128($np) 9727bded2dbSJung-uk Kim vpxor $ACC3, $ACC3, $ACC3 9737bded2dbSJung-uk Kim vmovdqu $ACC4, 32*4-128($np) 9747bded2dbSJung-uk Kim vpxor $ACC4, $ACC4, $ACC4 9757bded2dbSJung-uk Kim vmovdqu $ACC5, 32*5-128($np) 9767bded2dbSJung-uk Kim vpxor $ACC5, $ACC5, $ACC5 9777bded2dbSJung-uk Kim vmovdqu $ACC6, 32*6-128($np) 9787bded2dbSJung-uk Kim vpxor $ACC6, $ACC6, $ACC6 9797bded2dbSJung-uk Kim vmovdqu $ACC7, 32*7-128($np) 9807bded2dbSJung-uk Kim vpxor $ACC7, $ACC7, $ACC7 9817bded2dbSJung-uk Kim vmovdqu $ACC8, 32*8-128($np) 9827bded2dbSJung-uk Kim vmovdqa $ACC0, $ACC8 9837bded2dbSJung-uk Kim vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall 9847bded2dbSJung-uk Kim.Lmul_1024_no_n_copy: 9857bded2dbSJung-uk Kim and \$-64,%rsp 9867bded2dbSJung-uk Kim 9877bded2dbSJung-uk Kim mov ($bp), %rbx 9887bded2dbSJung-uk Kim vpbroadcastq ($bp), $Bi 9897bded2dbSJung-uk Kim vmovdqu $ACC0, (%rsp) # clear top of stack 9907bded2dbSJung-uk Kim xor $r0, $r0 9917bded2dbSJung-uk Kim .byte 0x67 9927bded2dbSJung-uk Kim xor $r1, $r1 
9937bded2dbSJung-uk Kim xor $r2, $r2 9947bded2dbSJung-uk Kim xor $r3, $r3 9957bded2dbSJung-uk Kim 9967bded2dbSJung-uk Kim vmovdqu .Land_mask(%rip), $AND_MASK 9977bded2dbSJung-uk Kim mov \$9, $i 9987bded2dbSJung-uk Kim vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall 9997bded2dbSJung-uk Kim jmp .Loop_mul_1024 10007bded2dbSJung-uk Kim 10017bded2dbSJung-uk Kim.align 32 10027bded2dbSJung-uk Kim.Loop_mul_1024: 10037bded2dbSJung-uk Kim vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*) 10047bded2dbSJung-uk Kim mov %rbx, %rax 10057bded2dbSJung-uk Kim imulq -128($ap), %rax 10067bded2dbSJung-uk Kim add $r0, %rax 10077bded2dbSJung-uk Kim mov %rbx, $r1 10087bded2dbSJung-uk Kim imulq 8-128($ap), $r1 10097bded2dbSJung-uk Kim add 8(%rsp), $r1 10107bded2dbSJung-uk Kim 10117bded2dbSJung-uk Kim mov %rax, $r0 10127bded2dbSJung-uk Kim imull $n0, %eax 10137bded2dbSJung-uk Kim and \$0x1fffffff, %eax 10147bded2dbSJung-uk Kim 10157bded2dbSJung-uk Kim mov %rbx, $r2 10167bded2dbSJung-uk Kim imulq 16-128($ap), $r2 10177bded2dbSJung-uk Kim add 16(%rsp), $r2 10187bded2dbSJung-uk Kim 10197bded2dbSJung-uk Kim mov %rbx, $r3 10207bded2dbSJung-uk Kim imulq 24-128($ap), $r3 10217bded2dbSJung-uk Kim add 24(%rsp), $r3 10227bded2dbSJung-uk Kim vpmuludq 32*1-128($ap),$Bi,$TEMP0 10237bded2dbSJung-uk Kim vmovd %eax, $Yi 10247bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC1,$ACC1 10257bded2dbSJung-uk Kim vpmuludq 32*2-128($ap),$Bi,$TEMP1 10267bded2dbSJung-uk Kim vpbroadcastq $Yi, $Yi 10277bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC2,$ACC2 10287bded2dbSJung-uk Kim vpmuludq 32*3-128($ap),$Bi,$TEMP2 10297bded2dbSJung-uk Kim vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3 10307bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC3,$ACC3 10317bded2dbSJung-uk Kim vpmuludq 32*4-128($ap),$Bi,$TEMP0 10327bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC4,$ACC4 10337bded2dbSJung-uk Kim vpmuludq 32*5-128($ap),$Bi,$TEMP1 10347bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC5,$ACC5 10357bded2dbSJung-uk Kim vpmuludq 32*6-128($ap),$Bi,$TEMP2 10367bded2dbSJung-uk Kim 
vpaddq $TEMP2,$ACC6,$ACC6 10377bded2dbSJung-uk Kim vpmuludq 32*7-128($ap),$Bi,$TEMP0 10387bded2dbSJung-uk Kim vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3 10397bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC7,$ACC7 10407bded2dbSJung-uk Kim vpmuludq 32*8-128($ap),$Bi,$TEMP1 10417bded2dbSJung-uk Kim vpbroadcastq 8($bp), $Bi 10427bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC8,$ACC8 10437bded2dbSJung-uk Kim 10447bded2dbSJung-uk Kim mov %rax,%rdx 10457bded2dbSJung-uk Kim imulq -128($np),%rax 10467bded2dbSJung-uk Kim add %rax,$r0 10477bded2dbSJung-uk Kim mov %rdx,%rax 10487bded2dbSJung-uk Kim imulq 8-128($np),%rax 10497bded2dbSJung-uk Kim add %rax,$r1 10507bded2dbSJung-uk Kim mov %rdx,%rax 10517bded2dbSJung-uk Kim imulq 16-128($np),%rax 10527bded2dbSJung-uk Kim add %rax,$r2 10537bded2dbSJung-uk Kim shr \$29, $r0 10547bded2dbSJung-uk Kim imulq 24-128($np),%rdx 10557bded2dbSJung-uk Kim add %rdx,$r3 10567bded2dbSJung-uk Kim add $r0, $r1 10577bded2dbSJung-uk Kim 10587bded2dbSJung-uk Kim vpmuludq 32*1-128($np),$Yi,$TEMP2 10597bded2dbSJung-uk Kim vmovq $Bi, %rbx 10607bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC1,$ACC1 10617bded2dbSJung-uk Kim vpmuludq 32*2-128($np),$Yi,$TEMP0 10627bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC2,$ACC2 10637bded2dbSJung-uk Kim vpmuludq 32*3-128($np),$Yi,$TEMP1 10647bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC3,$ACC3 10657bded2dbSJung-uk Kim vpmuludq 32*4-128($np),$Yi,$TEMP2 10667bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC4,$ACC4 10677bded2dbSJung-uk Kim vpmuludq 32*5-128($np),$Yi,$TEMP0 10687bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC5,$ACC5 10697bded2dbSJung-uk Kim vpmuludq 32*6-128($np),$Yi,$TEMP1 10707bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC6,$ACC6 10717bded2dbSJung-uk Kim vpmuludq 32*7-128($np),$Yi,$TEMP2 1072c4ad4dffSJung-uk Kim vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3 10737bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC7,$ACC7 10747bded2dbSJung-uk Kim vpmuludq 32*8-128($np),$Yi,$TEMP0 1075c4ad4dffSJung-uk Kim vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3 10767bded2dbSJung-uk Kim vpaddq 
$TEMP0,$ACC8,$ACC8 10777bded2dbSJung-uk Kim 10787bded2dbSJung-uk Kim mov %rbx, %rax 10797bded2dbSJung-uk Kim imulq -128($ap),%rax 10807bded2dbSJung-uk Kim add %rax,$r1 10817bded2dbSJung-uk Kim vmovdqu -8+32*1-128($ap),$TEMP1 10827bded2dbSJung-uk Kim mov %rbx, %rax 10837bded2dbSJung-uk Kim imulq 8-128($ap),%rax 10847bded2dbSJung-uk Kim add %rax,$r2 10857bded2dbSJung-uk Kim vmovdqu -8+32*2-128($ap),$TEMP2 10867bded2dbSJung-uk Kim 10877bded2dbSJung-uk Kim mov $r1, %rax 1088c4ad4dffSJung-uk Kim vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3 10897bded2dbSJung-uk Kim imull $n0, %eax 1090c4ad4dffSJung-uk Kim vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3 10917bded2dbSJung-uk Kim and \$0x1fffffff, %eax 10927bded2dbSJung-uk Kim 10937bded2dbSJung-uk Kim imulq 16-128($ap),%rbx 10947bded2dbSJung-uk Kim add %rbx,$r3 10957bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP1,$TEMP1 10967bded2dbSJung-uk Kim vmovd %eax, $Yi 10977bded2dbSJung-uk Kim vmovdqu -8+32*3-128($ap),$TEMP0 10987bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC1,$ACC1 10997bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP2,$TEMP2 11007bded2dbSJung-uk Kim vpbroadcastq $Yi, $Yi 11017bded2dbSJung-uk Kim vmovdqu -8+32*4-128($ap),$TEMP1 11027bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC2,$ACC2 11037bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP0,$TEMP0 11047bded2dbSJung-uk Kim vmovdqu -8+32*5-128($ap),$TEMP2 11057bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC3,$ACC3 11067bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP1,$TEMP1 11077bded2dbSJung-uk Kim vmovdqu -8+32*6-128($ap),$TEMP0 11087bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC4,$ACC4 11097bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP2,$TEMP2 11107bded2dbSJung-uk Kim vmovdqu -8+32*7-128($ap),$TEMP1 11117bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC5,$ACC5 11127bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP0,$TEMP0 11137bded2dbSJung-uk Kim vmovdqu -8+32*8-128($ap),$TEMP2 11147bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC6,$ACC6 11157bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP1,$TEMP1 11167bded2dbSJung-uk Kim vmovdqu -8+32*9-128($ap),$ACC9 11177bded2dbSJung-uk Kim 
vpaddq $TEMP1,$ACC7,$ACC7 11187bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP2,$TEMP2 11197bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC8,$ACC8 11207bded2dbSJung-uk Kim vpmuludq $Bi,$ACC9,$ACC9 11217bded2dbSJung-uk Kim vpbroadcastq 16($bp), $Bi 11227bded2dbSJung-uk Kim 11237bded2dbSJung-uk Kim mov %rax,%rdx 11247bded2dbSJung-uk Kim imulq -128($np),%rax 11257bded2dbSJung-uk Kim add %rax,$r1 11267bded2dbSJung-uk Kim vmovdqu -8+32*1-128($np),$TEMP0 11277bded2dbSJung-uk Kim mov %rdx,%rax 11287bded2dbSJung-uk Kim imulq 8-128($np),%rax 11297bded2dbSJung-uk Kim add %rax,$r2 11307bded2dbSJung-uk Kim vmovdqu -8+32*2-128($np),$TEMP1 11317bded2dbSJung-uk Kim shr \$29, $r1 11327bded2dbSJung-uk Kim imulq 16-128($np),%rdx 11337bded2dbSJung-uk Kim add %rdx,$r3 11347bded2dbSJung-uk Kim add $r1, $r2 11357bded2dbSJung-uk Kim 11367bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP0,$TEMP0 11377bded2dbSJung-uk Kim vmovq $Bi, %rbx 11387bded2dbSJung-uk Kim vmovdqu -8+32*3-128($np),$TEMP2 11397bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC1,$ACC1 11407bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP1,$TEMP1 11417bded2dbSJung-uk Kim vmovdqu -8+32*4-128($np),$TEMP0 11427bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC2,$ACC2 11437bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP2,$TEMP2 11447bded2dbSJung-uk Kim vmovdqu -8+32*5-128($np),$TEMP1 11457bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC3,$ACC3 11467bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP0,$TEMP0 11477bded2dbSJung-uk Kim vmovdqu -8+32*6-128($np),$TEMP2 11487bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC4,$ACC4 11497bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP1,$TEMP1 11507bded2dbSJung-uk Kim vmovdqu -8+32*7-128($np),$TEMP0 11517bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC5,$ACC5 11527bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP2,$TEMP2 11537bded2dbSJung-uk Kim vmovdqu -8+32*8-128($np),$TEMP1 11547bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC6,$ACC6 11557bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP0,$TEMP0 11567bded2dbSJung-uk Kim vmovdqu -8+32*9-128($np),$TEMP2 11577bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC7,$ACC7 11587bded2dbSJung-uk Kim vpmuludq 
$Yi,$TEMP1,$TEMP1 11597bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC8,$ACC8 11607bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP2,$TEMP2 11617bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC9,$ACC9 11627bded2dbSJung-uk Kim 11637bded2dbSJung-uk Kim vmovdqu -16+32*1-128($ap),$TEMP0 11647bded2dbSJung-uk Kim mov %rbx,%rax 11657bded2dbSJung-uk Kim imulq -128($ap),%rax 11667bded2dbSJung-uk Kim add $r2,%rax 11677bded2dbSJung-uk Kim 11687bded2dbSJung-uk Kim vmovdqu -16+32*2-128($ap),$TEMP1 11697bded2dbSJung-uk Kim mov %rax,$r2 11707bded2dbSJung-uk Kim imull $n0, %eax 11717bded2dbSJung-uk Kim and \$0x1fffffff, %eax 11727bded2dbSJung-uk Kim 11737bded2dbSJung-uk Kim imulq 8-128($ap),%rbx 11747bded2dbSJung-uk Kim add %rbx,$r3 11757bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP0,$TEMP0 11767bded2dbSJung-uk Kim vmovd %eax, $Yi 11777bded2dbSJung-uk Kim vmovdqu -16+32*3-128($ap),$TEMP2 11787bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC1,$ACC1 11797bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP1,$TEMP1 11807bded2dbSJung-uk Kim vpbroadcastq $Yi, $Yi 11817bded2dbSJung-uk Kim vmovdqu -16+32*4-128($ap),$TEMP0 11827bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC2,$ACC2 11837bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP2,$TEMP2 11847bded2dbSJung-uk Kim vmovdqu -16+32*5-128($ap),$TEMP1 11857bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC3,$ACC3 11867bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP0,$TEMP0 11877bded2dbSJung-uk Kim vmovdqu -16+32*6-128($ap),$TEMP2 11887bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC4,$ACC4 11897bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP1,$TEMP1 11907bded2dbSJung-uk Kim vmovdqu -16+32*7-128($ap),$TEMP0 11917bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC5,$ACC5 11927bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP2,$TEMP2 11937bded2dbSJung-uk Kim vmovdqu -16+32*8-128($ap),$TEMP1 11947bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC6,$ACC6 11957bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP0,$TEMP0 11967bded2dbSJung-uk Kim vmovdqu -16+32*9-128($ap),$TEMP2 11977bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC7,$ACC7 11987bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP1,$TEMP1 11997bded2dbSJung-uk Kim vpaddq 
$TEMP1,$ACC8,$ACC8 12007bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP2,$TEMP2 12017bded2dbSJung-uk Kim vpbroadcastq 24($bp), $Bi 12027bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC9,$ACC9 12037bded2dbSJung-uk Kim 12047bded2dbSJung-uk Kim vmovdqu -16+32*1-128($np),$TEMP0 12057bded2dbSJung-uk Kim mov %rax,%rdx 12067bded2dbSJung-uk Kim imulq -128($np),%rax 12077bded2dbSJung-uk Kim add %rax,$r2 12087bded2dbSJung-uk Kim vmovdqu -16+32*2-128($np),$TEMP1 12097bded2dbSJung-uk Kim imulq 8-128($np),%rdx 12107bded2dbSJung-uk Kim add %rdx,$r3 12117bded2dbSJung-uk Kim shr \$29, $r2 12127bded2dbSJung-uk Kim 12137bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP0,$TEMP0 12147bded2dbSJung-uk Kim vmovq $Bi, %rbx 12157bded2dbSJung-uk Kim vmovdqu -16+32*3-128($np),$TEMP2 12167bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC1,$ACC1 12177bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP1,$TEMP1 12187bded2dbSJung-uk Kim vmovdqu -16+32*4-128($np),$TEMP0 12197bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC2,$ACC2 12207bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP2,$TEMP2 12217bded2dbSJung-uk Kim vmovdqu -16+32*5-128($np),$TEMP1 12227bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC3,$ACC3 12237bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP0,$TEMP0 12247bded2dbSJung-uk Kim vmovdqu -16+32*6-128($np),$TEMP2 12257bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC4,$ACC4 12267bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP1,$TEMP1 12277bded2dbSJung-uk Kim vmovdqu -16+32*7-128($np),$TEMP0 12287bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC5,$ACC5 12297bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP2,$TEMP2 12307bded2dbSJung-uk Kim vmovdqu -16+32*8-128($np),$TEMP1 12317bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC6,$ACC6 12327bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP0,$TEMP0 12337bded2dbSJung-uk Kim vmovdqu -16+32*9-128($np),$TEMP2 12347bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC7,$ACC7 12357bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP1,$TEMP1 12367bded2dbSJung-uk Kim vmovdqu -24+32*1-128($ap),$TEMP0 12377bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC8,$ACC8 12387bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP2,$TEMP2 12397bded2dbSJung-uk Kim vmovdqu 
-24+32*2-128($ap),$TEMP1 12407bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC9,$ACC9 12417bded2dbSJung-uk Kim 12427bded2dbSJung-uk Kim add $r2, $r3 12437bded2dbSJung-uk Kim imulq -128($ap),%rbx 12447bded2dbSJung-uk Kim add %rbx,$r3 12457bded2dbSJung-uk Kim 12467bded2dbSJung-uk Kim mov $r3, %rax 12477bded2dbSJung-uk Kim imull $n0, %eax 12487bded2dbSJung-uk Kim and \$0x1fffffff, %eax 12497bded2dbSJung-uk Kim 12507bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP0,$TEMP0 12517bded2dbSJung-uk Kim vmovd %eax, $Yi 12527bded2dbSJung-uk Kim vmovdqu -24+32*3-128($ap),$TEMP2 12537bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC1,$ACC1 12547bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP1,$TEMP1 12557bded2dbSJung-uk Kim vpbroadcastq $Yi, $Yi 12567bded2dbSJung-uk Kim vmovdqu -24+32*4-128($ap),$TEMP0 12577bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC2,$ACC2 12587bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP2,$TEMP2 12597bded2dbSJung-uk Kim vmovdqu -24+32*5-128($ap),$TEMP1 12607bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC3,$ACC3 12617bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP0,$TEMP0 12627bded2dbSJung-uk Kim vmovdqu -24+32*6-128($ap),$TEMP2 12637bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC4,$ACC4 12647bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP1,$TEMP1 12657bded2dbSJung-uk Kim vmovdqu -24+32*7-128($ap),$TEMP0 12667bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC5,$ACC5 12677bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP2,$TEMP2 12687bded2dbSJung-uk Kim vmovdqu -24+32*8-128($ap),$TEMP1 12697bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC6,$ACC6 12707bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP0,$TEMP0 12717bded2dbSJung-uk Kim vmovdqu -24+32*9-128($ap),$TEMP2 12727bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC7,$ACC7 12737bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP1,$TEMP1 12747bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC8,$ACC8 12757bded2dbSJung-uk Kim vpmuludq $Bi,$TEMP2,$TEMP2 12767bded2dbSJung-uk Kim vpbroadcastq 32($bp), $Bi 12777bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC9,$ACC9 12787bded2dbSJung-uk Kim add \$32, $bp # $bp++ 12797bded2dbSJung-uk Kim 12807bded2dbSJung-uk Kim vmovdqu 
-24+32*1-128($np),$TEMP0 12817bded2dbSJung-uk Kim imulq -128($np),%rax 12827bded2dbSJung-uk Kim add %rax,$r3 12837bded2dbSJung-uk Kim shr \$29, $r3 12847bded2dbSJung-uk Kim 12857bded2dbSJung-uk Kim vmovdqu -24+32*2-128($np),$TEMP1 12867bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP0,$TEMP0 12877bded2dbSJung-uk Kim vmovq $Bi, %rbx 12887bded2dbSJung-uk Kim vmovdqu -24+32*3-128($np),$TEMP2 12897bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0 12907bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP1,$TEMP1 12917bded2dbSJung-uk Kim vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 12927bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC2,$ACC1 12937bded2dbSJung-uk Kim vmovdqu -24+32*4-128($np),$TEMP0 12947bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP2,$TEMP2 12957bded2dbSJung-uk Kim vmovdqu -24+32*5-128($np),$TEMP1 12967bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC3,$ACC2 12977bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP0,$TEMP0 12987bded2dbSJung-uk Kim vmovdqu -24+32*6-128($np),$TEMP2 12997bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC4,$ACC3 13007bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP1,$TEMP1 13017bded2dbSJung-uk Kim vmovdqu -24+32*7-128($np),$TEMP0 13027bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC5,$ACC4 13037bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP2,$TEMP2 13047bded2dbSJung-uk Kim vmovdqu -24+32*8-128($np),$TEMP1 13057bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC6,$ACC5 13067bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP0,$TEMP0 13077bded2dbSJung-uk Kim vmovdqu -24+32*9-128($np),$TEMP2 13087bded2dbSJung-uk Kim mov $r3, $r0 13097bded2dbSJung-uk Kim vpaddq $TEMP0,$ACC7,$ACC6 13107bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP1,$TEMP1 13117bded2dbSJung-uk Kim add (%rsp), $r0 13127bded2dbSJung-uk Kim vpaddq $TEMP1,$ACC8,$ACC7 13137bded2dbSJung-uk Kim vpmuludq $Yi,$TEMP2,$TEMP2 13147bded2dbSJung-uk Kim vmovq $r3, $TEMP1 13157bded2dbSJung-uk Kim vpaddq $TEMP2,$ACC9,$ACC8 13167bded2dbSJung-uk Kim 13177bded2dbSJung-uk Kim dec $i 13187bded2dbSJung-uk Kim jnz .Loop_mul_1024 13197bded2dbSJung-uk Kim___ 13207bded2dbSJung-uk Kim 13217bded2dbSJung-uk Kim# (*) 
Original implementation was correcting ACC1-ACC3 for overflow 13227bded2dbSJung-uk Kim# after 7 loop runs, or after 28 iterations, or 56 additions. 13237bded2dbSJung-uk Kim# But as we underutilize resources, it's possible to correct in 13247bded2dbSJung-uk Kim# each iteration with marginal performance loss. But then, as 13257bded2dbSJung-uk Kim# we do it in each iteration, we can correct less digits, and 1326c4ad4dffSJung-uk Kim# avoid performance penalties completely. 13277bded2dbSJung-uk Kim 13287bded2dbSJung-uk Kim$TEMP0 = $ACC9; 13297bded2dbSJung-uk Kim$TEMP3 = $Bi; 13307bded2dbSJung-uk Kim$TEMP4 = $Yi; 13317bded2dbSJung-uk Kim$code.=<<___; 13327bded2dbSJung-uk Kim vpaddq (%rsp), $TEMP1, $ACC0 13337bded2dbSJung-uk Kim 13347bded2dbSJung-uk Kim vpsrlq \$29, $ACC0, $TEMP1 13357bded2dbSJung-uk Kim vpand $AND_MASK, $ACC0, $ACC0 13367bded2dbSJung-uk Kim vpsrlq \$29, $ACC1, $TEMP2 13377bded2dbSJung-uk Kim vpand $AND_MASK, $ACC1, $ACC1 13387bded2dbSJung-uk Kim vpsrlq \$29, $ACC2, $TEMP3 13397bded2dbSJung-uk Kim vpermq \$0x93, $TEMP1, $TEMP1 13407bded2dbSJung-uk Kim vpand $AND_MASK, $ACC2, $ACC2 13417bded2dbSJung-uk Kim vpsrlq \$29, $ACC3, $TEMP4 13427bded2dbSJung-uk Kim vpermq \$0x93, $TEMP2, $TEMP2 13437bded2dbSJung-uk Kim vpand $AND_MASK, $ACC3, $ACC3 13447bded2dbSJung-uk Kim 13457bded2dbSJung-uk Kim vpblendd \$3, $ZERO, $TEMP1, $TEMP0 13467bded2dbSJung-uk Kim vpermq \$0x93, $TEMP3, $TEMP3 13477bded2dbSJung-uk Kim vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 13487bded2dbSJung-uk Kim vpermq \$0x93, $TEMP4, $TEMP4 13497bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC0, $ACC0 13507bded2dbSJung-uk Kim vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 13517bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC1, $ACC1 13527bded2dbSJung-uk Kim vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 13537bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC2, $ACC2 13547bded2dbSJung-uk Kim vpblendd \$3, $TEMP4, $ZERO, $TEMP4 13557bded2dbSJung-uk Kim vpaddq $TEMP3, $ACC3, $ACC3 13567bded2dbSJung-uk Kim vpaddq $TEMP4, $ACC4, $ACC4 13577bded2dbSJung-uk 
Kim 13587bded2dbSJung-uk Kim vpsrlq \$29, $ACC0, $TEMP1 13597bded2dbSJung-uk Kim vpand $AND_MASK, $ACC0, $ACC0 13607bded2dbSJung-uk Kim vpsrlq \$29, $ACC1, $TEMP2 13617bded2dbSJung-uk Kim vpand $AND_MASK, $ACC1, $ACC1 13627bded2dbSJung-uk Kim vpsrlq \$29, $ACC2, $TEMP3 13637bded2dbSJung-uk Kim vpermq \$0x93, $TEMP1, $TEMP1 13647bded2dbSJung-uk Kim vpand $AND_MASK, $ACC2, $ACC2 13657bded2dbSJung-uk Kim vpsrlq \$29, $ACC3, $TEMP4 13667bded2dbSJung-uk Kim vpermq \$0x93, $TEMP2, $TEMP2 13677bded2dbSJung-uk Kim vpand $AND_MASK, $ACC3, $ACC3 13687bded2dbSJung-uk Kim vpermq \$0x93, $TEMP3, $TEMP3 13697bded2dbSJung-uk Kim 13707bded2dbSJung-uk Kim vpblendd \$3, $ZERO, $TEMP1, $TEMP0 13717bded2dbSJung-uk Kim vpermq \$0x93, $TEMP4, $TEMP4 13727bded2dbSJung-uk Kim vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 13737bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC0, $ACC0 13747bded2dbSJung-uk Kim vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 13757bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC1, $ACC1 13767bded2dbSJung-uk Kim vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 13777bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC2, $ACC2 13787bded2dbSJung-uk Kim vpblendd \$3, $TEMP4, $ZERO, $TEMP4 13797bded2dbSJung-uk Kim vpaddq $TEMP3, $ACC3, $ACC3 13807bded2dbSJung-uk Kim vpaddq $TEMP4, $ACC4, $ACC4 13817bded2dbSJung-uk Kim 13827bded2dbSJung-uk Kim vmovdqu $ACC0, 0-128($rp) 13837bded2dbSJung-uk Kim vmovdqu $ACC1, 32-128($rp) 13847bded2dbSJung-uk Kim vmovdqu $ACC2, 64-128($rp) 13857bded2dbSJung-uk Kim vmovdqu $ACC3, 96-128($rp) 13867bded2dbSJung-uk Kim___ 13877bded2dbSJung-uk Kim 13887bded2dbSJung-uk Kim$TEMP5=$ACC0; 13897bded2dbSJung-uk Kim$code.=<<___; 13907bded2dbSJung-uk Kim vpsrlq \$29, $ACC4, $TEMP1 13917bded2dbSJung-uk Kim vpand $AND_MASK, $ACC4, $ACC4 13927bded2dbSJung-uk Kim vpsrlq \$29, $ACC5, $TEMP2 13937bded2dbSJung-uk Kim vpand $AND_MASK, $ACC5, $ACC5 13947bded2dbSJung-uk Kim vpsrlq \$29, $ACC6, $TEMP3 13957bded2dbSJung-uk Kim vpermq \$0x93, $TEMP1, $TEMP1 13967bded2dbSJung-uk Kim vpand $AND_MASK, $ACC6, $ACC6 
13977bded2dbSJung-uk Kim vpsrlq \$29, $ACC7, $TEMP4 13987bded2dbSJung-uk Kim vpermq \$0x93, $TEMP2, $TEMP2 13997bded2dbSJung-uk Kim vpand $AND_MASK, $ACC7, $ACC7 14007bded2dbSJung-uk Kim vpsrlq \$29, $ACC8, $TEMP5 14017bded2dbSJung-uk Kim vpermq \$0x93, $TEMP3, $TEMP3 14027bded2dbSJung-uk Kim vpand $AND_MASK, $ACC8, $ACC8 14037bded2dbSJung-uk Kim vpermq \$0x93, $TEMP4, $TEMP4 14047bded2dbSJung-uk Kim 14057bded2dbSJung-uk Kim vpblendd \$3, $ZERO, $TEMP1, $TEMP0 14067bded2dbSJung-uk Kim vpermq \$0x93, $TEMP5, $TEMP5 14077bded2dbSJung-uk Kim vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 14087bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC4, $ACC4 14097bded2dbSJung-uk Kim vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 14107bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC5, $ACC5 14117bded2dbSJung-uk Kim vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 14127bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC6, $ACC6 14137bded2dbSJung-uk Kim vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 14147bded2dbSJung-uk Kim vpaddq $TEMP3, $ACC7, $ACC7 14157bded2dbSJung-uk Kim vpaddq $TEMP4, $ACC8, $ACC8 14167bded2dbSJung-uk Kim 14177bded2dbSJung-uk Kim vpsrlq \$29, $ACC4, $TEMP1 14187bded2dbSJung-uk Kim vpand $AND_MASK, $ACC4, $ACC4 14197bded2dbSJung-uk Kim vpsrlq \$29, $ACC5, $TEMP2 14207bded2dbSJung-uk Kim vpand $AND_MASK, $ACC5, $ACC5 14217bded2dbSJung-uk Kim vpsrlq \$29, $ACC6, $TEMP3 14227bded2dbSJung-uk Kim vpermq \$0x93, $TEMP1, $TEMP1 14237bded2dbSJung-uk Kim vpand $AND_MASK, $ACC6, $ACC6 14247bded2dbSJung-uk Kim vpsrlq \$29, $ACC7, $TEMP4 14257bded2dbSJung-uk Kim vpermq \$0x93, $TEMP2, $TEMP2 14267bded2dbSJung-uk Kim vpand $AND_MASK, $ACC7, $ACC7 14277bded2dbSJung-uk Kim vpsrlq \$29, $ACC8, $TEMP5 14287bded2dbSJung-uk Kim vpermq \$0x93, $TEMP3, $TEMP3 14297bded2dbSJung-uk Kim vpand $AND_MASK, $ACC8, $ACC8 14307bded2dbSJung-uk Kim vpermq \$0x93, $TEMP4, $TEMP4 14317bded2dbSJung-uk Kim 14327bded2dbSJung-uk Kim vpblendd \$3, $ZERO, $TEMP1, $TEMP0 14337bded2dbSJung-uk Kim vpermq \$0x93, $TEMP5, $TEMP5 14347bded2dbSJung-uk Kim vpblendd \$3, 
$TEMP1, $TEMP2, $TEMP1 14357bded2dbSJung-uk Kim vpaddq $TEMP0, $ACC4, $ACC4 14367bded2dbSJung-uk Kim vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 14377bded2dbSJung-uk Kim vpaddq $TEMP1, $ACC5, $ACC5 14387bded2dbSJung-uk Kim vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 14397bded2dbSJung-uk Kim vpaddq $TEMP2, $ACC6, $ACC6 14407bded2dbSJung-uk Kim vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 14417bded2dbSJung-uk Kim vpaddq $TEMP3, $ACC7, $ACC7 14427bded2dbSJung-uk Kim vpaddq $TEMP4, $ACC8, $ACC8 14437bded2dbSJung-uk Kim 14447bded2dbSJung-uk Kim vmovdqu $ACC4, 128-128($rp) 14457bded2dbSJung-uk Kim vmovdqu $ACC5, 160-128($rp) 14467bded2dbSJung-uk Kim vmovdqu $ACC6, 192-128($rp) 14477bded2dbSJung-uk Kim vmovdqu $ACC7, 224-128($rp) 14487bded2dbSJung-uk Kim vmovdqu $ACC8, 256-128($rp) 14497bded2dbSJung-uk Kim vzeroupper 14507bded2dbSJung-uk Kim 14517bded2dbSJung-uk Kim mov %rbp, %rax 1452e71b7053SJung-uk Kim.cfi_def_cfa_register %rax 14537bded2dbSJung-uk Kim___ 14547bded2dbSJung-uk Kim$code.=<<___ if ($win64); 1455e71b7053SJung-uk Kim.Lmul_1024_in_tail: 14567bded2dbSJung-uk Kim movaps -0xd8(%rax),%xmm6 14577bded2dbSJung-uk Kim movaps -0xc8(%rax),%xmm7 14587bded2dbSJung-uk Kim movaps -0xb8(%rax),%xmm8 14597bded2dbSJung-uk Kim movaps -0xa8(%rax),%xmm9 14607bded2dbSJung-uk Kim movaps -0x98(%rax),%xmm10 14617bded2dbSJung-uk Kim movaps -0x88(%rax),%xmm11 14627bded2dbSJung-uk Kim movaps -0x78(%rax),%xmm12 14637bded2dbSJung-uk Kim movaps -0x68(%rax),%xmm13 14647bded2dbSJung-uk Kim movaps -0x58(%rax),%xmm14 14657bded2dbSJung-uk Kim movaps -0x48(%rax),%xmm15 14667bded2dbSJung-uk Kim___ 14677bded2dbSJung-uk Kim$code.=<<___; 14687bded2dbSJung-uk Kim mov -48(%rax),%r15 1469e71b7053SJung-uk Kim.cfi_restore %r15 14707bded2dbSJung-uk Kim mov -40(%rax),%r14 1471e71b7053SJung-uk Kim.cfi_restore %r14 14727bded2dbSJung-uk Kim mov -32(%rax),%r13 1473e71b7053SJung-uk Kim.cfi_restore %r13 14747bded2dbSJung-uk Kim mov -24(%rax),%r12 1475e71b7053SJung-uk Kim.cfi_restore %r12 14767bded2dbSJung-uk Kim mov -16(%rax),%rbp 
1477e71b7053SJung-uk Kim.cfi_restore %rbp 14787bded2dbSJung-uk Kim mov -8(%rax),%rbx 1479e71b7053SJung-uk Kim.cfi_restore %rbx 14807bded2dbSJung-uk Kim lea (%rax),%rsp # restore %rsp 1481e71b7053SJung-uk Kim.cfi_def_cfa_register %rsp 14827bded2dbSJung-uk Kim.Lmul_1024_epilogue: 14837bded2dbSJung-uk Kim ret 1484e71b7053SJung-uk Kim.cfi_endproc 14857bded2dbSJung-uk Kim.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 14867bded2dbSJung-uk Kim___ 14877bded2dbSJung-uk Kim} 14887bded2dbSJung-uk Kim{ 14897bded2dbSJung-uk Kimmy ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi"); 14907bded2dbSJung-uk Kimmy @T = map("%r$_",(8..11)); 14917bded2dbSJung-uk Kim 14927bded2dbSJung-uk Kim$code.=<<___; 14937bded2dbSJung-uk Kim.globl rsaz_1024_red2norm_avx2 14947bded2dbSJung-uk Kim.type rsaz_1024_red2norm_avx2,\@abi-omnipotent 14957bded2dbSJung-uk Kim.align 32 14967bded2dbSJung-uk Kimrsaz_1024_red2norm_avx2: 14976935a639SJung-uk Kim.cfi_startproc 14987bded2dbSJung-uk Kim sub \$-128,$inp # size optimization 14997bded2dbSJung-uk Kim xor %rax,%rax 15007bded2dbSJung-uk Kim___ 15017bded2dbSJung-uk Kim 15027bded2dbSJung-uk Kimfor ($j=0,$i=0; $i<16; $i++) { 15037bded2dbSJung-uk Kim my $k=0; 15047bded2dbSJung-uk Kim while (29*$j<64*($i+1)) { # load data till boundary 15057bded2dbSJung-uk Kim $code.=" mov `8*$j-128`($inp), @T[0]\n"; 15067bded2dbSJung-uk Kim $j++; $k++; push(@T,shift(@T)); 15077bded2dbSJung-uk Kim } 15087bded2dbSJung-uk Kim $l=$k; 15097bded2dbSJung-uk Kim while ($k>1) { # shift loaded data but last value 15107bded2dbSJung-uk Kim $code.=" shl \$`29*($j-$k)`,@T[-$k]\n"; 15117bded2dbSJung-uk Kim $k--; 15127bded2dbSJung-uk Kim } 15137bded2dbSJung-uk Kim $code.=<<___; # shift last value 15147bded2dbSJung-uk Kim mov @T[-1], @T[0] 15157bded2dbSJung-uk Kim shl \$`29*($j-1)`, @T[-1] 15167bded2dbSJung-uk Kim shr \$`-29*($j-1)`, @T[0] 15177bded2dbSJung-uk Kim___ 15187bded2dbSJung-uk Kim while ($l) { # accumulate all values 15197bded2dbSJung-uk Kim $code.=" add @T[-$l], %rax\n"; 
15207bded2dbSJung-uk Kim $l--; 15217bded2dbSJung-uk Kim } 15227bded2dbSJung-uk Kim $code.=<<___; 15237bded2dbSJung-uk Kim adc \$0, @T[0] # consume eventual carry 15247bded2dbSJung-uk Kim mov %rax, 8*$i($out) 15257bded2dbSJung-uk Kim mov @T[0], %rax 15267bded2dbSJung-uk Kim___ 15277bded2dbSJung-uk Kim push(@T,shift(@T)); 15287bded2dbSJung-uk Kim} 15297bded2dbSJung-uk Kim$code.=<<___; 15307bded2dbSJung-uk Kim ret 15316935a639SJung-uk Kim.cfi_endproc 15327bded2dbSJung-uk Kim.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 15337bded2dbSJung-uk Kim 15347bded2dbSJung-uk Kim.globl rsaz_1024_norm2red_avx2 15357bded2dbSJung-uk Kim.type rsaz_1024_norm2red_avx2,\@abi-omnipotent 15367bded2dbSJung-uk Kim.align 32 15377bded2dbSJung-uk Kimrsaz_1024_norm2red_avx2: 15386935a639SJung-uk Kim.cfi_startproc 15397bded2dbSJung-uk Kim sub \$-128,$out # size optimization 15407bded2dbSJung-uk Kim mov ($inp),@T[0] 15417bded2dbSJung-uk Kim mov \$0x1fffffff,%eax 15427bded2dbSJung-uk Kim___ 15437bded2dbSJung-uk Kimfor ($j=0,$i=0; $i<16; $i++) { 15447bded2dbSJung-uk Kim $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15); 15457bded2dbSJung-uk Kim $code.=" xor @T[1],@T[1]\n" if ($i==15); 15467bded2dbSJung-uk Kim my $k=1; 15477bded2dbSJung-uk Kim while (29*($j+1)<64*($i+1)) { 15487bded2dbSJung-uk Kim $code.=<<___; 15497bded2dbSJung-uk Kim mov @T[0],@T[-$k] 15507bded2dbSJung-uk Kim shr \$`29*$j`,@T[-$k] 15517bded2dbSJung-uk Kim and %rax,@T[-$k] # &0x1fffffff 15527bded2dbSJung-uk Kim mov @T[-$k],`8*$j-128`($out) 15537bded2dbSJung-uk Kim___ 15547bded2dbSJung-uk Kim $j++; $k++; 15557bded2dbSJung-uk Kim } 15567bded2dbSJung-uk Kim $code.=<<___; 15577bded2dbSJung-uk Kim shrd \$`29*$j`,@T[1],@T[0] 15587bded2dbSJung-uk Kim and %rax,@T[0] 15597bded2dbSJung-uk Kim mov @T[0],`8*$j-128`($out) 15607bded2dbSJung-uk Kim___ 15617bded2dbSJung-uk Kim $j++; 15627bded2dbSJung-uk Kim push(@T,shift(@T)); 15637bded2dbSJung-uk Kim} 15647bded2dbSJung-uk Kim$code.=<<___; 15657bded2dbSJung-uk Kim mov 
@T[0],`8*$j-128`($out) # zero 15667bded2dbSJung-uk Kim mov @T[0],`8*($j+1)-128`($out) 15677bded2dbSJung-uk Kim mov @T[0],`8*($j+2)-128`($out) 15687bded2dbSJung-uk Kim mov @T[0],`8*($j+3)-128`($out) 15697bded2dbSJung-uk Kim ret 15706935a639SJung-uk Kim.cfi_endproc 15717bded2dbSJung-uk Kim.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 15727bded2dbSJung-uk Kim___ 15737bded2dbSJung-uk Kim} 15747bded2dbSJung-uk Kim{ 15757bded2dbSJung-uk Kimmy ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); 15767bded2dbSJung-uk Kim 15777bded2dbSJung-uk Kim$code.=<<___; 15787bded2dbSJung-uk Kim.globl rsaz_1024_scatter5_avx2 15797bded2dbSJung-uk Kim.type rsaz_1024_scatter5_avx2,\@abi-omnipotent 15807bded2dbSJung-uk Kim.align 32 15817bded2dbSJung-uk Kimrsaz_1024_scatter5_avx2: 15826935a639SJung-uk Kim.cfi_startproc 15837bded2dbSJung-uk Kim vzeroupper 15847bded2dbSJung-uk Kim vmovdqu .Lscatter_permd(%rip),%ymm5 15857bded2dbSJung-uk Kim shl \$4,$power 15867bded2dbSJung-uk Kim lea ($out,$power),$out 15877bded2dbSJung-uk Kim mov \$9,%eax 15887bded2dbSJung-uk Kim jmp .Loop_scatter_1024 15897bded2dbSJung-uk Kim 15907bded2dbSJung-uk Kim.align 32 15917bded2dbSJung-uk Kim.Loop_scatter_1024: 15927bded2dbSJung-uk Kim vmovdqu ($inp),%ymm0 15937bded2dbSJung-uk Kim lea 32($inp),$inp 15947bded2dbSJung-uk Kim vpermd %ymm0,%ymm5,%ymm0 15957bded2dbSJung-uk Kim vmovdqu %xmm0,($out) 15967bded2dbSJung-uk Kim lea 16*32($out),$out 15977bded2dbSJung-uk Kim dec %eax 15987bded2dbSJung-uk Kim jnz .Loop_scatter_1024 15997bded2dbSJung-uk Kim 16007bded2dbSJung-uk Kim vzeroupper 16017bded2dbSJung-uk Kim ret 16026935a639SJung-uk Kim.cfi_endproc 16037bded2dbSJung-uk Kim.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 16047bded2dbSJung-uk Kim 16057bded2dbSJung-uk Kim.globl rsaz_1024_gather5_avx2 16067bded2dbSJung-uk Kim.type rsaz_1024_gather5_avx2,\@abi-omnipotent 16077bded2dbSJung-uk Kim.align 32 16087bded2dbSJung-uk Kimrsaz_1024_gather5_avx2: 1609e71b7053SJung-uk 
Kim.cfi_startproc 16104c6a0400SJung-uk Kim vzeroupper 16114c6a0400SJung-uk Kim mov %rsp,%r11 1612e71b7053SJung-uk Kim.cfi_def_cfa_register %r11 16137bded2dbSJung-uk Kim___ 16147bded2dbSJung-uk Kim$code.=<<___ if ($win64); 16157bded2dbSJung-uk Kim lea -0x88(%rsp),%rax 16167bded2dbSJung-uk Kim.LSEH_begin_rsaz_1024_gather5: 16177bded2dbSJung-uk Kim # I can't trust assembler to use specific encoding:-( 16187bded2dbSJung-uk Kim .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp 16197bded2dbSJung-uk Kim .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax) 16207bded2dbSJung-uk Kim .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax) 16217bded2dbSJung-uk Kim .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax) 16227bded2dbSJung-uk Kim .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax) 16237bded2dbSJung-uk Kim .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax) 16247bded2dbSJung-uk Kim .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax) 16257bded2dbSJung-uk Kim .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax) 16267bded2dbSJung-uk Kim .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax) 16277bded2dbSJung-uk Kim .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax) 16287bded2dbSJung-uk Kim .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax) 16297bded2dbSJung-uk Kim___ 16307bded2dbSJung-uk Kim$code.=<<___; 16314c6a0400SJung-uk Kim lea -0x100(%rsp),%rsp 16324c6a0400SJung-uk Kim and \$-32, %rsp 16334c6a0400SJung-uk Kim lea .Linc(%rip), %r10 16344c6a0400SJung-uk Kim lea -128(%rsp),%rax # control u-op density 16357bded2dbSJung-uk Kim 16364c6a0400SJung-uk Kim vmovd $power, %xmm4 16374c6a0400SJung-uk Kim vmovdqa (%r10),%ymm0 16384c6a0400SJung-uk Kim vmovdqa 32(%r10),%ymm1 16394c6a0400SJung-uk Kim vmovdqa 64(%r10),%ymm5 16404c6a0400SJung-uk Kim vpbroadcastd %xmm4,%ymm4 16417bded2dbSJung-uk Kim 16424c6a0400SJung-uk Kim vpaddd %ymm5, %ymm0, %ymm2 16434c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm0, %ymm0 16444c6a0400SJung-uk Kim 
vpaddd %ymm5, %ymm1, %ymm3 16454c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm1, %ymm1 16464c6a0400SJung-uk Kim vmovdqa %ymm0, 32*0+128(%rax) 16474c6a0400SJung-uk Kim vpaddd %ymm5, %ymm2, %ymm0 16484c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm2, %ymm2 16494c6a0400SJung-uk Kim vmovdqa %ymm1, 32*1+128(%rax) 16504c6a0400SJung-uk Kim vpaddd %ymm5, %ymm3, %ymm1 16514c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm3, %ymm3 16524c6a0400SJung-uk Kim vmovdqa %ymm2, 32*2+128(%rax) 16534c6a0400SJung-uk Kim vpaddd %ymm5, %ymm0, %ymm2 16544c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm0, %ymm0 16554c6a0400SJung-uk Kim vmovdqa %ymm3, 32*3+128(%rax) 16564c6a0400SJung-uk Kim vpaddd %ymm5, %ymm1, %ymm3 16574c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm1, %ymm1 16584c6a0400SJung-uk Kim vmovdqa %ymm0, 32*4+128(%rax) 16594c6a0400SJung-uk Kim vpaddd %ymm5, %ymm2, %ymm8 16604c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm2, %ymm2 16614c6a0400SJung-uk Kim vmovdqa %ymm1, 32*5+128(%rax) 16624c6a0400SJung-uk Kim vpaddd %ymm5, %ymm3, %ymm9 16634c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm3, %ymm3 16644c6a0400SJung-uk Kim vmovdqa %ymm2, 32*6+128(%rax) 16654c6a0400SJung-uk Kim vpaddd %ymm5, %ymm8, %ymm10 16664c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm8, %ymm8 16674c6a0400SJung-uk Kim vmovdqa %ymm3, 32*7+128(%rax) 16684c6a0400SJung-uk Kim vpaddd %ymm5, %ymm9, %ymm11 16694c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm9, %ymm9 16704c6a0400SJung-uk Kim vpaddd %ymm5, %ymm10, %ymm12 16714c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm10, %ymm10 16724c6a0400SJung-uk Kim vpaddd %ymm5, %ymm11, %ymm13 16734c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm11, %ymm11 16744c6a0400SJung-uk Kim vpaddd %ymm5, %ymm12, %ymm14 16754c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm12, %ymm12 16764c6a0400SJung-uk Kim vpaddd %ymm5, %ymm13, %ymm15 16774c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm13, %ymm13 16784c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm14, %ymm14 16794c6a0400SJung-uk Kim vpcmpeqd %ymm4, %ymm15, %ymm15 16807bded2dbSJung-uk Kim 16814c6a0400SJung-uk Kim vmovdqa -32(%r10),%ymm7 # 
.Lgather_permd 16824c6a0400SJung-uk Kim lea 128($inp), $inp 16834c6a0400SJung-uk Kim mov \$9,$power 16844c6a0400SJung-uk Kim 16857bded2dbSJung-uk Kim.Loop_gather_1024: 16864c6a0400SJung-uk Kim vmovdqa 32*0-128($inp), %ymm0 16874c6a0400SJung-uk Kim vmovdqa 32*1-128($inp), %ymm1 16884c6a0400SJung-uk Kim vmovdqa 32*2-128($inp), %ymm2 16894c6a0400SJung-uk Kim vmovdqa 32*3-128($inp), %ymm3 16904c6a0400SJung-uk Kim vpand 32*0+128(%rax), %ymm0, %ymm0 16914c6a0400SJung-uk Kim vpand 32*1+128(%rax), %ymm1, %ymm1 16924c6a0400SJung-uk Kim vpand 32*2+128(%rax), %ymm2, %ymm2 16934c6a0400SJung-uk Kim vpor %ymm0, %ymm1, %ymm4 16944c6a0400SJung-uk Kim vpand 32*3+128(%rax), %ymm3, %ymm3 16954c6a0400SJung-uk Kim vmovdqa 32*4-128($inp), %ymm0 16964c6a0400SJung-uk Kim vmovdqa 32*5-128($inp), %ymm1 16974c6a0400SJung-uk Kim vpor %ymm2, %ymm3, %ymm5 16984c6a0400SJung-uk Kim vmovdqa 32*6-128($inp), %ymm2 16994c6a0400SJung-uk Kim vmovdqa 32*7-128($inp), %ymm3 17004c6a0400SJung-uk Kim vpand 32*4+128(%rax), %ymm0, %ymm0 17014c6a0400SJung-uk Kim vpand 32*5+128(%rax), %ymm1, %ymm1 17024c6a0400SJung-uk Kim vpand 32*6+128(%rax), %ymm2, %ymm2 17034c6a0400SJung-uk Kim vpor %ymm0, %ymm4, %ymm4 17044c6a0400SJung-uk Kim vpand 32*7+128(%rax), %ymm3, %ymm3 17054c6a0400SJung-uk Kim vpand 32*8-128($inp), %ymm8, %ymm0 17064c6a0400SJung-uk Kim vpor %ymm1, %ymm5, %ymm5 17074c6a0400SJung-uk Kim vpand 32*9-128($inp), %ymm9, %ymm1 17084c6a0400SJung-uk Kim vpor %ymm2, %ymm4, %ymm4 17094c6a0400SJung-uk Kim vpand 32*10-128($inp),%ymm10, %ymm2 17104c6a0400SJung-uk Kim vpor %ymm3, %ymm5, %ymm5 17114c6a0400SJung-uk Kim vpand 32*11-128($inp),%ymm11, %ymm3 17124c6a0400SJung-uk Kim vpor %ymm0, %ymm4, %ymm4 17134c6a0400SJung-uk Kim vpand 32*12-128($inp),%ymm12, %ymm0 17144c6a0400SJung-uk Kim vpor %ymm1, %ymm5, %ymm5 17154c6a0400SJung-uk Kim vpand 32*13-128($inp),%ymm13, %ymm1 17164c6a0400SJung-uk Kim vpor %ymm2, %ymm4, %ymm4 17174c6a0400SJung-uk Kim vpand 32*14-128($inp),%ymm14, %ymm2 17184c6a0400SJung-uk Kim vpor %ymm3, 
%ymm5, %ymm5 17194c6a0400SJung-uk Kim vpand 32*15-128($inp),%ymm15, %ymm3 17204c6a0400SJung-uk Kim lea 32*16($inp), $inp 17214c6a0400SJung-uk Kim vpor %ymm0, %ymm4, %ymm4 17224c6a0400SJung-uk Kim vpor %ymm1, %ymm5, %ymm5 17234c6a0400SJung-uk Kim vpor %ymm2, %ymm4, %ymm4 17244c6a0400SJung-uk Kim vpor %ymm3, %ymm5, %ymm5 17254c6a0400SJung-uk Kim 17264c6a0400SJung-uk Kim vpor %ymm5, %ymm4, %ymm4 17274c6a0400SJung-uk Kim vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared 17287bded2dbSJung-uk Kim vpor %xmm4, %xmm5, %xmm5 17294c6a0400SJung-uk Kim vpermd %ymm5,%ymm7,%ymm5 17304c6a0400SJung-uk Kim vmovdqu %ymm5,($out) 17317bded2dbSJung-uk Kim lea 32($out),$out 17324c6a0400SJung-uk Kim dec $power 17337bded2dbSJung-uk Kim jnz .Loop_gather_1024 17347bded2dbSJung-uk Kim 17357bded2dbSJung-uk Kim vpxor %ymm0,%ymm0,%ymm0 17367bded2dbSJung-uk Kim vmovdqu %ymm0,($out) 17377bded2dbSJung-uk Kim vzeroupper 17387bded2dbSJung-uk Kim___ 17397bded2dbSJung-uk Kim$code.=<<___ if ($win64); 17404c6a0400SJung-uk Kim movaps -0xa8(%r11),%xmm6 17414c6a0400SJung-uk Kim movaps -0x98(%r11),%xmm7 17424c6a0400SJung-uk Kim movaps -0x88(%r11),%xmm8 17434c6a0400SJung-uk Kim movaps -0x78(%r11),%xmm9 17444c6a0400SJung-uk Kim movaps -0x68(%r11),%xmm10 17454c6a0400SJung-uk Kim movaps -0x58(%r11),%xmm11 17464c6a0400SJung-uk Kim movaps -0x48(%r11),%xmm12 17474c6a0400SJung-uk Kim movaps -0x38(%r11),%xmm13 17484c6a0400SJung-uk Kim movaps -0x28(%r11),%xmm14 17494c6a0400SJung-uk Kim movaps -0x18(%r11),%xmm15 17507bded2dbSJung-uk Kim___ 17517bded2dbSJung-uk Kim$code.=<<___; 17524c6a0400SJung-uk Kim lea (%r11),%rsp 1753e71b7053SJung-uk Kim.cfi_def_cfa_register %rsp 17547bded2dbSJung-uk Kim ret 1755e71b7053SJung-uk Kim.cfi_endproc 1756e71b7053SJung-uk Kim.LSEH_end_rsaz_1024_gather5: 17577bded2dbSJung-uk Kim.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 17587bded2dbSJung-uk Kim___ 17597bded2dbSJung-uk Kim} 17607bded2dbSJung-uk Kim 17617bded2dbSJung-uk Kim$code.=<<___; 17627bded2dbSJung-uk Kim.extern 
OPENSSL_ia32cap_P 17637bded2dbSJung-uk Kim.globl rsaz_avx2_eligible 17647bded2dbSJung-uk Kim.type rsaz_avx2_eligible,\@abi-omnipotent 17657bded2dbSJung-uk Kim.align 32 17667bded2dbSJung-uk Kimrsaz_avx2_eligible: 17677bded2dbSJung-uk Kim mov OPENSSL_ia32cap_P+8(%rip),%eax 17687bded2dbSJung-uk Kim___ 17697bded2dbSJung-uk Kim$code.=<<___ if ($addx); 17707bded2dbSJung-uk Kim mov \$`1<<8|1<<19`,%ecx 17717bded2dbSJung-uk Kim mov \$0,%edx 17727bded2dbSJung-uk Kim and %eax,%ecx 17737bded2dbSJung-uk Kim cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X 17747bded2dbSJung-uk Kim cmove %edx,%eax 17757bded2dbSJung-uk Kim___ 17767bded2dbSJung-uk Kim$code.=<<___; 17777bded2dbSJung-uk Kim and \$`1<<5`,%eax 17787bded2dbSJung-uk Kim shr \$5,%eax 17797bded2dbSJung-uk Kim ret 17807bded2dbSJung-uk Kim.size rsaz_avx2_eligible,.-rsaz_avx2_eligible 17817bded2dbSJung-uk Kim 17827bded2dbSJung-uk Kim.align 64 17837bded2dbSJung-uk Kim.Land_mask: 1784c4ad4dffSJung-uk Kim .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff 17857bded2dbSJung-uk Kim.Lscatter_permd: 17867bded2dbSJung-uk Kim .long 0,2,4,6,7,7,7,7 17877bded2dbSJung-uk Kim.Lgather_permd: 17887bded2dbSJung-uk Kim .long 0,7,1,7,2,7,3,7 17894c6a0400SJung-uk Kim.Linc: 17904c6a0400SJung-uk Kim .long 0,0,0,0, 1,1,1,1 17914c6a0400SJung-uk Kim .long 2,2,2,2, 3,3,3,3 17924c6a0400SJung-uk Kim .long 4,4,4,4, 4,4,4,4 17937bded2dbSJung-uk Kim.align 64 17947bded2dbSJung-uk Kim___ 17957bded2dbSJung-uk Kim 17967bded2dbSJung-uk Kimif ($win64) { 17977bded2dbSJung-uk Kim$rec="%rcx"; 17987bded2dbSJung-uk Kim$frame="%rdx"; 17997bded2dbSJung-uk Kim$context="%r8"; 18007bded2dbSJung-uk Kim$disp="%r9"; 18017bded2dbSJung-uk Kim 18027bded2dbSJung-uk Kim$code.=<<___ 18037bded2dbSJung-uk Kim.extern __imp_RtlVirtualUnwind 18047bded2dbSJung-uk Kim.type rsaz_se_handler,\@abi-omnipotent 18057bded2dbSJung-uk Kim.align 16 18067bded2dbSJung-uk Kimrsaz_se_handler: 18077bded2dbSJung-uk Kim push %rsi 18087bded2dbSJung-uk Kim push %rdi 18097bded2dbSJung-uk Kim push %rbx 
18107bded2dbSJung-uk Kim push %rbp 18117bded2dbSJung-uk Kim push %r12 18127bded2dbSJung-uk Kim push %r13 18137bded2dbSJung-uk Kim push %r14 18147bded2dbSJung-uk Kim push %r15 18157bded2dbSJung-uk Kim pushfq 18167bded2dbSJung-uk Kim sub \$64,%rsp 18177bded2dbSJung-uk Kim 18187bded2dbSJung-uk Kim mov 120($context),%rax # pull context->Rax 18197bded2dbSJung-uk Kim mov 248($context),%rbx # pull context->Rip 18207bded2dbSJung-uk Kim 18217bded2dbSJung-uk Kim mov 8($disp),%rsi # disp->ImageBase 18227bded2dbSJung-uk Kim mov 56($disp),%r11 # disp->HandlerData 18237bded2dbSJung-uk Kim 18247bded2dbSJung-uk Kim mov 0(%r11),%r10d # HandlerData[0] 18257bded2dbSJung-uk Kim lea (%rsi,%r10),%r10 # prologue label 18267bded2dbSJung-uk Kim cmp %r10,%rbx # context->Rip<prologue label 18277bded2dbSJung-uk Kim jb .Lcommon_seh_tail 18287bded2dbSJung-uk Kim 18297bded2dbSJung-uk Kim mov 4(%r11),%r10d # HandlerData[1] 18307bded2dbSJung-uk Kim lea (%rsi,%r10),%r10 # epilogue label 18317bded2dbSJung-uk Kim cmp %r10,%rbx # context->Rip>=epilogue label 18327bded2dbSJung-uk Kim jae .Lcommon_seh_tail 18337bded2dbSJung-uk Kim 1834e71b7053SJung-uk Kim mov 160($context),%rbp # pull context->Rbp 1835e71b7053SJung-uk Kim 1836e71b7053SJung-uk Kim mov 8(%r11),%r10d # HandlerData[2] 1837e71b7053SJung-uk Kim lea (%rsi,%r10),%r10 # "in tail" label 1838e71b7053SJung-uk Kim cmp %r10,%rbx # context->Rip>="in tail" label 1839e71b7053SJung-uk Kim cmovc %rbp,%rax 18407bded2dbSJung-uk Kim 18417bded2dbSJung-uk Kim mov -48(%rax),%r15 18427bded2dbSJung-uk Kim mov -40(%rax),%r14 18437bded2dbSJung-uk Kim mov -32(%rax),%r13 18447bded2dbSJung-uk Kim mov -24(%rax),%r12 18457bded2dbSJung-uk Kim mov -16(%rax),%rbp 18467bded2dbSJung-uk Kim mov -8(%rax),%rbx 18477bded2dbSJung-uk Kim mov %r15,240($context) 18487bded2dbSJung-uk Kim mov %r14,232($context) 18497bded2dbSJung-uk Kim mov %r13,224($context) 18507bded2dbSJung-uk Kim mov %r12,216($context) 18517bded2dbSJung-uk Kim mov %rbp,160($context) 18527bded2dbSJung-uk Kim mov 
%rbx,144($context) 18537bded2dbSJung-uk Kim 18547bded2dbSJung-uk Kim lea -0xd8(%rax),%rsi # %xmm save area 18557bded2dbSJung-uk Kim lea 512($context),%rdi # & context.Xmm6 18567bded2dbSJung-uk Kim mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 18577bded2dbSJung-uk Kim .long 0xa548f3fc # cld; rep movsq 18587bded2dbSJung-uk Kim 18597bded2dbSJung-uk Kim.Lcommon_seh_tail: 18607bded2dbSJung-uk Kim mov 8(%rax),%rdi 18617bded2dbSJung-uk Kim mov 16(%rax),%rsi 18627bded2dbSJung-uk Kim mov %rax,152($context) # restore context->Rsp 18637bded2dbSJung-uk Kim mov %rsi,168($context) # restore context->Rsi 18647bded2dbSJung-uk Kim mov %rdi,176($context) # restore context->Rdi 18657bded2dbSJung-uk Kim 18667bded2dbSJung-uk Kim mov 40($disp),%rdi # disp->ContextRecord 18677bded2dbSJung-uk Kim mov $context,%rsi # context 18687bded2dbSJung-uk Kim mov \$154,%ecx # sizeof(CONTEXT) 18697bded2dbSJung-uk Kim .long 0xa548f3fc # cld; rep movsq 18707bded2dbSJung-uk Kim 18717bded2dbSJung-uk Kim mov $disp,%rsi 18727bded2dbSJung-uk Kim xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 18737bded2dbSJung-uk Kim mov 8(%rsi),%rdx # arg2, disp->ImageBase 18747bded2dbSJung-uk Kim mov 0(%rsi),%r8 # arg3, disp->ControlPc 18757bded2dbSJung-uk Kim mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 18767bded2dbSJung-uk Kim mov 40(%rsi),%r10 # disp->ContextRecord 18777bded2dbSJung-uk Kim lea 56(%rsi),%r11 # &disp->HandlerData 18787bded2dbSJung-uk Kim lea 24(%rsi),%r12 # &disp->EstablisherFrame 18797bded2dbSJung-uk Kim mov %r10,32(%rsp) # arg5 18807bded2dbSJung-uk Kim mov %r11,40(%rsp) # arg6 18817bded2dbSJung-uk Kim mov %r12,48(%rsp) # arg7 18827bded2dbSJung-uk Kim mov %rcx,56(%rsp) # arg8, (NULL) 18837bded2dbSJung-uk Kim call *__imp_RtlVirtualUnwind(%rip) 18847bded2dbSJung-uk Kim 18857bded2dbSJung-uk Kim mov \$1,%eax # ExceptionContinueSearch 18867bded2dbSJung-uk Kim add \$64,%rsp 18877bded2dbSJung-uk Kim popfq 18887bded2dbSJung-uk Kim pop %r15 18897bded2dbSJung-uk Kim pop %r14 18907bded2dbSJung-uk Kim pop %r13 
18917bded2dbSJung-uk Kim pop %r12 18927bded2dbSJung-uk Kim pop %rbp 18937bded2dbSJung-uk Kim pop %rbx 18947bded2dbSJung-uk Kim pop %rdi 18957bded2dbSJung-uk Kim pop %rsi 18967bded2dbSJung-uk Kim ret 18977bded2dbSJung-uk Kim.size rsaz_se_handler,.-rsaz_se_handler 18987bded2dbSJung-uk Kim 18997bded2dbSJung-uk Kim.section .pdata 19007bded2dbSJung-uk Kim.align 4 19017bded2dbSJung-uk Kim .rva .LSEH_begin_rsaz_1024_sqr_avx2 19027bded2dbSJung-uk Kim .rva .LSEH_end_rsaz_1024_sqr_avx2 19037bded2dbSJung-uk Kim .rva .LSEH_info_rsaz_1024_sqr_avx2 19047bded2dbSJung-uk Kim 19057bded2dbSJung-uk Kim .rva .LSEH_begin_rsaz_1024_mul_avx2 19067bded2dbSJung-uk Kim .rva .LSEH_end_rsaz_1024_mul_avx2 19077bded2dbSJung-uk Kim .rva .LSEH_info_rsaz_1024_mul_avx2 19087bded2dbSJung-uk Kim 19097bded2dbSJung-uk Kim .rva .LSEH_begin_rsaz_1024_gather5 19107bded2dbSJung-uk Kim .rva .LSEH_end_rsaz_1024_gather5 19117bded2dbSJung-uk Kim .rva .LSEH_info_rsaz_1024_gather5 19127bded2dbSJung-uk Kim.section .xdata 19137bded2dbSJung-uk Kim.align 8 19147bded2dbSJung-uk Kim.LSEH_info_rsaz_1024_sqr_avx2: 19157bded2dbSJung-uk Kim .byte 9,0,0,0 19167bded2dbSJung-uk Kim .rva rsaz_se_handler 1917e71b7053SJung-uk Kim .rva .Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail 1918e71b7053SJung-uk Kim .long 0 19197bded2dbSJung-uk Kim.LSEH_info_rsaz_1024_mul_avx2: 19207bded2dbSJung-uk Kim .byte 9,0,0,0 19217bded2dbSJung-uk Kim .rva rsaz_se_handler 1922e71b7053SJung-uk Kim .rva .Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail 1923e71b7053SJung-uk Kim .long 0 19247bded2dbSJung-uk Kim.LSEH_info_rsaz_1024_gather5: 19254c6a0400SJung-uk Kim .byte 0x01,0x36,0x17,0x0b 19267bded2dbSJung-uk Kim .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 19277bded2dbSJung-uk Kim .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 19287bded2dbSJung-uk Kim .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 19297bded2dbSJung-uk Kim .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 19307bded2dbSJung-uk Kim .byte 
0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 19317bded2dbSJung-uk Kim .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 19327bded2dbSJung-uk Kim .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 19337bded2dbSJung-uk Kim .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 19347bded2dbSJung-uk Kim .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 19357bded2dbSJung-uk Kim .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 19367bded2dbSJung-uk Kim .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 19374c6a0400SJung-uk Kim .byte 0x00,0xb3,0x00,0x00 # set_frame r11 19387bded2dbSJung-uk Kim___ 19397bded2dbSJung-uk Kim} 19407bded2dbSJung-uk Kim 19417bded2dbSJung-uk Kimforeach (split("\n",$code)) { 19427bded2dbSJung-uk Kim s/\`([^\`]*)\`/eval($1)/ge; 19437bded2dbSJung-uk Kim 19447bded2dbSJung-uk Kim s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or 19457bded2dbSJung-uk Kim 19467bded2dbSJung-uk Kim s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 19477bded2dbSJung-uk Kim s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or 19487bded2dbSJung-uk Kim s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 19497bded2dbSJung-uk Kim s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 19507bded2dbSJung-uk Kim s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; 19517bded2dbSJung-uk Kim print $_,"\n"; 19527bded2dbSJung-uk Kim} 19537bded2dbSJung-uk Kim 19547bded2dbSJung-uk Kim}}} else {{{ 19557bded2dbSJung-uk Kimprint <<___; # assembler is too old 19567bded2dbSJung-uk Kim.text 19577bded2dbSJung-uk Kim 19587bded2dbSJung-uk Kim.globl rsaz_avx2_eligible 19597bded2dbSJung-uk Kim.type rsaz_avx2_eligible,\@abi-omnipotent 19607bded2dbSJung-uk Kimrsaz_avx2_eligible: 19617bded2dbSJung-uk Kim xor %eax,%eax 19627bded2dbSJung-uk Kim ret 19637bded2dbSJung-uk Kim.size rsaz_avx2_eligible,.-rsaz_avx2_eligible 19647bded2dbSJung-uk Kim 19657bded2dbSJung-uk Kim.globl rsaz_1024_sqr_avx2 19667bded2dbSJung-uk Kim.globl rsaz_1024_mul_avx2 19677bded2dbSJung-uk Kim.globl rsaz_1024_norm2red_avx2 19687bded2dbSJung-uk Kim.globl 
rsaz_1024_red2norm_avx2 19697bded2dbSJung-uk Kim.globl rsaz_1024_scatter5_avx2 19707bded2dbSJung-uk Kim.globl rsaz_1024_gather5_avx2 19717bded2dbSJung-uk Kim.type rsaz_1024_sqr_avx2,\@abi-omnipotent 19727bded2dbSJung-uk Kimrsaz_1024_sqr_avx2: 19737bded2dbSJung-uk Kimrsaz_1024_mul_avx2: 19747bded2dbSJung-uk Kimrsaz_1024_norm2red_avx2: 19757bded2dbSJung-uk Kimrsaz_1024_red2norm_avx2: 19767bded2dbSJung-uk Kimrsaz_1024_scatter5_avx2: 19777bded2dbSJung-uk Kimrsaz_1024_gather5_avx2: 19787bded2dbSJung-uk Kim .byte 0x0f,0x0b # ud2 19797bded2dbSJung-uk Kim ret 19807bded2dbSJung-uk Kim.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 19817bded2dbSJung-uk Kim___ 19827bded2dbSJung-uk Kim}}} 19837bded2dbSJung-uk Kim 198417f01e99SJung-uk Kimclose STDOUT or die "error closing STDOUT: $!"; 1985