# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2020, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# Originally written by Ilya Albrekht, Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# December 2020
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+----------------------+--------------+-------------|
# |         | OpenSSL 3.0.0-alpha9 | this         | Unit        |
# |---------+----------------------+--------------+-------------|
# | rsa2048 | 2 127 659            | 1 015 625    | cycles/sign |
# |         | 611                  | 1280 / +109% | sign/s      |
# |---------+----------------------+--------------+-------------|
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=0;

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the toolchain for AVX512-IFMA support.  Each later probe runs only
# if no earlier probe has already enabled $avx512ifma.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx512ifma = ($1>=2.26);
}

if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
       `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}

if (!$avx512ifma && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
    $avx512ifma = ($2>=7.0);
}

# Everything printed to STDOUT from here on is piped through the translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

$code.=<<___;
.extern OPENSSL_ia32cap_P
.globl  ossl_rsaz_avx512ifma_eligible
.type   ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent
.align  32
ossl_rsaz_avx512ifma_eligible:
    mov OPENSSL_ia32cap_P+8(%rip), %ecx
    xor %eax,%eax
    and \$`1<<31|1<<21|1<<17|1<<16`, %ecx     # avx512vl + avx512ifma + avx512dq + avx512f
    cmp \$`1<<31|1<<21|1<<17|1<<16`, %ecx
    cmove %ecx,%eax
    ret
.size   ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
___

###############################################################################
# Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52.
#
# AMM is defined as presented in the paper
# "Efficient Software Implementations of Modular Exponentiation" by Shay Gueron.
#
# The input and output are presented in 2^52 radix domain, i.e.
#   |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed.
#   |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
#        (note, the implementation counts only 52 bits from it).
#
# NB: the AMM implementation does not perform "conditional" subtraction step as
# specified in the original algorithm as according to the paper "Enhanced Montgomery
# Multiplication" by Shay Gueron (see Lemma 1), the result will be always < 2*2^1024
# and can be used as a direct input to the next AMM iteration.
# This post-condition is true, provided the correct parameter |s| is chosen, i.e.
# s >= n + 2 * k, which matches our case: 1040 > 1024 + 2 * 1.
#
# void ossl_rsaz_amm52x20_x1_256(BN_ULONG *res,
#                                const BN_ULONG *a,
#                                const BN_ULONG *b,
#                                const BN_ULONG *m,
#                                BN_ULONG k0);
###############################################################################
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

my $mask52     = "%rax";
my $acc0_0     = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1     = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr      = "%r11";

my $iter = "%ebx";

my $zero = "%ymm0";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm1", map("%ymm$_",(16..19)));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm2", map("%ymm$_",(20..23)));
my $Bi = "%ymm3";
my $Yi = "%ymm4";

# Registers mapping for normalization.
# We can reuse Bi, Yi registers here.
my $TMP = $Bi;
my $mask52x4 = $Yi;
my ($T0,$T0h,$T1,$T1h,$T2) = map("%ymm$_", (24..28));

# Appends the assembly for one AMM iteration (one digit of |b|) to $code.
#
# Arguments:
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
#                of data for corresponding AMM operation;
# _b_offset    - offset in the |b| array pointing to the next qword digit;
# _acc         - general-purpose register holding the scalar accumulator;
# _R0.._R2     - five ymm accumulator registers (_R0,_R0h,_R1,_R1h,_R2);
# _k0          - operand (register or memory reference) to load k0 from.
#
# The emitted code uses %rdx, %r10, %r12 and %r13 as scratch registers and
# overwrites the shared $Bi and $Yi vector registers.
sub amm52x20_x1 {
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_k0) = @_;
# vmovq needs the xmm alias of the low accumulator register
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
    movq    $_b_offset($b_ptr), %r13             # b[i]

    vpbroadcastq    %r13, $Bi                    # broadcast b[i]
    movq    $_data_offset($a), %rdx
    mulx    %r13, %r13, %r12                     # a[0]*b[i] = (t0,t2)
    addq    %r13, $_acc                          # acc += t0
    movq    %r12, %r10
    adcq    \$0, %r10                            # t2 += CF

    movq    $_k0, %r13
    imulq   $_acc, %r13                          # acc * k0
    andq    $mask52, %r13                        # yi = (acc * k0) & mask52

    vpbroadcastq    %r13, $Yi                    # broadcast y[i]
    movq    $_data_offset($m), %rdx
    mulx    %r13, %r13, %r12                     # yi * m[0] = (t0,t1)
    addq    %r13, $_acc                          # acc += t0
    adcq    %r12, %r10                           # t2 += (t1 + CF)

    shrq    \$52, $_acc
    salq    \$12, %r10
    or      %r10, $_acc                          # acc = ((acc >> 52) | (t2 << 12))

    vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2

    vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2

    # Shift accumulators right by 1 qword, zero extending the highest one
    valignq     \$1, $_R0, $_R0h, $_R0
    valignq     \$1, $_R0h, $_R1, $_R0h
    valignq     \$1, $_R1, $_R1h, $_R1
    valignq     \$1, $_R1h, $_R2, $_R1h
    valignq     \$1, $_R2, $zero, $_R2

    vmovq   $_R0_xmm, %r13
    addq    %r13, $_acc    # acc += R0[0]

    vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2

    vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
___
}

# Normalization routine: handles carry bits in R0..R2 QWs and
# gets R0..R2 back to normalized 2^52 representation.
#
# Arguments:
# _acc      - general-purpose register holding the scalar accumulator;
# _R0.._R2  - five ymm accumulator registers (_R0,_R0h,_R1,_R1h,_R2).
#
# The emitted code expects $mask52x4 to be loaded by the caller.
#
# Uses %r8-14,%e[bcd]x
sub amm52x20_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2) = @_;
$code.=<<___;
    # Put accumulator to low qword in R0
    vpbroadcastq    $_acc, $TMP
    vpblendd \$3, $TMP, $_R0, $_R0

    # Extract "carries" (12 high bits) from each QW of R0..R2
    # Save them to LSB of QWs in T0..T2
    vpsrlq    \$52, $_R0,   $T0
    vpsrlq    \$52, $_R0h,  $T0h
    vpsrlq    \$52, $_R1,   $T1
    vpsrlq    \$52, $_R1h,  $T1h
    vpsrlq    \$52, $_R2,   $T2

    # "Shift left" T0..T2 by 1 QW
    valignq \$3, $T1h,  $T2,  $T2
    valignq \$3, $T1,   $T1h, $T1h
    valignq \$3, $T0h,  $T1,  $T1
    valignq \$3, $T0,   $T0h, $T0h
    valignq \$3, $zero, $T0,  $T0

    # Drop "carries" from R0..R2 QWs
    vpandq    $mask52x4, $_R0,  $_R0
    vpandq    $mask52x4, $_R0h, $_R0h
    vpandq    $mask52x4, $_R1,  $_R1
    vpandq    $mask52x4, $_R1h, $_R1h
    vpandq    $mask52x4, $_R2,  $_R2

    # Sum R0..R2 with corresponding adjusted carries
    vpaddq  $T0,  $_R0,  $_R0
    vpaddq  $T0h, $_R0h, $_R0h
    vpaddq  $T1,  $_R1,  $_R1
    vpaddq  $T1h, $_R1h, $_R1h
    vpaddq  $T2,  $_R2,  $_R2

    # Now handle carry bits from this addition
    # Get mask of QWs which 52-bit parts overflow...
    vpcmpuq   \$1, $_R0,  $mask52x4, %k1 # OP=lt
    vpcmpuq   \$1, $_R0h, $mask52x4, %k2
    vpcmpuq   \$1, $_R1,  $mask52x4, %k3
    vpcmpuq   \$1, $_R1h, $mask52x4, %k4
    vpcmpuq   \$1, $_R2,  $mask52x4, %k5
    kmovb   %k1, %r14d                   # k1
    kmovb   %k2, %r13d                   # k1h
    kmovb   %k3, %r12d                   # k2
    kmovb   %k4, %r11d                   # k2h
    kmovb   %k5, %r10d                   # k3

    # ...or saturated
    vpcmpuq   \$0, $_R0,  $mask52x4, %k1 # OP=eq
    vpcmpuq   \$0, $_R0h, $mask52x4, %k2
    vpcmpuq   \$0, $_R1,  $mask52x4, %k3
    vpcmpuq   \$0, $_R1h, $mask52x4, %k4
    vpcmpuq   \$0, $_R2,  $mask52x4, %k5
    kmovb   %k1, %r9d                    # k4
    kmovb   %k2, %r8d                    # k4h
    kmovb   %k3, %ebx                    # k5
    kmovb   %k4, %ecx                    # k5h
    kmovb   %k5, %edx                    # k6

    # Get mask of QWs where carries shall be propagated to.
    # Merge 4-bit masks to 8-bit values to use add with carry.
    shl   \$4, %r13b
    or    %r13b, %r14b
    shl   \$4, %r11b
    or    %r11b, %r12b

    add   %r14b, %r14b
    adc   %r12b, %r12b
    adc   %r10b, %r10b

    shl   \$4, %r8b
    or    %r8b,%r9b
    shl   \$4, %cl
    or    %cl, %bl

    add   %r9b, %r14b
    adc   %bl, %r12b
    adc   %dl, %r10b

    xor   %r9b, %r14b
    xor   %bl, %r12b
    xor   %dl, %r10b

    kmovb   %r14d, %k1
    shr     \$4, %r14b
    kmovb   %r14d, %k2
    kmovb   %r12d, %k3
    shr     \$4, %r12b
    kmovb   %r12d, %k4
    kmovb   %r10d, %k5

    # Add carries according to the obtained mask
    vpsubq  $mask52x4, $_R0,  ${_R0}{%k1}
    vpsubq  $mask52x4, $_R0h, ${_R0h}{%k2}
    vpsubq  $mask52x4, $_R1,  ${_R1}{%k3}
    vpsubq  $mask52x4, $_R1h, ${_R1h}{%k4}
    vpsubq  $mask52x4, $_R2,  ${_R2}{%k5}

    vpandq  $mask52x4, $_R0,  $_R0
    vpandq  $mask52x4, $_R0h, $_R0h
    vpandq  $mask52x4, $_R1,  $_R1
    vpandq  $mask52x4, $_R1h, $_R1h
    vpandq  $mask52x4, $_R2,  $_R2
___
}

$code.=<<___;
.text

.globl  ossl_rsaz_amm52x20_x1_256
.type   ossl_rsaz_amm52x20_x1_256,\@function,5
.align 32
ossl_rsaz_amm52x20_x1_256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push   %rbx
    push    %rbp
.cfi_push   %rbp
    push    %r12
.cfi_push   %r12
    push    %r13
.cfi_push   %r13
    push    %r14
.cfi_push   %r14
    push    %r15
.cfi_push   %r15
.Lrsaz_amm52x20_x1_256_body:

    # Zeroing accumulators
    vpxord   $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0

    xorl    $acc0_0_low, $acc0_0_low

    movq    $b, $b_ptr                       # backup address of b
    movq    \$0xfffffffffffff, $mask52       # 52-bit mask

    # Loop over 20 digits unrolled by 4
    mov     \$5, $iter

.align 32
.Lloop5:
___
    # Four consecutive digits of |b| per loop iteration
    foreach my $idx (0..3) {
        amm52x20_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$k0);
    }
$code.=<<___;
    lea    `4*8`($b_ptr), $b_ptr
    dec    $iter
    jne    .Lloop5

    vmovdqa64   .Lmask52x4(%rip), $mask52x4
___
    amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
$code.=<<___;

    vmovdqu64   $R0_0, ($res)
    vmovdqu64   $R0_0h, 32($res)
    vmovdqu64   $R1_0, 64($res)
    vmovdqu64   $R1_0h, 96($res)
    vmovdqu64   $R2_0, 128($res)

    vzeroupper
    mov  0(%rsp),%r15
.cfi_restore    %r15
    mov  8(%rsp),%r14
.cfi_restore    %r14
    mov  16(%rsp),%r13
.cfi_restore    %r13
    mov  24(%rsp),%r12
.cfi_restore    %r12
    mov  32(%rsp),%rbp
.cfi_restore    %rbp
    mov  40(%rsp),%rbx
.cfi_restore    %rbx
    lea  48(%rsp),%rsp
.cfi_adjust_cfa_offset  -48
.Lrsaz_amm52x20_x1_256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
___

$code.=<<___;
.data
.align 32
.Lmask52x4:
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
___

###############################################################################
# Dual Almost Montgomery Multiplication for 20-digit number in radix 2^52
#
# See description of ossl_rsaz_amm52x20_x1_256() above for details about Almost
# Montgomery Multiplication algorithm and function input parameters description.
#
# This function does two AMMs for two independent inputs, hence dual.
#
# void ossl_rsaz_amm52x20_x2_256(BN_ULONG out[2][20],
#                                const BN_ULONG a[2][20],
#                                const BN_ULONG b[2][20],
#                                const BN_ULONG m[2][20],
#                                const BN_ULONG k0[2]);
###############################################################################

$code.=<<___;
.text

.globl  ossl_rsaz_amm52x20_x2_256
.type   ossl_rsaz_amm52x20_x2_256,\@function,5
.align 32
ossl_rsaz_amm52x20_x2_256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push   %rbx
    push    %rbp
.cfi_push   %rbp
    push    %r12
.cfi_push   %r12
    push    %r13
.cfi_push   %r13
    push    %r14
.cfi_push   %r14
    push    %r15
.cfi_push   %r15
.Lrsaz_amm52x20_x2_256_body:

    # Zeroing accumulators
    vpxord   $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0
    vmovdqa64   $zero, $R0_1
    vmovdqa64   $zero, $R0_1h
    vmovdqa64   $zero, $R1_1
    vmovdqa64   $zero, $R1_1h
    vmovdqa64   $zero, $R2_1

    xorl    $acc0_0_low, $acc0_0_low
    xorl    $acc0_1_low, $acc0_1_low

    movq    $b, $b_ptr                       # backup address of b
    movq    \$0xfffffffffffff, $mask52       # 52-bit mask

    mov    \$20, $iter

.align 32
.Lloop20:
___
    amm52x20_x1(   0,   0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,"($k0)");
    # 20*8 = offset of the next dimension in two-dimension array
    amm52x20_x1(20*8,20*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,"8($k0)");
$code.=<<___;
    lea    8($b_ptr), $b_ptr
    dec    $iter
    jne    .Lloop20

    vmovdqa64   .Lmask52x4(%rip), $mask52x4
___
    amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
    amm52x20_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1);
$code.=<<___;

    vmovdqu64   $R0_0, ($res)
    vmovdqu64   $R0_0h, 32($res)
    vmovdqu64   $R1_0, 64($res)
    vmovdqu64   $R1_0h, 96($res)
    vmovdqu64   $R2_0, 128($res)

    vmovdqu64   $R0_1, 160($res)
    vmovdqu64   $R0_1h, 192($res)
    vmovdqu64   $R1_1, 224($res)
    vmovdqu64   $R1_1h, 256($res)
    vmovdqu64   $R2_1, 288($res)

    vzeroupper
    mov  0(%rsp),%r15
.cfi_restore    %r15
    mov  8(%rsp),%r14
.cfi_restore    %r14
    mov  16(%rsp),%r13
.cfi_restore    %r13
    mov  24(%rsp),%r12
.cfi_restore    %r12
    mov  32(%rsp),%rbp
.cfi_restore    %rbp
    mov  40(%rsp),%rbx
.cfi_restore    %rbx
    lea  48(%rsp),%rsp
.cfi_adjust_cfa_offset  -48
.Lrsaz_amm52x20_x2_256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256
___
}

###############################################################################
# Constant time extraction from the precomputed table of powers base^i, where
#    i = 0..2^EXP_WIN_SIZE-1
#
# The input |red_table| contains precomputations for two independent base values,
# so the |tbl_idx| indicates for which base shall we extract the value.
# |red_table_idx| is a power index.
#
# Extracted value (output) is 20 digit number in 2^52 radix.
#
# void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y,
#                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20],
#                                        int red_table_idx,
#                                        int tbl_idx);           # 0 or 1
#
# EXP_WIN_SIZE = 5
###############################################################################
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx,$tbl_idx) = @_6_args_universal_ABI;

my ($t0,$t1,$t2,$t3,$t4) = map("%ymm$_", (0..4));
my $t4xmm = $t4;
$t4xmm =~ s/%y/%x/;
my ($tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = map("%ymm$_", (16..20));
my ($cur_idx,$idx,$ones) = map("%ymm$_", (21..23));

$code.=<<___;
.text

.align 32
.globl  ossl_extract_multiplier_2x20_win5
.type   ossl_extract_multiplier_2x20_win5,\@function,4
ossl_extract_multiplier_2x20_win5:
.cfi_startproc
    endbranch
    leaq    ($tbl_idx,$tbl_idx,4), %rax
    salq    \$5, %rax
    addq    %rax, $red_tbl

    vmovdqa64   .Lones(%rip), $ones         # broadcast ones
    vpbroadcastq    $red_tbl_idx, $idx
    leaq   `(1<<5)*2*20*8`($red_tbl), %rax  # holds end of the tbl

    vpxor   $t4xmm, $t4xmm, $t4xmm
    vmovdqa64   $t4, $t3                    # zeroing t0..4, cur_idx
    vmovdqa64   $t4, $t2
    vmovdqa64   $t4, $t1
    vmovdqa64   $t4, $t0
    vmovdqa64   $t4, $cur_idx

.align 32
.Lloop:
    vpcmpq  \$0, $cur_idx, $idx, %k1        # mask of (idx == cur_idx)
    addq    \$320, $red_tbl                 # 320 = 2 * 20 digits * 8 bytes
    vpaddq  $ones, $cur_idx, $cur_idx       # increment cur_idx
    vmovdqu64  -320($red_tbl), $tmp0        # load data from red_tbl
    vmovdqu64  -288($red_tbl), $tmp1
    vmovdqu64  -256($red_tbl), $tmp2
    vmovdqu64  -224($red_tbl), $tmp3
    vmovdqu64  -192($red_tbl), $tmp4
    vpblendmq  $tmp0, $t0, ${t0}{%k1}       # extract data when mask is not zero
    vpblendmq  $tmp1, $t1, ${t1}{%k1}
    vpblendmq  $tmp2, $t2, ${t2}{%k1}
    vpblendmq  $tmp3, $t3, ${t3}{%k1}
    vpblendmq  $tmp4, $t4, ${t4}{%k1}
    cmpq    $red_tbl, %rax
    jne .Lloop

    vmovdqu64   $t0, ($out)                 # store t0..4
    vmovdqu64   $t1, 32($out)
    vmovdqu64   $t2, 64($out)
    vmovdqu64   $t3, 96($out)
    vmovdqu64   $t4, 128($out)

    ret
.cfi_endproc
.size   ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
___
$code.=<<___;
.data
.align 32
.Lones:
    .quad   1,1,1,1
___
}

# Win64 structured exception handling: unwind handler plus .pdata/.xdata
# tables for the three functions emitted above.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern     __imp_RtlVirtualUnwind
.type   rsaz_def_handler,\@abi-omnipotent
.align  16
rsaz_def_handler:
    push    %rsi
    push    %rdi
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    pushfq
    sub     \$64,%rsp

    mov     120($context),%rax # pull context->Rax
    mov     248($context),%rbx # pull context->Rip

    mov     8($disp),%rsi      # disp->ImageBase
    mov     56($disp),%r11     # disp->HandlerData

    mov     0(%r11),%r10d      # HandlerData[0]
    lea     (%rsi,%r10),%r10   # prologue label
    cmp     %r10,%rbx          # context->Rip<.Lprologue
    jb  .Lcommon_seh_tail

    mov     152($context),%rax # pull context->Rsp

    mov     4(%r11),%r10d      # HandlerData[1]
    lea     (%rsi,%r10),%r10   # epilogue label
    cmp     %r10,%rbx          # context->Rip>=.Lepilogue
    jae     .Lcommon_seh_tail

    lea     48(%rax),%rax

    mov     -8(%rax),%rbx
    mov     -16(%rax),%rbp
    mov     -24(%rax),%r12
    mov     -32(%rax),%r13
    mov     -40(%rax),%r14
    mov     -48(%rax),%r15
    mov     %rbx,144($context) # restore context->Rbx
    mov     %rbp,160($context) # restore context->Rbp
    mov     %r12,216($context) # restore context->R12
    mov     %r13,224($context) # restore context->R13
    mov     %r14,232($context) # restore context->R14
    mov     %r15,240($context) # restore context->R15

.Lcommon_seh_tail:
    mov     8(%rax),%rdi
    mov     16(%rax),%rsi
    mov     %rax,152($context) # restore context->Rsp
    mov     %rsi,168($context) # restore context->Rsi
    mov     %rdi,176($context) # restore context->Rdi

    mov     40($disp),%rdi     # disp->ContextRecord
    mov     $context,%rsi      # context
    mov     \$154,%ecx         # sizeof(CONTEXT)
    .long   0xa548f3fc         # cld; rep movsq

    mov     $disp,%rsi
    xor     %rcx,%rcx          # arg1, UNW_FLAG_NHANDLER
    mov     8(%rsi),%rdx       # arg2, disp->ImageBase
    mov     0(%rsi),%r8        # arg3, disp->ControlPc
    mov     16(%rsi),%r9       # arg4, disp->FunctionEntry
    mov     40(%rsi),%r10      # disp->ContextRecord
    lea     56(%rsi),%r11      # &disp->HandlerData
    lea     24(%rsi),%r12      # &disp->EstablisherFrame
    mov     %r10,32(%rsp)      # arg5
    mov     %r11,40(%rsp)      # arg6
    mov     %r12,48(%rsp)      # arg7
    mov     %rcx,56(%rsp)      # arg8, (NULL)
    call    *__imp_RtlVirtualUnwind(%rip)

    mov     \$1,%eax           # ExceptionContinueSearch
    add     \$64,%rsp
    popfq
    pop     %r15
    pop     %r14
    pop     %r13
    pop     %r12
    pop     %rbp
    pop     %rbx
    pop     %rdi
    pop     %rsi
    ret
.size   rsaz_def_handler,.-rsaz_def_handler

.section    .pdata
.align  4
    .rva    .LSEH_begin_ossl_rsaz_amm52x20_x1_256
    .rva    .LSEH_end_ossl_rsaz_amm52x20_x1_256
    .rva    .LSEH_info_ossl_rsaz_amm52x20_x1_256

    .rva    .LSEH_begin_ossl_rsaz_amm52x20_x2_256
    .rva    .LSEH_end_ossl_rsaz_amm52x20_x2_256
    .rva    .LSEH_info_ossl_rsaz_amm52x20_x2_256

    .rva    .LSEH_begin_ossl_extract_multiplier_2x20_win5
    .rva    .LSEH_end_ossl_extract_multiplier_2x20_win5
    .rva    .LSEH_info_ossl_extract_multiplier_2x20_win5

.section    .xdata
.align  8
.LSEH_info_ossl_rsaz_amm52x20_x1_256:
    .byte   9,0,0,0
    .rva    rsaz_def_handler
    .rva    .Lrsaz_amm52x20_x1_256_body,.Lrsaz_amm52x20_x1_256_epilogue
.LSEH_info_ossl_rsaz_amm52x20_x2_256:
    .byte   9,0,0,0
    .rva    rsaz_def_handler
    .rva    .Lrsaz_amm52x20_x2_256_body,.Lrsaz_amm52x20_x2_256_epilogue
.LSEH_info_ossl_extract_multiplier_2x20_win5:
    .byte   9,0,0,0
    .rva    rsaz_def_handler
    .rva    .LSEH_begin_ossl_extract_multiplier_2x20_win5,.LSEH_begin_ossl_extract_multiplier_2x20_win5
___
}
}}} else {{{                # fallback for old assembler
$code.=<<___;
.text

.globl  ossl_rsaz_avx512ifma_eligible
.type   ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent
ossl_rsaz_avx512ifma_eligible:
    xor     %eax,%eax
    ret
.size   ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible

.globl  ossl_rsaz_amm52x20_x1_256
.globl  ossl_rsaz_amm52x20_x2_256
.globl  ossl_extract_multiplier_2x20_win5
.type   ossl_rsaz_amm52x20_x1_256,\@abi-omnipotent
ossl_rsaz_amm52x20_x1_256:
ossl_rsaz_amm52x20_x2_256:
ossl_extract_multiplier_2x20_win5:
    .byte   0x0f,0x0b    # ud2
    ret
.size   ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
___
}}}

# Evaluate the backtick-quoted constant expressions embedded in the template,
# then hand the result to the translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";