1#! /usr/bin/env perl 2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. 3# Copyright (c) 2012, Intel Corporation. All Rights Reserved. 4# 5# Licensed under the OpenSSL license (the "License"). You may not use 6# this file except in compliance with the License. You can obtain a copy 7# in the file LICENSE in the source distribution or at 8# https://www.openssl.org/source/license.html 9# 10# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) 11# (1) Intel Corporation, Israel Development Center, Haifa, Israel 12# (2) University of Haifa, Israel 13# 14# References: 15# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular 16# Exponentiation, Using Advanced Vector Instructions Architectures", 17# F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, 18# pp. 119?135, 2012. Springer-Verlag Berlin Heidelberg 2012 19# [2] S. Gueron: "Efficient Software Implementations of Modular 20# Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). 21# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE 22# Proceedings of 9th International Conference on Information Technology: 23# New Generations (ITNG 2012), pp.821-823 (2012) 24# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis 25# resistant 1024-bit modular exponentiation, for optimizing RSA2048 26# on AVX2 capable x86_64 platforms", 27# http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest 28# 29# +13% improvement over original submission by <appro@openssl.org> 30# 31# rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this 32# 2.3GHz Haswell 621 765/+23% 1113/+79% 33# 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63% 34# 35# (*) if system doesn't support AVX2, for reference purposes; 36# (**) scaled to 2.3GHz to simplify comparison; 37# (***) scalar AD*X code is faster than AVX2 and is preferred code 38# path for Broadwell; 39 40$flavour = shift; 41$output = shift; 42if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 43 44$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 45 46$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 47( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 48( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 49die "can't locate x86_64-xlate.pl"; 50 51if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 52 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 53 $avx = ($1>=2.19) + ($1>=2.22); 54 $addx = ($1>=2.23); 55} 56 57if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 58 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 59 $avx = ($1>=2.09) + ($1>=2.10); 60 $addx = ($1>=2.10); 61} 62 63if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 64 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 65 $avx = ($1>=10) + ($1>=11); 66 $addx = ($1>=11); 67} 68 69if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) { 70 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 71 $avx = ($ver>=3.0) + ($ver>=3.01); 72 $addx = ($ver>=3.03); 73} 74 75open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 76*STDOUT = *OUT; 77 78if ($avx>1) {{{ 79{ # void AMS_WW( 80my $rp="%rdi"; # BN_ULONG *rp, 81my $ap="%rsi"; # const BN_ULONG *ap, 82my $np="%rdx"; # const BN_ULONG *np, 83my $n0="%ecx"; # const BN_ULONG n0, 84my $rep="%r8d"; # int repeat); 85 86# The registers that hold the accumulated redundant result 87# The AMM works on 1024 bit operands, and redundant word size is 29 88# Therefore: ceil(1024/29)/4 = 9 89my $ACC0="%ymm0"; 90my $ACC1="%ymm1"; 91my $ACC2="%ymm2"; 92my $ACC3="%ymm3"; 93my $ACC4="%ymm4"; 94my $ACC5="%ymm5"; 95my $ACC6="%ymm6"; 96my $ACC7="%ymm7"; 97my $ACC8="%ymm8"; 98my $ACC9="%ymm9"; 99# Registers that hold the broadcasted words of bp, currently used 100my $B1="%ymm10"; 101my $B2="%ymm11"; 102# Registers that hold the broadcasted words of Y, currently used 103my $Y1="%ymm12"; 104my $Y2="%ymm13"; 105# Helper registers 106my $TEMP1="%ymm14"; 107my $AND_MASK="%ymm15"; 108# alu registers that hold the first words of the ACC 109my $r0="%r9"; 110my $r1="%r10"; 111my $r2="%r11"; 112my $r3="%r12"; 113 114my $i="%r14d"; # loop counter 115my $tmp = "%r15"; 116 117my $FrameSize=32*18+32*8; # place for A^2 and 2*A 118 119my $aap=$r0; 120my $tp0="%rbx"; 121my $tp1=$r3; 122my $tpa=$tmp; 123 124$np="%r13"; # reassigned argument 125 126$code.=<<___; 127.text 128 129.globl rsaz_1024_sqr_avx2 130.type rsaz_1024_sqr_avx2,\@function,5 131.align 64 132rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2 133.cfi_startproc 134 lea (%rsp), %rax 135.cfi_def_cfa_register %rax 136 push %rbx 137.cfi_push %rbx 138 push %rbp 139.cfi_push %rbp 140 push %r12 141.cfi_push %r12 142 push %r13 143.cfi_push %r13 144 push %r14 145.cfi_push %r14 146 push %r15 147.cfi_push %r15 148 vzeroupper 149___ 150$code.=<<___ if ($win64); 151 lea -0xa8(%rsp),%rsp 152 vmovaps %xmm6,-0xd8(%rax) 153 vmovaps %xmm7,-0xc8(%rax) 154 vmovaps %xmm8,-0xb8(%rax) 155 vmovaps %xmm9,-0xa8(%rax) 156 vmovaps %xmm10,-0x98(%rax) 157 vmovaps %xmm11,-0x88(%rax) 158 vmovaps %xmm12,-0x78(%rax) 159 vmovaps %xmm13,-0x68(%rax) 160 vmovaps %xmm14,-0x58(%rax) 161 vmovaps %xmm15,-0x48(%rax) 162.Lsqr_1024_body: 163___ 164$code.=<<___; 165 mov %rax,%rbp 166.cfi_def_cfa_register %rbp 167 mov %rdx, $np # reassigned argument 168 sub \$$FrameSize, %rsp 169 mov $np, $tmp 170 sub \$-128, $rp # size optimization 171 sub \$-128, $ap 172 sub \$-128, $np 173 174 and \$4095, $tmp # see if $np crosses page 175 add \$32*10, $tmp 176 shr \$12, $tmp 177 vpxor $ACC9,$ACC9,$ACC9 178 jz .Lsqr_1024_no_n_copy 179 180 # unaligned 256-bit load that crosses page boundary can 181 # cause >2x performance degradation here, so if $np does 182 # cross page boundary, copy it to stack and make sure stack 183 # frame doesn't... 184 sub \$32*10,%rsp 185 vmovdqu 32*0-128($np), $ACC0 186 and \$-2048, %rsp 187 vmovdqu 32*1-128($np), $ACC1 188 vmovdqu 32*2-128($np), $ACC2 189 vmovdqu 32*3-128($np), $ACC3 190 vmovdqu 32*4-128($np), $ACC4 191 vmovdqu 32*5-128($np), $ACC5 192 vmovdqu 32*6-128($np), $ACC6 193 vmovdqu 32*7-128($np), $ACC7 194 vmovdqu 32*8-128($np), $ACC8 195 lea $FrameSize+128(%rsp),$np 196 vmovdqu $ACC0, 32*0-128($np) 197 vmovdqu $ACC1, 32*1-128($np) 198 vmovdqu $ACC2, 32*2-128($np) 199 vmovdqu $ACC3, 32*3-128($np) 200 vmovdqu $ACC4, 32*4-128($np) 201 vmovdqu $ACC5, 32*5-128($np) 202 vmovdqu $ACC6, 32*6-128($np) 203 vmovdqu $ACC7, 32*7-128($np) 204 vmovdqu $ACC8, 32*8-128($np) 205 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero 206 207.Lsqr_1024_no_n_copy: 208 and \$-1024, %rsp 209 210 vmovdqu 32*1-128($ap), $ACC1 211 vmovdqu 32*2-128($ap), $ACC2 212 vmovdqu 32*3-128($ap), $ACC3 213 vmovdqu 32*4-128($ap), $ACC4 214 vmovdqu 32*5-128($ap), $ACC5 215 vmovdqu 32*6-128($ap), $ACC6 216 vmovdqu 32*7-128($ap), $ACC7 217 vmovdqu 32*8-128($ap), $ACC8 218 219 lea 192(%rsp), $tp0 # 64+128=192 220 vmovdqu .Land_mask(%rip), $AND_MASK 221 jmp .LOOP_GRANDE_SQR_1024 222 223.align 32 224.LOOP_GRANDE_SQR_1024: 225 lea 32*18+128(%rsp), $aap # size optimization 226 lea 448(%rsp), $tp1 # 64+128+256=448 227 228 # the squaring is performed as described in Variant B of 229 # "Speeding up Big-Number Squaring", so start by calculating 230 # the A*2=A+A vector 231 vpaddq $ACC1, $ACC1, $ACC1 232 vpbroadcastq 32*0-128($ap), $B1 233 vpaddq $ACC2, $ACC2, $ACC2 234 vmovdqa $ACC1, 32*0-128($aap) 235 vpaddq $ACC3, $ACC3, $ACC3 236 vmovdqa $ACC2, 32*1-128($aap) 237 vpaddq $ACC4, $ACC4, $ACC4 238 vmovdqa $ACC3, 32*2-128($aap) 239 vpaddq $ACC5, $ACC5, $ACC5 240 vmovdqa $ACC4, 32*3-128($aap) 241 vpaddq $ACC6, $ACC6, $ACC6 242 vmovdqa $ACC5, 32*4-128($aap) 243 vpaddq $ACC7, $ACC7, $ACC7 244 vmovdqa $ACC6, 32*5-128($aap) 245 vpaddq $ACC8, $ACC8, $ACC8 246 vmovdqa $ACC7, 32*6-128($aap) 247 vpxor $ACC9, $ACC9, $ACC9 248 vmovdqa $ACC8, 32*7-128($aap) 249 250 vpmuludq 32*0-128($ap), $B1, $ACC0 251 vpbroadcastq 32*1-128($ap), $B2 252 vmovdqu $ACC9, 32*9-192($tp0) # zero upper half 253 vpmuludq $B1, $ACC1, $ACC1 254 vmovdqu $ACC9, 32*10-448($tp1) 255 vpmuludq $B1, $ACC2, $ACC2 256 vmovdqu $ACC9, 32*11-448($tp1) 257 vpmuludq $B1, $ACC3, $ACC3 258 vmovdqu $ACC9, 32*12-448($tp1) 259 vpmuludq $B1, $ACC4, $ACC4 260 vmovdqu $ACC9, 32*13-448($tp1) 261 vpmuludq $B1, $ACC5, $ACC5 262 vmovdqu $ACC9, 32*14-448($tp1) 263 vpmuludq $B1, $ACC6, $ACC6 264 vmovdqu $ACC9, 32*15-448($tp1) 265 vpmuludq $B1, $ACC7, $ACC7 266 vmovdqu $ACC9, 32*16-448($tp1) 267 vpmuludq $B1, $ACC8, $ACC8 268 vpbroadcastq 32*2-128($ap), $B1 269 vmovdqu $ACC9, 32*17-448($tp1) 270 271 mov $ap, $tpa 272 mov \$4, $i 273 jmp .Lsqr_entry_1024 274___ 275$TEMP0=$Y1; 276$TEMP2=$Y2; 277$code.=<<___; 278.align 32 279.LOOP_SQR_1024: 280 vpbroadcastq 32*1-128($tpa), $B2 281 vpmuludq 32*0-128($ap), $B1, $ACC0 282 vpaddq 32*0-192($tp0), $ACC0, $ACC0 283 vpmuludq 32*0-128($aap), $B1, $ACC1 284 vpaddq 32*1-192($tp0), $ACC1, $ACC1 285 vpmuludq 32*1-128($aap), $B1, $ACC2 286 vpaddq 32*2-192($tp0), $ACC2, $ACC2 287 vpmuludq 32*2-128($aap), $B1, $ACC3 288 vpaddq 32*3-192($tp0), $ACC3, $ACC3 289 vpmuludq 32*3-128($aap), $B1, $ACC4 290 vpaddq 32*4-192($tp0), $ACC4, $ACC4 291 vpmuludq 32*4-128($aap), $B1, $ACC5 292 vpaddq 32*5-192($tp0), $ACC5, $ACC5 293 vpmuludq 32*5-128($aap), $B1, $ACC6 294 vpaddq 32*6-192($tp0), $ACC6, $ACC6 295 vpmuludq 32*6-128($aap), $B1, $ACC7 296 vpaddq 32*7-192($tp0), $ACC7, $ACC7 297 vpmuludq 32*7-128($aap), $B1, $ACC8 298 vpbroadcastq 32*2-128($tpa), $B1 299 vpaddq 32*8-192($tp0), $ACC8, $ACC8 300.Lsqr_entry_1024: 301 vmovdqu $ACC0, 32*0-192($tp0) 302 vmovdqu $ACC1, 32*1-192($tp0) 303 304 vpmuludq 32*1-128($ap), $B2, $TEMP0 305 vpaddq $TEMP0, $ACC2, $ACC2 306 vpmuludq 32*1-128($aap), $B2, $TEMP1 307 vpaddq $TEMP1, $ACC3, $ACC3 308 vpmuludq 32*2-128($aap), $B2, $TEMP2 309 vpaddq $TEMP2, $ACC4, $ACC4 310 vpmuludq 32*3-128($aap), $B2, $TEMP0 311 vpaddq $TEMP0, $ACC5, $ACC5 312 vpmuludq 32*4-128($aap), $B2, $TEMP1 313 vpaddq $TEMP1, $ACC6, $ACC6 314 vpmuludq 32*5-128($aap), $B2, $TEMP2 315 vpaddq $TEMP2, $ACC7, $ACC7 316 vpmuludq 32*6-128($aap), $B2, $TEMP0 317 vpaddq $TEMP0, $ACC8, $ACC8 318 vpmuludq 32*7-128($aap), $B2, $ACC0 319 vpbroadcastq 32*3-128($tpa), $B2 320 vpaddq 32*9-192($tp0), $ACC0, $ACC0 321 322 vmovdqu $ACC2, 32*2-192($tp0) 323 vmovdqu $ACC3, 32*3-192($tp0) 324 325 vpmuludq 32*2-128($ap), $B1, $TEMP2 326 vpaddq $TEMP2, $ACC4, $ACC4 327 vpmuludq 32*2-128($aap), $B1, $TEMP0 328 vpaddq $TEMP0, $ACC5, $ACC5 329 vpmuludq 32*3-128($aap), $B1, $TEMP1 330 vpaddq $TEMP1, $ACC6, $ACC6 331 vpmuludq 32*4-128($aap), $B1, $TEMP2 332 vpaddq $TEMP2, $ACC7, $ACC7 333 vpmuludq 32*5-128($aap), $B1, $TEMP0 334 vpaddq $TEMP0, $ACC8, $ACC8 335 vpmuludq 32*6-128($aap), $B1, $TEMP1 336 vpaddq $TEMP1, $ACC0, $ACC0 337 vpmuludq 32*7-128($aap), $B1, $ACC1 338 vpbroadcastq 32*4-128($tpa), $B1 339 vpaddq 32*10-448($tp1), $ACC1, $ACC1 340 341 vmovdqu $ACC4, 32*4-192($tp0) 342 vmovdqu $ACC5, 32*5-192($tp0) 343 344 vpmuludq 32*3-128($ap), $B2, $TEMP0 345 vpaddq $TEMP0, $ACC6, $ACC6 346 vpmuludq 32*3-128($aap), $B2, $TEMP1 347 vpaddq $TEMP1, $ACC7, $ACC7 348 vpmuludq 32*4-128($aap), $B2, $TEMP2 349 vpaddq $TEMP2, $ACC8, $ACC8 350 vpmuludq 32*5-128($aap), $B2, $TEMP0 351 vpaddq $TEMP0, $ACC0, $ACC0 352 vpmuludq 32*6-128($aap), $B2, $TEMP1 353 vpaddq $TEMP1, $ACC1, $ACC1 354 vpmuludq 32*7-128($aap), $B2, $ACC2 355 vpbroadcastq 32*5-128($tpa), $B2 356 vpaddq 32*11-448($tp1), $ACC2, $ACC2 357 358 vmovdqu $ACC6, 32*6-192($tp0) 359 vmovdqu $ACC7, 32*7-192($tp0) 360 361 vpmuludq 32*4-128($ap), $B1, $TEMP0 362 vpaddq $TEMP0, $ACC8, $ACC8 363 vpmuludq 32*4-128($aap), $B1, $TEMP1 364 vpaddq $TEMP1, $ACC0, $ACC0 365 vpmuludq 32*5-128($aap), $B1, $TEMP2 366 vpaddq $TEMP2, $ACC1, $ACC1 367 vpmuludq 32*6-128($aap), $B1, $TEMP0 368 vpaddq $TEMP0, $ACC2, $ACC2 369 vpmuludq 32*7-128($aap), $B1, $ACC3 370 vpbroadcastq 32*6-128($tpa), $B1 371 vpaddq 32*12-448($tp1), $ACC3, $ACC3 372 373 vmovdqu $ACC8, 32*8-192($tp0) 374 vmovdqu $ACC0, 32*9-192($tp0) 375 lea 8($tp0), $tp0 376 377 vpmuludq 32*5-128($ap), $B2, $TEMP2 378 vpaddq $TEMP2, $ACC1, $ACC1 379 vpmuludq 32*5-128($aap), $B2, $TEMP0 380 vpaddq $TEMP0, $ACC2, $ACC2 381 vpmuludq 32*6-128($aap), $B2, $TEMP1 382 vpaddq $TEMP1, $ACC3, $ACC3 383 vpmuludq 32*7-128($aap), $B2, $ACC4 384 vpbroadcastq 32*7-128($tpa), $B2 385 vpaddq 32*13-448($tp1), $ACC4, $ACC4 386 387 vmovdqu $ACC1, 32*10-448($tp1) 388 vmovdqu $ACC2, 32*11-448($tp1) 389 390 vpmuludq 32*6-128($ap), $B1, $TEMP0 391 vpaddq $TEMP0, $ACC3, $ACC3 392 vpmuludq 32*6-128($aap), $B1, $TEMP1 393 vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1 394 vpaddq $TEMP1, $ACC4, $ACC4 395 vpmuludq 32*7-128($aap), $B1, $ACC5 396 vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration 397 vpaddq 32*14-448($tp1), $ACC5, $ACC5 398 399 vmovdqu $ACC3, 32*12-448($tp1) 400 vmovdqu $ACC4, 32*13-448($tp1) 401 lea 8($tpa), $tpa 402 403 vpmuludq 32*7-128($ap), $B2, $TEMP0 404 vpaddq $TEMP0, $ACC5, $ACC5 405 vpmuludq 32*7-128($aap), $B2, $ACC6 406 vpaddq 32*15-448($tp1), $ACC6, $ACC6 407 408 vpmuludq 32*8-128($ap), $ACC0, $ACC7 409 vmovdqu $ACC5, 32*14-448($tp1) 410 vpaddq 32*16-448($tp1), $ACC7, $ACC7 411 vmovdqu $ACC6, 32*15-448($tp1) 412 vmovdqu $ACC7, 32*16-448($tp1) 413 lea 8($tp1), $tp1 414 415 dec $i 416 jnz .LOOP_SQR_1024 417___ 418$ZERO = $ACC9; 419$TEMP0 = $B1; 420$TEMP2 = $B2; 421$TEMP3 = $Y1; 422$TEMP4 = $Y2; 423$code.=<<___; 424 # we need to fix indices 32-39 to avoid overflow 425 vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), 426 vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) 427 vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) 428 lea 192(%rsp), $tp0 # 64+128=192 429 430 vpsrlq \$29, $ACC8, $TEMP1 431 vpand $AND_MASK, $ACC8, $ACC8 432 vpsrlq \$29, $ACC1, $TEMP2 433 vpand $AND_MASK, $ACC1, $ACC1 434 435 vpermq \$0x93, $TEMP1, $TEMP1 436 vpxor $ZERO, $ZERO, $ZERO 437 vpermq \$0x93, $TEMP2, $TEMP2 438 439 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 440 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 441 vpaddq $TEMP0, $ACC8, $ACC8 442 vpblendd \$3, $TEMP2, $ZERO, $TEMP2 443 vpaddq $TEMP1, $ACC1, $ACC1 444 vpaddq $TEMP2, $ACC2, $ACC2 445 vmovdqu $ACC1, 32*9-192($tp0) 446 vmovdqu $ACC2, 32*10-192($tp0) 447 448 mov (%rsp), %rax 449 mov 8(%rsp), $r1 450 mov 16(%rsp), $r2 451 mov 24(%rsp), $r3 452 vmovdqu 32*1(%rsp), $ACC1 453 vmovdqu 32*2-192($tp0), $ACC2 454 vmovdqu 32*3-192($tp0), $ACC3 455 vmovdqu 32*4-192($tp0), $ACC4 456 vmovdqu 32*5-192($tp0), $ACC5 457 vmovdqu 32*6-192($tp0), $ACC6 458 vmovdqu 32*7-192($tp0), $ACC7 459 460 mov %rax, $r0 461 imull $n0, %eax 462 and \$0x1fffffff, %eax 463 vmovd %eax, $Y1 464 465 mov %rax, %rdx 466 imulq -128($np), %rax 467 vpbroadcastq $Y1, $Y1 468 add %rax, $r0 469 mov %rdx, %rax 470 imulq 8-128($np), %rax 471 shr \$29, $r0 472 add %rax, $r1 473 mov %rdx, %rax 474 imulq 16-128($np), %rax 475 add $r0, $r1 476 add %rax, $r2 477 imulq 24-128($np), %rdx 478 add %rdx, $r3 479 480 mov $r1, %rax 481 imull $n0, %eax 482 and \$0x1fffffff, %eax 483 484 mov \$9, $i 485 jmp .LOOP_REDUCE_1024 486 487.align 32 488.LOOP_REDUCE_1024: 489 vmovd %eax, $Y2 490 vpbroadcastq $Y2, $Y2 491 492 vpmuludq 32*1-128($np), $Y1, $TEMP0 493 mov %rax, %rdx 494 imulq -128($np), %rax 495 vpaddq $TEMP0, $ACC1, $ACC1 496 add %rax, $r1 497 vpmuludq 32*2-128($np), $Y1, $TEMP1 498 mov %rdx, %rax 499 imulq 8-128($np), %rax 500 vpaddq $TEMP1, $ACC2, $ACC2 501 vpmuludq 32*3-128($np), $Y1, $TEMP2 502 .byte 0x67 503 add %rax, $r2 504 .byte 0x67 505 mov %rdx, %rax 506 imulq 16-128($np), %rax 507 shr \$29, $r1 508 vpaddq $TEMP2, $ACC3, $ACC3 509 vpmuludq 32*4-128($np), $Y1, $TEMP0 510 add %rax, $r3 511 add $r1, $r2 512 vpaddq $TEMP0, $ACC4, $ACC4 513 vpmuludq 32*5-128($np), $Y1, $TEMP1 514 mov $r2, %rax 515 imull $n0, %eax 516 vpaddq $TEMP1, $ACC5, $ACC5 517 vpmuludq 32*6-128($np), $Y1, $TEMP2 518 and \$0x1fffffff, %eax 519 vpaddq $TEMP2, $ACC6, $ACC6 520 vpmuludq 32*7-128($np), $Y1, $TEMP0 521 vpaddq $TEMP0, $ACC7, $ACC7 522 vpmuludq 32*8-128($np), $Y1, $TEMP1 523 vmovd %eax, $Y1 524 #vmovdqu 32*1-8-128($np), $TEMP2 # moved below 525 vpaddq $TEMP1, $ACC8, $ACC8 526 #vmovdqu 32*2-8-128($np), $TEMP0 # moved below 527 vpbroadcastq $Y1, $Y1 528 529 vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above 530 vmovdqu 32*3-8-128($np), $TEMP1 531 mov %rax, %rdx 532 imulq -128($np), %rax 533 vpaddq $TEMP2, $ACC1, $ACC1 534 vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above 535 vmovdqu 32*4-8-128($np), $TEMP2 536 add %rax, $r2 537 mov %rdx, %rax 538 imulq 8-128($np), %rax 539 vpaddq $TEMP0, $ACC2, $ACC2 540 add $r3, %rax 541 shr \$29, $r2 542 vpmuludq $Y2, $TEMP1, $TEMP1 543 vmovdqu 32*5-8-128($np), $TEMP0 544 add $r2, %rax 545 vpaddq $TEMP1, $ACC3, $ACC3 546 vpmuludq $Y2, $TEMP2, $TEMP2 547 vmovdqu 32*6-8-128($np), $TEMP1 548 .byte 0x67 549 mov %rax, $r3 550 imull $n0, %eax 551 vpaddq $TEMP2, $ACC4, $ACC4 552 vpmuludq $Y2, $TEMP0, $TEMP0 553 .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2 554 and \$0x1fffffff, %eax 555 vpaddq $TEMP0, $ACC5, $ACC5 556 vpmuludq $Y2, $TEMP1, $TEMP1 557 vmovdqu 32*8-8-128($np), $TEMP0 558 vpaddq $TEMP1, $ACC6, $ACC6 559 vpmuludq $Y2, $TEMP2, $TEMP2 560 vmovdqu 32*9-8-128($np), $ACC9 561 vmovd %eax, $ACC0 # borrow ACC0 for Y2 562 imulq -128($np), %rax 563 vpaddq $TEMP2, $ACC7, $ACC7 564 vpmuludq $Y2, $TEMP0, $TEMP0 565 vmovdqu 32*1-16-128($np), $TEMP1 566 vpbroadcastq $ACC0, $ACC0 567 vpaddq $TEMP0, $ACC8, $ACC8 568 vpmuludq $Y2, $ACC9, $ACC9 569 vmovdqu 32*2-16-128($np), $TEMP2 570 add %rax, $r3 571 572___ 573($ACC0,$Y2)=($Y2,$ACC0); 574$code.=<<___; 575 vmovdqu 32*1-24-128($np), $ACC0 576 vpmuludq $Y1, $TEMP1, $TEMP1 577 vmovdqu 32*3-16-128($np), $TEMP0 578 vpaddq $TEMP1, $ACC1, $ACC1 579 vpmuludq $Y2, $ACC0, $ACC0 580 vpmuludq $Y1, $TEMP2, $TEMP2 581 .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1 582 vpaddq $ACC1, $ACC0, $ACC0 583 vpaddq $TEMP2, $ACC2, $ACC2 584 vpmuludq $Y1, $TEMP0, $TEMP0 585 vmovdqu 32*5-16-128($np), $TEMP2 586 .byte 0x67 587 vmovq $ACC0, %rax 588 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 589 vpaddq $TEMP0, $ACC3, $ACC3 590 vpmuludq $Y1, $TEMP1, $TEMP1 591 vmovdqu 32*6-16-128($np), $TEMP0 592 vpaddq $TEMP1, $ACC4, $ACC4 593 vpmuludq $Y1, $TEMP2, $TEMP2 594 vmovdqu 32*7-16-128($np), $TEMP1 595 vpaddq $TEMP2, $ACC5, $ACC5 596 vpmuludq $Y1, $TEMP0, $TEMP0 597 vmovdqu 32*8-16-128($np), $TEMP2 598 vpaddq $TEMP0, $ACC6, $ACC6 599 vpmuludq $Y1, $TEMP1, $TEMP1 600 shr \$29, $r3 601 vmovdqu 32*9-16-128($np), $TEMP0 602 add $r3, %rax 603 vpaddq $TEMP1, $ACC7, $ACC7 604 vpmuludq $Y1, $TEMP2, $TEMP2 605 #vmovdqu 32*2-24-128($np), $TEMP1 # moved below 606 mov %rax, $r0 607 imull $n0, %eax 608 vpaddq $TEMP2, $ACC8, $ACC8 609 vpmuludq $Y1, $TEMP0, $TEMP0 610 and \$0x1fffffff, %eax 611 vmovd %eax, $Y1 612 vmovdqu 32*3-24-128($np), $TEMP2 613 .byte 0x67 614 vpaddq $TEMP0, $ACC9, $ACC9 615 vpbroadcastq $Y1, $Y1 616 617 vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above 618 vmovdqu 32*4-24-128($np), $TEMP0 619 mov %rax, %rdx 620 imulq -128($np), %rax 621 mov 8(%rsp), $r1 622 vpaddq $TEMP1, $ACC2, $ACC1 623 vpmuludq $Y2, $TEMP2, $TEMP2 624 vmovdqu 32*5-24-128($np), $TEMP1 625 add %rax, $r0 626 mov %rdx, %rax 627 imulq 8-128($np), %rax 628 .byte 0x67 629 shr \$29, $r0 630 mov 16(%rsp), $r2 631 vpaddq $TEMP2, $ACC3, $ACC2 632 vpmuludq $Y2, $TEMP0, $TEMP0 633 vmovdqu 32*6-24-128($np), $TEMP2 634 add %rax, $r1 635 mov %rdx, %rax 636 imulq 16-128($np), %rax 637 vpaddq $TEMP0, $ACC4, $ACC3 638 vpmuludq $Y2, $TEMP1, $TEMP1 639 vmovdqu 32*7-24-128($np), $TEMP0 640 imulq 24-128($np), %rdx # future $r3 641 add %rax, $r2 642 lea ($r0,$r1), %rax 643 vpaddq $TEMP1, $ACC5, $ACC4 644 vpmuludq $Y2, $TEMP2, $TEMP2 645 vmovdqu 32*8-24-128($np), $TEMP1 646 mov %rax, $r1 647 imull $n0, %eax 648 vpmuludq $Y2, $TEMP0, $TEMP0 649 vpaddq $TEMP2, $ACC6, $ACC5 650 vmovdqu 32*9-24-128($np), $TEMP2 651 and \$0x1fffffff, %eax 652 vpaddq $TEMP0, $ACC7, $ACC6 653 vpmuludq $Y2, $TEMP1, $TEMP1 654 add 24(%rsp), %rdx 655 vpaddq $TEMP1, $ACC8, $ACC7 656 vpmuludq $Y2, $TEMP2, $TEMP2 657 vpaddq $TEMP2, $ACC9, $ACC8 658 vmovq $r3, $ACC9 659 mov %rdx, $r3 660 661 dec $i 662 jnz .LOOP_REDUCE_1024 663___ 664($ACC0,$Y2)=($Y2,$ACC0); 665$code.=<<___; 666 lea 448(%rsp), $tp1 # size optimization 667 vpaddq $ACC9, $Y2, $ACC0 668 vpxor $ZERO, $ZERO, $ZERO 669 670 vpaddq 32*9-192($tp0), $ACC0, $ACC0 671 vpaddq 32*10-448($tp1), $ACC1, $ACC1 672 vpaddq 32*11-448($tp1), $ACC2, $ACC2 673 vpaddq 32*12-448($tp1), $ACC3, $ACC3 674 vpaddq 32*13-448($tp1), $ACC4, $ACC4 675 vpaddq 32*14-448($tp1), $ACC5, $ACC5 676 vpaddq 32*15-448($tp1), $ACC6, $ACC6 677 vpaddq 32*16-448($tp1), $ACC7, $ACC7 678 vpaddq 32*17-448($tp1), $ACC8, $ACC8 679 680 vpsrlq \$29, $ACC0, $TEMP1 681 vpand $AND_MASK, $ACC0, $ACC0 682 vpsrlq \$29, $ACC1, $TEMP2 683 vpand $AND_MASK, $ACC1, $ACC1 684 vpsrlq \$29, $ACC2, $TEMP3 685 vpermq \$0x93, $TEMP1, $TEMP1 686 vpand $AND_MASK, $ACC2, $ACC2 687 vpsrlq \$29, $ACC3, $TEMP4 688 vpermq \$0x93, $TEMP2, $TEMP2 689 vpand $AND_MASK, $ACC3, $ACC3 690 vpermq \$0x93, $TEMP3, $TEMP3 691 692 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 693 vpermq \$0x93, $TEMP4, $TEMP4 694 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 695 vpaddq $TEMP0, $ACC0, $ACC0 696 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 697 vpaddq $TEMP1, $ACC1, $ACC1 698 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 699 vpaddq $TEMP2, $ACC2, $ACC2 700 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 701 vpaddq $TEMP3, $ACC3, $ACC3 702 vpaddq $TEMP4, $ACC4, $ACC4 703 704 vpsrlq \$29, $ACC0, $TEMP1 705 vpand $AND_MASK, $ACC0, $ACC0 706 vpsrlq \$29, $ACC1, $TEMP2 707 vpand $AND_MASK, $ACC1, $ACC1 708 vpsrlq \$29, $ACC2, $TEMP3 709 vpermq \$0x93, $TEMP1, $TEMP1 710 vpand $AND_MASK, $ACC2, $ACC2 711 vpsrlq \$29, $ACC3, $TEMP4 712 vpermq \$0x93, $TEMP2, $TEMP2 713 vpand $AND_MASK, $ACC3, $ACC3 714 vpermq \$0x93, $TEMP3, $TEMP3 715 716 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 717 vpermq \$0x93, $TEMP4, $TEMP4 718 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 719 vpaddq $TEMP0, $ACC0, $ACC0 720 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 721 vpaddq $TEMP1, $ACC1, $ACC1 722 vmovdqu $ACC0, 32*0-128($rp) 723 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 724 vpaddq $TEMP2, $ACC2, $ACC2 725 vmovdqu $ACC1, 32*1-128($rp) 726 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 727 vpaddq $TEMP3, $ACC3, $ACC3 728 vmovdqu $ACC2, 32*2-128($rp) 729 vpaddq $TEMP4, $ACC4, $ACC4 730 vmovdqu $ACC3, 32*3-128($rp) 731___ 732$TEMP5=$ACC0; 733$code.=<<___; 734 vpsrlq \$29, $ACC4, $TEMP1 735 vpand $AND_MASK, $ACC4, $ACC4 736 vpsrlq \$29, $ACC5, $TEMP2 737 vpand $AND_MASK, $ACC5, $ACC5 738 vpsrlq \$29, $ACC6, $TEMP3 739 vpermq \$0x93, $TEMP1, $TEMP1 740 vpand $AND_MASK, $ACC6, $ACC6 741 vpsrlq \$29, $ACC7, $TEMP4 742 vpermq \$0x93, $TEMP2, $TEMP2 743 vpand $AND_MASK, $ACC7, $ACC7 744 vpsrlq \$29, $ACC8, $TEMP5 745 vpermq \$0x93, $TEMP3, $TEMP3 746 vpand $AND_MASK, $ACC8, $ACC8 747 vpermq \$0x93, $TEMP4, $TEMP4 748 749 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 750 vpermq \$0x93, $TEMP5, $TEMP5 751 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 752 vpaddq $TEMP0, $ACC4, $ACC4 753 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 754 vpaddq $TEMP1, $ACC5, $ACC5 755 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 756 vpaddq $TEMP2, $ACC6, $ACC6 757 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 758 vpaddq $TEMP3, $ACC7, $ACC7 759 vpaddq $TEMP4, $ACC8, $ACC8 760 761 vpsrlq \$29, $ACC4, $TEMP1 762 vpand $AND_MASK, $ACC4, $ACC4 763 vpsrlq \$29, $ACC5, $TEMP2 764 vpand $AND_MASK, $ACC5, $ACC5 765 vpsrlq \$29, $ACC6, $TEMP3 766 vpermq \$0x93, $TEMP1, $TEMP1 767 vpand $AND_MASK, $ACC6, $ACC6 768 vpsrlq \$29, $ACC7, $TEMP4 769 vpermq \$0x93, $TEMP2, $TEMP2 770 vpand $AND_MASK, $ACC7, $ACC7 771 vpsrlq \$29, $ACC8, $TEMP5 772 vpermq \$0x93, $TEMP3, $TEMP3 773 vpand $AND_MASK, $ACC8, $ACC8 774 vpermq \$0x93, $TEMP4, $TEMP4 775 776 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 777 vpermq \$0x93, $TEMP5, $TEMP5 778 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 779 vpaddq $TEMP0, $ACC4, $ACC4 780 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 781 vpaddq $TEMP1, $ACC5, $ACC5 782 vmovdqu $ACC4, 32*4-128($rp) 783 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 784 vpaddq $TEMP2, $ACC6, $ACC6 785 vmovdqu $ACC5, 32*5-128($rp) 786 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 787 vpaddq $TEMP3, $ACC7, $ACC7 788 vmovdqu $ACC6, 32*6-128($rp) 789 vpaddq $TEMP4, $ACC8, $ACC8 790 vmovdqu $ACC7, 32*7-128($rp) 791 vmovdqu $ACC8, 32*8-128($rp) 792 793 mov $rp, $ap 794 dec $rep 795 jne .LOOP_GRANDE_SQR_1024 796 797 vzeroall 798 mov %rbp, %rax 799.cfi_def_cfa_register %rax 800___ 801$code.=<<___ if ($win64); 802.Lsqr_1024_in_tail: 803 movaps -0xd8(%rax),%xmm6 804 movaps -0xc8(%rax),%xmm7 805 movaps -0xb8(%rax),%xmm8 806 movaps -0xa8(%rax),%xmm9 807 movaps -0x98(%rax),%xmm10 808 movaps -0x88(%rax),%xmm11 809 movaps -0x78(%rax),%xmm12 810 movaps -0x68(%rax),%xmm13 811 movaps -0x58(%rax),%xmm14 812 movaps -0x48(%rax),%xmm15 813___ 814$code.=<<___; 815 mov -48(%rax),%r15 816.cfi_restore %r15 817 mov -40(%rax),%r14 818.cfi_restore %r14 819 mov -32(%rax),%r13 820.cfi_restore %r13 821 mov -24(%rax),%r12 822.cfi_restore %r12 823 mov -16(%rax),%rbp 824.cfi_restore %rbp 825 mov -8(%rax),%rbx 826.cfi_restore %rbx 827 lea (%rax),%rsp # restore %rsp 828.cfi_def_cfa_register %rsp 829.Lsqr_1024_epilogue: 830 ret 831.cfi_endproc 832.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 833___ 834} 835 836{ # void AMM_WW( 837my $rp="%rdi"; # BN_ULONG *rp, 838my $ap="%rsi"; # const BN_ULONG *ap, 839my $bp="%rdx"; # const BN_ULONG *bp, 840my $np="%rcx"; # const BN_ULONG *np, 841my $n0="%r8d"; # unsigned int n0); 842 843# The registers that hold the accumulated redundant result 844# The AMM works on 1024 bit operands, and redundant word size is 29 845# Therefore: ceil(1024/29)/4 = 9 846my $ACC0="%ymm0"; 847my $ACC1="%ymm1"; 848my $ACC2="%ymm2"; 849my $ACC3="%ymm3"; 850my $ACC4="%ymm4"; 851my $ACC5="%ymm5"; 852my $ACC6="%ymm6"; 853my $ACC7="%ymm7"; 854my $ACC8="%ymm8"; 855my $ACC9="%ymm9"; 856 857# Registers that hold the broadcasted words of multiplier, currently used 858my $Bi="%ymm10"; 859my $Yi="%ymm11"; 860 861# Helper registers 862my $TEMP0=$ACC0; 863my $TEMP1="%ymm12"; 864my $TEMP2="%ymm13"; 865my $ZERO="%ymm14"; 866my $AND_MASK="%ymm15"; 867 868# alu registers that hold the first words of the ACC 869my $r0="%r9"; 870my $r1="%r10"; 871my $r2="%r11"; 872my $r3="%r12"; 873 874my $i="%r14d"; 875my $tmp="%r15"; 876 877$bp="%r13"; # reassigned argument 878 879$code.=<<___; 880.globl rsaz_1024_mul_avx2 881.type rsaz_1024_mul_avx2,\@function,5 882.align 64 883rsaz_1024_mul_avx2: 884.cfi_startproc 885 lea (%rsp), %rax 886.cfi_def_cfa_register %rax 887 push %rbx 888.cfi_push %rbx 889 push %rbp 890.cfi_push %rbp 891 push %r12 892.cfi_push %r12 893 push %r13 894.cfi_push %r13 895 push %r14 896.cfi_push %r14 897 push %r15 898.cfi_push %r15 899___ 900$code.=<<___ if ($win64); 901 vzeroupper 902 lea -0xa8(%rsp),%rsp 903 vmovaps %xmm6,-0xd8(%rax) 904 vmovaps %xmm7,-0xc8(%rax) 905 vmovaps %xmm8,-0xb8(%rax) 906 vmovaps %xmm9,-0xa8(%rax) 907 vmovaps %xmm10,-0x98(%rax) 908 vmovaps %xmm11,-0x88(%rax) 909 vmovaps %xmm12,-0x78(%rax) 910 vmovaps %xmm13,-0x68(%rax) 911 vmovaps %xmm14,-0x58(%rax) 912 vmovaps %xmm15,-0x48(%rax) 913.Lmul_1024_body: 914___ 915$code.=<<___; 916 mov %rax,%rbp 917.cfi_def_cfa_register %rbp 918 vzeroall 919 mov %rdx, $bp # reassigned argument 920 sub \$64,%rsp 921 922 # unaligned 256-bit load that crosses page boundary can 923 # cause severe performance degradation here, so if $ap does 924 # cross page boundary, swap it with $bp [meaning that caller 925 # is advised to lay down $ap and $bp next to each other, so 926 # that only one can cross page boundary]. 927 .byte 0x67,0x67 928 mov $ap, $tmp 929 and \$4095, $tmp 930 add \$32*10, $tmp 931 shr \$12, $tmp 932 mov $ap, $tmp 933 cmovnz $bp, $ap 934 cmovnz $tmp, $bp 935 936 mov $np, $tmp 937 sub \$-128,$ap # size optimization 938 sub \$-128,$np 939 sub \$-128,$rp 940 941 and \$4095, $tmp # see if $np crosses page 942 add \$32*10, $tmp 943 .byte 0x67,0x67 944 shr \$12, $tmp 945 jz .Lmul_1024_no_n_copy 946 947 # unaligned 256-bit load that crosses page boundary can 948 # cause severe performance degradation here, so if $np does 949 # cross page boundary, copy it to stack and make sure stack 950 # frame doesn't... 951 sub \$32*10,%rsp 952 vmovdqu 32*0-128($np), $ACC0 953 and \$-512, %rsp 954 vmovdqu 32*1-128($np), $ACC1 955 vmovdqu 32*2-128($np), $ACC2 956 vmovdqu 32*3-128($np), $ACC3 957 vmovdqu 32*4-128($np), $ACC4 958 vmovdqu 32*5-128($np), $ACC5 959 vmovdqu 32*6-128($np), $ACC6 960 vmovdqu 32*7-128($np), $ACC7 961 vmovdqu 32*8-128($np), $ACC8 962 lea 64+128(%rsp),$np 963 vmovdqu $ACC0, 32*0-128($np) 964 vpxor $ACC0, $ACC0, $ACC0 965 vmovdqu $ACC1, 32*1-128($np) 966 vpxor $ACC1, $ACC1, $ACC1 967 vmovdqu $ACC2, 32*2-128($np) 968 vpxor $ACC2, $ACC2, $ACC2 969 vmovdqu $ACC3, 32*3-128($np) 970 vpxor $ACC3, $ACC3, $ACC3 971 vmovdqu $ACC4, 32*4-128($np) 972 vpxor $ACC4, $ACC4, $ACC4 973 vmovdqu $ACC5, 32*5-128($np) 974 vpxor $ACC5, $ACC5, $ACC5 975 vmovdqu $ACC6, 32*6-128($np) 976 vpxor $ACC6, $ACC6, $ACC6 977 vmovdqu $ACC7, 32*7-128($np) 978 vpxor $ACC7, $ACC7, $ACC7 979 vmovdqu $ACC8, 32*8-128($np) 980 vmovdqa $ACC0, $ACC8 981 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall 982.Lmul_1024_no_n_copy: 983 and \$-64,%rsp 984 985 mov ($bp), %rbx 986 vpbroadcastq ($bp), $Bi 987 vmovdqu $ACC0, (%rsp) # clear top of stack 988 xor $r0, $r0 989 .byte 0x67 990 xor $r1, $r1 991 xor $r2, $r2 992 xor $r3, $r3 993 994 vmovdqu .Land_mask(%rip), $AND_MASK 995 mov \$9, $i 996 vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall 997 jmp .Loop_mul_1024 998 999.align 32 1000.Loop_mul_1024: 1001 vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*) 1002 mov %rbx, %rax 1003 imulq -128($ap), %rax 1004 add $r0, %rax 1005 mov %rbx, $r1 1006 imulq 8-128($ap), $r1 1007 add 8(%rsp), $r1 1008 1009 mov %rax, $r0 1010 imull $n0, %eax 1011 and \$0x1fffffff, %eax 1012 1013 mov %rbx, $r2 1014 imulq 16-128($ap), $r2 1015 add 16(%rsp), $r2 1016 1017 mov %rbx, $r3 1018 imulq 24-128($ap), $r3 1019 add 24(%rsp), $r3 1020 vpmuludq 32*1-128($ap),$Bi,$TEMP0 1021 vmovd %eax, $Yi 1022 vpaddq $TEMP0,$ACC1,$ACC1 1023 vpmuludq 32*2-128($ap),$Bi,$TEMP1 1024 vpbroadcastq $Yi, $Yi 1025 vpaddq $TEMP1,$ACC2,$ACC2 1026 vpmuludq 32*3-128($ap),$Bi,$TEMP2 1027 vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3 1028 vpaddq $TEMP2,$ACC3,$ACC3 1029 vpmuludq 32*4-128($ap),$Bi,$TEMP0 1030 vpaddq $TEMP0,$ACC4,$ACC4 1031 vpmuludq 32*5-128($ap),$Bi,$TEMP1 1032 vpaddq $TEMP1,$ACC5,$ACC5 1033 vpmuludq 32*6-128($ap),$Bi,$TEMP2 1034 vpaddq $TEMP2,$ACC6,$ACC6 1035 vpmuludq 32*7-128($ap),$Bi,$TEMP0 1036 vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3 1037 vpaddq $TEMP0,$ACC7,$ACC7 1038 vpmuludq 32*8-128($ap),$Bi,$TEMP1 1039 vpbroadcastq 8($bp), $Bi 1040 vpaddq $TEMP1,$ACC8,$ACC8 1041 1042 mov %rax,%rdx 1043 imulq -128($np),%rax 1044 add %rax,$r0 1045 mov %rdx,%rax 1046 imulq 8-128($np),%rax 1047 add %rax,$r1 1048 mov %rdx,%rax 1049 imulq 16-128($np),%rax 1050 add %rax,$r2 1051 shr \$29, $r0 1052 imulq 24-128($np),%rdx 1053 add %rdx,$r3 1054 add $r0, $r1 1055 1056 vpmuludq 32*1-128($np),$Yi,$TEMP2 1057 vmovq $Bi, %rbx 1058 vpaddq $TEMP2,$ACC1,$ACC1 1059 vpmuludq 32*2-128($np),$Yi,$TEMP0 1060 vpaddq $TEMP0,$ACC2,$ACC2 1061 vpmuludq 32*3-128($np),$Yi,$TEMP1 1062 vpaddq $TEMP1,$ACC3,$ACC3 1063 vpmuludq 32*4-128($np),$Yi,$TEMP2 1064 vpaddq $TEMP2,$ACC4,$ACC4 1065 vpmuludq 32*5-128($np),$Yi,$TEMP0 1066 vpaddq $TEMP0,$ACC5,$ACC5 1067 vpmuludq 32*6-128($np),$Yi,$TEMP1 1068 vpaddq $TEMP1,$ACC6,$ACC6 1069 vpmuludq 32*7-128($np),$Yi,$TEMP2 1070 vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3 1071 vpaddq $TEMP2,$ACC7,$ACC7 1072 vpmuludq 32*8-128($np),$Yi,$TEMP0 1073 vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3 1074 vpaddq $TEMP0,$ACC8,$ACC8 1075 1076 mov %rbx, %rax 1077 imulq -128($ap),%rax 1078 add %rax,$r1 1079 vmovdqu -8+32*1-128($ap),$TEMP1 1080 mov %rbx, %rax 1081 imulq 8-128($ap),%rax 1082 add %rax,$r2 1083 vmovdqu -8+32*2-128($ap),$TEMP2 1084 1085 mov $r1, %rax 1086 vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3 1087 imull $n0, %eax 1088 vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3 1089 and \$0x1fffffff, %eax 1090 1091 imulq 16-128($ap),%rbx 1092 add %rbx,$r3 1093 vpmuludq $Bi,$TEMP1,$TEMP1 1094 vmovd %eax, $Yi 1095 vmovdqu -8+32*3-128($ap),$TEMP0 1096 vpaddq $TEMP1,$ACC1,$ACC1 1097 vpmuludq $Bi,$TEMP2,$TEMP2 1098 vpbroadcastq $Yi, $Yi 1099 vmovdqu -8+32*4-128($ap),$TEMP1 1100 vpaddq $TEMP2,$ACC2,$ACC2 1101 vpmuludq $Bi,$TEMP0,$TEMP0 1102 vmovdqu -8+32*5-128($ap),$TEMP2 1103 vpaddq $TEMP0,$ACC3,$ACC3 1104 vpmuludq $Bi,$TEMP1,$TEMP1 1105 vmovdqu -8+32*6-128($ap),$TEMP0 1106 vpaddq $TEMP1,$ACC4,$ACC4 1107 vpmuludq $Bi,$TEMP2,$TEMP2 1108 vmovdqu -8+32*7-128($ap),$TEMP1 1109 vpaddq $TEMP2,$ACC5,$ACC5 1110 vpmuludq $Bi,$TEMP0,$TEMP0 1111 vmovdqu -8+32*8-128($ap),$TEMP2 1112 vpaddq $TEMP0,$ACC6,$ACC6 1113 vpmuludq $Bi,$TEMP1,$TEMP1 1114 vmovdqu -8+32*9-128($ap),$ACC9 1115 vpaddq $TEMP1,$ACC7,$ACC7 1116 vpmuludq $Bi,$TEMP2,$TEMP2 1117 vpaddq $TEMP2,$ACC8,$ACC8 1118 vpmuludq $Bi,$ACC9,$ACC9 1119 vpbroadcastq 16($bp), $Bi 1120 1121 mov %rax,%rdx 1122 imulq -128($np),%rax 1123 add %rax,$r1 1124 vmovdqu -8+32*1-128($np),$TEMP0 1125 mov %rdx,%rax 1126 imulq 8-128($np),%rax 1127 add %rax,$r2 1128 vmovdqu -8+32*2-128($np),$TEMP1 1129 shr \$29, $r1 1130 imulq 16-128($np),%rdx 1131 add %rdx,$r3 1132 add $r1, $r2 1133 1134 vpmuludq $Yi,$TEMP0,$TEMP0 1135 vmovq $Bi, %rbx 1136 vmovdqu -8+32*3-128($np),$TEMP2 1137 vpaddq $TEMP0,$ACC1,$ACC1 1138 vpmuludq $Yi,$TEMP1,$TEMP1 1139 vmovdqu -8+32*4-128($np),$TEMP0 1140 vpaddq $TEMP1,$ACC2,$ACC2 1141 vpmuludq $Yi,$TEMP2,$TEMP2 1142 vmovdqu -8+32*5-128($np),$TEMP1 1143 vpaddq $TEMP2,$ACC3,$ACC3 1144 vpmuludq $Yi,$TEMP0,$TEMP0 1145 vmovdqu -8+32*6-128($np),$TEMP2 1146 vpaddq $TEMP0,$ACC4,$ACC4 1147 vpmuludq $Yi,$TEMP1,$TEMP1 1148 vmovdqu -8+32*7-128($np),$TEMP0 1149 vpaddq $TEMP1,$ACC5,$ACC5 1150 vpmuludq $Yi,$TEMP2,$TEMP2 1151 vmovdqu -8+32*8-128($np),$TEMP1 1152 vpaddq $TEMP2,$ACC6,$ACC6 1153 vpmuludq $Yi,$TEMP0,$TEMP0 1154 vmovdqu -8+32*9-128($np),$TEMP2 1155 vpaddq $TEMP0,$ACC7,$ACC7 1156 vpmuludq $Yi,$TEMP1,$TEMP1 1157 vpaddq $TEMP1,$ACC8,$ACC8 1158 vpmuludq $Yi,$TEMP2,$TEMP2 1159 vpaddq $TEMP2,$ACC9,$ACC9 1160 1161 vmovdqu -16+32*1-128($ap),$TEMP0 1162 mov %rbx,%rax 1163 imulq -128($ap),%rax 1164 add $r2,%rax 1165 1166 vmovdqu -16+32*2-128($ap),$TEMP1 1167 mov %rax,$r2 1168 imull $n0, %eax 1169 and \$0x1fffffff, %eax 1170 1171 imulq 8-128($ap),%rbx 1172 add %rbx,$r3 1173 vpmuludq $Bi,$TEMP0,$TEMP0 1174 vmovd %eax, $Yi 1175 vmovdqu -16+32*3-128($ap),$TEMP2 1176 vpaddq $TEMP0,$ACC1,$ACC1 1177 vpmuludq $Bi,$TEMP1,$TEMP1 1178 vpbroadcastq $Yi, $Yi 1179 vmovdqu -16+32*4-128($ap),$TEMP0 1180 vpaddq $TEMP1,$ACC2,$ACC2 1181 vpmuludq $Bi,$TEMP2,$TEMP2 1182 vmovdqu -16+32*5-128($ap),$TEMP1 1183 vpaddq $TEMP2,$ACC3,$ACC3 1184 vpmuludq $Bi,$TEMP0,$TEMP0 1185 vmovdqu -16+32*6-128($ap),$TEMP2 1186 vpaddq $TEMP0,$ACC4,$ACC4 1187 vpmuludq $Bi,$TEMP1,$TEMP1 1188 vmovdqu -16+32*7-128($ap),$TEMP0 1189 vpaddq $TEMP1,$ACC5,$ACC5 1190 vpmuludq $Bi,$TEMP2,$TEMP2 1191 vmovdqu -16+32*8-128($ap),$TEMP1 1192 vpaddq $TEMP2,$ACC6,$ACC6 1193 vpmuludq $Bi,$TEMP0,$TEMP0 1194 vmovdqu -16+32*9-128($ap),$TEMP2 1195 vpaddq $TEMP0,$ACC7,$ACC7 1196 vpmuludq $Bi,$TEMP1,$TEMP1 1197 vpaddq $TEMP1,$ACC8,$ACC8 1198 vpmuludq $Bi,$TEMP2,$TEMP2 1199 vpbroadcastq 24($bp), $Bi 1200 vpaddq $TEMP2,$ACC9,$ACC9 1201 1202 vmovdqu -16+32*1-128($np),$TEMP0 1203 mov %rax,%rdx 1204 imulq -128($np),%rax 1205 add %rax,$r2 1206 vmovdqu -16+32*2-128($np),$TEMP1 1207 imulq 8-128($np),%rdx 1208 add %rdx,$r3 1209 shr \$29, $r2 1210 1211 vpmuludq $Yi,$TEMP0,$TEMP0 1212 vmovq $Bi, %rbx 1213 vmovdqu -16+32*3-128($np),$TEMP2 1214 vpaddq $TEMP0,$ACC1,$ACC1 1215 vpmuludq $Yi,$TEMP1,$TEMP1 1216 vmovdqu -16+32*4-128($np),$TEMP0 1217 vpaddq $TEMP1,$ACC2,$ACC2 1218 vpmuludq $Yi,$TEMP2,$TEMP2 1219 vmovdqu -16+32*5-128($np),$TEMP1 1220 vpaddq $TEMP2,$ACC3,$ACC3 1221 vpmuludq $Yi,$TEMP0,$TEMP0 1222 vmovdqu -16+32*6-128($np),$TEMP2 1223 vpaddq $TEMP0,$ACC4,$ACC4 1224 vpmuludq $Yi,$TEMP1,$TEMP1 1225 vmovdqu -16+32*7-128($np),$TEMP0 1226 vpaddq $TEMP1,$ACC5,$ACC5 1227 vpmuludq $Yi,$TEMP2,$TEMP2 1228 vmovdqu -16+32*8-128($np),$TEMP1 1229 vpaddq $TEMP2,$ACC6,$ACC6 1230 vpmuludq $Yi,$TEMP0,$TEMP0 1231 vmovdqu -16+32*9-128($np),$TEMP2 1232 vpaddq $TEMP0,$ACC7,$ACC7 1233 vpmuludq $Yi,$TEMP1,$TEMP1 1234 vmovdqu -24+32*1-128($ap),$TEMP0 1235 vpaddq $TEMP1,$ACC8,$ACC8 1236 vpmuludq $Yi,$TEMP2,$TEMP2 1237 vmovdqu -24+32*2-128($ap),$TEMP1 1238 vpaddq $TEMP2,$ACC9,$ACC9 1239 1240 add $r2, $r3 1241 imulq -128($ap),%rbx 1242 add %rbx,$r3 1243 1244 mov $r3, %rax 1245 imull $n0, %eax 1246 and \$0x1fffffff, %eax 1247 1248 vpmuludq $Bi,$TEMP0,$TEMP0 1249 vmovd %eax, $Yi 1250 vmovdqu -24+32*3-128($ap),$TEMP2 1251 vpaddq $TEMP0,$ACC1,$ACC1 1252 vpmuludq $Bi,$TEMP1,$TEMP1 1253 vpbroadcastq $Yi, $Yi 1254 vmovdqu -24+32*4-128($ap),$TEMP0 1255 vpaddq $TEMP1,$ACC2,$ACC2 1256 vpmuludq $Bi,$TEMP2,$TEMP2 1257 vmovdqu -24+32*5-128($ap),$TEMP1 1258 vpaddq $TEMP2,$ACC3,$ACC3 1259 vpmuludq $Bi,$TEMP0,$TEMP0 1260 vmovdqu -24+32*6-128($ap),$TEMP2 1261 vpaddq $TEMP0,$ACC4,$ACC4 1262 vpmuludq $Bi,$TEMP1,$TEMP1 1263 vmovdqu -24+32*7-128($ap),$TEMP0 1264 vpaddq $TEMP1,$ACC5,$ACC5 1265 vpmuludq $Bi,$TEMP2,$TEMP2 1266 vmovdqu -24+32*8-128($ap),$TEMP1 1267 vpaddq $TEMP2,$ACC6,$ACC6 1268 vpmuludq $Bi,$TEMP0,$TEMP0 1269 vmovdqu -24+32*9-128($ap),$TEMP2 1270 vpaddq $TEMP0,$ACC7,$ACC7 1271 vpmuludq $Bi,$TEMP1,$TEMP1 1272 vpaddq $TEMP1,$ACC8,$ACC8 1273 vpmuludq $Bi,$TEMP2,$TEMP2 1274 vpbroadcastq 32($bp), $Bi 1275 vpaddq $TEMP2,$ACC9,$ACC9 1276 add \$32, $bp # $bp++ 1277 1278 vmovdqu -24+32*1-128($np),$TEMP0 1279 imulq -128($np),%rax 1280 add %rax,$r3 1281 shr \$29, $r3 1282 1283 vmovdqu -24+32*2-128($np),$TEMP1 1284 vpmuludq $Yi,$TEMP0,$TEMP0 1285 vmovq $Bi, %rbx 1286 vmovdqu -24+32*3-128($np),$TEMP2 1287 vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0 1288 vpmuludq $Yi,$TEMP1,$TEMP1 1289 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 1290 vpaddq $TEMP1,$ACC2,$ACC1 1291 vmovdqu -24+32*4-128($np),$TEMP0 1292 vpmuludq $Yi,$TEMP2,$TEMP2 1293 vmovdqu -24+32*5-128($np),$TEMP1 1294 vpaddq $TEMP2,$ACC3,$ACC2 1295 vpmuludq $Yi,$TEMP0,$TEMP0 1296 vmovdqu -24+32*6-128($np),$TEMP2 1297 vpaddq $TEMP0,$ACC4,$ACC3 1298 vpmuludq $Yi,$TEMP1,$TEMP1 1299 vmovdqu -24+32*7-128($np),$TEMP0 1300 vpaddq $TEMP1,$ACC5,$ACC4 1301 vpmuludq $Yi,$TEMP2,$TEMP2 1302 vmovdqu -24+32*8-128($np),$TEMP1 1303 vpaddq $TEMP2,$ACC6,$ACC5 1304 vpmuludq $Yi,$TEMP0,$TEMP0 1305 vmovdqu -24+32*9-128($np),$TEMP2 1306 mov $r3, $r0 1307 vpaddq $TEMP0,$ACC7,$ACC6 1308 vpmuludq $Yi,$TEMP1,$TEMP1 1309 add (%rsp), $r0 1310 vpaddq $TEMP1,$ACC8,$ACC7 1311 vpmuludq $Yi,$TEMP2,$TEMP2 1312 vmovq $r3, $TEMP1 1313 vpaddq $TEMP2,$ACC9,$ACC8 1314 1315 dec $i 1316 jnz .Loop_mul_1024 1317___ 1318 1319# (*) Original implementation was correcting ACC1-ACC3 for overflow 1320# after 7 loop runs, or after 28 iterations, or 56 additions. 1321# But as we underutilize resources, it's possible to correct in 1322# each iteration with marginal performance loss. But then, as 1323# we do it in each iteration, we can correct less digits, and 1324# avoid performance penalties completely. 1325 1326$TEMP0 = $ACC9; 1327$TEMP3 = $Bi; 1328$TEMP4 = $Yi; 1329$code.=<<___; 1330 vpaddq (%rsp), $TEMP1, $ACC0 1331 1332 vpsrlq \$29, $ACC0, $TEMP1 1333 vpand $AND_MASK, $ACC0, $ACC0 1334 vpsrlq \$29, $ACC1, $TEMP2 1335 vpand $AND_MASK, $ACC1, $ACC1 1336 vpsrlq \$29, $ACC2, $TEMP3 1337 vpermq \$0x93, $TEMP1, $TEMP1 1338 vpand $AND_MASK, $ACC2, $ACC2 1339 vpsrlq \$29, $ACC3, $TEMP4 1340 vpermq \$0x93, $TEMP2, $TEMP2 1341 vpand $AND_MASK, $ACC3, $ACC3 1342 1343 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1344 vpermq \$0x93, $TEMP3, $TEMP3 1345 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1346 vpermq \$0x93, $TEMP4, $TEMP4 1347 vpaddq $TEMP0, $ACC0, $ACC0 1348 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1349 vpaddq $TEMP1, $ACC1, $ACC1 1350 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1351 vpaddq $TEMP2, $ACC2, $ACC2 1352 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 1353 vpaddq $TEMP3, $ACC3, $ACC3 1354 vpaddq $TEMP4, $ACC4, $ACC4 1355 1356 vpsrlq \$29, $ACC0, $TEMP1 1357 vpand $AND_MASK, $ACC0, $ACC0 1358 vpsrlq \$29, $ACC1, $TEMP2 1359 vpand $AND_MASK, $ACC1, $ACC1 1360 vpsrlq \$29, $ACC2, $TEMP3 1361 vpermq \$0x93, $TEMP1, $TEMP1 1362 vpand $AND_MASK, $ACC2, $ACC2 1363 vpsrlq \$29, $ACC3, $TEMP4 1364 vpermq \$0x93, $TEMP2, $TEMP2 1365 vpand $AND_MASK, $ACC3, $ACC3 1366 vpermq \$0x93, $TEMP3, $TEMP3 1367 1368 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1369 vpermq \$0x93, $TEMP4, $TEMP4 1370 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1371 vpaddq $TEMP0, $ACC0, $ACC0 1372 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1373 vpaddq $TEMP1, $ACC1, $ACC1 1374 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1375 vpaddq $TEMP2, $ACC2, $ACC2 1376 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 1377 vpaddq $TEMP3, $ACC3, $ACC3 1378 vpaddq $TEMP4, $ACC4, $ACC4 1379 1380 vmovdqu $ACC0, 0-128($rp) 1381 vmovdqu $ACC1, 32-128($rp) 1382 vmovdqu $ACC2, 64-128($rp) 1383 vmovdqu $ACC3, 96-128($rp) 1384___ 1385 1386$TEMP5=$ACC0; 1387$code.=<<___; 1388 vpsrlq \$29, $ACC4, $TEMP1 1389 vpand $AND_MASK, $ACC4, $ACC4 1390 vpsrlq \$29, $ACC5, $TEMP2 1391 vpand $AND_MASK, $ACC5, $ACC5 1392 vpsrlq \$29, $ACC6, $TEMP3 1393 vpermq \$0x93, $TEMP1, $TEMP1 1394 vpand $AND_MASK, $ACC6, $ACC6 1395 vpsrlq \$29, $ACC7, $TEMP4 1396 vpermq \$0x93, $TEMP2, $TEMP2 1397 vpand $AND_MASK, $ACC7, $ACC7 1398 vpsrlq \$29, $ACC8, $TEMP5 1399 vpermq \$0x93, $TEMP3, $TEMP3 1400 vpand $AND_MASK, $ACC8, $ACC8 1401 vpermq \$0x93, $TEMP4, $TEMP4 1402 1403 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1404 vpermq \$0x93, $TEMP5, $TEMP5 1405 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1406 vpaddq $TEMP0, $ACC4, $ACC4 1407 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1408 vpaddq $TEMP1, $ACC5, $ACC5 1409 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1410 vpaddq $TEMP2, $ACC6, $ACC6 1411 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 1412 vpaddq $TEMP3, $ACC7, $ACC7 1413 vpaddq $TEMP4, $ACC8, $ACC8 1414 1415 vpsrlq \$29, $ACC4, $TEMP1 1416 vpand $AND_MASK, $ACC4, $ACC4 1417 vpsrlq \$29, $ACC5, $TEMP2 1418 vpand $AND_MASK, $ACC5, $ACC5 1419 vpsrlq \$29, $ACC6, $TEMP3 1420 vpermq \$0x93, $TEMP1, $TEMP1 1421 vpand $AND_MASK, $ACC6, $ACC6 1422 vpsrlq \$29, $ACC7, $TEMP4 1423 vpermq \$0x93, $TEMP2, $TEMP2 1424 vpand $AND_MASK, $ACC7, $ACC7 1425 vpsrlq \$29, $ACC8, $TEMP5 1426 vpermq \$0x93, $TEMP3, $TEMP3 1427 vpand $AND_MASK, $ACC8, $ACC8 1428 vpermq \$0x93, $TEMP4, $TEMP4 1429 1430 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1431 vpermq \$0x93, $TEMP5, $TEMP5 1432 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1433 vpaddq $TEMP0, $ACC4, $ACC4 1434 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1435 vpaddq $TEMP1, $ACC5, $ACC5 1436 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1437 vpaddq $TEMP2, $ACC6, $ACC6 1438 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 1439 vpaddq $TEMP3, $ACC7, $ACC7 1440 vpaddq $TEMP4, $ACC8, $ACC8 1441 1442 vmovdqu $ACC4, 128-128($rp) 1443 vmovdqu $ACC5, 160-128($rp) 1444 vmovdqu $ACC6, 192-128($rp) 1445 vmovdqu $ACC7, 224-128($rp) 1446 vmovdqu $ACC8, 256-128($rp) 1447 vzeroupper 1448 1449 mov %rbp, %rax 1450.cfi_def_cfa_register %rax 1451___ 1452$code.=<<___ if ($win64); 1453.Lmul_1024_in_tail: 1454 movaps -0xd8(%rax),%xmm6 1455 movaps -0xc8(%rax),%xmm7 1456 movaps -0xb8(%rax),%xmm8 1457 movaps -0xa8(%rax),%xmm9 1458 movaps -0x98(%rax),%xmm10 1459 movaps -0x88(%rax),%xmm11 1460 movaps -0x78(%rax),%xmm12 1461 movaps -0x68(%rax),%xmm13 1462 movaps -0x58(%rax),%xmm14 1463 movaps -0x48(%rax),%xmm15 1464___ 1465$code.=<<___; 1466 mov -48(%rax),%r15 1467.cfi_restore %r15 1468 mov -40(%rax),%r14 1469.cfi_restore %r14 1470 mov -32(%rax),%r13 1471.cfi_restore %r13 1472 mov -24(%rax),%r12 1473.cfi_restore %r12 1474 mov -16(%rax),%rbp 1475.cfi_restore %rbp 1476 mov -8(%rax),%rbx 1477.cfi_restore %rbx 1478 lea (%rax),%rsp # restore %rsp 1479.cfi_def_cfa_register %rsp 1480.Lmul_1024_epilogue: 1481 ret 1482.cfi_endproc 1483.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 1484___ 1485} 1486{ 1487my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi"); 1488my @T = map("%r$_",(8..11)); 1489 1490$code.=<<___; 1491.globl rsaz_1024_red2norm_avx2 1492.type rsaz_1024_red2norm_avx2,\@abi-omnipotent 1493.align 32 1494rsaz_1024_red2norm_avx2: 1495.cfi_startproc 1496 sub \$-128,$inp # size optimization 1497 xor %rax,%rax 1498___ 1499 1500for ($j=0,$i=0; $i<16; $i++) { 1501 my $k=0; 1502 while (29*$j<64*($i+1)) { # load data till boundary 1503 $code.=" mov `8*$j-128`($inp), @T[0]\n"; 1504 $j++; $k++; push(@T,shift(@T)); 1505 } 1506 $l=$k; 1507 while ($k>1) { # shift loaded data but last value 1508 $code.=" shl \$`29*($j-$k)`,@T[-$k]\n"; 1509 $k--; 1510 } 1511 $code.=<<___; # shift last value 1512 mov @T[-1], @T[0] 1513 shl \$`29*($j-1)`, @T[-1] 1514 shr \$`-29*($j-1)`, @T[0] 1515___ 1516 while ($l) { # accumulate all values 1517 $code.=" add @T[-$l], %rax\n"; 1518 $l--; 1519 } 1520 $code.=<<___; 1521 adc \$0, @T[0] # consume eventual carry 1522 mov %rax, 8*$i($out) 1523 mov @T[0], %rax 1524___ 1525 push(@T,shift(@T)); 1526} 1527$code.=<<___; 1528 ret 1529.cfi_endproc 1530.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 1531 1532.globl rsaz_1024_norm2red_avx2 1533.type rsaz_1024_norm2red_avx2,\@abi-omnipotent 1534.align 32 1535rsaz_1024_norm2red_avx2: 1536.cfi_startproc 1537 sub \$-128,$out # size optimization 1538 mov ($inp),@T[0] 1539 mov \$0x1fffffff,%eax 1540___ 1541for ($j=0,$i=0; $i<16; $i++) { 1542 $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15); 1543 $code.=" xor @T[1],@T[1]\n" if ($i==15); 1544 my $k=1; 1545 while (29*($j+1)<64*($i+1)) { 1546 $code.=<<___; 1547 mov @T[0],@T[-$k] 1548 shr \$`29*$j`,@T[-$k] 1549 and %rax,@T[-$k] # &0x1fffffff 1550 mov @T[-$k],`8*$j-128`($out) 1551___ 1552 $j++; $k++; 1553 } 1554 $code.=<<___; 1555 shrd \$`29*$j`,@T[1],@T[0] 1556 and %rax,@T[0] 1557 mov @T[0],`8*$j-128`($out) 1558___ 1559 $j++; 1560 push(@T,shift(@T)); 1561} 1562$code.=<<___; 1563 mov @T[0],`8*$j-128`($out) # zero 1564 mov @T[0],`8*($j+1)-128`($out) 1565 mov @T[0],`8*($j+2)-128`($out) 1566 mov @T[0],`8*($j+3)-128`($out) 1567 ret 1568.cfi_endproc 1569.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 1570___ 1571} 1572{ 1573my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); 1574 1575$code.=<<___; 1576.globl rsaz_1024_scatter5_avx2 1577.type rsaz_1024_scatter5_avx2,\@abi-omnipotent 1578.align 32 1579rsaz_1024_scatter5_avx2: 1580.cfi_startproc 1581 vzeroupper 1582 vmovdqu .Lscatter_permd(%rip),%ymm5 1583 shl \$4,$power 1584 lea ($out,$power),$out 1585 mov \$9,%eax 1586 jmp .Loop_scatter_1024 1587 1588.align 32 1589.Loop_scatter_1024: 1590 vmovdqu ($inp),%ymm0 1591 lea 32($inp),$inp 1592 vpermd %ymm0,%ymm5,%ymm0 1593 vmovdqu %xmm0,($out) 1594 lea 16*32($out),$out 1595 dec %eax 1596 jnz .Loop_scatter_1024 1597 1598 vzeroupper 1599 ret 1600.cfi_endproc 1601.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 1602 1603.globl rsaz_1024_gather5_avx2 1604.type rsaz_1024_gather5_avx2,\@abi-omnipotent 1605.align 32 1606rsaz_1024_gather5_avx2: 1607.cfi_startproc 1608 vzeroupper 1609 mov %rsp,%r11 1610.cfi_def_cfa_register %r11 1611___ 1612$code.=<<___ if ($win64); 1613 lea -0x88(%rsp),%rax 1614.LSEH_begin_rsaz_1024_gather5: 1615 # I can't trust assembler to use specific encoding:-( 1616 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp 1617 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax) 1618 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax) 1619 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax) 1620 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax) 1621 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax) 1622 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax) 1623 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax) 1624 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax) 1625 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax) 1626 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax) 1627___ 1628$code.=<<___; 1629 lea -0x100(%rsp),%rsp 1630 and \$-32, %rsp 1631 lea .Linc(%rip), %r10 1632 lea -128(%rsp),%rax # control u-op density 1633 1634 vmovd $power, %xmm4 1635 vmovdqa (%r10),%ymm0 1636 vmovdqa 32(%r10),%ymm1 1637 vmovdqa 64(%r10),%ymm5 1638 vpbroadcastd %xmm4,%ymm4 1639 1640 vpaddd %ymm5, %ymm0, %ymm2 1641 vpcmpeqd %ymm4, %ymm0, %ymm0 1642 vpaddd %ymm5, %ymm1, %ymm3 1643 vpcmpeqd %ymm4, %ymm1, %ymm1 1644 vmovdqa %ymm0, 32*0+128(%rax) 1645 vpaddd %ymm5, %ymm2, %ymm0 1646 vpcmpeqd %ymm4, %ymm2, %ymm2 1647 vmovdqa %ymm1, 32*1+128(%rax) 1648 vpaddd %ymm5, %ymm3, %ymm1 1649 vpcmpeqd %ymm4, %ymm3, %ymm3 1650 vmovdqa %ymm2, 32*2+128(%rax) 1651 vpaddd %ymm5, %ymm0, %ymm2 1652 vpcmpeqd %ymm4, %ymm0, %ymm0 1653 vmovdqa %ymm3, 32*3+128(%rax) 1654 vpaddd %ymm5, %ymm1, %ymm3 1655 vpcmpeqd %ymm4, %ymm1, %ymm1 1656 vmovdqa %ymm0, 32*4+128(%rax) 1657 vpaddd %ymm5, %ymm2, %ymm8 1658 vpcmpeqd %ymm4, %ymm2, %ymm2 1659 vmovdqa %ymm1, 32*5+128(%rax) 1660 vpaddd %ymm5, %ymm3, %ymm9 1661 vpcmpeqd %ymm4, %ymm3, %ymm3 1662 vmovdqa %ymm2, 32*6+128(%rax) 1663 vpaddd %ymm5, %ymm8, %ymm10 1664 vpcmpeqd %ymm4, %ymm8, %ymm8 1665 vmovdqa %ymm3, 32*7+128(%rax) 1666 vpaddd %ymm5, %ymm9, %ymm11 1667 vpcmpeqd %ymm4, %ymm9, %ymm9 1668 vpaddd %ymm5, %ymm10, %ymm12 1669 vpcmpeqd %ymm4, %ymm10, %ymm10 1670 vpaddd %ymm5, %ymm11, %ymm13 1671 vpcmpeqd %ymm4, %ymm11, %ymm11 1672 vpaddd %ymm5, %ymm12, %ymm14 1673 vpcmpeqd %ymm4, %ymm12, %ymm12 1674 vpaddd %ymm5, %ymm13, %ymm15 1675 vpcmpeqd %ymm4, %ymm13, %ymm13 1676 vpcmpeqd %ymm4, %ymm14, %ymm14 1677 vpcmpeqd %ymm4, %ymm15, %ymm15 1678 1679 vmovdqa -32(%r10),%ymm7 # .Lgather_permd 1680 lea 128($inp), $inp 1681 mov \$9,$power 1682 1683.Loop_gather_1024: 1684 vmovdqa 32*0-128($inp), %ymm0 1685 vmovdqa 32*1-128($inp), %ymm1 1686 vmovdqa 32*2-128($inp), %ymm2 1687 vmovdqa 32*3-128($inp), %ymm3 1688 vpand 32*0+128(%rax), %ymm0, %ymm0 1689 vpand 32*1+128(%rax), %ymm1, %ymm1 1690 vpand 32*2+128(%rax), %ymm2, %ymm2 1691 vpor %ymm0, %ymm1, %ymm4 1692 vpand 32*3+128(%rax), %ymm3, %ymm3 1693 vmovdqa 32*4-128($inp), %ymm0 1694 vmovdqa 32*5-128($inp), %ymm1 1695 vpor %ymm2, %ymm3, %ymm5 1696 vmovdqa 32*6-128($inp), %ymm2 1697 vmovdqa 32*7-128($inp), %ymm3 1698 vpand 32*4+128(%rax), %ymm0, %ymm0 1699 vpand 32*5+128(%rax), %ymm1, %ymm1 1700 vpand 32*6+128(%rax), %ymm2, %ymm2 1701 vpor %ymm0, %ymm4, %ymm4 1702 vpand 32*7+128(%rax), %ymm3, %ymm3 1703 vpand 32*8-128($inp), %ymm8, %ymm0 1704 vpor %ymm1, %ymm5, %ymm5 1705 vpand 32*9-128($inp), %ymm9, %ymm1 1706 vpor %ymm2, %ymm4, %ymm4 1707 vpand 32*10-128($inp),%ymm10, %ymm2 1708 vpor %ymm3, %ymm5, %ymm5 1709 vpand 32*11-128($inp),%ymm11, %ymm3 1710 vpor %ymm0, %ymm4, %ymm4 1711 vpand 32*12-128($inp),%ymm12, %ymm0 1712 vpor %ymm1, %ymm5, %ymm5 1713 vpand 32*13-128($inp),%ymm13, %ymm1 1714 vpor %ymm2, %ymm4, %ymm4 1715 vpand 32*14-128($inp),%ymm14, %ymm2 1716 vpor %ymm3, %ymm5, %ymm5 1717 vpand 32*15-128($inp),%ymm15, %ymm3 1718 lea 32*16($inp), $inp 1719 vpor %ymm0, %ymm4, %ymm4 1720 vpor %ymm1, %ymm5, %ymm5 1721 vpor %ymm2, %ymm4, %ymm4 1722 vpor %ymm3, %ymm5, %ymm5 1723 1724 vpor %ymm5, %ymm4, %ymm4 1725 vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared 1726 vpor %xmm4, %xmm5, %xmm5 1727 vpermd %ymm5,%ymm7,%ymm5 1728 vmovdqu %ymm5,($out) 1729 lea 32($out),$out 1730 dec $power 1731 jnz .Loop_gather_1024 1732 1733 vpxor %ymm0,%ymm0,%ymm0 1734 vmovdqu %ymm0,($out) 1735 vzeroupper 1736___ 1737$code.=<<___ if ($win64); 1738 movaps -0xa8(%r11),%xmm6 1739 movaps -0x98(%r11),%xmm7 1740 movaps -0x88(%r11),%xmm8 1741 movaps -0x78(%r11),%xmm9 1742 movaps -0x68(%r11),%xmm10 1743 movaps -0x58(%r11),%xmm11 1744 movaps -0x48(%r11),%xmm12 1745 movaps -0x38(%r11),%xmm13 1746 movaps -0x28(%r11),%xmm14 1747 movaps -0x18(%r11),%xmm15 1748___ 1749$code.=<<___; 1750 lea (%r11),%rsp 1751.cfi_def_cfa_register %rsp 1752 ret 1753.cfi_endproc 1754.LSEH_end_rsaz_1024_gather5: 1755.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 1756___ 1757} 1758 1759$code.=<<___; 1760.extern OPENSSL_ia32cap_P 1761.globl rsaz_avx2_eligible 1762.type rsaz_avx2_eligible,\@abi-omnipotent 1763.align 32 1764rsaz_avx2_eligible: 1765 mov OPENSSL_ia32cap_P+8(%rip),%eax 1766___ 1767$code.=<<___ if ($addx); 1768 mov \$`1<<8|1<<19`,%ecx 1769 mov \$0,%edx 1770 and %eax,%ecx 1771 cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X 1772 cmove %edx,%eax 1773___ 1774$code.=<<___; 1775 and \$`1<<5`,%eax 1776 shr \$5,%eax 1777 ret 1778.size rsaz_avx2_eligible,.-rsaz_avx2_eligible 1779 1780.align 64 1781.Land_mask: 1782 .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff 1783.Lscatter_permd: 1784 .long 0,2,4,6,7,7,7,7 1785.Lgather_permd: 1786 .long 0,7,1,7,2,7,3,7 1787.Linc: 1788 .long 0,0,0,0, 1,1,1,1 1789 .long 2,2,2,2, 3,3,3,3 1790 .long 4,4,4,4, 4,4,4,4 1791.align 64 1792___ 1793 1794if ($win64) { 1795$rec="%rcx"; 1796$frame="%rdx"; 1797$context="%r8"; 1798$disp="%r9"; 1799 1800$code.=<<___ 1801.extern __imp_RtlVirtualUnwind 1802.type rsaz_se_handler,\@abi-omnipotent 1803.align 16 1804rsaz_se_handler: 1805 push %rsi 1806 push %rdi 1807 push %rbx 1808 push %rbp 1809 push %r12 1810 push %r13 1811 push %r14 1812 push %r15 1813 pushfq 1814 sub \$64,%rsp 1815 1816 mov 120($context),%rax # pull context->Rax 1817 mov 248($context),%rbx # pull context->Rip 1818 1819 mov 8($disp),%rsi # disp->ImageBase 1820 mov 56($disp),%r11 # disp->HandlerData 1821 1822 mov 0(%r11),%r10d # HandlerData[0] 1823 lea (%rsi,%r10),%r10 # prologue label 1824 cmp %r10,%rbx # context->Rip<prologue label 1825 jb .Lcommon_seh_tail 1826 1827 mov 4(%r11),%r10d # HandlerData[1] 1828 lea (%rsi,%r10),%r10 # epilogue label 1829 cmp %r10,%rbx # context->Rip>=epilogue label 1830 jae .Lcommon_seh_tail 1831 1832 mov 160($context),%rbp # pull context->Rbp 1833 1834 mov 8(%r11),%r10d # HandlerData[2] 1835 lea (%rsi,%r10),%r10 # "in tail" label 1836 cmp %r10,%rbx # context->Rip>="in tail" label 1837 cmovc %rbp,%rax 1838 1839 mov -48(%rax),%r15 1840 mov -40(%rax),%r14 1841 mov -32(%rax),%r13 1842 mov -24(%rax),%r12 1843 mov -16(%rax),%rbp 1844 mov -8(%rax),%rbx 1845 mov %r15,240($context) 1846 mov %r14,232($context) 1847 mov %r13,224($context) 1848 mov %r12,216($context) 1849 mov %rbp,160($context) 1850 mov %rbx,144($context) 1851 1852 lea -0xd8(%rax),%rsi # %xmm save area 1853 lea 512($context),%rdi # & context.Xmm6 1854 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 1855 .long 0xa548f3fc # cld; rep movsq 1856 1857.Lcommon_seh_tail: 1858 mov 8(%rax),%rdi 1859 mov 16(%rax),%rsi 1860 mov %rax,152($context) # restore context->Rsp 1861 mov %rsi,168($context) # restore context->Rsi 1862 mov %rdi,176($context) # restore context->Rdi 1863 1864 mov 40($disp),%rdi # disp->ContextRecord 1865 mov $context,%rsi # context 1866 mov \$154,%ecx # sizeof(CONTEXT) 1867 .long 0xa548f3fc # cld; rep movsq 1868 1869 mov $disp,%rsi 1870 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1871 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1872 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1873 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1874 mov 40(%rsi),%r10 # disp->ContextRecord 1875 lea 56(%rsi),%r11 # &disp->HandlerData 1876 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1877 mov %r10,32(%rsp) # arg5 1878 mov %r11,40(%rsp) # arg6 1879 mov %r12,48(%rsp) # arg7 1880 mov %rcx,56(%rsp) # arg8, (NULL) 1881 call *__imp_RtlVirtualUnwind(%rip) 1882 1883 mov \$1,%eax # ExceptionContinueSearch 1884 add \$64,%rsp 1885 popfq 1886 pop %r15 1887 pop %r14 1888 pop %r13 1889 pop %r12 1890 pop %rbp 1891 pop %rbx 1892 pop %rdi 1893 pop %rsi 1894 ret 1895.size rsaz_se_handler,.-rsaz_se_handler 1896 1897.section .pdata 1898.align 4 1899 .rva .LSEH_begin_rsaz_1024_sqr_avx2 1900 .rva .LSEH_end_rsaz_1024_sqr_avx2 1901 .rva .LSEH_info_rsaz_1024_sqr_avx2 1902 1903 .rva .LSEH_begin_rsaz_1024_mul_avx2 1904 .rva .LSEH_end_rsaz_1024_mul_avx2 1905 .rva .LSEH_info_rsaz_1024_mul_avx2 1906 1907 .rva .LSEH_begin_rsaz_1024_gather5 1908 .rva .LSEH_end_rsaz_1024_gather5 1909 .rva .LSEH_info_rsaz_1024_gather5 1910.section .xdata 1911.align 8 1912.LSEH_info_rsaz_1024_sqr_avx2: 1913 .byte 9,0,0,0 1914 .rva rsaz_se_handler 1915 .rva .Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail 1916 .long 0 1917.LSEH_info_rsaz_1024_mul_avx2: 1918 .byte 9,0,0,0 1919 .rva rsaz_se_handler 1920 .rva .Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail 1921 .long 0 1922.LSEH_info_rsaz_1024_gather5: 1923 .byte 0x01,0x36,0x17,0x0b 1924 .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 1925 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 1926 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 1927 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 1928 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 1929 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 1930 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 1931 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 1932 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 1933 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 1934 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 1935 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 1936___ 1937} 1938 1939foreach (split("\n",$code)) { 1940 s/\`([^\`]*)\`/eval($1)/ge; 1941 1942 s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or 1943 1944 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1945 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or 1946 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1947 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1948 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; 1949 print $_,"\n"; 1950} 1951 1952}}} else {{{ 1953print <<___; # assembler is too old 1954.text 1955 1956.globl rsaz_avx2_eligible 1957.type rsaz_avx2_eligible,\@abi-omnipotent 1958rsaz_avx2_eligible: 1959 xor %eax,%eax 1960 ret 1961.size rsaz_avx2_eligible,.-rsaz_avx2_eligible 1962 1963.globl rsaz_1024_sqr_avx2 1964.globl rsaz_1024_mul_avx2 1965.globl rsaz_1024_norm2red_avx2 1966.globl rsaz_1024_red2norm_avx2 1967.globl rsaz_1024_scatter5_avx2 1968.globl rsaz_1024_gather5_avx2 1969.type rsaz_1024_sqr_avx2,\@abi-omnipotent 1970rsaz_1024_sqr_avx2: 1971rsaz_1024_mul_avx2: 1972rsaz_1024_norm2red_avx2: 1973rsaz_1024_red2norm_avx2: 1974rsaz_1024_scatter5_avx2: 1975rsaz_1024_gather5_avx2: 1976 .byte 0x0f,0x0b # ud2 1977 ret 1978.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 1979___ 1980}}} 1981 1982close STDOUT or die "error closing STDOUT: $!"; 1983