#!/usr/bin/env perl

##############################################################################
#                                                                            #
# Copyright 2014 Intel Corporation                                           #
#                                                                            #
# Licensed under the Apache License, Version 2.0 (the "License");            #
# you may not use this file except in compliance with the License.           #
# You may obtain a copy of the License at                                    #
#                                                                            #
#    http://www.apache.org/licenses/LICENSE-2.0                              #
#                                                                            #
# Unless required by applicable law or agreed to in writing, software        #
# distributed under the License is distributed on an "AS IS" BASIS,          #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
# See the License for the specific language governing permissions and        #
# limitations under the License.                                             #
#                                                                            #
##############################################################################
#                                                                            #
#  Developers and authors:                                                   #
#  Shay Gueron (1, 2), and Vlad Krasnov (1)                                  #
#  (1) Intel Corporation, Israel Development Center                          #
#  (2) University of Haifa                                                   #
#  Reference:                                                                #
#  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography    #
#  with 256 Bit Primes"                                                      #
#                                                                            #
##############################################################################

# Further optimization by <appro@openssl.org>:
#
#		this/original	with/without -DECP_NISTZ256_ASM(*)
# Opteron	+12-49%		+110-150%
# Bulldozer	+14-45%		+175-210%
# P4		+18-46%		n/a :-(
# Westmere	+12-34%		+80-87%
# Sandy Bridge	+9-35%		+110-120%
# Ivy Bridge	+9-35%		+110-125%
# Haswell	+8-37%		+140-160%
# Broadwell	+18-58%		+145-210%
# Atom		+15-50%		+130-180%
# VIA Nano	+43-160%	+300-480%
#
# (*)	"without -DECP_NISTZ256_ASM" refers to build with
#	"enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, the relatively
# fastest server-side operation. Keep in mind that +100% means 2x
# improvement.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$avx = ($ver>=3.0) + ($ver>=3.01);
	$addx = ($ver>=3.03);
}

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

# The polynomial
.align 64
.Lpoly:
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

# 2^512 mod P precomputed for NIST P256 polynomial
.LRR:
.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd

.LOne:
.long 1,1,1,1,1,1,1,1
.LTwo:
.long 2,2,2,2,2,2,2,2
.LThree:
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
___

{
################################################################################
# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);

my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");

$code.=<<___;

.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,\@function,2
.align	64
ecp_nistz256_mul_by_2:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4,$t4
	mov	8*1($a_ptr), $a1
	add	$a0, $a0		# a0:a3+a0:a3
	mov	8*2($a_ptr), $a2
	adc	$a1, $a1
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	8*0($a_ptr), $a0
	mov	$a2, $t2
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a3, $t3
	sbb	8*3($a_ptr), $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
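################################################################################
# Illustrative C model of the pattern above (used by most saturated-arithmetic
# routines in this file): compute the full-width sum, then subtract the
# modulus and keep whichever copy did not borrow.  This sketch is for
# documentation only, it is not part of the generated code, and the helper
# name p256_mul_by_2_model is ours.
#
#	#include <stdint.h>
#	typedef unsigned __int128 u128;
#	static const uint64_t P[4] = {
#	    0xffffffffffffffffULL, 0x00000000ffffffffULL,
#	    0x0000000000000000ULL, 0xffffffff00000001ULL };
#
#	static void p256_mul_by_2_model(uint64_t r[4], const uint64_t a[4])
#	{
#	    uint64_t t[4];
#	    u128 c = 0, b = 0;
#	    for (int i = 0; i < 4; i++) {       /* t = 2*a, c = carry out */
#	        c += (u128)a[i] + a[i]; t[i] = (uint64_t)c; c >>= 64;
#	    }
#	    for (int i = 0; i < 4; i++) {       /* r = t - P, b = borrow out */
#	        b = (u128)t[i] - P[i] - b; r[i] = (uint64_t)b; b = (b >> 64) & 1;
#	    }
#	    /* the cmovc chain: if the subtraction borrowed past the carry bit,
#	       2*a < P and the unreduced copy is the right answer */
#	    for (int i = 0; i < 4; i++)
#	        r[i] = ((uint64_t)b > (uint64_t)c) ? t[i] : r[i];
#	}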
################################################################################
# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,\@function,2
.align	32
ecp_nistz256_div_by_2:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	mov	8*1($a_ptr), $a1
	mov	8*2($a_ptr), $a2
	mov	$a0, $t0
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr

	mov	$a1, $t1
	xor	$t4, $t4
	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	adc	\$0, $t4
	xor	$a_ptr, $a_ptr		# borrow $a_ptr
	test	\$1, $t0

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	cmovz	$t2, $a2
	cmovz	$t3, $a3
	cmovz	$a_ptr, $t4

	mov	$a1, $t0		# a0:a3>>1
	shr	\$1, $a0
	shl	\$63, $t0
	mov	$a2, $t1
	shr	\$1, $a1
	or	$t0, $a0
	shl	\$63, $t1
	mov	$a3, $t2
	shr	\$1, $a2
	or	$t1, $a1
	shl	\$63, $t2
	shr	\$1, $a3
	shl	\$63, $t4
	or	$t2, $a2
	or	$t4, $a3

	mov	$a0, 8*0($r_ptr)
	mov	$a1, 8*1($r_ptr)
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
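################################################################################
# A C sketch of the halving trick above, for documentation only (the helper
# name is ours).  If a is odd, adding P makes the sum even without changing
# its residue, so the result is always an exact 257-bit right shift; the asm
# keeps the extra bit in $t4 and selects with cmovz instead of branching.
#
#	static void p256_div_by_2_model(uint64_t r[4], const uint64_t a[4])
#	{
#	    uint64_t t[5];
#	    u128 c = 0;
#	    uint64_t odd = a[0] & 1;
#	    for (int i = 0; i < 4; i++) {       /* t = a + (a odd ? P : 0) */
#	        c += (u128)a[i] + (P[i] & (0 - odd));
#	        t[i] = (uint64_t)c; c >>= 64;
#	    }
#	    t[4] = (uint64_t)c;                 /* carry into bit 256 */
#	    for (int i = 0; i < 4; i++)         /* r = t >> 1 */
#	        r[i] = (t[i] >> 1) | (t[i+1] << 63);
#	}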
################################################################################
# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,\@function,2
.align	32
ecp_nistz256_mul_by_3:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4, $t4
	mov	8*1($a_ptr), $a1
	add	$a0, $a0		# a0:a3+a0:a3
	mov	8*2($a_ptr), $a2
	adc	$a1, $a1
	mov	8*3($a_ptr), $a3
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	.Lpoly+8*1(%rip), $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	.Lpoly+8*3(%rip), $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	cmovc	$t2, $a2
	cmovc	$t3, $a3

	xor	$t4, $t4
	add	8*0($a_ptr), $a0	# a0:a3+=a_ptr[0:3]
	adc	8*1($a_ptr), $a1
	mov	$a0, $t0
	adc	8*2($a_ptr), $a2
	adc	8*3($a_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	.Lpoly+8*1(%rip), $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	.Lpoly+8*3(%rip), $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

################################################################################
# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,\@function,3
.align	32
ecp_nistz256_add:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4, $t4
	mov	8*1($a_ptr), $a1
	mov	8*2($a_ptr), $a2
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr

	add	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	8*0($a_ptr), $a0
	mov	$a2, $t2
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a3, $t3
	sbb	8*3($a_ptr), $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_add,.-ecp_nistz256_add

################################################################################
# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,\@function,3
.align	32
ecp_nistz256_sub:
	push	%r12
	push	%r13

	mov	8*0($a_ptr), $a0
	xor	$t4, $t4
	mov	8*1($a_ptr), $a1
	mov	8*2($a_ptr), $a2
	mov	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr

	sub	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	\$0, $t4

	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

################################################################################
# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,\@function,2
.align	32
ecp_nistz256_neg:
	push	%r12
	push	%r13

	xor	$a0, $a0
	xor	$a1, $a1
	xor	$a2, $a2
	xor	$a3, $a3
	xor	$t4, $t4

	sub	8*0($a_ptr), $a0
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a0, $t0
	sbb	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr
	mov	$a1, $t1
	sbb	\$0, $t4

	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg
___
}
{
my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
my ($poly1,$poly3)=($acc6,$acc7);

$code.=<<___;
################################################################################
# void ecp_nistz256_to_mont(
#   uint64_t res[4],
#   uint64_t in[4]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,\@function,2
.align	32
ecp_nistz256_to_mont:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
	lea	.LRR(%rip), $b_org
	jmp	.Lmul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

################################################################################
# void ecp_nistz256_mul_mont(
#   uint64_t res[4],
#   uint64_t a[4],
#   uint64_t b[4]);

.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,\@function,3
.align	32
ecp_nistz256_mul_mont:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
.Lmul_mont:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___	if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lmul_montx
___
$code.=<<___;
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rax
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4

	call	__ecp_nistz256_mul_montq
___
$code.=<<___	if ($addx);
	jmp	.Lmul_mont_done

.align	32
.Lmul_montx:
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rdx
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_mul_montx
___
$code.=<<___;
.Lmul_mont_done:
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
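################################################################################
# Illustrative C model of one Montgomery reduction step as performed by the
# helpers below (documentation only; the function name is ours).  With one
# 2^64 factor retired per step and p[0] = 2^64-1, we have -p^{-1} mod 2^64 = 1,
# so the step multiplier is simply the low limb m = acc[0].  Moreover
#	m * p[0..2] = m*2^96 - m
# so once the -m cancels acc[0], only two shifted additions and one real
# multiplication (by .Lpoly[3]) remain.  The full multiply also tracks an
# extra carry limb across steps, elided here:
#
#	static void p256_mont_reduce_step_model(uint64_t a[5])
#	{
#	    uint64_t m = a[0];
#	    u128 t = (u128)a[1] + ((u128)m << 32);  /* += low half of m*2^96 */
#	    a[0] = (uint64_t)t;                     /* limbs shift down by one */
#	    t = (u128)a[2] + (t >> 64);             /* high half rides the carry */
#	    a[1] = (uint64_t)t;
#	    t = (u128)a[3] + (u128)m * 0xffffffff00000001ULL + (t >> 64);
#	    a[2] = (uint64_t)t;
#	    t = (u128)a[4] + (t >> 64);
#	    a[3] = (uint64_t)t;
#	    a[4] = (uint64_t)(t >> 64);
#	}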
.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montq:
	########################################################################
	# Multiply a by b[0]
	mov	%rax, $t1
	mulq	$acc1
	mov	.Lpoly+8*1(%rip),$poly1
	mov	%rax, $acc0
	mov	$t1, %rax
	mov	%rdx, $acc1

	mulq	$acc2
	mov	.Lpoly+8*3(%rip),$poly3
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc2

	mulq	$acc3
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc4
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	xor	$acc5, $acc5
	mov	%rdx, $acc4

	########################################################################
	# First reduction step
	# Basically we want to multiply acc[0] by p256 and add the result
	# to the accumulator.  The special form of p256 allows a shortcut:
	#
	#   acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
	#
	# so adding the low limb acc[0] back in leaves a net acc[0] x 2^96

	mov	$acc0, $t1
	shl	\$32, $acc0
	mulq	$poly3
	shr	\$32, $t1
	add	$acc0, $acc1		# +=acc[0]<<96
	adc	$t1, $acc2
	adc	%rax, $acc3
	mov	8*1($b_ptr), %rax
	adc	%rdx, $acc4
	adc	\$0, $acc5
	xor	$acc0, $acc0

	########################################################################
	# Multiply by b[1]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$acc1, %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0

	########################################################################
	# Second reduction step
	mov	$acc1, $t1
	shl	\$32, $acc1
	mulq	$poly3
	shr	\$32, $t1
	add	$acc1, $acc2
	adc	$t1, $acc3
	adc	%rax, $acc4
	mov	8*2($b_ptr), %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0
	xor	$acc1, $acc1

	########################################################################
	# Multiply by b[2]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$acc2, %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1

	########################################################################
	# Third reduction step
	mov	$acc2, $t1
	shl	\$32, $acc2
	mulq	$poly3
	shr	\$32, $t1
	add	$acc2, $acc3
	adc	$t1, $acc4
	adc	%rax, $acc5
	mov	8*3($b_ptr), %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1
	xor	$acc2, $acc2

	########################################################################
	# Multiply by b[3]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc0
	adc	\$0, %rdx
	add	%rax, $acc0
	mov	$acc3, %rax
	adc	%rdx, $acc1
	adc	\$0, $acc2

	########################################################################
	# Final reduction step
	mov	$acc3, $t1
	shl	\$32, $acc3
	mulq	$poly3
	shr	\$32, $t1
	add	$acc3, $acc4
	adc	$t1, $acc5
	mov	$acc4, $t0
	adc	%rax, $acc0
	adc	%rdx, $acc1
	mov	$acc5, $t1
	adc	\$0, $acc2

	########################################################################
	# Branch-less conditional subtraction of P
	sub	\$-1, $acc4		# .Lpoly[0]
	mov	$acc0, $t2
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	mov	$acc1, $t3
	sbb	$poly3, $acc1		# .Lpoly[3]
	sbb	\$0, $acc2

	cmovc	$t0, $acc4
	cmovc	$t1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$t2, $acc0
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t3, $acc1
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq

################################################################################
# void ecp_nistz256_sqr_mont(
#   uint64_t res[4],
#   uint64_t a[4]);

# we optimize the square according to S.Gueron and V.Krasnov,
# "Speeding up Big-Number Squaring"
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,\@function,2
.align	32
ecp_nistz256_sqr_mont:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___	if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lsqr_montx
___
$code.=<<___;
	mov	8*0($a_ptr), %rax
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0

	call	__ecp_nistz256_sqr_montq
___
$code.=<<___	if ($addx);
	jmp	.Lsqr_mont_done

.align	32
.Lsqr_montx:
	mov	8*0($a_ptr), %rdx
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_sqr_montx
___
$code.=<<___;
.Lsqr_mont_done:
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
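################################################################################
# The square below follows the paper's layout: compute each cross product
# a[i]*a[j] (i<j) once, double the whole row, then drop the squares a[i]^2
# onto the diagonal.  A C sketch of that layout (documentation only, the
# function name is ours):
#
#	static void p256_sqr256_model(uint64_t r[8], const uint64_t a[4])
#	{
#	    u128 c;
#	    int i, j;
#	    for (i = 0; i < 8; i++) r[i] = 0;
#	    for (i = 0; i < 4; i++) {           /* cross products, each once */
#	        c = 0;
#	        for (j = i + 1; j < 4; j++) {
#	            c += (u128)a[i] * a[j] + r[i+j];
#	            r[i+j] = (uint64_t)c; c >>= 64;
#	        }
#	        r[i+4] = (uint64_t)c;
#	    }
#	    c = 0;                              /* double them: acc1:6 << 1 */
#	    for (i = 1; i < 7; i++) {
#	        c += (u128)r[i] << 1; r[i] = (uint64_t)c; c >>= 64;
#	    }
#	    r[7] = (uint64_t)c;
#	    c = 0;                              /* add the diagonal squares */
#	    for (i = 0; i < 4; i++) {
#	        c += (u128)a[i] * a[i] + r[2*i];   r[2*i]   = (uint64_t)c; c >>= 64;
#	        c += r[2*i+1];                     r[2*i+1] = (uint64_t)c; c >>= 64;
#	    }
#	}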
.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_sqr_montq:
	mov	%rax, $acc5
	mulq	$acc6			# a[1]*a[0]
	mov	%rax, $acc1
	mov	$acc7, %rax
	mov	%rdx, $acc2

	mulq	$acc5			# a[0]*a[2]
	add	%rax, $acc2
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc5			# a[0]*a[3]
	add	%rax, $acc3
	mov	$acc7, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc4

	#################################
	mulq	$acc6			# a[1]*a[2]
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	$acc6			# a[1]*a[3]
	add	%rax, $acc4
	mov	$acc0, %rax
	adc	\$0, %rdx
	add	$t1, $acc4
	mov	%rdx, $acc5
	adc	\$0, $acc5

	#################################
	mulq	$acc7			# a[2]*a[3]
	xor	$acc7, $acc7
	add	%rax, $acc5
	mov	8*0($a_ptr), %rax
	mov	%rdx, $acc6
	adc	\$0, $acc6

	add	$acc1, $acc1		# acc1:6<<1
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	adc	$acc4, $acc4
	adc	$acc5, $acc5
	adc	$acc6, $acc6
	adc	\$0, $acc7

	mulq	%rax
	mov	%rax, $acc0
	mov	8*1($a_ptr), %rax
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc1
	adc	%rax, $acc2
	mov	8*2($a_ptr), %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc3
	adc	%rax, $acc4
	mov	8*3($a_ptr), %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc5
	adc	%rax, $acc6
	mov	$acc0, %rax
	adc	%rdx, $acc7

	mov	.Lpoly+8*1(%rip), $a_ptr
	mov	.Lpoly+8*3(%rip), $t1

	##########################################
	# Now the reduction
	# First iteration
	mov	$acc0, $t0
	shl	\$32, $acc0
	mulq	$t1
	shr	\$32, $t0
	add	$acc0, $acc1		# +=acc[0]<<96
	adc	$t0, $acc2
	adc	%rax, $acc3
	mov	$acc1, %rax
	adc	\$0, %rdx

	##########################################
	# Second iteration
	mov	$acc1, $t0
	shl	\$32, $acc1
	mov	%rdx, $acc0
	mulq	$t1
	shr	\$32, $t0
	add	$acc1, $acc2
	adc	$t0, $acc3
	adc	%rax, $acc0
	mov	$acc2, %rax
	adc	\$0, %rdx

	##########################################
	# Third iteration
	mov	$acc2, $t0
	shl	\$32, $acc2
	mov	%rdx, $acc1
	mulq	$t1
	shr	\$32, $t0
	add	$acc2, $acc3
	adc	$t0, $acc0
	adc	%rax, $acc1
	mov	$acc3, %rax
	adc	\$0, %rdx

	###########################################
	# Last iteration
	mov	$acc3, $t0
	shl	\$32, $acc3
	mov	%rdx, $acc2
	mulq	$t1
	shr	\$32, $t0
	add	$acc3, $acc0
	adc	$t0, $acc1
	adc	%rax, $acc2
	adc	\$0, %rdx
	xor	$acc3, $acc3

	############################################
	# Add the rest of the acc
	add	$acc0, $acc4
	adc	$acc1, $acc5
	mov	$acc4, $acc0
	adc	$acc2, $acc6
	adc	%rdx, $acc7
	mov	$acc5, $acc1
	adc	\$0, $acc3

	sub	\$-1, $acc4		# .Lpoly[0]
	mov	$acc6, $acc2
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	mov	$acc7, $t0
	sbb	$t1, $acc7		# .Lpoly[3]
	sbb	\$0, $acc3

	cmovc	$acc0, $acc4
	cmovc	$acc1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$acc2, $acc6
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t0, $acc7
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
___
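# The "x" code paths below use BMI2/ADX instructions: mulx takes one
# multiplicand implicitly in %rdx and writes both product halves without
# touching the flags, while adcx and adox add with carry through CF and OF
# respectively.  Two independent carry chains let the column sums of
# neighbouring partial products run in parallel; the 0x80100 capability mask
# tested by the wrappers above gates these paths at run time.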

if ($addx) {
$code.=<<___;
.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montx:
	########################################################################
	# Multiply by b[0]
	mulx	$acc1, $acc0, $acc1
	mulx	$acc2, $t0, $acc2
	mov	\$32, $poly1
	xor	$acc5, $acc5		# cf=0
	mulx	$acc3, $t1, $acc3
	mov	.Lpoly+8*3(%rip), $poly3
	adc	$t0, $acc1
	mulx	$acc4, $t0, $acc4
	mov	$acc0, %rdx
	adc	$t1, $acc2
	shlx	$poly1,$acc0,$t1
	adc	$t0, $acc3
	shrx	$poly1,$acc0,$t0
	adc	\$0, $acc4

	########################################################################
	# First reduction step
	add	$t1, $acc1
	adc	$t0, $acc2

	mulx	$poly3, $t0, $t1
	mov	8*1($b_ptr), %rdx
	adc	$t0, $acc3
	adc	$t1, $acc4
	adc	\$0, $acc5
	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0

	########################################################################
	# Multiply by b[1]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc1, %rdx
	adcx	$t0, $acc4
	shlx	$poly1, $acc1, $t0
	adox	$t1, $acc5
	shrx	$poly1, $acc1, $t1

	adcx	$acc0, $acc5
	adox	$acc0, $acc0
	adc	\$0, $acc0

	########################################################################
	# Second reduction step
	add	$t0, $acc2
	adc	$t1, $acc3

	mulx	$poly3, $t0, $t1
	mov	8*2($b_ptr), %rdx
	adc	$t0, $acc4
	adc	$t1, $acc5
	adc	\$0, $acc0
	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0

	########################################################################
	# Multiply by b[2]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc2, %rdx
	adcx	$t0, $acc5
	shlx	$poly1, $acc2, $t0
	adox	$t1, $acc0
	shrx	$poly1, $acc2, $t1

	adcx	$acc1, $acc0
	adox	$acc1, $acc1
	adc	\$0, $acc1

	########################################################################
	# Third reduction step
	add	$t0, $acc3
	adc	$t1, $acc4

	mulx	$poly3, $t0, $t1
	mov	8*3($b_ptr), %rdx
	adc	$t0, $acc5
	adc	$t1, $acc0
	adc	\$0, $acc1
	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0

	########################################################################
	# Multiply by b[3]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc5
	adox	$t1, $acc0

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc3, %rdx
	adcx	$t0, $acc0
	shlx	$poly1, $acc3, $t0
	adox	$t1, $acc1
	shrx	$poly1, $acc3, $t1

	adcx	$acc2, $acc1
	adox	$acc2, $acc2
	adc	\$0, $acc2

	########################################################################
	# Fourth reduction step
	add	$t0, $acc4
	adc	$t1, $acc5

	mulx	$poly3, $t0, $t1
	mov	$acc4, $t2
	mov	.Lpoly+8*1(%rip), $poly1
	adc	$t0, $acc0
	mov	$acc5, $t3
	adc	$t1, $acc1
	adc	\$0, $acc2

	########################################################################
	# Branch-less conditional subtraction of P
	xor	%eax, %eax
	mov	$acc0, $t0
	sbb	\$-1, $acc4		# .Lpoly[0]
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	mov	$acc1, $t1
	sbb	$poly3, $acc1		# .Lpoly[3]
	sbb	\$0, $acc2

	cmovc	$t2, $acc4
	cmovc	$t3, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$t0, $acc0
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t1, $acc1
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx

.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
.align	32
__ecp_nistz256_sqr_montx:
	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
	xor	%eax, %eax
	adc	$t0, $acc2
	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
	mov	$acc6, %rdx
	adc	$t1, $acc3
	adc	\$0, $acc4
	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0

	#################################
	mulx	$acc7, $t0, $t1		# a[1]*a[2]
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	$acc0, $t0, $t1		# a[1]*a[3]
	mov	$acc7, %rdx
	adcx	$t0, $acc4
	adox	$t1, $acc5
	adc	\$0, $acc5

	#################################
	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
	mov	8*0+128($a_ptr), %rdx
	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
	adcx	$acc1, $acc1		# acc1:6<<1
	adox	$t0, $acc5
	adcx	$acc2, $acc2
	adox	$acc7, $acc6		# of=0

	mulx	%rdx, $acc0, $t1
	mov	8*1+128($a_ptr), %rdx
	adcx	$acc3, $acc3
	adox	$t1, $acc1
	adcx	$acc4, $acc4
	mulx	%rdx, $t0, $t4
	mov	8*2+128($a_ptr), %rdx
	adcx	$acc5, $acc5
	adox	$t0, $acc2
	adcx	$acc6, $acc6
	.byte	0x67
	mulx	%rdx, $t0, $t1
	mov	8*3+128($a_ptr), %rdx
	adox	$t4, $acc3
	adcx	$acc7, $acc7
	adox	$t0, $acc4
	mov	\$32, $a_ptr
	adox	$t1, $acc5
	.byte	0x67,0x67
	mulx	%rdx, $t0, $t4
	mov	.Lpoly+8*3(%rip), %rdx
	adox	$t0, $acc6
	shlx	$a_ptr, $acc0, $t0
	adox	$t4, $acc7
	shrx	$a_ptr, $acc0, $t4
	mov	%rdx,$t1

	# reduction step 1
	add	$t0, $acc1
	adc	$t4, $acc2

	mulx	$acc0, $t0, $acc0
	adc	$t0, $acc3
	shlx	$a_ptr, $acc1, $t0
	adc	\$0, $acc0
	shrx	$a_ptr, $acc1, $t4

	# reduction step 2
	add	$t0, $acc2
	adc	$t4, $acc3

	mulx	$acc1, $t0, $acc1
	adc	$t0, $acc0
	shlx	$a_ptr, $acc2, $t0
	adc	\$0, $acc1
	shrx	$a_ptr, $acc2, $t4

	# reduction step 3
	add	$t0, $acc3
	adc	$t4, $acc0

	mulx	$acc2, $t0, $acc2
	adc	$t0, $acc1
	shlx	$a_ptr, $acc3, $t0
	adc	\$0, $acc2
	shrx	$a_ptr, $acc3, $t4

	# reduction step 4
	add	$t0, $acc0
	adc	$t4, $acc1

	mulx	$acc3, $t0, $acc3
	adc	$t0, $acc2
	adc	\$0, $acc3

	xor	$t3, $t3
	add	$acc0, $acc4		# accumulate upper half
	mov	.Lpoly+8*1(%rip), $a_ptr
	adc	$acc1, $acc5
	mov	$acc4, $acc0
	adc	$acc2, $acc6
	adc	$acc3, $acc7
	mov	$acc5, $acc1
	adc	\$0, $t3

	sub	\$-1, $acc4		# .Lpoly[0]
	mov	$acc6, $acc2
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	mov	$acc7, $acc3
	sbb	$t1, $acc7		# .Lpoly[3]
	sbb	\$0, $t3

	cmovc	$acc0, $acc4
	cmovc	$acc1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$acc2, $acc6
	mov	$acc5, 8*1($r_ptr)
	cmovc	$acc3, $acc7
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
___
}
}
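# from_mont below is just the reduction half of a Montgomery multiplication:
# with R = 2^256, the input is a*R mod p, and four reduction steps compute
# (a*R)*R^-1 = a mod p; no multiplication by a second operand is needed.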
{
my ($r_ptr,$in_ptr)=("%rdi","%rsi");
my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
my ($t0,$t1,$t2)=("%rcx","%r12","%r13");

$code.=<<___;
################################################################################
# void ecp_nistz256_from_mont(
#   uint64_t res[4],
#   uint64_t in[4]);
# This one performs Montgomery multiplication by 1, so we only need the reduction

.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,\@function,2
.align	32
ecp_nistz256_from_mont:
	push	%r12
	push	%r13

	mov	8*0($in_ptr), %rax
	mov	.Lpoly+8*3(%rip), $t2
	mov	8*1($in_ptr), $acc1
	mov	8*2($in_ptr), $acc2
	mov	8*3($in_ptr), $acc3
	mov	%rax, $acc0
	mov	.Lpoly+8*1(%rip), $t1

	#########################################
	# First iteration
	mov	%rax, $t0
	shl	\$32, $acc0
	mulq	$t2
	shr	\$32, $t0
	add	$acc0, $acc1
	adc	$t0, $acc2
	adc	%rax, $acc3
	mov	$acc1, %rax
	adc	\$0, %rdx

	#########################################
	# Second iteration
	mov	$acc1, $t0
	shl	\$32, $acc1
	mov	%rdx, $acc0
	mulq	$t2
	shr	\$32, $t0
	add	$acc1, $acc2
	adc	$t0, $acc3
	adc	%rax, $acc0
	mov	$acc2, %rax
	adc	\$0, %rdx

	##########################################
	# Third iteration
	mov	$acc2, $t0
	shl	\$32, $acc2
	mov	%rdx, $acc1
	mulq	$t2
	shr	\$32, $t0
	add	$acc2, $acc3
	adc	$t0, $acc0
	adc	%rax, $acc1
	mov	$acc3, %rax
	adc	\$0, %rdx

	###########################################
	# Last iteration
	mov	$acc3, $t0
	shl	\$32, $acc3
	mov	%rdx, $acc2
	mulq	$t2
	shr	\$32, $t0
	add	$acc3, $acc0
	adc	$t0, $acc1
	mov	$acc0, $t0
	adc	%rax, $acc2
	mov	$acc1, $in_ptr
	adc	\$0, %rdx

	###########################################
	# Branch-less conditional subtraction
	sub	\$-1, $acc0
	mov	$acc2, %rax
	sbb	$t1, $acc1
	sbb	\$0, $acc2
	mov	%rdx, $acc3
	sbb	$t2, %rdx
	sbb	$t2, $t2

	cmovnz	$t0, $acc0
	cmovnz	$in_ptr, $acc1
	mov	$acc0, 8*0($r_ptr)
	cmovnz	%rax, $acc2
	mov	$acc1, 8*1($r_ptr)
	cmovz	%rdx, $acc3
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)

	pop	%r13
	pop	%r12
	ret
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
___
}
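# The select routines below are constant-time table lookups: every entry is
# read and masked, so the memory access pattern is independent of the secret
# index.  A C model of the w5 case, assuming the layout implied by the
# 16-iteration loop (16 points of 96 bytes; entry i matches index i+1):
#
#	static void select_w5_model(uint64_t val[12],
#	                            const uint64_t in_t[16][12], int index)
#	{
#	    for (int k = 0; k < 12; k++) val[k] = 0;
#	    for (int i = 0; i < 16; i++) {
#	        uint64_t mask = 0 - (uint64_t)(i + 1 == index); /* pcmpeqd */
#	        for (int k = 0; k < 12; k++)
#	            val[k] |= in_t[i][k] & mask;                /* pand/por */
#	    }
#	}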
{
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));

$code.=<<___;
################################################################################
# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,\@abi-omnipotent
.align	32
ecp_nistz256_select_w5:
___
$code.=<<___	if ($avx>1);
	mov	OPENSSL_ia32cap_P+8(%rip), %eax
	test	\$`1<<5`, %eax
	jnz	.Lavx2_select_w5
___
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w5:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
___
$code.=<<___;
	movdqa	.LOne(%rip), $ONE
	movd	$index, $INDEX

	pxor	$Ra, $Ra
	pxor	$Rb, $Rb
	pxor	$Rc, $Rc
	pxor	$Rd, $Rd
	pxor	$Re, $Re
	pxor	$Rf, $Rf

	movdqa	$ONE, $M0
	pshufd	\$0, $INDEX, $INDEX

	mov	\$16, %rax
.Lselect_loop_sse_w5:

	movdqa	$M0, $TMP0
	paddd	$ONE, $M0
	pcmpeqd	$INDEX, $TMP0

	movdqa	16*0($in_t), $T0a
	movdqa	16*1($in_t), $T0b
	movdqa	16*2($in_t), $T0c
	movdqa	16*3($in_t), $T0d
	movdqa	16*4($in_t), $T0e
	movdqa	16*5($in_t), $T0f
	lea	16*6($in_t), $in_t

	pand	$TMP0, $T0a
	pand	$TMP0, $T0b
	por	$T0a, $Ra
	pand	$TMP0, $T0c
	por	$T0b, $Rb
	pand	$TMP0, $T0d
	por	$T0c, $Rc
	pand	$TMP0, $T0e
	por	$T0d, $Rd
	pand	$TMP0, $T0f
	por	$T0e, $Re
	por	$T0f, $Rf

	dec	%rax
	jnz	.Lselect_loop_sse_w5

	movdqu	$Ra, 16*0($val)
	movdqu	$Rb, 16*1($val)
	movdqu	$Rc, 16*2($val)
	movdqu	$Rd, 16*3($val)
	movdqu	$Re, 16*4($val)
	movdqu	$Rf, 16*5($val)
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_select_w5:
___
$code.=<<___;
	ret
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5

################################################################################
# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,\@abi-omnipotent
.align	32
ecp_nistz256_select_w7:
___
$code.=<<___	if ($avx>1);
	mov	OPENSSL_ia32cap_P+8(%rip), %eax
	test	\$`1<<5`, %eax
	jnz	.Lavx2_select_w7
___
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_select_w7:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
___
$code.=<<___;
	movdqa	.LOne(%rip), $M0
	movd	$index, $INDEX

	pxor	$Ra, $Ra
	pxor	$Rb, $Rb
	pxor	$Rc, $Rc
	pxor	$Rd, $Rd

	movdqa	$M0, $ONE
	pshufd	\$0, $INDEX, $INDEX
	mov	\$64, %rax

.Lselect_loop_sse_w7:
	movdqa	$M0, $TMP0
	paddd	$ONE, $M0
	movdqa	16*0($in_t), $T0a
	movdqa	16*1($in_t), $T0b
	pcmpeqd	$INDEX, $TMP0
	movdqa	16*2($in_t), $T0c
	movdqa	16*3($in_t), $T0d
	lea	16*4($in_t), $in_t

	pand	$TMP0, $T0a
	pand	$TMP0, $T0b
	por	$T0a, $Ra
	pand	$TMP0, $T0c
	por	$T0b, $Rb
	pand	$TMP0, $T0d
	por	$T0c, $Rc
	prefetcht0	255($in_t)
	por	$T0d, $Rd

	dec	%rax
	jnz	.Lselect_loop_sse_w7

	movdqu	$Ra, 16*0($val)
	movdqu	$Rb, 16*1($val)
	movdqu	$Rc, 16*2($val)
	movdqu	$Rd, 16*3($val)
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_select_w7:
___
$code.=<<___;
	ret
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
___
}
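# Table shapes behind the loop counts above: the w5 path scans 16 entries of
# 96 bytes (three 256-bit coordinates, six xmm loads per iteration), the w7
# path 64 entries of 64 bytes (two coordinates, four xmm loads).  In both
# cases index 0 is implicitly the point at infinity and is never stored.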
if ($avx>1) {
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));

$code.=<<___;
################################################################################
# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
.type	ecp_nistz256_avx2_select_w5,\@abi-omnipotent
.align	32
ecp_nistz256_avx2_select_w5:
.Lavx2_select_w5:
	vzeroupper
___
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_avx2_select_w5:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
___
$code.=<<___;
	vmovdqa	.LTwo(%rip), $TWO

	vpxor	$Ra, $Ra, $Ra
	vpxor	$Rb, $Rb, $Rb
	vpxor	$Rc, $Rc, $Rc

	vmovdqa	.LOne(%rip), $M0
	vmovdqa	.LTwo(%rip), $M1

	vmovd	$index, %xmm1
	vpermd	$INDEX, $Ra, $INDEX

	mov	\$8, %rax
.Lselect_loop_avx2_w5:

	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b
	vmovdqa	32*2($in_t), $T0c

	vmovdqa	32*3($in_t), $T1a
	vmovdqa	32*4($in_t), $T1b
	vmovdqa	32*5($in_t), $T1c

	vpcmpeqd	$INDEX, $M0, $TMP0
	vpcmpeqd	$INDEX, $M1, $TMP1

	vpaddd	$TWO, $M0, $M0
	vpaddd	$TWO, $M1, $M1
	lea	32*6($in_t), $in_t

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b
	vpand	$TMP0, $T0c, $T0c
	vpand	$TMP1, $T1a, $T1a
	vpand	$TMP1, $T1b, $T1b
	vpand	$TMP1, $T1c, $T1c

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb
	vpxor	$T0c, $Rc, $Rc
	vpxor	$T1a, $Ra, $Ra
	vpxor	$T1b, $Rb, $Rb
	vpxor	$T1c, $Rc, $Rc

	dec	%rax
	jnz	.Lselect_loop_avx2_w5

	vmovdqu	$Ra, 32*0($val)
	vmovdqu	$Rb, 32*1($val)
	vmovdqu	$Rc, 32*2($val)
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_avx2_select_w5:
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
___
}
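# The AVX2 variant above works in 256-bit lanes and keeps two candidate masks
# in flight (M0/M1, stepped by .LTwo each round), so 8 iterations cover the
# same 16-entry table as the 16-iteration SSE loop.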
if ($avx>1) {
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));

$code.=<<___;

################################################################################
# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,\@abi-omnipotent
.align	32
ecp_nistz256_avx2_select_w7:
.Lavx2_select_w7:
	vzeroupper
___
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_avx2_select_w7:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
___
$code.=<<___;
	vmovdqa	.LThree(%rip), $THREE

	vpxor	$Ra, $Ra, $Ra
	vpxor	$Rb, $Rb, $Rb

	vmovdqa	.LOne(%rip), $M0
	vmovdqa	.LTwo(%rip), $M1
	vmovdqa	.LThree(%rip), $M2

	vmovd	$index, %xmm1
	vpermd	$INDEX, $Ra, $INDEX
	# Skip index = 0, because it is implicitly the point at infinity

	mov	\$21, %rax
.Lselect_loop_avx2_w7:

	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b

	vmovdqa	32*2($in_t), $T1a
	vmovdqa	32*3($in_t), $T1b

	vmovdqa	32*4($in_t), $T2a
	vmovdqa	32*5($in_t), $T2b

	vpcmpeqd	$INDEX, $M0, $TMP0
	vpcmpeqd	$INDEX, $M1, $TMP1
	vpcmpeqd	$INDEX, $M2, $TMP2

	vpaddd	$THREE, $M0, $M0
	vpaddd	$THREE, $M1, $M1
	vpaddd	$THREE, $M2, $M2
	lea	32*6($in_t), $in_t

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b
	vpand	$TMP1, $T1a, $T1a
	vpand	$TMP1, $T1b, $T1b
	vpand	$TMP2, $T2a, $T2a
	vpand	$TMP2, $T2b, $T2b

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb
	vpxor	$T1a, $Ra, $Ra
	vpxor	$T1b, $Rb, $Rb
	vpxor	$T2a, $Ra, $Ra
	vpxor	$T2b, $Rb, $Rb

	dec	%rax
	jnz	.Lselect_loop_avx2_w7

	# 21 iterations x 3 masks cover entries 1..63; pick up entry 64 here
	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b

	vpcmpeqd	$INDEX, $M0, $TMP0

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb

	vmovdqu	$Ra, 32*0($val)
	vmovdqu	$Rb, 32*1($val)
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
.LSEH_end_ecp_nistz256_avx2_select_w7:
___
$code.=<<___;
	ret
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
} else {
$code.=<<___;
.globl	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,\@function,3
.align	32
ecp_nistz256_avx2_select_w7:
	.byte	0x0f,0x0b	# ud2
	ret
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
}
{{{
########################################################################
# This block implements higher level point_double, point_add and
# point_add_affine. The key to performance in this case is to allow
# out-of-order execution logic to overlap computations from next step
# with tail processing from current step. By using tailored calling
# sequence we minimize inter-step overhead to give processor better
# shot at overlapping operations...
#
# You will notice that input data is copied to stack. Trouble is that
# there are no registers to spare for holding original pointers, and
# reloading them would create undesired dependencies on the
# effective-address calculation paths. In other words, it is all done
# to favour out-of-order execution logic.
#						<appro@openssl.org>

my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
my ($poly1,$poly3)=($acc6,$acc7);

sub load_for_mul () {
my ($a,$b,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;

"	mov	$b, $src0
	lea	$b, $b_ptr
	mov	8*0+$a, $acc1
	mov	8*1+$a, $acc2
	lea	$bias+$a, $a_ptr
	mov	8*2+$a, $acc3
	mov	8*3+$a, $acc4"
}

sub load_for_sqr () {
my ($a,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;

"	mov	8*0+$a, $src0
	mov	8*1+$a, $acc6
	lea	$bias+$a, $a_ptr
	mov	8*2+$a, $acc7
	mov	8*3+$a, $acc0"
}
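# load_for_mul/load_for_sqr emit the operand preload that every
# __ecp_nistz256_* helper expects: four limbs of one input in registers plus
# the leading limb of the other in $src0 (%rax for the mulq paths, %rdx for
# the mulx paths).  The -128 bias applied to $a_ptr on the mulx paths pairs
# with the 8*n+128 displacements inside those helpers; in line with the
# "control u-op density" comments above, we read this as an
# instruction-length tweak to even out decoder load, though that
# interpretation is our own.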

 {
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

$code.=<<___;
.type	__ecp_nistz256_add_toq,\@abi-omnipotent
.align	32
__ecp_nistz256_add_toq:
	xor	$t4,$t4
	add	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq

.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromq:
	sub	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	mov	$a1, $t1
	sbb	$t4, $t4

	add	\$-1, $a0
	mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	mov	$a3, $t3
	adc	$poly3, $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq

.type	__ecp_nistz256_subq,\@abi-omnipotent
.align	32
__ecp_nistz256_subq:
	sub	$a0, $t0
	sbb	$a1, $t1
	mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	mov	$t1, $a1
	sbb	$t4, $t4

	add	\$-1, $t0
	mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	mov	$t3, $a3
	adc	$poly3, $t3
	test	$t4, $t4

	cmovnz	$t0, $a0
	cmovnz	$t1, $a1
	cmovnz	$t2, $a2
	cmovnz	$t3, $a3

	ret
.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq

.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2q:
	xor	$t4, $t4
	add	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	mov	$a1, $t1
	adc	\$0, $t4

	sub	\$-1, $a0
	mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
___
 }
sub gen_double () {
    my $x = shift;
    my ($src0,$sfx,$bias);
    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));

    if ($x ne "x") {
	$src0 = "%rax";
	$sfx  = "";
	$bias = 0;

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,\@function,2
.align	32
ecp_nistz256_point_double:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_doublex
___
    } else {
	$src0 = "%rdx";
	$sfx  = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_doublex,\@function,2
.align	32
ecp_nistz256_point_doublex:
.Lpoint_doublex:
___
    }
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*5+8, %rsp

.Lpoint_double_shortcut$x:
	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
	mov	$a_ptr, $b_ptr			# backup copy
	movdqu	0x10($a_ptr), %xmm1
	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
	 mov	0x20+8*1($a_ptr), $acc5
	 mov	0x20+8*2($a_ptr), $acc0
	 mov	0x20+8*3($a_ptr), $acc1
	 mov	.Lpoly+8*1(%rip), $poly1
	 mov	.Lpoly+8*3(%rip), $poly3
	movdqa	%xmm0, $in_x(%rsp)
	movdqa	%xmm1, $in_x+0x10(%rsp)
	lea	0x20($r_ptr), $acc2
	lea	0x40($r_ptr), $acc3
	movq	$r_ptr, %xmm0
	movq	$acc2, %xmm1
	movq	$acc3, %xmm2

	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);

	mov	0x40+8*0($a_ptr), $src0
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	lea	0x40-$bias($a_ptr), $a_ptr
	lea	$Zsqr(%rsp), $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);

	`&load_for_sqr("$S(%rsp)", "$src0")`
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);

	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
	mov	0x40+8*0($b_ptr), $acc1
	mov	0x40+8*1($b_ptr), $acc2
	mov	0x40+8*2($b_ptr), $acc3
	mov	0x40+8*3($b_ptr), $acc4
	lea	0x40-$bias($b_ptr), $a_ptr
	lea	0x20($b_ptr), $b_ptr
	movq	%xmm2, $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);

	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
	mov	$in_x+8*1(%rsp), $acc5
	lea	$Zsqr(%rsp), $b_ptr
	mov	$in_x+8*2(%rsp), $acc0
	mov	$in_x+8*3(%rsp), $acc1
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);

	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
	mov	$in_x+8*1(%rsp), $acc5
	lea	$Zsqr(%rsp), $b_ptr
	mov	$in_x+8*2(%rsp), $acc0
	mov	$in_x+8*3(%rsp), $acc1
	lea	$Zsqr(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);

	`&load_for_sqr("$S(%rsp)", "$src0")`
	movq	%xmm1, $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
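	# at this point the accumulator holds res_y = (4*in_y^2)^2 = 16*in_y^4;
	# the inlined halving emitted below turns that into the 8*in_y^4 term
	# of the doubling formula without another function call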
___
{
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
# operate in 4-5-6-7 "name space" that matches squaring output
#
my ($poly1,$poly3)=($a_ptr,$t1);
my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);

$code.=<<___;
	xor	$t4, $t4
	mov	$a0, $t0
	add	\$-1, $a0
	mov	$a1, $t1
	adc	$poly1, $a1
	mov	$a2, $t2
	adc	\$0, $a2
	mov	$a3, $t3
	adc	$poly3, $a3
	adc	\$0, $t4
	xor	$a_ptr, $a_ptr		# borrow $a_ptr
	test	\$1, $t0

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	cmovz	$t2, $a2
	cmovz	$t3, $a3
	cmovz	$a_ptr, $t4

	mov	$a1, $t0		# a0:a3>>1
	shr	\$1, $a0
	shl	\$63, $t0
	mov	$a2, $t1
	shr	\$1, $a1
	or	$t0, $a0
	shl	\$63, $t1
	mov	$a3, $t2
	shr	\$1, $a2
	or	$t1, $a1
	shl	\$63, $t2
	mov	$a0, 8*0($r_ptr)
	shr	\$1, $a3
	mov	$a1, 8*1($r_ptr)
	shl	\$63, $t4
	or	$t2, $a2
	or	$t4, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);

	lea	$tmp0(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x

	lea	$M(%rsp), $b_ptr
	lea	$M(%rsp), $r_ptr
	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);

	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);

	lea	$tmp0(%rsp), $r_ptr
	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);

	`&load_for_sqr("$M(%rsp)", "$src0")`
	movq	%xmm0, $r_ptr
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);

	lea	$tmp0(%rsp), $b_ptr
	mov	$acc6, $acc0			# harmonize sqr output and sub input
	mov	$acc7, $acc1
	mov	$a_ptr, $poly1
	mov	$t1, $poly3
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);

	mov	$S+8*0(%rsp), $t0
	mov	$S+8*1(%rsp), $t1
	mov	$S+8*2(%rsp), $t2
	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);

	mov	$M(%rsp), $src0
	lea	$M(%rsp), $b_ptr
	mov	$acc4, $acc6			# harmonize sub output and mul input
	xor	%ecx, %ecx
	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
	mov	$acc5, $acc2
	mov	$acc5, $S+8*1(%rsp)
	cmovz	$acc0, $acc3
	mov	$acc0, $S+8*2(%rsp)
	lea	$S-$bias(%rsp), $a_ptr
	cmovz	$acc1, $acc4
	mov	$acc1, $S+8*3(%rsp)
	mov	$acc6, $acc1
	lea	$S(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);

	movq	%xmm1, $b_ptr
	movq	%xmm1, $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);

	add	\$32*5+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
___
}
&gen_double("q");

sub gen_add () {
    my $x = shift;
    my ($src0,$sfx,$bias);
    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
	$U1,$U2,$S1,$S2,
	$res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);

    if ($x ne "x") {
	$src0 = "%rax";
	$sfx  = "";
	$bias = 0;

$code.=<<___;
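################################################################################
# void ecp_nistz256_point_add(P256_POINT *out, const P256_POINT *a,
#                             const P256_POINT *b);
# Standard Jacobian-coordinate addition:
#	Z1sqr = Z1^2,  Z2sqr = Z2^2
#	U1 = X1*Z2sqr, U2 = X2*Z1sqr, H = U2 - U1
#	S1 = Y1*Z2^3,  S2 = Y2*Z1^3,  R = S2 - S1
#	X3 = R^2 - H^3 - 2*U1*H^2
#	Y3 = R*(U1*H^2 - X3) - S1*H^3
#	Z3 = H*Z1*Z2
# with the special cases (either input at infinity, doubling, P + (-P))
# detected on the way and handled by the branches and masked copies below.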
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,\@function,3
.align	32
ecp_nistz256_point_add:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_addx
___
    } else {
	$src0 = "%rdx";
	$sfx  = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_addx,\@function,3
.align	32
ecp_nistz256_point_addx:
.Lpoint_addx:
___
    }
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*18+8, %rsp

	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	mov	$a_ptr, $b_ptr			# reassign
	mov	$b_org, $a_ptr			# reassign
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	por	%xmm4, %xmm5

	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
	pshufd	\$0xb1, %xmm5, %xmm3
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	por	%xmm3, %xmm5
	movdqu	0x30($a_ptr), %xmm3
	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
	 mov	0x40+8*1($a_ptr), $acc6
	 mov	0x40+8*2($a_ptr), $acc7
	 mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in2_x(%rsp)
	pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	movdqu	0x40($a_ptr),%xmm0		# in2_z again
	movdqu	0x50($a_ptr),%xmm1
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	por	%xmm4, %xmm5
	pxor	%xmm4, %xmm4
	por	%xmm0, %xmm1
	 movq	$r_ptr, %xmm0			# save $r_ptr

	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
	 mov	$acc6, $in2_z+8*1(%rsp)
	 mov	$acc7, $in2_z+8*2(%rsp)
	 mov	$acc0, $in2_z+8*3(%rsp)
	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);

	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm1, %xmm4
	por	%xmm1, %xmm4
	pshufd	\$0, %xmm5, %xmm5		# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	por	%xmm3, %xmm4
	pxor	%xmm3, %xmm3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4		# in2infty
	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
	 mov	0x40+8*1($b_ptr), $acc6
	 mov	0x40+8*2($b_ptr), $acc7
	 mov	0x40+8*3($b_ptr), $acc0
	movq	$b_ptr, %xmm1

	lea	0x40-$bias($b_ptr), $a_ptr
	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);

	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);

	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);

	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);

	lea	$S1(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
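	# Degenerate inputs are sorted out on the R and H zero tests below:
	# H == 0 with R != 0 means P == -Q, so the result is the point at
	# infinity (all zeros); H == 0 with R == 0 means P == Q and we branch
	# to the doubling code; if either input is at infinity, the masked
	# copies at the end pass the other input through unchanged.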
	or	$acc5, $acc4			# see if result is zero
	movdqa	%xmm4, %xmm2
	or	$acc0, $acc4
	or	$acc1, $acc4
	por	%xmm5, %xmm2			# in1infty || in2infty
	movq	$acc4, %xmm3

	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);

	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);

	lea	$U1(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);

	or	$acc5, $acc4			# see if result is zero
	or	$acc0, $acc4
	or	$acc1, $acc4

	.byte	0x3e				# predict taken
	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
	movq	%xmm2, $acc0
	movq	%xmm3, $acc1
	test	$acc0, $acc0
	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
	test	$acc1, $acc1
	jz	.Ladd_double$x			# is_equal(S1,S2)?

	movq	%xmm0, $r_ptr			# restore $r_ptr
	pxor	%xmm0, %xmm0
	movdqu	%xmm0, 0x00($r_ptr)
	movdqu	%xmm0, 0x10($r_ptr)
	movdqu	%xmm0, 0x20($r_ptr)
	movdqu	%xmm0, 0x30($r_ptr)
	movdqu	%xmm0, 0x40($r_ptr)
	movdqu	%xmm0, 0x50($r_ptr)
	jmp	.Ladd_done$x

.align	32
.Ladd_double$x:
	movq	%xmm1, $a_ptr			# restore $a_ptr
	movq	%xmm0, $r_ptr			# restore $r_ptr
	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
	jmp	.Lpoint_double_shortcut$x

.align	32
.Ladd_proceed$x:
	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);

	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
___
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr		# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2		# ecp_nistz256_mul_by_2(Hsqr, U2);

	xor	$t4, $t4
	add	$acc0, $acc0			# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	adc	$acc1, $acc1
	 mov	$acc0, $t0
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	 mov	$acc1, $t1
	adc	\$0, $t4

	sub	\$-1, $acc0
	 mov	$acc2, $t2
	sbb	$poly1, $acc1
	sbb	\$0, $acc2
	 mov	$acc3, $t3
	sbb	$poly3, $acc3
	sbb	\$0, $t4

	cmovc	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovc	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovc	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovc	$t3, $acc3
	mov	8*3($a_ptr), $t3

	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
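
	# At this point the registers hold R^2 - 2*U1*H^2: the add/adc run
	# above doubled U2 = U1*H^2 in place of the commented-out
	# mul_by_2 call, reduced it with the cmov sequence, and the sub
	# just executed took it from Rsqr.  Subtracting Hcub below
	# completes X3 = R^2 - H^3 - 2*U1*H^2.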

	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$res_y(%rsp), $r_ptr

	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't write it back
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);

	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);

	movq	%xmm0, $r_ptr			# restore $r_ptr

	movdqa	%xmm5, %xmm0			# copy_conditional(res_z, in2_z, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_z(%rsp), %xmm2
	pand	$in2_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)

	movdqa	%xmm5, %xmm0			# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0			# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

.Ladd_done$x:
	add	\$32*18+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
___
}
&gen_add("q");

sub gen_add_affine () {
    my $x = shift;
    my ($src0,$sfx,$bias);
    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
	$res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y)=map(32*$_,(0..14));
    my $Z1sqr = $S2;
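
# The affine frame is 32*15 bytes, against 32*18 for the full add:
# there is no in2_z slot, since the second input's Z coordinate is
# implicitly 1 (.LONE_mont in Montgomery form), and $Z1sqr can share
# storage with $S2 because Z1^2 is consumed before S2 is written.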

    if ($x ne "x") {
	$src0 = "%rax";
	$sfx  = "";
	$bias = 0;

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,\@function,3
.align	32
ecp_nistz256_point_add_affine:
___
$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_add_affinex
___
    } else {
	$src0 = "%rdx";
	$sfx  = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_add_affinex,\@function,3
.align	32
ecp_nistz256_point_add_affinex:
.Lpoint_add_affinex:
___
    }
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*15+8, %rsp

	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
	mov	$b_org, $b_ptr			# reassign
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	mov	0x40+8*0($a_ptr), $src0		# load original in1_z
	mov	0x40+8*1($a_ptr), $acc6
	mov	0x40+8*2($a_ptr), $acc7
	mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	por	%xmm4, %xmm5

	movdqu	0x00($b_ptr), %xmm0		# copy *(P256_POINT_AFFINE *)$b_ptr
	pshufd	\$0xb1, %xmm5, %xmm3
	movdqu	0x10($b_ptr), %xmm1
	movdqu	0x20($b_ptr), %xmm2
	por	%xmm3, %xmm5
	movdqu	0x30($b_ptr), %xmm3
	movdqa	%xmm0, $in2_x(%rsp)
	pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	por	%xmm0, %xmm1
	 movq	$r_ptr, %xmm0			# save $r_ptr
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	por	%xmm2, %xmm3
	por	%xmm4, %xmm5
	pxor	%xmm4, %xmm4
	por	%xmm1, %xmm3

	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);

	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm3, %xmm4
	mov	0x00($b_ptr), $src0		# $b_ptr is still valid
	#lea	0x00($b_ptr), $b_ptr
	mov	$acc4, $acc1			# harmonize sqr output and mul input
	por	%xmm3, %xmm4
	pshufd	\$0, %xmm5, %xmm5		# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	mov	$acc5, $acc2
	por	%xmm3, %xmm4
	pxor	%xmm3, %xmm3
	mov	$acc6, $acc3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4		# in2infty

	lea	$Z1sqr-$bias(%rsp), $a_ptr
	mov	$acc7, $acc4
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);

	lea	$in1_x(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);

	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
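
	# Z2 == 1 in the affine case, so U1 = in1_x and S1 = in1_y come
	# for free; only U2 = X2*Z1^2 and S2 = Y2*Z1^3 had to be
	# computed, and H and R are taken directly against in1_x/in1_y.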

	lea	$in1_y(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
___
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);

$code.=<<___;
	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr		# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2		# ecp_nistz256_mul_by_2(Hsqr, U2);

	xor	$t4, $t4
	add	$acc0, $acc0			# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	adc	$acc1, $acc1
	 mov	$acc0, $t0
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	 mov	$acc1, $t1
	adc	\$0, $t4

	sub	\$-1, $acc0
	 mov	$acc2, $t2
	sbb	$poly1, $acc1
	sbb	\$0, $acc2
	 mov	$acc3, $t3
	sbb	$poly3, $acc3
	sbb	\$0, $t4

	cmovc	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovc	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovc	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovc	$t3, $acc3
	mov	8*3($a_ptr), $t3

	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);

	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$H(%rsp), $r_ptr

	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't write it back
	mov	$acc2, 8*2($r_ptr)
	mov	$acc3, 8*3($r_ptr)
___
}
$code.=<<___;
	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);

	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
	lea	$H(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);

	movq	%xmm0, $r_ptr			# restore $r_ptr

	movdqa	%xmm5, %xmm0			# copy_conditional(res_z, ONE, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	.LONE_mont(%rip), %xmm2
	pand	.LONE_mont+0x10(%rip), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)
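
	# The pand/pandn pairs above and below implement a constant-time
	# select: %xmm5 (in1infty) and %xmm4 (in2infty) are all-ones or
	# all-zeros masks produced by pcmpeqd, so each coordinate ends up
	# as "infty ? special-case value : computed value" with no
	# data-dependent branches.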

	movdqa	%xmm5, %xmm0			# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0			# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0			# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

	add	\$32*15+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
___
}
&gen_add_affine("q");

########################################################################
# AD*X magic
#
if ($addx) { {
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

$code.=<<___;
.type	__ecp_nistz256_add_tox,\@abi-omnipotent
.align	32
__ecp_nistz256_add_tox:
	xor	$t4, $t4
	adc	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	 mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	 mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	 mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	 mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox

.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromx:
	xor	$t4, $t4
	sbb	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	 mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	 mov	$a1, $t1
	sbb	\$0, $t4

	xor	$t3, $t3
	adc	\$-1, $a0
	 mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	 mov	$a3, $t3
	adc	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
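
	# Unlike the helpers above, __ecp_nistz256_subx takes its minuend
	# pre-loaded in t0..t3 (the subtrahend sits in a0..a3) and leaves
	# the reduced result in a0..a3 only; callers that need the value
	# in memory store those registers themselves.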

.type	__ecp_nistz256_subx,\@abi-omnipotent
.align	32
__ecp_nistz256_subx:
	xor	$t4, $t4
	sbb	$a0, $t0
	sbb	$a1, $t1
	 mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	 mov	$t1, $a1
	sbb	\$0, $t4

	xor	$a3, $a3
	adc	\$-1, $t0
	 mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	 mov	$t3, $a3
	adc	$poly3, $t3

	bt	\$0, $t4
	cmovc	$t0, $a0
	cmovc	$t1, $a1
	cmovc	$t2, $a2
	cmovc	$t3, $a3

	ret
.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx

.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2x:
	xor	$t4, $t4
	adc	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	 mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	 mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	 mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	 mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
___
}
&gen_double("x");
&gen_add("x");
&gen_add_affine("x");
}
}}}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;