1#!/usr/bin/env perl 2# Copyright 2018 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8# 9# ==================================================================== 10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 11# project. The module is, however, dual licensed under OpenSSL and 12# CRYPTOGAMS licenses depending on where you obtain it. For further 13# details see http://www.openssl.org/~appro/cryptogams/. 14# ==================================================================== 15# 16# X25519 lower-level primitives for x86_64. 17# 18# February 2018. 19# 20# This module implements radix 2^51 multiplication and squaring, and 21# radix 2^64 multiplication, squaring, addition, subtraction and final 22# reduction. Latter radix is used on ADCX/ADOX-capable processors such 23# as Broadwell. On related note one should mention that there are 24# vector implementations that provide significantly better performance 25# on some processors(*), but they are large and overly complex. Which 26# in combination with them being effectively processor-specific makes 27# the undertaking hard to justify. The goal for this implementation 28# is rather versatility and simplicity [and ultimately formal 29# verification]. 30# 31# (*) For example sandy2x should provide ~30% improvement on Sandy 32# Bridge, but only nominal ~5% on Haswell [and big loss on 33# Broadwell and successors]. 
#
######################################################################
# Improvement coefficients:
#
#			amd64-51(*)	gcc-5.x(**)
#
# P4			+22%		+40%
# Sandy Bridge		-3%		+11%
# Haswell		-1%		+13%
# Broadwell(***)	+30%		+35%
# Skylake(***)		+33%		+47%
# Silvermont		+20%		+26%
# Goldmont		+40%		+50%
# Bulldozer		+20%		+9%
# Ryzen(***)		+43%		+40%
# VIA			+170%		+120%
#
# (*)	amd64-51 is popular assembly implementation with 2^51 radix,
#	only multiplication and squaring subroutines were linked
#	for comparison, but not complete ladder step; gain on most
#	processors is because this module refrains from shld, and
#	minor regression on others is because this does result in
#	higher instruction count;
# (**)	compiler is free to inline functions, in assembly one would
#	need to implement ladder step to do that, and it will improve
#	performance by several percent;
# (***)	ADCX/ADOX result for 2^64 radix, there is no corresponding
#	C implementation, so that comparison is always against
#	2^51 radix;

# Usual perlasm calling convention: optional "flavour" (elf, macosx,
# nasm, masm, mingw64, ...) followed by the output file name.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator relative to this script.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe all of our output through the translator; a failure to spawn it
# must be fatal, otherwise we would silently emit nothing.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Probe the toolchain for ADCX/ADOX support; $addx gates emission of the
# radix-2^64 code path further down.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);	# binutils 2.23 introduced ADX mnemonics
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1`
    =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

$code.=<<___;
.text

# void x25519_fe51_mul(h[5], f[5], g[5]) - radix-2^51 multiplication;
# 3 arguments per \@function annotation: %rdi=h, %rsi=f, %rdx=g.
.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function,3
.align	32
x25519_fe51_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul_body:

	mov	8*0(%rsi),%rax		# f[0]
	mov	8*0(%rdx),%r11		# load g[0-4]
	mov	8*1(%rdx),%r12
	mov	8*2(%rdx),%r13
	mov	8*3(%rdx),%rbp
	mov	8*4(%rdx),%r14

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	mov	%rax,%rdi
	mulq	%r11			# f[0]*g[0]
	mov	%r11,8*0(%rsp)		# offload g[0]
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	%rdi,%rax
	mov	%rdx,%rcx
	mulq	%r12			# f[0]*g[1]
	mov	%r12,8*1(%rsp)		# offload g[1]
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	%rdi,%rax
	lea	(%r14,%r14,8),%r15
	mov	%rdx,%r9
	mulq	%r13			# f[0]*g[2]
	mov	%r13,8*2(%rsp)		# offload g[2]
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	%rdi,%rax
	lea	(%r14,%r15,2),%rdi	# g[4]*19
	mov	%rdx,%r11
	mulq	%rbp			# f[0]*g[3]
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	8*0(%rsi),%rax		# f[0]
	mov	%rdx,%r13
	mulq	%r14			# f[0]*g[4]
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	8*1(%rsi),%rax		# f[1]
	mov	%rdx,%r15

	mulq	%rdi			# f[1]*g[4]*19
	add	%rax,%rbx
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%rcx
	mulq	%rdi			# f[2]*g[4]*19
	add	%rax,%r8
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r9
	mulq	%rdi			# f[3]*g[4]*19
	add	%rax,%r10
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r11
	mulq	%rdi			# f[4]*g[4]*19
	imulq	\$19,%rbp,%rdi		# g[3]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r13
	mulq	%rbp			# f[1]*g[3]
	mov	8*2(%rsp),%rbp		# g[2]
	add	%rax,%r14
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r15

	mulq	%rdi			# f[2]*g[3]*19
	add	%rax,%rbx
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%rcx
	mulq	%rdi			# f[3]*g[3]*19
	add	%rax,%r8
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r9
	mulq	%rdi			# f[4]*g[3]*19
	imulq	\$19,%rbp,%rdi		# g[2]*19
	add	%rax,%r10
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r11
	mulq	%rbp			# f[1]*g[2]
	add	%rax,%r12
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r13
	mulq	%rbp			# f[2]*g[2]
	mov	8*1(%rsp),%rbp		# g[1]
	add	%rax,%r14
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r15

	mulq	%rdi			# f[3]*g[2]*19
	add	%rax,%rbx
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%rcx
	mulq	%rdi			# f[4]*g[2]*19
	add	%rax,%r8
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r9
	mulq	%rbp			# f[1]*g[1]
	imulq	\$19,%rbp,%rdi
	add	%rax,%r10
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r11
	mulq	%rbp			# f[2]*g[1]
	add	%rax,%r12
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r13
	mulq	%rbp			# f[3]*g[1]
	mov	8*0(%rsp),%rbp		# g[0]
	add	%rax,%r14
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r15

	mulq	%rdi			# f[4]*g[1]*19
	add	%rax,%rbx
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%rcx
	mulq	%rbp			# f[1]*g[0]
	add	%rax,%r8
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r9
	mulq	%rbp			# f[2]*g[0]
	add	%rax,%r10
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r11
	mulq	%rbp			# f[3]*g[0]
	add	%rax,%r12
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r13
	mulq	%rbp			# f[4]*g[0]
	add	%rax,%r14
	adc	%rdx,%r15

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51
.Lfe51_mul_epilogue:
.cfi_endproc
.size	x25519_fe51_mul,.-x25519_fe51_mul

# void x25519_fe51_sqr(h[5], g[5]) - radix-2^51 squaring;
# shares the carry-propagation tail .Lreduce51 with fe51_mul.
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function,2
.align	32
x25519_fe51_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_sqr_body:

	mov	8*0(%rsi),%rax		# g[0]
	mov	8*2(%rsi),%r15		# g[2]
	mov	8*4(%rsi),%rbp		# g[4]

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	lea	(%rax,%rax),%r14
	mulq	%rax			# g[0]*g[0]
	mov	%rax,%rbx
	mov	8*1(%rsi),%rax		# g[1]
	mov	%rdx,%rcx
	mulq	%r14			# 2*g[0]*g[1]
	mov	%rax,%r8
	mov	%r15,%rax
	mov	%r15,8*0(%rsp)		# offload g[2]
	mov	%rdx,%r9
	mulq	%r14			# 2*g[0]*g[2]
	mov	%rax,%r10
	mov	8*3(%rsi),%rax
	mov	%rdx,%r11
	imulq	\$19,%rbp,%rdi		# g[4]*19
	mulq	%r14			# 2*g[0]*g[3]
	mov	%rax,%r12
	mov	%rbp,%rax
	mov	%rdx,%r13
	mulq	%r14			# 2*g[0]*g[4]
	mov	%rax,%r14
	mov	%rbp,%rax
	mov	%rdx,%r15

	mulq	%rdi			# g[4]*g[4]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# g[1]
	adc	%rdx,%r13

	mov	8*3(%rsi),%rsi		# g[3]
	lea	(%rax,%rax),%rbp
	mulq	%rax			# g[1]*g[1]
	add	%rax,%r10
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r11
	mulq	%rbp			# 2*g[1]*g[2]
	add	%rax,%r12
	mov	%rbp,%rax
	adc	%rdx,%r13
	mulq	%rsi			# 2*g[1]*g[3]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	imulq	\$19,%rsi,%rbp		# g[3]*19
	mulq	%rdi			# 2*g[1]*g[4]*19
	add	%rax,%rbx
	lea	(%rsi,%rsi),%rax
	adc	%rdx,%rcx

	mulq	%rdi			# 2*g[3]*g[4]*19
	add	%rax,%r10
	mov	%rsi,%rax
	adc	%rdx,%r11
	mulq	%rbp			# g[3]*g[3]*19
	add	%rax,%r8
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r9

	lea	(%rax,%rax),%rsi
	mulq	%rax			# g[2]*g[2]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	mulq	%rsi			# 2*g[2]*g[3]*19
	add	%rax,%rbx
	mov	%rsi,%rax
	adc	%rdx,%rcx
	mulq	%rdi			# 2*g[2]*g[4]*19
	add	%rax,%r8
	adc	%rdx,%r9

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51

# Shared tail: fold 128-bit limb products h0-h4 back to 51-bit limbs,
# store to (%rdi), restore registers and return.
.align	32
.Lreduce51:
	mov	\$0x7ffffffffffff,%rbp

	mov	%r10,%rdx
	shr	\$51,%r10
	shl	\$13,%r11
	and	%rbp,%rdx		# %rdx = g2 = h2 & mask
	or	%r10,%r11		# h2>>51
	add	%r11,%r12
	adc	\$0,%r13		# h3 += h2>>51

	mov	%rbx,%rax
	shr	\$51,%rbx
	shl	\$13,%rcx
	and	%rbp,%rax		# %rax = g0 = h0 & mask
	or	%rbx,%rcx		# h0>>51
	add	%rcx,%r8		# h1 += h0>>51
	adc	\$0,%r9

	mov	%r12,%rbx
	shr	\$51,%r12
	shl	\$13,%r13
	and	%rbp,%rbx		# %rbx = g3 = h3 & mask
	or	%r12,%r13		# h3>>51
	add	%r13,%r14		# h4 += h3>>51
	adc	\$0,%r15

	mov	%r8,%rcx
	shr	\$51,%r8
	shl	\$13,%r9
	and	%rbp,%rcx		# %rcx = g1 = h1 & mask
	or	%r8,%r9
	add	%r9,%rdx		# g2 += h1>>51

	mov	%r14,%r10
	shr	\$51,%r14
	shl	\$13,%r15
	and	%rbp,%r10		# %r10 = g4 = h4 & mask
	or	%r14,%r15		# h4>>51

	lea	(%r15,%r15,8),%r14	# (h4>>51)*9
	lea	(%r15,%r14,2),%r15	# (h4>>51)*19
	add	%r15,%rax		# g0 += (h4>>51)*19

	mov	%rdx,%r8
	and	%rbp,%rdx		# g2 &= mask
	shr	\$51,%r8
	add	%r8,%rbx		# g3 += g2>>51

	mov	%rax,%r9
	and	%rbp,%rax		# g0 &= mask
	shr	\$51,%r9
	add	%r9,%rcx		# g1 += g0>>51

	mov	%rax,8*0(%rdi)		# save the result
	mov	%rcx,8*1(%rdi)
	mov	%rdx,8*2(%rdi)
	mov	%rbx,8*3(%rdi)
	mov	%r10,8*4(%rdi)

	mov	8*5(%rsp),%r15
.cfi_restore	%r15
	mov	8*6(%rsp),%r14
.cfi_restore	%r14
	mov	8*7(%rsp),%r13
.cfi_restore	%r13
	mov	8*8(%rsp),%r12
.cfi_restore	%r12
	mov	8*9(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*10(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*11(%rsp),%rsp
.cfi_adjust_cfa_offset	-88	# lea above reclaims 88 bytes, CFA shrinks
.Lfe51_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe51_sqr,.-x25519_fe51_sqr

# void x25519_fe51_mul121666(h[5], f[5]) - multiply by the curve
# constant 121666; reuses .Lreduce51 for the carry propagation.
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function,2
.align	32
x25519_fe51_mul121666:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul121666_body:
	mov	\$121666,%eax

	mulq	8*0(%rsi)
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	\$121666,%eax
	mov	%rdx,%rcx
	mulq	8*1(%rsi)
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	\$121666,%eax
	mov	%rdx,%r9
	mulq	8*2(%rsi)
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	\$121666,%eax
	mov	%rdx,%r11
	mulq	8*3(%rsi)
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	\$121666,%eax
	mov	%rdx,%r13
	mulq	8*4(%rsi)
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	%rdx,%r15

	jmp	.Lreduce51
.Lfe51_mul121666_epilogue:
.cfi_endproc
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
########################################################################
# Base 2^64 subroutines modulo 2*(2^255-19)
#
if ($addx) {
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));

$code.=<<___;
.extern	OPENSSL_ia32cap_P

# Returns non-zero when both ADX and BMI2 capability bits (0x80100 in
# the 3rd dword of OPENSSL_ia32cap_P) are set, i.e. when the fe64
# routines below may be used.
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
	mov	OPENSSL_ia32cap_P+8(%rip),%ecx
	xor	%eax,%eax
	and	\$0x80100,%ecx
	cmp	\$0x80100,%ecx
	cmove	%ecx,%eax
	ret
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

# void x25519_fe64_mul(h[4], a[4], b[4]) - radix-2^64 multiplication
# using MULX/ADCX/ADOX dual carry chains; result reduced by .Lreduce64.
.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function,3
.align	32
x25519_fe64_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_mul_body:

	mov	%rdx,%rax
	mov	8*0(%rdx),%rbp		# b[0]
	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rax),%rcx		# b[1]
	mov	8*2(%rax),$acc6		# b[2]
	mov	8*3(%rax),$acc7		# b[3]

	mulx	%rbp,$acc0,%rax		# a[0]*b[0]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rcx,$acc1,%rbx		# a[0]*b[1]
	adcx	%rax,$acc1
	mulx	$acc6,$acc2,%rax	# a[0]*b[2]
	adcx	%rbx,$acc2
	mulx	$acc7,$acc3,$acc4	# a[0]*b[3]
	mov	8*1(%rsi),%rdx		# a[1]
	adcx	%rax,$acc3
	mov	$acc6,(%rsp)		# offload b[2]
	adcx	%rdi,$acc4		# cf=0

	mulx	%rbp,%rax,%rbx		# a[1]*b[0]
	adox	%rax,$acc1
	adcx	%rbx,$acc2
	mulx	%rcx,%rax,%rbx		# a[1]*b[1]
	adox	%rax,$acc2
	adcx	%rbx,$acc3
	mulx	$acc6,%rax,%rbx		# a[1]*b[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	$acc7,%rax,$acc5	# a[1]*b[3]
	mov	8*2(%rsi),%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5		# cf=0
	adox	%rdi,$acc5		# of=0

	mulx	%rbp,%rax,%rbx		# a[2]*b[0]
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	%rcx,%rax,%rbx		# a[2]*b[1]
	adcx	%rax,$acc3
	adox	%rbx,$acc4
	mulx	$acc6,%rax,%rbx		# a[2]*b[2]
	adcx	%rax,$acc4
	adox	%rbx,$acc5
	mulx	$acc7,%rax,$acc6	# a[2]*b[3]
	mov	8*3(%rsi),%rdx		# a[3]
	adcx	%rax,$acc5
	adox	%rdi,$acc6		# of=0
	adcx	%rdi,$acc6		# cf=0

	mulx	%rbp,%rax,%rbx		# a[3]*b[0]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rcx,%rax,%rbx		# a[3]*b[1]
	adox	%rax,$acc4
	adcx	%rbx,$acc5
	mulx	(%rsp),%rax,%rbx	# a[3]*b[2]
	adox	%rax,$acc5
	adcx	%rbx,$acc6
	mulx	$acc7,%rax,$acc7	# a[3]*b[3]
	mov	\$38,%edx
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0

	jmp	.Lreduce64
.Lfe64_mul_epilogue:
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul

# void x25519_fe64_sqr(h[4], a[4]) - radix-2^64 squaring; off-diagonal
# products are computed once, doubled, then diagonals folded in.
.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function,2
.align	32
x25519_fe64_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_sqr_body:

	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rsi),%rcx		# a[1]
	mov	8*2(%rsi),%rbp		# a[2]
	mov	8*3(%rsi),%rsi		# a[3]

	################################################################
	mulx	%rdx,$acc0,$acc7	# a[0]*a[0]
	mulx	%rcx,$acc1,%rax		# a[0]*a[1]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rbp,$acc2,%rbx		# a[0]*a[2]
	adcx	%rax,$acc2
	mulx	%rsi,$acc3,$acc4	# a[0]*a[3]
	mov	%rcx,%rdx		# a[1]
	adcx	%rbx,$acc3
	adcx	%rdi,$acc4		# cf=0

	################################################################
	mulx	%rbp,%rax,%rbx		# a[1]*a[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rsi,%rax,$acc5		# a[1]*a[3]
	mov	%rbp,%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5

	################################################################
	mulx	%rsi,%rax,$acc6		# a[2]*a[3]
	mov	%rcx,%rdx		# a[1]
	adox	%rax,$acc5
	adcx	%rdi,$acc6		# cf=0
	adox	%rdi,$acc6		# of=0

	adcx	$acc1,$acc1		# acc1:6<<1
	adox	$acc7,$acc1
	adcx	$acc2,$acc2
	mulx	%rdx,%rax,%rbx		# a[1]*a[1]
	mov	%rbp,%rdx		# a[2]
	adcx	$acc3,$acc3
	adox	%rax,$acc2
	adcx	$acc4,$acc4
	adox	%rbx,$acc3
	mulx	%rdx,%rax,%rbx		# a[2]*a[2]
	mov	%rsi,%rdx		# a[3]
	adcx	$acc5,$acc5
	adox	%rax,$acc4
	adcx	$acc6,$acc6
	adox	%rbx,$acc5
	mulx	%rdx,%rax,$acc7		# a[3]*a[3]
	mov	\$38,%edx
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0
	jmp	.Lreduce64

# Shared tail: fold the 512-bit product acc0-7 modulo 2*(2^255-19)
# (multiply high half by 38), store to dst, restore and return.
# On entry %edx=38, %edi=0.
.align	32
.Lreduce64:
	mulx	$acc4,%rax,%rbx
	adcx	%rax,$acc0
	adox	%rbx,$acc1
	mulx	$acc5,%rax,%rbx
	adcx	%rax,$acc1
	adox	%rbx,$acc2
	mulx	$acc6,%rax,%rbx
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	$acc7,%rax,$acc4
	adcx	%rax,$acc3
	adox	%rdi,$acc4
	adcx	%rdi,$acc4

	mov	8*2(%rsp),%rdi		# restore dst
	imulq	%rdx,$acc4		# carry word * 38

	add	$acc4,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

	mov	8*3(%rsp),%r15
.cfi_restore	%r15
	mov	8*4(%rsp),%r14
.cfi_restore	%r14
	mov	8*5(%rsp),%r13
.cfi_restore	%r13
	mov	8*6(%rsp),%r12
.cfi_restore	%r12
	mov	8*7(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*8(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*9(%rsp),%rsp
.cfi_adjust_cfa_offset	-72	# 7 pushes + 16 = 72 bytes reclaimed by lea
.Lfe64_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sqr,.-x25519_fe64_sqr

# void x25519_fe64_mul121666(h[4], f[4]) - leaf routine, no frame.
.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function,2
.align	32
x25519_fe64_mul121666:
.Lfe64_mul121666_body:
	mov	\$121666,%edx
	mulx	8*0(%rsi),$acc0,%rcx
	mulx	8*1(%rsi),$acc1,%rax
	add	%rcx,$acc1
	mulx	8*2(%rsi),$acc2,%rcx
	adc	%rax,$acc2
	mulx	8*3(%rsi),$acc3,%rax
	adc	%rcx,$acc3
	adc	\$0,%rax

	imulq	\$38,%rax,%rax

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

.Lfe64_mul121666_epilogue:
	ret
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

# void x25519_fe64_add(h[4], f[4], g[4]) - leaf routine, no frame.
.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function,3
.align	32
x25519_fe64_add:
.Lfe64_add_body:
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	add	8*0(%rdx),$acc0
	adc	8*1(%rdx),$acc1
	adc	8*2(%rdx),$acc2
	adc	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	adc	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_add_epilogue:
	ret
.size	x25519_fe64_add,.-x25519_fe64_add

# void x25519_fe64_sub(h[4], f[4], g[4]) - leaf routine, no frame.
.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function,3
.align	32
x25519_fe64_sub:
.Lfe64_sub_body:
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	sub	8*0(%rdx),$acc0
	sbb	8*1(%rdx),$acc1
	sbb	8*2(%rdx),$acc2
	sbb	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	sbb	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax

	sub	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_sub_epilogue:
	ret
.size	x25519_fe64_sub,.-x25519_fe64_sub

# void x25519_fe64_tobytes(out[4], f[4]) - final reduction to a fully
# reduced value modulo 2^255-19; leaf routine, no frame.
.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function,2
.align	32
x25519_fe64_tobytes:
.Lfe64_to_body:
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	################################# reduction modulo 2^255-19
	lea	($acc3,$acc3),%rax
	sar	\$63,$acc3		# most significant bit -> mask
	shr	\$1,%rax		# most significant bit cleared
	and	\$19,$acc3
	add	\$19,$acc3		# compare to modulus in the same go

	add	$acc3,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,%rax

	lea	(%rax,%rax),$acc3
	sar	\$63,%rax		# most significant bit -> mask
	shr	\$1,$acc3		# most significant bit cleared
	not	%rax
	and	\$19,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	sbb	\$0,$acc3

	mov	$acc0,8*0(%rdi)
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)

.Lfe64_to_epilogue:
	ret
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
} else {
# Toolchain can't assemble ADCX/ADOX: emit stubs. x25519_fe64_eligible
# returns 0 so callers never reach the ud2-trapped entry points.
$code.=<<___;
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
	xor	%eax,%eax
	ret
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@abi-omnipotent
.globl	x25519_fe64_sqr
.globl	x25519_fe64_mul121666
.globl	x25519_fe64_add
.globl	x25519_fe64_sub
.globl	x25519_fe64_tobytes
x25519_fe64_mul:
x25519_fe64_sqr:
x25519_fe64_mul121666:
x25519_fe64_add:
x25519_fe64_sub:
x25519_fe64_tobytes:
	.byte	0x0f,0x0b	# ud2
	ret
.size	x25519_fe64_mul,.-x25519_fe64_mul
___
}
$code.=<<___;
.asciz	"X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind

# SEH handler for frame-less leaf routines: if the fault is past the
# prologue label, the frame pointer is simply context->Rsp.
.type	short_handler,\@abi-omnipotent
.align	16
short_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp
	jmp	.Lcommon_seh_tail
.size	short_handler,.-short_handler

# SEH handler for routines with a full prologue: between the body and
# epilogue labels the saved non-volatile registers are recovered from
# the stack at HandlerData[2] bytes above Rsp.
.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rax,%r10),%rax

	mov	-8(%rax),%rbp
	mov	-16(%rax),%rbx
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	full_handler,.-full_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_x25519_fe51_mul
	.rva	.LSEH_end_x25519_fe51_mul
	.rva	.LSEH_info_x25519_fe51_mul

	.rva	.LSEH_begin_x25519_fe51_sqr
	.rva	.LSEH_end_x25519_fe51_sqr
	.rva	.LSEH_info_x25519_fe51_sqr

	.rva	.LSEH_begin_x25519_fe51_mul121666
	.rva	.LSEH_end_x25519_fe51_mul121666
	.rva	.LSEH_info_x25519_fe51_mul121666
___
$code.=<<___	if ($addx);
	.rva	.LSEH_begin_x25519_fe64_mul
	.rva	.LSEH_end_x25519_fe64_mul
	.rva	.LSEH_info_x25519_fe64_mul

	.rva	.LSEH_begin_x25519_fe64_sqr
	.rva	.LSEH_end_x25519_fe64_sqr
	.rva	.LSEH_info_x25519_fe64_sqr

	.rva	.LSEH_begin_x25519_fe64_mul121666
	.rva	.LSEH_end_x25519_fe64_mul121666
	.rva	.LSEH_info_x25519_fe64_mul121666

	.rva	.LSEH_begin_x25519_fe64_add
	.rva	.LSEH_end_x25519_fe64_add
	.rva	.LSEH_info_x25519_fe64_add

	.rva	.LSEH_begin_x25519_fe64_sub
	.rva	.LSEH_end_x25519_fe64_sub
	.rva	.LSEH_info_x25519_fe64_sub

	.rva	.LSEH_begin_x25519_fe64_tobytes
	.rva	.LSEH_end_x25519_fe64_tobytes
	.rva	.LSEH_info_x25519_fe64_tobytes
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_x25519_fe51_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul_body,.Lfe51_mul_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_sqr_body,.Lfe51_sqr_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_mul121666:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul121666_body,.Lfe51_mul121666_epilogue	# HandlerData[]
	.long	88,0
___
$code.=<<___	if ($addx);
.LSEH_info_x25519_fe64_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_mul_body,.Lfe64_mul_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_sqr_body,.Lfe64_sqr_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_mul121666:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_mul121666_body,.Lfe64_mul121666_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_add:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_add_body,.Lfe64_add_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_sub:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_sub_body,.Lfe64_sub_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_tobytes:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_to_body,.Lfe64_to_epilogue	# HandlerData[]
___
}

# Expand `...` snippets embedded in the assembly, emit, and make sure
# the pipe to the translator drained without error.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";