1#! /usr/bin/env perl 2# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# This module implements Poly1305 hash for x86_64. 18# 19# March 2015 20# 21# Initial release. 22# 23# December 2016 24# 25# Add AVX512F+VL+BW code path. 26# 27# November 2017 28# 29# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be 30# executed even on Knights Landing. Trigger for modification was 31# observation that AVX512 code paths can negatively affect overall 32# Skylake-X system performance. Since we are likely to suppress 33# AVX512F capability flag [at least on Skylake-X], conversion serves 34# as kind of "investment protection". Note that next *lake processor, 35# Cannolake, has AVX512IFMA code path to execute... 36# 37# Numbers are cycles per processed byte with poly1305_blocks alone, 38# measured with rdtsc at fixed clock frequency. 39# 40# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 41# P4 4.46/+120% - 42# Core 2 2.41/+90% - 43# Westmere 1.88/+120% - 44# Sandy Bridge 1.39/+140% 1.10 45# Haswell 1.14/+175% 1.11 0.65 46# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] 47# Silvermont 2.83/+95% - 48# Knights L 3.60/? 1.65 1.10 0.41(***) 49# Goldmont 1.70/+180% - 50# VIA Nano 1.82/+150% - 51# Sledgehammer 1.38/+160% - 52# Bulldozer 2.30/+130% 0.97 53# Ryzen 1.15/+200% 1.08 1.18 54# 55# (*) improvement coefficients relative to clang are more modest and 56# are ~50% on most processors, in both cases we are comparing to 57# __int128 code; 58# (**) SSE2 implementation was attempted, but among non-AVX processors 59# it was faster than integer-only code only on older Intel P4 and 60# Core processors, 50-30%, less newer processor is, but slower on 61# contemporary ones, for example almost 2x slower on Atom, and as 62# former are naturally disappearing, SSE2 is deemed unnecessary; 63# (***) strangely enough performance seems to vary from core to core, 64# listed result is best case; 65 66$flavour = shift; 67$output = shift; 68if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 69 70$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 71 72$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 73( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 74( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 75die "can't locate x86_64-xlate.pl"; 76 77if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 78 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 79 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); 80} 81 82if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 83 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { 84 $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); 85 $avx += 2 if ($1==2.11 && $2>=8); 86} 87 88if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 89 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 90 $avx = ($1>=10) + ($1>=12); 91} 92 93if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { 94 $avx = ($2>=3.0) + ($2>3.0); 95} 96 97open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 98*STDOUT=*OUT; 99 100my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); 101my ($mac,$nonce)=($inp,$len); # *_emit arguments 102my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); 103my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); 104 105sub poly1305_iteration { 106# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 107# output: $h0-$h2 *= $r0-$r1 108$code.=<<___; 109 mulq $h0 # h0*r1 110 mov %rax,$d2 111 mov $r0,%rax 112 mov %rdx,$d3 113 114 mulq $h0 # h0*r0 115 mov %rax,$h0 # future $h0 116 mov $r0,%rax 117 mov %rdx,$d1 118 119 mulq $h1 # h1*r0 120 add %rax,$d2 121 mov $s1,%rax 122 adc %rdx,$d3 123 124 mulq $h1 # h1*s1 125 mov $h2,$h1 # borrow $h1 126 add %rax,$h0 127 adc %rdx,$d1 128 129 imulq $s1,$h1 # h2*s1 130 add $h1,$d2 131 mov $d1,$h1 132 adc \$0,$d3 133 134 imulq $r0,$h2 # h2*r0 135 add $d2,$h1 136 mov \$-4,%rax # mask value 137 adc $h2,$d3 138 139 and $d3,%rax # last reduction step 140 mov $d3,$h2 141 shr \$2,$d3 142 and \$3,$h2 143 add $d3,%rax 144 add %rax,$h0 145 adc \$0,$h1 146 adc \$0,$h2 147___ 148} 149 150######################################################################## 151# Layout of opaque area is following. 152# 153# unsigned __int64 h[3]; # current hash value base 2^64 154# unsigned __int64 r[2]; # key value base 2^64 155 156$code.=<<___; 157.text 158 159.extern OPENSSL_ia32cap_P 160 161.globl poly1305_init 162.hidden poly1305_init 163.globl poly1305_blocks 164.hidden poly1305_blocks 165.globl poly1305_emit 166.hidden poly1305_emit 167 168.type poly1305_init,\@function,3 169.align 32 170poly1305_init: 171 xor %rax,%rax 172 mov %rax,0($ctx) # initialize hash value 173 mov %rax,8($ctx) 174 mov %rax,16($ctx) 175 176 cmp \$0,$inp 177 je .Lno_key 178 179 lea poly1305_blocks(%rip),%r10 180 lea poly1305_emit(%rip),%r11 181___ 182$code.=<<___ if ($avx); 183 mov OPENSSL_ia32cap_P+4(%rip),%r9 184 lea poly1305_blocks_avx(%rip),%rax 185 lea poly1305_emit_avx(%rip),%rcx 186 bt \$`60-32`,%r9 # AVX? 187 cmovc %rax,%r10 188 cmovc %rcx,%r11 189___ 190$code.=<<___ if ($avx>1); 191 lea poly1305_blocks_avx2(%rip),%rax 192 bt \$`5+32`,%r9 # AVX2? 193 cmovc %rax,%r10 194___ 195$code.=<<___ if ($avx>3); 196 mov \$`(1<<31|1<<21|1<<16)`,%rax 197 shr \$32,%r9 198 and %rax,%r9 199 cmp %rax,%r9 200 je .Linit_base2_44 201___ 202$code.=<<___; 203 mov \$0x0ffffffc0fffffff,%rax 204 mov \$0x0ffffffc0ffffffc,%rcx 205 and 0($inp),%rax 206 and 8($inp),%rcx 207 mov %rax,24($ctx) 208 mov %rcx,32($ctx) 209___ 210$code.=<<___ if ($flavour !~ /elf32/); 211 mov %r10,0(%rdx) 212 mov %r11,8(%rdx) 213___ 214$code.=<<___ if ($flavour =~ /elf32/); 215 mov %r10d,0(%rdx) 216 mov %r11d,4(%rdx) 217___ 218$code.=<<___; 219 mov \$1,%eax 220.Lno_key: 221 ret 222.size poly1305_init,.-poly1305_init 223 224.type poly1305_blocks,\@function,4 225.align 32 226poly1305_blocks: 227.cfi_startproc 228.Lblocks: 229 shr \$4,$len 230 jz .Lno_data # too short 231 232 push %rbx 233.cfi_push %rbx 234 push %rbp 235.cfi_push %rbp 236 push %r12 237.cfi_push %r12 238 push %r13 239.cfi_push %r13 240 push %r14 241.cfi_push %r14 242 push %r15 243.cfi_push %r15 244.Lblocks_body: 245 246 mov $len,%r15 # reassign $len 247 248 mov 24($ctx),$r0 # load r 249 mov 32($ctx),$s1 250 251 mov 0($ctx),$h0 # load hash value 252 mov 8($ctx),$h1 253 mov 16($ctx),$h2 254 255 mov $s1,$r1 256 shr \$2,$s1 257 mov $r1,%rax 258 add $r1,$s1 # s1 = r1 + (r1 >> 2) 259 jmp .Loop 260 261.align 32 262.Loop: 263 add 0($inp),$h0 # accumulate input 264 adc 8($inp),$h1 265 lea 16($inp),$inp 266 adc $padbit,$h2 267___ 268 &poly1305_iteration(); 269$code.=<<___; 270 mov $r1,%rax 271 dec %r15 # len-=16 272 jnz .Loop 273 274 mov $h0,0($ctx) # store hash value 275 mov $h1,8($ctx) 276 mov $h2,16($ctx) 277 278 mov 0(%rsp),%r15 279.cfi_restore %r15 280 mov 8(%rsp),%r14 281.cfi_restore %r14 282 mov 16(%rsp),%r13 283.cfi_restore %r13 284 mov 24(%rsp),%r12 285.cfi_restore %r12 286 mov 32(%rsp),%rbp 287.cfi_restore %rbp 288 mov 40(%rsp),%rbx 289.cfi_restore %rbx 290 lea 48(%rsp),%rsp 291.cfi_adjust_cfa_offset -48 292.Lno_data: 293.Lblocks_epilogue: 294 ret 295.cfi_endproc 296.size poly1305_blocks,.-poly1305_blocks 297 298.type poly1305_emit,\@function,3 299.align 32 300poly1305_emit: 301.Lemit: 302 mov 0($ctx),%r8 # load hash value 303 mov 8($ctx),%r9 304 mov 16($ctx),%r10 305 306 mov %r8,%rax 307 add \$5,%r8 # compare to modulus 308 mov %r9,%rcx 309 adc \$0,%r9 310 adc \$0,%r10 311 shr \$2,%r10 # did 130-bit value overflow? 312 cmovnz %r8,%rax 313 cmovnz %r9,%rcx 314 315 add 0($nonce),%rax # accumulate nonce 316 adc 8($nonce),%rcx 317 mov %rax,0($mac) # write result 318 mov %rcx,8($mac) 319 320 ret 321.size poly1305_emit,.-poly1305_emit 322___ 323if ($avx) { 324 325######################################################################## 326# Layout of opaque area is following. 327# 328# unsigned __int32 h[5]; # current hash value base 2^26 329# unsigned __int32 is_base2_26; 330# unsigned __int64 r[2]; # key value base 2^64 331# unsigned __int64 pad; 332# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; 333# 334# where r^n are base 2^26 digits of degrees of multiplier key. There are 335# 5 digits, but last four are interleaved with multiples of 5, totalling 336# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. 337 338my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = 339 map("%xmm$_",(0..15)); 340 341$code.=<<___; 342.type __poly1305_block,\@abi-omnipotent 343.align 32 344__poly1305_block: 345___ 346 &poly1305_iteration(); 347$code.=<<___; 348 ret 349.size __poly1305_block,.-__poly1305_block 350 351.type __poly1305_init_avx,\@abi-omnipotent 352.align 32 353__poly1305_init_avx: 354 mov $r0,$h0 355 mov $r1,$h1 356 xor $h2,$h2 357 358 lea 48+64($ctx),$ctx # size optimization 359 360 mov $r1,%rax 361 call __poly1305_block # r^2 362 363 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 364 mov \$0x3ffffff,%edx 365 mov $h0,$d1 366 and $h0#d,%eax 367 mov $r0,$d2 368 and $r0#d,%edx 369 mov %eax,`16*0+0-64`($ctx) 370 shr \$26,$d1 371 mov %edx,`16*0+4-64`($ctx) 372 shr \$26,$d2 373 374 mov \$0x3ffffff,%eax 375 mov \$0x3ffffff,%edx 376 and $d1#d,%eax 377 and $d2#d,%edx 378 mov %eax,`16*1+0-64`($ctx) 379 lea (%rax,%rax,4),%eax # *5 380 mov %edx,`16*1+4-64`($ctx) 381 lea (%rdx,%rdx,4),%edx # *5 382 mov %eax,`16*2+0-64`($ctx) 383 shr \$26,$d1 384 mov %edx,`16*2+4-64`($ctx) 385 shr \$26,$d2 386 387 mov $h1,%rax 388 mov $r1,%rdx 389 shl \$12,%rax 390 shl \$12,%rdx 391 or $d1,%rax 392 or $d2,%rdx 393 and \$0x3ffffff,%eax 394 and \$0x3ffffff,%edx 395 mov %eax,`16*3+0-64`($ctx) 396 lea (%rax,%rax,4),%eax # *5 397 mov %edx,`16*3+4-64`($ctx) 398 lea (%rdx,%rdx,4),%edx # *5 399 mov %eax,`16*4+0-64`($ctx) 400 mov $h1,$d1 401 mov %edx,`16*4+4-64`($ctx) 402 mov $r1,$d2 403 404 mov \$0x3ffffff,%eax 405 mov \$0x3ffffff,%edx 406 shr \$14,$d1 407 shr \$14,$d2 408 and $d1#d,%eax 409 and $d2#d,%edx 410 mov %eax,`16*5+0-64`($ctx) 411 lea (%rax,%rax,4),%eax # *5 412 mov %edx,`16*5+4-64`($ctx) 413 lea (%rdx,%rdx,4),%edx # *5 414 mov %eax,`16*6+0-64`($ctx) 415 shr \$26,$d1 416 mov %edx,`16*6+4-64`($ctx) 417 shr \$26,$d2 418 419 mov $h2,%rax 420 shl \$24,%rax 421 or %rax,$d1 422 mov $d1#d,`16*7+0-64`($ctx) 423 lea ($d1,$d1,4),$d1 # *5 424 mov $d2#d,`16*7+4-64`($ctx) 425 lea ($d2,$d2,4),$d2 # *5 426 mov $d1#d,`16*8+0-64`($ctx) 427 mov $d2#d,`16*8+4-64`($ctx) 428 429 mov $r1,%rax 430 call __poly1305_block # r^3 431 432 mov \$0x3ffffff,%eax # save r^3 base 2^26 433 mov $h0,$d1 434 and $h0#d,%eax 435 shr \$26,$d1 436 mov %eax,`16*0+12-64`($ctx) 437 438 mov \$0x3ffffff,%edx 439 and $d1#d,%edx 440 mov %edx,`16*1+12-64`($ctx) 441 lea (%rdx,%rdx,4),%edx # *5 442 shr \$26,$d1 443 mov %edx,`16*2+12-64`($ctx) 444 445 mov $h1,%rax 446 shl \$12,%rax 447 or $d1,%rax 448 and \$0x3ffffff,%eax 449 mov %eax,`16*3+12-64`($ctx) 450 lea (%rax,%rax,4),%eax # *5 451 mov $h1,$d1 452 mov %eax,`16*4+12-64`($ctx) 453 454 mov \$0x3ffffff,%edx 455 shr \$14,$d1 456 and $d1#d,%edx 457 mov %edx,`16*5+12-64`($ctx) 458 lea (%rdx,%rdx,4),%edx # *5 459 shr \$26,$d1 460 mov %edx,`16*6+12-64`($ctx) 461 462 mov $h2,%rax 463 shl \$24,%rax 464 or %rax,$d1 465 mov $d1#d,`16*7+12-64`($ctx) 466 lea ($d1,$d1,4),$d1 # *5 467 mov $d1#d,`16*8+12-64`($ctx) 468 469 mov $r1,%rax 470 call __poly1305_block # r^4 471 472 mov \$0x3ffffff,%eax # save r^4 base 2^26 473 mov $h0,$d1 474 and $h0#d,%eax 475 shr \$26,$d1 476 mov %eax,`16*0+8-64`($ctx) 477 478 mov \$0x3ffffff,%edx 479 and $d1#d,%edx 480 mov %edx,`16*1+8-64`($ctx) 481 lea (%rdx,%rdx,4),%edx # *5 482 shr \$26,$d1 483 mov %edx,`16*2+8-64`($ctx) 484 485 mov $h1,%rax 486 shl \$12,%rax 487 or $d1,%rax 488 and \$0x3ffffff,%eax 489 mov %eax,`16*3+8-64`($ctx) 490 lea (%rax,%rax,4),%eax # *5 491 mov $h1,$d1 492 mov %eax,`16*4+8-64`($ctx) 493 494 mov \$0x3ffffff,%edx 495 shr \$14,$d1 496 and $d1#d,%edx 497 mov %edx,`16*5+8-64`($ctx) 498 lea (%rdx,%rdx,4),%edx # *5 499 shr \$26,$d1 500 mov %edx,`16*6+8-64`($ctx) 501 502 mov $h2,%rax 503 shl \$24,%rax 504 or %rax,$d1 505 mov $d1#d,`16*7+8-64`($ctx) 506 lea ($d1,$d1,4),$d1 # *5 507 mov $d1#d,`16*8+8-64`($ctx) 508 509 lea -48-64($ctx),$ctx # size [de-]optimization 510 ret 511.size __poly1305_init_avx,.-__poly1305_init_avx 512 513.type poly1305_blocks_avx,\@function,4 514.align 32 515poly1305_blocks_avx: 516.cfi_startproc 517 mov 20($ctx),%r8d # is_base2_26 518 cmp \$128,$len 519 jae .Lblocks_avx 520 test %r8d,%r8d 521 jz .Lblocks 522 523.Lblocks_avx: 524 and \$-16,$len 525 jz .Lno_data_avx 526 527 vzeroupper 528 529 test %r8d,%r8d 530 jz .Lbase2_64_avx 531 532 test \$31,$len 533 jz .Leven_avx 534 535 push %rbx 536.cfi_push %rbx 537 push %rbp 538.cfi_push %rbp 539 push %r12 540.cfi_push %r12 541 push %r13 542.cfi_push %r13 543 push %r14 544.cfi_push %r14 545 push %r15 546.cfi_push %r15 547.Lblocks_avx_body: 548 549 mov $len,%r15 # reassign $len 550 551 mov 0($ctx),$d1 # load hash value 552 mov 8($ctx),$d2 553 mov 16($ctx),$h2#d 554 555 mov 24($ctx),$r0 # load r 556 mov 32($ctx),$s1 557 558 ################################# base 2^26 -> base 2^64 559 mov $d1#d,$h0#d 560 and \$`-1*(1<<31)`,$d1 561 mov $d2,$r1 # borrow $r1 562 mov $d2#d,$h1#d 563 and \$`-1*(1<<31)`,$d2 564 565 shr \$6,$d1 566 shl \$52,$r1 567 add $d1,$h0 568 shr \$12,$h1 569 shr \$18,$d2 570 add $r1,$h0 571 adc $d2,$h1 572 573 mov $h2,$d1 574 shl \$40,$d1 575 shr \$24,$h2 576 add $d1,$h1 577 adc \$0,$h2 # can be partially reduced... 578 579 mov \$-4,$d2 # ... so reduce 580 mov $h2,$d1 581 and $h2,$d2 582 shr \$2,$d1 583 and \$3,$h2 584 add $d2,$d1 # =*5 585 add $d1,$h0 586 adc \$0,$h1 587 adc \$0,$h2 588 589 mov $s1,$r1 590 mov $s1,%rax 591 shr \$2,$s1 592 add $r1,$s1 # s1 = r1 + (r1 >> 2) 593 594 add 0($inp),$h0 # accumulate input 595 adc 8($inp),$h1 596 lea 16($inp),$inp 597 adc $padbit,$h2 598 599 call __poly1305_block 600 601 test $padbit,$padbit # if $padbit is zero, 602 jz .Lstore_base2_64_avx # store hash in base 2^64 format 603 604 ################################# base 2^64 -> base 2^26 605 mov $h0,%rax 606 mov $h0,%rdx 607 shr \$52,$h0 608 mov $h1,$r0 609 mov $h1,$r1 610 shr \$26,%rdx 611 and \$0x3ffffff,%rax # h[0] 612 shl \$12,$r0 613 and \$0x3ffffff,%rdx # h[1] 614 shr \$14,$h1 615 or $r0,$h0 616 shl \$24,$h2 617 and \$0x3ffffff,$h0 # h[2] 618 shr \$40,$r1 619 and \$0x3ffffff,$h1 # h[3] 620 or $r1,$h2 # h[4] 621 622 sub \$16,%r15 623 jz .Lstore_base2_26_avx 624 625 vmovd %rax#d,$H0 626 vmovd %rdx#d,$H1 627 vmovd $h0#d,$H2 628 vmovd $h1#d,$H3 629 vmovd $h2#d,$H4 630 jmp .Lproceed_avx 631 632.align 32 633.Lstore_base2_64_avx: 634 mov $h0,0($ctx) 635 mov $h1,8($ctx) 636 mov $h2,16($ctx) # note that is_base2_26 is zeroed 637 jmp .Ldone_avx 638 639.align 16 640.Lstore_base2_26_avx: 641 mov %rax#d,0($ctx) # store hash value base 2^26 642 mov %rdx#d,4($ctx) 643 mov $h0#d,8($ctx) 644 mov $h1#d,12($ctx) 645 mov $h2#d,16($ctx) 646.align 16 647.Ldone_avx: 648 mov 0(%rsp),%r15 649.cfi_restore %r15 650 mov 8(%rsp),%r14 651.cfi_restore %r14 652 mov 16(%rsp),%r13 653.cfi_restore %r13 654 mov 24(%rsp),%r12 655.cfi_restore %r12 656 mov 32(%rsp),%rbp 657.cfi_restore %rbp 658 mov 40(%rsp),%rbx 659.cfi_restore %rbx 660 lea 48(%rsp),%rsp 661.cfi_adjust_cfa_offset -48 662.Lno_data_avx: 663.Lblocks_avx_epilogue: 664 ret 665.cfi_endproc 666 667.align 32 668.Lbase2_64_avx: 669.cfi_startproc 670 push %rbx 671.cfi_push %rbx 672 push %rbp 673.cfi_push %rbp 674 push %r12 675.cfi_push %r12 676 push %r13 677.cfi_push %r13 678 push %r14 679.cfi_push %r14 680 push %r15 681.cfi_push %r15 682.Lbase2_64_avx_body: 683 684 mov $len,%r15 # reassign $len 685 686 mov 24($ctx),$r0 # load r 687 mov 32($ctx),$s1 688 689 mov 0($ctx),$h0 # load hash value 690 mov 8($ctx),$h1 691 mov 16($ctx),$h2#d 692 693 mov $s1,$r1 694 mov $s1,%rax 695 shr \$2,$s1 696 add $r1,$s1 # s1 = r1 + (r1 >> 2) 697 698 test \$31,$len 699 jz .Linit_avx 700 701 add 0($inp),$h0 # accumulate input 702 adc 8($inp),$h1 703 lea 16($inp),$inp 704 adc $padbit,$h2 705 sub \$16,%r15 706 707 call __poly1305_block 708 709.Linit_avx: 710 ################################# base 2^64 -> base 2^26 711 mov $h0,%rax 712 mov $h0,%rdx 713 shr \$52,$h0 714 mov $h1,$d1 715 mov $h1,$d2 716 shr \$26,%rdx 717 and \$0x3ffffff,%rax # h[0] 718 shl \$12,$d1 719 and \$0x3ffffff,%rdx # h[1] 720 shr \$14,$h1 721 or $d1,$h0 722 shl \$24,$h2 723 and \$0x3ffffff,$h0 # h[2] 724 shr \$40,$d2 725 and \$0x3ffffff,$h1 # h[3] 726 or $d2,$h2 # h[4] 727 728 vmovd %rax#d,$H0 729 vmovd %rdx#d,$H1 730 vmovd $h0#d,$H2 731 vmovd $h1#d,$H3 732 vmovd $h2#d,$H4 733 movl \$1,20($ctx) # set is_base2_26 734 735 call __poly1305_init_avx 736 737.Lproceed_avx: 738 mov %r15,$len 739 740 mov 0(%rsp),%r15 741.cfi_restore %r15 742 mov 8(%rsp),%r14 743.cfi_restore %r14 744 mov 16(%rsp),%r13 745.cfi_restore %r13 746 mov 24(%rsp),%r12 747.cfi_restore %r12 748 mov 32(%rsp),%rbp 749.cfi_restore %rbp 750 mov 40(%rsp),%rbx 751.cfi_restore %rbx 752 lea 48(%rsp),%rax 753 lea 48(%rsp),%rsp 754.cfi_adjust_cfa_offset -48 755.Lbase2_64_avx_epilogue: 756 jmp .Ldo_avx 757.cfi_endproc 758 759.align 32 760.Leven_avx: 761.cfi_startproc 762 vmovd 4*0($ctx),$H0 # load hash value 763 vmovd 4*1($ctx),$H1 764 vmovd 4*2($ctx),$H2 765 vmovd 4*3($ctx),$H3 766 vmovd 4*4($ctx),$H4 767 768.Ldo_avx: 769___ 770$code.=<<___ if (!$win64); 771 lea -0x58(%rsp),%r11 772.cfi_def_cfa %r11,0x60 773 sub \$0x178,%rsp 774___ 775$code.=<<___ if ($win64); 776 lea -0xf8(%rsp),%r11 777 sub \$0x218,%rsp 778 vmovdqa %xmm6,0x50(%r11) 779 vmovdqa %xmm7,0x60(%r11) 780 vmovdqa %xmm8,0x70(%r11) 781 vmovdqa %xmm9,0x80(%r11) 782 vmovdqa %xmm10,0x90(%r11) 783 vmovdqa %xmm11,0xa0(%r11) 784 vmovdqa %xmm12,0xb0(%r11) 785 vmovdqa %xmm13,0xc0(%r11) 786 vmovdqa %xmm14,0xd0(%r11) 787 vmovdqa %xmm15,0xe0(%r11) 788.Ldo_avx_body: 789___ 790$code.=<<___; 791 sub \$64,$len 792 lea -32($inp),%rax 793 cmovc %rax,$inp 794 795 vmovdqu `16*3`($ctx),$D4 # preload r0^2 796 lea `16*3+64`($ctx),$ctx # size optimization 797 lea .Lconst(%rip),%rcx 798 799 ################################################################ 800 # load input 801 vmovdqu 16*2($inp),$T0 802 vmovdqu 16*3($inp),$T1 803 vmovdqa 64(%rcx),$MASK # .Lmask26 804 805 vpsrldq \$6,$T0,$T2 # splat input 806 vpsrldq \$6,$T1,$T3 807 vpunpckhqdq $T1,$T0,$T4 # 4 808 vpunpcklqdq $T1,$T0,$T0 # 0:1 809 vpunpcklqdq $T3,$T2,$T3 # 2:3 810 811 vpsrlq \$40,$T4,$T4 # 4 812 vpsrlq \$26,$T0,$T1 813 vpand $MASK,$T0,$T0 # 0 814 vpsrlq \$4,$T3,$T2 815 vpand $MASK,$T1,$T1 # 1 816 vpsrlq \$30,$T3,$T3 817 vpand $MASK,$T2,$T2 # 2 818 vpand $MASK,$T3,$T3 # 3 819 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 820 821 jbe .Lskip_loop_avx 822 823 # expand and copy pre-calculated table to stack 824 vmovdqu `16*1-64`($ctx),$D1 825 vmovdqu `16*2-64`($ctx),$D2 826 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 827 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 828 vmovdqa $D3,-0x90(%r11) 829 vmovdqa $D0,0x00(%rsp) 830 vpshufd \$0xEE,$D1,$D4 831 vmovdqu `16*3-64`($ctx),$D0 832 vpshufd \$0x44,$D1,$D1 833 vmovdqa $D4,-0x80(%r11) 834 vmovdqa $D1,0x10(%rsp) 835 vpshufd \$0xEE,$D2,$D3 836 vmovdqu `16*4-64`($ctx),$D1 837 vpshufd \$0x44,$D2,$D2 838 vmovdqa $D3,-0x70(%r11) 839 vmovdqa $D2,0x20(%rsp) 840 vpshufd \$0xEE,$D0,$D4 841 vmovdqu `16*5-64`($ctx),$D2 842 vpshufd \$0x44,$D0,$D0 843 vmovdqa $D4,-0x60(%r11) 844 vmovdqa $D0,0x30(%rsp) 845 vpshufd \$0xEE,$D1,$D3 846 vmovdqu `16*6-64`($ctx),$D0 847 vpshufd \$0x44,$D1,$D1 848 vmovdqa $D3,-0x50(%r11) 849 vmovdqa $D1,0x40(%rsp) 850 vpshufd \$0xEE,$D2,$D4 851 vmovdqu `16*7-64`($ctx),$D1 852 vpshufd \$0x44,$D2,$D2 853 vmovdqa $D4,-0x40(%r11) 854 vmovdqa $D2,0x50(%rsp) 855 vpshufd \$0xEE,$D0,$D3 856 vmovdqu `16*8-64`($ctx),$D2 857 vpshufd \$0x44,$D0,$D0 858 vmovdqa $D3,-0x30(%r11) 859 vmovdqa $D0,0x60(%rsp) 860 vpshufd \$0xEE,$D1,$D4 861 vpshufd \$0x44,$D1,$D1 862 vmovdqa $D4,-0x20(%r11) 863 vmovdqa $D1,0x70(%rsp) 864 vpshufd \$0xEE,$D2,$D3 865 vmovdqa 0x00(%rsp),$D4 # preload r0^2 866 vpshufd \$0x44,$D2,$D2 867 vmovdqa $D3,-0x10(%r11) 868 vmovdqa $D2,0x80(%rsp) 869 870 jmp .Loop_avx 871 872.align 32 873.Loop_avx: 874 ################################################################ 875 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 876 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 877 # \___________________/ 878 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 879 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 880 # \___________________/ \____________________/ 881 # 882 # Note that we start with inp[2:3]*r^2. This is because it 883 # doesn't depend on reduction in previous iteration. 884 ################################################################ 885 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 886 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 887 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 888 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 889 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 890 # 891 # though note that $Tx and $Hx are "reversed" in this section, 892 # and $D4 is preloaded with r0^2... 893 894 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 895 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 896 vmovdqa $H2,0x20(%r11) # offload hash 897 vpmuludq $T2,$D4,$D2 # d3 = h2*r0 898 vmovdqa 0x10(%rsp),$H2 # r1^2 899 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 900 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 901 902 vmovdqa $H0,0x00(%r11) # 903 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 904 vmovdqa $H1,0x10(%r11) # 905 vpmuludq $T3,$H2,$H1 # h3*r1 906 vpaddq $H0,$D0,$D0 # d0 += h4*s1 907 vpaddq $H1,$D4,$D4 # d4 += h3*r1 908 vmovdqa $H3,0x30(%r11) # 909 vpmuludq $T2,$H2,$H0 # h2*r1 910 vpmuludq $T1,$H2,$H1 # h1*r1 911 vpaddq $H0,$D3,$D3 # d3 += h2*r1 912 vmovdqa 0x30(%rsp),$H3 # r2^2 913 vpaddq $H1,$D2,$D2 # d2 += h1*r1 914 vmovdqa $H4,0x40(%r11) # 915 vpmuludq $T0,$H2,$H2 # h0*r1 916 vpmuludq $T2,$H3,$H0 # h2*r2 917 vpaddq $H2,$D1,$D1 # d1 += h0*r1 918 919 vmovdqa 0x40(%rsp),$H4 # s2^2 920 vpaddq $H0,$D4,$D4 # d4 += h2*r2 921 vpmuludq $T1,$H3,$H1 # h1*r2 922 vpmuludq $T0,$H3,$H3 # h0*r2 923 vpaddq $H1,$D3,$D3 # d3 += h1*r2 924 vmovdqa 0x50(%rsp),$H2 # r3^2 925 vpaddq $H3,$D2,$D2 # d2 += h0*r2 926 vpmuludq $T4,$H4,$H0 # h4*s2 927 vpmuludq $T3,$H4,$H4 # h3*s2 928 vpaddq $H0,$D1,$D1 # d1 += h4*s2 929 vmovdqa 0x60(%rsp),$H3 # s3^2 930 vpaddq $H4,$D0,$D0 # d0 += h3*s2 931 932 vmovdqa 0x80(%rsp),$H4 # s4^2 933 vpmuludq $T1,$H2,$H1 # h1*r3 934 vpmuludq $T0,$H2,$H2 # h0*r3 935 vpaddq $H1,$D4,$D4 # d4 += h1*r3 936 vpaddq $H2,$D3,$D3 # d3 += h0*r3 937 vpmuludq $T4,$H3,$H0 # h4*s3 938 vpmuludq $T3,$H3,$H1 # h3*s3 939 vpaddq $H0,$D2,$D2 # d2 += h4*s3 940 vmovdqu 16*0($inp),$H0 # load input 941 vpaddq $H1,$D1,$D1 # d1 += h3*s3 942 vpmuludq $T2,$H3,$H3 # h2*s3 943 vpmuludq $T2,$H4,$T2 # h2*s4 944 vpaddq $H3,$D0,$D0 # d0 += h2*s3 945 946 vmovdqu 16*1($inp),$H1 # 947 vpaddq $T2,$D1,$D1 # d1 += h2*s4 948 vpmuludq $T3,$H4,$T3 # h3*s4 949 vpmuludq $T4,$H4,$T4 # h4*s4 950 vpsrldq \$6,$H0,$H2 # splat input 951 vpaddq $T3,$D2,$D2 # d2 += h3*s4 952 vpaddq $T4,$D3,$D3 # d3 += h4*s4 953 vpsrldq \$6,$H1,$H3 # 954 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 955 vpmuludq $T1,$H4,$T0 # h1*s4 956 vpunpckhqdq $H1,$H0,$H4 # 4 957 vpaddq $T4,$D4,$D4 # d4 += h0*r4 958 vmovdqa -0x90(%r11),$T4 # r0^4 959 vpaddq $T0,$D0,$D0 # d0 += h1*s4 960 961 vpunpcklqdq $H1,$H0,$H0 # 0:1 962 vpunpcklqdq $H3,$H2,$H3 # 2:3 963 964 #vpsrlq \$40,$H4,$H4 # 4 965 vpsrldq \$`40/8`,$H4,$H4 # 4 966 vpsrlq \$26,$H0,$H1 967 vpand $MASK,$H0,$H0 # 0 968 vpsrlq \$4,$H3,$H2 969 vpand $MASK,$H1,$H1 # 1 970 vpand 0(%rcx),$H4,$H4 # .Lmask24 971 vpsrlq \$30,$H3,$H3 972 vpand $MASK,$H2,$H2 # 2 973 vpand $MASK,$H3,$H3 # 3 974 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 975 976 vpaddq 0x00(%r11),$H0,$H0 # add hash value 977 vpaddq 0x10(%r11),$H1,$H1 978 vpaddq 0x20(%r11),$H2,$H2 979 vpaddq 0x30(%r11),$H3,$H3 980 vpaddq 0x40(%r11),$H4,$H4 981 982 lea 16*2($inp),%rax 983 lea 16*4($inp),$inp 984 sub \$64,$len 985 cmovc %rax,$inp 986 987 ################################################################ 988 # Now we accumulate (inp[0:1]+hash)*r^4 989 ################################################################ 990 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 991 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 992 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 993 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 994 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 995 996 vpmuludq $H0,$T4,$T0 # h0*r0 997 vpmuludq $H1,$T4,$T1 # h1*r0 998 vpaddq $T0,$D0,$D0 999 vpaddq $T1,$D1,$D1 1000 vmovdqa -0x80(%r11),$T2 # r1^4 1001 vpmuludq $H2,$T4,$T0 # h2*r0 1002 vpmuludq $H3,$T4,$T1 # h3*r0 1003 vpaddq $T0,$D2,$D2 1004 vpaddq $T1,$D3,$D3 1005 vpmuludq $H4,$T4,$T4 # h4*r0 1006 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 1007 vpaddq $T4,$D4,$D4 1008 1009 vpaddq $T0,$D0,$D0 # d0 += h4*s1 1010 vpmuludq $H2,$T2,$T1 # h2*r1 1011 vpmuludq $H3,$T2,$T0 # h3*r1 1012 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1013 vmovdqa -0x60(%r11),$T3 # r2^4 1014 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1015 vpmuludq $H1,$T2,$T1 # h1*r1 1016 vpmuludq $H0,$T2,$T2 # h0*r1 1017 vpaddq $T1,$D2,$D2 # d2 += h1*r1 1018 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1019 1020 vmovdqa -0x50(%r11),$T4 # s2^4 1021 vpmuludq $H2,$T3,$T0 # h2*r2 1022 vpmuludq $H1,$T3,$T1 # h1*r2 1023 vpaddq $T0,$D4,$D4 # d4 += h2*r2 1024 vpaddq $T1,$D3,$D3 # d3 += h1*r2 1025 vmovdqa -0x40(%r11),$T2 # r3^4 1026 vpmuludq $H0,$T3,$T3 # h0*r2 1027 vpmuludq $H4,$T4,$T0 # h4*s2 1028 vpaddq $T3,$D2,$D2 # d2 += h0*r2 1029 vpaddq $T0,$D1,$D1 # d1 += h4*s2 1030 vmovdqa -0x30(%r11),$T3 # s3^4 1031 vpmuludq $H3,$T4,$T4 # h3*s2 1032 vpmuludq $H1,$T2,$T1 # h1*r3 1033 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1034 1035 vmovdqa -0x10(%r11),$T4 # s4^4 1036 vpaddq $T1,$D4,$D4 # d4 += h1*r3 1037 vpmuludq $H0,$T2,$T2 # h0*r3 1038 vpmuludq $H4,$T3,$T0 # h4*s3 1039 vpaddq $T2,$D3,$D3 # d3 += h0*r3 1040 vpaddq $T0,$D2,$D2 # d2 += h4*s3 1041 vmovdqu 16*2($inp),$T0 # load input 1042 vpmuludq $H3,$T3,$T2 # h3*s3 1043 vpmuludq $H2,$T3,$T3 # h2*s3 1044 vpaddq $T2,$D1,$D1 # d1 += h3*s3 1045 vmovdqu 16*3($inp),$T1 # 1046 vpaddq $T3,$D0,$D0 # d0 += h2*s3 1047 1048 vpmuludq $H2,$T4,$H2 # h2*s4 1049 vpmuludq $H3,$T4,$H3 # h3*s4 1050 vpsrldq \$6,$T0,$T2 # splat input 1051 vpaddq $H2,$D1,$D1 # d1 += h2*s4 1052 vpmuludq $H4,$T4,$H4 # h4*s4 1053 vpsrldq \$6,$T1,$T3 # 1054 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 1055 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 1056 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 1057 vpmuludq $H1,$T4,$H0 1058 vpunpckhqdq $T1,$T0,$T4 # 4 1059 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1060 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1061 1062 vpunpcklqdq $T1,$T0,$T0 # 0:1 1063 vpunpcklqdq $T3,$T2,$T3 # 2:3 1064 1065 #vpsrlq \$40,$T4,$T4 # 4 1066 vpsrldq \$`40/8`,$T4,$T4 # 4 1067 vpsrlq \$26,$T0,$T1 1068 vmovdqa 0x00(%rsp),$D4 # preload r0^2 1069 vpand $MASK,$T0,$T0 # 0 1070 vpsrlq \$4,$T3,$T2 1071 vpand $MASK,$T1,$T1 # 1 1072 vpand 0(%rcx),$T4,$T4 # .Lmask24 1073 vpsrlq \$30,$T3,$T3 1074 vpand $MASK,$T2,$T2 # 2 1075 vpand $MASK,$T3,$T3 # 3 1076 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1077 1078 ################################################################ 1079 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 1080 # and P. Schwabe 1081 1082 vpsrlq \$26,$H3,$D3 1083 vpand $MASK,$H3,$H3 1084 vpaddq $D3,$H4,$H4 # h3 -> h4 1085 1086 vpsrlq \$26,$H0,$D0 1087 vpand $MASK,$H0,$H0 1088 vpaddq $D0,$D1,$H1 # h0 -> h1 1089 1090 vpsrlq \$26,$H4,$D0 1091 vpand $MASK,$H4,$H4 1092 1093 vpsrlq \$26,$H1,$D1 1094 vpand $MASK,$H1,$H1 1095 vpaddq $D1,$H2,$H2 # h1 -> h2 1096 1097 vpaddq $D0,$H0,$H0 1098 vpsllq \$2,$D0,$D0 1099 vpaddq $D0,$H0,$H0 # h4 -> h0 1100 1101 vpsrlq \$26,$H2,$D2 1102 vpand $MASK,$H2,$H2 1103 vpaddq $D2,$H3,$H3 # h2 -> h3 1104 1105 vpsrlq \$26,$H0,$D0 1106 vpand $MASK,$H0,$H0 1107 vpaddq $D0,$H1,$H1 # h0 -> h1 1108 1109 vpsrlq \$26,$H3,$D3 1110 vpand $MASK,$H3,$H3 1111 vpaddq $D3,$H4,$H4 # h3 -> h4 1112 1113 ja .Loop_avx 1114 1115.Lskip_loop_avx: 1116 ################################################################ 1117 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 1118 1119 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 1120 add \$32,$len 1121 jnz .Long_tail_avx 1122 1123 vpaddq $H2,$T2,$T2 1124 vpaddq $H0,$T0,$T0 1125 vpaddq $H1,$T1,$T1 1126 vpaddq $H3,$T3,$T3 1127 vpaddq $H4,$T4,$T4 1128 1129.Long_tail_avx: 1130 vmovdqa $H2,0x20(%r11) 1131 vmovdqa $H0,0x00(%r11) 1132 vmovdqa $H1,0x10(%r11) 1133 vmovdqa $H3,0x30(%r11) 1134 vmovdqa $H4,0x40(%r11) 1135 1136 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1137 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1138 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1139 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1140 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1141 1142 vpmuludq $T2,$D4,$D2 # d2 = h2*r0 1143 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 1144 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n 1145 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 1146 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 1147 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 1148 1149 vpmuludq $T3,$H2,$H0 # h3*r1 1150 vpaddq $H0,$D4,$D4 # d4 += h3*r1 1151 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n 1152 vpmuludq $T2,$H2,$H1 # h2*r1 1153 vpaddq $H1,$D3,$D3 # d3 += h2*r1 1154 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n 1155 vpmuludq $T1,$H2,$H0 # h1*r1 1156 vpaddq $H0,$D2,$D2 # d2 += h1*r1 1157 vpmuludq $T0,$H2,$H2 # h0*r1 1158 vpaddq $H2,$D1,$D1 # d1 += h0*r1 1159 vpmuludq $T4,$H3,$H3 # h4*s1 1160 vpaddq $H3,$D0,$D0 # d0 += h4*s1 1161 1162 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n 1163 vpmuludq $T2,$H4,$H1 # h2*r2 1164 vpaddq $H1,$D4,$D4 # d4 += h2*r2 1165 vpmuludq $T1,$H4,$H0 # h1*r2 1166 vpaddq $H0,$D3,$D3 # d3 += h1*r2 1167 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n 1168 vpmuludq $T0,$H4,$H4 # h0*r2 1169 vpaddq $H4,$D2,$D2 # d2 += h0*r2 1170 vpmuludq $T4,$H2,$H1 # h4*s2 1171 vpaddq $H1,$D1,$D1 # d1 += h4*s2 1172 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n 1173 vpmuludq $T3,$H2,$H2 # h3*s2 1174 vpaddq $H2,$D0,$D0 # d0 += h3*s2 1175 1176 vpmuludq $T1,$H3,$H0 # h1*r3 1177 vpaddq $H0,$D4,$D4 # d4 += h1*r3 1178 vpmuludq $T0,$H3,$H3 # h0*r3 1179 vpaddq $H3,$D3,$D3 # d3 += h0*r3 1180 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n 1181 vpmuludq $T4,$H4,$H1 # h4*s3 1182 vpaddq $H1,$D2,$D2 # d2 += h4*s3 1183 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n 1184 vpmuludq $T3,$H4,$H0 # h3*s3 1185 vpaddq $H0,$D1,$D1 # d1 += h3*s3 1186 vpmuludq $T2,$H4,$H4 # h2*s3 1187 vpaddq $H4,$D0,$D0 # d0 += h2*s3 1188 1189 vpmuludq $T0,$H2,$H2 # h0*r4 1190 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 1191 vpmuludq $T4,$H3,$H1 # h4*s4 1192 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 1193 vpmuludq $T3,$H3,$H0 # h3*s4 1194 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 1195 vpmuludq $T2,$H3,$H1 # h2*s4 1196 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 1197 vpmuludq $T1,$H3,$H3 # h1*s4 1198 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 1199 1200 jz .Lshort_tail_avx 1201 1202 vmovdqu 16*0($inp),$H0 # load input 1203 vmovdqu 16*1($inp),$H1 1204 1205 vpsrldq \$6,$H0,$H2 # splat input 1206 vpsrldq \$6,$H1,$H3 1207 vpunpckhqdq $H1,$H0,$H4 # 4 1208 vpunpcklqdq $H1,$H0,$H0 # 0:1 1209 vpunpcklqdq $H3,$H2,$H3 # 2:3 1210 1211 vpsrlq \$40,$H4,$H4 # 4 1212 vpsrlq \$26,$H0,$H1 1213 vpand $MASK,$H0,$H0 # 0 1214 vpsrlq \$4,$H3,$H2 1215 vpand $MASK,$H1,$H1 # 1 1216 vpsrlq \$30,$H3,$H3 1217 vpand $MASK,$H2,$H2 # 2 1218 vpand $MASK,$H3,$H3 # 3 1219 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 1220 1221 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 1222 vpaddq 0x00(%r11),$H0,$H0 1223 vpaddq 0x10(%r11),$H1,$H1 1224 vpaddq 0x20(%r11),$H2,$H2 1225 vpaddq 0x30(%r11),$H3,$H3 1226 vpaddq 0x40(%r11),$H4,$H4 1227 1228 ################################################################ 1229 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate 1230 1231 vpmuludq $H0,$T4,$T0 # h0*r0 1232 vpaddq $T0,$D0,$D0 # d0 += h0*r0 1233 vpmuludq $H1,$T4,$T1 # h1*r0 1234 vpaddq $T1,$D1,$D1 # d1 += h1*r0 1235 vpmuludq $H2,$T4,$T0 # h2*r0 1236 vpaddq $T0,$D2,$D2 # d2 += h2*r0 1237 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n 1238 vpmuludq $H3,$T4,$T1 # h3*r0 1239 vpaddq $T1,$D3,$D3 # d3 += h3*r0 1240 vpmuludq $H4,$T4,$T4 # h4*r0 1241 vpaddq $T4,$D4,$D4 # d4 += h4*r0 1242 1243 vpmuludq $H3,$T2,$T0 # h3*r1 1244 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1245 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 1246 vpmuludq $H2,$T2,$T1 # h2*r1 1247 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1248 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 1249 vpmuludq $H1,$T2,$T0 # h1*r1 1250 vpaddq $T0,$D2,$D2 # d2 += h1*r1 1251 vpmuludq $H0,$T2,$T2 # h0*r1 1252 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1253 vpmuludq $H4,$T3,$T3 # h4*s1 1254 vpaddq $T3,$D0,$D0 # d0 += h4*s1 1255 1256 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 1257 vpmuludq $H2,$T4,$T1 # h2*r2 1258 vpaddq $T1,$D4,$D4 # d4 += h2*r2 1259 vpmuludq $H1,$T4,$T0 # h1*r2 1260 vpaddq $T0,$D3,$D3 # d3 += h1*r2 1261 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 1262 vpmuludq $H0,$T4,$T4 # h0*r2 1263 vpaddq $T4,$D2,$D2 # d2 += h0*r2 1264 vpmuludq $H4,$T2,$T1 # h4*s2 1265 vpaddq $T1,$D1,$D1 # d1 += h4*s2 1266 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 1267 vpmuludq $H3,$T2,$T2 # h3*s2 1268 vpaddq $T2,$D0,$D0 # d0 += h3*s2 1269 1270 vpmuludq $H1,$T3,$T0 # h1*r3 1271 vpaddq $T0,$D4,$D4 # d4 += h1*r3 1272 vpmuludq $H0,$T3,$T3 # h0*r3 1273 vpaddq $T3,$D3,$D3 # d3 += h0*r3 1274 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 1275 vpmuludq $H4,$T4,$T1 # h4*s3 1276 vpaddq $T1,$D2,$D2 # d2 += h4*s3 1277 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 1278 vpmuludq $H3,$T4,$T0 # h3*s3 1279 vpaddq $T0,$D1,$D1 # d1 += h3*s3 1280 vpmuludq $H2,$T4,$T4 # h2*s3 1281 vpaddq $T4,$D0,$D0 # d0 += h2*s3 1282 1283 vpmuludq $H0,$T2,$T2 # h0*r4 1284 vpaddq $T2,$D4,$D4 # d4 += h0*r4 1285 vpmuludq $H4,$T3,$T1 # h4*s4 1286 vpaddq $T1,$D3,$D3 # d3 += h4*s4 1287 vpmuludq $H3,$T3,$T0 # h3*s4 1288 vpaddq $T0,$D2,$D2 # d2 += h3*s4 1289 vpmuludq $H2,$T3,$T1 # h2*s4 1290 vpaddq $T1,$D1,$D1 # d1 += h2*s4 1291 vpmuludq $H1,$T3,$T3 # h1*s4 1292 vpaddq $T3,$D0,$D0 # d0 += h1*s4 1293 1294.Lshort_tail_avx: 1295 ################################################################ 1296 # horizontal addition 1297 1298 vpsrldq \$8,$D4,$T4 1299 vpsrldq \$8,$D3,$T3 1300 vpsrldq \$8,$D1,$T1 1301 vpsrldq \$8,$D0,$T0 1302 vpsrldq \$8,$D2,$T2 1303 vpaddq $T3,$D3,$D3 1304 vpaddq $T4,$D4,$D4 1305 vpaddq $T0,$D0,$D0 1306 vpaddq $T1,$D1,$D1 1307 vpaddq $T2,$D2,$D2 1308 1309 ################################################################ 1310 # lazy reduction 1311 1312 vpsrlq \$26,$D3,$H3 1313 vpand $MASK,$D3,$D3 1314 vpaddq $H3,$D4,$D4 # h3 -> h4 1315 1316 vpsrlq \$26,$D0,$H0 1317 vpand $MASK,$D0,$D0 1318 vpaddq $H0,$D1,$D1 # h0 -> h1 1319 1320 vpsrlq \$26,$D4,$H4 1321 vpand $MASK,$D4,$D4 1322 1323 vpsrlq \$26,$D1,$H1 1324 vpand $MASK,$D1,$D1 1325 vpaddq $H1,$D2,$D2 # h1 -> h2 1326 1327 vpaddq $H4,$D0,$D0 1328 vpsllq \$2,$H4,$H4 1329 vpaddq $H4,$D0,$D0 # h4 -> h0 1330 1331 vpsrlq \$26,$D2,$H2 1332 vpand $MASK,$D2,$D2 1333 vpaddq $H2,$D3,$D3 # h2 -> h3 1334 1335 vpsrlq \$26,$D0,$H0 1336 vpand $MASK,$D0,$D0 1337 vpaddq $H0,$D1,$D1 # h0 -> h1 1338 1339 vpsrlq \$26,$D3,$H3 1340 vpand $MASK,$D3,$D3 1341 vpaddq $H3,$D4,$D4 # h3 -> h4 1342 1343 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced 1344 vmovd $D1,`4*1-48-64`($ctx) 1345 vmovd $D2,`4*2-48-64`($ctx) 1346 vmovd $D3,`4*3-48-64`($ctx) 1347 vmovd $D4,`4*4-48-64`($ctx) 1348___ 1349$code.=<<___ if ($win64); 1350 vmovdqa 0x50(%r11),%xmm6 1351 vmovdqa 0x60(%r11),%xmm7 1352 vmovdqa 0x70(%r11),%xmm8 1353 vmovdqa 0x80(%r11),%xmm9 1354 vmovdqa 0x90(%r11),%xmm10 1355 vmovdqa 0xa0(%r11),%xmm11 1356 vmovdqa 0xb0(%r11),%xmm12 1357 vmovdqa 0xc0(%r11),%xmm13 1358 vmovdqa 0xd0(%r11),%xmm14 1359 vmovdqa 0xe0(%r11),%xmm15 1360 lea 0xf8(%r11),%rsp 1361.Ldo_avx_epilogue: 1362___ 1363$code.=<<___ if (!$win64); 1364 lea 0x58(%r11),%rsp 1365.cfi_def_cfa %rsp,8 1366___ 1367$code.=<<___; 1368 vzeroupper 1369 ret 1370.cfi_endproc 1371.size poly1305_blocks_avx,.-poly1305_blocks_avx 1372 1373.type poly1305_emit_avx,\@function,3 1374.align 32 1375poly1305_emit_avx: 1376 cmpl \$0,20($ctx) # is_base2_26? 1377 je .Lemit 1378 1379 mov 0($ctx),%eax # load hash value base 2^26 1380 mov 4($ctx),%ecx 1381 mov 8($ctx),%r8d 1382 mov 12($ctx),%r11d 1383 mov 16($ctx),%r10d 1384 1385 shl \$26,%rcx # base 2^26 -> base 2^64 1386 mov %r8,%r9 1387 shl \$52,%r8 1388 add %rcx,%rax 1389 shr \$12,%r9 1390 add %rax,%r8 # h0 1391 adc \$0,%r9 1392 1393 shl \$14,%r11 1394 mov %r10,%rax 1395 shr \$24,%r10 1396 add %r11,%r9 1397 shl \$40,%rax 1398 add %rax,%r9 # h1 1399 adc \$0,%r10 # h2 1400 1401 mov %r10,%rax # could be partially reduced, so reduce 1402 mov %r10,%rcx 1403 and \$3,%r10 1404 shr \$2,%rax 1405 and \$-4,%rcx 1406 add %rcx,%rax 1407 add %rax,%r8 1408 adc \$0,%r9 1409 adc \$0,%r10 1410 1411 mov %r8,%rax 1412 add \$5,%r8 # compare to modulus 1413 mov %r9,%rcx 1414 adc \$0,%r9 1415 adc \$0,%r10 1416 shr \$2,%r10 # did 130-bit value overflow? 1417 cmovnz %r8,%rax 1418 cmovnz %r9,%rcx 1419 1420 add 0($nonce),%rax # accumulate nonce 1421 adc 8($nonce),%rcx 1422 mov %rax,0($mac) # write result 1423 mov %rcx,8($mac) 1424 1425 ret 1426.size poly1305_emit_avx,.-poly1305_emit_avx 1427___ 1428 1429if ($avx>1) { 1430my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = 1431 map("%ymm$_",(0..15)); 1432my $S4=$MASK; 1433 1434$code.=<<___; 1435.type poly1305_blocks_avx2,\@function,4 1436.align 32 1437poly1305_blocks_avx2: 1438.cfi_startproc 1439 mov 20($ctx),%r8d # is_base2_26 1440 cmp \$128,$len 1441 jae .Lblocks_avx2 1442 test %r8d,%r8d 1443 jz .Lblocks 1444 1445.Lblocks_avx2: 1446 and \$-16,$len 1447 jz .Lno_data_avx2 1448 1449 vzeroupper 1450 1451 test %r8d,%r8d 1452 jz .Lbase2_64_avx2 1453 1454 test \$63,$len 1455 jz .Leven_avx2 1456 1457 push %rbx 1458.cfi_push %rbx 1459 push %rbp 1460.cfi_push %rbp 1461 push %r12 1462.cfi_push %r12 1463 push %r13 1464.cfi_push %r13 1465 push %r14 1466.cfi_push %r14 1467 push %r15 1468.cfi_push %r15 1469.Lblocks_avx2_body: 1470 1471 mov $len,%r15 # reassign $len 1472 1473 mov 0($ctx),$d1 # load hash value 1474 mov 8($ctx),$d2 1475 mov 16($ctx),$h2#d 1476 1477 mov 24($ctx),$r0 # load r 1478 mov 32($ctx),$s1 1479 1480 ################################# base 2^26 -> base 2^64 1481 mov $d1#d,$h0#d 1482 and \$`-1*(1<<31)`,$d1 1483 mov $d2,$r1 # borrow $r1 1484 mov $d2#d,$h1#d 1485 and \$`-1*(1<<31)`,$d2 1486 1487 shr \$6,$d1 1488 shl \$52,$r1 1489 add $d1,$h0 1490 shr \$12,$h1 1491 shr \$18,$d2 1492 add $r1,$h0 1493 adc $d2,$h1 1494 1495 mov $h2,$d1 1496 shl \$40,$d1 1497 shr \$24,$h2 1498 add $d1,$h1 1499 adc \$0,$h2 # can be partially reduced... 1500 1501 mov \$-4,$d2 # ... so reduce 1502 mov $h2,$d1 1503 and $h2,$d2 1504 shr \$2,$d1 1505 and \$3,$h2 1506 add $d2,$d1 # =*5 1507 add $d1,$h0 1508 adc \$0,$h1 1509 adc \$0,$h2 1510 1511 mov $s1,$r1 1512 mov $s1,%rax 1513 shr \$2,$s1 1514 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1515 1516.Lbase2_26_pre_avx2: 1517 add 0($inp),$h0 # accumulate input 1518 adc 8($inp),$h1 1519 lea 16($inp),$inp 1520 adc $padbit,$h2 1521 sub \$16,%r15 1522 1523 call __poly1305_block 1524 mov $r1,%rax 1525 1526 test \$63,%r15 1527 jnz .Lbase2_26_pre_avx2 1528 1529 test $padbit,$padbit # if $padbit is zero, 1530 jz .Lstore_base2_64_avx2 # store hash in base 2^64 format 1531 1532 ################################# base 2^64 -> base 2^26 1533 mov $h0,%rax 1534 mov $h0,%rdx 1535 shr \$52,$h0 1536 mov $h1,$r0 1537 mov $h1,$r1 1538 shr \$26,%rdx 1539 and \$0x3ffffff,%rax # h[0] 1540 shl \$12,$r0 1541 and \$0x3ffffff,%rdx # h[1] 1542 shr \$14,$h1 1543 or $r0,$h0 1544 shl \$24,$h2 1545 and \$0x3ffffff,$h0 # h[2] 1546 shr \$40,$r1 1547 and \$0x3ffffff,$h1 # h[3] 1548 or $r1,$h2 # h[4] 1549 1550 test %r15,%r15 1551 jz .Lstore_base2_26_avx2 1552 1553 vmovd %rax#d,%x#$H0 1554 vmovd %rdx#d,%x#$H1 1555 vmovd $h0#d,%x#$H2 1556 vmovd $h1#d,%x#$H3 1557 vmovd $h2#d,%x#$H4 1558 jmp .Lproceed_avx2 1559 1560.align 32 1561.Lstore_base2_64_avx2: 1562 mov $h0,0($ctx) 1563 mov $h1,8($ctx) 1564 mov $h2,16($ctx) # note that is_base2_26 is zeroed 1565 jmp .Ldone_avx2 1566 1567.align 16 1568.Lstore_base2_26_avx2: 1569 mov %rax#d,0($ctx) # store hash value base 2^26 1570 mov %rdx#d,4($ctx) 1571 mov $h0#d,8($ctx) 1572 mov $h1#d,12($ctx) 1573 mov $h2#d,16($ctx) 1574.align 16 1575.Ldone_avx2: 1576 mov 0(%rsp),%r15 1577.cfi_restore %r15 1578 mov 8(%rsp),%r14 1579.cfi_restore %r14 1580 mov 16(%rsp),%r13 1581.cfi_restore %r13 1582 mov 24(%rsp),%r12 1583.cfi_restore %r12 1584 mov 32(%rsp),%rbp 1585.cfi_restore %rbp 1586 mov 40(%rsp),%rbx 1587.cfi_restore %rbx 1588 lea 48(%rsp),%rsp 1589.cfi_adjust_cfa_offset -48 1590.Lno_data_avx2: 1591.Lblocks_avx2_epilogue: 1592 ret 1593.cfi_endproc 1594 1595.align 32 1596.Lbase2_64_avx2: 1597.cfi_startproc 1598 push %rbx 1599.cfi_push %rbx 1600 push %rbp 1601.cfi_push %rbp 1602 push %r12 1603.cfi_push %r12 1604 push %r13 1605.cfi_push %r13 1606 push %r14 1607.cfi_push %r14 1608 push %r15 1609.cfi_push %r15 1610.Lbase2_64_avx2_body: 1611 1612 mov $len,%r15 # reassign $len 1613 1614 mov 24($ctx),$r0 # load r 1615 mov 32($ctx),$s1 1616 1617 mov 0($ctx),$h0 # load hash value 1618 mov 8($ctx),$h1 1619 mov 16($ctx),$h2#d 1620 1621 mov $s1,$r1 1622 mov $s1,%rax 1623 shr \$2,$s1 1624 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1625 1626 test \$63,$len 1627 jz .Linit_avx2 1628 1629.Lbase2_64_pre_avx2: 1630 add 0($inp),$h0 # accumulate input 1631 adc 8($inp),$h1 1632 lea 16($inp),$inp 1633 adc $padbit,$h2 1634 sub \$16,%r15 1635 1636 call __poly1305_block 1637 mov $r1,%rax 1638 1639 test \$63,%r15 1640 jnz .Lbase2_64_pre_avx2 1641 1642.Linit_avx2: 1643 ################################# base 2^64 -> base 2^26 1644 mov $h0,%rax 1645 mov $h0,%rdx 1646 shr \$52,$h0 1647 mov $h1,$d1 1648 mov $h1,$d2 1649 shr \$26,%rdx 1650 and \$0x3ffffff,%rax # h[0] 1651 shl \$12,$d1 1652 and \$0x3ffffff,%rdx # h[1] 1653 shr \$14,$h1 1654 or $d1,$h0 1655 shl \$24,$h2 1656 and \$0x3ffffff,$h0 # h[2] 1657 shr \$40,$d2 1658 and \$0x3ffffff,$h1 # h[3] 1659 or $d2,$h2 # h[4] 1660 1661 vmovd %rax#d,%x#$H0 1662 vmovd %rdx#d,%x#$H1 1663 vmovd $h0#d,%x#$H2 1664 vmovd $h1#d,%x#$H3 1665 vmovd $h2#d,%x#$H4 1666 movl \$1,20($ctx) # set is_base2_26 1667 1668 call __poly1305_init_avx 1669 1670.Lproceed_avx2: 1671 mov %r15,$len # restore $len 1672 mov OPENSSL_ia32cap_P+8(%rip),%r10d 1673 mov \$`(1<<31|1<<30|1<<16)`,%r11d 1674 1675 mov 0(%rsp),%r15 1676.cfi_restore %r15 1677 mov 8(%rsp),%r14 1678.cfi_restore %r14 1679 mov 16(%rsp),%r13 1680.cfi_restore %r13 1681 mov 24(%rsp),%r12 1682.cfi_restore %r12 1683 mov 32(%rsp),%rbp 1684.cfi_restore %rbp 1685 mov 40(%rsp),%rbx 1686.cfi_restore %rbx 1687 lea 48(%rsp),%rax 1688 lea 48(%rsp),%rsp 1689.cfi_adjust_cfa_offset -48 1690.Lbase2_64_avx2_epilogue: 1691 jmp .Ldo_avx2 1692.cfi_endproc 1693 1694.align 32 1695.Leven_avx2: 1696.cfi_startproc 1697 mov OPENSSL_ia32cap_P+8(%rip),%r10d 1698 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 1699 vmovd 4*1($ctx),%x#$H1 1700 vmovd 4*2($ctx),%x#$H2 1701 vmovd 4*3($ctx),%x#$H3 1702 vmovd 4*4($ctx),%x#$H4 1703 1704.Ldo_avx2: 1705___ 1706$code.=<<___ if ($avx>2); 1707 cmp \$512,$len 1708 jb .Lskip_avx512 1709 and %r11d,%r10d 1710 test \$`1<<16`,%r10d # check for AVX512F 1711 jnz .Lblocks_avx512 1712.Lskip_avx512: 1713___ 1714$code.=<<___ if (!$win64); 1715 lea -8(%rsp),%r11 1716.cfi_def_cfa %r11,16 1717 sub \$0x128,%rsp 1718___ 1719$code.=<<___ if ($win64); 1720 lea -0xf8(%rsp),%r11 1721 sub \$0x1c8,%rsp 1722 vmovdqa %xmm6,0x50(%r11) 1723 vmovdqa %xmm7,0x60(%r11) 1724 vmovdqa %xmm8,0x70(%r11) 1725 vmovdqa %xmm9,0x80(%r11) 1726 vmovdqa %xmm10,0x90(%r11) 1727 vmovdqa %xmm11,0xa0(%r11) 1728 vmovdqa %xmm12,0xb0(%r11) 1729 vmovdqa %xmm13,0xc0(%r11) 1730 vmovdqa %xmm14,0xd0(%r11) 1731 vmovdqa %xmm15,0xe0(%r11) 1732.Ldo_avx2_body: 1733___ 1734$code.=<<___; 1735 lea .Lconst(%rip),%rcx 1736 lea 48+64($ctx),$ctx # size optimization 1737 vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 1738 1739 # expand and copy pre-calculated table to stack 1740 vmovdqu `16*0-64`($ctx),%x#$T2 1741 and \$-512,%rsp 1742 vmovdqu `16*1-64`($ctx),%x#$T3 1743 vmovdqu `16*2-64`($ctx),%x#$T4 1744 vmovdqu `16*3-64`($ctx),%x#$D0 1745 vmovdqu `16*4-64`($ctx),%x#$D1 1746 vmovdqu `16*5-64`($ctx),%x#$D2 1747 lea 0x90(%rsp),%rax # size optimization 1748 vmovdqu `16*6-64`($ctx),%x#$D3 1749 vpermd $T2,$T0,$T2 # 00003412 -> 14243444 1750 vmovdqu `16*7-64`($ctx),%x#$D4 1751 vpermd $T3,$T0,$T3 1752 vmovdqu `16*8-64`($ctx),%x#$MASK 1753 vpermd $T4,$T0,$T4 1754 vmovdqa $T2,0x00(%rsp) 1755 vpermd $D0,$T0,$D0 1756 vmovdqa $T3,0x20-0x90(%rax) 1757 vpermd $D1,$T0,$D1 1758 vmovdqa $T4,0x40-0x90(%rax) 1759 vpermd $D2,$T0,$D2 1760 vmovdqa $D0,0x60-0x90(%rax) 1761 vpermd $D3,$T0,$D3 1762 vmovdqa $D1,0x80-0x90(%rax) 1763 vpermd $D4,$T0,$D4 1764 vmovdqa $D2,0xa0-0x90(%rax) 1765 vpermd $MASK,$T0,$MASK 1766 vmovdqa $D3,0xc0-0x90(%rax) 1767 vmovdqa $D4,0xe0-0x90(%rax) 1768 vmovdqa $MASK,0x100-0x90(%rax) 1769 vmovdqa 64(%rcx),$MASK # .Lmask26 1770 1771 ################################################################ 1772 # load input 1773 vmovdqu 16*0($inp),%x#$T0 1774 vmovdqu 16*1($inp),%x#$T1 1775 vinserti128 \$1,16*2($inp),$T0,$T0 1776 vinserti128 \$1,16*3($inp),$T1,$T1 1777 lea 16*4($inp),$inp 1778 1779 vpsrldq \$6,$T0,$T2 # splat input 1780 vpsrldq \$6,$T1,$T3 1781 vpunpckhqdq $T1,$T0,$T4 # 4 1782 vpunpcklqdq $T3,$T2,$T2 # 2:3 1783 vpunpcklqdq $T1,$T0,$T0 # 0:1 1784 1785 vpsrlq \$30,$T2,$T3 1786 vpsrlq \$4,$T2,$T2 1787 vpsrlq \$26,$T0,$T1 1788 vpsrlq \$40,$T4,$T4 # 4 1789 vpand $MASK,$T2,$T2 # 2 1790 vpand $MASK,$T0,$T0 # 0 1791 vpand $MASK,$T1,$T1 # 1 1792 vpand $MASK,$T3,$T3 # 3 1793 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1794 1795 vpaddq $H2,$T2,$H2 # accumulate input 1796 sub \$64,$len 1797 jz .Ltail_avx2 1798 jmp .Loop_avx2 1799 1800.align 32 1801.Loop_avx2: 1802 ################################################################ 1803 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 1804 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 1805 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 1806 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 1807 # \________/\__________/ 1808 ################################################################ 1809 #vpaddq $H2,$T2,$H2 # accumulate input 1810 vpaddq $H0,$T0,$H0 1811 vmovdqa `32*0`(%rsp),$T0 # r0^4 1812 vpaddq $H1,$T1,$H1 1813 vmovdqa `32*1`(%rsp),$T1 # r1^4 1814 vpaddq $H3,$T3,$H3 1815 vmovdqa `32*3`(%rsp),$T2 # r2^4 1816 vpaddq $H4,$T4,$H4 1817 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 1818 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 1819 1820 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1821 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1822 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1823 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1824 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1825 # 1826 # however, as h2 is "chronologically" first one available pull 1827 # corresponding operations up, so it's 1828 # 1829 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 1830 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 1831 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1832 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 1833 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 1834 1835 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 1836 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 1837 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 1838 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 1839 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 1840 1841 vpmuludq $H0,$T1,$T4 # h0*r1 1842 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp 1843 vpaddq $T4,$D1,$D1 # d1 += h0*r1 1844 vpaddq $H2,$D2,$D2 # d2 += h1*r1 1845 vpmuludq $H3,$T1,$T4 # h3*r1 1846 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 1847 vpaddq $T4,$D4,$D4 # d4 += h3*r1 1848 vpaddq $H2,$D0,$D0 # d0 += h4*s1 1849 vmovdqa `32*4-0x90`(%rax),$T1 # s2 1850 1851 vpmuludq $H0,$T0,$T4 # h0*r0 1852 vpmuludq $H1,$T0,$H2 # h1*r0 1853 vpaddq $T4,$D0,$D0 # d0 += h0*r0 1854 vpaddq $H2,$D1,$D1 # d1 += h1*r0 1855 vpmuludq $H3,$T0,$T4 # h3*r0 1856 vpmuludq $H4,$T0,$H2 # h4*r0 1857 vmovdqu 16*0($inp),%x#$T0 # load input 1858 vpaddq $T4,$D3,$D3 # d3 += h3*r0 1859 vpaddq $H2,$D4,$D4 # d4 += h4*r0 1860 vinserti128 \$1,16*2($inp),$T0,$T0 1861 1862 vpmuludq $H3,$T1,$T4 # h3*s2 1863 vpmuludq $H4,$T1,$H2 # h4*s2 1864 vmovdqu 16*1($inp),%x#$T1 1865 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1866 vpaddq $H2,$D1,$D1 # d1 += h4*s2 1867 vmovdqa `32*5-0x90`(%rax),$H2 # r3 1868 vpmuludq $H1,$T2,$T4 # h1*r2 1869 vpmuludq $H0,$T2,$T2 # h0*r2 1870 vpaddq $T4,$D3,$D3 # d3 += h1*r2 1871 vpaddq $T2,$D2,$D2 # d2 += h0*r2 1872 vinserti128 \$1,16*3($inp),$T1,$T1 1873 lea 16*4($inp),$inp 1874 1875 vpmuludq $H1,$H2,$T4 # h1*r3 1876 vpmuludq $H0,$H2,$H2 # h0*r3 1877 vpsrldq \$6,$T0,$T2 # splat input 1878 vpaddq $T4,$D4,$D4 # d4 += h1*r3 1879 vpaddq $H2,$D3,$D3 # d3 += h0*r3 1880 vpmuludq $H3,$T3,$T4 # h3*s3 1881 vpmuludq $H4,$T3,$H2 # h4*s3 1882 vpsrldq \$6,$T1,$T3 1883 vpaddq $T4,$D1,$D1 # d1 += h3*s3 1884 vpaddq $H2,$D2,$D2 # d2 += h4*s3 1885 vpunpckhqdq $T1,$T0,$T4 # 4 1886 1887 vpmuludq $H3,$S4,$H3 # h3*s4 1888 vpmuludq $H4,$S4,$H4 # h4*s4 1889 vpunpcklqdq $T1,$T0,$T0 # 0:1 1890 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 1891 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 1892 vpunpcklqdq $T3,$T2,$T3 # 2:3 1893 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 1894 vpmuludq $H1,$S4,$H0 # h1*s4 1895 vmovdqa 64(%rcx),$MASK # .Lmask26 1896 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1897 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1898 1899 ################################################################ 1900 # lazy reduction (interleaved with tail of input splat) 1901 1902 vpsrlq \$26,$H3,$D3 1903 vpand $MASK,$H3,$H3 1904 vpaddq $D3,$H4,$H4 # h3 -> h4 1905 1906 vpsrlq \$26,$H0,$D0 1907 vpand $MASK,$H0,$H0 1908 vpaddq $D0,$D1,$H1 # h0 -> h1 1909 1910 vpsrlq \$26,$H4,$D4 1911 vpand $MASK,$H4,$H4 1912 1913 vpsrlq \$4,$T3,$T2 1914 1915 vpsrlq \$26,$H1,$D1 1916 vpand $MASK,$H1,$H1 1917 vpaddq $D1,$H2,$H2 # h1 -> h2 1918 1919 vpaddq $D4,$H0,$H0 1920 vpsllq \$2,$D4,$D4 1921 vpaddq $D4,$H0,$H0 # h4 -> h0 1922 1923 vpand $MASK,$T2,$T2 # 2 1924 vpsrlq \$26,$T0,$T1 1925 1926 vpsrlq \$26,$H2,$D2 1927 vpand $MASK,$H2,$H2 1928 vpaddq $D2,$H3,$H3 # h2 -> h3 1929 1930 vpaddq $T2,$H2,$H2 # modulo-scheduled 1931 vpsrlq \$30,$T3,$T3 1932 1933 vpsrlq \$26,$H0,$D0 1934 vpand $MASK,$H0,$H0 1935 vpaddq $D0,$H1,$H1 # h0 -> h1 1936 1937 vpsrlq \$40,$T4,$T4 # 4 1938 1939 vpsrlq \$26,$H3,$D3 1940 vpand $MASK,$H3,$H3 1941 vpaddq $D3,$H4,$H4 # h3 -> h4 1942 1943 vpand $MASK,$T0,$T0 # 0 1944 vpand $MASK,$T1,$T1 # 1 1945 vpand $MASK,$T3,$T3 # 3 1946 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1947 1948 sub \$64,$len 1949 jnz .Loop_avx2 1950 1951 .byte 0x66,0x90 1952.Ltail_avx2: 1953 ################################################################ 1954 # while above multiplications were by r^4 in all lanes, in last 1955 # iteration we multiply least significant lane by r^4 and most 1956 # significant one by r, so copy of above except that references 1957 # to the precomputed table are displaced by 4... 1958 1959 #vpaddq $H2,$T2,$H2 # accumulate input 1960 vpaddq $H0,$T0,$H0 1961 vmovdqu `32*0+4`(%rsp),$T0 # r0^4 1962 vpaddq $H1,$T1,$H1 1963 vmovdqu `32*1+4`(%rsp),$T1 # r1^4 1964 vpaddq $H3,$T3,$H3 1965 vmovdqu `32*3+4`(%rsp),$T2 # r2^4 1966 vpaddq $H4,$T4,$H4 1967 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 1968 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 1969 1970 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 1971 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 1972 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 1973 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 1974 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 1975 1976 vpmuludq $H0,$T1,$T4 # h0*r1 1977 vpmuludq $H1,$T1,$H2 # h1*r1 1978 vpaddq $T4,$D1,$D1 # d1 += h0*r1 1979 vpaddq $H2,$D2,$D2 # d2 += h1*r1 1980 vpmuludq $H3,$T1,$T4 # h3*r1 1981 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 1982 vpaddq $T4,$D4,$D4 # d4 += h3*r1 1983 vpaddq $H2,$D0,$D0 # d0 += h4*s1 1984 1985 vpmuludq $H0,$T0,$T4 # h0*r0 1986 vpmuludq $H1,$T0,$H2 # h1*r0 1987 vpaddq $T4,$D0,$D0 # d0 += h0*r0 1988 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 1989 vpaddq $H2,$D1,$D1 # d1 += h1*r0 1990 vpmuludq $H3,$T0,$T4 # h3*r0 1991 vpmuludq $H4,$T0,$H2 # h4*r0 1992 vpaddq $T4,$D3,$D3 # d3 += h3*r0 1993 vpaddq $H2,$D4,$D4 # d4 += h4*r0 1994 1995 vpmuludq $H3,$T1,$T4 # h3*s2 1996 vpmuludq $H4,$T1,$H2 # h4*s2 1997 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1998 vpaddq $H2,$D1,$D1 # d1 += h4*s2 1999 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 2000 vpmuludq $H1,$T2,$T4 # h1*r2 2001 vpmuludq $H0,$T2,$T2 # h0*r2 2002 vpaddq $T4,$D3,$D3 # d3 += h1*r2 2003 vpaddq $T2,$D2,$D2 # d2 += h0*r2 2004 2005 vpmuludq $H1,$H2,$T4 # h1*r3 2006 vpmuludq $H0,$H2,$H2 # h0*r3 2007 vpaddq $T4,$D4,$D4 # d4 += h1*r3 2008 vpaddq $H2,$D3,$D3 # d3 += h0*r3 2009 vpmuludq $H3,$T3,$T4 # h3*s3 2010 vpmuludq $H4,$T3,$H2 # h4*s3 2011 vpaddq $T4,$D1,$D1 # d1 += h3*s3 2012 vpaddq $H2,$D2,$D2 # d2 += h4*s3 2013 2014 vpmuludq $H3,$S4,$H3 # h3*s4 2015 vpmuludq $H4,$S4,$H4 # h4*s4 2016 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 2017 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 2018 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 2019 vpmuludq $H1,$S4,$H0 # h1*s4 2020 vmovdqa 64(%rcx),$MASK # .Lmask26 2021 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 2022 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 2023 2024 ################################################################ 2025 # horizontal addition 2026 2027 vpsrldq \$8,$D1,$T1 2028 vpsrldq \$8,$H2,$T2 2029 vpsrldq \$8,$H3,$T3 2030 vpsrldq \$8,$H4,$T4 2031 vpsrldq \$8,$H0,$T0 2032 vpaddq $T1,$D1,$D1 2033 vpaddq $T2,$H2,$H2 2034 vpaddq $T3,$H3,$H3 2035 vpaddq $T4,$H4,$H4 2036 vpaddq $T0,$H0,$H0 2037 2038 vpermq \$0x2,$H3,$T3 2039 vpermq \$0x2,$H4,$T4 2040 vpermq \$0x2,$H0,$T0 2041 vpermq \$0x2,$D1,$T1 2042 vpermq \$0x2,$H2,$T2 2043 vpaddq $T3,$H3,$H3 2044 vpaddq $T4,$H4,$H4 2045 vpaddq $T0,$H0,$H0 2046 vpaddq $T1,$D1,$D1 2047 vpaddq $T2,$H2,$H2 2048 2049 ################################################################ 2050 # lazy reduction 2051 2052 vpsrlq \$26,$H3,$D3 2053 vpand $MASK,$H3,$H3 2054 vpaddq $D3,$H4,$H4 # h3 -> h4 2055 2056 vpsrlq \$26,$H0,$D0 2057 vpand $MASK,$H0,$H0 2058 vpaddq $D0,$D1,$H1 # h0 -> h1 2059 2060 vpsrlq \$26,$H4,$D4 2061 vpand $MASK,$H4,$H4 2062 2063 vpsrlq \$26,$H1,$D1 2064 vpand $MASK,$H1,$H1 2065 vpaddq $D1,$H2,$H2 # h1 -> h2 2066 2067 vpaddq $D4,$H0,$H0 2068 vpsllq \$2,$D4,$D4 2069 vpaddq $D4,$H0,$H0 # h4 -> h0 2070 2071 vpsrlq \$26,$H2,$D2 2072 vpand $MASK,$H2,$H2 2073 vpaddq $D2,$H3,$H3 # h2 -> h3 2074 2075 vpsrlq \$26,$H0,$D0 2076 vpand $MASK,$H0,$H0 2077 vpaddq $D0,$H1,$H1 # h0 -> h1 2078 2079 vpsrlq \$26,$H3,$D3 2080 vpand $MASK,$H3,$H3 2081 vpaddq $D3,$H4,$H4 # h3 -> h4 2082 2083 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2084 vmovd %x#$H1,`4*1-48-64`($ctx) 2085 vmovd %x#$H2,`4*2-48-64`($ctx) 2086 vmovd %x#$H3,`4*3-48-64`($ctx) 2087 vmovd %x#$H4,`4*4-48-64`($ctx) 2088___ 2089$code.=<<___ if ($win64); 2090 vmovdqa 0x50(%r11),%xmm6 2091 vmovdqa 0x60(%r11),%xmm7 2092 vmovdqa 0x70(%r11),%xmm8 2093 vmovdqa 0x80(%r11),%xmm9 2094 vmovdqa 0x90(%r11),%xmm10 2095 vmovdqa 0xa0(%r11),%xmm11 2096 vmovdqa 0xb0(%r11),%xmm12 2097 vmovdqa 0xc0(%r11),%xmm13 2098 vmovdqa 0xd0(%r11),%xmm14 2099 vmovdqa 0xe0(%r11),%xmm15 2100 lea 0xf8(%r11),%rsp 2101.Ldo_avx2_epilogue: 2102___ 2103$code.=<<___ if (!$win64); 2104 lea 8(%r11),%rsp 2105.cfi_def_cfa %rsp,8 2106___ 2107$code.=<<___; 2108 vzeroupper 2109 ret 2110.cfi_endproc 2111.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 2112___ 2113####################################################################### 2114if ($avx>2) { 2115# On entry we have input length divisible by 64. But since inner loop 2116# processes 128 bytes per iteration, cases when length is not divisible 2117# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this 2118# reason stack layout is kept identical to poly1305_blocks_avx2. If not 2119# for this tail, we wouldn't have to even allocate stack frame... 2120 2121my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); 2122my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); 2123my $PADBIT="%zmm30"; 2124 2125map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain 2126map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); 2127map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); 2128map(s/%y/%z/,($MASK)); 2129 2130$code.=<<___; 2131.type poly1305_blocks_avx512,\@function,4 2132.align 32 2133poly1305_blocks_avx512: 2134.cfi_startproc 2135.Lblocks_avx512: 2136 mov \$15,%eax 2137 kmovw %eax,%k2 2138___ 2139$code.=<<___ if (!$win64); 2140 lea -8(%rsp),%r11 2141.cfi_def_cfa %r11,16 2142 sub \$0x128,%rsp 2143___ 2144$code.=<<___ if ($win64); 2145 lea -0xf8(%rsp),%r11 2146 sub \$0x1c8,%rsp 2147 vmovdqa %xmm6,0x50(%r11) 2148 vmovdqa %xmm7,0x60(%r11) 2149 vmovdqa %xmm8,0x70(%r11) 2150 vmovdqa %xmm9,0x80(%r11) 2151 vmovdqa %xmm10,0x90(%r11) 2152 vmovdqa %xmm11,0xa0(%r11) 2153 vmovdqa %xmm12,0xb0(%r11) 2154 vmovdqa %xmm13,0xc0(%r11) 2155 vmovdqa %xmm14,0xd0(%r11) 2156 vmovdqa %xmm15,0xe0(%r11) 2157.Ldo_avx512_body: 2158___ 2159$code.=<<___; 2160 lea .Lconst(%rip),%rcx 2161 lea 48+64($ctx),$ctx # size optimization 2162 vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 2163 2164 # expand pre-calculated table 2165 vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} 2166 and \$-512,%rsp 2167 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} 2168 mov \$0x20,%rax 2169 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} 2170 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} 2171 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} 2172 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} 2173 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} 2174 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} 2175 vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4} 2176 vpermd $D0,$T2,$R0 # 00003412 -> 14243444 2177 vpbroadcastq 64(%rcx),$MASK # .Lmask26 2178 vpermd $D1,$T2,$R1 2179 vpermd $T0,$T2,$S1 2180 vpermd $D2,$T2,$R2 2181 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 2182 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 2183 vpermd $T1,$T2,$S2 2184 vmovdqu64 $R1,0x00(%rsp,%rax){%k2} 2185 vpsrlq \$32,$R1,$T1 2186 vpermd $D3,$T2,$R3 2187 vmovdqa64 $S1,0x40(%rsp){%k2} 2188 vpermd $T3,$T2,$S3 2189 vpermd $D4,$T2,$R4 2190 vmovdqu64 $R2,0x40(%rsp,%rax){%k2} 2191 vpermd $T4,$T2,$S4 2192 vmovdqa64 $S2,0x80(%rsp){%k2} 2193 vmovdqu64 $R3,0x80(%rsp,%rax){%k2} 2194 vmovdqa64 $S3,0xc0(%rsp){%k2} 2195 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} 2196 vmovdqa64 $S4,0x100(%rsp){%k2} 2197 2198 ################################################################ 2199 # calculate 5th through 8th powers of the key 2200 # 2201 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 2202 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 2203 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 2204 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 2205 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 2206 2207 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 2208 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 2209 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 2210 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 2211 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 2212 vpsrlq \$32,$R2,$T2 2213 2214 vpmuludq $T1,$S4,$M0 2215 vpmuludq $T1,$R0,$M1 2216 vpmuludq $T1,$R1,$M2 2217 vpmuludq $T1,$R2,$M3 2218 vpmuludq $T1,$R3,$M4 2219 vpsrlq \$32,$R3,$T3 2220 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 2221 vpaddq $M1,$D1,$D1 # d1 += r1'*r0 2222 vpaddq $M2,$D2,$D2 # d2 += r1'*r1 2223 vpaddq $M3,$D3,$D3 # d3 += r1'*r2 2224 vpaddq $M4,$D4,$D4 # d4 += r1'*r3 2225 2226 vpmuludq $T2,$S3,$M0 2227 vpmuludq $T2,$S4,$M1 2228 vpmuludq $T2,$R1,$M3 2229 vpmuludq $T2,$R2,$M4 2230 vpmuludq $T2,$R0,$M2 2231 vpsrlq \$32,$R4,$T4 2232 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 2233 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 2234 vpaddq $M3,$D3,$D3 # d3 += r2'*r1 2235 vpaddq $M4,$D4,$D4 # d4 += r2'*r2 2236 vpaddq $M2,$D2,$D2 # d2 += r2'*r0 2237 2238 vpmuludq $T3,$S2,$M0 2239 vpmuludq $T3,$R0,$M3 2240 vpmuludq $T3,$R1,$M4 2241 vpmuludq $T3,$S3,$M1 2242 vpmuludq $T3,$S4,$M2 2243 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 2244 vpaddq $M3,$D3,$D3 # d3 += r3'*r0 2245 vpaddq $M4,$D4,$D4 # d4 += r3'*r1 2246 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 2247 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 2248 2249 vpmuludq $T4,$S4,$M3 2250 vpmuludq $T4,$R0,$M4 2251 vpmuludq $T4,$S1,$M0 2252 vpmuludq $T4,$S2,$M1 2253 vpmuludq $T4,$S3,$M2 2254 vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 2255 vpaddq $M4,$D4,$D4 # d4 += r2'*r0 2256 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 2257 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 2258 vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 2259 2260 ################################################################ 2261 # load input 2262 vmovdqu64 16*0($inp),%z#$T3 2263 vmovdqu64 16*4($inp),%z#$T4 2264 lea 16*8($inp),$inp 2265 2266 ################################################################ 2267 # lazy reduction 2268 2269 vpsrlq \$26,$D3,$M3 2270 vpandq $MASK,$D3,$D3 2271 vpaddq $M3,$D4,$D4 # d3 -> d4 2272 2273 vpsrlq \$26,$D0,$M0 2274 vpandq $MASK,$D0,$D0 2275 vpaddq $M0,$D1,$D1 # d0 -> d1 2276 2277 vpsrlq \$26,$D4,$M4 2278 vpandq $MASK,$D4,$D4 2279 2280 vpsrlq \$26,$D1,$M1 2281 vpandq $MASK,$D1,$D1 2282 vpaddq $M1,$D2,$D2 # d1 -> d2 2283 2284 vpaddq $M4,$D0,$D0 2285 vpsllq \$2,$M4,$M4 2286 vpaddq $M4,$D0,$D0 # d4 -> d0 2287 2288 vpsrlq \$26,$D2,$M2 2289 vpandq $MASK,$D2,$D2 2290 vpaddq $M2,$D3,$D3 # d2 -> d3 2291 2292 vpsrlq \$26,$D0,$M0 2293 vpandq $MASK,$D0,$D0 2294 vpaddq $M0,$D1,$D1 # d0 -> d1 2295 2296 vpsrlq \$26,$D3,$M3 2297 vpandq $MASK,$D3,$D3 2298 vpaddq $M3,$D4,$D4 # d3 -> d4 2299 2300 ################################################################ 2301 # at this point we have 14243444 in $R0-$S4 and 05060708 in 2302 # $D0-$D4, ... 2303 2304 vpunpcklqdq $T4,$T3,$T0 # transpose input 2305 vpunpckhqdq $T4,$T3,$T4 2306 2307 # ... since input 64-bit lanes are ordered as 73625140, we could 2308 # "vperm" it to 76543210 (here and in each loop iteration), *or* 2309 # we could just flow along, hence the goal for $R0-$S4 is 2310 # 1858286838784888 ... 2311 2312 vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: 2313 mov \$0x7777,%eax 2314 kmovw %eax,%k1 2315 2316 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- 2317 vpermd $R1,$M0,$R1 2318 vpermd $R2,$M0,$R2 2319 vpermd $R3,$M0,$R3 2320 vpermd $R4,$M0,$R4 2321 2322 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 2323 vpermd $D1,$M0,${R1}{%k1} 2324 vpermd $D2,$M0,${R2}{%k1} 2325 vpermd $D3,$M0,${R3}{%k1} 2326 vpermd $D4,$M0,${R4}{%k1} 2327 2328 vpslld \$2,$R1,$S1 # *5 2329 vpslld \$2,$R2,$S2 2330 vpslld \$2,$R3,$S3 2331 vpslld \$2,$R4,$S4 2332 vpaddd $R1,$S1,$S1 2333 vpaddd $R2,$S2,$S2 2334 vpaddd $R3,$S3,$S3 2335 vpaddd $R4,$S4,$S4 2336 2337 vpbroadcastq 32(%rcx),$PADBIT # .L129 2338 2339 vpsrlq \$52,$T0,$T2 # splat input 2340 vpsllq \$12,$T4,$T3 2341 vporq $T3,$T2,$T2 2342 vpsrlq \$26,$T0,$T1 2343 vpsrlq \$14,$T4,$T3 2344 vpsrlq \$40,$T4,$T4 # 4 2345 vpandq $MASK,$T2,$T2 # 2 2346 vpandq $MASK,$T0,$T0 # 0 2347 #vpandq $MASK,$T1,$T1 # 1 2348 #vpandq $MASK,$T3,$T3 # 3 2349 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2350 2351 vpaddq $H2,$T2,$H2 # accumulate input 2352 sub \$192,$len 2353 jbe .Ltail_avx512 2354 jmp .Loop_avx512 2355 2356.align 32 2357.Loop_avx512: 2358 ################################################################ 2359 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 2360 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 2361 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 2362 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 2363 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 2364 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 2365 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 2366 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 2367 # \________/\___________/ 2368 ################################################################ 2369 #vpaddq $H2,$T2,$H2 # accumulate input 2370 2371 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 2372 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 2373 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 2374 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 2375 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 2376 # 2377 # however, as h2 is "chronologically" first one available pull 2378 # corresponding operations up, so it's 2379 # 2380 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 2381 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 2382 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 2383 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 2384 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 2385 2386 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2387 vpaddq $H0,$T0,$H0 2388 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2389 vpandq $MASK,$T1,$T1 # 1 2390 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2391 vpandq $MASK,$T3,$T3 # 3 2392 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2393 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2394 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2395 vpaddq $H1,$T1,$H1 # accumulate input 2396 vpaddq $H3,$T3,$H3 2397 vpaddq $H4,$T4,$H4 2398 2399 vmovdqu64 16*0($inp),$T3 # load input 2400 vmovdqu64 16*4($inp),$T4 2401 lea 16*8($inp),$inp 2402 vpmuludq $H0,$R3,$M3 2403 vpmuludq $H0,$R4,$M4 2404 vpmuludq $H0,$R0,$M0 2405 vpmuludq $H0,$R1,$M1 2406 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2407 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2408 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2409 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2410 2411 vpmuludq $H1,$R2,$M3 2412 vpmuludq $H1,$R3,$M4 2413 vpmuludq $H1,$S4,$M0 2414 vpmuludq $H0,$R2,$M2 2415 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2416 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2417 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2418 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2419 2420 vpunpcklqdq $T4,$T3,$T0 # transpose input 2421 vpunpckhqdq $T4,$T3,$T4 2422 2423 vpmuludq $H3,$R0,$M3 2424 vpmuludq $H3,$R1,$M4 2425 vpmuludq $H1,$R0,$M1 2426 vpmuludq $H1,$R1,$M2 2427 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2428 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2429 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2430 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2431 2432 vpmuludq $H4,$S4,$M3 2433 vpmuludq $H4,$R0,$M4 2434 vpmuludq $H3,$S2,$M0 2435 vpmuludq $H3,$S3,$M1 2436 vpaddq $M3,$D3,$D3 # d3 += h4*s4 2437 vpmuludq $H3,$S4,$M2 2438 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2439 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2440 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2441 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2442 2443 vpmuludq $H4,$S1,$M0 2444 vpmuludq $H4,$S2,$M1 2445 vpmuludq $H4,$S3,$M2 2446 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2447 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2448 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2449 2450 ################################################################ 2451 # lazy reduction (interleaved with input splat) 2452 2453 vpsrlq \$52,$T0,$T2 # splat input 2454 vpsllq \$12,$T4,$T3 2455 2456 vpsrlq \$26,$D3,$H3 2457 vpandq $MASK,$D3,$D3 2458 vpaddq $H3,$D4,$H4 # h3 -> h4 2459 2460 vporq $T3,$T2,$T2 2461 2462 vpsrlq \$26,$H0,$D0 2463 vpandq $MASK,$H0,$H0 2464 vpaddq $D0,$H1,$H1 # h0 -> h1 2465 2466 vpandq $MASK,$T2,$T2 # 2 2467 2468 vpsrlq \$26,$H4,$D4 2469 vpandq $MASK,$H4,$H4 2470 2471 vpsrlq \$26,$H1,$D1 2472 vpandq $MASK,$H1,$H1 2473 vpaddq $D1,$H2,$H2 # h1 -> h2 2474 2475 vpaddq $D4,$H0,$H0 2476 vpsllq \$2,$D4,$D4 2477 vpaddq $D4,$H0,$H0 # h4 -> h0 2478 2479 vpaddq $T2,$H2,$H2 # modulo-scheduled 2480 vpsrlq \$26,$T0,$T1 2481 2482 vpsrlq \$26,$H2,$D2 2483 vpandq $MASK,$H2,$H2 2484 vpaddq $D2,$D3,$H3 # h2 -> h3 2485 2486 vpsrlq \$14,$T4,$T3 2487 2488 vpsrlq \$26,$H0,$D0 2489 vpandq $MASK,$H0,$H0 2490 vpaddq $D0,$H1,$H1 # h0 -> h1 2491 2492 vpsrlq \$40,$T4,$T4 # 4 2493 2494 vpsrlq \$26,$H3,$D3 2495 vpandq $MASK,$H3,$H3 2496 vpaddq $D3,$H4,$H4 # h3 -> h4 2497 2498 vpandq $MASK,$T0,$T0 # 0 2499 #vpandq $MASK,$T1,$T1 # 1 2500 #vpandq $MASK,$T3,$T3 # 3 2501 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2502 2503 sub \$128,$len 2504 ja .Loop_avx512 2505 2506.Ltail_avx512: 2507 ################################################################ 2508 # while above multiplications were by r^8 in all lanes, in last 2509 # iteration we multiply least significant lane by r^8 and most 2510 # significant one by r, that's why table gets shifted... 2511 2512 vpsrlq \$32,$R0,$R0 # 0105020603070408 2513 vpsrlq \$32,$R1,$R1 2514 vpsrlq \$32,$R2,$R2 2515 vpsrlq \$32,$S3,$S3 2516 vpsrlq \$32,$S4,$S4 2517 vpsrlq \$32,$R3,$R3 2518 vpsrlq \$32,$R4,$R4 2519 vpsrlq \$32,$S1,$S1 2520 vpsrlq \$32,$S2,$S2 2521 2522 ################################################################ 2523 # load either next or last 64 byte of input 2524 lea ($inp,$len),$inp 2525 2526 #vpaddq $H2,$T2,$H2 # accumulate input 2527 vpaddq $H0,$T0,$H0 2528 2529 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2530 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2531 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2532 vpandq $MASK,$T1,$T1 # 1 2533 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2534 vpandq $MASK,$T3,$T3 # 3 2535 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2536 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2537 vpaddq $H1,$T1,$H1 # accumulate input 2538 vpaddq $H3,$T3,$H3 2539 vpaddq $H4,$T4,$H4 2540 2541 vmovdqu 16*0($inp),%x#$T0 2542 vpmuludq $H0,$R3,$M3 2543 vpmuludq $H0,$R4,$M4 2544 vpmuludq $H0,$R0,$M0 2545 vpmuludq $H0,$R1,$M1 2546 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2547 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2548 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2549 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2550 2551 vmovdqu 16*1($inp),%x#$T1 2552 vpmuludq $H1,$R2,$M3 2553 vpmuludq $H1,$R3,$M4 2554 vpmuludq $H1,$S4,$M0 2555 vpmuludq $H0,$R2,$M2 2556 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2557 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2558 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2559 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2560 2561 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 2562 vpmuludq $H3,$R0,$M3 2563 vpmuludq $H3,$R1,$M4 2564 vpmuludq $H1,$R0,$M1 2565 vpmuludq $H1,$R1,$M2 2566 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2567 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2568 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2569 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2570 2571 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 2572 vpmuludq $H4,$S4,$M3 2573 vpmuludq $H4,$R0,$M4 2574 vpmuludq $H3,$S2,$M0 2575 vpmuludq $H3,$S3,$M1 2576 vpmuludq $H3,$S4,$M2 2577 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 2578 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2579 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2580 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2581 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2582 2583 vpmuludq $H4,$S1,$M0 2584 vpmuludq $H4,$S2,$M1 2585 vpmuludq $H4,$S3,$M2 2586 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2587 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2588 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2589 2590 ################################################################ 2591 # horizontal addition 2592 2593 mov \$1,%eax 2594 vpermq \$0xb1,$H3,$D3 2595 vpermq \$0xb1,$D4,$H4 2596 vpermq \$0xb1,$H0,$D0 2597 vpermq \$0xb1,$H1,$D1 2598 vpermq \$0xb1,$H2,$D2 2599 vpaddq $D3,$H3,$H3 2600 vpaddq $D4,$H4,$H4 2601 vpaddq $D0,$H0,$H0 2602 vpaddq $D1,$H1,$H1 2603 vpaddq $D2,$H2,$H2 2604 2605 kmovw %eax,%k3 2606 vpermq \$0x2,$H3,$D3 2607 vpermq \$0x2,$H4,$D4 2608 vpermq \$0x2,$H0,$D0 2609 vpermq \$0x2,$H1,$D1 2610 vpermq \$0x2,$H2,$D2 2611 vpaddq $D3,$H3,$H3 2612 vpaddq $D4,$H4,$H4 2613 vpaddq $D0,$H0,$H0 2614 vpaddq $D1,$H1,$H1 2615 vpaddq $D2,$H2,$H2 2616 2617 vextracti64x4 \$0x1,$H3,%y#$D3 2618 vextracti64x4 \$0x1,$H4,%y#$D4 2619 vextracti64x4 \$0x1,$H0,%y#$D0 2620 vextracti64x4 \$0x1,$H1,%y#$D1 2621 vextracti64x4 \$0x1,$H2,%y#$D2 2622 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case 2623 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 2624 vpaddq $D0,$H0,${H0}{%k3}{z} 2625 vpaddq $D1,$H1,${H1}{%k3}{z} 2626 vpaddq $D2,$H2,${H2}{%k3}{z} 2627___ 2628map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); 2629map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); 2630$code.=<<___; 2631 ################################################################ 2632 # lazy reduction (interleaved with input splat) 2633 2634 vpsrlq \$26,$H3,$D3 2635 vpand $MASK,$H3,$H3 2636 vpsrldq \$6,$T0,$T2 # splat input 2637 vpsrldq \$6,$T1,$T3 2638 vpunpckhqdq $T1,$T0,$T4 # 4 2639 vpaddq $D3,$H4,$H4 # h3 -> h4 2640 2641 vpsrlq \$26,$H0,$D0 2642 vpand $MASK,$H0,$H0 2643 vpunpcklqdq $T3,$T2,$T2 # 2:3 2644 vpunpcklqdq $T1,$T0,$T0 # 0:1 2645 vpaddq $D0,$H1,$H1 # h0 -> h1 2646 2647 vpsrlq \$26,$H4,$D4 2648 vpand $MASK,$H4,$H4 2649 2650 vpsrlq \$26,$H1,$D1 2651 vpand $MASK,$H1,$H1 2652 vpsrlq \$30,$T2,$T3 2653 vpsrlq \$4,$T2,$T2 2654 vpaddq $D1,$H2,$H2 # h1 -> h2 2655 2656 vpaddq $D4,$H0,$H0 2657 vpsllq \$2,$D4,$D4 2658 vpsrlq \$26,$T0,$T1 2659 vpsrlq \$40,$T4,$T4 # 4 2660 vpaddq $D4,$H0,$H0 # h4 -> h0 2661 2662 vpsrlq \$26,$H2,$D2 2663 vpand $MASK,$H2,$H2 2664 vpand $MASK,$T2,$T2 # 2 2665 vpand $MASK,$T0,$T0 # 0 2666 vpaddq $D2,$H3,$H3 # h2 -> h3 2667 2668 vpsrlq \$26,$H0,$D0 2669 vpand $MASK,$H0,$H0 2670 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 2671 vpand $MASK,$T1,$T1 # 1 2672 vpaddq $D0,$H1,$H1 # h0 -> h1 2673 2674 vpsrlq \$26,$H3,$D3 2675 vpand $MASK,$H3,$H3 2676 vpand $MASK,$T3,$T3 # 3 2677 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 2678 vpaddq $D3,$H4,$H4 # h3 -> h4 2679 2680 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 2681 add \$64,$len 2682 jnz .Ltail_avx2 2683 2684 vpsubq $T2,$H2,$H2 # undo input accumulation 2685 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2686 vmovd %x#$H1,`4*1-48-64`($ctx) 2687 vmovd %x#$H2,`4*2-48-64`($ctx) 2688 vmovd %x#$H3,`4*3-48-64`($ctx) 2689 vmovd %x#$H4,`4*4-48-64`($ctx) 2690 vzeroall 2691___ 2692$code.=<<___ if ($win64); 2693 movdqa 0x50(%r11),%xmm6 2694 movdqa 0x60(%r11),%xmm7 2695 movdqa 0x70(%r11),%xmm8 2696 movdqa 0x80(%r11),%xmm9 2697 movdqa 0x90(%r11),%xmm10 2698 movdqa 0xa0(%r11),%xmm11 2699 movdqa 0xb0(%r11),%xmm12 2700 movdqa 0xc0(%r11),%xmm13 2701 movdqa 0xd0(%r11),%xmm14 2702 movdqa 0xe0(%r11),%xmm15 2703 lea 0xf8(%r11),%rsp 2704.Ldo_avx512_epilogue: 2705___ 2706$code.=<<___ if (!$win64); 2707 lea 8(%r11),%rsp 2708.cfi_def_cfa %rsp,8 2709___ 2710$code.=<<___; 2711 ret 2712.cfi_endproc 2713.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 2714___ 2715if ($avx>3) { 2716######################################################################## 2717# VPMADD52 version using 2^44 radix. 2718# 2719# One can argue that base 2^52 would be more natural. Well, even though 2720# some operations would be more natural, one has to recognize couple of 2721# things. Base 2^52 doesn't provide advantage over base 2^44 if you look 2722# at amount of multiply-n-accumulate operations. Secondly, it makes it 2723# impossible to pre-compute multiples of 5 [referred to as s[]/sN in 2724# reference implementations], which means that more such operations 2725# would have to be performed in inner loop, which in turn makes critical 2726# path longer. In other words, even though base 2^44 reduction might 2727# look less elegant, overall critical path is actually shorter... 2728 2729######################################################################## 2730# Layout of opaque area is following. 2731# 2732# unsigned __int64 h[3]; # current hash value base 2^44 2733# unsigned __int64 s[2]; # key value*20 base 2^44 2734# unsigned __int64 r[3]; # key value base 2^44 2735# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; 2736# # r^n positions reflect 2737# # placement in register, not 2738# # memory, R[3] is R[1]*20 2739 2740$code.=<<___; 2741.type poly1305_init_base2_44,\@function,3 2742.align 32 2743poly1305_init_base2_44: 2744 xor %rax,%rax 2745 mov %rax,0($ctx) # initialize hash value 2746 mov %rax,8($ctx) 2747 mov %rax,16($ctx) 2748 2749.Linit_base2_44: 2750 lea poly1305_blocks_vpmadd52(%rip),%r10 2751 lea poly1305_emit_base2_44(%rip),%r11 2752 2753 mov \$0x0ffffffc0fffffff,%rax 2754 mov \$0x0ffffffc0ffffffc,%rcx 2755 and 0($inp),%rax 2756 mov \$0x00000fffffffffff,%r8 2757 and 8($inp),%rcx 2758 mov \$0x00000fffffffffff,%r9 2759 and %rax,%r8 2760 shrd \$44,%rcx,%rax 2761 mov %r8,40($ctx) # r0 2762 and %r9,%rax 2763 shr \$24,%rcx 2764 mov %rax,48($ctx) # r1 2765 lea (%rax,%rax,4),%rax # *5 2766 mov %rcx,56($ctx) # r2 2767 shl \$2,%rax # magic <<2 2768 lea (%rcx,%rcx,4),%rcx # *5 2769 shl \$2,%rcx # magic <<2 2770 mov %rax,24($ctx) # s1 2771 mov %rcx,32($ctx) # s2 2772 movq \$-1,64($ctx) # write impossible value 2773___ 2774$code.=<<___ if ($flavour !~ /elf32/); 2775 mov %r10,0(%rdx) 2776 mov %r11,8(%rdx) 2777___ 2778$code.=<<___ if ($flavour =~ /elf32/); 2779 mov %r10d,0(%rdx) 2780 mov %r11d,4(%rdx) 2781___ 2782$code.=<<___; 2783 mov \$1,%eax 2784 ret 2785.size poly1305_init_base2_44,.-poly1305_init_base2_44 2786___ 2787{ 2788my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); 2789my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); 2790my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); 2791 2792$code.=<<___; 2793.type poly1305_blocks_vpmadd52,\@function,4 2794.align 32 2795poly1305_blocks_vpmadd52: 2796 shr \$4,$len 2797 jz .Lno_data_vpmadd52 # too short 2798 2799 shl \$40,$padbit 2800 mov 64($ctx),%r8 # peek on power of the key 2801 2802 # if powers of the key are not calculated yet, process up to 3 2803 # blocks with this single-block subroutine, otherwise ensure that 2804 # length is divisible by 2 blocks and pass the rest down to next 2805 # subroutine... 2806 2807 mov \$3,%rax 2808 mov \$1,%r10 2809 cmp \$4,$len # is input long 2810 cmovae %r10,%rax 2811 test %r8,%r8 # is power value impossible? 2812 cmovns %r10,%rax 2813 2814 and $len,%rax # is input of favourable length? 2815 jz .Lblocks_vpmadd52_4x 2816 2817 sub %rax,$len 2818 mov \$7,%r10d 2819 mov \$1,%r11d 2820 kmovw %r10d,%k7 2821 lea .L2_44_inp_permd(%rip),%r10 2822 kmovw %r11d,%k1 2823 2824 vmovq $padbit,%x#$PAD 2825 vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd 2826 vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift 2827 vpermq \$0xcf,$PAD,$PAD 2828 vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask 2829 2830 vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value 2831 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys 2832 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} 2833 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} 2834 2835 vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt 2836 vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft 2837 2838 jmp .Loop_vpmadd52 2839 2840.align 32 2841.Loop_vpmadd52: 2842 vmovdqu32 0($inp),%x#$T0 # load input as ----3210 2843 lea 16($inp),$inp 2844 2845 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 2846 vpsrlvq $inp_shift,$T0,$T0 2847 vpandq $reduc_mask,$T0,$T0 2848 vporq $PAD,$T0,$T0 2849 2850 vpaddq $T0,$Dlo,$Dlo # accumulate input 2851 2852 vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value 2853 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} 2854 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} 2855 2856 vpxord $Dlo,$Dlo,$Dlo 2857 vpxord $Dhi,$Dhi,$Dhi 2858 2859 vpmadd52luq $r2r1r0,$H0,$Dlo 2860 vpmadd52huq $r2r1r0,$H0,$Dhi 2861 2862 vpmadd52luq $r1r0s2,$H1,$Dlo 2863 vpmadd52huq $r1r0s2,$H1,$Dhi 2864 2865 vpmadd52luq $r0s2s1,$H2,$Dlo 2866 vpmadd52huq $r0s2s1,$H2,$Dhi 2867 2868 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword 2869 vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword 2870 vpandq $reduc_mask,$Dlo,$Dlo 2871 2872 vpaddq $T0,$Dhi,$Dhi 2873 2874 vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword 2875 2876 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) 2877 2878 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word 2879 vpandq $reduc_mask,$Dlo,$Dlo 2880 2881 vpermq \$0b10010011,$T0,$T0 2882 2883 vpaddq $T0,$Dlo,$Dlo 2884 2885 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} 2886 2887 vpaddq $T0,$Dlo,$Dlo 2888 vpsllq \$2,$T0,$T0 2889 2890 vpaddq $T0,$Dlo,$Dlo 2891 2892 dec %rax # len-=16 2893 jnz .Loop_vpmadd52 2894 2895 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value 2896 2897 test $len,$len 2898 jnz .Lblocks_vpmadd52_4x 2899 2900.Lno_data_vpmadd52: 2901 ret 2902.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 2903___ 2904} 2905{ 2906######################################################################## 2907# As implied by its name 4x subroutine processes 4 blocks in parallel 2908# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power 2909# and is handled in 256-bit %ymm registers. 2910 2911my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 2912my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 2913my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 2914 2915$code.=<<___; 2916.type poly1305_blocks_vpmadd52_4x,\@function,4 2917.align 32 2918poly1305_blocks_vpmadd52_4x: 2919 shr \$4,$len 2920 jz .Lno_data_vpmadd52_4x # too short 2921 2922 shl \$40,$padbit 2923 mov 64($ctx),%r8 # peek on power of the key 2924 2925.Lblocks_vpmadd52_4x: 2926 vpbroadcastq $padbit,$PAD 2927 2928 vmovdqa64 .Lx_mask44(%rip),$mask44 2929 mov \$5,%eax 2930 vmovdqa64 .Lx_mask42(%rip),$mask42 2931 kmovw %eax,%k1 # used in 2x path 2932 2933 test %r8,%r8 # is power value impossible? 2934 js .Linit_vpmadd52 # if it is, then init R[4] 2935 2936 vmovq 0($ctx),%x#$H0 # load current hash value 2937 vmovq 8($ctx),%x#$H1 2938 vmovq 16($ctx),%x#$H2 2939 2940 test \$3,$len # is length 4*n+2? 2941 jnz .Lblocks_vpmadd52_2x_do 2942 2943.Lblocks_vpmadd52_4x_do: 2944 vpbroadcastq 64($ctx),$R0 # load 4th power of the key 2945 vpbroadcastq 96($ctx),$R1 2946 vpbroadcastq 128($ctx),$R2 2947 vpbroadcastq 160($ctx),$S1 2948 2949.Lblocks_vpmadd52_4x_key_loaded: 2950 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 2951 vpaddq $R2,$S2,$S2 2952 vpsllq \$2,$S2,$S2 2953 2954 test \$7,$len # is len 8*n? 2955 jz .Lblocks_vpmadd52_8x 2956 2957 vmovdqu64 16*0($inp),$T2 # load data 2958 vmovdqu64 16*2($inp),$T3 2959 lea 16*4($inp),$inp 2960 2961 vpunpcklqdq $T3,$T2,$T1 # transpose data 2962 vpunpckhqdq $T3,$T2,$T3 2963 2964 # at this point 64-bit lanes are ordered as 3-1-2-0 2965 2966 vpsrlq \$24,$T3,$T2 # splat the data 2967 vporq $PAD,$T2,$T2 2968 vpaddq $T2,$H2,$H2 # accumulate input 2969 vpandq $mask44,$T1,$T0 2970 vpsrlq \$44,$T1,$T1 2971 vpsllq \$20,$T3,$T3 2972 vporq $T3,$T1,$T1 2973 vpandq $mask44,$T1,$T1 2974 2975 sub \$4,$len 2976 jz .Ltail_vpmadd52_4x 2977 jmp .Loop_vpmadd52_4x 2978 ud2 2979 2980.align 32 2981.Linit_vpmadd52: 2982 vmovq 24($ctx),%x#$S1 # load key 2983 vmovq 56($ctx),%x#$H2 2984 vmovq 32($ctx),%x#$S2 2985 vmovq 40($ctx),%x#$R0 2986 vmovq 48($ctx),%x#$R1 2987 2988 vmovdqa $R0,$H0 2989 vmovdqa $R1,$H1 2990 vmovdqa $H2,$R2 2991 2992 mov \$2,%eax 2993 2994.Lmul_init_vpmadd52: 2995 vpxorq $D0lo,$D0lo,$D0lo 2996 vpmadd52luq $H2,$S1,$D0lo 2997 vpxorq $D0hi,$D0hi,$D0hi 2998 vpmadd52huq $H2,$S1,$D0hi 2999 vpxorq $D1lo,$D1lo,$D1lo 3000 vpmadd52luq $H2,$S2,$D1lo 3001 vpxorq $D1hi,$D1hi,$D1hi 3002 vpmadd52huq $H2,$S2,$D1hi 3003 vpxorq $D2lo,$D2lo,$D2lo 3004 vpmadd52luq $H2,$R0,$D2lo 3005 vpxorq $D2hi,$D2hi,$D2hi 3006 vpmadd52huq $H2,$R0,$D2hi 3007 3008 vpmadd52luq $H0,$R0,$D0lo 3009 vpmadd52huq $H0,$R0,$D0hi 3010 vpmadd52luq $H0,$R1,$D1lo 3011 vpmadd52huq $H0,$R1,$D1hi 3012 vpmadd52luq $H0,$R2,$D2lo 3013 vpmadd52huq $H0,$R2,$D2hi 3014 3015 vpmadd52luq $H1,$S2,$D0lo 3016 vpmadd52huq $H1,$S2,$D0hi 3017 vpmadd52luq $H1,$R0,$D1lo 3018 vpmadd52huq $H1,$R0,$D1hi 3019 vpmadd52luq $H1,$R1,$D2lo 3020 vpmadd52huq $H1,$R1,$D2hi 3021 3022 ################################################################ 3023 # partial reduction 3024 vpsrlq \$44,$D0lo,$tmp 3025 vpsllq \$8,$D0hi,$D0hi 3026 vpandq $mask44,$D0lo,$H0 3027 vpaddq $tmp,$D0hi,$D0hi 3028 3029 vpaddq $D0hi,$D1lo,$D1lo 3030 3031 vpsrlq \$44,$D1lo,$tmp 3032 vpsllq \$8,$D1hi,$D1hi 3033 vpandq $mask44,$D1lo,$H1 3034 vpaddq $tmp,$D1hi,$D1hi 3035 3036 vpaddq $D1hi,$D2lo,$D2lo 3037 3038 vpsrlq \$42,$D2lo,$tmp 3039 vpsllq \$10,$D2hi,$D2hi 3040 vpandq $mask42,$D2lo,$H2 3041 vpaddq $tmp,$D2hi,$D2hi 3042 3043 vpaddq $D2hi,$H0,$H0 3044 vpsllq \$2,$D2hi,$D2hi 3045 3046 vpaddq $D2hi,$H0,$H0 3047 3048 vpsrlq \$44,$H0,$tmp # additional step 3049 vpandq $mask44,$H0,$H0 3050 3051 vpaddq $tmp,$H1,$H1 3052 3053 dec %eax 3054 jz .Ldone_init_vpmadd52 3055 3056 vpunpcklqdq $R1,$H1,$R1 # 1,2 3057 vpbroadcastq %x#$H1,%x#$H1 # 2,2 3058 vpunpcklqdq $R2,$H2,$R2 3059 vpbroadcastq %x#$H2,%x#$H2 3060 vpunpcklqdq $R0,$H0,$R0 3061 vpbroadcastq %x#$H0,%x#$H0 3062 3063 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3064 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3065 vpaddq $R1,$S1,$S1 3066 vpaddq $R2,$S2,$S2 3067 vpsllq \$2,$S1,$S1 3068 vpsllq \$2,$S2,$S2 3069 3070 jmp .Lmul_init_vpmadd52 3071 ud2 3072 3073.align 32 3074.Ldone_init_vpmadd52: 3075 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 3076 vinserti128 \$1,%x#$R2,$H2,$R2 3077 vinserti128 \$1,%x#$R0,$H0,$R0 3078 3079 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 3080 vpermq \$0b11011000,$R2,$R2 3081 vpermq \$0b11011000,$R0,$R0 3082 3083 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3084 vpaddq $R1,$S1,$S1 3085 vpsllq \$2,$S1,$S1 3086 3087 vmovq 0($ctx),%x#$H0 # load current hash value 3088 vmovq 8($ctx),%x#$H1 3089 vmovq 16($ctx),%x#$H2 3090 3091 test \$3,$len # is length 4*n+2? 3092 jnz .Ldone_init_vpmadd52_2x 3093 3094 vmovdqu64 $R0,64($ctx) # save key powers 3095 vpbroadcastq %x#$R0,$R0 # broadcast 4th power 3096 vmovdqu64 $R1,96($ctx) 3097 vpbroadcastq %x#$R1,$R1 3098 vmovdqu64 $R2,128($ctx) 3099 vpbroadcastq %x#$R2,$R2 3100 vmovdqu64 $S1,160($ctx) 3101 vpbroadcastq %x#$S1,$S1 3102 3103 jmp .Lblocks_vpmadd52_4x_key_loaded 3104 ud2 3105 3106.align 32 3107.Ldone_init_vpmadd52_2x: 3108 vmovdqu64 $R0,64($ctx) # save key powers 3109 vpsrldq \$8,$R0,$R0 # 0-1-0-2 3110 vmovdqu64 $R1,96($ctx) 3111 vpsrldq \$8,$R1,$R1 3112 vmovdqu64 $R2,128($ctx) 3113 vpsrldq \$8,$R2,$R2 3114 vmovdqu64 $S1,160($ctx) 3115 vpsrldq \$8,$S1,$S1 3116 jmp .Lblocks_vpmadd52_2x_key_loaded 3117 ud2 3118 3119.align 32 3120.Lblocks_vpmadd52_2x_do: 3121 vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers 3122 vmovdqu64 160+8($ctx),${S1}{%k1}{z} 3123 vmovdqu64 64+8($ctx),${R0}{%k1}{z} 3124 vmovdqu64 96+8($ctx),${R1}{%k1}{z} 3125 3126.Lblocks_vpmadd52_2x_key_loaded: 3127 vmovdqu64 16*0($inp),$T2 # load data 3128 vpxorq $T3,$T3,$T3 3129 lea 16*2($inp),$inp 3130 3131 vpunpcklqdq $T3,$T2,$T1 # transpose data 3132 vpunpckhqdq $T3,$T2,$T3 3133 3134 # at this point 64-bit lanes are ordered as x-1-x-0 3135 3136 vpsrlq \$24,$T3,$T2 # splat the data 3137 vporq $PAD,$T2,$T2 3138 vpaddq $T2,$H2,$H2 # accumulate input 3139 vpandq $mask44,$T1,$T0 3140 vpsrlq \$44,$T1,$T1 3141 vpsllq \$20,$T3,$T3 3142 vporq $T3,$T1,$T1 3143 vpandq $mask44,$T1,$T1 3144 3145 jmp .Ltail_vpmadd52_2x 3146 ud2 3147 3148.align 32 3149.Loop_vpmadd52_4x: 3150 #vpaddq $T2,$H2,$H2 # accumulate input 3151 vpaddq $T0,$H0,$H0 3152 vpaddq $T1,$H1,$H1 3153 3154 vpxorq $D0lo,$D0lo,$D0lo 3155 vpmadd52luq $H2,$S1,$D0lo 3156 vpxorq $D0hi,$D0hi,$D0hi 3157 vpmadd52huq $H2,$S1,$D0hi 3158 vpxorq $D1lo,$D1lo,$D1lo 3159 vpmadd52luq $H2,$S2,$D1lo 3160 vpxorq $D1hi,$D1hi,$D1hi 3161 vpmadd52huq $H2,$S2,$D1hi 3162 vpxorq $D2lo,$D2lo,$D2lo 3163 vpmadd52luq $H2,$R0,$D2lo 3164 vpxorq $D2hi,$D2hi,$D2hi 3165 vpmadd52huq $H2,$R0,$D2hi 3166 3167 vmovdqu64 16*0($inp),$T2 # load data 3168 vmovdqu64 16*2($inp),$T3 3169 lea 16*4($inp),$inp 3170 vpmadd52luq $H0,$R0,$D0lo 3171 vpmadd52huq $H0,$R0,$D0hi 3172 vpmadd52luq $H0,$R1,$D1lo 3173 vpmadd52huq $H0,$R1,$D1hi 3174 vpmadd52luq $H0,$R2,$D2lo 3175 vpmadd52huq $H0,$R2,$D2hi 3176 3177 vpunpcklqdq $T3,$T2,$T1 # transpose data 3178 vpunpckhqdq $T3,$T2,$T3 3179 vpmadd52luq $H1,$S2,$D0lo 3180 vpmadd52huq $H1,$S2,$D0hi 3181 vpmadd52luq $H1,$R0,$D1lo 3182 vpmadd52huq $H1,$R0,$D1hi 3183 vpmadd52luq $H1,$R1,$D2lo 3184 vpmadd52huq $H1,$R1,$D2hi 3185 3186 ################################################################ 3187 # partial reduction (interleaved with data splat) 3188 vpsrlq \$44,$D0lo,$tmp 3189 vpsllq \$8,$D0hi,$D0hi 3190 vpandq $mask44,$D0lo,$H0 3191 vpaddq $tmp,$D0hi,$D0hi 3192 3193 vpsrlq \$24,$T3,$T2 3194 vporq $PAD,$T2,$T2 3195 vpaddq $D0hi,$D1lo,$D1lo 3196 3197 vpsrlq \$44,$D1lo,$tmp 3198 vpsllq \$8,$D1hi,$D1hi 3199 vpandq $mask44,$D1lo,$H1 3200 vpaddq $tmp,$D1hi,$D1hi 3201 3202 vpandq $mask44,$T1,$T0 3203 vpsrlq \$44,$T1,$T1 3204 vpsllq \$20,$T3,$T3 3205 vpaddq $D1hi,$D2lo,$D2lo 3206 3207 vpsrlq \$42,$D2lo,$tmp 3208 vpsllq \$10,$D2hi,$D2hi 3209 vpandq $mask42,$D2lo,$H2 3210 vpaddq $tmp,$D2hi,$D2hi 3211 3212 vpaddq $T2,$H2,$H2 # accumulate input 3213 vpaddq $D2hi,$H0,$H0 3214 vpsllq \$2,$D2hi,$D2hi 3215 3216 vpaddq $D2hi,$H0,$H0 3217 vporq $T3,$T1,$T1 3218 vpandq $mask44,$T1,$T1 3219 3220 vpsrlq \$44,$H0,$tmp # additional step 3221 vpandq $mask44,$H0,$H0 3222 3223 vpaddq $tmp,$H1,$H1 3224 3225 sub \$4,$len # len-=64 3226 jnz .Loop_vpmadd52_4x 3227 3228.Ltail_vpmadd52_4x: 3229 vmovdqu64 128($ctx),$R2 # load all key powers 3230 vmovdqu64 160($ctx),$S1 3231 vmovdqu64 64($ctx),$R0 3232 vmovdqu64 96($ctx),$R1 3233 3234.Ltail_vpmadd52_2x: 3235 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3236 vpaddq $R2,$S2,$S2 3237 vpsllq \$2,$S2,$S2 3238 3239 #vpaddq $T2,$H2,$H2 # accumulate input 3240 vpaddq $T0,$H0,$H0 3241 vpaddq $T1,$H1,$H1 3242 3243 vpxorq $D0lo,$D0lo,$D0lo 3244 vpmadd52luq $H2,$S1,$D0lo 3245 vpxorq $D0hi,$D0hi,$D0hi 3246 vpmadd52huq $H2,$S1,$D0hi 3247 vpxorq $D1lo,$D1lo,$D1lo 3248 vpmadd52luq $H2,$S2,$D1lo 3249 vpxorq $D1hi,$D1hi,$D1hi 3250 vpmadd52huq $H2,$S2,$D1hi 3251 vpxorq $D2lo,$D2lo,$D2lo 3252 vpmadd52luq $H2,$R0,$D2lo 3253 vpxorq $D2hi,$D2hi,$D2hi 3254 vpmadd52huq $H2,$R0,$D2hi 3255 3256 vpmadd52luq $H0,$R0,$D0lo 3257 vpmadd52huq $H0,$R0,$D0hi 3258 vpmadd52luq $H0,$R1,$D1lo 3259 vpmadd52huq $H0,$R1,$D1hi 3260 vpmadd52luq $H0,$R2,$D2lo 3261 vpmadd52huq $H0,$R2,$D2hi 3262 3263 vpmadd52luq $H1,$S2,$D0lo 3264 vpmadd52huq $H1,$S2,$D0hi 3265 vpmadd52luq $H1,$R0,$D1lo 3266 vpmadd52huq $H1,$R0,$D1hi 3267 vpmadd52luq $H1,$R1,$D2lo 3268 vpmadd52huq $H1,$R1,$D2hi 3269 3270 ################################################################ 3271 # horizontal addition 3272 3273 mov \$1,%eax 3274 kmovw %eax,%k1 3275 vpsrldq \$8,$D0lo,$T0 3276 vpsrldq \$8,$D0hi,$H0 3277 vpsrldq \$8,$D1lo,$T1 3278 vpsrldq \$8,$D1hi,$H1 3279 vpaddq $T0,$D0lo,$D0lo 3280 vpaddq $H0,$D0hi,$D0hi 3281 vpsrldq \$8,$D2lo,$T2 3282 vpsrldq \$8,$D2hi,$H2 3283 vpaddq $T1,$D1lo,$D1lo 3284 vpaddq $H1,$D1hi,$D1hi 3285 vpermq \$0x2,$D0lo,$T0 3286 vpermq \$0x2,$D0hi,$H0 3287 vpaddq $T2,$D2lo,$D2lo 3288 vpaddq $H2,$D2hi,$D2hi 3289 3290 vpermq \$0x2,$D1lo,$T1 3291 vpermq \$0x2,$D1hi,$H1 3292 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3293 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3294 vpermq \$0x2,$D2lo,$T2 3295 vpermq \$0x2,$D2hi,$H2 3296 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3297 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3298 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3299 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3300 3301 ################################################################ 3302 # partial reduction 3303 vpsrlq \$44,$D0lo,$tmp 3304 vpsllq \$8,$D0hi,$D0hi 3305 vpandq $mask44,$D0lo,$H0 3306 vpaddq $tmp,$D0hi,$D0hi 3307 3308 vpaddq $D0hi,$D1lo,$D1lo 3309 3310 vpsrlq \$44,$D1lo,$tmp 3311 vpsllq \$8,$D1hi,$D1hi 3312 vpandq $mask44,$D1lo,$H1 3313 vpaddq $tmp,$D1hi,$D1hi 3314 3315 vpaddq $D1hi,$D2lo,$D2lo 3316 3317 vpsrlq \$42,$D2lo,$tmp 3318 vpsllq \$10,$D2hi,$D2hi 3319 vpandq $mask42,$D2lo,$H2 3320 vpaddq $tmp,$D2hi,$D2hi 3321 3322 vpaddq $D2hi,$H0,$H0 3323 vpsllq \$2,$D2hi,$D2hi 3324 3325 vpaddq $D2hi,$H0,$H0 3326 3327 vpsrlq \$44,$H0,$tmp # additional step 3328 vpandq $mask44,$H0,$H0 3329 3330 vpaddq $tmp,$H1,$H1 3331 # at this point $len is 3332 # either 4*n+2 or 0... 3333 sub \$2,$len # len-=32 3334 ja .Lblocks_vpmadd52_4x_do 3335 3336 vmovq %x#$H0,0($ctx) 3337 vmovq %x#$H1,8($ctx) 3338 vmovq %x#$H2,16($ctx) 3339 vzeroall 3340 3341.Lno_data_vpmadd52_4x: 3342 ret 3343.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x 3344___ 3345} 3346{ 3347######################################################################## 3348# As implied by its name 8x subroutine processes 8 blocks in parallel... 3349# This is intermediate version, as it's used only in cases when input 3350# length is either 8*n, 8*n+1 or 8*n+2... 3351 3352my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 3353my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 3354my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 3355my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); 3356 3357$code.=<<___; 3358.type poly1305_blocks_vpmadd52_8x,\@function,4 3359.align 32 3360poly1305_blocks_vpmadd52_8x: 3361 shr \$4,$len 3362 jz .Lno_data_vpmadd52_8x # too short 3363 3364 shl \$40,$padbit 3365 mov 64($ctx),%r8 # peek on power of the key 3366 3367 vmovdqa64 .Lx_mask44(%rip),$mask44 3368 vmovdqa64 .Lx_mask42(%rip),$mask42 3369 3370 test %r8,%r8 # is power value impossible? 3371 js .Linit_vpmadd52 # if it is, then init R[4] 3372 3373 vmovq 0($ctx),%x#$H0 # load current hash value 3374 vmovq 8($ctx),%x#$H1 3375 vmovq 16($ctx),%x#$H2 3376 3377.Lblocks_vpmadd52_8x: 3378 ################################################################ 3379 # fist we calculate more key powers 3380 3381 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers 3382 vmovdqu64 160($ctx),$S1 3383 vmovdqu64 64($ctx),$R0 3384 vmovdqu64 96($ctx),$R1 3385 3386 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3387 vpaddq $R2,$S2,$S2 3388 vpsllq \$2,$S2,$S2 3389 3390 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power 3391 vpbroadcastq %x#$R0,$RR0 3392 vpbroadcastq %x#$R1,$RR1 3393 3394 vpxorq $D0lo,$D0lo,$D0lo 3395 vpmadd52luq $RR2,$S1,$D0lo 3396 vpxorq $D0hi,$D0hi,$D0hi 3397 vpmadd52huq $RR2,$S1,$D0hi 3398 vpxorq $D1lo,$D1lo,$D1lo 3399 vpmadd52luq $RR2,$S2,$D1lo 3400 vpxorq $D1hi,$D1hi,$D1hi 3401 vpmadd52huq $RR2,$S2,$D1hi 3402 vpxorq $D2lo,$D2lo,$D2lo 3403 vpmadd52luq $RR2,$R0,$D2lo 3404 vpxorq $D2hi,$D2hi,$D2hi 3405 vpmadd52huq $RR2,$R0,$D2hi 3406 3407 vpmadd52luq $RR0,$R0,$D0lo 3408 vpmadd52huq $RR0,$R0,$D0hi 3409 vpmadd52luq $RR0,$R1,$D1lo 3410 vpmadd52huq $RR0,$R1,$D1hi 3411 vpmadd52luq $RR0,$R2,$D2lo 3412 vpmadd52huq $RR0,$R2,$D2hi 3413 3414 vpmadd52luq $RR1,$S2,$D0lo 3415 vpmadd52huq $RR1,$S2,$D0hi 3416 vpmadd52luq $RR1,$R0,$D1lo 3417 vpmadd52huq $RR1,$R0,$D1hi 3418 vpmadd52luq $RR1,$R1,$D2lo 3419 vpmadd52huq $RR1,$R1,$D2hi 3420 3421 ################################################################ 3422 # partial reduction 3423 vpsrlq \$44,$D0lo,$tmp 3424 vpsllq \$8,$D0hi,$D0hi 3425 vpandq $mask44,$D0lo,$RR0 3426 vpaddq $tmp,$D0hi,$D0hi 3427 3428 vpaddq $D0hi,$D1lo,$D1lo 3429 3430 vpsrlq \$44,$D1lo,$tmp 3431 vpsllq \$8,$D1hi,$D1hi 3432 vpandq $mask44,$D1lo,$RR1 3433 vpaddq $tmp,$D1hi,$D1hi 3434 3435 vpaddq $D1hi,$D2lo,$D2lo 3436 3437 vpsrlq \$42,$D2lo,$tmp 3438 vpsllq \$10,$D2hi,$D2hi 3439 vpandq $mask42,$D2lo,$RR2 3440 vpaddq $tmp,$D2hi,$D2hi 3441 3442 vpaddq $D2hi,$RR0,$RR0 3443 vpsllq \$2,$D2hi,$D2hi 3444 3445 vpaddq $D2hi,$RR0,$RR0 3446 3447 vpsrlq \$44,$RR0,$tmp # additional step 3448 vpandq $mask44,$RR0,$RR0 3449 3450 vpaddq $tmp,$RR1,$RR1 3451 3452 ################################################################ 3453 # At this point Rx holds 1324 powers, RRx - 5768, and the goal 3454 # is 15263748, which reflects how data is loaded... 3455 3456 vpunpcklqdq $R2,$RR2,$T2 # 3748 3457 vpunpckhqdq $R2,$RR2,$R2 # 1526 3458 vpunpcklqdq $R0,$RR0,$T0 3459 vpunpckhqdq $R0,$RR0,$R0 3460 vpunpcklqdq $R1,$RR1,$T1 3461 vpunpckhqdq $R1,$RR1,$R1 3462___ 3463######## switch to %zmm 3464map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3465map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3466map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3467map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); 3468 3469$code.=<<___; 3470 vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 3471 vshufi64x2 \$0x44,$R0,$T0,$RR0 3472 vshufi64x2 \$0x44,$R1,$T1,$RR1 3473 3474 vmovdqu64 16*0($inp),$T2 # load data 3475 vmovdqu64 16*4($inp),$T3 3476 lea 16*8($inp),$inp 3477 3478 vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 3479 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 3480 vpaddq $RR2,$SS2,$SS2 3481 vpaddq $RR1,$SS1,$SS1 3482 vpsllq \$2,$SS2,$SS2 3483 vpsllq \$2,$SS1,$SS1 3484 3485 vpbroadcastq $padbit,$PAD 3486 vpbroadcastq %x#$mask44,$mask44 3487 vpbroadcastq %x#$mask42,$mask42 3488 3489 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power 3490 vpbroadcastq %x#$SS2,$S2 3491 vpbroadcastq %x#$RR0,$R0 3492 vpbroadcastq %x#$RR1,$R1 3493 vpbroadcastq %x#$RR2,$R2 3494 3495 vpunpcklqdq $T3,$T2,$T1 # transpose data 3496 vpunpckhqdq $T3,$T2,$T3 3497 3498 # at this point 64-bit lanes are ordered as 73625140 3499 3500 vpsrlq \$24,$T3,$T2 # splat the data 3501 vporq $PAD,$T2,$T2 3502 vpaddq $T2,$H2,$H2 # accumulate input 3503 vpandq $mask44,$T1,$T0 3504 vpsrlq \$44,$T1,$T1 3505 vpsllq \$20,$T3,$T3 3506 vporq $T3,$T1,$T1 3507 vpandq $mask44,$T1,$T1 3508 3509 sub \$8,$len 3510 jz .Ltail_vpmadd52_8x 3511 jmp .Loop_vpmadd52_8x 3512 3513.align 32 3514.Loop_vpmadd52_8x: 3515 #vpaddq $T2,$H2,$H2 # accumulate input 3516 vpaddq $T0,$H0,$H0 3517 vpaddq $T1,$H1,$H1 3518 3519 vpxorq $D0lo,$D0lo,$D0lo 3520 vpmadd52luq $H2,$S1,$D0lo 3521 vpxorq $D0hi,$D0hi,$D0hi 3522 vpmadd52huq $H2,$S1,$D0hi 3523 vpxorq $D1lo,$D1lo,$D1lo 3524 vpmadd52luq $H2,$S2,$D1lo 3525 vpxorq $D1hi,$D1hi,$D1hi 3526 vpmadd52huq $H2,$S2,$D1hi 3527 vpxorq $D2lo,$D2lo,$D2lo 3528 vpmadd52luq $H2,$R0,$D2lo 3529 vpxorq $D2hi,$D2hi,$D2hi 3530 vpmadd52huq $H2,$R0,$D2hi 3531 3532 vmovdqu64 16*0($inp),$T2 # load data 3533 vmovdqu64 16*4($inp),$T3 3534 lea 16*8($inp),$inp 3535 vpmadd52luq $H0,$R0,$D0lo 3536 vpmadd52huq $H0,$R0,$D0hi 3537 vpmadd52luq $H0,$R1,$D1lo 3538 vpmadd52huq $H0,$R1,$D1hi 3539 vpmadd52luq $H0,$R2,$D2lo 3540 vpmadd52huq $H0,$R2,$D2hi 3541 3542 vpunpcklqdq $T3,$T2,$T1 # transpose data 3543 vpunpckhqdq $T3,$T2,$T3 3544 vpmadd52luq $H1,$S2,$D0lo 3545 vpmadd52huq $H1,$S2,$D0hi 3546 vpmadd52luq $H1,$R0,$D1lo 3547 vpmadd52huq $H1,$R0,$D1hi 3548 vpmadd52luq $H1,$R1,$D2lo 3549 vpmadd52huq $H1,$R1,$D2hi 3550 3551 ################################################################ 3552 # partial reduction (interleaved with data splat) 3553 vpsrlq \$44,$D0lo,$tmp 3554 vpsllq \$8,$D0hi,$D0hi 3555 vpandq $mask44,$D0lo,$H0 3556 vpaddq $tmp,$D0hi,$D0hi 3557 3558 vpsrlq \$24,$T3,$T2 3559 vporq $PAD,$T2,$T2 3560 vpaddq $D0hi,$D1lo,$D1lo 3561 3562 vpsrlq \$44,$D1lo,$tmp 3563 vpsllq \$8,$D1hi,$D1hi 3564 vpandq $mask44,$D1lo,$H1 3565 vpaddq $tmp,$D1hi,$D1hi 3566 3567 vpandq $mask44,$T1,$T0 3568 vpsrlq \$44,$T1,$T1 3569 vpsllq \$20,$T3,$T3 3570 vpaddq $D1hi,$D2lo,$D2lo 3571 3572 vpsrlq \$42,$D2lo,$tmp 3573 vpsllq \$10,$D2hi,$D2hi 3574 vpandq $mask42,$D2lo,$H2 3575 vpaddq $tmp,$D2hi,$D2hi 3576 3577 vpaddq $T2,$H2,$H2 # accumulate input 3578 vpaddq $D2hi,$H0,$H0 3579 vpsllq \$2,$D2hi,$D2hi 3580 3581 vpaddq $D2hi,$H0,$H0 3582 vporq $T3,$T1,$T1 3583 vpandq $mask44,$T1,$T1 3584 3585 vpsrlq \$44,$H0,$tmp # additional step 3586 vpandq $mask44,$H0,$H0 3587 3588 vpaddq $tmp,$H1,$H1 3589 3590 sub \$8,$len # len-=128 3591 jnz .Loop_vpmadd52_8x 3592 3593.Ltail_vpmadd52_8x: 3594 #vpaddq $T2,$H2,$H2 # accumulate input 3595 vpaddq $T0,$H0,$H0 3596 vpaddq $T1,$H1,$H1 3597 3598 vpxorq $D0lo,$D0lo,$D0lo 3599 vpmadd52luq $H2,$SS1,$D0lo 3600 vpxorq $D0hi,$D0hi,$D0hi 3601 vpmadd52huq $H2,$SS1,$D0hi 3602 vpxorq $D1lo,$D1lo,$D1lo 3603 vpmadd52luq $H2,$SS2,$D1lo 3604 vpxorq $D1hi,$D1hi,$D1hi 3605 vpmadd52huq $H2,$SS2,$D1hi 3606 vpxorq $D2lo,$D2lo,$D2lo 3607 vpmadd52luq $H2,$RR0,$D2lo 3608 vpxorq $D2hi,$D2hi,$D2hi 3609 vpmadd52huq $H2,$RR0,$D2hi 3610 3611 vpmadd52luq $H0,$RR0,$D0lo 3612 vpmadd52huq $H0,$RR0,$D0hi 3613 vpmadd52luq $H0,$RR1,$D1lo 3614 vpmadd52huq $H0,$RR1,$D1hi 3615 vpmadd52luq $H0,$RR2,$D2lo 3616 vpmadd52huq $H0,$RR2,$D2hi 3617 3618 vpmadd52luq $H1,$SS2,$D0lo 3619 vpmadd52huq $H1,$SS2,$D0hi 3620 vpmadd52luq $H1,$RR0,$D1lo 3621 vpmadd52huq $H1,$RR0,$D1hi 3622 vpmadd52luq $H1,$RR1,$D2lo 3623 vpmadd52huq $H1,$RR1,$D2hi 3624 3625 ################################################################ 3626 # horizontal addition 3627 3628 mov \$1,%eax 3629 kmovw %eax,%k1 3630 vpsrldq \$8,$D0lo,$T0 3631 vpsrldq \$8,$D0hi,$H0 3632 vpsrldq \$8,$D1lo,$T1 3633 vpsrldq \$8,$D1hi,$H1 3634 vpaddq $T0,$D0lo,$D0lo 3635 vpaddq $H0,$D0hi,$D0hi 3636 vpsrldq \$8,$D2lo,$T2 3637 vpsrldq \$8,$D2hi,$H2 3638 vpaddq $T1,$D1lo,$D1lo 3639 vpaddq $H1,$D1hi,$D1hi 3640 vpermq \$0x2,$D0lo,$T0 3641 vpermq \$0x2,$D0hi,$H0 3642 vpaddq $T2,$D2lo,$D2lo 3643 vpaddq $H2,$D2hi,$D2hi 3644 3645 vpermq \$0x2,$D1lo,$T1 3646 vpermq \$0x2,$D1hi,$H1 3647 vpaddq $T0,$D0lo,$D0lo 3648 vpaddq $H0,$D0hi,$D0hi 3649 vpermq \$0x2,$D2lo,$T2 3650 vpermq \$0x2,$D2hi,$H2 3651 vpaddq $T1,$D1lo,$D1lo 3652 vpaddq $H1,$D1hi,$D1hi 3653 vextracti64x4 \$1,$D0lo,%y#$T0 3654 vextracti64x4 \$1,$D0hi,%y#$H0 3655 vpaddq $T2,$D2lo,$D2lo 3656 vpaddq $H2,$D2hi,$D2hi 3657 3658 vextracti64x4 \$1,$D1lo,%y#$T1 3659 vextracti64x4 \$1,$D1hi,%y#$H1 3660 vextracti64x4 \$1,$D2lo,%y#$T2 3661 vextracti64x4 \$1,$D2hi,%y#$H2 3662___ 3663######## switch back to %ymm 3664map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3665map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3666map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3667 3668$code.=<<___; 3669 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3670 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3671 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3672 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3673 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3674 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3675 3676 ################################################################ 3677 # partial reduction 3678 vpsrlq \$44,$D0lo,$tmp 3679 vpsllq \$8,$D0hi,$D0hi 3680 vpandq $mask44,$D0lo,$H0 3681 vpaddq $tmp,$D0hi,$D0hi 3682 3683 vpaddq $D0hi,$D1lo,$D1lo 3684 3685 vpsrlq \$44,$D1lo,$tmp 3686 vpsllq \$8,$D1hi,$D1hi 3687 vpandq $mask44,$D1lo,$H1 3688 vpaddq $tmp,$D1hi,$D1hi 3689 3690 vpaddq $D1hi,$D2lo,$D2lo 3691 3692 vpsrlq \$42,$D2lo,$tmp 3693 vpsllq \$10,$D2hi,$D2hi 3694 vpandq $mask42,$D2lo,$H2 3695 vpaddq $tmp,$D2hi,$D2hi 3696 3697 vpaddq $D2hi,$H0,$H0 3698 vpsllq \$2,$D2hi,$D2hi 3699 3700 vpaddq $D2hi,$H0,$H0 3701 3702 vpsrlq \$44,$H0,$tmp # additional step 3703 vpandq $mask44,$H0,$H0 3704 3705 vpaddq $tmp,$H1,$H1 3706 3707 ################################################################ 3708 3709 vmovq %x#$H0,0($ctx) 3710 vmovq %x#$H1,8($ctx) 3711 vmovq %x#$H2,16($ctx) 3712 vzeroall 3713 3714.Lno_data_vpmadd52_8x: 3715 ret 3716.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x 3717___ 3718} 3719$code.=<<___; 3720.type poly1305_emit_base2_44,\@function,3 3721.align 32 3722poly1305_emit_base2_44: 3723 mov 0($ctx),%r8 # load hash value 3724 mov 8($ctx),%r9 3725 mov 16($ctx),%r10 3726 3727 mov %r9,%rax 3728 shr \$20,%r9 3729 shl \$44,%rax 3730 mov %r10,%rcx 3731 shr \$40,%r10 3732 shl \$24,%rcx 3733 3734 add %rax,%r8 3735 adc %rcx,%r9 3736 adc \$0,%r10 3737 3738 mov %r8,%rax 3739 add \$5,%r8 # compare to modulus 3740 mov %r9,%rcx 3741 adc \$0,%r9 3742 adc \$0,%r10 3743 shr \$2,%r10 # did 130-bit value overflow? 3744 cmovnz %r8,%rax 3745 cmovnz %r9,%rcx 3746 3747 add 0($nonce),%rax # accumulate nonce 3748 adc 8($nonce),%rcx 3749 mov %rax,0($mac) # write result 3750 mov %rcx,8($mac) 3751 3752 ret 3753.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 3754___ 3755} } } 3756$code.=<<___; 3757.align 64 3758.Lconst: 3759.Lmask24: 3760.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 3761.L129: 3762.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 3763.Lmask26: 3764.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 3765.Lpermd_avx2: 3766.long 2,2,2,3,2,0,2,1 3767.Lpermd_avx512: 3768.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 3769 3770.L2_44_inp_permd: 3771.long 0,1,1,2,2,3,7,7 3772.L2_44_inp_shift: 3773.quad 0,12,24,64 3774.L2_44_mask: 3775.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff 3776.L2_44_shift_rgt: 3777.quad 44,44,42,64 3778.L2_44_shift_lft: 3779.quad 8,8,10,64 3780 3781.align 64 3782.Lx_mask44: 3783.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff 3784.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff 3785.Lx_mask42: 3786.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff 3787.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff 3788___ 3789} 3790$code.=<<___; 3791.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 3792.align 16 3793___ 3794 3795{ # chacha20-poly1305 helpers 3796my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 3797 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 3798$code.=<<___; 3799.globl xor128_encrypt_n_pad 3800.type xor128_encrypt_n_pad,\@abi-omnipotent 3801.align 16 3802xor128_encrypt_n_pad: 3803 sub $otp,$inp 3804 sub $otp,$out 3805 mov $len,%r10 # put len aside 3806 shr \$4,$len # len / 16 3807 jz .Ltail_enc 3808 nop 3809.Loop_enc_xmm: 3810 movdqu ($inp,$otp),%xmm0 3811 pxor ($otp),%xmm0 3812 movdqu %xmm0,($out,$otp) 3813 movdqa %xmm0,($otp) 3814 lea 16($otp),$otp 3815 dec $len 3816 jnz .Loop_enc_xmm 3817 3818 and \$15,%r10 # len % 16 3819 jz .Ldone_enc 3820 3821.Ltail_enc: 3822 mov \$16,$len 3823 sub %r10,$len 3824 xor %eax,%eax 3825.Loop_enc_byte: 3826 mov ($inp,$otp),%al 3827 xor ($otp),%al 3828 mov %al,($out,$otp) 3829 mov %al,($otp) 3830 lea 1($otp),$otp 3831 dec %r10 3832 jnz .Loop_enc_byte 3833 3834 xor %eax,%eax 3835.Loop_enc_pad: 3836 mov %al,($otp) 3837 lea 1($otp),$otp 3838 dec $len 3839 jnz .Loop_enc_pad 3840 3841.Ldone_enc: 3842 mov $otp,%rax 3843 ret 3844.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad 3845 3846.globl xor128_decrypt_n_pad 3847.type xor128_decrypt_n_pad,\@abi-omnipotent 3848.align 16 3849xor128_decrypt_n_pad: 3850 sub $otp,$inp 3851 sub $otp,$out 3852 mov $len,%r10 # put len aside 3853 shr \$4,$len # len / 16 3854 jz .Ltail_dec 3855 nop 3856.Loop_dec_xmm: 3857 movdqu ($inp,$otp),%xmm0 3858 movdqa ($otp),%xmm1 3859 pxor %xmm0,%xmm1 3860 movdqu %xmm1,($out,$otp) 3861 movdqa %xmm0,($otp) 3862 lea 16($otp),$otp 3863 dec $len 3864 jnz .Loop_dec_xmm 3865 3866 pxor %xmm1,%xmm1 3867 and \$15,%r10 # len % 16 3868 jz .Ldone_dec 3869 3870.Ltail_dec: 3871 mov \$16,$len 3872 sub %r10,$len 3873 xor %eax,%eax 3874 xor %r11,%r11 3875.Loop_dec_byte: 3876 mov ($inp,$otp),%r11b 3877 mov ($otp),%al 3878 xor %r11b,%al 3879 mov %al,($out,$otp) 3880 mov %r11b,($otp) 3881 lea 1($otp),$otp 3882 dec %r10 3883 jnz .Loop_dec_byte 3884 3885 xor %eax,%eax 3886.Loop_dec_pad: 3887 mov %al,($otp) 3888 lea 1($otp),$otp 3889 dec $len 3890 jnz .Loop_dec_pad 3891 3892.Ldone_dec: 3893 mov $otp,%rax 3894 ret 3895.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad 3896___ 3897} 3898 3899# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3900# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3901if ($win64) { 3902$rec="%rcx"; 3903$frame="%rdx"; 3904$context="%r8"; 3905$disp="%r9"; 3906 3907$code.=<<___; 3908.extern __imp_RtlVirtualUnwind 3909.type se_handler,\@abi-omnipotent 3910.align 16 3911se_handler: 3912 push %rsi 3913 push %rdi 3914 push %rbx 3915 push %rbp 3916 push %r12 3917 push %r13 3918 push %r14 3919 push %r15 3920 pushfq 3921 sub \$64,%rsp 3922 3923 mov 120($context),%rax # pull context->Rax 3924 mov 248($context),%rbx # pull context->Rip 3925 3926 mov 8($disp),%rsi # disp->ImageBase 3927 mov 56($disp),%r11 # disp->HandlerData 3928 3929 mov 0(%r11),%r10d # HandlerData[0] 3930 lea (%rsi,%r10),%r10 # prologue label 3931 cmp %r10,%rbx # context->Rip<.Lprologue 3932 jb .Lcommon_seh_tail 3933 3934 mov 152($context),%rax # pull context->Rsp 3935 3936 mov 4(%r11),%r10d # HandlerData[1] 3937 lea (%rsi,%r10),%r10 # epilogue label 3938 cmp %r10,%rbx # context->Rip>=.Lepilogue 3939 jae .Lcommon_seh_tail 3940 3941 lea 48(%rax),%rax 3942 3943 mov -8(%rax),%rbx 3944 mov -16(%rax),%rbp 3945 mov -24(%rax),%r12 3946 mov -32(%rax),%r13 3947 mov -40(%rax),%r14 3948 mov -48(%rax),%r15 3949 mov %rbx,144($context) # restore context->Rbx 3950 mov %rbp,160($context) # restore context->Rbp 3951 mov %r12,216($context) # restore context->R12 3952 mov %r13,224($context) # restore context->R13 3953 mov %r14,232($context) # restore context->R14 3954 mov %r15,240($context) # restore context->R14 3955 3956 jmp .Lcommon_seh_tail 3957.size se_handler,.-se_handler 3958 3959.type avx_handler,\@abi-omnipotent 3960.align 16 3961avx_handler: 3962 push %rsi 3963 push %rdi 3964 push %rbx 3965 push %rbp 3966 push %r12 3967 push %r13 3968 push %r14 3969 push %r15 3970 pushfq 3971 sub \$64,%rsp 3972 3973 mov 120($context),%rax # pull context->Rax 3974 mov 248($context),%rbx # pull context->Rip 3975 3976 mov 8($disp),%rsi # disp->ImageBase 3977 mov 56($disp),%r11 # disp->HandlerData 3978 3979 mov 0(%r11),%r10d # HandlerData[0] 3980 lea (%rsi,%r10),%r10 # prologue label 3981 cmp %r10,%rbx # context->Rip<prologue label 3982 jb .Lcommon_seh_tail 3983 3984 mov 152($context),%rax # pull context->Rsp 3985 3986 mov 4(%r11),%r10d # HandlerData[1] 3987 lea (%rsi,%r10),%r10 # epilogue label 3988 cmp %r10,%rbx # context->Rip>=epilogue label 3989 jae .Lcommon_seh_tail 3990 3991 mov 208($context),%rax # pull context->R11 3992 3993 lea 0x50(%rax),%rsi 3994 lea 0xf8(%rax),%rax 3995 lea 512($context),%rdi # &context.Xmm6 3996 mov \$20,%ecx 3997 .long 0xa548f3fc # cld; rep movsq 3998 3999.Lcommon_seh_tail: 4000 mov 8(%rax),%rdi 4001 mov 16(%rax),%rsi 4002 mov %rax,152($context) # restore context->Rsp 4003 mov %rsi,168($context) # restore context->Rsi 4004 mov %rdi,176($context) # restore context->Rdi 4005 4006 mov 40($disp),%rdi # disp->ContextRecord 4007 mov $context,%rsi # context 4008 mov \$154,%ecx # sizeof(CONTEXT) 4009 .long 0xa548f3fc # cld; rep movsq 4010 4011 mov $disp,%rsi 4012 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4013 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4014 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4015 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4016 mov 40(%rsi),%r10 # disp->ContextRecord 4017 lea 56(%rsi),%r11 # &disp->HandlerData 4018 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4019 mov %r10,32(%rsp) # arg5 4020 mov %r11,40(%rsp) # arg6 4021 mov %r12,48(%rsp) # arg7 4022 mov %rcx,56(%rsp) # arg8, (NULL) 4023 call *__imp_RtlVirtualUnwind(%rip) 4024 4025 mov \$1,%eax # ExceptionContinueSearch 4026 add \$64,%rsp 4027 popfq 4028 pop %r15 4029 pop %r14 4030 pop %r13 4031 pop %r12 4032 pop %rbp 4033 pop %rbx 4034 pop %rdi 4035 pop %rsi 4036 ret 4037.size avx_handler,.-avx_handler 4038 4039.section .pdata 4040.align 4 4041 .rva .LSEH_begin_poly1305_init 4042 .rva .LSEH_end_poly1305_init 4043 .rva .LSEH_info_poly1305_init 4044 4045 .rva .LSEH_begin_poly1305_blocks 4046 .rva .LSEH_end_poly1305_blocks 4047 .rva .LSEH_info_poly1305_blocks 4048 4049 .rva .LSEH_begin_poly1305_emit 4050 .rva .LSEH_end_poly1305_emit 4051 .rva .LSEH_info_poly1305_emit 4052___ 4053$code.=<<___ if ($avx); 4054 .rva .LSEH_begin_poly1305_blocks_avx 4055 .rva .Lbase2_64_avx 4056 .rva .LSEH_info_poly1305_blocks_avx_1 4057 4058 .rva .Lbase2_64_avx 4059 .rva .Leven_avx 4060 .rva .LSEH_info_poly1305_blocks_avx_2 4061 4062 .rva .Leven_avx 4063 .rva .LSEH_end_poly1305_blocks_avx 4064 .rva .LSEH_info_poly1305_blocks_avx_3 4065 4066 .rva .LSEH_begin_poly1305_emit_avx 4067 .rva .LSEH_end_poly1305_emit_avx 4068 .rva .LSEH_info_poly1305_emit_avx 4069___ 4070$code.=<<___ if ($avx>1); 4071 .rva .LSEH_begin_poly1305_blocks_avx2 4072 .rva .Lbase2_64_avx2 4073 .rva .LSEH_info_poly1305_blocks_avx2_1 4074 4075 .rva .Lbase2_64_avx2 4076 .rva .Leven_avx2 4077 .rva .LSEH_info_poly1305_blocks_avx2_2 4078 4079 .rva .Leven_avx2 4080 .rva .LSEH_end_poly1305_blocks_avx2 4081 .rva .LSEH_info_poly1305_blocks_avx2_3 4082___ 4083$code.=<<___ if ($avx>2); 4084 .rva .LSEH_begin_poly1305_blocks_avx512 4085 .rva .LSEH_end_poly1305_blocks_avx512 4086 .rva .LSEH_info_poly1305_blocks_avx512 4087___ 4088$code.=<<___; 4089.section .xdata 4090.align 8 4091.LSEH_info_poly1305_init: 4092 .byte 9,0,0,0 4093 .rva se_handler 4094 .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init 4095 4096.LSEH_info_poly1305_blocks: 4097 .byte 9,0,0,0 4098 .rva se_handler 4099 .rva .Lblocks_body,.Lblocks_epilogue 4100 4101.LSEH_info_poly1305_emit: 4102 .byte 9,0,0,0 4103 .rva se_handler 4104 .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit 4105___ 4106$code.=<<___ if ($avx); 4107.LSEH_info_poly1305_blocks_avx_1: 4108 .byte 9,0,0,0 4109 .rva se_handler 4110 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] 4111 4112.LSEH_info_poly1305_blocks_avx_2: 4113 .byte 9,0,0,0 4114 .rva se_handler 4115 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] 4116 4117.LSEH_info_poly1305_blocks_avx_3: 4118 .byte 9,0,0,0 4119 .rva avx_handler 4120 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] 4121 4122.LSEH_info_poly1305_emit_avx: 4123 .byte 9,0,0,0 4124 .rva se_handler 4125 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx 4126___ 4127$code.=<<___ if ($avx>1); 4128.LSEH_info_poly1305_blocks_avx2_1: 4129 .byte 9,0,0,0 4130 .rva se_handler 4131 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] 4132 4133.LSEH_info_poly1305_blocks_avx2_2: 4134 .byte 9,0,0,0 4135 .rva se_handler 4136 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] 4137 4138.LSEH_info_poly1305_blocks_avx2_3: 4139 .byte 9,0,0,0 4140 .rva avx_handler 4141 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] 4142___ 4143$code.=<<___ if ($avx>2); 4144.LSEH_info_poly1305_blocks_avx512: 4145 .byte 9,0,0,0 4146 .rva avx_handler 4147 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] 4148___ 4149} 4150 4151foreach (split('\n',$code)) { 4152 s/\`([^\`]*)\`/eval($1)/ge; 4153 s/%r([a-z]+)#d/%e$1/g; 4154 s/%r([0-9]+)#d/%r$1d/g; 4155 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; 4156 4157 print $_,"\n"; 4158} 4159close STDOUT; 4160